#!/usr/local/bin/perl -w # vote: process a timbl file and choose most frequent feature # usage: vote [-a] [-p] [-o] [-r] [-w] [-v] [tune] test # -a: use overall tagger accuracy # -p: use tag precision # -r: use tag precision and recall # -w: use pairwise voting # -o: show results # -v: (verbose) show results + weights # notes: default is: use majority # relevant paper: Van Halteren et.al. 1996 "Improving..." # 990713 erikt@uia.ua.ac.be # version date 20000730 use strict; my $false = 0; my $true = 1; my $method = "majority"; my $tune = ""; my $test = ""; my $showResult = $false; my $separator = " "; my $usage = "usage: vote [-a] [-p] [-o] [-r] [-w] [-v] [tune] test\n"; my $verbose = $false; # read and check command line options if (! defined($ARGV[0])) { print STDERR $usage; exit(1); } while ($ARGV[0] =~ /^-/) { if ($ARGV[0] eq "-a") { $method = "accuracy"; } elsif ($ARGV[0] eq "-o") { $showResult = $true; } elsif ($ARGV[0] eq "-p") { $method = "precision"; } elsif ($ARGV[0] eq "-r") { $method = "recall"; } elsif ($ARGV[0] eq "-w") { $method = "pairwise"; } elsif ($ARGV[0] eq "-v") { $verbose = $true; $showResult = $true; } else { print STDERR $usage; exit(1); } shift(@ARGV); if (! defined($ARGV[0])) { print STDERR $usage; exit(1); } } # read file names if (! defined($ARGV[0])||defined($ARGV[2])) { print STDERR $usage; exit(1); } if (! defined($ARGV[1])) { $test = $ARGV[0]; $method = "majority"; } else { $tune = $ARGV[0]; $test = $ARGV[1]; } print STDERR "Using $method. Test data is $test. "; if ($tune ne "") { print STDERR "Tune data is $tune.\n"; } else { print STDERR "No tune data supplied.\n"; } my $nbrOfFeatures = 0; # number of columns except for answer my ($answer,$code,$code1,$code2,$i,$j,$k,$line,$nbrOfItems); my (@correct,@words); my (%accuracy,%correct,%found,%pairs,%precision,%recall,%total); if ($method ne "majority" && $tune ne "") { # read tuning data $nbrOfItems = 0; open(TUNE,$tune) || die("cannot open tune file $tune\n"); while () { chomp($line = $_); @words = split(/$separator/,$line); $i = $#words; $code = "$i$separator$words[$i]"; $found{$code} = $found{$code} ? $found{$code}+1 : 1; $answer = pop(@words); if ($nbrOfFeatures <= 0) { $nbrOfFeatures = $#words+1; } if ($method ne "pairwise") { for ($i=0;$i<=$#words;$i++) { $code = "$i$separator$words[$i]"; $found{$code} = $found{$code} ? $found{$code}+1 : 1; if ($words[$i] eq $answer) { $correct{$code} = $correct{$code} ? $correct{$code}+1 : 1; $correct[$i] = $correct[$i] ? $correct[$i]+1 : 1; } } } else { # $method eq "pairwise" print STDERR "pair-wise code temporarily removed\n"; exit(1); } if ($#words != $nbrOfFeatures-1) { print STDERR "inconsistent number of columns in tune data\n"; exit(1); } $nbrOfItems++; } close(TUNE); # determine weights if ($method eq "accuracy") { for ($i=0;$i<=$nbrOfFeatures;$i++) { $correct[$i] = $correct[$i] ? $correct[$i] : 0; $accuracy{$i} = $correct[$i]/$nbrOfItems; } } if ($method eq "precision" or $method eq "recall") { foreach $i (keys %found) { $correct{$i} = $correct{$i} ? $correct{$i} : 0; $precision{$i} = $correct{$i}/$found{$i}; } } if ($method eq "recall") { foreach $i (keys %found) { ($j,$k) = split(/$separator/,$i); $code = "$nbrOfFeatures$separator$k"; $recall{$i} = $correct{$i}/$found{$code}; } } } my $correct = 0; my $nbrOfLines = 0; my ($bestDict,$bestCount,$found,$tmp,$weight); my (@count,@dict,@tmp); my (%seen,%storedAnswers); open(TEST,$test) || die("cannot open test file $test\n"); while () { chomp($line = $_); @words = split(/$separator/,$line); # remove answer $answer = pop(@words); $line = join($separator,@words); if ($nbrOfFeatures <= 0) { $nbrOfFeatures = $#words+1; } if (defined($storedAnswers{$line})) { @tmp = split(/$separator/,$storedAnswers{$line}); $bestCount = pop(@tmp); $bestDict = pop(@tmp); } else { if ($method ne "pairwise") { @count = (); @dict = (); for ($i=0;$i<=$#words;$i++) { $found = $false; # did this value occur earlier? $j = 0; while ($j <= $#dict and not $found) { if ($dict[$j] eq $words[$i]) { $found = $true; } else { $j++; } } if ($j > $#dict) { $dict[$j] = $words[$i]; } $code = "$i$separator$words[$i]"; if ($method eq "majority") { $weight = 1; } elsif ($method eq "accuracy") { $weight = $accuracy{$i}; } elsif ($method eq "precision") { $weight = $precision{$code}; } elsif ($method eq "recall") { $weight = $precision{$code}; %seen = (); for ($k=0;$k<=$#words;$k++) { if ($words[$k] ne $words[$i] and not $seen{$words[$k]}) { $code = "$i$separator$words[$k]"; $weight -= 1-$recall{$code}; $seen{$words[$k]} = $true; } } } $count[$j] = $count[$j] ? $count[$j]+$weight : $weight; } } else { # pairwise voting print STDERR "pair-wise code temporarily removed\n"; exit(1); } if ($#words != $nbrOfFeatures-1) { print STDERR "inconsistent number of columns in test data\n"; exit(1); } $bestDict = $dict[0]; $bestCount = $count[0]; for ($i=1;$i<=$#dict;$i++) { if ($count[$i] > $bestCount) { $bestDict = $dict[$i]; $bestCount = $count[$i]; } } } if (! defined($bestDict)) { print STDERR "error: cannot apply pairwise voting to this data set\n"; exit(0); } $storedAnswers{$line} = "$line$separator$answer$separator$bestDict$separator$bestCount"; if ($showResult) { if ($verbose) { print "$storedAnswers{$line}\n"; } else { $tmp = $storedAnswers{$line}; $tmp =~ s/$separator[^$separator]*$//; print "$tmp\n"; } } if ($bestDict eq $answer) { $correct++; } $nbrOfLines++; } close(TEST); printf STDERR "Correct %d of %d (%5.2f%%)\n",$correct,$nbrOfLines, 100*$correct/$nbrOfLines; exit(0); sub stringMember { my $member = shift(@_); my $listString = shift(@_); my @list = split(/$separator/,$listString); my $i = 0; my $found = $false; while ($i <= $#list && ! $found) { if ($list[$i] eq $member) { $found = $true; } else { $i++; } } $found; }