#!/usr/bin/perl -w
# nerCleanup: postprocessing rules for named entity annotation of clef2003 data
# usage: nerCleanup < file
# 20050411 erikt@science.uva.nl

use strict;
use warnings;

# Body of the character class used to test whether a word contains a
# capital letter.
my $caps = "A-Z";

# cleanup_line($line)
#
# Apply the postprocessing rules to one input line (without its trailing
# newline) and return the corrected line.
#
# The line is split on whitespace.  Tokens that are empty or contain no
# slash (for example xml tags such as <DOC>) are passed through untouched.
# Every other token is split at its first run of slashes into
# word, separator and tag; the rules below may then reset the tag to "O":
#   1. a token with separator "//" (an unknown name) must contain a capital;
#   2. a name cannot end with a word that has no capital: when the current
#      tag is "O", the directly preceding token is rewritten with tag "O";
#   3. a name cannot start with a word that has no capital.
# Tokens are joined with single spaces in the returned string.
#
# NOTE(review): only the first three pieces of the split are kept, so a
# token with more than one slash run (e.g. "and/or/O") loses everything
# after the second piece.  This mirrors the historical behaviour; confirm
# against the data format before changing it.
sub cleanup_line {
   my ($line) = @_;
   my @fields = split(/\s+/, $line);
   my $prevT = "O";   # tag of the previous processed token
   my $prevW = "";    # word of the previous processed token
   my $prevS = "";    # separator of the previous processed token
   my $lastI = -1;    # index in @fields of the previous processed token
   for my $i (0 .. $#fields) {
      # empty or xml tag
      # (a plain match: the former ?/? match-once operator succeeded only
      # once per run and is a syntax error in perl 5.22 and later)
      next if $fields[$i] eq "" or $fields[$i] !~ m{/};
      my ($w, $s, $t) = split(/([\/]+)/, $fields[$i]);
      # a token ending in a slash run has no tag part; treat it as empty
      $t = "" unless defined $t;
      # unknown name must contain capital character
      if ($s eq "//" and $w !~ /[$caps]/) { $t = "O"; }
#     # unknown XXX-xxx must receive tag MISC
#     if ($s eq "//" and $w =~ /^[A-Z]+-[a-z]/ and $t !~ /MISC/) {
#        $t = "I-MISC";
#     }
      # name cannot end with lower case word
      if ($prevT ne "O" and $prevW !~ /[$caps]/ and $t eq "O") {
         $prevT = "O";
         $fields[$lastI] = "$prevW$prevS$prevT";
      }
      # name cannot start with lower case word
      if ($prevT eq "O" and $w !~ /[$caps]/) { $t = "O"; }
      $fields[$i] = "$w$s$t";
      $prevS = $s;
      $prevT = $t;
      $prevW = $w;
      $lastI = $i;
   }
   return join(" ", @fields);
}

# Act as a STDIN-to-STDOUT filter when executed as a script; when the file
# is loaded from other code (e.g. a test) only cleanup_line() is defined.
unless (caller) {
   while (my $line = <STDIN>) {
      chomp($line);
      print cleanup_line($line), "\n";
   }
   exit(0);
}
