#!/usr/bin/perl -w
# process: perform named entity tagging on xml files of Maarten Marx
# usage: process file
# 20051102 erikt@science.uva.nl

use IPC::Open2;

# Name this script was invoked as; used as a prefix in diagnostics.
$command = $0;

# Directory holding the external tokenizer and named entity tagger.
$BASEDIR = "/scratch/erikt/software/ner";
$tokenizer = "$BASEDIR/bin/tokenize";
$tagger = "$BASEDIR/bin/ner";

# The single command line argument is the XML file to tag.  Test for
# definedness/emptiness instead of truth so that a file named "0" is
# accepted rather than triggering the usage message.
$xml = shift(@ARGV);
die "usage: $command file\n" unless defined $xml and $xml ne "";

# Pass 1: turn the XML file into plain text and hand it to the
# tokenizer|tagger pipeline.  On completion the filehandle INFILE is
# connected to the output of the tagger.
#
# The text is first spooled to an anonymous temporary file which then
# becomes the standard input of the pipeline.  Writing the complete
# document to a pipe before reading any of the tagger output (as was
# done before) can deadlock as soon as the pipe buffers fill up.

# open2 raises an exception rather than returning false, and a missing
# program inside a shell pipeline is only reported by the shell, so
# check the programs explicitly.
(-x $tokenizer and -x $tagger) or
   die "$command: cannot run $tagger or $tokenizer\n";

# Three-argument open: the file name is never interpreted as a mode or
# a command.
open(XMLIN,"<",$xml) or die "$command: cannot open file $xml: $!\n";
# Three-argument open with undef as name creates an anonymous temp file.
open(TAGGERIN,"+>",undef) or
   die "$command: cannot create temporary file: $!\n";
while (defined($line = <XMLIN>)) {
   chomp($line);
   # replace each tag by an empty line so that text on either side of
   # the tag is separated
   $line =~ s/<[^>]*>/\n\n/g;
   # two commas become a pair of single quotes surrounded by spaces
   $line =~ s/,,/ \'\' /g;
   # references of the form &#x...; are reduced to the first character
   # following the x (presumably to keep one character for the
   # alignment in the output pass -- TODO confirm)
   $line =~ s/&\#x(.)[^;\s]+;/$1/g;
   # insert a space after every period
   $line =~ s/\./\. /g;
   print TAGGERIN "$line\n" or
      die "$command: cannot write temporary file: $!\n";
}
close(XMLIN);

# seek flushes the buffered text and rewinds the file for the child
seek(TAGGERIN,0,0) or
   die "$command: cannot rewind temporary file: $!\n";
# "<&TAGGERIN" makes the child read directly from the temporary file;
# open2 closes TAGGERIN in this process.
eval { open2(\*INFILE,"<&TAGGERIN","$tokenizer|$tagger"); 1 } or
   die "$command: cannot run $tagger or $tokenizer\n";

# Read the tagger output from INFILE: whitespace separated tokens of
# the form word/tag.  The result is two parallel arrays: @chars holds
# every character of every word and @tags the tag of the word that the
# character belongs to, with a one-letter prefix like "B-" or "I-"
# removed.
@chars = ();
@tags = ();
while (defined($line = <INFILE>)) {
   chomp($line);
   # split(' ') ignores leading white space; splitting on /\s+/ would
   # produce an empty first token for indented lines
   foreach $token (split(' ',$line)) {
      if ($token =~ m{^(.*?)/+([^/]+)$}) {
         # the tag is whatever follows the last slash so that words
         # which contain slashes themselves are kept intact
         ($word,$tag) = ($1,$2);
      } else {
         # no tag present: treat the token as not part of an entity
         # instead of storing an undefined tag
         ($word,$tag) = ($token,"O");
      }
      $tag =~ s/^[A-Z]-//;
      push(@chars,split(//,$word));
      push(@tags,($tag) x length($word));
   }
}
close(INFILE);

# Pass 2: copy the XML file to standard output and wrap every named
# entity in <ne type="...">...</ne>.  The characters in @chars/@tags
# (tagger output) are aligned with the characters of the file: only
# letters, digits and hyphens outside of XML tags are compared,
# ignoring case; everything else is copied unchanged.

# Reduce the tagger output to the characters used for the alignment.
# $alignBreak[$k] is true when the tag changed somewhere between the
# previous alignment character and character $k (including skipped
# punctuation), i.e. when a new stretch of text starts at $k.  Opening
# and closing of <ne> elements both rely on this single flag so that
# every opened element is closed exactly once.  Deciding the closing
# tag from the next raw character left entities unclosed when they
# were followed by punctuation that carried the same tag.
@alignChars = ();
@alignTags = ();
@alignBreak = ();
$prevTag = "O";
$break = 0;
for ($i=0;$i<=$#chars;$i++) {
   $tag = defined $tags[$i] ? $tags[$i] : "O";
   if ($tag ne $prevTag) { $break = 1; }
   $prevTag = $tag;
   if ($chars[$i] =~ /[a-zA-Z0-9\-]/) {
      push(@alignChars,$chars[$i]);
      push(@alignTags,$tag);
      push(@alignBreak,$break);
      $break = 0;
   }
}

# Three-argument open: the file name is never interpreted as a mode or
# a command.
open(XMLIN,"<",$xml) or die "$command: cannot open file $xml: $!\n";
$inxml = 0;   # true while between "<" and ">" (may span lines)
$k = 0;       # index of the next alignment character to be matched
while (defined($line = <XMLIN>)) {
   chomp($line);
   foreach $ch (split(//,$line)) {
      if ($ch eq "<") { $inxml = 1; }
      if (not $inxml and $k <= $#alignChars and
          lc($ch) eq lc($alignChars[$k])) {
         $tag = $alignTags[$k];
         if ($tag ne "O" and $alignBreak[$k]) {
            print "<ne type=\"$tag\">";
         }
         print $ch;
         # close the entity when this was the last character overall or
         # when a new stretch starts at the next alignment character
         # NOTE(review): XML tags are not considered here, so an entity
         # may still span a tag of the input file -- confirm that this
         # is acceptable
         if ($tag ne "O" and
             ($k == $#alignChars or $alignBreak[$k+1])) {
            print "</ne>";
         }
         $k++;
      } else { print $ch; }
      if ($ch eq ">") { $inxml = 0; }
   }
   print "\n";
}
close(XMLIN);

exit(0);

   
