#!/usr/bin/perl -w

# This script transforms the text annotation produced by the ne tagger
# into xml format. It deletes the annotations "/O" and "//O", and
# transforms the text annotations "word/TYPE" and "word//TYPE"
# into "<ne type="TYPE">word</ne>". It also applies this rule:
# word1//?TYPE1 word2//?TYPE2, where TYPE1=TYPE2 ->
# <ne type="TYPE1">word1 word2</ne>
#
# typical run: tokenize < input.txt | ner | perl ner2xml.pl > output.xml
# 20060124 lafanasi@science.uva.nl
# 20060125 erikt@science.uva.nl (code cleanup)

# undef $/;

use strict;
use warnings;

# Render one run of consecutive words that share the same tag.
#   $type  - the tag of the run ('O' means "not an entity")
#   @words - the words of the run, in input order
# Returns the words joined by single spaces; runs whose tag is not 'O'
# are wrapped in <ne type="TYPE">...</ne>.
sub format_group {
    my ($type, @words) = @_;

    my $text = join ' ', @words;
    return $text if $type eq 'O';
    return qq{<ne type="$type">$text</ne>};
}

# Convert one line of tagger output ("word/TYPE" or "word//TYPE" tokens)
# into its xml form.
#   $line - the input line, without its trailing newline
# Returns the converted line. Consecutive tokens with the same tag are
# merged into one group, groups are separated by a single space, and no
# trailing space is emitted. Text that does not match the token pattern
# is not copied to the output.
# NOTE(review): words are emitted verbatim, so '&', '<' and '>' in the
# input are not XML-escaped -- confirm whether the tokenizer guarantees
# they cannot occur.
sub tagged_line_to_xml {
    my ($line) = @_;

    my @groups;     # finished, already formatted groups
    my @words;      # words of the group being collected
    my $type;       # tag of the group being collected (undef: none yet)

    while ($line =~ m{(\S*?)//?(\w+)}g) {
        # Copy the captures right away: any later successful match
        # would overwrite $1 and $2.
        my ($word, $tag) = ($1, $2);

        # A tag change closes the current group.
        if (defined $type && $tag ne $type) {
            push @groups, format_group($type, @words);
            @words = ();
        }
        $type = $tag;
        push @words, $word;
    }
    push @groups, format_group($type, @words) if @words;

    my $xml = join ' ', @groups;

    # "&slash;" stands for a literal "/" inside a word (presumably the
    # tokenizer's escape, since "/" separates word and tag); decode it
    # once the tags have been stripped.
    $xml =~ s{&slash;}{/}g;

    return $xml;
}

# Run as a filter (STDIN -> STDOUT) only when executed as a script, so
# the subs above can be loaded from another file without touching STDIN.
unless (caller) {
    while (my $line = <STDIN>) {
        chomp $line;
        print STDOUT tagged_line_to_xml($line), "\n";
    }
    exit(0);
}

1;
