#!/usr/bin/perl -w
# sent2aif: convert annotated text files to aif format for Callisto
# usage: sent2aif file
# note: words in text should be annotated: word/TAG
# 20060103 erikt@science.uva.nl

# script name, used as prefix in the usage and error messages
$command = $0;
# installation directory; the b64 encoder is run from its bin/ subdirectory
$BASEDIR = "/scratch/erikt/software/ner";
# the annotated text file is the only command line argument
$inFile = shift(@ARGV);
defined $inFile or die "usage: $command file\n";
# name without directory info: drop everything up to the last slash
($inFileBase = $inFile) =~ s{.*/}{};

# read text and collect starting points, ending points and types of spans
#
# Each input word is expected to look like word/TAG. The word characters are
# copied to @out (the text without tags) and the offsets stored in @starts
# and @ends are indexes into @out. Words tagged O are outside any span; when
# a tag equals the previous tag, the previous span is extended instead of a
# new span being started.
# NOTE(review): no I/O layer is set on the input, so offsets count characters
# as Perl reads them (bytes by default) -- confirm this is right for
# non-ASCII text
@out = (); # text without tags
@starts = (); # per span: offset in @out of its first character
@ends = (); # per span: offset in @out just past its last character
@types = (); # per span: its tag
%types = (); # unique types in data
$lastType = "O"; # tag of the most recently tagged word, kept across lines
# three-argument open: the file name is never interpreted as a mode or a pipe
open(INFILE,"<",$inFile) or die "$command: cannot open $inFile: $!\n";
while (<INFILE>) {
   $line = $_;
   @in = split(//,$line); # one element per character
   $start = -1; # offset in @out of the current word, -1 while between words
   for ($i=0;$i<=$#in;$i++) {
      # store start positions of words
      if ($in[$i] =~ /\s/) { $start = -1; }
      elsif ($start < 0) { $start = $#out+1; }
      # process character
      if ($in[$i] ne "/") { push(@out,$in[$i]); }
      else {
         # every / character should start a tag!
         $j = $i+1;
         # skip any further slashes directly after the first one
         while (defined $in[$j] and $in[$j] eq "/") { $j++; }
         # the tag runs up to the next white space or the end of the line
         $type = "";
         while (defined $in[$j] and $in[$j] !~ /\s/) {
            $type .= $in[$j];
            $j++;
         }
         # the tag itself is not copied to @out, so the word ends here
         $end = $#out+1;
         if ($type ne "O") {
            # if last type was the same: extend last span
            if ($lastType eq $type) { $ends[$#ends] = $end; }
            else {
               push(@starts,$start);
               push(@ends,$end);
               push(@types,$type);
               $types{$type} = 1;
            }
            $lastType = $type;
         } else { $lastType = "O"; }
         # continue after the tag; the loop increment moves $i on to $j
         $i = $j-1;
      }
   }
}
close(INFILE);

# generate aif file header
# the file name ends up inside an XML attribute value, so the characters that
# are special in XML are replaced by entity references first
$inFileHref = $inFileBase;
$inFileHref =~ s/&/&amp;/g; # must come first: the other entities contain &
$inFileHref =~ s/</&lt;/g;
$inFileHref =~ s/>/&gt;/g;
$inFileHref =~ s/"/&quot;/g;
# the escaped \$ keeps the task name placeholder literal in the output
print <<THEEND;
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Corpus SYSTEM "http://www.nist.gov/speech/atlas/aif.dtd">

<Corpus xmlns="http://www.nist.gov/speech/atlas" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:dc="http://www.ukoln.ac.uk/interop-focus/activities/z3950/int_profile/bath/draft/stable1.html" id="Cor1" AIFVersion="1.1" type="\${task_name}" schemeLocation="http://callisto.mitre.org/maia/generic?org.conll.ner.maia.xml">
  <Metadata/>
  <SimpleSignal id="Sig1" type="text" mimeClass="text" mimeType="plain" xlink:href="$inFileHref" encoding="UTF-8" track="ALL" xlink:type="simple">
    <body encoding="Base64">
THEEND

# generate base 64 version of text
# store text in temporary file
# the scratch files live in a private directory that File::Temp creates under
# the system temporary directory and removes again when the script exits, so
# nothing is left behind when one of the steps below dies
use File::Temp ();
$text = join("",@out);
$tmpFile = File::Temp::tempdir("sent2aif.XXXXXX",TMPDIR => 1,CLEANUP => 1)."/text";
open(OUTFILE,">","$tmpFile.in") or die "$command: cannot write to $tmpFile.in\n";
print OUTFILE $text;
# buffered write errors only show up when the file is closed
close(OUTFILE) or die "$command: cannot write to $tmpFile.in\n";
# run b64 on file
# the list form of system does not involve a shell; system returns the exit
# status of the program, so 0 means success
system("$BASEDIR/bin/b64","-e","$tmpFile.in","$tmpFile.out") == 0
   or die "$command: cannot run b64\n";
# read encoded text from output file
open(INFILE,"<","$tmpFile.out") or die "$command: cannot read from $tmpFile.out\n";
while (<INFILE>) { print $_; }
close(INFILE);
unlink("$tmpFile.in","$tmpFile.out");

# close the signal and open the list of anchor points
print <<ANCHORS_OPEN;
</body>
  </SimpleSignal>
  <AnchorSet containedType="text-point">
ANCHORS_OPEN

# names of anchor points
# every distinct offset gets exactly one anchor, numbered in order of first
# appearance; for each span the start offset is handled before the end offset
%pointNames = ();
$anchorNumber = 1;
foreach $i (0..$#starts) {
   foreach $offset ($starts[$i],$ends[$i]) {
      # an offset shared by several spans is only declared once
      next if defined $pointNames{$offset};
      $pointNames{$offset} = "Anc" . $anchorNumber++;
      print <<ANCHOR;
    <Anchor id="$pointNames{$offset}" type="text-point">
      <Parameter type="char" unit="NULL_UNIT" role="char">$offset</Parameter>
      <SignalRef xlink:href="#Sig1" role="text" xlink:type="simple"/>
    </Anchor>
ANCHOR
   }
}

# close the anchors and open the list of regions
print <<REGIONS_OPEN;
  </AnchorSet>
  <RegionSet containedType="text-extent">
REGIONS_OPEN

# link anchor points to spans (regions)
# region ids are 1-based: the span at index $i becomes region number $i+1;
# the end anchor is written before the start anchor
$regionNumber = 0;
foreach $i (0..$#starts) {
   $regionNumber++;
   ($fromAnchor,$toAnchor) = ($pointNames{$starts[$i]},$pointNames{$ends[$i]});
   print <<REGION;
    <Region id="Reg$regionNumber" type="text-extent">
      <AnchorRef xlink:href="#$toAnchor" role="end" xlink:type="simple"/>
      <AnchorRef xlink:href="#$fromAnchor" role="start" xlink:type="simple"/>
    </Region>
REGION
}

# close the regions and open the analysis section
print <<ANALYSIS_OPEN;
  </RegionSet>
  <Analysis id="Ana1" type="generic-set" role="generic-set">
ANALYSIS_OPEN

# one AnnotationSet per type, types in sorted order; inside a set the spans
# keep their order in the text, while the annotation ids are numbered
# continuously over all sets
$annCount = 0;
foreach $type (sort keys %types) {
   print "    <AnnotationSet containedType=\"$type\">\n";
   # indexes of the spans that carry this type
   foreach $i (grep { $types[$_] eq $type } 0..$#starts) {
      $annCount++;
      $regCount = $i+1; # number of the region that belongs to this span
      print <<ANNOTATION;
      <Annotation id="Ann$annCount" type="$type">
        <RegionRef xlink:href="#Reg$regCount" role="text-extent" xlink:type="simple"/>
        <Content type="empty-content"/>
      </Annotation>
ANNOTATION
   }
   print <<SET_CLOSE;
    </AnnotationSet>
SET_CLOSE
}

# close the analysis section and the document
print <<CORPUS_CLOSE;
  </Analysis>
</Corpus>
CORPUS_CLOSE

exit(0);
