#!/usr/bin/perl -w
# tntwrapper: simulate mbtclient functionality for tnt
# usage:      tntwrapper [ -b baseDir ] < file
# option:     -b base directory: location of lexicon and n-gram file (train)
# 20031006 erikt@uia.ua.ac.be

use strict;
use FileHandle;
use File::Temp qw(tempfile);

# script parameters: b: base directory
use vars qw($opt_b);
use Getopt::Std;
# get parameters: -b takes an argument, which getopt stores in $opt_b
getopt("b");

# variables (file-scoped; used by the read and process phases below)
my ($baseDir,$lastTag,$line,$slash,$tmpFile,$usage,$w,$wordSeen,
    @words);

# default base dir for POS tagging with CGN; a -b argument overrides it
$baseDir = "/scratch/erikt/software/mbsp/cgn/tnt";
if (defined $opt_b) { $baseDir = $opt_b; }
# the usage text names the one option this script parses (there is no -c)
$usage = "usage: tntwrapper [ -b baseDir ] < file";

# disabled check for a -c configuration option, which this script does not
# parse; kept for reference only
# if (not defined $opt_c) { die "$usage\n"; }
# elsif (not -f "$baseDir/pos.train.lex") { 
#    die "unknown configuration $opt_c\n"; 
# }

# read data
# Copy STDIN to a temporary file with one token per line and an empty line
# after every input line that contained at least one token; this file is
# the input handed to tnt in the process phase.
# tempfile() creates the file atomically under a fresh name, so there is no
# window between choosing the name and opening it. UNLINK => 0 because the
# script removes the file itself once tnt has read it.
my $tmpHandle;
($tmpHandle,$tmpFile) = tempfile("tntwrapper.XXXXXX",
                                 DIR => "/tmp", UNLINK => 0);
while (defined($line = <STDIN>)) {
   chomp($line);
   # split(" ",...) skips leading white space, so every element is a token
   @words = split(" ",$line);
   if (@words) {
      if (not print {$tmpHandle} join("\n",@words),"\n\n") {
         my $error = $!;
         unlink($tmpFile);
         die "tntwrapper: cannot write $tmpFile: $error\n";
      }
   }
}
# buffered write errors only show up at close time
if (not close($tmpHandle)) {
   my $error = $!;
   unlink($tmpFile);
   die "tntwrapper: cannot write $tmpFile: $error\n";
}

# process data
# Run tnt on the temporary file and print its output as one sentence per
# line: tokens separated by a space, each written as word/tag, or word//tag
# when tnt printed a third column (the unknown-word mark requested by -m).
$wordSeen = 0;
$lastTag = "";
# quote a string for the shell, so that white space or metacharacters in a
# directory given with -b cannot change the command
my $shellQuote = sub {
   my ($text) = @_;
   $text =~ s/'/'\\''/g;
   return("'$text'");
};
# tnt options: -v 0: verbosity = off; -m: mark unknown words; -H: ignore xml
# "&&" keeps tnt from being started in the wrong directory when cd fails
my $command = "cd ".$shellQuote->($baseDir).
              " && ../bin/tnt -v 0 -m -H train ".
              $shellQuote->($tmpFile)." 2>/dev/null";
my $tnt;
if (not open($tnt,"-|",$command)) {
   my $error = $!;
   unlink($tmpFile);
   die "tntwrapper: cannot start tnt: $error\n";
}
while (defined($line = <$tnt>)) {
   chomp($line);
   @words = split(/\s+/,$line);
   # a line without fields, or one starting with white space, ends the
   # current output line
   if ($#words < 0 or $words[0] eq "") { print "\n"; $wordSeen = 0; }
   else {
      if ($wordSeen) { print " "; }
      # 20050404 commented away ET
#     $words[1] = &correctTag($words[0],$words[1],$lastTag);
      $slash = (defined $words[2]) ? "//" : "/"; # unknown word : known word
      print "$words[0]";
      if (defined $words[1]) { 
          print "$slash$words[1]"; 
          $lastTag = $words[1];
      }
      $wordSeen = 1;
   }
}
# terminate the last sentence if tnt's output did not end with an empty line
if ($wordSeen) { print "\n"; }
# close() on a pipe is false when the command exited with a non-zero status;
# only warn, the exit status of this script stays 0 as before
if (not close($tnt)) {
   warn "tntwrapper: tnt did not finish cleanly (status $?)\n";
}
unlink($tmpFile);
exit(0);

# correct frequent errors
# arguments: the word, the tag proposed for it, the tag of the previous word
# returns:   the tag to use for the word
sub correctTag {
   my ($token,$proposed,$previous) = @_;

#  if ($opt_c ne "cgn/pos/tnt") { return($tag); }
   # a digit followed by digits, commas or periods always gets the TW tag
   return("TW(hoofd,vrij)") if $token =~ /^[0-9][0-9,\.]*$/;
   # a WW tag (other than one matching WW.inf) directly after a LID tag is
   # replaced by the N tag
   return("N(soort,ev,basis,zijd,stan)")
      if $previous =~ /^LID/ and $proposed =~ /^WW/ and $proposed !~ /^WW.inf/;
   # anything else keeps the proposed tag
   return($proposed);
}
