#!/usr/bin/perl -w
# tokenize: tokenize Dutch text
# usage: tokenize [-dns] < file
# options: -d: print debugging information
#          -n: do not add or remove newline characters
#          -s: split words with internal punctuation
#          -x: embed sentences in xml tags
# note: 20050415: bug for ad19940819 with -x: generates line " <s>"
# 20040604 erikt@uia.ua.ac.be

use vars qw($opt_n $opt_s); 
use Getopt::Std;
# get parameters
getopt("");

# definition of upper and lower case characters
$caps = "A-Z";
$lower = "a-z";
$char = "0-9$caps$lower";
$BASEDIR="/scratch/erikt/software/ner"
;

# abbreviation definition
%abbrev = ();
$abbrevFile = "$BASEDIR/etc/abbrev";
&readAbbrev($abbrevFile);

$lastWord = "\n";
$lastRealWord = "";
while (<STDIN>) {
   $line = $_;
   chomp($line);
   # convert &amp;
   $line =~ s/&amp;/&/g;
   if ($line =~ /^!!! *(.*)/) {
      # if ($lastWord ne "\n") { print "\n"; $lastWord = "\n"; }
      print "$1\n"; 
      next; 
   }
   @words = split(/\s+/,$line);
   $i = 0;
   while ($i <= $#words) {
      if (not defined $words[$i] or $words[$i] eq "") { 
         # RULE 1:delete empty element
         splice(@words,$i,1);
         if (defined $opt_d) { print STDERR "1\n"; }
      } elsif ($words[$i] =~ /^([^$char\-])(.+)$/ and
               ($1 ne "'" or $2 !~ /^([mns]|[0-9]+)$/)) {
         # RULE 2: remove punctuation sign prefix (includes period)
         $words[$i] =~ s/^([^$char\-])//;
         splice(@words,$i,0,$1);
         if (defined $opt_d) { print STDERR "2: $1 $words[$i]\n"; }
      } elsif ($words[$i] =~ /.[^$char\-\.]$/) {
         # RULE 3:remove punctuation sign suffix (not including period)
         $words[$i] =~ s/([^$char\-\.])$//;
         splice(@words,$i+1,0,$1);
         if (defined $opt_d) { print STDERR "3: $words[$i] $1\n"; }
      } elsif ($words[$i] =~ /^(.+)\.$/ and 
               not &isAbbrev($words[$i])) {
         # RULE 4: remove period suffix
         $words[$i] =~ s/([\.])$//;
         splice(@words,$i+1,0,$1);
         if (defined $opt_d) { print STDERR "4: $words[$i] $1\n"; }
      } elsif ($words[$i] =~ /.([^$char\-\.\'])([\'\"$char].*)$/ and 
#              ($1 ne "'" or $2 !~ /^[mns]$/) and
               not &isAbbrev($words[$i]) and
               $words[$i] =~ /[$caps$lower]/) {
         # RULE 5: split words with internal punctuation (not including period)
         $words[$i] =~ s/^(.+)([^$char\-\.])([\'\"$char].*)$/$3/;
         splice(@words,$i,0,$1,$2);
         if (defined $opt_d) { print STDERR "5: $1 $2 $words[$i]\n"; }
      } elsif (defined $opt_s and
               not &isAbbrev($words[$i]) and
               $words[$i] =~ /^([$caps$lower\.\-]+\.)./ and
               &isAbbrev("$1")) {
         # RULE 6: split words with preceding abbreviation
         $words[$i] =~ s/^($1)//;
         splice(@words,$i,0,$1);
         if (defined $opt_d) { print STDERR "6: $1 $words[$i]\n"; }
      # no comma here: Boer,v.Bronckhorst,Seedorf,Cocu,Davids
      } elsif (defined $opt_s and
               $words[$i] !~ /^([$caps]\.)+$/ and
               $words[$i] =~ /.[\.!?:;][$char][^\.]*$/ and 
               not &isAbbrev($words[$i]) and
               $words[$i] =~ /[$caps$lower]/ and $words[$i] !~ /www/i) {
         # RULE 7: split words with internal punctuation (including period)
         $words[$i] =~ s/^(.+[\.!?:;])([$char][^\.]*)$/$2/;
         splice(@words,$i,0,$1);
         if (defined $opt_d) { print STDERR "7: $1 $words[$i]\n"; }
      } elsif ($i > 1 and $words[$i] =~ /^[\.?!\*]+$/ and
               $words[$i-1] eq "\n" and $words[$i-2] =~ /^[\.?!\*]+$/) {
         # RULE 8: combine punctuation in non-opt_n mode
         $words[$i-2] .= $words[$i];
         splice(@words,$i,1);
         if (defined $opt_d) { print STDERR "8: $words[$i]\n"; }
      } elsif ($i > 0 and $words[$i] =~ /^[\.]+$/ and
               $words[$i-1] =~ /^[\.]+$/) {
         # RULE 9: combine punctuation in opt_n mode
         $words[$i-1] .= $words[$i];
         splice(@words,$i,1);
         if (defined $opt_d) { print STDERR "9: ",$words[$i-1],"\n"; }
      } else {
         if (not defined $opt_n and $words[$i] =~ /^[\.!?]+$/) {
            # RULE 10: add newline behind punctuation
            splice(@words,$i+1,0,"\n"); 
            if (defined $opt_d) { print STDERR "10: $words[$i]\n"; }
         }
         $i++; 
      }
   }
   if ($#words < 0) { $words[0] = "\n"; }
   elsif ($#words > 1 and $words[$#words] =~ /^[0-9]+([\/\-,][0-9]+)?$/ and
          $words[$#words-1] eq "\n") {
      # delete sentence boundary before final number on line (tt index page)
      splice(@words,$#words-1,1);
   }
   # do not place a " in a sentence by itself
   if ($#words > 1 and $words[$#words] eq "\"" and 
       $words[$#words-1] eq "\n") {
      splice(@words,$#words-1,1);
   }
   # allow period after line-initial number
   if ($#words >= 2 and $words[0] =~ /^[0-9]+/ and $words[1] eq "." and 
       $words[2] eq "\n") {
       $words[0] .= $words[1];
       splice(@words,1,2);
   }    
   # show line
   for ($i=0;$i<=$#words;$i++) {
      if ($lastWord ne "\n" and $words[$i] ne "\n") { print " "; }
      if ($lastRealWord =~ /.-$/ and $words[$i] =~ /^(en|of)$/i) {
         # do not combine word ending on hyphen with conjunction
         print " ";
      }
      if ($lastWord ne "\n" or $words[$i] ne "\n") { 
         if (defined $opt_x and $lastWord eq "\n") { print "<s>\n"; } 
         print $words[$i];
         if (defined $opt_x and $words[$i] eq "\n") { print "</s>\n"; } 
      }
      $lastWord = $words[$i];
      if ($words[$i] ne "\n") { $lastRealWord = $words[$i]; }
      # do not add a space after a line-final word ending with a hyphen
      # if ($words[$i] =~ /.-$/ and $i == $#words) { $lastWord = "\n"; }
   }
}
if ($lastWord ne "\n") { print "\n"; }
exit(0);

# abbreviations
# 1. one capital followed by a period
# 2. sequences of single character - period pairs
# 3. strings finishing with str (street) followed by a period
#    strings containing only consonants followed by a period
# 4. strings contained in abbreviation lexicon
#
sub isAbbrev {
   my ($lcw,$w);

   $w = shift(@_);
   $lcw = lc($w);
   # 1. single caps plus period
   # 2. sequence of char (+h?) plus period (Th/Ph)
   # 3. word ending on str (street) plus period
   # 4. consonants followed by period
   if ($w =~ /^[$caps]\.$/ or
       $lcw =~ /^[$lower$caps][hHjJ]?\.([$lower$caps][hHjJ]?\.)+$/ or
       $lcw =~ /str\.$/ or
       ($lcw =~ /^[b-df-hj-np-tv-xz]+\.$/ and $w =~ /[a-z]/) or
       defined $abbrev{$lcw}) { return(1); }
   else { return(0); }
}

sub readAbbrev {
   my ($abbrevFile,$line);

   $abbrevFile = shift(@_);
   open(INFILE,$abbrevFile) or die "cannot open abbrev file $abbrevFile\n";
   while(<INFILE>) {
      $line = $_;
      chomp($line);
      $line =~ s/^\s*//;
      $line =~ s/\s*$//;
      $abbrev{lc($line)} = 1;
   }
   return();
   close(INFILE);
}
