#!/bin/ksh
# This is an example Perl program which can be used by people with
# programming experience in PASCAL, C, C++ or PERL for writing
# software for exercise 1.4 of the course Språkstatistik HT96 and
# HT97, Department of Linguistics, University of Uppsala 
# 960927,971009 erik.tjong@ling.uu.se

# The Perl script expects as input a list of bigrams, one bigram per
# line (the output of the paste command). Either insert the bigram
# corpus generation commands here, finished by |\, or pipe the output
# of those commands to this script.

# IMPORTANT: the complete list of bigrams is too large to run tests on:
# the expected run time is more than an hour. So we run the Perl script
# only on the bigrams that contain one of the two words we are working
# on. grep is given no file name here, so it reads the bigram list from
# standard input and passes on every line that matches either pattern.
# You need to change this command: replace word1 and word2 by your own
# two search words.
grep -e "word1" -e "word2" |\
# Now we have selected the lines containing word1 or word2.
# Please note that unfortunately these two words have to be specified
# at two places in the script: here and in the Perl code below; sorry.
# The grep patterns match anywhere in a line, so lines with longer words
# that contain a search word get through as well; the Perl code below
# compares complete words.

# Start of the Perl script: everything up to the closing quote is passed
# to perl as the program text.
perl -e '
   ### Start Perl script
   ### This script reads bigrams (one per line) from standard input
   ### and displays the counts for the words that we are interested in.

   ### Define the words we are looking for: you need to change this!
   ### Remember that the words have to be specified in the grep part
   ### as well.
   $word1="word1";
   $word2="word2";

   ### Repeat the following group of commands for every input line.
   ### The <> means read a line from standard input; the while
   ### statement succeeds as long as there are lines left to process.

   while (<>) {

      ### Split the current line into a list of words called @words.
      ### The first word will be in $words[0] and the second one in
      ### $words[1]. The " " pattern means: use sequences of white
      ### space (spaces, tabs and new lines) as word separators and
      ### ignore white space at the start of the line. (The pattern
      ### /\s+/ would return an empty first word for a line that
      ### starts with white space.) The $_ stands for the current
      ### text bit that is being processed (a line).

      my @words=split(" ",$_);

      ### A bigram needs two words. Skip empty lines and lines with
      ### only one word (for example a last line on which the second
      ### column is empty) so that they are not counted as bigrams
      ### with an empty second word.

      next if @words < 2;

      ### We want to store the number of occurrences of a bigram in a
      ### table with two indices: bigrams[word1,word2]. We do this
      ### with a hash (%bigrams) which takes one string as index.
      ### This means that we need to convert the two words to one
      ### index string first and then use this string as an index
      ### for $bigrams{$index}. The makeIndex function will do that
      ### for us.

      my $index=makeIndex($words[0],$words[1]);

      ### Now that we have an index string we can store the count in
      ### $bigrams{$index}. Every hash entry starts out empty, which
      ### counts as zero, so if we add one to $bigrams{$index} we get
      ### a count of 1 when the bigram is the first of its kind. If
      ### it has occurred earlier we store the old value plus one.

      $bigrams{$index}++;

      ### We should repeat these lines of code for every bigram
      ### (=line in the text). The while loop will take care of that.

   }

   ### At this point we have collected all the bigram counts in the
   ### hash %bigrams, indexed by index string. We go through this hash
   ### and print the count for every bigram that has one of the two
   ### search words ($word1 and $word2) as first word. We print the
   ### count for the corresponding bigram with the other search word
   ### as well.

   ### The foreach statement visits the index strings (keys) of the
   ### counted bigrams that have been stored in %bigrams. Perl hands
   ### out hash keys in no particular order, so we sort them: this way
   ### the output is the same every time the script is run on the
   ### same input.

   foreach my $index (sort keys %bigrams) {

      ### We know the index string of the bigram but we are really
      ### interested in the words that are part of the bigram. There
      ### are two functions available for computing these words:
      ### getFirst will get the first word of the bigram and
      ### getSecond will retrieve the second one.

      my $bigramWord1=getFirst($index);

      ### We are only interested in the bigrams that have one of the
      ### two search words $word1 or $word2 as first word. So we will
      ### perform a test to see if that is the case. The || in the
      ### if-statement means "or". The eq operator tests for string
      ### equality.

      if (($bigramWord1 eq $word1)||($bigramWord1 eq $word2)) {

         ### At this point we know that the bigram starts with one of
         ### the two search words. Now we retrieve the second word of
         ### the bigram as well.

         my $bigramWord2=getSecond($index);

         ### We will print information about the related bigram as
         ### well. For example: if we are looking for "strong" and
         ### "powerful" then we want to print both the counts for
         ### "strong tea" and "powerful tea". The second word is
         ### the same for both bigrams; the first word of the second
         ### bigram is the search word that this bigram does not
         ### start with.

         my $secondBigramWord1;
         if ($bigramWord1 eq $word1) { $secondBigramWord1=$word2; }
         else                        { $secondBigramWord1=$word1; }

         ### We know both the first and the second word of the second
         ### bigram. In order to address it in the bigram count
         ### hash we will have to compute its index string.

         my $secondBigramIndex=makeIndex($secondBigramWord1,$bigramWord2);

         ### Now we can print the information we were looking for.
         ### If the second bigram did not occur in the input then it
         ### has no entry in %bigrams and we print a count of zero.
         ### Note that when both bigrams occur in the input the pair
         ### is printed twice: once for each of the two bigrams.

         printf "%4d %s %s\n",$bigrams{$index},
                              $bigramWord1,$bigramWord2;
         printf "%4d %s %s\n",$bigrams{$secondBigramIndex}||0,
                              $secondBigramWord1,$bigramWord2;

         ### print a separator between the pairs of bigrams

         print "************************\n";
      }

      ### We do not need to do anything if the bigram does not start
      ### with one of the two search words. However we need to repeat
      ### these commands for all counted bigrams. The foreach
      ### statement will take care of that.

   }

   ### The main part of the Perl script finishes here. The remainder of
   ### the code consists of the three functions which define the index
   ### string computations.

   ### Make an index string by concatenating the two words separated
   ### by !!!. The period is the string concatenation operator. Here
   ### @_ is the list of input arguments of the sub-routine.
   ### Input:  the first and the second word of a bigram.
   ### Output: the index string "first!!!second".
   ### The arguments are copied into private (my) variables, so the
   ### search words $word1 and $word2 of the main script are never
   ### touched while the function runs.

   sub makeIndex {
      my ($first,$second)=@_;
      return $first . "!!!" . $second;
   }

   ### Get the first word of a bigram: everything in front of the
   ### first !!! in an index string.
   ### Input:  an index string as made by makeIndex.
   ### Output: the part before the first !!!; an index string that
   ###         contains no !!! is returned unchanged.

   sub getFirst {
      my ($string)=@_;

      ### index returns the position of the first !!! in the string,
      ### or -1 when the string does not contain !!! at all.
      my $separatorAt=index($string,"!!!");
      return $string if $separatorAt < 0;

      return substr($string,0,$separatorAt);
   }

   ### Get the second word of a bigram: everything behind the first
   ### !!! in an index string.
   ### Input:  an index string as made by makeIndex.
   ### Output: the part behind the first !!!; an index string that
   ###         contains no !!! is returned unchanged.
   ### The index string is cut at the same (first) !!! that getFirst
   ### uses, so a second word which itself contains !!! is returned
   ### completely and the two parts always add up to the index string.

   sub getSecond {
      my ($string)=@_;

      ### index returns the position of the first !!! in the string,
      ### or -1 when the string does not contain !!! at all.
      my $separatorAt=index($string,"!!!");
      return $string if $separatorAt < 0;

      return substr($string,$separatorAt+length("!!!"));
   }
   ### End of the Perl program. The quote character at the end of this
   ### line closes the shell string that holds the program: '