#!/bin/ksh
# This is an example Perl program which can be used by people with
# programming experience in PASCAL, C, C++ or PERL for writing
# software for exercise 1.4 of the course Språkstatistik HT96 and
# HT97, Department of Linguistics, University of Uppsala
# 960927,971009 erik.tjong@ling.uu.se
#
# Input  (standard input): a list of bigrams, one per line, the two
#         words separated by white space (the output of the paste
#         command).
# Output (standard output): for every counted bigram whose first word
#         is one of the two search words, the count of that bigram and
#         the count of the bigram with the other search word in front
#         of the same second word, followed by a separator line.

# Define the two words we are looking for: you need to change this!
# The words are defined in this one place only; the script hands them
# to grep and to the Perl code below.
word1="word1"
word2="word2"

# Insert bigram corpus generation commands here, each finished by | ,
# or pipe the output of those commands to this script.

# IMPORTANT: the list of bigrams is too large to run tests on: the
# expected run time is larger than an hour. So we will run the script
# only on the lines containing one of the two words we are working on.
# The -F option makes grep treat the words as plain text, which is how
# the Perl code compares them as well.
grep -F -e "$word1" -e "$word2" |
perl -e '
### Start Perl script

### This script will read bigrams from standard input and display
### the counts for the words that we are interested in.
### NOTE: this program is enclosed in shell single quotes, so the
### Perl code and its comments must not contain a single quote.

use strict;
use warnings;

### The two search words are passed as command line arguments by the
### shell code that starts this script.
die "usage: this script needs exactly two search words\n"
    unless @ARGV == 2;
my ($word1, $word2) = @ARGV;

### Perl hashes have one index only, so a bigram is stored under one
### index string that contains both words with a separator in between.
### The words are split on white space and therefore can never contain
### a tab, which makes the tab a separator that cannot be confused
### with a part of a word.
my $separator = "\t";

### The bigram counts: $bigrams{$index} is the number of times the
### bigram with index string $index has been seen.
my %bigrams;

### Repeat the first group of commands for every input line. The
### <STDIN> means read a line from standard input and the while
### statement will succeed as long as there are lines left to process.
while (my $line = <STDIN>) {

    ### Split the line into a list of words called @words. The first
    ### word will be in $words[0] and the second one in $words[1].
    ### Splitting on " " means: skip white space at the start of the
    ### line and use non-empty sequences of white space (spaces, tabs
    ### and new lines) as word boundary.
    my @words = split " ", $line;

    ### A line with fewer than two words is not a bigram: skip it.
    next if @words < 2;

    ### Every count starts at zero, so adding one gives a count of 1
    ### when the bigram is the first of its kind and the old value
    ### plus one when it has occurred earlier.
    $bigrams{ makeIndex($words[0], $words[1]) }++;
}

### At this point we have collected all the bigram counts in %bigrams.
### We go through the counted bigrams and print the count for every
### bigram that has one of the two search words as first word. The
### keys are sorted so that the output order is the same on every run.
foreach my $index (sort keys %bigrams) {

    ### We know the index string of the bigram but we are really
    ### interested in the words that are part of the bigram.
    my $bigramWord1 = getFirst($index);

    ### We are only interested in the bigrams that have one of the
    ### two search words as first word. The || means "or" and the eq
    ### operator tests for string equality.
    next unless $bigramWord1 eq $word1 || $bigramWord1 eq $word2;

    my $bigramWord2 = getSecond($index);

    ### We will print information about the related bigram as well.
    ### For example: if we are looking for "strong" and "powerful"
    ### then we want to print both the counts for "strong tea" and
    ### "powerful tea". The second word is the same for both bigrams
    ### and the first word of the related bigram is the other search
    ### word.
    my $secondBigramWord1 = $bigramWord1 eq $word2 ? $word1 : $word2;
    my $secondBigramIndex = makeIndex($secondBigramWord1, $bigramWord2);

    ### Now we can print the information we were looking for. If the
    ### related bigram did not occur in the input then it has no entry
    ### in %bigrams and we print a count of zero.
    printf "%4d %s %s\n", $bigrams{$index},
        $bigramWord1, $bigramWord2;
    printf "%4d %s %s\n", $bigrams{$secondBigramIndex} || 0,
        $secondBigramWord1, $bigramWord2;

    ### Print a separator between the pairs of bigrams.
    print "************************\n";
}

### The main part of the Perl script finishes here. The remainder of
### the code are the three functions which define the index string
### computations.

### Make an index string by concatenating the two words with the
### separator in between. The period is the string concatenation
### operator. Here @_ is the list of input arguments of the function.
sub makeIndex {
    my ($first, $second) = @_;
    return $first . $separator . $second;
}

### Get the first word of a bigram: everything before the separator
### in an index string.
sub getFirst {
    my ($index) = @_;
    my ($first) = split /\Q$separator\E/, $index, 2;
    return $first;
}

### Get the second word of a bigram: everything after the separator
### in an index string.
sub getSecond {
    my ($index) = @_;
    my (undef, $second) = split /\Q$separator\E/, $index, 2;
    return $second;
}
' "$word1" "$word2"