#!/bin/sh
# makeClusterData: convert a text into a contingency table for characters
# 
# Texts normally contain a lot of noise. This is information that we
# would like to filter out. We are interested in clustering characters.
# The difference between capitals and lower case characters is not
# important so we will translate all capitals to lower case characters 
#
tr 'A-Z' 'a-z' |
#
# We are only interested in the characters and not in punctuation
# marks and things like that. We will replace all these characters
# with new lines and end up with a file containing one word per line.
# Remark: [:lower:] is an abbreviation for the class of lower case
#         characters and [\012*] is a list of new lines. This tr
#         command will replace all the characters that are not in
#         the lower class (option -c = complement) with new lines
#         and will squeeze (option -s) sequences of more than one
#         new line into one new line.
#
tr -sc '[:lower:]' '[\012*]' |
#
# Just like in the previous exercise we will send the data to awk and
# for this purpose it is nice to have only one character per line. So
# we will run the text through addNewlines (a helper script that must
# be present in the current directory).
#
./addNewlines |
#
# awk will count all bigrams and print the result. It builds two
# tables: a table containing unigram data (unigramTable) and a table
# containing bigram data (bigramTable). Missing entries count as zero.
# For each input line we increment bigramTable[prevChar, thisChar],
# so the first index is the previous character and the second index
# is the current one. Collecting the unigram data is necessary for
# being able to print the data: its keys are the row and column labels.
#
# Output of awk: first a line with the number of distinct symbols,
# then one line per symbol holding the symbol followed by the counts
# bigramTable[symbol, other] for every other symbol.
#
# Note on prevChar: this script used to say prevChar='X' here, but
# quote characters placed inside the single-quoted awk program end
# the shell quoting, so awk received the unset variable X, that is
# the empty string. The empty string is also what $1 holds for a
# blank input line, so the first symbol of the text is counted as
# following a blank line. That value is now written out explicitly
# so the printed table is unchanged.
#
awk '
   BEGIN { prevChar = ""; count = 0 }
         {
           # a symbol seen for the first time adds one to the alphabet size
           if (!($1 in unigramTable)) count++
           unigramTable[$1]++
           bigramTable[prevChar, $1]++
           prevChar = $1
         }
   END   {
           printf "%d\n", count
           for (rowKey in unigramTable)
           {
              printf "%c ", rowKey
              for (colKey in unigramTable)
                 printf "%d ", bigramTable[rowKey, colKey]
              printf "\n"
           }
         }
   ' |
#
# The resulting data still contains some extra characters, for
# example the space: a row whose symbol is empty starts with a space.
# We will replace that leading space character with a capital S.
#
sed 's/^ /S /' |
#
# Sometimes the character ] stays in as well! I do not know how to
# prevent this from happening.
#
# The output of awk contains all contingency table elements for one
# item on one line. The clustering program needs input with one value
# per line. We will use tr to achieve this (addNewlines does not work
# here because it will create empty lines). tr will replace every
# space by a new line and squeeze (option -s) sequences of new lines
# into one new line.
#
tr -s ' ' '\012'
#
# There is more information available about the commands tr, sed
# and awk in the manual pages which are accessible by typing info 
# at the prompt and searching for tr, sed or awk.
#