#!/bin/ksh
#
# This script will compare two files and count the number of
# characters in the files that are different. This script should be 
# invoked like: count file1 file2
# Erik Tjong Kim Sang (erikt@strindberg.ling.uu.se)  
# Thu Oct  5 13:57:51 DFT 1995
#
# First we will have to check if the script was invoked correctly. The
# arguments of the script will be put in the variables $1 (first
# argument), $2 (second argument) and so on. $0 will contain the name
# of the script. We will test if $2 is an empty string (zero
# characters). If this is the case, the program will stop.
#
# Guard clause: without a non-empty second argument there is nothing to
# compare, so print the usage line on standard error and give up.
[ -n "$2" ] || {
   echo "Usage: $0 file1 file2" >&2
   exit 1
}
#
# Now that we know that the arguments of the script are in $1 and $2,
# we will copy these two variables to FILE1 and FILE2. This is not
# really necessary but it improves the readability of the remainder of
# the program.
#
# Descriptive names for the two positional parameters.
FILE1=${1}
FILE2=${2}
#
# We need two temporary files for storing intermediate results. We
# will place the names of these files in TMPFILE1 and TMPFILE2. The
# names will be count.1 and count.2. We append the process number
# of this script ($$) to assure that the files are unique.
#
TMPFILE1="count.1.$$"
TMPFILE2="count.2.$$"
# Remove the temporary files whenever the script ends, so that they are
# not left behind when the script stops early. The trap text is in
# single quotes, so the variables are expanded when the trap runs.
trap 'rm -f "${TMPFILE1}" "${TMPFILE2}"' EXIT
# On a hangup, interrupt or termination signal we exit with an error
# status; that exit runs the EXIT trap above.
trap 'exit 1' HUP INT TERM
#
# Now we will use the addNewlines script for converting the texts to
# versions with only one character per line. The addNewlines script is
# explained in the script messUp. Note that we can obtain the value of
# a variable in a UNIX script by referring to ${VARIABLENAME}.
#
# Convert both inputs. If a conversion fails (for example because an
# input file cannot be opened or ./addNewlines cannot be run) we stop
# here: comparing incomplete temporary files would give a meaningless
# result. The file names are quoted so that names containing blanks
# are passed on unchanged.
# NOTE(review): this assumes ./addNewlines exits with status 0 on
# success -- confirm against that script.
if ./addNewlines < "${FILE1}" > "${TMPFILE1}" &&
   ./addNewlines < "${FILE2}" > "${TMPFILE2}"
then
   : # both conversions succeeded, carry on
else
   echo "$0: cannot convert ${FILE1} and ${FILE2} with ./addNewlines" >&2
   rm -f "${TMPFILE1}" "${TMPFILE2}"
   exit 1
fi
#
# We have stored one-character-per-line versions of the texts
# in the two temporary files. Now we can start comparing them.
# First we will combine the two files into one stream of lines. The
# command paste will do this for us. paste takes two file name
# arguments. It will take the first lines of the two files and
# concatenate them, separated by a tab character, and do the same for
# the two second lines and so on. So what this command will produce is
# something like "W[tab]W[nl]e[tab]e[nl]" if the two files started
# with a line with "W" and a line with "e" ([tab] is a tab character
# and [nl] is a new line).
#
# The output of paste is sent through a pipe (|) to another command:
# awk. This command will perform the counting. awk initializes its
# variables count and error at zero and increases count for every line
# of input (no pattern or condition specified), but increases error
# only if the first word in the input ($1) is different from the second
# word ($2). Appending "" to both words forces awk to compare them as
# strings; it would otherwise compare numeric-looking words as numbers.
# When all lines have been read, awk prints the number of errors
# (error), the number of characters (count) and the error percentage.
# If there was no input at all (count is zero) the percentage is
# reported as 0 instead of dividing by zero.
#
paste "${TMPFILE1}" "${TMPFILE2}" |
 awk '
    BEGIN    { count=0; error=0 }
             { count++ }
    ($1 "") != ($2 "") { error++ }
    END      { rate = 0
               if (count > 0) rate = 100*error/count
               printf "%d errors in %d characters, error rate: %5.2f%%\n" ,
                       error, count, rate }
 '
#
# We have obtained the results we were looking for. There is no need
# to keep the temporary files, so we will remove them.
#
# Delete both intermediate files. The names are quoted so that they are
# always passed to rm as exactly two arguments.
rm "${TMPFILE1}" "${TMPFILE2}"
#
# More information about the awk and paste commands is available in
# the manual pages, which are accessible by typing info at the prompt
# and searching for awk or paste.
#
