#!/bin/ksh
### This script will compare two files and count the number of 
### characters in the files that are different.
### Usage: ./count file1 file2
# 951005 erik.tjong@ling.uu.se
# 961016 replaced awk part by perl
# 971015 added extra comments
#
# First we check that the script was invoked correctly. The arguments
# of the script are available in the variables $1 (first argument),
# $2 (second argument) and so on. $0 contains the name of the script.
# We test if $2 is an empty string (zero characters). If that is the
# case we were not given two file names, so we print a usage message
# on standard error and stop with exit status 1.
#
if [ -z "$2" ]
then
   echo "Usage: $0 file1 file2" >&2
   exit 1
fi
#
# Now that we know that the arguments of the script are in $1 and $2,
# we copy these two variables to FILE1 and FILE2. This is not really
# necessary but it improves the readability of the remainder of the
# program.
#
FILE1="$1"
FILE2="$2"
#
# Both arguments must name something we can read. Without this test a
# mistyped file name would only produce an error message from sed,
# after which the script would carry on and print statistics that were
# computed from an empty file. A directory is rejected as well because
# it is readable but cannot be processed as text.
#
for arg in "$FILE1" "$FILE2"
do
   if [ ! -r "$arg" ] || [ -d "$arg" ]
   then
      printf '%s: cannot read file: %s\n' "$0" "$arg" >&2
      exit 1
   fi
done
#
# We need two temporary files for storing intermediate results. We
# will place the names of these files in TMPFILE1 and TMPFILE2. The
# command mktemp creates each file for us and replaces the XXXXXX in
# the template by random characters, so the names are unique and
# cannot be predicted. The files are made in the temporary directory
# ($TMPDIR, or /tmp when that variable is not set) which means that the
# script also works when we may not write in the current directory.
# If a file cannot be created the script stops with exit status 1.
#
TMPFILE1=$(mktemp "${TMPDIR:-/tmp}/count.1.XXXXXX") || exit 1
TMPFILE2=$(mktemp "${TMPDIR:-/tmp}/count.2.XXXXXX") || {
   rm -f -- "$TMPFILE1"
   exit 1
}
#
# Make sure the temporary files are removed however the script ends.
# The EXIT trap runs when the script finishes or calls exit. The
# second trap turns an interrupt (HUP, INT, TERM) into a normal exit so
# that the EXIT trap gets its chance to clean up. rm -f stays silent
# when the files have already been removed.
#
trap 'rm -f -- "$TMPFILE1" "$TMPFILE2"' EXIT
trap 'exit 1' HUP INT TERM
#
# We will compare the characters in the two files. We will do two
# preprocessing steps:
# 1. we will put every character on a separate line
# 2. we will put the two files next to each other.
# These two steps will result in another file which contains the
# first characters of the two files on line one, the second 
# characters on line two and so on. We only have to count the 
# number of different characters on each line of this extra file.
# We are assuming that each n-th character in file 1 corresponds with
# the n-th character in file 2.

### (assignment 2.4)
###
### First we put every character on a separate line. The command sed
### does this in one step and the results are stored in the temporary
### files.
###
### In the sed command the first string between slashes // will be
### replaced by the second string between slashes. A period will
### match any character of a line (but not the newline that ends the
### line). In the replacement pattern the & stands for the text that
### was matched, and a backslash followed by a real line break inserts
### a newline. So the sed command replaces every character with itself
### followed by a newline. Example: the line abc will become the three
### lines a, b and c, followed by one empty line which represents the
### newline that ended the original line.
###
### No special marker character is needed, so the result does not
### depend on the contents or the character encoding of the files.
###
### The function one_char_per_line takes a file name as its argument
### ($1) and writes the one character per line version of that file to
### standard output. The file is passed to sed with < so that a name
### that starts with a - cannot be mistaken for an option.
###
one_char_per_line() {
   sed 's/./&\
/g' < "$1"
}
one_char_per_line "$FILE1" > "$TMPFILE1"
one_char_per_line "$FILE2" > "$TMPFILE2"
#
# We have stored one character per line versions of the text
# in the two temporary files. Now we will combine the two files in one
# file. The command paste will do this for us. paste takes two file
# name arguments. It will take the two first lines of the files and
# put the second one to the right of the first and then do the same
# for the two second lines and so on. So what this command will
# produce is something like "W W[nl]e e[nl]" if the two files started
# with a line with "W" and a line with "e" ([nl] is a new line).
#
paste $TMPFILE1 $TMPFILE2 |\
#
# Note the pipe command behind paste. It will send its output to 
# perl. perl will perform the character counting.
# 
perl -e '

   # We will use two variables for counting in this script:
   # $nbrOfChars contains the total number of characters in the two files
   # and $wrongChars contains the number of wrong characters in the files
   # Both counters will start at value 0 
   #
   $nbrOfChars = 0;
   $wrongChars = 0;

   # Perform an action for every line in the file
   #
   while (<>) {
  
      # $_ is the current item that is being processed; here it is a line
      #
      $line = $_;

      # Extract the characters from the line. We will obtain the
      # character of the $FILE1 in @character[0] and the character of
      # $FILE2 in @character[1]. 
      #
      @character = split(/\s+/,$line);

      ### (assignment 2.3)
      ###
      ### We collect the number of wrong characters in the variable $wrongChars
      ### If the two characters are different then we add 1 to this variable.
      ### @character[0] is a character of the first file and @character[1]
      ### is the corresponding character of the second file.
      ### "ne" means "is not equal to"
      ###
      if ( @character[0] ne @character[1] ) { $wrongChars = $wrongChars+1; }

      ### (assignment 2.3)
      ###
      ### Every line in this file contains a character of the original files.
      ### Since we have processed another character we add 1 to the variable
      ### that contains the total number of characters: $nbrOfChars
      ###
      $nbrOfChars = $nbrOfChars+1;
   }

   # Print the result.
   # %d is a gap in a string which can be filled with a number
   # %5.2f is a gap of length five in a string which can be filled
   # with a number that contains two digits behind the .
   # The gaps in the string will be filled with the values of the
   # variables in the three lines following the printf command
   #
   printf "%d errors %d characters, correct: %5.2f%%\n", 
           $wrongChars,
           $nbrOfChars,
           100*(($nbrOfChars-$wrongChars)/$nbrOfChars);

   # this is where the Perl program ends
   #
   '
#
# We have obtained the results we were looking for. There is no need
# for keeping the temporary files, so we will remove them. The names
# are put between double quotes so that they reach rm unchanged, -f
# keeps rm silent when a file is already gone, and -- tells rm that
# everything that follows is a file name and not an option.
#
rm -f -- "$TMPFILE1" "$TMPFILE2"
#
