#!/bin/sh
# checkWords: check isolated words against a dictionary
# usage:      checkWords < wordlist
# note:       this program expects a word list as input.
# 960312 erik.tjong@ling.uu.se

# This is the location of the dictionary: a word list with one word per
# line. Set CHECKWORDS_DICTIONARY in the environment to check against
# another word list; when it is unset or empty the default Swedish list
# below is used.
DICTIONARY="${CHECKWORDS_DICTIONARY:-/usr/users/staff/erikt/P/st97/lrt/words.swedish}"

# We assume that the input is a word list: one word per line. We will
# sort it and remove all the duplicate words. sort -u does exactly that.
# -u stands for unique: remove the duplicates.
#
# A line that ends in | is continued by the next command, so the comment
# lines below can sit between the stages of the pipeline.
#
sort -u |
#
# The list of words may also include numbers. We do not want to
# include them in the spelling checker so we will remove the items
# without "normal" characters from the list.
# The command grep keeps every line that contains at least one character
# from the list A-ZÅÄÖÉa-zåäöé. Lines without any such character (for
# example "1997") are dropped; a word that mixes letters with other
# characters (for example "3D") is kept.
#
grep '[A-ZÅÄÖÉa-zåäöé]' |
#
# Now we are ready to compare the words in the list with the words in
# the dictionary. This dictionary is by no means complete so do not
# expect perfect results!
# The command comm takes two SORTED lists as input and produces a three
# column output:
#    1. a column with words appearing only in file 1
#    2. a column with words appearing only in file 2
#    3. a column with words appearing in file 1 and file 2
# file 1 will be the list of words and file 2 the dictionary list.
# We are only interested in the first column so we prevent comm from
# printing the other two: -23 means don't print second and third column,
# - means use the standard input as first file.
# NOTE: the dictionary must be sorted in the same order that sort uses
# for the word list (this depends on the current locale), otherwise comm
# cannot line the two lists up correctly.
# The dictionary name is quoted so that a path containing spaces or
# wildcard characters is passed to comm as one single file name.
#
comm -23 - "$DICTIONARY"
#
# Now we have a list with words that appear in the file but not in the
# dictionary. We can treat these as misspellings.
#
# There is deliberately no "exit 0" here: the exit status of the script
# is the exit status of comm, the last command of the pipeline. A missing
# or unreadable dictionary is therefore reported to the caller as a
# failure instead of being hidden behind a success status.
