#!/bin/bash
# Usage: make-lm corpus.txt
#
# Builds a backoff n-gram language model from a whitespace-tokenized corpus
# by driving the ngramsymbols / farcompilestrings / ngramcount / ngrammake
# command-line tools.
#
# Outputs, written to the current directory (BASE is the corpus file name
# with a trailing ".txt" removed):
#   BASE.alpha  two tab-separated columns, each kept vocabulary word twice
#   BASE.sym    symbol table produced by ngramsymbols
#   BASE.fst    Kneser-Ney smoothed model produced by ngrammake
#
# Environment (each is used only when not already set by the caller):
#   OOV_SYMBOL        symbol given to ngramsymbols / farcompilestrings for
#                     out-of-vocabulary words
#   order             n-gram order (default 3)
#   num_lines_remove  upper bound on how many singleton words are held out
#                     of the vocabulary (default 100)
#   cleanup           set to 0 to keep the intermediate files (default 1)
#
# Exit status: 0 on success, 1 on bad usage or when any stage fails.

umask 077 # files in ugrad /tmp dir shouldn't be readable by other students

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ $# -ne 1 ]]; then
  die "Usage: $0 corpus.txt"
elif [[ ! -r "$1" ]]; then
  die "Can't read corpus file $1"
fi

orig=$1
fn=$(basename -- "$orig")

# Only a ".txt" extension is stripped; any other name is used unchanged.
file_base=$fn
if [[ "$fn" == *.txt ]]; then
  file_base=${fn%.txt}
fi

RDIR=${0%/*}/.. # parent dir of this script contains resources for this assignment

# NOTE(review): the OOV symbol defaults to the empty string here; confirm that
# is intended (ngramsymbols' own default is believed to be "<unk>").
: "${OOV_SYMBOL=}"
: "${order=3}"
: "${num_lines_remove=100}"
: "${cleanup:=1}"

sym=$file_base.sym

# All intermediate files live in one private directory so that a single trap
# can remove every one of them on any exit path.
tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/${file_base}.$$.XXXXXX") \
  || die "Can't create temporary directory"
if [[ $cleanup != 0 ]]; then
  trap 'rm -rf -- "$tmp_dir"' EXIT
else
  printf 'Keeping intermediate files in %s\n' "$tmp_dir" >&2
fi

word_counts=$tmp_dir/uniqcedwords
singletons=$tmp_dir/singletons
vocab=$tmp_dir/alpha
far=$tmp_dir/far
counts=$tmp_dir/counts

# Get the number of occurrences of each observed word type in the corpus.
# The corpus is fed on stdin so perl never interprets the file name.
perl -p -e 's/\s+/\n/g;' <"$orig" \
  | grep -v '^\s*$' \
  | sort \
  | uniq -c >"$word_counts"
num_unique_words=$(($(wc -l <"$word_counts")))

# Words seen exactly once, in sorted order.
awk '$1 == 1 { print $2 }' "$word_counts" | sort >"$singletons"
num_singletons=$(($(wc -l <"$singletons")))

# Hold out at most num_lines_remove singletons; when there are fewer
# singletons than that, hold out (about) half of them instead.
if ((num_singletons < num_lines_remove)); then
  half=$(echo "0.5 * $num_singletons" | bc)
  printf -v to_remove '%1.0f' "$half"
else
  to_remove=$((num_lines_remove))
fi

# The vocabulary starts with every word seen more than once ...
awk '$1 != 1 { print $2 }' "$word_counts" >"$vocab"

# ... plus the singletons, minus every stride-th one.  With nothing to hold
# out (no singletons, a single singleton, or num_lines_remove <= 0) all
# singletons are kept; computing a stride would divide by zero.
# NOTE(review): the stride is derived from the total number of unique words,
# not the number of singletons, so the number actually dropped is only
# approximately to_remove -- kept as-is to preserve existing models.
if ((to_remove > 0)); then
  stride=$((num_unique_words / to_remove))
  awk -v stride="$stride" 'NR % stride != 0' "$singletons" >>"$vocab"
else
  cat -- "$singletons" >>"$vocab"
fi

# Two-column file listing each vocabulary word next to itself.
paste "$vocab" "$vocab" >"$file_base.alpha"

# Now make the symbol table.
sort "$vocab" | ngramsymbols --OOV_symbol="$OOV_SYMBOL" >"$sym" \
  || die "ngramsymbols failed"

# Create straight-line FSAs from the corpus: one FSA per line!
farcompilestrings --symbols="$sym" --keep_symbols=1 \
  --unknown_symbol="$OOV_SYMBOL" "$orig" >"$far" \
  || die "farcompilestrings failed"

# Count n-grams.
ngramcount --order="${order}" --epsilon_as_backoff=false "$far" >"$counts" \
  || die "ngramcount failed"

# And now make an appropriate model.
ngrammake --method=kneser_ney --backoff=true "$counts" >"$file_base.fst" \
  || die "ngrammake failed"