#!/bin/bash
# Usage: make-hmm-lm <file>
#
# Create a bigram language model from text, where each
# line of <file> is
#   <word>/<tag>
# e.g.,
#   dog/N
# Every "###" in the extracted word and tag streams is turned into a line
# break, so the dumps hold one ###-delimited sequence per line.
#
# Environment:
#   order  n-gram order handed to ngramcount (defaults to 2 when unset).
#
# Outputs (all generated files are moved into ./datahelp/ and symlinked from
# the current directory):
#   emit.grm               emission table written by make-transitions.pl
#   tags.model             tag language model written by ngrammake
#   words.alpha tags.alpha word / tag alphabets
#   words.sym   tags.sym   symbol tables written by ngramsymbols
#
# Requires: perl, make-transitions.pl in the current directory,
# ngramsymbols, farcompilestrings, ngramcount, ngrammake.

set -euo pipefail

# Keep a caller-supplied order; fall back to bigrams only when it is unset.
: "${order=2}"

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Copy stdin to stdout without whitespace-only lines.
# grep exits 1 when it selects nothing; that is not an error here, but any
# other non-zero status (a real grep failure) is propagated.
drop_blank_lines() {
  grep -v '^\s*$' || [[ $? -eq 1 ]]
}

# Move file $1 into datahelp/ and (re)point symlink $2 at the moved copy.
# mv stores the file under its basename, so the link target must use the
# basename too; otherwise a <file> given with a directory part would leave
# a dangling link.
stash_and_link() {
  local src="$1" link="$2"
  mv -- "$src" datahelp/.
  ln -s -f -- "datahelp/${src##*/}" "$link"
}

main() {
  if (( $# < 1 )) || [[ -z "$1" ]]; then
    printf 'Usage: %s <file>\n' "${0##*/}" >&2
    exit 2
  fi

  local file_base="$1"
  [[ -r "$file_base" ]] || die "cannot read training file: $file_base"
  mkdir -p datahelp

  # Separate out the training data into a word dump ...
  # Lines that are not of the form word/tag are skipped instead of printing
  # whatever the capture variables happen to hold.
  perl -ne 'print "$1 " if /^(.+)\/(.+)$/;' "$file_base" \
    | perl -p -e 'chomp; s/\#\#\#/\n/g;' \
    | drop_blank_lines > "$file_base.words"
  # ... and a tag dump.
  perl -ne 'print "$2 " if /^(.+)\/(.+)$/;' "$file_base" \
    | perl -p -e 'chomp; s/\#\#\#/\n/g;' \
    | drop_blank_lines > "$file_base.tags"

  # From these, minimize the lists to create the alphabets. Each alphabet
  # gets one extra literal entry appended: '" "' for words, '_' for tags.
  perl -p -e 's/\s+/\n/g;' "$file_base.words" | sort | uniq \
    > "$file_base.words.alpha"
  printf '%s\n' '" "' >> "$file_base.words.alpha"
  perl -p -e 's/\s+/\n/g;' "$file_base.tags" | sort | uniq \
    > "$file_base.tags.alpha"
  printf '%s\n' '_' >> "$file_base.tags.alpha"

  # Make the human readable emission table: count each distinct
  # "tag<TAB>word" pair and let make-transitions.pl format the result.
  perl -ne 'print "$2\t$1\n" if /^(.+)\/(.+)$/;' "$file_base" \
    | sort | uniq -c \
    | perl make-transitions.pl "$file_base" > "$file_base.emit.grm"
  stash_and_link "$file_base.emit.grm" emit.grm

  cat -- "$file_base.words" "$file_base.tags" > "$file_base.withSA.txt"

  # Create the symbol tables.
  ngramsymbols < "$file_base.words.alpha" > "$file_base.words.sym"
  ngramsymbols < "$file_base.tags.alpha" > "$file_base.tags.sym"

  # Make bigram tag models: transition probabilities.
  farcompilestrings --symbols="$file_base.tags.sym" --keep_symbols=1 \
    "$file_base.tags" > "$file_base.tags.far"
  ngramcount --order="${order}" "$file_base.tags.far" \
    > "$file_base.tags.counts"
  ngrammake --backoff=true "$file_base.tags.counts" \
    > "$file_base.tags.model"
  mv -- "$file_base.tags.counts" datahelp/.
  stash_and_link "$file_base.tags.model" tags.model

  # Archive the remaining intermediates; only alphabets and symbol tables
  # are linked back into the working directory.
  mv -- "$file_base.words" "$file_base.tags" "$file_base.withSA.txt" \
    "$file_base.tags.far" datahelp/.
  local arg which
  for arg in alpha sym; do
    for which in words tags; do
      stash_and_link "$file_base.$which.$arg" "$which.$arg"
    done
  done
}

main "$@"