#!/bin/bash
# Usage: editdist reference_text hypothesized_text
#
# Reads the two files in parallel, one line from each per iteration, and
# prints the edit distance for every pair of lines followed by the total
# over all pairs.
#
# Set environment variable cleanup=0 if you don't want temporary files removed.
# Set environment variable FSTSYMBOLS to a symbol table file if you want to
# parse the files as a sequence of symbols rather than a sequence of bytes.
# NOTE(review): both pipelines below run fstcompilestring with FSTSYMBOLS set
# to the edit symbol table, which overrides any value inherited from the
# environment -- confirm the sentence above still describes the behavior.

# Print a message on stderr and abort with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 2 )); then
  printf 'Usage: %s reference_text hypothesized_text\n' "${0##*/}" >&2
  exit 2
fi

ref=$(readlink -f -- "$1") || die "cannot resolve reference file: $1"
hyp=$(readlink -f -- "$2") || die "cannot resolve hypothesis file: $2"
[[ -r "$ref" ]] || die "cannot read reference file: $1"
[[ -r "$hyp" ]] || die "cannot read hypothesis file: $2"

umask 077   # files in ugrad /tmp dir shouldn't be readable by other students

# Temporary FSTs, re-created for every line pair. They are registered here so
# the EXIT trap can remove them when the script exits, including via die.
tmp_files=()
remove_temp_files() {
  # Same rule as before: remove unless cleanup is set to something other
  # than 1 (unset or empty counts as 1). A non-numeric value makes the test
  # fail, so the files are kept.
  if [ "${cleanup:-1}" -eq 1 ] 2>/dev/null && (( ${#tmp_files[@]} > 0 )); then
    rm -f -- "${tmp_files[@]}"
  fi
}
trap remove_temp_files EXIT

# mktemp instead of a hand-built name: the names keep the old
# "<basename>.<pid>.edit.fst" prefix but end in an unpredictable suffix.
refedit=$(mktemp "/tmp/${ref##*/}.$$.edit.fst.XXXXXX") \
  || die "cannot create temporary file"
tmp_files+=("$refedit")
hypedit=$(mktemp "/tmp/${hyp##*/}.$$.edit.fst.XXXXXX") \
  || die "cannot create temporary file"
tmp_files+=("$hypedit")

# Edit machines were compiled from grammars/editdist.grm .
# Their sym table was extracted with
#    fstsymbols --save_osymbols=../symbol_tables/edit.sym edit1.fst > /dev/null
# dirname (rather than ${0%/*}) also works when $0 contains no slash.
script_dir=$(dirname -- "$0")
edit1="$script_dir/../fsts/edit1.fst"
edit2="$script_dir/../fsts/edit2.fst"
editsym="$script_dir/../symbol_tables/edit.sym"

# Cumulative edit distance stats
total=0
count=0

# The loop is driven by the hypothesis file: only the status of the second
# read decides whether to continue. If the reference file runs out first,
# ref_line is empty for the remaining hypothesis lines.
# NOTE(review): reference lines beyond the end of the hypothesis file are
# never compared -- confirm that is intended.
# The "|| [[ -n ... ]]" keeps a final hypothesis line that has no trailing
# newline (read stores the text but returns non-zero in that case).
while read -r -u3 ref_line; read -r -u4 hyp_line || [[ -n "$hyp_line" ]]; do
  # Compile each line to a string automaton and compose it with an edit
  # automaton. printf with a quoted argument passes the text through
  # verbatim; an unquoted echo would expand globs such as "*" and split words.
  # The perl step replaces non-final newlines with spaces.
  printf '%s\n' "$ref_line" \
    | perl -0777 -pe 's/\n(?=.)/ /gs' \
    | FSTSYMBOLS="$editsym" fstcompilestring \
    | fstcompose - "$edit1" \
    | fstarcsort > "$refedit"
  printf '%s\n' "$hyp_line" \
    | perl -0777 -pe 's/\n(?=.)/ /gs' \
    | FSTSYMBOLS="$editsym" fstcompilestring \
    | fstcompose "$edit2" - \
    | fstarcsort > "$hypedit"

  # Finally, mash the two edited strings together to find the best alignment.
  # The distance is taken from the second tab-separated field of the first
  # output line of fstshortestdistance --reverse.
  editdist=$(fstcompose "$refedit" "$hypedit" \
    | fstshortestdistance --reverse \
    | head -1 \
    | cut -f2)

  count=$((count + 1))
  if [[ "$editdist" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a value with leading zeros is not read as octal.
    total=$((total + 10#$editdist))
  else
    # An empty or non-integer result would otherwise be an arithmetic error.
    printf 'Warning: line %d: edit distance "%s" is not an integer; not added to total\n' \
      "$count" "$editdist" >&2
  fi

  echo "Line $count: edit distance is $editdist"
  # printf rather than echo -e so backslashes in the text are printed as-is.
  printf '\t%s\n\t%s\n' "$ref_line" "$hyp_line"
done 3< "$ref" 4< "$hyp"

# Guard the average: with no lines read, total/count would divide by zero.
if (( count > 0 )); then
  echo "Total edit distance $total over $count lines (about $((total / count)) per line)"
else
  echo "Total edit distance $total over $count lines"
fi