#!/usr/bin/perl # Usage: randsent [-t] [-T] [-n num-sentences] [-s start-symbol] grammarfiles... # Type "randsent -?" for details. # # Author: Jason Eisner , 2001-09-08. # # Modifications by Noah A. Smith 6/17/02-6/30/02. # + Ignores lexicalizations in []. # + Can specify start symbol on command line. # + Logs generated "S1"s. # - to a new file each time! # + Can generate multiple sentences on one call. # + Grammar files given on command line (not piped in) # + Calls check-for-new-terms which verifies that no terminal vocab # items have been added. # # Modifications by Jason Eisner 1/16/02. # + Removed Noah's modifications that were only for purpose of WS02 lab: # - Changed default top symbol from S1 back to START. # - Turned off sentence logging via $log (but left the code in). # - Removed redundant & undocumented sentence logging via $sentfile. # - Removed call to check-for-new-terms. # + Added -T option for automatic prettyprinting. # + Improved usage message slightly. # + Usage message will be printed if arguments are bad. use strict 'vars'; # Process options. my($usage, $tree, $pretty, $N, $topsym, $log); $usage = "randsent [-t] [-T] [-n num-sentences] [-s start-symbol] grammarfiles... -t: output full parse trees -T: output full parse trees, prettyprinted -n: specify how many sentences to generate (default 1) -s: specify the start symbol (default START) Grammar files are concatenated; if none, grammar is read from stdin. For grammar file format and examples, see http://www.cs.jhu.edu/~jason/465/hw1/hw1.pdf\n"; $tree = 0; $pretty = 0; $N = 1; $topsym = "START"; $log = 0; # if turned on, will keep a record of the generated sentences (not trees) in a hidden file while (@ARGV and $ARGV[0] =~ /^\-./) { my $flag = shift(@ARGV); if ($flag eq "-t") { $tree = 1 } elsif ($flag eq "-T") { $tree = 1; $pretty = 1 } elsif ($flag eq "-n") { defined($N = shift(@ARGV)) || die $usage } elsif ($flag eq "-s") { defined($topsym = shift(@ARGV)) || die$usage } else { die $usage } # bad flag, including -help or -? } open(LOG, ">.randsentlog.$$") if($log); # Read grammar. my(%rules, %totalweight); while (<>) { print C; s/\#.*//; # kill comments (including end-of-line comments) s/\[.*?\]//g; # kill lexicalizations (NAS 6/17/02) next unless /\S/; # skip blank lines my(@rule)=split; die "Weights must be positive" unless $rule[0] > 0; push(@{$rules{$rule[1]}}, \@rule); # add to list of rules whose LHS=$rule[1] $totalweight{$rule[1]} += $rule[0]; # each LHS maintains total weight of rules on its list } close C; # Generate a single sentence. # Maybe pipe it through prettyprint, depending on command-line opts. if ($tree && $pretty) { open(OUT, "| prettyprint") || open(OUT, "| ./prettyprint") || die "$0: Couldn't open pipe to \"prettyprint\" script (for -T option)\n"; } else { open(OUT); } die "Error: $topsym not in the grammar.\n" unless defined $rules{$topsym}; while($N > 0){ printrand($topsym); print OUT "\n"; print LOG "\n" if($log); $N--; } close(OUT); if ($log){ close(LOG); system("chmod a+r .randsentlog.$$"); } ###################################################################### # printrand("FOO") prints a random FOO-rooted sentence (or tree, if $tree) # to OUT. sub printrand { my ($root) = @_; if (!defined $rules{$root}) { # terminal symbol print OUT $root, " "; print LOG $root, " " if($log); } else { # nonterminal symbol my(@rules) = @{$rules{$root}}; # possible rewrites my($r) = rand(0) * $totalweight{$root}; # choose random number $r my(@rule); die "oops" unless $r >= 0; while ($r >= 0) { # pick rewrite matching $r die "oops" unless @rules; @rule = @{pop(@rules)}; $r -= shift(@rule); } die "oops" unless $root eq shift(@rule); print OUT "($root " if $tree; # print tree structure map printrand($_), @rule; # recurse to kids print OUT ")" if $tree; # print tree structure } }