# Support for thesis section on how to get arc competition to yield
# a Zipf-like distribution.

source("utils.R")

# Simulated corpus data generated from a multinomial.
# counts(k,m) takes a k-sample from the multinomial m and returns a
# sorted vector of the positive counts.
counts <- function(tokens,typeprobs) {
  if (any(!is.finite(typeprobs))) { errtypeprobs <<- typeprobs; stopifnot(all(is.finite(typeprobs))); }
  # generate word tokens from the distribution over word types
  sample <- sort(fastestsamplerepl(seqa(typeprobs), tokens, prob=typeprobs));
  # find the indices where a new word starts being generated
  changeindices <- (1:tokens)[sample[1:tokens] > c(0,sample[0:(tokens-1)])];
  # lengths of the same-word runs (when > 0), from largest to smallest
  rev(sort(diff(c(changeindices,tokens+1))))
}

# ------------------------------------------------
# let's look at some weight density functions. (this section is skippable)

# generate (smallish) artificial corpora from each density
# (3 corpora for the perfect zipf, 15 for each of the others)
zipf <- repl(3,counts(1e5,1/(1:1e4)))           # generate from perfect zipf
expzipf <- repl(15,counts(1e5,exp(rexp(1e4))))  # competing words whose weights come from an exp distrib
randsign <- function(k) { rbinom(k,1,.5)*2-1 }  # utility fn
symmexpzipf <- repl(15,counts(1e5,exp(rexp(1e4)*randsign(1e4))))  # weights from a reflected exp distrib
smallzipf <- repl(15,counts(1e5,exp({x <- seq(-15,15,length=1000);
  fastestsamplerepl(x,1e4,replace=TRUE,prob=exp(-abs(x)**1.1))})))      # weights come from exp(-abs(x)**1.1)
smalltiltzipf <- repl(15,counts(1e5,exp({x <- seq(-15,15,length=1000);
  fastestsamplerepl(x,1e4,replace=TRUE,prob=exp(-abs(x)**1.1/1.1))})))  # weights come from exp(-abs(x)**1.1/1.1)
normzipf <- repl(15,counts(1e5,exp({x <- seq(-15,15,length=1000);
  sample(x,1e4,replace=TRUE,prob=exp(-abs(x)**2))})))                   # weights come from exp(-abs(x)**2)

# compare them. For each density, look at the mean and stdev across the
# corpora of various measures.
mapcounts <- function(f,...) {
  list(zipf=sapply(zipf,f,...), expzipf=sapply(expzipf,f,...),
       symmexpzipf=sapply(symmexpzipf,f,...), smallzipf=sapply(smallzipf,f,...))
}
# fraction of types that appear <= k times, among those that appear at least once
smallfrac <- function(cnts,k) { sum(cnts <= k)/length(cnts) }

# average number of types sampled at least once for each density
sapply(mapcounts(length),mean); sapply(mapcounts(length),stdev); boxplot(mapcounts(length));
# average proportion of singletons? (this is indeed about the same for expzipf and symmexpzipf)
sapply(mapcounts(smallfrac,1),mean); sapply(mapcounts(smallfrac,1),stdev); boxplot(mapcounts(smallfrac,1));
# average proportion of singletons+doubletons?
sapply(mapcounts(smallfrac,2),mean); sapply(mapcounts(smallfrac,2),stdev); boxplot(mapcounts(smallfrac,2));
# average frequency of the most frequent word
sapply(mapcounts(max),mean); sapply(mapcounts(max),stdev); boxplot(mapcounts(max));
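# ----------------------------------------------------------------------
# For reference: seqa, repl, fastestsamplerepl, stdev, and map come from
# utils.R, which isn't shown here. A minimal sketch of plausible stand-ins,
# assuming the obvious semantics (these are guesses, not the originals, and
# would of course need to exist before the code above runs):
if (!exists("seqa")) seqa <- function(x) seq_along(x)   # 1,2,...,length(x)
if (!exists("repl"))   # n independent re-evaluations of expr, returned as a list
  repl <- function(n,expr) eval.parent(substitute(replicate(n,expr,simplify=FALSE)))
if (!exists("fastestsamplerepl"))   # presumably a fast sample-with-replacement
  fastestsamplerepl <- function(x,size,replace=TRUE,prob=NULL) sample(x,size,replace=TRUE,prob=prob)
if (!exists("stdev")) stdev <- function(x) sd(x)
if (!exists("map")) map <- function(x,f,...) sapply(x,f,...)
# (repl forwards the unevaluated expression via substitute/eval.parent; a
# naive replicate wrapper would force the promise once and return n copies
# of a single sample.)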
# all the measures, not just their mean & stdev
mapcounts(length)        # number of types sampled at least once for each density
mapcounts(smallfrac,1)   # proportion of singletons (indeed about the same for expzipf and symmexpzipf)
mapcounts(smallfrac,2)   # proportion of singletons+doubletons
mapcounts(max)           # frequency of the most frequent word

# plot them
sapply(zipf, function(cnts) {plot(seqa(cnts),cnts,type="l",log="xy")})
sapply(expzipf, function(cnts) {lines(seqa(cnts),cnts,type="l",col=2)})
sapply(symmexpzipf, function(cnts) {lines(seqa(cnts),cnts,type="l",col=3)})
sapply(smallzipf, function(cnts) {lines(seqa(cnts),cnts,type="l",col=4)})

# ----------------------------------------------------------------------
# Now let's look at the actual distribution of tokens in the Brown corpus.
# (don't skip this section, as it reads in the corpus and defines plotbrown)
# Note that we really want 10x as much random data as we generate above to
# match its million words; taking only the first 10th of the corpus doesn't
# get the interesting curvature in the log-log plot.

# cat ~/cs405/hw3/fsm/brown.txt | perl -ne 's{/\S*}{}g; for $w (split) { $c{$w}++ } END { foreach (sort { $c{$b} <=> $c{$a} } keys %c) { print "$_\t$c{$_}\n"; } }' > ~/tmp/browncounts
brownall <- read.table("~/tmp/browncounts",quote="",sep="\t",col.names=c("word","count"))
brown <- brownall$count

plotbrown <- function(scale=1,...) {
  freq <- brownall$count
  rank <- seq(freq)/scale
  plot(rank,freq,log="xy",xlab="rank of frequency",ylab="frequency",...)  # shows curvature:
  # the exponent is actually about -1.29 here (this doesn't show up if we
  # only look at the first 10% of the corpus)
  ls <- lsfit(log10(rank),log10(freq))
  abline(ls); print(ls$coefficients)
  # but for frequent words, the exponent is -1.04, so it's just that the tail bends down
  lsearly <- lsfit(log10(rank[1:1000]),log10(freq[1:1000]))
  abline(lsearly); print(lsearly$coefficients)
}
plotbrown()

# Add labels of some typical words
id <- list(ind=c(1,2,3,8,29,41,86,168,390,578,1259,2412,5874,13029,21941,29716),
           pos=c(rep(1,3),4,rep(2,11)))
# id was originally obtained interactively by:
#   id <- identify(seqa(brown),brown,brownall$word,pos=TRUE)
#   id$pos[4:length(id$pos)] <- 2   # make all labels appear on the left, except the first few
text(seqa(brown)[id$ind],brown[id$ind],labels=as.character(brownall$word[id$ind]),pos=id$pos,offset=1)

# ----------------------------------------------------------------------
# Now try to fit the Brown data manually. (this section is skippable)

linecounts <- function(cnts,col=1) { lines(seqa(cnts)/length(cnts),cnts,col=col) }
tryman <- function(e,d,col=1) {   # prepends its sampled counts to the global tries
  cnts <- counts(sum(brown),exp({x <- seq(-15,15,length=1000);
    fastestsamplerepl(x,1e5,prob=exp(-abs(x)**e/d))}));
  linecounts(cnts,col);
  tries <<- c(list(cnts), tries)
}
plotbrown(scale=length(brown))   # replot it with the x axis scaled to [0,1]
tries <- list()
tryman(1,1,col=2)     # doesn't quite fit - misses the curvature
tryman(1.2,2,col=1)
tryman(1.3,2.5,col=1)
tryman(1.33,3,col=4)
tryman(1.4,3.5,col=2)
plotbrown(scale=length(brown))   # replot it with the x axis scaled to [0,1]
tries <- list()
repl(4, tryman(1.35,3,col=3))    # this seems to be the best! try it 4 times
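# ----------------------------------------------------------------------
# Aside: an R equivalent of the perl one-liner in the Brown-reading section
# above, in case it's easier to regenerate browncounts from inside R. This
# is a sketch, not the original pipeline; it assumes the same tagged-text
# format (word/TAG tokens separated by whitespace) and the same file paths,
# neither of which is checked here.
makebrowncounts <- function(infile="~/cs405/hw3/fsm/brown.txt",
                            outfile="~/tmp/browncounts") {
  toks <- scan(infile, what="", quote="")     # whitespace-separated tokens
  words <- sub("/.*$","",toks)                # strip everything from the first "/" on, like s{/\S*}{}g
  tab <- sort(table(words), decreasing=TRUE)  # count types, rank by descending frequency
  write.table(data.frame(word=names(tab), count=as.vector(tab)),
              file=outfile, quote=FALSE, sep="\t",
              row.names=FALSE, col.names=FALSE)
}
# makebrowncounts()   # then re-run the read.table above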
# ----------------------------------------------------------------------
# Now try to fit the Brown data automatically.
# (this section is skippable so long as the save at the end has been done previously)

plotbrown()
tries <- list()

# the objective error we're trying to minimize - just the distance between the
# two graphs; we ignore extra singletons that one graph has and the other doesn't
sumsq <- function(x,y) { n <- min(length(x),length(y)); sum((log(x[1:n])-log(y[1:n]))**2) }

sumsqtry <- function(edtypes,col=1) {
  i <<- i+1;
  if (i <= length(tries)) {
    # we've already done this one and are retracing our steps to continue the optimization
    print(c(i,tries[[i]]$edtypes)); print(tries[[i]]$sumsq); return(tries[[i]]$sumsq);
  }
  e <- edtypes[1]; d <- edtypes[2]; types <- length(brown)*20*edtypes[3];  # split the argument (a 3-vector) into named vars
  print(c(length(tries)+1,e,d,types));  # track progress
  x <- seq(-15,15,length=5000);         # will evaluate the density at these points (so these are our possible weights)
  # get weights by sampling, then get a corpus the same size as Brown by sampling the weights
  savecnts <<- cnts <- counts(sum(brown),exp(fastestsamplerepl(x,types,prob=exp(-abs(x)**e/d))));
  r <- sumsq(cnts,brown);   # is this curve a good match for the Brown corpus?
  print(r);
  tries[[i]] <<- list(sumsq=r,e=e,d=d,types=types,edtypes=edtypes);  # remember what we did, since it's slow to generate
  lines(seqa(cnts),cnts,col=col);  # graph it so we can watch the thing chug (a log-log plot of Brown is assumed to exist)
  r;  # return goodness
}
sumsqtry5 <- function(edtypes,col=1) {
  r <- sumsqtry(edtypes,col);
  # if r is close to optimal, we want a more precise value: average it with 4
  # others, also random. This should keep the optimization from straying due
  # to the vagaries of the random generator.
  if (r < 2*best) { r <- mean(c(r, unlist(repl(4,sumsqtry(edtypes,col))))); }
  if (r < best) { best <<- r; }
  print(r);
  r
}

# To start the optimization from scratch:
tries <- list()
# ...or to start by retracing a previous optimization as saved in tries:
load("zipf.optim.R")

best <- 1e6; i <- 0;
optim(c(1.33,3,1),sumsqtry5,control=list(trace=5));  # start given what we already have in tries
save(tries,file="zipf.optim.R")

# let's look at the params from the best *individual* run (not the same as the best mean):
j <- sapply(tries, function (x) x$sumsq)
edtypesbest <- tries[j==min(j)][[1]]$edtypes
# 1.3175 3.2875 0.8875 - achieves 2117.32 on evaluation 42
# Note also that as far as the best mean-of-5 goes, we got a mean of 2562.38 on evals 158-162:
#   1.323410 3.271446 1.031016

# given edtypesbest, let's find the best fit we can over 10 samples
rbest <- 1e6;
for (k in 1:10) { r <- sumsqtry(edtypesbest); if (r < rbest) { rbest <- r; cntsbest <- savecnts; } }
plotbrown(); lines(seqa(cntsbest),cntsbest)
save(edtypesbest,cntsbest,id,file="zipf.cntsbest.R")  # careful not to overwrite!
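# ----------------------------------------------------------------------
# Aside: the objective above is stochastic, which Nelder-Mead (optim's
# default method) tolerates only roughly; the mean-of-5 trick in sumsqtry5
# is one workaround. Another standard trick is common random numbers: fix
# the RNG seed inside the objective so every candidate parameter vector is
# scored on the same draws, making the objective deterministic. A sketch
# (the wrapper name and seed value are arbitrary, not from the original):
sumsqtryfixed <- function(edtypes) {
  saved <- .Random.seed            # don't disturb the global RNG state
  on.exit(.Random.seed <<- saved)
  set.seed(12345)                  # same draws for every candidate edtypes
  e <- edtypes[1]; d <- edtypes[2]; types <- length(brown)*20*edtypes[3];
  x <- seq(-15,15,length=5000);
  cnts <- counts(sum(brown),exp(fastestsamplerepl(x,types,prob=exp(-abs(x)**e/d))));
  sumsq(cnts,brown)
}
# optim(c(1.33,3,1),sumsqtryfixed)   # now an ordinary deterministic optimization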
# ----------------------------------------------------------------------
# Ok, now make and save the final plot.

load("zipf.cntsbest.R")
# make a big graph to be included at half-size; this allows the dots to be small
postscript(file="zipf.eps",onefile=FALSE,horizontal=FALSE,width=13,height=6,paper="special");
layout(cbind(1,2))                  # two side-by-side graphs
par(mar=c(4,4,1,1),pty="s",pch=20)  # trim excess margin; square plots; small filled circle as point
plotbrown(type="l"); points(seqa(brownall$count),brownall$count);
text(seqa(brown)[id$ind],brown[id$ind],labels=as.character(brownall$word[id$ind]),pos=id$pos,offset=1,cex=0.7)
plotbrown(type="l"); points(seqa(cntsbest),cntsbest)
dev.off()

# squish the .ps file by killing off lines (which plot points) that have the
# same y coordinate and almost the same x coordinate as the previously printed
# line. This reduces the filesize by a factor of 50 and makes the picture
# printable without waiting for hours. (Even piping through uniq shrinks it
# by a factor of 4-5, but that's not enough to print.)
system("mv zipf.eps zipf.eps.orig");
system("perl -ane 'if ($F[0] =~ /^\\d+\\.\\d+$/ && $F[1] =~ /^\\d+\\.\\d+$/) { next if $F[1]==$oldF[1] && abs($F[0]-$oldF[0])<0.2; @oldF=@F; } print;' zipf.eps.orig > zipf.eps")
system("epstopdf zipf.eps")

# ----------------------------------------------------------------------
# Obsolete but maybe useful later:
# Regraph our tries ... see that all these plots fit the steeper line at the
# bottom and the shallower one at the top, just like Zipf. Also see that some
# of them, like tries[[3]], also look a *lot* like the noisier high-freq
# data - very pretty! Just to remember how to read the graph: the horizontal
# axis is proportional to log(rank) -- with the constant of proportionality
# being slightly different depending on how many distinct tokens we actually
# drew (!!! in fact, we should be careful to use 1.17e6 above rather than 1e6
# so that we have as many tokens as Brown). The vertical axis is log(count).
# There's no difference between the ``point'' and ``line'' style graphs except
# that the point graph is real Brown data; maybe we should just show the real
# data as points and then overlay one artificial dataset on top of it in the
# next graph, also with points.
# load("zipf.R")
plot(log((1:length(brown))/length(brown)),log(brown),col=6)   # shows curvature
ls <- lsfit(log((1:length(brown))/length(brown)),log(brown))
# the exponent is actually about -1.29 here (this doesn't show up if we only
# look at the first 10% of the corpus)
abline(ls,col=6); ls$coefficients
# but for frequent words the exponent is -1.04, so it's just that the tail bends down
lsearly <- lsfit(log((1:1000)/length(brown)),log(brown[1:1000]))
abline(lsearly,col=6); lsearly$coefficients
sapply(tries, function(cnts) {lines(log((1:length(cnts))/length(cnts)),log(cnts),col=3)})
sapply(list(tries[[3]]), function(cnts) {lines(log((1:length(cnts))/length(cnts)),log(cnts),col=4)})    # redo the best one in blue
sapply(list(tries[[3]]), function(cnts) {points(log((1:length(cnts))/length(cnts)),log(cnts),col=4)})   # and as points
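# ----------------------------------------------------------------------
# Aside: instead of post-processing the EPS with perl, the file could be
# kept small from the start by plotting only a thinned subset of points;
# on a log rank axis, log-spaced ranks are visually indistinguishable from
# plotting every rank. A sketch (the helper and its density parameter are
# mine, not from the original):
loglogthin <- function(cnts,per.decade=200) {
  n <- length(cnts)
  # log-spaced candidate ranks from 1 to n, deduplicated after rounding
  keep <- unique(round(10^seq(0,log10(n),length=ceiling(per.decade*log10(n))+1)))
  list(rank=keep,freq=cnts[keep])
}
# th <- loglogthin(brown); plot(th$rank,th$freq,log="xy")   # same picture, far fewer points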
# ----------------------------------------------------------------------
# how does the variance of a density in our family relate to its exponent?
# it doesn't seem to be an easy relation; it is only easy when the exponent
# is 1 or 2. An exp distrib with rate lambda has mean 1/lambda and var
# 1/lambda^2, so lambda corresponds to 1/sigma^2 - we are really just
# dividing by our desired variance right before the final exponentiation.

x <- seq(-15,15,length=1000)   # the weight grid (x must be defined for v below)
v <- function(e) { var(sample(x,50000,replace=TRUE,prob=exp(-abs(x)**e/e))) }
v <- function(e) { d <- exp(-abs(x)**e/e); sum((x**2)*d)/sum(d) }   # faster and more accurate
exps <- exp(seq(-5,5,length=500)*log(10))
variances <- map(exps,v)
plot(log(exps),variances)
plot(exps[exps>.5 & exps<3],variances[exps>.5 & exps<3])
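# ----------------------------------------------------------------------
# Sanity check on the two easy cases (a check I'm adding; the reference
# values are standard facts, up to the accuracy of the grid approximation):
#   e=1: exp(-|x|) is a Laplace density with scale 1, so variance = 2*1^2 = 2
#   e=2: exp(-x^2/2) is a Gaussian density with sigma = 1, so variance = 1
v(1)   # should be close to 2
v(2)   # should be close to 1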