# Support for thesis section on how to get arc competition to yield
# a Zipf-like distribution.

source("utils.R")

# Simulated corpus data generated from a multinomial.
# counts(k,m) takes a k-sample from the multinomial m and returns a
# sorted vector of the positive counts.
counts <- function(tokens,typeprobs) {
  if (any(!is.finite(typeprobs))) { errtypeprobs <<- typeprobs; stopifnot(all(is.finite(typeprobs))); }
  # generate word tokens from the distribution over word types
  sample <- sort(fastestsamplerepl(seqa(typeprobs), tokens, prob=typeprobs));
  # find the indices where a new word starts being generated
  changeindices <- (1:tokens)[sample[1:tokens] > c(0,sample[0:(tokens-1)])];
  # lengths of the same-word runs (when > 0), from largest to smallest
  rev(sort(diff(c(changeindices,tokens+1))))
}

# ------------------------------------------------
# let's look at some weight density functions. (this section is skippable)

# generate (smallish) artificial corpora from each density
# (3 corpora for the perfect zipf, 15 for each of the others)
zipf <- repl(3,counts(1e5,1/(1:1e4)))           # generate from perfect zipf
expzipf <- repl(15,counts(1e5,exp(rexp(1e4))))  # competing words whose weights come from an exp distrib
randsign <- function(k) { rbinom(k,1,.5)*2-1 }  # utility fn
symmexpzipf <- repl(15,counts(1e5,exp(rexp(1e4)*randsign(1e4))))  # weights from a reflected exp distrib
smallzipf <- repl(15,counts(1e5,exp({x <- seq(-15,15,length=1000);
  fastestsamplerepl(x,1e4,replace=TRUE,prob=exp(-abs(x)**1.1))})))      # weights come from exp(-abs(x)**1.1)
smalltiltzipf <- repl(15,counts(1e5,exp({x <- seq(-15,15,length=1000);
  fastestsamplerepl(x,1e4,replace=TRUE,prob=exp(-abs(x)**1.1/1.1))})))  # weights come from exp(-abs(x)**1.1/1.1)
normzipf <- repl(15,counts(1e5,exp({x <- seq(-15,15,length=1000);
  sample(x,1e4,replace=TRUE,prob=exp(-abs(x)**2))})))                   # weights come from exp(-abs(x)**2)

# compare them. For each density, look at the mean and stdev across the
# corpora of various measures.
mapcounts <- function(f,...) {
  list(zipf=sapply(zipf,f,...), expzipf=sapply(expzipf,f,...),
       symmexpzipf=sapply(symmexpzipf,f,...), smallzipf=sapply(smallzipf,f,...))
}
# fraction of types that appear <= k times, among those that appear at least once
smallfrac <- function(cnts,k) { sum(cnts <= k)/length(cnts) }

# average number of types sampled at least once for each density
sapply(mapcounts(length),mean); sapply(mapcounts(length),stdev); boxplot(mapcounts(length));
# average proportion of singletons? (this is indeed about the same for expzipf and symmexpzipf)
sapply(mapcounts(smallfrac,1),mean); sapply(mapcounts(smallfrac,1),stdev); boxplot(mapcounts(smallfrac,1));
# average proportion of singletons+doubletons?
sapply(mapcounts(smallfrac,2),mean); sapply(mapcounts(smallfrac,2),stdev); boxplot(mapcounts(smallfrac,2));
# average frequency of the most frequent word
sapply(mapcounts(max),mean); sapply(mapcounts(max),stdev); boxplot(mapcounts(max));
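# ----------------------------------------------------------------------
# For reference: seqa, repl, fastestsamplerepl, stdev, and map come from
# utils.R, which isn't shown here. A minimal sketch of plausible stand-ins,
# assuming the obvious semantics (these are guesses, not the originals, and
# would of course need to exist before the code above runs):
if (!exists("seqa")) seqa <- function(x) seq_along(x)   # 1,2,...,length(x)
if (!exists("repl"))   # n independent re-evaluations of expr, returned as a list
  repl <- function(n,expr) eval.parent(substitute(replicate(n,expr,simplify=FALSE)))
if (!exists("fastestsamplerepl"))   # presumably a fast sample-with-replacement
  fastestsamplerepl <- function(x,size,replace=TRUE,prob=NULL) sample(x,size,replace=TRUE,prob=prob)
if (!exists("stdev")) stdev <- function(x) sd(x)
if (!exists("map")) map <- function(x,f,...) sapply(x,f,...)
# (repl forwards the unevaluated expression via substitute/eval.parent; a
# naive replicate wrapper would force the promise once and return n copies
# of a single sample.)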
# all the measures, not just their mean & stdev
mapcounts(length)        # number of types sampled at least once for each density
mapcounts(smallfrac,1)   # proportion of singletons (indeed about the same for expzipf and symmexpzipf)
mapcounts(smallfrac,2)   # proportion of singletons+doubletons
mapcounts(max)           # frequency of the most frequent word

# plot them
sapply(zipf, function(cnts) {plot(seqa(cnts),cnts,type="l",log="xy")})
sapply(expzipf, function(cnts) {lines(seqa(cnts),cnts,type="l",col=2)})
sapply(symmexpzipf, function(cnts) {lines(seqa(cnts),cnts,type="l",col=3)})
sapply(smallzipf, function(cnts) {lines(seqa(cnts),cnts,type="l",col=4)})

# ----------------------------------------------------------------------
# Now let's look at the actual distribution of tokens in the Brown corpus.
# (don't skip this section, as it reads in the corpus and defines plotbrown)
# Note that we really want 10x as much random data as we generate above to
# match its million words; taking only the first 10th of the corpus doesn't
# get the interesting curvature in the log-log plot.

# cat ~/cs405/hw3/fsm/brown.txt | perl -ne 's{/\S*}{}g; for $w (split) { $c{$w}++ } END { foreach (sort { $c{$b} <=> $c{$a} } keys %c) { print "$_\t$c{$_}\n"; } }' > ~/tmp/browncounts
brownall <- read.table("~/tmp/browncounts",quote="",sep="\t",col.names=c("word","count"))
brown <- brownall$count

plotbrown <- function(scale=1,...) {
  freq <- brownall$count
  rank <- seq(freq)/scale
  plot(rank,freq,log="xy",xlab="rank of frequency",ylab="frequency",...)  # shows curvature:
  # the exponent is actually about -1.29 here (this doesn't show up if we
  # only look at the first 10% of the corpus)
  ls <- lsfit(log10(rank),log10(freq))
  abline(ls); print(ls$coefficients)
  # but for frequent words, the exponent is -1.04, so it's just that the tail bends down
  lsearly <- lsfit(log10(rank[1:1000]),log10(freq[1:1000]))
  abline(lsearly); print(lsearly$coefficients)
}
plotbrown()

# Add labels of some typical words
id <- list(ind=c(1,2,3,8,29,41,86,168,390,578,1259,2412,5874,13029,21941,29716),
           pos=c(rep(1,3),4,rep(2,11)))
# id was originally obtained interactively by:
#   id <- identify(seqa(brown),brown,brownall$word,pos=TRUE)
#   id$pos[4:length(id$pos)] <- 2   # make all labels appear on the left, except the first few
text(seqa(brown)[id$ind],brown[id$ind],labels=as.character(brownall$word[id$ind]),pos=id$pos,offset=1)

# ----------------------------------------------------------------------
# Now try to fit the Brown data manually. (this section is skippable)

linecounts <- function(cnts,col=1) { lines(seqa(cnts)/length(cnts),cnts,col=col) }
tryman <- function(e,d,col=1) {   # prepends its sampled counts to the global tries
  cnts <- counts(sum(brown),exp({x <- seq(-15,15,length=1000);
    fastestsamplerepl(x,1e5,prob=exp(-abs(x)**e/d))}));
  linecounts(cnts,col);
  tries <<- c(list(cnts), tries)
}
plotbrown(scale=length(brown))   # replot it with the x axis scaled to [0,1]
tries <- list()
tryman(1,1,col=2)     # doesn't quite fit - misses the curvature
tryman(1.2,2,col=1)
tryman(1.3,2.5,col=1)
tryman(1.33,3,col=4)
tryman(1.4,3.5,col=2)
plotbrown(scale=length(brown))   # replot it with the x axis scaled to [0,1]
tries <- list()
repl(4, tryman(1.35,3,col=3))    # this seems to be the best! try it 4 times
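# ----------------------------------------------------------------------
# Aside: an R equivalent of the perl one-liner in the Brown-reading section
# above, in case it's easier to regenerate browncounts from inside R. This
# is a sketch, not the original pipeline; it assumes the same tagged-text
# format (word/TAG tokens separated by whitespace) and the same file paths,
# neither of which is checked here.
makebrowncounts <- function(infile="~/cs405/hw3/fsm/brown.txt",
                            outfile="~/tmp/browncounts") {
  toks <- scan(infile, what="", quote="")     # whitespace-separated tokens
  words <- sub("/.*$","",toks)                # strip everything from the first "/" on, like s{/\S*}{}g
  tab <- sort(table(words), decreasing=TRUE)  # count types, rank by descending frequency
  write.table(data.frame(word=names(tab), count=as.vector(tab)),
              file=outfile, quote=FALSE, sep="\t",
              row.names=FALSE, col.names=FALSE)
}
# makebrowncounts()   # then re-run the read.table above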
# ----------------------------------------------------------------------
# Now try to fit the Brown data automatically.
# (this section is skippable so long as the save at the end has been done previously)

plotbrown()
tries <- list()

# the objective error we're trying to minimize - just the distance between the
# two graphs; we ignore extra singletons that one graph has and the other doesn't
sumsq <- function(x,y) { n <- min(length(x),length(y)); sum((log(x[1:n])-log(y[1:n]))**2) }

sumsqtry <- function(edtypes,col=1) {
  i <<- i+1;
  if (i <= length(tries)) {
    # we've already done this one and are retracing our steps to continue the optimization
    print(c(i,tries[[i]]$edtypes)); print(tries[[i]]$sumsq); return(tries[[i]]$sumsq);
  }
  e <- edtypes[1]; d <- edtypes[2]; types <- length(brown)*20*edtypes[3];  # split the argument (a 3-vector) into named vars
  print(c(length(tries)+1,e,d,types));  # track progress
  x <- seq(-15,15,length=5000);         # will evaluate the density at these points (so these are our possible weights)
  # get weights by sampling, then get a corpus the same size as Brown by sampling the weights
  savecnts <<- cnts <- counts(sum(brown),exp(fastestsamplerepl(x,types,prob=exp(-abs(x)**e/d))));
  r <- sumsq(cnts,brown);   # is this curve a good match for the Brown corpus?
  print(r);
  tries[[i]] <<- list(sumsq=r,e=e,d=d,types=types,edtypes=edtypes);  # remember what we did, since it's slow to generate
  lines(seqa(cnts),cnts,col=col);  # graph it so we can watch the thing chug (a log-log plot of Brown is assumed to exist)
  r;  # return goodness
}
sumsqtry5 <- function(edtypes,col=1) {
  r <- sumsqtry(edtypes,col);
  # if r is close to optimal, we want a more precise value: average it with 4
  # others, also random. This should keep the optimization from straying due
  # to the vagaries of the random generator.
  if (r < 2*best) { r <- mean(c(r, unlist(repl(4,sumsqtry(edtypes,col))))); }
  if (r < best) { best <<- r; }
  print(r);
  r
}

# To start the optimization from scratch:
tries <- list()
# ...or to start by retracing a previous optimization as saved in tries:
load("zipf.optim.R")

best <- 1e6; i <- 0;
optim(c(1.33,3,1),sumsqtry5,control=list(trace=5));  # start given what we already have in tries
save(tries,file="zipf.optim.R")

# let's look at the params from the best *individual* run (not the same as the best mean):
j <- sapply(tries, function (x) x$sumsq)
edtypesbest <- tries[j==min(j)][[1]]$edtypes
# 1.3175 3.2875 0.8875 - achieves 2117.32 on evaluation 42
# Note also that as far as the best mean-of-5 goes, we got a mean of 2562.38 on evals 158-162:
#   1.323410 3.271446 1.031016

# given edtypesbest, let's find the best fit we can over 10 samples
rbest <- 1e6;
for (k in 1:10) { r <- sumsqtry(edtypesbest); if (r < rbest) { rbest <- r; cntsbest <- savecnts; } }
plotbrown(); lines(seqa(cntsbest),cntsbest)
save(edtypesbest,cntsbest,id,file="zipf.cntsbest.R")  # careful not to overwrite!
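# ----------------------------------------------------------------------
# Aside: the objective above is stochastic, which Nelder-Mead (optim's
# default method) tolerates only roughly; the mean-of-5 trick in sumsqtry5
# is one workaround. Another standard trick is common random numbers: fix
# the RNG seed inside the objective so every candidate parameter vector is
# scored on the same draws, making the objective deterministic. A sketch
# (the wrapper name and seed value are arbitrary, not from the original):
sumsqtryfixed <- function(edtypes) {
  saved <- .Random.seed            # don't disturb the global RNG state
  on.exit(.Random.seed <<- saved)
  set.seed(12345)                  # same draws for every candidate edtypes
  e <- edtypes[1]; d <- edtypes[2]; types <- length(brown)*20*edtypes[3];
  x <- seq(-15,15,length=5000);
  cnts <- counts(sum(brown),exp(fastestsamplerepl(x,types,prob=exp(-abs(x)**e/d))));
  sumsq(cnts,brown)
}
# optim(c(1.33,3,1),sumsqtryfixed)   # now an ordinary deterministic optimization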
# ----------------------------------------------------------------------
# Ok, now make and save the final plot.

load("zipf.cntsbest.R")
# make a big graph to be included at half-size; this allows the dots to be small
postscript(file="zipf.eps",onefile=FALSE,horizontal=FALSE,width=13,height=6,paper="special");
layout(cbind(1,2))                  # two side-by-side graphs
par(mar=c(4,4,1,1),pty="s",pch=20)  # trim excess margin; square plots; small filled circle as point
plotbrown(type="l"); points(seqa(brownall$count),brownall$count);
text(seqa(brown)[id$ind],brown[id$ind],labels=as.character(brownall$word[id$ind]),pos=id$pos,offset=1,cex=0.7)
plotbrown(type="l"); points(seqa(cntsbest),cntsbest)
dev.off()

# squish the .ps file by killing off lines (which plot points) that have the
# same y coordinate and almost the same x coordinate as the previously printed
# line. This reduces the filesize by a factor of 50 and makes the picture
# printable without waiting for hours. (Even piping through uniq shrinks it
# by a factor of 4-5, but that's not enough to print.)
system("mv zipf.eps zipf.eps.orig");
system("perl -ane 'if ($F[0] =~ /^\\d+\\.\\d+$/ && $F[1] =~ /^\\d+\\.\\d+$/) { next if $F[1]==$oldF[1] && abs($F[0]-$oldF[0])<0.2; @oldF=@F; } print;' zipf.eps.orig > zipf.eps")
system("epstopdf zipf.eps")

# ----------------------------------------------------------------------
# Obsolete but maybe useful later:
# Regraph our tries ... see that all these plots fit the steeper line at the
# bottom and the shallower one at the top, just like Zipf. Also see that some
# of them, like tries[[3]], also look a *lot* like the noisier high-freq
# data - very pretty! Just to remember how to read the graph: the horizontal
# axis is proportional to log(rank) -- with the constant of proportionality
# being slightly different depending on how many distinct tokens we actually
# drew (!!! in fact, we should be careful to use 1.17e6 above rather than 1e6
# so that we have as many tokens as Brown). The vertical axis is log(count).
# There's no difference between the ``point'' and ``line'' style graphs except
# that the point graph is real Brown data; maybe we should just show the real
# data as points and then overlay one artificial dataset on top of it in the
# next graph, also with points.
# load("zipf.R")
plot(log((1:length(brown))/length(brown)),log(brown),col=6)   # shows curvature
ls <- lsfit(log((1:length(brown))/length(brown)),log(brown))
# the exponent is actually about -1.29 here (this doesn't show up if we only
# look at the first 10% of the corpus)
abline(ls,col=6); ls$coefficients
# but for frequent words the exponent is -1.04, so it's just that the tail bends down
lsearly <- lsfit(log((1:1000)/length(brown)),log(brown[1:1000]))
abline(lsearly,col=6); lsearly$coefficients
sapply(tries, function(cnts) {lines(log((1:length(cnts))/length(cnts)),log(cnts),col=3)})
sapply(list(tries[[3]]), function(cnts) {lines(log((1:length(cnts))/length(cnts)),log(cnts),col=4)})    # redo the best one in blue
sapply(list(tries[[3]]), function(cnts) {points(log((1:length(cnts))/length(cnts)),log(cnts),col=4)})   # and as points
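# ----------------------------------------------------------------------
# Aside: instead of post-processing the EPS with perl, the file could be
# kept small from the start by plotting only a thinned subset of points;
# on a log rank axis, log-spaced ranks are visually indistinguishable from
# plotting every rank. A sketch (the helper and its density parameter are
# mine, not from the original):
loglogthin <- function(cnts,per.decade=200) {
  n <- length(cnts)
  # log-spaced candidate ranks from 1 to n, deduplicated after rounding
  keep <- unique(round(10^seq(0,log10(n),length=ceiling(per.decade*log10(n))+1)))
  list(rank=keep,freq=cnts[keep])
}
# th <- loglogthin(brown); plot(th$rank,th$freq,log="xy")   # same picture, far fewer points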
# ----------------------------------------------------------------------
# how does the variance of a density in our family relate to its exponent?
# it doesn't seem to be an easy relation; it is only easy when the exponent
# is 1 or 2. An exp distrib with rate lambda has mean 1/lambda and var
# 1/lambda^2, so lambda corresponds to 1/sigma^2 - we are really just
# dividing by our desired variance right before the final exponentiation.

x <- seq(-15,15,length=1000)   # the weight grid (x must be defined for v below)
v <- function(e) { var(sample(x,50000,replace=TRUE,prob=exp(-abs(x)**e/e))) }
v <- function(e) { d <- exp(-abs(x)**e/e); sum((x**2)*d)/sum(d) }   # faster and more accurate
exps <- exp(seq(-5,5,length=500)*log(10))
variances <- map(exps,v)
plot(log(exps),variances)
plot(exps[exps>.5 & exps<3],variances[exps>.5 & exps<3])
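# ----------------------------------------------------------------------
# Sanity check on the two easy cases (a check I'm adding; the reference
# values are standard facts, up to the accuracy of the grid approximation):
#   e=1: exp(-|x|) is a Laplace density with scale 1, so variance = 2*1^2 = 2
#   e=2: exp(-x^2/2) is a Gaussian density with sigma = 1, so variance = 1
v(1)   # should be close to 2
v(2)   # should be close to 1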