# Utility functions: # like stopifnot (cannibalized therefrom) warnifnot <- function (...) { n <- length(ll <- list(...)) if (n == 0) return(invisible()) mc <- match.call() for (i in 1:n) if (!(is.logical(r <- eval(ll[[i]])) && all(r))) warning("assertion failed"); # the unevaluated call will be appended } # map a function along a list map <- function(x,f) apply(cbind(x),1,f) # multiple evaluations of a function repl <- function(n,expr) { s <- substitute(expr); lst <- list(); for (i in 1:n) lst[[i]] <- eval(s,env=parent.frame()); lst } # evaluates the same expression n times and creates a list of the results # equivalent to 1:length(x) seqa <- function(x) seq(along=x) # stdev stdev <- function(x) sqrt(var(x)) # fastsamplerepl is 4-5 times faster than sample(x,size,prob,replace=TRUE) # < 1 min. as opposed to 4.5 min. for length(sample(1:1e5,1e6,prob=1/(1:1e5),replace=TRUE)) # The builtin sample() seems to do poorly on large sets, perhaps # especially if they have distributions far from uniform. So # partition the events into n classes of n events and use a two-step # procedure. In fact, do this recursively, although that doesn't seem # to make much difference. # fastsamplerepl <- function(x,size=length(x),prob=rep(1,length(x))) { if(size > 10 && length(x)>=1000) sqrtsample(x,size,prob) else sample(x,size,prob=prob,replace=TRUE) } # sqrtsample <- function(x,size,prob) { n <- ceiling(sqrt(length(x))); # xm <- matrix(c(x,rep(NA,n*n-length(x))),n,n); # probm <- matrix(c(prob,rep(0,n*n-length(x))),n,n); # s1 <- fastsamplerepl(1:n,size,prob=apply(probm,1,sum)); # unlist(lapply(1:n, function (i) fastsamplerepl(xm[i,],sum(s1==i),prob=probm[i,])))} # An equally fast and rather simpler version. I don't understand why the # built-in version isn't blazing! # fastersamplerepl <- function(x,size=length(x),prob=rep(1,length(x)),replace=TRUE) { # probabilities can be unnormalized # stopifnot(length(x)==length(prob)); # stopifnot(replace); # library(ts); cump <- diffinv(prob); # cumulative probabilities (this vector is one longer than prob and starts with 0) # cumu <- sort(runif(size,max=cump[length(cump)])); # sorted uniform probs # s <- rep(NA,size); # answer will go here # pi <- 1; # for (ui in 1:size) { # while (cump[pi] < cumu[ui]) { pi <- pi+1; } # s[ui] <- x[pi-1]; # } # s; } # And this one is much faster still -- 2 seconds! - because it uses the built-in function approx. # Also we avoid a log(size) factor in the sort while incurring a log(length(x)) factor for approx's search. # (Might be possible to do faster still with a builtin implementing one of the two algorithms.) # (Also, it would make sense for R to produce a version that gives the multinomial counts ...) fastestsamplerepl <- function(x,size=length(x),prob=rep(1,length(x)),replace=TRUE) { # probabilities can be unnormalized warnprob <<- prob; stopifnot(length(x)==length(prob)); stopifnot(replace); stopifnot(all(is.finite(x))); stopifnot(all(is.finite(prob))); require(ts); cump <- diffinv(prob); # cumulative probabilities (this vector is one longer than prob and starts with 0) cumu <- runif(size,max=cump[length(cump)]); # unsorted uniform probs inds <- approx(cump,c(seqa(x),length(x)),cumu,ties="ordered",method="constant")$y if (0 < length(nas <- seqa(size)[!is.finite(inds)])) { warning(paste("had nas: ",length(nas))); warncump <<- cump; warncumu <<- cumu; warnx <<- x; warninds <<- inds; inds[nas] <- print(sample(x,length(nas),prob,replace=TRUE)); } # I don't understand why inds sometimes contains NA values, and it doesn't seem to be replicable even if I squirrel away the arguments to approx globally and run approx again from the prompt. But deal with it by replacing them using ordinary sample(). stopifnot(all(is.finite(inds))); stopifnot(all(is.finite(x[inds]))); x[inds]; }