Christian Bartels
Allschwil, June-2014
Efficient generic integration algorithm to determine confidence intervals and p-values for hypothesis testing. Christian Bartels. figshare.
http://dx.doi.org/10.6084/m9.figshare.1054694
# clean up
rm(list = ls())
# print out version information, path, date
R.Version()
## $platform
## [1] "x86_64-w64-mingw32"
##
## $arch
## [1] "x86_64"
##
## $os
## [1] "mingw32"
##
## $system
## [1] "x86_64, mingw32"
##
## $status
## [1] ""
##
## $major
## [1] "3"
##
## $minor
## [1] "1.0"
##
## $year
## [1] "2014"
##
## $month
## [1] "04"
##
## $day
## [1] "10"
##
## $`svn rev`
## [1] "65387"
##
## $language
## [1] "R"
##
## $version.string
## [1] "R version 3.1.0 (2014-04-10)"
##
## $nickname
## [1] "Spring Dance"
date()
## [1] "Thu Jun 12 18:34:16 2014"
Sys.info()
## sysname release
## "Windows" "7 x64"
## version nodename
## "build 7601, Service Pack 1" "CHRISTIAN-PC"
## machine login
## "x86-64" "Christian"
## user effective_user
## "Christian" "Christian"
# get libraries
library(rstan)
## Loading required package: Rcpp
## Loading required package: inline
##
## Attaching package: 'inline'
##
## Das folgende Objekt ist maskiert from 'package:Rcpp':
##
## registerPlugin
##
## rstan (Version 2.2.0, packaged: 2014-02-14 04:29:17 UTC, GitRev: 52d7b230aaa0)
library(ggplot2)
library(data.table)
set_cppo("fast")
## mode fast without NDEBUG for compiling C++ code has been set already
# my likelihood function x ... observations or samples of observations col:
# repeated measures (evaluated with the same parameter) row: different
# samples (evaluate with different parameters) par ... data.frame of
# parameters col: different parameters, e.g., mean and sd row: samples
# return vector of log-likelihood of samples
my.ll <- function(x, par) {
return(rowSums(dnbinom(x, size = par$size, prob = par$prob, log = TRUE)))
}
# my maximum likelihood estimator - use numerical optimization (slow!)
my.opt <- function(parc, x, fix = data.frame(size = NA, prob = NA)) {
parf <- fix
parf[, is.na(parf[1, ])] <- parc
return(-my.ll(x, parf))
}
my.maxll <- function(x) {
df1 <- do.call(rbind, apply(x, 1, function(x1) {
n <- length(x1)
op <- optim(c(0.08, 0.06), my.opt, x = matrix(x1, nrow = 1), control = list(maxit = 100))
dft <- data.frame(size = op$par[1], prob = op$par[2], mll = -op$value) # max. likelihood parameters
return(dft)
}))
return(df1)
}
# my log prior par ... data.frame of parameters col: different parameters,
# e.g., mean and sd row: samples return vector of log-priors of samples
my.prior <- function(par) {
return(0)
}
# my posterior
my.post <- function(x, par) return(my.ll(x, par) + my.prior(par))
# my simulation function par ... data.frame of parameters col: different
# parameters, e.g., mean and sd row: samples n ... number of repeated
# measures
my.sim <- function(n, par) {
m <- nrow(par)
return(matrix(rnbinom(n * m, size = par$size, prob = par$prob), ncol = n))
}
# make reproducible
set.seed(2)
# generate observed data - series of zeros
n = 10 # number of repeated observations
x <- matrix(c(10, 1, rep(0, n - 2)), ncol = n) # observed data
Use Stan to sample “reasonable” parameters
# list model
modelSample = "mcmc6.model" # model to sample parameters
cbind(system(paste("cat ", modelSample, sep = ""), intern = TRUE))
## [,1]
## [1,] "data {"
## [2,] " int<lower=0> N;"
## [3,] " int x[N];"
## [4,] "} "
## [5,] "parameters {"
## [6,] " real<lower=1E-4,upper=1E4> alpha;"
## [7,] " real<lower=1E-4,upper=1E4> beta;"
## [8,] "} "
## [9,] "model {"
## [10,] " x ~ neg_binomial(alpha, beta); // fit"
## [11,] " increment_log_prob(-log(alpha)-log(beta)); // prior"
## [12,] "} "
## [13,] "generated quantities { "
## [14,] " real<lower=0,upper=1> prob;"
## [15,] " prob <- beta/(1+beta);"
## [16,] "}"
## [17,] ""
# compile model
parSampler <- stan_model(modelSample, verbose = FALSE)
##
## TRANSLATING MODEL 'mcmc6' FROM Stan CODE TO C++ CODE NOW.
## COMPILING THE C++ CODE FOR MODEL 'mcmc6' NOW.
## cygwin warning:
## MS-DOS style path detected: C:/PROGRA~1/R/R-31~1.0/etc/x64/Makeconf
## Preferred POSIX equivalent is: /cygdrive/c/PROGRA~1/R/R-31~1.0/etc/x64/Makeconf
## CYGWIN environment variable option "nodosfilewarning" turns off this warning.
## Consult the user's guide for more details about POSIX paths:
## http://cygwin.com/cygwin-ug-net/using.html#using-pathnames
## C:/Program Files/R/R-3.1.0/library/rstan/include//stansrc/stan/agrad/rev/var_stack.hpp:49:17: warning: 'void stan::agrad::free_memory()' defined but not used [-Wunused-function]
## C:/Program Files/R/R-3.1.0/library/rstan/include//stansrc/stan/agrad/rev/chainable.hpp:87:17: warning: 'void stan::agrad::set_zero_all_adjoints()' defined but not used [-Wunused-function]
# call Stan
dat <- list(N = n, x = c(x))
parSample <- sampling(parSampler, data = dat, init = 0, iter = 8000, thin = 4,
seed = 123)
## SAMPLING FOR MODEL 'mcmc6' NOW (CHAIN 1).
##
Iteration: 1 / 8000 [ 0%] (Warmup)
Iteration: 800 / 8000 [ 10%] (Warmup)
Iteration: 1600 / 8000 [ 20%] (Warmup)
Iteration: 2400 / 8000 [ 30%] (Warmup)
Iteration: 3200 / 8000 [ 40%] (Warmup)
Iteration: 4000 / 8000 [ 50%] (Warmup)
Iteration: 4800 / 8000 [ 60%] (Sampling)
Iteration: 5600 / 8000 [ 70%] (Sampling)
Iteration: 6400 / 8000 [ 80%] (Sampling)
Iteration: 7200 / 8000 [ 90%] (Sampling)
Iteration: 8000 / 8000 [100%] (Sampling)
## Elapsed Time: 0.109 seconds (Warm-up)
## 0.091 seconds (Sampling)
## 0.2 seconds (Total)
##
## SAMPLING FOR MODEL 'mcmc6' NOW (CHAIN 2).
##
Iteration: 1 / 8000 [ 0%] (Warmup)
Iteration: 800 / 8000 [ 10%] (Warmup)
Iteration: 1600 / 8000 [ 20%] (Warmup)
Iteration: 2400 / 8000 [ 30%] (Warmup)
Iteration: 3200 / 8000 [ 40%] (Warmup)
Iteration: 4000 / 8000 [ 50%] (Warmup)
Iteration: 4800 / 8000 [ 60%] (Sampling)
Iteration: 5600 / 8000 [ 70%] (Sampling)
Iteration: 6400 / 8000 [ 80%] (Sampling)
Iteration: 7200 / 8000 [ 90%] (Sampling)
Iteration: 8000 / 8000 [100%] (Sampling)
## Elapsed Time: 0.107 seconds (Warm-up)
## 0.117 seconds (Sampling)
## 0.224 seconds (Total)
##
## SAMPLING FOR MODEL 'mcmc6' NOW (CHAIN 3).
##
Iteration: 1 / 8000 [ 0%] (Warmup)
Iteration: 800 / 8000 [ 10%] (Warmup)
Iteration: 1600 / 8000 [ 20%] (Warmup)
Iteration: 2400 / 8000 [ 30%] (Warmup)
Iteration: 3200 / 8000 [ 40%] (Warmup)
Iteration: 4000 / 8000 [ 50%] (Warmup)
Iteration: 4800 / 8000 [ 60%] (Sampling)
Iteration: 5600 / 8000 [ 70%] (Sampling)
Iteration: 6400 / 8000 [ 80%] (Sampling)
Iteration: 7200 / 8000 [ 90%] (Sampling)
Iteration: 8000 / 8000 [100%] (Sampling)
## Elapsed Time: 0.109 seconds (Warm-up)
## 0.121 seconds (Sampling)
## 0.23 seconds (Total)
##
## SAMPLING FOR MODEL 'mcmc6' NOW (CHAIN 4).
##
Iteration: 1 / 8000 [ 0%] (Warmup)
Iteration: 800 / 8000 [ 10%] (Warmup)
Iteration: 1600 / 8000 [ 20%] (Warmup)
Iteration: 2400 / 8000 [ 30%] (Warmup)
Iteration: 3200 / 8000 [ 40%] (Warmup)
Iteration: 4000 / 8000 [ 50%] (Warmup)
Iteration: 4800 / 8000 [ 60%] (Sampling)
Iteration: 5600 / 8000 [ 70%] (Sampling)
Iteration: 6400 / 8000 [ 80%] (Sampling)
Iteration: 7200 / 8000 [ 90%] (Sampling)
Iteration: 8000 / 8000 [100%] (Sampling)
## Elapsed Time: 0.108 seconds (Warm-up)
## 0.097 seconds (Sampling)
## 0.205 seconds (Total)
# output from stan
print(parSample, digits = 3)
## Inference for Stan model: mcmc6.
## 4 chains, each with iter=8000; warmup=4000; thin=4;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff
## alpha 0.078 0.002 0.088 0.006 0.026 0.050 0.097 0.313 1365
## beta 0.080 0.003 0.127 0.000 0.006 0.032 0.100 0.419 1966
## prob 0.064 0.002 0.084 0.000 0.006 0.031 0.091 0.296 2021
## lp__ -11.202 0.021 1.056 -13.939 -11.660 -10.920 -10.436 -10.088 2446
## Rhat
## alpha 1.004
## beta 1.003
## prob 1.003
## lp__ 1.001
##
## Samples were drawn using NUTS(diag_e) at Thu Jun 12 18:35:27 2014.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
plot(parSample)
traceplot(parSample)
# get samples of parameters (representing integral)
par <- as.data.table(as.data.frame(parSample))
# rename col-names to match rest of code
setnames(par, c("alpha", "prob"), c("size", "prob"))
# add identifier & key
par[, `:=`(ipar, 1:nrow(par))]
## size beta prob lp__ ipar
## 1: 0.02843 0.3428034 0.2552893 -12.98 1
## 2: 0.04396 0.0025133 0.0025070 -11.13 2
## 3: 0.17007 0.0615702 0.0579992 -10.89 3
## 4: 0.02535 0.0007464 0.0007459 -11.56 4
## 5: 0.02767 0.0062750 0.0062359 -10.89 5
## ---
## 3996: 0.04112 0.0947016 0.0865090 -10.58 3996
## 3997: 0.04456 0.0496515 0.0473029 -10.30 3997
## 3998: 0.10385 0.0285465 0.0277542 -10.58 3998
## 3999: 0.01512 0.0072160 0.0071643 -11.49 3999
## 4000: 0.12845 0.1243502 0.1105974 -10.17 4000
setkey(par, ipar)
# simulate
dat <- as.data.table(my.sim(n, par))
# add identifier & key
dat[, `:=`(idat, 1:nrow(dat))]
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 idat
## 1: 0 0 0 0 0 0 0 0 0 0 1
## 2: 0 0 0 0 0 0 0 0 16 0 2
## 3: 13 5 2 0 5 0 0 1 0 1 3
## 4: 0 0 0 0 0 0 0 0 0 0 4
## 5: 0 0 0 8 0 0 0 0 0 0 5
## ---
## 3996: 0 0 0 0 0 0 0 0 0 0 3996
## 3997: 0 0 0 0 0 0 0 0 0 0 3997
## 3998: 0 4 1 0 1 0 141 0 0 1 3998
## 3999: 0 0 0 0 70 0 0 0 0 0 3999
## 4000: 0 0 0 0 0 0 0 0 0 0 4000
setkey(dat, idat)
# indices of data columns
iix <- match(grep("V", colnames(dat), value = TRUE), colnames(dat))
Given individual parameter vector as null hypothesis and complementary space as alternative hypothesis
# test c: likelihood ratio test ll ... log likelihood of null hypothesis,
# i.e., data given selected parameter mll ... log likelihood of alternative
# hypothesis, i.e., data and max. likl. parameter
ftc <- function(ll, mll) {
return(-2 * ll + 2 * mll)
}
# test d: posterior test - probability of parameter given data
ftd <- function(wn.par) {
return(1/wn.par)
}
# evaluate tests for simulated data
llm[, `:=`(tc, ftc(ll, dat[idat, mll])), by = idat]
llm[, `:=`(td, ftd(wn.par)), by = idat]
# evaluate tests for observed data
par[, `:=`(tcr, ftc(ll, mll.obs$mll))]
par[, `:=`(tdr, ftd(1/nrow(par)))]
ggplot(par, aes(x = size, y = prob, colour = cut_number(p.c, n = 10))) + geom_point() +
guides(colour = guide_legend("p-value")) + ggtitle("Likelihood-Ratio Test")
ggplot(par, aes(x = size, y = prob, colour = cut_number(p.d, n = 10))) + geom_point() +
guides(colour = guide_legend("p-value")) + ggtitle("Posterior Test")
ggplot(par, aes(x = p.c, y = p.d)) + geom_point()
ggplot(par, aes(x = tcr, y = p.c)) + geom_point(aes(y = 1 - pchisq(tcr, 2)),
colour = "yellow") + geom_point(alpha = 0.5) + xlab("Log Likelihood Ratio") +
ylab("p-value")