It depends on how complicated the log probability
function is. It all comes down to the number of
function applications involved in computing it: each of
those functions needs its partial derivatives
calculated and the chain rule applied, and that is where
all the time goes.
To get a feeling for the scaling, simulate smaller data sets, then
increase the size and measure the trend.
I could fit IRT models with 1M data points and 20K parameters
in hours with four chains.
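For example, something along these lines (a rough sketch;
makeData() here stands in for whatever simulator you use, and
modelObj for your compiled model):

for (n in c(1e3, 1e4, 1e5))
{
    dat <- makeData(n) #hypothetical: simulate a data set of size n
    print(system.time(sampling(modelObj, data = dat, iter = 200, chains = 1)))
}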
Thanks for spending your time answering another big email of mine.
It looks like I found the source of the problem: the difference in
magnitude among the model's covariates.
For instance, generating my matrix of covariates bbgg in the
following way works:
sampleSize <- 1e5
bbgg <- c(rep(1,sampleSize/4),rep(0,sampleSize/4),rep(2,sampleSize/2))
for(i in 1:3)
{
    x1 <- c(rnorm(sampleSize/2,i,runif(1,1,5)),rnorm(sampleSize/2,i*1.2,runif(1,1,5)))
    x2 <- c(rnorm(sampleSize/4,i*1.1,runif(1,1,5)),rnorm(sampleSize/4,i*1.4*1000,runif(1,1,5)),rnorm(sampleSize/2,i*1.7,runif(1,1,5)))
    bbgg <- cbind(bbgg,x1,x2)
}
On the other hand, when there is a big difference in magnitude, it
doesn't work: Stan gets stuck at some early iteration, in this case
the 17th.
(The only difference from the code above is the extra *1000 factors
in x2.)
sampleSize <- 1e5
bbgg <- c(rep(1,sampleSize/4),rep(0,sampleSize/4),rep(2,sampleSize/2))
for(i in 1:3)
{
    x1 <- c(rnorm(sampleSize/2,i,runif(1,1,5)),rnorm(sampleSize/2,i*1.2,runif(1,1,5)))
    x2 <- c(rnorm(sampleSize/4,i*1.1*1000,runif(1,1,5)),rnorm(sampleSize/4,i*1.4*1000,runif(1,1,5)),rnorm(sampleSize/2,i*1.7*1000,runif(1,1,5)))
    bbgg <- cbind(bbgg,x1,x2)
}
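The disparity shows up directly in the column means:

round(colMeans(bbgg)) #the x2 columns sit in the thousands while the others stay near single digits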
Placing print(lp__) right before the end of the model section shows
that Stan is still actively computing log-likelihoods of my model
even though it doesn't move on to the next iteration. In fact, the
log-likelihood seems to be moving in circles.
I know almost nothing about how HMC and NUTS work, so I don't have a
good idea of why covariates with different magnitudes are making
Stan unable to continue. Do you know of a workaround for this
problem?
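The only idea I have myself is to standardize the covariates before
fitting and transform the estimates back afterwards, along these
lines (untested sketch; bbggScaled is just my name for the rescaled
matrix):

bbggScaled <- scale(bbgg) #center and scale each column; the centers and scales are kept as attributes
#fit with bbggScaled instead of bbgg, then map the estimates back using
#attr(bbggScaled, "scaled:center") and attr(bbggScaled, "scaled:scale")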
Note:
Making the following changes (and also trying combinations of them)
doesn't seem to help; at most they change the iteration at which it
gets stuck:
>>> Starting from the MLE obtained previously.
>>> Using normal priors for all beta and gamma (the other
parameters are transformed parameters derived from those two and
the data).
>>> Constraining all beta and gamma parameters to lie in a
certain region.
>>> Using Kahan summation (sketched just below).
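By Kahan summation I mean the usual compensated summation; in R
terms, the algorithm is roughly:

kahanSum <- function(x)
{
    s <- 0 #running sum
    comp <- 0 #running compensation for the lost low-order bits
    for (xi in x)
    {
        y <- xi - comp
        t <- s + y #the low-order bits of y may be lost in this addition
        comp <- (t - s) - y #recover what was lost
        s <- t
    }
    s
}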
---------------------------------------
Also, I get strange behavior when I constrain eta and/or zeta
(transformed parameters) to certain ranges: when I start sampling,
Stan suddenly consumes huge amounts of RAM, 10 or 20 times more than
without the constraints. The whole system then grinds to a halt
because of the aggressive swapping, and it takes a long time to quit
rstan. I wonder why this happens.
---------------------------------------
Here follows my model specification (should be fully reproducible
code):
require(rstan)
require(coda)
zi_model_code <- '
data
{
int <lower=0> nrow;
int <lower=0> ncol;
int <lower=0> yy[nrow];
matrix [nrow,ncol] bbgg;
int <lower=0> nZeros;
}
transformed data
{
int <lower=0> nNonZeros;
nNonZeros <- nrow-nZeros;
}
parameters
{
vector [ncol] beta;
vector [ncol] gamma;
}
transformed parameters
{
real <lower=0> mu[nrow];
real eta[nrow];
real zeta[nrow];
for (i in 1:nrow)
{
eta[i] <- bbgg[i] * beta;
zeta[i] <- bbgg[i] * gamma;
mu[i] <- exp(eta[i]);
}
}
model {
real summandsZ1[nZeros];
real summandsZ2[nZeros];
real summandsZ3[nZeros];
real summandsN1[nNonZeros];
real summandsN2[nNonZeros];
real summandsN3[nNonZeros];
int j;
int k;
j <- 1;
k <- 1;
for (i in 1:nrow)
{ #log-likelihood of observation i is: log( w[i]*equals(yy[i],0) + (1-w[i])*poisson(yy[i]|mu[i]) ), with w[i] = inv_logit(zeta[i])
#after some algebra it reduces to zeta[i] + log1p_exp(-zeta[i]-mu[i]) - log1p_exp(zeta[i]) when yy[i]==0,
#and (dropping the constant -log(yy[i]!)) to -mu[i] + yy[i]*eta[i] - log1p_exp(zeta[i]) when yy[i]>0:
if (yy[i]==0)
{
summandsZ1[j] <- zeta[i];
summandsZ2[j] <- log1p_exp(-zeta[i]-mu[i]);
summandsZ3[j] <- -log1p_exp(zeta[i]);
j <- j+1;
}
else
{
summandsN1[k] <- -mu[i];
summandsN2[k] <- yy[i]*eta[i];
summandsN3[k] <- -log1p_exp(zeta[i]);
k <- k+1;
}
#the correctness was checked using simulated data
}
lp__ <- lp__ + sum(summandsN1) + sum(summandsN2) + sum(summandsN3) + sum(summandsZ1) + sum(summandsZ2) + sum(summandsZ3);
#for (i in 1:ncol) {
#  beta[i] ~ normal(0, 10);
#  gamma[i] ~ normal(0, 10);
#}
#print(lp__);
}
'
#model definition ends here
#Compile only if modified
if(ifelse(exists("zi_model_code2"),zi_model_code2,"")!=zi_model_code || !exists("modelObj"))
{
if (exists("modelObj")) rm(modelObj)
zi_model_code2<-zi_model_code
modelObj <- stan_model(model_code = zi_model_code)
}
#Random zero-inflated poisson generator
rZIP = function(x,mu,w)
{
a=ifelse(runif(x)<w,0,1)
count1=length(which(a==1))
if(length(mu)!=1)
{
if (length(mu)!=x) stop("mu must have size 1 or the same size as x")
mu <- mu[a==1] #keep only the entries where the first stage got 1
}
if(length(w)!=1)
{
if (length(w)!=x) stop("w must have size 1 or the same size as x")
w <- w[a==1] #keep only the entries where the first stage got 1
}
a[a==1]=rpois(count1,mu)
a
}
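As a quick sanity check of the generator (illustrative values only):

draws <- rZIP(1e5, 1.3, 0.7)
mean(draws==0) #should be near w + (1-w)*exp(-mu) = 0.7 + 0.3*exp(-1.3) ≈ 0.78
mean(draws) #should be near (1-w)*mu = 0.39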
sampleSize<-1e5
set.seed(10)
#Generate the simulated regressand vector yy and the covariate matrix bbgg
yy <- c(rZIP(sampleSize/4,1.2,0.7),rZIP(sampleSize/4,1.4,0.6),rZIP(sampleSize/2,1.3,0.8))
bbgg <- c(rep(1,sampleSize/4),rep(0,sampleSize/4),rep(2,sampleSize/2))
for(i in 1:3)
{
    x1 <- c(rnorm(sampleSize/2,i,runif(1,1,5)),rnorm(sampleSize/2,i*1.2,runif(1,1,5)))
    x2 <- c(rnorm(sampleSize/4,i*1.1*1000,runif(1,1,5)),rnorm(sampleSize/4,i*1.4*1000,runif(1,1,5)),rnorm(sampleSize/2,i*1.7*1000,runif(1,1,5)))
    bbgg <- cbind(bbgg,x1,x2)
}
bbgg <- cbind(1,bbgg)
zi_model_dat <- list( yy = yy, bbgg = bbgg, nrow=NROW(yy), ncol=NCOL(bbgg), nZeros=NROW(which(yy==0)) )
#inits <- list("beta0"=0.1726, "beta[1]"=-0.8719, "beta[2]"=0.3072, "gamma0"=1.042, "gamma[1]"=-1.163, "gamma[2]"=1.463)
#initsV <- list(beta=as.vector(regfBeta), gamma=as.vector(regfGamma))
#for(i in 1:length(regfBeta)) initsV[[paste0("beta[",i,"]")]] <- regfBeta[i]
#for(i in 1:length(regfGamma)) initsV[[paste0("gamma[",i,"]")]] <- regfGamma[i]
###### This is a function to do parallel sampling
#All parameters are the same as in rstan::sampling, except nThreads (the number of cores to use) and silent (if TRUE, suppress the stdout of the child processes)
mcSampling <- function(object, data = list(), pars = NA, ..., nThreads=2, silent = TRUE)
{
    if(nThreads<2)
        stop("For only one thread, use rstan::sampling directly")
    require(rstan)
    require(multicore)
    tryCatch(kill(children(),9), error = function(e) {}) #clean up any leftover child processes
    extraArgsList <- list(...)
    pids <- list()
    for (i in 1:(nThreads-1))
        pids[[i]] <- parallel({ fit <- do.call(function(...){ sampling(object, data, pars, ...) }, extraArgsList) }, silent = silent, mc.set.seed = TRUE)
    results <- list()
    results[[1]] <- sampling(object, data, pars, ...) #one of the chains runs in the current session, so we can watch its output
    for (i in 1:(nThreads-1))
        results[[i+1]] <- collect(pids[[i]])[[1]]
    results #return a list of stanfit objects
}
print(system.time(fit <- mcSampling(modelObj, data = zi_model_dat, pars=c("beta","gamma"), iter = 1000, chains = 1, init=0, refresh=1, nThreads=2, max_treedepth=-1)))
########### Results analysis ################
#stan-to-coda converter: accepts multiple stanfit objects as arguments, each containing 1 or more chains
#it can also accept a list of stanfit objects as a single argument
rstanToCoda <- function(...)
{
require(rstan)
require(coda)
input_list <- list(...)
if(class(input_list[[1]]) == "list") #support for a list of stanfit objects
    return (do.call(rstanToCoda, input_list[[1]]))
mcmcTemp <- list()
k=1
j=1
while(TRUE)
{
object <- as.array(input_list[[k]])
for (i in 1:(dim(object)[2]))
{
mcmcTemp[[j]] <- mcmc(object[,i,]) #object is already an array; one mcmc object per chain
j=j+1
}
k=k+1
if(k>length(input_list)) break;
}
do.call(mcmc.list,mcmcTemp)
}
a<-rstanToCoda(fit)
#a<-a[,varnames(a)!="lp__"]
print(gelman.diag( a ))