Dear all,
I'm fairly new to this topic, but in the near future I will have to use hierarchical models as well as GLMMs to estimate trends in population sizes. One of the things I would like to work on is the implementation of covariates, which means that I'll use information criteria and/or cross-validation for model selection, to see which covariates should remain in my models.
Now, I used one of Marc Kéry's simulated datasets (Kéry 2010 - Chapter 21) to try to implement Bayesian p-values as well as several different information criteria (DIC, WAIC, Posterior Predictive Loss (D)) and cross-validation, but I have a hard time figuring out if I'm doing it correctly. Does anyone have experience with these methods? Can anyone tell me if and where I'm making mistakes? Many thanks for your help.
Best, Jelle
###########################################################################################################
If I run the code below, it takes about 60 minutes to produce the following table:
fold |
bay.p |
c.hat |
bay.p.cv |
c.hat.cv |
DIC |
WAIC |
D |
non.convergence |
n.params.estimated |
1 |
0.31 |
1.08 |
0.00 |
4.37 |
867 |
925 |
6174 |
12 |
1215 |
2 |
0.47 |
1.02 |
0.00 |
8.54 |
882 |
920 |
6163 |
9 |
1215 |
3 |
0.37 |
1.05 |
0.00 |
4.28 |
896 |
999 |
6346 |
11 |
1215 |
4 |
0.36 |
1.05 |
0.00 |
4.17 |
918 |
914 |
6191 |
17 |
1215 |
5 |
0.45 |
1.03 |
0.00 |
6.56 |
910 |
971 |
6296 |
19 |
1215 |
6 |
0.37 |
1.05 |
0.00 |
4.49 |
922 |
1015 |
6221 |
13 |
1215 |
7 |
0.31 |
1.07 |
0.02 |
3.32 |
912 |
940 |
6242 |
33 |
1215 |
8 |
0.25 |
1.10 |
0.01 |
3.18 |
894 |
1007 |
6340 |
18 |
1215 |
9 |
0.36 |
1.05 |
0.00 |
5.58 |
910 |
913 |
6168 |
10 |
1215 |
10 |
0.36 |
1.06 |
0.00 |
19.75 |
908 |
903 |
5885 |
19 |
1215 |
average |
0.36 |
1.06 |
0.00 |
6.42 |
901.9 |
950.5 |
6202.6 |
16.1 |
|
###########################################################################################################
library(R2jags)
#######################
### Data generation ###
#######################
n.site <- 200 # number of sites
vege <- sort(runif(n = n.site, min = -1.5, max =1.5)) # vegetation cover
alpha.lam <- 2 # intercept
beta1.lam <- 2 # linear effect of vegetation
beta2.lam <- -2 # quadratic effect of vegetation
lam <- exp(alpha.lam + beta1.lam*vege + beta2.lam*(vege^2)) # lambda
par(mfrow = c(2,1))
plot(vege, lam, main = "", xlab = "", ylab = "Expected abundance", las = 1)
N <- rpois(n = n.site, lambda = lam) # simulation of abundance
table(N) # distribution of abundances across sites
sum(N > 0) / n.site # empirical occupancy
plot(vege, N, main = "", xlab = "Vegetation cover", ylab = "Realized abundance")
points(vege, lam, type = "l", lwd = 2)
par(mfrow = c(2,1))
alpha.p <- 1 # intercept
beta.p <- -4 # linear effect of vegetation
det.prob <- exp(alpha.p + beta.p * vege) / (1 + exp(alpha.p + beta.p * vege)) # detection probability
plot(vege, det.prob, ylim = c(0,1), main = "", xlab = "", ylab = "Detection probability")
expected.count <- N * det.prob # model without error
plot(vege, expected.count, main="", xlab="Vegetation cover",
ylab="Apparent abundance", ylim = c(0, max(N)), las = 1)
points(vege, lam, type = "l", col = "black", lwd = 2) # Truth
n.survey <- 3 # number of replicate counts per site
count <- array(dim = c(n.site, n.survey)) # array to fill in simulated counts
for(j in 1:n.survey){
count[,j] <- rbinom(n=n.site, size=N, prob=det.prob) # simulation of counts
}
max.count <- apply(count, 1, max) # maximum count per site
naive.analysis <- glm(max.count ~ vege + I(vege^2), family = poisson) # simple glm, assusuming perfect detection
summary(naive.analysis)
lin.pred <- naive.analysis$coefficients[1] + naive.analysis$coefficients[2]*vege + naive.analysis$coefficients[3] * (vege*vege)
# linear predictor of naive analysis
par(mfrow = c(1,1))
plot(vege, max.count, main = "", xlab = "Vegetation cover",
ylab = "Abundance or count or detection probability*max.N", ylim = c(0,max(N)), las = 1)
points(vege, lam, type = "l", col = "black", lwd = 2, lty=2) # true abundance
points(vege, exp(lin.pred), type = "l", col = "red", lwd = 2) # prediction of naive analysis
points(vege, det.prob*max(N), type="l", col="blue", lty=3) # detection probability * maximum abundance
###################################################################################
### Binomial mixture model, including cross-validation and information criteria ###
###################################################################################
# Model definition
bin.mix <- function(){
# Priors
alpha.lam ~ dunif(-10, 10)
beta1.lam ~ dunif(-10, 10)
beta2.lam ~ dunif(-10, 10)
alpha.p ~ dunif(-10, 10)
beta.p ~ dunif(-10, 10)
# Likelihood
for (i in indices) {
N[i] ~ dpois(lambda[i])
log(lambda[i]) <- alpha.lam + beta1.lam*vege[i] + beta2.lam*vege[i]*vege[i]
for (j in 1:n.survey) {
lp[i,j] <- alpha.p + beta.p * vege[i]
p[i,j] <- exp(lp[i,j]) / (1 + exp(lp[i,j]))
}
}
for (i in train.indices){
for (j in 1:n.survey){
count[i,j] ~ dbin(p[i,j], N[i])
}
}
totalN <- sum(N[])
# Goodness-of-fit measures
for (i in indices) {
for (j in 1:n.survey) {
count.sim[i,j] ~ dbin(p[i,j], N[i])
exp.count[i,j] <- p[i,j] * N[i]
resid.actual[i,j] <- pow((count[i,j]-exp.count[i,j]),2) / (exp.count[i,j] + 0.5)
resid.sim[i,j] <- pow((count.sim[i,j]-exp.count[i,j]),2) / (exp.count[i,j] + 0.5)
ppd[i,j] <- pow(p[i,j],count[i,j]) * pow((1-p[i,j]),(N[i]-count[i,j]))
}
}
fit.actual <- sum(resid.actual[train.indices,])
fit.sim <- sum(resid.sim[train.indices,])
c.hat <- (fit.actual+0.001) / (fit.sim+0.001)
bay.p <- step(fit.sim - fit.actual)
fit.cv.actual <- sum(resid.actual[test.indices,])
fit.cv.sim <- sum(resid.sim[test.indices,])
c.hat.cv <- (fit.cv.actual+0.001) / (fit.cv.sim+0.001)
bay.p.cv <- step(fit.cv.sim - fit.cv.actual)
}
# Model settings
nc <- 3
nb <- 5000
ni <- 25000
nt <- 4
params <- c("totalN", "alpha.lam", "beta1.lam", "beta2.lam", "alpha.p", "beta.p", "fit.actual", "fit.sim",
"c.hat", "bay.p", "fit.cv.actual", "fit.cv.sim", "
bay.p.cv", "
c.hat.cv", "ppd", "count.sim")
# Shuffle dataset for cross-validation
count.initial <- max.count+1 # initial value for Bayesian model
indices <- seq(1:n.site)
set.seed(1)
shuffled.indices <- sample(indices)
shuffled.sites <- site[shuffled.indices]
shuffled.vege <- vege[shuffled.indices]
shuffled.count <- count[shuffled.indices,]
shuffled.count.initial <- count.initial[shuffled.indices]
# Run model including 10-fold cross-validation
start.time = Sys.time() # set timer
results.model <- list() # list to save results of each fold
results.ICs <- data.frame(matrix(NA, ncol=8, nrow=10)) # data frame to save results of information criteria
colnames(results.ICs) <- c("bay.p", "c.hat", "
bay.p.cv", "
c.hat.cv", "DIC", "WAIC", "D", "non.convergence")
for (i in 1:10) {
test.indices <- ((i-1)*round(0.1*n.site)+1):(i*round(0.1*n.site))
train.indices <- indices[-test.indices]
inits <- function(){list(N=shuffled.count.initial, alpha.lam=rnorm(1, 0, 1), alpha.p=rnorm(1, 0, 1),
beta.p=rnorm(1, 0, 1), beta1.lam=rnorm(1, 0, 1), beta2.lam=rnorm(1, 0, 1))}
jags.data <- list(indices=indices, train.indices=train.indices, test.indices=test.indices, n.survey=n.survey,
vege=shuffled.vege, count=shuffled.count)
out <- jags(jags.data, inits, params, bin.mix, n.chains=nc, n.iter=ni, n.burn = nb, n.thin=nt)
bay.p <- out$BUGSoutput$mean$bay.p
c.hat <- out$BUGSoutput$mean$c.hat
DIC <- out$BUGSoutput$DIC
lppd.sum <- -2*sum(log(apply(out$BUGSoutput$sims.list$ppd,2,mean)))
pD.2 <- sum(apply(log(out$BUGSoutput$sims.list$ppd), 2, function(x) sd(x)*sd(x)))
WAIC <- lppd.sum + pD.2
count.sim.mean <- apply(out$BUGSoutput$sims.list$count.sim,2,mean)
sum1 <- sum((count-count.sim.mean)^2)
sum2 <- sum(apply(out$BUGSoutput$sims.list$count.sim,2,function(x) sd(x)*sd(x)))
D <- sum1 + sum2
non.convergence <- length(which(out$BUGSoutput$summary[,8]>1.1))
results.model[[i]] <- list(summary=data.frame(out$BUGSoutput$summary))
results.ICs[i,] <- c(bay.p, c.hat,
bay.p.cv,
c.hat.cv, DIC, WAIC, D, non.convergence)
}
end.time = Sys.time()
elapsed.time = round(difftime(end.time, start.time, units='mins'), dig = 2)
cat(paste(paste('Posterior models computed in ', elapsed.time, sep=''), ' minutes\n\n', sep=''))
results.ICs