Well, I think I've got a solution to the problem I was facing in trying to determine whether a categorical variable should be included in the model or not (as opposed to getting posterior inclusion probabilities for the individual indicator variables that represent the categorical variable in the model). Perhaps someone more knowledgeable than me could chime in if they think there's something incorrect with the approach.
I used a coding trick introduced by
Ntzoufras (2002) to deal with categorical variables in a Gibbs Variable Selection (GVS) approach. The method uses a vector (i.e., pos) and nested indexing to track which indicator variables should associate with which coefficients. For a categorical variable, this causes the entire variable to be removed when z = 0 for that coefficient, which is what I was looking to achieve:
# Position vector: maps each of the 10 post-intercept coefficients to one of
# 5 variables; repeated values tie all indicator columns of one categorical
# variable to a single inclusion flag.
rjmcmc_data$pos = c(1,2,2,2,2,3,4,4,4,5) # make indexing to deal with categorical variables
# Inside the nimble model: one Bernoulli inclusion flag per *variable* ...
for(i in 1:nvars){
z[i] ~ dbern(0.5)
}
# ... expanded to per-coefficient indicators via nested indexing, so every
# indicator column of a categorical variable drops out together when z = 0.
for(i in 1:(ncoefs-1)){
z.temp[i] <- z[pos[i]]
}
In the simulation, I fit the model in a frequentist framework and use AIC for comparison to the Bayesian model selection inference. I then use the RJMCMC algorithm provided in nimble and also compare the Gibbs Variable Selection method, which is similar but uses pseudopriors to take draws from the prior when
z = 0. Of course, one has to consider the priors used in any Bayesian model or model selection routine, so I tested a couple of different priors. For a logistic regression model, a normal(0, 0.5) prior for the beta coefficients (parameterized as mean and precision) is a vague prior when passed through the logit link function.
Anyhow, I was pleased that the results from the Bayesian model selection procedures using this coding adjustment produced results that were very consistent with the frequentist inference and very similar across the RJMCMC and GVS. Also, without the position vector included, there are too many possible models to choose from since the number of models would depend on the number of levels in each categorical variable along with the overall number of variables.
Here is the code to 1) simulate the data, 2) fit frequentist models, 3) fit RJMCMC with the position vector for categorical variables, 4) fit RJMCMC with no position vector and a latent variable for psi (the inclusion probability), 5) fit RJMCMC with psi fixed at 0.5, and 6) fit GVS with the position vector included:
# 1) Simulate a logistic regression problem and prepare it for the 4 model selection tests
library(Survout) # for the factor2ind() helper (categorical factor -> indicator columns)
# fix the random seed so the simulated data are reproducible
set.seed(500)
# Standardize a numeric vector: center on its mean and divide by TWO standard
# deviations (Gelman 2008), so slopes are comparable with binary predictors.
# NAs are ignored when computing the mean and sd (and are preserved in the output).
std2 <- function(x) {
  (x - mean(x, na.rm = TRUE)) / (2 * sd(x, na.rm = TRUE))
}
# Sample size
N <- 100
# Covariates: two continuous (standardized to 2-SD units) and three categorical
elev <- std2(rlnorm(N, log(500), sd = 1))
hsi.var <- as.factor(apply(rmultinom(N, 1, prob = rep(1/5, 5)), 2,
                           function(x) which(x == 1)))  # 5-level categorical
hsi.ind <- factor2ind(hsi.var, "1")                     # indicators, level 1 = reference
site.var <- as.factor(apply(rmultinom(N, 1, prob = rep(1/4, 4)), 2,
                            function(x) which(x == 1))) # 4-level categorical
site.ind <- factor2ind(site.var, "1")                   # indicators, level 1 = reference
sess.var <- as.factor(rbinom(N, 1, 0.5))                # binary covariate
sess.ind <- factor2ind(sess.var, "1")                   # single indicator
slope <- std2(runif(N, 0, 90))
# Coefficients of the true data-generating model: ~ elev + site + slope
beta <- c(-1, 0.75, 0.2, -0.5, 0.3, 0.6)
set.seed(100)
# Linear predictor on the logit scale, then Bernoulli draws for the response
eta <- beta[1] + elev * beta[2] + site.ind[, 1] * beta[3] +
  site.ind[, 2] * beta[4] + site.ind[, 3] * beta[5] + slope * beta[6]
y <- rbinom(N, 1, plogis(eta))
##### TEST MODEL SELECTION METHODS AGAINST FREQUENTIST INFERENCE #####
# 2) Frequentist: fit the full model, then all-subsets selection with dredge()
freq_data <- data.frame(y = y, elev = elev, hsi = hsi.var, site = site.var,
                        sess = sess.var, slope = slope)
# Inspect the assembled dataset
head(freq_data)
fit1 <- glm(y ~ elev + hsi + sess + site + slope, na.action = "na.fail",
            family = "binomial", data = freq_data)
summary(fit1)
library(MuMIn)
(dredge.fit <- dredge(fit1))                         # AIC table over all subsets
(top.model <- get.models(dredge.fit, subset = 1)[[1]]) # best model by AICc
drop1(fit1)                                          # single-term deletions
####################################################################################################################################
# 3) RJMCMC in nimble (test 1): one prior inclusion probability per VARIABLE,
#    using a position vector so all indicators of a categorical variable share
#    the same inclusion flag.
library(nimble)
rjmcmc_data <- list(y = y, elev = elev,
                    hsi2 = hsi.ind[, 1], hsi3 = hsi.ind[, 2],
                    hsi4 = hsi.ind[, 3], hsi5 = hsi.ind[, 4],
                    site2 = site.ind[, 1], site3 = site.ind[, 2],
                    site4 = site.ind[, 3],
                    sess = sess.ind[, 1], slope = slope)
# pos maps each of the 10 post-intercept coefficients to one of 5 variables;
# repeated values group a categorical variable's indicator columns together.
rjmcmc_data$pos <- c(1, 2, 2, 2, 2, 3, 4, 4, 4, 5)
rjmcmc_data$prior_prec <- 0.5 # prior precision for the beta coefficients
rjmcmc_data$ncoefs <- 11      # coefficients including the intercept (intercept always included)
rjmcmc_data$nvars <- max(rjmcmc_data$pos, na.rm = TRUE) # number of variables under selection
# Model code: inclusion flags z[] are per variable and expanded to
# per-coefficient indicators z.temp[] via nested indexing on pos.
rjmcmc_code <- nimbleCode({
  # Priors (truncation keeps the sampler away from extreme logits)
  beta[1] ~ T(dnorm(0, 0.5), -10, 10)
  for (i in 2:ncoefs) {
    beta[i] ~ T(dnorm(0, prior_prec), -10, 10)
  }
  # psi ~ dunif(0, 1) ## optional hyperprior on the inclusion probability
  # One Bernoulli inclusion flag per variable
  for (i in 1:nvars) {
    z[i] ~ dbern(0.5)
  }
  # Expand per-variable flags to per-coefficient indicators
  for (i in 1:(ncoefs - 1)) {
    z.temp[i] <- z[pos[i]]
  }
  # Likelihood (note: original had a stray duplicated "+" here, removed)
  for (i in 1:N) {
    logit(p[i]) <- beta[1] + beta[2] * z.temp[1] * elev[i] +
      beta[3] * z.temp[2] * hsi2[i] + beta[4] * z.temp[3] * hsi3[i] +
      beta[5] * z.temp[4] * hsi4[i] + beta[6] * z.temp[5] * hsi5[i] +
      beta[7] * z.temp[6] * sess[i] + beta[8] * z.temp[7] * site2[i] +
      beta[9] * z.temp[8] * site3[i] + beta[10] * z.temp[9] * site4[i] +
      beta[11] * z.temp[10] * slope[i]
    y[i] ~ dbern(p[i])
  }
})
# Build the model, configure RJ samplers on the betas with z.temp as
# indicator nodes, compile, and run.
RJexampleModel <- nimbleModel(
  rjmcmc_code,
  constants = list(N = length(rjmcmc_data$y), pos = rjmcmc_data$pos,
                   nvars = rjmcmc_data$nvars, ncoefs = rjmcmc_data$ncoefs,
                   prior_prec = rjmcmc_data$prior_prec),
  data = rjmcmc_data[1:11],
  inits = list(beta = as.numeric(summary(fit1)$coefficients[, 1]),
               z = rbinom(rjmcmc_data$nvars, 1, 0.5)) # ,psi=runif(1,0.25,0.5)
)
RJexampleModel$calculate() # sanity check: finite log-probability at the inits
RJexampleConf <- configureMCMC(RJexampleModel)
configureRJ(conf = RJexampleConf,
            targetNodes = paste0("beta[", 2:11, "]"),
            indicatorNodes = paste0("z.temp[", 1:10, "]"),
            control = list(mean = rep(0, rjmcmc_data$ncoefs - 1), scale = 2))
RJexampleConf$printSamplers(c("z.temp[4]", "beta[3]"))
# Monitor both the per-coefficient and per-variable indicators
if (!("z.temp" %in% RJexampleConf$monitors)) {
  RJexampleConf$addMonitors("z.temp")
}
if (!("z" %in% RJexampleConf$monitors)) {
  RJexampleConf$addMonitors("z")
}
mcmcRJ <- buildMCMC(RJexampleConf)
cRJexampleModel <- compileNimble(RJexampleModel)
CMCMCRJ <- compileNimble(mcmcRJ, project = RJexampleModel, resetFunctions = TRUE)
set.seed(100)
system.time(samplesRJ <- runMCMC(CMCMCRJ, niter = 100000, nburnin = 10000, thin = 10))
# Posterior means of all monitored nodes
round(apply(samplesRJ, 2, mean), 3)
# Per-variable inclusion probabilities (posterior mean of each z)
zcols <- paste0("z[", 1:5, "]")
(d1 <- data.frame(var = c("elev", "hsi", "sess", "site", "slope"),
                  pr = round(as.numeric(colMeans(samplesRJ[, zcols])), 3)))
png("Indicator_RJMCMC3a.png", width = 8, height = 5, units = "in", res = 300)
barplot(pr ~ var, data = d1, ylim = c(0, 1),
        ylab = "Inclusion probability", xlab = "Coefficient")
dev.off()
# Per-coefficient inclusion proportions (should agree within each variable)
round(as.numeric(colMeans(samplesRJ[, paste0("z.temp[", 1:10, "]")])), 3)
# Posterior model probabilities from the visited z patterns
g <- samplesRJ[, zcols]
g2 <- apply(g, 1, paste, collapse = "")
sort(round(table(g2) / length(g2), 3), decreasing = TRUE)
# 4) RJMCMC in nimble (test 2): unmodified RJMCMC with psi as a latent
#    inclusion probability and one indicator per COEFFICIENT (no "z", just
#    "z.temp"), so a categorical variable's indicators can enter separately.
rjmcmc_code <- nimbleCode({
  # Priors
  beta[1] ~ T(dnorm(0, 0.5), -10, 10)
  for (i in 2:ncoefs) {
    beta[i] ~ T(dnorm(0, prior_prec), -10, 10)
  }
  psi ~ dunif(0, 1) ## hyperprior on the inclusion probability
  # One Bernoulli inclusion flag per coefficient
  for (i in 1:(ncoefs - 1)) {
    z.temp[i] ~ dbern(psi)
  }
  # Likelihood (note: original had a stray duplicated "+" here, removed)
  for (i in 1:N) {
    logit(p[i]) <- beta[1] + beta[2] * z.temp[1] * elev[i] +
      beta[3] * z.temp[2] * hsi2[i] + beta[4] * z.temp[3] * hsi3[i] +
      beta[5] * z.temp[4] * hsi4[i] + beta[6] * z.temp[5] * hsi5[i] +
      beta[7] * z.temp[6] * sess[i] + beta[8] * z.temp[7] * site2[i] +
      beta[9] * z.temp[8] * site3[i] + beta[10] * z.temp[9] * site4[i] +
      beta[11] * z.temp[10] * slope[i]
    y[i] ~ dbern(p[i])
  }
})
# Build, configure, compile, and run test 2 (per-coefficient indicators).
RJexampleModel <- nimbleModel(
  rjmcmc_code,
  constants = list(N = length(rjmcmc_data$y), pos = rjmcmc_data$pos,
                   nvars = rjmcmc_data$nvars, ncoefs = rjmcmc_data$ncoefs,
                   prior_prec = rjmcmc_data$prior_prec),
  data = rjmcmc_data[1:11],
  inits = list(beta = as.numeric(summary(fit1)$coefficients[, 1]),
               z.temp = rbinom(rjmcmc_data$ncoefs - 1, 1, 0.5),
               psi = runif(1, 0.25, 0.5))
)
RJexampleModel$calculate() # sanity check: finite log-probability at the inits
RJexampleConf <- configureMCMC(RJexampleModel)
configureRJ(conf = RJexampleConf,
            targetNodes = paste0("beta[", 2:11, "]"),
            indicatorNodes = paste0("z.temp[", 1:10, "]"),
            control = list(mean = rep(0, rjmcmc_data$ncoefs - 1), scale = 2))
RJexampleConf$printSamplers(c("z.temp[4]", "beta[3]"))
if (!("z.temp" %in% RJexampleConf$monitors)) {
  RJexampleConf$addMonitors("z.temp")
}
mcmcRJ <- buildMCMC(RJexampleConf)
cRJexampleModel <- compileNimble(RJexampleModel)
CMCMCRJ <- compileNimble(mcmcRJ, project = RJexampleModel, resetFunctions = TRUE)
set.seed(100)
system.time(samplesRJ <- runMCMC(CMCMCRJ, niter = 100000, nburnin = 10000, thin = 10))
# Per-coefficient posterior summaries and inclusion probabilities
round(apply(samplesRJ, 2, mean), 3)
ztcols <- paste0("z.temp[", 1:10, "]")
(d1 <- data.frame(var = c("elev", "hsi2", "hsi3", "hsi4", "hsi5", "sess",
                          "site2", "site3", "site4", "slope"),
                  pr = round(as.numeric(colMeans(samplesRJ[, ztcols])), 3)))
png("Indicator_RJMCMC3b.png", width = 8, height = 5, units = "in", res = 300)
barplot(pr ~ var, data = d1, ylim = c(0, 1),
        ylab = "Inclusion probability", xlab = "Coefficient")
dev.off()
# Top 20 visited models by posterior probability
g <- samplesRJ[, ztcols]
g2 <- apply(g, 1, paste, collapse = "")
sort(round(table(g2) / length(g2), 3), decreasing = TRUE)[1:20]
# 5) RJMCMC in nimble (test 3): unmodified RJMCMC with no indicator nodes;
#    a fixed priorProb = 0.5 is supplied to configureRJ instead.
#    (Original comment mislabeled this as "test 2".)
rjmcmc_code <- nimbleCode({
  # Priors
  beta[1] ~ T(dnorm(0, 0.5), -10, 10)
  for (i in 2:ncoefs) {
    beta[i] ~ T(dnorm(0, prior_prec), -10, 10)
  }
  # Likelihood (no indicators; configureRJ toggles the betas directly;
  # note: original had a stray duplicated "+" here, removed)
  for (i in 1:N) {
    logit(p[i]) <- beta[1] + beta[2] * elev[i] + beta[3] * hsi2[i] +
      beta[4] * hsi3[i] + beta[5] * hsi4[i] + beta[6] * hsi5[i] +
      beta[7] * sess[i] + beta[8] * site2[i] + beta[9] * site3[i] +
      beta[10] * site4[i] + beta[11] * slope[i]
    y[i] ~ dbern(p[i])
  }
})
# Build, configure, compile, and run test 3 (fixed prior inclusion probability).
RJexampleModel <- nimbleModel(
  rjmcmc_code,
  constants = list(N = length(rjmcmc_data$y), pos = rjmcmc_data$pos,
                   nvars = rjmcmc_data$nvars, ncoefs = rjmcmc_data$ncoefs,
                   prior_prec = rjmcmc_data$prior_prec),
  data = rjmcmc_data[1:11],
  inits = list(beta = as.numeric(summary(fit1)$coefficients[, 1]))
)
RJexampleModel$calculate() # sanity check: finite log-probability at the inits
RJexampleConf <- configureMCMC(RJexampleModel)
# No indicatorNodes: configureRJ uses the fixed priorProb for each target
configureRJ(conf = RJexampleConf,
            targetNodes = paste0("beta[", 2:11, "]"),
            priorProb = 0.5,
            control = list(mean = rep(0, rjmcmc_data$ncoefs - 1), scale = 2))
mcmcRJ <- buildMCMC(RJexampleConf)
cRJexampleModel <- compileNimble(RJexampleModel)
CMCMCRJ <- compileNimble(mcmcRJ, project = RJexampleModel, resetFunctions = TRUE)
set.seed(100)
system.time(samplesRJ <- runMCMC(CMCMCRJ, niter = 100000, nburnin = 10000, thin = 10))
# Without indicators, inclusion = proportion of samples where beta is nonzero
betaCols <- grep("beta\\[", colnames(samplesRJ))
posterior_inclusion_proportions <- colMeans(samplesRJ[, betaCols] != 0)
(d1 <- data.frame(var = c("elev", "hsi2", "hsi3", "hsi4", "hsi5", "sess",
                          "site2", "site3", "site4", "slope"),
                  pr = posterior_inclusion_proportions[2:11]))
png("Indicator_RJMCMC3c.png", width = 8, height = 5, units = "in", res = 300)
barplot(pr ~ var, data = d1, ylim = c(0, 1),
        ylab = "Inclusion probability", xlab = "Coefficient")
dev.off()
### END TEST RJMCMC ##################################################################################
### GIBBS VARIABLE SELECTION #########################################################################
# 6) Gibbs variable selection with pseudopriors (Ntzoufras 2002)
gvs_data <- list(y = y, elev = elev,
                 hsi2 = hsi.ind[, 1], hsi3 = hsi.ind[, 2],
                 hsi4 = hsi.ind[, 3], hsi5 = hsi.ind[, 4],
                 site2 = site.ind[, 1], site3 = site.ind[, 2],
                 site4 = site.ind[, 3],
                 sess = sess.ind[, 1], slope = slope)
gvs_data$prior_prec <- 0.5                      # prior precision for included betas
gvs_data$pos <- c(1, 2, 2, 2, 2, 3, 4, 4, 4, 5) # coefficient-to-variable index
gvs_data$ncoefs <- 11                           # coefficients incl. intercept (intercept always included)
gvs_data$nvars <- max(gvs_data$pos, na.rm = TRUE) # number of variables under selection
gvs_data$nmodels <- 2^gvs_data$nvars            # size of the model space
# Pseudopriors from the frequentist fit: MLE means and precisions from the SEs
gvs_data$mu.beta.ps <- as.numeric(summary(fit1)$coefficients[2:nrow(summary(fit1)$coefficients), 1])
gvs_data$tau.beta.ps <- 1 / (as.numeric(summary(fit1)$coefficients[2:nrow(summary(fit1)$coefficients), 2])^2)
# GVS model: when z.temp[i] = 1 a coefficient gets its real prior
# (mean 0, precision prior_prec); when z.temp[i] = 0 it is drawn from the
# pseudoprior centred on the MLE, which keeps the chain mixing well.
gvs_code <- nimbleCode({
  # Priors
  beta[1] ~ T(dnorm(0, 0.5), -10, 10)
  for (i in 2:ncoefs) {
    beta[i] ~ T(dnorm(beta_mu_prior[i - 1], beta_tau_prior[i - 1]), -10, 10)
  }
  # Switch between the real prior and the pseudoprior via the indicator
  for (i in 1:(ncoefs - 1)) {
    beta_mu_prior[i] <- (1 - z.temp[i]) * mu.beta.ps[i]
    beta_tau_prior[i] <- (1 - z.temp[i]) * tau.beta.ps[i] + z.temp[i] * prior_prec
  }
  # Model selection: one flag per variable, expanded via pos
  for (i in 1:nvars) {
    z[i] ~ dbern(0.5) # prior probability for variable inclusion
  }
  for (i in 1:(ncoefs - 1)) {
    z.temp[i] <- z[pos[i]]
  }
  # Likelihood (note: original had a stray duplicated "+" here, removed)
  for (i in 1:N) {
    logit(p[i]) <- beta[1] + beta[2] * z.temp[1] * elev[i] +
      beta[3] * z.temp[2] * hsi2[i] + beta[4] * z.temp[3] * hsi3[i] +
      beta[5] * z.temp[4] * hsi4[i] + beta[6] * z.temp[5] * hsi5[i] +
      beta[7] * z.temp[6] * sess[i] + beta[8] * z.temp[7] * site2[i] +
      beta[9] * z.temp[8] * site3[i] + beta[10] * z.temp[9] * site4[i] +
      beta[11] * z.temp[10] * slope[i]
    y[i] ~ dbern(p[i])
  }
})
# Build, compile, and run the GVS model (standard samplers; no configureRJ —
# the z flags are sampled directly).
RJexampleModel <- nimbleModel(
  gvs_code,
  constants = list(N = length(gvs_data$y), pos = gvs_data$pos,
                   nvars = gvs_data$nvars, ncoefs = gvs_data$ncoefs,
                   prior_prec = gvs_data$prior_prec,
                   mu.beta.ps = gvs_data$mu.beta.ps,
                   tau.beta.ps = gvs_data$tau.beta.ps),
  data = gvs_data[1:11],
  inits = list(beta = as.numeric(summary(fit1)$coefficients[, 1]),
               z = rbinom(gvs_data$nvars, 1, 0.5))
)
RJexampleModel$calculate() # sanity check: finite log-probability at the inits
RJexampleConf <- configureMCMC(RJexampleModel, monitors = c("z.temp", "z", "beta"))
mcmcRJ <- buildMCMC(RJexampleConf)
cRJexampleModel <- compileNimble(RJexampleModel)
CMCMCRJ <- compileNimble(mcmcRJ, project = RJexampleModel, resetFunctions = TRUE)
set.seed(100)
system.time(samplesRJ <- runMCMC(CMCMCRJ, niter = 100000, nburnin = 10000, thin = 10))
# Posterior means of all monitored nodes
round(apply(samplesRJ, 2, mean), 3)
# Per-variable inclusion probabilities
zcols <- paste0("z[", 1:5, "]")
(d1 <- data.frame(var = c("elev", "hsi", "sess", "site", "slope"),
                  pr = round(as.numeric(colMeans(samplesRJ[, zcols])), 3)))
png("Indicator_GVS4.png", width = 8, height = 5, units = "in", res = 300)
barplot(pr ~ var, data = d1, ylim = c(0, 1),
        ylab = "Inclusion probability", xlab = "Coefficient")
dev.off()
# Per-coefficient inclusion proportions
round(as.numeric(colMeans(samplesRJ[, paste0("z.temp[", 1:10, "]")])), 3)
# Posterior model probabilities from visited z patterns
g <- samplesRJ[, zcols]
g2 <- apply(g, 1, paste, collapse = "")
sort(round(table(g2) / length(g2), 3), decreasing = TRUE)
# Compare posterior means of the betas against the MLEs
cbind(round(apply(samplesRJ, 2, mean), 3)[1:11], as.numeric(summary(fit1)$coefficients[, 1]))
Thanks,
Dan