data { int<lower=0> N; // total cases int<lower=0> p; //predictors int<lower=0> N_obs; //observed cases both dependent & independent variables int<lower=0> N_miss; //missed data in the independent variables but has data on dependent variable real y_obs[N_obs]; //dependent var with complete cases real y_miss[N_miss]; //dependent var with incomplete cases matrix[N_obs,p] x_obs; //the model matrix of complete cases}parameters { real<lower=0> sigma; //population standard dev vector[p] beta; //the regression parameters & intercept real x_mu; // mean of dist of x values real<lower=0> x_sigma; // sd of dist of x values matrix[N_miss,p] x_miss; //imputed matrix}
model { beta~ cauchy( 0 , 2.5 ); sigma ~ uniform( 0 , 100 ); x_mu ~ cauchy( 0 , 2.5 ); x_sigma ~ uniform( 0 , 100 );
//observed for (n in 1:N_obs) { for (k in 1:p) { x_obs[n,k] ~ normal( x_mu , x_sigma ); y_obs[n] ~ normal(beta[k]*x_obs[n,k], sigma); } }
//missing; impute for (n in 1:N_miss) { for (k in 1:p) { x_miss[n,k] ~ normal( x_mu , x_sigma ); y_miss[n] ~ normal(beta[k]*x_miss[n,k], sigma); } }}
data("mtcars")
testData= mtcars
#FactorstestData$cyl<- factor(testData$cyl)testData$gear<- factor(testData$gear)
#Dependent variabley=testData$mpg
#True coefficientstrueCoef=coef(lm(mpg~cyl+disp+gear, data=testData))
# indices with mssing valuesset.seed(1)cyl.miss=sample(1:nrow(testData), 7,replace = F)set.seed(2)disp.miss=sample(1:nrow(testData), 7,replace = F)set.seed(3)gear.miss=sample(1:nrow(testData), 7,replace = F)
#Create missing values in datatestData$cyl[cyl.miss]<- NA_integer_testData$disp[disp.miss]<- NA_real_testData$gear[gear.miss]<-NA_integer_
# dataset with missing values (Incomplete)incompleteData=testData[!complete.cases(testData),]
# complete datasetscomplecasesData=data.frame(na.omit(testData))
#model matrix of predictorsX=model.matrix(~cyl+disp+gear, data=complecasesData)
# data listdata_list <- list( N=length(y) ,y=y ,N_obs=nrow(complecasesData) ,N_miss=nrow(incompleteData) ,y_obs=complecasesData$mpg ,y_miss=incompleteData$mpg ,x_obs=X ,p=dim(X)[2])
rstan_options(auto_write = TRUE)options(mc.cores = parallel::detectCores())StanCoef<- stan( file = "impute missing.stan", data = data_list)
print(StanCoef, "beta")Inference for Stan model: impute missing.4 chains, each with iter=2000; warmup=1000; thin=1; post-warmup draws per chain=1000, total post-warmup draws=4000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhatbeta[1] 0.16 0 0.06 0.06 0.12 0.16 0.20 0.29 1511 1beta[2] 0.16 0 0.06 0.06 0.12 0.15 0.19 0.28 1944 1beta[3] 0.16 0 0.06 0.05 0.12 0.15 0.19 0.29 1645 1beta[4] 0.07 0 0.02 0.03 0.05 0.07 0.08 0.10 3394 1beta[5] 0.16 0 0.06 0.06 0.12 0.16 0.20 0.29 1772 1beta[6] 0.16 0 0.06 0.06 0.12 0.15 0.19 0.28 1813 1
Samples were drawn using NUTS(diag_e) at Mon Mar 27 17:49:32 2017.For each parameter, n_eff is a crude measure of effective sample size,and Rhat is the potential scale reduction factor on split chains (at convergence, Rhat=1).> trueCoef(Intercept) cyl6 cyl8 disp gear4 gear5 29.41094467 -4.79980682 -4.85587246 -0.02690655 0.03227781 0.31940301