data {
  int<lower=0> Nc; // Test data sample size (gold measure and misclassified measure are observed)
  int<lower=0> Ni; // Main data sample size (only misclassified measure is observed)
  int xc[Nc]; // gold measure in test data (0/1)
  int wc[Nc]; // misclassified measure in test data (0/1)
  int wi[Ni]; // misclassified measure in main data (0/1)
}
parameters {
  real<lower=0,upper=1> phi[2]; // misclassification probabilities: phi = [P(w = 1 | x = 0), P(w = 1 | x = 1)]
  real<lower=0,upper=1> q; // Probability of gold = 1 in main data, P(x = 1) <- quantity of interest
}
model {
  // Marginal P(w = 1) in the main data, with the unobserved x summed out.
  // Hoisted out of the Ni-loop: the expression is loop-invariant, so computing
  // it once avoids Ni redundant autodiff nodes.
  real p_w = phi[1] * (1 - q) + phi[2] * q;
  for (n in 1:Nc) { // for each case in test data:
    wc[n] ~ bernoulli(phi[xc[n] + 1]); // estimate misclassification probabilities
  }
  // Vectorized over all main-data cases; identical log density to the
  // per-element loop in the original.
  wi ~ bernoulli(p_w); // estimate q
  // Priors on q and/or phi possible, but not necessary
  // q ~ uniform(0, 1);
  // phi[1] ~ normal(.2, .05); // Priors for misclassification probabilities
  // phi[2] ~ normal(.8, .05); // Priors for misclassification probabilities
}
# Dependencies. NOTE: the original line fused two library() calls
# ("library(rstan)library(tidyverse)"), which is a syntax error in R;
# each call must be its own statement.
library(rstan)
library(tidyverse)
theme_set(theme_bw())

# Simulated data
# Sample sizes
Nc <- 200 # test data (gold and misclassified measures observed)
Ni <- 1e3 # main data (only misclassified measure observed)
# Misclassification
# Draw one misclassified value w for a single true value x.
# `misclassification_matrix` has columns = true categories and rows =
# observed categories (dimnames "0"/"1"): column x is the distribution of
# the observed value given truth x. Returns an integer (0 or 1 here).
misclass <- function(x, misclassification_matrix) {
  x <- as.character(x) # select the column by label, not position
  # Sample an observed label from the ROW names (the observed-category axis);
  # the original used colnames, which only works because the matrix is square
  # with identical dimnames on both axes.
  as.integer(sample(rownames(misclassification_matrix),
                    size = 1,
                    prob = misclassification_matrix[, x]))
}
set.seed(1) # reproducible simulation (the original was unseeded)

mcm <- matrix(c(.7, .3, .1, .9), nrow = 2) # misclassification matrix (phi is in mcm[2, ])
dimnames(mcm) <- list(0:1, 0:1) # rows = observed w, cols = true x
mcm

xc <- rbinom(Nc, 1, .5) # gold measure in test data; distribution != main data
# vapply instead of sapply: type-stable, always returns integer(Nc)
wc <- vapply(xc, misclass, integer(1), misclassification_matrix = mcm) # misclassified measure in test data
xi <- rbinom(Ni, 1, .2) # gold measure in main data, not observed
wi <- vapply(xi, misclass, integer(1), misclassification_matrix = mcm) # misclassified measure in main data

# NOTE(review): `stan_model` is never assigned in this script -- it must hold
# the Stan program above as a character string before this call. It also
# shadows rstan::stan_model(); consider a name like `stan_code`.
fm_stan <- stan(
  model_code = stan_model,
  data = list(Nc = Nc, Ni = Ni, xc = xc, wc = wc, wi = wi),
  control = list(adapt_delta = 0.95)
)
> To unsubscribe from this group and stop receiving emails from it, send an email to stan-users+unsubscribe@googlegroups.com.
> To post to this group, send email to stan-...@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.
--
You received this message because you are subscribed to a topic in the Google Groups "Stan users mailing list" group.
To unsubscribe from this topic, visit https://groups.google.com/d/topic/stan-users/G1nPD2TbZ08/unsubscribe.
To unsubscribe from this group and all its topics, send an email to stan-users+unsubscribe@googlegroups.com.
data {
  // test & study
  int<lower=2> K; // number of categories
  int<lower=1> J; // number of annotators
  vector<lower=0>[K] beta[K]; // dirichlet prior on misclassification matrix theta
  // test
  int<lower=1> I_test; // number of coding units in test data
  int<lower=1> N_test; // number of codings in test data
  // Data set in long format with indices for annotator j and coding unit i;
  // allows for unbalanced number of annotations per coding unit
  int<lower=1,upper=K> y_test[N_test]; // codings in test data
  int<lower=1,upper=I_test> ii_test[N_test]; // index coding units in test data
  int<lower=1,upper=J> jj_test[N_test]; // index annotators in test data
  vector<lower=0>[K] alpha_test; // dirichlet hyperprior on category prevalence in test data
  // study; data for the main study where only one annotator annotates each
  // coding unit; format is the same as above
  int<lower=1> N_study; // number of codings in study data
  int<lower=1,upper=N_study> ii_study[N_study]; // index coding units in study data
  int<lower=1,upper=J> jj_study[N_study]; // index coder in study data
  vector<lower=0>[K] alpha_study; // dirichlet prior on category prevalence in study data
  int<lower=1,upper=K> y_study[N_study]; // codings in study data
}
parameters {
  simplex[K] pi_test; // category prevalence in test data
  simplex[K] theta[J, K]; // misclassification matrix per annotator
  simplex[K] pi_study_obs; // category proportions in study data without correction; estimated for comparison
  simplex[K] pi_study_cor; // category proportions in study data corrected by misclassification theta
}
transformed parameters {
  // Unnormalized log posterior over the latent true category of each unit,
  // built up by adding each coding's log-likelihood contribution.
  vector[K] log_q_z[I_test];
  vector[K] log_q_z2[N_study]; // for study data
  for (i in 1:I_test)
    log_q_z[i] = log(pi_test);
  for (n in 1:N_test)
    for (k in 1:K)
      log_q_z[ii_test[n], k] = log_q_z[ii_test[n], k]
                               + log(theta[jj_test[n], k, y_test[n]]);
  for (n in 1:N_study) // for study data; theta as in test data above
    log_q_z2[n] = log(pi_study_cor);
  for (n in 1:N_study)
    for (k in 1:K)
      log_q_z2[ii_study[n], k] = log_q_z2[ii_study[n], k]
                                 + log(theta[jj_study[n], k, y_study[n]]);
}
model {
  pi_test ~ dirichlet(alpha_test);
  pi_study_cor ~ dirichlet(alpha_study);
  for (j in 1:J)
    for (k in 1:K)
      theta[j, k] ~ dirichlet(beta[k]);
  // Marginalize out each unit's latent category (log-sum-exp trick)
  for (i in 1:I_test)
    target += log_sum_exp(log_q_z[i]);
  for (n in 1:N_study)
    target += log_sum_exp(log_q_z2[n]);
  y_study ~ categorical(pi_study_obs); // uncorrected comparison estimate
}
generated quantities {
  // Apply softmax to generate probabilities from log_q_z(2)
  vector[K] softmax_lqz[I_test]; // probability of each k for each coding unit in test data
  vector[K] softmax_lqz2[N_study]; // probability of each k for each coding unit in study data
  for (i in 1:I_test)
    softmax_lqz[i] = softmax(log_q_z[i, ]);
  for (n in 1:N_study)
    softmax_lqz2[n] = softmax(log_q_z2[n, ]);
}