I've used the gamlss package in R to see that the fit to my data is much better with a Poisson Inverse Gaussian (PIG) or Poisson Generalized Inverse Gaussian (Sichel) than with a negative binomial.
// Attempting to implement the Inverse Gaussian distribution in Stan.
// PDF: f(x; mu, shape) = (shape / (2*pi*x^3))^0.5
//                        * exp(-shape * (x - mu)^2 / (2 * x * mu^2))
// logPDF: log f(x; mu, shape)
//   = 0.5 * (log(shape) - log(2*pi) - 3*log(x) - shape * ((x - mu)/mu)^2 / x)
// (the original comment had 3*log(shape) here by mistake; the code below
// correctly uses 3*log(x[i]))
data {
  int<lower=1> N;
  real<lower=0> x[N];
}
transformed data {
  real denom;  // data-only part of the log-likelihood; calculate this constant only once
  denom <- 0.0;
  for (i in 1:N)
    denom <- denom + (log(2 * pi()) + 3 * log(x[i]));
}
parameters {
  real<lower=0> mu;
  real<lower=0> shape;
}
model {
  real summands[N];
  // put priors on mu and shape here if you want

  // log-likelihood
  increment_log_prob(0.5 * (N * log(shape) - denom));
  for (i in 1:N) {
    summands[i] <- -0.5 * shape * pow((x[i] - mu) / mu, 2) / x[i];
  }
  increment_log_prob(sum(summands));  // faster than incrementing inside the loop
}
# Simulate inverse-Gaussian data with gamlss and fit IG.stan to it.
library(rstan)
library(gamlss)

N <- 1000
mu <- 3
shape <- 2
# changing shape to sigma to match gamlss's parameterization
sigma <- sqrt(1 / shape)
x <- rIG(N, mu, sigma)

ig_test <- stan(
  file = "./IG.stan",
  data = list(N = N, x = x),
  verbose = FALSE,
  refresh = 1000
)
print(ig_test)
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
mu 3.05 0.00 0.12 2.84 2.97 3.05 3.12 3.29 2072 1
shape 2.10 0.00 0.10 1.91 2.03 2.10 2.16 2.29 2446 1
lp__ -2007.51 0.03 1.01 -2010.13 -2007.94 -2007.20 -2006.77 -2006.51 1264 1
// Attempting to implement Poisson inverse Gaussian in Stan.
// Poisson inverse Gaussian in Stan.
// NOTE: the original passed the IG *log-density of the count* x[i] straight
// into poisson_log() as the rate. A log-density is typically negative, hence
// the "Rate parameter is -3.57973, but must be >= 0!" rejections. A
// Poisson-IG mixture instead needs a latent rate per observation, drawn
// from the inverse Gaussian.
functions {
  // log density of the inverse Gaussian distribution
  real IG_log(real x, real mu, real shape) {
    return 0.5 * (log(shape) - log(2 * pi()) - 3 * log(x)
                  - shape * pow((x - mu) / mu, 2) / x);
  }
}
data {
  int<lower=1> N;
  int<lower=0> x[N];
}
parameters {
  real<lower=0> mu;
  real<lower=0> shape;
  vector<lower=0>[N] lambda;  // one latent Poisson rate per observation
}
model {
  // put priors on mu and shape here if you want
  for (n in 1:N)
    lambda[n] ~ IG(mu, shape);  // IG mixing distribution for the rates
  x ~ poisson(lambda);
}
# Simulate Poisson-inverse-Gaussian data with gamlss and fit PIG.stan to it.
library(rstan)
library(gamlss)

N <- 1000
mu <- 3
shape <- 2
# changing shape to sigma to match gamlss's parameterization
sigma <- sqrt(1 / shape)
x <- rPIG(N, mu, sigma)

pig_test <- stan(
  file = "./PIG.stan",
  data = list(N = N, x = x),
  verbose = FALSE,
  refresh = 1000
)
print(pig_test)
SAMPLING FOR MODEL 'PIG' NOW (CHAIN 1).
[1] "Rejecting initial value:"
[2] " Error evaluating the log probability at the initial value."
[3] "Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:"
[4] "Exception thrown at line 22: stan::math::poisson_log: Rate parameter is -3.57973, but must be >= 0!"
[5] "If this warning occurs sporadically, such as for highly constrained variable types like covariance matrices, then the sampler is fine,"
[6] "but if this warning occurs often then your model may be either severely ill-conditioned or misspecified."
[7] "Rejecting initial value:"
[8] " Error evaluating the log probability at the initial value."
[9] "Initialization between (-2, 2) failed after 100 attempts. "
[10] " Try specifying initial values, reducing ranges of constrained values, or reparameterizing the model."
Thanks Ben! From what I understand, you're right, I'm just looking for a Poisson mixed with the GIG distribution for the lambda parameter. I was able to implement the inverse Gaussian, but I don't have a lot of experience with Stan and I'm having trouble assigning the inverse Gaussian as a prior for lambda.
functions {
  // log density of the inverse Gaussian; ignoring the -0.5*log(2*pi) constant.
  // NOTE: the original dropped the 0.5 factor on the quadratic term when
  // distributing the leading 0.5; the exponent is -shape*(x-mu)^2/(2*x*mu^2).
  real IG_log(real x, real mu, real shape){
    return 0.5 * log(shape) - 1.5 * log(x)
           - 0.5 * shape * square((x - mu) / mu) / x;
  }
}
data {
  int<lower=1> N;
  int<lower=0> x[N];
}
parameters {
  real<lower=0> mu;
  real<lower=0> shape;
  // One latent rate per observation: a single shared lambda gives the IG
  // prior only one "observation", leaving mu and shape unidentified (which
  // is what produced the divergent fit: mu ~ 1e253, n_eff = 2).
  vector<lower=0>[N] lambda;
}
model {
  // put priors on mu and shape here if you want
  for (n in 1:N)
    lambda[n] ~ IG(mu, shape);
  x ~ poisson(lambda);
}
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
mu 2.372467e+253 Inf Inf 3.71 1.140895e+177 5.767031e+221 1.248539e+227 3.277302e+253 4000 NaN
shape 1.515148e+12 1.805725e+12 3.113701e+12 8.44 9.500000e+00 1.140000e+01 5.061459e+11 1.111952e+13 3 3.07
lambda 3.630000e+00 1.100000e-01 1.600000e-01 3.35 3.490000e+00 3.710000e+00 3.740000e+00 3.820000e+00 2 5.95
lp__ 4.908500e+02 1.433400e+02 2.032600e+02 147.10 4.221500e+02 5.859200e+02 6.287900e+02 6.895600e+02 2 34.27
bKd <- besselK(omega,gamma);
f(0; xi, omega, alpha) = pow(xi,0) * pow(omega / alpha, gamma+0) * besselK(alpha, gamma+0) / (tgamma(0+1) * bKd);
f(1; xi, omega, alpha) = pow(xi,1) * pow(omega / alpha, gamma+1) * besselK(alpha, gamma+1) / (tgamma(1+1) * bKd);
f(x; xi, omega, alpha) = (2 * xi * omega / pow(alpha, 2)) * ((gamma + x - 1) / x) * f[x-1] + pow(xi * omega / alpha, 2) * f[x-2] / (x * (x-1))
functions {
  // Modified Bessel function of the first kind, I_v(z), approximated by
  // the first 101 terms of its power series.
  real besselI(real z, real v){
    real summands[101];
    for (m in 0:100){
      if (m + v + 1 <= 0){
        // skip terms where tgamma would be evaluated at a non-positive argument
        summands[m + 1] <- 0;
      } else {
        summands[m + 1] <- ((z / 2)^(v + 2 * m)) / (tgamma(m + 1) * tgamma(m + v + 1));
      }
    }
    return sum(summands);
  }
  // Modified Bessel function of the second kind (sometimes referred to as
  // modified Bessel function of the third kind).
  real besselK(real z, real v){
    real v1;
    v1 <- v - 1E-12;  // nudge the order off integers, where the formula is singular
    return pi() / 2 * (besselI(z, -v1) - besselI(z, v1)) / sin(v1 * pi());
  }
  // log(pmf) of the Sichel distribution, using the recurrence formula from
  // Chesson and Lee 2005.
  vector log_sichel_prob(real omega, real xi, real gamma){
    // Only fitting data for patches 0:50
    vector[51] P;
    real alpha;
    real bKd;
    int nn;
    alpha <- sqrt(pow(omega, 2) + 2 * omega * xi);
    bKd <- besselK(omega, gamma);  // shared normalizing Bessel term
    P[1] <- pow(xi, 0) * pow(omega / alpha, gamma + 0) * besselK(alpha, gamma + 0)
            / (tgamma(0 + 1) * bKd);
    P[2] <- pow(xi, 1) * pow(omega / alpha, gamma + 1) * besselK(alpha, gamma + 1)
            / (tgamma(1 + 1) * bKd);
    // two-term recurrence for the remaining probabilities
    for (n in 3:51){
      nn <- n - 1;
      P[n] <- (2 * xi * omega / pow(alpha, 2)) * ((gamma + nn - 1) / nn) * P[n - 1]
              + pow(xi * omega / alpha, 2) * P[n - 2] / (nn * (nn - 1));
    }
    return log(P);
  }
}
data {
  int<lower=1> N;
  int<lower=0> y[N];
}
parameters {
  real<lower=0, upper=10> omega;
  real<lower=0, upper=10> xi;
  //real<lower=0, upper=10> gamma;
}
transformed parameters {
  real mu;
  real sig2;
  // the mean and variance below are only valid when gamma = -0.5
  mu <- xi;
  sig2 <- xi * (1 + xi / omega);
}
model {
  real fit[N];
  vector[51] logsichel;
  //logsichel <- log_sichel_prob(omega, xi, gamma);
  logsichel <- log_sichel_prob(omega, xi, -0.5);
  // look up the log-pmf value for each observed count (counts are 0-based)
  for (i in 1:N){
    fit[i] <- logsichel[y[i] + 1];
  }
  increment_log_prob(sum(fit));
}
4 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=4000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
omega 1.47 0.00 0.10 1.28 1.40 1.46 1.53 1.67 2254 1
xi 3.80 0.00 0.09 3.64 3.74 3.80 3.86 3.99 2330 1
mu 3.80 0.00 0.09 3.64 3.74 3.80 3.86 3.99 2330 1
sig2 13.73 0.02 0.97 12.01 13.03 13.69 14.36 15.82 2142 1
lp__ -3912.86 0.03 0.93 -3915.39 -3913.20 -3912.56 -3912.21 -3911.95 1162 1
Why use the - 1E-12 in the besselK? Should that just be an edge
case that gets detected?
Finally, it may be necessary to work on the log scale throughout,
so you'd save log_P as an array rather than P, and then do
log_sum_exp(log_P) rather than log(P) at the end.
functions {
  // Modified Bessel function of the first kind, I_v(z), approximated by
  // the first 101 terms of its power series.
  real besselI(real z, real v){
    real sums;
    // validate once, up front, instead of re-checking inside every loop iteration
    if (v <= -101) {
      reject("v must be > -101 in besselI(z, v)");
    }
    sums <- 0;
    for (m in 1:101) {
      // skip terms where tgamma(m + v) would be evaluated at a pole
      if (m + v > 0) {
        sums <- sums + pow(0.5 * z, v + 2 * (m - 1)) / ( tgamma(m) * tgamma(m+v) );
      }
    }
    return sums;
  }
  // Modified Bessel function of the second kind (sometimes referred to as
  // modified Bessel function of the third kind).
  real besselK(real z, real v){
    real v1;
    v1 <- v - 1E-12;  // nudge the order off integers, where the formula is singular
    return pi() / 2 * (besselI(z,-v1) - besselI(z,v1)) / sin(v1 * pi());
  }
  // log(pmf) of the Sichel distribution, using the recurrence formula from
  // Chesson and Lee 2005.
  vector log_sichel_prob(real omega, real xi, real gamma){
    // Only fitting data from patches 0:50
    vector[51] P;
    real alpha;
    real bKd;
    int nn;
    real omega_over_alpha;
    real xi_times_omega_over_alpha;
    real square_xi_times_omega_over_alpha;
    real two_times_xi_times_omega_over_square_alpha;
    // calculate constants once, outside the recurrence loop
    alpha <- sqrt(square(omega) + 2 * omega * xi);
    bKd <- besselK(omega, gamma);
    omega_over_alpha <- omega / alpha;
    xi_times_omega_over_alpha <- xi * omega_over_alpha;
    square_xi_times_omega_over_alpha <- square(xi_times_omega_over_alpha);
    two_times_xi_times_omega_over_square_alpha <- 2 * xi_times_omega_over_alpha / alpha;
    // calculate probabilities: direct formulas for P(0) and P(1), then recurrence
    P[1] <- 1 * pow(omega_over_alpha, gamma) * besselK(alpha, gamma)
            / (tgamma(1) * bKd);
    P[2] <- xi * pow(omega_over_alpha, gamma+1) * besselK(alpha, gamma+1)
            / (tgamma(2) * bKd);
    for(n in 3:51){
      nn <- n-1;
      P[n] <- two_times_xi_times_omega_over_square_alpha * ( (gamma + nn - 1) / nn ) * P[n-1]
              + square_xi_times_omega_over_alpha * P[n-2] / ( nn * (nn-1));
    }
    return log(P);
  }
}
data {
  int<lower=1> N;
  int<lower=0> y[N];
}
parameters {
  real<lower=0, upper=10> omega;
  real<lower=0, upper=10> xi;
  //real<lower=0, upper=10> gamma;
}
transformed parameters {
  real mu;
  real sig2;
  // the mean and variance below are only valid when gamma = -0.5
  mu <- xi;
  sig2 <- xi * (1 + xi / omega);
}
model {
  real fit[N];
  vector[51] logsichel;
  //logsichel <- log_sichel_prob(omega, xi, gamma);
  logsichel <- log_sichel_prob(omega, xi, -0.5);
  // look up the log-pmf value for each observed count (counts are 0-based)
  for (i in 1:N) {
    fit[i] <- logsichel[y[i] + 1];
  }
  increment_log_prob(sum(fit));
}
Old Code (average of 8.16 seconds per chain):
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
omega 1.35 0.00 0.06 1.23 1.30 1.35 1.39 1.48 1829 1xi 4.19 0.00 0.07 4.05 4.14 4.19 4.24 4.34 1813 1mu 4.19 0.00 0.07 4.05 4.14 4.19 4.24 4.34 1813 1sig2 17.22 0.02 0.92 15.54 16.60 17.18 17.83 19.15 1626 1lp__ -8101.35 0.03 1.02 -8104.08 -8101.76 -8101.04 -8100.63 -8100.35 1146 1
Updated Code (average of 7.61 seconds per chain):
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
omega 1.35 0.00 0.06 1.23 1.31 1.35 1.39 1.48 1864 1xi 4.19 0.00 0.07 4.05 4.14 4.19 4.24 4.34 1998 1mu 4.19 0.00 0.07 4.05 4.14 4.19 4.24 4.34 1998 1sig2 17.21 0.02 0.93 15.56 16.58 17.16 17.79 19.19 1604 1lp__ -8101.34 0.03 1.02 -8103.97 -8101.74 -8101.03 -8100.61 -8100.36 1163 1
I ended up writing modified Bessel functions of the first and second kind. They're probably not the most robust versions that could be written, but they produce output that is very similar to the output of R's modified Bessel functions, and they work well in the regions of parameter space that I'm working in.
// Poisson-inverse-Gaussian pmf P(Y = y | mu, tau), computed iteratively.
// NOTE: the original version recursed into both PIG(y-1, ...) and
// PIG(y-2, ...), so the number of calls grew exponentially in y. The same
// two-term recurrence evaluated forward from P(0) and P(1) is O(y).
real PIG(int y, real mu, real tau) {
  real denom;     // 1 + 2*tau*mu, shared by every term
  real p_prev2;   // P(n - 2)
  real p_prev1;   // P(n - 1)
  real p_cur;     // P(n)
  denom <- 1 + 2 * tau * mu;
  // P(0) = exp((1 - sqrt(1 + 2*tau*mu)) / tau)
  p_prev2 <- exp(1 / tau * (1 - denom^0.5));
  if (y == 0)
    return p_prev2;
  // P(1) = mu * (1 + 2*tau*mu)^-0.5 * P(0)
  p_prev1 <- mu * denom^-0.5 * p_prev2;
  if (y == 1)
    return p_prev1;
  p_cur <- 0;
  for (n in 2:y) {
    p_cur <- 2. * tau * mu / denom * (1. - 3. / (2. * n)) * p_prev1
             + mu^2 / denom / (n * (n - 1)) * p_prev2;
    p_prev2 <- p_prev1;
    p_prev1 <- p_cur;
  }
  return p_cur;
}