# Author: Kyle Foreman
# kforeman@post.harvard.edu
# Date: 27 Nov 2013
# Purpose: demonstrate sparse MVN methods for Stan
# Notes: see original Scotland Lip Cancer example at
#        http://www.openbugs.info/Manuals/GeoBUGS/Examples/Scotland.html

# libraries
library(rstan)
set_cppo('fast')

# load data
load('scotland_lip_cancer.RData')
# N = number of observations
# O = observed lip cancer cases
# E = expected lip cancer cases
# x = covariate (percent pop involved in agriculture)
# A = adjacency matrix
# D = diagonal matrix of neighbor counts, constructed from A
D <- diag(rowSums(A))

# how many samples to draw?
n_samples <- 1e3

# model CAR using multi_normal_prec()
mvn_prec_code <- "
    data {
        int N;                      # number of regions
        int O[N];                   # observed cases
        vector[N] E;                # expected cases
        vector[N] x;                # covariate
        matrix[N,N] A;              # adjacency matrix
        matrix[N,N] D;              # diagonal matrix (sum_cij)
    }
    transformed data {
        # zeros for MVN prior
        vector[N] zeros;
        zeros <- rep_vector(0.0, N);
    }
    parameters {
        real alpha0;                # intercept
        real alpha1;                # coefficient on covariate
        vector[N] beta1;            # area-specific random effect (CAR)
        real<lower=0> tau;          # precision of CAR
        real<lower=0,upper=1> p;    # strength of spatial correlation
    }
    model {
        # tmp variables
        vector[N] beta1_stz;        # beta1 with sum-to-zero constraint
        vector[N] log_mu;           # prediction of log(mu)
        matrix[N,N] Tau;            # precision matrix for CAR

        # priors on model parameters
        # no explicit prior on alpha0 (improper flat prior)
        alpha1 ~ normal(0.0, sqrt(1e5));
        tau ~ gamma(0.5, 0.0005);

        # precision matrix for CAR
        Tau <- tau * (D - p * A);

        # CAR model using precision version of MVN
        beta1 ~ multi_normal_prec(zeros, Tau);

        # impose sum-to-zero constraint
        beta1_stz <- beta1 - mean(beta1);

        # calculate prediction
        log_mu <- log(E) + alpha0 + (alpha1 * x) + beta1_stz;

        # likelihood
        O ~ poisson_log(log_mu);
    }
"
mvn_prec_data <- list(N=N, O=O, E=E, x=x, A=A, D=D)
mvn_prec_init <- list(
    list(
        alpha0 = 0.0,
        alpha1 = 0.0,
        beta1  = rep(0, N),
        tau    = 1.0,
        p      = 0.99
    )
)
mvn_prec_fit <- stan(
    model_code = mvn_prec_code,
    data       = mvn_prec_data,
    init       = mvn_prec_init,
    chains     = 1,
    iter       = n_samples
)

# create sparse versions of A & D
# D is just its diagonal
D_sparse <- diag(D)
# A becomes two vectors listing the non-zero (adjacent) pairs
A_sparse <- which(A == 1, arr.ind=TRUE)
A_sparse <- A_sparse[A_sparse[,1] < A_sparse[,2],]    # keep each pair once (A is symmetric)
A_N <- dim(A_sparse)[1]
A1 <- A_sparse[,1]
A2 <- A_sparse[,2]

# so, about that determinant...
# the multivariate normal likelihood requires the determinant of Tau, where
# Tau = tau * (D - p * A), so det(Tau) = tau^N * det(D - p * A)
# the det(D - p * A) factor is quite slow to calculate
# there are sparse matrix algorithms that could work, but they're a bit cumbersome
# since the determinant is a relatively small contributor to the likelihood,
# we can live with an approximation: evaluate it on a grid of values of p
# and linearly interpolate the rest
# det(D - p * A) is a function of p alone, since D & A are constants,
# and p is constrained to [0,1], so a grid of 1000 values covers it
p_samples <- 1e3
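# quick sanity check (illustrative addition, not part of the original model
# code): confirm that log(det(D - p * A)) is smooth enough in p for linear
# interpolation on this grid to be accurate; note that at p = 1 the matrix
# D - A is the graph Laplacian, which is singular, so we only test on [0, 0.99]
ldet_exact <- function(p_val) as.numeric(determinant(D - p_val * A, logarithm=TRUE)$modulus)
p_grid    <- seq(0, 1, length.out=p_samples + 1)
ldet_grid <- sapply(p_grid, ldet_exact)
p_check   <- seq(0.005, 0.99, length.out=20)
max(abs(approx(p_grid, ldet_grid, xout=p_check)$y - sapply(p_check, ldet_exact)))
# the maximum absolute error should be negligible relative to the log-likelihood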
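# a second check (also illustrative, not part of the original model code): the
# sparse model below computes beta1' * (D - p * A) * beta1 using only D_sparse,
# A1, and A2; verify in R that this matches the dense quadratic form for an
# arbitrary test vector and value of p
beta_check <- rnorm(N)
p_val      <- 0.5
dense_qf   <- as.numeric(t(beta_check) %*% (D - p_val * A) %*% beta_check)
bD <- sum(beta_check^2 * D_sparse)                # beta' * D * beta
bA <- 2 * sum(beta_check[A1] * beta_check[A2])    # beta' * A * beta (each pair stored once)
all.equal(dense_qf, bD - p_val * bA)              # should be TRUE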
# code for the sparse version
sparse_code <- "
    data {
        int N;                      # number of regions
        int O[N];                   # observed cases
        vector[N] E;                # expected cases
        vector[N] x;                # covariate
        int A_N;                    # number of adjacent region pairs
        int A1[A_N];                # first half of adjacency pairs
        int A2[A_N];                # second half of adjacency pairs
        vector[N] D_sparse;         # diagonal (sum_cij)
        int p_samples;              # number of grid points for the determinant
    }
    transformed data {
        # calculate log(determinant(D - p * A)) at a grid of values of p
        vector[p_samples + 1] ldet_DpA;
        {
            matrix[N,N] DpA;        # D - p * A (i.e. Tau / tau)
            DpA <- diag_matrix(D_sparse);
            # fill in the off-diagonal of D - p * A for each value of p;
            # each adjacency pair is stored once, so set both symmetric entries
            for (i in 1:(p_samples + 1)) {
                for (a in 1:A_N) {
                    DpA[A1[a], A2[a]] <- -1.0 * ((i - 1.0) / p_samples);
                    DpA[A2[a], A1[a]] <- -1.0 * ((i - 1.0) / p_samples);
                }
                # store the sampled log(determinant(D - p * A))
                ldet_DpA[i] <- log_determinant(DpA);
            }
        }
    }
    parameters {
        real alpha0;                # intercept
        real alpha1;                # coefficient on covariate
        vector[N] beta1;            # area-specific random effect (CAR)
        real<lower=0> tau;          # precision of CAR
        real<lower=0,upper=1> p;    # strength of spatial correlation
    }
    model {
        # tmp variables for MVN likelihood
        row_vector[N] beta1t_D;     # beta1' * D
        row_vector[N] beta1t_A;     # beta1' * A
        real sigma_sq;              # 1 / tau
        # other tmp variables
        vector[N] beta1_stz;        # beta1 with sum-to-zero constraint
        vector[N] log_mu;           # prediction of log(mu)

        # priors on model parameters
        # no explicit prior on alpha0 (improper flat prior)
        alpha1 ~ normal(0.0, sqrt(1e5));
        tau ~ gamma(0.5, 0.0005);

        # CAR model using sparse MVN
        # (beta1' * Tau * beta1) equals tau * ((beta1' * D * beta1) - p * (beta1' * A * beta1)),
        # and each of those terms can exploit the sparse representation

        # sigma_sq
        sigma_sq <- 1 / tau;

        # find beta1' * D
        beta1t_D <- (beta1 .* D_sparse)';

        # find beta1' * A
        beta1t_A <- rep_vector(0.0, N)';    # initialize vector
        for (i in 1:A_N) {
            beta1t_A[A1[i]] <- beta1t_A[A1[i]] + beta1[A2[i]];
            beta1t_A[A2[i]] <- beta1t_A[A2[i]] + beta1[A1[i]];
        }

        # incorporate log probability of MVN(0.0, Tau)
        increment_log_prob(-0.5 / sigma_sq * (beta1t_D * beta1 - p * (beta1t_A * beta1)));
        increment_log_prob(-0.5 * N * log(sigma_sq));
        # hacky interpolated log(determinant(D - p * A))
        {
            int idx;
            real ldet_left;
            real ldet_right;
            idx <- 0;
            while (floor(p * p_samples) >= idx) {
                idx <- idx + 1;
            }
            ldet_left <- ldet_DpA[idx];
            ldet_right <- ldet_DpA[idx + 1];
            increment_log_prob(0.5 * (ldet_left + (ldet_right - ldet_left) * (p * p_samples - floor(p * p_samples))));
        }

        # impose sum-to-zero constraint
        beta1_stz <- beta1 - mean(beta1);

        # calculate prediction
        log_mu <- log(E) + alpha0 + (alpha1 * x) + beta1_stz;

        # data likelihood
        O ~ poisson_log(log_mu);
    }
"
sparse_data <- list(N=N, O=O, E=E, x=x, A_N=A_N, A1=A1, A2=A2,
                    D_sparse=D_sparse, p_samples=p_samples)
sparse_init <- mvn_prec_init
sparse_fit <- stan(
    model_code = sparse_code,
    data       = sparse_data,
    init       = sparse_init,
    chains     = 1,
    iter       = n_samples
)

# compare outputs to ensure we're getting the same result
print(sparse_fit)
print(mvn_prec_fit)
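# (illustrative extension, not in the original script: compare the posterior
#  means of the shared parameters side by side instead of eyeballing the two
#  full print summaries; uses rstan's summary() method for stanfit objects)
pars <- c('alpha0', 'alpha1', 'tau', 'p')
dense_means  <- summary(mvn_prec_fit, pars=pars)$summary[, 'mean']
sparse_means <- summary(sparse_fit, pars=pars)$summary[, 'mean']
round(cbind(dense=dense_means, sparse=sparse_means,
            diff=dense_means - sparse_means), 3)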