## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ---- echo=FALSE, fig.cap="Figure 1: Model Structure", out.width = '80%'------
knitr::include_graphics("images/ModelDiagram.png")

## ---- echo = TRUE, eval = TRUE,  fig.width = 7, fig.height = 4----------------

library(SAMBA)
library(MASS)
expit <- function(x) exp(x) / (1 + exp(x))
logit <- function(x) log(x / (1 - x))

nobs <- 5000

### Generate Predictors and Follow-up Information
set.seed(1234)
cov <- mvrnorm(n = nobs, mu = rep(0, 3), Sigma = rbind(c(1,   0, 0.4),
                                                       c(0,   1,   0),
                                                       c(0.4, 0,   1)))

data <- data.frame(Z = cov[, 1], X = cov[, 2], W = cov[, 3])

# Generate random uniforms
set.seed(5678)
U1 <- runif(nobs)
set.seed(4321)
U2 <- runif(nobs)
set.seed(8765)
U3 <- runif(nobs)

# Generate Disease Status
DISEASE <- expit(-2 + 0.5 * data$Z)
data$D   <- ifelse(DISEASE > U1, 1, 0)

# Relate W and D
data$W <- data$W + 1 * data$D

# Generate Misclassification
SENS <- expit(-0.4 + 1 * data$X)
SENS[data$D == 0] = 0
data$Dstar <- ifelse(SENS > U2, 1, 0)

# Generate Sampling Status
SELECT <- expit(-0.6 + 1 * data$D + 0.5 * data$W)
S  <- ifelse(SELECT > U3, T, F)

# Observed Data
data.samp <- data[S,]

# True marginal sampling ratio
prob1 <- expit(-0.6 + 1 * 1 + 0.5 * data$W)
prob0 <- expit(-0.6 + 1 * 0 + 0.5 * data$W)
r.marg.true <- mean(prob1[data$D == 1]) / mean(prob0[data$D == 0])

# True inverse probability of sampling weights
prob.WD <- expit(-0.6 + 1 * data.samp$D + 0.5 * data.samp$W)
weights <- nrow(data.samp) * (1  / prob.WD) / (sum(1 / prob.WD))

# True associations with D in population
trueX <- glm(D ~ X, binomial(), data = data)
trueZ <- glm(D ~ Z, binomial(), data = data)

# Initial Parameter Values
fitBeta  <- glm(Dstar ~ X, binomial(), data = data.samp)
fitTheta <- glm(Dstar ~ Z, binomial(), data = data.samp)

## ---- results='hide', message=F-----------------------------------------------
# Using marginal sampling ratio r and P(D=1)
sens1 <- sensitivity(data.samp$Dstar, data.samp$X, mean(data$D),
                      r = r.marg.true)

# Using inverse probability of selection weights and P(D=1)
sens2 <- sensitivity(data.samp$Dstar, data.samp$X, prev = mean(data$D),
                     weights = weights)

# Using marginal sampling ratio r and P(D=1|X)
prev  <- predict(trueX, newdata = data.samp, type = 'response')
sens3 <- sensitivity(data.samp$Dstar, data.samp$X, prev, r = r.marg.true)

# Using inverse probability of selection weights and P(D=1|X)
prev  <- predict(trueX, newdata = data.samp, type = 'response')
sens4 <- sensitivity(data.samp$Dstar, data.samp$X, prev, weights = weights)

## ---- results='hide'----------------------------------------------------------
# Approximation of D*|Z
approx1 <- approxdist(data.samp$Dstar, data.samp$Z, sens1$c_marg,
                      weights = weights)

# Non-logistic link function method
nonlog1 <- nonlogistic(data.samp$Dstar, data.samp$Z, c_X = sens3$c_X,
                       weights = weights)

# Direct observed data likelihood maximization without fixed intercept
start <- c(coef(fitTheta), logit(sens1$c_marg), coef(fitBeta)[2])
fit1 <- obsloglik(data.samp$Dstar, data.samp$Z, data.samp$X, start = start,
                 weights = weights)
obsloglik1 <- list(param = fit1$param, variance = diag(fit1$variance))

# Direct observed data likelihood maximization with fixed intercept
fit2   <- obsloglik(data.samp$Dstar, data.samp$Z, data.samp$X, start = start,
                 beta0_fixed = logit(sens1$c_marg), weights = weights)
obsloglik2 <- list(param = fit2$param, variance = diag(fit2$variance))

# Expectation-maximization algorithm without fixed intercept
fit3 <- obsloglikEM(data.samp$Dstar, data.samp$Z, data.samp$X, start = start,
                 weights = weights)
obsloglik3 <- list(param = fit3$param, variance = diag(fit3$variance))

# Expectation-maximization algorithm with fixed intercept
fit4 <- obsloglikEM(data.samp$Dstar, data.samp$Z, data.samp$X, start = start,
                  beta0_fixed = logit(sens1$c_marg), weights = weights)
obsloglik4 <- list(param = fit4$param, variance = diag(fit4$variance))

## ---- echo = FALSE, eval = TRUE,  fig.width = 5, fig.height= 5----------------
plot(sort(sens3$c_X), xlab = 'Patients', ylab = 'Sensitivity',
     main = 'Figure 2: Sensitivity Estimates', type = 'l', col = 'red', lwd = 2)
lines(sort(expit(obsloglik1$param[3] + obsloglik1$param[4]*data.samp$X)), col = 'blue', lwd = 2)
lines(sort(expit(obsloglik2$param[3] + obsloglik2$param[4]*data.samp$X)), col = 'green', lwd = 2)
abline(h=sens1$c_marg, col = 'purple', lwd = 2)
lines(sort(expit(-0.4 + 1*data.samp$X)), col = 'black', lwd = 2)
legend(x='topleft', fill = c('purple', 'red','blue', 'green', 'black'),
       legend = c('Estimated marginal sensitivity',
                  'Using non-logistic link method',
                  'Using obs. data log-lik',
                  'Using obs. data log-lik (fixed intercept)',
                  'Truth'), cex = 0.7)

## ---- echo = FALSE, eval = TRUE,  fig.width = 5, fig.height= 5,  results='hide', message=F----
rvals = c(1,1.5,2,2.5,5,10)
COL = c('red', 'orange', 'yellow', 'green', 'blue', 'purple')
true_prevs = predict(trueX, newdata = data.samp, type = 'response')
plot(sort(expit(-0.4 + 1*data.samp$X)), xlab = 'Patients', ylab = 'Sensitivity',
main = 'Figure 3: Estimated sensitivity across \n marginal sampling ratios',
type = 'l', col = 'black', lwd = 2, ylim = c(0,1))

for (i in 1:length(rvals)) {
  TEMP <- sensitivity(X = data.samp$X, Dstar = data.samp$Dstar,  r = rvals[i], prev = true_prevs)
  lines(sort(TEMP$c_X), col = COL[i])
}
legend(x='topleft', legend = c(rvals, 'Truth'), title = 'Sampling Ratio',
       fill = c(COL, 'black'), cex = 0.8)

## ---- echo = FALSE, eval = TRUE,  fig.width = 5, fig.height= 5----------------
plot(fit1$beta0_fixed, fit1$loglik.seq, xlab = 'Beta_0', ylab = 'Log-likelihood',
     main = 'Figure 4: Profile Log-Likelihood Values \n for Direct Maximization', pch = 16)
abline(v=-0.4, col = 'blue', lwd = 2)
points(logit(sens1$c_marg), fit2$loglik.seq, pch = 17, col = 'red')
legend(x='topright', legend = c('No fixed beta_0', 'Fixed beta_0'), col = c('black', 'red'), pch = c(16,17))
text(x=-0.1, y=mean(fit1$loglik.seq),label = 'True beta_0', srt = 90)

plot(fit3$loglik.seq[-c(1)], xlab = 'EM algorithm iteration', ylab = 'Log-likelihood',
     main = 'Figure 5: Log-Likelihood Values \n Across EM Iterations', pch = 16)
points(fit4$loglik.seq[-c(1)], pch = 17, col = 'red')
legend(x='bottomright', legend = c('No fixed beta_0', 'Fixed beta_0'), pch = c(16,17),
       col = c('black', 'red'))

## ---- echo = FALSE, eval = TRUE,  fig.width = 7, fig.height= 4, message=FALSE----
library(ggplot2)
library(scales)

## ---- echo = FALSE, eval = TRUE,  fig.width = 7, fig.height= 4----------------
METHODS = c('True',  'Uncorrected', 'Approx D*|Z + IPW','Non-logistic Link + IPW','Obs. log-lik + IPW', 'Fixed intercept obs. log-lik + IPW')
PARAM = c( coef(trueZ)[2], coef(fitTheta)[2],approx1$param,  nonlog1$param[2], obsloglik1$param[2], obsloglik2$param[2] )
VARIANCE = c(diag(summary(trueZ)$cov.scaled)[2],diag(summary(fitTheta)$cov.scaled)[2],
             approx1$variance,nonlog1$variance[2],obsloglik1$variance[2], obsloglik2$variance[2])
pd = position_dodge(width=0.6)
a <- ggplot(data = data.frame(METHODS = METHODS, PARAM = PARAM, VARIANCE = VARIANCE),
       aes(xmin= METHODS, xmax = METHODS, ymin = PARAM - 1.96*sqrt(VARIANCE), ymax =  PARAM + 1.96*sqrt(VARIANCE),
           col = METHODS,x = METHODS, y = PARAM)) +
  geom_point(position = position_dodge(.7), size = 2) +
  geom_linerange(position = position_dodge(.7), size = 1.2) +
  xlab('') + ylab('logOR')+ggtitle('Figure 6: Estimated Log-Odds Ratio Across Methods')+
  scale_x_discrete(limits=METHODS)+
  geom_hline(yintercept = PARAM[1], linetype = 1, color = 'black')+
  guides(fill=guide_legend(nrow=2,byrow=TRUE))+
  theme(legend.position="top",panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), legend.text=element_text(size=8), legend.title = element_blank(),
        axis.text.x=element_text(angle=20,hjust=1,vjust=1), text = element_text(size=12))
print(a)