## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, message = FALSE---------------------------------------------------
library(multibias)
library(dplyr)

## -----------------------------------------------------------------------------
nhanes <- read.csv("nhanes.csv")

## -----------------------------------------------------------------------------
nhanes_filtered <- nhanes |>
  mutate(alcohol_extreme = case_when(
    alcohol_day_total > 14 * 1.5 & gender_female == 1 ~ 1,
    alcohol_day_total > 28 * 1.5 & gender_female == 0 ~ 1,
    TRUE ~ 0
  )) |>
  filter(age >= 18) |>
  filter(eligstat == 1) |>
  filter(alcohol_12mo > 0)

## -----------------------------------------------------------------------------
exposure_outcome_table <- nhanes_filtered |>
  group_by(alcohol_extreme, mortstat) |>
  summarize(count = n()) |>
  ungroup() |>
  mutate(proportion = count / sum(count))

# Distribution of Exposure and Outcome:
print(exposure_outcome_table)

demographic_table <- nhanes_filtered |>
  group_by(alcohol_extreme) |>
  summarize(
    n = n(),
    age_mean = mean(age),
    age_sd = sd(age),
    gender_prop = mean(gender_female)
  ) |>
  mutate(
    alcohol_extreme = if_else(
      alcohol_extreme == 1, "Extreme", "Non-extreme"
    )
  )

# Demographic Characteristics by Exposure Status:
print(demographic_table)

## -----------------------------------------------------------------------------
base_mod <- glm(
  mortstat ~ alcohol_extreme + gender_female + age,
  data = nhanes_filtered,
  family = binomial(link = "logit")
)

base_results <- data.frame(
  term = c("Extreme Alcohol Use", "Female Gender", "Age"),
  estimate = round(exp(coef(base_mod)), 2)[-1],
  ci_lower = round(exp(confint(base_mod)), 2)[-1, 1],
  ci_upper = round(exp(confint(base_mod)), 2)[-1, 2]
)

# Base Model Results: Odds Ratios and 95% Confidence Intervals
print(base_results)

## -----------------------------------------------------------------------------
# Create observed data object with uncontrolled confounding bias
df_obs1 <- data_observed(
  data = nhanes_filtered,
  bias = "uc",
  exposure = "alcohol_extreme",
  outcome = "mortstat",
  confounders = c("age", "gender_female")
)

# Create validation data with smoking information
df_temp1 <- nhanes_filtered |>
  filter(!is.na(smoked_100cigs))

df_val1 <- data_validation(
  data = df_temp1,
  true_exposure = "alcohol_extreme",
  true_outcome = "mortstat",
  confounders = c("age", "gender_female", "smoked_100cigs")
)

# Perform bias adjustment
set.seed(1234)
uc_adjusted <- multibias_adjust(df_obs1, df_val1)

# Uncontrolled Confounding Adjusted Results:
print(uc_adjusted)

## -----------------------------------------------------------------------------
# Create observed data object with both biases
df_obs2 <- data_observed(
  data = nhanes_filtered,
  bias = c("em", "uc"),
  exposure = "alcohol_extreme",
  outcome = "mortstat",
  confounders = c("age", "gender_female")
)

# Create validation data with adjusted alcohol consumption
df_temp2 <- nhanes_filtered |>
  filter(!is.na(smoked_100cigs)) |>
  mutate(
    # Assume 50% higher consumption for high SES individuals
    alcohol_adj = if_else(
      income == "$100,000 and Over" | education == "College graduate or above",
      alcohol_day_total * 1.5,
      alcohol_day_total
    )
  ) |>
  mutate(alcohol_extreme_adj = case_when(
    alcohol_adj > 14 * 1.5 & gender_female == 1 ~ 1,
    alcohol_adj > 28 * 1.5 & gender_female == 0 ~ 1,
    TRUE ~ 0
  ))

df_val2 <- data_validation(
  data = df_temp2,
  true_exposure = "alcohol_extreme_adj",
  true_outcome = "mortstat",
  confounders = c("age", "gender_female", "smoked_100cigs"),
  misclassified_exposure = "alcohol_extreme"
)

# Perform bias adjustment
set.seed(1234)
uc_em_adjusted <- multibias_adjust(df_obs2, df_val2)

# Exposure Misclassification & Confounding Adjusted Results:
print(uc_em_adjusted)

## -----------------------------------------------------------------------------
# Create observed data object with all three biases
df_obs3 <- data_observed(
  data = nhanes_filtered,
  bias = c("em", "uc", "sel"),
  exposure = "alcohol_extreme",
  outcome = "mortstat",
  confounders = c("age", "gender_female")
)

# Prepare data for selection bias adjustment
df_temp3a <- nhanes_filtered |>
  filter(!is.na(smoked_100cigs)) |>
  mutate(
    alcohol_adj = if_else(
      income == "$100,000 and Over" | education == "College graduate or above",
      alcohol_day_total * 1.5,
      alcohol_day_total
    ),
    alcohol_extreme_adj = case_when(
      alcohol_adj > 14 * 1.5 & gender_female == 1 ~ 1,
      alcohol_adj > 28 * 1.5 & gender_female == 0 ~ 1,
      TRUE ~ 0
    ),
    # Combine survey weights
    weight = if_else(weight_day2 == 0, weight_day1, weight_day2)
  )

# Create selection indicator based on sampling weights
set.seed(1234)
selected_sample <- sample(
  x = df_temp3a$seqn,
  size = nrow(df_temp3a),
  replace = TRUE,
  prob = df_temp3a$weight
)

df_selected_sample <- data.frame()
for (id in selected_sample) {
  df_selected_sample <- rbind(
    df_selected_sample,
    df_temp3a[df_temp3a$seqn == id, ]
  )
}

df_selected_sample$selection <- 1
df_not_selected_sample <- df_temp3a[!(df_temp3a$seqn %in% selected_sample), ]
df_not_selected_sample$selection <- 0

df_temp3b <- rbind(df_selected_sample, df_not_selected_sample)

# Create validation data with selection information
df_val3 <- data_validation(
  data = df_temp3b,
  true_exposure = "alcohol_extreme_adj",
  true_outcome = "mortstat",
  confounders = c("age", "gender_female", "smoked_100cigs"),
  misclassified_exposure = "alcohol_extreme",
  selection = "selection"
)

# Perform final bias adjustment
set.seed(1234)
final_adjusted <- multibias_adjust(df_obs3, df_val3)

# Triple Bias Adjusted Results:
print(final_adjusted)

## -----------------------------------------------------------------------------
# Create comparison table
comparison_table <- data.frame(
  Model = c(
    "Base Model",
    "Adjusted - Single Bias",
    "Adjusted - Double Biases",
    "Adjusted - Triple Biases"
  ),
  OR = c(
    round(exp(coef(base_mod))["alcohol_extreme"], 2),
    round(uc_adjusted$estimate, 2),
    round(uc_em_adjusted$estimate, 2),
    round(final_adjusted$estimate, 2)
  ),
  CI_lower = c(
    round(exp(confint(base_mod))["alcohol_extreme", 1], 2),
    round(uc_adjusted$ci[1], 2),
    round(uc_em_adjusted$ci[1], 2),
    round(final_adjusted$ci[1], 2)
  ),
  CI_upper = c(
    round(exp(confint(base_mod))["alcohol_extreme", 2], 2),
    round(uc_adjusted$ci[2], 2),
    round(uc_em_adjusted$ci[2], 2),
    round(final_adjusted$ci[2], 2)
  )
)

# Comparison of Bias-Adjusted Estimates:
print(comparison_table)