## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, message=FALSE, warning=FALSE, echo=FALSE--------------------------
library(dplyr) # For data manipulation
library(survey) # For complex survey analysis
library(srvyr) # For complex survey analysis with dplyr syntax
library(svrep)

## -----------------------------------------------------------------------------
data('library_multistage_sample', package = 'svrep')

# Load first-phase sample
  twophase_sample <- library_multistage_sample

# Select second-phase sample
  set.seed(2020)
  
  twophase_sample[['SECOND_PHASE_SELECTION']] <- sampling::srswor(
    n = 100,
    N = nrow(twophase_sample)
  ) |> as.logical()

## -----------------------------------------------------------------------------
# Declare survey design
  twophase_design <- twophase(
    method = "full",
    data = twophase_sample,
    # Identify the subset of first-phase elements
    # which were selected into the second-phase sample
    subset = ~ SECOND_PHASE_SELECTION,
    # Describe clusters, probabilities, and population sizes
    # at each phase of sampling
    id = list(~ PSU_ID + SSU_ID,
              ~ 1),
    probs = list(~ PSU_SAMPLING_PROB + SSU_SAMPLING_PROB,
                 NULL),
    fpc = list(~ PSU_POP_SIZE + SSU_POP_SIZE,
               NULL)
  )

## ----warning=FALSE------------------------------------------------------------
# Obtain a generalized bootstrap replicates
# based on 
#   - The phase 1 estimator is the usual variance estimator
#     for stratified multistage simple random sampling
#   - The phase 2 estimator is the usual variance estimator
#     for single-stage simple random sampling

twophase_boot_design <- as_gen_boot_design(
  design = twophase_design,
  variance_estimator = list(
    "Phase 1" = "Stratified Multistage SRS",
    "Phase 2" = "Ultimate Cluster"
  ),
  replicates = 1000
)

## -----------------------------------------------------------------------------
twophase_boot_design |> svymean(x = ~ LIBRARIA, na.rm = TRUE)

## -----------------------------------------------------------------------------
twophase_boot_design <- as_gen_boot_design(
  design = twophase_design,
  variance_estimator = list(
    "Phase 1" = "Stratified Multistage SRS",
    "Phase 2" = "Ultimate Cluster"
  )
)

## -----------------------------------------------------------------------------
twophase_genrep_design <- as_fays_gen_rep_design(
  design = twophase_design,
  variance_estimator = list(
    "Phase 1" = "Stratified Multistage SRS",
    "Phase 2" = "Ultimate Cluster"
  ),
  max_replicates = 500
)

## -----------------------------------------------------------------------------
# Impute missing values (if necessary)
twophase_sample <- twophase_sample |>
  mutate(
    TOTCIR = ifelse(
      is.na(TOTCIR),
      stats::weighted.mean(TOTCIR, na.rm = TRUE,
                           w = 1/SAMPLING_PROB),
      TOTCIR
    ),
    TOTSTAFF = ifelse(
      is.na(TOTSTAFF),
      stats::weighted.mean(TOTSTAFF, na.rm = TRUE,
                           w = 1/SAMPLING_PROB),
      TOTSTAFF
    )
  )

## ----warning=FALSE------------------------------------------------------------
# Describe the two-phase survey design
  twophase_design <- twophase(
    method = "full",
    data = twophase_sample,
    # Identify the subset of first-phase elements
    # which were selected into the second-phase sample
    subset = ~ SECOND_PHASE_SELECTION,
    # Describe clusters, probabilities, and population sizes
    # at each phase of sampling
    id = list(~ PSU_ID + SSU_ID,
              ~ 1),
    probs = list(~ PSU_SAMPLING_PROB + SSU_SAMPLING_PROB,
                 NULL),
    fpc = list(~ PSU_POP_SIZE + SSU_POP_SIZE,
               NULL)
  )

# Create replicate weights for the second-phase sample
# (meant to reflect variance of the entire two-phase design)
  twophase_boot_design <- as_gen_boot_design(
    design = twophase_design,
    variance_estimator = list(
      "Phase 1" = "Stratified Multistage SRS",
      "Phase 2" = "Ultimate Cluster"
    ),
    replicates = 1000,
    mse = TRUE
  )

## -----------------------------------------------------------------------------
# Extract a survey design object representing the first phase sample
  first_phase_design <- twophase_design$phase1$full

# Create replicate weights for the first-phase sample
  first_phase_gen_boot <- as_gen_boot_design(
    design = first_phase_design,
    variance_estimator = "Stratified Multistage SRS",
    replicates = 1000
  )
  
# Estimate first-phase totals and their sampling-covariance
  first_phase_estimates <- svytotal(
    x = ~ TOTCIR + TOTSTAFF,
    design = first_phase_gen_boot
  )
  
  first_phase_totals <- coef(first_phase_estimates)
  first_phase_vcov <- vcov(first_phase_estimates)
  
  print(first_phase_totals)
  print(first_phase_vcov)

## -----------------------------------------------------------------------------
calibrated_twophase_design <- calibrate_to_estimate(
  rep_design = twophase_boot_design,
  # Specify the variables in the data to use for calibration
  cal_formula = ~ TOTCIR + TOTSTAFF,
  # Supply the first-phase estimates and their variance
  estimate = first_phase_totals,
  vcov_estimate = first_phase_vcov,
)

## -----------------------------------------------------------------------------
# Display second-phase estimates for calibration variables
svytotal(
  x = ~ TOTCIR + TOTSTAFF,
  design = calibrated_twophase_design
)

# Display the original first-phase estimates (which are identical!)
print(first_phase_estimates)

## -----------------------------------------------------------------------------
# Inspect calibrated second-phase estimate
svytotal(
  x = ~ LIBRARIA, na.rm = TRUE,
  design = calibrated_twophase_design
)

# Compare to uncalibrated second-phase estimate
svytotal(
  x = ~ LIBRARIA, na.rm = TRUE,
  design = twophase_boot_design
)

# Compare to first-phase estimate
svytotal(
  x = ~ LIBRARIA, na.rm = TRUE,
  design = first_phase_gen_boot
)

## -----------------------------------------------------------------------------
# Extract a survey design object representing the first phase sample
  first_phase_design <- twophase_design$phase1$full

# Create replicate weights for the first-phase sample
  first_phase_gen_boot <- as_gen_boot_design(
    design = first_phase_design,
    variance_estimator = "Stratified Multistage SRS",
    replicates = 1000
  )

## -----------------------------------------------------------------------------
calibrated_twophase_design <- calibrate_to_sample(
  primary_rep_design = twophase_boot_design,
  # Supply the first-phase replicate design
  control_rep_design = first_phase_gen_boot,
  # Specify the variables in the data to use for calibration
  cal_formula = ~ TOTCIR + TOTSTAFF
)

## -----------------------------------------------------------------------------
# Display second-phase estimates for calibration variables
calibrated_ests <- svytotal(
  x = ~ TOTCIR + TOTSTAFF,
  design = calibrated_twophase_design
)

print(calibrated_ests)

# Display the original first-phase estimates (which are identical!)
first_phase_ests <- svytotal(
  x = ~ TOTCIR + TOTSTAFF,
  design = first_phase_gen_boot
)

print(first_phase_ests)

## -----------------------------------------------------------------------------
ratio_of_variances <- vcov(calibrated_ests)/vcov(first_phase_ests)
ratio_of_variances

## -----------------------------------------------------------------------------
# Inspect calibrated second-phase estimate
svytotal(
  x = ~ LIBRARIA, na.rm = TRUE,
  design = calibrated_twophase_design
)

# Compare to uncalibrated second-phase estimate
svytotal(
  x = ~ LIBRARIA, na.rm = TRUE,
  design = twophase_boot_design
)

# Compare to first-phase estimate
svytotal(
  x = ~ LIBRARIA, na.rm = TRUE,
  design = first_phase_gen_boot
)

## -----------------------------------------------------------------------------
ratio_calib_design <- calibrate_to_sample(
  primary_rep_design = twophase_boot_design,
  # Supply the first-phase replicate design
  control_rep_design = first_phase_gen_boot,
  # Specify the GREG formula.
  # For ratio estimation, we add `-1` to the formula 
  # (i.e., we remove the intercept from the working model)
  # and specify only a single variable
  cal_formula = ~ -1 + TOTSTAFF,
  variance = 1
)

## -----------------------------------------------------------------------------
ratio_adjusted_weights <- weights(ratio_calib_design, type = "sampling")
unadjusted_weights <- weights(twophase_boot_design, type = "sampling")

adjustment_factors <- ratio_adjusted_weights/unadjusted_weights
head(adjustment_factors)

## -----------------------------------------------------------------------------
phase1_total <- svytotal(
  x = ~ TOTSTAFF,
  first_phase_design
) |> coef()
phase2_total <- svytotal(
  x = ~ TOTSTAFF,
  twophase_boot_design
) |> coef()

phase1_total/phase2_total

## -----------------------------------------------------------------------------
set.seed(2022)
y <- rnorm(n = 100)

# Select first phase sample, SRS without replacement
  phase_1_sample_indicators <- sampling::srswor(n = 50, N = 100) |>
    as.logical()
  
  phase_1_sample <- y[phase_1_sample_indicators]
  
# Make variance estimator for first-phase variance component
  Sigma_a <- make_quad_form_matrix(
    variance_estimator = "Ultimate Cluster",
    cluster_ids = as.matrix(1:50),
    strata_ids = rep(1, times = 50) |> as.matrix(),
    strata_pop_sizes = rep(100, times = 50) |> as.matrix()
  )

# Select second stage sample, SRS without replacment
  phase_2_sample_indicators <- sampling::srswor(n = 5, N = 50) |>
    as.logical()
  
  phase_2_sample <- phase_1_sample[phase_2_sample_indicators]
  
# Estimate two-phase variance
  Sigma_a_prime <- Sigma_a[phase_2_sample_indicators,
                           phase_2_sample_indicators]
  
  phase_2_joint_probs <- outer(rep(5/50, times = 5),
                               rep(4/49, times = 5))
  diag(phase_2_joint_probs) <- rep(5/50, times = 5)

  Sigma_b <- make_quad_form_matrix(
    variance_estimator = "Ultimate Cluster",
    cluster_ids = as.matrix(1:5),
    strata_ids = rep(1, times = 5) |> as.matrix(),
    strata_pop_sizes = rep(50, times = 5) |> as.matrix()
  )
  
  sigma_ab <- make_twophase_quad_form(
    sigma_1 = Sigma_a_prime,
    sigma_2 = Sigma_b,
    phase_2_joint_probs = phase_2_joint_probs
  )
  
  wts <- rep(
    (50/100)^(-1) * (5/50)^(-1),
    times = 5
  )
  W_star <- diag(wts)
  
  W_star_y <- W_star %*% phase_2_sample
  t(W_star_y) %*% sigma_ab %*% (W_star_y)
  
# Since both phases are SRS without replacement,
# variance estimate for a total should be similar to the following
  5 * var(W_star_y)


## -----------------------------------------------------------------------------
# Print first phase estimates and their variance-covariance
print(first_phase_totals)
print(first_phase_vcov)

# Calibrate the two-phase replicate design
# to the totals estimated from the first-phase sample
calibrated_twophase_design <- calibrate_to_estimate(
  rep_design = twophase_boot_design,
  # Specify the variables in the data to use for calibration
  cal_formula = ~ TOTCIR + TOTSTAFF,
  # Supply the first-phase estimates and their variance
  estimate = first_phase_totals,
  vcov_estimate = first_phase_vcov,
)

## -----------------------------------------------------------------------------
calibrated_twophase_design <- calibrate_to_sample(
  primary_rep_design = twophase_boot_design,
  # Supply the first-phase replicate design
  control_rep_design = first_phase_gen_boot,
  # Specify the variables in the data to use for calibration
  cal_formula = ~ TOTCIR + TOTSTAFF
)

## -----------------------------------------------------------------------------
gen_boot_design <- as_gen_boot_design(
  design = twophase_design,
  variance_estimator = list(
    'Phase 1' = "Ultimate Cluster",
    'Phase 2' = "Ultimate Cluster"
  )
)

## -----------------------------------------------------------------------------
twophase_quad_form_matrix <- get_design_quad_form(
  design = twophase_design,
  variance_estimator = list(
    'Phase 1' = "Ultimate Cluster",
    'Phase 2' = "Ultimate Cluster"
  )
)

twophase_quad_form_matrix |> is_psd_matrix()

## -----------------------------------------------------------------------------
approx_quad_form <- get_nearest_psd_matrix(twophase_quad_form_matrix)

## -----------------------------------------------------------------------------
# Extract weights and a single variable from the second-phase sample
## NOTE: To get second-phase data,
##       we use `my_design$phase1$sample$variables`.
##       To get first-phase data,
##       we use `my_design$phase1$full$variables

wts <- weights(twophase_design, type = "sampling")
y <- twophase_design$phase1$sample$variables$TOTSTAFF
wtd_y <- as.matrix(wts * y)

# Estimate standard errors
std_error <- as.numeric(
  t(wtd_y) %*% twophase_quad_form_matrix %*% wtd_y
) |> sqrt()

approx_std_error <- as.numeric(
  t(wtd_y) %*% approx_quad_form %*% wtd_y
) |> sqrt()

print(approx_std_error)
print(std_error)

approx_std_error / std_error