## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(pminternal) library(mice) # make some data set.seed(2345) n <- 800 p <- 5 X <- matrix(rnorm(n*p), nrow = n, ncol = p) LP <- -1 + X %*% c(-1, 1, -.5, .5, 0) y <- rbinom(n, 1, plogis(LP)) mean(y) datcomp <- data.frame(y, X) # add missingness datmis <- datcomp datmis$X1[sample(n, 200)] <- NA datmis$X2[sample(n, 200)] <- NA datmis$X3[sample(n, 200)] <- NA # assess performance before introducing missingness mod <- glm(y ~ ., data = datcomp, family = "binomial") (comp_val <- validate(mod, method = "cv_o")) ## ----------------------------------------------------------------------------- model_mi <- function(data, ...){ imp <- mice::mice(data, printFlag = FALSE) # 5 imputed datasets via pmm fits <- with(imp, glm(y ~ X1 + X2 + X3 + X4 + X5, family=binomial)) B <- mice::pool(fits)$pooled$estimate # pooled coefs for predictions # save pooled coefficients and the data to do stacked imputation at validation list(B, data) } pred_mi <- function(model, data, ...){ # extract pooled coefs B <- model[[1]] # stack the new data on the model fit data (model[[2]]) dstacked <- rbind(model[[2]], data) i <- (nrow(model[[2]]) + 1):(nrow(dstacked)) # impute stacked data # use ignore argument to ensure the test data doesn't influence # the imputation model but still gets imputed imp <- mice::mice(dstacked, printFlag = FALSE, ignore = seq(nrow(dstacked)) %in% i) # get logit predicted risks for each imputed data set preds <- sapply(seq(imp$m), \(x){ # complete(imp, x)[i, ] = get complete data set x and extract the # validation data i X <- model.matrix(~ X1 + X2 + X3 + X4 + X5, data = mice::complete(imp, x)[i, ]) X %*% B }) # average logit predictions, transform invlogit, and return plogis(apply(preds, 1, mean)) } ## ----------------------------------------------------------------------------- (mi_val <- validate(data = datmis, model_fun = model_mi, pred_fun = pred_mi, outcome = "y", method = "cv_o")) ## ----fig.width=4, fig.height=5------------------------------------------------ md <- md.pattern(datmis) ## ----------------------------------------------------------------------------- # store missing data pattern in data # as this will be useful datmis$mdp <- apply(datmis[, paste0("X", 1:5)], 1, \(x) paste0(as.numeric(is.na(x)), collapse = "")) table(datmis$mdp) ## ----------------------------------------------------------------------------- submodels <- list( "00000" = y ~ X1 + X2 + X3 + X4 + X5, "00100" = y ~ X1 + X2 + X4 + X5, "01000" = y ~ X1 + X3 + X4 + X5, "01100" = y ~ X1 + X4 + X5, "10000" = y ~ X2 + X3 + X4 + X5, "10100" = y ~ X2 + X4 + X5, "11000" = y ~ X3 + X4 + X5, "11100" = y ~ X4 + X5 ) model_ps <- function(data, ...){ # this example uses submodels as an additional argument to # validate. But we could have put the submodels list in the # model_sub function dots <- list(...) if ("submodels" %in% names(dots)) submodels <- dots[["submodels"]] else stop("no submodels") patterns <- names(submodels) # all patterns in data should be in submodels, # but not necessarily vice versa (resampling # could have omitted a pattern) stopifnot(all(unique(data$mdp) %in% patterns)) fits <- lapply(patterns, \(pat){ f <- submodels[[pat]] # submodel formula # if n with pattern > 2*p then fit submodel n <- sum(data$mdp == pat) if (n >= 2*nchar(pat)){ fit <- glm(f, data = subset(data, mdp == pat), family = binomial) } else{ # otherwise fit on complete cases fit <- glm(f, data = data, family = binomial) # note this approach allows submodels for patterns not # observed in development data. Though these will be # fit using complete cases (obviously not pattern specific data) } fit }) names(fits) <- patterns fits } pred_ps <- function(model, data, ...){ patterns <- names(model) # there needs to be a model for each pattern in data stopifnot(all( unique(data$mdp) %in% patterns )) patterns <- patterns[patterns %in% unique(data$mdp)] preds <- lapply(patterns, \(pat){ i = which(data$mdp == pat) # for reordering later pdat <- subset(data, mdp == pat) p <- predict(model[[pat]], newdata = pdat, type = "response") data.frame(y = pdat$y, p = p, i = i) }) preds <- do.call(rbind, preds) # reorder so same as original data preds <- preds[order(preds$i),] # all(data$y == preds$y) preds$p } ## ----------------------------------------------------------------------------- (sub_val <- validate(data = datmis, model_fun = model_ps, pred_fun = pred_ps, submodels = submodels, outcome = "y", method = "cv_o"))