## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----eval=FALSE---------------------------------------------------------------
#  library(ukbnmr)
#  library(data.table) # for fast reading and writing of csv files using fread() and fwrite()
#  
#  # Load exported field data saved by the Table Exporter tool on the RAP
#  exported <- fread("path/to/exported_ukbiobank_phenotype_data.csv")
#  
#  nmr <- extract_biomarkers(exported)
#  biomarker_qc_flags <- extract_biomarker_qc_flags(exported)
#  sample_qc_flags <- extract_sample_qc_flags(exported)
#  
#  fwrite(nmr, file="path/to/nmr_biomarker_data.csv")
#  fwrite(biomarker_qc_flags, file="path/to/nmr_biomarker_qc_flags.csv")
#  fwrite(sample_qc_flags, file="path/to/nmr_sample_qc_flags.csv")

## -----------------------------------------------------------------------------
library(ukbnmr)

exported <- ukbnmr::test_data # see help("test_data") for more details

nmr <- extract_biomarkers(exported)
biomarker_qc_flags <- extract_biomarker_qc_flags(exported)
sample_qc_flags <- extract_sample_qc_flags(exported)

## ----eval=FALSE---------------------------------------------------------------
#  library(ukbnmr)
#  library(data.table) # for fast reading and writing of csv files using fread() and fwrite()
#  
#  # Load exported field data saved by the Table Exporter tool on the RAP
#  exported <- fread("path/to/exported_ukbiobank_phenotype_data.csv")
#  
#  processed <- remove_technical_variation(exported)
#  
#  fwrite(processed$biomarkers, file="path/to/nmr_biomarker_data.csv")
#  fwrite(processed$biomarker_qc_flags, file="path/to/nmr_biomarker_qc_flags.csv")
#  fwrite(processed$sample_processing, file="path/to/nmr_sample_qc_flags.csv")
#  fwrite(processed$log_offset, file="path/to/nmr_biomarker_log_offset.csv")
#  fwrite(processed$outlier_plate_detection, file="path/to/outlier_plate_info.csv")

## -----------------------------------------------------------------------------
library(ukbnmr)

exported <- ukbnmr::test_data # see help("test_data") for more details

processed <- remove_technical_variation(exported)

## ----eval=FALSE---------------------------------------------------------------
#  library(ukbnmr)
#  library(data.table) # for fast reading and writing of csv files using fread() and fwrite()
#  library(MASS) # Robust linear regression
#  
#  # Load exported field data saved by the Table Exporter tool on the RAP
#  exported <- fread("path/to/exported_ukbiobank_phenotype_data.csv")
#  
#  # First we need to remove the effects of technical variation before removing the
#  # biological covariates
#  processed <- remove_technical_variation(exported)
#  tech_qc <- processed$biomarkers
#  
#  # Write out all the component results as above
#  fwrite(tech_qc, file="path/to/nmr_biomarker_data.csv")
#  fwrite(processed$biomarker_qc_flags, file="path/to/nmr_biomarker_qc_flags.csv")
#  fwrite(processed$sample_processing, file="path/to/nmr_sample_qc_flags.csv")
#  fwrite(processed$log_offset, file="path/to/nmr_biomarker_log_offset.csv")
#  fwrite(processed$outlier_plate_detection, file="path/to/outlier_plate_info.csv")
#  
#  # Second, we can now create an additional dataset that has been adjusted for
#  # age, sex, and BMI.
#  
#  # First, we need to extract the relevant fields from UK biobank and format as
#  # above, assuming the relevant fields for age (field #21003), sex (field #31),
#  # and BMI (field #21001) have also been saved by the Table Exporter tool in
#  # path/to/exported_ukbiobank_phenotype_data.csv
#  baseline_covar <- exported[, .(eid, age=p21003_i0, sex=p31_i0, bmi=p21001_i0)]
#  repeat_covar <- exported[, .(eid, age=p21003_i1, sex=p31_i1, bmi=p21001_i1)]
#  covar <- rbind(idcol="visit_index", "0"=baseline_covar, "1"=repeat_covar)
#  covar[, visit_index := as.integer(visit_index)]
#  
#  # Combine covariates with processed NMR biomarker data. Filter the processed
#  # biomarkers only to the non-derived set, as we will plan to rederive the sums
#  # and ratios after adjusting for age sex and bmi
#  non_derived <- ukbnmr::nmr_info[Type == "Non-derived", Biomarker]
#  non_derived <- intersect(non_derived, names(nmr)) # In case some biomarkers are not in the data
#  bio_qc <- tech_qc[, .SD, .SDcols=c("eid", "visit_index", non_derived)]
#  bio_qc <- covar[bio_qc, on = .(eid, visit_index), nomatch=0]
#  
#  # Transform to long format so we can operate on all biomarkers in bulk
#  bio_qc <- melt(bio_qc, id.vars=c("eid", "visit_index", "age", "sex", "bmi"),
#    variable.name="biomarker")
#  
#  # Log transform biomarkers prior to adjustment. To do so, we need to add a small
#  # offset to any measurements equal to 0. This code is copied directly from the
#  # internals of the remove_technical_variation() function
#  log_offset <- bioqc[!is.na(value),
#    .(Minimum=min(value), Minimum.Non.Zero=min(value[value != 0])),
#    by=biomarker]
#  log_offset[, Log.Offset := ifelse(Minimum == 0, Minimum.Non.Zero / 2, 0)]
#  bio_qc[log_offset, on = .(biomarker), log_value := log(value + Log.Offset)]
#  
#  # Now adjust for age, sex, and (log) BMI using robust linear regression from the MASS package
#  bio_qc[, adjusted := rlm(biomarker ~ age + factor(sex) + log(bmi))$residuals, by=biomarker]
#  
#  # The adjusted residuals are normally distributed with mean 0. We can rescale to
#  # absolute units by subtracting the mean of the original log concentrations, then
#  # undoing the log transformation. This code is copied directly from the internals
#  # of the remove_technical_variation() function.
#  bio_qc[, adjusted := adjusted + as.vector(coef(rlm(value ~ 1)))[1], by=biomarker]
#  bio_qc[, adjusted := exp(adjusted)]
#  bio_qc[log_offset, on = .(biomarker), adjusted := adjusted - Log.Offset] # remove log offset
#  
#  # Some values that were 0 are now < 0, apply small right shift for these biomarkers
#  # (the shift should be very small, essentially numeric error. This is worth double
#  # checking!). Again, this code is copied directly from the internals of the
#  # remove_technical_variation() function.
#  shift <- bio_qc[, .(Right.Shift=-pmin(0, min(adjusted))), by=biomarker]
#  log_offset <- log_offset[shift, on = .(biomarker)]
#  bio_qc[log_offset, on = .(biomarker), adjusted := adjusted + Right.Shift]
#  
#  # Cast back to wide format so that each biomarker has its own column
#  bioqc <- dcast(bio_qc, eid + visit_index ~ biomarker, value.var="adjusted")
#  
#  # Now we recompute the composite biomarkers and derived ratios and save the result.
#  bio_qc <- recompute_derived_biomarkers(bio_qc)
#  fwrite(bio_qc, file="path/to/nmr_biomarkers_adjusted_for_covariates.csv")
#  
#  # You may also want to aggregate and save the quality control flags for each sample from
#  # the biomarkers underlying each derived biomarker or ratio, adding them as additional
#  # columns to the input data (see help("recompute_derived_biomarker_qc_flags")).
#  biomarker_qc_flags <- recompute_derived_biomarker_qc_flags(nmr)
#  fwrite(biomarker_qc_flags, file="path/to/biomarker_qc_flags.csv")