## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE, warning = FALSE ) ## ----setup-------------------------------------------------------------------- library(GenomicRanges) library(smer) ## ----bim_data----------------------------------------------------------------- bim_data <- data.frame( chromosome = c(1, 1, 1, 2, 2, 2, 3, 3, 3), variant_id = c("rs1", "rs2", "rs3", "rs4", "rs5", "rs6", "rs7", "rs8", "rs9"), cm_position = c(0, 0, 0, 0, 0, 0, 0, 0, 0), bp_position = c(10, 20, 30, 40, 50, 60, 70, 80, 90), allele1 = c("A", "A", "A", "G", "C", "C", "T", "T", "A"), allele1 = c("G", "G", "G", "A", "T", "T", "A", "A", "G") ) bim_data$index <- 1:nrow(bim_data) # DHS intervals hg19_dhs_regions <- data.frame( chromosome = c(1, 2, 3), start = c(5, 45, 85), stop = c(15, 55, 95) ) # LD block intervals hg19_ld_blocks <- data.frame( chromosome = c(1, 1, 2, 2, 3, 3, 3), start = c(5, 25, 35, 45, 65, 75, 85), stop = c(25, 35, 45, 65, 75, 85, 95) ) ## ----dhs_data----------------------------------------------------------------- # Convert .bim to GRanges object bim_gr <- GRanges( seqnames = paste0("chr", bim_data$chromosome), ranges = IRanges(start = bim_data$bp_position, end = bim_data$bp_position), variant_id = bim_data$variant_id, genome = "hg19" ) # Convert DHS to GRanges object dhs_gr <- GRanges( seqnames = paste0("chr", hg19_dhs_regions$chromosome), ranges = IRanges(start = hg19_dhs_regions$start, end = hg19_dhs_regions$stop), genome = "hg19" ) # Find overlaps of BIM variants and DHS intervals overlaps <- findOverlaps(bim_gr, dhs_gr, maxgap = 0) # Extract overlapping variants dhs_data <- bim_data[queryHits(overlaps), ] dhs_data <- dhs_data[!duplicated(dhs_data$index), ] ## ----------------------------------------------------------------------------- # Convert to GRanges object ld_gr <- GRanges( seqnames = paste0("chr", hg19_ld_blocks$chromosome), ranges = IRanges(start = hg19_ld_blocks$start, end = hg19_ld_blocks$stop), genome = "hg19" ) # Find LD block of bim variants ld_overlaps <- findOverlaps(query = bim_gr, subject = ld_gr) ## ----write_mask--------------------------------------------------------------- output_file <- tempfile() gxg_group <- "gxg" ld_group <- "ld" gxg_variants <- dhs_data$index - 1 # 0-base index for C++ create_hdf5_file(output_file) for (j in bim_data$index - 1) { # 0-base index for C++ # Write DHS mask gxg_ds <- sprintf("%s/%d", gxg_group, j) write_hdf5_dataset(file_name = output_file, dataset_name = gxg_ds, gxg_variants) # Find LD block of focal SNP focal_gr <- ld_gr[subjectHits(ld_overlaps[j,])] # Find variants in LD block of focal SNP focal_ld <- findOverlaps(query = bim_gr, subject = focal_gr) ld_data <- bim_data[queryHits(focal_ld),] ld_variants <- ld_data$index - 1 # 0-base index for C++ # Write LD mask ld_ds <- sprintf("%s/%d", ld_group, j) write_hdf5_dataset(file_name = output_file, dataset_name = ld_ds, ld_variants) } dhs_indices <- read_hdf5_dataset(file_name = output_file, dataset_name = gxg_ds) print(sprintf("DHS indices: %s", paste(dhs_indices, collapse = ", "))) ## ----run_sme, eval = FALSE---------------------------------------------------- # sme_result <- sme( # plink_file = "/path/to/plink/data", # pheno_file = "/path/to/pheno/data", # mask_file = "/path/to/mask/file", # gxg_indices = c(1, 2, 3), # chunk_size = 250, # n_randvecs = 10, # n_blocks = 200, # n_threads = 6 # ) ## ----sessinfo----------------------------------------------------------------- sessionInfo()