## ----echo=FALSE, warning=FALSE------------------------------------------------ library(httr) ## ----warning=FALSE, message=FALSE, eval=FALSE--------------------------------- # # install iimi # install.packages(c("iimi", "dplyr")) # # # install Biostrings # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("Biostrings") ## ----warning=FALSE, message=FALSE--------------------------------------------- library(iimi) library(Biostrings) library(dplyr) # this is the extracted DNAStringSet data from the Virtool virus database GitHub. It contains both version 1.4.0 and version 1.5.0 get_url <- "https://github.com/virtool/iimi/blob/main/data/virus_segments.rda?raw=true" load(url(get_url)) ## ----eval=FALSE, warning=FALSE------------------------------------------------ # path_to_bamfiles <- list.files( # path = "path/to/your/BAM/files/folder", # pattern = "bam$", # full.names = TRUE, # include.dirs = TRUE # ) ## ----eval=FALSE, warning=FALSE------------------------------------------------ # example_cov <- convert_bam_to_rle(bam_file = "path_to_bamfiles") ## ----eval=FALSE, warning=FALSE------------------------------------------------ # # Using default settings (recommended) # df <- convert_rle_to_df(example_cov) # # # Disabling unreliable region processing # df <- # convert_rle_to_df(example_cov, unreliable_region_enabled = FALSE) # # # Using custom unreliable regions # # Refer to section 3.3 for details # custom_regions <- create_custom_unreliable_regions() # df <- # convert_rle_to_df(example_cov, unreliable_region_df = custom_regions) ## ----message=FALSE, warning=FALSE, results='hide', eval=FALSE----------------- # prediction_default <- predict_iimi(newdata = df, method = "xgb") ## ----eval=FALSE--------------------------------------------------------------- # # preparing the train and test data # # # spliting into 80-20 train and test data set with the three plant samples # set.seed(123) # train_names <- sample(levels(as.factor(df$sample_id)), # length(unique(df$sample_id)) * 0.8) # # # trian data # train_x = df[df$sample_id %in% train_names,] # # test data # test_x = df[df$sample_id %in% train_names == F,] # # # preparing labels # train_y = c() # for (ii in 1:nrow(train_x)) { # train_y = append(train_y, example_diag[train_x$seg_id[ii], # train_x$sample_id[ii]]) # } ## ----message=FALSE, warning=FALSE, results='hide', eval=FALSE----------------- # fit <- train_iimi(train_x = train_x, train_y = train_y) ## ----eval=FALSE--------------------------------------------------------------- # prediction_customized <- # predict_iimi(newdata = test_x, # trained_model = fit) ## ----------------------------------------------------------------------------- # An example of the provided unreliable regions unreliable_regions %>% group_by(Categories) %>% sample_n(2) ## ----eval=FALSE--------------------------------------------------------------- # # if you would like to keep unmappable regions that can be mapped to other # # viruses or the host genome separate into two data frames, you may use the # # following code: # # # input your own path that you would want to store regions on a virus that can # # be mapped to another virus # # you can customize the name of this type of mappability profile # mappability_profile_virus <- # create_mappability_profile("path/to/bam/files/folder/virus", category = "Unmappable region (virus)", virus_info = virus_segments) # # # input your own path that you would want to store regions on a virus that can # # be mapped to the host genome # # you can customize the name of this type of mappability profile # mappability_profile_host <- # create_mappability_profile("path/to/bam/files/folder/host", category = "Unmappable region (host)", virus_info = virus_segments) # # # if you would like to keep everything in the same data frame, you may use the # # following code: # mappability_profile <- # create_mappability_profile("path/to/bam/files/folder/of/both/types/", category = "Unmappable region", virus_info = virus_segments) ## ----eval=FALSE--------------------------------------------------------------- # high_nucleotide_regions <- # create_high_nucleotide_content(gc = 0.6, a = 0.45, virus_info = virus_segments) ## ----fig.width=7, fig.height=5------------------------------------------------ oldpar <- par(mfrow = c(1, 2)) ## if you wish to plot all segments of one sample, you can try: # plot_cov(covs = example_cov["S1"]) ## if you wish to plot all segments from all samples, you can try: # plot_cov(covs = example_cov) ## if you wish to plot certain segments from one sample, you can try: segs = c("82np2784", "m0kacxse") covs_selected = list() covs_selected$`305S` <- example_cov$`305S`[segs] ## if you have many segments that you would want to plot, you can try the following code with the numbers changed ## to find the index of your desired segments: # covs_selected$`305S` <- # example_cov$`305S`[names(example_cov$`305S`)[c(8, 15)]] par(mar = c(2, 4, 1, 1)) layout(matrix(c(1, 1, 2, 3, 3, 4), nrow = 3)) plot_cov(covs = covs_selected, virus_info = virus_segments) par(oldpar)