## ----envs, include=FALSE------------------------------------------------------
set.seed(0)
knitr::opts_chunk$set(
  out.extra = 'style="display:block; margin: auto"',
  fig.align = "center",
  fig.height = 8,
  fig.path = "gaawr2/",
  fig.width = 8,
  collapse = TRUE,
  comment = "#>",
  dev = "CairoPNG")

## ----setup, message=FALSE, warning=FALSE--------------------------------------
pkgs <- c("EnsDb.Hsapiens.v75","ensembldb","GMMAT","HardyWeinberg","MCMCglmm","SNPassoc","biomaRt",
           "gap","gap.datasets","haplo.stats","powerEQTL","R2jags","regress",
           "dplyr","ggplot2","httr","jsonlite","kableExtra","knitr","tidyr")
for (p in pkgs) if (length(grep(paste("^package:", p, "$", sep=""), search())) == 0) {
    if (!requireNamespace(p)) warning(paste0("This vignette needs package `", p, "'; please install"))
}
invisible(suppressMessages(lapply(pkgs, require, character.only = TRUE)))
sys_options <- options()
new_options <- options(digits=2)

## ----welcome------------------------------------------------------------------
print("Hello, world!\n")

## ----iris---------------------------------------------------------------------
class(iris)
dim(iris)
str(iris)
head(iris,1)
tail(iris,1)

## ----diabetes, fig.cap="Mean values by gender and diabetes category", fig.height=6, fig.with=8, messages=FALSE----
options(new_options)
data(diabetes,package="gaawr2")

mean_values <- diabetes %>%
  dplyr::filter(CLASS %in% c("Y", "N", "P")) %>%
  dplyr::mutate(
    Gender = dplyr::recode(Gender, "F" = "Female", "M" = "Male"),
    CLASS = dplyr::recode(CLASS, "Y" = "Yes", "N" = "No", "P" = "Predicted")
  ) %>%
  dplyr::group_by(CLASS, Gender) %>%
  dplyr::select(AGE:BMI) %>%
  dplyr::summarize(dplyr::across(dplyr::everything(), \(x) mean(x, na.rm = TRUE)))
kableExtra::kbl(mean_values,caption="Mean value by gender and diabetes category") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))

mean_values_long <- mean_values %>%
  tidyr::pivot_longer(
    cols = AGE:BMI,
    names_to = "Variable",
    values_to = "Mean_Value"
  )
ggplot2::ggplot(mean_values_long,
  ggplot2::aes(x = Variable, y = Mean_Value, fill = CLASS)) +
  ggplot2::geom_col(position = ggplot2::position_dodge()) +
  ggplot2::facet_wrap(~ Gender) +
  ggplot2::labs(
    title = "",
    x = "Variable",
    y = "Mean Value",
    fill = "Diabetes Status" # Modified legend title for clarity
  ) +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
options(sys_options)

## ----hwe, fig.cap="SNP ternary plot", fig.height=8, fig.width=10, messages=FALSE----
# MN blood group
SNP <- c(MM = 298, MN = 489, NN = 213)
HardyWeinberg::maf(SNP)
HardyWeinberg::HWTernaryPlot(SNP,region=0,grid=TRUE,markercol="blue")
HardyWeinberg::HWChisq(SNP, cc = 0, verbose = TRUE)
# Chromosome X
xSNP <- c(A=10, B=20, AA=30, AB=20, BB=10)
HardyWeinberg::HWChisq(xSNP,cc=0,x.linked=TRUE,verbose=TRUE)
# HLA/DQR
DQR <- gap.datasets::hla[,3:4]
a1 <- DQR[1]
a2 <- DQR[2]
GenotypeCounts <- HardyWeinberg::AllelesToTriangular(a1,a2)
kableExtra::kbl(GenotypeCounts,caption="Genotype distribution of DQR") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
HardyWeinberg::HWPerm.mult(GenotypeCounts,nperm=300)
HardyWeinberg::HWStr(hla[,3:4],test="permutation",nperm=300)

## ----snpassoc1, warning=FALSE-------------------------------------------------
data(asthma, package = "SNPassoc")
str(asthma, list.len=8)
knitr::kable(asthma[1:3,1:8],caption="First three records & two SNPs")
snpCols <- colnames(asthma)[6+(1:2)]
snps <- SNPassoc::setupSNP(data=asthma[snpCols], colSNPs=1:length(snpCols), sep="")
head(snps)
summary(snps, print=FALSE)
lapply(snps, head)
lapply(snps, summary)
SNPassoc::tableHWE(snps)

## ----snpassoc2, warning=FALSE-------------------------------------------------
asthma.snps <- asthma %>%
               dplyr::rename(cc=casecontrol) %>%
               SNPassoc::setupSNP(colSNPs=(6+1):ncol(.), sep="")
# Model 1: Simple SNP association with BMI
SNPassoc::association(bmi ~ rs4490198, data = asthma.snps)

# Model 2: SNP association with case-control status
SNPassoc::association(cc ~ rs4490198, data = asthma.snps)

# Model 3: SNP association with covariates (country and smoke)
SNPassoc::association(cc ~ rs4490198 + country + smoke, data = asthma.snps)

# Model 4: SNP association with stratification by gender
SNPassoc::association(cc ~ rs4490198 + survival::strata(gender), data = asthma.snps)

# Model 5: SNP association with subset (only Spain)
SNPassoc::association(cc ~ rs4490198, data = asthma.snps, subset = country == "Spain")

# Model 6: Interaction between SNP (dominant model) and smoking
SNPassoc::association(cc ~ SNPassoc::dominant(rs4490198) * factor(smoke), data = asthma.snps)

# Model 7: Interaction between two SNPs (dominant model for rs4490198)
SNPassoc::association(cc ~ rs4490198 * factor(rs11123242), data = asthma.snps, model.interaction = "dominant")

## ----haplostats, warning=FALSE------------------------------------------------
# Association with the first three SNPs
snpsH <- names(asthma.snps)[6+(1:3)]
genoH <- SNPassoc::make.geno(asthma.snps, snpsH)
em <- haplo.stats::haplo.em(genoH, locus.label = snpsH, miss.val = c(0, NA))
haplo_table <- with(em,cbind(haplotype,hap.prob))
knitr::kable(haplo_table,caption="Haplotypes of the first three SNPs")
modH <- haplo.stats::haplo.glm(cc ~ genoH, data=asthma.snps,
                               family="binomial",
                               locus.label=snpsH,
                               allele.lev=attributes(genoH)$unique.alleles,
                               control = haplo.stats::haplo.glm.control(haplo.freq.min=0.05))
modH
SNPassoc::intervals(modH)

# Model comparison with / without haplotypes
mod.adj.ref <- glm(cc ~ smoke, data=asthma.snps, family="binomial")
mod.adj <- haplo.glm(cc ~ genoH + smoke, data=asthma.snps,
                 family="binomial",
                 locus.label=snpsH,
                 allele.lev=attributes(genoH3)$unique.alleles,
                 control = haplo.stats::haplo.glm.control(haplo.freq.min=0.05))
mod.adj
lrt.adj <- mod.adj.ref$deviance - mod.adj$deviance
pchisq(lrt.adj, mod.adj$lrt$df, lower=FALSE)

# Four variable slide windows over nine SNPs
snpsH <- labels(asthma.snps)[6+(1:9)]
genoH <- SNPassoc::make.geno(asthma.snps, snpsH)
haploH <- list()
for (i in 1:4) haploH[[i]] <- haplo.stats::haplo.score.slide(asthma.snps$cc, genoH,
                              trait.type="binomial",
                              n.slide=i,
                              locus.label=snpsH,
                              simulate=TRUE,
                              sim.control=haplo.stats::score.sim.control(min.sim=50,max.sim=100))

## ----gmmat, message=FALSE-----------------------------------------------------
data(example,package="GMMAT")
attach(example)
model0 <- GMMAT::glmmkin(disease ~ age + sex, data = pheno, kins = GRM,
                         id = "id", family = binomial(link = "logit"))
model1 <- GMMAT::glmmkin(fixed = trait ~ age + sex, data = pheno, kins = GRM,
                         id = "id", family = gaussian(link = "identity"))
model2 <- GMMAT::glmmkin(fixed = trait ~ age + sex, data = pheno, kins = GRM,
                         id = "id", groups = "disease",
                         family = gaussian(link = "identity"))
snps <- c("SNP10", "SNP25", "SNP1", "SNP0")
geno.file <- system.file("extdata", "geno.bgen", package = "GMMAT")
samplefile <- system.file("extdata", "geno.sample", package = "GMMAT")
outfile <- "glmm.score.txt"
GMMAT::glmm.score(model0, infile = geno.file, BGEN.samplefile = samplefile,
                  outfile = outfile)
read.delim(outfile) |>
     head(n=4) |>
     knitr::kable(caption="Score tests under GLMM on four SNPs",digits=2)
unlink(outfile)
bed.file <- system.file("extdata", "geno.bed", package = "GMMAT") |>
            tools::file_path_sans_ext()
model.wald <- GMMAT::glmm.wald(fixed = disease ~ age + sex, data = pheno,
                               kins = GRM, id = "id", family = binomial(link = "logit"),
                               infile = bed.file, snps = snps)
knitr::kable(model.wald,caption="Wald tests under GLMM on four SNPs")
detach(example)

## ----jags, message=FALSE------------------------------------------------------
set.seed(1234567)
meyer <- within(gap.datasets::meyer,{
         y[is.na(y)] <- rnorm(length(y[is.na(y)]),mean(y,na.rm=TRUE),sd(y,na.rm=TRUE))
         g1 <- ifelse(generation==1,1,0)
         g2 <- ifelse(generation==2,1,0)
         id <- animal
         animal <- ifelse(!is.na(animal),animal,0)
         dam <- ifelse(!is.na(dam),dam,0)
         sire <- ifelse(!is.na(sire),sire,0)
     })
G <- gap::kin.morgan(meyer)$kin.matrix*2
r <- regress::regress(y~-1+g1+g2,~G,data=meyer)
r
with(r,gap::h2G(sigma,sigma.cov))
eps <- 0.001
y <- with(meyer,y)
x <- with(meyer,cbind(g1,g2))
ex <- gap::h2.jags(y,x,G,sigma.p=0.03,sigma.r=0.014,n.chains=1,n.iter=80)
kableExtra::kbl(ex$BUGSoutput$summary,digits=2,caption="MCMC results for the Meyer data") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))

## ----eqtl, fig.cap="Power Estimation for eQTL Studies of 240 SNPs", fig.height=6, fig.width=8, messages=FALSE----
n.designs <- 6
designs <- 1:n.designs
N <- 50 * designs
n.grids <- 100
index <- 1:n.grids
grids <- index / n.grids
MAF <- seq(0.005, n.grids/2, by=0.5) / n.grids
plot(MAF,grids,type="n",ylab="Power")
mtext(expression(paste("(",alpha," = 0.05)")),1,line=4.5)
colors <- grDevices::hcl.colors(n.designs)
for (design in designs)
{
  power.SLR <- rep(NA,n.grids)
  for (j in index) power.SLR[j] <- powerEQTL::powerEQTL.SLR(MAF = MAF[j], FWER = 0.05, nTests = 240, slope = 0.13,
                                                            n = N[design], sigma.y = 0.13)
  lines(MAF,power.SLR,col=colors[design])
}
legend("bottomright", inset=.02, title="Sample size (N)", paste(N), col=colors, horiz=FALSE, cex=0.8, lty=designs)

## ----ensdb, messages=FALSE----------------------------------------------------
ensembldb::metadata(EnsDb.Hsapiens.v75)
genes <- ensembldb::genes(EnsDb.Hsapiens.v75)
head(genes)
transcripts_data <- ensembldb::transcripts(EnsDb.Hsapiens.v75)
head(transcripts_data)

## ----ot, messages=FALSE-------------------------------------------------------
gene_id <- "ENSG00000164308"
query_string = "
  query target($ensemblId: String!){
    target(ensemblId: $ensemblId){
      id
      approvedSymbol
      biotype
      geneticConstraint {
        constraintType
        exp
        obs
        score
        oe
        oeLower
        oeUpper
      }
      tractability {
        label
        modality
        value
      }
    }
  }
"
base_url <- "https://api.platform.opentargets.org/api/v4/graphql"
variables <- list("ensemblId" = gene_id)
post_body <- list(query = query_string, variables = variables)
r <- httr::POST(url=base_url, body=post_body, encode='json')
data <- iconv(r, "", "ASCII")
content <- jsonlite::fromJSON(data)
target <- content$data$target
scalar_fields <- data.frame(
  Field = c("ID", "Approved Symbol", "Biotype"),
  Value = c(target$id, target$approvedSymbol, target$biotype)
)
tractability_data <- target$tractability
kableExtra::kbl(scalar_fields,caption="(a) Basic Information") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
kableExtra::kbl(target$geneticConstraint, caption="(b) Genetic Constraint Metrics") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
kableExtra::kbl(tractability_data,caption="(c) Tractability Information") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

## ----desc, results="asis"-----------------------------------------------------
suggests <- read.dcf(file = system.file("DESCRIPTION", package = "gaawr2"), fields = c("Suggests"))
write.dcf(suggests)