# Start the HDANOVA R package
library(HDANOVA)
#>
#> Attaching package: 'HDANOVA'
#> The following object is masked from 'package:stats':
#>
#> loadings
Data are stored in many different file formats. The following three examples cover two types of CSV-files and generic flat files.
# Find directory extdata from the multiblock package
mbdir <- system.file('extdata/', package = "multiblock")
# Comma separated values, row names in first column
meta_data <- read.csv(paste0(mbdir, "/meta_data.csv"), row.names = 1)
# If working directory matches file location:
# meta_data <- read.csv('meta_data.csv', row.names = 1)
meta_data
#> temperature colour
#> John 38.0 blue
#> Julia 37.0 green
#> James 37.5 blue
#> Jacob 37.6 red
#> Jane 37.2 red
#> Johanna 37.9 green
# Semi-colon separated values (locales where the decimal point is comma),
# no row names
proteins <- read.csv2(paste0(mbdir, "/proteins.csv"))
proteins
#> prot1 prot2 prot3
#> 1 0.46532048 0.30183300 -1.4654414
#> 2 -1.79802081 -0.22812232 -0.4639203
#> 3 -1.92962434 -0.40513080 0.1767796
#> 4 0.87437138 0.79843798 0.1234731
#> 5 -0.62445278 -0.07975479 -1.1126332
#> 6 -0.07493721 1.09576027 1.2656596
# Blank space separated data without labels
genes <- read.table(paste0(mbdir, "/genes.dat"))
genes
#> V1 V2 V3
#> 1 0.39033106 -0.5720390 1.9147573
#> 2 0.55352785 0.0948703 -0.2239755
#> 3 0.09872346 -0.1029385 0.9047138
#> 4 -0.59213740 -0.6027739 0.6177083
#> 5 -0.02350148 0.3572809 -0.5168416
#> 6 0.76644845 1.2863428 1.8239298
Before analysis, various types of pre-processing may be needed. Centring and standardising/scaling may be considered the most basic. In R, these operations are performed column-wise by default, leading to autoscaling. If these operations are performed on the rows, we perform the standard normal variate (SNV) instead.
# Column-centring
genes_centred <- scale(genes, scale=FALSE)
colMeans(genes_centred) # Check mean values
#> V1 V2 V3
#> 1.850372e-17 0.000000e+00 7.401487e-17
# Autoscaling
genes_scaled <- scale(genes)
apply(genes_scaled, 2, sd) # Check standard deviations
#> V1 V2 V3
#> 1 1 1
# SNV (transpose, autoscale, re-transpose)
genes_snv <- t(scale(t(genes)))
apply(genes_snv, 1, sd) # Check standard deviations
#> [1] 1 1 1 1 1 1
Most analysis methods require continuous input data. The meta_data data.frame contains a character vector (a factor in older R versions) of categories. This package has a function dummycode for converting categorical data to various dummy formats.
# Default is sum-coding
dummycode(meta_data$colour)
#> x1 x2
#> 1 1 0
#> 2 0 1
#> 3 1 0
#> 4 -1 -1
#> 5 -1 -1
#> 6 0 1
# Treatment coding
dummycode(meta_data$colour, "contr.treatment")
#> xgreen xred
#> 1 0 0
#> 2 1 0
#> 3 0 0
#> 4 0 1
#> 5 0 1
#> 6 1 0
# Full dummy-coding (rank deficient)
dummycode(meta_data$colour, drop = FALSE)
#> xblue xgreen xred
#> 1 1 0 0
#> 2 0 1 0
#> 3 1 0 0
#> 4 0 0 1
#> 5 0 0 1
#> 6 0 1 0
# Replace categorical with dummy-coded, use I() to index by common name
meta_data2 <- meta_data
meta_data2$colour <- I(dummycode(meta_data$colour, drop = FALSE))
meta_data2
#> temperature colour.xblue colour.xgreen colour.xred
#> John 38.0 1 0 0
#> Julia 37.0 0 1 0
#> James 37.5 1 0 0
#> Jacob 37.6 0 0 1
#> Jane 37.2 0 0 1
#> Johanna 37.9 0 1 0
meta_data2$colour
#> xblue xgreen xred
#> 1 1 0 0
#> 2 0 1 0
#> 3 1 0 0
#> 4 0 0 1
#> 5 0 0 1
#> 6 0 1 0
A simple list of blocks can be created using the list() function. Naming of the blocks can be done directly or after creation.
# Direct approach
blocks1 <- list(meta = meta_data2, proteins = proteins, genes = genes)
# Two-step approach
blocks2 <- list(meta_data2, proteins, genes)
names(blocks2) <- c('meta', 'proteins', 'genes')
# Same result
identical(blocks1, blocks2)
#> [1] TRUE
# Access by name or number
blocks1[['meta']]
#> temperature colour.xblue colour.xgreen colour.xred
#> John 38.0 1 0 0
#> Julia 37.0 0 1 0
#> James 37.5 1 0 0
#> Jacob 37.6 0 0 1
#> Jane 37.2 0 0 1
#> Johanna 37.9 0 1 0
blocks2[[1]]
#> temperature colour.xblue colour.xgreen colour.xred
#> John 38.0 1 0 0
#> Julia 37.0 0 1 0
#> James 37.5 1 0 0
#> Jacob 37.6 0 0 1
#> Jane 37.2 0 0 1
#> Johanna 37.9 0 1 0
A data.frame is a convenient storage format for data in R and can handle many types of variables, e.g. numeric, logical, character, factor or matrices. The latter is useful for analyses of data with shared sample mode.
# Construct block data.frame from list
df1 <- block.data.frame(blocks1)
# Construct block data.frame from data.frame:
# First merge blocks into data.frame
my_data <- cbind(meta_data2, proteins, genes)
# Then construct block data.frame using named
# list of indexes
df2 <- block.data.frame(my_data, block_inds =
list(meta = 1:2, proteins = 3:5, genes = 6:8))
# Same result
identical(df1,df2)
#> [1] TRUE
# Access by name or number
df1[[2]]
#> prot1 prot2 prot3
#> John 0.46532048 0.30183300 -1.4654414
#> Julia -1.79802081 -0.22812232 -0.4639203
#> James -1.92962434 -0.40513080 0.1767796
#> Jacob 0.87437138 0.79843798 0.1234731
#> Jane -0.62445278 -0.07975479 -1.1126332
#> Johanna -0.07493721 1.09576027 1.2656596
df2[['proteins']]
#> prot1 prot2 prot3
#> John 0.46532048 0.30183300 -1.4654414
#> Julia -1.79802081 -0.22812232 -0.4639203
#> James -1.92962434 -0.40513080 0.1767796
#> Jacob 0.87437138 0.79843798 0.1234731
#> Jane -0.62445278 -0.07975479 -1.1126332
#> Johanna -0.07493721 1.09576027 1.2656596
df1[c(1,3)]
#> meta.temperature meta.colour.xblue meta.colour.xgreen meta.colour.xred
#> John 38.0 1.0 0.0 0.0
#> Julia 37.0 0.0 1.0 0.0
#> James 37.5 1.0 0.0 0.0
#> Jacob 37.6 0.0 0.0 1.0
#> Jane 37.2 0.0 0.0 1.0
#> Johanna 37.9 0.0 1.0 0.0
#> genes.V1 genes.V2 genes.V3
#> John 0.39033106 -0.57203901 1.91475732
#> Julia 0.55352785 0.09487030 -0.22397553
#> James 0.09872346 -0.10293847 0.90471382
#> Jacob -0.59213740 -0.60277392 0.61770832
#> Jane -0.02350148 0.35728086 -0.51684156
#> Johanna 0.76644845 1.28634283 1.82392983
df1[-2]
#> meta.temperature meta.colour.xblue meta.colour.xgreen meta.colour.xred
#> John 38.0 1.0 0.0 0.0
#> Julia 37.0 0.0 1.0 0.0
#> James 37.5 1.0 0.0 0.0
#> Jacob 37.6 0.0 0.0 1.0
#> Jane 37.2 0.0 0.0 1.0
#> Johanna 37.9 0.0 1.0 0.0
#> genes.V1 genes.V2 genes.V3
#> John 0.39033106 -0.57203901 1.91475732
#> Julia 0.55352785 0.09487030 -0.22397553
#> James 0.09872346 -0.10293847 0.90471382
#> Jacob -0.59213740 -0.60277392 0.61770832
#> Jane -0.02350148 0.35728086 -0.51684156
#> Johanna 0.76644845 1.28634283 1.82392983
df2[c('proteins','genes')]
#> proteins.prot1 proteins.prot2 proteins.prot3 genes.V1 genes.V2
#> John 0.46532048 0.30183300 -1.46544136 0.39033106 -0.57203901
#> Julia -1.79802081 -0.22812232 -0.46392027 0.55352785 0.09487030
#> James -1.92962434 -0.40513080 0.17677963 0.09872346 -0.10293847
#> Jacob 0.87437138 0.79843798 0.12347308 -0.59213740 -0.60277392
#> Jane -0.62445278 -0.07975479 -1.11263323 -0.02350148 0.35728086
#> Johanna -0.07493721 1.09576027 1.26565956 0.76644845 1.28634283
#> genes.V3
#> John 1.91475732
#> Julia -0.22397553
#> James 0.90471382
#> Jacob 0.61770832
#> Jane -0.51684156
#> Johanna 1.82392983
# Use with formula interface (see other vignettes)
# sopls(meta ~ proteins + genes, data = df1)
# Use with single list interface (see other vignettes)
# mfa(df1[c(1,3)], ncomp = 3)