## ----message=FALSE, warning=FALSE, include=FALSE------------------------------
library(simdata)
library(ggplot2)
library(reshape2)
library(GGally)
library(knitr)
library(doParallel)
library(doRNG)

## ----message=FALSE, warning=FALSE---------------------------------------------
correlation_matrix = diag(1, nrow = 5)

sim_design = simdata::simdesign_mvtnorm(relations = correlation_matrix, 
                                        transform_initial = data.frame,
                                        prefix_final = "variable")
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)

knitr::kable(head(sim_data))

p = GGally::ggpairs(sim_data, 
                    upper = list(continuous = "points"), 
                    progress = FALSE) +
  theme_bw()
print(p)

## ----message=TRUE, warning=TRUE-----------------------------------------------
correlation_matrix = diag(1, nrow = 5)
transformation = simdata::function_list(
  "v1" = function(x) x[, 1],
  "v2*2" = function(x) x[, 2] * 2, 
  "v3^2" = function(x) x[, 3]^2,
  "v4+v5" = function(x) x[, 4] + x[, 5], 
  check.names = FALSE # pass columnnames exactly as specified here
)

sim_design = simdata::simdesign_mvtnorm(
  relations = correlation_matrix,
  transform_initial = transformation, 
  prefix_final = NULL
)
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)

knitr::kable(head(sim_data))

p = GGally::ggpairs(sim_data, upper = list(continuous = "points"), 
                    progress = FALSE) +
  theme_bw()
print(p)

## -----------------------------------------------------------------------------
correlation_matrix = simdata::cor_from_upper(
  15, 
  rbind(c(1,2,0.8), c(1,9,0.3), 
        c(3,5,0.3), c(3,9,-0.5), 
        c(4,6,-0.5), c(4,7,-0.3),
        c(5,6,-0.3), c(5,12,0.5),
        c(6,7,0.5), c(6,11,0.5), c(6,14,0.3),
        c(7,11,0.3), c(7,14,0.3),
        c(8,9,-0.3), c(8,11,0.3),
        c(11,14,0.5)))

transformation = simdata::function_list(
  v1 = function(z) floor(10 * z[,1] + 55), 
  v2 = function(z) z[,2] < 0.6, 
  v3 = function(z) exp(0.4 * z[,3] + 3),
  v4 = function(z) z[,4] >= -1.2,
  v5 = function(z) z[,4] >= 0.75,
  v6 = function(z) exp(0.5 * z[,5] + 1.5),
  v7 = function(z) floor(pmax(0, 100 * exp(z[,6]) - 20)),
  v8 = function(z) floor(pmax(0, 80 * exp(z[,7]) - 20)),
  v9 = function(z) z[,8] < -0.35,
  v10 = function(z) (z[,9] >= 0.5) & (z[,9] < 1.5),
  v11 = function(z) z[,9] >= 1.5,
  v12 = function(z) 0.01*floor(100 * (z[,10] + 4)^2),
  v13 = function(z) floor(10 * z[,11] + 55),
  v14 = function(z) floor(10 * z[,12] + 55),
  v15 = function(z) floor(10 * z[,13] + 55),
  v16 = function(z) z[,14] < 0,
  v17 = function(z) z[,15] < 0
)

sim_design = simdata::simdesign_mvtnorm(
  relations = correlation_matrix,
  transform_initial = transformation,
  prefix_final = NULL
)

## ----echo=FALSE---------------------------------------------------------------
plotdf = melt(sim_design$cor_initial)

ggplot(plotdf, aes(x = factor(Var1), y = factor(Var2), fill = value)) + 
    geom_tile() + 
    geom_text(aes(label = value)) + 
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev) +
    xlab(expr(paste("Initial variable ", z[i]))) + 
    ylab(expr(paste("Initial variable ", z[i]))) +
    coord_equal() +
    scale_fill_gradient2(
        "Correlation",
        limits = c(-1, 1),
        low = "#377eb8", mid = "white", high = "#e41a1c") +
    theme_bw()

## -----------------------------------------------------------------------------
simdata::plot_cor_network(sim_design, seed = 1)

## -----------------------------------------------------------------------------
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)

## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
numeric_vars = names(sim_design$types_final)[sim_design$types_final == "numeric"]
plotdf = melt(sim_data[, numeric_vars], measure.vars = numeric_vars)

p = ggplot(plotdf, aes(x = "", y = value)) +
    geom_violin() +
    geom_dotplot(binaxis = "y", stackdir = "center") + 
    facet_wrap(~ variable, scales = "free", nrow = 3) +
    theme_bw() + 
    ylab("Value") +
    theme(axis.title.x = element_blank())
print(p)

discrete_vars = names(sim_design$types_final)[sim_design$types_final != "numeric"]
plotdf = melt(sim_data[, discrete_vars], measure.vars = discrete_vars)

p = ggplot(plotdf, aes(x = variable, fill = value, color = value)) +
    geom_bar(alpha = 0.5, size = 1) +
    scale_fill_brewer(palette = "Set1") +
    scale_color_brewer(palette = "Set1") +
    ylab("Proportion of total samplesize (%)") +
    theme_bw() +
    theme(axis.title.x = element_blank())
print(p)

## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
plotdf = cor(sim_data, method = "s", use = "p")
plotdf = melt(plotdf) 

ggplot(plotdf, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() + 
    geom_text(aes(label = round(value, 2)), size = 2) +
    scale_fill_gradient2(
        "Correlation",
        limits = c(-1, 1),
        low = "#377eb8", mid = "white", high = "#e41a1c") +
    coord_equal() +
    scale_x_discrete(position = "top") +
        scale_y_discrete(limits = rev) +
    xlab(expr(paste("Final variable ", v[i]))) + 
    ylab(expr(paste("Final variable ", v[i]))) +
    theme_bw() + 
    theme(text = element_text(size = 10))

## ----message=FALSE, warning=FALSE---------------------------------------------
# draw full network
simdata::plot_estimated_cor_network(sim_design, 
                                    cor_cutoff = NULL, 
                                    edge_label_function = NULL,
                                    edge_width_function = function(x) x*25,
                                    use_edge_weights = TRUE, 
                                    edge.color = "clipped-ramp",
                                    seed = 2321673)

## ----message=FALSE, warning=FALSE---------------------------------------------
# simplify by using cor_cutoff
simdata::plot_estimated_cor_network(sim_design, 
                                    seed = 2)

# set correlation type
simdata::plot_estimated_cor_network(sim_design, 
                                    cor_type = "spearman", 
                                    seed = 2321673)

# set various parameters
simdata::plot_estimated_cor_network(sim_design, seed = 2321673, 
                                    edge.color = "red-blue", 
                                    axes = TRUE, cor_type = "s", 
                                    edge_width_function = function(x) var(x)*200,
                                    show_categorical = FALSE, 
                                    mar = c(2, 2, 0, 0))

## -----------------------------------------------------------------------------
sim_design$process_final = list(
    process_truncate_by_iqr = list(
        truncate_multipliers = c(v6 = 2, v7 = 2, v8 = 2)
    )
)

sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)

## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
numeric_vars = names(sim_design$types_final)[sim_design$types_final == "numeric"]
plotdf = melt(sim_data[, numeric_vars])

p = ggplot(plotdf, aes(x = "", y = value)) +
    geom_violin() +
    geom_dotplot(binaxis = "y", stackdir = "center") + 
    facet_wrap(~ variable, scales = "free", nrow = 3) +
    theme_bw() + 
    ylab("Value") +
    theme(axis.title.x = element_blank())
print(p)

## -----------------------------------------------------------------------------
sim_design$process_final = list(
    process_truncate_by_threshold = list(
        truncate_upper = c(v8 = 200, v7 = 300),
        truncate_lower = c(v6 = 2)
    )
)

sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)

## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
numeric_vars = names(sim_design$types_final)[sim_design$types_final == "numeric"]
plotdf = melt(sim_data[, numeric_vars])

ggplot(plotdf, aes(x = "", y = value)) +
    geom_violin() +
    geom_dotplot(binaxis = "y", stackdir = "center") + 
    facet_wrap(~ variable, scales = "free", nrow = 3) +
    theme_bw() + 
    ylab("Value") +
    theme(axis.title.x = element_blank())

## -----------------------------------------------------------------------------
sim_design$process_final = list(
    process_truncate_by_iqr = list(
        truncate_multipliers = c(v6 = 2, v7 = 2, v8 = 2)
    ), 
    scale = list(), 
    data.frame = list()
)

sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 25897165)

## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
numeric_vars = names(sim_design$types_final)[sim_design$types_final == "numeric"]
plotdf = melt(sim_data[, numeric_vars])

ggplot(plotdf, aes(x = "", y = value)) +
    geom_violin() +
    geom_dotplot(binaxis = "y", stackdir = "center") + 
    facet_wrap(~ variable, scales = "free", nrow = 3) +
    theme_bw() + 
    ylab("Value") +
    theme(axis.title.x = element_blank())

## -----------------------------------------------------------------------------
correlation_matrix = diag(1, nrow = 2)
transformation = simdata::function_list("v1" = function(x) x[, 1],
                                        "v1*2" = function(x) x[, 1] * 2, 
                                        check.names = FALSE)

sim_design = simdata::simdesign_mvtnorm(
  relations = correlation_matrix,        
  transform_initial = transformation, 
  prefix_final = NULL
)

# ignoring the collinearity
sim_data = simdata::simulate_data(sim_design, n_obs = 100, seed = 2)
knitr::kable(cor(sim_data))

# rejecting collinear matrices
sim_data = simdata::simulate_data_conditional(sim_design,
                                              n_obs = 100, seed = 2,
                                              reject = is_collinear,
                                              reject_max_iter = 3,
                                              return_tries = TRUE)

sim_data

## -----------------------------------------------------------------------------
correlation_matrix = diag(1, nrow = 3)
transformation = simdata::function_list(
  "v1" = function(x) x[, 1],
  "might_be_collinear" = function(x) {
    if (rbinom(1, 1, 0.5)) {
      return(x[, 1] * 2)
    } else return(x[, 2])
  }, 
  "might_be_constant" = function(x) {
    if (rbinom(1, 1, 0.5)) {
      return(0)
    } else return(x[, 3])
  },
  check.names = FALSE)

sim_design = simdata::simdesign_mvtnorm(
  relations = correlation_matrix,        
  transform_initial = transformation,
  prefix_final = NULL
)

sim_data = simdata::simulate_data_conditional(
  sim_design,
  n_obs = 100, seed = 3,
  reject = simdata::function_list(is_collinear, 
                                  contains_constant),
  reject_max_iter = 3, 
  return_tries = TRUE)

sprintf("Number of tries: %d", sim_data[[2]])
knitr::kable(head(sim_data[[1]]))

## -----------------------------------------------------------------------------
binomial_simdesign <- function(size = 1, prob = 0.5, ...) {
  
  # define generator function
  # make sure it returns a two-dimensional array
  generator = function(n) matrix(rbinom(n, size = size, prob = prob), ncol = 1)
  
  # setup simdesign object
  # make sure to pass generator function and ...
  # all other information passed is optional
  dsgn = simdata::simdesign(
    generator = generator, 
    size = size, 
    prob = prob,
    ...
  )
  
  # extend the class attribute 
  class(dsgn) = c("binomial_simdesign", class(dsgn))
  
  # return the object
  dsgn
}

## -----------------------------------------------------------------------------
sim_design = binomial_simdesign(size = 1, prob = 0.7)
sim_data = simdata::simulate_data(sim_design, 100, seed = 1)
knitr::kable(table(sim_data))

## -----------------------------------------------------------------------------
realdata_simdesign <- function(dataset, ...) {
  
  # define generator function
  # make sure it returns a two-dimensional array
  generator = function(n) dataset[sample(1:nrow(dataset), n, replace = TRUE), ,
                                  drop = FALSE]
  
  # setup simdesign object
  # make sure to pass generator function and ...
  # all other information passed is optional
  dsgn = simdata::simdesign(
    generator = generator, 
    dataset = dataset,
    ...
  )
  
  # extend the class attribute 
  class(dsgn) = c("realdata_simdesign", class(dsgn))
  
  # return the object
  dsgn
}

## -----------------------------------------------------------------------------
data(iris)
sim_design = realdata_simdesign(iris, prefix_final = NULL)
sim_data = simdata::simulate_data(sim_design, 100, seed = 1)
knitr::kable(head(sim_data))

## -----------------------------------------------------------------------------
correlation_matrix = simdata::cor_from_upper(
  100,
  entries = rbind(
    expand.grid(1:30, 1:30, 0.5), 
    expand.grid(31:50, 31:50, 0.2))
)

# create list of transformation functions programmatically
# For the first 60 variables: 
# odd varibles will be translated
# even variables will be scaled
transformation = list()
for (i in 1:60) {
  if (i %% 2) {
    transformation[[i]] = substitute(function(x) x[, i] * 5, list(i = i))
  } else transformation[[i]] = substitute(function(x) x[, i] - 10, list(i = i))
}
# the remaining are returned as they are
transformation[[61]] = function(x) x[, 61:100]
# construct single transformation function from the list
transformation = simdata::as_function_list(transformation)

# create simulation design
sim_design = simdata::simdesign_mvtnorm(
  relations = correlation_matrix, 
  transform_initial = transformation
)

## ----echo=FALSE---------------------------------------------------------------
plotdf = melt(sim_design$cor_initial)

ggplot(plotdf, aes(x = factor(Var1), y = factor(Var2), fill = value)) + 
    geom_tile() + 
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev) + 
    xlab(expr(paste("Initial variable ", z[i]))) + 
    ylab(expr(paste("Initial variable ", z[i]))) +
    coord_equal() +
    scale_fill_gradient2(
        "Correlation",
        limits = c(-1, 1),
        low = "#377eb8", mid = "white", high = "#e41a1c") +
    theme_bw() + 
    theme(axis.text = element_blank())

## -----------------------------------------------------------------------------
simdata::plot_cor_network(sim_design, seed = 2,  
                          vertex_labels = NA,
                          edge_label_function = NULL,
                          edge_width_function = function(x) 0.01,
                          edge_weight_function = function(x) 0.25 * x, 
                          use_edge_weights = TRUE, 
                          edge.color = "clipped-ramp", 
                          vertex.size = 3)

## -----------------------------------------------------------------------------
sim_data = simdata::simulate_data(sim_design, n_obs = 50, seed = 5)

## ----echo=FALSE, message=FALSE, warning=FALSE---------------------------------
ggplot(melt(sim_data), aes(x = value, color = variable)) + 
    geom_density() + 
    scale_color_viridis_d(option = "plasma") +
    guides(color = "none") +
    xlab("Value") + ylab("Density") +
    theme_bw()

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  correlation_matrix = diag(1, nrow = 5)
#  transformation = simdata::function_list(
#      "v1" = function(x) x[, 1],
#      "v2*2" = function(x) x[, 2] * 2,
#      "v3^2" = function(x) x[, 3]^2,
#      "v4+v5" = function(x) x[, 4] + x[, 5],
#      check.names = FALSE # pass columnnames exactly as specified here
#    )
#  
#  sim_design = simdata::simdesign_mvtnorm(
#    relations = correlation_matrix,
#    transform_initial = transformation,
#    prefix_final = NULL
#  )
#  
#  # parallelisation
#  cl = parallel::makeCluster(1)
#  doParallel::registerDoParallel(cl)
#  res = foreach(
#    i = 1:10,
#    .packages = c("simdata"),
#    .options.RNG = 1 # note that the seed is passed here
#  ) %dorng% {
#    simulate_data(sim_design, n_obs = 100)
#  
#    # do some task with the simulated data
#  }
#  parallel::stopCluster(cl)
#  
#  knitr::kable(purrr::map(res[1:2], summary))

## -----------------------------------------------------------------------------
# parameter
mult = 2
transformation = simdata::function_list(
  "v1" = function(x) x[, 1] * mult # dangerous, depends on global variable!
)

sim_design = simdata::simdesign_mvtnorm(
        relations = diag(1), 
        transform_initial = transformation
)

sample1 = simdata::simulate_data(sim_design, n_obs = 5, seed = 25897165)

# change value of global variable
mult = 4
sample2 = simdata::simulate_data(sim_design, n_obs = 5, seed = 25897165)

# note the different values in both columns
knitr::kable(cbind(sample1, sample2))

## -----------------------------------------------------------------------------
# parameter
mult = 2
transformation = simdata::function_list(
    # specify function as partial 
    "v1" = simdata::partial(function(x, mult) x[, 1] * mult, mult = mult)
)

sim_design = simdata::simdesign_mvtnorm(
        relations = diag(1), 
        transform_initial = transformation
)

sample1 = simdata::simulate_data(sim_design, n_obs = 5, seed = 25897165)

# change value of global variable
mult = 4
sample2 = simdata::simulate_data(sim_design, n_obs = 5, seed = 25897165)

# note both columns are equal now, as expected
knitr::kable(cbind(sample1, sample2))

## ----echo=FALSE---------------------------------------------------------------
sessionInfo()