## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
 )


## ----setup--------------------------------------------------------------------
library(ADLP)
if (!require(SynthETIC)) install.packages("SynthETIC")
if (!require(tidyverse)) install.packages("tidyverse")
library(tidyverse)

# Import dummy claims dataset
claims_dataset <- claims_dataset <- ADLP::test_claims_dataset

head(claims_dataset)

## -----------------------------------------------------------------------------

# Included train/validation splitting function
train_val <- ADLP::train_val_split_method1(
    df = claims_dataset,
    tri.size = 40,
    val_ratio = 0.3,
    test = TRUE
)

train_data <- train_val$train
valid_data <- train_val$valid
insample_data <- rbind(train_data, valid_data)
test_data <- train_val$test

print("Number of observations in each row")
print(nrow(train_data))
print(nrow(valid_data))
print(nrow(test_data))

print("Approximation for val_ratio")
print(nrow(valid_data)/(nrow(insample_data)))


## -----------------------------------------------------------------------------
base_model1 <- glm(formula = claims~factor(dev),
                   family=gaussian(link = "identity"), data=train_data)

tau <- 1e-8
base_model2 <- glm(formula = (claims+tau)~calendar,
                   family=Gamma(link="inverse"), data=train_data)

base_model3 <- glm(formula = claims~factor(origin),
                   family=gaussian(link = "identity"), data=train_data)

base_model1_full <- update(base_model1, data = insample_data)
base_model2_full <- update(base_model2, data = insample_data)
base_model3_full <- update(base_model3, data = insample_data)

## -----------------------------------------------------------------------------
################################################################################
# Normal distribution densities and functions
dens_normal <- function(y, model, newdata){
    pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
    mu <- pred_model$fit
    sigma <- pred_model$residual.scale
    return(dnorm(x=y, mean=mu, sd=sigma))
}

cdf_normal<-function(y, model, newdata){
    pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
    mu <- pred_model$fit
    sigma <- pred_model$residual.scale
    return(pnorm(q=y, mean=mu, sd=sigma))
}

mu_normal<-function(model, newdata){
    mu <- predict(model, newdata=newdata, type="response")
    mu <- pmax(mu, 0)
    return(mu)
}

sim_normal<-function(model, newdata){
    pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
    mu <- pred_model$fit
    sigma <- pred_model$residual.scale
    
    sim <- rnorm(length(mu), mean=mu, sd=sigma)
    sim <- pmax(sim, 0)
    return(sim)
}

################################################################################
# Gamma distribution densities and functions
dens_gamma <- function(y, model, newdata){
    pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
    mu <- pred_model$fit
    sigma <- pred_model$residual.scale
    return(dgamma(x=y, shape = 1/sigma, scale=(mu*sigma)^2))
}

cdf_gamma <- function(y, model, newdata){
    pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
    mu <- pred_model$fit
    sigma <- pred_model$residual.scale
    return(pgamma(q=y, shape = 1/sigma, scale=(mu*sigma)^2))
}

mu_gamma <- function(model, newdata, tau){
    mu <- predict(model, newdata=newdata, type="response")
    mu <- pmax(mu - tau, 0)
    return(mu)
}

sim_gamma <- function(model, newdata, tau){
    pred_model <- predict(model, newdata=newdata, type="response", se.fit=TRUE)
    mu <- pred_model$fit
    sigma <- pred_model$residual.scale
    
    sim <- rgamma(length(mu), shape = 1/sigma, scale=(mu*sigma)^2)
    sim <- pmax(sim - tau, 0)
    return(sim)
}


## -----------------------------------------------------------------------------
# Constructing ADLP component classes

base_component1 = adlp_component(
    model_train = base_model1, 
    model_full = base_model1_full, 
    calc_dens = dens_normal,
    calc_cdf = cdf_normal,
    calc_mu = mu_normal,
    sim_fun = sim_normal
)

base_component2 = adlp_component(
    model_train = base_model2, 
    model_full = base_model2_full, 
    calc_dens = dens_gamma,
    calc_cdf = cdf_gamma,
    calc_mu = mu_gamma,
    sim_fun = sim_gamma,
    tau = tau
)

base_component3 = adlp_component(
    model_train = base_model3, 
    model_full = base_model3_full, 
    calc_dens = dens_normal,
    calc_cdf = cdf_normal,
    calc_mu = mu_normal,
    sim_fun = sim_normal
)

components <- adlp_components(
    base1 = base_component1,
    base2 = base_component2,
    base3 = base_component3
)


## ---- error = T---------------------------------------------------------------
# Fitting ADLP class
fit1 <- adlp(
    components_lst = components,
    newdata = valid_data,
    partition_func = ADLP::adlp_partition_none
)

fit2 <- adlp(
    components_lst = components,
    newdata = valid_data,
    partition_func = ADLP::adlp_partition_ap,
    tri.size = 40,
    size = 3,
    weights = c(3, 1, 2)
)


fit1
fit2

## -----------------------------------------------------------------------------
# Log Score
ensemble_logS <- adlp_logS(fit1, test_data, model = "full")

# Cumulative Ranked Probability Score
ensemble_CRPS <- adlp_CRPS(fit1, test_data, response_name = "claims", model = "full", sample_n = 1000)

# Predictions
ensemble_means <- predict(fit1, test_data)

# Simulations
ensemble_simulate <- adlp_simulate(100, fit1, test_data)

boxplot(ensemble_logS$dens_val, main = "Log Score")
boxplot(ensemble_CRPS$ensemble_crps, main = "Cumulative Ranked Probability Score")
boxplot(ensemble_means$ensemble_mu, main = "Predictions")
boxplot(ensemble_simulate$simulation, main = "Simulations")

## -----------------------------------------------------------------------------
# Defining own partition function
user_defined_partition <- function(df) {
    return (list(
        subset1 = df[(as.numeric(as.character(df$origin)) >= 1) & (as.numeric(as.character(df$origin)) <= 15), ],
        subset2 = df[(as.numeric(as.character(df$origin)) >= 16) & (as.numeric(as.character(df$origin)) <= 40), ]
    ))
}

# Fitting ADLP class
fit3 <- adlp(
    components_lst = components,
    newdata = valid_data,
    partition_func = user_defined_partition
)

fit3

## -----------------------------------------------------------------------------
show_mse <- function(data) {
    print("----------------------------")
    print("Base models 1, 2, 3")
    print(sum((predict(base_model1_full, data, type = "response") - data$claims)^2))
    print(sum((predict(base_model2_full, data, type = "response") - data$claims)^2))
    print(sum((predict(base_model3_full, data, type = "response") - data$claims)^2))
    
    print("ADLP ensembles 1, 2, 3")
    print(sum((predict(fit1, data)$ensemble_mu - data$claims)^2))
    print(sum((predict(fit2, data)$ensemble_mu - data$claims)^2))
    print(sum((predict(fit3, data)$ensemble_mu - data$claims)^2))
    print("----------------------------")
}

show_mse(train_data)
show_mse(valid_data)
show_mse(test_data)