## ---- echo = FALSE------------------------------------------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(BTYDplus)
data("groceryElog")

## ---- fig.show="hold", fig.width=7, fig.height=2.8, fig.cap="Timing patterns for sampled grocery customers"----
library(BTYDplus)
data("groceryElog")
set.seed(123)
# plot timing patterns of 30 sampled customers
plotTimingPatterns(groceryElog, n = 30, T.cal = "2007-05-15",
                   headers = c("Past", "Future"), title = "")

## ---- echo=FALSE, results="asis"----------------------------------------------
cdnowElog <- read.csv(system.file("data/cdnowElog.csv", package = "BTYD"),
                      stringsAsFactors = FALSE,
                      col.names = c("cust", "sampleid", "date", "cds", "sales"))
cdnowElog$date <- as.Date(as.character(cdnowElog$date), format = "%Y%m%d")
knitr::kable(head(cdnowElog[, c("cust", "date", "sales")], 6), caption = "Transaction Log Example")

## ---- fig.show="hold", fig.width=7, fig.height=2.5, fig.cap="Weekly trends for the grocery dataset"----
data("groceryElog")
op <- par(mfrow = c(1, 2), mar = c(2.5, 2.5, 2.5, 2.5))
# incremental
weekly_inc_total  <- elog2inc(groceryElog, by = 7, first = TRUE)
weekly_inc_repeat <- elog2inc(groceryElog, by = 7, first = FALSE)
plot(weekly_inc_total, typ = "l", frame = FALSE, main = "Incremental")
lines(weekly_inc_repeat, col = "red")
# cumulative
weekly_cum_total  <- elog2cum(groceryElog, by = 7, first = TRUE)
weekly_cum_repeat <- elog2cum(groceryElog, by = 7, first = FALSE)
plot(weekly_cum_total, typ = "l", frame = FALSE, main = "Cumulative")
lines(weekly_cum_repeat, col = "red")
par(op)

## -----------------------------------------------------------------------------
data("groceryElog")
head(elog2cbs(groceryElog), 5)

## -----------------------------------------------------------------------------
data("groceryElog")
range(groceryElog$date)
groceryCBS <- elog2cbs(groceryElog, T.cal = "2006-12-31")
head(groceryCBS, 5)

## ---- fig.show="hold", fig.width=7, fig.height=2.2, fig.cap="Diagnostic plots for estimating regularity"----
data("groceryElog")
op <- par(mfrow = c(1, 2))
(k.wheat <- estimateRegularity(groceryElog, method = "wheat", 
                              plot = TRUE, title = "Wheat & Morrison"))
(k.mle <- estimateRegularity(groceryElog, method = "mle", 
                             plot = TRUE, title = "Maximum Likelihood"))
par(op)

## -----------------------------------------------------------------------------
# load grocery dataset, if it hasn't been done before
if (!exists("groceryCBS")) {
  data("groceryElog")
  groceryCBS <- elog2cbs(groceryElog, T.cal = "2006-12-31")
}
# estimate NBD parameters
round(params.nbd <- nbd.EstimateParameters(groceryCBS), 3)
# report log-likelihood
nbd.cbs.LL(params.nbd, groceryCBS)

## -----------------------------------------------------------------------------
# calculate expected future transactions for customers who've 
# had 1 to 5 transactions in first 52 weeks
est5.nbd <- nbd.ConditionalExpectedTransactions(params.nbd, 
              T.star = 52, x = 1:5, T.cal = 52)
for (i in 1:5) {
  cat("x =", i, ":", sprintf("%5.3f", est5.nbd[i]), "\n")
}

## -----------------------------------------------------------------------------
# predict whole customer cohort
groceryCBS$xstar.nbd <- nbd.ConditionalExpectedTransactions(
                    params = params.nbd, T.star = 52, 
                    x = groceryCBS$x, T.cal = groceryCBS$T.cal)
# compare predictions with actuals at aggregated level
rbind(`Actuals` = c(`Holdout` = sum(groceryCBS$x.star)), 
      `NBD`     = c(`Holdout` = round(sum(groceryCBS$xstar.nbd))))

## -----------------------------------------------------------------------------
# load grocery dataset, if it hasn't been done before
if (!exists("groceryCBS")) {
  data("groceryElog")
  groceryCBS <- elog2cbs(groceryElog, T.cal = "2006-12-31")
}
# estimate Pareto/NBD parameters
params.pnbd <- BTYD::pnbd.EstimateParameters(groceryCBS[, c("x", "t.x", "T.cal")])
names(params.pnbd) <- c("r", "alpha", "s", "beta")
round(params.pnbd, 3)
# report log-likelihood
BTYD::pnbd.cbs.LL(params.pnbd, groceryCBS[, c("x", "t.x", "T.cal")])

## -----------------------------------------------------------------------------
# calculate expected future transactions for customers who've 
# had 1 to 5 transactions in first 12 weeks, but then remained
# inactive for 40 weeks
est5.pnbd <- BTYD::pnbd.ConditionalExpectedTransactions(params.pnbd, 
               T.star = 52, x = 1:5, t.x = 12, T.cal = 52)
for (i in 1:5) {
  cat("x =", i, ":", sprintf("%5.3f", est5.pnbd[i]), "\n")
}

## -----------------------------------------------------------------------------
# predict whole customer cohort
groceryCBS$xstar.pnbd <- BTYD::pnbd.ConditionalExpectedTransactions(
                     params = params.pnbd, T.star = 52, 
                     x = groceryCBS$x, t.x = groceryCBS$t.x,
                     T.cal = groceryCBS$T.cal)
# compare predictions with actuals at aggregated level
rbind(`Actuals`    = c(`Holdout` = sum(groceryCBS$x.star)), 
      `Pareto/NBD` = c(`Holdout` = round(sum(groceryCBS$xstar.pnbd))))

## -----------------------------------------------------------------------------
# P(alive) for customers who've had 1 to 5 transactions in first 
# 12 weeks, but then remained inactive for 40 weeks
palive.pnbd <- BTYD::pnbd.PAlive(params.pnbd, 
                 x = 1:5, t.x = 12, T.cal = 52)
for (i in 1:5) {
  cat("x =", i, ":", sprintf("%5.2f %%", 100*palive.pnbd[i]), "\n")
}

## -----------------------------------------------------------------------------
# load grocery dataset, if it hasn't been done before
if (!exists("groceryCBS")) {
  data("groceryElog")
  groceryCBS <- elog2cbs(groceryElog, T.cal = "2006-12-31")
}
# estimate parameters for various models
params.bgnbd   <- BTYD::bgnbd.EstimateParameters(groceryCBS) # BG/NBD
params.bgcnbd  <- bgcnbd.EstimateParameters(groceryCBS)      # BG/CNBD-k
params.mbgnbd  <- mbgnbd.EstimateParameters(groceryCBS)      # MBG/NBD
params.mbgcnbd <- mbgcnbd.EstimateParameters(groceryCBS)     # MBG/CNBD-k
row <- function(params, LL) { 
  names(params) <- c("k", "r", "alpha", "a", "b")
  c(round(params, 3), LL = round(LL))
}
rbind(`BG/NBD`     = row(c(1, params.bgnbd), 
                         BTYD::bgnbd.cbs.LL(params.bgnbd, groceryCBS)),
      `BG/CNBD-k`  = row(params.bgcnbd, 
                         bgcnbd.cbs.LL(params.bgcnbd, groceryCBS)),
      `MBG/NBD`    = row(params.mbgnbd, 
                         mbgcnbd.cbs.LL(params.mbgnbd, groceryCBS)),
      `MBG/CNBD-k` = row(params.mbgcnbd, 
                         mbgcnbd.cbs.LL(params.mbgcnbd, groceryCBS)))

## -----------------------------------------------------------------------------
# calculate expected future transactions for customers who've 
# had 1 to 5 transactions in first 12 weeks, but then remained
# inactive for 40 weeks
est5.mbgcnbd <- mbgcnbd.ConditionalExpectedTransactions(params.mbgcnbd,
                  T.star = 52, x = 1:5, t.x = 12, T.cal = 52)
for (i in 1:5) {
  cat("x =", i, ":", sprintf("%5.3f", est5.mbgcnbd[i]), "\n")
}

## -----------------------------------------------------------------------------
# P(alive) for customers who've had 1 to 5 transactions in first 
# 12 weeks, but then remained inactive for 40 weeks
palive.mbgcnbd <- mbgcnbd.PAlive(params.mbgcnbd, 
                    x = 1:5, t.x = 12, T.cal = 52)
for (i in 1:5) {
  cat("x =", i, ":", sprintf("%5.2f %%", 100*palive.mbgcnbd[i]), "\n")
}

## -----------------------------------------------------------------------------
# predict whole customer cohort
groceryCBS$xstar.mbgcnbd <- mbgcnbd.ConditionalExpectedTransactions(
                        params = params.mbgcnbd, T.star = 52,
                        x = groceryCBS$x, t.x = groceryCBS$t.x, 
                        T.cal = groceryCBS$T.cal)
# compare predictions with actuals at aggregated level
rbind(`Actuals`    = c(`Holdout` = sum(groceryCBS$x.star)),
      `MBG/CNBD-k` = c(`Holdout` = round(sum(groceryCBS$xstar.mbgcnbd))))

## ---- fig.show="hold", fig.width=4, fig.height=3.5, fig.cap="Weekly actuals vs. MBG/CNBD-k predictions"----
# runs for ~37secs on a 2015 MacBook Pro
nil <- mbgcnbd.PlotTrackingInc(params.mbgcnbd,
         T.cal = groceryCBS$T.cal,
         T.tot = max(groceryCBS$T.cal + groceryCBS$T.star),
         actual.inc.tracking = elog2inc(groceryElog))

## -----------------------------------------------------------------------------
# mean absolute error (MAE)
mae <- function(act, est) {
  stopifnot(length(act)==length(est))
  sum(abs(act-est)) / length(act)
}
mae.nbd <- mae(groceryCBS$x.star, groceryCBS$xstar.nbd)
mae.pnbd <- mae(groceryCBS$x.star, groceryCBS$xstar.pnbd)
mae.mbgcnbd <- mae(groceryCBS$x.star, groceryCBS$xstar.mbgcnbd)
rbind(`NBD`        = c(`MAE` = round(mae.nbd, 3)),
      `Pareto/NBD` = c(`MAE` = round(mae.pnbd, 3)),
      `MBG/CNBD-k` = c(`MAE` = round(mae.mbgcnbd, 3)))
lift <- 1 - mae.mbgcnbd / mae.pnbd
cat("Lift in MAE for MBG/CNBD-k vs. Pareto/NBD:", round(100*lift, 1), "%")

## -----------------------------------------------------------------------------
# load grocery dataset, if it hasn't been done before
if (!exists("groceryCBS")) {
  data("groceryElog")
  groceryCBS <- elog2cbs(groceryElog, T.cal = "2006-12-31")
}
# generate parameter draws (~13secs on 2015 MacBook Pro)
pnbd.draws <- pnbd.mcmc.DrawParameters(groceryCBS)
# generate draws for holdout period
pnbd.xstar.draws <- mcmc.DrawFutureTransactions(groceryCBS, pnbd.draws)
# conditional expectations
groceryCBS$xstar.pnbd.hb   <- apply(pnbd.xstar.draws, 2, mean)
# P(active)
groceryCBS$pactive.pnbd.hb <- mcmc.PActive(pnbd.xstar.draws)
# P(alive)
groceryCBS$palive.pnbd.hb  <- mcmc.PAlive(pnbd.draws)
# show estimates for first few customers
head(groceryCBS[, c("x", "t.x", "x.star",
                    "xstar.pnbd.hb", "pactive.pnbd.hb",
                    "palive.pnbd.hb")])

## -----------------------------------------------------------------------------
class(pnbd.draws$level_2)

# convert cohort-level draws from coda::mcmc.list to a matrix, with 
# each parameter becoming a column, and each draw a row
cohort.draws <- pnbd.draws$level_2
head(as.matrix(cohort.draws), 5)

# compute median across draws, and compare to ML estimates; as can be
# seen, the two parameter estimation approaches result in very similar
# estimates
round(
  rbind(`Pareto/NBD (HB)` = apply(as.matrix(cohort.draws), 2, median),
        `Pareto/NBD`      = BTYD::pnbd.EstimateParameters(groceryCBS[, c("x", "t.x", "T.cal")]))
, 2)

## ---- fig.show="hold", warning=FALSE, fig.width=7, fig.height=3, fig.cap="MCMC traces and parameter distributions of cohort-level parameters"----
# plot trace- and density-plots for heterogeneity parameters
op <- par(mfrow = c(2, 4), mar = c(2.5, 2.5, 2.5, 2.5))
coda::traceplot(pnbd.draws$level_2)
coda::densplot(pnbd.draws$level_2)
par(op)

## ---- fig.show="hold", warning=FALSE, fig.width=7, fig.height=3, fig.cap="MCMC traces and parameter distributions of individual-level parameters for a specific customer"----
class(pnbd.draws$level_1)
length(pnbd.draws$level_1)

customer4 <- "4"
customer4.draws <- pnbd.draws$level_1[[customer4]]
head(as.matrix(customer4.draws), 5)
round(apply(as.matrix(customer4.draws), 2, median), 3)

# plot trace- and density-plots for customer4 parameters
op <- par(mfrow = c(2, 4), mar = c(2.5, 2.5, 2.5, 2.5))
coda::traceplot(pnbd.draws$level_1[[customer4]])
coda::densplot(pnbd.draws$level_1[[customer4]])
par(op)

## ---- eval = FALSE------------------------------------------------------------
#  # runs for ~120secs on a MacBook Pro 2015
#  op <- par(mfrow = c(1, 2))
#  nil <- mcmc.PlotFrequencyInCalibration(pnbd.draws, groceryCBS)
#  nil <- mcmc.PlotTrackingInc(pnbd.draws,
#           T.cal = groceryCBS$T.cal,
#           T.tot = max(groceryCBS$T.cal + groceryCBS$T.star),
#           actual.inc.tracking.data = elog2inc(groceryElog))
#  par(op)

## ---- eval = FALSE------------------------------------------------------------
#  # load CDNow event log from BTYD package
#  cdnowElog <- read.csv(
#                 system.file("data/cdnowElog.csv", package = "BTYD"),
#                 stringsAsFactors = FALSE,
#                 col.names = c("cust", "sampleid", "date", "cds", "sales"))
#  cdnowElog$date <- as.Date(as.character(cdnowElog$date),
#                            format = "%Y%m%d")
#  # convert to CBS; split into 39 weeks calibration, and 39 weeks holdout
#  cdnowCbs <- elog2cbs(cdnowElog,
#                       T.cal = "1997-09-30", T.tot = "1998-06-30")
#  
#  # estimate Pareto/NBD (Abe) without covariates; model M1 in Abe (2009)
#  draws.m1 <- abe.mcmc.DrawParameters(cdnowCbs,
#                mcmc = 7500, burnin = 2500) # ~33secs on 2015 MacBook Pro
#  quant <- function(x) round(quantile(x, c(0.025, 0.5, 0.975)), 2)
#  t(apply(as.matrix(draws.m1$level_2), 2, quant))
#  #>                        2.5%   50% 97.5%
#  #> log_lambda            -3.70 -3.54 -3.32
#  #> log_mu                -3.96 -3.59 -3.26
#  #> var_log_lambda         1.10  1.34  1.65
#  #> cov_log_lambda_log_mu -0.20  0.13  0.74
#  #> var_log_mu             1.44  2.62  5.05
#  
#  #' append dollar amount of first purchase to use as covariate
#  first <- aggregate(sales ~ cust, cdnowElog, function(x) x[1] * 10^-3)
#  names(first) <- c("cust", "first.sales")
#  cdnowCbs <- merge(cdnowCbs, first, by = "cust")
#  
#  #' estimate with first purchase spend as covariate; model M2 in Abe (2009)
#  draws.m2 <- abe.mcmc.DrawParameters(cdnowCbs,
#                covariates = c("first.sales"),
#                mcmc = 7500, burnin = 2500) # ~33secs on 2015 MacBook Pro
#  t(apply(as.matrix(draws.m2$level_2), 2, quant))
#  #>                         2.5%   50% 97.5%
#  #> log_lambda_intercept   -4.02 -3.77 -3.19
#  #> log_mu_intercept       -4.37 -3.73 -2.69
#  #> log_lambda_first.sales  0.04  6.04  9.39
#  #> log_mu_first.sales     -9.02  1.73  7.90
#  #> var_log_lambda          0.01  1.35  1.79
#  #> cov_log_lambda_log_mu  -0.35  0.22  0.76
#  #> var_log_mu              0.55  2.59  4.97

## ---- eval=FALSE--------------------------------------------------------------
#  # load grocery dataset, if it hasn't been done before
#  if (!exists("groceryCBS")) {
#    data("groceryElog")
#    groceryCBS <- elog2cbs(groceryElog, T.cal = "2006-12-31")
#  }
#  # estimte Pareto/GGG
#  pggg.draws <- pggg.mcmc.DrawParameters(groceryCBS) # ~2mins on 2015 MacBook Pro
#  # generate draws for holdout period
#  pggg.xstar.draws <- mcmc.DrawFutureTransactions(groceryCBS, pggg.draws)
#  # conditional expectations
#  groceryCBS$xstar.pggg  <- apply(pggg.xstar.draws, 2, mean)
#  # P(active)
#  groceryCBS$pactive.pggg <- mcmc.PActive(pggg.xstar.draws)
#  # P(alive)
#  groceryCBS$palive.pggg  <- mcmc.PAlive(pggg.draws)
#  # show estimates for first few customers
#  head(groceryCBS[, c("x", "t.x", "x.star",
#                      "xstar.pggg", "pactive.pggg", "palive.pggg")])
#  #>    x      t.x x.star xstar.pggg pactive.pggg palive.pggg
#  #> 1  0  0.00000      0       0.02         0.02        0.03
#  #> 2  1 50.28571      0       1.01         0.59        1.00
#  #> 3 19 48.57143     14      14.76         0.87        0.87
#  #> 4  0  0.00000      0       0.04         0.03        0.13
#  #> 5  2 40.42857      3       2.02         0.84        0.91
#  #> 6  5 47.57143      6       4.46         0.92        0.95
#  
#  # report median cohort-level parameter estimates
#  round(apply(as.matrix(pggg.draws$level_2), 2, median), 3)
#  #>     t gamma     r alpha     s  beta
#  #> 1.695 0.373 0.948 5.243 0.432 4.348
#  # report mean over median individual-level parameter estimates
#  median.est <- sapply(pggg.draws$level_1, function(draw) {
#    apply(as.matrix(draw), 2, median)
#  })
#  round(apply(median.est, 1, mean), 3)
#  #>    k lambda     mu    tau      z
#  #> 3.892  0.160  0.065 69.546  0.316

## ---- eval = FALSE------------------------------------------------------------
#  # compare predictions with actuals at aggregated level
#  rbind(`Actuals`    = c(`Holdout` = sum(groceryCBS$x.star)),
#        `Pareto/GGG` = round(sum(groceryCBS$xstar.pggg)),
#        `MBG/CNBD-k` = round(sum(groceryCBS$xstar.mbgcnbd)),
#        `Pareto/NBD (HB)` = round(sum(groceryCBS$xstar.pnbd.hb)))
#  #>                 Holdout
#  #> Actuals            3389
#  #> Pareto/GGG         3815
#  #> MBG/CNBD-k         3970
#  #> Pareto/NBD (HB)    4018
#  
#  # error on customer level
#  mae <- function(act, est) {
#    stopifnot(length(act)==length(est))
#    sum(abs(act-est)) / length(act)
#  }
#  mae.pggg <- mae(groceryCBS$x.star, groceryCBS$xstar.pggg)
#  mae.mbgcnbd <- mae(groceryCBS$x.star, groceryCBS$xstar.mbgcnbd)
#  mae.pnbd.hb <- mae(groceryCBS$x.star, groceryCBS$xstar.pnbd.hb)
#  rbind(`Pareto/GGG`      = c(`MAE` = round(mae.pggg, 3)),
#        `MBG/CNBD-k`      = c(`MAE` = round(mae.mbgcnbd, 3)),
#        `Pareto/NBD (HB)` = c(`MAE` = round(mae.pnbd.hb, 3)))
#  #>                   MAE
#  #> Pareto/GGG      0.621
#  #> MBG/CNBD-k      0.644
#  #> Pareto/NBD (HB) 0.688
#  
#  lift <- 1 - mae.pggg / mae.pnbd.hb
#  cat("Lift in MAE:", round(100*lift, 1), "%")
#  #> Lift in MAE for Pareto/GGG vs. Pareto/NBD: 9.8%