%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{3. Comparing distributions with the poweRlaw package}
\documentclass[11pt,BCOR2mm,DIV14]{scrartcl}
\usepackage{booktabs}%For fancy tables
\usepackage[round]{natbib} % References
\usepackage{textcomp}
\usepackage{amsmath,amsfonts}
\usepackage{scrlayer-scrpage}
\usepackage[utf8]{inputenc} %unicode support
\usepackage{color,hyperref}
\urlstyle{rm}

\usepackage{xcolor}
\hypersetup{
    colorlinks,
    linkcolor={red!50!black},
    citecolor={blue!50!black},
    urlcolor={blue!80!black}
}

\pagestyle{scrheadings}
\setheadsepline{.4pt}
\ihead[]{Comparing distributions}
\chead[]{}
\ohead[]{}
\ifoot[]{}
\cfoot[]{}
\ofoot[\pagemark]{\pagemark}
\newcommand{\cc}{\texttt}
\newcommand{\xmin}{x_{\min}}

<<echo=FALSE,message=FALSE>>=
library(knitr)
library(poweRlaw)
options(replace.assign = FALSE, width = 50)

opts_chunk$set(fig.path = "knitr_figure_poweRlaw/graphicsc-",
               cache.path = "knitr_cache_poweRlaw_c/",
               fig.align = "center",
               dev = "pdf", fig.width = 5, fig.height = 5,
               fig.show = "hold", cache = FALSE, par = TRUE,
               out.width = "0.4\\textwidth")
knit_hooks$set(crop = hook_pdfcrop)

knit_hooks$set(par = function(before, options, envir) {
  if (before && options$fig.show != "none") {
    par(mar = c(3, 4, 2, 1), cex.lab = .95, cex.axis = .9,
        mgp = c(3, .7, 0), tcl = -.01, las = 1)
  }}, crop = hook_pdfcrop)

set.seed(1)
palette(c(rgb(170, 93, 152, maxColorValue = 255),
          rgb(103, 143, 57, maxColorValue = 255),
          rgb(196, 95, 46, maxColorValue = 255),
          rgb(79, 134, 165, maxColorValue = 255),
          rgb(205, 71, 103, maxColorValue = 255),
          rgb(203, 77, 202, maxColorValue = 255),
          rgb(115, 113, 206, maxColorValue = 255)))
@
%
\begin{document}
\date{Last updated: \today}
\title{The poweRlaw package: Comparing distributions}
\author{Colin S. Gillespie}

\maketitle
\begin{abstract}
  \noindent The \verb$poweRlaw$ package provides an easy to use interface for fitting and
  visualising heavy tailed distributions, including power-laws. This vignette
  provides examples of comparing competing distributions.
\end{abstract}

\section{Comparing distributions}

This short vignette aims to provide some guidance when comparing distributions
using Vuong's test statistic. The hypothesis being tested is
\[
H_0: \text{Both distributions are equally far from the true distribution}
\]
and
\[
H_1: \text{One of the test distributions is closer to the true distribution.}
\]
To perform this test we use the \cc{compare\_distributions()}
function\footnote{The \cc{compare\_distributions()} function also returns a one
  sided $p$-value. Essentially, the one sided $p$-value is testing whether the
  first model is better than the second, i.e. a \textbf{one} sided test.} and
examine the \cc{p\_two\_sided} value.

\section{Example: Simulated data 1}

First let's generate some data from a power-law distribution
<<>>=
library("poweRlaw")
set.seed(1)
x = rpldis(1000, xmin = 2, alpha = 3)
@
\noindent and fit a discrete power-law distribution
<<>>=
m1 = displ$new(x)
m1$setPars(estimate_pars(m1))
@
\noindent The estimated values of $\xmin$ and $\alpha$ are \Sexpr{m1$getXmin()} and \Sexpr{signif(m1$getPars(), 3)}, respectively. As an alternative distribution, we will fit a discrete Poisson distribution\footnote{When comparing distributions, each model must have the same $\xmin$ value. In this example, both models have $\xmin=\Sexpr{m1$getXmin()}$.}
<<>>=
m2 = dispois$new(x)
m2$setPars(estimate_pars(m2))
@
\noindent Plotting both models
<<F1,echo=1:3,fig.keep='none'>>=
plot(m2, ylab = "CDF")
lines(m1)
lines(m2, col = 2, lty = 2)
grid()
@

\begin{figure}[t]
\centering
<<F1,echo=FALSE>>=
@
\caption{Plot of the simulated data CDF, with power law and Poisson lines
of best fit.}\label{F1}
\end{figure}
\noindent suggests that the power-law model gives a better fit (figure \ref{F1}). Investigating this formally
<<>>=
comp = compare_distributions(m1, m2)
comp$p_two_sided
@
\noindent means we can reject $H_0$ since $p=\Sexpr{signif(comp$p_two_sided, 4)}$ and conclude that one model is closer to the true distribution.

\subsection*{One or two-sided $p$-value}

The two-sided $p$-value does not depend on the order of the model comparison

<<echo=TRUE>>=
compare_distributions(m1, m2)$p_two_sided
compare_distributions(m2, m1)$p_two_sided
@

\noindent However, the one-sided $p$-value is order dependent

<<echo=TRUE>>=
## We only care if m1 is better than m2
## m1 is clearly better
compare_distributions(m1, m2)$p_one_sided
## m2 isn't better than m1
compare_distributions(m2, m1)$p_one_sided
@


\section{Example: Moby Dick data set}

\noindent This time we will look at the Moby Dick data set
<<>>=
data("moby")
@
\noindent Again we fit a power law
<<>>=
m1 = displ$new(moby)
m1$setXmin(estimate_xmin(m1))
@
\noindent and a log-normal model\footnote{In order to compare distributions, $\xmin$ must be equal for both distributions.}
<<>>=
m2 = dislnorm$new(moby)
m2$setXmin(m1$getXmin())
m2$setPars(estimate_pars(m2))
@
\noindent  Plotting the CDFs
<<F2,echo=1:3, fig.keep='none'>>=
plot(m2, ylab = "CDF")
lines(m1)
lines(m2, col = 2, lty = 2)
grid()
@
\begin{figure}[t]
\centering
<<F2,echo=FALSE>>=
@
\caption{The Moby Dick data set with power law and log normal lines of best fit.}\label{F2}
\end{figure}
\noindent suggests that both models perform equally well (figure \ref{F2}). The
formal hypothesis test
<<>>=
comp = compare_distributions(m1, m2)
@
\noindent gives a $p\,$-value and test statistic of
<<>>=
comp$p_two_sided
comp$test_statistic
@
\noindent which means we can not reject $H_0$. The $p\,$-value and test
statistic are similar to the values found in table 6.3 of \citet{clauset2009}.

<<clean-up, include=FALSE>>=
# R compiles all vignettes in the same session, which can be bad
rm(list = ls(all = TRUE))
@

\bibliography{poweRlaw}
\bibliographystyle{plainnat}

\end{document}