%% $Id: entropy_np.Rnw,v 1.43 2010/02/18 14:43:37 jracine Exp jracine $

%\VignetteIndexEntry{Entropy-based Inference Using the np Package}
%\VignetteDepends{np,boot,cubature,MASS}
%\VignetteKeywords{nonparametric, kernel, entropy, econometrics, qualitative,
%categorical}
%\VignettePackage{np}

\documentclass[nojss]{jss}

%% need no \usepackage{Sweave.sty}

\usepackage{amsmath,amsfonts}
\usepackage[utf8]{inputenc}

\newcommand{\field}[1]{\mathbb{#1}}
\newcommand{\N}{\field{N}}
\newcommand{\bbR}{\field{R}} %% Blackboard R
\newcommand{\bbS}{\field{S}} %% Blackboard S

\author{Jeffrey S.~Racine\\McMaster University}

\title{Entropy-Based Inference using \proglang{R} and the \pkg{np}
  Package: A Primer}

\Plainauthor{Jeffrey S.~Racine}

\Plaintitle{Entropy-Based Inference using R and the np Package: A
  Primer}

%\Shorttitle{The np Package}

\Abstract{ 
  
  We describe new functionality in the \proglang{R} \citep{R} package
  \pkg{np} \citep{np}, providing a brief overview of each approach and
  its implementation, followed by illustrative examples. A simple
  demonstration outlining how practitioners can implement their own
  entropy functions is also provided.

}

\Keywords{nonparametric, semiparametric, kernel smoothing,
  categorical data.}

\Plainkeywords{Nonparametric, kernel, econometrics, qualitative,
  categorical}

\Address{Jeffrey S.~Racine\\
  Department of Economics\\
  McMaster University\\
  Hamilton, Ontario, Canada, L8S 4L8\\
  E-mail: \email{racinej@mcmaster.ca}\\
  URL: \url{http://www.mcmaster.ca/economics/racine/}\\
}

\begin{document}

%% Note that you should use the \pkg{}, \proglang{} and \code{} commands.

%% Note - fragile using \label{} in \section{} - must be outside

%% For graphics

\setkeys{Gin}{width=\textwidth}

%% Achim's request for R prompt set invisibly at the beginning of the
%% paper

<<eval=TRUE,echo=FALSE,results=hide,keep.source=TRUE>>=
library(np)
options(prompt = "R> ", np.messages = FALSE, digits = 3, warn = -1)
@ 

%% Achim asks for making use of <<..., height= ..., width = ...>>= for
%% better aspect ratios and better pen-to-paper ratios...

\section{Overview}

In versions 0.30-4 through 0.30-7 of the \pkg{np} package
\citep{np} we introduced a range of new functions that may be
of interest to those conducting applied research. The five new
functions are summarized in Table~\ref{np function table} below.

\begin{table}[!ht]
  \centering\raggedright
  {\small
    \begin{tabular}{p{.75in}|p{2.75in}|p{2in}}
			\hline
			\textbf{Function} & \textbf{Description} & \textbf{Reference} \\ 
			\hline
\code{npdeneqtest} &  Nonparametric Test for Equality of Densities & \citet{LI_MAASOUMI_RACINE:2009} \\
\code{npdeptest} &  Nonparametric Entropy Test for Pairwise Dependence & \citet{MAASOUMI_RACINE:2002} \\
\code{npsdeptest} & Nonparametric Entropy Test for Serial Nonlinear Dependence& \citet{GRANGER_MAASOUMI_RACINE:2004}\\
\code{npsymtest} & Nonparametric Entropy Test for Asymmetry&\citet{RACINE_MAASOUMI:2007}, \citet{MAASOUMI_RACINE:2009}\\
\code{npunitest} &  Nonparametric Entropy Test for Univariate Density Equality & \citet{MAASOUMI_RACINE:2002}\\
\hline
    \end{tabular}
  }
  \caption{New \pkg{np} functions introduced in versions 0.30-4 through 0.30-7.}
  \label{np function table}
\end{table}

In what follows we briefly describe each new function listed in
Table~\ref{np function table} and provide illustrative examples of
their use. Be aware that many of these functions rely on numerical
integration and can be computationally demanding. Though we provide
moment-based versions of the most computationally demanding
functions, we advocate the integral-based versions, which are the
default. In the last section of this overview we describe in detail
how the user can implement their own custom entropy functions with
minimal effort using the \pkg{np} package.
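
By way of illustration, the pairwise dependence test
\code{npdeptest} described below accepts a \code{method} argument
that selects between the two versions; a minimal sketch (not run)
follows:

<<eval=FALSE,keep.source=TRUE>>=
## Integral- versus moment-based versions of the dependence
## statistic (not run)
x <- rnorm(50)
y <- x + rnorm(50)
npdeptest(x,y,boot.num=99)                    ## integration (default)
npdeptest(x,y,boot.num=99,method="summation") ## moment version, faster
@ 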

\section{Testing Equality of Multivariate Densities}

\citet{LI_MAASOUMI_RACINE:2009} proposed a nonparametric test for the
equality of multivariate densities defined over a mix of continuous
and categorical data. This test can be accessed via the
\code{npdeneqtest} function.

Let $X$ and $Y$ be multivariate vectors of dimension $q+r$ where $q$
denotes the number of continuous variables and $r$ the number of
discrete/categorical variables.  A test statistic can be constructed
based on the integrated squared density difference given by $I=\int
[f(x) - g(x)]^2\,dx = \int f(x)\,dF(x) + \int g(x)\,dG(x) - \int
f(x)\,dG(x) - \int g(x)\,dF(x)$, where $F(\cdot)$ and $G(\cdot)$ are
the cumulative distribution functions for $X$ and $Y$, respectively,
and where $\int dx = \sum_{x^d\in \bbS^d} \int dx^c$.  Replacing the
first occurrences
of $f(\cdot)$ and $g(\cdot)$ by their leave-one-out kernel estimates,
and replacing $F(\cdot)$ and $G(\cdot)$ by their empirical
distribution functions, we obtain the following test statistic,
\begin{equation*}
  I_n = \frac{1}{n_1(n_1-1)} \sum_{i=1}^{n_1} \sum_{j\neq i}^{n_1}
  K_{\gamma,x_i,x_j} + \frac{1}{n_2(n_2-1)} \sum_{i=1}^{n_2}
  \sum_{j\neq i}^{n_2} K_{\gamma,y_i,y_j}
  - \frac{2}{n_1 n_2} \sum_{i=1}^{n_1} \sum_{j=1}^{n_2}
  K_{\gamma, x_i,y_j}.
\end{equation*}
	
\citet{LI_MAASOUMI_RACINE:2009} demonstrate that, under the null
of equality,
\begin{equation*}
  T_n = (n_1n_2 h_1\dots h_q)^{1/2}I_n/\sigma_n \to N(0,1)
  \mbox{ in
    distribution},
\end{equation*}
where
\begin{align*}
  \sigma^2_n = 2(n_1n_2 h_1\dots h_q) \left[
  \frac{1}{n_1^2(n_1-1)^2} \sum_{i=1}^{n_1} \sum_{j \neq i}^{n_1}
  (K_{\gamma,x_i,x_j})^2\right. & + \frac{1}{n_2^2(n_2-1)^2}
  \sum_{i=1}^{n_2}\sum_{j\neq i}^{n_2} (K_{\gamma,y_i,y_j})^2\cr
 &\left.  + \frac{2}{n_1^2 n_2^2} \sum_{i=1}^{n_1}\sum_{j= 1}^{n_2}
  (K_{\gamma,x_i,y_j})^2\right],
\end{align*}
which is a consistent estimator of $\sigma^2_{0} = 2[\delta^{-1} +
\delta + 2][E[f(X_i)]][\int W^2(v)dv]$, the asymptotic variance of
$(n_1n_2 h_1\dots h_q)^{1/2}I_n$, where $\delta =
\lim_{\min\{n_1,n_2\}\to \infty} (n_1/n_2)$.  Under the alternative
the statistic diverges to $+\infty$, so the test is one-sided,
rejecting the null when the test statistic is sufficiently large.

The test that uses critical values taken from the asymptotic
distribution displays finite-sample size distortions, so the
\code{npdeneqtest} function employs bootstrap resampling to obtain the
finite-sample distribution of the statistic (this provides a test
having correct size). The bootstrap resamples are obtained by
resampling from the empirical distribution of the pooled data
(i.e.~are drawn from a common multivariate distribution under the
null). Bandwidths are obtained via likelihood cross-validation by
default.

The following code snippet provides two simple examples using a mix of
continuous and discrete data. Note that the two samples must be data
frames with identically named variables (e.g., the variables `wages'
and `experience' must be common to both data frames, where sample A
might contain the observations for females and sample B those for
males).

<<keep.source=TRUE>>=
set.seed(1234)
n <- 250
## Distributions are equal

sample.A <- data.frame(a=rnorm(n),b=factor(rbinom(n,2,.5)))
sample.B <- data.frame(a=rnorm(n),b=factor(rbinom(n,2,.5)))

npdeneqtest(sample.A,sample.B,boot.num=99)

## Distributions are unequal

sample.A <- data.frame(a=rnorm(n),b=factor(rbinom(n,2,.5)))
sample.B <- data.frame(a=rnorm(n,sd=10),b=factor(rbinom(n,2,.25)))

npdeneqtest(sample.A,sample.B,boot.num=99)
@ 
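
To fix ideas, the resampling scheme described above is easily
sketched. The following illustrative snippet draws one pair of
resamples from the pooled data under the null (\code{npdeneqtest}
performs this resampling internally, so this is purely expository):

<<keep.source=TRUE>>=
## One pooled bootstrap resample under the null (illustration only;
## npdeneqtest performs this resampling internally)
A <- data.frame(a=rnorm(50))
B <- data.frame(a=rnorm(75))
pooled <- rbind(A,B)
ii <- sample(nrow(pooled),replace=TRUE)
A.star <- pooled[ii[1:nrow(A)],,drop=FALSE]
B.star <- pooled[ii[(nrow(A)+1):nrow(pooled)],,drop=FALSE]
@ 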
  
\section{Testing Equality of Univariate Densities}

\citet{MAASOUMI_RACINE:2002} consider a metric entropy useful for
testing equality of the densities of two univariate random variables
$X$ and $Y$. The function \code{npunitest} computes the nonparametric
metric entropy (normalized Hellinger of
\citet{GRANGER_MAASOUMI_RACINE:2004}) for testing the null of
equality of two univariate density (or probability) functions.  For
continuous variables we construct
\begin{align*}
S_{\rho}&=\frac{1}{2}\int\left(f_{1}^{1/2}-f_{2}^{1/2}\right)^{2}\,dx\cr
&=\frac{1}{2}\int\left(1-\frac{f_{2}^{1/2}}{f_1^{1/2}}\right)^2 dF_{1}(x),
\end{align*}
where $f_1=f(x)$ and $f_2=f(y)$ are the marginal densities of the
random variables $X$ and $Y$. The second expression is in moment form
and is often replaced with a sample average, especially for
theoretical developments. When $X$ and $Y$ are discrete/categorical,
we replace integration with the sum over all possible outcomes. The
unknown density/probability functions are replaced with nonparametric
kernel estimates.

The bootstrap is conducted by resampling with replacement from the
pooled empirical distribution of $X$ and $Y$ ($X$ only for the moment
version). Default bandwidths are of the plug-in variety
(\code{bw.SJ} for continuous variables and direct plug-in for
discrete variables).

The following code snippet provides three simple examples for both
continuous and discrete data.

<<keep.source=TRUE>>=
set.seed(1234)
n <- 1000
     
## Compute the statistic only, different distributions
     
x <- rchisq(n,df=10)
y <- rnorm(n,sd=10000,mean=-10000)
     
npunitest(x,y,bootstrap=FALSE)

## Data drawn from same continuous distribution

x <- rnorm(n)
y <- rnorm(n)
npunitest(x,y,boot.num=99)

## Data drawn from different continuous distributions having the
## same mean and variance

x <- rchisq(n,df=5)
y <- rnorm(n,mean=5,sd=sqrt(10))
npunitest(x,y,boot.num=99)

## Data drawn from different discrete distributions
     
x <- factor(rbinom(n,2,.5))
y <- factor(rbinom(n,2,.1))
npunitest(x,y,boot.num=99)
@ 
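
To see what the statistic computes in the discrete case, the
following illustrative snippet evaluates $S_{\rho}$ directly from the
sample proportions of the discrete \code{x} and \code{y} above
(\code{npunitest} uses kernel-smoothed probability estimates, so the
two need not agree exactly):

<<keep.source=TRUE>>=
## Direct computation of S_rho from the sample proportions of the
## discrete x and y above (npunitest smooths the probability
## estimates, so the two need not agree exactly)
levs <- levels(factor(c(as.character(x),as.character(y))))
p1 <- prop.table(table(factor(x,levels=levs)))
p2 <- prop.table(table(factor(y,levels=levs)))
0.5*sum((sqrt(p1)-sqrt(p2))^2)
@ 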

\section{Testing Univariate Asymmetry}

Consider a (strictly) stationary series $\{Y_{t}\}_{t=1}^{T}$. Let
$\mu_{y}$ denote a measure of central tendency, say
$\mu_{y}=E[Y_{t}]$, let $f(y)$ denote the density function of the
random variable $Y_{t}$, let $\tilde Y_{t}=-Y_{t}+2\mu_{y}$ denote a
rotation of $Y_{t}$ about its mean, and let $f(\tilde y)$ denote the
density function of the random variable $\tilde Y_{t}$. Note that if
$\mu_{y}=0$ then $\tilde Y_{t}=-Y_{t}$, though in general this will
not be so.
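
In \proglang{R} the rotation is a one-liner; for a sample \code{y},
with $\mu_{y}$ estimated by the sample mean, we have the following:

<<keep.source=TRUE>>=
## Rotate a skewed sample about its (sample) mean
y <- rchisq(100,df=3)
y.rot <- 2*mean(y)-y
## The mean is preserved while the skewness changes sign
c(mean(y),mean(y.rot))
@ 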

We say a series is \emph{symmetric about the mean} (median, mode) if
$f(y)\equiv f(\tilde y)$ almost surely. Tests for asymmetry about the
mean therefore naturally involve testing the following null:
\begin{equation*}
  H_{0}:f(y)=f(\tilde y)\mbox{ almost everywhere (a.e.)}\nonumber
\end{equation*}
against the alternative:
\begin{equation*}
  H_{1}:f(y)\ne f(\tilde y)\mbox{ on a set with positive measure}.\nonumber
\end{equation*}

The function \code{npsymtest} computes the nonparametric metric
entropy (normalized Hellinger of \citet{GRANGER_MAASOUMI_RACINE:2004})
outlined in \citet{RACINE_MAASOUMI:2007} and
\citet{MAASOUMI_RACINE:2009} for testing the null of symmetry using
the densities/probabilities of the data and the rotated data, $f(y)$
and $f(\tilde y)$, respectively. $Y$ must be univariate and can be a
time series, continuous, or even categorical valued so long as the
outcomes are not character strings.

For bootstrapping the null distribution of the statistic, setting
\code{boot.method="iid"} conducts simple random resampling, while
\code{boot.method="geom"} conducts stationary bootstrapping using
automatic block length selection via the \code{b.star} function in
the \pkg{np} package \citep{POLITIS_ROMANO:1994, POLITIS_WHITE:2004,
PATTON_POLITIS_WHITE:2009}. Bootstrapping is conducted by resampling
from the empirical distribution of the pooled data and rotated
data. Default bandwidths are of the plug-in variety (\code{bw.SJ} for
continuous variables and direct plug-in for discrete variables).

For continuous variables we use
\begin{align*}
S_{\rho}&=\frac{1}{2}\int\left(f_{1}^{1/2}-f_{2}^{1/2}\right)^{2}\,dx\cr
&=\frac{1}{2}\int\left(1-\frac{f_{2}^{1/2}}{f_1^{1/2}}\right)^2 dF_{1}(x),
\end{align*}
where $f_1$ and $f_2$ are the marginal densities of the data and
rotated data, respectively. The second expression is in moment form
and is often replaced with a sample average, especially for
theoretical developments. When $Y$ is discrete/categorical, we replace
integration with the sum over all possible outcomes.

The following code snippet provides two simple examples for both
continuous and discrete data.

<<keep.source=TRUE>>=
set.seed(1234)
     
n <- 100

## Asymmetric discrete probability distribution

x <- factor(rbinom(n,2,.8))
npsymtest(x,boot.num=99)

## Symmetric continuous distribution
     
y <- rnorm(n)
npsymtest(y,boot.num=99)
@ 
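
For time-series data, the stationary bootstrap variant can be
requested via the \code{boot.method} argument; a minimal sketch (not
run) follows, along with a direct call to \code{b.star}:

<<eval=FALSE,keep.source=TRUE>>=
## Stationary (geometric) bootstrap for a time series (not run);
## block lengths are selected automatically via b.star
yt <- arima.sim(list(ar=0.5),n=100)
npsymtest(yt,boot.num=99,boot.method="geom")
## Inspect the automatically selected block lengths directly
b.star(yt)
@ 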

\section{Testing Nonlinear Pairwise Independence}

\citet{MAASOUMI_RACINE:2002} consider a metric entropy useful for
testing for pairwise independence of two random variables $X$ and
$Y$. The function \code{npdeptest} computes the nonparametric metric
entropy (normalized Hellinger of
\citet{GRANGER_MAASOUMI_RACINE:2004}) for testing the null of
pairwise independence of two univariate random variables.  For
continuous variables we construct
\begin{align*}
S_{\rho}&=\frac{1}{2}\int_{-\infty}^{\infty}\int_{-\infty}^{\infty}\left(f_{1}^{1/2}-f_{2}^{1/2}\right)^{2}\,dx\,dy\cr
&=\frac{1}{2}\int\int\left(1-\frac{f_{2}^{1/2}}{f_1^{1/2}}\right)^2 dF_{1}(x,y),
\end{align*}
where $f_{1}=f(x_i,y_i)$ is the joint density and $f_{2}=g(x_i)\times
h(y_i)$ is the product of the marginal densities of the random
variables $X_i$ and $Y_i$. The unknown density/probability functions
are replaced with nonparametric kernel estimates.

The bootstrap distribution is obtained by resampling with replacement
from the empirical distribution of $X$, delivering pairs
$\{X^*_i,Y_i\}$ generated under the null, where $X^*$ is the
bootstrap resample (i.e.~we `shuffle' $X$ while leaving $Y$
unchanged, thereby breaking any pairwise dependence and generating
resamples under the null). Bandwidths for the marginal and joint
densities are obtained via likelihood cross-validation by default.
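
This `shuffling' is simple to sketch directly; the following
illustrative snippet draws one resample under the null
(\code{npdeptest} performs this resampling internally, so this is
purely expository):

<<keep.source=TRUE>>=
## One bootstrap resample under the null of pairwise independence:
## shuffle x while holding y fixed, breaking any pairwise dependence
x <- rnorm(50)
y <- x + rnorm(50)
x.star <- sample(x,replace=TRUE)
## The pairs {x.star,y} are now independent by construction
@ 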

Examples include (a) a measure/test of ``fit'' between in-sample
values of a variable $y$ and its fitted values, $\hat y$, and (b) a
measure of ``predictability'' for a variable $y$ and its predicted
values, $\hat y$ (from a user-implemented model).

The following code snippet provides a simple example using the actual
and fitted values from a regression model. Note that we strongly
advocate the use of the integration (default) version of the statistic
in applied settings, but use the summation (i.e.~moment) version
below purely by way of demonstration, as it is computationally
faster.

<<keep.source=TRUE>>=
set.seed(123)
## Test/measure lack of fit between y and its fitted value from a
## regression model when x is relevant.
n <- 100
x <- rnorm(n)
y <- 1 + x + rnorm(n)
model <- lm(y~x)
y.fit <- fitted(model)
npdeptest(y,y.fit,boot.num=99,method="summation")
@ 

\section{Testing Nonlinear Serial Independence}

\citet{GRANGER_MAASOUMI_RACINE:2004} consider a metric entropy
useful for testing for nonlinear serial independence in a univariate
random variable $Y$. The function \code{npsdeptest} computes the
nonparametric metric entropy (normalized Hellinger) for testing the
null of nonlinear serial independence of such a series. We construct
\begin{align*}
S_{\rho}&=\frac{1}{2}\int_{-\infty}^{\infty}\int_{-\infty}^{\infty}\left(f_{1}^{1/2}-f_{2}^{1/2}\right)^{2}\,dx\,dy\cr
&=\frac{1}{2}\int\int\left(1-\frac{f_{2}^{1/2}}{f_1^{1/2}}\right)^2 dF_{1}(x,y),
\end{align*}
where $f_{1}=f(y_t,y_{t-k})$ is the joint density and
$f_{2}=g(y_t)\times h(y_{t-k})$ is the product of the marginal
densities of the random variables $Y_t$ and $Y_{t-k}$. The second
expression is in moment form and is often replaced with a sample
average, especially for theoretical developments. The unknown
density/probability functions are replaced with nonparametric kernel
estimates.

The bootstrap distribution is obtained by resampling with replacement
from the empirical distribution of $Y_t$ delivering $Y^*_t$ under the
null of nonlinear serial independence.  Bandwidths are obtained via
likelihood cross-validation by default for the marginal and joint
densities.

The following code snippet provides a simple example for a continuous
time series. Note that we strongly advocate the use of the integration
(default) version of the statistic in applied settings but use the
summation (i.e.~moment) version below purely by way of demonstration,
as it is computationally faster.

<<keep.source=TRUE>>=
set.seed(123)
## A function to create a time series
ar.series <- function(phi,epsilon) {
  n <- length(epsilon)
  series <- numeric(n)
  series[1] <- epsilon[1]/(1-phi)
  for(i in 2:n) {
    series[i] <- phi*series[i-1] + epsilon[i]
  }
  return(series)
}
n <- 100
## Stationary persistent time-series
yt <- ar.series(0.95,rnorm(n))
npsdeptest(yt,lag.num=2,boot.num=99,method="summation")
@ 
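
By way of contrast, applying the same test to a serially independent
series should fail to reject the null:

<<keep.source=TRUE>>=
## Serially independent (white noise) series for comparison
yt.iid <- rnorm(n)
npsdeptest(yt.iid,lag.num=2,boot.num=99,method="summation")
@ 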

\section{Rolling Your Own Entropy Function}

The functions outlined above allow for a range of entropy-based tests
where the underlying distributions are modelled
nonparametrically. However, practitioners may wish to construct their
own entropy measures that rely on nonparametric estimates of the
underlying distributions. By way of example we consider a metric
entropy that measures distance (Hellinger) of an unknown univariate
density from a specific parametric density. The parametric density for
this example will be Gaussian and the unknown density will be
estimated via kernel methods.

In the following code snippet we construct a simple function in
\proglang{R} that accomplishes this. You could of course use
nonparametric methods for both distributions or parametric methods
for both. The function \code{npudens} computes unconditional kernel
density estimates, and \code{fitted} extracts the estimated density
evaluated at the sample realizations. The function \code{dnorm}
provides the normal density function, and \code{integrate} performs
univariate numerical integration.

<<keep.source=TRUE>>=
Srho <- function(x,y,...) {
  ## First write a function to compute the integrand (this is fed to
  ## the `integrate' function). This function's first argument is the
  ## point at which the two densities are computed (the remaining
  ## arguments are the data vectors).
  integrand <- function(t,x,y) {
    ## First, nonparametrically estimate the density of the x data
    ## using a plug-in bandwidth and evaluate the density at the point
    ## `t'.
    f.x <- fitted(npudens(tdat=x,edat=t,bws=bw.SJ(x),...))
    ## Next, estimate the parametric density of the data y using the
    ## Gaussian distribution and evaluate the density at the point
    ## `t'.
    f.y <- dnorm(t,mean=mean(y),sd=sd(y))
    ## Compute and return the integrand evaluated at the point `t'.
    return(0.5*(sqrt(f.x)-sqrt(f.y))^2)
  }
  ## Feed the integrand function to integrate() and return the value
  ## of the integral.
  return(integrate(integrand,-Inf,Inf,x=x,y=y)$value)
}
set.seed(123)
n <- 1000
## Data drawn from the same distribution
x <- rnorm(n)
y <- rnorm(n)
Srho(x,y)
## Data drawn from different distributions
y <- rnorm(n,sd=100)
Srho(x,y)
@ 

Should the reader be interested in multivariate measures,
multivariate numerical integration is available in the \proglang{R}
packages \pkg{cubature} and \pkg{R2Cuba} \citep{CUBATURE,R2CUBA}.
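
By way of illustration, a bivariate analogue of the \code{Srho}
function above might be sketched as follows using
\code{adaptIntegrate} from the \pkg{cubature} package (a sketch only,
not run; \code{adaptIntegrate} requires finite limits, so we
integrate over a box wide enough to capture essentially all of the
probability mass):

<<eval=FALSE,keep.source=TRUE>>=
## A bivariate analogue of Srho (sketch only, not run). The x data
## (an n x 2 matrix) are estimated by kernel methods; the y data are
## fit with a bivariate normal having independent components.
library(cubature)
Srho2 <- function(x,y) {
  bws <- c(bw.SJ(x[,1]),bw.SJ(x[,2]))
  integrand <- function(t) {
    f.x <- fitted(npudens(tdat=x,edat=matrix(t,1,2),bws=bws))
    f.y <- dnorm(t[1],mean=mean(y[,1]),sd=sd(y[,1]))*
           dnorm(t[2],mean=mean(y[,2]),sd=sd(y[,2]))
    0.5*(sqrt(f.x)-sqrt(f.y))^2
  }
  adaptIntegrate(integrand,lowerLimit=c(-10,-10),
                 upperLimit=c(10,10))$integral
}
x <- cbind(rnorm(500),rnorm(500))
y <- cbind(rnorm(500),rnorm(500))
Srho2(x,y)
@ 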

\section{Summary}

The \pkg{np} package \citep{np} contains new functionality
introduced in versions 0.30-4 through 0.30-7 that implements a range
of entropy-based inferential procedures. We hope you find these
functions easy to use. Please report any issues encountered to
\email{racinej@mcmaster.ca}. Any suggestions for improvements to this
document would be greatly appreciated. As always, if you use these
functions in your work we would sincerely appreciate your citing both
the relevant published research papers and the \pkg{np} package
\citep{np}.

\section*{Acknowledgments}

We would like to gratefully acknowledge support from the Natural
Sciences and Engineering Research Council of Canada
(\url{http://www.nserc.ca}), the Social Sciences and Humanities
Research Council of Canada (\url{http://www.sshrc.ca}), and the Shared
Hierarchical Academic Research Computing Network
(\url{http://www.sharcnet.ca}).

\bibliography{entropy_np}

\end{document}