\documentclass{chapman}

%%% copy Sweave.sty definitions

%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave} 


\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}

\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                              fontshape=it,
                                              fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}

%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
    \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                                  fontshape=it,
                                                  fontsize=\small}
    \rawSinput
}

%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
  \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
  \end{figure} }
  \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
  \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, 
                                                samepage = true,
                                                fontfamily=courier,
                                                fontshape=it,
                                                fontsize=\relsize{-1}}
}


%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%%  numbers=left
}

\newcommand{\numberSinput}{
    \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
    \DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}


%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@\textit{#1} package}\textit{#1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}


%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}} 
\newcommand{\stress}[1]{\textit{#1}} 
\newcommand{\booktitle}[1]{`#1'} %%'

%%% Math symbols
\newcommand{\E}{\mathsf{E}}   
\newcommand{\Var}{\mathsf{Var}}   
\newcommand{\Cov}{\mathsf{Cov}}   
\newcommand{\Cor}{\mathsf{Cor}}   
\newcommand{\x}{\mathbf{x}}   
\newcommand{\y}{\mathbf{y}}   
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}   
\newcommand{\C}{\mathbf{C}}   
\renewcommand{\H}{\mathbf{H}}   
\newcommand{\X}{\mathbf{X}}   
\newcommand{\B}{\mathbf{B}}   
\newcommand{\V}{\mathbf{V}}   
\newcommand{\I}{\mathbf{I}}   
\newcommand{\D}{\mathbf{D}}   
\newcommand{\bS}{\mathbf{S}}   
\newcommand{\N}{\mathcal{N}}   
\renewcommand{\P}{\mathsf{P}}   
\usepackage{amstext}

%%% links
\usepackage{hyperref}

\hypersetup{%
  pdftitle = {A Handbook of Statistical Analyses Using R},
  pdfsubject = {Book},
  pdfauthor = {Brian S. Everitt and Torsten Hothorn},
  colorlinks = {true},
  linkcolor = {blue},
  citecolor = {blue},
  urlcolor = {red},
  hyperindex = {true},
  linktocpage = {true},
}


%%% captions & tables
%% <FIXME>: conflics with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%% </FIMXE>
\usepackage{longtable}
\usepackage{rotating}

%%% R symbol in chapter 1
\usepackage{wrapfig}

%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse

%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}

%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\item{\stepcounter{exercise} Ex.
                       \arabic{chapter}.\arabic{exercise} }}


%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}

%%% for manual corrections
%\renewcommand{\baselinestretch}{2}

%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}

%%% color
\usepackage{color}

%%% hyphenations
\hyphenation{drop-out}

%%% new bidirectional quotes need 
\usepackage[utf8]{inputenc}
\begin{document}

%% Title page

\title{A Handbook of Statistical Analyses Using \R}

\author{Brian S. Everitt and Torsten Hothorn}

\maketitle
%%\VignetteIndexEntry{Chapter Recursive Partitioning}
%%\VignetteDepends{vcd,lattice,randomForest,party}
\setcounter{chapter}{7}


\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} 

<<setup, echo = FALSE, results = hide>>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
                "package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+  ",
    width = 63, # digits = 4, 
    SweaveHooks = list(leftpar = function() 
        par(mai = par("mai") * c(1, 1.05, 1, 1))))
HSAURpkg <- require("HSAUR")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR"))
rm(HSAURpkg)
a <- Sys.setlocale("LC_ALL", "C")
book <- TRUE
refs <- cbind(c("AItR", "SI", "CI", "ANOVA", "MLR", "GLM", 
                "DE", "RP", "SA", "ALDI", "ALDII", "MA", "PCA", 
                "MDS", "CA"), 1:15)
ch <- function(x, book = TRUE) {
    ch <- refs[which(refs[,1] == x),]
    if (book) {
        return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
    } else {
        return(paste("Chapter~\\\\ref{", ch[2], "}", sep = ""))
    }
}
@

\pagestyle{headings}

<<RP-setup, echo = FALSE, results = hide>>=
library("vcd")
library("lattice")
library("randomForest")
library("party")
ltheme <- canonical.theme(color = FALSE)     ## in-built B&W theme  
ltheme$strip.background$col <- "transparent" ## change strip bg  
lattice.options(default.theme = ltheme) 
mai <- par("mai")
options(SweaveHooks = list(nullmai = function() { par(mai = rep(0, 4)) },
                           twomai = function() { par(mai = c(0, mai[2], 0, 0)) },
                           threemai = function() { par(mai = c(0, mai[2], 0.1, 0)) }))
@

\chapter[Recursive Partitioning]{Recursive Partitioning: 
                                 Large Companies and Glaucoma Diagnosis \label{RP}}

\section{Introduction}


\section{Recursive Partitioning}


\section{Analysis Using \R{}}

\subsection{Forbes 2000 Data}

\index{Forbes 2000 ranking|(}

\index{Missing values}
For some observations the profit is missing and we first remove those
companies from the list
<<RP-Forbes-na, echo = TRUE>>=
data("Forbes2000", package = "HSAUR")
Forbes2000 <- subset(Forbes2000, !is.na(profits))
@

The \Rcmd{rpart} function from \Rpackage{rpart} can be used to grow a
regression tree. The response variable and the covariates are defined by a
model formula in the same way as for \Rcmd{lm}, say. By default, a large
initial tree is grown. 
<<RP-Forbes-rpart, echo = TRUE>>=
library("rpart")
forbes_rpart <- rpart(profits ~ assets + marketvalue + sales, 
                      data = Forbes2000)
@
A \Rcmd{print} method for \Rclass{rpart} objects is available, however, a
graphical representation shown in Figure~\ref{RP:forbesL} is more
convenient. Observations which satisfy the condition shown for each node 
go to the left and
observations which don't are element of the right branch in each node. %'
The numbers plotted in the leaves are the mean profit for those
observations satisfying the conditions stated above. For example, the
highest profit is observed for companies with a market value greater than
$89.33$ billion US dollars and with more than $91.92$ US dollars sales.

\begin{figure}
\begin{center}
<<RP-Forbes-initial, echo = TRUE, fig = TRUE, nullmai = TRUE>>=
plot(forbes_rpart, uniform = TRUE, margin = 0.1, branch = 0.5, 
     compress = TRUE)
text(forbes_rpart)
@
\caption{Large initial tree for Forbes 2000 data. \label{RP:forbesL}}
\end{center}
\end{figure}

\index{Cross-validation!Forbes 2000 data}
To determine if the tree is appropriate or if some of the branches need to 
be subjected to pruning we can use the \Robject{cptable}
element of the \Rclass{rpart} object:
<<RP-Forbes-cp, echo = TRUE>>=
print(forbes_rpart$cptable)
opt <- which.min(forbes_rpart$cptable[,"xerror"])
@
The \Robject{xerror} column contains of estimates of cross-validated prediction error
for different numbers of splits (\Robject{nsplit}). The best tree has
%%\Sexpr{forbes_rpart$cptable[opt, "nsplit"]} 
three
splits. Now we can prune back the large
initial tree using
\index{Pruning!Forbes 2000 ranking}
<<RP-Forbes-prune, echo = TRUE>>=
cp <- forbes_rpart$cptable[opt, "CP"]
forbes_prune <- prune(forbes_rpart, cp = cp)
@
The result is shown in Figure~\ref{RP:forbesS}. This tree is much smaller.
From the sample sizes and boxplots shown for each leaf we see that the
majority of companies is grouped together. However, a large market
value, more that $32.72$ billion US dollars, seems to be a good indicator of
large profits. 

\begin{figure}
\begin{center}
<<RP-Forbes-plot, echo = TRUE, fig = TRUE, twomai = TRUE>>=
layout(matrix(1:2, nc = 1))
plot(forbes_prune, uniform = TRUE, margin = 0.1, branch = 0.5, 
     compress = TRUE)
text(forbes_prune)
rn <- rownames(forbes_prune$frame)
lev <- rn[sort(unique(forbes_prune$where))]
where <- factor(rn[forbes_prune$where], levels = lev)
n <- tapply(Forbes2000$profits, where, length)
boxplot(Forbes2000$profits ~ where, varwidth = TRUE,
        ylim = range(Forbes2000$profit) * 1.3, 
        pars = list(axes = FALSE), 
        ylab = "Profits in US dollars")
abline(h = 0, lty = 3)
axis(2)
text(1:length(n), max(Forbes2000$profit) * 1.2, 
     paste("n = ", n))
@
\caption{Pruned regression tree for Forbes 2000 data with the distribution
         of the profit in each leaf depicted by a boxplot.
         \label{RP:forbesS}}
\end{center}
\end{figure}
\index{Forbes 2000 ranking|)}

\subsection{Glaucoma Diagnosis}


<<RP-seed-again, echo = FALSE, results = hide>>=
set.seed(290875)
@

<<RP-glaucoma-rpart, echo = TRUE>>=
data("GlaucomaM", package = "TH.data")
glaucoma_rpart <- rpart(Class ~ ., data = GlaucomaM, 
              control = rpart.control(xval = 100))
glaucoma_rpart$cptable
opt <- which.min(glaucoma_rpart$cptable[,"xerror"])
cp <- glaucoma_rpart$cptable[opt, "CP"]
glaucoma_prune <- prune(glaucoma_rpart, cp = cp)
@


\begin{figure}
\begin{center}
<<RP-glaucoma-plot, echo = TRUE, fig = TRUE, threemai = TRUE>>=
layout(matrix(1:2, nc = 1))
plot(glaucoma_prune, uniform = TRUE, margin = 0.1, branch = 0.5, 
     compress = TRUE)
text(glaucoma_prune, use.n = TRUE)
rn <- rownames(glaucoma_prune$frame)
lev <- rn[sort(unique(glaucoma_prune$where))]
where <- factor(rn[glaucoma_prune$where], levels = lev)
mosaicplot(table(where, GlaucomaM$Class), main = "", xlab = "", 
           las = 1)
@
\caption{Pruned classification tree of the glaucoma data with class
         distribution in the leaves depicted by a mosaicplot. \label{RP:gl}}
\end{center}
\end{figure}

\index{Classification tree!choice of tree size}
\index{Tree size}
As we discussed earlier, the choice of the appropriate sized tree is not a
trivial problem. For the glaucoma data, the above choice of three leaves is
very unstable across multiple runs of cross-validation. As an illustration
of this problem we repeat the very same analysis as shown above and record
the optimal number of splits as suggested by the cross-validation runs.
<<RP-glaucoma-cp, echo = TRUE>>=
nsplitopt <- vector(mode = "integer", length = 25)
for (i in 1:length(nsplitopt)) {
    cp <- rpart(Class ~ ., data = GlaucomaM)$cptable
    nsplitopt[i] <- cp[which.min(cp[,"xerror"]), "nsplit"]
}
table(nsplitopt)
@
Although for \Sexpr{sum(nsplitopt == 1)} runs of cross-validation a simple
tree with one split only is suggested, larger trees would have been favored
in \Sexpr{sum(nsplitopt > 1)} of the cases. This short analysis shows that
we should not trust the tree in Figure~\ref{RP:gl} too much.

\index{Bagging}
\index{Bootstrap approach!glaucoma diagnosis data}
One way out of this dilemma is the aggregation of multiple trees via
\stress{bagging}. 
In \R{}, the bagging idea can be implemented by three or four lines
of code. Case count or weight vectors representing the bootstrap samples can be drawn from the
multinominal distribution with parameters $n$ and $p_1 = 1/n, \dots, p_n =
1/n$ via the \Rcmd{rmultinom} function. For each weight vector, one large tree is
constructed without pruning and the \Rclass{rpart} objects are stored in a
list, here called \Robject{trees}:
<<RP-glaucoma-bagg, echo = TRUE>>=
trees <- vector(mode = "list", length = 25)
n <- nrow(GlaucomaM)
bootsamples <- rmultinom(length(trees), n, rep(1, n)/n)
mod <- rpart(Class ~ ., data = GlaucomaM, 
             control = rpart.control(xval = 0))
for (i in 1:length(trees))
    trees[[i]] <- update(mod, weights = bootsamples[,i])
@
The \Rcmd{update} function re-evaluates the call of \Robject{mod}, however,
with the weights being altered, i.e., fits a tree to a bootstrap sample
specified by the weights.
It is interesting to have a look at the structures of the multiple trees.
For example, the variable selected for splitting in the root of the tree is
not unique as can be seen by
<<RP-glaucoma-splits, echo = TRUE>>=
table(sapply(trees, function(x) as.character(x$frame$var[1])))
@
Although \Robject{varg} is selected most of the time, other variables such
as \Robject{vari} occur as well -- a further indication that the tree in
Figure~\ref{RP:gl} is
questionable and that hard decisions are not appropriate for the glaucoma
data.

In order to make use of the ensemble of trees in the list \Robject{trees}
we estimate the conditional probability of suffering from glaucoma given the
covariates for each observation in the original data set by
<<RP-glaucoma-baggpred, echo = TRUE>>=
classprob <- matrix(0, nrow = n, ncol = length(trees))
for (i in 1:length(trees)) {
    classprob[,i] <- predict(trees[[i]], 
                             newdata = GlaucomaM)[,1]
    classprob[bootsamples[,i] > 0,i] <- NA
}
@
Thus, for each observation we get \Sexpr{length(trees)} estimates. However,
each observation has been used for growing one of the trees with
probability $0.632$ and thus was not used with probability $0.368$.
Consequently, the estimate from a tree where an observation was not used for
growing is better for judging the quality of the predictions and we label
the other estimates with \Robject{NA}.

Now, we can average the estimates and we vote for glaucoma when the average
of the estimates of the conditional glaucoma probability exceeds $0.5$. The
comparison between the observed and the predicted classes does not suffer from
overfitting since the predictions are computed from those trees for which 
each single observation was \stress{not} used for growing.
<<RP-glaucoma-avg, echo = TRUE>>=
avg <- rowMeans(classprob, na.rm = TRUE)
predictions <- factor(ifelse(avg > 0.5, "glaucoma", "normal"))
predtab <- table(predictions, GlaucomaM$Class)
predtab
@
Thus, an honest estimate of the probability of
a glaucoma prediction when the patient is actually suffering from glaucoma is
<<RP-glaucoma-sens, echo = TRUE>>=
round(predtab[1,1] / colSums(predtab)[1] * 100)
@
per cent. For 
<<RP-glaucoma-spez, echo = TRUE>>=
round(predtab[2,2] / colSums(predtab)[2] * 100)
@
per cent of normal eyes, the ensemble does not predict a glaucomateous
damage.


%% <FIXME> is this P(Glaucoma), not P(Normal) ??? </FIXME>

\begin{figure}
\begin{center}
<<RP-glaucoma-baggplot, echo = TRUE, fig = TRUE, height = 4>>=
library("lattice")
gdata <- data.frame(avg = rep(avg, 2), 
    class = rep(as.numeric(GlaucomaM$Class), 2),
    obs = c(GlaucomaM[["varg"]], GlaucomaM[["vari"]]),
    var = factor(c(rep("varg", nrow(GlaucomaM)), 
                   rep("vari", nrow(GlaucomaM)))))
panelf <- function(x, y) {
           panel.xyplot(x, y, pch = gdata$class)
           panel.abline(h = 0.5, lty = 2)
       }
print(xyplot(avg ~ obs | var, data = gdata, 
       panel = panelf,
       scales = "free", xlab = "", 
       ylab = "Estimated Class Probability Glaucoma"))
@
\caption{Glaucoma data: Estimated class probabilities depending on two important
         variables. The $0.5$ cut-off for the estimated glaucoma probability
         is depicted as horizontal line. Glaucomateous eyes are plotted as
         circles and normal eyes are triangles. \label{RP:glplot}}
\end{center}
\end{figure}

\index{Random forest}
The \stress{bagging} procedure is a special case of a more general approach
called \stress{random forest} \citep{HSAUR:Breiman2001b}. The package
\Rpackage{randomForest} \citep{PKG:randomForest}
can be used to compute such ensembles via
<<RP-glaucoma-rf, echo = TRUE>>=
library("randomForest")
rf <- randomForest(Class ~ ., data = GlaucomaM)
@
and we obtain out-of-bag estimates for the prediction error via
<<RP-glaucoma-rf-oob, echo = TRUE>>=
table(predict(rf), GlaucomaM$Class)
@


For the glaucoma data, such a \stress{conditional inference tree} 
can be computed using
the \Rcmd{ctree} function
\index{Conditional tree}
<<RP-glaucoma-ctree, echo = TRUE>>=
library("party")
glaucoma_ctree <- ctree(Class ~ ., data = GlaucomaM)
@
and a graphical representation is depicted in Figure~\ref{RP-glaucoma-ctree-plot} showing
both the cutpoints and the $p$-values of the associated independence tests for each node. 
The first split is performed using a cutpoint defined with respect to the
volume of the optic nerve above some reference plane, but in the inferior part of
the eye only (\Robject{vari}).

\begin{figure}
\begin{center}
<<RP-glaucoma-ctree-plot, echo = TRUE, fig = TRUE, width = 10, height = 6>>=
plot(glaucoma_ctree)
@
\caption{Glaucoma data: Conditional inference tree with the distribution 
         of glaucomateous eyes shown for each terminal leaf.
         \label{RP-glaucoma-ctree-plot}}
\end{center}
\end{figure}



\bibliographystyle{LaTeXBibTeX/refstyle}
\bibliography{LaTeXBibTeX/HSAUR}   
\end{document}