\documentclass{chapman}

%%% copy Sweave.sty definitions

%%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE
%\usepackage{Sweave} 


\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}

\DefineVerbatimEnvironment{Sinput}{Verbatim}{}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                              fontshape=it,
                                              fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}

%%% environment for raw output
\newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{}
    \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                                  fontshape=it,
                                                  fontsize=\small}
    \rawSinput
}

%%% environment for labeled output
\newcommand{\nextcaption}{}
\newcommand{\SchunkLabel}{
  \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption}
  \end{figure} }
  \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline}
  \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, 
                                                samepage = true,
                                                fontfamily=courier,
                                                fontshape=it,
                                                fontsize=\relsize{-1}}
}


%%% S code with line numbers
\DefineVerbatimEnvironment{Sinput}
{Verbatim}
{
%%  numbers=left
}

\newcommand{\numberSinput}{
    \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left}
}
\newcommand{\rawSinput}{
    \DefineVerbatimEnvironment{Sinput}{Verbatim}{}
}


%%% R / System symbols
\newcommand{\R}{\textsf{R}}
\newcommand{\rR}{{R}}
\renewcommand{\S}{\textsf{S}}
\newcommand{\SPLUS}{\textsf{S-PLUS}}
\newcommand{\rSPLUS}{{S-PLUS}}
\newcommand{\SPSS}{\textsf{SPSS}}
\newcommand{\EXCEL}{\textsf{Excel}}
\newcommand{\ACCESS}{\textsf{Access}}
\newcommand{\SQL}{\textsf{SQL}}
%%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}}
%%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}}
\newcommand{\Rpackage}[1]{\index{#1 package@\textit{#1} package}\textit{#1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}}
\newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}


%%% other symbols
\newcommand{\file}[1]{\hbox{\rm\texttt{#1}}}
%%\newcommand{\stress}[1]{\index{#1}\textit{#1}} 
\newcommand{\stress}[1]{\textit{#1}} 
\newcommand{\booktitle}[1]{`#1'} %%'

%%% Math symbols
\newcommand{\E}{\mathsf{E}}   
\newcommand{\Var}{\mathsf{Var}}   
\newcommand{\Cov}{\mathsf{Cov}}   
\newcommand{\Cor}{\mathsf{Cor}}   
\newcommand{\x}{\mathbf{x}}   
\newcommand{\y}{\mathbf{y}}   
\renewcommand{\a}{\mathbf{a}}
\newcommand{\W}{\mathbf{W}}   
\newcommand{\C}{\mathbf{C}}   
\renewcommand{\H}{\mathbf{H}}   
\newcommand{\X}{\mathbf{X}}   
\newcommand{\B}{\mathbf{B}}   
\newcommand{\V}{\mathbf{V}}   
\newcommand{\I}{\mathbf{I}}   
\newcommand{\D}{\mathbf{D}}   
\newcommand{\bS}{\mathbf{S}}   
\newcommand{\N}{\mathcal{N}}   
\renewcommand{\P}{\mathsf{P}}   
\usepackage{amstext}

%%% links
\usepackage{hyperref}

\hypersetup{%
  pdftitle = {A Handbook of Statistical Analyses Using R},
  pdfsubject = {Book},
  pdfauthor = {Brian S. Everitt and Torsten Hothorn},
  colorlinks = {true},
  linkcolor = {blue},
  citecolor = {blue},
  urlcolor = {red},
  hyperindex = {true},
  linktocpage = {true},
}


%%% captions & tables
%% <FIXME>: conflics with figure definition in chapman.cls
%%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption}
%% </FIMXE>
\usepackage{longtable}
\usepackage{rotating}

%%% R symbol in chapter 1
\usepackage{wrapfig}

%%% Bibliography
\usepackage[round,comma]{natbib}
\renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}}
\citeindexfalse

%%% texi2dvi complains that \newblock is undefined, hm...
\def\newblock{\hskip .11em plus .33em minus .07em}

%%% Example sections
\newcounter{exercise}[chapter]
\setcounter{exercise}{0}
\newcommand{\exercise}{\item{\stepcounter{exercise} Ex.
                       \arabic{chapter}.\arabic{exercise} }}


%% URLs
\newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}}

%%% for manual corrections
%\renewcommand{\baselinestretch}{2}

%%% plot sizes
\setkeys{Gin}{width=0.95\textwidth}

%%% color
\usepackage{color}

%%% hyphenations
\hyphenation{drop-out}

%%% new bidirectional quotes need 
\usepackage[utf8]{inputenc}
\begin{document}

%% Title page

\title{A Handbook of Statistical Analyses Using \R}

\author{Brian S. Everitt and Torsten Hothorn}

\maketitle
%%\VignetteIndexEntry{Multiple Linear Regression}
\setcounter{chapter}{4}


\SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} 

<<setup, echo = FALSE, results = hide>>=
rm(list = ls())
s <- search()[-1]
s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices",
                "package:utils", "package:datasets", "package:methods", "Autoloads"), s)]
if (length(s) > 0) sapply(s, detach, character.only = TRUE)
if (!file.exists("tables")) dir.create("tables")
if (!file.exists("figures")) dir.create("figures")
set.seed(290875)
options(prompt = "R> ", continue = "+  ",
    width = 63, # digits = 4, 
    SweaveHooks = list(leftpar = function() 
        par(mai = par("mai") * c(1, 1.05, 1, 1))))
HSAURpkg <- require("HSAUR")
if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR"))
rm(HSAURpkg)
a <- Sys.setlocale("LC_ALL", "C")
book <- TRUE
refs <- cbind(c("AItR", "SI", "CI", "ANOVA", "MLR", "GLM", 
                "DE", "RP", "SA", "ALDI", "ALDII", "MA", "PCA", 
                "MDS", "CA"), 1:15)
ch <- function(x, book = TRUE) {
    ch <- refs[which(refs[,1] == x),]
    if (book) {
        return(paste("Chapter~\\\\ref{", ch[1], "}", sep = ""))
    } else {
        return(paste("Chapter~\\\\ref{", ch[2], "}", sep = ""))
    }
}
@

\pagestyle{headings}

\chapter[Multiple Linear Regression]{Multiple Linear Regression: \                                     Cloud Seeding \label{MLR}}

\section{Introduction}


\index{Cloud seeding}

\section{Multiple Linear Regression}
\index{Linear models|(}


\section{Analysis Using \R{}}


\begin{figure}
\begin{center}
<<MLR-clouds-boxplots, echo = TRUE, fig = TRUE, height = 9>>=
data("clouds", package = "HSAUR")
layout(matrix(1:2, nrow = 2))
bxpseeding <- boxplot(rainfall ~ seeding, data = clouds, 
    ylab = "Rainfall", xlab = "Seeding")
bxpecho <- boxplot(rainfall ~ echomotion, data = clouds, 
    ylab = "Rainfall", xlab = "Echo Motion")
@
\caption{Boxplots of \Robject{rainfall}. \label{MLR-rainfall-boxplot}}
\end{center}
\end{figure}

\begin{figure}
\begin{center}
<<MLR-clouds-scatterplots, echo = TRUE, fig = TRUE, width = 7, height = 7>>=
layout(matrix(1:4, nrow = 2))
plot(rainfall ~ time, data = clouds)
plot(rainfall ~ cloudcover, data = clouds)
plot(rainfall ~ sne, data = clouds, xlab="S-Ne criterion")
plot(rainfall ~ prewetness, data = clouds)
@
\caption{Scatterplots of \Robject{rainfall} against the continuous
covariates. \label{MLR-rainfall-scplot}}
\end{center}
\end{figure}

Both the boxplots (Figure~\ref{MLR-rainfall-boxplot}) and the scatterplots 
(Figure~\ref{MLR-rainfall-scplot}) show some evidence 
of outliers. The row names of the extreme observations in the
\Robject{clouds} \Rclass{data.frame} can be identified via
<<MLR-clouds-outliers, echo = TRUE>>=
rownames(clouds)[clouds$rainfall %in% c(bxpseeding$out, 
                                        bxpecho$out)]
@
where \Robject{bxpseeding} and \Robject{bxpecho} are variables 
created by \Rcmd{boxplot} in Figure~\ref{MLR-rainfall-boxplot}.
For the time being we shall not remove these observations but bear 
in mind during the modelling process that they may cause problems.

\subsection{Fitting a Linear Model}

In this example it is sensible to assume that the effect that 
some of the other explanatory variables is modified by seeding 
and therefore consider a model that allows interaction terms 
\index{Interaction}
for \Robject{seeding} with each of the covariates except \Robject{time}.
This model can be described by the \Rclass{formula}
<<MLR-clouds-formula, echo = TRUE>>=
clouds_formula <- rainfall ~ seeding * (sne + cloudcover +
    prewetness + echomotion) + time
@
and the design matrix $\X^\star$ can be computed via
<<MLR-clouds-modelmatrix, echo = TRUE>>=
Xstar <- model.matrix(clouds_formula, data = clouds)
@
By default, treatment contrasts have been applied to the dummy codings of
the factors \Robject{seeding} and \Robject{echomotion} as can be seen from
the inspection of the \Robject{contrasts} attribute of the model matrix
<<MLR-clouds-contrasts, echo = TRUE>>=
attr(Xstar, "contrasts")
@
The default contrasts can be changed via the \Rarg{contrasts.arg}
argument to \Rcmd{model.matrix} or the \Robject{contrasts} argument to the
fitting function, for example \Rcmd{lm} or \Rcmd{aov} as shown in 
Chapter~4.

However, such internals are hidden and performed by high-level model fitting
functions such as \Rcmd{lm} which will be used to fit the linear model
defined by the \Rclass{formula} \Robject{clouds\_formula}:
<<MLR-clouds-lm, echo = TRUE>>=
clouds_lm <- lm(clouds_formula, data = clouds)
class(clouds_lm)
@
The results of the model fitting is an object of class \Rclass{lm} for which
a \Rcmd{summary} method showing the conventional regression analysis
output is available. The output in Figure~\ref{MLR-clouds-summary}
shows the estimates $\hat{\beta}^\star$ with corresponding standard errors
and $t$-statistics as well as the $F$-statistic with associated $p$-value.
\renewcommand{\nextcaption}{\R{} output of the linear model fit  
                            for the \Robject{clouds} data.
                            \label{MLR-clouds-summary}}
\SchunkLabel
<<MLR-clouds-summary, echo = TRUE>>=
summary(clouds_lm)
@
\SchunkRaw

Many methods are available for extracting components of the fitted model.
The estimates $\hat{\beta}^\star$ can be assessed via
<<MLR-clouds-coef, echo = TRUE>>=
betastar <- coef(clouds_lm)
betastar
@
and the corresponding covariance matrix $\Cov(\hat{\beta}^\star)$ is available
from the \Rcmd{vcov} method
<<MLR-clouds-vcov, echo = TRUE>>=
Vbetastar <- vcov(clouds_lm)
@
where the square roots of the diagonal elements are the standard errors as
shown in Figure~\ref{MLR-clouds-summary}
<<MLR-clouds-sd, echo = TRUE>>=
sqrt(diag(Vbetastar))
@


\begin{figure}
\begin{center}
<<MLR-clouds-lmplot, echo = TRUE, fig = TRUE>>=
psymb <- as.numeric(clouds$seeding)
plot(rainfall ~ sne, data = clouds, pch = psymb, 
     xlab = "S-Ne criterion")
abline(lm(rainfall ~ sne, data = clouds, 
          subset = seeding == "no"))
abline(lm(rainfall ~ sne, data = clouds, 
          subset = seeding == "yes"), lty = 2)  
legend("topright", legend = c("No seeding", "Seeding"), 
       pch = 1:2, lty = 1:2, bty = "n")
@
\caption{Regression relationship between S-Ne criterion and rainfall with
and without seeding. \label{MLR-rel}}
\end{center}
\end{figure}


\subsection{Regression Diagnostics}


In order to investigate the quality of the model fit, we need access to the
residuals and the fitted values. The residuals can be found by the 
\Rcmd{residuals} method and the fitted values of the response 
from the \Rcmd{fitted} (or \Rcmd{predict}) method
<<MLR-clouds-residfitted, echo = TRUE>>=
clouds_resid <- residuals(clouds_lm)
clouds_fitted <- fitted(clouds_lm)
@
Now the residuals and the fitted values 
can be used to construct diagnostic plots; for example the residual
plot in Figure~\ref{MLR-resid} where each observation is labelled by its
number. Observations $1$ and $15$ give rather large residual values and the
data should perhaps be reanalysed after these two observations are removed.
The normal probability plot of the residuals shown in Figure~\ref{MLR-qqplot} 
shows a reasonable agreement between theoretical and sample
quantiles, however, observations $1$ and $15$ are extreme again.

\begin{figure}
\begin{center}
<<MLR-clouds-residplot, echo = TRUE, fig = TRUE>>=
plot(clouds_fitted, clouds_resid, xlab = "Fitted values", 
     ylab = "Residuals", type = "n", 
     ylim = max(abs(clouds_resid)) * c(-1, 1))
abline(h = 0, lty = 2)
text(clouds_fitted, clouds_resid, labels = rownames(clouds))
@
\caption{Plot of residuals against fitted values for \Robject{clouds} seeding data.
\label{MLR-resid}}
\end{center}
\end{figure}

\begin{figure}
\begin{center}
<<MLR-clouds-qqplot, echo = TRUE, fig = TRUE>>=
qqnorm(clouds_resid, ylab = "Residuals")
qqline(clouds_resid)
@
\caption{Normal probability plot of residuals from cloud seeding model
         \Robject{clouds\_lm}. \label{MLR-qqplot}}
\end{center}
\end{figure}


An index plot of the Cook's distances for each observation %'
(and many other plots including those constructed above from
using the basic functions) can be found from applying the \Rcmd{plot} method 
to the object that results from the application
of the \Rcmd{lm} function. 
\begin{figure}
\begin{center}
<<MLR-clouds-cook, echo = TRUE, eval = FALSE>>=
plot(clouds_lm)
@
<<MLR-clouds-cook, echo = FALSE, fig = TRUE>>=
plot(clouds_lm, which = 4, sub.caption = NULL)
@
\caption{Index plot of Cook's distances for cloud seeding data. %'
         \label{MLR-cook}}
\end{center}
\end{figure}
Figure~\ref{MLR-cook} suggests that observations 2 and 18 have undue
influence on the 
estimated regression coefficients, but the two outliers identified
previously do not. Again it may be useful to look at the results
after these two observations have been removed (see Exercise
5.2).
%% \ref{MLR-ex2})
\index{Regression diagnostics|)}

%%\bibliographystyle{LaTeXBibTeX/refstyle}
%%\bibliography{LaTeXBibTeX/HSAUR}   
\end{document}