\documentclass{chapman} %%% copy Sweave.sty definitions %%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE %\usepackage{Sweave} \RequirePackage[T1]{fontenc} \RequirePackage{graphicx,ae,fancyvrb} \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} \usepackage{relsize} \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \newenvironment{Schunk}{}{} %%% environment for raw output \newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\small} \rawSinput } %%% environment for labeled output \newcommand{\nextcaption}{} \newcommand{\SchunkLabel}{ \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption} \end{figure} } \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline} \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, samepage = true, fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} } %%% S code with line numbers \DefineVerbatimEnvironment{Sinput} {Verbatim} { %% numbers=left } \newcommand{\numberSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left} } \newcommand{\rawSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{} } %%% R / System symbols \newcommand{\R}{\textsf{R}} \newcommand{\rR}{{R}} \renewcommand{\S}{\textsf{S}} \newcommand{\SPLUS}{\textsf{S-PLUS}} \newcommand{\rSPLUS}{{S-PLUS}} \newcommand{\SPSS}{\textsf{SPSS}} \newcommand{\EXCEL}{\textsf{Excel}} \newcommand{\ACCESS}{\textsf{Access}} \newcommand{\SQL}{\textsf{SQL}} %%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}} \newcommand{\Rpackage}[1]{\index{#1 package@\textit{#1} package}\textit{#1}} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}} \newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}} \newcommand{\Roperator}[1]{\texttt{#1}} \newcommand{\Rarg}[1]{\texttt{#1}} \newcommand{\Rlevel}[1]{\texttt{#1}} %%% other symbols \newcommand{\file}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\stress}[1]{\index{#1}\textit{#1}} \newcommand{\stress}[1]{\textit{#1}} \newcommand{\booktitle}[1]{`#1'} %%' %%% Math symbols \newcommand{\E}{\mathsf{E}} \newcommand{\Var}{\mathsf{Var}} \newcommand{\Cov}{\mathsf{Cov}} \newcommand{\Cor}{\mathsf{Cor}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \renewcommand{\a}{\mathbf{a}} \newcommand{\W}{\mathbf{W}} \newcommand{\C}{\mathbf{C}} \renewcommand{\H}{\mathbf{H}} \newcommand{\X}{\mathbf{X}} \newcommand{\B}{\mathbf{B}} \newcommand{\V}{\mathbf{V}} \newcommand{\I}{\mathbf{I}} \newcommand{\D}{\mathbf{D}} \newcommand{\bS}{\mathbf{S}} \newcommand{\N}{\mathcal{N}} \renewcommand{\P}{\mathsf{P}} \usepackage{amstext} %%% links \usepackage{hyperref} \hypersetup{% pdftitle = {A Handbook of Statistical Analyses Using R}, pdfsubject = {Book}, pdfauthor = {Brian S. Everitt and Torsten Hothorn}, colorlinks = {true}, linkcolor = {blue}, citecolor = {blue}, urlcolor = {red}, hyperindex = {true}, linktocpage = {true}, } %%% captions & tables %% <FIXME>: conflics with figure definition in chapman.cls %%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption} %% </FIMXE> \usepackage{longtable} \usepackage{rotating} %%% R symbol in chapter 1 \usepackage{wrapfig} %%% Bibliography \usepackage[round,comma]{natbib} \renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}} \citeindexfalse %%% texi2dvi complains that \newblock is undefined, hm... \def\newblock{\hskip .11em plus .33em minus .07em} %%% Example sections \newcounter{exercise}[chapter] \setcounter{exercise}{0} \newcommand{\exercise}{\item{\stepcounter{exercise} Ex. \arabic{chapter}.\arabic{exercise} }} %% URLs \newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}} %%% for manual corrections %\renewcommand{\baselinestretch}{2} %%% plot sizes \setkeys{Gin}{width=0.95\textwidth} %%% color \usepackage{color} %%% hyphenations \hyphenation{drop-out} %%% new bidirectional quotes need \usepackage[utf8]{inputenc} \begin{document} %% Title page \title{A Handbook of Statistical Analyses Using \R} \author{Brian S. Everitt and Torsten Hothorn} \maketitle %%\VignetteIndexEntry{Chapter Principal Component Analysis} \setcounter{chapter}{12} \SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} <<setup, echo = FALSE, results = hide>>= rm(list = ls()) s <- search()[-1] s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices", "package:utils", "package:datasets", "package:methods", "Autoloads"), s)] if (length(s) > 0) sapply(s, detach, character.only = TRUE) if (!file.exists("tables")) dir.create("tables") if (!file.exists("figures")) dir.create("figures") set.seed(290875) options(prompt = "R> ", continue = "+ ", width = 63, # digits = 4, SweaveHooks = list(leftpar = function() par(mai = par("mai") * c(1, 1.05, 1, 1)))) HSAURpkg <- require("HSAUR") if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR")) rm(HSAURpkg) a <- Sys.setlocale("LC_ALL", "C") book <- TRUE refs <- cbind(c("AItR", "SI", "CI", "ANOVA", "MLR", "GLM", "DE", "RP", "SA", "ALDI", "ALDII", "MA", "PCA", "MDS", "CA"), 1:15) ch <- function(x, book = TRUE) { ch <- refs[which(refs[,1] == x),] if (book) { return(paste("Chapter~\\\\ref{", ch[1], "}", sep = "")) } else { return(paste("Chapter~\\\\ref{", ch[2], "}", sep = "")) } } @ \pagestyle{headings} \chapter[Principal Component Analysis]{Principal Component Analysis: The Olympic Heptathlon \label{PCA}} \section{Introduction} \section{Principal Component Analysis} \section{Analysis Using \R{}} To begin it will help to score all the seven events in the same direction, so that `large' values are `good'. We will recode the running events to achieve this; <<PCA-heptathlon-recode, echo = TRUE>>=a data("heptathlon", package = "HSAUR") heptathlon$hurdles <- max(heptathlon$hurdles) - heptathlon$hurdles heptathlon$run200m <- max(heptathlon$run200m) - heptathlon$run200m heptathlon$run800m <- max(heptathlon$run800m) - heptathlon$run800m @ \begin{figure} \begin{center} <<PCA-heptathlon-scatter, echo = TRUE, fig = TRUE>>= score <- which(colnames(heptathlon) == "score") plot(heptathlon[,-score]) @ \caption{Scatterplot matrix for the \Robject{heptathlon} data. \label{PCA-heptathlon-scatter}} \end{center} \end{figure} Figure~\ref{PCA-heptathlon-scatter} shows a scatterplot matrix of the results from the $25$ competitors on the seven events. We see that most pairs of events are positively correlated to a greater or lesser degree. The exceptions all involve the javelin event -- this is the only really `technical' event and %%' it is clear that training to become successful in the other six `power'-based events makes this event difficult for %%' the majority of the competitors. We can examine the numerical values of the correlations by applying the \Rcmd{cor} function <<PCA-options65, echo = FALSE>>= w <- options("width") options(width = 65) @ <<PCA-heptathlon-cor, echo = TRUE>>= round(cor(heptathlon[,-score]), 2) @ <<PCA-optionsw, echo = FALSE>>= options(width = w$width) @ This correlation matrix demonstrates again the points made earlier. A principal component analysis of the data can be applied using the \Rcmd{prcomp} function. The result is a list containing the coefficients defining each component (sometimes referred to as \stress{loadings}), \index{Loadings} the principal component scores, etc. The required code is (omitting the \Robject{score} variable) <<PCA-heptathlon-pca, echo = TRUE>>= heptathlon_pca <- prcomp(heptathlon[, -score], scale = TRUE) print(heptathlon_pca) @ The \Rcmd{summary} method can be used for further inspection of the details: <<PCA-heptathlon-summary, echo = TRUE>>= summary(heptathlon_pca) @ The linear combination for the first principal component is <<PCA-heptathlon-a1, echo = TRUE>>= a1 <- heptathlon_pca$rotation[,1] a1 @ We see that the 200m and long jump competitions receive the highest weight but the javelin result is less important. For computing the first principal component, the data need to be rescaled appropriately. The center and the scaling used by \Rcmd{prcomp} internally can be extracted from the \Robject{heptathlon\_pca} via <<PCA-heptathlon-scaling, echo = TRUE>>= center <- heptathlon_pca$center scale <- heptathlon_pca$scale @ Now, we can apply the \Rcmd{scale} function to the data and multiply with the loadings matrix in order to compute the first principal component score for each competitor <<PCA-heptathlon-s1, echo = TRUE>>= hm <- as.matrix(heptathlon[,-score]) drop(scale(hm, center = center, scale = scale) %*% heptathlon_pca$rotation[,1]) @ or, more conveniently, by extracting the first from all precomputed principal components <<PCA-heptathlon-s1, echo = TRUE>>= predict(heptathlon_pca)[,1] @ \begin{figure} \begin{center} <<PCA-heptathlon-pca-plot, echo = TRUE, fig = TRUE>>= plot(heptathlon_pca) @ \caption{Barplot of the variances explained by the principal components. \label{PCA-heptathlon-pca-plot}} \end{center} \end{figure} <<PCA-heptathlon-sdev, echo = FALSE, results = hide>>= sdev <- heptathlon_pca$sdev prop12 <- round(sum(sdev[1:2]^2)/sum(sdev^2)*100, 0) @ The first two components account for $\Sexpr{prop12}\%$ of the variance. A barplot of each component's variance (see %%' Figure~\ref{PCA-heptathlon-pca-plot}) shows how the first two components dominate. A plot of the data in the space of the first two principal components, with the points labelled by the name of the corresponding competitor can be produced as shown with Figure~\ref{PCA-heptathlon-PCscatter}. In addition, the first two loadings for the events are given in a second coordinate system, also illustrating the special role of the javelin event. This graphical representation is known as \stress{biplot} \citep{HSAUR:Gabriel1971}. \index{Biplot} \begin{figure} \begin{center} <<PCA-heptathlon-PCscatter, eval = FALSE>>= biplot(heptathlon_pca, col = c("gray", "black")) @ <<PCA-heptathlon-PCscatter, echo = FALSE, fig = TRUE>>= tmp <- heptathlon[, -score] rownames(tmp) <- abbreviate(gsub(" \\(.*", "", rownames(tmp))) biplot(prcomp(tmp, scale = TRUE), col = c("black", "lightgray"), xlim = c(-0.5, 0.7)) @ \caption{Biplot of the (scaled) first two principal components. \label{PCA-heptathlon-PCscatter}} \end{center} \end{figure} The correlation between the score given to each athlete by the standard scoring system used for the heptathlon and the first principal component score can be found from <<PCA-scorecor, echo = TRUE>>= cor(heptathlon$score, heptathlon_pca$x[,1]) @ This implies that the first principal component is in good agreement with the score assigned to the athletes by official Olympic rules; a scatterplot of the official score and the first principal component is given in Figure~\ref{PCA-heptathlonscore}. \begin{figure} \begin{center} <<PCA-heptathlonscore, echo = TRUE, fig = TRUE>>= plot(heptathlon$score, heptathlon_pca$x[,1]) @ \caption{Scatterplot of the score assigned to each athlete in 1988 and the first principal component. \label{PCA-heptathlonscore}} \end{center} \end{figure} %%\bibliographystyle{LaTeXBibTeX/refstyle} %%\bibliography{LaTeXBibTeX/HSAUR} \end{document}