\documentclass{chapman} %%% copy Sweave.sty definitions %%% keeps `sweave' from adding `\usepackage{Sweave}': DO NOT REMOVE %\usepackage{Sweave} \RequirePackage[T1]{fontenc} \RequirePackage{graphicx,ae,fancyvrb} \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} \usepackage{relsize} \DefineVerbatimEnvironment{Sinput}{Verbatim}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} \DefineVerbatimEnvironment{Scode}{Verbatim}{} \newenvironment{Schunk}{}{} %%% environment for raw output \newcommand{\SchunkRaw}{\renewenvironment{Schunk}{}{} \DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier, fontshape=it, fontsize=\small} \rawSinput } %%% environment for labeled output \newcommand{\nextcaption}{} \newcommand{\SchunkLabel}{ \renewenvironment{Schunk}{\begin{figure}[ht] }{\caption{\nextcaption} \end{figure} } \DefineVerbatimEnvironment{Sinput}{Verbatim}{frame = topline} \DefineVerbatimEnvironment{Soutput}{Verbatim}{frame = bottomline, samepage = true, fontfamily=courier, fontshape=it, fontsize=\relsize{-1}} } %%% S code with line numbers \DefineVerbatimEnvironment{Sinput} {Verbatim} { %% numbers=left } \newcommand{\numberSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{numbers=left} } \newcommand{\rawSinput}{ \DefineVerbatimEnvironment{Sinput}{Verbatim}{} } %%% R / System symbols \newcommand{\R}{\textsf{R}} \newcommand{\rR}{{R}} \renewcommand{\S}{\textsf{S}} \newcommand{\SPLUS}{\textsf{S-PLUS}} \newcommand{\rSPLUS}{{S-PLUS}} \newcommand{\SPSS}{\textsf{SPSS}} \newcommand{\EXCEL}{\textsf{Excel}} \newcommand{\ACCESS}{\textsf{Access}} \newcommand{\SQL}{\textsf{SQL}} %%\newcommand{\Rpackage}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Robject}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\Rclass}[1]{\hbox{\rm\textit{#1}}} %%\newcommand{\Rcmd}[1]{\hbox{\rm\texttt{#1}}} \newcommand{\Rpackage}[1]{\index{#1 package@\textit{#1} package}\textit{#1}} \newcommand{\Robject}[1]{\texttt{#1}} \newcommand{\Rclass}[1]{\index{#1 class@\textit{#1} class}\textit{#1}} \newcommand{\Rcmd}[1]{\index{#1 function@\texttt{#1} function}\texttt{#1}} \newcommand{\Roperator}[1]{\texttt{#1}} \newcommand{\Rarg}[1]{\texttt{#1}} \newcommand{\Rlevel}[1]{\texttt{#1}} %%% other symbols \newcommand{\file}[1]{\hbox{\rm\texttt{#1}}} %%\newcommand{\stress}[1]{\index{#1}\textit{#1}} \newcommand{\stress}[1]{\textit{#1}} \newcommand{\booktitle}[1]{`#1'} %%' %%% Math symbols \newcommand{\E}{\mathsf{E}} \newcommand{\Var}{\mathsf{Var}} \newcommand{\Cov}{\mathsf{Cov}} \newcommand{\Cor}{\mathsf{Cor}} \newcommand{\x}{\mathbf{x}} \newcommand{\y}{\mathbf{y}} \renewcommand{\a}{\mathbf{a}} \newcommand{\W}{\mathbf{W}} \newcommand{\C}{\mathbf{C}} \renewcommand{\H}{\mathbf{H}} \newcommand{\X}{\mathbf{X}} \newcommand{\B}{\mathbf{B}} \newcommand{\V}{\mathbf{V}} \newcommand{\I}{\mathbf{I}} \newcommand{\D}{\mathbf{D}} \newcommand{\bS}{\mathbf{S}} \newcommand{\N}{\mathcal{N}} \renewcommand{\P}{\mathsf{P}} \usepackage{amstext} %%% links \usepackage{hyperref} \hypersetup{% pdftitle = {A Handbook of Statistical Analyses Using R}, pdfsubject = {Book}, pdfauthor = {Brian S. Everitt and Torsten Hothorn}, colorlinks = {true}, linkcolor = {blue}, citecolor = {blue}, urlcolor = {red}, hyperindex = {true}, linktocpage = {true}, } %%% captions & tables %% <FIXME>: conflics with figure definition in chapman.cls %%\usepackage[format=hang,margin=10pt,labelfont=bf]{caption} %% </FIMXE> \usepackage{longtable} \usepackage{rotating} %%% R symbol in chapter 1 \usepackage{wrapfig} %%% Bibliography \usepackage[round,comma]{natbib} \renewcommand{\refname}{References \addcontentsline{toc}{chapter}{References}} \citeindexfalse %%% texi2dvi complains that \newblock is undefined, hm... \def\newblock{\hskip .11em plus .33em minus .07em} %%% Example sections \newcounter{exercise}[chapter] \setcounter{exercise}{0} \newcommand{\exercise}{\item{\stepcounter{exercise} Ex. \arabic{chapter}.\arabic{exercise} }} %% URLs \newcommand{\curl}[1]{\begin{center} \url{#1} \end{center}} %%% for manual corrections %\renewcommand{\baselinestretch}{2} %%% plot sizes \setkeys{Gin}{width=0.95\textwidth} %%% color \usepackage{color} %%% hyphenations \hyphenation{drop-out} %%% new bidirectional quotes need \usepackage[utf8]{inputenc} \begin{document} %% Title page \title{A Handbook of Statistical Analyses Using \R} \author{Brian S. Everitt and Torsten Hothorn} \maketitle %%\VignetteIndexEntry{Chapter Cluster Analysis} %%\VignetteDepends{scatterplot3d} \setcounter{chapter}{14} \SweaveOpts{prefix.string=figures/HSAUR,eps=FALSE,keep.source=TRUE} <<setup, echo = FALSE, results = hide>>= rm(list = ls()) s <- search()[-1] s <- s[-match(c("package:base", "package:stats", "package:graphics", "package:grDevices", "package:utils", "package:datasets", "package:methods", "Autoloads"), s)] if (length(s) > 0) sapply(s, detach, character.only = TRUE) if (!file.exists("tables")) dir.create("tables") if (!file.exists("figures")) dir.create("figures") set.seed(290875) options(prompt = "R> ", continue = "+ ", width = 63, # digits = 4, SweaveHooks = list(leftpar = function() par(mai = par("mai") * c(1, 1.05, 1, 1)))) HSAURpkg <- require("HSAUR") if (!HSAURpkg) stop("cannot load package ", sQuote("HSAUR")) rm(HSAURpkg) a <- Sys.setlocale("LC_ALL", "C") book <- TRUE refs <- cbind(c("AItR", "SI", "CI", "ANOVA", "MLR", "GLM", "DE", "RP", "SA", "ALDI", "ALDII", "MA", "PCA", "MDS", "CA"), 1:15) ch <- function(x, book = TRUE) { ch <- refs[which(refs[,1] == x),] if (book) { return(paste("Chapter~\\\\ref{", ch[1], "}", sep = "")) } else { return(paste("Chapter~\\\\ref{", ch[2], "}", sep = "")) } } @ \pagestyle{headings} <<thissetup, echo = FALSE, results = hide>>= library("mclust") library("mvtnorm") mai <- par("mai") options(SweaveHooks = list(rmai = function() { par(mai = mai * c(1,1,1,2))})) @ \chapter[Cluster Analysis]{Cluster Analysis: Classifying the Exoplanets \label{CA}} \section{Introduction} \section{Cluster Analysis} \section{Analysis Using \R{}} \begin{figure} \begin{center} <<CA-planets-scatter, echo = TRUE, fig = TRUE, rmai = TRUE>>= data("planets", package = "HSAUR") library("scatterplot3d") scatterplot3d(log(planets$mass), log(planets$period), log(planets$eccen), type = "h", angle = 55, pch = 16, y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1, scale.y = 0.7) @ \caption{3D scatterplot of the logarithms of the three variables available for each of the exoplanets. \label{CA-planets-scatter}} \end{center} \end{figure} \begin{figure} \begin{center} <<CA-planet-ss, echo = TRUE, fig = TRUE>>= rge <- apply(planets, 2, max) - apply(planets, 2, min) planet.dat <- sweep(planets, 2, rge, FUN = "/") n <- nrow(planet.dat) wss <- rep(0, 10) wss[1] <- (n - 1) * sum(apply(planet.dat, 2, var)) for (i in 2:10) wss[i] <- sum(kmeans(planet.dat, centers = i)$withinss) plot(1:10, wss, type = "b", xlab = "Number of groups", ylab = "Within groups sum of squares") @ \caption{Within-cluster sum of squares for different numbers of clusters for the exoplanet data. \label{CA-planets-ss}} \end{center} \end{figure} Sadly Figure~\ref{CA-planets-ss} gives no completely convincing verdict on the number of groups we should consider, but using a little imagination `little elbows' can be spotted at the three and five group solutions. %%' We can find the number of planets in each group using <<CA-planets-kmeans3, echo = TRUE>>= planet_kmeans3 <- kmeans(planet.dat, centers = 3) table(planet_kmeans3$cluster) @ The centers of the clusters for the untransformed data can be computed using a small convenience function <<CA-planets-ccent, echo = TRUE>>= ccent <- function(cl) { f <- function(i) colMeans(planets[cl == i,]) x <- sapply(sort(unique(cl)), f) colnames(x) <- sort(unique(cl)) return(x) } @ which, applied to the three cluster solution obtained by $k$-means gets <<CA-planets--kmeans3-ccent, echo = TRUE>>= ccent(planet_kmeans3$cluster) @ @ for the three cluster solution and, for the five cluster solution using <<CA-planets-kmeans5, echo = TRUE>>= planet_kmeans5 <- kmeans(planet.dat, centers = 5) table(planet_kmeans5$cluster) ccent(planet_kmeans5$cluster) @ \subsection{Model-based Clustering in \R{}} We now proceed to apply model-based clustering to the planets data. \R{} functions for model-based clustering are available in package \Rpackage{mclust} \citep{PKG:mclust,HSAUR:FraleyRaftery2002}. Here we use the \Rcmd{Mclust} function since this selects both the most appropriate model for the data \stress{and} the optimal number of groups based on the values of the BIC computed over several models and a range of values for number of groups. The necessary code is: <<CA-planets-mclust, echo = TRUE>>= library("mclust") planet_mclust <- Mclust(planet.dat) @ and we first examine a plot of BIC values using \begin{figure} \begin{center} <<CA-planets-mclust-plot, echo = TRUE, fig = TRUE>>= plot(planet_mclust, planet.dat, what = "BIC", col = "black", ylab = "-BIC", ylim = c(0, 350)) @ \caption{Plot of BIC values for a variety of models and a range of number of clusters. \label{CA-mclust1}} \end{center} \end{figure} The resulting diagram is shown in Figure~\ref{CA-mclust1}. In this diagram the numbers refer to different model assumptions about the shape of clusters: \begin{enumerate} \item Spherical, equal volume, \item Spherical, unequal volume, \item Diagonal equal volume, equal shape, \item Diagonal varying volume, varying shape, \item Ellipsoidal, equal volume, shape and orientation, \item Ellipsoidal, varying volume, shape and orientation. \end{enumerate} The BIC selects model $4$ (diagonal varying volume and varying shape) with three clusters as the best solution as can be seen from the \Rcmd{print} output: <<CA-planets-mclust-print, echo = TRUE>>= print(planet_mclust) @ This solution can be shown graphically as a scatterplot matrix The plot is shown in Figure~\ref{CA-planets-mclust-scatter}. Figure~\ref{CA-planets-mclust-scatterclust} depicts the clustering solution in the three-dimensional space. \begin{figure} \begin{center} <<CA-planets-mclust-scatter, echo = TRUE, fig = TRUE, results = hide>>= clPairs(planet.dat, classification = planet_mclust$classification, symbols = 1:3, col = "black") @ \caption{Scatterplot matrix of planets data showing a three cluster solution from \Rcmd{Mclust}. \label{CA-planets-mclust-scatter}} \end{center} \end{figure} \begin{figure} \begin{center} <<CA-planets-mclust-scatterclust, echo = TRUE, fig = TRUE, rmai = TRUE>>= scatterplot3d(log(planets$mass), log(planets$period), log(planets$eccen), type = "h", angle = 55, scale.y = 0.7, pch = planet_mclust$classification, y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1) @ \caption{3D scatterplot of planets data showing a three cluster solution from \Rcmd{Mclust}. \label{CA-planets-mclust-scatterclust}} \end{center} \end{figure} The number of planets in each cluster and the mean vectors of the three clusters for the untransformed data can now be inspected by using <<CA-planets-mclust-mu, echo = TRUE>>= table(planet_mclust$classification) ccent(planet_mclust$classification) @ Cluster 1 consists of planets about the same size as Jupiter with very short periods and eccentricities (similar to the first cluster of the $k$-means solution). Cluster 2 consists of slightly larger planets with moderate periods and large eccentricities, and cluster 3 contains the very large planets with very large periods. These two clusters do not match those found by the $k$-means approach. \bibliographystyle{LaTeXBibTeX/refstyle} \bibliography{LaTeXBibTeX/HSAUR} \end{document}