%\documentclass[codesnippet]{jss}
\documentclass[codesnippet, nojss]{jss}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%\VignetteIndexEntry{A guide to the reporttools package}                                          
%\VignetteKeywords{reporttools}                                                                   
%\VignetteDepends{reporttools, xtable, survival}                                      
%\VignettePackage{reporttools} 

%% almost as usual
\author{Kaspar Rufibach\\University of Zurich} % \And Second Author\\Plus Affiliation}
\title{{\pkg{reporttools}}: \proglang{R} Functions to Generate \\ {\LaTeX} Tables of Descriptive Statistics}

%% for pretty printing and a nice hypersummary also set:
\Plainauthor{Kaspar Rufibach} %% comma-separated
\Plaintitle{reporttools: R Functions to Generate LaTeX Tables of Descriptive Statistics} %% without formatting
\Shorttitle{\pkg{reporttools}: {\LaTeX} Tables of Descriptive Statistics in \proglang{R}}

%% an abstract and keywords
\Abstract{In statistical analysis reports, tables with descriptive statistics are routinely presented. 
We introduce the \proglang{R} package \pkg{reporttools} containing functions
to efficiently generate such tables when compiling
statistical analysis reports by combining {\LaTeX} and \proglang{R} via \code{Sweave}.}
\Keywords{reporting, descriptive statistics, \code{Sweave}}
\Plainkeywords{reporting, descriptive statistics, Sweave}
%% at least one keyword must be supplied

%% publication information
%% NOTE: Typically, this can be left commented and will be filled out by the technical editor
%% \Volume{13}
%% \Issue{9}
%% \Month{September}
%% \Year{2004}
%% \Submitdate{2004-09-29}
%% \Acceptdate{2004-09-29}

%% The address of (at least) one author should be given
%% in the following format:
\Address{
  Kaspar Rufibach\\
  Biostatistics Unit\\
  Institute for Social and Preventive Medicine\\
  University of Zurich\\
  8001 Zurich, Switzerland\\
  Telephone: +41/0/44634-4643 \\
  Fax: +41/0/44634-4386 \\
  E-mail: \email{kaspar.rufibach@ifspm.uzh.ch}\\
  URL: \url{http://www.biostat.uzh.ch/aboutus/people/rufibach.html}
}

%% It is also possible to add a telephone and fax number
%% before the e-mail in the following format:
%% Telephone: +43/1/31336-5053
%% Fax: +43/1/31336-734

%% for those who use Sweave please include the following line (with % symbols):
%% need no \usepackage{Sweave.sty}

%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\usepackage{longtable}
\usepackage{lscape}

% keep code formatting
\SweaveOpts{keep.source = TRUE}

\begin{document}

%% include your article here, just as usual
%% Note that you should use the \pkg{}, \proglang{} and \code{} commands.

<<echo = FALSE, print = FALSE>>=

## Switch the console to the "R> " / "+  " prompts, an output width of 100
## and 4 significant digits. options() invisibly returns the previous values
## of whatever it sets, so each old setting is captured while it is replaced;
## o1-o4 are used by the final chunk of this file to restore the session.
o1 <- options(prompt = "R> ")$prompt
o2 <- options(continue = "+  ")$continue
o3 <- options(width = 100)$width
o4 <- options(digits = 4)$digits
@

% -----------------------------------------------------
\section{About this document}
% -----------------------------------------------------
This document is an introduction to the \proglang{R} package
\pkg{reporttools} \citep{reporttools} based on \citet{Rufibach:2009}. It aims to provide a detailed user guide 
based on simple, reproducible worked examples. The \pkg{reporttools} package is available from the
Comprehensive \proglang{R} Archive Network via \url{http://CRAN.R-project.org/package=reporttools}.

Compared to the version of the package on which \citet{Rufibach:2009} is based, some minor modifications and additions 
have been made. All changes were intended to be backwards compatible.


% -----------------------------------------------------
\section{Introduction}
% -----------------------------------------------------
In statistical analysis reports and medical publications it is common practice to start statistical analyses with 
displays of descriptive statistics of patient characteristics and further important variables. 
The purpose of these tables is (1) to get an idea about basic features of the data and (2), especially in 
analysis reports, to check the data.
Tables of descriptive statistics may take different formats, depending on the type of 
variables to be displayed, which descriptive statistics are to be reported, and whether statistics should be given
for all observations jointly or separately for the levels of a given factor, such as treatment arm. 
To be able to efficiently generate these recurring parts of analyses when combining {\LaTeX}
\citep{knuth_84, lamport_94} with \proglang{R} \citep{R} code via \code{Sweave} \citep{leisch_02}, the \proglang{R} package \pkg{reporttools} 
provides functions to generate {\LaTeX} tables of descriptive statistics for nominal, date,
and continuous variables.
The tables are set up as data frames in \proglang{R}, then translated into {\LaTeX} code
using the standard \proglang{R} package \pkg{xtable} \citep{xtable}. Using \code{Sweave}, these tables can be directly generated
in {\LaTeX} documents by invoking basically only one line of \proglang{R} code.

A package that offers somewhat related functionality is \pkg{r2lUniv} \citep{r2lUniv}.
In that package, each variable gets a separate section providing descriptive statistics and corresponding plots, 
depending on the type of variable that is analyzed. The default functions in \pkg{r2lUniv}
directly generate an entire {\LaTeX} document, thereby reporting the descriptive statistics of only about three variables
on one single page. Set up this way, it seems difficult to efficiently merge plain text and data analysis using \pkg{r2lUniv}.
Additionally, we are not aware of a possibility in \pkg{r2lUniv} to compare a given variable between different groups in an
easy way. The functions introduced in \pkg{reporttools} aim at closing these gaps. The tabulating functions 
in \pkg{reporttools} can be applied inside a \code{.Rnw} document combining {\LaTeX} plain text and data 
analyses in \proglang{R}.

A common feature in data analysis is that one needs to provide descriptive statistics of a large set of variables, thereby
generating tables that are larger than one page. The functions described here have by default 
the \code{tabular.environment}-option
in the implicitly used \proglang{R} function \code{print.xtable} set to \code{longtable}. Together
with the {\LaTeX} package \pkg{longtable}, setting this option generates tables that may range over more than a 
page, without additional specification or programming effort.

The package \pkg{reporttools} contains several additional functions useful in setting up analyses and 
writing reports. However, the emphasis in this article is on the tabulating functions. For a more detailed description
of further elements of the package we refer to its \proglang{R} help files.

In Section~\ref{sec: data} we introduce the dataset that is used to illustrate the tabulating functions 
in Section~\ref{sec: heart}. Some conclusions are drawn in Section~\ref{sec: concl}.

% -----------------------------------------------------
\section{Stanford heart transplantation data} \label{sec: data}
% -----------------------------------------------------
We illustrate our new functions using the Stanford heart transplantation dataset \code{jasa} in the 
standard \proglang{R} package \pkg{survival} \citep{survival}. 
This dataset provides some patient characteristics and the survival of patients on the waiting list for the 
Stanford heart transplantation program; see \citet{crowley_77} and the corresponding \proglang{R} help file for details.


% -----------------------------------------------------
\section{Descriptive statistics for heart transplantation data} \label{sec: heart}
% -----------------------------------------------------
To demonstrate the full flexibility of \pkg{reporttools} we use the package to describe the Stanford heart
transplantation data.

\subsection{Nominal variables (factors)}
The following code lines invoke \pkg{reporttools} and prepare the variables from \code{jasa} for later use. 

<<results=tex, echo=T>>=
library(reporttools)
data("heart", package = "survival")    # load the jasa dataset

## Collect the variables used in all examples below. Columns of jasa are
## referred to by name inside with(); the 0/1-coded columns are turned into
## labelled factors, and check.names = FALSE keeps the blanks in the names.
vars0 <- with(jasa, data.frame(
  "Transplantation" = factor(transplant, levels = 0:1,
                             labels = c("no", "yes")),
  "Age" = age,
  "Surgery" = factor(surgery, levels = 0:1, labels = c("no", "yes")),
  "Survival status" = factor(fustat, levels = 0:1,
                             labels = c("alive", "dead")),
  "HLA A2 score" = hla.a2,
  "Birthday" = birth.dt,
  "Acceptance into program" = accept.dt,
  "End of follow up" = fu.date,
  "Follow up time" = futime,
  "Mismatch score" = mscore,
  check.names = FALSE))

## make Transplantation and Age directly accessible in the calls below
attach(vars0, warn.conflicts = FALSE)
@ 

To generate Table~\ref{tab: nominal1}, a summary of some nominal variables of the heart
transplantation data, simply invoke:
 
<<results = verbatim, echo = T, eval = F>>=
## nominal variables to tabulate, and the caption of the table
cap1 <- "Patient characteristics: nominal variables."
vars1 <- vars0[, c("Surgery", "Survival status", "HLA A2 score")]
tableNominal(vars = vars1, cap = cap1, lab = "tab: nominal1",
  vertical = FALSE, longtable = FALSE)
@ 

\renewcommand{\baselinestretch}{1.15}
\begin{center}
<<results=tex, echo=F>>=
## Hidden chunk that actually typesets the table labelled "tab: nominal1";
## it repeats the call of the displayed chunk, which is shown but not run.
cap1 <- "Patient characteristics: nominal variables."
vars1 <- vars0[, c("Surgery", "Survival status", "HLA A2 score")]
tableNominal(
  vars = vars1,
  cap = cap1,
  lab = "tab: nominal1",
  vertical = FALSE,
  longtable = FALSE
)
@ 
\end{center}
\renewcommand{\baselinestretch}{1}

If one wants to inspect differences in the distributions of patient characteristics
between patients finally receiving or not receiving a transplantation,
one simply needs to specify the option \code{group = Transplantation} to get absolute and relative frequencies 
per transplantation status, and for all patients jointly. %, see Table~\ref{tab: nominal2}.

Furthermore, by specifying either \code{print.pval = "fisher"} 
or \code{print.pval = "chi2"} we can ask for a $p$-value
of either a Fisher's exact or a $\chi^2$ test comparing the distributions between the groups defined by \code{group}.
If interested in frequencies for patients not older than 50 years, we can set the option \code{subset = (Age <= 50)}. 
In addition, for the HLA A2 score, we treat missing values as a separate category when computing percentages. To this end, we assign
to \code{miss.cat} a vector containing the indices of the factor(s) in \code{vars} for which the percentages should be computed 
this way. All these options are implemented in Table~\ref{tab: nominal2} via the following code lines:

<<results = verbatim, echo = T, eval = F>>=
## caption; backslashes meant for LaTeX are doubled inside the R string
cap2 <- "Patient characteristics: nominal variables, by transplantation, 
  patients not older than 50, missings as a separate category for the 
  $3^{\\mathrm{rd}}$ factor, $p$-values of Fisher's exact test added."
tableNominal(vars = vars1, group = Transplantation,
  subset = (Age <= 50), miss.cat = 3, print.pval = "fisher",
  cap = cap2, lab = "tab: nominal2", longtable = FALSE)
@ 

\pagebreak

\renewcommand{\baselinestretch}{1.15}
\begin{center}
<<results=tex, echo=F>>=
## Hidden chunk that actually typesets the table labelled "tab: nominal2";
## it repeats the call of the displayed chunk, which is shown but not run.
cap2 <- "Patient characteristics: nominal variables, by transplantation, patients not older than 50, 
  missings as a separate category for the $3^{\\mathrm{rd}}$ factor, $p$-values of Fisher's exact test added."
tableNominal(
  vars = vars1,
  group = Transplantation,
  subset = (Age <= 50),
  miss.cat = 3,
  print.pval = "fisher",
  cap = cap2,
  lab = "tab: nominal2",
  longtable = FALSE
)
@ 
\end{center}
\renewcommand{\baselinestretch}{1}

It may happen that one prefers a different ordering or naming of the levels of a factor in a table. For example, 
suppose that in Table~\ref{tab: nominal1} frequencies of the patients that died should precede those who are alive. 
We suggest reversing this ordering by recoding the corresponding factor variable, that is, by appropriately specifying
the options \code{levels} and \code{labels} in \proglang{R}'s command \code{factor}.
Recoding factors is also recommended if one wants to change factor labels in a table.

By default, vertical lines are added to the table. 
These can be omitted by specifying \code{vertical = FALSE} in the function call, as is done in the call for 
Table~\ref{tab: nominal1}.

Finally, a vector that attaches a weight to each observation can be assigned to the option \code{weights}.

\subsection{Date variables}
The primary purpose of reporting descriptive statistics for date variables is data checking. The implemented function
\code{tableDate} provides the number of observations ($n$), minimum (Min), first quartile ($q_1$), median ($\widetilde{x}$), mean ($\bar{x}$), third quartile ($q_3$),
maximum (Max), and the number of missing values ($\#$NA).
If not all of these statistics need to be reported, the desired columns can be specified via \code{stats}.
Simply provide a sub-vector of \code{c("n", "min", "q1", "median", "mean", "q3", "max", "na")} giving the statistics 
you want to be displayed, in the order they should appear in the table.
As an illustration, we only display $n$, Min, Max, and $\#$NA for the variables birth date, acceptance date,
and end of follow up date of the heart trial in Table~\ref{tab: date1}, via the following code:

<<results = verbatim, echo = T, eval = F>>=
## date variables to tabulate, and the caption of the table
vars3 <- vars0[, c("Birthday", "Acceptance into program",
                   "End of follow up")]
cap3 <- "Patient characteristics: date variables, by transplantation 
  status."
tableDate(vars = vars3, group = Transplantation,
  stats = c("n", "min", "max", "na"), print.pval = TRUE,
  cap = cap3, lab = "tab: date1", longtable = FALSE)
@ 
 
As for \code{tableNominal}, the \code{vars} argument of the function is a data frame containing the variables of interest,
but here each variable is of \proglang{R} class \code{Date}.
One can ask for a $p$-value of a Mann-Whitney test (or Kruskal-Wallis test, 
depending on the number of levels of the grouping variable), to assess whether distributions are different
between the groups defined by a given factor. This is especially relevant when analyzing data of a 
randomized trial, to verify whether patient 
characteristics are indeed equally distributed between randomized treatment arms. 

\renewcommand{\baselinestretch}{1.15}
\begin{center}
<<results=tex, echo=F>>=
## Hidden chunk that actually typesets the table labelled "tab: date1";
## it repeats the call of the displayed chunk, which is shown but not run.
cap3 <- "Patient characteristics: date variables, by transplantation status."
vars3 <- vars0[, c("Birthday", "Acceptance into program", "End of follow up")]
tableDate(
  vars = vars3,
  group = Transplantation,
  stats = c("n", "min", "max", "na"),
  print.pval = TRUE,
  cap = cap3,
  lab = "tab: date1",
  longtable = FALSE
)
@ 
\end{center}
\renewcommand{\baselinestretch}{1}

\subsection{Continuous variables}
Compared to \code{tableDate}, the function \code{tableContinuous}, which displays continuous variables, additionally provides 
the standard deviation ($s$) and the interquartile range (IQR) as default statistics. 
When using \code{tableContinuous}, all variables in the data frame given as the \code{vars} argument to this function must be 
\code{numeric}. Again, via \code{stats} one can choose which statistics to 
display, out of the options \code{c("n", "min", "q1", "median", "mean", "q3", "max", "s", "iqr", "na")}. 
To provide even more flexibility in the choice of descriptive statistics, user-defined functions can be supplied
to \code{stats} in \code{tableContinuous}. Such a user-defined function must take a vector as an argument and return
a single number (the desired statistic). Missing values are removed by default. For illustration, we add a trimmed mean
and the coefficient of variation c$_{\mathrm{v}}$ to the chosen default statistics in Table~\ref{tab: cont2}. 
The number of decimal places the descriptive statistics are displayed with can be set using \code{prec}.
Note that this option does not affect the columns \code{n} and \code{\#NA}; their entries are always given as 
whole numbers.
Finally, by specifying \code{print.pval} one can ask for a $p$-value of an $F$, $t$, Kruskal-Wallis, or Mann-Whitney test,
as appropriate, to compare the variable under consideration between the groups given by \code{group}.

We give two examples summarizing descriptive statistics of some continuous patient characteristics
of the Stanford heart transplantation data. Table~\ref{tab: cont1} is generated via 

<<results = verbatim, echo = T, eval = F>>=
## continuous variables to tabulate, and the caption of the table
cap4 <- "Patient characteristics: continuous variables."
vars4 <- vars0[, c("Age", "Follow up time", "Mismatch score")]
tableContinuous(vars = vars4, cap = cap4,
  lab = "tab: cont1", longtable = FALSE)
@ 

\pagebreak

\renewcommand{\baselinestretch}{1.15}
\begin{center}
<<results=tex, echo=F>>=
## Hidden chunk that actually typesets the table labelled "tab: cont1";
## it repeats the call of the displayed chunk, which is shown but not run.
cap4 <- "Patient characteristics: continuous variables."
vars4 <- vars0[, c("Age", "Follow up time", "Mismatch score")]
tableContinuous(
  vars = vars4,
  cap = cap4,
  lab = "tab: cont1",
  longtable = FALSE
)
@ 
\end{center}
\renewcommand{\baselinestretch}{1}

Finally, Table~\ref{tab: cont2} can be produced via:

<<results = verbatim, echo = T, eval = F>>=
cap5 <- "Patient characteristics, by transplantation: continuous 
  variables, user-defined functions supplied."
## built-in statistics are given by name; user-defined ones are supplied as
## functions, named by their column title
stats <- list(
  "n", "min", "median",
  "$\\bar{x}_{\\mathrm{trim}}$" = function(x) mean(x, trim = .05),
  "max", "iqr",
  "c$_{\\mathrm{v}}$" = function(x) sd(x) / mean(x),
  "s", "na")
tableContinuous(vars = vars4, group = Transplantation, stats = stats,
  print.pval = "kruskal", cap = cap5, lab = "tab: cont2",
  longtable = FALSE)
@ 

\renewcommand{\baselinestretch}{1.15}
\begin{center}
<<results=tex, echo=F>>=
## Hidden chunk that actually typesets the table labelled "tab: cont2";
## it repeats the call of the displayed chunk, which is shown but not run.
cap5 <- "Patient characteristics, by transplantation: continuous variables, user-defined functions supplied."
## character entries select built-in statistics; named functions add a
## trimmed mean and the coefficient of variation
stats <- list(
  "n",
  "min",
  "median",
  "$\\bar{x}_{\\mathrm{trim}}$" = function(x) mean(x, trim = .05),
  "max",
  "iqr",
  "c$_{\\mathrm{v}}$" = function(x) sd(x) / mean(x),
  "s",
  "na"
)
tableContinuous(
  vars = vars4,
  group = Transplantation,
  stats = stats,
  print.pval = "kruskal",
  cap = cap5,
  lab = "tab: cont2",
  longtable = FALSE
)
@ 
\end{center}
\renewcommand{\baselinestretch}{1}

% =======================================
\section{Conclusions} \label{sec: concl}
% =======================================
In this article, we present some functions in the \pkg{reporttools} package for \proglang{R} 
that facilitate the presentation of descriptive statistics of nominal, date, and continuous variables
when writing reports using \code{Sweave}. The package is available from CRAN.

% =======================================
\section{Acknowledgments} \label{sec: acknow}
% =======================================
I thank the editor and two referees for constructive comments that led to an improvement of \pkg{reporttools} and the presentation of 
the paper. I also thank Leo Held and Philipp Muri for helpful discussions.

% =======================================
% \section{Literatur} \label{sec: references}
% =======================================
%\nocite{*}

\bibliography{report}

<<echo=FALSE>>=
## re-specify initial options (o1-o4 were saved in the first chunk)
options(prompt = o1, continue = o2, width = o3, digits = o4)

## The setup chunk attach()es vars0 and nothing detaches it again, so its
## columns would stay on the search path after the vignette code has run.
## Remove it here; the guard keeps this a no-op if vars0 is not attached.
if ("vars0" %in% search()) detach("vars0", character.only = TRUE)
@

\end{document}