% -*- TeX:RNW -*- -*- US -*-

%\VignetteIndexEntry{Analyzing and Visualizing State Sequences}

\PassOptionsToPackage{svgnames}{xcolor}
%\documentclass[article]{jss}
\documentclass[article,nojss]{jss}
\usepackage{thumbpdf}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% almost as usual
\author{Alexis Gabadinho\\University of Geneva \And
        Gilbert Ritschard\\University of Geneva \And
        Nicolas S. M\"{u}ller\\University of Geneva \And
        Matthias Studer\\ University of Geneva}
\title{\mbox{\ }
\\[2ex]
Analyzing and Visualizing State Sequences in \proglang{R} with \pkg{TraMineR}}

%% for pretty printing and a nice hypersummary also set:
\Plainauthor{Alexis Gabadinho, Gilbert Ritschard, Nicolas S. M\"{u}ller, Matthias Studer} %% comma-separated
\Plaintitle{Analyzing and Visualizing State Sequences in R with TraMineR} %% without formatting
\Shorttitle{Analyzing and Visualizing State Sequences in \proglang{R} with \pkg{TraMineR}} %% a short title (if necessary)

%% an abstract and keywords
\Abstract{
This article describes the many capabilities offered by the \pkg{TraMineR} toolbox for categorical sequence data. It focuses more specifically on the analysis and rendering of state sequences. Addressed features include the description of sets of sequences by means of transversal aggregated views, the computation of longitudinal characteristics of individual sequences and the measure of pairwise dissimilarities. Special emphasis is put on the multiple ways of visualizing sequences. The core element of the package is the state sequence object in which we store the set of sequences together with attributes such as the alphabet, state labels and the color palette.
The functions can then easily retrieve this information to ensure presentation homogeneity across all printed and graphical displays. The article also demonstrates how \pkg{TraMineR}'s outcomes give access to advanced analyses such as clustering and statistical modeling of sequence data.
}

\Keywords{state sequences, categorical sequences, sequence visualization, sequence complexity, dissimilarities, optimal matching, representative sequences, \proglang{R}}
\Plainkeywords{state sequences, categorical sequences, sequence visualization, sequence complexity, dissimilarities, optimal matching, representative sequences, R}
%% without formatting
%% at least one keyword must be supplied

%% publication information
%% NOTE: Typically, this can be left commented and will be filled out by the technical editor
 \Volume{40}
 \Issue{4}
 \Month{April}
 \Year{2011}
 \Submitdate{2009-08-12}
 \Acceptdate{2011-01-31}

%% The address of (at least) one author should be given
%% in the following format:
\Address{
  Alexis Gabadinho, Gilbert Ritschard, Nicolas S.\ M\"{u}ller, Matthias Studer\\
  Institute for Demographic and Life Course Studies\\
  University of Geneva\\
  CH-1211 Geneva 4, Switzerland\\
  E-mail: \email{alexis.gabadinho@unige.ch}\\
  URL: \url{http://mephisto.unige.ch/}
}
%% It is also possible to add a telephone and fax number
%% before the e-mail in the following format:
%% Telephone: +43/1/31336-5053
%% Fax: +43/1/31336-734

%% for those who use Sweave please include the following line (with % symbols):

%% need no \usepackage{Sweave.sty}

%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%\documentclass[10pt,a4paper]{article}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{rotating}
\usepackage{varioref}
\usepackage{array}
%\usepackage{subfigure}
%\usepackage{afterpage}

\usepackage[svgnames]{xcolor}

\usepackage[authoryear]{natbib}

\newcommand{\Com}[1]{\code{#1}}
\newcommand{\Comt}[1]{\code{#1}}
\newcommand{\File}[1]{\texttt{#1}}
\newcommand{\Filet}[1]{\texttt{#1}}
\newcommand{\Dataset}[1]{\code{#1}}
\newcommand{\Datasett}[1]{\code{#1}} % for dataset without index reference

 \newcommand\traminer[1]{\pkg{TraMineR}%
    \if#1..%
 	\else\if#1,,%
 	\else\if#1;;%
 	\else\if#1??%
    \else\if#1::%
    \else\if#1!!%
    \else\if#1--%
    \else\space#1\fi\fi\fi\fi\fi\fi\fi}

 \newcommand\R[1]{\proglang{R}%
    \if#1..%
 	\else\if#1,,%
 	\else\if#1;;%
 	\else\if#1??%
    \else\if#1::%
    \else\if#1!!%
    \else\if#1--%
    \else\space#1\fi\fi\fi\fi\fi\fi\fi}

\newlength\TableWidth

\graphicspath{
  {Graphics/}
	{./Graphics/}
	}

\SweaveOpts{prefix.string=Graphics/SW, keep.source=FALSE} % sets the prefix for figure files generated par Sweave


\DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl,formatcom=\color{FireBrick}}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{formatcom=\color{DarkBlue}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{fontshape=sl}


 \hypersetup{
 %ps2pdf=true,
 urlcolor=DarkRed,%dark-blue, %magenta,
 filecolor=DarkGreen, %red,
 linkcolor=DarkBlue, % blue,
 %menucolor=white,
 citecolor=DarkBlue, % green,
 colorlinks=true,
 %a4paper=false,
 %bookmarksnumbered=false,
 %pdfpagemode=none,
 %plainpages=true,
 %pageanchor=false,
 %linktocpage=true,
 %pdfpagetransition=empty,
 %bookmarksopen=false,
 %backref=true
 }


\setcounter{topnumber}{4}          % \setcounter{topnumber}{2}
\def\topfraction{1}                % \def\topfraction{.7}
\setcounter{bottomnumber}{2}       % \setcounter{bottomnumber}{1}
\def\bottomfraction{1}             % \def\bottomfraction{.3}
\setcounter{totalnumber}{5}        % \setcounter{totalnumber}{3}
\def\textfraction{0}               % \def\textfraction{.2}
\def\floatpagefraction{1}          % \def\floatpagefraction{.5}

\widowpenalty=10000
\clubpenalty=10000

%% preliminary R commands
<<preliminary, echo=FALSE, results=hide>>=
options(width=76, prompt="R> ", continue="+   ", encoding="UTF-8", useFancyQuotes=FALSE)
library(TraMineR)
library(xtable)

## place where figures generated by Sweave will be stored
dir.create("Graphics", showWarnings = FALSE)
graphdir <- "Graphics/"
@


\begin{document}

\raisebox{14.3cm}[0cm][0cm]{\parbox{\linewidth}{\small\normalfont This vignette is distributed with the \R package \traminer. Please cite as:\\[.5ex]
Gabadinho, A., G. Ritschard, N.S. M\"{u}ller and M. Studer (2011). ``Analyzing and Visualizing State Sequences in \proglang{R} with \pkg{TraMineR}.'' \textit{Journal of Statistical Software}, \textbf{40}(4), 1--37. DOI \href{http://dx.doi.org/10.18637/jss.v040.i04}{10.18637/jss.v040.i04}}}

\section{Introduction}

This article %
%\footnote{The development of the \traminer package and this article were realized within a research project supported by the Swiss National Science Foundation under grants 113998 and 122230.} -> acknowledgment at end
is concerned with categorical sequence data and more specifically with state sequences, where the position of each successive state receives a meaningful interpretation in terms of age, date, or more generally of elapsed time or distance from the beginning of the sequence.
Its aim is to examine a series of questions about such state sequences and to present the various solutions that we implemented in the \proglang{R} \citep{R-Development-Core-Team2011Manual} package \traminer{} for answering them.

The addressed methods are for sets of sequences and most of them are holistic \citep{Billari-2001} in that they consider each sequence as a whole; i.e., as a conceptual unit. The discussion is mainly oriented towards the analysis of sequences describing individual life courses. Nevertheless, most of the discussed concepts and tools should be applicable in other domains such as text, biology, quality control or web logs analysis, to cite just a few.

Sequences are complex objects, and we need special tools for describing and displaying them. We consider, therefore, questions regarding the exploration and description of sets of sequences such as:
%
 \begin{itemize}
    \item
    Which characteristics of sequences are we interested in?
    \item
    What kind of indicators can we compute for a sequence set?
    \item
    What are suited plots for rendering sequences?
    \item
    How can we measure similarity between sequences?
 \end{itemize}
%
With a more analytical or explanatory concern, we also consider issues such as:
%
 \begin{itemize}
    \item
    How can we identify groups with similar patterns and build typologies of sequences?
    \item
    How can we analyze the relationship of sequences with covariates?
 \end{itemize}

In the social sciences, state sequences are of interest for studying life trajectories such as occupational histories, professional careers or cohabitational life courses. Some of the typical questions arising in this area are:
 \begin{itemize}
    \item
    Do life courses obey some social norm? Which are the standard trajectories? What kind of departures do we observe from these standards ? How do life course patterns evolve over time ?
    \item
    Why are some people more at risk to follow a chaotic trajectory or to stay stuck in a state? How does the trajectory complexity evolve across birth cohorts?
    \item
    How is the life trajectory related to sex, social origin and other cultural factors?
 \end{itemize}
%
Empirical answers to such questions require us to consider collections of life sequences, to examine them from both a transversal and a longitudinal perspective, and to study their relationships with covariates.

The primary objective of sequence methods is then to extract simplified workable information from sequential data sets; that is, to efficiently summarize and render these sets and to categorize the sequential patterns into a limited number of groups. This is essentially an exploratory task that consists of computing summary indicators, as well as sorting, grouping and comparing sequences. The resulting groups and real-value indicators may then be submitted to classical inferential methods and serve, for instance, as response variables or explanatory factors for regression-like models.

A common approach for categorizing patterns consists of computing pairwise distances between them by means of sequence alignment algorithms (such as optimal matching) or other suitable metrics and using this information for clustering the sequences. This method has been applied to various data since the pioneering work of \citet{Abbott:Forrest:1986}. A review can be found in \citet{Abbott-2000}. The expected outcome of such a strategy is a typology, with each cluster grouping cases with similar trajectories. Through binary logistic regression or classification trees, for example, we can then study how each cluster membership is related to covariates.

A more recent complementary approach considered in the literature \citep{ElzingaLiefbroer2007EJP,WidmerRitschard2009ALCR} is to focus on sequence indicators measuring for instance the longitudinal diversity and complexity of the sequences and to analyze them by means of conventional statistical tools for real-value variables.

With a somewhat more aggregated point of view, an approach considered, for instance, by \citet{Billari-2001a} consists of looking at the sequence of transversal characteristics measured at each position, such as the diversity of states observed at each given age. Comparing the evolution of such transversal characteristics for different groups defined by birth cohorts or sex, for instance, provides instructive insights \citep{WidmerRitschard2009ALCR}. However, when working with transversal indicators we lose the specific information on individual follow-ups.

We may indeed imagine many other ways of looking at categorical sequences such as correspondence analysis of the states \citep{Deville:Saporta:1983} or advanced Markov modeling \citep{BerchtoldRaftery2002}; i.e., the study of how the probability of a given state depends on the previously observed states.
Transforming state sequences into event sequences and resorting to tools for mining frequent subsequences permits us to gain interesting knowledge about the typical sequencing of states or events \citep{billa:furn:prsk:2006,RitschardGabadinhoMullerStuder2008IJDMMM}.
A very common approach in the life course literature is \textit{event history} or \textit{survival} analysis \citep{Mayer-1990,Yamaguchi-1991,HosmerLemeshow1999,blos:golsch:rohw:2007} which focuses on the occurrence of a specific event or somewhat equivalently on the duration---time to event---until a given state transition. Though not addressed here, all these techniques may usefully complement the considered state sequence techniques.

The \traminer {} \proglang{R} package is available from the Comprehensive \R Archive Network at \url{http:
//CRAN.R-project.org/package=TraMineR} and offers many analysis and visualization tools for either state or event sequences. These tools include already known methods, as well as new developments. We focus in this article on the functions intended for state sequence analysis. The paper is organized as follows. In Section~\ref{sec_overview}, we introduce the \traminer library, describe the \Datasett{mvad} data set used for illustration and give a first example of analysis that can be run with \traminer.
Section~\ref{sec_identifying-formats} defines the different forms of sequential data that are supported by the package. In Section~\ref{sec_seq_object}, we introduce the central concept of \emph{state sequence object}.
Section~\ref{sec:Visualizing-individual-sequences} introduces two basic visualization tools and describes the general plotting principles used by the package.
Section~\ref{sec_describing-visualizing-sequences} is devoted to the summarization and visual rendering of sets of sequences, while Section~\ref{sec_sequence-complexity} is concerned with individual sequence indicators. In Section~\ref{sec_similarities}, we present the metrics that were implemented for measuring pairwise dissimilarities between sequences. In Section~\ref{sec_further_statistical_analysis}, we illustrate how dissimilarities measures
can be used for further statistical analysis. Finally, we make some concluding remarks in Section~\ref{sec_conclusions}.


%%%%%%
\section[The TraMineR R package]{The \traminer{} \R package}
\label{sec_overview}

\traminer{} \citep{Gabadinho_et_al_2009TraMineR-UGuide} is a package for mining and visualizing sequences of categorical data describing life courses in \R {} \citep{R-Development-Core-Team2011Manual}, the name \traminer being a contraction of Life Trajectory Miner for \R.
%.
It puts together most of the features proposed separately by other software for sequential data and offers many original tools for managing, analyzing and rendering categorical sequences.

Other statistical programs that can handle state sequences include \citet{Abbott1997optimize}'s no-longer-maintained \proglang{Optimize} program, \proglang{TDA} \citep{RohwerPotter2002}, which is freely
available at \url{http://www.stat.ruhr-uni-bochum.de/tda.html}, the add-on for \proglang{Stata} by
\cite{Brzinsky-FayKohlerLuniak2006}, which is freely available for licensed \proglang{Stata} users, and the dedicated
\proglang{CHESA} program by \citet{Elzinga-2007b}, available from \url{http://home.fsw.vu.nl/ch.elzinga/}.
They all compute the optimal-matching edit distance between pairs of sequences and each of them offers specific useful facilities for describing sets of sequences. \traminer is, to our knowledge, the first such toolbox for the free \R statistical and graphical environment. Its salient characteristics are:

\begin{itemize}
	\item
\R and \traminer are free and open source;
	\item
Since \traminer is developed in \R, it takes advantage of many already optimized
procedures of \R as well as of its powerful graphical capabilities;
	\item
\R runs under several OS including Linux, MacOS X, Unix and Windows; any \R script with \traminer functions runs unmodified under all operating systems;
 	\item
Specific \traminer functions can be combined in the same script with any of the numerous basic statistical procedures of \R as well as with those of any other
\R-package.
 \end{itemize}

\begin{table}[tb]

%%\newlength\TableWidth
\setlength\TableWidth\linewidth
\addtolength\TableWidth{-8\tabcolsep}
\medskip

%\sffamily\small
	 \begin{tabular}{|>{\raggedright}p{0.17\TableWidth}|>{\raggedright\ttfamily}p{0.16\TableWidth}|>{\raggedright\ttfamily}p{0.16\TableWidth}|>{\raggedright}p{0.51\TableWidth}|}
		\hline
		\multicolumn{1}{|>{\raggedright}m{0.17\TableWidth}|}{{Type}} & \multicolumn{1}{>{\raggedright}m{0.16\TableWidth}|}{{Function}} & \multicolumn{1}{>{\raggedright}m{0.16\TableWidth}|}{{Graphical function}} & {Description} \tabularnewline
		\hline
		%\hline
		Data  & seqformat() & & Translating between sequence formats   \tabularnewline
		%\cline{2-4}
		handling (Section \ref{sec_identifying-formats})  & seqconc(), seqdecomp()  & & Converting between character string and matrix representations of sequences \tabularnewline
		%\hline
		\hline
		 State  & seqdef() &  & Creating state sequence objects %for applying sequence methods
\tabularnewline
		 %\cline{2-4}
		 sequence  & alphabet() & & Setting and retrieving the alphabet \tabularnewline
				%\cline{2-4}
		 objects  & cpal() & & Setting and retrieving the color palette \tabularnewline
				%\cline{2-4}
		 (Section \ref{sec_seq_object}) & stlab() & & Setting and retrieving the state labels \tabularnewline
		%\hline
		\hline
		Individual sequences & print() & seqiplot(), seqIplot() & Displaying and plotting individual sequences \tabularnewline
		%\cline{2-4}
        (Section \ref{sec:Visualizing-individual-sequences})  & seqtab() & seqfplot() & Sequence frequencies \tabularnewline
    	%\hline
		\hline
		Global and   & seqstatd() & seqdplot(), & Transversal state distributions and
        \tabularnewline
        transversal  &            & seqHtplot() & transversal entropies
        \tabularnewline
		%\cline{2-4}
		descriptive  & seqmeant() & seqmtplot() & Mean durations in states of the alphabet \tabularnewline
		%\cline{2-4}
		statistics  & seqmodst() & seqmsplot() & Sequence of modal states\tabularnewline
		%\cline{2-4}
		(Section \ref{sec_describing-visualizing-sequences})          & seqtrate() & & Transitions rates \tabularnewline
		%\hline
		\hline
		Sequence  & seqlength() & & Sequence lengths \tabularnewline
		%\cline{2-4}
		characteristics & seqdss() & & Distinct successive states by sequences\tabularnewline
		%\cline{2-4}
		(Section \ref{sec_sequence-complexity}) & seqdur() & & Durations of the distinct successive states \tabularnewline
		%\cline{2-4}
		 	& seqsubsn() & & Number of subsequences by sequence \tabularnewline
		%\cline{2-4}
			& seqistatd() & & Within sequence state distributions \tabularnewline
		%\cline{2-4}
			& seqient() & & Within sequence entropies \tabularnewline
		%\cline{2-4}
		    & seqST() & & Within sequence turbulences \tabularnewline
		%\cline{2-4}
		    & seqici() & & Within sequence complexity indexes \tabularnewline
		%\hline
		\hline
		Sequence 			& seqsubm() & & Creating a substitution cost matrix %for Optimal Matching distances
\tabularnewline
		%\cline{2-4}
		dissimilarities (Section \ref{sec_similarities}) & seqdist() & & Pairwise distances or distances to a baseline sequence \tabularnewline
		%\cline{2-4}
			& seqdistmc() & & Multichannel distances \tabularnewline
		%\cline{2-4}
%											& disscenter() & & Distances to the (virtual) center of the set \tabularnewline
%		\cline{2-4}
%											& dissvar() & & Pseudo-variance of a set of sequences \tabularnewline
%		\cline{2-4}
											& seqrep() & seqrplot() & Extracting sets of non-redundant representative sequences  \tabularnewline
		\hline

\end{tabular}

\caption{\pkg{TraMineR}'s key functions.}
\label{tab:Overview}
\end{table}
%

\traminer is readily installed from  within \R via \code{install.packages("TraMineR")}. %
%\footnote{Typing \Com{install.packages("TraMineR")} downloads and installs the package from the Comprehensive R Archive Network (CRAN).}
It features a unique set of procedures for analyzing and visualizing state sequence data, such as:
%
 \begin{itemize}
   		\item
Handling a large number of state sequence representations with simple functions for transforming to and from different formats;
		\item
A whole series of easy to use plot functions for rendering sets of sequences (density plot, frequency plot, index plot, representative sequence plot and more);
      	\item
Individual longitudinal characteristics of sequences (length, time in each state, longitudinal entropy, complexity, turbulence and more);
		\item
Sequence of transversal characteristics by position (transversal state distribution, transversal entropy, modal state);
		\item
Other aggregated characteristics (transition rates, average duration in each state, sequence frequency);
		\item
A choice of metrics for evaluating distances between sequences.
 \end{itemize}
%
Table~\ref{tab:Overview} gives an overview of the key functions for analyzing state sequences that will be described in the remainder of the article. It is worth mentioning that the package provides also tools for event sequences such as finding the most frequent and most discriminating subsequences and extracting association rules between subsequences, as well as tools for ANOVA-like analyses of sequences \citep{StuderRitschardGabadinhoMuller2011SMR} that will not be discussed here. See the User's guide \citep{Gabadinho_et_al_2009TraMineR-UGuide} for a detailed description of the package usage.

\subsubsection[A first glance at TraMineR]{A first glance at \traminer{}}

To illustrate how easy it is to work with \traminer we consider the \Datasett{mvad} example data set. This data frame is distributed with the library and will serve throughout the article.
It contains the data used by \citet{McVicarAnyadike2002JRSSa} for studying the school-to-work transition in Northern Ireland.
The figures cover 712 individuals, the sequences being their monthly follow-up over the course of 6 years starting in the month where they were first eligible to leave compulsory education (July 1993). Each individual is characterized by a unique identifier, 13 covariates and 72 monthly activity state variables from July 1993 to June 1999. Since the first two months of the follow-up are summer holidays, we look hereafter at trajectories from September 1993 yielding sequences of 70 monthly statuses. The states are school, FE (further education), employment, training, joblessness, and HE (higher education). See Table~\ref{tab:Variables-McVicar} for a description of the variables in \Datasett{mvad}.
%

\begin{table}[tb]

\setlength\TableWidth\linewidth
\addtolength\TableWidth{-4\tabcolsep}

%\begin{center}\sffamily\small
%\begin{tabular}{|l|p{.8\linewidth}|}

%\sffamily\small%
\medskip

\begin{tabular}{|>{\raggedright}p{0.12\TableWidth}|>{\raggedright}p{0.88\TableWidth}|}
\hline
{ id} & { Unique individual identifier}\tabularnewline
%\hline
{ weight} & { Sample weights}\tabularnewline
%\hline
{ male} & { Binary dummy for gender; $1=$ male}\tabularnewline
%\hline
{ catholic} & { Binary dummy for community; $1=$ Catholic}\tabularnewline[1ex]
%\hline
{ Belfast} & { Binary dummies for location of school, one of five Education and Library Board areas}\tabularnewline
%\hline
{ N.Eastern} & {see Belfast %\hspace{2em} in Northern Ireland
}\tabularnewline
%\hline
{ Southern} & {see Belfast}\tabularnewline
%\hline
{ S.Eastern} & {see Belfast}\tabularnewline
%\hline
{ Western} & {see Belfast}\tabularnewline[1ex]
%\hline
{ Grammar} & { Binary dummy indicating type of secondary education; $1=$ grammar school}\tabularnewline
%\hline
{ funemp} & { Binary dummy indicating father's employment status at time of survey; $1=$ father unemployed}\tabularnewline
%\hline
{ gcse5eq} & { Binary dummy indicating qualifications gained by the end of compulsory education; $1=5$ or more GCSEs at grades A-C, or equivalent}\tabularnewline
%\hline
{ fmpr} & { Binary dummy indicating SOC code of father's current or most recent job; $1=$ SOC1 (professional, managerial or related)}\tabularnewline
%\hline
{ livboth} & { Binary dummy indicating living arrangements at time of first sweep of survey (June 1995); $1=$ living with both parents}\tabularnewline[1ex]
%\hline
 { jul93} & { Monthly activity variables coded 1-6; $1=$ school, $2=$ FE, $3=$ employment, $4=$ training, $5=$ joblessness, 6=HE}\tabularnewline
%\hline
{ $\vdots$} & {\vdots}\tabularnewline
%\hline
 { jun99} & {see jul93}\tabularnewline
\hline
\end{tabular}
% \end{center}
\caption{List of variables in the \Datasett{mvad} data set.}
\label{tab:Variables-McVicar}
\end{table}
%
We first show how to build a typology of the observed school-to-work trajectories by clustering the sequences. This is done with the following few steps:

\begin{enumerate}
    \item
    Load the library, retrieve the \Datasett{mvad} data and create a state sequence object from the status variables (columns 17 to 86):
<<install-load, echo=TRUE, results=hide>>=
## install.packages("TraMineR", repos="http://mephisto.unige.ch/traminer/R")
library("TraMineR")
data("mvad")
mvad.alphab <- c("employment", "FE", "HE", "joblessness", "school", "training")
mvad.seq <- seqdef(mvad, 17:86, xtstep=6, alphabet=mvad.alphab)
##mvad.seq <- seqdef(mvad, 17:86, xtstep=6)
@

     \item
     Compute pairwise optimal matching (OM) distances between sequences with an insertion/deletion cost of 1 and a substitution cost matrix based on observed transition rates:
<<mvad-dist, echo=TRUE, results=hide, cache=TRUE>>=
mvad.om <- seqdist(mvad.seq, method = "OM", indel = 1, sm = "TRATE")
@
     \item
     Proceed to an agglomerative hierarchical clustering using the obtained distance matrix, select the four-clusters solution and express it as a factor:
<<mvad-clust-intro, echo=TRUE, results=hide, cache=FALSE>>=
library("cluster")
clusterward <- agnes(mvad.om, diss=TRUE, method="ward")
mvad.cl4 <- cutree(clusterward, k=4)
cl4.lab <- factor(mvad.cl4, labels = paste("Cluster",1:4))
@
     \item
     Visualize the cluster patterns by plotting their transversal state distributions:
<<mvad-clust-seqdplot, echo=TRUE, results=verbatim>>=
seqdplot(mvad.seq, group=cl4.lab, border=NA)
#seqdplot(mvad.seq, group=cluster5, border=NA)
@
\end{enumerate}

%
\begin{figure}[tb]
\begin{center}
\setkeys{Gin}{width=.9\textwidth}
<<label=fig_cluster-seqdplot, fig=TRUE, echo=FALSE, width=7, height=7>>=
<<mvad-clust-seqdplot>>
@
\caption{State distribution plots by cluster.}
\label{fg_cluster-seqdplot}
\end{center}
\end{figure}

We see in Figure~\ref{fg_cluster-seqdplot} that the first cluster is formed essentially by the youngsters who join the workforce soon after ending compulsory school, while those who pursue education are in the second (higher education) and third clusters. The last cluster groups less successful transitions from school to work with long spells of joblessness or training.

To continue, we examine how the diversity of states within each sequence is related to sex, to whether the father is unemployed and to whether the qualification grade at end of compulsory school was good. We compute the longitudinal entropy and regress it on the covariates:
%
<<regression, echo=TRUE, results=hide>>=
entropies <- seqient(mvad.seq)
lm.ent <- lm(entropies ~ male + funemp + gcse5eq, mvad)
@
%\vspace{-2ex}
\begin{table}[tb]
\begin{center}%\sffamily
\label{tab_reg-entropies}
<<regression-output, echo=FALSE, results=tex>>=
##xt1 <- xtable(lm.ent, align="|crrr|", digits=1)
lm.xtable <- xtable(lm.ent, digits=2, align="|r|rrrr|")
rownames(lm.xtable)[2:4] <- c("Male", "Father unemployed", "Good end compulsory school grade")
colnames(lm.xtable)[2:4] <- c("Std error", "$t$ value", "Pr$(>|t|)$")
##print(lm.xtable, floating=FALSE, caption.placement="top", hline.after=c(-1,0,0,nrow(lm.xtable)),
print(lm.xtable, floating=FALSE, caption.placement="top",
  sanitize.text.function = function(x){x})
@
\end{center}
\vskip -2ex
\caption{Regressing entropies on a selection of covariates.}
\end{table}
%
Results show that males experience less diverse states and that youngsters with good grades at the end of compulsory school experience more diverse states. Whether the father is unemployed does not have a significant effect.

\section{Sequence representations}
\label{sec_identifying-formats}
%\index{sequence!formats}

State sequences can be represented in many different ways, depending on the data source and on how the information is organized. Data organization and conversion between formats is discussed in detail in \citet{RitschardGabadinhoStuderMuller2009DManag}, where an ontology of longitudinal data presentations is given that may help identify the kind of data at hand. Here, we limit the discussion to the sequence data representations that \traminer can handle and import. Those formats are listed in Table~\ref{tab:example-formats} together with the conversions that can currently %(v1.6-2)
be done with the provided \Com{seqformat()} function.

\subsection{State sequences}
%
We consider sequences of discrete or categorical data. Formally, we define a state sequence of length $\ell$ as an ordered list of $\ell$ elements successively chosen from a finite set $A$ of size $a=|A|$ that is called the \emph{alphabet}.
A natural way of representing a sequence $x$ is by listing the successive elements that form the sequence $x=(x_{1},x_{2},\ldots ,x_{\ell})$, with $x_{j}\in A$. With reference to this expanded form of sequences, \emph{state sequences} are characterized by two properties. Firstly, they are formed by elements that are states; i.e., something that can last as opposed, for instance, to events that occur at given time points. Secondly, the position of each element conveys meaningful information in terms of age, date or, more generally, elapsed time or distance from the beginning of the sequence.
%
Position indexes providing time information may be either absolute \emph{calendar} values (day, year, month, ...) or relative \emph{process time} (age, process duration, ...).

In \traminer, the expanded form is called \emph{STate-Sequence} (STS) format. In this format, the successive states (statuses) of an individual are given either in consecutive columns, or as a character string with states separated by a given symbol such as `-' or `/', the former being the default separator. Each position (column) is supposed to correspond to a predetermined time unit.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{table}[tb]

\bigskip

\setlength\TableWidth\linewidth
\addtolength\TableWidth{-6\tabcolsep}

	%\textsf
 {\small%
	 \begin{tabular}{|>{\raggedright}m{0.08\TableWidth}|>{\raggedright}m{0.13\TableWidth}|>{\raggedright}m{0.79\TableWidth}|}
		\hline
		{Code} & {Conversion} & {Example} \tabularnewline
		\hline
		%\hline
		STS & from/to &\begin{tabular}{c@{\hspace{2.5em}}cccccccccc}
		{\it Id}  & $18$ & $19$ & $20$ & $21$ & $22$ & $23$ & $24$ & $25$ & $26$ & $27$
		\tabularnewline
		101 & S & S & S & M & M & MC & MC & MC & MC & D \tabularnewline
		102 & S & S & S & MC & MC & MC & MC & MC & MC & MC \tabularnewline
		\end{tabular}
		  \tabularnewline
		\hline
		SPS  & from/to & \begin{tabular}{c@{\hspace{2.5em}}ccccc}
		{\it Id} & $1$ & $2$ & $3$ & $4$ & \tabularnewline
		101 & (S,3) & (M,2) & (MC,4) & (D,1) & \tabularnewline
		102 & (S,3) & (MC,7) &
		\end{tabular}
		  \tabularnewline
		\hline
		DSS & to &\begin{tabular}{c@{\hspace{2.5em}}ccccc}
		{\it Id} & $1$ & $2$ & $3$ & $4$ &  \tabularnewline
		101 & S & M & MC & D \tabularnewline
		102 & S & MC & &
		\end{tabular}
		  \tabularnewline
		\hline
		SPELL & from &\begin{tabular}{c@{\hspace{2.5em}}ccc@{\hspace{1.5em}}l}
		{\it Id} & {\it Index} & {\it From} & {\it To} & {\it State}
		\tabularnewline
		101 & 1 & 18 & 20 & S (single) \tabularnewline
		101 & 2 & 21 & 22 & M (married) \tabularnewline
		101 & 3 & 23 & 26 & MC (married with children) \tabularnewline
		101 & 4 & 27 & 27 & D (divorced) \tabularnewline
		102 & 1 & 18 & 20 & S (single) \tabularnewline
		102 & 2 & 21 & 27 & MC (married with children)
		\end{tabular}
		 \tabularnewline
		\hline
	\end{tabular}
	}
\caption{Sequence data representations; some formats handled by the \Com{seqformat()} function.}
\label{tab:example-formats}
\end{table}


\subsection{Other sequence representations}

Sequence data can be represented in more compact ways than STS essentially by giving only one of several same successive states. In that case, we have to explicitly stamp the successive distinct states with their starting position or duration. Table~\ref{tab:example-formats} displays the same example of two sequences in the different formats. The two considered sequences describe family formation histories of two individuals, the states being single (S), married (M), married with children (MC) and divorced (D).

A first efficient way of representing a state sequence is by listing the distinct successive states with their associated durations. We get thus a sequence of couples $(x_j, t_j)$ where $x_j$ is a state and $t_j$ its duration; i.e., the number of times it is repeated. This is the \emph{State-Permanence-Sequence} (SPS) format \citep{Aassve-2007}. By considering only the distinct successive states without their associated durations, we get the \emph{Distinct-Successive-States} (DSS) sequence format. This DSS form holds the basic state sequencing information, but loses all time ($t_j$) and, more generally, alignment data.

In the SPELL format there is one line for each spell. Each spell is characterized by the state (supposed constant during the spell) and the spell start and end times. STS sequences can easily be derived from such representation.

\section{State sequence objects}
\label{sec_seq_object}

The general philosophy of the library is to ensure that the various results and plots outputted for a same set of sequence data use the same state labels and colors. Likewise, any information on case weights and possible missing information about some positions in sequences should be treated the same way throughout the analysis.
To achieve this goal, the \traminer functions for state sequence analysis require as an argument a \emph{state sequence object} that includes both the sequential data and its attributes.
%
Thus, the first step when using \traminer for state sequence analysis is to create a \emph{state sequence object}. This is done with the \Com{seqdef()} function from data organized in either of the STS, SPS or SPELL forms described in the previous section.
%discussed previously.
We show below how to create a state sequence object from the \Datasett{mvad} data set introduced in Section~\ref{sec_overview}.
%
The main attributes are listed in Table~\ref{tab:attributes}, together with their default values and the dedicated functions to retrieve or set them.

\begin{table}[tb]
\bigskip
\setlength\TableWidth\linewidth%
\addtolength\TableWidth{-10\tabcolsep}%
%
%{\sffamily\small
\begin{tabular}{|p{.14\TableWidth}|p{.28\TableWidth}|p{.13\TableWidth}|p{.30\TableWidth}|p{.15\TableWidth}|}
\hline
  {Attribute} & {Description} & {Argument} & {Default} & {Retrieve/set}\tabularnewline
\hline
%\hline
  alphabet        & List of states           & \Comt{states}        & From input data        & \Comt{alphabet()} \tabularnewline
  cpal            & Color palette            & \Comt{cpal}          & \pkg{RColorBrewer} palette & \Comt{cpal()} \tabularnewline
%  missing.color   & missing state color      & missing.color & "darkgrey"        & \tabularnewline
  labels          & Long state labels        & \Comt{labels}        & From input data        & \Comt{stlab()} \tabularnewline
  cnames          & Position names           & \Comt{cnames}        & From input data        & \Comt{names()} \tabularnewline
  row.names       & Row (sequence) labels    & \Comt{id}            & From input data        & \Comt{rownames()} \tabularnewline
%  nr              & symbol for missing value & nr            & "*"               &  \tabularnewline
%  void            & symbol for void element  & void          & "\%"              & \tabularnewline
  weights         & Optional case weights    & \Comt{weights}       & \Comt{NULL}            & \tabularnewline
\hline
\end{tabular}
%}
\caption{Main sequence object attributes.}
\label{tab:attributes}
\end{table}


\subsection{Creating state sequence objects}

In the \Datasett{mvad} data set, the retained activity status variables are stored in columns 17 to 86. We display these statuses for the first six considered months (September 1993 to February 1994) of the first two records:
%
<<mvad-extract, echo=TRUE, results=verbatim, quiet=FALSE>>=
mvad[1:2, 17:22]
@
%
%%%%%%%%%%%%%%%%%%%%
The default input format for the \Comt{seqdef()} function is STS, which is appropriate for the \Datasett{mvad} data set. If the input data is in another format it must be specified with the \Com{informat} argument and \Comt{seqdef()} will automatically make the required conversion.

\subsubsection{Alphabet and state labels}

The alphabet is the list of states allowed in the sequences. Both short and long labels of the states forming the alphabet are attached to the object. Long labels serve mainly for color legends in plots, while short state names are primarily used in printed outputs. Shorter names produce cleaner and shorter output when printing the sequences.

By default, the sorted list of the distinct states found in the input data (as returned by the \code{seqstatl()} function) defines the alphabet and is used as state names and labels. This can be changed with optional arguments, which is necessary, for example, when the alphabet contains states that do not appear in the retained sequences.

Below we specify short state names with the \Com{states} argument and long state labels with \Com{labels}. These arguments expect vectors of names or labels that are ordered conformably with the alphabet. The \Comt{alphabet} argument can be used to change the order of the states, in which case the vectors passed with the \Com{states} and \Com{labels} arguments should conform to the newly defined order.
%
<<reducing_width, echo=FALSE, results=hide>>=
## oopt <- options(width=60)
@

<<mvad-seq, echo=TRUE, results=verbatim, quiet=FALSE>>=
mvad.lab <- c("Employment", "Further education", "Higher education", "Joblessness", "School", "Training")
mvad.scode <- c("EM", "FE", "HE", "JL", "SC", "TR")
mvad.seq <- seqdef(mvad, 17:86, alphabet=mvad.alphab, states = mvad.scode,
     labels = mvad.lab, xtstep=6)
@

% RENAMING gcse5eq variable LEVELS
<<mvad-gcse5eq-levels, echo=FALSE, results=hide, quiet=FALSE>>=
levels(mvad$gcse5eq) <- c("<5 GCSEs", ">=5 GCSEs")
@
% $

<<resetting_width, echo=FALSE, results=hide>>=
## options(oopt)
@
%
Now the sequences are stored in the \Comt{mvad.seq} sequence object. We can display them in the concise SPS representation with:
%
<<mvad-seq-print, echo=TRUE, results=verbatim, quiet=FALSE>>=
print(mvad.seq[1:5, ], format="SPS")
@
\label{r-mvad.seq-first-5}

\subsection{Other important attributes and properties}
\label{sec_Other_Attributes}

We briefly comment here upon some other important attributes that will be used in conjunction with the alphabet and state labels by \traminer{}'s functions.

\subsubsection{State colors and position names}
The sequence plot functions provided by the library need a distinct color for each state. A \textit{color palette} is therefore attached to the sequence object. A default color palette from \pkg{RColorBrewer} \citep{Neuwirth2007} is automatically selected as long as the alphabet size does not exceed 12.  \textit{Position names} serving mainly for labeling the ticks of the $x$-axis but that are also useful for increasing readability of tabulated output are also an attribute of the object. If left unspecified, position names are taken from the corresponding column names of the original data frame. The interval between the $x$-axis tick-marks is an additional attribute that can be set (\Com{xtstep} argument) for optimizing rendering.

\subsubsection{Case weights}

Survey data often come with \textit{case weights} that account for the sampling scheme and unit non-responses. Using such case weights is important to compensate for sampling bias and thus get results that are more realistic. When weights are attached to the state sequence object, the \traminer functions that can handle weights automatically produce weighted results. To disable the use of weights, add option \Comt{weighted=FALSE} to the function.

The \Comt{weight} variable in \Datasett{mvad} contains case weights that account for the selective attrition during the survey and we attach them to the sequence object as shown below. Unless otherwise specified, we will use this weighted sequence object from here on.\label{code_mvad.seq-weight}
%
<<seqdef-weighted, echo=TRUE, results=verbatim>>=
mvad.seq <- seqdef(mvad, 17:86, alphabet=mvad.alphab, states = mvad.scode,
     labels = mvad.lab, weights=mvad$weight, xtstep=6)
@
%$


\subsubsection{Missing values}

Missing values in the expanded (STS) form of a sequence occur, for example, when:
%
\begin{itemize}
\item Sequences do not start on the same date while using a calendar time axis;
\item The follow-up time is shorter for some individuals than for others yielding sequences that do not end up at the same position;
%\item Sequences are right or left censored.
\item The observation at some positions is missing due to nonresponse, yielding internal gaps in the sequences.
\end{itemize}
%
The way missing values should be handled may be different for each of these situations. In the first case, we may want to maintain explicitly the starting missing values to preserve alignment across sequences or possibly left-align sequences by switching to a process time axis. In the second case, the ending missing terms could just be ignored.

To allow such differentiated treatments, \traminer distinguishes \emph{left}, \emph{in-between} and \emph{right} missing values.
We can specify how each of the missing types should be encoded with the \Comt{left}, \Comt{gaps} and \Comt{right} arguments.
%
By default, gaps and left-missing states are coded as explicit missing values while all missing values encountered after the last valid (rightmost) state in a sequence are considered void elements; i.e., the sequence is considered to end after the last valid state.

The specific treatment of each type of missing value will depend upon whether the analysis method envisaged supports missing values; and, if yes, which kind it supports. Most of the proposed functions, such as \Comt{seqdist()} for computing distances between sequences, have optional arguments for dealing with missing states.

\subsubsection{Subsets and attributes inheritance}
\label{secsub_subsets_of_seq}

Subsets of sequence objects can be defined by specifying row and column indexes (or names) as for \R matrices and data frames. Every subset of a state sequence object inherits its `parent' attributes. The alphabet and color palette, for instance, remain the same for all subsets. This is of particular importance when comparing graphics that render different subsets of a same sequence object.


% ==============================================
\section{Visualizing individual state sequences}
\label{sec:Visualizing-individual-sequences}

State sequence visualization is one of the most important features of the package.  This section introduces two basic plotting functions, namely the index plot intended to render a set or subset of individual state sequences and the frequency plot that visualizes them according to their frequencies. We explain also the common design of most of \traminer{}'s plotting functions.



\subsection{Sequence index plots}

A \emph{sequence index plot} (Figure~\ref{fg_seqiplot_mvad}) individually renders the selected state sequences. Each of them is represented by horizontally stacked boxes that are colored according to the state at the successive positions. The resulting bars are put above each other to vertically align the positions. We thus visualize, for each case, the individual longitudinal succession of states as well as, through the length of each color segment, the duration spent in each successive state. The alignment also permits easy transversal comparisons at each position.
%
<<iplot-A, echo=FALSE, results=hide, fig=FALSE>>=
seqiplot(mvad.seq, border=NA, with.legend="right")
@
%
\begin{figure}[tb]
\begin{center}\vskip-4ex
\setkeys{Gin}{width=.85\textwidth}
<<label=fig_iplot-A, fig=TRUE, echo=FALSE, width=9, height=5.5>>=
<<iplot-A>>
@
\setkeys{Gin}{width=.8\textwidth}
\vskip -3ex
\caption{Sequence index plot of sequences 1 to 10.}
\label{fg_seqiplot_mvad}
\end{center}
\end{figure}
%
The \emph{sequence index plot} shown in Figure~\ref{fg_seqiplot_mvad} was obtained with the command below:%
\footnote{\Comt{seqiplot()}, as most other plotting functions described in this paper, is just an alias for calling a generic \Comt{seqplot()} state sequence plot function with the appropriate \Comt{type} argument and suitable default option values. The \Com{border=NA} option suppresses the border that surrounds, by default, each unit state in the sequence.}
%
<<iplot-Aa, echo=TRUE, results=hide, fig=FALSE>>=
<<iplot-A>>
@
%
%
Since we have attached case weights to the \Comt{mvad.seq} sequence object, the width of the bar representing each sequence is proportional to its weight. This default behavior could be changed with the \Com{weighted=FALSE} argument. The plotted sequences are selected with the \Com{idxs} argument by providing either a vector of indexes, or 0 for requesting all the sequences. %For example \Com{idxs=c(141,10,23)} will plot sequences 141, 10 and 23 in that order and \Com{idxs=seq(from=1,to=711,by=10)} will plot every 10 sequences.
The default value is \Comt{1:10} and Figure~\ref{fg_seqiplot_mvad} displays therefore only the first 10 sequences of \Comt{mvad.seq}.

The \Comt{seqIplot()} alias produces full index plots that display all the sequences in the set without spaces between sequences and without borders around unit states. The usefulness of such plots has, for instance, been stressed by \citet{Scherer2001} and \citet{Brzinsky-FayKohlerLuniak2006}.
However, when the number of displayed sequences is large, they may produce burden pictures that are often hard to interpret.%
\footnote{When plotting several hundred of sequences, saving index plots may also produce heavy files in vectorial formats such as PostScript and PDF; generating plots in bitmap formats such as PNG or JPEG is recommended in such cases.}
We can partially overcome this drawback by sorting the sequences according to the values of a suitably chosen covariate---passed with the \Comt{sortv} argument. Good choices are, for instance, the distance to the most frequent sequence or the scores of a multidimensional scaling analysis%
\footnote{The scores are obtained from the dissimilarity matrix with the \Comt{cmdscale()} function.}
of the dissimilarities between sequences (Figure~\ref{fg_mvad-full-seqIplot}). Both solutions suppose that we can compute such dissimilarities; this will be addressed in Section~\ref{sec_similarities}.

<<Iplot-mdscale, echo=FALSE, results=hide, fig=FALSE, cache=TRUE>>=
mvad.lcs <- seqdist(mvad.seq, method="LCS")
mds <- cmdscale(mvad.lcs, k=1)
dref <- seqdist(mvad.seq, refseq=0, method="LCS")
@

<<Iplot-sorted, echo=FALSE, results=hide, fig=FALSE>>=
png(file=paste(graphdir,"png_mvad_seqIplot-sorted.png",sep=""), unit="px", width=1600, height=700, pointsize=30)

par(mfrow=c(1,3))
seqIplot(mvad.seq, main="unsorted", with.legend=FALSE)
seqIplot(mvad.seq, sortv=dref, main="by distance to most frequent", with.legend=FALSE)
seqIplot(mvad.seq, sortv=mds, main="by 1st MDS factor", with.legend=FALSE)
dev.off()
@


\begin{figure}[tb]
%	\begin{center}
		\setkeys{Gin}{width=1\textwidth}
		\includegraphics{png_mvad_seqIplot-sorted}
		\setkeys{Gin}{width=.8\textwidth}
		\vspace{-5ex}

		\caption{Unsorted and sorted full-sequence index plots.}
		\label{fg_mvad-full-seqIplot}
%	\end{center}
\end{figure}

\subsection{Sequence frequencies}
\label{sec_sequence-frequencies}

The \Com{seqtab()} function returns a table with the counts and percent frequencies of the sequences sorted in decreasing order of their frequencies. In the next example, we request the four most frequent sequences of \Comt{mvad.seq} with \Comt{idxs=1:4}. In the printed outcome, sequences are displayed in the shorter and more readable SPS format:


<<seqtab, echo=TRUE, results=verbatim>>=
seqtab(mvad.seq, idxs=1:4)
@
The most frequent sequence in the \Comt{mvad.seq} object is a spell of two years of school followed by 46 months of higher education. It accounts, however, for only 4.7\% of the total weights of the 712 cases considered. The second most frequent sequence, which concerns 3.5\% of the weighted individuals, is indeed very similar to the previous one.
%
\subsubsection{Sequence frequency plots}

A graphical view of the sequence frequency table where bar widths are proportional to the frequencies is obtained with the \Comt{seqfplot()} function. Figure~\ref{fg_seqfplot-mvad} shows the plot of the weighted and unweighted frequencies%
\footnote{Remember that we attached case weights to our sequences; the frequencies are thus weighted by default.
Since we have to issue two separate \Comt{seqfplot()} commands, we use \Comt{par(mfrow=...)} for arranging the plots and, therefore, we deactivate the automatic display of the legend that is not compatible with it (see below).
}
obtained with:%
<<reducing_width, echo=FALSE, results=hide>>=
oopt <- options(width=60)
@
<<seqfplot, echo=TRUE, results=hide>>=
par(mfrow=c(1,2))
seqfplot(mvad.seq, border=NA, with.legend=FALSE,
	main="Weighted frequencies")
seqfplot(mvad.seq, weighted=FALSE, border=NA, with.legend=FALSE,
	main="Unweighted frequencies")
@
<<resetting_width, echo=FALSE, results=hide>>=
options(oopt)
@

% $
\begin{figure}[tb]
\center
 \setkeys{Gin}{width=.9\textwidth}
<<fig-seqfplot, echo=FALSE, results=hide, fig=TRUE, width=10, height=5>>=
<<seqfplot>>
@
\vskip -3ex
\caption{Weighted and unweighted sequence frequency plots.}
\label{fg_seqfplot-mvad}
 \setkeys{Gin}{width=.8\textwidth}
\end{figure}
%
By default, only the 10 most frequent sequences are shown. %which can be changed with the \Com{idxs} argument.
The $y$-axis indicates the cumulative percentage of the represented sequences.
%
If we look at the unweighted results, the most frequent sequence is to stay employed during the entire follow-up period (be in state EM during 70 months). This sequence, which was the fourth most frequent in the weighted frequency table with 2.5\% of the total weight, accounts for 7\% of the 712 cases considered.

The probability for two individuals to follow exactly the same 70-month trajectory is small, yielding a large number of different patterns.  %As in many applications, there is a high diversity of sequences with
The 10 most frequent sequences account for only about 20\% of all the trajectories, which reflects this high diversity.

\subsection{Reading and controlling state sequence plots}

The way index plots render individual state sequences with horizontally stacked boxes is common to other functions of the library that visualize specific state sequences. The position in the sequence is read on the $x$-axis.
The first value on this axis is the selected origin. The sequence is read from left to right in the same way as printed outputs. %Colors used for representing the states and
Tick labels for the $x$-axis are retrieved, by default, from the plotted sequence object.

The values on the $y$-axis are the indexes of the plotted sequences. The index refers to the considered ranking of the sequences. For instance, in \emph{sequence index plots}, the default order is that in the state sequence object unless a specific sort variable is provided with the \Comt{sortv} argument. In \emph{sequence frequency plots}, sequences are sorted according to their frequency in the data set, while in \emph{representative sequence plots} (Section~\ref{sec_representative-sequences}), sequences are sorted according to their representativeness score.

The indexes on the $y$-axis---and hence the sequences---are displayed bottom-up. Thus, when sequences are sorted, the first ranked one is at the bottom of the plot.%
\footnote{This can be changed by with the \Comt{idxs} argument.}
This respects the usual standard for $y$-axes. It may, however, be confusing when compared with the corresponding printed outputs where sequences are displayed top-down.

Other aspects of the graphic (title, font size, axes display, axis label, state legend, ...) can be controlled with dedicated options described in detail in the reference manual. There is also an option to produce separate plots by levels of a covariate.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Computing and plotting overall and transversal statistics}
\label{sec_describing-visualizing-sequences}

We now turn to the facilities offered by \traminer for visualizing and computing overall and transversal descriptive statistics of a set of sequences. The functions discussed here all require a state sequence object as main argument and admit a series of optional parameters. We illustrate with the \Comt{mvad.seq} weighted sequence object created on page~\pageref{code_mvad.seq-weight}.


\subsection{Overall statistical characteristics}
\label{sec_overall-statistics}

We consider, first, global synthesized information that is based neither on individual longitudinal characteristics nor on transversal characteristics by position. More specifically, we focus on the overall state distribution and transition rates between states.

\subsubsection{Mean time spent in each state}
%
A first synthetic information is given by the mean---not necessarily consecutive---time spent in the different states; that is, the mean number of times each state is observed in a sequence. This characterizes the overall state distribution. As an example, we plot the mean times for two subsets defined by the \Comt{funemp} covariate that indicates whether the respondent's father was unemployed at the time of the survey (Figure~\ref{fg_mvad_mean_time}). The graphic with the distinct plots by levels of  the \Comt{funemp} covariate is obtained by passing \Comt{funemp} as \Comt{group} argument to the plotting function. This option is common to all the plotting functions presented in this article.
%
<<plot-mean-time, echo=TRUE, results=hide, fig=FALSE>>=
seqmtplot(mvad.seq, group=mvad$funemp, ylim=c(0,30))
@
% $
%
\begin{figure}[tb]
\begin{center}
\setkeys{Gin}{width=1\textwidth}
<<label=fig_plot-mean-time, fig=TRUE, echo=FALSE, width=10, height=5.5>>=
seqmtplot(mvad.seq, group=mvad$funemp, ylim=c(0,30))
@
\setkeys{Gin}{width=.8\textwidth}
\caption{Mean time spent in each state by father's unemployment status.}
\label{fg_mvad_mean_time}
\end{center}
\end{figure}
% $
We can see that the mean time spent in joblessness and training is higher for interviewees with unemployed fathers, while the time they spent in `school', `further education' and `higher education', is lower.%

Mean time values are obtained with the \Comt{seqmeant()} function.%
\footnote{Individual time spent in each state can be obtained with \Comt{seqistatd()}.}
However, unlike for graphical displays, functions returning statistics and sequence characteristics do not have a \Comt{group} argument.
%
We can retrieve the values by levels of a covariate with the row indexing mechanism%
\footnote{
\Comt{seqmeant(mvad.seq[mvad\$funemp=="yes",])} \ and \
\Comt{seqmeant(mvad.seq[mvad\$funemp=="no",])}.
}
or with the \Comt{by()} function:
%
% Commented out, because does not work with TraMineR 1.9
<<mean-time, echo=TRUE, results=hide, fig=FALSE>>=
by(mvad.seq, mvad$funemp, seqmeant)
@
% $

\subsubsection{Transition rates}
\label{sec_transition_rates}
%\index{transition rates}

Another interesting information about a set of sequences is the transition rate between each couple of states $(s_i, s_j)$; i.e., the probability to switch at a given position from state $s_i$ to state $s_j$. Let $n_t(s_i)$ be the number of sequences that do not end in $t$ with state $s_i$ at position
$t$ and let $n_{t,t+1}(s_i,s_j)$ be the number of sequences with state $s_i$ at position $t$ and state $s_j$ at position $t+1$. The transition rate $p(s_j\mid s_i)$ between states $s_i$ and $s_j$ is obtained as
%
\[
 p(s_j \mid s_i)=\dfrac{\sum _{t=1} ^{L-1} n_{t,t+1}(s_i,s_j)}{\sum _{t=1} ^{L-1} n_{t}(s_i)} \enspace .
\]
with $L$ the maximal observed sequence length.

The \Com{seqtrate()} function returns the matrix of transition rates
for the provided sequence object. By default, the rates are assumed position-independent; i.e., the same whatever $t$. The outcome is a single matrix where each row $i$ gives a transition distribution from the originating state $s_i$ in $t$ to the states in $t+1$; that is, each row total equals one. Hence, transition rates provide information about the most frequent state changes observed in the data together with, on the diagonal, an assessment of the stability of each state.

In the following example we compute the transition rate matrix for the \Comt{mvad.seq} sequence object:
%
<<seqtrate, echo=TRUE, results=verbatim>>=
mvad.trate <- seqtrate(mvad.seq)
round(mvad.trate,2)
@
\label{comment_transition_rates}
We learn from this outcome that the highest transition rates (0.04) are observed between states JL (joblessness) and EM (employment), and between states TR (training) and EM. Moreover, states JL and TR are the most unstable with a probability of 0.06 ($1.0-0.94$) to leave the state at each position $t$.

Time-varying transition rates can be obtained with option \Comt{time.varying=TRUE}, in which case a 3-dimensional array with a distinct transition rate matrix for each of the positions $t=1, 2, \ldots, L-1$ is returned. The matrix for position $t$ is computed by considering only the states at $t$ and $t+1$. The third dimension of the array corresponds to the position $t$ index.

%%%%%%%%%%%%%%%%%%%%%%
\subsection{Transversal state distributions}
\label{sec_state-distribution}

Time varying transition rates are transversal characteristics computed at the successive considered positions. In the same vein, it is of interest to look at the transversal distribution of the states at each position of the considered sequences.
The \Com{seqstatd()} function returns a $a \times L$ table containing in each column the distribution among the $a$ states of the alphabet at the corresponding position in the sequence. The output of this function for the first height months in \Comt{mvad.seq} is shown below:
%
<<seqstatd, echo=TRUE, results=verbatim>>=
seqstatd(mvad.seq[,1:8])
@
%
\subsubsection{State distribution plot}

The \Com{seqdplot()} function generates a graphical view of the (weighted) state distributions. We illustrate by generating plots by values of the \emph{gcse5eq} (qualification gained at end of compulsory school) covariate:
%
<<seqdplot-mvad, echo=TRUE, results=hide>>=
seqdplot(mvad.seq, group=mvad$gcse5eq, border=NA)
@
% $
%
\begin{figure}[htb]
\center
 \setkeys{Gin}{width=1\textwidth}

<<seqdplot-mvad-fig, echo=FALSE, results=hide, fig=TRUE, width=10, height=5.5>>=
<<seqdplot-mvad>>
@
%
\caption{Transversal state distributions by end of compulsory school qualification group.}
\label{fg_state-dist-mvad}
 \setkeys{Gin}{width=.8\textwidth}
\end{figure}
%$
The result shown in Figure~\ref{fg_state-dist-mvad} exhibits significant jumps in the sequence of state distributions. As  commented by \citet{McVicarAnyadike2002JRSSa}, they correspond mainly to the beginning and ending of education cycles.

A state distribution plot, as produced by \Com{seqdplot()}, displays the general pattern of the whole set of trajectories. When interpreting such graphics, one must remember that, unlike sequence index plots and sequence frequency plots, they do not render individual sequences or individual follow-ups. They provide aggregated views made of successive slices, each of which represents transversal characteristics.

\subsubsection{Sequence of modal states}

An interesting summary that can be derived from the state distributions is the sequence made of the most frequent state at each position. It is obtained with the \Com{seqmodst()} function and plotted with \Com{seqmsplot()}. Figure~\ref{fg_modalstate} shows how such modal state sequences are displayed.
%
<<seqmodst, echo=FALSE, results=verbatim, fig=FALSE>>=
seqmsplot(mvad.seq, group=mvad$gcse5eq, border=NA)
@
% $
%
\begin{figure}[tb]
\center
 \setkeys{Gin}{width=1\textwidth}
<<seqmsplot, echo=FALSE, results=hide, fig=TRUE, width=10, height=5.5>>=
<<seqmodst>>
@
\caption{Modal state sequence  by end of compulsory school qualification group.}
\label{fg_modalstate}
 \setkeys{Gin}{width=.8\textwidth}
\end{figure}
%
The height of the bar at each position is proportional to the frequency of the displayed state at that position. The number of occurrences of the modal state sequence is also displayed. Since the shown sequences of modal states do not belong to the  sequence dataset, the number of occurrences is 0 for both considered groups.

\subsubsection{Transversal entropy of state distributions}
\label{secsub_transversal_entropy}
%
In addition to the state distribution, the \Comt{seqstatd()} function provides for each position in the sequence the number of valid states and the Shannon entropy\index{entropy!at each time point} of the transversal state distribution. Shannon's entropy, also known as the \emph{entropy index}, has been applied to social science data by, for instance, \citet{Billari-2001a} and \citet{Fussell-2005}. Letting $p_i$ denote the proportion of cases in state $i$ at the considered position, the entropy is
\[
	h(p_1,\ldots,p_a) = -\sum_{i=1}^a p_i \log(p_i)
\]
where $a$ is the size of the alphabet. The entropy is 0 when all cases are in the same state and is maximal when we have the same proportion of cases in each state. The entropy can be seen as a measure of the diversity of states observed at the considered position.
%
%
<<trans-entropy, echo=FALSE, results=hide, fig=FALSE>>=
seqHtplot(mvad.seq, group=mvad$gcse5eq)
@
% $
\begin{figure}[tb]
\center
 \setkeys{Gin}{width=1\textwidth}
<<hist-trans-entropy, echo=FALSE, results=hide, fig=TRUE, width=12, height=5.5>>=
<<trans-entropy>>
@
\vskip -5ex
\caption{Transversal entropy by end of compulsory school qualification group.}
\label{fg_entropy_state-dist-mvad}
 \setkeys{Gin}{width=.8\textwidth}
\end{figure}

Plotting the transversal entropies can be useful to find out how the diversity of states evolves along the time axis. We plot transversal entropies with  \Com{seqHtplot()}.
%
Figure~\ref{fg_entropy_state-dist-mvad} shows the curves by end of compulsory school qualification group. For the first group, the entropy of the state distributions noticeably decreases at the end of the follow-up period. This is a consequence of the increasing proportion of youngsters entering into full employment (Figure~\ref{fg_state-dist-mvad}). For the second group, the entropy index slightly increases at the end of the considered period, which may be explained by the emergence of two balanced subgroups, namely those who continue higher education and those who enter into employment.

%%%%%%%%%%%%%%%%%%%%
\section{Individual sequence characteristics}
\label{sec_sequence-complexity}

% Example sequences
<<data-ex1, echo=FALSE, results=hide>>=
e1m1 <- c("A","A","A","A","A","A","A","A","A","A","A","A")

## 2 states
e2m1 <- c("A","A","A","A","A","A","B","B","B","B","B","B")
e2m2 <- c("A","B","A","B","A","B","A","B","A","B","A","B")
e2m3 <- c("A","A","A","A","A","A","A","A","A","A","A","B")
e2m4 <- c("A","B","B","B","B","B","B","B","B","B","B","A")
e2m5 <- c("A","A","A","A","B","B","B","B","A","A","A","A")
e2m7 <- c("A","A","B","B","A","A","B","B","A","A","B","B")
e2m8 <- c("A","B","B","A","A","B","B","A","A","B","B","A")
e2m9 <- c("A","B","A","A","A","A","A","A","A","A","B","A")
e2m10 <- c("A","B","A","B","A","B","A","B","A","A","A","A")

## 3 states
e3m1 <- c("A","A","A","A","B","B","B","B","C","C","C","C")
e3m2 <- c("A","B","C","A","B","C","A","B","C","A","B","C")
e3m3 <- c("A","A","A","A","A","A","A","A","A","A","B","C")
e3m4 <- c("A","B","B","B","B","B","C","C","C","C","C","A")
e3m5 <- c("A","A","A","A","A","B","C","A","A","A","A","A")
e3m6 <- c("A","B","C","B","A","C","A","C","B","C","B","A")
e3m7 <- c("A","A","B","B","C","C","A","A","B","B","C","C")
e3m8 <- c("A","B","C","C","B","A","A","B","C","C","B","A")
e3m9 <- c("A","B","C","A","A","A","A","A","A","C","B","A")
e3m10 <- c("A","B","C","A","B","C","A","B","A","A","A","A")

## 4 states
e4m1 <- c("A","A","A","B","B","B","C","C","C","D","D","D")
e4m2 <- c("A","B","C","D","A","B","C","D","A","B","C","D")
e4m3 <- c("A","A","A","A","A","A","A","A","A","B","C","D")
e4m4 <- c("A","B","B","B","C","C","C","C","D","D","D","A")
e4m5 <- c("A","A","A","A","B","C","D","A","A","A","A","A")
e4m6 <- c("A","B","C","D","B","A","C","D","A","C","B","D")
e4m7 <- c("A","A","B","B","C","C","D","D","A","A","B","B")
e4m8 <- c("A","B","C","D","D","C","B","A","A","B","C","D")
e4m9 <- c("A","B","C","D","A","A","A","A","D","C","B","A")
e4m10 <- c("A","B","C","D","A","B","C","D","A","A","A","A")

##
ex.comp <- rbind(e1m1,
	e2m3,e2m1,e2m5,e2m4,e2m9,e2m10,e2m2,
	e3m3,e3m1,e3m5,e3m4,e3m9,e3m10,e3m6,e3m2,
	e4m3,e4m1,e4m5,e4m4,e4m9,e4m10,e4m6,e4m2)

## we keep sequence names to identify them
seqnames <- rownames(ex.comp)

ex.comp <- ex.comp[c(1,2,3,4,8,17,18,19,24),]

ex.comp <- seqdef(ex.comp, id="auto")
@

We focus now on the characterization and summarization of longitudinal characteristics of individual sequences.  Essentially, the aim is to define measures that inform on how each sequence is constituted; i.e., on whether it takes a simple or more complex form.

The interpretation of complexity indexes will depend indeed on the context. Consider for instance the  number of transitions---changes of state---in a sequence. When looking at work trajectories, for example, sequences with numerous transitions may correspond to unusual disrupted  trajectories. In other contexts such as family formation, sequences with fewer transitions may indicate that an individual failed to pass through the usual stages of the family formation (leaving the parental home, cohabitation with a partner, birth of one or more children, etc...).

In the SPS form (see Section~\ref{sec_identifying-formats}) a state sequence is represented as an ordered list of successive distinct states with their associated durations; i.e., as a sequence of couples $(x_j,t_{j})$ where $x_j$ is a state and $t_{j}$ its duration. This suggests that we can distinguish characteristics of the state sequencing---the distinct successive states (DSS)---from those of the durations.%
\footnote{The two pieces of information can be extracted separately with \Com{seqdss()} and \Com{seqdur()}.}
We first examine two indicators of the state sequencing and one based on the durations. More synthetic measures are addressed in Section~\ref{sec_composite-cplx-measures}.

\subsection{Unidimensional indicators}

\subsubsection{Number of transitions}

Perhaps the simplest indicator is the number of transitions in the sequence; i.e., the number of state changes.  The number of transitions in a sequence $x$ is readily obtained from the length $\ell_d(x)$ of its DSS sequence. It is $\ell_d(x)-1$. We get the number of transitions for each sequence of state sequence object with \Comt{seqtransn()}.
%
%
\begin{figure}[tb]
\begin{center}
\setkeys{Gin}{width=1\textwidth}
<<label=fig_comp_ex, fig=TRUE, echo=FALSE, width=10, height=5.5>>=
par(mfrow=c(1,2))
seqiplot(ex.comp, border=NA, with.legend=FALSE, idxs=0, main="Example sequences")
ex.comp.indic <- cbind(seqtransn(ex.comp, norm=TRUE), seqient(ex.comp), seqST(ex.comp)/max(seqST(ex.comp)), seqici(ex.comp))
barplot(t(ex.comp.indic),
 col=c("red","blue","cyan", "yellow"), horiz=TRUE, beside=TRUE,
 ## xlim=c(0,0.4),
 main="Longitudinal characteristics")
legend(x="bottomright", lwd=6,
	legend=c("Transitions", "Entropy", "Turbulence", "Complexity"),
	col=c("red","blue","cyan", "yellow")
)
@
\setkeys{Gin}{width=.9\textwidth}
\vskip -3ex
\caption{Example sequences ($\ell=12$,  $a=4$) and normalized values of complexity measures.}
\label{fg_complexity_measures}
\end{center}
\end{figure}
%

\subsubsection{Number of subsequences}
\label{sec_nbr_subsequences}
The number $\phi(x)$ of subsequences that can be extracted from the DSS sequence provides also useful information on the  sequencing of the states. This measure is returned by the \Com{seqsubsn()} function. It is used in the turbulence measure presented below. A subsequence $y$ of $x$ is composed of elements of $x$ occurring in the same order than in $x$.

The maximal number of subsequences is reached only for a sequence made of the repetition of the alphabet. In Figure~\ref{fg_complexity_measures}, for example, sequences 5 and 9  have the maximal number of transitions, while the  number of subsequences is maximal for sequence 9 only.

% ========================================
\subsubsection{Within sequence entropy}

Regarding the durations, we consider the total time spent in each state; i.e., in case of multiple spells in a same state, the sum of the lengths of these spells. For example, in \Sexpr{print(mvad.seq[1,], format="SPS")}, the first sequence in the object \textit{mvad.seq}, there are two spells in state EM with respective durations 4 and 64. Hence, the time spent in state EM is 68 months, as shown by the output of the \Com{seqistatd()} function: % The outcome is a matrix with a column for each of the states.
% ########## SWEAVE ##########
<<actcal-seqistatd, echo=TRUE, results=verbatim>>=
seqistatd(mvad.seq[1:4,])
@
% ############################

The total time spent in each state characterizes the state distribution within a sequence. The entropy of this distribution can be seen as a measure of the diversity of its states. We call it \emph{within} or \emph{longitudinal entropy} to distinguish from the transversal entropy considered in Section~\ref{sec_state-distribution} on page~\pageref{secsub_transversal_entropy}.

The \Comt{seqient()} function returns the longitudinal Shannon entropies; i.e., for each sequence the value of
\[
	h(\pi_1,\ldots,\pi_a)=-\sum_{i=1}^a\pi_i\log \pi_i
\]
%
where $a$ is the size of the alphabet and $\pi_i$ the proportion of occurrences of the $i$th state in the considered sequence. When the state remains the same during the whole sequence, the entropy equals 0, while the maximum entropy is reached when the same time is spent inside the sequence in each possible element of the alphabet. By default the entropy is normalized by dividing the value of $h(\pi_1,\ldots,\pi_s)$ by its theoretical maximum, $\log a$.%
\footnote{The latter is indeed an upper bound for the maximal entropy. It is exactly the maximal possible entropy only when the sequence length is a multiple of the alphabet size. Use \Com{norm=FALSE} to disable normalization.}
%
<<seqient-mvad, echo=FALSE, results=hide>>=
mvad.ient <- seqient(mvad.seq)
@
%
Figure~\ref{fg_complexity_measures} helps to get a more concrete idea of what the entropy measures. We see that the within-sequence entropy does not account for the state order in the sequence. For instance, sequences 7 and 9 have the same maximal normalized entropy of 1.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Composite complexity measures}
 \label{sec_composite-cplx-measures}

The previous measures are based either on the sequencing or on the durations. We look now at composite measures that account simultaneously for those two aspects.

\subsubsection{Turbulence}
\label{sec_sequence_turbulence}

The \emph{turbulence} $T(x)$ of a sequence $x$ is a composite measure proposed by Elzinga \citep{ElzingaLiefbroer2007EJP} that accounts for the number $\phi(x)$ of distinct subsequences of the DSS sequence and the variance $s_{t}^{2}(x)$ of the consecutive times $t_j$ spent in the $\ell_d(x)$ distinct states. The formula is
%
\[
T(x)=\log_{2}\Big(\phi(x)\,\frac{s_{t,max}^{2}(x)+1}{s_{t}^{2}(x)+1}\Big)
\]
%
where
%$s_{t}^{2}(x)$ is the variance of the state-durations $t_{j}$ $j=1,\ldots,\ell_d(x)$ for sequence $x$ and
$s_{t,max}^{2}(x)$ is the maximum value that $s_{t}^{2}(x)$ can take given the total duration $\ell(x)=\sum_j t_j$ of that sequence. This maximum is
%
$
s_{t,max}^{2}(x)=\big(\ell_{d}(x)-1\big)\big(1-\bar{t}(x)\big)^{2}
$
%
where $\bar t(x)$ is the mean consecutive time spent in the distinct states.

From a prediction point of view, the higher the differences in state durations and hence the higher their variance, the less uncertain the sequence. In that sense, small duration variance indicates high complexity.

The vector containing the turbulences of the sequences in a sequence object is obtained with the \Com{seqST()} function.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Complexity index}

The \emph{complexity index}, introduced in \citet{GabadinhoRitschardStuderMuller2010EGC}, is a composite measure that combines the number of transitions in the sequence with the longitudinal entropy. It reads
\[
	C(x)= \sqrt{\frac{(\ell_d(x)-1)}{(\ell(x)-1)} \, \frac{h(x)}{h_{max}}}
\]
where $h_{max}$ is the theoretical maximum value of the entropy given the alphabet; i.e., $h_{max}= \log a$. We get the vector of complexity indexes with the \Comt{seqici()} function.

The minimum value of 0 can only be reached by a sequence with a single distinct state; i.e., with no transition and an entropy of 0.  $C(x)$ reaches its maximum 1 if and only if the sequence $x$ is such that i) $x$ contains each of the states in the alphabet, ii) the same time $\ell(x)/a$ is spent in each state, and iii) the number of transitions is $\ell(x)-1$.

\subsubsection{Complexity index versus turbulence}

It is instructive to look at how the turbulence and complexity indexes behave for the examples in Figure~\ref{fg_complexity_measures}. The turbulence produces significantly higher values for sequences 3 and 4, which have a rather low `sequencing' complexity but a null variance of their state durations. Indeed, this variance does not account for states that are not visited, which tends to give high turbulence values to seemingly simple sequences such as sequence~3 with two spells of same length and hence a null variance of their durations. Similarly, the turbulence exceeds the complexity index for sequences 3, 4, 5, and 7, which all have a zero variance in duration and, hence, a relatively high turbulence value. The longitudinal entropy that intervenes in the complexity index is another way of looking at the time spent in the states. It accounts, on its side, for all states, including the nonvisited ones, and, therefore, discriminates clearly between the sequences with zero duration variance.

% ==========================================
\section{Measuring sequence (dis)similarity}
\label{sec_similarities}
%
<<comp-metrics, echo=FALSE, results=hide, cache=TRUE>>=
scost <- seqsubm(mvad.seq, method = "TRATE")
mvad.om.ref <- seqdist(mvad.seq, refseq=0, method="OM", sm=scost)
mvad.lcs.ref <- seqdist(mvad.seq, refseq=0, method="LCS")
mvad.lcp.ref <- seqdist(mvad.seq, refseq=0, method="LCP")
mvad.dhd.ref <- seqdist(mvad.seq, refseq=0, method="DHD")
mvad.ham.ref <- seqdist(mvad.seq, refseq=0, method="HAM")
@
%
We examine now how we can measure the dissimilarity between two state sequences. As we will see in Section~\ref{sec_further_statistical_analysis}, once we have pairwise dissimilarities we will be able to run many types of powerful classical and specific statistical analysis methods on sequence data.

Many sequence dissimilarity measures have been proposed in the literature, of which the most popular in social sciences is the optimal matching (OM) edit distance. \traminer offers a general \Com{seqdist()} function that can compute the OM dissimilarity as well as a set of other dissimilarity measures. Table~\ref{tab:metrics} lists the available methods and their required parameters. The \Com{seqdist()} function can output the matrix of pairwise dissimilarities or the vector of distances to a provided reference sequence. We can also compute multichannel dissimilarities \citep{Pollock2007JRSSa} with the \Com{seqdistmc()} function.

\begin{table}[tb]
\bigskip

\setlength\TableWidth\linewidth
\addtolength\TableWidth{-8\tabcolsep}

%\sffamily\small
%\begin{center}\sffamily\small
\begin{tabular}{|>{\raggedright}p{0.39\TableWidth}|>{\centering\ttfamily}p{0.09\TableWidth}|>{\centering}p{0.10\TableWidth}|>{\raggedright}p{0.42\TableWidth}|}
%\begin{tabular}{|l|l|p{.4\linewidth}|}
\hline
{Distance} & \normalfont{Method} & {Position-wise} & {Additional arguments}\tabularnewline
\hline
%\hline
\textit{Count of common attributes} & & & \tabularnewline
Simple Hamming & \code{HAM} & Yes &\tabularnewline
Longest Common Prefix & \code{LCP} & Yes &\tabularnewline
Longest Common Suffix & \code{RLCP} & Yes &\tabularnewline
Longest Common Subsequence & \code{LCS} & No &\tabularnewline
\hline
\textit{Edit distances} & & & \tabularnewline
Optimal Matching & \code{OM}  & No & Insertion/deletion costs (\Comt{indel}) and substitution costs matrix (\Comt{sm}) \tabularnewline
Hamming          & \code{HAM} & Yes & substitution costs matrix (\code{sm}) \tabularnewline
Dynamic Hamming  & \code{DHD} & Yes & substitution costs matrix (\code{sm}) \tabularnewline
\hline
\end{tabular}
%\end{center}
\caption{List of available metrics for computing distances with the \Comt{seqdist()} function.}
\label{tab:metrics}

\end{table}
%

Dissimilarity measures can be classified into measures based on the count of matching attributes and those defined as the (minimal) cost of transforming one sequence into the other. Another interesting distinction is between those that make position-wise comparisons; i.e., that do not allow shifting a sequence or part of it, and those accounting for similar shifted patterns (see Table~\ref{tab:metrics}). Without shift, $x=ABAB$ and $y=BABA$ are very distant, while they are quite similar if we shift $y$ by just one position.

\subsection{Dissimilarities based on counts of common attributes}

Let $A(x,y)$ be a count of common attributes between sequences $x$ and $y$. It is a proximity measure since the higher the counts, the closer the sequences. We derive a dissimilarity measure from it through the following general formula
%
\begin{equation}
\label{eq:prox-distances}
  d(x,y)= A(x,x) + A(y,y) - 2A(x,y)
\end{equation}
%
where $d(x,y)$ is the distance between objects $x$ and $y$.
%
The dissimilarity is maximal when $A(x,y)=0$; i.e., when the two sequences have no common attribute.
It is zero when the sequences are identical, in which case we have $A(x,y)=A(x,x)=A(y,y)$.
Let us briefly describe the implemented count-based dissimilarity measures.

%\subsubsection{Simple Hamming (simple HAM) distance}
The simple \textit{Hamming distance} \citep{Hamming-1950} is the number of positions at which two sequences of equal length differ. It can equivalently be defined as $\ell - A_H(x,y)$, with $\ell=|x|=|y|$ the common sequence length and $A_{H}(x,y)$ the number of matching positions.%
\footnote{This latter quantity is returned by the \Com{seqmpos()} function.}
We get the Hamming distance with Equation~(\ref{eq:prox-distances}) by using $A_{H}(x,y)/2$ as proximity measure.

We obtain another simple distance measure by using the length $A_P(x,y)$ of the \emph{longest common prefix} (LCP) between two sequences; i.e., by counting the number of successive common positions starting from the beginning of the sequences%
\footnote{This length is returned by the \Com{seqLLCP()} function.}
\citep[see for instance][]{Elzinga2008SMR}.
The reversed longest common prefix (RLCP) or \emph{longest common suffix} is similar to the LCP but looks for the common elements from the end rather than from the beginning of the sequences.

Another implemented metric is based on the length $A_{S}(x,y)$ of the \emph{longest common subsequence} (LCS).%
\footnote{The length of the LCS is returned by the \Com{seqLLCS()} function.} Notice that consecutive states in the common subsequence are not necessarily consecutive in the compared sequences. For example, the length of the LCS between sequences~1 and~3 of \Comt{mvad.seq} (see page~\pageref{r-mvad.seq-first-5} and Figure~\vref{fg_seqiplot_mvad}) is \Sexpr{seqLLCS(mvad.seq[1,],mvad.seq[3,])} and we get \Sexpr{seqLLCS(mvad.seq[2,],mvad.seq[5,])} between sequences 2 and 5.

Quite obviously, we can only have $A_{S}(x,y)\geq A_{P}(x,y)$; i.e., the length of the LCS cannot be smaller than the length of the LCP, and hence the LCS distance cannot be greater than the LCP distance. We have also $A_{S}(x,y)\geq A_{H}(x,y)$.
When compared with metrics based on position-wise counts such as the simple Hamming and the LCP distances, the LCS metric reduces distances by accounting for non-aligned matches; i.e., position-shifted similarities.

\subsection{Edit distances}

An edit distance is defined as the minimal cost of transforming one sequence into the other. This cost depends, indeed, on the allowed transforming operations and their individual costs. Basically, two types of operations are considered: i) the substitution of one element by an other one, and ii) the \emph{indel}; i.e., the insertion or deletion of an element, which generates a one-position shift of all the elements on its right. The generalized Hamming (HAM) and dynamic Hamming distances (DHD) \citep{lesnard:2006insee} accept only substitutions and hence no shift. The former assumes position-independent substitution costs while the second allows for position-dependent costs. The Optimal Matching (OM) distance, first considered by \citet{Levenshtein1966} and popularized in the social sciences by Abbott \citep{Abbott:Forrest:1986}, accounts for both operations.


\subsubsection{Setting indels and substitution costs}
\label{subsub:substitution-costs}

Usually the indel cost is set as a constant independent of the concerned position and state. Setting a high indel cost relatively to substitution costs favors substitutions while low values favor indels. We can prohibit shifts by setting the indel cost sufficiently high.%
\footnote{It is sufficient, for prohibiting indels, to set the indel cost higher than $\frac{\ell}{2} \times \max(sm)$, where $\ell$ is the common sequence length and $\max(sm)$ is the highest substitution cost.}

Substitution costs are generally organized in matrix form. A three-dimensional matrix is necessary in the case of position varying costs as used, for instance, by the DHD metric. In the time invariant case, the substitution-cost matrix is a square symmetrical matrix of dimension $a\times a$, where $a$ is the number of distinct states in the alphabet. The element $(i,j)$ in the matrix is the cost of substituting state $s_i$ with state $s_j$. The user can either specify its own substitution-cost matrix,%
\footnote{When this substitution-cost matrix does not respect the triangle inequality, dissimilarities based on it may also fail to respect this inequality \citep[see][]{StuderRitschardGabadinhoMuller2011SMR}.\label{ftn_triangle_inequality}} or compute one by means of the \Comt{seqsubm()} function with option \Comt{method = "CONSTANT"} or \Comt{method = "TRATE"}. With \Comt{"CONSTANT"}, all costs are set as the user provided \Comt{cval} constant (2 by default). With \Comt{"TRATE"}, the costs are determined from the estimated transition rates as
%
\[
2-p(s_i\mid s_j)-p(s_j \mid s_i)
\]
where $p(s_i \mid s_j)$ is the probability of observing state $s_i$ at time $t+1$ given that state $s_j$ has been observed at time $t$ (see page~\pageref{sec_transition_rates}). The idea is to set a high cost when changes between $s_i$ and $s_j$ are seldom observed and lower cost when they are frequent.

Here is how we get the time-invariant transition-rate-based substitution cost matrix for the \Dataset{mvad} data:
%
<<scost, echo=TRUE, results=verbatim>>=
scost <- seqsubm(mvad.seq, method="TRATE")
round(scost,3)
@
%
The minimum cost is 0 for the substitution of each state by itself, and the maximum is less than 2; i.e., the value that we would get for a transition not observed in the data.
%
In accordance with what we observed in the transition rate matrix (page~\pageref{comment_transition_rates}), we get the lowest costs for substituting EM (employment) to JL (joblessness) or TR (training). Remember, however, that---unlike transition rates---the substitution costs are symmetric and hence we have the lowest cost for changing JL or TR into EM.

\subsubsection{Implemented edit distances}

\textit{Generalized Hamming} (HAM) and \textit{Dynamic Hamming} (DHD) dissimilarities are intended for sequences of equal lengths only. The former generalizes the basic Hamming distance by allowing for state dependent substitution costs. Indeed, the count of nonmatching positions is the cost of substituting a state at each position when all costs are set to 1. DHD is the extension proposed by \citet{lesnard:2006insee} to account for time-varying costs.
%
<<label=DHD-HAM, fig=FALSE, echo=FALSE>>=
dhd.vs.ham <- abs(mvad.dhd.ref-(4*mvad.ham.ref))
@
%
For the \textit{mvad} data set, the flexibility in substitution costs allowed by the DHD metric has only a limited impact as can be seen in Figure~\ref{fg_metrics_mvad}.

\begin{figure}[tb]

\begin{center}
\setkeys{Gin}{width=.75\textwidth}
<<label=fig_metrics_mvad, fig=TRUE, echo=FALSE>>=
mvad.metrics <- data.frame(LCP=mvad.lcp.ref, LCS=mvad.lcs.ref, OM=mvad.om.ref,
HAM=mvad.ham.ref, DHD=mvad.dhd.ref)

pairs(mvad.metrics, col="blue", pch=20, ps=0.1)
@
\setkeys{Gin}{width=.8\textwidth}
\vskip -3ex
\caption{Distances to the most frequent sequence obtained with various metrics, \Datasett{mvad} data.}
\label{fg_metrics_mvad}
\end{center}
\end{figure}
%


In addition to substitutions, \textit{Optimal Matching} (OM) allows also insertions/deletions. It thus applies to sequences of unequal lengths. Since the cost of the transformation may vary with the order of the successive indels and substitutions, OM is defined as the minimal cost---in terms of insertions, deletions and substitutions---of transforming one sequence into the other one.  %
The cost minimization is achieved through dynamic programming, the algorithm implemented in \traminer being essentially that of \citet{NeedlemanWunsch1970} with standard optimizations.

The $712\times 712$ pairwise distance matrix for our \Dataset{mvad} data computed by using the transition-rate-based costs and an indel cost of 1 is obtained with the command:
%
<<OM-mvad, echo=TRUE, results=verbatim, cache=TRUE>>=
mvad.om <- seqdist(mvad.seq, method="OM", indel=1, sm=scost)
@
\label{R-mvad.om}
%
The \Comt{mvad.om} distance matrix requires only \Sexpr{round(object.size(mvad.om)/1024^2,2)} megabytes memory space. However the number $n$ of sequences in the data can be an important issue when computing dissimilarity matrices since both the computing time and the size of the resulting matrix increase exponentially with $n$.
If necessary, we can divide the size by 2 by requesting only the upper triangle of the matrix with the \Comt{full.matrix=FALSE} argument. Most \R functions accept the resulting upper-triangle objects as dissimilarity argument.

\subsubsection{Comparing dissimilarity measures}

Choosing a dissimilarity measure and setting substitution and indel costs is an important step in sequence analysis. Though popular in social sciences, distances based on such costs have raised questions in the literature
\citep[see for instance][]{DijkstraTaris1995,Wu2000,Elzinga2008SMR}. The meaning of the substitution costs, their required symmetry and the sensitivity of the results to the chosen values have been pointed out as important issues. More recently,
the meaning of indels was also addressed \citep{Hollister2009, Lesnard2010}.
Favoring insertions and deletions reduces the importance of time shifts in the comparison, while favoring substitutions gives more importance to position-wise similarities.

Comparing the results obtained with various settings can also be useful for selecting the appropriate measure. Figure~\ref{fg_metrics_mvad} compares the discussed dissimilarity measures for the distance to the most frequent sequence on the \Dataset{mvad} data. We observe that, apart from the LCP metric, the measures yield very similar results. The few significant differences between HAM (or DHD) and LCS (or OM) illustrate how LCS and OM reduce dissimilarity by allowing for shifts in the comparison of the sequences.
%
<<label=LCS-OM, fig=FALSE, echo=FALSE>>=
LCS.vs.OM <- abs(mvad.lcs.ref-mvad.om.ref)
@
%
The mean difference between OM, obtained with costs derived from substitution rates, and LCS is only \Sexpr{round(mean(LCS.vs.OM)/max(mvad.lcs.ref)*100,2)}\% of the maximal distance.
The largest difference is \Sexpr{round(max(LCS.vs.OM)/max(mvad.lcs.ref)*100,2)}\%. These small differences are a consequence of low transition rates which lead to substitution costs comprised between \Sexpr{round(min(scost[scost!=0]),2)} and \Sexpr{round(max(scost[scost!=0]),2)}; i.e., close to 2. With a constant substitution cost of 2 and an indel cost equal to 1, OM is just LCS \citep{Elzinga2008SMR}.

\subsection{Normalized distances}

When dealing with sequences of different lengths, we may want to normalize the distances to account for these differences. More specifically, the aim of normalization is to relativize distances such that a dissimilarity of say 10 between sequences of length 100 becomes less important than a dissimilarity of 10 between sequences of length 5.
While the maximal distance between a pair of sequences depends on their length, normalization aims at setting it to 1 or, at least, to a value that does not depend on the lengths.%
\footnote{Normalization is not intended to account for the differences in length of the sequences between which we are measuring the dissimilarity.}

With \Comt{seqdist()} we control normalization by means of the \Comt{norm} argument. When setting it to \Comt{TRUE}, the normalization applied is determined by the selected metric. For LCP, RLCP and LCS, we apply \citet{Elzinga2008SMR}'s normalization. It works as follows. Letting $A(x,y)$ be the (non normalized) proximity measure, we first normalize this similarity %by dividing it by the geometrical average of the two proximities
\[
  s_{A}(x,y)=\dfrac{A(x,y)}{\sqrt{A(x,x) A(y,y)}}
\]
The normalized distance is, then, just the complement to 1 of the normalized similarity.
\[
D_{A}(x,y)=1-s_{A}(x,y)
\]
which gives values comprised between 0 and 1.

For the OM distance, as well as for HAM and DHD, we apply Abbott's normalization, which consists of dividing the distance by the length of the longest of the two sequences
\[
D_{OM}(x,y)=\frac{d_{OM}(x,y)}{\max\{|x|,|y|\}} \enspace.
\]
It results that for OM with an indel cost of 1 and a constant substitution cost of 2, the maximal normalized OM distance is~2. Though OM is in this latter case equivalent to LCS, their normalized values differ.

We can also force the normalization method by specifying either \Comt{"gmean"} for Elzinga's normalization or \Comt{"maxlength"} for Abbott's solution. Alternatively, we can use \Comt{"maxdist"}, which consists of dividing each distance by its maximal theoretical value. For LCP and LCS distances, the maximal possible value is the sum $\ell_x + \ell_y$ of the lengths of the two sequences $x$ and $y$, for HAM it is the length $\ell$ of the sequences, while the maximum theoretical OM distance is
%
\begin{equation*}
\label{eq:OM-max}
D_{max}=\min(\ell_{x},\ell_{y}) \cdot \min\big(2{c_I}, \max(S)\big) + c_{I} |\ell_{x}-\ell_{y}|
\end{equation*}
where $c_I > 0$ is the indel cost, $\max(S)$ the greatest substitution cost and $|\ell_{x}-\ell_{y}|$ the absolute value of the difference in lengths of the two sequences.
%
<<OM-maxdist, echo=FALSE, results=hide>>=
D.max <- 70*min(2, max(scost))
@
%
With the unit indel cost and the \Comt{scost} transition-rate-based substitution cost matrix, this yields \Sexpr{round(D.max,2)} for the \Dataset{mvad} data. This is very close from twice the sequence length; i.e., $2\cdot 70 = 140$.

It is worth mentioning that the triangle inequality property of the original distance may in some cases be lost through the \Comt{"maxlength"} and \Comt{"maxdist"} normalizations.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Dissimilarity based sequence analysis}
\label{sec_further_statistical_analysis}

Beside information on the similarity between any pair of sequences, a distance matrix opens access to many classical statistical and data analysis tools. It permits for instance to extract representative sequences such as medoids, to run any clustering technique based on pairwise dissimilarities and to apply multidimensional scaling. It even permits to compute pseudo-variances and run ANOVA-like analyses as explained in \citet{StuderRitschardGabadinhoMuller2011SMR}. We demonstrate in this section how the \Comt{mvad.om} dissimilarity (distance) matrix obtained with the command shown page~\pageref{R-mvad.om} can be exploited for further statistical analysis.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Representative sequences}
\label{sec_representative-sequences}

A major concern when analyzing sets of categorical sequences is to find useful ways of summarizing them. Possible solutions could be to determine some central or typical sequence such as the modal---most frequent---sequence or the medoid---most central---sequence. However, such solutions are of limited interest since they provide usually only a too rough idea of the main patterns in the set. A more general approach consists in finding sets of representatives %rather than a single one
and \traminer provides the versatile generic \Comt{seqrep()} function for extracting such sets from the dissimilarity matrix. The function allows control over the amount of information that the representative set should convey. The sets returned by \Comt{seqrep()} exhibit, thus, the key features of the whole set they are extracted from, which proves useful, for example, when labeling clusters of sequences.

The principle of the search algorithm %\citep{Gabadinho_et_al_2009KDIR}
\citep{Gabadinho_et_al2011CCIS} is to sort the sequences according to a representativeness criterion%
\footnote{See the reference manual for the different criteria that can be used for sorting the original sequences.}
and to remove the redundancy by browsing the sorted sequences. The redundancy threshold is set as a percentage (10\% by default) of the maximum theoretical
%or observed
dissimilarity $D_{max}$ between two sequences and the representative set will thus not contain any pair of sequences that are nearer each other than this threshold. The size of the representative set can be controlled by fixing either the minimal expected \Comt{coverage} of the representative set or the number \Comt{nrep} of representatives.

The \textit{coverage} of a representative sequence is the percentage of sequences that are in its neighborhood; i.e., the number of sequences with a distance to the representative less than a selected threshold.%
\footnote{We suggest setting the threshold as a given percentage of the maximum theoretical distance between two sequences.}
The \emph{total coverage} of the representative set corresponds to the percentage of the $n$ original sequences that have a representative in their neighborhood. A series of other individual and global measures to evaluate the quality of the obtained representatives is also computed.

The list of representative sequences is obtained by printing the outcome of \Comt{seqrep()} and we get the quality measures with the \Comt{summary()} method. The \Comt{seqrplot()} function generates representative sequence plots.

\subsubsection{Example 1: Medoid and the centrality criterion}
A first simple example
\footnote{Results for representative sequences differ from the ones published in the original JSS article. This is due to the use of weights, which was not implemented in the TraMineR version (1.8) the article is based on.}
 of a representative sequence is the \emph{medoid} %or \emph{centrotype}
of a set of sequences. The medoid is the most central object; i.e., the one with minimal sum of distances to all other objects in the set \citep{Kaufman-2005}. It is a special case of representative sequence obtained by selecting the centrality sorting criterion (\Comt{criterion="dist"}) and setting the size of the representative set to 1 (\Comt{nrep=1}).
%
<<seqrep-mvad, echo=TRUE, results=verbatim>>=
medoid <- seqrep(mvad.seq, diss=mvad.om, criterion="dist", nrep=1)
print(medoid, format="SPS")
@

\subsubsection{Example 2: Representative set and the neighborhood density criterion}

The medoid of a set of sequences usually yields poor coverage.
To increase coverage we should allow for more than one representative. When seeking for more than one representative, an initial sort of the sequences according to the density of their neighborhood yields better results. Neighborhood density is, therefore, the default criterion used by \Comt{seqrep()}.

The command below finds and plots the representative set that, with a neighborhood radius of 10\% (default \Comt{pradius} value), covers at least 25\% (default \Comt{coverage} value) of the sequences in each of the two \emph{gcse5eq} groups:
%
<<seqrplot-mvad-density, echo=TRUE, results=verbatim, fig=FALSE, cache=TRUE>>=
seqrplot(mvad.seq, group=mvad$gcse5eq, diss=mvad.om, border=NA)
@
% $
%
%
\begin{figure}[tb]
\begin{center}\vskip-2ex
\setkeys{Gin}{width=1\textwidth}
<<label=fg-seqrplot-mvad, fig=TRUE, echo=FALSE, width=10, height=5.5, cache=FALSE>>=
<<seqrplot-mvad-density>>
@
\setkeys{Gin}{width=.8\textwidth}
\caption{Representative sequences by end of compulsory school qualification group.}
\label{fg_seqrplot_mvad_density}
\end{center}
\end{figure}
%
In the resulting plot (Figure~\ref{fg_seqrplot_mvad_density}) the selected representative sequences are plotted bottom-up according to their representativeness score with bar width proportional to the number of sequences assigned to them. At the top of the plot, two parallel series of symbols standing each for a representative are displayed horizontally on a scale ranging from 0 to the maximal theoretical distance $D_{max}$. The location of the symbol associated with the representative, $r_i$, indicates on axis $A$ the discrepancy within the subset $R_i$ of sequences assigned to $r_i$ and on axis $B$ the mean distance to the representative.

We learn from the plots that respectively five and one representatives are necessary for each of the two groups to achieve the 25\% coverage and that the actual coverage is 29\% in both cases.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Clustering sequences}
Clustering is an exploratory data analysis method aimed at finding automatically homogeneous groups or clusters in the data \citep{Kaufman-2005}. In life course studies~\citep[e.g.,][]{McVicarAnyadike2002JRSSa,WidmerRitschard2009ALCR}, the method has typically been used in combination with OM distances to identify distinct groups of sequences with similar patterns; that is, to define a typology of sequences.

We already showed, in Section~\ref{sec_overview}, how we can make a cluster analysis of sequences using the \pkg{cluster} library \citep{cluster2005}. We used \Com{agnes()} to make a hierarchical clustering with the Ward method, but \Comt{pam()} (partitioning around medoids) or \Comt{diana()} (divisive analysis), for example, could also be used. The four clusters solution was retained after examining the dendrogram (Figure~\ref{fg_mvad-dendrogram}) of the clustering tree obtained with:
%
<<dendro-mvad-om, echo=TRUE, results=hide, cache=TRUE>>=
plot(clusterward, which.plots=2, labels=FALSE)
@
%
\begin{figure}[tb]
\begin{center}
\setkeys{Gin}{width=.6\textwidth}
<<label=fig_cluster-mvad-om, fig=TRUE, echo=FALSE, width=8, height=6, cache=FALSE>>=
<<dendro-mvad-om>>
@
\setkeys{Gin}{width=.7\textwidth}
\caption{Hierarchical sequence clustering from the OM distances, Ward method.}
\label{fg_mvad-dendrogram}
\end{center}
\end{figure}
%
Figure~\vref{fg_cluster-seqrplot} obtained with the command below shows the representative sequences by cluster, complementing the plots of the transversal state distributions shown in Figure~\ref{fg_cluster-seqdplot} page~\pageref{fg_cluster-seqdplot}. The threshold for the coverage of the representative set is set to 35\% using the \Comt{coverage=.35} argument.
%

<<mvad-clust-viz-rplot, echo=TRUE, results=hide, cache=TRUE>>=
seqrplot(mvad.seq, group=cl4.lab, diss=mvad.om, coverage=.35, border=NA)
@

\begin{figure}[tb]
\begin{center}
\setkeys{Gin}{width=.9\textwidth}
<<label=fig_cluster-seqrplot, fig=TRUE, echo=FALSE, width=7, height=7>>=
<<mvad-clust-viz-rplot>>
@
\vskip -1ex
\caption{Plots of representative sequences by cluster.}
\label{fg_cluster-seqrplot}
\end{center}
\end{figure}
%

Looking at these two figures helps interpreting and labeling the clusters. They show that clustering from the OM distances identifies four distinct patterns of school to work transitions. In the first cluster the trajectories are clearly oriented toward early transition to employment, with, in some cases, a spell of training. The second cluster is dominated by trajectories containing a spell of school or further education followed by higher education.
Cluster~3 corresponds to slow transition to employment with first an important spell of further education.
In the last cluster, the transitions from school to work are more chaotic with frequent spells of training and joblessness.

After having labeled the clusters, a common further step is to examine how the cluster membership depends on covariates by means of logistic regressions. We can, for instance, investigate the profiles of the youngsters belonging to the last cluster with:

<<mvad-cl4-logreg, echo=TRUE, results=hide>>=
mb4 <- (cl4.lab=="Cluster 4")
glm.cl4 <- glm(mb4 ~ male + funemp + gcse5eq, data=mvad, family="binomial")
@

<<mvad-prep-ORtable, echo=FALSE, results=hide>>=
glm.cl1 <- glm((cl4.lab=="Cluster 1") ~ male + funemp + gcse5eq, data=mvad, family="binomial")
glm.cl2 <- glm((cl4.lab=="Cluster 2") ~ male + funemp + gcse5eq, data=mvad, family="binomial")
glm.cl3 <- glm((cl4.lab=="Cluster 3") ~ male + funemp + gcse5eq, data=mvad, family="binomial")
cl1.coeff <- as.data.frame(summary(glm.cl1)$coefficients)
cl2.coeff <- as.data.frame(summary(glm.cl2)$coefficients)
cl3.coeff <- as.data.frame(summary(glm.cl3)$coefficients)
cl4.coeff <- as.data.frame(summary(glm.cl4)$coefficients)
OR.table <- data.frame(Cluster1 = exp(cl1.coeff$Estimate), sig1 = cl1.coeff[,"Pr(>|z|)"],
                       Cluster2 = exp(cl2.coeff$Estimate), sig2 = cl2.coeff[,"Pr(>|z|)"],
                       Cluster3 = exp(cl3.coeff$Estimate), sig3 = cl3.coeff[,"Pr(>|z|)"],
                       Cluster4 = exp(cl4.coeff$Estimate), sig4 = cl4.coeff[,"Pr(>|z|)"])
@

\begin{table}[tb]
%	\begin{center}
\bigskip
\centerline{%\sffamily
<<mvad-OR-table, echo=FALSE, results=tex>>=
OR.xtable <- xtable(OR.table, digits=3, align="|r|rrrrrrrr|")
rownames(OR.xtable) <- c("(Constant)", "Male", "Father unemployed", "Good end cs grade")
colnames(OR.xtable) <- c("Clust 1", "Sig", "Clust 2", "Sig", "Clust 3", "Sig", "Clust 4", "Sig")
##print(OR.xtable, floating=FALSE, caption.placement="top", hline.after=c(-1,0,0,nrow(OR.xtable)))
print(OR.xtable, floating=FALSE, caption.placement="top")
@
}
%    \end{center}
\caption{Odds Ratios for cluster memberships.}
 \label{tab_OR-cluster-membership}
\end{table}

Proceeding the same way for the other clusters and taking the exponential of the regression coefficients, we get the odds ratios shown in Table~\ref{tab_OR-cluster-membership}. We learn from those figures that, among others, men are significantly more present in Cluster 1 and women in Cluster 3, and that the chances to follow the trajectory pattern with higher education depicted by Cluster 2 are multiplied by about 15 for those with good results at the end of compulsory school.

% =======================
\section{Conclusion}
\label{sec_conclusions}

\traminer is a unique toolbox for categorical sequential data that offers, in a unified environment, a series of methods that could previously only be found in separate programs. It also provides original measures, plots and methods of analysis that were developed by the authors. Besides the material for state sequences described in this article, the package includes specific tools
%for extracting representative sequences \citep{Gabadinho_et_al_2009KDIR},
for mining event sequences \citep{RitschardGabadinhoMullerStuder2008IJDMMM,MullerStuderRitschardGabadinho2010EGC,StuderMullerRitschardGabadinho2010EGC}, for studying the discrepancy of a set of sequences through ANOVA-like analyses and regression trees \citep{StuderRitschardGabadinhoMuller2011SMR}. In addition, it proposes utilities such as the conversion to and from various sequence formats \citep{RitschardGabadinhoStuderMuller2009DManag} that was briefly addressed in Section~\ref{sec_identifying-formats}.

This rich set of features, in combination with other specialized \R packages, allows the user to run all steps of a complete sequence analysis in a single, free, powerful and multi-platform environment. It is also worth mentioning that development regarding state sequences continues. Related papers and information on current developments can be found on the package's Web page \url{http://mephisto.unige.ch/traminer}. We encourage users to submit their feature requests using our dedicated tool at \url{http://r-forge.r-project.org/projects/traminer}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgments}

The development of the \traminer package and this article were realized within a research
project supported by the Swiss National Science Foundation under grants 113998 and 122230.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\bibliographystyle{jss}
%\bibliography{./TraMineR-JStatSoft}
%
% input "C:\C:\texmf_my\bibtex\bib\svn\bibtex\bib\stat.bib"
% input "C:\C:\texmf_my\bibtex\bib\svn\bibtex\bib\bibliobase.bib"

%\vspace{-2ex}

\begin{thebibliography}{43}
\newcommand{\enquote}[1]{``#1''}
\providecommand{\natexlab}[1]{#1}
\providecommand{\url}[1]{\texttt{#1}}
\providecommand{\urlprefix}{URL }
\expandafter\ifx\csname urlstyle\endcsname\relax
  \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else
  \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup
  \urlstyle{rm}\Url}\fi
\providecommand{\eprint}[2][]{\url{#2}}

\bibitem[{Aassve \emph{et~al.}(2007)Aassve, Billari, and
  Piccarreta}]{Aassve-2007}
Aassve A, Billari F, Piccarreta R (2007).
\newblock \enquote{Strings of Adulthood: A Sequence Analysis of Young British
  Women's Work-Family Trajectories.}
\newblock \emph{European Journal of Population}, \textbf{23}(3), 369--388.
\newblock \urlprefix\url{http://dx.doi.org/10.1007/s10680-007-9134-6}.

\bibitem[{Abbott(1997)}]{Abbott1997optimize}
Abbott A (1997).
\newblock \enquote{\proglang{Optimize}.}
\newblock \urlprefix\url{http://home.uchicago.edu/~aabbott/om.html}.

\bibitem[{Abbott and Forrest(1986)}]{Abbott:Forrest:1986}
Abbott A, Forrest J (1986).
\newblock \enquote{Optimal Matching Methods for Historical Sequences.}
\newblock \emph{Journal of Interdisciplinary History}, \textbf{16}, 471--494.

\bibitem[{Abbott and Tsay(2000)}]{Abbott-2000}
Abbott A, Tsay A (2000).
\newblock \enquote{Sequence Analysis and Optimal Matching Methods in Sociology,
  {R}eview and Prospect.}
\newblock \emph{Sociological Methods and Research}, \textbf{29}(1), 3--33.

\bibitem[{Berchtold and Raftery(2002)}]{BerchtoldRaftery2002}
Berchtold A, Raftery AE (2002).
\newblock \enquote{The Mixture Transition Distribution Model for High-Order
  {M}arkov Chains and Non-Gaussian Time Series.}
\newblock \emph{Statistical Science}, \textbf{17}(3), 328--356.

\bibitem[{Billari(2001{\natexlab{a}})}]{Billari-2001a}
Billari FC (2001{\natexlab{a}}).
\newblock \enquote{The Analysis of Early Life Courses: Complex Description of
  the Transition to Adulthood.}
\newblock \emph{Journal of Population Research}, \textbf{18}(2), 119--142.

\bibitem[{Billari(2001{\natexlab{b}})}]{Billari-2001}
Billari FC (2001{\natexlab{b}}).
\newblock \enquote{Sequence Analysis in Demographic Research.}
\newblock \emph{Canadian Studies in Population}, \textbf{28}(2), 439--458.
\newblock Special Issue on Longitudinal Methodology.

\bibitem[{Billari \emph{et~al.}(2006)Billari, Frnkranz, and
  Prskawetz}]{billa:furn:prsk:2006}
Billari FC, F\"{u}rnkranz J, Prskawetz A (2006).
\newblock \enquote{Timing, Sequencing, and Quantum of Life Course Events: A
  Machine Learning Approach.}
\newblock \emph{European Journal of Population}, \textbf{22}(1), 37--65.

\bibitem[{Blossfeld \emph{et~al.}(2007)Blossfeld, Golsch, and
  Rohwer}]{blos:golsch:rohw:2007}
Blossfeld HP, Golsch K, Rohwer G (2007).
\newblock \emph{Event History Analysis with \proglang{Stata}}.
\newblock Lawrence Erlbaum, Mahwah NJ.

\bibitem[{Brzinsky-Fay \emph{et~al.}(2006)Brzinsky-Fay, Kohler, and
  Luniak}]{Brzinsky-FayKohlerLuniak2006}
Brzinsky-Fay C, Kohler U, Luniak M (2006).
\newblock \enquote{Sequence Analysis with \proglang{{S}tata}.}
\newblock \emph{The Stata Journal}, \textbf{6}(4), 435--460.

\bibitem[{Deville and Saporta(1983)}]{Deville:Saporta:1983}
Deville JC, Saporta G (1983).
\newblock \enquote{Correspondence analysis with an extension towards nominal
  time series.}
\newblock \emph{Journal of Econometrics}, \textbf{22}, 169--189.

\bibitem[{Dijkstra and Taris(1995)}]{DijkstraTaris1995}
Dijkstra W, Taris T (1995).
\newblock \enquote{Measuring the Agreement between Sequences.}
\newblock \emph{Sociological Methods and Research}, \textbf{24}(2), 214--231.

\bibitem[{Elzinga(2007{\natexlab{a}})}]{Elzinga-2007b}
Elzinga CH (2007{\natexlab{a}}).
\newblock \emph{\proglang{CHESA 2.1} User Manual}.
\newblock Vrije Universiteit, Amsterdam.
\newblock \urlprefix\url{http://home.fsw.vu.nl/ch.elzinga/}.

\bibitem[{Elzinga(2007{\natexlab{b}})}]{Elzinga2008SMR}
Elzinga CH (2007{\natexlab{b}}).
\newblock \enquote{Sequence Analysis: Metric Representations of Categorical
  Time Series.}
\newblock \emph{Manuscript}, Department of Social Science Research Methods,
  Vrije Universiteit, Amsterdam.
\newblock \urlprefix\url{http://home.fsw.vu.nl/ch.elzinga/}.

\bibitem[{Elzinga and Liefbroer(2007)}]{ElzingaLiefbroer2007EJP}
Elzinga CH, Liefbroer AC (2007).
\newblock \enquote{De-standardization of Family-Life Trajectories of Young
  Adults: A Cross-National Comparison Using Sequence Analysis.}
\newblock \emph{European Journal of Population}, \textbf{23}, 225--250.
\newblock \doi{10.1007/s10680-007-9133-7}.

\bibitem[{Fussell(2005)}]{Fussell-2005}
Fussell E (2005).
\newblock \enquote{Measuring the Early Adult Life Course in {M}exico: {A}n
  Application of the Entropy Index.}
\newblock In R~Macmillan (ed.), \enquote{The Structure of the Life Course:
  {S}tandardized? {I}ndividualized? {D}ifferentiated?}, Advances in Life Course
  Research, Vol. 9, pp. 91--122. Elsevier, Amsterdam.

\bibitem[{Gabadinho \emph{et~al.}(2009)Gabadinho, Ritschard, Studer, and
  M\"{u}ller}]{Gabadinho_et_al_2009TraMineR-UGuide}
Gabadinho A, Ritschard G, Studer M, M\"{u}ller NS (2009).
\newblock \enquote{Mining Sequence Data in \proglang{{R}} with the
  \pkg{{T}ra{M}ine{R}} package: {A} User's Guide.}
\newblock \emph{Technical report}, Department of Econometrics and Laboratory of
  Demography, University of Geneva, Geneva.
\newblock \urlprefix\url{http://mephisto.unige.ch/traminer/}.

\bibitem[{Gabadinho \emph{et~al.}(2010)Gabadinho, Ritschard, Studer, and
  M\"{u}ller}]{GabadinhoRitschardStuderMuller2010EGC}
Gabadinho A, Ritschard G, Studer M, M\"{u}ller NS (2010).
\newblock \enquote{Indice de complexit\'{e} pour le tri et la comparaison de
  s\'{e}quences cat\'{e}gorielles.}
\newblock \emph{Revue des nouvelles technologies de l'information {RNTI}},
  \textbf{E-19}, 61--66.

\bibitem[{Gabadinho \emph{et~al.}(2011)Gabadinho, Ritschard, Studer, and
  M\"uller}]{Gabadinho_et_al2011CCIS}
Gabadinho A, Ritschard G, Studer M, M\"uller NS (2011).
\newblock \enquote{Extracting and Rendering Representative Sequences.}
\newblock In A~Fred, JLG Dietz, K~Liu, J~Filipe (eds.), \enquote{Knowledge
  Discovery, Knowledge Engineering and Knowledge Management,} volume 128 of
  \emph{Communications in Computer and Information Science (CCIS)}, pp.
  94--106. Springer-Verlag.

\bibitem[{Hamming(1950)}]{Hamming-1950}
Hamming RW (1950).
\newblock \enquote{Error Detecting and Error Correcting Codes.}
\newblock \emph{Bell System Techical Journal}, \textbf{29}, 147--160.

\bibitem[{Hollister(2009)}]{Hollister2009}
Hollister M (2009).
\newblock \enquote{{Is Optimal Matching Suboptimal?}}
\newblock \emph{Sociological Methods Research}, \textbf{38}(2), 235--264.
\newblock \doi{10.1177/0049124109346164}.

\bibitem[{Hosmer and Lemeshow(1999)}]{HosmerLemeshow1999}
Hosmer DW, Lemeshow S (1999).
\newblock \emph{Applied Survival Analysis, Regression Modeling of Time to Event
  Data}.
\newblock John Wiley \& Sons, New York.

\bibitem[{Kaufman and Rousseeuw(2005)}]{Kaufman-2005}
Kaufman L, Rousseeuw PJ (2005).
\newblock \emph{Finding Groups in Data}.
\newblock John Wiley \& Sons, Hoboken.

\bibitem[{Lesnard(2006)}]{lesnard:2006insee}
Lesnard L (2006).
\newblock \enquote{Optimal Matching and Social Sciences.}
\newblock \emph{S\'{e}rie des Documents de Travail du CREST 2006-01}, Institut
  National de la Statistique et des Etudes Economiques, Paris.

\bibitem[{Lesnard(2010)}]{Lesnard2010}
Lesnard L (2010).
\newblock \enquote{Setting Cost in Optimal Matching to Uncover Contemporaneous
  Socio-Temporal Patterns.}
\newblock \emph{Sociological Methods and Research}, \textbf{38}, 389--419.
\newblock \doi{10.1177/0049124110362526}.

\bibitem[{Levenshtein(1966)}]{Levenshtein1966}
Levenshtein V (1966).
\newblock \enquote{Binary Codes Capable of Correcting Deletions, Insertions,
  and Reversals.}
\newblock \emph{Soviet Physics Doklady}, \textbf{10}, 707--710.

\bibitem[{Maechler \emph{et~al.}(2005)Maechler, Rousseeuw, Struyf, and
  Hubert}]{cluster2005}
Maechler M, Rousseeuw P, Struyf A, Hubert M (2005).
\newblock \enquote{Package `\pkg{cluster}': Cluster Analysis Basics and
  Extensions.}
\newblock \emph{Reference manual}, R-project, CRAN.
\newblock \urlprefix\url{http://CRAN.R-project.org/package=cluster}.

\bibitem[{Mayer and Tuma(1990)}]{Mayer-1990}
Mayer KU, Tuma N (1990).
\newblock \enquote{Life course research and event history analysis: An
  overview.}
\newblock In KU~Mayer, N~Tuma (eds.), \enquote{Event History Analysis in Life
  Course Research,} pp. 3--20. WI: University of Wisconsin Press, Madison.

\bibitem[{McVicar and Anyadike-Danes(2002)}]{McVicarAnyadike2002JRSSa}
McVicar D, Anyadike-Danes M (2002).
\newblock \enquote{Predicting Successful and Unsuccessful Transitions from
  School to Work Using Sequence Methods.}
\newblock \emph{Journal of the Royal Statistical Society A}, \textbf{165}(2),
  317--334.

\bibitem[{M\"{u}ller \emph{et~al.}(2010)M\"{u}ller, Studer, Ritschard, and
  Gabadinho}]{MullerStuderRitschardGabadinho2010EGC}
M\"{u}ller NS, Studer M, Ritschard G, Gabadinho A (2010).
\newblock \enquote{Extraction de r\`{e}gles d'association s\'{e}quentielle
   l'aide de mod\`{e}les de dur\'{e}e.}
\newblock \emph{Revue des nouvelles technologies de l'information {RNTI}},
  \textbf{E-19}, 25--36.

\bibitem[{Needleman and Wunsch(1970)}]{NeedlemanWunsch1970}
Needleman S, Wunsch C (1970).
\newblock \enquote{A General Method Applicable to the Search for Similarities
  in the Amino Acid Sequence of Two Proteins.}
\newblock \emph{Journal of Molecular Biology}, \textbf{48}, 443--453.

\bibitem[{Neuwirth(2007)}]{Neuwirth2007}
Neuwirth E (2007).
\newblock \emph{\pkg{RColorBrewer}: ColorBrewer Palettes}.
\newblock \proglang{R} package version 1.0-2,
  \urlprefix\url{http://CRAN.R-project.org/package=RColorBrewer}.

\bibitem[{Pollock(2007)}]{Pollock2007JRSSa}
Pollock G (2007).
\newblock \enquote{Holistic Trajectories: A Study of Combined Employment,
  Housing and Family Careers by Using Multiple-Sequence Analysis.}
\newblock \emph{Journal of the Royal Statistical Society A}, \textbf{170}(1),
  167--183.
\newblock \doi{10.1111/j.1467-985X.2006.00450.x}.

\bibitem[{{\proglang{R} Development Core
  Team}(2011)}]{R-Development-Core-Team2011Manual}
{\proglang{R} Development Core Team} (2011).
\newblock \emph{\proglang{R}: A Language and Environment for Statistical
  Computing}.
\newblock \proglang{R} Foundation for Statistical Computing, Vienna, Austria.
\newblock {ISBN} 3-900051-07-0, \urlprefix\url{http://www.r-project.org}.

\bibitem[{Ritschard \emph{et~al.}(2008)Ritschard, Gabadinho, M\"{u}ller, and
  Studer}]{RitschardGabadinhoMullerStuder2008IJDMMM}
Ritschard G, Gabadinho A, M\"{u}ller NS, Studer M (2008).
\newblock \enquote{Mining Event Histories: A Social Science Perspective.}
\newblock \emph{International Journal of Data Mining, Modelling and
  Management}, \textbf{1}(1), 68--90.
\newblock \doi{10.1504/IJDMMM.2008.022538}.

\bibitem[{Ritschard \emph{et~al.}(2009)Ritschard, Gabadinho, Studer, and
  M\"uller}]{RitschardGabadinhoStuderMuller2009DManag}
Ritschard G, Gabadinho A, Studer M, M\"uller NS (2009).
\newblock \enquote{Converting between Various Sequence Representations.}
\newblock In Z~Ras, A~Dardzinska (eds.), \enquote{Advances in Data Management,}
  volume 223 of \emph{Studies in Computational Intelligence}, pp. 155--175.
  Springer-Verlag, Berlin.
\newblock \doi{10.1007/978-3-642-02190-9\_8}.

\bibitem[{Rohwer and Ptter(2002)}]{RohwerPotter2002}
Rohwer G, Ptter U (2002).
\newblock \enquote{\proglang{{TDA}}: Transition Data Analysis.}
\newblock \emph{Software}, Ruhr-Universit\"{a}t Bochum, Fakult\"{a}t f\"{u}r
  Sozialwissenschaften, Bochum.
\newblock \urlprefix\url{http://www.ruhr-uni-bochum.de/tda.html}.

\bibitem[{Scherer(2001)}]{Scherer2001}
Scherer S (2001).
\newblock \enquote{Early Career Patterns: A Comparison of {G}reat {B}ritain and
  {W}est {G}ermany.}
\newblock \emph{European Sociological Review}, \textbf{17}(2), 119--144.

\bibitem[{Studer \emph{et~al.}(2010)Studer, M\"{u}ller, Ritschard, and
  Gabadinho}]{StuderMullerRitschardGabadinho2010EGC}
Studer M, M\"{u}ller NS, Ritschard G, Gabadinho A (2010).
\newblock \enquote{Classer, discriminer et visualiser des s\'{e}quences
  d'\'{e}v\'{e}nements.}
\newblock \emph{Revue des nouvelles technologies de l'information {RNTI}},
  \textbf{E-19}, 37--48.

\bibitem[{Studer \emph{et~al.}(2011)Studer, Ritschard, Gabadinho, and
  M\"{u}ller}]{StuderRitschardGabadinhoMuller2011SMR}
Studer M, Ritschard G, Gabadinho A, M\"{u}ller NS (2011).
\newblock \enquote{Discrepancy Analysis of State Sequences.}
\newblock \emph{Sociological Methods and Research}.
\newblock In press.

\bibitem[{Widmer and Ritschard(2009)}]{WidmerRitschard2009ALCR}
Widmer E, Ritschard G (2009).
\newblock \enquote{The De-Standardization of the Life Course: {A}re Men and
  Women Equal?}
\newblock \emph{Advances in Life Course Research}, \textbf{14}(1-2), 28--39.
\newblock \doi{10.1016/j.alcr.2009.04.001}.

\bibitem[{Wu(2000)}]{Wu2000}
Wu LL (2000).
\newblock \enquote{Some Comments on `{S}equence Analysis and Optimal Matching
  Methods in Sociology: {R}eview and Prospect'.}
\newblock \emph{Sociological Methods Research}, \textbf{29}(1), 41--64.
\newblock \doi{10.1177/0049124100029001003}.

\bibitem[{Yamaguchi(1991)}]{Yamaguchi-1991}
Yamaguchi K (1991).
\newblock \emph{Event History Analysis}.
\newblock ASRM 28. Sage, Newbury Park and London.

\end{thebibliography}


\end{document}