%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{The main algorithms used in the seqHMM package}
%\VignetteKeyword{categorical time series}
%\VignetteKeyword{latent Markov models}
%\VignetteKeyword{latent class models}
\documentclass{article}

\usepackage{amsmath}
\usepackage{array}
\usepackage{hyperref}
 \usepackage[authoryear,round,longnamesfirst]{natbib}
 
\newcolumntype{L}[1]{>{\raggedright\let\newline\\\arraybackslash\hspace{0pt}}m{#1}}

\author{Jouni Helske\\ Link\"oping University, Sweden}
\title{The main algorithms used in the \texttt{seqHMM} package}


\begin{document}

\maketitle

\section{Introduction}

This vignette describes the main algorithms used in the \texttt{seqHMM} \citep{Helske2016} package. First, the forward--backward algorithm is presented, followed by the Viterbi algorithm and the derivations of the gradients used in the numerical optimisation routines.

\section{Forward--Backward Algorithm}
\label{app:forwardbackward}

Following \citet{Rabiner1989}, the \emph{forward variable}
\[
  \alpha_{it}(s)=P(\mathbf{y}_{i1}, \ldots, \mathbf{y}_{it},z_{it}=s|\mathcal{M})
\]
is the joint probability of the partial observation sequences of subject $i$ up to time $t$ and the hidden state $s$ at time $t$, given the model $\mathcal{M}$. Let us denote by $b_s(\mathbf{y}_{it})=b_{s1}(y_{it1})\cdots b_{sC}(y_{itC})$ the joint emission probability of the observations at time $t$ in channels $1,\ldots,C$ given hidden state $s$. The forward variable can be solved recursively for each subject $i=1,\ldots,N$:
\begin{enumerate}
\item Initialization: For $s=1,\ldots,S$, compute \\
\begin{equation*}
\alpha_{i1}(s)=\pi_s b_s(\mathbf{y}_{i1})
\end{equation*}
\item Recursion: For $t=1,\ldots,T-1$,  compute \\
\begin{equation*}
\alpha_{i(t+1)}(s) = \left[ \sum_{r=1}^S \alpha_{it}(r) a_{rs} \right] b_s(\mathbf{y}_{i(t+1)}), \quad s=1,\ldots,S
\end{equation*}
\item Termination: Compute the likelihood \\
\begin{equation*}
P(Y_i|\mathcal{M})= \sum_{s=1}^S \alpha_{iT}(s)
\end{equation*}
  \end{enumerate}
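To make the recursion concrete, the following is a minimal base \texttt{R} sketch of the unscaled forward pass for a single subject and a single observation channel. The objects \texttt{pi}, \texttt{A}, \texttt{B}, and \texttt{y} are illustrative names assumed here (a length-$S$ vector of initial probabilities, the $S \times S$ transition matrix, an $S \times M$ emission matrix, and an integer-coded observation sequence); they are not part of the \texttt{seqHMM} interface, which performs these computations in compiled code.
\begin{verbatim}
# Unscaled forward pass for one subject, single channel.
# pi: initial probabilities, A: transition matrix a_{sr},
# B: emission matrix b_s(m), y: integer-coded observations.
forward_unscaled <- function(pi, A, B, y) {
  S <- length(pi)
  T_ <- length(y)
  alpha <- matrix(0, S, T_)
  alpha[, 1] <- pi * B[, y[1]]                          # initialization
  for (t in seq_len(T_ - 1)) {                          # recursion
    alpha[, t + 1] <- drop(t(A) %*% alpha[, t]) * B[, y[t + 1]]
  }
  list(alpha = alpha, likelihood = sum(alpha[, T_]))    # termination
}
\end{verbatim}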

The \emph{backward variable}
\[
  \beta_{it}(s)=P(\mathbf{y}_{i(t+1)}, \ldots, \mathbf{y}_{iT}| z_{it}=s, \mathcal{M})
  \]
is the probability of the partial observation sequence after time $t$, given hidden state $s$ at time $t$ and the model $\mathcal{M}$. For subject $i=1,\ldots,N$, the backward variable can be computed as
  \begin{enumerate}
\item Initialization: For $s=1,\ldots,S$, set \\
\begin{equation*}
\beta_{iT}(s) = 1
\end{equation*}
\item Recursion: For $t=T-1,\ldots,1$, compute  \\
\begin{equation*}
\beta_{it}(s)=\sum_{r=1}^S \left[ a_{sr} b_r(\mathbf{y}_{i(t+1)}) \beta_{i(t+1)}(r)\right], \quad s=1,\ldots,S
\end{equation*}
\end{enumerate}
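A corresponding sketch of the unscaled backward pass, under the same illustrative assumptions as above:
\begin{verbatim}
# Unscaled backward pass for one subject, single channel.
backward_unscaled <- function(A, B, y) {
  S <- nrow(A)
  T_ <- length(y)
  beta <- matrix(0, S, T_)
  beta[, T_] <- 1                                       # initialization
  for (t in (T_ - 1):1) {                               # recursion
    beta[, t] <- drop(A %*% (B[, y[t + 1]] * beta[, t + 1]))
  }
  beta
}
\end{verbatim}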

In practice, the forward-backward algorithm is prone to numerical instabilities, so the forward and backward probabilities are typically scaled as follows \citep{Rabiner1989}. For subject $i=1,\ldots,N$,

\begin{enumerate}
\item Initialization: For $s=1,\ldots,S$, compute  \\
\begin{equation*}
\begin{aligned}
\alpha_{i1}(s) &= \pi_s b_s(\mathbf{y}_{i1}), \\
c_{i1} &= 1 / \sum_{s=1}^S \alpha_{i1}(s),\\
\hat \alpha_{i1} &= c_{i1} \alpha_{i1}
\end{aligned}
\end{equation*}
\item Recursion: For $t=1,\ldots,T-1$, compute (as before) \\
\begin{equation*}
\begin{aligned}
\alpha_{i(t+1)}(s) = \left[ \sum_{r=1}^S \alpha_{it}(r) a_{rs} \right] b_s(\mathbf{y}_{i(t+1)}), \quad s=1,\ldots,S
\end{aligned}
\end{equation*}
and scale as
\begin{equation*}
\begin{aligned}
c_{i(t+1)} &= 1 / \sum_{s=1}^S \alpha_{i(t+1)}(s),\\
\hat \alpha_{i(t+1)} &= c_{i(t+1)} \alpha_{i(t+1)}
\end{aligned}
\end{equation*}
  \item Termination: Compute the log-likelihood \\
  \begin{equation*}
\log P(Y_i|\mathcal{M})= -\sum_{t=1}^T \log c_{it}
\end{equation*}
  \end{enumerate}
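A scaled version of the forward sketch is given below; as is standard, the scaled variables $\hat\alpha_{it}$ are propagated in the recursion and the log-likelihood is accumulated from the logarithms of the scaling factors. The inputs are the same illustrative objects as before.
\begin{verbatim}
# Scaled forward pass; returns the scaled variables, the scaling
# factors, and the log-likelihood -sum(log(c)).
forward_scaled <- function(pi, A, B, y) {
  S <- length(pi)
  T_ <- length(y)
  alpha_hat <- matrix(0, S, T_)
  cc <- numeric(T_)
  a <- pi * B[, y[1]]                                   # initialization
  cc[1] <- 1 / sum(a)
  alpha_hat[, 1] <- cc[1] * a
  for (t in seq_len(T_ - 1)) {                          # recursion and scaling
    a <- drop(t(A) %*% alpha_hat[, t]) * B[, y[t + 1]]
    cc[t + 1] <- 1 / sum(a)
    alpha_hat[, t + 1] <- cc[t + 1] * a
  }
  list(alpha_hat = alpha_hat, c = cc, loglik = -sum(log(cc)))
}
\end{verbatim}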

The scaling factors $c_{it}$ from the forward algorithm are commonly used to scale the backward variables as well, although other scaling schemes are also possible. In \texttt{seqHMM}, the scaled backward variables for subject $i=1,\ldots,N$ are computed as

\begin{enumerate}
\item Initialization: For $s=1,\ldots,S$, compute  \\
\begin{equation*}
\begin{aligned}
\hat \beta_{iT}(s) &= c_{iT}
\end{aligned}
\end{equation*}
  \item Recursion: For $t=T-1,\ldots,1$, compute and scale \\
\begin{equation*}
\begin{aligned}
\beta_{it}(s)&=\sum_{r=1}^S \left[ a_{sr} b_r(\mathbf{y}_{i(t+1)}) \beta_{i(t+1)}(r)\right], \quad s=1,\ldots,S\\
\hat \beta_{it}(s) &= c_{it} \beta_{it}(s)
\end{aligned}
\end{equation*}
\end{enumerate}
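A matching sketch of the scaled backward pass, reusing the scaling factors returned by \texttt{forward\_scaled()} above:
\begin{verbatim}
# Scaled backward pass using the scaling factors c from forward_scaled().
backward_scaled <- function(A, B, y, c) {
  S <- nrow(A)
  T_ <- length(y)
  beta_hat <- matrix(0, S, T_)
  beta_hat[, T_] <- c[T_]                               # initialization
  for (t in (T_ - 1):1) {                               # recursion and scaling
    beta_hat[, t] <- c[t] * drop(A %*% (B[, y[t + 1]] * beta_hat[, t + 1]))
  }
  beta_hat
}
\end{verbatim}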

In most cases the scaling method described above works well, but in some 
ill-conditioned cases the default scaling may still produce underflow in the backward algorithm. For these cases, \texttt{seqHMM} also supports computing the forward and backward variables in log space. Although numerically more stable, this approach is somewhat slower due to the repeated use of the log-sum-exp trick.
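As an illustration of the log-space variant, the forward recursion can be written with the log-sum-exp trick as follows; this is a sketch under the same illustrative assumptions as before, not the implementation used in the package.
\begin{verbatim}
# Log-space forward pass using the log-sum-exp trick.
logsumexp <- function(x) {
  m <- max(x)
  if (!is.finite(m)) return(-Inf)
  m + log(sum(exp(x - m)))
}
forward_log <- function(pi, A, B, y) {
  S <- length(pi)
  T_ <- length(y)
  log_alpha <- matrix(-Inf, S, T_)
  log_alpha[, 1] <- log(pi) + log(B[, y[1]])
  for (t in seq_len(T_ - 1)) {
    for (s in seq_len(S)) {
      log_alpha[s, t + 1] <- logsumexp(log_alpha[, t] + log(A[, s])) +
        log(B[s, y[t + 1]])
    }
  }
  list(log_alpha = log_alpha, loglik = logsumexp(log_alpha[, T_]))
}
\end{verbatim}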
  
\section{Viterbi Algorithm}
\label{app:viterbi}

We define the score
\[
\delta_{it}(s)=\max_{z_{i1} z_{i2} \cdots z_{i(t-1)}} P(z_{i1} \cdots z_{it}=s, \mathbf{y}_{i1} \cdots \mathbf{y}_{it}|\mathcal{M}),
\]
which is the highest joint probability of the observations up to time $t$ and a hidden state path ending in state $s$ at time $t$. By induction we have
\begin{equation}
\label{eq:delta}
\delta_{i(t+1)}(r)=\left[\max_{s} \delta_{it}(s) a_{sr}\right] b_r(\mathbf{y}_{i(t+1)}).
\end{equation}
We collect the arguments maximizing Equation~\ref{eq:delta} in an array $\psi_{it}(r)$ to keep track of the best hidden state sequence. The full Viterbi algorithm can be stated as follows:
  \begin{enumerate}
\item Initialization \\
$\delta_{i1}(s)=\pi_s b_s(\mathbf{y}_{i1}), s=1,\ldots,S$ \\
$\psi_{i1}(s)=0$
  \item Recursion \\
$\delta_{it}(r)=\max_{s=1,\ldots,S} (\delta_{i(t-1)}(s) a_{sr}) b_r(\mathbf{y}_{it})$,  \\
$\psi_{it}(r)=\operatorname*{arg\,max}_{s=1,\ldots,S} (\delta_{i(t-1)}(s) a_{sr}), \quad r=1,\ldots,S; \; t=2,\ldots,T$
  \item Termination \\
$\hat{P}=\max_{s=1,\ldots,S} (\delta_{iT}(s))$ \\
$\hat{z}_{iT}=\operatorname*{arg\,max}_{s=1,\ldots,S}(\delta_{iT}(s))$
  \item Sequence backtracking \\
$\hat{z}_{it}=\psi_{i(t+1)}(\hat{z}_{i(t+1)}), \quad t=T-1,\ldots,1$.
\end{enumerate}
To avoid numerical underflow due to multiplying many small probabilities, the Viterbi algorithm can be straightforwardly computed in log space, i.e., by calculating $\log (\delta_{it}(s))$.
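The following sketch implements the log-space Viterbi recursion in the same illustrative single-channel setting as the earlier snippets.
\begin{verbatim}
# Log-space Viterbi: most probable state path for one subject.
viterbi_log <- function(pi, A, B, y) {
  S <- length(pi)
  T_ <- length(y)
  log_delta <- matrix(-Inf, S, T_)
  psi <- matrix(0L, S, T_)
  log_delta[, 1] <- log(pi) + log(B[, y[1]])            # initialization
  for (t in 2:T_) {                                     # recursion
    for (r in seq_len(S)) {
      v <- log_delta[, t - 1] + log(A[, r])
      psi[r, t] <- which.max(v)
      log_delta[r, t] <- v[psi[r, t]] + log(B[r, y[t]])
    }
  }
  z <- integer(T_)                                      # termination
  z[T_] <- which.max(log_delta[, T_])
  for (t in (T_ - 1):1) z[t] <- psi[z[t + 1], t + 1]    # backtracking
  list(path = z, logprob = max(log_delta[, T_]))
}
\end{verbatim}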

\section{Gradients}

Following \citet{Levinson1983}, by using the scaled forward and backward variables we have

\[
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial \pi_{s}} = b_s(\mathbf{y}_{i1}) \hat \beta_{i1}(s),
\]
\[
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial a_{sr}} = \sum_{t=1}^{T-1} \hat \alpha_{it}(s)b_r(\mathbf{y}_{i(t+1)}) \hat \beta_{i(t+1)}(r),
\]
and
\[
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial b_{rc}(m)} = 
\sum_{t:\, y_{i(t+1)c} = m} \sum_{s=1}^S \hat \alpha_{it}(s) a_{sr} \frac{b_r(\mathbf{y}_{i(t+1)})}{b_{rc}(m)} \hat \beta_{i(t+1)}(r) + \textrm{I}(y_{i1c} = m) \pi_r \frac{b_r(\mathbf{y}_{i1})}{b_{rc}(m)} \hat\beta_{i1}(r).
\]
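These expressions are straightforward to check numerically. The snippet below builds a small single-channel toy model (the particular values are arbitrary), computes the gradient with respect to $\pi_s$ from the scaled variables of the earlier sketches, and compares it with a finite-difference approximation of the log-likelihood.
\begin{verbatim}
# Finite-difference check of d log P / d pi_s = b_s(y_1) * beta_hat_1(s),
# using the forward_scaled() and backward_scaled() sketches above.
set.seed(1)
S <- 3; M <- 4; T_ <- 10
pi <- c(0.5, 0.3, 0.2)
A <- matrix(c(0.8, 0.1, 0.1,
              0.2, 0.7, 0.1,
              0.1, 0.2, 0.7), S, S, byrow = TRUE)
B <- matrix(runif(S * M), S, M); B <- B / rowSums(B)
y <- sample(M, T_, replace = TRUE)

fwd <- forward_scaled(pi, A, B, y)
beta_hat <- backward_scaled(A, B, y, fwd$c)
grad_analytic <- B[, y[1]] * beta_hat[, 1]

eps <- 1e-6
grad_fd <- sapply(seq_len(S), function(s) {
  pi2 <- pi; pi2[s] <- pi2[s] + eps
  (forward_scaled(pi2, A, B, y)$loglik - fwd$loglik) / eps
})
all.equal(grad_analytic, grad_fd, tolerance = 1e-4)  # should be TRUE
\end{verbatim}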

In the direct numerical optimization algorithms used by \texttt{seqHMM}, the model is parameterised using unconstrained parameters $\pi'_s, a'_{sr}, b'_{rc}(m)$ such that $a_{sr} = \exp(a'_{sr})/\sum_{k=1}^S \exp(a'_{sk})$, and similarly for emission and initial probabilities. This leads to


\[
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial \pi'_{s}} = \frac{\partial \log P(Y_i|\mathcal{M})}{\partial \pi_{s}} \pi_s (1 - \pi_s)
\]
\[
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial a'_{sr}} = \frac{\partial \log P(Y_i|\mathcal{M})}{\partial a_{sr}} a_{sr} (1 - a_{sr}),
\]
and
\[
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial b'_{rc}(m)} = 
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial b_{rc}(m)} b_{rc}(m) (1 - b_{rc}(m)).
\]
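As a small illustration of this parameterisation, a single transition row is obtained from arbitrary unconstrained values through the softmax transform; the helper below is purely illustrative.
\begin{verbatim}
# Softmax transform of one unconstrained transition row a'_{s.}
softmax <- function(x) exp(x) / sum(exp(x))
a_prime_row <- c(0.3, -1.2, 0.4)   # arbitrary unconstrained values
a_row <- softmax(a_prime_row)      # valid probabilities, sum(a_row) == 1
\end{verbatim}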

\subsection{MHMM case}

For a mixture HMM with $K$ clusters, we define a full model with $S=S^1+\cdots+S^K$ states in a block form with $\pi_i = (w_{i1} \pi^1,\ldots,w_{iK}\pi^K)^\top$, where $\pi^k$, $k=1,\ldots,K$, is the vector of initial probabilities for the submodel $\mathcal{M}^k$ and $w_{ik} = \exp(\textbf{x}^{\top}_i\gamma_k) / (1 + \sum_{j=2}^K \exp(\textbf{x}^{\top}_i\gamma_j))$, with $\gamma_1=0$.

First note that the likelihood of the HMM for the $i$th subject can be written as
\[
P(Y_i|\mathcal{M}) = \sum_{s=1}^S \sum_{r=1}^S \alpha_{it}(s)a_{sr}b_r(\mathbf{y}_{i(t+1)})\beta_{i(t+1)}(r),
\]
for any $t=1,\ldots,T-1$. Thus for $t=1$ we have

\begin{equation}
\begin{aligned}
P(Y_i|\mathcal{M}) &= \sum_{s=1}^S \sum_{r=1}^S \alpha_{i1}(s)a_{sr}b_r(\mathbf{y}_{i2})\beta_{i2}(r)\\ 
&= \sum_{s=1}^S \alpha_{i1}(s) \sum_{r=1}^S a_{sr}b_r(\mathbf{y}_{i2})\beta_{i2}(r)\\
&= \sum_{s=1}^S \alpha_{i1}(s) \beta_{i1}(s)\\
&= \sum_{s=1}^S \pi_{is} b_s(\mathbf{y}_{i1})\beta_{i1}(s).
\end{aligned}
\end{equation}
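Using the unscaled sketches and the toy model from the earlier snippets, this identity is easy to verify numerically (here in the plain HMM case, where the initial probabilities do not depend on $i$).
\begin{verbatim}
# Check: P(Y|M) equals sum_s pi_s b_s(y_1) beta_1(s).
fwd_u <- forward_unscaled(pi, A, B, y)
beta_u <- backward_unscaled(A, B, y)
all.equal(fwd_u$likelihood, sum(pi * B[, y[1]] * beta_u[, 1]))  # should be TRUE
\end{verbatim}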

Therefore the gradients for the unconstrained parameters $\pi^{k'}_s$ of the $k$th cluster are given as
\[
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial \pi^{k'}_{s}} = \frac{\partial \log P(Y_i|\mathcal{M}^k)}{\partial \pi^k_{s}} \pi^k_s (1 - \pi^k_s) w_{ik}.
\]
For $\gamma_k$, the gradients are of the form
\begin{equation}
\begin{aligned}
\frac{\partial \log P(Y_i|\mathcal{M})}{\partial \gamma_k}
&= \sum_{s=1}^S b_s(\mathbf{y}_{i1}) \hat \beta_{i1}(s) \frac{\partial \pi_{is}}{\partial \gamma_k}.
\end{aligned}
\end{equation}

Now if state $s$ belongs to cluster $k$, we have 
\begin{equation}
\begin{aligned}
\frac{\partial \pi_{is}}{\partial \gamma_k} &= \pi^k_{s}\frac{\partial}{\partial \gamma_k} \frac{\exp(\textbf{x}^{\top}_i\gamma_k)}{\sum_{j=1}^K \exp(\textbf{x}^{\top}_i\gamma_j)}\\
&=\pi^k_{s}\mathbf{x}^\top_i w_{ik} (1 - w_{ik}),
\end{aligned}
\end{equation}
and 
\[
\frac{\partial \pi_{is}}{\partial \gamma_k} = -\pi^h_s \mathbf{x}^\top_i w_{ih}w_{ik},
\] otherwise, where $h$ is the index of the cluster containing state $s$.
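The derivatives of the mixture weights themselves are the usual multinomial logit (softmax) derivatives. The sketch below uses illustrative covariate and coefficient values; the helper \texttt{mixture\_weights()} is not part of \texttt{seqHMM}.
\begin{verbatim}
# Covariate-dependent mixture weights w_{ik} and their derivatives
# with respect to gamma_k (gamma_1 fixed to zero); illustrative values.
mixture_weights <- function(x, gamma) {   # gamma: K x p matrix, first row 0
  eta <- drop(gamma %*% x)
  exp(eta) / sum(exp(eta))
}
x <- c(1, 0.5)                            # covariates (incl. intercept)
gamma <- rbind(c(0, 0), c(0.2, -0.3), c(-0.5, 0.1))
w <- mixture_weights(x, gamma)
k <- 2
dwk_dgammak <- w[k] * (1 - w[k]) * x      # own-cluster term
dw3_dgammak <- -w[3] * w[k] * x           # cross-cluster term
\end{verbatim}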


\bibliographystyle{plainnat}
\bibliography{references_for_vignettes}

\end{document}