\documentclass[a4paper,12pt]{article}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
% \VignetteIndexEntry{Parametric duration models}
\newcommand{\btheta}{{\ensuremath{\boldsymbol{\theta}}}}
\newcommand{\bbeta}{{\ensuremath{\boldsymbol{\beta}}}}
\newcommand{\bz}{\ensuremath{\mathbf{z}}}
\newcommand{\bZ}{\ensuremath{\mathbf{Z}}}
\newcommand{\bx}{\ensuremath{\mathbf{x}}}
\newcommand{\bs}{\ensuremath{\mathbf{s}}}
\newcommand{\bt}{\ensuremath{\mathbf{t}}}
\newcommand{\bu}{\ensuremath{\mathbf{u}}}
\newcommand{\bd}{\ensuremath{\mathbf{d}}}
\newcommand{\bis}{\ensuremath{\mathbf{{\prime\prime}}}}
\newcommand{\pe}[1]{{\ensuremath{\frac{\partial}{\partial #1}}}}
\newcommand{\pt}[2]{{\ensuremath{\frac{\partial^2}{\partial #1\partial #2}}}}
\newcommand{\ezb}{\ensuremath{{e^{\bz_i \bbeta}}}}

\title{Parametric proportional hazards and accelerated failure time models}
\author{G{\"o}ran Brostr{\"o}m}
\date{February 16, 2009}
\begin{document}

\maketitle

\begin{abstract}
A unified implementation of parametric proportional hazards (PH) and
accelerated failure time (AFT) models for right-censored or
interval-censored and left-truncated data is described. The description
here is vaĺid for time-constant covariates, but the necessary modifications
for handling time-varying covariates are implemented in \emph{\tt
  eha}. Note that only piecewise constant time variation is handled.  
\end{abstract}

\section{Introduction}

There is a need for software for analyzing parametric proportional hazards
(PH) and accelerated failure time (AFT) data, that are right or interval
censored and left truncated.

\section{The proportional hazards model}\label{sec:ph}

We define proportional hazards models in terms of an expansion of a given
survivor function $S_0$, 
\begin{equation}\label{eq:sur}
s_\btheta(t; \bz) = \{S_0(g(t, \btheta))\}^{\exp(\bz\bbeta)},
\end{equation}
where $\btheta$ is a parameter vector used in modeling the baseline
distribution, $\bbeta$ is the vector of regression parameters, and $g$ is a
positive function, which helps defining a parametric family of baseline
survivor functions through
\begin{equation}\label{eq:surv}
S(t; \btheta) = S_0\bigl(g(t, \btheta)\bigr), \quad t > 0, \quad \btheta \in
\boldsymbol{\varTheta}. 
\end{equation}
With $f_0$ and $h_0$ defined as the density and hazard functions
corresponding to $S_0$, respectively, the density function
corresponding to $S$ is
\begin{equation*}
\begin{split}
f(t; \btheta) &= -\frac{\partial}{\partial t} S(t, \btheta)\\
&= -\frac{\partial}{\partial t} S_0(g(t, \btheta)) \\
&= g_t(t, \btheta) f_0(g(t, \btheta)),
\end{split}
\end{equation*}
where
\begin{equation*}
g_t(t, \btheta) = \frac{\partial}{\partial t} g(t, \btheta).
\end{equation*}
Correspondingly, the hazard function is
\begin{equation}\label{eq:haz}
\begin{split}
h(t; \btheta) &= \frac{f(t; \btheta)}{S(t; \btheta)} \\
&= g_t(t, \btheta) h_0(g(t, \btheta)). 
\end{split} 
\end{equation}
So, the proportional hazards model is
\begin{equation}\label{eq:prop}
\begin{split}
\lambda_{\btheta}(t; \bz) &= h(t; \btheta) \exp(\bz\bbeta)\\
&=  g_t(t, \btheta) h_0(g(t, \btheta)) \exp(\bz\bbeta),
\end{split}
\end{equation}
corresponding to \eqref{eq:sur}.

\subsection{Data and the likelihood function}

Given left truncated and right or interval censored data $(s_i, t_i, u_i,
d_i, \bz_i)$, $i = 
1, \ldots, n$ and the model \eqref{eq:prop}, the likelihood function becomes 
\begin{equation}\label{eq:lik}
\begin{split}
L\bigl((\btheta, \bbeta); (\bs, \bt, \bu, \bd), \bZ\bigr) &= 
\prod_{i=1}^n \bigl\{\bigl(h(t_i;
\btheta)\exp(\bz_i\bbeta)\bigr)^{I_{\{d_i=1\}}} \\ 
& \quad \times \bigl(S(t_i; \btheta)^{\exp(\bz_i\bbeta)}
\bigr)^{I_{\{d_i \ne 2\}}}\\  
& \quad \times \bigl(S(t_i; \btheta)^{\exp(\bz_i\bbeta)} - S(u_i; \btheta)^{\exp(\bz_i\bbeta)}
\bigr)^{I_{\{d_i = 2\}}} \\
& \quad \times S(s_i;\btheta)^{-\exp(\bz_i\bbeta)} \bigr\}  
\end{split}
\end{equation}
%\end{document}
Here, for $i = 1, \ldots, n$, $s_i < t_i \le u_i$ are the left truncation and exit
intervals, respectively, $d_i = 0$ means that $t_i = u_i$ and right
censoring at $u_i$, $d_i = 1$ means that $t_i = u_i$ and an event at $u_i$,
and $d_i = 2$ means that $t_i < u_i$ and an event occurs in the interval
$(t_i, u_i)$ (interval censoring)
and $\bz_i = (z_{i1}, \ldots, z_{ip})$ is a vector of explanatory
variables with corresponding parameter vector $\bbeta = (\beta_1, \ldots,
\beta_p)$, $i = 1, \ldots, n$. %And
%\begin{equation*}
%P(t_i, u_i, \bz_i; \btheta, \bbeta) = S(t_i; \btheta)^{\exp(\bz_i\bbeta)}
%- S(u_i; \btheta)^{\exp(\bz_i\bbeta)} 
%\end{equation*}

From \eqref{eq:lik} we now get the log likelihood and the score vector in a
straightforward manner.
\begin{equation}
\begin{split}
\ell\bigl((\btheta, \bbeta); (\bs, \bt, \bu, \bd), \bZ\bigr) &=
\sum_{i: d_i = 1} 
\bigl\{\log h(t_i; \btheta) + \bz_i\bbeta\bigr\} \\
&\quad + \sum_{i:d_i \ne 2} e^{\bz_i\bbeta}
\log S(u_i; \btheta) \\ 
&\quad + \sum_{i:d_i = 2} 
\log\bigl\{S(t_i; \btheta)^{\ezb} - S(u_i; \btheta)^{e^{\bz_i\bbeta}}
 \bigr\} \\
& \quad - \sum_{i=1}^n \ezb \log S(s_i; \btheta)
\end{split}
\end{equation}
and (in the following we drop the long argument list to $\ell$), for the
regression parameters \bbeta,
\begin{equation}
\begin{split}
\frac{\partial}{\partial \beta_j}\ell &= \sum_{i:d_i=1}  z_{ij} \\
&\quad + \sum_{i:d_i \ne 2} z_{ij}\ezb
\log S(t_i; \btheta)  \\
&\quad + \sum_{i:d_i = 2}z_{ij}\ezb
\frac{S(t_i;\btheta)^\ezb\log S(t_i;\btheta) - S(u_i;\btheta)^\ezb\log
  S(u_i;\btheta)} 
{S(t_i; \btheta)^\ezb - S(u_i; \btheta)^\ezb} \\
& \quad - \sum_{i=1}^n z_{ij}\ezb\log S(s_i; \btheta), \quad j = 1, \ldots, p, 
\end{split}
\end{equation}
and for the ``baseline'' parameters \btheta, in vector form,
\begin{equation}
\begin{split}
\frac{\partial}{\partial \btheta}\ell &= \sum_{i:d_i = 1}
\frac{h_\btheta(t_i, \btheta)}{h(t_i, \btheta)} \\
&\quad + \sum_{i:d_i \ne 2}e^{\bz_i\bbeta}
 \frac{S_\btheta(t_i; \btheta)}{S(t_i; \btheta)} \\ 
&\quad + \sum_{i:d_i = 2} \ezb
\frac{S(t_i;\btheta)^{\ezb - 1} S_\btheta(t_i;\btheta) -
  S(u_i;\btheta)^{\ezb - 1}  S_\btheta(u_i;\btheta)}  
{S(t_i; \btheta)^\ezb - S(u_i; \btheta)^\ezb} \\
& \quad - \sum_{i=1}^n\ezb\frac{S_\btheta(s_i; \btheta)}{S(s_i; \btheta)}.
\end{split}
\end{equation}

From \eqref{eq:haz},
\begin{equation}\label{eq:hazprim}
\begin{split}
h_\btheta(t, \btheta) &= \pe{\btheta}h(t, \btheta) \\
&= g_{t\btheta}(t, \btheta)h_0(g(t, \btheta)) + 
g_t(t, \btheta) g_\theta(t, \btheta)h^\prime_0(g(t, \btheta)), 
\end{split}
\end{equation}
and, from \eqref{eq:surv},
\begin{equation}\label{eq:survprim}
\begin{split}
S_\btheta(t; \btheta) &= \pe{\btheta}S(t; \btheta) = 
\pe{\btheta}S_0\bigl(g(t, \btheta)\bigr)\\
&= -g_\btheta(t, \btheta) f_0\bigl(g(t, \btheta)\bigr).
\end{split}
\end{equation}

For estimating standard errors, the observed information (the negative of
the hessian) is useful. However, instead of the error-prone and tedious
work of calculating analytic second-order derivatives, we will rely on 
approximations by numerical differentiation.

\section{The shape--scale families}

From \eqref{eq:sur} we get a \emph{shape--scale} family of distributions by
choosing $\btheta = (p, \lambda)$ and
\begin{equation*}
g(t, (p, \lambda)) = \biggl(\frac{t}{\lambda}\biggr)^p, \quad t \ge 0;
\quad p,
\lambda > 0.
\end{equation*}
However, for reasons of efficient numerical optimization and normality of
parameter 
estimates, we use the parametrisation $p = \exp(\gamma)$ and $\lambda =
\exp(\alpha)$, thus redefining $g$ to
\begin{equation} \label{eq:gshsc}
g(t, (\gamma, \alpha)) =
\biggl(\frac{t}{\exp(\alpha)}\biggr)^{\exp(\gamma)}, \quad t \ge 0; 
\quad -\infty < \gamma, \alpha < \infty.
\end{equation}
 For the calculation of the score and hessian of the log likelihood
 function, we need some partial derivatives of $g$. They are found in an
 appendix.

\subsection{The Weibull family of distributions}

The Weibull family of distributions is obtained by
\begin{equation*}
S_0(t) = \exp(-t), \quad t \ge 0,
\end{equation*}
leading to
\begin{equation*}
f_0(t) = \exp(-t), \quad t \ge 0,
\end{equation*} 
and
\begin{equation*}
h_0(t) = 1, \quad t \ge 0.
\end{equation*}
We need some first and second order derivatives of $f$ and $h$, and they
are particularly simple in this case, for $h$ they are both zero, and for
$f$ we get
\begin{equation*}
f_0^\prime(t) = -\exp(-t), \quad t \ge 0.
\end{equation*}

\subsection{The EV family of distributions}

The EV (Extreme Value) family of distributions is obtained by setting
\begin{equation*}
h_0(t) = \exp(t), \quad t \ge 0,
\end{equation*}
leading to
\begin{equation*}
S_0(t) = \exp\{-(\exp(t) - 1)\}, \quad t \ge 0,
\end{equation*}

The rest of the necessary functions are easily derived from this.

\subsection{The Gompertz family of distributions}

The Gompertz family of distributions is given by
\begin{equation*}
h(t) = \tau\exp(t/\lambda), \quad t \ge 0; \quad \tau, 
\lambda > 0.  
\end{equation*}
This family is not directly possible to generate from the described 
shape-scale models, so it is treated separately by direct maximum likelihood.

\subsection{Other families of distributions}

Included in the \emph{eha} package are the lognormal and the loglogistic 
distributions as well.
  
\section{The accelerated failure time model}
In the description of this family of models, we generate a scape-scale
family of distributions as defined by the equations \eqref{eq:surv} and
\eqref{eq:gshsc}. We get
\begin{equation}\label{eq:survaft}
\begin{split}
S(t; (\gamma, \alpha)) &= S_0\bigl(g(t, (\gamma, \alpha))\bigr) \\
&= S_0\biggl(\biggl\{\frac{t}{\exp(\alpha)}\biggr\}^{\exp(\gamma)}\biggr)
, \quad t >
0, \quad -\infty < \gamma, \alpha < \infty. 
\end{split}
\end{equation}
Given a vector $\bz = (z_1, \ldots, z_p)$ of explanatory variables and a
vector of corresponding regression 
coefficients $\bbeta = (\beta_1, \ldots, \beta_p)$, the AFT regression
model is defined by
\begin{equation}\label{eq:aftreg}
\begin{split}
S(t; (\gamma, \alpha, \bbeta)) &= 
S_0\bigl(g(t\exp(\bz\bbeta), (\gamma, \alpha))\bigr) \\
&=
S_0\biggl(\biggl\{\frac{t\exp(\bz\bbeta)}{\exp(\alpha)}\biggr\}^{\exp(\gamma)}\biggr)
\\
&= S_0\biggl(\biggl\{\frac{t}{\exp(\alpha -
  \bz\bbeta)}\biggr\}^{\exp(\gamma)}\biggr) \\
&= S_0\bigl(g(t, (\gamma, \alpha - \bz\bbeta))\bigr), \quad t > 0. 
\end{split}
\end{equation}
So, by defining $\btheta = (\gamma, \alpha - \bz\bbeta)$, we are back in the
framework of Section \ref{sec:ph}. We get
\begin{equation*}
f(t; \btheta) = g_t(t, \btheta) f_0(g(t, \btheta))
\end{equation*}
and
\begin{equation}\label{eq:afthaz}
h(t; \btheta) = g_t(t, \btheta) h_0(g(t, \btheta)), 
\end{equation}
defining the AFT model generated by the survivor function $S_0$ and
corresponding density $f_0$ and hazard $h_0$.

\subsection{Data and the likelihood function}

Given left truncated and right or interval censored data $(s_i, t_i, u_i,
d_i, \bz_i)$, $i = 
1, \ldots, n$ and the model \eqref{eq:afthaz}, the likelihood function becomes 
\begin{equation}\label{eq:aftlik}
\begin{split}
L\bigl((\gamma, \alpha, \bbeta); (\bs, \bt, \bd), \bZ\bigr) &= 
\prod_{i=1}^n \bigl\{h(t_i; \btheta_i)^{I_{\{d_i=1\}}} \\
& \quad \times S(t_i; \btheta_i)^{I_{\{i\ne 2\}}} \\
& \quad \times \bigl(S(t_i; \btheta_i) - S(u_i;
\btheta_i)\bigr)^{I_{\{d_i=2\}}} \\
& \quad \times S(s_i; \btheta_i)^{-1}\bigr\}
\end{split}  
\end{equation}
Here, for $i = 1, \ldots, n$, $s_i < t_i \le u_i$ are the left truncation and exit
intervals, respectively, $d_i = 0$ means that $t_i = u_i$ and right
censoring at $t_i$, $d_i = 1$ means that $t_i = u_i$ and an event at $t_i$,
and $d_i = 2$ means that $t_i < u_i$ and an event uccurs in the interval
$(t_i, u_i)$ (interval censoring),
and $\bz_i = (z_{i1}, \ldots, z_{ip})$ is a vector of explanatory
variables with corresponding parameter vector $\bbeta = (\beta_1, \ldots,
\beta_p)$, $i = 1, \ldots, n$.

From \eqref{eq:aftlik} we now get the log likelihood and the score vector in a
straightforward manner.
\begin{equation*}
\begin{split}
\ell\bigl((\gamma, \alpha, \bbeta); (\bs, \bt, \bu, \bd), \bZ\bigr) &=
\sum_{i:d_i=1} \log h(t_i; \btheta_i) \\
&\quad + \sum_{i:d_i \ne 2} \log S(t_i; \btheta_i) \\
& \quad + \sum_{i:d_i = 2} \log\bigl(S(t_i; \btheta_i\bigr) - S(u_i;
\btheta_i\bigr)\bigr) \\
&\quad - \sum_{i=1}^n 
\log S(s_i; \btheta_i) 
\end{split}
\end{equation*}
and (in the following we drop the long argument list to $\ell$), for the
regression parameters \bbeta,

\begin{equation*}
\begin{split}
\frac{\partial}{\partial \beta_j}\ell &= \sum_{d_i=1}
\frac{h_j(t_i, \btheta_i)}{h(t_i, \btheta_i)} 
 + \sum_{d_i \ne 2} \frac{S_j(t_i; \btheta_i)}{S(t_i; \btheta_i)} \\
& \quad + \sum_{d_i = 2} \frac{S_j(t_i; \btheta_i) - S_j(u_i;
  \btheta_i)}{S(t_i; \btheta_i) -  S(u_i; \btheta_i)} 
 - \sum_{i = 1}^n \frac{S_j(s_i; \btheta_i)}{S(s_i; \btheta_i)}
 \\
&= -\sum_{d_i=1} z_{ij}
\frac{h_\alpha(t_i, \btheta_i)}{h(t_i, \btheta_i)} 
 - \sum_{d_i \ne 2} z_{ij}\frac{S_\alpha(t_i; \btheta_i)}{S(t_i; \btheta_i)} \\
& \quad - \sum_{d_i = 2} z_{ij}\frac{S_\alpha(t_i; \btheta_i) - S_\alpha(u_i;
  \btheta_i)}{S(t_i; \btheta_i) -  S(u_i; \btheta_i)} 
+ \sum_{i = 1}^n z_{ij}\frac{S_\alpha(s_i; \btheta_i)}{S(s_i; \btheta_i)}
\end{split}
\end{equation*}
and for the ``baseline'' parameters $\gamma$ and $\alpha$, 
\begin{equation*}
\begin{split}
\frac{\partial}{\partial \gamma}\ell &= 
\sum_{i:d_i=1}
\frac{h_\gamma(t_i, \btheta_i)}{h(t_i, \btheta_i)}
+  \sum_{i:d_i \ne 2}\frac{S_\gamma(t_i; \btheta_i)}{S(t_i; \btheta_i)} \\
& \quad + \sum_{i:d_i = 2}\frac{S_\gamma(t_i; \btheta_i) - S_\gamma(u_i; \btheta_i)} 
{S(t_i; \btheta_i) - S(u_i; \btheta_i)} 
- \sum_{i=1}^n \frac{S_\gamma(s_i; \btheta_i)}{S(s_i; \btheta_i)}, 
\end{split}
\end{equation*}
and
\begin{equation*}
\begin{split}
\frac{\partial}{\partial \alpha}\ell &= 
\sum_{i:d_i=1}
\frac{h_\alpha(t_i, \btheta_i)}{h(t_i, \btheta_i)}
+  \sum_{i:d_i \ne 2}\frac{S_\alpha(t_i; \btheta_i)}{S(t_i; \btheta_i)} \\
& \quad + \sum_{i:d_i = 2}\frac{S_\alpha(t_i; \btheta_i) - S_\alpha(u_i; \btheta_i)} 
{S(t_i; \btheta_i) - S(u_i; \btheta_i)} 
- \sum_{i=1}^n \frac{S_\alpha(s_i; \btheta_i)}{S(s_i; \btheta_i)}. 
\end{split}
\end{equation*}
Here, from \eqref{eq:haz},
\begin{equation*}\label{eq:gammahazprim}
\begin{split}
h_\gamma(t, \btheta_i) &= \pe{\gamma}h(t, \btheta_i) \\
&= g_{t\gamma}(t, \btheta_i)h_0(g(t, \btheta_i)) + 
g_t(t, \btheta_i) g_\gamma(t, \btheta_i)h^\prime_0(g(t, \btheta_i)), 
\end{split}
\end{equation*}

\begin{equation*}\label{eq:alphahazprim}
\begin{split}
h_\alpha(t, \btheta_i) &= \pe{\alpha}h(t, \btheta_i) \\
&= g_{t\alpha}(t, \btheta_i)h_0(g(t, \btheta_i)) + 
g_t(t, \btheta_i) g_\alpha(t, \btheta_i)h^\prime_0(g(t, \btheta_i)), 
\end{split}
\end{equation*}
and
\begin{equation*}\label{eq:betahazprim}
\begin{split}
h_j(t, \btheta_i) &= \pe{\beta_j}h(t, \btheta_i) 
= \pe{\alpha}h(t, \btheta_i)\pe{\beta_j}(\alpha - \bz_i\bbeta) \\
&= -z_{ij}h_\alpha(t, \btheta_i), \quad j = 1, \ldots, p.
\end{split}
\end{equation*}
Similarly, from \eqref{eq:surv} we get
\begin{equation*}\label{eq:gammasurvprim}
\begin{split}
S_\gamma(t; \btheta_i) &= \pe{\gamma}S(t; \btheta_i) = 
\pe{\gamma}S_0\bigl(g(t, \btheta_i)\bigr)\\
&= -g_\gamma(t, \btheta_i) f_0\bigl(g(t, \btheta_i)\bigr),
\end{split}
\end{equation*}

\begin{equation*}\label{eq:alphasurvprim}
\begin{split}
S_\alpha(t; \btheta_i) &= \pe{\alpha}S(t; \btheta_i) = 
\pe{\alpha}S_0\bigl(g(t, \btheta_i)\bigr)\\
&= -g_\alpha(t, \btheta_i) f_0\bigl(g(t, \btheta_i)\bigr).
\end{split}
\end{equation*}
and
\begin{equation*}\label{eq:betasurvprim}
\begin{split}
S_j(t; \btheta_i) &= \pe{\beta_j}S(t; \btheta_i) = 
\pe{\alpha}S_0\bigl(g(t, \btheta_i)\bigr)\pe{\beta_j}(\alpha - \bz_i\bbeta) \\
&= -z_{ij} S_\alpha(t, \btheta_i), \quad j = 1, \ldots, p.
\end{split}
\end{equation*}

For estimating standard errors, the observed information (the negative of
the hessian) is useful, so
\begin{multline*}
-\pt{\beta_j}{\beta_m}\ell = -\sum_{i:d_i=1}
z_{ij}z_{im}\biggl\{\frac{h_{\alpha\alpha}(t_i, \btheta_i)}{h(t_i, \btheta_i)} - 
\biggl(\frac{h_\alpha(t_i, \btheta_i)}{h(t_i, \btheta_i)}\biggr)^2\biggr\} \\
- \sum_{i:i \ne 2} z_{ij}z_{im}
\biggl\{\frac{S_{\alpha\alpha}(t_i, \btheta_i)}{S(t_i, \btheta_i)} 
 -\biggl(\frac{S_\alpha(t_i, \btheta_i)}{S(t_i,
   \btheta_i)}\biggr)^2\biggr\} 
\\ 
- \sum_{i:i = 2} z_{ij}z_{im} 
\biggl\{\frac{S_{\alpha\alpha}(t_i, \btheta_i) - S_{\alpha\alpha}(u_i,
  \btheta_i)}{S(t_i,
  \btheta_i) -  S(u_i, \btheta_i)}  
 -\biggl(\frac{S_\alpha(t_i, \btheta_i) - S_{\alpha}(u_i, \btheta_i)}{S(t_i,
   \btheta_i) - S(u_i, \btheta_i)}\biggr)^2\biggr\} 
\\ 
+ \sum_{i=1}^nz_{ij}z_{im}\biggl\{\frac{S_{\alpha\alpha}(s_i,
  \btheta_i)}{S(s_i, \btheta_i)} 
 -\biggl(\frac{S_\alpha(s_i, \btheta_i)}{S(s_i,
   \btheta_i)}\biggr)^2\biggr\}, \quad j, m = 1, \ldots, p,
\end{multline*}
and
\begin{multline*}
-\pt{\beta_j}{\tau}\ell = \sum_{i:d_i=1}
z_{ij}\biggl\{\frac{h_{\alpha\tau}(t_i, \btheta_i)}{h(t_i, \btheta_i)} - 
\frac{h_\alpha(t_i, \btheta_i)h_{\tau}(t_i, \btheta_i)}{h^2(t_i,
  \btheta_i)}\biggr\} \\
+ \sum_{i:i \ne 2} z_{ij}
\biggl\{\frac{S_{\alpha\tau}(t_i, \btheta_i)}{S(t_i, \btheta_i)} 
 -\frac{S_\alpha(t_i, \btheta_i) S_{\tau}(t_i, \btheta_i)}{S^2(t_i,
   \btheta_i)}\biggr\} 
\\ 
+ \sum_{i:i = 2} z_{ij} 
\biggl\{\frac{S_{\alpha\tau}(t_i, \btheta_i) - S_{\alpha\tau}(u_i,
  \btheta_i)}{S(t_i,
  \btheta_i) -  S(u_i, \btheta_i)}  \\
 -\frac{\bigl(S_\alpha(t_i, \btheta_i) - S_{\alpha}(u_i, \btheta_i)\bigr)\bigl(S_\tau(t_i, \btheta_i) - S_{\tau}(u_i, \btheta_i)\bigr)}
{\bigl(S(t_i,
   \btheta_i) - S(u_i, \btheta_i)\bigr)^2}\biggr\} 
\\ 
- \sum_{i=1}^nz_{ij}\biggl\{\frac{S_{\alpha\tau}(s_i,
  \btheta_i)}{S(s_i, \btheta_i)} 
 -\frac{S_\alpha(s_i, \btheta_i) S_\tau(s_i, \btheta_i)}
{S^2(s_i, \btheta_i)}\biggr\} \\ 
\quad j = 1, \ldots, p; \; \tau = \gamma, \alpha, 
\end{multline*}
and finally
\begin{multline*}
-\pt{\tau}{\tau^\prime}\ell = -\sum_{i:d_i=1}
\biggl\{\frac{h_{\tau^\prime\tau}(t_i, \btheta_i)}{h(t_i, \btheta_i)} - 
\frac{h_{\tau^\prime}(t_i, \btheta_i)h_{\tau}(t_i, \btheta_i)}{h^2(t_i,
  \btheta_i)}\biggr\} \\
- \sum_{i:i \ne 2}
\biggl\{\frac{S_{\tau^\prime\tau}(t_i, \btheta_i)}{S(t_i, \btheta_i)} 
 -\frac{S_{\tau^\prime}(t_i, \btheta_i) S_{\tau}(t_i, \btheta_i)}{S^2(t_i,
   \btheta_i)}\biggr\} 
\\ 
- \sum_{i:i = 2} 
\biggl\{\frac{S_{\tau^\prime\tau}(t_i, \btheta_i) - S_{\tau^\prime\tau}(u_i,
  \btheta_i)}{S(t_i,
  \btheta_i) -  S(u_i, \btheta_i)}  \\
 -\frac{\bigl(S_{\tau^\prime}(t_i, \btheta_i) - S_{\tau^\prime}(u_i, \btheta_i)\bigr)\bigl(S_\tau(t_i, \btheta_i) - S_{\tau}(u_i, \btheta_i)\bigr)}
{\bigl(S(t_i,
   \btheta_i) - S(u_i, \btheta_i)\bigr)^2}\biggr\} 
\\ 
+ \sum_{i=1}^n\biggl\{\frac{S_{\tau^\prime\tau}(s_i,
  \btheta_i)}{S(s_i, \btheta_i)} 
 -\frac{S_{\tau^\prime}(s_i, \btheta_i) S_\tau(s_i, \btheta_i)}
{S^2(s_i, \btheta_i)}\biggr\} \\ 
\quad  (\tau, \tau^\prime) = (\gamma, \gamma), (\gamma, \alpha), (\alpha, \alpha). 
\end{multline*}
The second order partial derivatives $h_{\tau\tau^\prime}$ and
$S_{\tau\tau^\prime}$ are 
\begin{equation}
\begin{split}
h_{\tau\tau^\prime}(t, \btheta) &= \pe{\tau^\prime} h_\tau(t,
\btheta) \\
&= g_{t\tau\tau^\prime}(t, \btheta) h_0(g(t, \btheta)) +
g_{t\tau}(t, \btheta) g_{\tau^\prime}(t, \btheta) h_0^\prime(g(t,
\btheta)) \\
&\quad + g_t(t, \btheta) g_\theta(t,
\btheta)g_{\tau^\prime}(t, \btheta)h^{\bis}_0(g(t, \btheta)) \\
&\quad +
g_t(t, \btheta) g_{\theta\theta^\prime}(t, \btheta)h^\prime_0(g(t, \btheta)) \\
&\quad +
g_{t\tau^\prime}(t, \btheta) g_\theta(t, \btheta)h^\prime_0(g(t,
\btheta)) \\
&= h_0(g(t, \btheta)) g_{t\tau\tau^\prime}(t, \btheta) \\
&\quad + h_0^\prime(g(t, \btheta)) \bigl\{g_t(t,
\theta)g_{\theta\theta^\prime}(t, \btheta) \\
& \quad \quad \quad \quad \quad + 
g_{t\tau}(t, \btheta) g_{\tau^\prime}(t, \btheta) \\
&\quad \quad \quad \quad \quad + 
g_{t\tau^\prime}(t, \btheta) g_{\tau}(t, \btheta)   \bigr\} \\
&\quad + h_0^\bis(g(t, \btheta))g_t(t, \btheta) g_\theta(t,
\btheta)g_{\tau^\prime}(t, \btheta),, \\
& \quad (\tau, \tau^\prime) = (\gamma, \gamma), (\gamma, \lambda),
(\lambda, \lambda),
\end{split}
\end{equation}
and from \eqref{eq:survprim},
\begin{equation}
\begin{split}
S_{\tau\tau^\prime}(t, \btheta) &= \pe{\tau^\prime} S_\tau(t; \btheta) \\
&= -\bigl\{g_{\tau\tau^\prime}(t, \btheta)f_0\bigl(g(t, \btheta)\bigr) +
g_\tau(t, \btheta) g_{\tau^\prime}(t, \btheta) f_0^\prime\bigl(g(t,
\btheta)\bigr) \bigr\}, \\
& \quad (\tau, \tau^\prime) = (\gamma, \gamma), (\gamma, \lambda),
(\lambda, \lambda). 
\end{split}
\end{equation}
 
\appendix

\section{Some partial derivatives}

The function (see \eqref{eq:gshsc})
\begin{equation}
g(t, (\gamma, \alpha)) =
\biggl(\frac{t}{\exp(\alpha)}\biggr)^{\exp(\gamma)}, \quad t \ge 0; 
\quad -\infty < \gamma, \alpha < \infty.
\end{equation}
has the following partial derivatives:
\begin{align*}
g_t\bigl(t, (\gamma, \alpha)\bigr) &= \frac{\exp(\gamma)}{t}g\bigl(t,
(\gamma, \alpha)\bigr), \quad t > 0  \\ 
g_\gamma\bigl(t, (\gamma, \alpha)\bigr) &= g\bigl(t, (\gamma, \alpha)\bigr)
\log\bigl\{g\bigl(t, (\gamma, \alpha)\bigr)\bigr\}   \\
g_\alpha\bigl(t, (\gamma, \alpha)\bigr) &= -\exp(\gamma)g\bigl(t, (\gamma,
\alpha)\bigr)
\end{align*}
\begin{align*} 
g_{t\gamma}\bigl(t, (\gamma, \alpha)\bigr) &=  g_t\bigl(t, (\gamma,
\alpha)\bigr) + \frac{\exp(\gamma)}{t} g_\gamma\bigl(t, (\gamma,
\alpha)\bigr), \quad t > 0\\  
g_{t\alpha}\bigl(t, (\gamma, \alpha)\bigr) &= -\exp(\gamma) g_t\bigl(t,
(\gamma, \alpha)\bigr), \quad t > 0  \\
g_{\gamma^2}\bigl(t, (\gamma, \alpha)\bigr) &= g_\gamma\bigl(t, (\gamma,
\alpha)\bigr)\bigl\{1 + \log g\bigl(t, (\gamma, \alpha)\bigr)\bigr\}\\  
g_{\gamma\alpha}\bigl(t, (\gamma, \alpha)\bigr) &= g_\alpha\bigl(t, (\gamma,
\alpha)\bigr)\bigl\{1 + \log g\bigl(t, (\gamma, \alpha)\bigr)\bigr\}\\  
g_{\alpha^2}\bigl(t, (\gamma, \alpha)\bigr) &= -\exp(\gamma)g_\alpha\bigl(t, (\gamma,
\alpha)\bigr)
\end{align*}
\begin{align*} 
g_{t\gamma^2}\bigl(t, (\gamma, \alpha)\bigr) &= g_{t\gamma}\bigl(t,(\gamma,
\alpha)\bigr) \\
&\quad + \frac{\exp(\gamma)}{t} g_\gamma\bigl(t, (\gamma,
\alpha)\bigr) \bigl\{2 + \log g\bigl(t, (\gamma, \alpha)\bigr)\bigr\} \\
g_{t\gamma\alpha}\bigl(t, (\gamma, \alpha)\bigr) &= -\exp(\gamma)\bigl\{
g_t\bigr(t, (\gamma, \alpha)\bigr) + g_{t\gamma}\bigr(t, (\gamma, \alpha)\bigr)
\bigr\}  \\
g_{t\alpha^2}\bigl(t, (\gamma, \alpha)\bigr) &= -\exp(\gamma)
g_{t\alpha}\bigl(t, (\gamma, \alpha)\bigr)
\end{align*}

The formulas will be easier to read if we remove all function arguments,
i.e., $(t, (\gamma, \alpha))$:
\begin{align*}
g_t &= \frac{\exp(\gamma)}{t}g, \quad t > 0  \\ 
g_\gamma &= g
\log g   \\
g_\alpha &= -\exp(\gamma)g   \\ 
g_{t\gamma} &=  g_t + \frac{\exp(\gamma)}{t}
g_\gamma, \quad t > 0\\  
g_{t\alpha} &= -\exp(\gamma) g_t, \quad t > 0  \\
g_{\gamma^2} &= g_\gamma\bigl\{1 + \log g\bigr\}\\  
g_{\gamma\alpha} &= g_\alpha\bigl\{1 + \log
g\bigr\}\\   
g_{\alpha^2} &= -\exp(\gamma)g_\alpha \\ 
g_{t\gamma^2} &= g_{t\gamma} + \frac{\exp(\gamma)}{t} g_\gamma
\bigl\{2 + \log g\bigr\}, \quad t > 0 \\ 
g_{t\gamma\alpha} &= -\exp(\gamma)\bigl\{g_t + g_{t\gamma}\bigr\}, \quad t > 0  \\
g_{t\alpha^2} &= -\exp(\gamma) g_{t\alpha}, \quad t > 0
\end{align*}

\end{document}