%\VignetteIndexEntry{HDBRR-extdoc}
\documentclass[article,shortnames,nojss]{jss}


%Extra packages
\usepackage{amsmath}
\usepackage{amsthm, amssymb,latexsym,amsfonts}
\usepackage{tikz}
\usepackage{subfig}
\usepackage{graphicx}
\usepackage{appendix}
\usepackage{rotating}
\usepackage{multirow, array}


\newlength{\RoundedBoxWidth}
\newsavebox{\GrayRoundedBox}
\newenvironment{GrayBox}[1][\dimexpr\textwidth-4.5ex]%
   {\setlength{\RoundedBoxWidth}{\dimexpr#1}
    \begin{lrbox}{\GrayRoundedBox}
       \begin{minipage}{\RoundedBoxWidth}}%
   {   \end{minipage}
    \end{lrbox}
    \begin{center}
    \begin{tikzpicture}%
       \draw node[draw=black,fill=black!10,rounded corners,%
             inner sep=2ex,text width=\RoundedBoxWidth]%
             {\usebox{\GrayRoundedBox}};
    \end{tikzpicture}
    \end{center}}

\newcommand{\y}{\boldsymbol{\mathrm{y}}}
\newcommand{\X}{\boldsymbol{\mathrm{X}}}
\newcommand{\x}{\mathrm{X}}
\newcommand{\yy}{\mathrm{y}}
\newcommand{\U}{\boldsymbol{\mathrm{U}}}
\newcommand{\V}{\boldsymbol{\mathrm{V}}}
\newcommand{\D}{\boldsymbol{\mathrm{D}}}
\newcommand{\p}{\boldsymbol{\mathrm{P}}}
\newcommand{\Q}{\boldsymbol{\mathrm{Q}}}
\newcommand{\R}{\boldsymbol{\mathrm{R}}}
\newcommand{\s}{\boldsymbol{\mathrm{S}}}
\newcommand{\I}{\boldsymbol{\mathrm{I}}}
\newcommand{\B}{\boldsymbol{\beta}}
\newcommand{\si}{\boldsymbol{\Sigma}}
\newcommand{\g}{\boldsymbol{\gamma}}
\newcommand{\post}{\textit{a posteriori }}
\newcommand{\prior}{\textit{a priori }}
\newcommand{\Post}{\textit{A Posteriori }}
\newcommand{\Prior}{\textit{A Priori }}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% declarations for jss.cls %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% almost as usual
\author{Monroy-Castillo B. \\Colegio de Postgraduados$^{*}$ \And \,\,
        P\'erez-Elizalde S.$^{*}$ \And \,\,\,\,\,\,\,\,\quad
        P\'erez-Rodr\'iguez P.$^{*}$ \And
        Crossa J.\\CIMMyT}

\title{HDBRR: A Statistical Package for High Dimensional Ridge Regression without MCMC}

%% for pretty printing and a nice hypersummary also set:
\Plainauthor{Blanca Monroy-Castillo, Sergio P\'erez-Elizalde, Paulino P\'erez-Rodr\'iguez} %% comma-separated
\Plaintitle{HDBRR: A Statistical Package for High Dimensional Ridge Regression without MCMC} %% without formatting
\Shorttitle{\pkg{HDBRR}: An R-package for Ridge Regression without MCMC} %% a short title (if necessary)

%% an abstract and keywords
\Abstract{
Ridge regression is a useful tool to deal with collinearity in
 the homoscedastic linear regression model, providing biased estimators
 of the regression parameters with lower variance than the least
 squares estimators. Moreover, when the number
 of predictors ($p$) is much larger than the number of
 observations ($n$), ridge regression gives a unique estimator by restricting the parametric space to a neighborhood of the origin. From the Bayesian
 point of view, ridge regression results from assigning a Gaussian prior to
 the regression parameters and assuming they are conditionally
 independent. However, from both the classical and Bayesian approaches the
estimation of parameters is a highly demanding computational task, in the
first case an optimization problem and in the second a
high dimensional integration problem usually addressed through
Markov Chain Monte Carlo (MCMC). The main drawback of MCMC is the practical impossibility of checking convergence to the posterior distribution, which is commonly very
slow due to the large number of regression parameters. Here we propose a computational algorithm to obtain posterior estimates of the regression parameters, the variance components and predictions for the conventional ridge regression model, based on a reparameterization of the model which allows us to obtain the marginal posterior means and variances by integrating out a nuisance parameter whose marginal posterior is defined on the open interval $(0,1)$.
}
\Keywords{Bayesian Methods, Regression, Variable Selection, Shrinkage, Ridge Regression, MCMC, \proglang{R}}
\Plainkeywords{Bayesian Methods, Regression, Variable Selection, Shrinkage, Ridge Regression, MCMC, R} %% without formatting

%% publication information
%% NOTE: Typically, this can be left commented and will be filled out by the technical editor
%% \Volume{50}
%% \Issue{9}
%% \Month{June}
%% \Year{2012}
%% \Submitdate{2012-06-04}
%% \Acceptdate{2012-06-04}

%% The address of (at least) one author should be given
%% in the following format:
\Address{
  Sergio P\'erez-Elizalde\\
  Socio Econom\'ia Estad\'istica e Inform\'atica \\
  Colegio de Postgraduados, M\'exico\\
  E-mail: \email{sergiop@colpos.mx}\\

  Blanca Monroy-Castillo\\
  Socio Econom\'ia Estad\'istica e Inform\'atica \\
  Colegio de Postgraduados, M\'exico\\
  E-mail: \email{blancamonroy.96@gmail.com}\\

  Paulino P\'erez-Rodr\'iguez\\
  Socio Econom\'ia Estad\'istica e Inform\'atica \\
  Colegio de Postgraduados, M\'exico\\
  E-mail: \email{perpdgo@colpos.mx}\\

}

%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}
\SweaveOpts{concordance=TRUE}

%% include your article here, just as usual
%% Note that you should use the \pkg{}, \proglang{} and \code{} commands.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
\label{sec:Intro}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Nowadays most research areas use massive quantities of information generated by increasingly sophisticated computer equipment; for example, in genomics an increasing amount of data becomes available as new sequencing technologies appear. Many statistical models have been proposed in order to learn valuable information from data; however, even with the simplest models, statisticians and data scientists have to deal with high dimensional inference problems which require millions of computations. One such model is ridge regression, a useful tool to deal with collinearity in the homoscedastic linear regression model that provides biased estimators of the regression parameters with lower variance than the least squares estimators. Moreover, when the number of predictors ($p$) is much larger than the number of observations ($n$), ridge regression gives a unique estimator by restricting the parametric space.

From the Bayesian point of view, ridge regression results from assigning a Gaussian prior to the regression parameters and assuming they are conditionally independent. However, since the Bayesian estimation of parameters is a high dimensional integration problem, it is also a highly demanding computational task, which is usually addressed through Markov Chain Monte Carlo (MCMC), in particular Gibbs sampling, because the full posterior conditionals are available in closed form. The most successful MCMC option implemented in the \proglang{R} software is the package \pkg{BGLR} \citep{BGLR}; other non-Bayesian \proglang{R} package options are \pkg{penalized} \citep{penal} and \pkg{ridge} \citep{ridge}.

The main drawback of MCMC in high dimensional settings is checking convergence to the joint posterior distribution, which is commonly very slow due to the large number of regression parameters and the high correlations between successive samples from the conditional posteriors in the Gibbs sampling implementation of MCMC. As \cite{rajaratnam2015} shows for the regression model, while the MCMC samples yield a good approximation of the posterior means of the regression parameters, their posterior variances and the posterior mean of the residual variance may be underestimated if the simulated Markov chain is not long enough; moreover, the required length of the chain is still an open research question. In this paper we propose a simple numerical method to estimate posterior means and variances of the parameters in the ridge regression model without resorting to MCMC. We use the SVD and QR decompositions together with a reparameterization to get closed expressions of the conditional posteriors, from which we obtain the marginal posterior means and variances by numerical integration on the open interval $(0,1)$; furthermore, variable selection and prediction are straightforward consequences. The proposed method is implemented in \proglang{R} and allows working within the big matrix framework by using the storage and parallelization packages \pkg{bigstatsr}, \pkg{bigparallelr} and \pkg{parallel}.

\section{Method and Materials}

In the Bayesian approach to statistical inference we formally combine, through Bayes' rule, prior information and sample data to learn about unknown quantities of interest. The uncertainty about the parameter of interest $\theta$ before observing data is expressed by the prior distribution, the information about $\theta$ that comes from observed data is incorporated by the likelihood function, and by Bayes' formula we obtain the distribution of the parameter given the data (the posterior distribution) \citep{Lee}. However, calculating the posterior distribution is not always an easy task because integration is required and, even in low dimensional settings, Monte Carlo or numerical integration methods are needed. In this context, \cite{Gilks} introduced Markov Chain Monte Carlo (MCMC), which provides a straightforward and intuitive way to both simulate values from an unknown distribution and use those simulated values to perform subsequent analyses \citep{MCMC3}. For more information about MCMC see \cite{MCMC,MCMC2}. In this paper we do not use the sampling based approach to approximate the posterior distribution; instead we focus on a numerical approximation of the posterior of a nuisance parameter in order to integrate it out and to obtain numerical approximations of the posterior means and variances of the regression parameters and predictions.

\subsection{Model}
Consider the linear model
$$
y_i=\mathbf{x}^{\prime}_{i}\boldsymbol{\beta}+\epsilon_i,\quad i=1,\ldots,n.
$$
where $\epsilon_i$, $i=1,\ldots,n$, is a Gaussian error with mean $0$ and variance $\sigma^2$, $\boldsymbol{\beta}\in\mathbb{R}^p$ is a vector of regression coefficients and $\mathbf{x}_{i}$, $i=1,\ldots,n$, is a $p$-dimensional vector of covariates observed without error. Moreover, assume that $\mathrm{cov}\left(\epsilon_i,\epsilon_j\right)=0$, $i\neq j$. In matrix form the model is written as
\begin{eqnarray}\label{eqn:model}
\y &=& \X\boldsymbol{\beta} \,\,+\,\, \boldsymbol{\varepsilon},
\end{eqnarray}

where $\B = (\beta_1,\dots, \beta_p)'$ is the vector of \textit{regression parameters} and $\boldsymbol{\varepsilon} = (\varepsilon_1,\dots,\varepsilon_n)^T$ is the vector of random errors distributed as $\mathcal{N}_n(\boldsymbol{0},\sigma^2\I_{n})$. The $n\times p$ matrix $\X$ is called the \textit{design matrix} and $\y$ is generally referred to as the \textit{response} vector. Since the mean of $\y$, $\X\boldsymbol{\beta}$, is a linear combination of the columns of $\X$, the model in (\ref{eqn:model}) is known as the \textit{linear regression model}.

When the number of observations is greater than the number of covariates, $n>p$, the best linear unbiased estimator of $\B$ is $\hat{\B}=(\X'\X)^{-1}\X'\y$. However, when multicollinearity occurs, although the least squares estimators are unbiased, their variances are inflated because $\X'\X$ tends to be singular. Another scenario where obtaining the least squares estimator is an ill-posed problem occurs when $p\gg n$, which implies that $\hat{\B}$ is not unique. In both cases some sort of restriction of the parametric space or penalization is needed in order to have unique estimators with lower variance. By adding a degree of bias to the regression estimates, ridge regression gives a unique estimator of $\B$ with variance lower than that of the least squares estimator.

\subsection{Ridge Regression}
\cite{Hoerld} and \cite{Hoerle} first suggested that to control the inflation and general instability associated with the least squares estimates we may use the same Tikhonov regularization for all the regression parameters, which gives the ridge estimator:
\begin{eqnarray}\label{eq:ridge}
\hat{\B}^* &=& \left(\X'\X+k\I\right)^{-1}(\X'\y); \quad \quad k>0,
\end{eqnarray}
where $k$ is the ridge penalty parameter. Large values of $k$ tend to reduce the magnitude of the estimated regression coefficients, leading to fewer effective model parameters \citep{Cannon}. See \citet{Hoerla,Hoerlb,Hoerlc,RR,Alheety,waheed} for more about ridge regression.
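For illustration, the ridge estimator in (\ref{eq:ridge}) can be computed directly in \proglang{R}; the following minimal sketch (not part of the \pkg{HDBRR} package) assumes an $n\times p$ matrix {\tt X} and a response vector {\tt y}:
\begin{verbatim}
## Minimal sketch of the ridge estimator (X'X + k I)^{-1} X'y for a given k > 0.
ridge_est <- function(X, y, k) {
  solve(crossprod(X) + k * diag(ncol(X)), crossprod(X, y))
}
\end{verbatim}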

\subsection{Bayesian inference for ridge regression}
In Bayesian inference all the uncertainty about the unknown parameters $\left(\B,\sigma^2\right)$ is described by the joint posterior distribution, which is obtained, through Bayes' rule, as the likelihood function (the joint density of $\y$ seen as a function of the parameters) times the prior. The prior is a probability density function used to measure the uncertainty about $\left(\B,\sigma^2\right)$ before any data have been observed.

From model in (\ref{eqn:model}) the likelihood function for the parameters $\left(\B,\sigma^2\right)$ is given by

\begin{eqnarray}
L (\boldsymbol{\beta}, \sigma^2\,\, |\,\, \y) &\propto& \left(\frac{1}{\sigma^2}\right)^{n/2} \exp\left\{-\frac{1}{2\sigma^2}(\y-\X\boldsymbol{\beta})^T(\y-\X\boldsymbol{\beta})\right\}.\label{eq:likfun}
\end{eqnarray}

If the prior for each $\beta_j$, $j=1,\ldots,p$, is Gaussian with mean $0$ and variance $\sigma_\beta^2$ and prior independence of $\B$ and $\sigma^2$ is assumed then the joint prior is of the form
\begin{eqnarray}
\pi(\B, \sigma^2,\sigma^2_{\B}) &=& \pi\left(\B\, |\, \sigma^2_{\B}\right)\,\pi(\sigma^2)\,\pi(\sigma^2_{\B}),\label{eq:prior}
\end{eqnarray}
where,
\begin{eqnarray*}
\pi\left(\B\, |\, \sigma^2_{\B}\right) &=& \mathcal{N}_p\left(\B\,|\, \boldsymbol{0}, \sigma^2_{\B}\I_{p}\right)
\end{eqnarray*}
and for the variance parameters we may assign conjugate priors \citep{prior, prior2}; that is, the inverse gamma distributions
\begin{eqnarray*}
\pi\left(\sigma^2\right) &=& \mathcal{IG}\left(\sigma^2\left | \frac{n_0}{2},\frac{n_0s_0^2}{2}\right.\right) \\
\pi\left(\sigma^2_{\B}\right) &=& \mathcal{IG}\left(\sigma^2_{\B}\left | \frac{p_0}{2},\frac{p_0d_0^2}{2}\right.\right),
\end{eqnarray*}
where $n_0,s_0^2,p_0$ and $d_0^2$ are known hyperparameters. The prior independence of the elements of $\B$ given $\sigma_\beta^2$ implies that marginally the distribution of $\B$ is multivariate $\emph{t}$ with $p_0$ degrees of freedom. This hierarchical structure implies that the regression parameters are not independent of each other, which seems an appropriate structure in the presence of collinearity or when $p\gg n$. The prior variance of the elements of $\B$ also has an interpretation in terms of regularization: for fixed $\sigma^2$, as $\sigma_\beta^2$ tends to $0$ the shrinkage towards the prior mean increases, which means that large values of the parameters are penalized.

Since the estimation of the ridge regression parameters is an ill-posed problem, prior elicitation is a critical step in Bayesian inference because the posterior is sensitive to the values assigned to the hyperparameters in the prior of $\B$. Here we use an approach close to those proposed by \cite{Yongtao} and \cite{BGLR} to model the initial knowledge about $\B$, $\sigma^2$ and $\sigma^2_{\beta}$. The prior expected value and variance of $\sigma^2$ are

\begin{eqnarray*}
\text{E}\left[\sigma^2 \left|  \frac{n_0}{2},\frac{n_0s_0^2}{2} \right.\right] &=& \frac{n_0s_0^2}{n_0-2}, \quad \quad \quad n_0>2.\\
\text{Var}\left[\sigma^2 \left|  \frac{n_0}{2},\frac{n_0s_0^2}{2} \right.\right] &=& \frac{2n_0^2s_0^4}{(n_0-2)^2(n_0-4)}, \quad \quad \quad n_0>4,
\end{eqnarray*}
with analogous expressions for $\sigma^2_{\B}$.
\begin{eqnarray*}
\end{eqnarray*}
Therefore, in order for the variance components to have finite first and second order prior moments, the values of $n_0$ and $p_0$ should be at least 5. Hence, we use 5 as the default value for $n_0$ and $p_0$ in the \pkg{HDBRR} package, but flat priors can be approached as $n_0$ and $p_0$ tend to $0$. To assign values to $s_0^2$ and $d_0^2$ we use the prior expected value of the proportion of variance explained (PVE) by the model with respect to the residual variance, which is given by
\begin{eqnarray*}
\text{PVE} &=& \frac{1}{n}\sum_{i=1}^n \frac{(\boldsymbol{x}_i'\boldsymbol{\beta})^2}{\sigma^2} \,\,=\,\,\frac{1}{n\sigma^2} \sum_{i=1}^n(\sum_{j=1}^p x_{ij}\beta_j)^2.
\end{eqnarray*}
Then, by noting that
\begin{eqnarray*}
\text{E}\left[\left(\sum_{j=1}^p x_{ij}\beta_j\right)^2\big\vert\sigma^2,\sigma^2_{\beta}\right]&=&\text{Var}\left[\sum_{j=1}^p x_{ij}\beta_j\big\vert\sigma^2,\sigma^2_{\beta}\right]=\sigma^2_{\beta}\sum_{j=1}^p x_{ij}^2
\end{eqnarray*}
we have that
\begin{eqnarray*}
\text{E}[\text{PVE}] &=& \text{E}[\text{E}[\text{PVE}\big\vert\sigma^2,\sigma^2_{\beta}]] \,\, = \,\, \frac{1}{s_0^2}\frac{p_0d_0^2}{p_0-1}\sum_{i=1}^n\sum_{j=1}^p\frac{x_{ij}^2}{n}.
\end{eqnarray*}
Let
\begin{eqnarray*}
\text{h} &=& \frac{\text{E}[\text{PVE}]}{1+\text{E}[\text{PVE}]} \,\,=\,\, \frac{p_0d_0^2\sum_{i=1}^n\sum_{j=1}^p\frac{x_{ij}^2}{n}}{s_0^2(p_0-1)+p_0d_0^2\sum_{i=1}^n\sum_{j=1}^p\frac{x_{ij}^2}{n}}
\end{eqnarray*}
be the proportion of the total prior variance explained by the model. From this,
\begin{eqnarray*}
\frac{h}{1-h} &=& \frac{p_0d_0^2\sum_{i=1}^n\sum_{j=1}^p\frac{x_{ij}^2}{n}}{s_0^2(p_0-1)},
\end{eqnarray*}
therefore
\begin{eqnarray*}
d_0^2 &=& \left(\frac{h}{1-h}\right)\frac{s_0^2(p_0-1)}{p_0\sum_{i=1}^n\sum_{j=1}^p\frac{x_{ij}^2}{n}}.
\end{eqnarray*}
The value $s_0^2$ may be interpreted as a prior guess about the residual variance.

In particular, when each covariate and the response have been centered about their sample means,
\begin{eqnarray*}
d_0^2 &=& \left(\frac{h}{1-h}\right)\frac{s_0^2(p_0-1)}{p_0\sum_{j=1}^ps_j^2};
\end{eqnarray*}
moreover, if all the data are standardized and $s_0^2=1$ we have that $d_0^2=\left(\frac{h}{1-h}\right)\frac{(p_0-1)}{p_0p}$, which approaches $1/p$ when $h=0.5$ and $p_0$ is large. Also note that as $h$ tends to $1$ the prior for $\sigma_{\beta}^2$ becomes a flat but proper distribution.
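As an illustration of the elicitation above, the following \proglang{R} sketch computes $d_0^2$ from $h$, $s_0^2$, $p_0$ and the covariate matrix; the helper below is hypothetical and not necessarily the internal rule used by \pkg{HDBRR}, which may differ in details:
\begin{verbatim}
## Sketch (hypothetical helper): prior scale d0^2 implied by h, s0^2 and p0,
## following the derivation above.
d20h <- function(X, h = 0.5, s20 = 1, p0 = 5) {
  sx2 <- sum(X^2) / nrow(X)                 # sum_i sum_j x_ij^2 / n
  (h / (1 - h)) * s20 * (p0 - 1) / (p0 * sx2)
}
## With standardized covariates, d0^2 is close to 1/p when h = 0.5 and p0 is large:
set.seed(1)
X <- scale(matrix(rnorm(50 * 200), 50, 200))
c(d20h(X, p0 = 50), 1 / ncol(X))
\end{verbatim}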

Once the prior distribution has been assigned and the likelihood function defined, the posterior distribution of the regression parameters is derived as follows.
\subsection{Posterior Distribution}
By Bayes' rule, the joint posterior is obtained as the product of the likelihood function in (\ref{eq:likfun}) and the prior in (\ref{eq:prior}). Thus,
\begin{eqnarray*}
\pi\left(\B,\,\sigma^2,\,\sigma^2_{\B}\mid\y\right) &\propto& L\left(\B,\, \sigma^2\,| \y\right)\, \pi(\B, \sigma^2,\sigma^2_{\B}),
\end{eqnarray*}
Now, consider the transformations $v \,=\, 1/\sigma^2\,+\, 1/\sigma^2_{\B}$ and $u \,=\, \sigma^2/\left(\sigma^2 +\sigma^2_{\B}\right)$; then, the joint posterior distribution of $(\B, \, u,\, v)$ is given by
\begin{eqnarray}\label{eqn:posterior}
\pi(\B,\,u,\,v\mid\y) &\propto& v^{\frac{n+n_0+p+p_0}{2}-1}u^{\frac{p+p_0}{2}-1}(1-u)^{\frac{n+n_0}{2}-1} \\\nonumber
& & \times \exp\left\{-\frac{v(1-u)}{2}\left[\left(\B-\hat{\B}(u)\right)'\si_n^{-1}(u)\left(\B-\hat{\B}(u)\right)\,+\, SSE(u)\,+\, n_0s_0^2\right] \right\}\\\nonumber
& &\quad \times \exp\left\{-\frac{uv}{2}\left(\hat{\B}'(u)\hat{\B}(u)\,+\,p_0d_0^2\right)\right\},
\end{eqnarray}
where $\si_n(u) \,=\, \left(\X'\X\,+\, \frac{u}{1-u}\I\right)^{-1}$, $\hat{\B}(u) \,=\, \si_n(u)\X'\y$,  $\hat{\y}(u) \,= \, \X\hat{\B}(u)$ and $SSE(u) \,=\, \left(\y-\hat{\y}(u)\right)'\left(\y-\hat{\y}(u)\right)$.  From (\ref{eqn:posterior}), the full conditional posterior of $\B$ is
\begin{eqnarray}
\pi(\B \,|\, u,\,v,\,\y) &=& \mathcal{N}_p \left(\B \,\left |\, \hat{\B}(u),\, \frac{1}{v(1-u)}\si_n(u)\right.\right).
\end{eqnarray}
Now, let $S(u) \,=\, (1-u) \, (SSE(u)\,+\, n_0s_0^2) \,+\, u(\hat{\B}'(u)\hat{\B}(u) \,+\, p_0d_0^2)$. Then, by the definition of the conditional distribution of $\B$ given $\left(u,v\right)$, we have that
\begin{eqnarray}
\pi(v \, |\, u,\, \y) &=& \frac{\pi(\B,\, v \,|\, u,\,\y)}{\pi(\B \,|\, u,\,v,\,\y)} \quad = \quad \mathcal{G} \left(v \, \left | \frac{n+n_0+p_0}{2},\, \frac{S(u)}{2}\right.\right),
\end{eqnarray}
where $\pi(\B,\, v \,|\, u,\,\y)$, from (\ref{eqn:posterior}), is a Normal-Gamma density. It follows that the posterior distribution of $\B$ given $u$ is $t$ with $\nu \,= \, n + n_0 + p_0$ degrees of freedom, mean
\begin{eqnarray*}
\text{E} [\B \,|\, u, \, \y] &=& \hat{\B}(u)
\end{eqnarray*}
and variance
\begin{eqnarray*}
\text{V} [\B\,|\, u,\, \y] &=& \frac{S(u)}{(\nu-2)(1-u)}\si_n(u), \quad \quad \quad \quad \nu-2 > 0.
\end{eqnarray*}
Finally, the marginal posterior of $u$ is obtained as
\begin{eqnarray}\label{eq:posterior}
\pi(u \,|\, \y) &=& \frac{\pi(u, \, v \, |\,  \y)}{\pi(v \,|\, u, \, \y)} \quad = \quad \frac{\pi(\B, \,u, \, v\, | \, \y )/ \pi(\B\,|\, u,\, v, \, \y)}{\pi(v\,|\, u,\, \y)}\nonumber \\
&\propto& u^{\frac{p+p_0}{2}-1}(1-u)^{\frac{n-p+n_0}{2}-1}\left|\si_n(u)\right|^{1/2} \nonumber \\
& & \times \left[(1-u)(SSE(u)+n_0s_0^2)\,+\, u\,\left(\hat{\B}'(u)\hat{\B}(u)+p_0d_0^2\right)\right] ^{-\frac{n+n_0+p_0}{2}}, u\in\left(0,1\right)
\end{eqnarray}
It is important to point out that the marginal moments of $\pi(\B\mid\y)$ can be obtained by the theorem of total expectation. In such a way, the unconditional posterior mean and variance of $\B$ are, respectively,
\begin{eqnarray*}
\text{E} [\B \,|\, \y]=\text{E} \left[\text{E} [\B \,|\, u, \, \y]\right] &=& \int_0^1\hat{\B}(u)\pi(u \,|\, \y)du
\end{eqnarray*}
and
\begin{eqnarray*}
\text{V} [\B \,|\, \y]&=&\text{E} \left[\text{V} [\B \,|\, u, \, \y]\right]+\text{V} \left[\text{E} [\B \,|\, u, \, \y]\right]\\
&=&\int_0^1\left(\frac{S(u)}{(\nu-2)(1-u)}\si_n(u)+\left(\hat{\B}(u)-\text{E} [\B \,|\, \y]\right)^2\right)\pi(u \,|\, \y)du.
\end{eqnarray*}
Both integrals above may be evaluated numerically with high precision in most cases.
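To make the numerical integration concrete, the following \proglang{R} sketch evaluates $\pi(u\mid\y)$ on a grid, up to a constant, and approximates $\text{E}[\B\mid\y]$ for a small simulated data set. This is a naive, illustrative implementation of the formulas above (it inverts $\X'\X+\frac{u}{1-u}\I$ directly), not the code used in the \pkg{HDBRR} package:
\begin{verbatim}
## Naive sketch of the 1-D numerical integration over u in (0,1) (small p only).
set.seed(2)
n <- 30; p <- 5
X <- matrix(rnorm(n * p), n, p)
y <- X %*% c(1, -1, 0.5, 0, 0) + rnorm(n)
n0 <- p0 <- 5; s20 <- 1; d20 <- 1 / p

logpost_u <- function(u) {                   # log pi(u | y), up to a constant
  Sn  <- solve(crossprod(X) + (u / (1 - u)) * diag(p))
  bu  <- Sn %*% crossprod(X, y)
  sse <- sum((y - X %*% bu)^2)
  0.5 * (p + p0 - 2) * log(u) + 0.5 * (n - p + n0 - 2) * log(1 - u) +
    0.5 * as.numeric(determinant(Sn)$modulus) -
    0.5 * (n + n0 + p0) * log((1 - u) * (sse + n0 * s20) +
                              u * (sum(bu^2) + p0 * d20))
}

ugrid <- seq(0.001, 0.999, length.out = 200)
lp    <- sapply(ugrid, logpost_u)
w     <- exp(lp - max(lp)); w <- w / sum(w)  # normalized weights on the grid
Bu    <- sapply(ugrid, function(u) solve(crossprod(X) + (u / (1 - u)) * diag(p),
                                         crossprod(X, y)))
Ebeta <- as.numeric(Bu %*% w)                # approximate E[beta | y]
\end{verbatim}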

\subsubsection{Marginal posterior distributions of variance components}
The marginal distributions of $\sigma^2$ and $\sigma^2_{\B}$ are obtained from the joint distribution, $\pi(u,\, v \,|\,\y) \,=\, \pi(v\,|\, u, \y)\pi(u\,|\,\y)$; that is, using the change of variable formula,
\begin{eqnarray*}
\pi(\sigma^2,\,\sigma^2_{\B}\, |\, \y) &=& \pi(v(\sigma^2,\,\sigma^2_{\B})\,|\, u\left(\sigma^2,\,\sigma^2_{\B}\right),\,\y)\, \pi \left(u\left(\sigma^2,\,\sigma^2_{\B}\right)\, \mid\y\right)\left|\frac{\partial(u,v)}{\partial\left(\sigma^2,\sigma^2_{\B}\right)}\right|.
\end{eqnarray*}
However, the marginals cannot be obtained in closed form, so numerical or Monte Carlo integration over $\mathbb{R}^+$ is needed. Nevertheless, if only point estimates are needed, it is possible to get them using one dimensional integration over the interval $(0,1)$. For example, the Bayesian estimator under squared loss of the ridge parameter $\lambda \,=\, \sigma^2/\sigma^2_{\B} \,=\, u/(1-u)$ is given by
\begin{eqnarray*}
\hat{\lambda}  &=&\text{E}[\lambda\mid\y] =\int_0^1\frac{u}{1-u} \, \pi(u \,|\, \y)\,\text{d}u.
\end{eqnarray*}
In the same way, the posterior means of $\sigma^2_{\B}$ and $\sigma^2$ are
\begin{eqnarray*}
\text{E}\left[\sigma^2_{\B} \,|\, \text{y}\right] &=& \text{E}\left[\left. \frac{1}{uv}\,\right| \text{y}\right] \quad = \quad \text{E}\left[\frac{1}{u}\, \text{E}\left[\left. \frac{1}{v}\right | u, \, \text{y}\right]\right] \\
&=& \frac{1}{n+n_0+p_0-1}  \int_0^1 \frac{S(u)}{u}\, \pi(u\,|\, \y) \,\text{d}u.
\end{eqnarray*}
and
\begin{eqnarray*}
\text{E}[\sigma^2\,|\, \text{y}] &=& \text{E} \left[\left. \frac{1}{v(1-u)}\,\right |\, \text{y} \right] \quad = \quad \frac{1}{n+n_0+p_0-1}\,\int_0^1 \frac{S(u)}{1-u}\,\pi(u\,|\, \y)\, \text{d}u.
\end{eqnarray*}
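Reusing the grid {\tt ugrid}, the weights {\tt w} and the matrix {\tt Bu} from the sketch above, these point estimates follow from the same one-dimensional integration; again this is only an illustration of the formulas, not the package code:
\begin{verbatim}
## S(u) on the grid, then posterior means of lambda, sigma^2 and sigma_beta^2.
Su <- sapply(seq_along(ugrid), function(i) {
  bu  <- Bu[, i]
  sse <- sum((y - X %*% bu)^2)
  (1 - ugrid[i]) * (sse + n0 * s20) + ugrid[i] * (sum(bu^2) + p0 * d20)
})
lambdahat <- sum(ugrid / (1 - ugrid) * w)                    # E[lambda | y]
sig2hat   <- sum(Su / (1 - ugrid) * w) / (n + n0 + p0 - 1)   # E[sigma^2 | y]
sigb2hat  <- sum(Su / ugrid * w) / (n + n0 + p0 - 1)         # E[sigma_beta^2 | y]
\end{verbatim}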
\subsection{Variable Selection}
Suppose that the prior density for $\B$ is such that,
\begin{eqnarray*}
\pi\left(\beta_j \,|\,\sigma_\beta^2,\, \gamma_j\right) &=& (1-\gamma_j)\text{N}\left(\beta_j \,|\, 0,\sigma^2_{\beta}\right)\, +\, \gamma_j \text{N} \left(\beta_j \,|\, 0, c_j^2\sigma^2_{\beta}\right), \quad j \,=\, 1,\dots,p.
\end{eqnarray*}
Here $\gamma_j \stackrel{iid}{\sim} \text{Bernoulli}(\phi_j)$ is an indicator variable such that $\gamma_j=1$ if the $j$-th predictor variable is included in the model and $\gamma_j = 0$ otherwise. To use this hierarchical mixture setup for variable selection, the hyperparameters $\sigma^2_{\beta}$ and $c_j^2\sigma^2_{\beta}$ are set ``small and large'', respectively, so that $\mathcal{N}(0,\sigma^2_{\beta})$ is concentrated around $0$ and $\mathcal{N}(0,c_j^2\sigma^2_{\beta})$ is diffuse, as in Figure \ref{fig:delta}.
\begin{figure}[htbp]
\centering
\includegraphics[scale=0.65]{1}
\caption{$\mathcal{N}(0,\sigma^2_{\beta})$ and $\mathcal{N}(0,c_j^2\sigma^2_{\beta})$ densities. Intersection at $\delta_{i}$}
\label{fig:delta}
\end{figure}

If the data support $\gamma_j = 0$ over $\gamma_j = 1$, then $\beta_j$ is probably small enough so that $\mathrm{X}_j$ will not be needed in the model. Suppose there is a value $\delta_{j} > 0$ such that if $|\beta_j|<\delta_{j}$ it would be preferable to exclude $\mathrm{X}_j$. The parameter $\delta_{j}$ should be chosen so that the posterior probability $\Pr(\gamma_j=1\mid\y)$ is higher for those values of $\beta_j$ such that $|\beta_j|>\delta_{j}$ than for those in the neighborhood of $0$. Before any data have been observed, $\delta_{j}$ may be fixed by choosing $\sigma^2_{\beta}$ and $c_j^2\sigma^2_{\beta}$ such that the pdf $\pi\left(\beta_j\,|\, \gamma_{j} = 0\right)\,=\, \mathcal{N}(\beta_j\mid 0,\sigma^2_{\beta})$ is larger than the pdf $\pi\left(\beta_j\,|\, \gamma_{j} = 1\right)\,=\, \mathcal{N}(\beta_j\mid 0,c_j^2\sigma^2_{\beta})$ on the interval $\left(-\delta_{j},\delta_{j}\right)$ (see Figure 1). This condition is satisfied for any $\sigma_{\beta}$ and $c_j$ such that
\begin{eqnarray}
\frac{\log\left({\frac{c_j^2\sigma^2_{\beta}}{\sigma^2_{\beta}}}\right)}{\frac{1}{\sigma^2_{\beta}}-\frac{1}{c_j^2\sigma^2_{\beta}}} &\leq& \delta_{j}^2.
\end{eqnarray}
Hence,
\begin{eqnarray*}
\delta_{j} &=& \sqrt{\frac{2c_j^2\sigma^2_{\beta}\log{(c_j)}}{c_j^2-1}},\quad c_j>1.
\end{eqnarray*}
For the selection of $c_j$, note that it is equal to the ratio of $\pi\left(\beta_j=0\,|\, \gamma_{j} = 0\right)$ to $\pi\left(\beta_j=0\,|\, \gamma_{j} = 1\right)$; thus, $c_j$ may be interpreted as the prior odds that $X_j$ should be excluded from the model if $\beta_j$ is too small. Further explanation about the selection of $c_j$ and $\delta_j$ can be found in \cite{Mc}. In what follows, we will assume that, for $j=1,\ldots,p$, $\phi_j=\phi$ and $c_j=c$, which implies that $\delta_j=\delta$. In this way, the variable selection procedure is suitable for covariates on the same scale.
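For instance, the threshold $\delta$ implied by given values of $c$ and $\sigma^2_{\beta}$ can be computed directly; with $c=100$ and the estimate of $\sigma^2_{\beta}$ reported later in Box 3d this gives $\delta\approx 7.96$, the value returned by the package in that example (a sketch, not the package internals):
\begin{verbatim}
## delta implied by the spike/slab ratio c (> 1) and the prior variance sigma_beta^2.
deltafun <- function(c, sigb2) sqrt(2 * c^2 * sigb2 * log(c) / (c^2 - 1))
deltafun(c = 100, sigb2 = 6.88)   # approx. 7.96, cf. Box 3d
\end{verbatim}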

The joint posterior distribution of $\g = (\gamma_1,\dots,\gamma_p)$ is given by

$$
\pi(\g \,|\, \y)\,=\, \pi(\y\,|\, \g)\pi(\g)/\pi(\y)
$$
where the marginal density of $\y$ given $\g$, in terms of the joint posterior of $\left(\B,\sigma^2,\sigma^2_{\beta}\right)$, is
\begin{eqnarray*}
\pi\left(\y \,|\, \g\right) &=& \frac{L\left(\B, \sigma^2\,|\, \y\right)\pi\left(\B\,|\, \sigma^2_{\beta}, \g\right)\pi\left(\sigma^2\right)\pi\left(\sigma^2_{\beta}\right)}{\pi\left(\B,\sigma^2,\sigma^2_{\beta}\,|\, \g,\y\right)}.
\end{eqnarray*}
Then,
\begin{eqnarray}\label{eqn:postga}\nonumber
\pi(\g\,|\, \y) &=& \quad \frac{\pi(\g)\,L\left(\B,\sigma^2\,|\, \y\right)\pi\left(\B\,|\, \sigma^2_{\beta},\g \right)\,\pi\left(\sigma^2\right)\pi\left(\sigma^2_{\beta}\right)}{\pi(\y)\,\pi\left(\B,\sigma^2,\sigma^2_{\beta}\,|\, \g,\y\right)} \\
&\propto& \frac{\pi(\g)\pi\left(\B \left | \sigma^2_{\beta},\g\right.\right)}{\pi\left(\left. \B,\sigma^2,\sigma^2_{\B}\,\right|\, \g,\y\right)}
\end{eqnarray}
where it should be noted that $\pi\left(\left. \B,\sigma^2,\sigma^2_{\B}\,\right|\, \g\,=\, \boldsymbol{0},\y\right)$ is the joint posterior given in (\ref{eqn:posterior}). Since,
\begin{eqnarray*}
\pi(\g\,|\, \y) &=& \pi\left(\gamma_j\,|\, \g_{-j}, \y \right)\pi\left(\g_{-j}\,|\, \y\right),
\end{eqnarray*}
where $\boldsymbol{\theta}_{-j}$ denotes the subvector composed of all the elements of $\boldsymbol{\theta}$ except the $j$-th element $\theta_j$. It then follows from (\ref{eqn:postga}) and prior independence that
\begin{eqnarray}\label{eqn:selection}
\pi\left(\gamma_j\,|\, \g_{-j},\y\right) &=& \frac{\pi(\g\,|\, \y)}{\pi\left(\g_{-j}\,|\,\y\right)} \quad \propto \quad \frac{\pi\left(\gamma_j\right)\pi\left(\beta_j \mid \sigma^2_{\beta},\gamma_j \right)}{\pi\left(\g_{-j}\,|\, \y\right)\pi\left(\B,\sigma^2,\sigma^2_{\beta}\,|\, \g,\y \right)} \nonumber \\ &\propto& \frac{\phi^{\gamma_j}(1-\phi)^{1-\gamma_j}\pi\left(\beta_j \,|\, \sigma^2_{\beta},\gamma_j\right)}{\pi\left(\B,\sigma^2,\sigma^2_{\beta}\,|\, \g,\y\right)}.\nonumber
\end{eqnarray}
Expressing the right hand side of the result above in terms of $(\B,\,u,\,v)$ we have the following:
\begin{eqnarray*}
\pi(\gamma_j\,|\, \g_{-j},\y) &\propto& \frac{\phi^{\gamma_j}(1-\phi)^{1-\gamma_j}\pi\left(\beta_j\,|\, u,v,\gamma_j\right)}{\pi\left(\B,u,v\,|\, \g,\y\right)} \\ &\propto& \frac{\phi^{\gamma_j}(1-\phi)^{1-\gamma_j}\pi\left(\beta_j \,|\,u,v,\gamma_j\right)}{\pi\left(\beta_j \,|\, \B_{-j},u,v,\g,\y\right)\pi\left(\B_{-j}\,|\, u,v,\g,\y\right)\pi(v\,|\, u,\g,\y)\pi(u\,|\, \g,\y)}\\ &\propto& \frac{\phi^{\gamma_j}(1-\phi)^{1-\gamma_j}\pi\left(\beta_j \,|\,u,v,\gamma_j\right)}{\pi\left(\beta_j \,|\, \B_{-j},u,v,\g,\y\right)\pi(v\,|\, u,\g,\y)\pi(u\,|\, \g,\y)}.
\end{eqnarray*}
Then, the conditional posterior probability of exclusion of the variable $X_j$ is
\begin{eqnarray}\label{eqn:sel2}
\text{Pr}\left(\gamma_j = 0 \,|\, \g_{-j} \,=\, \boldsymbol{0},\y\right)&=& \pi\left(\gamma_j \,=\,0 \,|\, \g_{-j} \,=\, \boldsymbol{0},\y\right)\\ &\propto& \frac{(1-\phi)\pi\left(\beta_j\,|\, u,v,\gamma_j = 0\right)}{\pi\left(\beta_j \,|\, \B_{-j},u,v,\g=\boldsymbol{0},\y\right)\pi(v\,|\, u,\g=\boldsymbol{0},\y)\,\pi(u\,|\, \g=\boldsymbol{0},\y)}\nonumber \\
&=&C_j \frac{(1-\phi)\pi\left(\beta_j\,|\, u,v,\gamma_j = 0\right)}{\pi\left(\beta_j \,|\, \B_{-j},u,v,\g=\boldsymbol{0},\y\right)}. \nonumber
\end{eqnarray}
Note that the exclusion probability (\ref{eqn:sel2}) does not depend on the values of $\B,\,u$ and $v$ and should be equal to one when the conditional posterior mean of $\beta_j$ is equal to the prior mean and $\beta_j=0$. Moreover, since the prior and the conditional posterior are both normal, it can be verified that the proportionality constant is $C_j= (1-\phi)^{-1}\sqrt{\sum_{i = 1}^n \frac{1-u}{u}x_{ij}^2+1}$. Thus, the probability that the $j$-th predictor is included in the model, given that the others have been excluded, is:
\begin{eqnarray}\label{eqn:phat}
\hat{p}_j=\text{Pr}\left(\gamma_j \,=\, 1 \,|\, \g_{-j}\,=\, \boldsymbol{0},\y\right) &=&1 - \text{Pr}\left(\gamma_j = 0 \,|\, \g_{-j}=\boldsymbol{0},\y\right) \nonumber \\ &=& 1 - \exp\left\{-\frac{\tilde{u}\tilde{v}}{2}\tilde{\beta}_j^2-\frac{(1-\tilde{u})\tilde{v}}{2V_j}\left(\tilde{\beta}_j-E_j\right)^2\right\}
\end{eqnarray}
where $V_j=\left(\sum_{i = 1}^n x_{ij}^2+\frac{\tilde{u}}{1-\tilde{u}}\right)^{-1}$ and $E_j=V_j\x_{j}^{\prime}\left(\y-\X_{-j}\tilde{\B}_{-j}\right)$ are the conditional posterior variance and mean of $\beta_j$. The quantities $\tilde{u}$, $\tilde{v}$ and $\tilde{\B}$ are arbitrary values of $u$, $v$ and $\B$ with high posterior density. From (\ref{eqn:phat}) we have that the conditional Bayes factor in favour of including the covariate $\x_j$ in the model is
$$
BF_j=\frac{(1-\phi)\hat{p}_j}{(1-\hat{p}_j)\phi}=\frac{(1-\phi)}{\phi}\left(\exp\left\{\frac{\tilde{v}(1-\tilde{u})}{2}\left[\frac{\tilde{u}}{(1-\tilde{u})}\tilde{\beta}_j^2+\frac{1}{V_j}\left(\tilde{\beta}_j-E_j\right)^2\right]\right\}-1\right)
$$
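The inclusion probability in (\ref{eqn:phat}) and the conditional Bayes factor above are cheap to evaluate once high posterior density values $\tilde{u}$, $\tilde{v}$ and $\tilde{\B}$ are available. The following \proglang{R} sketch (hypothetical helpers, with $E_j$ supplied by the caller) illustrates the computation for a single covariate:
\begin{verbatim}
## Conditional inclusion probability and Bayes factor for covariate j (sketch).
## xj: j-th column of X; ut, vt, btj: high posterior density values of u, v, beta_j;
## Ej: conditional posterior mean of beta_j given the other covariates are excluded.
phatj <- function(xj, ut, vt, btj, Ej) {
  Vj <- 1 / (sum(xj^2) + ut / (1 - ut))
  1 - exp(-0.5 * ut * vt * btj^2 - 0.5 * (1 - ut) * vt * (btj - Ej)^2 / Vj)
}
BFj <- function(phat, phi = 0.5) (1 - phi) * phat / ((1 - phat) * phi)
\end{verbatim}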

\subsection{Posterior Computation}
The main feature of the parameterization proposed above is that it enables the use of standard matrix algebra to speed up the computation of the posterior distributions without using sampling techniques. This significantly reduces computation time by avoiding slow sampling methods. In fact, once the posterior distribution of $u$ is calculated, all the estimates of interest are almost automatically available.
\subsubsection{Posterior computation through SVD decomposition}
Let $\U\D\V'$ be the full singular value decomposition (SVD) of the matrix of covariates $\X$ in the linear model (\ref{eqn:model}). Note that $\V=[\V_1,\V_2]$, where $\V_1$ and $\V_2$ have orthonormal columns, and $\D= [\s,\boldsymbol{0}]$ is a rectangular diagonal matrix, where $\s$ is the $n \times n$ diagonal matrix of positive singular values $s_1 \geq \dots \geq s_n$ of $\X$ and the last $p-n$ columns are all zero vectors. \\ Hence
\begin{eqnarray*}
\si_n(u) &=& \left(\X'\X+\frac{u}{1-u}\I_p\right)^{-1}\\
&=& \left(\V\D'\D\V'+\frac{u}{1-u}\V\V'\right)^{-1}\\
&=& \V\left(\begin{bmatrix}  \boldsymbol{\Lambda} & \boldsymbol{0}_{n,p-n} \\ \boldsymbol{0}_{p-n,n} & \boldsymbol{0}_{p-n,p-n}\end{bmatrix}+\frac{u}{1-u}\I_p\right)^{-1}\V' \\
&=& \V\begin{bmatrix} \boldsymbol{\Lambda}+\frac{u}{1-u}\I_n & \boldsymbol{0}_{n,p-n} \\ \boldsymbol{0}_{p-n,n} & \frac{u}{1-u}\I_{p-n} \end{bmatrix}^{-1}\V',
\end{eqnarray*}
where $\boldsymbol{\Lambda} = \s'\s = \text{diag}(\lambda_1,\dots,\lambda_n)$ is a diagonal matrix whose elements are the eigenvalues of $\X\X'$. \\ Similarly,
\begin{eqnarray*}
\hat{\B}(u) &=& \si_n(u)\X'\y \\
&=& \V \begin{bmatrix}\boldsymbol{\Lambda} + \frac{u}{1-u}\I_n & \boldsymbol{0}_{n,p-n} \\ \boldsymbol{0}_{p-n,n} & \frac{u}{1-u}\I_{p-n}  \end{bmatrix}^{-1}\V'\V\D'\U'\y \\
&=& \V_1\left(\boldsymbol{\Lambda} + \frac{u}{1-u}\I_n\right)^{-1}\s\U'\y.
\end{eqnarray*}
And
\begin{eqnarray*}
\hat{\y}(u) &=& \X\hat{\B}(u) \\
&=& \U\D\V' \V_1\left(\boldsymbol{\Lambda} + \frac{u}{1-u}\I_n\right)^{-1}\s\U'\y \\
&=& \U\s \left(\boldsymbol{\Lambda} + \frac{u}{1-u}\I_n\right)^{-1} \s'\U'\y\\
&=& \U\boldsymbol{\mathrm{P}}\U'\y,
\end{eqnarray*}
where $\boldsymbol{\mathrm{P}}$ is a diagonal matrix with $\boldsymbol{\mathrm{P}}_{jj} = \frac{(1-u)\lambda_j}{(1-u)\lambda_j+u}$, $j=1,\dots,n$. \\ Thus, for $p>n$, substituting the SVD of the covariance matrix into (\ref{eq:posterior}) we have\\
\begin{eqnarray}\label{eq:posterior1}
\pi(u \,|\, \y) &\propto& u^{\frac{p_0+p}{2}-1}(1-u)^{\frac{n_0+n-p}{2}-1} \left|\X'\X+\frac{u}{1-u}\I\right|^{-1/2} [(1-u)(SSE(u)+n_0s_0^2)]^{-\frac{n+n_0+p_0}{2}} \nonumber \\ && \times\left[1+\frac{u}{1-u}\frac{\hat{\boldsymbol{\beta}}'(u)\hat{\boldsymbol{\beta}}(u)+p_0d_0^2}{SSE(u)+n_0s_0^2}\right]^{-\frac{n+n_0+p_0}{2}} \nonumber \\ &\propto& u^{\frac{p_0+p}{2}-1}(1-u)^{\frac{n_0+n-p}{2}-1}\begin{vmatrix} \boldsymbol{\Lambda}+\frac{u}{1-u}\I_n &  \boldsymbol{0}_{n,p-n} \\ \boldsymbol{0}_{p-n,n} &  \frac{u}{1-u}\I_{p-n}\end{vmatrix}^{-1/2} \nonumber \\ && \times \, [(1-u)(SSE(u)+n_0s_0^2)]^{-\frac{n+n_0+p_0}{2}}\left[1+\frac{u}{1-u}\frac{\hat{\boldsymbol{\beta}}'(u)\hat{\boldsymbol{\beta}}(u)+p_0d_0^2}{SSE(u)+n_0s_0^2}\right]^{-\frac{n+n_0+p_0}{2}} \nonumber \\
&\propto&  u^{\frac{p+p_0}{2}-1}(1-u)^{\frac{n+n_0}{2}-1}u^{\frac{n-p}{2}}\left(\prod_{j=1}^n\left[(1-u)\lambda_j+u\right]\right)^{-\frac{1}{2}} \\ & & \times  [(1-u)(SSE(u)+n_0s_0^2)]^{-\frac{n+n_0+p_0}{2}}  \, \left[1+\frac{u}{1-u}\frac{\hat{\boldsymbol{\beta}}'(u)\hat{\boldsymbol{\beta}}(u)+p_0d_0^2}{SSE(u)+n_0s_0^2}\right]^{-\frac{n+n_0+p_0}{2}},\nonumber
\end{eqnarray}
where
\begin{eqnarray}
SSE(u) &=& (\y-\hat{\y}(u))'(\y-\hat{\y}(u)) \nonumber\\
&=& (\y-\U\boldsymbol{\mathrm{P}}\U'\y)' (\y-\U\boldsymbol{\mathrm{P}}\U'\y)\\
&=& \y'\U(\I-\boldsymbol{\mathrm{P}})^2\U'\y. \nonumber
\end{eqnarray}
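A direct \proglang{R} transcription of these SVD-based quantities, for the $p>n$ case and a single value of $u$, might look as follows; this is a sketch of the algebra above, not the \pkg{HDBRR} internals (which also rely on \pkg{bigstatsr} for large matrices):
\begin{verbatim}
## SVD-based quantities for the p > n case at a given u in (0,1) (sketch).
svdquant <- function(X, y, u) {
  s    <- svd(X)                                  # X = U D V1', U: n x n, V1: p x n
  lam  <- s$d^2                                   # eigenvalues of X X'
  Pjj  <- (1 - u) * lam / ((1 - u) * lam + u)     # diagonal of P
  Uty  <- crossprod(s$u, y)                       # U'y
  bhat <- s$v %*% (s$d / (lam + u / (1 - u)) * Uty)
  yhat <- s$u %*% (Pjj * Uty)
  sse  <- sum(((1 - Pjj) * Uty)^2)                # y'U(I - P)^2 U'y
  list(betahat = bhat, yhat = yhat, SSE = sse, lambda = lam, P = Pjj)
}
\end{verbatim}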

In general, the marginal posterior of $u$ is
\begin{eqnarray}
\pi(u \,|\, \y) &\propto&u^{\frac{p+p_0}{2}-1}(1-u)^{\frac{n+n_0}{2}-1}u^{\frac{n-p}{2}I(p>n)}\left(\prod_{j=1}^{\min(n,p)}\left[(1-u)\lambda_j+u\right]\right)^{-\frac{1}{2}} \\\nonumber
& & \times  [(1-u)(SSE(u)+n_0s_0^2)]^{-\frac{n+n_0+p_0}{2}}  \, \left[1+\frac{u}{1-u}\frac{\hat{\boldsymbol{\beta}}'(u)\hat{\boldsymbol{\beta}}(u)+p_0d_0^2}{SSE(u)+n_0s_0^2}\right]^{-\frac{n+n_0+p_0}{2}}
\end{eqnarray}
Also note that
\begin{eqnarray*}
\text{Cov}\left(\hat{\y}(u),\y\,|\, u\right) &=& \text{Cov}\left(\U\boldsymbol{\mathrm{P}}\U'\y,\y\,|\, u\right)\,\, =\,\, \U\boldsymbol{\mathrm{P}}\U'\text{Cov}(\y)\,\, = \,\, \sigma^2\U\boldsymbol{\mathrm{P}}\U'
\end{eqnarray*}
and
\begin{eqnarray*}
\text{Var}\left(\hat{\y}(u)\,|\, u\right) &=& \sigma^2 \U\boldsymbol{\mathrm{P}}\U'	,
\end{eqnarray*}
hence
\begin{eqnarray}\label{eq:cor}
\text{cor}\left(\hat{\mathrm{y}}_i,\mathrm{y}_i\right) &=& \int_0^1 \frac{\sum_{j=1}^ru_{i,j}^2\boldsymbol{\mathrm{P}}_{jj}}{\sqrt{\sum_{j=1}^ru_{i,j}^2\boldsymbol{\mathrm{P}}^2_{jj}}}\,\pi(u\,|\, \y)\,du.
\end{eqnarray}
For ridge regression the effective degrees of freedom may be calculated as the posterior expected value of the trace of the matrix $\U\boldsymbol{\mathrm{P}}\U'$; that is:
\begin{eqnarray*}
edf &=& \text{E}\,\left[\text{tr}(\U\boldsymbol{\mathrm{P}}\U'\,|\,\y)\right] \,\,=\,\, \text{E}\,\left[\text{tr}(\boldsymbol{\mathrm{P}}\,|\,\y)\right]\,\, = \,\, \int_0^1 \sum_{j=1}^{\min(n,p)} \boldsymbol{\mathrm{P}}_{jj}\pi(u\,|\, \y)\, \text{d}u.
\end{eqnarray*}
A naive but useful approach to calculate the quantities above may be to plug in the posterior mode of $u$. Thus, for example, the effective degrees of freedom may be approximated by
\begin{eqnarray*}
\widehat{edf} &=& \sum_{j=1}^{\min(n,p)} \frac{\left(1-\hat{u}\right)\lambda_j}{\left(1-\hat{u}\right)\lambda_j+\hat{u}},
\end{eqnarray*}
where $\hat{u} = \arg\max_u\,\pi(u\,|\, \y)$ is the posterior mode. Obviously, when there is no shrinkage, $\hat{u}=0$ and $edf=\text{rank}(\X)$.
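In \proglang{R}, this plug-in approximation is a one-liner given the eigenvalues $\lambda_j$ and the posterior mode $\hat{u}$ (again a sketch, not the package code):
\begin{verbatim}
## Effective degrees of freedom at the posterior mode uhat (sketch).
edfhat <- function(lam, uhat) sum((1 - uhat) * lam / ((1 - uhat) * lam + uhat))
\end{verbatim}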

\subsubsection{Posterior Computation through QR decomposition}
Another procedure to compute the posterior distribution is based on the QR factorization of $\X'$. That is, let $\X'=\Q\R$, where $\Q=[\Q_1,\Q_2]$ is a $p\times p$ orthogonal matrix and $\R = [\R_1',\R_2']'$ is a $p\times n$ matrix, where $\R_1$ is an $n\times n$ upper triangular matrix and the entries of the $(p-n)\times n$ matrix $\R_2$ are all zeros. Thus, proceeding in the same way as with the SVD decomposition,
\begin{eqnarray}\label{eqn:sigma}
\si_{n}(u) &=& \left(\X'\X+\frac{u}{1-u}\I_p\right)^{-1} \\
&=& \left(\Q\R\R'\Q'+\frac{u}{1-u}\Q\Q'\right)^{-1} \nonumber \\
&=& \Q\left(\R\R'+\frac{u}{1-u}\I_{p}\right)^{-1}\Q' \nonumber \\
&=& \Q\begin{bmatrix} \left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1} & \boldsymbol{0}_{n,p-n} \\ \boldsymbol{0}_{p-n,n} & \frac{1-u}{u}\I_{p-n} \end{bmatrix}\Q'. \nonumber
\end{eqnarray}
In the same way,
\begin{eqnarray}\label{eqn:beta}
\hat{\B}(u) &=& \si_n(u)\X'\y \\
&=& \Q\left(\R\R'+\frac{u}{1-u}\I_p\right)^{-1}\Q'\Q\R\y \nonumber \\
&=& \Q\left(\R\R'+\frac{u}{1-u}\I_p\right)^{-1}\R\y \nonumber \\
&=& \Q_1\left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1}\R_1\y, \nonumber
\end{eqnarray}
and
\begin{eqnarray}
\hat{\y}(u) &=& \X\hat{\B}(u)\nonumber \\
&=& \R'\Q'\Q\left(\R\R'+\frac{u}{1-u}\I_p\right)^{-1}\R\y \\
&=& \R_1'\left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1}\R_1\y. \nonumber
\end{eqnarray}
Hence, by plugging (\ref{eqn:sigma}) and (\ref{eqn:beta}) into (\ref{eq:posterior1}), another way to calculate the marginal posterior of $u$ is obtained; that is,
\begin{eqnarray}
\pi(u \,|\, \y) &\propto& u^{\frac{p_0+p}{2}-1}(1-u)^{\frac{n_0+n}{2}-1} \left| (1-u)\R_1\R_1' + u\I_n \right|^{-1/2}\nonumber \\ & & \times[(1-u)(SSE(u)+n_0s_0^2)]^{-\frac{n+n_0+p_0}{2}}\left[1+\frac{u}{1-u}\frac{\hat{\boldsymbol{\beta}}'(u)\hat{\boldsymbol{\beta}}(u)+p_0d_0^2}{SSE(u)+n_0s_0^2}\right]^{-\frac{n+n_0+p_0}{2}}
\end{eqnarray}
when $n > p$; when $p > n$, the factor $u^{\frac{p+p_0}{2}-1}$ is replaced by $u^{\frac{n+p_0}{2}-1}$.
Note that the covariance is
\begin{eqnarray*}
\text{Cov}\left(\hat{\y}(u),\y\,|\, u\right) &=&  \sigma^2\R_1'\left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1}\R_1.
\end{eqnarray*}
and the variance
\begin{eqnarray*}
\text{Var}\left(\hat{\y}(u)\,|\, u\right) &=&  \sigma^2\R_1'\left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1}\R_1\left[\R_1'\left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1}\R_1\right]'.
\end{eqnarray*}
Therefore it is possible to calculate the correlation as in equation (\ref{eq:cor}). \\
In the same way the effective degrees of freedom are
\begin{eqnarray}
edf &=& \text{E}\, \left[\left. \text{tr}\left(\R_1'\left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1}\R_1\right)\right | \y\right] \nonumber \\
&=& \text{E}\, \left[\left. \text{tr}\left(\R_1\R_1'\left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1}\right)\right | \y\right] \nonumber \\
&=& \int_0^1  \left[\text{tr}\left(\R_1\R_1'\left(\R_1\R_1'+\frac{u}{1-u}\I_n\right)^{-1}\right)\right] \,\pi(u\,|\, \y)\,\text{d}u.
\end{eqnarray}
Therefore the effective degrees of freedom ($edf$) can be approximated as
\begin{eqnarray}
\widehat{edf} &=&  \left[\text{tr}\left(\R_1\R_1'\left(\R_1\R_1'+\frac{\hat{u}}{1-\hat{u}}\I_n\right)^{-1}\right)\right],
\end{eqnarray}
where $\hat{u} = \arg\max_u\,\pi(u\,|\, \y)$.

Since manipulating high dimensional inverse matrices is complicated and expensive, when the QR method is used the posterior mode of $u$ is employed to obtain the quantities described above. That is, instead of integrating over a grid of values of $u$, only the posterior mode of $u$ is used.
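The QR-based computation can be sketched in \proglang{R} as follows for $p>n$ and a single value of $u$ (e.g., the posterior mode); the helper below is illustrative and not the \pkg{HDBRR} internals:
\begin{verbatim}
## QR-based quantities at a given u in (0,1), assuming p > n (sketch).
qrquant <- function(X, y, u) {
  QR <- qr(t(X))                                  # X' = Q1 R1 (economy form)
  R1 <- qr.R(QR)                                  # n x n upper triangular
  Q1 <- qr.Q(QR)                                  # p x n
  A  <- tcrossprod(R1) + (u / (1 - u)) * diag(nrow(R1))  # R1 R1' + (u/(1-u)) I_n
  z  <- solve(A, R1 %*% y)
  list(betahat = Q1 %*% z,                        # Q1 (R1 R1' + (u/(1-u))I)^{-1} R1 y
       yhat    = crossprod(R1, z),                # R1'(R1 R1' + (u/(1-u))I)^{-1} R1 y
       edf     = sum(diag(solve(A, tcrossprod(R1)))))
}
\end{verbatim}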


\newpage

\section{Results}

An \proglang{R} package named \pkg{HDBRR} was built to implement the methodology described in the previous sections. This package is available from CRAN at \url{https://CRAN.R-project.org/package=HDBRR}.
The main function in the package is also named \code{HDBRR} and its arguments with their default values are given in Box 1a.
\begin{GrayBox}
\small
\textbf{Box 1a: List of arguments of the HDBRR function}

\begin{verbatim}
HDBRR(y, X, n0 = 5, p0 = 5, s20 = NULL, d20 = NULL, h = 0.5,
    intercept = TRUE, vpapp = TRUE, npts = NULL, c = NULL,
    corpred = NULL, method = c("svd","qr"), bigmat = TRUE, ncores = 2)
\end{verbatim}
\end{GrayBox}
\textbf{Inputs}\\\\ Box 1a displays a list of the main arguments of the \textbf{HDBRR} function implemented in the \pkg{HDBRR} package; a short description follows:
\begin{itemize}
\item  {\tt y, X} \, are the data vector, generally referred to as the \textit{response variable} vector ({\tt NA}'s allowed), and the design matrix of dimension $n \times p$, respectively.
\item  {\tt n0, p0} \, are the numerators of the shape parameters {\tt n0/2} and {\tt p0/2} of the Inverse Gamma priors assigned to the residual variance and to the variance of the regression coefficients, respectively. The default value for both {\tt n0} and {\tt p0} is {\tt 5}.
 \item  {\tt s20, d20, h} \, {\tt s20} and {\tt d20} enter the scale parameters {\tt (n0s20)/2} and {\tt (p0d20)/2} of the Inverse Gamma priors assigned to the residual variance and to the variance of the regression coefficients, respectively. By default {\tt s20} and {\tt d20} are {\tt NULL}, which means that these parameters are set internally at runtime. {\tt h} is a dimensionless shrinkage coefficient, {\tt 0}$<${\tt h}$<${\tt 1}: if {\tt h} $\to$ {\tt 0} there is greater shrinkage, that is, $\beta$ $\to$ {\tt 0}; if {\tt h} $\to$ {\tt 1} there is less shrinkage.
\item  {\tt intercept} \, Logical argument. Default value  is {\tt TRUE}. A model with intercept is fitted.
\item  {\tt vpapp} \, Logical argument. Default value is {\tt TRUE}. Computes an approximation of the predictive variance.
\item  {\tt npts, c} \, are the number (integer) of points used for the numerical evaluation of the density of $u$ and the ratio of Gaussian densities (Spike/Slab) in the prior mixture used for variable selection, respectively. The default value for {\tt npts} is {\tt 200}.
\item  {\tt corpred} \, method to compute the correlation between pairs of predicted and observed values. There are two available methods, Empirical Bayes ({\tt "eb"}) and Bayesian ({\tt "b"}). The default value for {\tt corpred} is {\tt NULL}; in that case the output values {\tt corr} and {\tt edf} will be {\tt NULL}.
\item  {\tt method} \, Options for the posterior computation. There are two methods available: {\tt "qr"} decomposition of {\tt X*t(X)} and {\tt "svd"} decomposition of matrix {\tt X}. The default value for the method is SVD decomposition.
\item  {\tt bigmat, ncores} \, whether to use the \pkg{bigstatsr} package and the number of cores used for computation, respectively. The default value for {\tt ncores} is {\tt 2}. The number of cores can be detected with \code{detectCores()}, and multiple cores may be used on macOS and Linux systems.
\end{itemize}
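As a quick illustration, a minimal call with simulated data and the default settings might look as follows (a sketch using only the arguments and return values documented above):
\begin{verbatim}
## Minimal simulated-data example using the documented defaults (sketch).
library(HDBRR)
set.seed(3)
n <- 100; p <- 1000
X <- matrix(rnorm(n * p), n, p)
y <- as.vector(X[, 1:10] %*% rnorm(10) + rnorm(n))
fit <- HDBRR(y, X)        # n0 = p0 = 5, h = 0.5, method = "svd", bigmat = TRUE
head(fit$betahat)         # posterior means of the regression coefficients
\end{verbatim}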

The \textbf{values returned} by the \textbf{HDBRR} function are listed in Table \ref{tab:HDBRRreturn}.

\begin{table}[htbp!]
\centering
\begin{tabular}{  m{3cm}  m{11cm}  }
\hline \textbf{Value} &  \\ \hline  \hline
\$betahat $\left(\widehat{\B}\right)$  & (numeric, p) Vector with the beta estimates.  \\ \hline
\$yhat $\left(\widehat{\y}\right)$  &  (numeric, n) Vector with the y's estimates.\\ \hline
\$sdyhat & (numeric, n) Vector  with the standard deviations of the predicted values. \\ \hline
\$sdpred &   (numeric, n) Vector  with the standard deviations of the predictions. \\ \hline
\$varb &  (numeric, p) Vector with the beta's variance. \\ \hline
\$sigsqhat $\left(\widehat{\sigma}^2\right)$& (numeric)   estimated residual variance. The ``Empirical Bayes''  method is used. \\ \hline
\$sigbsqhat $\left(\widehat{\sigma}_{\beta}^2\right)$&  (numeric)  estimated variance of beta. The ``Empirical Bayes'' method is used.
 \\ \hline
\$u &  (numeric, npts) Vector with the values of $u$.  \\ \hline
\$postu $\pi(u\,|\,\y)$ &  (numeric, npts) Vector with the values of the $u$ posterior.\\ \hline
\$uhat $\left(\hat{u}\right)$ &  (numeric) estimated $u$.% In previous sections this value was mentioned, this is $\hat{u} = \text{max arg}\,\pi(u\,|\, \y)$
\\ \hline
\$umode  &  (numeric) posterior mode of u.\\ \hline
\$whichNa &   (integer) number of NA's in $\y$.   \\ \hline
\$phat $\left(\widehat{\boldsymbol{p}}\right)$ &  (numeric, p) Vector with the selection probability of each covariate.  \\ \hline
\$delta $(\delta)$  &   (numeric) Value used in the variable selection. \\ \hline
\$edf  &  (numeric) Value of the effective degrees of freedom for regression.  \\ \hline
\$corr &   (numeric, n) Vector of the correlation between $\hat y_i$ and $y_i$.   \\ \hline
\end{tabular}
\vskip 5mm
\caption{Values returned by the HDBRR function implemented in the \proglang{HDBRR} package with \proglang{R} software.}
\label{tab:HDBRRreturn}
\end{table}

The \textbf{matop} function is also implemented in the \pkg{HDBRR} package; it computes the SVD or QR decomposition of a matrix $\X$ (especially useful if $\X$ is high dimensional) using the \pkg{bigstatsr} package. The function with its arguments is:
\begin{GrayBox}
\small
\textbf{Box 1b: List of arguments of the matop function}
\begin{verbatim}
matop(y, X, method = c("svd", "qr"), bigmat = TRUE)
\end{verbatim}
\end{GrayBox}
The arguments {\tt y, X, method} and {\tt bigmat} of the \textbf{matop} function are the same as those of the \textbf{HDBRR} function. The \textbf{values returned} by the \textbf{matop} function are listed in Table \ref{tab:matopreturn}.
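For example, assuming {\tt y} and {\tt X} are already in the workspace, the decomposition can be computed once and inspected as follows (a sketch using the documented arguments and return values):
\begin{verbatim}
## Compute the SVD of X with matop and inspect some returned components (sketch).
ops <- matop(y, X, method = "svd", bigmat = TRUE)
str(ops$D)        # singular values of X
c(ops$n, ops$p)   # dimensions of X
\end{verbatim}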

\begin{table}[htbp!]
\centering
\begin{tabular}{  m{3.5cm}  m{10cm}  }
\multicolumn{2}{c}{(a) SVD}  \\
\hline \textbf{Value} &  \\ \hline  \hline
\$y  & (numeric, n)  Data vector. NAs allowed.  \\ \hline
\$X   & Design matrix of dimension $n \times p$. \\ \hline
\$D & Vector containing the singular values of $\X$ of length $\min(n,p)$. \\ \hline
\$L & A matrix whose columns contain the left singular vectors of $\X$. \\ \hline
\$R & A matrix whose columns contain the right singular vectors of $\X$.  \\ \hline
\$ev & A vector containing the square of D. \\ \hline
\$Ly & The cross-product between the matrix L and vector y. \\ \hline
\$n & Number of rows of X.\\ \hline
\$p  & Number of columns of X.\\ \hline
\end{tabular}
\vskip 5mm

\vskip 5mm
\begin{tabular}{  m{3.5cm}  m{10cm}  }
\multicolumn{2}{c}{(b) QR}\\
\hline \textbf{Value} &  \\ \hline  \hline
\$y  & (numeric, n)  Data vector. NAs allowed.  \\ \hline
\$X   &  Design matrix of dimension $n \times p$. \\ \hline
\$R & An upper triangular matrix of dimension $p \times p$.\\ \hline
\$n & Number of rows of X.\\ \hline
\$p  & Number of columns of X.\\ \hline
\$Q  & A matrix of dimension $n \times p$.\\ \hline
\end{tabular}
\vskip 5mm
\caption{Values returned by the matop function  when the (a) SVD  and  (b) QR decompositions are used.}
\label{tab:matopreturn}
\end{table}

\textbf{Summary, plot and predict functions}\\\\
For variable selection and visualization of results for objects of class ``HDBRR'', the package contains the following functions.

\begin{GrayBox}
\small
\textbf{Box 1c: List of arguments of the summary, plot and predict functions}

\begin{verbatim}
summary(object, all.coef = FALSE, crit = log(4), ...)

plot(x, crit = log(4), ...)

predict(object,  ...)
\end{verbatim}
\end{GrayBox}

The \textbf{inputs} for these functions are:
\begin{itemize}
\item {\tt object} \, An HDBRR object generated by a call to HDBRR.
\item {\tt all.coef} \, Logical value. If {\tt all.coef = TRUE}, estimations for all the regression parameters are returned; otherwise,  only for those whose {\tt 2*log(bayes factor)>crit}.
\item {\tt crit} \, Numerical value. The lower bound of the log Bayes factor in favor to include a variable in the model. The default value for {\tt crit} is {\tt log(4)}.
\item {\tt ...} \, Additional arguments to be passed to or from other methods.
\item {\tt x} \, An HDBRR object, typically generated by a call to HDBRR (for the {\tt print.HDBRR} and {\tt plot.HDBRR} functions) or an object of class {\tt summary.HDBRR} (for the {\tt print.summary.HDBRR} function).
\end{itemize}

The \textbf{values returned} by the \textbf{summary} function are a list of estimated coefficients ({\tt Estimate}), their standard deviations ({\tt Std. dev}), the signal-to-noise ratio ({\tt SNR}) and twice the conditional log Bayes factor in favor of including each covariate ({\tt 2ln(BF)}). When {\tt all.coef = TRUE}, estimates for all ridge regression parameters are returned, otherwise only for those whose {\tt 2ln(BF)>crit}. The default value for {\tt crit} is {\tt log(4)}. The interpretation of {\tt 2ln(BF)} follows the ``Kass-Raftery'' scale \citep{kassbayes95}, shown in Table \ref{tab:bayesfactor}.

\begin{table}[htbp!]
\centering
\begin{tabular}{  ccr }
\hline
$\boldsymbol{2 \log (\textbf{Bayes Factor})}$ & \textbf{Bayes Factor} & \textbf{Evidence against $\boldsymbol{H_0}$} \\
\hline
($-\infty$,0) & $[0,1)$ & Negative \\
$[0,2)$ & $[1,3)$ & Weak \\
$[2,6)$ & $[3,20)$ & Positive \\
$[6,10)$ & $[20,150)$ & Strong \\
$[10,+\infty)$ & $[150,+\infty)$ & Very Strong \\
\hline\\
\end{tabular}
\caption{Bayes Factor interpretation using ``Kass-Raftery'' scale.}
\label{tab:bayesfactor}
\end{table}

Also, the {\tt summary} function returns the estimate of the ridge parameter ($\hat{\lambda}$) and the \textbf{effective degrees of freedom} (edf). When the {\tt corpred} argument is {\tt NULL}, the {\tt edf} value will be {\tt NULL}. If the {\tt c} value is {\tt NULL}, then the Bayes factor will not be calculated. \\\\
The {\tt plot} function returns three graphs: 1) the plot of $\hat{\beta} \,\, \text{vs} \,\, \hat{p}$, 2) a plot with 4 subplots, \textbf{predicted values vs observed values}, \textbf{coefficients}, \textbf{Std. dev.} (for coefficients) and \textbf{SNR}, and 3) the plot of the \textbf{``Marginal Posterior of u''}. \\\\
The {\tt predict} function returns a vector with the predicted values $\left(\hat{\y}\right)$.
\\\\
\textbf{Dataset}\\\\
The \proglang{HDBRR} package comes with a dataset named \textbf{``phenowheat''}, which contains data from a balanced four-way multi-parental cross population from four elite durum wheat cultivars (Neodur, Claudio, Colosseo, and Rascon/Tarro) that were chosen as diverse contributors of different alleles of agronomic relevance.
The final population includes 338 recombinant inbred lines (RILs) and the final number of SNPs included in the linkage map was 7594, which were centered and standardized. Phenotypic evaluation of the population was performed during two growing seasons (2010-2011 and 2011-2012) in locations in the Po Valley: Cadriano in the 2010-2011 growing season (Cad11) and in the 2011-2012 growing season (Cad12); Poggio Renatico in the 2010-2011 growing season (Pr11) and Argelato in the 2011-2012 growing season (Arg12). The four traits included in this study were grain yield GY (Mg ha$^{-1}$), heading date HD (d), 1000-kernel weight GWT (g 1000 kernels$^{-1}$) and grain volume weight GVW (kg hL$^{-1}$). For a detailed description of the data set see \cite{Milner}.

\begin{GrayBox}
\small
\textbf{Box 2: Loading the phenowheat data set included in HDBRR}

\begin{verbatim}
library(HDBRR)
data(phenowheat)
\end{verbatim}
\end{GrayBox}

\newpage
\subsection{Application Example}
In order to illustrate the use of the \pkg{HDBRR} package with real data and to show the outputs obtained, the summary, predict and plot methods are briefly explained below.
\\\\
Consider the ``phenowheat'' data set included in \pkg{HDBRR}.
The data are loaded as in {\tt Box 2}. In order to obtain the $y$ vector, the \code{lmer} function is used. This allows us to find the best linear unbiased predictions of the HD trait after adjusting for the environment (see {\tt Box 3b part \#1\#}).

\begin{GrayBox}
\small
\textbf{Box 3b: Example with dataset ``phenowheat"}

\begin{verbatim}
#1# mod <- lmer(pheno$HD~pheno$env+(1|pheno$Line))
y <- unlist(ranef(mod))

#2# n <- length(y)
X <- scale(X, scale=F)
fitall <- HDBRR(y, X/sqrt(ncol(X)), intercept = FALSE, corpred = "eb",
	c = 100)

#3# fitall
summary(fitall, crit = 0)
plot(fitall, crit = 0)
predict(fitall)
\end{verbatim}
\end{GrayBox}

In the second part ({\tt \#2\#}) the fit is done by calling the \textbf{HDBRR} function, where the covariates are scaled to avoid extremely small regression coefficients. In the third part ({\tt \#3\#}) the results for {\tt fitall} are returned. If $p > 250$, then only 250 coefficients are displayed. \\\\ In {\tt Box 3c} the structure of the object returned by the \textbf{HDBRR} function is shown. In this case 250 coefficients were returned, but only 14 of them are printed in the Box.

\begin{GrayBox}
\small
\textbf{Box 3c: Structure of the object {\tt fitall} returned by HDBRR (after running the code in Box 3b part \#3\#)}

\begin{verbatim}
> fitall

Call:
HDBRR(y = y, X = X/sqrt(ncol(X)), intercept = F, c = 100, corpred = "eb")

Coefficients:
       X1        X2        X3        X4        X5        X6        X7
-0.069050  0.117750 -0.075723 -0.115526 -0.006621 -0.111297 -0.062642
       X8        X9       X10       X11       X12       X13       X14
-0.088641 -0.096674 -0.114091 -0.063993 -0.082015 -0.114091 -0.107933

 ... 7580 coefficients was omitted
\end{verbatim}
\end{GrayBox}

An object of class \textbf{HDBRR} is a list of 21 elements, including betahat ($\hat{\B}$), yhat ($\hat{\y}$), sigsqhat ($\hat{\sigma}^2$) and sigbsqhat ($\hat{\sigma}_{\beta}^2$). {\tt Box 3d} shows some of the elements contained in an object of class \textbf{HDBRR}.

\begin{GrayBox}
\small
\textbf{Box 3d: Structure of the object returned by HDBRR (after running the code in Box 3b)}

\begin{verbatim}
str(fitall)
List of 21
 $ betahat      : num [1:7594] -0.06905 0.11775 0.07572 ...
 $ yhat         : num [1:338] -0.394 -0.52 2.876 ...
 $ sdyhat       : num [1:338] 0.643 0.648 0.619 ...
 $ sdpred       : num [1:338] 1.71 1.72 1.7 ...
 $ varb         : num [1:7594] 0.3219 0.8271 0.0511 ...
 $ sigsqhat     : num 2.52
 $ sigbsqhat    : num 6.88
 $ u            : num [1:200] 0.116 0.117 0.119 ...
 $ postu        : Named num [1:200] 2.22e-05 2.73e-05 ...
 ..- attr(*,"names") = chr [1:200]  "84.1344\%" ...
 $ uhat         : num 0.273
 $ umode        : num 0.261
 $ whichNa      : int(0)
 $ phat         : num [1:7594] 3.53e-04 1.02e-03 ...
 $ delta        : num 7.96
 $ edf          : num 140
 $ corr         : num [1:338] 0.804 0.818 0.793 ...
\end{verbatim}
\end{GrayBox}
The \textbf{summary} function was explained above. Here, the {\tt crit} argument was set equal to 0 because the probability of inclusion is very low (the default value for this argument is {\tt log(4)}).
\newpage
\begin{GrayBox}
\small
\textbf{Box 3e: Summary of the object returned by HDBRR (after running the code in Box 3b part \#3\#).}

\begin{verbatim}
> summary(fitall, crit = 0)

Call:
HDBRR(y = y, X = X/sqrt(ncol(X)), intercept = FALSE, c = 100,
    corpred = "eb")

Coefficients:
       Estimate  Std. dev       SNR   2ln(BF)
X1189 -3.477229 0.5949053 -5.845014 0.7370653
X1190 -3.411983 0.5887797 -5.795009 0.6232837
X1191 -3.437302 0.6060903 -5.671272 0.6674882
X1192 -3.578898 0.6424279 -5.570895 0.9133567
X1193 -3.595580 0.6084415 -5.909492 0.9422146
X1194 -3.548588 0.5963829 -5.950185 0.8609486
X1195 -3.581506 0.6196334 -5.780039 0.9178934
X1196 -3.376780 0.5930485 -5.693935 0.5616503
X1197 -3.444590 0.6027798 -5.714507 0.6802414
X1198 -3.443699 0.6013025 -5.727066 0.6786374
X1199 -3.395415 0.5896655 -5.758205 0.5942924
X1205 -3.623269 0.6607038 -5.483955 0.9899897
-----
Signif. codes: 10 '***' 6 '**' 2 '*' 0 ' '

Ridge parameter: 0.3754
Effective degrees of freedom: 140.3917
\end{verbatim}
\end{GrayBox}

Note that the estimate of the ridge parameter returned is $\hat{\lambda}= 0.3754$. The estimated effective degrees of freedom ($edf$) are equal to 140.3917.  \\\\ Similarly, the {\tt plot} function was discussed above and for this example the results are shown in Figure \ref{fig:results1}. For variable selection, the value of the {\tt crit} argument in the {\tt plot} function was set to 0, so that in the estimated coefficients vs selection probability plot the variable names that appear are those for which $2\log(\text{BF}) > 0$ (see Figure \ref{fig:results1} (a)). \\\\ Figure \ref{fig:results1} (b) shows the marginal posterior of $u$.
\\\\ The {\tt predict} function applied to the object {\tt fitall} of class \textbf{HDBRR} returns the predictions of the model, which are the means of the posterior predictive distribution. See {\tt Box 3f}. \\
\newpage
\begin{GrayBox}
\small
\textbf{Box 3f: Predict of the object returned by HDBRR (after running the code in Box 3b part \#3\#).}

\begin{verbatim}
> predict(fitall)

  [1] -0.39403 -0.52017  2.87568  1.51151  0.11663  0.70171  0.12358  ...
 [10] -2.15284  0.62729 -2.25787  2.89244  1.41203  0.50038 -0.44047  ...
 [19]  1.90388 -2.76213 -1.91036  2.99180  3.89731  1.10541  1.47814 ...
 [28]  0.38303 -1.05772 -0.47791 -0.91379 -1.00456 -0.85098 -1.25591  ...
\end{verbatim}
\end{GrayBox}

Finally, Figure \ref{fig:results1} (c) has 4 subplots: predicted values vs observed values, coefficients, Std. dev. (for coefficients) and SNR. The first subplot (predicted values vs observed values) includes the Spearman correlation, which in this case is 0.9196. \\\\ As mentioned before, the ability of MCMC to sample from modern-day high-dimensional posteriors has been limited by a widespread perception that these chains typically experience serious convergence problems. Figure \ref{fig:results1} (b) shows evidence of an advantage of HDBRR over MCMC: the posterior distribution is always calculated.
\newpage
\begin{figure}[htbp!]
\centering
\subfloat[Estimated coefficients vs selection probability of $x_i$.]{\includegraphics[width=70mm]{2}}
\subfloat[ Marginal posterior of $u$.]{\includegraphics[width=70mm]{4}}\\
\subfloat[ $\y$ observed vs $\y$ predicted and their correlation, Coefficients, Std. dev. and SNR.]{\includegraphics[width=100mm]{3}}
\caption{Plots returned for the object {\tt fitall} produced by the \textbf{HDBRR} function.}
\label{fig:results1}
\end{figure}
\newpage

\section{Conclusions}

We propose a computational method to make Bayesian inference for high-dimensional ridge regression without using MCMC methods. Posterior means and variances of the regression parameters, the variance components and predictions for the conventional ridge regression model are obtained by using a convenient reparameterization. After an SVD or QR decomposition of the design matrix $\X$, the problem is reduced to numerical integration on the open interval $(0,1)$ to get rid of a nuisance parameter. The method is implemented in the \proglang{R} package \pkg{HDBRR}, which also allows us to perform variable selection and prediction without relying on MCMC methods. Cross validation results show that the proposed method has a performance, in computation time and accuracy, at least as good as that obtained by using MCMC methods.


%At the end, the bibliography
\bibliography{references}

\end{document}