In this document, we provide a short tutorial on how to use the \(\texttt{BvCategorical}\) R package. If you encounter any errors or strange behavior, please report the issue at https://github.com/ajmolstad/BvCategorical.
First, we install the \(\texttt{BvCategorical}\) R package from GitHub.
install.packages("devtools")
library(devtools)
devtools::install_github("ajmolstad/BvCategorical")
library(BvCategorical)
Next, we generate data from the bivariate categorical response regression model. Here, the responses are \(J\)-category and \(K\)-category multinomial random variables with a single trial per subject. Generalizing to multiple trials per subject is straightforward: users need only append additional rows to \(X\) and \(Y\) so that the data are formatted as if each trial came from a separate subject (see the sketch below). To keep computing times short, we use \(J=3\) and \(K=2\) in the following example.
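For instance, a subject observed for three trials would contribute three rows, one per trial. The sketch below (using a hypothetical \(\texttt{x.subject}\) and hand-picked class pairs, not package code) illustrates the formatting.
# hypothetical subject with p = 5 predictors and three trials whose
# observed response pairs (Y1, Y2) are (1, 2), (3, 2), and (1, 1)
x.subject <- rnorm(5)
X.expanded <- matrix(x.subject, nrow=3, ncol=5, byrow=TRUE) # one row per trial
Y.expanded <- list(c(1, 3, 1), # Y1 class for each trial
                   c(2, 2, 1)) # Y2 class for each trial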
Recall that under the bivariate multinomial logistic regression model, \(\boldsymbol{\beta} \in \mathbb{R}^{(p + 1) \times J \times K}\) and \[ P(Y_{1i} = j, Y_{2i} = k \mid x_i) = \frac{\exp\left(x_i'\boldsymbol{\beta}_{\cdot, j, k} \right)}{\sum_{s,t} \exp\left(x_i'\boldsymbol{\beta}_{\cdot, s, t} \right)}, \quad\quad(i,j,k) \in \left\{1, \dots, n\right\} \times \left\{1, \dots, J\right\} \times \left\{1,\dots, K\right\},\] where \(x_i = (1, x_{1i}, \dots, x_{pi})'\) for \(i=1, \dots, n\).
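To make the array indexing concrete, the following minimal sketch (with arbitrary illustrative values, not package code) evaluates this probability mass function for a single subject when \(J = 3\) and \(K = 2\).
# evaluate P(Y1 = j, Y2 = k | x) for one subject; values are illustrative
J <- 3; K <- 2; p <- 4
beta.array <- array(rnorm((p + 1)*J*K), dim=c(p + 1, J, K))
xi <- c(1, rnorm(p)) # leading one for the intercept
eta <- apply(beta.array, c(2, 3), function(b){sum(xi*b)}) # entries x_i' beta_{.,j,k}
probs <- exp(eta)/sum(exp(eta)) # J x K table of joint probabilities; sums to one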
In the following, we generate \(\boldsymbol{\beta}\) so that six variables modify the joint probability mass function, i.e., they uniquely modify \(P(Y_{1i} = j, Y_{2i} = k \mid x_i)\); four modify only the marginal probabilities \(P(Y_{1i} = j\mid x_i)\) and \(P(Y_{2i} = k \mid x_i)\); and the remaining variables do not affect the response.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] MASS_7.3-51.5 Matrix_1.2-18
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.4 lattice_0.20-40 digest_0.6.25 grid_3.6.1
## [5] magrittr_1.5 evaluate_0.14 rlang_0.4.5 stringi_1.4.6
## [9] rmarkdown_2.1 tools_3.6.1 stringr_1.4.0 xfun_0.12
## [13] yaml_2.2.1 compiler_3.6.1 htmltools_0.4.0 knitr_1.28
# --------------------------------
# Preliminaries
# --------------------------------
set.seed(10)
p <- 200
n <- 500
J <- 3
K <- 2
ntest <- 1e4
Results <- NULL
SigmaX <- matrix(0, nrow=p, ncol=p)
for(jj in 1:p){
  for(kk in 1:p){
    SigmaX[jj,kk] <- .5^abs(jj-kk)
  }
}
eo <- eigen(SigmaX)
SigmaXsqrt <- eo$vec%*%diag(eo$val^.5)%*%t(eo$vec)
# generated data -------------
X <- matrix(rnorm(n*p), nrow=n)%*%SigmaXsqrt
Xtest <- matrix(rnorm(ntest*p), nrow=ntest)%*%SigmaXsqrt
beta <- matrix(0, nrow=p, ncol=J*K)
temp <- sample(1:p, 10)
# joint probabilities -------------
for(j in 1:6){
  beta[temp[j],] <- runif(6, -3, 3)
}
# marginal probabilities only ---
for(j in 7:10){
  temp1 <- runif(4, -3, 3)
  beta[temp[j],] <- c(-temp1[4] + temp1[3] + temp1[1], temp1[1], temp1[2],
                      temp1[3], temp1[4], -temp1[1] + temp1[4] + temp1[2])
}
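The second loop constructs rows that are additively separable across the two responses, i.e., \(\boldsymbol{\beta}_{m,j,k} = \mu_{m,j} + \nu_{m,k}\): such variables shift the marginal probabilities but contribute nothing to the log odds ratios between the two responses. We can verify this structure directly (a check of our own, not package code); the \(k=2\) and \(k=1\) columns of any such row differ by a constant.
# reshape one marginal-only row into a J x K table; additive separability
# means its second column minus its first is constant across j
B <- matrix(beta[temp[10],], nrow=J)
B[,2] - B[,1] # all entries equal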
# linear predictors and joint class probabilities for the training data
lleval <- exp(crossprod(t(X), beta))
Pmat <- lleval/(rowSums(lleval)%*%t(rep(1, J*K)))
Ymat <- matrix(0, nrow=n, ncol=J*K)
Y <- list(NA)
Y[[1]] <- rep(0, n)
Y[[2]] <- rep(0, n)
# column c of Ymat encodes the class pair (refClasses[1,c], refClasses[2,c])
refClasses <- rbind(c(1:J, 1:J), c(rep(1, J), rep(2, J)))
for(k in 1:n){
  Ymat[k,] <- rmultinom(1, 1, Pmat[k,])
  Y[[1]][k] <- refClasses[1,which(Ymat[k,]==1)]
  Y[[2]][k] <- refClasses[2,which(Ymat[k,]==1)]
}
# test data, generated in the same way -------------
lleval <- exp(crossprod(t(Xtest), beta))
PmatTest <- lleval/(rowSums(lleval)%*%t(rep(1, J*K)))
YmatTest <- matrix(0, nrow=ntest, ncol=J*K)
Ytest <- list(NA)
Ytest[[1]] <- rep(0, ntest)
Ytest[[2]] <- rep(0, ntest)
for(k in 1:ntest){
  YmatTest[k,] <- rmultinom(1, 1, PmatTest[k,])
  Ytest[[1]][k] <- refClasses[1,which(YmatTest[k,]==1)]
  Ytest[[2]][k] <- refClasses[2,which(YmatTest[k,]==1)]
}
To fit the model, use the function \(\texttt{BvCat.cv}\). Its arguments can be seen in the call below: the predictor matrix \(\texttt{X}\), the response list \(\texttt{Y}\), the candidate tuning parameters, and options controlling cross-validation, standardization, convergence, and printed progress. For more on formatting \(\texttt{X}\) and \(\texttt{Y}\), we suggest following the example data generating procedure above.
modfit <- BvCat.cv(X, Y, ngamma = 10,
lambda.vec = 10^seq(-1, -4, length=5),
nfolds = 5, delta = .05, standardize = TRUE,
tol = 1e-10, quiet = FALSE, inner.quiet = TRUE)
## lambda = 0.1 ; gamma = 0.2558115 ; nonzero = 6
## lambda = 0.1 ; gamma = 0.1833839 ; nonzero = 12
## lambda = 0.1 ; gamma = 0.1314626 ; nonzero = 30
## lambda = 0.1 ; gamma = 0.09424177 ; nonzero = 48
## lambda = 0.1 ; gamma = 0.0675592 ; nonzero = 60
## lambda = 0.1 ; gamma = 0.04843125 ; nonzero = 66
## lambda = 0.1 ; gamma = 0.03471896 ; nonzero = 144
## lambda = 0.1 ; gamma = 0.02488902 ; nonzero = 312
## lambda = 0.1 ; gamma = 0.01784222 ; nonzero = 504
## lambda = 0.1 ; gamma = 0.01279058 ; nonzero = 714
## lambda = 0.01778279 ; gamma = 0.2558115 ; nonzero = 6
## lambda = 0.01778279 ; gamma = 0.1833839 ; nonzero = 18
## lambda = 0.01778279 ; gamma = 0.1314626 ; nonzero = 42
## lambda = 0.01778279 ; gamma = 0.09424177 ; nonzero = 48
## lambda = 0.01778279 ; gamma = 0.0675592 ; nonzero = 60
## lambda = 0.01778279 ; gamma = 0.04843125 ; nonzero = 66
## lambda = 0.01778279 ; gamma = 0.03471896 ; nonzero = 120
## lambda = 0.01778279 ; gamma = 0.02488902 ; nonzero = 240
## lambda = 0.01778279 ; gamma = 0.01784222 ; nonzero = 426
## lambda = 0.01778279 ; gamma = 0.01279058 ; nonzero = 588
## lambda = 0.003162278 ; gamma = 0.2558115 ; nonzero = 6
## lambda = 0.003162278 ; gamma = 0.1833839 ; nonzero = 24
## lambda = 0.003162278 ; gamma = 0.1314626 ; nonzero = 42
## lambda = 0.003162278 ; gamma = 0.09424177 ; nonzero = 54
## lambda = 0.003162278 ; gamma = 0.0675592 ; nonzero = 60
## lambda = 0.003162278 ; gamma = 0.04843125 ; nonzero = 66
## lambda = 0.003162278 ; gamma = 0.03471896 ; nonzero = 144
## lambda = 0.003162278 ; gamma = 0.02488902 ; nonzero = 294
## lambda = 0.003162278 ; gamma = 0.01784222 ; nonzero = 450
## lambda = 0.003162278 ; gamma = 0.01279058 ; nonzero = 618
## lambda = 0.0005623413 ; gamma = 0.2558115 ; nonzero = 6
## lambda = 0.0005623413 ; gamma = 0.1833839 ; nonzero = 24
## lambda = 0.0005623413 ; gamma = 0.1314626 ; nonzero = 42
## lambda = 0.0005623413 ; gamma = 0.09424177 ; nonzero = 54
## lambda = 0.0005623413 ; gamma = 0.0675592 ; nonzero = 60
## lambda = 0.0005623413 ; gamma = 0.04843125 ; nonzero = 66
## lambda = 0.0005623413 ; gamma = 0.03471896 ; nonzero = 204
## lambda = 0.0005623413 ; gamma = 0.02488902 ; nonzero = 366
## lambda = 0.0005623413 ; gamma = 0.01784222 ; nonzero = 534
## lambda = 0.0005623413 ; gamma = 0.01279058 ; nonzero = 708
## lambda = 1e-04 ; gamma = 0.2558115 ; nonzero = 6
## lambda = 1e-04 ; gamma = 0.1833839 ; nonzero = 24
## lambda = 1e-04 ; gamma = 0.1314626 ; nonzero = 42
## lambda = 1e-04 ; gamma = 0.09424177 ; nonzero = 54
## lambda = 1e-04 ; gamma = 0.0675592 ; nonzero = 60
## lambda = 1e-04 ; gamma = 0.04843125 ; nonzero = 72
## lambda = 1e-04 ; gamma = 0.03471896 ; nonzero = 210
## lambda = 1e-04 ; gamma = 0.02488902 ; nonzero = 378
## lambda = 1e-04 ; gamma = 0.01784222 ; nonzero = 552
## lambda = 1e-04 ; gamma = 0.01279058 ; nonzero = 726
## Through CV fold 1
## Through CV fold 2
## Through CV fold 3
## Through CV fold 4
## Through CV fold 5
Once we have fit the model, we can examine the output.
str(modfit)
## List of 15
## $ beta :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## .. ..@ i : int [1:9456] 0 201 402 603 804 1005 0 159 201 360 ...
## .. ..@ p : int [1:51] 0 6 18 48 96 156 222 366 678 1182 ...
## .. ..@ Dim : int [1:2] 1206 50
## .. ..@ Dimnames:List of 2
## .. .. ..$ : NULL
## .. .. ..$ : NULL
## .. ..@ x : num [1:9456] 0.0579 0.1679 -0.106 0.3572 -0.106 ...
## .. ..@ factors : list()
## $ J : int 3
## $ K : int 2
## $ p : int 200
## $ D : num [1:6, 1:3] 1 -1 0 -1 1 0 1 0 -1 -1 ...
## $ standardize : logi TRUE
## $ errs.joint : num [1:5, 1:10, 1:5] 0.769 0.769 0.769 0.769 0.769 ...
## $ deviance : num [1:5, 1:10, 1:5] 368 368 368 368 368 ...
## $ lambda.min.joint: num 0.00316
## $ gamma.min.joint : num 0.0249
## $ X.mean : num [1:200] -0.0315 -0.0502 -0.0639 0.0422 0.0317 ...
## $ X.sd : num [1:200] 1.02 1.008 0.967 0.971 1.039 ...
## $ cv.index :List of 5
## ..$ : int [1:104] 2 28 52 83 174 180 182 190 219 231 ...
## ..$ : int [1:100] 15 26 38 77 186 195 205 247 263 340 ...
## ..$ : int [1:100] 5 13 31 120 153 203 235 242 257 283 ...
## ..$ : int [1:98] 17 41 82 162 210 218 249 258 290 311 ...
## ..$ : int [1:98] 111 116 123 132 185 194 196 225 270 330 ...
## $ lambda.vec : num [1:5] 0.1 0.017783 0.003162 0.000562 0.0001
## $ gamma.vec : num [1:10] 0.2558 0.1834 0.1315 0.0942 0.0676 ...
## - attr(*, "class")= chr "BvCat"
The output contains the following (omitting components with obvious meaning): \(\texttt{errs.joint}\) and \(\texttt{deviance}\) store the held-out misclassification errors and deviances for each candidate \((\lambda, \gamma)\) pair and fold; \(\texttt{lambda.min.joint}\) and \(\texttt{gamma.min.joint}\) are the tuning parameter pair minimizing the average cross-validated misclassification rate; and \(\texttt{cv.index}\) gives the indices of the subjects held out in each fold.
With the fitted model, an object of class \(\texttt{BvCat}\), we can first examine the misclassification errors averaged over the folds:
library(lattice)
levelplot(t(apply(modfit$errs.joint, c(1,2), mean)),
          xlab=expression(paste(gamma)), ylab=expression(paste(lambda)),
          main="Cross-validated misclassification rate",
          col.regions=grey((100:0)/100))
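The minimizing pair is also stored in the fitted object as \(\texttt{lambda.min.joint}\) and \(\texttt{gamma.min.joint}\). Assuming the first dimension of \(\texttt{errs.joint}\) indexes \(\lambda\) and the second \(\gamma\) (the same convention used for \(\texttt{deviance}\) below), we can recover the pair directly:
errs.avg <- apply(modfit$errs.joint, c(1,2), mean)
errInds <- which(errs.avg == min(errs.avg), arr.ind=TRUE)
c(modfit$lambda.vec[errInds[1,1]], modfit$gamma.vec[errInds[1,2]]) # should match lambda.min.joint and gamma.min.joint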
Then, we can predict the classes for a new set of predictors, \(\texttt{Xtest}\), using the \(\texttt{BvCat.predict}\) function. By default, \(\texttt{BvCat.predict}\) uses the tuning parameter pair which had the minimum average misclassification rate in cross-validation.
outPred <- BvCat.predict(Xtest = Xtest, modfit, type="class") # prediction using tuning parameters which minimized classification error
We can also perform prediction using the fitted model for other tuning parameter pairs:
devInds <- which(apply(modfit$deviance, c(1,2), mean) ==
                 min(apply(modfit$deviance, c(1,2), mean)), arr.ind=TRUE)
outPred.dev <- BvCat.predict(Xtest = Xtest, modfit,
                             lambda = modfit$lambda.vec[devInds[1,1]],
                             gamma = modfit$gamma.vec[devInds[1,2]]) # prediction using tuning parameters which minimized the cross-validated deviance
# map each predicted pair (j, k) to its joint class index (k-1)*J + j
preds.mat <- apply(outPred$preds, 1, function(x){(x[1]+(J*(x[2]-1)))})
misclass.joint <- sum(apply(YmatTest, 1, which.max) != preds.mat)/length(Ytest[[1]])
cat(misclass.joint, "\n")
## 0.2487
preds.dev.mat <- apply(outPred.dev$preds, 1, function(x){(x[1]+(J*(x[2]-1)))})
misclass.joint.dev <- sum(apply(YmatTest, 1, which.max) != preds.dev.mat)/length(Ytest[[1]])
cat(misclass.joint.dev, "\n")
## 0.2558
Finally, we can extract the regression coefficients using \(\texttt{BvCat.coef}\). When we use the argument \(\texttt{type="matrix"}\), we get the matricized version of \(\hat{\boldsymbol{\beta}}\), denoted \(\hat\beta\), which is given by \[ \hat\beta_{m,f(j,k)} = \hat{\boldsymbol{\beta}}_{m,j,k}, \quad\quad (m,j,k) \in \left\{1, \dots, p+1\right\} \times \left\{1, \dots, J \right\} \times \left\{1, \dots, K\right\},\] where \(f(j,k) = (k-1)J + j\) for \((j,k) \in \left\{1, \dots, J\right\} \times \left\{1, \dots, K\right\}\). Note that the intercepts are returned separately as \(\texttt{b0}\).
outBeta <- BvCat.coef(modfit, type="matrix")
str(outBeta)
## List of 3
## $ b0 : num [1:6] 0.0862 0.1196 -0.1713 0.187 0.0102 ...
## $ beta : num [1:200, 1:6] 0.19 0 0 0 -1.11 ...
## $ classref: num [1:2, 1:6] 1 1 2 1 3 1 1 2 2 2 ...
Using the argument \(\texttt{type="array"}\) instead returns \(\hat{\boldsymbol{\beta}}\), the array-valued estimate.
outBetaArray <- BvCat.coef(modfit, type="array")
str(outBetaArray)
## List of 3
## $ b0 : num [1, 1:3, 1:2] 0.0862 0.1196 -0.1713 0.187 0.0102 ...
## $ beta : num [1:200, 1:3, 1:2] 0.19 0 0 0 -1.11 ...
## $ classref: num [1:2, 1:6] 1 1 2 1 3 1 1 2 2 2 ...
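As a quick sanity check (our own, using the index map \(f\) defined above), column \(f(j,k) = (k-1)J + j\) of the matricized estimate should match slice \((j,k)\) of the array-valued estimate:
# compare the two forms at, e.g., (j, k) = (2, 2)
j <- 2; k <- 2
all.equal(outBeta$beta[,(k - 1)*J + j], outBetaArray$beta[,j,k])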