Chapter 6 Modeling

Load data

# Created in 3-clean-finalize.Rmd
# Objects included: data, vars, var_df
# renv also includes a load() method, so we specify base:: here.
base::load("data/clean-finalize-imputed.RData")

6.1 Random forest

set.seed(1, "L'Ecuyer-CMRG")

# mlr wants covariates and outcome to be in the same dataframe.

# For classification RF needs Y to be a factor.
# We use the best mtry based on the CV.SL results from the final prediction library.
# Takes 1 second.
(rf_time = system.time({
  # Ranger uses all available threads by default, nice.
  y = as.factor(data[[vars$outcomes[1]]]) 
  rf = ranger::ranger(y ~ . ,
                      data = data[, vars$predictors],
                      num.threads = get_cores(),
                      # Need this option for OOB curve analysis.
                      keep.inbag = TRUE,
                      num.trees = 4000,
                      # Could also do importance = "impurity".
                      importance = "permutation",
                      # Set based on separate grid/random search.
                      mtry = 4L,
                      # Set based on separate grid/random search.
                      min.node.size = 5L)
}))

##    user  system elapsed 
##    2.19    0.12    0.90

save(rf, file = "data/model-rf.RData")

6.1.1 RF convergence plot

The results of this block are cached because they are slow to compute.

library(mlr)
library(OOBCurve)

oob_data = data[, c(vars$outcomes[1], vars$predictors), drop = FALSE]

# Outcome needs to be a factor.
oob_data[[vars$outcomes[1]]] = as.factor(data[[vars$outcomes[1]]])


task = makeClassifTask(data = oob_data, target = vars$outcomes[1])
# Current package has a bug such that multiple measures have to be specified.
# We aren't using the Brier score though.
# TODO: these results could be averaged over multiple random shufflings
# of the tree ordering. Would give a more accurate, smoother curve.
# This takes ~10 seconds.
system.time({
  results = OOBCurve(rf, measures = list(mlr::auc, mlr::brier), task = task,
                     data = oob_data)
})

##    user  system elapsed 
##   10.67    0.17   10.98

# Look at the OOB AUC with the maximum number of trees.
# 0.894
(rf_auc = results$auc[length(results$auc)])

## [1] 0.8972771

# Can zoom in to certain segments of the forest indexed by an ntree range.
tree_start = 3
#tree_start = 10
tree_end = length(results$auc)
x_span = seq(tree_start, tree_end)
y_span = results$auc[x_span]

ggplot(mapping = aes(x = x_span, y = y_span)) + geom_line() + theme_minimal() +
  coord_fixed(ratio = 3) + 
  scale_y_continuous(expand = c(0, 0), limits = c(0.5, 0.94)) +
  scale_x_log10(breaks = c(3, 10, 30, 100, 300, 1000, 3000),
                limits = c(3, 4000),
                minor_breaks = NULL) +
  labs(x = "Trees in the random forest", y = "Out of Bag AUC")

ggsave("visuals/rf-error-rate-by-trees.pdf",
       width = 7, height = 3)

6.2 Ensemble

6.2.1 Prep SL library

# Via R/sl-dbarts2.R
SL.dbarts = SL.dbarts2

# We aren't using this grid.
learner_bart =
  create.Learner("SL.dbarts",
                 # Turning off detailed_names because binary_offset has a negative value.
                 #detailed_names = FALSE,
                 detailed_names = TRUE,
                 params = list(nthread = getOption("cores")),
                 tune = list(ntree = c(1, 2, 5, 10, 20, 50, 100, 200, 500)))#,

screen_names = function(Y, X, names = names(X), ...)  {
    return(names(X) %in% names)
}


# TODO: add feature selection options (BART, lasso, RF)
# TODO: explore optimization of the meta-learner step.
xgb_tune = function(Y, X, newX, family, obsWeights, id, ...) {
  cat("Running xgb_tune\n")
  
  # Create tuning grid.
  grid = create.Learner("SL.xgboost_fast", detailed_names = TRUE, 
                        tune = list(
                          # 27 combos.
                          ntrees = c(100L, 300L, 1000L),
                          max_depth = c(1L, 3L, 6L),
                          shrinkage = c(0.01, 0.1, 0.3)))
  
  # Coarser/faster grid.
  grid2 = create.Learner("SL.xgboost_fast", detailed_names = TRUE, 
                        tune = list(
                          # 8 combos.
                          ntrees = c(250L, 1000L),
                          max_depth = c(2L, 4L),
                          shrinkage = c(0.05, 0.2)))
  
  # Run SuperLearner.
  # id argument is not being passed to avoid an error "stratified sampling with id not currently implemented"
  sl = SuperLearner(Y = Y, X = X, newX = newX, obsWeights = obsWeights, family = family,
                    SL.library = grid2$names,
                    cvControl = SuperLearner.CV.control(stratifyCV = TRUE,
                                                        # Set to 2 for tutorial.
                                                        V = 2L),
                                                        #V = 5L),
                    verbose = FALSE)
  
  cat("XGB tuned SL:\n")
  print(sl)
  
  # fit returns all objects needed for predict()
  fit = list(object = sl)
  
  # Declare class of fit for predict()
  class(fit) = 'SuperLearner'
  
  out = list(pred = sl$SL.predict, fit = fit)
  return(out)
}

rf_tune = function(Y, X, newX, family, obsWeights, id, ...) {
  cat("Running rf_tune\n")
  
  # Create tuning grid.
  grid = create.Learner("SL.ranger", detailed_names = TRUE, 
                        params = list(num.threads = get_cores(),
                                      # Set this based on the convergence analysis.
                                      num.trees = 200),
                        tune = list(
                          # 9 combos.
                          min.node.size = c(2L, 5L, 15L),
                          mtry = floor(c(0.5, 1, 2) * sqrt(ncol(X)))))
  
  # Run SuperLearner.
  # id argument is not being passed to avoid an error "stratified sampling with id not currently implemented"
  sl = SuperLearner(Y = Y, X = X, newX = newX, obsWeights = obsWeights, family = family,
                    #SL.library = c("SL.mean", grid$names),
                    SL.library = grid$names,
                    cvControl = SuperLearner.CV.control(stratifyCV = TRUE,
                                                        # Set to 2 for tutorial.
                                                        V = 2L),
                                                        #V = 5L),
                    verbose = FALSE)
  
  cat("RF tuned SL:\n")
  print(sl)
  
  # fit returns all objects needed for predict()
  fit = list(object = sl)
  
  # Declare class of fit for predict()
  class(fit) = 'SuperLearner'
  
  out = list(pred = sl$SL.predict, fit = fit)
  return(out)
}

glmnet_tune = function(Y, X, newX, family, obsWeights, id, ...) {
  cat("Running glmnet_tune\n")
  
  # Create tuning grid.
  grid = create.Learner("SL.glmnet_fast", detailed_names = TRUE, 
                        # 4 combos
                        tune = list(alpha = c(0.05, 0.3, 0.7, 0.95)))
  
  # Run SuperLearner.
  # id argument is not being passed to avoid an error "stratified sampling with id not currently implemented"
  sl = SuperLearner(Y = Y, X = X, newX = newX, obsWeights = obsWeights, family = family,
                    SL.library = grid$names,
                    cvControl = SuperLearner.CV.control(stratifyCV = TRUE,
                                                        # Set to 2 for tutorial purposes.
                                                        V = 2L), 
                                                        #V = 5L),
                    verbose = FALSE)
  
  cat("Glmnet tuned SL:\n")
  print(sl)
  
  # TODO: may need to save the learners, put them in the global environment, or otherwise
  # handle in a custom predict() method.
  
  # fit returns all objects needed for predict()
  fit = list(object = sl)
  
  # Declare class of fit for predict()
  class(fit) = 'SuperLearner'
  
  out = list(pred = sl$SL.predict, fit = fit)
  return(out)
}

rpart_tune = function(Y, X, newX, family, obsWeights, id, ...) {
  cat("Running rpart_tune\n")
  
  # Create tuning grid.
  grid = create.Learner("SL.rpart2", detailed_names = TRUE, 
                        tune = list(cp = c(0, 0.01),
                                    minsplit = c(10, 20, 80),
                                    maxdepth = c(5, 15)))
  
  # Run SuperLearner.
  # id argument is not being passed to avoid an error "stratified sampling with id not currently implemented"
  sl = SuperLearner(Y = Y, X = X, newX = newX, obsWeights = obsWeights, family = family,
                    #SL.library = c("SL.mean", grid$names),
                    SL.library = grid$names,
                    cvControl = SuperLearner.CV.control(stratifyCV = TRUE, V = 5L),
                    verbose = FALSE)
  
  cat("Rpart tuned SL:\n")
  print(sl)
  
  # TODO: may need to save the learners, put them in the global environment, or otherwise
  # handle in a custom predict() method.
  
  # fit returns all objects needed for predict()
  fit = list(object = sl)
  
  # Declare class of fit for predict()
  class(fit) = 'SuperLearner'
  
  out = list(pred = sl$SL.predict, fit = fit)
  return(out)
}

learner_mgcv =
  create.Learner("SL.mgcv2",
                 detailed_names = TRUE,
                 params = list(nthreads = min(10L, get_cores()),
                               continuous_values = 10L))


(sl_lib = c(list("SL.mean",
                 "SL.lm2",
                 "SL.glm2",
                 "SL.glmnet_fast"),
#           stratified_lib$names, 
#           rpart_pruned$names,
           learner_mgcv$names,
          list(
            "rpart_tune",
            "SL.ranger_200",
            "rf_tune",
            "SL.dbarts_200",
            "xgb_tune",
            "SL.xgboost_fast")))

## [[1]]
## [1] "SL.mean"
## 
## [[2]]
## [1] "SL.lm2"
## 
## [[3]]
## [1] "SL.glm2"
## 
## [[4]]
## [1] "SL.glmnet_fast"
## 
## [[5]]
## [1] "SL.mgcv2_1"
## 
## [[6]]
## [1] "rpart_tune"
## 
## [[7]]
## [1] "SL.ranger_200"
## 
## [[8]]
## [1] "rf_tune"
## 
## [[9]]
## [1] "SL.dbarts_200"
## 
## [[10]]
## [1] "xgb_tune"
## 
## [[11]]
## [1] "SL.xgboost_fast"

6.2.2 Estimate SuperLearner

The results of this block are cached because they are slow to compute.

set.seed(1, "L'Ecuyer-CMRG")

(sl = SuperLearner(Y = data[[vars$outcomes[1]]],
                   X = data[, vars$predictors],
                   family = binomial(), SL.library = sl_lib,
                   cvControl = SuperLearner.CV.control(stratifyCV = TRUE,
                                                      #V = 10L),
                                                      V = 2L),
                  verbose = TRUE))

## Number of covariates in All is: 33

## CV SL.mean_All

## Warning in predict.lm(fit, newdata = newX, type = "response"): prediction from a
## rank-deficient fit may be misleading

## CV SL.lm2_All

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## CV SL.glm2_All

## Warning: executing %dopar% sequentially: no parallel backend registered

## CV SL.glmnet_fast_All

## CV SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk      Coef
## SL.rpart2_0_10_5_All     0.2027929 0.0000000
## SL.rpart2_0.01_10_5_All  0.2027929 0.0000000
## SL.rpart2_0_20_5_All     0.1807607 0.0000000
## SL.rpart2_0.01_20_5_All  0.1771161 0.0000000
## SL.rpart2_0_80_5_All     0.1648992 0.7542944
## SL.rpart2_0.01_80_5_All  0.1648992 0.0000000
## SL.rpart2_0_10_15_All    0.2102988 0.0000000
## SL.rpart2_0.01_10_15_All 0.1988584 0.2457056
## SL.rpart2_0_20_15_All    0.1807607 0.0000000
## SL.rpart2_0.01_20_15_All 0.1771161 0.0000000
## SL.rpart2_0_80_15_All    0.1648992 0.0000000
## SL.rpart2_0.01_80_15_All 0.1648992 0.0000000

## CV rpart_tune_All

## CV SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk       Coef
## SL.ranger_2_2_All   0.1491320 0.00000000
## SL.ranger_5_2_All   0.1502432 0.00000000
## SL.ranger_15_2_All  0.1526904 0.00000000
## SL.ranger_2_5_All   0.1362120 0.92060845
## SL.ranger_5_5_All   0.1377101 0.07939155
## SL.ranger_15_5_All  0.1426574 0.00000000
## SL.ranger_2_11_All  0.1417842 0.00000000
## SL.ranger_5_11_All  0.1432871 0.00000000
## SL.ranger_15_11_All 0.1438529 0.00000000

## CV rf_tune_All

## CV SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk Coef
## SL.xgboost_fast_250_2_0.05_All  0.2481858    1
## SL.xgboost_fast_1000_2_0.05_All 0.2481858    0
## SL.xgboost_fast_250_4_0.05_All  0.2481858    0
## SL.xgboost_fast_1000_4_0.05_All 0.2481858    0
## SL.xgboost_fast_250_2_0.2_All   0.2481858    0
## SL.xgboost_fast_1000_2_0.2_All  0.2481858    0
## SL.xgboost_fast_250_4_0.2_All   0.2481858    0
## SL.xgboost_fast_1000_4_0.2_All  0.2481858    0

## CV xgb_tune_All

## CV SL.xgboost_fast_All

## Number of covariates in All is: 33

## CV SL.mean_All

## Warning in predict.lm(fit, newdata = newX, type = "response"): prediction from a
## rank-deficient fit may be misleading

## CV SL.lm2_All

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## CV SL.glm2_All

## CV SL.glmnet_fast_All

## CV SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk      Coef
## SL.rpart2_0_10_5_All     0.2500382 0.0000000
## SL.rpart2_0.01_10_5_All  0.2500382 0.0000000
## SL.rpart2_0_20_5_All     0.1975844 0.7256534
## SL.rpart2_0.01_20_5_All  0.2023409 0.0000000
## SL.rpart2_0_80_5_All     0.2154568 0.2743466
## SL.rpart2_0.01_80_5_All  0.2154568 0.0000000
## SL.rpart2_0_10_15_All    0.2599618 0.0000000
## SL.rpart2_0.01_10_15_All 0.2599618 0.0000000
## SL.rpart2_0_20_15_All    0.2001702 0.0000000
## SL.rpart2_0.01_20_15_All 0.2049266 0.0000000
## SL.rpart2_0_80_15_All    0.2154568 0.0000000
## SL.rpart2_0.01_80_15_All 0.2154568 0.0000000

## CV rpart_tune_All

## CV SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk      Coef
## SL.ranger_2_2_All   0.1674269 0.0000000
## SL.ranger_5_2_All   0.1694906 0.0000000
## SL.ranger_15_2_All  0.1688027 0.1498788
## SL.ranger_2_5_All   0.1593110 0.8501212
## SL.ranger_5_5_All   0.1626210 0.0000000
## SL.ranger_15_5_All  0.1642748 0.0000000
## SL.ranger_2_11_All  0.1739733 0.0000000
## SL.ranger_5_11_All  0.1753413 0.0000000
## SL.ranger_15_11_All 0.1833558 0.0000000

## CV rf_tune_All

## CV SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk Coef
## SL.xgboost_fast_250_2_0.05_All  0.2478803    1
## SL.xgboost_fast_1000_2_0.05_All 0.2478803    0
## SL.xgboost_fast_250_4_0.05_All  0.2478803    0
## SL.xgboost_fast_1000_4_0.05_All 0.2478803    0
## SL.xgboost_fast_250_2_0.2_All   0.2478803    0
## SL.xgboost_fast_1000_2_0.2_All  0.2478803    0
## SL.xgboost_fast_250_4_0.2_All   0.2478803    0
## SL.xgboost_fast_1000_4_0.2_All  0.2478803    0

## CV xgb_tune_All

## CV SL.xgboost_fast_All

## Non-Negative least squares convergence: TRUE

## full SL.mean_All

## full SL.lm2_All

## full SL.glm2_All

## full SL.glmnet_fast_All

## full SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk      Coef
## SL.rpart2_0_10_5_All     0.2137125 0.0000000
## SL.rpart2_0.01_10_5_All  0.2138648 0.0000000
## SL.rpart2_0_20_5_All     0.1967678 0.0000000
## SL.rpart2_0.01_20_5_All  0.2045359 0.0000000
## SL.rpart2_0_80_5_All     0.1846070 0.5971948
## SL.rpart2_0.01_80_5_All  0.1846070 0.0000000
## SL.rpart2_0_10_15_All    0.2196851 0.2349721
## SL.rpart2_0.01_10_15_All 0.2132001 0.0000000
## SL.rpart2_0_20_15_All    0.1952689 0.1678331
## SL.rpart2_0.01_20_15_All 0.2030371 0.0000000
## SL.rpart2_0_80_15_All    0.1846070 0.0000000
## SL.rpart2_0.01_80_15_All 0.1846070 0.0000000

## full rpart_tune_All

## full SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk Coef
## SL.ranger_2_2_All   0.1459784    0
## SL.ranger_5_2_All   0.1464061    0
## SL.ranger_15_2_All  0.1484737    0
## SL.ranger_2_5_All   0.1400146    0
## SL.ranger_5_5_All   0.1378980    1
## SL.ranger_15_5_All  0.1402499    0
## SL.ranger_2_11_All  0.1433777    0
## SL.ranger_5_11_All  0.1423325    0
## SL.ranger_15_11_All 0.1447646    0

## full rf_tune_All

## full SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk      Coef
## SL.xgboost_fast_250_2_0.05_All  0.1622281 0.2970516
## SL.xgboost_fast_1000_2_0.05_All 0.1622301 0.7029484
## SL.xgboost_fast_250_4_0.05_All  0.1622281 0.0000000
## SL.xgboost_fast_1000_4_0.05_All 0.1622301 0.0000000
## SL.xgboost_fast_250_2_0.2_All   0.1623268 0.0000000
## SL.xgboost_fast_1000_2_0.2_All  0.1627611 0.0000000
## SL.xgboost_fast_250_4_0.2_All   0.1623268 0.0000000
## SL.xgboost_fast_1000_4_0.2_All  0.1627611 0.0000000

## full xgb_tune_All

## full SL.xgboost_fast_All

## 
## Call:  
## SuperLearner(Y = data[[vars$outcomes[1]]], X = data[, vars$predictors], family = binomial(),  
##     SL.library = sl_lib, verbose = TRUE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L)) 
## 
## 
##                          Risk      Coef
## SL.mean_All         0.2480217 0.0000000
## SL.lm2_All          0.1138972 0.3694042
## SL.glm2_All         0.1592580 0.0000000
## SL.glmnet_fast_All  0.1126716 0.3544172
## SL.mgcv2_1_All      0.1646661 0.0000000
## rpart_tune_All      0.1768881 0.0000000
## SL.ranger_200_All   0.1338529 0.0000000
## rf_tune_All         0.1306389 0.0000000
## SL.dbarts_200_All   0.1171052 0.2761786
## xgb_tune_All        0.1489688 0.0000000
## SL.xgboost_fast_All 0.1517064 0.0000000

sl

## 
## Call:  
## SuperLearner(Y = data[[vars$outcomes[1]]], X = data[, vars$predictors], family = binomial(),  
##     SL.library = sl_lib, verbose = TRUE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L)) 
## 
## 
##                          Risk      Coef
## SL.mean_All         0.2480217 0.0000000
## SL.lm2_All          0.1138972 0.3694042
## SL.glm2_All         0.1592580 0.0000000
## SL.glmnet_fast_All  0.1126716 0.3544172
## SL.mgcv2_1_All      0.1646661 0.0000000
## rpart_tune_All      0.1768881 0.0000000
## SL.ranger_200_All   0.1338529 0.0000000
## rf_tune_All         0.1306389 0.0000000
## SL.dbarts_200_All   0.1171052 0.2761786
## xgb_tune_All        0.1489688 0.0000000
## SL.xgboost_fast_All 0.1517064 0.0000000

save(sl,
     file = "data/estimator-sl.RData")

6.2.3 Review SL results

(auc_tab = ck37r::auc_table(sl, y = data[[vars$outcomes[1]]]))

##                learner       auc         se  ci_lower  ci_upper      p-value
## 1          SL.mean_All 0.5000000 0.05767800 0.3869532 0.6130468 1.582959e-13
## 6       rpart_tune_All 0.7987970 0.03299236 0.7341331 0.8634608 1.152513e-04
## 5       SL.mgcv2_1_All 0.8368735 0.02542988 0.7870318 0.8867151 5.172679e-04
## 11 SL.xgboost_fast_All 0.8565402 0.02169579 0.8140172 0.8990631 1.645421e-03
## 3          SL.glm2_All 0.8599265 0.02240240 0.8160185 0.9038344 3.516119e-03
## 10        xgb_tune_All 0.8622278 0.02144475 0.8201969 0.9042588 3.380814e-03
## 7    SL.ranger_200_All 0.8948072 0.01792615 0.8596726 0.9299418 7.743425e-02
## 8          rf_tune_All 0.9020642 0.01727790 0.8682002 0.9359283 1.455073e-01
## 4   SL.glmnet_fast_All 0.9149179 0.01705660 0.8814876 0.9483482 3.759973e-01
## 9    SL.dbarts_200_All 0.9168423 0.01622459 0.8850427 0.9486420 4.154285e-01
## 2           SL.lm2_All 0.9203080 0.01685427 0.8872742 0.9533417 5.000000e-01

# Drop p-value column.
auc_tab2 = auc_tab[, !names(auc_tab) %in% "p-value"]


# TODO: convert to knitr/kableExtra
print(xtable::xtable(auc_tab2, digits = 4), type = "latex",
      file = "tables/sl-auc_table.tex")

ck37r::plot_roc(sl, y = data[[vars$outcomes[1]]])

ggsave("visuals/roc-superlearner.pdf")

## Saving 7 x 5 in image

plot_table = function(x,
                      metric = "auc",
                      sort = TRUE) {

  # Use a clearer object name.
  tab = x

  if (!is.null(sort)) {
    tab = tab[order(tab[[metric]], decreasing = sort), ]
  }

  # Convert to a factor with manual levels so ggplot doesn't re-order
  # alphabetically.
  tab$learner = factor(tab$learner, levels = tab$learner)

  rownames(tab) = NULL

  p =
    ggplot2::ggplot(tab,
           aes(x = learner, y = get(metric), ymin = ci_lower, ymax = ci_upper)) +
      ggplot2::geom_pointrange(fatten = 2) +
      ggplot2::coord_flip() +
      ggplot2::labs(x = "Learner", y = metric) + theme_minimal()

  return(p)
}

# Skip SL.mean - it's too low to be worth plotting.
plot_table(auc_tab[-1, ]) + labs(y = "Cross-validated ROC-AUC")

ggsave("visuals/sl-roc-auc-comparison.pdf")

## Saving 7 x 5 in image

6.2.4 SL PR-AUC

(prauc_tab = ck37r::prauc_table(sl, y = data[[vars$outcomes[1]]]))

# TODO: switch to knitr
print(xtable::xtable(prauc_tab, digits = 4), type = "latex",
      file = "tables/sl-prauc_table.tex")

plot_table(prauc_tab, metric = "prauc") + labs(y = "Cross-validated PR-AUC")
ggsave("visuals/sl-prauc-comparison.pdf")

6.2.5 SL plots

library(dplyr)

df = data.frame(y = data[[vars$outcomes[1]]],
                pred = as.vector(sl$SL.predict))

df = df %>% mutate(decile = ntile(pred, 10L),
                   vigintile = ntile(pred, 20L)) %>% as.data.frame()

table(df$decile)

summary(df)

# Compare risk distribution for 0's vs 1's
ggplot(data = df, aes(x = pred, color = factor(y))) + 
  geom_density() + theme_minimal() +
  labs(title = "Distribution of predicted risk for 0's vs 1's",
         x = "Predicted risk Pr(Y = 1 | X)",
         y = "Density")

# Look at the risk distribution for each learner.
for (learner_i in seq(ncol(sl$Z))) {
  learner_name = sl$libraryNames[learner_i]
  preds = sl$Z[, learner_i, drop = TRUE]
  
  df = data.frame(y = data[[vars$outcomes[1]]],
                  pred = preds)
  
  g = ggplot(data = df, aes(x = pred, color = factor(y))) + 
    geom_density() + theme_minimal() +
    labs(title = "Distribution of predicted risk for 0's vs 1's",
         subtitle = paste("Learner:", learner_name), 
         x = "Predicted risk Pr(Y = 1 | X)",
         y = "Density")
  
  print(g)

}


ggplot(data = df, aes(x = pred, color = factor(y))) + 
  geom_freqpoly() + theme_minimal()

# Quick calibration plot
ggplot(data = df, aes(x = pred, y = y)) + 
  geom_smooth() + theme_minimal() +
  lims(y = c(0, 1))

6.3 Nested ensemble

The results of this block are cached because they are slow to compute.

Note: we are not currently saving the fitLibraries.

set.seed(1, "L'Ecuyer-CMRG")

# 2 is fastest, 10 is most thorough.
#outer_cv_folds = 10L
#outer_cv_folds = 5L
# Low setting to speed up tutorial.
outer_cv_folds = 2L

(cvsl =
    CV.SuperLearner(Y = data[[vars$outcomes[1]]], data[, vars$predictors],
          family = binomial(), SL.library = sl_lib,
          cvControl =
            SuperLearner.CV.control(stratifyCV = TRUE,
                                    V = outer_cv_folds),
          innerCvControl =
            rep(list(SuperLearner.CV.control(stratifyCV = TRUE,
                                             # Low setting to speed up tutorial.
                                             V = 2L)),
                                             #V = 5L)),
                                             #V = 10L)),
                outer_cv_folds),
          verbose = TRUE))

## Number of covariates in All is: 33

## CV SL.mean_All

## Warning in predict.lm(fit, newdata = newX, type = "response"): prediction from a
## rank-deficient fit may be misleading

## CV SL.lm2_All

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## CV SL.glm2_All

## CV SL.glmnet_fast_All

## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L =
## G$L, : Fitting terminated with step failure - check results carefully

## CV SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk      Coef
## SL.rpart2_0_10_5_All     0.2431768 0.0000000
## SL.rpart2_0.01_10_5_All  0.2431768 0.0000000
## SL.rpart2_0_20_5_All     0.1930073 0.8053982
## SL.rpart2_0.01_20_5_All  0.1930073 0.0000000
## SL.rpart2_0_80_5_All     0.2479737 0.1946018
## SL.rpart2_0.01_80_5_All  0.2479737 0.0000000
## SL.rpart2_0_10_15_All    0.2431768 0.0000000
## SL.rpart2_0.01_10_15_All 0.2431768 0.0000000
## SL.rpart2_0_20_15_All    0.1930073 0.0000000
## SL.rpart2_0.01_20_15_All 0.1930073 0.0000000
## SL.rpart2_0_80_15_All    0.2479737 0.0000000
## SL.rpart2_0.01_80_15_All 0.2479737 0.0000000

## CV rpart_tune_All

## CV SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk       Coef
## SL.ranger_2_2_All   0.1728728 0.00000000
## SL.ranger_5_2_All   0.1740868 0.00000000
## SL.ranger_15_2_All  0.1765832 0.00000000
## SL.ranger_2_5_All   0.1591611 0.00000000
## SL.ranger_5_5_All   0.1569518 0.32470449
## SL.ranger_15_5_All  0.1584887 0.61008799
## SL.ranger_2_11_All  0.1628896 0.00000000
## SL.ranger_5_11_All  0.1600054 0.00000000
## SL.ranger_15_11_All 0.1574472 0.06520752

## CV rf_tune_All

## CV SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk Coef
## SL.xgboost_fast_250_2_0.05_All  0.2479319    1
## SL.xgboost_fast_1000_2_0.05_All 0.2479319    0
## SL.xgboost_fast_250_4_0.05_All  0.2479319    0
## SL.xgboost_fast_1000_4_0.05_All 0.2479319    0
## SL.xgboost_fast_250_2_0.2_All   0.2479319    0
## SL.xgboost_fast_1000_2_0.2_All  0.2479319    0
## SL.xgboost_fast_250_4_0.2_All   0.2479319    0
## SL.xgboost_fast_1000_4_0.2_All  0.2479319    0

## CV xgb_tune_All

## CV SL.xgboost_fast_All

## Number of covariates in All is: 33

## CV SL.mean_All

## Warning in predict.lm(fit, newdata = newX, type = "response"): prediction from a
## rank-deficient fit may be misleading

## CV SL.lm2_All

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: prediction from a rank-deficient fit may be misleading

## CV SL.glm2_All

## CV SL.glmnet_fast_All

## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L =
## G$L, : Fitting terminated with step failure - check results carefully

## CV SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk      Coef
## SL.rpart2_0_10_5_All     0.2002296 0.4719052
## SL.rpart2_0.01_10_5_All  0.2002296 0.0000000
## SL.rpart2_0_20_5_All     0.2110399 0.2280292
## SL.rpart2_0.01_20_5_All  0.2110399 0.0000000
## SL.rpart2_0_80_5_All     0.2485257 0.3000656
## SL.rpart2_0.01_80_5_All  0.2485257 0.0000000
## SL.rpart2_0_10_15_All    0.2002296 0.0000000
## SL.rpart2_0.01_10_15_All 0.2002296 0.0000000
## SL.rpart2_0_20_15_All    0.2110399 0.0000000
## SL.rpart2_0.01_20_15_All 0.2110399 0.0000000
## SL.rpart2_0_80_15_All    0.2485257 0.0000000
## SL.rpart2_0.01_80_15_All 0.2485257 0.0000000

## CV rpart_tune_All

## CV SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk      Coef
## SL.ranger_2_2_All   0.1484159 0.0000000
## SL.ranger_5_2_All   0.1509681 0.0000000
## SL.ranger_15_2_All  0.1511276 0.0000000
## SL.ranger_2_5_All   0.1280430 0.3394855
## SL.ranger_5_5_All   0.1282459 0.4267854
## SL.ranger_15_5_All  0.1423158 0.0000000
## SL.ranger_2_11_All  0.1313379 0.0000000
## SL.ranger_5_11_All  0.1286828 0.2337291
## SL.ranger_15_11_All 0.1360923 0.0000000

## CV rf_tune_All

## CV SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk Coef
## SL.xgboost_fast_250_2_0.05_All  0.2484451    0
## SL.xgboost_fast_1000_2_0.05_All 0.2484451    1
## SL.xgboost_fast_250_4_0.05_All  0.2484451    0
## SL.xgboost_fast_1000_4_0.05_All 0.2484451    0
## SL.xgboost_fast_250_2_0.2_All   0.2484451    0
## SL.xgboost_fast_1000_2_0.2_All  0.2484451    0
## SL.xgboost_fast_250_4_0.2_All   0.2484451    0
## SL.xgboost_fast_1000_4_0.2_All  0.2484451    0

## CV xgb_tune_All

## CV SL.xgboost_fast_All

## Non-Negative least squares convergence: TRUE

## full SL.mean_All

## Warning in predict.lm(fit, newdata = newX, type = "response"): prediction from a
## rank-deficient fit may be misleading

## full SL.lm2_All

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## full SL.glm2_All

## full SL.glmnet_fast_All

## full SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk      Coef
## SL.rpart2_0_10_5_All     0.2312208 0.0000000
## SL.rpart2_0.01_10_5_All  0.2264744 0.0000000
## SL.rpart2_0_20_5_All     0.1763329 0.3812911
## SL.rpart2_0.01_20_5_All  0.1809318 0.0000000
## SL.rpart2_0_80_5_All     0.1658675 0.6187089
## SL.rpart2_0.01_80_5_All  0.1658675 0.0000000
## SL.rpart2_0_10_15_All    0.2312208 0.0000000
## SL.rpart2_0.01_10_15_All 0.2264744 0.0000000
## SL.rpart2_0_20_15_All    0.1763329 0.0000000
## SL.rpart2_0.01_20_15_All 0.1809318 0.0000000
## SL.rpart2_0_80_15_All    0.1658675 0.0000000
## SL.rpart2_0.01_80_15_All 0.1658675 0.0000000

## full rpart_tune_All

## full SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk        Coef
## SL.ranger_2_2_All   0.1599222 0.000000000
## SL.ranger_5_2_All   0.1584222 0.000000000
## SL.ranger_15_2_All  0.1628434 0.000000000
## SL.ranger_2_5_All   0.1528761 0.004440179
## SL.ranger_5_5_All   0.1535367 0.000000000
## SL.ranger_15_5_All  0.1511217 0.995559821
## SL.ranger_2_11_All  0.1583693 0.000000000
## SL.ranger_5_11_All  0.1591777 0.000000000
## SL.ranger_15_11_All 0.1553593 0.000000000

## full rf_tune_All

## full SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk Coef
## SL.xgboost_fast_250_2_0.05_All  0.2481858    1
## SL.xgboost_fast_1000_2_0.05_All 0.2481858    0
## SL.xgboost_fast_250_4_0.05_All  0.2481858    0
## SL.xgboost_fast_1000_4_0.05_All 0.2481858    0
## SL.xgboost_fast_250_2_0.2_All   0.2481858    0
## SL.xgboost_fast_1000_2_0.2_All  0.2481858    0
## SL.xgboost_fast_250_4_0.2_All   0.2481858    0
## SL.xgboost_fast_1000_4_0.2_All  0.2481858    0

## full xgb_tune_All

## full SL.xgboost_fast_All

## Number of covariates in All is: 33

## CV SL.mean_All

## Warning in predict.lm(fit, newdata = newX, type = "response"): prediction from a
## rank-deficient fit may be misleading

## CV SL.lm2_All

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## CV SL.glm2_All

## CV SL.glmnet_fast_All

## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L =
## G$L, : Fitting terminated with step failure - check results carefully

## CV SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk      Coef
## SL.rpart2_0_10_5_All     0.2199991 0.4023428
## SL.rpart2_0.01_10_5_All  0.2199991 0.0000000
## SL.rpart2_0_20_5_All     0.2080325 0.2802087
## SL.rpart2_0.01_20_5_All  0.2080325 0.0000000
## SL.rpart2_0_80_5_All     0.2479737 0.3174485
## SL.rpart2_0.01_80_5_All  0.2479737 0.0000000
## SL.rpart2_0_10_15_All    0.2199991 0.0000000
## SL.rpart2_0.01_10_15_All 0.2199991 0.0000000
## SL.rpart2_0_20_15_All    0.2080325 0.0000000
## SL.rpart2_0.01_20_15_All 0.2080325 0.0000000
## SL.rpart2_0_80_15_All    0.2479737 0.0000000
## SL.rpart2_0.01_80_15_All 0.2479737 0.0000000

## CV rpart_tune_All

## CV SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk Coef
## SL.ranger_2_2_All   0.1595356    0
## SL.ranger_5_2_All   0.1561519    0
## SL.ranger_15_2_All  0.1686828    0
## SL.ranger_2_5_All   0.1351799    1
## SL.ranger_5_5_All   0.1382402    0
## SL.ranger_15_5_All  0.1509647    0
## SL.ranger_2_11_All  0.1440623    0
## SL.ranger_5_11_All  0.1440000    0
## SL.ranger_15_11_All 0.1532859    0

## CV rf_tune_All

## CV SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk Coef
## SL.xgboost_fast_250_2_0.05_All  0.2479319    1
## SL.xgboost_fast_1000_2_0.05_All 0.2479319    0
## SL.xgboost_fast_250_4_0.05_All  0.2479319    0
## SL.xgboost_fast_1000_4_0.05_All 0.2479319    0
## SL.xgboost_fast_250_2_0.2_All   0.2479319    0
## SL.xgboost_fast_1000_2_0.2_All  0.2479319    0
## SL.xgboost_fast_250_4_0.2_All   0.2479319    0
## SL.xgboost_fast_1000_4_0.2_All  0.2479319    0

## CV xgb_tune_All

## CV SL.xgboost_fast_All

## Number of covariates in All is: 33

## CV SL.mean_All

## Warning in predict.lm(fit, newdata = newX, type = "response"): prediction from a
## rank-deficient fit may be misleading

## CV SL.lm2_All

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## CV SL.glm2_All

## CV SL.glmnet_fast_All

## Warning in newton(lsp = lsp, X = G$X, y = G$y, Eb = G$Eb, UrS = G$UrS, L =
## G$L, : Fitting terminated with step failure - check results carefully

## CV SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk       Coef
## SL.rpart2_0_10_5_All     0.2258783 0.56991485
## SL.rpart2_0.01_10_5_All  0.2258783 0.00000000
## SL.rpart2_0_20_5_All     0.2403511 0.02177855
## SL.rpart2_0.01_20_5_All  0.2403511 0.00000000
## SL.rpart2_0_80_5_All     0.2480521 0.40830660
## SL.rpart2_0.01_80_5_All  0.2480521 0.00000000
## SL.rpart2_0_10_15_All    0.2258783 0.00000000
## SL.rpart2_0.01_10_15_All 0.2258783 0.00000000
## SL.rpart2_0_20_15_All    0.2403511 0.00000000
## SL.rpart2_0.01_20_15_All 0.2403511 0.00000000
## SL.rpart2_0_80_15_All    0.2480521 0.00000000
## SL.rpart2_0.01_80_15_All 0.2480521 0.00000000

## CV rpart_tune_All

## CV SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk        Coef
## SL.ranger_2_2_All   0.1815163 0.000000000
## SL.ranger_5_2_All   0.1778897 0.004853019
## SL.ranger_15_2_All  0.1886354 0.000000000
## SL.ranger_2_5_All   0.1809942 0.000000000
## SL.ranger_5_5_All   0.1684264 0.995146981
## SL.ranger_15_5_All  0.1811368 0.000000000
## SL.ranger_2_11_All  0.1843232 0.000000000
## SL.ranger_5_11_All  0.1826941 0.000000000
## SL.ranger_15_11_All 0.1841492 0.000000000

## CV rf_tune_All

## CV SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk Coef
## SL.xgboost_fast_250_2_0.05_All  0.2480845    1
## SL.xgboost_fast_1000_2_0.05_All 0.2480845    0
## SL.xgboost_fast_250_4_0.05_All  0.2480845    0
## SL.xgboost_fast_1000_4_0.05_All 0.2480845    0
## SL.xgboost_fast_250_2_0.2_All   0.2480845    0
## SL.xgboost_fast_1000_2_0.2_All  0.2480845    0
## SL.xgboost_fast_250_4_0.2_All   0.2480845    0
## SL.xgboost_fast_1000_4_0.2_All  0.2480845    0

## CV xgb_tune_All

## CV SL.xgboost_fast_All

## Non-Negative least squares convergence: TRUE

## full SL.mean_All

## Warning in predict.lm(fit, newdata = newX, type = "response"): prediction from a
## rank-deficient fit may be misleading

## full SL.lm2_All

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## full SL.glm2_All

## full SL.glmnet_fast_All

## full SL.mgcv2_1_All

## Running rpart_tune
## Rpart tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 5L), obsWeights = obsWeights) 
## 
## 
##                               Risk      Coef
## SL.rpart2_0_10_5_All     0.2098700 0.0000000
## SL.rpart2_0.01_10_5_All  0.2098700 0.0000000
## SL.rpart2_0_20_5_All     0.1980541 0.0000000
## SL.rpart2_0.01_20_5_All  0.1980541 0.0000000
## SL.rpart2_0_80_5_All     0.1993456 0.4629797
## SL.rpart2_0.01_80_5_All  0.1993456 0.0000000
## SL.rpart2_0_10_15_All    0.1947496 0.5370203
## SL.rpart2_0.01_10_15_All 0.2044895 0.0000000
## SL.rpart2_0_20_15_All    0.1980541 0.0000000
## SL.rpart2_0.01_20_15_All 0.1980541 0.0000000
## SL.rpart2_0_80_15_All    0.1993456 0.0000000
## SL.rpart2_0.01_80_15_All 0.1993456 0.0000000

## full rpart_tune_All

## full SL.ranger_200_All

## Running rf_tune
## RF tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                          Risk     Coef
## SL.ranger_2_2_All   0.1585511 0.000000
## SL.ranger_5_2_All   0.1564458 0.000000
## SL.ranger_15_2_All  0.1612925 0.000000
## SL.ranger_2_5_All   0.1426208 0.875167
## SL.ranger_5_5_All   0.1437786 0.000000
## SL.ranger_15_5_All  0.1479854 0.000000
## SL.ranger_2_11_All  0.1441626 0.124833
## SL.ranger_5_11_All  0.1474295 0.000000
## SL.ranger_15_11_All 0.1472194 0.000000

## full rf_tune_All

## full SL.dbarts_200_All

## Running xgb_tune
## XGB tuned SL:
## 
## Call:  
## SuperLearner(Y = Y, X = X, newX = newX, family = family, SL.library = grid2$names,  
##     verbose = FALSE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L), obsWeights = obsWeights) 
## 
## 
##                                      Risk Coef
## SL.xgboost_fast_250_2_0.05_All  0.2478803    1
## SL.xgboost_fast_1000_2_0.05_All 0.2478803    0
## SL.xgboost_fast_250_4_0.05_All  0.2478803    0
## SL.xgboost_fast_1000_4_0.05_All 0.2478803    0
## SL.xgboost_fast_250_2_0.2_All   0.2478803    0
## SL.xgboost_fast_1000_2_0.2_All  0.2478803    0
## SL.xgboost_fast_250_4_0.2_All   0.2478803    0
## SL.xgboost_fast_1000_4_0.2_All  0.2478803    0

## full xgb_tune_All

## full SL.xgboost_fast_All

## 
## Call:  
## CV.SuperLearner(Y = data[[vars$outcomes[1]]], X = data[, vars$predictors],  
##     family = binomial(), SL.library = sl_lib, verbose = TRUE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = outer_cv_folds), innerCvControl = rep(list(SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L)), outer_cv_folds)) 
## 
## 
## Cross-validated predictions from the SuperLearner:  SL.predict 
## 
## Cross-validated predictions from the discrete super learner (cross-validation selector):  discreteSL.predict 
## 
## Which library algorithm was the discrete super learner:  whichDiscreteSL 
## 
## Cross-validated prediction for all algorithms in the library:  library.predict

save(cvsl,
     file = "data/estimator-cvsl.RData")

summary(cvsl)

## 
## Call:  
## CV.SuperLearner(Y = data[[vars$outcomes[1]]], X = data[, vars$predictors],  
##     family = binomial(), SL.library = sl_lib, verbose = TRUE, cvControl = SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = outer_cv_folds), innerCvControl = rep(list(SuperLearner.CV.control(stratifyCV = TRUE,  
##         V = 2L)), outer_cv_folds)) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  2 
## 
##            Algorithm     Ave        se     Min     Max
##        Super Learner 0.11574 0.0102761 0.10813 0.12335
##          Discrete SL 0.11752 0.0101137 0.11175 0.12329
##          SL.mean_All 0.24802 0.0025546 0.24789 0.24816
##           SL.lm2_All 0.11388 0.0102456 0.10917 0.11859
##          SL.glm2_All 0.15924 0.0174066 0.15381 0.16467
##   SL.glmnet_fast_All 0.11280 0.0117058 0.10880 0.11680
##       SL.mgcv2_1_All 0.16467 0.0176962 0.16385 0.16548
##       rpart_tune_All 0.18406 0.0152627 0.17467 0.19345
##    SL.ranger_200_All 0.13236 0.0101444 0.12557 0.13916
##          rf_tune_All 0.13502 0.0098337 0.12338 0.14666
##    SL.dbarts_200_All 0.11752 0.0101137 0.11175 0.12329
##         xgb_tune_All 0.14893 0.0123762 0.13641 0.16144
##  SL.xgboost_fast_All 0.15166 0.0126596 0.13714 0.16618

6.3.1 Ensemble weights

# Review weight distribution.
(weight_tab = ck37r::cvsl_weights(cvsl))

##     #      Learner    Mean      SD     Min     Max
## 1   1   dbarts_200 0.75463 0.23340 0.58959 0.91967
## 2   2  glmnet_fast 0.20520 0.29020 0.00000 0.41041
## 3   3   ranger_200 0.02775 0.03924 0.00000 0.05550
## 4   4         glm2 0.01241 0.01756 0.00000 0.02483
## 5   5         mean 0.00000 0.00000 0.00000 0.00000
## 6   6          lm2 0.00000 0.00000 0.00000 0.00000
## 7   7      mgcv2_1 0.00000 0.00000 0.00000 0.00000
## 8   8   rpart_tune 0.00000 0.00000 0.00000 0.00000
## 9   9      rf_tune 0.00000 0.00000 0.00000 0.00000
## 10 10     xgb_tune 0.00000 0.00000 0.00000 0.00000
## 11 11 xgboost_fast 0.00000 0.00000 0.00000 0.00000

# Remove algorithms with 0 weight.
(weight_tab = weight_tab[weight_tab$Max > 0, ])

##   #     Learner    Mean      SD     Min     Max
## 1 1  dbarts_200 0.75463 0.23340 0.58959 0.91967
## 2 2 glmnet_fast 0.20520 0.29020 0.00000 0.41041
## 3 3  ranger_200 0.02775 0.03924 0.00000 0.05550
## 4 4        glm2 0.01241 0.01756 0.00000 0.02483

cat(kable(weight_tab, digits = 4, format = "latex", booktabs = TRUE,
          label = "cvsl-weights",
          row.names = FALSE,
          caption = "Distribution of algorithm weights across ensemble cross-validation replications") %>%
      kable_styling(latex_options = "hold_position"),
      file = "tables/cvsl-weight-table.tex")

6.3.2 AUC analysis

library(dplyr)
library(ck37r)

######
# AUC analysis

(auc_tab = auc_table(cvsl))

##                           auc         se  ci_lower  ci_upper      p-value
## SL.mean_All         0.5000000 0.05767800 0.3869532 0.6130468 1.582959e-13
## rpart_tune_All      0.7887063 0.03435854 0.7213648 0.8560478 6.400740e-05
## SL.mgcv2_1_All      0.8368735 0.02542988 0.7870318 0.8867151 5.172679e-04
## SL.xgboost_fast_All 0.8565402 0.02169579 0.8140172 0.8990631 1.645421e-03
## SL.glm2_All         0.8599265 0.02240240 0.8160185 0.9038344 3.516119e-03
## xgb_tune_All        0.8622278 0.02144475 0.8201969 0.9042588 3.380814e-03
## rf_tune_All         0.8926853 0.01803672 0.8573339 0.9280366 6.282675e-02
## SL.ranger_200_All   0.8975765 0.01761040 0.8630608 0.9320923 9.838723e-02
## SL.dbarts_200_All   0.9159608 0.01627180 0.8840686 0.9478529 3.946726e-01
## DiscreteSL          0.9159608 0.01627180 0.8840686 0.9478529 3.946726e-01
## SL.glmnet_fast_All  0.9166283 0.01659627 0.8841002 0.9491564 4.122684e-01
## SuperLearner        0.9171106 0.01637241 0.8850213 0.9492000 4.225844e-01
## SL.lm2_All          0.9203080 0.01685427 0.8872742 0.9533417 5.000000e-01

# Drop p-value column.
auc_tab2 = auc_tab[, !names(auc_tab) %in% "p-value"]

# Convert rownames to learner column.
auc_tab2$learner = rownames(auc_tab2)

# Move learner column to the beginning.
(auc_tab2 = cbind(learner = auc_tab2$learner,
                  auc_tab2[, !names(auc_tab2) %in% "learner"]))

##                                 learner       auc         se  ci_lower
## SL.mean_All                 SL.mean_All 0.5000000 0.05767800 0.3869532
## rpart_tune_All           rpart_tune_All 0.7887063 0.03435854 0.7213648
## SL.mgcv2_1_All           SL.mgcv2_1_All 0.8368735 0.02542988 0.7870318
## SL.xgboost_fast_All SL.xgboost_fast_All 0.8565402 0.02169579 0.8140172
## SL.glm2_All                 SL.glm2_All 0.8599265 0.02240240 0.8160185
## xgb_tune_All               xgb_tune_All 0.8622278 0.02144475 0.8201969
## rf_tune_All                 rf_tune_All 0.8926853 0.01803672 0.8573339
## SL.ranger_200_All     SL.ranger_200_All 0.8975765 0.01761040 0.8630608
## SL.dbarts_200_All     SL.dbarts_200_All 0.9159608 0.01627180 0.8840686
## DiscreteSL                   DiscreteSL 0.9159608 0.01627180 0.8840686
## SL.glmnet_fast_All   SL.glmnet_fast_All 0.9166283 0.01659627 0.8841002
## SuperLearner               SuperLearner 0.9171106 0.01637241 0.8850213
## SL.lm2_All                   SL.lm2_All 0.9203080 0.01685427 0.8872742
##                      ci_upper
## SL.mean_All         0.6130468
## rpart_tune_All      0.8560478
## SL.mgcv2_1_All      0.8867151
## SL.xgboost_fast_All 0.8990631
## SL.glm2_All         0.9038344
## xgb_tune_All        0.9042588
## rf_tune_All         0.9280366
## SL.ranger_200_All   0.9320923
## SL.dbarts_200_All   0.9478529
## DiscreteSL          0.9478529
## SL.glmnet_fast_All  0.9491564
## SuperLearner        0.9492000
## SL.lm2_All          0.9533417

colnames(auc_tab2)[1] = "learner"
rownames(auc_tab2) = NULL

# Skip SL.mean - it's too low to be worth plotting.
plot_table(auc_tab2[-1, ]) + labs(y = "Cross-validated ROC-AUC")

ggsave("visuals/cvsl-roc-auc-comparison.pdf")

## Saving 7 x 5 in image

plot_roc(cvsl)

ggsave("visuals/cvsl-roc.pdf")

## Saving 7 x 5 in image

names(auc_tab2)

## [1] "learner"  "auc"      "se"       "ci_lower" "ci_upper"

names(auc_tab2) = c("Learner", "ROC-AUC", "Std. Err.", "CI Lower", "CI Upper")

cat(kable(auc_tab2, digits = 4, format = "latex", booktabs = TRUE,
          label = "cvsl-auc",
          caption = "Cross-validated ROC-AUC discrimination performance"),
      file = "tables/cvsl-auc_table.tex")

6.3.3 Precision-Recall analysis

######
# Precision-Recall analysis

(prauc_tab = prauc_table(cvsl))

##                         prauc      stderr  ci_lower  ci_upper
## SL.mean_All         0.5445495 0.001503137 0.5416033 0.5474956
## rpart_tune_All      0.7989240 0.002566970 0.7938927 0.8039552
## SL.mgcv2_1_All      0.8131151 0.011315743 0.7909362 0.8352940
## SL.glm2_All         0.8432754 0.012871382 0.8180475 0.8685033
## SL.xgboost_fast_All 0.8657094 0.005130902 0.8556529 0.8757660
## xgb_tune_All        0.8672148 0.009362066 0.8488651 0.8855644
## rf_tune_All         0.9089016 0.004028301 0.9010061 0.9167971
## SL.ranger_200_All   0.9140923 0.001131961 0.9118736 0.9163109
## SL.glmnet_fast_All  0.9200184 0.019765525 0.8812780 0.9587588
## SuperLearner        0.9218614 0.015730351 0.8910299 0.9526929
## SL.lm2_All          0.9222696 0.016925163 0.8890962 0.9554429
## SL.dbarts_200_All   0.9229947 0.014703303 0.8941762 0.9518131
## DiscreteSL          0.9229947 0.014703303 0.8941762 0.9518131

prauc_tab$learner = rownames(prauc_tab)
rownames(prauc_tab) = NULL

# Move learner column to the beginning.
(prauc_tab = cbind(learner = prauc_tab$learner,
                   prauc_tab[, !names(prauc_tab) %in% "learner"]))

##                learner     prauc      stderr  ci_lower  ci_upper
## 1          SL.mean_All 0.5445495 0.001503137 0.5416033 0.5474956
## 2       rpart_tune_All 0.7989240 0.002566970 0.7938927 0.8039552
## 3       SL.mgcv2_1_All 0.8131151 0.011315743 0.7909362 0.8352940
## 4          SL.glm2_All 0.8432754 0.012871382 0.8180475 0.8685033
## 5  SL.xgboost_fast_All 0.8657094 0.005130902 0.8556529 0.8757660
## 6         xgb_tune_All 0.8672148 0.009362066 0.8488651 0.8855644
## 7          rf_tune_All 0.9089016 0.004028301 0.9010061 0.9167971
## 8    SL.ranger_200_All 0.9140923 0.001131961 0.9118736 0.9163109
## 9   SL.glmnet_fast_All 0.9200184 0.019765525 0.8812780 0.9587588
## 10        SuperLearner 0.9218614 0.015730351 0.8910299 0.9526929
## 11          SL.lm2_All 0.9222696 0.016925163 0.8890962 0.9554429
## 12   SL.dbarts_200_All 0.9229947 0.014703303 0.8941762 0.9518131
## 13          DiscreteSL 0.9229947 0.014703303 0.8941762 0.9518131

plot_table(prauc_tab, metric = "prauc") + labs(y = "Cross-validated PR-AUC")

ggsave("visuals/cvsl-prauc-comparison.pdf")

## Saving 7 x 5 in image

################
# Precision-recall curve comparison.

pred_lib = data.frame(cvsl$library.predict)

names(pred_lib) = cvsl$libraryNames
pred_df = pred_lib

summary(pred_df)

##   SL.mean_All       SL.lm2_All      SL.glm2_All     SL.glmnet_fast_All
##  Min.   :0.5430   Min.   :0.0000   Min.   :0.0000   Min.   :0.002291  
##  1st Qu.:0.5430   1st Qu.:0.2252   1st Qu.:0.0153   1st Qu.:0.139768  
##  Median :0.5430   Median :0.5823   Median :0.6291   Median :0.616401  
##  Mean   :0.5445   Mean   :0.5364   Mean   :0.5128   Mean   :0.533499  
##  3rd Qu.:0.5461   3rd Qu.:0.8664   3rd Qu.:0.9745   3rd Qu.:0.903847  
##  Max.   :0.5461   Max.   :1.0000   Max.   :1.0000   Max.   :0.995543  
##  SL.mgcv2_1_All    rpart_tune_All    SL.ranger_200_All  rf_tune_All     
##  Min.   :0.00000   Min.   :0.05936   Min.   :0.00665   Min.   :0.01191  
##  1st Qu.:0.01362   1st Qu.:0.20312   1st Qu.:0.25080   1st Qu.:0.27727  
##  Median :0.66733   Median :0.63368   Median :0.55285   Median :0.54753  
##  Mean   :0.52910   Mean   :0.53916   Mean   :0.53629   Mean   :0.53495  
##  3rd Qu.:0.98181   3rd Qu.:0.85643   3rd Qu.:0.83460   3rd Qu.:0.81306  
##  Max.   :1.00000   Max.   :0.96970   Max.   :0.99217   Max.   :0.98669  
##  SL.dbarts_200_All  xgb_tune_All     SL.xgboost_fast_All
##  Min.   :0.01777   Min.   :0.05946   Min.   :0.05113    
##  1st Qu.:0.22435   1st Qu.:0.20156   1st Qu.:0.19636    
##  Median :0.58495   Median :0.56314   Median :0.55752    
##  Mean   :0.53911   Mean   :0.53977   Mean   :0.53697    
##  3rd Qu.:0.85516   3rd Qu.:0.85621   3rd Qu.:0.86247    
##  Max.   :0.98518   Max.   :0.95030   Max.   :0.98142

# Add on the SuperLearner prediction.
pred_df$SuperLearner = cvsl$SL.predict

library(precrec)
library(ggplot2)

(sscurves = evalmod(scores = pred_df,
                    labels = cvsl$Y,
                    modnames = names(pred_df)))

## 
##     === AUCs ===
## 
##               Model name Dataset ID Curve type       AUC
##    1         SL.mean_All          1        ROC 0.4984848
##    2         SL.mean_All          1        PRC 0.5435123
##    3          SL.lm2_All          1        ROC 0.9207290
##    4          SL.lm2_All          1        PRC 0.9193685
##    5         SL.glm2_All          1        ROC 0.8586737
##    6         SL.glm2_All          1        PRC 0.8443813
##    7  SL.glmnet_fast_All          1        ROC 0.9157664
##    8  SL.glmnet_fast_All          1        PRC 0.9174906
##    9      SL.mgcv2_1_All          1        ROC 0.8355292
##   10      SL.mgcv2_1_All          1        PRC 0.8094916
##   11      rpart_tune_All          1        ROC 0.7879666
##   12      rpart_tune_All          1        PRC 0.7911882
##   13   SL.ranger_200_All          1        ROC 0.8955204
##   14   SL.ranger_200_All          1        PRC 0.9130445
##   15         rf_tune_All          1        ROC 0.8934124
##   16         rf_tune_All          1        PRC 0.9078671
##   17   SL.dbarts_200_All          1        ROC 0.9150637
##   18   SL.dbarts_200_All          1        PRC 0.9165212
##   19        xgb_tune_All          1        ROC 0.8628678
##   20        xgb_tune_All          1        PRC 0.8712067
##   21 SL.xgboost_fast_All          1        ROC 0.8579491
##   22 SL.xgboost_fast_All          1        PRC 0.8640953
##   23        SuperLearner          1        ROC 0.9168643
##   24        SuperLearner          1        PRC 0.9148774
## 
## 
##     === Input data ===
## 
##               Model name Dataset ID # of negatives # of positives
##    1         SL.mean_All          1            138            165
##    2          SL.lm2_All          1            138            165
##    3         SL.glm2_All          1            138            165
##    4  SL.glmnet_fast_All          1            138            165
##    5      SL.mgcv2_1_All          1            138            165
##    6      rpart_tune_All          1            138            165
##    7   SL.ranger_200_All          1            138            165
##    8         rf_tune_All          1            138            165
##    9   SL.dbarts_200_All          1            138            165
##   10        xgb_tune_All          1            138            165
##   11 SL.xgboost_fast_All          1            138            165
##   12        SuperLearner          1            138            165

# Show a Precision-Recall plot comparing all estimators.
# TODO: subset this one and improve legend placement.
autoplot(sscurves, "PRC") +
  labs(title = element_blank()) +
  theme(legend.position = c(0.65, 0.65),
        legend.text = element_text(size = 8), #face = "bold"),
        legend.margin = margin(l = 0.2, r = 0.2, b = 0.2, unit = "cm"),
        legend.background = element_rect(fill = alpha("gray95", 0.8),
                                         color = "gray80"),
        legend.key = element_blank())

ggsave("visuals/prc-comparison.pdf",
       width = 4, height = 4)


################
# PR-AUC table
names(prauc_tab) = c("Learner", "PR-AUC", "Std. Err.", "CI Lower", "CI Upper")

cat(kable(prauc_tab, digits = 4, format = "latex", booktabs = TRUE,
          label = "cvsl-prauc",
          caption = "Cross-validated PR-AUC discrimination performance") %>%
      kable_styling(latex_options = "hold_position"),
      file = "tables/cvsl-prauc_table.tex")

6.3.4 Brier score

##########################
# Brier score table.

(brier_tab = ck37r::brier_table(cvsl))

##                         brier       stderr  ci_lower  ci_upper
## SL.mean_All         0.2480221 0.0001339280 0.2477596 0.2482846
## rpart_tune_All      0.1840604 0.0093869313 0.1656620 0.2024588
## SL.mgcv2_1_All      0.1646688 0.0008143654 0.1630727 0.1662650
## SL.glm2_All         0.1592401 0.0054333149 0.1485908 0.1698894
## SL.xgboost_fast_All 0.1516585 0.0145181452 0.1232029 0.1801141
## xgb_tune_All        0.1489275 0.0125172917 0.1243936 0.1734614
## rf_tune_All         0.1350198 0.0116438189 0.1121979 0.1578417
## SL.ranger_200_All   0.1323632 0.0067966648 0.1190417 0.1456847
## SL.dbarts_200_All   0.1175203 0.0057694833 0.1062121 0.1288285
## DiscreteSL          0.1175203 0.0057694833 0.1062121 0.1288285
## SuperLearner        0.1157416 0.0076113337 0.1008234 0.1306598
## SL.lm2_All          0.1138816 0.0047130518 0.1046440 0.1231192
## SL.glmnet_fast_All  0.1127997 0.0040007360 0.1049583 0.1206412

names(brier_tab) = c("Brier score", "Std. Err.", "CI Lower", "CI Upper")

brier_tab = cbind(Learner = rownames(brier_tab), brier_tab)

rownames(brier_tab) = NULL

cat(kable(brier_tab, digits = 5, format = "latex", booktabs = TRUE,
          label = "cvsl-brier",
          row.names = FALSE,
          caption = "Cross-validated Brier score for each learner and the ensemble") %>%
      kable_styling(latex_options = "hold_position"),
      file = "tables/cvsl-brier-table.tex")

6.3.5 Index of prediction accuracy

##########################
# Index of prediction accuracy table.

(ipa_tab = ck37r::ipa_table(cvsl))

##                               ipa       stderr      ci_lower      ci_upper
## SL.mean_All         -3.644036e-05 1.967793e-08 -3.647893e-05 -3.640179e-05
## rpart_tune_All       2.578395e-01 3.824930e-02  1.828709e-01  3.328081e-01
## SL.mgcv2_1_All       3.360494e-01 2.925023e-03  3.303163e-01  3.417824e-01
## SL.glm2_All          3.579247e-01 2.225410e-02  3.143067e-01  4.015427e-01
## SL.xgboost_fast_All  3.884743e-01 5.886805e-02  2.730929e-01  5.038556e-01
## xgb_tune_All         3.994901e-01 5.079457e-02  2.999327e-01  4.990474e-01
## rf_tune_All          4.555686e-01 4.724240e-02  3.629735e-01  5.481637e-01
## SL.ranger_200_All    4.662906e-01 2.769267e-02  4.120130e-01  5.205682e-01
## SL.dbarts_200_All    5.261401e-01 2.351870e-02  4.800434e-01  5.722368e-01
## DiscreteSL           5.261401e-01 2.351870e-02  4.800434e-01  5.722368e-01
## SuperLearner         5.333079e-01 3.094126e-02  4.726631e-01  5.939528e-01
## SL.lm2_All           5.408138e-01 1.925120e-02  5.030814e-01  5.785461e-01
## SL.glmnet_fast_All   5.451776e-01 1.637675e-02  5.130792e-01  5.772760e-01

names(ipa_tab) = c("IPA", "Std. Err.", "CI Lower", "CI Upper")

ipa_tab = cbind(Learner = rownames(ipa_tab), ipa_tab)

rownames(ipa_tab) = NULL

cat(kable(ipa_tab, digits = 4, format = "latex", booktabs = TRUE,
          label = "cvsl-ipa",
          row.names = FALSE,
          caption = "Cross-validated index of prediction accuracy for each learner and the ensemble") %>%
      kable_styling(latex_options = "hold_position"),
      file = "tables/cvsl-ipa-table.tex")