
Running `super_learner()` in Parallel
Parallelization is supported in `super_learner()` and `cv_super_learner()`, and is implemented through the future package. If you’d like to use the parallel versions of `super_learner()` or `cv_super_learner()`, it is as simple as calling `library(future)` (as long as you have it installed) and declaring a plan, e.g. `plan(multicore)`.
library(nadir)
library(future)
#> Warning: package 'future' was built under R version 4.3.3
library(tidytuesdayR)
#> Warning: package 'tidytuesdayR' was built under R version 4.3.3
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(microbenchmark)
plan(multicore) # or similar, see https://future.futureverse.org/
# We recommend a multicore setup on a Unix/Linux machine if you actually
# want to see a speed gain from parallelizing your code.
#
# Note that plan(multicore) does not run inside RStudio or on Windows, but
# plan(multisession) does. In our experience, multisession has not led to a
# speed increase compared to running with a sequential plan.
data("Boston", package = 'MASS')
data <- Boston
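If you want one script to behave sensibly on every platform, you can choose the plan at runtime instead of hard-coding it. A minimal sketch using `supportsMulticore()` from the parallelly package (which future builds on); this helper is our suggestion, not something the chunk above requires:
# a sketch: prefer multicore where forking is supported, multisession elsewhere
if (parallelly::supportsMulticore()) {
  plan(multicore)
} else {
  plan(multisession)
}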
Speed gains are most obvious in `cv_super_learner()`
Let’s run a timing test to see whether there’s a performance improvement from using a `multicore` vs. a `sequential` plan:
# sequential version:
plan(sequential)
microbenchmark({
sl_closure <- function(data) {
super_learner(
data = data,
formula = medv ~ .,
learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean)
)
}
cv_super_learner(data, sl_closure, y_variable = 'medv')
}, times = 3)
#> Warning in microbenchmark({: less accurate nanosecond times to avoid potential
#> integer overflows
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#> expr
#> { sl_closure <- function(data) { super_learner(data = data, formula = medv ~ ., learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean)) } cv_super_learner(data, sl_closure, y_variable = "medv") }
#> min lq mean median uq max neval
#> 7.732953 7.7542 7.904345 7.775448 7.99004 8.204633 3
# multicore version:
plan(multicore, workers = 10)
microbenchmark({
sl_closure <- function(data) {
super_learner(
data = data,
formula = medv ~ .,
learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean)
)
}
cv_super_learner(data, sl_closure, y_variable = 'medv')
}, times = 3)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#> expr
#> { sl_closure <- function(data) { super_learner(data = data, formula = medv ~ ., learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean)) } cv_super_learner(data, sl_closure, y_variable = "medv") }
#> min lq mean median uq max neval
#> 2.016981 2.083049 2.216926 2.149117 2.316898 2.484679 3
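The multicore run is roughly 3.5x faster than the sequential one on this machine. If you’d like to compute the speedup programmatically rather than reading it off the printed tables, you can capture the benchmark objects instead of printing them. A sketch, reusing the `sl_closure` defined above (`bm_seq` and `bm_par` are our own names, not nadir API):
plan(sequential)
bm_seq <- microbenchmark(cv_super_learner(data, sl_closure, y_variable = 'medv'), times = 3)
plan(multicore, workers = 10)
bm_par <- microbenchmark(cv_super_learner(data, sl_closure, y_variable = 'medv'), times = 3)
# microbenchmark records $time in nanoseconds, so the units cancel in the ratio
median(bm_seq$time) / median(bm_par$time)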
# raising future's cap on the size of globals shipped to parallel workers
# may be useful for larger datasets
options(future.globals.maxSize = +Inf)
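Setting `future.globals.maxSize` to `+Inf` turns the size check off entirely. If you’d rather keep a guardrail, you can set a finite cap instead; a sketch (the 1 GB figure is an arbitrary example, not a recommendation):
options(future.globals.maxSize = 1e9)  # cap exported globals at ~1 GB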
learners <- list(
mean = lnr_mean,
lm = lnr_lm,
rf = lnr_rf,
earth = lnr_earth,
xgboost = lnr_xgboost,
glmnet0 = lnr_glmnet,
glmnet1 = lnr_glmnet,
glmnet2 = lnr_glmnet,
glmnet3 = lnr_glmnet
)
extra_args <- list(
glmnet0 = list(lambda = 0.01),
glmnet1 = list(lambda = 0.2),
glmnet2 = list(lambda = 0.4),
glmnet3 = list(lambda = 0.6)
)
plan(sequential)
microbenchmark({
sl_closure <- function(data) {
nadir::super_learner(
data = data,
formulas = mpg ~ .,
learners = learners,
extra_learner_args = extra_args)
}
cv_out <- cv_super_learner(
data = mtcars,
sl_closure = sl_closure,
y_variable = 'mpg')
}, times = 3)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#> expr
#> { sl_closure <- function(data) { nadir::super_learner(data = data, formulas = mpg ~ ., learners = learners, extra_learner_args = extra_args) } cv_out <- cv_super_learner(data = mtcars, sl_closure = sl_closure, y_variable = "mpg") }
#> min lq mean median uq max neval
#> 2.402918 2.433435 2.474872 2.463952 2.510849 2.557746 3
plan(multicore)
microbenchmark({
sl_closure <- function(data) {
nadir::super_learner(
data = data,
formulas = mpg ~ .,
learners = learners,
extra_learner_args = extra_args)
}
cv_out <- cv_super_learner(
data = mtcars,
sl_closure = sl_closure,
y_variable = 'mpg')
}, times = 3)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Warning in predict.lm(model, newdata = newdata, type = "response"): prediction
#> from rank-deficient fit; attr(*, "non-estim") has doubtful cases
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: milliseconds
#> expr
#> { sl_closure <- function(data) { nadir::super_learner(data = data, formulas = mpg ~ ., learners = learners, extra_learner_args = extra_args) } cv_out <- cv_super_learner(data = mtcars, sl_closure = sl_closure, y_variable = "mpg") }
#> min lq mean median uq max neval
#> 928.6727 939.2432 964.423 949.8138 982.2981 1014.782 3
But why is it not so obvious for just `super_learner()`?
Because `cv_super_learner()` involves an additional layer of cross-validation, the effect of parallelization is more pronounced in `cv_super_learner()` than in `super_learner()`. Parallelization does work in `super_learner()` as well, though: raising the number of CV folds increases the amount of work that can be distributed across workers, and with it the relative payoff of the parallel option.
plan(sequential)
microbenchmark({
sl_out <- nadir::super_learner(
data = Boston,
formulas = medv ~ .,
learners = learners,
n_folds = 20,
extra_learner_args = extra_args,
verbose = TRUE)
}, times = 3)
#> Unit: seconds
#> expr
#> { sl_out <- nadir::super_learner(data = Boston, formulas = medv ~ ., learners = learners, n_folds = 20, extra_learner_args = extra_args, verbose = TRUE) }
#> min lq mean median uq max neval
#> 12.70487 12.77468 12.84642 12.84449 12.91719 12.98988 3
plan(multicore)
microbenchmark({
sl_out <- nadir::super_learner(
data = Boston,
formulas = medv ~ .,
learners = learners,
n_folds = 20,
extra_learner_args = extra_args,
verbose = TRUE)
}, times = 3)
#> Unit: seconds
#> expr
#> { sl_out <- nadir::super_learner(data = Boston, formulas = medv ~ ., learners = learners, n_folds = 20, extra_learner_args = extra_args, verbose = TRUE) }
#> min lq mean median uq max neval
#> 8.557656 8.615234 8.650471 8.672812 8.696878 8.720944 3
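On this machine, the 20-fold run drops from about 12.8 to 8.7 seconds under multicore, roughly a 1.5x gain even on a modest dataset. Once you’re finished with parallel work, it’s good practice to return to a sequential plan so later code doesn’t inherit the parallel backend:
plan(sequential)  # restore sequential evaluation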