Skip to contents

Parallelization is supported in super_learner() and cv_super_learner(), and is implemented through the future package.

If you’d like to use the parallel versions of super_learner() or cv_super_learner(), it is as simple as calling library(future) (as long as you have it installed) and declaring a plan such as plan(multicore).

library(nadir)
#> Registered S3 method overwritten by 'future':
#>   method               from      
#>   all.equal.connection parallelly
library(future)
#> Warning: package 'future' was built under R version 4.3.3
library(tidytuesdayR)
#> Warning: package 'tidytuesdayR' was built under R version 4.3.3
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(microbenchmark)


# Declare the parallel backend used by nadir's future-based code.
plan(multicore) # or similar, see https://future.futureverse.org/ 
# We recommend using a multicore setup on a Unix/Linux machine if you
# actually want to see a speed gain from parallelizing your code.
# 
# Note that plan(multicore) does not run in RStudio or on Windows, but
# multisession does. Our experience has been that multisession has not led to
# a speed increase compared to running with a sequential plan.

# We also recommend, if your data is of moderate or large size, setting the
# following option so that large data objects can be exported to the
# parallel workers (future otherwise errors on globals above a size limit).
options(future.globals.maxSize = +Inf)

# Load the Boston housing data from MASS as the example dataset.
data("Boston", package = 'MASS')
data <- Boston 

Speed gains are most obvious in cv_super_learner()

Let’s run a timing test to see whether there’s an improvement in performance when using a multicore plan vs. a sequential plan:

# Sequential (non-parallel) baseline: time 3 repetitions of
# cross-validating a 3-learner super learner on the Boston data.
plan(sequential)

microbenchmark({
  cv_super_learner(
    data,
    formula = medv ~ .,
    learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean))
  }, times = 3)
#> Warning in microbenchmark({: less accurate nanosecond times to avoid potential
#> integer overflows
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> Unit: seconds
#>                                                                                                                     expr
#>  {     cv_super_learner(data, formula = medv ~ ., learners = list(rf = lnr_rf,          lm = lnr_lm, mean = lnr_mean)) }
#>       min      lq     mean   median       uq      max neval
#>  7.574719 7.63037 7.816375 7.686021 7.937203 8.188385     3
# Multicore version: same benchmark, now with 10 parallel workers.
# Compare the timings to the sequential run above.
plan(multicore, workers = 10)

microbenchmark({
  cv_super_learner(
    data, 
    formula = medv ~ .,
    learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean))
}, times = 3)
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> Unit: seconds
#>                                                                                                                     expr
#>  {     cv_super_learner(data, formula = medv ~ ., learners = list(rf = lnr_rf,          lm = lnr_lm, mean = lnr_mean)) }
#>       min      lq     mean   median       uq      max neval
#>  2.150917 2.16683 2.238391 2.182743 2.282128 2.381514     3
# A larger learner library: nine candidate learners, including four glmnet
# learners that are differentiated only by the extra arguments given below.
learners <- list(
  mean = lnr_mean,
  lm = lnr_lm,
  rf = lnr_rf,
  earth = lnr_earth,
  xgboost = lnr_xgboost,
  glmnet0 = lnr_glmnet,
  glmnet1 = lnr_glmnet,
  glmnet2 = lnr_glmnet,
  glmnet3 = lnr_glmnet
)

# Per-learner extra arguments, matched by name to entries in `learners`:
# each glmnet learner gets a different fixed lambda penalty.
extra_args <- list(
  glmnet0 = list(lambda = 0.01),
  glmnet1 = list(lambda = 0.2),
  glmnet2 = list(lambda = 0.4),
  glmnet3 = list(lambda = 0.6)
)
# Sequential baseline for the larger learner library on mtcars.
plan(sequential)

microbenchmark({ 
  cv_out <- cv_super_learner(
    data = mtcars, 
    formulas = mpg ~ .,
    learners = learners,
    extra_learner_args = extra_args)
}, times = 3)
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> Unit: seconds
#>                                                                                                                                  expr
#>  {     cv_out <- cv_super_learner(data = mtcars, formulas = mpg ~          ., learners = learners, extra_learner_args = extra_args) }
#>       min       lq     mean   median       uq      max neval
#>  2.235282 2.252174 2.267698 2.269065 2.283905 2.298745     3
# Multicore version of the same mtcars benchmark; note the output unit
# drops from seconds to milliseconds relative to the sequential run.
plan(multicore)

microbenchmark({ 
  cv_out <- cv_super_learner(
    data = mtcars, 
    formulas = mpg ~ .,
    learners = learners,
    extra_learner_args = extra_args)
}, times = 3)
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
#> Unit: milliseconds
#>                                                                                                                                  expr
#>  {     cv_out <- cv_super_learner(data = mtcars, formulas = mpg ~          ., learners = learners, extra_learner_args = extra_args) }
#>       min       lq     mean   median       uq      max neval
#>  862.5016 891.1052 906.6127 919.7088 928.6682 937.6277     3

But why is it not so obvious for just super_learner()?

Because cv_super_learner() involves an additional layer of cross-validation, the effect of parallelization is more obvious in cv_super_learner() than in super_learner(). However, parallelization does work in super_learner() as well: increasing the number of CV folds increases the relative payoff of using the parallel option, as the example below demonstrates.

# Sequential baseline for super_learner() itself, using 20 CV folds
# (more folds than the default to make the parallel payoff visible).
plan(sequential)

microbenchmark({ 
  sl_out <- nadir::super_learner(
    data = Boston,
    formulas = medv ~ .,
    learners = learners,
    n_folds = 20,
    extra_learner_args = extra_args,
    verbose = TRUE)
}, times = 3)
#> Unit: seconds
#>                                                                                                                                                                              expr
#>  {     sl_out <- nadir::super_learner(data = Boston, formulas = medv ~          ., learners = learners, n_folds = 20, extra_learner_args = extra_args,          verbose = TRUE) }
#>       min       lq     mean   median       uq      max neval
#>  13.20251 13.25625 13.52498 13.30998 13.68621 14.06243     3
# Multicore version of the same super_learner() call: a smaller relative
# speedup than cv_super_learner() showed, but still an improvement.
plan(multicore)

microbenchmark({ 
  sl_out <- nadir::super_learner(
    data = Boston,
    formulas = medv ~ .,
    learners = learners,
    n_folds = 20,
    extra_learner_args = extra_args,
    verbose = TRUE)
}, times = 3)
#> Unit: seconds
#>                                                                                                                                                                              expr
#>  {     sl_out <- nadir::super_learner(data = Boston, formulas = medv ~          ., learners = learners, n_folds = 20, extra_learner_args = extra_args,          verbose = TRUE) }
#>       min       lq     mean   median       uq      max neval
#>  9.268101 9.327153 9.407291 9.386205 9.476886 9.567568     3