
Running `super_learner()` in Parallel
Parallelization is supported in `super_learner()` and `cv_super_learner()`, and is implemented through the future package. If you’d like to use the parallel versions of `super_learner()` or `cv_super_learner()`, it is as simple as calling `library(future)` (as long as you have it installed) and declaring a plan, e.g. `plan(multicore)`.
library(nadir)
library(future)
#> Warning: package 'future' was built under R version 4.3.3
library(tidytuesdayR)
#> Warning: package 'tidytuesdayR' was built under R version 4.3.3
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(microbenchmark)
plan(multicore) # or similar, see https://future.futureverse.org/
# We recommend a multicore setup on a Unix/Linux machine if you actually
# want to see a speed gain from parallelizing your code.
#
# Note that plan(multicore) does not run inside RStudio or on Windows, but
# plan(multisession) does. In our experience, multisession has not led to a
# speed increase compared to running with a sequential plan.
data("Boston", package = 'MASS')
data <- Boston
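If you want one script to behave sensibly on every platform, you can choose the plan at runtime instead of hard-coding it. A minimal sketch using `supportsMulticore()` from the parallelly package (which future builds on); this helper is our suggestion, not something the chunk above requires:
# a sketch: prefer multicore where forking is supported, multisession elsewhere
if (parallelly::supportsMulticore()) {
  plan(multicore)
} else {
  plan(multisession)
}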
Speed gains are most obvious in `cv_super_learner()`
Let’s run a timing test to see whether there’s a performance improvement from using a `multicore` vs. a `sequential` plan:
# sequential version:
plan(sequential)
microbenchmark({
sl_closure <- function(data) {
super_learner(
data = data,
formula = medv ~ .,
learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean)
)
}
cv_super_learner(data, sl_closure, y_variable = 'medv')
}, times = 3)
#> Warning in microbenchmark({: less accurate nanosecond times to avoid potential
#> integer overflows
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#> expr
#> { sl_closure <- function(data) { super_learner(data = data, formula = medv ~ ., learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean)) } cv_super_learner(data, sl_closure, y_variable = "medv") }
#> min lq mean median uq max neval
#> 7.732953 7.7542 7.904345 7.775448 7.99004 8.204633 3
# multicore version:
plan(multicore, workers = 10)
microbenchmark({
sl_closure <- function(data) {
super_learner(
data = data,
formula = medv ~ .,
learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean)
)
}
cv_super_learner(data, sl_closure, y_variable = 'medv')
}, times = 3)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#> expr
#> { sl_closure <- function(data) { super_learner(data = data, formula = medv ~ ., learners = list(rf = lnr_rf, lm = lnr_lm, mean = lnr_mean)) } cv_super_learner(data, sl_closure, y_variable = "medv") }
#> min lq mean median uq max neval
#> 2.016981 2.083049 2.216926 2.149117 2.316898 2.484679 3
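The multicore run is roughly 3.5x faster than the sequential one on this machine. If you’d like to compute the speedup programmatically rather than reading it off the printed tables, you can capture the benchmark objects instead of printing them. A sketch, reusing the `sl_closure` defined above (`bm_seq` and `bm_par` are our own names, not nadir API):
plan(sequential)
bm_seq <- microbenchmark(cv_super_learner(data, sl_closure, y_variable = 'medv'), times = 3)
plan(multicore, workers = 10)
bm_par <- microbenchmark(cv_super_learner(data, sl_closure, y_variable = 'medv'), times = 3)
# microbenchmark records $time in nanoseconds, so the units cancel in the ratio
median(bm_seq$time) / median(bm_par$time)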
# raising future's cap on the size of globals shipped to parallel workers
# may be useful for larger datasets
options(future.globals.maxSize = +Inf)
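Setting `future.globals.maxSize` to `+Inf` turns the size check off entirely. If you’d rather keep a guardrail, you can set a finite cap instead; a sketch (the 1 GB figure is an arbitrary example, not a recommendation):
options(future.globals.maxSize = 1e9)  # cap exported globals at ~1 GB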
learners <- list(
mean = lnr_mean,
lm = lnr_lm,
rf = lnr_rf,
earth = lnr_earth,
xgboost = lnr_xgboost,
glmnet0 = lnr_glmnet,
glmnet1 = lnr_glmnet,
glmnet2 = lnr_glmnet,
glmnet3 = lnr_glmnet
)
extra_args <- list(
glmnet0 = list(lambda = 0.01),
glmnet1 = list(lambda = 0.2),
glmnet2 = list(lambda = 0.4),
glmnet3 = list(lambda = 0.6)
)
plan(sequential)
microbenchmark({
sl_closure <- function(data) {
nadir::super_learner(
data = data,
formulas = mpg ~ .,
learners = learners,
extra_learner_args = extra_args)
}
cv_out <- cv_super_learner(
data = mtcars,
sl_closure = sl_closure,
y_variable = 'mpg')
}, times = 3)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#> expr
#> { sl_closure <- function(data) { nadir::super_learner(data = data, formulas = mpg ~ ., learners = learners, extra_learner_args = extra_args) } cv_out <- cv_super_learner(data = mtcars, sl_closure = sl_closure, y_variable = "mpg") }
#> min lq mean median uq max neval
#> 2.402918 2.433435 2.474872 2.463952 2.510849 2.557746 3
plan(multicore)
microbenchmark({
sl_closure <- function(data) {
nadir::super_learner(
data = data,
formulas = mpg ~ .,
learners = learners,
extra_learner_args = extra_args)
}
cv_out <- cv_super_learner(
data = mtcars,
sl_closure = sl_closure,
y_variable = 'mpg')
}, times = 3)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Warning in predict.lm(model, newdata = newdata, type = "response"): prediction
#> from rank-deficient fit; attr(*, "non-estim") has doubtful cases
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: milliseconds
#> expr
#> { sl_closure <- function(data) { nadir::super_learner(data = data, formulas = mpg ~ ., learners = learners, extra_learner_args = extra_args) } cv_out <- cv_super_learner(data = mtcars, sl_closure = sl_closure, y_variable = "mpg") }
#> min lq mean median uq max neval
#> 928.6727 939.2432 964.423 949.8138 982.2981 1014.782 3
But why is it not so obvious for just `super_learner()`?
Because `cv_super_learner()` involves an additional layer of cross-validation, the effect of parallelization is more pronounced in `cv_super_learner()` than in `super_learner()`. Parallelization does work in `super_learner()` as well, though: raising the number of CV folds increases the amount of work that can be distributed across workers, and with it the relative payoff of the parallel option.
plan(sequential)
microbenchmark({
sl_out <- nadir::super_learner(
data = Boston,
formulas = medv ~ .,
learners = learners,
n_folds = 20,
extra_learner_args = extra_args,
verbose = TRUE)
}, times = 3)
#> Unit: seconds
#> expr
#> { sl_out <- nadir::super_learner(data = Boston, formulas = medv ~ ., learners = learners, n_folds = 20, extra_learner_args = extra_args, verbose = TRUE) }
#> min lq mean median uq max neval
#> 12.70487 12.77468 12.84642 12.84449 12.91719 12.98988 3
plan(multicore)
microbenchmark({
sl_out <- nadir::super_learner(
data = Boston,
formulas = medv ~ .,
learners = learners,
n_folds = 20,
extra_learner_args = extra_args,
verbose = TRUE)
}, times = 3)
#> Unit: seconds
#> expr
#> { sl_out <- nadir::super_learner(data = Boston, formulas = medv ~ ., learners = learners, n_folds = 20, extra_learner_args = extra_args, verbose = TRUE) }
#> min lq mean median uq max neval
#> 8.557656 8.615234 8.650471 8.672812 8.696878 8.720944 3
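On this machine, the 20-fold run drops from about 12.8 to 8.7 seconds under multicore, roughly a 1.5x gain even on a modest dataset. Once you’re finished with parallel work, it’s good practice to return to a sequential plan so later code doesn’t inherit the parallel backend:
plan(sequential)  # restore sequential evaluation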