
Benchmarking Against `{SuperLearner}` and `{sl3}`
Source: vignettes/articles/Benchmarking.Rmd
Benchmarking.Rmd
It would be nice to compare SuperLearner,
sl3, and nadir on some small, medium, and
large-ish datasets to see how they compare in timing results. For now,
we’ll start with some smaller ones to see how long it takes to get
super_learner()
and cv_super_learner()
(and
their equivalents) to run on these.
For these different sizes, I’d consider:
- iris as the small example (7.3 KB)
- penguins from palmerpenguins as another small-ish example (16.8 KB)
- tornados from tidytuesdayR 2023-05-16 (2.7 MB)
We’re running these on my laptop with 10 cores.
In all of these, we’ll use the same library of learners:
- lnr_mean
- lnr_lm
- lnr_rf
- lnr_earth
- lnr_glmnet
- lnr_xgboost
and their equivalents in the other packages.
iris
data
# Load every package used across the benchmarks; pacman::p_load() installs
# any that are missing before attaching them.
library(pacman)
p_load('nadir', 'sl3', 'SuperLearner', 'microbenchmark', 'tidytuesdayR', 'future')
# setup multicore use
future::plan(future::multicore)
# Raise the cap on globals exported to parallel workers to ~8 GB so large
# fitted-model objects don't trip future's default size limit.
options(future.globals.maxSize = 8000 * 1024^2)
# Regression formula shared by all learners in the iris benchmarks below.
petal_formula <- Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length + Species
nadir::super_learner()
on iris
(7.3 KB)
data
# Time nadir::super_learner() on iris over 10 repetitions.
# NOTE(review): the benchmarked expression is wrapped in list(nadir = { ... }),
# so microbenchmark times one unlabeled list(...) expression rather than a
# case named "nadir" — the penguins chunk below passes nadir = { ... }
# directly, which is the usual microbenchmark idiom; confirm this wrapping
# was intentional (the fit still runs each time, so the timing is valid).
microbenchmark::microbenchmark(
times = 10,
list(
nadir = {
super_learner(
data = iris,
formula = petal_formula,
learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
)
}
)
)
#> Warning in microbenchmark::microbenchmark(times = 10, list(nadir = {: less
#> accurate nanosecond times to avoid potential integer overflows
#> Unit: milliseconds
#> expr
#> list(nadir = { super_learner(data = iris, formula = petal_formula, learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)) })
#> min lq mean median uq max neval
#> 895.6713 909.327 960.9768 938.8454 953.5046 1159.333 10
# Time nadir::cv_super_learner() on iris — only 3 repetitions since
# cross-validating the whole super learner is substantially slower than a
# single fit. Same list(...) wrapping caveat as the chunk above.
microbenchmark::microbenchmark(
times = 3,
list(
nadir_cv = {
cv_super_learner(
data = iris,
formula = petal_formula,
learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
)
}
)
)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#> expr
#> list(nadir_cv = { cv_super_learner(data = iris, formula = petal_formula, learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)) })
#> min lq mean median uq max neval
#> 1.152506 1.235811 1.363149 1.319117 1.46847 1.617823 3
sl3
on iris
(7.3 KB) data
# sl3 task: same outcome and covariates as petal_formula above.
task <- make_sl3_Task(
data = iris,
outcome = "Petal.Width",
covariates = c("Sepal.Length", 'Sepal.Width', 'Petal.Length', 'Species')
)
# sl3 learners mirroring the nadir library (mean, lm, random forest, earth,
# glmnet, xgboost).
lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()
# Stack the learners and ensemble them with an NNLS metalearner under
# 5-fold cross-validation.
stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)
sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
cv_control = list(V = 5))
# Time a full sl3 super learner fit on iris, 10 repetitions.
# (Same list(...) wrapping caveat as the nadir chunks above.)
microbenchmark::microbenchmark(
times = 10,
list(
sl3 = { sl_fit <- sl$train(task = task) }
)
)
#> Unit: seconds
#> expr min lq mean
#> list(sl3 = { sl_fit <- sl$train(task = task) }) 1.205832 1.372037 1.771353
#> median uq max neval
#> 1.468686 1.626016 4.69292 10
# Refit once outside any timer so cv_sl() has a trained Lrnr_sl to evaluate.
sl_fit <- sl$train(task = task)
# Cross-validate the sl3 super learner; timed with system.time() rather than
# microbenchmark since a single run already takes ~30 s (see output below).
system.time({
sl3_cv = { cv_sl(lrnr_sl = sl_fit, eval_fun = loss_squared_error) }
})
#> [1] "Cross-validated risk:"
#> Key: <learner>
#> learner MSE se fold_sd
#> <fctr> <num> <num> <num>
#> 1: Lrnr_mean 0.58353075 0.039178753 0.157569153
#> 2: Lrnr_glm_TRUE 0.03084785 0.004393916 0.016962594
#> 3: Lrnr_randomForest_500_TRUE_5 0.03603531 0.004713230 0.020498229
#> 4: Lrnr_earth_2_3_backward_0_1_0_0 0.04062728 0.006407517 0.014250254
#> 5: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE 0.03091855 0.004434083 0.017222232
#> 6: Lrnr_xgboost_20_1 0.04482640 0.005864772 0.018983947
#> 7: SuperLearner 0.01866054 0.002595938 0.009095165
#> fold_min_MSE fold_max_MSE
#> <num> <num>
#> 1: 0.364277092 0.90685158
#> 2: 0.008607536 0.06067160
#> 3: 0.009862090 0.07852736
#> 4: 0.014617585 0.06192262
#> 5: 0.009037114 0.06210507
#> 6: 0.012923028 0.08229335
#> 7: 0.005255202 0.02973607
#> user system elapsed
#> 26.222 29.245 33.807
SuperLearner
on iris
(7.3 KB) data
# SuperLearner wrapper library mirroring the nadir/sl3 learner sets.
sl_lib = c(
"SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")
# Time mcSuperLearner (the multicore SuperLearner front end) on iris,
# 10 repetitions. iris[, -4] drops the outcome column (Petal.Width) from
# the predictor matrix.
microbenchmark::microbenchmark(
times = 10,
list(SuperLearner = {
mcSuperLearner(Y = iris$Petal.Width,
X = iris[, -4],
SL.library = sl_lib,
cvControl = list(V = 5))
}
)
)
#> Unit: seconds
#> expr
#> list(SuperLearner = { mcSuperLearner(Y = iris$Petal.Width, X = iris[, -4], SL.library = sl_lib, cvControl = list(V = 5)) })
#> min lq mean median uq max neval
#> 1.12987 1.139992 1.168167 1.160422 1.197297 1.210417 10
# Cross-validated SuperLearner on iris, timed once.
# Fix: the predictor matrix was previously iris[, -14]. Negative column
# indices beyond ncol(iris) (5) are silently ignored by data-frame
# subsetting, so all five columns — including the outcome Petal.Width
# (column 4) — were passed as predictors, leaking the outcome into X.
# Use iris[, -4] to drop the outcome, matching the mcSuperLearner call above.
system.time({
CV.SuperLearner(
Y = iris$Petal.Width,
X = iris[, -4],
SL.library = sl_lib,
parallel = 'multicore',
V = 5)
})
#> user system elapsed
#> 3.606 0.218 5.572
Cleanup
penguins
data (16.8 KB)
# Load penguins and keep only rows with no missing values.
penguins <- palmerpenguins::penguins
penguins <- penguins[complete.cases(penguins),]
# Outcome: flipper length; six predictors shared across all three packages.
flipper_length_formula <-
flipper_length_mm ~ species + island + bill_length_mm +
bill_depth_mm + body_mass_g + sex
nadir
on penguins
data (16.8 KB)
# Time nadir::super_learner() on penguins, 10 repetitions. Note this chunk
# passes nadir = { ... } directly to microbenchmark (no list() wrapper),
# unlike the iris chunks above.
microbenchmark(
times = 10,
nadir = {
nadir::super_learner(
data = penguins,
formula = flipper_length_formula,
learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
)
}
)
#> Unit: seconds
#> expr min lq mean median uq max neval
#> nadir 1.203198 1.248632 1.305662 1.264015 1.371984 1.49825 10
# Time nadir::cv_super_learner() on penguins, 3 repetitions (CV of the
# whole ensemble is slow). Same list(...) wrapping caveat as the iris chunks.
microbenchmark::microbenchmark(
times = 3,
list(
nadir_cv = {
cv_super_learner(
data = penguins,
formula = flipper_length_formula,
learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
)
}
)
)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#> expr
#> list(nadir_cv = { cv_super_learner(data = penguins, formula = flipper_length_formula, learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)) })
#> min lq mean median uq max neval
#> 2.290299 2.418485 2.475291 2.546671 2.567787 2.588903 3
sl3
on penguins
data (16.8 KB)
# sl3 task for penguins: same outcome/covariates as flipper_length_formula.
task <- make_sl3_Task(
data = penguins,
outcome = "flipper_length_mm",
covariates = c("species",
"island",
"bill_length_mm",
"bill_depth_mm",
"body_mass_g",
"sex")
)
# Re-create the same learner set as in the iris section.
lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()
# Stack + NNLS metalearner, 5-fold CV, as before.
stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)
sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
cv_control = list(V = 5))
# Time a full sl3 super learner fit on penguins, 10 repetitions.
# (Same list(...) wrapping caveat as the iris chunks.)
microbenchmark::microbenchmark(
times = 10,
list(
sl3 = { sl_fit <- sl$train(task = task) }
)
)
#> Unit: seconds
#> expr min lq mean
#> list(sl3 = { sl_fit <- sl$train(task = task) }) 1.392177 1.562695 1.926519
#> median uq max neval
#> 1.699887 1.876163 3.945073 10
# Refit outside the timer, then cross-validate the sl3 ensemble once;
# system.time() is used since one run takes ~36 s (see output below).
sl_fit <- sl$train(task = task)
system.time({
sl3_cv = { cv_sl(lrnr_sl = sl_fit, eval_fun = loss_squared_error) }
})
#> [1] "Cross-validated risk:"
#> Key: <learner>
#> learner MSE se fold_sd
#> <fctr> <num> <num> <num>
#> 1: Lrnr_mean 197.02427 10.998947 24.859403
#> 2: Lrnr_glm_TRUE 28.04216 2.481515 7.181076
#> 3: Lrnr_randomForest_500_TRUE_5 29.88601 2.424218 6.083925
#> 4: Lrnr_earth_2_3_backward_0_1_0_0 29.00206 2.607459 7.372143
#> 5: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE 28.57830 2.557022 7.692615
#> 6: Lrnr_xgboost_20_1 34.18020 3.034687 5.676727
#> 7: SuperLearner 15.84166 1.354607 2.914526
#> fold_min_MSE fold_max_MSE
#> <num> <num>
#> 1: 154.98742 228.52485
#> 2: 17.97819 38.96773
#> 3: 18.99754 37.14019
#> 4: 17.99485 38.11684
#> 5: 17.93107 39.10167
#> 6: 22.02743 42.13261
#> 7: 10.77440 20.22545
#> user system elapsed
#> 36.023 29.942 35.971
SuperLearner
on penguins
(16.8 KB)
data
# Same SuperLearner wrapper library as the iris section.
sl_lib = c(
"SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")
# Time mcSuperLearner on penguins, 10 repetitions; X selects the same six
# covariates used by nadir and sl3.
microbenchmark::microbenchmark(
times = 10,
list(SuperLearner = {
mcSuperLearner(Y = penguins$flipper_length_mm,
X = penguins[, c("species",
"island",
"bill_length_mm",
"bill_depth_mm",
"body_mass_g",
"sex")],
SL.library = sl_lib,
cvControl = list(V = 5))
}
)
)
#> Unit: seconds
#> expr
#> list(SuperLearner = { mcSuperLearner(Y = penguins$flipper_length_mm, X = penguins[, c("species", "island", "bill_length_mm", "bill_depth_mm", "body_mass_g", "sex")], SL.library = sl_lib, cvControl = list(V = 5)) })
#> min lq mean median uq max neval
#> 2.064258 2.084957 2.159371 2.130866 2.14746 2.428693 10
# NOTE(review): num_cores is assigned but never used in this chunk —
# presumably intended for a parallel or cleanup step elsewhere; confirm.
num_cores = RhpcBLASctl::get_num_cores()
# Cross-validated SuperLearner on penguins, timed once (multicore, 5 folds).
system.time({
CV.SuperLearner(
Y = penguins$flipper_length_mm,
X = penguins[, c("species",
"island",
"bill_length_mm",
"bill_depth_mm",
"body_mass_g",
"sex")],
SL.library = sl_lib,
parallel = 'multicore',
V = 5)
})
#> user system elapsed
#> 8.064 0.317 12.308
Cleanup
tornados
data (2.7 MB)
# Download the 2023-05-16 TidyTuesday release (the tornados dataset).
tuesdata <- tidytuesdayR::tt_load('2023-05-16')
#> ---- Compiling #TidyTuesday Information for 2023-05-16 ----
#> --- There is 1 file available ---
#>
#>
#> ── Downloading files ───────────────────────────────────────────────────────────
#>
#> 1 of 1: "tornados.csv"
tornados <- tuesdata$tornados
# Keep a subset of columns, then complete cases only.
tornados <- tornados[,c('yr', 'mo', 'dy', 'mag', 'st', 'inj', 'fat', 'loss')]
tornados <- tornados[complete.cases(tornados),]
# these states appear only very infrequently, like 2 and 1 times respectively — DC and Alaska
tornados <- tornados |> dplyr::filter(!st %in% c('DC', 'AK'))
# Outcome: injuries. NOTE(review): dy (day) is retained in the data but
# omitted from this formula, while the sl3 and SuperLearner chunks below
# include dy among their covariates — confirm this asymmetry is intentional,
# as it makes the cross-package timing comparison not strictly like-for-like.
tornado_formula <- inj ~ yr + mo + mag + fat + st + loss
nadir
on tornados
data (2.7 MB)
# Single timed nadir fit on tornados (~5 minutes elapsed; see output below),
# so system.time() instead of repeated microbenchmark runs.
# cv_character_and_factors_schema — presumably splits folds so that
# character/factor levels (e.g. st) are handled safely across folds;
# NOTE(review): confirm against the nadir documentation.
system.time({
super_learner(
data = tornados,
formula = tornado_formula,
learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost),
cv_schema = cv_character_and_factors_schema
)
})
#> user system elapsed
#> 517.594 12.720 306.328
sl3
on tornados
data (2.7 MB)
# sl3 task for tornados. Note the covariates here include dy, which the
# nadir tornado_formula above omits.
task <- make_sl3_Task(
data = tornados,
outcome = "inj",
covariates = c("yr", "mo", "dy", "mag",
"st", "fat", "loss")
)
#> Warning in process_data(data, nodes, column_names = column_names, flag = flag, : Character variables found: st;
#> Converting these to factors
# Same learner set, stack, and NNLS metalearner (5-fold CV) as before.
lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()
stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)
sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
cv_control = list(V = 5))
# Single timed sl3 fit on tornados (~30 minutes elapsed; see below).
system.time({
sl_fit <- sl$train(task = task)
})
#> user system elapsed
#> 8654.110 52.755 1832.448
SuperLearner
on tornados
data (2.7
MB)
# Same SuperLearner wrapper library again.
sl_lib = c(
"SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")
# Single timed mcSuperLearner fit on tornados; covariates match the sl3
# task (including dy, unlike nadir's tornado_formula). No timing output is
# shown after this chunk in the rendered article.
system.time({
mcSuperLearner(Y = tornados$inj,
X = tornados[, c("yr", "mo", "dy", "mag",
"st", "fat", "loss")],
SL.library = sl_lib,
cvControl = list(V = 5))
})