Skip to contents

It would be nice to compare SuperLearner, sl3, and nadir on some small, medium, and large-ish datasets to see how they compare in timing results. For now, we’ll start with some smaller ones to see how long super_learner() and cv_super_learner() (and their equivalents in the other packages) take to run.

For these different sizes, I’d consider

  • iris as the small example (7.3 KB)
  • penguins from palmerpenguins as another small-ish example (16.8 KB)
  • tornados from tidytuesdayR 2023-05-16 (2.7 MB)

We’re running these on my laptop with 10 cores.

In all of these, we’ll use the same library of learners:

  • lnr_mean
  • lnr_lm
  • lnr_rf
  • lnr_earth
  • lnr_glmnet
  • lnr_xgboost

and their equivalents in the other packages.

iris data

# Load all benchmarking dependencies in one call; p_load() installs any
# packages that are missing before attaching them.
library(pacman)
p_load('nadir', 'sl3', 'SuperLearner', 'microbenchmark', 'tidytuesdayR', 'future')

# setup multicore use 
# NOTE(review): future::multicore is not supported on Windows or inside
# RStudio, where it silently falls back to sequential processing —
# confirm these benchmarks actually ran in parallel
# (future::multisession is the more portable choice).
future::plan(future::multicore)
# Allow up to ~8 GB of globals to be exported to worker processes.
options(future.globals.maxSize = 8000 * 1024^2)
# Regression task used throughout the iris benchmarks: predict petal
# width from the remaining measurements plus species.
petal_formula <- Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length + Species

nadir::super_learner() on iris (7.3 KB) data

# Time a single (non-CV) nadir super learner fit on iris, repeated 10x.
# NOTE(review): wrapping the expression in list(nadir = {...}) makes
# microbenchmark time one anonymous list-construction expression — hence
# the unwieldy expr label in the output below. Passing
# `nadir = {...}` directly to microbenchmark(), as in the penguins
# section later, gives a clean label.
microbenchmark::microbenchmark(
  times = 10,
  list(
    nadir = {
      super_learner(
        data = iris,
        formula = petal_formula,
        learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
      )
    }
  )
)
#> Warning in microbenchmark::microbenchmark(times = 10, list(nadir = {: less
#> accurate nanosecond times to avoid potential integer overflows
#> Unit: milliseconds
#>                                                                                                                                                               expr
#>  list(nadir = {     super_learner(data = iris, formula = petal_formula, learners = list(lnr_mean,          lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)) })
#>       min      lq     mean   median       uq      max neval
#>  895.6713 909.327 960.9768 938.8454 953.5046 1159.333    10

# Time nadir's cross-validated super learner. Only 3 repetitions, since
# CV wraps the whole fit in an extra outer CV loop and is correspondingly
# slower. (Same list(...) labelling quirk as the previous benchmark.)
microbenchmark::microbenchmark(
  times = 3,
  list(
    nadir_cv = { 
      cv_super_learner(
        data = iris,
        formula = petal_formula,
        learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
      )
    }
  )
)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#>                                                                                                                                                                     expr
#>  list(nadir_cv = {     cv_super_learner(data = iris, formula = petal_formula, learners = list(lnr_mean,          lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)) })
#>       min       lq     mean   median      uq      max neval
#>  1.152506 1.235811 1.363149 1.319117 1.46847 1.617823     3

sl3 on iris (7.3 KB) data

# Build the equivalent sl3 task: same outcome and covariates as
# petal_formula above.
task <- make_sl3_Task(
  data = iris,
  outcome = "Petal.Width",
  covariates = c("Sepal.Length", 'Sepal.Width', 'Petal.Length', 'Species')
)

# sl3 learners mirroring the nadir library: mean, OLS (glm), random
# forest, MARS (earth), elastic net (glmnet), and gradient boosting
# (xgboost).
lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()

# Stack the learners and wrap them in a 5-fold cross-validated super
# learner with a non-negative-least-squares metalearner.
stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)

sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
                  cv_control = list(V = 5))

# Time 10 full super-learner fits.
microbenchmark::microbenchmark(
  times = 10,
  list(
    sl3 = { sl_fit <- sl$train(task = task) }
  )
)
#> Unit: seconds
#>                                                 expr      min       lq     mean
#>  list(sl3 = {     sl_fit <- sl$train(task = task) }) 1.205832 1.372037 1.771353
#>    median       uq     max neval
#>  1.468686 1.626016 4.69292    10

# Fit once outside the benchmark so cv_sl() has a fitted object to
# cross-validate.
sl_fit <- sl$train(task = task)

# Time sl3's external cross-validation of the fitted super learner using
# squared-error loss (CV-MSE — comparable to nadir's default metric
# reported above).
system.time({
  sl3_cv = { cv_sl(lrnr_sl = sl_fit, eval_fun = loss_squared_error) }
})
#> [1] "Cross-validated risk:"
#> Key: <learner>
#>                                    learner        MSE          se     fold_sd
#>                                     <fctr>      <num>       <num>       <num>
#> 1:                               Lrnr_mean 0.58353075 0.039178753 0.157569153
#> 2:                           Lrnr_glm_TRUE 0.03084785 0.004393916 0.016962594
#> 3:            Lrnr_randomForest_500_TRUE_5 0.03603531 0.004713230 0.020498229
#> 4:         Lrnr_earth_2_3_backward_0_1_0_0 0.04062728 0.006407517 0.014250254
#> 5: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE 0.03091855 0.004434083 0.017222232
#> 6:                       Lrnr_xgboost_20_1 0.04482640 0.005864772 0.018983947
#> 7:                            SuperLearner 0.01866054 0.002595938 0.009095165
#>    fold_min_MSE fold_max_MSE
#>           <num>        <num>
#> 1:  0.364277092   0.90685158
#> 2:  0.008607536   0.06067160
#> 3:  0.009862090   0.07852736
#> 4:  0.014617585   0.06192262
#> 5:  0.009037114   0.06210507
#> 6:  0.012923028   0.08229335
#> 7:  0.005255202   0.02973607
#>    user  system elapsed 
#>  26.222  29.245  33.807

SuperLearner on iris (7.3 KB) data

# SuperLearner-package wrappers corresponding to the same six learners.
sl_lib = c( 
  "SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")


# Time 10 fits of mcSuperLearner (the multicore variant).
# `iris[, -4]` drops column 4 (Petal.Width, the outcome) so it is not
# included among the predictors.
# NOTE(review): same list(SuperLearner = {...}) labelling quirk as the
# nadir benchmarks above.
microbenchmark::microbenchmark(
  times = 10, 
  list(SuperLearner = {
    mcSuperLearner(Y = iris$Petal.Width,
                 X = iris[, -4],
                 SL.library = sl_lib,
                 cvControl = list(V = 5))
  }
  )
)
#> Unit: seconds
#>                                                                                                                                      expr
#>  list(SuperLearner = {     mcSuperLearner(Y = iris$Petal.Width, X = iris[, -4], SL.library = sl_lib,          cvControl = list(V = 5)) })
#>      min       lq     mean   median       uq      max neval
#>  1.12987 1.139992 1.168167 1.160422 1.197297 1.210417    10

# Time one run of 5-fold cross-validated SuperLearner on iris.
# BUG FIX: X was previously `iris[, -14]`. iris has only 5 columns, and
# an out-of-range negative column index is silently ignored for
# data-frame column selection, so all columns — including the outcome
# Petal.Width — appear to have been left in X. Drop column 4
# (Petal.Width) instead, matching the mcSuperLearner() call above.
system.time({
  CV.SuperLearner(
    Y = iris$Petal.Width, 
    X = iris[, -4], 
    SL.library = sl_lib,
    parallel = 'multicore',
    V = 5)
})
#> NOTE(review): the timings below were recorded with the original
#> `iris[, -14]` call and should be regenerated.
#>    user  system elapsed 
#>   3.606   0.218   5.572

Cleanup

# Clear user-created objects before the penguins benchmarks; package
# exports (e.g. the lnr_* learners from nadir) are unaffected.
rm(list = ls())

penguins data (16.8 KB)

# Load the penguins data and keep only complete rows, dropping
# observations with missing values.
penguins <- palmerpenguins::penguins
penguins <- penguins[complete.cases(penguins),]

# Regression task for the penguins benchmarks: predict flipper length
# from species, island, bill and body measurements, and sex.
flipper_length_formula <-
  flipper_length_mm ~ species + island + bill_length_mm +
    bill_depth_mm + body_mass_g + sex

nadir on penguins data (16.8 KB)

# Time 10 nadir fits on penguins. (Unlike the iris benchmarks, the named
# expression is passed directly, so microbenchmark labels it "nadir".)
microbenchmark(
  times = 10,
  nadir = {
    nadir::super_learner(
      data = penguins,
      formula = flipper_length_formula,
      learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
    )
  }
)
#> Unit: seconds
#>   expr      min       lq     mean   median       uq     max neval
#>  nadir 1.203198 1.248632 1.305662 1.264015 1.371984 1.49825    10

# Time nadir's cross-validated super learner on penguins (3 repetitions;
# same list(...) labelling quirk as in the iris section).
microbenchmark::microbenchmark(
  times = 3,
  list(
    nadir_cv = { 
      cv_super_learner(
        data = penguins,
        formula = flipper_length_formula,
        learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
      )
    }
  )
)
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> The default is to report CV-MSE if no other loss_metric is specified.
#> Unit: seconds
#>                                                                                                                                                                                               expr
#>  list(nadir_cv = {     cv_super_learner(data = penguins, formula = flipper_length_formula,          learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth,              lnr_glmnet, lnr_xgboost)) })
#>       min       lq     mean   median       uq      max neval
#>  2.290299 2.418485 2.475291 2.546671 2.567787 2.588903     3

sl3 on penguins data (16.8 KB)

# Build the equivalent sl3 task: same outcome and covariates as
# flipper_length_formula above.
task <- make_sl3_Task(
  data = penguins,
  outcome = "flipper_length_mm",
  covariates = c("species",
                 "island",
                 "bill_length_mm",
                 "bill_depth_mm",
                 "body_mass_g",
                 "sex")
)

# sl3 learners mirroring the nadir library (re-created after the
# workspace cleanup above).
lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()

# Stack the learners and wrap them in a 5-fold cross-validated super
# learner with an NNLS metalearner.
stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)

sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
                  cv_control = list(V = 5))

# Time 10 full super-learner fits.
microbenchmark::microbenchmark(
  times = 10,
  list(
    sl3 = { sl_fit <- sl$train(task = task) }
  )
)
#> Unit: seconds
#>                                                 expr      min       lq     mean
#>  list(sl3 = {     sl_fit <- sl$train(task = task) }) 1.392177 1.562695 1.926519
#>    median       uq      max neval
#>  1.699887 1.876163 3.945073    10

# Fit once outside the benchmark so cv_sl() has a fitted object.
sl_fit <- sl$train(task = task)

# Time sl3's external cross-validation of the fitted super learner
# (squared-error loss, i.e. CV-MSE).
system.time({
  sl3_cv = { cv_sl(lrnr_sl = sl_fit, eval_fun = loss_squared_error) }
})
#> [1] "Cross-validated risk:"
#> Key: <learner>
#>                                    learner       MSE        se   fold_sd
#>                                     <fctr>     <num>     <num>     <num>
#> 1:                               Lrnr_mean 197.02427 10.998947 24.859403
#> 2:                           Lrnr_glm_TRUE  28.04216  2.481515  7.181076
#> 3:            Lrnr_randomForest_500_TRUE_5  29.88601  2.424218  6.083925
#> 4:         Lrnr_earth_2_3_backward_0_1_0_0  29.00206  2.607459  7.372143
#> 5: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE  28.57830  2.557022  7.692615
#> 6:                       Lrnr_xgboost_20_1  34.18020  3.034687  5.676727
#> 7:                            SuperLearner  15.84166  1.354607  2.914526
#>    fold_min_MSE fold_max_MSE
#>           <num>        <num>
#> 1:    154.98742    228.52485
#> 2:     17.97819     38.96773
#> 3:     18.99754     37.14019
#> 4:     17.99485     38.11684
#> 5:     17.93107     39.10167
#> 6:     22.02743     42.13261
#> 7:     10.77440     20.22545
#>    user  system elapsed 
#>  36.023  29.942  35.971

SuperLearner on penguins (16.8 KB) data

# SuperLearner-package wrappers for the same six learners.
sl_lib = c( 
  "SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")


# Time 10 fits of mcSuperLearner on penguins, selecting the same six
# predictor columns used in the nadir and sl3 runs above.
microbenchmark::microbenchmark(
  times = 10, 
  list(SuperLearner = {
    mcSuperLearner(Y = penguins$flipper_length_mm,
                   X = penguins[, c("species",
                                    "island",
                                    "bill_length_mm",
                                    "bill_depth_mm",
                                    "body_mass_g",
                                    "sex")], 
                 SL.library = sl_lib,
                 cvControl = list(V = 5))
  }
  )
)
#> Unit: seconds
#>                                                                                                                                                                                                                                              expr
#>  list(SuperLearner = {     mcSuperLearner(Y = penguins$flipper_length_mm, X = penguins[,          c("species", "island", "bill_length_mm", "bill_depth_mm",              "body_mass_g", "sex")], SL.library = sl_lib, cvControl = list(V = 5)) })
#>       min       lq     mean   median      uq      max neval
#>  2.064258 2.084957 2.159371 2.130866 2.14746 2.428693    10

# Time one run of 5-fold cross-validated SuperLearner on penguins.
# (Removed the dead `num_cores = RhpcBLASctl::get_num_cores()`
# assignment that preceded this call: its value was never used, and it
# would have been discarded by the rm(list=ls()) cleanup below anyway.)
system.time({
  CV.SuperLearner(
    Y = penguins$flipper_length_mm,
    X = penguins[, c("species",
                     "island",
                     "bill_length_mm",
                     "bill_depth_mm",
                     "body_mass_g",
                     "sex")], 
    SL.library = sl_lib,
    parallel = 'multicore',
    V = 5)
})
#>    user  system elapsed 
#>   8.064   0.317  12.308

Cleanup

# Clear user-created objects before the tornados benchmarks.
rm(list=ls())

tornados data (2.7 MB)

# Download the 2023-05-16 TidyTuesday release (tornados.csv).
tuesdata <- tidytuesdayR::tt_load('2023-05-16')
#> ---- Compiling #TidyTuesday Information for 2023-05-16 ----
#> --- There is 1 file available ---
#> 
#> 
#> ── Downloading files ───────────────────────────────────────────────────────────
#> 
#>   1 of 1: "tornados.csv"
tornados <- tuesdata$tornados
# Keep a small set of predictor/outcome columns and drop rows with
# missing values.
tornados <- tornados[,c('yr', 'mo', 'dy', 'mag', 'st', 'inj', 'fat', 'loss')]
tornados <- tornados[complete.cases(tornados),]

# Drop states that appear only very infrequently (DC: 2 rows, AK: 1 row);
# such rare factor levels can be entirely absent from a training fold,
# which breaks learners that build a design matrix from factor levels.
tornados <- tornados |> dplyr::filter(!st %in% c('DC', 'AK'))

# Regression task: predict the number of injuries.
# NOTE(review): this formula omits `dy` (day of month), but the sl3 and
# SuperLearner runs below include `dy` among their covariates, so the
# three packages are not fitting identical tasks — confirm which
# covariate set is intended and align them.
tornado_formula <- inj ~ yr + mo + mag + fat + st + loss

nadir on tornados data (2.7 MB)

# Single timed nadir super_learner() run on tornados (too slow to repeat
# under microbenchmark).
# NOTE(review): cv_character_and_factors_schema presumably constructs CV
# splits so that every character/factor level (e.g. `st`) is represented
# in each training fold — confirm against the nadir documentation.
system.time({
  super_learner(
    data = tornados,
    formula = tornado_formula,
    learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost),
    cv_schema = cv_character_and_factors_schema
  )
})
#>    user  system elapsed 
#> 517.594  12.720 306.328

sl3 on tornados data (2.7 MB)

# Build the sl3 task for tornados.
# NOTE(review): the covariates here include `dy`, which tornado_formula
# above omits — see the note in the data-prep section.
task <- make_sl3_Task(
  data = tornados,
  outcome = "inj",
  covariates = c("yr", "mo", "dy", "mag", 
                 "st", "fat", "loss")
)
#> Warning in process_data(data, nodes, column_names = column_names, flag = flag, : Character variables found: st;
#> Converting these to factors

# sl3 learners mirroring the nadir library (re-created after the
# workspace cleanup above).
lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()

# Stack the learners and wrap them in a 5-fold cross-validated super
# learner with an NNLS metalearner.
stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)

sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
                  cv_control = list(V = 5))

# Single timed fit (too slow to repeat under microbenchmark).
system.time({
  sl_fit <- sl$train(task = task)
})
#>     user   system  elapsed 
#> 8654.110   52.755 1832.448

SuperLearner on tornados data (2.7 MB)

# SuperLearner-package wrappers for the same six learners.
sl_lib = c( 
  "SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")

# Single timed mcSuperLearner fit on tornados.
# NOTE(review): X includes `dy`, matching sl3 but not the nadir formula
# — see the note in the data-prep section.
system.time({
    mcSuperLearner(Y = tornados$inj,
                   X = tornados[, c("yr", "mo", "dy", "mag", 
                 "st", "fat", "loss")], 
                 SL.library = sl_lib,
                 cvControl = list(V = 5))
  })