It would be nice to compare SuperLearner, sl3, and nadir on some small, medium, and large-ish datasets to see how their timing results compare. For now, we'll start with some smaller ones to see how long super_learner() and cv_super_learner() (and their equivalents) take to run on each.

For these different sizes, I’d consider

  • iris as the small example (7.3 KB)
  • penguins from palmerpenguins as another small-ish example (16.8 KB)
  • tornados from tidytuesdayR 2023-05-16 (2.8 MB)

We’re running these on my laptop with 10 cores.
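To check how many cores are available on your own machine before reproducing these, something like the following should work (availableCores() is exported by the future package, which we load below):

future::availableCores()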

In all of these, we’ll use the same library of learners:

  • lnr_mean
  • lnr_lm
  • lnr_rf
  • lnr_earth
  • lnr_glmnet
  • lnr_xgboost

and their equivalents in the other packages: Lrnr_mean, Lrnr_glm, Lrnr_randomForest, Lrnr_earth, Lrnr_glmnet, and Lrnr_xgboost in sl3; SL.mean, SL.lm, SL.randomForest, SL.earth, SL.glmnet, and SL.xgboost in SuperLearner.

iris data

library(pacman)
p_load('nadir', 'sl3', 'SuperLearner', 'microbenchmark', 'tidytuesdayR', 'future', 'ggplot2', 'forcats', 'dplyr')

# setup multicore use 
future::plan(future::multicore)
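# note: multicore futures are unsupported on Windows (and disabled by default
# in RStudio); future::plan(future::multisession) is a portable alternative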

petal_formula <- Petal.Width ~ Sepal.Length + Sepal.Width + Petal.Length + Species

nadir::super_learner() on iris (7.3 KB) data

iris_nadir_sl_benchmark <- microbenchmark::microbenchmark(
  times = 10,
  list(
    nadir = {
      super_learner(
        data = iris,
        formula = petal_formula,
        learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
      )
    }
  )
)
#> Warning in microbenchmark::microbenchmark(times = 10, list(nadir = {: less
#> accurate nanosecond times to avoid potential integer overflows
iris_nadir_sl_benchmark
#> Unit: milliseconds
#>                                                                                                                                                               expr
#>  list(nadir = {     super_learner(data = iris, formula = petal_formula, learners = list(lnr_mean,          lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)) })
#>       min       lq     mean   median      uq      max neval
#>  889.8272 902.7249 971.1413 945.5787 1007.02 1192.484    10
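Note that microbenchmark reports these timings in milliseconds while the other packages' results below appear in seconds; to put everything on a common unit, microbenchmark's print method accepts a unit argument:

print(iris_nadir_sl_benchmark, unit = "s")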

iris_nadir_cv_sl_benchmark <- system.time({
  cv_super_learner(
    data = iris,
    formula = petal_formula,
    learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
  )
})
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
iris_nadir_cv_sl_benchmark
#>    user  system elapsed 
#>   3.891   0.975   1.419

sl3 on iris (7.3 KB) data

task <- make_sl3_Task(
  data = iris,
  outcome = "Petal.Width",
  covariates = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Species")
)

lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()

stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)

sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
                  cv_control = list(V = 5))

iris_sl3_sl_benchmark <- microbenchmark::microbenchmark(
  times = 10,
  list(
    sl3 = { sl_fit <- sl$train(task = task) }
  )
)
iris_sl3_sl_benchmark
#> Unit: seconds
#>                                                 expr      min       lq     mean
#>  list(sl3 = {     sl_fit <- sl$train(task = task) }) 1.080533 1.268148 1.462321
#>    median       uq      max neval
#>  1.433153 1.487204 1.954279    10

sl_fit <- sl$train(task = task)
iris_sl3_cv_sl_benchmark <- system.time({
  sl3_cv <- cv_sl(lrnr_sl = sl_fit, eval_fun = loss_squared_error)
})
#> [1] "Cross-validated risk:"
#> Key: <learner>
#>                                    learner        MSE          se     fold_sd
#>                                     <fctr>      <num>       <num>       <num>
#> 1:                               Lrnr_mean 0.58845981 0.039084452 0.095707715
#> 2:                           Lrnr_glm_TRUE 0.02902919 0.004196715 0.010396127
#> 3:            Lrnr_randomForest_500_TRUE_5 0.03148265 0.004415253 0.009794200
#> 4:         Lrnr_earth_2_3_backward_0_1_0_0 0.04513001 0.010649813 0.038136107
#> 5: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE 0.02900513 0.004208611 0.010167544
#> 6:                       Lrnr_xgboost_20_1 0.03800387 0.005239035 0.013841992
#> 7:                            SuperLearner 0.01707933 0.002604897 0.005322035
#>    fold_min_MSE fold_max_MSE
#>           <num>        <num>
#> 1:  0.430667215   0.73268861
#> 2:  0.013528873   0.04413943
#> 3:  0.021548138   0.04761416
#> 4:  0.016095248   0.14942276
#> 5:  0.013102653   0.04301907
#> 6:  0.017879247   0.05820837
#> 7:  0.007382378   0.02344127
iris_sl3_cv_sl_benchmark
#>    user  system elapsed 
#>  26.072  36.402  41.021

SuperLearner on iris (7.3 KB) data

sl_lib <- c(
  "SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")

iris_SuperLearner_sl_benchmark <- microbenchmark::microbenchmark(
  times = 10, 
  list(SuperLearner = {
    mcSuperLearner(Y = iris$Petal.Width,
                 X = iris[, -4],
                 SL.library = sl_lib,
                 cvControl = list(V = 5))
  }
  )
)
iris_SuperLearner_sl_benchmark
#> Unit: seconds
#>                                                                                                                                      expr
#>  list(SuperLearner = {     mcSuperLearner(Y = iris$Petal.Width, X = iris[, -4], SL.library = sl_lib,          cvControl = list(V = 5)) })
#>       min       lq     mean   median       uq      max neval
#>  1.203083 1.221367 1.231049 1.230699 1.238238 1.265271    10

iris_SuperLearner_cv_sl_benchmark <- system.time({
  CV.SuperLearner(
    Y = iris$Petal.Width, 
    X = iris[, -4], # drop the outcome column, Petal.Width
    SL.library = sl_lib,
    parallel = 'multicore',
    V = 5)
})
iris_SuperLearner_cv_sl_benchmark
#>    user  system elapsed 
#>   3.776   0.241   5.852

Visual comparison

sl_timing <- data.frame(
  package = c('nadir', 'sl3', 'SuperLearner'),
  time = c(mean(iris_nadir_sl_benchmark$time) / 1e9, # convert nanoseconds to seconds
           mean(iris_sl3_sl_benchmark$time) / 1e9,
           mean(iris_SuperLearner_sl_benchmark$time) / 1e9)
)

sl_timing |> 
  mutate(package = forcats::fct_reorder(package, time)) |> 
  ggplot(aes(x = time, y = package, fill = package)) + 
  geom_col() + 
  geom_point() + 
  scale_fill_brewer(palette = 'Set2') + 
  theme_bw() + 
  ggtitle("Time taken to super learn on the iris dataset across 6 candidate learners") + 
  labs(caption = "Task: Petal.Width ~ . on the iris dataset, 150 rows, 5 features") +  
  theme(plot.caption.position = 'plot')




cv_timing <- data.frame(
  package = c('nadir', 'sl3', 'SuperLearner'),
  time = c(
    iris_nadir_cv_sl_benchmark[[3]], # elapsed time
    iris_sl3_cv_sl_benchmark[[3]],
    iris_SuperLearner_cv_sl_benchmark[[3]])
)
cv_timing |>
  dplyr::mutate(package = forcats::fct_reorder(package, time)) |> 
  ggplot(aes(x = time, y = package, fill = package)) + 
  geom_col() + 
  geom_point() + 
  scale_fill_brewer(palette = 'Set2') + 
  theme_bw() + 
  ggtitle("Elapsed time taken to run cross-validated super learner on the iris dataset across 6 candidate learners",
          "Elapsed time (as opposed to user or system time) shows the amount of time experienced by the user.") + 
  labs(caption = "Task: Petal.Width ~ . on the iris dataset, 150 rows, 5 features") +  
  theme(plot.caption.position = 'plot')

penguins data (16.8 KB)

penguins <- palmerpenguins::penguins
penguins <- penguins[complete.cases(penguins),]

flipper_length_formula <-
  flipper_length_mm ~ species + island + bill_length_mm +
    bill_depth_mm + body_mass_g + sex

nadir on penguins data (16.8 KB)

penguins_nadir_sl_benchmark <- microbenchmark(
  times = 10,
  nadir = {
    nadir::super_learner(
      data = penguins,
      formula = flipper_length_formula,
      learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
    )
  }
)
penguins_nadir_sl_benchmark
#> Unit: seconds
#>   expr      min       lq     mean   median       uq      max neval
#>  nadir 1.157808 1.192269 1.290731 1.218286 1.309779 1.728092    10

penguins_nadir_cv_sl_benchmark <- system.time({
  cv_super_learner(
    data = penguins,
    formula = flipper_length_formula,
    learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost)
  )
})
#> The loss_metric is being inferred based on the outcome_type=continuous -> using CV-MSE
penguins_nadir_cv_sl_benchmark
#>    user  system elapsed 
#>   7.037   1.595   2.599

sl3 on penguins data (16.8 KB)

task <- make_sl3_Task(
  data = penguins,
  outcome = "flipper_length_mm",
  covariates = c("species",
                 "island",
                 "bill_length_mm",
                 "bill_depth_mm",
                 "body_mass_g",
                 "sex")
)

lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()

stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)

sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
                  cv_control = list(V = 5))

penguins_sl3_sl_benchmark <- microbenchmark::microbenchmark(
  times = 10,
  list(
    sl3 = { sl_fit <- sl$train(task = task) }
  )
)
penguins_sl3_sl_benchmark
#> Unit: seconds
#>                                                 expr      min       lq     mean
#>  list(sl3 = {     sl_fit <- sl$train(task = task) }) 1.314009 1.465803 2.642716
#>    median       uq      max neval
#>  1.640787 1.831675 9.482718    10


sl_fit <- sl$train(task = task)
penguins_sl3_cv_sl_benchmark <- system.time({
  sl3_cv <- cv_sl(lrnr_sl = sl_fit, eval_fun = loss_squared_error)
})
#> [1] "Cross-validated risk:"
#> Key: <learner>
#>                                    learner       MSE        se   fold_sd
#>                                     <fctr>     <num>     <num>     <num>
#> 1:                               Lrnr_mean 196.47868 10.979336 33.704733
#> 2:                           Lrnr_glm_TRUE  28.08063  2.496742  6.675033
#> 3:            Lrnr_randomForest_500_TRUE_5  28.82479  2.349832  5.264787
#> 4:         Lrnr_earth_2_3_backward_0_1_0_0  29.00131  2.650027  5.636709
#> 5: Lrnr_glmnet_NULL_deviance_10_1_100_TRUE  28.39991  2.504920  6.139148
#> 6:                       Lrnr_xgboost_20_1  32.54006  2.627296  6.469929
#> 7:                            SuperLearner  19.35342  1.825590  6.853104
#>    fold_min_MSE fold_max_MSE
#>           <num>        <num>
#> 1:    139.88804    238.52211
#> 2:     16.23283     36.93534
#> 3:     17.60123     36.40382
#> 4:     21.20477     37.72809
#> 5:     19.34574     37.51000
#> 6:     21.36498     42.66572
#> 7:     11.12754     35.96351
penguins_sl3_cv_sl_benchmark
#>    user  system elapsed 
#>  38.712  48.137  44.837

SuperLearner on penguins (16.8 KB) data

sl_lib <- c(
  "SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")

penguins_SuperLearner_sl_benchmark <- microbenchmark::microbenchmark(
  times = 10, 
  list(SuperLearner = {
    mcSuperLearner(Y = penguins$flipper_length_mm,
                   X = penguins[, c("species",
                                    "island",
                                    "bill_length_mm",
                                    "bill_depth_mm",
                                    "body_mass_g",
                                    "sex")], 
                 SL.library = sl_lib,
                 cvControl = list(V = 5))
  }
  )
)
penguins_SuperLearner_sl_benchmark
#> Unit: seconds
#>                                                                                                                                                                                                                                              expr
#>  list(SuperLearner = {     mcSuperLearner(Y = penguins$flipper_length_mm, X = penguins[,          c("species", "island", "bill_length_mm", "bill_depth_mm",              "body_mass_g", "sex")], SL.library = sl_lib, cvControl = list(V = 5)) })
#>       min       lq     mean   median       uq      max neval
#>  2.112144 2.154183 2.174983 2.175506 2.202207 2.224516    10

num_cores <- RhpcBLASctl::get_num_cores()

penguins_SuperLearner_cv_sl_benchmark <- system.time({
  CV.SuperLearner(
    Y = penguins$flipper_length_mm,
    X = penguins[, c("species",
                     "island",
                     "bill_length_mm",
                     "bill_depth_mm",
                     "body_mass_g",
                     "sex")], 
    SL.library = sl_lib,
    parallel = 'multicore',
    V = 5)
})
penguins_SuperLearner_cv_sl_benchmark
#>    user  system elapsed 
#>   8.211   0.295  12.642

Visual comparison

sl_timing <- data.frame(
  package = c('nadir', 'sl3', 'SuperLearner'),
  time = c(mean(penguins_nadir_sl_benchmark$time) / 1e9, # convert nanoseconds to seconds
           mean(penguins_sl3_sl_benchmark$time) / 1e9,
           mean(penguins_SuperLearner_sl_benchmark$time) / 1e9)
)

ggplot(sl_timing, aes(x = time, y = package, fill = package)) + 
  geom_col() + 
  geom_point() + 
  scale_fill_brewer(palette = 'Set2') + 
  theme_bw() + 
  ggtitle("Time taken to super learn on the penguins dataset across 6 candidate learners") + 
  labs(caption = "Task: flipper_length_mm ~ . on the penguins dataset, 333 rows, 8 features") + 
  theme(plot.caption.position = 'plot')


cv_timing <- data.frame(
  package = c('nadir', 'sl3', 'SuperLearner'),
  time = c(
    penguins_nadir_cv_sl_benchmark[[3]], # elapsed time
    penguins_sl3_cv_sl_benchmark[[3]],
    penguins_SuperLearner_cv_sl_benchmark[[3]])
)
cv_timing |>
  dplyr::mutate(package = forcats::fct_reorder(package, time)) |> 
  ggplot(aes(x = time, y = package, fill = package)) + 
  geom_col() + 
  geom_point() + 
  scale_fill_brewer(palette = 'Set2') + 
  theme_bw() + 
  ggtitle("Elapsed time taken to run cross-validated super learner on the penguins dataset across 6 candidate learners",
          "Elapsed time (as opposed to user or system time) shows the amount of time experienced by the user.") + 
  labs(caption = "Task: flipper_length_mm ~ . on the penguins dataset, 333 rows, 8 features") + 
  theme(plot.caption.position = 'plot')

tornados data (2.8 MB)

# recommendations from here for dealing with large data 
# in future multicore setup https://github.com/satijalab/seurat/issues/1845 
options(future.globals.maxSize = 8000 * 1024^2) 
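# (8000 * 1024^2 bytes is roughly 8 GB; future's default limit is 500 MiB)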

tuesdata <- tidytuesdayR::tt_load('2023-05-16')
tornados <- tuesdata$tornados
tornados <- tornados[,c('yr', 'mo', 'dy', 'mag', 'st', 'inj', 'fat', 'loss')]
tornados <- tornados[complete.cases(tornados),]

# DC and AK appear only very infrequently (2 and 1 times, respectively), so we drop them
tornados <- tornados |> dplyr::filter(!st %in% c('DC', 'AK'))

tornado_formula <- inj ~ yr + mo + mag + fat + st + loss
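As a quick sanity check on the dataset's size after filtering, base R's object.size can report it directly (the exact figure may differ slightly from the 2.8 MB quoted above):

format(object.size(tornados), units = "MB")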

nadir on tornados data (2.8 MB)

tornados_nadir_sl_benchmark <- system.time({
  super_learner(
    data = tornados,
    formula = tornado_formula,
    learners = list(lnr_mean, lnr_lm, lnr_rf, lnr_earth, lnr_glmnet, lnr_xgboost),
    cv_schema = cv_character_and_factors_schema
  )
})
tornados_nadir_sl_benchmark

sl3 on tornados data (2.8 MB)

task <- make_sl3_Task(
  data = tornados,
  outcome = "inj",
  covariates = c("yr", "mo", "dy", "mag", 
                 "st", "fat", "loss")
)

lrn_mean <- Lrnr_mean$new()
lrn_lm <- Lrnr_glm$new()
lrn_rf <- Lrnr_randomForest$new()
lrn_earth <- Lrnr_earth$new()
lrn_glmnet <- Lrnr_glmnet$new()
lrn_xgboost <- Lrnr_xgboost$new()

stack <- Stack$new(lrn_mean, lrn_lm, lrn_rf, lrn_earth, lrn_glmnet, lrn_xgboost)

sl <- Lrnr_sl$new(learners = stack, metalearner = Lrnr_nnls$new(),
                  cv_control = list(V = 5))

tornados_sl3_sl_benchmark <- system.time({
  sl_fit <- sl$train(task = task)
})
tornados_sl3_sl_benchmark

SuperLearner on tornados data (2.8 MB)

Unfortunately, we were unable to get results from the SuperLearner package on the tornados dataset: it produced an obscure error, and little to no guidance on how to debug it is presently available.

sl_lib = c( 
  "SL.mean", "SL.lm", "SL.randomForest", "SL.earth", "SL.glmnet", "SL.xgboost")

tornados_SuperLearner_sl_benchmark <- system.time({
    mcSuperLearner(Y = tornados$inj,
                   X = tornados[, c("yr", "mo", "dy", "mag", 
                 "st", "fat", "loss")], 
                 SL.library = sl_lib,
                 cvControl = list(V = 5))
  })
error produced by SuperLearner on the tornados dataset
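For anyone trying to reproduce or debug this, one option (a sketch reusing the arguments from above) is to wrap the call in tryCatch so the error message is captured rather than halting the session:

err <- tryCatch(
  mcSuperLearner(Y = tornados$inj,
                 X = tornados[, c("yr", "mo", "dy", "mag",
                                  "st", "fat", "loss")],
                 SL.library = sl_lib,
                 cvControl = list(V = 5)),
  error = function(e) conditionMessage(e))
err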

Visual comparison

sl_timing <- data.frame(
  package = c('nadir', 'sl3', 'SuperLearner'),
  time = c(tornados_nadir_sl_benchmark[[3]],
    tornados_sl3_sl_benchmark[[3]],
    NA))

sl_timing |> 
  ggplot(aes(y = package, x = time, fill = package)) + 
  geom_col() + 
  geom_point() + 
  xlab("Seconds") + 
  geom_text(
    data = data.frame(package = 'SuperLearner', label = 'failed to run', time = 200),
    mapping = aes(label = label)) + 
  theme_bw() + 
  scale_fill_brewer(palette = 'Set2') + 
  ggtitle("Time taken to super learn on the tornados dataset across 6 candidate learners") + 
  labs(caption = "Task: inj ~ . on the tornados dataset, 41,514 rows, 8 features, 2.8 MB") + 
  theme(plot.caption.position = 'plot', panel.grid.major.y = element_blank())
timing results with the tornados dataset