
If you’d like to construct a doubly robust estimator of the average treatment effect (ATE) using nadir::super_learner(), this article walks through some example code.

Suppose we want to investigate whether higher crime rates drive an increase (or decrease) in housing costs in the Boston housing dataset (MASS::Boston), and we consider every other covariate in that dataset to be a plausible confounder.

We can fit the augmented-inverse-probability-weighted (AIPW) / doubly-robust (DR) estimator of the average treatment effect as follows.
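
Writing π̂(L) for the estimated propensity score P(A = 1 | L) and m̂(a, L) for the estimated outcome regression E[Y | A = a, L], the estimator computed below is the standard AIPW form:

$$
\hat{\psi}_{\mathrm{AIPW}} = \frac{1}{n} \sum_{i=1}^{n} \left[ \hat{m}(1, L_i) + \frac{A_i \,\{Y_i - \hat{m}(1, L_i)\}}{\hat{\pi}(L_i)} - \hat{m}(0, L_i) - \frac{(1 - A_i)\,\{Y_i - \hat{m}(0, L_i)\}}{1 - \hat{\pi}(L_i)} \right]
$$

Each of the two nuisance functions, π̂ and m̂, is fit with super_learner() below.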

library(nadir)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# setup
data("Boston", package = "MASS")

# dichotomize the crime rate into a binary exposure: above vs. below the mean
Boston$high_crime <- as.integer(Boston$crim > mean(Boston$crim))
# drop the continuous crime rate so only the binary exposure enters the models
data <- Boston |> select(-crim)

outcome_model <- super_learner(
  data = data, 
  formula = medv ~ .,
  learners = list(
    mean = lnr_mean,
    lm = lnr_lm,
    earth = lnr_earth,
    rf = lnr_rf))

data_without_outcome <- data |> select(-medv)

exposure_model <- super_learner(
  data = data_without_outcome, 
  formula = high_crime ~ .,
  learners = list(
    logistic = lnr_glm,
    linear = lnr_lm,
    rf = lnr_rf,
    earth = lnr_earth,
    xgboost = lnr_xgboost,
    glmnet = lnr_glmnet),
  extra_learner_args = list(
    logistic = list(family = 'binomial'),
    glmnet = list(family = 'binomial'),
    rf = list(type = 'classification')
  ),
  determine_super_learner_weights = determine_weights_for_binary_outcomes,
  verbose = TRUE
)
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

# check which binary outcome learner was favored:
compare_learners(exposure_model, loss_metric = negative_log_loss)
#> Warning: There were 4 warnings in `dplyr::summarize()`.
#> The first warning was:
#>  In argument: `across(everything(), ~loss_metric(., true_outcome))`.
#> Caused by warning in `log()`:
#> ! NaNs produced
#>  Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.
#> # A tibble: 1 × 6
#>   logistic linear     rf earth xgboost glmnet
#>      <dbl>  <dbl>  <dbl> <dbl>   <dbl>  <dbl>
#> 1   12137.  6318. 10571. 7697.   5515.   848.
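
For reference, we take negative_log_loss here to be the usual binary cross-entropy of probability predictions p̂_i against the observed binary outcomes y_i:

$$
\mathrm{NLL}(\hat{p}, y) = -\sum_{i=1}^{n} \left[ y_i \log \hat{p}_i + (1 - y_i) \log (1 - \hat{p}_i) \right]
$$

The "NaNs produced" warnings are consistent with some learners (e.g. the plain linear model or the random forest run as a regression) predicting values outside (0, 1), where the log is undefined. Among the learners, glmnet with family = 'binomial' achieves the lowest loss here.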

A <- data$high_crime
Y <- data$medv

# estimated propensity scores: pi_hat(L) = P(high_crime = 1 | L)
pi_hat_of_L <- exposure_model$sl_predictor(data_without_outcome)
one_minus_pi_hat_of_L <- 1 - pi_hat_of_L

# outcome-regression predictions under counterfactual exposure 0 and 1
m_of_0_exposure_and_L <- outcome_model({ data |> mutate(high_crime = 0) })
m_of_1_exposure_and_L <- outcome_model({ data |> mutate(high_crime = 1) })


ATE_AIPW <- 
  mean(
    (m_of_1_exposure_and_L + (Y - m_of_1_exposure_and_L) / pi_hat_of_L * A) - 
      (m_of_0_exposure_and_L + (Y - m_of_0_exposure_and_L) / one_minus_pi_hat_of_L * (1-A))
  )

# per-observation estimated influence function: the centered pseudo-outcome contrast
IF <- 
  (m_of_1_exposure_and_L + (Y - m_of_1_exposure_and_L) / pi_hat_of_L * A) - 
  (m_of_0_exposure_and_L + (Y - m_of_0_exposure_and_L) / one_minus_pi_hat_of_L * (1-A)) - 
  ATE_AIPW

ATE_AIPW_variance_estimate <- var(IF)/nrow(data)

ATE_AIPW + qnorm(c(0.025, 0.975)) * sqrt(ATE_AIPW_variance_estimate)
#> [1] -1.764351  1.834960
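
The interval above is a Wald-type 95% confidence interval built from the empirical variance of the estimated influence function:

$$
\hat{\psi}_{\mathrm{AIPW}} \pm z_{0.975} \sqrt{\widehat{\mathrm{Var}}(\widehat{\mathrm{IF}}) / n}
$$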

Let’s do a simulation so we can know the truth for sure.

sample_size <- 1000
L <- rnorm(n = sample_size)
A <- rbinom(n = sample_size, size = 1, prob = plogis(1.75 * L + rnorm(n = sample_size)))
Y <- 2.5 * A - 1.3 * L + rnorm(n = sample_size)
# the true ATE is 2.5 by construction, but the A-Y relationship is confounded by L
data <- data.frame(L, A, Y)

lm(Y ~ A)
#> 
#> Call:
#> lm(formula = Y ~ A)
#> 
#> Coefficients:
#> (Intercept)            A  
#>      0.6206       1.1304
lm(Y ~ A + L)
#> 
#> Call:
#> lm(formula = Y ~ A + L)
#> 
#> Coefficients:
#> (Intercept)            A            L  
#>    -0.04046      2.58479     -1.35808
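
The unadjusted fit is biased because L drives both the exposure and the outcome. Under this data-generating process (with the noise independent of A and L),

$$
E[Y \mid A = 1] - E[Y \mid A = 0] = 2.5 - 1.3 \, \big( E[L \mid A = 1] - E[L \mid A = 0] \big) < 2.5,
$$

since larger L makes A = 1 more likely, so E[L | A = 1] > E[L | A = 0]. That is why lm(Y ~ A) lands well below 2.5, while lm(Y ~ A + L) recovers roughly the right coefficient.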

outcome_model <- super_learner(
  data = data, 
  formula = Y ~ A + L,
  learners = list(
    mean = lnr_mean,
    lm = lnr_lm,
    earth = lnr_earth,
    rf = lnr_rf))

data_without_outcome <- data |> select(-Y)

exposure_model <- super_learner(
  data = data_without_outcome, 
  formula = A ~ L,
  learners = list(
    logistic = lnr_glm,
    linear = lnr_lm,
    rf = lnr_rf,
    earth = lnr_earth,
    xgboost = lnr_xgboost,
    glmnet = lnr_glmnet),
  extra_learner_args = list(
    logistic = list(family = 'binomial'),
    glmnet = list(family = 'binomial'),
    rf = list(type = 'classification')
  ),
  determine_super_learner_weights = determine_weights_for_binary_outcomes,
  verbose = TRUE
)
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values.  Are you sure you want to do regression?

# check which binary outcome learner was favored:
compare_learners(exposure_model, loss_metric = negative_log_loss)
#> Warning: There were 3 warnings in `dplyr::summarize()`.
#> The first warning was:
#>  In argument: `across(everything(), ~loss_metric(., true_outcome))`.
#> Caused by warning in `log()`:
#> ! NaNs produced
#>  Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
#> # A tibble: 1 × 6
#>   logistic linear    rf earth xgboost glmnet
#>      <dbl>  <dbl> <dbl> <dbl>   <dbl>  <dbl>
#> 1     946.  1773. 4299. 1501.   3591.   712.

pi_hat_of_L <- exposure_model$sl_predictor(data_without_outcome)
one_minus_pi_hat_of_L <- 1-pi_hat_of_L

m_of_0_exposure_and_L <- outcome_model({ data |> mutate(A = 0) })
m_of_1_exposure_and_L <- outcome_model({ data |> mutate(A = 1) })


ATE_AIPW <- 
  mean(
    (m_of_1_exposure_and_L + (Y - m_of_1_exposure_and_L) / pi_hat_of_L * A) - 
      (m_of_0_exposure_and_L + (Y - m_of_0_exposure_and_L) / one_minus_pi_hat_of_L * (1-A))
  )

# per-observation estimated influence function, as before
IF <- 
  (m_of_1_exposure_and_L + (Y - m_of_1_exposure_and_L) / pi_hat_of_L * A) - 
  (m_of_0_exposure_and_L + (Y - m_of_0_exposure_and_L) / one_minus_pi_hat_of_L * (1-A)) - 
  ATE_AIPW

ATE_AIPW_variance_estimate <- var(IF)/nrow(data)

ATE_AIPW + qnorm(c(0.025, 0.975)) * sqrt(ATE_AIPW_variance_estimate)
#> [1] 2.404488 2.761259
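
If you plan to repeat these steps, the point estimate and Wald interval can be collected into a small helper function. Here is a minimal sketch (the function name and arguments are ours, not part of nadir):

# AIPW point estimate and Wald confidence interval from precomputed
# nuisance-model predictions.
#   Y, A   : outcome and binary exposure vectors
#   pi_hat : estimated propensity scores P(A = 1 | L)
#   m1, m0 : outcome-regression predictions under exposure 1 and 0
aipw_ate <- function(Y, A, pi_hat, m1, m0, level = 0.95) {
  pseudo1 <- m1 + A * (Y - m1) / pi_hat
  pseudo0 <- m0 + (1 - A) * (Y - m0) / (1 - pi_hat)
  ate <- mean(pseudo1 - pseudo0)
  IF  <- pseudo1 - pseudo0 - ate            # estimated influence function
  se  <- sqrt(var(IF) / length(Y))
  z   <- qnorm(1 - (1 - level) / 2)
  list(ate = ate, se = se, ci = ate + c(-z, z) * se)
}

aipw_ate(Y, A, pi_hat_of_L, m_of_1_exposure_and_L, m_of_0_exposure_and_L)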