
Doubly Robust Estimation
Source: vignettes/articles/Doubly-Robust-Estimation.Rmd
If you’d like to construct a doubly robust estimator of the average treatment effect using nadir::super_learner(), we provide some example code for you here.
Suppose we want to investigate whether higher crime rates drive an increase in housing costs in the Boston dataset, and we consider every other covariate in that dataset to be a plausible confounder.
We can fit the augmented-inverse-probability-weighted (AIPW) / doubly-robust (DR) estimator of the average treatment effect as follows.
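For reference, the point estimate computed below is

$$
\hat{\psi}_{\text{AIPW}} = \frac{1}{n} \sum_{i=1}^{n} \left\{ \left[ \hat{m}(1, L_i) + \frac{A_i \left( Y_i - \hat{m}(1, L_i) \right)}{\hat{\pi}(L_i)} \right] - \left[ \hat{m}(0, L_i) + \frac{(1 - A_i) \left( Y_i - \hat{m}(0, L_i) \right)}{1 - \hat{\pi}(L_i)} \right] \right\},
$$

where $A$ is the binary exposure, $Y$ is the outcome, $L$ are the confounders, $\hat{m}(a, L)$ is the outcome model’s prediction with the exposure set to $a$, and $\hat{\pi}(L) = \hat{P}(A = 1 \mid L)$ is the estimated propensity score. We fit both nuisance models with super_learner().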
library(nadir)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# setup
data("Boston", package = "MASS")
Boston$high_crime <- as.integer(Boston$crim > mean(Boston$crim))
data <- Boston |> select(-crim)
outcome_model <- super_learner(
  data = data,
  formula = medv ~ .,
  learners = list(
    mean = lnr_mean,
    lm = lnr_lm,
    earth = lnr_earth,
    rf = lnr_rf))
data_without_outcome <- data |> select(-medv)
exposure_model <- super_learner(
  data = data_without_outcome,
  formula = high_crime ~ .,
  learners = list(
    logistic = lnr_glm,
    linear = lnr_lm,
    rf = lnr_rf,
    earth = lnr_earth,
    xgboost = lnr_xgboost,
    glmnet = lnr_glmnet),
  extra_learner_args = list(
    logistic = list(family = 'binomial'),
    glmnet = list(family = 'binomial'),
    rf = list(type = 'classification')
  ),
  determine_super_learner_weights = determine_weights_for_binary_outcomes,
  verbose = TRUE
)
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
# check which binary outcome learner was favored:
compare_learners(exposure_model, loss_metric = negative_log_loss)
#> Warning: There were 4 warnings in `dplyr::summarize()`.
#> The first warning was:
#> ℹ In argument: `across(everything(), ~loss_metric(., true_outcome))`.
#> Caused by warning in `log()`:
#> ! NaNs produced
#> ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.
#> # A tibble: 1 × 6
#>   logistic linear     rf earth xgboost glmnet
#>      <dbl>  <dbl>  <dbl> <dbl>   <dbl>  <dbl>
#> 1   12137.  6318. 10571. 7697.   5515.   848.
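Here glmnet attains the lowest (best) negative log loss among the candidate learners.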
A <- data$high_crime   # observed exposure
Y <- data$medv         # observed outcome
pi_hat_of_L <- exposure_model$sl_predictor(data_without_outcome)   # estimated propensity scores
one_minus_pi_hat_of_L <- 1 - pi_hat_of_L
m_of_0_exposure_and_L <- outcome_model({ data |> mutate(high_crime = 0) })   # predicted outcome under no exposure
m_of_1_exposure_and_L <- outcome_model({ data |> mutate(high_crime = 1) })   # predicted outcome under exposure
ATE_AIPW <-
  mean(
    (m_of_1_exposure_and_L + (Y - m_of_1_exposure_and_L) / pi_hat_of_L * A) -
    (m_of_0_exposure_and_L + (Y - m_of_0_exposure_and_L) / one_minus_pi_hat_of_L * (1 - A))
  )
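For inference, we use the estimated efficient influence function: each observation’s contribution is the same pseudo-outcome contrast as above, centered at the point estimate,

$$
\hat{\varphi}_i = \left[ \hat{m}(1, L_i) + \frac{A_i \left( Y_i - \hat{m}(1, L_i) \right)}{\hat{\pi}(L_i)} \right] - \left[ \hat{m}(0, L_i) + \frac{(1 - A_i) \left( Y_i - \hat{m}(0, L_i) \right)}{1 - \hat{\pi}(L_i)} \right] - \hat{\psi}_{\text{AIPW}},
$$

and the sampling variance of $\hat{\psi}_{\text{AIPW}}$ is estimated by $\widehat{\mathrm{Var}}(\hat{\varphi}) / n$.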
IF <-
  (m_of_1_exposure_and_L + (Y - m_of_1_exposure_and_L) / pi_hat_of_L * A) -
  (m_of_0_exposure_and_L + (Y - m_of_0_exposure_and_L) / one_minus_pi_hat_of_L * (1 - A)) -
  ATE_AIPW
ATE_AIPW_variance_estimate <- var(IF)/nrow(data)
ATE_AIPW + qnorm(c(0.025, 0.975)) * sqrt(ATE_AIPW_variance_estimate)
#> [1] -1.764351 1.834960
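This interval comfortably includes zero, so the AIPW analysis finds no clear evidence that high crime shifts median home values once the other covariates are adjusted for.

If you plan to repeat these steps, the arithmetic is easy to package up. Below is a minimal sketch of a helper; aipw_ate is a name we introduce here for illustration, not a function exported by nadir.

# a minimal sketch; `aipw_ate` is our own illustration, not part of nadir
aipw_ate <- function(Y, A, m1, m0, pi_hat) {
  psi1 <- m1 + A * (Y - m1) / pi_hat               # pseudo-outcome under exposure
  psi0 <- m0 + (1 - A) * (Y - m0) / (1 - pi_hat)   # pseudo-outcome under no exposure
  ate <- mean(psi1 - psi0)                         # AIPW point estimate
  if_values <- psi1 - psi0 - ate                   # estimated influence function
  se <- sqrt(var(if_values) / length(Y))           # IF-based standard error
  list(ate = ate, se = se, ci = ate + qnorm(c(0.025, 0.975)) * se)
}
# reproduces the point estimate and interval above:
aipw_ate(Y = Y, A = A,
         m1 = m_of_1_exposure_and_L, m0 = m_of_0_exposure_and_L,
         pi_hat = pi_hat_of_L)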
Let’s check the estimator in a simulation where we know the truth for sure.
sample_size <- 1000
L <- rnorm(n = sample_size)
A <- rbinom(n = sample_size, size = 1, prob = plogis(1.75 * L + rnorm(n = sample_size)))
Y <- 2.5 * A - 1.3 * L + rnorm(n = sample_size)
# the true ATE is 2.5 by construction, but the A–Y relationship is confounded by L
data <- data.frame(L, A, Y)
lm(Y ~ A)
#>
#> Call:
#> lm(formula = Y ~ A)
#>
#> Coefficients:
#> (Intercept) A
#> 0.6206 1.1304
lm(Y ~ A + L)
#>
#> Call:
#> lm(formula = Y ~ A + L)
#>
#> Coefficients:
#> (Intercept) A L
#> -0.04046 2.58479 -1.35808
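The unadjusted regression of Y on A is badly biased (about 1.13 against a truth of 2.5), while adjusting for L approximately recovers the true effect.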
outcome_model <- super_learner(
  data = data,
  formula = Y ~ A + L,
  learners = list(
    mean = lnr_mean,
    lm = lnr_lm,
    earth = lnr_earth,
    rf = lnr_rf))
data_without_outcome <- data |> select(-Y)
exposure_model <- super_learner(
  data = data_without_outcome,
  formula = A ~ L,
  learners = list(
    logistic = lnr_glm,
    linear = lnr_lm,
    rf = lnr_rf,
    earth = lnr_earth,
    xgboost = lnr_xgboost,
    glmnet = lnr_glmnet),
  extra_learner_args = list(
    logistic = list(family = 'binomial'),
    glmnet = list(family = 'binomial'),
    rf = list(type = 'classification')
  ),
  determine_super_learner_weights = determine_weights_for_binary_outcomes,
  verbose = TRUE
)
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
#> Warning in randomForest.default(m, y, ...): The response has five or fewer
#> unique values. Are you sure you want to do regression?
# check which binary outcome learner was favored:
compare_learners(exposure_model, loss_metric = negative_log_loss)
#> Warning: There were 3 warnings in `dplyr::summarize()`.
#> The first warning was:
#> ℹ In argument: `across(everything(), ~loss_metric(., true_outcome))`.
#> Caused by warning in `log()`:
#> ! NaNs produced
#> ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
#> # A tibble: 1 × 6
#>   logistic linear    rf earth xgboost glmnet
#>      <dbl>  <dbl> <dbl> <dbl>   <dbl>  <dbl>
#> 1     946.  1773. 4299. 1501.   3591.   712.
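Once again glmnet attains the lowest negative log loss, with the correctly specified logistic regression close behind.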
pi_hat_of_L <- exposure_model$sl_predictor(data_without_outcome)
one_minus_pi_hat_of_L <- 1 - pi_hat_of_L
m_of_0_exposure_and_L <- outcome_model({ data |> mutate(A = 0) })
m_of_1_exposure_and_L <- outcome_model({ data |> mutate(A = 1) })
ATE_AIPW <-
  mean(
    (m_of_1_exposure_and_L + (Y - m_of_1_exposure_and_L) / pi_hat_of_L * A) -
    (m_of_0_exposure_and_L + (Y - m_of_0_exposure_and_L) / one_minus_pi_hat_of_L * (1 - A))
  )
IF <-
  (m_of_1_exposure_and_L + (Y - m_of_1_exposure_and_L) / pi_hat_of_L * A) -
  (m_of_0_exposure_and_L + (Y - m_of_0_exposure_and_L) / one_minus_pi_hat_of_L * (1 - A)) -
  ATE_AIPW
ATE_AIPW_variance_estimate <- var(IF)/nrow(data)
ATE_AIPW + qnorm(c(0.025, 0.975)) * sqrt(ATE_AIPW_variance_estimate)
#> [1] 2.404488 2.761259
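The 95% confidence interval covers the true average treatment effect of 2.5.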