[1]:
%matplotlib inline

DML: Bonus Data#

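This example shows how to use the DoubleML package to estimate the effect of the treatment variable `tg` on the outcome variable `inuidur1` in the bonus data set. A partially linear regression (PLR) model and an interactive regression model (IRM) are each estimated twice: once with random forests and once with Lasso-type learners for the nuisance functions (the latter on a version of the data with polynomial features).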

[2]:
import numpy as np
import doubleml as dml
from doubleml.datasets import fetch_bonus

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns
[3]:
# Plot defaults: 14 x 6 figures, followed by seaborn's default theme
plt.rcParams['figure.figsize'] = (14, 6)
sns.set()

Load bonus data using the dml datasets module#

[4]:
# Fetch the bonus data (via the `fetch_bonus` helper imported above)
# and preview the first rows of the underlying DataFrame
dml_data = fetch_bonus()
dml_data.data.head()
[4]:
index abdt tg inuidur1 inuidur2 female black hispanic othrace dep ... recall agelt35 agegt54 durable nondurable lusd husd muld dep1 dep2
0 0 10824 0 2.890372 18 0 0 0 0 2 ... 0 0 0 0 0 0 1 0 0.0 1.0
1 3 10824 0 0.000000 1 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0.0 0.0
2 4 10747 0 3.295837 27 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0.0 0.0
3 11 10607 1 2.197225 9 0 0 0 0 0 ... 0 1 0 0 0 0 0 1 0.0 0.0
4 12 10831 0 3.295837 27 0 0 0 0 1 ... 0 0 1 1 0 1 0 0 1.0 0.0

5 rows × 26 columns

Specify learner and estimate causal parameter: PLR model with random forest as learner#

[5]:
# Nuisance learners for the PLR model: random forests for both `ml_l` and `ml_m`
ml_l = RandomForestRegressor()
ml_m = RandomForestRegressor()

# Sample splitting: number of folds and number of repeated splits
n_folds = 2
n_rep = 10

# Seed NumPy's global RNG right before the model object is created
np.random.seed(3141)
dml_plr_rf = dml.DoubleMLPLR(
    dml_data,
    ml_l,
    ml_m,
    n_folds=n_folds,
    n_rep=n_rep,
    score='partialling out',
    dml_procedure='dml2',
)

# Both nuisance learners share the same random forest hyperparameters
# for the treatment variable `tg`
pars = dict(n_estimators=500, max_features='sqrt', max_depth=5)

dml_plr_rf.set_ml_nuisance_params('ml_l', 'tg', pars)
dml_plr_rf.set_ml_nuisance_params('ml_m', 'tg', pars)
[6]:
# Fit the PLR / random forest model and display its coefficient summary table
# (`fit()` returns the model object, so the call can be chained)
dml_plr_rf.fit().summary
[6]:
coef std err t P>|t| 2.5 % 97.5 %
tg -0.079085 0.035391 -2.234605 0.025443 -0.14845 -0.00972
[7]:
# Bonus data once more, this time requesting polynomial features for the
# covariates; printing the object shows its data summary
dml_data_lasso = fetch_bonus(polynomial_features=True)
print(dml_data_lasso)
================== DoubleMLData Object ==================

------------------ Data summary      ------------------
Outcome variable: inuidur1
Treatment variable(s): ['tg']
Covariates: ['female', 'black', 'othrace', 'dep1', 'dep2', 'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54', 'durable', 'lusd', 'husd', 'female^2', 'female black', 'female othrace', 'female dep1', 'female dep2', 'female q2', 'female q3', 'female q4', 'female q5', 'female q6', 'female agelt35', 'female agegt54', 'female durable', 'female lusd', 'female husd', 'black^2', 'black othrace', 'black dep1', 'black dep2', 'black q2', 'black q3', 'black q4', 'black q5', 'black q6', 'black agelt35', 'black agegt54', 'black durable', 'black lusd', 'black husd', 'othrace^2', 'othrace dep1', 'othrace dep2', 'othrace q2', 'othrace q3', 'othrace q4', 'othrace q5', 'othrace q6', 'othrace agelt35', 'othrace agegt54', 'othrace durable', 'othrace lusd', 'othrace husd', 'dep1^2', 'dep1 dep2', 'dep1 q2', 'dep1 q3', 'dep1 q4', 'dep1 q5', 'dep1 q6', 'dep1 agelt35', 'dep1 agegt54', 'dep1 durable', 'dep1 lusd', 'dep1 husd', 'dep2^2', 'dep2 q2', 'dep2 q3', 'dep2 q4', 'dep2 q5', 'dep2 q6', 'dep2 agelt35', 'dep2 agegt54', 'dep2 durable', 'dep2 lusd', 'dep2 husd', 'q2^2', 'q2 q3', 'q2 q4', 'q2 q5', 'q2 q6', 'q2 agelt35', 'q2 agegt54', 'q2 durable', 'q2 lusd', 'q2 husd', 'q3^2', 'q3 q4', 'q3 q5', 'q3 q6', 'q3 agelt35', 'q3 agegt54', 'q3 durable', 'q3 lusd', 'q3 husd', 'q4^2', 'q4 q5', 'q4 q6', 'q4 agelt35', 'q4 agegt54', 'q4 durable', 'q4 lusd', 'q4 husd', 'q5^2', 'q5 q6', 'q5 agelt35', 'q5 agegt54', 'q5 durable', 'q5 lusd', 'q5 husd', 'q6^2', 'q6 agelt35', 'q6 agegt54', 'q6 durable', 'q6 lusd', 'q6 husd', 'agelt35^2', 'agelt35 agegt54', 'agelt35 durable', 'agelt35 lusd', 'agelt35 husd', 'agegt54^2', 'agegt54 durable', 'agegt54 lusd', 'agegt54 husd', 'durable^2', 'durable lusd', 'durable husd', 'lusd^2', 'lusd husd', 'husd^2']
Instrument variable(s): None
No. Observations: 5099

------------------ DataFrame info    ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5099 entries, 0 to 5098
Columns: 137 entries, inuidur1 to husd^2
dtypes: float64(136), int64(1)
memory usage: 5.3 MB

Specify learner and estimate causal parameter: PLR model with Lasso as learner#

[8]:
# Nuisance learners for the PLR model: Lasso for both `ml_l` and `ml_m`
ml_l = Lasso()
ml_m = Lasso()

# Sample splitting: number of folds and number of repeated splits
n_folds = 2
n_rep = 10

# Seed NumPy's global RNG right before the model object is created
np.random.seed(3141)
dml_plr_lasso = dml.DoubleMLPLR(
    dml_data_lasso,
    ml_l,
    ml_m,
    n_folds=n_folds,
    n_rep=n_rep,
    score='partialling out',
    dml_procedure='dml2',
)

# Learner-specific Lasso penalties (alpha) for the treatment variable `tg`
dml_plr_lasso.set_ml_nuisance_params('ml_l', 'tg', dict(alpha=0.0005))
dml_plr_lasso.set_ml_nuisance_params('ml_m', 'tg', dict(alpha=0.0026))
[9]:
# Fit the PLR / Lasso model and display its coefficient summary table
# (`fit()` returns the model object, so the call can be chained)
dml_plr_lasso.fit().summary
[9]:
coef std err t P>|t| 2.5 % 97.5 %
tg -0.078207 0.035572 -2.198549 0.02791 -0.147927 -0.008487

Specify learner and estimate causal parameter: IRM model with random forest as learner#

[10]:
# Nuisance learners for the IRM model: a random forest regressor for `ml_g`
# and a random forest classifier for `ml_m`
ml_g = RandomForestRegressor()
ml_m = RandomForestClassifier()

# Sample splitting: number of folds and number of repeated splits
n_folds = 2
n_rep = 10

# Seed NumPy's global RNG right before the model object is created
np.random.seed(3141)
dml_irm_rf = dml.DoubleMLIRM(
    dml_data,
    ml_g,
    ml_m,
    n_folds=n_folds,
    n_rep=n_rep,
    score='ATE',
    dml_procedure='dml2',
)

# The same random forest hyperparameters are used for all three nuisance
# parts (`ml_g0`, `ml_g1`, `ml_m`) of the treatment variable `tg`
pars = dict(n_estimators=500, max_features='sqrt', max_depth=5)

dml_irm_rf.set_ml_nuisance_params('ml_g0', 'tg', pars)
dml_irm_rf.set_ml_nuisance_params('ml_g1', 'tg', pars)
dml_irm_rf.set_ml_nuisance_params('ml_m', 'tg', pars)
[10]:
<doubleml.double_ml_irm.DoubleMLIRM at 0x1747bdd6b90>
[11]:
# Fit the IRM / random forest model and display its coefficient summary table
# (`fit()` returns the model object, so the call can be chained)
dml_irm_rf.fit().summary
[11]:
coef std err t P>|t| 2.5 % 97.5 %
tg -0.076971 0.03574 -2.153633 0.031269 -0.14702 -0.006922

Specify learner and estimate causal parameter: IRM model with Lasso as learner#

[12]:
# Set machine learning methods for m & g:
# Lasso for the outcome regression `ml_g`, logistic regression for `ml_m`.
# NOTE(review): LogisticRegression() uses scikit-learn's default penalty (L2),
# not an L1/Lasso penalty -- confirm this matches the "Lasso" section title.
ml_g = Lasso()
ml_m = LogisticRegression()

# Sample splitting: number of folds and number of repeated splits
n_folds = 2
n_rep = 10

# A single seed, set right before the model object is created. The earlier
# `np.random.seed(1234)` was overwritten by this call before any random number
# was drawn, so it had no effect and only obscured which seed is in use.
np.random.seed(3141)
dml_irm_lasso = dml.DoubleMLIRM(dml_data_lasso,
                                ml_g,
                                ml_m,
                                n_folds=n_folds,
                                n_rep=n_rep,
                                score='ATE',
                                dml_procedure='dml2')

# set some hyperparameters for the learners
# (separate Lasso penalties for `ml_g0` / `ml_g1`, regularization `C` for `ml_m`)
dml_irm_lasso.set_ml_nuisance_params('ml_g0', 'tg', {'alpha': 0.0019})
dml_irm_lasso.set_ml_nuisance_params('ml_g1', 'tg', {'alpha': 0.0073})
dml_irm_lasso.set_ml_nuisance_params('ml_m', 'tg', {'C': 0.0001})
[12]:
<doubleml.double_ml_irm.DoubleMLIRM at 0x1747bdd4520>
[13]:
# Fit the IRM / Lasso model and display its coefficient summary table
# (`fit()` returns the model object, so the call can be chained)
dml_irm_lasso.fit().summary
[13]:
coef std err t P>|t| 2.5 % 97.5 %
tg -0.080947 0.035545 -2.277299 0.022768 -0.150614 -0.01128