Note

Download Jupyter notebook: https://docs.doubleml.org/stable/examples/double_ml_bonus_data.ipynb.
[1]:
%matplotlib inline
DML: Bonus Data#
This example shows how to use the DoubleML package to estimate the average treatment effect of a cash bonus on the (log) duration of unemployment, using data from the Pennsylvania Reemployment Bonus experiment. The effect is estimated with a partially linear regression (PLR) model and an interactive regression model (IRM), each combined with random forest and lasso learners.
[2]:
import numpy as np
import doubleml as dml
from doubleml.datasets import fetch_bonus
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
[3]:
plt.rcParams['figure.figsize'] = 14, 6
sns.set()
Load bonus data using the dml datasets module#
[4]:
dml_data = dml.datasets.fetch_bonus()
dml_data.data.head()
[4]:
|   | index | abdt | tg | inuidur1 | inuidur2 | female | black | hispanic | othrace | dep | ... | recall | agelt35 | agegt54 | durable | nondurable | lusd | husd | muld | dep1 | dep2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 10824 | 0 | 2.890372 | 18 | 0 | 0 | 0 | 0 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0.0 | 1.0 |
| 1 | 3 | 10824 | 0 | 0.000000 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.0 | 0.0 |
| 2 | 4 | 10747 | 0 | 3.295837 | 27 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.0 | 0.0 |
| 3 | 11 | 10607 | 1 | 2.197225 | 9 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0.0 | 0.0 |
| 4 | 12 | 10831 | 0 | 3.295837 | 27 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1.0 | 0.0 |
5 rows × 26 columns
Specify learner and estimate causal parameter: PLR model with random forest as learner#
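For orientation, the partially linear regression (PLR) model takes the form

$$Y = D \theta_0 + g_0(X) + \zeta, \qquad \mathbb{E}[\zeta \mid D, X] = 0,$$

$$D = m_0(X) + V, \qquad \mathbb{E}[V \mid X] = 0,$$

where $Y$ is the outcome (inuidur1), $D$ the treatment (tg) and $\theta_0$ the causal parameter of interest. With the 'partialling out' score used below, the learner ml_l estimates the conditional mean $\ell_0(X) = \mathbb{E}[Y \mid X]$ and ml_m estimates $m_0(X) = \mathbb{E}[D \mid X]$.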
[5]:
# Set machine learning methods for m & l
ml_l = RandomForestRegressor()
ml_m = RandomForestRegressor()
n_folds = 2
n_rep = 10
np.random.seed(3141)
dml_plr_rf = dml.DoubleMLPLR(dml_data,
                             ml_l,
                             ml_m,
                             n_folds=n_folds,
                             n_rep=n_rep,
                             score='partialling out',
                             dml_procedure='dml2')
# set some hyperparameters for the learners
pars = {'n_estimators': 500,
        'max_features': 'sqrt',
        'max_depth': 5}
dml_plr_rf.set_ml_nuisance_params('ml_l', 'tg', pars)
dml_plr_rf.set_ml_nuisance_params('ml_m', 'tg', pars)
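As a hedged aside (not part of the original notebook), the parameters set this way can be checked via the object's params property, which DoubleML keeps per learner and per treatment variable:

```python
# Inspect the nuisance parameters that were just set; they are stored
# per learner ('ml_l', 'ml_m') and per treatment variable ('tg'),
# with one entry per repetition and fold.
print(dml_plr_rf.params['ml_l']['tg'])
```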
[6]:
dml_plr_rf.fit()
dml_plr_rf.summary
[6]:
|    | coef | std err | t | P>\|t\| | 2.5 % | 97.5 % |
|----|------|---------|---|--------|-------|--------|
| tg | -0.079085 | 0.035391 | -2.234605 | 0.025443 | -0.14845 | -0.00972 |
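As a minimal sketch (not in the original notebook), the numbers in this summary can also be pulled from the fitted object directly; coef, se and confint() are standard attributes and methods of fitted DoubleML objects:

```python
# Point estimate, standard error and 95% confidence interval for 'tg'
print(dml_plr_rf.coef)
print(dml_plr_rf.se)
print(dml_plr_rf.confint(level=0.95))
```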
[7]:
# Load data with polynomial features
dml_data_lasso = dml.datasets.fetch_bonus(polynomial_features=True)
print(dml_data_lasso)
================== DoubleMLData Object ==================
------------------ Data summary ------------------
Outcome variable: inuidur1
Treatment variable(s): ['tg']
Covariates: ['female', 'black', 'othrace', 'dep1', 'dep2', 'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54', 'durable', 'lusd', 'husd', 'female^2', 'female black', 'female othrace', 'female dep1', 'female dep2', 'female q2', 'female q3', 'female q4', 'female q5', 'female q6', 'female agelt35', 'female agegt54', 'female durable', 'female lusd', 'female husd', 'black^2', 'black othrace', 'black dep1', 'black dep2', 'black q2', 'black q3', 'black q4', 'black q5', 'black q6', 'black agelt35', 'black agegt54', 'black durable', 'black lusd', 'black husd', 'othrace^2', 'othrace dep1', 'othrace dep2', 'othrace q2', 'othrace q3', 'othrace q4', 'othrace q5', 'othrace q6', 'othrace agelt35', 'othrace agegt54', 'othrace durable', 'othrace lusd', 'othrace husd', 'dep1^2', 'dep1 dep2', 'dep1 q2', 'dep1 q3', 'dep1 q4', 'dep1 q5', 'dep1 q6', 'dep1 agelt35', 'dep1 agegt54', 'dep1 durable', 'dep1 lusd', 'dep1 husd', 'dep2^2', 'dep2 q2', 'dep2 q3', 'dep2 q4', 'dep2 q5', 'dep2 q6', 'dep2 agelt35', 'dep2 agegt54', 'dep2 durable', 'dep2 lusd', 'dep2 husd', 'q2^2', 'q2 q3', 'q2 q4', 'q2 q5', 'q2 q6', 'q2 agelt35', 'q2 agegt54', 'q2 durable', 'q2 lusd', 'q2 husd', 'q3^2', 'q3 q4', 'q3 q5', 'q3 q6', 'q3 agelt35', 'q3 agegt54', 'q3 durable', 'q3 lusd', 'q3 husd', 'q4^2', 'q4 q5', 'q4 q6', 'q4 agelt35', 'q4 agegt54', 'q4 durable', 'q4 lusd', 'q4 husd', 'q5^2', 'q5 q6', 'q5 agelt35', 'q5 agegt54', 'q5 durable', 'q5 lusd', 'q5 husd', 'q6^2', 'q6 agelt35', 'q6 agegt54', 'q6 durable', 'q6 lusd', 'q6 husd', 'agelt35^2', 'agelt35 agegt54', 'agelt35 durable', 'agelt35 lusd', 'agelt35 husd', 'agegt54^2', 'agegt54 durable', 'agegt54 lusd', 'agegt54 husd', 'durable^2', 'durable lusd', 'durable husd', 'lusd^2', 'lusd husd', 'husd^2']
Instrument variable(s): None
No. Observations: 5099
------------------ DataFrame info ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5099 entries, 0 to 5098
Columns: 137 entries, inuidur1 to husd^2
dtypes: float64(136), int64(1)
memory usage: 5.3 MB
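A hedged sketch of the alternative route: instead of fetch_bonus(), a DoubleMLData object can be built directly from a pandas DataFrame by declaring the column roles explicitly; the names below follow the data summary printed above, with the covariate list shortened for illustration:

```python
# Build the data backend by hand: outcome, treatment and covariates
# are declared explicitly (x_cols shortened for illustration).
df = dml_data.data
dml_data_manual = dml.DoubleMLData(df,
                                   y_col='inuidur1',
                                   d_cols='tg',
                                   x_cols=['female', 'black', 'othrace',
                                           'dep1', 'dep2', 'agelt35', 'agegt54'])
```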
Specify learner and estimate causal parameter: PLR model with Lasso as learner#
[8]:
# Set machine learning methods for m & l
ml_l = Lasso()
ml_m = Lasso()
n_folds = 2
n_rep = 10
np.random.seed(3141)
dml_plr_lasso = dml.DoubleMLPLR(dml_data_lasso,
                                ml_l,
                                ml_m,
                                n_folds=n_folds,
                                n_rep=n_rep,
                                score='partialling out',
                                dml_procedure='dml2')
# set some hyperparameters for the learners
dml_plr_lasso.set_ml_nuisance_params('ml_l', 'tg', {'alpha': 0.0005})
dml_plr_lasso.set_ml_nuisance_params('ml_m', 'tg', {'alpha': 0.0026})
[9]:
dml_plr_lasso.fit()
dml_plr_lasso.summary
[9]:
|    | coef | std err | t | P>\|t\| | 2.5 % | 97.5 % |
|----|------|---------|---|--------|-------|--------|
| tg | -0.078207 | 0.035572 | -2.198549 | 0.02791 | -0.147927 | -0.008487 |
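Instead of hard-coding the lasso penalties as above, DoubleML's tune() method can select them on a grid. This is a hedged sketch, and the grid values are illustrative only:

```python
# Grid-search the lasso penalty for both nuisance learners before fitting;
# the keys must match the learner names of the PLR model ('ml_l', 'ml_m').
par_grids = {'ml_l': {'alpha': np.logspace(-4, -1, 10)},
             'ml_m': {'alpha': np.logspace(-4, -1, 10)}}
dml_plr_lasso.tune(par_grids, search_mode='grid_search')
```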
Specify learner and estimate causal parameter: IRM model with random forest as learner#
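The interactive regression model (IRM) drops the linearity in $D$ and lets the binary treatment interact freely with the covariates:

$$Y = g_0(D, X) + U, \qquad \mathbb{E}[U \mid X, D] = 0,$$

$$D = m_0(X) + V, \qquad \mathbb{E}[V \mid X] = 0.$$

Here ml_g estimates the outcome regression $g_0(D, X)$ and ml_m the propensity score $m_0(X) = P(D = 1 \mid X)$, which is why ml_m is now a classifier.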
[10]:
# Set machine learning methods for m & g
ml_g = RandomForestRegressor()
ml_m = RandomForestClassifier()
n_folds = 2
n_rep = 10
np.random.seed(3141)
dml_irm_rf = dml.DoubleMLIRM(dml_data,
                             ml_g,
                             ml_m,
                             n_folds=n_folds,
                             n_rep=n_rep,
                             score='ATE',
                             dml_procedure='dml2')
# set some hyperparameters for the learners
pars = {'n_estimators': 500,
        'max_features': 'sqrt',
        'max_depth': 5}
dml_irm_rf.set_ml_nuisance_params('ml_g0', 'tg', pars)
dml_irm_rf.set_ml_nuisance_params('ml_g1', 'tg', pars)
dml_irm_rf.set_ml_nuisance_params('ml_m', 'tg', pars)
[10]:
<doubleml.double_ml_irm.DoubleMLIRM at 0x1747bdd6b90>
[11]:
dml_irm_rf.fit()
dml_irm_rf.summary
[11]:
|    | coef | std err | t | P>\|t\| | 2.5 % | 97.5 % |
|----|------|---------|---|--------|-------|--------|
| tg | -0.076971 | 0.03574 | -2.153633 | 0.031269 | -0.14702 | -0.006922 |
Specify learner and estimate causal parameter: IRM model with Lasso as learner#
[12]:
# Set machine learning methods for m & g
ml_g = Lasso()
ml_m = LogisticRegression()
n_folds = 2
n_rep = 10
np.random.seed(3141)
dml_irm_lasso = dml.DoubleMLIRM(dml_data_lasso,
                                ml_g,
                                ml_m,
                                n_folds=n_folds,
                                n_rep=n_rep,
                                score='ATE',
                                dml_procedure='dml2')
# set some hyperparameters for the learners
dml_irm_lasso.set_ml_nuisance_params('ml_g0', 'tg', {'alpha': 0.0019})
dml_irm_lasso.set_ml_nuisance_params('ml_g1', 'tg', {'alpha': 0.0073})
dml_irm_lasso.set_ml_nuisance_params('ml_m', 'tg', {'C': 0.0001})
[12]:
<doubleml.double_ml_irm.DoubleMLIRM at 0x1747bdd4520>
[13]:
dml_irm_lasso.fit()
dml_irm_lasso.summary
[13]:
|    | coef | std err | t | P>\|t\| | 2.5 % | 97.5 % |
|----|------|---------|---|--------|-------|--------|
| tg | -0.080947 | 0.035545 | -2.277299 | 0.022768 | -0.150614 | -0.01128 |
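To close, a hedged sketch (not part of the original notebook) that collects the four estimates and visualizes them with the matplotlib/seaborn setup imported at the top; it relies only on coef and confint(), which all fitted DoubleML objects expose:

```python
import pandas as pd

# Gather point estimates and 95% confidence bounds from the four models
models = {'PLR (RF)': dml_plr_rf, 'PLR (Lasso)': dml_plr_lasso,
          'IRM (RF)': dml_irm_rf, 'IRM (Lasso)': dml_irm_lasso}
est = pd.DataFrame({name: {'coef': m.coef[0],
                           'lower': m.confint().iloc[0, 0],
                           'upper': m.confint().iloc[0, 1]}
                    for name, m in models.items()}).T

# Error-bar plot of the estimated effect of 'tg' on 'inuidur1'
plt.errorbar(est.index, est['coef'],
             yerr=[est['coef'] - est['lower'], est['upper'] - est['coef']],
             fmt='o', capsize=5)
plt.axhline(0, color='grey', linestyle='--')
plt.ylabel('Estimated effect of tg')
plt.title('DML estimates with 95% confidence intervals')
plt.show()
```

All four estimates are close to -0.08 with overlapping confidence intervals, so the choice of model (PLR vs. IRM) and learner (random forest vs. lasso) has little impact on the conclusion for this dataset.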