{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
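{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"# DML: Bonus Data\n",
"This example shows how to use the DoubleML package to estimate the effect of the treatment variable `tg` on the outcome variable `inuidur1` in the Pennsylvania Reemployment Bonus data. A partially linear regression (PLR) model and an interactive regression model (IRM) are each estimated twice: once with random forest learners and once with linear learners (Lasso, plus logistic regression in the IRM) fitted on polynomial features.\n"
]
},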
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import doubleml as dml\n",
"from doubleml.datasets import fetch_bonus\n",
"\n",
"from sklearn.linear_model import Lasso, LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Plot defaults: a 14 x 6 figure size, followed by seaborn's default theme\n",
"plt.rcParams['figure.figsize'] = (14, 6)\n",
"sns.set()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load bonus data using the dml datasets module\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" abdt | \n",
" tg | \n",
" inuidur1 | \n",
" inuidur2 | \n",
" female | \n",
" black | \n",
" hispanic | \n",
" othrace | \n",
" dep | \n",
" ... | \n",
" recall | \n",
" agelt35 | \n",
" agegt54 | \n",
" durable | \n",
" nondurable | \n",
" lusd | \n",
" husd | \n",
" muld | \n",
" dep1 | \n",
" dep2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 10824 | \n",
" 0 | \n",
" 2.890372 | \n",
" 18 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 3 | \n",
" 10824 | \n",
" 0 | \n",
" 0.000000 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" 10747 | \n",
" 0 | \n",
" 3.295837 | \n",
" 27 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 11 | \n",
" 10607 | \n",
" 1 | \n",
" 2.197225 | \n",
" 9 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 12 | \n",
" 10831 | \n",
" 0 | \n",
" 3.295837 | \n",
" 27 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 26 columns
\n",
"
"
],
"text/plain": [
" index abdt tg inuidur1 inuidur2 female black hispanic othrace \\\n",
"0 0 10824 0 2.890372 18 0 0 0 0 \n",
"1 3 10824 0 0.000000 1 0 0 0 0 \n",
"2 4 10747 0 3.295837 27 0 0 0 0 \n",
"3 11 10607 1 2.197225 9 0 0 0 0 \n",
"4 12 10831 0 3.295837 27 0 0 0 0 \n",
"\n",
" dep ... recall agelt35 agegt54 durable nondurable lusd husd muld \\\n",
"0 2 ... 0 0 0 0 0 0 1 0 \n",
"1 0 ... 0 0 0 0 0 1 0 0 \n",
"2 0 ... 0 0 0 0 0 1 0 0 \n",
"3 0 ... 0 1 0 0 0 0 0 1 \n",
"4 1 ... 0 0 1 1 0 1 0 0 \n",
"\n",
" dep1 dep2 \n",
"0 0.0 1.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 1.0 0.0 \n",
"\n",
"[5 rows x 26 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dml_data = dml.datasets.fetch_bonus()\n",
"dml_data.data.head()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify learner and estimate causal parameter: PLR model with random forest as learner\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Random forest learners for the nuisance functions l and m\n",
"ml_l = RandomForestRegressor()\n",
"ml_m = RandomForestRegressor()\n",
"n_folds = 2\n",
"n_rep = 10\n",
"\n",
"# Seed the global NumPy RNG immediately before the model is built\n",
"np.random.seed(3141)\n",
"dml_plr_rf = dml.DoubleMLPLR(\n",
"    dml_data, ml_l, ml_m,\n",
"    n_folds=n_folds, n_rep=n_rep,\n",
"    score='partialling out', dml_procedure='dml2',\n",
")\n",
"\n",
"# Hyperparameters shared by both learners for the treatment variable 'tg'\n",
"pars = {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 5}\n",
"\n",
"# A loop is a statement, so the return value of the last call is not\n",
"# echoed as the cell output\n",
"for learner in ('ml_l', 'ml_m'):\n",
"    dml_plr_rf.set_ml_nuisance_params(learner, 'tg', pars)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" coef | \n",
" std err | \n",
" t | \n",
" P>|t| | \n",
" 2.5 % | \n",
" 97.5 % | \n",
"
\n",
" \n",
" \n",
" \n",
" tg | \n",
" -0.079085 | \n",
" 0.035391 | \n",
" -2.234605 | \n",
" 0.025443 | \n",
" -0.14845 | \n",
" -0.00972 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" coef std err t P>|t| 2.5 % 97.5 %\n",
"tg -0.079085 0.035391 -2.234605 0.025443 -0.14845 -0.00972"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dml_plr_rf.fit()\n",
"dml_plr_rf.summary"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"================== DoubleMLData Object ==================\n",
"\n",
"------------------ Data summary ------------------\n",
"Outcome variable: inuidur1\n",
"Treatment variable(s): ['tg']\n",
"Covariates: ['female', 'black', 'othrace', 'dep1', 'dep2', 'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54', 'durable', 'lusd', 'husd', 'female^2', 'female black', 'female othrace', 'female dep1', 'female dep2', 'female q2', 'female q3', 'female q4', 'female q5', 'female q6', 'female agelt35', 'female agegt54', 'female durable', 'female lusd', 'female husd', 'black^2', 'black othrace', 'black dep1', 'black dep2', 'black q2', 'black q3', 'black q4', 'black q5', 'black q6', 'black agelt35', 'black agegt54', 'black durable', 'black lusd', 'black husd', 'othrace^2', 'othrace dep1', 'othrace dep2', 'othrace q2', 'othrace q3', 'othrace q4', 'othrace q5', 'othrace q6', 'othrace agelt35', 'othrace agegt54', 'othrace durable', 'othrace lusd', 'othrace husd', 'dep1^2', 'dep1 dep2', 'dep1 q2', 'dep1 q3', 'dep1 q4', 'dep1 q5', 'dep1 q6', 'dep1 agelt35', 'dep1 agegt54', 'dep1 durable', 'dep1 lusd', 'dep1 husd', 'dep2^2', 'dep2 q2', 'dep2 q3', 'dep2 q4', 'dep2 q5', 'dep2 q6', 'dep2 agelt35', 'dep2 agegt54', 'dep2 durable', 'dep2 lusd', 'dep2 husd', 'q2^2', 'q2 q3', 'q2 q4', 'q2 q5', 'q2 q6', 'q2 agelt35', 'q2 agegt54', 'q2 durable', 'q2 lusd', 'q2 husd', 'q3^2', 'q3 q4', 'q3 q5', 'q3 q6', 'q3 agelt35', 'q3 agegt54', 'q3 durable', 'q3 lusd', 'q3 husd', 'q4^2', 'q4 q5', 'q4 q6', 'q4 agelt35', 'q4 agegt54', 'q4 durable', 'q4 lusd', 'q4 husd', 'q5^2', 'q5 q6', 'q5 agelt35', 'q5 agegt54', 'q5 durable', 'q5 lusd', 'q5 husd', 'q6^2', 'q6 agelt35', 'q6 agegt54', 'q6 durable', 'q6 lusd', 'q6 husd', 'agelt35^2', 'agelt35 agegt54', 'agelt35 durable', 'agelt35 lusd', 'agelt35 husd', 'agegt54^2', 'agegt54 durable', 'agegt54 lusd', 'agegt54 husd', 'durable^2', 'durable lusd', 'durable husd', 'lusd^2', 'lusd husd', 'husd^2']\n",
"Instrument variable(s): None\n",
"No. Observations: 5099\n",
"\n",
"------------------ DataFrame info ------------------\n",
"\n",
"RangeIndex: 5099 entries, 0 to 5098\n",
"Columns: 137 entries, inuidur1 to husd^2\n",
"dtypes: float64(136), int64(1)\n",
"memory usage: 5.3 MB\n",
"\n"
]
}
],
"source": [
"# Load data with polynomial features\n",
"dml_data_lasso = dml.datasets.fetch_bonus(polynomial_features=True)\n",
"print(dml_data_lasso)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify learner and estimate causal parameter: PLR model with Lasso as learner\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Lasso learners for the nuisance functions l and m\n",
"ml_l = Lasso()\n",
"ml_m = Lasso()\n",
"n_folds = 2\n",
"n_rep = 10\n",
"\n",
"# Seed the global NumPy RNG immediately before the model is built\n",
"np.random.seed(3141)\n",
"dml_plr_lasso = dml.DoubleMLPLR(\n",
"    dml_data_lasso, ml_l, ml_m,\n",
"    n_folds=n_folds, n_rep=n_rep,\n",
"    score='partialling out', dml_procedure='dml2',\n",
")\n",
"\n",
"# Learner-specific Lasso penalties for the treatment variable 'tg'\n",
"plr_lasso_params = {\n",
"    'ml_l': {'alpha': 0.0005},\n",
"    'ml_m': {'alpha': 0.0026},\n",
"}\n",
"\n",
"# A loop is a statement, so the return value of the last call is not\n",
"# echoed as the cell output\n",
"for learner, params in plr_lasso_params.items():\n",
"    dml_plr_lasso.set_ml_nuisance_params(learner, 'tg', params)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" coef | \n",
" std err | \n",
" t | \n",
" P>|t| | \n",
" 2.5 % | \n",
" 97.5 % | \n",
"
\n",
" \n",
" \n",
" \n",
" tg | \n",
" -0.078207 | \n",
" 0.035572 | \n",
" -2.198549 | \n",
" 0.02791 | \n",
" -0.147927 | \n",
" -0.008487 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" coef std err t P>|t| 2.5 % 97.5 %\n",
"tg -0.078207 0.035572 -2.198549 0.02791 -0.147927 -0.008487"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dml_plr_lasso.fit()\n",
"dml_plr_lasso.summary"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify learner and estimate causal parameter: IRM model with random forest as learner\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Random forest learners: a regressor for g and a classifier for m\n",
"ml_g = RandomForestRegressor()\n",
"ml_m = RandomForestClassifier()\n",
"n_folds = 2\n",
"n_rep = 10\n",
"\n",
"# Seed the global NumPy RNG immediately before the model is built\n",
"np.random.seed(3141)\n",
"dml_irm_rf = dml.DoubleMLIRM(\n",
"    dml_data, ml_g, ml_m,\n",
"    n_folds=n_folds, n_rep=n_rep,\n",
"    score='ATE', dml_procedure='dml2',\n",
")\n",
"\n",
"# Hyperparameters shared by all three learners for the treatment variable 'tg'\n",
"pars = {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 5}\n",
"\n",
"# A loop is a statement, so the object returned by the last call is not\n",
"# echoed as a stray cell output\n",
"for learner in ('ml_g0', 'ml_g1', 'ml_m'):\n",
"    dml_irm_rf.set_ml_nuisance_params(learner, 'tg', pars)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" coef | \n",
" std err | \n",
" t | \n",
" P>|t| | \n",
" 2.5 % | \n",
" 97.5 % | \n",
"
\n",
" \n",
" \n",
" \n",
" tg | \n",
" -0.076971 | \n",
" 0.03574 | \n",
" -2.153633 | \n",
" 0.031269 | \n",
" -0.14702 | \n",
" -0.006922 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" coef std err t P>|t| 2.5 % 97.5 %\n",
"tg -0.076971 0.03574 -2.153633 0.031269 -0.14702 -0.006922"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dml_irm_rf.fit()\n",
"dml_irm_rf.summary"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specify learner and estimate causal parameter: IRM model with Lasso as learner\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Lasso for g and logistic regression for m\n",
"ml_g = Lasso()\n",
"ml_m = LogisticRegression()\n",
"n_folds = 2\n",
"n_rep = 10\n",
"\n",
"# One seed only, set immediately before the model is built. An earlier\n",
"# np.random.seed(1234) was overridden by this call before any random draw.\n",
"np.random.seed(3141)\n",
"dml_irm_lasso = dml.DoubleMLIRM(\n",
"    dml_data_lasso, ml_g, ml_m,\n",
"    n_folds=n_folds, n_rep=n_rep,\n",
"    score='ATE', dml_procedure='dml2',\n",
")\n",
"\n",
"# Learner-specific regularisation settings for the treatment variable 'tg'\n",
"irm_lasso_params = {\n",
"    'ml_g0': {'alpha': 0.0019},\n",
"    'ml_g1': {'alpha': 0.0073},\n",
"    'ml_m': {'C': 0.0001},\n",
"}\n",
"\n",
"# A loop is a statement, so the object returned by the last call is not\n",
"# echoed as a stray cell output\n",
"for learner, params in irm_lasso_params.items():\n",
"    dml_irm_lasso.set_ml_nuisance_params(learner, 'tg', params)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" coef | \n",
" std err | \n",
" t | \n",
" P>|t| | \n",
" 2.5 % | \n",
" 97.5 % | \n",
"
\n",
" \n",
" \n",
" \n",
" tg | \n",
" -0.080947 | \n",
" 0.035545 | \n",
" -2.277299 | \n",
" 0.022768 | \n",
" -0.150614 | \n",
" -0.01128 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" coef std err t P>|t| 2.5 % 97.5 %\n",
"tg -0.080947 0.035545 -2.277299 0.022768 -0.150614 -0.01128"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dml_irm_lasso.fit()\n",
"dml_irm_lasso.summary"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
}
},
"nbformat": 4,
"nbformat_minor": 1
}