{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "\n", "# DML: Bonus Data\n", "This example shows\n", "TODO: Add a general description!\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import doubleml as dml\n", "from doubleml.datasets import fetch_bonus\n", "\n", "from sklearn.linear_model import Lasso, LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "plt.rcParams['figure.figsize'] = 14, 6\n", "sns.set()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Load bonus data using the dml datasets module\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", "

5 rows × 26 columns

\n", "
" ], "text/plain": [ " index abdt tg inuidur1 inuidur2 female black hispanic othrace \\\n", "0 0 10824 0 2.890372 18 0 0 0 0 \n", "1 3 10824 0 0.000000 1 0 0 0 0 \n", "2 4 10747 0 3.295837 27 0 0 0 0 \n", "3 11 10607 1 2.197225 9 0 0 0 0 \n", "4 12 10831 0 3.295837 27 0 0 0 0 \n", "\n", " dep ... recall agelt35 agegt54 durable nondurable lusd husd muld \\\n", "0 2 ... 0 0 0 0 0 0 1 0 \n", "1 0 ... 0 0 0 0 0 1 0 0 \n", "2 0 ... 0 0 0 0 0 1 0 0 \n", "3 0 ... 0 1 0 0 0 0 0 1 \n", "4 1 ... 0 0 1 1 0 1 0 0 \n", "\n", " dep1 dep2 \n", "0 0.0 1.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 0.0 0.0 \n", "4 1.0 0.0 \n", "\n", "[5 rows x 26 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dml_data = dml.datasets.fetch_bonus()\n", "dml_data.data.head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Specify learner and estimate causal parameter: PLR model with random forest as learner\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Set machine learning methods for m & l\n", "ml_l = RandomForestRegressor()\n", "ml_m = RandomForestRegressor()\n", "n_folds = 2\n", "n_rep = 10\n", "\n", "np.random.seed(3141)\n", "dml_plr_rf = dml.DoubleMLPLR(dml_data,\n", " ml_l,\n", " ml_m,\n", " n_folds=n_folds,\n", " n_rep=n_rep,\n", " score='partialling out',\n", " dml_procedure='dml2')\n", "\n", "# set some hyperparameters for the learners\n", "pars = {'n_estimators': 500,\n", " 'max_features': 'sqrt',\n", " 'max_depth': 5}\n", "\n", "dml_plr_rf.set_ml_nuisance_params('ml_l', 'tg', pars)\n", "dml_plr_rf.set_ml_nuisance_params('ml_m', 'tg', pars)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
coefstd errtP>|t|2.5 %97.5 %
\n", "
" ], "text/plain": [ " coef std err t P>|t| 2.5 % 97.5 %\n", "tg -0.079085 0.035391 -2.234605 0.025443 -0.14845 -0.00972" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dml_plr_rf.fit()\n", "dml_plr_rf.summary" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "================== DoubleMLData Object ==================\n", "\n", "------------------ Data summary ------------------\n", "Outcome variable: inuidur1\n", "Treatment variable(s): ['tg']\n", "Covariates: ['female', 'black', 'othrace', 'dep1', 'dep2', 'q2', 'q3', 'q4', 'q5', 'q6', 'agelt35', 'agegt54', 'durable', 'lusd', 'husd', 'female^2', 'female black', 'female othrace', 'female dep1', 'female dep2', 'female q2', 'female q3', 'female q4', 'female q5', 'female q6', 'female agelt35', 'female agegt54', 'female durable', 'female lusd', 'female husd', 'black^2', 'black othrace', 'black dep1', 'black dep2', 'black q2', 'black q3', 'black q4', 'black q5', 'black q6', 'black agelt35', 'black agegt54', 'black durable', 'black lusd', 'black husd', 'othrace^2', 'othrace dep1', 'othrace dep2', 'othrace q2', 'othrace q3', 'othrace q4', 'othrace q5', 'othrace q6', 'othrace agelt35', 'othrace agegt54', 'othrace durable', 'othrace lusd', 'othrace husd', 'dep1^2', 'dep1 dep2', 'dep1 q2', 'dep1 q3', 'dep1 q4', 'dep1 q5', 'dep1 q6', 'dep1 agelt35', 'dep1 agegt54', 'dep1 durable', 'dep1 lusd', 'dep1 husd', 'dep2^2', 'dep2 q2', 'dep2 q3', 'dep2 q4', 'dep2 q5', 'dep2 q6', 'dep2 agelt35', 'dep2 agegt54', 'dep2 durable', 'dep2 lusd', 'dep2 husd', 'q2^2', 'q2 q3', 'q2 q4', 'q2 q5', 'q2 q6', 'q2 agelt35', 'q2 agegt54', 'q2 durable', 'q2 lusd', 'q2 husd', 'q3^2', 'q3 q4', 'q3 q5', 'q3 q6', 'q3 agelt35', 'q3 agegt54', 'q3 durable', 'q3 lusd', 'q3 husd', 'q4^2', 'q4 q5', 'q4 q6', 'q4 agelt35', 'q4 agegt54', 'q4 durable', 'q4 lusd', 'q4 husd', 'q5^2', 'q5 q6', 'q5 agelt35', 'q5 agegt54', 'q5 durable', 'q5 
lusd', 'q5 husd', 'q6^2', 'q6 agelt35', 'q6 agegt54', 'q6 durable', 'q6 lusd', 'q6 husd', 'agelt35^2', 'agelt35 agegt54', 'agelt35 durable', 'agelt35 lusd', 'agelt35 husd', 'agegt54^2', 'agegt54 durable', 'agegt54 lusd', 'agegt54 husd', 'durable^2', 'durable lusd', 'durable husd', 'lusd^2', 'lusd husd', 'husd^2']\n", "Instrument variable(s): None\n", "No. Observations: 5099\n", "\n", "------------------ DataFrame info ------------------\n", "\n", "RangeIndex: 5099 entries, 0 to 5098\n", "Columns: 137 entries, inuidur1 to husd^2\n", "dtypes: float64(136), int64(1)\n", "memory usage: 5.3 MB\n", "\n" ] } ], "source": [ "# Second copy of the bonus data, this time requested with polynomial features\n", "dml_data_lasso = fetch_bonus(polynomial_features=True)\n", "print(dml_data_lasso)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Specify learner and estimate causal parameter: PLR model with Lasso as learner\n", "\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Lasso learners for the two PLR nuisance functions l and m\n", "ml_l = Lasso()\n", "ml_m = Lasso()\n", "# Number of folds and repetitions passed to the model below\n", "n_folds, n_rep = 2, 10\n", "\n", "# Fix the NumPy seed right before the model is constructed\n", "np.random.seed(3141)\n", "dml_plr_lasso = dml.DoubleMLPLR(\n", "    dml_data_lasso, ml_l, ml_m,\n", "    n_folds=n_folds, n_rep=n_rep,\n", "    score='partialling out', dml_procedure='dml2',\n", ")\n", "\n", "# Each learner gets its own penalty level for treatment 'tg'\n", "dml_plr_lasso.set_ml_nuisance_params('ml_l', 'tg', dict(alpha=0.0005))\n", "dml_plr_lasso.set_ml_nuisance_params('ml_m', 'tg', dict(alpha=0.0026))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
coefstd errtP>|t|2.5 %97.5 %
\n", "
" ], "text/plain": [ " coef std err t P>|t| 2.5 % 97.5 %\n", "tg -0.078207 0.035572 -2.198549 0.02791 -0.147927 -0.008487" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dml_plr_lasso.fit()\n", "dml_plr_lasso.summary" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Specify learner and estimate causal parameter: IRM model with random forest as learner\n", "\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Set machine learning methods for m & g\n", "ml_g = RandomForestRegressor()\n", "ml_m = RandomForestClassifier()\n", "n_folds = 2\n", "n_rep = 10\n", "\n", "np.random.seed(3141)\n", "dml_irm_rf = dml.DoubleMLIRM(dml_data,\n", " ml_g,\n", " ml_m,\n", " n_folds=n_folds,\n", " n_rep=n_rep,\n", " score='ATE',\n", " dml_procedure='dml2')\n", "\n", "# set some hyperparameters for the learners\n", "pars = {'n_estimators': 500,\n", " 'max_features': 'sqrt',\n", " 'max_depth': 5}\n", "\n", "dml_irm_rf.set_ml_nuisance_params('ml_g0', 'tg', pars)\n", "dml_irm_rf.set_ml_nuisance_params('ml_g1', 'tg', pars)\n", "dml_irm_rf.set_ml_nuisance_params('ml_m', 'tg', pars)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
coefstd errtP>|t|2.5 %97.5 %
\n", "
" ], "text/plain": [ " coef std err t P>|t| 2.5 % 97.5 %\n", "tg -0.076971 0.03574 -2.153633 0.031269 -0.14702 -0.006922" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dml_irm_rf.fit()\n", "dml_irm_rf.summary" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Specify learner and estimate causal parameter: IRM model with Lasso as learner\n", "\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Set machine learning methods for m & g\n", "ml_g = Lasso()\n", "ml_m = LogisticRegression()\n", "np.random.seed(1234)\n", "n_folds = 2\n", "n_rep = 10\n", "\n", "np.random.seed(3141)\n", "dml_irm_lasso = dml.DoubleMLIRM(dml_data_lasso,\n", " ml_g,\n", " ml_m,\n", " n_folds=n_folds,\n", " n_rep=n_rep,\n", " score='ATE',\n", " dml_procedure='dml2')\n", "\n", "# set some hyperparameters for the learners\n", "dml_irm_lasso.set_ml_nuisance_params('ml_g0', 'tg', {'alpha': 0.0019})\n", "dml_irm_lasso.set_ml_nuisance_params('ml_g1', 'tg', {'alpha': 0.0073})\n", "dml_irm_lasso.set_ml_nuisance_params('ml_m', 'tg', {'C': 0.0001})" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
coefstd errtP>|t|2.5 %97.5 %
\n", "
" ], "text/plain": [ " coef std err t P>|t| 2.5 % 97.5 %\n", "tg -0.080947 0.035545 -2.277299 0.022768 -0.150614 -0.01128" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dml_irm_lasso.fit()\n", "dml_irm_lasso.summary" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 1 }