In this notebook, we demonstrate how the DoubleML package can be used to estimate the causal effect of seeing a new ad design on customers' purchases in a webshop. The estimation steps of our analysis follow the DoubleML workflow.
Let's consider the following stylized scenario. The manager of a webshop performs an A/B test to estimate the average effect a new ad design $A$ has on customers' purchases (in $100\$$), $Y$. This effect is called the **A**verage **T**reatment **E**ffect (**ATE**). The treatment is assigned randomly conditional on the visitors' characteristics, which we call $V$. Such characteristics could be collected from a customer's shopper account, for example. These might include the number of previous purchases, time since the last purchase, length of stay on a page, as well as whether a customer has a rewards card, among other characteristics.
In the following, we use a Directed Acyclic Graph (DAG) to illustrate our assumptions on the causal structure of the scenario. As not only the outcome but also the treatment depends on the individual characteristics, there are arrows going from $V$ to both $A$ and $Y$. In our example, we also assume that the treatment $A$ is a direct cause of the customers' purchases $Y$.
Let's assume the conditional randomization has been conducted properly, such that a tidy data set has been collected. Now, a data scientist wants to use the DoubleML package to evaluate whether the new ad design causally affected sales.
Before we start the case study, let us briefly address the question of why we need to include individual characteristics in our analysis at all. There are mainly two reasons to control for observable characteristics. First, so-called confounders, i.e., variables that have a causal effect on both the treatment variable and the outcome variable, can bias our estimate. In order to uncover the true causal effect of the treatment, our causal framework must take all confounding variables into account. Otherwise, the average causal effect of the treatment on the outcome is not identified. A second reason to include individual characteristics is efficiency. The more variation our causal framework can explain, the more precise the resulting estimate will be. In practical terms, greater efficiency leads to tighter confidence intervals, smaller standard errors, and smaller p-values. This can improve the power of A/B tests even if the treatment is unconditionally assigned to individuals.
ML methods have turned out to be very flexible in modeling complex relationships between explanatory and dependent variables and, thus, have exhibited great predictive performance in many applications. In the double machine learning approach (Chernozhukov et al. (2018)), ML methods are used to model so-called nuisance functions. In the A/B case study considered here, ML tools can be used to flexibly control for confounding variables. For example, a linear parametric specification as in a standard linear regression model might not be correct and, hence, not sufficient to account for the underlying confounding. Moreover, by using powerful ML techniques, the causal model will likely be able to explain a greater share of the total variation and, hence, lead to more precise estimation.
As an illustrative example we use a data set from the ACIC 2019 Data Challenge. In this challenge, a large number of data sets were generated to mimic distributional relationships found in many real economic data applications. Although the data have not been generated explicitly to address an A/B testing case study, they are well-suited for demonstration purposes. We will focus on one of the many data generating processes (DGPs), picked at random, in this particular case a data set called high42. An advantage of using the synthetic ACIC 2019 data is that we know the true average treatment effect, which is 0.8 in our data set.
# Load required modules
import numpy as np
import pandas as pd
import doubleml as dml
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from xgboost import XGBClassifier, XGBRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
First we load the data.
# Load data set from url (internet connection required)
url = 'https://raw.githubusercontent.com/DoubleML/doubleml-docs/master/doc/examples/data/high42.CSV'
df = pd.read_csv(url)
print(df.shape)
(1000, 202)
df.head()
| | Y | A | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | ... | V191 | V192 | V193 | V194 | V195 | V196 | V197 | V198 | V199 | V200 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.358185 | 1 | 10 | 0 | 0 | 7 | 192.793769 | 23.676950 | 8 | 0.185443 | ... | 1.462837 | 1 | 1627.274196 | 0 | 0 | 4.683956 | 0.565667 | 0 | 3 | 0.024338 |
| 1 | 8.333672 | 1 | 12 | 0 | 1 | 4 | 199.653596 | 19.281270 | 7 | 0.514842 | ... | 1.330522 | 1 | 1661.484439 | 1 | 0 | 6.766661 | -0.395402 | 0 | 4 | 0.056518 |
| 2 | 7.472758 | 0 | 14 | 1 | 1 | 2 | 194.207792 | 24.589331 | 5 | 0.309199 | ... | 1.384151 | 1 | 1658.939293 | 0 | 0 | 5.647794 | 1.112766 | 0 | 0 | 0.013442 |
| 3 | 6.502319 | 1 | 0 | 1 | 0 | 9 | 201.838024 | 25.513918 | 4 | 0.160160 | ... | 1.220303 | 1 | 1650.801625 | 0 | 0 | 5.370363 | -0.305842 | 0 | 4 | 0.034632 |
| 4 | 7.043758 | 1 | 12 | 0 | 0 | 9 | 201.360443 | 31.160641 | 6 | 0.291976 | ... | 1.170094 | 1 | 1676.818876 | 0 | 0 | 3.446532 | 2.440661 | 0 | 1 | 0.017514 |

5 rows × 202 columns
We see that the data set consists of 1000 observations (= website visitors) and 202 variables:
- `Y`: A customer's purchases (in $100\$$)
- `A`: Binary treatment variable with a value 1 indicating that a customer has been exposed to the new ad design (and value 0 otherwise).
- `V1`, ..., `V200`: The remaining 200 columns $V$ represent individual characteristics of the customers (= confounders).

To start our analysis, we initialize the data backend from the previously loaded data set, i.e., we create a new instance of a DoubleMLData object. During initialization, we specify the roles of the variables in the data set, i.e., in our example the outcome variable $Y$ via the parameter `y_col`, the treatment variable $A$ via `d_cols`, and the confounding variables $V$ via `x_cols`.
# Specify explanatory variables for data-backend
features_base = list(df.columns.values)[2:]
# TODO: Initialize DoubleMLData (data-backend of DoubleML)
data_dml = dml.DoubleMLData(df,
                            y_col='Y',
                            d_cols='A',
                            x_cols=features_base)
We can print the data backend to see the variables that we have assigned as outcome, treatment, and controls.
# TODO: print data backend
print(data_dml)
================== DoubleMLData Object ==================

------------------ Data summary ------------------
Outcome variable: Y
Treatment variable(s): ['A']
Covariates: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', 'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200']
Instrument variable(s): None
No. Observations: 1000

------------------ DataFrame info ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 202 entries, Y to V200
dtypes: float64(107), int64(95)
memory usage: 1.5 MB
The inference problem is to determine the causal effect of seeing the new ad design $A$ on customers' purchases $Y$ once we control for individual characteristics $V$. In our example, we are interested in the average treatment effect. There are two causal models available in DoubleML that can be used to estimate the ATE.

The so-called interactive regression model (IRM), implemented in the class DoubleMLIRM, is a flexible (nonparametric) model to estimate this causal quantity. The model does not impose functional form restrictions on the underlying regression relationships, for example, linearity or additivity as in a standard linear regression model. This means that the model allows for heterogeneous treatment effects, i.e., it accounts for variation in the effect of the new ad design across customers. Moreover, it is also possible to estimate other causal parameters with the IRM, for example, the average treatment effect on the treated (= those customers who have been exposed to the new ad), which might be of interest too.
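For instance, the target parameter can be switched via the score argument when the model is initialized. A minimal sketch, assuming the installed DoubleML version accepts 'ATTE' as score (the random forest learners imported above are mere placeholders here):

# Sketch: estimate the average treatment effect on the treated (ATTE)
# instead of the ATE by switching the score function
dml_irm_atte = dml.DoubleMLIRM(data_dml,
                               ml_g=RandomForestRegressor(),
                               ml_m=RandomForestClassifier(),
                               score='ATTE')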
We briefly introduce the interactive regression model, where the main regression relationship of interest is provided by

$$Y = g_0(A, V) + U_1, \quad E(U_1 | V, A) = 0,$$

where the treatment variable is binary, $A \in \lbrace 0,1 \rbrace$. We consider estimation of the average treatment effect (ATE)

$$\theta_0 = \mathbb{E}[g_0(1, V) - g_0(0,V)],$$

when treatment effects are heterogeneous. In order to be able to use ML methods, the estimation framework generally requires a property called "double robustness" or "Neyman orthogonality". In the IRM, double robustness can be achieved by including the first-stage estimation

$$A = m_0(V) + U_2, \quad E(U_2 | V) = 0,$$

which amounts to estimation of the propensity score, i.e., the probability that a customer is exposed to the treatment given her observed characteristics. Both predictions are then combined in the doubly robust score for the average treatment effect, which is given by

$$\psi(W; \theta, \eta) := g(1,V) - g(0,V) + \frac{A (Y - g(1,V))}{m(V)} - \frac{(1 - A)(Y - g(0,V))}{1 - m(V)} - \theta.$$

As a naive estimate, we could calculate the unconditional average treatment effect: we simply take the difference between the average of $Y$ observed for the customers who have been exposed to the treatment $(A=1)$ and the average for those who have not been exposed $(A=0)$.
Since the unconditional ATE does not account for the confounding variables, it will generally not correspond to the true ATE (only under unconditionally random treatment assignment does the unconditional ATE coincide with the true ATE). For example, if the unconditional ATE estimate is greater than the actual ATE, the manager would overstate the effect of the new ad design and probably make misguided decisions about the marketing budget in the future.
# TODO: Calculate unconditional average treatment effect
df[['A', 'Y']].groupby('A').mean()
| A | Y |
|---|---|
| 0 | 6.836141 |
| 1 | 7.953744 |
df[['A', 'Y']].groupby('A').mean().diff()
| A | Y |
|---|---|
| 0 | NaN |
| 1 | 1.117603 |

The naive estimate of roughly 1.12 is clearly larger than the true ATE of 0.8 in this data set, illustrating the bias that arises when the confounders are ignored.
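To make the doubly robust score above concrete, here is a minimal sketch of how nuisance predictions are combined into an ATE estimate. The arrays g0_hat, g1_hat and m_hat are hypothetical stand-ins for fitted nuisance predictions; the cross-fitting that DoubleML performs on top of this is omitted:

# Minimal sketch of the doubly robust (AIPW) ATE estimate:
# setting the sample mean of the score psi to zero and solving
# for theta yields the estimator below
def dr_ate(y, a, g0_hat, g1_hat, m_hat):
    psi = (g1_hat - g0_hat
           + a * (y - g1_hat) / m_hat
           - (1 - a) * (y - g0_hat) / (1 - m_hat))
    return np.mean(psi)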
In this step, we define the learners that will be used for estimation of the nuisance functions later.
Let us start with a benchmark model based on (unpenalized) linear and logistic regression. Hence, we estimate the function $g_0(A,V)$ using a linear regression model and $m_0(V)$ using an (unpenalized) logistic regression. In both cases, we include all available characteristics $V$. We will later compare the performance of this model to that of more advanced ML methods.
# TODO: Initialize Linear and Logistic Regression learners
linreg = make_pipeline(StandardScaler(), LinearRegression())
logreg_class = make_pipeline(StandardScaler(), LogisticRegression(penalty="none"))
# TODO: Initialize one ML learner of your choice
# Initialize Lasso learners
lasso = make_pipeline(StandardScaler(), LassoCV(cv=5, max_iter=20000))
lasso_class = make_pipeline(StandardScaler(),
                            LogisticRegressionCV(cv=5, penalty='l1', solver='liblinear',
                                                 Cs=4, max_iter=1000))
# TODO: Initialize a second ML learner of your choice
# (proceed as long as you like)
# Initialize Random Forest learners
randomForest = RandomForestRegressor()
randomForest_class = RandomForestClassifier()
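Since xgboost has already been imported, boosting learners could be set up in the same manner; the hyperparameters below are illustrative and untuned:

# Initialize (untuned) gradient boosting learners
boost = XGBRegressor(objective='reg:squarederror')
boost_class = XGBClassifier(eval_metric='logloss')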
At this stage, we instantiate a causal model object of the class DoubleMLIRM. Provide the learners via the parameters `ml_g` and `ml_m`. You can either stick with the default settings or change the parameters. The API documentation is available here.
Hint: Use numpy.random.seed to set a random seed prior to your initialization. This makes the sample splits of the different models comparable. Also try to use the same DML specifications in all models to attain some comparability.
# TODO: Initialize benchmark DoubleMLIRM model
np.random.seed(1234)
dml_irm_regression = dml.DoubleMLIRM(data_dml,
                                     ml_g = linreg,
                                     ml_m = logreg_class,
                                     trimming_threshold = 0.025,
                                     n_folds = 3,
                                     n_rep = 3)
# TODO: Initialize a DoubleMLIRM model using the ML learners of your choice
np.random.seed(1234)
dml_irm_lasso = dml.DoubleMLIRM(data_dml,
                                ml_g = lasso,
                                ml_m = lasso_class,
                                trimming_threshold = 0.025,
                                n_folds = 3,
                                n_rep = 3)
Proceed with the models using the other ML learners.
np.random.seed(1234)
dml_irm_forest = dml.DoubleMLIRM(data_dml,
                                 ml_g = randomForest,
                                 ml_m = randomForest_class,
                                 trimming_threshold = 0.025,
                                 n_folds = 3,
                                 n_rep = 3)
# TODO: Fit benchmark DoubleMLIRM model using the fit() method
# HINT: set parameter 'store_predictions = True' for later model diagnostics
dml_irm_regression.fit(store_predictions=True)
<doubleml.double_ml_irm.DoubleMLIRM at 0x141afad2130>
To evaluate the different models we can compare how well the employed estimators fit the nuisance functions $g_0(\cdot)$ and $m_0(\cdot)$. Use the following helper function to compare the predictive performance of your models.
def pred_acc_irm(DoubleML, prop):
    """
    A function to calculate prediction accuracy values for every repetition
    of a Double Machine Learning model using IRM, DoubleMLIRM

    ...

    Parameters
    ----------
    DoubleML : doubleml.double_ml_irm.DoubleMLIRM
        The IRM Double Machine Learning model
    prop : bool
        Indication if RMSE values have to be computed for main regression or
        log loss values for propensity score
    """
    # export data and predictions of the DoubleML model
    y = DoubleML._dml_data.y
    d = DoubleML._dml_data.d
    g0 = DoubleML.predictions.get('ml_g0')
    g1 = DoubleML.predictions.get('ml_g1')
    m = DoubleML.predictions.get('ml_m')

    # dimensions of prediction array
    h = g0.shape[0]
    w = DoubleML.n_rep

    # check whether treatment is binary
    if not np.isin(d, [0, 1]).all():
        raise ValueError("Treatment must be a binary variable.")

    # prepare array to store prediction accuracy measure values
    pred_acc_array = np.zeros((w,))

    # check whether to assess main regression or propensity score accuracy:
    if not prop:
        # evaluate main regression accuracy:
        # export an array with the prediction matching the realized treatment status
        export_pred_array = np.zeros((h, w))
        for i in range(w):
            for j in range(h):
                if d[j] == 0:
                    export_pred_array[j, i] = g0[j, i]
                else:
                    export_pred_array[j, i] = g1[j, i]
        # fill array that contains the RMSE of each repetition
        for i in range(w):
            pred_acc_array[i] = mean_squared_error(y, export_pred_array[:, i], squared=False)
    else:
        # evaluate propensity score accuracy:
        # fill array that contains the log loss of each repetition
        for i in range(w):
            pred_acc_array[i] = log_loss(d, m[:, i], eps=0.025)
    return pred_acc_array
# TODO: Evaluate the predictive performance for `ml_g` and `ml_m` using the
# helper function `pred_acc_irm()`.
# calculate mean and standard deviation of repetition RMSE's to evaluate main regression accuracy
rmse_main_linlog_irm = pred_acc_irm(dml_irm_regression, prop=False)
rmse_main_linlog_irm_mean = np.mean(rmse_main_linlog_irm)
rmse_main_linlog_irm_std = np.std(rmse_main_linlog_irm)
# calculate mean and standard deviation of repetition log losses to evaluate propensity score accuracy
logloss_prop_linlog_irm = pred_acc_irm(dml_irm_regression, prop=True)
logloss_prop_linlog_irm_mean = np.mean(logloss_prop_linlog_irm)
logloss_prop_linlog_irm_std = np.std(logloss_prop_linlog_irm)
print("Mean of the main regression RMSE across 3 repetitions is", rmse_main_linlog_irm_mean)
print("Standard deviation of the RMSE is", rmse_main_linlog_irm_std)
Mean of the main regression RMSE across 3 repetitions is 1.727905012276966
Standard deviation of the RMSE is 0.026526079197961804
print("Mean of the propensity score's log loss across 3 repetitions is", logloss_prop_linlog_irm_mean)
print("Standard deviation of log loss is", logloss_prop_linlog_irm_std)
Mean of the propensity score's log loss across 3 repetitions is 1.0228568161263911
Standard deviation of log loss is 0.027536454508812133
The propensity score $m_0(V)$ plays an important role in the score of the IRM model. Try to summarize the estimates for $m_0(V)$ using some descriptive statistics or a visualization. You can use the following helper function for visualizing the propensity score estimates.
def rep_propscore_plot(DoubleML):
    """
    A function to create histograms as subplots for every repetition's propensity score density
    of a Double Machine Learning model

    ...

    Parameters
    ----------
    DoubleML : doubleml
        The Double Machine Learning model
    """
    # export nuisance part from the DoubleML model
    m = DoubleML.predictions.get('ml_m')

    # dimensions of nuisance array
    h = m.shape[0]
    rep = DoubleML.n_rep
    i = 0

    # create histograms as subplots covering the propensity score densities of all repetitions
    if rep > 1:
        fig, ax = plt.subplots(1, rep, figsize=[20, 4.8])
        for i in range(rep):
            ax[i].hist(np.reshape(m[:, i], h), range=[0, 1], bins=25, density=False)
            ax[i].set_title('repetition ' + str(i + 1))
            ax[i].set_xlabel("prop_score")
            ax[i].set_ylabel("count")
    else:
        fig, ax = plt.subplots(figsize=[20, 4.8])
        ax.hist(np.reshape(m[:, i], h), range=[0, 1], bins=25, density=False)
        ax.set_title('repetition ' + str(i + 1))
        ax.set_xlabel("prop_score")
        ax.set_ylabel("count")
    plt.show()
# (TODO): Summarize the propensity score estimates
rep_propscore_plot(dml_irm_regression)
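Descriptive statistics offer a complementary summary of the same estimates. A small sketch that, like pred_acc_irm() above, assumes one column of propensity score predictions per repetition:

# Descriptive statistics of the propensity score estimates per repetition
m_hat = dml_irm_regression.predictions.get('ml_m')
m_hat = np.reshape(m_hat, (m_hat.shape[0], -1))  # one column per repetition
prop_df = pd.DataFrame(m_hat, columns=['rep ' + str(i + 1) for i in range(m_hat.shape[1])])
print(prop_df.describe().round(3))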
# TODO: Fit the ML DoubleMLIRM model using the fit() method
dml_irm_lasso.fit(store_predictions=True)
<doubleml.double_ml_irm.DoubleMLIRM at 0x141afad20a0>
# TODO: Evaluate the predictive performance for `ml_g` and `ml_m` using the
# helper function `pred_acc_irm()`.
# calculate mean and standard deviation of repetition RMSE's to evaluate main regression accuracy
rmse_main_lasso_irm = pred_acc_irm(dml_irm_lasso, prop=False)
rmse_main_lasso_irm_mean = np.mean(rmse_main_lasso_irm)
rmse_main_lasso_irm_std = np.std(rmse_main_lasso_irm)
# calculate mean and standard deviation of repetition log losses to evaluate propensity score accuracy
logloss_prop_lasso_irm = pred_acc_irm(dml_irm_lasso, prop=True)
logloss_prop_lasso_irm_mean = np.mean(logloss_prop_lasso_irm)
logloss_prop_lasso_irm_std = np.std(logloss_prop_lasso_irm)
print("Mean of the main regression RMSE across 3 repetitions is", rmse_main_lasso_irm_mean)
print("Standard deviation of RMSE is", rmse_main_lasso_irm_std)
Mean of the main regression RMSE across 3 repetitions is 1.1143405452234652
Standard deviation of RMSE is 0.005869688113839954
print("Mean of the propensity score's log loss across 3 repetitions is", logloss_prop_lasso_irm_mean)
print("Standard deviation of log loss is", logloss_prop_lasso_irm_std)
Mean of the propensity score's log loss across 3 repetitions is 0.6686766530020503
Standard deviation of log loss is 0.0032543941531603953
# (TODO): Summarize the propensity score estimates
rep_propscore_plot(dml_irm_lasso)
Proceed with the models using the other ML learners. For the random forest learners, we set externally tuned hyperparameters via the set_ml_nuisance_params() method.
# Initialize DoubleMLIRM model
np.random.seed(1234)
dml_irm_forest = dml.DoubleMLIRM(data_dml,
                                 ml_g = randomForest,
                                 ml_m = randomForest_class,
                                 trimming_threshold = 0.025,
                                 n_folds = 3,
                                 n_rep = 3)
# Set nuisance-part specific parameters
dml_irm_forest.set_ml_nuisance_params('ml_g0', 'A', {
    'max_features': 200, 'n_estimators': 250})
dml_irm_forest.set_ml_nuisance_params('ml_g1', 'A', {
    'max_features': 200, 'n_estimators': 250})
dml_irm_forest.set_ml_nuisance_params('ml_m', 'A', {
    'max_features': 200, 'n_estimators': 250})
dml_irm_forest.fit(store_predictions=True)
<doubleml.double_ml_irm.DoubleMLIRM at 0x141b2da4f70>
# calculate mean and standard deviation of repetition RMSE's to evaluate main regression accuracy
rmse_main_forest_irm = pred_acc_irm(dml_irm_forest, prop=False)
rmse_main_forest_irm_mean = np.mean(rmse_main_forest_irm)
rmse_main_forest_irm_std = np.std(rmse_main_forest_irm)
# calculate mean and standard deviation of repetition log losses to evaluate propensity score accuracy
logloss_prop_forest_irm = pred_acc_irm(dml_irm_forest, prop=True)
logloss_prop_forest_irm_mean = np.mean(logloss_prop_forest_irm)
logloss_prop_forest_irm_std = np.std(logloss_prop_forest_irm)
print("Mean of the main regression RMSE across 3 repetitions is", rmse_main_forest_irm_mean)
print("Standard deviation of RMSE is", rmse_main_forest_irm_std)
Mean of the main regression RMSE across 3 repetitions is 1.1688034544446142
Standard deviation of RMSE is 0.00826647072034282
print("Mean of the propensity score's log loss across 3 repetitions is", logloss_prop_forest_irm_mean)
print("Standard deviation of log loss is", logloss_prop_forest_irm_std)
Mean of the propensity score's log loss across 3 repetitions is 0.6823242335385794
Standard deviation of log loss is 0.0026223238604172463
rep_propscore_plot(dml_irm_forest)
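Before turning to the causal estimates, the nuisance fit measures computed above can be collected into a single comparison table, for example:

# Collect the nuisance fit measures of all three models
nuisance_comparison = pd.DataFrame(
    {'RMSE main regression': [rmse_main_linlog_irm_mean,
                              rmse_main_lasso_irm_mean,
                              rmse_main_forest_irm_mean],
     'log loss propensity score': [logloss_prop_linlog_irm_mean,
                                   logloss_prop_lasso_irm_mean,
                                   logloss_prop_forest_irm_mean]},
    index=['regression', 'lasso', 'forest'])
print(nuisance_comparison.round(3))

Both ML specifications fit the nuisance functions considerably better than the unpenalized benchmark.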
Provide a brief summary of your estimation results: report the coefficient estimate for $\theta_0$ as well as the standard errors and/or confidence intervals, respectively. You can create a table or a figure illustrating your findings.
## TODO: After calling fit(), access the coefficient parameter,
## the standard error and confidence interval via the fields
## `coef` and `summary`.
# TODO: Summarize your results
reg_summary = dml_irm_regression.summary
reg_summary
| | coef | std err | t | P>\|t\| | 2.5 % | 97.5 % |
|---|---|---|---|---|---|---|
| A | 1.057833 | 0.624814 | 1.693035 | 0.090449 | -0.166781 | 2.282446 |
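Besides summary, the point estimate, standard error and confidence interval can also be accessed directly, for example:

# Direct access to the estimate, standard error and 95% confidence interval
print(dml_irm_regression.coef)       # coefficient estimate for theta_0
print(dml_irm_regression.se)         # standard error
print(dml_irm_regression.confint())  # 95% confidence interval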
## TODO: After calling fit(), access the coefficient parameter,
## the standard error and confidence interval via the fields
## `coef` and `summary`.
# TODO: Summarize your results
lasso_summary = dml_irm_lasso.summary
lasso_summary
| | coef | std err | t | P>\|t\| | 2.5 % | 97.5 % |
|---|---|---|---|---|---|---|
| A | 0.855559 | 0.070928 | 12.062417 | 1.668137e-33 | 0.716544 | 0.994575 |
Proceed with the models using the other ML learners.
forest_summary = dml_irm_forest.summary
forest_summary
| | coef | std err | t | P>\|t\| | 2.5 % | 97.5 % |
|---|---|---|---|---|---|---|
| A | 0.893176 | 0.077421 | 11.53655 | 8.631791e-31 | 0.741433 | 1.044919 |
irm_summary = pd.concat((reg_summary, lasso_summary, forest_summary))
irm_summary.index = ['regression','lasso', 'forest']
irm_summary = irm_summary[['coef', 'std err', '2.5 %', '97.5 %']]
irm_summary.round(3)
| | coef | std err | 2.5 % | 97.5 % |
|---|---|---|---|---|
| regression | 1.058 | 0.625 | -0.167 | 2.282 |
| lasso | 0.856 | 0.071 | 0.717 | 0.995 |
| forest | 0.893 | 0.077 | 0.741 | 1.045 |

The confidence intervals of the lasso and random forest models cover the true ATE of 0.8, whereas the benchmark based on unpenalized regression is very imprecise.
errors = np.full((2, irm_summary.shape[0]), np.nan)
errors[0, :] = irm_summary['coef'] - irm_summary['2.5 %']
errors[1, :] = irm_summary['97.5 %'] - irm_summary['coef']
plt.errorbar(irm_summary.index, irm_summary.coef, fmt='o', yerr=errors)
plt.axhline(y=0.8, color='r', linestyle='--', label="true value")
plt.title('Interactive Regression Model (IRM)')
plt.xlabel('ML method')
_ = plt.ylabel('Coefficients and 95%-CI')
Notes and Acknowledgement
We would like to thank the organizers of the ACIC 2019 Data Challenge for setting up this data challenge and making the numerous synthetic data examples publicly available. Although the data examples in the ACIC 2019 Data Challenge do not explicitly address A/B testing, we put the data example in this context to give a tractable example of the use of causal machine learning in practice. The parameters for the random forests and extreme gradient boosting learners have been tuned externally. The corresponding tuning notebook will be uploaded in the examples gallery in the future.
Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. doi:10.1111/ectj.12097.