Machine Learning - Double LASSO
import pandas as pd
import numpy as np
import random
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
from tqdm import tqdm
random.seed(10)
def fn_variance(data, ddof=0):
    # variance with a degrees-of-freedom correction;
    # equivalent to np.var(data, ddof=ddof)
    n = len(data)
    mean = sum(data) / n
    return sum((x - mean) ** 2 for x in data) / (n - ddof)
def fn_generate_cov(dim, corr):
    # equicorrelated matrix: 1 on the diagonal, corr everywhere else
    acc = []
    for i in range(dim):
        row = np.ones((1, dim)) * corr
        row[0][i] = 1
        acc.append(row)
    return np.concatenate(acc, axis=0)
def fn_generate_multnorm(nobs, corr, nvar):
    mu = np.zeros(nvar)
    std = (np.abs(np.random.normal(loc=1, scale=.5, size=(nvar, 1))))**(1/2)
    # generate independent normal draws, one column per variable
    acc = []
    for i in range(nvar):
        acc.append(np.reshape(np.random.normal(mu[i], std[i], nobs), (nobs, -1)))
    normvars = np.concatenate(acc, axis=1)
    cov = fn_generate_cov(nvar, corr)
    C = np.linalg.cholesky(cov)
    Y = np.transpose(np.dot(C, np.transpose(normvars)))
    # return (Y, np.round(np.corrcoef(Y, rowvar=False), 2))
    return Y
def fn_randomize_treatment(N, p=0.5):
    # randomly assign a fraction p of the N units to treatment
    treated = random.sample(range(N), round(N*p))
    return np.array([(1 if i in treated else 0) for i in range(N)]).reshape([N, 1])
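As a quick sanity check on the simulated design (echoing the commented-out return in fn_generate_multnorm), we can inspect the realized correlation matrix; a minimal sketch, where Ychk is an illustrative name. The realized correlations land near, but not exactly at, the target because each underlying variable is drawn with a random scale.
# sanity check (illustrative): realized correlations vs. the target corr = 0.5
Ychk = fn_generate_multnorm(10000, 0.5, 3)
print(np.round(np.corrcoef(Ychk, rowvar=False), 2))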
def fn_generate_data(tau, N, p, p0, corr, conf=True, flagX=False):
    """
    tau (float): treatment effect
    N (int): number of observations
    p (int): number of covariates generated
    p0 (int): number of covariates with nonzero coefficients
    corr (float): correlation for the multivariate normal draws
    conf (bool): if False, the confounder is removed from the outcome
    flagX: if truthy, also return the covariate matrix X
    """
    nvar = p + 2  # two extra variables: one for RDD assignment, one confounder
    conf_mult = 1
    if conf == False:
        conf_mult = 0  # remove confounder from outcome
    allX = fn_generate_multnorm(N, corr, nvar)
    W0 = allX[:, 0].reshape([N, 1])  # variable for RDD assignment
    C = allX[:, 1].reshape([N, 1])   # confounder
    X = allX[:, 2:]                  # observed covariates
    T = fn_randomize_treatment(N)    # choose treated units
    err = np.random.normal(0, 1, [N, 1])
    beta0 = np.random.normal(5, 5, [p, 1])
    beta0[p0:p] = 0  # sparse model: only the first p0 coefficients are nonzero
    Yab = tau*T + X@beta0 + conf_mult*0.6*C + err
    if flagX == False:
        return (Yab, T)
    else:
        return (Yab, T, X)
    # regression discontinuity (unused here)
    # W = W0 + 0.5*C + 3*X[:,80].reshape([N,1]) - 6*X[:,81].reshape([N,1])
    # treated = 1*(W>0)
    # Yrdd = 1.2*treated - 4*W + X@beta0 + 0.6*C + err
def fn_tauhat_means(Yt, Yc):
    # difference-in-means estimate and its standard error
    nt = len(Yt)
    nc = len(Yc)
    tauhat = np.mean(Yt) - np.mean(Yc)
    se_tauhat = (np.var(Yt, ddof=1)/nt + np.var(Yc, ddof=1)/nc)**(1/2)
    return (tauhat, se_tauhat)
def fn_bias_rmse_size(theta0, thetahat, se_thetahat, cval=1.96):
    b = thetahat - theta0
    bias = np.mean(b)
    rmse = np.sqrt(np.mean(b**2))
    tval = b/se_thetahat
    size = np.mean(1*(np.abs(tval) > cval))
    # note: size is calculated at the true parameter value
    return (bias, rmse, size)
def fn_run_experiments(tau, Nrange, p, p0, corr, conf, flagX=False):
    n_values = []
    tauhats = []
    sehats = []
    lb = []
    ub = []
    for N in tqdm(Nrange):
        n_values = n_values + [N]
        if flagX == False:
            Yexp, T = fn_generate_data(tau, N, p, p0, corr, conf, flagX)
            Yt = Yexp[np.where(T == 1)[0], :]
            Yc = Yexp[np.where(T == 0)[0], :]
            tauhat, se_tauhat = fn_tauhat_means(Yt, Yc)
        elif flagX == 1:
            # use the right covariates in the regression
            Yexp, T, X = fn_generate_data(tau, N, p, p0, corr, conf, flagX)
            Xobs = X[:, :p0]
            covars = np.concatenate([T, Xobs], axis=1)
            mod = sm.OLS(Yexp, covars)
            res = mod.fit()
            tauhat = res.params[0]
            se_tauhat = res.HC1_se[0]
        elif flagX == 2:
            # use some of the right covariates and some "wrong" ones
            Yexp, T, X = fn_generate_data(tau, N, p, p0, corr, conf, flagX)
            Xobs1 = X[:, :int(p0/2)]
            Xobs2 = X[:, -int(p0/2):]
            covars = np.concatenate([T, Xobs1, Xobs2], axis=1)
            mod = sm.OLS(Yexp, covars)
            res = mod.fit()
            tauhat = res.params[0]
            se_tauhat = res.HC1_se[0]
        tauhats = tauhats + [tauhat]
        sehats = sehats + [se_tauhat]
        lb = lb + [tauhat - 1.96*se_tauhat]
        ub = ub + [tauhat + 1.96*se_tauhat]
    return (n_values, tauhats, sehats, lb, ub)
def fn_plot_with_ci(n_values, tauhats, tau, lb, ub, caption):
    fig = plt.figure(figsize=(10, 6))
    plt.plot(n_values, tauhats, label=r'$\hat{\tau}$')
    plt.xlabel('N')
    plt.ylabel(r'$\hat{\tau}$')
    plt.axhline(y=tau, color='r', linestyle='-', linewidth=1,
                label=r'True $\tau$={}'.format(tau))
    plt.title('{}'.format(caption))
    plt.fill_between(n_values, lb, ub,
                     alpha=0.5, edgecolor='#FF9848', facecolor='#FF9848', label='95% CI')
    plt.legend()
Statsmodels implements sandwich (heteroskedasticity-robust) variance estimators: https://github.com/statsmodels/statsmodels/blob/master/statsmodels/stats/sandwich_covariance.py
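As a cross-check on the robust standard errors used throughout, the HC1_se attribute available after a plain fit agrees with requesting the sandwich estimator at fit time; a minimal sketch, with y_demo and x_demo as illustrative names:
# HC1 robust standard errors two ways (illustrative data)
y_demo = np.random.normal(0, 1, [50, 1])
x_demo = np.concatenate([np.ones([50, 1]), np.random.normal(0, 1, [50, 1])], axis=1)
res_demo = sm.OLS(y_demo, x_demo).fit()
print(res_demo.HC1_se)                                 # attribute after a plain fit
print(sm.OLS(y_demo, x_demo).fit(cov_type='HC1').bse)  # sandwich estimator at fit time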
1. Experiments with no covariates in the DGP
$y_i = \tau T_i + e_i$
tau = 2
corr = .5
conf=False
p = 10
p0 = 0 # number of covariates used in the DGP
Nrange = range(10,1000,2) # loop over N values
(nvalues,tauhats,sehats,lb,ub) = fn_run_experiments(tau,Nrange,p,p0,corr,conf)
caption = """Estimates of the treatment effect parameter
for a randomized experiment without confounders"""
fn_plot_with_ci(nvalues,tauhats,tau,lb,ub,caption)
For a selected N, check that this matches running a regression with an intercept:
N = 100
Yexp,T = fn_generate_data(tau,N,10,0,corr,conf = False)
Yt = Yexp[np.where(T==1)[0],:]
Yc = Yexp[np.where(T==0)[0],:]
tauhat,se_tauhat = fn_tauhat_means(Yt,Yc)
tauhat,se_tauhat
(2.0265077249506818, 0.21439126250360868)
# np.linalg.inv(np.transpose(T)@T)@np.transpose(T)@Yexp
const = np.ones([N,1])
model = sm.OLS(Yexp,np.concatenate([T,const],axis = 1))
res = model.fit()
# res.summary()
res.params[0], res.HC1_se[0]
(2.0265077249506818, 0.21439126250360865)
Run R Monte Carlo iterations and compute the bias, RMSE, and size of the difference-in-means estimator.
estDict = {}
R = 2000
for N in [10, 50, 100, 500, 1000]:
    tauhats = []
    sehats = []
    for r in tqdm(range(R)):
        Yexp, T = fn_generate_data(tau, N, 10, 0, corr, conf)
        Yt = Yexp[np.where(T == 1)[0], :]
        Yc = Yexp[np.where(T == 0)[0], :]
        tauhat, se_tauhat = fn_tauhat_means(Yt, Yc)
        tauhats = tauhats + [tauhat]
        sehats = sehats + [se_tauhat]
    estDict[N] = {
        'tauhat': np.array(tauhats).reshape([len(tauhats), 1]),
        'sehat': np.array(sehats).reshape([len(sehats), 1])
    }
tau0 = tau*np.ones([R, 1])
for N, results in estDict.items():
    (bias, rmse, size) = fn_bias_rmse_size(tau0, results['tauhat'],
                                           results['sehat'])
    print(f'N={N}: bias={bias}, RMSE={rmse}, size={size}')
N=10: bias=0.009269882933282698, RMSE=0.6446653201961393, size=0.0935
N=50: bias=0.004576677580047069, RMSE=0.2778357876169915, size=0.0555
N=100: bias=0.0018937616864692784, RMSE=0.1953541404256453, size=0.049
N=500: bias=-0.003191612276919599, RMSE=0.08956691752623909, size=0.0465
N=1000: bias=-0.000456703628317072, RMSE=0.062656868377909, size=0.051
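The same summaries can be collected into a small table for readability; a minimal sketch using the already-imported pandas (the column layout is an illustrative choice):
# tabulate the Monte Carlo summaries (illustrative layout)
rows = []
for N, results in estDict.items():
    bias, rmse, size = fn_bias_rmse_size(tau0, results['tauhat'], results['sehat'])
    rows.append({'N': N, 'bias': bias, 'RMSE': rmse, 'size': size})
print(pd.DataFrame(rows).set_index('N').round(4))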
2. Experiments with covariates in the DGP
$y_i = \tau T_i + \beta'x_i + e_i$
tau = 2
corr = .5
conf=False
p = 100
p0 = 50 # number of covariates used in the DGP
Nrange = range(10,1000,2) # loop over N values
(nvalues_x,tauhats_x,sehats_x,lb_x,ub_x) = fn_run_experiments(tau,Nrange,p,p0,corr,conf)
caption = """Estimates of the treatment effect parameter
for a randomized experiment with X's in the DGP"""
fn_plot_with_ci(nvalues_x,tauhats_x,tau,lb_x,ub_x,caption)
tau = 2
corr = .5
conf=False
p = 100
p0 = 0 # number of covariates used in the DGP
Nrange = range(10,1000,2) # loop over N values
(nvalues_x0,tauhats_x0,sehats_x0,lb_x0,ub_x0) = fn_run_experiments(tau,Nrange,p,p0,corr,conf)
fig = plt.figure(figsize=(10, 6))
plt.plot(nvalues_x, tauhats_x, label=r'$\hat{\tau}^{(x)}$')
plt.plot(nvalues_x0, tauhats_x0, label=r'$\hat{\tau}$', color='green')
plt.legend()
fig = plt.figure(figsize=(10, 6))
plt.plot(nvalues_x[400:], tauhats_x[400:], label=r'$\hat{\tau}^{(x)}$')
plt.plot(nvalues_x0[400:], tauhats_x0[400:], label=r'$\hat{\tau}$', color='green')
plt.legend()
Repeat the experiment with larger N.
tau = 2
corr = .5
conf=False
p = 100
p0 = 50 # number of covariates used in the DGP
Nrange = range(1000,50000,10000) # loop over N values
(nvalues_x2,tauhats_x2,sehats_x2,lb_x2,ub_x2) = fn_run_experiments(tau,Nrange,p,p0,corr,conf)
fn_plot_with_ci(nvalues_x2,tauhats_x2,tau,lb_x2,ub_x2,caption)
Still pretty noisy! With only the treatment indicator used in estimation, the term $X\beta_0$ stays in the residual, so the difference-in-means estimator remains imprecise even at these sample sizes.
3. DGP with X - adding covariates to the regression
Use the same DGP as before.
# use the same DGP as before
tau = 2
corr = .5
conf=False
p = 100
p0 = 50 # number of covariates used in the DGP
Nrange = range(100,1000,2) # loop over N values
# we need to start with more observations than p
flagX = 1
(nvalues2,tauhats2,sehats2,lb2,ub2) = fn_run_experiments(tau,Nrange,p,p0,corr,conf,flagX)
caption = """Estimates of the treatment effect parameter
for a randomized experiment with X's in the DGP,
estimates obtained using regression with the right Xs"""
fn_plot_with_ci(nvalues2,tauhats2,tau,lb2,ub2,caption)
Including X's improves precision. However, we cheated because we included the right X's from the start!
What happens if we use some X's that influence the outcome and some that don't?
# same DGP as before, but with p = 1000 candidate covariates
tau = 2
corr = .5
conf=False
p = 1000
p0 = 50 # number of covariates used in the DGP
Nrange = range(100,1000,2) # loop over N values
# we need to start with more observations than p
flagX = 2
(nvalues3,tauhats3,sehats3,lb3,ub3) = fn_run_experiments(tau,Nrange,p,p0,corr,conf,flagX)
caption = """Estimates of the treatment effect parameter
for a randomized experiment with X's in the DGP,
estimates obtained using regression with the 50% of the right Xs and
an additional 50% irrelevant Xs"""
fn_plot_with_ci(nvalues3,tauhats3,tau,lb3,ub3,caption)
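This is where double LASSO (post-double-selection) comes in: instead of guessing which X's to include, let the data select them. The notebook imports Lasso and SelectFromModel for exactly this purpose; below is a minimal sketch of the procedure, where the helper name fn_double_lasso and the penalty alpha=0.1 are illustrative assumptions, not tuned choices.
# Minimal sketch of post-double-selection ("double LASSO");
# fn_double_lasso and alpha=0.1 are illustrative, not from the text above.
def fn_double_lasso(Y, T, X, alpha=0.1):
    # Step 1: Lasso of Y on X selects covariates predictive of the outcome
    sel_y = SelectFromModel(Lasso(alpha=alpha)).fit(X, Y.ravel()).get_support()
    # Step 2: Lasso of T on X selects covariates predictive of treatment
    # (with randomized treatment this step typically selects few or none)
    sel_t = SelectFromModel(Lasso(alpha=alpha)).fit(X, T.ravel()).get_support()
    # Step 3: OLS of Y on T plus the union of the selected covariates
    Xsel = X[:, sel_y | sel_t]
    covars = np.concatenate([T, Xsel], axis=1)
    res = sm.OLS(Y, sm.add_constant(covars)).fit()
    return res.params[1], res.HC1_se[1]  # coefficient on T and its robust SE
Y, T, X = fn_generate_data(tau=2, N=1000, p=100, p0=50, corr=0.5,
                           conf=False, flagX=True)
tauhat_dl, se_dl = fn_double_lasso(Y, T, X)
print(tauhat_dl, se_dl)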