Experimental Design - A/B Testing
import numpy as np
from scipy.stats import ttest_ind
from tqdm import tqdm

def t_test(x, y, alternative='both-sided'):
    # Welch's t-test; convert the two-sided p-value to a one-sided one when requested
    _, double_p = ttest_ind(x, y, equal_var=False)
    if alternative == 'both-sided':
        pval = double_p
    elif alternative == 'greater':
        if np.mean(x) > np.mean(y):
            pval = double_p / 2.
        else:
            pval = 1.0 - double_p / 2.
    elif alternative == 'less':
        if np.mean(x) < np.mean(y):
            pval = double_p / 2.
        else:
            pval = 1.0 - double_p / 2.
    return pval
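As a sanity check on the one-sided conversion above: recent SciPy versions (1.6 and later) let ttest_ind compute a one-sided p-value directly via its alternative argument, and the helper should agree with it. A minimal sketch, assuming SciPy >= 1.6 and using made-up illustrative samples:

rng = np.random.default_rng(0)
x = rng.binomial(1, 0.22, 500)  # illustrative Bernoulli samples
y = rng.binomial(1, 0.20, 500)
p_manual = t_test(x, y, 'greater')                                      # manual one-sided conversion
_, p_builtin = ttest_ind(x, y, equal_var=False, alternative='greater')  # SciPy's built-in one-sided test
print(p_manual, p_builtin)  # the two values should match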
Assume the conversion rates for options A and B are equal
np.random.seed(123)
Let’s assume we fix the sample size in advance, run the experiment until we reach the full sample size, and then do a t-test of equality of means. If the conversion rates are truly equal, we expect to incorrectly reject the null hypothesis about 5% of the time when we use a 95% confidence level (i.e., a 5% significance threshold).
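Choosing that fixed sample size in advance typically comes from a power calculation. Below is a minimal sketch, assuming the standard two-proportion sample-size formula with a two-sided 5% significance level and 80% power; the conversion rates are the ones used later in this notebook:

from scipy.stats import norm

def sample_size_per_arm(p1, p2, alpha=0.05, power=0.8):
    # standard (unpooled) formula for comparing two proportions with equal arm sizes
    z_alpha = norm.ppf(1 - alpha / 2)
    z_beta = norm.ppf(power)
    var_sum = p1 * (1 - p1) + p2 * (1 - p2)
    return int(np.ceil((z_alpha + z_beta) ** 2 * var_sum / (p1 - p2) ** 2))

print(sample_size_per_arm(0.2, 0.25))  # roughly 1,100 observations per arm under these assumptions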
def run_experiment_and_test(conv_a, conv_b, n, p_threshold=0.05):
    # fixed-horizon design: collect all n observations per arm, then test once
    obs_a = np.random.binomial(1, conv_a, n)
    obs_b = np.random.binomial(1, conv_b, n)
    p = t_test(obs_a, obs_b, 'greater')
    if p < p_threshold:
        return 1  # reject
    else:
        return 0  # fail to reject
def run_experiment_and_peek(conv_a, conv_b, n, n_min, p_threshold=0.05):
    # now the difference is that we peek at each sample size starting at n_min and stop the
    # experiment if we see a statistically significant difference
    if n < n_min:
        raise ValueError('n must be at least n_min')
    obs_a = np.random.binomial(1, conv_a, n)
    obs_b = np.random.binomial(1, conv_b, n)
    for ntest in range(n_min, n):
        p = t_test(obs_a[:ntest], obs_b[:ntest], 'greater')
        if p < p_threshold:
            return 1  # stop at the first significant peek
    return 0
def run_mc(conv_a, conv_b, n, R, peek=False, n_min=None):
    # run R Monte Carlo replications and return the fraction of experiments that reject the null
    rejection = []
    for r in tqdm(range(R)):
        if not peek:  # don't peek: a single test at the full sample size
            rejection.append(run_experiment_and_test(conv_a, conv_b, n))
        else:
            rejection.append(run_experiment_and_peek(conv_a, conv_b, n, n_min))
    return np.mean(rejection)
R = 2000 # number of MC replications
conv_a = .2
conv_b = .2
print(f'Rejection rates across {R} Monte Carlo replications: \n')
for n in [10,100,1000,2000]:
    r = run_mc(conv_a, conv_b, n, R)
    print(f'n={n}, rejection rate = {r}')
Rejection rates across 2000 Monte Carlo replications:

n=10, rejection rate = 0.04
n=100, rejection rate = 0.0485
n=1000, rejection rate = 0.054
n=2000, rejection rate = 0.0585
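These rates are close to the nominal 5%. One way to judge whether the deviations (for example, 0.0585 at n=2000) are just simulation noise is the Monte Carlo standard error of an estimated rejection rate; a quick sketch for R = 2000 replications:

mc_se = np.sqrt(0.05 * 0.95 / 2000)  # standard error of a proportion near 0.05 estimated from 2000 replications
print(mc_se)  # about 0.005, so estimates roughly between 0.04 and 0.06 are consistent with a true 5% rate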
Now let’s see what happens when there actually is a difference between the conversion rates. Note that the one-sided alternative being tested is “A greater than B”, while here B has the higher conversion rate, so we should expect the rejection rate to fall toward zero as n grows rather than rise.
R = 2000 # number of MC replications
conv_a = .2
conv_b = .25
print(f'Rejection rates across {R} Monte Carlo replications: \n')
for n in [10,100,1000,2000]:
    r = run_mc(conv_a, conv_b, n, R)
    print(f'n={n}, rejection rate = {r}')
Rejection rates across 2000 Monte Carlo replications:

n=10, rejection rate = 0.025
n=100, rejection rate = 0.0065
n=1000, rejection rate = 0.0
n=2000, rejection rate = 0.0
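If we instead want to measure power against the true difference (B larger than A), we can run the one-sided test in the other direction. A minimal sketch reusing the helpers above; the function below is a hypothetical variant, not part of the original setup:

def run_experiment_and_test_b_greater(conv_a, conv_b, n, p_threshold=0.05):
    # same fixed-horizon experiment, but the alternative is now "B greater than A"
    obs_a = np.random.binomial(1, conv_a, n)
    obs_b = np.random.binomial(1, conv_b, n)
    return int(t_test(obs_b, obs_a, 'greater') < p_threshold)

power_estimate = np.mean([run_experiment_and_test_b_greater(.2, .25, 1000) for _ in range(2000)])
print(power_estimate)  # should be well above 0.5, since the test now points toward the true effect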
Peeking
Now let’s run an experiment where we “peek”: for each sample size starting at n_min = 100 we check for statistical significance and stop the experiment as soon as we see a significant difference.
R = 500 # number of MC replications
conv_a = .2
conv_b = .2
print(f'Rejection rates across {R} Monte Carlo replications: \n')
for n in [500,1000,2000]:
    r = run_mc(conv_a, conv_b, n, R, peek=True, n_min=100)
    print(f'n_max={n}, rejection rate = {r}')
Rejection rates across 500 Monte Carlo replications:

n_max=500, rejection rate = 0.21
n_max=1000, rejection rate = 0.246
n_max=2000, rejection rate = 0.312

Even though the two conversion rates are identical, peeking inflates the false-positive rate far above the nominal 5%, and the inflation grows with the maximum sample size because more peeks mean more chances for a spurious significant result.
conv_a = .2
conv_b = .25
print(f'Rejection rates across {R} Monte Carlo replications: \n')
for n in [500,1000,2000]:
    r = run_mc(conv_a, conv_b, n, R, peek=True, n_min=100)
    print(f'n_max={n}, rejection rate = {r}')
Rejection rates across 500 Monte Carlo replications:

n_max=500, rejection rate = 0.022
n_max=1000, rejection rate = 0.02
n_max=2000, rejection rate = 0.026