Experimental Design - AB Testing

import numpy as np
from scipy.stats import ttest_ind
from tqdm import tqdm

def t_test(x, y, alternative='two-sided'):
    # Welch's t-test; scipy returns a two-sided p-value, which we halve
    # (or flip) to get the one-sided p-value when requested
    _, double_p = ttest_ind(x, y, equal_var=False)
    if alternative == 'two-sided':
        pval = double_p
    elif alternative == 'greater':  # H1: mean(x) > mean(y)
        if np.mean(x) > np.mean(y):
            pval = double_p / 2.
        else:
            pval = 1.0 - double_p / 2.
    elif alternative == 'less':  # H1: mean(x) < mean(y)
        if np.mean(x) < np.mean(y):
            pval = double_p / 2.
        else:
            pval = 1.0 - double_p / 2.
    return pval
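As a sanity check, recent SciPy versions (1.6+) accept an alternative argument directly, so the wrapper above can be verified against it; a minimal sketch, assuming SciPy >= 1.6:

rng = np.random.default_rng(0)
x = rng.binomial(1, 0.22, 500)
y = rng.binomial(1, 0.20, 500)

p_wrapper = t_test(x, y, 'greater')
# SciPy >= 1.6 computes the one-sided p-value natively
_, p_scipy = ttest_ind(x, y, equal_var=False, alternative='greater')
print(p_wrapper, p_scipy)  # should agree up to floating-point noise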

Assume the conversion rates for options A and B are equal

np.random.seed(123)

Let’s assume we fix the sample size in advance, run the experiment until we reach the full sample size, and then do a t-test for equality of means. If the conversion rates are truly equal, we should incorrectly reject this null hypothesis 5% of the time at a 95% confidence level
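Fixing the sample size in advance usually means sizing the experiment for a target power. A sketch of that calculation, assuming statsmodels is installed (it isn't used anywhere else in this post):

# hypothetical sample-size calculation, assuming statsmodels is available
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize

effect = proportion_effectsize(0.25, 0.20)  # Cohen's h for the two conversion rates
n_per_arm = NormalIndPower().solve_power(
    effect_size=effect, alpha=0.05, power=0.8, alternative='larger'
)
print(round(n_per_arm))  # observations needed per arm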

def run_experiment_and_test(conv_a, conv_b, n, p_threshold=0.05):
    # draw n Bernoulli observations per arm, then run a single one-sided
    # test of H1: conversion rate of A > conversion rate of B
    obs_a = np.random.binomial(1, conv_a, n)
    obs_b = np.random.binomial(1, conv_b, n)
    p = t_test(obs_a, obs_b, 'greater')
    return 1 if p < p_threshold else 0  # 1 = reject the null
    
def run_experiment_and_peek(conv_a, conv_b, n, n_min, p_threshold=0.05):
    # now the difference is that we peek at every sample size starting at n_min
    # and stop the experiment as soon as we see a statistically significant difference
    if n < n_min:
        raise ValueError('n must be at least n_min')
    obs_a = np.random.binomial(1, conv_a, n)
    obs_b = np.random.binomial(1, conv_b, n)
    for ntest in range(n_min, n + 1):
        p = t_test(obs_a[:ntest], obs_b[:ntest], 'greater')
        if p < p_threshold:
            return 1  # reject and stop early
    return 0
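Re-running the t-test at every peek makes the Monte Carlo loops below slow (they can take minutes per configuration). A faster sketch of the same idea, using cumulative sums and a normal approximation in place of the Welch t-test, so its p-values differ slightly; run_experiment_and_peek_fast is my own helper, not from the original code:

from scipy.stats import norm

def run_experiment_and_peek_fast(conv_a, conv_b, n, n_min, p_threshold=0.05):
    # compute the whole path of running conversion rates in one shot
    obs_a = np.random.binomial(1, conv_a, n)
    obs_b = np.random.binomial(1, conv_b, n)
    k = np.arange(n_min, n + 1)          # sample sizes we peek at
    pa = np.cumsum(obs_a)[k - 1] / k     # running mean of arm A
    pb = np.cumsum(obs_b)[k - 1] / k     # running mean of arm B
    se = np.sqrt(pa * (1 - pa) / k + pb * (1 - pb) / k)
    z = (pa - pb) / np.where(se > 0, se, np.inf)  # guard against zero variance
    pvals = 1 - norm.cdf(z)              # one-sided: H1 is A > B
    return int(np.any(pvals < p_threshold))  # 1 if any peek rejects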

def run_mc(conv_a, conv_b, n, R, peek=False, n_min=None):
    # Monte Carlo: repeat the experiment R times and report the rejection rate
    rejection = []
    for r in tqdm(range(R)):
        if not peek:  # fixed-n design
            rejection.append(run_experiment_and_test(conv_a, conv_b, n))
        else:         # peeking design
            rejection.append(run_experiment_and_peek(conv_a, conv_b, n, n_min))
    return np.mean(rejection)

R = 2000 # number of MC replications
conv_a = .2
conv_b = .2
print(f'Rejection rates across {R} Monte Carlo replications: \n')
for n in [10, 100, 1000, 2000]:
    r = run_mc(conv_a, conv_b, n, R)
    print(f'n={n}, rejection rate = {r}')
Rejection rates across 2000 Monte Carlo replications: 

n=10, rejection rate = 0.04
n=100, rejection rate = 0.0485
n=1000, rejection rate = 0.054
n=2000, rejection rate = 0.0585
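These estimates are consistent with the nominal 5% level: with R = 2000 replications, the Monte Carlo standard error of a 5% rejection rate is sqrt(0.05 * 0.95 / 2000) ≈ 0.005, so anything between roughly 4% and 6% is within two standard errors of the target.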

Now let’s see what happens when there actually is a difference between the conversion rates. Keep in mind that the test is one-sided with alternative 'greater' (A better than B), while here B is truly better, so the rejection rate should fall towards zero as n grows rather than rise

R = 2000 # number of MC replications
conv_a = .2
conv_b = .25
print(f'Rejection rates across {R} Monte Carlo replications: \n')
for n in [10, 100, 1000, 2000]:
    r = run_mc(conv_a, conv_b, n, R)
    print(f'n={n}, rejection rate = {r}')
Rejection rates across 2000 Monte Carlo replications: 

n=10, rejection rate = 0.025
n=100, rejection rate = 0.0065
n=1000, rejection rate = 0.0
n=2000, rejection rate = 0.0
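For contrast, here's a quick sketch (not part of the original runs) that points the one-sided test the other way, asking whether B beats A; the helper name is my own:

# hypothetical check: swap the arguments so 'greater' means mean(b) > mean(a)
def run_experiment_and_test_b_beats_a(conv_a, conv_b, n, p_threshold=0.05):
    obs_a = np.random.binomial(1, conv_a, n)
    obs_b = np.random.binomial(1, conv_b, n)
    p = t_test(obs_b, obs_a, 'greater')
    return 1 if p < p_threshold else 0

rejections = [run_experiment_and_test_b_beats_a(.2, .25, 1000) for _ in range(2000)]
print(np.mean(rejections))  # power: should be far above the 5% null rate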

Peeking

Now let’s run an experiment where we “peek”: starting at n=100 we test at every sample size and stop the experiment as soon as the result is statistically significant

R = 500 # number of MC replications
conv_a = .2
conv_b = .2
print(f'Rejection rates across {R} Monte Carlo replications: \n')
for n in [500, 1000, 2000]:
    r = run_mc(conv_a, conv_b, n, R, peek=True, n_min=100)
    print(f'n_max={n}, rejection rate = {r}')
Rejection rates across 500 Monte Carlo replications: 

n_max=500, rejection rate = 0.21
n_max=1000, rejection rate = 0.246
n_max=2000, rejection rate = 0.312

Even though the conversion rates are identical, peeking drives the false-positive rate far above the nominal 5%, and it keeps climbing as the maximum sample size grows.
Now let’s peek when there is a true difference, again with the one-sided test pointing away from it (it asks whether A beats B, while B is actually better):

conv_a = .2
conv_b = .25
print(f'Rejection rates across {R} Monte Carlo replications: \n')
for n in [500, 1000, 2000]:
    r = run_mc(conv_a, conv_b, n, R, peek=True, n_min=100)
    print(f'n_max={n}, rejection rate = {r}')
Rejection rates across 500 Monte Carlo replications: 

n_max=500, rejection rate = 0.022
n_max=1000, rejection rate = 0.02
n_max=2000, rejection rate = 0.026

As before, the one-sided test points away from the true difference, so rejections stay rare even when peeking. The takeaway: fix the sample size and the hypothesis in advance and run the experiment to completion, otherwise the nominal error rates mean very little.
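If you do want to peek, the per-look threshold has to be corrected. A crude Bonferroni-style sketch (very conservative, and not part of the original experiment) that reuses run_experiment_and_peek under the null:

# hypothetical: split the 5% error budget across all looks (Bonferroni)
n, n_min, R = 2000, 100, 500
looks = n - n_min + 1
corrected_threshold = 0.05 / looks

rejections = [run_experiment_and_peek(.2, .2, n, n_min, p_threshold=corrected_threshold)
              for _ in tqdm(range(R))]
print(np.mean(rejections))  # false-positive rate should now sit at or below 5%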