import pods
import numpy as np
import scipy as sp
import mlai
from matplotlib import pyplot as plt
# Fit polynomial models of increasing order to the *entire* Olympic
# marathon data set, tracking the training error (per-point sum of
# squares) and the log likelihood for each model order, and plotting
# each fit as it is produced.
f, ax = plt.subplots(1, 2, figsize=(14, 7))
basis = mlai.polynomial
data = pods.datasets.olympic_marathon_men()
x = data['X']
y = data['Y']
num_data = x.shape[0]
data_limits = [1892, 2020]
# Allow up to one basis function per data point (interpolating limit).
max_basis = y.shape[0]
ll = np.full(max_basis, np.nan)
sum_squares = np.full(max_basis, np.nan)
for num_basis in range(1, max_basis + 1):
    # One linear-in-the-parameters model per basis size, fit to all data.
    model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
    model.fit()
    sum_squares[num_basis - 1] = model.objective() / num_data
    ll[num_basis - 1] = model.log_likelihood()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=sum_squares, objective_ylim=[0, 0.3],
                           fig=f, ax=ax)
# Hold-out validation: train on the first `val_start` points and score
# on the remaining (later) years, so validation error measures
# extrapolation beyond the training range.
f, ax = plt.subplots(1, 2, figsize=(14, 7))
val_start = 20
x = data['X'][:val_start, :]
x_val = data['X'][val_start:, :]
y = data['Y'][:val_start, :]
y_val = data['Y'][val_start:, :]
num_val_data = x_val.shape[0]
max_basis = 7
ll = np.full(max_basis, np.nan)
ss = np.full(max_basis, np.nan)
ss_val = np.full(max_basis, np.nan)
for num_basis in range(1, max_basis + 1):
    model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
    model.fit()
    ss[num_basis - 1] = model.objective()
    # Mean squared error on the held-out points.
    f_val = model.predict(x_val)
    ss_val[num_basis - 1] = ((y_val - f_val) ** 2).mean()
    ll[num_basis - 1] = model.log_likelihood()
    # Plot RMSE on the validation set alongside the fit.
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(ss_val), objective_ylim=[0, 0.6],
                           fig=f, ax=ax, prefix='olympic_val',
                           x_val=x_val, y_val=y_val)
# Hold-out validation with a *random* split: permute the data first so
# the validation set is interpolative rather than extrapolative.
f, ax = plt.subplots(1, 2, figsize=(14, 7))
val_start = 20
perm = np.random.permutation(data['X'].shape[0])
x = data['X'][perm[:val_start], :]
x_val = data['X'][perm[val_start:], :]
y = data['Y'][perm[:val_start], :]
y_val = data['Y'][perm[val_start:], :]
num_val_data = x_val.shape[0]
max_basis = 7
ll = np.full(max_basis, np.nan)
ss = np.full(max_basis, np.nan)
ss_val = np.full(max_basis, np.nan)
for num_basis in range(1, max_basis + 1):
    model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
    model.fit()
    ss[num_basis - 1] = model.objective()
    # Mean squared error on the randomly held-out points.
    f_val = model.predict(x_val)
    ss_val[num_basis - 1] = ((y_val - f_val) ** 2).mean()
    ll[num_basis - 1] = model.log_likelihood()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(ss_val), objective_ylim=[0.1, 0.6],
                           fig=f, ax=ax, prefix='olympic_val_inter',
                           x_val=x_val, y_val=y_val)
# Leave-one-out cross-validation: one partition per data point, each
# holding out a single observation for validation.
f, ax = plt.subplots(1, 2, figsize=(14, 7))
num_data = data['X'].shape[0]
num_parts = num_data
partitions = []
for part in range(num_parts):
    # Train on everything except the single held-out index.
    train_ind = list(range(part)) + list(range(part + 1, num_data))
    val_ind = [part]
    partitions.append((train_ind, val_ind))
max_basis = 7
ll = np.full(max_basis, np.nan)
ss = np.full(max_basis, np.nan)
ss_val = np.full(max_basis, np.nan)
for num_basis in range(1, max_basis + 1):
    ss_val_temp = 0.
    for part, (train_ind, val_ind) in enumerate(partitions):
        x = data['X'][train_ind, :]
        x_val = data['X'][val_ind, :]
        y = data['Y'][train_ind, :]
        y_val = data['Y'][val_ind, :]
        num_val_data = x_val.shape[0]
        model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
        model.fit()
        ss[num_basis - 1] = model.objective()
        f_val = model.predict(x_val)
        ss_val_temp += ((y_val - f_val) ** 2).mean()
        # Per-fold frame: ss_val's entry for the current num_basis is
        # still NaN here, so only completed orders appear in the plot.
        mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                               objective=np.sqrt(ss_val), objective_ylim=[0.1, 0.6],
                               fig=f, ax=ax, prefix='olympic_loo' + str(part) + '_inter',
                               x_val=x_val, y_val=y_val)
    # Average validation error over all leave-one-out folds.
    ss_val[num_basis - 1] = ss_val_temp / (num_parts)
    # Clear the objective axis and replot with the completed average.
    ax[1].cla()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(ss_val), objective_ylim=[0.1, 0.6],
                           fig=f, ax=ax, prefix='olympic_loo_inter',
                           x_val=x_val, y_val=y_val)
Expected test error for different variations of the training data sampled from $\Pr(\mathbf{x}, y)$:
$$E\left[ (y - f^*(\mathbf{x}))^2 \right]$$ This decomposes as
$$E\left[ (y - f^*(\mathbf{x}))^2 \right] = \text{bias}\left[f^*(\mathbf{x})\right]^2 + \text{variance}\left[f^*(\mathbf{x})\right] + \sigma^2$$ where the bias is given by $$\text{bias}\left[f^*(\mathbf{x})\right] = E\left[f^*(\mathbf{x})\right] - f(\mathbf{x})$$ and $f(\mathbf{x})$ is the true function.
Error due to bias comes from a model that's too simple.
Given by $$\text{variance}\left[f^*(\mathbf{x})\right] = E\left[\left(f^*(\mathbf{x}) - E\left[f^*(\mathbf{x})\right]\right)^2\right]$$
Slight variations in the training set cause changes in the prediction. Error due to variance arises from a model that is overly complex.
# Five-fold cross-validation on a random permutation of the data.
f, ax = plt.subplots(1, 2, figsize=(14, 7))
num_data = data['X'].shape[0]
num_parts = 5
partitions = []
ind = list(np.random.permutation(num_data))
start = 0
for part in range(num_parts):
    # BUG FIX: the fold's upper boundary must use (part + 1). The
    # original `*part` made fold 0's validation set ind[0:0] (empty —
    # the source of numpy's "Mean of empty slice" RuntimeWarning) and
    # left the final chunk of points never validated.
    end = round((float(num_data) / num_parts) * (part + 1))
    # Training indices are everything outside [start, end).
    train_ind = ind[:start]
    train_ind.extend(ind[end:])
    val_ind = ind[start:end]
    partitions.append((train_ind, val_ind))
    start = end
max_basis = 7
ll = np.full(max_basis, np.nan)
ss = np.full(max_basis, np.nan)
ss_val = np.full(max_basis, np.nan)
for num_basis in range(1, max_basis + 1):
    ss_val_temp = 0.
    for part, (train_ind, val_ind) in enumerate(partitions):
        x = data['X'][train_ind, :]
        x_val = data['X'][val_ind, :]
        y = data['Y'][train_ind, :]
        y_val = data['Y'][val_ind, :]
        num_val_data = x_val.shape[0]
        model = mlai.LM(x, y, basis, num_basis=num_basis, data_limits=data_limits)
        model.fit()
        ss[num_basis - 1] = model.objective()
        # Accumulate this fold's mean squared validation error.
        f_val = model.predict(x_val)
        ss_val_temp += ((y_val - f_val) ** 2).mean()
        # Per-fold frame; ss_val's entry for the current num_basis is
        # still NaN here, so only completed orders appear in the plot.
        mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                               objective=np.sqrt(ss_val), objective_ylim=[0.2, 0.6],
                               fig=f, ax=ax, prefix='olympic_' + str(num_parts) + 'cv' + str(part) + '_inter',
                               x_val=x_val, y_val=y_val)
    # Average validation error across the folds for this model order.
    ss_val[num_basis - 1] = ss_val_temp / (num_parts)
    # Clear the objective axis and replot with the completed average.
    ax[1].cla()
    mlai.plot_marathon_fit(model=model, data_limits=data_limits,
                           objective=np.sqrt(ss_val), objective_ylim=[0.2, 0.6],
                           fig=f, ax=ax, prefix='olympic_' + str(num_parts) + 'cv_inter',
                           x_val=x_val, y_val=y_val)
/Users/neil/anaconda/lib/python3.4/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice. warnings.warn("Mean of empty slice.", RuntimeWarning)