import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from scipy.stats import pearsonr
from itertools import combinations
import statsmodels.api as sm
import random
from sklearn import metrics
from ggplot import *
_SEED = 314153
pd.options.display.max_rows = 50
pd.options.display.max_columns = 999
data = pd.read_excel("cleaned_data.xlsx")
data.head()
# Set the independent and dependent variables
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1']
dep_var = ['WIN_MATCH']
data_winners = data[data.WIN_MATCH==1].copy(deep=True).reset_index(drop=True)
data_losers = data[data.WIN_MATCH==0].copy(deep=True).reset_index(drop=True)
random.seed(_SEED)
all_match_nos = list(set(data.MATCH))
winner_match_nos = [m for m in all_match_nos if random.random() >= 0.50]
loser_match_nos = [m for m in all_match_nos if m not in winner_match_nos]
data_winners = data_winners[data_winners.MATCH.isin(winner_match_nos)].reset_index(drop=True)
data_losers = data_losers[data_losers.MATCH.isin(loser_match_nos)].reset_index(drop=True)
# Let's combine the winner and loser rows into a single trainable dataset
train_data = data_winners.append(data_losers, ignore_index=True).reset_index(drop=True)
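# Quick sanity check (a sketch; assumes the cleaned data holds exactly one winner
# row and one loser row per MATCH): after the random split, each match should
# contribute a single row -- the winner's row or the loser's row, never both.
assert train_data.MATCH.is_unique
assert len(winner_match_nos) + len(loser_match_nos) == len(all_match_nos)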
train_data['intercept'] = 1.0
ind_vars.append("intercept")
logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])
result = logit_model.fit()
result.summary()
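# To read the coefficients as odds ratios (a sketch, not part of the original
# output): exponentiate the parameters and, optionally, their confidence bounds
# via the standard statsmodels conf_int() method.
odds_ratios = np.exp(result.params)
odds_ci = np.exp(result.conf_int())  # 95% CI on the odds-ratio scale
print odds_ratios
print odds_ci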
def extract_match_data(matches_file, points_file):
    pbp_matches = pd.read_csv(matches_file)
    pbp_points = pd.read_csv(points_file)
    # Average serve speed, keyed by (match_id, player#) tuple
    speed_mph_dict = dict(pbp_points.groupby(['match_id','PointServer'])['Speed_MPH'].mean())
    errors = pbp_points.groupby(['match_id'])[['P1UnfErr','P2UnfErr','P1ForcedError','P2ForcedError']].sum()
    # Unforced/forced error totals, keyed by (match_id, player#) tuple
    unf_errors_dict = dict()
    for_errors_dict = dict()
    for match_id, row in errors.iterrows():
        unf_errors_dict[(match_id, 1.0)] = row.P1UnfErr
        unf_errors_dict[(match_id, 2.0)] = row.P2UnfErr
        for_errors_dict[(match_id, 1.0)] = row.P1ForcedError
        for_errors_dict[(match_id, 2.0)] = row.P2ForcedError
    pbp_matches['player1_speed'] = 0.0
    pbp_matches['player2_speed'] = 0.0
    pbp_matches['player1_unferr'] = 0.0
    pbp_matches['player2_unferr'] = 0.0
    pbp_matches['player1_forerr'] = 0.0
    pbp_matches['player2_forerr'] = 0.0
    for i in range(0, pbp_matches.shape[0]):
        player1_key = (pbp_matches.match_id[i], 1.0)
        player2_key = (pbp_matches.match_id[i], 2.0)
        pbp_matches.set_value(i, 'player1_speed', speed_mph_dict.get(player1_key))
        pbp_matches.set_value(i, 'player2_speed', speed_mph_dict.get(player2_key))
        pbp_matches.set_value(i, 'player1_unferr', unf_errors_dict.get(player1_key))
        pbp_matches.set_value(i, 'player2_unferr', unf_errors_dict.get(player2_key))
        pbp_matches.set_value(i, 'player1_forerr', for_errors_dict.get(player1_key))
        pbp_matches.set_value(i, 'player2_forerr', for_errors_dict.get(player2_key))
    return pbp_matches
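# A vectorized alternative to the per-row dictionary lookups above (a sketch,
# assuming PointServer is coded 1/2 and the same CSV columns as in
# extract_match_data; shown for the serve-speed column only and not used below).
def extract_match_speeds(matches_file, points_file):
    pbp_matches = pd.read_csv(matches_file)
    pbp_points = pd.read_csv(points_file)
    # mean serve speed per (match_id, server), reshaped to one column per server
    speed_wide = (pbp_points.groupby(['match_id', 'PointServer'])['Speed_MPH']
                  .mean().unstack())
    speed_wide = speed_wide.rename(columns={1: 'player1_speed', 2: 'player2_speed'})
    return pbp_matches.merge(speed_wide[['player1_speed', 'player2_speed']],
                             how='left', left_on='match_id', right_index=True)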
# Load the point-by-point data for every slam from 2011 to 2015
slams = ['ausopen','usopen','frenchopen','wimbledon']
years = [2011,2012,2013,2014,2015]
all_pbp_data = pd.DataFrame()
for y in years:
    for s in slams:
        matches_file = "pbp/" + str(y) + "-" + s + "-matches.csv"
        points_file = "pbp/" + str(y) + "-" + s + "-points.csv"
        match = extract_match_data(matches_file, points_file)
        all_pbp_data = all_pbp_data.append(match, ignore_index=True).reset_index(drop=True)
set(all_pbp_data.slam)
# We need to join the original data table to the pbp data table.
# The only available join key is (slam, year, player name), so create the slam and year columns in the original table.
usopen = set([
"Men'sUSOpen",
"Men'sUSOpen.",
"Men'sUSOpen.html",
"Women'sUSOpen",
"Women'sUSOpen.",
"Women'sUSOpen.html",
"WomensUSOpen",
"WomensUSOpen.html"
])
ausopen = set([
"Men'sAustralianOpen",
"Men'sAustralianOpen.",
"MensAustralianOpen",
"MensAustralianOpen.html",
"Women'sAustralianOpen",
"Women'sAustralianOpen.",
"WomensAustralianOpen",
"WomensAustralianOpen.html"
])
fropen = set([
"Men'sFrenchOpen",
"Men'sFrenchOpen.",
"MensFrenchOpen",
"MensFrenchOpen.html",
"Women'sFrenchOpen",
"Women'sFrenchOpen.",
"WomensFrenchOpen",
"WomensFrenchOpen.html"
])
wimbl = set([
"Gentlemen'sWimbledonSingles",
"Gentlemen'sWimbledonSingles.",
"Gentlemen'sWimbledonSingles.html",
"Ladies'WimbledonSingles",
"Ladies'WimbledonSingles.",
"Ladies'WimbledonSingles2013",
"Ladies'WimbledonSingles2013.html"
])
data['slam'] = ""
data['year'] = 0
for i in range(0, data.shape[0]):
    y = data.DATE[i].year
    data.set_value(i, "year", y)
    s = ""
    if data.TOURNAMENT[i] in usopen:
        s = 'usopen'
    elif data.TOURNAMENT[i] in ausopen:
        s = 'ausopen'
    elif data.TOURNAMENT[i] in fropen:
        s = 'frenchopen'
    elif data.TOURNAMENT[i] in wimbl:
        s = 'wimbledon'
    else:
        s = ""
    data.set_value(i, "slam", s)
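# The same labelling, vectorized (a sketch; recomputes the two columns built by
# the loop above, assuming read_excel parsed DATE as a datetime column):
slam_lookup = {name: 'usopen' for name in usopen}
slam_lookup.update({name: 'ausopen' for name in ausopen})
slam_lookup.update({name: 'frenchopen' for name in fropen})
slam_lookup.update({name: 'wimbledon' for name in wimbl})
data['slam'] = data.TOURNAMENT.map(slam_lookup).fillna("")
data['year'] = data.DATE.dt.year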
# First, build per-player views of the pbp table (player 1 and player 2 separately), then stack them
all_pbp_data_1 = all_pbp_data[['year','slam','player1','player1_speed','player1_unferr','player1_forerr']].rename(
    columns={'player1': 'PLAYER_NAME', 'player1_speed': 'speed_mph',
             'player1_unferr': 'num_unf_err', 'player1_forerr': 'num_for_err'})
all_pbp_data_2 = all_pbp_data[['year','slam','player2','player2_speed','player2_unferr','player2_forerr']].rename(
    columns={'player2': 'PLAYER_NAME', 'player2_speed': 'speed_mph',
             'player2_unferr': 'num_unf_err', 'player2_forerr': 'num_for_err'})
# Stack the two views into one long table, one row per (match, player)
all_pbp_data_1 = all_pbp_data_1.append(all_pbp_data_2, ignore_index=True).reset_index(drop=True)
data['speed_mph_pergame'] = None
data['num_unf_err_pergame'] = None
data['num_for_err_pergame'] = None
#keyed by yr,slam,player
speed_dict = dict()
unf_dict = dict()
f_dict = dict()
by_player = all_pbp_data_1.groupby(['year','slam','PLAYER_NAME'])[['speed_mph','num_unf_err','num_for_err']].mean()
for key, row in by_player.iterrows():
    speed_dict[key] = row.speed_mph
    unf_dict[key] = row.num_unf_err
    f_dict[key] = row.num_for_err
for i in range(data.shape[0]):
    key = (data.year[i], data.slam[i], data.PLAYER_NAME[i])
    data.set_value(i, "speed_mph_pergame", speed_dict.get(key))
    data.set_value(i, "num_unf_err_pergame", unf_dict.get(key))
    data.set_value(i, "num_for_err_pergame", f_dict.get(key))
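# Merge-based alternative (a sketch): the same per-(year, slam, player) averages
# attached in a single join instead of the dictionary lookups above. Builds a
# new frame (data_merged, an illustrative name) rather than mutating data; not used below.
pergame_cols = {'speed_mph': 'speed_mph_pergame',
                'num_unf_err': 'num_unf_err_pergame',
                'num_for_err': 'num_for_err_pergame'}
data_merged = data.drop(list(pergame_cols.values()), axis=1).merge(
    by_player.reset_index().rename(columns=pergame_cols),
    how='left', on=['year', 'slam', 'PLAYER_NAME'])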
data['IS_SLAM'] = 0
data['IS_MENS'] = 0
data['IS_TOP30'] = 0
for i in range(data.shape[0]):
    if len(data.slam[i]) > 0:
        data.set_value(i, 'IS_SLAM', 1)
    if data.TOUR[i] == 'ATP':
        data.set_value(i, 'IS_MENS', 1)
    if data.PLAYER_RNK[i] <= 30:
        data.set_value(i, 'IS_TOP30', 1)
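# The same indicator flags, vectorized (a sketch; recomputes the columns set by
# the loop above):
data['IS_SLAM'] = (data.slam.str.len() > 0).astype(int)
data['IS_MENS'] = (data.TOUR == 'ATP').astype(int)
data['IS_TOP30'] = (data.PLAYER_RNK <= 30).astype(int)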
data_winners = data[data.WIN_MATCH==1].copy(deep=True).reset_index(drop=True)
data_losers = data[data.WIN_MATCH==0].copy(deep=True).reset_index(drop=True)
random.seed(_SEED)
all_match_nos = list(set(data.MATCH))
winner_match_nos = [m for m in all_match_nos if random.random() >= 0.50]
loser_match_nos = [m for m in all_match_nos if m not in winner_match_nos]
data_winners = data_winners[data_winners.MATCH.isin(winner_match_nos)].reset_index(drop=True)
data_losers = data_losers[data_losers.MATCH.isin(loser_match_nos)].reset_index(drop=True)
combined_data = data_winners.append(data_losers, ignore_index=True).reset_index(drop=True)
data_train = combined_data[combined_data.year != 2015].copy(deep=True).reset_index(drop=True)
data_test = combined_data[combined_data.year == 2015].copy(deep=True).reset_index(drop=True)
train_data = data_train.copy(deep=True).reset_index(drop=True)
train_data['intercept'] = 1.0
dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','IS_SLAM','IS_MENS','IS_TOP30','intercept']
logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])
result = logit_model.fit()
result.summary()
test_data = data_test.copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])
fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
geom_line() +\
geom_abline(linetype='dashed')
auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
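# Equivalent ROC plot with matplotlib (a sketch; avoids the unmaintained yhat
# ggplot port and reuses the fpr/tpr/auc computed above):
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label='logit (AUC = %0.3f)' % auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance diagonal
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()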
train_data = data_train[data_train.IS_MENS==1].copy(deep=True).reset_index(drop=True)
train_data['intercept'] = 1.0
dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','intercept']
logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])
result = logit_model.fit()
result.summary()
test_data = data_test[data_test.IS_MENS==1].copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])
fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
geom_line() +\
geom_abline(linetype='dashed')
auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
train_data = data_train[data_train.IS_MENS==0].copy(deep=True).reset_index(drop=True)
train_data['intercept'] = 1.0
dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','intercept']
logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])
result = logit_model.fit()
result.summary()
test_data = data_test[data_test.IS_MENS==0].copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])
fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
geom_line() +\
geom_abline(linetype='dashed')
auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
train_data = data_train[data_train.IS_TOP30==1].copy(deep=True).reset_index(drop=True)
train_data['intercept'] = 1.0
dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','intercept']
logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])
result = logit_model.fit()
result.summary()
test_data = data_test[data_test.IS_TOP30==1].copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])
fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
geom_line() +\
geom_abline(linetype='dashed')
auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
train_data = data_train[data_train.IS_TOP30==0].copy(deep=True).reset_index(drop=True)
train_data['intercept'] = 1.0
dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','intercept']
logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])
result = logit_model.fit()
result.summary()
test_data = data_test[data_test.IS_TOP30==0].copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])
fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
geom_line() +\
geom_abline(linetype='dashed')
auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
train_data = data_train[data_train.IS_SLAM==1].copy(deep=True).reset_index(drop=True)
train_data['speed_None'] = 0.0
train_data['unf_err_None'] = 0.0
train_data['for_err_None'] = 0.0
for i in range(train_data.shape[0]):
    if pd.isnull(train_data.speed_mph_pergame[i]) or train_data.speed_mph_pergame[i] == "":
        train_data.set_value(i, 'speed_mph_pergame', 0.0)
        train_data.set_value(i, 'speed_None', 1.0)
    if pd.isnull(train_data.num_unf_err_pergame[i]) or train_data.num_unf_err_pergame[i] == "":
        train_data.set_value(i, 'num_unf_err_pergame', 0.0)
        train_data.set_value(i, 'unf_err_None', 1.0)
    if pd.isnull(train_data.num_for_err_pergame[i]) or train_data.num_for_err_pergame[i] == "":
        train_data.set_value(i, 'num_for_err_pergame', 0.0)
        train_data.set_value(i, 'for_err_None', 1.0)
train_data[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']] = train_data[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']].apply(pd.to_numeric)
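# The same missing-value handling, vectorized (a sketch, shown on a fresh copy
# named demo -- an illustrative name -- so the columns built above are left
# untouched): flag rows with missing per-game stats, then impute zeros.
demo = data_train[data_train.IS_SLAM==1].copy(deep=True).reset_index(drop=True)
for col, flag in [('speed_mph_pergame', 'speed_None'),
                  ('num_unf_err_pergame', 'unf_err_None'),
                  ('num_for_err_pergame', 'for_err_None')]:
    missing = demo[col].isnull() | (demo[col] == "")
    demo[flag] = missing.astype(float)
    demo.loc[missing, col] = 0.0
demo[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']] = \
    demo[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']].apply(pd.to_numeric)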
train_data['intercept'] = 1.0
dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','speed_mph_pergame',
'num_for_err_pergame','num_unf_err_pergame','intercept']
logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])
result = logit_model.fit()
result.summary()
test_data = data_test[data_test.IS_SLAM==1].copy(deep=True).reset_index(drop=True)
test_data['speed_None'] = 0.0
test_data['unf_err_None'] = 0.0
test_data['for_err_None'] = 0.0
for i in range(test_data.shape[0]):
    if pd.isnull(test_data.speed_mph_pergame[i]) or test_data.speed_mph_pergame[i] == "":
        test_data.set_value(i, 'speed_mph_pergame', 0.0)
        test_data.set_value(i, 'speed_None', 1.0)
    if pd.isnull(test_data.num_unf_err_pergame[i]) or test_data.num_unf_err_pergame[i] == "":
        test_data.set_value(i, 'num_unf_err_pergame', 0.0)
        test_data.set_value(i, 'unf_err_None', 1.0)
    if pd.isnull(test_data.num_for_err_pergame[i]) or test_data.num_for_err_pergame[i] == "":
        test_data.set_value(i, 'num_for_err_pergame', 0.0)
        test_data.set_value(i, 'for_err_None', 1.0)
test_data[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']] = test_data[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']].apply(pd.to_numeric)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])
fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
geom_line() +\
geom_abline(linetype='dashed')
auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_GAME_1','SERVE_1','speed_mph_pergame',
'num_for_err_pergame','num_unf_err_pergame','intercept']
logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])
result = logit_model.fit()
result.summary()
preds = result.predict(test_data[ind_vars])
fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
geom_line() +\
geom_abline(linetype='dashed')
auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)