Initial Setup
In [364]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from scipy.stats import pearsonr
from itertools import combinations
import statsmodels as sm
import statsmodels.api as sm
import random
from sklearn import metrics
from ggplot import *
_SEED = 314153
pd.options.display.max_rows = 50
pd.options.display.max_columns = 999
Read in cleaned tennis data Excel file from Ann and Simao
In [328]:
data = pd.read_excel("cleaned_data.xlsx")
In [333]:
data.head()
Out[333]:
DATE TOURNAMENT TOUR MATCH PLAYER_NAME PLAYER_ID PLAYER_RNK OPPONENT_RNK RNK_DIFF FAVOURITE NUM_SETS NUM_GAMES WIN_MATCH WIN_SET_1 WIN_GAME_1 SERVE_1
0 2011-07-28 ATPStudenaCroatiaOpen-ATPUmag2011 ATP 1 Olivier Rochus 103694 115 55 60 0 2 17 0 0 1 1
1 2011-07-28 ATPStudenaCroatiaOpen-ATPUmag2011 ATP 1 Fabio Fognini 104926 55 115 -60 1 2 17 1 1 0 0
2 2011-07-28 ATPStudenaCroatiaOpen-ATPUmag2011 ATP 2 Robin Haase 104898 65 14 51 0 3 29 0 1 1 1
3 2011-07-28 ATPStudenaCroatiaOpen-ATPUmag2011 ATP 2 Marin Cilic 105227 14 65 -51 1 3 29 1 0 0 0
4 2011-07-28 CreditAgricoleSuisseOpenGstaad-ATPGstaad2011 ATP 3 Matthias Bachinger 104897 96 43 53 0 2 20 0 0 1 1
In [ ]:
#Setting indp and dep variables
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1']
dep_var = ['WIN_MATCH']
First, let's duplicate Ann and Simao's logistic regressions.
To do that, we cannot use all of the data since one is a complement of the other -
let's pick some winners, and some losers, but not from the same match
In [31]:
data_winners = data[data.WIN_MATCH==1].copy(deep=True).reset_index(drop=True)
data_losers = data[data.WIN_MATCH==0].copy(deep=True).reset_index(drop=True)

random.seed(_SEED)
all_match_nos = list(set(data.MATCH))
winner_match_nos = [m for m in all_match_nos if random.random() >= 0.50]
loser_match_nos = [m for m in all_match_nos if m not in winner_match_nos]

data_winners = data_winners[data_winners.MATCH.isin(winner_match_nos)].reset_index(drop=True)
data_losers = data_losers[data_losers.MATCH.isin(loser_match_nos)].reset_index(drop=True)
In [32]:
#Lets combine both winners and losers to make a trainable dataset

train_data = data_winners.append(data_losers, ignore_index=True).reset_index(drop=True)

train_data['intercept'] = 1.0

ind_vars.append("intercept")

logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])

result = logit_model.fit()
Optimization terminated successfully.
         Current function value: 0.457517
         Iterations 6
In [33]:
result.summary()
Out[33]:
Logit Regression Results
Dep. Variable: WIN_MATCH No. Observations: 18633
Model: Logit Df Residuals: 18627
Method: MLE Df Model: 5
Date: Sat, 03 Dec 2016 Pseudo R-squ.: 0.3399
Time: 12:33:30 Log-Likelihood: -8524.9
converged: True LL-Null: -12914.
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
RNK_DIFF -0.0004 0.000 -2.828 0.005 -0.001 -0.000
FAVOURITE 0.7304 0.050 14.511 0.000 0.632 0.829
WIN_SET_1 2.9445 0.040 73.938 0.000 2.866 3.023
WIN_GAME_1 0.2135 0.045 4.777 0.000 0.126 0.301
SERVE_1 -0.0081 0.044 -0.185 0.853 -0.094 0.078
intercept -1.9210 0.043 -44.978 0.000 -2.005 -1.837
Okay, those results approximately match what Ann and Simao had. Now, let's try to add more variables.
Jeff Sackmann has point-by-point data for grand slams, where we can get average serves and other info
All I am doing here is extracting Serve Speed, # Unforced Errors, and # Forced Errors per player for all grand slams.
In [211]:
def extract_match_data(matches_file,points_file):
    pbp_matches = pd.read_csv(matches_file)
    pbp_points = pd.read_csv(points_file)

    #keyed by match_id,player# tuple
    speed_mph_dict = dict(pbp_points.groupby(['match_id','PointServer'])['Speed_MPH'].mean())

    errors = pbp_points.groupby(['match_id'])['P1UnfErr','P2UnfErr','P1ForcedError','P2ForcedError'].sum()

    #keyed by match_id,player# tuple
    unf_errors_dict = dict()
    for_errors_dict = dict()

    for i in errors.iterrows():
        key = (i[0],1.0)
        unf_errors_dict[key] = i[1].P1UnfErr
        key = (i[0],2.0)
        unf_errors_dict[key] = i[1].P2UnfErr

        key = (i[0],1.0)
        for_errors_dict[key] = i[1].P1ForcedError
        key = (i[0],2.0)
        for_errors_dict[key] = i[1].P2ForcedError

    pbp_matches['player1_speed'] = 0.0
    pbp_matches['player2_speed'] = 0.0
    pbp_matches['player1_unferr'] = 0.0
    pbp_matches['player2_unferr'] = 0.0
    pbp_matches['player1_forerr'] = 0.0
    pbp_matches['player2_forerr'] = 0.0

    for i in range(0,pbp_matches.shape[0]):
        player1_key = (pbp_matches.match_id[i],1.0)
        player2_key = (pbp_matches.match_id[i],2.0)
        pbp_matches.set_value(i,'player1_speed',speed_mph_dict[player1_key] if player1_key in speed_mph_dict else None)
        pbp_matches.set_value(i,'player2_speed',speed_mph_dict[player2_key] if player2_key in speed_mph_dict else None)
        pbp_matches.set_value(i,'player1_unferr',unf_errors_dict[player1_key] if player1_key in unf_errors_dict else None)
        pbp_matches.set_value(i,'player2_unferr',unf_errors_dict[player2_key] if player2_key in unf_errors_dict else None)
        pbp_matches.set_value(i,'player1_forerr',for_errors_dict[player1_key] if player1_key in for_errors_dict else None)
        pbp_matches.set_value(i,'player2_forerr',for_errors_dict[player2_key] if player2_key in for_errors_dict else None)
    
    return pbp_matches
In [219]:
#Get the data for all years from 2011-2015

slams = ['ausopen','usopen','frenchopen','wimbledon']
years = [2011,2012,2013,2014,2015]

all_pbp_data = pd.DataFrame()

for y in years:
    for s in slams:
        matches_file = "pbp/" + str(y) + "-" + s + "-" + "matches.csv"
        points_file = "pbp/" + str(y) + "-" + s + "-" + "points.csv"
        match = extract_match_data(matches_file,points_file)
        all_pbp_data = all_pbp_data.append(match, ignore_index=True).reset_index(drop=True)
In [240]:
set(all_pbp_data.slam)
Out[240]:
{'ausopen', 'frenchopen', 'usopen', 'wimbledon'}
In [334]:
#We need to join the original data table to the pbp data table.

#The only way possible to join is slam, year, and player. So let's create that column in the original data table.
usopen = set([
    "Men'sUSOpen",
    "Men'sUSOpen.",
    "Men'sUSOpen.html",
    "Women'sUSOpen",
    "Women'sUSOpen.",
    "Women'sUSOpen.html",
    "WomensUSOpen",
    "WomensUSOpen.html"
])
ausopen = set([
    "Men'sAustralianOpen"
    "Men'sAustralianOpen.",
    "MensAustralianOpen",
    "MensAustralianOpen.html",
    "Women'sAustralianOpen",
    "Women'sAustralianOpen.",
    "WomensAustralianOpen",
    "WomensAustralianOpen.html"
])
fropen = set([
    "Men'sFrenchOpen",
    "Men'sFrenchOpen.",
    "MensFrenchOpen",
    "MensFrenchOpen.html",
    "Women'sFrenchOpen",
    "Women'sFrenchOpen.",
    "WomensFrenchOpen",
    "WomensFrenchOpen.html"
])
wimbl = set([
    "Gentlemen'sWimbledonSingles",
    "Gentlemen'sWimbledonSingles.",
    "Gentlemen'sWimbledonSingles.html",
    "Ladies'WimbledonSingles",
    "Ladies'WimbledonSingles.",
    "Ladies'WimbledonSingles2013",
    "Ladies'WimbledonSingles2013.html"
])

data['slam'] = ""
data['year'] = 0

for i in range(0,data.shape[0]):
    y = data.DATE[i].year
    data.set_value(i,"year",y)
    
    s = ""
    if data.TOURNAMENT[i] in usopen:
        s = 'usopen'
    elif data.TOURNAMENT[i] in ausopen:
        s = 'ausopen'
    elif data.TOURNAMENT[i] in fropen:
        s = 'frenchopen'
    elif data.TOURNAMENT[i] in wimbl:
        s = 'wimbledon'
    else:
        s = ""
    
    data.set_value(i,"slam",s)
In [335]:
#First, lets join the pbp table by player 1, and then separately by player 2

cols = ['year','slam','player1','player1_speed','player1_unferr','player1_forerr']
all_pbp_data_1 = all_pbp_data[cols]
cols[2] = 'PLAYER_NAME'
cols[3] = 'speed_mph'
cols[4] = 'num_unf_err'
cols[5] = 'num_for_err'
all_pbp_data_1.columns = cols

cols = ['year','slam','player2','player2_speed','player2_unferr','player2_forerr']
all_pbp_data_2 = all_pbp_data[cols]
cols[2] = 'PLAYER_NAME'
cols[3] = 'speed_mph'
cols[4] = 'num_unf_err'
cols[5] = 'num_for_err'
all_pbp_data_2.columns = cols

all_pbp_data_1.append(all_pbp_data_1,ignore_index=True).reset_index(drop=True)

data['speed_mph_pergame'] = None
data['num_unf_err_pergame'] = None
data['num_for_err_pergame'] = None

#keyed by yr,slam,player
speed_dict = dict()
unf_dict = dict()
f_dict  = dict()

by_player = all_pbp_data_1.groupby(['year','slam','PLAYER_NAME'])['speed_mph','num_unf_err','num_for_err'].mean()

for i in by_player.iterrows():
    key = i[0]
    speed_dict[key] = i[1].speed_mph
    unf_dict[key] = i[1].num_unf_err
    f_dict[key] = i[1].num_for_err

for i in range(data.shape[0]):
    key = (data.year[i],data.slam[i],data.PLAYER_NAME[i])
    data.set_value(i,"speed_mph_pergame",speed_dict[key] if key in speed_dict else None)
    data.set_value(i,"num_unf_err_pergame",unf_dict[key] if key in unf_dict else None)
    data.set_value(i,"num_for_err_pergame",f_dict[key] if key in f_dict else None)

Finally ready for more regressions.

Lets create some more variables: slam VS non-slam, women VS men, and top 20 VS non top 20
In [517]:
data['IS_SLAM'] = 0
data['IS_MENS'] = 0
data['IS_TOP30'] = 0

for i in range(data.shape[0]):
    if len(data.slam[i]) > 0:
        data.set_value(i,'IS_SLAM',1)
    
    if data.TOUR[i] == 'ATP':
        data.set_value(i,'IS_MENS',1)
    
    if data.PLAYER_RNK[i] <= 30:
        data.set_value(i,'IS_TOP30',1)
Okay - let's create a holdout set for 2015. This will be the test set for all of our regressions.
In [339]:
data_winners = data[data.WIN_MATCH==1].copy(deep=True).reset_index(drop=True)
data_losers = data[data.WIN_MATCH==0].copy(deep=True).reset_index(drop=True)

random.seed(_SEED)
all_match_nos = list(set(data.MATCH))
winner_match_nos = [m for m in all_match_nos if random.random() >= 0.50]
loser_match_nos = [m for m in all_match_nos if m not in winner_match_nos]

data_winners = data_winners[data_winners.MATCH.isin(winner_match_nos)].reset_index(drop=True)
data_losers = data_losers[data_losers.MATCH.isin(loser_match_nos)].reset_index(drop=True)

combined_data = data_winners.append(data_losers, ignore_index=True).reset_index(drop=True)

data_train = combined_data[combined_data.year <> 2015].copy(deep=True).reset_index(drop=True)
data_test = combined_data[combined_data.year == 2015].copy(deep=True).reset_index(drop=True)
Logistic Regression 1: All Variables (except slam-specific ones)
In [518]:
train_data = data_train.copy(deep=True).reset_index(drop=True)

train_data['intercept'] = 1.0

dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','IS_SLAM','IS_MENS','IS_TOP30','intercept']

logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])

result = logit_model.fit()
Optimization terminated successfully.
         Current function value: 0.451315
         Iterations 6
In [411]:
result.summary()
Out[411]:
Logit Regression Results
Dep. Variable: WIN_MATCH No. Observations: 14435
Model: Logit Df Residuals: 14426
Method: MLE Df Model: 8
Date: Sun, 04 Dec 2016 Pseudo R-squ.: 0.3487
Time: 01:22:14 Log-Likelihood: -6514.7
converged: True LL-Null: -10003.
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
RNK_DIFF -0.0007 0.000 -4.091 0.000 -0.001 -0.000
FAVOURITE 0.6409 0.060 10.645 0.000 0.523 0.759
WIN_SET_1 2.9603 0.046 64.658 0.000 2.871 3.050
WIN_GAME_1 0.1909 0.051 3.738 0.000 0.091 0.291
SERVE_1 -0.0079 0.050 -0.158 0.875 -0.106 0.090
IS_SLAM 0.0089 0.059 0.152 0.879 -0.106 0.124
IS_MENS -0.0057 0.044 -0.127 0.899 -0.093 0.081
IS_TOP30 0.3008 0.057 5.285 0.000 0.189 0.412
intercept -1.9357 0.054 -35.569 0.000 -2.042 -1.829
In [412]:
test_data = data_test.copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])
In [413]:
fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
    geom_line() +\
    geom_abline(linetype='dashed')

auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
AUC: 0.842683880398
Odds:
RNK_DIFF       0.999315
FAVOURITE      1.898247
WIN_SET_1     19.304489
WIN_GAME_1     1.210364
SERVE_1        0.992140
IS_SLAM        1.008983
IS_MENS        0.994363
IS_TOP30       1.350961
intercept      0.144318
dtype: float64
That's a pretty good model! Now obviously, we're including things like rank differential and whether they're
ranked in the top 30 which is going to bias it. Let's split the analysis. First up: mens VS womens
Logistic Regression 2: Men Only
In [415]:
train_data = data_train[data_train.IS_MENS==1].copy(deep=True).reset_index(drop=True)

train_data['intercept'] = 1.0

dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','intercept']

logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])

result = logit_model.fit()
Optimization terminated successfully.
         Current function value: 0.457472
         Iterations 6
In [416]:
result.summary()
Out[416]:
Logit Regression Results
Dep. Variable: WIN_MATCH No. Observations: 7059
Model: Logit Df Residuals: 7053
Method: MLE Df Model: 5
Date: Sun, 04 Dec 2016 Pseudo R-squ.: 0.3396
Time: 01:22:52 Log-Likelihood: -3229.3
converged: True LL-Null: -4889.6
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
RNK_DIFF -0.0007 0.000 -3.126 0.002 -0.001 -0.000
FAVOURITE 0.8285 0.079 10.429 0.000 0.673 0.984
WIN_SET_1 2.9045 0.065 44.550 0.000 2.777 3.032
WIN_GAME_1 0.1566 0.081 1.934 0.053 -0.002 0.315
SERVE_1 -0.0271 0.079 -0.342 0.733 -0.182 0.128
intercept -1.9041 0.068 -28.026 0.000 -2.037 -1.771
In [417]:
test_data = data_test[data_test.IS_MENS==1].copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])

fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
    geom_line() +\
    geom_abline(linetype='dashed')

auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
AUC: 0.846633739124
Odds:
RNK_DIFF       0.999302
FAVOURITE      2.289823
WIN_SET_1     18.256114
WIN_GAME_1     1.169499
SERVE_1        0.973283
intercept      0.148959
dtype: float64
Logistic Regression 3: Women Only
In [419]:
train_data = data_train[data_train.IS_MENS==0].copy(deep=True).reset_index(drop=True)

train_data['intercept'] = 1.0

dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','intercept']

logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])

result = logit_model.fit()
Optimization terminated successfully.
         Current function value: 0.446660
         Iterations 6
In [420]:
result.summary()
Out[420]:
Logit Regression Results
Dep. Variable: WIN_MATCH No. Observations: 7376
Model: Logit Df Residuals: 7370
Method: MLE Df Model: 5
Date: Sun, 04 Dec 2016 Pseudo R-squ.: 0.3556
Time: 01:23:39 Log-Likelihood: -3294.6
converged: True LL-Null: -5112.6
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
RNK_DIFF -0.0007 0.000 -2.675 0.007 -0.001 -0.000
FAVOURITE 0.6444 0.084 7.659 0.000 0.479 0.809
WIN_SET_1 3.0346 0.064 47.138 0.000 2.908 3.161
WIN_GAME_1 0.2388 0.067 3.540 0.000 0.107 0.371
SERVE_1 0.0433 0.066 0.657 0.511 -0.086 0.172
intercept -1.9589 0.070 -27.919 0.000 -2.096 -1.821
In [421]:
test_data = data_test[data_test.IS_MENS==0].copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])

fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
    geom_line() +\
    geom_abline(linetype='dashed')

auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
AUC: 0.831429191465
Odds:
RNK_DIFF       0.999325
FAVOURITE      1.904808
WIN_SET_1     20.792594
WIN_GAME_1     1.269761
SERVE_1        1.044263
intercept      0.141012
dtype: float64
Conclusion: Splitting by Mens or Womens yields the exact same results, but which a higher emphasis on winning the
first set. This makes sense, because the women's matches tend to be best of 3.
Logistic Regression 4: Top30 Ranked Players Only
In [520]:
train_data = data_train[data_train.IS_TOP30==1].copy(deep=True).reset_index(drop=True)

train_data['intercept'] = 1.0

dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','intercept']

logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])

result = logit_model.fit()
Optimization terminated successfully.
         Current function value: 0.428590
         Iterations 6
In [429]:
result.summary()
Out[429]:
Logit Regression Results
Dep. Variable: WIN_MATCH No. Observations: 3518
Model: Logit Df Residuals: 3512
Method: MLE Df Model: 5
Date: Sun, 04 Dec 2016 Pseudo R-squ.: 0.3259
Time: 01:41:09 Log-Likelihood: -1507.8
converged: True LL-Null: -2236.7
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
RNK_DIFF -0.0006 0.000 -1.241 0.215 -0.001 0.000
FAVOURITE 0.9846 0.132 7.485 0.000 0.727 1.242
WIN_SET_1 2.8674 0.095 30.167 0.000 2.681 3.054
WIN_GAME_1 0.2930 0.107 2.738 0.006 0.083 0.503
SERVE_1 0.0142 0.105 0.134 0.893 -0.192 0.221
intercept -1.9360 0.134 -14.489 0.000 -2.198 -1.674
In [430]:
test_data = data_test[data_test.IS_TOP30==1].copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])

fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
    geom_line() +\
    geom_abline(linetype='dashed')

auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
AUC: 0.839764198499
Odds:
RNK_DIFF       0.999431
FAVOURITE      2.676858
WIN_SET_1     17.591873
WIN_GAME_1     1.340414
SERVE_1        1.014275
intercept      0.144279
dtype: float64
Logistic Regression 5: Non-Top30 Ranked Players Only
In [435]:
train_data = data_train[data_train.IS_TOP30==0].copy(deep=True).reset_index(drop=True)

train_data['intercept'] = 1.0

dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','intercept']

logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])

result = logit_model.fit()
Optimization terminated successfully.
         Current function value: 0.458089
         Iterations 6
In [436]:
result.summary()
Out[436]:
Logit Regression Results
Dep. Variable: WIN_MATCH No. Observations: 10917
Model: Logit Df Residuals: 10911
Method: MLE Df Model: 5
Date: Sun, 04 Dec 2016 Pseudo R-squ.: 0.3357
Time: 01:45:00 Log-Likelihood: -5001.0
converged: True LL-Null: -7527.7
LLR p-value: 0.000
coef std err z P>|z| [95.0% Conf. Int.]
RNK_DIFF -0.0008 0.000 -4.415 0.000 -0.001 -0.000
FAVOURITE 0.5535 0.068 8.138 0.000 0.420 0.687
WIN_SET_1 2.9842 0.052 57.036 0.000 2.882 3.087
WIN_GAME_1 0.1628 0.058 2.798 0.005 0.049 0.277
SERVE_1 -0.0181 0.057 -0.319 0.750 -0.129 0.093
intercept -1.8929 0.054 -35.277 0.000 -1.998 -1.788
In [437]:
test_data = data_test[data_test.IS_TOP30==0].copy(deep=True).reset_index(drop=True)
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])

fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
    geom_line() +\
    geom_abline(linetype='dashed')

auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
AUC: 0.8297610557
Odds:
RNK_DIFF       0.999189
FAVOURITE      1.739325
WIN_SET_1     19.770493
WIN_GAME_1     1.176786
SERVE_1        0.982062
intercept      0.150637
dtype: float64
Conclusion: Our variables are less predictive for non Top30 players (by AUC), but not by much.
Logistic Regression 6: Slams only
Now, we use the other variables.
NOTE: Because some of the new variables added have empty values, I am actually creating a new dummy variable for when
the row is empty. Thus, in the original row, the empty can now simply be 0, and the dummy variable will be flagged.
NOTE 2: However, it turns out that those dummy variables are not predictive anyway...so removing them.
In [508]:
train_data = data_train[data_train.IS_SLAM==1].copy(deep=True).reset_index(drop=True)

train_data['speed_None'] = 0.0
train_data['unf_err_None'] = 0.0
train_data['for_err_None'] = 0.0

for i in range(train_data.shape[0]):
    if train_data.speed_mph_pergame[i] is None or train_data.speed_mph_pergame[i] == "":
        train_data.set_value(i,'speed_mph_pergame',0.0)
        train_data.set_value(i,'speed_None',1.0)
    
    if train_data.num_unf_err_pergame[i] is None or pd.isnull(train_data.num_unf_err_pergame[i]) or train_data.num_unf_err_pergame[i] == "":
        train_data.set_value(i,'num_unf_err_pergame',0.0)
        train_data.set_value(i,'unf_err_None',1.0)
        
    if train_data.num_for_err_pergame[i] is None or pd.isnull(train_data.num_for_err_pergame[i]) or train_data.num_for_err_pergame[i] == "":
        train_data.set_value(i,'num_for_err_pergame',0.0)
        train_data.set_value(i,'for_err_None',1.0)

train_data[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']] = train_data[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']].apply(pd.to_numeric)
train_data['intercept'] = 1.0

dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_SET_1','WIN_GAME_1','SERVE_1','speed_mph_pergame',
            'num_for_err_pergame','num_unf_err_pergame','intercept']

logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])

result = logit_model.fit()
Optimization terminated successfully.
         Current function value: 0.409121
         Iterations 6
In [509]:
result.summary()
Out[509]:
Logit Regression Results
Dep. Variable: WIN_MATCH No. Observations: 2569
Model: Logit Df Residuals: 2560
Method: MLE Df Model: 8
Date: Sun, 04 Dec 2016 Pseudo R-squ.: 0.4096
Time: 02:22:39 Log-Likelihood: -1051.0
converged: True LL-Null: -1780.2
LLR p-value: 1.339e-309
coef std err z P>|z| [95.0% Conf. Int.]
RNK_DIFF 9.79e-06 0.000 0.023 0.981 -0.001 0.001
FAVOURITE 1.3868 0.148 9.381 0.000 1.097 1.677
WIN_SET_1 2.9952 0.117 25.684 0.000 2.767 3.224
WIN_GAME_1 0.2616 0.129 2.021 0.043 0.008 0.515
SERVE_1 -0.1437 0.126 -1.144 0.253 -0.390 0.103
speed_mph_pergame 0.0143 0.002 7.499 0.000 0.011 0.018
num_for_err_pergame -0.0134 0.011 -1.249 0.212 -0.034 0.008
num_unf_err_pergame -0.0082 0.005 -1.660 0.097 -0.018 0.001
intercept -2.7170 0.142 -19.194 0.000 -2.994 -2.440
In [511]:
test_data = data_test[data_test.IS_SLAM==1].copy(deep=True).reset_index(drop=True)

test_data['speed_None'] = 0.0
test_data['unf_err_None'] = 0.0
test_data['for_err_None'] = 0.0

for i in range(test_data.shape[0]):
    if test_data.speed_mph_pergame[i] is None or test_data.speed_mph_pergame[i] == "":
        test_data.set_value(i,'speed_mph_pergame',0.0)
        test_data.set_value(i,'speed_None',1.0)
    
    if test_data.num_unf_err_pergame[i] is None or pd.isnull(test_data.num_unf_err_pergame[i]) or test_data.num_unf_err_pergame[i] == "":
        test_data.set_value(i,'num_unf_err_pergame',0.0)
        test_data.set_value(i,'unf_err_None',1.0)
        
    if test_data.num_for_err_pergame[i] is None or pd.isnull(test_data.num_for_err_pergame[i]) or test_data.num_for_err_pergame[i] == "":
        test_data.set_value(i,'num_for_err_pergame',0.0)
        test_data.set_value(i,'for_err_None',1.0)

test_data[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']] = train_data[['speed_mph_pergame','num_unf_err_pergame','num_for_err_pergame']].apply(pd.to_numeric)
test_data['intercept'] = 1.0
                                                                        
test_data['intercept'] = 1.0
preds = result.predict(test_data[ind_vars])

fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
    geom_line() +\
    geom_abline(linetype='dashed')

auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
AUC: 0.842040705773
Odds:
RNK_DIFF                1.000010
FAVOURITE               4.001945
WIN_SET_1              19.988915
WIN_GAME_1              1.299053
SERVE_1                 0.866133
speed_mph_pergame       1.014427
num_for_err_pergame     0.986693
num_unf_err_pergame     0.991799
intercept               0.066073
dtype: float64

Not surprisingly, the speed, # unf errors, and # forced errors are quite predictive.

Now, using the "WIN_SET_1" variable is kind of cheating because part of teh game is already over. Since we have

more variables, let's remove that variable, and see how good our regression becomes.

Logistic Regression 7: Slams only, without Set variable
In [512]:
dep_var = ['WIN_MATCH']
ind_vars = ['RNK_DIFF','FAVOURITE','WIN_GAME_1','SERVE_1','speed_mph_pergame',
            'num_for_err_pergame','num_unf_err_pergame','intercept']

logit_model = sm.Logit(train_data[dep_var], train_data[ind_vars])

result = logit_model.fit()
Optimization terminated successfully.
         Current function value: 0.574362
         Iterations 5
In [513]:
result.summary()
Out[513]:
Logit Regression Results
Dep. Variable: WIN_MATCH No. Observations: 2569
Model: Logit Df Residuals: 2561
Method: MLE Df Model: 7
Date: Sun, 04 Dec 2016 Pseudo R-squ.: 0.1712
Time: 02:26:31 Log-Likelihood: -1475.5
converged: True LL-Null: -1780.2
LLR p-value: 2.323e-127
coef std err z P>|z| [95.0% Conf. Int.]
RNK_DIFF -0.0001 0.000 -0.300 0.764 -0.001 0.001
FAVOURITE 1.4772 0.120 12.322 0.000 1.242 1.712
WIN_GAME_1 1.0490 0.102 10.247 0.000 0.848 1.250
SERVE_1 -0.3285 0.102 -3.207 0.001 -0.529 -0.128
speed_mph_pergame 0.0143 0.002 9.167 0.000 0.011 0.017
num_for_err_pergame -0.0110 0.008 -1.370 0.171 -0.027 0.005
num_unf_err_pergame -0.0078 0.004 -1.886 0.059 -0.016 0.000
intercept -1.5914 0.105 -15.146 0.000 -1.797 -1.385
In [514]:
preds = result.predict(test_data[ind_vars])

fpr, tpr, _ = metrics.roc_curve(list(test_data.WIN_MATCH), preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) +\
    geom_line() +\
    geom_abline(linetype='dashed')

auc = metrics.auc(fpr,tpr)
print "AUC: " + str(auc)
print "Odds:"
print np.exp(result.params)
AUC: 0.673200742919
Odds:
RNK_DIFF               0.999898
FAVOURITE              4.380668
WIN_GAME_1             2.854837
SERVE_1                0.720001
speed_mph_pergame      1.014453
num_for_err_pergame    0.989036
num_unf_err_pergame    0.992199
intercept              0.203648
dtype: float64
Although the AUC goes down quite a bit, this is still a great model! For grand slams, we are able to predict
the match with a great accuracy. The ROC curve is below.
In [516]:
ggplot(df, aes(x='fpr', y='tpr')) +\
    geom_line() +\
    geom_abline(linetype='dashed')
Out[516]:
<ggplot: (303134325)>
In [ ]: