In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

pd.options.display.max_rows = 400
pd.options.display.max_columns = 400

datafile = "student/student-mat.csv"
prtscores = "student/student-por.csv"

df_math = pd.read_csv(datafile, sep=";")
df_por = pd.read_csv(prtscores, sep=";")
In [2]:
df_math.head()
Out[2]:
school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason guardian traveltime studytime failures schoolsup famsup paid activities nursery higher internet romantic famrel freetime goout Dalc Walc health absences G1 G2 G3
0 GP F 18 U GT3 A 4 4 at_home teacher course mother 2 2 0 yes no no no yes yes no no 4 3 4 1 1 3 6 5 6 6
1 GP F 17 U GT3 T 1 1 at_home other course father 1 2 0 no yes no no no yes yes no 5 3 3 1 1 3 4 5 5 6
2 GP F 15 U LE3 T 1 1 at_home other other mother 1 2 3 yes no yes no yes yes yes no 4 3 2 2 3 3 10 7 8 10
3 GP F 15 U GT3 T 4 2 health services home mother 1 3 0 no yes yes yes yes yes yes yes 3 2 2 1 1 5 2 15 14 15
4 GP F 16 U GT3 T 3 3 other other home father 1 2 0 no yes yes no yes yes no no 4 3 2 1 2 5 4 6 10 10
In [3]:
df_por.head()
Out[3]:
school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason guardian traveltime studytime failures schoolsup famsup paid activities nursery higher internet romantic famrel freetime goout Dalc Walc health absences G1 G2 G3
0 GP F 18 U GT3 A 4 4 at_home teacher course mother 2 2 0 yes no no no yes yes no no 4 3 4 1 1 3 4 0 11 11
1 GP F 17 U GT3 T 1 1 at_home other course father 1 2 0 no yes no no no yes yes no 5 3 3 1 1 3 2 9 11 11
2 GP F 15 U LE3 T 1 1 at_home other other mother 1 2 0 yes no no no yes yes yes no 4 3 2 2 3 3 6 12 13 12
3 GP F 15 U GT3 T 4 2 health services home mother 1 3 0 no yes no yes yes yes yes yes 3 2 2 1 1 5 0 14 14 14
4 GP F 16 U GT3 T 3 3 other other home father 1 2 0 no yes no no yes yes no no 4 3 2 1 2 5 0 11 13 13
In [4]:
# Compare if column headers are the same in both dataframes - should give us an empty list
list(set(df_math.columns) - set(df_por.columns))
Out[4]:
[]
In [76]:
# Create subsets of data that can be used to merge original datasets with the the grade column of the other class
def create_df_subset(df):
    column_names = ["school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet", 'G3']
    df = df.loc[:, column_names]
    return df

df_por_sub = create_df_subset(df_por)
df_por_sub.rename(columns={'G3':'G3_port'}, inplace=True)
df_math_sub = create_df_subset(df_math)
df_math_sub.rename(columns={'G3':'G3_math'}, inplace=True)
In [6]:
# Add portugese grade to mat df by merging on identifying columns
def merge_datasets(df_base, df_grade):
    columns_merge = ["school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet"]
    df_merged = df_base.merge(df_grade, left_on=columns_merge, right_on=columns_merge)
    return df_merged

df_math = merge_datasets(df_math, df_por_sub)
df_port = merge_datasets(df_por, df_math_sub)

print(df_math.head(3))
print(df_port.head(3))
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob     Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home  teacher   
1     GP   F   17       U     GT3       T     1     1  at_home    other   
2     GP   F   15       U     LE3       T     1     1  at_home    other   

   reason guardian  traveltime  studytime  failures schoolsup famsup paid  \
0  course   mother           2          2         0       yes     no   no   
1  course   father           1          2         0        no    yes   no   
2   other   mother           1          2         3       yes     no  yes   

  activities nursery higher internet romantic  famrel  freetime  goout  Dalc  \
0         no     yes    yes       no       no       4         3      4     1   
1         no      no    yes      yes       no       5         3      3     1   
2         no     yes    yes      yes       no       4         3      2     2   

   Walc  health  absences  G1  G2  G3  G3_port  
0     1       3         6   5   6   6       11  
1     1       3         4   5   5   6       11  
2     3       3        10   7   8  10       12  
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob     Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home  teacher   
1     GP   F   17       U     GT3       T     1     1  at_home    other   
2     GP   F   15       U     LE3       T     1     1  at_home    other   

   reason guardian  traveltime  studytime  failures schoolsup famsup paid  \
0  course   mother           2          2         0       yes     no   no   
1  course   father           1          2         0        no    yes   no   
2   other   mother           1          2         0       yes     no   no   

  activities nursery higher internet romantic  famrel  freetime  goout  Dalc  \
0         no     yes    yes       no       no       4         3      4     1   
1         no      no    yes      yes       no       5         3      3     1   
2         no     yes    yes      yes       no       4         3      2     2   

   Walc  health  absences  G1  G2  G3  G3_math  
0     1       3         4   0  11  11        6  
1     1       3         2   9  11  11        6  
2     3       3         6  12  13  12       10  
In [7]:
print(df_math.info())
print(df_port.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 382 entries, 0 to 381
Data columns (total 34 columns):
school        382 non-null object
sex           382 non-null object
age           382 non-null int64
address       382 non-null object
famsize       382 non-null object
Pstatus       382 non-null object
Medu          382 non-null int64
Fedu          382 non-null int64
Mjob          382 non-null object
Fjob          382 non-null object
reason        382 non-null object
guardian      382 non-null object
traveltime    382 non-null int64
studytime     382 non-null int64
failures      382 non-null int64
schoolsup     382 non-null object
famsup        382 non-null object
paid          382 non-null object
activities    382 non-null object
nursery       382 non-null object
higher        382 non-null object
internet      382 non-null object
romantic      382 non-null object
famrel        382 non-null int64
freetime      382 non-null int64
goout         382 non-null int64
Dalc          382 non-null int64
Walc          382 non-null int64
health        382 non-null int64
absences      382 non-null int64
G1            382 non-null int64
G2            382 non-null int64
G3            382 non-null int64
G3_port       382 non-null int64
dtypes: int64(17), object(17)
memory usage: 104.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 382 entries, 0 to 381
Data columns (total 34 columns):
school        382 non-null object
sex           382 non-null object
age           382 non-null int64
address       382 non-null object
famsize       382 non-null object
Pstatus       382 non-null object
Medu          382 non-null int64
Fedu          382 non-null int64
Mjob          382 non-null object
Fjob          382 non-null object
reason        382 non-null object
guardian      382 non-null object
traveltime    382 non-null int64
studytime     382 non-null int64
failures      382 non-null int64
schoolsup     382 non-null object
famsup        382 non-null object
paid          382 non-null object
activities    382 non-null object
nursery       382 non-null object
higher        382 non-null object
internet      382 non-null object
romantic      382 non-null object
famrel        382 non-null int64
freetime      382 non-null int64
goout         382 non-null int64
Dalc          382 non-null int64
Walc          382 non-null int64
health        382 non-null int64
absences      382 non-null int64
G1            382 non-null int64
G2            382 non-null int64
G3            382 non-null int64
G3_math       382 non-null int64
dtypes: int64(17), object(17)
memory usage: 104.5+ KB
None
In [8]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sns

def plot_hist(df):
    mu = df.G3.mean() # mean of distribution
    sigma = df.G3.std() # standard deviation of distribution

    num_bins = 17
    # the histogram of the data
    n, bins, patches = plt.hist(df.G3, num_bins, normed=1, facecolor='blue', alpha=0.5)

    # add a 'best fit' line
    y = mlab.normpdf(bins, mu, sigma)
    plt.plot(bins, y, 'r--')
    plt.xlabel('Final Grade')
    plt.ylabel('Probability')
    plt.title("Histogram of final grade frequencies")

    # Tweak spacing to prevent clipping of ylabel
    plt.subplots_adjust(left=0.15)
    plt.show()
In [9]:
plot_hist(df_math)
plot_hist(df_port)
In [10]:
sns.jointplot(data=df_math, y='G3', x='G3_port', kind='reg', size=10);
plt.show()
In [72]:
df_wo_outliers = df_math[(df_math['G3'] > 0) & (df_math['G3_port'] > 0)]
sns.jointplot(data=df_wo_outliers, y='G3', x='G3_port', kind='reg', size=10);
plt.show()
In [11]:
def value_count(df):
    print(df.school.value_counts())
    print(df.famsize.value_counts())
    print(df.Mjob.value_counts())
    print(df.Fjob.value_counts())
    print(df.reason.value_counts())
    print(df.guardian.value_counts())
    print(df.schoolsup.value_counts())
    print(df.famsup.value_counts())
    print(df.paid.value_counts())
    print(df.activities.value_counts())
    print(df.nursery.value_counts())
    print(df.higher.value_counts())
    print(df.internet.value_counts())
    print(df.romantic.value_counts())
In [12]:
def binary_conversion(df):
    df['Pstatus_A'] = np.where(df['Pstatus']=='A', 1, 0)
    df['sex_M'] = np.where(df['sex']=='M', 1, 0)
    df['address_urban'] = np.where(df['address']=='U', 1, 0)
    df['sex_M'] = np.where(df['sex']=='M', 1, 0)
    df['school_GP'] = np.where(df['school']=='GP', 1, 0)
    df['famsize_GT3'] = np.where(df['famsize']=='GT3', 1, 0)
    df['schoolsup_yes'] = np.where(df['schoolsup']=='yes', 1, 0)
    df['famsup_yes'] = np.where(df['famsup']=='yes', 1, 0)
    df['paid_yes'] = np.where(df['paid']=='yes', 1, 0)
    df['activities_yes'] = np.where(df['activities']=='yes', 1, 0)
    df['nusery_yes'] = np.where(df['nursery']=='yes', 1, 0)
    df['higher_yes'] = np.where(df['higher']=='yes', 1, 0)
    df['internet_yes'] = np.where(df['internet']=='yes', 1, 0)
    df['romantic_yes'] = np.where(df['romantic']=='yes', 1, 0)
    return df

df_math = binary_conversion(df_math)
df_port = binary_conversion(df_port)

df_math.head()
Out[12]:
school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason guardian traveltime studytime failures schoolsup famsup paid activities nursery higher internet romantic famrel freetime goout Dalc Walc health absences G1 G2 G3 G3_port Pstatus_A sex_M address_urban school_GP famsize_GT3 schoolsup_yes famsup_yes paid_yes activities_yes nusery_yes higher_yes internet_yes romantic_yes
0 GP F 18 U GT3 A 4 4 at_home teacher course mother 2 2 0 yes no no no yes yes no no 4 3 4 1 1 3 6 5 6 6 11 1 0 1 1 1 1 0 0 0 1 1 0 0
1 GP F 17 U GT3 T 1 1 at_home other course father 1 2 0 no yes no no no yes yes no 5 3 3 1 1 3 4 5 5 6 11 0 0 1 1 1 0 1 0 0 0 1 1 0
2 GP F 15 U LE3 T 1 1 at_home other other mother 1 2 3 yes no yes no yes yes yes no 4 3 2 2 3 3 10 7 8 10 12 0 0 1 1 0 1 0 1 0 1 1 1 0
3 GP F 15 U GT3 T 4 2 health services home mother 1 3 0 no yes yes yes yes yes yes yes 3 2 2 1 1 5 2 15 14 15 14 0 0 1 1 1 0 1 1 1 1 1 1 1
4 GP F 16 U GT3 T 3 3 other other home father 1 2 0 no yes yes no yes yes no no 4 3 2 1 2 5 4 6 10 10 13 0 0 1 1 1 0 1 1 0 1 1 0 0
In [13]:
from sklearn.preprocessing import LabelBinarizer

def convert_categories(df, column):
    lb_style = LabelBinarizer()
    lb_results = lb_style.fit_transform(df[column])
    df2 = pd.DataFrame(lb_results, columns=lb_style.classes_)
    df2.columns = [column + "_" + str(col) for col in df2.columns]
    df3 = df.merge(df2, right_index=True, left_index=True)
    return df3
In [14]:
def merge_categories(df):
    df1 = convert_categories(df, "Mjob")
    df2 = convert_categories(df1, "Fjob")
    df3 = convert_categories(df2, "reason")
    df4 = convert_categories(df3, "guardian")
    return df4

df_math = merge_categories(df_math)
df_port = merge_categories(df_port)
    
In [15]:
df_math.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 382 entries, 0 to 381
Data columns (total 64 columns):
school               382 non-null object
sex                  382 non-null object
age                  382 non-null int64
address              382 non-null object
famsize              382 non-null object
Pstatus              382 non-null object
Medu                 382 non-null int64
Fedu                 382 non-null int64
Mjob                 382 non-null object
Fjob                 382 non-null object
reason               382 non-null object
guardian             382 non-null object
traveltime           382 non-null int64
studytime            382 non-null int64
failures             382 non-null int64
schoolsup            382 non-null object
famsup               382 non-null object
paid                 382 non-null object
activities           382 non-null object
nursery              382 non-null object
higher               382 non-null object
internet             382 non-null object
romantic             382 non-null object
famrel               382 non-null int64
freetime             382 non-null int64
goout                382 non-null int64
Dalc                 382 non-null int64
Walc                 382 non-null int64
health               382 non-null int64
absences             382 non-null int64
G1                   382 non-null int64
G2                   382 non-null int64
G3                   382 non-null int64
G3_port              382 non-null int64
Pstatus_A            382 non-null int64
sex_M                382 non-null int64
address_urban        382 non-null int64
school_GP            382 non-null int64
famsize_GT3          382 non-null int64
schoolsup_yes        382 non-null int64
famsup_yes           382 non-null int64
paid_yes             382 non-null int64
activities_yes       382 non-null int64
nusery_yes           382 non-null int64
higher_yes           382 non-null int64
internet_yes         382 non-null int64
romantic_yes         382 non-null int64
Mjob_at_home         382 non-null int64
Mjob_health          382 non-null int64
Mjob_other           382 non-null int64
Mjob_services        382 non-null int64
Mjob_teacher         382 non-null int64
Fjob_at_home         382 non-null int64
Fjob_health          382 non-null int64
Fjob_other           382 non-null int64
Fjob_services        382 non-null int64
Fjob_teacher         382 non-null int64
reason_course        382 non-null int64
reason_home          382 non-null int64
reason_other         382 non-null int64
reason_reputation    382 non-null int64
guardian_father      382 non-null int64
guardian_mother      382 non-null int64
guardian_other       382 non-null int64
dtypes: int64(47), object(17)
memory usage: 204.0+ KB
In [16]:
# Drop grades in G3 that are zero
def drop_zeros(df):
    df = df[(df['G3'] > 0)]
    return df

df_math = drop_zeros(df_math)._get_numeric_data()
df_port = drop_zeros(df_port)._get_numeric_data()
In [17]:
# Exploratory Analysis
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
corrmat = df_math.corr(method='pearson')
f, ax = sns.plt.subplots(figsize=(14, 14))
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, fmt='.1f');
In [18]:
g = sns.clustermap(corrmat, annot=True, fmt='.1f', figsize=(15, 15));
_ = g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
/Users/Ernst/anaconda3/lib/python3.6/site-packages/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
In [19]:
def drop_columns(df):
    df = df.drop(['Fjob_other', 'Mjob_other', 'guardian_other', 'reason_other'], axis=1)
    return df

df_math = drop_columns(df_math)
df_port = drop_columns(df_port)
In [20]:
df_math.to_csv("ARA_MathGrades.csv", index=False)
df_port.to_csv("ARA_PortGrades.csv", index=False)
In [21]:
sns.lmplot(x='G3_port', y='G3', data=df_math);
In [22]:
def normalize_df(df):
    df = (df - df.mean()) / (df.max() - df.min())
    return df

df_math_norm = normalize_df(df_math)
df_port_norm = normalize_df(df_port)

df_math_norm.head(3)
Out[22]:
age Medu Fedu traveltime studytime failures famrel freetime goout Dalc Walc health absences G1 G2 G3 G3_port Pstatus_A sex_M address_urban school_GP famsize_GT3 schoolsup_yes famsup_yes paid_yes activities_yes nusery_yes higher_yes internet_yes romantic_yes Mjob_at_home Mjob_health Mjob_services Mjob_teacher Fjob_at_home Fjob_health Fjob_services Fjob_teacher reason_course reason_home reason_reputation guardian_father guardian_mother
0 0.209496 0.287901 0.352041 0.191448 -0.013605 -0.072886 0.010933 -0.059038 0.222303 -0.122449 -0.33309 -0.142857 0.001011 -0.390488 -0.385048 -0.348032 -0.089765 0.895044 -0.489796 0.209913 0.102041 0.28863 0.854227 -0.620991 -0.492711 -0.527697 0.183673 0.03207 -0.851312 -0.294461 0.87172 -0.090379 -0.259475 -0.163265 -0.037901 -0.049563 -0.279883 0.921283 0.650146 -0.282799 -0.271137 -0.239067 0.274052
1 0.066639 -0.462099 -0.397959 -0.141885 -0.013605 -0.072886 0.260933 -0.059038 -0.027697 -0.122449 -0.33309 -0.142857 -0.025656 -0.390488 -0.456476 -0.348032 -0.089765 -0.104956 -0.489796 0.209913 0.102041 0.28863 -0.145773 0.379009 -0.492711 -0.527697 -0.816327 0.03207 0.148688 -0.294461 0.87172 -0.090379 -0.259475 -0.163265 -0.037901 -0.049563 -0.279883 -0.078717 0.650146 -0.282799 -0.271137 0.760933 -0.725948
2 -0.219075 -0.462099 -0.397959 -0.141885 -0.013605 0.927114 0.010933 -0.059038 -0.277697 0.127551 0.16691 -0.142857 0.054344 -0.265488 -0.242191 -0.098032 -0.037134 -0.104956 -0.489796 0.209913 0.102041 -0.71137 0.854227 -0.620991 0.507289 -0.527697 0.183673 0.03207 0.148688 -0.294461 0.87172 -0.090379 -0.259475 -0.163265 -0.037901 -0.049563 -0.279883 -0.078717 -0.349854 -0.282799 -0.271137 -0.239067 0.274052
In [23]:
import statsmodels.api as sm

X = df_math['G3_port']
Y = df_math['G3']

X = sm.add_constant(X)
est = sm.OLS(Y, X).fit()

est.summary()
/Users/Ernst/anaconda3/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
Out[23]:
OLS Regression Results
Dep. Variable: G3 R-squared: 0.311
Model: OLS Adj. R-squared: 0.309
Method: Least Squares F-statistic: 153.9
Date: Mon, 11 Dec 2017 Prob (F-statistic): 2.03e-29
Time: 11:41:53 Log-Likelihood: -829.97
No. Observations: 343 AIC: 1664.
Df Residuals: 341 BIC: 1672.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 3.3608 0.678 4.958 0.000 2.028 4.694
G3_port 0.6460 0.052 12.405 0.000 0.544 0.748
Omnibus: 4.674 Durbin-Watson: 1.976
Prob(Omnibus): 0.097 Jarque-Bera (JB): 4.640
Skew: 0.251 Prob(JB): 0.0983
Kurtosis: 2.731 Cond. No. 60.2
In [24]:
X = df_math_norm.drop(['G3', 'G1', 'G2'], axis=1)
Y = df_math_norm['G3']

X = sm.add_constant(X)
est_math = sm.OLS(Y, X).fit()

est_math.summary()
Out[24]:
OLS Regression Results
Dep. Variable: G3 R-squared: 0.502
Model: OLS Adj. R-squared: 0.436
Method: Least Squares F-statistic: 7.602
Date: Mon, 11 Dec 2017 Prob (F-statistic): 3.35e-27
Time: 11:41:53 Log-Likelihood: 176.61
No. Observations: 343 AIC: -271.2
Df Residuals: 302 BIC: -113.9
Df Model: 40
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 5.161e-17 0.008 6.2e-15 1.000 -0.016 0.016
age -0.1928 0.063 -3.077 0.002 -0.316 -0.069
Medu 0.0476 0.052 0.918 0.359 -0.054 0.150
Fedu 0.0254 0.046 0.555 0.579 -0.065 0.116
traveltime -0.0083 0.042 -0.196 0.845 -0.091 0.075
studytime 0.0414 0.035 1.175 0.241 -0.028 0.111
failures -0.0895 0.048 -1.846 0.066 -0.185 0.006
famrel 0.0301 0.040 0.754 0.452 -0.049 0.109
freetime 0.0260 0.039 0.671 0.503 -0.050 0.102
goout -0.0652 0.038 -1.729 0.085 -0.139 0.009
Dalc 0.1058 0.053 1.994 0.047 0.001 0.210
Walc -0.0618 0.041 -1.496 0.136 -0.143 0.019
health -0.0217 0.026 -0.827 0.409 -0.073 0.030
absences -0.1591 0.093 -1.711 0.088 -0.342 0.024
G3_port 0.7052 0.072 9.818 0.000 0.564 0.847
Pstatus_A -0.0178 0.030 -0.604 0.547 -0.076 0.040
sex_M 0.0710 0.021 3.451 0.001 0.031 0.112
address_urban -0.0098 0.024 -0.407 0.685 -0.057 0.038
school_GP -0.0164 0.034 -0.483 0.630 -0.083 0.050
famsize_GT3 -0.0105 0.020 -0.536 0.592 -0.049 0.028
schoolsup_yes -0.0940 0.027 -3.510 0.001 -0.147 -0.041
famsup_yes -0.0377 0.019 -1.952 0.052 -0.076 0.000
paid_yes -0.0181 0.019 -0.955 0.341 -0.055 0.019
activities_yes -0.0058 0.019 -0.314 0.754 -0.042 0.031
nusery_yes -0.0116 0.023 -0.500 0.618 -0.057 0.034
higher_yes -0.0917 0.056 -1.648 0.100 -0.201 0.018
internet_yes 0.0288 0.026 1.101 0.272 -0.023 0.080
romantic_yes -0.0136 0.019 -0.700 0.484 -0.052 0.025
Mjob_at_home 0.0176 0.031 0.575 0.566 -0.043 0.078
Mjob_health 0.0527 0.037 1.423 0.156 -0.020 0.126
Mjob_services 0.0588 0.024 2.453 0.015 0.012 0.106
Mjob_teacher -0.0410 0.033 -1.245 0.214 -0.106 0.024
Fjob_at_home 0.0120 0.047 0.254 0.799 -0.081 0.105
Fjob_health -0.0029 0.044 -0.065 0.948 -0.090 0.084
Fjob_services -0.0072 0.022 -0.334 0.739 -0.050 0.035
Fjob_teacher 0.0915 0.037 2.472 0.014 0.019 0.164
reason_course -0.0016 0.033 -0.048 0.962 -0.066 0.063
reason_home 0.0044 0.034 0.130 0.896 -0.062 0.071
reason_reputation -0.0039 0.034 -0.113 0.910 -0.071 0.063
guardian_father 0.0167 0.051 0.326 0.744 -0.084 0.117
guardian_mother 0.0079 0.049 0.161 0.872 -0.089 0.105
Omnibus: 4.022 Durbin-Watson: 2.119
Prob(Omnibus): 0.134 Jarque-Bera (JB): 4.052
Skew: 0.238 Prob(JB): 0.132
Kurtosis: 2.763 Cond. No. 11.9
In [25]:
dict_math = est_math.params.to_dict()
dict_math.pop('const')
dict_math.pop('G3_port')
dict_math
Out[25]:
{'Dalc': 0.10579705471757239,
 'Fedu': 0.025438490065162966,
 'Fjob_at_home': 0.012044925536647069,
 'Fjob_health': -0.0028642076474327469,
 'Fjob_services': -0.0072384999684696801,
 'Fjob_teacher': 0.091523098498639988,
 'Medu': 0.047602650473355254,
 'Mjob_at_home': 0.01763378618000561,
 'Mjob_health': 0.052713665032108885,
 'Mjob_services': 0.058832966474728618,
 'Mjob_teacher': -0.041006323366823971,
 'Pstatus_A': -0.017821962435043307,
 'Walc': -0.061751787059174348,
 'absences': -0.15912053408601359,
 'activities_yes': -0.0058310307579844053,
 'address_urban': -0.0098367451574800195,
 'age': -0.19282077475811515,
 'failures': -0.089479265869901792,
 'famrel': 0.03012707178921949,
 'famsize_GT3': -0.010521241422356945,
 'famsup_yes': -0.037701234676568225,
 'freetime': 0.025981764492934985,
 'goout': -0.065170704914442004,
 'guardian_father': 0.016698102472835843,
 'guardian_mother': 0.0079247290967995512,
 'health': -0.021722279951860438,
 'higher_yes': -0.091734140057228547,
 'internet_yes': 0.028769152852879669,
 'nusery_yes': -0.011550843819242249,
 'paid_yes': -0.018090326708878508,
 'reason_course': -0.0015709263438080431,
 'reason_home': 0.0043931683751412473,
 'reason_reputation': -0.0038641131230675429,
 'romantic_yes': -0.013625663317861672,
 'school_GP': -0.016396151990538017,
 'schoolsup_yes': -0.093964507283273158,
 'sex_M': 0.07101362718747449,
 'studytime': 0.041399436119181697,
 'traveltime': -0.0082725299506371669}
In [26]:
X = df_port_norm.drop(['G3', 'G1', 'G2'], axis=1)
Y = df_port_norm['G3']

X = sm.add_constant(X)
est_port = sm.OLS(Y, X).fit()

est_port.summary()
Out[26]:
OLS Regression Results
Dep. Variable: G3 R-squared: 0.532
Model: OLS Adj. R-squared: 0.476
Method: Least Squares F-statistic: 9.548
Date: Mon, 11 Dec 2017 Prob (F-statistic): 1.29e-35
Time: 11:41:53 Log-Likelihood: 340.35
No. Observations: 377 AIC: -598.7
Df Residuals: 336 BIC: -437.5
Df Model: 40
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 2.125e-17 0.005 3.97e-15 1.000 -0.011 0.011
age 0.1744 0.040 4.373 0.000 0.096 0.253
Medu 0.0242 0.034 0.701 0.484 -0.044 0.092
Fedu -0.0020 0.029 -0.069 0.945 -0.060 0.056
traveltime 0.0354 0.027 1.325 0.186 -0.017 0.088
studytime 0.0427 0.022 1.920 0.056 -0.001 0.086
failures -0.1360 0.037 -3.654 0.000 -0.209 -0.063
famrel 0.0141 0.025 0.556 0.578 -0.036 0.064
freetime -0.0174 0.025 -0.703 0.483 -0.066 0.031
goout -0.0446 0.023 -1.940 0.053 -0.090 0.001
Dalc -0.0568 0.036 -1.599 0.111 -0.127 0.013
Walc -0.0070 0.025 -0.273 0.785 -0.057 0.043
health -0.0373 0.017 -2.225 0.027 -0.070 -0.004
absences -0.0864 0.038 -2.247 0.025 -0.162 -0.011
G3_math 0.2589 0.026 9.802 0.000 0.207 0.311
Pstatus_A 0.0233 0.019 1.231 0.219 -0.014 0.061
sex_M -0.0535 0.013 -4.117 0.000 -0.079 -0.028
address_urban 0.0290 0.015 1.884 0.060 -0.001 0.059
school_GP 0.0450 0.021 2.100 0.036 0.003 0.087
famsize_GT3 0.0015 0.013 0.120 0.904 -0.024 0.027
schoolsup_yes -0.0615 0.017 -3.525 0.000 -0.096 -0.027
famsup_yes 0.0108 0.012 0.889 0.375 -0.013 0.035
paid_yes -0.0301 0.023 -1.289 0.198 -0.076 0.016
activities_yes 0.0303 0.012 2.610 0.009 0.007 0.053
nusery_yes -0.0193 0.015 -1.319 0.188 -0.048 0.009
higher_yes 0.0950 0.030 3.206 0.001 0.037 0.153
internet_yes 0.0035 0.017 0.204 0.838 -0.030 0.037
romantic_yes 0.0012 0.012 0.102 0.919 -0.023 0.025
Mjob_at_home 0.0109 0.019 0.573 0.567 -0.027 0.048
Mjob_health 0.0387 0.024 1.602 0.110 -0.009 0.086
Mjob_services 0.0082 0.015 0.529 0.597 -0.022 0.039
Mjob_teacher 0.0292 0.021 1.373 0.171 -0.013 0.071
Fjob_at_home 0.0530 0.029 1.833 0.068 -0.004 0.110
Fjob_health -0.0098 0.029 -0.334 0.739 -0.067 0.048
Fjob_services -0.0092 0.014 -0.661 0.509 -0.037 0.018
Fjob_teacher 0.0445 0.024 1.875 0.062 -0.002 0.091
reason_course 9.371e-05 0.022 0.004 0.997 -0.043 0.043
reason_home 0.0189 0.023 0.833 0.405 -0.026 0.063
reason_reputation -0.0009 0.023 -0.040 0.968 -0.046 0.044
guardian_father 0.0179 0.030 0.593 0.554 -0.042 0.078
guardian_mother 0.0108 0.029 0.377 0.706 -0.046 0.067
Omnibus: 13.963 Durbin-Watson: 1.995
Prob(Omnibus): 0.001 Jarque-Bera (JB): 27.499
Skew: -0.153 Prob(JB): 1.07e-06
Kurtosis: 4.287 Cond. No. 8.70
In [27]:
dict_port = est_port.params.to_dict()
dict_port.pop('const')
dict_port.pop('G3_math')
dict_port
Out[27]:
{'Dalc': -0.056780590882028692,
 'Fedu': -0.0020403289578331402,
 'Fjob_at_home': 0.052962596560062011,
 'Fjob_health': -0.0097890470146390039,
 'Fjob_services': -0.0092179525615222625,
 'Fjob_teacher': 0.044530305890802369,
 'Medu': 0.024195397912966071,
 'Mjob_at_home': 0.01093241644645325,
 'Mjob_health': 0.038697761497450141,
 'Mjob_services': 0.0081865762249839404,
 'Mjob_teacher': 0.029185772999491821,
 'Pstatus_A': 0.023339910860671241,
 'Walc': -0.0069654146256981075,
 'absences': -0.086388802377466239,
 'activities_yes': 0.030335083286891329,
 'address_urban': 0.029010107793604616,
 'age': 0.17443382450061343,
 'failures': -0.13599869989967867,
 'famrel': 0.01413306784244147,
 'famsize_GT3': 0.0015432067156058724,
 'famsup_yes': 0.010805940818369287,
 'freetime': -0.017401616184114899,
 'goout': -0.044557078064293779,
 'guardian_father': 0.017949915711913927,
 'guardian_mother': 0.010835623135459551,
 'health': -0.037280048465463178,
 'higher_yes': 0.09499036701184578,
 'internet_yes': 0.0034679315086513004,
 'nusery_yes': -0.019308428979514319,
 'paid_yes': -0.030095517272531874,
 'reason_course': 9.3705955207890566e-05,
 'reason_home': 0.018866653507946012,
 'reason_reputation': -0.00091316854548767826,
 'romantic_yes': 0.0012482917097375573,
 'school_GP': 0.045006757435693781,
 'schoolsup_yes': -0.061541233361383965,
 'sex_M': -0.053519912423387279,
 'studytime': 0.042694853322908323,
 'traveltime': 0.035416247056443285}
In [28]:
dict_coeff = {}
for key in (dict_math.keys() | dict_port.keys()):
    if key in dict_math: dict_coeff.setdefault(key, []).append(dict_math[key])
    if key in dict_port: dict_coeff.setdefault(key, []).append(dict_port[key])
        
print(dict_coeff)
{'famsize_GT3': [-0.010521241422356945, 0.0015432067156058724], 'Fjob_services': [-0.0072384999684696801, -0.0092179525615222625], 'guardian_mother': [0.0079247290967995512, 0.010835623135459551], 'Pstatus_A': [-0.017821962435043307, 0.023339910860671241], 'school_GP': [-0.016396151990538017, 0.045006757435693781], 'Mjob_teacher': [-0.041006323366823971, 0.029185772999491821], 'romantic_yes': [-0.013625663317861672, 0.0012482917097375573], 'Mjob_at_home': [0.01763378618000561, 0.01093241644645325], 'reason_reputation': [-0.0038641131230675429, -0.00091316854548767826], 'studytime': [0.041399436119181697, 0.042694853322908323], 'internet_yes': [0.028769152852879669, 0.0034679315086513004], 'Mjob_health': [0.052713665032108885, 0.038697761497450141], 'nusery_yes': [-0.011550843819242249, -0.019308428979514319], 'famsup_yes': [-0.037701234676568225, 0.010805940818369287], 'absences': [-0.15912053408601359, -0.086388802377466239], 'Fjob_health': [-0.0028642076474327469, -0.0097890470146390039], 'traveltime': [-0.0082725299506371669, 0.035416247056443285], 'reason_course': [-0.0015709263438080431, 9.3705955207890566e-05], 'address_urban': [-0.0098367451574800195, 0.029010107793604616], 'Fjob_at_home': [0.012044925536647069, 0.052962596560062011], 'sex_M': [0.07101362718747449, -0.053519912423387279], 'Mjob_services': [0.058832966474728618, 0.0081865762249839404], 'Walc': [-0.061751787059174348, -0.0069654146256981075], 'Dalc': [0.10579705471757239, -0.056780590882028692], 'Medu': [0.047602650473355254, 0.024195397912966071], 'higher_yes': [-0.091734140057228547, 0.09499036701184578], 'activities_yes': [-0.0058310307579844053, 0.030335083286891329], 'age': [-0.19282077475811515, 0.17443382450061343], 'guardian_father': [0.016698102472835843, 0.017949915711913927], 'freetime': [0.025981764492934985, -0.017401616184114899], 'Fjob_teacher': [0.091523098498639988, 0.044530305890802369], 'goout': [-0.065170704914442004, -0.044557078064293779], 'paid_yes': [-0.018090326708878508, -0.030095517272531874], 'reason_home': [0.0043931683751412473, 0.018866653507946012], 'famrel': [0.03012707178921949, 0.01413306784244147], 'failures': [-0.089479265869901792, -0.13599869989967867], 'schoolsup_yes': [-0.093964507283273158, -0.061541233361383965], 'Fedu': [0.025438490065162966, -0.0020403289578331402], 'health': [-0.021722279951860438, -0.037280048465463178]}
In [29]:
test = dict_coeff

# repackage data into array-like for matplotlib 
# (see a preferred pythonic way below)
data = {"x":[], "y":[], "label":[]}
for label, coord in test.items():
    data["x"].append(coord[0])
    data["y"].append(coord[1])
    data["label"].append(label)

# display scatter plot data
plt.figure(figsize=(14,14))
plt.title('Regression coefficients', fontsize=20)
plt.xlabel('Math', fontsize=20)
plt.ylabel('Portuguese', fontsize=20)
plt.scatter(data["x"], data["y"], marker = 'o')

# add labels
for label, x, y in zip(data["label"], data["x"], data["y"]):
    plt.annotate(label, xy = (x, y), fontsize=12)
    
print( """
# Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2 sex - student's sex (binary: "F" - female or "M" - male)
3 age - student's age (numeric: from 15 to 22)
4 address - student's home address type (binary: "U" - urban or "R" - rural)
5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
10 Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
11 reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
12 guardian - student's guardian (nominal: "mother", "father" or "other")
13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
16 schoolsup - extra educational support (binary: yes or no)
17 famsup - family educational support (binary: yes or no)
18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19 activities - extra-curricular activities (binary: yes or no)
20 nursery - attended nursery school (binary: yes or no)
21 higher - wants to take higher education (binary: yes or no)
22 internet - Internet access at home (binary: yes or no)
23 romantic - with a romantic relationship (binary: yes or no)
24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29 health - current health status (numeric: from 1 - very bad to 5 - very good)
30 absences - number of school absences (numeric: from 0 to 93)
""")
# Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2 sex - student's sex (binary: "F" - female or "M" - male)
3 age - student's age (numeric: from 15 to 22)
4 address - student's home address type (binary: "U" - urban or "R" - rural)
5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 Medu - mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8 Fedu - father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
10 Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
11 reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
12 guardian - student's guardian (nominal: "mother", "father" or "other")
13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
16 schoolsup - extra educational support (binary: yes or no)
17 famsup - family educational support (binary: yes or no)
18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19 activities - extra-curricular activities (binary: yes or no)
20 nursery - attended nursery school (binary: yes or no)
21 higher - wants to take higher education (binary: yes or no)
22 internet - Internet access at home (binary: yes or no)
23 romantic - with a romantic relationship (binary: yes or no)
24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29 health - current health status (numeric: from 1 - very bad to 5 - very good)
30 absences - number of school absences (numeric: from 0 to 93)

Machine learning analysis Math

In [30]:
df_math.head()
Out[30]:
age Medu Fedu traveltime studytime failures famrel freetime goout Dalc Walc health absences G1 G2 G3 G3_port Pstatus_A sex_M address_urban school_GP famsize_GT3 schoolsup_yes famsup_yes paid_yes activities_yes nusery_yes higher_yes internet_yes romantic_yes Mjob_at_home Mjob_health Mjob_services Mjob_teacher Fjob_at_home Fjob_health Fjob_services Fjob_teacher reason_course reason_home reason_reputation guardian_father guardian_mother
0 18 4 4 2 2 0 4 3 4 1 1 3 6 5 6 6 11 1 0 1 1 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1
1 17 1 1 1 2 0 5 3 3 1 1 3 4 5 5 6 11 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0
2 15 1 1 1 2 3 4 3 2 2 3 3 10 7 8 10 12 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1
3 15 4 2 1 3 0 3 2 2 1 1 5 2 15 14 15 14 0 0 1 1 1 0 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1
4 16 3 3 1 2 0 4 3 2 1 2 5 4 6 10 10 13 0 0 1 1 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
In [31]:
from sklearn.model_selection import train_test_split
output_columns = ['G3']
input_columns = df_math.drop(['G3', 'G2', 'G1'], axis=1).columns
df_model = df_math.drop(['G2', 'G1'], axis=1)
# df_model = df_num

train, test = train_test_split(df_model, test_size = 0.3)
x_train = train.loc[:,input_columns] # Gives dataframe of 0: rows 
y_train = train[output_columns]
x_test = test.loc[:,input_columns]
y_test = test[output_columns]
In [32]:
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(x_train,y_train)
Out[32]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [33]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred_training = model.predict(x_train)
y_pred_testing = model.predict(x_test)
training_msq = mean_squared_error(y_pred_training,y_train)
testing_msq = mean_squared_error(y_pred_testing,y_test)
print(training_msq,testing_msq)
4.6467057913 8.61194094871
In [34]:
print('Train R-Square:',r2_score(y_train,y_pred_training))
print('Test R-Square:',r2_score(y_test,y_pred_testing))
Train R-Square: 0.567275880032
Test R-Square: 0.18925849639
In [35]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

headers = ['depth', 'sample_size', 'score_test', 'score_train']
scores_dt = pd.DataFrame(columns = headers)
index = 0
models = list()
for tree_depth in range(1, 15):
    for min_samples_size in range(1, 30, 1):
        model = tree.DecisionTreeRegressor(max_depth = tree_depth, min_samples_leaf=min_samples_size)
        model.fit(x_train,y_train)
        score_train = model.score(x_train,y_train)
        score_test = model.score(x_test,y_test)
        scores_dt.loc[index] = [tree_depth, min_samples_size, score_test, score_train]
        models.append(model)
        index += 1
In [36]:
df_max_scores = scores_dt.sort_values('score_test', ascending=False).head(2)
print(df_max_scores)
     depth  sample_size  score_test  score_train
357   13.0         10.0    0.397403     0.580559
270   10.0         10.0    0.397403     0.580559
In [37]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
model = tree.DecisionTreeRegressor(max_depth=int(df_max_scores.iloc[0,0]), min_samples_leaf=int(df_max_scores.iloc[0,1]))
model.fit(x_train,y_train)
Out[37]:
DecisionTreeRegressor(criterion='mse', max_depth=13, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
In [38]:
#Get the R-Square for the predicted vs actuals on the text sample
print("Training R-Square",model.score(x_train,y_train))
print("Testing R-Square",model.score(x_test,y_test))
Training R-Square 0.580559337194
Testing R-Square 0.39740315327
In [39]:
import pydotplus 
feature_names = [key for key in df_math if not key=='G3']
from IPython.display import Image
dot_data = tree.export_graphviz(model, out_file=None,feature_names=feature_names)
import pydotplus

graph = pydotplus.graphviz.graph_from_dot_data(dot_data)

Image(graph.create_png())
Out[39]:
In [40]:
sns.plt.scatter(model.predict(x_test), y_test)
Out[40]:
<matplotlib.collections.PathCollection at 0x1218eb2b0>
In [41]:
from sklearn.ensemble import GradientBoostingRegressor

headers = ['n_estimators', 'score_test', 'score_train']
scores_gbt = pd.DataFrame(columns = headers)
index = 0
models = list()
for estimator in range(10, 200, 5):
    model = GradientBoostingRegressor(n_estimators = estimator, 
        max_depth=3, 
        random_state=0, 
        learning_rate=0.1)
    model.fit(x_train,y_train)
    score_train = model.score(x_train,y_train)
    score_test = model.score(x_test,y_test)
    scores_gbt.loc[index] = [estimator, score_test, score_train]
    models.append(model)
    index += 1
/Users/Ernst/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [42]:
scores_gbt.plot(y = ['score_train', 'score_test'], x = ['n_estimators'])
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1219077f0>
In [43]:
max_scores_gbt = scores_gbt.sort_values('score_test', ascending=False).head(3)
In [44]:
# Using optimal depth and sample size we run the decision tree regressor 
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=int(max_scores_gbt.iloc[0,0]), 
        max_depth=3, 
        random_state=0, 
        learning_rate=0.1)
# Here selected min samples as 201 to be able to draw tree
model.fit(x_train,y_train)
#Get the R-Square for the predicted vs actuals on the text sample
print("Training R-Square",model.score(x_train,y_train))
print("Testing R-Square",model.score(x_test,y_test))
Training R-Square 0.803778563923
Testing R-Square 0.392478079719
/Users/Ernst/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [45]:
sns.plt.scatter(model.predict(x_test), y_test)
Out[45]:
<matplotlib.collections.PathCollection at 0x121cdd5f8>
In [46]:
df_features_math = pd.DataFrame({'name': input_columns, 'math_value': model.feature_importances_}).sort_values('math_value', ascending=False)
df_features_math
Out[46]:
math_value name
13 0.267185 G3_port
4 0.065996 studytime
12 0.063322 absences
8 0.048701 goout
15 0.045943 sex_M
10 0.041758 Walc
1 0.031768 Medu
28 0.031048 Mjob_health
0 0.030810 age
34 0.030314 Fjob_teacher
21 0.028965 paid_yes
5 0.028815 failures
2 0.026670 Fedu
20 0.024660 famsup_yes
11 0.023289 health
9 0.022688 Dalc
22 0.020309 activities_yes
38 0.020123 guardian_father
19 0.017851 schoolsup_yes
35 0.016769 reason_course
29 0.015749 Mjob_services
7 0.014895 freetime
6 0.012071 famrel
36 0.010105 reason_home
16 0.009931 address_urban
37 0.008045 reason_reputation
26 0.007479 romantic_yes
39 0.007297 guardian_mother
27 0.004969 Mjob_at_home
18 0.004600 famsize_GT3
23 0.004490 nusery_yes
25 0.003882 internet_yes
30 0.003590 Mjob_teacher
3 0.003431 traveltime
31 0.002295 Fjob_at_home
33 0.000184 Fjob_services
32 0.000000 Fjob_health
24 0.000000 higher_yes
17 0.000000 school_GP
14 0.000000 Pstatus_A
In [ ]:
 

Machine learning analysis Portugese

In [47]:
df_port.head()
Out[47]:
age Medu Fedu traveltime studytime failures famrel freetime goout Dalc Walc health absences G1 G2 G3 G3_math Pstatus_A sex_M address_urban school_GP famsize_GT3 schoolsup_yes famsup_yes paid_yes activities_yes nusery_yes higher_yes internet_yes romantic_yes Mjob_at_home Mjob_health Mjob_services Mjob_teacher Fjob_at_home Fjob_health Fjob_services Fjob_teacher reason_course reason_home reason_reputation guardian_father guardian_mother
0 18 4 4 2 2 0 4 3 4 1 1 3 4 0 11 11 6 1 0 1 1 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1
1 17 1 1 1 2 0 5 3 3 1 1 3 2 9 11 11 6 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0
2 15 1 1 1 2 0 4 3 2 2 3 3 6 12 13 12 10 0 0 1 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1
3 15 4 2 1 3 0 3 2 2 1 1 5 0 14 14 14 15 0 0 1 1 1 0 1 0 1 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1
4 16 3 3 1 2 0 4 3 2 1 2 5 0 11 13 13 10 0 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
In [48]:
from sklearn.model_selection import train_test_split
output_columns = ['G3']
input_columns = df_port.drop(['G3', 'G2', 'G1'], axis=1).columns
df_model = df_port.drop(['G2', 'G1'], axis=1)
# df_model = df_num

train, test = train_test_split(df_model, test_size = 0.3)
x_train = train.loc[:,input_columns] # Gives dataframe of 0: rows 
y_train = train[output_columns]
x_test = test.loc[:,input_columns]
y_test = test[output_columns]
In [49]:
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(x_train,y_train)
Out[49]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [50]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred_training = model.predict(x_train)
y_pred_testing = model.predict(x_test)
training_msq = mean_squared_error(y_pred_training,y_train)
testing_msq = mean_squared_error(y_pred_testing,y_test)
print(training_msq,testing_msq)
2.44820126676 5.23395543193
In [51]:
print('Train R-Square:',r2_score(y_train,y_pred_training))
print('Test R-Square:',r2_score(y_test,y_pred_testing))
Train R-Square: 0.588997433545
Test R-Square: 0.357211850263
In [52]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

headers = ['depth', 'sample_size', 'score_test', 'score_train']
scores_dt = pd.DataFrame(columns = headers)
index = 0
models = list()
for tree_depth in range(1, 15):
    for min_samples_size in range(1, 30, 1):
        model = tree.DecisionTreeRegressor(max_depth = tree_depth, min_samples_leaf=min_samples_size)
        model.fit(x_train,y_train)
        score_train = model.score(x_train,y_train)
        score_test = model.score(x_test,y_test)
        scores_dt.loc[index] = [tree_depth, min_samples_size, score_test, score_train]
        models.append(model)
        index += 1
In [53]:
df_max_socres = scores_dt.sort_values('score_test', ascending=False).head(2)
df_max_scores
Out[53]:
depth sample_size score_test score_train
357 13.0 10.0 0.397403 0.580559
270 10.0 10.0 0.397403 0.580559
In [54]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
model = tree.DecisionTreeRegressor(max_depth=int(df_max_scores.iloc[0,0]), min_samples_leaf=int(df_max_scores.iloc[0,1]))
model.fit(x_train,y_train)
Out[54]:
DecisionTreeRegressor(criterion='mse', max_depth=13, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
In [55]:
#Get the R-Square for the predicted vs actuals on the text sample
print("Training R-Square",model.score(x_train,y_train))
print("Testing R-Square",model.score(x_test,y_test))
Training R-Square 0.585668900631
Testing R-Square 0.37210438495
In [56]:
import pydotplus 
feature_names = [key for key in df_math if not key=='G3']
from IPython.display import Image
dot_data = tree.export_graphviz(model, out_file=None,feature_names=feature_names)
import pydotplus

graph = pydotplus.graphviz.graph_from_dot_data(dot_data)

Image(graph.create_png())
Out[56]:
In [57]:
sns.plt.scatter(model.predict(x_test), y_test)
Out[57]:
<matplotlib.collections.PathCollection at 0x121f49320>
In [58]:
from sklearn.ensemble import GradientBoostingRegressor

headers = ['n_estimators', 'score_test', 'score_train']
scores_gbt = pd.DataFrame(columns = headers)
index = 0
models = list()
for estimator in range(10, 200, 5):
    model = GradientBoostingRegressor(n_estimators = estimator, 
        max_depth=3, 
        random_state=0, 
        learning_rate=0.1)
    model.fit(x_train,y_train)
    score_train = model.score(x_train,y_train)
    score_test = model.score(x_test,y_test)
    scores_gbt.loc[index] = [estimator, score_test, score_train]
    models.append(model)
    index += 1
/Users/Ernst/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [59]:
scores_gbt.plot(y = ['score_train', 'score_test'], x = ['n_estimators'])
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x122008860>
In [71]:
max_scores_gbt = scores_gbt.sort_values('score_test', ascending=False).head(3)
# max_scores_gbt.to_csv('estimators.csv')
max_scores_gbt
Out[71]:
n_estimators score_test score_train
6 40.0 0.405215 0.756702
8 50.0 0.404637 0.791239
5 35.0 0.399193 0.733878
In [61]:
# Using optimal depth and sample size we run the decision tree regressor 
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=int(max_scores_gbt.iloc[0,0]), 
        max_depth=3, 
        random_state=0, 
        learning_rate=0.1)
# Here selected min samples as 201 to be able to draw tree
model.fit(x_train,y_train)
#Get the R-Square for the predicted vs actuals on the text sample
print("Training R-Square",model.score(x_train,y_train))
print("Testing R-Square",model.score(x_test,y_test))
Training R-Square 0.75670227719
Testing R-Square 0.405215074636
/Users/Ernst/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [62]:
sns.plt.scatter(model.predict(x_test), y_test)
Out[62]:
<matplotlib.collections.PathCollection at 0x1223d0438>
In [74]:
df_features_port = pd.DataFrame({'name': input_columns, 'port_value': model.feature_importances_}).sort_values('port_value', ascending=False)
df_features_port
Out[74]:
name port_value
13 G3_math 0.279978
12 absences 0.075464
0 age 0.062906
15 sex_M 0.052550
11 health 0.051117
5 failures 0.050225
19 schoolsup_yes 0.037177
9 Dalc 0.034146
24 higher_yes 0.033574
20 famsup_yes 0.031664
1 Medu 0.027958
4 studytime 0.027422
17 school_GP 0.020135
22 activities_yes 0.020041
2 Fedu 0.018799
28 Mjob_health 0.015390
34 Fjob_teacher 0.015371
14 Pstatus_A 0.014968
10 Walc 0.013678
16 address_urban 0.013449
31 Fjob_at_home 0.012753
29 Mjob_services 0.012074
35 reason_course 0.010438
37 reason_reputation 0.009306
6 famrel 0.008822
7 freetime 0.006649
39 guardian_mother 0.006519
3 traveltime 0.006247
18 famsize_GT3 0.006071
36 reason_home 0.005378
30 Mjob_teacher 0.004309
25 internet_yes 0.003189
33 Fjob_services 0.003009
26 romantic_yes 0.002285
27 Mjob_at_home 0.001900
32 Fjob_health 0.001877
23 nusery_yes 0.001476
8 goout 0.001272
38 guardian_father 0.000414
21 paid_yes 0.000000
In [64]:
df_features_port['name'].replace('G3_math', 'G3_other_subject', inplace=True)
df_features_math['name'].replace('G3_port', 'G3_other_subject', inplace=True)
df_features = df_features_math.merge(df_features_port, on='name')
df_features_no_crossover = df_features[df_features.name != 'G3_other_subject']
df_features_no_crossover.set_index('name')
Out[64]:
math_value port_value
name
studytime 0.065996 0.027422
absences 0.063322 0.075464
goout 0.048701 0.001272
sex_M 0.045943 0.052550
Walc 0.041758 0.013678
Medu 0.031768 0.027958
Mjob_health 0.031048 0.015390
age 0.030810 0.062906
Fjob_teacher 0.030314 0.015371
paid_yes 0.028965 0.000000
failures 0.028815 0.050225
Fedu 0.026670 0.018799
famsup_yes 0.024660 0.031664
health 0.023289 0.051117
Dalc 0.022688 0.034146
activities_yes 0.020309 0.020041
guardian_father 0.020123 0.000414
schoolsup_yes 0.017851 0.037177
reason_course 0.016769 0.010438
Mjob_services 0.015749 0.012074
freetime 0.014895 0.006649
famrel 0.012071 0.008822
reason_home 0.010105 0.005378
address_urban 0.009931 0.013449
reason_reputation 0.008045 0.009306
romantic_yes 0.007479 0.002285
guardian_mother 0.007297 0.006519
Mjob_at_home 0.004969 0.001900
famsize_GT3 0.004600 0.006071
nusery_yes 0.004490 0.001476
internet_yes 0.003882 0.003189
Mjob_teacher 0.003590 0.004309
traveltime 0.003431 0.006247
Fjob_at_home 0.002295 0.012753
Fjob_services 0.000184 0.003009
Fjob_health 0.000000 0.001877
higher_yes 0.000000 0.033574
school_GP 0.000000 0.020135
Pstatus_A 0.000000 0.014968
In [75]:
df_features.head()
Out[75]:
math_value name port_value
0 0.267185 G3_other_subject 0.279978
1 0.065996 studytime 0.027422
2 0.063322 absences 0.075464
3 0.048701 goout 0.001272
4 0.045943 sex_M 0.052550
In [65]:
sns.plt.scatter(df_features_no_crossover.math_value, df_features_no_crossover.port_value)
Out[65]:
<matplotlib.collections.PathCollection at 0x1224c24a8>
In [66]:
df_features_no_crossover.plot.scatter(x='math_value', y='port_value')
Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x122423f60>
In [67]:
test = df_features_no_crossover.set_index('name').T.to_dict('list')


# repackage data into array-like for matplotlib 
# (see a preferred pythonic way below)
data = {"x":[], "y":[], "label":[]}
for label, coord in test.items():
    data["x"].append(coord[0])
    data["y"].append(coord[1])
    data["label"].append(label)

# display scatter plot data
plt.figure(figsize=(14,14))
plt.title('Most important features as identified by GradientBoostingRegressor', fontsize=20)
plt.xlabel('Math features', fontsize=20)
plt.ylabel('Portuguese', fontsize=20)
plt.scatter(data["x"], data["y"], marker = 'o')

# add labels
for label, x, y in zip(data["label"], data["x"], data["y"]):
    plt.annotate(label, xy = (x, y), fontsize=12)
    
In [68]:
from mpl_toolkits.basemap import Basemap 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: