import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib  # sklearn.externals.joblib is deprecated; use the standalone joblib package
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
datafile = "student/student-mat.csv"
prtscores = "student/student-por.csv"
df_math = pd.read_csv(datafile, sep=";")
df_por = pd.read_csv(prtscores, sep=";")
df_math.head()
df_por.head()
# Compare if column headers are the same in both dataframes - should give us an empty list
list(set(df_math.columns) - set(df_por.columns))
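# Turning the check above into a hard assertion (a small added sketch, not part
# of the original flow) makes the shared-schema assumption explicit:
assert set(df_math.columns) == set(df_por.columns), "column mismatch between the two datasets"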
# Create subsets of the data used to merge each original dataset with the grade column of the other class
def create_df_subset(df):
    column_names = ["school", "sex", "age", "address", "famsize", "Pstatus", "Medu", "Fedu",
                    "Mjob", "Fjob", "reason", "nursery", "internet", "G3"]
    return df.loc[:, column_names]
df_por_sub = create_df_subset(df_por)
df_por_sub.rename(columns={'G3':'G3_port'}, inplace=True)
df_math_sub = create_df_subset(df_math)
df_math_sub.rename(columns={'G3':'G3_math'}, inplace=True)
# Add the Portuguese grade to the math df by merging on identifying columns (and vice versa)
def merge_datasets(df_base, df_grade):
    columns_merge = ["school", "sex", "age", "address", "famsize", "Pstatus", "Medu",
                     "Fedu", "Mjob", "Fjob", "reason", "nursery", "internet"]
    return df_base.merge(df_grade, on=columns_merge)
df_math = merge_datasets(df_math, df_por_sub)
df_port = merge_datasets(df_por, df_math_sub)
print(df_math.head(3))
print(df_port.head(3))
print(df_math.info())
print(df_port.info())
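# These thirteen identifying columns are not guaranteed to be a unique key: two
# students sharing all of them would be cross-matched and duplicated by the merge.
# A quick sanity check on that assumption (an added sketch, not original analysis):
columns_merge = ["school", "sex", "age", "address", "famsize", "Pstatus", "Medu",
                 "Fedu", "Mjob", "Fjob", "reason", "nursery", "internet"]
print("duplicate merge keys in df_math:", df_math.duplicated(subset=columns_merge).sum())
print("duplicate merge keys in df_port:", df_port.duplicated(subset=columns_merge).sum())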
from scipy.stats import norm  # matplotlib.mlab.normpdf was removed; scipy provides the normal pdf
import matplotlib.pyplot as plt
import seaborn as sns
def plot_hist(df):
    mu = df.G3.mean()     # mean of the distribution
    sigma = df.G3.std()   # standard deviation of the distribution
    num_bins = 17
    # the histogram of the data ('normed' was removed from matplotlib; use density)
    n, bins, patches = plt.hist(df.G3, num_bins, density=True, facecolor='blue', alpha=0.5)
    # add a 'best fit' normal curve
    y = norm.pdf(bins, mu, sigma)
    plt.plot(bins, y, 'r--')
    plt.xlabel('Final Grade')
    plt.ylabel('Probability')
    plt.title("Histogram of final grade frequencies")
    # tweak spacing to prevent clipping of the ylabel
    plt.subplots_adjust(left=0.15)
    plt.show()
plot_hist(df_math)
plot_hist(df_port)
sns.jointplot(data=df_math, y='G3', x='G3_port', kind='reg', height=10);  # seaborn renamed 'size' to 'height'
plt.show()
df_wo_outliers = df_math[(df_math['G3'] > 0) & (df_math['G3_port'] > 0)]
sns.jointplot(data=df_wo_outliers, y='G3', x='G3_port', kind='reg', height=10);
plt.show()
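# The zero grades form a separate cluster rather than a tail, so quantifying their
# effect on the relationship is worthwhile (an added check, not in the original):
print("Pearson r, all rows:      %.3f" % df_math['G3'].corr(df_math['G3_port']))
print("Pearson r, zeros removed: %.3f" % df_wo_outliers['G3'].corr(df_wo_outliers['G3_port']))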
def value_count(df):
    categorical_columns = ['school', 'famsize', 'Mjob', 'Fjob', 'reason', 'guardian',
                           'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
                           'higher', 'internet', 'romantic']
    for column in categorical_columns:
        print(df[column].value_counts())
def binary_conversion(df):
    # encode each binary categorical column as a 0/1 indicator
    df['Pstatus_A'] = np.where(df['Pstatus'] == 'A', 1, 0)
    df['sex_M'] = np.where(df['sex'] == 'M', 1, 0)
    df['address_urban'] = np.where(df['address'] == 'U', 1, 0)
    df['school_GP'] = np.where(df['school'] == 'GP', 1, 0)
    df['famsize_GT3'] = np.where(df['famsize'] == 'GT3', 1, 0)
    df['schoolsup_yes'] = np.where(df['schoolsup'] == 'yes', 1, 0)
    df['famsup_yes'] = np.where(df['famsup'] == 'yes', 1, 0)
    df['paid_yes'] = np.where(df['paid'] == 'yes', 1, 0)
    df['activities_yes'] = np.where(df['activities'] == 'yes', 1, 0)
    df['nursery_yes'] = np.where(df['nursery'] == 'yes', 1, 0)
    df['higher_yes'] = np.where(df['higher'] == 'yes', 1, 0)
    df['internet_yes'] = np.where(df['internet'] == 'yes', 1, 0)
    df['romantic_yes'] = np.where(df['romantic'] == 'yes', 1, 0)
    return df
df_math = binary_conversion(df_math)
df_port = binary_conversion(df_port)
df_math.head()
from sklearn.preprocessing import LabelBinarizer
def convert_categories(df, column):
    lb_style = LabelBinarizer()
    lb_results = lb_style.fit_transform(df[column])
    # keep the original index so the index merge below aligns rows correctly
    df2 = pd.DataFrame(lb_results, columns=lb_style.classes_, index=df.index)
    df2.columns = [column + "_" + str(col) for col in df2.columns]
    return df.merge(df2, right_index=True, left_index=True)

def merge_categories(df):
    for column in ["Mjob", "Fjob", "reason", "guardian"]:
        df = convert_categories(df, column)
    return df
df_math = merge_categories(df_math)
df_port = merge_categories(df_port)
df_math.info()
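# For comparison, pandas can produce the same one-hot columns in a single call,
# without LabelBinarizer. A throwaway sketch on an untouched copy (the pipeline
# above has already added these columns, so this is illustration only; the name
# `demo` is hypothetical):
demo = pd.get_dummies(df_por[["Mjob", "Fjob", "reason", "guardian"]],
                      columns=["Mjob", "Fjob", "reason", "guardian"])
demo.head(3)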
# Drop rows whose final grade G3 is zero
def drop_zeros(df):
    return df[df['G3'] > 0]

# keep only the numeric columns (avoids the private _get_numeric_data method)
df_math = drop_zeros(df_math).select_dtypes(include=np.number)
df_port = drop_zeros(df_port).select_dtypes(include=np.number)
# Exploratory Analysis
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
corrmat = df_math.corr(method='pearson')
f, ax = plt.subplots(figsize=(14, 14))  # sns.plt was removed from seaborn; use matplotlib directly
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, fmt='.1f');
g = sns.clustermap(corrmat, annot=True, fmt='.1f', figsize=(15, 15));
_ = g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
def drop_columns(df):
    return df.drop(['Fjob_other', 'Mjob_other', 'guardian_other', 'reason_other'], axis=1)
df_math = drop_columns(df_math)
df_port = drop_columns(df_port)
df_math.to_csv("ARA_MathGrades.csv", index=False)
df_port.to_csv("ARA_PortGrades.csv", index=False)
sns.lmplot(x='G3_port', y='G3', data=df_math);
def normalize_df(df):
    # mean-centre each column and scale it by its range (mean normalization)
    return (df - df.mean()) / (df.max() - df.min())
df_math_norm = normalize_df(df_math)
df_port_norm = normalize_df(df_port)
df_math_norm.head(3)
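# Sanity check on the normalization (an added sketch): after mean-centring and
# range-scaling, every non-constant column should have mean ~0 and range exactly 1.
print("max |column mean|:", df_math_norm.mean().abs().max())
print("max column range: ", (df_math_norm.max() - df_math_norm.min()).max())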
import statsmodels.api as sm
X = df_math['G3_port']
Y = df_math['G3']
X = sm.add_constant(X)
est = sm.OLS(Y, X).fit()
est.summary()
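# The same numbers can be pulled off the fitted results object programmatically
# (a small added sketch): the slope on G3_port and the fit's R-squared.
print("slope on G3_port: %.3f" % est.params['G3_port'])
print("R-squared:        %.3f" % est.rsquared)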
X = df_math_norm.drop(['G3', 'G1', 'G2'], axis=1)
Y = df_math_norm['G3']
X = sm.add_constant(X)
est_math = sm.OLS(Y, X).fit()
est_math.summary()
dict_math = est_math.params.to_dict()
dict_math.pop('const')
dict_math.pop('G3_port')
dict_math
X = df_port_norm.drop(['G3', 'G1', 'G2'], axis=1)
Y = df_port_norm['G3']
X = sm.add_constant(X)
est_port = sm.OLS(Y, X).fit()
est_port.summary()
dict_port = est_port.params.to_dict()
dict_port.pop('const')
dict_port.pop('G3_math')
dict_port
dict_coeff = {}
for key in (dict_math.keys() | dict_port.keys()):
    if key in dict_math: dict_coeff.setdefault(key, []).append(dict_math[key])
    if key in dict_port: dict_coeff.setdefault(key, []).append(dict_port[key])
print(dict_coeff)
test = dict_coeff
# repackage the data into array-likes for matplotlib
data = {"x": [], "y": [], "label": []}
for label, coord in test.items():
    data["x"].append(coord[0])
    data["y"].append(coord[1])
    data["label"].append(label)
# display scatter plot data
plt.figure(figsize=(14,14))
plt.title('Regression coefficients', fontsize=20)
plt.xlabel('Math', fontsize=20)
plt.ylabel('Portuguese', fontsize=20)
plt.scatter(data["x"], data["y"], marker = 'o')
# add labels
for label, x, y in zip(data["label"], data["x"], data["y"]):
    plt.annotate(label, xy=(x, y), fontsize=12)
print( """
# Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2 sex - student's sex (binary: "F" - female or "M" - male)
3 age - student's age (numeric: from 15 to 22)
4 address - student's home address type (binary: "U" - urban or "R" - rural)
5 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
6 Pstatus - parent's cohabitation status (binary: "T" - living together or "A" - apart)
7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
9 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
10 Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
11 reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
12 guardian - student's guardian (nominal: "mother", "father" or "other")
13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
16 schoolsup - extra educational support (binary: yes or no)
17 famsup - family educational support (binary: yes or no)
18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19 activities - extra-curricular activities (binary: yes or no)
20 nursery - attended nursery school (binary: yes or no)
21 higher - wants to take higher education (binary: yes or no)
22 internet - Internet access at home (binary: yes or no)
23 romantic - with a romantic relationship (binary: yes or no)
24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29 health - current health status (numeric: from 1 - very bad to 5 - very good)
30 absences - number of school absences (numeric: from 0 to 93)
""")
df_math.head()
from sklearn.model_selection import train_test_split
output_columns = ['G3']
input_columns = df_math.drop(['G3', 'G2', 'G1'], axis=1).columns
df_model = df_math.drop(['G2', 'G1'], axis=1)
# df_model = df_num
train, test = train_test_split(df_model, test_size = 0.3)
x_train = train.loc[:, input_columns]  # select the feature columns for training
y_train = train[output_columns]
x_test = test.loc[:,input_columns]
y_test = test[output_columns]
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(x_train,y_train)
from sklearn.metrics import mean_squared_error, r2_score
y_pred_training = model.predict(x_train)
y_pred_testing = model.predict(x_test)
training_msq = mean_squared_error(y_train, y_pred_training)  # (y_true, y_pred) order
testing_msq = mean_squared_error(y_test, y_pred_testing)
print(training_msq,testing_msq)
print('Train R-Square:',r2_score(y_train,y_pred_training))
print('Test R-Square:',r2_score(y_test,y_pred_testing))
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
headers = ['depth', 'sample_size', 'score_test', 'score_train']
scores_dt = pd.DataFrame(columns = headers)
index = 0
models = list()
for tree_depth in range(1, 15):
    for min_samples_size in range(1, 30, 1):
        model = tree.DecisionTreeRegressor(max_depth=tree_depth, min_samples_leaf=min_samples_size)
        model.fit(x_train, y_train)
        score_train = model.score(x_train, y_train)
        score_test = model.score(x_test, y_test)
        scores_dt.loc[index] = [tree_depth, min_samples_size, score_test, score_train]
        models.append(model)
        index += 1
df_max_scores = scores_dt.sort_values('score_test', ascending=False).head(2)
print(df_max_scores)
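# GridSearchCV was imported at the top but never used. The double loop above tunes
# against a single held-out split, which can overfit the hyperparameters to that
# split; a cross-validated version of the same search would look like this (an
# added sketch, not the path taken in this notebook):
param_grid = {'max_depth': range(1, 15), 'min_samples_leaf': range(1, 30)}
grid = GridSearchCV(tree.DecisionTreeRegressor(), param_grid, cv=5)
grid.fit(x_train, y_train.values.ravel())
print(grid.best_params_, grid.best_score_)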
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
model = tree.DecisionTreeRegressor(max_depth=int(df_max_scores.iloc[0,0]), min_samples_leaf=int(df_max_scores.iloc[0,1]))
model.fit(x_train,y_train)
# Get the R-Square for the predicted vs actuals on the test sample
print("Training R-Square",model.score(x_train,y_train))
print("Testing R-Square",model.score(x_test,y_test))
import pydotplus
from IPython.display import Image
# feature names must match the columns the model was actually trained on
feature_names = list(input_columns)
dot_data = tree.export_graphviz(model, out_file=None, feature_names=feature_names)
graph = pydotplus.graphviz.graph_from_dot_data(dot_data)
Image(graph.create_png())
plt.scatter(model.predict(x_test), y_test)
from sklearn.ensemble import GradientBoostingRegressor
headers = ['n_estimators', 'score_test', 'score_train']
scores_gbt = pd.DataFrame(columns = headers)
index = 0
models = list()
for estimator in range(10, 200, 5):
    model = GradientBoostingRegressor(n_estimators=estimator,
                                      max_depth=3,
                                      random_state=0,
                                      learning_rate=0.1)
    model.fit(x_train, y_train)
    score_train = model.score(x_train, y_train)
    score_test = model.score(x_test, y_test)
    scores_gbt.loc[index] = [estimator, score_test, score_train]
    models.append(model)
    index += 1
scores_gbt.plot(y=['score_train', 'score_test'], x='n_estimators')
max_scores_gbt = scores_gbt.sort_values('score_test', ascending=False).head(3)
# Refit the gradient boosting regressor with the best-scoring number of estimators
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(n_estimators=int(max_scores_gbt.iloc[0, 0]),
                                  max_depth=3,
                                  random_state=0,
                                  learning_rate=0.1)
model.fit(x_train,y_train)
# Get the R-Square for the predicted vs actuals on the test sample
print("Training R-Square",model.score(x_train,y_train))
print("Testing R-Square",model.score(x_test,y_test))
plt.scatter(model.predict(x_test), y_test)
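# joblib was imported at the top but never used; persisting the fitted model is
# its natural role here (an added sketch; the filename is illustrative only):
joblib.dump(model, 'gbt_math_grades.pkl')
# model = joblib.load('gbt_math_grades.pkl')  # reload later without refitting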
df_features_math = pd.DataFrame({'name': input_columns, 'math_value': model.feature_importances_}).sort_values('math_value', ascending=False)
df_features_math
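# A horizontal bar chart reads more easily than the raw table for comparing
# importances (an added sketch of the top ten features):
df_features_math.head(10).plot.barh(x='name', y='math_value', figsize=(8, 6), legend=False)
plt.xlabel('feature importance')
plt.show()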
df_port.head()
from sklearn.model_selection import train_test_split
output_columns = ['G3']
input_columns = df_port.drop(['G3', 'G2', 'G1'], axis=1).columns
df_model = df_port.drop(['G2', 'G1'], axis=1)
# df_model = df_num
train, test = train_test_split(df_model, test_size = 0.3)
x_train = train.loc[:, input_columns]  # select the feature columns for training
y_train = train[output_columns]
x_test = test.loc[:,input_columns]
y_test = test[output_columns]
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(x_train,y_train)
from sklearn.metrics import mean_squared_error, r2_score
y_pred_training = model.predict(x_train)
y_pred_testing = model.predict(x_test)
training_msq = mean_squared_error(y_train, y_pred_training)  # (y_true, y_pred) order
testing_msq = mean_squared_error(y_test, y_pred_testing)
print(training_msq,testing_msq)
print('Train R-Square:',r2_score(y_train,y_pred_training))
print('Test R-Square:',r2_score(y_test,y_pred_testing))
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
headers = ['depth', 'sample_size', 'score_test', 'score_train']
scores_dt = pd.DataFrame(columns = headers)
index = 0
models = list()
for tree_depth in range(1, 15):
    for min_samples_size in range(1, 30, 1):
        model = tree.DecisionTreeRegressor(max_depth=tree_depth, min_samples_leaf=min_samples_size)
        model.fit(x_train, y_train)
        score_train = model.score(x_train, y_train)
        score_test = model.score(x_test, y_test)
        scores_dt.loc[index] = [tree_depth, min_samples_size, score_test, score_train]
        models.append(model)
        index += 1
df_max_scores = scores_dt.sort_values('score_test', ascending=False).head(2)
df_max_scores
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
model = tree.DecisionTreeRegressor(max_depth=int(df_max_scores.iloc[0,0]), min_samples_leaf=int(df_max_scores.iloc[0,1]))
model.fit(x_train,y_train)
# Get the R-Square for the predicted vs actuals on the test sample
print("Training R-Square",model.score(x_train,y_train))
print("Testing R-Square",model.score(x_test,y_test))
import pydotplus
from IPython.display import Image
# feature names must match the columns the Portuguese model was trained on
feature_names = list(input_columns)
dot_data = tree.export_graphviz(model, out_file=None, feature_names=feature_names)
graph = pydotplus.graphviz.graph_from_dot_data(dot_data)
Image(graph.create_png())
plt.scatter(model.predict(x_test), y_test)
from sklearn.ensemble import GradientBoostingRegressor
headers = ['n_estimators', 'score_test', 'score_train']
scores_gbt = pd.DataFrame(columns = headers)
index = 0
models = list()
for estimator in range(10, 200, 5):
    model = GradientBoostingRegressor(n_estimators=estimator,
                                      max_depth=3,
                                      random_state=0,
                                      learning_rate=0.1)
    model.fit(x_train, y_train)
    score_train = model.score(x_train, y_train)
    score_test = model.score(x_test, y_test)
    scores_gbt.loc[index] = [estimator, score_test, score_train]
    models.append(model)
    index += 1
scores_gbt.plot(y=['score_train', 'score_test'], x='n_estimators')
max_scores_gbt = scores_gbt.sort_values('score_test', ascending=False).head(3)
# max_scores_gbt.to_csv('estimators.csv')
max_scores_gbt
# Refit the gradient boosting regressor with the best-scoring number of estimators
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(n_estimators=int(max_scores_gbt.iloc[0, 0]),
                                  max_depth=3,
                                  random_state=0,
                                  learning_rate=0.1)
model.fit(x_train,y_train)
# Get the R-Square for the predicted vs actuals on the test sample
print("Training R-Square",model.score(x_train,y_train))
print("Testing R-Square",model.score(x_test,y_test))
plt.scatter(model.predict(x_test), y_test)
df_features_port = pd.DataFrame({'name': input_columns, 'port_value': model.feature_importances_}).sort_values('port_value', ascending=False)
df_features_port
df_features_port['name'] = df_features_port['name'].replace('G3_math', 'G3_other_subject')
df_features_math['name'] = df_features_math['name'].replace('G3_port', 'G3_other_subject')
df_features = df_features_math.merge(df_features_port, on='name')
df_features_no_crossover = df_features[df_features.name != 'G3_other_subject']
df_features_no_crossover.set_index('name')
df_features.head()
plt.scatter(df_features_no_crossover.math_value, df_features_no_crossover.port_value)
df_features_no_crossover.plot.scatter(x='math_value', y='port_value')
test = df_features_no_crossover.set_index('name').T.to_dict('list')
# repackage the data into array-likes for matplotlib
data = {"x": [], "y": [], "label": []}
for label, coord in test.items():
    data["x"].append(coord[0])
    data["y"].append(coord[1])
    data["label"].append(label)
# display scatter plot data
plt.figure(figsize=(14,14))
plt.title('Most important features as identified by GradientBoostingRegressor', fontsize=20)
plt.xlabel('Math feature importance', fontsize=20)
plt.ylabel('Portuguese feature importance', fontsize=20)
plt.scatter(data["x"], data["y"], marker = 'o')
# add labels
for label, x, y in zip(data["label"], data["x"], data["y"]):
    plt.annotate(label, xy=(x, y), fontsize=12)