import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
data = pd.read_csv("heloc_dataset_v1.csv") # load the data
data.head() # inspect data structure
data.info()
data.describe()
# Data cleaning: rows whose 23 numeric features are all -9 carry no information, so drop them,
# then map the remaining special codes (-7, -8, -9) to NaN and encode the target as 0/1
data['row_sum'] = data.sum(axis=1, numeric_only=True)  # sum of the numeric features per row
data = data[data.row_sum != -9*23]                     # drop rows where every feature equals -9
data = data.drop('row_sum', axis=1)
data = data.replace([-7, -8, -9], np.nan)              # special codes become missing values
data = data.replace('Bad', 0)
data = data.replace('Good', 1)
data.describe()
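# Quick sanity check on the cleaning above (a sketch, assuming the special codes -7/-8/-9 were
# the only missing-value markers): report the surviving shape and the NaN count per column.
print(data.shape)
print(data.isna().sum())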
data.hist(bins=50, figsize=(20,15))
pass
corr_matrix = data.corr()
corr_matrix["RiskPerformance"].sort_values(ascending=False)
from pandas.plotting import scatter_matrix
attributes=["ExternalRiskEstimate","PercentTradesNeverDelq","AverageMInFile", "MaxDelq2PublicRecLast12M"]
scatter_matrix(data[attributes],figsize=(12,8))
pass
from pandas.plotting import scatter_matrix
attributes=["NetFractionRevolvingBurden","PercentTradesWBalance","NumBank2NatlTradesWHighUtilization", "NumTrades60Ever2DerogPubRec"]
scatter_matrix(data[attributes],figsize=(12,8))
pass
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=1)
print(data.shape, train_set.shape, test_set.shape)
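# The split above is purely random; a stratified variant (shown as a sketch, not used below)
# keeps the Good/Bad ratio identical in the train and test sets.
strat_train, strat_test = train_test_split(data, test_size=0.2, random_state=1,
                                           stratify=data["RiskPerformance"])
print(strat_train["RiskPerformance"].mean(), strat_test["RiskPerformance"].mean())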
from sklearn.impute import SimpleImputer     # Imputer was removed from sklearn.preprocessing; SimpleImputer is its replacement
imputer = SimpleImputer(strategy="median")   # only works on numerical values
imputer.fit(data)                            # learn the per-column medians
print(imputer.statistics_)                   # the learned medians
print(data.median().values)                  # should match the imputer statistics
X = imputer.transform(data)                  # returns a NumPy array without column labels
data = pd.DataFrame(X, columns=data.columns) # restore the column names
data.info()                                  # inspect the imputed data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = data.copy().drop("RiskPerformance", axis=1)
y = data["RiskPerformance"].copy()
trainX = train_set.copy().drop("RiskPerformance", axis=1)
train_y = train_set["RiskPerformance"].copy()
testX = test_set.copy().drop("RiskPerformance", axis=1)
test_y = test_set["RiskPerformance"].copy()
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
cat_attributes = ['RiskPerformance']
num_attributes = ['ExternalRiskEstimate','MSinceOldestTradeOpen','MSinceMostRecentTradeOpen','AverageMInFile','NumSatisfactoryTrades','NumTrades60Ever2DerogPubRec',
'NumTrades90Ever2DerogPubRec','PercentTradesNeverDelq', 'MSinceMostRecentDelq','MaxDelq2PublicRecLast12M','MaxDelqEver','NumTotalTrades',
'NumTradesOpeninLast12M','PercentInstallTrades','MSinceMostRecentInqexcl7days','NumInqLast6M','NumInqLast6Mexcl7days','NetFractionRevolvingBurden',
'NetFractionInstallBurden','NumRevolvingTradesWBalance','NumInstallTradesWBalance','NumBank2NatlTradesWHighUtilization','PercentTradesWBalance']
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),])
X = num_pipeline.fit_transform(X)
train_X = num_pipeline.fit_transform(trainX)  # fit the imputer/scaler on the training set only
test_X = num_pipeline.transform(testX)        # reuse the training statistics on the test set to avoid leakage
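# The preprocessing above is fit outside cross-validation; a leakage-free alternative (a sketch)
# chains the imputer, the scaler and a classifier in a single Pipeline, so every CV fold refits
# the preprocessing on its own training portion only.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
full_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier(max_depth=10)),  # illustrative classifier choice
])
print(cross_val_score(full_pipe, trainX, train_y, cv=5).mean())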
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, mean_squared_error
dtree = DecisionTreeClassifier()
dtree.fit(train_X, train_y)
tree_pred = dtree.predict(test_X)
tree_mse = mean_squared_error(test_y, tree_pred)  # with 0/1 labels this equals the misclassification rate
tree_rmse = np.sqrt(tree_mse)
tree_rmse
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(test_y, tree_pred))
print(classification_report(test_y, tree_pred))
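# An unconstrained decision tree usually memorizes the training data; a quick check (a sketch)
# compares train and test accuracy to see how large the gap is.
print("train accuracy:", dtree.score(train_X, train_y))
print("test accuracy: ", dtree.score(test_X, test_y))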
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
base_clf = tree.DecisionTreeClassifier(max_depth=10) # base learner for the ensemble
clf = BaggingClassifier(n_estimators=50, estimator=base_clf, oob_score=True) # bag of 50 trees ('estimator' was named 'base_estimator' in older scikit-learn)
clf = clf.fit(X, y)
print('OOB', clf.oob_score_) # out-of-bag estimate from the single 50-tree ensemble
scores = cross_val_score(clf, X, y, cv=5)
print('CV: ', scores.mean(), scores.std()) # 5-fold CV trains five separate 50-tree ensembles
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs = 2, random_state = 0, max_features = 'sqrt')
rfc.fit(train_X, train_y)
rfc_pred = rfc.predict(test_X)
rfc_mse = mean_squared_error(test_y, rfc_pred)  # score the forest's own predictions, not the tree's
rfc_rmse = np.sqrt(rfc_mse)
rfc_rmse
print(confusion_matrix(test_y, rfc_pred))
print(classification_report(test_y, rfc_pred))
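# A hedged extra: random forests expose feature_importances_, which can be paired with the
# num_attributes list defined above (assuming that list follows the original column order of the CSV).
for importance, name in sorted(zip(rfc.feature_importances_, num_attributes), reverse=True):
    print(f"{name}: {importance:.3f}")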
param_grid = {'n_estimators':[5,10,30,50],
              'criterion':['gini','entropy'],   # classification criteria ('mse'/'mae' belong to regressors)
              'max_features':[2,4,6,8],
              'max_depth':[1,2,3],
              'min_samples_leaf':[1,3,5]}
def count_configurations(p):
    return np.prod([len(v) for v in p.values()])  # number of hyperparameter combinations in the grid
count_configurations(param_grid)
param_grid = {'n_estimators':[5,10,20,30],
'max_depth':[1,3,5,7,9,11,15]}
count_configurations(param_grid)
from sklearn.model_selection import GridSearchCV
forest_clf = RandomForestClassifier()
grid_search = GridSearchCV(forest_clf, param_grid, cv=5,
                           scoring='neg_mean_squared_error')  # with 0/1 labels, MSE equals the misclassification rate
grid_search.fit(train_X, train_y)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params)
grid_search.best_estimator_
# best configuration from the grid search, keeping only the non-default settings
# (max_features='sqrt' replaces the removed 'auto' option, which meant the same thing for classifiers)
clf = RandomForestClassifier(n_estimators=30, max_depth=11, max_features='sqrt')
accuracy = cross_val_score(clf, train_X, train_y, cv=5)
accuracy.mean()
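# A sketch of the final check for the tuned forest: fit on the training set and score on the
# held-out test set, to compare with the cross-validated accuracy above.
clf.fit(train_X, train_y)
print("test accuracy:", clf.score(test_X, test_y))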
# use cross-validation to pick the best k for KNN (k = 1 to 49)
from sklearn import neighbors
acc = []
for i in range(1,50):
clf = neighbors.KNeighborsClassifier(i)
accuracy = cross_val_score(clf, train_X, train_y, cv=5)
accmean = accuracy.mean()
acc.append(accmean)
acc
#find the highest accuracy
max(acc)
#find the corresponding k (list index + 1, since k starts at 1)
acc.index(max(acc)) + 1
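# A small sketch to visualize the accuracy curve over k (the elbow makes the chosen k easier to justify).
plt.plot(range(1, 50), acc)
plt.xlabel("k (number of neighbors)")
plt.ylabel("5-fold CV accuracy")
pass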
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")
tuned_parameters = [
    {'kernel': ['linear'], 'C': [0.01, 0.1, 1.0, 5.0, 10.0, 100.0]}]
clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='precision_macro')  # tune on macro-averaged precision
clf.fit(train_X, train_y)
print("Best Hyperparameter is:")
print(clf.best_params_)
#0.732 for {'C': 0.1, 'kernel': 'linear'}
print("Result is:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
print("%0.3f for %r" % (mean, params))
tuned_parameters = [
{'kernel': ['poly'], 'degree':[0,1,2,5], 'C': [0.01,0.1,1.0,5.0,10.0,100.0]}]
# Objective metric: macro-averaged precision
clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='precision_macro')
clf.fit(train_X, train_y)
print("Best Hyperparameter is:")
print(clf.best_params_)
print("Results are:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
print("%0.3f for %r" % (mean, params))
tuned_parameters = [
{'kernel': ['rbf'], 'gamma': [0.0001,0.001,0.1,1,10,100],'C': [0.1,1,10,100,1000]}]
# Objective metric: macro-averaged precision
clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='precision_macro')
clf.fit(train_X, train_y)
print("Best Hyperparameters are:")
print(clf.best_params_)
print("Results are:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
print("%0.3f for %r" % (mean, params))
clf = SVC(kernel='rbf', C=100, gamma=0.001)   # best rbf configuration from the search above
clf.fit(train_X, train_y)
y_pred = clf.predict(test_X)
print(classification_report(test_y, y_pred))
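# A hedged addition: SVC exposes decision_function, so a ROC AUC on the test set can be computed
# without enabling probability estimates.
from sklearn.metrics import roc_auc_score
print("test ROC AUC:", roc_auc_score(test_y, clf.decision_function(test_X)))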
clf = SVC(kernel='rbf', C=100, gamma=0.001)
clf.fit(X, y)
y_pred = clf.predict(X)                       # in-sample predictions: the model is scored on its own training data
print(classification_report(y, y_pred))
import pickle
pickle.dump(train_X, open('X_train.sav', 'wb'))
pickle.dump(test_X, open('X_test.sav', 'wb'))
pickle.dump(test_y, open('y_test.sav', 'wb'))
pickle.dump(train_y , open('y_train.sav', 'wb'))
# note: these estimators are pickled unfitted; the interface below re-fits them on X_train
pickle.dump(SVC(kernel='rbf', C=100, gamma=0.001), open('pipe_SVM.sav', 'wb'))
pickle.dump(neighbors.KNeighborsClassifier(48), open('pipe_KNN.sav', 'wb'))
pickle.dump(RandomForestClassifier(n_estimators=30, max_depth=11, max_features='sqrt'),
            open('pipe_RDF.sav', 'wb'))
pickle.dump(DecisionTreeClassifier(), open('pipe_DT.sav', 'wb'))
#interface code
import streamlit as st
import pickle
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix
X_test = pickle.load(open('X_test.sav', 'rb'))
y_test = pickle.load(open('y_test.sav', 'rb'))
X_train = pickle.load(open('X_train.sav', 'rb'))
y_train = pickle.load(open('y_train.sav', 'rb'))
# Build the interface: one slider per feature plus an algorithm selector
def test_demo():
    # Sliders for each feature in the sidebar (shown for reference; their values are not passed to the models)
st.sidebar.slider('ExternalRiskEstimate', 0.0, 100.0, (0.0, 100.0), 1.0)
st.sidebar.slider('MSinceOldestTradeOpen', 0.0, 810.0, (0.0, 810.0), 1.0)
st.sidebar.slider('MSinceMostRecentTradeOpen', 0.0, 400.0, (0.0, 400.0), 1.0)
st.sidebar.slider('AverageMInFile', 0.0, 400.0, (0.0, 400.0), 1.0)
st.sidebar.slider('NumSatisfactoryTrades', 0.0, 80.0, (0.0, 80.0), 1.0)
st.sidebar.slider('NumTrades60Ever2DerogPubRec', 0.0, 20.0, (0.0, 20.0), 1.0)
st.sidebar.slider('NumTrades90Ever2DerogPubRec', 0.0, 20.0, (0.0, 20.0), 1.0)
st.sidebar.slider('PercentTradesNeverDelq', 0.0, 100.0, (0.0, 100.0), 1.0)
st.sidebar.slider('MSinceMostRecentDelq', 0.0, 90.0, (0.0, 90.0), 1.0)
st.sidebar.slider('MaxDelq2PublicRecLast12M', 0.0, 7.0, (0.0, 7.0), 1.0)
st.sidebar.slider('MaxDelqEver', 0.0, 5.0, (0.0, 5.0), 1.0)
st.sidebar.slider('NumTotalTrades', 0.0, 110.0, (0.0, 110.0), 1.0)
st.sidebar.slider('NumTradesOpeninLast12M', 0.0, 19.0, (0.0, 19.0), 1.0)
st.sidebar.slider('PercentInstallTrades', 0.0, 100.0, (0.0, 100.0), 1.0)
st.sidebar.slider('MSinceMostRecentInqexcl7days', 0.0, 13.0, (0.0, 13.0), 1.0)
st.sidebar.slider('NumInqLast6M', 0.0, 70.0, (0.0, 70.0), 1.0)
st.sidebar.slider('NumInqLast6Mexcl7days', 0.0, 70.0, (0.0, 70.0), 1.0)
st.sidebar.slider('NetFractionRevolvingBurden', 0.0, 240.0, (0.0, 240.0), 1.0)
st.sidebar.slider('NetFractionInstallBurden', 0.0, 480.0, (0.0, 480.0), 1.0)
st.sidebar.slider('NumRevolvingTradesWBalance', 0.0, 40.0, (0.0, 40.0), 1.0)
st.sidebar.slider('NumInstallTradesWBalance', 0.0, 30.0, (0.0, 30.0), 1.0)
st.sidebar.slider('NumBank2NatlTradesWHighUtilization', 0.0, 20.0, (0.0, 20.0), 1.0)
st.sidebar.slider('PercentTradesWBalance', 0.0, 100.0, (0.0, 100.0), 1.0)
    # Report test-set accuracy and confusion matrix for the selected algorithm
alg = ['Decision Tree','Random Forest', 'Support Vector Machine', 'K Nearest Neighbor']
classifier = st.selectbox('Which algorithm?', alg)
if classifier == 'Decision Tree':
pipe_DT = pickle.load(open('pipe_DT.sav', 'rb'))
pipe_DT.fit(X_train, y_train)
acc = pipe_DT.score(X_test, y_test)
pred_DT = pipe_DT.predict(X_test)
cm_dtc = confusion_matrix(y_test, pred_DT)
st.write('Accuracy: ', acc)
st.write('Confusion matrix: ', cm_dtc)
st.text('Decision Tree Chosen')
elif classifier == 'Random Forest':
pipe_RDF = pickle.load(open('pipe_RDF.sav', 'rb'))
pipe_RDF.fit(X_train, y_train)
accuracy_score = pipe_RDF.score(X_test, y_test)
pred_RDF = pipe_RDF.predict(X_test)
cm_rdf = confusion_matrix(y_test, pred_RDF)
st.write('Accuracy: ', accuracy_score)
st.write('Confusion matrix: ', cm_rdf)
st.text('Random Forest Chosen')
elif classifier == 'Support Vector Machine':
pipe_SVM = pickle.load(open('pipe_SVM.sav', 'rb'))
pipe_SVM.fit(X_train, y_train)
accuracy = pipe_SVM.score(X_test, y_test)
pred_SVM = pipe_SVM.predict(X_test)
cm = confusion_matrix(y_test, pred_SVM)
st.write('Accuracy: ', accuracy)
st.write('Confusion matrix: ', cm)
st.text('Support Vector Machine Chosen')
else:
pipe_KNN = pickle.load(open('pipe_KNN.sav', 'rb'))
pipe_KNN.fit(X_train, y_train)
score = pipe_KNN.score(X_test, y_test)
pred_KNN = pipe_KNN.predict(X_test)
cmKNN = confusion_matrix(y_test, pred_KNN)
st.write('Accuracy: ', score)
st.write('Confusion matrix: ', cmKNN)
st.text('K Nearest Neighbor Chosen')
# title
st.title('Credit Risk Prediction')
# show data
if st.checkbox('Show dataframe'):
st.write(X_test)
# st.write(X_train) # Show the dataset
test_demo()
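# To launch the interface, save the interface cell as a standalone script (e.g. app.py, a name
# chosen here for illustration) alongside the pickled .sav files, then run: streamlit run app.py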