Python Final Project: Predictive Model for Credit Risk

In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

data = pd.read_csv("heloc_dataset_v1.csv") # load the data
data.head()                                # inspect data structure 
Out[76]:
RiskPerformance ExternalRiskEstimate MSinceOldestTradeOpen MSinceMostRecentTradeOpen AverageMInFile NumSatisfactoryTrades NumTrades60Ever2DerogPubRec NumTrades90Ever2DerogPubRec PercentTradesNeverDelq MSinceMostRecentDelq ... PercentInstallTrades MSinceMostRecentInqexcl7days NumInqLast6M NumInqLast6Mexcl7days NetFractionRevolvingBurden NetFractionInstallBurden NumRevolvingTradesWBalance NumInstallTradesWBalance NumBank2NatlTradesWHighUtilization PercentTradesWBalance
0 Bad 55 144 4 84 20 3 0 83 2 ... 43 0 0 0 33 -8 8 1 1 69
1 Bad 61 58 15 41 2 4 4 100 -7 ... 67 0 0 0 0 -8 0 -8 -8 0
2 Bad 67 66 5 24 9 0 0 100 -7 ... 44 0 4 4 53 66 4 2 1 86
3 Bad 66 169 1 73 28 1 1 93 76 ... 57 0 5 4 72 83 6 4 3 91
4 Bad 81 333 27 132 12 0 0 100 -7 ... 25 0 1 1 51 89 3 1 0 80

5 rows × 24 columns

1. Explore the data

  • Size, data types, missing values, ...
In [77]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10459 entries, 0 to 10458
Data columns (total 24 columns):
RiskPerformance                       10459 non-null object
ExternalRiskEstimate                  10459 non-null int64
MSinceOldestTradeOpen                 10459 non-null int64
MSinceMostRecentTradeOpen             10459 non-null int64
AverageMInFile                        10459 non-null int64
NumSatisfactoryTrades                 10459 non-null int64
NumTrades60Ever2DerogPubRec           10459 non-null int64
NumTrades90Ever2DerogPubRec           10459 non-null int64
PercentTradesNeverDelq                10459 non-null int64
MSinceMostRecentDelq                  10459 non-null int64
MaxDelq2PublicRecLast12M              10459 non-null int64
MaxDelqEver                           10459 non-null int64
NumTotalTrades                        10459 non-null int64
NumTradesOpeninLast12M                10459 non-null int64
PercentInstallTrades                  10459 non-null int64
MSinceMostRecentInqexcl7days          10459 non-null int64
NumInqLast6M                          10459 non-null int64
NumInqLast6Mexcl7days                 10459 non-null int64
NetFractionRevolvingBurden            10459 non-null int64
NetFractionInstallBurden              10459 non-null int64
NumRevolvingTradesWBalance            10459 non-null int64
NumInstallTradesWBalance              10459 non-null int64
NumBank2NatlTradesWHighUtilization    10459 non-null int64
PercentTradesWBalance                 10459 non-null int64
dtypes: int64(23), object(1)
memory usage: 1.9+ MB
  • 10459 records
  • 24 columns

Explore the data - cont.

In [3]:
data.describe()
Out[3]:
ExternalRiskEstimate MSinceOldestTradeOpen MSinceMostRecentTradeOpen AverageMInFile NumSatisfactoryTrades NumTrades60Ever2DerogPubRec NumTrades90Ever2DerogPubRec PercentTradesNeverDelq MSinceMostRecentDelq MaxDelq2PublicRecLast12M ... PercentInstallTrades MSinceMostRecentInqexcl7days NumInqLast6M NumInqLast6Mexcl7days NetFractionRevolvingBurden NetFractionInstallBurden NumRevolvingTradesWBalance NumInstallTradesWBalance NumBank2NatlTradesWHighUtilization PercentTradesWBalance
count 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 ... 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000 10459.000000
mean 67.425758 184.205373 8.543455 73.843293 19.428052 0.042738 -0.142843 86.661536 6.762406 4.928291 ... 32.166460 -0.325366 0.868152 0.812602 31.629888 39.158906 3.185008 0.976097 0.018071 62.079166
std 21.121621 109.683816 13.301745 38.782803 13.004327 2.513910 2.367397 25.999584 20.501250 3.756275 ... 20.128634 6.067556 3.179304 3.143698 30.060140 42.101601 4.413173 4.060995 3.358135 27.711565
min -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 ... -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000 -9.000000
25% 63.000000 118.000000 3.000000 52.000000 12.000000 0.000000 0.000000 87.000000 -7.000000 4.000000 ... 20.000000 -7.000000 0.000000 0.000000 5.000000 -8.000000 2.000000 1.000000 0.000000 47.000000
50% 71.000000 178.000000 5.000000 74.000000 19.000000 0.000000 0.000000 96.000000 -7.000000 6.000000 ... 31.000000 0.000000 1.000000 1.000000 25.000000 47.000000 3.000000 2.000000 0.000000 67.000000
75% 79.000000 249.500000 11.000000 95.000000 27.000000 1.000000 0.000000 100.000000 14.000000 7.000000 ... 44.000000 1.000000 2.000000 2.000000 54.000000 79.000000 5.000000 3.000000 1.000000 82.000000
max 94.000000 803.000000 383.000000 383.000000 79.000000 19.000000 19.000000 100.000000 83.000000 9.000000 ... 100.000000 24.000000 66.000000 66.000000 232.000000 471.000000 32.000000 23.000000 18.000000 100.000000

8 rows × 23 columns

  • -9 in all columns, need to deal with nan
In [65]:
#data cleaning
data['row_sum'] = data.sum(axis = 1)
data = data[data.row_sum != -9*23]
data = data.drop('row_sum', axis = 1)
data = data.replace([-7,-8,-9],[np.nan, np.nan, np.nan])
data = data.replace('Bad', 0)
data = data.replace('Good', 1)
data.describe()
Out[65]:
RiskPerformance ExternalRiskEstimate MSinceOldestTradeOpen MSinceMostRecentTradeOpen AverageMInFile NumSatisfactoryTrades NumTrades60Ever2DerogPubRec NumTrades90Ever2DerogPubRec PercentTradesNeverDelq MSinceMostRecentDelq ... PercentInstallTrades MSinceMostRecentInqexcl7days NumInqLast6M NumInqLast6Mexcl7days NetFractionRevolvingBurden NetFractionInstallBurden NumRevolvingTradesWBalance NumInstallTradesWBalance NumBank2NatlTradesWHighUtilization PercentTradesWBalance
count 9871.000000 9861.000000 9632.000000 9871.000000 9871.000000 9871.000000 9871.000000 9871.000000 9871.000000 5031.000000 ... 9871.000000 7540.000000 9871.000000 9871.000000 9685.000000 6452.000000 9715.000000 9010.000000 9288.00000 9853.000000
mean 0.479688 72.060440 200.769103 9.588492 78.778138 21.121467 0.581400 0.384763 92.359943 21.879547 ... 34.618681 2.477719 1.455982 1.397123 34.857718 68.537973 4.102110 2.484906 1.09227 66.449000
std 0.499613 9.871795 97.946081 12.963398 34.066063 11.321396 1.238783 0.993223 11.772876 20.808514 ... 17.953432 4.760413 2.136161 2.096102 28.896627 24.903776 3.021621 1.634503 1.53625 22.035459
min 0.000000 33.000000 2.000000 0.000000 4.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.00000 0.000000
25% 0.000000 64.000000 135.000000 3.000000 57.000000 13.000000 0.000000 0.000000 89.000000 5.000000 ... 21.000000 0.000000 0.000000 0.000000 9.000000 53.000000 2.000000 1.000000 0.00000 50.000000
50% 0.000000 72.000000 186.000000 6.000000 76.000000 20.000000 0.000000 0.000000 97.000000 15.000000 ... 33.000000 0.000000 1.000000 1.000000 29.000000 74.000000 3.000000 2.000000 1.00000 67.000000
75% 1.000000 80.000000 257.000000 12.000000 97.000000 28.000000 1.000000 0.000000 100.000000 34.000000 ... 45.000000 3.000000 2.000000 2.000000 56.000000 87.000000 5.000000 3.000000 2.00000 83.000000
max 1.000000 94.000000 803.000000 383.000000 383.000000 79.000000 19.000000 19.000000 100.000000 83.000000 ... 100.000000 24.000000 66.000000 66.000000 232.000000 471.000000 32.000000 23.000000 18.00000 100.000000

8 rows × 24 columns

Explore the data - cont.

In [29]:
data.hist(bins=50, figsize=(20,15))
pass

2. Explore data to gain insights

  • Only use training set from now on
  • Use sample if data set is too large
In [30]:
corr_matrix = data.corr()
corr_matrix["RiskPerformance"].sort_values(ascending=False)
Out[30]:
RiskPerformance                       1.000000
ExternalRiskEstimate                  0.460161
PercentTradesNeverDelq                0.257358
AverageMInFile                        0.243869
MaxDelq2PublicRecLast12M              0.236759
MaxDelqEver                           0.216764
MSinceOldestTradeOpen                 0.213520
MSinceMostRecentInqexcl7days          0.163994
NumSatisfactoryTrades                 0.141092
NumTotalTrades                        0.100787
MSinceMostRecentDelq                  0.077159
MSinceMostRecentTradeOpen             0.046343
NetFractionInstallBurden             -0.049501
NumInstallTradesWBalance             -0.051586
NumTradesOpeninLast12M               -0.077462
NumRevolvingTradesWBalance           -0.126915
NumInqLast6Mexcl7days                -0.137998
PercentInstallTrades                 -0.140418
NumTrades90Ever2DerogPubRec          -0.140443
NumInqLast6M                         -0.143640
NumTrades60Ever2DerogPubRec          -0.169747
NumBank2NatlTradesWHighUtilization   -0.236910
PercentTradesWBalance                -0.279148
NetFractionRevolvingBurden           -0.350217
Name: RiskPerformance, dtype: float64

(Pearson's) Correlation coefficient - cont.

In [31]:
from pandas.plotting import scatter_matrix
attributes=["ExternalRiskEstimate","PercentTradesNeverDelq","AverageMInFile", "MaxDelq2PublicRecLast12M"]
scatter_matrix(data[attributes],figsize=(12,8))
pass
In [32]:
from pandas.plotting import scatter_matrix
attributes=["NetFractionRevolvingBurden","PercentTradesWBalance","NumBank2NatlTradesWHighUtilization", "NumTrades60Ever2DerogPubRec"]
scatter_matrix(data[attributes],figsize=(12,8))
pass

4. Prepare the Data for ML Algorithms

  • Data cleaning
  • Transformations and scaling
  • Create data pipelines
    • Reproducable code
    • Use with new data
    • Test various transformations

1) Create test set

  • Test set - data that was not used in constructing the model and only used for final evaluations
  • Train set - data used to construct a model
In [66]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=1)
print(data.shape, train_set.shape, test_set.shape)
(9871, 24) (7896, 24) (1975, 24)

2) Data cleanning

  • Imputer - sklearn built in objects to fill missing values (mean, median, most frequent)
  • Fills values for columns with numerical values
In [67]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy="median") # only works on numerical values
In [68]:
imputer.fit(data)   # initialize object
print(imputer.statistics_) # print interval values (media)
print(data.median().values)
[  0.  72. 186.   6.  76.  20.   0.   0.  97.  15.   6.   6.  21.   1.
  33.   0.   1.   1.  29.  74.   3.   2.   1.  67.]
[  0.  72. 186.   6.  76.  20.   0.   0.  97.  15.   6.   6.  21.   1.
  33.   0.   1.   1.  29.  74.   3.   2.   1.  67.]
In [69]:
X = imputer.transform(data) # transform all numerical values in data frame (returns matrix without labels)
data = pd.DataFrame(X, columns=data.columns) # add column names
data.info() # show statistics on transformed data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9871 entries, 0 to 9870
Data columns (total 24 columns):
RiskPerformance                       9871 non-null float64
ExternalRiskEstimate                  9871 non-null float64
MSinceOldestTradeOpen                 9871 non-null float64
MSinceMostRecentTradeOpen             9871 non-null float64
AverageMInFile                        9871 non-null float64
NumSatisfactoryTrades                 9871 non-null float64
NumTrades60Ever2DerogPubRec           9871 non-null float64
NumTrades90Ever2DerogPubRec           9871 non-null float64
PercentTradesNeverDelq                9871 non-null float64
MSinceMostRecentDelq                  9871 non-null float64
MaxDelq2PublicRecLast12M              9871 non-null float64
MaxDelqEver                           9871 non-null float64
NumTotalTrades                        9871 non-null float64
NumTradesOpeninLast12M                9871 non-null float64
PercentInstallTrades                  9871 non-null float64
MSinceMostRecentInqexcl7days          9871 non-null float64
NumInqLast6M                          9871 non-null float64
NumInqLast6Mexcl7days                 9871 non-null float64
NetFractionRevolvingBurden            9871 non-null float64
NetFractionInstallBurden              9871 non-null float64
NumRevolvingTradesWBalance            9871 non-null float64
NumInstallTradesWBalance              9871 non-null float64
NumBank2NatlTradesWHighUtilization    9871 non-null float64
PercentTradesWBalance                 9871 non-null float64
dtypes: float64(24)
memory usage: 1.8 MB

3) Feature Scaling

  • standardization: change to 0 mean and 1 standard deviation (subtract mean and divide by standard deviation)

Pipelines

  • We finished transforming the data, creating pipelines that manage flow of data to make predictions on the validation/test data.
In [70]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
In [71]:
X = data.copy().drop("RiskPerformance", axis=1)
y = data["RiskPerformance"].copy()
trainX = train_set.copy().drop("RiskPerformance", axis=1)
train_y = train_set["RiskPerformance"].copy()
testX = test_set.copy().drop("RiskPerformance", axis=1)
test_y = test_set["RiskPerformance"].copy()
In [72]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

cat_attributes = ['RiskPerformance']
num_attributes = ['ExternalRiskEstimate','MSinceOldestTradeOpen','MSinceMostRecentTradeOpen','AverageMInFile','NumSatisfactoryTrades','NumTrades60Ever2DerogPubRec',
                  'NumTrades90Ever2DerogPubRec','PercentTradesNeverDelq', 'MSinceMostRecentDelq','MaxDelq2PublicRecLast12M','MaxDelqEver','NumTotalTrades',
                  'NumTradesOpeninLast12M','PercentInstallTrades','MSinceMostRecentInqexcl7days','NumInqLast6M','NumInqLast6Mexcl7days','NetFractionRevolvingBurden',
                  'NetFractionInstallBurden','NumRevolvingTradesWBalance','NumInstallTradesWBalance','NumBank2NatlTradesWHighUtilization','PercentTradesWBalance']

num_pipeline = Pipeline([
    ('imputer',Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),])

X = num_pipeline.fit_transform(X)
train_X = num_pipeline.fit_transform(trainX)
test_X = num_pipeline.fit_transform(testX)

5. Model training

  • We are now ready to train and select models

1) Decision trees

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
dtree = DecisionTreeClassifier()
dtree.fit(train_X, train_y)
Out[40]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
In [41]:
tree_pred = dtree.predict(test_X)
tree_mse = mean_squared_error(test_y, tree_pred)
tree_rmse = np.sqrt(tree_mse)
tree_rmse
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-41-b7c33a507fdb> in <module>
      1 tree_pred = dtree.predict(test_X)
----> 2 tree_mse = mean_squared_error(test_y, tree_pred)
      3 tree_rmse = np.sqrt(tree_mse)
      4 tree_rmse

NameError: name 'mean_squared_error' is not defined
In [42]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(test_y, tree_pred))
[[661 352]
 [385 577]]
In [43]:
print(classification_report(test_y, tree_pred))
              precision    recall  f1-score   support

         0.0       0.63      0.65      0.64      1013
         1.0       0.62      0.60      0.61       962

    accuracy                           0.63      1975
   macro avg       0.63      0.63      0.63      1975
weighted avg       0.63      0.63      0.63      1975

Fitting bootstrap aggregation of decision trees of depth 10

In [44]:
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score


base_clf = tree.DecisionTreeClassifier(max_depth=10) # initialize a base classifier
clf = BaggingClassifier(n_estimators=50, base_estimator=base_clf, oob_score=True) #50 different models
clf = clf.fit(X,y)    
print('OOB', clf.oob_score_)              # uses information from a single bag (50 trees)
scores = cross_val_score(clf, X, y, cv=5) 
print('CV: ',scores.mean(),scores.std())  # uses information from 5 bags (50*5 trees)
OOB 0.7223179009218924
CV:  0.7275866261398176 0.013125066649432854

2) Random Forests

In [45]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs = 2, random_state = 0, max_features = 'sqrt')
rfc.fit(train_X, train_y)
/Users/kevweirikefe/opt/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Out[45]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
In [46]:
rfc_pred = rfc.predict(test_X)
rfc_mse = mean_squared_error(test_y, tree_pred)
rfc_rmse = np.sqrt(tree_mse)
rfc_rmse
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-46-ad29b46ba765> in <module>
      1 rfc_pred = rfc.predict(test_X)
----> 2 rfc_mse = mean_squared_error(test_y, tree_pred)
      3 rfc_rmse = np.sqrt(tree_mse)
      4 rfc_rmse

NameError: name 'mean_squared_error' is not defined
In [47]:
print(confusion_matrix(test_y, rfc_pred))
[[785 228]
 [382 580]]
In [48]:
print(classification_report(test_y, rfc_pred))
              precision    recall  f1-score   support

         0.0       0.67      0.77      0.72      1013
         1.0       0.72      0.60      0.66       962

    accuracy                           0.69      1975
   macro avg       0.70      0.69      0.69      1975
weighted avg       0.69      0.69      0.69      1975

Hyper-parameter tuning

In [49]:
param_grid = {'n_estimators':[5,10,30, 50],
               'criterion':['mse','mae'],
               'max_features':[2,4,6,8],
               'max_depth':[1,2,3],
               'min_samples_leaf':[1,3,5]}
In [50]:
def count_configurations(p):
    return(np.product(list(map(len,param_grid.values()))))
count_configurations(param_grid)
Out[50]:
288
In [51]:
param_grid = {'n_estimators':[5,10,20,30],
               'max_depth':[1,3,5,7,9,11,15]}
count_configurations(param_grid)
Out[51]:
28
In [52]:
from sklearn.model_selection import GridSearchCV
forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_X,train_y)
Out[52]:
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [1, 3, 5, 7, 9, 11, 15],
                         'n_estimators': [5, 10, 20, 30]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)
In [53]:
cvres = grid_search.cv_results_ 
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):  
    print(np.sqrt(-mean_score), params)
0.5620107914319056 {'max_depth': 1, 'n_estimators': 5}
0.5471673393619522 {'max_depth': 1, 'n_estimators': 10}
0.5394751473890673 {'max_depth': 1, 'n_estimators': 20}
0.5366506507707266 {'max_depth': 1, 'n_estimators': 30}
0.5340484061406018 {'max_depth': 3, 'n_estimators': 5}
0.52544187030335 {'max_depth': 3, 'n_estimators': 10}
0.52471828693922 {'max_depth': 3, 'n_estimators': 20}
0.524597592679558 {'max_depth': 3, 'n_estimators': 30}
0.5203556573457353 {'max_depth': 5, 'n_estimators': 5}
0.519990452721957 {'max_depth': 5, 'n_estimators': 10}
0.5164468271061055 {'max_depth': 5, 'n_estimators': 20}
0.5188932966350659 {'max_depth': 5, 'n_estimators': 30}
0.5276066663019348 {'max_depth': 7, 'n_estimators': 5}
0.5188932966350659 {'max_depth': 7, 'n_estimators': 10}
0.5188932966350659 {'max_depth': 7, 'n_estimators': 20}
0.5179160955858287 {'max_depth': 7, 'n_estimators': 30}
0.5295234997411499 {'max_depth': 9, 'n_estimators': 5}
0.5260440961403267 {'max_depth': 9, 'n_estimators': 10}
0.5152192360496084 {'max_depth': 9, 'n_estimators': 20}
0.5186491689825813 {'max_depth': 9, 'n_estimators': 30}
0.5398271703612021 {'max_depth': 11, 'n_estimators': 5}
0.5301210888036112 {'max_depth': 11, 'n_estimators': 10}
0.5204773352749901 {'max_depth': 11, 'n_estimators': 20}
0.5198686608353534 {'max_depth': 11, 'n_estimators': 30}
0.554067578422494 {'max_depth': 15, 'n_estimators': 5}
0.5415838530724625 {'max_depth': 15, 'n_estimators': 10}
0.525321342289308 {'max_depth': 15, 'n_estimators': 20}
0.5229049479588019 {'max_depth': 15, 'n_estimators': 30}
In [54]:
grid_search.best_estimator_
Out[54]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
In [55]:
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=11, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
accuracy = cross_val_score(clf, train_X, train_y, cv=5)
accuracy.mean()
Out[55]:
0.7330254608461279

3) KNN Model

In [56]:
# use cross-validation to get the highest accuracy from k=1 to k=50
from sklearn import neighbors
acc = []
for i in range(1,50):
    clf = neighbors.KNeighborsClassifier(i) 
    accuracy = cross_val_score(clf, train_X, train_y, cv=5)
    accmean = accuracy.mean()
    acc.append(accmean) 
acc
Out[56]:
[0.6443782387576006,
 0.6513439540517558,
 0.6769230398998892,
 0.6851550592802939,
 0.6926266223869343,
 0.6947773980844203,
 0.7033925334103415,
 0.7056690892172435,
 0.7095960276587894,
 0.7151688565882146,
 0.7166907317386213,
 0.717828047039194,
 0.7146642922462338,
 0.7168182766199804,
 0.7198567326049639,
 0.7190954741621344,
 0.720869390832812,
 0.7241614926761963,
 0.7239089698544866,
 0.7245410790778264,
 0.7237815854069404,
 0.7274542362548331,
 0.7231491553159743,
 0.7254274758948196,
 0.7241627761467007,
 0.7254274758948196,
 0.7249211467808955,
 0.7239072050825432,
 0.7258080248993277,
 0.7256803195841556,
 0.7275816207023793,
 0.726314674880878,
 0.72707448941939,
 0.7270733663826988,
 0.728466092313616,
 0.7283403122042001,
 0.7290998058750862,
 0.7289721005599141,
 0.7268200413919239,
 0.7269464632365917,
 0.72770659864273,
 0.7275808185333141,
 0.7278346248255282,
 0.7283414352408915,
 0.7285936371949753,
 0.7284676966517464,
 0.7287211820763344,
 0.7285941184964143,
 0.7294805153134074]
In [57]:
#find the highest accuracy 
max(acc)
Out[57]:
0.7294805153134074
In [58]:
#find the responding k of the highest accuracy
acc.index(max(acc))
Out[58]:
48

4) Support Vector

SVC with linear kernel

In [60]:
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")


tuned_parameters = [
  {'kernel': ['linear'], 'C': [0.01,0.1,1.0,5.0,10.0,100.0]}]
  
scores = ['precision']

clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % scores[0])
clf.fit(train_X, train_y)

print("Best Hyperparameter is:")
print(clf.best_params_)

    
#0.732 for {'C': 0.1, 'kernel': 'linear'}
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-60-90107d87cf7e> in <module>
     10 
     11 clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % scores[0])
---> 12 clf.fit(train_X, train_y)
     13 
     14 print("Best Hyperparameter is:")

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    686                 return results
    687 
--> 688             self._run_search(evaluate_candidates)
    689 
    690         # For multi-metric evaluation, store the best_index_, best_params_ and

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
   1147     def _run_search(self, evaluate_candidates):
   1148         """Search all candidates in param_grid"""
-> 1149         evaluate_candidates(ParameterGrid(self.param_grid))
   1150 
   1151 

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
    665                                for parameters, (train, test)
    666                                in product(candidate_params,
--> 667                                           cv.split(X, y, groups)))
    668 
    669                 if len(out) < 1:

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
    922                 self._iterating = self._original_iterator is not None
    923 
--> 924             while self.dispatch_one_batch(iterator):
    925                 pass
    926 

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    514             estimator.fit(X_train, **fit_params)
    515         else:
--> 516             estimator.fit(X_train, y_train, **fit_params)
    517 
    518     except Exception as e:

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
    207 
    208         seed = rnd.randint(np.iinfo('i').max)
--> 209         fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
    210         # see comment on the other call to np.iinfo in this file
    211 

~/opt/anaconda3/lib/python3.7/site-packages/sklearn/svm/base.py in _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed)
    266                 cache_size=self.cache_size, coef0=self.coef0,
    267                 gamma=self._gamma, epsilon=self.epsilon,
--> 268                 max_iter=self.max_iter, random_seed=random_seed)
    269 
    270         self._warn_from_fit_status()

KeyboardInterrupt: 
In [63]:
from sklearn.svm import SVC
In [ ]:
print("Result is:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" % (mean, params))

SVC with polynomial kernel

In [ ]:
tuned_parameters = [
  {'kernel': ['poly'], 'degree':[0,1,2,5], 'C': [0.01,0.1,1.0,5.0,10.0,100.0]}]
  
# Objective metrics
scores = ['precision']

clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % scores[0])
clf.fit(train_X, train_y)

print("Best Hyperparameter is:")
print(clf.best_params_)
In [ ]:
print("Results are:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" % (mean, params))

SVC with RBF training part

In [ ]:
tuned_parameters = [
  {'kernel': ['rbf'], 'gamma': [0.0001,0.001,0.1,1,10,100],'C': [0.1,1,10,100,1000]}]
  
# Objective metrics
scores = ['precision']

clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % scores[0])
clf.fit(train_X, train_y)

print("Best Hyperparameters are:")
print(clf.best_params_)
In [ ]:
print("Results are:")
means = clf.cv_results_['mean_test_score']
for mean,params in zip(means, clf.cv_results_['params']):
    print("%0.3f for %r" % (mean, params))

Evaluation & Final Model with full dataset

Evaluation with test set: SVC with RBF (C= 100, gamma= 0.001)

In [ ]:
clf = SVC(kernel='rbf',C=100, gamma=0.001)
pred = clf.fit(train_X, train_y)
y_pred = pred.predict(test_X)
print(classification_report(test_y, y_pred))

Final Model with full dataset

In [ ]:
clf = SVC(kernel='rbf',C=100, gamma=0.001)
pred = clf.fit(X, y)
y_pred = pred.predict(X)
print(classification_report(y, y_pred))
In [78]:
import pickle

pickle.dump(train_X, open('X_train.sav', 'wb'))
pickle.dump(test_X, open('X_test.sav', 'wb'))
pickle.dump(test_y, open('y_test.sav', 'wb'))
pickle.dump(train_y , open('y_train.sav', 'wb'))
pickle.dump(SVC(kernel='rbf',C=100, gamma=0.001), open('pipe_SVM.sav', 'wb'))
pickle.dump(neighbors.KNeighborsClassifier(48), open('pipe_KNN.sav', 'wb'))
pickle.dump(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=11, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), open('pipe_RDF.sav', 'wb'))
pickle.dump(DecisionTreeClassifier(), open('pipe_DT.sav', 'wb'))
In [ ]:
#interface code
import streamlit as st
import pickle
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix

X_test = pickle.load(open('X_test.sav', 'rb'))
y_test = pickle.load(open('y_test.sav', 'rb'))
X_train = pickle.load(open('X_train.sav', 'rb'))
y_train = pickle.load(open('y_train.sav', 'rb'))


# Function to test certain index of dataset
def test_demo():
      

    # Create four sliders in the sidebar
    st.sidebar.slider('ExternalRiskEstimate', 0.0, 100.0, (0.0, 100.0), 1.0)
    st.sidebar.slider('MSinceOldestTradeOpen', 0.0, 810.0, (0.0, 810.0), 1.0)
    st.sidebar.slider('MSinceMostRecentTradeOpen', 0.0, 400.0, (0.0, 400.0), 1.0)
    st.sidebar.slider('AverageMInFile', 0.0, 400.0, (0.0, 400.0), 1.0)
    st.sidebar.slider('NumSatisfactoryTrades', 0.0, 80.0, (0.0, 80.0), 1.0)
    st.sidebar.slider('NumTrades60Ever2DerogPubRec', 0.0, 20.0, (0.0, 20.0), 1.0)
    st.sidebar.slider('NumTrades90Ever2DerogPubRec', 0.0, 20.0, (0.0, 20.0), 1.0)
    st.sidebar.slider('PercentTradesNeverDelq', 0.0, 100.0, (0.0, 100.0), 1.0)
    st.sidebar.slider('MSinceMostRecentDelq', 0.0, 90.0, (0.0, 90.0), 1.0)
    st.sidebar.slider('MaxDelq2PublicRecLast12M', 0.0, 7.0, (0.0, 7.0), 1.0)
    st.sidebar.slider('MaxDelqEver', 0.0, 5.0, (0.0, 5.0), 1.0)
    st.sidebar.slider('NumTotalTrades', 0.0, 110.0, (0.0, 110.0), 1.0)
    st.sidebar.slider('NumTradesOpeninLast12M', 0.0, 19.0, (0.0, 19.0), 1.0)
    st.sidebar.slider('PercentInstallTrades', 0.0, 100.0, (0.0, 100.0), 1.0)
    st.sidebar.slider('MSinceMostRecentInqexcl7days', 0.0, 13.0, (0.0, 13.0), 1.0)
    st.sidebar.slider('NumInqLast6M', 0.0, 70.0, (0.0, 70.0), 1.0)
    st.sidebar.slider('NumInqLast6Mexcl7days', 0.0, 70.0, (0.0, 70.0), 1.0)
    st.sidebar.slider('NetFractionRevolvingBurden', 0.0, 240.0, (0.0, 240.0), 1.0)
    st.sidebar.slider('NetFractionInstallBurden', 0.0, 480.0, (0.0, 480.0), 1.0)
    st.sidebar.slider('NumRevolvingTradesWBalance', 0.0, 40.0, (0.0, 40.0), 1.0)
    st.sidebar.slider('NumInstallTradesWBalance', 0.0, 30.0, (0.0, 30.0), 1.0)
    st.sidebar.slider('NumBank2NatlTradesWHighUtilization', 0.0, 20.0, (0.0, 20.0), 1.0)
    st.sidebar.slider('PercentTradesWBalance', 0.0, 100.0, (0.0, 100.0), 1.0)
    
    

    # Print the prediction result
    alg = ['Decision Tree','Random Forest', 'Support Vector Machine', 'K Nearest Neighbor']
    classifier = st.selectbox('Which algorithm?', alg)
    if classifier == 'Decision Tree':
       
        pipe_DT = pickle.load(open('pipe_DT.sav', 'rb'))
        pipe_DT.fit(X_train, y_train)
        acc = pipe_DT.score(X_test, y_test)
        pred_DT = pipe_DT.predict(X_test)
        cm_dtc = confusion_matrix(y_test, pred_DT)
        st.write('Accuracy: ', acc)
        st.write('Confusion matrix: ', cm_dtc)
        
        st.text('Decision Tree Chosen')
        
    elif classifier == 'Random Forest':
        pipe_RDF = pickle.load(open('pipe_RDF.sav', 'rb'))
        pipe_RDF.fit(X_train, y_train)
        accuracy_score = pipe_RDF.score(X_test, y_test)
        pred_RDF = pipe_RDF.predict(X_test)
        cm_rdf = confusion_matrix(y_test, pred_RDF)
        st.write('Accuracy: ', accuracy_score)
        st.write('Confusion matrix: ', cm_rdf)
        
        
        
        st.text('Random Forest Chosen') 
        
    elif classifier == 'Support Vector Machine':
        pipe_SVM = pickle.load(open('pipe_SVM.sav', 'rb'))
        pipe_SVM.fit(X_train, y_train)
        accuracy = pipe_SVM.score(X_test, y_test)
        pred_SVM = pipe_SVM.predict(X_test)
        cm = confusion_matrix(y_test, pred_SVM)
        st.write('Accuracy: ', accuracy)
        st.write('Confusion matrix: ', cm)
        
        
        
        st.text('Support Vector Machine Chosen')  

    else:
        pipe_KNN = pickle.load(open('pipe_KNN.sav', 'rb'))
        pipe_KNN.fit(X_train, y_train)
        score = pipe_KNN.score(X_test, y_test)
        pred_KNN = pipe_KNN.predict(X_test)
        cmKNN = confusion_matrix(y_test, pred_KNN)
        st.write('Accuracy: ', score)
        st.write('Confusion matrix: ', cmKNN)
        
        
        
        st.text('K Nearest Neighbor Chosen')
       



# title
st.title('Credit Risk Prediction')
# show data
if st.checkbox('Show dataframe'):
    st.write(X_test)
# st.write(X_train) # Show the dataset


test_demo()