Lending Club Default Classification
Lending Club Default (Charged Off and Late included) vs. Fully Paid Classification Model¶
- Main repository for this project
- Raw data was gathered from Lending Club Loan Data and Kaggle
- This project was initiated on JAN 18, 2017
- Co-contributors (sorted alphabetically):
- Jang Sungguk (simfeel87@gmail.com)
- Kim Gibeom (curtisk808@gmail.com)
- Shin Yoonsig (shinys825@gmail.com)
Summary¶
Check List¶
Characteristics associated with loans being fully paid (a rough screen using these is sketched below):
- High annual income
- Low interest rate
- Low loan amount
- Employment title provided
- Issued in JAN, SEP, or DEC
- Verification status and home ownership (Own) are less important, but still relatively positive
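As a rough illustration, the checklist can be turned into a boolean screen on the loan dataframe once it is loaded below; the cut-offs here (column medians and the three issue months) are hypothetical choices for this sketch, not thresholds estimated anywhere in the analysis, and issue_d is assumed to carry the integer month code used later in preprocessing.

# Hypothetical checklist screen; thresholds are illustrative assumptions only.
favorable = (
    (df["annual_inc"] > df["annual_inc"].median()) &   # high annual income
    (df["int_rate"] < df["int_rate"].median()) &       # low interest rate
    (df["loan_amnt"] < df["loan_amnt"].median()) &     # low loan amount
    df["issue_d"].isin([1, 9, 12])                     # issued in JAN, SEP, DEC
)
print(df.loc[favorable, "loan_status"].value_counts(normalize=True))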
Initialize¶
Library loading¶
In [1]:
import re
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import matplotlib as mpl
import matplotlib.tri as mtri
import matplotlib.pylab as plt
import matplotlib.patches as mpatches
import seaborn as sns
import itertools
from scipy import stats
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
# Seaborn setting
sns.set(palette="hls", font_scale=2)
In [7]:
print "Mean loan amount: ", np.mean(df["loan_amnt"])
print "Mean interest rate: ", np.mean(df["int_rate"])
print "Median interest rate: ", np.median(df["int_rate"])
Read dataframe¶
In [2]:
df = pd.read_csv('frames/lc_dataframe.csv')
print list(df.columns)
df.tail()
Out[2]:
Check dataframe and elements¶
In [56]:
df.dtypes
Out[56]:
In [55]:
df.isnull().sum()
Out[55]:
Data Preprocessing¶
Check correlation between features¶
In [257]:
fig = plt.figure(figsize = (15, 10))
corrmat = df.corr()
sns.heatmap(corrmat)
plt.title("Correlation between Features")
plt.show()
Get highly correlated pairs¶
In [307]:
corr_stack = corrmat.abs().unstack()
ordered_stack = corr_stack.sort_values(kind="quicksort", ascending=False)
# keep pairs with |corr| > 0.6, excluding the self-correlations (which equal 1.0)
high_corr = ordered_stack[(ordered_stack > 0.6) & (ordered_stack < 1.0)]
print high_corr
Remove highly correlated variables¶
In [3]:
del df["grade"]
del df["sub_grade"]
del df["installment"]
del df["open_acc"]
check groupby elements¶
In [89]:
category_vars = ["emp_title", "emp_length", "home_ownership",
"verification_status", "issue_d", "purpose",
"initial_list_status", "pymnt_plan", "application_type"]
continuous_vars = ["loan_amnt", "int_rate", "annual_inc",
"desc", "dti", "revol_bal", "revol_util", "total_acc",
"pub_rec", "inq_last_6mths", "delinq_2yrs"]
In [310]:
for var in category_vars:
print df.groupby(var).loan_status.value_counts()
Remove variables 'pymnt_plan' and 'application_type': nearly all observations fall into a single category¶
In [5]:
del df["pymnt_plan"]
del df["application_type"]
In [90]:
category_vars.remove("pymnt_plan")
category_vars.remove("application_type")
Rearrange 'home_ownership' and 'purpose'¶
In [7]:
df["home_ownership"].replace([1, 3], 4, inplace=True)
df["home_ownership"].replace(2, 1, inplace=True)
df["home_ownership"].replace(4, 2, inplace=True)
df["home_ownership"].replace(5, 3, inplace=True)
df["home_ownership"].replace(6, 4, inplace=True)
In [327]:
np.unique(df["home_ownership"])
Out[327]:
| Home_ownership Category | Category number |
|---|---|
| Mortgage | 1 |
| Other | 2 |
| Own | 3 |
| Rent | 4 |
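The chained replace calls above are order-sensitive, so the mapping is easier to read as a single dict. Assuming the original labels were encoded alphabetically (1=ANY, 2=MORTGAGE, 3=NONE, 4=OTHER, 5=OWN, 6=RENT), the equivalent one-pass remap would be the sketch below; it reproduces the table above and is shown for reference only, not to be run on top of the replaces.

# Equivalent one-pass remap (reference only; assumes the original alphabetical encoding).
home_map = {2: 1,              # MORTGAGE -> 1
            1: 2, 3: 2, 4: 2,  # ANY, NONE, OTHER -> 2 (Other)
            5: 3,              # OWN -> 3
            6: 4}              # RENT -> 4
df["home_ownership"] = df["home_ownership"].map(home_map)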
In [8]:
df["purpose"].replace([4, 11], 10, inplace=True)
df["purpose"].replace(13, 4, inplace=True)
df["purpose"].replace(14, 11, inplace=True)
In [326]:
np.unique(df["purpose"])
Out[326]:
| Purpose Category | Category number |
|---|---|
| car | 1 |
| credit_card | 2 |
| debt_consolidation | 3 |
| vacation | 4 |
| home_improvement | 5 |
| house | 6 |
| major_purchase | 7 |
| medical | 8 |
| moving | 9 |
| other | 10 |
| wedding | 11 |
| small_business | 12 |
Histogram¶
In [332]:
print len(category_vars), len(continuous_vars)
In [377]:
N=4; M=2; # set row and column of the figure
fig = plt.figure(figsize=(10,15)) # figure size
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0) # subplot setting
# add one subplot per variable
for i in range(N):
for j in range(M):
if i * M + j < 7:
ax = fig.add_subplot(N, M, i * M + j + 1)
plt.hist(df[category_vars[i * M + j]], bins=30)
plt.title(category_vars[i * M + j])
else:
pass
plt.tight_layout()
plt.suptitle("Histograms for category variables", y=1.02,fontsize=30)
plt.show()
In [381]:
N=4; M=3; # set row and column of the figure
fig = plt.figure(figsize=(15,15)) # figure size
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0) # subplot setting
# add one subplot per variable
for i in range(N):
for j in range(M):
if i * M + j < 11:
ax = fig.add_subplot(N, M, i * M + j + 1)
plt.hist(df[continuous_vars[i * M + j]], bins=10)
plt.title(continuous_vars[i * M + j])
else:
pass
plt.tight_layout()
plt.suptitle("Histograms for continuous variables", y=1.02,fontsize=30)
plt.show()
QQ-Plot¶
In [420]:
N=4; M=3; # set row and column of the figure
fig = plt.figure(figsize=(15,15)) # figure size
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0) # subplot setting
# add one subplot per variable
for i in range(N):
for j in range(M):
if i * M + j < 11:
ax = fig.add_subplot(N, M, i * M + j + 1)
sp.stats.probplot(df[continuous_vars[i * M + j]], plot=plt)
plt.title(continuous_vars[i * M + j])
else:
pass
plt.tight_layout()
plt.suptitle("QQ-Plots for continuous variables", y=1.02,fontsize=30)
plt.show()
Remove outliers (top and bottom 0.005% of observations per continuous variable)¶
In [9]:
outliers = int(len(df) * 0.00005)  # number of rows in each 0.005% tail
print "Rows per 0.005% tail: ", outliers
out_ix = np.array([], dtype=int)
for name in continuous_vars:
    out_large_list = np.array(df[name].nlargest(outliers).index)
    out_small_list = np.array(df[name].nsmallest(outliers).index)
    out_ix = np.concatenate((out_ix, out_large_list, out_small_list))
out_ix = np.unique(out_ix).astype(int)
print "Number of distinct outlier rows (both tails, all variables): ", out_ix.shape[0]
In [10]:
cleaned_df = df.drop(df.index[list(out_ix)])
cleaned_df.index = range(len(cleaned_df))
cleaned_df.tail()
Out[10]:
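A quick sanity check shows how many rows the trim removed and the remaining range of each continuous variable:

# Rows removed by the trim and the remaining ranges of the continuous variables.
print("rows before / after: {} / {}".format(len(df), len(cleaned_df)))
print(cleaned_df[continuous_vars].describe().loc[["min", "max"]])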
QQ-Plot after removing outliers¶
In [422]:
N=4; M=3; # set row and column of the figure
fig = plt.figure(figsize=(15,15)) # figure size
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0) # subplot setting
# add one subplot per variable
for i in range(N):
for j in range(M):
if i * M + j < 11:
ax = fig.add_subplot(N, M, i * M + j + 1)
sp.stats.probplot(cleaned_df[continuous_vars[i * M + j]], plot=plt)
plt.title(continuous_vars[i * M + j])
else:
pass
plt.tight_layout()
plt.suptitle("QQ-Plots for continuous variables after removing outliers", y=1.02,fontsize=30)
plt.show()
Scaling and Logarithm¶
In [11]:
log_vars = ["int_rate", "annual_inc", "loan_amnt"]
scaling_vars = ["desc", "dti", "revol_bal", "revol_util", "total_acc", "pub_rec", "inq_last_6mths", "delinq_2yrs"]
Check for zeros in log_vars to prevent '-inf'¶
In [401]:
for i in log_vars:
print cleaned_df[cleaned_df[i].isin([int(0)])].index
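No zeros turn up here, but a defensive version of the transform (an alternative sketch, not what is done below) would either assert this or shift the values by one so zero-valued rows stay finite:

# Guard against zeros before the log transform (alternative sketch).
assert (cleaned_df[log_vars] > 0).all().all(), "zero/negative values would give -inf or nan"
# cleaned_df[log_vars] = np.log10(cleaned_df[log_vars] + 1)  # shift-by-one variant, not used here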
In [12]:
cleaned_df[log_vars] = np.log10(cleaned_df[log_vars])
In [13]:
cleaned_df[scaling_vars] = preprocessing.scale(cleaned_df[scaling_vars])
In [419]:
N=4; M=3; # set row and column of the figure
fig = plt.figure(figsize=(15,15)) # figure size
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0) # subplot setting
# add one subplot per variable
for i in range(N):
for j in range(M):
if i * M + j < 11:
ax = fig.add_subplot(N, M, i * M + j + 1)
sp.stats.probplot(cleaned_df[continuous_vars[i * M + j]], plot=plt)
plt.title(continuous_vars[i * M + j])
else:
pass
plt.tight_layout()
plt.suptitle("QQ-Plots for continuous variables after Scaling and Logarithm", y=1.02,fontsize=30)
plt.show()
In [426]:
N=4; M=3; # set row and column of the figure
fig = plt.figure(figsize=(15,15)) # figure size
plt.subplots_adjust(top=1, bottom=0, hspace=0, wspace=0) # subplot setting
# add one subplot per variable
for i in range(N):
for j in range(M):
if i * M + j < 11:
ax = fig.add_subplot(N, M, i * M + j + 1)
plt.hist(cleaned_df[continuous_vars[i * M + j]], bins=10)
plt.title(continuous_vars[i * M + j])
else:
pass
plt.tight_layout()
plt.suptitle("Histograms for continuous variables after Scaling and Logarithm", y=1.02,fontsize=30)
plt.show()
Fit and Scoring¶
Under Sampling¶
In [14]:
scaled_df = cleaned_df
In [38]:
def under_sampling(df_name):
global X, y, under_X, under_y, under_df
X = df_name.ix[:, df_name.columns != "loan_status"]
y = df_name.ix[:, df_name.columns == "loan_status"]
default_ix = np.array(df_name[df_name.loan_status == 0].index)
paid_ix = np.array(df_name[df_name.loan_status == 1].index)
rand_paid_ix = np.random.choice(paid_ix, len(default_ix), replace=False)
rand_ix = np.concatenate([default_ix, rand_paid_ix])
print "X Shape: ", X.shape
print "y Shape: ", y.shape
print "Default Index Length: ", len(default_ix)
print "Paid Index Length: ", len(rand_paid_ix)
print "Random Index Shape: ", rand_ix.shape
print "-" * 75
under_df = df_name.iloc[rand_ix, :]
under_X = under_df.ix[:, under_df.columns != "loan_status"]
under_y = under_df.ix[:, under_df.columns == "loan_status"]
print "Under DataFrame's X and y", len(under_X), len(under_y)
print "Data Variables Names: under_df, under_X, under_y"
In [126]:
under_sampling(scaled_df)
Split train and test for under sampled data and whole data¶
In [183]:
under_train_X, under_test_X, under_train_y, under_test_y = train_test_split(
under_X, under_y, test_size=0.20, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
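Since loan_status is heavily imbalanced in the whole dataset, the full-data split can also be stratified on the label so both halves keep the same default rate (sklearn's stratify argument; shown as an option, not what was used above):

# Optional stratified variant of the whole-data split.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=np.ravel(y))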
Random Forest Fit¶
In [18]:
rf = RandomForestClassifier(max_features=None, random_state=0)
rf_result = rf.fit(under_train_X, under_train_y)
Confusion Matrix¶
In [19]:
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Oranges):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j],
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
Confusion Matrix (predict with undersampled test set)¶
In [20]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(under_test_y, rf_result.predict(under_test_X))
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0, 1],
                      title='Confusion Matrix (predict with undersampled test set)')
plt.show()
print classification_report(under_test_y, rf_result.predict(under_test_X))
Confusion Matrix (predict on the whole dataset)¶
In [21]:
# Compute confusion matrix on the whole dataset (includes rows the forest was trained on)
cnf_matrix = confusion_matrix(y, rf_result.predict(X))
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0, 1],
                      title='Confusion Matrix (predict on the whole dataset)')
plt.show()
print classification_report(y, rf_result.predict(X))
Set CV and Models¶
In [71]:
cv = StratifiedKFold(n_splits=8, random_state=0, shuffle=False)
sgd = SGDClassifier(loss="log", fit_intercept=True,
average=1000, n_iter=30, n_jobs=8,
random_state=0)
lda = LinearDiscriminantAnalysis()
lr = LogisticRegression(n_jobs=8)
Dummy Encoding for Categorical Variables¶
In [113]:
dummies_df = scaled_df[continuous_vars]
for var in category_vars:
dummies_df = dummies_df.join(pd.get_dummies(scaled_df[var], prefix=var))
print dummies_df.shape
print dummies_df.columns
dummies_df = dummies_df.join(scaled_df["loan_status"])
dummies_df.tail()
Out[113]:
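The same dummy frame can be built in a single call with the columns argument of pd.get_dummies (equivalent result; the column order may differ from the join-based version above):

# One-call equivalent of the dummy encoding above.
dummies_alt = pd.get_dummies(scaled_df[continuous_vars + category_vars + ["loan_status"]],
                             columns=category_vars)
print(dummies_alt.shape)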
In [109]:
under_sampling(dummies_df)
Get Scores¶
SGD¶
In [124]:
print "SGD CV Accuracy for under sampled dataset"
sgd_cv_under = cross_val_score(sgd, under_X,
np.ravel(under_y),
cv = cv, scoring="accuracy", n_jobs=8)
print "Each Validation's Accuracy: ", sgd_cv_under
print "Average Accuracy: ", np.mean(sgd_cv_under)
print '-' * 75
print "SGD CV Accuracy for whole dataset"
sgd_cv_whole = cross_val_score(sgd, X,
np.ravel(y),
cv = cv, scoring="accuracy", n_jobs=8)
print "Each Validation's Accuracy: ", sgd_cv_whole
print "Average Accuracy: ", np.mean(sgd_cv_whole)
LDA¶
In [80]:
print "LDA CV Accuracy for under sampled dataset"
lda_cv_under = cross_val_score(lda, under_X,
np.ravel(under_y),
cv = cv, scoring="accuracy", n_jobs=8)
print "Each Validation's Accuracy: ", lda_cv_under
print "Average Accuracy: ", np.mean(lda_cv_under)
print '-' * 75
print "LDA CV Accuracy for whole dataset"
lda_cv_whole = cross_val_score(lda, X,
np.ravel(y),
cv = cv, scoring="accuracy", n_jobs=8)
print "Each Validation's Accuracy: ", lda_cv_whole
print "Average Accuracy: ", np.mean(lda_cv_whole)
LogisticRegression¶
In [125]:
print "LogisticRegression CV Accuracy for under sampled dataset"
lr_cv_under = cross_val_score(lr, under_X,
np.ravel(under_y),
cv = cv, scoring="accuracy", n_jobs=8)
print "Each Validation's Accuracy: ", lr_cv_under
print "Average Accuracy: ", np.mean(lr_cv_under)
print '-' * 75
print "LogisticRegression CV Accuracy for whole dataset"
lr_cv_whole = cross_val_score(lr, X,
np.ravel(y),
cv = cv, scoring="accuracy", n_jobs=8)
print "Each Validation's Accuracy: ", lr_cv_whole
print "Average Accuracy: ", np.mean(lr_cv_whole)
Random Forest Feature importances¶
In [469]:
importances = rf_result.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X.shape[1]):
    print("{}. {} / feature_n: {} / importance: {}".format(
        f + 1, list(X.columns)[indices[f]], indices[f], importances[indices[f]]))
std = np.std([tree.feature_importances_ for tree in rf_result.estimators_],
axis=0)
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
Optimization¶
Fit iteration 1¶
Logistic regression with high feature-importance variables to check p-values and coefficients¶
In [167]:
model = sm.Logit.from_formula("loan_status ~ loan_amnt + int_rate + C(emp_title) + C(emp_length) + annual_inc + C(verification_status) + C(home_ownership)",
data=under_df)
result = model.fit()
print result.pred_table()
result.summary()
Out[167]:
Remove the variable with a high p-value¶
In [184]:
iter_df = scaled_df.drop("emp_length", 1)
under_sampling(iter_df)
In [185]:
under_train_X, under_test_X, under_train_y, under_test_y = train_test_split(
under_X, under_y, test_size=0.20, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
In [159]:
def rf_fit():
rf = RandomForestClassifier(max_features=None, random_state=0)
rf_result = rf.fit(under_train_X, under_train_y)
importances = rf_result.feature_importances_
indices = np.argsort(importances)[::-1]
    for f in range(X.shape[1]):
        print("{}. {} / feature_n: {} / importance: {}".format(
            f + 1, list(X.columns)[indices[f]], indices[f], importances[indices[f]]))
std = np.std([tree.feature_importances_ for tree in rf_result.estimators_],
axis=0)
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
In [187]:
rf_fit()
Fit iteration 2¶
In [164]:
model = sm.Logit.from_formula("loan_status ~ loan_amnt + int_rate + C(emp_title) + C(issue_d) + annual_inc + C(verification_status) + C(home_ownership)",
data=under_df)
result = model.fit()
print result.pred_table()
result.summary()
Out[164]:
In [188]:
iter_df["issue_d"] = iter_df["issue_d"].replace([1, 9, 12], 1)
iter_df["issue_d"] = iter_df["issue_d"].replace([2, 3, 4, 5, 6, 7, 8, 10, 11], 0)
print np.unique(iter_df["issue_d"])
under_sampling(iter_df)
In [189]:
under_train_X, under_test_X, under_train_y, under_test_y = train_test_split(
under_X, under_y, test_size=0.20, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
In [190]:
rf_fit()
Fit iteration 3¶
In [176]:
model = sm.Logit.from_formula("loan_status ~ loan_amnt + int_rate + C(emp_title) + C(issue_d) + annual_inc + C(verification_status) + C(home_ownership)",
data=under_df)
result = model.fit()
print result.pred_table()
result.summary()
Out[176]:
In [191]:
fitted_cols = ["loan_status", "emp_title", "issue_d", "verification_status",
"home_ownership", "loan_amnt", "int_rate", "annual_inc"]
fitted_df = iter_df.reindex(columns=fitted_cols)
fitted_df.tail()
Out[191]:
In [192]:
under_sampling(fitted_df)
In [196]:
under_train_X, under_test_X, under_train_y, under_test_y = train_test_split(
under_X, under_y, test_size=0.20, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
In [194]:
rf_fit()
In [200]:
rf = RandomForestClassifier(max_features=None, random_state=0)
rf_result = rf.fit(under_train_X, under_train_y)
In [201]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(under_test_y, rf_result.predict(under_test_X))
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0, 1],
                      title='Confusion Matrix (predict with undersampled test set)')
plt.show()
print classification_report(under_test_y, rf_result.predict(under_test_X))
In [202]:
# Compute confusion matrix on the whole dataset (includes rows the forest was trained on)
cnf_matrix = confusion_matrix(y, rf_result.predict(X))
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0, 1],
                      title='Confusion Matrix (predict on the whole dataset)')
plt.show()
print classification_report(y, rf_result.predict(X))