Logistic Regression using German Credit Data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
The German Credit Data (the UCI Statlog German Credit dataset) contains 20 attribute variables and a classification of each of the 1000 loan applicants as a Good or a Bad credit risk.
credit_dat = pd.read_csv(r"C:\Work\Datasets\germancreditdata.csv")  # raw string avoids backslash-escape issues in Windows paths
print(credit_dat.head())
credit_dat.shape
Let's read the same data from the raw (header-less) UCI file and assign column names ourselves. The Creditability column is renamed to class: 1 indicates good credit and 2 indicates bad credit (default).
credit_df = pd.read_csv(r"C:\Work\Datasets\germancreditdataUCI.csv", sep=" ", header=None)  # header=None, since the raw UCI file has no header row
columns = ['checkin_acc', 'duration', 'credit_history', 'purpose', 'amount',
'saving_acc', 'present_emp_since', 'inst_rate', 'personal_status',
'other_debtors', 'residing_since', 'property', 'age',
'inst_plans', 'housing', 'num_credits',
'job', 'dependents', 'telephone', 'foreign_worker', 'class']
credit_df.columns = columns
credit_df.head()
The class variable takes the value 2 for default and 1 for non-default. To model this as a 0-1 problem, we subtract 1 in the following code so that 0 means non-default and 1 means default.
credit_df['class']=credit_df['class']-1
credit_df.to_csv(r"C:\Work\Datasets\germancreditdatawithHeader.csv", index=False)  # index=False avoids writing the row index as an extra column
To gauge the predictive ability of each variable with respect to the dependent (target) variable, let's do an Information Value (IV) calculation.
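For reference, a commonly used definition: for each bucket i of a variable, WOE_i = ln(good%_i / bad%_i), and the variable's Information Value is IV = Σ_i (good%_i − bad%_i) × WOE_i, where good%_i and bad%_i are the bucket's shares of all goods and all bads respectively. This is exactly what the I_V column in the function below computes per bucket.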
credit_df.info()
We see that some columns are of type object and some are int64. We will bin the integer-valued columns into 10 equal-sized parts (deciles).
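As a quick illustration of the decile binning used below, here is a minimal sketch on toy data (not the credit data itself); ranking before qcut guarantees equal-sized bins even with tied values:
# Toy example: qcut on ranked values splits a numeric column into 10 equal-sized bins
s = pd.Series(range(100))
bins = pd.qcut(s.rank(method='first'), 10)
print(bins.value_counts().sort_index())  # each decile holds 10 observations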
credit_df.groupby(['saving_acc'])['class'].head()
credit_df.groupby(['saving_acc'])['class'].agg(['count','sum']).head()
# Calculation of IV metric per variable
def IV_calc(data, var):
    if data[var].dtypes == "object":
        # Categorical variable: group by its categories directly
        grouper = data[var]
    else:
        # Numeric variable: bin into deciles (ranking first breaks ties)
        grouper = pd.qcut(data[var].rank(method='first'), 10)
    dataf = data.groupby(grouper)['class'].agg(['count', 'sum'])
    dataf.columns = ["Total", "bad"]  # class is 0/1, so the sum counts the bads
    dataf["good"] = dataf["Total"] - dataf["bad"]
    dataf["bad_per"] = dataf["bad"] / dataf["bad"].sum()
    dataf["good_per"] = dataf["good"] / dataf["good"].sum()
    # Per-bucket IV contribution: (good% - bad%) * WOE
    # Note: a bucket with zero goods or zero bads yields an infinite WOE; merge such buckets if this occurs
    dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"] / dataf["bad_per"])
    return dataf
IV_calc(credit_df,'saving_acc')
pd.crosstab(credit_df['saving_acc'],credit_df['class'] )
Let's get the variable importance of a few more columns.
print ("\n\nCredit History - Information Value\n")
print (IV_calc(credit_df,'credit_history'))
print ("\n\nCredit History - Duration in month\n")
print (IV_calc(credit_df,'duration'))
# List of IV values for every predictor (the target itself is skipped)
Iv_list = []
for col in credit_df.columns:
    if col == 'class':
        continue  # do not compute IV of the target against itself
    assigned_data = IV_calc(data = credit_df, var = col)
    iv_val = round(assigned_data["I_V"].sum(), 3)
    dt_type = credit_df[col].dtypes
    Iv_list.append((iv_val, col, dt_type))
Iv_list = sorted(Iv_list, reverse = True)
for i in range(len(Iv_list)):
    print (Iv_list[i][0], ",", Iv_list[i][1], ", type =", Iv_list[i][2])
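A commonly cited rule of thumb for interpreting IV: below 0.02 the variable is essentially useless for prediction, 0.02 to 0.1 is a weak predictor, 0.1 to 0.3 is a medium predictor, and above 0.3 is a strong predictor (values far above 0.5 are suspiciously high and worth double-checking for leakage).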
We will consider the top 15 variables. We will do dummy-variable coding for the discrete (object) variables and use the continuous ones as is.
tmp = credit_df.select_dtypes(include=['object']).columns.values  # the discrete (object) columns
tmp
tmp.size
tmp[12]
tmplist=tmp.tolist()
tmplist
dummy_stseca = pd.get_dummies(credit_df['checkin_acc'], prefix='status_exs_accnt')
dummy_ch = pd.get_dummies(credit_df['credit_history'], prefix='cred_hist')
dummy_purpose = pd.get_dummies(credit_df['purpose'], prefix='purpose')
dummy_savacc = pd.get_dummies(credit_df['saving_acc'], prefix='sav_acc')
dummy_presc = pd.get_dummies(credit_df['present_emp_since'], prefix='pre_emp_snc')
dummy_perssx = pd.get_dummies(credit_df['personal_status'], prefix='per_stat_sx')
dummy_othdts = pd.get_dummies(credit_df['other_debtors'], prefix='oth_debtors')
dummy_property = pd.get_dummies(credit_df['property'], prefix='property')
dummy_othinstpln = pd.get_dummies(credit_df['inst_plans'], prefix='oth_inst_pln')
dummy_forgnwrkr = pd.get_dummies(credit_df['foreign_worker'], prefix='forgn_wrkr')
dummy_stseca #get_dummies acts as one hot encoder.
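Note that the ten get_dummies calls above can also be collapsed into a single call; a sketch (the generated column names then use the original column names as prefixes rather than the custom prefixes above):
# One call over all object columns; drop_first=True removes one reference
# category per variable, which the manual removal step later in this post does by hand
obj_cols = credit_df.select_dtypes(include=['object']).columns.tolist()
credit_dummies = pd.get_dummies(credit_df, columns=obj_cols, drop_first=True)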
continuous_columns = credit_df.select_dtypes(include=['int64']).columns.values.tolist()
continuous_columns.remove('class')  # the target is not a predictor
type(continuous_columns)
credit_continuous = credit_df[continuous_columns]
credit_data_new = pd.concat([dummy_stseca,dummy_ch,dummy_purpose,dummy_savacc,dummy_presc,dummy_perssx,
dummy_property,dummy_othinstpln,dummy_othdts,
dummy_forgnwrkr,credit_continuous,credit_df['class']],axis=1)
credit_data_new['class'].head()
x_train,x_test,y_train,y_test = train_test_split(credit_data_new.drop(['class'],axis=1),credit_data_new['class'],train_size = 0.7,random_state=42)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)
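The class distribution in this dataset is imbalanced (700 good vs. 300 bad), so it can be worth preserving that ratio in both partitions; a sketch of the same split with the stratify option (an alternative to the split above, not an additional step):
# Stratified variant of the split: keeps the good/bad ratio equal in train and test
x_train, x_test, y_train, y_test = train_test_split(
    credit_data_new.drop(['class'], axis=1), credit_data_new['class'],
    train_size=0.7, random_state=42, stratify=credit_data_new['class'])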
We remove one dummy column (the reference category) from each categorical variable for which dummies were created, to avoid the dummy-variable trap of perfect multicollinearity.
remove_cols_extra_dummy = ['status_exs_accnt_A11','cred_hist_A30','purpose_A40','sav_acc_A61','pre_emp_snc_A71',
'per_stat_sx_A91','oth_debtors_A101','property_A121','oth_inst_pln_A141','forgn_wrkr_A201']
remove_cols_insig = []  # predictors later found insignificant can be added here
remove_cols = list(set(remove_cols_extra_dummy+remove_cols_insig))
Model Creation
import statsmodels.api as sm
logistic_model = sm.Logit(y_train,sm.add_constant(x_train.drop(remove_cols,axis=1))).fit()
print (logistic_model.summary())
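The coefficients in the summary are log-odds; exponentiating them gives odds ratios, which are often easier to read. A quick sketch:
# Exponentiated coefficients are odds ratios (exp of 'const' is the baseline odds)
odds_ratios = np.exp(logistic_model.params)
print(odds_ratios.sort_values(ascending=False).head(10))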
# Calculation of VIF: regress each predictor on all the others; VIF = 1 / (1 - R^2)
print ("\nVariance Inflation Factor")
cnames = x_train.drop(remove_cols, axis=1).columns
for i in np.arange(0, len(cnames)):
    xvars = list(cnames)
    yvar = xvars.pop(i)  # the predictor being checked for collinearity
    mod = sm.OLS(x_train.drop(remove_cols, axis=1)[yvar],
                 sm.add_constant(x_train.drop(remove_cols, axis=1)[xvars]))
    res = mod.fit()
    vif = 1 / (1 - res.rsquared)
    print (yvar, round(vif, 3))
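The same VIFs can also be obtained from statsmodels directly; a sketch (note the helper expects the design matrix to already include the constant):
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = sm.add_constant(x_train.drop(remove_cols, axis=1))
for i, name in enumerate(X.columns):
    if name == 'const':
        continue  # VIF of the intercept is not meaningful
    print(name, round(variance_inflation_factor(X.values, i), 3))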
y_pred = pd.DataFrame(logistic_model.predict(sm.add_constant(x_train.drop(remove_cols,axis=1))))
y_pred.columns = ["probs"]
print(y_pred)
The C-statistic tells us the proportion of concordant pairs out of all (class 1, class 0) pairs. The higher the value, the better; above 0.7 is advisable for a production model. A pair is concordant if the predicted probability for the class-1 observation is greater than that for the class-0 observation; if it is smaller, the pair is discordant.
both = pd.concat([y_train,y_pred],axis=1)
zeros = both[['class','probs']][both['class']==0]
ones = both[['class','probs']][both['class']==1]
# Cross join: pair every row of df1 with every row of df2 via a temporary key
def df_crossjoin(df1, df2, **kwargs):
    df1['_tmpkey'] = 1
    df2['_tmpkey'] = 1
    res = pd.merge(df1, df2, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1)
    res.index = pd.MultiIndex.from_product((df1.index, df2.index))
    df1.drop('_tmpkey', axis=1, inplace=True)
    df2.drop('_tmpkey', axis=1, inplace=True)
    return res
joined_data = df_crossjoin(ones, zeros)  # suffix _x = class 1 rows, _y = class 0 rows
joined_data['concordant_pair'] = 0
joined_data.loc[joined_data['probs_x'] > joined_data['probs_y'],'concordant_pair'] = 1
joined_data['discordant_pair'] = 0
joined_data.loc[joined_data['probs_x'] < joined_data['probs_y'],'discordant_pair'] =1
joined_data['tied_pair'] = 0
joined_data.loc[joined_data['probs_x'] == joined_data['probs_y'],'tied_pair'] =1
p_conc = (sum(joined_data['concordant_pair'])*1.0 )/ (joined_data.shape[0])
p_disc = (sum(joined_data['discordant_pair'])*1.0 )/ (joined_data.shape[0])
c_statistic = 0.5 + (p_conc - p_disc)/2.0
print ("\nC-statistic:",round(c_statistic,4))
Keep tabs on the C-statistic and on how the log-likelihood (and hence AIC) changes while removing predictors one by one, in order to justify where to stop.
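Both quantities are available on the fitted statsmodels result; a quick sketch:
# Log-likelihood and AIC of the current model, for tracking across refits
print("Log-likelihood:", round(logistic_model.llf, 3))
print("AIC:", round(logistic_model.aic, 3))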
# ROC & AUC
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import auc
fpr, tpr, thresholds = metrics.roc_curve(both['class'],both['probs'], pos_label=1)
roc_auc = auc(fpr,tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - German Credit Data')
plt.legend(loc="lower right")
plt.show()
# Tuning the classification threshold
for i in list(np.arange(0, 1, 0.1)):
    both["y_pred"] = 0
    both.loc[both["probs"] > i, 'y_pred'] = 1
    print ("Threshold", i, "Train Accuracy:", round(accuracy_score(both['class'], both['y_pred']), 4))
# Implement best threshold on train data
both["y_pred"] = 0
both.loc[both["probs"] > 0.5, 'y_pred'] = 1
print ("\nTrain Confusion Matrix\n\n",pd.crosstab(both['class'],both['y_pred'],rownames = ["Actuall"],colnames = ["Predicted"]))
print ("\nTrain Accuracy:",round(accuracy_score(both['class'],both['y_pred']),4))
# Predicting test output
y_pred_test = pd.DataFrame(logistic_model.predict(sm.add_constant(x_test.drop(remove_cols,axis=1))))
y_pred_test.columns = ["probs"]
#both_test = pd.concat([y_test.reset_index(drop=True),y_pred_test],axis=1)
both_test = pd.concat([y_test,y_pred_test],axis=1)
both_test["y_pred"] = 0
both_test.loc[both_test["probs"] > 0.5, 'y_pred'] = 1
print ("\nTest Confusion Matrix\n\n",pd.crosstab(both_test['class'],both_test['y_pred'],rownames = ["Actuall"],colnames = ["Predicted"]))
print ("\nTest Accuracy:",round(accuracy_score(both_test['class'],both_test['y_pred']),4))