Logistic Regression using German Credit Data
In [1]:
import numpy as np
In [2]:
import pandas as pd
In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

The German Credit data set contains 20 variables for 1000 loan applicants, along with a classification of each applicant as a good or a bad credit risk.

In [9]:
credit_dat = pd.read_csv(r"C:\Work\Datasets\germancreditdata.csv")  # raw string avoids backslash-escape issues in Windows paths
In [10]:
print(credit_dat.head())
   Creditability  Account Balance  Duration of Credit (month)  \
0              1                1                          18   
1              1                1                           9   
2              1                2                          12   
3              1                1                          12   
4              1                1                          12   

   Payment Status of Previous Credit  Purpose  Credit Amount  \
0                                  4        2           1049   
1                                  4        0           2799   
2                                  2        9            841   
3                                  4        0           2122   
4                                  4        0           2171   

   Value Savings/Stocks  Length of current employment  Instalment per cent  \
0                     1                             2                    4   
1                     1                             3                    2   
2                     2                             4                    2   
3                     1                             3                    3   
4                     1                             3                    4   

   Sex & Marital Status       ...        Duration in Current address  \
0                     2       ...                                  4   
1                     3       ...                                  2   
2                     2       ...                                  4   
3                     3       ...                                  2   
4                     3       ...                                  4   

   Most valuable available asset  Age (years)  Concurrent Credits  \
0                              2           21                   3   
1                              1           36                   3   
2                              1           23                   3   
3                              1           39                   3   
4                              2           38                   1   

   Type of apartment  No of Credits at this Bank  Occupation  \
0                  1                           1           3   
1                  1                           2           3   
2                  1                           1           2   
3                  1                           2           2   
4                  2                           2           2   

   No of dependents  Telephone  Foreign Worker  
0                 1          1               1  
1                 2          1               1  
2                 1          1               1  
3                 2          1               2  
4                 1          1               2  

[5 rows x 21 columns]
In [11]:
credit_dat.shape
Out[11]:
(1000, 21)

Let's read the same data in its raw UCI form (space-separated) and assign column names ourselves. Note that the file has no header row; because header=None is not passed, the first record is consumed as the header, which is why later outputs show 999 rows instead of 1000. Creditability is renamed to class; class 1 indicates good credit and 2 indicates bad credit (default).

In [20]:
credit_df = pd.read_csv(r"C:\Work\Datasets\germancreditdataUCI.csv", sep=" ")
In [21]:
columns = ['checkin_acc', 'duration', 'credit_history', 'purpose', 'amount',
         'saving_acc', 'present_emp_since', 'inst_rate', 'personal_status',
         'other_debtors', 'residing_since', 'property', 'age',
         'inst_plans', 'housing', 'num_credits',
         'job', 'dependents', 'telephone', 'foreign_worker', 'class']
In [22]:
credit_df.columns = columns
In [23]:
credit_df.head()
Out[23]:
checkin_acc duration credit_history purpose amount saving_acc present_emp_since inst_rate personal_status other_debtors ... property age inst_plans housing num_credits job dependents telephone foreign_worker class
0 A12 48 A32 A43 5951 A61 A73 2 A92 A101 ... A121 22 A143 A152 1 A173 1 A191 A201 2
1 A14 12 A34 A46 2096 A61 A74 2 A93 A101 ... A121 49 A143 A152 1 A172 2 A191 A201 1
2 A11 42 A32 A42 7882 A61 A74 2 A93 A103 ... A122 45 A143 A153 1 A173 2 A191 A201 1
3 A11 24 A33 A40 4870 A61 A73 3 A93 A101 ... A124 53 A143 A153 2 A173 2 A191 A201 2
4 A14 36 A32 A46 9055 A65 A73 2 A93 A101 ... A124 35 A143 A153 1 A172 2 A192 A201 1

5 rows × 21 columns

A class value of 2 indicates default and 1 indicates non-default. To model this as a 0-1 problem, we subtract 1 from the value in the following code (so 1 means default and 0 means non-default).

In [24]:
credit_df['class']=credit_df['class']-1
In [31]:
credit_df.to_csv(r"C:\Work\Datasets\germancreditdatawithHeader.csv")

To gauge the predictive power of each variable with respect to the dependent (class) variable, let's do an information value (IV) calculation: for each variable, IV = sum over its categories or bins of (good% - bad%) * ln(good% / bad%).

In [27]:
credit_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 21 columns):
checkin_acc          999 non-null object
duration             999 non-null int64
credit_history       999 non-null object
purpose              999 non-null object
amount               999 non-null int64
saving_acc           999 non-null object
present_emp_since    999 non-null object
inst_rate            999 non-null int64
personal_status      999 non-null object
other_debtors        999 non-null object
residing_since       999 non-null int64
property             999 non-null object
age                  999 non-null int64
inst_plans           999 non-null object
housing              999 non-null object
num_credits          999 non-null int64
job                  999 non-null object
dependents           999 non-null int64
telephone            999 non-null object
foreign_worker       999 non-null object
class                999 non-null int64
dtypes: int64(8), object(13)
memory usage: 164.0+ KB

A few of the columns are of type object and the rest are int64. For the numeric columns we will bin the values into 10 equal-sized parts (deciles), as in the short sketch below.
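
A quick illustration of the decile binning used later in IV_calc (a sketch; ranking first breaks ties so the ten bins come out equal-sized):

# Decile bins for 'duration': rank first so heavily tied values
# (e.g. 12- and 24-month loans) still split into ten equal-sized bins.
bins = pd.qcut(credit_df['duration'].rank(method='first'), 10)
print(bins.value_counts().sort_index())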

In [30]:
credit_df.groupby(['saving_acc'])['class'].head()
Out[30]:
0     1
1     0
2     0
3     1
4     0
5     0
6     0
7     0
14    1
15    0
16    0
18    0
20    0
22    0
23    0
26    0
29    0
31    0
32    0
39    0
40    0
42    0
48    0
66    0
93    0
Name: class, dtype: int64
In [32]:
credit_df.groupby(['saving_acc'])['class'].agg(['count','sum']).head()
Out[32]:
count sum
saving_acc
A61 603 217
A62 103 34
A63 63 11
A64 48 6
A65 182 32
In [33]:
# Calculation of the IV metric. Categorical (object) columns are grouped
# by category; numeric columns are first binned into deciles. Note that
# binning adds a temporary 'bin_var' column to the dataframe as a side effect.
def IV_calc(data,var):
    if data[var].dtypes == "object":
        dataf = data.groupby([var])['class'].agg(['count','sum'])
    else:
        data['bin_var'] = pd.qcut(data[var].rank(method='first'),10)
        dataf = data.groupby(['bin_var'])['class'].agg(['count','sum'])
    dataf.columns = ["Total","bad"]
    dataf["good"] = dataf["Total"] - dataf["bad"]
    dataf["bad_per"] = dataf["bad"]/dataf["bad"].sum()
    dataf["good_per"] = dataf["good"]/dataf["good"].sum()
    # IV contribution per group: (good% - bad%) * ln(good% / bad%)
    dataf["I_V"] = (dataf["good_per"] - dataf["bad_per"]) * np.log(dataf["good_per"]/dataf["bad_per"])
    return dataf
In [36]:
IV_calc(credit_df,'saving_acc')
Out[36]:
Total bad good bad_per good_per I_V
saving_acc
A61 603 217 386 0.723333 0.552217 0.046189
A62 103 34 69 0.113333 0.098712 0.002019
A63 63 11 52 0.036667 0.074392 0.026690
A64 48 6 42 0.020000 0.060086 0.044096
A65 182 32 150 0.106667 0.214592 0.075443
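
As a quick sanity check of the table above (a sketch using the A65 row: 32 of the 300 bad cases and 150 of the 699 good cases fall in A65):

bad_per, good_per = 32/300, 150/699
print(round((good_per - bad_per) * np.log(good_per / bad_per), 6))  # ~0.075444, matching I_V up to rounding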
In [39]:
pd.crosstab(credit_df['saving_acc'],credit_df['class'] )
Out[39]:
class 0 1
saving_acc
A61 386 217
A62 69 34
A63 52 11
A64 42 6
A65 150 32

Let's get the variable importance for a few more columns.

In [41]:
print ("\n\nCredit History - Information Value\n")
print (IV_calc(credit_df,'credit_history'))

Credit History - Information Value

                Total  bad  good   bad_per  good_per       I_V
credit_history                                                
A30                40   25    15  0.083333  0.021459  0.083944
A31                49   28    21  0.093333  0.030043  0.071743
A32               530  169   361  0.563333  0.516452  0.004073
A33                88   28    60  0.093333  0.085837  0.000628
A34               292   50   242  0.166667  0.346209  0.131254
In [44]:
print ("\n\nCredit History - Duration in month\n")
print (IV_calc(credit_df,'duration'))

Duration in month - Information Value

                Total  bad  good   bad_per  good_per       I_V
bin_var                                                       
(0.999, 100.8]    100   12    88  0.040000  0.125894  0.098483
(100.8, 200.6]    100   19    81  0.063333  0.115880  0.031746
(200.6, 300.4]    100   27    73  0.090000  0.104435  0.002147
(300.4, 400.2]    100   25    75  0.083333  0.107296  0.006056
(400.2, 500.0]    100   27    73  0.090000  0.104435  0.002147
(500.0, 599.8]     99   37    62  0.123333  0.088698  0.011418
(599.8, 699.6]    100   32    68  0.106667  0.097282  0.000864
(699.6, 799.4]    100   31    69  0.103333  0.098712  0.000211
(799.4, 899.2]    100   42    58  0.140000  0.082976  0.029829
(899.2, 999.0]    100   48    52  0.160000  0.074392  0.065561
In [47]:
# List of IV values for every column. 'class' itself and the temporary
# 'bin_var' column produce inf (log of zero) and should be ignored; this
# is also the source of the divide-by-zero RuntimeWarning below.
Iv_list = []
for col in credit_df.columns:
    assigned_data = IV_calc(data = credit_df,var = col)
    iv_val = round(assigned_data["I_V"].sum(),3)
    dt_type = credit_df[col].dtypes
    Iv_list.append((iv_val,col,dt_type))

Iv_list = sorted(Iv_list,reverse = True)
C:\Users\Binu\Downloads\Conda\lib\site-packages\ipykernel_launcher.py:18: RuntimeWarning: divide by zero encountered in log
In [48]:
for i in range(len(Iv_list)):
    print (Iv_list[i][0],",",Iv_list[i][1],",type =",Iv_list[i][2])
inf , class ,type = int64
inf , bin_var ,type = category
0.67 , checkin_acc ,type = object
0.292 , credit_history ,type = object
0.248 , duration ,type = int64
0.194 , saving_acc ,type = object
0.168 , purpose ,type = object
0.112 , property ,type = object
0.109 , amount ,type = int64
0.107 , age ,type = int64
0.099 , num_credits ,type = int64
0.086 , present_emp_since ,type = object
0.083 , housing ,type = object
0.061 , inst_rate ,type = int64
0.057 , inst_plans ,type = object
0.049 , residing_since ,type = int64
0.044 , personal_status ,type = object
0.044 , foreign_worker ,type = object
0.037 , dependents ,type = int64
0.032 , other_debtors ,type = object
0.009 , job ,type = object
0.006 , telephone ,type = object

We will consider the top 15 variables by IV. We will create dummy (one-hot) variables for the discrete/object columns and use the continuous ones as-is.

In [57]:
tmp=credit_df.select_dtypes(include=['object']).columns.values
tmp
Out[57]:
array(['checkin_acc', 'credit_history', 'purpose', 'saving_acc',
       'present_emp_since', 'personal_status', 'other_debtors',
       'property', 'inst_plans', 'housing', 'job', 'telephone',
       'foreign_worker'], dtype=object)
In [67]:
tmp.size
tmp[12]
tmplist=tmp.tolist()
tmplist
Out[67]:
['checkin_acc',
 'credit_history',
 'purpose',
 'saving_acc',
 'present_emp_since',
 'personal_status',
 'other_debtors',
 'property',
 'inst_plans',
 'housing',
 'job',
 'telephone',
 'foreign_worker']
In [59]:
dummy_stseca = pd.get_dummies(credit_df['checkin_acc'], prefix='status_exs_accnt')
dummy_ch = pd.get_dummies(credit_df['credit_history'], prefix='cred_hist')
dummy_purpose = pd.get_dummies(credit_df['purpose'], prefix='purpose')
dummy_savacc = pd.get_dummies(credit_df['saving_acc'], prefix='sav_acc')
dummy_presc = pd.get_dummies(credit_df['present_emp_since'], prefix='pre_emp_snc')
dummy_perssx = pd.get_dummies(credit_df['personal_status'], prefix='per_stat_sx')
dummy_othdts = pd.get_dummies(credit_df['other_debtors'], prefix='oth_debtors')


dummy_property = pd.get_dummies(credit_df['property'], prefix='property')
dummy_othinstpln = pd.get_dummies(credit_df['inst_plans'], prefix='oth_inst_pln')
dummy_forgnwrkr = pd.get_dummies(credit_df['foreign_worker'], prefix='forgn_wrkr')
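
As an aside, pandas can produce all of these in one call and drop the first level of each variable directly (making the manual remove_cols_extra_dummy step further below unnecessary); a sketch, noting that the column prefixes would then differ from the manual ones above:

cat_cols = ['checkin_acc', 'credit_history', 'purpose', 'saving_acc',
            'present_emp_since', 'personal_status', 'other_debtors',
            'property', 'inst_plans', 'foreign_worker']
# drop_first=True drops one level per variable, avoiding the dummy-variable trap.
dummies_all = pd.get_dummies(credit_df[cat_cols], drop_first=True)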
In [84]:
dummy_stseca #get_dummies acts as one hot encoder.
Out[84]:
status_exs_accnt_A11 status_exs_accnt_A12 status_exs_accnt_A13 status_exs_accnt_A14
0 0 1 0 0
1 0 0 0 1
2 1 0 0 0
3 1 0 0 0
4 0 0 0 1
5 0 0 0 1
6 0 1 0 0
7 0 0 0 1
8 0 1 0 0
9 0 1 0 0
10 1 0 0 0
11 0 1 0 0
12 1 0 0 0
13 1 0 0 0
14 1 0 0 0
15 0 0 0 1
16 1 0 0 0
17 0 1 0 0
18 0 0 0 1
19 0 0 0 1
20 1 0 0 0
21 1 0 0 0
22 0 1 0 0
23 0 0 0 1
24 1 0 0 0
25 0 0 0 1
26 0 0 1 0
27 0 1 0 0
28 1 0 0 0
29 0 1 0 0
... ... ... ... ...
969 0 1 0 0
970 0 0 0 1
971 1 0 0 0
972 1 0 0 0
973 0 0 0 1
974 0 0 1 0
975 0 1 0 0
976 0 1 0 0
977 0 0 0 1
978 0 1 0 0
979 0 1 0 0
980 0 0 0 1
981 0 0 1 0
982 1 0 0 0
983 0 0 0 1
984 1 0 0 0
985 0 0 1 0
986 0 0 0 1
987 1 0 0 0
988 0 1 0 0
989 0 0 0 1
990 0 0 0 1
991 1 0 0 0
992 1 0 0 0
993 0 0 0 1
994 0 0 0 1
995 1 0 0 0
996 0 0 0 1
997 1 0 0 0
998 0 1 0 0

999 rows × 4 columns

In [61]:
# Earlier hand-written list of continuous columns (using names from a
# different copy of the data); superseded by the programmatic selection below.
continuous_columnsOld = ['Duration_in_month', 'Credit_amount','Installment_rate_in_percentage_of_disposable_income',
                       'Age_in_years','Number_of_existing_credits_at_this_bank' ]
In [77]:
continuous_columns=credit_df.select_dtypes(include=['int64']).columns.values.tolist()
continuous_columns.remove('class')
credit_continuous = credit_df[continuous_columns]
credit_data_new = pd.concat([dummy_stseca,dummy_ch,dummy_purpose,dummy_savacc,dummy_presc,dummy_perssx,
                             dummy_property,dummy_othinstpln,dummy_othdts,
                             dummy_forgnwrkr,credit_continuous,credit_df['class']],axis=1)
In [79]:
credit_data_new['class'].head()
Out[79]:
0    1
1    0
2    0
3    1
4    0
Name: class, dtype: int64
In [82]:
# Passing test_size=0.3 explicitly as well would silence the FutureWarning below.
x_train,x_test,y_train,y_test = train_test_split(credit_data_new.drop(['class'],axis=1),
                                                 credit_data_new['class'],
                                                 train_size = 0.7,random_state=42)
C:\Users\Binu\Downloads\Conda\lib\site-packages\sklearn\model_selection\_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
  FutureWarning)
In [83]:
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

We drop one dummy column for each categorical variable; the dropped level becomes the reference category and prevents perfect multicollinearity (the dummy-variable trap).

In [85]:
remove_cols_extra_dummy = ['status_exs_accnt_A11','cred_hist_A30','purpose_A40','sav_acc_A61','pre_emp_snc_A71',
               'per_stat_sx_A91','oth_debtors_A101','property_A121','oth_inst_pln_A141','forgn_wrkr_A201']
In [86]:
remove_cols_insig = []
remove_cols = list(set(remove_cols_extra_dummy+remove_cols_insig))

Model Creation

In [87]:
import statsmodels.api as sm
logistic_model = sm.Logit(y_train,sm.add_constant(x_train.drop(remove_cols,axis=1))).fit()
print (logistic_model.summary())
C:\Users\Binu\Downloads\Conda\lib\site-packages\statsmodels\compat\pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
Optimization terminated successfully.
         Current function value: 0.446791
         Iterations 7
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                  class   No. Observations:                  699
Model:                          Logit   Df Residuals:                      656
Method:                           MLE   Df Model:                           42
Date:                Sun, 15 Jul 2018   Pseudo R-squ.:                  0.2690
Time:                        11:19:56   Log-Likelihood:                -312.31
converged:                       True   LL-Null:                       -427.25
                                        LLR p-value:                 9.711e-28
========================================================================================
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -0.2971      1.158     -0.257      0.798      -2.567       1.973
status_exs_accnt_A12    -0.5038      0.259     -1.949      0.051      -1.011       0.003
status_exs_accnt_A13    -0.7095      0.431     -1.647      0.100      -1.554       0.135
status_exs_accnt_A14    -1.7952      0.288     -6.244      0.000      -2.359      -1.232
cred_hist_A31            0.0352      0.634      0.055      0.956      -1.208       1.278
cred_hist_A32           -0.5271      0.508     -1.038      0.299      -1.523       0.468
cred_hist_A33           -0.7466      0.554     -1.347      0.178      -1.833       0.340
cred_hist_A34           -1.4947      0.512     -2.922      0.003      -2.497      -0.492
purpose_A41             -1.9940      0.476     -4.188      0.000      -2.927      -1.061
purpose_A410            -1.7021      1.081     -1.575      0.115      -3.820       0.416
purpose_A42             -0.9345      0.324     -2.886      0.004      -1.569      -0.300
purpose_A43             -0.8942      0.294     -3.038      0.002      -1.471      -0.317
purpose_A44             -0.5686      0.781     -0.728      0.467      -2.100       0.962
purpose_A45             -0.6304      0.648     -0.973      0.330      -1.900       0.639
purpose_A46              0.0440      0.490      0.090      0.929      -0.917       1.005
purpose_A48             -1.6428      1.249     -1.316      0.188      -4.090       0.804
purpose_A49             -0.6122      0.386     -1.586      0.113      -1.369       0.144
sav_acc_A62             -0.2320      0.327     -0.710      0.478      -0.873       0.409
sav_acc_A63             -0.2957      0.454     -0.652      0.515      -1.185       0.594
sav_acc_A64             -1.2303      0.608     -2.025      0.043      -2.421      -0.040
sav_acc_A65             -1.1089      0.321     -3.452      0.001      -1.738      -0.479
pre_emp_snc_A72          0.1980      0.486      0.407      0.684      -0.755       1.151
pre_emp_snc_A73          0.4371      0.450      0.972      0.331      -0.445       1.319
pre_emp_snc_A74         -0.3136      0.491     -0.639      0.523      -1.275       0.648
pre_emp_snc_A75          0.2344      0.471      0.498      0.619      -0.689       1.158
per_stat_sx_A92         -0.2906      0.464     -0.626      0.531      -1.200       0.619
per_stat_sx_A93         -1.0503      0.457     -2.298      0.022      -1.946      -0.154
per_stat_sx_A94         -0.6757      0.553     -1.223      0.221      -1.759       0.407
property_A122            0.3465      0.304      1.140      0.254      -0.249       0.942
property_A123            0.0493      0.278      0.177      0.859      -0.496       0.594
property_A124            0.5463      0.366      1.493      0.135      -0.171       1.263
oth_inst_pln_A142       -0.8366      0.559     -1.496      0.135      -1.932       0.259
oth_inst_pln_A143       -0.4680      0.290     -1.615      0.106      -1.036       0.100
oth_debtors_A102         0.3080      0.462      0.667      0.505      -0.598       1.214
oth_debtors_A103        -0.9577      0.489     -1.959      0.050      -1.916       0.000
forgn_wrkr_A202         -0.6698      0.679     -0.987      0.324      -2.000       0.661
duration                 0.0373      0.011      3.357      0.001       0.016       0.059
amount                7.663e-05   5.11e-05      1.499      0.134   -2.35e-05       0.000
inst_rate                0.4026      0.106      3.799      0.000       0.195       0.610
residing_since           0.0595      0.099      0.601      0.548      -0.134       0.253
age                     -0.0210      0.011     -1.872      0.061      -0.043       0.001
num_credits              0.3263      0.236      1.383      0.167      -0.136       0.789
dependents               0.3611      0.286      1.262      0.207      -0.200       0.922
========================================================================================
In [88]:
# Calculation of VIF: regress each predictor on all the others;
# VIF = 1/(1 - R^2). Values above roughly 5 suggest multicollinearity.
print ("\nVariance Inflation Factor")
cnames = x_train.drop(remove_cols,axis=1).columns
for i in np.arange(0,len(cnames)):
    xvars = list(cnames)
    yvar = xvars.pop(i)
    mod = sm.OLS(x_train.drop(remove_cols,axis=1)[yvar],sm.add_constant(x_train.drop(remove_cols,axis=1)[xvars]))
    res = mod.fit()
    vif = 1/(1-res.rsquared)
    print (yvar,round(vif,3))
Variance Inflation Factor
status_exs_accnt_A12 1.67
status_exs_accnt_A13 1.239
status_exs_accnt_A14 1.796
cred_hist_A31 2.396
cred_hist_A32 7.85
cred_hist_A33 3.111
cred_hist_A34 6.547
purpose_A41 1.449
purpose_A410 1.203
purpose_A42 1.611
purpose_A43 1.791
purpose_A44 1.091
purpose_A45 1.113
purpose_A46 1.201
purpose_A48 1.094
purpose_A49 1.523
sav_acc_A62 1.171
sav_acc_A63 1.147
sav_acc_A64 1.099
sav_acc_A65 1.208
pre_emp_snc_A72 3.505
pre_emp_snc_A73 5.079
pre_emp_snc_A74 3.725
pre_emp_snc_A75 4.483
per_stat_sx_A92 5.573
per_stat_sx_A93 6.261
per_stat_sx_A94 2.892
property_A122 1.55
property_A123 1.716
property_A124 1.788
oth_inst_pln_A142 1.349
oth_inst_pln_A143 1.42
oth_debtors_A102 1.118
oth_debtors_A103 1.158
forgn_wrkr_A202 1.102
duration 2.175
amount 2.496
inst_rate 1.353
residing_since 1.22
age 1.424
num_credits 1.641
dependents 1.176
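
The same quantities can be cross-checked with statsmodels' built-in helper (a sketch, assuming the same design matrix as above):

from statsmodels.stats.outliers_influence import variance_inflation_factor

X = sm.add_constant(x_train.drop(remove_cols, axis=1))
for i, name in enumerate(X.columns[1:], start=1):  # index 0 is the constant
    print(name, round(variance_inflation_factor(X.values, i), 3))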
In [89]:
y_pred = pd.DataFrame(logistic_model.predict(sm.add_constant(x_train.drop(remove_cols,axis=1))))
y_pred.columns = ["probs"]
print(y_pred)
        probs
728  0.110868
630  0.641502
394  0.582308
777  0.013102
598  0.094606
333  0.885485
575  0.162798
933  0.713771
948  0.082967
697  0.078326
529  0.603244
580  0.177365
706  0.731610
538  0.125449
451  0.123173
228  0.724357
350  0.100148
844  0.041859
212  0.225265
79   0.037974
148  0.013954
302  0.322334
334  0.133526
713  0.927340
945  0.759816
931  0.069923
483  0.056489
978  0.594984
133  0.373880
549  0.029431
..        ...
955  0.070492
191  0.588051
385  0.079418
805  0.114946
413  0.684382
491  0.030447
343  0.113420
769  0.103670
308  0.597652
661  0.100622
130  0.836643
663  0.190370
871  0.209145
99   0.253691
372  0.219594
87   0.552725
458  0.081692
330  0.236899
214  0.008996
466  0.235683
121  0.072948
614  0.560086
20   0.173579
700  0.412481
71   0.164323
106  0.221320
270  0.019282
860  0.191153
435  0.029859
102  0.057933

[699 rows x 1 columns]

The C-statistic is the proportion of concordant pairs out of all (1, 0) pairs, and higher is better; above 0.7 is generally advisable for production. Take every pair of one defaulter (class 1) and one non-defaulter (class 0): the pair is concordant if the predicted probability for the class-1 observation is greater than that for the class-0 observation, discordant if it is smaller, and tied if they are equal.

In [90]:
both = pd.concat([y_train,y_pred],axis=1)

zeros = both[['class','probs']][both['class']==0]
ones = both[['class','probs']][both['class']==1]

def df_crossjoin(df1, df2, **kwargs):
    df1['_tmpkey'] = 1
    df2['_tmpkey'] = 1
    res = pd.merge(df1, df2, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1)
    res.index = pd.MultiIndex.from_product((df1.index, df2.index))
    df1.drop('_tmpkey', axis=1, inplace=True)
    df2.drop('_tmpkey', axis=1, inplace=True)
    return res

joined_data = df_crossjoin(ones,zeros)

joined_data['concordant_pair'] = 0
joined_data.loc[joined_data['probs_x'] > joined_data['probs_y'],'concordant_pair'] =1
joined_data['discordant_pair'] = 0
joined_data.loc[joined_data['probs_x'] < joined_data['probs_y'],'discordant_pair'] =1
joined_data['tied_pair'] = 0           
joined_data.loc[joined_data['probs_x'] == joined_data['probs_y'],'tied_pair'] =1 
p_conc = (sum(joined_data['concordant_pair'])*1.0 )/ (joined_data.shape[0])     
p_disc =  (sum(joined_data['discordant_pair'])*1.0 )/ (joined_data.shape[0])
   

c_statistic = 0.5 + (p_conc - p_disc)/2.0           
print ("\nC-statistic:",round(c_statistic,4))                    
C-statistic: 0.8338
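
Since the C-statistic for a binary outcome is the same quantity as the ROC AUC, a quick cross-check (a sketch using the both dataframe built above):

from sklearn.metrics import roc_auc_score
print(round(roc_auc_score(both['class'], both['probs']), 4))  # should also print ~0.8338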

Keep tabs on the c-statistic and on the log-likelihood/AIC while removing predictors one by one, in order to justify where to stop; a sketch of such a loop follows.
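
A minimal sketch of that pruning loop (the candidate columns here are hypothetical examples of high p-value predictors taken from the summary above):

from sklearn.metrics import roc_auc_score

X = x_train.drop(remove_cols, axis=1)
for col in ['residing_since', 'purpose_A46']:  # example insignificant predictors
    model = sm.Logit(y_train['class'], sm.add_constant(X.drop(col, axis=1))).fit(disp=0)
    probs = model.predict(sm.add_constant(X.drop(col, axis=1)))
    # Lower AIC and a stable AUC justify dropping the column.
    print(col, "AIC:", round(model.aic, 1), "AUC:", round(roc_auc_score(y_train['class'], probs), 4))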

In [91]:
# ROC & AUC
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import auc
fpr, tpr, thresholds = metrics.roc_curve(both['class'],both['probs'], pos_label=1)

roc_auc = auc(fpr,tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - German Credit Data')
plt.legend(loc="lower right")
plt.show()
In [92]:
# Tuning for threshold
for i in list(np.arange(0,1,0.1)):
    both["y_pred"] = 0
    both.loc[both["probs"] > i, 'y_pred'] = 1      
    print ("Threshold",i,"Train Accuracy:",round(accuracy_score(both['class'],both['y_pred']),4))

# Implement best threshold on train data
both["y_pred"] = 0
both.loc[both["probs"] > 0.5, 'y_pred'] = 1      
print ("\nTrain Confusion Matrix\n\n",pd.crosstab(both['class'],both['y_pred'],rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nTrain Accuracy:",round(accuracy_score(both['class'],both['y_pred']),4))
Threshold 0.0 Train Accuracy: 0.3004
Threshold 0.1 Train Accuracy: 0.5622
Threshold 0.2 Train Accuracy: 0.6967
Threshold 0.30000000000000004 Train Accuracy: 0.7554
Threshold 0.4 Train Accuracy: 0.7911
Threshold 0.5 Train Accuracy: 0.7811
Threshold 0.6000000000000001 Train Accuracy: 0.7811
Threshold 0.7000000000000001 Train Accuracy: 0.7568
Threshold 0.8 Train Accuracy: 0.7396
Threshold 0.9 Train Accuracy: 0.7067

Train Confusion Matrix

 Predicted    0    1
Actual
0          432   57
1           96  114

Train Accuracy: 0.7811
In [93]:
# Predicting test output
y_pred_test = pd.DataFrame(logistic_model.predict(sm.add_constant(x_test.drop(remove_cols,axis=1))))
y_pred_test.columns = ["probs"]

#both_test = pd.concat([y_test.reset_index(drop=True),y_pred_test],axis=1)
both_test = pd.concat([y_test,y_pred_test],axis=1)
both_test["y_pred"] = 0
both_test.loc[both_test["probs"] > 0.5, 'y_pred'] = 1      
print ("\nTest Confusion Matrix\n\n",pd.crosstab(both_test['class'],both_test['y_pred'],rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nTest Accuracy:",round(accuracy_score(both_test['class'],both_test['y_pred']),4))
Test Confusion Matrix

 Predicted    0   1
Actual
0          192  18
1           52  38

Test Accuracy: 0.7667
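
classification_report was imported at the top but never used; as a follow-up it gives the per-class precision/recall breakdown on the test set (a sketch):

print(classification_report(both_test['class'], both_test['y_pred']))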
