通过构建自动化的信用评分模型,以在线方式进行即时的信贷审批,能够为银行节约大量人工成本。在本案例中,我们将使用决策树算法建立一个简单的个人信用风险评估模型(注:下文代码使用的 DecisionTreeClassifier 看起来是 scikit-learn 的实现,其算法为优化版 CART,而非 C5.0)。
导入类库
读取数据
# Categorical attributes that are meant to be converted to integer codes.
cols = [
    'checking_balance', 'credit_history', 'purpose', 'savings_balance',
    'employment_length', 'personal_status', 'other_debtors', 'property',
    'installment_plan', 'housing', 'job', 'telephone', 'foreign_worker',
]

# Encoding rules: attribute name -> {raw category string: integer code}.
# The category strings are lookup keys, so they are kept verbatim, including
# the spelling 'mangement self-employed' (presumably mirrors the raw data
# values -- verify against the data set before "fixing" it).
# NOTE(review): nothing in the visible code applies these rules to the data
# (`cols` is never read afterwards); confirm the mapping step exists elsewhere.
col_dicts = {
    'checking_balance': {
        '1 - 200 DM': 2,
        '< 0 DM': 1,
        '> 200 DM': 3,
        'unknown': 0,
    },
    'credit_history': {
        'critical': 0,
        'delayed': 2,
        'fully repaid': 3,
        'fully repaid this bank': 4,
        'repaid': 1,
    },
    'employment_length': {
        '0 - 1 yrs': 1,
        '1 - 4 yrs': 2,
        '4 - 7 yrs': 3,
        '> 7 yrs': 4,
        'unemployed': 0,
    },
    'foreign_worker': {'no': 1, 'yes': 0},
    'housing': {'for free': 1, 'own': 0, 'rent': 2},
    'installment_plan': {'bank': 1, 'none': 0, 'stores': 2},
    'job': {
        'mangement self-employed': 3,
        'skilled employee': 2,
        'unemployed non-resident': 0,
        'unskilled resident': 1,
    },
    'other_debtors': {'co-applicant': 2, 'guarantor': 1, 'none': 0},
    'personal_status': {
        'divorced male': 2,
        'female': 1,
        'married male': 3,
        'single male': 0,
    },
    'property': {
        'building society savings': 1,
        'other': 3,
        'real estate': 0,
        'unknown/none': 2,
    },
    'purpose': {
        'business': 5,
        'car (new)': 3,
        'car (used)': 4,
        'domestic appliances': 6,
        'education': 1,
        'furniture': 2,
        'others': 8,
        'radio/tv': 0,
        'repairs': 7,
        'retraining': 9,
    },
    'savings_balance': {
        '101 - 500 DM': 2,
        '501 - 1000 DM': 3,
        '< 100 DM': 1,
        '> 1000 DM': 4,
        'unknown': 0,
    },
    'telephone': {'none': 1, 'yes': 0},
}
# Split the data set.
# Independent variables: every column from 'checking_balance' through
# 'foreign_worker'. A .loc label slice includes both endpoints.
# NOTE(review): confirm the 'default' column does not sit inside this column
# range; if it does, the target leaks into the features.
X = credit.loc[:, 'checking_balance':'foreign_worker']
# Dependent variable: the 'default' column.
Y = credit['default']
# Hold out 30% of the rows as the test set. The rows are shuffled before the
# split (train_test_split's default); random_state=1 only fixes the shuffle
# seed so the same split is reproduced on every run.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)
# Share of each class label in the training target.
Y_train.value_counts() / len(Y_train)
# Model construction and training.
# Reference listing of the estimator's tunable arguments, spelled out with the
# values used in the original listing. The instance is not kept.
# `min_impurity_split` and `presort` were removed from this call: scikit-learn
# has dropped both parameters, and passing them raises TypeError on current
# releases.
DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.,
    class_weight=None,
)
# Create the model: every leaf must hold at least 6 training samples, and the
# seed is fixed so the fitted tree is reproducible.
credit_model = DecisionTreeClassifier(min_samples_leaf=6, random_state=1)
credit_model.fit(X_train, Y_train)
import os

# In-memory buffer that receives the DOT description of the fitted tree.
dot_data = StringIO()
# Export the tree to DOT. feature_names is passed as a plain list because some
# scikit-learn releases reject a pandas Index during parameter validation.
# class_names are matched to the target classes in ascending label order.
tree.export_graphviz(
    credit_model,
    out_file=dot_data,
    feature_names=list(X_train.columns),
    class_names=['no default', 'default'],
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Make the Graphviz executables discoverable before rendering. The directory
# is machine-specific; it is appended only once so that re-running this
# section does not keep growing PATH.
GRAPHVIZ_BIN = 'E:/stable插件/Graphviz/bin/'
if GRAPHVIZ_BIN not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + GRAPHVIZ_BIN
# Render the decision tree as a PNG image.
Image(graph.create_png())
# Predict the labels of the test set with the baseline model.
credit_pred = credit_model.predict(X_test)
# Per-class precision / recall / F1 report.
print(metrics.classification_report(Y_test, credit_pred))
# Confusion matrix (rows: true labels, columns: predicted labels). Printed
# explicitly, like the cost-sensitive evaluation later in the file, so the
# result is not silently discarded when this runs as a plain script.
print(metrics.confusion_matrix(Y_test, credit_pred))
# Accuracy score: the fraction of test samples classified correctly.
print(metrics.accuracy_score(Y_test, credit_pred))
# Cost-sensitive model: a loan that defaults is assumed to cost the bank four
# times as much as a missed loan that would not have defaulted.
# NOTE(review): presumably label 1 = no default and label 2 = default --
# confirm against the values of the 'default' column.
class_weights = {1: 1, 2: 4}
# random_state=1 matches the baseline model; without a fixed seed the fitted
# tree, and therefore the metrics below, can differ from run to run.
credit_model_cost = DecisionTreeClassifier(
    max_depth=15,
    class_weight=class_weights,
    random_state=1,
)
credit_model_cost.fit(X_train, Y_train)
credit_pred_cost = credit_model_cost.predict(X_test)
# Evaluate the cost-sensitive model: report, confusion matrix, accuracy.
print(metrics.classification_report(Y_test, credit_pred_cost))
print(metrics.confusion_matrix(Y_test, credit_pred_cost))
print(metrics.accuracy_score(Y_test, credit_pred_cost))