Competition link: https://www.kaggle.com/competitions/home-credit-default-risk/
Getting to know the dataset: the two application tables (application_train / application_test) hold the applicants' information.
They are linked by the applicant ID to two further tables: bureau (the applicant's past loans) and previous_application.
bureau_balance then holds the corresponding monthly repayment records for each bureau loan.
The relationships between the tables:
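A minimal sketch of the key join columns, assuming the standard schema of this competition (SK_ID_CURR links the application tables to bureau and previous_application; SK_ID_BUREAU links bureau to bureau_balance); the merge code is illustrative and separate from the baseline code below:

import pandas as pd

# illustrative joins only; nrows keeps this quick to run
app_demo = pd.read_csv('./application_train.csv', nrows=1000)
bureau_demo = pd.read_csv('./bureau.csv', nrows=1000)
bureau_balance_demo = pd.read_csv('./bureau_balance.csv', nrows=1000)

# each application row can have many bureau records (one-to-many on SK_ID_CURR)
app_bureau = app_demo.merge(bureau_demo, on='SK_ID_CURR', how='left')
# each bureau record has monthly statuses in bureau_balance (one-to-many on SK_ID_BUREAU)
bureau_history = bureau_demo.merge(bureau_balance_demo, on='SK_ID_BUREAU', how='left')
print(app_bureau.shape, bureau_history.shape)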
In this first part of the code we do exploratory data analysis: missing values, outliers, and correlations, followed by imputation and feature filtering. We then build baselines with logistic regression and random forest, reaching a score of about 0.70, still a long way from the first-place score of 0.85; later parts will optimize further.
The full code follows:
# coding: utf-8
# In[1]:
import numpy as np
# In[2]:
import pandas as pd
import time
# In[3]:
start_time=time.time()
# In[4]:
application_train=pd.read_csv('./application_train.csv',nrows=100000)
# In[5]:
application_test=pd.read_csv('./application_test.csv')
# In[6]:
previous_application=pd.read_csv('./previous_application.csv',nrows=100000)
# In[7]:
bureau_df=pd.read_csv('./bureau.csv',nrows=100000)
# In[8]:
bureau_balance=pd.read_csv('./bureau_balance.csv',nrows=100000)
# In[9]:
POS_CASH_balance=pd.read_csv('./POS_CASH_balance.csv',nrows=100000)
# In[10]:
credit_card_balance=pd.read_csv('./credit_card_balance.csv',nrows=100000)
# In[11]:
installments_payments=pd.read_csv('./installments_payments.csv',nrows=100000)
# In[12]:
application_train.memory_usage()
# In[13]:
print(f'application_train.shape:{application_train.shape}')
# In[14]:
class_counts=application_train['TARGET'].value_counts()
# In[15]:
import matplotlib.pyplot as plt
plt.pie(class_counts,labels=class_counts.index,autopct='%1.1f%%')# show percentages with one decimal place
# In[16]:
application_train.head()# note: the SK_ID_CURR values missing from the training set all appear in the test set
# In[17]:
application_train.select_dtypes('object')# check which columns are text (object) type
# In[18]:
# examine the missing values
def missing(df):
    missing_number=df.isnull().sum().sort_values(ascending=False)# isnull().sum() counts the missing values; isnull().count() counts all values (calling count() on the raw column would drop the NaNs first)
    missing_percent=(df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_values=pd.concat([missing_number,missing_percent],axis=1,keys=['missing_number','missing_percent'])
    return missing_values
# In[19]:
missing(application_train).sort_values(by='missing_percent',ascending=False)
# In[20]:
missing(application_train)[missing(application_train)['missing_number']>0].index
# In[21]:
# how should we impute?
# categorical variables are normally imputed with the mode
application_train[application_train['NAME_TYPE_SUITE']=='Unaccompanied']['TARGET'].mean()
# In[22]:
application_train[application_train['NAME_TYPE_SUITE'].isna()]['TARGET'].mean()
# In[23]:
# the two default rates differ, so imputing with the mode is not appropriate here; fill the missing values as their own group instead
application_train['NAME_TYPE_SUITE']=application_train['NAME_TYPE_SUITE'].fillna('Unknown')
# In[24]:
application_test['NAME_TYPE_SUITE']=application_test['NAME_TYPE_SUITE'].fillna('Unknown')
# In[25]:
application_train['OWN_CAR_AGE'].isnull().sum()
# In[26]:
# why would someone have no car age? probably because FLAG_OWN_CAR is also N; verify there are no contradictions
application_train.loc[application_train['OWN_CAR_AGE'].isnull()&(application_train['FLAG_OWN_CAR']=='Y')][['OWN_CAR_AGE','FLAG_OWN_CAR']]
# In[27]:
# fill car age with 0 for applicants who do not own a car
application_train.loc[application_train['FLAG_OWN_CAR']=='N','OWN_CAR_AGE']=application_train.loc[application_train['FLAG_OWN_CAR']=='N','OWN_CAR_AGE'].fillna(0)
application_test.loc[application_test['FLAG_OWN_CAR']=='N','OWN_CAR_AGE']=application_test.loc[application_test['FLAG_OWN_CAR']=='N','OWN_CAR_AGE'].fillna(0)
# In[28]:
# check the imputation result
application_train['OWN_CAR_AGE'].isna().sum()
# In[29]:
# now look at the time since the last phone number change: many applicants "changed" their number on the application day itself, which carries no information
application_train['DAYS_LAST_PHONE_CHANGE'].value_counts()
# In[30]:
# treat those zeros as missing values and impute them later with the mean or median
application_train['DAYS_LAST_PHONE_CHANGE'].replace(0,np.nan,inplace=True)
# In[31]:
# sometimes a column has no NaNs but contains an 'XNA' category; the test set has none of these, so the rows can be dropped
application_train['CODE_GENDER'].value_counts()
# In[32]:
application_train=application_train[application_train['CODE_GENDER']!='XNA']
# In[33]:
# now look at outliers
# three quick checks: descriptive statistics (are min/max far from the mean?), boxplots (are there points far outside the whiskers?), and the 3-sigma / IQR rules (are the 25%/75% quantiles far from min/max?); a small sketch of the last two follows the next cell
application_train.describe()
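As a minimal sketch (not in the original notebook) of the 3-sigma and IQR checks, applied to a single column:

col = application_train['DAYS_EMPLOYED'].dropna()
# 3-sigma rule: flag values more than three standard deviations from the mean
mean, std = col.mean(), col.std()
sigma_outliers = col[(col - mean).abs() > 3 * std]
# IQR rule: flag values more than 1.5 * IQR outside the 25%/75% quantiles
q1, q3 = col.quantile(0.25), col.quantile(0.75)
iqr = q3 - q1
iqr_outliers = col[(col < q1 - 1.5 * iqr) | (col > q3 + 1.5 * iqr)]
print(len(sigma_outliers), len(iqr_outliers))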
# In[34]:
# the maximum of DAYS_EMPLOYED is suspiciously large
(application_train['DAYS_EMPLOYED']/365).describe()
# In[35]:
application_train.loc[application_train['TARGET']==0,'DAYS_EMPLOYED'].hist()
# In[36]:
# a histogram bins the values; for a continuous variable a kernel density estimate (KDE) gives a smoother picture
import seaborn as sns
# In[37]:
sns.kdeplot(application_train.loc[application_train['TARGET']==0,'DAYS_EMPLOYED']/365,label='target==0')
# In[38]:
# a helper that plots the KDE of a feature for both classes
def kde_plot(feature_name,df):
    plt.figure(figsize=(8,6))
    sns.kdeplot(df.loc[df['TARGET']==0,feature_name],label='target==0')
    sns.kdeplot(df.loc[df['TARGET']==1,feature_name],label='target==1')
    plt.legend()# show which curve belongs to which class
    plt.rcParams['font.sans-serif']=['SimHei']
    plt.rcParams['axes.unicode_minus']=False
    plt.show()
# In[39]:
kde_plot('DAYS_EMPLOYED',application_train)
# the anomalous value shows up mostly for target==0, so in the next cell we flag it and set it to NaN
# In[40]:
# set the anomalous value to NaN and keep a flag column marking which rows were anomalous
application_train['DAYS_EMPLOYED_ANOM']=application_train["DAYS_EMPLOYED"]==365243
application_train['DAYS_EMPLOYED'].replace({365243:np.nan},inplace=True)
application_test['DAYS_EMPLOYED_ANOM']=application_test["DAYS_EMPLOYED"]==365243
application_test['DAYS_EMPLOYED'].replace({365243:np.nan},inplace=True)
# In[41]:
# look at feature relationships: visualization, correlation coefficients, feature importance
# In[42]:
kde_plot('EXT_SOURCE_3',application_train)
# the two class distributions differ a lot for this field, so it deserves extra attention during feature engineering
# In[43]:
# a violin plot shows both the quantiles and the density of the data
plt.figure(figsize=(10,8))
sns.violinplot(x='TARGET',y='EXT_SOURCE_3',data=application_train)
plt.show()
# In[44]:
# a few more continuous variables
kde_plot('DAYS_BIRTH',application_train)
# In[45]:
# correlations with the target
correlations=application_train.corr()['TARGET'].sort_values()
correlations
# In[46]:
correlations.tail(15)
# the 15 features most positively correlated with the target
# In[47]:
# look at absolute correlations
correlations_abs=abs(correlations).sort_values(ascending=False)[:11]
correlations_abs
# In[48]:
# relationships between features: a heatmap over the strongest features selected above (TARGET plus the top 10 by absolute correlation)
correlations=application_train.corr()
# In[49]:
plt.figure(figsize=(30,40))
sns.heatmap(correlations[correlations_abs.index.tolist()])
plt.show()
# In[50]:
# several variables show relatively strong correlations; pull them out for a closer look
ext_data=application_train[['TARGET','DAYS_BIRTH','FLAG_EMP_PHONE','EXT_SOURCE_1','DAYS_EMPLOYED_ANOM']]
# In[51]:
ext_data_corrs=ext_data.corr()
# In[52]:
plt.figure(figsize=(10,8))
sns.heatmap(ext_data_corrs,cmap='RdBu_r',annot=True,fmt=".2f")# colormap; annotate each cell with its value
plt.show()
# In[53]:
application_train[application_train['DAYS_EMPLOYED_ANOM']==1]['NAME_INCOME_TYPE'].value_counts()
# In[54]:
# the information in the newly created DAYS_EMPLOYED_ANOM column may already be captured by other features (here, income type), but if the specific business reason behind the anomaly can be identified it would help the modelling a lot
# In[55]:
# check the relationship between EXT_SOURCE_1 and DAYS_BIRTH with a hexbin plot
x=application_train['EXT_SOURCE_1']
y=application_train['DAYS_BIRTH']
plt.hexbin(x,y,gridsize=30)
plt.show()
# In[56]:
# an approach for handling very large data: polars
import polars as pl
# In[57]:
df_pl=pl.read_csv('application_train.csv')
# In[58]:
df_pl.head()
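As a hedged sketch of why polars helps with very large files (not in the original notebook): its lazy API can push column selection and row filters down into the CSV reader, so only the needed data is materialized. The selected columns here are just an example.

# nothing is read until .collect(); the reader only loads the selected columns
lazy = (
    pl.scan_csv('application_train.csv')
    .select(['SK_ID_CURR', 'TARGET', 'AMT_INCOME_TOTAL', 'AMT_CREDIT'])
    .filter(pl.col('TARGET') == 1)
)
defaulters = lazy.collect()
print(defaulters.shape)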
# In[59]:
# build the baseline
bureau=pd.read_csv('./bureau.csv',nrows=100000)
# In[60]:
# first encode the categorical variables; label encoding is fine here because it does not hurt tree models
# concretely we use factorize, which also assigns a code to missing and abnormal values, so we avoid problems from imputing them ourselves first
# one-hot encoding can leave train and test with mismatched columns (a category present in one set but not the other); the fix is to concatenate train and test, encode them together, then split them back
# rows whose TARGET is NaN are the test set
apply=pd.concat([application_train,application_test])
# In[61]:
object_col=apply.dtypes[apply.dtypes=='object'].index.to_list()
# In[62]:
for col in object_col:
    if len(apply[col].unique())>2:
        apply=pd.concat([apply,pd.get_dummies(apply[col],prefix=col)],axis=1)# one-hot encode; prefix keeps the original column name in the dummy column names
        apply.drop(columns=[col],inplace=True)# drop the original column in place
    else:
        apply[col]=pd.factorize(apply[col])[0]# integer (label) encoding for binary columns
apply.head()
# In[63]:
# split train and test back apart: rows with a null TARGET are the test set
application_test=apply[apply['TARGET'].isnull()]
application_test=application_test.drop('TARGET',axis=1)
application_train=apply[~apply['TARGET'].isnull()]
# In[64]:
# logistic regression needs the missing values imputed and the features scaled
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer# for imputing missing values (not for min/max scaling)
# In[65]:
train=application_train.drop(columns=['TARGET','SK_ID_CURR'])# the ID and the label are not usable as features
# In[66]:
features=list(train.columns)
# In[67]:
imputer=SimpleImputer(strategy='median')
# In[68]:
scaler=MinMaxScaler(feature_range=(0,1))
scaler
# In[69]:
# fit the imputer on train and test combined so the medians cover both sets
imputer.fit(pd.concat([train,application_test[features]]))
# In[70]:
train=imputer.transform(train)
test=imputer.transform(application_test[features])
train
# In[71]:
scaler.fit(train)
train=scaler.transform(train)
test=scaler.transform(test)
test
# In[72]:
# train the model
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression(C=0.0001)
log_reg.fit(train,application_train['TARGET'])
# In[73]:
# predict; keep only the second column (the probability of class 1)
log_reg_pred=log_reg.predict_proba(test)[:,1]
test
# In[74]:
# get the feature coefficients
coefficients=log_reg.coef_[0]# coef_ has shape (1, n_features); take the first row as a 1-D array
coefficients
# In[75]:
# feature importance as the absolute value of the coefficients
feature_importance=np.abs(coefficients)
# In[76]:
# sort the importances to get each feature's rank
sorted_indices=np.argsort(feature_importance)[::-1]
# In[77]:
for idx in sorted_indices:
    print(f"{features[idx]},IMPORTANCE:{feature_importance[idx]}")
# In[78]:
np.argsort(feature_importance)
# In[79]:
coefficients[::-1]
# In[80]:
# build the submission dataframe
submit=application_test[['SK_ID_CURR']].copy()
# In[81]:
submit['TARGET']=log_reg_pred
# In[82]:
submit
# In[83]:
# save the submission
submit.to_csv('baseline_model_log_reg.csv',index=False)
# In[84]:
# now try a different type of model: a random forest
# unlike logistic regression it needs no feature scaling, only imputation
train=application_train.drop(columns=['TARGET','SK_ID_CURR'])# the ID and the label are not usable as features
features=list(train.columns)
imputer=SimpleImputer(strategy='median')
imputer.fit(pd.concat([train,application_test[features]]))
train=imputer.transform(train)
test=imputer.transform(application_test[features])
# In[85]:
from sklearn.ensemble import RandomForestClassifier
random_forest=RandomForestClassifier(n_estimators=1000,random_state=2024,verbose=1,n_jobs=-1)
# In[86]:
random_forest.fit(train,application_train['TARGET'])
# extract the feature importances
feature_importance_values=random_forest.feature_importances_
feature_importances=pd.DataFrame({'feature':features,'importance':feature_importance_values})
# In[88]:
# predict on the test data
predictions=random_forest.predict_proba(test)[:,1]
# and build the submission file
submit=application_test[['SK_ID_CURR']].copy()
submit['TARGET']=predictions
# In[89]:
# save the submission
submit.to_csv('baseline_model_random_forest.csv',index=False)
# scores about 0.703, slightly better than logistic regression
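Both scores above are ROC AUC values from the public leaderboard. A minimal sketch of a local holdout check (hypothetical, not part of the original notebook; it reuses the imputed train array and the labels from the cells above):

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# hold out 20% of the training rows and score with the competition metric
X_tr, X_val, y_tr, y_val = train_test_split(
    train, application_train['TARGET'],
    test_size=0.2, random_state=2024, stratify=application_train['TARGET'])
rf_val = RandomForestClassifier(n_estimators=200, random_state=2024, n_jobs=-1)
rf_val.fit(X_tr, y_tr)
print('local holdout AUC:', roc_auc_score(y_val, rf_val.predict_proba(X_val)[:, 1]))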