文章目录
- PSI-群体稳定性指标(离散型)
- 单个指标计算
- 所有指标计算
PSI-群体稳定性指标(离散型)
单个指标计算
- 代码
import pandas as pd

# Load the source data set from a local CSV file.
df = pd.read_csv('/Users/mengzhichao/Desktop/文件/图表/小微企业用电量数据.csv')

# Draw the two samples that are compared below. Each call samples
# independently from the full frame, so the same row may end up in both
# samples, and the draws differ between runs (no random_state is set).
X_train = df.sample(n=7000)
X_test = df.sample(n=3000)
- 计算单个特征的 PSI
def _bucket_counts(frame, feature_name, count_column):
    """Return per-category row counts of ``frame[feature_name]``; NaN is kept as its own bucket."""
    counts = frame[feature_name].value_counts(dropna=False).reset_index()
    # Assign names positionally: the labels produced by reset_index() are
    # overwritten here, so the code does not depend on what pandas calls them.
    counts.columns = ['buckets', count_column]
    counts['feature'] = feature_name
    return counts[['feature', 'buckets', count_column]]


def psi_calculate_category(origin, new, feature_name):
    """Compute the PSI (population stability index) of one categorical column.

    ``origin`` is the actual data and ``new`` is the expected data.

    Every distinct value of the column (including NaN) is one bucket. For
    each bucket the share in both data sets is computed with the count
    incremented by 1, so that a category that is absent from one data set
    does not produce a zero inside the logarithm. The PSI is
    ``sum((origin_percent - new_percent) * ln(origin_percent / new_percent))``.

    Parameters
    ----------
    origin : DataFrame
        Actual data.
    new : DataFrame
        Expected data.
    feature_name : str
        Name of the categorical column whose PSI is computed.

    Returns
    -------
    psi : float
        PSI value of the column.
    psi_df : DataFrame
        Per-bucket detail with the columns ``feature``, ``buckets``,
        ``origin_cnt``, ``new_cnt``, ``origin_percent``, ``new_percent``,
        ``minus``, ``log``, ``psi_bucket`` and ``psi`` (the total, repeated
        on every row).

    Raises
    ------
    ValueError
        If ``origin`` or ``new`` has no rows (the shares would be divided
        by zero).

    Examples
    --------
    >>> psi, psi_df = psi_calculate_category(origin=X_train, new=X_test, feature_name='dimvl_name')
    """
    # Imported locally because the visible module header only imports pandas.
    import numpy as np

    if len(origin) == 0 or len(new) == 0:
        raise ValueError('origin and new must both contain at least one row')

    origin_cut = _bucket_counts(origin, feature_name, 'origin_cnt')
    new_cut = _bucket_counts(new, feature_name, 'new_cnt')

    # Outer join: a category seen in only one data set must still contribute
    # to the PSI. An inner join would silently drop it and understate the
    # shift between the two data sets.
    psi_df = pd.merge(origin_cut, new_cut, on=['feature', 'buckets'], how='outer')
    count_columns = ['origin_cnt', 'new_cnt']
    psi_df[count_columns] = psi_df[count_columns].fillna(0).astype('int64')

    # Shares with +1 smoothing on the numerator so that a zero count never
    # reaches the logarithm; the denominators are the row totals (> 0 here).
    psi_df['origin_percent'] = (psi_df['origin_cnt'] + 1) / psi_df['origin_cnt'].sum()
    psi_df['new_percent'] = (psi_df['new_cnt'] + 1) / psi_df['new_cnt'].sum()

    psi_df['minus'] = psi_df['origin_percent'] - psi_df['new_percent']
    psi_df['log'] = np.log(psi_df['origin_percent'] / psi_df['new_percent'])
    psi_df['psi_bucket'] = psi_df['minus'] * psi_df['log']

    psi = psi_df['psi_bucket'].sum()
    psi_df['psi'] = psi
    return psi, psi_df


# Example: PSI of the '区域' column between the two samples.
psi, psi_df = psi_calculate_category(origin=X_train, new=X_test, feature_name='区域')
所有指标计算
- 计算所有类别型特征的 PSI:
# All categorical (object-dtype) feature columns of the data set.
all_category_feature_list = df.select_dtypes(include='object').columns.tolist()

psi_list = []
psi_df_list = []
# NOTE(review): `notebook` is not imported in the visible part of the file;
# presumably `from tqdm import notebook` is expected -- confirm.
for feature in notebook.tqdm(all_category_feature_list):
    psi, psi_df = psi_calculate_category(X_train, X_test, feature_name=feature)
    psi_list.append((feature, psi))
    psi_df_list.append(psi_df)

# One row per feature with its overall PSI.
psi = pd.DataFrame(psi_list, columns=['feature', 'psi'])
# Per-bucket detail of every feature, keeping each frame's own index.
# pd.concat raises on an empty list, so fall back to an empty frame when
# the data set has no object-dtype columns.
psi_df = pd.concat(psi_df_list, ignore_index=False) if psi_df_list else pd.DataFrame()