Python员工离职数据分析
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is printed (no column truncation)
pd.set_option('display.max_columns', None)
# Default seaborn color palette; individual plots index into it for their color
colors = sns.color_palette()
# Number of decimal places shown when printing floats.
# The fully qualified key is used on purpose: on newer pandas the bare
# 'precision' pattern also matches 'styler.format.precision', which makes
# the lookup ambiguous and raises OptionError.
pd.set_option('display.precision', 3)
# Use the SimHei font so Chinese axis labels and titles render,
# and disable the Unicode minus glyph so negative signs display correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
项目名称:IBM员工离职数据分析
数据来源:https://tianchi.aliyun.com/dataset/dataDetail?dataId=77180
# Path of the HR attrition CSV, relative to the current working directory
data_path = './data/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(data_path)
# Print column names, non-null counts and dtypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
Age 1470 non-null int64
Attrition 1470 non-null object
BusinessTravel 1470 non-null object
DailyRate 1470 non-null int64
Department 1470 non-null object
DistanceFromHome 1470 non-null int64
Education 1470 non-null int64
EducationField 1470 non-null object
EmployeeCount 1470 non-null int64
EmployeeNumber 1470 non-null int64
EnvironmentSatisfaction 1470 non-null int64
Gender 1470 non-null object
HourlyRate 1470 non-null int64
JobInvolvement 1470 non-null int64
JobLevel 1470 non-null int64
JobRole 1470 non-null object
JobSatisfaction 1470 non-null int64
MaritalStatus 1470 non-null object
MonthlyIncome 1470 non-null int64
MonthlyRate 1470 non-null int64
NumCompaniesWorked 1470 non-null int64
Over18 1470 non-null object
OverTime 1470 non-null object
PercentSalaryHike 1470 non-null int64
PerformanceRating 1470 non-null int64
RelationshipSatisfaction 1470 non-null int64
StandardHours 1470 non-null int64
StockOptionLevel 1470 non-null int64
TotalWorkingYears 1470 non-null int64
TrainingTimesLastYear 1470 non-null int64
WorkLifeBalance 1470 non-null int64
YearsAtCompany 1470 non-null int64
YearsInCurrentRole 1470 non-null int64
YearsSinceLastPromotion 1470 non-null int64
YearsWithCurrManager 1470 non-null int64
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()
Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.0 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.00 | 1470.000 | 1470.000 | 1470.0 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 | 1470.000 |
mean | 36.924 | 802.486 | 9.193 | 2.913 | 1.0 | 1024.865 | 2.722 | 65.891 | 2.730 | 2.064 | 2.729 | 6502.931 | 14313.103 | 2.693 | 15.21 | 3.154 | 2.712 | 80.0 | 0.794 | 11.280 | 2.799 | 2.761 | 7.008 | 4.229 | 2.188 | 4.123 |
std | 9.135 | 403.509 | 8.107 | 1.024 | 0.0 | 602.024 | 1.093 | 20.329 | 0.712 | 1.107 | 1.103 | 4707.957 | 7117.786 | 2.498 | 3.66 | 0.361 | 1.081 | 0.0 | 0.852 | 7.781 | 1.289 | 0.706 | 6.127 | 3.623 | 3.222 | 3.568 |
min | 18.000 | 102.000 | 1.000 | 1.000 | 1.0 | 1.000 | 1.000 | 30.000 | 1.000 | 1.000 | 1.000 | 1009.000 | 2094.000 | 0.000 | 11.00 | 3.000 | 1.000 | 80.0 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 |
25% | 30.000 | 465.000 | 2.000 | 2.000 | 1.0 | 491.250 | 2.000 | 48.000 | 2.000 | 1.000 | 2.000 | 2911.000 | 8047.000 | 1.000 | 12.00 | 3.000 | 2.000 | 80.0 | 0.000 | 6.000 | 2.000 | 2.000 | 3.000 | 2.000 | 0.000 | 2.000 |
50% | 36.000 | 802.000 | 7.000 | 3.000 | 1.0 | 1020.500 | 3.000 | 66.000 | 3.000 | 2.000 | 3.000 | 4919.000 | 14235.500 | 2.000 | 14.00 | 3.000 | 3.000 | 80.0 | 1.000 | 10.000 | 3.000 | 3.000 | 5.000 | 3.000 | 1.000 | 3.000 |
75% | 43.000 | 1157.000 | 14.000 | 4.000 | 1.0 | 1555.750 | 4.000 | 83.750 | 3.000 | 3.000 | 4.000 | 8379.000 | 20461.500 | 4.000 | 18.00 | 3.000 | 4.000 | 80.0 | 1.000 | 15.000 | 3.000 | 3.000 | 9.000 | 7.000 | 3.000 | 7.000 |
max | 60.000 | 1499.000 | 29.000 | 5.000 | 1.0 | 2068.000 | 4.000 | 100.000 | 4.000 | 5.000 | 4.000 | 19999.000 | 26999.000 | 9.000 | 25.00 | 4.000 | 4.000 | 80.0 | 3.000 | 40.000 | 6.000 | 4.000 | 40.000 | 18.000 | 15.000 | 17.000 |
# Plot the distribution of every column as a grid of histograms.
colnm = df.columns.to_list()
n_cols = 7
# Ceiling division so the grid always has room for every column
# (35 columns -> 5 rows of 7).
n_rows = -(-len(colnm) // n_cols)
plt.figure(figsize=(35, 25))
for i, name in enumerate(colnm, start=1):
    plt.subplot(n_rows, n_cols, i)  # i-th cell of the n_rows x n_cols grid
    df[name].hist(bins=80, color=colors[1])  # bins sets the number of bars
    plt.xlabel(name, fontsize=13)
    plt.ylabel('Frequency')
plt.tight_layout()
print('\n figure 01')
figure 01
得出结论:该公司员工平均年龄约为37岁,平均工作年限约为11年,最长工作年限为40年,月平均收入约为6502.93。
# Attrition by department: grouped bar chart of headcount per
# (Department, Attrition) pair.
df1 = pd.crosstab(df['Department'], df['Attrition'])
# Pass figsize to DataFrame.plot itself: it opens its own figure, so a
# preceding plt.figure(figsize=...) would only leave an empty figure behind
# and the requested size would never be applied.
df1.plot(kind='bar', figsize=(15, 9))
plt.legend()
plt.xlabel('部门')
plt.ylabel('人数')
plt.xticks(rotation=0)  # keep department names horizontal
plt.title('公司各部门离职率分析')
print('\n figure 02')
figure 02
得出结论:公司主要由三个部门组成(人力资源,科技部门,销售部门),其中科技部门的总人数最多,相对应的离职人数也是最多的,人力资源部门总人数和离职人数都最少
# Share of employees who left versus those who stayed, as a pie chart.
attrition = df['Attrition']
labels = ['离职', '在职']
# One count per label, in the same order: leavers ('Yes') then stayers ('No')
sizes = [sum(attrition == answer) for answer in ('Yes', 'No')]
explode = (0, 0.001)
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=False,
    startangle=150,
)
plt.title("公司离职人数占比分析")
print('\n figure 03')
figure 03
得出结论:公司离职率是16%
# Overtime versus attrition: employee counts per OverTime value, split by Attrition.
plt.figure(figsize=(6, 4))
overtime_ax = sns.countplot(data=df, x='OverTime', hue='Attrition', color=colors[4])
overtime_ax.legend()
overtime_ax.set_xlabel('加班时长')
overtime_ax.set_title('加班与离职率的关系')
print('\n figure 04')
figure 04
得出结论:加班时间越长,员工的离职率越高,加班是导致离职的重要原因
# Business travel versus attrition: employee counts per BusinessTravel value, split by Attrition.
plt.figure(figsize=(6, 4))
travel_ax = sns.countplot(data=df, x='BusinessTravel', hue='Attrition')
travel_ax.legend()
travel_ax.set_xlabel('商务出差频次')
travel_ax.set_title('商务出差与离职率的关系')
print('\n figure 05')
figure 05
得出结论:如果频繁的商务出差,员工的离职率会更高
# Education level versus attrition: employee counts per Education value, split by Attrition.
plt.figure(figsize=(6, 4))
education_ax = sns.countplot(data=df, x='Education', hue='Attrition', color=colors[6])
education_ax.legend()
education_ax.set_xlabel('教育程度')
education_ax.set_title('教育程度与离职率的关系')
print('\n figure 06')
figure 06
得出结论:科技公司大部分学历中偏上,教育程度越高,更有竞争力,离职率也越低
# Gender versus attrition: employee counts per Gender value, split by Attrition.
plt.figure(figsize=(6, 4))
gender_ax = sns.countplot(data=df, x='Gender', hue='Attrition', color=colors[8])
gender_ax.legend()
gender_ax.set_xlabel('性别')
gender_ax.set_title('性别与离职率的关系')
print('\n figure 07')
figure 07
得出结论:科技公司男性人数比较多,通过公司人口基数来对比,女性的离职率是要高于男性的
# Marital status pie chart.
# Labels and counts are derived from one mapping so every slice carries the
# label of the status it actually counts (previously the '单身' label was
# attached to the Married count and '已婚' to the Single count).
# NOTE(review): the slices count ALL employees per marital status, not only
# those who left - confirm this is the intended measure for this title.
status_labels = {'Single': '单身', 'Married': '已婚', 'Divorced': '离婚'}
labels = list(status_labels.values())
sizes = [sum(df['MaritalStatus'] == status) for status in status_labels]
explode = (0.04, 0, 0.001)
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
plt.title("婚姻状况与离职率的关系")
print('\n figure 08')
figure 08
得出结论:公司单身离职率最高,离婚后的离职率最低
# Monthly income versus attrition: number of leavers in each income band.
# Bands are half-open so boundary incomes (exactly 8000 or 15000) land in
# exactly one band instead of being dropped.
income = df['MonthlyIncome']
df_income1 = df[income < 8000]
df_income2 = df[(income >= 8000) & (income < 15000)]
df_income3 = df[income >= 15000]
labels = ['月收入小于8000离职率', '月收入8000-15000离职率', '月收入大于15000离职率']
# Count leavers inside each band; counting on the full DataFrame would give
# the same company-wide total for every band.
sizes = [sum(band['Attrition'] == 'Yes') for band in (df_income1, df_income2, df_income3)]
explode = (0.04, 0, 0.001)
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
plt.title("收入与离职率的关系")
print('\n figure 09')
figure 09
得出结论:离职员工主要集中在月收入低于8000的群体,月收入越高,离职人数越少
# Number of previous employers versus attrition: leavers per bucket.
# Buckets are half-open - [0, 3), [3, 6), [6, 10) - so employees with exactly
# 0, 3 or 6 previous companies are counted once instead of being excluded.
companies = df['NumCompaniesWorked']
df_worked1 = df[(companies >= 0) & (companies < 3)]
df_worked2 = df[(companies >= 3) & (companies < 6)]
df_worked3 = df[(companies >= 6) & (companies < 10)]
x = ['0-3家', '3-6家', '6-10家']
y = [sum(bucket['Attrition'] == 'Yes') for bucket in (df_worked1, df_worked2, df_worked3)]
plt.plot(x, y, color='r')
plt.xlabel('员工工作过公司数量')
plt.ylabel('离职人数')
plt.title("员工工作过的公司与离职率的关系")
print('\n figure 10')
figure 10
得出结论:员工工作过的公司越少,离职率越低,说明频繁跳槽的员工稳定性较差,更容易离职。
总结:
1.加班是导致离职最重要的原因。
建议:完善加班制度,可以对员工每月加班的时间进行限制。
加强员工培训,合理安排工作时间内的工作和会议,提高工作效率。
2.差旅次数过多也会导致员工离职。因为员工长期出差没办法兼顾家庭并且工作负担也比较大。
建议:合理安排员工的差旅次数。例如实施轮流差旅措施
3.工作年数也和离职率密切相关。工作年数越长,职位水平越高,离职率越低。
建议:公司招聘时考察员工的稳定性,一般情况下,以前待过的公司越少越好。
4.单身的人离职率较高。
建议:公司多关照单身的人,别让加班耽误了单身青年谈恋爱,相亲的时间。