今年大学毕业生预计突破900万大关。
大学毕业生的数量在逐年增加。
根据教育部最新官方数据,预测明年也就是2021年我国高校毕业生人数将达到909万人,首次突破900万人!到2022年,我国高校毕业生人数将超过1000万人!目前我国14亿多人口中,只有1.7亿名大学生,再过10年左右时间,随着各高校毕业生进入社会的各行各业,我国总人口中就会有3亿多名大学生!在这个大学生遍地走的时代,了解各种工作岗位的情况是非常有必要的。
什么是pyecharts?
pyecharts 是一个用于生成 Echarts 图表的类库。
Echarts 是百度开源的一个数据可视化 JS 库;pyecharts 则是 Echarts 与 Python 的对接,让我们可以直接用 Python 代码生成 Echarts 图表。
使用 pyecharts 可以生成独立的网页,也可以在 flask , Django 中集成使用。
pyecharts 0.5 和 1.0 在用法上有较大差异,本项目使用的是 0.5 版本。
先安装所需的库
pip install pyecharts==0.5.11
pip install echarts-countries-pypkg
pip install pyecharts-snapshot
pip install bs4
pip install pandas
pip install jieba
pip install requests
pip install xlwt xlrd lxml
项目组成
本项目由两个py文件组成 ,最终结果如下
1.数据获取
进入前程无忧官网,随便输入一个关键词,通过分析URL,不难发现它的命名规律
获取数据并存为HTML文件
def getfront(page, item):
    """Download one 51job search-result page and return its cleaned-up text.

    Args:
        page: page number of the search results to fetch.
        item: search keyword; it is URL-quoted before being placed in the URL.

    Returns:
        The response decoded as GBK with every backslash, ``[`` and ``]``
        removed (presumably so that the regular expression in
        getInformation can match the embedded records). The same text is
        also written to ``51job.html`` in the working directory.
    """
    keyword = urllib.parse.quote(item)
    base_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    url = base_url + keyword + ',2,' + str(page) + '.html'
    # Context managers close the HTTP response and the dump file even when
    # decoding or writing raises.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('gbk')
    for char in ('\\', '[', ']'):
        html = html.replace(char, '')
    with open('51job.html', 'w', encoding='gbk') as dump:
        dump.write(html)
    return html
通过对HTML文件内容的分析,我们可以通过正则表达式获取其中有用的信息
# One job record inside the cleaned-up search page. re.S lets "." span
# line breaks inside a record.
_JOB_RECORD_RE = re.compile(
    r'\{"type":"engine_search_result","jt":"0".*?"job_href":"(.*?)","job_name":"(.*?)".*?"company_href":"(.*?)","company_name":"(.*?)","providesalary_text":"(.*?)".*?"updatedate":"(.*?)".*?,'
    r'"companytype_text":"(.*?)".*?"jobwelf":"(.*?)".*?"attribute_text":"(.*?)","(.*?)","(.*?)","(.*?)","companysize_text":"(.*?)","companyind_text":"(.*?)","adid":""},',
    re.S,
)


def getInformation(html):
    """Extract every job record from the text returned by getfront.

    Args:
        html: page text with backslashes and square brackets already removed.

    Returns:
        A list of 14-tuples of strings, in the order captured by the
        pattern: job_href, job_name, company_href, company_name,
        providesalary_text, updatedate, companytype_text, jobwelf, the four
        attribute_text entries, companysize_text, companyind_text.
        The list is empty when nothing matches.
    """
    items = _JOB_RECORD_RE.findall(html)
    print(items)  # kept from the original: echoes the scraped records
    return items
将获取到的信息存入XLS表格
# Index of the getInformation() tuple field written to sheet columns 1..10
# (职位, 公司名称, 公司地点, 公司性质, 薪资, 学历要求, 工作经验, 公司规模,
# 公司福利, 发布时间). Column 0 holds the running row number.
FIELD_ORDER = (1, 3, 8, 6, 4, 10, 9, 12, 7, 5)

for j in range(1, 6):  # result pages 1 to 5
    print("正在爬取"+item+"第" + str(j) + "页数据...")
    html = getfront(j, item)
    for i in getInformation(html):
        sheet1.write(number, 0, number)
        for column, field in enumerate(FIELD_ORDER, start=1):
            sheet1.write(number, column, i[field])
        number += 1
    # Save after every page so rows already scraped survive a later failure,
    # then pause briefly before the next request.
    excel1.save(item+".xls")
    time.sleep(0.3)
2.数据清洗
这时的数据我们还不能直接用,其中有空缺或者错位,工资单位也需要统一成万/月,也要筛除其中不符合我们预期的岗位信息。特别需要注意的是,每进行一种处理,就要对表格重新排序(重置索引),不然在下一次处理时会因为序号缺失而报错
def _to_wan_per_month(text):
    """Convert a 'low-high万/年' or 'low-high千/月' salary to 'low-high万/月'.

    Values that are not strings, use another unit, or do not contain a
    numeric 'low-high' range are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    for marker, unit, divisor in ((u'万/年', u'万', 12), (u'千', u'千', 10)):
        if marker not in text:
            continue
        bounds = text.split(unit)[0].split('-')
        try:
            low, high = float(bounds[0]), float(bounds[1])
        except (IndexError, ValueError):
            return text  # e.g. open-ended ranges: leave as scraped
        return format(low / divisor, '.2f') + '-' + format(high / divisor, '.2f') + u'万/月'
    return text


data = pd.read_excel(item+'.xls')
pd.set_option('display.max_rows', None)  # show every row when printing

# Drop rows with any empty cell, then renumber the index.
a = pd.DataFrame(data).dropna(axis=0, how='any').reset_index(drop=True)

# Drop rows whose 学历要求 cell contains '人'; the original author treats
# these as misaligned records. Non-string cells are kept.
misaligned = a['学历要求'].map(lambda cell: isinstance(cell, str) and '人' in cell)
a = a[~misaligned.astype(bool)].reset_index(drop=True)

# Normalise every salary to 万/月.
a['薪资'] = a['薪资'].map(_to_wan_per_month)

a.to_excel(item+'2.xls', sheet_name=item, index=False)
处理后的结果存为另一个XLS文件
3.数据处理及可视化
1.学历要求玫瑰图
# Rose chart of the education-requirement counts held in dir1.
pie = Pie("学历要求", title_pos='center')
attr = dir1.keys()
value = dir1.values()
pie.add(
    "",
    attr,
    value,
    center=[60, 60],
    is_random=False,
    radius=[30, 75],
    rosetype='radius',
    is_legend_show=False,
    is_label_show=True,
    legend_orient='vertical',
)
pie.render('学历要求玫瑰图.html')
2.城市需求分布图
def get_address(list):
    """Count how many times each city occurs, ignoring '异地招聘'.

    Args:
        list: iterable of city names. The parameter name is kept for
            backward compatibility although it shadows the builtin.

    Returns:
        dict mapping each distinct city to its number of occurrences, in
        first-seen order; the '异地招聘' key is removed when present.
    """
    address2 = {}
    for city in list:  # single pass instead of list.count() per distinct value
        address2[city] = address2.get(city, 0) + 1
    address2.pop('异地招聘', None)
    return address2
# City-demand distribution map built from the per-city counts.
dir2 = get_address(address)
attr2 = dir2.keys()
value2 = dir2.values()
geo = Geo(
    "人才需求分布图",
    title_color="#2E2E2E",
    title_text_size=24,
    title_top=20,
    title_pos="center",
    width=1300,
    height=600,
)
geo.add(
    "",
    attr2,
    value2,
    type="effectScatter",
    is_random=True,
    visual_range=[0, 30],
    maptype='china',
    symbol_size=8,
    effect_scale=5,
    is_visualmap=True,
    is_label_show=True,
)
geo.render('城市需求分布图.html')
3.工作经验漏斗图
def get_experience(list):
    """Count how many times each work-experience requirement occurs.

    Args:
        list: iterable of experience labels. The parameter name is kept for
            backward compatibility although it shadows the builtin.

    Returns:
        dict mapping each distinct label to its number of occurrences, in
        first-seen order.
    """
    experience2 = {}
    for label in list:  # single pass instead of list.count() per distinct value
        experience2[label] = experience2.get(label, 0) + 1
    return experience2
# Funnel chart of the work-experience counts.
dir3 = get_experience(experience)
attr3 = dir3.keys()
value3 = dir3.values()
funnel = Funnel("工作经验漏斗图", title_pos='center')
funnel.add(
    "",
    attr3,
    value3,
    center=[100, 100],
    is_label_show=True,
    label_pos="inside",
    label_text_color="#fff",
    legend_orient='vertical',
    legend_pos='left',
)
funnel.render('工作经验要求漏斗图.html')
4.公司性质直方图
# Tally how many postings each company type has.
for company_type in com:
    com_dict[company_type] = com_dict.get(company_type, 0) + 1

# Bar chart of the company-type counts.
bar = Bar('公司性质直方图', title_pos='center', width=800, height=500)
bar.add('', list(com_dict.keys()), list(com_dict.values()), is_label_show=True, mark_point=['min', 'max'])
bar.render('直方图.html')
5.岗位关键词词云图
# Concatenate every job title into one text for keyword extraction.
job_list = ''.join(job)
mytext = jieba.analyse.extract_tags(job_list, topK=30)  # top 30 keywords
mytext1 = jieba.lcut(job_list)  # every token, used for frequency counting

# Count each token once, then look up the extracted keywords.
token_counts = {}
for token in mytext1:
    token_counts[token] = token_counts.get(token, 0) + 1
num = [token_counts.get(word, 0) for word in mytext]

# Keyword word cloud.
wc = WordCloud()
wc.add('', mytext, num, word_size_range=[20, 200], shape='diamond')
wc.render('慈云.html')
6.页面整合
将所有页面整合到一个HTML页面上,形成最终的效果
# Put all charts on one page.
page = Page(page_title="前程无忧可视化大屏")
for chart in (pie, funnel, bar, geo, wc):
    page.add(chart)
page.render("page.html")

# (width, height, top, left) in px for the first five <div> elements of the
# rendered page, in document order.
DIV_LAYOUT = (
    (500, 350, 5, 0),
    (500, 350, 370, 50),
    (500, 350, 370, 961),
    (505, 350, 0, 961),
    (500, 800, 0, 450),
)
DIV_STYLE = ("width:{}px;height:{}px;position:absolute;top:{}px;left:{}px;"
             "border-style:solid;border-color:#444444;border-width:0px;")

# Re-open the rendered page and rewrite it with absolute positioning and a title.
with open("page.html", "r+", encoding='utf-8') as page_file:
    html_bf = BeautifulSoup(page_file, 'lxml')
    for div, box in zip(html_bf.select('div'), DIV_LAYOUT):
        div['style'] = DIV_STYLE.format(*box)
    body = html_bf.find("body")
    div_title = "<div align=\"center\" style=\"width:1500px;\">\n<span style=\"font-size:32px;font face=\'黑体\';color:#000000\"><b>{}工作情况统计与分析</b></div>".format(item)
    body["style"] = "background-color:#ffffff;"
    body.insert(0, BeautifulSoup(div_title, "lxml").div)
    html_new = str(html_bf)
    # Overwrite in place; the with-block closes the file afterwards.
    page_file.seek(0, 0)
    page_file.truncate()
    page_file.write(html_new)
最终效果如下
附上全部代码
数据获取(保存为 数据爬取.py,后面的可视化脚本会从该模块导入 item)
import re
import urllib.request
import xlwt
import urllib.parse
import time

# Browser-like request headers.
# NOTE(review): this dict is defined but never passed to urlopen() in
# getfront; left as in the original, confirm whether it should be sent.
header = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}


def getfront(page, item):
    """Download one 51job search-result page and return its cleaned-up text.

    Args:
        page: page number of the search results to fetch.
        item: search keyword; it is URL-quoted before being placed in the URL.

    Returns:
        The response decoded as GBK with every backslash, ``[`` and ``]``
        removed (presumably so that the regular expression in
        getInformation can match the embedded records). The same text is
        also written to ``51job.html`` in the working directory.
    """
    keyword = urllib.parse.quote(item)
    base_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    url = base_url + keyword + ',2,' + str(page) + '.html'
    # Context managers close the HTTP response and the dump file even when
    # decoding or writing raises.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('gbk')
    for char in ('\\', '[', ']'):
        html = html.replace(char, '')
    with open('51job.html', 'w', encoding='gbk') as dump:
        dump.write(html)
    return html


# One job record inside the cleaned-up search page. re.S lets "." span
# line breaks inside a record.
_JOB_RECORD_RE = re.compile(
    r'\{"type":"engine_search_result","jt":"0".*?"job_href":"(.*?)","job_name":"(.*?)".*?"company_href":"(.*?)","company_name":"(.*?)","providesalary_text":"(.*?)".*?"updatedate":"(.*?)".*?,'
    r'"companytype_text":"(.*?)".*?"jobwelf":"(.*?)".*?"attribute_text":"(.*?)","(.*?)","(.*?)","(.*?)","companysize_text":"(.*?)","companyind_text":"(.*?)","adid":""},',
    re.S,
)


def getInformation(html):
    """Extract every job record from the text returned by getfront.

    Args:
        html: page text with backslashes and square brackets already removed.

    Returns:
        A list of 14-tuples of strings, in the order captured by the
        pattern: job_href, job_name, company_href, company_name,
        providesalary_text, updatedate, companytype_text, jobwelf, the four
        attribute_text entries, companysize_text, companyind_text.
        The list is empty when nothing matches.
    """
    items = _JOB_RECORD_RE.findall(html)
    print(items)  # kept from the original: echoes the scraped records
    return items


# Kept at module level: the visualisation script imports `item` from here.
item = input("请输入要分析的岗位:")
number = 1  # next sheet row to write; row 0 is the header
# Create the workbook and a single sheet named after the search keyword.
excel1 = xlwt.Workbook()
sheet1 = excel1.add_sheet(item, cell_overwrite_ok=True)

# Header row, one title per column starting at column 0.
COLUMN_TITLES = (
    '序号', '职位', '公司名称', '公司地点', '公司性质', '薪资',
    '学历要求', '工作经验', '公司规模', '公司福利', '发布时间',
)
for column, title in enumerate(COLUMN_TITLES):
    sheet1.write(0, column, title)
# Index of the getInformation() tuple field written to sheet columns 1..10
# (职位, 公司名称, 公司地点, 公司性质, 薪资, 学历要求, 工作经验, 公司规模,
# 公司福利, 发布时间). Column 0 holds the running row number.
FIELD_ORDER = (1, 3, 8, 6, 4, 10, 9, 12, 7, 5)

for j in range(1, 6):  # result pages 1 to 5
    print("正在爬取"+item+"第" + str(j) + "页数据...")
    html = getfront(j, item)
    for i in getInformation(html):
        sheet1.write(number, 0, number)
        for column, field in enumerate(FIELD_ORDER, start=1):
            sheet1.write(number, column, i[field])
        number += 1
    # Save after every page so rows already scraped survive a later failure,
    # then pause briefly before the next request.
    excel1.save(item+".xls")
    time.sleep(0.3)
数据处理及可视化
import webbrowser #调用库
import pandas as pd
import re
from pyecharts import Funnel,Pie,Geo,Page,Bar,WordCloud
from 数据爬取 import item
from bs4 import BeautifulSoup
import jieba.analyse
import jieba
jieba.setLogLevel(jieba.logging.INFO)


def _to_wan_per_month(text):
    """Convert a 'low-high万/年' or 'low-high千/月' salary to 'low-high万/月'.

    Values that are not strings, use another unit, or do not contain a
    numeric 'low-high' range are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    for marker, unit, divisor in ((u'万/年', u'万', 12), (u'千', u'千', 10)):
        if marker not in text:
            continue
        bounds = text.split(unit)[0].split('-')
        try:
            low, high = float(bounds[0]), float(bounds[1])
        except (IndexError, ValueError):
            return text  # e.g. open-ended ranges: leave as scraped
        return format(low / divisor, '.2f') + '-' + format(high / divisor, '.2f') + u'万/月'
    return text


data = pd.read_excel(item+'.xls')
pd.set_option('display.max_rows', None)  # show every row when printing

# Drop rows with any empty cell, then renumber the index.
a = pd.DataFrame(data).dropna(axis=0, how='any').reset_index(drop=True)

# Drop rows whose 学历要求 cell contains '人'; the original author treats
# these as misaligned records. Non-string cells are kept.
misaligned = a['学历要求'].map(lambda cell: isinstance(cell, str) and '人' in cell)
a = a[~misaligned.astype(bool)].reset_index(drop=True)

# Normalise every salary to 万/月.
a['薪资'] = a['薪资'].map(_to_wan_per_month)

a.to_excel(item+'2.xls', sheet_name=item, index=False)

# Re-read the cleaned sheet for the charting steps.
file = pd.read_excel(item+'2.xls', sheet_name=item)
f = pd.DataFrame(file)
pd.set_option('display.max_rows', None)

# Columns used by the charts.
add = f['公司地点']
sly = f['薪资']
edu = f['学历要求']
exp = f['工作经验']
com = f['公司性质']
job = f['职位']

com_dict = {}
address = []
salary = []
education = []
experience = []

# Collect the per-row values each chart needs.
# NOTE(review): as in the original, the city is appended before the salary
# is parsed, so a row whose salary lacks two numbers still contributes its
# city but not its education/experience. Confirm whether that is intended.
for place, pay, degree, years in zip(add, sly, edu, exp):
    try:
        address.append(place.split('-')[0])  # keep the part before the first '-'
        bounds = re.findall(r'\d*\.?\d+', pay)
        salary.append([float(bounds[0]), float(bounds[1])])
        education.append(degree)
        experience.append(years)
    except (AttributeError, TypeError, IndexError, ValueError):
        # Non-string cell or salary without a low/high pair: skip the rest
        # of this row.
        continue


def get_edu(list):
    """Count how many times each education requirement occurs.

    Args:
        list: iterable of education labels. The parameter name is kept for
            backward compatibility although it shadows the builtin.

    Returns:
        dict mapping each distinct label to its number of occurrences, in
        first-seen order.
    """
    education2 = {}
    for label in list:  # single pass instead of list.count() per distinct value
        education2[label] = education2.get(label, 0) + 1
    return education2
dir1 = get_edu(education)

# Rose chart of the education-requirement counts. The `attr` assignment was
# fused into a comment line in the original and never executed.
attr = dir1.keys()
value = dir1.values()
pie = Pie("学历要求", title_pos='center')
pie.add("", attr, value, center=[60, 60], is_random=False, radius=[30, 75], rosetype='radius',
        is_legend_show=False, is_label_show=True, legend_orient='vertical')
pie.render('学历要求玫瑰图.html')


def get_address(list):
    """Count how many times each city occurs, ignoring '异地招聘'.

    Args:
        list: iterable of city names. The parameter name is kept for
            backward compatibility although it shadows the builtin.

    Returns:
        dict mapping each distinct city to its number of occurrences, in
        first-seen order; the '异地招聘' key is removed when present.
    """
    address2 = {}
    for city in list:  # single pass instead of list.count() per distinct value
        address2[city] = address2.get(city, 0) + 1
    address2.pop('异地招聘', None)
    return address2
# City-demand distribution map built from the per-city counts.
dir2 = get_address(address)
attr2 = dir2.keys()
value2 = dir2.values()
geo = Geo(
    "人才需求分布图",
    title_color="#2E2E2E",
    title_text_size=24,
    title_top=20,
    title_pos="center",
    width=1300,
    height=600,
)
geo.add(
    "",
    attr2,
    value2,
    type="effectScatter",
    is_random=True,
    visual_range=[0, 30],
    maptype='china',
    symbol_size=8,
    effect_scale=5,
    is_visualmap=True,
    is_label_show=True,
)
geo.render('城市需求分布图.html')


def get_experience(list):
    """Count how many times each work-experience requirement occurs.

    Args:
        list: iterable of experience labels. The parameter name is kept for
            backward compatibility although it shadows the builtin.

    Returns:
        dict mapping each distinct label to its number of occurrences, in
        first-seen order.
    """
    experience2 = {}
    for label in list:  # single pass instead of list.count() per distinct value
        experience2[label] = experience2.get(label, 0) + 1
    return experience2
# Funnel chart of the work-experience counts.
dir3 = get_experience(experience)
attr3 = dir3.keys()
value3 = dir3.values()
funnel = Funnel("工作经验漏斗图", title_pos='center')
funnel.add(
    "",
    attr3,
    value3,
    center=[100, 100],
    is_label_show=True,
    label_pos="inside",
    label_text_color="#fff",
    legend_orient='vertical',
    legend_pos='left',
)
funnel.render('工作经验要求漏斗图.html')

# Tally how many postings each company type has.
for company_type in com:
    com_dict[company_type] = com_dict.get(company_type, 0) + 1

# Bar chart of the company-type counts.
bar = Bar('公司性质直方图', title_pos='center', width=800, height=500)
bar.add('', list(com_dict.keys()), list(com_dict.values()), is_label_show=True, mark_point=['min', 'max'])
bar.render('直方图.html')

job_list = ''  # accumulator for the concatenated job titles
# Concatenate every job title into one text for keyword extraction.
job_list = ''.join(job)
mytext = jieba.analyse.extract_tags(job_list, topK=30)  # top 30 keywords
mytext1 = jieba.lcut(job_list)  # every token, used for frequency counting

# Count each token once, then look up the extracted keywords.
token_counts = {}
for token in mytext1:
    token_counts[token] = token_counts.get(token, 0) + 1
num = [token_counts.get(word, 0) for word in mytext]

# Keyword word cloud.
wc = WordCloud()
wc.add('', mytext, num, word_size_range=[20, 200], shape='diamond')
wc.render('慈云.html')

# Put all charts on one page.
page = Page(page_title="前程无忧可视化大屏")
for chart in (pie, funnel, bar, geo, wc):
    page.add(chart)
page.render("page.html")

# (width, height, top, left) in px for the first five <div> elements of the
# rendered page, in document order.
DIV_LAYOUT = (
    (500, 350, 5, 0),
    (500, 350, 370, 50),
    (500, 350, 370, 961),
    (505, 350, 0, 961),
    (500, 800, 0, 450),
)
DIV_STYLE = ("width:{}px;height:{}px;position:absolute;top:{}px;left:{}px;"
             "border-style:solid;border-color:#444444;border-width:0px;")

# Re-open the rendered page and rewrite it with absolute positioning and a title.
with open("page.html", "r+", encoding='utf-8') as page_file:
    html_bf = BeautifulSoup(page_file, 'lxml')
    for div, box in zip(html_bf.select('div'), DIV_LAYOUT):
        div['style'] = DIV_STYLE.format(*box)
    body = html_bf.find("body")
    div_title = "<div align=\"center\" style=\"width:1500px;\">\n<span style=\"font-size:32px;font face=\'黑体\';color:#000000\"><b>{}工作情况统计与分析</b></div>".format(item)
    body["style"] = "background-color:#ffffff;"
    body.insert(0, BeautifulSoup(div_title, "lxml").div)
    html_new = str(html_bf)
    # Overwrite in place; the with-block closes the file afterwards.
    page_file.seek(0, 0)
    page_file.truncate()
    page_file.write(html_new)

webbrowser.open_new_tab('page.html')  # open the finished dashboard