最近用 Python 写了一个查询全国疫情中高风险地区的爬虫,分享给大家一起交流,希望能得到不同思路的指教,让代码更简洁、运行效率更高。
总体思路
1、找到可供查询的源网站
2、分析、获取查询的API
3、构造API
4、获取全国中高风险地区信息
5、对信息按“省/直辖市、市/区、县/街道”三级进行整理
6、构造查询匹配语句,判断输入的“省/直辖市、市/区、县/街道”是否在获取的信息中
注:输入匹配部分的代码自己总觉得不是最优解,希望能有大佬看到,指点一二。
代码部分
# -*- coding: utf-8 -*-
# @Time : 2022/5/12 11:08
# @Author : Kyln.Wu
# @Email : kylnwu@qq.com
# @File : 疫情风险地区查询.py
# @Software: PyCharm
import hashlib
import json
import difflib
import requests
import time# 获取当前时间戳
# Current Unix time in whole seconds, as a string; captured once at import.
timestamp = f'{int(time.time())}'

# Constants needed to reproduce the site's request signing.
# The full secret was not allowed in the published literal, so the real value
# is recorded in this comment instead: 23y0ufFl5YxIyGrI8hWRUZmKkvtSjLQA
token = '*********************'
nonce = '123456789abcdefg'
passid = 'zdww'
key = '3C502C97ABDA40D0A60FBEE50FAAD1DA'

# Reverse-engineered in Python: the value of the zdwwsignature variable that
# has to be inserted into the request headers.
def get_zdwwsignature(ts=None):
    """Build the value sent in the ``x-wif-signature`` request header.

    The signature is the upper-case hexadecimal SHA-256 digest of
    ``ts + secret + nonce + ts``, where the secret and the nonce are the
    fixed strings below (the nonce is the same string that ``get_datas``
    sends as ``x-wif-nonce``).

    Args:
        ts: Timestamp string to sign. When omitted, the module-level
            ``timestamp`` is used, so zero-argument callers behave as before.

    Returns:
        The 64-character upper-case hexadecimal digest.
    """
    if ts is None:
        ts = timestamp
    payload = ts + 'fTN2pfuisxTavbTuYVSsNJHetwq5bJvC' + 'QkjjtiLM2dCratiA' + ts
    return hashlib.sha256(payload.encode('utf-8')).hexdigest().upper()


# Reverse-engineered in Python: the value of the signatureheader variable that
# has to be inserted into the request params.
def get_signatureheader(ts=None):
    """Build the ``signatureHeader`` value sent in the request params.

    The signature is the upper-case hexadecimal SHA-256 digest of
    ``ts + token + nonce + ts``, using the module-level ``token`` and
    ``nonce`` constants.

    Args:
        ts: Timestamp string to sign. When omitted, the module-level
            ``timestamp`` is used, so zero-argument callers behave as before.

    Returns:
        The 64-character upper-case hexadecimal digest.
    """
    if ts is None:
        ts = timestamp
    payload = ts + token + nonce + ts
    return hashlib.sha256(payload.encode('utf-8')).hexdigest().upper()


# This is the main worker: call the API with the required parameters to fetch
# the nationwide data.
def get_datas(timeout=10):
    """Download the nationwide risk-area data and cache it in ``./risk_data.log``.

    The raw response text is written to disk so that later queries can work
    from the local copy instead of hitting the site again (fewer online
    requests, lower chance of the IP being blocked).

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.RequestException: On connection problems, on timeout, or
            when the server answers with an HTTP error status. In that case
            the cache file is not touched.
    """
    url = 'https://bmfw.www.gov.cn/bjww/interface/interfaceJson'
    # Content-Length and the Sec-Fetch-* headers seen in the browser request
    # are intentionally not sent.
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/json; charset=UTF-8",
        "Host": "bmfw.www.gov.cn",
        "Origin": "http://bmfw.www.gov.cn",
        "Referer": "http://bmfw.www.gov.cn/yqfxdjcx/risk.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0",
        "x-wif-nonce": "QkjjtiLM2dCratiA",
        "x-wif-paasid": "smt-application",
        "x-wif-signature": get_zdwwsignature(),
        "x-wif-timestamp": timestamp,
    }
    # Reuse the module-level constants so the nonce sent here can never drift
    # from the nonce that get_signatureheader() signs.
    params = {
        'appId': "NcApplication",
        'paasHeader': passid,
        'timestampHeader': timestamp,
        'nonceHeader': nonce,
        'signatureHeader': get_signatureheader(),
        'key': key,
    }
    resp = requests.post(url, headers=headers, json=params, timeout=timeout)
    # Fail loudly instead of caching an HTTP error page as if it were data.
    resp.raise_for_status()
    datas = resp.text
    with open('./risk_data.log', 'w', encoding='utf-8') as f:
        f.write(datas)


# Return the "highlist" part of the data, i.e. the high-risk areas.
def get_highlist(data):
    """Return the high-risk area records from the decoded API response.

    Args:
        data: Decoded JSON response; the records are read from
            ``data['data']['highlist']``.

    Returns:
        The value stored under ``data['data']['highlist']``.

    Raises:
        KeyError: If ``'data'`` or ``'highlist'`` is missing.
    """
    return data['data']['highlist']


# Return the "middlelist" part of the data, i.e. the medium-risk areas.
def get_middlelist(data):
    """Return the medium-risk area records from the decoded API response.

    Args:
        data: Decoded JSON response; the records are read from
            ``data['data']['middlelist']``.

    Returns:
        The value stored under ``data['data']['middlelist']``.

    Raises:
        KeyError: If ``'data'`` or ``'middlelist'`` is missing.
    """
    return data['data']['middlelist']


# Interactive lookup of medium/high-risk areas. The matching algorithm is not
# very rigorous and still needs improvement.
# Risk labels ordered from most to least severe; the tuple index is used to
# compare two levels.
_RISK_LABELS = ('高', '中', '低')


def _unique_values(records, field):
    """Return the sorted distinct values of *field* across *records*."""
    return sorted({record[field] for record in records})


def _risk_label(name, high_names, middle_names):
    """Return '高', '中' or '低' for *name* using fuzzy matching (cutoff 0.6).

    The high-risk names are checked first, so a name found in both lists is
    reported with the more severe level.
    """
    if difflib.get_close_matches(name, high_names, 1, cutoff=0.6):
        return '高'
    if difflib.get_close_matches(name, middle_names, 1, cutoff=0.6):
        return '中'
    return '低'


def _risk_message(province_in, city_in, county_in, labels):
    """Format the result line for one query, or the mismatch notice."""
    province_risk, city_risk, county_risk = labels
    ranks = [_RISK_LABELS.index(label) for label in labels]
    # A province is at least as risky as its cities, and a city at least as
    # risky as its counties. Any other combination means the three inputs do
    # not belong together.
    if ranks != sorted(ranks):
        return '不在中高风险列表中,或行政区域不匹配,请检查!!'
    if province_risk == city_risk == county_risk == '高':
        return f'{province_in},{city_in},{county_in} 为高风险省/直辖市,市/区,县/街道!!'
    if province_risk == city_risk == county_risk == '低':
        return f'{province_in},{city_in},{county_in} 为低风险省/直辖市,市/区,县/街道。'
    return (
        f'{province_in} 为{province_risk}风险省/直辖市,'
        f'{city_in} 为{city_risk}风险市/区,'
        f'{county_in} 为{county_risk}风险县/街道。'
    )


def _ask_continue():
    """Ask whether to run another query; re-prompt until an integer is given.

    Returns:
        False when the user enters 0, True for any other integer.
    """
    while True:
        answer = input('是否继续查询?1-继续,0-退出。')
        try:
            return int(answer) != 0
        except ValueError:
            # Non-numeric input used to crash the program; ask again instead.
            continue


def chaxun(high_list, middle_list):
    """Interactively look up the risk level of a province / city / county.

    Prints the distinct high- and medium-risk provinces, cities and counties,
    then repeatedly reads three names from standard input and prints the
    risk level of each. Names are matched fuzzily with
    ``difflib.get_close_matches`` (best match only, cutoff 0.6).

    The loop ends when any of the three inputs is empty or when the user
    answers 0 to the "continue?" prompt.

    NOTE: each level is matched against its own list of names, independently
    of the other two levels, so the check does not verify that the matched
    city actually lies in the matched province.

    Args:
        high_list: High-risk records; each is a mapping with the keys
            ``'province'``, ``'city'`` and ``'county'``.
        middle_list: Medium-risk records with the same keys.
    """
    # Sorted so the printed lists are stable from run to run.
    high_provinces = _unique_values(high_list, 'province')
    high_citys = _unique_values(high_list, 'city')
    high_countys = _unique_values(high_list, 'county')
    print(f'高风险省/直辖市:{high_provinces}')
    print(f'高风险市/区:{high_citys}')
    print(f'高风险县/街道:{high_countys}')

    middle_provinces = _unique_values(middle_list, 'province')
    middle_citys = _unique_values(middle_list, 'city')
    middle_countys = _unique_values(middle_list, 'county')
    print(f'中风险省/直辖市:{middle_provinces}')
    print(f'中风险市/区:{middle_citys}')
    print(f'中风险县/街道:{middle_countys}')

    while True:
        province_in = input('请输入来自省/直辖市:')
        if not province_in:
            print('输入省/直辖市不能为空!')
            break
        city_in = input('请输入来自市/区:')
        if not city_in:
            print('输入市/区不能为空!')
            break
        county_in = input('请输入来自县/街道:')
        if not county_in:
            print('输入县/街道不能为空!')
            break

        labels = (
            _risk_label(province_in, high_provinces, middle_provinces),
            _risk_label(city_in, high_citys, middle_citys),
            _risk_label(county_in, high_countys, middle_countys),
        )
        print(_risk_message(province_in, city_in, county_in, labels))

        if not _ask_continue():
            break


if __name__ == '__main__':
    # Queries run against the local cache so the site is contacted as rarely
    # as possible. The data is downloaded only when the cache does not exist
    # yet (i.e. on the first run).
    try:
        with open('./risk_data.log', 'r', encoding='utf-8') as f:
            datas_dic = json.loads(f.read())
    except FileNotFoundError:
        get_datas()
        with open('./risk_data.log', 'r', encoding='utf-8') as f:
            datas_dic = json.loads(f.read())
    high_lst = get_highlist(datas_dic)
    middle_lst = get_middlelist(datas_dic)
    chaxun(high_lst, middle_lst)
运行结果
以上是运行结果。输入的地名支持模糊匹配(基于 difflib,相似度阈值为 0.6),不必与名单中的名称完全一致。