import gzip
import re
from urllib import request

import xlwt
from bs4 import BeautifulSoup

# Parameter notes for get_pls:
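# period: draw period number to fetch
# redo_num: retry counter, defaults to 0; it is incremented by 1 after each failed attempt, so leave it at the default
# redo_max: maximum number of retries, defaults to 5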
def get_pls(period, redo_num=0, redo_max=5) -> dict:
    """Fetch and parse one draw result page from kaijiang.500.com.

    The page for ``period`` is requested with browser-like headers, the gzip
    body is decompressed, and the winning digits, period label and dates are
    extracted with BeautifulSoup.

    Args:
        period: Draw period number; formatted into the URL with ``%05d``.
        redo_num: Number of retries already used. Leave at the default.
        redo_max: Maximum number of retries after a failed fetch.

    Returns:
        A dict with the keys ``period``, ``num_1``, ``num_2``, ``num_3``,
        ``all``, ``open_date`` and ``end_date`` (in that order). When every
        attempt fails, ``period`` holds the requested period and all other
        values are empty strings. Values that cannot be parsed from the page
        are empty strings as well.
    """
    attempt = redo_num
    while True:
        try:
            html_content = _fetch_pls_html(period)
        except (OSError, EOFError):
            # OSError covers network errors and a body that is not valid gzip;
            # EOFError is raised by gzip for a truncated stream.
            if attempt >= redo_max:
                return _empty_pls_result(period)
            attempt += 1
            continue
        return _parse_pls_html(html_content, period)


def _empty_pls_result(period) -> dict:
    """Return the placeholder row used when the page could not be fetched."""
    return {'period': period, 'num_1': '', 'num_2': '', 'num_3': '', 'all': '',
            'open_date': '', 'end_date': ''}


def _fetch_pls_html(period) -> str:
    """Download the draw page for ``period`` and return it as decoded text."""
    url = 'http://kaijiang.500.com/shtml/pls/%05d.shtml' % period
    req = request.Request(url)
    # Browser-like headers: per the original author, the site returns an
    # incorrect result without them.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/79.0.3945.117 Safari/537.36')
    req.add_header('Accept',
                   'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                   'application/signed-exchange;v=b3;q=0.9')
    req.add_header('Accept-Encoding', 'gzip')
    req.add_header('Accept-Language', 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7')
    req.add_header('Cache-Control', 'max-age=0')
    req.add_header('Host', 'kaijiang.500.com')
    req.add_header('Referer', 'http://kaijiang.500.com/shtml/pls/04001.shtml')
    req.add_header('Upgrade-Insecure-Requests', 1)
    req.add_header('Cookie', 'ck_regchanel=baidu; regfrom=0%7Cala%7Cbaidu; sdc_session=1579657460948; ')
    # Close the connection as soon as the body has been read.
    with request.urlopen(req) as response:
        raw_body = response.read()
    # The body is requested as gzip; decompress to bytes, then decode to text.
    return gzip.decompress(raw_body).decode('utf-8', errors='ignore')


def _parse_pls_html(html_content, period) -> dict:
    """Extract the period label, dates and three digits from the page HTML."""
    soup = BeautifulSoup(html_content, 'html.parser')
    ball_tags = soup.find_all('li', attrs={'class': 'ball_orange'})
    period_tags = soup.find_all('font', attrs={'class': 'cfont2'})
    date_tag = soup.find('span', attrs={"class": "span_right"})

    # Fall back to the requested period when the page carries no period label.
    qi_shu = period_tags[0].get_text() if period_tags else period

    # NOTE(review): this keeps the first two digit runs of the date text. If
    # the page prints dates with separators these are fragments, not whole
    # dates - confirm against the live page.
    date_parts = re.findall(r'\d+', date_tag.get_text()) if date_tag is not None else []
    start_date = date_parts[0] if len(date_parts) > 0 else ''
    end_date = date_parts[1] if len(date_parts) > 1 else ''

    # Pad with empty strings so a page with fewer than three balls cannot crash.
    digits = [tag.get_text() for tag in ball_tags[:3]]
    digits += [''] * (3 - len(digits))
    pls_1, pls_2, pls_3 = digits

    return {'period': qi_shu, 'num_1': pls_1, 'num_2': pls_2, 'num_3': pls_3,
            'all': pls_1 + pls_2 + pls_3, 'open_date': start_date, 'end_date': end_date}


# Script overview: fetch the Pailie 3 (排列三) lottery winning numbers for a
# range of draw periods and save them to an Excel file.
# Libraries used:
#   BeautifulSoup: HTML parsing, extracts the needed data from the DOM
#   xlwt: Excel output
#   urllib.request: sends the HTTP request and receives the response
#   re: regular expressions, extracts the useful data from strings
#   gzip: decompresses gzip-compressed data
# Do not crawl too many periods in one run, otherwise it will be very slow.
if __name__ == '__main__':
    # First period to crawl (inclusive).
    first_period = 4001
    # Period at which crawling stops (excluded by range); 20021 was a
    # previously used alternative value.
    stop_period = 4020

    # Fetch every draw before any spreadsheet work starts.
    draws = [get_pls(period_no) for period_no in range(first_period, stop_period)]

    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('排列三')

    # Header row.
    titles = ['期数', '第1位', '第2位', '第3位', '全部', '开奖日期', '兑奖截止日期']
    for column, title in enumerate(titles):
        worksheet.write(0, column, title)

    # One row per draw, starting below the header; cells follow the dict's
    # own value order.
    for row_index, draw in enumerate(draws, start=1):
        for column, cell in enumerate(draw.values()):
            worksheet.write(row_index, column, cell)

    workbook.save('排列三.xls')
# Follow our WeChat official account and grow together with us.