目的
学习更多的python反爬虫策略
测试网址
http://credit.customs.gov.cn/ccppserver/verifyCode/creator
分析
01 下载gif图片
02 使用ddddocr逐帧识别
03 如果某个 4 位识别结果在各帧的识别结果中累计出现 3 次及以上,则认定其为正确的识别结果
经验证,识别成功率95%+
源码
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2023/12/26 11:26
# @Author : jia666666
# @FileName: 01 下载.py
import time
from PIL import Image
import ddddocr
import io
from collections import Counter

import requests

# Browser-like request headers sent with every captcha download.
headers = {
    "Host": "credit.customs.gov.cn",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate",
    "Upgrade-Insecure-Requests": "1",
    "Connection": "keep-alive",
}


def _recognize_frames(image, ocr, min_hits=3, code_length=4):
    """Run OCR on successive frames of *image* until one result repeats.

    Args:
        image: An opened PIL image; frames are visited with ``image.seek``.
        ocr: An object exposing ``classification(image)`` that returns text.
        min_hits: How many frames must yield the same text to accept it.
        code_length: Only results of exactly this length are counted.

    Returns:
        The first text of ``code_length`` characters recognised on
        ``min_hits`` frames, or ``''`` if the frames run out first.
    """
    # Count whole per-frame results; counting substrings of one concatenated
    # string could match text that straddles two neighbouring results.
    hits = Counter()
    frame = 0
    while True:
        try:
            image.seek(frame)
        except EOFError:
            # Seeking past the last frame: nothing reached the threshold.
            return ''
        text = ocr.classification(image)
        if len(text) == code_length:
            hits[text] += 1
            if hits[text] >= min_hits:
                return text
        frame += 1


def get_yzm():
    """Download captcha GIFs until one is recognised, and return its text.

    Reads the module-level ``savesign`` and ``savepath``: when ``savesign``
    is truthy the GIF is written to ``savepath`` and opened from there,
    otherwise it is decoded from memory.

    Returns:
        The 4-character text that was recognised on at least 3 frames of a
        downloaded GIF.

    Raises:
        requests.RequestException: If a download fails or times out.
    """
    # Build the recogniser once; constructing it per frame repeats the setup.
    ocr = ddddocr.DdddOcr()
    while True:
        # The millisecond timestamp makes every request URL unique.
        url = f"http://credit.customs.gov.cn/ccppserver/verifyCode/creator?{int(time.time() * 1000)}"
        response = requests.get(url, headers=headers, verify=False, timeout=10)
        if savesign:
            # Save to disk first, then recognise from the saved file.
            with open(savepath, 'wb') as file:
                file.write(response.content)
            source = savepath
        else:
            # Recognise straight from memory without touching the disk.
            source = io.BytesIO(response.content)
        # The context manager closes the image (and its file) after OCR.
        with Image.open(source) as image:
            endres = _recognize_frames(image, ocr)
        if endres:
            return endres
        # No result reached the threshold: fetch a fresh captcha and retry.
# Record when the run starts so the total duration can be reported.
start_time = time.time()

# Settings read by get_yzm():
#   savesign - whether the downloaded GIF is written to disk before OCR
#   savepath - where the GIF is written when savesign is enabled
savesign = True
savepath = r"yzm.gif"

# Recognise one captcha and print its text.
recognized = get_yzm()
print("gif识别结果", recognized)

# Report the elapsed wall-clock time in seconds.
elapsed = time.time() - start_time
print("用时", elapsed)