前置知识
1/3乐观锁
2/3 Scrapy流程(非全部)
3/3 关于付费代理
我用的"快代理", 1000个ip, 每个ip1min的有效期, 你用的时候, 把你的链接, 用户名填上去就行
设置代理IP 🔒
- & 帮助文档: ①meta ②meta#proxy
- $ 语法: ①proxy的设置: Request对象中有个meta属性(是个字典), 其中有个proxy键,
request.meta['proxy']="http://2.2.3.3:4324"
②download_timeout属性
- ! 卡点: Scrapy的异步是, 将所有的请求(近乎)同一时刻发出, 所以需要一个"版本锁"/"乐观锁"的思想(类似Redis的锁)
1.不可重用, 同步的思路
# Version_1: 因为异步延迟问题, 导致的ip复用失败
class IpDownloaderMiddleware:
    """Version 1: naive single-proxy downloader middleware.

    Didactic example: one shared ``current_proxy_ip`` is stamped onto every
    request.  Because Scrapy issues (nearly) all requests at the same moment,
    many in-flight requests can fail on the same dead proxy and each failure
    fetches a fresh proxy — this is the "reuse failure" the note describes.
    """

    def __init__(self):
        # self.current_proxy_ip = self.get_proxy_ip()  # fetch a live proxy at startup
        self.current_proxy_ip = "http://3.3.3.3:3333"  # deliberately dead, to simulate an expired proxy

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy hook: build the middleware instance.
        return cls()

    def process_request(self, request: Request, spider):
        # Stamp the shared proxy onto the outgoing request.
        request.meta['proxy'] = self.current_proxy_ip
        request.meta['download_timeout'] = 1  # fail fast: give up after 1 second
        return None  # continue down the middleware chain

    def process_response(self, request: Request, response: Response, spider):
        # Success: hand the response through unchanged.
        if response.status == 200:
            return response
        # Failure: swap in a fresh proxy and reschedule the request.
        print(f"请求失败: {request.url}, 状态码: {response.status}")
        self.current_proxy_ip = self.get_proxy_ip()
        request.meta['proxy'] = self.current_proxy_ip
        return request

    def process_exception(self, request, exception, spider):
        # Network-level error (e.g. timeout): refresh the proxy and retry.
        print(f"请求 {request.url} 发生异常: {exception}")
        self.current_proxy_ip = self.get_proxy_ip()
        request.meta['proxy'] = self.current_proxy_ip
        return request

    def get_proxy_ip(self):
        """Fetch one proxy from the paid API and build an authenticated URL."""
        api_url = "换成你的付费ip代理地址"
        proxy_ip = requests.get(api_url).text
        username = "你的账号"
        password = "你的密码"
        return f"http://{username}:{password}@{proxy_ip}/"
2. 可重用单ip版本
# Version_2: 采用类似"版本锁"的思想, 构建复用单ip
# 大体思路: 一个ip就是一个版本, ①版本相同+报错=ip失效(需要获取新ip) ②版本不同+报错=ip未及时更新(更新版本即可,无需获取新ip)
class IpDownloaderMiddleware_V2:
    """Version 2: reusable single proxy guarded by an optimistic "version lock".

    The proxy string itself is the version.  When a request fails, compare the
    proxy it was sent with against ``self.current_proxy_ip``:
      * equal   -> the shared proxy really died; fetch a fresh one;
      * differs -> another failure already refreshed it; just re-stamp.

    Fixes over the original: ``retry_times`` is now actually incremented and
    ``max_retry_times`` enforced (the original retried a dead URL forever),
    and re-issued requests set ``dont_filter`` so the dupefilter cannot
    silently drop them.
    """

    def __init__(self):
        # self.current_proxy_ip = self.get_proxy_ip()  # fetch a live proxy at startup
        self.current_proxy_ip = "http://3.3.3.3:8923"  # deliberately dead, to simulate an expired proxy

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy hook: build the middleware instance.
        return cls()

    def process_request(self, request: Request, spider):
        request.meta['proxy'] = self.current_proxy_ip
        request.meta['download_timeout'] = 1  # fail fast: 1-second timeout
        request.meta['max_retry_times'] = 3  # allow up to 3 attempts (original comment wrongly said 0)
        # First attempt: initialise the counter so the progress print works.
        if "retry_times" not in request.meta:
            request.meta['retry_times'] = 1
        print(f"url:{request.url}, {request.meta['retry_times']}/{request.meta['max_retry_times']}")
        return None

    def process_response(self, request: Request, response: Response, spider):
        if response.status != 200:
            retry = self._retry(request)
            if retry is not None:
                return retry  # reschedule with a (possibly new) proxy
        # Success, or retries exhausted: pass the response through.
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, twisted.internet.error.TimeoutError):
            # None when retries are exhausted -> let Scrapy fail the request.
            return self._retry(request)
        return None

    def _retry(self, request):
        """Prepare *request* for one more attempt, or give up.

        Returns the request (counter bumped, proxy checked) while attempts
        remain, else None.  Bug fix: the original never incremented
        ``retry_times`` nor honoured ``max_retry_times``.
        """
        retries = request.meta.get('retry_times', 1)
        if retries >= request.meta.get('max_retry_times', 3):
            return None
        request.meta['retry_times'] = retries + 1
        request.dont_filter = True  # bug fix: dupefilter would drop the re-issued request
        self.check_version(request)
        return request

    def check_version(self, request):
        # case 1: versions match -> the shared proxy itself is dead; get a new one.
        if request.meta['proxy'] == self.current_proxy_ip:
            self.current_proxy_ip = self.get_proxy_ip()
            request.meta['proxy'] = self.current_proxy_ip
            print("获取新ip成功!!!")
        # case 2: versions differ -> proxy was already refreshed elsewhere; just re-stamp.
        else:
            print("ip未及时更新,已正确赋值新ip")
            request.meta['proxy'] = self.current_proxy_ip

    def get_proxy_ip(self):
        """Fetch one proxy from the paid API and build an authenticated URL."""
        api_url = "换成你的付费ip代理地址"
        proxy_ip = requests.get(api_url).text
        username = "你的账号"
        password = "你的密码"
        return f"http://{username}:{password}@{proxy_ip}/"
3. 可复用ip代理池
# Version_3: 同Version_2的思想, 构建ip池
class IpDownloaderMiddleware_V3:
    """Version 3: the same optimistic "version lock" idea as V2, over a pool.

    A failed proxy still present in the pool means the failure is fresh:
    replace that entry with a newly fetched proxy.  A failed proxy absent
    from the pool was already replaced by an earlier failure: just pick a
    current pool member.

    Fixes over the original (same as V2): ``retry_times`` is incremented and
    ``max_retry_times`` enforced, and retried requests set ``dont_filter``.
    """

    def __init__(self):
        self.pool_size = 5
        self.proxy_ip_pool = [f"http://3.3.3.3:333{i}" for i in range(self.pool_size)]  # deliberately dead, simulated proxies
        # self.proxy_ip_pool = [self.get_proxy_ip() for _ in range(self.pool_size)]  # real initialisation

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy hook: build the middleware instance.
        return cls()

    def process_request(self, request: Request, spider):
        # Spread load by picking a random pool member per request.
        request.meta['proxy'] = random.choice(self.proxy_ip_pool)
        request.meta['download_timeout'] = 1  # fail fast: 1-second timeout
        request.meta['max_retry_times'] = 3  # allow up to 3 attempts (original comment wrongly said 0)
        # First attempt: initialise the counter so the progress print works.
        if "retry_times" not in request.meta:
            request.meta['retry_times'] = 1
        print(f"url:{request.url}, {request.meta['retry_times']}/{request.meta['max_retry_times']}")
        return None

    def process_response(self, request: Request, response: Response, spider):
        if response.status != 200:
            retry = self._retry(request)
            if retry is not None:
                return retry  # reschedule with a (possibly new) proxy
        # Success, or retries exhausted: pass the response through.
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, twisted.internet.error.TimeoutError):
            # None when retries are exhausted -> let Scrapy fail the request.
            return self._retry(request)
        return None

    def _retry(self, request):
        """Prepare *request* for one more attempt, or give up.

        Returns the request (counter bumped, proxy checked) while attempts
        remain, else None.  Bug fix: the original never incremented
        ``retry_times`` nor honoured ``max_retry_times``.
        """
        retries = request.meta.get('retry_times', 1)
        if retries >= request.meta.get('max_retry_times', 3):
            return None
        request.meta['retry_times'] = retries + 1
        request.dont_filter = True  # bug fix: dupefilter would drop the re-issued request
        self.check_version(request)
        return request

    def check_version(self, request):
        # case 1: failed proxy still in the pool -> it really died; replace it.
        if request.meta['proxy'] in self.proxy_ip_pool:
            self.proxy_ip_pool.remove(request.meta['proxy'])  # drop the stale version
            self.proxy_ip_pool.append(self.get_proxy_ip())    # swap in a fresh proxy
            request.meta['proxy'] = random.choice(self.proxy_ip_pool)
            print("获取新ip成功!!!")
        # case 2: already replaced by an earlier failure -> just re-pick.
        else:
            request.meta['proxy'] = random.choice(self.proxy_ip_pool)
            print("ip未及时更新,已正确赋值新ip")
        print(f"当前代理池:{self.proxy_ip_pool}")

    def get_proxy_ip(self):
        """Fetch one proxy from the paid API and build an authenticated URL."""
        api_url = "换成你的付费ip代理地址"
        proxy_ip = requests.get(api_url).text
        username = "你的账号"
        password = "你的密码"
        return f"http://{username}:{password}@{proxy_ip}/"