Preface: today we'll walk through a nationwide food (POI) crawler for Amap (高德地图).
Anti-crawl points:
Amap's anti-crawling mainly comes down to:
1. IP proxies (the crawler rotates through a proxy pool).
2. Headers (Referer and Cookie are required). Referer: any single fixed value works and never needs to change. Cookie: the required parameters are isg, l, and cna (each can be an arbitrary mix of digits and upper/lowercase letters you generate yourself; see the sketch after this list) plus _uab_collina (a fixed value).
3. Results are capped at 45 pages (20 POIs per page, so at most about 900 per query). To pull as much data as possible, narrow each query down to the district level and to a second-level food category (e.g. hotpot), which keeps individual result sets under the page cap.
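Since isg, l, and cna only need to look plausible rather than come from a live session, they can simply be generated. A minimal sketch (the string lengths are my own guesses, not values verified against Amap's checks; _uab_collina stays a fixed captured value, and the proxy address is a placeholder):

import random
import string

def fake_cookie_value(length):
    # Random mix of digits and upper/lowercase letters, per point 2 above.
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Lengths here are illustrative; _uab_collina is kept as a fixed placeholder
# because the post says it must be a fixed value.
cookie = 'cna={}; _uab_collina={}; isg={}; l={}'.format(
    fake_cookie_value(23), 'XXXXXXXX', fake_cookie_value(32), fake_cookie_value(24))

# Shape of the proxies mapping the crawler later passes as Config.proxies
# (the address is a placeholder, not a working proxy).
proxies = {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}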
import requests, random, string, time, pymongo, re, json, datetime, logging
from Config import Config
from urllib import parse
logging.basicConfig(filename="show.log",filemode="a",format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",level=logging.INFO)class Amap(object):def __init__(self):self.isg = 'XXXX'self.l = 'XXX'self.cna = 'XXXX' def get_pro(self):get_pro_list = self.post_city.find({})for get_pro in get_pro_list[9:]:print('begin......{}'.format(get_pro['pro_name']))pro_name = get_pro['pro_name']for every_city in get_pro['city_list']:choose_city = every_citycity_name = choose_city['city_name']print('begin city ....{}'.format(city_name))city_adcode = choose_city['city_adcode']# 1获取城市所有区及美食二级分类show_url = 'https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=17&city={}&geoobj=121.9098|25.510585|111.923414|24.516816&_src=around&keywords=美食'.format(city_adcode)headers = self.get_headers(city_adcode)show_json = self.request_url(show_url,headers)# print('11111',show_json)if show_json:# 区分类area_list = []if 'bizAreaData' in show_json:districts = show_json['bizAreaData']['districts']for k in districts:area_dict = {}area_dict['area_name'] = k['name']area_dict['area_value'] = k['districts']area_list.append(area_dict)self.deal_areas(pro_name, city_name, city_adcode, area_list)else:print('该市并未有区......')area_list = []self.deal_areas(pro_name, city_name, city_adcode, area_list)else:print('{} 未获取到 json数据.......'.format(city_name))def deal_areas(self,pro_name,city_name,city_adcode,area_list):classify_list = Config.classify_listif len(area_list) > 0:for j in area_list:area_name = j['area_name']area_site_list = j['area_value']for k in area_site_list:if re.search('全部',k['name']):continueelse:area_site_adcode = k['adcode']area_site_name = k['name']for m in classify_list:classify_name = m['classify_name']classify_value = m['classify_value']print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name,classify_name))self.deal_information(pro_name,city_name,city_adcode,area_name,area_site_adcode,area_site_name,classify_name,classify_value)else:print('该市分区为0..........')area_name = ''area_site_adcode = ''area_site_name = ''classify_list2 = Config.classify_list2for m in classify_list2:classify_name = m['classify_name']second_classify_list = m['second_list']if len(second_classify_list) > 0:for l in second_classify_list:print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name,classify_name))self.deal_other_information(pro_name, city_name, city_adcode, area_name, area_site_adcode, area_site_name,classify_name,l)else:print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name,classify_name))self.deal_other_information(pro_name, city_name, city_adcode, area_name, area_site_adcode,area_site_name,classify_name, '')def deal_other_information(self,pro_name, city_name, city_adcode, area_name, area_site_adcode, area_site_name,classify_name,second_classify_str):if second_classify_str:second_un = parse.quote(second_classify_str)else:second_un = parse.quote(classify_name)geoobj = parse.quote('')i = 1a = 0while True:url = 'https://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum={}&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=14.18&city={}&geoobj={}&keywords={}'.format(str(i),city_adcode,geoobj,second_un)headers = self.get_headers(city_adcode)resp_json = self.request_url(url,headers)if resp_json:shop_total = 
int(resp_json['data']['total'])print('总共{}个店铺'.format(resp_json['data']['total']))if 'poi_list' in resp_json['data']:now_num = len(resp_json['data']['poi_list'])a += now_numelse:breakprint('当前已爬取{}个店铺'.format(a))if shop_total > 0:for j in resp_json['data']['poi_list']:shop_id = j['id']shop_name = j['name']# print(shop_name)shop_address = j['address']# print(shop_address)shop_tel = j['tel']shop_latitude = j['latitude']shop_longitude = j['longitude']if 'value' in j['domain_list'][8]:second_classify = j['domain_list'][8]['value']else:second_classify = ''self.save_info(pro_name, city_name, area_name, area_site_name, classify_name, shop_id, shop_name,shop_address, shop_tel, shop_latitude, shop_longitude, second_classify)else:print('shop num is none.......')breakelse:print('{}...{}..{}.get shop type information failed'.format(city_name, area_name, area_site_name))breaki += 1def deal_information(self,pro_name,city_name,city_adcode,area_name,area_site_adcode,area_site_name,classify_name,classify_value):geoobj = parse.quote('')classify_data = parse.quote('business_area_flag=1;adcode={};custom=business_area:{}+{}+sort_rule=5;reserved_keywords=true'.format(area_site_adcode,area_site_name,classify_value))user_loc = parse.quote('')need_params = 'city={}&geoobj={}&_src=around&classify_data={}&user_loc={}&keywords=%E7%BE%8E%E9%A3%9F'.format(city_adcode, geoobj, classify_data, user_loc)i = 1a = 0while True:need_url1 = 'https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum={}&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=17&'.format(str(i))every_url = need_url1 + need_paramsheaders = self.get_headers(city_adcode)resp_json = self.request_url(every_url,headers)# print('22222',resp_json)if resp_json:shop_total = int(resp_json['data']['total'])print('总共{}个店铺'.format(resp_json['data']['total']))if 'poi_list' in resp_json['data']:now_num = len(resp_json['data']['poi_list'])a += now_numelse:breakprint('当前已爬取{}个店铺'.format(a))if shop_total > 0:for j in resp_json['data']['poi_list']:shop_id = j['id']shop_name = j['name']shop_address = j['address']shop_tel = j['tel']shop_latitude = j['latitude']shop_longitude = j['longitude']if 'value' in j['domain_list'][8]:second_classify = j['domain_list'][8]['value']else:second_classify = ''self.save_info(pro_name,city_name,area_name,area_site_name,classify_name,shop_id,shop_name,shop_address,shop_tel,shop_latitude,shop_longitude,second_classify)else:print('shop num is none.......')breakelse:print('{}...{}..{}.get shop type information failed'.format(city_name,area_name,area_site_name))breaki += 1def get_headers(self,city_adcode):headers = {'Accept': '*/*','Accept-Encoding': 'gzip, deflate, br','Accept-Language': 'zh-CN,zh;q=0.9','Host': 'www.amap.com','X-Requested-With': 'XMLHttpRequest','User-Agent': random.choice(Config.pc_user_agent_list),'Referer': 'XXXXXXXXXXXXXX'.format(city_adcode),isg={}; l={}'.format(cna,# isg, l)'Cookie': 'cna={}; _uab_collina=XXXXXXXX;isg={}; l={}'.format(cna,isg,l)}return headersdef request_url(self,url,headers):i = 0while i <=5:if i == 5:print('retry five times {}'.format(url))logging.info("get url five times failed %s" % url)return {}try:resp = requests.get(url, headers=headers, proxies=Config.proxies, verify=False,timeout=2)# print(resp.text)resp_json = json.loads(resp.text)if 'status' in resp_json:return resp_jsonelse:print('被反爬啦,重新尝试了....{}次'.format(str(i)))i +=1continueexcept Exception as f:print('get json data failed {}'.format(str(f)))i += 1
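The code above calls self.save_info, but the original post stops before defining it. A minimal sketch of what it might look like, assuming results are upserted into a second MongoDB collection; self.post_shop, the document layout, and the upsert key are my assumptions, not from the source:

    def save_info(self, pro_name, city_name, area_name, area_site_name,
                  classify_name, shop_id, shop_name, shop_address, shop_tel,
                  shop_latitude, shop_longitude, second_classify):
        # Hypothetical sink: upsert by shop_id so re-runs do not insert duplicates.
        doc = {
            'pro_name': pro_name, 'city_name': city_name, 'area_name': area_name,
            'area_site_name': area_site_name, 'classify_name': classify_name,
            'shop_id': shop_id, 'shop_name': shop_name, 'shop_address': shop_address,
            'shop_tel': shop_tel, 'shop_latitude': shop_latitude,
            'shop_longitude': shop_longitude, 'second_classify': second_classify,
            'crawl_time': datetime.datetime.now(),
        }
        self.post_shop.update_one({'shop_id': shop_id}, {'$set': doc}, upsert=True)

With something like that in place, the whole crawl is driven by:

if __name__ == '__main__':
    Amap().get_pro()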