Python实现小红书app版爬虫

简介：由于数据需求的日益增大，小红书网页版已经不能满足我们日常工作的需求，为此，小编特地开发了小红书手机版算法，方便大家获取更多的数据，提升工作效率。

手机版接口主要包括：搜索、详情、话题、评论、主页、用户信息、用户收藏、用户喜欢、发现页等。

搜索页

评论页

code：

签名获取

    def get_shield_value(self, note_id, xhs_api_url, xy_common_params, method='GET', apibody=''):
        """Request the ``shield`` signature for an API call from the signing service.

        :param note_id: Note id; when truthy it is sent as the leading
            ``noteid`` field of the GET-style signing body.
        :param xhs_api_url: Full URL of the API request that is being signed.
        :param xy_common_params: Value of the ``xy-common-params`` header.
        :param method: ``'GET'`` selects ``self.get_shield_url``; any other
            value selects ``self.post_shield_url``.
        :param apibody: Body of the API request; only used on the non-GET path.
        :return: The signing service's response text with surrounding
            whitespace stripped.
        :raises requests.HTTPError: if the signing service answers with an
            error status.
        """
        if method == 'GET':
            # The noteid field is optional; the remaining fields are identical
            # in both variants.
            note_part = f'noteid={note_id}&' if note_id else ''
            body = (
                f'{note_part}param={quote(xy_common_params)}'
                f'&device={self.device_id}'
                f'&hmac={quote(self.hmac)}'
                f'&url={quote(xhs_api_url)}'
                f'&direction=48'
            )
            response = requests.post(self.get_shield_url, data=body, headers=self.headers, timeout=5)
            # Without this check an error page would be returned and used as
            # the shield value.
            response.raise_for_status()
            return response.text.strip()

        url = (
            f'{self.post_shield_url}?url={quote(xhs_api_url)}'
            f'&param={quote(xy_common_params)}'
            f'&direction=40'
            f'&body={quote(apibody)}'
            f'&hmac={quote(self.hmac)}'
            f'&device={self.device_id}'
        )
        headers = {
            'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
            'Content-Type': 'application/json',
        }
        # Same timeout as the GET path so a stalled signing service cannot
        # block the caller indefinitely.
        response = requests.post(url, headers=headers, timeout=5)
        response.raise_for_status()
        return response.text.strip()

headers生成

    def generate_post_headers(self, note_id, xhs_api_url, custom_headers, session_id, api='', method='GET', apibody=''):
        """Build the signed headers for one API request.

        A deep copy of ``custom_headers`` is made, the session id is written
        into it, and the ``shield`` value returned by
        ``self.get_shield_value`` is added. The input mapping is not modified.

        :param note_id: Forwarded to ``get_shield_value``.
        :param xhs_api_url: Full URL of the request being signed.
        :param custom_headers: Header template; must contain
            ``'xy-common-params'`` when ``session_id`` is given.
        :param session_id: Session id to insert. When ``None`` the session
            values already present in the template are left unchanged rather
            than being overwritten with the literal text ``session.None``.
        :param api: Unused; kept so existing callers keep working.
        :param method: ``'POST'`` additionally sets ``xy-direction`` to ``'40'``.
        :param apibody: Request body, forwarded to ``get_shield_value``.
        :return: A new header dict including the ``shield`` entry.
        """
        post_headers = copy.deepcopy(custom_headers)

        if session_id is not None:
            session_token = f'session.{session_id}'
            post_headers['x-legacy-sid'] = session_token
            # A function replacement inserts the token verbatim; a string
            # replacement would interpret backslashes as escape sequences.
            post_headers['xy-common-params'] = re.sub(
                r'session\.\d+',
                lambda _match: session_token,
                post_headers['xy-common-params'],
            )

        shield = self.get_shield_value(
            note_id,
            xhs_api_url,
            post_headers['xy-common-params'],
            method,
            apibody,
        )
        post_headers['shield'] = shield
        logger.info(shield)

        if method == 'POST':
            post_headers['xy-direction'] = '40'
        return post_headers

请求

    def spider_search(self, keyword, page='1', page_pos='0', sort='general', note_type='不限', publish_time='不限', search_type='不限', session_id=None):
        """Run a note search and return the decoded JSON response.

        :param keyword: Search keyword.
        :param page: Page number, sent as the ``page`` query parameter.
        :param page_pos: Sent as the ``page_pos`` query parameter.
        :param sort: Sort order: ``general`` (comprehensive),
            ``time_descending`` (newest), ``popularity_descending`` (most
            liked), ``comment_descending`` (most commented),
            ``collect_descending`` (most collected).
        :param note_type: Note type filter: ``不限`` (any), ``视频笔记``
            (video notes), ``普通笔记`` (normal notes).
        :param publish_time: Publish time filter: ``不限`` (any), ``一天内``
            (within a day), ``一周内`` (within a week), ``半年内`` (within
            half a year).
        :param search_type: Search scope filter: ``不限`` (any), ``已看过``
            (already viewed), ``未看过`` (not yet viewed).
        :param session_id: Forwarded to ``generate_post_headers``.
        :return: The parsed JSON body of the search response.
        """
        endpoint = "https://edith.xiaohongshu.com/api/sns/v10/search/notes"

        # (filter type, selected tag) pairs, in the order they are serialised.
        filter_table = (
            ('sort_type', sort),
            ('filter_note_type', note_type),
            ('filter_note_time', publish_time),
            ('filter_note_range', search_type),
        )
        filters_json = json.dumps(
            [{'tags': [tag], 'type': filter_type} for filter_type, tag in filter_table],
            ensure_ascii=False,
            separators=(',', ':'),
        )

        # Key order determines the encoded URL, which is the URL that gets
        # signed, so the order is kept fixed.
        query = {
            "keyword": keyword,
            "filters": filters_json,
            "sort": "",
            "page": page,
            "page_size": "20",
            "source": "explore_feed",
            "search_id": "2ehsgm5x5z2etryfwa5ts",
            "session_id": "2ehsglrpf9h3h4y091csg",
            "api_extra": "",
            "page_pos": page_pos,
            "pin_note_id": "",
            "allow_rewrite": "1",
            "geo": "",
            "loaded_ad": "",
            "query_extra_info": "",
            "rec_extra_params": "",
            "preview_ad": "",
            "scene": "",
            "is_optimize": "0",
            "location_permission": "0",
            "is_out_of_china": "0",
            "device_level": "4",
            "refresh_type": "0",
            "in_map_card_exp": "0",
            "search_tab": "",
        }
        xhs_api_url = endpoint + '?' + urllib.parse.urlencode(query)

        signed_headers = self.generate_post_headers(
            '',
            xhs_api_url,
            self.custom_headers,
            session_id,
            api=xhs_api_url,
        )
        reply = requests.get(xhs_api_url, headers=signed_headers, timeout=5)
        payload = reply.json()
        logger.info(f'请求数据: {json.dumps(payload, ensure_ascii=False)}')
        return payload

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.rhkb.cn/news/39644.html

如若内容造成侵权/违法违规/事实不符，请联系长河编程网进行投诉反馈email:809451989@qq.com，一经查实，立即删除！