Introduction
This post fixes the problems in the previous article. That version used selenium to crawl the pages, which was fairly slow; this time I fetch the pages directly with the requests library and make the program more robust.
Approach
The page analysis was already covered in the previous article, so I won't repeat it here; please refer to that post.
Previous article link
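The only real change from the selenium version is how the pages are fetched: the city index page is static HTML, so it can be pulled directly with requests, decoded as GBK, and handed to lxml. A minimal sketch of that idea (just the fetch and the city-table XPath; the full spider below wraps the same steps in a class):

import requests
from lxml import etree

url = 'https://www.fang.com/SoufunFamily.htm'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
response = requests.get(url, headers=headers)
response.encoding = 'gbk'  # the page is served as GBK, so set the decoding before parsing
html = etree.HTML(response.text)
# every city link sits inside the table with id="senfe"
for a in html.xpath('//table[@id="senfe"]//td//a'):
    print("".join(a.xpath('text()')), "".join(a.xpath('@href')))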
Code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2020/2/8 9:08
# @Author: Martin
# @File: fang.py
# @Software: PyCharm
import requests
import re
import pymongo
from lxml import etree


class FangSpider(object):
    def __init__(self):
        self.start_url = 'https://www.fang.com/SoufunFamily.htm'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
        }
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.db = self.client['fangtianxia']

    def run(self):
        # Fetch the city index page; the site serves GBK, not UTF-8.
        try:
            response = requests.get(self.start_url, headers=self.headers)
            response.encoding = 'gbk'
            self.parse_page(response.text)
        except Exception as e:
            print(e)

    def parse_page(self, text):
        # Collect (city_name, city_url) pairs from the city table; stop at the overseas section.
        html = etree.HTML(text)
        a_list = html.xpath('//table[@id="senfe"]//td//a')
        china_house = []
        for a in a_list:
            city_url = "".join(a.xpath('@href'))
            city_name = "".join(a.xpath('text()'))
            if city_name == '海外':
                break
            china_house.append((city_name, city_url))
        self.parse_page_url(china_house)

    def parse_page_url(self, china_house):
        for city in china_house:
            (city_name, city_url) = city
            # Build the new-house listing URL for each city; Beijing is a special case.
            new_house_url = 'http://' + city_url.split("//")[-1].split(".")[0] + '.newhouse.fang.com/house/s/'
            if city_name == '北京':
                new_house_url = 'http://newhouse.fang.com/house/s'
            html = self.parse_detail_page(new_house_url, city_name)
            try:
                # Read the "last page" link so we know where pagination ends.
                end = "".join(html.xpath('//div[@class="page"]/ul/li[last()]/a[@class="last"]/@href')).strip()
                end_url = new_house_url + '/' + end.split('/')[-2]
            except Exception:
                print('End page number not found!')
                continue
            i = 2
            while True:
                # Pages 2, 3, ... are reached by appending /b9<page> to the listing URL.
                next_url = new_house_url + "/b9" + str(i)
                i += 1
                self.parse_detail_page(next_url, city_name)
                if next_url == end_url:
                    break

    def parse_detail_page(self, url, city_name):
        try:
            r = requests.get(url, headers=self.headers)
        except Exception as e:
            print(e)
            return ""
        r.encoding = 'gbk'
        html = etree.HTML(r.text)
        li_list = html.xpath('//div[@id="newhouse_loupai_list"]//ul//li')
        for li in li_list:
            # Extract the fields of one listing and strip whitespace noise.
            name = "".join(li.xpath('.//div[@class="nlcd_name"]/a/text()')).strip()
            origin_url = "http://" + "".join(li.xpath('.//div[@class="nlcd_name"]/a/@href')).strip()
            house_type = "".join(li.xpath('.//div[contains(@class,"house_type")]//text()'))
            house_type = re.sub(r'\s', "", house_type)
            address = "".join(li.xpath('.//div[@class="address"]/a/@title')).strip()
            price = "".join(li.xpath('.//div[@class="nhouse_price"]//text()'))
            price = re.sub(r'\s', "", price)
            sale = "".join(li.xpath('.//div[@class="fangyuan"]/span/text()'))
            label = "".join(li.xpath('.//div[@class="fangyuan"]//a//text()'))
            house = {
                'city_name': city_name,
                'name': name,
                'house_type': house_type,
                'address': address,
                'price': price,
                'sale': sale,
                'label': label,
                'origin_url': origin_url
            }
            print(house)
            self.save(house)
        return html

    def save(self, house):
        self.db.fangtianxia.insert_one(house)

    def close(self):
        self.client.close()


if __name__ == '__main__':
    spider = FangSpider()
    spider.run()
    spider.close()
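After a run, every listing ends up as one document in the fangtianxia database. A quick way to spot-check what was written (a small sketch using pymongo; the database, collection, and field names are the ones used by the spider above):

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
db = client['fangtianxia']
# How many listings were saved, and what one record looks like.
print(db.fangtianxia.count_documents({}))
print(db.fangtianxia.find_one({'city_name': '北京'}))
client.close()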