一、需求
爬取网址:https://www.gushiwen.org/(该域名已跳转至 gushiwen.cn,代码中实际请求 https://so.gushiwen.cn/shiwen/)
需求:
(1)获取侧边栏【类型】信息;
(2)获取每个类型中古诗文详情页信息;
(3)提取详情页数据:古诗文名、作者、朝代、类型、内容、译文及注释;
(4)将数据保存到 csv 文件;
二、代码实现
import csv

import requests
from lxml import etree

# Listing page of all categories, and the base used to resolve relative hrefs.
start_url = "https://so.gushiwen.cn/shiwen/"
base_url = "https://so.gushiwen.cn"

# Browser-like User-Agent so the site serves the normal HTML page.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}
items = []


def parse_url(url):
    """Fetch *url* with the shared headers and return the body as UTF-8 text."""
    response = requests.get(url=url, headers=headers)
    return response.content.decode("utf-8")


def parse_html(html):
    """Parse an HTML string and return an lxml element-tree object for xpath."""
    return etree.HTML(html)


def get_first_type():
    """Collect the first-level category links from the sidebar.

    Returns:
        list[dict]: one ``{"name": ..., "url": ...}`` per category, where
        ``url`` is a site-relative href.
    """
    first_type_list = []
    etree_obj = parse_html(parse_url(start_url))
    # Category links all point into /gushi/ or /wenyan/.
    selector = '(//a[contains(@href,"/gushi/")]|//a[contains(@href,"/wenyan/")])'
    names = etree_obj.xpath(selector + "/text()")
    urls = etree_obj.xpath(selector + "/@href")
    for name, url in zip(names, urls):
        first_type_list.append({"name": name, "url": url})
    return first_type_list


def get_data(first_type):
    """Crawl every poem under one first-level category and save each record.

    Args:
        first_type: dict with ``name`` and relative ``url`` of the category.
    """
    url = base_url + first_type["url"]
    first_type_name = first_type["name"]
    etree_obj = parse_html(parse_url(url))
    # Each <div class="typecont"> is one second-level category block.
    for div in etree_obj.xpath('//div[@class="typecont"]'):
        strongs = div.xpath(".//strong/text()")
        # Some categories have no second-level name.
        second_type_name = strongs[0] if strongs else ""
        poetry_names = div.xpath(".//span/a/text()")
        poetry_urls = div.xpath(".//span/a/@href")
        for poetry_name, poetry_href in zip(poetry_names, poetry_urls):
            # One item per poem detail page.
            item = {
                "first_type_name": first_type_name,
                "second_type_name": second_type_name,
                "poetry_name": poetry_name,
            }
            detail_obj = parse_html(parse_url(base_url + poetry_href))
            # Author/dynasty line lives in the first <p class="source">.
            author_parts = detail_obj.xpath('//p[@class="source"]')[0].xpath(".//text()")
            item["poetry_author"] = "".join(author_parts).strip()
            # BUG FIX: the content div's id is "contson" + per-poem hash, so a
            # hard-coded id (contson45c396367f59) only matched one poem; match
            # the id prefix instead so every detail page yields its content.
            content_divs = detail_obj.xpath('//div[starts-with(@id,"contson")]')
            if content_divs:
                item["poetry_content"] = "".join(content_divs[0].xpath(".//text()")).strip()
            else:
                item["poetry_content"] = ""
            # Translation/annotation block is optional on the site.
            explain_divs = detail_obj.xpath('//div[@class="contyishang"]')
            if explain_divs:
                item["poetry_explain"] = "".join(explain_divs[0].xpath(".//text()")).strip()
            else:
                item["poetry_explain"] = ""
            print(item)
            save(item)


def save(item):
    """Append one poem record as a CSV row.

    Args:
        item: dict whose values are written in insertion order.
    """
    # newline="" stops the csv module from emitting blank lines on Windows.
    with open("./古诗词.csv", "a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(item.values())


def start():
    """Entry point: iterate every first-level category and crawl it."""
    for first_type in get_first_type():
        get_data(first_type)


if __name__ == '__main__':
    start()
三、运行结果
保存数据: