公众号爬虫，仅仅 7 步实现

文章目录

- 01 具体操作步骤
  - 1. 安装必要的库
  - 2. 导入库
  - 3. 准备中文字体
  - 4. 抓取网页内容
  - 5. 解析网页内容
  - 6. 生成PDF
  - 7. 主程序
- 02 完整案例代码

如果想将公众号的文章转成 PDF 文件的话，非常简单，实现方式如下：

01 具体操作步骤

1. 安装必要的库

首先，你需要安装一些库，就像在做一个大项目时，你需要准备工具一样：

pip install selenium beautifulsoup4 reportlab requests

2. 导入库

想象一下，你在做一个大项目，首先你需要把所有的工具都拿出来放好：

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
from reportlab.pdfgen import canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from io import BytesIO
import requests

3. 准备中文字体

就像在画画前准备颜料一样，这里我们准备中文字体：

# Register ReportLab's built-in 'STSong-Light' CID font so that Chinese text
# can be rendered. Registration is best-effort: if the import or the
# registration fails, a notice is printed and the script keeps running.
try:
    from reportlab.pdfbase import cidfonts

    pdfmetrics.registerFont(cidfonts.UnicodeCIDFont('STSong-Light'))
except Exception:
    # `Exception` rather than a bare `except:` so that Ctrl-C / SystemExit
    # are not swallowed here.
    print("无法加载中文字体，使用默认字体")

4. 抓取网页内容

这就像用一个机器人去网上抓取一个网页的内容：

def fetch_article_with_selenium(url, *, timeout=30, scroll_pause=2, max_scrolls=50):
    """Load *url* in headless Chrome and return the rendered page HTML.

    The page is scrolled to the bottom repeatedly so that content loaded on
    scroll gets a chance to appear, then the function waits (best-effort) for
    the page's images to finish loading.

    Args:
        url: Address of the article to open.
        timeout: Seconds to wait for the ``<body>`` element to be present.
        scroll_pause: Seconds to sleep after each scroll step.
        max_scrolls: Upper bound on scroll steps, so a page that keeps growing
            cannot loop forever.

    Returns:
        The page source as a string, or ``None`` if loading failed.
    """
    # Imported locally so this function does not depend on an extra
    # file-level import.
    from selenium.common.exceptions import TimeoutException

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run the browser without a window
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Scroll until the document height stops changing (or the cap is hit).
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Wait once for all images instead of once per image; a picture that
        # never finishes loading must not discard the whole page.
        try:
            WebDriverWait(driver, 10).until(
                lambda d: d.execute_script(
                    "return Array.from(document.images)"
                    ".every(function (img) { return img.complete; });"
                )
            )
        except TimeoutException:
            print("部分图片未能在限定时间内加载，继续处理")

        html_content = driver.page_source
        print("页面内容获取成功")
        return html_content
    except Exception as e:
        print(f"页面加载失败: {e}")
        return None
    finally:
        # Always shut the browser down, including when driver.get() fails.
        driver.quit()

5. 解析网页内容

就像从一大堆杂物中找出你需要的部分：

def parse_article_content(html):
    """Locate the element that holds the article body in *html*.

    Candidate containers are collected in priority order: ``<div>`` elements
    whose class contains ``rich_media_content`` or ``article-content``, then
    ``<div>`` elements whose class contains ``content`` or ``post-content``,
    then ``<article>`` elements. The first candidate that contains a
    paragraph or an ``h1``-``h3`` heading is returned. If none qualifies, the
    ``<body>`` element is returned instead.

    Args:
        html: HTML source of the page; may be ``None`` or empty.

    Returns:
        The matching BeautifulSoup element, or ``None`` when nothing usable
        is found.
    """
    # Guard against None/empty input instead of failing inside BeautifulSoup.
    if not html:
        print("未找到文章内容")
        return None

    soup = BeautifulSoup(html, 'html.parser')

    def class_contains(*needles):
        """Build a class_ filter matching any class value containing a needle."""
        return lambda value: bool(value) and any(n in value for n in needles)

    candidates = []
    candidates.extend(
        soup.find_all('div', class_=class_contains('rich_media_content', 'article-content'))
    )
    candidates.extend(
        soup.find_all('div', class_=class_contains('content', 'post-content'))
    )
    candidates.extend(soup.find_all('article'))

    for candidate in candidates:
        # Accept headings as well as paragraphs as evidence of article text.
        if candidate.find(['p', 'h1', 'h2', 'h3']):
            print("找到文章内容")
            return candidate

    body = soup.find('body')
    if body:
        print("尝试解析整个body")
        return body

    print("未找到文章内容")
    return None

6. 生成PDF

就像用一个打印机打印文档：

def _parse_dimension(value):
    """Return an HTML width/height attribute as a positive finite float, or None."""
    if value is None:
        return None
    text = str(value).strip().lower()
    if text.endswith("px"):
        text = text[:-2]
    try:
        number = float(text)
    except ValueError:
        return None
    # The chained comparison also rejects NaN and infinity.
    return number if 0 < number < float("inf") else None


def _image_flowable(element, max_width, max_height):
    """Download the image behind an <img> element and return a sized Image, or None."""
    # Assumption: lazy-loading pages keep the real address in `data-src`;
    # fall back to the plain `src` attribute otherwise.
    img_url = element.get('data-src') or element.get('src')
    if not img_url:
        return None
    if img_url.startswith('//'):
        img_url = 'https:' + img_url  # protocol-relative URL

    try:
        response = requests.get(img_url, timeout=15)
        response.raise_for_status()
        img = Image(BytesIO(response.content))

        # HTML attributes are strings; convert them and keep the aspect ratio
        # when only one of the two is given.
        width = _parse_dimension(element.get('width'))
        height = _parse_dimension(element.get('height'))
        natural_w, natural_h = img.imageWidth, img.imageHeight
        if width and not height:
            height = natural_h * width / natural_w
        elif height and not width:
            width = natural_w * height / natural_h
        elif not width and not height:
            width, height = natural_w, natural_h

        # Shrink (never enlarge) so the picture fits inside the page frame.
        scale = min(1.0, max_width / width, max_height / height)
        img.drawWidth = width * scale
        img.drawHeight = height * scale
        return img
    except Exception as e:
        # One broken picture must not abort the whole document.
        print(f"无法加载图片 {img_url}: {e}")
        return None


def text_to_pdf(div, output_filename):
    """Write the paragraphs, headings and images found under *div* to a PDF.

    All ``p``, ``h1``-``h3``, ``img`` and ``br`` elements below *div* are
    visited in document order (not only its direct children). Text is escaped
    before it is handed to ``Paragraph``, and images are downloaded and scaled
    down to fit the page. If nothing could be extracted, the PDF contains a
    single placeholder paragraph.

    Args:
        div: BeautifulSoup element holding the article content.
        output_filename: Path of the PDF file to create.
    """
    from xml.sax.saxutils import escape

    text_tags = ['p', 'h1', 'h2', 'h3']

    doc = SimpleDocTemplate(output_filename, pagesize=A4,
                            rightMargin=72, leftMargin=72,
                            topMargin=72, bottomMargin=18)
    styles = getSampleStyleSheet()

    # Only request the Chinese font if it was actually registered; otherwise
    # keep the stylesheet's own fonts so that building the PDF still works.
    cjk_font = 'STSong-Light'
    font_override = (
        {'fontName': cjk_font}
        if cjk_font in pdfmetrics.getRegisteredFontNames()
        else {}
    )
    heading_style = ParagraphStyle('Heading1', parent=styles['Heading1'],
                                   fontSize=16, leading=20, **font_override)
    normal_style = ParagraphStyle('Normal', parent=styles['Normal'],
                                  fontSize=12, leading=16, **font_override)

    # Leave room for the frame's padding (assumed to be ReportLab's default
    # of 6pt per side - TODO confirm if a custom frame is ever used).
    max_width = doc.width - 12
    max_height = doc.height - 12

    story = []
    for element in div.find_all(text_tags + ['img', 'br']):
        tag = element.name
        if tag == 'img':
            picture = _image_flowable(element, max_width, max_height)
            if picture is not None:
                story.append(picture)
                story.append(Spacer(1, 12))  # gap after an image
        elif tag == 'br':
            story.append(Spacer(1, 6))  # gap for a line break
        else:
            # A text block nested in another text block is already covered by
            # the outer block's get_text(); skip it to avoid duplicates.
            if element.find_parent(text_tags) is not None:
                continue
            text = element.get_text(strip=True)
            if not text:
                continue
            style = normal_style if tag == 'p' else heading_style
            # Paragraph parses its input as markup, so escape &, < and >.
            story.append(Paragraph(escape(text), style))
            story.append(Spacer(1, 12))  # gap after a paragraph/heading

    if not story:
        story.append(Paragraph("无法从页面获取内容。", normal_style))
    doc.build(story)

7. 主程序

就像一个指挥者，组织所有步骤：

def main():
    """Ask for an article URL, fetch and parse it, and save it as article.pdf.

    Each stage prints a message and stops the run if it produced nothing to
    work with.
    """
    # Strip stray whitespace from a pasted link and refuse empty input rather
    # than sending it to the browser.
    url = input("请输入文章的链接: ").strip()
    if not url:
        print("未输入文章链接。")
        return

    html_content = fetch_article_with_selenium(url)
    if not html_content:
        print("无法获取文章内容。")
        return

    article_div = parse_article_content(html_content)
    if article_div is None:
        print("无法解析文章内容。")
        return

    output_filename = "article.pdf"
    text_to_pdf(article_div, output_filename)
    print(f"文章已保存为PDF文件: {output_filename}")


if __name__ == "__main__":
    main()

02 完整案例代码

注意：某些经过第三方编辑平台格式化后的文章可能无法抓取，你可以在此基础上继续深入研究。

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
from reportlab.pdfgen import canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from io import BytesIO
import requests
import logging
from xml.sax.saxutils import escape

# Log timestamp, level and message for every record at INFO and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def _register_cjk_font(candidates=('STSong-Light', 'HeiseiKakuGo-W5')):
    """Register the first usable built-in CID font and return its name, or None."""
    for face in candidates:
        try:
            from reportlab.pdfbase import cidfonts

            pdfmetrics.registerFont(cidfonts.UnicodeCIDFont(face))
        except Exception:
            # Best-effort: try the next candidate.
            continue
        return face
    logging.warning("无法加载中文字体，使用默认字体")
    return None


# Name of the font that was actually registered (None if none could be), so
# the PDF styles use the fallback font too instead of a hard-coded name.
PDF_FONT_NAME = _register_cjk_font()


def fetch_article_with_selenium(url, *, timeout=30, scroll_pause=2, max_scrolls=50):
    """Load *url* in headless Chrome and return the rendered page HTML.

    The page is scrolled to the bottom repeatedly so that content loaded on
    scroll gets a chance to appear, then the function waits (best-effort) for
    the page's images to finish loading.

    Args:
        url: Address of the article to open.
        timeout: Seconds to wait for the ``<body>`` element to be present.
        scroll_pause: Seconds to sleep after each scroll step.
        max_scrolls: Upper bound on scroll steps, so a page that keeps growing
            cannot loop forever.

    Returns:
        The page source as a string, or ``None`` if loading failed.
    """
    # Imported locally so this function does not depend on an extra
    # file-level import.
    from selenium.common.exceptions import TimeoutException

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Scroll until the document height stops changing (or the cap is hit).
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(max_scrolls):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Wait once for all images instead of once per image; a picture that
        # never finishes loading must not discard the whole page.
        try:
            WebDriverWait(driver, 10).until(
                lambda d: d.execute_script(
                    "return Array.from(document.images)"
                    ".every(function (img) { return img.complete; });"
                )
            )
        except TimeoutException:
            logging.warning("部分图片未能在限定时间内加载，继续处理")

        html_content = driver.page_source
        logging.info("页面内容获取成功")
        return html_content
    except Exception as e:
        logging.error("页面加载失败: %s", e)
        return None
    finally:
        # Always shut the browser down, including when driver.get() fails.
        driver.quit()


def parse_article_content(html):
    """Locate the element that holds the article body in *html*.

    Candidate containers are collected in priority order: ``<div>`` elements
    whose class contains ``rich_media_content`` or ``article-content``, then
    ``<div>`` elements whose class contains ``content`` or ``post-content``,
    then ``<article>`` elements. The first candidate that contains a
    paragraph or an ``h1``-``h3`` heading is returned. If none qualifies, the
    ``<body>`` element is returned instead.

    Args:
        html: HTML source of the page; may be ``None`` or empty.

    Returns:
        The matching BeautifulSoup element, or ``None`` when nothing usable
        is found.
    """
    # Guard against None/empty input instead of failing inside BeautifulSoup.
    if not html:
        logging.warning("未找到文章内容")
        return None

    soup = BeautifulSoup(html, 'html.parser')

    def class_contains(*needles):
        """Build a class_ filter matching any class value containing a needle."""
        return lambda value: bool(value) and any(n in value for n in needles)

    # Try several possible page structures, most specific first.
    candidates = []
    candidates.extend(
        soup.find_all('div', class_=class_contains('rich_media_content', 'article-content'))
    )
    candidates.extend(
        soup.find_all('div', class_=class_contains('content', 'post-content'))
    )
    candidates.extend(soup.find_all('article'))

    for candidate in candidates:
        if candidate.find(['p', 'h1', 'h2', 'h3']):
            logging.info("找到文章内容")
            return candidate

    # Nothing matched: fall back to the whole body.
    body = soup.find('body')
    if body:
        logging.info("尝试解析整个body")
        return body

    logging.warning("未找到文章内容")
    return None


def _parse_dimension(value):
    """Return an HTML width/height attribute as a positive finite float, or None."""
    if value is None:
        return None
    text = str(value).strip().lower()
    if text.endswith("px"):
        text = text[:-2]
    try:
        number = float(text)
    except ValueError:
        return None
    # The chained comparison also rejects NaN and infinity.
    return number if 0 < number < float("inf") else None


def _image_flowable(element, max_width, max_height):
    """Download the image behind an <img> element and return a sized Image, or None."""
    # Assumption: lazy-loading pages keep the real address in `data-src`;
    # fall back to the plain `src` attribute otherwise.
    img_url = element.get('data-src') or element.get('src')
    if not img_url:
        return None
    if img_url.startswith('//'):
        img_url = 'https:' + img_url  # protocol-relative URL

    try:
        response = requests.get(img_url, timeout=15)
        response.raise_for_status()
        img = Image(BytesIO(response.content))

        # HTML attributes are strings; convert them and keep the aspect ratio
        # when only one of the two is given.
        width = _parse_dimension(element.get('width'))
        height = _parse_dimension(element.get('height'))
        natural_w, natural_h = img.imageWidth, img.imageHeight
        if width and not height:
            height = natural_h * width / natural_w
        elif height and not width:
            width = natural_w * height / natural_h
        elif not width and not height:
            width, height = natural_w, natural_h

        # Shrink (never enlarge) so the picture fits inside the page frame.
        scale = min(1.0, max_width / width, max_height / height)
        img.drawWidth = width * scale
        img.drawHeight = height * scale
        return img
    except Exception as e:
        # One broken picture must not abort the whole document.
        logging.error("无法加载图片 %s: %s", img_url, e)
        return None


def text_to_pdf(div, output_filename):
    """Write the paragraphs, headings and images found under *div* to a PDF.

    All ``p``, ``h1``-``h3``, ``img`` and ``br`` elements below *div* are
    visited in document order (not only its direct children). Text is escaped
    before it is handed to ``Paragraph``, and images are downloaded and scaled
    down to fit the page. If nothing could be extracted, the PDF contains a
    single placeholder paragraph.

    Args:
        div: BeautifulSoup element holding the article content.
        output_filename: Path of the PDF file to create.
    """
    text_tags = ['p', 'h1', 'h2', 'h3']

    doc = SimpleDocTemplate(output_filename, pagesize=A4,
                            rightMargin=72, leftMargin=72,
                            topMargin=72, bottomMargin=18)
    styles = getSampleStyleSheet()

    # Use whichever CJK font was registered at import time; with none
    # registered, keep the stylesheet's own fonts so the build still works.
    font_override = {'fontName': PDF_FONT_NAME} if PDF_FONT_NAME else {}
    heading_style = ParagraphStyle('Heading1', parent=styles['Heading1'],
                                   fontSize=16, leading=20, **font_override)
    normal_style = ParagraphStyle('Normal', parent=styles['Normal'],
                                  fontSize=12, leading=16, **font_override)

    # Leave room for the frame's padding (assumed to be ReportLab's default
    # of 6pt per side - TODO confirm if a custom frame is ever used).
    max_width = doc.width - 12
    max_height = doc.height - 12

    story = []
    for element in div.find_all(text_tags + ['img', 'br']):
        tag = element.name
        if tag == 'img':
            picture = _image_flowable(element, max_width, max_height)
            if picture is not None:
                story.append(picture)
                story.append(Spacer(1, 12))  # gap after an image
        elif tag == 'br':
            story.append(Spacer(1, 6))  # gap for a line break
        else:
            # A text block nested in another text block is already covered by
            # the outer block's get_text(); skip it to avoid duplicates.
            if element.find_parent(text_tags) is not None:
                continue
            text = element.get_text(strip=True)
            if not text:
                continue
            style = normal_style if tag == 'p' else heading_style
            # Paragraph parses its input as markup, so escape &, < and >.
            story.append(Paragraph(escape(text), style))
            story.append(Spacer(1, 12))  # gap after a paragraph/heading

    # Make sure the document has at least one flowable.
    if not story:
        story.append(Paragraph("无法从页面获取内容。", normal_style))
    doc.build(story)


def main():
    """Ask for an article URL, fetch and parse it, and save it as article.pdf.

    Each stage logs a message and stops the run if it produced nothing to
    work with.
    """
    # Strip stray whitespace from a pasted link and refuse empty input rather
    # than sending it to the browser.
    url = input("请输入文章的链接: ").strip()
    if not url:
        logging.error("未输入文章链接。")
        return

    html_content = fetch_article_with_selenium(url)
    if not html_content:
        logging.error("无法获取文章内容。")
        return

    article_div = parse_article_content(html_content)
    if article_div is None:
        logging.error("无法解析文章内容。")
        return

    output_filename = "article.pdf"
    text_to_pdf(article_div, output_filename)
    logging.info("文章已保存为PDF文件: %s", output_filename)


if __name__ == "__main__":
    main()