Python提取PDF和DOCX中的文本、图片和表格

如何使用Python提取PDF和DOCX中的文本、图片和表格

在日常工作中，我们经常会需要从PDF或DOCX文件中提取文本、图片或表格数据进行分析或存档。这里，我分享一个Python脚本，它能够自动化地从PDF和DOCX文件中提取所需的内容，并将提取的表格转换为图片保存。

所需库

使用此脚本需要安装以下库：

PyMuPDF (fitz)：用于处理PDF文件。
python-docx：用于处理DOCX文件。
Pillow (PIL)：用于图像处理。

pip install pymupdf python-docx pillow

实现功能

我们将实现以下功能：

从PDF文件中提取文本、图片和表格。
将提取的表格和图片以PNG格式保存。
从DOCX文件中提取文本、图片和表格，并将表格转换为图片。

代码结构概览

1. PDF文件处理函数

extract_tables_from_pdf(pdf_path)：从PDF文件中提取表格，将每个表格保存为PNG图片。
extract_images_from_pdf(pdf_path)：从PDF中提取所有图片。
extract_text_from_pdf(pdf_path)：从PDF文件中提取所有文本。

2. DOCX文件处理函数

process_docx(docx_path)：从DOCX文件中提取文本、图片，并将表格转换为图片保存。
table_to_image(table)：将DOCX表格转换为图片，支持中文内容（需要系统中安装有中文字体，例如微软雅黑 msyh.ttc；找不到字体时会回退到 Pillow 默认字体，中文可能无法正常显示）。
extract_images_from_docx(docx_path)：从DOCX中提取图片并保存。

3. 主函数

process_file(file_path)：根据文件类型（PDF或DOCX）调用不同的处理函数。

import os
import zipfile

import fitz  # PyMuPDF
from docx import Document
from PIL import Image, ImageDraw, ImageFont


# Make sure the output directory exists
def ensure_dir(directory):
    """Create ``directory`` (including missing parents) if it does not exist.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of calling
    # os.path.exists() first and makes repeated calls a no-op.
    os.makedirs(directory, exist_ok=True)


# Extract tables from a PDF
def extract_tables_from_pdf(pdf_path):
    """Render every table detected in a PDF to a PNG file under ``form_img/``.

    Each page is scanned with ``page.find_tables()``; the bounding box of
    every detected table is rasterized and written to
    ``form_img/table_page<page>_<index>.png`` (both numbers are 1-based).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The number of table images written.
    """
    table_count = 0
    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_path) as doc:
        form_img_dir = "form_img"
        ensure_dir(form_img_dir)
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            finder = page.find_tables()
            # Tolerate a missing/empty result instead of failing on the page.
            found_tables = getattr(finder, "tables", None) or []
            for table_num, table in enumerate(found_tables, start=1):
                # Rasterize only the table's bounding box.
                table_pix = page.get_pixmap(clip=table.bbox)
                image_filename = os.path.join(
                    form_img_dir, f"table_page{page_num + 1}_{table_num}.png"
                )
                table_pix.save(image_filename)
                table_count += 1
                print(f"保存表格: {image_filename}")
    return table_count


# Extract images from a PDF
def extract_images_from_pdf(pdf_path):
    """Save every image referenced by the pages of a PDF as PNG under ``img/``.

    Files are named ``img/image_page<page>_<index>.png`` (both numbers are
    1-based). Images are decoded into a pixmap and re-encoded as PNG, so the
    file content really matches the ``.png`` extension; writing the raw
    embedded bytes would often store JPEG data under a PNG name.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The number of image files written.
    """
    image_count = 0
    # The context manager closes the document even if an image fails to decode.
    with fitz.open(pdf_path) as doc:
        img_dir = "img"
        ensure_dir(img_dir)
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            for img_index, img in enumerate(page.get_images(), start=1):
                xref = img[0]  # first item of each entry is the image xref
                pix = fitz.Pixmap(doc, xref)
                # PNG cannot store CMYK; convert such pixmaps to RGB first.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                image_filename = os.path.join(
                    img_dir, f"image_page{page_num + 1}_{img_index}.png"
                )
                pix.save(image_filename)
                image_count += 1
                print(f"保存图片: {image_filename}")
    return image_count


# Extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of a PDF, one chunk per non-blank page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages whose text is not only whitespace, joined
        with newlines, in page order.
    """
    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        page_texts = (
            doc.load_page(page_num).get_text("text")
            for page_num in range(doc.page_count)
        )
        # The generator is consumed here, while the document is still open.
        return "\n".join(text for text in page_texts if text.strip())


# Process a DOCX file
def extract_images_from_docx(docx_path, img_dir="img"):
    """Copy the media files embedded in a DOCX file into ``img_dir``.

    A DOCX file is a ZIP archive; embedded pictures are stored under
    ``word/media/``. Every file found there is written out unchanged under
    its original file name.

    Args:
        docx_path: Path of the DOCX file.
        img_dir: Output directory; created if it does not exist.

    Returns:
        The number of files written.
    """
    media_prefix = "word/media/"
    image_count = 0
    with zipfile.ZipFile(docx_path) as archive:
        os.makedirs(img_dir, exist_ok=True)
        for member in archive.namelist():
            # Keep only the file name so a crafted archive entry cannot
            # write outside img_dir; this also skips directory entries.
            file_name = os.path.basename(member)
            if not member.startswith(media_prefix) or not file_name:
                continue
            image_filename = os.path.join(img_dir, file_name)
            with open(image_filename, "wb") as image_file:
                image_file.write(archive.read(member))
            image_count += 1
            print(f"保存图片: {image_filename}")
    return image_count


def process_docx(docx_path):
    """Extract text, embedded images and tables from a DOCX file.

    Images are copied to ``img/`` by ``extract_images_from_docx``. Each table
    is rendered with ``table_to_image`` and saved as
    ``form_img/table_<n>.png`` (1-based).

    Args:
        docx_path: Path of the DOCX file.

    Returns:
        A tuple ``(text, image_count, table_count)`` where ``text`` is the
        text of all paragraphs joined with newlines.
    """
    doc = Document(docx_path)
    form_img_dir = "form_img"
    ensure_dir(form_img_dir)

    image_count = extract_images_from_docx(docx_path)
    text_content = [paragraph.text for paragraph in doc.paragraphs]

    table_count = 0
    for table in doc.tables:
        table_img = table_to_image(table)
        image_filename = os.path.join(form_img_dir, f"table_{table_count + 1}.png")
        table_img.save(image_filename)
        table_count += 1
        print(f"保存表格: {image_filename}")

    return "\n".join(text_content), image_count, table_count


# Render a DOCX table as an image
def _load_table_font(size):
    """Return the first loadable CJK-capable font, else Pillow's default font."""
    # Common CJK font file names; "msyh.ttc" (the original choice) is tried
    # first so behavior is unchanged where it is installed.
    candidates = (
        "msyh.ttc",
        "simhei.ttf",
        "PingFang.ttc",
        "NotoSansCJK-Regular.ttc",
        "wqy-microhei.ttc",
    )
    for font_name in candidates:
        try:
            return ImageFont.truetype(font_name, size)
        except (OSError, ImportError):
            # Font file not found/readable, or FreeType support missing.
            continue
    return ImageFont.load_default()


def table_to_image(table):
    """Render a DOCX table as a grid image with each cell's text centered.

    Every cell is drawn as a fixed-size 150x40 pixel box with a black
    outline on a white background, with 10 pixels of padding around the grid.

    Args:
        table: A table object exposing ``rows``, ``columns`` and, per row,
            ``cells`` whose ``text`` is drawn (as provided by python-docx).

    Returns:
        The rendered ``PIL.Image.Image``.
    """
    cell_width = 150
    cell_height = 40
    padding = 10

    img_width = len(table.columns) * cell_width + 2 * padding
    img_height = len(table.rows) * cell_height + 2 * padding
    img = Image.new('RGB', (img_width, img_height), 'white')
    draw = ImageDraw.Draw(img)
    font = _load_table_font(12)

    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            x = j * cell_width + padding
            y = i * cell_height + padding
            draw.rectangle([x, y, x + cell_width, y + cell_height], outline='black')

            text = cell.text.strip()
            left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
            # Center the text box inside the cell: offset by half of the
            # free space, and compensate for the bbox's own origin offset.
            text_x = x + (cell_width - (right - left)) // 2 - left
            text_y = y + (cell_height - (bottom - top)) // 2 - top
            draw.text((text_x, text_y), text, fill='black', font=font)

    return img


# Main entry point
def process_file(file_path):
    """Extract text, images and tables from a PDF or DOCX file and print a summary.

    The handler is chosen by the (case-insensitive) file extension. A missing
    file or an unsupported extension is reported on stdout and nothing else
    is done.

    Args:
        file_path: Path of the ``.pdf`` or ``.docx`` file to process.

    Returns:
        None.
    """
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        return

    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext == '.pdf':
        text_content = extract_text_from_pdf(file_path)
        image_count = extract_images_from_pdf(file_path)
        table_count = extract_tables_from_pdf(file_path)
    elif file_ext == '.docx':
        text_content, image_count, table_count = process_docx(file_path)
    else:
        # Previously unsupported types fell through without any feedback.
        print(f"不支持的文件类型: {file_path}")
        return

    # Shared reporting for both file types.
    print("\n=== 处理结果 ===")
    print("提取的文本内容:")
    print(text_content)
    print(f"\n总计提取: {image_count} 张图片, {table_count} 个表格")


# Example usage
file_path = "your_file_path_here.pdf"

if __name__ == "__main__":
    # Run the example only when executed as a script, so that importing this
    # module does not start processing a file.
    process_file(file_path)