一款基于Python的从常规文档里提取图片的简单工具开发方案
1. 环境准备
安装必需库
pip install python-docx PyMuPDF openpyxl beautifulsoup4 pillow requests
pip install pdfplumber
# tkinter 属于 Python 标准库,无需通过 pip 安装(PyPI 上的 tk 是无关的包);Linux 下如缺失可执行 sudo apt install python3-tk
工具选择
开发环境:VSCode + Python 插件
调试工具:Python IDLE(初学者友好)
打包工具:pyinstaller(可选,用于生成 exe)
2. 项目架构设计
image-extractor/
├── main.py # 主程序入口
├── core/
│ ├── docx_extractor.py
│ ├── pdf_extractor.py
│ ├── excel_extractor.py
│ └── html_extractor.py
└── outputs/ # 默认输出目录
3. 核心功能实现
(1) Word文档提取 (docx_extractor.py)
import zipfile
import os
from PIL import Image


def extract_docx_images(file_path, output_dir):
    """Copy every image embedded in a .docx file into ``output_dir``.

    A .docx file is a ZIP archive whose embedded media live under
    ``word/media/``.  Each member is written straight to
    ``output_dir/<basename>``, so no ``word/media`` directory tree is left
    behind in the output folder, and an existing file with the same name
    is overwritten on every platform.

    Args:
        file_path: Path to the .docx file.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        The number of image files written.

    Raises:
        zipfile.BadZipFile: If ``file_path`` is not a valid ZIP archive.
        OSError: If the archive cannot be read or an image cannot be written.
    """
    os.makedirs(output_dir, exist_ok=True)
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        # Skip directory entries ("word/media/") that some archivers store:
        # they carry no data and have an empty basename.
        image_files = [
            name for name in zip_ref.namelist()
            if name.startswith('word/media/') and not name.endswith('/')
        ]
        for img_file in image_files:
            dst = os.path.join(output_dir, os.path.basename(img_file))
            # Write the bytes directly instead of extract() + rename(), which
            # recreated the archive's folder structure inside output_dir.
            with open(dst, 'wb') as out:
                out.write(zip_ref.read(img_file))
    return len(image_files)
(2) PDF文件提取 (pdf_extractor.py)
import fitz
import os


def extract_pdf_images(file_path, output_dir):
    """Save every image referenced by the pages of a PDF into ``output_dir``.

    Files are named ``pdf_page<page>_img<index>.<ext>``, where ``<ext>`` is
    the extension PyMuPDF reports for the embedded image, so JPEG data is no
    longer written into files called ``.png``.

    Args:
        file_path: Path to the PDF file.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        The number of image files written.  An image referenced from several
        pages is written once per referencing page.
    """
    os.makedirs(output_dir, exist_ok=True)
    img_count = 0
    # The context manager closes the document even if an image fails to save.
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]  # first tuple item is the image's xref number
                base_image = doc.extract_image(xref)
                if not base_image:
                    # Defensive: skip an xref that yields no image data.
                    continue
                ext = base_image.get("ext") or "png"
                img_path = os.path.join(
                    output_dir, f"pdf_page{page_num}_img{img_index}.{ext}"
                )
                with open(img_path, "wb") as f:
                    f.write(base_image["image"])
                img_count += 1
    return img_count
(3) Excel文件提取 (excel_extractor.py)
from openpyxl import load_workbook
import os


def extract_excel_images(file_path, output_dir):
    """Save the images openpyxl loaded from each worksheet into ``output_dir``.

    Files are named ``excel_<sheet title>_img<n>.<ext>``; ``<n>`` counts
    across the whole workbook.

    NOTE(review): this relies on openpyxl private API (``sheet._images`` and
    ``Image._data()``), which may change between openpyxl releases — confirm
    against the installed version.

    Args:
        file_path: Path to the .xlsx workbook.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        The number of image files written.
    """
    os.makedirs(output_dir, exist_ok=True)
    wb = load_workbook(file_path)
    img_count = 0
    try:
        for sheet in wb.worksheets:
            for image in sheet._images:
                # _data is a method; it must be called to obtain the bytes
                # (writing the bound method itself raises TypeError).
                data = image._data()
                # Presumably _data() returns gif/jpeg/png bytes unchanged and
                # re-encodes anything else as PNG — verify against openpyxl.
                fmt = getattr(image, "format", "png")
                ext = fmt if fmt in ("gif", "jpeg", "png") else "png"
                img_path = os.path.join(
                    output_dir,
                    f"excel_{sheet.title}_img{img_count}.{ext}",
                )
                with open(img_path, "wb") as f:
                    f.write(data)
                img_count += 1
    finally:
        wb.close()
    return img_count
(4) HTML文件提取 (html_extractor.py)
import requests
from bs4 import BeautifulSoup
import os
import base64


def extract_html_images(html_path, output_dir):
    """Save the inline ``data:image`` pictures of an HTML page.

    Only ``<img>`` tags whose ``src`` is a ``data:image/...`` URI are
    extracted; images referenced by URL or relative path are skipped.
    Files are named ``html_img<n>.<format>``.

    Args:
        html_path: Local HTML file path, or a URL starting with ``http``.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        The number of image files written.

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            server answers with an error status.
        OSError: If the local file cannot be read or an image cannot be
            written.
    """
    from urllib.parse import unquote_to_bytes

    os.makedirs(output_dir, exist_ok=True)

    if html_path.startswith('http'):
        # A timeout keeps a stalled server from hanging the caller forever.
        response = requests.get(html_path, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
    else:
        # Pass raw bytes so BeautifulSoup detects the document encoding
        # instead of depending on the platform's default text encoding.
        with open(html_path, 'rb') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

    img_count = 0
    for img in soup.find_all('img'):
        src = img.get('src')
        # <img> without src yields None; non-inline sources are not handled.
        if not src or not src.startswith('data:image'):
            continue
        header, sep, data = src.partition(',')
        if not sep:
            continue  # malformed data URI: no payload
        # header looks like "data:image/png;base64" or "data:image/svg+xml"
        mime = header[len('data:'):].split(';')[0]
        img_format = mime.partition('/')[2].split('+')[0] or 'bin'
        if ';base64' in header.lower():
            img_data = base64.b64decode(data)
        else:
            # Data URIs without ";base64" carry percent-encoded content.
            img_data = unquote_to_bytes(data)
        img_path = os.path.join(
            output_dir, f"html_img{img_count}.{img_format}"
        )
        with open(img_path, 'wb') as f:
            f.write(img_data)
        img_count += 1
    return img_count
4. 交互界面开发 (main.py)
import tkinter as tk
from tkinter import filedialog, messagebox
from core import docx_extractor, pdf_extractor, excel_extractor, html_extractor
import os


class ImageExtractorApp:
    """Tkinter window that picks a document and extracts its images.

    The user chooses an input file and an output directory, then the
    extractor matching the file extension is run and the result is appended
    to the log area at the bottom of the window.
    """

    def __init__(self, root):
        """Bind the app to the Tk ``root`` window and build its widgets."""
        self.root = root
        self.root.title("多格式图片提取工具")
        self.file_path = tk.StringVar()
        self.output_dir = tk.StringVar(value="outputs")
        self.create_widgets()

    def create_widgets(self):
        """Lay out the file row, output-directory row, start button and log."""
        tk.Label(self.root, text="选择文件:").grid(
            row=0, column=0, padx=5, pady=5)
        tk.Entry(self.root, textvariable=self.file_path, width=40).grid(
            row=0, column=1)
        tk.Button(self.root, text="浏览", command=self.select_file).grid(
            row=0, column=2)

        tk.Label(self.root, text="输出目录:").grid(row=1, column=0)
        tk.Entry(self.root, textvariable=self.output_dir, width=40).grid(
            row=1, column=1)
        tk.Button(self.root, text="选择目录",
                  command=self.select_output_dir).grid(row=1, column=2)

        tk.Button(self.root, text="开始提取",
                  command=self.start_extraction).grid(
            row=2, column=1, pady=10)

        self.log_text = tk.Text(self.root, height=10, width=50)
        self.log_text.grid(row=3, column=0, columnspan=3)

    def select_file(self):
        """Ask for an input file; keep the current value if cancelled."""
        file_types = [
            ('支持的文件类型', '*.docx *.pdf *.xlsx *.html'),
            ('Word文档', '*.docx'),
            ('PDF文件', '*.pdf'),
            ('Excel文件', '*.xlsx'),
            ('网页文件', '*.html'),
        ]
        chosen = filedialog.askopenfilename(filetypes=file_types)
        # A cancelled dialog returns an empty value; do not wipe the field.
        if chosen:
            self.file_path.set(chosen)

    def select_output_dir(self):
        """Ask for an output directory; keep the current value if cancelled."""
        chosen = filedialog.askdirectory()
        if chosen:
            self.output_dir.set(chosen)

    def start_extraction(self):
        """Run the extractor matching the file extension and log the result.

        Every failure, including one while creating the output directory,
        is reported in an error dialog instead of escaping the Tk callback.
        """
        file_path = self.file_path.get().strip()
        # Fall back to the default folder if the entry was cleared.
        output_dir = self.output_dir.get().strip() or "outputs"

        if not file_path:
            messagebox.showerror("错误", "请先选择文件")
            return

        extractors = {
            '.docx': docx_extractor.extract_docx_images,
            '.pdf': pdf_extractor.extract_pdf_images,
            '.xlsx': excel_extractor.extract_excel_images,
            '.html': html_extractor.extract_html_images,
        }
        ext = os.path.splitext(file_path)[1].lower()
        extractor = extractors.get(ext)
        if extractor is None:
            messagebox.showerror("错误", "不支持的文件类型")
            return

        try:
            os.makedirs(output_dir, exist_ok=True)
            count = extractor(file_path, output_dir)
        except Exception as e:  # top-level UI boundary: report, don't crash
            messagebox.showerror("错误", f"提取失败: {e}")
        else:
            self.log_text.insert(
                tk.END, f"成功提取 {count} 张图片到 {output_dir}\n")
            self.log_text.see(tk.END)


if __name__ == "__main__":
    root = tk.Tk()
    app = ImageExtractorApp(root)
    root.mainloop()
5. 使用说明
操作步骤
1. 运行 main.py
2. 点击「浏览」选择文件(支持 .docx/.pdf/.xlsx/.html)
3. 选择输出目录(默认 outputs)
4. 点击「开始提取」
5. 查看底部日志区域的提取结果
效果示例
成功提取 5 张图片到 outputs/
成功提取 3 张图片到 outputs/
6. 常见问题解决
Q1: Excel图片无法提取?
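原因:openpyxl只能提取嵌入式图片,无法提取浮动图片;另外,未安装 Pillow 时 openpyxl 不会加载任何图片。
解决方案:.xlsx 本质上是 ZIP 压缩包,可用 zipfile 直接导出其中 xl/media/ 目录下的图片;若还需要图片在表格中的位置,再结合图像坐标识别(需更复杂处理)。注意:xlrd 自 2.0 起仅支持 .xls,不适用于 .xlsx。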
Q2: PDF提取的图片模糊?
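原因:PDF内嵌低分辨率图片
解决方案:内嵌图片本身的分辨率无法通过提取提高;如需更清晰的结果,可使用 pdfplumber 的 page.to_image(resolution=300),或 PyMuPDF 的 page.get_pixmap(dpi=300),以更高分辨率渲染整页。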
Q3: 程序无响应?
原因:大文件处理耗时,阻塞了界面主线程
解决方案:改用多线程处理(参考 threading 模块),把提取任务放到后台线程中执行
7. 项目扩展建议
增加批量处理:支持文件夹批量导入
添加图片预览:在界面中显示缩略图
支持压缩包:直接解压 ZIP/RAR 文件并处理内容
增加格式转换:自动转换 HEIC/WEBP 等特殊格式