准备环境
python3 -m pip install huggingface_hub
python3 -m pip install modelscope
python3 -m pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
下载需要的模型
import json
import os

import requests
from huggingface_hub import snapshot_download
# Oldest local config schema that is kept; anything older is replaced by a
# freshly downloaded template.
_MIN_CONFIG_VERSION = (1, 0, 0)


def _version_key(version):
    """Turn a dotted version such as ``'1.2.0'`` into a comparable int tuple.

    Only the leading ASCII digits of each dot-separated component are used,
    missing or non-numeric components count as 0, and the result is padded to
    at least three components so that ``'1'`` and ``'1.0.0'`` compare equal.
    Non-string values (e.g. a bare number in the JSON) are stringified first.
    """
    key = []
    for piece in str(version).split('.'):
        digits = ''
        for ch in piece:
            if ch not in '0123456789':
                break
            digits += ch
        key.append(int(digits) if digits else 0)
    key.extend([0] * (3 - len(key)))
    return tuple(key)


def download_json(url):
    """Download ``url`` and return the response body parsed as JSON.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    return response.json()


def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config file and apply overrides to it.

    If ``local_filename`` exists and its ``config_version`` is at least 1.0.0,
    the existing content is kept; otherwise (file missing, or version too old
    or absent) the template is downloaded from ``url``. Every key/value pair
    in ``modifications`` is then written into the top level of the data, and
    the result is saved back to ``local_filename`` as UTF-8 JSON.

    Args:
        url: Location of the template JSON document.
        local_filename: Path of the config file to read and (over)write.
        modifications: Mapping of top-level keys to the values to set.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below, and close the
        # handle deterministically.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        # Compare versions numerically; plain string comparison breaks on
        # non-string values and on differently shaped version strings.
        if _version_key(data.get('config_version', '0.0.0')) < _MIN_CONFIG_VERSION:
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the caller's overrides on top of the loaded/downloaded config.
    data.update(modifications)

    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    # Only the model folders listed here are fetched from the repository.
    mineru_patterns = [
        "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_small/*",
        "models/TabRec/TableMaster/*",
        "models/TabRec/StructEqTable/*",
    ]
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)

    layoutreader_pattern = [
        "*.json",
        "*.safetensors",
    ]
    layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)

    # The downloaded snapshot keeps the models under its "models" subfolder.
    model_dir = os.path.join(model_dir, 'models')
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
    config_file_name = 'magic-pdf.json'
    home_dir = os.path.expanduser('~')
    config_file = os.path.join(home_dir, config_file_name)

    # Point the config at the directories that were just downloaded.
    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f'The configuration file has been configured successfully, the path is: {config_file}')
测试
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf -O small_ocr.pdf
magic-pdf -p small_ocr.pdf -o ./output
GPU加速
vim ~/magic-pdf.json
"device-mode": "cpu" -> "device-mode": "cuda",