add pdf-mineru (#4276)

* add pdf-mineru 添加了基于MinerU的PDF转Markdown接口服务，调用方式与pdf-marker一致，开箱即用。 * Rename Readme.md to README.md * Rename pdf_parser_mineru.py to main.py
2025-10-18 17:51:24 +00:00 · 2025-03-24 17:17:08 +08:00
parent 1c4e0c66d5
commit 8a68de6471
2 changed files with 367 additions and 0 deletions
--- a/plugins/model/pdf-mineru/README.md
+++ b/plugins/model/pdf-mineru/README.md
@@ -0,0 +1,85 @@
 # Readme
 # 项目介绍
 ---
 本项目参照官方插件 **pdf-marker**，基于 MinerU 实现了一个高效的 **PDF 转 Markdown 接口服务**，通过高性能的接口设计，快速将 PDF 文档转换为 Markdown 格式文本。
 - **简洁性**：项目无需修改代码，仅需调整文件路径即可使用，简单易用
 - **易用性**：通过提供简洁的 API，开发者只需发送 HTTP 请求即可完成 PDF 转换
 - **灵活性**：支持本地部署，便于快速上手和灵活集成
 # 配置推荐
 配置及速率请参照[MinerU项目](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md)官方介绍。
 # 本地开发
 ## 基本流程
 1、安装基本环境，主要参照官方文档[使用CPU及GPU](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8GPU)运行MinerU的方式进行。具体如下，首先使用anaconda安装基础运行环境
 ```bash
 conda create -n mineru python=3.10
 conda activate mineru
 pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
 ```
 2、[下载模型权重文件](https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md)
 ```bash
 pip install modelscope
 wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
 python download_models.py
 ```
 python脚本会自动下载模型文件并配置好配置文件中的模型目录
 配置文件可以在用户目录中找到，文件名为`magic-pdf.json`
 > windows的用户目录为 "C:\\Users\\用户名", linux用户目录为 "/home/用户名", macOS用户目录为 "/Users/用户名"
 3、如果您的显卡显存大于等于 **8GB** ，可以进行以下流程，测试CUDA解析加速效果。默认为cpu模式，使用显卡的话需修改【用户目录】中配置文件magic-pdf.json中"device-mode"的值。
 ```json
 {
  "device-mode":"cuda"
 }
 ```
 4、如需使用GPU加速，需额外再安装依赖。
 ```bash
 pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cu118
 ```
 ```bash
 pip install paddlepaddle-gpu==2.6.1
 ```
 5、克隆一个FastGPT的项目文件
 ```
 git clone https://github.com/labring/FastGPT.git
 ```
 6、将主目录设置为 plugins/model 下的pdf-mineru文件夹
 ```
 cd FastGPT/plugins/model/pdf-mineru/
 ```
 7、执行文件main.py，启动服务
 ```bash
 python main.py
 ```
 # 访问示例
 仿照了**pdf-marker**的方式。
 ```bash
 curl --location --request POST "http://localhost:7231/v2/parse/file" \
 --header "Authorization: Bearer your_access_token" \
 --form "file=@./file/chinese_test.pdf"
 ```
--- a/plugins/model/pdf-mineru/main.py
+++ b/plugins/model/pdf-mineru/main.py
@@ -0,0 +1,282 @@
 import json
 import os
 from base64 import b64encode
 from glob import glob
 from io import StringIO
 from typing import Tuple, Union
 import uvicorn
 from fastapi import FastAPI, UploadFile, File
 from fastapi.responses import JSONResponse
 from loguru import logger
 from tempfile import TemporaryDirectory
 from pathlib import Path
 import fitz  # PyMuPDF
 import asyncio
 from concurrent.futures import ProcessPoolExecutor
 import torch
 import multiprocessing as mp
 from contextlib import asynccontextmanager
 import time
 import magic_pdf.model as model_config
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.operators.models import InferenceResult
 from magic_pdf.operators.pipes import PipeResult
 # Switch on magic_pdf's `__use_inside_model__` flag.
 # NOTE(review): presumably selects the library's bundled models — confirm against magic_pdf.
 model_config.__use_inside_model__ = True
 # FastAPI application object; the route and lifespan handler below are attached to it.
 app = FastAPI()
 # Per-process registry: worker_init() stores its converter config here under os.getpid(),
 # and process_pdf() reads it back with the same key.
 process_variables = {}
 # Process pool used to run process_pdf(); assigned in lifespan(), None until then.
 my_pool = None
 class MemoryDataWriter(DataWriter):
    """DataWriter that collects everything written to it in one in-memory text buffer.

    The ``path`` argument of the write methods is ignored: every payload is appended
    to the same ``StringIO`` in call order.
    """

    def __init__(self):
        self.buffer = StringIO()

    def write(self, path: str, data: bytes) -> None:
        """Append ``data`` to the buffer; anything that is not a ``str`` is decoded as UTF-8."""
        text = data if isinstance(data, str) else data.decode("utf-8")
        self.buffer.write(text)

    def write_string(self, path: str, data: str) -> None:
        """Append the string ``data`` to the buffer unchanged."""
        self.buffer.write(data)

    def get_value(self) -> str:
        """Return all text written so far."""
        # StringIO's accessor is getvalue(), not get_value().
        return self.buffer.getvalue()

    def close(self):
        """Close the underlying buffer."""
        self.buffer.close()
 def worker_init(counter, lock):
    """Initializer executed once in each pool worker process.

    Claims a unique worker index from the shared ``counter`` (guarded by ``lock``),
    maps that index to a device, builds the converter config through
    ``init_converter`` and stores it in ``process_variables`` under this process's
    PID, which is where ``process_pdf`` looks it up.

    Args:
        counter: Shared object exposing an integer ``.value``; incremented once per worker.
        lock: Context-manager lock serialising access to ``counter``.

    Raises:
        ValueError: If the worker index maps to a GPU index that is not available.
    """
    num_gpus = torch.cuda.device_count()
    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
    with lock:
        worker_id = counter.value
        counter.value += 1
    if num_gpus == 0:
        device = 'cpu'
        # device_id used to stay unassigned on this path, so the init_converter()
        # call below raised UnboundLocalError on CPU-only hosts. init_converter()
        # exports str(device_id) as CUDA_VISIBLE_DEVICES; the empty string leaves
        # no CUDA device visible, which matches CPU mode.
        device_id = ''
    else:
        # Consecutive worker indices share a GPU in groups of processes_per_gpu.
        device_id = worker_id // processes_per_gpu
        if device_id >= num_gpus:
            raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).")
        device = f'cuda:{device_id}'
    config = {
        "parse_method": "auto",
        "ADDITIONAL_KEY": "VALUE"
    }
    converter = init_converter(config, device_id)
    process_variables[os.getpid()] = converter
    print(f"Worker {worker_id}: Models loaded successfully on {device}!")
 def init_converter(config, device_id):
    """Export ``device_id`` as CUDA_VISIBLE_DEVICES and hand back ``config``.

    Args:
        config: Converter settings; returned as the very same object.
        device_id: Value whose ``str()`` form becomes CUDA_VISIBLE_DEVICES for this process.

    Returns:
        The ``config`` argument, untouched.
    """
    visible_devices = str(device_id)
    os.environ.update({"CUDA_VISIBLE_DEVICES": visible_devices})
    return config
 def img_to_base64(img_path: str) -> str:
    """Read the file at ``img_path`` and return its bytes as a base64 text string."""
    with open(img_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = b64encode(raw_bytes)
    return encoded.decode('utf-8')
 def embed_images_as_base64(md_content: str, image_dir: str) -> str:
    """Inline the images referenced by Markdown image lines as base64 data URIs.

    Only lines that start with ``![`` are treated as image lines. The base name of
    the link target is looked up inside ``image_dir``; when that file exists, the
    target is replaced by a ``data:`` URI while the alt text and anything after the
    closing parenthesis are kept. Lines whose image cannot be found, or whose link
    is never closed, are passed through unchanged.

    Args:
        md_content: Markdown text to process.
        image_dir: Directory that holds the image files.

    Returns:
        The Markdown text with every resolvable image link inlined.
    """
    # Function-scope import: keeps this fix independent of the file's import block.
    import mimetypes

    new_lines = []
    for line in md_content.split('\n'):
        marker_idx = line.find("](") if line.startswith("![") else -1
        # Search for the closing parenthesis only after the "](" marker; a ")" that
        # appears earlier (e.g. inside the alt text) used to raise ValueError here.
        end_idx = line.find(")", marker_idx + 2) if marker_idx != -1 else -1
        if end_idx == -1:
            new_lines.append(line)
            continue
        start_idx = marker_idx + 2
        # Only the base name is used, so the lookup cannot leave image_dir.
        img_name = os.path.basename(line[start_idx:end_idx])
        img_path = os.path.join(image_dir, img_name)
        logger.info(f"Checking image: {img_path}")
        # isfile() rather than exists(): an empty target resolves to image_dir itself.
        if not os.path.isfile(img_path):
            logger.warning(f"Image not found: {img_path}")
            new_lines.append(line)
            continue
        # Label the payload from the file extension instead of always claiming PNG;
        # "image/png" stays as the fallback for unknown extensions.
        mime_type = mimetypes.guess_type(img_name)[0]
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"
        img_base64 = img_to_base64(img_path)
        new_lines.append(
            f"{line[:start_idx]}data:{mime_type};base64,{img_base64}{line[end_idx:]}"
        )
    return '\n'.join(new_lines)
 def _referenced_image_names(md_content):
    """Return the base names of link targets on Markdown lines that start with ``![``."""
    names = []
    for line in md_content.split('\n'):
        if not line.startswith("!["):
            continue
        marker_idx = line.find("](")
        if marker_idx == -1:
            continue
        end_idx = line.find(")", marker_idx + 2)
        if end_idx == -1:
            continue
        names.append(os.path.basename(line[marker_idx + 2:end_idx]))
    return names


 def _extract_missing_images(pdf_path, image_dir, referenced_names):
    """Fill in referenced images that are absent from ``image_dir`` using PyMuPDF.

    The n-th embedded image of the PDF (in page order) is paired with the n-th
    referenced name; this positional pairing is a best-effort guess. Files that
    already exist are never overwritten.
    """
    with fitz.open(pdf_path) as doc:
        img_counter = 0
        for page in doc:
            for img in page.get_images(full=True):
                if img_counter >= len(referenced_names):
                    return
                img_name = referenced_names[img_counter]
                img_counter += 1
                img_path = os.path.join(image_dir, img_name)
                if not img_name or os.path.exists(img_path):
                    continue
                extracted = doc.extract_image(img[0])  # img[0] is the image xref
                with open(img_path, "wb") as f:
                    f.write(extracted["image"])


 def process_pdf(pdf_path, output_dir):
    """Convert one PDF file to Markdown; runs inside a pool worker process.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory under which ``<pdf stem>_output/images`` is created.

    Returns:
        On success, a dict with ``status`` ("success"), ``text`` (Markdown),
        ``output_path`` and ``images`` (``*.jpg`` paths in the images directory).
        On any failure, a dict with ``status`` ("error"), ``message`` and ``file``;
        exceptions are reported this way instead of being raised.
    """
    try:
        pid = os.getpid()
        config = process_variables.get(pid)
        if config is None:
            # Previously the placeholder string "No variable" was indexed here, which
            # surfaced as an unrelated "string indices must be integers" TypeError.
            raise RuntimeError(f"No converter config registered for worker process {pid}")
        parse_method = config["parse_method"]
        with open(str(pdf_path), "rb") as f:
            pdf_bytes = f.read()
        output_path = Path(output_dir) / f"{Path(pdf_path).stem}_output"
        image_dir = os.path.join(str(output_path), "images")
        os.makedirs(image_dir, exist_ok=True)  # also creates output_path
        # Root the image writer at the images directory: the Markdown links are
        # emitted with the "images" prefix (see dump_md below) and the files are
        # looked up in image_dir. Rooting the writer at output_path left image_dir
        # empty, so the fitz fallback replaced every image.
        # NOTE(review): follows magic_pdf's documented usage — confirm against the
        # installed version.
        image_writer = FileBasedDataWriter(image_dir)
        # Run the magic_pdf pipeline; only the pipe result is needed here.
        _, pipe_result = process_pdf_content(pdf_bytes, parse_method, image_writer)
        md_content_writer = MemoryDataWriter()
        try:
            pipe_result.dump_md(md_content_writer, "", "images")
            md_content = md_content_writer.get_value()
        finally:
            md_content_writer.close()
        image_paths = glob(os.path.join(image_dir, "*.jpg"))
        logger.info(f"Saved images by magic_pdf: {image_paths}")
        # Fall back to PyMuPDF only for images the Markdown references but that are
        # missing on disk. The former "fewer than 3 images" heuristic also fired for
        # correctly processed PDFs and overwrote their images.
        referenced_names = _referenced_image_names(md_content)
        missing_names = [
            name for name in referenced_names
            if name and not os.path.exists(os.path.join(image_dir, name))
        ]
        if missing_names:
            logger.warning("Insufficient images saved by magic_pdf, falling back to fitz extraction")
            _extract_missing_images(pdf_path, image_dir, referenced_names)
            image_paths = glob(os.path.join(image_dir, "*.jpg"))
            logger.info(f"Images extracted by fitz: {image_paths}")
        return {
            "status": "success",
            "text": md_content,
            "output_path": str(output_path),
            "images": image_paths
        }
    except Exception as e:
        # Worker boundary: report the failure as data so the API layer can answer
        # the request; logger.exception keeps the traceback in the worker log.
        logger.exception(f"Error processing PDF: {str(e)}")
        return {
            "status": "error",
            "message": str(e),
            "file": str(pdf_path)
        }
 def process_pdf_content(pdf_bytes, parse_method, image_writer):
    """Run document analysis on ``pdf_bytes`` and build the matching pipe result.

    Args:
        pdf_bytes: Raw PDF content used to build the ``PymuDocDataset``.
        parse_method: ``"ocr"`` or ``"txt"`` to force a mode; any other value lets
            the dataset's ``classify()`` decide.
        image_writer: Writer passed to the pipe stage.

    Returns:
        Tuple ``(infer_result, pipe_result)``.
    """
    dataset = PymuDocDataset(pdf_bytes)
    if parse_method == "ocr":
        use_ocr = True
    elif parse_method == "txt":
        use_ocr = False
    else:
        # Auto mode: OCR only when the dataset classifies itself as needing it.
        use_ocr = dataset.classify() == SupportedPdfParseMethod.OCR
    infer_result = dataset.apply(doc_analyze, ocr=use_ocr)
    if use_ocr:
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        pipe_result = infer_result.pipe_txt_mode(image_writer)
    return infer_result, pipe_result
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the worker process pool on startup and tear it down on shutdown.

    The pool is stored in the module-level ``my_pool``. Each worker runs
    ``worker_init`` with a manager-backed counter and lock so that it can claim a
    unique worker index.

    Args:
        app: The FastAPI application (unused).
    """
    global my_pool
    # Workers must be started with "spawn". Skip the call when that is already the
    # configured method (set_start_method raises on a second call) and override any
    # other method instead of aborting startup.
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn', force=True)
    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
    gpu_count = torch.cuda.device_count()
    # On a CPU-only host gpu_count is 0; ProcessPoolExecutor rejects max_workers=0
    # with ValueError, so size the pool as if one device were present.
    max_workers = max(gpu_count, 1) * processes_per_gpu
    manager = mp.Manager()
    try:
        worker_counter = manager.Value('i', 0)
        worker_lock = manager.Lock()
        my_pool = ProcessPoolExecutor(max_workers=max_workers,
                                      initializer=worker_init, initargs=(worker_counter, worker_lock))
        yield
    finally:
        # Runs on normal shutdown and when startup or the application fails.
        if my_pool:
            my_pool.shutdown(wait=True)
        # Stop the manager only after the pool, whose workers hold its proxies.
        manager.shutdown()
        print("Application shutdown, cleaning up...")
 # `app` is created near the top of the module, before `lifespan` is defined, so the
 # handler is installed on the router here rather than passed to the FastAPI() call.
 app.router.lifespan_context = lifespan
@app.post("/v2/parse/file")
async def process_pdfs(file: UploadFile = File(...)):
    """Convert an uploaded PDF to Markdown with images inlined as base64.

    The upload is saved into a temporary directory, validated by opening it with
    PyMuPDF, converted in the worker pool via ``process_pdf``, and finally the
    image links are inlined with ``embed_images_as_base64``.

    Args:
        file: The uploaded PDF (multipart form field ``file``).

    Returns:
        On success a dict with ``success``, ``message``, ``markdown`` and ``pages``.
        On failure a ``JSONResponse`` with ``success`` False and an ``error`` text:
        status 400 for an invalid PDF, 500 for any other error.
    """
    with TemporaryDirectory() as temp_dir:
        # The filename is chosen by the client. Keep only its last path component so
        # that "../" segments or an absolute path cannot place the file outside
        # temp_dir, and fall back to a fixed name when nothing usable remains.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            safe_name = "upload.pdf"
        temp_path = Path(temp_dir) / safe_name
        with open(str(temp_path), "wb") as buffer:
            buffer.write(await file.read())
        # Validate the PDF and read its page count.
        try:
            with fitz.open(str(temp_path)) as pdf_document:
                total_pages = pdf_document.page_count
        # Use the top-level exception name: current PyMuPDF has no `fitz.fitz`
        # submodule, so the old spelling raised AttributeError while handling an
        # invalid file and turned the intended 400 into a 500.
        except fitz.FileDataError:
            return JSONResponse(content={"success": False, "message": "", "error": "Invalid PDF file"}, status_code=400)
        except Exception as e:
            logger.error(f"Error opening PDF: {str(e)}")
            return JSONResponse(content={"success": False, "message": "", "error": f"Internal server error: {str(e)}"}, status_code=500)
        try:
            # Run the conversion in the process pool so the event loop stays free.
            loop = asyncio.get_running_loop()
            results = await loop.run_in_executor(
                my_pool,
                process_pdf,
                str(temp_path),
                str(temp_dir)
            )
            if results.get("status") == "error":
                return JSONResponse(content={
                    "success": False,
                    "message": "",
                    "error": results.get("message")
                }, status_code=500)
            # Inline the images while the temporary directory still exists.
            image_dir = os.path.join(results.get("output_path"), "images")
            md_content_with_base64 = embed_images_as_base64(results.get("text"), image_dir)
            return {
                "success": True,
                "message": "",
                "markdown": md_content_with_base64,
                "pages": total_pages
            }
        except Exception as e:
            logger.error(f"Error in process_pdfs: {str(e)}")
            return JSONResponse(content={
                "success": False,
                "message": "",
                "error": f"Internal server error: {str(e)}"
            }, status_code=500)
 # Script entry point: serve the app with uvicorn on all interfaces, port 7231.
 if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7231)