pdf2md (#3213)

* pdf2md * pdf2md * pdf2md * pdf2md * pdf2md
2025-07-21 11:43:56 +00:00 · 2024-11-27 16:28:14 +08:00
parent 5fa2e3c5ac
commit b09e972c20
6 changed files with 449 additions and 0 deletions
--- a/python/pdf-marker/Dockerfile
+++ b/python/pdf-marker/Dockerfile
@@ -0,0 +1,38 @@
+FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-devel
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LANG=C.UTF-8
+# Install the system libraries needed by cv2 (OpenCV). The apt package lists
+# are removed in the same layer so they never become part of the image.
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y \
+    && rm -rf /var/lib/apt/lists/*
+
+# Provide the pip configuration (package index mirror)
+RUN mkdir -p /root/.pip
+COPY pip.conf /root/.pip/
+
+# Create the folder that will hold the downloaded models
+RUN mkdir -p /root/huggingface
+
+# Copy only the dependency list first, so that editing api_mp.py does not
+# invalidate the cached dependency-installation layer below
+COPY requirements.txt /root/
+
+
+# Hugging Face mirror endpoint and the location of the model cache
+ENV HF_ENDPOINT=https://hf-mirror.com \
+    HF_DATASETS_CACHE=/root/huggingface \
+    HUGGINGFACE_HUB_CACHE=/root/huggingface \
+    HF_HOME=/root/huggingface
+
+# Set the working directory
+WORKDIR /root
+
+# Install the Python dependencies
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Remove tools and files that are not needed at runtime to reduce the image size
+RUN apt-get purge -y vim && apt-get autoremove -y && rm -rf /root/.pip /root/.cache/pip
+
+# Copy the service code last: it changes far more often than the dependencies
+COPY api_mp.py /root/
+
+
+
+# Command executed when the container starts
+CMD ["python3", "api_mp.py"]
--- a/python/pdf-marker/Readme.md
+++ b/python/pdf-marker/Readme.md
@@ -0,0 +1,131 @@
+# 项目介绍
+
+本项目实现了一个高效的 **PDF 转 Markdown 接口服务**，支持多进程并行处理多个 PDF 文件。通过高性能的接口设计，快速将 PDF 文档转换为 Markdown 格式文本。
+
+- **简洁性**：项目无需修改代码，仅需调整文件路径即可使用，简单易用
+- **易用性**：通过提供简洁的 API，开发者只需发送 HTTP 请求即可完成 PDF 转换
+- **灵活性**：支持本地部署和 Docker 容器部署两种方式，便于快速上手和灵活集成
+
+# 配置推荐
+
+## 常规配置
+
+24G显存的显卡两张，可以支持四个文件同时处理
+
+## 最低配置
+
+**不低于11G** 显存的显卡一张
+
+并设置每张卡处理的进程数为1
+
+```bash
+export PROCESSES_PER_GPU="1"
+```
+
+## 单文件实测速率
+
+| 显卡          | 中文PDF      | 英文PDF      | 扫描件       |
+| ------------- | ------------ | ------------ | ------------ |
+| **4090D 24G** | **0.75s/页** | **1.60s/页** | **3.26s/页** |
+| **P40 24G**   | **0.99s/页** | **2.22s/页** | **5.24s/页** |
+
+## 多文件实测速率
+
+中文PDF+英文PDF
+
+| 显卡          | 串行处理     | 并行处理     | 提升效率  |
+| ------------- | ------------ | ------------ | --------- |
+| **4090D 24G** | **0.92s/页** | **0.62s/页** | **31.9%** |
+| **P40 24G**   | **1.22s/页** | **0.85s/页** | **30.5%** |
+
+# 本地开发
+
+## 基本流程
+
+1. 克隆一个FastGPT的项目文件
+
+   ```
+   git clone https://github.com/labring/FastGPT.git
+   ```
+
+2. 进入项目中 python 目录下的 pdf-marker 文件夹
+
+   ```
+   cd FastGPT/python/pdf-marker
+   ```
+
+3. 创建 conda 环境并安装 requirements.txt 中的依赖
+
+   安装的Anaconda版本：**conda 24.7.1**
+
+   ```
+   conda create -n pdf-marker python=3.11
+   conda activate pdf-marker
+   pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+   ```
+
+4. 执行主文件启动pdf2md服务
+
+   ```
+   python api_mp.py
+   ```
+
+# 镜像打包和部署
+
+## 打包镜像
+
+在 `pdf-marker` 根目录下执行：
+
+```bash
+sudo docker build -t model_pdf -f Dockerfile .
+```
+
+## 运行容器
+
+```bash
+sudo docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 model_pdf
+```
+
+# 访问示例
+
+用 POST 方法访问端口为 `7231` 的 `v1/parse/file` 服务
+
+参数：file-->本地文件的地址
+
+- 访问方法
+
+  ```
+  curl --location --request POST "http://localhost:7231/v1/parse/file" \
+  --header "Authorization: Bearer your_access_token" \
+  --form "file=@./file/chinese_test.pdf"
+  ```
+
+- 多文件测试数据
+
+  运行 `test` 文件夹下的 `test.py` 文件，将其中的 `file_paths` 修改为自己存放 PDF 文件的 `url` 即可
+
+# FAQ
+
+- 如果出现huggingface模型下载不下来?
+
+  可以选择在环境变量中加入huggingface镜像
+
+  ```bash
+  export HF_ENDPOINT=https://hf-mirror.com
+  export HF_DATASETS_CACHE=/huggingface
+  export HUGGINGFACE_HUB_CACHE=/huggingface
+  export HF_HOME=/huggingface
+  ```
+
+  也可以直接访问 [huggingface](https://huggingface.co) 来下载模型到 `/huggingface` 文件夹下
+
+  ```
+  https://huggingface.co/vikp/surya_det3/tree/main
+  https://huggingface.co/vikp/surya_layout3/tree/main
+  https://huggingface.co/vikp/surya_order/tree/main
+  https://huggingface.co/vikp/surya_rec2/tree/main
+  https://huggingface.co/vikp/surya_tablerec/tree/main
+  https://huggingface.co/vikp/texify2/tree/main
+  ```
+
+  
--- a/python/pdf-marker/api_mp.py
+++ b/python/pdf-marker/api_mp.py
@@ -0,0 +1,141 @@
+import asyncio
+import base64
+import fitz
+import torch.multiprocessing as mp
+import shutil
+import time
+from contextlib import asynccontextmanager
+from loguru import logger
+from fastapi import HTTPException, FastAPI, UploadFile, File
+import multiprocessing
+from marker.output import save_markdown
+from marker.convert import convert_single_pdf
+from marker.models import load_all_models
+import torch
+from concurrent.futures import ProcessPoolExecutor
+import os
+app = FastAPI()
+model_lst = None
+model_refs = None
+temp_dir = "./temp"
+os.environ['PROCESSES_PER_GPU'] = str(2)
+
+def worker_init(counter, lock):
+    global model_lst
+    num_gpus = torch.cuda.device_count()
+    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
+    with lock:
+        worker_id = counter.value
+        counter.value += 1
+    if num_gpus == 0:
+        device = 'cpu'
+    else:
+        device_id = worker_id // processes_per_gpu
+        if device_id >= num_gpus:
+            raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).")
+        device = f'cuda:{device_id}'
+    model_lst = load_all_models(device=device, dtype=torch.float32)
+    print(f"Worker {worker_id}: Models loaded successfully on {device}!")
+    for model in model_lst:
+        if model is None:
+            continue
+        model.share_memory()
+
+def process_file_with_multiprocessing(temp_file_path):
+    global model_lst
+    full_text, images, out_meta = convert_single_pdf(temp_file_path, model_lst, batch_multiplier=1)
+    fname = os.path.basename(temp_file_path)
+    subfolder_path = save_markdown(r'./result', fname, full_text, images, out_meta)
+    md_content_with_base64_images = embed_images_as_base64(full_text, subfolder_path)
+    return md_content_with_base64_images, out_meta
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    try:
+        mp.set_start_method('spawn')
+    except RuntimeError:
+        raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.")
+    manager = multiprocessing.Manager()
+    worker_counter = manager.Value('i', 0)
+    worker_lock = manager.Lock()
+    global my_pool
+    gpu_count = torch.cuda.device_count()
+    my_pool = ProcessPoolExecutor(max_workers=gpu_count*int(os.environ.get('PROCESSES_PER_GPU', 1)), initializer=worker_init, initargs=(worker_counter, worker_lock))
+
+    yield
+    global temp_dir
+    if temp_dir and os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)
+    del model_lst
+    del model_refs
+    print("Application shutdown, cleaning up...")
+
+app.router.lifespan_context = lifespan
+
+@app.post("/v1/parse/file")
+async def read_file(
+        file: UploadFile = File(...)):
+    try:
+        start_time = time.time()
+        global temp_dir
+        os.makedirs(temp_dir, exist_ok=True)
+        temp_file_path = os.path.join(temp_dir, file.filename)
+        with open(temp_file_path, "wb") as temp_file:
+            temp_file.write(await file.read())
+        pdf_document = fitz.open(temp_file_path)
+        total_pages = pdf_document.page_count
+        pdf_document.close()
+        global my_pool
+        loop = asyncio.get_event_loop()
+        md_content_with_base64_images, out_meta = await loop.run_in_executor(my_pool, process_file_with_multiprocessing, temp_file_path)
+
+        end_time = time.time()
+        duration = end_time - start_time
+        print(file.filename+"Total time:", duration)
+        return {
+                "success": True,
+                "message": "",
+                "data": {
+                    "markdown": md_content_with_base64_images,
+                    "page": total_pages,
+                    "duration": duration
+                }
+            }
+
+    except Exception as e:
+        logger.exception(e)
+        raise HTTPException(status_code=500, detail=f"错误信息: {str(e)}")
+
+    finally:
+
+        if temp_file_path and os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+def img_to_base64(img_path):
+    with open(img_path, "rb") as img_file:
+        return base64.b64encode(img_file.read()).decode('utf-8')
+def embed_images_as_base64(md_content, image_dir):
+    lines = md_content.split('\n')
+    new_lines = []
+    for line in lines:
+        if line.startswith("![") and "](" in line and ")" in line:
+            start_idx = line.index("](") + 2
+            end_idx = line.index(")", start_idx)
+            img_rel_path = line[start_idx:end_idx]
+
+            img_name = os.path.basename(img_rel_path)
+            img_path = os.path.join(image_dir, img_name)
+
+            if os.path.exists(img_path):
+                img_base64 = img_to_base64(img_path)
+                new_line = f'{line[:start_idx]}data:image/png;base64,{img_base64}{line[end_idx:]}'
+                new_lines.append(new_line)
+            else:
+                new_lines.append(line)
+        else:
+            new_lines.append(line)
+    return '\n'.join(new_lines)
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7231)
+
--- a/python/pdf-marker/pip.conf
+++ b/python/pdf-marker/pip.conf
@@ -0,0 +1,5 @@
+[global]
+# Keys are named after pip's long command-line options, so the socket timeout
+# (--timeout) is spelled "timeout"; "time-out" matches no option.
+timeout=60
+index-url=https://pypi.tuna.tsinghua.edu.cn/simple
+[install]
+trusted-host=pypi.tuna.tsinghua.edu.cn
--- a/python/pdf-marker/requirements.txt
+++ b/python/pdf-marker/requirements.txt
@@ -0,0 +1,108 @@
+acres==0.1.0
+aiofiles==24.1.0
+annotated-types==0.7.0
+anyio==4.6.2.post1
+certifi==2024.8.30
+charset-normalizer==3.4.0
+ci-info==0.3.0
+click==8.1.7
+coloredlogs==15.0.1
+configobj==5.0.9
+configparser==7.1.0
+dol==0.2.83
+etelemetry==0.3.1
+fastapi==0.115.5
+filelock==3.16.1
+filetype==1.2.0
+flatbuffers==24.3.25
+frontend==0.0.3
+fsspec==2024.10.0
+ftfy==6.3.1
+h11==0.14.0
+httplib2==0.22.0
+huggingface-hub==0.26.2
+humanfriendly==10.0
+i2==0.1.36
+idna==3.10
+importlib_resources==6.4.5
+isodate==0.6.1
+itsdangerous==2.2.0
+Jinja2==3.1.4
+joblib==1.4.2
+loguru==0.7.2
+looseversion==1.3.0
+lxml==5.3.0
+marker-pdf==0.3.10
+MarkupSafe==3.0.2
+mpmath==1.3.0
+networkx==3.4.2
+nibabel==5.3.2
+nipype==1.9.1
+numpy==2.1.3
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+onnxruntime==1.20.1
+opencv-python==4.10.0.84
+opencv-python-headless==4.10.0.84
+packaging==24.2
+pandas==2.2.3
+pathlib==1.0.1
+pdftext==0.3.19
+pillow==10.4.0
+pip==24.3.1
+protobuf==5.28.3
+prov==2.0.1
+puremagic==1.28
+pydantic==2.10.0
+pydantic_core==2.27.0
+pydantic-settings==2.6.1
+pydot==3.0.2
+PyMuPDF==1.24.14
+pyparsing==3.2.0
+pypdfium2==4.30.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.17
+pytz==2024.2
+pyxnat==1.6.2
+PyYAML==6.0.2
+RapidFuzz==3.10.1
+rdflib==6.3.2
+regex==2024.11.6
+requests==2.32.3
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+setuptools==75.6.0
+simplejson==3.19.3
+six==1.16.0
+sniffio==1.3.1
+starlette==0.41.3
+surya-ocr==0.6.13
+sympy==1.13.1
+tabled-pdf==0.1.4
+tabulate==0.9.0
+texify==0.2.1
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+torch==2.5.1
+tqdm==4.67.0
+traits==6.4.3
+transformers==4.46.3
+triton==3.1.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.1
+wcwidth==0.2.13
+wheel==0.45.0
--- a/python/pdf-marker/test/test.py
+++ b/python/pdf-marker/test/test.py
@@ -0,0 +1,26 @@
+import json
+import os
+from io import BytesIO
+import requests
+from multiprocessing import Process
+def request_(file_path):
+    url = "http://127.0.0.1:7231/v1/parse/file"
+    response = requests.get(file_path)
+    if response.status_code == 200:
+        file_data = BytesIO(response.content)
+        pdf_name = os.path.basename(file_path)
+        files = {'file': (pdf_name, file_data, 'application/pdf')}
+        response = requests.post(url, files=files)
+        if response.status_code == 200:
+            print("Response JSON:", json.dumps(response.json(), indent=4, ensure_ascii=False))
+        else:
+            print(f"Request failed with status code: {response.status_code}")
+            print(response.text)
+
+if __name__ == "__main__":
+    file_paths = ["https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_test.pdf", "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/chinese_test.pdf",
+                 "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/ocr_test.pdf","https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/3649329.3658477.pdf"]
+    for file_path in file_paths:
+        p = Process(target=request_, args=(file_path))
+        p.start()
+