diff --git a/python/pdf-marker/Dockerfile b/python/pdf-marker/Dockerfile
new file mode 100644
index 000000000..1d2b5fd0d
--- /dev/null
+++ b/python/pdf-marker/Dockerfile
@@ -0,0 +1,38 @@
+FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-devel
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LANG=C.UTF-8
+# Install the system libraries required by cv2
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
+
+# Configure pip
+RUN mkdir -p /root/.pip
+COPY pip.conf /root/.pip/
+
+# Create the model cache folder
+RUN mkdir -p /root/huggingface
+
+# Copy the dependency files
+COPY requirements.txt /root/
+COPY api_mp.py /root/
+
+# Hugging Face mirror endpoint and model cache location
+ENV HF_ENDPOINT=https://hf-mirror.com \
+    HF_DATASETS_CACHE=/root/huggingface \
+    HUGGINGFACE_HUB_CACHE=/root/huggingface \
+    HF_HOME=/root/huggingface
+
+# Set the working directory
+WORKDIR /root
+
+# Install the Python dependencies
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Remove unnecessary tools and files to keep the image small
+RUN apt-get purge -y vim && apt-get autoremove -y && rm -rf /root/.pip /root/.cache/pip
+
+# Container start command
+CMD ["python3", "api_mp.py"]
\ No newline at end of file
diff --git a/python/pdf-marker/Readme.md b/python/pdf-marker/Readme.md
new file mode 100644
index 000000000..d0f35cd11
--- /dev/null
+++ b/python/pdf-marker/Readme.md
@@ -0,0 +1,131 @@
+# Introduction
+
+This project implements an efficient **PDF-to-Markdown API service** that can process multiple PDF files in parallel across worker processes, converting PDF documents into Markdown text through a simple HTTP interface.
+
+- **Simplicity:** no code changes are required; adjust the file paths and it is ready to use
+- **Ease of use:** with a concise API, developers only need to send an HTTP request to convert a PDF
+- **Flexibility:** supports both local deployment and Docker deployment for quick setup and flexible integration
+
+# Recommended configuration
+
+## Standard configuration
+
+Two GPUs with 24 GB of VRAM each, which supports processing four files concurrently.
+
+## Minimum configuration
+
+One GPU with **at least 11 GB** of VRAM.
+
+Set the number of processes per GPU to 1:
+
+```bash
+export PROCESSES_PER_GPU="1"
+```
+
+## Measured speed (single file)
+
+| GPU           | Chinese PDF     | English PDF     | Scanned PDF     |
+| ------------- | --------------- | --------------- | --------------- |
+| **4090D 24G** | **0.75 s/page** | **1.60 s/page** | **3.26 s/page** |
+| **P40 24G**   | **0.99 s/page** | **2.22 s/page** | **5.24 s/page** |
+
+## Measured speed (multiple files)
+
+Chinese PDF + English PDF
+
+| GPU           | Serial          | Parallel        | Improvement |
+| ------------- | --------------- | --------------- | ----------- |
+| **4090D 24G** | **0.92 s/page** | **0.62 s/page** | **31.9%**   |
+| **P40 24G**   | **1.22 s/page** | **0.85 s/page** | **30.5%**   |
+
+# Local development
+
+## Basic steps
+
+1. Clone the FastGPT repository
+
+   ```
+   git clone https://github.com/labring/FastGPT.git
+   ```
+
+2. Change into the pdf-marker directory under python
+
+   ```
+   cd python/pdf-marker
+   ```
+
+3. Create an Anaconda environment and install the dependencies from requirements.txt
+
+   Anaconda version used: **conda 24.7.1**
+
+   ```
+   conda create -n pdf-marker python=3.11
+   conda activate pdf-marker
+   pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+   ```
+
+4. Run the main script to start the pdf2md service
+
+   ```
+   python api_mp.py
+   ```
+
+# Building and deploying the image
+
+## Build the image
+
+Run the following in the `pdf-marker` root directory:
+
+```bash
+sudo docker build -t model_pdf -f Dockerfile .
+```
+
+## Run the container
+
+```bash
+sudo docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 model_pdf
+```
+
+# API usage example
+
+Send a POST request to the `v1/parse/file` endpoint on port `7231`.
+
+Parameter: `file` -> path of a local file
+
+- Example request (an equivalent Python call is shown below)
+
+  ```
+  curl --location --request POST "http://localhost:7231/v1/parse/file" \
+  --header "Authorization: Bearer your_access_token" \
+  --form "file=@./file/chinese_test.pdf"
+  ```
+
+- Multi-file test data
+
+  Run `test.py` under the `test` folder and change `file_paths` inside it to URLs of your own files.
+
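+- Python example
+
+  The snippet below is a minimal sketch of the same single-file request using the `requests` package (already listed in `requirements.txt`); the token value and file path are placeholders to adjust for your environment:
+
+  ```python
+  import requests
+
+  url = "http://localhost:7231/v1/parse/file"
+  headers = {"Authorization": "Bearer your_access_token"}  # placeholder token
+
+  # Upload a local PDF and print a short summary of the response.
+  with open("./file/chinese_test.pdf", "rb") as f:
+      files = {"file": ("chinese_test.pdf", f, "application/pdf")}
+      response = requests.post(url, headers=headers, files=files)
+
+  result = response.json()
+  print(result["data"]["page"], "pages converted in", round(result["data"]["duration"], 2), "seconds")
+  # result["data"]["markdown"] contains the Markdown text with images embedded as base64.
+  ```
+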
+# FAQ
+
+- What if the Hugging Face models fail to download?
+
+  You can add a Hugging Face mirror to the environment variables:
+
+  ```bash
+  export HF_ENDPOINT=https://hf-mirror.com
+  export HF_DATASETS_CACHE=/huggingface
+  export HUGGINGFACE_HUB_CACHE=/huggingface
+  export HF_HOME=/huggingface
+  ```
+
+  Alternatively, download the models directly from [huggingface](https://huggingface.co) into the `/huggingface` folder:
+
+  ```
+  https://huggingface.co/vikp/surya_det3/tree/main
+  https://huggingface.co/vikp/surya_layout3/tree/main
+  https://huggingface.co/vikp/surya_order/tree/main
+  https://huggingface.co/vikp/surya_rec2/tree/main
+  https://huggingface.co/vikp/surya_tablerec/tree/main
+  https://huggingface.co/vikp/texify2/tree/main
+  ```
\ No newline at end of file
diff --git a/python/pdf-marker/api_mp.py b/python/pdf-marker/api_mp.py
new file mode 100644
index 000000000..893873af6
--- /dev/null
+++ b/python/pdf-marker/api_mp.py
@@ -0,0 +1,141 @@
+import asyncio
+import base64
+import fitz
+import torch.multiprocessing as mp
+import shutil
+import time
+from contextlib import asynccontextmanager
+from loguru import logger
+from fastapi import HTTPException, FastAPI, UploadFile, File
+import multiprocessing
+from marker.output import save_markdown
+from marker.convert import convert_single_pdf
+from marker.models import load_all_models
+import torch
+from concurrent.futures import ProcessPoolExecutor
+import os
+
+app = FastAPI()
+model_lst = None
+model_refs = None
+temp_dir = "./temp"
+# Default number of worker processes per GPU; can be overridden with
+# `export PROCESSES_PER_GPU=...` before starting the service.
+os.environ.setdefault('PROCESSES_PER_GPU', '2')
+
+def worker_init(counter, lock):
+    # Assign this worker a unique id, map it to a GPU and load the marker models.
+    global model_lst
+    num_gpus = torch.cuda.device_count()
+    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
+    with lock:
+        worker_id = counter.value
+        counter.value += 1
+    if num_gpus == 0:
+        device = 'cpu'
+    else:
+        device_id = worker_id // processes_per_gpu
+        if device_id >= num_gpus:
+            raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).")
+        device = f'cuda:{device_id}'
+    model_lst = load_all_models(device=device, dtype=torch.float32)
+    print(f"Worker {worker_id}: Models loaded successfully on {device}!")
+    for model in model_lst:
+        if model is None:
+            continue
+        model.share_memory()
+
+def process_file_with_multiprocessing(temp_file_path):
+    # Convert a single PDF to Markdown inside a worker process and inline its images.
+    global model_lst
+    full_text, images, out_meta = convert_single_pdf(temp_file_path, model_lst, batch_multiplier=1)
+    fname = os.path.basename(temp_file_path)
+    subfolder_path = save_markdown(r'./result', fname, full_text, images, out_meta)
+    md_content_with_base64_images = embed_images_as_base64(full_text, subfolder_path)
+    return md_content_with_base64_images, out_meta
+
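+# The lifespan handler below creates a process pool with
+# (number of GPUs) x PROCESSES_PER_GPU workers; worker_init above pins each
+# worker to a GPU (device id = worker_id // PROCESSES_PER_GPU) and loads its
+# own copy of the models, so several PDFs can be converted in parallel.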
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    try:
+        mp.set_start_method('spawn')
+    except RuntimeError:
+        raise RuntimeError("The multiprocessing start method has already been set; please restart the service and try again.")
+    manager = multiprocessing.Manager()
+    worker_counter = manager.Value('i', 0)
+    worker_lock = manager.Lock()
+    global my_pool
+    gpu_count = torch.cuda.device_count()
+    my_pool = ProcessPoolExecutor(
+        max_workers=gpu_count * int(os.environ.get('PROCESSES_PER_GPU', 1)),
+        initializer=worker_init,
+        initargs=(worker_counter, worker_lock))
+
+    yield
+
+    # Shutdown: remove temporary files and stop the worker pool.
+    global temp_dir
+    if temp_dir and os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)
+    my_pool.shutdown(wait=False)
+    print("Application shutdown, cleaning up...")
+
+app.router.lifespan_context = lifespan
+
+@app.post("/v1/parse/file")
+async def read_file(file: UploadFile = File(...)):
+    temp_file_path = None
+    try:
+        start_time = time.time()
+        global temp_dir
+        os.makedirs(temp_dir, exist_ok=True)
+        temp_file_path = os.path.join(temp_dir, file.filename)
+        with open(temp_file_path, "wb") as temp_file:
+            temp_file.write(await file.read())
+        # Count pages with PyMuPDF before handing the file to a worker process.
+        pdf_document = fitz.open(temp_file_path)
+        total_pages = pdf_document.page_count
+        pdf_document.close()
+        global my_pool
+        loop = asyncio.get_event_loop()
+        md_content_with_base64_images, out_meta = await loop.run_in_executor(
+            my_pool, process_file_with_multiprocessing, temp_file_path)
+
+        duration = time.time() - start_time
+        print(f"{file.filename} total time: {duration:.2f}s")
+        return {
+            "success": True,
+            "message": "",
+            "data": {
+                "markdown": md_content_with_base64_images,
+                "page": total_pages,
+                "duration": duration
+            }
+        }
+
+    except Exception as e:
+        logger.exception(e)
+        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
+
+    finally:
+        if temp_file_path and os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+
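+# marker saves the images it extracts next to the generated Markdown; the two
+# helpers below inline every referenced image into the Markdown text as a
+# base64 data URI (e.g. ![](figure.png) -> ![](data:image/png;base64,...)),
+# so the API can return a single self-contained string.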
+def img_to_base64(img_path):
+    with open(img_path, "rb") as img_file:
+        return base64.b64encode(img_file.read()).decode('utf-8')
+
+def embed_images_as_base64(md_content, image_dir):
+    lines = md_content.split('\n')
+    new_lines = []
+    for line in lines:
+        # Only rewrite Markdown image lines of the form ![alt](path).
+        if line.startswith("![") and "](" in line and ")" in line:
+            start_idx = line.index("](") + 2
+            end_idx = line.index(")", start_idx)
+            img_rel_path = line[start_idx:end_idx]
+
+            img_name = os.path.basename(img_rel_path)
+            img_path = os.path.join(image_dir, img_name)
+
+            if os.path.exists(img_path):
+                img_base64 = img_to_base64(img_path)
+                new_line = f'{line[:start_idx]}data:image/png;base64,{img_base64}{line[end_idx:]}'
+                new_lines.append(new_line)
+            else:
+                new_lines.append(line)
+        else:
+            new_lines.append(line)
+    return '\n'.join(new_lines)
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7231)
+
diff --git a/python/pdf-marker/pip.conf b/python/pdf-marker/pip.conf
new file mode 100644
index 000000000..431ea007d
--- /dev/null
+++ b/python/pdf-marker/pip.conf
@@ -0,0 +1,5 @@
+[global]
+timeout=60
+index-url=https://pypi.tuna.tsinghua.edu.cn/simple
+[install]
+trusted-host=pypi.tuna.tsinghua.edu.cn
diff --git a/python/pdf-marker/requirements.txt b/python/pdf-marker/requirements.txt
new file mode 100644
index 000000000..e4776be33
--- /dev/null
+++ b/python/pdf-marker/requirements.txt
@@ -0,0 +1,108 @@
+acres==0.1.0
+aiofiles==24.1.0
+annotated-types==0.7.0
+anyio==4.6.2.post1
+certifi==2024.8.30
+charset-normalizer==3.4.0
+ci-info==0.3.0
+click==8.1.7
+coloredlogs==15.0.1
+configobj==5.0.9
+configparser==7.1.0
+dol==0.2.83
+etelemetry==0.3.1
+fastapi==0.115.5
+filelock==3.16.1
+filetype==1.2.0
+flatbuffers==24.3.25
+frontend==0.0.3
+fsspec==2024.10.0
+ftfy==6.3.1
+h11==0.14.0
+httplib2==0.22.0
+huggingface-hub==0.26.2
+humanfriendly==10.0
+i2==0.1.36
+idna==3.10
+importlib_resources==6.4.5
+isodate==0.6.1
+itsdangerous==2.2.0
+Jinja2==3.1.4
+joblib==1.4.2
+loguru==0.7.2
+looseversion==1.3.0
+lxml==5.3.0
+marker-pdf==0.3.10
+MarkupSafe==3.0.2
+mpmath==1.3.0
+networkx==3.4.2
+nibabel==5.3.2
+nipype==1.9.1
+numpy==2.1.3
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+onnxruntime==1.20.1
+opencv-python==4.10.0.84
+opencv-python-headless==4.10.0.84
+packaging==24.2
+pandas==2.2.3
+pathlib==1.0.1
+pdftext==0.3.19
+pillow==10.4.0
+pip==24.3.1
+protobuf==5.28.3
+prov==2.0.1
+puremagic==1.28
+pydantic==2.10.0
+pydantic_core==2.27.0
+pydantic-settings==2.6.1
+pydot==3.0.2
+PyMuPDF==1.24.14
+pyparsing==3.2.0
+pypdfium2==4.30.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.17
+pytz==2024.2
+pyxnat==1.6.2
+PyYAML==6.0.2
+RapidFuzz==3.10.1
+rdflib==6.3.2
+regex==2024.11.6
+requests==2.32.3
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+setuptools==75.6.0
+simplejson==3.19.3
+six==1.16.0
+sniffio==1.3.1
+starlette==0.41.3
+surya-ocr==0.6.13
+sympy==1.13.1
+tabled-pdf==0.1.4
+tabulate==0.9.0
+texify==0.2.1
+threadpoolctl==3.5.0
+tokenizers==0.20.3
+torch==2.5.1
+tqdm==4.67.0
+traits==6.4.3
+transformers==4.46.3
+triton==3.1.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+uvicorn==0.32.1
+wcwidth==0.2.13
+wheel==0.45.0
diff --git a/python/pdf-marker/test/test.py b/python/pdf-marker/test/test.py
new file mode 100644
index 000000000..2e40116ab
--- /dev/null
+++ b/python/pdf-marker/test/test.py
@@ -0,0 +1,26 @@
+import json
+import os
+from io import BytesIO
+import requests
+from multiprocessing import Process
+
+def request_(file_path):
+    # Download a PDF from file_path and post it to the local parse service.
+    url = "http://127.0.0.1:7231/v1/parse/file"
+    response = requests.get(file_path)
+    if response.status_code == 200:
+        file_data = BytesIO(response.content)
+        pdf_name = os.path.basename(file_path)
+        files = {'file': (pdf_name, file_data, 'application/pdf')}
+        response = requests.post(url, files=files)
+        if response.status_code == 200:
+            print("Response JSON:", json.dumps(response.json(), indent=4, ensure_ascii=False))
+        else:
+            print(f"Request failed with status code: {response.status_code}")
+            print(response.text)
+
+if __name__ == "__main__":
+    file_paths = ["https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_test.pdf", "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/chinese_test.pdf",
+                  "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/ocr_test.pdf", "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/3649329.3658477.pdf"]
+    processes = []
+    for file_path in file_paths:
+        # args must be a tuple, hence the trailing comma.
+        p = Process(target=request_, args=(file_path,))
+        p.start()
+        processes.append(p)
+    for p in processes:
+        p.join()
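+
+# Note: the URLs in file_paths above point to sample PDFs hosted on Sealos object
+# storage; replace them with URLs of PDFs you can access (see the Readme) when testing.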