mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-21 11:43:56 +00:00
38
python/pdf-marker/Dockerfile
Normal file
38
python/pdf-marker/Dockerfile
Normal file
@@ -0,0 +1,38 @@
|
||||
# Image for the pdf-marker PDF-to-Markdown service (api_mp.py).
FROM pytorch/pytorch:2.4.1-cuda12.4-cudnn9-devel

# Build-time only: keeps apt non-interactive without leaking the variable into
# the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

ENV LANG=C.UTF-8

# Hugging Face mirror endpoint and cache locations; every cache variable points
# at the model directory created below.
ENV HF_ENDPOINT=https://hf-mirror.com \
    HF_DATASETS_CACHE=/root/huggingface \
    HUGGINGFACE_HUB_CACHE=/root/huggingface \
    HF_HOME=/root/huggingface

# Working directory for the service; the relative COPY/CMD paths resolve here.
WORKDIR /root

# System libraries needed by OpenCV (cv2). The package set is unchanged from
# the original instruction. Removal of vim and of the apt package lists happens
# in this same layer, because deleting files in a later layer cannot shrink
# the image.
RUN apt-get update \
    && apt-get install -y \
        ffmpeg \
        libsm6 \
        libxext6 \
    && apt-get purge -y vim \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

# Directory the Hugging Face cache variables above point to.
RUN mkdir -p /root/huggingface

# pip configuration (package index mirror); COPY creates /root/.pip itself.
COPY pip.conf /root/.pip/

# Copy only the dependency manifest before installing, so that editing
# api_mp.py does not invalidate the (large) dependency layer.
COPY requirements.txt /root/

# Install Python dependencies, then drop the pip configuration and cache in the
# same layer.
RUN pip3 install --no-cache-dir -r requirements.txt \
    && rm -rf /root/.pip /root/.cache/pip

# Application code goes last because it changes most often.
COPY api_mp.py /root/

# Port api_mp.py binds to (documentation only; publish it with `docker run -p`).
EXPOSE 7231

# Container start command.
CMD ["python3", "api_mp.py"]
|
131
python/pdf-marker/Readme.md
Normal file
131
python/pdf-marker/Readme.md
Normal file
@@ -0,0 +1,131 @@
|
||||
# 项目介绍
|
||||
|
||||
本项目实现了一个高效的 **PDF 转 Markdown 接口服务**,支持多进程并行处理多个 PDF 文件。通过高性能的接口设计,快速将 PDF 文档转换为 Markdown 格式文本。
|
||||
|
||||
- **简洁性:**项目无需修改代码,仅需调整文件路径即可使用,简单易用
|
||||
- **易用性:**通过提供简洁的 API,开发者只需发送 HTTP 请求即可完成 PDF 转换
|
||||
- **灵活性:**支持本地部署和 Docker 容器部署两种方式,便于快速上手和灵活集成
|
||||
|
||||
# 配置推荐
|
||||
|
||||
## 常规配置
|
||||
|
||||
24G显存的显卡两张,可以支持四个文件同时处理
|
||||
|
||||
## 最低配置
|
||||
|
||||
**不低于11G** 显存的显卡一张
|
||||
|
||||
并设置每张卡处理的进程数为1
|
||||
|
||||
```bash
|
||||
export PROCESSES_PER_GPU="1"
|
||||
```
|
||||
|
||||
## 单文件实测速率
|
||||
|
||||
| 显卡 | 中文PDF | 英文PDF | 扫描件 |
|
||||
| ------------- | ------------ | ------------ | ------------ |
|
||||
| **4090D 24G** | **0.75s/页** | **1.60s/页** | **3.26s/页** |
|
||||
| **P40 24G** | **0.99s/页** | **2.22s/页** | **5.24s/页** |
|
||||
|
||||
## 多文件实测速率
|
||||
|
||||
中文PDF+英文PDF
|
||||
|
||||
| 显卡 | 串行处理 | 并行处理 | 提升效率 |
|
||||
| ------------- | ------------ | ------------ | --------- |
|
||||
| **4090D 24G** | **0.92s/页** | **0.62s/页** | **31.9%** |
|
||||
| **P40 24G** | **1.22s/页** | **0.85s/页** | **30.5%** |
|
||||
|
||||
# 本地开发
|
||||
|
||||
## 基本流程
|
||||
|
||||
1. 克隆一个FastGPT的项目文件
|
||||
|
||||
```
|
||||
git clone https://github.com/labring/FastGPT.git
|
||||
```
|
||||
|
||||
2. 进入 `python` 目录下的 `pdf-marker` 文件夹
|
||||
|
||||
```
|
||||
cd FastGPT/python/pdf-marker
|
||||
```
|
||||
|
||||
3. 创建 Anaconda 虚拟环境并安装 requirements.txt 中的依赖
|
||||
|
||||
安装的Anaconda版本:**conda 24.7.1**
|
||||
|
||||
```
|
||||
conda create -n pdf-marker python=3.11
|
||||
conda activate pdf-marker
|
||||
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
```
|
||||
|
||||
4. 执行主文件启动pdf2md服务
|
||||
|
||||
```
|
||||
python api_mp.py
|
||||
```
|
||||
|
||||
# 镜像打包和部署
|
||||
|
||||
## 打包镜像
|
||||
|
||||
在 `pdf-marker` 根目录下执行:
|
||||
|
||||
```bash
|
||||
sudo docker build -t model_pdf -f Dockerfile .
|
||||
```
|
||||
|
||||
## 运行容器
|
||||
|
||||
```bash
|
||||
sudo docker run --gpus all -itd -p 7231:7231 --name model_pdf_v1 model_pdf
|
||||
```
|
||||
|
||||
# 访问示例
|
||||
|
||||
用 POST 方法访问端口为 `7231` 的 `v1/parse/file` 服务
|
||||
|
||||
参数:file-->本地文件的地址
|
||||
|
||||
- 访问方法
|
||||
|
||||
```
|
||||
curl --location --request POST "http://localhost:7231/v1/parse/file" \
|
||||
--header "Authorization: Bearer your_access_token" \
|
||||
--form "file=@./file/chinese_test.pdf"
|
||||
```
|
||||
|
||||
- 多文件测试数据
|
||||
|
||||
运行 `test` 文件下的 `test.py` 文件,修改里面的 `file_paths` 为自己仓库的 `url` 即可
|
||||
|
||||
# FAQ
|
||||
|
||||
- huggingface 模型下载不下来怎么办?
|
||||
|
||||
可以选择在环境变量中加入huggingface镜像
|
||||
|
||||
```bash
|
||||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
export HF_DATASETS_CACHE=/huggingface
|
||||
export HUGGINGFACE_HUB_CACHE=/huggingface
|
||||
export HF_HOME=/huggingface
|
||||
```
|
||||
|
||||
也可以直接访问 [huggingface](https://huggingface.co) 来下载模型到 `/huggingface` 文件夹下
|
||||
|
||||
```
|
||||
https://huggingface.co/vikp/surya_det3/tree/main
|
||||
https://huggingface.co/vikp/surya_layout3/tree/main
|
||||
https://huggingface.co/vikp/surya_order/tree/main
|
||||
https://huggingface.co/vikp/surya_rec2/tree/main
|
||||
https://huggingface.co/vikp/surya_tablerec/tree/main
|
||||
https://huggingface.co/vikp/texify2/tree/main
|
||||
```
|
||||
|
||||
|
141
python/pdf-marker/api_mp.py
Normal file
141
python/pdf-marker/api_mp.py
Normal file
@@ -0,0 +1,141 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import fitz
|
||||
import torch.multiprocessing as mp
|
||||
import shutil
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
from loguru import logger
|
||||
from fastapi import HTTPException, FastAPI, UploadFile, File
|
||||
import multiprocessing
|
||||
from marker.output import save_markdown
|
||||
from marker.convert import convert_single_pdf
|
||||
from marker.models import load_all_models
|
||||
import torch
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
import os
|
||||
# FastAPI application; the route and the lifespan handler defined in this
# module are attached to it.
app = FastAPI()

# Model list used by the conversion function. It is assigned inside each pool
# worker by worker_init(); nothing in this module assigns it in the parent.
model_lst = None
# Not assigned anywhere else in the visible code.
model_refs = None

# Scratch directory for uploaded PDFs; removed when the application shuts down.
temp_dir = "./temp"

# Default to two worker processes per GPU, but keep any value that is already
# exported (e.g. PROCESSES_PER_GPU=1 on small GPUs). An unconditional
# assignment here would silently override the operator's setting, both in the
# server process and in every spawned worker that re-imports this module.
os.environ.setdefault('PROCESSES_PER_GPU', str(2))
|
||||
|
||||
def worker_init(counter, lock):
    """Initializer for a pool worker: claim a worker index and load the models.

    Args:
        counter: shared object whose ``value`` attribute holds the next free
            worker index; it is read and incremented while ``lock`` is held.
        lock: context manager guarding ``counter``.

    Raises:
        ValueError: if the claimed index maps to a GPU index that is not
            available.

    Side effects:
        Stores the loaded models in the module-level ``model_lst`` and prints a
        status line.
    """
    global model_lst

    num_gpus = torch.cuda.device_count()
    slots_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))

    # Claim the next index atomically so no two workers share one.
    with lock:
        worker_id = counter.value
        counter.value = worker_id + 1

    device = 'cpu'
    if num_gpus != 0:
        # Consecutive worker indices are packed onto the same GPU.
        device_id = worker_id // slots_per_gpu
        if device_id >= num_gpus:
            raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).")
        device = f'cuda:{device_id}'

    model_lst = load_all_models(device=device, dtype=torch.float32)
    print(f"Worker {worker_id}: Models loaded successfully on {device}!")

    # Entries may be None; only real models are moved to shared memory.
    for loaded in model_lst:
        if loaded is not None:
            loaded.share_memory()
|
||||
|
||||
def process_file_with_multiprocessing(temp_file_path):
    """Convert one PDF to Markdown using this worker's models.

    Args:
        temp_file_path: path of the PDF file to convert.

    Returns:
        A ``(markdown, out_meta)`` tuple: the Markdown text after it has been
        passed through ``embed_images_as_base64`` and the metadata object
        returned by ``convert_single_pdf``.

    Side effects:
        Calls ``save_markdown`` with ``./result`` as the output root.
    """
    global model_lst

    converted = convert_single_pdf(temp_file_path, model_lst, batch_multiplier=1)
    full_text, images, out_meta = converted

    # The file name of the PDF is what save_markdown receives as its name argument.
    pdf_name = os.path.basename(temp_file_path)
    output_dir = save_markdown(r'./result', pdf_name, full_text, images, out_meta)

    return embed_images_as_base64(full_text, output_dir), out_meta
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the worker pool on start-up and release resources on shutdown.

    Start-up: selects the ``spawn`` start method, creates a manager-backed
    counter and lock for ``worker_init``, and stores a ``ProcessPoolExecutor``
    in the module-level ``my_pool``.

    Shutdown: stops the pool and the manager, removes ``temp_dir`` and resets
    the module-level model references.

    Raises:
        RuntimeError: if the start method has already been set.
    """
    # Declared global so the resets below rebind the module-level names. A bare
    # `del model_lst` without this declaration makes the name local and raises
    # UnboundLocalError during shutdown.
    global my_pool, model_lst, model_refs

    try:
        mp.set_start_method('spawn')
    except RuntimeError as exc:
        raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.") from exc

    manager = multiprocessing.Manager()
    worker_counter = manager.Value('i', 0)
    worker_lock = manager.Lock()

    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
    # worker_init falls back to the CPU when no GPU is visible, so size the pool
    # as if there were one device; max_workers=0 would make the executor raise.
    device_count = max(torch.cuda.device_count(), 1)
    my_pool = ProcessPoolExecutor(
        max_workers=device_count * processes_per_gpu,
        initializer=worker_init,
        initargs=(worker_counter, worker_lock),
    )

    try:
        yield
    finally:
        # Stop the workers first: they use the manager-backed counter and lock.
        my_pool.shutdown(wait=True)
        manager.shutdown()
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        model_lst = None
        model_refs = None
        print("Application shutdown, cleaning up...")


# Register the handler on the already-created application.
app.router.lifespan_context = lifespan
|
||||
|
||||
@app.post("/v1/parse/file")
async def read_file(
        file: UploadFile = File(...)):
    """Convert an uploaded PDF to Markdown.

    The upload is written to ``temp_dir``, its page count is read, and the
    conversion runs in the process pool stored in ``my_pool``.

    Args:
        file: the uploaded PDF.

    Returns:
        A dict with ``success``, ``message`` and ``data``; ``data`` holds the
        Markdown text, the page count and the elapsed time in seconds.

    Raises:
        HTTPException: status 500 carrying the error text if any step fails.
    """
    import uuid  # local import: only this handler needs it

    # Bound before the try block so the finally clause can always test it.
    temp_file_path = None
    try:
        start_time = time.time()
        os.makedirs(temp_dir, exist_ok=True)

        # file.filename is client-controlled. Keep only its last path component
        # so the upload cannot be written outside temp_dir, and add a unique
        # prefix so concurrent uploads with the same name do not overwrite or
        # delete each other's file.
        client_name = (file.filename or "").replace("\\", "/")
        safe_name = os.path.basename(client_name) or "upload.pdf"
        temp_file_path = os.path.join(temp_dir, f"{uuid.uuid4().hex}_{safe_name}")

        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(await file.read())

        pdf_document = fitz.open(temp_file_path)
        try:
            total_pages = pdf_document.page_count
        finally:
            # Close the handle even if reading the page count fails.
            pdf_document.close()

        loop = asyncio.get_running_loop()
        md_content_with_base64_images, out_meta = await loop.run_in_executor(
            my_pool, process_file_with_multiprocessing, temp_file_path)

        duration = time.time() - start_time
        print(f"{file.filename}Total time:", duration)
        return {
            "success": True,
            "message": "",
            "data": {
                "markdown": md_content_with_base64_images,
                "page": total_pages,
                "duration": duration
            }
        }

    except Exception as e:
        logger.exception(e)
        raise HTTPException(status_code=500, detail=f"错误信息: {str(e)}")

    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
|
||||
def img_to_base64(img_path):
    """Return the contents of the file at ``img_path`` as a base64 text string."""
    img_file = open(img_path, "rb")
    try:
        raw_bytes = img_file.read()
    finally:
        img_file.close()
    return str(base64.b64encode(raw_bytes), 'utf-8')
|
||||
def embed_images_as_base64(md_content, image_dir):
    """Replace image links in Markdown text with inline base64 data URIs.

    Only lines that start with ``![`` are considered. The link target is
    reduced to its file name and looked up in ``image_dir``; when that file
    exists the target is replaced by ``data:image/png;base64,<data>``. Every
    other line is returned unchanged.

    Args:
        md_content: Markdown text.
        image_dir: directory containing the image files.

    Returns:
        The Markdown text with the resolvable image links inlined.
    """
    new_lines = []
    for line in md_content.split('\n'):
        if line.startswith("![") and "](" in line:
            start_idx = line.index("](") + 2
            # Search for the closing parenthesis only after the target starts.
            # A ")" that appears earlier (inside the alt text) must not count,
            # otherwise a malformed link raises ValueError.
            end_idx = line.find(")", start_idx)
            if end_idx != -1:
                img_name = os.path.basename(line[start_idx:end_idx])
                img_path = os.path.join(image_dir, img_name)
                # isfile rather than exists: an empty target resolves to
                # image_dir itself, which exists but cannot be read as a file.
                if img_name and os.path.isfile(img_path):
                    img_base64 = img_to_base64(img_path)
                    line = f'{line[:start_idx]}data:image/png;base64,{img_base64}{line[end_idx:]}'
        new_lines.append(line)
    return '\n'.join(new_lines)
|
||||
|
||||
if __name__ == "__main__":
    # Start the HTTP server only when this file is executed as a script.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7231}
    uvicorn.run(app, **server_options)
|
||||
|
5
python/pdf-marker/pip.conf
Normal file
5
python/pdf-marker/pip.conf
Normal file
@@ -0,0 +1,5 @@
|
||||
# pip configuration copied into the image for the dependency installation step.
[global]
# Network timeout in seconds. The option name is "timeout" (as in --timeout);
# "time-out" is not a pip option, so that spelling has no effect.
timeout=60
# Package index to install from.
index-url=https://pypi.tuna.tsinghua.edu.cn/simple
[install]
# Host of the index above, marked as trusted for the install command.
trusted-host=pypi.tuna.tsinghua.edu.cn
|
108
python/pdf-marker/requirements.txt
Normal file
108
python/pdf-marker/requirements.txt
Normal file
@@ -0,0 +1,108 @@
|
||||
acres==0.1.0
|
||||
aiofiles==24.1.0
|
||||
annotated-types==0.7.0
|
||||
anyio==4.6.2.post1
|
||||
certifi==2024.8.30
|
||||
charset-normalizer==3.4.0
|
||||
ci-info==0.3.0
|
||||
click==8.1.7
|
||||
coloredlogs==15.0.1
|
||||
configobj==5.0.9
|
||||
configparser==7.1.0
|
||||
dol==0.2.83
|
||||
etelemetry==0.3.1
|
||||
fastapi==0.115.5
|
||||
filelock==3.16.1
|
||||
filetype==1.2.0
|
||||
flatbuffers==24.3.25
|
||||
frontend==0.0.3
|
||||
fsspec==2024.10.0
|
||||
ftfy==6.3.1
|
||||
h11==0.14.0
|
||||
httplib2==0.22.0
|
||||
huggingface-hub==0.26.2
|
||||
humanfriendly==10.0
|
||||
i2==0.1.36
|
||||
idna==3.10
|
||||
importlib_resources==6.4.5
|
||||
isodate==0.6.1
|
||||
itsdangerous==2.2.0
|
||||
Jinja2==3.1.4
|
||||
joblib==1.4.2
|
||||
loguru==0.7.2
|
||||
looseversion==1.3.0
|
||||
lxml==5.3.0
|
||||
marker-pdf==0.3.10
|
||||
MarkupSafe==3.0.2
|
||||
mpmath==1.3.0
|
||||
networkx==3.4.2
|
||||
nibabel==5.3.2
|
||||
nipype==1.9.1
|
||||
numpy==2.1.3
|
||||
nvidia-cublas-cu12==12.4.5.8
|
||||
nvidia-cuda-cupti-cu12==12.4.127
|
||||
nvidia-cuda-nvrtc-cu12==12.4.127
|
||||
nvidia-cuda-runtime-cu12==12.4.127
|
||||
nvidia-cudnn-cu12==9.1.0.70
|
||||
nvidia-cufft-cu12==11.2.1.3
|
||||
nvidia-curand-cu12==10.3.5.147
|
||||
nvidia-cusolver-cu12==11.6.1.9
|
||||
nvidia-cusparse-cu12==12.3.1.170
|
||||
nvidia-nccl-cu12==2.21.5
|
||||
nvidia-nvjitlink-cu12==12.4.127
|
||||
nvidia-nvtx-cu12==12.4.127
|
||||
onnxruntime==1.20.1
|
||||
opencv-python==4.10.0.84
|
||||
opencv-python-headless==4.10.0.84
|
||||
packaging==24.2
|
||||
pandas==2.2.3
|
||||
pathlib==1.0.1
|
||||
pdftext==0.3.19
|
||||
pillow==10.4.0
|
||||
pip==24.3.1
|
||||
protobuf==5.28.3
|
||||
prov==2.0.1
|
||||
puremagic==1.28
|
||||
pydantic==2.10.0
|
||||
pydantic_core==2.27.0
|
||||
pydantic-settings==2.6.1
|
||||
pydot==3.0.2
|
||||
PyMuPDF==1.24.14
|
||||
pyparsing==3.2.0
|
||||
pypdfium2==4.30.0
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.0.1
|
||||
python-multipart==0.0.17
|
||||
pytz==2024.2
|
||||
pyxnat==1.6.2
|
||||
PyYAML==6.0.2
|
||||
RapidFuzz==3.10.1
|
||||
rdflib==6.3.2
|
||||
regex==2024.11.6
|
||||
requests==2.32.3
|
||||
safetensors==0.4.5
|
||||
scikit-learn==1.5.2
|
||||
scipy==1.14.1
|
||||
setuptools==75.6.0
|
||||
simplejson==3.19.3
|
||||
six==1.16.0
|
||||
sniffio==1.3.1
|
||||
starlette==0.41.3
|
||||
surya-ocr==0.6.13
|
||||
sympy==1.13.1
|
||||
tabled-pdf==0.1.4
|
||||
tabulate==0.9.0
|
||||
texify==0.2.1
|
||||
threadpoolctl==3.5.0
|
||||
tokenizers==0.20.3
|
||||
torch==2.5.1
|
||||
tqdm==4.67.0
|
||||
traits==6.4.3
|
||||
transformers==4.46.3
|
||||
triton==3.1.0
|
||||
typing_extensions==4.12.2
|
||||
tzdata==2024.2
|
||||
urllib3==2.2.3
|
||||
uvicorn==0.32.1
|
||||
wcwidth==0.2.13
|
||||
wheel==0.45.0
|
26
python/pdf-marker/test/test.py
Normal file
26
python/pdf-marker/test/test.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import json
|
||||
import os
|
||||
from io import BytesIO
|
||||
import requests
|
||||
from multiprocessing import Process
|
||||
def request_(file_path):
    """Download one PDF and post it to the local parsing service.

    Args:
        file_path: URL of the PDF to download.

    Prints the JSON response on success, or the status code (and the response
    body for a failed upload) otherwise.
    """
    url = "http://127.0.0.1:7231/v1/parse/file"

    download = requests.get(file_path)
    if download.status_code != 200:
        # Report the failure instead of returning silently, so a broken URL is
        # distinguishable from a request that was never sent.
        print(f"Download failed with status code: {download.status_code} ({file_path})")
        return

    pdf_name = os.path.basename(file_path)
    files = {'file': (pdf_name, BytesIO(download.content), 'application/pdf')}

    response = requests.post(url, files=files)
    if response.status_code == 200:
        print("Response JSON:", json.dumps(response.json(), indent=4, ensure_ascii=False))
    else:
        print(f"Request failed with status code: {response.status_code}")
        print(response.text)
|
||||
|
||||
if __name__ == "__main__":
    # Sample PDFs that are sent to the service concurrently.
    file_paths = ["https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_test.pdf", "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/chinese_test.pdf",
                  "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/ocr_test.pdf", "https://objectstorageapi.bja.sealos.run/czrn86r1-yyh/english_file/3649329.3658477.pdf"]

    workers = []
    for file_path in file_paths:
        # args must be a tuple: "(file_path)" is just a string, which Process
        # would unpack into one positional argument per character.
        p = Process(target=request_, args=(file_path,))
        p.start()
        workers.append(p)

    # Wait for every request to finish before the script exits.
    for p in workers:
        p.join()
|
||||
|
Reference in New Issue
Block a user