add mistral-ocr support as a plugin like pdf-marker (#4284)

Co-authored-by: zhengshuai.li <zhengshuai.li@cloudpense.com>
This commit is contained in:
lzs2000131
2025-03-24 10:32:06 +08:00
committed by GitHub
parent f3ef56998d
commit 48233c7d55
4 changed files with 382 additions and 0 deletions

View File

@@ -0,0 +1 @@
MISTRAL_API_KEY=

View File

@@ -0,0 +1,143 @@
# PDF-Mistral 插件
此插件使用 Mistral 的 OCR API 将 PDF 文件转换为 Markdown 文本。它可以从 PDF 文档中提取文本内容和图像,并将它们作为带有嵌入式 base64 图像的 Markdown 返回。
## 功能特点
- 使用 Mistral OCR API 提取 PDF 文本
- Markdown 中的 base64 图像嵌入
- 完善的错误处理
- 支持多页 PDF
## 设置
### 前提条件
- Python 3.8+
- Mistral API 密钥([在此获取](https://mistral.ai/))
### 安装
1. 安装所需的软件包:
```bash
pip install -r requirements.txt
```
2. 通过创建/编辑 `.env` 文件设置环境变量:
```bash
# 在 .env 文件中
MISTRAL_API_KEY=你的-mistral-api-密钥
```
## 使用方法
### 启动服务器
使用以下命令运行服务器:
```bash
python api_mp.py
```
或者直接使用 uvicorn:
```bash
uvicorn api_mp:app --host 0.0.0.0 --port 7231
```
然后将该服务地址写入 FastGPT 的配置文件即可:
```json
{
xxx
"systemEnv": {
xxx
"customPdfParse": {
"url": "http://localhost:7231/v1/parse/file" // 自定义 PDF 解析服务地址
}
}
}
```
### API 端点
#### 解析 PDF 文件
**端点**:`POST /v1/parse/file`
**请求**:
- `multipart/form-data` 表单数据,PDF 文件放在 `file` 字段中
**响应**:
```json
{
"pages": 5, // PDF 中的页数
"markdown": "...", // 带有嵌入式 base64 图像的 Markdown 内容
"duration": 10.5 // 处理时间(秒)
}
```
**错误响应**:
```json
{
"pages": 0,
"markdown": "",
"error": "错误信息"
}
```
### 使用示例
使用 curl:
```bash
curl -X POST -F "file=@path/to/your/document.pdf" http://localhost:7231/v1/parse/file
```
使用 JavaScript/Axios:
```javascript
const formData = new FormData();
formData.append('file', pdfFile);
const response = await axios.post('http://localhost:7231/v1/parse/file', formData, {
headers: {
'Content-Type': 'multipart/form-data'
}
});
if (response.data.error) {
console.error('错误:', response.data.error);
} else {
console.log('页数:', response.data.pages);
console.log('Markdown:', response.data.markdown);
}
```
## 限制
- PDF 文件必须可读且没有密码保护
- 最大文件大小取决于 Mistral API 的限制(目前最大 52.4 MB)
- Mistral API 有页数限制(最多 1000 页)
## 故障排除
### 常见错误
1. **"MISTRAL_API_KEY environment variable not set"(未设置 MISTRAL_API_KEY 环境变量)**
- 确保您已将 Mistral API 密钥添加到 `.env` 文件中
- 确保 `.env` 文件与脚本在同一目录中
2. **"Failed to process PDF file"(无法处理 PDF 文件)**
- PDF 可能已损坏或受密码保护
- 尝试使用其他 PDF 文件
3. **Mistral API 错误**
- 检查您的 Mistral API 密钥是否有效
- 确保您在 Mistral API 速率限制范围内
- 验证 PDF 是否在大小/页数限制范围内
## 许可证
MIT 许可证

View File

@@ -0,0 +1,230 @@
import time
import base64
import fitz
import re
import json
from contextlib import asynccontextmanager
from loguru import logger
from fastapi import HTTPException, FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from mistralai import Mistral
import os
import shutil
from dotenv import load_dotenv
# Pull settings from a local .env file into the process environment before
# anything below reads them.
load_dotenv()

app = FastAPI()

# Scratch directory where uploaded PDFs are written while being processed.
temp_dir = "./temp"

# Build the Mistral client only when an API key is configured; otherwise
# leave it as None so the request handler can report the missing key.
mistral_api_key = os.environ.get("MISTRAL_API_KEY", "")
if mistral_api_key:
    mistral_client = Mistral(api_key=mistral_api_key)
else:
    logger.warning("MISTRAL_API_KEY environment variable not set. PDF processing will fail.")
    mistral_client = None
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the scratch directory for the lifetime of the application.

    On startup the directory named by the module-level ``temp_dir`` is
    created if missing; on shutdown it is removed together with anything
    left inside it.

    Args:
        app: The FastAPI application being started (unused here).
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first; ``temp_dir`` is only read, so no ``global`` statement is needed.
    os.makedirs(temp_dir, exist_ok=True)
    logger.info("Application startup, creating temp directory...")
    try:
        yield
    finally:
        # Run the cleanup even when an exception is thrown into the generator
        # at the yield. Removal is best-effort: a leftover scratch directory
        # must not turn shutdown into a crash.
        if temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
        logger.info("Application shutdown, cleaning up...")


# The app object is created before this function is defined, so the lifespan
# handler is attached to its router after the fact.
app.router.lifespan_context = lifespan
def _error_response(message):
    """Return the failure payload: zero pages, empty markdown and *message*."""
    return {
        "pages": 0,
        "markdown": "",
        "error": message
    }


def _mistral_error_message(exc):
    """Return a readable message for an exception raised by a Mistral call.

    When the exception text is a JSON object whose ``"object"`` field is
    ``"error"``, its ``"message"`` field is returned; in every other case the
    plain exception text is returned unchanged.
    """
    error_msg = str(exc)
    try:
        error_data = json.loads(error_msg)
    except ValueError:
        # Not JSON (json.JSONDecodeError is a ValueError): keep the raw text.
        return error_msg
    if isinstance(error_data, dict) and error_data.get("object") == "error":
        return error_data.get("message", error_msg)
    return error_msg


def _embed_images(markdown_content, pages):
    """Replace Markdown image references with inline base64 data URIs.

    Args:
        markdown_content: Markdown text produced by the OCR call.
        pages: OCR response pages; each page's ``images`` are read for their
            ``id`` and ``image_base64`` attributes.

    Returns:
        The Markdown text with every reference that matches a known image
        rewritten as ``![](<data URI>)``. References without matching image
        data are left untouched and logged as warnings.
    """
    image_map = {}
    for page in pages:
        for img in page.images:
            img_base64 = img.image_base64
            if not img_base64:
                # No payload for this image: skip it instead of calling
                # str methods on None or emitting an empty data URI.
                continue
            if not image_map:
                # One-off sample of the first payload to help diagnose
                # whether the API already includes the data-URI prefix.
                logger.debug(f"Sample image base64 prefix: {img_base64[:50]}")
            if not img_base64.startswith("data:image/"):
                # Assume PNG when the payload carries no type information.
                img_base64 = f"data:image/png;base64,{img_base64}"
            img_id = img.id
            # The Markdown may reference the image by bare id or with an
            # extension, so register every spelling.
            image_map[f"{img_id}.jpeg"] = img_base64
            image_map[f"{img_id}.png"] = img_base64
            image_map[img_id] = img_base64

    def replace_image_with_base64(match):
        # Compare on the file name only; the reference may include a path.
        img_filename_only = os.path.basename(match.group(2))
        data_uri = image_map.get(img_filename_only)
        if data_uri is None:
            logger.warning(f"No base64 data found for image: {img_filename_only}")
            return match.group(0)
        return f"![]({data_uri})"

    # Matches patterns like ![any-text](any-filename.extension)
    image_pattern = r'!\[(.*?)\]\((.*?)\)'
    return re.sub(image_pattern, replace_image_with_base64, markdown_content)


@app.post("/v1/parse/file")
async def read_file(
        file: UploadFile = File(...)):
    """Convert an uploaded PDF to Markdown with the Mistral OCR API.

    The upload is written to a uniquely named file under ``temp_dir``, its
    page count is read with PyMuPDF, and the file is then uploaded to Mistral,
    processed through a signed URL and finally deleted from Mistral again.

    Args:
        file: The uploaded document from the multipart ``file`` field.

    Returns:
        On success a dict with ``pages``, ``markdown`` (images embedded as
        base64 data URIs) and ``duration`` in seconds. On any failure a dict
        with ``pages`` 0, empty ``markdown`` and an ``error`` message; errors
        are reported in the body rather than raised.
    """
    # Local import keeps this block self-contained.
    import uuid

    # NOTE(review): the Mistral SDK calls below are synchronous and run
    # inside an async handler, so they block the event loop while in flight.
    temp_file_path = None
    uploaded_file = None
    try:
        start_time = time.time()
        os.makedirs(temp_dir, exist_ok=True)

        # The file name is client-controlled: strip any directory part so it
        # cannot escape temp_dir, and add a random prefix so concurrent
        # uploads with the same name do not overwrite each other.
        safe_name = os.path.basename(file.filename or "") or "upload.pdf"
        temp_file_path = os.path.join(temp_dir, f"{uuid.uuid4().hex}_{safe_name}")

        file_content = await file.read()
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(file_content)

        # Get page count using PyMuPDF; the context manager closes the
        # document even if reading the count fails.
        try:
            with fitz.open(temp_file_path) as pdf_document:
                total_pages = pdf_document.page_count
        except Exception as e:
            logger.error(f"Failed to open PDF file: {str(e)}")
            return _error_response(f"Failed to process PDF file: {str(e)}")

        if mistral_client is None:
            return _error_response("MISTRAL_API_KEY environment variable not set.")

        # Step 1: Upload the file to Mistral's servers
        logger.info(f"Uploading file {safe_name} to Mistral servers")
        try:
            with open(temp_file_path, "rb") as f:
                uploaded_file = mistral_client.files.upload(
                    file={
                        "file_name": safe_name,
                        "content": f,
                    },
                    purpose="ocr"
                )
        except Exception as e:
            return _error_response(
                f"Mistral API upload error: {_mistral_error_message(e)}")

        # Step 2: Get a signed URL for the uploaded file
        logger.info(f"Getting signed URL for file ID: {uploaded_file.id}")
        try:
            signed_url = mistral_client.files.get_signed_url(file_id=uploaded_file.id)
        except Exception as e:
            return _error_response(
                f"Mistral API signed URL error: {_mistral_error_message(e)}")

        # Step 3: Process the file using the signed URL
        logger.info("Processing file with OCR API")
        try:
            ocr_response = mistral_client.ocr.process(
                model="mistral-ocr-latest",
                document={
                    "type": "document_url",
                    "document_url": signed_url.url,
                },
                include_image_base64=True
            )
        except Exception as e:
            return _error_response(
                f"Mistral OCR processing error: {_mistral_error_message(e)}")

        # Combine all pages' markdown content, then inline the images.
        markdown_content = "\n".join(page.markdown for page in ocr_response.pages)
        markdown_content = _embed_images(markdown_content, ocr_response.pages)

        duration = time.time() - start_time
        logger.info(f"{safe_name} Total time: {duration}")

        # Return with format matching client expectations
        return {
            "pages": total_pages,
            "markdown": markdown_content,
            "duration": duration  # Keep this for logging purposes
        }
    except Exception as e:
        logger.exception(e)
        return _error_response(f"Internal server error: {str(e)}")
    finally:
        # Remove the remote copy on every exit path, including failures after
        # the upload succeeded. A failed delete is only logged.
        if uploaded_file is not None:
            try:
                logger.info(f"Deleting uploaded file from Mistral servers: {uploaded_file.id}")
                mistral_client.files.delete(file_id=uploaded_file.id)
            except Exception as e:
                logger.warning(f"Failed to delete uploaded file: {e}")
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces,
    # port 7231.
    from uvicorn import run as serve

    serve(app, port=7231, host="0.0.0.0")

View File

@@ -0,0 +1,8 @@
fastapi==0.115.5
uvicorn==0.32.1
mistralai>=1.5.0
PyMuPDF==1.24.14
python-multipart==0.0.17
python-dotenv==1.0.1
loguru==0.7.2
requests==2.32.3