add mistral-ocr support as a plugin like pdf-marker (#4284)

Co-authored-by: zhengshuai.li <zhengshuai.li@cloudpense.com>
2025-07-23 13:03:50 +00:00 · 2025-03-24 10:32:06 +08:00
parent f3ef56998d
commit 48233c7d55
4 changed files with 382 additions and 0 deletions
--- a/plugins/model/pdf-mistral/.env
+++ b/plugins/model/pdf-mistral/.env
@@ -0,0 +1 @@
+MISTRAL_API_KEY=
--- a/plugins/model/pdf-mistral/README.md
+++ b/plugins/model/pdf-mistral/README.md
@@ -0,0 +1,143 @@
+# PDF-Mistral 插件
+
+此插件使用 Mistral 的 OCR API 将 PDF 文件转换为 Markdown 文本。它可以从 PDF 文档中提取文本内容和图像，并将它们作为带有嵌入式 base64 图像的 Markdown 返回。
+
+## 功能特点
+
+- 使用 Mistral OCR API 提取 PDF 文本
+- Markdown 中的 base64 图像嵌入
+- 完善的错误处理
+- 支持多页 PDF
+
+## 设置
+
+### 前提条件
+
+- Python 3.8+
+- Mistral API 密钥（[在此获取](https://mistral.ai/)）
+
+### 安装
+
+1. 安装所需的软件包：
+
+```bash
+pip install -r requirements.txt
+```
+
+2. 通过创建/编辑 `.env` 文件设置环境变量：
+
+```bash
+# 在 .env 文件中
+MISTRAL_API_KEY=你的-mistral-api-密钥
+```
+
+## 使用方法
+
+### 启动服务器
+
+使用以下命令运行服务器：
+
+```bash
+python api_mp.py
+```
+
+或者直接使用 uvicorn：
+
+```bash
+uvicorn api_mp:app --host 0.0.0.0 --port 7231
+```
+
+然后将服务地址添加到 FastGPT 的配置文件中即可：
+
+```json
+{
+  xxx
+  "systemEnv": {
+    xxx
+    "customPdfParse": {
+      "url": "http://localhost:7231/v1/parse/file" // 自定义 PDF 解析服务地址
+    }
+  }
+}
+```
+
+### API 端点
+
+#### 解析 PDF 文件
+
+**端点**：`POST /v1/parse/file`
+
+**请求**：
+- `multipart/form-data` 表单数据，其中必须包含名为 `file` 的文件字段（待解析的 PDF）
+
+**响应**：
+```json
+{
+  "pages": 5,                // PDF 中的页数
+  "markdown": "...",         // 带有嵌入式 base64 图像的 Markdown 内容
+  "duration": 10.5           // 处理时间（秒）
+}
+```
+
+**错误响应**：
+```json
+{
+  "pages": 0,
+  "markdown": "",
+  "error": "错误信息"
+}
+```
+
+### 使用示例
+
+使用 curl：
+
+```bash
+curl -X POST -F "file=@path/to/your/document.pdf" http://localhost:7231/v1/parse/file
+```
+
+使用 JavaScript/Axios：
+
+```javascript
+const formData = new FormData();
+formData.append('file', pdfFile);
+
+const response = await axios.post('http://localhost:7231/v1/parse/file', formData, {
+  headers: {
+    'Content-Type': 'multipart/form-data'
+  }
+});
+
+if (response.data.error) {
+  console.error('错误:', response.data.error);
+} else {
+  console.log('页数:', response.data.pages);
+  console.log('Markdown:', response.data.markdown);
+}
+```
+
+## 限制
+
+- PDF 文件必须可读且没有密码保护
+- 最大文件大小取决于 Mistral API 的限制（目前最大约 52.4 MB）
+- Mistral API 有页数限制（目前最多 1000 页）
+
+## 故障排除
+
+### 常见错误
+
+1. **"MISTRAL_API_KEY environment variable not set"（未设置 MISTRAL_API_KEY 环境变量）**
+   - 确保您已将 Mistral API 密钥添加到 `.env` 文件中
+   - 确保 `.env` 文件与脚本在同一目录中
+
+2. **"Failed to process PDF file"（无法处理 PDF 文件）**
+   - PDF 可能已损坏或受密码保护
+   - 尝试使用其他 PDF 文件
+
+3. **Mistral API 错误**
+   - 检查您的 Mistral API 密钥是否有效
+   - 确保您在 Mistral API 速率限制范围内
+   - 验证 PDF 是否在大小/页数限制范围内
+
+## 许可证
+
+MIT 许可证
--- a/plugins/model/pdf-mistral/api_mp.py
+++ b/plugins/model/pdf-mistral/api_mp.py
@@ -0,0 +1,230 @@
+import time
+import base64
+import fitz
+import re
+import json
+from contextlib import asynccontextmanager
+from loguru import logger
+from fastapi import HTTPException, FastAPI, UploadFile, File
+from fastapi.responses import JSONResponse
+from mistralai import Mistral
+import os
+import shutil
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+app = FastAPI()
+temp_dir = "./temp"
+
+# Initialize Mistral client with API key from environment variable
+mistral_api_key = os.environ.get("MISTRAL_API_KEY", "")
+if not mistral_api_key:
+    logger.warning("MISTRAL_API_KEY environment variable not set. PDF processing will fail.")
+    
+mistral_client = Mistral(api_key=mistral_api_key) if mistral_api_key else None
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Create temp directory if it doesn't exist
+    global temp_dir
+    if not os.path.exists(temp_dir):
+        os.makedirs(temp_dir)
+    print("Application startup, creating temp directory...")
+    yield
+    if temp_dir and os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)
+    print("Application shutdown, cleaning up...")
+
+# `app` is created before `lifespan` is defined, so the handler is attached to
+# the router afterwards instead of being passed to the FastAPI constructor.
+app.router.lifespan_context = lifespan
+
+@app.post("/v1/parse/file")
+async def read_file(
+        file: UploadFile = File(...)):
+    temp_file_path = None
+    try:
+        start_time = time.time()
+        global temp_dir
+        os.makedirs(temp_dir, exist_ok=True)
+        temp_file_path = os.path.join(temp_dir, file.filename)
+        with open(temp_file_path, "wb") as temp_file:
+            file_content = await file.read()
+            temp_file.write(file_content)
+        
+        # Get page count using PyMuPDF
+        try:
+            pdf_document = fitz.open(temp_file_path)
+            total_pages = pdf_document.page_count
+            pdf_document.close()
+        except Exception as e:
+            logger.error(f"Failed to open PDF file: {str(e)}")
+            return {
+                "pages": 0,
+                "markdown": "",
+                "error": f"Failed to process PDF file: {str(e)}"
+            }
+        
+        if mistral_client is None:
+            return {
+                "pages": 0,
+                "markdown": "",
+                "error": "MISTRAL_API_KEY environment variable not set."
+            }
+        
+        # Step 1: Upload the file to Mistral's servers
+        logger.info(f"Uploading file {file.filename} to Mistral servers")
+        with open(temp_file_path, "rb") as f:
+            try:
+                uploaded_file = mistral_client.files.upload(
+                    file={
+                        "file_name": file.filename,
+                        "content": f,
+                    },
+                    purpose="ocr"
+                )
+            except Exception as e:
+                error_msg = str(e)
+                # Try to parse Mistral API error format
+                try:
+                    error_data = json.loads(error_msg)
+                    if error_data.get("object") == "error":
+                        error_msg = error_data.get("message", error_msg)
+                except:
+                    pass
+                
+                return {
+                    "pages": 0,
+                    "markdown": "",
+                    "error": f"Mistral API upload error: {error_msg}"
+                }
+        
+        # Step 2: Get a signed URL for the uploaded file
+        logger.info(f"Getting signed URL for file ID: {uploaded_file.id}")
+        try:
+            signed_url = mistral_client.files.get_signed_url(file_id=uploaded_file.id)
+        except Exception as e:
+            error_msg = str(e)
+            # Try to parse Mistral API error format
+            try:
+                error_data = json.loads(error_msg)
+                if error_data.get("object") == "error":
+                    error_msg = error_data.get("message", error_msg)
+            except:
+                pass
+            
+            return {
+                "pages": 0,
+                "markdown": "",
+                "error": f"Mistral API signed URL error: {error_msg}"
+            }
+        
+        # Step 3: Process the file using the signed URL
+        logger.info("Processing file with OCR API")
+        try:
+            ocr_response = mistral_client.ocr.process(
+                model="mistral-ocr-latest",
+                document={
+                    "type": "document_url",
+                    "document_url": signed_url.url,
+                },
+                include_image_base64=True
+            )
+        except Exception as e:
+            error_msg = str(e)
+            # Try to parse Mistral API error format
+            try:
+                error_data = json.loads(error_msg)
+                if error_data.get("object") == "error":
+                    error_msg = error_data.get("message", error_msg)
+            except:
+                pass
+            
+            return {
+                "pages": 0,
+                "markdown": "",
+                "error": f"Mistral OCR processing error: {error_msg}"
+            }
+        
+        # Combine all pages' markdown content
+        markdown_content = "\n".join(page.markdown for page in ocr_response.pages)
+        
+        # Create a dictionary to map image filenames to their base64 data
+        image_map = {}
+        for page in ocr_response.pages:
+            for img in page.images:
+                # Extract the image filename from the image id
+                img_id = img.id
+                img_base64 = img.image_base64
+                
+                # Print a sample of the first image base64 data for debugging
+                if len(image_map) == 0 and img_base64:
+                    print("Sample image base64 prefix:", img_base64[:50] if len(img_base64) > 50 else img_base64)
+                    print("Does base64 already include prefix?", img_base64.startswith("data:image/"))
+                
+                # Ensure the base64 data is in the correct format for the upstream system
+                # If it doesn't already have the prefix, add it
+                if not img_base64.startswith("data:image/"):
+                    # Assume it's a PNG if we can't determine the type
+                    img_base64 = f"data:image/png;base64,{img_base64}"
+                
+                # Add both potential formats to the map
+                image_map[f"{img_id}.jpeg"] = img_base64
+                image_map[f"{img_id}.png"] = img_base64
+                image_map[img_id] = img_base64
+        
+        # Use regex to find all image references in the markdown content
+        # This will match patterns like ![any-text](any-filename.extension)
+        image_pattern = r'!\[(.*?)\]\((.*?)\)'
+        
+        def replace_image_with_base64(match):
+            alt_text = match.group(1)
+            img_filename = match.group(2)
+            
+            # Extract just the filename without path
+            img_filename_only = os.path.basename(img_filename)
+            
+            # Check if we have base64 data for this image
+            if img_filename_only in image_map:
+                return f"![]({image_map[img_filename_only]})"
+            else:
+                # If we don't have base64 data, keep the original reference
+                logger.warning(f"No base64 data found for image: {img_filename_only}")
+                return match.group(0)
+        
+        # Replace all image references with base64 data
+        markdown_content = re.sub(image_pattern, replace_image_with_base64, markdown_content)
+        
+        # Clean up the uploaded file from Mistral's servers
+        try:
+            logger.info(f"Deleting uploaded file from Mistral servers: {uploaded_file.id}")
+            mistral_client.files.delete(file_id=uploaded_file.id)
+        except Exception as e:
+            logger.warning(f"Failed to delete uploaded file: {e}")
+        
+        end_time = time.time()
+        duration = end_time - start_time
+        print(file.filename + " Total time:", duration)
+        
+        # Return with format matching client expectations
+        return {
+            "pages": total_pages,
+            "markdown": markdown_content,
+            "duration": duration  # Keep this for logging purposes
+        }
+
+    except Exception as e:
+        logger.exception(e)
+        return {
+            "pages": 0,
+            "markdown": "",
+            "error": f"Internal server error: {str(e)}"
+        }
+
+    finally:
+        if temp_file_path and os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7231)
--- a/plugins/model/pdf-mistral/requirements.txt
+++ b/plugins/model/pdf-mistral/requirements.txt
@@ -0,0 +1,8 @@
+fastapi==0.115.5
+uvicorn==0.32.1
+mistralai>=1.5.0
+PyMuPDF==1.24.14
+python-multipart==0.0.17
+python-dotenv==1.0.1
+loguru==0.7.2
+requests==2.32.3