diff --git a/plugins/model/pdf-mistral/.env b/plugins/model/pdf-mistral/.env new file mode 100644 index 000000000..79017ded3 --- /dev/null +++ b/plugins/model/pdf-mistral/.env @@ -0,0 +1 @@ +MISTRAL_API_KEY= \ No newline at end of file diff --git a/plugins/model/pdf-mistral/README.md b/plugins/model/pdf-mistral/README.md new file mode 100644 index 000000000..84689efb1 --- /dev/null +++ b/plugins/model/pdf-mistral/README.md @@ -0,0 +1,143 @@ +# PDF-Mistral 插件 + +此插件使用 Mistral 的 OCR API 将 PDF 文件转换为 Markdown 文本。它可以从 PDF 文档中提取文本内容和图像,并将它们作为带有嵌入式 base64 图像的 Markdown 返回。 + +## 功能特点 + +- 使用 Mistral OCR API 提取 PDF 文本 +- Markdown 中的 base64 图像嵌入 +- 完善的错误处理 +- 支持多页 PDF + +## 设置 + +### 前提条件 + +- Python 3.8+ +- Mistral API 密钥([在此获取](https://mistral.ai/)) + +### 安装 + +1. 安装所需的软件包: + +```bash +pip install -r requirements.txt +``` + +2. 通过创建/编辑 `.env` 文件设置环境变量: + +```bash +# 在 .env 文件中 +MISTRAL_API_KEY=你的-mistral-api-密钥 +``` + +## 使用方法 + +### 启动服务器 + +使用以下命令运行服务器: + +```bash +python api_mp.py +``` + +或者直接使用 uvicorn: + +```bash +uvicorn api_mp:app --host 0.0.0.0 --port 7231 +``` + +然后配置到FastGPT配置文件即可 +```json +{ + xxx + "systemEnv": { + xxx + "customPdfParse": { + "url": "http://localhost:7231/v1/parse/file", // 自定义 PDF 解析服务地址 + } + } +} +``` + +### API 端点 + +#### 解析 PDF 文件 + +**端点**:`POST /v1/parse/file` + +**请求**: +- 包含文件字段的多部分表单数据 + +**响应**: +```json +{ + "pages": 5, // PDF 中的页数 + "markdown": "...", // 带有嵌入式 base64 图像的 Markdown 内容 + "duration": 10.5 // 处理时间(秒) +} +``` + +**错误响应**: +```json +{ + "pages": 0, + "markdown": "", + "error": "错误信息" +} +``` + +### 使用示例 + +使用 curl: + +```bash +curl -X POST -F "file=@path/to/your/document.pdf" http://localhost:7231/v1/parse/file +``` + +使用 JavaScript/Axios: + +```javascript +const formData = new FormData(); +formData.append('file', pdfFile); + +const response = await axios.post('http://localhost:7231/v1/parse/file', formData, { + headers: { + 'Content-Type': 'multipart/form-data' + } +}); + +if (response.data.error) { + 
"""PDF-to-Markdown parsing service backed by the Mistral OCR API.

Exposes ``POST /v1/parse/file``. The uploaded PDF is stored in a private
temporary file, its page count is read with PyMuPDF, and the document is sent
through Mistral's upload -> signed URL -> OCR pipeline. The per-page Markdown
is concatenated and every image reference is replaced by an inline base64
data URI.

Responses are always JSON objects. On success they contain ``pages``,
``markdown`` and ``duration``; on failure ``pages`` is 0, ``markdown`` is
empty and ``error`` carries a human-readable message.

Configuration: ``MISTRAL_API_KEY`` (read from the environment or a ``.env``
file next to this script).

Runtime packages: fastapi, uvicorn, mistralai (>=1.5.0), PyMuPDF,
python-multipart, python-dotenv, loguru.
"""

import base64
import json
import os
import re
import shutil
import tempfile
import time
from contextlib import asynccontextmanager
from typing import Any, Dict, Iterable

import fitz
from dotenv import load_dotenv
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from loguru import logger
from mistralai import Mistral

# Load environment variables from .env file
load_dotenv()

# Scratch directory for uploaded PDFs; created on startup, removed on shutdown.
temp_dir = "./temp"

# Initialize Mistral client with API key from environment variable
mistral_api_key = os.environ.get("MISTRAL_API_KEY", "")
if not mistral_api_key:
    logger.warning("MISTRAL_API_KEY environment variable not set. PDF processing will fail.")

mistral_client = Mistral(api_key=mistral_api_key) if mistral_api_key else None

# Matches Markdown image references such as ![any-text](any-filename.extension)
_IMAGE_PATTERN = re.compile(r'!\[(.*?)\]\((.*?)\)')

# Fallback name used when the client does not send a filename.
_DEFAULT_UPLOAD_NAME = "upload.pdf"


class _ParseError(Exception):
    """Pipeline failure whose message is safe to return to the client."""


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the scratch directory on startup and remove it on shutdown."""
    os.makedirs(temp_dir, exist_ok=True)
    logger.info("Application startup, creating temp directory...")
    try:
        yield
    finally:
        # ignore_errors: the directory may already be gone.
        shutil.rmtree(temp_dir, ignore_errors=True)
        logger.info("Application shutdown, cleaning up...")


app = FastAPI(lifespan=lifespan)


def _error_response(message: str) -> Dict[str, Any]:
    """Build the error payload shape that clients of this service expect."""
    return {"pages": 0, "markdown": "", "error": message}


def _safe_suffix(filename: str) -> str:
    """Return a short alphanumeric extension taken from *filename*, else '.pdf'.

    Only the extension of the client-supplied name is reused; the rest of the
    name never reaches the filesystem.
    """
    ext = os.path.splitext(filename or "")[1].lower()
    if len(ext) <= 10 and ext[1:].isalnum():
        return ext
    return ".pdf"


def _mistral_error_message(exc: Exception) -> str:
    """Extract the ``message`` field from a Mistral JSON error, if present.

    Falls back to ``str(exc)`` when no JSON error object can be decoded.
    """
    fallback = str(exc)
    # NOTE(review): recent mistralai SDK errors appear to expose the raw
    # response text as ``.body`` -- confirm against the installed version.
    # ``getattr`` keeps this harmless if the attribute does not exist.
    for candidate in (getattr(exc, "body", None), fallback):
        if not isinstance(candidate, str):
            continue
        try:
            payload = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(payload, dict) and payload.get("object") == "error":
            return str(payload.get("message", fallback))
    return fallback


def _count_pages(pdf_path: str) -> int:
    """Return the page count of the document at *pdf_path* using PyMuPDF."""
    with fitz.open(pdf_path) as document:
        return document.page_count


def _build_image_map(pages: Iterable[Any]) -> Dict[str, str]:
    """Map every plausible image filename to its base64 data URI.

    Each image is registered under its id as well as ``<id>.jpeg`` and
    ``<id>.png``, because the Markdown may reference any of those spellings.
    """
    image_map: Dict[str, str] = {}
    for page in pages:
        for img in page.images:
            data = img.image_base64
            if not data:
                # No payload for this image; the reference is left untouched.
                continue
            if not data.startswith("data:image/"):
                # Assume it's a PNG if we can't determine the type
                data = f"data:image/png;base64,{data}"
            for key in (f"{img.id}.jpeg", f"{img.id}.png", img.id):
                image_map[key] = data
    return image_map


def _inline_images(markdown: str, image_map: Dict[str, str]) -> str:
    """Replace image references in *markdown* with inline base64 data URIs.

    References without a matching entry in *image_map* are kept as-is and a
    warning is logged. The alt text of replaced references is dropped.
    """

    def _replace(match: "re.Match") -> str:
        # Extract just the filename without path
        name = os.path.basename(match.group(2))
        data = image_map.get(name)
        if data is None:
            logger.warning(f"No base64 data found for image: {name}")
            return match.group(0)
        return f"![]({data})"

    return _IMAGE_PATTERN.sub(_replace, markdown)


def _delete_remote_file(file_id: str) -> None:
    """Best-effort removal of an uploaded file from Mistral's servers."""
    try:
        logger.info(f"Deleting uploaded file from Mistral servers: {file_id}")
        mistral_client.files.delete(file_id=file_id)
    except Exception as e:
        logger.warning(f"Failed to delete uploaded file: {e}")


def _ocr_to_markdown(pdf_path: str, file_name: str) -> str:
    """Run the blocking Mistral pipeline and return Markdown with inline images.

    Raises:
        _ParseError: if the upload, signed-URL or OCR call fails; the message
            is suitable for returning to the client.
    """
    # Step 1: Upload the file to Mistral's servers
    logger.info(f"Uploading file {file_name} to Mistral servers")
    with open(pdf_path, "rb") as f:
        try:
            uploaded_file = mistral_client.files.upload(
                file={
                    "file_name": file_name,
                    "content": f,
                },
                purpose="ocr",
            )
        except Exception as e:
            raise _ParseError(
                f"Mistral API upload error: {_mistral_error_message(e)}"
            ) from e

    try:
        # Step 2: Get a signed URL for the uploaded file
        logger.info(f"Getting signed URL for file ID: {uploaded_file.id}")
        try:
            signed_url = mistral_client.files.get_signed_url(file_id=uploaded_file.id)
        except Exception as e:
            raise _ParseError(
                f"Mistral API signed URL error: {_mistral_error_message(e)}"
            ) from e

        # Step 3: Process the file using the signed URL
        logger.info("Processing file with OCR API")
        try:
            ocr_response = mistral_client.ocr.process(
                model="mistral-ocr-latest",
                document={
                    "type": "document_url",
                    "document_url": signed_url.url,
                },
                include_image_base64=True,
            )
        except Exception as e:
            raise _ParseError(
                f"Mistral OCR processing error: {_mistral_error_message(e)}"
            ) from e
    finally:
        # Always clean up the remote copy, including when a later step failed.
        _delete_remote_file(uploaded_file.id)

    # Combine all pages' markdown content
    markdown_content = "\n".join(page.markdown for page in ocr_response.pages)
    return _inline_images(markdown_content, _build_image_map(ocr_response.pages))


@app.post("/v1/parse/file")
async def read_file(
        file: UploadFile = File(...)):
    """Convert an uploaded PDF to Markdown with embedded base64 images.

    Args:
        file: The multipart ``file`` field containing the PDF.

    Returns:
        ``{"pages", "markdown", "duration"}`` on success, or
        ``{"pages": 0, "markdown": "", "error"}`` on failure.
    """
    start_time = time.time()
    # The client-supplied name is used only for logging and as upload
    # metadata, never as a local filesystem path.
    upload_name = os.path.basename(file.filename or "") or _DEFAULT_UPLOAD_NAME
    temp_file_path = None
    try:
        os.makedirs(temp_dir, exist_ok=True)
        # A server-chosen unique path prevents path traversal through the
        # filename and collisions between concurrent uploads.
        fd, temp_file_path = tempfile.mkstemp(
            suffix=_safe_suffix(upload_name), dir=temp_dir
        )
        with os.fdopen(fd, "wb") as temp_file:
            temp_file.write(await file.read())

        # Get page count using PyMuPDF (off the event loop: it is blocking)
        try:
            total_pages = await run_in_threadpool(_count_pages, temp_file_path)
        except Exception as e:
            logger.error(f"Failed to open PDF file: {str(e)}")
            return _error_response(f"Failed to process PDF file: {str(e)}")

        if mistral_client is None:
            return _error_response("MISTRAL_API_KEY environment variable not set.")

        # The Mistral SDK calls are synchronous; run them in a worker thread
        # so other requests are not stalled while OCR is in progress.
        try:
            markdown_content = await run_in_threadpool(
                _ocr_to_markdown, temp_file_path, upload_name
            )
        except _ParseError as e:
            logger.error(str(e))
            return _error_response(str(e))

        duration = time.time() - start_time
        logger.info(f"{upload_name} Total time: {duration}")

        # Return with format matching client expectations
        return {
            "pages": total_pages,
            "markdown": markdown_content,
            "duration": duration,  # Keep this for logging purposes
        }

    except Exception as e:
        logger.exception(e)
        return _error_response(f"Internal server error: {str(e)}")

    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7231)