Create Python API (#457)

* 更新镜像

* 更新镜像信息

* 更新镜像信息

* Create openai_api.py

* Create requirements.txt

* Create README.md

* 添加python接口

* Delete python directory

* Create README.md

* Create Python API

* 文件结构化

* 文件结构化
This commit is contained in:
不做了睡大觉
2023-11-09 11:52:53 +08:00
committed by GitHub
parent 8bb5588305
commit 9f889d8806
9 changed files with 467 additions and 0 deletions

54
python/api/README.md Normal file
View File

@@ -0,0 +1,54 @@
# FastGPT-python-API
作者:stakeswky。如有问题,请通过邮箱联系我:stakeswky@gmail.com
## 1. 项目简介
该API以python为技术栈,为fastgpt提供了一个简单易用的接口,方便fastgpt处理各种任务。该API的主要功能包括:
1. Word & PDF 图文提取
在现有的文件读取中,fastgpt只能读取文件中的文字,而无法读取图片。该API可以将word和pdf中的文字和图片(通过OCR识别图片中的文字)提取出来,方便fastgpt进行处理。
2. 网页递归获取
该API可以递归获取指定页面的内容,并挖掘该页面中的链接所指向页面的内容。请注意:该功能目前仅支持获取静态页面的内容,遇到动态页面时可能无法获取。
3. (研发中。。)
## 2. 安装方法
### 必要的知识
会使用Google
python的基本用法
docker的基本用法
百度OCR-API的文档:https://ai.baidu.com/ai-doc/OCR/Ek3h7xypm
### 2.1 源码安装
该API依赖于python3.8,请确保您的python版本符合要求。
```shell
pip install -r requirements.txt
```
设置环境变量 APP_ID、API_KEY、SECRET_KEY(百度OCR的凭证),
然后运行:
```shell
python main.py
```
启动!
### 2.2 Docker安装
一把梭拉现成的镜像,直接拉下来用就行了。
```shell
docker pull registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/fastgpt_python_api:1.0
```
然后运行,三个环境变量记得配置成自己的:
```shell
docker run -d -p 6010:6010 -e APP_ID=<your_app_id> -e API_KEY=<your_api_key> -e SECRET_KEY=<your_secret_key> registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/fastgpt_python_api:1.0
```
或者你也可以自己打镜像
```shell
docker build -t fastgpt-python-api .
```
然后运行:
```shell
docker run -d -p 6010:6010 -e APP_ID=<your_app_id> -e API_KEY=<your_api_key> -e SECRET_KEY=<your_secret_key> fastgpt-python-api
```
## 3. 使用方法
目录下附带了两个测试案例,分别是word和pdf的图文提取,以及网页递归获取。参照这两个案例来使用即可。

74
python/api/api.py Normal file
View File

@@ -0,0 +1,74 @@
import os
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from services.office2txt import office_to_txt
from typing import List
from fastapi import HTTPException
from services.fetch import get_summary
import aiofiles
import queue
import uuid
# Request model: the page to start from and how many link levels to follow.
class SummaryRequest(BaseModel):
    url: str  # address of the starting page
    level: int  # crawl depth handed to get_summary; process_summary rejects negative values
# Response model: one summarized page.
class SummaryResponse(BaseModel):
    url: str  # address of the summarized page
    title: str  # title reported for that page
    summary: str  # summary text built for that page
# Response model wrapping the text extracted from an uploaded file.
class ExtractedText(BaseModel):
    text: str  # the extracted text
# Convert an uploaded file to text.
async def process_file(file: UploadFile):
    """Save an uploaded document to a temporary file and extract its text.

    Args:
        file: The uploaded file; its extension selects the extractor.

    Returns:
        ``{"text": ...}`` on success. Otherwise a ``JSONResponse`` carrying
        an ``error`` message, with status 400 when the extension is missing
        or unsupported and status 500 when saving or extraction fails.
    """
    # ``filename`` can be absent on an upload; treat that as "no extension"
    # instead of letting ``os.path.splitext(None)`` raise outside the ``try``.
    file_ext = os.path.splitext(file.filename or "")[1].lower()
    if file_ext not in ('.docx', '.pdf', '.doc', '.txt'):
        return JSONResponse(content={"error": "Unsupported file format"}, status_code=400)
    # Random name so that every upload is written to its own temporary file.
    unique_filename = f"{uuid.uuid4()}{file_ext}"
    try:
        # Stream the upload to disk in 64 KiB chunks rather than 1 KiB ones.
        async with aiofiles.open(unique_filename, "wb") as out_file:
            while True:
                contents = await file.read(64 * 1024)
                if not contents:
                    break
                await out_file.write(contents)
        # NOTE: office_to_txt is a plain synchronous call, so the coroutine
        # does not yield while the document is being converted.
        extracted_text = office_to_txt(unique_filename)
        print(extracted_text)
        return {"text": extracted_text}
    except Exception as e:
        # Best-effort boundary: report any failure to the client as a 500.
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        # Always remove the temporary copy; it may not exist if opening failed.
        try:
            os.remove(unique_filename)
        except FileNotFoundError:
            pass
# Build page summaries for a summary request.
async def process_summary(request):
    """Summarize the requested page and the pages it links to.

    Args:
        request: Object exposing ``url`` and ``level`` attributes.

    Returns:
        A list of ``SummaryResponse`` objects, one per tuple returned by
        ``get_summary``.

    Raises:
        HTTPException: 400 when ``level`` is negative; 500 wrapping any
            exception raised while fetching or building the responses.
    """
    if request.level < 0:
        raise HTTPException(status_code=400, detail="Level must be non-negative.")
    try:
        summaries = get_summary(request.url, request.level)
        print(summaries)
        # Convert each (url, title, summary) tuple into a response model.
        responses = []
        for page_url, page_title, page_summary in summaries:
            responses.append(
                SummaryResponse(url=page_url, title=page_title, summary=page_summary)
            )
        return responses
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

20
python/api/dockerfile Normal file
View File

@@ -0,0 +1,20 @@
# Use the official Python runtime as the parent image
FROM python:3.8
# Set the working directory
WORKDIR /app
# pdf2image shells out to poppler's command-line tools, which the base image
# does not provide, so install them explicitly
RUN apt-get update \
    && apt-get install -y --no-install-recommends poppler-utils \
    && rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade -i https://pypi.tuna.tsinghua.edu.cn/simple pip
# Install the Python dependencies before copying the sources, so this layer
# stays cached when only application code changes
COPY requirements.txt /app/
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --no-cache-dir -r requirements.txt
RUN python -m nltk.downloader punkt
# Copy the application code into /app (COPY rather than ADD: no archive or URL handling is needed)
COPY . /app
# Port the service listens on at runtime
EXPOSE 6010
# Command and arguments used to start main.py
CMD ["python", "main.py"]

48
python/api/main.py Normal file
View File

@@ -0,0 +1,48 @@
from fastapi.middleware.cors import CORSMiddleware
from fastapi import FastAPI, File, UploadFile
import queue
from typing import List
from api import SummaryRequest, SummaryResponse, ExtractedText,process_file,process_summary
import uvicorn
# Application instance that the route handlers below are registered on.
app = FastAPI()

# NOTE(review): every origin, method and header is allowed while credentials
# are enabled — confirm this permissive CORS policy is intended.
_CORS_OPTIONS = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)

# Module-level FIFO queue that both handlers pass their input through.
q = queue.Queue()
# Endpoint: accept an uploaded file, pass it through the queue and return the
# result of process_file (the extracted text, or its error response).
@app.post("/extract_text/", response_model=ExtractedText)
async def extract_text(file: UploadFile = File(...)):
    # Enqueue the upload and take the next item off the queue (first in,
    # first out).
    # NOTE: put() and get() run back-to-back with no await in between, so this
    # hands back the object just queued; it does not serialize requests.
    q.put(file)
    file = q.get()
    try:
        return await process_file(file)
    finally:
        # Mark the queue item done even when processing raises, so the
        # queue's unfinished-task counter cannot drift upward.
        q.task_done()
# Endpoint: accept a summary request, pass it through the queue and return the
# list of page summaries produced by process_summary.
@app.post("/generate_summary/", response_model=List[SummaryResponse])
async def generate_summary(request: SummaryRequest):
    # Enqueue the request and take the next item off the queue (first in,
    # first out).
    # NOTE: put() and get() run back-to-back with no await in between, so this
    # hands back the object just queued; it does not serialize requests.
    q.put(request)
    request = q.get()
    try:
        return await process_summary(request)
    finally:
        # process_summary reports failures by raising HTTPException; mark the
        # queue item done on that path too.
        q.task_done()
if __name__ == "__main__":
    # Start the server only when this file is run as a script: listen on all
    # interfaces, port 6010.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=6010,
    )

View File

@@ -0,0 +1,14 @@
aiofiles==23.2.1
baidu_aip==4.16.12
beautifulsoup4==4.11.1
fastapi==0.104.1
nltk==3.8.1
pdf2image==1.16.3
pydantic==1.10.7
PyPDF2==3.0.1
python_docx==0.8.11
python_pptx==0.6.21
Requests==2.31.0
uvicorn==0.24.0
chardet==5.2.0
python-multipart==0.0.6

View File

@@ -0,0 +1,90 @@
import requests
import bs4
import nltk
from urllib.parse import urljoin
from time import sleep
import time
import math
# Timestamp taken once, when this module is imported.
start_time = time.time()
# Maximum run time in seconds. Only the commented-out check in helper()
# refers to it, so it currently has no effect.
max_run_time = 20
# A simple inverse-document-frequency (IDF) calculator.
class SimpleIDFCalculator:
    """Track per-word document frequencies and derive smoothed IDF weights.

    Attributes are plain counters so callers can inspect them:
    ``doc_freq`` maps a word to the number of added documents containing it,
    and ``num_docs`` is the number of documents added so far.
    """

    def __init__(self):
        self.doc_freq = {}
        self.num_docs = 0

    def add_document(self, doc):
        """Register one document, counting each distinct token once.

        Args:
            doc: Document text; it is tokenized with ``nltk.word_tokenize``.
        """
        self.num_docs += 1
        for word in set(nltk.word_tokenize(doc)):
            self.doc_freq[word] = self.doc_freq.get(word, 0) + 1

    def idf(self, word):
        """Return the smoothed IDF weight ``ln((1 + N) / (1 + df)) + 1``.

        Smoothing both counts keeps the weight defined when no document has
        been added yet, and strictly positive even for a word that occurs in
        every document. The unsmoothed ``ln(N / (1 + df))`` raises for
        ``N == 0`` and is negative whenever ``df == N``, which makes
        TF-IDF scores rank the most relevant sentences last.

        Args:
            word: Token to weight; unseen tokens count as ``df == 0``.

        Returns:
            A float that is at least 1.0 as long as ``df <= N``.
        """
        seen_in = self.doc_freq.get(word, 0)
        return math.log((1 + self.num_docs) / (1 + seen_in)) + 1
# Fetch a page (and the pages it links to) and summarize each one.
def get_summary(url, level):
    """Crawl from ``url`` down to ``level`` link levels and summarize each page.

    Args:
        url: Address of the first page.
        level: Depth budget handed to ``helper``.

    Returns:
        The list that ``helper`` fills with ``(url, title, summary)`` tuples.
    """
    collected = []
    # A fresh visited set and IDF calculator are used for every call.
    helper(url, level, collected, set(), SimpleIDFCalculator())
    return collected
# Recursive worker behind get_summary.
def helper(url, level, result, visited, idf_calculator):
    """Summarize one page, then recurse into the pages it links to.

    Args:
        url: Page to fetch; anything not starting with ``"http"`` is skipped.
        level: Remaining depth; nothing is fetched once it is zero or less.
        result: List that receives one ``(url, title, summary)`` tuple per
            successfully fetched page.
        visited: Set of URLs already attempted; updated in place.
        idf_calculator: Object providing ``add_document(text)`` and
            ``idf(word)``.

    Any exception raised while processing the page is printed and swallowed,
    so one bad page does not abort the whole crawl.
    """
    # The wall-clock budget check (start_time / max_run_time) is disabled.
    # ``<= 0`` rather than ``== 0``: a negative depth must stop immediately
    # instead of counting down forever.
    if level <= 0 or url in visited or not url.startswith("http"):
        return
    visited.add(url)
    try:
        # Bounded wait so one unresponsive host cannot stall the crawl.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        # ``soup.title.string`` is None when <title> is empty or contains
        # nested tags; fall back to the placeholder in that case too.
        title = (soup.title.string if soup.title else None) or 'No Title'
        text = soup.get_text().strip()
        idf_calculator.add_document(text)
        sentences = nltk.sent_tokenize(text)
        words = nltk.word_tokenize(text)
        # Count every token once up front instead of calling words.count()
        # for each word of each sentence.
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0) + 1
        total_words = len(words)
        # Score each sentence by the summed TF-IDF of its tokens.
        scores = {}
        for sentence in sentences:
            for word in nltk.word_tokenize(sentence):
                tf = word_counts.get(word, 0) / total_words
                idf = idf_calculator.idf(word)
                scores[sentence] = scores.get(sentence, 0) + (tf * idf)
        # The summary is the ten best-scoring sentences, best first.
        summary = " ".join(sorted(scores, key=scores.get, reverse=True)[:10])
        result.append((url, title, summary))
        sleep(1)  # Simple delay to prevent aggressive crawling
        for link in soup.find_all("a"):
            href = link.get("href")
            if href:
                # Handle relative links
                next_url = urljoin(url, href)
                helper(next_url, level - 1, result, visited, idf_calculator)
    except Exception as e:
        print(f"Error processing {url}: {e}")
# # 主程序部分,仅作为函数调用示例:
# summary = get_summary('https://zhihu.com', 2)
# print(summary)

View File

@@ -0,0 +1,93 @@
import os
import docx
from aip import AipOcr
from io import BytesIO
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
# Baidu OCR API settings, read from the environment; each one falls back to
# the placeholder 'xxx' when its variable is not set.
APP_ID = os.getenv('APP_ID', 'xxx')
API_KEY = os.getenv('API_KEY', 'xxx')
SECRET_KEY = os.getenv('SECRET_KEY', 'xxx')
# OCR client created once at import time and used by ocr_image().
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
def ocr_image(image_data):
    """Run OCR on raw image bytes and return the recognized lines.

    Args:
        image_data: Image content passed to ``client.basicGeneral``.

    Returns:
        Every ``words`` entry of the response's ``words_result`` list, each
        followed by a newline; an empty string when the response has no
        ``words_result`` key.
    """
    result = client.basicGeneral(image_data)
    if 'words_result' not in result:
        return ''
    return ''.join(item['words'] + '\n' for item in result['words_result'])
# Extract the text of every page of a PDF and additionally OCR a rendered
# image of each page.
# Returns one string: for each page a separator line, the file's base name,
# the page number (page_num + 1) and the text from page.extract_text(),
# plus the OCR output whenever ocr_image() returns a non-empty string.
# NOTE(review): indentation is missing in this copy of the file, so it cannot
# be told from here whether the final separator line is appended once per
# page or once after the page loop — confirm against the original.
def process_pdf(file_path):
pdf = PdfReader(file_path)
num_pages = len(pdf.pages)
text = ''
for page_num in range(num_pages):
page = pdf.pages[page_num]
text += f'--------------------------------------------\n'
text += f'文档名:{os.path.basename(file_path)}\n'
text += f'页数:{page_num + 1}\n'
text += f'该页内容:\n'
text += page.extract_text() + '\n'
# first_page and last_page are both page_num + 1, i.e. only this page is rendered
images = convert_from_path(file_path, first_page=page_num + 1, last_page=page_num + 1)
for image in images:
# Serialize the rendered image to PNG bytes in memory for the OCR call
image_data = BytesIO()
image.save(image_data, format='PNG')
image_data = image_data.getvalue()
ocr_text = ocr_image(image_data)
# Only add the OCR section when some text was recognized
if ocr_text:
text += f'图片文字:\n'
text += ocr_text + '\n'
text += '--------------------------------------------\n'
return text
# Extract the paragraph text of a document opened with docx.Document and OCR
# its inline pictures.
# Returns one string: for each non-blank paragraph a separator line, the
# file's base name, the current page counter and the paragraph text, plus the
# OCR output of inline picture shapes whenever ocr_image() returns text.
# NOTE(review): indentation is missing in this copy of the file; the
# inline-shape loop presumably runs once at function level, after the
# paragraph loop, rather than inside it — confirm against the original.
def process_doc(file_path):
doc = docx.Document(file_path)
text = ''
page_num = 1
for paragraph in doc.paragraphs:
if paragraph.text.strip() == '': # Crude heuristic: treat a blank paragraph as a page break
page_num += 1
else:
text += f'--------------------------------------------\n'
text += f'文档名:{os.path.basename(file_path)}\n'
text += f'页数:{page_num}\n'
text += f'该页内容:\n'
text += paragraph.text + '\n'
for shape in doc.inline_shapes:
# Only picture shapes are sent to OCR
if shape.type == docx.enum.shape.WD_INLINE_SHAPE.PICTURE:
# Resolve the picture's relationship id to the embedded image part
blip_id = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
image_part = doc.part.related_parts[blip_id]
image_data = image_part.blob
ocr_text = ocr_image(image_data)
if ocr_text:
text += f'图片文字:\n'
text += ocr_text + '\n'
return text
def process_txt(file_path, encoding='utf-8'):
    """Return the full contents of a plain-text file.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``'utf-8'``, the previously hard-coded value, so existing callers
            are unaffected; pass e.g. ``'gbk'`` for files saved in another
            encoding.

    Returns:
        The decoded file contents as one string.

    Raises:
        UnicodeDecodeError: If the file is not valid in ``encoding``.
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()
def office_to_txt(file_path):
    """Extract text from a file, choosing the extractor by its extension.

    Args:
        file_path: Path of the file; the extension is matched
            case-insensitively.

    Returns:
        The text produced by ``process_doc`` (``.docx`` / ``.doc``),
        ``process_pdf`` (``.pdf``) or ``process_txt`` (``.txt``).

    Raises:
        ValueError: If the extension is none of the above.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix not in ('.docx', '.doc', '.pdf', '.txt'):
        raise ValueError('Unsupported file format')
    if suffix == '.pdf':
        return process_pdf(file_path)
    if suffix == '.txt':
        return process_txt(file_path)
    # Both Word extensions are handled by process_doc.
    return process_doc(file_path)

View File

@@ -0,0 +1,25 @@
import requests
# URL of the summary endpoint on the locally running service
api_url = "http://127.0.0.1:6010/generate_summary/"
# Request payload: the page to start from and the crawl depth
data = {"url": "https://bing.com", "level": 1}
# Send the POST request
response = requests.post(api_url, json=data)
# Check the response status
if response.status_code != 200:
    # Request failed: print the status code and the raw response body
    print(f"Failed to generate summary with status code {response.status_code}: {response.text}")
else:
    # Request succeeded: print every returned page summary
    summaries = response.json()
    for summary in summaries:
        print(f"URL: {summary['url']}")
        print(f"Title: {summary['title']}")
        print(f"Summary: {summary['summary']}\n")

View File

@@ -0,0 +1,49 @@
import requests
import pytest
from docx import Document
import os
from tempfile import NamedTemporaryFile
from docx.shared import Inches
# Sample picture embedded into the generated documents; it is looked up in
# the directory that contains this test module.
image_path = os.path.join(os.path.split(__file__)[0], "test.png")
# Create a new Word document containing one paragraph and one picture.
def create_test_docx_with_image():
    """Write a temporary ``.docx`` with a paragraph and an image.

    Returns:
        Path of the saved document. The caller is responsible for deleting it.

    Raises:
        Exception: Whatever python-docx raises while building or saving the
            document (for example when ``image_path`` does not exist); the
            temporary file is removed before the error propagates.
    """
    # Use a temporary file to avoid name clashes. Close the handle before
    # python-docx writes to the path: a NamedTemporaryFile that is still open
    # cannot be reopened by name on Windows.
    with NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
        docx_path = temp_file.name
    try:
        doc = Document()
        doc.add_paragraph("This is a test document with an image.")
        # The picture path must be valid; the width is set to 1.25 inches.
        doc.add_picture(image_path, width=Inches(1.25))
        doc.save(docx_path)
    except Exception:
        # Do not leave an empty temporary file behind when building fails.
        os.unlink(docx_path)
        raise
    return docx_path
# Create and upload several Word documents and return the response objects.
def get_responses():
    """Upload ten generated documents to the local ``/extract_text/`` endpoint.

    Returns:
        A list with the ``requests`` response of each upload, in order.

    Raises:
        Exception: Whatever document creation or ``requests.post`` raises,
            e.g. when the service is not running; the temporary document of
            the failing iteration is still removed.
    """
    responses = []
    # Create and send 10 documents
    for _ in range(10):
        test_file_path = create_test_docx_with_image()
        try:
            with open(test_file_path, "rb") as f:
                files = {"file": (os.path.basename(test_file_path), f, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")}
                responses.append(
                    requests.post("http://127.0.0.1:6010/extract_text/", files=files)
                )
        finally:
            # Delete the temporary document even when the upload raises.
            os.unlink(test_file_path)
    return responses
# Use pytest's parametrize decorator to check every collected response.
@pytest.mark.parametrize("response", get_responses())
def test_response(response):
    """Check one upload response: status 200, JSON content type, ``text`` key."""
    # The request must have succeeded
    assert response.status_code == 200
    # The body must be declared as JSON
    content_type = response.headers["Content-Type"]
    assert "application/json" in content_type
    # The payload must carry the extracted text
    payload = response.json()
    assert "text" in payload