From a2250b3a447a150816968a07f8e3ec192945a4d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=B8=8D=E5=81=9A=E4=BA=86=E7=9D=A1=E5=A4=A7=E8=A7=89?= <64798754+stakeswky@users.noreply.github.com> Date: Tue, 24 Feb 2026 13:40:00 +0800 Subject: [PATCH] cleanup: remove obsolete llm-ChatGLM2 and llm-Baichuan2 plugins (#6444) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These plugins provided OpenAI-compatible API wrappers for ChatGLM2 and Baichuan2 models. Both are now obsolete: - ChatGLM2 (2023) has been superseded by GLM-4 series with official OpenAI-compatible APIs - Baichuan2 (2023) has been superseded by Baichuan 4 with official OpenAI-compatible APIs FastGPT's model system already supports any OpenAI-compatible endpoint via requestUrl/requestAuth configuration — no self-hosted wrapper needed. Zero references to these plugins exist in the codebase. --- plugins/model/llm-Baichuan2/openai_api.py | 233 ----------------- plugins/model/llm-Baichuan2/requirements.txt | 14 - plugins/model/llm-ChatGLM2/openai_api.py | 260 ------------------- plugins/model/llm-ChatGLM2/requirements.txt | 11 - 4 files changed, 518 deletions(-) delete mode 100644 plugins/model/llm-Baichuan2/openai_api.py delete mode 100644 plugins/model/llm-Baichuan2/requirements.txt delete mode 100644 plugins/model/llm-ChatGLM2/openai_api.py delete mode 100644 plugins/model/llm-ChatGLM2/requirements.txt diff --git a/plugins/model/llm-Baichuan2/openai_api.py b/plugins/model/llm-Baichuan2/openai_api.py deleted file mode 100644 index ddcb6e9cb2..0000000000 --- a/plugins/model/llm-Baichuan2/openai_api.py +++ /dev/null @@ -1,233 +0,0 @@ -# coding=utf-8 -# Implements API for Baichuan2-7B-Chat in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat) -# Usage: python openai_api.py - -import gc -import time -import torch -import uvicorn -from pydantic import BaseModel, Field, validator -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from contextlib import asynccontextmanager -from typing import Any, Dict, List, Optional, Union -from transformers import AutoModelForCausalLM, AutoTokenizer -from sse_starlette.sse import ServerSentEvent, EventSourceResponse -from transformers.generation.utils import GenerationConfig -import random -import string - - -@asynccontextmanager -async def lifespan(app: FastAPI): # collects GPU memory - yield - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.ipc_collect() - - -app = FastAPI(lifespan=lifespan) - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -class ModelCard(BaseModel): - id: str - object: str = "model" - created: int = Field(default_factory=lambda: int(time.time())) - owned_by: str = "owner" - root: Optional[str] = None - parent: Optional[str] = None - permission: Optional[list] = None - -class ModelList(BaseModel): - object: str = "list" - data: List[str] = [] # Assuming ModelCard is a string type. Replace with the correct type if not. - -class ChatMessage(BaseModel): - role: str - content: str - - @validator('role') - def check_role(cls, v): - if v not in ["user", "assistant", "system"]: - raise ValueError('role must be one of "user", "assistant", "system"') - return v - -class DeltaMessage(BaseModel): - role: Optional[str] = None - content: Optional[str] = None - - @validator('role', allow_reuse=True) - def check_role(cls, v): - if v is not None and v not in ["user", "assistant", "system"]: - raise ValueError('role must be one of "user", "assistant", "system"') - return v - -class ChatCompletionRequest(BaseModel): - model: str - messages: List[ChatMessage] - temperature: Optional[float] = None - top_p: Optional[float] = None - max_length: Optional[int] = 8192 # max_length should be an integer. - stream: Optional[bool] = False - -class ChatCompletionResponseChoice(BaseModel): - index: int - message: ChatMessage - finish_reason: str - - @validator('finish_reason') - def check_finish_reason(cls, v): - if v not in ["stop", "length"]: - raise ValueError('finish_reason must be one of "stop" or "length"') - return v - -class ChatCompletionResponseStreamChoice(BaseModel): - index: int - delta: DeltaMessage - finish_reason: Optional[str] - - @validator('finish_reason', allow_reuse=True) - def check_finish_reason(cls, v): - if v is not None and v not in ["stop", "length"]: - raise ValueError('finish_reason must be one of "stop" or "length"') - return v - -class ChatCompletionResponse(BaseModel): - id:str - object:str - - @validator('object') - def check_object(cls,v): - if v not in ["chat.completion","chat.completion.chunk"]: - raise ValueError("object must be one of 'chat.completion' or 'chat.completion.chunk'") - return v - - created :Optional[int]=Field(default_factory=lambda:int(time.time())) - model:str - choices :List[Union[ChatCompletionResponseChoice,ChatCompletionResponseStreamChoice]] - - -def generate_id(): - possible_characters = string.ascii_letters + string.digits - random_string = ''.join(random.choices(possible_characters, k=29)) - return 'chatcmpl-' + random_string - - -@app.get("/v1/models", response_model=ModelList) -async def list_models(): - global model_args - model_card = ModelCard(id="gpt-3.5-turbo") - return ModelList(data=[model_card]) - - -@app.post("/v1/chat/completions", response_model=ChatCompletionResponse) -async def create_chat_completion(request: ChatCompletionRequest): - global model, tokenizer - if request.messages[-1].role != "user": - raise HTTPException(status_code=400, detail="Invalid request") - query = request.messages[-1].content - prev_messages = request.messages[:-1] - if len(prev_messages) > 0 and prev_messages[0].role == "system": - query = prev_messages.pop(0).content + query - messages = [] - for message in prev_messages: - messages.append({"role": message.role, "content": message.content}) - - messages.append({"role": "user", "content": query}) - - if request.stream: - generate = predict(messages, request.model) - return EventSourceResponse(generate, media_type="text/event-stream") - - response = '本接口不支持非stream模式' - choice_data = ChatCompletionResponseChoice( - index=0, - message=ChatMessage(role="assistant", content=response), - finish_reason="stop" - ) - id='chatcmpl-7QyqpwdfhqwajicIEznoc6Q47XAyW' - - return ChatCompletionResponse(id=id,model=request.model, choices=[choice_data], object="chat.completion") - - -async def predict(messages: List[List[str]], model_id: str): - global model, tokenizer - id = generate_id() - created = int(time.time()) - choice_data = ChatCompletionResponseStreamChoice( - index=0, - delta=DeltaMessage(role="assistant",content=""), - finish_reason=None - ) - chunk = ChatCompletionResponse(id=id,object="chat.completion.chunk",created=created,model=model_id, choices=[choice_data]) - yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False)) - - current_length = 0 - - for new_response in model.chat(tokenizer, messages, stream=True): - if len(new_response) == current_length: - continue - - new_text = new_response[current_length:] - current_length = len(new_response) - - choice_data = ChatCompletionResponseStreamChoice( - index=0, - delta=DeltaMessage(content=new_text), - finish_reason=None - ) - chunk = ChatCompletionResponse(id=id,object="chat.completion.chunk",created=created,model=model_id, choices=[choice_data]) - yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False)) - - - choice_data = ChatCompletionResponseStreamChoice( - index=0, - delta=DeltaMessage(), - finish_reason="stop" - ) - chunk = ChatCompletionResponse(id=id,object="chat.completion.chunk",created=created,model=model_id, choices=[choice_data]) - yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False)) - yield '[DONE]' - - -def load_models(): - print("本次加载的大语言模型为: Baichuan-13B-Chat") - tokenizer = AutoTokenizer.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", use_fast=False, trust_remote_code=True) - # model = AutoModelForCausalLM.from_pretrained("Baichuan2-13B-Chat", torch_dtype=torch.float32, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained("baichuan-inc/Baichuan2-7B-Chat", torch_dtype=torch.float16, trust_remote_code=True) - model = model.cuda() - model.generation_config = GenerationConfig.from_pretrained("baichuan-inc/Baichuan2-7B-Chat") - return tokenizer, model - -if __name__ == "__main__": - tokenizer, model = load_models() - uvicorn.run(app, host='0.0.0.0', port=6006, workers=1) - - while True: - try: - # 在这里执行您的程序逻辑 - - # 检查显存使用情况,如果超过阈值(例如90%),则触发垃圾回收 - if torch.cuda.is_available(): - gpu_memory_usage = torch.cuda.memory_allocated() / torch.cuda.max_memory_allocated() - if gpu_memory_usage > 0.9: - gc.collect() - torch.cuda.empty_cache() - except RuntimeError as e: - if "out of memory" in str(e): - print("显存不足,正在重启程序...") - gc.collect() - torch.cuda.empty_cache() - time.sleep(5) # 等待一段时间以确保显存已释放 - tokenizer, model = load_models() - else: - raise e - - diff --git a/plugins/model/llm-Baichuan2/requirements.txt b/plugins/model/llm-Baichuan2/requirements.txt deleted file mode 100644 index 8f4b4470af..0000000000 --- a/plugins/model/llm-Baichuan2/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -protobuf -transformers==4.53.0 -cpm_kernels -torch>=2.0 -gradio -mdtex2html -sentencepiece -accelerate -sse-starlette -fastapi==0.99.1 -pydantic==1.10.7 -uvicorn==0.21.1 -xformers -bitsandbytes diff --git a/plugins/model/llm-ChatGLM2/openai_api.py b/plugins/model/llm-ChatGLM2/openai_api.py deleted file mode 100644 index 8ce1304867..0000000000 --- a/plugins/model/llm-ChatGLM2/openai_api.py +++ /dev/null @@ -1,260 +0,0 @@ -# coding=utf-8 -import argparse -import time -from contextlib import asynccontextmanager -from typing import List, Literal, Optional, Union - -import numpy as np -import tiktoken -import torch -import uvicorn -from fastapi import Depends, FastAPI, HTTPException, Request -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, Field -from sentence_transformers import SentenceTransformer -from sklearn.preprocessing import PolynomialFeatures -from sse_starlette.sse import EventSourceResponse -from starlette.status import HTTP_401_UNAUTHORIZED -from transformers import AutoModel, AutoTokenizer - - -@asynccontextmanager -async def lifespan(app: FastAPI): # collects GPU memory - yield - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.ipc_collect() - - -app = FastAPI(lifespan=lifespan) - -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -class ChatMessage(BaseModel): - role: Literal["user", "assistant", "system"] - content: str - - -class DeltaMessage(BaseModel): - role: Optional[Literal["user", "assistant", "system"]] = None - content: Optional[str] = None - - -class ChatCompletionRequest(BaseModel): - model: str - messages: List[ChatMessage] - temperature: Optional[float] = None - top_p: Optional[float] = None - max_length: Optional[int] = None - stream: Optional[bool] = False - - -class ChatCompletionResponseChoice(BaseModel): - index: int - message: ChatMessage - finish_reason: Literal["stop", "length"] - - -class ChatCompletionResponseStreamChoice(BaseModel): - index: int - delta: DeltaMessage - finish_reason: Optional[Literal["stop", "length"]] - - -class ChatCompletionResponse(BaseModel): - model: str - object: Literal["chat.completion", "chat.completion.chunk"] - choices: List[ - Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice] - ] - created: Optional[int] = Field(default_factory=lambda: int(time.time())) - - -async def verify_token(request: Request): - auth_header = request.headers.get('Authorization') - if auth_header: - token_type, _, token = auth_header.partition(' ') - if ( - token_type.lower() == "bearer" - and token == "sk-aaabbbcccdddeeefffggghhhiiijjjkkk" - ): # 这里配置你的token - return True - raise HTTPException( - status_code=HTTP_401_UNAUTHORIZED, - detail="Invalid authorization credentials", - ) - - -class EmbeddingRequest(BaseModel): - input: List[str] - model: str - - -class EmbeddingResponse(BaseModel): - data: list - model: str - object: str - usage: dict - - -def num_tokens_from_string(string: str) -> int: - """Returns the number of tokens in a text string.""" - encoding = tiktoken.get_encoding('cl100k_base') - num_tokens = len(encoding.encode(string)) - return num_tokens - - -def expand_features(embedding, target_length): - poly = PolynomialFeatures(degree=2) - expanded_embedding = poly.fit_transform(embedding.reshape(1, -1)) - expanded_embedding = expanded_embedding.flatten() - if len(expanded_embedding) > target_length: - # 如果扩展后的特征超过目标长度,可以通过截断或其他方法来减少维度 - expanded_embedding = expanded_embedding[:target_length] - elif len(expanded_embedding) < target_length: - # 如果扩展后的特征少于目标长度,可以通过填充或其他方法来增加维度 - expanded_embedding = np.pad( - expanded_embedding, (0, target_length - len(expanded_embedding)) - ) - return expanded_embedding - - -@app.post("/v1/chat/completions", response_model=ChatCompletionResponse) -async def create_chat_completion( - request: ChatCompletionRequest, token: bool = Depends(verify_token) -): - global model, tokenizer - - if request.messages[-1].role != "user": - raise HTTPException(status_code=400, detail="Invalid request") - query = request.messages[-1].content - - prev_messages = request.messages[:-1] - if len(prev_messages) > 0 and prev_messages[0].role == "system": - query = prev_messages.pop(0).content + query - - history = [] - if len(prev_messages) % 2 == 0: - for i in range(0, len(prev_messages), 2): - if ( - prev_messages[i].role == "user" - and prev_messages[i + 1].role == "assistant" - ): - history.append([prev_messages[i].content, prev_messages[i + 1].content]) - - if request.stream: - generate = predict(query, history, request.model) - return EventSourceResponse(generate, media_type="text/event-stream") - - response, _ = model.chat(tokenizer, query, history=history) - choice_data = ChatCompletionResponseChoice( - index=0, - message=ChatMessage(role="assistant", content=response), - finish_reason="stop", - ) - - return ChatCompletionResponse( - model=request.model, choices=[choice_data], object="chat.completion" - ) - - -async def predict(query: str, history: List[List[str]], model_id: str): - global model, tokenizer - - choice_data = ChatCompletionResponseStreamChoice( - index=0, delta=DeltaMessage(role="assistant"), finish_reason=None - ) - chunk = ChatCompletionResponse( - model=model_id, choices=[choice_data], object="chat.completion.chunk" - ) - yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False)) - - current_length = 0 - - for new_response, _ in model.stream_chat(tokenizer, query, history): - if len(new_response) == current_length: - continue - - new_text = new_response[current_length:] - current_length = len(new_response) - - choice_data = ChatCompletionResponseStreamChoice( - index=0, delta=DeltaMessage(content=new_text), finish_reason=None - ) - chunk = ChatCompletionResponse( - model=model_id, choices=[choice_data], object="chat.completion.chunk" - ) - yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False)) - - choice_data = ChatCompletionResponseStreamChoice( - index=0, delta=DeltaMessage(), finish_reason="stop" - ) - chunk = ChatCompletionResponse( - model=model_id, choices=[choice_data], object="chat.completion.chunk" - ) - yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False)) - yield '[DONE]' - - -@app.post("/v1/embeddings", response_model=EmbeddingResponse) -async def get_embeddings( - request: EmbeddingRequest, token: bool = Depends(verify_token) -): - # 计算嵌入向量和tokens数量 - embeddings = [embeddings_model.encode(text) for text in request.input] - - # 如果嵌入向量的维度不为1536,则使用插值法扩展至1536维度 - embeddings = [ - expand_features(embedding, 1536) if len(embedding) < 1536 else embedding - for embedding in embeddings - ] - - # Min-Max normalization 归一化 - embeddings = [embedding / np.linalg.norm(embedding) for embedding in embeddings] - - # 将numpy数组转换为列表 - embeddings = [embedding.tolist() for embedding in embeddings] - prompt_tokens = sum(len(text.split()) for text in request.input) - total_tokens = sum(num_tokens_from_string(text) for text in request.input) - - response = { - "data": [ - {"embedding": embedding, "index": index, "object": "embedding"} - for index, embedding in enumerate(embeddings) - ], - "model": request.model, - "object": "list", - "usage": { - "prompt_tokens": prompt_tokens, - "total_tokens": total_tokens, - }, - } - - return response - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", default="16", type=str, help="Model name") - args = parser.parse_args() - - model_dict = { - "4": "THUDM/chatglm2-6b-int4", - "8": "THUDM/chatglm2-6b-int8", - "16": "THUDM/chatglm2-6b", - } - - model_name = model_dict.get(args.model_name, "THUDM/chatglm2-6b") - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda() - embeddings_model = SentenceTransformer('moka-ai/m3e-large', device='cpu') - - uvicorn.run(app, host='0.0.0.0', port=6006, workers=1) diff --git a/plugins/model/llm-ChatGLM2/requirements.txt b/plugins/model/llm-ChatGLM2/requirements.txt deleted file mode 100644 index d55e99dcb0..0000000000 --- a/plugins/model/llm-ChatGLM2/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -fastapi==0.101.1 -numpy==1.24.3 -pydantic==1.10.7 -scikit_learn==1.2.2 -sentence_transformers==2.2.2 -sse_starlette==1.6.5 -starlette==0.49.1 -tiktoken==0.4.0 -torch==2.7.1 -transformers==4.53.0 -uvicorn==0.23.2