mirror of
https://github.com/labring/FastGPT.git
synced 2026-05-05 01:02:59 +08:00
V4.14.2 featured (#5922)
* fix: chat agent template create (#5912) * doc * template market ui (#5917) * Compress tool (#5919) * Compress tool (#5914) * rename file * feat: agent call request * perf: Agent call (#5916) * fix: interactive in tool call * doc * fix: merge node response * fix: test * fix:修改 message 对话中的压缩提示词 (#5918) Co-authored-by: xxyyh <2289112474@qq> * perf: compress code * perf: agent call comment --------- Co-authored-by: YeYuheng <57035043+YYH211@users.noreply.github.com> Co-authored-by: xxyyh <2289112474@qq> * remove pr * feat: auto password * perf: app template cache * fix template market ui (#5921) --------- Co-authored-by: heheer <heheer@sealos.io> Co-authored-by: YeYuheng <57035043+YYH211@users.noreply.github.com> Co-authored-by: xxyyh <2289112474@qq>
This commit is contained in:
@@ -0,0 +1,313 @@
|
||||
import type {
|
||||
ChatCompletionMessageParam,
|
||||
ChatCompletionTool,
|
||||
ChatCompletionMessageToolCall,
|
||||
CompletionFinishReason
|
||||
} from '@fastgpt/global/core/ai/type';
|
||||
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
|
||||
import type {
|
||||
ToolCallChildrenInteractive,
|
||||
WorkflowInteractiveResponseType
|
||||
} from '@fastgpt/global/core/workflow/template/system/interactive/type';
|
||||
import type { CreateLLMResponseProps, ResponseEvents } from '../request';
|
||||
import { createLLMResponse } from '../request';
|
||||
import type { ChatNodeUsageType } from '@fastgpt/global/support/wallet/bill/type';
|
||||
import { compressRequestMessages } from '../compress';
|
||||
import { computedMaxToken } from '../../utils';
|
||||
import { filterGPTMessageByMaxContext } from '../utils';
|
||||
import { getLLMModel } from '../../model';
|
||||
import { filterEmptyAssistantMessages } from './utils';
|
||||
|
||||
// Inputs for runAgentCall: the LLM request body plus the tool-execution and
// interaction handlers supplied by the workflow runtime.
type RunAgentCallProps = {
  // Upper bound on LLM round-trips in the agent loop.
  maxRunAgentTimes: number;
  // NOTE(review): not destructured by runAgentCall below — confirm it is still used.
  compressTaskDescription?: string;

  // LLM request body; `tools` is required on top of the base body.
  body: CreateLLMResponseProps['body'] & {
    tools: ChatCompletionTool[];

    temperature?: number;
    top_p?: number;
    stream?: boolean;
  };

  userKey?: CreateLLMResponseProps['userKey'];
  isAborted?: CreateLLMResponseProps['isAborted'];

  // Present when resuming an interaction that a tool raised in a previous round.
  childrenInteractiveParams?: ToolCallChildrenInteractive['params'];
  // Resumes the interrupted tool and returns its response, any nested
  // assistant messages/usages, an optional follow-up interaction, and a stop flag.
  handleInteractiveTool: (e: ToolCallChildrenInteractive['params']) => Promise<{
    response: string;
    assistantMessages: ChatCompletionMessageParam[];
    usages: ChatNodeUsageType[];
    interactive?: WorkflowInteractiveResponseType;
    stop?: boolean;
  }>;

  // Executes one tool call against the pre-call message snapshot and returns
  // the same shape as handleInteractiveTool.
  handleToolResponse: (e: {
    call: ChatCompletionMessageToolCall;
    messages: ChatCompletionMessageParam[];
  }) => Promise<{
    response: string;
    assistantMessages: ChatCompletionMessageParam[];
    usages: ChatNodeUsageType[];
    interactive?: WorkflowInteractiveResponseType;
    stop?: boolean;
  }>;
} & ResponseEvents;

// Result of one runAgentCall invocation.
type RunAgentResponse = {
  completeMessages: ChatCompletionMessageParam[]; // Step request complete messages
  assistantMessages: ChatCompletionMessageParam[]; // Step assistant response messages
  // Set when a tool raised an interaction that must be answered before resuming.
  interactiveResponse?: ToolCallChildrenInteractive;

  // Usage
  inputTokens: number;
  outputTokens: number;
  subAppUsages: ChatNodeUsageType[];

  finish_reason: CompletionFinishReason | undefined;
};
|
||||
|
||||
/*
  An LLM request wrapper that runs tool calls in a loop until the model stops
  requesting tools, a tool raises an interactive response, or
  maxRunAgentTimes is reached.

  assistantMessages is composed of:
  1. messages generated when calling the AI
  2. messages produced inside tool executions
  3. tool response values (role=tool, content=tool response)

  requestMessages is what gets sent to the model, composed of:
  1. the conversation history
  2. messages generated when calling the AI
  3. tool response values (role=tool, content=tool response)

  memoryRequestMessages is the requestMessages content at the moment the
  previous round was interrupted.
*/
export const runAgentCall = async ({
  maxRunAgentTimes,
  body: { model, messages, max_tokens, tools, ...body },
  userKey,
  isAborted,

  childrenInteractiveParams,
  handleInteractiveTool,
  handleToolResponse,

  onReasoning,
  onStreaming,
  onToolCall,
  onToolParam
}: RunAgentCallProps): Promise<RunAgentResponse> => {
  const modelData = getLLMModel(model);

  let runTimes = 0;
  let interactiveResponse: ToolCallChildrenInteractive | undefined;

  // Init messages: reserve answer space so history trimming below leaves room
  // for the model's response.
  const maxTokens = computedMaxToken({
    model: modelData,
    maxToken: max_tokens || 8000,
    min: 100
  });

  // assistantMessages produced this round, including those produced inside tools
  const assistantMessages: ChatCompletionMessageParam[] = [];
  // Request messages shared across loop iterations
  let requestMessages = (
    await filterGPTMessageByMaxContext({
      messages,
      maxContext: modelData.maxContext - (maxTokens || 0) // filter token. not response maxToken
    })
  ).map((item) => {
    // Normalize historical tool_calls to only the fields the API accepts
    // (id/type/function), dropping anything extra that was stored.
    if (item.role === 'assistant' && item.tool_calls) {
      return {
        ...item,
        tool_calls: item.tool_calls.map((tool) => ({
          id: tool.id,
          type: tool.type,
          function: tool.function
        }))
      };
    }
    return item;
  });

  let inputTokens: number = 0;
  let outputTokens: number = 0;
  let finish_reason: CompletionFinishReason | undefined;
  const subAppUsages: ChatNodeUsageType[] = [];

  // Resume an interaction that was raised inside a tool in a previous round
  if (childrenInteractiveParams) {
    const {
      response,
      assistantMessages: toolAssistantMessages,
      usages,
      interactive,
      stop
    } = await handleInteractiveTool(childrenInteractiveParams);

    // Restore requestMessages to the interrupted round's content and attach
    // the tool response to the matching role=tool placeholder.
    requestMessages = childrenInteractiveParams.toolParams.memoryRequestMessages.map((item) =>
      item.role === 'tool' && item.tool_call_id === childrenInteractiveParams.toolParams.toolCallId
        ? {
            ...item,
            content: response
          }
        : item
    );

    // Only the assistantMessages produced in this round need to be pushed
    assistantMessages.push(...filterEmptyAssistantMessages(toolAssistantMessages));
    subAppUsages.push(...usages);

    // The same tool may trigger interaction multiple times; the calling toolId
    // is treated as identical across those interruptions.
    if (interactive) {
      // console.dir(interactive, { depth: null });
      interactiveResponse = {
        type: 'toolChildrenInteractive',
        params: {
          childrenResponse: interactive,
          toolParams: {
            memoryRequestMessages: requestMessages,
            toolCallId: childrenInteractiveParams.toolParams.toolCallId
          }
        }
      };
    }

    // Interrupted again, or the tool requested a stop: return without calling the LLM
    if (interactiveResponse || stop) {
      return {
        inputTokens: 0,
        outputTokens: 0,
        subAppUsages,
        completeMessages: requestMessages,
        assistantMessages,
        interactiveResponse,
        finish_reason: 'stop'
      };
    }

    // The tool completed normally — continue with the tool-call loop
  }

  // Self-driven loop
  while (runTimes < maxRunAgentTimes) {
    // TODO: cost check

    runTimes++;

    // 1. Compress request messages
    const result = await compressRequestMessages({
      messages: requestMessages,
      model: modelData
    });
    requestMessages = result.messages;
    inputTokens += result.usage?.inputTokens || 0;
    outputTokens += result.usage?.outputTokens || 0;

    // 2. Request LLM
    let {
      reasoningText: reasoningContent,
      answerText: answer,
      toolCalls = [],
      usage,
      getEmptyResponseTip,
      assistantMessage: llmAssistantMessage,
      finish_reason: finishReason
    } = await createLLMResponse({
      body: {
        ...body,
        model,
        messages: requestMessages,
        tool_choice: 'auto',
        toolCallMode: modelData.toolChoice ? 'toolChoice' : 'prompt',
        tools,
        parallel_tool_calls: true
      },
      userKey,
      isAborted,
      onReasoning,
      onStreaming,
      onToolCall,
      onToolParam
    });

    finish_reason = finishReason;

    if (!answer && !reasoningContent && !toolCalls.length) {
      return Promise.reject(getEmptyResponseTip());
    }

    // 3. Update messages
    // Snapshot taken before pushing this round's output: tools see the pre-call view
    const cloneRequestMessages = requestMessages.slice();
    // Push the assistantMessages generated by the AI
    assistantMessages.push(...llmAssistantMessage);
    requestMessages.push(...llmAssistantMessage);

    // 4. Call tools
    let toolCallStep = false;
    for await (const tool of toolCalls) {
      const {
        response,
        assistantMessages: toolAssistantMessages,
        usages,
        interactive,
        stop
      } = await handleToolResponse({
        call: tool,
        messages: cloneRequestMessages
      });

      const toolMessage: ChatCompletionMessageParam = {
        tool_call_id: tool.id,
        role: ChatCompletionRequestMessageRoleEnum.Tool,
        content: response
      };

      // 5. Add tool response to messages
      assistantMessages.push(toolMessage);
      assistantMessages.push(...filterEmptyAssistantMessages(toolAssistantMessages)); // toolAssistantMessages must also be recorded as AI output, so push them here.
      requestMessages.push(toolMessage); // The request only needs the tool response, not the assistant content produced inside the tool, so toolAssistantMessages is not pushed.

      subAppUsages.push(...usages);

      if (interactive) {
        // memoryRequestMessages is filled in after the loop, once requestMessages is final
        interactiveResponse = {
          type: 'toolChildrenInteractive',
          params: {
            childrenResponse: interactive,
            toolParams: {
              memoryRequestMessages: [],
              toolCallId: tool.id
            }
          }
        };
      }
      if (stop) {
        toolCallStep = true;
      }
    }

    // 6 Record usage
    inputTokens += usage.inputTokens;
    outputTokens += usage.outputTokens;

    // Exit when no tools were requested, an interaction was raised, or a tool asked to stop
    if (toolCalls.length === 0 || !!interactiveResponse || toolCallStep) {
      break;
    }
  }

  // Persist the final request messages so an interrupted round can be resumed later
  if (interactiveResponse) {
    interactiveResponse.params.toolParams.memoryRequestMessages = requestMessages;
  }

  return {
    inputTokens,
    outputTokens,
    subAppUsages,
    completeMessages: requestMessages,
    assistantMessages,
    interactiveResponse,
    finish_reason
  };
};
|
||||
@@ -0,0 +1,11 @@
|
||||
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';
|
||||
|
||||
export const filterEmptyAssistantMessages = (messages: ChatCompletionMessageParam[]) => {
|
||||
return messages.filter((item) => {
|
||||
if (item.role === 'assistant') {
|
||||
if (!item.content) return false;
|
||||
if (item.content.length === 0) return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
};
|
||||
@@ -0,0 +1,103 @@
|
||||
/**
|
||||
* Agent 上下文压缩配置常量
|
||||
*
|
||||
* ## 设计原则
|
||||
*
|
||||
* 1. **空间分配**
|
||||
* - 输出预留:30%(模型生成答案 + 缓冲)
|
||||
* - 系统提示词(Depends on):15%
|
||||
* - Agent 对话历史:55%
|
||||
*
|
||||
* 2. **压缩策略**
|
||||
* - 触发阈值:接近空间上限时触发
|
||||
* - 压缩目标:激进压缩,预留增长空间
|
||||
* - 约束机制:单个 tool 有绝对大小限制
|
||||
*
|
||||
* 3. **协调关系**
|
||||
* - Depends on 使用完整 response,需要较大空间(15%)
|
||||
* - Agent 历史包含所有 tool responses,是动态主体(55%)
|
||||
* - 单个 tool 不能过大,避免挤占其他空间(10%)
|
||||
*/
|
||||
|
||||
export const COMPRESSION_CONFIG = {
|
||||
/**
|
||||
* === Depends on(系统提示词中的步骤历史)===
|
||||
*
|
||||
* 触发场景:拼接依赖步骤的完整 response 后,token 数超过阈值
|
||||
* 内容特点:包含多个步骤的完整执行结果(使用 response 而非 summary)
|
||||
*
|
||||
* 示例(maxContext=100k):
|
||||
* - 依赖 3 个步骤,每个 4k → 12k (12%) ✅ 不触发
|
||||
* - 依赖 5 个步骤,每个 4k → 20k (20%) ⚠️ 触发压缩 → 12k
|
||||
*/
|
||||
DEPENDS_ON_THRESHOLD: 0.15, // 15% 触发压缩
|
||||
DEPENDS_ON_TARGET: 0.12, // 压缩到 12%(预留 3% 缓冲)
|
||||
|
||||
/**
|
||||
* === 对话历史 ===
|
||||
*
|
||||
* 触发场景:对话历史(含所有 user/assistant/tool 消息)超过阈值
|
||||
* 内容特点:动态累积,包含所有 tool responses
|
||||
*
|
||||
* 示例(maxContext=100k):
|
||||
* - 初始 20k + 6 轮对话(34k) = 54k (54%) ✅ 不触发
|
||||
* - 再 1 轮 = 60k (60%) ⚠️ 触发压缩 → 30k
|
||||
* - 预留:55k - 30k = 25k(还能跑 4 轮)
|
||||
*/
|
||||
MESSAGE_THRESHOLD: 0.8, // 55% 触发压缩
|
||||
MESSAGE_TARGET_RATIO: 0.5, // 压缩到 50%(即原 55% → 27.5%)
|
||||
|
||||
/**
|
||||
* === 单个 tool response ===
|
||||
*
|
||||
* 触发场景:单个 tool 返回的内容超过绝对大小限制
|
||||
* 内容特点:单次 tool 调用的响应(如搜索结果、文件内容等)
|
||||
*
|
||||
* 示例(maxContext=100k):
|
||||
* - tool response = 8k (8%) ✅ 不触发
|
||||
* - tool response = 15k (15%) ⚠️ 触发压缩 → 7k
|
||||
*/
|
||||
SINGLE_TOOL_MAX: 0.5,
|
||||
SINGLE_TOOL_TARGET: 0.25,
|
||||
|
||||
/**
|
||||
* === 分块压缩 ===
|
||||
*
|
||||
* 触发场景:当内容需要分块处理时(超过 LLM 单次处理能力)
|
||||
* 用途:将超大内容切分成多个块,分别压缩后合并
|
||||
*
|
||||
* 示例(maxContext=100k):
|
||||
* - 单块最大:40k tokens
|
||||
* - 50k 内容 → 切分成 2 块,每块约 25k
|
||||
*/
|
||||
CHUNK_SIZE_RATIO: 0.5 // 40%(单块不超过此比例)
|
||||
} as const;
|
||||
|
||||
/**
|
||||
* 计算各场景的压缩阈值
|
||||
* @param maxContext - 模型的最大上下文长度
|
||||
* @returns 各场景的具体 token 数阈值
|
||||
*/
|
||||
export const calculateCompressionThresholds = (maxContext: number) => {
|
||||
return {
|
||||
// Depends on 压缩阈值
|
||||
dependsOn: {
|
||||
threshold: Math.floor(maxContext * COMPRESSION_CONFIG.DEPENDS_ON_THRESHOLD),
|
||||
target: Math.floor(maxContext * COMPRESSION_CONFIG.DEPENDS_ON_TARGET)
|
||||
},
|
||||
// 对话历史压缩阈值
|
||||
messages: {
|
||||
threshold: Math.floor(maxContext * COMPRESSION_CONFIG.MESSAGE_THRESHOLD),
|
||||
targetRatio: COMPRESSION_CONFIG.MESSAGE_TARGET_RATIO
|
||||
},
|
||||
|
||||
// 单个 tool response 压缩阈值
|
||||
singleTool: {
|
||||
threshold: Math.floor(maxContext * COMPRESSION_CONFIG.SINGLE_TOOL_MAX),
|
||||
target: Math.floor(maxContext * COMPRESSION_CONFIG.SINGLE_TOOL_TARGET)
|
||||
},
|
||||
|
||||
// 分块大小
|
||||
chunkSize: Math.floor(maxContext * COMPRESSION_CONFIG.CHUNK_SIZE_RATIO)
|
||||
};
|
||||
};
|
||||
@@ -0,0 +1,140 @@
|
||||
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
|
||||
import { countGptMessagesTokens } from '../../../../common/string/tiktoken';
|
||||
import { addLog } from '../../../../common/system/log';
|
||||
import { calculateCompressionThresholds } from './constants';
|
||||
import { createLLMResponse } from '../request';
|
||||
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
|
||||
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';
|
||||
import { getCompressRequestMessagesPrompt } from './prompt';
|
||||
import type { ChatNodeUsageType } from '@fastgpt/global/support/wallet/bill/type';
|
||||
import { formatModelChars2Points } from '../../../../support/wallet/usage/utils';
|
||||
import { i18nT } from '../../../../../web/i18n/utils';
|
||||
import { parseToolArgs } from '../../utils';
|
||||
|
||||
/**
|
||||
* 压缩 对话历史
|
||||
* 当 messages 的 token 长度超过阈值时,调用 LLM 进行压缩
|
||||
*/
|
||||
export const compressRequestMessages = async ({
|
||||
messages,
|
||||
model
|
||||
}: {
|
||||
messages: ChatCompletionMessageParam[];
|
||||
model: LLMModelItemType;
|
||||
}): Promise<{
|
||||
messages: ChatCompletionMessageParam[];
|
||||
usage?: ChatNodeUsageType;
|
||||
}> => {
|
||||
if (!messages || messages.length === 0) {
|
||||
return {
|
||||
messages
|
||||
};
|
||||
}
|
||||
|
||||
// Save the system messages
|
||||
const [systemMessages, otherMessages]: [
|
||||
ChatCompletionMessageParam[],
|
||||
ChatCompletionMessageParam[]
|
||||
] = [[], []];
|
||||
messages.forEach((message) => {
|
||||
if (message.role === ChatCompletionRequestMessageRoleEnum.System) {
|
||||
systemMessages.push(message);
|
||||
} else {
|
||||
otherMessages.push(message);
|
||||
}
|
||||
});
|
||||
|
||||
const messageTokens = await countGptMessagesTokens(otherMessages);
|
||||
const thresholds = calculateCompressionThresholds(model.maxContext).messages;
|
||||
|
||||
if (messageTokens < thresholds.threshold) {
|
||||
return {
|
||||
messages
|
||||
};
|
||||
}
|
||||
|
||||
addLog.info('[Compression messages] Start', {
|
||||
tokens: messageTokens
|
||||
});
|
||||
|
||||
const compressPrompt = await getCompressRequestMessagesPrompt({
|
||||
messages: otherMessages,
|
||||
rawTokens: messageTokens,
|
||||
model
|
||||
});
|
||||
|
||||
const userPrompt = '请执行压缩操作,严格按照JSON格式返回结果。';
|
||||
|
||||
try {
|
||||
const { answerText, usage } = await createLLMResponse({
|
||||
body: {
|
||||
model,
|
||||
messages: [
|
||||
{
|
||||
role: ChatCompletionRequestMessageRoleEnum.System,
|
||||
content: compressPrompt
|
||||
},
|
||||
{
|
||||
role: ChatCompletionRequestMessageRoleEnum.User,
|
||||
content: userPrompt
|
||||
}
|
||||
],
|
||||
temperature: 0.1,
|
||||
stream: true
|
||||
}
|
||||
});
|
||||
|
||||
if (!answerText) {
|
||||
addLog.warn('[Compression messages] failed: empty response, return original messages');
|
||||
return { messages };
|
||||
}
|
||||
|
||||
const { totalPoints, modelName } = formatModelChars2Points({
|
||||
model: model.model,
|
||||
inputTokens: usage.inputTokens,
|
||||
outputTokens: usage.outputTokens
|
||||
});
|
||||
const compressedUsage = {
|
||||
moduleName: i18nT('account_usage:compress_llm_messages'),
|
||||
model: modelName,
|
||||
totalPoints,
|
||||
inputTokens: usage.inputTokens,
|
||||
outputTokens: usage.outputTokens
|
||||
};
|
||||
|
||||
const compressResult = parseToolArgs<{
|
||||
compressed_messages: ChatCompletionMessageParam[];
|
||||
compression_summary: string;
|
||||
}>(answerText);
|
||||
|
||||
if (
|
||||
!compressResult ||
|
||||
!Array.isArray(compressResult) ||
|
||||
compressResult.compressed_messages.length === 0
|
||||
) {
|
||||
addLog.warn('[Compression messages] failed: cannot parse JSON, return original messages', {
|
||||
messages: compressResult?.compressed_messages
|
||||
});
|
||||
return { messages, usage: compressedUsage };
|
||||
}
|
||||
|
||||
const compressedTokens = usage.outputTokens;
|
||||
addLog.info('[Compression messages] successfully', {
|
||||
originalTokens: messageTokens,
|
||||
compressedTokens,
|
||||
actualRatio: (compressedTokens / messageTokens).toFixed(2),
|
||||
summary: compressResult.compression_summary
|
||||
});
|
||||
|
||||
// 如果之前提取了 system 消息,现在插回去
|
||||
const finalMessages = [...systemMessages, ...compressResult.compressed_messages];
|
||||
|
||||
return {
|
||||
messages: finalMessages,
|
||||
usage: compressedUsage
|
||||
};
|
||||
} catch (error) {
|
||||
addLog.error('[Compression messages] failed', error);
|
||||
return { messages };
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,296 @@
|
||||
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
|
||||
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';
|
||||
import { calculateCompressionThresholds } from './constants';
|
||||
|
||||
/**
 * Build the system prompt that instructs an LLM to compress an agent
 * conversation history to a target token budget while preserving the
 * conversation's logic and the tool_call / tool_call_id pairing.
 *
 * @param messages - The (non-system) history to compress; embedded verbatim as JSON.
 * @param rawTokens - Token count of `messages`, interpolated into the prompt.
 * @param model - Used only for `maxContext` when deriving the target size.
 * @returns The full Chinese-language compression prompt string.
 *
 * NOTE(review): declared `async` but contains no `await` — kept for interface
 * stability; confirm callers rely on the Promise return.
 * NOTE(review): inside the prompt, the "允许的操作" list permits 改写/总结
 * (rewriting/summarising) while the 忠实性铁律 above it forbids 改写 — these
 * two rules contradict each other; confirm which is intended.
 */
export const getCompressRequestMessagesPrompt = async ({
  rawTokens,
  messages,
  model
}: {
  messages: ChatCompletionMessageParam[];
  rawTokens: number;
  model: LLMModelItemType;
}) => {
  const thresholds = calculateCompressionThresholds(model.maxContext);
  // Target size = raw size × configured ratio (e.g. 0.5 halves the history).
  const targetTokens = Math.round(rawTokens * thresholds.messages.targetRatio);

  return `你是 Agent 对话历史压缩专家。你的任务是将对话历史压缩到目标 token 数,同时确保对话逻辑连贯性和工具调用的 ID 映射关系完全正确。

## 核心原则(最高优先级)

### ⚠️ 忠实性铁律
**你只能对原始内容进行删除和截断,绝不能添加、推测、改写或创造任何不存在的内容。**

**绝对禁止的行为**:
- ❌ 添加原文中不存在的信息、数据或结论
- ❌ 推测或补充用户可能的意图
- ❌ 修改数字、日期、人名、地名等任何事实性信息
- ❌ 更改 tool_call 的参数值(即使看起来更合理)

**允许的操作**:
- ✅ 删除整条消息(但要保持 tool_call 原子性)
- ✅ 截断消息的 content(删除部分句子或段落)
- ✅ 删除冗余的重复表达(保留其中一次)
- ✅ 删除寒暄、过程描述等低价值内容
- ✅ 改写或重新表述原文,总结或概括原文内容

**验证方法**:压缩后的每个词、每个数字、每个 ID 都必须能在原始消息中找到。

---

## 压缩目标
- **原始 token 数**: ${rawTokens} tokens
- **目标 token 数**: ${targetTokens} tokens (压缩比例: ${Math.round(thresholds.messages.targetRatio * 100)}%)
- **约束**: 输出的 JSON 内容必须接近 ${targetTokens} tokens

---

## 三阶段压缩工作流

### 【第一阶段:扫描与标注】(内部思考,不输出)

在开始压缩前,请先在内心完成以下分析:

1. **构建 ID 映射表**
   - 扫描所有 assistant 消息中的 tool_calls,提取每个 tool_call 的 id
   - 找到对应的 tool 消息的 tool_call_id
   - 建立一一对应的映射关系表,例如:
     \`\`\`
     call_abc123 → tool 消息 #5
     call_def456 → tool 消息 #7
     \`\`\`

2. **评估消息价值**
   基于以下四个维度,为每条消息标注价值等级:

   **维度 1:信息密度**
   - **[高密度]**: 包含数据、结论、决策、关键引用、成功的执行结果
   - **[中密度]**: 提供背景信息、过程性描述
   - **[低密度]**: 寒暄、重复、冗余表达、调试日志
   - **[负价值]**: 空内容、纯错误信息、失败的尝试、无意义的响应

   **维度 2:对话连贯性**
   - **[关键节点]**: 话题转折点、问题解决的关键步骤、问题定位的关键错误
   - **[上下文依赖]**: 被后续消息引用或依赖的内容
   - **[独立片段]**: 与上下文关联较弱的内容
   - **[断裂节点]**: 失败后被重试的操作、未被引用的错误、中间步骤的失败

   **维度 3:时间权重**
   - **[近期]**: 越接近对话尾部,权重越高(保留完整度优先)
   - **[早期]**: 早期消息可适度精简,但需保留关键定义/约束

   **维度 4:工具调用有效性**
   - **[成功响应]**: tool 消息返回了有效数据或成功执行的确认
   - **[有价值错误]**: 错误信息帮助定位问题或被后续消息引用分析
   - **[无价值错误]**: 纯粹的失败尝试,后续有成功重试,未被引用
   - **[空响应]**: content 为空、null 或仅包含"无结果"、"未找到"等无效信息

   **错误和空响应识别标准**:
   判断 tool 消息是否为错误或空响应:
   - content 包含"失败"、"错误"、"Error"、"Exception"、"超时"、"Timeout"
   - content 为空字符串、null、"无结果"、"未找到"、"No results"
   - 检查后续是否有 assistant 引用该错误来调整策略
   - 如果是孤立的错误且后续有成功重试 → 标记为负价值,优先删除
   - **关键**:删除错误消息时,必须同时删除对应的 tool_call(保持原子性)

3. **确定压缩策略**
   综合三个维度,制定压缩策略:
   - **tool_call 相关消息**:作为原子单元,必须成对保留(见第二阶段的原子性约束)
   - **高价值消息**(高密度 或 关键节点 或 近期消息):保留 70-90% 内容
   - **中等价值消息**(中密度 + 有上下文依赖):保留 40-60% 内容
   - **低价值消息**(低密度 + 独立片段 + 早期):保留 10-20% 或删除

---

### 【第二阶段:执行压缩】

基于第一阶段的分析,执行压缩操作:

**压缩原则**:
1. **工具调用原子性(最高优先级)**:
   - ⚠️ **强制约束**:assistant 的 tool_calls 消息和对应的 tool 响应消息必须作为**不可分割的原子单元**
   - 如果要删除某个工具调用,必须**同时删除** assistant 消息中的 tool_call 和对应的 tool 消息
   - **绝不允许**出现以下情况:
     - ❌ 保留 tool_call 但删除 tool 响应
     - ❌ 保留 tool 响应但删除 tool_call
     - ❌ tool_call 的 id 与 tool 的 tool_call_id 不匹配
   - 验证方法:遍历所有 tool_call 的 id,确保每个 id 都有且仅有一个对应的 tool 消息

2. **ID 不可变**: 所有 tool_call 的 id 和 tool_call_id 必须原样保留,绝不修改

3. **结构完整**: 每个 tool_call 对象必须包含 \`id\`, \`type\`, \`function\` 字段

4. **顺序保持**: 严格保持对话的时间顺序,assistant 的 tool_calls 和对应的 tool 响应按原始顺序出现

5. **逻辑连贯**: 确保压缩后的对话仍然能体现完整的逻辑流程(问题→分析→工具调用→结论)

6. **大幅精简 content**:
   - tool 消息的 content:删除冗长描述、重复信息,只保留核心结论和关键数据
   - user/assistant 消息:精简表达,但保留关键信息和逻辑转折
   - 可合并相似的工具结果(但必须保留各自的 tool_call_id)

**压缩技巧**:
- **删除类内容**:详细过程描述、重复信息、失败尝试、调试日志、冗余寒暄
- **保留类内容**:具体数据、关键结论、错误信息、链接引用、决策依据
- **精简技巧**:
  - 用"核心发现:A、B、C"代替长篇叙述
  - 用"报错:具体错误"代替详细堆栈
  - 用"已完成:操作X"代替冗长确认

---

### 【第三阶段:自校验】

输出前,必须检查:

1. **ID 一致性校验**
   - 每个 assistant 消息中的 tool_calls[i].id 是否有对应的 tool 消息?
   - 每个 tool 消息的 tool_call_id 是否能在前面的 assistant 消息中找到?
   - 是否所有 ID 都原样保留,没有修改或生成新 ID?

2. **逻辑连贯性校验**
   - 对话流程是否完整?(提问→分析→执行→结论)
   - 是否存在突兀的跳跃或缺失关键步骤?
   - 工具调用的上下文是否清晰?

3. **压缩比例校验**
   - 估算输出的 JSON 字符串长度,是否接近 ${targetTokens} tokens?
   - 如果超出目标,需进一步精简 content 字段(优先精简低价值消息)

4. **格式完整性校验**
   - 所有 tool_call 对象是否包含完整的 \`id\`, \`type\`, \`function\` 字段?
   - JSON 结构是否正确?

---

## 输出格式

请按照以下 JSON 格式输出(必须使用 \`\`\`json 代码块):

\`\`\`json
{
  "compressed_messages": [
    {"role": "user", "content": "用户请求(精简表达)"},
    {
      "role": "assistant",
      "content": "分析说明(精简但保留逻辑)",
      "tool_calls": [
        {
          "id": "call_原始ID",
          "type": "function",
          "function": {
            "name": "工具名",
            "arguments": "{\\"param\\":\\"精简后的值\\"}"
          }
        }
      ]
    },
    {
      "role": "tool",
      "tool_call_id": "call_原始ID",
      "content": "工具返回的核心结果(已大幅精简,只保留关键信息)"
    },
    {"role": "assistant", "content": "基于工具结果的结论(精简表达)"}
  ],
  "compression_summary": "原始${rawTokens}tokens → 约X tokens (压缩比例Y%)。操作:删除了Z条低价值消息,精简了N个工具响应,M条用户/助手消息。对话逻辑保持完整,ID映射关系已验证正确。"
}
\`\`\`

---

## 压缩示例

**示例 1:忠实性压缩(只删除,不改写)**

原始(约 500 tokens):
\`\`\`json
[
  {"role": "user", "content": "你好,我想了解一下 Python 性能优化的相关技术和最佳实践,能帮我搜索一些资料吗?"},
  {"role": "assistant", "content": "当然可以!我会帮您搜索 Python 性能优化相关的资料。让我先搜索相关文章和教程。"},
  {"role": "assistant", "tool_calls": [{"id": "call_abc", "type": "function", "function": {"name": "search", "arguments": "{\\"query\\":\\"Python性能优化完整指南\\",\\"max_results\\":10}"}}]},
  {"role": "tool", "tool_call_id": "call_abc", "content": "找到10篇文章:\\n1. 标题:Python性能优化完整指南\\n   作者:张三\\n   发布时间:2024-01-15\\n   摘要:本文详细介绍了Python性能优化的各种技巧,包括使用Cython进行编译优化,NumPy向量化计算,以及内存优化技术...(此处省略400字详细内容)\\n   URL: https://example.com/article1\\n\\n2. 标题:高性能Python编程实战\\n   作者:李四\\n   ..."},
  {"role": "assistant", "content": "根据搜索结果,我为您总结了Python性能优化的主要技术..."}
]
\`\`\`

压缩后(约 200 tokens,注意:所有内容都直接来自原文,只是删除了冗余部分):
\`\`\`json
[
  {"role": "user", "content": "我想了解 Python 性能优化的相关技术和最佳实践"},
  {"role": "assistant", "tool_calls": [{"id": "call_abc", "type": "function", "function": {"name": "search", "arguments": "{\\"query\\":\\"Python性能优化完整指南\\",\\"max_results\\":10}"}}]},
  {"role": "tool", "tool_call_id": "call_abc", "content": "找到10篇文章:\\n1. 标题:Python性能优化完整指南\\n   摘要:使用Cython进行编译优化,NumPy向量化计算,以及内存优化技术"},
  {"role": "assistant", "content": "根据搜索结果,我为您总结了Python性能优化的主要技术"}
]
\`\`\`

**关键**:压缩后的每个词都能在原文找到,只是删除了"你好"、"能帮我搜索"、"作者"、"发布时间"等冗余信息。

**示例 2:删除失败的工具调用**

原始(约 600 tokens):
\`\`\`json
[
  {"role": "user", "content": "搜索北京的五星级酒店"},
  {"role": "assistant", "tool_calls": [{"id": "call_fail1", "type": "function", "function": {"name": "search", "arguments": "{\\"query\\":\\"北京五星级酒店\\",\\"location\\":\\"Beijing\\"}"}}]},
  {"role": "tool", "tool_call_id": "call_fail1", "content": "Error: 网络超时,请重试"},
  {"role": "assistant", "content": "搜索遇到网络问题,让我重试"},
  {"role": "assistant", "tool_calls": [{"id": "call_fail2", "type": "function", "function": {"name": "search", "arguments": "{\\"query\\":\\"北京酒店\\"}"}}]},
  {"role": "tool", "tool_call_id": "call_fail2", "content": "未找到相关结果"},
  {"role": "assistant", "content": "没找到结果,我换个搜索方式"},
  {"role": "assistant", "tool_calls": [{"id": "call_ok", "type": "function", "function": {"name": "search", "arguments": "{\\"query\\":\\"北京五星酒店推荐\\"}"}}]},
  {"role": "tool", "tool_call_id": "call_ok", "content": "找到5家酒店:1. 北京王府半岛酒店 2. 北京四季酒店..."},
  {"role": "assistant", "content": "为您找到了5家五星级酒店推荐"}
]
\`\`\`

压缩后(约 120 tokens):
\`\`\`json
[
  {"role": "user", "content": "搜索北京的五星级酒店"},
  {"role": "assistant", "tool_calls": [{"id": "call_ok", "type": "function", "function": {"name": "search", "arguments": "{\\"query\\":\\"北京五星酒店推荐\\"}"}}]},
  {"role": "tool", "tool_call_id": "call_ok", "content": "找到5家酒店:1. 北京王府半岛酒店 2. 北京四季酒店..."},
  {"role": "assistant", "content": "为您找到5家五星级酒店"}
]
\`\`\`

**示例 3:多轮对话合并(通过删除中间过程)**

原始(约 400 tokens):
\`\`\`json
[
  {"role": "user", "content": "帮我创建一个新文件"},
  {"role": "assistant", "content": "好的,我需要知道文件名和内容。请问文件名是什么?"},
  {"role": "user", "content": "文件名叫 test.txt"},
  {"role": "assistant", "content": "明白了,文件名是 test.txt。那么您想在文件中写入什么内容呢?"},
  {"role": "user", "content": "写入 'Hello World'"},
  {"role": "assistant", "content": "收到!我现在帮您创建文件 test.txt,并写入内容 'Hello World'"},
  {"role": "assistant", "tool_calls": [{"id": "call_xyz", "type": "function", "function": {"name": "write_file", "arguments": "{\\"path\\":\\"test.txt\\",\\"content\\":\\"Hello World\\"}"}}]},
  {"role": "tool", "tool_call_id": "call_xyz", "content": "文件创建成功。文件路径:/workspace/test.txt。文件大小:11 bytes。创建时间:2024-01-15 10:30:00"},
  {"role": "assistant", "content": "太好了!文件 test.txt 已经成功创建,内容为 'Hello World'。"}
]
\`\`\`

压缩后(约 150 tokens,删除了询问过程,保留最终状态):
\`\`\`json
[
  {"role": "user", "content": "帮我创建一个新文件"},
  {"role": "user", "content": "文件名叫 test.txt"},
  {"role": "user", "content": "写入 'Hello World'"},
  {"role": "assistant", "tool_calls": [{"id": "call_xyz", "type": "function", "function": {"name": "write_file", "arguments": "{\\"path\\":\\"test.txt\\",\\"content\\":\\"Hello World\\"}"}}]},
  {"role": "tool", "tool_call_id": "call_xyz", "content": "文件创建成功。文件路径:/workspace/test.txt。文件大小:11 bytes"},
  {"role": "assistant", "content": "文件 test.txt 已经成功创建,内容为 'Hello World'"}
]
\`\`\`

**关键**:删除了 assistant 的询问消息,但保留了所有 user 消息和最终结果,所有内容都来自原文。

---

## 待压缩的对话历史

${JSON.stringify(messages, null, 2)}

---

请严格按照三阶段工作流执行,确保对话逻辑连贯、ID 映射关系完全正确,输出接近目标 token 数。`;
};
|
||||
@@ -15,7 +15,7 @@ import { removeDatasetCiteText } from '@fastgpt/global/core/ai/llm/utils';
|
||||
import { getAIApi } from '../config';
|
||||
import type { OpenaiAccountType } from '@fastgpt/global/support/user/team/type';
|
||||
import { getNanoid } from '@fastgpt/global/common/string/tools';
|
||||
import { parsePromptToolCall, promptToolCallMessageRewrite } from './promptToolCall';
|
||||
import { parsePromptToolCall, promptToolCallMessageRewrite } from './promptCall';
|
||||
import { getLLMModel } from '../model';
|
||||
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
|
||||
import { countGptMessagesTokens } from '../../../common/string/tiktoken/index';
|
||||
@@ -26,14 +26,14 @@ import { i18nT } from '../../../../web/i18n/utils';
|
||||
import { getErrText } from '@fastgpt/global/common/error/utils';
|
||||
import json5 from 'json5';
|
||||
|
||||
type ResponseEvents = {
|
||||
export type ResponseEvents = {
|
||||
onStreaming?: ({ text }: { text: string }) => void;
|
||||
onReasoning?: ({ text }: { text: string }) => void;
|
||||
onToolCall?: ({ call }: { call: ChatCompletionMessageToolCall }) => void;
|
||||
onToolParam?: ({ tool, params }: { tool: ChatCompletionMessageToolCall; params: string }) => void;
|
||||
};
|
||||
|
||||
type CreateLLMResponseProps<T extends CompletionsBodyType> = {
|
||||
export type CreateLLMResponseProps<T extends CompletionsBodyType = CompletionsBodyType> = {
|
||||
userKey?: OpenaiAccountType;
|
||||
body: LLMRequestBodyType<T>;
|
||||
isAborted?: () => boolean | undefined;
|
||||
@@ -86,7 +86,7 @@ export const createLLMResponse = async <T extends CompletionsBodyType>(
|
||||
messages: rewriteMessages
|
||||
});
|
||||
|
||||
// console.log(JSON.stringify(requestBody, null, 2));
|
||||
// console.dir(requestBody, { depth: null });
|
||||
const { response, isStreamResponse, getEmptyResponseTip } = await createChatCompletion({
|
||||
body: requestBody,
|
||||
userKey,
|
||||
|
||||
@@ -2,6 +2,8 @@ import { type LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
|
||||
import type { CompletionFinishReason, CompletionUsage } from '@fastgpt/global/core/ai/type';
|
||||
import { getLLMDefaultUsage } from '@fastgpt/global/core/ai/constants';
|
||||
import { removeDatasetCiteText } from '@fastgpt/global/core/ai/llm/utils';
|
||||
import json5 from 'json5';
|
||||
import { sliceJsonStr } from '@fastgpt/global/common/string/tools';
|
||||
|
||||
/*
|
||||
Count response max token
|
||||
@@ -317,3 +319,11 @@ export const parseLLMStreamResponse = () => {
|
||||
updateFinishReason
|
||||
};
|
||||
};
|
||||
|
||||
export const parseToolArgs = <T = Record<string, any>>(toolArgs: string) => {
|
||||
try {
|
||||
return json5.parse(sliceJsonStr(toolArgs)) as T;
|
||||
} catch {
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user