Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-22 04:06:18 +00:00)
Test parse cite and add tool call parallel (#4737)
* add quote response filter (#4727)
* chatting
* add quote response filter
* add test
* remove comment
* perf: cite hidden
* perf: format llm response
* feat: comment
* update default chunk size
* update default chunk size
---------
Co-authored-by: heheer <heheer@sealos.io>
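The headline changes are the dataset-cite (quote) response filter and parallel tool-call parsing; the diffs follow below. For orientation, here is a minimal sketch of the cite-marker stripping this commit introduces. The regex is taken from the ai utils diff further down; the sample id is invented for illustration.

// Sketch of the cite-marker stripping added by this commit.
// Dataset citations arrive inline as "[<24-char hex id>](CITE)"; when retainDatasetCite
// is false they are removed before text is streamed back to the client.
export const removeDatasetCiteTextSketch = (text: string, retainDatasetCite: boolean) =>
  retainDatasetCite ? text : text.replace(/\[([a-f0-9]{24})\]\(CITE\)/g, '');

// Example (hypothetical id):
// removeDatasetCiteTextSketch('Answer[66b1c2d3e4f5a6b7c8d9e0f1](CITE)', false) === 'Answer'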
@@ -1,79 +1,6 @@
{
"provider": "Qwen",
"list": [
{
"model": "qwen-vl-plus",
"name": "qwen-vl-plus",
"maxContext": 32000,
"maxResponse": 2000,
"quoteMaxToken": 20000,
"maxTemperature": 1.2,
"vision": true,
"toolChoice": false,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"type": "llm",
"showTopP": true,
"showStopSign": true
},
{
"model": "qwen-plus",
"name": "Qwen-plus",
"maxContext": 64000,
"maxResponse": 8000,
"quoteMaxToken": 60000,
"maxTemperature": 1,
"vision": false,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm",
"showTopP": true,
"showStopSign": true,
"responseFormatList": ["text", "json_object"]
},
{
"model": "qwen-turbo",
"name": "Qwen-turbo",
"maxContext": 128000,
"maxResponse": 8000,
"quoteMaxToken": 100000,
"maxTemperature": 1,
"vision": false,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm",
"showTopP": true,
"showStopSign": true,
"responseFormatList": ["text", "json_object"]
},
{
"model": "qwen-max",
"name": "Qwen-max",
@@ -123,6 +50,78 @@
"showTopP": true,
"showStopSign": true
},
{
"model": "qwen-plus",
"name": "Qwen-plus",
"maxContext": 64000,
"maxResponse": 8000,
"quoteMaxToken": 60000,
"maxTemperature": 1,
"vision": false,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm",
"showTopP": true,
"showStopSign": true,
"responseFormatList": ["text", "json_object"]
},
{
"model": "qwen-vl-plus",
"name": "qwen-vl-plus",
"maxContext": 32000,
"maxResponse": 2000,
"quoteMaxToken": 20000,
"maxTemperature": 1.2,
"vision": true,
"toolChoice": false,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"type": "llm",
"showTopP": true,
"showStopSign": true
},
{
"model": "qwen-turbo",
"name": "Qwen-turbo",
"maxContext": 128000,
"maxResponse": 8000,
"quoteMaxToken": 100000,
"maxTemperature": 1,
"vision": false,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm",
"showTopP": true,
"showStopSign": true,
"responseFormatList": ["text", "json_object"]
},
{
"model": "qwen3-235b-a22b",
"name": "qwen3-235b-a22b",
@@ -142,7 +141,9 @@
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": true,
@@ -168,7 +169,9 @@
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": true,
@@ -194,7 +197,9 @@
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": true,
@@ -220,7 +225,9 @@
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": true,
@@ -246,7 +253,9 @@
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": true,
@@ -272,7 +281,9 @@
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": true,
@@ -298,7 +309,9 @@
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": true,
@@ -324,7 +337,9 @@
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": true,
@@ -350,7 +365,9 @@
"usedInQueryExtension": false,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": false,
@@ -375,7 +392,9 @@
"usedInQueryExtension": false,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"defaultConfig": {
"stream": true
},
"fieldMap": {},
"type": "llm",
"showTopP": false,
@@ -2,7 +2,7 @@ import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d'
import { createChatCompletion } from '../config';
import { countGptMessagesTokens, countPromptTokens } from '../../../common/string/tiktoken/index';
import { loadRequestMessages } from '../../chat/utils';
import { llmCompletionsBodyFormat, llmResponseToAnswerText } from '../utils';
import { llmCompletionsBodyFormat, formatLLMResponse } from '../utils';
import {
QuestionGuidePrompt,
QuestionGuideFooterPrompt
@@ -42,12 +42,12 @@ export async function createQuestionGuide({
temperature: 0.1,
max_tokens: 200,
messages: requestMessages,
stream: false
stream: true
},
model
)
});
const { text: answer, usage } = await llmResponseToAnswerText(response);
const { text: answer, usage } = await formatLLMResponse(response);
const start = answer.indexOf('[');
const end = answer.lastIndexOf(']');
@@ -4,7 +4,7 @@ import { ChatItemType } from '@fastgpt/global/core/chat/type';
import { countGptMessagesTokens, countPromptTokens } from '../../../common/string/tiktoken/index';
import { chats2GPTMessages } from '@fastgpt/global/core/chat/adapt';
import { getLLMModel } from '../model';
import { llmCompletionsBodyFormat, llmResponseToAnswerText } from '../utils';
import { llmCompletionsBodyFormat, formatLLMResponse } from '../utils';
import { addLog } from '../../../common/system/log';
import { filterGPTMessageByMaxContext } from '../../chat/utils';
import json5 from 'json5';
@@ -170,7 +170,7 @@ assistant: ${chatBg}
const { response } = await createChatCompletion({
body: llmCompletionsBodyFormat(
{
stream: false,
stream: true,
model: modelData.model,
temperature: 0.1,
messages
@@ -178,7 +178,7 @@ assistant: ${chatBg}
modelData
)
});
const { text: answer, usage } = await llmResponseToAnswerText(response);
const { text: answer, usage } = await formatLLMResponse(response);
const inputTokens = usage?.prompt_tokens || (await countGptMessagesTokens(messages));
const outputTokens = usage?.completion_tokens || (await countPromptTokens(answer));
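Both call sites above switch from llmResponseToAnswerText to the renamed formatLLMResponse and now request stream: true. As a rough, self-contained sketch of what that helper normalizes (types simplified and assumed; the real implementation is in the ai utils diff further down and also assembles tool calls from streamed chunks):

// Sketch only: both streaming and non-streaming completions end up as { text, usage?, toolCalls? }.
type SketchToolCall = { id: string; type: 'function'; function: { name: string; arguments: string } };
type SketchNormalized = { text: string; usage?: unknown; toolCalls?: SketchToolCall[] };

async function formatLLMResponseSketch(
  response: AsyncIterable<any> | { choices?: any[]; usage?: unknown }
): Promise<SketchNormalized> {
  // Streaming response: iterate the chunks and accumulate delta content.
  if (Symbol.asyncIterator in Object(response)) {
    let text = '';
    let usage: unknown;
    for await (const part of response as AsyncIterable<any>) {
      usage = part.usage || usage;
      text += part.choices?.[0]?.delta?.content || '';
    }
    return { text, usage };
  }
  // Non-streaming response: read the whole message at once.
  const full = response as { choices?: any[]; usage?: unknown };
  return {
    text: full.choices?.[0]?.message?.content || '',
    toolCalls: full.choices?.[0]?.message?.tool_calls,
    usage: full.usage
  };
}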
@@ -1,145 +0,0 @@
import { parseReasoningStreamContent } from './utils';
import { expect, test } from 'vitest';
test('Parse reasoning stream content test', async () => {
const partList = [
{
data: [{ content: '你好1' }, { content: '你好2' }, { content: '你好3' }],
correct: { answer: '你好1你好2你好3', reasoning: '' }
},
{
data: [
{ reasoning_content: '这是' },
{ reasoning_content: '思考' },
{ reasoning_content: '过程' },
{ content: '你好1' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '你好1你好2你好3', reasoning: '这是思考过程' }
},
{
data: [
{ content: '<t' },
{ content: 'hink>' },
{ content: '这是' },
{ content: '思考' },
{ content: '过程' },
{ content: '</think>' },
{ content: '你好1' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '你好1你好2你好3', reasoning: '这是思考过程' }
},
{
data: [
{ content: '<think>' },
{ content: '这是' },
{ content: '思考' },
{ content: '过程' },
{ content: '</think>' },
{ content: '你好1' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '你好1你好2你好3', reasoning: '这是思考过程' }
},
{
data: [
{ content: '<think>这是' },
{ content: '思考' },
{ content: '过程' },
{ content: '</think>' },
{ content: '你好1' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '你好1你好2你好3', reasoning: '这是思考过程' }
},
{
data: [
{ content: '<think>这是' },
{ content: '思考' },
{ content: '过程</' },
{ content: 'think>' },
{ content: '你好1' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '你好1你好2你好3', reasoning: '这是思考过程' }
},
{
data: [
{ content: '<think>这是' },
{ content: '思考' },
{ content: '过程</think>' },
{ content: '你好1' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '你好1你好2你好3', reasoning: '这是思考过程' }
},
{
data: [
{ content: '<think>这是' },
{ content: '思考' },
{ content: '过程</think>你好1' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '你好1你好2你好3', reasoning: '这是思考过程' }
},
{
data: [
{ content: '<think>这是' },
{ content: '思考' },
{ content: '过程</th' },
{ content: '假的' },
{ content: '你好2' },
{ content: '你好3' },
{ content: '过程</think>你好1' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '你好1你好2你好3', reasoning: '这是思考过程</th假的你好2你好3过程' }
},
{
data: [
{ content: '<think>这是' },
{ content: '思考' },
{ content: '过程</th' },
{ content: '假的' },
{ content: '你好2' },
{ content: '你好3' }
],
correct: { answer: '', reasoning: '这是思考过程</th假的你好2你好3' }
}
];
partList.forEach((part) => {
const { parsePart } = parseReasoningStreamContent();
let answer = '';
let reasoning = '';
part.data.forEach((item) => {
const formatPart = {
choices: [
{
delta: {
role: 'assistant',
content: item.content,
reasoning_content: item.reasoning_content
}
}
]
};
const [reasoningContent, content] = parsePart(formatPart, true);
answer += content;
reasoning += reasoningContent;
});
expect(answer).toBe(part.correct.answer);
expect(reasoning).toBe(part.correct.reasoning);
});
});
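The file above (the old parseReasoningStreamContent test) is deleted by this commit; the replacement test is not part of this excerpt. A hypothetical sketch of the same <think> scenario written against the new object-style parsePart API defined in the utils diff below:

import { expect, test } from 'vitest';
import { parseLLMStreamResponse } from './utils';

// Hypothetical sketch, not the actual replacement test added by the commit.
test('parses <think> tags from stream chunks', () => {
  const { parsePart } = parseLLMStreamResponse();
  const chunks = ['<think>这是', '思考', '过程</think>', '你好1', '你好2', '你好3'];

  let answer = '';
  let reasoning = '';
  for (const content of chunks) {
    const { content: answerPart, reasoningContent } = parsePart({
      part: { choices: [{ delta: { content } }] },
      parseThinkTag: true,
      retainDatasetCite: true
    });
    answer += answerPart;
    reasoning += reasoningContent;
  }

  expect(answer).toBe('你好1你好2你好3');
  expect(reasoning).toBe('这是思考过程');
});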
@@ -5,10 +5,12 @@ import {
CompletionFinishReason,
StreamChatType,
UnStreamChatType,
CompletionUsage
CompletionUsage,
ChatCompletionMessageToolCall
} from '@fastgpt/global/core/ai/type';
import { getLLMModel } from './model';
import { getLLMDefaultUsage } from '@fastgpt/global/core/ai/constants';
import { getNanoid } from '@fastgpt/global/common/string/tools';
/*
Count response max token
@@ -105,33 +107,84 @@ export const llmStreamResponseToAnswerText = async (
): Promise<{
text: string;
usage?: CompletionUsage;
toolCalls?: ChatCompletionMessageToolCall[];
}> => {
let answer = '';
let usage = getLLMDefaultUsage();
let toolCalls: ChatCompletionMessageToolCall[] = [];
let callingTool: { name: string; arguments: string } | null = null;
for await (const part of response) {
usage = part.usage || usage;
const responseChoice = part.choices?.[0]?.delta;
const content = part.choices?.[0]?.delta?.content || '';
const content = responseChoice?.content || '';
answer += content;
// Tool calls
if (responseChoice?.tool_calls?.length) {
responseChoice.tool_calls.forEach((toolCall) => {
const index = toolCall.index;
if (toolCall.id || callingTool) {
// Has an id: a new tool call starts
if (toolCall.id) {
callingTool = {
name: toolCall.function?.name || '',
arguments: toolCall.function?.arguments || ''
};
} else if (callingTool) {
// Continue call (perhaps the previous function name was incomplete)
callingTool.name += toolCall.function?.name || '';
callingTool.arguments += toolCall.function?.arguments || '';
}
if (!callingTool) {
return;
}
// New tool, add to list.
const toolId = getNanoid();
toolCalls[index] = {
...toolCall,
id: toolId,
type: 'function',
function: callingTool
};
callingTool = null;
} else {
/* Append arg to the current tool's arguments */
const arg: string = toolCall?.function?.arguments ?? '';
const currentTool = toolCalls[index];
if (currentTool && arg) {
currentTool.function.arguments += arg;
}
}
});
}
}
return {
text: parseReasoningContent(answer)[1],
usage
usage,
toolCalls
};
};
export const llmUnStreamResponseToAnswerText = async (
response: UnStreamChatType
): Promise<{
text: string;
toolCalls?: ChatCompletionMessageToolCall[];
usage?: CompletionUsage;
}> => {
const answer = response.choices?.[0]?.message?.content || '';
const toolCalls = response.choices?.[0]?.message?.tool_calls;
return {
text: answer,
usage: response.usage
usage: response.usage,
toolCalls
};
};
export const llmResponseToAnswerText = async (response: StreamChatType | UnStreamChatType) => {
export const formatLLMResponse = async (response: StreamChatType | UnStreamChatType) => {
if ('iterator' in response) {
return llmStreamResponseToAnswerText(response);
}
@@ -155,20 +208,31 @@ export const parseReasoningContent = (text: string): [string, string] => {
return [thinkContent, answerContent];
};
// Parse <think></think> tags to think and answer - stream response
export const parseReasoningStreamContent = () => {
let isInThinkTag: boolean | undefined;
export const removeDatasetCiteText = (text: string, retainDatasetCite: boolean) => {
return retainDatasetCite ? text : text.replace(/\[([a-f0-9]{24})\]\(CITE\)/g, '');
};
const startTag = '<think>';
// Parse llm stream part
export const parseLLMStreamResponse = () => {
let isInThinkTag: boolean | undefined = undefined;
let startTagBuffer = '';
const endTag = '</think>';
let endTagBuffer = '';
const thinkStartChars = '<think>';
const thinkEndChars = '</think>';
let citeBuffer = '';
const maxCiteBufferLength = 32; // the full [ObjectId](CITE) marker is 32 characters
/*
parseThinkTag - only controls whether <think></think> is parsed locally; if the API has already parsed it, it is not parsed again.
retainDatasetCite -
*/
const parsePart = (
const parsePart = ({
part,
parseThinkTag = true,
retainDatasetCite = true
}: {
part: {
choices: {
delta: {
@@ -177,147 +241,209 @@ export const parseReasoningStreamContent = () => {
};
finish_reason?: CompletionFinishReason;
}[];
},
parseThinkTag = false
): {
};
parseThinkTag?: boolean;
retainDatasetCite?: boolean;
}): {
reasoningContent: string;
content: string;
responseContent: string;
finishReason: CompletionFinishReason;
} => {
const content = part.choices?.[0]?.delta?.content || '';
const finishReason = part.choices?.[0]?.finish_reason || null;
const content = part.choices?.[0]?.delta?.content || '';
// @ts-ignore
const reasoningContent = part.choices?.[0]?.delta?.reasoning_content || '';
if (reasoningContent || !parseThinkTag) {
isInThinkTag = false;
return { reasoningContent, content, finishReason };
}
const isStreamEnd = !!finishReason;
if (!content) {
return {
reasoningContent: '',
content: '',
finishReason
};
}
// If not inside a think tag, or reasoningContent exists (already parsed by the API), return reasoningContent and content
if (isInThinkTag === false) {
return {
reasoningContent: '',
content,
finishReason
};
}
// Check whether the data starts with a think tag
if (isInThinkTag === undefined) {
// Parse content think and answer
startTagBuffer += content;
// Too little content so far; postpone parsing
if (startTagBuffer.length < startTag.length) {
return {
reasoningContent: '',
content: '',
finishReason
};
}
if (startTagBuffer.startsWith(startTag)) {
isInThinkTag = true;
return {
reasoningContent: startTagBuffer.slice(startTag.length),
content: '',
finishReason
};
}
// No think tag matched: treat as not inside a think tag and return the buffered content as content
isInThinkTag = false;
return {
reasoningContent: '',
content: startTagBuffer,
finishReason
};
}
// Confirmed think-tag content: start returning think content while watching for </think>
/*
</think> detection scheme:
buffer anything that looks like </think> until the full tag is seen or the buffer exceeds the tag length.
The content return value covers these cases:
abc - no end tag at all
abc<th - partial end tag
abc</think> - full end tag
abc</think>abc - full end tag
</think>abc - full end tag
k>abc - partial end tag
*/
// endTagBuffer records content that might be part of the end tag
if (endTagBuffer) {
endTagBuffer += content;
if (endTagBuffer.includes(endTag)) {
// Parse think
const { reasoningContent: parsedThinkReasoningContent, content: parsedThinkContent } = (() => {
if (reasoningContent || !parseThinkTag) {
isInThinkTag = false;
const answer = endTagBuffer.slice(endTag.length);
return { reasoningContent, content };
}
if (!content) {
return {
reasoningContent: '',
content: answer,
finishReason
};
} else if (endTagBuffer.length >= endTag.length) {
// The buffer exceeds the end-tag length without matching </think>: the guess failed, still in the think phase.
const tmp = endTagBuffer;
endTagBuffer = '';
return {
reasoningContent: tmp,
content: '',
finishReason
content: ''
};
}
return {
reasoningContent: '',
content: '',
finishReason
};
} else if (content.includes(endTag)) {
// Full </think> match in this chunk: finish the think phase immediately
isInThinkTag = false;
const [think, answer] = content.split(endTag);
return {
reasoningContent: think,
content: answer,
finishReason
};
} else {
// No buffer and no full </think> yet: start checking for a partial end tag.
for (let i = 1; i < endTag.length; i++) {
const partialEndTag = endTag.slice(0, i);
// Partial end-tag match
if (content.endsWith(partialEndTag)) {
const think = content.slice(0, -partialEndTag.length);
endTagBuffer += partialEndTag;
// If not inside a think tag, or reasoningContent exists (already parsed by the API), return reasoningContent and content
if (isInThinkTag === false) {
return {
reasoningContent: '',
content
};
}
// Check whether the data starts with a think tag
if (isInThinkTag === undefined) {
// Parse content think and answer
startTagBuffer += content;
// Too little content so far; postpone parsing
if (startTagBuffer.length < thinkStartChars.length) {
if (isStreamEnd) {
const tmpContent = startTagBuffer;
startTagBuffer = '';
return {
reasoningContent: '',
content: tmpContent
};
}
return {
reasoningContent: think,
content: '',
finishReason
reasoningContent: '',
content: ''
};
}
if (startTagBuffer.startsWith(thinkStartChars)) {
isInThinkTag = true;
return {
reasoningContent: startTagBuffer.slice(thinkStartChars.length),
content: ''
};
}
// No think tag matched: treat as not inside a think tag and return the buffered content as content
isInThinkTag = false;
return {
reasoningContent: '',
content: startTagBuffer
};
}
// Confirmed think-tag content: start returning think content while watching for </think>
/*
</think> detection scheme:
buffer anything that looks like </think> until the full tag is seen or the buffer exceeds the tag length.
The content return value covers these cases:
abc - no end tag at all
abc<th - partial end tag
abc</think> - full end tag
abc</think>abc - full end tag
</think>abc - full end tag
k>abc - partial end tag
*/
// endTagBuffer records content that might be part of the end tag
if (endTagBuffer) {
endTagBuffer += content;
if (endTagBuffer.includes(thinkEndChars)) {
isInThinkTag = false;
const answer = endTagBuffer.slice(thinkEndChars.length);
return {
reasoningContent: '',
content: answer
};
} else if (endTagBuffer.length >= thinkEndChars.length) {
// The buffer exceeds the end-tag length without matching </think>: the guess failed, still in the think phase.
const tmp = endTagBuffer;
endTagBuffer = '';
return {
reasoningContent: tmp,
content: ''
};
}
return {
reasoningContent: '',
content: ''
};
} else if (content.includes(thinkEndChars)) {
// Full </think> match in this chunk: finish the think phase immediately
isInThinkTag = false;
const [think, answer] = content.split(thinkEndChars);
return {
reasoningContent: think,
content: answer
};
} else {
// No buffer and no full </think> yet: start checking for a partial end tag.
for (let i = 1; i < thinkEndChars.length; i++) {
const partialEndTag = thinkEndChars.slice(0, i);
// Partial end-tag match
if (content.endsWith(partialEndTag)) {
const think = content.slice(0, -partialEndTag.length);
endTagBuffer += partialEndTag;
return {
reasoningContent: think,
content: ''
};
}
}
}
// No end tag matched at all: still in the think phase.
return {
reasoningContent: content,
content: ''
};
})();
// Parse dataset cite
if (retainDatasetCite) {
return {
reasoningContent: parsedThinkReasoningContent,
content: parsedThinkContent,
responseContent: parsedThinkContent,
finishReason
};
}
// No end tag matched at all: still in the think phase.
// Buffer strings containing "[" and flush them once maxCiteBufferLength is exceeded
const parseCite = (text: string) => {
// At stream end, return everything that is left
if (isStreamEnd) {
const content = citeBuffer + text;
return {
content: removeDatasetCiteText(content, false)
};
}
// New content contains "[": initialize the buffer
if (text.includes('[')) {
const index = text.indexOf('[');
const beforeContent = citeBuffer + text.slice(0, index);
citeBuffer = text.slice(index);
// beforeContent may be a plain string or a string containing "["
return {
content: removeDatasetCiteText(beforeContent, false)
};
}
// Inside the cite buffer: check whether it can be flushed
else if (citeBuffer) {
citeBuffer += text;
// Check whether the buffer has reached the full quote length or the stream has ended
if (citeBuffer.length >= maxCiteBufferLength) {
const content = removeDatasetCiteText(citeBuffer, false);
citeBuffer = '';
return {
content
};
} else {
// Hold the content for now
return { content: '' };
}
}
return {
content: text
};
};
const { content: pasedCiteContent } = parseCite(parsedThinkContent);
return {
reasoningContent: content,
content: '',
reasoningContent: parsedThinkReasoningContent,
content: parsedThinkContent,
responseContent: pasedCiteContent,
finishReason
};
};
const getStartTagBuffer = () => startTagBuffer;
return {
parsePart,
getStartTagBuffer
parsePart
};
};
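All of the stream consumers updated in the diffs below drive the new parseLLMStreamResponse the same way. A compact sketch of the calling pattern (the SSE write is stubbed out and the import path is assumed):

import { parseLLMStreamResponse } from './utils'; // path assumed; the helper is defined in the diff above

// content         -> raw answer text, kept for history and token counting
// responseContent -> what is actually streamed to the client; dataset cite markers
//                    are already stripped when retainDatasetCite is false
export async function collectStream(
  stream: AsyncIterable<any>,
  send: (text: string) => void // stand-in for workflowStreamResponse
) {
  const { parsePart } = parseLLMStreamResponse();
  let answer = '';
  let reasoning = '';
  for await (const part of stream) {
    const { reasoningContent, content, responseContent } = parsePart({
      part,
      parseThinkTag: true,
      retainDatasetCite: false
    });
    answer += content;
    reasoning += reasoningContent;
    if (responseContent) send(responseContent);
  }
  return { answer, reasoning };
}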
@@ -31,5 +31,6 @@ export const computedPluginUsage = async ({
return plugin.hasTokenFee ? pluginCurrentCost + childrenUsages : pluginCurrentCost;
}
// Personal plugins are charged regardless of whether they are successful or not
return childrenUsages;
};
@@ -19,7 +19,7 @@ import { DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/ty
import { chatValue2RuntimePrompt } from '@fastgpt/global/core/chat/adapt';
import { getHandleId } from '@fastgpt/global/core/workflow/utils';
import { loadRequestMessages } from '../../../chat/utils';
import { llmCompletionsBodyFormat, llmResponseToAnswerText } from '../../../ai/utils';
import { llmCompletionsBodyFormat, formatLLMResponse } from '../../../ai/utils';
import { addLog } from '../../../../common/system/log';
import { ModelTypeEnum } from '../../../../../global/core/ai/model';
import { replaceVariable } from '@fastgpt/global/common/string/tools';
@@ -135,13 +135,13 @@ const completions = async ({
model: cqModel.model,
temperature: 0.01,
messages: requestMessages,
stream: false
stream: true
},
cqModel
),
userKey: externalProvider.openaiAccount
});
const { text: answer, usage } = await llmResponseToAnswerText(response);
const { text: answer, usage } = await formatLLMResponse(response);
// console.log(JSON.stringify(chats2GPTMessages({ messages, reserveId: false }), null, 2));
// console.log(answer, '----');
@@ -30,7 +30,7 @@ import {
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import { DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/type';
import { chatValue2RuntimePrompt } from '@fastgpt/global/core/chat/adapt';
import { llmCompletionsBodyFormat, llmResponseToAnswerText } from '../../../ai/utils';
import { llmCompletionsBodyFormat, formatLLMResponse } from '../../../ai/utils';
import { ModelTypeEnum } from '../../../../../global/core/ai/model';
import {
getExtractJsonPrompt,
@@ -226,10 +226,10 @@ const toolChoice = async (props: ActionProps) => {
}
];
const { response } = (await createChatCompletion({
const { response } = await createChatCompletion({
body: llmCompletionsBodyFormat(
{
stream: false,
stream: true,
model: extractModel.model,
temperature: 0.01,
messages: filterMessages,
@@ -239,16 +239,15 @@ const toolChoice = async (props: ActionProps) => {
extractModel
),
userKey: externalProvider.openaiAccount
})) as { response: UnStreamChatType };
});
const { toolCalls, usage } = await formatLLMResponse(response);
const arg: Record<string, any> = (() => {
try {
return json5.parse(
response?.choices?.[0]?.message?.tool_calls?.[0]?.function?.arguments || ''
);
return json5.parse(toolCalls?.[0]?.function?.arguments || '');
} catch (error) {
console.log(agentFunction.parameters);
console.log(response.choices?.[0]?.message?.tool_calls?.[0]?.function);
console.log(toolCalls?.[0]?.function);
console.log('Your model may not support tool_call', error);
return {};
}
@@ -257,11 +256,10 @@ const toolChoice = async (props: ActionProps) => {
const AIMessages: ChatCompletionMessageParam[] = [
{
role: ChatCompletionRequestMessageRoleEnum.Assistant,
tool_calls: response.choices?.[0]?.message?.tool_calls
tool_calls: toolCalls
}
];
const usage = response.usage;
const inputTokens = usage?.prompt_tokens || (await countGptMessagesTokens(filterMessages, tools));
const outputTokens = usage?.completion_tokens || (await countGptMessagesTokens(AIMessages));
return {
@@ -321,13 +319,13 @@ Human: ${content}`
model: extractModel.model,
temperature: 0.01,
messages: requestMessages,
stream: false
stream: true
},
extractModel
),
userKey: externalProvider.openaiAccount
});
const { text: answer, usage } = await llmResponseToAnswerText(response);
const { text: answer, usage } = await formatLLMResponse(response);
const inputTokens = usage?.prompt_tokens || (await countMessagesTokens(messages));
const outputTokens = usage?.completion_tokens || (await countPromptTokens(answer));
@@ -26,7 +26,12 @@ import { getNanoid, sliceStrStartEnd } from '@fastgpt/global/common/string/tools
import { AIChatItemType } from '@fastgpt/global/core/chat/type';
import { GPTMessages2Chats } from '@fastgpt/global/core/chat/adapt';
import { formatToolResponse, initToolCallEdges, initToolNodes } from './utils';
import { computedMaxToken, llmCompletionsBodyFormat } from '../../../../ai/utils';
import {
computedMaxToken,
llmCompletionsBodyFormat,
removeDatasetCiteText,
parseLLMStreamResponse
} from '../../../../ai/utils';
import { toolValueTypeList } from '@fastgpt/global/core/workflow/constants';
import { WorkflowInteractiveResponseType } from '@fastgpt/global/core/workflow/template/system/interactive/type';
import { ChatItemValueTypeEnum } from '@fastgpt/global/core/chat/constants';
@@ -48,6 +53,7 @@ export const runToolWithFunctionCall = async (
runtimeEdges,
externalProvider,
stream,
retainDatasetCite = true,
workflowStreamResponse,
params: {
temperature,
@@ -261,7 +267,8 @@ export const runToolWithFunctionCall = async (
res,
toolNodes,
stream: aiResponse,
workflowStreamResponse
workflowStreamResponse,
retainDatasetCite
});
return {
@@ -288,8 +295,18 @@ export const runToolWithFunctionCall = async (
]
: [];
const answer = result.choices?.[0]?.message?.content || '';
if (answer) {
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
text: removeDatasetCiteText(answer, retainDatasetCite)
})
});
}
return {
answer: result.choices?.[0]?.message?.content || '',
answer,
functionCalls: toolCalls,
inputTokens: usage?.prompt_tokens,
outputTokens: usage?.completion_tokens
@@ -509,12 +526,14 @@ async function streamResponse({
res,
toolNodes,
stream,
workflowStreamResponse
workflowStreamResponse,
retainDatasetCite
}: {
res: NextApiResponse;
toolNodes: ToolNodeItemType[];
stream: StreamChatType;
workflowStreamResponse?: WorkflowResponseType;
retainDatasetCite?: boolean;
}) {
const write = responseWriteController({
res,
@@ -526,6 +545,8 @@ async function streamResponse({
let functionId = getNanoid();
let usage = getLLMDefaultUsage();
const { parsePart } = parseLLMStreamResponse();
for await (const part of stream) {
usage = part.usage || usage;
if (res.closed) {
@@ -533,17 +554,21 @@ async function streamResponse({
break;
}
const { content: toolChoiceContent, responseContent } = parsePart({
part,
parseThinkTag: false,
retainDatasetCite
});
const responseChoice = part.choices?.[0]?.delta;
textAnswer += toolChoiceContent;
if (responseChoice.content) {
const content = responseChoice?.content || '';
textAnswer += content;
if (responseContent) {
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.answer,
data: textAdaptGptResponse({
text: content
text: responseContent
})
});
} else if (responseChoice.function_call) {
@@ -29,8 +29,9 @@ import { formatToolResponse, initToolCallEdges, initToolNodes } from './utils';
import {
computedMaxToken,
llmCompletionsBodyFormat,
removeDatasetCiteText,
parseReasoningContent,
parseReasoningStreamContent
parseLLMStreamResponse
} from '../../../../ai/utils';
import { WorkflowResponseType } from '../../type';
import { toolValueTypeList } from '@fastgpt/global/core/workflow/constants';
@@ -60,6 +61,7 @@ export const runToolWithPromptCall = async (
runtimeEdges,
externalProvider,
stream,
retainDatasetCite = true,
workflowStreamResponse,
params: {
temperature,
@@ -275,7 +277,8 @@ export const runToolWithPromptCall = async (
toolNodes,
stream: aiResponse,
workflowStreamResponse,
aiChatReasoning
aiChatReasoning,
retainDatasetCite
});
return {
@@ -318,7 +321,7 @@ export const runToolWithPromptCall = async (
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
reasoning_content: reasoning
reasoning_content: removeDatasetCiteText(reasoning, retainDatasetCite)
})
});
}
@@ -344,7 +347,7 @@ export const runToolWithPromptCall = async (
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
text: replaceAnswer
text: removeDatasetCiteText(replaceAnswer, retainDatasetCite)
})
});
}
@@ -566,13 +569,15 @@ async function streamResponse({
res,
stream,
workflowStreamResponse,
aiChatReasoning
aiChatReasoning,
retainDatasetCite
}: {
res: NextApiResponse;
toolNodes: ToolNodeItemType[];
stream: StreamChatType;
workflowStreamResponse?: WorkflowResponseType;
aiChatReasoning?: boolean;
retainDatasetCite?: boolean;
}) {
const write = responseWriteController({
res,
@@ -585,7 +590,7 @@ async function streamResponse({
let finish_reason: CompletionFinishReason = null;
let usage = getLLMDefaultUsage();
const { parsePart, getStartTagBuffer } = parseReasoningStreamContent();
const { parsePart } = parseLLMStreamResponse();
for await (const part of stream) {
usage = part.usage || usage;
@@ -595,11 +600,16 @@ async function streamResponse({
break;
}
const { reasoningContent, content, finishReason } = parsePart(part, aiChatReasoning);
const { reasoningContent, content, responseContent, finishReason } = parsePart({
part,
parseThinkTag: aiChatReasoning,
retainDatasetCite
});
finish_reason = finish_reason || finishReason;
answer += content;
reasoning += reasoningContent;
// Reasoning response
if (aiChatReasoning && reasoningContent) {
workflowStreamResponse?.({
write,
@@ -612,13 +622,15 @@ async function streamResponse({
if (content) {
if (startResponseWrite) {
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.answer,
data: textAdaptGptResponse({
text: content
})
});
if (responseContent) {
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.answer,
data: textAdaptGptResponse({
text: responseContent
})
});
}
} else if (answer.length >= 3) {
answer = answer.trimStart();
if (/0(:|:)/.test(answer)) {
@@ -640,22 +652,6 @@ async function streamResponse({
}
}
if (answer === '') {
answer = getStartTagBuffer();
if (/0(:|:)/.test(answer)) {
// find first : index
const firstIndex = answer.indexOf('0:') !== -1 ? answer.indexOf('0:') : answer.indexOf('0:');
answer = answer.substring(firstIndex + 2).trim();
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.answer,
data: textAdaptGptResponse({
text: answer
})
});
}
}
return { answer, reasoning, finish_reason, usage };
}
@@ -26,7 +26,12 @@ import { countGptMessagesTokens } from '../../../../../common/string/tiktoken/in
import { GPTMessages2Chats } from '@fastgpt/global/core/chat/adapt';
import { AIChatItemType } from '@fastgpt/global/core/chat/type';
import { formatToolResponse, initToolCallEdges, initToolNodes } from './utils';
import { computedMaxToken, llmCompletionsBodyFormat } from '../../../../ai/utils';
import {
computedMaxToken,
llmCompletionsBodyFormat,
removeDatasetCiteText,
parseLLMStreamResponse
} from '../../../../ai/utils';
import { getNanoid, sliceStrStartEnd } from '@fastgpt/global/common/string/tools';
import { toolValueTypeList } from '@fastgpt/global/core/workflow/constants';
import { WorkflowInteractiveResponseType } from '@fastgpt/global/core/workflow/template/system/interactive/type';
@@ -89,12 +94,13 @@ export const runToolWithToolChoice = async (
interactiveEntryToolParams,
...workflowProps
} = props;
const {
let {
res,
requestOrigin,
runtimeNodes,
runtimeEdges,
stream,
retainDatasetCite = true,
externalProvider,
workflowStreamResponse,
params: {
@@ -104,9 +110,11 @@ export const runToolWithToolChoice = async (
aiChatTopP,
aiChatStopSign,
aiChatResponseFormat,
aiChatJsonSchema
aiChatJsonSchema,
aiChatReasoning
}
} = workflowProps;
aiChatReasoning = !!aiChatReasoning && !!toolModel.reasoning;
if (maxRunToolTimes <= 0 && response) {
return response;
@@ -279,6 +287,7 @@ export const runToolWithToolChoice = async (
messages: requestMessages,
tools,
tool_choice: 'auto',
parallel_tool_calls: true,
temperature,
max_tokens,
top_p: aiChatTopP,
@@ -288,7 +297,7 @@ export const runToolWithToolChoice = async (
},
toolModel
);
// console.log(JSON.stringify(filterMessages, null, 2), '==requestMessages');
// console.log(JSON.stringify(requestBody, null, 2), '==requestMessages');
/* Run llm */
const {
response: aiResponse,
@@ -320,7 +329,9 @@ export const runToolWithToolChoice = async (
res,
workflowStreamResponse,
toolNodes,
stream: aiResponse
stream: aiResponse,
aiChatReasoning,
retainDatasetCite
});
return {
@@ -335,11 +346,38 @@ export const runToolWithToolChoice = async (
const finish_reason = result.choices?.[0]?.finish_reason as CompletionFinishReason;
const calls = result.choices?.[0]?.message?.tool_calls || [];
const answer = result.choices?.[0]?.message?.content || '';
// @ts-ignore
const reasoningContent = result.choices?.[0]?.message?.reasoning_content || '';
const usage = result.usage;
// Attach name and avatar
if (aiChatReasoning && reasoningContent) {
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
reasoning_content: removeDatasetCiteText(reasoningContent, retainDatasetCite)
})
});
}
// Format toolCalls
const toolCalls = calls.map((tool) => {
const toolNode = toolNodes.find((item) => item.nodeId === tool.function?.name);
// Models without stream support need a tool-call response sent to the client here
workflowStreamResponse?.({
event: SseResponseEventEnum.toolCall,
data: {
tool: {
id: tool.id,
toolName: toolNode?.name || '',
toolAvatar: toolNode?.avatar || '',
functionName: tool.function.name,
params: tool.function?.arguments ?? '',
response: ''
}
}
});
return {
...tool,
toolName: toolNode?.name || '',
@@ -347,27 +385,11 @@ export const runToolWithToolChoice = async (
};
});
// Streamed response for models that do not support stream mode
toolCalls.forEach((tool) => {
workflowStreamResponse?.({
event: SseResponseEventEnum.toolCall,
data: {
tool: {
id: tool.id,
toolName: tool.toolName,
toolAvatar: tool.toolAvatar,
functionName: tool.function.name,
params: tool.function?.arguments ?? '',
response: ''
}
}
});
});
if (answer) {
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
text: answer
text: removeDatasetCiteText(answer, retainDatasetCite)
})
});
}
@@ -627,12 +649,16 @@ async function streamResponse({
res,
toolNodes,
stream,
workflowStreamResponse
workflowStreamResponse,
aiChatReasoning,
retainDatasetCite
}: {
res: NextApiResponse;
toolNodes: ToolNodeItemType[];
stream: StreamChatType;
workflowStreamResponse?: WorkflowResponseType;
aiChatReasoning: boolean;
retainDatasetCite?: boolean;
}) {
const write = responseWriteController({
res,
@@ -642,105 +668,130 @@ async function streamResponse({
let textAnswer = '';
let callingTool: { name: string; arguments: string } | null = null;
let toolCalls: ChatCompletionMessageToolCall[] = [];
let finishReason: CompletionFinishReason = null;
let finish_reason: CompletionFinishReason = null;
let usage = getLLMDefaultUsage();
const { parsePart } = parseLLMStreamResponse();
for await (const part of stream) {
usage = part.usage || usage;
if (res.closed) {
stream.controller?.abort();
finishReason = 'close';
finish_reason = 'close';
break;
}
const {
reasoningContent,
content: toolChoiceContent,
responseContent,
finishReason
} = parsePart({
part,
parseThinkTag: true,
retainDatasetCite
});
textAnswer += toolChoiceContent;
finish_reason = finishReason || finish_reason;
const responseChoice = part.choices?.[0]?.delta;
const finish_reason = part.choices?.[0]?.finish_reason as CompletionFinishReason;
finishReason = finishReason || finish_reason;
if (responseChoice?.content) {
const content = responseChoice.content || '';
textAnswer += content;
// Reasoning response
if (aiChatReasoning && reasoningContent) {
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.answer,
data: textAdaptGptResponse({
text: content
reasoning_content: reasoningContent
})
});
}
if (responseChoice?.tool_calls?.[0]) {
// @ts-ignore
const toolCall: ChatCompletionMessageToolCall = responseChoice.tool_calls[0];
// In a stream response, only one tool is returned at a time. If it has an id, a new tool call is starting
if (toolCall.id || callingTool) {
// Start call tool
if (toolCall.id) {
callingTool = {
name: toolCall.function?.name || '',
arguments: toolCall.function?.arguments || ''
};
} else if (callingTool) {
// Continue call
callingTool.name += toolCall.function.name || '';
callingTool.arguments += toolCall.function.arguments || '';
}
if (responseContent) {
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.answer,
data: textAdaptGptResponse({
text: responseContent
})
});
}
// Parse tool calls
if (responseChoice?.tool_calls?.length) {
responseChoice.tool_calls.forEach((toolCall) => {
const index = toolCall.index;
const toolFunction = callingTool!;
// Call new tool
if (toolCall.id || callingTool) {
// Has an id: a new tool call starts
if (toolCall.id) {
callingTool = {
name: toolCall.function?.name || '',
arguments: toolCall.function?.arguments || ''
};
} else if (callingTool) {
// Continue call (perhaps the previous function name was incomplete)
callingTool.name += toolCall.function?.name || '';
callingTool.arguments += toolCall.function?.arguments || '';
}
const toolNode = toolNodes.find((item) => item.nodeId === toolFunction.name);
if (!callingTool) {
return;
}
if (toolNode) {
// New tool, add to list.
const toolId = getNanoid();
toolCalls.push({
...toolCall,
id: toolId,
type: 'function',
function: toolFunction,
toolName: toolNode.name,
toolAvatar: toolNode.avatar
});
const toolNode = toolNodes.find((item) => item.nodeId === callingTool!.name);
workflowStreamResponse?.({
event: SseResponseEventEnum.toolCall,
data: {
tool: {
id: toolId,
toolName: toolNode.name,
toolAvatar: toolNode.avatar,
functionName: toolFunction.name,
params: toolFunction?.arguments ?? '',
response: ''
if (toolNode) {
// New tool, add to list.
const toolId = getNanoid();
toolCalls[index] = {
...toolCall,
id: toolId,
type: 'function',
function: callingTool,
toolName: toolNode.name,
toolAvatar: toolNode.avatar
};
workflowStreamResponse?.({
event: SseResponseEventEnum.toolCall,
data: {
tool: {
id: toolId,
toolName: toolNode.name,
toolAvatar: toolNode.avatar,
functionName: callingTool.name,
params: callingTool?.arguments ?? '',
response: ''
}
}
}
});
callingTool = null;
}
} else {
/* Append arg to the last tool's arguments */
const arg: string = toolCall?.function?.arguments ?? '';
const currentTool = toolCalls[toolCalls.length - 1];
if (currentTool && arg) {
currentTool.function.arguments += arg;
});
callingTool = null;
}
} else {
/* Append arg to the current tool's arguments */
const arg: string = toolCall?.function?.arguments ?? '';
const currentTool = toolCalls[index];
if (currentTool && arg) {
currentTool.function.arguments += arg;
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.toolParams,
data: {
tool: {
id: currentTool.id,
toolName: '',
toolAvatar: '',
params: arg,
response: ''
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.toolParams,
data: {
tool: {
id: currentTool.id,
toolName: '',
toolAvatar: '',
params: arg,
response: ''
}
}
}
});
});
}
}
}
});
}
}
return { answer: textAnswer, toolCalls, finish_reason: finishReason, usage };
return { answer: textAnswer, toolCalls: toolCalls.filter(Boolean), finish_reason, usage };
}
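The streaming branch above now keys accumulated tool calls by the delta's index instead of always appending to the last entry, which is what lets parallel tool calls (parallel_tool_calls: true) assemble correctly when the provider interleaves chunks from different tools. A self-contained sketch of that accumulation rule, with a simplified delta shape assumed for illustration:

type SketchToolCallDelta = {
  index: number;
  id?: string;
  function?: { name?: string; arguments?: string };
};
type SketchAssembledCall = { id: string; name: string; arguments: string };

// Sketch: accumulate interleaved tool_call deltas by index, so two tools that
// stream in parallel each grow their own arguments string.
function accumulateToolCalls(chunks: SketchToolCallDelta[][]): SketchAssembledCall[] {
  const calls: SketchAssembledCall[] = [];
  for (const deltas of chunks) {
    for (const d of deltas) {
      if (d.id) {
        // A delta carrying an id opens a new call at its index.
        calls[d.index] = {
          id: d.id,
          name: d.function?.name || '',
          arguments: d.function?.arguments || ''
        };
      } else if (calls[d.index]) {
        // No id: append the argument fragment to the call already at this index.
        calls[d.index].arguments += d.function?.arguments || '';
      }
    }
  }
  return calls.filter(Boolean);
}

// Two tools interleaved across chunks still assemble independently:
// accumulateToolCalls([
//   [{ index: 0, id: 'a', function: { name: 'search', arguments: '{"q":' } }],
//   [{ index: 1, id: 'b', function: { name: 'math', arguments: '{"x":1' } }],
//   [{ index: 0, function: { arguments: '"cat"}' } }, { index: 1, function: { arguments: '}' } }]
// ])
// -> [{ id: 'a', name: 'search', arguments: '{"q":"cat"}' },
//     { id: 'b', name: 'math', arguments: '{"x":1}' }]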
@@ -4,7 +4,11 @@ import type { ChatItemType, UserChatItemValueItemType } from '@fastgpt/global/co
import { ChatRoleEnum } from '@fastgpt/global/core/chat/constants';
import { SseResponseEventEnum } from '@fastgpt/global/core/workflow/runtime/constants';
import { textAdaptGptResponse } from '@fastgpt/global/core/workflow/runtime/utils';
import { parseReasoningContent, parseReasoningStreamContent } from '../../../ai/utils';
import {
removeDatasetCiteText,
parseReasoningContent,
parseLLMStreamResponse
} from '../../../ai/utils';
import { createChatCompletion } from '../../../ai/config';
import type {
ChatCompletionMessageParam,
@@ -75,7 +79,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
res,
requestOrigin,
stream = false,
parseQuote = true,
retainDatasetCite = true,
externalProvider,
histories,
node: { name, version },
@@ -159,8 +163,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
userChatInput,
systemPrompt,
userFiles,
documentQuoteText,
parseQuote
documentQuoteText
}),
// Censor = true and system key, will check content
(() => {
@@ -223,7 +226,8 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
aiChatReasoning,
parseThinkTag: modelConstantsData.reasoning,
isResponseAnswerText,
workflowStreamResponse
workflowStreamResponse,
retainDatasetCite
});
return {
@@ -258,23 +262,21 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
})();
// Some models do not support streaming
if (stream) {
if (aiChatReasoning && reasoningContent) {
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
reasoning_content: reasoningContent
})
});
}
if (isResponseAnswerText && content) {
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
text: content
})
});
}
if (aiChatReasoning && reasoningContent) {
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
reasoning_content: removeDatasetCiteText(reasoningContent, retainDatasetCite)
})
});
}
if (isResponseAnswerText && content) {
workflowStreamResponse?.({
event: SseResponseEventEnum.fastAnswer,
data: textAdaptGptResponse({
text: removeDatasetCiteText(content, retainDatasetCite)
})
});
}
return {
@@ -452,8 +454,7 @@ async function getChatMessages({
systemPrompt,
userChatInput,
userFiles,
documentQuoteText,
parseQuote = true
documentQuoteText
}: {
model: LLMModelItemType;
maxTokens?: number;
@@ -470,14 +471,13 @@ async function getChatMessages({
userFiles: UserChatItemValueItemType['file'][];
documentQuoteText?: string; // document quote
parseQuote?: boolean;
}) {
// Dataset prompt ====>
// User role or prompt include question
const quoteRole =
aiChatQuoteRole === 'user' || datasetQuotePrompt.includes('{{question}}') ? 'user' : 'system';
const defaultQuotePrompt = getQuotePrompt(version, quoteRole, parseQuote);
const defaultQuotePrompt = getQuotePrompt(version, quoteRole);
const datasetQuotePromptTemplate = datasetQuotePrompt || defaultQuotePrompt;
@@ -539,7 +539,8 @@ async function streamResponse({
workflowStreamResponse,
aiChatReasoning,
parseThinkTag,
isResponseAnswerText
isResponseAnswerText,
retainDatasetCite = true
}: {
res: NextApiResponse;
stream: StreamChatType;
@@ -547,6 +548,7 @@ async function streamResponse({
aiChatReasoning?: boolean;
parseThinkTag?: boolean;
isResponseAnswerText?: boolean;
retainDatasetCite: boolean;
}) {
const write = responseWriteController({
res,
@@ -557,7 +559,7 @@ async function streamResponse({
let finish_reason: CompletionFinishReason = null;
let usage: CompletionUsage = getLLMDefaultUsage();
const { parsePart, getStartTagBuffer } = parseReasoningStreamContent();
const { parsePart } = parseLLMStreamResponse();
for await (const part of stream) {
usage = part.usage || usage;
@@ -568,7 +570,11 @@ async function streamResponse({
break;
}
const { reasoningContent, content, finishReason } = parsePart(part, parseThinkTag);
const { reasoningContent, content, responseContent, finishReason } = parsePart({
part,
parseThinkTag,
retainDatasetCite
});
finish_reason = finish_reason || finishReason;
answer += content;
reasoning += reasoningContent;
@@ -583,26 +589,12 @@ async function streamResponse({
});
}
if (isResponseAnswerText && content) {
if (isResponseAnswerText && responseContent) {
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.answer,
data: textAdaptGptResponse({
text: content
})
});
}
}
// if answer is empty, try to get value from startTagBuffer. (Cause: The response content is too short to exceed the minimum parse length)
if (answer === '') {
answer = getStartTagBuffer();
if (isResponseAnswerText && answer) {
workflowStreamResponse?.({
write,
event: SseResponseEventEnum.answer,
data: textAdaptGptResponse({
text: answer
text: responseContent
})
});
}
@@ -21,7 +21,7 @@ export async function dispatchDatasetConcat(
props: DatasetConcatProps
): Promise<DatasetConcatResponse> {
const {
params: { limit = 1500, ...quoteMap }
params: { limit = 6000, ...quoteMap }
} = props as DatasetConcatProps;
const quoteList = Object.values(quoteMap).filter((list) => Array.isArray(list));
@@ -55,11 +55,10 @@ export async function dispatchDatasetSearch(
runningUserInfo: { tmbId },
histories,
node,
parseQuote = true,
params: {
datasets = [],
similarity,
limit = 1500,
limit = 5000,
userChatInput = '',
authTmbId = false,
collectionFilterMatch,
@@ -114,7 +113,6 @@ export async function dispatchDatasetSearch(
if (datasetIds.length === 0) {
return emptyResult;
}
// console.log(concatQueries, rewriteQuery, aiExtensionResult);
// get vector
const vectorModel = getEmbeddingModel(
@@ -267,7 +265,7 @@ export async function dispatchDatasetSearch(
[DispatchNodeResponseKeyEnum.nodeResponse]: responseData,
nodeDispatchUsages,
[DispatchNodeResponseKeyEnum.toolResponses]: {
prompt: getDatasetSearchToolResponsePrompt(parseQuote),
prompt: getDatasetSearchToolResponsePrompt(),
quotes: searchRes.map((item) => ({
id: item.id,
sourceName: item.sourceName,
@@ -135,7 +135,7 @@ export async function dispatchWorkFlow(data: Props): Promise<DispatchFlowRespons
timezone,
externalProvider,
stream = false,
parseQuote = true,
retainDatasetCite = true,
version = 'v1',
responseDetail = true,
responseAllData = true,
@@ -607,7 +607,7 @@ export async function dispatchWorkFlow(data: Props): Promise<DispatchFlowRespons
timezone,
externalProvider,
stream,
parseQuote,
retainDatasetCite,
node,
runtimeNodes,
runtimeEdges,
@@ -1,10 +1,7 @@
import { getErrText } from '@fastgpt/global/common/error/utils';
import { ChatRoleEnum } from '@fastgpt/global/core/chat/constants';
import type { ChatItemType } from '@fastgpt/global/core/chat/type.d';
import {
WorkflowIOValueTypeEnum,
NodeOutputKeyEnum
} from '@fastgpt/global/core/workflow/constants';
import { NodeOutputKeyEnum } from '@fastgpt/global/core/workflow/constants';
import {
RuntimeEdgeItemType,
RuntimeNodeItemType,
@@ -34,31 +31,22 @@ export const getWorkflowResponseWrite = ({
return ({
write,
event,
data,
stream
data
}: {
write?: (text: string) => void;
event: SseResponseEventEnum;
data: Record<string, any>;
stream?: boolean; // Focus set stream response
}) => {
const useStreamResponse = stream ?? streamResponse;
const useStreamResponse = streamResponse;
if (!res || res.closed || !useStreamResponse) return;
// Forbid show detail
const detailEvent: Record<string, 1> = {
[SseResponseEventEnum.error]: 1,
[SseResponseEventEnum.flowNodeStatus]: 1,
[SseResponseEventEnum.flowResponses]: 1,
[SseResponseEventEnum.interactive]: 1,
[SseResponseEventEnum.toolCall]: 1,
[SseResponseEventEnum.toolParams]: 1,
[SseResponseEventEnum.toolResponse]: 1,
[SseResponseEventEnum.updateVariables]: 1,
[SseResponseEventEnum.flowNodeResponse]: 1
const notDetailEvent: Record<string, 1> = {
[SseResponseEventEnum.answer]: 1,
[SseResponseEventEnum.fastAnswer]: 1
};
if (!detail && detailEvent[event]) return;
if (!detail && !notDetailEvent[event]) return;
// Forbid show running status
const statusEvent: Record<string, 1> = {
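The last hunk flips the detail filter from a blocklist of internal events to an allowlist: with detail disabled, only answer and fastAnswer events are forwarded. A tiny sketch of the check (event names abbreviated, assumed for illustration):

// Sketch: when detail is off, everything except answer/fastAnswer is dropped.
const notDetailEventSketch: Record<string, 1> = { answer: 1, fastAnswer: 1 };

function shouldForward(event: string, detail: boolean): boolean {
  if (!detail && !notDetailEventSketch[event]) return false;
  return true;
}

// shouldForward('toolCall', false) === false
// shouldForward('answer', false) === true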