// FastGPT/packages/service/core/ai/llm/request.ts

import type {
  ChatCompletion,
  ChatCompletionCreateParamsNonStreaming,
  ChatCompletionCreateParamsStreaming,
  ChatCompletionMessageParam,
  ChatCompletionMessageToolCall,
  CompletionFinishReason,
  CompletionUsage,
  OpenAI,
  StreamChatType,
  UnStreamChatType
} from '@fastgpt/global/core/ai/type';
import {
  computedMaxToken,
  computedTemperature,
  parseLLMStreamResponse,
  parseReasoningContent
} from '../utils';
import { removeDatasetCiteText } from '@fastgpt/global/core/ai/llm/utils';
import { getAIApi } from '../config';
import type { OpenaiAccountType } from '@fastgpt/global/support/user/team/type';
import { customNanoid, getNanoid } from '@fastgpt/global/common/string/tools';
import { parsePromptToolCall, promptToolCallMessageRewrite } from './promptCall';
import { getLLMModel } from '../model';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import { countGptMessagesTokens } from '../../../common/string/tiktoken/index';
import { loadRequestMessages } from './utils';
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.schema';
import { i18nT } from '../../../../web/i18n/utils';
import { getErrText } from '@fastgpt/global/common/error/utils';
import json5 from 'json5';
import { getLogger, LogCategories } from '../../../common/logger';
import { saveLLMRequestRecord } from '../record/controller';

/**
 * Create the tracking ID attached to one LLM request.
 * Passes a 64-character alphabet (letters, digits, `_`, `-`) and a size of 16 to `customNanoid`.
 */
const getRequestId = () => {
  const alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_-';
  const idSize = 16;

  return customNanoid(alphabet, idSize);
};

// Module-level logger shared by every function in this file (category: MODULE.AI.LLM).
const logger = getLogger(LogCategories.MODULE.AI.LLM);

/**
 * Optional callbacks fired while a response is being processed.
 * For streamed responses they fire per chunk; for non-streamed responses the text and
 * tool-call callbacks fire once with the complete values.
 */
export type ResponseEvents = {
  /** Receives answer text to show to the user. */
  onStreaming?: (e: { text: string }) => void;
  /** Receives reasoning ("thinking") text. */
  onReasoning?: (e: { text: string }) => void;
  /** Fired when a tool call has been identified. */
  onToolCall?: (e: { call: ChatCompletionMessageToolCall }) => void;
  /**
   * Fired with each argument fragment appended to an already announced tool call.
   * Only the streamed `toolChoice` path emits this event.
   */
  onToolParam?: (e: { tool: ChatCompletionMessageToolCall; params: string }) => void;
};

/** Arguments accepted by `createLLMResponse`. */
export type CreateLLMResponseProps<T extends CompletionsBodyType = CompletionsBodyType> = {
  /** Rethrow failures instead of reporting them in the result. Defaults to `true`. */
  throwError?: boolean;
  /** Optional user-supplied account; forwarded to `getAIApi` when the client is created. */
  userKey?: OpenaiAccountType;
  /** Request body, including the custom fields declared on `LLMRequestBodyType`. */
  body: LLMRequestBodyType<T>;
  /** Polled for every stream chunk; a truthy result aborts the stream (finish reason `close`). */
  isAborted?: () => boolean | undefined | null;
  /** Extra HTTP headers merged into the request. (The spelling is part of the public API.) */
  custonHeaders?: Record<string, string>;
  /**
   * Maximum number of requests made while the model keeps stopping with
   * `finish_reason === 'length'`. Defaults to `1`, i.e. a single request without continuation.
   */
  maxContinuations?: number;
} & ResponseEvents;

/** Aggregated result returned by `createLLMResponse`. */
type LLMResponse = {
  requestId: string; // Tracking ID of this LLM request
  /** Error reported by the request or the stream, if any. */
  error?: any;
  /** Whether the first provider response was a stream. */
  isStreamResponse: boolean;
  /** Answer text accumulated over all requests. */
  answerText: string;
  /** Reasoning text accumulated over all requests. */
  reasoningText: string;
  /** Tool calls in toolChoice format, regardless of the tool call mode used. */
  toolCalls?: ChatCompletionMessageToolCall[];
  finish_reason: CompletionFinishReason;
  /** Set when the model returned no answer, reasoning, tool call or error. */
  responseEmptyTip?: string;
  /** Token usage; both counters are reported as 0 when the response carries an error. */
  usage: {
    inputTokens: number;
    outputTokens: number;
  };

  /** Messages of the request as reported back to the caller. */
  requestMessages: ChatCompletionMessageParam[];
  /** Assistant message built from the accumulated answer, reasoning and tool calls. */
  assistantMessage?: ChatCompletionMessageParam;
  /** `requestMessages` followed by `assistantMessage` (when one exists). */
  completeMessages: ChatCompletionMessageParam[];
};

/**
 * Low-level wrapper around an LLM chat-completion call.
 *
 * It hides two differences from the caller:
 * - stream vs. non-stream responses (both are reduced to the same result shape), and
 * - native `toolChoice` vs. `prompt` tool calling. Tool calls are always reported in the
 *   toolChoice format; in prompt mode the request messages are rewritten by
 *   `promptToolCallMessageRewrite` before they are sent.
 *
 * While a response ends with `finish_reason === 'length'` and without an error, the partial
 * output is appended to the conversation and the model is asked to continue, up to
 * `maxContinuations` requests in total (at least one request is always sent). Text, reasoning,
 * tool calls and usage are accumulated across those requests.
 *
 * Exactly one request record is saved per call through `saveLLMRequestRecord`.
 *
 * @param args Request body, optional user key / headers, abort probe and streaming callbacks.
 * @returns The aggregated response. When `throwError` is false, failures are reported through
 *   `error` and `finish_reason: 'error'` instead of being thrown.
 * @throws The underlying error when `throwError` is true (the default).
 */
export const createLLMResponse = async <T extends CompletionsBodyType>(
  args: CreateLLMResponseProps<T>
): Promise<LLMResponse> => {
  // Unique tracking ID of this request
  const requestId = getRequestId();

  const { throwError = true, body, custonHeaders, userKey, maxContinuations = 1 } = args;
  const { messages, useVision, requestOrigin, tools, toolCallMode } = body;

  // The counter bounds the total number of requests, so anything below 1 (or NaN) would skip
  // the request entirely and produce a bogus empty response. Always send at least one.
  const maxRequestCount = maxContinuations >= 1 ? maxContinuations : 1;

  // Messages process
  const requestMessages = await loadRequestMessages({
    messages,
    useVision,
    origin: requestOrigin
  });
  // Prompt tool mode: describe the tools inside the messages instead of the `tools` field
  const rewriteMessages =
    tools?.length && toolCallMode === 'prompt'
      ? promptToolCallMessageRewrite(requestMessages, tools)
      : requestMessages;

  const { requestBody, modelData } = await llmCompletionsBodyFormat({
    ...body,
    messages: rewriteMessages
  });

  // Results accumulated over the initial request and any continuation requests
  let accumulatedAnswerText = '';
  let accumulatedReasoningText = '';
  let accumulatedToolCalls: ChatCompletionMessageToolCall[] | undefined;
  let currentFinishReason: CompletionFinishReason = 'stop';
  const accumulatedUsage = {
    prompt_tokens: 0,
    completion_tokens: 0,
    total_tokens: 0
  };
  let currentError: any = undefined;
  let currentMessages = [...requestBody.messages];
  let continuationCount = 0;
  let isStreamResponse = false;
  // Guards against saving a second record when an error thrown below reaches the catch block
  let requestRecordSaved = false;

  try {
    while (continuationCount < maxRequestCount) {
      const { response, isStreamResponse: currentIsStreamResponse } = await createChatCompletion({
        body: {
          ...requestBody,
          messages: currentMessages
        },
        modelData,
        userKey,
        options: {
          headers: {
            Accept: 'application/json, text/plain, */*',
            ...custonHeaders
          }
        }
      });

      // Report the response kind of the first request only
      if (continuationCount === 0) {
        isStreamResponse = currentIsStreamResponse;
      }

      const {
        answerText,
        reasoningText,
        toolCalls: rawToolCalls,
        finish_reason,
        usage,
        error
      } = await (currentIsStreamResponse
        ? createStreamResponse({
            response,
            body,
            isAborted: args.isAborted,
            onStreaming: args.onStreaming,
            onReasoning: args.onReasoning,
            onToolCall: args.onToolCall,
            onToolParam: args.onToolParam
          })
        : createCompleteResponse({
            response,
            body,
            onStreaming: args.onStreaming,
            onReasoning: args.onReasoning,
            onToolCall: args.onToolCall
          }));

      // Format toolCalls
      // 1. Auto complete arguments, avoid model not support "" arguments
      const toolCalls = rawToolCalls?.map((tool) => ({
        ...tool,
        function: {
          ...tool.function,
          arguments: tool.function.arguments || '{}'
        }
      }));

      // Accumulate results
      accumulatedAnswerText += answerText;
      accumulatedReasoningText += reasoningText;
      if (toolCalls?.length) {
        accumulatedToolCalls = [...(accumulatedToolCalls || []), ...toolCalls];
      }
      currentFinishReason = finish_reason;
      currentError = error;

      // Accumulate usage
      if (usage) {
        accumulatedUsage.prompt_tokens += usage.prompt_tokens || 0;
        accumulatedUsage.completion_tokens += usage.completion_tokens || 0;
        accumulatedUsage.total_tokens += usage.total_tokens || 0;
      }

      // Anything other than a clean "length" stop ends the loop
      // TODO: the output exceeds the model's output limit
      if (finish_reason !== 'length' || error) {
        break;
      }

      // Rebuild the conversation from the original messages, then append everything produced
      // so far plus a user message asking the model to continue.
      currentMessages = [
        ...currentMessages.slice(0, requestBody.messages.length),
        ...(accumulatedToolCalls
          ? [
              {
                role: ChatCompletionRequestMessageRoleEnum.Assistant as 'assistant',
                tool_calls: accumulatedToolCalls
              }
            ]
          : []),
        {
          role: ChatCompletionRequestMessageRoleEnum.Assistant as 'assistant',
          ...(accumulatedAnswerText && { content: accumulatedAnswerText }),
          ...(accumulatedReasoningText && { reasoning_content: accumulatedReasoningText })
        },
        {
          role: ChatCompletionRequestMessageRoleEnum.User as 'user',
          content: '[继续输出]'
        }
      ];

      logger.debug(`Continue LLM response due to length limit`, {
        continuationCount,
        completionTokens: usage?.completion_tokens
      });
      continuationCount++;
    }

    // Use accumulated results
    const answerText = accumulatedAnswerText;
    const reasoningText = accumulatedReasoningText;
    const toolCalls = accumulatedToolCalls;
    const usage = accumulatedUsage;
    const error = currentError;
    let finish_reason: CompletionFinishReason = currentFinishReason;

    const assistantMessage: ChatCompletionMessageParam = {
      role: ChatCompletionRequestMessageRoleEnum.Assistant as 'assistant',
      ...(answerText && { content: answerText }),
      ...(reasoningText && { reasoning_content: reasoningText }),
      ...(toolCalls?.length && { tool_calls: toolCalls })
    };

    // Usage count: fall back to local token counting when the provider reported nothing
    const inputTokens =
      usage?.prompt_tokens ||
      (await countGptMessagesTokens(requestBody.messages, requestBody.tools));
    const outputTokens =
      usage?.completion_tokens || (await countGptMessagesTokens([assistantMessage]));

    // Save the LLM request tracking record (not awaited)
    saveLLMRequestRecord({
      requestId,
      body: requestBody,
      response: {
        ...(answerText && { answerText }),
        ...(reasoningText && { reasoningText }),
        ...(toolCalls?.length && { toolCalls }),
        finish_reason,
        usage: {
          inputTokens,
          outputTokens
        },
        error
      }
    });
    requestRecordSaved = true;

    if (error) {
      finish_reason = 'error';

      if (throwError) {
        // Handled by the catch block below, which must not save the record again
        throw error;
      }
    }

    const getEmptyResponseTip = () => {
      if (userKey?.baseUrl) {
        logger.warn(`User LLM response empty`, {
          baseUrl: userKey?.baseUrl,
          requestBody,
          finish_reason
        });
        return `您的 OpenAI key 没有响应: ${JSON.stringify(body)}`;
      } else {
        logger.error(`LLM response empty`, {
          message: '',
          data: requestBody,
          finish_reason
        });
      }
      return i18nT('chat:LLM_model_response_empty');
    };
    // "Empty" means: nothing was produced, nothing failed, and the model stopped normally
    const isNotResponse =
      !answerText &&
      !reasoningText &&
      !toolCalls?.length &&
      !error &&
      (finish_reason === 'stop' || !finish_reason);
    const responseEmptyTip = isNotResponse ? getEmptyResponseTip() : undefined;

    return {
      error,
      isStreamResponse,
      responseEmptyTip,
      answerText,
      reasoningText,
      toolCalls,
      finish_reason,
      usage: {
        inputTokens: error ? 0 : inputTokens,
        outputTokens: error ? 0 : outputTokens
      },
      requestId, // Return the request tracking ID

      requestMessages,
      assistantMessage,
      completeMessages: [...requestMessages, assistantMessage]
    };
  } catch (error) {
    // Save the LLM request tracking record (not awaited), unless the full record already exists
    if (!requestRecordSaved) {
      saveLLMRequestRecord({
        requestId,
        body: requestBody,
        response: {
          error: getErrText(error)
        }
      });
    }

    if (throwError) {
      throw error;
    }

    // NOTE(review): this path reports `requestBody.messages` (after the prompt-tool rewrite),
    // while the success path reports the messages from `loadRequestMessages` — confirm that the
    // difference is intended.
    return {
      error,
      requestId, // Return the request tracking ID
      isStreamResponse: false,
      answerText: '',
      reasoningText: '',
      finish_reason: 'error',
      usage: {
        inputTokens: 0,
        outputTokens: 0
      },
      requestMessages: requestBody.messages,
      completeMessages: [...requestBody.messages]
    };
  }
};

/** Inputs shared by the stream and non-stream response parsers: the request body plus callbacks. */
type CompleteParams = Pick<CreateLLMResponseProps<CompletionsBodyType>, 'body'> & ResponseEvents;

/**
 * Result of parsing a single provider response (streamed or not).
 * Unlike `LLMResponse`, `usage` is the provider's raw usage object and may be missing.
 */
type CompleteResponse = Pick<
  LLMResponse,
  'answerText' | 'reasoningText' | 'toolCalls' | 'finish_reason'
> & {
  usage?: CompletionUsage;
  error?: any;
};

/**
 * Consume a streamed chat completion and reduce it to a `CompleteResponse`.
 *
 * Every chunk is fed to the parser returned by `parseLLMStreamResponse`; the aggregated
 * content, reasoning, finish reason, usage and error are read back through `getResponseData`
 * once the stream ends. Three paths exist:
 * - tools with `toolChoice` mode: tool calls are assembled from the streamed deltas;
 * - tools with `prompt` mode: tool calls are parsed from the text by `parsePromptToolCall`;
 * - no tools: text and reasoning are simply forwarded.
 *
 * On every path `isAborted` is polled per chunk; a truthy result aborts the stream controller,
 * sets the finish reason to `close` and stops reading. Errors thrown while iterating are not
 * rethrown: they are recorded through `updateError` and returned in the `error` field.
 *
 * @param body Request body; supplies `tools`, `toolCallMode`, `retainDatasetCite` and `model`.
 * @param response The stream to consume.
 * @param isAborted Optional abort probe.
 * @param onStreaming Receives answer text chunks.
 * @param onReasoning Receives reasoning text chunks.
 * @param onToolCall Receives each identified tool call.
 * @param onToolParam Receives argument fragments appended to a tool call (`toolChoice` path only).
 */
export const createStreamResponse = async ({
  body,
  response,
  isAborted,
  onStreaming,
  onReasoning,
  onToolCall,
  onToolParam
}: CompleteParams & {
  response: StreamChatType;
  isAborted?: CreateLLMResponseProps['isAborted'];
}): Promise<CompleteResponse> => {
  const { retainDatasetCite = true, tools, toolCallMode = 'toolChoice', model } = body;
  const modelData = getLLMModel(model);

  const { parsePart, getResponseData, updateFinishReason, updateError } = parseLLMStreamResponse();

  if (tools?.length) {
    if (toolCallMode === 'toolChoice') {
      // Tool whose name has been received but does not (yet) match a configured tool
      let callingTool: ChatCompletionMessageToolCall['function'] | null = null;
      // Indexed by the tool call index from the delta, so the array may contain holes
      const toolCalls: ChatCompletionMessageToolCall[] = [];

      try {
        for await (const part of response) {
          if (isAborted?.()) {
            response.controller?.abort();
            updateFinishReason('close');
            break;
          }

          const { reasoningContent, responseContent } = parsePart({
            part,
            parseThinkTag: modelData.reasoning,
            retainDatasetCite
          });

          if (reasoningContent) {
            onReasoning?.({ text: reasoningContent });
          }
          if (responseContent) {
            onStreaming?.({ text: responseContent });
          }

          const responseChoice = part.choices?.[0]?.delta;

          // Parse tool calls
          if (responseChoice?.tool_calls?.length) {
            responseChoice.tool_calls.forEach((toolCall, i) => {
              // Fall back to the position in this delta when the provider omits the index
              const index = toolCall.index ?? i;

              // A delta starts (or continues) a tool header when it carries a function name or
              // when a pending tool is still waiting to be matched
              const hasNewTool = toolCall?.function?.name || callingTool;
              if (hasNewTool) {
                // Call new tool
                if (toolCall?.function?.name) {
                  callingTool = {
                    name: toolCall.function?.name || '',
                    arguments: toolCall.function?.arguments || ''
                  };
                } else if (callingTool) {
                  // Continue call(Perhaps the name of the previous function was incomplete)
                  // NOTE(review): this branch is only reached when the delta has no function
                  // name, so the name concatenation below always appends '' — confirm intent.
                  callingTool.name += toolCall.function?.name || '';
                  callingTool.arguments += toolCall.function?.arguments || '';
                }

                // New tool, add to list.
                // Only names that match a configured tool are accepted and announced
                if (tools.find((item) => item.function.name === callingTool!.name)) {
                  const call: ChatCompletionMessageToolCall = {
                    id: toolCall.id || getNanoid(6),
                    type: 'function',
                    function: callingTool!
                  };
                  toolCalls[index] = call;
                  onToolCall?.({ call });
                  callingTool = null;
                }
              } else {
                /* Append arg to the arguments of the current tool */
                const arg: string = toolCall?.function?.arguments ?? '';
                const currentTool = toolCalls[index];
                if (currentTool && arg) {
                  currentTool.function.arguments += arg;

                  onToolParam?.({ tool: currentTool, params: arg });
                }
              }
            });
          }
        }
      } catch (error: any) {
        // Keep what was received so far; the error is reported in the result
        updateError(error?.error || error);
      }

      const { reasoningContent, content, finish_reason, usage, error } = getResponseData();

      return {
        error,
        answerText: content,
        reasoningText: reasoningContent,
        finish_reason,
        usage,
        // Drop the holes left by indexes that never produced a tool call
        toolCalls: toolCalls.filter((call) => !!call)
      };
    } else {
      // Prompt mode: the first characters decide whether the text is streamed to the user.
      // NOTE(review): presumably the prompt protocol prefixes plain answers with "0:" and tool
      // calls with "1:" — verify against promptCall.
      let startResponseWrite = false;
      let answer = '';

      try {
        for await (const part of response) {
          if (isAborted?.()) {
            response.controller?.abort();
            updateFinishReason('close');
            break;
          }

          const { reasoningContent, content, responseContent } = parsePart({
            part,
            parseThinkTag: modelData.reasoning,
            retainDatasetCite
          });
          answer += content;

          if (reasoningContent) {
            onReasoning?.({ text: reasoningContent });
          }

          if (content) {
            if (startResponseWrite) {
              // Streaming already started: forward chunks as they arrive
              if (responseContent) {
                onStreaming?.({ text: responseContent });
              }
            } else if (answer.length >= 3) {
              // Enough text is buffered to inspect the prefix
              answer = answer.trimStart();

              // Not call tool
              if (/0(:|：)/.test(answer)) {
                startResponseWrite = true;

                // find first : index
                const firstIndex =
                  answer.indexOf('0:') !== -1 ? answer.indexOf('0:') : answer.indexOf('0：');
                // Strip everything up to and including the marker, then emit the rest
                answer = answer.substring(firstIndex + 2).trim();

                onStreaming?.({ text: answer });
              }
              // Not response tool
              // A "1:" marker was found: nothing is streamed while the text is buffered
              else if (/1(:|：)/.test(answer)) {
              }
              // Not start 1/0, start response
              else {
                startResponseWrite = true;
                onStreaming?.({ text: answer });
              }
            }
          }
        }
      } catch (error: any) {
        // Keep what was received so far; the error is reported in the result
        updateError(error?.error || error);
      }

      const { reasoningContent, content, finish_reason, usage, error } = getResponseData();
      // Split the complete text into the answer and the prompt-style tool calls
      const { answer: llmAnswer, streamAnswer, toolCalls } = parsePromptToolCall(content);

      if (streamAnswer) {
        onStreaming?.({ text: streamAnswer });
      }

      toolCalls?.forEach((call) => {
        onToolCall?.({ call });
      });

      return {
        error,
        answerText: llmAnswer,
        reasoningText: reasoningContent,
        finish_reason,
        usage,
        toolCalls
      };
    }
  } else {
    // Not use tool
    try {
      for await (const part of response) {
        if (isAborted?.()) {
          response.controller?.abort();
          updateFinishReason('close');
          break;
        }

        const { reasoningContent, responseContent } = parsePart({
          part,
          parseThinkTag: modelData.reasoning,
          retainDatasetCite
        });

        if (reasoningContent) {
          onReasoning?.({ text: reasoningContent });
        }
        if (responseContent) {
          onStreaming?.({ text: responseContent });
        }
      }
    } catch (error: any) {
      // Keep what was received so far; the error is reported in the result
      updateError(error?.error || error);
    }

    const { reasoningContent, content, finish_reason, usage, error } = getResponseData();

    return {
      error,
      answerText: content,
      reasoningText: reasoningContent,
      finish_reason,
      usage
    };
  }
};

/**
 * Reduce a non-streamed chat completion to a `CompleteResponse`.
 *
 * Reads the first choice only: separates reasoning from the answer, applies
 * `removeDatasetCiteText` to both texts, resolves tool calls for the active tool call mode and
 * fires each callback once with the complete value.
 *
 * @param body Request body; supplies `tools`, `toolCallMode`, `retainDatasetCite` and `model`.
 * @param response The completed (non-streamed) provider response.
 * @param onStreaming Called once with the final answer text, when it is not empty.
 * @param onReasoning Called once with the final reasoning text, when it is not empty.
 * @param onToolCall Called once per resolved tool call.
 */
export const createCompleteResponse = async ({
  body,
  response,
  onStreaming,
  onReasoning,
  onToolCall
}: CompleteParams & { response: ChatCompletion }): Promise<CompleteResponse> => {
  const { tools, toolCallMode = 'toolChoice', retainDatasetCite = true } = body;
  const modelData = getLLMModel(body.model);

  const firstChoice = response.choices?.[0];
  const message = firstChoice?.message;

  const finish_reason = firstChoice?.finish_reason as CompletionFinishReason;
  const usage = response.usage;

  // Content and think parse
  let rawAnswer: string = message?.content || '';
  let rawReasoning: string = (message as any)?.reasoning_content || '';

  // Split think tags out of the content only when the API did not return reasoning itself and
  // the model is flagged as a reasoning model
  if (!rawReasoning && modelData.reasoning) {
    const [think, answer] = parseReasoningContent(rawAnswer);
    rawReasoning = think;
    rawAnswer = answer;
  }

  const formatReasonContent = removeDatasetCiteText(rawReasoning, retainDatasetCite);
  let formatContent = removeDatasetCiteText(rawAnswer, retainDatasetCite);

  // Tool parse
  let toolCalls: CompleteResponse['toolCalls'];
  if (tools?.length) {
    if (toolCallMode === 'toolChoice') {
      toolCalls = message?.tool_calls || [];
    } else {
      // Prompt call: the tool calls are embedded in the text, the remainder is the answer
      const parsed = parsePromptToolCall(formatContent);
      formatContent = parsed.answer;
      toolCalls = parsed.toolCalls;
    }
  }

  // Event response
  if (formatReasonContent) {
    onReasoning?.({ text: formatReasonContent });
  }
  if (formatContent) {
    onStreaming?.({ text: formatContent });
  }
  if (onToolCall && toolCalls?.length) {
    toolCalls.forEach((call) => {
      onToolCall({ call });
    });
  }

  return {
    error: response.error,
    reasoningText: formatReasonContent,
    answerText: formatContent,
    toolCalls,
    finish_reason,
    usage
  };
};

/** Either flavour of chat-completion request parameters (streaming or non-streaming). */
type CompletionsBodyType =
  | ChatCompletionCreateParamsNonStreaming
  | ChatCompletionCreateParamsStreaming;
/**
 * Pick the request parameter type matching the `stream` flag of `T`:
 * `stream: true` gives the streaming params, `stream: false` the non-streaming params,
 * and anything else the union of both.
 */
type InferCompletionsBody<T> = T extends { stream: true }
  ? ChatCompletionCreateParamsStreaming
  : T extends { stream: false }
    ? ChatCompletionCreateParamsNonStreaming
    : ChatCompletionCreateParamsNonStreaming | ChatCompletionCreateParamsStreaming;

/**
 * Caller-facing request body: the provider parameters `T` with a few fields loosened, plus
 * custom fields that are consumed in this module and never sent to the provider.
 */
type LLMRequestBodyType<T> = Omit<T, 'model' | 'stop' | 'response_format' | 'messages'> & {
  /** Model name or model config; resolved through `getLLMModel`. */
  model: string | LLMModelItemType;
  /** Stop sequences as one string separated by `|`. */
  stop?: string;
  response_format?: {
    type?: string;
    /** JSON schema as a string; parsed with json5 when `type` is `json_schema`. */
    json_schema?: string;
  };
  messages: ChatCompletionMessageParam[];

  // Custom field
  /** Forwarded to the response parsers as their cite-retention flag. Defaults to `true` there. */
  retainDatasetCite?: boolean;
  /** How tools are exposed to the model. The response parsers default to `toolChoice`. */
  toolCallMode?: 'toolChoice' | 'prompt';
  /** Forwarded to `loadRequestMessages`. */
  useVision?: boolean;
  /** Forwarded to `loadRequestMessages` as `origin`. */
  requestOrigin?: string;
};
/**
 * Convert the caller-facing request body into the payload sent to the provider.
 *
 * Steps, in order:
 * 1. strip the custom fields and the tool fields from the body;
 * 2. normalise `response_format`, `stop`, `max_tokens` and `temperature` for the model;
 * 3. re-attach the tool fields only in `toolChoice` mode with at least one tool;
 * 4. drop `null` / `undefined` entries;
 * 5. rename fields according to the model's `fieldMap`;
 * 6. apply the model's `defaultConfig`, which overrides everything above.
 *
 * When the model cannot be resolved, the stripped body is returned untouched.
 *
 * @returns The provider payload together with the resolved model config.
 * @throws Error('Json schema error') when `response_format.json_schema` cannot be parsed.
 */
const llmCompletionsBodyFormat = async <T extends CompletionsBodyType>({
  // Custom fields: destructured only to keep them out of `body`
  retainDatasetCite,
  useVision,
  requestOrigin,

  tools,
  tool_choice,
  parallel_tool_calls,
  toolCallMode,
  ...body
}: LLMRequestBodyType<T>): Promise<{
  requestBody: InferCompletionsBody<T>;
  modelData: LLMModelItemType;
}> => {
  const modelData = getLLMModel(body.model);
  if (!modelData) {
    return {
      requestBody: body as unknown as InferCompletionsBody<T>,
      modelData
    };
  }

  const response_format = (() => {
    const formatType = body.response_format?.type;
    if (!formatType) return undefined;

    if (formatType === 'json_schema') {
      try {
        return {
          type: 'json_schema',
          json_schema: json5.parse(body.response_format?.json_schema as unknown as string)
        };
      } catch (error) {
        throw new Error('Json schema error');
      }
    }

    // Any other type is forwarded without extra payload
    return {
      type: formatType
    };
  })();

  const maxTokens = computedMaxToken({
    model: modelData,
    maxToken: body.max_tokens || undefined
  });

  // "a|b|c" -> ['a', 'b', 'c'], ignoring blank segments
  const formatStop = body.stop?.split('|').filter((item) => !!item.trim());

  let requestBody = {
    ...body,
    max_tokens: maxTokens,
    model: modelData.model,
    temperature:
      typeof body.temperature === 'number'
        ? computedTemperature({
            model: modelData,
            temperature: body.temperature
          })
        : undefined,
    response_format,
    stop: formatStop?.length ? formatStop : undefined,
    ...(toolCallMode === 'toolChoice' &&
      tools?.length && {
        tools,
        tool_choice,
        parallel_tool_calls
      })
  } as T;

  // Filter undefined/null value
  requestBody = Object.fromEntries(
    Object.entries(requestBody).filter(([_, value]) => value !== null && value !== undefined)
  ) as T;

  // field map
  if (modelData.fieldMap) {
    Object.entries(modelData.fieldMap).forEach(([sourceKey, targetKey]) => {
      // Nothing to rename when the field was absent or filtered out above; skipping also
      // avoids writing an `undefined` entry back into the payload.
      if (!(sourceKey in requestBody)) return;

      // Move the processed value (clamped max_tokens, scaled temperature, split stop, ...)
      // rather than the raw value from `body`, so the normalisation above is not lost.
      // @ts-ignore keys come from the model config and are not known statically
      requestBody[targetKey] = requestBody[sourceKey];
      // @ts-ignore
      delete requestBody[sourceKey];
    });
  }

  // Model-level defaults take precedence over everything else
  requestBody = {
    ...requestBody,
    ...modelData?.defaultConfig
  };

  return {
    requestBody: requestBody as unknown as InferCompletionsBody<T>,
    modelData
  };
};
/**
 * Send one chat-completion request and report whether the result is a stream.
 *
 * The model's `requestUrl` / `requestAuth`, when configured, override the request path and the
 * `Authorization` header. A response object exposing `iterator` or `controller` is treated as
 * a stream.
 *
 * @param modelData Resolved model config; the call rejects when it is missing.
 * @param body Provider payload; its `model` field is overwritten with `modelData.model`.
 * @param userKey Optional user-supplied account, forwarded to `getAIApi`.
 * @param timeout Request timeout passed to `getAIApi`; 600000 is used when falsy.
 * @param options Extra request options; its headers are merged into the request.
 * @returns The raw response plus the `isStreamResponse` discriminator.
 * @throws Rejects with a string when the model is missing or when a user-key request fails,
 *   otherwise with the original error.
 */
const createChatCompletion = async ({
  modelData,
  body,
  userKey,
  timeout,
  options
}: {
  modelData: LLMModelItemType;
  body: ChatCompletionCreateParamsNonStreaming | ChatCompletionCreateParamsStreaming;
  userKey?: OpenaiAccountType;
  timeout?: number;
  options?: OpenAI.RequestOptions;
}): Promise<
  | {
      response: StreamChatType;
      isStreamResponse: true;
    }
  | {
      response: UnStreamChatType;
      isStreamResponse: false;
    }
> => {
  // Missing model: reject right away, without going through the logging in the catch block
  if (!modelData) {
    throw `${body.model} not found`;
  }

  try {
    body.model = modelData.model;

    const ai = getAIApi({
      userKey,
      timeout: timeout || 600000
    });

    logger.debug('Start create chat completion', { model: body.model });

    // Model-specific overrides for the endpoint and the credentials
    const pathOverride = modelData.requestUrl ? { path: modelData.requestUrl } : {};
    const authHeader = modelData.requestAuth
      ? { Authorization: `Bearer ${modelData.requestAuth}` }
      : {};

    const response = await ai.chat.completions.create(body, {
      ...options,
      ...pathOverride,
      headers: {
        ...options?.headers,
        ...authHeader
      }
    });

    const looksLikeStream =
      typeof response === 'object' &&
      response !== null &&
      ('iterator' in response || 'controller' in response);

    if (looksLikeStream) {
      return {
        response,
        isStreamResponse: true
      };
    }

    return {
      response,
      isStreamResponse: false
    };
  } catch (error) {
    // Platform key: log as an error and pass the original error through
    if (!userKey?.baseUrl) {
      logger.error('LLM response error', { request: body, error });
      throw error;
    }

    // User key: log as a warning and reject with a message aimed at the key owner
    logger.warn('User AI API error', {
      baseUrl: userKey?.baseUrl,
      request: body,
      error
    });
    throw `您的 OpenAI key 出错了: ${getErrText(error)}`;
  }
};