perf: stream timeout;feat: hnsw max_scan_tuples config;fix: fulltext search merge error (#4838)

* perf: stream timeout

* feat: hnsw max_scan_tuples config

* fix: fulltext search merge error

* perf: jieba code
Archer
2025-05-20 09:59:24 +08:00
committed by GitHub
parent 9fef3e15fb
commit 1dac2b70ec
10 changed files with 74 additions and 58 deletions

View File

@@ -0,0 +1,21 @@
+---
+title: 'V4.9.10 (In Progress)'
+description: 'FastGPT V4.9.10 Release Notes'
+icon: 'upgrade'
+draft: false
+toc: true
+weight: 790
+---
+
+## 🚀 New Features
+
+1. Support the `systemEnv.hnswMaxScanTuples` parameter for PG, raising the total amount of data covered by iterative search.
+
+## ⚙️ Improvements
+
+1. Increased the default timeout for streaming LLM calls.
+
+## 🐛 Fixes
+
+1. Fixed incorrect score ordering when running full-text search across multiple knowledge bases.
+

View File

@@ -130,9 +130,11 @@ export type SystemEnvType = {
   vectorMaxProcess: number;
   qaMaxProcess: number;
   vlmMaxProcess: number;
-  hnswEfSearch: number;
   tokenWorkers: number; // token count max worker
+  hnswEfSearch: number;
+  hnswMaxScanTuples: number;
   oneapiUrl?: string;
   chatApiKey?: string;

View File

@@ -10,6 +10,7 @@ let jieba: Jieba | undefined;
 })();

 const stopWords = new Set([
+  '\n',
   '--',
   '?',
   '“',
@@ -1519,8 +1520,7 @@ const stopWords = new Set([
 ]);

 export async function jiebaSplit({ text }: { text: string }) {
-  text = text.replace(/[#*`_~>[\](){}|]/g, '').replace(/\S*https?\S*/gi, '');
+  text = text.replace(/[#*`_~>[\](){}|]|\S*https?\S*/g, '').trim();
   const tokens = (await jieba!.cutAsync(text, true)) as string[];

   return (
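
The jieba cleanup above merges two `replace` passes into a single alternation (markdown punctuation or any URL-bearing token) and trims the result. A standalone sketch of the behavior, illustrative only and not part of the commit:

```ts
// Same pattern as the new jiebaSplit pre-processing: strip markdown
// punctuation and whole URL tokens in one pass, then trim the ends.
const clean = (text: string) =>
  text.replace(/[#*`_~>[\](){}|]|\S*https?\S*/g, '').trim();

console.log(clean('## See **docs**: https://example.com'));
// -> 'See docs:'
```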

View File

@@ -188,6 +188,7 @@ export class PgVectorCtrl {
     const results: any = await PgClient.query(
       `BEGIN;
        SET LOCAL hnsw.ef_search = ${global.systemEnv?.hnswEfSearch || 100};
+       SET LOCAL hnsw.max_scan_tuples = ${global.systemEnv?.hnswMaxScanTuples || 100000};
        SET LOCAL hnsw.iterative_scan = relaxed_order;
        WITH relaxed_results AS MATERIALIZED (
          select id, collection_id, vector <#> '[${vector}]' AS score
@@ -199,7 +200,7 @@ export class PgVectorCtrl {
       ) SELECT id, collection_id, score FROM relaxed_results ORDER BY score;
       COMMIT;`
     );
-    const rows = results?.[3]?.rows as PgSearchRawType[];
+    const rows = results?.[results.length - 2]?.rows as PgSearchRawType[];

     if (!Array.isArray(rows)) {
       return {
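
A note on the `results?.[results.length - 2]` change: with node-postgres, a multi-statement query string resolves to an array containing one result object per statement, so the hard-coded `results?.[3]` would have pointed at the new `SET LOCAL` instead of the `SELECT` once a fourth setup statement was added. Counting back from the trailing `COMMIT` stays correct however many `SET`s precede the query. A minimal sketch, assuming `PgClient` delegates to node-postgres (`pg`):

```ts
import { Client } from 'pg';

// Sketch: multi-statement text yields one result object per statement.
const client = new Client({ connectionString: process.env.PG_URL });
await client.connect();

const results: any = await client.query(
  `BEGIN;
   SET LOCAL enable_seqscan = off; -- stand-in for the hnsw.* settings above
   SELECT 1 AS score;
   COMMIT;`
);

// results is [BEGIN, SET, SELECT, COMMIT]; the SELECT is always the
// second-to-last entry, regardless of how many SET statements were added.
const rows = results[results.length - 2].rows;
console.log(rows); // [ { score: 1 } ]

await client.end();
```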

View File

@@ -78,7 +78,7 @@ export const createChatCompletion = async ({
   }

   body.model = modelConstantsData.model;
-  const formatTimeout = timeout ? timeout : body.stream ? 60000 : 600000;
+  const formatTimeout = timeout ? timeout : 600000;
   const ai = getAIApi({
     userKey,
     timeout: formatTimeout
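
This is the "stream timeout" item from the commit title: without an explicit `timeout`, streaming calls previously defaulted to 60 s while non-streaming calls got 600 s; both now share the 600 s default, so long streamed generations are no longer cut off early. The before/after defaults, as a sketch:

```ts
// Before: a streaming request without an explicit timeout got only 60s.
const beforeTimeout = (timeout?: number, stream?: boolean) =>
  timeout ? timeout : stream ? 60000 : 600000;

// After: one 600s default regardless of streaming mode.
const afterTimeout = (timeout?: number) => (timeout ? timeout : 600000);

console.log(beforeTimeout(undefined, true)); // 60000
console.log(afterTimeout(undefined)); // 600000
```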

View File

@@ -34,9 +34,9 @@ const DatasetDataTextSchema = new Schema({
 try {
   DatasetDataTextSchema.index(
-    { teamId: 1, datasetId: 1, fullTextToken: 'text' },
+    { teamId: 1, fullTextToken: 'text' },
     {
-      name: 'teamId_1_datasetId_1_fullTextToken_text',
+      name: 'teamId_1_fullTextToken_text',
       default_language: 'none'
     }
   );
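
This index change pairs with the full-text fix below. MongoDB only uses a compound text index when every key preceding the text key is constrained by an equality match, so the old `{ teamId: 1, datasetId: 1, fullTextToken: 'text' }` index could not serve a single query that filters `datasetId` with `$in`. Dropping `datasetId` from the prefix lets one `$text` query cover all requested datasets at once.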

View File

@@ -544,16 +544,13 @@ export async function searchDatasetData(
     };
   }

-  const searchResults = (
-    await Promise.all(
-      datasetIds.map(async (id) => {
-        return MongoDatasetDataText.aggregate(
+  const searchResults = (await MongoDatasetDataText.aggregate(
     [
       {
         $match: {
           teamId: new Types.ObjectId(teamId),
-          datasetId: new Types.ObjectId(id),
           $text: { $search: await jiebaSplit({ text: query }) },
+          datasetId: { $in: datasetIds.map((id) => new Types.ObjectId(id)) },
           ...(filterCollectionIdList
             ? {
                 collectionId: {
@@ -590,10 +587,7 @@ export async function searchDatasetData(
     {
       ...readFromSecondary
     }
-  );
-      })
-    )
-  ).flat() as (DatasetDataTextSchemaType & { score: number })[];
+  )) as (DatasetDataTextSchemaType & { score: number })[];

   // Get data and collections
   const [dataList, collections] = await Promise.all([
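
This is the "fulltext search merge error" fix: the old code ran one aggregate per dataset and concatenated the per-dataset, score-ordered lists with `.flat()`, so a high-scoring hit from a later dataset could land below a low-scoring hit from an earlier one. A single aggregate with `datasetId: { $in: ... }` lets MongoDB rank all matches in one global ordering. The failure mode in miniature:

```ts
// Illustration only (not repo code): concatenating per-source ranked lists
// does not produce a globally ranked list.
const datasetA = [{ score: 0.9 }, { score: 0.2 }];
const datasetB = [{ score: 0.8 }];

const merged = [datasetA, datasetB].flat().map((d) => d.score);
console.log(merged); // [0.9, 0.2, 0.8] — 0.8 wrongly ranked below 0.2

const fixed = [...datasetA, ...datasetB].sort((a, b) => b.score - a.score);
console.log(fixed.map((d) => d.score)); // [0.9, 0.8, 0.2]
```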

View File

@@ -49,8 +49,6 @@ export const dispatchRunCode = async (props: RunCodeType): Promise<RunCodeRespon
     variables: customVariables
   });

-  console.log(runResult);
-
   if (runResult.success) {
     return {
       [NodeOutputKeyEnum.rawResponse]: runResult.data.codeReturn,

View File

@@ -10,6 +10,7 @@
   "vlmMaxProcess": 10, // Max concurrent processes for the image-understanding model
   "tokenWorkers": 30, // Number of token-counting worker threads kept alive; they hold memory persistently, so do not set this too high.
   "hnswEfSearch": 100, // Vector search parameter; only applies to PG and OB. Higher is more accurate but slower; 100 yields 99%+ accuracy.
+  "hnswMaxScanTuples": 100000, // Maximum number of tuples scanned during vector search; only applies to PG.
   "customPdfParse": {
     "url": "", // Custom PDF parsing service URL
     "key": "", // Custom PDF parsing service key

View File

@@ -171,7 +171,6 @@ const Test = ({ datasetId }: { datasetId: string }) => {
         <Flex alignItems={'center'} justifyContent={'space-between'}>
           <MySelect<'text' | 'file'>
             size={'sm'}
-            w={'150px'}
             list={[
               {
                 label: (