perf: text splitter (#4313)

* sync collection
* remove lock
* perf: text splitter
* update comment
2025-07-23 05:12:39 +00:00 · 2025-03-25 17:44:38 +08:00
parent 826a53dcb6
commit 37b4a1919b
8 changed files with 716 additions and 70 deletions
--- a/projects/app/src/pageComponents/account/team/OrgManage/index.tsx
+++ b/projects/app/src/pageComponents/account/team/OrgManage/index.tsx
@@ -196,7 +196,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
        isLoading={isLoadingOrgs}
      >
        <Box mb={3}>
-          <Path paths={paths} rootName={userInfo?.team?.teamName} onClick={setPath} />
+          <Path paths={paths} rootName={userInfo?.team?.teamName} />
        </Box>
        <Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}>
          <MemberScrollData flex="1">
@@ -420,7 +420,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
                  <ActionButton
                    icon="common/administrator"
                    text={t('account_team:manage_member')}
-                    onClick={() => setManageMemberOrg(currentOrg ?? rootOrg)}
+                    onClick={() => setManageMemberOrg(currentOrg)}
                  />
                  {currentOrg && currentOrg?.path !== '' && (
                    <>
--- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
+++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
@@ -94,7 +94,7 @@ async function handler(
    per: WritePermissionVal
  });

-  if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
+  if (fileAuthRes && String(fileAuthRes.tmbId) !== String(tmbId) && !fileAuthRes.isRoot) {
    return Promise.reject(CommonErrEnum.unAuthFile);
  }

--- a/projects/app/src/service/core/dataset/data/controller.ts
+++ b/projects/app/src/service/core/dataset/data/controller.ts
@@ -15,18 +15,19 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
 import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
-import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

 const formatIndexes = async ({
  indexes,
  q,
  a = '',
-  indexSize
+  indexSize,
+  maxIndexSize
 }: {
  indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
  q: string;
  a?: string;
  indexSize: number;
+  maxIndexSize: number;
 }): Promise<
  {
    type: `${DatasetDataIndexTypeEnum}`;
@@ -46,9 +47,12 @@ const formatIndexes = async ({
  }) => {
    const qChunks = splitText2Chunks({
      text: q,
-      chunkSize: indexSize
+      chunkSize: indexSize,
+      maxSize: maxIndexSize
    }).chunks;
-    const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
+    const aChunks = a
+      ? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
+      : [];

    return [
      ...qChunks.map((text) => ({
@@ -100,7 +104,11 @@ const formatIndexes = async ({
        // If oversize tokens, split it
        const tokens = await countPromptTokens(item.text);
        if (tokens > indexSize) {
-          const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
+          const splitText = splitText2Chunks({
+            text: item.text,
+            chunkSize: 512,
+            maxSize: maxIndexSize
+          }).chunks;
          return splitText.map((text) => ({
            text,
            type: item.type
@@ -151,7 +159,8 @@ export async function insertData2Dataset({
    indexes,
    q,
    a,
-    indexSize
+    indexSize,
+    maxIndexSize: embModel.maxToken
  });

  // insert to vector store
@@ -236,7 +245,13 @@ export async function updateData2Dataset({
  if (!mongoData) return Promise.reject('core.dataset.error.Data not found');

  // 2. Compute indexes
-  const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
+  const formatIndexesResult = await formatIndexes({
+    indexes,
+    q,
+    a,
+    indexSize,
+    maxIndexSize: getEmbeddingModel(model).maxToken
+  });

  // 3. Patch indexes, create, update, delete
  const patchResult: PatchIndexesProps[] = [];