perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split

* update max size computed

* perf: i18n

* remove table
Archer
2025-05-26 18:57:22 +08:00
committed by GitHub
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions
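The change threads two new trigger settings (chunkTriggerType, chunkTriggerMinSize) and two paragraph-split settings (paragraphChunkDeep, paragraphChunkMinSize) from the collection params down into rawText2Chunks. A minimal sketch of the trigger decision, using only the enum members and defaults visible in the diffs below (the helper name shouldChunk is hypothetical, not FastGPT code):

enum ChunkTriggerConfigTypeEnum {
  minSize = 'minSize',
  maxSize = 'maxSize',
  forceChunk = 'forceChunk'
}

// Mirrors the two guards added to rawText2Chunks below: returns false
// when the whole text should be stored as a single chunk.
function shouldChunk(
  rawText: string,
  trigger: ChunkTriggerConfigTypeEnum,
  chunkTriggerMinSize = 1000, // default from the diff
  maxSize?: number // getLLMMaxChunkSize(model) at the call site
): boolean {
  const len = rawText.trim().length;
  // maxSize trigger: only chunk past ~70% of the model's max chunk size
  // (16000 chars when no model limit is available).
  if (trigger === ChunkTriggerConfigTypeEnum.maxSize) {
    if (len < (maxSize ? maxSize * 0.7 : 16000)) return false;
  }
  // Every trigger except forceChunk also respects the manual minimum.
  if (trigger !== ChunkTriggerConfigTypeEnum.forceChunk && len < chunkTriggerMinSize) {
    return false;
  }
  return true;
}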

View File

@@ -34,6 +34,7 @@ import { getTrainingModeByCollection } from './utils';
import {
computeChunkSize,
computeChunkSplitter,
computeParagraphChunkDeep,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
@@ -74,6 +75,8 @@ export const createCollectionAndInsertData = async ({
llmModel: getLLMModel(dataset.agentModel)
});
const chunkSplitter = computeChunkSplitter(createCollectionParams);
const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
delete createCollectionParams.chunkTriggerType;
delete createCollectionParams.chunkTriggerMinSize;
@@ -87,7 +90,11 @@ export const createCollectionAndInsertData = async ({
// 1. split chunks
const chunks = rawText2Chunks({
rawText,
chunkTriggerType: createCollectionParams.chunkTriggerType,
chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
chunkSize,
paragraphChunkDeep,
paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
@@ -112,6 +119,7 @@ export const createCollectionAndInsertData = async ({
const { _id: collectionId } = await createOneCollection({
...createCollectionParams,
trainingType,
paragraphChunkDeep,
chunkSize,
chunkSplitter,
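computeParagraphChunkDeep is imported from @fastgpt/global/core/dataset/training/utils, but its body is not part of this diff. A plausible reconstruction, assuming it follows computeChunkSize and computeChunkSplitter in deriving a value from the collection's chunk settings (the enum names and the default depth of 5 are assumptions, not confirmed by the diff):

// Hypothetical sketch -- the real helper may differ.
enum ChunkSettingModeEnum { auto = 'auto', custom = 'custom' }
enum DataChunkSplitModeEnum { paragraph = 'paragraph', size = 'size', char = 'char' }

const computeParagraphChunkDeep = (params: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  paragraphChunkDeep?: number;
}): number => {
  // Auto mode: use a default paragraph depth.
  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) return 5;
  // Custom paragraph splitting: honor the user-supplied depth.
  if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
    return params.paragraphChunkDeep ?? 5;
  }
  // Size/character split modes do not recurse into paragraphs.
  return 0;
};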
@@ -212,46 +220,19 @@ export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
tmbId: string;
session?: ClientSession;
};
-export async function createOneCollection({
-  teamId,
-  tmbId,
-  name,
-  parentId,
-  datasetId,
-  type,
+export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
+  const {
+    teamId,
+    parentId,
+    datasetId,
+    tags,
-  createTime,
-  updateTime,
-  hashRawText,
-  rawTextLength,
-  metadata = {},
-  tags,
-  nextSyncTime,
-  fileId,
-  rawLink,
-  externalFileId,
-  externalFileUrl,
-  apiFileId,
-  // Parse settings
-  customPdfParse,
-  imageIndex,
-  autoIndexes,
-  // Chunk settings
-  trainingType,
-  chunkSettingMode,
-  chunkSplitMode,
-  chunkSize,
-  indexSize,
-  chunkSplitter,
-  qaPrompt,
-  session
-}: CreateOneCollectionParams) {
+    fileId,
+    rawLink,
+    externalFileId,
+    externalFileUrl,
+    apiFileId
+  } = props;
// Create collection tags
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -259,41 +240,18 @@ export async function createOneCollection({
const [collection] = await MongoDatasetCollection.create(
[
{
+      ...props,
       teamId,
       tmbId,
       parentId: parentId || null,
       datasetId,
-      name,
-      type,
-      rawTextLength,
-      hashRawText,
       tags: collectionTags,
-      metadata,
-      createTime,
-      updateTime,
-      nextSyncTime,
       ...(fileId ? { fileId } : {}),
       ...(rawLink ? { rawLink } : {}),
       ...(externalFileId ? { externalFileId } : {}),
       ...(externalFileUrl ? { externalFileUrl } : {}),
-      ...(apiFileId ? { apiFileId } : {}),
-      // Parse settings
-      customPdfParse,
-      imageIndex,
-      autoIndexes,
-      // Chunk settings
-      trainingType,
-      chunkSettingMode,
-      chunkSplitMode,
-      chunkSize,
-      indexSize,
-      chunkSplitter,
-      qaPrompt
+      ...(apiFileId ? { apiFileId } : {})
}
],
{ session, ordered: true }
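The refactor above replaces a thirty-odd-field destructuring with a rest spread: only the fields that need extra handling (tags, the file-source ids, session) are pulled out of props, and everything else passes straight through to MongoDatasetCollection.create. A generic illustration of the pattern (names here are hypothetical, not FastGPT code):

type Params = { teamId: string; tags?: string[]; chunkSize?: number };

function create({ tags, ...props }: Params) {
  // New optional fields added to Params (e.g. paragraphChunkDeep) flow
  // through the spread without this function having to name them.
  return { ...props, tags: tags ?? [] };
}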

View File

@@ -1,5 +1,8 @@
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
-import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import {
+  ChunkTriggerConfigTypeEnum,
+  DatasetSourceReadTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
@@ -179,11 +182,17 @@ export const readApiServerFileContent = async ({
export const rawText2Chunks = ({
rawText,
chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
chunkTriggerMinSize = 1000,
backupParse,
chunkSize = 512,
...splitProps
}: {
rawText: string;
chunkTriggerType?: ChunkTriggerConfigTypeEnum;
chunkTriggerMinSize?: number; // maxSize from agent model, not store
backupParse?: boolean;
tableParse?: boolean;
} & TextSplitProps): {
@@ -209,6 +218,28 @@ export const rawText2Chunks = ({
};
};
// Chunk condition
// 1. Max-size trigger: split only when the text exceeds the max size (default: 0.7 * the model's max chunk size)
if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
const textLength = rawText.trim().length;
const maxSize = splitProps.maxSize ? splitProps.maxSize * 0.7 : 16000;
if (textLength < maxSize) {
return [
{
q: rawText,
a: ''
}
];
}
}
// 2. Min-size trigger: split only when the text exceeds the manually configured minimum size
if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) {
const textLength = rawText.trim().length;
if (textLength < chunkTriggerMinSize) {
return [{ q: rawText, a: '' }];
}
}
if (backupParse) {
return parseDatasetBackup2Chunks(rawText).chunks;
}
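Taken together, the two trigger conditions mean a short document is stored as a single { q, a } pair and never reaches the splitter, while forceChunk always falls through to it. A usage sketch (imports and the remaining TextSplitProps plumbing are elided; only parameters visible in this diff are passed):

// Below the minimum size: returned whole.
rawText2Chunks({
  rawText: 'A note well under a thousand characters.',
  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize: 1000,
  chunkSize: 512
});
// -> [{ q: 'A note well under a thousand characters.', a: '' }]

// forceChunk: skips both guards and always splits.
rawText2Chunks({
  rawText: 'A note well under a thousand characters.',
  chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
  chunkSize: 512
});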

View File

@@ -47,7 +47,6 @@ export const ChunkSettings = {
},
paragraphChunkDeep: Number,
paragraphChunkMinSize: Number,
-  paragraphChunkMaxSize: Number,
chunkSize: Number,
chunkSplitter: String,

View File

@@ -658,7 +658,7 @@ export async function searchDatasetData(
tokenLen: 0
};
} catch (error) {
-    addLog.error('multiQueryRecall error', error);
+    addLog.error('Full text search error', error);
return {
fullTextRecallResults: [],
tokenLen: 0

View File

@@ -1,5 +1,6 @@
import iconv from 'iconv-lite';
import { type ReadRawTextByBuffer, type ReadFileResponse } from '../type';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
const rawEncodingList = [
'ascii',
@@ -34,7 +35,10 @@ export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): Read
}
})();
const { text, imageList } = matchMdImg(content);
return {
-    rawText: content
+    rawText: text,
+    imageList
};
};
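matchMdImg is only consumed here; judging from the destructured return, it lifts markdown image references (e.g. inline base64 data URLs) out of the text so readFileRawText can return them as a separate imageList instead of leaving blobs inline. A usage sketch (the exact placeholder left in the text is an implementation detail not shown in this diff):

import { matchMdImg } from '@fastgpt/global/common/string/markdown';

const { text, imageList } = matchMdImg(
  'Intro ![diagram](data:image/png;base64,iVBORw0...)'
);
// text: markdown with the image reference rewritten/lifted out
// imageList: the extracted images, now surfaced in ReadFileResponse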