perf: chunk trigger and paragraph split (#4893)

* perf: chunk trigger and paragraph split

* update max chunk size computation

* perf: i18n

* remove table
This commit is contained in:
Archer
2025-05-26 18:57:22 +08:00
committed by GitHub
parent 874300a56a
commit c25cd48e72
23 changed files with 859 additions and 164 deletions

View File

@@ -34,6 +34,7 @@ import { getTrainingModeByCollection } from './utils';
import {
computeChunkSize,
computeChunkSplitter,
computeParagraphChunkDeep,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
@@ -74,6 +75,8 @@ export const createCollectionAndInsertData = async ({
llmModel: getLLMModel(dataset.agentModel)
});
const chunkSplitter = computeChunkSplitter(createCollectionParams);
const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
delete createCollectionParams.chunkTriggerType;
delete createCollectionParams.chunkTriggerMinSize;
@@ -87,7 +90,11 @@ export const createCollectionAndInsertData = async ({
// 1. split chunks
const chunks = rawText2Chunks({
rawText,
chunkTriggerType: createCollectionParams.chunkTriggerType,
chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
chunkSize,
paragraphChunkDeep,
paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
@@ -112,6 +119,7 @@ export const createCollectionAndInsertData = async ({
const { _id: collectionId } = await createOneCollection({
...createCollectionParams,
trainingType,
paragraphChunkDeep,
chunkSize,
chunkSplitter,
@@ -212,46 +220,19 @@ export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
tmbId: string;
session?: ClientSession;
};
export async function createOneCollection({
teamId,
tmbId,
name,
parentId,
datasetId,
type,
export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
const {
teamId,
parentId,
datasetId,
tags,
createTime,
updateTime,
hashRawText,
rawTextLength,
metadata = {},
tags,
nextSyncTime,
fileId,
rawLink,
externalFileId,
externalFileUrl,
apiFileId,
// Parse settings
customPdfParse,
imageIndex,
autoIndexes,
// Chunk settings
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize,
indexSize,
chunkSplitter,
qaPrompt,
session
}: CreateOneCollectionParams) {
fileId,
rawLink,
externalFileId,
externalFileUrl,
apiFileId
} = props;
// Create collection tags
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -259,41 +240,18 @@ export async function createOneCollection({
const [collection] = await MongoDatasetCollection.create(
[
{
...props,
teamId,
tmbId,
parentId: parentId || null,
datasetId,
name,
type,
rawTextLength,
hashRawText,
tags: collectionTags,
metadata,
createTime,
updateTime,
nextSyncTime,
...(fileId ? { fileId } : {}),
...(rawLink ? { rawLink } : {}),
...(externalFileId ? { externalFileId } : {}),
...(externalFileUrl ? { externalFileUrl } : {}),
...(apiFileId ? { apiFileId } : {}),
// Parse settings
customPdfParse,
imageIndex,
autoIndexes,
// Chunk settings
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize,
indexSize,
chunkSplitter,
qaPrompt
...(apiFileId ? { apiFileId } : {})
}
],
{ session, ordered: true }