Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 05:12:39 +00:00)
perf: chunk trigger and paragraph split (#4893)
* perf: chunk trigger and paragraph split
* update max size computed
* perf: i18n
* remove table
@@ -34,6 +34,7 @@ import { getTrainingModeByCollection } from './utils';
import {
  computeChunkSize,
  computeChunkSplitter,
  computeParagraphChunkDeep,
  getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
@@ -74,6 +75,8 @@ export const createCollectionAndInsertData = async ({
    llmModel: getLLMModel(dataset.agentModel)
  });
  const chunkSplitter = computeChunkSplitter(createCollectionParams);
  const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);

  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    delete createCollectionParams.chunkTriggerType;
    delete createCollectionParams.chunkTriggerMinSize;
@@ -87,7 +90,11 @@ export const createCollectionAndInsertData = async ({
  // 1. split chunks
  const chunks = rawText2Chunks({
    rawText,
    chunkTriggerType: createCollectionParams.chunkTriggerType,
    chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
    chunkSize,
    paragraphChunkDeep,
    paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
    maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
    overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
    customReg: chunkSplitter ? [chunkSplitter] : [],
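For context, a minimal usage sketch of the call above (illustrative values only, not taken from the repository): when the raw text is shorter than chunkTriggerMinSize, rawText2Chunks is expected to return the whole text as a single { q, a } chunk instead of splitting it.

// Illustrative only; assumes rawText2Chunks and ChunkTriggerConfigTypeEnum are imported as in this file.
const singleChunk = rawText2Chunks({
  rawText: 'A short note that should stay in one chunk.',
  chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize: 1000, // below this length the text is never split
  chunkSize: 512,
  maxSize: 16000,
  overlapRatio: 0,
  customReg: []
});
// Expected shape: [{ q: 'A short note that should stay in one chunk.', a: '' }]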
@@ -112,6 +119,7 @@ export const createCollectionAndInsertData = async ({
    const { _id: collectionId } = await createOneCollection({
      ...createCollectionParams,
      trainingType,
      paragraphChunkDeep,
      chunkSize,
      chunkSplitter,
@@ -212,46 +220,19 @@ export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
  tmbId: string;
  session?: ClientSession;
};
export async function createOneCollection({
  teamId,
  tmbId,
  name,
  parentId,
  datasetId,
  type,
export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
  const {
    teamId,
    parentId,
    datasetId,
    tags,

    createTime,
    updateTime,

    hashRawText,
    rawTextLength,
    metadata = {},
    tags,

    nextSyncTime,

    fileId,
    rawLink,
    externalFileId,
    externalFileUrl,
    apiFileId,

    // Parse settings
    customPdfParse,
    imageIndex,
    autoIndexes,

    // Chunk settings
    trainingType,
    chunkSettingMode,
    chunkSplitMode,
    chunkSize,
    indexSize,
    chunkSplitter,
    qaPrompt,

    session
  }: CreateOneCollectionParams) {
    fileId,
    rawLink,
    externalFileId,
    externalFileUrl,
    apiFileId
  } = props;
  // Create collection tags
  const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -259,41 +240,18 @@ export async function createOneCollection({
  const [collection] = await MongoDatasetCollection.create(
    [
      {
        ...props,
        teamId,
        tmbId,
        parentId: parentId || null,
        datasetId,
        name,
        type,

        rawTextLength,
        hashRawText,
        tags: collectionTags,
        metadata,

        createTime,
        updateTime,
        nextSyncTime,

        ...(fileId ? { fileId } : {}),
        ...(rawLink ? { rawLink } : {}),
        ...(externalFileId ? { externalFileId } : {}),
        ...(externalFileUrl ? { externalFileUrl } : {}),
        ...(apiFileId ? { apiFileId } : {}),

        // Parse settings
        customPdfParse,
        imageIndex,
        autoIndexes,

        // Chunk settings
        trainingType,
        chunkSettingMode,
        chunkSplitMode,
        chunkSize,
        indexSize,
        chunkSplitter,
        qaPrompt
        ...(apiFileId ? { apiFileId } : {})
      }
    ],
    { session, ordered: true }
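The refactor above replaces the long destructured parameter list with a { session, ...props } rest parameter, spreads props into the document, and keeps optional identifiers out of the stored record via conditional spreads. A simplified, self-contained sketch of that pattern (hypothetical type and function names, not the repository's code):

type CreateParams = {
  teamId: string;
  name: string;
  fileId?: string;
  rawLink?: string;
  session?: unknown; // stands in for the mongoose ClientSession
};

function buildCollectionDoc({ session, ...props }: CreateParams) {
  const { teamId, name, fileId, rawLink } = props;
  return {
    teamId,
    name,
    // Conditional spreads add a key only when the value is set,
    // so unset optional fields never reach the database document.
    ...(fileId ? { fileId } : {}),
    ...(rawLink ? { rawLink } : {})
  };
}

// buildCollectionDoc({ teamId: 't1', name: 'docs' }) -> { teamId: 't1', name: 'docs' }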
@@ -1,5 +1,8 @@
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import {
  ChunkTriggerConfigTypeEnum,
  DatasetSourceReadTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
@@ -179,11 +182,17 @@ export const readApiServerFileContent = async ({

export const rawText2Chunks = ({
  rawText,
  chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize = 1000,
  backupParse,
  chunkSize = 512,
  ...splitProps
}: {
  rawText: string;

  chunkTriggerType?: ChunkTriggerConfigTypeEnum;
  chunkTriggerMinSize?: number; // maxSize from agent model, not store

  backupParse?: boolean;
  tableParse?: boolean;
} & TextSplitProps): {
@@ -209,6 +218,28 @@ export const rawText2Chunks = ({
    };
  };

  // Chunk condition
  // 1. Max-size trigger: only split when the text exceeds the max size (default: 0.7 * the model's max chunk size)
  if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
    const textLength = rawText.trim().length;
    const maxSize = splitProps.maxSize ? splitProps.maxSize * 0.7 : 16000;
    if (textLength < maxSize) {
      return [
        {
          q: rawText,
          a: ''
        }
      ];
    }
  }
  // 2. Min-size trigger: only split when the text exceeds the manually configured minimum size
  if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) {
    const textLength = rawText.trim().length;
    if (textLength < chunkTriggerMinSize) {
      return [{ q: rawText, a: '' }];
    }
  }

  if (backupParse) {
    return parseDatasetBackup2Chunks(rawText).chunks;
  }
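Read together, the two branches above form a single decision: split only when the text is long enough for the selected trigger. A standalone sketch of that decision (assumed enum values; the real ChunkTriggerConfigTypeEnum comes from @fastgpt/global/core/dataset/constants):

enum ChunkTrigger {
  minSize = 'minSize',
  forceChunk = 'forceChunk',
  maxSize = 'maxSize'
}

function shouldSplit(
  text: string,
  trigger: ChunkTrigger,
  opts: { minSize?: number; modelMaxSize?: number } = {}
): boolean {
  const { minSize = 1000, modelMaxSize } = opts;
  const len = text.trim().length;
  // 1. Max-size trigger: split only when the text exceeds 70% of the model's
  //    max chunk size (falling back to 16000 characters when no model size is given).
  const maxThreshold = modelMaxSize ? modelMaxSize * 0.7 : 16000;
  if (trigger === ChunkTrigger.maxSize && len < maxThreshold) return false;
  // 2. Unless force-chunk is selected, texts below the manual minimum are never split.
  if (trigger !== ChunkTrigger.forceChunk && len < minSize) return false;
  return true;
}

// shouldSplit('a short note', ChunkTrigger.minSize)   -> false (returned as one chunk)
// shouldSplit('x'.repeat(2000), ChunkTrigger.minSize) -> true  (handed to the splitter)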
@@ -47,7 +47,6 @@ export const ChunkSettings = {
  },
  paragraphChunkDeep: Number,
  paragraphChunkMinSize: Number,
  paragraphChunkMaxSize: Number,
  chunkSize: Number,
  chunkSplitter: String,
@@ -658,7 +658,7 @@ export async function searchDatasetData(
      tokenLen: 0
    };
  } catch (error) {
    addLog.error('multiQueryRecall error', error);
    addLog.error('Full text search error', error);
    return {
      fullTextRecallResults: [],
      tokenLen: 0
@@ -1,5 +1,6 @@
import iconv from 'iconv-lite';
import { type ReadRawTextByBuffer, type ReadFileResponse } from '../type';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';

const rawEncodingList = [
  'ascii',
@@ -34,7 +35,10 @@ export const readFileRawText = ({ buffer, encoding }: ReadRawTextByBuffer): Read
    }
  })();

  const { text, imageList } = matchMdImg(content);

  return {
    rawText: content
    rawText: text,
    imageList
  };
};
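For reference, a sketch of what the changed return value means for callers; the exact shape of imageList is defined by matchMdImg in @fastgpt/global/common/string/markdown and is only assumed here:

// Assumes matchMdImg pulls markdown image references out of the decoded content
// and returns the remaining text plus the collected images, as the call above suggests.
import { matchMdImg } from '@fastgpt/global/common/string/markdown';

const content = 'Intro text\n\n![diagram](data:image/png;base64,AAAA)\n\nMore text';
const { text, imageList } = matchMdImg(content);
// readFileRawText now forwards both values, so downstream dataset processing
// receives the extracted images instead of losing them inside rawText.
console.log(text, imageList);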