perf: buffer;fix: back up split (#4913)

* perf: buffer

* fix: back up split

* fix: app limit

* doc
This commit is contained in:
Archer
2025-05-28 18:18:25 +08:00
committed by GitHub
parent 802de11363
commit a171c7b11c
11 changed files with 208 additions and 93 deletions

View File

@@ -77,7 +77,10 @@ export const createCollectionAndInsertData = async ({
const chunkSplitter = computeChunkSplitter(createCollectionParams);
const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
if (
trainingType === DatasetCollectionDataProcessModeEnum.qa ||
trainingType === DatasetCollectionDataProcessModeEnum.backup
) {
delete createCollectionParams.chunkTriggerType;
delete createCollectionParams.chunkTriggerMinSize;
delete createCollectionParams.dataEnhanceCollectionName;

View File

@@ -218,6 +218,10 @@ export const rawText2Chunks = ({
};
};
if (backupParse) {
return parseDatasetBackup2Chunks(rawText).chunks;
}
// Chunk condition
// 1. 选择最大值条件,只有超过了最大值(默认为模型的最大值*0.7),才会触发分块
if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
@@ -240,10 +244,6 @@ export const rawText2Chunks = ({
}
}
if (backupParse) {
return parseDatasetBackup2Chunks(rawText).chunks;
}
const { chunks } = splitText2Chunks({
text: rawText,
chunkSize,

View File

@@ -5,8 +5,6 @@ import { NodeOutputKeyEnum } from '@fastgpt/global/core/workflow/constants';
import { type DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/type';
import axios from 'axios';
import { serverRequestBaseUrl } from '../../../../common/api/serverRequest';
import { MongoRawTextBuffer } from '../../../../common/buffer/rawText/schema';
import { readFromSecondary } from '../../../../common/mongo/utils';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { detectFileEncoding, parseUrlToFileType } from '@fastgpt/global/common/file/tools';
import { readRawContentByFileBuffer } from '../../../../common/file/read/utils';
@@ -14,6 +12,8 @@ import { ChatRoleEnum } from '@fastgpt/global/core/chat/constants';
import { type ChatItemType, type UserChatItemValueItemType } from '@fastgpt/global/core/chat/type';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { addLog } from '../../../../common/system/log';
import { addRawTextBuffer, getRawTextBuffer } from '../../../../common/buffer/rawText/controller';
import { addMinutes } from 'date-fns';
type Props = ModuleDispatchProps<{
[NodeInputKeyEnum.fileUrlList]: string[];
@@ -158,14 +158,12 @@ export const getFileContentFromLinks = async ({
parseUrlList
.map(async (url) => {
// Get from buffer
const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: url }, undefined, {
...readFromSecondary
}).lean();
const fileBuffer = await getRawTextBuffer(url);
if (fileBuffer) {
return formatResponseObject({
filename: fileBuffer.metadata?.filename || url,
filename: fileBuffer.sourceName || url,
url,
content: fileBuffer.rawText
content: fileBuffer.text
});
}
@@ -220,17 +218,12 @@ export const getFileContentFromLinks = async ({
});
// Add to buffer
try {
if (buffer.length < 14 * 1024 * 1024 && rawText.trim()) {
MongoRawTextBuffer.create({
sourceId: url,
rawText,
metadata: {
filename: filename
}
});
}
} catch (error) {}
addRawTextBuffer({
sourceId: url,
sourceName: filename,
text: rawText,
expiredTime: addMinutes(new Date(), 20)
});
return formatResponseObject({ filename, url, content: rawText });
} catch (error) {