From 01ff56b42b6fcd1122a93e9636c04c47443180a8 Mon Sep 17 00:00:00 2001
From: Archer <545436317@qq.com>
Date: Tue, 10 Jun 2025 00:05:54 +0800
Subject: [PATCH] perf: password special chars;feat: llm paragraph;perf: chunk setting params;perf: text splitter worker (#4984)

* perf: password special chars
* feat: llm paragraph;perf: chunk setting params
* perf: text splitter worker
* perf: get rawtext buffer
* fix: test
* fix: test
* doc
* min chunk size
---
 .../zh-cn/docs/development/upgrading/4912.md | 22 +++
 packages/global/common/string/password.ts | 4 +-
 packages/global/common/string/textSplitter.ts | 10 +-
 packages/global/common/system/constants.ts | 1 +
 packages/global/core/dataset/constants.ts | 3 +-
 .../global/core/dataset/training/utils.ts | 118 +++++++-----
 packages/global/package.json | 4 +-
 .../common/buffer/rawText/controller.ts | 41 ++--
 packages/service/common/file/gridfs/utils.ts | 6 +-
 packages/service/common/file/read/utils.ts | 7 +-
 packages/service/common/mongo/index.ts | 3 +-
 .../core/dataset/collection/controller.ts | 99 +++++-----
 packages/service/core/dataset/read.ts | 21 +-
 .../core/dataset/training/controller.ts | 17 +-
 .../support/wallet/usage/controller.ts | 64 ++++++
 packages/service/worker/controller.ts | 18 ++
 packages/service/worker/function.ts | 24 +++
 packages/service/worker/htmlStr2Md/index.ts | 12 +-
 packages/service/worker/readFile/index.ts | 14 +-
 packages/service/worker/text2Chunks/index.ts | 14 ++
 packages/service/worker/utils.ts | 3 +-
 .../web/components/common/MySelect/index.tsx | 5 +-
 packages/web/i18n/en/account_usage.json | 2 +
 packages/web/i18n/en/dataset.json | 6 +
 packages/web/i18n/zh-CN/account_usage.json | 2 +
 packages/web/i18n/zh-CN/dataset.json | 6 +
 packages/web/i18n/zh-Hant/account_usage.json | 2 +
 packages/web/i18n/zh-Hant/dataset.json | 6 +
 pnpm-lock.yaml | 6 +
 .../detail/CollectionCard/WebsiteConfig.tsx | 21 +-
 .../detail/Form/CollectionChunkForm.tsx | 182 +++++++-----------
 .../dataset/detail/Import/Context.tsx | 6 +-
 .../Import/commonProgress/PreviewData.tsx | 7 +-
 .../detail/Import/commonProgress/Upload.tsx | 9 +-
 .../api/core/dataset/file/getPreviewChunks.ts | 60 ++----
 .../app/src/pages/api/core/dataset/update.ts | 12 +-
 .../service/core/dataset/data/controller.ts | 32 +--
 .../core/dataset/queues/datasetParse.ts | 50 +++--
 .../service/core/dataset/queues/generateQA.ts | 15 +-
 .../src/service/support/wallet/usage/push.ts | 36 ----
 .../service/core/dataset/textSplitter.test.ts | 24 +--
 41 files changed, 546 insertions(+), 448 deletions(-)
 create mode 100644 docSite/content/zh-cn/docs/development/upgrading/4912.md
 create mode 100644 packages/service/worker/controller.ts
 create mode 100644 packages/service/worker/function.ts
 create mode 100644 packages/service/worker/text2Chunks/index.ts

diff --git a/docSite/content/zh-cn/docs/development/upgrading/4912.md b/docSite/content/zh-cn/docs/development/upgrading/4912.md
new file mode 100644
index 000000000..836ecd57d
--- /dev/null
+++ b/docSite/content/zh-cn/docs/development/upgrading/4912.md
@@ -0,0 +1,22 @@
+---
+title: 'V4.9.12(进行中)'
+description: 'FastGPT V4.9.12 更新说明'
+icon: 'upgrade'
+draft: false
+toc: true
+weight: 789
+---
+
+## 🚀 新增内容
+
+1. 商业版支持知识库分块时,LLM 进行自动分段识别。
+
+## ⚙️ 优化
+
+1. 密码校验时,增加更多的特殊字符
+2. 后端全量计算知识库 chunk 参数,避免自动模式下部分参数未正确使用默认值。
+3. 将文本分块移至 worker 线程,避免阻塞。
+
+## 🐛 修复
+
+1. 
自定义问答提取提示词被覆盖。 \ No newline at end of file diff --git a/packages/global/common/string/password.ts b/packages/global/common/string/password.ts index 68b92d544..4042019a0 100644 --- a/packages/global/common/string/password.ts +++ b/packages/global/common/string/password.ts @@ -3,9 +3,9 @@ export const checkPasswordRule = (password: string) => { /\d/, // Contains digits /[a-z]/, // Contains lowercase letters /[A-Z]/, // Contains uppercase letters - /[!@#$%^&*()_+=-]/ // Contains special characters + /[!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]/ // Contains special characters ]; - const validChars = /^[\dA-Za-z!@#$%^&*()_+=-]{8,100}$/; + const validChars = /^[\dA-Za-z!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]{8,100}$/; // Check length and valid characters if (!validChars.test(password)) return false; diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts index 30dfe5174..6ded96260 100644 --- a/packages/global/common/string/textSplitter.ts +++ b/packages/global/common/string/textSplitter.ts @@ -1,10 +1,11 @@ import { defaultMaxChunkSize } from '../../core/dataset/training/utils'; import { getErrText } from '../error/utils'; +import { simpleText } from './tools'; import { getTextValidLength } from './utils'; export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----'; -type SplitProps = { +export type SplitProps = { text: string; chunkSize: number; @@ -19,7 +20,7 @@ export type TextSplitProps = Omit & { chunkSize?: number; }; -type SplitResponse = { +export type SplitResponse = { chunks: string[]; chars: number; }; @@ -474,7 +475,10 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => { }); return { - chunks: splitResult.map((item) => item.chunks).flat(), + chunks: splitResult + .map((item) => item.chunks) + .flat() + .map((chunk) => simpleText(chunk)), chars: splitResult.reduce((sum, item) => sum + item.chars, 0) }; }; diff --git a/packages/global/common/system/constants.ts b/packages/global/common/system/constants.ts index 4dcc4d276..ab7a281be 100644 --- a/packages/global/common/system/constants.ts +++ b/packages/global/common/system/constants.ts @@ -7,3 +7,4 @@ export const DEFAULT_ORG_AVATAR = '/imgs/avatar/defaultOrgAvatar.svg'; export const DEFAULT_USER_AVATAR = '/imgs/avatar/BlueAvatar.svg'; export const isProduction = process.env.NODE_ENV === 'production'; +export const isTestEnv = process.env.NODE_ENV === 'test'; diff --git a/packages/global/core/dataset/constants.ts b/packages/global/core/dataset/constants.ts index 8e74b7c2a..5f759247d 100644 --- a/packages/global/core/dataset/constants.ts +++ b/packages/global/core/dataset/constants.ts @@ -211,7 +211,8 @@ export enum DataChunkSplitModeEnum { } export enum ParagraphChunkAIModeEnum { auto = 'auto', - force = 'force' + force = 'force', + forbid = 'forbid' } /* ------------ data -------------- */ diff --git a/packages/global/core/dataset/training/utils.ts b/packages/global/core/dataset/training/utils.ts index d98390e9c..ac9715eb3 100644 --- a/packages/global/core/dataset/training/utils.ts +++ b/packages/global/core/dataset/training/utils.ts @@ -3,8 +3,11 @@ import { type EmbeddingModelItemType, type LLMModelItemType } from '../../../cor import { ChunkSettingModeEnum, DataChunkSplitModeEnum, - DatasetCollectionDataProcessModeEnum + DatasetCollectionDataProcessModeEnum, + ParagraphChunkAIModeEnum } from '../constants'; +import type { ChunkSettingsType } from '../type'; +import { cloneDeep } from 'lodash'; export const minChunkSize = 64; // min index and chunk size @@ -103,53 
+106,78 @@ export const getIndexSizeSelectList = (max = 512) => { }; // Compute -export const computeChunkSize = (params: { - trainingType: DatasetCollectionDataProcessModeEnum; - chunkSettingMode?: ChunkSettingModeEnum; - chunkSplitMode?: DataChunkSplitModeEnum; +export const computedCollectionChunkSettings = ({ + llmModel, + vectorModel, + ...data +}: { llmModel?: LLMModelItemType; - chunkSize?: number; -}) => { - if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) { - if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { - return getLLMDefaultChunkSize(params.llmModel); + vectorModel?: EmbeddingModelItemType; +} & T) => { + const { + trainingType = DatasetCollectionDataProcessModeEnum.chunk, + chunkSettingMode = ChunkSettingModeEnum.auto, + chunkSplitMode, + chunkSize, + paragraphChunkDeep = 5, + indexSize, + autoIndexes + } = data; + const cloneChunkSettings = cloneDeep(data); + + if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) { + delete cloneChunkSettings.qaPrompt; + } + + // Format training type indexSize/chunkSize + const trainingModeSize: { + autoChunkSize: number; + autoIndexSize: number; + chunkSize?: number; + indexSize?: number; + } = (() => { + if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { + return { + autoChunkSize: getLLMDefaultChunkSize(llmModel), + autoIndexSize: getMaxIndexSize(vectorModel), + chunkSize, + indexSize: getMaxIndexSize(vectorModel) + }; + } else if (autoIndexes) { + return { + autoChunkSize: chunkAutoChunkSize, + autoIndexSize: getAutoIndexSize(vectorModel), + chunkSize, + indexSize + }; + } else { + return { + autoChunkSize: chunkAutoChunkSize, + autoIndexSize: getAutoIndexSize(vectorModel), + chunkSize, + indexSize + }; } + })(); + + if (chunkSettingMode === ChunkSettingModeEnum.auto) { + cloneChunkSettings.chunkSplitMode = DataChunkSplitModeEnum.paragraph; + cloneChunkSettings.paragraphChunkAIMode = ParagraphChunkAIModeEnum.forbid; + cloneChunkSettings.paragraphChunkDeep = 5; + cloneChunkSettings.paragraphChunkMinSize = 100; + cloneChunkSettings.chunkSize = trainingModeSize.autoChunkSize; + cloneChunkSettings.indexSize = trainingModeSize.autoIndexSize; + + cloneChunkSettings.chunkSplitter = undefined; } else { - // chunk - if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { - return chunkAutoChunkSize; - } + cloneChunkSettings.paragraphChunkDeep = + chunkSplitMode === DataChunkSplitModeEnum.paragraph ? paragraphChunkDeep : 0; + + cloneChunkSettings.chunkSize = trainingModeSize.chunkSize + ? Math.min(trainingModeSize.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(llmModel)) + : undefined; + cloneChunkSettings.indexSize = trainingModeSize.indexSize; } - if (params.chunkSplitMode === DataChunkSplitModeEnum.char) { - return getLLMMaxChunkSize(params.llmModel); - } - - return Math.min(params.chunkSize ?? 
chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel)); -}; -export const computeChunkSplitter = (params: { - chunkSettingMode?: ChunkSettingModeEnum; - chunkSplitMode?: DataChunkSplitModeEnum; - chunkSplitter?: string; -}) => { - if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { - return undefined; - } - if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) { - return undefined; - } - return params.chunkSplitter; -}; -export const computeParagraphChunkDeep = (params: { - chunkSettingMode?: ChunkSettingModeEnum; - chunkSplitMode?: DataChunkSplitModeEnum; - paragraphChunkDeep?: number; -}) => { - if (params.chunkSettingMode === ChunkSettingModeEnum.auto) { - return 5; - } - if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) { - return params.paragraphChunkDeep; - } - return 0; + return cloneChunkSettings; }; diff --git a/packages/global/package.json b/packages/global/package.json index 376a115bd..93dd88a28 100644 --- a/packages/global/package.json +++ b/packages/global/package.json @@ -15,9 +15,11 @@ "next": "14.2.28", "openai": "4.61.0", "openapi-types": "^12.1.3", - "timezones-list": "^3.0.2" + "timezones-list": "^3.0.2", + "lodash": "^4.17.21" }, "devDependencies": { + "@types/lodash": "^4.14.191", "@types/js-yaml": "^4.0.9", "@types/node": "20.14.0" } diff --git a/packages/service/common/buffer/rawText/controller.ts b/packages/service/common/buffer/rawText/controller.ts index 8750494a3..306edac32 100644 --- a/packages/service/common/buffer/rawText/controller.ts +++ b/packages/service/common/buffer/rawText/controller.ts @@ -5,6 +5,8 @@ import { addLog } from '../../system/log'; import { setCron } from '../../system/cron'; import { checkTimerLock } from '../../system/timerLock/utils'; import { TimerIdEnum } from '../../system/timerLock/constants'; +import { gridFsStream2Buffer } from '../../file/gridfs/utils'; +import { readRawContentFromBuffer } from '../../../worker/function'; const getGridBucket = () => { return new connectionMongo.mongo.GridFSBucket(connectionMongo.connection.db!, { @@ -85,30 +87,27 @@ export const getRawTextBuffer = async (sourceId: string) => { // Read file content const downloadStream = gridBucket.openDownloadStream(bufferData._id); - const chunks: Buffer[] = []; - return new Promise<{ - text: string; - sourceName: string; - } | null>((resolve, reject) => { - downloadStream.on('data', (chunk) => { - chunks.push(chunk); - }); + const fileBuffers = await gridFsStream2Buffer(downloadStream); - downloadStream.on('end', () => { - const buffer = Buffer.concat(chunks); - const text = buffer.toString('utf8'); - resolve({ - text, - sourceName: bufferData.metadata?.sourceName || '' - }); - }); + const rawText = await (async () => { + if (fileBuffers.length < 10000000) { + return fileBuffers.toString('utf8'); + } else { + return ( + await readRawContentFromBuffer({ + extension: 'txt', + encoding: 'utf8', + buffer: fileBuffers + }) + ).rawText; + } + })(); - downloadStream.on('error', (error) => { - addLog.error('getRawTextBuffer error', error); - resolve(null); - }); - }); + return { + text: rawText, + sourceName: bufferData.metadata?.sourceName || '' + }; }); }; diff --git a/packages/service/common/file/gridfs/utils.ts b/packages/service/common/file/gridfs/utils.ts index 4c72fb61d..691d85a4f 100644 --- a/packages/service/common/file/gridfs/utils.ts +++ b/packages/service/common/file/gridfs/utils.ts @@ -55,13 +55,17 @@ export const createFileFromText = async ({ export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => { return new 
Promise((resolve, reject) => { + if (!stream.readable) { + return resolve(Buffer.from([])); + } + const chunks: Uint8Array[] = []; stream.on('data', (chunk) => { chunks.push(chunk); }); stream.on('end', () => { - const resultBuffer = Buffer.concat(chunks); // 一次性拼接 + const resultBuffer = Buffer.concat(chunks); // One-time splicing resolve(resultBuffer); }); stream.on('error', (err) => { diff --git a/packages/service/common/file/read/utils.ts b/packages/service/common/file/read/utils.ts index b08d36137..461da09dc 100644 --- a/packages/service/common/file/read/utils.ts +++ b/packages/service/common/file/read/utils.ts @@ -1,6 +1,5 @@ import { uploadMongoImg } from '../image/controller'; import FormData from 'form-data'; -import { WorkerNameEnum, runWorker } from '../../../worker/utils'; import fs from 'fs'; import type { ReadFileResponse } from '../../../worker/readFile/type'; import axios from 'axios'; @@ -9,6 +8,7 @@ import { batchRun } from '@fastgpt/global/common/system/utils'; import { matchMdImg } from '@fastgpt/global/common/string/markdown'; import { createPdfParseUsage } from '../../../support/wallet/usage/controller'; import { useDoc2xServer } from '../../../thirdProvider/doc2x'; +import { readRawContentFromBuffer } from '../../../worker/function'; export type readRawTextByLocalFileParams = { teamId: string; @@ -63,11 +63,10 @@ export const readRawContentByFileBuffer = async ({ rawText: string; }> => { const systemParse = () => - runWorker(WorkerNameEnum.readFile, { + readRawContentFromBuffer({ extension, encoding, - buffer, - teamId + buffer }); const parsePdfFromCustomService = async (): Promise => { const url = global.systemEnv.customPdfParse?.url; diff --git a/packages/service/common/mongo/index.ts b/packages/service/common/mongo/index.ts index 6153714be..0abb1e615 100644 --- a/packages/service/common/mongo/index.ts +++ b/packages/service/common/mongo/index.ts @@ -1,3 +1,4 @@ +import { isTestEnv } from '@fastgpt/global/common/system/constants'; import { addLog } from '../../common/system/log'; import type { Model } from 'mongoose'; import mongoose, { Mongoose } from 'mongoose'; @@ -70,7 +71,7 @@ const addCommonMiddleware = (schema: mongoose.Schema) => { export const getMongoModel = (name: string, schema: mongoose.Schema) => { if (connectionMongo.models[name]) return connectionMongo.models[name] as Model; - if (process.env.NODE_ENV !== 'test') console.log('Load model======', name); + if (!isTestEnv) console.log('Load model======', name); addCommonMiddleware(schema); const model = connectionMongo.model(name, schema); diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 646c5ab4e..9a032b5f3 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -32,10 +32,7 @@ import { MongoDatasetDataText } from '../data/dataTextSchema'; import { retryFn } from '@fastgpt/global/common/system/utils'; import { getTrainingModeByCollection } from './utils'; import { - computeChunkSize, - computeChunkSplitter, - computeParagraphChunkDeep, - getAutoIndexSize, + computedCollectionChunkSettings, getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; @@ -68,31 +65,50 @@ export const createCollectionAndInsertData = async ({ createCollectionParams.autoIndexes = true; } - const teamId = createCollectionParams.teamId; - const tmbId = 
createCollectionParams.tmbId; + const formatCreateCollectionParams = computedCollectionChunkSettings({ + ...createCollectionParams, + llmModel: getLLMModel(dataset.agentModel), + vectorModel: getEmbeddingModel(dataset.vectorModel) + }); + + const teamId = formatCreateCollectionParams.teamId; + const tmbId = formatCreateCollectionParams.tmbId; // Set default params const trainingType = - createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; - const chunkSplitter = computeChunkSplitter(createCollectionParams); - const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams); + formatCreateCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; const trainingMode = getTrainingModeByCollection({ trainingType: trainingType, - autoIndexes: createCollectionParams.autoIndexes, - imageIndex: createCollectionParams.imageIndex + autoIndexes: formatCreateCollectionParams.autoIndexes, + imageIndex: formatCreateCollectionParams.imageIndex }); if ( trainingType === DatasetCollectionDataProcessModeEnum.qa || - trainingType === DatasetCollectionDataProcessModeEnum.backup + trainingType === DatasetCollectionDataProcessModeEnum.backup || + trainingType === DatasetCollectionDataProcessModeEnum.template ) { - delete createCollectionParams.chunkTriggerType; - delete createCollectionParams.chunkTriggerMinSize; - delete createCollectionParams.dataEnhanceCollectionName; - delete createCollectionParams.imageIndex; - delete createCollectionParams.autoIndexes; - delete createCollectionParams.indexSize; - delete createCollectionParams.qaPrompt; + delete formatCreateCollectionParams.chunkTriggerType; + delete formatCreateCollectionParams.chunkTriggerMinSize; + delete formatCreateCollectionParams.dataEnhanceCollectionName; + delete formatCreateCollectionParams.imageIndex; + delete formatCreateCollectionParams.autoIndexes; + + if ( + trainingType === DatasetCollectionDataProcessModeEnum.backup || + trainingType === DatasetCollectionDataProcessModeEnum.template + ) { + delete formatCreateCollectionParams.paragraphChunkAIMode; + delete formatCreateCollectionParams.paragraphChunkDeep; + delete formatCreateCollectionParams.paragraphChunkMinSize; + delete formatCreateCollectionParams.chunkSplitMode; + delete formatCreateCollectionParams.chunkSize; + delete formatCreateCollectionParams.chunkSplitter; + delete formatCreateCollectionParams.indexSize; + } + } + if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) { + delete formatCreateCollectionParams.qaPrompt; } // 1. 
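For reference, a minimal sketch of how the normalization above behaves, assuming auto mode and the computedCollectionChunkSettings export from this patch; llmModel and vectorModel are optional and omitted here:

    import {
      ChunkSettingModeEnum,
      DatasetCollectionDataProcessModeEnum
    } from '@fastgpt/global/core/dataset/constants';
    import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';

    // Auto mode resolves every chunk parameter on the backend, so callers
    // such as createCollectionAndInsertData no longer apply per-field defaults.
    const settings = computedCollectionChunkSettings({
      trainingType: DatasetCollectionDataProcessModeEnum.chunk,
      chunkSettingMode: ChunkSettingModeEnum.auto
      // llmModel / vectorModel may be passed to pick model-specific sizes
    });
    // Expected per the implementation above: chunkSplitMode 'paragraph',
    // paragraphChunkAIMode 'forbid', paragraphChunkDeep 5,
    // paragraphChunkMinSize 100, and chunkSplitter undefined.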
split chunks or create image chunks @@ -109,30 +125,27 @@ export const createCollectionAndInsertData = async ({ }>; chunkSize?: number; indexSize?: number; - } = (() => { + } = await (async () => { if (rawText) { - const chunkSize = computeChunkSize({ - ...createCollectionParams, - trainingType, - llmModel: getLLMModel(dataset.agentModel) - }); // Process text chunks - const chunks = rawText2Chunks({ + const chunks = await rawText2Chunks({ rawText, - chunkTriggerType: createCollectionParams.chunkTriggerType, - chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize, - chunkSize, - paragraphChunkDeep, - paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize, + chunkTriggerType: formatCreateCollectionParams.chunkTriggerType, + chunkTriggerMinSize: formatCreateCollectionParams.chunkTriggerMinSize, + chunkSize: formatCreateCollectionParams.chunkSize, + paragraphChunkDeep: formatCreateCollectionParams.paragraphChunkDeep, + paragraphChunkMinSize: formatCreateCollectionParams.paragraphChunkMinSize, maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, - customReg: chunkSplitter ? [chunkSplitter] : [], + customReg: formatCreateCollectionParams.chunkSplitter + ? [formatCreateCollectionParams.chunkSplitter] + : [], backupParse }); return { chunks, - chunkSize, - indexSize: createCollectionParams.indexSize ?? getAutoIndexSize(dataset.vectorModel) + chunkSize: formatCreateCollectionParams.chunkSize, + indexSize: formatCreateCollectionParams.indexSize }; } @@ -147,12 +160,8 @@ export const createCollectionAndInsertData = async ({ return { chunks: [], - chunkSize: computeChunkSize({ - ...createCollectionParams, - trainingType, - llmModel: getLLMModel(dataset.agentModel) - }), - indexSize: createCollectionParams.indexSize ?? getAutoIndexSize(dataset.vectorModel) + chunkSize: formatCreateCollectionParams.chunkSize, + indexSize: formatCreateCollectionParams.indexSize }; })(); @@ -165,11 +174,9 @@ export const createCollectionAndInsertData = async ({ const fn = async (session: ClientSession) => { // 3. Create collection const { _id: collectionId } = await createOneCollection({ - ...createCollectionParams, + ...formatCreateCollectionParams, trainingType, - paragraphChunkDeep, chunkSize, - chunkSplitter, indexSize, hashRawText: rawText ? 
hashStr(rawText) : undefined, @@ -179,7 +186,7 @@ export const createCollectionAndInsertData = async ({ if (!dataset.autoSync && dataset.type === DatasetTypeEnum.websiteDataset) return undefined; if ( [DatasetCollectionTypeEnum.link, DatasetCollectionTypeEnum.apiFile].includes( - createCollectionParams.type + formatCreateCollectionParams.type ) ) { return addDays(new Date(), 1); @@ -195,7 +202,7 @@ export const createCollectionAndInsertData = async ({ const { billId: newBillId } = await createTrainingUsage({ teamId, tmbId, - appName: createCollectionParams.name, + appName: formatCreateCollectionParams.name, billSource: UsageSourceEnum.training, vectorModel: getEmbeddingModel(dataset.vectorModel)?.name, agentModel: getLLMModel(dataset.agentModel)?.name, @@ -218,7 +225,7 @@ export const createCollectionAndInsertData = async ({ vlmModel: dataset.vlmModel, indexSize, mode: trainingMode, - prompt: createCollectionParams.qaPrompt, + prompt: formatCreateCollectionParams.qaPrompt, billId: traingBillId, data: chunks.map((item, index) => ({ ...item, diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts index 20b34e3bc..f2a4a9c1c 100644 --- a/packages/service/core/dataset/read.ts +++ b/packages/service/core/dataset/read.ts @@ -5,13 +5,14 @@ import { } from '@fastgpt/global/core/dataset/constants'; import { readFileContentFromMongo } from '../../common/file/gridfs/controller'; import { urlsFetch } from '../../common/string/cheerio'; -import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; +import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter'; import axios from 'axios'; import { readRawContentByFileBuffer } from '../../common/file/read/utils'; import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools'; import { getApiDatasetRequest } from './apiDataset'; import Papa from 'papaparse'; import type { ApiDatasetServerType } from '@fastgpt/global/core/dataset/apiDataset/type'; +import { text2Chunks } from '../../worker/function'; export const readFileRawTextByUrl = async ({ teamId, @@ -165,7 +166,7 @@ export const readApiServerFileContent = async ({ }); }; -export const rawText2Chunks = ({ +export const rawText2Chunks = async ({ rawText, chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize, chunkTriggerMinSize = 1000, @@ -182,12 +183,14 @@ export const rawText2Chunks = ({ backupParse?: boolean; tableParse?: boolean; -} & TextSplitProps): { - q: string; - a: string; - indexes?: string[]; - imageIdList?: string[]; -}[] => { +} & TextSplitProps): Promise< + { + q: string; + a: string; + indexes?: string[]; + imageIdList?: string[]; + }[] +> => { const parseDatasetBackup2Chunks = (rawText: string) => { const csvArr = Papa.parse(rawText).data as string[][]; @@ -233,7 +236,7 @@ export const rawText2Chunks = ({ } } - const { chunks } = splitText2Chunks({ + const { chunks } = await text2Chunks({ text: rawText, chunkSize, ...splitProps diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts index 990cfa427..7385d783e 100644 --- a/packages/service/core/dataset/training/controller.ts +++ b/packages/service/core/dataset/training/controller.ts @@ -112,24 +112,15 @@ export async function pushDataListToTrainingQueue({ // format q and a, remove empty char data = data.filter((item) => { - item.q = simpleText(item.q); - item.a = simpleText(item.a); - - item.indexes = item.indexes - ?.map((index) => { - return { - ...index, - text: 
simpleText(index.text) - }; - }) - .filter(Boolean); + const q = item.q || ''; + const a = item.a || ''; // filter repeat content - if (!item.imageId && !item.q) { + if (!item.imageId && !q) { return; } - const text = item.q + item.a; + const text = q + a; // Oversize llm tokens if (text.length > maxToken) { diff --git a/packages/service/support/wallet/usage/controller.ts b/packages/service/support/wallet/usage/controller.ts index ec840e35b..8ff311650 100644 --- a/packages/service/support/wallet/usage/controller.ts +++ b/packages/service/support/wallet/usage/controller.ts @@ -8,6 +8,8 @@ import { type CreateUsageProps } from '@fastgpt/global/support/wallet/usage/api'; import { i18nT } from '../../../../web/i18n/utils'; +import { formatModelChars2Points } from './utils'; +import { ModelTypeEnum } from '@fastgpt/global/core/ai/model'; export async function createUsage(data: CreateUsageProps) { try { @@ -67,6 +69,14 @@ export const createChatUsage = ({ return { totalPoints }; }; +export type DatasetTrainingMode = 'paragraph' | 'qa' | 'autoIndex' | 'imageIndex' | 'imageParse'; +export const datasetTrainingUsageIndexMap: Record = { + paragraph: 1, + qa: 2, + autoIndex: 3, + imageIndex: 4, + imageParse: 5 +}; export const createTrainingUsage = async ({ teamId, tmbId, @@ -108,6 +118,13 @@ export const createTrainingUsage = async ({ : []), ...(agentModel ? [ + { + moduleName: i18nT('account_usage:llm_paragraph'), + model: agentModel, + amount: 0, + inputTokens: 0, + outputTokens: 0 + }, { moduleName: i18nT('account_usage:qa'), model: agentModel, @@ -126,6 +143,13 @@ export const createTrainingUsage = async ({ : []), ...(vllmModel ? [ + { + moduleName: i18nT('account_usage:image_index'), + model: vllmModel, + amount: 0, + inputTokens: 0, + outputTokens: 0 + }, { moduleName: i18nT('account_usage:image_parse'), model: vllmModel, @@ -171,3 +195,43 @@ export const createPdfParseUsage = async ({ ] }); }; + +export const pushLLMTrainingUsage = async ({ + teamId, + tmbId, + model, + inputTokens, + outputTokens, + billId, + mode +}: { + teamId: string; + tmbId: string; + model: string; + inputTokens: number; + outputTokens: number; + billId: string; + mode: DatasetTrainingMode; +}) => { + const index = datasetTrainingUsageIndexMap[mode]; + + // Compute points + const { totalPoints } = formatModelChars2Points({ + model, + modelType: ModelTypeEnum.llm, + inputTokens, + outputTokens + }); + + concatUsage({ + billId, + teamId, + tmbId, + totalPoints, + inputTokens, + outputTokens, + listIndex: index + }); + + return { totalPoints }; +}; diff --git a/packages/service/worker/controller.ts b/packages/service/worker/controller.ts new file mode 100644 index 000000000..0b9db3717 --- /dev/null +++ b/packages/service/worker/controller.ts @@ -0,0 +1,18 @@ +import type { MessagePort } from 'worker_threads'; + +export const workerResponse = ({ + parentPort, + status, + data +}: { + parentPort: MessagePort | null; + status: 'success' | 'error'; + data: any; +}) => { + parentPort?.postMessage({ + type: status, + data: data + }); + + process.exit(); +}; diff --git a/packages/service/worker/function.ts b/packages/service/worker/function.ts new file mode 100644 index 000000000..6e1e76168 --- /dev/null +++ b/packages/service/worker/function.ts @@ -0,0 +1,24 @@ +import { + splitText2Chunks, + type SplitProps, + type SplitResponse +} from '@fastgpt/global/common/string/textSplitter'; +import { runWorker, WorkerNameEnum } from './utils'; +import type { ReadFileResponse } from './readFile/type'; +import { isTestEnv } from 
'@fastgpt/global/common/system/constants'; + +export const text2Chunks = (props: SplitProps) => { + // Test env, not run worker + if (isTestEnv) { + return splitText2Chunks(props); + } + return runWorker(WorkerNameEnum.text2Chunks, props); +}; + +export const readRawContentFromBuffer = (props: { + extension: string; + encoding: string; + buffer: Buffer; +}) => { + return runWorker(WorkerNameEnum.readFile, props); +}; diff --git a/packages/service/worker/htmlStr2Md/index.ts b/packages/service/worker/htmlStr2Md/index.ts index 22a998760..bc63c6d1b 100644 --- a/packages/service/worker/htmlStr2Md/index.ts +++ b/packages/service/worker/htmlStr2Md/index.ts @@ -1,19 +1,21 @@ import { parentPort } from 'worker_threads'; import { html2md } from './utils'; +import { workerResponse } from '../controller'; parentPort?.on('message', (params: { html: string }) => { try { const md = html2md(params?.html || ''); - parentPort?.postMessage({ - type: 'success', + workerResponse({ + parentPort, + status: 'success', data: md }); } catch (error) { - parentPort?.postMessage({ - type: 'error', + workerResponse({ + parentPort, + status: 'error', data: error }); } - process.exit(); }); diff --git a/packages/service/worker/readFile/index.ts b/packages/service/worker/readFile/index.ts index 40a55025a..78c3edc5b 100644 --- a/packages/service/worker/readFile/index.ts +++ b/packages/service/worker/readFile/index.ts @@ -7,6 +7,7 @@ import { readDocsFile } from './extension/docx'; import { readPptxRawText } from './extension/pptx'; import { readXlsxRawText } from './extension/xlsx'; import { readCsvRawText } from './extension/csv'; +import { workerResponse } from '../controller'; parentPort?.on('message', async (props: ReadRawTextProps) => { const read = async (params: ReadRawTextByBuffer) => { @@ -41,17 +42,16 @@ parentPort?.on('message', async (props: ReadRawTextProps) => { }; try { - parentPort?.postMessage({ - type: 'success', + workerResponse({ + parentPort, + status: 'success', data: await read(newProps) }); } catch (error) { - console.log(error); - parentPort?.postMessage({ - type: 'error', + workerResponse({ + parentPort, + status: 'error', data: error }); } - - process.exit(); }); diff --git a/packages/service/worker/text2Chunks/index.ts b/packages/service/worker/text2Chunks/index.ts new file mode 100644 index 000000000..9a9fc1147 --- /dev/null +++ b/packages/service/worker/text2Chunks/index.ts @@ -0,0 +1,14 @@ +import { parentPort } from 'worker_threads'; +import type { SplitProps } from '@fastgpt/global/common/string/textSplitter'; +import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; +import { workerResponse } from '../controller'; + +parentPort?.on('message', async (props: SplitProps) => { + const result = splitText2Chunks(props); + + workerResponse({ + parentPort, + status: 'success', + data: result + }); +}); diff --git a/packages/service/worker/utils.ts b/packages/service/worker/utils.ts index b7508def0..fb541ae90 100644 --- a/packages/service/worker/utils.ts +++ b/packages/service/worker/utils.ts @@ -6,7 +6,8 @@ export enum WorkerNameEnum { readFile = 'readFile', htmlStr2Md = 'htmlStr2Md', countGptMessagesTokens = 'countGptMessagesTokens', - systemPluginRun = 'systemPluginRun' + systemPluginRun = 'systemPluginRun', + text2Chunks = 'text2Chunks' } export const getSafeEnv = () => { diff --git a/packages/web/components/common/MySelect/index.tsx b/packages/web/components/common/MySelect/index.tsx index 6534a4d41..6a17f2b92 100644 --- a/packages/web/components/common/MySelect/index.tsx 
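The worker files above share one request/response shape; a short usage sketch, assuming runWorker resolves with the payload that workerResponse posts before the thread exits:

    import { text2Chunks } from '@fastgpt/service/worker/function';

    // In the test env this runs splitText2Chunks synchronously; otherwise it
    // spawns the text2Chunks worker thread so splitting never blocks the
    // main event loop.
    const { chunks, chars } = await text2Chunks({
      text: '# Title\n\nSome long document text...',
      chunkSize: 512,
      maxSize: 8000
    });
    console.log(chunks.length, chars);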
+++ b/packages/web/components/common/MySelect/index.tsx
@@ -151,8 +151,7 @@ const MySelect = (
         ? {
             ref: SelectedItemRef,
             color: 'primary.700',
-            bg: 'myGray.100',
-            fontWeight: '600'
+            bg: 'myGray.100'
           }
         : {
             color: 'myGray.900'
@@ -167,7 +166,7 @@ const MySelect = (
             display={'block'}
             mb={0.5}
           >
-            
+            
               {item.icon && (
                 
               )}
diff --git a/packages/web/i18n/en/account_usage.json b/packages/web/i18n/en/account_usage.json
index 6a07f8f05..a754ef15d 100644
--- a/packages/web/i18n/en/account_usage.json
+++ b/packages/web/i18n/en/account_usage.json
@@ -20,8 +20,10 @@
   "export_title": "Time,Members,Type,Project name,AI points",
   "feishu": "Feishu",
   "generation_time": "Generation time",
+  "image_index": "Image index",
   "image_parse": "Image tagging",
   "input_token_length": "input tokens",
+  "llm_paragraph": "LLM segmentation",
   "mcp": "MCP call",
   "member": "member",
   "member_name": "Member name",
diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json
index fa29d8b66..e693d1e9a 100644
--- a/packages/web/i18n/en/dataset.json
+++ b/packages/web/i18n/en/dataset.json
@@ -45,6 +45,7 @@
   "core.dataset.import.Adjust parameters": "Adjust parameters",
   "custom_data_process_params": "Custom",
   "custom_data_process_params_desc": "Customize data processing rules",
+  "custom_split_char": "Separator",
   "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
   "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
   "data_error_amount": "{{errorAmount}} Group training exception",
@@ -117,6 +118,11 @@
   "insert_images_success": "The new picture is successfully added, and you need to wait for the training to be completed before it will be displayed.",
   "is_open_schedule": "Enable scheduled synchronization",
   "keep_image": "Keep the picture",
+  "llm_paragraph_mode": "LLM paragraph recognition (Beta)",
+  "llm_paragraph_mode_auto": "Automatic",
+  "llm_paragraph_mode_auto_desc": "Use the model to automatically recognize headings when the file content contains no Markdown headings.",
+  "llm_paragraph_mode_forbid": "Disabled",
+  "llm_paragraph_mode_forbid_desc": "Forcibly disable the model's automatic paragraph recognition",
   "loading": "Loading...",
   "max_chunk_size": "Maximum chunk size",
   "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
diff --git a/packages/web/i18n/zh-CN/account_usage.json b/packages/web/i18n/zh-CN/account_usage.json
index 13b6a1151..8befa05ab 100644
--- a/packages/web/i18n/zh-CN/account_usage.json
+++ b/packages/web/i18n/zh-CN/account_usage.json
@@ -20,8 +20,10 @@
   "export_title": "时间,成员,类型,项目名,AI 积分消耗",
   "feishu": "飞书",
   "generation_time": "生成时间",
+  "image_index": "图片索引",
   "image_parse": "图片标注",
   "input_token_length": "输入 tokens",
+  "llm_paragraph": "模型分段",
   "mcp": "MCP 调用",
   "member": "成员",
   "member_name": "成员名",
diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json
index eb9a41ea3..d785aef8b 100644
--- a/packages/web/i18n/zh-CN/dataset.json
+++ b/packages/web/i18n/zh-CN/dataset.json
@@ -45,6 +45,7 @@
   "core.dataset.import.Adjust parameters": "调整参数",
   "custom_data_process_params": "自定义",
   "custom_data_process_params_desc": 
"自定义设置数据处理规则", + "custom_split_char": "分隔符", "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。", "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引", "data_error_amount": "{{errorAmount}} 组训练异常", @@ -117,6 +118,11 @@ "insert_images_success": "新增图片成功,需等待训练完成才会展示", "is_open_schedule": "启用定时同步", "keep_image": "保留图片", + "llm_paragraph_mode": "模型识别段落(Beta)", + "llm_paragraph_mode_auto": "自动", + "llm_paragraph_mode_auto_desc": "当文件内容不包含 Markdown 标题时,启用模型自动识别标题。", + "llm_paragraph_mode_forbid": "禁用", + "llm_paragraph_mode_forbid_desc": "强制禁用模型自动识别段落", "loading": "加载中...", "max_chunk_size": "最大分块大小", "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。", diff --git a/packages/web/i18n/zh-Hant/account_usage.json b/packages/web/i18n/zh-Hant/account_usage.json index fe1145801..db2f54fbd 100644 --- a/packages/web/i18n/zh-Hant/account_usage.json +++ b/packages/web/i18n/zh-Hant/account_usage.json @@ -20,8 +20,10 @@ "export_title": "時間,成員,類型,項目名,AI 積分消耗", "feishu": "飛書", "generation_time": "生成時間", + "image_index": "圖片索引", "image_parse": "圖片標註", "input_token_length": "輸入 tokens", + "llm_paragraph": "模型分段", "mcp": "MCP 調用", "member": "成員", "member_name": "成員名", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index d5bc64a43..6d5d1789e 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -44,6 +44,7 @@ "core.dataset.import.Adjust parameters": "調整參數", "custom_data_process_params": "自訂", "custom_data_process_params_desc": "自訂資料處理規則", + "custom_split_char": "分隔符", "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的資料,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.”表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如:* () [] {} 等。", "data_amount": "{{dataAmount}} 組資料,{{indexAmount}} 組索引", "data_error_amount": "{{errorAmount}} 組訓練異常", @@ -116,6 +117,11 @@ "insert_images_success": "新增圖片成功,需等待訓練完成才會展示", "is_open_schedule": "啟用定時同步", "keep_image": "保留圖片", + "llm_paragraph_mode": "模型識別段落(Beta)", + "llm_paragraph_mode_auto": "自動", + "llm_paragraph_mode_auto_desc": "當文件內容不包含 Markdown 標題時,啟用模型自動識別標題。", + "llm_paragraph_mode_forbid": "禁用", + "llm_paragraph_mode_forbid_desc": "強制禁用模型自動識別段落", "loading": "加載中...", "max_chunk_size": "最大分塊大小", "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 145765dff..2d422b810 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -89,6 +89,9 @@ importers: json5: specifier: ^2.2.3 version: 2.2.3 + lodash: + specifier: ^4.17.21 + version: 4.17.21 nanoid: specifier: ^5.1.3 version: 5.1.3 @@ -108,6 +111,9 @@ importers: '@types/js-yaml': specifier: ^4.0.9 version: 4.0.9 + '@types/lodash': + specifier: ^4.14.191 + version: 4.17.16 '@types/node': specifier: 20.14.0 version: 20.14.0 diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx index 3c5d7f5c1..ace2df1a9 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/WebsiteConfig.tsx @@ -9,25 +9,14 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep'; import MyDivider from '@fastgpt/web/components/common/MyDivider'; import React from 'react'; import { Box, Link, Input, Button, ModalBody, ModalFooter, Stack } from '@chakra-ui/react'; -import { - DataChunkSplitModeEnum, - DatasetCollectionDataProcessModeEnum -} from 
'@fastgpt/global/core/dataset/constants'; -import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; import { useContextSelector } from 'use-context-selector'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; -import CollectionChunkForm, { - collectionChunkForm2StoreChunkData, - type CollectionChunkFormType -} from '../Form/CollectionChunkForm'; -import { - getAutoIndexSize, - getLLMDefaultChunkSize -} from '@fastgpt/global/core/dataset/training/utils'; +import CollectionChunkForm, { type CollectionChunkFormType } from '../Form/CollectionChunkForm'; import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type'; import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm'; import { defaultFormData } from '../Import/Context'; +import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils'; export type WebsiteConfigFormType = { websiteConfig: { @@ -80,7 +69,7 @@ const WebsiteConfigModal = ({ const form = useForm({ defaultValues: { - trainingType: chunkSettings?.trainingType, + trainingType: chunkSettings?.trainingType || defaultFormData.trainingType, chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType, chunkTriggerMinSize: @@ -204,9 +193,9 @@ const WebsiteConfigModal = ({ form.handleSubmit((data) => onSuccess({ websiteConfig: websiteInfoGetValues(), - chunkSettings: collectionChunkForm2StoreChunkData({ + chunkSettings: computedCollectionChunkSettings({ ...data, - agentModel: datasetDetail.agentModel, + llmModel: datasetDetail.agentModel, vectorModel: datasetDetail.vectorModel }) }) diff --git a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx index b7d9d31be..3ac7164cb 100644 --- a/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Form/CollectionChunkForm.tsx @@ -17,7 +17,7 @@ import { } from '@chakra-ui/react'; import MyIcon from '@fastgpt/web/components/common/Icon'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; -import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { DataChunkSplitModeEnum, @@ -133,6 +133,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn { const list = { @@ -362,11 +363,35 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn { setValue('chunkSplitMode', e); }} + fontSize={'md'} /> {chunkSplitMode === DataChunkSplitModeEnum.paragraph && ( <> - + + {t('dataset:llm_paragraph_mode')} + + size={'sm'} + bg={'myGray.50'} + value={paragraphChunkAIMode} + onChange={(e) => { + setValue('paragraphChunkAIMode', e); + }} + list={[ + { + label: t('dataset:llm_paragraph_mode_forbid'), + value: ParagraphChunkAIModeEnum.forbid, + description: t('dataset:llm_paragraph_mode_forbid_desc') + }, + { + label: t('dataset:llm_paragraph_mode_auto'), + value: ParagraphChunkAIModeEnum.auto, + description: t('dataset:llm_paragraph_mode_auto_desc') + } + ]} + /> + + {t('dataset:paragraph_max_deep')} - + {t('dataset:max_chunk_size')} + {t('dataset:chunk_size')} - - - list={customSplitList} - size={'sm'} - bg={'myGray.50'} - 
value={customListSelectValue} - h={'32px'} - onChange={(val) => { - if (val === 'Other') { - setValue('chunkSplitter', ''); - } else { - setValue('chunkSplitter', val); - } - setCustomListSelectValue(val); - }} - /> - - {customListSelectValue === 'Other' && ( - - )} - + + {t('dataset:custom_split_char')} + + + + list={customSplitList} + size={'sm'} + bg={'myGray.50'} + value={customListSelectValue} + h={'32px'} + onChange={(val) => { + if (val === 'Other') { + setValue('chunkSplitter', ''); + } else { + setValue('chunkSplitter', val); + } + setCustomListSelectValue(val); + }} + /> + + {customListSelectValue === 'Other' && ( + + )} + + )} {trainingType === DatasetCollectionDataProcessModeEnum.chunk && ( - - + + {t('dataset:index_size')} - + bg={'myGray.50'} list={indexSizeSeletorList} @@ -490,7 +518,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn + {t('common:core.dataset.collection.QA Prompt')} { - const { - trainingType, - autoIndexes, - chunkSettingMode, - chunkSize, - chunkSplitter, - indexSize, - qaPrompt - } = data; - - // 根据处理方式,获取 auto 和 custom 的参数。 - const trainingModeSize: { - autoChunkSize: number; - autoIndexSize: number; - chunkSize: number; - indexSize: number; - } = (() => { - if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { - return { - autoChunkSize: getLLMDefaultChunkSize(agentModel), - autoIndexSize: getMaxIndexSize(vectorModel), - chunkSize, - indexSize: getMaxIndexSize(vectorModel) - }; - } else if (autoIndexes) { - return { - autoChunkSize: chunkAutoChunkSize, - autoIndexSize: getAutoIndexSize(vectorModel), - chunkSize, - indexSize - }; - } else { - return { - autoChunkSize: chunkAutoChunkSize, - autoIndexSize: getAutoIndexSize(vectorModel), - chunkSize, - indexSize - }; - } - })(); - - // 获取真实参数 - const { - chunkSize: formatChunkIndex, - indexSize: formatIndexSize, - chunkSplitter: formatChunkSplitter - } = (() => { - if (chunkSettingMode === ChunkSettingModeEnum.auto) { - return { - chunkSize: trainingModeSize.autoChunkSize, - indexSize: trainingModeSize.autoIndexSize, - chunkSplitter: '' - }; - } else { - return { - chunkSize: trainingModeSize.chunkSize, - indexSize: trainingModeSize.indexSize, - chunkSplitter - }; - } - })(); - - return { - ...data, - chunkSize: formatChunkIndex, - indexSize: formatIndexSize, - chunkSplitter: formatChunkSplitter, - qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? 
qaPrompt : undefined - }; -}; diff --git a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx index a55ccf4c7..c2cbb9d92 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx @@ -52,7 +52,7 @@ export const defaultFormData: ImportFormType = { chunkSettingMode: ChunkSettingModeEnum.auto, chunkSplitMode: DataChunkSplitModeEnum.paragraph, - paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto, + paragraphChunkAIMode: ParagraphChunkAIModeEnum.forbid, paragraphChunkDeep: 5, paragraphChunkMinSize: 100, @@ -198,10 +198,10 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode const vectorModel = datasetDetail.vectorModel; const processParamsForm = useForm({ - defaultValues: { + defaultValues: (() => ({ ...defaultFormData, indexSize: getAutoIndexSize(vectorModel) - } + }))() }); const [sources, setSources] = useState([]); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx index 56dd189f8..9116f6495 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/PreviewData.tsx @@ -17,7 +17,6 @@ import MyBox from '@fastgpt/web/components/common/MyBox'; import Markdown from '@/components/Markdown'; import { useToast } from '@fastgpt/web/hooks/useToast'; import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; -import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm'; const PreviewData = () => { const { t } = useTranslation(); @@ -37,11 +36,7 @@ const PreviewData = () => { async () => { if (!previewFile) return { chunks: [], total: 0 }; - const chunkData = collectionChunkForm2StoreChunkData({ - ...processParamsForm.getValues(), - vectorModel: datasetDetail.vectorModel, - agentModel: datasetDetail.agentModel - }); + const chunkData = processParamsForm.getValues(); if (importSource === ImportDataSourceEnum.fileCustom) { const chunkSplitter = processParamsForm.getValues('chunkSplitter'); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx index bffc8e16e..c992c5141 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/commonProgress/Upload.tsx @@ -37,7 +37,6 @@ import { useContextSelector } from 'use-context-selector'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { DatasetImportContext, type ImportFormType } from '../Context'; import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d'; -import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm'; const Upload = () => { const { t } = useTranslation(); @@ -82,12 +81,6 @@ const Upload = () => { const { runAsync: startUpload, loading: isLoading } = useRequest2( async ({ customPdfParse, webSelector, ...data }: ImportFormType) => { - const chunkData = collectionChunkForm2StoreChunkData({ - ...data, - vectorModel: datasetDetail.vectorModel, - agentModel: datasetDetail.agentModel - }); - if (sources.length === 0) return; const filterWaitingSources = 
sources.filter((item) => item.createStatus === 'waiting'); @@ -108,7 +101,7 @@ const Upload = () => { const commonParams: ApiCreateDatasetCollectionParams & { name: string; } = { - ...chunkData, + ...data, parentId, datasetId: datasetDetail._id, name: item.sourceName, diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts index 540ec199d..b6a8c1dfb 100644 --- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts +++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts @@ -1,7 +1,3 @@ -import { - ChunkSettingModeEnum, - DatasetCollectionDataProcessModeEnum -} from '@fastgpt/global/core/dataset/constants'; import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read'; import { NextAPI } from '@/service/middleware/entry'; @@ -13,13 +9,11 @@ import { import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { - computeChunkSize, - computeChunkSplitter, - computeParagraphChunkDeep, + computedCollectionChunkSettings, getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; -import { getLLMModel } from '@fastgpt/service/core/ai/model'; +import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model'; import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type'; export type PostPreviewFilesChunksProps = ChunkSettingsType & { @@ -52,22 +46,12 @@ async function handler( sourceId, customPdfParse = false, - trainingType = DatasetCollectionDataProcessModeEnum.chunk, - - chunkTriggerType, - chunkTriggerMinSize, - - chunkSettingMode = ChunkSettingModeEnum.auto, - chunkSplitMode, - paragraphChunkDeep, - paragraphChunkMinSize, - chunkSize, - chunkSplitter, - overlapRatio, selector, datasetId, - externalFileId + externalFileId, + + ...chunkSettings } = req.body; if (!sourceId) { @@ -97,22 +81,10 @@ async function handler( return Promise.reject(CommonErrEnum.unAuthFile); } - chunkSize = computeChunkSize({ - trainingType, - chunkSettingMode, - chunkSplitMode, - chunkSize, - llmModel: getLLMModel(dataset.agentModel) - }); - chunkSplitter = computeChunkSplitter({ - chunkSettingMode, - chunkSplitMode, - chunkSplitter - }); - paragraphChunkDeep = computeParagraphChunkDeep({ - chunkSettingMode, - chunkSplitMode, - paragraphChunkDeep + const formatChunkSettings = computedCollectionChunkSettings({ + ...chunkSettings, + llmModel: getLLMModel(dataset.agentModel), + vectorModel: getEmbeddingModel(dataset.vectorModel) }); const { rawText } = await readDatasetSourceRawText({ @@ -126,16 +98,16 @@ async function handler( apiDatasetServer: dataset.apiDatasetServer }); - const chunks = rawText2Chunks({ + const chunks = await rawText2Chunks({ rawText, - chunkTriggerType, - chunkTriggerMinSize, - chunkSize, - paragraphChunkDeep, - paragraphChunkMinSize, + chunkTriggerType: formatChunkSettings.chunkTriggerType, + chunkTriggerMinSize: formatChunkSettings.chunkTriggerMinSize, + chunkSize: formatChunkSettings.chunkSize, + paragraphChunkDeep: formatChunkSettings.paragraphChunkDeep, + paragraphChunkMinSize: formatChunkSettings.paragraphChunkMinSize, maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), overlapRatio, - customReg: chunkSplitter ? 
[chunkSplitter] : [] + customReg: formatChunkSettings.chunkSplitter ? [formatChunkSettings.chunkSplitter] : [] }); return { diff --git a/projects/app/src/pages/api/core/dataset/update.ts b/projects/app/src/pages/api/core/dataset/update.ts index 7ea50dd42..02b3aaa7d 100644 --- a/projects/app/src/pages/api/core/dataset/update.ts +++ b/projects/app/src/pages/api/core/dataset/update.ts @@ -40,6 +40,8 @@ import { isEqual } from 'lodash'; import { addOperationLog } from '@fastgpt/service/support/operationLog/addOperationLog'; import { OperationLogEventEnum } from '@fastgpt/global/support/operationLog/constants'; import { getI18nDatasetType } from '@fastgpt/service/support/operationLog/util'; +import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model'; +import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils'; export type DatasetUpdateQuery = {}; export type DatasetUpdateResponse = any; @@ -59,7 +61,7 @@ async function handler( req: ApiRequestProps, _res: ApiResponseType ): Promise { - const { + let { id, parentId, name, @@ -89,6 +91,14 @@ async function handler( let targetName = ''; + chunkSettings = chunkSettings + ? computedCollectionChunkSettings({ + ...chunkSettings, + llmModel: getLLMModel(dataset.agentModel), + vectorModel: getEmbeddingModel(dataset.vectorModel) + }) + : undefined; + if (isMove) { if (parentId) { // move to a folder, check the target folder's permission diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index d076f78e3..336455c24 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -16,9 +16,9 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { type ClientSession } from '@fastgpt/service/common/mongo'; import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; -import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter'; import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken'; import { deleteDatasetImage } from '@fastgpt/service/core/dataset/image/controller'; +import { text2Chunks } from '@fastgpt/service/worker/function'; const formatIndexes = async ({ indexes = [], @@ -40,7 +40,7 @@ const formatIndexes = async ({ }[] > => { /* get dataset data default index */ - const getDefaultIndex = ({ + const getDefaultIndex = async ({ q = '', a, indexSize @@ -49,13 +49,15 @@ const formatIndexes = async ({ a?: string; indexSize: number; }) => { - const qChunks = splitText2Chunks({ - text: q, - chunkSize: indexSize, - maxSize: maxIndexSize - }).chunks; + const qChunks = ( + await text2Chunks({ + text: q, + chunkSize: indexSize, + maxSize: maxIndexSize + }) + ).chunks; const aChunks = a - ? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks + ? 
(await text2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize })).chunks : []; return [ @@ -80,7 +82,7 @@ const formatIndexes = async ({ .filter((item) => !!item.text.trim()); // Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds - const defaultIndexes = getDefaultIndex({ q, a, indexSize }); + const defaultIndexes = await getDefaultIndex({ q, a, indexSize }); const concatDefaultIndexes = defaultIndexes.map((item) => { const oldIndex = indexes!.find((index) => index.text === item.text); @@ -114,11 +116,13 @@ const formatIndexes = async ({ // If oversize tokens, split it const tokens = await countPromptTokens(item.text); if (tokens > maxIndexSize) { - const splitText = splitText2Chunks({ - text: item.text, - chunkSize: indexSize, - maxSize: maxIndexSize - }).chunks; + const splitText = ( + await text2Chunks({ + text: item.text, + chunkSize: indexSize, + maxSize: maxIndexSize + }) + ).chunks; return splitText.map((text) => ({ text, type: item.type diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts index eccd4a8b5..f68abfbb4 100644 --- a/projects/app/src/service/core/dataset/queues/datasetParse.ts +++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts @@ -1,6 +1,6 @@ /* Dataset collection source parse, not max size. */ -import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants'; import { DatasetCollectionDataProcessModeEnum, DatasetCollectionTypeEnum, @@ -29,7 +29,7 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema'; import { hashStr } from '@fastgpt/global/common/string/tools'; import { POST } from '@fastgpt/service/common/api/plusRequest'; -import { deleteRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/controller'; +import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller'; const requestLLMPargraph = async ({ rawText, @@ -42,13 +42,11 @@ const requestLLMPargraph = async ({ billId: string; paragraphChunkAIMode: ParagraphChunkAIModeEnum; }) => { - return { - resultText: rawText, - totalInputTokens: 0, - totalOutputTokens: 0 - }; - - if (!global.feConfigs?.isPlus || !paragraphChunkAIMode) { + if ( + !global.feConfigs?.isPlus || + !paragraphChunkAIMode || + paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid + ) { return { resultText: rawText, totalInputTokens: 0, @@ -57,16 +55,16 @@ const requestLLMPargraph = async ({ } // Check is markdown text(Include 1 group of title) - // if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) { - // const isMarkdown = /^(#+)\s/.test(rawText); - // if (isMarkdown) { - // return { - // resultText: rawText, - // totalInputTokens: 0, - // totalOutputTokens: 0 - // }; - // } - // } + if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) { + const isMarkdown = /^(#+)\s/.test(rawText); + if (isMarkdown) { + return { + resultText: rawText, + totalInputTokens: 0, + totalOutputTokens: 0 + }; + } + } const data = await POST<{ resultText: string; @@ -226,15 +224,25 @@ export const datasetParseQueue = async (): Promise => { }); // 3. 
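Usage rows for training are positional: pushLLMTrainingUsage maps each mode to the row that createTrainingUsage inserted, per the datasetTrainingUsageIndexMap shown earlier in this patch. A sketch with placeholder values (all ids and the model name below are illustrative):

    import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';

    // mode 'paragraph' resolves to listIndex 1, the 'llm_paragraph' row;
    // 'qa' -> 2, 'autoIndex' -> 3, 'imageIndex' -> 4, 'imageParse' -> 5.
    await pushLLMTrainingUsage({
      teamId: '<teamId>',
      tmbId: '<tmbId>',
      model: '<agent model name>',
      inputTokens: 1200,
      outputTokens: 300,
      billId: '<billId>',
      mode: 'paragraph'
    });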
diff --git a/projects/app/src/service/core/dataset/queues/datasetParse.ts b/projects/app/src/service/core/dataset/queues/datasetParse.ts
index eccd4a8b5..f68abfbb4 100644
--- a/projects/app/src/service/core/dataset/queues/datasetParse.ts
+++ b/projects/app/src/service/core/dataset/queues/datasetParse.ts
@@ -1,6 +1,6 @@
 /* Dataset collection source parse, not max size. */
-import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
+import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
 import {
   DatasetCollectionDataProcessModeEnum,
   DatasetCollectionTypeEnum,
@@ -29,7 +29,7 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
 import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { POST } from '@fastgpt/service/common/api/plusRequest';
-import { deleteRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/controller';
+import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
 
 const requestLLMPargraph = async ({
   rawText,
   model,
   billId,
   paragraphChunkAIMode
@@ -42,13 +42,11 @@ const requestLLMPargraph = async ({
   billId: string;
   paragraphChunkAIMode: ParagraphChunkAIModeEnum;
 }) => {
-  return {
-    resultText: rawText,
-    totalInputTokens: 0,
-    totalOutputTokens: 0
-  };
-
-  if (!global.feConfigs?.isPlus || !paragraphChunkAIMode) {
+  if (
+    !global.feConfigs?.isPlus ||
+    !paragraphChunkAIMode ||
+    paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid
+  ) {
     return {
       resultText: rawText,
       totalInputTokens: 0,
@@ -57,16 +55,16 @@ const requestLLMPargraph = async ({
   }
 
   // Check whether the text is already markdown (contains at least one group of headings)
-  // if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
-  //   const isMarkdown = /^(#+)\s/.test(rawText);
-  //   if (isMarkdown) {
-  //     return {
-  //       resultText: rawText,
-  //       totalInputTokens: 0,
-  //       totalOutputTokens: 0
-  //     };
-  //   }
-  // }
+  if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
+    const isMarkdown = /^(#+)\s/.test(rawText);
+    if (isMarkdown) {
+      return {
+        resultText: rawText,
+        totalInputTokens: 0,
+        totalOutputTokens: 0
+      };
+    }
+  }
 
   const data = await POST<{
     resultText: string;
@@ -226,15 +224,25 @@ export const datasetParseQueue = async (): Promise => {
     });
 
     // 3. LLM Paragraph
-    const { resultText } = await requestLLMPargraph({
+    const { resultText, totalInputTokens, totalOutputTokens } = await requestLLMPargraph({
       rawText,
       model: dataset.agentModel,
       billId: data.billId,
       paragraphChunkAIMode: collection.paragraphChunkAIMode
     });
+    // Push usage
+    pushLLMTrainingUsage({
+      teamId: data.teamId,
+      tmbId: data.tmbId,
+      model: dataset.agentModel,
+      inputTokens: totalInputTokens,
+      outputTokens: totalOutputTokens,
+      billId: data.billId,
+      mode: 'paragraph'
+    });
 
     // 4. Chunk split
-    const chunks = rawText2Chunks({
+    const chunks = await rawText2Chunks({
       rawText: resultText,
       chunkTriggerType: collection.chunkTriggerType,
       chunkTriggerMinSize: collection.chunkTriggerMinSize,
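Condensed, the gate that requestLLMPargraph now applies before calling the plus endpoint looks like the predicate below. The enum values mirror the diff; the helper itself is an illustrative distillation, not code from the repo:

enum ParagraphChunkAIModeEnum {
  auto = 'auto',
  force = 'force',
  forbid = 'forbid'
}

export const shouldRequestLLMParagraph = ({
  rawText,
  paragraphChunkAIMode,
  isPlus
}: {
  rawText: string;
  paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
  isPlus: boolean;
}): boolean => {
  // Commercial-only feature; an unset mode or 'forbid' skips the LLM entirely.
  if (
    !isPlus ||
    !paragraphChunkAIMode ||
    paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid
  ) {
    return false;
  }
  // In auto mode, text that already opens with a markdown heading keeps its own structure.
  if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto && /^(#+)\s/.test(rawText)) {
    return false;
  }
  // 'force' (and non-markdown 'auto') falls through to the LLM paragraph request.
  return true;
};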
diff --git a/projects/app/src/service/core/dataset/queues/generateQA.ts b/projects/app/src/service/core/dataset/queues/generateQA.ts
index e4fb1d355..e7b5c6b6f 100644
--- a/projects/app/src/service/core/dataset/queues/generateQA.ts
+++ b/projects/app/src/service/core/dataset/queues/generateQA.ts
@@ -1,10 +1,9 @@
 import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
-import { pushQAUsage } from '@/service/support/wallet/usage/push';
+import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
 import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { createChatCompletion } from '@fastgpt/service/core/ai/config';
 import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
 import { addLog } from '@fastgpt/service/common/system/log';
-import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { replaceVariable } from '@fastgpt/global/common/string/tools';
 import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
 import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
@@ -24,6 +23,7 @@ import {
   getLLMMaxChunkSize
 } from '@fastgpt/global/core/dataset/training/utils';
 import { getErrText } from '@fastgpt/global/common/error/utils';
+import { text2Chunks } from '@fastgpt/service/worker/function';
 
 const reduceQueue = () => {
   global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -144,7 +144,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
       const inputTokens = usage?.prompt_tokens || (await countGptMessagesTokens(messages));
       const outputTokens = usage?.completion_tokens || (await countPromptTokens(answer));
 
-      const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // Formatted QA pairs
+      const qaArr = await formatSplitText({ answer, rawText: text, llmModel: modelData }); // Formatted QA pairs
 
       // get vector and insert
       await pushDataListToTrainingQueueByCollectionId({
@@ -163,13 +163,14 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
       await MongoDatasetTraining.findByIdAndDelete(data._id);
 
       // add bill
-      pushQAUsage({
+      pushLLMTrainingUsage({
         teamId: data.teamId,
         tmbId: data.tmbId,
         inputTokens,
         outputTokens,
         billId: data.billId,
-        model: modelData.model
+        model: modelData.model,
+        mode: 'qa'
       });
       addLog.info(`[QA Queue] Finish`, {
         time: Date.now() - startTime,
@@ -196,7 +197,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
 }
 
 // Format QA answer
-function formatSplitText({
+async function formatSplitText({
   answer,
   rawText,
   llmModel
@@ -223,7 +224,7 @@ function formatSplitText({
 
   // Empty result: split the chunk directly
   if (result.length === 0) {
-    const { chunks } = splitText2Chunks({
+    const { chunks } = await text2Chunks({
       text: rawText,
       chunkSize: chunkAutoChunkSize,
       maxSize: getLLMMaxChunkSize(llmModel)
diff --git a/projects/app/src/service/support/wallet/usage/push.ts b/projects/app/src/service/support/wallet/usage/push.ts
index 33997582d..d8b2e5e16 100644
--- a/projects/app/src/service/support/wallet/usage/push.ts
+++ b/projects/app/src/service/support/wallet/usage/push.ts
@@ -5,42 +5,6 @@ import { i18nT } from '@fastgpt/web/i18n/utils';
 import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';
 import { getDefaultTTSModel } from '@fastgpt/service/core/ai/model';
 
-export const pushQAUsage = async ({
-  teamId,
-  tmbId,
-  model,
-  inputTokens,
-  outputTokens,
-  billId
-}: {
-  teamId: string;
-  tmbId: string;
-  model: string;
-  inputTokens: number;
-  outputTokens: number;
-  billId: string;
-}) => {
-  // Calculate the price
-  const { totalPoints } = formatModelChars2Points({
-    model,
-    modelType: ModelTypeEnum.llm,
-    inputTokens,
-    outputTokens
-  });
-
-  concatUsage({
-    billId,
-    teamId,
-    tmbId,
-    totalPoints,
-    inputTokens,
-    outputTokens,
-    listIndex: 1
-  });
-
-  return { totalPoints };
-};
-
 export const pushGenerateVectorUsage = ({
   billId,
   teamId,
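With pushQAUsage deleted above, both training queues bill through a single pushLLMTrainingUsage helper whose mode flag ('paragraph' in datasetParse.ts, 'qa' in generateQA.ts) only distinguishes the usage item. A sketch of its call-site-facing shape, inferred from the two callers in this patch; the body is a stand-in, since the real pricing goes through formatModelChars2Points and concatUsage:

type LLMTrainingUsageParams = {
  teamId: string;
  tmbId: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  billId: string;
  mode: 'paragraph' | 'qa';
};

export const pushLLMTrainingUsage = ({
  teamId,
  tmbId,
  model,
  inputTokens,
  outputTokens,
  billId,
  mode
}: LLMTrainingUsageParams) => {
  // Stand-in pricing: the real helper converts tokens to points per model.
  const totalPoints = (inputTokens + outputTokens) / 1000;
  // Stand-in sink: the real helper concatenates an item onto the bill for billId.
  console.log(`[training usage:${mode}]`, { teamId, tmbId, model, billId, totalPoints });
  return { totalPoints };
};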
diff --git a/test/cases/function/packages/service/core/dataset/textSplitter.test.ts b/test/cases/function/packages/service/core/dataset/textSplitter.test.ts
index d1bcd79ba..ef61821c9 100644
--- a/test/cases/function/packages/service/core/dataset/textSplitter.test.ts
+++ b/test/cases/function/packages/service/core/dataset/textSplitter.test.ts
@@ -16,7 +16,7 @@ const formatResult = (result: string[]) => {
 };
 
 // Max-size chunk test: below max size, no split
-it(`Test splitText2Chunks 1`, () => {
+it(`Test splitText2Chunks 1`, async () => {
   const mock = {
     text: `# A
 
@@ -61,7 +61,7 @@ dsgsgfsgs22sddddddd`
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
     chunkTriggerMinSize: 1000,
@@ -72,7 +72,7 @@ dsgsgfsgs22sddddddd`
   expect(formatChunks(data)).toEqual(formatResult(mock.result));
 });
 // Max-size chunk test: above max size, split
-it(`Test splitText2Chunks 2`, () => {
+it(`Test splitText2Chunks 2`, async () => {
   const mock = {
     text: `# A
 
@@ -122,7 +122,7 @@ dsgsgfsgs22sddddddd`
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.maxSize,
     chunkTriggerMinSize: 10,
@@ -135,7 +135,7 @@ dsgsgfsgs22sddddddd`
 });
 
 // Min-size chunk test: above min size, no split
-it(`Test splitText2Chunks 3`, () => {
+it(`Test splitText2Chunks 3`, async () => {
   const mock = {
     text: `# A
 
@@ -179,7 +179,7 @@ it(`Test splitText2Chunks 3`, () => {
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
     chunkTriggerMinSize: 1000,
@@ -191,7 +191,7 @@ it(`Test splitText2Chunks 3`, () => {
   expect(formatChunks(data)).toEqual(formatResult(mock.result));
 });
 // Min-size chunk test: below min size, split
-it(`Test splitText2Chunks 4`, () => {
+it(`Test splitText2Chunks 4`, async () => {
   const mock = {
     text: `# A
 
@@ -241,7 +241,7 @@ dsgsgfsgs22sddddddd`,
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
     chunkTriggerMinSize: 10,
@@ -254,7 +254,7 @@ dsgsgfsgs22sddddddd`,
 });
 
 // Force-chunk test: below both min and max size
-it(`Test splitText2Chunks 5`, () => {
+it(`Test splitText2Chunks 5`, async () => {
   const mock = {
     text: `# A
 
@@ -304,7 +304,7 @@ dsgsgfsgs22sddddddd`,
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
     chunkTriggerMinSize: 1000,
@@ -317,7 +317,7 @@ dsgsgfsgs22sddddddd`,
 });
 
 // Force-chunk test: above min size
-it(`Test splitText2Chunks 6`, () => {
+it(`Test splitText2Chunks 6`, async () => {
   const mock = {
     text: `# A
 
@@ -367,7 +367,7 @@ dsgsgfsgs22sddddddd`,
     ]
   };
 
-  const data = rawText2Chunks({
+  const data = await rawText2Chunks({
     rawText: mock.text,
     chunkTriggerType: ChunkTriggerConfigTypeEnum.forceChunk,
     chunkTriggerMinSize: 10,