perf: password special chars;feat: llm paragraph;perf: chunk setting params;perf: text splitter worker (#4984)

* perf: password special chars

* feat: llm paragraph;perf: chunk setting params

* perf: text splitter worker

* perf: get rawtext buffer

* fix: test

* fix: test

* doc

* min chunk size
This commit is contained in:
Archer
2025-06-10 00:05:54 +08:00
committed by GitHub
parent 068918a9ee
commit 01ff56b42b
41 changed files with 546 additions and 448 deletions

View File

@@ -9,25 +9,14 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep';
import MyDivider from '@fastgpt/web/components/common/MyDivider';
import React from 'react';
import { Box, Link, Input, Button, ModalBody, ModalFooter, Stack } from '@chakra-ui/react';
import {
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import { useContextSelector } from 'use-context-selector';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import CollectionChunkForm, {
collectionChunkForm2StoreChunkData,
type CollectionChunkFormType
} from '../Form/CollectionChunkForm';
import {
getAutoIndexSize,
getLLMDefaultChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import CollectionChunkForm, { type CollectionChunkFormType } from '../Form/CollectionChunkForm';
import { type ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm';
import { defaultFormData } from '../Import/Context';
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
export type WebsiteConfigFormType = {
websiteConfig: {
@@ -80,7 +69,7 @@ const WebsiteConfigModal = ({
const form = useForm<CollectionChunkFormType>({
defaultValues: {
trainingType: chunkSettings?.trainingType,
trainingType: chunkSettings?.trainingType || defaultFormData.trainingType,
chunkTriggerType: chunkSettings?.chunkTriggerType || defaultFormData.chunkTriggerType,
chunkTriggerMinSize:
@@ -204,9 +193,9 @@ const WebsiteConfigModal = ({
form.handleSubmit((data) =>
onSuccess({
websiteConfig: websiteInfoGetValues(),
chunkSettings: collectionChunkForm2StoreChunkData({
chunkSettings: computedCollectionChunkSettings({
...data,
agentModel: datasetDetail.agentModel,
llmModel: datasetDetail.agentModel,
vectorModel: datasetDetail.vectorModel
})
})

View File

@@ -17,7 +17,7 @@ import {
} from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ChunkTriggerConfigTypeEnum } from '@fastgpt/global/core/dataset/constants';
import {
DataChunkSplitModeEnum,
@@ -133,6 +133,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
const autoIndexes = watch('autoIndexes');
const indexSize = watch('indexSize');
const imageIndex = watch('imageIndex');
const paragraphChunkAIMode = watch('paragraphChunkAIMode');
const trainingModeList = useMemo(() => {
const list = {
@@ -362,11 +363,35 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
onChange={(e) => {
setValue('chunkSplitMode', e);
}}
fontSize={'md'}
/>
{chunkSplitMode === DataChunkSplitModeEnum.paragraph && (
<>
<Box mt={1.5}>
<Box mt={3}>
<Box fontSize={'sm'}>{t('dataset:llm_paragraph_mode')}</Box>
<MySelect<ParagraphChunkAIModeEnum>
size={'sm'}
bg={'myGray.50'}
value={paragraphChunkAIMode}
onChange={(e) => {
setValue('paragraphChunkAIMode', e);
}}
list={[
{
label: t('dataset:llm_paragraph_mode_forbid'),
value: ParagraphChunkAIModeEnum.forbid,
description: t('dataset:llm_paragraph_mode_forbid_desc')
},
{
label: t('dataset:llm_paragraph_mode_auto'),
value: ParagraphChunkAIModeEnum.auto,
description: t('dataset:llm_paragraph_mode_auto_desc')
}
]}
/>
</Box>
<Box mt={2} fontSize={'sm'}>
<Box>{t('dataset:paragraph_max_deep')}</Box>
<MyNumberInput
size={'sm'}
@@ -379,7 +404,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
h={'32px'}
/>
</Box>
<Box mt={1.5}>
<Box mt={2} fontSize={'sm'}>
<Box>{t('dataset:max_chunk_size')}</Box>
<Box
css={{
@@ -409,7 +434,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
)}
{chunkSplitMode === DataChunkSplitModeEnum.size && (
<Box mt={1.5}>
<Box mt={3} fontSize={'sm'}>
<Box>{t('dataset:chunk_size')}</Box>
<Box
css={{
@@ -438,45 +463,48 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
)}
{chunkSplitMode === DataChunkSplitModeEnum.char && (
<HStack mt={1.5}>
<Box flex={'1 0 0'}>
<MySelect<string>
list={customSplitList}
size={'sm'}
bg={'myGray.50'}
value={customListSelectValue}
h={'32px'}
onChange={(val) => {
if (val === 'Other') {
setValue('chunkSplitter', '');
} else {
setValue('chunkSplitter', val);
}
setCustomListSelectValue(val);
}}
/>
</Box>
{customListSelectValue === 'Other' && (
<Input
flex={'1 0 0'}
h={'32px'}
size={'sm'}
bg={'myGray.50'}
placeholder="\n;======;==SPLIT=="
{...register('chunkSplitter')}
/>
)}
</HStack>
<Box mt={3} fontSize={'sm'}>
<Box>{t('dataset:custom_split_char')}</Box>
<HStack>
<Box flex={'1 0 0'}>
<MySelect<string>
list={customSplitList}
size={'sm'}
bg={'myGray.50'}
value={customListSelectValue}
h={'32px'}
onChange={(val) => {
if (val === 'Other') {
setValue('chunkSplitter', '');
} else {
setValue('chunkSplitter', val);
}
setCustomListSelectValue(val);
}}
/>
</Box>
{customListSelectValue === 'Other' && (
<Input
flex={'1 0 0'}
h={'32px'}
size={'sm'}
bg={'myGray.50'}
placeholder="\n;======;==SPLIT=="
{...register('chunkSplitter')}
/>
)}
</HStack>
</Box>
)}
</Box>
{trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
<Box>
<Flex alignItems={'center'} mt={3}>
<Box fontSize={'sm'} mt={2}>
<Flex alignItems={'center'}>
<Box>{t('dataset:index_size')}</Box>
<QuestionTip label={t('dataset:index_size_tips')} />
</Flex>
<Box mt={1}>
<Box>
<MySelect<number>
bg={'myGray.50'}
list={indexSizeSeletorList}
@@ -490,7 +518,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
)}
{showQAPromptInput && (
<Box mt={3}>
<Box mt={2}>
<Box>{t('common:core.dataset.collection.QA Prompt')}</Box>
<Box
position={'relative'}
@@ -570,83 +598,3 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
};
export default CollectionChunkForm;
// Get chunk settings from form
/**
 * Convert the values of the chunk-settings form into the chunk data to store.
 *
 * - When `chunkSettingMode` is `auto`, `chunkSize` and `indexSize` are replaced
 *   by the automatic values and `chunkSplitter` is cleared to `''`.
 * - Otherwise the form's `chunkSize` and `chunkSplitter` are kept; `indexSize`
 *   is kept as well, except for QA training where it is always
 *   `getMaxIndexSize(vectorModel)`.
 * - `qaPrompt` is only kept for QA training and is `undefined` otherwise.
 *
 * @param agentModel - LLM model; passed to `getLLMDefaultChunkSize` for QA training.
 * @param vectorModel - Embedding model; passed to `getMaxIndexSize` / `getAutoIndexSize`.
 * @returns The form data with `chunkSize`, `indexSize`, `chunkSplitter` and
 *   `qaPrompt` resolved as described above.
 */
export const collectionChunkForm2StoreChunkData = ({
  agentModel,
  vectorModel,
  ...data
}: CollectionChunkFormType & {
  agentModel: LLMModelItemType;
  vectorModel: EmbeddingModelItemType;
}): CollectionChunkFormType => {
  const { trainingType, chunkSettingMode, chunkSize, chunkSplitter, indexSize, qaPrompt } = data;

  const isQA = trainingType === DatasetCollectionDataProcessModeEnum.qa;
  const isAuto = chunkSettingMode === ChunkSettingModeEnum.auto;

  // Resolve the automatic sizes and the custom index size for the training type.
  // Non-QA training used to be split on `autoIndexes`, but both branches
  // produced the same values, so a single branch is enough.
  const sizes: {
    autoChunkSize: number;
    autoIndexSize: number;
    customIndexSize: number;
  } = isQA
    ? {
        autoChunkSize: getLLMDefaultChunkSize(agentModel),
        autoIndexSize: getMaxIndexSize(vectorModel),
        customIndexSize: getMaxIndexSize(vectorModel)
      }
    : {
        autoChunkSize: chunkAutoChunkSize,
        autoIndexSize: getAutoIndexSize(vectorModel),
        customIndexSize: indexSize
      };

  // Pick the effective values according to the setting mode.
  return {
    ...data,
    chunkSize: isAuto ? sizes.autoChunkSize : chunkSize,
    indexSize: isAuto ? sizes.autoIndexSize : sizes.customIndexSize,
    chunkSplitter: isAuto ? '' : chunkSplitter,
    qaPrompt: isQA ? qaPrompt : undefined
  };
};

View File

@@ -52,7 +52,7 @@ export const defaultFormData: ImportFormType = {
chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSplitMode: DataChunkSplitModeEnum.paragraph,
paragraphChunkAIMode: ParagraphChunkAIModeEnum.auto,
paragraphChunkAIMode: ParagraphChunkAIModeEnum.forbid,
paragraphChunkDeep: 5,
paragraphChunkMinSize: 100,
@@ -198,10 +198,10 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
const vectorModel = datasetDetail.vectorModel;
const processParamsForm = useForm<ImportFormType>({
defaultValues: {
defaultValues: (() => ({
...defaultFormData,
indexSize: getAutoIndexSize(vectorModel)
}
}))()
});
const [sources, setSources] = useState<ImportSourceItemType[]>([]);

View File

@@ -17,7 +17,6 @@ import MyBox from '@fastgpt/web/components/common/MyBox';
import Markdown from '@/components/Markdown';
import { useToast } from '@fastgpt/web/hooks/useToast';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
const PreviewData = () => {
const { t } = useTranslation();
@@ -37,11 +36,7 @@ const PreviewData = () => {
async () => {
if (!previewFile) return { chunks: [], total: 0 };
const chunkData = collectionChunkForm2StoreChunkData({
...processParamsForm.getValues(),
vectorModel: datasetDetail.vectorModel,
agentModel: datasetDetail.agentModel
});
const chunkData = processParamsForm.getValues();
if (importSource === ImportDataSourceEnum.fileCustom) {
const chunkSplitter = processParamsForm.getValues('chunkSplitter');

View File

@@ -37,7 +37,6 @@ import { useContextSelector } from 'use-context-selector';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DatasetImportContext, type ImportFormType } from '../Context';
import { type ApiCreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { collectionChunkForm2StoreChunkData } from '../../Form/CollectionChunkForm';
const Upload = () => {
const { t } = useTranslation();
@@ -82,12 +81,6 @@ const Upload = () => {
const { runAsync: startUpload, loading: isLoading } = useRequest2(
async ({ customPdfParse, webSelector, ...data }: ImportFormType) => {
const chunkData = collectionChunkForm2StoreChunkData({
...data,
vectorModel: datasetDetail.vectorModel,
agentModel: datasetDetail.agentModel
});
if (sources.length === 0) return;
const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@@ -108,7 +101,7 @@ const Upload = () => {
const commonParams: ApiCreateDatasetCollectionParams & {
name: string;
} = {
...chunkData,
...data,
parentId,
datasetId: datasetDetail._id,
name: item.sourceName,

View File

@@ -1,7 +1,3 @@
import {
ChunkSettingModeEnum,
DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
import { NextAPI } from '@/service/middleware/entry';
@@ -13,13 +9,11 @@ import {
import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import {
computeChunkSize,
computeChunkSplitter,
computeParagraphChunkDeep,
computedCollectionChunkSettings,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { getLLMModel } from '@fastgpt/service/core/ai/model';
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import type { ChunkSettingsType } from '@fastgpt/global/core/dataset/type';
export type PostPreviewFilesChunksProps = ChunkSettingsType & {
@@ -52,22 +46,12 @@ async function handler(
sourceId,
customPdfParse = false,
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
chunkTriggerType,
chunkTriggerMinSize,
chunkSettingMode = ChunkSettingModeEnum.auto,
chunkSplitMode,
paragraphChunkDeep,
paragraphChunkMinSize,
chunkSize,
chunkSplitter,
overlapRatio,
selector,
datasetId,
externalFileId
externalFileId,
...chunkSettings
} = req.body;
if (!sourceId) {
@@ -97,22 +81,10 @@ async function handler(
return Promise.reject(CommonErrEnum.unAuthFile);
}
chunkSize = computeChunkSize({
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize,
llmModel: getLLMModel(dataset.agentModel)
});
chunkSplitter = computeChunkSplitter({
chunkSettingMode,
chunkSplitMode,
chunkSplitter
});
paragraphChunkDeep = computeParagraphChunkDeep({
chunkSettingMode,
chunkSplitMode,
paragraphChunkDeep
const formatChunkSettings = computedCollectionChunkSettings({
...chunkSettings,
llmModel: getLLMModel(dataset.agentModel),
vectorModel: getEmbeddingModel(dataset.vectorModel)
});
const { rawText } = await readDatasetSourceRawText({
@@ -126,16 +98,16 @@ async function handler(
apiDatasetServer: dataset.apiDatasetServer
});
const chunks = rawText2Chunks({
const chunks = await rawText2Chunks({
rawText,
chunkTriggerType,
chunkTriggerMinSize,
chunkSize,
paragraphChunkDeep,
paragraphChunkMinSize,
chunkTriggerType: formatChunkSettings.chunkTriggerType,
chunkTriggerMinSize: formatChunkSettings.chunkTriggerMinSize,
chunkSize: formatChunkSettings.chunkSize,
paragraphChunkDeep: formatChunkSettings.paragraphChunkDeep,
paragraphChunkMinSize: formatChunkSettings.paragraphChunkMinSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio,
customReg: chunkSplitter ? [chunkSplitter] : []
customReg: formatChunkSettings.chunkSplitter ? [formatChunkSettings.chunkSplitter] : []
});
return {

View File

@@ -40,6 +40,8 @@ import { isEqual } from 'lodash';
import { addOperationLog } from '@fastgpt/service/support/operationLog/addOperationLog';
import { OperationLogEventEnum } from '@fastgpt/global/support/operationLog/constants';
import { getI18nDatasetType } from '@fastgpt/service/support/operationLog/util';
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
export type DatasetUpdateQuery = {};
export type DatasetUpdateResponse = any;
@@ -59,7 +61,7 @@ async function handler(
req: ApiRequestProps<DatasetUpdateBody, DatasetUpdateQuery>,
_res: ApiResponseType<any>
): Promise<DatasetUpdateResponse> {
const {
let {
id,
parentId,
name,
@@ -89,6 +91,14 @@ async function handler(
let targetName = '';
chunkSettings = chunkSettings
? computedCollectionChunkSettings({
...chunkSettings,
llmModel: getLLMModel(dataset.agentModel),
vectorModel: getEmbeddingModel(dataset.vectorModel)
})
: undefined;
if (isMove) {
if (parentId) {
// move to a folder, check the target folder's permission

View File

@@ -16,9 +16,9 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { type ClientSession } from '@fastgpt/service/common/mongo';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { deleteDatasetImage } from '@fastgpt/service/core/dataset/image/controller';
import { text2Chunks } from '@fastgpt/service/worker/function';
const formatIndexes = async ({
indexes = [],
@@ -40,7 +40,7 @@ const formatIndexes = async ({
}[]
> => {
/* get dataset data default index */
const getDefaultIndex = ({
const getDefaultIndex = async ({
q = '',
a,
indexSize
@@ -49,13 +49,15 @@ const formatIndexes = async ({
a?: string;
indexSize: number;
}) => {
const qChunks = splitText2Chunks({
text: q,
chunkSize: indexSize,
maxSize: maxIndexSize
}).chunks;
const qChunks = (
await text2Chunks({
text: q,
chunkSize: indexSize,
maxSize: maxIndexSize
})
).chunks;
const aChunks = a
? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
? (await text2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize })).chunks
: [];
return [
@@ -80,7 +82,7 @@ const formatIndexes = async ({
.filter((item) => !!item.text.trim());
// Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
const defaultIndexes = getDefaultIndex({ q, a, indexSize });
const defaultIndexes = await getDefaultIndex({ q, a, indexSize });
const concatDefaultIndexes = defaultIndexes.map((item) => {
const oldIndex = indexes!.find((index) => index.text === item.text);
@@ -114,11 +116,13 @@ const formatIndexes = async ({
// If oversize tokens, split it
const tokens = await countPromptTokens(item.text);
if (tokens > maxIndexSize) {
const splitText = splitText2Chunks({
text: item.text,
chunkSize: indexSize,
maxSize: maxIndexSize
}).chunks;
const splitText = (
await text2Chunks({
text: item.text,
chunkSize: indexSize,
maxSize: maxIndexSize
})
).chunks;
return splitText.map((text) => ({
text,
type: item.type

View File

@@ -1,6 +1,6 @@
/* Dataset collection source parse, not max size. */
import type { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum,
@@ -29,7 +29,7 @@ import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { POST } from '@fastgpt/service/common/api/plusRequest';
import { deleteRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/controller';
import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
const requestLLMPargraph = async ({
rawText,
@@ -42,13 +42,11 @@ const requestLLMPargraph = async ({
billId: string;
paragraphChunkAIMode: ParagraphChunkAIModeEnum;
}) => {
return {
resultText: rawText,
totalInputTokens: 0,
totalOutputTokens: 0
};
if (!global.feConfigs?.isPlus || !paragraphChunkAIMode) {
if (
!global.feConfigs?.isPlus ||
!paragraphChunkAIMode ||
paragraphChunkAIMode === ParagraphChunkAIModeEnum.forbid
) {
return {
resultText: rawText,
totalInputTokens: 0,
@@ -57,16 +55,16 @@ const requestLLMPargraph = async ({
}
// Check whether the text is markdown (i.e. it begins with a heading line)
// if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
// const isMarkdown = /^(#+)\s/.test(rawText);
// if (isMarkdown) {
// return {
// resultText: rawText,
// totalInputTokens: 0,
// totalOutputTokens: 0
// };
// }
// }
if (paragraphChunkAIMode === ParagraphChunkAIModeEnum.auto) {
const isMarkdown = /^(#+)\s/.test(rawText);
if (isMarkdown) {
return {
resultText: rawText,
totalInputTokens: 0,
totalOutputTokens: 0
};
}
}
const data = await POST<{
resultText: string;
@@ -226,15 +224,25 @@ export const datasetParseQueue = async (): Promise<any> => {
});
// 3. LLM Paragraph
const { resultText } = await requestLLMPargraph({
const { resultText, totalInputTokens, totalOutputTokens } = await requestLLMPargraph({
rawText,
model: dataset.agentModel,
billId: data.billId,
paragraphChunkAIMode: collection.paragraphChunkAIMode
});
// Push usage
pushLLMTrainingUsage({
teamId: data.teamId,
tmbId: data.tmbId,
model: dataset.agentModel,
inputTokens: totalInputTokens,
outputTokens: totalOutputTokens,
billId: data.billId,
mode: 'paragraph'
});
// 4. Chunk split
const chunks = rawText2Chunks({
const chunks = await rawText2Chunks({
rawText: resultText,
chunkTriggerType: collection.chunkTriggerType,
chunkTriggerMinSize: collection.chunkTriggerMinSize,

View File

@@ -1,10 +1,9 @@
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
import { pushQAUsage } from '@/service/support/wallet/usage/push';
import { pushLLMTrainingUsage } from '@fastgpt/service/support/wallet/usage/controller';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { createChatCompletion } from '@fastgpt/service/core/ai/config';
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
import { addLog } from '@fastgpt/service/common/system/log';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { replaceVariable } from '@fastgpt/global/common/string/tools';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
@@ -24,6 +23,7 @@ import {
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { text2Chunks } from '@fastgpt/service/worker/function';
const reduceQueue = () => {
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -144,7 +144,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
const inputTokens = usage?.prompt_tokens || (await countGptMessagesTokens(messages));
const outputTokens = usage?.completion_tokens || (await countPromptTokens(answer));
const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对
const qaArr = await formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对
// get vector and insert
await pushDataListToTrainingQueueByCollectionId({
@@ -163,13 +163,14 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
await MongoDatasetTraining.findByIdAndDelete(data._id);
// add bill
pushQAUsage({
pushLLMTrainingUsage({
teamId: data.teamId,
tmbId: data.tmbId,
inputTokens,
outputTokens,
billId: data.billId,
model: modelData.model
model: modelData.model,
mode: 'qa'
});
addLog.info(`[QA Queue] Finish`, {
time: Date.now() - startTime,
@@ -196,7 +197,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
}
// Format qa answer
function formatSplitText({
async function formatSplitText({
answer,
rawText,
llmModel
@@ -223,7 +224,7 @@ function formatSplitText({
// empty result. direct split chunk
if (result.length === 0) {
const { chunks } = splitText2Chunks({
const { chunks } = await text2Chunks({
text: rawText,
chunkSize: chunkAutoChunkSize,
maxSize: getLLMMaxChunkSize(llmModel)

View File

@@ -5,42 +5,6 @@ import { i18nT } from '@fastgpt/web/i18n/utils';
import { ModelTypeEnum } from '@fastgpt/global/core/ai/model';
import { getDefaultTTSModel } from '@fastgpt/service/core/ai/model';
/**
 * Record the usage of one QA generation call on an existing bill.
 *
 * Converts the token counts into points with `formatModelChars2Points`
 * (model type `llm`) and forwards them to `concatUsage` with `listIndex: 1`.
 *
 * @returns The computed `totalPoints`.
 */
export const pushQAUsage = async (usage: {
  teamId: string;
  tmbId: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  billId: string;
}) => {
  const { model, inputTokens, outputTokens } = usage;

  // Calculate the price
  const priced = formatModelChars2Points({
    model,
    modelType: ModelTypeEnum.llm,
    inputTokens,
    outputTokens
  });
  const totalPoints = priced.totalPoints;

  // Note: the result of concatUsage is not awaited here.
  concatUsage({
    billId: usage.billId,
    teamId: usage.teamId,
    tmbId: usage.tmbId,
    totalPoints,
    inputTokens,
    outputTokens,
    listIndex: 1
  });

  return { totalPoints };
};
export const pushGenerateVectorUsage = ({
billId,
teamId,