FastGPT/packages/service/core/dataset/collection/controller.ts
Archer 051455238c V4.13.0 features (#5693)
* feat: concat usage code (#5657)

* feat: dataset parse queue (#5661)

* feat: chat usage concat (#5669)

* perf: search test usage

* feat: chat usage concat

* fix: ts

* fix: ts

* feat: chat node response store (#5675)

* feat: chat node response store

* limit export

* test

* add ai generate node (#5506)

* add node copilot

* apply code

* update dynamic input & output

* add code test

* usage

* dynamic input border render

* optimize input & output

* optimize code

* update style

* change card to popover

* prompt editor basic

* prompt editor

* handle key down

* update prompt

* merge

* fix

* fix

* fix

* perf: workflow performance (#5677)

* feat: chat node response store

* limit export

* perf: workflow performance

* remove log

* fix: app template get duplicate (#5682)

* fix: dynamic input lock & code param (#5680)

* fix: dynamic input lock & code param

* fix

* fix

* feat: multi node data sync & system tool hot-swapping (#5575)

* Enhance file upload functionality and system tool integration (#5257)

* Enhance file upload functionality and system tool integration

* Add supplementary documents and optimize the upload interface

* Refactor file plugin types and update upload configurations

* Refactor MinIO configuration variables and clean up API plugin handlers for improved readability and consistency

* File name change

* Refactor SystemTools component layout

* fix i18n

* fix

* fix

* fix

* optimize app logs sort (#5310)

* log keys config modal

* multiple select

* api

* fontsize

* code

* chatid

* fix build

* fix

* fix component

* change name

* log keys config

* fix

* delete unused

* fix

* chore: minio service class rewrite

* chore: s3 plugin upload

* feat: system global cache with multi node sync feature

* feat: cache

* chore: move images

* docs: update & remove useless code

* chore: resolve merge conflicts

* chore: adjust the code

* chore: adjust

* deps: upgrade @fastgpt-sdk/plugin to 0.1.17

* perf(s3): s3 config

* fix: cache syncKey refresh

* fix: update @fastgpt-sdk/plugin to v0.1.18 removing mongo definition for fixing vitest

* chore: adjust

---------

Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Archer <545436317@qq.com>

* perf: s3 api code

* fix: toolbox empty when second open modal

* feat: http tool set (#5599)

* feat: http toolSet manual create front end

* feat: http toolSet manual create i18n

* feat: http toolSet manual create back end

* feat: auth, as tool param, adapt mcp

* fix: delete unused httpPlugin

* fix: delete FlowNodeTypeEnum.httpPlugin

* fix: AppTypeEnum include httpToolSet and httpPlugin

* fix

* delete console

* fix

* output schema

* fix

* fix bg

* fix base url

* fix

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* feat: app count

* perf: type check

* feat: catch error

* perf: plugin hot-swapping (#5688)

* perf: plugin hot-swapping

* chore: adjust code

* perf: cite data auth

* fix http toolset (#5689)

* temp

* fix http tool set

* fix

* template author hide

* dynamic IO ui

* fix: auth test

* fix dynamic input & output (#5690)

Co-authored-by: Archer <545436317@qq.com>

* fix: dynamic output id

* doc

* feat: model permission (#5666)

* feat(permission): model permission definition & api

* chore: support update model's collaborators

* feat: remove unauthedmodel when paste and import

* fix: type error

* fix: test setup global model list

* fix: http tool api

* chore: update fastgpt-sdk version

* chore: remove useless code

* chore: myModelList cache

* perf: user who is not manager can not configure model permission (FE)

* perf: model => Set

* feat: getMyModels moved to opensource code; cache the myModelList

* fix: type error

* fix dynamic input reference select type (#5694)

* remove unique index

* read file usage

* perf: connection error

* fix: abort token count

* fix: debug usage concat

* fix: immer clone object

* fix: immer clone object

* perf: throw error when error chat

* update audit i18n

* fix: content returned in a scrambled order after parsing pptx files (#5696)

* fix: pptx sort error

* fix prompt editor (#5695)

* fix prompt editor

* fix

* fix: redis cache prefix (#5697)

* fix: redis cache prefix

* fix: cache

* fix: get model collaborator by model.model

* feat: hint for model per

* rename bucket name

* model ui

* doc

* doc

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: Zeng Qingwen <143274079+fishwww-ww@users.noreply.github.com>
Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: Deepturn <33342819+Deepturn@users.noreply.github.com>
2025-09-24 22:40:31 +08:00

import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
import type {
  DatasetCollectionSchemaType,
  DatasetSchemaType
} from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '../training/schema';
import { MongoDatasetData } from '../data/schema';
import { delImgByRelatedId } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorDB/controller';
import { delFileByFileIdList } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import type { ClientSession } from '../../../common/mongo';
import { createOrGetCollectionTags } from './utils';
import { rawText2Chunks } from '../read';
import { checkDatasetIndexLimit } from '../../../support/permission/teamLimit';
import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createTrainingUsage } from '../../../support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { pushDataListToTrainingQueue, pushDatasetToParseQueue } from '../training/controller';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils';
import {
  computedCollectionChunkSettings,
  getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { clearCollectionImages, removeDatasetImageExpiredTime } from '../image/utils';
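
/**
 * Dataset collection controller.
 *
 * - createCollectionAndInsertData: chunk raw text (or wrap image ids), create the
 *   collection, record training usage, and push the chunks to the training or
 *   parse queue, all inside one mongo session.
 * - createOneCollection: insert a single collection document with its tags.
 * - delCollectionRelatedSource / delCollection: remove collections together with
 *   their training data, data texts, vectors, images and files.
 */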
export const createCollectionAndInsertData = async ({
  dataset,
  rawText,
  imageIds,
  createCollectionParams,
  backupParse = false,
  billId,
  session
}: {
  dataset: DatasetSchemaType;
  rawText?: string;
  imageIds?: string[];
  createCollectionParams: CreateOneCollectionParams;
  backupParse?: boolean;
  billId?: string;
  session?: ClientSession;
}) => {
  // Adapter for pre-4.9.0 data: the legacy "auto" training type maps to chunk mode with auto indexes
  if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
    createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
    createCollectionParams.autoIndexes = true;
  }

  const formatCreateCollectionParams = computedCollectionChunkSettings({
    ...createCollectionParams,
    llmModel: getLLMModel(dataset.agentModel),
    vectorModel: getEmbeddingModel(dataset.vectorModel)
  });

  const teamId = formatCreateCollectionParams.teamId;
  const tmbId = formatCreateCollectionParams.tmbId;

  // Set default params
  const trainingType =
    formatCreateCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
  const trainingMode = getTrainingModeByCollection({
    trainingType: trainingType,
    autoIndexes: formatCreateCollectionParams.autoIndexes,
    imageIndex: formatCreateCollectionParams.imageIndex
  });

  // Strip params that do not apply to the selected training type
  if (
    trainingType === DatasetCollectionDataProcessModeEnum.qa ||
    trainingType === DatasetCollectionDataProcessModeEnum.backup ||
    trainingType === DatasetCollectionDataProcessModeEnum.template
  ) {
    delete formatCreateCollectionParams.chunkTriggerType;
    delete formatCreateCollectionParams.chunkTriggerMinSize;
    delete formatCreateCollectionParams.dataEnhanceCollectionName;
    delete formatCreateCollectionParams.imageIndex;
    delete formatCreateCollectionParams.autoIndexes;

    if (
      trainingType === DatasetCollectionDataProcessModeEnum.backup ||
      trainingType === DatasetCollectionDataProcessModeEnum.template
    ) {
      delete formatCreateCollectionParams.paragraphChunkAIMode;
      delete formatCreateCollectionParams.paragraphChunkDeep;
      delete formatCreateCollectionParams.paragraphChunkMinSize;
      delete formatCreateCollectionParams.chunkSplitMode;
      delete formatCreateCollectionParams.chunkSize;
      delete formatCreateCollectionParams.chunkSplitter;
      delete formatCreateCollectionParams.indexSize;
      delete formatCreateCollectionParams.indexPrefixTitle;
    }
  }
  if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
    delete formatCreateCollectionParams.qaPrompt;
  }

  // 1. Split raw text into chunks, or create image chunks
  const {
    chunks,
    chunkSize,
    indexSize
  }: {
    chunks: Array<{
      q?: string;
      a?: string; // answer or custom content
      imageId?: string;
      indexes?: string[];
    }>;
    chunkSize?: number;
    indexSize?: number;
  } = await (async () => {
    if (rawText) {
      // Process text chunks
      const chunks = await rawText2Chunks({
        rawText,
        chunkTriggerType: formatCreateCollectionParams.chunkTriggerType,
        chunkTriggerMinSize: formatCreateCollectionParams.chunkTriggerMinSize,
        chunkSize: formatCreateCollectionParams.chunkSize,
        paragraphChunkDeep: formatCreateCollectionParams.paragraphChunkDeep,
        paragraphChunkMinSize: formatCreateCollectionParams.paragraphChunkMinSize,
        maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
        overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
        customReg: formatCreateCollectionParams.chunkSplitter
          ? [formatCreateCollectionParams.chunkSplitter]
          : [],
        backupParse
      });
      return {
        chunks,
        chunkSize: formatCreateCollectionParams.chunkSize,
        indexSize: formatCreateCollectionParams.indexSize
      };
    }
    if (imageIds) {
      // Process image chunks
      const chunks = imageIds.map((imageId: string) => ({
        imageId,
        indexes: []
      }));
      return { chunks };
    }
    return {
      chunks: [],
      chunkSize: formatCreateCollectionParams.chunkSize,
      indexSize: formatCreateCollectionParams.indexSize
    };
  })();

  // 2. Check the team's dataset index limit
  await checkDatasetIndexLimit({
    teamId,
    insertLen: predictDataLimitLength(trainingMode, chunks)
  });

  const fn = async (session: ClientSession) => {
    // 3. Create collection
    const { _id: collectionId } = await createOneCollection({
      ...formatCreateCollectionParams,
      trainingType,
      chunkSize,
      indexSize,
      hashRawText: rawText ? hashStr(rawText) : undefined,
      rawTextLength: rawText?.length,
      session
    });

    // 4. Create training usage record (reuse the provided billId if it exists)
    const trainingUsageId = await (async () => {
      if (billId) return billId;
      const { usageId: newUsageId } = await createTrainingUsage({
        teamId,
        tmbId,
        appName: formatCreateCollectionParams.name,
        billSource: UsageSourceEnum.training,
        vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
        agentModel: getLLMModel(dataset.agentModel)?.name,
        vllmModel: getVlmModel(dataset.vlmModel)?.name,
        session
      });
      return newUsageId;
    })();

    // 5. Push chunks to the training queue, or queue the dataset for parsing when there is no data yet
    const insertResults = await (async () => {
      if (rawText || imageIds) {
        return pushDataListToTrainingQueue({
          teamId,
          tmbId,
          datasetId: dataset._id,
          collectionId,
          agentModel: dataset.agentModel,
          vectorModel: dataset.vectorModel,
          vlmModel: dataset.vlmModel,
          indexSize,
          mode: trainingMode,
          billId: trainingUsageId,
          data: chunks.map((item, index) => ({
            ...item,
            indexes: item.indexes?.map((text) => ({
              type: DatasetDataIndexTypeEnum.custom,
              text
            })),
            chunkIndex: index
          })),
          session
        });
      } else {
        await pushDatasetToParseQueue({
          teamId,
          tmbId,
          datasetId: dataset._id,
          collectionId,
          billId: trainingUsageId,
          session
        });
        return {
          insertLen: 0
        };
      }
    })();

    // 6. Remove the images' TTL expiration so they persist with the collection
    await removeDatasetImageExpiredTime({
      ids: imageIds,
      collectionId,
      session
    });

    return {
      collectionId: String(collectionId),
      insertResults
    };
  };

  if (session) {
    return fn(session);
  }
  return mongoSessionRun(fn);
};
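
/*
 * Usage sketch (illustrative, not part of this module): creating a text
 * collection from an already-loaded dataset document. The field values below
 * are hypothetical; `createCollectionParams` takes the fields of
 * CreateOneCollectionParams defined next, with any remaining required
 * CreateDatasetCollectionParams fields omitted here for brevity.
 *
 *   const { collectionId, insertResults } = await createCollectionAndInsertData({
 *     dataset,                               // DatasetSchemaType document
 *     rawText: fileText,                     // previously parsed file content
 *     createCollectionParams: {
 *       teamId: dataset.teamId,
 *       tmbId,                               // team member id of the operator
 *       datasetId: dataset._id,
 *       name: 'manual-import.txt',
 *       trainingType: DatasetCollectionDataProcessModeEnum.chunk
 *     }
 *   });
 *
 * Without an explicit `session`, the create/usage/queue steps run in a single
 * transaction via mongoSessionRun.
 */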
export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
  teamId: string;
  tmbId: string;
  session?: ClientSession;
};
export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
  const {
    teamId,
    parentId,
    datasetId,
    tags,
    fileId,
    rawLink,
    externalFileId,
    externalFileUrl,
    apiFileId,
    apiFileParentId
  } = props;

  const collectionTags = await createOrGetCollectionTags({
    tags,
    teamId,
    datasetId,
    session
  });

  // Create collection
  const [collection] = await MongoDatasetCollection.create(
    [
      {
        ...props,
        _id: undefined,
        parentId: parentId || null,
        tags: collectionTags,
        ...(fileId ? { fileId } : {}),
        ...(rawLink ? { rawLink } : {}),
        ...(externalFileId ? { externalFileId } : {}),
        ...(externalFileUrl ? { externalFileUrl } : {}),
        ...(apiFileId ? { apiFileId } : {}),
        ...(apiFileParentId ? { apiFileParentId } : {})
      }
    ],
    { session, ordered: true }
  );

  return collection;
}
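
/*
 * Usage sketch (hypothetical values): createOneCollection is normally invoked
 * by createCollectionAndInsertData inside a session, but can be called
 * directly for collections that carry no data yet.
 *
 *   const collection = await createOneCollection({
 *     teamId,
 *     tmbId,
 *     datasetId,
 *     name: 'my-file.pdf',
 *     fileId,                // optional GridFS file reference
 *     tags: ['imported'],
 *     session                // optional ClientSession
 *   });
 */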
/* Delete images/files related to the given collections */
export const delCollectionRelatedSource = async ({
  collections,
  session
}: {
  collections: {
    teamId: string;
    fileId?: string;
    metadata?: {
      relatedImgId?: string;
    };
  }[];
  session?: ClientSession;
}) => {
  if (collections.length === 0) return;

  const teamId = collections[0].teamId;
  if (!teamId) return Promise.reject('teamId does not exist');

  const fileIdList = collections.map((item) => item?.fileId || '').filter(Boolean);
  const relatedImageIds = collections
    .map((item) => item?.metadata?.relatedImgId || '')
    .filter(Boolean);

  // Delete files and images in parallel
  await Promise.all([
    // Delete files
    delFileByFileIdList({
      bucketName: BucketNameEnum.dataset,
      fileIdList
    }),
    // Delete images
    delImgByRelatedId({
      teamId,
      relateIds: relatedImageIds,
      session
    })
  ]);
};
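
/*
 * Usage sketch: only teamId, fileId and metadata.relatedImgId are read from
 * each entry, so a projected query result is sufficient (values illustrative):
 *
 *   await delCollectionRelatedSource({
 *     collections: [
 *       { teamId, fileId },
 *       { teamId, metadata: { relatedImgId } }
 *     ]
 *   });
 */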
/**
 * Delete collections and their related data
 */
export async function delCollection({
  collections,
  session,
  delImg = true,
  delFile = true
}: {
  collections: DatasetCollectionSchemaType[];
  session: ClientSession;
  delImg: boolean;
  delFile: boolean;
}) {
  if (collections.length === 0) return;

  const teamId = collections[0].teamId;
  if (!teamId) return Promise.reject('teamId does not exist');

  const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
  const collectionIds = collections.map((item) => String(item._id));

  await retryFn(async () => {
    await Promise.all([
      // Delete training data
      MongoDatasetTraining.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_data_texts
      MongoDatasetDataText.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_datas
      MongoDatasetData.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_images
      clearCollectionImages(collectionIds),
      // Delete images if needed
      ...(delImg
        ? [
            delImgByRelatedId({
              teamId,
              relateIds: collections
                .map((item) => item?.metadata?.relatedImgId || '')
                .filter(Boolean)
            })
          ]
        : []),
      // Delete files if needed
      ...(delFile
        ? [
            delFileByFileIdList({
              bucketName: BucketNameEnum.dataset,
              fileIdList: collections.map((item) => item?.fileId || '').filter(Boolean)
            })
          ]
        : []),
      // Delete vector data
      deleteDatasetDataVector({ teamId, datasetIds, collectionIds })
    ]);

    // Delete the collection documents themselves (inside the session)
    await MongoDatasetCollection.deleteMany(
      {
        teamId,
        _id: { $in: collectionIds }
      },
      { session }
    );
  });
}
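
/*
 * Usage sketch (assumes `collections` was queried beforehand): the related-data
 * cleanup above is retried via retryFn outside the transaction, while the
 * collection documents themselves are removed inside the provided session.
 *
 *   await mongoSessionRun((session) =>
 *     delCollection({
 *       collections,       // DatasetCollectionSchemaType[]
 *       session,
 *       delImg: true,
 *       delFile: true
 *     })
 *   );
 */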