FastGPT/packages/service/core/dataset/collection/controller.ts
Archer 3a5d725efd feature: 4.10.1 (#5201)
* add dynamic inputRender (#5127)

* dynamic input component

* fix

* fix

* fix

* perf: dynamic render input

* update doc

* perf: error catch

* num input ui

* fix form render (#5177)

* perf: i18n check

* add log

* doc

* Sync dataset  (#5181)

* perf: api dataset create (#5047)

* Sync dataset (#5120)

* add

* wait

* restructure dataset sync, update types and APIs, add sync hints, and remove legacy logic

* feat: add function to retrieve real file ID from third-party doc library and rename team permission check function for clarity

* fix come console

* refactor: rename team dataset limit check functions for clarity, update API dataset sync limit usage, and rename root directory to "ROOT_FOLDER"

* feat: update sync dataset logic

* fix delete.ts

* feat: update pnpm-lock.yaml to include bullmq, fix comments in api.d.ts and type.d.ts, rename API file ID field, optimize dataset sync logic, and add website sync feature with related APIs

* feat: update CollectionCard to support site dataset sync, add API root ID constant and init sync API

* feat: add RootCollectionId constant to replace hardcoded root ID

---------

Co-authored-by: dreamer6680 <146868355@qq.com>

* perf: code

* feat: update success message for dataset sync, revise related i18n texts, and optimize file selection logic (#5166)

Co-authored-by: dreamer6680 <146868355@qq.com>

* perf: select file

* Sync dataset (#5180)

* feat: update success message for dataset sync, revise related i18n texts, and optimize file selection logic

* fix: make listfile function return rawid string

---------

Co-authored-by: dreamer6680 <146868355@qq.com>

* init sh

* fix: ts

---------

Co-authored-by: dreamer6680 <1468683855@qq.com>
Co-authored-by: dreamer6680 <146868355@qq.com>

* update doc

* i18n

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: dreamer6680 <1468683855@qq.com>
Co-authored-by: dreamer6680 <146868355@qq.com>
2025-07-11 17:02:48 +08:00


import {
  DatasetCollectionTypeEnum,
  DatasetCollectionDataProcessModeEnum,
  DatasetTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
import type {
  DatasetCollectionSchemaType,
  DatasetSchemaType
} from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '../training/schema';
import { MongoDatasetData } from '../data/schema';
import { delImgByRelatedId } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorDB/controller';
import { delFileByFileIdList } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import type { ClientSession } from '../../../common/mongo';
import { createOrGetCollectionTags } from './utils';
import { rawText2Chunks } from '../read';
import { checkDatasetIndexLimit } from '../../../support/permission/teamLimit';
import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createTrainingUsage } from '../../../support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { pushDataListToTrainingQueue, pushDatasetToParseQueue } from '../training/controller';
import { MongoImage } from '../../../common/file/image/schema';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { addDays } from 'date-fns';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils';
import {
  computedCollectionChunkSettings,
  getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { clearCollectionImages, removeDatasetImageExpiredTime } from '../image/utils';
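
/**
 * Create a dataset collection and queue its content for processing.
 *
 * Flow (matching the numbered comments in the body):
 *   1. Split rawText into chunks, or wrap imageIds as image chunks.
 *   2. Check the team's dataset index limit against the predicted insert length.
 *   3. Create the collection document.
 *   4. Reuse the provided billId or create a new training usage record.
 *   5. Push chunks to the training queue, or push the collection to the parse queue
 *      when there is no text or image content yet.
 *   6. Remove the TTL expiration from any referenced dataset images.
 *
 * Runs inside the given session when one is provided, otherwise wraps itself in mongoSessionRun.
 *
 * @example
 * // Illustrative sketch only: the createCollectionParams fields are abbreviated here;
 * // see CreateDatasetCollectionParams in @fastgpt/global for the full shape.
 * const { collectionId } = await createCollectionAndInsertData({
 *   dataset,
 *   rawText: fileText,
 *   createCollectionParams: {
 *     teamId,
 *     tmbId,
 *     datasetId: dataset._id,
 *     name: 'my-file.md',
 *     type: DatasetCollectionTypeEnum.file,
 *     trainingType: DatasetCollectionDataProcessModeEnum.chunk
 *   }
 * });
 */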
export const createCollectionAndInsertData = async ({
  dataset,
  rawText,
  imageIds,
  createCollectionParams,
  backupParse = false,
  billId,
  session
}: {
  dataset: DatasetSchemaType;
  rawText?: string;
  imageIds?: string[];
  createCollectionParams: CreateOneCollectionParams;
  backupParse?: boolean;
  billId?: string;
  session?: ClientSession;
}) => {
  // Adapter 4.9.0
  if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
    createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
    createCollectionParams.autoIndexes = true;
  }

  const formatCreateCollectionParams = computedCollectionChunkSettings({
    ...createCollectionParams,
    llmModel: getLLMModel(dataset.agentModel),
    vectorModel: getEmbeddingModel(dataset.vectorModel)
  });

  const teamId = formatCreateCollectionParams.teamId;
  const tmbId = formatCreateCollectionParams.tmbId;

  // Set default params
  const trainingType =
    formatCreateCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
  const trainingMode = getTrainingModeByCollection({
    trainingType: trainingType,
    autoIndexes: formatCreateCollectionParams.autoIndexes,
    imageIndex: formatCreateCollectionParams.imageIndex
  });

  if (
    trainingType === DatasetCollectionDataProcessModeEnum.qa ||
    trainingType === DatasetCollectionDataProcessModeEnum.backup ||
    trainingType === DatasetCollectionDataProcessModeEnum.template
  ) {
    delete formatCreateCollectionParams.chunkTriggerType;
    delete formatCreateCollectionParams.chunkTriggerMinSize;
    delete formatCreateCollectionParams.dataEnhanceCollectionName;
    delete formatCreateCollectionParams.imageIndex;
    delete formatCreateCollectionParams.autoIndexes;

    if (
      trainingType === DatasetCollectionDataProcessModeEnum.backup ||
      trainingType === DatasetCollectionDataProcessModeEnum.template
    ) {
      delete formatCreateCollectionParams.paragraphChunkAIMode;
      delete formatCreateCollectionParams.paragraphChunkDeep;
      delete formatCreateCollectionParams.paragraphChunkMinSize;
      delete formatCreateCollectionParams.chunkSplitMode;
      delete formatCreateCollectionParams.chunkSize;
      delete formatCreateCollectionParams.chunkSplitter;
      delete formatCreateCollectionParams.indexSize;
      delete formatCreateCollectionParams.indexPrefixTitle;
    }
  }
  if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
    delete formatCreateCollectionParams.qaPrompt;
  }

  // 1. split chunks or create image chunks
  const {
    chunks,
    chunkSize,
    indexSize
  }: {
    chunks: Array<{
      q?: string;
      a?: string; // answer or custom content
      imageId?: string;
      indexes?: string[];
    }>;
    chunkSize?: number;
    indexSize?: number;
  } = await (async () => {
    if (rawText) {
      // Process text chunks
      const chunks = await rawText2Chunks({
        rawText,
        chunkTriggerType: formatCreateCollectionParams.chunkTriggerType,
        chunkTriggerMinSize: formatCreateCollectionParams.chunkTriggerMinSize,
        chunkSize: formatCreateCollectionParams.chunkSize,
        paragraphChunkDeep: formatCreateCollectionParams.paragraphChunkDeep,
        paragraphChunkMinSize: formatCreateCollectionParams.paragraphChunkMinSize,
        maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
        overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
        customReg: formatCreateCollectionParams.chunkSplitter
          ? [formatCreateCollectionParams.chunkSplitter]
          : [],
        backupParse
      });
      return {
        chunks,
        chunkSize: formatCreateCollectionParams.chunkSize,
        indexSize: formatCreateCollectionParams.indexSize
      };
    }
    if (imageIds) {
      // Process image chunks
      const chunks = imageIds.map((imageId: string) => ({
        imageId,
        indexes: []
      }));
      return { chunks };
    }
    return {
      chunks: [],
      chunkSize: formatCreateCollectionParams.chunkSize,
      indexSize: formatCreateCollectionParams.indexSize
    };
  })();

  // 2. auth limit
  await checkDatasetIndexLimit({
    teamId,
    insertLen: predictDataLimitLength(trainingMode, chunks)
  });

  const fn = async (session: ClientSession) => {
    // 3. Create collection
    const { _id: collectionId } = await createOneCollection({
      ...formatCreateCollectionParams,
      trainingType,
      chunkSize,
      indexSize,
      hashRawText: rawText ? hashStr(rawText) : undefined,
      rawTextLength: rawText?.length,
      session
    });

    // 4. create training bill
    const traingBillId = await (async () => {
      if (billId) return billId;
      const { billId: newBillId } = await createTrainingUsage({
        teamId,
        tmbId,
        appName: formatCreateCollectionParams.name,
        billSource: UsageSourceEnum.training,
        vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
        agentModel: getLLMModel(dataset.agentModel)?.name,
        vllmModel: getVlmModel(dataset.vlmModel)?.name,
        session
      });
      return newBillId;
    })();

    // 5. insert to training queue
    const insertResults = await (async () => {
      if (rawText || imageIds) {
        return pushDataListToTrainingQueue({
          teamId,
          tmbId,
          datasetId: dataset._id,
          collectionId,
          agentModel: dataset.agentModel,
          vectorModel: dataset.vectorModel,
          vlmModel: dataset.vlmModel,
          indexSize,
          mode: trainingMode,
          billId: traingBillId,
          data: chunks.map((item, index) => ({
            ...item,
            indexes: item.indexes?.map((text) => ({
              type: DatasetDataIndexTypeEnum.custom,
              text
            })),
            chunkIndex: index
          })),
          session
        });
      } else {
        await pushDatasetToParseQueue({
          teamId,
          tmbId,
          datasetId: dataset._id,
          collectionId,
          billId: traingBillId,
          session
        });
        return {
          insertLen: 0
        };
      }
    })();

    // 6. Remove images ttl index
    await removeDatasetImageExpiredTime({
      ids: imageIds,
      collectionId,
      session
    });

    return {
      collectionId: String(collectionId),
      insertResults
    };
  };

  if (session) {
    return fn(session);
  }
  return mongoSessionRun(fn);
};

export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
  teamId: string;
  tmbId: string;
  session?: ClientSession;
};
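
/**
 * Create a single collection document.
 *
 * Resolves (or creates) the collection tags for the dataset, then inserts the collection,
 * copying only the source-reference fields that are actually set (fileId, rawLink,
 * externalFileId, externalFileUrl, apiFileId, apiFileParentId). When a session is
 * passed, the write takes part in that transaction.
 */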
export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
  const {
    teamId,
    parentId,
    datasetId,
    tags,
    fileId,
    rawLink,
    externalFileId,
    externalFileUrl,
    apiFileId,
    apiFileParentId
  } = props;

  const collectionTags = await createOrGetCollectionTags({
    tags,
    teamId,
    datasetId,
    session
  });

  // Create collection
  const [collection] = await MongoDatasetCollection.create(
    [
      {
        ...props,
        _id: undefined,
        parentId: parentId || null,
        tags: collectionTags,
        ...(fileId ? { fileId } : {}),
        ...(rawLink ? { rawLink } : {}),
        ...(externalFileId ? { externalFileId } : {}),
        ...(externalFileUrl ? { externalFileUrl } : {}),
        ...(apiFileId ? { apiFileId } : {}),
        ...(apiFileParentId ? { apiFileParentId } : {})
      }
    ],
    { session, ordered: true }
  );

  return collection;
}

/* delete collection related images/files */
export const delCollectionRelatedSource = async ({
  collections,
  session
}: {
  collections: {
    teamId: string;
    fileId?: string;
    metadata?: {
      relatedImgId?: string;
    };
  }[];
  session?: ClientSession;
}) => {
  if (collections.length === 0) return;

  const teamId = collections[0].teamId;
  if (!teamId) return Promise.reject('teamId is not exist');

  const fileIdList = collections.map((item) => item?.fileId || '').filter(Boolean);
  const relatedImageIds = collections
    .map((item) => item?.metadata?.relatedImgId || '')
    .filter(Boolean);

  // Delete files and images in parallel
  await Promise.all([
    // Delete files
    delFileByFileIdList({
      bucketName: BucketNameEnum.dataset,
      fileIdList
    }),
    // Delete images
    delImgByRelatedId({
      teamId,
      relateIds: relatedImageIds,
      session
    })
  ]);
};

/**
 * Delete collections and their related data
 */
export async function delCollection({
  collections,
  session,
  delImg = true,
  delFile = true
}: {
  collections: DatasetCollectionSchemaType[];
  session: ClientSession;
  delImg: boolean;
  delFile: boolean;
}) {
  if (collections.length === 0) return;

  const teamId = collections[0].teamId;
  if (!teamId) return Promise.reject('teamId is not exist');

  const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
  const collectionIds = collections.map((item) => String(item._id));

  await retryFn(async () => {
    await Promise.all([
      // Delete training data
      MongoDatasetTraining.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_data_texts
      MongoDatasetDataText.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_datas
      MongoDatasetData.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_images
      clearCollectionImages(collectionIds),
      // Delete images if needed
      ...(delImg
        ? [
            delImgByRelatedId({
              teamId,
              relateIds: collections
                .map((item) => item?.metadata?.relatedImgId || '')
                .filter(Boolean)
            })
          ]
        : []),
      // Delete files if needed
      ...(delFile
        ? [
            delFileByFileIdList({
              bucketName: BucketNameEnum.dataset,
              fileIdList: collections.map((item) => item?.fileId || '').filter(Boolean)
            })
          ]
        : []),
      // Delete vector data
      deleteDatasetDataVector({ teamId, datasetIds, collectionIds })
    ]);

    // delete collections
    await MongoDatasetCollection.deleteMany(
      {
        teamId,
        _id: { $in: collectionIds }
      },
      { session }
    );
  });
}
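
/*
 * Illustrative usage only (not part of the module): delCollection requires an active
 * session, so callers typically wrap it in mongoSessionRun, e.g.
 *
 *   await mongoSessionRun((session) =>
 *     delCollection({
 *       collections,
 *       session,
 *       delImg: true,
 *       delFile: true
 *     })
 *   );
 */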