FastGPT/packages/service/core/dataset/collection/controller.ts
Archer · 2e18f1ebc2 · next 15 (#6457)
* next 15

* lock

* feat: rename .d.ts to .ts for Next 15 compatibility

- Rename 104 .d.ts files to .ts (Next 15 no longer supports .d.ts in src)
- Remove 5 redundant .d.ts files that had .ts counterparts
- Update all import paths: remove .d suffix from 100 import statements (before/after sketch below)
- Update tsconfig.json include patterns across all packages
- Add pnpm overrides to unify react@18.3.1 across monorepo
- Fix react version mismatch (packages/global and packages/service were resolving to react@19.1.1)
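
A before/after sketch of the import-path update (file and type names here are hypothetical illustrations, not FastGPT's real ones):

```ts
// team/type.ts — was team/type.d.ts. After the rename the file is an
// ordinary module, so its types are exported and imported without `.d`.
export interface TeamMemberItem {
  tmbId: string;
  status: 'active' | 'forbidden';
}
```

```ts
// consumer.ts
// Before: import type { TeamMemberItem } from './team/type.d';
import type { TeamMemberItem } from './team/type';

export const isActive = (member: TeamMemberItem) => member.status === 'active';
```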

* fix: resolve 61 TypeScript errors from .d.ts to .ts migration

- Fix broken imports using non-relative module paths (e.g. 'support/user/team/type' → relative paths)
- Remove unused/dead imports referencing deleted modules
- Fix duplicate identifiers (show_emptyChat, concatMd, TrainingModeEnum)
- Add missing imports (BoxProps, GroupMemberRole, UsageSourceEnum, dashboard_evaluation)
- Fix generic type constraints (OutLinkEditType, createShareChat)
- Replace removed types with correct alternatives (ChatModelItemType → LLMModelItemType)
- Delete 5 dead code files with 0 references
- Add global type declaration for countTrackQueue (sketch below)
- Fix nullable type narrowing (sourceMember, ParentIdType, optional app fields)
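
The global declaration for countTrackQueue might look like the following sketch (the element type is an assumption; only the `declare global` pattern is taken from the change):

```ts
// Hypothetical element type — FastGPT's real queue payload may differ.
type TrackQueueItem = { event: string; data: Record<string, unknown> };

declare global {
  // `var` is required so the name is visible on globalThis.
  // eslint-disable-next-line no-var
  var countTrackQueue: TrackQueueItem[] | undefined;
}

// The export makes this file a module, keeping the declaration well-scoped.
export {};
```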

* refactor: replace as ClientSession assertion with proper type narrowing via Omit & intersection
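
A minimal sketch of that narrowing pattern, with a hypothetical params type (the real call sites are FastGPT's controllers):

```ts
import type { ClientSession } from 'mongoose';

type Params = { name: string; session?: ClientSession };

// The same shape with `session` made required via Omit & intersection.
type ParamsWithSession = Omit<Params, 'session'> & { session: ClientSession };

function withSession(props: Params): ParamsWithSession {
  // A runtime guard narrows the optional field instead of `as ClientSession`.
  if (!props.session) throw new Error('session is required');
  return { ...props, session: props.session };
}
```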

* fix: remove experimental.workerThreads to fix DataCloneError in Next 15 static generation

Next 15 worker threads attempt to structuredClone the config object,
which fails on the webpack function. workerThreads is not needed for
the build to work correctly.
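
The resulting config change, as a minimal sketch (assuming a next.config.ts; the project's real config sets many more options):

```ts
import type { NextConfig } from 'next';

const nextConfig: NextConfig = {
  // Removed: experimental: { workerThreads: true }
  // Next 15 structuredClones the config into worker threads, and cloning
  // fails on function-valued fields such as this webpack hook.
  webpack(config) {
    return config; // project-specific customizations omitted
  }
};

export default nextConfig;
```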

* Update document/content/docs/upgrading/4-14/4148.mdx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix: ts

* update next config

* update next

* fix: dockerfile

* fix: comment

---------

Co-authored-by: Archer <c121914yu@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-02-25 18:28:16 +08:00

435 lines
13 KiB
TypeScript

import {
  DatasetCollectionDataProcessModeEnum,
  DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { MongoDatasetCollection } from './schema';
import type {
  DatasetCollectionSchemaType,
  DatasetSchemaType
} from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '../training/schema';
import { MongoDatasetData } from '../data/schema';
import { delImgByRelatedId } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorDB/controller';
import type { ClientSession } from '../../../common/mongo';
import { createOrGetCollectionTags, getTrainingModeByCollection } from './utils';
import { rawText2Chunks } from '../read';
import { checkDatasetIndexLimit } from '../../../support/permission/teamLimit';
import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createTrainingUsage } from '../../../support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { pushDataListToTrainingQueue, pushDatasetToParseQueue } from '../training/controller';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { retryFn } from '@fastgpt/global/common/system/utils';
import {
  computedCollectionChunkSettings,
  getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { getS3DatasetSource } from '../../../common/s3/sources/dataset';
import { removeS3TTL, isS3ObjectKey } from '../../../common/s3/utils';

export const createCollectionAndInsertData = async ({
  dataset,
  rawText,
  imageIds,
  createCollectionParams,
  backupParse = false,
  billId,
  session
}: {
  dataset: DatasetSchemaType;
  rawText?: string;
  imageIds?: string[];
  createCollectionParams: CreateOneCollectionParams;
  backupParse?: boolean;
  billId?: string;
  session?: ClientSession;
}) => {
  // Adapter 4.9.0: the legacy `auto` mode maps to `chunk` with auto indexes enabled
  if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
    createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
    createCollectionParams.autoIndexes = true;
  }

  const formatCreateCollectionParams = computedCollectionChunkSettings({
    ...createCollectionParams,
    llmModel: getLLMModel(dataset.agentModel),
    vectorModel: getEmbeddingModel(dataset.vectorModel)
  });

  const teamId = formatCreateCollectionParams.teamId;
  const tmbId = formatCreateCollectionParams.tmbId;

  // Set default params
  const trainingType =
    formatCreateCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
  const trainingMode = getTrainingModeByCollection({
    trainingType: trainingType,
    autoIndexes: formatCreateCollectionParams.autoIndexes,
    imageIndex: formatCreateCollectionParams.imageIndex
  });

  // Chunk-related params do not apply to qa/backup/template modes
  if (
    trainingType === DatasetCollectionDataProcessModeEnum.qa ||
    trainingType === DatasetCollectionDataProcessModeEnum.backup ||
    trainingType === DatasetCollectionDataProcessModeEnum.template
  ) {
    delete formatCreateCollectionParams.chunkTriggerType;
    delete formatCreateCollectionParams.chunkTriggerMinSize;
    delete formatCreateCollectionParams.dataEnhanceCollectionName;
    delete formatCreateCollectionParams.imageIndex;
    delete formatCreateCollectionParams.autoIndexes;

    if (
      trainingType === DatasetCollectionDataProcessModeEnum.backup ||
      trainingType === DatasetCollectionDataProcessModeEnum.template
    ) {
      delete formatCreateCollectionParams.paragraphChunkAIMode;
      delete formatCreateCollectionParams.paragraphChunkDeep;
      delete formatCreateCollectionParams.paragraphChunkMinSize;
      delete formatCreateCollectionParams.chunkSplitMode;
      delete formatCreateCollectionParams.chunkSize;
      delete formatCreateCollectionParams.chunkSplitter;
      delete formatCreateCollectionParams.indexSize;
      delete formatCreateCollectionParams.indexPrefixTitle;
    }
  }
  if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
    delete formatCreateCollectionParams.qaPrompt;
  }

  // 1. Split chunks or create image chunks
  const {
    chunks,
    chunkSize,
    indexSize
  }: {
    chunks: Array<{
      q?: string;
      a?: string; // answer or custom content
      imageId?: string;
      indexes?: string[];
    }>;
    chunkSize?: number;
    indexSize?: number;
  } = await (async () => {
    if (rawText) {
      // Process text chunks
      const chunks = await rawText2Chunks({
        rawText,
        chunkTriggerType: formatCreateCollectionParams.chunkTriggerType,
        chunkTriggerMinSize: formatCreateCollectionParams.chunkTriggerMinSize,
        chunkSize: formatCreateCollectionParams.chunkSize,
        paragraphChunkDeep: formatCreateCollectionParams.paragraphChunkDeep,
        paragraphChunkMinSize: formatCreateCollectionParams.paragraphChunkMinSize,
        maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
        overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
        customReg: formatCreateCollectionParams.chunkSplitter
          ? [formatCreateCollectionParams.chunkSplitter]
          : [],
        backupParse
      });
      return {
        chunks,
        chunkSize: formatCreateCollectionParams.chunkSize,
        indexSize: formatCreateCollectionParams.indexSize
      };
    }
    if (imageIds) {
      // Process image chunks
      const chunks = imageIds.map((imageId: string) => ({
        imageId,
        indexes: []
      }));
      return { chunks };
    }
    return {
      chunks: [],
      chunkSize: formatCreateCollectionParams.chunkSize,
      indexSize: formatCreateCollectionParams.indexSize
    };
  })();

  // 2. Auth limit
  await checkDatasetIndexLimit({
    teamId,
    insertLen: predictDataLimitLength(trainingMode, chunks)
  });

  const fn = async (session: ClientSession) => {
    // 3. Create collection
    const { _id: collectionId } = await createOneCollection({
      ...formatCreateCollectionParams,
      trainingType,
      chunkSize,
      indexSize,
      hashRawText: rawText ? hashStr(rawText) : undefined,
      rawTextLength: rawText?.length,
      session
    });

    // 4. Create training bill
    const trainingUsageId = await (async () => {
      if (billId) return billId;
      const { usageId: newUsageId } = await createTrainingUsage({
        teamId,
        tmbId,
        appName: formatCreateCollectionParams.name,
        billSource: UsageSourceEnum.training,
        vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
        agentModel: getLLMModel(dataset.agentModel)?.name,
        vllmModel: getVlmModel(dataset.vlmModel)?.name,
        session
      });
      return newUsageId;
    })();

    // 5. Insert to training queue
    const insertResults = await (async () => {
      if (rawText || imageIds) {
        return pushDataListToTrainingQueue({
          teamId,
          tmbId,
          datasetId: dataset._id,
          collectionId,
          agentModel: dataset.agentModel,
          vectorModel: dataset.vectorModel,
          vlmModel: dataset.vlmModel,
          indexSize,
          mode: trainingMode,
          billId: trainingUsageId,
          data: chunks.map((item, index) => ({
            ...item,
            indexes: item.indexes?.map((text) => ({
              type: DatasetDataIndexTypeEnum.custom,
              text
            })),
            chunkIndex: index
          })),
          session
        });
      } else {
        await pushDatasetToParseQueue({
          teamId,
          tmbId,
          datasetId: dataset._id,
          collectionId,
          billId: trainingUsageId,
          session
        });
        return {
          insertLen: 0
        };
      }
    })();

    return {
      collectionId: String(collectionId),
      insertResults
    };
  };

  // Reuse the caller's session when provided; otherwise run in a fresh mongo session
  if (session) {
    return fn(session);
  }
  return mongoSessionRun(fn);
};

export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
  teamId: string;
  tmbId: string;
  session?: ClientSession;
};

export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
  const {
    teamId,
    parentId,
    datasetId,
    tags,
    fileId,
    rawLink,
    externalFileId,
    externalFileUrl,
    apiFileId,
    apiFileParentId
  } = props;

  const collectionTags = await createOrGetCollectionTags({
    tags,
    teamId,
    datasetId,
    session
  });

  // Create collection
  const [collection] = await MongoDatasetCollection.create(
    [
      {
        ...props,
        _id: undefined,
        parentId: parentId || null,
        tags: collectionTags,
        ...(fileId ? { fileId } : {}),
        ...(rawLink ? { rawLink } : {}),
        ...(externalFileId ? { externalFileId } : {}),
        ...(externalFileUrl ? { externalFileUrl } : {}),
        ...(apiFileId ? { apiFileId } : {}),
        ...(apiFileParentId ? { apiFileParentId } : {})
      }
    ],
    { session, ordered: true }
  );

  // The file now belongs to a persisted collection, so clear its S3 TTL marker
  if (isS3ObjectKey(fileId, 'dataset')) {
    await removeS3TTL({ key: fileId, bucketName: 'private', session });
  }

  return collection;
}

/* Delete collection related images/files */
export const delCollectionRelatedSource = async ({
  collections,
  session
}: {
  collections: {
    teamId: string;
    fileId?: string;
    metadata?: {
      relatedImgId?: string;
    };
  }[];
  session?: ClientSession;
}) => {
  if (collections.length === 0) return;

  const teamId = collections[0].teamId;
  if (!teamId) return Promise.reject('teamId does not exist');

  // FIXME: compatibility path for deleting images produced by the legacy parser
  const relatedImageIds = collections
    .map((item) => item?.metadata?.relatedImgId || '')
    .filter(Boolean);

  // Delete related images
  await delImgByRelatedId({
    teamId,
    relateIds: relatedImageIds,
    session
  });
};

/**
 * Delete collections and their related data
 */
export async function delCollection({
  collections,
  session,
  delImg = true,
  delFile = true
}: {
  collections: DatasetCollectionSchemaType[];
  session: ClientSession;
  delImg: boolean;
  delFile: boolean;
}) {
  if (collections.length === 0) return;

  const teamId = collections[0].teamId;
  if (!teamId) return Promise.reject('teamId does not exist');

  const s3DatasetSource = getS3DatasetSource();
  const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
  const collectionIds = collections.map((item) => String(item._id));

  // Collect S3 image keys from image-type collections before their data rows are removed
  const imageCollectionIds = collections
    .filter((item) => item.type === DatasetCollectionTypeEnum.images)
    .map((item) => String(item._id));
  const imageDatas = await MongoDatasetData.find(
    {
      teamId,
      datasetId: { $in: datasetIds },
      collectionId: { $in: imageCollectionIds }
    },
    { imageId: 1 }
  ).lean();
  const imageIds = imageDatas
    .map((item) => item.imageId)
    .filter((key) => isS3ObjectKey(key, 'dataset'));

  await retryFn(async () => {
    await Promise.all([
      // Delete training data
      MongoDatasetTraining.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_data_texts
      MongoDatasetDataText.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_datas
      MongoDatasetData.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete images if needed (compatibility path for legacy image deletion)
      ...(delImg
        ? [
            delImgByRelatedId({
              teamId,
              relateIds: collections
                .map((item) => item?.metadata?.relatedImgId || '')
                .filter(Boolean)
            })
          ]
        : []),
      // Delete files if needed
      ...(delFile
        ? [
            getS3DatasetSource().deleteDatasetFilesByKeys(
              collections.map((item) => item?.fileId || '').filter(Boolean)
            )
          ]
        : []),
      // Delete vector data
      deleteDatasetDataVector({ teamId, datasetIds, collectionIds })
    ]);

    // Delete collections
    await MongoDatasetCollection.deleteMany(
      {
        teamId,
        _id: { $in: collectionIds }
      },
      { session }
    ).lean();

    // Delete S3 images which were uploaded by users
    await s3DatasetSource.deleteDatasetFilesByKeys(imageIds);
  });
}