import {
  DatasetCollectionDataProcessModeEnum,
  DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api';
import { MongoDatasetCollection } from './schema';
import type {
  DatasetCollectionSchemaType,
  DatasetSchemaType
} from '@fastgpt/global/core/dataset/type';
import { MongoDatasetTraining } from '../training/schema';
import { MongoDatasetData } from '../data/schema';
import { delImgByRelatedId } from '../../../common/file/image/controller';
import { deleteDatasetDataVector } from '../../../common/vectorDB/controller';
import type { ClientSession } from '../../../common/mongo';
import { createOrGetCollectionTags } from './utils';
import { rawText2Chunks } from '../read';
import { checkDatasetIndexLimit } from '../../../support/permission/teamLimit';
import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createTrainingUsage } from '../../../support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { pushDataListToTrainingQueue, pushDatasetToParseQueue } from '../training/controller';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils';
import {
  computedCollectionChunkSettings,
  getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { getS3DatasetSource } from '../../../common/s3/sources/dataset';
import { removeS3TTL, isS3ObjectKey } from '../../../common/s3/utils';

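/**
 * Create a dataset collection and queue its data for processing.
 *
 * Flow (matching the numbered steps below): split the raw text into chunks or build
 * image chunks, check the team's dataset index limit, create the collection, create
 * or reuse a training usage record, then push the chunks to the training queue. When
 * neither rawText nor imageIds is given, the collection is pushed to the parse queue
 * instead. Steps 3-5 run inside a mongo session.
 *
 * @example
 * // Minimal sketch, assuming typical CreateDatasetCollectionParams fields
 * // (datasetId, name, type, trainingType); see the API types for the exact shape.
 * const { collectionId, insertResults } = await createCollectionAndInsertData({
 *   dataset,
 *   rawText: fileText,
 *   createCollectionParams: {
 *     teamId,
 *     tmbId,
 *     datasetId: String(dataset._id),
 *     name: 'my-file.md',
 *     type: DatasetCollectionTypeEnum.file,
 *     trainingType: DatasetCollectionDataProcessModeEnum.chunk
 *   }
 * });
 */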
export const createCollectionAndInsertData = async ({
  dataset,
  rawText,
  imageIds,
  createCollectionParams,
  backupParse = false,
  billId,
  session
}: {
  dataset: DatasetSchemaType;
  rawText?: string;
  imageIds?: string[];
  createCollectionParams: CreateOneCollectionParams;

  backupParse?: boolean;

  billId?: string;
  session?: ClientSession;
}) => {
  // Adapter 4.9.0
  if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
    createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
    createCollectionParams.autoIndexes = true;
  }

  const formatCreateCollectionParams = computedCollectionChunkSettings({
    ...createCollectionParams,
    llmModel: getLLMModel(dataset.agentModel),
    vectorModel: getEmbeddingModel(dataset.vectorModel)
  });

  const teamId = formatCreateCollectionParams.teamId;
  const tmbId = formatCreateCollectionParams.tmbId;

  // Set default params
  const trainingType =
    formatCreateCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
  const trainingMode = getTrainingModeByCollection({
    trainingType: trainingType,
    autoIndexes: formatCreateCollectionParams.autoIndexes,
    imageIndex: formatCreateCollectionParams.imageIndex
  });

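  // qa / backup / template collections do not use the chunk-trigger, image-index or
  // auto-index settings, so they are stripped; backup and template additionally
  // ignore all paragraph/chunk sizing options.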
  if (
    trainingType === DatasetCollectionDataProcessModeEnum.qa ||
    trainingType === DatasetCollectionDataProcessModeEnum.backup ||
    trainingType === DatasetCollectionDataProcessModeEnum.template
  ) {
    delete formatCreateCollectionParams.chunkTriggerType;
    delete formatCreateCollectionParams.chunkTriggerMinSize;
    delete formatCreateCollectionParams.dataEnhanceCollectionName;
    delete formatCreateCollectionParams.imageIndex;
    delete formatCreateCollectionParams.autoIndexes;

    if (
      trainingType === DatasetCollectionDataProcessModeEnum.backup ||
      trainingType === DatasetCollectionDataProcessModeEnum.template
    ) {
      delete formatCreateCollectionParams.paragraphChunkAIMode;
      delete formatCreateCollectionParams.paragraphChunkDeep;
      delete formatCreateCollectionParams.paragraphChunkMinSize;
      delete formatCreateCollectionParams.chunkSplitMode;
      delete formatCreateCollectionParams.chunkSize;
      delete formatCreateCollectionParams.chunkSplitter;
      delete formatCreateCollectionParams.indexSize;
      delete formatCreateCollectionParams.indexPrefixTitle;
    }
  }
  if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
    delete formatCreateCollectionParams.qaPrompt;
  }

  // 1. split chunks or create image chunks
  const {
    chunks,
    chunkSize,
    indexSize
  }: {
    chunks: Array<{
      q?: string;
      a?: string; // answer or custom content
      imageId?: string;
      indexes?: string[];
    }>;
    chunkSize?: number;
    indexSize?: number;
  } = await (async () => {
    if (rawText) {
      // Process text chunks
      const chunks = await rawText2Chunks({
        rawText,
        chunkTriggerType: formatCreateCollectionParams.chunkTriggerType,
        chunkTriggerMinSize: formatCreateCollectionParams.chunkTriggerMinSize,
        chunkSize: formatCreateCollectionParams.chunkSize,
        paragraphChunkDeep: formatCreateCollectionParams.paragraphChunkDeep,
        paragraphChunkMinSize: formatCreateCollectionParams.paragraphChunkMinSize,
        maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
        overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
        customReg: formatCreateCollectionParams.chunkSplitter
          ? [formatCreateCollectionParams.chunkSplitter]
          : [],
        backupParse
      });
      return {
        chunks,
        chunkSize: formatCreateCollectionParams.chunkSize,
        indexSize: formatCreateCollectionParams.indexSize
      };
    }

    if (imageIds) {
      // Process image chunks
      const chunks = imageIds.map((imageId: string) => ({
        imageId,
        indexes: []
      }));
      return { chunks };
    }

    return {
      chunks: [],
      chunkSize: formatCreateCollectionParams.chunkSize,
      indexSize: formatCreateCollectionParams.indexSize
    };
  })();

  // 2. auth limit
  await checkDatasetIndexLimit({
    teamId,
    insertLen: predictDataLimitLength(trainingMode, chunks)
  });

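  // Steps 3-5 run inside a single mongo session so the collection, the usage record
  // and the queued training data stay consistent. An existing session is reused;
  // otherwise mongoSessionRun opens one below.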
  const fn = async (session: ClientSession) => {
    // 3. Create collection
    const { _id: collectionId } = await createOneCollection({
      ...formatCreateCollectionParams,
      trainingType,
      chunkSize,
      indexSize,

      hashRawText: rawText ? hashStr(rawText) : undefined,
      rawTextLength: rawText?.length,
      session
    });

    // 4. create training bill
    const trainingUsageId = await (async () => {
      if (billId) return billId;
      const { usageId: newUsageId } = await createTrainingUsage({
        teamId,
        tmbId,
        appName: formatCreateCollectionParams.name,
        billSource: UsageSourceEnum.training,
        vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
        agentModel: getLLMModel(dataset.agentModel)?.name,
        vllmModel: getVlmModel(dataset.vlmModel)?.name,
        session
      });
      return newUsageId;
    })();

    // 5. insert to training queue
    const insertResults = await (async () => {
      if (rawText || imageIds) {
        return pushDataListToTrainingQueue({
          teamId,
          tmbId,
          datasetId: dataset._id,
          collectionId,
          agentModel: dataset.agentModel,
          vectorModel: dataset.vectorModel,
          vlmModel: dataset.vlmModel,
          indexSize,
          mode: trainingMode,
          billId: trainingUsageId,
          data: chunks.map((item, index) => ({
            ...item,
            indexes: item.indexes?.map((text) => ({
              type: DatasetDataIndexTypeEnum.custom,
              text
            })),
            chunkIndex: index
          })),
          session
        });
      } else {
        // No inline chunks: queue the collection for parsing instead
        await pushDatasetToParseQueue({
          teamId,
          tmbId,
          datasetId: dataset._id,
          collectionId,
          billId: trainingUsageId,
          session
        });
        return {
          insertLen: 0
        };
      }
    })();

    return {
      collectionId: String(collectionId),
      insertResults
    };
  };

  if (session) {
    return fn(session);
  }
  return mongoSessionRun(fn);
};

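// Params accepted from the API layer, extended with the team / team-member ids and an
// optional mongo session.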
export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
  teamId: string;
  tmbId: string;
  session?: ClientSession;
};

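/**
 * Create a single dataset collection document: resolve collection tags via
 * createOrGetCollectionTags, insert the document (optionally inside the given
 * session) and, when fileId is an S3 object key, clear the object's TTL so the
 * source file is retained.
 */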
export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
  const {
    teamId,
    parentId,
    datasetId,
    tags,

    fileId,
    rawLink,
    externalFileId,
    externalFileUrl,
    apiFileId,
    apiFileParentId
  } = props;

  const collectionTags = await createOrGetCollectionTags({
    tags,
    teamId,
    datasetId,
    session
  });

  // Create collection
  const [collection] = await MongoDatasetCollection.create(
    [
      {
        ...props,
        _id: undefined,

        parentId: parentId || null,

        tags: collectionTags,

        ...(fileId ? { fileId } : {}),
        ...(rawLink ? { rawLink } : {}),
        ...(externalFileId ? { externalFileId } : {}),
        ...(externalFileUrl ? { externalFileUrl } : {}),
        ...(apiFileId ? { apiFileId } : {}),
        ...(apiFileParentId ? { apiFileParentId } : {})
      }
    ],
    { session, ordered: true }
  );

  if (isS3ObjectKey(fileId, 'dataset')) {
    await removeS3TTL({ key: fileId, bucketName: 'private', session });
  }

  return collection;
}

/* delete collection related images/files */
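// Note: only legacy related images (metadata.relatedImgId) are removed here; files,
// vectors and data rows are cleaned up in delCollection below.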
export const delCollectionRelatedSource = async ({
  collections,
  session
}: {
  collections: {
    teamId: string;
    fileId?: string;
    metadata?: {
      relatedImgId?: string;
    };
  }[];
  session?: ClientSession;
}) => {
  if (collections.length === 0) return;

  const teamId = collections[0].teamId;

  if (!teamId) return Promise.reject('teamId does not exist');

  // FIXME: compatibility with deleting images from legacy parsing
  const relatedImageIds = collections
    .map((item) => item?.metadata?.relatedImgId || '')
    .filter(Boolean);

  // Delete files and images in parallel
  await Promise.all([
    // Delete images
    delImgByRelatedId({
      teamId,
      relateIds: relatedImageIds,
      session
    })
  ]);
};
/**
 * Delete collections and their related data
 */
export async function delCollection({
  collections,
  session,
  delImg = true,
  delFile = true
}: {
  collections: DatasetCollectionSchemaType[];
  session: ClientSession;
  delImg: boolean;
  delFile: boolean;
}) {
  if (collections.length === 0) return;

  const teamId = collections[0].teamId;

  if (!teamId) return Promise.reject('teamId does not exist');

  const s3DatasetSource = getS3DatasetSource();
  const datasetIds = Array.from(new Set(collections.map((item) => String(item.datasetId))));
  const collectionIds = collections.map((item) => String(item._id));

  const imageCollectionIds = collections
    .filter((item) => item.type === DatasetCollectionTypeEnum.images)
    .map((item) => String(item._id));
  const imageDatas = await MongoDatasetData.find(
    {
      teamId,
      datasetId: { $in: datasetIds },
      collectionId: { $in: imageCollectionIds }
    },
    { imageId: 1 }
  ).lean();
  const imageIds = imageDatas
    .map((item) => item.imageId)
    .filter((key) => isS3ObjectKey(key, 'dataset'));

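  // Remove all dependent records (training queue items, data texts, data rows, legacy
  // images, S3 files, vectors) in parallel before the collection documents themselves;
  // the whole batch is wrapped in retryFn and retried on failure.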
  await retryFn(async () => {
    await Promise.all([
      // Delete training data
      MongoDatasetTraining.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_data_texts
      MongoDatasetDataText.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete dataset_datas
      MongoDatasetData.deleteMany({
        teamId,
        datasetId: { $in: datasetIds },
        collectionId: { $in: collectionIds }
      }),
      // Delete images if needed
      ...(delImg // compatibility: remove legacy images
        ? [
            delImgByRelatedId({
              teamId,
              relateIds: collections
                .map((item) => item?.metadata?.relatedImgId || '')
                .filter(Boolean)
            })
          ]
        : []),
      // Delete files if needed
      ...(delFile
        ? [
            s3DatasetSource.deleteDatasetFilesByKeys(
              collections.map((item) => item?.fileId || '').filter(Boolean)
            )
          ]
        : []),
      // Delete vector data
      deleteDatasetDataVector({ teamId, datasetIds, collectionIds })
    ]);

    // delete collections
    await MongoDatasetCollection.deleteMany(
      {
        teamId,
        _id: { $in: collectionIds }
      },
      { session }
    ).lean();

    // delete s3 images which are uploaded by users
    await s3DatasetSource.deleteDatasetFilesByKeys(imageIds);
  });
}