4.6.7-alpha commit (#743)

Co-authored-by: Archer <545436317@qq.com>
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
@@ -1,6 +1,20 @@
-import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import {
+  TrainingModeEnum,
+  DatasetCollectionTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
+import {
+  CollectionWithDatasetType,
+  DatasetCollectionSchemaType
+} from '@fastgpt/global/core/dataset/type';
+import { MongoDatasetTraining } from '../training/schema';
+import { delay } from '@fastgpt/global/common/system/utils';
+import { MongoDatasetData } from '../data/schema';
+import { delImgByRelatedId } from '../../../common/file/image/controller';
+import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
+import { delFileByFileIdList } from '../../../common/file/gridfs/controller';
+import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
 
 export async function createOneCollection({
   teamId,
@@ -85,20 +99,50 @@ export function createDefaultCollection({
   });
 }
 
-// check same collection
-export const getSameRawTextCollection = async ({
-  datasetId,
-  hashRawText
-}: {
-  datasetId: string;
-  hashRawText?: string;
-}) => {
-  if (!hashRawText) return undefined;
-
-  const collection = await MongoDatasetCollection.findOne({
-    datasetId,
-    hashRawText
-  });
-
-  return collection;
-};
+/**
+ * delete collection and it related data
+ */
+export async function delCollectionAndRelatedSources({
+  collections
+}: {
+  collections: (CollectionWithDatasetType | DatasetCollectionSchemaType)[];
+}) {
+  if (collections.length === 0) return;
+
+  const teamId = collections[0].teamId;
+
+  if (!teamId) return Promise.reject('teamId is not exist');
+
+  const collectionIds = collections.map((item) => String(item._id));
+  const fileIdList = collections.map((item) => item?.fileId || '').filter(Boolean);
+  const relatedImageIds = collections
+    .map((item) => item?.metadata?.relatedImgId || '')
+    .filter(Boolean);
+
+  // delete training data
+  await MongoDatasetTraining.deleteMany({
+    teamId,
+    collectionId: { $in: collectionIds }
+  });
+
+  await delay(2000);
+
+  // delete dataset.datas
+  await MongoDatasetData.deleteMany({ teamId, collectionId: { $in: collectionIds } });
+  // delete pg data
+  await deleteDatasetDataVector({ teamId, collectionIds });
+
+  // delete file and imgs
+  await Promise.all([
+    delImgByRelatedId(relatedImageIds),
+    delFileByFileIdList({
+      bucketName: BucketNameEnum.dataset,
+      fileIdList
+    })
+  ]);
+
+  // delete collections
+  await MongoDatasetCollection.deleteMany({
+    _id: { $in: collectionIds }
+  });
+}
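Note: the deletion order in `delCollectionAndRelatedSources` is deliberate — training queue entries go first so no worker re-inserts rows for a collection that is being removed, the `delay(2000)` gives in-flight training tasks time to settle, then dataset rows and their vectors are dropped, then GridFS files and related images, and only last the collection documents, so an interruption part-way leaves an orphaned collection record rather than orphaned data. A minimal sketch of a call site combining the helpers touched by this commit (the import paths, wrapper name, and `fields` projection are assumptions, not part of the diff):

```ts
import { findCollectionAndChild } from './utils'; // assumed module path
import { delCollectionAndRelatedSources } from './controller'; // assumed module path

// Sketch: delete a collection together with its whole subtree.
// The projection must keep teamId/fileId/metadata, since
// delCollectionAndRelatedSources reads those fields off each collection.
export async function delCollectionTree(teamId: string, datasetId: string, collectionId: string) {
  const collections = await findCollectionAndChild({
    teamId,
    datasetId,
    collectionId,
    fields: '_id teamId fileId metadata'
  });
  await delCollectionAndRelatedSources({ collections });
}
```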
@@ -1,7 +1,7 @@
 import { connectionMongo, type Model } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
-import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
+import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
 import { DatasetCollectionName } from '../schema';
 import {
   TeamCollectionName,
@@ -91,11 +91,19 @@ const DatasetCollectionSchema = new Schema({
 });
 
 try {
-  DatasetCollectionSchema.index({ teamId: 1 });
-  DatasetCollectionSchema.index({ datasetId: 1 });
-  DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
-  DatasetCollectionSchema.index({ updateTime: -1 });
-  DatasetCollectionSchema.index({ hashRawText: -1 });
+  // auth file
+  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 }, { background: true });
+
+  // list collection; deep find collections
+  DatasetCollectionSchema.index(
+    {
+      teamId: 1,
+      datasetId: 1,
+      parentId: 1,
+      updateTime: -1
+    },
+    { background: true }
+  );
 } catch (error) {
   console.log(error);
 }
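Note: the schema hunk trades five single-field indexes for two compound ones. Any query that filters on a left prefix of `{ teamId, datasetId, parentId }` and sorts by `updateTime` can be served by the single compound index, which makes the standalone `teamId` and `datasetId` indexes redundant, and `background: true` keeps the builds from blocking the collection on deploy. A sketch of the query shape the new list index is built for (the wrapper function and its arguments are illustrative):

```ts
import { MongoDatasetCollection } from './schema';

// List a folder's child collections, newest first. Equality on the index
// prefix (teamId, datasetId, parentId) plus a sort on its final key
// (updateTime: -1) lets MongoDB walk the compound index directly,
// with no in-memory sort stage.
export async function listCollections(teamId: string, datasetId: string, parentId: string | null) {
  return MongoDatasetCollection.find({ teamId, datasetId, parentId })
    .sort({ updateTime: -1 })
    .lean();
}
```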
@@ -4,16 +4,32 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { MongoDatasetTraining } from '../training/schema';
 import { urlsFetch } from '../../../common/string/cheerio';
-import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
+import {
+  DatasetCollectionTypeEnum,
+  TrainingModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 
 /**
  * get all collection by top collectionId
  */
-export async function findCollectionAndChild(id: string, fields = '_id parentId name metadata') {
+export async function findCollectionAndChild({
+  teamId,
+  datasetId,
+  collectionId,
+  fields = '_id parentId name metadata'
+}: {
+  teamId: string;
+  datasetId: string;
+  collectionId: string;
+  fields?: string;
+}) {
   async function find(id: string) {
     // find children
-    const children = await MongoDatasetCollection.find({ parentId: id }, fields);
+    const children = await MongoDatasetCollection.find(
+      { teamId, datasetId, parentId: id },
+      fields
+    ).lean();
 
     let collections = children;
 
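Note: `findCollectionAndChild` moves from positional arguments to a single options object and now scopes every lookup by `teamId` and `datasetId`, so a crafted `parentId` can no longer walk into another team's collection tree; `.lean()` additionally returns plain objects instead of full Mongoose documents. Call sites need a mechanical update along these lines (a sketch, assuming `teamId`, `datasetId`, and `collectionId` are in scope):

```ts
// Before this commit (positional arguments):
// const collections = await findCollectionAndChild(collectionId);

// After (object argument, tenant-scoped; fields is optional and
// defaults to '_id parentId name metadata'):
const collections = await findCollectionAndChild({
  teamId,
  datasetId,
  collectionId
});
```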
@@ -25,8 +41,8 @@ export async function findCollectionAndChild(id: string, fields = '_id parentId
     return collections;
   }
   const [collection, childCollections] = await Promise.all([
-    MongoDatasetCollection.findById(id, fields),
-    find(id)
+    MongoDatasetCollection.findById(collectionId, fields),
+    find(collectionId)
   ]);
 
   if (!collection) {
@@ -107,8 +123,8 @@ export const getCollectionAndRawText = async ({
   });
 
   return {
-    title: result[0].title,
-    rawText: result[0].content
+    title: result[0]?.title,
+    rawText: result[0]?.content
   };
 }
 
@@ -121,7 +137,7 @@ export const getCollectionAndRawText = async ({
   })();
 
   const hashRawText = hashStr(rawText);
-  const isSameRawText = col.hashRawText === hashRawText;
+  const isSameRawText = rawText && col.hashRawText === hashRawText;
 
   return {
     collection: col,
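Note: `hashStr` returns a digest even for an empty string, so under the old check a re-fetch that came back empty could collide with a stored `hashRawText` and be reported as "unchanged", skipping re-processing. The added `rawText &&` guard routes empty fetches down the changed path. A small self-contained illustration (the `col` object is fabricated for the example; `hashStr` is the real import shown in the diff):

```ts
import { hashStr } from '@fastgpt/global/common/string/tools';

// Stored hash that happens to equal the digest of empty text,
// e.g. the collection was once saved with no raw text.
const col = { hashRawText: hashStr('') };
const rawText = ''; // the re-fetch failed and returned nothing

const withoutGuard = col.hashRawText === hashStr(rawText); // true: wrongly "same"
const withGuard = Boolean(rawText) && col.hashRawText === hashStr(rawText); // false: re-process
```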
@@ -161,8 +177,7 @@ export const reloadCollectionChunks = async ({
   // split data
   const { chunks } = splitText2Chunks({
     text: newRawText,
-    chunkLen: col.chunkSize || 512,
-    countTokens: false
+    chunkLen: col.chunkSize || 512
   });
 
   // insert to training queue