4.6.7-alpha commit (#743)

Co-authored-by: Archer <545436317@qq.com>
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
Authored by Archer on 2024-01-19 11:17:28 +08:00; committed by GitHub.
parent 8ee7407c4c
commit c031e6dcc9
324 changed files with 8509 additions and 4757 deletions

View File

@@ -1,6 +1,20 @@
-import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import {
+  TrainingModeEnum,
+  DatasetCollectionTypeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
+import {
+  CollectionWithDatasetType,
+  DatasetCollectionSchemaType
+} from '@fastgpt/global/core/dataset/type';
+import { MongoDatasetTraining } from '../training/schema';
+import { delay } from '@fastgpt/global/common/system/utils';
+import { MongoDatasetData } from '../data/schema';
+import { delImgByRelatedId } from '../../../common/file/image/controller';
+import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
+import { delFileByFileIdList } from '../../../common/file/gridfs/controller';
+import { BucketNameEnum } from '@fastgpt/global/common/file/constants';

 export async function createOneCollection({
   teamId,
@@ -85,20 +99,50 @@ export function createDefaultCollection({
   });
 }

-// check same collection
-export const getSameRawTextCollection = async ({
-  datasetId,
-  hashRawText
-}: {
-  datasetId: string;
-  hashRawText?: string;
-}) => {
-  if (!hashRawText) return undefined;
-
-  const collection = await MongoDatasetCollection.findOne({
-    datasetId,
-    hashRawText
-  });
-
-  return collection;
-};
+/**
+ * Delete collections and their related data
+ */
+export async function delCollectionAndRelatedSources({
+  collections
+}: {
+  collections: (CollectionWithDatasetType | DatasetCollectionSchemaType)[];
+}) {
+  if (collections.length === 0) return;
+
+  const teamId = collections[0].teamId;
+  if (!teamId) return Promise.reject('teamId does not exist');
+
+  const collectionIds = collections.map((item) => String(item._id));
+  const fileIdList = collections.map((item) => item?.fileId || '').filter(Boolean);
+  const relatedImageIds = collections
+    .map((item) => item?.metadata?.relatedImgId || '')
+    .filter(Boolean);
+
+  // delete training data
+  await MongoDatasetTraining.deleteMany({
+    teamId,
+    collectionId: { $in: collectionIds }
+  });
+
+  await delay(2000);
+
+  // delete dataset.datas
+  await MongoDatasetData.deleteMany({ teamId, collectionId: { $in: collectionIds } });
+  // delete pg (vector) data
+  await deleteDatasetDataVector({ teamId, collectionIds });
+
+  // delete files and images
+  await Promise.all([
+    delImgByRelatedId(relatedImageIds),
+    delFileByFileIdList({
+      bucketName: BucketNameEnum.dataset,
+      fileIdList
+    })
+  ]);
+
+  // delete collections
+  await MongoDatasetCollection.deleteMany({
+    _id: { $in: collectionIds }
+  });
+}
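
For context, a minimal sketch of how the new cascade delete might be invoked. The wrapper function, its scoping query, and the './controller' import path are assumptions for illustration, not part of this commit:

import { MongoDatasetCollection } from './schema';
import { delCollectionAndRelatedSources } from './controller';

// Hypothetical caller: remove every collection in a dataset, together with
// its training jobs, data rows, vectors, files, and related images.
export async function clearDatasetCollections(teamId: string, datasetId: string) {
  const collections = await MongoDatasetCollection.find({ teamId, datasetId }).lean();
  await delCollectionAndRelatedSources({ collections });
}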

View File

@@ -1,7 +1,7 @@
 import { connectionMongo, type Model } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
-import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
+import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
 import { DatasetCollectionName } from '../schema';
 import {
   TeamCollectionName,
@@ -91,11 +91,19 @@ const DatasetCollectionSchema = new Schema({
 });

 try {
-  DatasetCollectionSchema.index({ teamId: 1 });
-  DatasetCollectionSchema.index({ datasetId: 1 });
-  DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
-  DatasetCollectionSchema.index({ updateTime: -1 });
-  DatasetCollectionSchema.index({ hashRawText: -1 });
+  // auth file
+  DatasetCollectionSchema.index({ teamId: 1, fileId: 1 }, { background: true });
+
+  // list collections; deep-find collections
+  DatasetCollectionSchema.index(
+    {
+      teamId: 1,
+      datasetId: 1,
+      parentId: 1,
+      updateTime: -1
+    },
+    { background: true }
+  );
 } catch (error) {
   console.log(error);
 }
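
The five single-field indexes are replaced because the hot queries (listing a folder, deep-finding children) filter on teamId, datasetId, and parentId and sort by updateTime, so one compound index can cover the whole shape. An illustrative query of that form (not taken from this commit):

// Equality on teamId/datasetId/parentId, then a sort on updateTime:
// exactly the prefix-plus-sort pattern the new compound index serves.
const topLevel = await MongoDatasetCollection.find({
  teamId,
  datasetId,
  parentId: null
})
  .sort({ updateTime: -1 })
  .lean();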

View File

@@ -4,16 +4,32 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { MongoDatasetTraining } from '../training/schema';
 import { urlsFetch } from '../../../common/string/cheerio';
-import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
+import {
+  DatasetCollectionTypeEnum,
+  TrainingModeEnum
+} from '@fastgpt/global/core/dataset/constants';
 import { hashStr } from '@fastgpt/global/common/string/tools';

 /**
  * get all collections under a top collectionId
  */
-export async function findCollectionAndChild(id: string, fields = '_id parentId name metadata') {
+export async function findCollectionAndChild({
+  teamId,
+  datasetId,
+  collectionId,
+  fields = '_id parentId name metadata'
+}: {
+  teamId: string;
+  datasetId: string;
+  collectionId: string;
+  fields?: string;
+}) {
   async function find(id: string) {
     // find children
-    const children = await MongoDatasetCollection.find({ parentId: id }, fields);
+    const children = await MongoDatasetCollection.find(
+      { teamId, datasetId, parentId: id },
+      fields
+    ).lean();

     let collections = children;
@@ -25,8 +41,8 @@ export async function findCollectionAndChild(id: string, fields = '_id parentId
     return collections;
   }

   const [collection, childCollections] = await Promise.all([
-    MongoDatasetCollection.findById(id, fields),
-    find(id)
+    MongoDatasetCollection.findById(collectionId, fields),
+    find(collectionId)
   ]);

   if (!collection) {
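
A sketch of what a call site looks like after the signature change; the surrounding variables are placeholders, not code from this commit:

// The helper now requires team and dataset scoping up front instead of
// trusting the collection id alone, so cross-tenant reads cannot slip in.
const collections = await findCollectionAndChild({
  teamId: collection.teamId,
  datasetId: collection.datasetId,
  collectionId: String(collection._id),
  fields: '_id parentId name metadata'
});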
@@ -107,8 +123,8 @@ export const getCollectionAndRawText = async ({
   });

   return {
-    title: result[0].title,
-    rawText: result[0].content
+    title: result[0]?.title,
+    rawText: result[0]?.content
   };
 }
@@ -121,7 +137,7 @@
   })();

   const hashRawText = hashStr(rawText);
-  const isSameRawText = col.hashRawText === hashRawText;
+  const isSameRawText = rawText && col.hashRawText === hashRawText;

   return {
     collection: col,
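
The added rawText && guard means an empty re-fetch is never classified as "same content", so it can no longer silently skip a reload when the stored hash happens to match. A toy illustration of the assumed behavior:

// With the guard, empty content short-circuits to a falsy value:
const rawText = '';
const isSameRawText = rawText && col.hashRawText === hashStr(rawText);
// isSameRawText is '' (falsy), so the caller proceeds to re-split and re-train.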
@@ -161,8 +177,7 @@ export const reloadCollectionChunks = async ({
   // split data
   const { chunks } = splitText2Chunks({
     text: newRawText,
-    chunkLen: col.chunkSize || 512,
-    countTokens: false
+    chunkLen: col.chunkSize || 512
   });

   // insert to training queue
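
Dropping countTokens tracks the slimmer splitText2Chunks options. A standalone example of the remaining call shape; the sample text is made up, and the return is assumed to be { chunks: string[] } as used above:

import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Split source text into roughly chunkLen-sized chunks for the training queue.
const { chunks } = splitText2Chunks({
  text: 'First section...\n\nSecond section...',
  chunkLen: 512
});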