New docs structure and dataset i18n (#551)

* perf: check balance

* md

* lock way

* i18n

* docs

* doc

* i18n

* update doc

* feat: one link sync

* feat: one link sync

* feat: one link sync

* feat: one link sync

* feat: one link sync

* feat: one link sync

* feat: one link sync
This commit is contained in:
Archer
2023-12-04 21:37:07 +08:00
committed by GitHub
parent c3ae38df8b
commit 62e87551ac
141 changed files with 961 additions and 469 deletions

View File

@@ -44,7 +44,6 @@ const DatasetCollectionSchema = new Schema({
enum: Object.keys(DatasetCollectionTypeMap),
required: true
},
name: {
type: String,
required: true

View File

@@ -1,5 +1,11 @@
import type { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type.d';
import { MongoDatasetCollection } from './schema';
import { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type';
import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type.d';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { MongoDatasetTraining } from '../training/schema';
import { urlsFetch } from '@fastgpt/global/common/file/tools';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
/**
* get all collection by top collectionId
@@ -58,3 +64,64 @@ export function getCollectionUpdateTime({ name, time }: { time?: Date; name: str
if (name.startsWith('手动') || ['manual', 'mark'].includes(name)) return new Date('2999/9/9');
return new Date();
}
/**
 * Split a collection's text into chunks and queue every chunk for training.
 *
 * The text is chosen in this order:
 * 1. `rawText`, when the caller passes a non-empty string;
 * 2. for a link collection that has a `rawLink`, the content returned by `urlsFetch`
 *    for that link (using the dataset's `websiteConfig.selector`, if any);
 * 3. otherwise an empty string.
 *
 * @param collectionId - Id of the collection; it is loaded with `datasetId` populated.
 * @param tmbId - Copied onto every queued training record.
 * @param billId - Optional; copied onto every queued training record.
 * @param rawText - Optional text that takes precedence over re-fetching the link.
 * @returns A promise that resolves once the chunks were passed to
 * `MongoDatasetTraining.insertMany`. It rejects with `DatasetErrEnum.unCreateCollection`
 * when no collection matches `collectionId`.
 */
export const loadingOneChunkCollection = async ({
  collectionId,
  tmbId,
  billId,
  rawText
}: {
  collectionId: string;
  tmbId: string;
  billId?: string;
  rawText?: string;
}) => {
  const collection = (await MongoDatasetCollection.findById(collectionId).populate(
    'datasetId'
  )) as CollectionWithDatasetType;

  // The cast above hides a possible null, so the lookup result is checked at runtime.
  if (!collection) {
    return Promise.reject(DatasetErrEnum.unCreateCollection);
  }

  const newRawText = await (async () => {
    if (rawText) return rawText;

    // Link collection: crawl the page again to get fresh content.
    if (collection.type === DatasetCollectionTypeEnum.link && collection.rawLink) {
      const result = await urlsFetch({
        urlList: [collection.rawLink],
        selector: collection.datasetId?.websiteConfig?.selector
      });

      // Do not dereference a missing first entry: an empty result list would otherwise
      // reject with an opaque TypeError. Fall back to empty text, like the branch below.
      // NOTE(review): whether urlsFetch can return fewer entries than requested is not
      // visible from this file - confirm.
      return result[0]?.content ?? '';
    }

    // Any other collection type without caller-supplied text has nothing to load here.
    return '';
  })();

  // Split the text; a falsy chunkSize falls back to 512.
  const { chunks } = splitText2Chunks({
    text: newRawText,
    chunkLen: collection.chunkSize || 512
  });

  // Insert one training record per chunk, keeping the chunk order in `chunkIndex`.
  await MongoDatasetTraining.insertMany(
    chunks.map((item, i) => ({
      teamId: collection.teamId,
      tmbId,
      datasetId: collection.datasetId._id,
      collectionId: collection._id,
      billId,
      mode: collection.trainingType,
      prompt: '',
      model: collection.datasetId.vectorModel,
      q: item,
      a: '',
      chunkIndex: i
    }))
  );
};

View File

@@ -41,8 +41,8 @@ export async function delCollectionRelevantData({
collectionIds: string[];
fileIds: string[];
}) {
collectionIds = collectionIds.map((item) => String(item));
const filterFileIds = fileIds.filter(Boolean);
collectionIds = collectionIds.filter(Boolean).map((item) => String(item));
const filterFileIds = fileIds.filter(Boolean).map((item) => String(item));
// delete training data
await MongoDatasetTraining.deleteMany({

View File

@@ -0,0 +1,21 @@
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetTraining } from './schema';
/**
 * Set `lockTime` to `new Date('2999/5/5')` on every training record of a team.
 *
 * A failed update is retried after a one-second delay for as long as the remaining
 * retry budget is greater than zero; after that the last error is propagated.
 * NOTE(review): the far-future date presumably keeps the records locked indefinitely -
 * confirm against the code that reads `lockTime`.
 *
 * @param teamId - Value matched against the `teamId` field of the training records.
 * @param retry - Number of additional attempts allowed after a failure (default 3).
 * @returns A promise that resolves with no value on success and rejects with the last
 * update error once the retries are used up.
 */
export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promise<any> => {
  let attemptsLeft = retry;

  for (;;) {
    try {
      await MongoDatasetTraining.updateMany({ teamId }, { lockTime: new Date('2999/5/5') });
      return;
    } catch (err) {
      if (attemptsLeft > 0) {
        // Back off briefly, then go around again with one attempt fewer.
        await delay(1000);
        attemptsLeft -= 1;
        continue;
      }
      throw err;
    }
  }
};