mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
New docs structure and dataset i18n (#551)
* perf: check balance * md * lock way * i18n * docs * doc * i18n * update doc * feat: one link sync * feat: one link sync * feat: one link sync * feat: one link sync * feat: one link sync * feat: one link sync * feat: one link sync
This commit is contained in:
@@ -1,5 +1,11 @@
|
||||
import type { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type.d';
|
||||
import { MongoDatasetCollection } from './schema';
|
||||
import { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type';
|
||||
import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type.d';
|
||||
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
|
||||
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { MongoDatasetTraining } from '../training/schema';
|
||||
import { urlsFetch } from '@fastgpt/global/common/file/tools';
|
||||
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
|
||||
|
||||
/**
|
||||
* get all collection by top collectionId
|
||||
@@ -58,3 +64,64 @@ export function getCollectionUpdateTime({ name, time }: { time?: Date; name: str
|
||||
if (name.startsWith('手动') || ['manual', 'mark'].includes(name)) return new Date('2999/9/9');
|
||||
return new Date();
|
||||
}
|
||||
|
||||
/**
 * Load the text of one collection, split it into chunks and push every chunk
 * onto the training queue.
 *
 * Text source, in priority order:
 *  1. `rawText`, when the caller passes a non-empty string;
 *  2. for a link collection that has a `rawLink`, a fresh crawl of that link
 *     via `urlsFetch` (using the dataset's `websiteConfig.selector`, if any);
 *  3. otherwise an empty string.
 *
 * @param collectionId - id of the collection to load; it is looked up with its
 *   `datasetId` populated.
 * @param tmbId - stored on every queued training record.
 * @param billId - optional; stored on every queued training record.
 * @param rawText - optional text to use instead of crawling the link.
 * @returns A promise that resolves once the chunks (if any) are inserted. It
 *   rejects with `DatasetErrEnum.unCreateCollection` when the collection
 *   cannot be found.
 */
export const loadingOneChunkCollection = async ({
  collectionId,
  tmbId,
  billId,
  rawText
}: {
  collectionId: string;
  tmbId: string;
  billId?: string;
  rawText?: string;
}) => {
  const collection = (await MongoDatasetCollection.findById(collectionId).populate(
    'datasetId'
  )) as CollectionWithDatasetType;

  if (!collection) {
    return Promise.reject(DatasetErrEnum.unCreateCollection);
  }

  const newRawText = await (async () => {
    if (rawText) return rawText;

    // link
    if (collection.type === DatasetCollectionTypeEnum.link && collection.rawLink) {
      // crawl new data
      const result = await urlsFetch({
        urlList: [collection.rawLink],
        selector: collection.datasetId?.websiteConfig?.selector
      });

      // Guard against an empty crawl result or an entry without content, so a
      // failed fetch degrades to "no text" instead of throwing a TypeError.
      return result[0]?.content ?? '';
    }

    // file

    return '';
  })();

  // split data
  const { chunks } = splitText2Chunks({
    text: newRawText,
    chunkLen: collection.chunkSize || 512
  });

  // Nothing to queue: skip the database round trip.
  if (chunks.length === 0) return;

  // insert to training queue
  await MongoDatasetTraining.insertMany(
    chunks.map((item, i) => ({
      teamId: collection.teamId,
      tmbId,
      datasetId: collection.datasetId._id,
      collectionId: collection._id,
      billId,
      mode: collection.trainingType,
      prompt: '',
      model: collection.datasetId.vectorModel,
      q: item,
      a: '',
      chunkIndex: i
    }))
  );
};
|
||||
|
Reference in New Issue
Block a user