4.6.7 first pr (#726)

This commit is contained in:
Archer
2024-01-10 23:35:04 +08:00
committed by GitHub
parent 414b693303
commit 006ad17c6a
186 changed files with 2996 additions and 1838 deletions

View File

@@ -4,7 +4,7 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { MongoDatasetTraining } from '../training/schema';
import { urlsFetch } from '../../../common/string/cheerio';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { hashStr } from '@fastgpt/global/common/string/tools';
/**
@@ -92,8 +92,12 @@ export const getCollectionAndRawText = async ({
return Promise.reject('Collection not found');
}
const rawText = await (async () => {
if (newRawText) return newRawText;
const { title, rawText } = await (async () => {
if (newRawText)
return {
title: '',
rawText: newRawText
};
// link
if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
// crawl new data
@@ -102,12 +106,18 @@ export const getCollectionAndRawText = async ({
selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
});
return result[0].content;
return {
title: result[0].title,
rawText: result[0].content
};
}
// file
return '';
return {
title: '',
rawText: ''
};
})();
const hashRawText = hashStr(rawText);
@@ -115,6 +125,7 @@ export const getCollectionAndRawText = async ({
return {
collection: col,
title,
rawText,
isSameRawText
};
@@ -135,6 +146,7 @@ export const reloadCollectionChunks = async ({
rawText?: string;
}) => {
const {
title,
rawText: newRawText,
collection: col,
isSameRawText
@@ -154,6 +166,11 @@ export const reloadCollectionChunks = async ({
});
// insert to training queue
const model = await (() => {
if (col.trainingType === TrainingModeEnum.chunk) return col.datasetId.vectorModel;
if (col.trainingType === TrainingModeEnum.qa) return col.datasetId.agentModel;
return Promise.reject('Training model error');
})();
await MongoDatasetTraining.insertMany(
chunks.map((item, i) => ({
teamId: col.teamId,
@@ -163,7 +180,7 @@ export const reloadCollectionChunks = async ({
billId,
mode: col.trainingType,
prompt: '',
model: col.datasetId.vectorModel,
model,
q: item,
a: '',
chunkIndex: i
@@ -172,6 +189,7 @@ export const reloadCollectionChunks = async ({
// update raw text
await MongoDatasetCollection.findByIdAndUpdate(col._id, {
...(title && { name: title }),
rawTextLength: newRawText.length,
hashRawText: hashStr(newRawText)
});