4.6.7 first pr (#726)

This commit is contained in:
Archer
2024-01-10 23:35:04 +08:00
committed by GitHub
parent 414b693303
commit 006ad17c6a
186 changed files with 2996 additions and 1838 deletions

View File

@@ -1,7 +1,4 @@
import {
DatasetCollectionTrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constant';
import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
@@ -12,11 +9,15 @@ export async function createOneCollection({
parentId,
datasetId,
type,
trainingType = DatasetCollectionTrainingModeEnum.manual,
trainingType = TrainingModeEnum.chunk,
chunkSize = 0,
chunkSplitter,
qaPrompt,
fileId,
rawLink,
qaPrompt,
hashRawText,
rawTextLength,
metadata = {},
@@ -30,11 +31,15 @@ export async function createOneCollection({
datasetId,
name,
type,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
fileId,
rawLink,
qaPrompt,
rawTextLength,
hashRawText,
metadata
@@ -74,7 +79,7 @@ export function createDefaultCollection({
datasetId,
parentId,
type: DatasetCollectionTypeEnum.virtual,
trainingType: DatasetCollectionTrainingModeEnum.manual,
trainingType: TrainingModeEnum.chunk,
chunkSize: 0,
updateTime: new Date('2099')
});

View File

@@ -1,10 +1,7 @@
import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import {
DatasetCollectionTrainingTypeMap,
DatasetCollectionTypeMap
} from '@fastgpt/global/core/dataset/constant';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
@@ -56,15 +53,23 @@ const DatasetCollectionSchema = new Schema({
type: Date,
default: () => new Date()
},
trainingType: {
type: String,
enum: Object.keys(DatasetCollectionTrainingTypeMap),
enum: Object.keys(TrainingTypeMap),
required: true
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: {
type: String
},
qaPrompt: {
type: String
},
fileId: {
type: Schema.Types.ObjectId,
ref: 'dataset.files'
@@ -72,9 +77,6 @@ const DatasetCollectionSchema = new Schema({
rawLink: {
type: String
},
qaPrompt: {
type: String
},
rawTextLength: {
type: Number
@@ -89,8 +91,9 @@ const DatasetCollectionSchema = new Schema({
});
try {
DatasetCollectionSchema.index({ teamId: 1 });
DatasetCollectionSchema.index({ datasetId: 1 });
DatasetCollectionSchema.index({ datasetId: 1, parentId: 1 });
DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
DatasetCollectionSchema.index({ updateTime: -1 });
DatasetCollectionSchema.index({ hashRawText: -1 });
} catch (error) {

View File

@@ -4,7 +4,7 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { MongoDatasetTraining } from '../training/schema';
import { urlsFetch } from '../../../common/string/cheerio';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { hashStr } from '@fastgpt/global/common/string/tools';
/**
@@ -92,8 +92,12 @@ export const getCollectionAndRawText = async ({
return Promise.reject('Collection not found');
}
const rawText = await (async () => {
if (newRawText) return newRawText;
const { title, rawText } = await (async () => {
if (newRawText)
return {
title: '',
rawText: newRawText
};
// link
if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
// crawl new data
@@ -102,12 +106,18 @@ export const getCollectionAndRawText = async ({
selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
});
return result[0].content;
return {
title: result[0].title,
rawText: result[0].content
};
}
// file
return '';
return {
title: '',
rawText: ''
};
})();
const hashRawText = hashStr(rawText);
@@ -115,6 +125,7 @@ export const getCollectionAndRawText = async ({
return {
collection: col,
title,
rawText,
isSameRawText
};
@@ -135,6 +146,7 @@ export const reloadCollectionChunks = async ({
rawText?: string;
}) => {
const {
title,
rawText: newRawText,
collection: col,
isSameRawText
@@ -154,6 +166,11 @@ export const reloadCollectionChunks = async ({
});
// insert to training queue
// Pick the model stored on each queued training record, based on the
// collection's training type:
//   - TrainingModeEnum.chunk -> the dataset's vectorModel
//   - TrainingModeEnum.qa    -> the dataset's agentModel
// Any other training type rejects with 'Training model error'; because the
// result is awaited, that rejection surfaces before insertMany below is called.
const model = await (() => {
if (col.trainingType === TrainingModeEnum.chunk) return col.datasetId.vectorModel;
if (col.trainingType === TrainingModeEnum.qa) return col.datasetId.agentModel;
return Promise.reject('Training model error');
})();
await MongoDatasetTraining.insertMany(
chunks.map((item, i) => ({
teamId: col.teamId,
@@ -163,7 +180,7 @@ export const reloadCollectionChunks = async ({
billId,
mode: col.trainingType,
prompt: '',
model: col.datasetId.vectorModel,
model,
q: item,
a: '',
chunkIndex: i
@@ -172,6 +189,7 @@ export const reloadCollectionChunks = async ({
// update raw text
await MongoDatasetCollection.findByIdAndUpdate(col._id, {
...(title && { name: title }),
rawTextLength: newRawText.length,
hashRawText: hashStr(newRawText)
});