Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 13:03:50 +00:00)
4.6.7 first pr (#726)
@@ -1,7 +1,4 @@
-import {
-  DatasetCollectionTrainingModeEnum,
-  DatasetCollectionTypeEnum
-} from '@fastgpt/global/core/dataset/constant';
+import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
 import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
 import { MongoDatasetCollection } from './schema';
 
@@ -12,11 +9,15 @@ export async function createOneCollection({
   parentId,
   datasetId,
   type,
-  trainingType = DatasetCollectionTrainingModeEnum.manual,
+
+  trainingType = TrainingModeEnum.chunk,
   chunkSize = 0,
+  chunkSplitter,
+  qaPrompt,
+
   fileId,
   rawLink,
-  qaPrompt,
+
   hashRawText,
   rawTextLength,
   metadata = {},
@@ -30,11 +31,15 @@ export async function createOneCollection({
     datasetId,
     name,
     type,
+
     trainingType,
     chunkSize,
+    chunkSplitter,
+    qaPrompt,
+
     fileId,
     rawLink,
-    qaPrompt,
+
     rawTextLength,
     hashRawText,
     metadata
@@ -74,7 +79,7 @@ export function createDefaultCollection({
     datasetId,
     parentId,
     type: DatasetCollectionTypeEnum.virtual,
-    trainingType: DatasetCollectionTrainingModeEnum.manual,
+    trainingType: TrainingModeEnum.chunk,
     chunkSize: 0,
     updateTime: new Date('2099')
   });
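With this file's changes, collection creation defaults to TrainingModeEnum.chunk (the old DatasetCollectionTrainingModeEnum.manual default is gone) and accepts a chunkSplitter alongside chunkSize and qaPrompt. A minimal call-site sketch follows; everything not visible in the diff (the import path of createOneCollection, the teamId field, the sample values) is an assumption, not part of this commit.

import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import { createOneCollection } from './controller'; // assumed location of the edited file

// Hypothetical sketch: create a link collection that will be trained in chunk mode.
async function createLinkCollection(params: { teamId: string; datasetId: string; rawLink: string }) {
  return createOneCollection({
    ...params,
    name: 'example link collection', // assumed value
    type: DatasetCollectionTypeEnum.link, // enum member confirmed by the diff

    trainingType: TrainingModeEnum.chunk, // new default, replaces DatasetCollectionTrainingModeEnum.manual
    chunkSize: 512, // assumed value; the parameter itself defaults to 0 above
    chunkSplitter: '\n', // newly accepted parameter
    qaPrompt: ''
  });
}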
@@ -1,10 +1,7 @@
 import { connectionMongo, type Model } from '../../../common/mongo';
 const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
-import {
-  DatasetCollectionTrainingTypeMap,
-  DatasetCollectionTypeMap
-} from '@fastgpt/global/core/dataset/constant';
+import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
 import { DatasetCollectionName } from '../schema';
 import {
   TeamCollectionName,
@@ -56,15 +53,23 @@ const DatasetCollectionSchema = new Schema({
     type: Date,
     default: () => new Date()
   },
+
   trainingType: {
     type: String,
-    enum: Object.keys(DatasetCollectionTrainingTypeMap),
+    enum: Object.keys(TrainingTypeMap),
     required: true
   },
   chunkSize: {
     type: Number,
     required: true
   },
+  chunkSplitter: {
+    type: String
+  },
+  qaPrompt: {
+    type: String
+  },
+
   fileId: {
     type: Schema.Types.ObjectId,
     ref: 'dataset.files'
@@ -72,9 +77,6 @@ const DatasetCollectionSchema = new Schema({
   rawLink: {
     type: String
   },
-  qaPrompt: {
-    type: String
-  },
 
   rawTextLength: {
     type: Number
@@ -89,8 +91,9 @@ const DatasetCollectionSchema = new Schema({
 });
 
 try {
+  DatasetCollectionSchema.index({ teamId: 1 });
   DatasetCollectionSchema.index({ datasetId: 1 });
-  DatasetCollectionSchema.index({ datasetId: 1, parentId: 1 });
+  DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
   DatasetCollectionSchema.index({ updateTime: -1 });
   DatasetCollectionSchema.index({ hashRawText: -1 });
 } catch (error) {
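The schema hunks track the same rename: trainingType now validates against Object.keys(TrainingTypeMap), per-collection chunkSplitter and qaPrompt fields are stored, and a teamId index plus a compound teamId/datasetId/parentId index are added. The constants themselves live in @fastgpt/global/core/dataset/constant and are not part of this diff; a plausible shape, inferred only from the members the commit references (chunk and qa), is sketched below.

// Hypothetical reconstruction of the renamed constants; only the member names
// TrainingModeEnum.chunk and TrainingModeEnum.qa are confirmed by this commit.
export enum TrainingModeEnum {
  chunk = 'chunk',
  qa = 'qa'
}

export const TrainingTypeMap: Record<TrainingModeEnum, { label: string }> = {
  [TrainingModeEnum.chunk]: { label: 'chunk' },
  [TrainingModeEnum.qa]: { label: 'qa' }
};

// The schema's enum constraint then resolves to:
// Object.keys(TrainingTypeMap) === ['chunk', 'qa']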
@@ -4,7 +4,7 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { MongoDatasetTraining } from '../training/schema';
 import { urlsFetch } from '../../../common/string/cheerio';
-import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 
 /**
@@ -92,8 +92,12 @@ export const getCollectionAndRawText = async ({
     return Promise.reject('Collection not found');
   }
 
-  const rawText = await (async () => {
-    if (newRawText) return newRawText;
+  const { title, rawText } = await (async () => {
+    if (newRawText)
+      return {
+        title: '',
+        rawText: newRawText
+      };
     // link
     if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
       // crawl new data
@@ -102,12 +106,18 @@ export const getCollectionAndRawText = async ({
         selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
       });
 
-      return result[0].content;
+      return {
+        title: result[0].title,
+        rawText: result[0].content
+      };
     }
 
     // file
 
-    return '';
+    return {
+      title: '',
+      rawText: ''
+    };
   })();
 
   const hashRawText = hashStr(rawText);
@@ -115,6 +125,7 @@ export const getCollectionAndRawText = async ({
 
   return {
     collection: col,
+    title,
     rawText,
     isSameRawText
   };
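getCollectionAndRawText now resolves a title in addition to the raw text: for link collections the crawled page title is used, otherwise it is an empty string, and the returned object gains a title field. A sketch of a consumer follows, under assumed argument names (only rawText and the returned fields appear in the diff).

import { getCollectionAndRawText } from './utils'; // assumed location of the edited file

// Hypothetical caller: re-fetch a collection's text and skip work when nothing changed.
async function syncCollectionText(collectionId: string) {
  const { collection, title, rawText, isSameRawText } = await getCollectionAndRawText({
    collectionId // assumed parameter name
  });

  if (isSameRawText) return; // hash of the raw text is unchanged
  console.log(`Collection ${collection._id} updated: "${title}" (${rawText.length} chars)`);
}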
@@ -135,6 +146,7 @@ export const reloadCollectionChunks = async ({
   rawText?: string;
 }) => {
   const {
+    title,
     rawText: newRawText,
     collection: col,
     isSameRawText
@@ -154,6 +166,11 @@ export const reloadCollectionChunks = async ({
   });
 
   // insert to training queue
+  const model = await (() => {
+    if (col.trainingType === TrainingModeEnum.chunk) return col.datasetId.vectorModel;
+    if (col.trainingType === TrainingModeEnum.qa) return col.datasetId.agentModel;
+    return Promise.reject('Training model error');
+  })();
   await MongoDatasetTraining.insertMany(
     chunks.map((item, i) => ({
       teamId: col.teamId,
@@ -163,7 +180,7 @@ export const reloadCollectionChunks = async ({
       billId,
       mode: col.trainingType,
       prompt: '',
-      model: col.datasetId.vectorModel,
+      model,
       q: item,
       a: '',
       chunkIndex: i
@@ -172,6 +189,7 @@ export const reloadCollectionChunks = async ({
 
   // update raw text
   await MongoDatasetCollection.findByIdAndUpdate(col._id, {
+    ...(title && { name: title }),
     rawTextLength: newRawText.length,
     hashRawText: hashStr(newRawText)
   });
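The reloadCollectionChunks hunks pick the training model from the collection's mode instead of always using the vector model: chunk-mode rows get datasetId.vectorModel (the embedding model) and qa-mode rows get datasetId.agentModel, and when the crawl produced a title it is written back as the collection name. The same selection logic, restated as a standalone sketch (the dataset type below is an assumption; only the two model fields appear in the diff):

import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';

// Sketch of the mode-to-model routing added to reloadCollectionChunks.
function pickTrainingModel(
  trainingType: TrainingModeEnum,
  dataset: { vectorModel: string; agentModel: string } // assumed minimal shape
): string {
  if (trainingType === TrainingModeEnum.chunk) return dataset.vectorModel; // embedding model for chunk training
  if (trainingType === TrainingModeEnum.qa) return dataset.agentModel; // LLM used to generate QA pairs
  throw new Error('Training model error');
}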