feat: dataset index prefix (#5061)

This commit is contained in:
Archer
2025-06-18 17:26:53 +08:00
committed by GitHub
parent 6b2ea696c5
commit 36fafd2149
34 changed files with 371 additions and 259 deletions

View File

@@ -103,6 +103,7 @@ export const createCollectionAndInsertData = async ({
delete formatCreateCollectionParams.chunkSize;
delete formatCreateCollectionParams.chunkSplitter;
delete formatCreateCollectionParams.indexSize;
delete formatCreateCollectionParams.indexPrefixTitle;
}
}
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
@@ -223,7 +224,6 @@ export const createCollectionAndInsertData = async ({
vlmModel: dataset.vlmModel,
indexSize,
mode: trainingMode,
prompt: formatCreateCollectionParams.qaPrompt,
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,

View File

@@ -32,6 +32,7 @@ export const ChunkSettings = {
imageIndex: Boolean,
autoIndexes: Boolean,
indexPrefixTitle: Boolean,
chunkSettingMode: {
type: String,

View File

@@ -27,23 +27,6 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
} catch (error) {}
};
/**
 * Queue training data for a collection without the caller having to supply
 * the dataset id or model settings.
 *
 * The dataset is looked up via `getCollectionWithDataset(collectionId)`; its
 * `_id`, `agentModel`, `vectorModel` and `vlmModel` are forwarded, together
 * with every other option received, to `pushDataListToTrainingQueue`.
 *
 * @param collectionId - Collection used to look up the dataset.
 * @param rest - All remaining options, passed through unchanged.
 * @returns The promise produced by `pushDataListToTrainingQueue`.
 */
export const pushDataListToTrainingQueueByCollectionId = async ({
  collectionId,
  ...rest
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
  const { dataset } = await getCollectionWithDataset(collectionId);
  const datasetId = dataset._id;
  const agentModel = dataset.agentModel;
  const vectorModel = dataset.vectorModel;
  const vlmModel = dataset.vlmModel;

  // Dataset-derived fields come after the spread so they take precedence.
  return pushDataListToTrainingQueue({
    ...rest,
    datasetId,
    collectionId,
    vectorModel,
    agentModel,
    vlmModel
  });
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
@@ -53,7 +36,6 @@ export async function pushDataListToTrainingQueue({
vectorModel,
vlmModel,
data,
prompt,
billId,
mode = TrainingModeEnum.chunk,
indexSize,
@@ -149,8 +131,6 @@ export async function pushDataListToTrainingQueue({
collectionId: collectionId,
billId,
mode: formatTrainingMode(item, mode),
prompt,
model,
...(item.q && { q: item.q }),
...(item.a && { a: item.a }),
...(item.imageId && { imageId: item.imageId }),

View File

@@ -10,6 +10,7 @@ import {
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { DatasetDataCollectionName } from '../data/schema';
export const DatasetTrainingCollectionName = 'dataset_trainings';
@@ -54,8 +55,6 @@ const TrainingDataSchema = new Schema({
default: 5
},
model: String,
prompt: String,
q: {
type: String,
default: ''
@@ -74,7 +73,10 @@ const TrainingDataSchema = new Schema({
type: Number,
default: 0
},
dataId: Schema.Types.ObjectId,
dataId: {
type: Schema.Types.ObjectId,
ref: DatasetDataCollectionName
},
indexes: {
type: [
{
@@ -105,6 +107,12 @@ TrainingDataSchema.virtual('collection', {
foreignField: '_id',
justOne: true
});
// Virtual `data` field: links this document's `dataId` to the `_id` of a
// document in the model named by DatasetDataCollectionName. `justOne: true`
// makes the virtual hold a single document instead of an array.
TrainingDataSchema.virtual('data', {
ref: DatasetDataCollectionName,
localField: 'dataId',
foreignField: '_id',
justOne: true
});
try {
// lock training data(teamId); delete training data