feat: dataset index prefix (#5061)

This commit is contained in:
Archer
2025-06-18 17:26:53 +08:00
committed by GitHub
parent 6b2ea696c5
commit 36fafd2149
34 changed files with 371 additions and 259 deletions

View File

@@ -10,6 +10,7 @@ export type CreateDatasetDataProps = {
a?: string;
imageId?: string;
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
indexPrefix?: string;
};
export type UpdateDatasetDataProps = {
@@ -21,6 +22,7 @@ export type UpdateDatasetDataProps = {
dataId?: string; // pg data id
})[];
imageId?: string;
indexPrefix?: string;
};
export type PatchIndexesProps =

View File

@@ -7,9 +7,9 @@ export type PushDataToTrainingQueueProps = {
datasetId: string;
collectionId: string;
data: PushDatasetDataChunkProps[];
mode?: TrainingModeEnum;
data: PushDatasetDataChunkProps[];
prompt?: string;
agentModel: string;
vectorModel: string;

View File

@@ -36,6 +36,7 @@ export type ChunkSettingsType = {
// Index enhance
imageIndex?: boolean;
autoIndexes?: boolean;
indexPrefixTitle?: boolean;
// Chunk setting
chunkSettingMode?: ChunkSettingModeEnum; // 系统参数/自定义参数
@@ -184,8 +185,6 @@ export type DatasetTrainingSchemaType = {
expireAt: Date;
lockTime: Date;
mode: TrainingModeEnum;
model?: string;
prompt?: string;
dataId?: string;
q: string;
a: string;

View File

@@ -103,6 +103,7 @@ export const createCollectionAndInsertData = async ({
delete formatCreateCollectionParams.chunkSize;
delete formatCreateCollectionParams.chunkSplitter;
delete formatCreateCollectionParams.indexSize;
delete formatCreateCollectionParams.indexPrefixTitle;
}
}
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
@@ -223,7 +224,6 @@ export const createCollectionAndInsertData = async ({
vlmModel: dataset.vlmModel,
indexSize,
mode: trainingMode,
prompt: formatCreateCollectionParams.qaPrompt,
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,

View File

@@ -32,6 +32,7 @@ export const ChunkSettings = {
imageIndex: Boolean,
autoIndexes: Boolean,
indexPrefixTitle: Boolean,
chunkSettingMode: {
type: String,

View File

@@ -27,23 +27,6 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
} catch (error) {}
};
export const pushDataListToTrainingQueueByCollectionId = async ({
collectionId,
...props
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
const {
dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
} = await getCollectionWithDataset(collectionId);
return pushDataListToTrainingQueue({
...props,
datasetId,
collectionId,
vectorModel,
agentModel,
vlmModel
});
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
@@ -53,7 +36,6 @@ export async function pushDataListToTrainingQueue({
vectorModel,
vlmModel,
data,
prompt,
billId,
mode = TrainingModeEnum.chunk,
indexSize,
@@ -149,8 +131,6 @@ export async function pushDataListToTrainingQueue({
collectionId: collectionId,
billId,
mode: formatTrainingMode(item, mode),
prompt,
model,
...(item.q && { q: item.q }),
...(item.a && { a: item.a }),
...(item.imageId && { imageId: item.imageId }),

View File

@@ -10,6 +10,7 @@ import {
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { DatasetDataCollectionName } from '../data/schema';
export const DatasetTrainingCollectionName = 'dataset_trainings';
@@ -54,8 +55,6 @@ const TrainingDataSchema = new Schema({
default: 5
},
model: String,
prompt: String,
q: {
type: String,
default: ''
@@ -74,7 +73,10 @@ const TrainingDataSchema = new Schema({
type: Number,
default: 0
},
dataId: Schema.Types.ObjectId,
dataId: {
type: Schema.Types.ObjectId,
ref: DatasetDataCollectionName
},
indexes: {
type: [
{
@@ -105,6 +107,12 @@ TrainingDataSchema.virtual('collection', {
foreignField: '_id',
justOne: true
});
TrainingDataSchema.virtual('data', {
ref: DatasetDataCollectionName,
localField: 'dataId',
foreignField: '_id',
justOne: true
});
try {
// lock training data(teamId); delete training data

View File

@@ -111,6 +111,8 @@
"import_param_setting": "Parameter settings",
"import_select_file": "Select a file",
"import_select_link": "Enter link",
"index_prefix_title": "Index add title",
"index_prefix_title_tips": "Automatically add title names to all indexes",
"index_size": "Index size",
"index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.",
"input_required_field_to_select_baseurl": "Please enter the required information first",

View File

@@ -111,6 +111,8 @@
"import_param_setting": "参数设置",
"import_select_file": "选择文件",
"import_select_link": "输入链接",
"index_prefix_title": "将标题加入索引",
"index_prefix_title_tips": "自动给所有索引加标题名",
"index_size": "索引大小",
"index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。",
"input_required_field_to_select_baseurl": "请先输入必填信息",

View File

@@ -110,6 +110,8 @@
"import_param_setting": "參數設定",
"import_select_file": "選擇文件",
"import_select_link": "輸入連結",
"index_prefix_title": "將標題加入索引",
"index_prefix_title_tips": "自動給所有索引加標題名",
"index_size": "索引大小",
"index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。",
"input_required_field_to_select_baseurl": "請先輸入必填信息",