Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-18 10:03:55 +00:00)
feat: dataset index prefix (#5061)
@@ -133,15 +133,15 @@ services:
   # fastgpt
   sandbox:
     container_name: sandbox
-    image: ghcr.io/labring/fastgpt-sandbox:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt-sandbox:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.13 # 阿里云
     networks:
       - fastgpt
     restart: always
   fastgpt-mcp-server:
     container_name: fastgpt-mcp-server
-    image: ghcr.io/labring/fastgpt-mcp_server:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt-mcp_server:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.13 # 阿里云
     ports:
       - 3005:3000
     networks:
@@ -151,8 +151,8 @@ services:
       - FASTGPT_ENDPOINT=http://fastgpt:3000
   fastgpt:
     container_name: fastgpt
-    image: ghcr.io/labring/fastgpt:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.13 # 阿里云
     ports:
       - 3000:3000
     networks:
@@ -109,15 +109,15 @@ services:
   # fastgpt
   sandbox:
     container_name: sandbox
-    image: ghcr.io/labring/fastgpt-sandbox:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt-sandbox:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.13 # 阿里云
     networks:
       - fastgpt
     restart: always
   fastgpt-mcp-server:
     container_name: fastgpt-mcp-server
-    image: ghcr.io/labring/fastgpt-mcp_server:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt-mcp_server:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.13 # 阿里云
     ports:
       - 3005:3000
     networks:
@@ -127,8 +127,8 @@ services:
       - FASTGPT_ENDPOINT=http://fastgpt:3000
   fastgpt:
     container_name: fastgpt
-    image: ghcr.io/labring/fastgpt:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.13 # 阿里云
     ports:
       - 3000:3000
     networks:
@@ -96,15 +96,15 @@ services:
   # fastgpt
   sandbox:
     container_name: sandbox
-    image: ghcr.io/labring/fastgpt-sandbox:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt-sandbox:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.13 # 阿里云
     networks:
       - fastgpt
     restart: always
   fastgpt-mcp-server:
     container_name: fastgpt-mcp-server
-    image: ghcr.io/labring/fastgpt-mcp_server:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt-mcp_server:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.13 # 阿里云
     ports:
       - 3005:3000
     networks:
@@ -114,8 +114,8 @@ services:
       - FASTGPT_ENDPOINT=http://fastgpt:3000
   fastgpt:
     container_name: fastgpt
-    image: ghcr.io/labring/fastgpt:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.13 # 阿里云
     ports:
       - 3000:3000
     networks:
@@ -72,15 +72,15 @@ services:
 
   sandbox:
     container_name: sandbox
-    image: ghcr.io/labring/fastgpt-sandbox:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt-sandbox:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.13 # 阿里云
     networks:
       - fastgpt
     restart: always
   fastgpt-mcp-server:
     container_name: fastgpt-mcp-server
-    image: ghcr.io/labring/fastgpt-mcp_server:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt-mcp_server:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.13 # 阿里云
     ports:
       - 3005:3000
     networks:
@@ -90,8 +90,8 @@ services:
       - FASTGPT_ENDPOINT=http://fastgpt:3000
   fastgpt:
     container_name: fastgpt
-    image: ghcr.io/labring/fastgpt:v4.9.11 # git
-    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.11 # 阿里云
+    image: ghcr.io/labring/fastgpt:v4.9.13 # git
+    # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.13 # 阿里云
     ports:
       - 3000:3000
     networks:
@@ -9,7 +9,11 @@ weight: 788
 
 ## 更新指南
 
-### 1. 更新镜像:
+### 1. 更新环境变量
+
+在 `fastgpt`和`fastgpt-pro`镜像环境变量中加入: `AES256_SECRET_KEY=` 变量,用于密钥加密。
+
+### 2. 更新镜像:
 
 - 更新 FastGPT 镜像 tag: v4.9.12
 - 更新 FastGPT 商业版镜像 tag: v4.9.12
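For reference, a minimal docker-compose sketch of where the new variable from the upgrade guide above would go; the service block mirrors the compose files changed earlier in this commit, and the secret value shown is only a placeholder you must replace with your own random key:

```yaml
services:
  fastgpt:
    image: ghcr.io/labring/fastgpt:v4.9.13 # git
    environment:
      # Placeholder value for illustration only; generate your own secret for key encryption
      - AES256_SECRET_KEY=replace-with-your-own-random-secret
```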
@@ -1,5 +1,5 @@
 ---
-title: 'V4.9.13(进行中)'
+title: 'V4.9.13'
 description: 'FastGPT V4.9.13 更新说明'
 icon: 'upgrade'
 draft: false
docSite/content/zh-cn/docs/development/upgrading/4914.md (new file, 20 lines)
@@ -0,0 +1,20 @@
+---
+title: 'V4.9.14(进行中)'
+description: 'FastGPT V4.9.14 更新说明'
+icon: 'upgrade'
+draft: false
+toc: true
+weight: 787
+---
+
+
+## 🚀 新增内容
+
+1. 知识库导入,支持配置:自动将文件名加入系统索引中。
+
+## ⚙️ 优化
+
+1. 统一知识库训练队列代码逻辑。
+2. 输入框 UX。
+
+## 🐛 修复
packages/global/core/dataset/controller.d.ts (vendored, 2 changes)
@@ -10,6 +10,7 @@ export type CreateDatasetDataProps = {
   a?: string;
   imageId?: string;
   indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
+  indexPrefix?: string;
 };
 
 export type UpdateDatasetDataProps = {
@@ -21,6 +22,7 @@ export type UpdateDatasetDataProps = {
     dataId?: string; // pg data id
   })[];
   imageId?: string;
+  indexPrefix?: string;
 };
 
 export type PatchIndexesProps =
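A minimal sketch of the slice of these props a caller might now fill; `collectionName` and `indexPrefixTitle` are illustrative stand-ins, and only the optional `indexPrefix` string is new in this commit:

```ts
// Hypothetical values for illustration only
const collectionName = 'FAQ.md';
const indexPrefixTitle = true;

// The relevant slice of CreateDatasetDataProps / UpdateDatasetDataProps
const dataProps = {
  q: 'How do I reset my password?',
  a: 'Open settings and choose "Reset password".',
  indexes: [],
  // New optional field: a prefix prepended to every generated index text
  indexPrefix: indexPrefixTitle ? `# ${collectionName}` : undefined
};
```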
@@ -7,9 +7,9 @@ export type PushDataToTrainingQueueProps = {
   datasetId: string;
   collectionId: string;
 
-  data: PushDatasetDataChunkProps[];
   mode?: TrainingModeEnum;
+  data: PushDatasetDataChunkProps[];
   prompt?: string;
 
   agentModel: string;
   vectorModel: string;
packages/global/core/dataset/type.d.ts (vendored, 3 changes)
@@ -36,6 +36,7 @@ export type ChunkSettingsType = {
   // Index enhance
   imageIndex?: boolean;
   autoIndexes?: boolean;
+  indexPrefixTitle?: boolean;
 
   // Chunk setting
   chunkSettingMode?: ChunkSettingModeEnum; // 系统参数/自定义参数
@@ -184,8 +185,6 @@ export type DatasetTrainingSchemaType = {
   expireAt: Date;
   lockTime: Date;
   mode: TrainingModeEnum;
-  model?: string;
-  prompt?: string;
   dataId?: string;
   q: string;
   a: string;
@@ -103,6 +103,7 @@ export const createCollectionAndInsertData = async ({
       delete formatCreateCollectionParams.chunkSize;
       delete formatCreateCollectionParams.chunkSplitter;
       delete formatCreateCollectionParams.indexSize;
+      delete formatCreateCollectionParams.indexPrefixTitle;
     }
   }
   if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
@@ -223,7 +224,6 @@ export const createCollectionAndInsertData = async ({
     vlmModel: dataset.vlmModel,
     indexSize,
     mode: trainingMode,
-    prompt: formatCreateCollectionParams.qaPrompt,
     billId: traingBillId,
     data: chunks.map((item, index) => ({
       ...item,
@@ -32,6 +32,7 @@ export const ChunkSettings = {
 
   imageIndex: Boolean,
   autoIndexes: Boolean,
+  indexPrefixTitle: Boolean,
 
   chunkSettingMode: {
     type: String,
@@ -27,23 +27,6 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
   } catch (error) {}
 };
 
-export const pushDataListToTrainingQueueByCollectionId = async ({
-  collectionId,
-  ...props
-}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
-  const {
-    dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
-  } = await getCollectionWithDataset(collectionId);
-  return pushDataListToTrainingQueue({
-    ...props,
-    datasetId,
-    collectionId,
-    vectorModel,
-    agentModel,
-    vlmModel
-  });
-};
-
 export async function pushDataListToTrainingQueue({
   teamId,
   tmbId,
@@ -53,7 +36,6 @@ export async function pushDataListToTrainingQueue({
   vectorModel,
   vlmModel,
   data,
-  prompt,
   billId,
   mode = TrainingModeEnum.chunk,
   indexSize,
@@ -149,8 +131,6 @@ export async function pushDataListToTrainingQueue({
       collectionId: collectionId,
       billId,
       mode: formatTrainingMode(item, mode),
-      prompt,
-      model,
       ...(item.q && { q: item.q }),
       ...(item.a && { a: item.a }),
       ...(item.imageId && { imageId: item.imageId }),
@@ -10,6 +10,7 @@ import {
   TeamMemberCollectionName
 } from '@fastgpt/global/support/user/team/constant';
 import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
+import { DatasetDataCollectionName } from '../data/schema';
 
 export const DatasetTrainingCollectionName = 'dataset_trainings';
 
@@ -54,8 +55,6 @@ const TrainingDataSchema = new Schema({
     default: 5
   },
 
-  model: String,
-  prompt: String,
   q: {
     type: String,
     default: ''
@@ -74,7 +73,10 @@ const TrainingDataSchema = new Schema({
     type: Number,
     default: 0
   },
-  dataId: Schema.Types.ObjectId,
+  dataId: {
+    type: Schema.Types.ObjectId,
+    ref: DatasetDataCollectionName
+  },
   indexes: {
     type: [
       {
@@ -105,6 +107,12 @@ TrainingDataSchema.virtual('collection', {
   foreignField: '_id',
   justOne: true
 });
+TrainingDataSchema.virtual('data', {
+  ref: DatasetDataCollectionName,
+  localField: 'dataId',
+  foreignField: '_id',
+  justOne: true
+});
 
 try {
   // lock training data(teamId); delete training data
@@ -111,6 +111,8 @@
   "import_param_setting": "Parameter settings",
   "import_select_file": "Select a file",
   "import_select_link": "Enter link",
+  "index_prefix_title": "Index add title",
+  "index_prefix_title_tips": "Automatically add title names to all indexes",
   "index_size": "Index size",
   "index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.",
   "input_required_field_to_select_baseurl": "Please enter the required information first",
@@ -111,6 +111,8 @@
   "import_param_setting": "参数设置",
   "import_select_file": "选择文件",
   "import_select_link": "输入链接",
+  "index_prefix_title": "将标题加入索引",
+  "index_prefix_title_tips": "自动给索引所有索引加标题名",
   "index_size": "索引大小",
   "index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。",
   "input_required_field_to_select_baseurl": "请先输入必填信息",
@@ -110,6 +110,8 @@
   "import_param_setting": "參數設定",
   "import_select_file": "選擇文件",
   "import_select_link": "輸入連結",
+  "index_prefix_title": "將標題加入索引",
+  "index_prefix_title_tips": "自動給索引所有索引加標題名",
   "index_size": "索引大小",
   "index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。",
   "input_required_field_to_select_baseurl": "請先輸入必填信息",
@@ -1,4 +1,5 @@
-import { Box, Flex, Textarea } from '@chakra-ui/react';
+import type { FlexProps} from '@chakra-ui/react';
+import { Box, Flex, Textarea, useBoolean } from '@chakra-ui/react';
 import React, { useRef, useCallback, useMemo, useState } from 'react';
 import { useTranslation } from 'next-i18next';
 import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
@@ -48,6 +49,8 @@ const ChatInput = ({
   const { setValue, watch, control } = chatForm;
   const inputValue = watch('input');
 
+  const [focusing, { on: onFocus, off: offFocus }] = useBoolean();
+
   // Check voice input state
   const [mobilePreSpeak, setMobilePreSpeak] = useState(false);
 
@@ -207,6 +210,8 @@ const ChatInput = ({
           }
         }
       }}
+      onFocus={onFocus}
+      onBlur={offFocus}
     />
   </Flex>
 </Flex>
@@ -254,7 +259,8 @@ const ChatInput = ({
       borderRadius={'sm'}
       cursor={'pointer'}
       _hover={{ bg: 'rgba(0, 0, 0, 0.04)' }}
-      onClick={() => {
+      onClick={(e) => {
+        e.stopPropagation();
         onOpenSelectFile();
       }}
     >
@@ -276,7 +282,8 @@ const ChatInput = ({
       borderRadius={'sm'}
       cursor={'pointer'}
       _hover={{ bg: 'rgba(0, 0, 0, 0.04)' }}
-      onClick={() => {
+      onClick={(e) => {
+        e.stopPropagation();
         VoiceInputRef.current?.onSpeak?.();
       }}
     >
@@ -307,7 +314,8 @@ const ChatInput = ({
       }
       borderRadius={['md', 'lg']}
       cursor={isChatting ? 'pointer' : canSendMessage ? 'pointer' : 'not-allowed'}
-      onClick={() => {
+      onClick={(e) => {
+        e.stopPropagation();
         if (isChatting) {
           return onStop();
         }
@@ -343,6 +351,11 @@ const ChatInput = ({
     onStop
   ]);
 
+  const activeStyles: FlexProps = {
+    boxShadow: '0px 5px 20px -4px rgba(19, 51, 107, 0.13)',
+    border: '0.5px solid rgba(0, 0, 0, 0.24)'
+  };
+
   return (
     <Box
       m={['0 auto 10px', '10px auto']}
@@ -381,12 +394,17 @@ const ChatInput = ({
       pt={fileList.length > 0 ? '0' : mobilePreSpeak ? [0, 4] : [3, 4]}
       pb={[2, 4]}
       position={'relative'}
-      boxShadow={`0px 5px 16px -4px rgba(19, 51, 107, 0.08)`}
       borderRadius={['xl', 'xxl']}
       bg={'white'}
       overflow={'display'}
-      border={'0.5px solid rgba(0, 0, 0, 0.15)'}
-      borderColor={'rgba(0,0,0,0.12)'}
+      {...(focusing
+        ? activeStyles
+        : {
+            _hover: activeStyles,
+            border: '0.5px solid rgba(0, 0, 0, 0.18)',
+            boxShadow: `0px 5px 16px -4px rgba(19, 51, 107, 0.08)`
+          })}
+      onClick={() => TextareaDom?.current?.focus()}
     >
       <Box flex={1}>
         {/* Chat input guide box */}
@@ -13,7 +13,8 @@ import {
   Textarea,
   useDisclosure,
   Checkbox,
-  HStack
+  HStack,
+  Grid
 } from '@chakra-ui/react';
 import MyIcon from '@fastgpt/web/components/common/Icon';
 import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
@@ -35,7 +36,6 @@ import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContex
 import MySelect from '@fastgpt/web/components/common/MySelect';
 import {
   chunkAutoChunkSize,
   getAutoIndexSize,
   getIndexSizeSelectList,
   getLLMDefaultChunkSize,
   getLLMMaxChunkSize,
@@ -44,7 +44,6 @@ import {
   minChunkSize
 } from '@fastgpt/global/core/dataset/training/utils';
 import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
 import type { LLMModelItemType, EmbeddingModelItemType } from '@fastgpt/global/core/ai/model.d';
 
 const PromptTextarea = ({
   defaultValue = '',
@@ -98,6 +97,7 @@ export type CollectionChunkFormType = {
   // Index enhance
   imageIndex: boolean;
   autoIndexes: boolean;
+  indexPrefixTitle: boolean;
 
   // Chunk setting
   chunkSettingMode: ChunkSettingModeEnum; // 系统参数/自定义参数
@@ -133,6 +133,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
   const autoIndexes = watch('autoIndexes');
   const indexSize = watch('indexSize');
   const imageIndex = watch('imageIndex');
+  const indexPrefixTitle = watch('indexPrefixTitle');
   const paragraphChunkAIMode = watch('paragraphChunkAIMode');
 
   const trainingModeList = useMemo(() => {
@@ -282,48 +283,56 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
</Box>
)}

{trainingType === DatasetCollectionDataProcessModeEnum.chunk &&
feConfigs?.show_dataset_enhance !== false && (
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:enhanced_indexes')}
</Box>
<HStack gap={[3, 7]}>
<HStack flex={'1'} spacing={1}>
<MyTooltip label={!feConfigs?.isPlus ? t('common:commercial_function_tip') : ''}>
<Checkbox
isDisabled={!feConfigs?.isPlus}
isChecked={autoIndexes}
{...register('autoIndexes')}
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:enhanced_indexes')}
</Box>
<Grid gridTemplateColumns={'1fr 1fr'} rowGap={[2, 4]} columnGap={[3, 7]}>
<HStack flex={'1'} spacing={1}>
<Checkbox isChecked={indexPrefixTitle} {...register('indexPrefixTitle')}>
<FormLabel>{t('dataset:index_prefix_title')}</FormLabel>
</Checkbox>
<QuestionTip label={t('dataset:index_prefix_title_tips')} />
</HStack>
{trainingType === DatasetCollectionDataProcessModeEnum.chunk &&
feConfigs?.show_dataset_enhance !== false && (
<>
<HStack flex={'1'} spacing={1}>
<MyTooltip label={!feConfigs?.isPlus ? t('common:commercial_function_tip') : ''}>
<Checkbox
isDisabled={!feConfigs?.isPlus}
isChecked={autoIndexes}
{...register('autoIndexes')}
>
<FormLabel>{t('dataset:auto_indexes')}</FormLabel>
</Checkbox>
</MyTooltip>
<QuestionTip label={t('dataset:auto_indexes_tips')} />
</HStack>
<HStack flex={'1'} spacing={1}>
<MyTooltip
label={
!feConfigs?.isPlus
? t('common:commercial_function_tip')
: !datasetDetail?.vlmModel
? t('common:error_vlm_not_config')
: ''
}
>
<FormLabel>{t('dataset:auto_indexes')}</FormLabel>
</Checkbox>
</MyTooltip>
<QuestionTip label={t('dataset:auto_indexes_tips')} />
</HStack>
<HStack flex={'1'} spacing={1}>
<MyTooltip
label={
!feConfigs?.isPlus
? t('common:commercial_function_tip')
: !datasetDetail?.vlmModel
? t('common:error_vlm_not_config')
: ''
}
>
<Checkbox
isDisabled={!feConfigs?.isPlus || !datasetDetail?.vlmModel}
isChecked={imageIndex}
{...register('imageIndex')}
>
<FormLabel>{t('dataset:image_auto_parse')}</FormLabel>
</Checkbox>
</MyTooltip>
<QuestionTip label={t('dataset:image_auto_parse_tips')} />
</HStack>
</HStack>
</Box>
)}
<Checkbox
isDisabled={!feConfigs?.isPlus || !datasetDetail?.vlmModel}
isChecked={imageIndex}
{...register('imageIndex')}
>
<FormLabel>{t('dataset:image_auto_parse')}</FormLabel>
</Checkbox>
</MyTooltip>
<QuestionTip label={t('dataset:image_auto_parse_tips')} />
</HStack>
</>
)}
</Grid>
</Box>
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:chunk_process_params')}
@@ -49,6 +49,7 @@ export const defaultFormData: ImportFormType = {
 
   imageIndex: false,
   autoIndexes: false,
+  indexPrefixTitle: true,
 
   chunkSettingMode: ChunkSettingModeEnum.auto,
   chunkSplitMode: DataChunkSplitModeEnum.paragraph,
@@ -55,8 +55,9 @@ const ReTraining = () => {
       dataEnhanceCollectionName:
         collection.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,
 
-      imageIndex: collection.imageIndex || defaultFormData.imageIndex,
-      autoIndexes: collection.autoIndexes || defaultFormData.autoIndexes,
+      imageIndex: collection.imageIndex ?? defaultFormData.imageIndex,
+      autoIndexes: collection.autoIndexes ?? defaultFormData.autoIndexes,
+      indexPrefixTitle: collection.indexPrefixTitle ?? defaultFormData.indexPrefixTitle,
 
       chunkSettingMode: collection.chunkSettingMode || defaultFormData.chunkSettingMode,
       chunkSplitMode: collection.chunkSplitMode || defaultFormData.chunkSplitMode,
@@ -84,15 +84,13 @@ const InputDataModal = ({
       onSuccess(res) {
         if (res.type === DatasetCollectionTypeEnum.images) {
           setCurrentTab(TabEnum.image);
-        } else {
-          setCurrentTab(TabEnum.chunk);
         }
       }
     }
   );
 
   // Get data
-  const { loading: isFetchingData } = useRequest2(
+  const { data: dataItem, loading: isFetchingData } = useRequest2(
     async () => {
       if (dataId) return getDatasetDataItemById(dataId);
       return null;
@@ -125,6 +123,11 @@ const InputDataModal = ({
     }
   );
 
+  useEffect(() => {
+    if (currentTab || !dataItem) return;
+    setCurrentTab(dataItem.a ? TabEnum.qa : TabEnum.chunk);
+  }, [collection, dataItem, currentTab]);
+
   // Import new data
   const { runAsync: sureImportData, loading: isImporting } = useRequest2(
     async (e: InputDataType) => {
@@ -46,6 +46,10 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
     const webSelector = collection?.metadata?.webPageSelector;
 
     return [
+      {
+        label: t('common:core.dataset.collection.id'),
+        value: collection?._id
+      },
       {
         label: t('common:core.dataset.collection.metadata.source'),
         value: t(DatasetCollectionTypeMap[collection.type]?.name as any)
@@ -94,6 +98,14 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
           }
         ]
        : []),
+      ...(collection.indexPrefixTitle !== undefined
+        ? [
+            {
+              label: t('dataset:index_prefix_title'),
+              value: collection.indexPrefixTitle ? 'Yes' : 'No'
+            }
+          ]
+        : []),
       ...(collection.imageIndex !== undefined
         ? [
             {
@@ -146,26 +158,22 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
   }, [collection, t]);
 
   return (
-    <MyBox isLoading={isLoading} w={'100%'} h={'100%'} p={6}>
-      <Box fontSize={'md'} pb={4}>
+    <MyBox isLoading={isLoading} w={'100%'} h={'100%'} p={6} overflow={'auto'}>
+      <Box fontSize={'md'} fontWeight={'bold'} color={'myGray.900'} pb={4}>
         {t('common:core.dataset.collection.metadata.metadata')}
       </Box>
-      <Flex mb={3} wordBreak={'break-all'} fontSize={'sm'}>
-        <Box color={'myGray.500'} flex={'0 0 90px'}>
-          {t('common:core.dataset.collection.id')}:
-        </Box>
-        <Box>{collection?._id}</Box>
-      </Flex>
       {metadataList.map(
         (item, i) =>
           item.label &&
           item.value && (
-            <Flex key={i} alignItems={'center'} mb={3} wordBreak={'break-all'} fontSize={'sm'}>
-              <Box color={'myGray.500'} flex={'0 0 90px'}>
+            <Box key={i} mb={3} wordBreak={'break-all'}>
+              <Box color={'myGray.500'} fontSize={'xs'}>
                 {item.label}
               </Box>
-              <Box>{item.value}</Box>
-            </Flex>
+              <Box color={'myGray.900'} fontSize={'sm'}>
+                {item.value}
+              </Box>
+            </Box>
           )
       )}
       {collection?.sourceId && (
@@ -48,7 +48,9 @@ async function handler(req: NextApiRequest) {
 
   const [
     {
-      dataset: { _id: datasetId, vectorModel, agentModel }
+      dataset: { _id: datasetId, vectorModel, agentModel },
+      indexPrefixTitle,
+      name
     }
   ] = await Promise.all([getCollectionWithDataset(collectionId)]);
 
@@ -84,6 +86,7 @@ async function handler(req: NextApiRequest) {
     q: formatQ,
     a: formatA,
     chunkIndex: 0,
+    indexPrefix: indexPrefixTitle ? `# ${name}` : undefined,
     embeddingModel: vectorModelData.model,
     indexes: formatIndexes
   });
@@ -8,13 +8,16 @@ import { type ApiRequestProps } from '@fastgpt/service/type/next';
 import { addOperationLog } from '@fastgpt/service/support/operationLog/addOperationLog';
 import { OperationLogEventEnum } from '@fastgpt/global/support/operationLog/constants';
 import { getI18nDatasetType } from '@fastgpt/service/support/operationLog/util';
 
 async function handler(req: ApiRequestProps<UpdateDatasetDataProps>) {
   const { dataId, q, a, indexes = [] } = req.body;
 
   // auth data permission
   const {
     collection: {
-      dataset: { vectorModel }
+      dataset: { vectorModel },
+      name,
+      indexPrefixTitle
     },
     teamId,
     tmbId,
@@ -33,7 +36,8 @@ async function handler(req: ApiRequestProps<UpdateDatasetDataProps>) {
     q,
     a,
     indexes,
-    model: vectorModel
+    model: vectorModel,
+    indexPrefix: indexPrefixTitle ? `# ${name}` : undefined
   });
 
   pushGenerateVectorUsage({
@@ -41,7 +41,7 @@ type Props = { datasetId: string; currentTab: TabEnum };
 const sliderStyles: FlexProps = {
   bg: 'white',
   borderRadius: 'md',
-  overflowY: 'scroll',
+  overflowY: 'auto',
   boxShadow: 2
 };
 
@@ -25,13 +25,15 @@ const formatIndexes = async ({
   q,
   a = '',
   indexSize,
-  maxIndexSize
+  maxIndexSize,
+  indexPrefix
 }: {
   indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
   q: string;
   a?: string;
   indexSize: number;
   maxIndexSize: number;
+  indexPrefix?: string;
 }): Promise<
   {
     type: `${DatasetDataIndexTypeEnum}`;
@@ -39,6 +41,12 @@ const formatIndexes = async ({
     dataId?: string;
   }[]
 > => {
+  const formatText = (text: string) => {
+    if (indexPrefix && !text.startsWith(indexPrefix)) {
+      return `${indexPrefix}\n${text}`;
+    }
+    return text;
+  };
   /* get dataset data default index */
   const getDefaultIndex = async ({
     q = '',
@@ -62,11 +70,11 @@ const formatIndexes = async ({
 
     return [
       ...qChunks.map((text) => ({
-        text,
+        text: formatText(text),
        type: DatasetDataIndexTypeEnum.default
       })),
       ...aChunks.map((text) => ({
-        text,
+        text: formatText(text),
        type: DatasetDataIndexTypeEnum.default
       }))
     ];
@@ -130,9 +138,22 @@ const formatIndexes = async ({
         return item;
       })
     )
-  ).flat();
+  )
+    .flat()
+    .filter((item) => !!item.text.trim());
 
-  return chekcIndexes.filter((item) => !!item.text.trim());
+  // Add prefix
+  const prefixIndexes = indexPrefix
+    ? chekcIndexes.map((index) => {
+        if (index.type === DatasetDataIndexTypeEnum.custom) return index;
+        return {
+          ...index,
+          text: formatText(index.text)
+        };
+      })
+    : chekcIndexes;
+
+  return prefixIndexes;
 };
 /* insert data.
  * 1. create data id
@@ -150,6 +171,7 @@ export async function insertData2Dataset({
   chunkIndex = 0,
   indexSize = 512,
   indexes,
+  indexPrefix,
   embeddingModel,
   session
 }: CreateDatasetDataProps & {
@@ -174,7 +196,8 @@ export async function insertData2Dataset({
     q,
     a,
     indexSize,
-    maxIndexSize: embModel.maxToken
+    maxIndexSize: embModel.maxToken,
+    indexPrefix
   });
 
   // insert to vector store
@@ -255,7 +278,8 @@ export async function updateData2Dataset({
   a,
   indexes,
   model,
-  indexSize = 512
+  indexSize = 512,
+  indexPrefix
 }: UpdateDatasetDataProps & { model: string; indexSize?: number }) {
   if (!Array.isArray(indexes)) {
     return Promise.reject('indexes is required');
@@ -271,7 +295,8 @@ export async function updateData2Dataset({
     q,
     a,
     indexSize,
-    maxIndexSize: getEmbeddingModel(model).maxToken
+    maxIndexSize: getEmbeddingModel(model).maxToken,
+    indexPrefix
   });
 
   // 3. Patch indexes, create, update, delete
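A standalone sketch of the prefixing behavior the hunks above introduce, mirroring the `formatText` helper: when a prefix is set, generated (non-custom) index texts get the prefix prepended unless they already start with it. The `IndexItem` type and function name here are simplified stand-ins for illustration, not the library's actual API:

```ts
// Simplified re-implementation of the index prefix logic, for illustration only
type IndexItem = { type: 'default' | 'custom' | 'image'; text: string };

const applyIndexPrefix = (indexes: IndexItem[], indexPrefix?: string): IndexItem[] => {
  if (!indexPrefix) return indexes;
  const formatText = (text: string) =>
    text.startsWith(indexPrefix) ? text : `${indexPrefix}\n${text}`;
  return indexes.map((index) =>
    // Custom indexes keep the user's exact text; only generated ones get the title prefix
    index.type === 'custom' ? index : { ...index, text: formatText(index.text) }
  );
};

// Example: with indexPrefixTitle enabled, the prefix is "# <collection name>"
const result = applyIndexPrefix(
  [{ type: 'default', text: 'How to reset a password...' }],
  '# FAQ.md'
);
// result[0].text is now "# FAQ.md\nHow to reset a password..."
```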
@@ -101,21 +101,13 @@ export const datasetParseQueue = async (): Promise<any> => {
       $inc: { retryCount: -1 }
     }
   )
-    .select({
-      _id: 1,
-      teamId: 1,
-      tmbId: 1,
-      datasetId: 1,
-      collectionId: 1,
-      billId: 1,
-      q: 1
-    })
     .populate<{
       dataset: DatasetSchemaType;
       collection: DatasetCollectionSchemaType;
     }>([
       {
-        path: 'collection'
+        path: 'collection',
+        select: '-qaPrompt'
       },
       {
         path: 'dataset'
@@ -300,7 +292,6 @@ export const datasetParseQueue = async (): Promise<any> => {
     vlmModel: dataset.vlmModel,
     indexSize: collection.indexSize,
     mode: trainingMode,
-    prompt: collection.qaPrompt,
     billId: data.billId,
     data: chunks.map((item, index) => ({
       ...item,
@@ -14,7 +14,6 @@ import {
   countGptMessagesTokens,
   countPromptTokens
 } from '@fastgpt/service/common/string/tiktoken/index';
-import { pushDataListToTrainingQueueByCollectionId } from '@fastgpt/service/core/dataset/training/controller';
 import { loadRequestMessages } from '@fastgpt/service/core/chat/utils';
 import { llmCompletionsBodyFormat, formatLLMResponse } from '@fastgpt/service/core/ai/utils';
 import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
@@ -24,6 +23,7 @@ import {
 } from '@fastgpt/global/core/dataset/training/utils';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { text2Chunks } from '@fastgpt/service/worker/function';
+import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
 
 const reduceQueue = () => {
   global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -41,6 +41,11 @@ const reduceQueueAndReturn = (delay = 0) => {
   }
 };
 
+type PopulateType = {
+  dataset: { vectorModel: string; agentModel: string; vlmModel: string };
+  collection: { qaPrompt?: string };
+};
+
 export async function generateQA(): Promise<any> {
   const max = global.systemEnv?.qaMaxProcess || 10;
   addLog.debug(`[QA Queue] Queue size: ${global.qaQueueLen}`);
@@ -68,18 +73,16 @@ export async function generateQA(): Promise<any> {
       $inc: { retryCount: -1 }
     }
   )
-    .select({
-      _id: 1,
-      teamId: 1,
-      tmbId: 1,
-      datasetId: 1,
-      collectionId: 1,
-      q: 1,
-      model: 1,
-      chunkIndex: 1,
-      billId: 1,
-      prompt: 1
-    })
+    .populate<PopulateType>([
+      {
+        path: 'dataset',
+        select: 'agentModel vectorModel vlmModel'
+      },
+      {
+        path: 'collection',
+        select: 'qaPrompt'
+      }
+    ])
     .lean();
 
   // task preemption
@@ -110,6 +113,13 @@ export async function generateQA(): Promise<any> {
     return reduceQueueAndReturn();
   }
 
+  if (!data.dataset || !data.collection) {
+    addLog.info(`[QA Queue] Dataset or collection not found`, data);
+    // Delete data
+    await MongoDatasetTraining.deleteOne({ _id: data._id });
+    return reduceQueueAndReturn();
+  }
+
   // auth balance
   if (!(await checkTeamAiPointsAndLock(data.teamId))) {
     return reduceQueueAndReturn();
@@ -117,8 +127,8 @@ export async function generateQA(): Promise<any> {
   addLog.info(`[QA Queue] Start`);
 
   try {
-    const modelData = getLLMModel(data.model);
-    const prompt = `${data.prompt || Prompt_AgentQA.description}
+    const modelData = getLLMModel(data.dataset.agentModel);
+    const prompt = `${data.collection.qaPrompt || Prompt_AgentQA.description}
 ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
 
     // request LLM to get QA
@@ -147,16 +157,20 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
     const qaArr = await formatSplitText({ answer, rawText: text, llmModel: modelData }); // 格式化后的QA对
 
     // get vector and insert
-    await pushDataListToTrainingQueueByCollectionId({
+    await pushDataListToTrainingQueue({
       teamId: data.teamId,
       tmbId: data.tmbId,
+      datasetId: data.datasetId,
       collectionId: data.collectionId,
       mode: TrainingModeEnum.chunk,
       data: qaArr.map((item) => ({
         ...item,
         chunkIndex: data.chunkIndex
       })),
-      billId: data.billId
+      billId: data.billId,
+      vectorModel: data.dataset.vectorModel,
+      agentModel: data.dataset.agentModel,
+      vlmModel: data.dataset.vlmModel
     });
 
     // delete data from training
@@ -192,7 +206,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
       }
     );
 
-    return reduceQueueAndReturn(1000);
+    return reduceQueueAndReturn(500);
   }
 }
 
@@ -12,10 +12,13 @@ import {
 } from '@fastgpt/service/common/vectorDB/controller';
 import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
 import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
-import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
-import type { Document } from '@fastgpt/service/common/mongo';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { getMaxIndexSize } from '@fastgpt/global/core/dataset/training/utils';
+import type {
+  DatasetDataSchemaType,
+  DatasetTrainingSchemaType
+} from '@fastgpt/global/core/dataset/type';
+import { retryFn } from '@fastgpt/global/common/system/utils';
 
 const reduceQueue = () => {
   global.vectorQueueLen = global.vectorQueueLen > 0 ? global.vectorQueueLen - 1 : 0;
@@ -33,6 +36,13 @@ const reduceQueueAndReturn = (delay = 0) => {
   }
 };
 
+type PopulateType = {
+  dataset: { vectorModel: string };
+  collection: { name: string; indexPrefixTitle: boolean };
+  data: { _id: string; indexes: DatasetDataSchemaType['indexes'] };
+};
+type TrainingDataType = DatasetTrainingSchemaType & PopulateType;
+
 /* 索引生成队列。每导入一次,就是一个单独的线程 */
 export async function generateVector(): Promise<any> {
   const max = global.systemEnv?.vectorMaxProcess || 10;
@@ -59,7 +69,22 @@ export async function generateVector(): Promise<any> {
       lockTime: new Date(),
       $inc: { retryCount: -1 }
     }
-  );
+  )
+    .populate<PopulateType>([
+      {
+        path: 'dataset',
+        select: 'vectorModel'
+      },
+      {
+        path: 'collection',
+        select: 'name indexPrefixTitle'
+      },
+      {
+        path: 'data',
+        select: '_id indexes'
+      }
+    ])
+    .lean();
 
   // task preemption
   if (!data) {
@@ -89,6 +114,13 @@ export async function generateVector(): Promise<any> {
     return reduceQueueAndReturn();
   }
 
+  if (!data.dataset || !data.collection) {
+    addLog.info(`[Vector Queue] Dataset or collection not found`, data);
+    // Delete data
+    await MongoDatasetTraining.deleteOne({ _id: data._id });
+    return reduceQueueAndReturn();
+  }
+
   // auth balance
   if (!(await checkTeamAiPointsAndLock(data.teamId))) {
     return reduceQueueAndReturn();
@@ -110,7 +142,7 @@ export async function generateVector(): Promise<any> {
       teamId: data.teamId,
       tmbId: data.tmbId,
       inputTokens: tokens,
-      model: data.model,
+      model: data.dataset.vectorModel,
      billId: data.billId
     });
 
@@ -131,75 +163,62 @@ export async function generateVector(): Promise<any> {
errorMsg: getErrText(err, 'unknown error')
}
);
return reduceQueueAndReturn(1000);
return reduceQueueAndReturn(500);
}
}

const rebuildData = async ({
trainingData
}: {
trainingData: Document<unknown, {}, DatasetTrainingSchemaType> &
Omit<
DatasetTrainingSchemaType &
Required<{
_id: string;
}>,
never
>;
}) => {
// find data
const mongoData = await MongoDatasetData.findById(
trainingData.dataId,
'indexes teamId datasetId collectionId'
);

if (!mongoData) {
await trainingData.deleteOne();
const rebuildData = async ({ trainingData }: { trainingData: TrainingDataType }) => {
if (!trainingData.data) {
await MongoDatasetTraining.deleteOne({ _id: trainingData._id });
return Promise.reject('Not data');
}

const deleteVectorIdList = mongoData.indexes.map((index) => index.dataId);
// Old vectorId
const deleteVectorIdList = trainingData.data.indexes.map((index) => index.dataId);

// Find next rebuilding data to insert training queue
await mongoSessionRun(async (session) => {
// get new mongoData insert to training
const newRebuildingData = await MongoDatasetData.findOneAndUpdate(
{
rebuilding: true,
teamId: mongoData.teamId,
datasetId: mongoData.datasetId
},
{
$unset: {
rebuilding: null
},
updateTime: new Date()
},
{ session }
).select({
_id: 1,
collectionId: 1
});

if (newRebuildingData) {
await MongoDatasetTraining.create(
[
try {
await retryFn(() =>
mongoSessionRun(async (session) => {
// get new mongoData insert to training
const newRebuildingData = await MongoDatasetData.findOneAndUpdate(
{
teamId: mongoData.teamId,
tmbId: trainingData.tmbId,
datasetId: mongoData.datasetId,
collectionId: newRebuildingData.collectionId,
billId: trainingData.billId,
mode: TrainingModeEnum.chunk,
model: trainingData.model,
dataId: newRebuildingData._id,
retryCount: 50
}
],
{ session, ordered: true }
);
}
});
rebuilding: true,
teamId: trainingData.teamId,
datasetId: trainingData.datasetId
},
{
$unset: {
rebuilding: null
},
updateTime: new Date()
},
{ session }
).select({
_id: 1,
collectionId: 1
});

if (newRebuildingData) {
await MongoDatasetTraining.create(
[
{
teamId: trainingData.teamId,
tmbId: trainingData.tmbId,
datasetId: trainingData.datasetId,
collectionId: newRebuildingData.collectionId,
billId: trainingData.billId,
mode: TrainingModeEnum.chunk,
dataId: newRebuildingData._id,
retryCount: 50
}
],
{ session, ordered: true }
);
}
})
);
} catch (error) {}

// update vector, update dataset_data rebuilding status, delete data from training
// 1. Insert new vector to dataset_data
@@ -208,28 +227,36 @@ const rebuildData = async ({
insertId: string;
}[] = [];
let i = 0;
for await (const index of mongoData.indexes) {
for await (const index of trainingData.data.indexes) {
const result = await insertDatasetDataVector({
query: index.text,
model: getEmbeddingModel(trainingData.model),
teamId: mongoData.teamId,
datasetId: mongoData.datasetId,
collectionId: mongoData.collectionId
model: getEmbeddingModel(trainingData.dataset.vectorModel),
teamId: trainingData.teamId,
datasetId: trainingData.datasetId,
collectionId: trainingData.collectionId
});
mongoData.indexes[i].dataId = result.insertId;
trainingData.data.indexes[i].dataId = result.insertId;
updateResult.push(result);
i++;
}

const { tokens } = await mongoSessionRun(async (session) => {
// 2. Ensure that the training data is deleted after the Mongo update is successful
await mongoData.save({ session });
await MongoDatasetData.updateOne(
{ _id: trainingData.data._id },
{
$set: {
indexes: trainingData.data.indexes
}
},
{ session }
);
// 3. Delete the training data
await trainingData.deleteOne({ session });
await MongoDatasetTraining.deleteOne({ _id: trainingData._id }, { session });

// 4. Delete old vector
await deleteDatasetDataVector({
teamId: mongoData.teamId,
teamId: trainingData.teamId,
idList: deleteVectorIdList
});

@@ -241,19 +268,8 @@ const rebuildData = async ({
return { tokens };
};

const insertData = async ({
trainingData
}: {
trainingData: Document<unknown, {}, DatasetTrainingSchemaType> &
Omit<
DatasetTrainingSchemaType &
Required<{
_id: string;
}>,
never
>;
}) => {
const { tokens } = await mongoSessionRun(async (session) => {
const insertData = async ({ trainingData }: { trainingData: TrainingDataType }) => {
return mongoSessionRun(async (session) => {
// insert new data to dataset
const { tokens } = await insertData2Dataset({
teamId: trainingData.teamId,
@@ -264,18 +280,21 @@ const insertData = async ({
a: trainingData.a,
imageId: trainingData.imageId,
chunkIndex: trainingData.chunkIndex,
indexSize: trainingData.indexSize || getMaxIndexSize(getEmbeddingModel(trainingData.model)),
indexSize:
trainingData.indexSize ||
getMaxIndexSize(getEmbeddingModel(trainingData.dataset.vectorModel)),
indexes: trainingData.indexes,
embeddingModel: trainingData.model,
indexPrefix: trainingData.collection.indexPrefixTitle
? `# ${trainingData.collection.name}`
: undefined,
embeddingModel: trainingData.dataset.vectorModel,
session
});
// delete data from training
await trainingData.deleteOne({ session });
await MongoDatasetTraining.deleteOne({ _id: trainingData._id }, { session });

return {
tokens
};
});

return { tokens };
};
@@ -33,8 +33,7 @@ describe('delete training data test', () => {
       tmbId: root.tmbId,
       datasetId: dataset._id,
       collectionId: collection._id,
-      mode: TrainingModeEnum.chunk,
-      model: 'test'
+      mode: TrainingModeEnum.chunk
     });
 
     const res = await Call<deleteTrainingDataBody, {}, deleteTrainingDataResponse>(handler, {
@@ -33,7 +33,6 @@ describe('get training data detail test', () => {
       tmbId: root.tmbId,
       datasetId: dataset._id,
       collectionId: collection._id,
-      model: 'test',
       mode: TrainingModeEnum.chunk,
       q: 'test',
       a: 'test'
@@ -35,7 +35,6 @@ describe('training error list test', () => {
       datasetId: dataset._id,
       collectionId: collection._id,
       mode: TrainingModeEnum.chunk,
-      model: 'test',
       errorMsg: 'test'
     }))
   );
@@ -33,8 +33,7 @@ describe('update training data test', () => {
       tmbId: root.tmbId,
       datasetId: dataset._id,
       collectionId: collection._id,
-      mode: TrainingModeEnum.chunk,
-      model: 'test'
+      mode: TrainingModeEnum.chunk
     });
 
     const res = await Call<updateTrainingDataBody, {}, updateTrainingDataResponse>(handler, {