feat: dataset index prefix (#5061)

Author: Archer
Date: 2025-06-18 17:26:53 +08:00
Committed by: GitHub
Parent: 6b2ea696c5
Commit: 36fafd2149
34 changed files with 371 additions and 259 deletions

View File

@@ -133,15 +133,15 @@ services:
# fastgpt
sandbox:
container_name: sandbox
image: ghcr.io/labring/fastgpt-sandbox:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt-sandbox:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.13 # Alibaba Cloud
networks:
- fastgpt
restart: always
fastgpt-mcp-server:
container_name: fastgpt-mcp-server
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.13 # Alibaba Cloud
ports:
- 3005:3000
networks:
@@ -151,8 +151,8 @@ services:
- FASTGPT_ENDPOINT=http://fastgpt:3000
fastgpt:
container_name: fastgpt
image: ghcr.io/labring/fastgpt:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.13 # Alibaba Cloud
ports:
- 3000:3000
networks:

View File

@@ -109,15 +109,15 @@ services:
# fastgpt
sandbox:
container_name: sandbox
image: ghcr.io/labring/fastgpt-sandbox:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt-sandbox:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.13 # Alibaba Cloud
networks:
- fastgpt
restart: always
fastgpt-mcp-server:
container_name: fastgpt-mcp-server
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.13 # Alibaba Cloud
ports:
- 3005:3000
networks:
@@ -127,8 +127,8 @@ services:
- FASTGPT_ENDPOINT=http://fastgpt:3000
fastgpt:
container_name: fastgpt
image: ghcr.io/labring/fastgpt:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.13 # Alibaba Cloud
ports:
- 3000:3000
networks:

View File

@@ -96,15 +96,15 @@ services:
# fastgpt
sandbox:
container_name: sandbox
image: ghcr.io/labring/fastgpt-sandbox:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt-sandbox:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.13 # Alibaba Cloud
networks:
- fastgpt
restart: always
fastgpt-mcp-server:
container_name: fastgpt-mcp-server
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.13 # Alibaba Cloud
ports:
- 3005:3000
networks:
@@ -114,8 +114,8 @@ services:
- FASTGPT_ENDPOINT=http://fastgpt:3000
fastgpt:
container_name: fastgpt
image: ghcr.io/labring/fastgpt:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.13 # Alibaba Cloud
ports:
- 3000:3000
networks:

View File

@@ -72,15 +72,15 @@ services:
sandbox:
container_name: sandbox
image: ghcr.io/labring/fastgpt-sandbox:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt-sandbox:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.13 # Alibaba Cloud
networks:
- fastgpt
restart: always
fastgpt-mcp-server:
container_name: fastgpt-mcp-server
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt-mcp_server:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-mcp_server:v4.9.13 # Alibaba Cloud
ports:
- 3005:3000
networks:
@@ -90,8 +90,8 @@ services:
- FASTGPT_ENDPOINT=http://fastgpt:3000
fastgpt:
container_name: fastgpt
image: ghcr.io/labring/fastgpt:v4.9.11 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.11 # Alibaba Cloud
image: ghcr.io/labring/fastgpt:v4.9.13 # git
# image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.13 # Alibaba Cloud
ports:
- 3000:3000
networks:

View File

@@ -9,7 +9,11 @@ weight: 788
## Upgrade Guide
### 1. Update the images:
### 1. Update environment variables
Add the `AES256_SECRET_KEY=` variable to the environment variables of the `fastgpt` and `fastgpt-pro` images; it is used for key encryption.
### 2. Update the images:
- Update the FastGPT image tag to v4.9.12
- Update the FastGPT commercial edition image tag to v4.9.12

View File

@@ -1,5 +1,5 @@
---
title: 'V4.9.13 (in progress)'
title: 'V4.9.13'
description: 'FastGPT V4.9.13 release notes'
icon: 'upgrade'
draft: false

View File

@@ -0,0 +1,20 @@
---
title: 'V4.9.14 (in progress)'
description: 'FastGPT V4.9.14 release notes'
icon: 'upgrade'
draft: false
toc: true
weight: 787
---
## 🚀 New Features
1. Dataset import now supports an option to automatically add the file name to the system indexes.
## ⚙️ Improvements
1. Unified the dataset training queue code logic.
2. Input box UX.
## 🐛 Fixes

View File

@@ -10,6 +10,7 @@ export type CreateDatasetDataProps = {
a?: string;
imageId?: string;
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
indexPrefix?: string;
};
export type UpdateDatasetDataProps = {
@@ -21,6 +22,7 @@ export type UpdateDatasetDataProps = {
dataId?: string; // pg data id
})[];
imageId?: string;
indexPrefix?: string;
};
export type PatchIndexesProps =
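For reference, a minimal standalone sketch of the create/update payload shape with the new field; `CreateDataSketch`, `DataIndexItem`, and the literal values are illustrative stand-ins, not the repo's actual types or data:

```ts
// Standalone sketch: mirrors the `indexPrefix` field this commit adds to
// CreateDatasetDataProps / UpdateDatasetDataProps.
type DataIndexItem = { type: string; text: string };

interface CreateDataSketch {
  q: string;
  a?: string;
  imageId?: string;
  indexes?: DataIndexItem[];
  indexPrefix?: string; // typically `# ${collection.name}` when indexPrefixTitle is enabled
}

const example: CreateDataSketch = {
  q: 'How do I reset my password?',
  a: 'Use the "Forgot password" link on the login page.',
  indexPrefix: '# user-guide.md' // hypothetical collection name
};

console.log(example);
```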

View File

@@ -7,9 +7,9 @@ export type PushDataToTrainingQueueProps = {
datasetId: string;
collectionId: string;
data: PushDatasetDataChunkProps[];
mode?: TrainingModeEnum;
data: PushDatasetDataChunkProps[];
prompt?: string;
agentModel: string;
vectorModel: string;

View File

@@ -36,6 +36,7 @@ export type ChunkSettingsType = {
// Index enhance
imageIndex?: boolean;
autoIndexes?: boolean;
indexPrefixTitle?: boolean;
// Chunk setting
chunkSettingMode?: ChunkSettingModeEnum; // system parameters / custom parameters
@@ -184,8 +185,6 @@ export type DatasetTrainingSchemaType = {
expireAt: Date;
lockTime: Date;
mode: TrainingModeEnum;
model?: string;
prompt?: string;
dataId?: string;
q: string;
a: string;

View File

@@ -103,6 +103,7 @@ export const createCollectionAndInsertData = async ({
delete formatCreateCollectionParams.chunkSize;
delete formatCreateCollectionParams.chunkSplitter;
delete formatCreateCollectionParams.indexSize;
delete formatCreateCollectionParams.indexPrefixTitle;
}
}
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
@@ -223,7 +224,6 @@ export const createCollectionAndInsertData = async ({
vlmModel: dataset.vlmModel,
indexSize,
mode: trainingMode,
prompt: formatCreateCollectionParams.qaPrompt,
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,

View File

@@ -32,6 +32,7 @@ export const ChunkSettings = {
imageIndex: Boolean,
autoIndexes: Boolean,
indexPrefixTitle: Boolean,
chunkSettingMode: {
type: String,

View File

@@ -27,23 +27,6 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
} catch (error) {}
};
export const pushDataListToTrainingQueueByCollectionId = async ({
collectionId,
...props
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
const {
dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
} = await getCollectionWithDataset(collectionId);
return pushDataListToTrainingQueue({
...props,
datasetId,
collectionId,
vectorModel,
agentModel,
vlmModel
});
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
@@ -53,7 +36,6 @@ export async function pushDataListToTrainingQueue({
vectorModel,
vlmModel,
data,
prompt,
billId,
mode = TrainingModeEnum.chunk,
indexSize,
@@ -149,8 +131,6 @@ export async function pushDataListToTrainingQueue({
collectionId: collectionId,
billId,
mode: formatTrainingMode(item, mode),
prompt,
model,
...(item.q && { q: item.q }),
...(item.a && { a: item.a }),
...(item.imageId && { imageId: item.imageId }),
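With `pushDataListToTrainingQueueByCollectionId` removed, callers resolve the dataset's models up front and call `pushDataListToTrainingQueue` directly (the QA queue later in this commit does exactly that). A hedged sketch of the new call shape; the ids, model names, and the `TrainingModeEnum` import path are placeholders or assumptions:

```ts
import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
// Enum path assumed; adjust to wherever TrainingModeEnum actually lives.
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';

// The dataset document (and its model fields) is assumed to be loaded
// beforehand, e.g. via getCollectionWithDataset, since the per-collection
// wrapper that used to do that lookup was removed in this commit.
async function queueChunks(data: { q: string; a?: string; chunkIndex: number }[]) {
  return pushDataListToTrainingQueue({
    teamId: 'teamId',
    tmbId: 'tmbId',
    datasetId: 'datasetId',
    collectionId: 'collectionId',
    mode: TrainingModeEnum.chunk,
    vectorModel: 'text-embedding-3-small', // placeholder model names
    agentModel: 'gpt-4o-mini',
    vlmModel: 'gpt-4o-mini',
    billId: 'billId',
    data
  });
}
```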

View File

@@ -10,6 +10,7 @@ import {
TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { DatasetDataCollectionName } from '../data/schema';
export const DatasetTrainingCollectionName = 'dataset_trainings';
@@ -54,8 +55,6 @@ const TrainingDataSchema = new Schema({
default: 5
},
model: String,
prompt: String,
q: {
type: String,
default: ''
@@ -74,7 +73,10 @@ const TrainingDataSchema = new Schema({
type: Number,
default: 0
},
dataId: Schema.Types.ObjectId,
dataId: {
type: Schema.Types.ObjectId,
ref: DatasetDataCollectionName
},
indexes: {
type: [
{
@@ -105,6 +107,12 @@ TrainingDataSchema.virtual('collection', {
foreignField: '_id',
justOne: true
});
TrainingDataSchema.virtual('data', {
ref: DatasetDataCollectionName,
localField: 'dataId',
foreignField: '_id',
justOne: true
});
try {
// lock training data(teamId); delete training data
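The `ref` added to `dataId` and the new `data` virtual let the queues populate the linked `dataset_data` document in the same query instead of doing a second `findById`. A rough sketch of how that population can be used, matching the field selections the vector queue uses later in this commit (import path assumed):

```ts
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema'; // path assumed

type Populated = {
  dataset: { vectorModel: string };
  collection: { name: string; indexPrefixTitle?: boolean };
  data?: { _id: string; indexes: { text: string; dataId?: string }[] };
};

// Illustrative query: populate the 'dataset', 'collection', and new 'data'
// virtuals on a training task in one round trip.
async function peekTrainingTask(id: string) {
  return MongoDatasetTraining.findById(id)
    .populate<Populated>([
      { path: 'dataset', select: 'vectorModel' },
      { path: 'collection', select: 'name indexPrefixTitle' },
      { path: 'data', select: '_id indexes' }
    ])
    .lean();
}
```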

View File

@@ -111,6 +111,8 @@
"import_param_setting": "Parameter settings",
"import_select_file": "Select a file",
"import_select_link": "Enter link",
"index_prefix_title": "Index add title",
"index_prefix_title_tips": "Automatically add title names to all indexes",
"index_size": "Index size",
"index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.",
"input_required_field_to_select_baseurl": "Please enter the required information first",

View File

@@ -111,6 +111,8 @@
"import_param_setting": "参数设置",
"import_select_file": "选择文件",
"import_select_link": "输入链接",
"index_prefix_title": "将标题加入索引",
"index_prefix_title_tips": "自动给索引所有索引加标题名",
"index_size": "索引大小",
"index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。",
"input_required_field_to_select_baseurl": "请先输入必填信息",

View File

@@ -110,6 +110,8 @@
"import_param_setting": "參數設定",
"import_select_file": "選擇文件",
"import_select_link": "輸入連結",
"index_prefix_title": "將標題加入索引",
"index_prefix_title_tips": "自動給索引所有索引加標題名",
"index_size": "索引大小",
"index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。",
"input_required_field_to_select_baseurl": "請先輸入必填信息",

View File

@@ -1,4 +1,5 @@
import { Box, Flex, Textarea } from '@chakra-ui/react';
import type { FlexProps } from '@chakra-ui/react';
import { Box, Flex, Textarea, useBoolean } from '@chakra-ui/react';
import React, { useRef, useCallback, useMemo, useState } from 'react';
import { useTranslation } from 'next-i18next';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
@@ -48,6 +49,8 @@ const ChatInput = ({
const { setValue, watch, control } = chatForm;
const inputValue = watch('input');
const [focusing, { on: onFocus, off: offFocus }] = useBoolean();
// Check voice input state
const [mobilePreSpeak, setMobilePreSpeak] = useState(false);
@@ -207,6 +210,8 @@ const ChatInput = ({
}
}
}}
onFocus={onFocus}
onBlur={offFocus}
/>
</Flex>
</Flex>
@@ -254,7 +259,8 @@ const ChatInput = ({
borderRadius={'sm'}
cursor={'pointer'}
_hover={{ bg: 'rgba(0, 0, 0, 0.04)' }}
onClick={() => {
onClick={(e) => {
e.stopPropagation();
onOpenSelectFile();
}}
>
@@ -276,7 +282,8 @@ const ChatInput = ({
borderRadius={'sm'}
cursor={'pointer'}
_hover={{ bg: 'rgba(0, 0, 0, 0.04)' }}
onClick={() => {
onClick={(e) => {
e.stopPropagation();
VoiceInputRef.current?.onSpeak?.();
}}
>
@@ -307,7 +314,8 @@ const ChatInput = ({
}
borderRadius={['md', 'lg']}
cursor={isChatting ? 'pointer' : canSendMessage ? 'pointer' : 'not-allowed'}
onClick={() => {
onClick={(e) => {
e.stopPropagation();
if (isChatting) {
return onStop();
}
@@ -343,6 +351,11 @@ const ChatInput = ({
onStop
]);
const activeStyles: FlexProps = {
boxShadow: '0px 5px 20px -4px rgba(19, 51, 107, 0.13)',
border: '0.5px solid rgba(0, 0, 0, 0.24)'
};
return (
<Box
m={['0 auto 10px', '10px auto']}
@@ -381,12 +394,17 @@ const ChatInput = ({
pt={fileList.length > 0 ? '0' : mobilePreSpeak ? [0, 4] : [3, 4]}
pb={[2, 4]}
position={'relative'}
boxShadow={`0px 5px 16px -4px rgba(19, 51, 107, 0.08)`}
borderRadius={['xl', 'xxl']}
bg={'white'}
overflow={'display'}
border={'0.5px solid rgba(0, 0, 0, 0.15)'}
borderColor={'rgba(0,0,0,0.12)'}
{...(focusing
? activeStyles
: {
_hover: activeStyles,
border: '0.5px solid rgba(0, 0, 0, 0.18)',
boxShadow: `0px 5px 16px -4px rgba(19, 51, 107, 0.08)`
})}
onClick={() => TextareaDom?.current?.focus()}
>
<Box flex={1}>
{/* Chat input guide box */}
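The wrapper now tracks focus with Chakra's `useBoolean` and spreads a shared `activeStyles` object when focused (or offers it on hover), while the inner buttons stop propagation so their clicks don't bubble into the wrapper's focus-the-textarea handler. A trimmed-down sketch of that pattern, with the component and layout simplified:

```tsx
import React, { useRef } from 'react';
import { Box, Flex, Textarea, useBoolean } from '@chakra-ui/react';
import type { FlexProps } from '@chakra-ui/react';

// Shared styles applied on focus, and offered as _hover when not focused.
const activeStyles: FlexProps = {
  boxShadow: '0px 5px 20px -4px rgba(19, 51, 107, 0.13)',
  border: '0.5px solid rgba(0, 0, 0, 0.24)'
};

const InputSketch = () => {
  const ref = useRef<HTMLTextAreaElement>(null);
  const [focusing, { on: onFocus, off: offFocus }] = useBoolean();

  return (
    <Flex
      {...(focusing ? activeStyles : { _hover: activeStyles })}
      onClick={() => ref.current?.focus()} // clicking the wrapper focuses the textarea
    >
      <Textarea ref={ref} border={'none'} onFocus={onFocus} onBlur={offFocus} />
      <Box
        cursor={'pointer'}
        onClick={(e) => {
          e.stopPropagation(); // keep button clicks from re-triggering the wrapper's onClick
        }}
      >
        send
      </Box>
    </Flex>
  );
};

export default InputSketch;
```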

View File

@@ -13,7 +13,8 @@ import {
Textarea,
useDisclosure,
Checkbox,
HStack
HStack,
Grid
} from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
@@ -35,7 +36,6 @@ import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContex
import MySelect from '@fastgpt/web/components/common/MySelect';
import {
chunkAutoChunkSize,
getAutoIndexSize,
getIndexSizeSelectList,
getLLMDefaultChunkSize,
getLLMMaxChunkSize,
@@ -44,7 +44,6 @@ import {
minChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
import type { LLMModelItemType, EmbeddingModelItemType } from '@fastgpt/global/core/ai/model.d';
const PromptTextarea = ({
defaultValue = '',
@@ -98,6 +97,7 @@ export type CollectionChunkFormType = {
// Index enhance
imageIndex: boolean;
autoIndexes: boolean;
indexPrefixTitle: boolean;
// Chunk setting
chunkSettingMode: ChunkSettingModeEnum; // system parameters / custom parameters
@@ -133,6 +133,7 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
const autoIndexes = watch('autoIndexes');
const indexSize = watch('indexSize');
const imageIndex = watch('imageIndex');
const indexPrefixTitle = watch('indexPrefixTitle');
const paragraphChunkAIMode = watch('paragraphChunkAIMode');
const trainingModeList = useMemo(() => {
@@ -282,48 +283,56 @@ const CollectionChunkForm = ({ form }: { form: UseFormReturn<CollectionChunkForm
</Box>
)}
{trainingType === DatasetCollectionDataProcessModeEnum.chunk &&
feConfigs?.show_dataset_enhance !== false && (
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:enhanced_indexes')}
</Box>
<HStack gap={[3, 7]}>
<HStack flex={'1'} spacing={1}>
<MyTooltip label={!feConfigs?.isPlus ? t('common:commercial_function_tip') : ''}>
<Checkbox
isDisabled={!feConfigs?.isPlus}
isChecked={autoIndexes}
{...register('autoIndexes')}
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:enhanced_indexes')}
</Box>
<Grid gridTemplateColumns={'1fr 1fr'} rowGap={[2, 4]} columnGap={[3, 7]}>
<HStack flex={'1'} spacing={1}>
<Checkbox isChecked={indexPrefixTitle} {...register('indexPrefixTitle')}>
<FormLabel>{t('dataset:index_prefix_title')}</FormLabel>
</Checkbox>
<QuestionTip label={t('dataset:index_prefix_title_tips')} />
</HStack>
{trainingType === DatasetCollectionDataProcessModeEnum.chunk &&
feConfigs?.show_dataset_enhance !== false && (
<>
<HStack flex={'1'} spacing={1}>
<MyTooltip label={!feConfigs?.isPlus ? t('common:commercial_function_tip') : ''}>
<Checkbox
isDisabled={!feConfigs?.isPlus}
isChecked={autoIndexes}
{...register('autoIndexes')}
>
<FormLabel>{t('dataset:auto_indexes')}</FormLabel>
</Checkbox>
</MyTooltip>
<QuestionTip label={t('dataset:auto_indexes_tips')} />
</HStack>
<HStack flex={'1'} spacing={1}>
<MyTooltip
label={
!feConfigs?.isPlus
? t('common:commercial_function_tip')
: !datasetDetail?.vlmModel
? t('common:error_vlm_not_config')
: ''
}
>
<FormLabel>{t('dataset:auto_indexes')}</FormLabel>
</Checkbox>
</MyTooltip>
<QuestionTip label={t('dataset:auto_indexes_tips')} />
</HStack>
<HStack flex={'1'} spacing={1}>
<MyTooltip
label={
!feConfigs?.isPlus
? t('common:commercial_function_tip')
: !datasetDetail?.vlmModel
? t('common:error_vlm_not_config')
: ''
}
>
<Checkbox
isDisabled={!feConfigs?.isPlus || !datasetDetail?.vlmModel}
isChecked={imageIndex}
{...register('imageIndex')}
>
<FormLabel>{t('dataset:image_auto_parse')}</FormLabel>
</Checkbox>
</MyTooltip>
<QuestionTip label={t('dataset:image_auto_parse_tips')} />
</HStack>
</HStack>
</Box>
)}
<Checkbox
isDisabled={!feConfigs?.isPlus || !datasetDetail?.vlmModel}
isChecked={imageIndex}
{...register('imageIndex')}
>
<FormLabel>{t('dataset:image_auto_parse')}</FormLabel>
</Checkbox>
</MyTooltip>
<QuestionTip label={t('dataset:image_auto_parse_tips')} />
</HStack>
</>
)}
</Grid>
</Box>
<Box mt={6}>
<Box fontSize={'sm'} mb={2} color={'myGray.600'}>
{t('dataset:chunk_process_params')}

View File

@@ -49,6 +49,7 @@ export const defaultFormData: ImportFormType = {
imageIndex: false,
autoIndexes: false,
indexPrefixTitle: true,
chunkSettingMode: ChunkSettingModeEnum.auto,
chunkSplitMode: DataChunkSplitModeEnum.paragraph,

View File

@@ -55,8 +55,9 @@ const ReTraining = () => {
dataEnhanceCollectionName:
collection.dataEnhanceCollectionName || defaultFormData.dataEnhanceCollectionName,
imageIndex: collection.imageIndex || defaultFormData.imageIndex,
autoIndexes: collection.autoIndexes || defaultFormData.autoIndexes,
imageIndex: collection.imageIndex ?? defaultFormData.imageIndex,
autoIndexes: collection.autoIndexes ?? defaultFormData.autoIndexes,
indexPrefixTitle: collection.indexPrefixTitle ?? defaultFormData.indexPrefixTitle,
chunkSettingMode: collection.chunkSettingMode || defaultFormData.chunkSettingMode,
chunkSplitMode: collection.chunkSplitMode || defaultFormData.chunkSplitMode,

View File

@@ -84,15 +84,13 @@ const InputDataModal = ({
onSuccess(res) {
if (res.type === DatasetCollectionTypeEnum.images) {
setCurrentTab(TabEnum.image);
} else {
setCurrentTab(TabEnum.chunk);
}
}
}
);
// Get data
const { loading: isFetchingData } = useRequest2(
const { data: dataItem, loading: isFetchingData } = useRequest2(
async () => {
if (dataId) return getDatasetDataItemById(dataId);
return null;
@@ -125,6 +123,11 @@ const InputDataModal = ({
}
);
useEffect(() => {
if (currentTab || !dataItem) return;
setCurrentTab(dataItem.a ? TabEnum.qa : TabEnum.chunk);
}, [collection, dataItem, currentTab]);
// Import new data
const { runAsync: sureImportData, loading: isImporting } = useRequest2(
async (e: InputDataType) => {

View File

@@ -46,6 +46,10 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
const webSelector = collection?.metadata?.webPageSelector;
return [
{
label: t('common:core.dataset.collection.id'),
value: collection?._id
},
{
label: t('common:core.dataset.collection.metadata.source'),
value: t(DatasetCollectionTypeMap[collection.type]?.name as any)
@@ -94,6 +98,14 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
}
]
: []),
...(collection.indexPrefixTitle !== undefined
? [
{
label: t('dataset:index_prefix_title'),
value: collection.indexPrefixTitle ? 'Yes' : 'No'
}
]
: []),
...(collection.imageIndex !== undefined
? [
{
@@ -146,26 +158,22 @@ const MetaDataCard = ({ datasetId }: { datasetId: string }) => {
}, [collection, t]);
return (
<MyBox isLoading={isLoading} w={'100%'} h={'100%'} p={6}>
<Box fontSize={'md'} pb={4}>
<MyBox isLoading={isLoading} w={'100%'} h={'100%'} p={6} overflow={'auto'}>
<Box fontSize={'md'} fontWeight={'bold'} color={'myGray.900'} pb={4}>
{t('common:core.dataset.collection.metadata.metadata')}
</Box>
<Flex mb={3} wordBreak={'break-all'} fontSize={'sm'}>
<Box color={'myGray.500'} flex={'0 0 90px'}>
{t('common:core.dataset.collection.id')}:
</Box>
<Box>{collection?._id}</Box>
</Flex>
{metadataList.map(
(item, i) =>
item.label &&
item.value && (
<Flex key={i} alignItems={'center'} mb={3} wordBreak={'break-all'} fontSize={'sm'}>
<Box color={'myGray.500'} flex={'0 0 90px'}>
<Box key={i} mb={3} wordBreak={'break-all'}>
<Box color={'myGray.500'} fontSize={'xs'}>
{item.label}
</Box>
<Box>{item.value}</Box>
</Flex>
<Box color={'myGray.900'} fontSize={'sm'}>
{item.value}
</Box>
</Box>
)
)}
{collection?.sourceId && (

View File

@@ -48,7 +48,9 @@ async function handler(req: NextApiRequest) {
const [
{
dataset: { _id: datasetId, vectorModel, agentModel }
dataset: { _id: datasetId, vectorModel, agentModel },
indexPrefixTitle,
name
}
] = await Promise.all([getCollectionWithDataset(collectionId)]);
@@ -84,6 +86,7 @@ async function handler(req: NextApiRequest) {
q: formatQ,
a: formatA,
chunkIndex: 0,
indexPrefix: indexPrefixTitle ? `# ${name}` : undefined,
embeddingModel: vectorModelData.model,
indexes: formatIndexes
});

View File

@@ -8,13 +8,16 @@ import { type ApiRequestProps } from '@fastgpt/service/type/next';
import { addOperationLog } from '@fastgpt/service/support/operationLog/addOperationLog';
import { OperationLogEventEnum } from '@fastgpt/global/support/operationLog/constants';
import { getI18nDatasetType } from '@fastgpt/service/support/operationLog/util';
async function handler(req: ApiRequestProps<UpdateDatasetDataProps>) {
const { dataId, q, a, indexes = [] } = req.body;
// auth data permission
const {
collection: {
dataset: { vectorModel }
dataset: { vectorModel },
name,
indexPrefixTitle
},
teamId,
tmbId,
@@ -33,7 +36,8 @@ async function handler(req: ApiRequestProps<UpdateDatasetDataProps>) {
q,
a,
indexes,
model: vectorModel
model: vectorModel,
indexPrefix: indexPrefixTitle ? `# ${name}` : undefined
});
pushGenerateVectorUsage({

View File

@@ -41,7 +41,7 @@ type Props = { datasetId: string; currentTab: TabEnum };
const sliderStyles: FlexProps = {
bg: 'white',
borderRadius: 'md',
overflowY: 'scroll',
overflowY: 'auto',
boxShadow: 2
};

View File

@@ -25,13 +25,15 @@ const formatIndexes = async ({
q,
a = '',
indexSize,
maxIndexSize
maxIndexSize,
indexPrefix
}: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string;
a?: string;
indexSize: number;
maxIndexSize: number;
indexPrefix?: string;
}): Promise<
{
type: `${DatasetDataIndexTypeEnum}`;
@@ -39,6 +41,12 @@ const formatIndexes = async ({
dataId?: string;
}[]
> => {
const formatText = (text: string) => {
if (indexPrefix && !text.startsWith(indexPrefix)) {
return `${indexPrefix}\n${text}`;
}
return text;
};
/* get dataset data default index */
const getDefaultIndex = async ({
q = '',
@@ -62,11 +70,11 @@ const formatIndexes = async ({
return [
...qChunks.map((text) => ({
text,
text: formatText(text),
type: DatasetDataIndexTypeEnum.default
})),
...aChunks.map((text) => ({
text,
text: formatText(text),
type: DatasetDataIndexTypeEnum.default
}))
];
@@ -130,9 +138,22 @@ const formatIndexes = async ({
return item;
})
)
).flat();
)
.flat()
.filter((item) => !!item.text.trim());
return chekcIndexes.filter((item) => !!item.text.trim());
// Add prefix
const prefixIndexes = indexPrefix
? chekcIndexes.map((index) => {
if (index.type === DatasetDataIndexTypeEnum.custom) return index;
return {
...index,
text: formatText(index.text)
};
})
: chekcIndexes;
return prefixIndexes;
};
/* insert data.
* 1. create data id
@@ -150,6 +171,7 @@ export async function insertData2Dataset({
chunkIndex = 0,
indexSize = 512,
indexes,
indexPrefix,
embeddingModel,
session
}: CreateDatasetDataProps & {
@@ -174,7 +196,8 @@ export async function insertData2Dataset({
q,
a,
indexSize,
maxIndexSize: embModel.maxToken
maxIndexSize: embModel.maxToken,
indexPrefix
});
// insert to vector store
@@ -255,7 +278,8 @@ export async function updateData2Dataset({
a,
indexes,
model,
indexSize = 512
indexSize = 512,
indexPrefix
}: UpdateDatasetDataProps & { model: string; indexSize?: number }) {
if (!Array.isArray(indexes)) {
return Promise.reject('indexes is required');
@@ -271,7 +295,8 @@ export async function updateData2Dataset({
q,
a,
indexSize,
maxIndexSize: getEmbeddingModel(model).maxToken
maxIndexSize: getEmbeddingModel(model).maxToken,
indexPrefix
});
// 3. Patch indexes, create, update, delete

View File

@@ -101,21 +101,13 @@ export const datasetParseQueue = async (): Promise<any> => {
$inc: { retryCount: -1 }
}
)
.select({
_id: 1,
teamId: 1,
tmbId: 1,
datasetId: 1,
collectionId: 1,
billId: 1,
q: 1
})
.populate<{
dataset: DatasetSchemaType;
collection: DatasetCollectionSchemaType;
}>([
{
path: 'collection'
path: 'collection',
select: '-qaPrompt'
},
{
path: 'dataset'
@@ -300,7 +292,6 @@ export const datasetParseQueue = async (): Promise<any> => {
vlmModel: dataset.vlmModel,
indexSize: collection.indexSize,
mode: trainingMode,
prompt: collection.qaPrompt,
billId: data.billId,
data: chunks.map((item, index) => ({
...item,

View File

@@ -14,7 +14,6 @@ import {
countGptMessagesTokens,
countPromptTokens
} from '@fastgpt/service/common/string/tiktoken/index';
import { pushDataListToTrainingQueueByCollectionId } from '@fastgpt/service/core/dataset/training/controller';
import { loadRequestMessages } from '@fastgpt/service/core/chat/utils';
import { llmCompletionsBodyFormat, formatLLMResponse } from '@fastgpt/service/core/ai/utils';
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
@@ -24,6 +23,7 @@ import {
} from '@fastgpt/global/core/dataset/training/utils';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { text2Chunks } from '@fastgpt/service/worker/function';
import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';
const reduceQueue = () => {
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -41,6 +41,11 @@ const reduceQueueAndReturn = (delay = 0) => {
}
};
type PopulateType = {
dataset: { vectorModel: string; agentModel: string; vlmModel: string };
collection: { qaPrompt?: string };
};
export async function generateQA(): Promise<any> {
const max = global.systemEnv?.qaMaxProcess || 10;
addLog.debug(`[QA Queue] Queue size: ${global.qaQueueLen}`);
@@ -68,18 +73,16 @@ export async function generateQA(): Promise<any> {
$inc: { retryCount: -1 }
}
)
.select({
_id: 1,
teamId: 1,
tmbId: 1,
datasetId: 1,
collectionId: 1,
q: 1,
model: 1,
chunkIndex: 1,
billId: 1,
prompt: 1
})
.populate<PopulateType>([
{
path: 'dataset',
select: 'agentModel vectorModel vlmModel'
},
{
path: 'collection',
select: 'qaPrompt'
}
])
.lean();
// task preemption
@@ -110,6 +113,13 @@ export async function generateQA(): Promise<any> {
return reduceQueueAndReturn();
}
if (!data.dataset || !data.collection) {
addLog.info(`[QA Queue] Dataset or collection not found`, data);
// Delete data
await MongoDatasetTraining.deleteOne({ _id: data._id });
return reduceQueueAndReturn();
}
// auth balance
if (!(await checkTeamAiPointsAndLock(data.teamId))) {
return reduceQueueAndReturn();
@@ -117,8 +127,8 @@ export async function generateQA(): Promise<any> {
addLog.info(`[QA Queue] Start`);
try {
const modelData = getLLMModel(data.model);
const prompt = `${data.prompt || Prompt_AgentQA.description}
const modelData = getLLMModel(data.dataset.agentModel);
const prompt = `${data.collection.qaPrompt || Prompt_AgentQA.description}
${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
// request LLM to get QA
@@ -147,16 +157,20 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
const qaArr = await formatSplitText({ answer, rawText: text, llmModel: modelData }); // formatted QA pairs
// get vector and insert
await pushDataListToTrainingQueueByCollectionId({
await pushDataListToTrainingQueue({
teamId: data.teamId,
tmbId: data.tmbId,
datasetId: data.datasetId,
collectionId: data.collectionId,
mode: TrainingModeEnum.chunk,
data: qaArr.map((item) => ({
...item,
chunkIndex: data.chunkIndex
})),
billId: data.billId
billId: data.billId,
vectorModel: data.dataset.vectorModel,
agentModel: data.dataset.agentModel,
vlmModel: data.dataset.vlmModel
});
// delete data from training
@@ -192,7 +206,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
}
);
return reduceQueueAndReturn(1000);
return reduceQueueAndReturn(500);
}
}

View File

@@ -12,10 +12,13 @@ import {
} from '@fastgpt/service/common/vectorDB/controller';
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import type { Document } from '@fastgpt/service/common/mongo';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { getMaxIndexSize } from '@fastgpt/global/core/dataset/training/utils';
import type {
DatasetDataSchemaType,
DatasetTrainingSchemaType
} from '@fastgpt/global/core/dataset/type';
import { retryFn } from '@fastgpt/global/common/system/utils';
const reduceQueue = () => {
global.vectorQueueLen = global.vectorQueueLen > 0 ? global.vectorQueueLen - 1 : 0;
@@ -33,6 +36,13 @@ const reduceQueueAndReturn = (delay = 0) => {
}
};
type PopulateType = {
dataset: { vectorModel: string };
collection: { name: string; indexPrefixTitle: boolean };
data: { _id: string; indexes: DatasetDataSchemaType['indexes'] };
};
type TrainingDataType = DatasetTrainingSchemaType & PopulateType;
/* Index generation queue. Each import runs as a separate thread */
export async function generateVector(): Promise<any> {
const max = global.systemEnv?.vectorMaxProcess || 10;
@@ -59,7 +69,22 @@ export async function generateVector(): Promise<any> {
lockTime: new Date(),
$inc: { retryCount: -1 }
}
);
)
.populate<PopulateType>([
{
path: 'dataset',
select: 'vectorModel'
},
{
path: 'collection',
select: 'name indexPrefixTitle'
},
{
path: 'data',
select: '_id indexes'
}
])
.lean();
// task preemption
if (!data) {
@@ -89,6 +114,13 @@ export async function generateVector(): Promise<any> {
return reduceQueueAndReturn();
}
if (!data.dataset || !data.collection) {
addLog.info(`[Vector Queue] Dataset or collection not found`, data);
// Delete data
await MongoDatasetTraining.deleteOne({ _id: data._id });
return reduceQueueAndReturn();
}
// auth balance
if (!(await checkTeamAiPointsAndLock(data.teamId))) {
return reduceQueueAndReturn();
@@ -110,7 +142,7 @@ export async function generateVector(): Promise<any> {
teamId: data.teamId,
tmbId: data.tmbId,
inputTokens: tokens,
model: data.model,
model: data.dataset.vectorModel,
billId: data.billId
});
@@ -131,75 +163,62 @@ export async function generateVector(): Promise<any> {
errorMsg: getErrText(err, 'unknown error')
}
);
return reduceQueueAndReturn(1000);
return reduceQueueAndReturn(500);
}
}
const rebuildData = async ({
trainingData
}: {
trainingData: Document<unknown, {}, DatasetTrainingSchemaType> &
Omit<
DatasetTrainingSchemaType &
Required<{
_id: string;
}>,
never
>;
}) => {
// find data
const mongoData = await MongoDatasetData.findById(
trainingData.dataId,
'indexes teamId datasetId collectionId'
);
if (!mongoData) {
await trainingData.deleteOne();
const rebuildData = async ({ trainingData }: { trainingData: TrainingDataType }) => {
if (!trainingData.data) {
await MongoDatasetTraining.deleteOne({ _id: trainingData._id });
return Promise.reject('Not data');
}
const deleteVectorIdList = mongoData.indexes.map((index) => index.dataId);
// Old vectorId
const deleteVectorIdList = trainingData.data.indexes.map((index) => index.dataId);
// Find next rebuilding data to insert training queue
await mongoSessionRun(async (session) => {
// get new mongoData insert to training
const newRebuildingData = await MongoDatasetData.findOneAndUpdate(
{
rebuilding: true,
teamId: mongoData.teamId,
datasetId: mongoData.datasetId
},
{
$unset: {
rebuilding: null
},
updateTime: new Date()
},
{ session }
).select({
_id: 1,
collectionId: 1
});
if (newRebuildingData) {
await MongoDatasetTraining.create(
[
try {
await retryFn(() =>
mongoSessionRun(async (session) => {
// get new mongoData insert to training
const newRebuildingData = await MongoDatasetData.findOneAndUpdate(
{
teamId: mongoData.teamId,
tmbId: trainingData.tmbId,
datasetId: mongoData.datasetId,
collectionId: newRebuildingData.collectionId,
billId: trainingData.billId,
mode: TrainingModeEnum.chunk,
model: trainingData.model,
dataId: newRebuildingData._id,
retryCount: 50
}
],
{ session, ordered: true }
);
}
});
rebuilding: true,
teamId: trainingData.teamId,
datasetId: trainingData.datasetId
},
{
$unset: {
rebuilding: null
},
updateTime: new Date()
},
{ session }
).select({
_id: 1,
collectionId: 1
});
if (newRebuildingData) {
await MongoDatasetTraining.create(
[
{
teamId: trainingData.teamId,
tmbId: trainingData.tmbId,
datasetId: trainingData.datasetId,
collectionId: newRebuildingData.collectionId,
billId: trainingData.billId,
mode: TrainingModeEnum.chunk,
dataId: newRebuildingData._id,
retryCount: 50
}
],
{ session, ordered: true }
);
}
})
);
} catch (error) {}
// update vector, update dataset_data rebuilding status, delete data from training
// 1. Insert new vector to dataset_data
@@ -208,28 +227,36 @@ const rebuildData = async ({
insertId: string;
}[] = [];
let i = 0;
for await (const index of mongoData.indexes) {
for await (const index of trainingData.data.indexes) {
const result = await insertDatasetDataVector({
query: index.text,
model: getEmbeddingModel(trainingData.model),
teamId: mongoData.teamId,
datasetId: mongoData.datasetId,
collectionId: mongoData.collectionId
model: getEmbeddingModel(trainingData.dataset.vectorModel),
teamId: trainingData.teamId,
datasetId: trainingData.datasetId,
collectionId: trainingData.collectionId
});
mongoData.indexes[i].dataId = result.insertId;
trainingData.data.indexes[i].dataId = result.insertId;
updateResult.push(result);
i++;
}
const { tokens } = await mongoSessionRun(async (session) => {
// 2. Ensure that the training data is deleted after the Mongo update is successful
await mongoData.save({ session });
await MongoDatasetData.updateOne(
{ _id: trainingData.data._id },
{
$set: {
indexes: trainingData.data.indexes
}
},
{ session }
);
// 3. Delete the training data
await trainingData.deleteOne({ session });
await MongoDatasetTraining.deleteOne({ _id: trainingData._id }, { session });
// 4. Delete old vector
await deleteDatasetDataVector({
teamId: mongoData.teamId,
teamId: trainingData.teamId,
idList: deleteVectorIdList
});
@@ -241,19 +268,8 @@ const rebuildData = async ({
return { tokens };
};
const insertData = async ({
trainingData
}: {
trainingData: Document<unknown, {}, DatasetTrainingSchemaType> &
Omit<
DatasetTrainingSchemaType &
Required<{
_id: string;
}>,
never
>;
}) => {
const { tokens } = await mongoSessionRun(async (session) => {
const insertData = async ({ trainingData }: { trainingData: TrainingDataType }) => {
return mongoSessionRun(async (session) => {
// insert new data to dataset
const { tokens } = await insertData2Dataset({
teamId: trainingData.teamId,
@@ -264,18 +280,21 @@ const insertData = async ({
a: trainingData.a,
imageId: trainingData.imageId,
chunkIndex: trainingData.chunkIndex,
indexSize: trainingData.indexSize || getMaxIndexSize(getEmbeddingModel(trainingData.model)),
indexSize:
trainingData.indexSize ||
getMaxIndexSize(getEmbeddingModel(trainingData.dataset.vectorModel)),
indexes: trainingData.indexes,
embeddingModel: trainingData.model,
indexPrefix: trainingData.collection.indexPrefixTitle
? `# ${trainingData.collection.name}`
: undefined,
embeddingModel: trainingData.dataset.vectorModel,
session
});
// delete data from training
await trainingData.deleteOne({ session });
await MongoDatasetTraining.deleteOne({ _id: trainingData._id }, { session });
return {
tokens
};
});
return { tokens };
};
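The rebuild handoff (claiming the next `rebuilding` row inside a transaction) is now wrapped in `retryFn` with the final failure swallowed, so a transient Mongo conflict no longer aborts vector generation. A hedged sketch of that retry-around-session pattern; `retryFn`'s real signature isn't shown in this diff, so a local stand-in is used:

```ts
// Local stand-in for retryFn from '@fastgpt/global/common/system/utils';
// the real helper's signature may differ.
const retryFnSketch = async <T>(fn: () => Promise<T>, times = 3): Promise<T> => {
  try {
    return await fn();
  } catch (error) {
    if (times <= 0) throw error;
    return retryFnSketch(fn, times - 1);
  }
};

// Usage mirroring the rebuildData change: retry the transactional handoff a
// few times, then swallow the final error so the queue keeps moving.
async function handoffNextRebuildingRow(runSessionOnce: () => Promise<void>) {
  try {
    await retryFnSketch(() => runSessionOnce());
  } catch (error) {
    // intentionally ignored, same as the empty catch in the diff
  }
}
```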

View File

@@ -33,8 +33,7 @@ describe('delete training data test', () => {
tmbId: root.tmbId,
datasetId: dataset._id,
collectionId: collection._id,
mode: TrainingModeEnum.chunk,
model: 'test'
mode: TrainingModeEnum.chunk
});
const res = await Call<deleteTrainingDataBody, {}, deleteTrainingDataResponse>(handler, {

View File

@@ -33,7 +33,6 @@ describe('get training data detail test', () => {
tmbId: root.tmbId,
datasetId: dataset._id,
collectionId: collection._id,
model: 'test',
mode: TrainingModeEnum.chunk,
q: 'test',
a: 'test'

View File

@@ -35,7 +35,6 @@ describe('training error list test', () => {
datasetId: dataset._id,
collectionId: collection._id,
mode: TrainingModeEnum.chunk,
model: 'test',
errorMsg: 'test'
}))
);

View File

@@ -33,8 +33,7 @@ describe('update training data test', () => {
tmbId: root.tmbId,
datasetId: dataset._id,
collectionId: collection._id,
mode: TrainingModeEnum.chunk,
model: 'test'
mode: TrainingModeEnum.chunk
});
const res = await Call<updateTrainingDataBody, {}, updateTrainingDataResponse>(handler, {