feat: Text check before synchronization (#689)

* fix: icon

* fix: web selector

* fix: web selector

* perf: link sync

* dev doc

* chmod doc

* perf: git intro

* 466 intro

* intro img

* add json editor (#5)

* team limit

* websync limit

* json editor

* text editor

* perf: search test

* change cq value type

* doc

* intro img

---------

Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
commit 828829011a (parent c2abbb579f)
Author: Archer
Date: 2024-01-04 23:19:24 +08:00
Committed by: GitHub
64 changed files with 1789 additions and 1489 deletions

View File

@@ -50,7 +50,8 @@ const defaultFeConfigs: FastGPTFeConfigsType = {
   concatMd:
     '* 项目开源地址: [FastGPT GitHub](https://github.com/labring/FastGPT)\n* 交流群: ![](https://doc.fastgpt.in/wechat-fastgpt.webp)',
   limit: {
-    exportLimitMinutes: 0
+    exportDatasetLimitMinutes: 0,
+    websiteSyncLimitMinuted: 0
   },
   scripts: [],
   favicon: '/favicon.ico'
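
The single exportLimitMinutes throttle is split here into separate knobs for dataset export and website sync (the "Minuted" typo is in the source itself). A minimal sketch of the corresponding limit shape, inferred from this hunk rather than copied from the real FastGPTFeConfigsType definition:

type FeConfigsLimitSketch = {
  exportDatasetLimitMinutes?: number; // minimum minutes between two dataset exports per team
  websiteSyncLimitMinuted?: number; // minimum minutes between two website syncs per team
};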

View File

@@ -1,73 +0,0 @@
-import type { NextApiRequest, NextApiResponse } from 'next';
-import { jsonRes } from '@fastgpt/service/common/response';
-import { connectToDatabase } from '@/service/mongo';
-import { MongoUser } from '@fastgpt/service/support/user/schema';
-import { addLog } from '@fastgpt/service/common/system/log';
-import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
-import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
-import { findDatasetIdTreeByTopDatasetId } from '@fastgpt/service/core/dataset/controller';
-
-export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
-  try {
-    await connectToDatabase();
-    let { datasetId } = req.query as {
-      datasetId: string;
-    };
-
-    if (!datasetId) {
-      throw new Error('缺少参数');
-    }
-
-    // credential check
-    const { userId } = await authDataset({ req, authToken: true, datasetId, per: 'w' });
-
-    await limitCheck({
-      datasetId,
-      userId
-    });
-
-    jsonRes(res);
-  } catch (err) {
-    res.status(500);
-    jsonRes(res, {
-      code: 500,
-      error: err
-    });
-  }
-}
-
-export async function limitCheck({ datasetId, userId }: { datasetId: string; userId: string }) {
-  const exportIds = await findDatasetIdTreeByTopDatasetId(datasetId);
-  const limitMinutesAgo = new Date(
-    Date.now() - (global.feConfigs?.limit?.exportLimitMinutes || 0) * 60 * 1000
-  );
-
-  // auth export times
-  const authTimes = await MongoUser.findOne(
-    {
-      _id: userId,
-      $or: [
-        { 'limit.exportKbTime': { $exists: false } },
-        { 'limit.exportKbTime': { $lte: limitMinutesAgo } }
-      ]
-    },
-    '_id limit'
-  );
-
-  if (!authTimes) {
-    const minutes = `${global.feConfigs?.limit?.exportLimitMinutes || 0} 分钟`;
-    return Promise.reject(`上次导出未到 ${minutes},每 ${minutes}仅可导出一次。`);
-  }
-
-  // auth max data
-  const total = await MongoDatasetData.countDocuments({
-    datasetId: { $in: exportIds }
-  });
-
-  addLog.info(`export datasets: ${datasetId}`, { total });
-
-  if (total > 100000) {
-    return Promise.reject('数据量超出 10 万,无法导出');
-  }
-}
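
This deleted handler enforced the export cooldown per user via limit.exportKbTime on MongoUser. The commit replaces it with team-scoped helpers in @fastgpt/service/support/user/utils, whose implementation is not part of this diff. A minimal sketch of what those helpers must do, assuming only that a last-export timestamp is persisted per team (the in-memory Map below is a stand-in for the real store):

// Hypothetical stand-in store; the real helpers persist this per team.
const lastExportAt = new Map<string, number>();

export async function checkExportDatasetLimit({
  teamId,
  limitMinutes = 0
}: {
  teamId: string;
  limitMinutes?: number;
}) {
  const last = lastExportAt.get(teamId) ?? 0;
  if (Date.now() - last < limitMinutes * 60 * 1000) {
    // reject while the cooldown window is still open
    return Promise.reject(`Each team can export once every ${limitMinutes} minutes`);
  }
}

export function updateExportDatasetLimit(teamId: string) {
  lastExportAt.set(teamId, Date.now());
}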

View File

@@ -2,14 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
 import { connectToDatabase } from '@/service/mongo';
 import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset';
-import { loadingOneChunkCollection } from '@fastgpt/service/core/dataset/collection/utils';
+import {
+  getCollectionAndRawText,
+  reloadCollectionChunks
+} from '@fastgpt/service/core/dataset/collection/utils';
 import { delCollectionRelevantData } from '@fastgpt/service/core/dataset/data/controller';
 import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
-import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import {
+  DatasetCollectionSyncResultEnum,
+  DatasetCollectionTypeEnum
+} from '@fastgpt/global/core/dataset/constant';
 import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
 import { createTrainingBill } from '@fastgpt/service/support/wallet/bill/controller';
 import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants';
 import { getQAModel, getVectorModel } from '@/service/core/ai/model';
+import { createOneCollection } from '@fastgpt/service/core/dataset/collection/controller';

 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
@@ -32,6 +38,18 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
       return Promise.reject(DatasetErrEnum.unLinkCollection);
     }

+    const { rawText, isSameRawText } = await getCollectionAndRawText({
+      collection
+    });
+
+    if (isSameRawText) {
+      return jsonRes(res, {
+        data: DatasetCollectionSyncResultEnum.sameRaw
+      });
+    }
+
+    /* Not the same original text, create and reload */
     const vectorModelData = getVectorModel(collection.datasetId.vectorModel);
     const agentModelData = getQAModel(collection.datasetId.agentModel);

     // create training bill
@@ -45,26 +63,27 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     });

     // create a collection and delete old
-    const { _id } = await MongoDatasetCollection.create({
-      parentId: collection.parentId,
+    const _id = await createOneCollection({
       teamId: collection.teamId,
       tmbId: collection.tmbId,
+      parentId: collection.parentId,
       datasetId: collection.datasetId._id,
-      type: collection.type,
       name: collection.name,
-      createTime: collection.createTime,
+      type: collection.type,
       trainingType: collection.trainingType,
       chunkSize: collection.chunkSize,
       fileId: collection.fileId,
       rawLink: collection.rawLink,
-      metadata: collection.metadata
+      metadata: collection.metadata,
+      createTime: collection.createTime
     });

     // start load
-    await loadingOneChunkCollection({
+    await reloadCollectionChunks({
       collectionId: _id,
       tmbId,
-      billId
+      billId,
+      rawText
     });

     // delete old collection
@@ -73,7 +92,9 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
       fileIds: collection.fileId ? [collection.fileId] : []
     });

-    jsonRes(res);
+    jsonRes(res, {
+      data: DatasetCollectionSyncResultEnum.success
+    });
   } catch (err) {
     jsonRes(res, {
       code: 500,
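
getCollectionAndRawText is not shown in this diff, but the hashRawText field written at collection-creation time elsewhere in this commit suggests how isSameRawText is decided: re-fetch the link, hash the raw text, and compare against the stored hash. A sketch under that assumption (hashStr's import path is a guess):

import { hashStr } from '@fastgpt/global/common/string/tools'; // assumed path

// Sketch: the sync can short-circuit when the freshly fetched raw text
// hashes to the same value stored when the collection was created.
function isSameRawTextSketch(storedHash: string | undefined, freshRawText: string): boolean {
  return !!storedHash && hashStr(freshRawText) === storedHash;
}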

View File

@@ -1,13 +1,15 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 import { jsonRes, responseWriteController } from '@fastgpt/service/common/response';
 import { connectToDatabase } from '@/service/mongo';
-import { MongoUser } from '@fastgpt/service/support/user/schema';
 import { addLog } from '@fastgpt/service/common/system/log';
 import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
 import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
 import { findDatasetIdTreeByTopDatasetId } from '@fastgpt/service/core/dataset/controller';
-import { limitCheck } from './checkExportLimit';
 import { withNextCors } from '@fastgpt/service/common/middle/cors';
+import {
+  checkExportDatasetLimit,
+  updateExportDatasetLimit
+} from '@fastgpt/service/support/user/utils';

 export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   try {
@@ -21,11 +23,11 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
     }

     // credential check
-    const { userId } = await authDataset({ req, authToken: true, datasetId, per: 'w' });
+    const { teamId } = await authDataset({ req, authToken: true, datasetId, per: 'w' });

-    await limitCheck({
-      userId,
-      datasetId
+    await checkExportDatasetLimit({
+      teamId,
+      limitMinutes: global.feConfigs?.limit?.exportDatasetLimitMinutes
     });

     const exportIds = await findDatasetIdTreeByTopDatasetId(datasetId);
@@ -43,7 +45,9 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
       datasetId: { $in: exportIds }
     },
     'q a'
-  ).cursor();
+  )
+    .limit(50000)
+    .cursor();

   const write = responseWriteController({
     res,
@@ -59,12 +63,10 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
     write(`\n"${q}","${a}"`);
   });

-  cursor.on('end', async () => {
+  cursor.on('end', () => {
     cursor.close();
     res.end();
-    await MongoUser.findByIdAndUpdate(userId, {
-      'limit.exportKbTime': new Date()
-    });
+    updateExportDatasetLimit(teamId);
   });

   cursor.on('error', (err) => {

View File

@@ -0,0 +1,34 @@
+import type { NextApiRequest, NextApiResponse } from 'next';
+import { jsonRes } from '@fastgpt/service/common/response';
+import { connectToDatabase } from '@/service/mongo';
+import { authDataset } from '@fastgpt/service/support/permission/auth/dataset';
+import { checkExportDatasetLimit } from '@fastgpt/service/support/user/utils';
+
+export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
+  try {
+    await connectToDatabase();
+    const { datasetId } = req.query as {
+      datasetId: string;
+    };
+
+    if (!datasetId) {
+      throw new Error('datasetId is required');
+    }
+
+    // credential check
+    const { teamId } = await authDataset({ req, authToken: true, datasetId, per: 'w' });
+
+    await checkExportDatasetLimit({
+      teamId,
+      limitMinutes: global.feConfigs?.limit?.exportDatasetLimitMinutes
+    });
+
+    jsonRes(res);
+  } catch (err) {
+    res.status(500);
+    jsonRes(res, {
+      code: 500,
+      error: err
+    });
+  }
+}

View File

@@ -0,0 +1,27 @@
+import type { NextApiRequest, NextApiResponse } from 'next';
+import { jsonRes } from '@fastgpt/service/common/response';
+import { connectToDatabase } from '@/service/mongo';
+import { checkWebSyncLimit } from '@fastgpt/service/support/user/utils';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
+
+export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
+  try {
+    await connectToDatabase();
+
+    // credential check
+    const { teamId } = await authCert({ req, authToken: true });
+
+    await checkWebSyncLimit({
+      teamId,
+      limitMinutes: global.feConfigs?.limit?.websiteSyncLimitMinuted
+    });
+
+    jsonRes(res);
+  } catch (err) {
+    res.status(500);
+    jsonRes(res, {
+      code: 500,
+      error: err
+    });
+  }
+}

View File

@@ -46,7 +46,8 @@ import {
   DatasetCollectionTrainingModeEnum,
   DatasetTypeEnum,
   DatasetTypeMap,
-  DatasetStatusEnum
+  DatasetStatusEnum,
+  DatasetCollectionSyncResultMap
 } from '@fastgpt/global/core/dataset/constant';
 import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
 import EditFolderModal, { useEditFolder } from '../../component/EditFolderModal';
@@ -61,6 +62,7 @@ import { useUserStore } from '@/web/support/user/useUserStore';
 import { TeamMemberRoleEnum } from '@fastgpt/global/support/user/team/constant';
 import { useDatasetStore } from '@/web/core/dataset/store/dataset';
 import { DatasetSchemaType } from '@fastgpt/global/core/dataset/type';
+import { DatasetCollectionSyncResultEnum } from '../../../../../../../packages/global/core/dataset/constant';

 const FileImportModal = dynamic(() => import('./Import/ImportModal'), {});
 const WebSiteConfigModal = dynamic(() => import('./Import/WebsiteConfig'), {});
@@ -246,8 +248,12 @@ const CollectionCard = () => {
     mutationFn: (collectionId: string) => {
       return postLinkCollectionSync(collectionId);
     },
-    onSuccess() {
+    onSuccess(res: DatasetCollectionSyncResultEnum) {
       getData(pageNum);
+      toast({
+        status: 'success',
+        title: t(DatasetCollectionSyncResultMap[res]?.label)
+      });
     },
     errorToast: t('core.dataset.error.Start Sync Failed')
   });
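
DatasetCollectionSyncResultEnum and DatasetCollectionSyncResultMap are referenced but not shown in this diff. A sketch consistent with the two values this commit uses (the i18n label keys are assumptions):

export enum DatasetCollectionSyncResultEnum {
  sameRaw = 'sameRaw',
  success = 'success'
}

export const DatasetCollectionSyncResultMap: Record<
  DatasetCollectionSyncResultEnum,
  { label: string }
> = {
  [DatasetCollectionSyncResultEnum.sameRaw]: { label: 'core.dataset.collection.sync.result.sameRaw' }, // assumed key
  [DatasetCollectionSyncResultEnum.success]: { label: 'core.dataset.collection.sync.result.success' } // assumed key
};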

View File

@@ -121,46 +121,55 @@ const DataCard = () => {
     [collection?.canWrite, userInfo?.team?.role]
   );

-  const metadataList = useMemo(
-    () =>
-      collection
-        ? [
-            {
-              label: t('core.dataset.collection.metadata.source'),
-              value: t(DatasetCollectionTypeMap[collection.type]?.name)
-            },
-            {
-              label: t('core.dataset.collection.metadata.source name'),
-              value: collection.file?.filename || collection?.rawLink || collection?.name
-            },
-            {
-              label: t('core.dataset.collection.metadata.source size'),
-              value: collection.file ? formatFileSize(collection.file.length) : '-'
-            },
-            {
-              label: t('core.dataset.collection.metadata.Createtime'),
-              value: formatTime2YMDHM(collection.createTime)
-            },
-            {
-              label: t('core.dataset.collection.metadata.Updatetime'),
-              value: formatTime2YMDHM(collection.updateTime)
-            },
-            {
-              label: t('core.dataset.collection.metadata.Raw text length'),
-              value: collection.rawTextLength ?? '-'
-            },
-            {
-              label: t('core.dataset.collection.metadata.Training Type'),
-              value: t(DatasetCollectionTrainingTypeMap[collection.trainingType]?.label)
-            },
-            {
-              label: t('core.dataset.collection.metadata.Chunk Size'),
-              value: collection.chunkSize || '-'
-            }
-          ]
-        : [],
-    [collection, t]
-  );
+  const metadataList = useMemo(() => {
+    if (!collection) return [];
+
+    const webSelector =
+      collection?.datasetId?.websiteConfig?.selector || collection?.metadata?.webPageSelector;
+
+    return [
+      {
+        label: t('core.dataset.collection.metadata.source'),
+        value: t(DatasetCollectionTypeMap[collection.type]?.name)
+      },
+      {
+        label: t('core.dataset.collection.metadata.source name'),
+        value: collection.file?.filename || collection?.rawLink || collection?.name
+      },
+      {
+        label: t('core.dataset.collection.metadata.source size'),
+        value: collection.file ? formatFileSize(collection.file.length) : '-'
+      },
+      {
+        label: t('core.dataset.collection.metadata.Createtime'),
+        value: formatTime2YMDHM(collection.createTime)
+      },
+      {
+        label: t('core.dataset.collection.metadata.Updatetime'),
+        value: formatTime2YMDHM(collection.updateTime)
+      },
+      {
+        label: t('core.dataset.collection.metadata.Raw text length'),
+        value: collection.rawTextLength ?? '-'
+      },
+      {
+        label: t('core.dataset.collection.metadata.Training Type'),
+        value: t(DatasetCollectionTrainingTypeMap[collection.trainingType]?.label)
+      },
+      {
+        label: t('core.dataset.collection.metadata.Chunk Size'),
+        value: collection.chunkSize || '-'
+      },
+      ...(webSelector
+        ? [
+            {
+              label: t('core.dataset.collection.metadata.Web page selector'),
+              value: webSelector
+            }
+          ]
+        : [])
+    ];
+  }, [collection, t]);

   return (
     <Box ref={BoxRef} position={'relative'} px={5} py={[1, 5]} h={'100%'} overflow={'overlay'}>
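
The optional selector row relies on the conditional-spread idiom: spread a one-element array when the value exists, an empty array otherwise, so the base list stays untouched. In isolation (with a made-up value):

const webSelector: string | undefined = '.article-content'; // example value
const rows = [
  { label: 'source', value: 'link' },
  // appended only when webSelector is truthy
  ...(webSelector ? [{ label: 'Web page selector', value: webSelector }] : [])
];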

View File

@@ -41,6 +41,7 @@ export type FileItemType = {
   type: DatasetCollectionTypeEnum.file | DatasetCollectionTypeEnum.link;
   fileId?: string;
   rawLink?: string;
+  metadata?: Record<string, any>;
 };

 export interface Props extends BoxProps {
@@ -232,7 +233,7 @@ const FileSelect = ({
   // link fetch
   const onUrlFetch = useCallback(
     (e: UrlFetchResponse) => {
-      const result: FileItemType[] = e.map<FileItemType>(({ url, content }) => {
+      const result: FileItemType[] = e.map<FileItemType>(({ url, content, selector }) => {
         const { chunks, tokens } = splitText2Chunks({
           text: content,
           chunkLen,
@@ -250,7 +251,10 @@ const FileSelect = ({
         chunks: chunks.map((chunk) => ({
           q: chunk,
           a: ''
-        }))
+        })),
+        metadata: {
+          webPageSelector: selector
+        }
       };
     });

     onPushFiles(result);
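
The destructuring above implies UrlFetchResponse now carries the selector alongside each fetched page. The real type is defined elsewhere; a sketch of the minimal shape this code depends on:

type UrlFetchItemSketch = {
  url: string;
  content: string; // raw text scraped from the page
  selector?: string; // CSS selector that scoped the scrape, stored as webPageSelector
};
type UrlFetchResponseSketch = UrlFetchItemSketch[];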

View File

@@ -156,19 +156,24 @@ const Provider = ({
     return formatModelPrice2Read(totalTokens * inputPrice);
   }, [inputPrice, mode, outputPrice, totalTokens]);

-  /* start upload data */
+  /*
+    start upload data
+    1. create training bill
+    2. create collection
+    3. upload chunks
+  */
   const { mutate: onclickUpload, isLoading: uploading } = useRequest({
     mutationFn: async (props?: { prompt?: string }) => {
       const { prompt } = props || {};
       let totalInsertion = 0;
       for await (const file of files) {
-        const chunks = file.chunks;
         // create training bill
         const billId = await postCreateTrainingBill({
           name: t('dataset.collections.Create Training Data', { filename: file.filename }),
           vectorModel,
           agentModel
         });
         // create a file collection and training bill
         const collectionId = await postDatasetCollection({
           datasetId,
@@ -181,10 +186,12 @@ const Provider = ({
           trainingType: collectionTrainingType,
           qaPrompt: mode === TrainingModeEnum.qa ? prompt : '',
           rawTextLength: file.rawText.length,
-          hashRawText: hashStr(file.rawText)
+          hashRawText: hashStr(file.rawText),
+          metadata: file.metadata
         });

-        // upload data
+        // upload chunks
+        const chunks = file.chunks;
         const { insertLen } = await chunksUpload({
           collectionId,
           billId,

View File

@@ -60,6 +60,7 @@ const Test = ({ datasetId }: { datasetId: string }) => {
   const [inputType, setInputType] = useState<'text' | 'file'>('text');
   const [datasetTestItem, setDatasetTestItem] = useState<SearchTestStoreItemType>();
   const [refresh, setRefresh] = useState(false);
+  const [isFocus, setIsFocus] = useState(false);
   const { File, onOpen } = useSelectFile({
     fileType: '.csv',
     multiple: false
@@ -169,7 +170,20 @@ const Test = ({ datasetId }: { datasetId: string }) => {
         py={4}
         borderRight={['none', theme.borders.base]}
       >
-        <Box border={'2px solid'} borderColor={'primary.500'} p={3} mx={4} borderRadius={'md'}>
+        <Box
+          border={'2px solid'}
+          p={3}
+          mx={4}
+          borderRadius={'md'}
+          {...(isFocus
+            ? {
+                borderColor: 'primary.500',
+                boxShadow: '0px 0px 0px 2.4px rgba(51, 112, 255, 0.15)'
+              }
+            : {
+                borderColor: 'primary.300'
+              })}
+        >
           {/* header */}
           <Flex alignItems={'center'} justifyContent={'space-between'}>
             <MySelect
@@ -221,8 +235,12 @@ const Test = ({ datasetId }: { datasetId: string }) => {
               variant={'unstyled'}
               maxLength={datasetDetail.vectorModel.maxToken}
               placeholder={t('core.dataset.test.Test Text Placeholder')}
+              onFocus={() => setIsFocus(true)}
               {...register('inputText', {
-                required: true
+                required: true,
+                onBlur: () => {
+                  setIsFocus(false);
+                }
               })}
             />
           )}
@@ -340,25 +358,26 @@ const TestHistories = React.memo(function TestHistories({
   );

   return (
     <>
-      <Flex alignItems={'center'} color={'myGray.600'}>
-        <MyIcon mr={2} name={'history'} w={'16px'} h={'16px'} />
-        <Box fontSize={'2xl'}>{t('core.dataset.test.test history')}</Box>
+      <Flex alignItems={'center'} color={'myGray.900'}>
+        <MyIcon mr={2} name={'history'} w={'18px'} h={'18px'} color={'myGray.900'} />
+        <Box fontSize={'xl'}>{t('core.dataset.test.test history')}</Box>
       </Flex>
       <Box mt={2}>
         <Flex py={2} fontWeight={'bold'} borderBottom={theme.borders.sm}>
           <Box flex={'0 0 80px'}>{t('core.dataset.search.search mode')}</Box>
           <Box flex={1}>{t('core.dataset.test.Test Text')}</Box>
           <Box flex={'0 0 70px'}>{t('common.Time')}</Box>
           <Box w={'14px'}></Box>
         </Flex>
         {testHistories.map((item) => (
           <Flex
             key={item.id}
-            p={1}
+            py={2}
+            px={3}
             alignItems={'center'}
-            borderBottom={theme.borders.base}
+            borderColor={'borderColor.low'}
+            borderWidth={'1px'}
+            borderRadius={'md'}
+            _notLast={{
+              mb: 2
+            }}
             _hover={{
-              bg: '#f4f4f4',
+              borderColor: 'primary.300',
+              boxShadow: '1',
               '& .delete': {
                 display: 'block'
               }
@@ -369,7 +388,7 @@ const TestHistories = React.memo(function TestHistories({
           >
             <Box flex={'0 0 80px'}>
               {DatasetSearchModeMap[item.searchMode] ? (
-                <Flex alignItems={'center'}>
+                <Flex alignItems={'center'} fontWeight={'500'} color={'myGray.500'}>
                   <MyIcon
                     name={DatasetSearchModeMap[item.searchMode].icon as any}
                     w={'12px'}
@@ -381,7 +400,7 @@ const TestHistories = React.memo(function TestHistories({
                 '-'
               )}
             </Box>
-            <Box flex={1} mr={2} wordBreak={'break-all'}>
+            <Box flex={1} mr={2} wordBreak={'break-all'} fontWeight={'400'}>
               {item.text}
             </Box>
             <Box flex={'0 0 70px'}>{formatTimeToChatTime(item.time)}</Box>
@@ -433,13 +452,20 @@ const TestResults = React.memo(function TestResults({
         </Flex>
       ) : (
         <>
-          <Box fontSize={'xl'} color={'myGray.600'}>
+          <Flex fontSize={'xl'} color={'myGray.900'} alignItems={'center'}>
+            <MyIcon name={'common/paramsLight'} w={'18px'} mr={2} />
             {t('core.dataset.test.Test params')}
-          </Box>
-          <TableContainer mb={3} bg={'myGray.150'} borderRadius={'md'}>
+          </Flex>
+          <TableContainer
+            mt={3}
+            bg={'primary.50'}
+            borderRadius={'lg'}
+            borderWidth={'1px'}
+            borderColor={'primary.1'}
+          >
             <Table>
               <Thead>
-                <Tr>
+                <Tr color={'myGray.600'}>
                   <Th>{t('core.dataset.search.search mode')}</Th>
                   <Th>{t('core.dataset.search.ReRank')}</Th>
                   <Th>{t('core.dataset.search.Max Tokens')}</Th>
@@ -447,8 +473,8 @@ const TestResults = React.memo(function TestResults({
                 </Tr>
               </Thead>
               <Tbody>
-                <Tr>
-                  <Td>
+                <Tr color={'myGray.800'}>
+                  <Td pt={0}>
                     <Flex alignItems={'center'}>
                       <MyIcon
                         name={DatasetSearchModeMap[datasetTestItem.searchMode]?.icon as any}
@@ -458,45 +484,31 @@ const TestResults = React.memo(function TestResults({
                       {t(DatasetSearchModeMap[datasetTestItem.searchMode]?.title)}
                     </Flex>
                   </Td>
-                  <Td>{datasetTestItem.usingReRank ? '✅' : '❌'}</Td>
-                  <Td>{datasetTestItem.limit}</Td>
-                  <Td>{datasetTestItem.similarity}</Td>
+                  <Td pt={0}>{datasetTestItem.usingReRank ? '✅' : '❌'}</Td>
+                  <Td pt={0}>{datasetTestItem.limit}</Td>
+                  <Td pt={0}>{datasetTestItem.similarity}</Td>
                 </Tr>
               </Tbody>
             </Table>
           </TableContainer>

-          <Flex alignItems={'center'}>
-            <Box fontSize={'xl'} color={'myGray.600'}>
+          <Flex mt={5} mb={3} alignItems={'center'}>
+            <Flex fontSize={'xl'} color={'myGray.900'} alignItems={'center'}>
+              <MyIcon name={'common/resultLight'} w={'18px'} mr={2} />
               {t('core.dataset.test.Test Result')}
-            </Box>
+            </Flex>
             <MyTooltip label={t('core.dataset.test.test result tip')} forceShow>
               <QuestionOutlineIcon mx={2} color={'myGray.600'} cursor={'pointer'} fontSize={'lg'} />
             </MyTooltip>
             <Box>({datasetTestItem.duration})</Box>
           </Flex>

-          <Grid
-            mt={1}
-            gridTemplateColumns={[
-              'repeat(1,minmax(0, 1fr))',
-              'repeat(1,minmax(0, 1fr))',
-              'repeat(1,minmax(0, 1fr))',
-              'repeat(1,minmax(0, 1fr))',
-              'repeat(2,minmax(0, 1fr))'
-            ]}
-            gridGap={4}
-          >
+          <Box mt={1} gap={4}>
             {datasetTestItem?.results.map((item, index) => (
-              <Box
-                key={item.id}
-                p={2}
-                borderRadius={'sm'}
-                border={theme.borders.base}
-                _notLast={{ mb: 2 }}
-              >
+              <Box key={item.id} p={3} borderRadius={'lg'} bg={'myGray.100'} _notLast={{ mb: 2 }}>
                 <QuoteItem quoteItem={item} canViewSource />
               </Box>
             ))}
-          </Grid>
+          </Box>
         </>
       )}
     </>

View File

@@ -20,9 +20,9 @@ import {
   delDatasetById,
   getDatasetPaths,
   putDatasetById,
-  postCreateDataset,
-  getCheckExportLimit
+  postCreateDataset
 } from '@/web/core/dataset/api';
+import { checkTeamExportDatasetLimit } from '@/web/support/user/api';
 import { useTranslation } from 'next-i18next';
 import Avatar from '@/components/Avatar';
 import MyIcon from '@fastgpt/web/components/common/Icon';
@@ -99,7 +99,7 @@ const Kb = () => {
   const { mutate: exportDataset } = useRequest({
     mutationFn: async (dataset: DatasetItemType) => {
       setLoading(true);
-      await getCheckExportLimit(dataset._id);
+      await checkTeamExportDatasetLimit(dataset._id);
       const a = document.createElement('a');
       a.href = `/api/core/dataset/exportAll?datasetId=${dataset._id}`;
       a.download = `${dataset.name}.csv`;
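
checkTeamExportDatasetLimit's body is not in this diff, but given the new checkExportLimit API route added above it is plausibly a thin wrapper like the following (the route path and GET helper are assumptions):

import { GET } from '@/web/common/api/request'; // assumed request helper

export const checkTeamExportDatasetLimit = (datasetId: string) =>
  GET('/core/dataset/checkExportLimit', { datasetId });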