Add externalfile api params (#2745)

* feat: external dataset api

* perf: doc
This commit is contained in:
Archer
2024-09-19 13:28:55 +08:00
committed by GitHub
parent 258de4471e
commit 265434799f
12 changed files with 207 additions and 55 deletions

View File

@@ -312,6 +312,8 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?
| chunkSize | 预估块大小 | |
| chunkSplitter | 自定义最高优先分割符号 | |
| qaPrompt | qa拆分提示词 | |
| tags | 集合标签(字符串数组) | |
| createTime | 文件创建时间(Date / String) | |
**出参**
@@ -604,9 +606,11 @@ curl --location --request POST 'http://localhost:3000/api/proApi/core/dataset/co
--data-raw '{
"externalFileUrl":"https://image.xxxxx.com/fastgpt-dev/%E6%91%82.pdf",
"externalFileId":"1111",
"filename":"自定义文件名",
"createTime": "2024-05-01T00:00:00.000Z",
"filename":"自定义文件名.pdf",
"datasetId":"6642d105a5e9d2b00255b27b",
"parentId": null,
"tags": ["tag1","tag2"],
"trainingType": "chunk",
"chunkSize":512,
@@ -625,7 +629,8 @@ curl --location --request POST 'http://localhost:3000/api/proApi/core/dataset/co
| --- | --- | --- |
| externalFileUrl | 文件访问链接(可以是临时链接) | ✅ |
| externalFileId | 外部文件ID | |
| filename | 自定义文件名 | |
| filename | 自定义文件名,需要带后缀 | |
| createTime | 文件创建时间(Date 或 ISO 时间字符串均可) | |
{{< /markdownify >}}
@@ -710,7 +715,21 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
"updateTime": "2099-01-01T00:00:00.000Z",
"dataAmount": 3,
"trainingAmount": 0,
"canWrite": true
"externalFileId": "1111",
"tags": [
"11",
"测试的"
],
"forbid": false,
"trainingType": "chunk",
"permission": {
"value": 4294967295,
"isOwner": true,
"hasManagePer": true,
"hasWritePer": true,
"hasReadPer": true
}
},
{
"_id": "65abd0ad9d1448617cba6031",
@@ -722,7 +741,19 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
"updateTime": "2024-01-20T13:54:53.031Z",
"dataAmount": 3,
"trainingAmount": 0,
"canWrite": true
"externalFileId": "222",
"tags": [
"测试的"
],
"forbid": false,
"trainingType": "chunk",
"permission": {
"value": 4294967295,
"isOwner": true,
"hasManagePer": true,
"hasWritePer": true,
"hasReadPer": true
}
}
],
"total": 93
@@ -813,14 +844,36 @@ curl --location --request GET 'http://localhost:3000/api/core/dataset/collection
{{< tab tabName="请求示例" >}}
{{< markdownify >}}
**通过集合 ID 修改集合信息**
```bash
curl --location --request PUT 'http://localhost:3000/api/core/dataset/collection/update' \
--header 'Authorization: Bearer {{authorization}}' \
--header 'Content-Type: application/json' \
--data-raw '{
"id":"65abcfab9d1448617cba5f0d",
"parentId":null,
"name":"测2222试"
"parentId": null,
"name": "测2222试",
"tags": ["tag1", "tag2"],
"forbid": false,
"createTime": "2024-01-01T00:00:00.000Z"
}'
```
**通过外部文件 ID 修改集合信息** 只需要把 id 换成 datasetId 和 externalFileId。
```bash
curl --location --request PUT 'http://localhost:3000/api/core/dataset/collection/update' \
--header 'Authorization: Bearer {{authorization}}' \
--header 'Content-Type: application/json' \
--data-raw '{
"datasetId":"6593e137231a2be9c5603ba7",
"externalFileId":"1111",
"parentId": null,
"name": "测2222试",
"tags": ["tag1", "tag2"],
"forbid": false,
"createTime": "2024-01-01T00:00:00.000Z"
}'
```
@@ -834,6 +887,9 @@ curl --location --request PUT 'http://localhost:3000/api/core/dataset/collection
- id: 集合的ID
- parentId: 修改父级ID可选
- name: 修改集合名称(可选)
- tags: 修改集合标签(可选)
- forbid: 修改集合禁用状态(可选)
- createTime: 修改集合创建时间(可选)
{{% /alert %}}
{{< /markdownify >}}

View File

@@ -95,6 +95,7 @@ weight: 813
9. 优化 - 工作流 handler 性能优化。
10. 优化 - 工作流快捷键,避免调试测试时也会触发。
11. 优化 - 流输出,切换 tab 时仍可以继续输出。
12. 修复 - 知识库选择权限问题。
13. 修复 - 空 chatId 发起对话,首轮携带用户选择时会异常
14. 修复 - createDataset 接口,intro 未赋值
12. 优化 - 完善外部文件知识库相关 API
13. 修复 - 知识库选择权限问题
14. 修复 - 空 chatId 发起对话,首轮携带用户选择时会异常
15. 修复 - createDataset 接口,intro 未赋值。

View File

@@ -45,7 +45,10 @@ export async function createOneCollection({
[key: string]: any;
session?: ClientSession;
}) {
// Create collection tags
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
// Create collection
const [collection] = await MongoDatasetCollection.create(
[
{

View File

@@ -111,6 +111,17 @@ try {
DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, tags: 1 });
// create time filter
DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, createTime: 1 });
// Get collection by external file id
DatasetCollectionSchema.index(
{ datasetId: 1, externalFileId: 1 },
{
unique: true,
partialFilterExpression: {
externalFileId: { $exists: true }
}
}
);
} catch (error) {
console.log(error);
}

View File

@@ -1,6 +1,5 @@
import type { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type.d';
import { MongoDatasetCollection } from './schema';
import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type.d';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { MongoDatasetTraining } from '../training/schema';
import { urlsFetch } from '../../../common/string/cheerio';
@@ -12,6 +11,7 @@ import { hashStr } from '@fastgpt/global/common/string/tools';
import { ClientSession } from '../../../common/mongo';
import { PushDatasetDataResponse } from '@fastgpt/global/core/dataset/api';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { readFromSecondary } from '../../../common/mongo/utils';
/**
* get all collection by top collectionId
@@ -160,7 +160,7 @@ export const reloadCollectionChunks = async ({
const { chunks } = splitText2Chunks({
text: newRawText,
chunkLen: col.chunkSize || 512,
customReg: col.chunkSplitter ? [col.chunkSplitter] : [],
customReg: col.chunkSplitter ? [col.chunkSplitter] : []
});
// insert to training queue
@@ -204,7 +204,7 @@ export const reloadCollectionChunks = async ({
};
export const createOrGetCollectionTags = async ({
tags = [],
tags,
datasetId,
teamId,
session
@@ -213,13 +213,20 @@ export const createOrGetCollectionTags = async ({
datasetId: string;
teamId: string;
session?: ClientSession;
}): Promise<string[]> => {
if (!tags.length) return [];
const existingTags = await MongoDatasetCollectionTags.find({
teamId,
datasetId,
$expr: { $in: ['$tag', tags] }
});
}) => {
if (!tags) return undefined;
if (tags.length === 0) return [];
const existingTags = await MongoDatasetCollectionTags.find(
{
teamId,
datasetId,
tag: { $in: tags }
},
undefined,
{ session }
).lean();
const existingTagContents = existingTags.map((tag) => tag.tag);
const newTagContents = tags.filter((tag) => !existingTagContents.includes(tag));
@@ -235,3 +242,29 @@ export const createOrGetCollectionTags = async ({
return [...existingTags.map((tag) => tag._id), ...newTags.map((tag) => tag._id)];
};
/**
 * Resolve collection tag ids to their human-readable labels.
 *
 * @param datasetId - dataset the tags belong to; used to scope the tag lookup.
 * @param tags - tag ids stored on the collection (not labels).
 * @returns labels in the same order as the input ids, with unknown/empty ids
 *   dropped; `undefined` when `tags` is missing or empty (nothing to label).
 */
export const collectionTagsToTagLabel = async ({
  datasetId,
  tags
}: {
  datasetId: string;
  tags?: string[];
}) => {
  // Undefined passes through untouched; an empty list also yields undefined.
  if (!tags) return undefined;
  if (tags.length === 0) return;

  // Fetch every tag document for this dataset; read from a secondary replica
  // since this is a read-only lookup.
  const allTags = await MongoDatasetCollectionTags.find({ datasetId }, undefined, {
    ...readFromSecondary
  }).lean();

  // Build an id -> label lookup table.
  const labelById = new Map<string, string>();
  for (const doc of allTags) {
    labelById.set(String(doc._id), doc.tag);
  }

  // Map ids to labels, preserving input order and skipping unresolved ids.
  const labels: string[] = [];
  for (const id of tags) {
    const label = labelById.get(id);
    if (label) labels.push(label);
  }
  return labels;
};

View File

@@ -21,6 +21,8 @@ export type DatasetCollectionsListItemType = {
trainingType?: DatasetCollectionSchemaType['trainingType'];
tags?: string[];
externalFileId?: string;
fileId?: string;
rawLink?: string;
permission: DatasetPermission;

View File

@@ -10,6 +10,7 @@ import { NextAPI } from '@/service/middleware/entry';
import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
import { DatasetCollectionItemType } from '@fastgpt/global/core/dataset/type';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { collectionTagsToTagLabel } from '@fastgpt/service/core/dataset/collection/utils';
async function handler(req: NextApiRequest): Promise<DatasetCollectionItemType> {
const { id } = req.query as { id: string };
@@ -35,6 +36,10 @@ async function handler(req: NextApiRequest): Promise<DatasetCollectionItemType>
return {
...collection,
...getCollectionSourceData(collection),
tags: await collectionTagsToTagLabel({
datasetId: collection.datasetId._id,
tags: collection.tags
}),
permission,
file
};

View File

@@ -11,6 +11,8 @@ import { startTrainingQueue } from '@/service/core/dataset/training/utils';
import { NextAPI } from '@/service/middleware/entry';
import { ReadPermissionVal } from '@fastgpt/global/support/permission/constant';
import { PagingData } from '@/types';
import { readFromSecondary } from '@fastgpt/service/common/mongo/utils';
import { collectionTagsToTagLabel } from '@fastgpt/service/core/dataset/collection/utils';
async function handler(req: NextApiRequest): Promise<PagingData<DatasetCollectionsListItemType>> {
let {
@@ -60,12 +62,15 @@ async function handler(req: NextApiRequest): Promise<PagingData<DatasetCollectio
trainingType: 1,
fileId: 1,
rawLink: 1,
tags: 1
tags: 1,
externalFileId: 1
};
// not count data amount
if (simple) {
const collections = await MongoDatasetCollection.find(match)
const collections = await MongoDatasetCollection.find(match, undefined, {
...readFromSecondary
})
.select(selectField)
.sort({
updateTime: -1
@@ -78,6 +83,10 @@ async function handler(req: NextApiRequest): Promise<PagingData<DatasetCollectio
data: await Promise.all(
collections.map(async (item) => ({
...item,
tags: await collectionTagsToTagLabel({
datasetId,
tags: item.tags
}),
dataAmount: 0,
trainingAmount: 0,
permission
@@ -153,12 +162,18 @@ async function handler(req: NextApiRequest): Promise<PagingData<DatasetCollectio
}
}
]),
MongoDatasetCollection.countDocuments(match)
MongoDatasetCollection.countDocuments(match, {
...readFromSecondary
})
]);
const data = await Promise.all(
collections.map(async (item) => ({
...item,
tags: await collectionTagsToTagLabel({
datasetId,
tags: item.tags
}),
permission
}))
);

View File

@@ -1,5 +1,8 @@
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import { getCollectionUpdateTime } from '@fastgpt/service/core/dataset/collection/utils';
import {
createOrGetCollectionTags,
getCollectionUpdateTime
} from '@fastgpt/service/core/dataset/collection/utils';
import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
@@ -11,11 +14,16 @@ import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
export type UpdateDatasetCollectionParams = {
id: string;
id?: string;
parentId?: string;
name?: string;
tags?: string[];
tags?: string[]; // Not tag id, is tag label
forbid?: boolean;
createTime?: Date;
// External file id
datasetId?: string;
externalFileId?: string;
};
// Set folder collection children forbid status
@@ -65,14 +73,22 @@ const updateFolderChildrenForbid = async ({
};
async function handler(req: ApiRequestProps<UpdateDatasetCollectionParams>) {
const { id, parentId, name, tags, forbid } = req.body;
let { datasetId, externalFileId, id, parentId, name, tags, forbid, createTime } = req.body;
if (datasetId && externalFileId) {
const collection = await MongoDatasetCollection.findOne({ datasetId, externalFileId }, '_id');
if (!collection) {
return Promise.reject(CommonErrEnum.fileNotFound);
}
id = collection._id;
}
if (!id) {
return Promise.reject(CommonErrEnum.missingParams);
}
// 凭证校验
const { collection } = await authDatasetCollection({
const { collection, teamId } = await authDatasetCollection({
req,
authToken: true,
authApiKey: true,
@@ -81,6 +97,13 @@ async function handler(req: ApiRequestProps<UpdateDatasetCollectionParams>) {
});
await mongoSessionRun(async (session) => {
const collectionTags = await createOrGetCollectionTags({
tags,
teamId,
datasetId: collection.datasetId._id,
session
});
await MongoDatasetCollection.updateOne(
{
_id: id
@@ -89,8 +112,9 @@ async function handler(req: ApiRequestProps<UpdateDatasetCollectionParams>) {
$set: {
...(parentId !== undefined && { parentId: parentId || null }),
...(name && { name, updateTime: getCollectionUpdateTime({ name }) }),
...(tags && { tags }),
...(forbid !== undefined && { forbid })
...(collectionTags !== undefined && { tags: collectionTags }),
...(forbid !== undefined && { forbid }),
...(createTime !== undefined && { createTime })
}
},
{

View File

@@ -35,8 +35,8 @@ const TagsPopOver = ({
const tagList = useMemo(
() =>
(collectionTags
?.map((tagId) => {
const tagObject = allDatasetTags.find((tag) => tag._id === tagId);
?.map((item) => {
const tagObject = allDatasetTags.find((tag) => tag.tag === item);
return tagObject ? { _id: tagObject._id, tag: tagObject.tag } : null;
})
.filter((tag) => tag !== null) as {
@@ -153,9 +153,9 @@ const TagsPopOver = ({
setIsUpdateLoading(true);
await putDatasetCollectionById({
id: currentCollection._id,
tags: checkedTags.map((tag) => tag._id)
tags: checkedTags.map((tag) => tag.tag)
});
setCollectionTags(checkedTags.map((tag) => tag._id));
setCollectionTags(checkedTags.map((tag) => tag.tag));
setIsUpdateLoading(false);
}}
display={showTagManage || overflowTags.length > 0 ? 'block' : 'none'}

View File

@@ -1,10 +1,9 @@
import React, { useState, useRef, useMemo } from 'react';
import React, { useState, useMemo } from 'react';
import { Box, Card, IconButton, Flex, Button, useTheme } from '@chakra-ui/react';
import {
getDatasetDataList,
delOneDatasetDataById,
getDatasetCollectionById,
putDatasetDataById
getDatasetCollectionById
} from '@/web/core/dataset/api';
import { useQuery } from '@tanstack/react-query';
import { useToast } from '@fastgpt/web/hooks/useToast';
@@ -21,7 +20,6 @@ import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection
import EmptyTip from '@fastgpt/web/components/common/EmptyTip';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { useContextSelector } from 'use-context-selector';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import MyTag from '@fastgpt/web/components/common/Tag/index';
import MyBox from '@fastgpt/web/components/common/MyBox';
import { useSystem } from '@fastgpt/web/hooks/useSystem';
@@ -96,24 +94,27 @@ const DataCard = () => {
<Flex flexDirection={'column'} h={'100%'}>
{/* Header */}
<Flex alignItems={'center'} px={6}>
<Flex className="textEllipsis" flex={'1 0 0'} mr={[3, 5]} alignItems={'center'}>
<Box>
<Box alignItems={'center'} gap={2} display={isPc ? 'flex' : ''}>
{collection?._id && (
<RawSourceBox
collectionId={collection._id}
{...getCollectionSourceData(collection)}
fontSize={['sm', 'md']}
color={'black'}
textDecoration={'none'}
/>
)}
</Box>
{feConfigs?.isPlus && !!collection?.tags?.length && (
<TagsPopOver currentCollection={collection} />
<Box flex={'1 0 0'} mr={[3, 5]} alignItems={'center'}>
<Box
className="textEllipsis"
alignItems={'center'}
gap={2}
display={isPc ? 'flex' : ''}
>
{collection?._id && (
<RawSourceBox
collectionId={collection._id}
{...getCollectionSourceData(collection)}
fontSize={['sm', 'md']}
color={'black'}
textDecoration={'none'}
/>
)}
</Box>
</Flex>
{feConfigs?.isPlus && !!collection?.tags?.length && (
<TagsPopOver currentCollection={collection} />
)}
</Box>
{canWrite && (
<Box>
<Button

View File

@@ -243,8 +243,9 @@ const Info = ({ datasetId }: { datasetId: string }) => {
const vectorModel = vectorModelList.find((item) => item.model === e);
if (!vectorModel) return;
return onOpenConfirmRebuild(() => {
setValue('vectorModel', vectorModel);
return onRebuilding(vectorModel);
return onRebuilding(vectorModel).then(() => {
setValue('vectorModel', vectorModel);
});
})();
}}
/>