import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { parseCsvTable2Chunks } from './training/utils';
import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { APIFileServer, FeishuServer, YuqueServer } from '@fastgpt/global/core/dataset/apiDataset';
import { useApiDatasetRequest } from './apiDataset/api';
import { POST } from '../../common/api/plusRequest';

// Download a file from a URL and extract its raw text content.
export const readFileRawTextByUrl = async ({
  teamId,
  url,
  relatedId
}: {
  teamId: string;
  url: string;
  relatedId: string; // externalFileId / apiFileId
}) => {
  // Fetch the file as a binary buffer
  const response = await axios({
    method: 'get',
    url,
    responseType: 'arraybuffer'
  });
  const extension = parseFileExtensionFromUrl(url);

  const buffer = Buffer.from(response.data, 'binary');

  // Parse the buffer according to the file extension
  const { rawText } = await readRawContentByFileBuffer({
    extension,
    teamId,
    buffer,
    encoding: 'utf-8',
    metadata: {
      relatedId
    }
  });

  return rawText;
};
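
/*
  Usage sketch (illustrative values, not from this repo):

  const rawText = await readFileRawTextByUrl({
    teamId: 'team-id',
    url: 'https://example.com/file.pdf',
    relatedId: 'external-file-id'
  });
*/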

/*
  Read the raw text of a dataset source:
  - fileId: local file, read from MongoDB (GridFS)
  - link: fetched over HTTP
  - externalFile / apiFile: read via request
*/
export const readDatasetSourceRawText = async ({
  teamId,
  type,
  sourceId,
  isQAImport,
  selector,
  externalFileId,
  apiServer,
  feishuServer,
  yuqueServer
}: {
  teamId: string;
  type: DatasetSourceReadTypeEnum;
  sourceId: string;

  isQAImport?: boolean; // csv data
  selector?: string; // link selector
  externalFileId?: string; // external file dataset
  apiServer?: APIFileServer; // api dataset
  feishuServer?: FeishuServer; // feishu dataset
  yuqueServer?: YuqueServer; // yuque dataset
}): Promise<string> => {
  if (type === DatasetSourceReadTypeEnum.fileLocal) {
    // Local file: read from GridFS by fileId
    const { rawText } = await readFileContentFromMongo({
      teamId,
      bucketName: BucketNameEnum.dataset,
      fileId: sourceId,
      isQAImport
    });
    return rawText;
  } else if (type === DatasetSourceReadTypeEnum.link) {
    // Web link: crawl the page, optionally narrowed by a CSS selector
    const result = await urlsFetch({
      urlList: [sourceId],
      selector
    });

    return result[0]?.content || '';
  } else if (type === DatasetSourceReadTypeEnum.externalFile) {
    // External file: sourceId is a download URL
    if (!externalFileId) return Promise.reject('FileId not found');
    const rawText = await readFileRawTextByUrl({
      teamId,
      url: sourceId,
      relatedId: externalFileId
    });
    return rawText;
  } else if (type === DatasetSourceReadTypeEnum.apiFile) {
    // API file: read through the configured api/feishu/yuque server
    const rawText = await readApiServerFileContent({
      apiServer,
      feishuServer,
      yuqueServer,
      apiFileId: sourceId,
      teamId
    });
    return rawText;
  }
  return '';
};
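
/*
  Usage sketch (hypothetical values) - reading a web link source:

  const rawText = await readDatasetSourceRawText({
    teamId: 'team-id',
    type: DatasetSourceReadTypeEnum.link,
    sourceId: 'https://example.com/docs',
    selector: 'article'
  });
*/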

// Read a file's content from an API dataset source. Exactly one of
// apiServer / feishuServer / yuqueServer is expected to be provided.
export const readApiServerFileContent = async ({
  apiServer,
  feishuServer,
  yuqueServer,
  apiFileId,
  teamId
}: {
  apiServer?: APIFileServer;
  feishuServer?: FeishuServer;
  yuqueServer?: YuqueServer;
  apiFileId: string;
  teamId: string;
}) => {
  if (apiServer) {
    // Generic API dataset: call the user-configured file server directly
    return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, apiFileId });
  }

  if (feishuServer || yuqueServer) {
    // Feishu / Yuque datasets are proxied through the plus service endpoint
    return POST<string>(`/core/dataset/systemApiDataset`, {
      type: 'content',
      feishuServer,
      yuqueServer,
      apiFileId
    });
  }

  return Promise.reject('No apiServer or feishuServer or yuqueServer');
};
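
/*
  Usage sketch (assumes a FeishuServer config object obtained elsewhere):

  const content = await readApiServerFileContent({
    feishuServer,
    apiFileId: 'file-id',
    teamId: 'team-id'
  });
*/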

// Split raw text into training chunks. QA imports are parsed as a CSV
// table of q/a pairs; everything else goes through the generic splitter.
export const rawText2Chunks = ({
  rawText,
  isQAImport,
  chunkLen = 512,
  ...splitProps
}: {
  rawText: string;
  isQAImport?: boolean;
} & TextSplitProps) => {
  if (isQAImport) {
    const { chunks } = parseCsvTable2Chunks(rawText);
    return chunks;
  }

  const { chunks } = splitText2Chunks({
    text: rawText,
    chunkLen,
    ...splitProps
  });

  return chunks.map((item) => ({
    q: item,
    a: ''
  }));
};
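
/*
  Usage sketch (hypothetical text; chunkLen defaults to 512):

  const chunks = rawText2Chunks({ rawText: 'Long document text...' });
  // => [{ q: '<chunk 1>', a: '' }, { q: '<chunk 2>', a: '' }, ...]
*/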