Files
FastGPT/packages/service/core/dataset/read.ts
Archer bd79e7701f V4.8.16 dev (#3431)
* feat: add feishu & yuque dataset (#3379)

* feat: add feishu & yuque dataset

* fix ts

* fix ts

* move type position

* fix

* fix: merge interface

* fix

* feat: dingtalk sso support (#3408)

* fix: optional sso state

* feat: dingtalk bot

* feat: dingtalk sso login

* chore: move i18n to user namespace

* feat: dingtalk bot integration (#3415)

* feat: dingtalk bot integration

* docs: config dingtalk bot

* feat:sear XNG服务 (#3413)

* feat:sear XNG服务

* 补充了courseUrl

* 添加了官方文档

* 错误时返回情况修正了一下

* Tracks (#3420)

* feat: node intro

* feat: add domain track

* dingding sso login

* perf: api dataset code and add doc

* feat: tracks

* feat: searXNG plugins

* fix: ts

* feat: delete node tracks (#3423)

* fix: dingtalk bot GET verification (#3424)

* 4.8.16 test: fix: plugin inputs render;fix: ui offset (#3426)

* fix: ui offset

* perf: dingding talk

* fix: plugin inputs render

* feat: menu all folder (#3429)

* fix: recall code

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: a.e. <49438478+I-Info@users.noreply.github.com>
Co-authored-by: Jiangween <145003935+Jiangween@users.noreply.github.com>
2024-12-18 19:30:19 +08:00

162 lines
4.0 KiB
TypeScript

import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { parseCsvTable2Chunks } from './training/utils';
import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { APIFileServer, FeishuServer, YuqueServer } from '@fastgpt/global/core/dataset/apiDataset';
import { useApiDatasetRequest } from './apiDataset/api';
import { POST } from '../../common/api/plusRequest';
/**
 * Download a remote file and extract its raw text.
 *
 * The file is requested with an HTTP GET (binary response), its extension is
 * parsed from the URL, and the downloaded bytes are handed to
 * `readRawContentByFileBuffer` for parsing with `utf-8` encoding.
 *
 * @param teamId - Team id forwarded to the file reader.
 * @param url - Address of the file to download; also used to derive the extension.
 * @param relatedId - externalFileId / apiFileId, forwarded to the reader as metadata.
 * @returns The `rawText` produced by the file reader.
 */
export const readFileRawTextByUrl = async ({
  teamId,
  url,
  relatedId
}: {
  teamId: string;
  url: string;
  relatedId: string; // externalFileId / apiFileId
}) => {
  // Ask for an arraybuffer so the payload is not decoded as text by axios.
  const { data } = await axios({
    method: 'get',
    url,
    responseType: 'arraybuffer'
  });

  const parsed = await readRawContentByFileBuffer({
    extension: parseFileExtensionFromUrl(url),
    teamId,
    buffer: Buffer.from(data, 'binary'),
    encoding: 'utf-8',
    metadata: { relatedId }
  });

  return parsed.rawText;
};
/*
  How the raw text is obtained for each source type:
  fileLocal            - local file, read from mongo
  link                 - fetch the web page by request
  externalFile/apiFile - read the file content by request
*/
/**
 * Read the raw text of a dataset source, dispatching on the source type.
 *
 * - `fileLocal`: read the file content from mongo (dataset bucket).
 * - `link`: fetch the URL in `sourceId` and return the first result's content.
 * - `externalFile`: download the file at `sourceId`; requires `externalFileId`.
 * - `apiFile`: read the content through the configured api / feishu / yuque server.
 *
 * @param teamId - Team id forwarded to the underlying readers.
 * @param type - Kind of source to read.
 * @param sourceId - File id, URL or api file id, depending on `type`.
 * @param isQAImport - csv data flag, forwarded when reading a local file.
 * @param selector - Link selector, forwarded when fetching a link.
 * @param externalFileId - Id of the external file (external file dataset).
 * @param apiServer - Api dataset server config.
 * @param feishuServer - Feishu dataset server config.
 * @param yuqueServer - Yuque dataset server config.
 * @returns The raw text, or an empty string for an unrecognised type or a link
 *   with no content. Rejects with `'FileId not found'` when an external file is
 *   requested without `externalFileId`.
 */
export const readDatasetSourceRawText = async ({
  teamId,
  type,
  sourceId,
  isQAImport,
  selector,
  externalFileId,
  apiServer,
  feishuServer,
  yuqueServer
}: {
  teamId: string;
  type: DatasetSourceReadTypeEnum;
  sourceId: string;
  isQAImport?: boolean; // csv data
  selector?: string; // link selector
  externalFileId?: string; // external file dataset
  apiServer?: APIFileServer; // api dataset
  feishuServer?: FeishuServer; // feishu dataset
  yuqueServer?: YuqueServer; // yuque dataset
}): Promise<string> => {
  switch (type) {
    case DatasetSourceReadTypeEnum.fileLocal: {
      const fileContent = await readFileContentFromMongo({
        teamId,
        bucketName: BucketNameEnum.dataset,
        fileId: sourceId,
        isQAImport
      });
      return fileContent.rawText;
    }

    case DatasetSourceReadTypeEnum.link: {
      const pages = await urlsFetch({
        urlList: [sourceId],
        selector
      });
      // Only one URL was requested; a missing or empty result yields ''.
      const firstPage = pages[0];
      return firstPage?.content || '';
    }

    case DatasetSourceReadTypeEnum.externalFile: {
      if (!externalFileId) return Promise.reject('FileId not found');

      return await readFileRawTextByUrl({
        teamId,
        url: sourceId,
        relatedId: externalFileId
      });
    }

    case DatasetSourceReadTypeEnum.apiFile: {
      return await readApiServerFileContent({
        apiServer,
        feishuServer,
        yuqueServer,
        apiFileId: sourceId,
        teamId
      });
    }

    default:
      return '';
  }
};
/**
 * Read the content of a file that lives in an api-backed dataset.
 *
 * `apiServer` takes precedence: when present, the content is requested through
 * `useApiDatasetRequest`. Otherwise, if a feishu or yuque server is configured,
 * the request is POSTed to `/core/dataset/systemApiDataset` with `type: 'content'`.
 *
 * @param apiServer - Api dataset server config.
 * @param feishuServer - Feishu dataset server config.
 * @param yuqueServer - Yuque dataset server config.
 * @param apiFileId - Id of the file to read.
 * @param teamId - Team id, forwarded on the `apiServer` path.
 * @returns The file content. Rejects with
 *   `'No apiServer or feishuServer or yuqueServer'` when no server is provided.
 */
export const readApiServerFileContent = async ({
  apiServer,
  feishuServer,
  yuqueServer,
  apiFileId,
  teamId
}: {
  apiServer?: APIFileServer;
  feishuServer?: FeishuServer;
  yuqueServer?: YuqueServer;
  apiFileId: string;
  teamId: string;
}) => {
  if (apiServer) {
    const apiDataset = useApiDatasetRequest({ apiServer });
    return apiDataset.getFileContent({ teamId, apiFileId });
  }

  const hasSystemServer = Boolean(feishuServer || yuqueServer);
  if (!hasSystemServer) {
    return Promise.reject('No apiServer or feishuServer or yuqueServer');
  }

  return POST<string>(`/core/dataset/systemApiDataset`, {
    type: 'content',
    feishuServer,
    yuqueServer,
    apiFileId
  });
};
/**
 * Turn raw text into a list of chunks.
 *
 * For a QA import the text is handed to `parseCsvTable2Chunks` and its chunks
 * are returned as-is. Otherwise the text is split with `splitText2Chunks` and
 * every piece becomes a `{ q, a }` pair with an empty `a`.
 *
 * @param rawText - Text to convert.
 * @param isQAImport - When truthy, treat `rawText` as csv data.
 * @param chunkLen - Chunk length passed to the splitter; defaults to 512.
 * @param splitProps - Remaining `TextSplitProps`, forwarded to the splitter unchanged.
 * @returns The resulting chunks.
 */
export const rawText2Chunks = ({
  rawText,
  isQAImport,
  chunkLen = 512,
  ...splitProps
}: {
  rawText: string;
  isQAImport?: boolean;
} & TextSplitProps) => {
  if (!isQAImport) {
    const splitResult = splitText2Chunks({
      text: rawText,
      chunkLen,
      ...splitProps
    });

    // Plain text has no answer part, so only `q` is filled.
    return splitResult.chunks.map((chunkText) => ({
      q: chunkText,
      a: ''
    }));
  }

  return parseCsvTable2Chunks(rawText).chunks;
};