Files
FastGPT/packages/service/core/dataset/read.ts
Archer 1aebe5f185 V4.8.15 feature (#3331)
* feat: add customize toolkit (#3205)

* chaoyang

* fix-auth

* add toolkit

* add order

* plugin usage

* fix

* delete console:

* Fix: Fix fullscreen preview top positioning and improve Markdown rendering logic (#3247)

* 完成任务:修复全屏预览顶部固定问题,优化 Markdown 渲染逻辑

* 有问题修改

* 问题再修改

* 修正问题

* fix: plugin standalone display issue (#3254)

* 4.8.15 test (#3246)

* o1 config

* perf: system plugin code

* 调整系统插件代码。增加html 渲染安全配置。 (#3258)

* perf: base64 picker

* perf: list app or dataset

* perf: plugin config code

* 小窗适配等问题 (#3257)

* 小窗适配等问题

* git问题

* 小窗剩余问题

* feat: system plugin auth and lock version (#3265)

* feat: system plugin auth and lock version

* update comment

* 4.8.15 test (#3267)

* tmp log

* perf: login direct

* perf: iframe html code

* remove log

* fix: plugin standalone display (#3277)

* refactor: 页面拆分&i18n拆分 (#3281)

* refactor: account组件拆成独立页面

* script: 新增i18n json文件创建脚本

* refactor: 页面i18n拆分

* i18n: add en&hant

* 4.8.15 test (#3285)

* tmp log

* remove log

* fix: watch avatar refresh

* perf: i18n code

* fix(plugin): use intro instead of userguide (#3290)

* Universal SSO (#3292)

* tmp log

* remove log

* feat: common oauth

* readme

* perf: sso provider

* remove sso code

* perf: refresh plugins

* feat: add api dataset (#3272)

* add api-dataset

* fix api-dataset

* fix api dataset

* fix ts

* perf: create collection code (#3301)

* tmp log

* remove log

* perf: i18n change

* update version doc

* feat: question guide from chatId

* perf: create collection code

* fix: request api

* fix: request api

* fix: tts auth and response type (#3303)

* perf: md splitter

* fix: tts auth and response type

* fix: api file dataset (#3307)

* perf: api dataset init (#3310)

* perf: collection schema

* perf: api dataset init

* refactor: 团队管理独立页面 (#3302)

* ui: 团队管理独立页面

* 代码优化

* fix

* perf: sync collection and ui check (#3314)

* perf: sync collection

* remove script

* perf: update api server

* perf: api dataset parent

* perf: team ui

* perf: team 18n

* update team ui

* perf: ui check

* perf: i18n

* fix: debug variables & cronjob & system plugin callback load (#3315)

* fix: debug variables & cronjob & system plugin callback load

* fix type

* fix

* fix

* fix: plugin dataset quote;perf: system variables init (#3316)

* fix: plugin dataset quote

* perf: system variables init

* perf: node templates ui;fix: dataset import ui (#3318)

* fix: dataset import ui

* perf: node templates ui

* perf: ui refresh

* feat:套餐改名和套餐跳转配置 (#3309)

* fixing:except Sidebar

* 去除了多余的代码

* 修正了套餐说明的代码

* 修正了误删除的show_git代码

* 修正了名字部分等代码

* 修正了问题,遗留了其他和ui讨论不一致的部分

* 4.8.15 test (#3319)

* remove log

* pref: bill ui

* pref: bill ui

* perf: log

* html渲染文档 (#3270)

* html渲染文档

* 文档有点小问题

* feat: doc (#3322)

* 集合重训练 (#3282)

* rebaser

* 一点补充

* 小问题

* 其他问题修正,删除集合保留文件的参数还没找到...

* reTraining

* delete uesless

* 删除了一行错误代码

* 集合重训练部分

* fixing

* 删除console代码

* feat: navbar item config (#3326)

* perf: custom navbar code;perf: retraining code;feat: api dataset and dataset api doc (#3329)

* feat: api dataset and dataset api doc

* perf: retraining code

* perf: custom navbar code

* fix: ts (#3330)

* fix: ts

* fix: ts

* retraining ui

* perf: api collection filter

* perf: retrining button

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Jiangween <145003935+Jiangween@users.noreply.github.com>
Co-authored-by: papapatrick <109422393+Patrickill@users.noreply.github.com>
2024-12-06 10:56:53 +08:00

140 lines
3.4 KiB
TypeScript

import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { parseCsvTable2Chunks } from './training/utils';
import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { APIFileServer } from '@fastgpt/global/core/dataset/apiDataset';
import { useApiDatasetRequest } from './apiDataset/api';
export const readFileRawTextByUrl = async ({
teamId,
url,
relatedId
}: {
teamId: string;
url: string;
relatedId: string; // externalFileId / apiFileId
}) => {
const response = await axios({
method: 'get',
url: url,
responseType: 'arraybuffer'
});
const extension = parseFileExtensionFromUrl(url);
const buffer = Buffer.from(response.data, 'binary');
const { rawText } = await readRawContentByFileBuffer({
extension,
teamId,
buffer,
encoding: 'utf-8',
metadata: {
relatedId
}
});
return rawText;
};
/*
fileId - local file, read from mongo
link - request
externalFile/apiFile = request read
*/
export const readDatasetSourceRawText = async ({
teamId,
type,
sourceId,
isQAImport,
selector,
externalFileId,
apiServer
}: {
teamId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
isQAImport?: boolean; // csv data
selector?: string; // link selector
externalFileId?: string; // external file dataset
apiServer?: APIFileServer; // api dataset
}): Promise<string> => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const { rawText } = await readFileContentFromMongo({
teamId,
bucketName: BucketNameEnum.dataset,
fileId: sourceId,
isQAImport
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.link) {
const result = await urlsFetch({
urlList: [sourceId],
selector
});
return result[0]?.content || '';
} else if (type === DatasetSourceReadTypeEnum.externalFile) {
if (!externalFileId) return Promise.reject('FileId not found');
const rawText = await readFileRawTextByUrl({
teamId,
url: sourceId,
relatedId: externalFileId
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.apiFile) {
if (!apiServer) return Promise.reject('apiServer not found');
const rawText = await readApiServerFileContent({
apiServer,
apiFileId: sourceId,
teamId
});
return rawText;
}
return '';
};
export const readApiServerFileContent = async ({
apiServer,
apiFileId,
teamId
}: {
apiServer: APIFileServer;
apiFileId: string;
teamId: string;
}) => {
return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, apiFileId });
};
export const rawText2Chunks = ({
rawText,
isQAImport,
chunkLen = 512,
...splitProps
}: {
rawText: string;
isQAImport?: boolean;
} & TextSplitProps) => {
if (isQAImport) {
const { chunks } = parseCsvTable2Chunks(rawText);
return chunks;
}
const { chunks } = splitText2Chunks({
text: rawText,
chunkLen,
...splitProps
});
return chunks.map((item) => ({
q: item,
a: ''
}));
};