Files
FastGPT/packages/service/core/dataset/read.ts
T
Archer 051455238c V4.13.0 features (#5693)
* feat: concat usage code (#5657)

* feat: dataset parse queue (#5661)

* feat: chat usage concat (#5669)

* perf: search test usage

* feat: chat usage concat

* fix: ts

* fix: ts

* feat: chat node response store (#5675)

* feat: chat node response store

* limit export

* test

* add ai generate node (#5506)

* add node copilot

* apply code

* update dynamic input & output

* add code test

* usage

* dynamic input border render

* optimize input & output

* optimize code

* update style

* change card to popover

* prompt editor basic

* prompt editor

* handle key down

* update prompt

* merge

* fix

* fix

* fix

* perf: workflow performance (#5677)

* feat: chat node response store

* limit export

* perf: workflow performance

* remove log

* fix: app template get duplicate (#5682)

* fix: dynamic input lock & code param (#5680)

* fix: dynamic input lock & code param

* fix

* fix

* feat: multi node data sync & system tool hot-swapping (#5575)

* Enhance file upload functionality and system tool integration (#5257)

* Enhance file upload functionality and system tool integration

* Add supplementary documents and optimize the upload interface

* Refactor file plugin types and update upload configurations

* Refactor MinIO configuration variables and clean up API plugin handlers for improved readability and consistency

* File name change

* Refactor SystemTools component layout

* fix i18n

* fix

* fix

* fix

* optimize app logs sort (#5310)

* log keys config modal

* multiple select

* api

* fontsize

* code

* chatid

* fix build

* fix

* fix component

* change name

* log keys config

* fix

* delete unused

* fix

* chore: minio service class rewrite

* chore: s3 plugin upload

* feat: system global cache with multi node sync feature

* feat: cache

* chore: move images

* docs: update & remove useless code

* chore: resolve merge conflicts

* chore: adjust the code

* chore: adjust

* deps: upgrade @fastgpt-sdk/plugin to 0.1.17

* perf(s3): s3 config

* fix: cache syncKey refresh

* fix: update @fastgpt-sdk/plugin to v0.1.18 removing mongo definition for fixing vitest

* chore: adjust

---------

Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Archer <545436317@qq.com>

* perf: s3 api code

* fix: toolbox empty when second open modal

* feat: http tool set (#5599)

* feat: http toolSet manual create front end

* feat: http toolSet manual create i18n

* feat: http toolSet manual create back end

* feat: auth, as tool param, adapt mcp

* fix: delete unused httpPlugin

* fix: delete FlowNodeTypeEnum.httpPlugin

* fix: AppTypeEnum include httpToolSet and httpPlugin

* fix

* delete console

* fix

* output schema

* fix

* fix bg

* fix base url

* fix

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* feat: app count

* perf: type check

* feat: catch error

* perf: plugin hot-swapping (#5688)

* perf: plugin hot-swapping

* chore: adjust code

* perf: cite data auth

* fix http toolset (#5689)

* temp

* fix http tool set

* fix

* template author hide

* dynamic IO ui

* fix: auth test

* fix dynamic input & output (#5690)

Co-authored-by: Archer <545436317@qq.com>

* fix: dynamic output id

* doc

* feat: model permission (#5666)

* feat(permission): model permission definition & api

* chore: support update model's collaborators

* feat: remove unauthedmodel when paste and import

* fix: type error

* fix: test setup global model list

* fix: http tool api

* chore: update fastgpt-sdk version

* chore: remove useless code

* chore: myModelList cache

* perf: user who is not manager can not configure model permission (FE)

* perf: model => Set

* feat: getMyModels moved to opensource code; cache the myModelList

* fix: type error

* fix dynamic input reference select type (#5694)

* remove unique index

* read file usage

* perf: connection error

* fix: abort token count

* fix: debug usage concat

* fix: immer clone object

* fix: immer clone object

* perf: throw error when error chat

* update audit i18n

* fix: 修复识别pptx文件后,返回内容顺序错乱问题 (#5696)

* fix: pptx sort error

* fix prompt editor (#5695)

* fix prompt editor

* fix

* fix: redis cache prefix (#5697)

* fix: redis cache prefix

* fix: cache

* fix: get model collaborator by model.model

* feat: hint for model per

* rename bucket name

* model ui

* doc

* doc

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: Zeng Qingwen <143274079+fishwww-ww@users.noreply.github.com>
Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: Deepturn <33342819+Deepturn@users.noreply.github.com>
2025-09-24 22:40:31 +08:00

344 lines
8.4 KiB
TypeScript

import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import {
ChunkTriggerConfigTypeEnum,
DatasetSourceReadTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { getApiDatasetRequest } from './apiDataset';
import Papa from 'papaparse';
import type { ApiDatasetServerType } from '@fastgpt/global/core/dataset/apiDataset/type';
import { text2Chunks } from '../../worker/function';
import { addLog } from '../../common/system/log';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getFileMaxSize } from '../../common/file/utils';
import { UserError } from '@fastgpt/global/common/error/utils';
/**
 * Download a file from a URL and extract its raw text content.
 *
 * Enforces `maxFileSize` twice: a best-effort HEAD pre-check (so oversized
 * files can be rejected before any download), and a hard guard while
 * streaming the body (for servers that omit/lie about content-length).
 *
 * @param teamId / tmbId - owner identifiers forwarded to the file parser
 * @param url - remote file location; extension is parsed from the URL
 * @param customPdfParse - use the custom PDF parsing pipeline if set
 * @param getFormatText - request formatted text output from the parser
 * @param relatedId - externalFileId / apiFileId linking the parse result
 * @param maxFileSize - byte limit, defaults to the system file max
 * @returns the extracted raw text
 */
export const readFileRawTextByUrl = async ({
  teamId,
  tmbId,
  url,
  customPdfParse,
  getFormatText,
  relatedId,
  maxFileSize = getFileMaxSize()
}: {
  teamId: string;
  tmbId: string;
  url: string;
  customPdfParse?: boolean;
  getFormatText?: boolean;
  relatedId: string; // externalFileId / apiFileId
  maxFileSize?: number;
}) => {
  const extension = parseFileExtensionFromUrl(url);

  // Best-effort size pre-check: some servers reject HEAD, so failures only
  // log a warning and we rely on the streaming guard below instead.
  try {
    const headResponse = await axios.head(url, { timeout: 10000 });
    const contentLength = parseInt(headResponse.headers['content-length'] || '0');
    if (contentLength > 0 && contentLength > maxFileSize) {
      return Promise.reject(
        `File too large. Size: ${Math.round(contentLength / 1024 / 1024)}MB, Maximum allowed: ${Math.round(maxFileSize / 1024 / 1024)}MB`
      );
    }
  } catch (error) {
    addLog.warn('Check file HEAD request failed');
  }

  // Use stream response type, avoid double memory usage
  const response = await axios({
    method: 'get',
    url: url,
    responseType: 'stream',
    maxContentLength: maxFileSize,
    timeout: 30000
  });

  // Collect the stream directly into Buffer chunks, avoiding an
  // arraybuffer intermediate copy.
  const chunks: Buffer[] = [];
  let totalLength = 0;

  // Overall download deadline. NOTE: the previous message claimed
  // "30 seconds" while the timer was actually 600000 ms; the message is
  // now derived from the constant so they cannot drift apart again.
  const DOWNLOAD_TIMEOUT_MS = 600000;

  return new Promise<string>((resolve, reject) => {
    let isAborted = false;

    // Idempotent teardown: release buffered memory and destroy the stream.
    const cleanup = () => {
      if (!isAborted) {
        isAborted = true;
        chunks.length = 0; // free buffered memory
        response.data.destroy();
      }
    };

    const timeoutId = setTimeout(() => {
      cleanup();
      reject(`File download timeout after ${DOWNLOAD_TIMEOUT_MS / 1000} seconds`);
    }, DOWNLOAD_TIMEOUT_MS);

    response.data.on('data', (chunk: Buffer) => {
      if (isAborted) return;

      totalLength += chunk.length;

      // Streaming size guard — catches servers without content-length.
      if (totalLength > maxFileSize) {
        clearTimeout(timeoutId);
        cleanup();
        return reject(
          `File too large. Maximum size allowed is ${Math.round(maxFileSize / 1024 / 1024)}MB.`
        );
      }

      chunks.push(chunk);
    });

    response.data.on('end', async () => {
      if (isAborted) return;
      clearTimeout(timeoutId);

      try {
        // Merge all chunks into a single buffer, then drop the chunk
        // references immediately to release memory.
        const buffer = Buffer.concat(chunks);
        chunks.length = 0;

        const { rawText } = await retryFn(() =>
          readRawContentByFileBuffer({
            customPdfParse,
            getFormatText,
            extension,
            teamId,
            tmbId,
            buffer,
            encoding: 'utf-8',
            metadata: {
              relatedId
            }
          })
        );

        resolve(rawText);
      } catch (error) {
        cleanup();
        reject(error);
      }
    });

    response.data.on('error', (error: Error) => {
      clearTimeout(timeoutId);
      cleanup();
      reject(error);
    });

    response.data.on('close', () => {
      clearTimeout(timeoutId);
      cleanup();
    });
  });
};
/*
fileId - local file, read from mongo
link - request
externalFile/apiFile = request read
*/
/**
 * Read the raw text of a dataset source, dispatching on the source type:
 *   fileLocal            - read from mongo GridFS
 *   link                 - fetch and scrape the URL
 *   externalFile/apiFile - download/request from the remote service
 *
 * Unrecognized types resolve to an empty result rather than rejecting.
 */
export const readDatasetSourceRawText = async ({
  teamId,
  tmbId,
  type,
  sourceId,
  selector,
  externalFileId,
  apiDatasetServer,
  customPdfParse,
  getFormatText,
  usageId
}: {
  teamId: string;
  tmbId: string;
  type: DatasetSourceReadTypeEnum;
  sourceId: string;
  customPdfParse?: boolean;
  getFormatText?: boolean;
  selector?: string; // link selector
  externalFileId?: string; // external file dataset
  apiDatasetServer?: ApiDatasetServerType; // api dataset
  usageId?: string;
}): Promise<{
  title?: string;
  rawText: string;
}> => {
  switch (type) {
    case DatasetSourceReadTypeEnum.fileLocal: {
      const { filename, rawText } = await readFileContentFromMongo({
        teamId,
        tmbId,
        bucketName: BucketNameEnum.dataset,
        fileId: sourceId,
        getFormatText,
        customPdfParse,
        usageId
      });
      return { title: filename, rawText };
    }

    case DatasetSourceReadTypeEnum.link: {
      const fetchResults = await urlsFetch({
        urlList: [sourceId],
        selector
      });
      const { title = sourceId, content = '' } = fetchResults[0];

      // An empty body or the internal-url sentinel both count as failure.
      if (!content || content === 'Cannot fetch internal url') {
        return Promise.reject(content || 'Can not fetch content from link');
      }

      return { title, rawText: content };
    }

    case DatasetSourceReadTypeEnum.externalFile: {
      if (!externalFileId) {
        return Promise.reject(new UserError('FileId not found'));
      }
      const rawText = await readFileRawTextByUrl({
        teamId,
        tmbId,
        url: sourceId,
        relatedId: externalFileId,
        customPdfParse
      });
      return { rawText };
    }

    case DatasetSourceReadTypeEnum.apiFile: {
      const { title, rawText } = await readApiServerFileContent({
        apiDatasetServer,
        apiFileId: sourceId,
        teamId,
        tmbId,
        customPdfParse
      });
      return { title, rawText };
    }

    default:
      return { title: '', rawText: '' };
  }
};
/**
 * Fetch a file's content through the configured API-dataset server.
 *
 * Resolves the dataset client for `apiDatasetServer`, then delegates the
 * read to its `getFileContent` implementation.
 */
export const readApiServerFileContent = async ({
  apiDatasetServer,
  apiFileId,
  teamId,
  tmbId,
  customPdfParse
}: {
  apiDatasetServer?: ApiDatasetServerType;
  apiFileId: string;
  teamId: string;
  tmbId: string;
  customPdfParse?: boolean;
}): Promise<{
  title?: string;
  rawText: string;
}> => {
  const apiClient = await getApiDatasetRequest(apiDatasetServer);

  return apiClient.getFileContent({
    teamId,
    tmbId,
    apiFileId,
    customPdfParse
  });
};
/**
 * Split raw text into q/a chunks for dataset ingestion.
 *
 * Modes:
 *  - backupParse: treat the text as an exported CSV backup (header row, then
 *    columns: q, a, index...) and rebuild chunks directly from it.
 *  - maxSize trigger: only split once the text exceeds ~70% of the model's
 *    max size (fallback threshold 16000).
 *  - minSize trigger (any type except forceChunk): only split once the text
 *    exceeds the manually configured minimum.
 *  - otherwise: delegate splitting to the text2Chunks worker.
 */
export const rawText2Chunks = async ({
  rawText = '',
  chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize = 1000,
  backupParse,
  chunkSize = 512,
  imageIdList,
  ...splitProps
}: {
  rawText: string;
  imageIdList?: string[];
  chunkTriggerType?: ChunkTriggerConfigTypeEnum;
  chunkTriggerMinSize?: number; // maxSize from agent model, not store
  backupParse?: boolean;
  tableParse?: boolean;
} & TextSplitProps): Promise<
  {
    q: string;
    a: string;
    indexes?: string[];
    imageIdList?: string[];
  }[]
> => {
  // Backup mode: rows past the header become chunks; columns beyond the
  // second are treated as extra indexes. Rows with neither q nor a are dropped.
  if (backupParse) {
    const rows = Papa.parse(rawText).data as string[][];
    return rows
      .slice(1)
      .map((row) => ({
        q: row[0] || '',
        a: row[1] || '',
        indexes: row.slice(2).filter((cell) => cell.trim()),
        imageIdList
      }))
      .filter((chunk) => chunk.q || chunk.a);
  }

  const trimmedLength = rawText.trim().length;

  // 1. maxSize trigger: chunking only kicks in past ~70% of the model max
  //    (default 16000 when no maxSize was supplied).
  if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
    const threshold = splitProps.maxSize ? splitProps.maxSize * 0.7 : 16000;
    if (trimmedLength < threshold) {
      return [
        {
          q: rawText,
          a: '',
          imageIdList
        }
      ];
    }
  }

  // 2. minSize trigger: everything except forceChunk also respects the
  //    manually configured minimum before splitting.
  if (
    chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk &&
    trimmedLength < chunkTriggerMinSize
  ) {
    return [{ q: rawText, a: '', imageIdList }];
  }

  const { chunks } = await text2Chunks({
    text: rawText,
    chunkSize,
    ...splitProps
  });

  return chunks.map((text) => ({
    q: text,
    a: '',
    indexes: [],
    imageIdList
  }));
};