import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import {
  ChunkTriggerConfigTypeEnum,
  DatasetSourceReadTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { getApiDatasetRequest } from './apiDataset';
import Papa from 'papaparse';
import type { ApiDatasetServerType } from '@fastgpt/global/core/dataset/apiDataset/type';
import { text2Chunks } from '../../worker/function';
import { addLog } from '../../common/system/log';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getFileMaxSize } from '../../common/file/utils';
import { UserError } from '@fastgpt/global/common/error/utils';
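
/**
 * Download a remote file as a stream and extract its raw text.
 * The file size is checked twice: via a HEAD request before downloading,
 * and again while streaming, so oversized files are rejected early.
 */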
export const readFileRawTextByUrl = async ({
  teamId,
  tmbId,
  url,
  customPdfParse,
  getFormatText,
  relatedId,
  maxFileSize = getFileMaxSize()
}: {
  teamId: string;
  tmbId: string;
  url: string;
  customPdfParse?: boolean;
  getFormatText?: boolean;
  relatedId: string; // externalFileId / apiFileId
  maxFileSize?: number;
}) => {
  const extension = parseFileExtensionFromUrl(url);

  // Check file size via a HEAD request before downloading
  try {
    const headResponse = await axios.head(url, { timeout: 10000 });
    const contentLength = parseInt(headResponse.headers['content-length'] || '0');

    if (contentLength > 0 && contentLength > maxFileSize) {
      return Promise.reject(
        `File too large. Size: ${Math.round(contentLength / 1024 / 1024)}MB, Maximum allowed: ${Math.round(maxFileSize / 1024 / 1024)}MB`
      );
    }
  } catch (error) {
    addLog.warn('Check file HEAD request failed');
  }

  // Use stream response type, avoid double memory usage
  const response = await axios({
    method: 'get',
    url: url,
    responseType: 'stream',
    maxContentLength: maxFileSize,
    timeout: 30000
  });

  // Convert the stream directly to a buffer, avoiding an arraybuffer intermediate step
  const chunks: Buffer[] = [];
  let totalLength = 0;

  return new Promise<string>((resolve, reject) => {
    let isAborted = false;

    const cleanup = () => {
      if (!isAborted) {
        isAborted = true;
        chunks.length = 0; // Release buffered memory
        response.data.destroy();
      }
    };

    // Stream timeout
    const timeoutId = setTimeout(() => {
      cleanup();
      reject('File download timeout after 10 minutes');
    }, 600000);

    response.data.on('data', (chunk: Buffer) => {
      if (isAborted) return;
      totalLength += chunk.length;
      if (totalLength > maxFileSize) {
        clearTimeout(timeoutId);
        cleanup();
        return reject(
          `File too large. Maximum size allowed is ${Math.round(maxFileSize / 1024 / 1024)}MB.`
        );
      }

      chunks.push(chunk);
    });

    response.data.on('end', async () => {
      if (isAborted) return;

      clearTimeout(timeoutId);

      try {
        // Merge all chunks into a single buffer
        const buffer = Buffer.concat(chunks);

        // Clear the chunks array immediately to free memory
        chunks.length = 0;

        const { rawText } = await retryFn(() =>
          readRawContentByFileBuffer({
            customPdfParse,
            getFormatText,
            extension,
            teamId,
            tmbId,
            buffer,
            encoding: 'utf-8',
            metadata: {
              relatedId
            }
          })
        );

        resolve(rawText);
      } catch (error) {
        cleanup();
        reject(error);
      }
    });

    response.data.on('error', (error: Error) => {
      clearTimeout(timeoutId);
      cleanup();
      reject(error);
    });

    response.data.on('close', () => {
      clearTimeout(timeoutId);
      cleanup();
    });
  });
};

/*
  fileId - local file, read from mongo
  link - fetch the url and extract its content
  externalFile / apiFile - read via remote request
*/
export const readDatasetSourceRawText = async ({
  teamId,
  tmbId,
  type,
  sourceId,
  selector,
  externalFileId,
  apiDatasetServer,
  customPdfParse,
  getFormatText,
  usageId
}: {
  teamId: string;
  tmbId: string;
  type: DatasetSourceReadTypeEnum;
  sourceId: string;
  customPdfParse?: boolean;
  getFormatText?: boolean;

  selector?: string; // link selector
  externalFileId?: string; // external file dataset
  apiDatasetServer?: ApiDatasetServerType; // api dataset
  usageId?: string;
}): Promise<{
  title?: string;
  rawText: string;
}> => {
  if (type === DatasetSourceReadTypeEnum.fileLocal) {
    const { filename, rawText } = await readFileContentFromMongo({
      teamId,
      tmbId,
      bucketName: BucketNameEnum.dataset,
      fileId: sourceId,
      getFormatText,
      customPdfParse,
      usageId
    });
    return {
      title: filename,
      rawText
    };
  } else if (type === DatasetSourceReadTypeEnum.link) {
    const result = await urlsFetch({
      urlList: [sourceId],
      selector
    });

    const { title = sourceId, content = '' } = result[0];
    if (!content || content === 'Cannot fetch internal url') {
      return Promise.reject(content || 'Cannot fetch content from link');
    }

    return {
      title,
      rawText: content
    };
  } else if (type === DatasetSourceReadTypeEnum.externalFile) {
    if (!externalFileId) return Promise.reject(new UserError('FileId not found'));
    const rawText = await readFileRawTextByUrl({
      teamId,
      tmbId,
      url: sourceId,
      relatedId: externalFileId,
      customPdfParse
    });
    return {
      rawText
    };
  } else if (type === DatasetSourceReadTypeEnum.apiFile) {
    const { title, rawText } = await readApiServerFileContent({
      apiDatasetServer,
      apiFileId: sourceId,
      teamId,
      tmbId,
      customPdfParse
    });
    return {
      title,
      rawText
    };
  }
  return {
    title: '',
    rawText: ''
  };
};
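
/*
  Illustrative usage only (the url and variables below are made-up examples):

  const { title, rawText } = await readDatasetSourceRawText({
    teamId,
    tmbId,
    type: DatasetSourceReadTypeEnum.link,
    sourceId: 'https://example.com/docs/getting-started',
    selector: 'article'
  });
*/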
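
/**
 * Read a single file's content from the configured API dataset server.
 */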
export const readApiServerFileContent = async ({
  apiDatasetServer,
  apiFileId,
  teamId,
  tmbId,
  customPdfParse
}: {
  apiDatasetServer?: ApiDatasetServerType;
  apiFileId: string;
  teamId: string;
  tmbId: string;
  customPdfParse?: boolean;
}): Promise<{
  title?: string;
  rawText: string;
}> => {
  return (await getApiDatasetRequest(apiDatasetServer)).getFileContent({
    teamId,
    tmbId,
    apiFileId,
    customPdfParse
  });
};
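
/**
 * Convert raw text into chunk records ({ q, a, indexes, imageIdList }).
 * - backupParse: treat the text as a backup CSV export and map its rows directly to chunks.
 * - Otherwise, the chunk trigger settings decide whether the whole text is returned as a
 *   single chunk or split into chunks via text2Chunks.
 */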
export const rawText2Chunks = async ({
  rawText = '',
  chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize = 1000,
  backupParse,
  chunkSize = 512,
  imageIdList,
  ...splitProps
}: {
  rawText: string;
  imageIdList?: string[];

  chunkTriggerType?: ChunkTriggerConfigTypeEnum;
  chunkTriggerMinSize?: number; // maxSize from agent model, not store

  backupParse?: boolean;
  tableParse?: boolean;
} & TextSplitProps): Promise<
  {
    q: string;
    a: string;
    indexes?: string[];
    imageIdList?: string[];
  }[]
> => {
  const parseDatasetBackup2Chunks = (rawText: string) => {
    const csvArr = Papa.parse(rawText).data as string[][];

    const chunks = csvArr
      .slice(1)
      .map((item) => ({
        q: item[0] || '',
        a: item[1] || '',
        indexes: item.slice(2).filter((item) => item.trim()),
        imageIdList
      }))
      .filter((item) => item.q || item.a);

    return {
      chunks
    };
  };

  if (backupParse) {
    return parseDatasetBackup2Chunks(rawText).chunks;
  }

  // Chunk trigger conditions
  // 1. maxSize trigger: only split when the text exceeds the maximum size (defaults to the model's max size * 0.7)
  if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
    const textLength = rawText.trim().length;
    const maxSize = splitProps.maxSize ? splitProps.maxSize * 0.7 : 16000;
    if (textLength < maxSize) {
      return [
        {
          q: rawText,
          a: '',
          imageIdList
        }
      ];
    }
  }
  // 2. minSize trigger: only split when the text exceeds the manually configured minimum size
  if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) {
    const textLength = rawText.trim().length;
    if (textLength < chunkTriggerMinSize) {
      return [{ q: rawText, a: '', imageIdList }];
    }
  }

  const { chunks } = await text2Chunks({
    text: rawText,
    chunkSize,
    ...splitProps
  });

  return chunks.map((item) => ({
    q: item,
    a: '',
    indexes: [],
    imageIdList
  }));
};
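
/*
  Illustrative layout of the backup CSV consumed by parseDatasetBackup2Chunks
  (the header row is skipped; column 0 maps to q, column 1 to a, any further
  non-empty columns become indexes; all values below are made up):

  q,a,indexes...
  "What is FastGPT?","A knowledge-base QA platform.","intro"
  "How do I import data?","Upload a file or paste a link."
*/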