Mirror of https://github.com/labring/FastGPT.git (synced 2025-10-20 02:34:52 +00:00)

* add logs chart (#5352) * charts * chart data * log chart * delete * rename api * fix * move api * fix * fix * pro config * fix * feat: Repository interaction (#5356) * feat: 1 the feature seems fine now; will test again tomorrow * feat: 2 fixed yesterday's leftover bug, but the select-all button is buggy again * feat: 3 third pass, fixed the select-all bug * feat: 4 fourth pass, small details to tweak next * feat: 5 good grief * feat: 6 * feat: 6 pr * feat: 7 * feat: 8 * feat: 9 * feat: 10 * feat: 11 * feat: 12 * perf: checkbox ui * refactor: tweak login layout (#5357) Co-authored-by: Archer <545436317@qq.com> * login ui * app chat log chart pro display (#5392) * app chat log chart pro display * add canopen props * perf: pro tag tip * perf: pro tag tip * feat: openrouter provider (#5406) * perf: login ui * feat: openrouter provider * provider * perf: custom error throw * perf: emb batch (#5407) * perf: emb batch * perf: vector retry * doc * doc (#5411) * doc * fix: team folder will add to workflow * fix: generateToc shell * Tool price (#5376) * resolve conflicts for cherry-pick * fix i18n * Enhance system plugin template data structure and update ToolSelectModal to include CostTooltip component * refactor: update systemKeyCost type to support array of objects in plugin and workflow types * refactor: simplify systemKeyCost type across plugin and workflow types to a single number * refactor: streamline systemKeyCost handling in plugin and workflow components * fix * fix * perf: toolset price config; fix: workflow array selector ui (#5419) * fix: workflow array selector ui * update default model tip * perf: toolset price config * doc * fix: test * Refactor/chat (#5418) * refactor: add homepage configuration; add home chat page; add side bar animated collapse and layout * fix: fix lint rules * chore: improve logic and code * chore: clearer logic * chore: adjust api --------- Co-authored-by: Archer <545436317@qq.com> * perf: chat setting code * del history * logo image * perf: home chat ui * feat: enhance chat response handling with external links and user info (#5427) * feat: enhance chat response handling with external links and user info * fix * cite code * perf: toolset add in workflow * fix: test * fix: search parentId * Fix/chat (#5434) * wip: rebased onto upstream * wip: adapt mobile UI * fix: fix chat page logic and UI * fix: fix UI and improve some logic * fix: model selector missing logo; vision model to retrieve file * perf: role selector * fix: chat ui * optimize export app chat log (#5436) * doc * chore: move components to proper directory; fix the api to get app list (#5437) * chore: improve team app panel display form (#5438) * feat: add home chat log tab * chore: improve team app panel display form * chore: improve log panel * fix: spec * doc * fix: log permission * fix: dataset schema required * add loading status * remove ui weight * manage log * fix: log detail per * doc * fix: log menu * rename permission * bg color * fix: app log per * fix: log key selector * fix: log * doc --------- Co-authored-by: heheer <zhiyu44@qq.com> Co-authored-by: colnii <1286949794@qq.com> Co-authored-by: 伍闲犬 <76519998+xqvvu@users.noreply.github.com> Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com> Co-authored-by: 伍闲犬 <whoeverimf5@gmail.com> Co-authored-by: heheer <heheer@sealos.io>
341 lines
8.3 KiB
TypeScript
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import {
  ChunkTriggerConfigTypeEnum,
  DatasetSourceReadTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { type TextSplitProps } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { getApiDatasetRequest } from './apiDataset';
import Papa from 'papaparse';
import type { ApiDatasetServerType } from '@fastgpt/global/core/dataset/apiDataset/type';
import { text2Chunks } from '../../worker/function';
import { addLog } from '../../common/system/log';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getFileMaxSize } from '../../common/file/utils';
import { UserError } from '@fastgpt/global/common/error/utils';

export const readFileRawTextByUrl = async ({
  teamId,
  tmbId,
  url,
  customPdfParse,
  getFormatText,
  relatedId,
  maxFileSize = getFileMaxSize()
}: {
  teamId: string;
  tmbId: string;
  url: string;
  customPdfParse?: boolean;
  getFormatText?: boolean;
  relatedId: string; // externalFileId / apiFileId
  maxFileSize?: number;
}) => {
  const extension = parseFileExtensionFromUrl(url);

  // Check file size
  try {
    const headResponse = await axios.head(url, { timeout: 10000 });
    const contentLength = parseInt(headResponse.headers['content-length'] || '0');

    if (contentLength > 0 && contentLength > maxFileSize) {
      return Promise.reject(
        `File too large. Size: ${Math.round(contentLength / 1024 / 1024)}MB, Maximum allowed: ${Math.round(maxFileSize / 1024 / 1024)}MB`
      );
    }
  } catch (error) {
    addLog.warn('Check file HEAD request failed');
  }

  // Use stream response type, avoid double memory usage
  const response = await axios({
    method: 'get',
    url: url,
    responseType: 'stream',
    maxContentLength: maxFileSize,
    timeout: 30000
  });

  // Optimization: build the buffer directly from the stream, avoiding an intermediate arraybuffer step
  const chunks: Buffer[] = [];
  let totalLength = 0;

  return new Promise<string>((resolve, reject) => {
    let isAborted = false;

    const cleanup = () => {
      if (!isAborted) {
        isAborted = true;
        chunks.length = 0; // Free memory
        response.data.destroy();
      }
    };

    // Stream timeout
    const timeoutId = setTimeout(() => {
      cleanup();
      reject('File download timeout after 10 minutes');
    }, 600000);

    response.data.on('data', (chunk: Buffer) => {
      if (isAborted) return;
      totalLength += chunk.length;
      if (totalLength > maxFileSize) {
        clearTimeout(timeoutId);
        cleanup();
        return reject(
          `File too large. Maximum size allowed is ${Math.round(maxFileSize / 1024 / 1024)}MB.`
        );
      }

      chunks.push(chunk);
    });

    response.data.on('end', async () => {
      if (isAborted) return;

      clearTimeout(timeoutId);

      try {
        // Merge all chunks into a single buffer
        const buffer = Buffer.concat(chunks);

        // Clear the chunks array immediately to release memory
        chunks.length = 0;

        const { rawText } = await retryFn(() =>
          readRawContentByFileBuffer({
            customPdfParse,
            getFormatText,
            extension,
            teamId,
            tmbId,
            buffer,
            encoding: 'utf-8',
            metadata: {
              relatedId
            }
          })
        );

        resolve(rawText);
      } catch (error) {
        cleanup();
        reject(error);
      }
    });

    response.data.on('error', (error: Error) => {
      clearTimeout(timeoutId);
      cleanup();
      reject(error);
    });

    response.data.on('close', () => {
      clearTimeout(timeoutId);
      cleanup();
    });
  });
};
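
/*
  Hypothetical usage sketch (illustration only, not part of the upstream file):
  download a remote file and extract its raw text with readFileRawTextByUrl.
  The URL and ids below are placeholder values.
*/
async function exampleReadFileRawTextByUrl() {
  const rawText = await readFileRawTextByUrl({
    teamId: 'placeholder-team-id',
    tmbId: 'placeholder-tmb-id',
    url: 'https://example.com/files/report.pdf',
    relatedId: 'placeholder-external-file-id',
    customPdfParse: false
  });
  console.log(`Read ${rawText.length} characters from the remote file`);
}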

/*
  fileId - local file, read from mongo
  link - request
  externalFile/apiFile - request read
*/
export const readDatasetSourceRawText = async ({
  teamId,
  tmbId,
  type,
  sourceId,
  selector,
  externalFileId,
  apiDatasetServer,
  customPdfParse,
  getFormatText
}: {
  teamId: string;
  tmbId: string;
  type: DatasetSourceReadTypeEnum;
  sourceId: string;
  customPdfParse?: boolean;
  getFormatText?: boolean;

  selector?: string; // link selector
  externalFileId?: string; // external file dataset
  apiDatasetServer?: ApiDatasetServerType; // api dataset
}): Promise<{
  title?: string;
  rawText: string;
}> => {
  if (type === DatasetSourceReadTypeEnum.fileLocal) {
    const { filename, rawText } = await readFileContentFromMongo({
      teamId,
      tmbId,
      bucketName: BucketNameEnum.dataset,
      fileId: sourceId,
      customPdfParse,
      getFormatText
    });
    return {
      title: filename,
      rawText
    };
  } else if (type === DatasetSourceReadTypeEnum.link) {
    const result = await urlsFetch({
      urlList: [sourceId],
      selector
    });

    const { title = sourceId, content = '' } = result[0];
    if (!content || content === 'Cannot fetch internal url') {
      return Promise.reject(content || 'Cannot fetch content from link');
    }

    return {
      title,
      rawText: content
    };
  } else if (type === DatasetSourceReadTypeEnum.externalFile) {
    if (!externalFileId) return Promise.reject(new UserError('FileId not found'));
    const rawText = await readFileRawTextByUrl({
      teamId,
      tmbId,
      url: sourceId,
      relatedId: externalFileId,
      customPdfParse
    });
    return {
      rawText
    };
  } else if (type === DatasetSourceReadTypeEnum.apiFile) {
    const { title, rawText } = await readApiServerFileContent({
      apiDatasetServer,
      apiFileId: sourceId,
      teamId,
      tmbId,
      customPdfParse
    });
    return {
      title,
      rawText
    };
  }
  return {
    title: '',
    rawText: ''
  };
};
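
/*
  Hypothetical usage sketch (illustration only, not part of the upstream file):
  read a web link source through readDatasetSourceRawText. The ids, URL and CSS
  selector below are placeholder values.
*/
async function exampleReadLinkSource() {
  const { title, rawText } = await readDatasetSourceRawText({
    teamId: 'placeholder-team-id',
    tmbId: 'placeholder-tmb-id',
    type: DatasetSourceReadTypeEnum.link,
    sourceId: 'https://example.com/docs/getting-started',
    selector: 'article' // optional selector passed through to urlsFetch
  });
  console.log(`Fetched "${title}" with ${rawText.length} characters`);
}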

export const readApiServerFileContent = async ({
  apiDatasetServer,
  apiFileId,
  teamId,
  tmbId,
  customPdfParse
}: {
  apiDatasetServer?: ApiDatasetServerType;
  apiFileId: string;
  teamId: string;
  tmbId: string;
  customPdfParse?: boolean;
}): Promise<{
  title?: string;
  rawText: string;
}> => {
  return (await getApiDatasetRequest(apiDatasetServer)).getFileContent({
    teamId,
    tmbId,
    apiFileId,
    customPdfParse
  });
};

export const rawText2Chunks = async ({
  rawText = '',
  chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
  chunkTriggerMinSize = 1000,
  backupParse,
  chunkSize = 512,
  imageIdList,
  ...splitProps
}: {
  rawText: string;
  imageIdList?: string[];

  chunkTriggerType?: ChunkTriggerConfigTypeEnum;
  chunkTriggerMinSize?: number; // maxSize from agent model, not store

  backupParse?: boolean;
  tableParse?: boolean;
} & TextSplitProps): Promise<
  {
    q: string;
    a: string;
    indexes?: string[];
    imageIdList?: string[];
  }[]
> => {
  const parseDatasetBackup2Chunks = (rawText: string) => {
    const csvArr = Papa.parse(rawText).data as string[][];

    const chunks = csvArr
      .slice(1)
      .map((item) => ({
        q: item[0] || '',
        a: item[1] || '',
        indexes: item.slice(2).filter((item) => item.trim()),
        imageIdList
      }))
      .filter((item) => item.q || item.a);

    return {
      chunks
    };
  };

  if (backupParse) {
    return parseDatasetBackup2Chunks(rawText).chunks;
  }

  // Chunk condition
  // 1. Max-size trigger: chunking only happens once the text exceeds the maximum size (by default 0.7 * the model's maximum)
  if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
    const textLength = rawText.trim().length;
    const maxSize = splitProps.maxSize ? splitProps.maxSize * 0.7 : 16000;
    if (textLength < maxSize) {
      return [
        {
          q: rawText,
          a: '',
          imageIdList
        }
      ];
    }
  }
  // 2. Min-size trigger: chunking only happens once the text exceeds the manually configured minimum size
  if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) {
    const textLength = rawText.trim().length;
    if (textLength < chunkTriggerMinSize) {
      return [{ q: rawText, a: '', imageIdList }];
    }
  }

  const { chunks } = await text2Chunks({
    text: rawText,
    chunkSize,
    ...splitProps
  });

  return chunks.map((item) => ({
    q: item,
    a: '',
    indexes: [],
    imageIdList
  }));
};
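
/*
  Hypothetical usage sketch (illustration only, not part of the upstream file):
  with the default minSize trigger, text shorter than chunkTriggerMinSize is kept
  as a single chunk, while longer text is split into chunkSize-sized pieces by
  text2Chunks. It assumes the remaining TextSplitProps fields are optional.
*/
async function exampleChunkRawText(rawText: string) {
  const chunks = await rawText2Chunks({
    rawText,
    chunkTriggerType: ChunkTriggerConfigTypeEnum.minSize,
    chunkTriggerMinSize: 1000,
    chunkSize: 512
  });
  console.log(`Generated ${chunks.length} chunk(s)`);
}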