Files
FastGPT/packages/service/common/file/read/utils.ts
Archer 051455238c V4.13.0 features (#5693)
* feat: concat usage code (#5657)

* feat: dataset parse queue (#5661)

* feat: chat usage concat (#5669)

* perf: search test usage

* feat: chat usage concat

* fix: ts

* fix: ts

* feat: chat node response store (#5675)

* feat: chat node response store

* limit export

* test

* add ai generate node (#5506)

* add node copilot

* apply code

* update dynamic input & output

* add code test

* usage

* dynamic input border render

* optimize input & output

* optimize code

* update style

* change card to popover

* prompt editor basic

* prompt editor

* handle key down

* update prompt

* merge

* fix

* fix

* fix

* perf: workflow performance (#5677)

* feat: chat node response store

* limit export

* perf: workflow performance

* remove log

* fix: app template get duplicate (#5682)

* fix: dynamic input lock & code param (#5680)

* fix: dynamic input lock & code param

* fix

* fix

* feat: multi node data sync & system tool hot-swapping (#5575)

* Enhance file upload functionality and system tool integration (#5257)

* Enhance file upload functionality and system tool integration

* Add supplementary documents and optimize the upload interface

* Refactor file plugin types and update upload configurations

* Refactor MinIO configuration variables and clean up API plugin handlers for improved readability and consistency

* File name change

* Refactor SystemTools component layout

* fix i18n

* fix

* fix

* fix

* optimize app logs sort (#5310)

* log keys config modal

* multiple select

* api

* fontsize

* code

* chatid

* fix build

* fix

* fix component

* change name

* log keys config

* fix

* delete unused

* fix

* chore: minio service class rewrite

* chore: s3 plugin upload

* feat: system global cache with multi node sync feature

* feat: cache

* chore: move images

* docs: update & remove useless code

* chore: resolve merge conflicts

* chore: adjust the code

* chore: adjust

* deps: upgrade @fastgpt-sdk/plugin to 0.1.17

* perf(s3): s3 config

* fix: cache syncKey refresh

* fix: update @fastgpt-sdk/plugin to v0.1.18 removing mongo definition for fixing vitest

* chore: adjust

---------

Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Archer <545436317@qq.com>

* perf: s3 api code

* fix: toolbox empty when second open modal

* feat: http tool set (#5599)

* feat: http toolSet manual create front end

* feat: http toolSet manual create i18n

* feat: http toolSet manual create back end

* feat: auth, as tool param, adapt mcp

* fix: delete unused httpPlugin

* fix: delete FlowNodeTypeEnum.httpPlugin

* fix: AppTypeEnum include httpToolSet and httpPlugin

* fix

* delete console

* fix

* output schema

* fix

* fix bg

* fix base url

* fix

---------

Co-authored-by: heheer <zhiyu44@qq.com>

* feat: app count

* perf: type check

* feat: catch error

* perf: plugin hot-swapping (#5688)

* perf: plugin hot-swapping

* chore: adjust code

* perf: cite data auth

* fix http toolset (#5689)

* temp

* fix http tool set

* fix

* template author hide

* dynamic IO ui

* fix: auth test

* fix dynamic input & output (#5690)

Co-authored-by: Archer <545436317@qq.com>

* fix: dynamic output id

* doc

* feat: model permission (#5666)

* feat(permission): model permission definition & api

* chore: support update model's collaborators

* feat: remove unauthedmodel when paste and import

* fix: type error

* fix: test setup global model list

* fix: http tool api

* chore: update fastgpt-sdk version

* chore: remove useless code

* chore: myModelList cache

* perf: user who is not manager can not configure model permission (FE)

* perf: model => Set

* feat: getMyModels moved to opensource code; cache the myModelList

* fix: type error

* fix dynamic input reference select type (#5694)

* remove unique index

* read file usage

* perf: connection error

* fix: abort token count

* fix: debug usage concat

* fix: immer clone object

* fix: immer clone object

* perf: throw error when error chat

* update audit i18n

* fix: 修复识别pptx文件后,返回内容顺序错乱问题 (#5696)

* fix: pptx sort error

* fix prompt editor (#5695)

* fix prompt editor

* fix

* fix: redis cache prefix (#5697)

* fix: redis cache prefix

* fix: cache

* fix: get model collaborator by model.model

* feat: hint for model per

* rename bucket name

* model ui

* doc

* doc

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: Ctrlz <143257420+ctrlz526@users.noreply.github.com>
Co-authored-by: Zeng Qingwen <143274079+fishwww-ww@users.noreply.github.com>
Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: Deepturn <33342819+Deepturn@users.noreply.github.com>
2025-09-24 22:40:31 +08:00

189 lines
4.9 KiB
TypeScript

import { uploadMongoImg } from '../image/controller';
import FormData from 'form-data';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import axios from 'axios';
import { addLog } from '../../system/log';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
import { readRawContentFromBuffer } from '../../../worker/function';
export type readRawTextByLocalFileParams = {
teamId: string;
tmbId: string;
path: string;
encoding: string;
customPdfParse?: boolean;
getFormatText?: boolean;
metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
const { path } = params;
const extension = path?.split('.')?.pop()?.toLowerCase() || '';
const buffer = await fs.promises.readFile(path);
return readRawContentByFileBuffer({
extension,
customPdfParse: params.customPdfParse,
getFormatText: params.getFormatText,
teamId: params.teamId,
tmbId: params.tmbId,
encoding: params.encoding,
buffer,
metadata: params.metadata
});
};
export const readRawContentByFileBuffer = async ({
teamId,
tmbId,
extension,
buffer,
encoding,
metadata,
customPdfParse = false,
usageId,
getFormatText = true
}: {
teamId: string;
tmbId: string;
extension: string;
buffer: Buffer;
encoding: string;
metadata?: Record<string, any>;
customPdfParse?: boolean;
usageId?: string;
getFormatText?: boolean;
}): Promise<{
rawText: string;
}> => {
const systemParse = () =>
readRawContentFromBuffer({
extension,
encoding,
buffer
});
const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
const url = global.systemEnv.customPdfParse?.url;
const token = global.systemEnv.customPdfParse?.key;
if (!url) return systemParse();
const start = Date.now();
addLog.info('Parsing files from an external service');
const data = new FormData();
data.append('file', buffer, {
filename: `file.${extension}`
});
const { data: response } = await axios.post<{
pages: number;
markdown: string;
error?: Object | string;
}>(url, data, {
timeout: 600000,
headers: {
...data.getHeaders(),
Authorization: token ? `Bearer ${token}` : undefined
}
});
if (response.error) {
return Promise.reject(response.error);
}
addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`);
const rawText = response.markdown;
const { text, imageList } = matchMdImg(rawText);
createPdfParseUsage({
teamId,
tmbId,
pages: response.pages,
usageId
});
return {
rawText: text,
formatText: text,
imageList
};
};
// Doc2x api
const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
if (!doc2xKey) return systemParse();
const { pages, text, imageList } = await useDoc2xServer({ apiKey: doc2xKey }).parsePDF(buffer);
createPdfParseUsage({
teamId,
tmbId,
pages,
usageId
});
return {
rawText: text,
formatText: text,
imageList
};
};
// Custom read file service
const pdfParseFn = async (): Promise<ReadFileResponse> => {
if (!customPdfParse) return systemParse();
if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();
return systemParse();
};
const start = Date.now();
addLog.debug(`Start parse file`, { extension });
let { rawText, formatText, imageList } = await (async () => {
if (extension === 'pdf') {
return await pdfParseFn();
}
return await systemParse();
})();
addLog.debug(`Parse file success, time: ${Date.now() - start}ms. `);
// markdown data format
if (imageList) {
await batchRun(imageList, async (item) => {
const src = await (async () => {
try {
return await uploadMongoImg({
base64Img: `data:${item.mime};base64,${item.base64}`,
teamId,
metadata: {
...metadata,
mime: item.mime
}
});
} catch (error) {
addLog.warn('Upload file image error', { error });
return 'Upload load image error';
}
})();
rawText = rawText.replace(item.uuid, src);
if (formatText) {
formatText = formatText.replace(item.uuid, src);
}
});
}
addLog.debug(`Upload file success, time: ${Date.now() - start}ms`);
return { rawText: getFormatText ? formatText || rawText : rawText };
};