Files
FastGPT/packages/service/common/file/read/utils.ts
T
Archer 58000324e2 feature: V4.14.3 (#5970)
* feat(marketplace): update plugin/ download count statistic (#5957)

* feat: download count

* feat: update ui

* fix: ui

* chore: update sdk verison

* chore: update .env.template

* chore: adjust

* chore: remove console.log

* chore: adjust

* Update projects/marketplace/src/pages/index.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update projects/marketplace/src/pages/index.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update projects/app/src/pages/config/tool/marketplace.tsx

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix: update refresh; feat: marketplace download count per hour

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* download

* marketplace code

* fix: ui (#5963)

* feat: support dataset and files as global variables (#5961)

* json & dataset

* file

* fix file var

* fix

* fix init

* remove

* perf: file vars

* fix: file uploading errors (#5969)

* fix: file uploading errors

* fix build

* perf: fileselector ux

* feat: integrate S3 for dataset with compatibility (#5941)

* fix: text split

* remove test

* feat: integrate S3 for dataset with compatibility

* fix: delay s3 files delete timing

* fix: remove imageKeys

* fix: remove parsed images' TTL

* fix: improve codes by pr comments

---------

Co-authored-by: archer <545436317@qq.com>

* remove log

* perf: request limit

* chore: s3 migration script (#5971)

* test

* perf: s3 code

* fix: migration script (#5972)

* perf: s3 move object

* wip: fix s3 bugs (#5976)

* fix: incorrect replace origin logic (#5978)

* fix: add downloadURL (#5980)

* perf: file variable ttl & quick create dataset with temp s3 bucket (#5973)

* perf: file variable ttl & quick create dataset with temp s3 bucket

* fix

* plugin & form input variables (#5979)

* plugin & form input variables

* fix

* docs: 4143.mdx (#5981)

* doc: update 4143.mdx (#5982)

* fix form input file ttl (#5983)

* trans file type (#5986)

* trans file type

* fix

* fix: S3 script early return (#5985)

* fix: S3 script typeof

* fix: truncate large filename to fit S3 name

* perf(permission): add a schema verification for resource permission, tmbId, groupId, orgId should be set at least one of them (#5987)

* fix: version & typo (#5988)

* fix-v4.14.3 (#5991)

* fix: empty alt make replace JWT failed & incorrect image dataset preview url (#5989)

* fix: empty alt make replace JWT failed & incorrect image dataset preview url

* fix: s3 files recovery script

* fix: incorrect chat external url parsing (#5993)

---------

Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Roy <whoeverimf5@gmail.com>
2025-11-26 20:47:28 +08:00

203 lines
5.4 KiB
TypeScript

import FormData from 'form-data';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import axios from 'axios';
import { addLog } from '../../system/log';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
import { readRawContentFromBuffer } from '../../../worker/function';
import { uploadImage2S3Bucket } from '../../s3/utils';
import { Mimes } from '../../s3/constants';
export type readRawTextByLocalFileParams = {
teamId: string;
tmbId: string;
path: string;
encoding: string;
customPdfParse?: boolean;
getFormatText?: boolean;
fileParsedPrefix?: string;
metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
const { path } = params;
const extension = path?.split('.')?.pop()?.toLowerCase() || '';
const buffer = await fs.promises.readFile(path);
return readS3FileContentByBuffer({
extension,
customPdfParse: params.customPdfParse,
getFormatText: params.getFormatText,
teamId: params.teamId,
tmbId: params.tmbId,
encoding: params.encoding,
buffer,
imageKeyOptions: params.fileParsedPrefix
? {
prefix: params.fileParsedPrefix
}
: undefined
});
};
export const readS3FileContentByBuffer = async ({
teamId,
tmbId,
extension,
buffer,
encoding,
customPdfParse = false,
usageId,
getFormatText = true,
imageKeyOptions
}: {
teamId: string;
tmbId: string;
extension: string;
buffer: Buffer;
encoding: string;
customPdfParse?: boolean;
usageId?: string;
getFormatText?: boolean;
imageKeyOptions?: {
prefix: string;
expiredTime?: Date;
};
}): Promise<{
rawText: string;
}> => {
const systemParse = () =>
readRawContentFromBuffer({
extension,
encoding,
buffer
});
const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
const url = global.systemEnv.customPdfParse?.url;
const token = global.systemEnv.customPdfParse?.key;
if (!url) return systemParse();
const start = Date.now();
addLog.info('Parsing files from an external service');
const data = new FormData();
data.append('file', buffer, {
filename: `file.${extension}`
});
const { data: response } = await axios.post<{
pages: number;
markdown: string;
error?: Object | string;
}>(url, data, {
timeout: 600000,
headers: {
...data.getHeaders(),
Authorization: token ? `Bearer ${token}` : undefined
}
});
if (response.error) {
return Promise.reject(response.error);
}
addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`);
const rawText = response.markdown;
const { text, imageList } = matchMdImg(rawText);
createPdfParseUsage({
teamId,
tmbId,
pages: response.pages,
usageId
});
return {
rawText: text,
formatText: text,
imageList
};
};
// Doc2x api
const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
if (!doc2xKey) return systemParse();
const { pages, text, imageList } = await useDoc2xServer({ apiKey: doc2xKey }).parsePDF(buffer);
createPdfParseUsage({
teamId,
tmbId,
pages,
usageId
});
return {
rawText: text,
formatText: text,
imageList
};
};
// Custom read file service
const pdfParseFn = async (): Promise<ReadFileResponse> => {
if (!customPdfParse) return systemParse();
if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();
return systemParse();
};
const start = Date.now();
addLog.debug(`Start parse file`, { extension });
let { rawText, formatText, imageList } = await (async () => {
if (extension === 'pdf') {
return await pdfParseFn();
}
return await systemParse();
})();
addLog.debug(`Parse file success, time: ${Date.now() - start}ms. `);
// markdown data format
if (imageList && imageList.length > 0) {
addLog.debug(`Processing ${imageList.length} images from parsed document`);
await batchRun(imageList, async (item) => {
const src = await (async () => {
if (!imageKeyOptions) return '';
try {
const { prefix, expiredTime } = imageKeyOptions;
const ext = `.${item.mime.split('/')[1].replace('x-', '')}`;
return await uploadImage2S3Bucket('private', {
base64Img: `data:${item.mime};base64,${item.base64}`,
uploadKey: `${prefix}/${item.uuid}${ext}`,
mimetype: Mimes[ext as keyof typeof Mimes],
filename: `${item.uuid}${ext}`,
expiredTime
});
} catch (error) {
return `[Image Upload Failed: ${item.uuid}]`;
}
})();
rawText = rawText.replace(item.uuid, src);
// rawText = rawText.replace(item.uuid, jwtSignS3ObjectKey(src, addDays(new Date(), 90)));
if (formatText) {
formatText = formatText.replace(item.uuid, src);
}
});
}
return {
rawText: getFormatText ? formatText || rawText : rawText
};
};