mirror of
https://github.com/labring/FastGPT.git
synced 2026-05-16 01:09:01 +08:00
58000324e2
* feat(marketplace): update plugin/ download count statistic (#5957) * feat: download count * feat: update ui * fix: ui * chore: update sdk verison * chore: update .env.template * chore: adjust * chore: remove console.log * chore: adjust * Update projects/marketplace/src/pages/index.tsx Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update projects/marketplace/src/pages/index.tsx Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update projects/app/src/pages/config/tool/marketplace.tsx Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * fix: update refresh; feat: marketplace download count per hour --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * download * marketplace code * fix: ui (#5963) * feat: support dataset and files as global variables (#5961) * json & dataset * file * fix file var * fix * fix init * remove * perf: file vars * fix: file uploading errors (#5969) * fix: file uploading errors * fix build * perf: fileselector ux * feat: integrate S3 for dataset with compatibility (#5941) * fix: text split * remove test * feat: integrate S3 for dataset with compatibility * fix: delay s3 files delete timing * fix: remove imageKeys * fix: remove parsed images' TTL * fix: improve codes by pr comments --------- Co-authored-by: archer <545436317@qq.com> * remove log * perf: request limit * chore: s3 migration script (#5971) * test * perf: s3 code * fix: migration script (#5972) * perf: s3 move object * wip: fix s3 bugs (#5976) * fix: incorrect replace origin logic (#5978) * fix: add downloadURL (#5980) * perf: file variable ttl & quick create dataset with temp s3 bucket (#5973) * perf: file variable ttl & quick create dataset with temp s3 bucket * fix * plugin & form input variables (#5979) * plugin & form input variables * fix * docs: 4143.mdx (#5981) * doc: update 4143.mdx (#5982) * fix form input file ttl (#5983) * trans file type (#5986) * trans file type * fix * fix: 
S3 script early return (#5985) * fix: S3 script typeof * fix: truncate large filename to fit S3 name * perf(permission): add a schema verification for resource permission, tmbId, groupId, orgId should be set at least one of them (#5987) * fix: version & typo (#5988) * fix-v4.14.3 (#5991) * fix: empty alt make replace JWT failed & incorrect image dataset preview url (#5989) * fix: empty alt make replace JWT failed & incorrect image dataset preview url * fix: s3 files recovery script * fix: incorrect chat external url parsing (#5993) --------- Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: heheer <heheer@sealos.io> Co-authored-by: Roy <whoeverimf5@gmail.com>
203 lines
5.4 KiB
TypeScript
203 lines
5.4 KiB
TypeScript
import FormData from 'form-data';
|
|
import fs from 'fs';
|
|
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
|
import axios from 'axios';
|
|
import { addLog } from '../../system/log';
|
|
import { batchRun } from '@fastgpt/global/common/system/utils';
|
|
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
|
|
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
|
|
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
|
|
import { readRawContentFromBuffer } from '../../../worker/function';
|
|
import { uploadImage2S3Bucket } from '../../s3/utils';
|
|
import { Mimes } from '../../s3/constants';
|
|
|
|
/**
 * Arguments accepted by `readRawTextByLocalFile`.
 */
export type readRawTextByLocalFileParams = {
  /** Team the parse is attributed to; forwarded to the buffer parser. */
  teamId: string;
  /** Team member the parse is attributed to; forwarded to the buffer parser. */
  tmbId: string;
  /** Location of the file on the local filesystem. The file extension is derived from it. */
  path: string;
  /** Text encoding forwarded to the buffer parser. */
  encoding: string;
  /** Opt in to the custom PDF parsing path; forwarded to the buffer parser. */
  customPdfParse?: boolean;
  /** Forwarded to the buffer parser, which uses it to choose between formatted and raw text. */
  getFormatText?: boolean;
  /** When set, passed to the buffer parser as `imageKeyOptions.prefix`. */
  fileParsedPrefix?: string;
  /** Free-form extra data. NOTE(review): not read by `readRawTextByLocalFile` itself. */
  metadata?: Record<string, any>;
};
|
|
/**
 * Read a file from the local filesystem and extract its text.
 *
 * The whole file is loaded into memory and handed to `readS3FileContentByBuffer`
 * together with the lower-cased file extension.
 *
 * @param params - See `readRawTextByLocalFileParams`. `metadata` is accepted but not used here.
 * @returns The value resolved by `readS3FileContentByBuffer` for the file's content.
 * @throws Rejects when the file cannot be read or when the underlying parser rejects.
 */
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
  const { path, fileParsedPrefix } = params;

  // Derive the extension from the last path segment only. Splitting the full
  // path on '.' would turn "/tmp/README" into the extension "/tmp/readme" and
  // "/data.v2/file" into "v2/file".
  const fileName = path?.split(/[\\/]/).pop() ?? '';
  const dotIndex = fileName.lastIndexOf('.');
  const extension = dotIndex === -1 ? '' : fileName.slice(dotIndex + 1).toLowerCase();

  const buffer = await fs.promises.readFile(path);

  return readS3FileContentByBuffer({
    extension,
    customPdfParse: params.customPdfParse,
    getFormatText: params.getFormatText,
    teamId: params.teamId,
    tmbId: params.tmbId,
    encoding: params.encoding,
    buffer,
    // Image options are only supplied when the caller provided a prefix.
    imageKeyOptions: fileParsedPrefix ? { prefix: fileParsedPrefix } : undefined
  });
};
|
|
|
|
/**
 * Extract text from an in-memory file.
 *
 * Parsing strategy:
 * - Non-PDF files always go through `readRawContentFromBuffer`.
 * - PDF files use `readRawContentFromBuffer` unless `customPdfParse` is true, in which
 *   case the configured custom service URL is tried first, then the Doc2x key, and
 *   finally the built-in parser when neither is configured.
 *
 * Images reported by the parser are referenced in the text by their `uuid`. Each
 * reference is replaced with the value returned by `uploadImage2S3Bucket`, with an
 * empty string when `imageKeyOptions` is not supplied, or with an
 * `[Image Upload Failed: <uuid>]` marker when the upload throws.
 *
 * @param teamId - Passed to `createPdfParseUsage` for custom/Doc2x PDF parsing.
 * @param tmbId - Passed to `createPdfParseUsage` for custom/Doc2x PDF parsing.
 * @param extension - File extension without the dot; `'pdf'` selects the PDF path.
 * @param buffer - Raw file content.
 * @param encoding - Encoding forwarded to `readRawContentFromBuffer`.
 * @param customPdfParse - Enable the external PDF parsers. Defaults to `false`.
 * @param usageId - Optional usage record id forwarded to `createPdfParseUsage`.
 * @param getFormatText - When true (default), prefer the parser's `formatText` and fall
 *   back to `rawText` if it is empty.
 * @param imageKeyOptions - Upload key prefix and optional expiry for extracted images.
 * @returns An object holding the extracted text in `rawText`.
 * @throws Rejects when the selected parser rejects, or with the `error` payload returned
 *   by the custom PDF service.
 */
export const readS3FileContentByBuffer = async ({
  teamId,
  tmbId,

  extension,
  buffer,
  encoding,
  customPdfParse = false,
  usageId,
  getFormatText = true,
  imageKeyOptions
}: {
  teamId: string;
  tmbId: string;

  extension: string;
  buffer: Buffer;
  encoding: string;

  customPdfParse?: boolean;
  usageId?: string;
  getFormatText?: boolean;
  imageKeyOptions?: {
    prefix: string;
    expiredTime?: Date;
  };
}): Promise<{
  rawText: string;
}> => {
  // Built-in parser; also the fallback for every branch below.
  const systemParse = () =>
    readRawContentFromBuffer({
      extension,
      encoding,
      buffer
    });

  // User-configured HTTP service that returns markdown for an uploaded file.
  const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
    const url = global.systemEnv.customPdfParse?.url;
    const token = global.systemEnv.customPdfParse?.key;
    if (!url) return systemParse();

    const start = Date.now();
    addLog.info('Parsing files from an external service');

    const data = new FormData();
    data.append('file', buffer, {
      filename: `file.${extension}`
    });
    const { data: response } = await axios.post<{
      pages: number;
      markdown: string;
      error?: Object | string;
    }>(url, data, {
      // 10 minutes
      timeout: 600000,
      headers: {
        ...data.getHeaders(),
        Authorization: token ? `Bearer ${token}` : undefined
      }
    });

    if (response.error) {
      return Promise.reject(response.error);
    }

    addLog.info(`Custom file parsing is complete, time: ${Date.now() - start}ms`);

    const rawText = response.markdown;
    const { text, imageList } = matchMdImg(rawText);

    // Not awaited: usage recording does not delay the parse result.
    createPdfParseUsage({
      teamId,
      tmbId,
      pages: response.pages,
      usageId
    });

    return {
      rawText: text,
      formatText: text,
      imageList
    };
  };

  // Doc2x api
  const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
    const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
    if (!doc2xKey) return systemParse();

    const { pages, text, imageList } = await useDoc2xServer({ apiKey: doc2xKey }).parsePDF(buffer);

    // Not awaited: usage recording does not delay the parse result.
    createPdfParseUsage({
      teamId,
      tmbId,
      pages,
      usageId
    });

    return {
      rawText: text,
      formatText: text,
      imageList
    };
  };

  // Custom read file service: the custom URL takes priority over Doc2x.
  const pdfParseFn = async (): Promise<ReadFileResponse> => {
    if (!customPdfParse) return systemParse();
    if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
    if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();

    return systemParse();
  };

  const start = Date.now();
  addLog.debug(`Start parse file`, { extension });

  const parsed = await (extension === 'pdf' ? pdfParseFn() : systemParse());
  let rawText = parsed.rawText;
  let formatText = parsed.formatText;
  const imageList = parsed.imageList;

  addLog.debug(`Parse file success, time: ${Date.now() - start}ms. `);

  // markdown data format
  if (imageList && imageList.length > 0) {
    addLog.debug(`Processing ${imageList.length} images from parsed document`);

    await batchRun(imageList, async (item) => {
      const src = await (async () => {
        // Without upload options the image reference is blanked out.
        if (!imageKeyOptions) return '';
        try {
          const { prefix, expiredTime } = imageKeyOptions;
          // e.g. "image/x-icon" -> ".icon", "image/png" -> ".png"
          const ext = `.${item.mime.split('/')[1].replace('x-', '')}`;

          return await uploadImage2S3Bucket('private', {
            base64Img: `data:${item.mime};base64,${item.base64}`,
            uploadKey: `${prefix}/${item.uuid}${ext}`,
            mimetype: Mimes[ext as keyof typeof Mimes],
            filename: `${item.uuid}${ext}`,
            expiredTime
          });
        } catch (error) {
          // Best effort: keep parsing, but leave a trace of why the image is missing.
          addLog.warn(`Image upload failed: ${item.uuid}`, { error });
          return `[Image Upload Failed: ${item.uuid}]`;
        }
      })();

      // Use a replacer function so that "$" sequences inside `src` (e.g. "$&", "$$")
      // are inserted literally instead of being treated as replacement patterns.
      rawText = rawText.replace(item.uuid, () => src);
      if (formatText) {
        formatText = formatText.replace(item.uuid, () => src);
      }
    });
  }

  return {
    rawText: getFormatText ? formatText || rawText : rawText
  };
};
|