Files
FastGPT/packages/service/worker/htmlStr2Md/utils.ts
Archer d05259dedd perf: retry to load image;perf: default index check (#4004)
* perf: retry to load image

* perf: default index check
2025-03-06 18:28:09 +08:00

63 lines
1.6 KiB
TypeScript

import TurndownService from 'turndown';
import { ImageType } from '../readFile/type';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import { getNanoid } from '@fastgpt/global/common/string/tools';
// @ts-ignore
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
const processBase64Images = (htmlContent: string) => {
const base64Regex = /src="data:([^;]+);base64,([^"]+)"/g;
const images: ImageType[] = [];
const processedHtml = htmlContent.replace(base64Regex, (match, mime, base64Data) => {
const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
images.push({
uuid,
base64: base64Data,
mime
});
return `src="${uuid}"`;
});
return { processedHtml, images };
};
export const html2md = (
html: string
): {
rawText: string;
imageList: ImageType[];
} => {
const turndownService = new TurndownService({
headingStyle: 'atx',
bulletListMarker: '-',
codeBlockStyle: 'fenced',
fence: '```',
emDelimiter: '_',
strongDelimiter: '**',
linkStyle: 'inlined',
linkReferenceStyle: 'full'
});
try {
turndownService.remove(['i', 'script', 'iframe', 'style']);
turndownService.use(turndownPluginGfm.gfm);
// Base64 img to id, otherwise it will occupy memory when going to md
const { processedHtml, images } = processBase64Images(html);
const md = turndownService.turndown(processedHtml);
const { text, imageList } = matchMdImg(md);
return {
rawText: text,
imageList: [...images, ...imageList]
};
} catch (error) {
console.log('html 2 markdown error', error);
return {
rawText: '',
imageList: []
};
}
};