fix: upload file (#2992)

* fix: upload file

* chore: remove wasm, support html image parse

* chore: adjust

* chore: move base64match function into htmlstr2md
This commit is contained in:
Finley Ge
2024-10-28 21:44:50 +08:00
committed by GitHub
parent 4e3d817b63
commit b712a821f8
8 changed files with 440 additions and 240 deletions

View File

@@ -1,7 +1,14 @@
import TurndownService from 'turndown';
import { ImageType } from '../readFile/type';
// @ts-ignore
const turndownPluginGfm = require('joplin-turndown-plugin-gfm');
export const html2md = (html: string): string => {
export const html2md = (
html: string
): {
rawText: string;
imageList: ImageType[];
} => {
const turndownService = new TurndownService({
headingStyle: 'atx',
bulletListMarker: '-',
@@ -15,12 +22,32 @@ export const html2md = (html: string): string => {
try {
turndownService.remove(['i', 'script', 'iframe', 'style']);
turndownService.use(turndownPluginGfm.gfm);
return turndownService.turndown(html);
const base64Regex = /"(data:image\/[^;]+;base64[^"]+)"/g;
const imageList: ImageType[] = [];
const images = Array.from(html.match(base64Regex) || []);
for (const image of images) {
const uuid = crypto.randomUUID();
const mime = image.split(';')[0].split(':')[1];
const base64 = image.split(',')[1];
html = html.replace(image, uuid);
imageList.push({
uuid,
base64,
mime
});
}
return {
rawText: turndownService.turndown(html),
imageList
};
} catch (error) {
console.log('html 2 markdown error', error);
return '';
return {
rawText: '',
imageList: []
};
}
};

View File

@@ -1,20 +1,39 @@
import mammoth from 'mammoth';
import { ReadRawTextByBuffer, ReadFileResponse } from '../type';
import mammoth, { images } from 'mammoth';
import { ReadRawTextByBuffer, ReadFileResponse, ImageType } from '../type';
import { html2md } from '../../htmlStr2Md/utils';
/**
* read docx to markdown
*/
export const readDocsFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const imageList: ImageType[] = [];
try {
const { value: html } = await mammoth.convertToHtml({
buffer
});
const { value: html } = await mammoth.convertToHtml(
{
buffer
},
{
convertImage: images.imgElement(async (image) => {
const imageBase64 = await image.readAsBase64String();
const uuid = crypto.randomUUID();
const mime = image.contentType;
imageList.push({
uuid,
base64: imageBase64,
mime
});
return {
src: uuid
};
})
}
);
const rawText = html2md(html);
const { rawText } = html2md(html);
return {
rawText
rawText,
imageList
};
} catch (error) {
console.log('error doc read:', error);

View File

@@ -5,9 +5,10 @@ import { html2md } from '../../htmlStr2Md/utils';
/**
 * Read an HTML file and convert its content to markdown.
 *
 * The HTML string is obtained from readFileRawText(params) and passed to
 * html2md. In the destructured form below, html2md yields both the markdown
 * text (rawText) and the images it extracted (imageList), and both are
 * returned to the caller.
 *
 * NOTE(review): this span comes from a rendered diff without +/- markers, so
 * the pre-change and post-change statements appear side by side.
 */
export const readHtmlRawText = async (params: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
const { rawText: html } = readFileRawText(params);
const rawText = html2md(html);
const { rawText, imageList } = html2md(html);
return {
rawText
rawText,
imageList
};
};

View File

@@ -8,7 +8,14 @@ export type ReadRawTextProps<T> = {
/** ReadRawTextProps specialised to a Buffer type argument. */
export type ReadRawTextByBuffer = ReadRawTextProps<Buffer>;
/**
 * An image pulled out of a parsed file and returned separately from its text.
 */
export type ImageType = {
// Identifier created with crypto.randomUUID(); the readers put it in the
// converted content in place of the image data (e.g. as the <img> src).
uuid: string;
// Image payload as a base64 string.
base64: string;
// MIME type of the image (the docx reader stores mammoth's image.contentType here).
mime: string;
};
/**
 * Result shape returned by the file readers (e.g. readDocsFile, readHtmlRawText).
 */
export type ReadFileResponse = {
// Text extracted from the file; the HTML and docx readers return markdown here.
rawText: string;
// Optional formatted variant of the text.
// NOTE(review): none of the readers visible here populate it — confirm usage.
formatText?: string;
// Optional images collected while reading; each entry is referenced from the
// text by its uuid.
imageList?: ImageType[];
};