4.6.7 first pr (#726)

This commit is contained in:
Archer
2024-01-10 23:35:04 +08:00
committed by GitHub
parent 414b693303
commit 006ad17c6a
186 changed files with 2996 additions and 1838 deletions

View File

@@ -4,15 +4,13 @@ export type CompressImgProps = {
maxSize?: number;
};
export const compressBase64ImgAndUpload = ({
export const compressBase64Img = ({
base64Img,
maxW = 1080,
maxH = 1080,
maxSize = 1024 * 500, // 300kb
uploadController
maxSize = 1024 * 500 // 500kb
}: CompressImgProps & {
base64Img: string;
uploadController: (base64: string) => Promise<string>;
}) => {
return new Promise<string>((resolve, reject) => {
const fileType =
@@ -54,12 +52,7 @@ export const compressBase64ImgAndUpload = ({
return reject('图片太大了');
}
try {
const src = await uploadController(compressedDataUrl);
resolve(src);
} catch (error) {
reject(error);
}
resolve(compressedDataUrl);
};
img.onerror = reject;
});

View File

@@ -1,53 +0,0 @@
import { uploadMarkdownBase64 } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '../string/markdown';
/**
* read file raw text
*/
export const readFileRawText = (file: File) => {
return new Promise((resolve: (_: string) => void, reject) => {
try {
const reader = new FileReader();
reader.onload = () => {
resolve(reader.result as string);
};
reader.onerror = (err) => {
console.log('error txt read:', err);
reject('Read file error');
};
reader.readAsText(file);
} catch (error) {
reject(error);
}
});
};
export const readMdFile = async ({
file,
uploadImgController
}: {
file: File;
uploadImgController: (base64: string) => Promise<string>;
}) => {
const md = await readFileRawText(file);
const rawText = await uploadMarkdownBase64({
rawText: md,
uploadImgController
});
return rawText;
};
export const readHtmlFile = async ({
file,
uploadImgController
}: {
file: File;
uploadImgController: (base64: string) => Promise<string>;
}) => {
const md = htmlStr2Md(await readFileRawText(file));
const rawText = await uploadMarkdownBase64({
rawText: md,
uploadImgController
});
return rawText;
};

View File

@@ -0,0 +1,21 @@
import { htmlStr2Md } from '../../string/markdown';
import { readFileRawText } from './rawText';
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
export const readHtmlFile = async ({
file,
uploadImgController
}: {
file: File;
uploadImgController?: (base64: string) => Promise<string>;
}) => {
const { rawText } = await readFileRawText(file);
const md = htmlStr2Md(rawText);
const simpleMd = await markdownProcess({
rawText: md,
uploadImgController
});
return { rawText: rawText };
};

View File

@@ -0,0 +1,46 @@
import { loadFile2Buffer } from '../utils';
import { readHtmlFile } from './html';
import { readMdFile } from './md';
import { readPdfFile } from './pdf';
import { readFileRawText } from './rawText';
import { readWordFile } from './word';
export const readFileRawContent = async ({
file,
uploadBase64Controller
}: {
file: File;
uploadBase64Controller?: (base64: string) => Promise<string>;
}): Promise<{
rawText: string;
}> => {
const extension = file?.name?.split('.')?.pop()?.toLowerCase();
switch (extension) {
case 'txt':
return readFileRawText(file);
case 'md':
return readMdFile({
file,
uploadImgController: uploadBase64Controller
});
case 'html':
return readHtmlFile({
file,
uploadImgController: uploadBase64Controller
});
case 'pdf':
const pdf = await loadFile2Buffer({ file });
return readPdfFile({ pdf });
case 'docx':
return readWordFile({
file,
uploadImgController: uploadBase64Controller
});
default:
return {
rawText: ''
};
}
};

View File

@@ -0,0 +1,17 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { readFileRawText } from './rawText';
export const readMdFile = async ({
file,
uploadImgController
}: {
file: File;
uploadImgController?: (base64: string) => Promise<string>;
}) => {
const { rawText: md } = await readFileRawText(file);
const simpleMd = await markdownProcess({
rawText: md,
uploadImgController
});
return { rawText: simpleMd };
};

View File

@@ -0,0 +1,64 @@
/* read file to txt */
import * as pdfjsLib from 'pdfjs-dist';
type TokenType = {
str: string;
dir: string;
width: number;
height: number;
transform: number[];
fontName: string;
hasEOL: boolean;
};
export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';
const readPDFPage = async (doc: any, pageNo: number) => {
const page = await doc.getPage(pageNo);
const tokenizedText = await page.getTextContent();
const viewport = page.getViewport({ scale: 1 });
const pageHeight = viewport.height;
const headerThreshold = pageHeight * 0.95;
const footerThreshold = pageHeight * 0.05;
const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
return (
!token.transform ||
(token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
);
});
// concat empty string 'hasEOL'
for (let i = 0; i < pageTexts.length; i++) {
const item = pageTexts[i];
if (item.str === '' && pageTexts[i - 1]) {
pageTexts[i - 1].hasEOL = item.hasEOL;
pageTexts.splice(i, 1);
i--;
}
}
page.cleanup();
return pageTexts
.map((token) => {
const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
return paragraphEnd ? `${token.str}\n` : token.str;
})
.join('');
};
const doc = await pdfjsLib.getDocument(pdf).promise;
const pageTextPromises = [];
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
pageTextPromises.push(readPDFPage(doc, pageNo));
}
const pageTexts = await Promise.all(pageTextPromises);
return {
rawText: pageTexts.join('')
};
};

View File

@@ -0,0 +1,22 @@
/**
* read file raw text
*/
export const readFileRawText = (file: File) => {
return new Promise<{ rawText: string }>((resolve, reject) => {
try {
const reader = new FileReader();
reader.onload = () => {
resolve({
rawText: reader.result as string
});
};
reader.onerror = (err) => {
console.log('error txt read:', err);
reject('Read file error');
};
reader.readAsText(file);
} catch (error) {
reject(error);
}
});
};

View File

@@ -0,0 +1,28 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '../../string/markdown';
import { loadFile2Buffer } from '../utils';
import mammoth from 'mammoth';
export const readWordFile = async ({
file,
uploadImgController
}: {
file: File;
uploadImgController?: (base64: string) => Promise<string>;
}) => {
const buffer = await loadFile2Buffer({ file });
const { value: html } = await mammoth.convertToHtml({
arrayBuffer: buffer
});
const md = htmlStr2Md(html);
const rawText = await markdownProcess({
rawText: md,
uploadImgController: uploadImgController
});
return {
rawText
};
};

View File

@@ -0,0 +1,31 @@
import { getErrText } from '@fastgpt/global/common/error/utils';
export const loadFile2Buffer = ({ file, onError }: { file: File; onError?: (err: any) => void }) =>
new Promise<ArrayBuffer>((resolve, reject) => {
try {
let reader = new FileReader();
reader.readAsArrayBuffer(file);
reader.onload = async ({ target }) => {
if (!target?.result) {
onError?.('Load file error');
return reject('Load file error');
}
try {
resolve(target.result as ArrayBuffer);
} catch (err) {
console.log(err, 'Load file error');
onError?.(err);
reject(getErrText(err, 'Load file error'));
}
};
reader.onerror = (err) => {
console.log(err, 'Load file error');
onError?.(err);
reject(getErrText(err, 'Load file error'));
};
} catch (error) {
reject('The browser does not support file content reading');
}
});