Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 13:03:50 +00:00)
Feat: pptx and xlsx loader (#1118)
* perf: plan tip
* perf: upload size controller
* feat: add image ttl index
* feat: new upload file ux
* remove file
* feat: support read pptx
* feat: support xlsx
* fix: rerank docker file
@@ -1,40 +0,0 @@
import Papa from 'papaparse';
import { readFileRawText } from './rawText';

/**
 * read csv to json
 * @response {
 *   header: string[],
 *   data: string[][]
 * }
 */
export const readCsvContent = async ({ file }: { file: File }) => {
  try {
    const { rawText: textArr } = await readFileRawText(file);
    const csvArr = Papa.parse(textArr).data as string[][];
    if (csvArr.length === 0) {
      throw new Error('csv 解析失败'); // "CSV parsing failed"
    }

    const header = csvArr.shift() as string[];

    // prefix every cell with its column header so rows stay self-describing
    const rawText = csvArr
      .map((item) =>
        item.map((value, index) => {
          if (!header[index]) return value;
          return `${header[index]}: ${value}`;
        })
      )
      .flat()
      .join('\n');

    return {
      rawText,
      header,
      data: csvArr.map((item) => item)
    };
  } catch (error) {
    return Promise.reject('解析 csv 文件失败'); // "Failed to parse the CSV file"
  }
};
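The reader above flattens each row into "header: value" lines, so downstream chunking sees self-describing text. A minimal usage sketch (the sample CSV is made up for illustration and assumes a browser context where File and FileReader exist):

// Hypothetical usage of readCsvContent; the sample file is illustrative.
const file = new File(['name,age\nAlice,30\nBob,25'], 'people.csv', { type: 'text/csv' });
const { rawText, header, data } = await readCsvContent({ file });
// header  -> ['name', 'age']
// data    -> [['Alice', '30'], ['Bob', '25']]
// rawText -> 'name: Alice\nage: 30\nname: Bob\nage: 25'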
@@ -1,21 +0,0 @@
import { htmlStr2Md } from '../../string/markdown';
import { readFileRawText } from './rawText';
import { markdownProcess } from '@fastgpt/global/common/string/markdown';

export const readHtmlFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const { rawText } = await readFileRawText(file);
  const md = htmlStr2Md(rawText);

  const simpleMd = await markdownProcess({
    rawText: md,
    uploadImgController
  });

  return { rawText: simpleMd };
};
@@ -1,49 +0,0 @@
import { loadFile2Buffer } from '../utils';
import { readCsvContent } from './csv';
import { readHtmlFile } from './html';
import { readMdFile } from './md';
import { readPdfFile } from './pdf';
import { readFileRawText } from './rawText';
import { readWordFile } from './word';

export const readFileRawContent = async ({
  file,
  uploadBase64Controller
}: {
  file: File;
  uploadBase64Controller?: (base64: string) => Promise<string>;
}): Promise<{
  rawText: string;
}> => {
  const extension = file?.name?.split('.')?.pop()?.toLowerCase();

  switch (extension) {
    case 'txt':
      return readFileRawText(file);
    case 'md':
      return readMdFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    case 'html':
      return readHtmlFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    case 'csv':
      return readCsvContent({ file });
    case 'pdf': {
      // braces scope the const to this case (avoids no-case-declarations)
      const pdf = await loadFile2Buffer({ file });
      return readPdfFile({ pdf });
    }
    case 'docx':
      return readWordFile({
        file,
        uploadImgController: uploadBase64Controller
      });

    default:
      return {
        rawText: ''
      };
  }
};
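This deletion-only hunk does not show the pptx and xlsx branches the commit title announces; those land in the replacement reader. As a rough sketch, the dispatcher presumably gains cases along these lines (readPptxRawText and readXlsxRawText are assumed names, not confirmed by this diff):

// Hypothetical extension of the switch above; loader names are assumptions.
case 'pptx':
  return readPptxRawText({
    file,
    uploadImgController: uploadBase64Controller
  });
case 'xlsx':
  return readXlsxRawText({ file });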
@@ -1,17 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { readFileRawText } from './rawText';

export const readMdFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const { rawText: md } = await readFileRawText(file);
  const simpleMd = await markdownProcess({
    rawText: md,
    uploadImgController
  });
  return { rawText: simpleMd };
};
@@ -1,64 +0,0 @@
/* read file to txt */
import * as pdfjsLib from 'pdfjs-dist';

type TokenType = {
  str: string;
  dir: string;
  width: number;
  height: number;
  transform: number[];
  fontName: string;
  hasEOL: boolean;
};

export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
  pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';

  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();

    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.95;
    const footerThreshold = pageHeight * 0.05;

    // drop tokens in the top 5% (headers) and bottom 5% (footers) of the page
    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      return (
        !token.transform ||
        (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
      );
    });

    // merge empty-string tokens into the previous token's hasEOL flag
    for (let i = 0; i < pageTexts.length; i++) {
      const item = pageTexts[i];
      if (item.str === '' && pageTexts[i - 1]) {
        pageTexts[i - 1].hasEOL = item.hasEOL;
        pageTexts.splice(i, 1);
        i--;
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        // newline only when the token both ends a line and ends a sentence
        const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);

        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const doc = await pdfjsLib.getDocument(pdf).promise;
  const pageTextPromises = [];
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    pageTextPromises.push(readPDFPage(doc, pageNo));
  }
  const pageTexts = await Promise.all(pageTextPromises);

  return {
    rawText: pageTexts.join('')
  };
};
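To make the header/footer filter concrete: PDF user space puts the origin at the bottom-left corner and transform[5] is a token's vertical position, so on an A4 page (height ≈ 842pt) only tokens between roughly 42pt and 800pt survive; the top and bottom 5% of the page are discarded. A minimal caller sketch (the URL is a placeholder):

// Hypothetical caller; '/example.pdf' is a placeholder URL.
const res = await fetch('/example.pdf');
const pdf = await res.arrayBuffer();
const { rawText } = await readPdfFile({ pdf });
console.log(rawText.slice(0, 200));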
@@ -1,36 +0,0 @@
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';

/**
 * read file raw text
 */
export const readFileRawText = (file: File) => {
  return new Promise<{ rawText: string }>((resolve, reject) => {
    try {
      const reader = new FileReader();
      reader.onload = () => {
        //@ts-ignore
        const encode = detectFileEncoding(reader.result);

        // read the file again, this time with the detected encoding
        const reader2 = new FileReader();
        reader2.onload = () => {
          resolve({
            rawText: reader2.result as string
          });
        };
        reader2.onerror = (err) => {
          console.log('Error reading file with detected encoding:', err);
          reject('Read file error with detected encoding');
        };
        reader2.readAsText(file, encode);
      };
      reader.onerror = (err) => {
        console.log('error txt read:', err);
        reject('Read file error');
      };
      reader.readAsBinaryString(file);
    } catch (error) {
      reject(error);
    }
  });
};
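Worth noting: FileReader.readAsText needs the encoding up front, which is why the file is read twice: a binary pass so detectFileEncoding can sniff the bytes, then a text pass that decodes with the detected encoding. A minimal usage sketch (assumes a browser context and that detectFileEncoding recognizes the bytes):

// Hypothetical usage; requires a browser context with File and FileReader.
const file = new File(['héllo wörld'], 'note.txt', { type: 'text/plain' });
const { rawText } = await readFileRawText(file);
// rawText -> 'héllo wörld' when the encoding is detected correctly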
@@ -1,28 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '../../string/markdown';
import { loadFile2Buffer } from '../utils';
import mammoth from 'mammoth';

export const readWordFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const buffer = await loadFile2Buffer({ file });

  const { value: html } = await mammoth.convertToHtml({
    arrayBuffer: buffer
  });
  const md = htmlStr2Md(html);

  const rawText = await markdownProcess({
    rawText: md,
    uploadImgController: uploadImgController
  });

  return {
    rawText
  };
};