Feat: pptx and xlsx loader (#1118)

* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support reading pptx

* feat: support xlsx (hedged sketches of possible xlsx and pptx loaders appear after the commit stats below and at the end of this diff)

* fix: rerank docker file
Archer authored 2024-04-01 19:01:26 +08:00, committed by GitHub
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions
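
The new pptx and xlsx loaders named in the commit message are not part of the excerpt below, which only shows the old reader files being removed. As a rough illustration of what a spreadsheet reader in the same File -> { rawText } style could look like, here is a minimal sketch using the SheetJS xlsx package; the package choice, the readXlsxRawText name, and the CSV-style output are assumptions, not the commit's actual implementation.

import * as XLSX from 'xlsx';
import { loadFile2Buffer } from '../utils';

/* Hypothetical xlsx reader: flatten every sheet to CSV-style text. */
export const readXlsxRawText = async ({ file }: { file: File }) => {
  // loadFile2Buffer is the same helper the pdf/word readers below rely on
  const buffer = await loadFile2Buffer({ file });
  const workbook = XLSX.read(new Uint8Array(buffer), { type: 'array' });

  // join all sheets, prefixing each with its name so context is kept
  const rawText = workbook.SheetNames.map((name) => {
    const csv = XLSX.utils.sheet_to_csv(workbook.Sheets[name]);
    return `${name}\n${csv}`;
  }).join('\n\n');

  return { rawText };
};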

View File

@@ -1,40 +0,0 @@
import Papa from 'papaparse';
import { readFileRawText } from './rawText';

/**
 * read csv to json
 * @returns {
 *   header: string[],
 *   data: string[][]
 * }
 */
export const readCsvContent = async ({ file }: { file: File }) => {
  try {
    const { rawText: textArr } = await readFileRawText(file);
    const csvArr = Papa.parse(textArr).data as string[][];
    if (csvArr.length === 0) {
      throw new Error('Failed to parse csv');
    }

    const header = csvArr.shift() as string[];

    // prefix each cell value with its column header
    const rawText = csvArr
      .map((item) =>
        item.map((value, index) => {
          if (!header[index]) return value;
          return `${header[index]}: ${value}`;
        })
      )
      .flat()
      .join('\n');

    return {
      rawText,
      header,
      data: csvArr.map((item) => item)
    };
  } catch (error) {
    return Promise.reject('Failed to parse csv file');
  }
};
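
For reference, the header-prefixing above turns every cell into a `header: value` line. A small usage example (the sample data is invented):

// Given a csv file containing:
//   name,age
//   Tom,18
const { rawText, header, data } = await readCsvContent({ file });
// header  -> ['name', 'age']
// data    -> [['Tom', '18']]
// rawText -> 'name: Tom\nage: 18'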

View File

@@ -1,21 +0,0 @@
import { htmlStr2Md } from '../../string/markdown';
import { readFileRawText } from './rawText';
import { markdownProcess } from '@fastgpt/global/common/string/markdown';

export const readHtmlFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const { rawText } = await readFileRawText(file);
  const md = htmlStr2Md(rawText);

  const simpleMd = await markdownProcess({
    rawText: md,
    uploadImgController
  });

  return { rawText: simpleMd };
};
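
Several readers in this diff (html, md, word) take an optional uploadImgController that receives a base64 image found in the markdown and is expected to return a URL to substitute for it. A minimal sketch of such a controller, assuming a hypothetical upload endpoint that is not part of this diff:

// Hypothetical controller: post a base64 data URI to an assumed endpoint
// and return the hosted URL that will replace the inline image.
const uploadImgController = async (base64: string): Promise<string> => {
  const res = await fetch('/api/common/file/uploadImage', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ base64 })
  });
  const { url } = await res.json();
  return url;
};

// usage (inside an async context, given a File named file):
const { rawText } = await readHtmlFile({ file, uploadImgController });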

View File

@@ -1,49 +0,0 @@
import { loadFile2Buffer } from '../utils';
import { readCsvContent } from './csv';
import { readHtmlFile } from './html';
import { readMdFile } from './md';
import { readPdfFile } from './pdf';
import { readFileRawText } from './rawText';
import { readWordFile } from './word';

export const readFileRawContent = async ({
  file,
  uploadBase64Controller
}: {
  file: File;
  uploadBase64Controller?: (base64: string) => Promise<string>;
}): Promise<{
  rawText: string;
}> => {
  const extension = file?.name?.split('.')?.pop()?.toLowerCase();

  switch (extension) {
    case 'txt':
      return readFileRawText(file);
    case 'md':
      return readMdFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    case 'html':
      return readHtmlFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    case 'csv':
      return readCsvContent({ file });
    case 'pdf': {
      const pdf = await loadFile2Buffer({ file });
      return readPdfFile({ pdf });
    }
    case 'docx':
      return readWordFile({
        file,
        uploadImgController: uploadBase64Controller
      });
    default:
      return {
        rawText: ''
      };
  }
};
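
The dispatcher that replaces this one is not shown in the excerpt either. As a hedged sketch, the two new formats could be routed ahead of the existing switch like this; readPptxRawText, readXlsxRawText, and the ./pptx and ./xlsx module paths are illustrative assumptions matching the sketches near the top and bottom of this diff, not the commit's real code.

import { readFileRawContent } from './index'; // the dispatcher shown above
import { readPptxRawText } from './pptx'; // hypothetical module
import { readXlsxRawText } from './xlsx'; // hypothetical module

export const readAnyFileRawContent = async ({
  file,
  uploadBase64Controller
}: {
  file: File;
  uploadBase64Controller?: (base64: string) => Promise<string>;
}): Promise<{ rawText: string }> => {
  const extension = file?.name?.split('.')?.pop()?.toLowerCase();

  switch (extension) {
    case 'pptx':
      return readPptxRawText({ file });
    case 'xlsx':
      return readXlsxRawText({ file });
    default:
      // fall back to the original dispatcher for txt/md/html/csv/pdf/docx
      return readFileRawContent({ file, uploadBase64Controller });
  }
};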

View File

@@ -1,17 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { readFileRawText } from './rawText';

export const readMdFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const { rawText: md } = await readFileRawText(file);

  const simpleMd = await markdownProcess({
    rawText: md,
    uploadImgController
  });

  return { rawText: simpleMd };
};

View File

@@ -1,64 +0,0 @@
/* read file to txt */
import * as pdfjsLib from 'pdfjs-dist';

type TokenType = {
  str: string;
  dir: string;
  width: number;
  height: number;
  transform: number[];
  fontName: string;
  hasEOL: boolean;
};

export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
  pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';

  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();

    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.95;
    const footerThreshold = pageHeight * 0.05;

    // transform[5] is the token's y position; drop tokens in the top/bottom 5%
    // of the page, which are treated as running headers and footers
    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      return (
        !token.transform ||
        (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
      );
    });

    // merge empty-string tokens into the previous token, carrying over the hasEOL flag
    for (let i = 0; i < pageTexts.length; i++) {
      const item = pageTexts[i];
      if (item.str === '' && pageTexts[i - 1]) {
        pageTexts[i - 1].hasEOL = item.hasEOL;
        pageTexts.splice(i, 1);
        i--;
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        // keep a newline only where the token both ends a line and ends a sentence
        const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const doc = await pdfjsLib.getDocument(pdf).promise;
  const pageTextPromises = [];
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    pageTextPromises.push(readPDFPage(doc, pageNo));
  }
  const pageTexts = await Promise.all(pageTextPromises);

  return {
    rawText: pageTexts.join('')
  };
};

View File

@@ -1,36 +0,0 @@
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';

/**
 * read file raw text
 */
export const readFileRawText = (file: File) => {
  return new Promise<{ rawText: string }>((resolve, reject) => {
    try {
      const reader = new FileReader();
      reader.onload = () => {
        //@ts-ignore
        const encode = detectFileEncoding(reader.result);

        // read the file again, this time with the detected encoding
        const reader2 = new FileReader();
        reader2.onload = () => {
          resolve({
            rawText: reader2.result as string
          });
        };
        reader2.onerror = (err) => {
          console.log('Error reading file with detected encoding:', err);
          reject('Read file error with detected encoding');
        };
        reader2.readAsText(file, encode);
      };
      reader.onerror = (err) => {
        console.log('error txt read:', err);
        reject('Read file error');
      };
      reader.readAsBinaryString(file);
    } catch (error) {
      reject(error);
    }
  });
};

View File

@@ -1,28 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '../../string/markdown';
import { loadFile2Buffer } from '../utils';
import mammoth from 'mammoth';

export const readWordFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const buffer = await loadFile2Buffer({ file });
  const { value: html } = await mammoth.convertToHtml({
    arrayBuffer: buffer
  });
  const md = htmlStr2Md(html);

  const rawText = await markdownProcess({
    rawText: md,
    uploadImgController
  });

  return {
    rawText
  };
};
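
Finally, the new pptx reader is also outside this excerpt. One common approach, and only a plausible shape for it, is to treat the .pptx as a zip archive and collect the <a:t> text runs from each slide's XML. A minimal sketch assuming the jszip package; the package choice and the readPptxRawText name are assumptions, not the commit's implementation.

import JSZip from 'jszip';
import { loadFile2Buffer } from '../utils';

/* Hypothetical pptx reader: unzip the archive and pull text runs from the slides. */
export const readPptxRawText = async ({ file }: { file: File }) => {
  const buffer = await loadFile2Buffer({ file });
  const zip = await JSZip.loadAsync(buffer);

  // slides live in ppt/slides/slide1.xml, slide2.xml, ...
  // (a production loader would sort these by slide number)
  const slideFiles = zip.file(/^ppt\/slides\/slide\d+\.xml$/);
  const slideXml = await Promise.all(slideFiles.map((f) => f.async('string')));

  // <a:t>...</a:t> wraps each visible text run
  const rawText = slideXml
    .map((xml) =>
      Array.from(xml.matchAll(/<a:t>([^<]*)<\/a:t>/g))
        .map((m) => m[1])
        .join('\n')
    )
    .join('\n\n');

  return { rawText };
};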