Feat: pptx and xlsx loader (#1118)

* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support read pptx

* feat: support xlsx

* fix: rerank docker file
This commit is contained in:
Archer
2024-04-01 19:01:26 +08:00
committed by GitHub
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions

View File

@@ -1,40 +0,0 @@
import Papa from 'papaparse';
import { readFileRawText } from './rawText';
/**
 * Read a CSV file and convert it to structured rows plus a flattened text form.
 *
 * The first parsed row is treated as the header. Every remaining cell is
 * rendered as `"<header>: <value>"` (or just the value when that column has no
 * header) and all cells are joined with newlines to build `rawText`.
 *
 * @returns `{ rawText, header, data }` where `header` is the first row and
 *          `data` holds the remaining rows.
 * @throws Rejects with the string '解析 csv 文件失败' on any failure,
 *         including an empty parse result.
 */
export const readCsvContent = async ({ file }: { file: File }) => {
  try {
    const { rawText: csvText } = await readFileRawText(file);
    const rows = Papa.parse(csvText).data as string[][];

    if (rows.length === 0) {
      throw new Error('csv 解析失败');
    }

    // Separate the header row from the data rows.
    const header = rows[0] as string[];
    const data = rows.slice(1);

    // Prefix each cell with its column title when one exists.
    const cells: string[] = [];
    for (const row of data) {
      row.forEach((value, index) => {
        cells.push(header[index] ? `${header[index]}: ${value}` : value);
      });
    }

    return {
      rawText: cells.join('\n'),
      header,
      data
    };
  } catch (error) {
    return Promise.reject('解析 csv 文件失败');
  }
};

View File

@@ -1,21 +0,0 @@
import { htmlStr2Md } from '../../string/markdown';
import { readFileRawText } from './rawText';
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
/**
 * Read an HTML file and return its content as processed Markdown.
 *
 * Pipeline: raw file text -> `htmlStr2Md` -> `markdownProcess`.
 *
 * @param file - The HTML file to read.
 * @param uploadImgController - Optional callback forwarded unchanged to
 *        `markdownProcess` (takes a base64 string, resolves to a string).
 * @returns `{ rawText }` holding the result of `markdownProcess`.
 */
export const readHtmlFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const html = (await readFileRawText(file)).rawText;

  return {
    rawText: await markdownProcess({
      rawText: htmlStr2Md(html),
      uploadImgController
    })
  };
};

View File

@@ -1,49 +0,0 @@
import { loadFile2Buffer } from '../utils';
import { readCsvContent } from './csv';
import { readHtmlFile } from './html';
import { readMdFile } from './md';
import { readPdfFile } from './pdf';
import { readFileRawText } from './rawText';
import { readWordFile } from './word';
/**
 * Dispatch a file to the matching reader based on its lower-cased extension.
 *
 * Supported extensions: txt, md, html, csv, pdf, docx. Any other (or missing)
 * extension resolves to an empty `rawText`.
 *
 * @param file - The file to read.
 * @param uploadBase64Controller - Optional callback handed to the md / html /
 *        docx readers as their `uploadImgController`.
 * @returns An object containing at least `rawText`.
 */
export const readFileRawContent = async ({
  file,
  uploadBase64Controller
}: {
  file: File;
  uploadBase64Controller?: (base64: string) => Promise<string>;
}): Promise<{
  rawText: string;
}> => {
  const extension = file?.name?.split('.')?.pop()?.toLowerCase();

  if (extension === 'txt') {
    return readFileRawText(file);
  }
  if (extension === 'md') {
    return readMdFile({ file, uploadImgController: uploadBase64Controller });
  }
  if (extension === 'html') {
    return readHtmlFile({ file, uploadImgController: uploadBase64Controller });
  }
  if (extension === 'csv') {
    return readCsvContent({ file });
  }
  if (extension === 'pdf') {
    // The PDF reader works on a buffer rather than on the File itself.
    const pdf = await loadFile2Buffer({ file });
    return readPdfFile({ pdf });
  }
  if (extension === 'docx') {
    return readWordFile({ file, uploadImgController: uploadBase64Controller });
  }

  // Unknown or missing extension: nothing to extract.
  return { rawText: '' };
};

View File

@@ -1,17 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { readFileRawText } from './rawText';
/**
 * Read a Markdown file and run its text through `markdownProcess`.
 *
 * @param file - The Markdown file to read.
 * @param uploadImgController - Optional callback forwarded unchanged to
 *        `markdownProcess` (takes a base64 string, resolves to a string).
 * @returns `{ rawText }` holding the result of `markdownProcess`.
 */
export const readMdFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const source = await readFileRawText(file);

  return {
    rawText: await markdownProcess({
      rawText: source.rawText,
      uploadImgController
    })
  };
};

View File

@@ -1,64 +0,0 @@
/* read file to txt */
import * as pdfjsLib from 'pdfjs-dist';
/** Shape of the text items that readPdfFile takes from `page.getTextContent()`. */
type TokenType = {
// Token text; readPdfFile folds empty-string tokens into the preceding token.
str: string;
dir: string;
width: number;
height: number;
// readPdfFile compares index 5 against page-height thresholds to drop header/footer tokens.
transform: number[];
fontName: string;
// Combined with trailing punctuation in `str` to decide whether a newline is appended.
hasEOL: boolean;
};
/**
 * Extract the text of every page of a PDF and concatenate it.
 *
 * Per page: tokens whose `transform[5]` lies in the top or bottom 5% of the
 * page height are dropped, empty-string tokens hand their `hasEOL` flag to the
 * previous token and are removed, and a newline is appended after tokens that
 * end a line with sentence-ending punctuation.
 *
 * @param pdf - The PDF content as an ArrayBuffer.
 * @returns `{ rawText }` with all page texts joined without a separator.
 */
export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
  pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';

  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const textContent = await page.getTextContent();

    const { height: pageHeight } = page.getViewport({ scale: 1 });
    const headerThreshold = pageHeight * 0.95;
    const footerThreshold = pageHeight * 0.05;

    // Keep tokens without a transform, or positioned strictly between the thresholds.
    const bodyTokens: TokenType[] = textContent.items.filter((token: TokenType) => {
      if (!token.transform) return true;
      const y = token.transform[5];
      return y < headerThreshold && y > footerThreshold;
    });

    // Fold empty-string tokens into the previously kept token's 'hasEOL'.
    const mergedTokens: TokenType[] = [];
    for (const token of bodyTokens) {
      const previous = mergedTokens[mergedTokens.length - 1];
      if (token.str === '' && previous) {
        previous.hasEOL = token.hasEOL;
        continue;
      }
      mergedTokens.push(token);
    }

    page.cleanup();

    let pageText = '';
    for (const token of mergedTokens) {
      const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
      pageText += paragraphEnd ? `${token.str}\n` : token.str;
    }
    return pageText;
  };

  const doc = await pdfjsLib.getDocument(pdf).promise;

  // Start all pages (1-based page numbers) and wait for them together.
  const pageTexts = await Promise.all(
    Array.from({ length: doc.numPages }, (_, index) => readPDFPage(doc, index + 1))
  );

  return {
    rawText: pageTexts.join('')
  };
};

View File

@@ -1,36 +0,0 @@
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
/**
 * Read a file's raw text using an encoding detected from its content.
 *
 * The file is read twice: first as a binary string that is passed to
 * `detectFileEncoding`, then as text with the detected encoding.
 *
 * @param file - The file to read.
 * @returns A promise resolving to `{ rawText }`; it rejects with a string
 *          message when either read fails, or with the thrown error when
 *          setting up the first read throws synchronously.
 */
export const readFileRawText = (file: File) =>
  new Promise<{ rawText: string }>((resolve, reject) => {
    try {
      const probeReader = new FileReader();

      probeReader.onerror = (err) => {
        console.log('error txt read:', err);
        reject('Read file error');
      };

      probeReader.onload = () => {
        //@ts-ignore
        const encoding = detectFileEncoding(probeReader.result);

        // Read the file again, this time with the detected encoding.
        const textReader = new FileReader();

        textReader.onerror = (err) => {
          console.log('Error reading file with detected encoding:', err);
          reject('Read file error with detected encoding');
        };

        textReader.onload = () => {
          resolve({
            rawText: textReader.result as string
          });
        };

        textReader.readAsText(file, encoding);
      };

      probeReader.readAsBinaryString(file);
    } catch (error) {
      reject(error);
    }
  });

View File

@@ -1,28 +0,0 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '../../string/markdown';
import { loadFile2Buffer } from '../utils';
import mammoth from 'mammoth';
/**
 * Read a Word document and return its content as processed Markdown.
 *
 * Pipeline: file -> ArrayBuffer -> `mammoth.convertToHtml` -> `htmlStr2Md`
 * -> `markdownProcess`.
 *
 * @param file - The Word file to read.
 * @param uploadImgController - Optional callback forwarded unchanged to
 *        `markdownProcess` (takes a base64 string, resolves to a string).
 * @returns `{ rawText }` holding the result of `markdownProcess`.
 */
export const readWordFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController?: (base64: string) => Promise<string>;
}) => {
  const arrayBuffer = await loadFile2Buffer({ file });
  const converted = await mammoth.convertToHtml({ arrayBuffer });

  const rawText = await markdownProcess({
    rawText: htmlStr2Md(converted.value),
    uploadImgController
  });

  return { rawText };
};

View File

@@ -101,6 +101,7 @@ export const iconPaths = {
'core/dataset/mixedRecall': () => import('./icons/core/dataset/mixedRecall.svg'),
'core/dataset/modeEmbedding': () => import('./icons/core/dataset/modeEmbedding.svg'),
'core/dataset/rerank': () => import('./icons/core/dataset/rerank.svg'),
'core/dataset/splitLight': () => import('./icons/core/dataset/splitLight.svg'),
'core/dataset/tableCollection': () => import('./icons/core/dataset/tableCollection.svg'),
'core/dataset/websiteDataset': () => import('./icons/core/dataset/websiteDataset.svg'),
'core/modules/basicNode': () => import('./icons/core/modules/basicNode.svg'),

View File

@@ -0,0 +1,6 @@
<svg t="1711938287623" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg"
p-id="5143">
<path
d="M153.6 153.6h716.8a51.2 51.2 0 0 1 0 102.4H153.6a51.2 51.2 0 1 1 0-102.4z m0 614.4h716.8a51.2 51.2 0 0 1 0 102.4H153.6a51.2 51.2 0 0 1 0-102.4z m0-307.2h131.6352a51.2 51.2 0 1 1 0 102.4H153.6a51.2 51.2 0 0 1 0-102.4z m292.5568 0h131.6864a51.2 51.2 0 0 1 0 102.4H446.1568a51.2 51.2 0 0 1 0-102.4z m292.608 0H870.4a51.2 51.2 0 0 1 0 102.4h-131.6352a51.2 51.2 0 0 1 0-102.4z"
p-id="5144"></path>
</svg>

After

Width:  |  Height:  |  Size: 554 B

View File

@@ -0,0 +1,70 @@
import React from 'react';
import MyIcon from '../Icon';
import {
Drawer,
DrawerBody,
DrawerHeader,
DrawerOverlay,
DrawerContent,
DrawerCloseButton,
DrawerContentProps,
Flex,
Image
} from '@chakra-ui/react';
import { useLoading } from '../../../hooks/useLoading';
/** Props of MyRightDrawer: every DrawerContent prop plus the fields below. */
type Props = DrawerContentProps & {
// Passed to the Drawer as its onClose handler.
onClose: () => void;
// Header icon: a value starting with '/' is rendered as an image src, anything else as a MyIcon name.
iconSrc?: string;
// Rendered in the drawer header next to the icon.
title?: any;
// Passed to the Loading element rendered inside the drawer body.
isLoading?: boolean;
};
/**
 * Always-open drawer anchored to the right edge, with a header (optional icon
 * plus title), one close button and a loading overlay inside the body.
 *
 * - `iconSrc` starting with '/' is rendered through <Image>; any other value
 *   is passed to <MyIcon> as the icon name.
 * - Remaining props are spread onto <DrawerContent>. `h`, `mt`,
 *   `borderLeftRadius` and `overflow` are set after the spread, so callers
 *   cannot override them.
 *
 * The close button used to be rendered twice (once directly under
 * <DrawerContent> and once inside the header), which stacked two identical
 * buttons and produced duplicate focus targets. Only the header button, which
 * carries the explicit zIndex, is kept.
 */
const MyRightDrawer = ({
  onClose,
  iconSrc,
  title,
  maxW = ['90vw', '30vw'],
  children,
  isLoading,
  ...props
}: Props) => {
  const { Loading } = useLoading();

  return (
    <Drawer isOpen placement="right" onClose={onClose}>
      <DrawerOverlay />
      <DrawerContent
        maxW={maxW}
        {...props}
        h={'94%'}
        mt={'2%'}
        borderLeftRadius={'lg'}
        overflow={'hidden'}
      >
        <DrawerHeader>
          <Flex alignItems={'center'} pr={2}>
            {iconSrc && (
              <>
                {iconSrc.startsWith('/') ? (
                  <Image mr={3} objectFit={'contain'} alt="" src={iconSrc} w={'20px'} />
                ) : (
                  <MyIcon mr={3} name={iconSrc as any} w={'20px'} />
                )}
              </>
            )}
            {title}
          </Flex>
          {/* The only close button; zIndex keeps it above the header content. */}
          <DrawerCloseButton zIndex={1} />
        </DrawerHeader>
        <DrawerBody>
          {children}
          <Loading loading={isLoading} fixed={false} />
        </DrawerBody>
      </DrawerContent>
    </Drawer>
  );
};
export default MyRightDrawer;

View File

@@ -2,6 +2,8 @@ import React from 'react';
import { Box, Flex, useTheme, Grid, type GridProps } from '@chakra-ui/react';
import { useTranslation } from 'next-i18next';
import MyTooltip from '../MyTooltip';
import { QuestionOutlineIcon } from '@chakra-ui/icons';
import QuestionTip from '../MyTooltip/QuestionTip';
// @ts-ignore
interface Props extends GridProps {
@@ -36,58 +38,59 @@ const LeftRadio = ({
return (
<Grid gridGap={[3, 5]} fontSize={['sm', 'md']} {...props}>
{list.map((item) => (
<MyTooltip key={item.value} label={item.tooltip}>
<Flex
alignItems={item.desc ? align : 'center'}
cursor={'pointer'}
userSelect={'none'}
px={px}
py={py}
border={theme.borders.sm}
borderWidth={'1px'}
borderRadius={'md'}
position={'relative'}
{...(value === item.value
? {
borderColor: 'primary.400',
bg: activeBg,
boxShadow: 'focus'
<Flex
alignItems={item.desc ? align : 'center'}
key={item.value}
cursor={'pointer'}
userSelect={'none'}
px={px}
py={py}
border={theme.borders.sm}
borderWidth={'1px'}
borderRadius={'md'}
position={'relative'}
{...(value === item.value
? {
borderColor: 'primary.400',
bg: activeBg,
boxShadow: 'focus'
}
: {
bg: defaultBg,
_hover: {
borderColor: 'primary.300'
}
: {
bg: defaultBg,
_hover: {
borderColor: 'primary.300'
}
})}
onClick={() => onChange(item.value)}
})}
onClick={() => onChange(item.value)}
>
<Box
w={'18px'}
h={'18px'}
borderWidth={'2.4px'}
borderColor={value === item.value ? 'primary.015' : 'transparent'}
borderRadius={'50%'}
mr={3}
>
<Box
w={'18px'}
h={'18px'}
borderWidth={'2.4px'}
borderColor={value === item.value ? 'primary.015' : 'transparent'}
<Flex
w={'100%'}
h={'100%'}
borderWidth={'1px'}
borderColor={value === item.value ? 'primary.600' : 'borderColor.high'}
bg={value === item.value ? 'primary.1' : 'transparent'}
borderRadius={'50%'}
mr={3}
alignItems={'center'}
justifyContent={'center'}
>
<Flex
w={'100%'}
h={'100%'}
borderWidth={'1px'}
borderColor={value === item.value ? 'primary.600' : 'borderColor.high'}
bg={value === item.value ? 'primary.1' : 'transparent'}
<Box
w={'5px'}
h={'5px'}
borderRadius={'50%'}
alignItems={'center'}
justifyContent={'center'}
>
<Box
w={'5px'}
h={'5px'}
borderRadius={'50%'}
bg={value === item.value ? 'primary.600' : 'transparent'}
></Box>
</Flex>
</Box>
<Box flex={'1 0 0'}>
bg={value === item.value ? 'primary.600' : 'transparent'}
></Box>
</Flex>
</Box>
<Box flex={'1 0 0'}>
<Flex alignItems={'center'}>
<Box
color={'myGray.900'}
fontWeight={item.desc ? '500' : 'normal'}
@@ -95,15 +98,16 @@ const LeftRadio = ({
>
{typeof item.title === 'string' ? t(item.title) : item.title}
</Box>
{!!item.desc && (
<Box fontSize={'xs'} color={'myGray.500'} lineHeight={1.2}>
{t(item.desc)}
</Box>
)}
{item?.children}
</Box>
</Flex>
</MyTooltip>
{!!item.tooltip && <QuestionTip label={item.tooltip} ml={1} color={'myGray.600'} />}
</Flex>
{!!item.desc && (
<Box fontSize={'xs'} color={'myGray.500'} lineHeight={1.2}>
{t(item.desc)}
</Box>
)}
{item?.children}
</Box>
</Flex>
))}
</Grid>
);

View File

@@ -12,31 +12,31 @@
"@emotion/styled": "^11.11.0",
"@fastgpt/global": "workspace:*",
"@fingerprintjs/fingerprintjs": "^4.2.1",
"@lexical/react": "0.12.6",
"@lexical/text": "0.12.6",
"@lexical/utils": "0.12.6",
"@monaco-editor/react": "^4.6.0",
"mammoth": "^1.6.0",
"@tanstack/react-query": "^4.24.10",
"date-fns": "2.30.0",
"dayjs": "^1.11.7",
"i18next": "23.10.0",
"joplin-turndown-plugin-gfm": "^1.0.12",
"lexical": "0.12.6",
"lodash": "^4.17.21",
"mammoth": "^1.6.0",
"next-i18next": "15.2.0",
"papaparse": "^5.4.1",
"pdfjs-dist": "4.0.269",
"react": "18.2.0",
"react-day-picker": "^8.7.1",
"react-dom": "18.2.0",
"react-i18next": "13.5.0",
"turndown": "^7.1.2",
"lexical": "0.12.6",
"@lexical/react": "0.12.6",
"papaparse": "^5.4.1",
"@lexical/utils": "0.12.6",
"@lexical/text": "0.12.6",
"date-fns": "2.30.0",
"react-day-picker": "^8.7.1",
"lodash": "^4.17.21",
"@tanstack/react-query": "^4.24.10",
"dayjs": "^1.11.7"
"turndown": "^7.1.2"
},
"devDependencies": {
"@types/lodash": "^4.14.191",
"@types/react": "18.2.0",
"@types/papaparse": "^5.3.7",
"@types/react": "18.2.0",
"@types/react-dom": "18.2.0",
"@types/turndown": "^5.0.4"
}