mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 21:13:50 +00:00
perf: csv文件选择
This commit is contained in:
5
public/docs/csvSelect.md
Normal file
5
public/docs/csvSelect.md
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
接受一个csv文件,表格头包含 question 和 answer。question 代表问题,answer 代表答案。
|
||||||
|
| question | answer |
|
||||||
|
| --- | --- |
|
||||||
|
| 什么是 laf | laf 是一个云函数开发平台…… |
|
||||||
|
| 什么是 sealos | Sealos 是以 kubernetes 为内核的云操作系统发行版,可以…… |
|
@@ -4,6 +4,7 @@ import { connectToDatabase } from '@/service/mongo';
|
|||||||
import { authToken } from '@/service/utils/tools';
|
import { authToken } from '@/service/utils/tools';
|
||||||
import { connectRedis } from '@/service/redis';
|
import { connectRedis } from '@/service/redis';
|
||||||
import { VecModelDataIdx } from '@/constants/redis';
|
import { VecModelDataIdx } from '@/constants/redis';
|
||||||
|
import { clearStrLineBreak } from '@/utils/tools';
|
||||||
|
|
||||||
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
|
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
|
||||||
try {
|
try {
|
||||||
@@ -40,13 +41,16 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
|
|||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
const data = searchRes.documents.map((item: any) => ({
|
let str = `question,answer\n`;
|
||||||
prompt: item.value.q,
|
|
||||||
completion: item.value.text
|
searchRes.documents.forEach((item: any) => {
|
||||||
}));
|
if (item.value.q && item.value.text) {
|
||||||
|
str += `"${clearStrLineBreak(item.value.q)}","${clearStrLineBreak(item.value.text)}"\n`;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
jsonRes(res, {
|
jsonRes(res, {
|
||||||
data: JSON.stringify(data)
|
data: str.slice(0, str.length - 1)
|
||||||
});
|
});
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
jsonRes(res, {
|
jsonRes(res, {
|
||||||
|
@@ -28,8 +28,8 @@ import {
|
|||||||
getExportDataList
|
getExportDataList
|
||||||
} from '@/api/model';
|
} from '@/api/model';
|
||||||
import { DeleteIcon, RepeatIcon, EditIcon } from '@chakra-ui/icons';
|
import { DeleteIcon, RepeatIcon, EditIcon } from '@chakra-ui/icons';
|
||||||
import { useToast } from '@/hooks/useToast';
|
|
||||||
import { useLoading } from '@/hooks/useLoading';
|
import { useLoading } from '@/hooks/useLoading';
|
||||||
|
import { fileDownload } from '@/utils/file';
|
||||||
import dynamic from 'next/dynamic';
|
import dynamic from 'next/dynamic';
|
||||||
import { useMutation, useQuery } from '@tanstack/react-query';
|
import { useMutation, useQuery } from '@tanstack/react-query';
|
||||||
import type { FormData as InputDataType } from './InputDataModal';
|
import type { FormData as InputDataType } from './InputDataModal';
|
||||||
@@ -37,10 +37,10 @@ import type { FormData as InputDataType } from './InputDataModal';
|
|||||||
const InputModel = dynamic(() => import('./InputDataModal'));
|
const InputModel = dynamic(() => import('./InputDataModal'));
|
||||||
const SelectFileModel = dynamic(() => import('./SelectFileModal'));
|
const SelectFileModel = dynamic(() => import('./SelectFileModal'));
|
||||||
const SelectUrlModel = dynamic(() => import('./SelectUrlModal'));
|
const SelectUrlModel = dynamic(() => import('./SelectUrlModal'));
|
||||||
const SelectJsonModel = dynamic(() => import('./SelectJsonModal'));
|
const SelectCsvModal = dynamic(() => import('./SelectCsvModal'));
|
||||||
|
|
||||||
const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||||
const { Loading } = useLoading();
|
const { Loading, setIsLoading } = useLoading();
|
||||||
|
|
||||||
const {
|
const {
|
||||||
data: modelDataList,
|
data: modelDataList,
|
||||||
@@ -70,9 +70,9 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
|||||||
onClose: onCloseSelectUrlModal
|
onClose: onCloseSelectUrlModal
|
||||||
} = useDisclosure();
|
} = useDisclosure();
|
||||||
const {
|
const {
|
||||||
isOpen: isOpenSelectJsonModal,
|
isOpen: isOpenSelectCsvModal,
|
||||||
onOpen: onOpenSelectJsonModal,
|
onOpen: onOpenSelectCsvModal,
|
||||||
onClose: onCloseSelectJsonModal
|
onClose: onCloseSelectCsvModal
|
||||||
} = useDisclosure();
|
} = useDisclosure();
|
||||||
|
|
||||||
const { data: splitDataLen, refetch } = useQuery(['getModelSplitDataList'], () =>
|
const { data: splitDataLen, refetch } = useQuery(['getModelSplitDataList'], () =>
|
||||||
@@ -91,18 +91,18 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
|||||||
const { mutate: onclickExport, isLoading: isLoadingExport } = useMutation({
|
const { mutate: onclickExport, isLoading: isLoadingExport } = useMutation({
|
||||||
mutationFn: () => getExportDataList(model._id),
|
mutationFn: () => getExportDataList(model._id),
|
||||||
onSuccess(res) {
|
onSuccess(res) {
|
||||||
// 导出为文件
|
try {
|
||||||
const blob = new Blob([res], { type: 'application/json;charset=utf-8' });
|
console.log(res);
|
||||||
|
setIsLoading(true);
|
||||||
// 创建下载链接
|
fileDownload({
|
||||||
const downloadLink = document.createElement('a');
|
text: res,
|
||||||
downloadLink.href = window.URL.createObjectURL(blob);
|
type: 'text/csv',
|
||||||
downloadLink.download = `data.json`;
|
filename: 'data.csv'
|
||||||
|
});
|
||||||
// 添加链接到页面并触发下载
|
} catch (error) {
|
||||||
document.body.appendChild(downloadLink);
|
error;
|
||||||
downloadLink.click();
|
}
|
||||||
document.body.removeChild(downloadLink);
|
setIsLoading(false);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -110,7 +110,7 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
|||||||
<>
|
<>
|
||||||
<Flex>
|
<Flex>
|
||||||
<Box fontWeight={'bold'} fontSize={'lg'} flex={1} mr={2}>
|
<Box fontWeight={'bold'} fontSize={'lg'} flex={1} mr={2}>
|
||||||
模型数据: {total}组{' '}
|
模型数据: {total}组
|
||||||
<Box as={'span'} fontSize={'sm'}>
|
<Box as={'span'} fontSize={'sm'}>
|
||||||
(测试版本)
|
(测试版本)
|
||||||
</Box>
|
</Box>
|
||||||
@@ -128,7 +128,7 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
|||||||
mr={2}
|
mr={2}
|
||||||
size={'sm'}
|
size={'sm'}
|
||||||
isLoading={isLoadingExport}
|
isLoading={isLoadingExport}
|
||||||
title={'v2.3之前版本的数据无法导出'}
|
title={'换行数据导出时,会进行格式转换'}
|
||||||
onClick={() => onclickExport()}
|
onClick={() => onclickExport()}
|
||||||
>
|
>
|
||||||
导出
|
导出
|
||||||
@@ -148,9 +148,9 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
|||||||
>
|
>
|
||||||
手动输入
|
手动输入
|
||||||
</MenuItem>
|
</MenuItem>
|
||||||
<MenuItem onClick={onOpenSelectFileModal}>文件QA拆分</MenuItem>
|
<MenuItem onClick={onOpenSelectFileModal}>文本内容 QA 拆分</MenuItem>
|
||||||
<MenuItem onClick={onOpenSelectUrlModal}>网站内容 QA 拆分</MenuItem>
|
<MenuItem onClick={onOpenSelectUrlModal}>网站内容 QA 拆分</MenuItem>
|
||||||
<MenuItem onClick={onOpenSelectJsonModal}>JSON导入</MenuItem>
|
<MenuItem onClick={onOpenSelectCsvModal}>csv 问答对导入</MenuItem>
|
||||||
</MenuList>
|
</MenuList>
|
||||||
</Menu>
|
</Menu>
|
||||||
</Flex>
|
</Flex>
|
||||||
@@ -248,10 +248,10 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
|||||||
onSuccess={refetchData}
|
onSuccess={refetchData}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
{isOpenSelectJsonModal && (
|
{isOpenSelectCsvModal && (
|
||||||
<SelectJsonModel
|
<SelectCsvModal
|
||||||
modelId={model._id}
|
modelId={model._id}
|
||||||
onClose={onCloseSelectJsonModal}
|
onClose={onCloseSelectCsvModal}
|
||||||
onSuccess={refetchData}
|
onSuccess={refetchData}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
|
@@ -13,10 +13,14 @@ import {
|
|||||||
import { useToast } from '@/hooks/useToast';
|
import { useToast } from '@/hooks/useToast';
|
||||||
import { useSelectFile } from '@/hooks/useSelectFile';
|
import { useSelectFile } from '@/hooks/useSelectFile';
|
||||||
import { useConfirm } from '@/hooks/useConfirm';
|
import { useConfirm } from '@/hooks/useConfirm';
|
||||||
import { readTxtContent } from '@/utils/tools';
|
import { readCsvContent } from '@/utils/file';
|
||||||
import { useMutation } from '@tanstack/react-query';
|
import { useMutation } from '@tanstack/react-query';
|
||||||
import { postModelDataJsonData } from '@/api/model';
|
import { postModelDataJsonData } from '@/api/model';
|
||||||
import Markdown from '@/components/Markdown';
|
import Markdown from '@/components/Markdown';
|
||||||
|
import { useMarkdown } from '@/hooks/useMarkdown';
|
||||||
|
import { fileDownload } from '@/utils/file';
|
||||||
|
|
||||||
|
const csvTemplate = `question,answer\n"什么是 laf","laf 是一个云函数开发平台……"\n"什么是 sealos","Sealos 是以 kubernetes 为内核的云操作系统发行版,可以……"`;
|
||||||
|
|
||||||
const SelectJsonModal = ({
|
const SelectJsonModal = ({
|
||||||
onClose,
|
onClose,
|
||||||
@@ -29,7 +33,7 @@ const SelectJsonModal = ({
|
|||||||
}) => {
|
}) => {
|
||||||
const [selecting, setSelecting] = useState(false);
|
const [selecting, setSelecting] = useState(false);
|
||||||
const { toast } = useToast();
|
const { toast } = useToast();
|
||||||
const { File, onOpen } = useSelectFile({ fileType: '.json', multiple: true });
|
const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: true });
|
||||||
const [fileData, setFileData] = useState<
|
const [fileData, setFileData] = useState<
|
||||||
{ prompt: string; completion: string; vector?: number[] }[]
|
{ prompt: string; completion: string; vector?: number[] }[]
|
||||||
>([]);
|
>([]);
|
||||||
@@ -41,21 +45,12 @@ const SelectJsonModal = ({
|
|||||||
async (e: File[]) => {
|
async (e: File[]) => {
|
||||||
setSelecting(true);
|
setSelecting(true);
|
||||||
try {
|
try {
|
||||||
const jsonData = (
|
const data = await Promise.all(e.map((item) => readCsvContent(item)));
|
||||||
await Promise.all(e.map((item) => readTxtContent(item).then((text) => JSON.parse(text))))
|
console.log(data);
|
||||||
).flat();
|
|
||||||
// check 文件类型
|
|
||||||
for (let i = 0; i < jsonData.length; i++) {
|
|
||||||
if (!jsonData[i]?.prompt || !jsonData[i]?.completion) {
|
|
||||||
throw new Error('缺少 prompt 或 completion');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
setFileData(jsonData);
|
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
console.log(error);
|
console.log(error);
|
||||||
toast({
|
toast({
|
||||||
title: error?.message || 'JSON文件格式有误',
|
title: error?.message || 'csv 文件格式有误',
|
||||||
status: 'error'
|
status: 'error'
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -84,34 +79,36 @@ const SelectJsonModal = ({
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const { data: intro } = useMarkdown({ url: '/csvSelect.md' });
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<Modal isOpen={true} onClose={onClose} isCentered>
|
<Modal isOpen={true} onClose={onClose} isCentered>
|
||||||
<ModalOverlay />
|
<ModalOverlay />
|
||||||
<ModalContent maxW={'90vw'} position={'relative'} m={0} h={'90vh'}>
|
<ModalContent maxW={'90vw'} position={'relative'} m={0} h={'90vh'}>
|
||||||
<ModalHeader>JSON数据集</ModalHeader>
|
<ModalHeader>csv 问答对导入</ModalHeader>
|
||||||
<ModalCloseButton />
|
<ModalCloseButton />
|
||||||
|
|
||||||
<ModalBody h={'100%'} display={['block', 'flex']} fontSize={'sm'} overflowY={'auto'}>
|
<ModalBody h={'100%'} display={['block', 'flex']} fontSize={'sm'} overflowY={'auto'}>
|
||||||
<Box flex={'2 0 0'} w={['100%', 0]} mr={[0, 4]} mb={[4, 0]}>
|
<Box flex={'1 0 0'} w={['100%', 0]} mr={[0, 4]} mb={[4, 0]}>
|
||||||
<Markdown
|
<Markdown source={intro} />
|
||||||
source={`接受一个对象数组,每个对象必须包含 prompt 和 completion 格式,可以包含vector。prompt 代表问题,completion 代表回答的内容,可以多个问题对应一个回答,vector 为 prompt 的向量,如果没有讲有系统生成。例如:
|
<Box
|
||||||
~~~json
|
my={3}
|
||||||
[
|
cursor={'pointer'}
|
||||||
{
|
textDecoration={'underline'}
|
||||||
"prompt":"sealos是什么?\\n介绍下sealos\\nsealos有什么用",
|
color={'blue.600'}
|
||||||
"completion":"sealos是xxxxxx"
|
onClick={() =>
|
||||||
},
|
fileDownload({
|
||||||
{
|
text: csvTemplate,
|
||||||
"prompt":"laf是什么?",
|
type: 'text/csv',
|
||||||
"completion":"laf是xxxxxx",
|
filename: 'template.csv'
|
||||||
"vector":[-0.42,-0.4314314,0.43143]
|
})
|
||||||
}
|
}
|
||||||
]
|
>
|
||||||
~~~`}
|
点击下载csv模板
|
||||||
/>
|
</Box>
|
||||||
<Flex alignItems={'center'}>
|
<Flex alignItems={'center'}>
|
||||||
<Button isLoading={selecting} onClick={onOpen}>
|
<Button isLoading={selecting} onClick={onOpen}>
|
||||||
选择 JSON 数据集
|
选择 csv 问答对
|
||||||
</Button>
|
</Button>
|
||||||
|
|
||||||
<Box ml={4}>一共 {fileData.length} 组数据</Box>
|
<Box ml={4}>一共 {fileData.length} 组数据</Box>
|
@@ -16,7 +16,7 @@ import { useToast } from '@/hooks/useToast';
|
|||||||
import { useSelectFile } from '@/hooks/useSelectFile';
|
import { useSelectFile } from '@/hooks/useSelectFile';
|
||||||
import { encode } from 'gpt-token-utils';
|
import { encode } from 'gpt-token-utils';
|
||||||
import { useConfirm } from '@/hooks/useConfirm';
|
import { useConfirm } from '@/hooks/useConfirm';
|
||||||
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/tools';
|
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
|
||||||
import { useMutation } from '@tanstack/react-query';
|
import { useMutation } from '@tanstack/react-query';
|
||||||
import { postModelDataSplitData } from '@/api/model';
|
import { postModelDataSplitData } from '@/api/model';
|
||||||
import { formatPrice } from '@/utils/user';
|
import { formatPrice } from '@/utils/user';
|
||||||
|
136
src/utils/file.ts
Normal file
136
src/utils/file.ts
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
import mammoth from 'mammoth';
|
||||||
|
|
||||||
|
/**
 * Read a file as text.
 *
 * @param file - the file to read
 * @returns a promise that resolves with the file's text content; it rejects
 *   with a message string when the read fails or FileReader cannot be used
 */
export const readTxtContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const txtReader = new FileReader();
      txtReader.onerror = (event) => {
        console.log('error txt read:', event);
        reject('读取 txt 文件失败');
      };
      txtReader.onload = () => resolve(txtReader.result as string);
      txtReader.readAsText(file);
    } catch (error) {
      // Constructing or starting the reader threw synchronously.
      reject('浏览器不支持文件内容读取');
    }
  });
|
||||||
|
|
||||||
|
/**
 * Extract the text of a PDF file using the pdf.js build exposed on `window`.
 *
 * Tokens within a page are joined with a space, and pages are joined with a
 * newline.
 *
 * @param file - the PDF file to read
 * @returns a promise that resolves with the extracted text; it rejects with a
 *   message string when reading or parsing fails
 */
export const readPdfContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const pdfjsLib = window['pdfjs-dist/build/pdf'];
      pdfjsLib.workerSrc = '/js/pdf.worker.js';

      // Collect the text tokens of a single page (page numbers start at 1).
      const extractPageText = async (pdfDoc: any, pageIndex: number) => {
        const pdfPage = await pdfDoc.getPage(pageIndex);
        const { items } = await pdfPage.getTextContent();
        return items.map((token: any) => token.str).join(' ');
      };

      const fileReader = new FileReader();
      fileReader.onerror = (err) => {
        console.log(err, 'reader error');
        reject('解析 PDF 失败');
      };
      fileReader.onload = async (event) => {
        const buffer = event?.target?.result;
        if (!buffer) return reject('解析 PDF 失败');
        try {
          const pdfDoc = await pdfjsLib.getDocument(buffer).promise;
          const pageNumbers = Array.from({ length: pdfDoc.numPages }, (_, i) => i + 1);
          // Pages are read concurrently; Promise.all keeps them in page order.
          const pageTexts = await Promise.all(
            pageNumbers.map((pageNo) => extractPageText(pdfDoc, pageNo))
          );
          resolve(pageTexts.join('\n'));
        } catch (err) {
          console.log(err, 'pdfjs error');
          reject('解析 PDF 失败');
        }
      };
      fileReader.readAsArrayBuffer(file);
    } catch (error) {
      reject('浏览器不支持文件内容读取');
    }
  });
|
||||||
|
|
||||||
|
/**
 * Extract the raw text of a doc file using mammoth.
 *
 * @param file - the document to read
 * @returns a promise that resolves with the extracted text; it rejects with a
 *   message string when reading or extraction fails
 */
export const readDocContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const docReader = new FileReader();
      docReader.onerror = (err) => {
        console.log('error doc read:', err);

        reject('读取 doc 文件失败');
      };
      docReader.onload = async (event) => {
        const buffer = event.target?.result;
        if (!buffer) return reject('读取 doc 文件失败');
        try {
          const extracted = await mammoth.extractRawText({
            arrayBuffer: buffer as ArrayBuffer
          });
          resolve(extracted?.value);
        } catch (error) {
          reject('读取 doc 文件失败, 请转换成 PDF');
        }
      };
      docReader.readAsArrayBuffer(file);
    } catch (error) {
      reject('浏览器不支持文件内容读取');
    }
  });
|
||||||
|
|
||||||
|
/**
 * Split raw CSV text into rows of cells.
 *
 * Handles double-quoted fields, `""` as an escaped quote, commas and line
 * breaks inside quoted fields, and LF / CRLF / CR row terminators. Rows whose
 * cells are all blank are dropped.
 */
const parseCsvRows = (text: string): string[][] => {
  const rows: string[][] = [];
  let row: string[] = [];
  let field = '';
  let inQuotes = false;

  for (let i = 0; i < text.length; i++) {
    const ch = text[i];

    if (inQuotes) {
      if (ch !== '"') {
        field += ch;
      } else if (text[i + 1] === '"') {
        // A doubled quote inside a quoted field is a literal quote.
        field += '"';
        i++;
      } else {
        inQuotes = false;
      }
      continue;
    }

    if (ch === '"') {
      inQuotes = true;
    } else if (ch === ',') {
      row.push(field);
      field = '';
    } else if (ch === '\n' || ch === '\r') {
      // Treat CRLF as a single row terminator.
      if (ch === '\r' && text[i + 1] === '\n') i++;
      row.push(field);
      field = '';
      rows.push(row);
      row = [];
    } else {
      field += ch;
    }
  }

  // Flush the last row when the text does not end with a line break.
  if (field !== '' || row.length > 0) {
    row.push(field);
    rows.push(row);
  }

  return rows.filter((cells) => cells.some((cell) => cell.trim() !== ''));
};

/**
 * Read a csv file and parse it into a header row and data rows.
 *
 * @param file - the csv file to read
 * @returns a promise that resolves with `{ header, data }`: `header` is the
 *   trimmed cells of the first non-blank row and `data` is every following
 *   non-blank row as an array of cell strings. It rejects with the string
 *   '解析 csv 文件失败' when the file cannot be read or has no header row.
 */
export const readCsvContent = async (file: File) => {
  try {
    // Strip a leading UTF-8 BOM so it does not end up in the first header cell.
    const text = (await readTxtContent(file)).replace(/^\uFEFF/, '');
    const rows = parseCsvRows(text);

    const header = rows.shift()?.map((cell) => cell.trim());
    if (!header) {
      throw new Error('csv 格式错误');
    }

    return { header, data: rows };
  } catch (error) {
    return Promise.reject('解析 csv 文件失败');
  }
};
|
||||||
|
|
||||||
|
/**
 * Trigger a browser download of in-memory text.
 *
 * @param text - the file content
 * @param type - the MIME type; `;charset=utf-8` is appended to it
 * @param filename - the name offered for the downloaded file
 */
export const fileDownload = ({
  text,
  type,
  filename
}: {
  text: string;
  type: string;
  filename: string;
}) => {
  // Wrap the text in a blob and expose it through a temporary object URL.
  const blob = new Blob([text], { type: `${type};charset=utf-8` });
  const objectUrl = window.URL.createObjectURL(blob);

  // Create the download link.
  const downloadLink = document.createElement('a');
  downloadLink.href = objectUrl;
  downloadLink.download = filename;

  // Attach the link to the page, trigger the download, then detach it.
  document.body.appendChild(downloadLink);
  downloadLink.click();
  document.body.removeChild(downloadLink);

  // Release the blob once the click has been dispatched. The revoke is
  // deferred so it cannot race with the browser starting the download.
  setTimeout(() => window.URL.revokeObjectURL(objectUrl), 0);
};
|
@@ -1,6 +1,5 @@
|
|||||||
import crypto from 'crypto';
|
import crypto from 'crypto';
|
||||||
import { useToast } from '@/hooks/useToast';
|
import { useToast } from '@/hooks/useToast';
|
||||||
import mammoth from 'mammoth';
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* copy text data
|
* copy text data
|
||||||
@@ -34,11 +33,17 @@ export const useCopyData = () => {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 密码加密
|
||||||
|
*/
|
||||||
export const createHashPassword = (text: string) => {
|
export const createHashPassword = (text: string) => {
|
||||||
const hash = crypto.createHash('sha256').update(text).digest('hex');
|
const hash = crypto.createHash('sha256').update(text).digest('hex');
|
||||||
return hash;
|
return hash;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 对象转成 query 字符串
|
||||||
|
*/
|
||||||
export const Obj2Query = (obj: Record<string, string | number>) => {
|
export const Obj2Query = (obj: Record<string, string | number>) => {
|
||||||
const queryParams = new URLSearchParams();
|
const queryParams = new URLSearchParams();
|
||||||
for (const key in obj) {
|
for (const key in obj) {
|
||||||
@@ -47,86 +52,6 @@ export const Obj2Query = (obj: Record<string, string | number>) => {
|
|||||||
return queryParams.toString();
|
return queryParams.toString();
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* 读取 txt 文件内容
|
|
||||||
*/
|
|
||||||
export const readTxtContent = (file: File) => {
|
|
||||||
return new Promise((resolve: (_: string) => void, reject) => {
|
|
||||||
const reader = new FileReader();
|
|
||||||
reader.onload = () => {
|
|
||||||
resolve(reader.result as string);
|
|
||||||
};
|
|
||||||
reader.onerror = (err) => {
|
|
||||||
console.log('error txt read:', err);
|
|
||||||
reject('读取 txt 文件失败');
|
|
||||||
};
|
|
||||||
reader.readAsText(file);
|
|
||||||
});
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 读取 pdf 内容
|
|
||||||
*/
|
|
||||||
export const readPdfContent = (file: File) =>
|
|
||||||
new Promise<string>((resolve, reject) => {
|
|
||||||
const pdfjsLib = window['pdfjs-dist/build/pdf'];
|
|
||||||
pdfjsLib.workerSrc = '/js/pdf.worker.js';
|
|
||||||
|
|
||||||
const readPDFPage = async (doc: any, pageNo: number) => {
|
|
||||||
const page = await doc.getPage(pageNo);
|
|
||||||
const tokenizedText = await page.getTextContent();
|
|
||||||
const pageText = tokenizedText.items.map((token: any) => token.str).join(' ');
|
|
||||||
return pageText;
|
|
||||||
};
|
|
||||||
|
|
||||||
let reader = new FileReader();
|
|
||||||
reader.readAsArrayBuffer(file);
|
|
||||||
reader.onload = async (event) => {
|
|
||||||
if (!event?.target?.result) return reject('解析 PDF 失败');
|
|
||||||
try {
|
|
||||||
const doc = await pdfjsLib.getDocument(event.target.result).promise;
|
|
||||||
const pageTextPromises = [];
|
|
||||||
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
|
|
||||||
pageTextPromises.push(readPDFPage(doc, pageNo));
|
|
||||||
}
|
|
||||||
const pageTexts = await Promise.all(pageTextPromises);
|
|
||||||
resolve(pageTexts.join('\n'));
|
|
||||||
} catch (err) {
|
|
||||||
console.log(err, 'pdfjs error');
|
|
||||||
reject('解析 PDF 失败');
|
|
||||||
}
|
|
||||||
};
|
|
||||||
reader.onerror = (err) => {
|
|
||||||
console.log(err, 'reader error');
|
|
||||||
reject('解析 PDF 失败');
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 读取doc
|
|
||||||
*/
|
|
||||||
export const readDocContent = (file: File) =>
|
|
||||||
new Promise<string>((resolve, reject) => {
|
|
||||||
const reader = new FileReader();
|
|
||||||
reader.readAsArrayBuffer(file);
|
|
||||||
reader.onload = async ({ target }) => {
|
|
||||||
if (!target?.result) return reject('读取 doc 文件失败');
|
|
||||||
try {
|
|
||||||
const res = await mammoth.extractRawText({
|
|
||||||
arrayBuffer: target.result as ArrayBuffer
|
|
||||||
});
|
|
||||||
resolve(res?.value);
|
|
||||||
} catch (error) {
|
|
||||||
reject('读取 doc 文件失败, 请转换成 PDF');
|
|
||||||
}
|
|
||||||
};
|
|
||||||
reader.onerror = (err) => {
|
|
||||||
console.log('error doc read:', err);
|
|
||||||
|
|
||||||
reject('读取 doc 文件失败');
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 向量转成 float32 buffer 格式
|
* 向量转成 float32 buffer 格式
|
||||||
*/
|
*/
|
||||||
@@ -138,11 +63,18 @@ export const vectorToBuffer = (vector: number[]) => {
|
|||||||
return buffer;
|
return buffer;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
 * Normalize a vector to exactly 1536 elements.
 *
 * Longer vectors are truncated to their first 1536 elements; shorter vectors
 * are right-padded with zeros. The input array is not mutated.
 *
 * @param vector - the source vector
 * @returns a new array of length 1536
 */
export const formatVector = (vector: number[]) => {
  const formattedVector = vector.slice(0, 1536); // keep the first 1536 elements
  if (formattedVector.length < 1536) {
    // Pad with trailing zeros. The previous `> 1536` check could never pad.
    return formattedVector.concat(Array(1536 - formattedVector.length).fill(0));
  }

  return formattedVector;
};
|
||||||
|
|
||||||
|
/**
 * Clean a string for single-line output: every line break (CRLF, CR or LF)
 * is replaced with the two-character sequence `\n`, then surrounding
 * whitespace is trimmed.
 *
 * @param str - the text to clean
 * @returns the text with no raw line-break characters
 */
export const clearStrLineBreak = (str: string) => {
  // CRLF comes first in the alternation so it collapses to one `\n` escape.
  return str.replace(/\r\n|\r|\n/g, '\\n').trim();
};
|
||||||
|
Reference in New Issue
Block a user