mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
perf: csv文件选择
This commit is contained in:
5
public/docs/csvSelect.md
Normal file
5
public/docs/csvSelect.md
Normal file
@@ -0,0 +1,5 @@
|
||||
接受一个csv文件,表格头包含 question 和 answer。question 代表问题,answer 代表答案。
|
||||
| question | answer |
|
||||
| --- | --- |
|
||||
| 什么是 laf | laf 是一个云函数开发平台…… |
|
||||
| 什么是 sealos | Sealos 是以 kubernetes 为内核的云操作系统发行版,可以…… |
|
@@ -4,6 +4,7 @@ import { connectToDatabase } from '@/service/mongo';
|
||||
import { authToken } from '@/service/utils/tools';
|
||||
import { connectRedis } from '@/service/redis';
|
||||
import { VecModelDataIdx } from '@/constants/redis';
|
||||
import { clearStrLineBreak } from '@/utils/tools';
|
||||
|
||||
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
|
||||
try {
|
||||
@@ -40,13 +41,16 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
|
||||
}
|
||||
);
|
||||
|
||||
const data = searchRes.documents.map((item: any) => ({
|
||||
prompt: item.value.q,
|
||||
completion: item.value.text
|
||||
}));
|
||||
let str = `question,answer\n`;
|
||||
|
||||
searchRes.documents.forEach((item: any) => {
|
||||
if (item.value.q && item.value.text) {
|
||||
str += `"${clearStrLineBreak(item.value.q)}","${clearStrLineBreak(item.value.text)}"\n`;
|
||||
}
|
||||
});
|
||||
|
||||
jsonRes(res, {
|
||||
data: JSON.stringify(data)
|
||||
data: str.slice(0, str.length - 1)
|
||||
});
|
||||
} catch (err) {
|
||||
jsonRes(res, {
|
||||
|
@@ -28,8 +28,8 @@ import {
|
||||
getExportDataList
|
||||
} from '@/api/model';
|
||||
import { DeleteIcon, RepeatIcon, EditIcon } from '@chakra-ui/icons';
|
||||
import { useToast } from '@/hooks/useToast';
|
||||
import { useLoading } from '@/hooks/useLoading';
|
||||
import { fileDownload } from '@/utils/file';
|
||||
import dynamic from 'next/dynamic';
|
||||
import { useMutation, useQuery } from '@tanstack/react-query';
|
||||
import type { FormData as InputDataType } from './InputDataModal';
|
||||
@@ -37,10 +37,10 @@ import type { FormData as InputDataType } from './InputDataModal';
|
||||
const InputModel = dynamic(() => import('./InputDataModal'));
|
||||
const SelectFileModel = dynamic(() => import('./SelectFileModal'));
|
||||
const SelectUrlModel = dynamic(() => import('./SelectUrlModal'));
|
||||
const SelectJsonModel = dynamic(() => import('./SelectJsonModal'));
|
||||
const SelectCsvModal = dynamic(() => import('./SelectCsvModal'));
|
||||
|
||||
const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
const { Loading } = useLoading();
|
||||
const { Loading, setIsLoading } = useLoading();
|
||||
|
||||
const {
|
||||
data: modelDataList,
|
||||
@@ -70,9 +70,9 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
onClose: onCloseSelectUrlModal
|
||||
} = useDisclosure();
|
||||
const {
|
||||
isOpen: isOpenSelectJsonModal,
|
||||
onOpen: onOpenSelectJsonModal,
|
||||
onClose: onCloseSelectJsonModal
|
||||
isOpen: isOpenSelectCsvModal,
|
||||
onOpen: onOpenSelectCsvModal,
|
||||
onClose: onCloseSelectCsvModal
|
||||
} = useDisclosure();
|
||||
|
||||
const { data: splitDataLen, refetch } = useQuery(['getModelSplitDataList'], () =>
|
||||
@@ -91,18 +91,18 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
const { mutate: onclickExport, isLoading: isLoadingExport } = useMutation({
|
||||
mutationFn: () => getExportDataList(model._id),
|
||||
onSuccess(res) {
|
||||
// 导出为文件
|
||||
const blob = new Blob([res], { type: 'application/json;charset=utf-8' });
|
||||
|
||||
// 创建下载链接
|
||||
const downloadLink = document.createElement('a');
|
||||
downloadLink.href = window.URL.createObjectURL(blob);
|
||||
downloadLink.download = `data.json`;
|
||||
|
||||
// 添加链接到页面并触发下载
|
||||
document.body.appendChild(downloadLink);
|
||||
downloadLink.click();
|
||||
document.body.removeChild(downloadLink);
|
||||
try {
|
||||
console.log(res);
|
||||
setIsLoading(true);
|
||||
fileDownload({
|
||||
text: res,
|
||||
type: 'text/csv',
|
||||
filename: 'data.csv'
|
||||
});
|
||||
} catch (error) {
|
||||
error;
|
||||
}
|
||||
setIsLoading(false);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -110,7 +110,7 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
<>
|
||||
<Flex>
|
||||
<Box fontWeight={'bold'} fontSize={'lg'} flex={1} mr={2}>
|
||||
模型数据: {total}组{' '}
|
||||
模型数据: {total}组
|
||||
<Box as={'span'} fontSize={'sm'}>
|
||||
(测试版本)
|
||||
</Box>
|
||||
@@ -128,7 +128,7 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
mr={2}
|
||||
size={'sm'}
|
||||
isLoading={isLoadingExport}
|
||||
title={'v2.3之前版本的数据无法导出'}
|
||||
title={'换行数据导出时,会进行格式转换'}
|
||||
onClick={() => onclickExport()}
|
||||
>
|
||||
导出
|
||||
@@ -148,9 +148,9 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
>
|
||||
手动输入
|
||||
</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectFileModal}>文件QA拆分</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectUrlModal}>网站内容QA拆分</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectJsonModal}>JSON导入</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectFileModal}>文本内容 QA 拆分</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectUrlModal}>网站内容 QA 拆分</MenuItem>
|
||||
<MenuItem onClick={onOpenSelectCsvModal}>csv 问答对导入</MenuItem>
|
||||
</MenuList>
|
||||
</Menu>
|
||||
</Flex>
|
||||
@@ -248,10 +248,10 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
|
||||
onSuccess={refetchData}
|
||||
/>
|
||||
)}
|
||||
{isOpenSelectJsonModal && (
|
||||
<SelectJsonModel
|
||||
{isOpenSelectCsvModal && (
|
||||
<SelectCsvModal
|
||||
modelId={model._id}
|
||||
onClose={onCloseSelectJsonModal}
|
||||
onClose={onCloseSelectCsvModal}
|
||||
onSuccess={refetchData}
|
||||
/>
|
||||
)}
|
||||
|
@@ -13,10 +13,14 @@ import {
|
||||
import { useToast } from '@/hooks/useToast';
|
||||
import { useSelectFile } from '@/hooks/useSelectFile';
|
||||
import { useConfirm } from '@/hooks/useConfirm';
|
||||
import { readTxtContent } from '@/utils/tools';
|
||||
import { readCsvContent } from '@/utils/file';
|
||||
import { useMutation } from '@tanstack/react-query';
|
||||
import { postModelDataJsonData } from '@/api/model';
|
||||
import Markdown from '@/components/Markdown';
|
||||
import { useMarkdown } from '@/hooks/useMarkdown';
|
||||
import { fileDownload } from '@/utils/file';
|
||||
|
||||
const csvTemplate = `question,answer\n"什么是 laf","laf 是一个云函数开发平台……"\n"什么是 sealos","Sealos 是以 kubernetes 为内核的云操作系统发行版,可以……"`;
|
||||
|
||||
const SelectJsonModal = ({
|
||||
onClose,
|
||||
@@ -29,7 +33,7 @@ const SelectJsonModal = ({
|
||||
}) => {
|
||||
const [selecting, setSelecting] = useState(false);
|
||||
const { toast } = useToast();
|
||||
const { File, onOpen } = useSelectFile({ fileType: '.json', multiple: true });
|
||||
const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: true });
|
||||
const [fileData, setFileData] = useState<
|
||||
{ prompt: string; completion: string; vector?: number[] }[]
|
||||
>([]);
|
||||
@@ -41,21 +45,12 @@ const SelectJsonModal = ({
|
||||
async (e: File[]) => {
|
||||
setSelecting(true);
|
||||
try {
|
||||
const jsonData = (
|
||||
await Promise.all(e.map((item) => readTxtContent(item).then((text) => JSON.parse(text))))
|
||||
).flat();
|
||||
// check 文件类型
|
||||
for (let i = 0; i < jsonData.length; i++) {
|
||||
if (!jsonData[i]?.prompt || !jsonData[i]?.completion) {
|
||||
throw new Error('缺少 prompt 或 completion');
|
||||
}
|
||||
}
|
||||
|
||||
setFileData(jsonData);
|
||||
const data = await Promise.all(e.map((item) => readCsvContent(item)));
|
||||
console.log(data);
|
||||
} catch (error: any) {
|
||||
console.log(error);
|
||||
toast({
|
||||
title: error?.message || 'JSON文件格式有误',
|
||||
title: error?.message || 'csv 文件格式有误',
|
||||
status: 'error'
|
||||
});
|
||||
}
|
||||
@@ -84,34 +79,36 @@ const SelectJsonModal = ({
|
||||
}
|
||||
});
|
||||
|
||||
const { data: intro } = useMarkdown({ url: '/csvSelect.md' });
|
||||
|
||||
return (
|
||||
<Modal isOpen={true} onClose={onClose} isCentered>
|
||||
<ModalOverlay />
|
||||
<ModalContent maxW={'90vw'} position={'relative'} m={0} h={'90vh'}>
|
||||
<ModalHeader>JSON数据集</ModalHeader>
|
||||
<ModalHeader>csv 问答对导入</ModalHeader>
|
||||
<ModalCloseButton />
|
||||
|
||||
<ModalBody h={'100%'} display={['block', 'flex']} fontSize={'sm'} overflowY={'auto'}>
|
||||
<Box flex={'2 0 0'} w={['100%', 0]} mr={[0, 4]} mb={[4, 0]}>
|
||||
<Markdown
|
||||
source={`接受一个对象数组,每个对象必须包含 prompt 和 completion 格式,可以包含vector。prompt 代表问题,completion 代表回答的内容,可以多个问题对应一个回答,vector 为 prompt 的向量,如果没有讲有系统生成。例如:
|
||||
~~~json
|
||||
[
|
||||
{
|
||||
"prompt":"sealos是什么?\\n介绍下sealos\\nsealos有什么用",
|
||||
"completion":"sealos是xxxxxx"
|
||||
},
|
||||
{
|
||||
"prompt":"laf是什么?",
|
||||
"completion":"laf是xxxxxx",
|
||||
"vector":[-0.42,-0.4314314,0.43143]
|
||||
}
|
||||
]
|
||||
~~~`}
|
||||
/>
|
||||
<Box flex={'1 0 0'} w={['100%', 0]} mr={[0, 4]} mb={[4, 0]}>
|
||||
<Markdown source={intro} />
|
||||
<Box
|
||||
my={3}
|
||||
cursor={'pointer'}
|
||||
textDecoration={'underline'}
|
||||
color={'blue.600'}
|
||||
onClick={() =>
|
||||
fileDownload({
|
||||
text: csvTemplate,
|
||||
type: 'text/csv',
|
||||
filename: 'template.csv'
|
||||
})
|
||||
}
|
||||
>
|
||||
点击下载csv模板
|
||||
</Box>
|
||||
<Flex alignItems={'center'}>
|
||||
<Button isLoading={selecting} onClick={onOpen}>
|
||||
选择 JSON 数据集
|
||||
选择 csv 问答对
|
||||
</Button>
|
||||
|
||||
<Box ml={4}>一共 {fileData.length} 组数据</Box>
|
@@ -16,7 +16,7 @@ import { useToast } from '@/hooks/useToast';
|
||||
import { useSelectFile } from '@/hooks/useSelectFile';
|
||||
import { encode } from 'gpt-token-utils';
|
||||
import { useConfirm } from '@/hooks/useConfirm';
|
||||
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/tools';
|
||||
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
|
||||
import { useMutation } from '@tanstack/react-query';
|
||||
import { postModelDataSplitData } from '@/api/model';
|
||||
import { formatPrice } from '@/utils/user';
|
||||
|
136
src/utils/file.ts
Normal file
136
src/utils/file.ts
Normal file
@@ -0,0 +1,136 @@
|
||||
import mammoth from 'mammoth';
|
||||
|
||||
/**
 * Read a file's content as text.
 *
 * Resolves with the text produced by `FileReader.readAsText`. Rejects with a
 * plain string message (not an `Error`) when the reader reports an error or
 * when creating/starting the reader throws.
 */
export const readTxtContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const txtReader = new FileReader();

      txtReader.onerror = (err) => {
        console.log('error txt read:', err);
        reject('读取 txt 文件失败');
      };
      txtReader.onload = () => resolve(txtReader.result as string);

      txtReader.readAsText(file);
    } catch (error) {
      // FileReader missing or unusable in this environment.
      reject('浏览器不支持文件内容读取');
    }
  });
|
||||
|
||||
/**
 * Read the text content of a PDF file.
 *
 * Loads the file into an ArrayBuffer, opens it through the library found at
 * `window['pdfjs-dist/build/pdf']` (presumably pdf.js loaded via a script tag —
 * confirm), and resolves with the text of every page joined by newlines.
 * Rejects with a plain string message on any failure.
 */
export const readPdfContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const pdfjsLib = window['pdfjs-dist/build/pdf'];
      pdfjsLib.workerSrc = '/js/pdf.worker.js';

      // Text of one page: the `str` of each text item, separated by spaces.
      const extractPageText = async (pdf: any, pageIndex: number) => {
        const pdfPage = await pdf.getPage(pageIndex);
        const content = await pdfPage.getTextContent();
        return content.items.map((item: any) => item.str).join(' ');
      };

      const fileReader = new FileReader();
      fileReader.readAsArrayBuffer(file);

      fileReader.onload = async (event) => {
        const buffer = event?.target?.result;
        if (!buffer) return reject('解析 PDF 失败');

        try {
          const pdf = await pdfjsLib.getDocument(buffer).promise;
          // Pages are numbered from 1; start every page read before awaiting any.
          const pending = Array.from({ length: pdf.numPages }, (_, i) =>
            extractPageText(pdf, i + 1)
          );
          const texts = await Promise.all(pending);
          resolve(texts.join('\n'));
        } catch (err) {
          console.log(err, 'pdfjs error');
          reject('解析 PDF 失败');
        }
      };

      fileReader.onerror = (err) => {
        console.log(err, 'reader error');
        reject('解析 PDF 失败');
      };
    } catch (error) {
      // Reader or PDF library unavailable in this environment.
      reject('浏览器不支持文件内容读取');
    }
  });
|
||||
|
||||
/**
 * Read the raw text of a doc file.
 *
 * Loads the file into an ArrayBuffer and passes it to
 * `mammoth.extractRawText`, resolving with the `value` of its result.
 * Rejects with a plain string message on any failure.
 */
export const readDocContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const docReader = new FileReader();
      docReader.readAsArrayBuffer(file);

      docReader.onerror = (err) => {
        console.log('error doc read:', err);

        reject('读取 doc 文件失败');
      };

      docReader.onload = async (event) => {
        const buffer = event.target?.result;
        if (!buffer) return reject('读取 doc 文件失败');

        try {
          const extracted = await mammoth.extractRawText({
            arrayBuffer: buffer as ArrayBuffer
          });
          resolve(extracted?.value);
        } catch (error) {
          reject('读取 doc 文件失败, 请转换成 PDF');
        }
      };
    } catch (error) {
      // FileReader missing or unusable in this environment.
      reject('浏览器不支持文件内容读取');
    }
  });
|
||||
|
||||
/**
 * Split CSV text into records of cells.
 *
 * Handles double-quoted cells (which may contain commas, line breaks and `""`
 * as an escaped quote) and LF, CRLF or CR record separators. An unterminated
 * quote swallows the rest of the input into the current cell.
 */
const parseCsvText = (text: string): string[][] => {
  const records: string[][] = [];
  let record: string[] = [];
  let cell = '';
  let inQuotes = false;

  for (let i = 0; i < text.length; i++) {
    const ch = text[i];

    if (inQuotes) {
      if (ch !== '"') {
        cell += ch;
      } else if (text[i + 1] === '"') {
        // `""` inside a quoted cell is a literal quote.
        cell += '"';
        i++;
      } else {
        inQuotes = false;
      }
      continue;
    }

    if (ch === '"') {
      inQuotes = true;
    } else if (ch === ',') {
      record.push(cell);
      cell = '';
    } else if (ch === '\n' || ch === '\r') {
      // Treat CRLF as a single record separator.
      if (ch === '\r' && text[i + 1] === '\n') i++;
      record.push(cell);
      records.push(record);
      record = [];
      cell = '';
    } else {
      cell += ch;
    }
  }

  // Flush the final record when the text does not end with a line break.
  if (cell !== '' || record.length > 0) {
    record.push(cell);
    records.push(record);
  }

  return records;
};

/**
 * Read and parse a csv file.
 *
 * The first non-blank record is used as the header (cells trimmed); the
 * remaining non-blank records are returned as rows of cell strings.
 *
 * @param file - csv file to read as text
 * @returns `{ header, data }` where `header` is the list of column names and
 *          `data` is one `string[]` per row
 * @throws Rejects with the string '解析 csv 文件失败' when the file cannot be
 *         read or contains no header record.
 */
export const readCsvContent = async (file: File) => {
  try {
    const text = await readTxtContent(file);

    // Drop a leading UTF-8 BOM so it does not end up in the first column name,
    // then discard records whose cells are all blank (e.g. empty lines).
    const records = parseCsvText(text.replace(/^\uFEFF/, '')).filter((cells) =>
      cells.some((item) => item.trim() !== '')
    );

    const headerRecord = records.shift();
    if (!headerRecord) {
      throw new Error('csv 格式错误');
    }

    return {
      header: headerRecord.map((item) => item.trim()),
      data: records
    };
  } catch (error) {
    return Promise.reject('解析 csv 文件失败');
  }
};
|
||||
|
||||
/**
 * Trigger a browser download of in-memory text.
 *
 * Wraps `text` in a Blob of MIME type `${type};charset=utf-8`, points a
 * temporary `<a download>` element at an object URL for it, and clicks the
 * link. The link is always removed again, and the object URL is revoked so the
 * blob can be garbage-collected.
 *
 * @param text - file content
 * @param type - MIME type without charset, e.g. `text/csv`
 * @param filename - name suggested to the browser for the saved file
 */
export const fileDownload = ({
  text,
  type,
  filename
}: {
  text: string;
  type: string;
  filename: string;
}) => {
  const blob = new Blob([text], { type: `${type};charset=utf-8` });
  const objectUrl = window.URL.createObjectURL(blob);

  const downloadLink = document.createElement('a');
  downloadLink.href = objectUrl;
  downloadLink.download = filename;

  // The link has to be attached to the page while it is clicked.
  document.body.appendChild(downloadLink);
  try {
    downloadLink.click();
  } finally {
    document.body.removeChild(downloadLink);
    // Revoke on the next tick, after the click has been dispatched; without
    // this every download keeps its blob alive until the page unloads.
    setTimeout(() => window.URL.revokeObjectURL(objectUrl), 0);
  }
};
|
@@ -1,6 +1,5 @@
|
||||
import crypto from 'crypto';
|
||||
import { useToast } from '@/hooks/useToast';
|
||||
import mammoth from 'mammoth';
|
||||
|
||||
/**
|
||||
* copy text data
|
||||
@@ -34,11 +33,17 @@ export const useCopyData = () => {
|
||||
};
|
||||
};
|
||||
|
||||
/**
 * Hash a password.
 *
 * @returns the hex-encoded SHA-256 digest of `text`
 */
export const createHashPassword = (text: string) =>
  crypto.createHash('sha256').update(text).digest('hex');
|
||||
|
||||
/**
|
||||
* 对象转成 query 字符串
|
||||
*/
|
||||
export const Obj2Query = (obj: Record<string, string | number>) => {
|
||||
const queryParams = new URLSearchParams();
|
||||
for (const key in obj) {
|
||||
@@ -47,86 +52,6 @@ export const Obj2Query = (obj: Record<string, string | number>) => {
|
||||
return queryParams.toString();
|
||||
};
|
||||
|
||||
/**
|
||||
* 读取 txt 文件内容
|
||||
*/
|
||||
export const readTxtContent = (file: File) => {
|
||||
return new Promise((resolve: (_: string) => void, reject) => {
|
||||
const reader = new FileReader();
|
||||
reader.onload = () => {
|
||||
resolve(reader.result as string);
|
||||
};
|
||||
reader.onerror = (err) => {
|
||||
console.log('error txt read:', err);
|
||||
reject('读取 txt 文件失败');
|
||||
};
|
||||
reader.readAsText(file);
|
||||
});
|
||||
};
|
||||
|
||||
/**
|
||||
* 读取 pdf 内容
|
||||
*/
|
||||
export const readPdfContent = (file: File) =>
|
||||
new Promise<string>((resolve, reject) => {
|
||||
const pdfjsLib = window['pdfjs-dist/build/pdf'];
|
||||
pdfjsLib.workerSrc = '/js/pdf.worker.js';
|
||||
|
||||
const readPDFPage = async (doc: any, pageNo: number) => {
|
||||
const page = await doc.getPage(pageNo);
|
||||
const tokenizedText = await page.getTextContent();
|
||||
const pageText = tokenizedText.items.map((token: any) => token.str).join(' ');
|
||||
return pageText;
|
||||
};
|
||||
|
||||
let reader = new FileReader();
|
||||
reader.readAsArrayBuffer(file);
|
||||
reader.onload = async (event) => {
|
||||
if (!event?.target?.result) return reject('解析 PDF 失败');
|
||||
try {
|
||||
const doc = await pdfjsLib.getDocument(event.target.result).promise;
|
||||
const pageTextPromises = [];
|
||||
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
|
||||
pageTextPromises.push(readPDFPage(doc, pageNo));
|
||||
}
|
||||
const pageTexts = await Promise.all(pageTextPromises);
|
||||
resolve(pageTexts.join('\n'));
|
||||
} catch (err) {
|
||||
console.log(err, 'pdfjs error');
|
||||
reject('解析 PDF 失败');
|
||||
}
|
||||
};
|
||||
reader.onerror = (err) => {
|
||||
console.log(err, 'reader error');
|
||||
reject('解析 PDF 失败');
|
||||
};
|
||||
});
|
||||
|
||||
/**
|
||||
* 读取doc
|
||||
*/
|
||||
export const readDocContent = (file: File) =>
|
||||
new Promise<string>((resolve, reject) => {
|
||||
const reader = new FileReader();
|
||||
reader.readAsArrayBuffer(file);
|
||||
reader.onload = async ({ target }) => {
|
||||
if (!target?.result) return reject('读取 doc 文件失败');
|
||||
try {
|
||||
const res = await mammoth.extractRawText({
|
||||
arrayBuffer: target.result as ArrayBuffer
|
||||
});
|
||||
resolve(res?.value);
|
||||
} catch (error) {
|
||||
reject('读取 doc 文件失败, 请转换成 PDF');
|
||||
}
|
||||
};
|
||||
reader.onerror = (err) => {
|
||||
console.log('error doc read:', err);
|
||||
|
||||
reject('读取 doc 文件失败');
|
||||
};
|
||||
});
|
||||
|
||||
/**
|
||||
* 向量转成 float32 buffer 格式
|
||||
*/
|
||||
@@ -138,11 +63,18 @@ export const vectorToBuffer = (vector: number[]) => {
|
||||
return buffer;
|
||||
};
|
||||
|
||||
/**
 * Normalize a vector to a fixed length of 1536 elements.
 *
 * Longer vectors are truncated to their first 1536 elements; shorter vectors
 * are right-padded with zeros. The input array is not modified.
 *
 * @param vector - source numbers
 * @returns a new array of exactly 1536 numbers
 */
export const formatVector = (vector: number[]) => {
  const targetLength = 1536;
  const formattedVector = vector.slice(0, targetLength);

  // Pad only when the input was too short (the previous check was inverted and
  // therefore never padded anything).
  if (formattedVector.length < targetLength) {
    return formattedVector.concat(Array(targetLength - formattedVector.length).fill(0));
  }

  return formattedVector;
};
|
||||
|
||||
/**
 * Flatten a string onto one line.
 *
 * Surrounding whitespace (including leading/trailing line breaks) is trimmed
 * first, then every remaining line break — LF, CRLF or a lone CR — is replaced
 * with the literal two-character sequence `\n` (backslash + `n`).
 *
 * @param str - text that may span several lines
 * @returns the single-line form of `str`
 */
export const clearStrLineBreak = (str: string) => {
  // Trim before escaping: once a line break has been turned into literal
  // text, trim() can no longer strip it from the ends.
  return str.trim().replace(/\r\n|\r|\n/g, '\\n');
};
|
||||
|
Reference in New Issue
Block a user