perf: csv文件选择

This commit is contained in:
archer
2023-04-10 19:47:03 +08:00
parent 0c55beb72d
commit c1d3a46dc7
7 changed files with 222 additions and 148 deletions

5
public/docs/csvSelect.md Normal file
View File

@@ -0,0 +1,5 @@
接受一个 csv 文件,表格头包含 question 和 answer。question 代表问题,answer 代表答案。
| question | answer |
| --- | --- |
| 什么是 laf | laf 是一个云函数开发平台…… |
| 什么是 sealos | Sealos 是以 kubernetes 为内核的云操作系统发行版,可以…… |

View File

@@ -4,6 +4,7 @@ import { connectToDatabase } from '@/service/mongo';
import { authToken } from '@/service/utils/tools';
import { connectRedis } from '@/service/redis';
import { VecModelDataIdx } from '@/constants/redis';
import { clearStrLineBreak } from '@/utils/tools';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try {
@@ -40,13 +41,16 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
}
);
const data = searchRes.documents.map((item: any) => ({
prompt: item.value.q,
completion: item.value.text
}));
let str = `question,answer\n`;
searchRes.documents.forEach((item: any) => {
if (item.value.q && item.value.text) {
str += `"${clearStrLineBreak(item.value.q)}","${clearStrLineBreak(item.value.text)}"\n`;
}
});
jsonRes(res, {
data: JSON.stringify(data)
data: str.slice(0, str.length - 1)
});
} catch (err) {
jsonRes(res, {

View File

@@ -28,8 +28,8 @@ import {
getExportDataList
} from '@/api/model';
import { DeleteIcon, RepeatIcon, EditIcon } from '@chakra-ui/icons';
import { useToast } from '@/hooks/useToast';
import { useLoading } from '@/hooks/useLoading';
import { fileDownload } from '@/utils/file';
import dynamic from 'next/dynamic';
import { useMutation, useQuery } from '@tanstack/react-query';
import type { FormData as InputDataType } from './InputDataModal';
@@ -37,10 +37,10 @@ import type { FormData as InputDataType } from './InputDataModal';
const InputModel = dynamic(() => import('./InputDataModal'));
const SelectFileModel = dynamic(() => import('./SelectFileModal'));
const SelectUrlModel = dynamic(() => import('./SelectUrlModal'));
const SelectJsonModel = dynamic(() => import('./SelectJsonModal'));
const SelectCsvModal = dynamic(() => import('./SelectCsvModal'));
const ModelDataCard = ({ model }: { model: ModelSchema }) => {
const { Loading } = useLoading();
const { Loading, setIsLoading } = useLoading();
const {
data: modelDataList,
@@ -70,9 +70,9 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
onClose: onCloseSelectUrlModal
} = useDisclosure();
const {
isOpen: isOpenSelectJsonModal,
onOpen: onOpenSelectJsonModal,
onClose: onCloseSelectJsonModal
isOpen: isOpenSelectCsvModal,
onOpen: onOpenSelectCsvModal,
onClose: onCloseSelectCsvModal
} = useDisclosure();
const { data: splitDataLen, refetch } = useQuery(['getModelSplitDataList'], () =>
@@ -91,18 +91,18 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
const { mutate: onclickExport, isLoading: isLoadingExport } = useMutation({
mutationFn: () => getExportDataList(model._id),
onSuccess(res) {
// 导出为文件
const blob = new Blob([res], { type: 'application/json;charset=utf-8' });
// 创建下载链接
const downloadLink = document.createElement('a');
downloadLink.href = window.URL.createObjectURL(blob);
downloadLink.download = `data.json`;
// 添加链接到页面并触发下载
document.body.appendChild(downloadLink);
downloadLink.click();
document.body.removeChild(downloadLink);
try {
console.log(res);
setIsLoading(true);
fileDownload({
text: res,
type: 'text/csv',
filename: 'data.csv'
});
} catch (error) {
error;
}
setIsLoading(false);
}
});
@@ -110,7 +110,7 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
<>
<Flex>
<Box fontWeight={'bold'} fontSize={'lg'} flex={1} mr={2}>
: {total}{' '}
: {total}
<Box as={'span'} fontSize={'sm'}>
</Box>
@@ -128,7 +128,7 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
mr={2}
size={'sm'}
isLoading={isLoadingExport}
title={'v2.3之前版本的数据无法导出'}
title={'换行数据导出时,会进行格式转换'}
onClick={() => onclickExport()}
>
@@ -148,9 +148,9 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
>
</MenuItem>
<MenuItem onClick={onOpenSelectFileModal}>QA拆</MenuItem>
<MenuItem onClick={onOpenSelectFileModal}> QA </MenuItem>
<MenuItem onClick={onOpenSelectUrlModal}> QA </MenuItem>
<MenuItem onClick={onOpenSelectJsonModal}>JSON导</MenuItem>
<MenuItem onClick={onOpenSelectCsvModal}>csv </MenuItem>
</MenuList>
</Menu>
</Flex>
@@ -248,10 +248,10 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
onSuccess={refetchData}
/>
)}
{isOpenSelectJsonModal && (
<SelectJsonModel
{isOpenSelectCsvModal && (
<SelectCsvModal
modelId={model._id}
onClose={onCloseSelectJsonModal}
onClose={onCloseSelectCsvModal}
onSuccess={refetchData}
/>
)}

View File

@@ -13,10 +13,14 @@ import {
import { useToast } from '@/hooks/useToast';
import { useSelectFile } from '@/hooks/useSelectFile';
import { useConfirm } from '@/hooks/useConfirm';
import { readTxtContent } from '@/utils/tools';
import { readCsvContent } from '@/utils/file';
import { useMutation } from '@tanstack/react-query';
import { postModelDataJsonData } from '@/api/model';
import Markdown from '@/components/Markdown';
import { useMarkdown } from '@/hooks/useMarkdown';
import { fileDownload } from '@/utils/file';
// CSV text handed to fileDownload as 'template.csv' by the template link in this modal:
// a `question,answer` header line followed by two double-quoted sample rows.
const csvTemplate = `question,answer\n"什么是 laf","laf 是一个云函数开发平台……"\n"什么是 sealos","Sealos 是以 kubernetes 为内核的云操作系统发行版,可以……"`;
const SelectJsonModal = ({
onClose,
@@ -29,7 +33,7 @@ const SelectJsonModal = ({
}) => {
const [selecting, setSelecting] = useState(false);
const { toast } = useToast();
const { File, onOpen } = useSelectFile({ fileType: '.json', multiple: true });
const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: true });
const [fileData, setFileData] = useState<
{ prompt: string; completion: string; vector?: number[] }[]
>([]);
@@ -41,21 +45,12 @@ const SelectJsonModal = ({
async (e: File[]) => {
setSelecting(true);
try {
const jsonData = (
await Promise.all(e.map((item) => readTxtContent(item).then((text) => JSON.parse(text))))
).flat();
// check 文件类型
for (let i = 0; i < jsonData.length; i++) {
if (!jsonData[i]?.prompt || !jsonData[i]?.completion) {
throw new Error('缺少 prompt 或 completion');
}
}
setFileData(jsonData);
const data = await Promise.all(e.map((item) => readCsvContent(item)));
console.log(data);
} catch (error: any) {
console.log(error);
toast({
title: error?.message || 'JSON文件格式有误',
title: error?.message || 'csv 文件格式有误',
status: 'error'
});
}
@@ -84,34 +79,36 @@ const SelectJsonModal = ({
}
});
const { data: intro } = useMarkdown({ url: '/csvSelect.md' });
return (
<Modal isOpen={true} onClose={onClose} isCentered>
<ModalOverlay />
<ModalContent maxW={'90vw'} position={'relative'} m={0} h={'90vh'}>
<ModalHeader>JSON数据集</ModalHeader>
<ModalHeader>csv </ModalHeader>
<ModalCloseButton />
<ModalBody h={'100%'} display={['block', 'flex']} fontSize={'sm'} overflowY={'auto'}>
<Box flex={'2 0 0'} w={['100%', 0]} mr={[0, 4]} mb={[4, 0]}>
<Markdown
source={`接受一个对象数组,每个对象必须包含 prompt 和 completion 格式可以包含vector。prompt 代表问题completion 代表回答的内容可以多个问题对应一个回答vector 为 prompt 的向量,如果没有讲有系统生成。例如:
~~~json
[
{
"prompt":"sealos是什么?\\n介绍下sealos\\nsealos有什么用",
"completion":"sealos是xxxxxx"
},
{
"prompt":"laf是什么?",
"completion":"laf是xxxxxx",
"vector":[-0.42,-0.4314314,0.43143]
<Box flex={'1 0 0'} w={['100%', 0]} mr={[0, 4]} mb={[4, 0]}>
<Markdown source={intro} />
<Box
my={3}
cursor={'pointer'}
textDecoration={'underline'}
color={'blue.600'}
onClick={() =>
fileDownload({
text: csvTemplate,
type: 'text/csv',
filename: 'template.csv'
})
}
]
~~~`}
/>
>
csv模板
</Box>
<Flex alignItems={'center'}>
<Button isLoading={selecting} onClick={onOpen}>
JSON
csv
</Button>
<Box ml={4}> {fileData.length} </Box>

View File

@@ -16,7 +16,7 @@ import { useToast } from '@/hooks/useToast';
import { useSelectFile } from '@/hooks/useSelectFile';
import { encode } from 'gpt-token-utils';
import { useConfirm } from '@/hooks/useConfirm';
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/tools';
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
import { useMutation } from '@tanstack/react-query';
import { postModelDataSplitData } from '@/api/model';
import { formatPrice } from '@/utils/user';

136
src/utils/file.ts Normal file
View File

@@ -0,0 +1,136 @@
import mammoth from 'mammoth';
/**
 * Read a file as text through `FileReader.readAsText`.
 *
 * Resolves with the reader's result. Rejects with a plain string message
 * (not an `Error`): one message when the reader reports an error, and a
 * different one when creating or starting the reader throws.
 */
export const readTxtContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const fileReader = new FileReader();
      fileReader.onload = () => resolve(fileReader.result as string);
      fileReader.onerror = (event) => {
        console.log('error txt read:', event);
        reject('读取 txt 文件失败');
      };
      fileReader.readAsText(file);
    } catch (error) {
      // Reached when FileReader is unavailable or readAsText throws synchronously.
      reject('浏览器不支持文件内容读取');
    }
  });
/**
 * Extract the text of a PDF file.
 *
 * Uses the object found at `window['pdfjs-dist/build/pdf']` to open the file's
 * bytes. The text items of each page are joined with single spaces and the
 * pages are joined with newlines. Rejects with a plain string message when the
 * read or the parse fails, or when the synchronous setup throws.
 */
export const readPdfContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const pdfjsLib = window['pdfjs-dist/build/pdf'];
      pdfjsLib.workerSrc = '/js/pdf.worker.js';

      // Text of a single page; page numbers start at 1.
      const extractPageText = async (pdf: any, pageNumber: number) => {
        const page = await pdf.getPage(pageNumber);
        const { items } = await page.getTextContent();
        return items.map((item: any) => item.str).join(' ');
      };

      const fileReader = new FileReader();
      fileReader.readAsArrayBuffer(file);
      fileReader.onload = async (event) => {
        const buffer = event?.target?.result;
        if (!buffer) {
          return reject('解析 PDF 失败');
        }
        try {
          const pdf = await pdfjsLib.getDocument(buffer).promise;
          // Start every page extraction up front, then wait for all of them.
          const pending = Array.from({ length: pdf.numPages }, (_, index) =>
            extractPageText(pdf, index + 1)
          );
          const texts = await Promise.all(pending);
          resolve(texts.join('\n'));
        } catch (err) {
          console.log(err, 'pdfjs error');
          reject('解析 PDF 失败');
        }
      };
      fileReader.onerror = (err) => {
        console.log(err, 'reader error');
        reject('解析 PDF 失败');
      };
    } catch (error) {
      reject('浏览器不支持文件内容读取');
    }
  });
/**
 * Extract the raw text of a doc file.
 *
 * Reads the file into an ArrayBuffer and passes it to `mammoth.extractRawText`,
 * resolving with the `value` of its result. Rejects with a plain string message
 * when the read or the extraction fails, or when the synchronous setup throws.
 */
export const readDocContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    try {
      const fileReader = new FileReader();
      fileReader.readAsArrayBuffer(file);
      fileReader.onload = async ({ target }) => {
        const buffer = target?.result;
        if (!buffer) {
          return reject('读取 doc 文件失败');
        }
        try {
          const extracted = await mammoth.extractRawText({
            arrayBuffer: buffer as ArrayBuffer
          });
          resolve(extracted?.value);
        } catch (error) {
          reject('读取 doc 文件失败, 请转换成 PDF');
        }
      };
      fileReader.onerror = (event) => {
        console.log('error doc read:', event);
        reject('读取 doc 文件失败');
      };
    } catch (error) {
      reject('浏览器不支持文件内容读取');
    }
  });
/**
 * Split CSV text into rows of cells.
 *
 * Supports double-quoted cells (which may contain commas, line breaks and `""`
 * as an escaped quote) and LF, CRLF or lone CR as row terminators.
 */
const parseCsvRows = (text: string): string[][] => {
  const rows: string[][] = [];
  let row: string[] = [];
  let cell = '';
  let quoted = false;

  for (let i = 0; i < text.length; i++) {
    const char = text[i];

    if (quoted) {
      if (char !== '"') {
        cell += char;
      } else if (text[i + 1] === '"') {
        // `""` inside a quoted cell is a literal quote.
        cell += '"';
        i++;
      } else {
        quoted = false;
      }
      continue;
    }

    if (char === '"') {
      quoted = true;
    } else if (char === ',') {
      row.push(cell);
      cell = '';
    } else if (char === '\n' || char === '\r') {
      // Treat CRLF as a single row terminator.
      if (char === '\r' && text[i + 1] === '\n') i++;
      row.push(cell);
      rows.push(row);
      row = [];
      cell = '';
    } else {
      cell += char;
    }
  }

  // Flush the last row when the text does not end with a line break.
  if (cell !== '' || row.length > 0) {
    row.push(cell);
    rows.push(row);
  }
  return rows;
};

/**
 * Read a csv file and split it into a header row and data rows.
 *
 * A leading UTF-8 byte-order mark is dropped and rows whose cells are all
 * blank are ignored. The first remaining row is returned as `header` (cells
 * trimmed); the rest are returned untouched as `data`.
 *
 * @returns `{ header, data }` where `data[i][j]` is the cell in column `j` of
 * data row `i`.
 * Rejects with the string '解析 csv 文件失败' when the file cannot be read or
 * contains no header row.
 */
export const readCsvContent = async (file: File) => {
  try {
    const text = (await readTxtContent(file)).replace(/^\uFEFF/, '');
    const rows = parseCsvRows(text).filter((cells) => cells.some((value) => value.trim() !== ''));

    const header = rows.shift()?.map((value) => value.trim());
    if (!header) {
      throw new Error('csv 格式错误');
    }

    return { header, data: rows };
  } catch (error) {
    return Promise.reject('解析 csv 文件失败');
  }
};
/**
 * Trigger a browser download of in-memory text.
 *
 * Wraps `text` in a Blob typed `${type};charset=utf-8`, points a temporary
 * `<a download>` element at an object URL for it and clicks the element.
 *
 * @param text     File contents.
 * @param type     MIME type without the charset suffix, e.g. `text/csv`.
 * @param filename Name suggested to the browser for the saved file.
 */
export const fileDownload = ({
  text,
  type,
  filename
}: {
  text: string;
  type: string;
  filename: string;
}) => {
  const blob = new Blob([text], { type: `${type};charset=utf-8` });
  const objectUrl = window.URL.createObjectURL(blob);

  const downloadLink = document.createElement('a');
  downloadLink.href = objectUrl;
  downloadLink.download = filename;

  // The link has to be in the document for the click to start a download.
  document.body.appendChild(downloadLink);
  try {
    downloadLink.click();
  } finally {
    document.body.removeChild(downloadLink);
    // Release the blob once the click has been dispatched; without this every
    // download keeps its blob alive until the page is unloaded.
    setTimeout(() => window.URL.revokeObjectURL(objectUrl), 0);
  }
};

View File

@@ -1,6 +1,5 @@
import crypto from 'crypto';
import { useToast } from '@/hooks/useToast';
import mammoth from 'mammoth';
/**
* copy text data
@@ -34,11 +33,17 @@ export const useCopyData = () => {
};
};
/**
 * Hash a password: returns the SHA-256 digest of `text` as a hex string.
 */
export const createHashPassword = (text: string) =>
  crypto.createHash('sha256').update(text).digest('hex');
/**
* 对象转成 query 字符串
*/
export const Obj2Query = (obj: Record<string, string | number>) => {
const queryParams = new URLSearchParams();
for (const key in obj) {
@@ -47,86 +52,6 @@ export const Obj2Query = (obj: Record<string, string | number>) => {
return queryParams.toString();
};
/**
 * Read a file's contents as text via `FileReader.readAsText`.
 *
 * Resolves with the reader's result and rejects with a plain string message
 * when the reader reports an error.
 */
export const readTxtContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    const fileReader = new FileReader();

    fileReader.onload = () => resolve(fileReader.result as string);
    fileReader.onerror = (event) => {
      console.log('error txt read:', event);
      reject('读取 txt 文件失败');
    };

    fileReader.readAsText(file);
  });
/**
 * Extract the text of a PDF file.
 *
 * Opens the file's bytes with the object found at
 * `window['pdfjs-dist/build/pdf']`. Text items within a page are joined with
 * single spaces and pages are joined with newlines. Read and parse failures
 * reject with a plain string message.
 */
export const readPdfContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    const pdfjsLib = window['pdfjs-dist/build/pdf'];
    pdfjsLib.workerSrc = '/js/pdf.worker.js';

    // Text of a single page; page numbers start at 1.
    const extractPageText = async (pdf: any, pageNumber: number) => {
      const page = await pdf.getPage(pageNumber);
      const { items } = await page.getTextContent();
      return items.map((item: any) => item.str).join(' ');
    };

    const fileReader = new FileReader();
    fileReader.readAsArrayBuffer(file);
    fileReader.onload = async (event) => {
      const buffer = event?.target?.result;
      if (!buffer) {
        return reject('解析 PDF 失败');
      }
      try {
        const pdf = await pdfjsLib.getDocument(buffer).promise;
        const pending = Array.from({ length: pdf.numPages }, (_, index) =>
          extractPageText(pdf, index + 1)
        );
        const texts = await Promise.all(pending);
        resolve(texts.join('\n'));
      } catch (err) {
        console.log(err, 'pdfjs error');
        reject('解析 PDF 失败');
      }
    };
    fileReader.onerror = (err) => {
      console.log(err, 'reader error');
      reject('解析 PDF 失败');
    };
  });
/**
 * Extract the raw text of a doc file.
 *
 * Reads the file into an ArrayBuffer, hands it to `mammoth.extractRawText` and
 * resolves with the `value` of its result. Read and extraction failures reject
 * with a plain string message.
 */
export const readDocContent = (file: File) =>
  new Promise<string>((resolve, reject) => {
    const fileReader = new FileReader();
    fileReader.readAsArrayBuffer(file);

    fileReader.onload = async ({ target }) => {
      const buffer = target?.result;
      if (!buffer) {
        return reject('读取 doc 文件失败');
      }
      try {
        const extracted = await mammoth.extractRawText({
          arrayBuffer: buffer as ArrayBuffer
        });
        resolve(extracted?.value);
      } catch (error) {
        reject('读取 doc 文件失败, 请转换成 PDF');
      }
    };

    fileReader.onerror = (event) => {
      console.log('error doc read:', event);
      reject('读取 doc 文件失败');
    };
  });
/**
* 向量转成 float32 buffer 格式
*/
@@ -138,11 +63,18 @@ export const vectorToBuffer = (vector: number[]) => {
return buffer;
};
/**
 * Normalize a vector to exactly 1536 elements.
 *
 * Longer vectors are truncated to their first 1536 elements; shorter ones are
 * right-padded with zeros. The input array is not modified.
 */
export const formatVector = (vector: number[]) => {
  const dimension = 1536;
  const formattedVector = vector.slice(0, dimension);
  // Pad when the vector is too SHORT. The previous check tested for vectors
  // that were too long, where nothing is left to pad after the slice.
  if (formattedVector.length < dimension) {
    return formattedVector.concat(Array(dimension - formattedVector.length).fill(0));
  }
  return formattedVector;
};
/**
 * Flatten a string onto a single line.
 *
 * Every line break (LF, CRLF or a lone CR) is replaced by the two characters
 * backslash + `n`, then leading and trailing whitespace is trimmed. A CRLF pair
 * counts as one line break, so no raw carriage return is left in the output.
 */
export const clearStrLineBreak = (str: string) => {
  return str.replace(/\r\n?|\n/g, '\\n').trim();
};