perf: csv导入导出

This commit is contained in:
archer
2023-04-10 20:39:27 +08:00
parent c1d3a46dc7
commit 2a597964a2
9 changed files with 66 additions and 34 deletions

View File

@@ -36,6 +36,7 @@
"nodemailer": "^6.9.1",
"nprogress": "^0.2.0",
"openai": "^3.2.1",
"papaparse": "^5.4.1",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-hook-form": "^7.43.1",
@@ -58,6 +59,7 @@
"@types/lodash": "^4.14.191",
"@types/node": "18.14.0",
"@types/nodemailer": "^6.4.7",
"@types/papaparse": "^5.3.7",
"@types/react": "18.0.28",
"@types/react-dom": "18.0.11",
"@types/react-syntax-highlighter": "^15.5.6",

18
pnpm-lock.yaml generated
View File

@@ -14,6 +14,7 @@ specifiers:
'@types/node': 18.14.0
'@types/nodemailer': ^6.4.7
'@types/nprogress': ^0.2.0
'@types/papaparse': ^5.3.7
'@types/react': 18.0.28
'@types/react-dom': 18.0.11
'@types/react-syntax-highlighter': ^15.5.6
@@ -41,6 +42,7 @@ specifiers:
nodemailer: ^6.9.1
nprogress: ^0.2.0
openai: ^3.2.1
papaparse: ^5.4.1
prettier: ^2.8.4
react: 18.2.0
react-dom: 18.2.0
@@ -84,6 +86,7 @@ dependencies:
nodemailer: registry.npmmirror.com/nodemailer/6.9.1
nprogress: registry.npmmirror.com/nprogress/0.2.0
openai: registry.npmmirror.com/openai/3.2.1
papaparse: registry.npmmirror.com/papaparse/5.4.1
react: registry.npmmirror.com/react/18.2.0
react-dom: registry.npmmirror.com/react-dom/18.2.0_react@18.2.0
react-hook-form: registry.npmmirror.com/react-hook-form/7.43.1_react@18.2.0
@@ -106,6 +109,7 @@ devDependencies:
'@types/lodash': registry.npmmirror.com/@types/lodash/4.14.191
'@types/node': registry.npmmirror.com/@types/node/18.14.0
'@types/nodemailer': registry.npmmirror.com/@types/nodemailer/6.4.7
'@types/papaparse': registry.npmmirror.com/@types/papaparse/5.3.7
'@types/react': registry.npmmirror.com/@types/react/18.0.28
'@types/react-dom': registry.npmmirror.com/@types/react-dom/18.0.11
'@types/react-syntax-highlighter': registry.npmmirror.com/@types/react-syntax-highlighter/15.5.6
@@ -5046,6 +5050,14 @@ packages:
version: 0.2.0
dev: false
registry.npmmirror.com/@types/papaparse/5.3.7:
resolution: {integrity: sha512-f2HKmlnPdCvS0WI33WtCs5GD7X1cxzzS/aduaxSu3I7TbhWlENjSPs6z5TaB9K0J+BH1jbmqTaM+ja5puis4wg==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@types/papaparse/-/papaparse-5.3.7.tgz}
name: '@types/papaparse'
version: 5.3.7
dependencies:
'@types/node': registry.npmmirror.com/@types/node/18.14.0
dev: true
registry.npmmirror.com/@types/parse-json/4.0.0:
resolution: {integrity: sha512-//oorEZjL6sbPcKUaCdIGlIUeH26mgzimjBB77G6XRgnDl/L5wOnpyBGRe/Mmf5CVW3PwEBE1NjiMZ/ssFh4wA==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@types/parse-json/-/parse-json-4.0.0.tgz}
name: '@types/parse-json'
@@ -9571,6 +9583,12 @@ packages:
version: 1.0.11
dev: false
registry.npmmirror.com/papaparse/5.4.1:
resolution: {integrity: sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/papaparse/-/papaparse-5.4.1.tgz}
name: papaparse
version: 5.4.1
dev: false
registry.npmmirror.com/parent-module/1.0.1:
resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/parent-module/-/parent-module-1.0.1.tgz}
name: parent-module

View File

@@ -60,7 +60,7 @@ export const getModelDataList = (props: GetModelDataListProps) =>
* 获取导出数据(不分页)
*/
export const getExportDataList = (modelId: string) =>
  // Fetches every exported row for the model in one unpaginated request.
  // The endpoint responds with [question, answer] pairs (not a pre-built CSV
  // string), so the caller is responsible for serialising them to CSV.
  GET<[string, string][]>(`/model/data/exportModelData?modelId=${modelId}`);
/**
* 获取模型正在拆分数据的数量
@@ -90,10 +90,8 @@ export const postModelDataSplitData = (data: { modelId: string; text: string; pr
/**
* json导入数据
*/
export const postModelDataJsonData = (
  modelId: string,
  jsonData: { prompt: string; completion: string; vector?: number[] }[]
) => {
  // Sends the prompt/completion records (with an optional vector each) to the
  // JSON push endpoint, keyed by the target model id.
  const payload = { modelId, data: jsonData };
  return POST(`/model/data/pushModelDataJson`, payload);
};
export const postModelDataCsvData = (modelId: string, data: string[][]) => {
  // Sends the parsed CSV rows (each row an array of cell strings) to the CSV
  // push endpoint, keyed by the target model id.
  return POST(`/model/data/pushModelDataCsv`, { modelId, data });
};
/**
* 更新模型数据

View File

@@ -41,16 +41,16 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
}
);
let str = `question,answer\n`;
const data: [string, string][] = [];
searchRes.documents.forEach((item: any) => {
if (item.value.q && item.value.text) {
str += `"${clearStrLineBreak(item.value.q)}","${clearStrLineBreak(item.value.text)}"\n`;
data.push([clearStrLineBreak(item.value.q), clearStrLineBreak(item.value.text)]);
}
});
jsonRes(res, {
data: str.slice(0, str.length - 1)
data
});
} catch (err) {
jsonRes(res, {

View File

@@ -13,7 +13,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
try {
const { modelId, data } = req.body as {
modelId: string;
data: { prompt: string; completion: string; vector?: number[] }[];
data: string[][];
};
const { authorization } = req.headers;
@@ -44,8 +44,6 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
// 插入 redis
const insertRedisRes = await Promise.allSettled(
data.map((item) => {
const vector = item.vector;
return redis.sendCommand([
'HMSET',
`${VecModelDataPrefix}:${nanoid()}`,
@@ -53,13 +51,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
userId,
'modelId',
String(modelId),
...(vector ? ['vector', vectorToBuffer(formatVector(vector))] : []),
'q',
item.prompt,
item[0],
'text',
item.completion,
item[1],
'status',
vector ? ModelDataStatusEnum.ready : ModelDataStatusEnum.waiting
ModelDataStatusEnum.waiting
]);
})
);

View File

@@ -33,6 +33,7 @@ import { fileDownload } from '@/utils/file';
import dynamic from 'next/dynamic';
import { useMutation, useQuery } from '@tanstack/react-query';
import type { FormData as InputDataType } from './InputDataModal';
import Papa from 'papaparse';
const InputModel = dynamic(() => import('./InputDataModal'));
const SelectFileModel = dynamic(() => import('./SelectFileModal'));
@@ -92,10 +93,13 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
mutationFn: () => getExportDataList(model._id),
onSuccess(res) {
try {
console.log(res);
setIsLoading(true);
const text = Papa.unparse({
fields: ['question', 'answer'],
data: res
});
fileDownload({
text: res,
text,
type: 'text/csv',
filename: 'data.csv'
});

View File

@@ -15,7 +15,7 @@ import { useSelectFile } from '@/hooks/useSelectFile';
import { useConfirm } from '@/hooks/useConfirm';
import { readCsvContent } from '@/utils/file';
import { useMutation } from '@tanstack/react-query';
import { postModelDataJsonData } from '@/api/model';
import { postModelDataCsvData } from '@/api/model';
import Markdown from '@/components/Markdown';
import { useMarkdown } from '@/hooks/useMarkdown';
import { fileDownload } from '@/utils/file';
@@ -33,20 +33,22 @@ const SelectJsonModal = ({
}) => {
const [selecting, setSelecting] = useState(false);
const { toast } = useToast();
const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: true });
const [fileData, setFileData] = useState<
{ prompt: string; completion: string; vector?: number[] }[]
>([]);
const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: false });
const [fileData, setFileData] = useState<string[][]>([]);
const { openConfirm, ConfirmChild } = useConfirm({
content: '确认导入该数据集?'
});
const onSelectFile = useCallback(
async (e: File[]) => {
const file = e[0];
setSelecting(true);
try {
const data = await Promise.all(e.map((item) => readCsvContent(item)));
console.log(data);
const { header, data } = await readCsvContent(file);
if (header[0] !== 'question' || header[1] !== 'answer') {
throw new Error('csv 文件格式有误');
}
setFileData(data);
} catch (error: any) {
console.log(error);
toast({
@@ -62,8 +64,7 @@ const SelectJsonModal = ({
const { mutate, isLoading } = useMutation({
mutationFn: async () => {
if (!fileData) return;
const res = await postModelDataJsonData(modelId, fileData);
console.log(res);
await postModelDataCsvData(modelId, fileData);
toast({
title: '导入数据成功,需要一段时间训练',
status: 'success'
@@ -115,7 +116,16 @@ const SelectJsonModal = ({
</Flex>
</Box>
<Box flex={'2 0 0'} h={'100%'} overflow={'auto'} p={2} backgroundColor={'blackAlpha.50'}>
{JSON.stringify(fileData)}
{fileData.map((item, index) => (
<Box key={index}>
<Box>
Q{index + 1}. {item[0]}
</Box>
<Box>
A{index + 1}. {item[1]}
</Box>
</Box>
))}
</Box>
</ModalBody>

View File

@@ -1,4 +1,5 @@
import mammoth from 'mammoth';
import Papa from 'papaparse';
/**
* 读取 txt 文件内容
@@ -97,13 +98,15 @@ export const readDocContent = (file: File) =>
*/
export const readCsvContent = async (file: File) => {
try {
const textArr = (await readTxtContent(file)).split('\n');
const header = textArr.shift()?.split(',');
if (!header) {
throw new Error('csv 格式错误');
const textArr = await readTxtContent(file);
const json = Papa.parse(textArr).data as string[][];
if (json.length === 0) {
throw new Error('csv 解析失败');
}
// 拆分每一行数据
const data = [];
return {
header: json.shift()?.filter((item) => item) as string[],
data: json.map((item) => item?.filter((item) => item))
};
} catch (error) {
return Promise.reject('解析 csv 文件失败');
}

View File

@@ -76,5 +76,5 @@ export const formatVector = (vector: number[]) => {
* 字符串清理,替换换行符号
*/
export const clearStrLineBreak = (str: string) => {
  // Collapse every run of consecutive line breaks into a single escaped
  // "\n" sequence (backslash + 'n') so the value stays on one physical line,
  // then strip leading/trailing whitespace from the result.
  // Note: trimming happens after escaping, so a trailing line break is kept
  // as a literal "\n" suffix rather than being removed.
  return str.replace(/\n+/g, '\\n').trim();
};