From 2a597964a265620dcdea473689d9da40a4b20161 Mon Sep 17 00:00:00 2001 From: archer <545436317@qq.com> Date: Mon, 10 Apr 2023 20:39:27 +0800 Subject: [PATCH] =?UTF-8?q?perf:=20csv=E5=AF=BC=E5=85=A5=E5=AF=BC=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package.json | 2 ++ pnpm-lock.yaml | 18 +++++++++++ src/api/model.ts | 8 ++--- src/pages/api/model/data/exportModelData.ts | 6 ++-- ...shModelDataJson.ts => pushModelDataCsv.ts} | 11 +++---- .../model/detail/components/ModelDataCard.tsx | 8 +++-- .../detail/components/SelectCsvModal.tsx | 30 ++++++++++++------- src/utils/file.ts | 15 ++++++---- src/utils/tools.ts | 2 +- 9 files changed, 66 insertions(+), 34 deletions(-) rename src/pages/api/model/data/{pushModelDataJson.ts => pushModelDataCsv.ts} (85%) diff --git a/package.json b/package.json index 6374a030f..183248fb6 100644 --- a/package.json +++ b/package.json @@ -36,6 +36,7 @@ "nodemailer": "^6.9.1", "nprogress": "^0.2.0", "openai": "^3.2.1", + "papaparse": "^5.4.1", "react": "18.2.0", "react-dom": "18.2.0", "react-hook-form": "^7.43.1", @@ -58,6 +59,7 @@ "@types/lodash": "^4.14.191", "@types/node": "18.14.0", "@types/nodemailer": "^6.4.7", + "@types/papaparse": "^5.3.7", "@types/react": "18.0.28", "@types/react-dom": "18.0.11", "@types/react-syntax-highlighter": "^15.5.6", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e955b1419..20336d4c8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -14,6 +14,7 @@ specifiers: '@types/node': 18.14.0 '@types/nodemailer': ^6.4.7 '@types/nprogress': ^0.2.0 + '@types/papaparse': ^5.3.7 '@types/react': 18.0.28 '@types/react-dom': 18.0.11 '@types/react-syntax-highlighter': ^15.5.6 @@ -41,6 +42,7 @@ specifiers: nodemailer: ^6.9.1 nprogress: ^0.2.0 openai: ^3.2.1 + papaparse: ^5.4.1 prettier: ^2.8.4 react: 18.2.0 react-dom: 18.2.0 @@ -84,6 +86,7 @@ dependencies: nodemailer: registry.npmmirror.com/nodemailer/6.9.1 nprogress: registry.npmmirror.com/nprogress/0.2.0 openai: registry.npmmirror.com/openai/3.2.1 + papaparse: registry.npmmirror.com/papaparse/5.4.1 react: registry.npmmirror.com/react/18.2.0 react-dom: registry.npmmirror.com/react-dom/18.2.0_react@18.2.0 react-hook-form: registry.npmmirror.com/react-hook-form/7.43.1_react@18.2.0 @@ -106,6 +109,7 @@ devDependencies: '@types/lodash': registry.npmmirror.com/@types/lodash/4.14.191 '@types/node': registry.npmmirror.com/@types/node/18.14.0 '@types/nodemailer': registry.npmmirror.com/@types/nodemailer/6.4.7 + '@types/papaparse': registry.npmmirror.com/@types/papaparse/5.3.7 '@types/react': registry.npmmirror.com/@types/react/18.0.28 '@types/react-dom': registry.npmmirror.com/@types/react-dom/18.0.11 '@types/react-syntax-highlighter': registry.npmmirror.com/@types/react-syntax-highlighter/15.5.6 @@ -5046,6 +5050,14 @@ packages: version: 0.2.0 dev: false + registry.npmmirror.com/@types/papaparse/5.3.7: + resolution: {integrity: sha512-f2HKmlnPdCvS0WI33WtCs5GD7X1cxzzS/aduaxSu3I7TbhWlENjSPs6z5TaB9K0J+BH1jbmqTaM+ja5puis4wg==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@types/papaparse/-/papaparse-5.3.7.tgz} + name: '@types/papaparse' + version: 5.3.7 + dependencies: + '@types/node': registry.npmmirror.com/@types/node/18.14.0 + dev: true + registry.npmmirror.com/@types/parse-json/4.0.0: resolution: {integrity: sha512-//oorEZjL6sbPcKUaCdIGlIUeH26mgzimjBB77G6XRgnDl/L5wOnpyBGRe/Mmf5CVW3PwEBE1NjiMZ/ssFh4wA==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@types/parse-json/-/parse-json-4.0.0.tgz} name: '@types/parse-json' @@ -9571,6 +9583,12 @@ packages: version: 1.0.11 dev: false + registry.npmmirror.com/papaparse/5.4.1: + resolution: {integrity: sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/papaparse/-/papaparse-5.4.1.tgz} + name: papaparse + version: 5.4.1 + dev: false + registry.npmmirror.com/parent-module/1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/parent-module/-/parent-module-1.0.1.tgz} name: parent-module diff --git a/src/api/model.ts b/src/api/model.ts index 8f53e32c7..286553f4d 100644 --- a/src/api/model.ts +++ b/src/api/model.ts @@ -60,7 +60,7 @@ export const getModelDataList = (props: GetModelDataListProps) => * 获取导出数据(不分页) */ export const getExportDataList = (modelId: string) => - GET(`/model/data/exportModelData?modelId=${modelId}`); + GET<[string, string][]>(`/model/data/exportModelData?modelId=${modelId}`); /** * 获取模型正在拆分数据的数量 @@ -90,10 +90,8 @@ export const postModelDataSplitData = (data: { modelId: string; text: string; pr /** * json导入数据 */ -export const postModelDataJsonData = ( - modelId: string, - jsonData: { prompt: string; completion: string; vector?: number[] }[] -) => POST(`/model/data/pushModelDataJson`, { modelId, data: jsonData }); +export const postModelDataCsvData = (modelId: string, data: string[][]) => + POST(`/model/data/pushModelDataCsv`, { modelId, data: data }); /** * 更新模型数据 diff --git a/src/pages/api/model/data/exportModelData.ts b/src/pages/api/model/data/exportModelData.ts index 3b511eb6e..5f4c86d29 100644 --- a/src/pages/api/model/data/exportModelData.ts +++ b/src/pages/api/model/data/exportModelData.ts @@ -41,16 +41,16 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< } ); - let str = `question,answer\n`; + const data: [string, string][] = []; searchRes.documents.forEach((item: any) => { if (item.value.q && item.value.text) { - str += `"${clearStrLineBreak(item.value.q)}","${clearStrLineBreak(item.value.text)}"\n`; + data.push([clearStrLineBreak(item.value.q), clearStrLineBreak(item.value.text)]); } }); jsonRes(res, { - data: str.slice(0, str.length - 1) + data }); } catch (err) { jsonRes(res, { diff --git a/src/pages/api/model/data/pushModelDataJson.ts b/src/pages/api/model/data/pushModelDataCsv.ts similarity index 85% rename from src/pages/api/model/data/pushModelDataJson.ts rename to src/pages/api/model/data/pushModelDataCsv.ts index 8231acb5f..d060941bc 100644 --- a/src/pages/api/model/data/pushModelDataJson.ts +++ b/src/pages/api/model/data/pushModelDataCsv.ts @@ -13,7 +13,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< try { const { modelId, data } = req.body as { modelId: string; - data: { prompt: string; completion: string; vector?: number[] }[]; + data: string[][]; }; const { authorization } = req.headers; @@ -44,8 +44,6 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< // 插入 redis const insertRedisRes = await Promise.allSettled( data.map((item) => { - const vector = item.vector; - return redis.sendCommand([ 'HMSET', `${VecModelDataPrefix}:${nanoid()}`, @@ -53,13 +51,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< userId, 'modelId', String(modelId), - ...(vector ? ['vector', vectorToBuffer(formatVector(vector))] : []), 'q', - item.prompt, + item[0], 'text', - item.completion, + item[1], 'status', - vector ? ModelDataStatusEnum.ready : ModelDataStatusEnum.waiting + ModelDataStatusEnum.waiting ]); }) ); diff --git a/src/pages/model/detail/components/ModelDataCard.tsx b/src/pages/model/detail/components/ModelDataCard.tsx index 2abe88a6f..c4055cbf8 100644 --- a/src/pages/model/detail/components/ModelDataCard.tsx +++ b/src/pages/model/detail/components/ModelDataCard.tsx @@ -33,6 +33,7 @@ import { fileDownload } from '@/utils/file'; import dynamic from 'next/dynamic'; import { useMutation, useQuery } from '@tanstack/react-query'; import type { FormData as InputDataType } from './InputDataModal'; +import Papa from 'papaparse'; const InputModel = dynamic(() => import('./InputDataModal')); const SelectFileModel = dynamic(() => import('./SelectFileModal')); @@ -92,10 +93,13 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => { mutationFn: () => getExportDataList(model._id), onSuccess(res) { try { - console.log(res); setIsLoading(true); + const text = Papa.unparse({ + fields: ['question', 'answer'], + data: res + }); fileDownload({ - text: res, + text, type: 'text/csv', filename: 'data.csv' }); diff --git a/src/pages/model/detail/components/SelectCsvModal.tsx b/src/pages/model/detail/components/SelectCsvModal.tsx index 7473d3035..da01d03e1 100644 --- a/src/pages/model/detail/components/SelectCsvModal.tsx +++ b/src/pages/model/detail/components/SelectCsvModal.tsx @@ -15,7 +15,7 @@ import { useSelectFile } from '@/hooks/useSelectFile'; import { useConfirm } from '@/hooks/useConfirm'; import { readCsvContent } from '@/utils/file'; import { useMutation } from '@tanstack/react-query'; -import { postModelDataJsonData } from '@/api/model'; +import { postModelDataCsvData } from '@/api/model'; import Markdown from '@/components/Markdown'; import { useMarkdown } from '@/hooks/useMarkdown'; import { fileDownload } from '@/utils/file'; @@ -33,20 +33,22 @@ const SelectJsonModal = ({ }) => { const [selecting, setSelecting] = useState(false); const { toast } = useToast(); - const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: true }); - const [fileData, setFileData] = useState< - { prompt: string; completion: string; vector?: number[] }[] - >([]); + const { File, onOpen } = useSelectFile({ fileType: '.csv', multiple: false }); + const [fileData, setFileData] = useState([]); const { openConfirm, ConfirmChild } = useConfirm({ content: '确认导入该数据集?' }); const onSelectFile = useCallback( async (e: File[]) => { + const file = e[0]; setSelecting(true); try { - const data = await Promise.all(e.map((item) => readCsvContent(item))); - console.log(data); + const { header, data } = await readCsvContent(file); + if (header[0] !== 'question' || header[1] !== 'answer') { + throw new Error('csv 文件格式有误'); + } + setFileData(data); } catch (error: any) { console.log(error); toast({ @@ -62,8 +64,7 @@ const SelectJsonModal = ({ const { mutate, isLoading } = useMutation({ mutationFn: async () => { if (!fileData) return; - const res = await postModelDataJsonData(modelId, fileData); - console.log(res); + await postModelDataCsvData(modelId, fileData); toast({ title: '导入数据成功,需要一段时间训练', status: 'success' @@ -115,7 +116,16 @@ const SelectJsonModal = ({ - {JSON.stringify(fileData)} + {fileData.map((item, index) => ( + + + Q{index + 1}. {item[0]} + + + A{index + 1}. {item[1]} + + + ))} diff --git a/src/utils/file.ts b/src/utils/file.ts index 427486102..6962b3123 100644 --- a/src/utils/file.ts +++ b/src/utils/file.ts @@ -1,4 +1,5 @@ import mammoth from 'mammoth'; +import Papa from 'papaparse'; /** * 读取 txt 文件内容 @@ -97,13 +98,15 @@ export const readDocContent = (file: File) => */ export const readCsvContent = async (file: File) => { try { - const textArr = (await readTxtContent(file)).split('\n'); - const header = textArr.shift()?.split(','); - if (!header) { - throw new Error('csv 格式错误'); + const textArr = await readTxtContent(file); + const json = Papa.parse(textArr).data as string[][]; + if (json.length === 0) { + throw new Error('csv 解析失败'); } - // 拆分每一行数据 - const data = []; + return { + header: json.shift()?.filter((item) => item) as string[], + data: json.map((item) => item?.filter((item) => item)) + }; } catch (error) { return Promise.reject('解析 csv 文件失败'); } diff --git a/src/utils/tools.ts b/src/utils/tools.ts index 05589cd49..0225b1b85 100644 --- a/src/utils/tools.ts +++ b/src/utils/tools.ts @@ -76,5 +76,5 @@ export const formatVector = (vector: number[]) => { * 字符串清理,替换换行符号 */ export const clearStrLineBreak = (str: string) => { - return str.replace(/\n/g, '\n').replace(/\n/g, '\\n').trim(); + return str.replace(/\n+/g, '\n').replace(/\n/g, '\\n').trim(); };