perf: 文本拆分

2025-07-28 09:03:53 +00:00 · 2023-03-27 19:19:47 +08:00
parent af385b1b42
commit 7fb6f62cf6
1 changed files with 20 additions and 13 deletions
--- a/src/pages/api/data/splitData.ts
+++ b/src/pages/api/data/splitData.ts
@@ -8,11 +8,10 @@ import { generateAbstract } from '@/service/events/generateAbstract';
 /* 拆分数据成QA */
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
-    let { text, dataId } = req.body as { text: string; dataId: string };
+    const { text, dataId } = req.body as { text: string; dataId: string };
    if (!text || !dataId) {
      throw new Error('参数错误');
    }
    text = text.replace(/\n+/g, '\n');
    await connectToDatabase();
    const { authorization } = req.headers;
@@ -24,19 +23,27 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
    if (!DataRecord) {
      throw new Error('找不到数据集');
    }
    const replaceText = text.replace(/[\r\n\\n]+/g, ' ');
    // 文本拆分成 chunk
    let chunks = replaceText.match(/[^!?.。]+[!?.。]/g) || [];
    const dataItems: any[] = [];
    let splitText = '';
-    // 每 1000 字符一组
+    chunks.forEach((chunk) => {
-    for (let i = 0; i <= text.length / 1000; i++) {
+      splitText += chunk;
-      dataItems.push({
+      if (splitText.length >= 980) {
-        userId,
+        dataItems.push({
-        dataId,
+          userId,
-        type: DataRecord.type,
+          dataId,
-        text: text.slice(i * 1000, (i + 1) * 1000),
+          type: DataRecord.type,
-        status: 1
+          text: splitText,
-      });
+          status: 1
-    }
+        });
        splitText = '';
      }
    });
    // 批量插入数据
    await DataItem.insertMany(dataItems);
@@ -49,7 +56,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
    }
    jsonRes(res, {
-      data: ''
+      data: { chunks, replaceText }
    });
  } catch (err) {
    jsonRes(res, {