perf: 文本拆分

2025-07-28 00:56:26 +00:00 · 2023-03-27 19:19:47 +08:00
parent af385b1b42
commit 7fb6f62cf6
1 changed files with 20 additions and 13 deletions
--- a/src/pages/api/data/splitData.ts
+++ b/src/pages/api/data/splitData.ts
@@ -8,11 +8,10 @@ import { generateAbstract } from '@/service/events/generateAbstract';
 /* 拆分数据成QA */
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  try {
-    let { text, dataId } = req.body as { text: string; dataId: string };
+    const { text, dataId } = req.body as { text: string; dataId: string };
    if (!text || !dataId) {
      throw new Error('参数错误');
    }
-    text = text.replace(/\n+/g, '\n');
    await connectToDatabase();

    const { authorization } = req.headers;
@@ -24,19 +23,27 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
    if (!DataRecord) {
      throw new Error('找不到数据集');
    }
+    const replaceText = text.replace(/[\r\n\\n]+/g, ' ');
+
+    // 文本拆分成 chunk
+    let chunks = replaceText.match(/[^!?.。]+[!?.。]/g) || [];

    const dataItems: any[] = [];
+    let splitText = '';

-    // 每 1000 字符一组
-    for (let i = 0; i <= text.length / 1000; i++) {
+    chunks.forEach((chunk) => {
+      splitText += chunk;
+      if (splitText.length >= 980) {
        dataItems.push({
          userId,
          dataId,
          type: DataRecord.type,
-        text: text.slice(i * 1000, (i + 1) * 1000),
+          text: splitText,
          status: 1
        });
+        splitText = '';
      }
+    });

    // 批量插入数据
    await DataItem.insertMany(dataItems);
@@ -49,7 +56,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
    }

    jsonRes(res, {
-      data: ''
+      data: { chunks, replaceText }
    });
  } catch (err) {
    jsonRes(res, {