perf: password special chars; feat: llm paragraph; perf: chunk setting params; perf: text splitter worker (#4984)

* perf: password special chars
* feat: llm paragraph;perf: chunk setting params
* perf: text splitter worker
* perf: get rawtext buffer
* fix: test
* fix: test
* doc
* min chunk size
2025-08-03 05:19:51 +00:00 · 2025-06-10 00:05:54 +08:00
parent 068918a9ee
commit 01ff56b42b
41 changed files with 546 additions and 448 deletions
--- a/packages/service/core/dataset/training/controller.ts
+++ b/packages/service/core/dataset/training/controller.ts
@@ -112,24 +112,15 @@ export async function pushDataListToTrainingQueue({

  // format q and a, remove empty char
  data = data.filter((item) => {
-    item.q = simpleText(item.q);
-    item.a = simpleText(item.a);
-
-    item.indexes = item.indexes
-      ?.map((index) => {
-        return {
-          ...index,
-          text: simpleText(index.text)
-        };
-      })
-      .filter(Boolean);
+    const q = item.q || '';
+    const a = item.a || '';

    // filter repeat content
-    if (!item.imageId && !item.q) {
+    if (!item.imageId && !q) {
      return;
    }

-    const text = item.q + item.a;
+    const text = q + a;

    // Oversize llm tokens
    if (text.length > maxToken) {