perf: password special chars;feat: llm paragraph;perf: chunk setting params;perf: text splitter worker (#4984)

* perf: password special chars

* feat: llm paragraph;perf: chunk setting params

* perf: text splitter worker

* perf: get rawtext buffer

* fix: test

* fix: test

* doc

* min chunk size
This commit is contained in:
Archer
2025-06-10 00:05:54 +08:00
committed by GitHub
parent 068918a9ee
commit 01ff56b42b
41 changed files with 546 additions and 448 deletions

View File

@@ -112,24 +112,15 @@ export async function pushDataListToTrainingQueue({
// format q and a, remove empty char
data = data.filter((item) => {
item.q = simpleText(item.q);
item.a = simpleText(item.a);
item.indexes = item.indexes
?.map((index) => {
return {
...index,
text: simpleText(index.text)
};
})
.filter(Boolean);
const q = item.q || '';
const a = item.a || '';
// filter repeat content
if (!item.imageId && !item.q) {
if (!item.imageId && !q) {
return;
}
const text = item.q + item.a;
const text = q + a;
// Oversize llm tokens
if (text.length > maxToken) {