perf: password special chars;feat: llm paragraph;perf: chunk setting params;perf: text splitter worker (#4984)

* perf: password special chars

* feat: llm paragraph;perf: chunk setting params

* perf: text splitter worker

* perf: get rawtext buffer

* fix: test

* fix: test

* doc

* min chunk size
This commit is contained in:
Archer
2025-06-10 00:05:54 +08:00
committed by GitHub
parent 068918a9ee
commit 01ff56b42b
41 changed files with 546 additions and 448 deletions

View File

@@ -0,0 +1,18 @@
import type { MessagePort } from 'worker_threads';
/**
 * Send a final result message through the given port and then exit.
 *
 * The message has the shape `{ type: status, data }`. When `parentPort` is
 * null nothing is posted, but `process.exit()` is still called.
 *
 * @param parentPort - Port to post the result on, or null if there is none.
 * @param status - Outcome tag; forwarded as the message's `type` field.
 * @param data - Payload forwarded unchanged as the message's `data` field.
 */
export const workerResponse = ({
  parentPort,
  status,
  data
}: {
  parentPort: MessagePort | null;
  status: 'success' | 'error';
  data: any;
}) => {
  if (parentPort) {
    const message = { type: status, data };
    parentPort.postMessage(message);
  }

  // Always terminate, whether or not a message could be posted.
  process.exit();
};

View File

@@ -0,0 +1,24 @@
import {
splitText2Chunks,
type SplitProps,
type SplitResponse
} from '@fastgpt/global/common/string/textSplitter';
import { runWorker, WorkerNameEnum } from './utils';
import type { ReadFileResponse } from './readFile/type';
import { isTestEnv } from '@fastgpt/global/common/system/constants';
/**
 * Split text into chunks.
 *
 * When `isTestEnv` is truthy the split is performed directly via
 * `splitText2Chunks` and its result is returned as-is; otherwise the work is
 * handed to `runWorker` under the `text2Chunks` worker name.
 *
 * @param props - Split options, forwarded unchanged to whichever path runs.
 */
export const text2Chunks = (props: SplitProps) =>
  // In the test environment no worker is started; the splitter is called directly.
  isTestEnv
    ? splitText2Chunks(props)
    : runWorker<SplitResponse>(WorkerNameEnum.text2Chunks, props);
/**
 * Hand a file buffer to the `readFile` worker via `runWorker`.
 *
 * @param props - `extension`, `encoding` and `buffer`, passed through unchanged.
 * @returns Whatever `runWorker<ReadFileResponse>` returns for the `readFile` worker.
 */
export const readRawContentFromBuffer = (props: {
  extension: string;
  encoding: string;
  buffer: Buffer;
}) => runWorker<ReadFileResponse>(WorkerNameEnum.readFile, props);

View File

@@ -1,19 +1,21 @@
import { parentPort } from 'worker_threads';
import { html2md } from './utils';
import { workerResponse } from '../controller';
/**
 * Worker message handler: converts the incoming HTML string to markdown with
 * `html2md` and reports the outcome through `workerResponse`.
 *
 * - On success the markdown is sent with status 'success'.
 * - If `html2md` throws, the caught error is sent with status 'error'.
 *
 * A missing or empty `html` field is treated as an empty string.
 * No explicit `process.exit()` follows: `workerResponse` from '../controller'
 * is expected to terminate the worker after posting — confirm if that helper changes.
 */
parentPort?.on('message', (params: { html: string }) => {
  try {
    const md = html2md(params?.html || '');

    workerResponse({
      parentPort,
      status: 'success',
      data: md
    });
  } catch (error) {
    workerResponse({
      parentPort,
      status: 'error',
      data: error
    });
  }
});

View File

@@ -7,6 +7,7 @@ import { readDocsFile } from './extension/docx';
import { readPptxRawText } from './extension/pptx';
import { readXlsxRawText } from './extension/xlsx';
import { readCsvRawText } from './extension/csv';
import { workerResponse } from '../controller';
parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
const read = async (params: ReadRawTextByBuffer) => {
@@ -41,17 +42,16 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
};
try {
parentPort?.postMessage({
type: 'success',
workerResponse({
parentPort,
status: 'success',
data: await read(newProps)
});
} catch (error) {
console.log(error);
parentPort?.postMessage({
type: 'error',
workerResponse({
parentPort,
status: 'error',
data: error
});
}
process.exit();
});

View File

@@ -0,0 +1,14 @@
import { parentPort } from 'worker_threads';
import type { SplitProps } from '@fastgpt/global/common/string/textSplitter';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { workerResponse } from '../controller';
/**
 * Worker message handler: runs `splitText2Chunks` on the received props and
 * reports the outcome through `workerResponse`.
 *
 * - On success the split result is sent with status 'success'.
 * - If splitting throws, the caught error is sent with status 'error', so the
 *   failure is reported through the same response channel as a success
 *   instead of escaping the handler as an uncaught exception.
 */
parentPort?.on('message', (props: SplitProps) => {
  try {
    const result = splitText2Chunks(props);

    workerResponse({
      parentPort,
      status: 'success',
      data: result
    });
  } catch (error) {
    workerResponse({
      parentPort,
      status: 'error',
      data: error
    });
  }
});

View File

@@ -6,7 +6,8 @@ export enum WorkerNameEnum {
readFile = 'readFile',
htmlStr2Md = 'htmlStr2Md',
countGptMessagesTokens = 'countGptMessagesTokens',
systemPluginRun = 'systemPluginRun'
systemPluginRun = 'systemPluginRun',
text2Chunks = 'text2Chunks'
}
export const getSafeEnv = () => {