perf: password special chars; feat: llm paragraph; perf: chunk setting params; perf: text splitter worker (#4984)

* perf: password special chars

* feat: llm paragraph;perf: chunk setting params

* perf: text splitter worker

* perf: get rawtext buffer

* fix: test

* fix: test

* doc

* min chunk size
This commit is contained in:
Archer
2025-06-10 00:05:54 +08:00
committed by GitHub
parent 068918a9ee
commit 01ff56b42b
41 changed files with 546 additions and 448 deletions

View File

@@ -3,9 +3,9 @@ export const checkPasswordRule = (password: string) => {
/\d/, // Contains digits
/[a-z]/, // Contains lowercase letters
/[A-Z]/, // Contains uppercase letters
/[!@#$%^&*()_+=-]/ // Contains special characters
/[!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]/ // Contains special characters
];
const validChars = /^[\dA-Za-z!@#$%^&*()_+=-]{8,100}$/;
const validChars = /^[\dA-Za-z!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]{8,100}$/;
// Check length and valid characters
if (!validChars.test(password)) return false;

View File

@@ -1,10 +1,11 @@
import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
import { getErrText } from '../error/utils';
import { simpleText } from './tools';
import { getTextValidLength } from './utils';
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
type SplitProps = {
export type SplitProps = {
text: string;
chunkSize: number;
@@ -19,7 +20,7 @@ export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
chunkSize?: number;
};
type SplitResponse = {
export type SplitResponse = {
chunks: string[];
chars: number;
};
@@ -474,7 +475,10 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => {
});
return {
chunks: splitResult.map((item) => item.chunks).flat(),
chunks: splitResult
.map((item) => item.chunks)
.flat()
.map((chunk) => simpleText(chunk)),
chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
};
};

View File

@@ -7,3 +7,4 @@ export const DEFAULT_ORG_AVATAR = '/imgs/avatar/defaultOrgAvatar.svg';
export const DEFAULT_USER_AVATAR = '/imgs/avatar/BlueAvatar.svg';
export const isProduction = process.env.NODE_ENV === 'production';
export const isTestEnv = process.env.NODE_ENV === 'test';

View File

@@ -211,7 +211,8 @@ export enum DataChunkSplitModeEnum {
}
export enum ParagraphChunkAIModeEnum {
auto = 'auto',
force = 'force'
force = 'force',
forbid = 'forbid'
}
/* ------------ data -------------- */

View File

@@ -3,8 +3,11 @@ import { type EmbeddingModelItemType, type LLMModelItemType } from '../../../cor
import {
ChunkSettingModeEnum,
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum
DatasetCollectionDataProcessModeEnum,
ParagraphChunkAIModeEnum
} from '../constants';
import type { ChunkSettingsType } from '../type';
import { cloneDeep } from 'lodash';
export const minChunkSize = 64; // min index and chunk size
@@ -103,53 +106,78 @@ export const getIndexSizeSelectList = (max = 512) => {
};
// Compute
export const computeChunkSize = (params: {
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
export const computedCollectionChunkSettings = <T extends ChunkSettingsType>({
llmModel,
vectorModel,
...data
}: {
llmModel?: LLMModelItemType;
chunkSize?: number;
}) => {
if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return getLLMDefaultChunkSize(params.llmModel);
vectorModel?: EmbeddingModelItemType;
} & T) => {
const {
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
chunkSettingMode = ChunkSettingModeEnum.auto,
chunkSplitMode,
chunkSize,
paragraphChunkDeep = 5,
indexSize,
autoIndexes
} = data;
const cloneChunkSettings = cloneDeep(data);
if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
delete cloneChunkSettings.qaPrompt;
}
// Format training type indexSize/chunkSize
const trainingModeSize: {
autoChunkSize: number;
autoIndexSize: number;
chunkSize?: number;
indexSize?: number;
} = (() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return {
autoChunkSize: getLLMDefaultChunkSize(llmModel),
autoIndexSize: getMaxIndexSize(vectorModel),
chunkSize,
indexSize: getMaxIndexSize(vectorModel)
};
} else if (autoIndexes) {
return {
autoChunkSize: chunkAutoChunkSize,
autoIndexSize: getAutoIndexSize(vectorModel),
chunkSize,
indexSize
};
} else {
return {
autoChunkSize: chunkAutoChunkSize,
autoIndexSize: getAutoIndexSize(vectorModel),
chunkSize,
indexSize
};
}
})();
if (chunkSettingMode === ChunkSettingModeEnum.auto) {
cloneChunkSettings.chunkSplitMode = DataChunkSplitModeEnum.paragraph;
cloneChunkSettings.paragraphChunkAIMode = ParagraphChunkAIModeEnum.forbid;
cloneChunkSettings.paragraphChunkDeep = 5;
cloneChunkSettings.paragraphChunkMinSize = 100;
cloneChunkSettings.chunkSize = trainingModeSize.autoChunkSize;
cloneChunkSettings.indexSize = trainingModeSize.autoIndexSize;
cloneChunkSettings.chunkSplitter = undefined;
} else {
// chunk
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return chunkAutoChunkSize;
}
cloneChunkSettings.paragraphChunkDeep =
chunkSplitMode === DataChunkSplitModeEnum.paragraph ? paragraphChunkDeep : 0;
cloneChunkSettings.chunkSize = trainingModeSize.chunkSize
? Math.min(trainingModeSize.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(llmModel))
: undefined;
cloneChunkSettings.indexSize = trainingModeSize.indexSize;
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.char) {
return getLLMMaxChunkSize(params.llmModel);
}
return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
};
export const computeChunkSplitter = (params: {
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSplitter?: string;
}) => {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return undefined;
}
if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) {
return undefined;
}
return params.chunkSplitter;
};
export const computeParagraphChunkDeep = (params: {
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
paragraphChunkDeep?: number;
}) => {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return 5;
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
return params.paragraphChunkDeep;
}
return 0;
return cloneChunkSettings;
};

View File

@@ -15,9 +15,11 @@
"next": "14.2.28",
"openai": "4.61.0",
"openapi-types": "^12.1.3",
"timezones-list": "^3.0.2"
"timezones-list": "^3.0.2",
"lodash": "^4.17.21"
},
"devDependencies": {
"@types/lodash": "^4.14.191",
"@types/js-yaml": "^4.0.9",
"@types/node": "20.14.0"
}