Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-27 00:17:31 +00:00)
perf: password special chars; feat: llm paragraph; perf: chunk setting params; perf: text splitter worker (#4984)
* perf: password special chars
* feat: llm paragraph; perf: chunk setting params
* perf: text splitter worker
* perf: get rawtext buffer
* fix: test
* fix: test
* doc
* min chunk size
```diff
@@ -3,9 +3,9 @@ export const checkPasswordRule = (password: string) => {
     /\d/, // Contains digits
     /[a-z]/, // Contains lowercase letters
     /[A-Z]/, // Contains uppercase letters
-    /[!@#$%^&*()_+=-]/ // Contains special characters
+    /[!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]/ // Contains special characters
   ];
-  const validChars = /^[\dA-Za-z!@#$%^&*()_+=-]{8,100}$/;
+  const validChars = /^[\dA-Za-z!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]{8,100}$/;
 
   // Check length and valid characters
   if (!validChars.test(password)) return false;
```
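For illustration, here is a minimal sketch of how the widened character set behaves. The rule array and `validChars` pattern are copied from the hunk above; the standalone wrapper function and the "at least two character classes" threshold are assumptions about the parts of `checkPasswordRule` this hunk does not show.

```ts
// Sketch only: the regexes come from the diff; the >= 2 rule threshold is an
// assumption about the unchanged remainder of checkPasswordRule.
const passwordRules = [
  /\d/, // Contains digits
  /[a-z]/, // Contains lowercase letters
  /[A-Z]/, // Contains uppercase letters
  /[!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]/ // Contains special characters
];
const validChars = /^[\dA-Za-z!@#$%^&*()_+=.,:;?\/\\|`~"'<>{}\[\]-]{8,100}$/;

const checkPasswordSketch = (password: string) => {
  // Check length and valid characters first
  if (!validChars.test(password)) return false;
  // Then require a minimum number of matched character classes (assumed threshold)
  return passwordRules.filter((rule) => rule.test(password)).length >= 2;
};

// Characters such as "?", "[" or "~" were rejected by the old charset and are accepted now:
console.log(checkPasswordSketch('Fastgpt2024?')); // true under these assumptions
```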
```diff
@@ -1,10 +1,11 @@
+import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
 import { getErrText } from '../error/utils';
 import { simpleText } from './tools';
 import { getTextValidLength } from './utils';
 
 export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
 
-type SplitProps = {
+export type SplitProps = {
   text: string;
   chunkSize: number;
@@ -19,7 +20,7 @@ export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
   chunkSize?: number;
 };
 
-type SplitResponse = {
+export type SplitResponse = {
   chunks: string[];
   chars: number;
 };
@@ -474,7 +475,10 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => {
   });
 
   return {
-    chunks: splitResult.map((item) => item.chunks).flat(),
+    chunks: splitResult
+      .map((item) => item.chunks)
+      .flat()
+      .map((chunk) => simpleText(chunk)),
     chars: splitResult.reduce((sum, item) => sum + item.chars, 0)
   };
 };
```
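A hedged usage sketch of the splitter after this change: the returned chunks are now mapped through `simpleText` before they reach the caller, so downstream code receives pre-cleaned text. The import path and any extra optional props are assumptions; only `text`/`chunkSize` and the `chunks`/`chars` response shape are visible in the hunks above.

```ts
// Usage sketch; the module path is an assumption about where the splitter lives.
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

const rawText = `# Title

First paragraph of the document.

Second paragraph, long enough to be split into its own chunk.`;

const { chunks, chars } = splitText2Chunks({
  text: rawText,
  chunkSize: 512
});

// Each chunk has already been passed through simpleText by the splitter itself,
// so callers no longer need to normalize the text again.
console.log(chunks.length, chars);
```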
```diff
@@ -7,3 +7,4 @@ export const DEFAULT_ORG_AVATAR = '/imgs/avatar/defaultOrgAvatar.svg';
 export const DEFAULT_USER_AVATAR = '/imgs/avatar/BlueAvatar.svg';
 
 export const isProduction = process.env.NODE_ENV === 'production';
+export const isTestEnv = process.env.NODE_ENV === 'test';
```
```diff
@@ -211,7 +211,8 @@ export enum DataChunkSplitModeEnum {
 }
 export enum ParagraphChunkAIModeEnum {
   auto = 'auto',
-  force = 'force'
+  force = 'force',
+  forbid = 'forbid'
 }
 
 /* ------------ data -------------- */
```
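A small sketch of how the extended enum might be consumed. The semantics in the comments (force always uses the LLM for paragraph splitting, forbid never does, auto defers to the pipeline) are inferred from the member names and from the auto preset further down, which pins `paragraphChunkAIMode` to `forbid`; they are assumptions, not code from this diff.

```ts
// Hypothetical helper, assuming this enum gates LLM-based paragraph splitting.
import { ParagraphChunkAIModeEnum } from '@fastgpt/global/core/dataset/constants';

const shouldUseLLMParagraphSplit = (
  mode: ParagraphChunkAIModeEnum,
  pipelineSaysYes: boolean
): boolean => {
  switch (mode) {
    case ParagraphChunkAIModeEnum.force:
      return true; // always ask the LLM to split paragraphs
    case ParagraphChunkAIModeEnum.forbid:
      return false; // never ask the LLM (the value the auto preset selects)
    case ParagraphChunkAIModeEnum.auto:
    default:
      return pipelineSaysYes; // defer to the pipeline's own heuristic
  }
};
```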
```diff
@@ -3,8 +3,11 @@ import { type EmbeddingModelItemType, type LLMModelItemType } from '../../../cor
 import {
   ChunkSettingModeEnum,
   DataChunkSplitModeEnum,
-  DatasetCollectionDataProcessModeEnum
+  DatasetCollectionDataProcessModeEnum,
+  ParagraphChunkAIModeEnum
 } from '../constants';
+import type { ChunkSettingsType } from '../type';
+import { cloneDeep } from 'lodash';
 
 export const minChunkSize = 64; // min index and chunk size
@@ -103,53 +106,78 @@ export const getIndexSizeSelectList = (max = 512) => {
 };
 
 // Compute
-export const computeChunkSize = (params: {
-  trainingType: DatasetCollectionDataProcessModeEnum;
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
+export const computedCollectionChunkSettings = <T extends ChunkSettingsType>({
+  llmModel,
+  vectorModel,
+  ...data
+}: {
   llmModel?: LLMModelItemType;
-  chunkSize?: number;
-}) => {
-  if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
-    if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
-      return getLLMDefaultChunkSize(params.llmModel);
+  vectorModel?: EmbeddingModelItemType;
+} & T) => {
+  const {
+    trainingType = DatasetCollectionDataProcessModeEnum.chunk,
+    chunkSettingMode = ChunkSettingModeEnum.auto,
+    chunkSplitMode,
+    chunkSize,
+    paragraphChunkDeep = 5,
+    indexSize,
+    autoIndexes
+  } = data;
+  const cloneChunkSettings = cloneDeep(data);
+
+  if (trainingType !== DatasetCollectionDataProcessModeEnum.qa) {
+    delete cloneChunkSettings.qaPrompt;
+  }
+
+  // Format training type indexSize/chunkSize
+  const trainingModeSize: {
+    autoChunkSize: number;
+    autoIndexSize: number;
+    chunkSize?: number;
+    indexSize?: number;
+  } = (() => {
+    if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
+      return {
+        autoChunkSize: getLLMDefaultChunkSize(llmModel),
+        autoIndexSize: getMaxIndexSize(vectorModel),
+        chunkSize,
+        indexSize: getMaxIndexSize(vectorModel)
+      };
+    } else if (autoIndexes) {
+      return {
+        autoChunkSize: chunkAutoChunkSize,
+        autoIndexSize: getAutoIndexSize(vectorModel),
+        chunkSize,
+        indexSize
+      };
+    } else {
+      return {
+        autoChunkSize: chunkAutoChunkSize,
+        autoIndexSize: getAutoIndexSize(vectorModel),
+        chunkSize,
+        indexSize
+      };
     }
+  })();
+
+  if (chunkSettingMode === ChunkSettingModeEnum.auto) {
+    cloneChunkSettings.chunkSplitMode = DataChunkSplitModeEnum.paragraph;
+    cloneChunkSettings.paragraphChunkAIMode = ParagraphChunkAIModeEnum.forbid;
+    cloneChunkSettings.paragraphChunkDeep = 5;
+    cloneChunkSettings.paragraphChunkMinSize = 100;
+    cloneChunkSettings.chunkSize = trainingModeSize.autoChunkSize;
+    cloneChunkSettings.indexSize = trainingModeSize.autoIndexSize;
+
+    cloneChunkSettings.chunkSplitter = undefined;
   } else {
-    // chunk
-    if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
-      return chunkAutoChunkSize;
-    }
+    cloneChunkSettings.paragraphChunkDeep =
+      chunkSplitMode === DataChunkSplitModeEnum.paragraph ? paragraphChunkDeep : 0;
+
+    cloneChunkSettings.chunkSize = trainingModeSize.chunkSize
+      ? Math.min(trainingModeSize.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(llmModel))
+      : undefined;
+    cloneChunkSettings.indexSize = trainingModeSize.indexSize;
   }
 
-  if (params.chunkSplitMode === DataChunkSplitModeEnum.char) {
-    return getLLMMaxChunkSize(params.llmModel);
-  }
-
-  return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
-};
-export const computeChunkSplitter = (params: {
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
-  chunkSplitter?: string;
-}) => {
-  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
-    return undefined;
-  }
-  if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) {
-    return undefined;
-  }
-  return params.chunkSplitter;
-};
-export const computeParagraphChunkDeep = (params: {
-  chunkSettingMode?: ChunkSettingModeEnum;
-  chunkSplitMode?: DataChunkSplitModeEnum;
-  paragraphChunkDeep?: number;
-}) => {
-  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
-    return 5;
-  }
-  if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
-    return params.paragraphChunkDeep;
-  }
-  return 0;
+  return cloneChunkSettings;
 };
```
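A call-site sketch for the new helper: it takes the raw collection chunk settings (plus optional LLM/embedding models) and returns a cloned, normalized copy. Auto mode applies the paragraph preset and model-derived sizes; manual mode clamps `chunkSize` to the LLM maximum and zeroes `paragraphChunkDeep` for non-paragraph splitting. The import paths and the sample values are assumptions; the parameter shape and behaviour come from the hunk above.

```ts
// Call-site sketch; module paths and the sample values are assumptions.
import { computedCollectionChunkSettings } from '@fastgpt/global/core/dataset/training/utils';
import {
  ChunkSettingModeEnum,
  DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';

// llmModel/vectorModel are optional in the signature and omitted here, so the
// auto sizes fall back to the helpers' defaults.
const formatted = computedCollectionChunkSettings({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkSettingMode: ChunkSettingModeEnum.auto,
  chunkSize: 2000,
  indexSize: 512
});

// In auto mode the returned clone carries the paragraph preset:
// chunkSplitMode = paragraph, paragraphChunkAIMode = forbid, paragraphChunkDeep = 5,
// paragraphChunkMinSize = 100, plus auto chunk/index sizes; chunkSplitter is cleared.
console.log(formatted.chunkSize, formatted.indexSize);
```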
```diff
@@ -15,9 +15,11 @@
     "next": "14.2.28",
     "openai": "4.61.0",
     "openapi-types": "^12.1.3",
-    "timezones-list": "^3.0.2"
+    "timezones-list": "^3.0.2",
+    "lodash": "^4.17.21"
   },
   "devDependencies": {
+    "@types/lodash": "^4.14.191",
     "@types/js-yaml": "^4.0.9",
     "@types/node": "20.14.0"
   }
```