mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-22 12:20:34 +00:00
feat: chunk index independent config (#4271)
* sync collection * remove lock * feat: chunk index independent config * feat: add max chunksize to split chunk function * remove log * update doc * remove * remove log
This commit is contained in:
@@ -1,15 +1,17 @@
|
||||
import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
|
||||
import { getErrText } from '../error/utils';
|
||||
|
||||
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
|
||||
|
||||
type SplitProps = {
|
||||
text: string;
|
||||
chunkLen: number;
|
||||
chunkSize: number;
|
||||
maxSize?: number;
|
||||
overlapRatio?: number;
|
||||
customReg?: string[];
|
||||
};
|
||||
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
|
||||
chunkLen?: number;
|
||||
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
|
||||
chunkSize?: number;
|
||||
};
|
||||
|
||||
type SplitResponse = {
|
||||
@@ -55,7 +57,7 @@ const strIsMdTable = (str: string) => {
|
||||
return true;
|
||||
};
|
||||
const markdownTableSplit = (props: SplitProps): SplitResponse => {
|
||||
let { text = '', chunkLen } = props;
|
||||
let { text = '', chunkSize } = props;
|
||||
const splitText2Lines = text.split('\n');
|
||||
const header = splitText2Lines[0];
|
||||
const headerSize = header.split('|').length - 2;
|
||||
@@ -71,7 +73,7 @@ ${mdSplitString}
|
||||
`;
|
||||
|
||||
for (let i = 2; i < splitText2Lines.length; i++) {
|
||||
if (chunk.length + splitText2Lines[i].length > chunkLen * 1.2) {
|
||||
if (chunk.length + splitText2Lines[i].length > chunkSize * 1.2) {
|
||||
chunks.push(chunk);
|
||||
chunk = `${header}
|
||||
${mdSplitString}
|
||||
@@ -98,11 +100,17 @@ ${mdSplitString}
|
||||
5. 标点分割:重叠
|
||||
*/
|
||||
const commonSplit = (props: SplitProps): SplitResponse => {
|
||||
let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props;
|
||||
let {
|
||||
text = '',
|
||||
chunkSize,
|
||||
maxSize = defaultMaxChunkSize,
|
||||
overlapRatio = 0.15,
|
||||
customReg = []
|
||||
} = props;
|
||||
|
||||
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
|
||||
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
|
||||
const overlapLen = Math.round(chunkLen * overlapRatio);
|
||||
const overlapLen = Math.round(chunkSize * overlapRatio);
|
||||
|
||||
// replace code block all \n to codeBlockMarker
|
||||
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
|
||||
@@ -118,24 +126,24 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
||||
const stepReges: { reg: RegExp | string; maxLen: number }[] = [
|
||||
...customReg.map((text) => ({
|
||||
reg: text.replaceAll('\\n', '\n'),
|
||||
maxLen: chunkLen * 1.4
|
||||
maxLen: chunkSize
|
||||
})),
|
||||
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
|
||||
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
|
||||
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
|
||||
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
|
||||
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
|
||||
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
|
||||
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
|
||||
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
|
||||
|
||||
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
|
||||
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
|
||||
{ reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 },
|
||||
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
|
||||
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
|
||||
{ reg: /(\n{2,})/g, maxLen: chunkSize },
|
||||
{ reg: /([\n])/g, maxLen: chunkSize },
|
||||
// ------ There's no overlap on the top
|
||||
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /([;]|;\s)/g, maxLen: chunkLen * 1.6 },
|
||||
{ reg: /([,]|,\s)/g, maxLen: chunkLen * 2 }
|
||||
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkSize },
|
||||
{ reg: /([!]|!\s)/g, maxLen: chunkSize },
|
||||
{ reg: /([?]|\?\s)/g, maxLen: chunkSize },
|
||||
{ reg: /([;]|;\s)/g, maxLen: chunkSize },
|
||||
{ reg: /([,]|,\s)/g, maxLen: chunkSize }
|
||||
];
|
||||
|
||||
const customRegLen = customReg.length;
|
||||
@@ -203,7 +211,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
||||
/* Gets the overlap at the end of a text as the beginning of the next block */
|
||||
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
|
||||
const forbidOverlap = checkForbidOverlap(step);
|
||||
const maxOverlapLen = chunkLen * 0.4;
|
||||
const maxOverlapLen = chunkSize * 0.4;
|
||||
|
||||
// step >= stepReges.length: Do not overlap incomplete sentences
|
||||
if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return '';
|
||||
@@ -246,13 +254,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
||||
|
||||
// oversize
|
||||
if (step >= stepReges.length) {
|
||||
if (text.length < chunkLen * 3) {
|
||||
if (text.length < chunkSize * 3) {
|
||||
return [text];
|
||||
}
|
||||
// use slice-chunkLen to split text
|
||||
// use slice-chunkSize to split text
|
||||
const chunks: string[] = [];
|
||||
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
|
||||
chunks.push(text.slice(i, i + chunkLen));
|
||||
for (let i = 0; i < text.length; i += chunkSize - overlapLen) {
|
||||
chunks.push(text.slice(i, i + chunkSize));
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
@@ -260,8 +268,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
||||
// split text by special char
|
||||
const splitTexts = getSplitTexts({ text, step });
|
||||
|
||||
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
|
||||
const minChunkLen = chunkLen * 0.7;
|
||||
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
|
||||
const minChunkLen = chunkSize * 0.7;
|
||||
|
||||
const chunks: string[] = [];
|
||||
for (let i = 0; i < splitTexts.length; i++) {
|
||||
@@ -297,7 +305,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
||||
continue;
|
||||
}
|
||||
|
||||
// newText is too large(now, The lastText must be smaller than chunkLen)
|
||||
// newText is too large(now, The lastText must be smaller than chunkSize)
|
||||
if (newTextLen > maxLen) {
|
||||
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
|
||||
if (lastTextLen > minChunkLen) {
|
||||
@@ -352,7 +360,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
||||
|
||||
/* If the last chunk is independent, it needs to be push chunks. */
|
||||
if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) {
|
||||
if (lastText.length < chunkLen * 0.4) {
|
||||
if (lastText.length < chunkSize * 0.4) {
|
||||
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
|
||||
} else {
|
||||
chunks.push(lastText);
|
||||
@@ -386,9 +394,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
|
||||
|
||||
/**
|
||||
* text split into chunks
|
||||
* chunkLen - one chunk len. max: 3500
|
||||
* chunkSize - one chunk len. max: 3500
|
||||
* overlapLen - The size of the before and after Text
|
||||
* chunkLen > overlapLen
|
||||
* chunkSize > overlapLen
|
||||
* markdown
|
||||
*/
|
||||
export const splitText2Chunks = (props: SplitProps): SplitResponse => {
|
||||
|
13
packages/global/core/dataset/api.d.ts
vendored
13
packages/global/core/dataset/api.d.ts
vendored
@@ -1,5 +1,10 @@
|
||||
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
|
||||
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
|
||||
import {
|
||||
DatasetCollectionTypeEnum,
|
||||
DatasetCollectionDataProcessModeEnum,
|
||||
ChunkSettingModeEnum,
|
||||
DataChunkSplitModeEnum
|
||||
} from './constants';
|
||||
import type { LLMModelItemType } from '../ai/model.d';
|
||||
import { ParentIdType } from 'common/parentFolder/type';
|
||||
|
||||
@@ -33,7 +38,13 @@ export type DatasetCollectionChunkMetadataType = {
|
||||
trainingType?: DatasetCollectionDataProcessModeEnum;
|
||||
imageIndex?: boolean;
|
||||
autoIndexes?: boolean;
|
||||
|
||||
chunkSettingMode?: ChunkSettingModeEnum;
|
||||
chunkSplitMode?: DataChunkSplitModeEnum;
|
||||
|
||||
chunkSize?: number;
|
||||
indexSize?: number;
|
||||
|
||||
chunkSplitter?: string;
|
||||
qaPrompt?: string;
|
||||
metadata?: Record<string, any>;
|
||||
|
@@ -129,6 +129,16 @@ export const DatasetCollectionDataProcessModeMap = {
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * How the chunking parameters of a collection are chosen.
 *
 * - `auto`: built-in defaults are used; `computeChunkSize` derives the chunk
 *   size itself and `computeChunkSplitter` returns no custom splitter.
 * - `custom`: the caller-supplied split mode, chunk size and splitter are
 *   taken into account.
 */
export enum ChunkSettingModeEnum {
|
||||
auto = 'auto',
|
||||
custom = 'custom'
|
||||
}
|
||||
|
||||
/**
 * Strategy used to cut text into chunks when the chunk setting mode is custom.
 *
 * - `size`: split by chunk length; `computeChunkSplitter` discards any custom
 *   splitter in this mode.
 * - `char`: split on the configured splitter; `computeChunkSize` then returns
 *   the model's maximum chunk size instead of the requested size.
 */
export enum DataChunkSplitModeEnum {
|
||||
size = 'size',
|
||||
char = 'char'
|
||||
}
|
||||
|
||||
/* ------------ data -------------- */
|
||||
|
||||
/* ------------ training -------------- */
|
||||
|
1
packages/global/core/dataset/controller.d.ts
vendored
1
packages/global/core/dataset/controller.d.ts
vendored
@@ -13,6 +13,7 @@ export type CreateDatasetDataProps = {
|
||||
|
||||
export type UpdateDatasetDataProps = {
|
||||
dataId: string;
|
||||
|
||||
q?: string;
|
||||
a?: string;
|
||||
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & {
|
||||
|
@@ -15,6 +15,8 @@ export type PushDataToTrainingQueueProps = {
|
||||
vectorModel: string;
|
||||
vlmModel?: string;
|
||||
|
||||
indexSize?: number;
|
||||
|
||||
billId?: string;
|
||||
session?: ClientSession;
|
||||
};
|
||||
|
136
packages/global/core/dataset/training/utils.ts
Normal file
136
packages/global/core/dataset/training/utils.ts
Normal file
@@ -0,0 +1,136 @@
|
||||
import { EmbeddingModelItemType, LLMModelItemType } from '../../../core/ai/model.d';
|
||||
import {
|
||||
ChunkSettingModeEnum,
|
||||
DataChunkSplitModeEnum,
|
||||
DatasetCollectionDataProcessModeEnum
|
||||
} from '../constants';
|
||||
|
||||
/** Lower bound shared by chunk sizes and index sizes. */
export const minChunkSize = 64; // min index and chunk size

// Chunk size
/** Chunk size used for non-QA collections when the chunk setting mode is auto. */
export const chunkAutoChunkSize = 1500;

/** Floor applied to every chunk size that is derived from a model's limits. */
const minModelChunkSize = 2000;

/**
 * Largest chunk the given model can take: its context window minus the room
 * reserved for the response, but never less than `minModelChunkSize`.
 *
 * @param model - LLM whose `maxContext` and `maxResponse` are read.
 * @returns The maximum chunk size for that model.
 */
export const getMaxChunkSize = (model: LLMModelItemType) => {
  return Math.max(model.maxContext - model.maxResponse, minModelChunkSize);
};

// QA
/** Default upper bound for a chunk; also the answer when no model is supplied. */
export const defaultMaxChunkSize = 8000;

/**
 * Default chunk size for a model: its usable context (context minus response),
 * clamped to the range [`minModelChunkSize`, `defaultMaxChunkSize`].
 *
 * @param model - Optional LLM; when omitted `defaultMaxChunkSize` is returned.
 * @returns The default chunk size.
 */
export const getLLMDefaultChunkSize = (model?: LLMModelItemType) => {
  if (!model) return defaultMaxChunkSize;

  const usableContext = model.maxContext - model.maxResponse;
  return Math.max(Math.min(usableContext, defaultMaxChunkSize), minModelChunkSize);
};

/**
 * Maximum chunk size for an optional model.
 *
 * @param model - Optional LLM; when omitted `defaultMaxChunkSize` is returned
 *   (previously a duplicated literal 8000).
 * @returns Same value as `getMaxChunkSize` when a model is given.
 */
export const getLLMMaxChunkSize = (model?: LLMModelItemType) => {
  if (!model) return defaultMaxChunkSize;
  // Single source of truth for the "context minus response" formula.
  return getMaxChunkSize(model);
};
|
||||
|
||||
// Index size
/** Index size used whenever the embedding model does not supply a usable value. */
const defaultIndexSize = 512;

/**
 * Largest index size for an embedding model.
 *
 * @param model - Optional embedding model.
 * @returns `model.maxToken`, or 512 when the model or the value is missing/zero.
 */
export const getMaxIndexSize = (model?: EmbeddingModelItemType) => {
  return model?.maxToken || defaultIndexSize;
};

/**
 * Index size used in automatic mode.
 *
 * @param model - Optional embedding model.
 * @returns `model.defaultToken`, or 512 when the model or the value is missing/zero.
 */
export const getAutoIndexSize = (model?: EmbeddingModelItemType) => {
  return model?.defaultToken || defaultIndexSize;
};

// Selectable index sizes in ascending order. Labels are derived from the
// values so the two can never drift apart.
const indexSizeSelectList = [
  64, 128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096, 5120, 6144, 7168, 8192
].map((value) => ({
  label: String(value),
  value
}));

/**
 * Options for an index-size selector.
 *
 * @param max - Upper bound (inclusive) for the offered sizes; defaults to 512.
 * @returns The `{ label, value }` options whose value does not exceed `max`.
 */
export const getIndexSizeSelectList = (max = defaultIndexSize) => {
  return indexSizeSelectList.filter((item) => item.value <= max);
};
|
||||
|
||||
// Compute
|
||||
/**
 * Resolve the chunk size to use for a collection.
 *
 * - Auto setting mode: QA collections get the model's default chunk size,
 *   every other training type gets `chunkAutoChunkSize`.
 * - Otherwise, when splitting by character, the model's maximum chunk size is
 *   returned.
 * - Otherwise the requested `chunkSize` (or `chunkAutoChunkSize` when it is
 *   missing or 0) is returned, capped at the model's maximum chunk size.
 *
 * @param params - Training type, chunk settings and the optional LLM model.
 * @returns The chunk size to split with.
 */
export const computeChunkSize = (params: {
  trainingType: DatasetCollectionDataProcessModeEnum;
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  llmModel?: LLMModelItemType;
  chunkSize?: number;
}) => {
  const { trainingType, chunkSettingMode, chunkSplitMode, llmModel, chunkSize } = params;

  // Auto mode ignores every user-supplied size.
  if (chunkSettingMode === ChunkSettingModeEnum.auto) {
    return trainingType === DatasetCollectionDataProcessModeEnum.qa
      ? getLLMDefaultChunkSize(llmModel)
      : chunkAutoChunkSize;
  }

  const modelLimit = getLLMMaxChunkSize(llmModel);

  // Splitting on a character: only the model limit bounds a chunk.
  if (chunkSplitMode === DataChunkSplitModeEnum.char) {
    return modelLimit;
  }

  const requestedSize = chunkSize || chunkAutoChunkSize;
  return Math.min(requestedSize, modelLimit);
};
|
||||
|
||||
/**
 * Decide whether a custom chunk splitter applies.
 *
 * The configured splitter is only returned when the chunk setting mode is not
 * auto and the split mode is not size-based; in every other case the result is
 * `undefined`.
 *
 * @param params - Chunk setting mode, split mode and the configured splitter.
 * @returns The splitter to use, or `undefined` when none applies.
 */
export const computeChunkSplitter = (params: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  chunkSplitter?: string;
}) => {
  const splitterApplies =
    params.chunkSettingMode !== ChunkSettingModeEnum.auto &&
    params.chunkSplitMode !== DataChunkSplitModeEnum.size;

  return splitterApplies ? params.chunkSplitter : undefined;
};
|
9
packages/global/core/dataset/type.d.ts
vendored
9
packages/global/core/dataset/type.d.ts
vendored
@@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
|
||||
import { PermissionTypeEnum } from '../../support/permission/constant';
|
||||
import { PushDatasetDataChunkProps } from './api';
|
||||
import {
|
||||
DataChunkSplitModeEnum,
|
||||
DatasetCollectionDataProcessModeEnum,
|
||||
DatasetCollectionTypeEnum,
|
||||
DatasetStatusEnum,
|
||||
@@ -14,6 +15,7 @@ import { Permission } from '../../support/permission/controller';
|
||||
import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
|
||||
import { SourceMemberType } from 'support/user/type';
|
||||
import { DatasetDataIndexTypeEnum } from './data/constants';
|
||||
import { ChunkSettingModeEnum } from './constants';
|
||||
|
||||
export type DatasetSchemaType = {
|
||||
_id: string;
|
||||
@@ -88,7 +90,12 @@ export type DatasetCollectionSchemaType = {
|
||||
autoIndexes?: boolean;
|
||||
imageIndex?: boolean;
|
||||
trainingType: DatasetCollectionDataProcessModeEnum;
|
||||
chunkSize: number;
|
||||
|
||||
chunkSettingMode?: ChunkSettingModeEnum;
|
||||
chunkSplitMode?: DataChunkSplitModeEnum;
|
||||
|
||||
chunkSize?: number;
|
||||
indexSize?: number;
|
||||
chunkSplitter?: string;
|
||||
qaPrompt?: string;
|
||||
};
|
||||
|
@@ -1,7 +1,6 @@
|
||||
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
|
||||
import { getFileIcon } from '../../common/file/icon';
|
||||
import { strIsLink } from '../../common/string/tools';
|
||||
import { DatasetDataIndexTypeEnum } from './data/constants';
|
||||
|
||||
export function getCollectionIcon(
|
||||
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
|
||||
@@ -38,26 +37,6 @@ export function getSourceNameIcon({
|
||||
return 'file/fill/file';
|
||||
}
|
||||
|
||||
/* get dataset data default index */
|
||||
export function getDefaultIndex(props?: { q?: string; a?: string }) {
|
||||
const { q = '', a } = props || {};
|
||||
|
||||
return [
|
||||
{
|
||||
text: q,
|
||||
type: DatasetDataIndexTypeEnum.default
|
||||
},
|
||||
...(a
|
||||
? [
|
||||
{
|
||||
text: a,
|
||||
type: DatasetDataIndexTypeEnum.default
|
||||
}
|
||||
]
|
||||
: [])
|
||||
];
|
||||
}
|
||||
|
||||
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
|
||||
if (mode === TrainingModeEnum.qa) return data.length * 20;
|
||||
if (mode === TrainingModeEnum.auto) return data.length * 5;
|
||||
|
@@ -27,6 +27,11 @@ import { addDays } from 'date-fns';
|
||||
import { MongoDatasetDataText } from '../data/dataTextSchema';
|
||||
import { retryFn } from '@fastgpt/global/common/system/utils';
|
||||
import { getTrainingModeByCollection } from './utils';
|
||||
import {
|
||||
computeChunkSize,
|
||||
computeChunkSplitter,
|
||||
getLLMMaxChunkSize
|
||||
} from '@fastgpt/global/core/dataset/training/utils';
|
||||
|
||||
export const createCollectionAndInsertData = async ({
|
||||
dataset,
|
||||
@@ -54,18 +59,22 @@ export const createCollectionAndInsertData = async ({
|
||||
|
||||
const teamId = createCollectionParams.teamId;
|
||||
const tmbId = createCollectionParams.tmbId;
|
||||
// Chunk split params
|
||||
|
||||
// Set default params
|
||||
const trainingType =
|
||||
createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
|
||||
const chunkSize = createCollectionParams.chunkSize || 512;
|
||||
const chunkSplitter = createCollectionParams.chunkSplitter;
|
||||
const qaPrompt = createCollectionParams.qaPrompt;
|
||||
const usageName = createCollectionParams.name;
|
||||
const chunkSize = computeChunkSize({
|
||||
...createCollectionParams,
|
||||
trainingType,
|
||||
llmModel: getLLMModel(dataset.agentModel)
|
||||
});
|
||||
const chunkSplitter = computeChunkSplitter(createCollectionParams);
|
||||
|
||||
// 1. split chunks
|
||||
const chunks = rawText2Chunks({
|
||||
rawText,
|
||||
chunkLen: chunkSize,
|
||||
chunkSize,
|
||||
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
|
||||
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
|
||||
customReg: chunkSplitter ? [chunkSplitter] : [],
|
||||
isQAImport
|
||||
@@ -76,7 +85,7 @@ export const createCollectionAndInsertData = async ({
|
||||
teamId,
|
||||
insertLen: predictDataLimitLength(
|
||||
getTrainingModeByCollection({
|
||||
trainingType,
|
||||
trainingType: trainingType,
|
||||
autoIndexes: createCollectionParams.autoIndexes,
|
||||
imageIndex: createCollectionParams.imageIndex
|
||||
}),
|
||||
@@ -88,6 +97,9 @@ export const createCollectionAndInsertData = async ({
|
||||
// 3. create collection
|
||||
const { _id: collectionId } = await createOneCollection({
|
||||
...createCollectionParams,
|
||||
trainingType,
|
||||
chunkSize,
|
||||
chunkSplitter,
|
||||
|
||||
hashRawText: hashStr(rawText),
|
||||
rawTextLength: rawText.length,
|
||||
@@ -111,7 +123,7 @@ export const createCollectionAndInsertData = async ({
|
||||
const { billId: newBillId } = await createTrainingUsage({
|
||||
teamId,
|
||||
tmbId,
|
||||
appName: usageName,
|
||||
appName: createCollectionParams.name,
|
||||
billSource: UsageSourceEnum.training,
|
||||
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
|
||||
agentModel: getLLMModel(dataset.agentModel)?.name,
|
||||
@@ -130,12 +142,13 @@ export const createCollectionAndInsertData = async ({
|
||||
agentModel: dataset.agentModel,
|
||||
vectorModel: dataset.vectorModel,
|
||||
vlmModel: dataset.vlmModel,
|
||||
indexSize: createCollectionParams.indexSize,
|
||||
mode: getTrainingModeByCollection({
|
||||
trainingType,
|
||||
trainingType: trainingType,
|
||||
autoIndexes: createCollectionParams.autoIndexes,
|
||||
imageIndex: createCollectionParams.imageIndex
|
||||
}),
|
||||
prompt: qaPrompt,
|
||||
prompt: createCollectionParams.qaPrompt,
|
||||
billId: traingBillId,
|
||||
data: chunks.map((item, index) => ({
|
||||
...item,
|
||||
@@ -207,11 +220,14 @@ export async function createOneCollection({
|
||||
// Parse settings
|
||||
customPdfParse,
|
||||
imageIndex,
|
||||
autoIndexes,
|
||||
|
||||
// Chunk settings
|
||||
trainingType = DatasetCollectionDataProcessModeEnum.chunk,
|
||||
autoIndexes,
|
||||
chunkSize = 512,
|
||||
trainingType,
|
||||
chunkSettingMode,
|
||||
chunkSplitMode,
|
||||
chunkSize,
|
||||
indexSize,
|
||||
chunkSplitter,
|
||||
qaPrompt,
|
||||
|
||||
@@ -249,11 +265,14 @@ export async function createOneCollection({
|
||||
// Parse settings
|
||||
customPdfParse,
|
||||
imageIndex,
|
||||
autoIndexes,
|
||||
|
||||
// Chunk settings
|
||||
trainingType,
|
||||
autoIndexes,
|
||||
chunkSettingMode,
|
||||
chunkSplitMode,
|
||||
chunkSize,
|
||||
indexSize,
|
||||
chunkSplitter,
|
||||
qaPrompt
|
||||
}
|
||||
|
@@ -3,7 +3,9 @@ const { Schema, model, models } = connectionMongo;
|
||||
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
|
||||
import {
|
||||
DatasetCollectionTypeMap,
|
||||
DatasetCollectionDataProcessModeEnum
|
||||
DatasetCollectionDataProcessModeEnum,
|
||||
ChunkSettingModeEnum,
|
||||
DataChunkSplitModeEnum
|
||||
} from '@fastgpt/global/core/dataset/constants';
|
||||
import { DatasetCollectionName } from '../schema';
|
||||
import {
|
||||
@@ -94,11 +96,18 @@ const DatasetCollectionSchema = new Schema({
|
||||
type: String,
|
||||
enum: Object.values(DatasetCollectionDataProcessModeEnum)
|
||||
},
|
||||
chunkSize: {
|
||||
type: Number,
|
||||
required: true
|
||||
chunkSettingMode: {
|
||||
type: String,
|
||||
enum: Object.values(ChunkSettingModeEnum)
|
||||
},
|
||||
chunkSplitMode: {
|
||||
type: String,
|
||||
enum: Object.values(DataChunkSplitModeEnum)
|
||||
},
|
||||
chunkSize: Number,
|
||||
chunkSplitter: String,
|
||||
|
||||
indexSize: Number,
|
||||
qaPrompt: String
|
||||
});
|
||||
|
||||
|
@@ -185,7 +185,7 @@ export const readApiServerFileContent = async ({
|
||||
export const rawText2Chunks = ({
|
||||
rawText,
|
||||
isQAImport,
|
||||
chunkLen = 512,
|
||||
chunkSize = 512,
|
||||
...splitProps
|
||||
}: {
|
||||
rawText: string;
|
||||
@@ -198,7 +198,7 @@ export const rawText2Chunks = ({
|
||||
|
||||
const { chunks } = splitText2Chunks({
|
||||
text: rawText,
|
||||
chunkLen,
|
||||
chunkSize,
|
||||
...splitProps
|
||||
});
|
||||
|
||||
|
@@ -12,6 +12,10 @@ import { getCollectionWithDataset } from '../controller';
|
||||
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
|
||||
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
|
||||
import { i18nT } from '../../../../web/i18n/utils';
|
||||
import {
|
||||
getLLMDefaultChunkSize,
|
||||
getLLMMaxChunkSize
|
||||
} from '../../../../global/core/dataset/training/utils';
|
||||
|
||||
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
|
||||
try {
|
||||
@@ -55,6 +59,7 @@ export async function pushDataListToTrainingQueue({
|
||||
prompt,
|
||||
billId,
|
||||
mode = TrainingModeEnum.chunk,
|
||||
indexSize,
|
||||
session
|
||||
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
|
||||
const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
|
||||
@@ -68,38 +73,41 @@ export async function pushDataListToTrainingQueue({
|
||||
}
|
||||
return mode;
|
||||
};
|
||||
|
||||
const vectorModelData = getEmbeddingModel(vectorModel);
|
||||
if (!vectorModelData) {
|
||||
return Promise.reject(i18nT('common:error_embedding_not_config'));
|
||||
}
|
||||
const agentModelData = getLLMModel(agentModel);
|
||||
if (!agentModelData) {
|
||||
return Promise.reject(i18nT('common:error_llm_not_config'));
|
||||
}
|
||||
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
|
||||
prompt = undefined;
|
||||
}
|
||||
|
||||
const { model, maxToken, weight } = await (async () => {
|
||||
if (mode === TrainingModeEnum.chunk) {
|
||||
const vectorModelData = getEmbeddingModel(vectorModel);
|
||||
if (!vectorModelData) {
|
||||
return Promise.reject(i18nT('common:error_embedding_not_config'));
|
||||
}
|
||||
return {
|
||||
maxToken: vectorModelData.maxToken * 1.5,
|
||||
maxToken: getLLMMaxChunkSize(agentModelData),
|
||||
model: vectorModelData.model,
|
||||
weight: vectorModelData.weight
|
||||
};
|
||||
}
|
||||
|
||||
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
|
||||
const agentModelData = getLLMModel(agentModel);
|
||||
if (!agentModelData) {
|
||||
return Promise.reject(i18nT('common:error_llm_not_config'));
|
||||
}
|
||||
return {
|
||||
maxToken: agentModelData.maxContext * 0.8,
|
||||
maxToken: getLLMMaxChunkSize(agentModelData),
|
||||
model: agentModelData.model,
|
||||
weight: 0
|
||||
};
|
||||
}
|
||||
|
||||
if (mode === TrainingModeEnum.image) {
|
||||
const vllmModelData = getVlmModel(vlmModel);
|
||||
if (!vllmModelData) {
|
||||
return Promise.reject(i18nT('common:error_vlm_not_config'));
|
||||
}
|
||||
return {
|
||||
maxToken: vllmModelData.maxContext * 0.8,
|
||||
maxToken: getLLMMaxChunkSize(vllmModelData),
|
||||
model: vllmModelData.model,
|
||||
weight: 0
|
||||
};
|
||||
@@ -107,10 +115,6 @@ export async function pushDataListToTrainingQueue({
|
||||
|
||||
return Promise.reject(`Training mode "${mode}" is inValid`);
|
||||
})();
|
||||
// Filter redundant params
|
||||
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
|
||||
prompt = undefined;
|
||||
}
|
||||
|
||||
// filter repeat or equal content
|
||||
const set = new Set();
|
||||
@@ -143,13 +147,13 @@ export async function pushDataListToTrainingQueue({
|
||||
|
||||
const text = item.q + item.a;
|
||||
|
||||
// Oversize llm tokens
|
||||
if (text.length > maxToken) {
|
||||
filterResult.overToken.push(item);
|
||||
return;
|
||||
}
|
||||
|
||||
if (set.has(text)) {
|
||||
console.log('repeat', item);
|
||||
filterResult.repeat.push(item);
|
||||
} else {
|
||||
filterResult.success.push(item);
|
||||
@@ -182,6 +186,7 @@ export async function pushDataListToTrainingQueue({
|
||||
q: item.q,
|
||||
a: item.a,
|
||||
chunkIndex: item.chunkIndex ?? 0,
|
||||
indexSize,
|
||||
weight: weight ?? 0,
|
||||
indexes: item.indexes,
|
||||
retryCount: 5
|
||||
|
@@ -76,6 +76,7 @@ const TrainingDataSchema = new Schema({
|
||||
type: Number,
|
||||
default: 0
|
||||
},
|
||||
indexSize: Number,
|
||||
weight: {
|
||||
type: Number,
|
||||
default: 0
|
||||
|
@@ -72,7 +72,7 @@ const EditFolderModal = ({
|
||||
{...register('name', { required: true })}
|
||||
bg={'myGray.50'}
|
||||
autoFocus
|
||||
maxLength={20}
|
||||
maxLength={100}
|
||||
/>
|
||||
</Box>
|
||||
<Box mt={4}>
|
||||
|
67
packages/web/components/common/Radio/RadioGroup.tsx
Normal file
67
packages/web/components/common/Radio/RadioGroup.tsx
Normal file
@@ -0,0 +1,67 @@
|
||||
import React from 'react';
|
||||
import { Box, Flex, Grid, type GridProps, HStack } from '@chakra-ui/react';
|
||||
import { useTranslation } from 'next-i18next';
|
||||
import QuestionTip from '../MyTooltip/QuestionTip';
|
||||
|
||||
/**
 * Props of the RadioGroup component.
 *
 * - `list`: the options to render; `title` is passed through the translator.
 * - `value`: the currently selected option value.
 * - `defaultBg` / `activeBg`: fill of the radio circle when the option is
 *   unselected / selected.
 * - `onChange`: called with the option value when an option is clicked.
 *
 * All remaining props are spread onto the outer container.
 */
type Props<T> = Omit<GridProps, 'onChange'> & {
  list: {
    title: string;
    value: T;
    tooltip?: string;
  }[];
  value: T;
  defaultBg?: string;
  activeBg?: string;
  onChange: (e: T) => void;
};

/**
 * Horizontal group of radio options with an optional question-mark tooltip
 * per option.
 *
 * `defaultBg` and `activeBg` are destructured here so that they are no longer
 * part of the rest props spread onto the container element; their defaults
 * reproduce the colours that were previously hard-coded.
 */
const RadioGroup = <T = any,>({
  list,
  value,
  onChange,
  defaultBg = 'transparent',
  activeBg = 'primary.1',
  ...props
}: Props<T>) => {
  const { t } = useTranslation();

  return (
    <Flex gap={[3, 5]} fontSize={['sm', 'md']} alignItems={'center'} {...props}>
      {list.map((item) => {
        // Computed once per option instead of on every styled prop.
        const isActive = value === item.value;

        return (
          <Flex
            alignItems={'center'}
            key={item.value as any}
            cursor={'pointer'}
            userSelect={'none'}
            gap={1}
            onClick={() => onChange(item.value)}
          >
            {/* Outer halo ring, only visible for the selected option */}
            <Box
              w={'18px'}
              h={'18px'}
              borderWidth={'2.4px'}
              borderColor={isActive ? 'primary.015' : 'transparent'}
              borderRadius={'50%'}
            >
              {/* Radio circle */}
              <Flex
                w={'100%'}
                h={'100%'}
                borderWidth={'1px'}
                borderColor={isActive ? 'primary.600' : 'borderColor.high'}
                bg={isActive ? activeBg : defaultBg}
                borderRadius={'50%'}
                alignItems={'center'}
                justifyContent={'center'}
              >
                {/* Centre dot */}
                <Box
                  w={'5px'}
                  h={'5px'}
                  borderRadius={'50%'}
                  bg={isActive ? 'primary.600' : 'transparent'}
                />
              </Flex>
            </Box>
            <HStack spacing={1} color={'myGray.900'} whiteSpace={'nowrap'} fontSize={'sm'}>
              <Box>{typeof item.title === 'string' ? t(item.title as any) : item.title}</Box>
              {!!item.tooltip && <QuestionTip label={item.tooltip} color={'myGray.600'} />}
            </HStack>
          </Flex>
        );
      })}
    </Flex>
  );
};
|
||||
|
||||
export default RadioGroup;
|
@@ -569,7 +569,6 @@
|
||||
"core.dataset.import.Custom process": "Custom Rules",
|
||||
"core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules",
|
||||
"core.dataset.import.Custom prompt": "Custom Prompt",
|
||||
"core.dataset.import.Custom split char": "Custom Separator",
|
||||
"core.dataset.import.Custom text": "Custom Text",
|
||||
"core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
|
||||
"core.dataset.import.Data process params": "Data Processing Parameters",
|
||||
|
@@ -27,7 +27,6 @@
|
||||
"custom_data_process_params": "Custom",
|
||||
"custom_data_process_params_desc": "Customize data processing rules",
|
||||
"custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
|
||||
"data.ideal_chunk_length": "ideal block length",
|
||||
"data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
|
||||
"data_index_num": "Index {{index}}",
|
||||
"data_process_params": "Params",
|
||||
@@ -53,8 +52,6 @@
|
||||
"file_model_function_tip": "Enhances indexing and QA generation",
|
||||
"filename": "Filename",
|
||||
"folder_dataset": "Folder",
|
||||
"ideal_chunk_length": "ideal block length",
|
||||
"ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.",
|
||||
"image_auto_parse": "Automatic image indexing",
|
||||
"image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes",
|
||||
"image_training_queue": "Queue of image processing",
|
||||
@@ -68,6 +65,8 @@
|
||||
"import_param_setting": "Parameter settings",
|
||||
"import_select_file": "Select a file",
|
||||
"import_select_link": "Enter link",
|
||||
"index_size": "Index size",
|
||||
"index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.",
|
||||
"is_open_schedule": "Enable scheduled synchronization",
|
||||
"keep_image": "Keep the picture",
|
||||
"move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
|
||||
@@ -89,6 +88,8 @@
|
||||
"retain_collection": "Adjust Training Parameters",
|
||||
"retrain_task_submitted": "The retraining task has been submitted",
|
||||
"same_api_collection": "The same API set exists",
|
||||
"split_chunk_char": "Block by specified splitter",
|
||||
"split_chunk_size": "Block by length",
|
||||
"split_sign_break": "1 newline character",
|
||||
"split_sign_break2": "2 newline characters",
|
||||
"split_sign_custom": "Customize",
|
||||
|
@@ -573,7 +573,6 @@
|
||||
"core.dataset.import.Custom process": "自定义规则",
|
||||
"core.dataset.import.Custom process desc": "自定义设置数据处理规则",
|
||||
"core.dataset.import.Custom prompt": "自定义提示词",
|
||||
"core.dataset.import.Custom split char": "自定义分隔符",
|
||||
"core.dataset.import.Custom text": "自定义文本",
|
||||
"core.dataset.import.Custom text desc": "手动输入一段文本作为数据集",
|
||||
"core.dataset.import.Data process params": "数据处理参数",
|
||||
|
@@ -27,7 +27,6 @@
|
||||
"custom_data_process_params": "自定义",
|
||||
"custom_data_process_params_desc": "自定义设置数据处理规则",
|
||||
"custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。",
|
||||
"data.ideal_chunk_length": "理想分块长度",
|
||||
"data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
|
||||
"data_index_num": "索引 {{index}}",
|
||||
"data_process_params": "处理参数",
|
||||
@@ -53,8 +52,6 @@
|
||||
"file_model_function_tip": "用于增强索引和 QA 生成",
|
||||
"filename": "文件名",
|
||||
"folder_dataset": "文件夹",
|
||||
"ideal_chunk_length": "理想分块长度",
|
||||
"ideal_chunk_length_tips": "按结束符号进行分段,并将多个分段组成一个分块,该值决定了分块的预估大小,如果会有上下浮动。",
|
||||
"image_auto_parse": "图片自动索引",
|
||||
"image_auto_parse_tips": "调用 VLM 自动标注文档里的图片,并生成额外的检索索引",
|
||||
"image_training_queue": "图片处理排队",
|
||||
@@ -68,6 +65,8 @@
|
||||
"import_param_setting": "参数设置",
|
||||
"import_select_file": "选择文件",
|
||||
"import_select_link": "输入链接",
|
||||
"index_size": "索引大小",
|
||||
"index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。",
|
||||
"is_open_schedule": "启用定时同步",
|
||||
"keep_image": "保留图片",
|
||||
"move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",
|
||||
@@ -89,6 +88,8 @@
|
||||
"retain_collection": "调整训练参数",
|
||||
"retrain_task_submitted": "重新训练任务已提交",
|
||||
"same_api_collection": "存在相同的 API 集合",
|
||||
"split_chunk_char": "按指定分割符分块",
|
||||
"split_chunk_size": "按长度分块",
|
||||
"split_sign_break": "1 个换行符",
|
||||
"split_sign_break2": "2 个换行符",
|
||||
"split_sign_custom": "自定义",
|
||||
|
@@ -568,7 +568,6 @@
|
||||
"core.dataset.import.Custom process": "自訂規則",
|
||||
"core.dataset.import.Custom process desc": "自訂設定資料處理規則",
|
||||
"core.dataset.import.Custom prompt": "自訂提示詞",
|
||||
"core.dataset.import.Custom split char": "自訂分隔符",
|
||||
"core.dataset.import.Custom text": "自訂文字",
|
||||
"core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集",
|
||||
"core.dataset.import.Data process params": "資料處理參數",
|
||||
|
@@ -27,7 +27,6 @@
|
||||
"custom_data_process_params": "自訂",
|
||||
"custom_data_process_params_desc": "自訂資料處理規則",
|
||||
"custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的數據,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如: * () [] {} 等。",
|
||||
"data.ideal_chunk_length": "理想分塊長度",
|
||||
"data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引",
|
||||
"data_index_num": "索引 {{index}}",
|
||||
"data_process_params": "處理參數",
|
||||
@@ -53,8 +52,6 @@
|
||||
"file_model_function_tip": "用於增強索引和問答生成",
|
||||
"filename": "檔案名稱",
|
||||
"folder_dataset": "資料夾",
|
||||
"ideal_chunk_length": "理想分塊長度",
|
||||
"ideal_chunk_length_tips": "依結束符號進行分段,並將多個分段組成一個分塊,此值決定了分塊的預估大小,可能會有上下浮動。",
|
||||
"image_auto_parse": "圖片自動索引",
|
||||
"image_auto_parse_tips": "調用 VLM 自動標註文檔裡的圖片,並生成額外的檢索索引",
|
||||
"image_training_queue": "圖片處理排隊",
|
||||
@@ -68,6 +65,8 @@
|
||||
"import_param_setting": "參數設置",
|
||||
"import_select_file": "選擇文件",
|
||||
"import_select_link": "輸入鏈接",
|
||||
"index_size": "索引大小",
|
||||
"index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。",
|
||||
"is_open_schedule": "啟用定時同步",
|
||||
"keep_image": "保留圖片",
|
||||
"move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",
|
||||
@@ -89,6 +88,8 @@
|
||||
"retain_collection": "調整訓練參數",
|
||||
"retrain_task_submitted": "重新訓練任務已提交",
|
||||
"same_api_collection": "存在相同的 API 集合",
|
||||
"split_chunk_char": "按指定分割符分塊",
|
||||
"split_chunk_size": "按長度分塊",
|
||||
"split_sign_break": "1 個換行符",
|
||||
"split_sign_break2": "2 個換行符",
|
||||
"split_sign_custom": "自定義",
|
||||
|
Reference in New Issue
Block a user