From 5e250b2f6562c013707c26b87861eab7f88eedc3 Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Thu, 9 May 2024 23:23:49 +0800 Subject: [PATCH] Change embedding (#1428) * fix: text spliter * perf: embedding model --- .../fastgpt/templates/configmap-config.yaml | 32 ++++++++++++++- packages/global/common/string/textSplitter.ts | 40 ++++++++++++++++--- packages/global/core/ai/model.ts | 2 +- packages/service/core/dataset/schema.ts | 2 +- projects/app/data/config.json | 25 ++++++++++++ .../api/core/dataset/file/getPreviewChunks.ts | 3 +- 6 files changed, 93 insertions(+), 11 deletions(-) diff --git a/files/helm/fastgpt/templates/configmap-config.yaml b/files/helm/fastgpt/templates/configmap-config.yaml index d5b82696f..403b19de2 100644 --- a/files/helm/fastgpt/templates/configmap-config.yaml +++ b/files/helm/fastgpt/templates/configmap-config.yaml @@ -99,13 +99,41 @@ data: } ], "vectorModels": [ + { + "model": "text-embedding-3-large", + "name": "Embedding-2", + "avatar": "/imgs/model/openai.svg", + "charsPointsPrice": 0, + "defaultToken": 512, + "maxToken": 3000, + "weight": 100, + "dbConfig": {}, + "queryConfig": {}, + "defaultConfig": { + "dimensions": 1024 + } + }, + { + "model": "text-embedding-3-small", + "name": "Embedding-2", + "avatar": "/imgs/model/openai.svg", + "charsPointsPrice": 0, + "defaultToken": 512, + "maxToken": 3000, + "weight": 100, + "dbConfig": {}, + "queryConfig": {} + }, { "model": "text-embedding-ada-002", "name": "Embedding-2", + "avatar": "/imgs/model/openai.svg", "charsPointsPrice": 0, - "defaultToken": 700, + "defaultToken": 512, "maxToken": 3000, - "weight": 100 + "weight": 100, + "dbConfig": {}, + "queryConfig": {} } ], "reRankModels": [], diff --git a/packages/global/common/string/textSplitter.ts b/packages/global/common/string/textSplitter.ts index b0faf70bf..1a6d02360 100644 --- a/packages/global/common/string/textSplitter.ts +++ b/packages/global/common/string/textSplitter.ts @@ -17,17 +17,47 @@ type SplitResponse = { // 判断字符串是否为markdown的表格形式 const strIsMdTable = (str: string) => { - const regex = /^(\|.*\|[\r]*)$/m; + // 检查是否包含表格分隔符 | + if (!str.includes('|')) { + return false; + } - return regex.test(str); + const lines = str.split('\n'); + + // 检查表格是否至少有两行 + if (lines.length < 2) { + return false; + } + + // 检查表头行是否包含 | + const headerLine = lines[0].trim(); + if (!headerLine.startsWith('|') || !headerLine.endsWith('|')) { + return false; + } + + // 检查分隔行是否由 | 和 - 组成 + const separatorLine = lines[1].trim(); + const separatorRegex = /^(\|[\s:]*-+[\s:]*)+\|$/; + if (!separatorRegex.test(separatorLine)) { + return false; + } + + // 检查数据行是否包含 | + for (let i = 2; i < lines.length; i++) { + const dataLine = lines[i].trim(); + if (dataLine && (!dataLine.startsWith('|') || !dataLine.endsWith('|'))) { + return false; + } + } + return true; }; const markdownTableSplit = (props: SplitProps): SplitResponse => { let { text = '', chunkLen } = props; const splitText2Lines = text.split('\n'); const header = splitText2Lines[0]; - const headerSize = header.split('|').length - 2; - const mdSplitString = `| ${new Array(headerSize) + + const mdSplitString = `| ${new Array(headerSize > 0 ? headerSize : 1) .fill(0) .map(() => '---') .join(' | ')} |`; @@ -304,7 +334,7 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => { const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN); const splitResult = splitWithCustomSign.map((item) => { - if (strIsMdTable(text)) { + if (strIsMdTable(item)) { return markdownTableSplit(props); } diff --git a/packages/global/core/ai/model.ts b/packages/global/core/ai/model.ts index 8669fa948..11f8730fe 100644 --- a/packages/global/core/ai/model.ts +++ b/packages/global/core/ai/model.ts @@ -23,7 +23,7 @@ export const defaultQAModels: LLMModelItemType[] = [ export const defaultVectorModels: VectorModelItemType[] = [ { - model: 'text-embedding-ada-002', + model: 'text-embedding-3-small', name: 'Embedding-2', charsPointsPrice: 0, defaultToken: 500, diff --git a/packages/service/core/dataset/schema.ts b/packages/service/core/dataset/schema.ts index 1c7c55eee..4139e042b 100644 --- a/packages/service/core/dataset/schema.ts +++ b/packages/service/core/dataset/schema.ts @@ -62,7 +62,7 @@ const DatasetSchema = new Schema({ vectorModel: { type: String, required: true, - default: 'text-embedding-ada-002' + default: 'text-embedding-3-small' }, agentModel: { type: String, diff --git a/projects/app/data/config.json b/projects/app/data/config.json index cde49dd43..30ceebdb5 100644 --- a/projects/app/data/config.json +++ b/projects/app/data/config.json @@ -80,6 +80,31 @@ } ], "vectorModels": [ + { + "model": "text-embedding-3-large", + "name": "Embedding-2", + "avatar": "/imgs/model/openai.svg", + "charsPointsPrice": 0, + "defaultToken": 512, + "maxToken": 3000, + "weight": 100, + "dbConfig": {}, + "queryConfig": {}, + "defaultConfig": { + "dimensions": 1024 + } + }, + { + "model": "text-embedding-3-small", + "name": "Embedding-2", + "avatar": "/imgs/model/openai.svg", + "charsPointsPrice": 0, + "defaultToken": 512, + "maxToken": 3000, + "weight": 100, + "dbConfig": {}, + "queryConfig": {} + }, { "model": "text-embedding-ada-002", "name": "Embedding-2", diff --git a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts index 8d94e6b5b..c60bf2542 100644 --- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts +++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts @@ -35,9 +35,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< csvFormat: true }); // split chunks (5 chunk) - const sliceRawText = 10 * chunkSize; const { chunks } = splitText2Chunks({ - text: rawText.slice(0, sliceRawText), + text: rawText, chunkLen: chunkSize, overlapRatio, customReg: customSplitChar ? [customSplitChar] : []