Change embedding (#1428)

* fix: text splitter * perf: embedding model
2025-10-17 08:37:59 +00:00 · 2024-05-09 23:23:49 +08:00
parent 434af56abd
commit 5e250b2f65
6 changed files with 93 additions and 11 deletions
--- a/files/helm/fastgpt/templates/configmap-config.yaml
+++ b/files/helm/fastgpt/templates/configmap-config.yaml
@@ -99,13 +99,41 @@ data:
        }
      ],
      "vectorModels": [
        {
          "model": "text-embedding-3-large",
          "name": "Embedding-2",
          "avatar": "/imgs/model/openai.svg",
          "charsPointsPrice": 0,
          "defaultToken": 512,
          "maxToken": 3000,
          "weight": 100,
          "dbConfig": {},
          "queryConfig": {},
          "defaultConfig": {
            "dimensions": 1024
          }
        },
        {
          "model": "text-embedding-3-small",
          "name": "Embedding-2",
          "avatar": "/imgs/model/openai.svg",
          "charsPointsPrice": 0,
          "defaultToken": 512,
          "maxToken": 3000,
          "weight": 100,
          "dbConfig": {},
          "queryConfig": {}
        },
        {
          "model": "text-embedding-ada-002",
          "name": "Embedding-2",
          "avatar": "/imgs/model/openai.svg",
          "charsPointsPrice": 0,
-          "defaultToken": 700,
+          "defaultToken": 512,
          "maxToken": 3000,
-          "weight": 100
+          "weight": 100,
          "dbConfig": {},
          "queryConfig": {}
        }
      ],
      "reRankModels": [],
--- a/packages/global/common/string/textSplitter.ts
+++ b/packages/global/common/string/textSplitter.ts
@@ -17,17 +17,47 @@ type SplitResponse = {
 // 判断字符串是否为markdown的表格形式
 const strIsMdTable = (str: string) => {
-  const regex = /^(\|.*\|[\r]*)$/m;
+  // 检查是否包含表格分隔符 |
  if (!str.includes('|')) {
    return false;
  }
-  return regex.test(str);
+  const lines = str.split('\n');
  // 检查表格是否至少有两行
  if (lines.length < 2) {
    return false;
  }
  // 检查表头行是否包含 |
  const headerLine = lines[0].trim();
  if (!headerLine.startsWith('|') || !headerLine.endsWith('|')) {
    return false;
  }
  // 检查分隔行是否由 | 和 - 组成
  const separatorLine = lines[1].trim();
  const separatorRegex = /^(\|[\s:]*-+[\s:]*)+\|$/;
  if (!separatorRegex.test(separatorLine)) {
    return false;
  }
  // 检查数据行是否包含 |
  for (let i = 2; i < lines.length; i++) {
    const dataLine = lines[i].trim();
    if (dataLine && (!dataLine.startsWith('|') || !dataLine.endsWith('|'))) {
      return false;
    }
  }
  return true;
 };
 const markdownTableSplit = (props: SplitProps): SplitResponse => {
  let { text = '', chunkLen } = props;
  const splitText2Lines = text.split('\n');
  const header = splitText2Lines[0];
  const headerSize = header.split('|').length - 2;
-  const mdSplitString = `| ${new Array(headerSize)
+
  const mdSplitString = `| ${new Array(headerSize > 0 ? headerSize : 1)
    .fill(0)
    .map(() => '---')
    .join(' | ')} |`;
@@ -304,7 +334,7 @@ export const splitText2Chunks = (props: SplitProps): SplitResponse => {
  const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);
  const splitResult = splitWithCustomSign.map((item) => {
-    if (strIsMdTable(text)) {
+    if (strIsMdTable(item)) {
      return markdownTableSplit(props);
    }
--- a/packages/global/core/ai/model.ts
+++ b/packages/global/core/ai/model.ts
@@ -23,7 +23,7 @@ export const defaultQAModels: LLMModelItemType[] = [
 export const defaultVectorModels: VectorModelItemType[] = [
  {
-    model: 'text-embedding-ada-002',
+    model: 'text-embedding-3-small',
    name: 'Embedding-2',
    charsPointsPrice: 0,
    defaultToken: 500,
--- a/packages/service/core/dataset/schema.ts
+++ b/packages/service/core/dataset/schema.ts
@@ -62,7 +62,7 @@ const DatasetSchema = new Schema({
  vectorModel: {
    type: String,
    required: true,
-    default: 'text-embedding-ada-002'
+    default: 'text-embedding-3-small'
  },
  agentModel: {
    type: String,
--- a/projects/app/data/config.json
+++ b/projects/app/data/config.json
@@ -80,6 +80,31 @@
    }
  ],
  "vectorModels": [
    {
      "model": "text-embedding-3-large",
      "name": "Embedding-2",
      "avatar": "/imgs/model/openai.svg",
      "charsPointsPrice": 0,
      "defaultToken": 512,
      "maxToken": 3000,
      "weight": 100,
      "dbConfig": {},
      "queryConfig": {},
      "defaultConfig": {
        "dimensions": 1024
      }
    },
    {
      "model": "text-embedding-3-small",
      "name": "Embedding-2",
      "avatar": "/imgs/model/openai.svg",
      "charsPointsPrice": 0,
      "defaultToken": 512,
      "maxToken": 3000,
      "weight": 100,
      "dbConfig": {},
      "queryConfig": {}
    },
    {
      "model": "text-embedding-ada-002",
      "name": "Embedding-2",
--- a/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
+++ b/projects/app/src/pages/api/core/dataset/file/getPreviewChunks.ts
@@ -35,9 +35,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
          csvFormat: true
        });
        // split chunks (5 chunk)
        const sliceRawText = 10 * chunkSize;
        const { chunks } = splitText2Chunks({
-          text: rawText.slice(0, sliceRawText),
+          text: rawText,
          chunkLen: chunkSize,
          overlapRatio,
          customReg: customSplitChar ? [customSplitChar] : []