4.6.7 first pr (#726)

This commit is contained in:
Archer
2024-01-10 23:35:04 +08:00
committed by GitHub
parent 414b693303
commit 006ad17c6a
186 changed files with 2996 additions and 1838 deletions

View File

@@ -32,7 +32,7 @@ export async function getVectorsByText({
return Promise.reject('Embedding API 404');
}
if (!res?.data?.[0]?.embedding) {
console.log(res?.data);
console.log(res);
// @ts-ignore
return Promise.reject(res.data?.err?.message || 'Embedding API Error');
}

View File

@@ -2,8 +2,7 @@ import { connectionMongo, type Model } from '../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { ChatItemSchema as ChatItemType } from '@fastgpt/global/core/chat/type';
import { ChatRoleMap } from '@fastgpt/global/core/chat/constants';
import { customAlphabet } from 'nanoid';
const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 24);
import { getNanoid } from '@fastgpt/global/common/string/tools';
import {
TeamCollectionName,
TeamMemberCollectionName
@@ -13,24 +12,6 @@ import { userCollectionName } from '../../support/user/schema';
import { ModuleOutputKeyEnum } from '@fastgpt/global/core/module/constants';
const ChatItemSchema = new Schema({
dataId: {
type: String,
require: true,
default: () => nanoid()
},
appId: {
type: Schema.Types.ObjectId,
ref: appCollectionName,
required: true
},
chatId: {
type: String,
require: true
},
userId: {
type: Schema.Types.ObjectId,
ref: userCollectionName
},
teamId: {
type: Schema.Types.ObjectId,
ref: TeamCollectionName,
@@ -41,6 +22,24 @@ const ChatItemSchema = new Schema({
ref: TeamMemberCollectionName,
required: true
},
userId: {
type: Schema.Types.ObjectId,
ref: userCollectionName
},
chatId: {
type: String,
require: true
},
dataId: {
type: String,
require: true,
default: () => getNanoid(22)
},
appId: {
type: Schema.Types.ObjectId,
ref: appCollectionName,
required: true
},
time: {
type: Date,
default: () => new Date()
@@ -80,10 +79,11 @@ const ChatItemSchema = new Schema({
});
try {
ChatItemSchema.index({ dataId: -1 });
ChatItemSchema.index({ teamId: 1 });
ChatItemSchema.index({ time: -1 });
ChatItemSchema.index({ appId: 1 });
ChatItemSchema.index({ chatId: 1 });
ChatItemSchema.index({ obj: 1 });
ChatItemSchema.index({ userGoodFeedback: 1 });
ChatItemSchema.index({ userBadFeedback: 1 });
ChatItemSchema.index({ customFeedbacks: 1 });

View File

@@ -1,7 +1,4 @@
import {
DatasetCollectionTrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constant';
import { TrainingModeEnum, DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
@@ -12,11 +9,15 @@ export async function createOneCollection({
parentId,
datasetId,
type,
trainingType = DatasetCollectionTrainingModeEnum.manual,
trainingType = TrainingModeEnum.chunk,
chunkSize = 0,
chunkSplitter,
qaPrompt,
fileId,
rawLink,
qaPrompt,
hashRawText,
rawTextLength,
metadata = {},
@@ -30,11 +31,15 @@ export async function createOneCollection({
datasetId,
name,
type,
trainingType,
chunkSize,
chunkSplitter,
qaPrompt,
fileId,
rawLink,
qaPrompt,
rawTextLength,
hashRawText,
metadata
@@ -74,7 +79,7 @@ export function createDefaultCollection({
datasetId,
parentId,
type: DatasetCollectionTypeEnum.virtual,
trainingType: DatasetCollectionTrainingModeEnum.manual,
trainingType: TrainingModeEnum.chunk,
chunkSize: 0,
updateTime: new Date('2099')
});

View File

@@ -1,10 +1,7 @@
import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import {
DatasetCollectionTrainingTypeMap,
DatasetCollectionTypeMap
} from '@fastgpt/global/core/dataset/constant';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constant';
import { DatasetCollectionName } from '../schema';
import {
TeamCollectionName,
@@ -56,15 +53,23 @@ const DatasetCollectionSchema = new Schema({
type: Date,
default: () => new Date()
},
trainingType: {
type: String,
enum: Object.keys(DatasetCollectionTrainingTypeMap),
enum: Object.keys(TrainingTypeMap),
required: true
},
chunkSize: {
type: Number,
required: true
},
chunkSplitter: {
type: String
},
qaPrompt: {
type: String
},
fileId: {
type: Schema.Types.ObjectId,
ref: 'dataset.files'
@@ -72,9 +77,6 @@ const DatasetCollectionSchema = new Schema({
rawLink: {
type: String
},
qaPrompt: {
type: String
},
rawTextLength: {
type: Number
@@ -89,8 +91,9 @@ const DatasetCollectionSchema = new Schema({
});
try {
DatasetCollectionSchema.index({ teamId: 1 });
DatasetCollectionSchema.index({ datasetId: 1 });
DatasetCollectionSchema.index({ datasetId: 1, parentId: 1 });
DatasetCollectionSchema.index({ teamId: 1, datasetId: 1, parentId: 1 });
DatasetCollectionSchema.index({ updateTime: -1 });
DatasetCollectionSchema.index({ hashRawText: -1 });
} catch (error) {

View File

@@ -4,7 +4,7 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { MongoDatasetTraining } from '../training/schema';
import { urlsFetch } from '../../../common/string/cheerio';
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
import { DatasetCollectionTypeEnum, TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
import { hashStr } from '@fastgpt/global/common/string/tools';
/**
@@ -92,8 +92,12 @@ export const getCollectionAndRawText = async ({
return Promise.reject('Collection not found');
}
const rawText = await (async () => {
if (newRawText) return newRawText;
const { title, rawText } = await (async () => {
if (newRawText)
return {
title: '',
rawText: newRawText
};
// link
if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
// crawl new data
@@ -102,12 +106,18 @@ export const getCollectionAndRawText = async ({
selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
});
return result[0].content;
return {
title: result[0].title,
rawText: result[0].content
};
}
// file
return '';
return {
title: '',
rawText: ''
};
})();
const hashRawText = hashStr(rawText);
@@ -115,6 +125,7 @@ export const getCollectionAndRawText = async ({
return {
collection: col,
title,
rawText,
isSameRawText
};
@@ -135,6 +146,7 @@ export const reloadCollectionChunks = async ({
rawText?: string;
}) => {
const {
title,
rawText: newRawText,
collection: col,
isSameRawText
@@ -154,6 +166,11 @@ export const reloadCollectionChunks = async ({
});
// insert to training queue
const model = await (() => {
if (col.trainingType === TrainingModeEnum.chunk) return col.datasetId.vectorModel;
if (col.trainingType === TrainingModeEnum.qa) return col.datasetId.agentModel;
return Promise.reject('Training model error');
})();
await MongoDatasetTraining.insertMany(
chunks.map((item, i) => ({
teamId: col.teamId,
@@ -163,7 +180,7 @@ export const reloadCollectionChunks = async ({
billId,
mode: col.trainingType,
prompt: '',
model: col.datasetId.vectorModel,
model,
q: item,
a: '',
chunkIndex: i
@@ -172,6 +189,7 @@ export const reloadCollectionChunks = async ({
// update raw text
await MongoDatasetCollection.findByIdAndUpdate(col._id, {
...(title && { name: title }),
rawTextLength: newRawText.length,
hashRawText: hashStr(newRawText)
});

View File

@@ -75,7 +75,13 @@ export async function delCollectionRelevantData({
/**
* delete one data by mongoDataId
*/
export async function delDatasetDataByDataId(mongoDataId: string) {
await deleteDatasetDataVector({ dataIds: [mongoDataId] });
// Delete one dataset-data record by its mongo _id.
// Order matters: vector-store entries are removed first (scoped to the owning
// collection via `collectionId`), then the mongo document itself is deleted.
// NOTE(review): if the second delete fails the vectors are already gone —
// presumably acceptable here (orphaned mongo doc, no orphaned vectors); confirm.
export async function delDatasetDataByDataId({
collectionId,
mongoDataId
}: {
collectionId: string;
mongoDataId: string;
}) {
// remove embedding vectors for this single data id before dropping the doc
await deleteDatasetDataVector({ collectionId, dataIds: [mongoDataId] });
await MongoDatasetData.findByIdAndDelete(mongoDataId);
}

View File

@@ -85,12 +85,13 @@ const DatasetDataSchema = new Schema({
});
// Register secondary indexes for DatasetDataSchema. Index registration is
// best-effort: any error is logged and swallowed so module load never fails.
// NOTE(review): this is a rendered diff hunk — the teamId/datasetId/collectionId
// lines below mix removed and added index definitions; verify against the
// actual schema file before relying on this exact index set.
try {
DatasetDataSchema.index({ teamId: 1 });
DatasetDataSchema.index({ datasetId: 1 });
DatasetDataSchema.index({ collectionId: 1 });
DatasetDataSchema.index({ updateTime: -1 });
DatasetDataSchema.index({ collectionId: 1, q: 1, a: 1 });
// full text index
DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
DatasetDataSchema.index({ inited: 1 });
} catch (error) {
// swallow index-definition errors; surface them in logs only
console.log(error);
}

View File

@@ -92,7 +92,7 @@ const DatasetSchema = new Schema({
});
// Register secondary indexes for DatasetSchema; errors are logged, not thrown.
// NOTE(review): diff rendering — `userId` may be the removed line and `teamId`
// the added one (or both kept); confirm against the schema file.
try {
DatasetSchema.index({ userId: 1 });
DatasetSchema.index({ teamId: 1 });
} catch (error) {
// best-effort: never let index setup break module load
console.log(error);
}

View File

@@ -102,6 +102,7 @@ const TrainingDataSchema = new Schema({
});
try {
TrainingDataSchema.index({ teamId: 1 });
TrainingDataSchema.index({ weight: -1 });
TrainingDataSchema.index({ lockTime: 1 });
TrainingDataSchema.index({ datasetId: 1 });