External dataset (#1519)

* perf: local file create collection

* rename middleware

* perf: remove code

* feat: next14

* feat: external file dataset

* collection tags field

* external file dataset doc

* fix: ts
Author: Archer
Date: 2024-05-17 16:44:15 +08:00
Committed by: GitHub
Parent: 2d1ec9b3ad
Commit: 67c52992d7
102 changed files with 1839 additions and 1282 deletions

View File

@@ -32,6 +32,9 @@ export async function createOneCollection({
fileId,
rawLink,
+externalFileId,
+externalFileUrl,
hashRawText,
rawTextLength,
metadata = {},
@@ -61,6 +64,8 @@ export async function createOneCollection({
fileId,
rawLink,
+externalFileId,
+externalFileUrl,
rawTextLength,
hashRawText,
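
Taken together, these two hunks thread the new `externalFileId` and `externalFileUrl` parameters from `createOneCollection`'s signature into the stored document. A minimal usage sketch, with the call-site values assumed for illustration (only the two external fields are confirmed by the diff):

```ts
// Hedged sketch: creating a collection backed by an external file store.
const collection = await createOneCollection({
  teamId,
  tmbId,
  datasetId,
  name: 'quarterly-report.pdf',
  type: 'externalFile', // assumed collection type value
  externalFileId: 'doc_8831', // id of the file in the external system
  externalFileUrl: 'https://example.com/files/doc_8831.pdf' // where raw text is fetched from
});
```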

View File

@@ -66,7 +66,11 @@ const DatasetCollectionSchema = new Schema({
type: String
},
sourceId: String,
+tags: {
+  type: [String],
+  default: []
+},
// local file collection
fileId: {
type: Schema.Types.ObjectId,
@@ -74,13 +78,13 @@ const DatasetCollectionSchema = new Schema({
},
// web link collection
rawLink: String,
+// external collection
+externalFileId: String,
// metadata
rawTextLength: Number,
hashRawText: String,
-externalSourceUrl: String, // external import url
+externalFileUrl: String, // external import url
metadata: {
type: Object,
default: {}
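
The new `tags` array defaults to empty, so existing collections need no migration. A hypothetical tag filter, assuming the usual Mongoose model for this schema:

```ts
// Hedged sketch: the model name MongoDatasetCollection is an assumption;
// the tags: [String] field itself comes from the schema change above.
const tagged = await MongoDatasetCollection.find({
  teamId,
  datasetId,
  tags: { $in: ['finance', '2024-Q1'] } // any collection carrying either tag
}).lean();
```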

View File

@@ -2,13 +2,20 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
import { parseCsvTable2Chunks } from './training/utils';
import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
-import { readFileRawContent } from '../../common/file/read/utils';
+import { readRawContentByFileBuffer } from '../../common/file/read/utils';
-export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; url: string }) => {
+export const readFileRawTextByUrl = async ({
+  teamId,
+  url,
+  relatedId
+}: {
+  teamId: string;
+  url: string;
+  relatedId?: string;
+}) => {
const response = await axios({
method: 'get',
url: url,
@@ -18,11 +25,14 @@ export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; ur
const buffer = Buffer.from(response.data, 'binary');
-const { rawText } = await readFileRawContent({
+const { rawText } = await readRawContentByFileBuffer({
extension,
teamId,
buffer,
-encoding: 'utf-8'
+encoding: 'utf-8',
+metadata: {
+  relatedId
+}
});
return rawText;
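
The optional `relatedId` rides along in the parser's `metadata`, presumably so assets extracted from the file (e.g. images in a PDF) can be tied back to their source collection. A hedged usage sketch:

```ts
// Hedged sketch: the choice of the collection id as relatedId is an assumption.
const rawText = await readFileRawTextByUrl({
  teamId,
  url: 'https://example.com/files/doc_8831.pdf',
  relatedId: String(collection._id)
});
```
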
@@ -38,13 +48,15 @@ export const readDatasetSourceRawText = async ({
type,
sourceId,
isQAImport,
-selector
+selector,
+relatedId
}: {
teamId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
isQAImport?: boolean;
selector?: string;
+relatedId?: string;
}): Promise<string> => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const { rawText } = await readFileContentFromMongo({
@@ -64,7 +76,8 @@ export const readDatasetSourceRawText = async ({
} else if (type === DatasetSourceReadTypeEnum.externalFile) {
const rawText = await readFileRawTextByUrl({
teamId,
-url: sourceId
+url: sourceId,
+relatedId
});
return rawText;
}
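
For an external collection the dispatcher treats `sourceId` as the file URL, so a read reduces to:

```ts
// Hedged sketch: DatasetSourceReadTypeEnum.externalFile is confirmed by the
// diff; the argument values are illustrative.
const rawText = await readDatasetSourceRawText({
  teamId,
  type: DatasetSourceReadTypeEnum.externalFile,
  sourceId: 'https://example.com/files/doc_8831.pdf',
  relatedId: collectionId // assumption: used to tag extracted assets
});
```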

View File

@@ -18,6 +18,7 @@ import { countPromptTokens } from '../../../common/string/tiktoken/index';
import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { jiebaSplit } from '../../../common/string/jieba';
+import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';
type SearchDatasetDataProps = {
teamId: string;
@@ -98,7 +99,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
},
'datasetId collectionId q a chunkIndex indexes'
)
-.populate('collectionId', 'name fileId rawLink')
+.populate('collectionId', 'name fileId rawLink externalFileId externalFileUrl')
.lean()) as DatasetDataWithCollectionType[];
// add score to data(It's already sorted. The first one is the one with the most points)
@@ -130,8 +131,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
chunkIndex: data.chunkIndex,
datasetId: String(data.datasetId),
collectionId: String(data.collectionId?._id),
-sourceName: data.collectionId?.name || '',
-sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
+...getCollectionSourceData(data.collectionId),
score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
};
@@ -205,8 +205,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
id: String(item._id),
datasetId: String(item.datasetId),
collectionId: String(item.collectionId),
-sourceName: collection?.name || '',
-sourceId: collection?.fileId || collection?.rawLink,
+...getCollectionSourceData(collection),
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex,
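
Both search paths previously rebuilt `sourceName`/`sourceId` by hand from `fileId || rawLink`, which would have missed the two new external fields; the widened `.populate()` projection plus the shared helper fixes that in one place. The helper itself is not part of this diff, but given how it is used it plausibly looks like this reconstruction:

```ts
// Hedged reconstruction of getCollectionSourceData (the real implementation
// lives in @fastgpt/global/core/dataset/collection/utils and is not shown here).
export const getCollectionSourceData = (collection?: {
  name?: string;
  fileId?: string;
  rawLink?: string;
  externalFileId?: string;
  externalFileUrl?: string;
}) => ({
  sourceName: collection?.name || '',
  sourceId:
    collection?.fileId ||
    collection?.rawLink ||
    collection?.externalFileUrl ||
    collection?.externalFileId
});
```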

View File

@@ -174,7 +174,7 @@ export async function pushDataListToTrainingQueue({
} catch (error: any) {
addLog.error(`Insert error`, error);
// If there are errors, add the failed documents to the failed list
-error.writeErrors.forEach((writeError: any) => {
+error.writeErrors?.forEach((writeError: any) => {
failedDocuments.push(data[writeError.index]);
});
console.log('failed', failedDocuments);
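
`writeErrors` is only present on bulk-write failures; a different error shape (validation failure, dropped connection) used to crash the catch block itself. A minimal sketch of the guarded pattern, assuming Mongoose's `insertMany` with `ordered: false` and the usual training model name:

```ts
// Hedged sketch: with ordered:false, insertMany continues past individual
// failures and reports them in error.writeErrors; a non-write error carries
// no such array, so the optional chain keeps the handler itself from throwing.
try {
  await MongoDatasetTraining.insertMany(data, { ordered: false });
} catch (error: any) {
  error.writeErrors?.forEach((writeError: any) => {
    failedDocuments.push(data[writeError.index]);
  });
}
```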

View File

@@ -35,7 +35,7 @@ const TrainingDataSchema = new Schema({
},
billId: {
// concat bill
-type: Schema.Types.ObjectId
+type: String
},
mode: {
type: String,
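
Loosening `billId` from `Schema.Types.ObjectId` to `String` stops Mongoose from casting the value, which matters once bill ids are issued by something other than MongoDB. An illustration of the difference (the id format is hypothetical):

```ts
import { Types } from 'mongoose';

// An ObjectId-typed path rejects non-hex ids at cast time...
const bad = () => new Types.ObjectId('bill_2024_0517'); // throws a BSON error
// ...while a String-typed path stores whatever the billing layer produced.
const doc = { billId: 'bill_2024_0517', mode: 'chunk' };
```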

View File

@@ -53,7 +53,7 @@ export const dispatchLafRequest = async (props: LafRequestProps): Promise<LafRes
appId,
chatId,
responseChatItemId,
-histories: histories.slice(0, 10)
+histories: histories?.slice(0, 10)
},
variables,
...dynamicInput,
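
Same defensive style as the `writeErrors` fix: when a Laf node runs without chat context, `histories` can be undefined and `.slice` would throw. With optional chaining the field simply drops out of the JSON payload:

```ts
// Hedged sketch: undefined?.slice(...) evaluates to undefined, and
// JSON.stringify omits undefined-valued keys instead of crashing.
const histories: { q: string }[] | undefined = undefined;
const body = { histories: histories?.slice(0, 10) };
console.log(JSON.stringify(body)); // "{}"
```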