Mirror of https://github.com/labring/FastGPT.git
Synced 2025-07-23 05:12:39 +00:00

External dataset (#1519)

* perf: local file create collection
* rename middleware
* perf: remove code
* feat: next14
* feat: external file dataset
* collection tags field
* external file dataset doc
* fix: ts
@@ -7,7 +7,7 @@ import { MongoFileSchema } from './schema';
 import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
 import { MongoRawTextBuffer } from '../../buffer/rawText/schema';
-import { readFileRawContent } from '../read/utils';
+import { readRawContentByFileBuffer } from '../read/utils';
 import { PassThrough } from 'stream';

 export function getGFSCollection(bucket: `${BucketNameEnum}`) {
@@ -196,7 +196,7 @@ export const readFileContentFromMongo = async ({
     });
   })();

-  const { rawText } = await readFileRawContent({
+  const { rawText } = await readRawContentByFileBuffer({
     extension,
     isQAImport,
     teamId,
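Reviewer note: the `})();` in the second hunk closes an immediately invoked async function that presumably gathers the GridFS download stream into the `buffer` handed to readRawContentByFileBuffer. A minimal sketch of that pattern, assuming a `bucket: GridFSBucket` and a `fileId` outside the shown context (not FastGPT's exact code):

import { GridFSBucket, ObjectId } from 'mongodb';

// Collect a GridFS download stream into a single Buffer.
async function downloadToBuffer(bucket: GridFSBucket, fileId: ObjectId): Promise<Buffer> {
  const chunks: Buffer[] = [];
  for await (const chunk of bucket.openDownloadStream(fileId)) {
    chunks.push(chunk as Buffer);
  }
  return Buffer.concat(chunks);
}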
@@ -1,11 +1,12 @@
-import { markdownProcess, simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
+import { markdownProcess } from '@fastgpt/global/common/string/markdown';
 import { uploadMongoImg } from '../image/controller';
 import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
 import { addHours } from 'date-fns';
 import { WorkerNameEnum, runWorker } from '../../../worker/utils';
+import fs from 'fs';
+import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
 import { ReadFileResponse } from '../../../worker/file/type';
 import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';

 export const initMarkdownText = ({
   teamId,
@@ -28,7 +29,34 @@ export const initMarkdownText = ({
-  })
+  });

-export const readFileRawContent = async ({
+export type readRawTextByLocalFileParams = {
+  teamId: string;
+  path: string;
+  metadata?: Record<string, any>;
+};
+export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
+  const { path } = params;
+
+  const extension = path?.split('.')?.pop()?.toLowerCase() || '';
+
+  const buffer = fs.readFileSync(path);
+  const encoding = detectFileEncoding(buffer);
+
+  const { rawText } = await readRawContentByFileBuffer({
+    extension,
+    isQAImport: false,
+    teamId: params.teamId,
+    encoding,
+    buffer,
+    metadata: params.metadata
+  });
+
+  return {
+    rawText
+  };
+};
+
+export const readRawContentByFileBuffer = async ({
   extension,
   isQAImport,
   teamId,
@@ -69,9 +97,3 @@ export const readFileRawContent = async ({

   return { rawText };
 };
-
-export const htmlToMarkdown = async (html?: string | null) => {
-  const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
-
-  return simpleMarkdownText(md);
-};
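Reviewer note: readRawTextByLocalFile derives both the extension and the encoding itself, so a caller only needs a path. A hedged usage sketch (the path and metadata values are illustrative, not from this commit):

const { rawText } = await readRawTextByLocalFile({
  teamId,
  path: '/tmp/upload/price-list.pdf', // extension is derived from the path
  metadata: { relatedId: fileId }     // passed through to readRawContentByFileBuffer
});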
packages/service/common/middle/entry.ts (new file, 38 lines)
@@ -0,0 +1,38 @@
+import { jsonRes } from '../response';
+import type { NextApiResponse } from 'next';
+import { withNextCors } from './cors';
+import { ApiRequestProps } from '../../type/next';
+
+export type NextApiHandler<T = any> = (
+  req: ApiRequestProps,
+  res: NextApiResponse<T>
+) => unknown | Promise<unknown>;
+
+export const NextEntry = ({ beforeCallback = [] }: { beforeCallback?: Promise<any>[] }) => {
+  return (...args: NextApiHandler[]): NextApiHandler => {
+    return async function api(req: ApiRequestProps, res: NextApiResponse) {
+      try {
+        await Promise.all([withNextCors(req, res), ...beforeCallback]);
+
+        let response = null;
+        for (const handler of args) {
+          response = await handler(req, res);
+        }
+
+        const contentType = res.getHeader('Content-Type');
+        if ((!contentType || contentType === 'application/json') && !res.writableFinished) {
+          return jsonRes(res, {
+            code: 200,
+            data: response
+          });
+        }
+      } catch (error) {
+        return jsonRes(res, {
+          code: 500,
+          error,
+          url: req.url
+        });
+      }
+    };
+  };
+};
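Reviewer note: NextEntry awaits CORS plus any beforeCallback promises, runs the handlers in order, and JSON-wraps the last return value unless a handler already wrote a non-JSON response. In use, a project would typically specialize it once and reuse it across routes; a hedged sketch (NextAPI and connectToDatabase are assumed names, not shown in this commit):

const NextAPI = NextEntry({ beforeCallback: [connectToDatabase()] });

async function handler(req: ApiRequestProps, res: NextApiResponse) {
  // whatever the last handler returns becomes jsonRes(res, { code: 200, data })
  return { hello: 'world' };
}

export default NextAPI(handler);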
@@ -1,7 +1,7 @@
 import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/api';
 import * as cheerio from 'cheerio';
 import axios from 'axios';
-import { htmlToMarkdown } from '../file/read/utils';
+import { htmlToMarkdown } from './utils';

 export const cheerioToHtml = ({
   fetchUrl,
packages/service/common/string/utils.ts (new file, 8 lines)
|
||||
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
|
||||
import { WorkerNameEnum, runWorker } from '../../worker/utils';
|
||||
|
||||
export const htmlToMarkdown = async (html?: string | null) => {
|
||||
const md = await runWorker<string>(WorkerNameEnum.htmlStr2Md, { html: html || '' });
|
||||
|
||||
return simpleMarkdownText(md);
|
||||
};
|
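Reviewer note: running htmlStr2Md in a worker keeps heavy HTML parsing off the main event loop, and moving the helper here breaks the string utilities' dependency on the file-reading module. Usage is a single await (input is illustrative):

// roughly "# Docs\n\nHello **world**" after simpleMarkdownText cleanup
const md = await htmlToMarkdown('<h1>Docs</h1><p>Hello <b>world</b></p>');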
@@ -32,6 +32,9 @@ export async function createOneCollection({
   fileId,
   rawLink,
+
+  externalFileId,
+  externalFileUrl,

   hashRawText,
   rawTextLength,
   metadata = {},
@@ -61,6 +64,8 @@ export async function createOneCollection({

   fileId,
   rawLink,
+  externalFileId,
+  externalFileUrl,

   rawTextLength,
   hashRawText,
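Reviewer note: with the two new parameters, an external-file collection can record both the third-party id and a fetchable URL. A hedged sketch of such a call (the collection-type enum member and all field values are assumptions, not shown in these hunks):

await createOneCollection({
  teamId,
  tmbId,
  datasetId,
  type: DatasetCollectionTypeEnum.externalFile, // assumed enum member
  name: 'price-list.pdf',
  externalFileId: 'doc_123',                    // id in the external system
  externalFileUrl: 'https://example.com/files/price-list.pdf',
  rawTextLength: rawText.length,
  hashRawText: hashStr(rawText)
});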
@@ -66,7 +66,11 @@ const DatasetCollectionSchema = new Schema({
     type: String
   },
   sourceId: String,
+  tags: {
+    type: [String],
+    default: []
+  },

   // local file collection
   fileId: {
     type: Schema.Types.ObjectId,
@@ -74,13 +78,13 @@ const DatasetCollectionSchema = new Schema({
   },
   // web link collection
   rawLink: String,
+
+  // external collection
+  externalFileId: String,

   // metadata
   rawTextLength: Number,
   hashRawText: String,
-  externalSourceUrl: String, // external import url
+  externalFileUrl: String, // external import url
   metadata: {
     type: Object,
     default: {}
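Reviewer note: the new tags array makes collections filterable by label. A minimal query sketch (MongoDatasetCollection is the model name used elsewhere in FastGPT; the tag values are assumptions):

const tagged = await MongoDatasetCollection.find({
  datasetId,
  tags: { $in: ['contracts', '2024'] } // matches collections carrying either tag
}).lean();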
@@ -2,13 +2,20 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
 import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
 import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
 import { urlsFetch } from '../../common/string/cheerio';
 import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
 import { parseCsvTable2Chunks } from './training/utils';
 import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import axios from 'axios';
-import { readFileRawContent } from '../../common/file/read/utils';
+import { readRawContentByFileBuffer } from '../../common/file/read/utils';

-export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; url: string }) => {
+export const readFileRawTextByUrl = async ({
+  teamId,
+  url,
+  relatedId
+}: {
+  teamId: string;
+  url: string;
+  relatedId?: string;
+}) => {
   const response = await axios({
     method: 'get',
     url: url,
@@ -18,11 +25,14 @@ export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; ur
   const buffer = Buffer.from(response.data, 'binary');

-  const { rawText } = await readFileRawContent({
+  const { rawText } = await readRawContentByFileBuffer({
     extension,
     teamId,
     buffer,
-    encoding: 'utf-8'
+    encoding: 'utf-8',
+    metadata: {
+      relatedId
+    }
   });

   return rawText;
@@ -38,13 +48,15 @@ export const readDatasetSourceRawText = async ({
   teamId,
   type,
   sourceId,
   isQAImport,
-  selector
+  selector,
+  relatedId
 }: {
   teamId: string;
   type: DatasetSourceReadTypeEnum;
   sourceId: string;
   isQAImport?: boolean;
   selector?: string;
+  relatedId?: string;
 }): Promise<string> => {
   if (type === DatasetSourceReadTypeEnum.fileLocal) {
     const { rawText } = await readFileContentFromMongo({
@@ -64,7 +76,8 @@ export const readDatasetSourceRawText = async ({
   } else if (type === DatasetSourceReadTypeEnum.externalFile) {
     const rawText = await readFileRawTextByUrl({
       teamId,
       url: sourceId,
-      url: sourceId
+      relatedId
     });
     return rawText;
   }
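Reviewer note: in the new externalFile branch, sourceId carries the file URL and relatedId threads the external id into the read metadata. A hedged call sketch (URL and variable names are illustrative):

const rawText = await readDatasetSourceRawText({
  teamId,
  type: DatasetSourceReadTypeEnum.externalFile,
  sourceId: 'https://example.com/files/manual.pdf', // for external files, sourceId is the URL
  relatedId: externalFileId                         // stored in metadata for later tracing
});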
@@ -18,6 +18,7 @@ import { countPromptTokens } from '../../../common/string/tiktoken/index';
 import { datasetSearchResultConcat } from '@fastgpt/global/core/dataset/search/utils';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { jiebaSplit } from '../../../common/string/jieba';
+import { getCollectionSourceData } from '@fastgpt/global/core/dataset/collection/utils';

 type SearchDatasetDataProps = {
   teamId: string;
@@ -98,7 +99,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   },
   'datasetId collectionId q a chunkIndex indexes'
 )
-  .populate('collectionId', 'name fileId rawLink')
+  .populate('collectionId', 'name fileId rawLink externalFileId externalFileUrl')
   .lean()) as DatasetDataWithCollectionType[];

 // add score to data(It's already sorted. The first one is the one with the most points)
@@ -130,8 +131,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   chunkIndex: data.chunkIndex,
   datasetId: String(data.datasetId),
   collectionId: String(data.collectionId?._id),
-  sourceName: data.collectionId?.name || '',
-  sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
+  ...getCollectionSourceData(data.collectionId),
   score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
 };
@@ -205,8 +205,7 @@ export async function searchDatasetData(props: SearchDatasetDataProps) {
   id: String(item._id),
   datasetId: String(item.datasetId),
   collectionId: String(item.collectionId),
-  sourceName: collection?.name || '',
-  sourceId: collection?.fileId || collection?.rawLink,
+  ...getCollectionSourceData(collection),
   q: item.q,
   a: item.a,
   chunkIndex: item.chunkIndex,
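Reviewer note: both result builders now spread getCollectionSourceData instead of hand-picking sourceName/sourceId, so the new external fields resolve in one place. A hedged sketch of what that helper plausibly centralizes (the real one lives in @fastgpt/global and its exact shape may differ):

export const getCollectionSourceData = (collection?: {
  name?: string;
  fileId?: string;
  rawLink?: string;
  externalFileId?: string;
  externalFileUrl?: string;
}) => ({
  sourceName: collection?.name || '',
  sourceId:
    collection?.fileId ||
    collection?.rawLink ||
    collection?.externalFileId ||
    collection?.externalFileUrl
});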
@@ -174,7 +174,7 @@ export async function pushDataListToTrainingQueue({
   } catch (error: any) {
     addLog.error(`Insert error`, error);
     // if the insert fails, push the failed documents onto the failed list
-    error.writeErrors.forEach((writeError: any) => {
+    error.writeErrors?.forEach((writeError: any) => {
       failedDocuments.push(data[writeError.index]);
     });
     console.log('failed', failedDocuments);
@@ -35,7 +35,7 @@ const TrainingDataSchema = new Schema({
   },
   billId: {
     // concat bill
-    type: Schema.Types.ObjectId
+    type: String
   },
   mode: {
     type: String,
@@ -53,7 +53,7 @@ export const dispatchLafRequest = async (props: LafRequestProps): Promise<LafRes
     appId,
     chatId,
     responseChatItemId,
-    histories: histories.slice(0, 10)
+    histories: histories?.slice(0, 10)
   },
   variables,
   ...dynamicInput,
@@ -21,7 +21,7 @@
     "mammoth": "^1.6.0",
     "mongoose": "^7.0.2",
     "multer": "1.4.5-lts.1",
-    "next": "13.5.2",
+    "next": "14.2.3",
     "nextjs-cors": "^2.1.2",
     "node-cron": "^3.0.3",
     "node-xlsx": "^0.23.0",
@@ -19,7 +19,9 @@ export const checkDatasetLimit = async ({
   if (!standardConstants) return;

   if (usedSize + insertLen >= datasetMaxSize) {
-    return Promise.reject(TeamErrEnum.datasetSizeNotEnough);
+    return Promise.reject(
+      `您的知识库容量为: ${datasetMaxSize}组,已使用: ${usedSize}组,导入当前文件需要: ${insertLen}组,请增加知识库容量后导入。`
+    );
   }

   if (usedPoints >= totalPoints) {
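Reviewer note: the rejection now carries a human-readable message in place of the bare TeamErrEnum.datasetSizeNotEnough code, telling the user their capacity, current usage, and the size of the pending import. An English-language equivalent of the same guard, as a sketch:

if (usedSize + insertLen >= datasetMaxSize) {
  return Promise.reject(
    `Dataset capacity is ${datasetMaxSize} chunks; ${usedSize} are in use and this import needs ${insertLen}. Please increase the dataset capacity before importing.`
  );
}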
@@ -9,7 +9,7 @@ import { readXlsxRawText } from './extension/xlsx';
 import { readCsvRawText } from './extension/csv';

 parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
-  const readFileRawContent = async (params: ReadRawTextByBuffer) => {
+  const readRawContentByFileBuffer = async (params: ReadRawTextByBuffer) => {
     switch (params.extension) {
       case 'txt':
       case 'md':
@@ -41,7 +41,7 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
   try {
     parentPort?.postMessage({
       type: 'success',
-      data: await readFileRawContent(newProps)
+      data: await readRawContentByFileBuffer(newProps)
     });
   } catch (error) {
     console.log(error);
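Reviewer note: on the service side, this worker is presumably reached through runWorker, which resolves when the 'success' message above is posted. A hedged sketch of that round trip (the worker name and parameter shape are assumptions beyond what the hunks show):

const { rawText } = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
  extension,
  encoding,
  buffer // delivered to the worker as the Uint8Array in ReadRawTextProps<Uint8Array>
});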