feat: Text check before synchronization (#689)

* fix: icon

* fix: web selector

* fix: web selector

* perf: link sync

* dev doc

* chmod doc

* perf: git intro

* 466 intro

* intro img

* add json editor (#5)

* team limit

* websync limit

* json editor

* text editor

* perf: search test

* change cq value type

* doc

* intro img

---------

Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
Author: Archer
Date: 2024-01-04 23:19:24 +08:00
Committed by: GitHub
Parent: c2abbb579f
Commit: 828829011a
64 changed files with 1789 additions and 1489 deletions


@@ -15,7 +15,8 @@ export const cheerioToHtml = ({
   // get origin url
   const originUrl = new URL(fetchUrl).origin;
-  const selectDom = $(selector || 'body');
+  const usedSelector = selector || 'body';
+  const selectDom = $(usedSelector);

   // remove i element
   selectDom.find('i,script').remove();
@@ -49,7 +50,10 @@ export const cheerioToHtml = ({
     .get()
     .join('\n');

-  return html;
+  return {
+    html,
+    usedSelector
+  };
 };

 export const urlsFetch = async ({
   urlList,
@@ -66,25 +70,25 @@ export const urlsFetch = async ({
   });

   const $ = cheerio.load(fetchRes.data);
-  const md = await htmlToMarkdown(
-    cheerioToHtml({
-      fetchUrl: url,
-      $,
-      selector
-    })
-  );
+  const { html, usedSelector } = cheerioToHtml({
+    fetchUrl: url,
+    $,
+    selector
+  });
+  const md = await htmlToMarkdown(html);

   return {
     url,
-    content: md
+    content: md,
+    selector: usedSelector
   };
 } catch (error) {
   console.log(error, 'fetch error');

   return {
     url,
-    content: ''
+    content: '',
+    selector: ''
   };
 }
 })
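A minimal usage sketch of the new cheerioToHtml contract, assuming the function is exported from the module patched above; the sample HTML, URL, and import path are illustrative:

import * as cheerio from 'cheerio';
import { cheerioToHtml } from './cheerio'; // assumed module path

const $ = cheerio.load('<body><main><p>hello</p><script>track()</script></main></body>');
const { html, usedSelector } = cheerioToHtml({
  fetchUrl: 'https://example.com/docs', // only the origin is read internally
  $,
  selector: undefined // no selector configured for this dataset
});
// usedSelector === 'body': the fallback is now surfaced to the caller,
// so urlsFetch can persist the selector that actually produced the content.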


@@ -21,6 +21,9 @@ export const htmlToMarkdown = (html?: string | null) =>
       worker.terminate();
       reject(err);
     });
+    worker.on('exit', (code) => {
+      console.log('html 2 md finish', code);
+    });

     worker.postMessage(html);
   });
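For context, the surrounding htmlToMarkdown wrapper follows the common one-shot worker_threads pattern that this hunk extends with an exit log; a self-contained sketch under that assumption (the path and payload shape are illustrative, not the project's):

import { Worker } from 'worker_threads';

export const runWorkerOnce = <T>(workerPath: string, payload: unknown) =>
  new Promise<T>((resolve, reject) => {
    const worker = new Worker(workerPath);
    worker.on('message', (data: T) => {
      worker.terminate(); // one-shot: release the thread once a result arrives
      resolve(data);
    });
    worker.on('error', (err) => {
      worker.terminate();
      reject(err);
    });
    worker.on('exit', (code) => {
      // fires after terminate() or a natural exit; handy for timing/debug logs
      console.log('worker exited with code', code);
    });
    worker.postMessage(payload);
  });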


@@ -19,14 +19,16 @@ export async function createOneCollection({
   qaPrompt,
   hashRawText,
   rawTextLength,
-  metadata = {}
-}: CreateDatasetCollectionParams & { teamId: string; tmbId: string }) {
+  metadata = {},
+  ...props
+}: CreateDatasetCollectionParams & { teamId: string; tmbId: string; [key: string]: any }) {
   const { _id } = await MongoDatasetCollection.create({
-    name,
+    ...props,
     teamId,
     tmbId,
-    datasetId,
     parentId: parentId || null,
+    datasetId,
+    name,
     type,
     trainingType,
     chunkSize,
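With the rest-props passthrough above, callers can persist extra fields on a collection without widening the typed parameter list. A hedged sketch, assuming teamId, tmbId, and datasetId are in scope; webPageSelector is the metadata key consumed later by getCollectionAndRawText:

await createOneCollection({
  teamId,
  tmbId,
  datasetId,
  parentId: null,
  name: 'Docs page',
  type: DatasetCollectionTypeEnum.link,
  trainingType: TrainingModeEnum.chunk, // illustrative training mode
  chunkSize: 512,
  rawLink: 'https://example.com/docs',
  metadata: { webPageSelector: 'article' } // read back during re-sync
});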


@@ -75,6 +75,7 @@ const DatasetCollectionSchema = new Schema({
   qaPrompt: {
     type: String
   },
+  rawTextLength: {
+    type: Number
+  },


@@ -1,11 +1,11 @@
 import type { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type.d';
 import { MongoDatasetCollection } from './schema';
 import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type.d';
-import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { MongoDatasetTraining } from '../training/schema';
 import { urlsFetch } from '../../../common/string/cheerio';
 import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import { hashStr } from '@fastgpt/global/common/string/tools';
/**
* get all collection by top collectionId
@@ -65,64 +65,114 @@ export function getCollectionUpdateTime({ name, time }: { time?: Date; name: str
   return new Date();
 }

-/* link collection start load data */
-export const loadingOneChunkCollection = async ({
+/**
+ * Get collection raw text by Collection or collectionId
+ */
+export const getCollectionAndRawText = async ({
   collectionId,
-  tmbId,
-  billId,
-  rawText
+  collection,
+  newRawText
 }: {
-  collectionId: string;
-  tmbId: string;
-  billId?: string;
-  rawText?: string;
+  collectionId?: string;
+  collection?: CollectionWithDatasetType;
+  newRawText?: string;
 }) => {
-  const collection = (await MongoDatasetCollection.findById(collectionId).populate(
-    'datasetId'
-  )) as CollectionWithDatasetType;
+  const col = await (async () => {
+    if (collection) return collection;
+    if (collectionId) {
+      return (await MongoDatasetCollection.findById(collectionId).populate(
+        'datasetId'
+      )) as CollectionWithDatasetType;
+    }

-  if (!collection) {
-    return Promise.reject(DatasetErrEnum.unCreateCollection);
+    return null;
+  })();
+
+  if (!col) {
+    return Promise.reject('Collection not found');
   }

-  const newRawText = await (async () => {
-    if (rawText) return rawText;
+  const rawText = await (async () => {
+    if (newRawText) return newRawText;

     // link
-    if (collection.type === DatasetCollectionTypeEnum.link && collection.rawLink) {
+    if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
       // crawl new data
       const result = await urlsFetch({
-        urlList: [collection.rawLink],
-        selector: collection.datasetId?.websiteConfig?.selector
+        urlList: [col.rawLink],
+        selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
       });

       return result[0].content;
     }

     // file

     return '';
   })();

+  const hashRawText = hashStr(rawText);
+  const isSameRawText = col.hashRawText === hashRawText;
+
+  return {
+    collection: col,
+    rawText,
+    isSameRawText
+  };
+};
+
+/* link collection start load data */
+export const reloadCollectionChunks = async ({
+  collectionId,
+  collection,
+  tmbId,
+  billId,
+  rawText
+}: {
+  collectionId?: string;
+  collection?: CollectionWithDatasetType;
+  tmbId: string;
+  billId?: string;
+  rawText?: string;
+}) => {
+  const {
+    rawText: newRawText,
+    collection: col,
+    isSameRawText
+  } = await getCollectionAndRawText({
+    collection,
+    collectionId,
+    newRawText: rawText
+  });
+
+  if (isSameRawText) return;
+
   // split data
   const { chunks } = splitText2Chunks({
     text: newRawText,
-    chunkLen: collection.chunkSize || 512,
+    chunkLen: col.chunkSize || 512,
     countTokens: false
   });

   // insert to training queue
   await MongoDatasetTraining.insertMany(
     chunks.map((item, i) => ({
-      teamId: collection.teamId,
+      teamId: col.teamId,
       tmbId,
-      datasetId: collection.datasetId._id,
-      collectionId: collection._id,
+      datasetId: col.datasetId._id,
+      collectionId: col._id,
       billId,
-      mode: collection.trainingType,
+      mode: col.trainingType,
       prompt: '',
-      model: collection.datasetId.vectorModel,
+      model: col.datasetId.vectorModel,
       q: item,
       a: '',
       chunkIndex: i
     }))
   );
+
+  // update raw text
+  await MongoDatasetCollection.findByIdAndUpdate(col._id, {
+    rawTextLength: newRawText.length,
+    hashRawText: hashStr(newRawText)
+  });
 };
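Taken together, this is the "text check before synchronization" flow from the PR title: crawl once, hash the fetched text, compare it with the stored hash, and only re-chunk and re-queue training when the content actually changed. A hedged caller sketch (the status strings are illustrative):

const syncLinkCollection = async (collectionId: string, tmbId: string) => {
  // one crawl serves both the change check and the reload
  const { collection, rawText, isSameRawText } = await getCollectionAndRawText({
    collectionId
  });
  if (isSameRawText) return 'sameRaw'; // unchanged: skip chunking and training

  await reloadCollectionChunks({
    collection,
    rawText, // pass the already-fetched text so it is not crawled twice
    tmbId
  });
  return 'success';
};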


@@ -47,15 +47,6 @@ const UserSchema = new Schema({
     type: Number,
     default: 15
   },
-  limit: {
-    exportKbTime: {
-      // Every half hour
-      type: Date
-    },
-    datasetMaxCount: {
-      type: Number
-    }
-  },
   openaiAccount: {
     type: {
       key: String,


@@ -32,6 +32,14 @@ const TeamSchema = new Schema({
   },
   lastDatasetBillTime: {
     type: Date
+  },
+  limit: {
+    lastExportDatasetTime: {
+      type: Date
+    },
+    lastWebsiteSyncTime: {
+      type: Date
+    }
   }
 });


@@ -0,0 +1,69 @@
+import { MongoTeam } from './team/teamSchema';
+
+/* export dataset limit */
+export const updateExportDatasetLimit = async (teamId: string) => {
+  try {
+    await MongoTeam.findByIdAndUpdate(teamId, {
+      'limit.lastExportDatasetTime': new Date()
+    });
+  } catch (error) {}
+};
+export const checkExportDatasetLimit = async ({
+  teamId,
+  limitMinutes = 0
+}: {
+  teamId: string;
+  limitMinutes?: number;
+}) => {
+  const limitMinutesAgo = new Date(Date.now() - limitMinutes * 60 * 1000);
+
+  // auth export times
+  const authTimes = await MongoTeam.findOne(
+    {
+      _id: teamId,
+      $or: [
+        { 'limit.lastExportDatasetTime': { $exists: false } },
+        { 'limit.lastExportDatasetTime': { $lte: limitMinutesAgo } }
+      ]
+    },
+    '_id limit'
+  );
+
+  if (!authTimes) {
+    return Promise.reject(`Each team can export only once every ${limitMinutes} minutes.`);
+  }
+};
+
+/* web sync limit */
+export const updateWebSyncLimit = async (teamId: string) => {
+  try {
+    await MongoTeam.findByIdAndUpdate(teamId, {
+      'limit.lastWebsiteSyncTime': new Date()
+    });
+  } catch (error) {}
+};
+export const checkWebSyncLimit = async ({
+  teamId,
+  limitMinutes = 0
+}: {
+  teamId: string;
+  limitMinutes?: number;
+}) => {
+  const limitMinutesAgo = new Date(Date.now() - limitMinutes * 60 * 1000);
+
+  // auth sync times
+  const authTimes = await MongoTeam.findOne(
+    {
+      _id: teamId,
+      $or: [
+        { 'limit.lastWebsiteSyncTime': { $exists: false } },
+        { 'limit.lastWebsiteSyncTime': { $lte: limitMinutesAgo } }
+      ]
+    },
+    '_id limit'
+  );
+
+  if (!authTimes) {
+    return Promise.reject(`Each team can use the sync feature only once every ${limitMinutes} minutes.`);
+  }
+};
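A hedged usage sketch of the check/update pair guarding a website-sync endpoint; the 3-minute window is illustrative:

const triggerWebsiteSync = async (teamId: string) => {
  // rejects with the rate-limit message if the team synced too recently
  await checkWebSyncLimit({ teamId, limitMinutes: 3 });

  // ... kick off the actual website sync here ...

  // stamp limit.lastWebsiteSyncTime so another call inside the window is refused
  await updateWebSyncLimit(teamId);
};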