4.6.3-website dataset (#532)

Archer
2023-12-03 20:45:57 +08:00
committed by GitHub
parent b916183848
commit a9ae270335
122 changed files with 3793 additions and 1360 deletions

packages/global/common/file/api.d.ts

@@ -0,0 +1,8 @@
export type UrlFetchParams = {
urlList: string[];
selector?: string;
};
export type UrlFetchResponse = {
url: string;
content: string;
}[];


@@ -1,3 +1,8 @@
import axios from 'axios';
import { UrlFetchParams, UrlFetchResponse } from './api.d';
import { htmlToMarkdown } from '../string/markdown';
import * as cheerio from 'cheerio';
export const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
@@ -7,3 +12,84 @@ export const formatFileSize = (bytes: number): string => {
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
};
export const cheerioToHtml = ({
fetchUrl,
$,
selector
}: {
fetchUrl: string;
$: cheerio.CheerioAPI;
selector?: string;
}) => {
// get origin url
const originUrl = new URL(fetchUrl).origin;
// remove i and script elements
$('i,script').remove();
// remove empty anchor elements
$('a')
.filter((i, el) => {
return $(el).text().trim() === '' && $(el).children().length === 0;
})
.remove();
// prefix root-relative link and image URLs with the origin url
$('a').each((i, el) => {
const href = $(el).attr('href');
if (href && href.startsWith('/')) {
$(el).attr('href', originUrl + href);
}
});
$('img').each((i, el) => {
const src = $(el).attr('src');
if (src && src.startsWith('/')) {
$(el).attr('src', originUrl + src);
}
});
return $(selector || 'body').html();
};
export const urlsFetch = async ({
urlList,
selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
urlList = urlList.filter((url) => /^(http|https):\/\/[^ "]+$/.test(url));
const response = (
await Promise.all(
urlList.map(async (url) => {
try {
const fetchRes = await axios.get(url, {
timeout: 30000
});
const $ = cheerio.load(fetchRes.data);
const md = htmlToMarkdown(
cheerioToHtml({
fetchUrl: url,
$,
selector
})
);
return {
url,
content: md
};
} catch (error) {
console.log(error, 'fetch error');
return {
url,
content: ''
};
}
})
)
).filter((item) => item.content);
return response;
};
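
A minimal usage sketch for urlsFetch (the import path and async calling context are assumed; invalid URLs are filtered out up front, and failed fetches yield empty content that is dropped from the result):

import { urlsFetch } from './tools'; // assumed path

const pages = await urlsFetch({
  urlList: ['https://example.com/docs', 'not-a-url'], // the second entry fails the URL filter
  selector: 'article' // optional; cheerioToHtml falls back to 'body'
});
// pages: [{ url: 'https://example.com/docs', content: '# Docs ...' }]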


@@ -1,4 +0,0 @@
export type FetchResultItem = {
url: string;
content: string;
};

File diff suppressed because it is too large


@@ -0,0 +1,97 @@
import { simpleText } from './tools';
import { NodeHtmlMarkdown } from 'node-html-markdown';
/* Delete redundant text in markdown */
export const simpleMarkdownText = (rawText: string) => {
rawText = simpleText(rawText);
// Remove line breaks inside hyperlink or image link text
rawText = rawText.replace(/\[([^\]]+)\]\((.+?)\)/g, (match, linkText, url) => {
const cleanedLinkText = linkText.replace(/\n/g, ' ').trim();
if (!url) {
return '';
}
return `[${cleanedLinkText}](${url})`;
});
// unescape characters that were needlessly backslash-escaped, e.g. \. \* \(
const reg1 = /\\([-.!`_(){}\[\]])/g;
if (reg1.test(rawText)) {
rawText = rawText.replace(/\\([`!*()+\-_\[\]{}\\.])/g, '$1');
}
}
// collapse escaped "\\n" sequences into "\n"
rawText = rawText.replace(/\\\\n/g, '\\n');
// Remove leading spaces before headings and code fences
['####', '###', '##', '#', '```', '~~~'].forEach((item) => {
const reg = new RegExp(`\\n\\s*${item}`, 'g');
if (reg.test(rawText)) {
rawText = rawText.replace(new RegExp(`\\n\\s*(${item})`, 'g'), '\n$1');
}
});
return rawText.trim();
};
/* html string to markdown */
export const htmlToMarkdown = (html?: string | null) => {
if (!html) return '';
const surround = (source: string, surroundStr: string) => `${surroundStr}${source}${surroundStr}`;
const nhm = new NodeHtmlMarkdown(
{
codeFence: '```',
codeBlockStyle: 'fenced',
ignore: ['i', 'script']
},
{
code: ({ node, parent, options: { codeFence, codeBlockStyle }, visitor }) => {
const isCodeBlock = ['PRE', 'WRAPPED-PRE'].includes(parent?.tagName!);
if (!isCodeBlock) {
return {
spaceIfRepeatingChar: true,
noEscape: true,
postprocess: ({ content }) => {
// Find longest occurring sequence of running backticks and add one more (so content is escaped)
const delimiter =
'`' + (content.match(/`+/g)?.sort((a, b) => b.length - a.length)?.[0] || '');
const padding = delimiter.length > 1 ? ' ' : '';
return surround(surround(content, padding), delimiter);
}
};
}
/* Handle code block */
if (codeBlockStyle === 'fenced') {
const language =
node.getAttribute('class')?.match(/language-(\S+)/)?.[1] ||
parent?.getAttribute('class')?.match(/language-(\S+)/)?.[1] ||
'';
return {
noEscape: true,
prefix: `${codeFence}${language}\n`,
postfix: `\n${codeFence}\n`,
childTranslators: visitor.instance.codeBlockTranslators
};
}
return {
noEscape: true,
postprocess: ({ content }) => content.replace(/^/gm, ' '),
childTranslators: visitor.instance.codeBlockTranslators
};
}
}
);
const markdown = nhm.translate(html).trim();
return simpleMarkdownText(markdown);
};
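
For illustration, a rough sketch of what the translator produces (exact whitespace may differ):

const md = htmlToMarkdown(
  '<h1>Title</h1><pre><code class="language-ts">const a = 1;</code></pre>'
);
// md ≈ '# Title\n\n```ts\nconst a = 1;\n```'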


@@ -15,11 +15,18 @@ export const splitText2Chunks = (props: {
}): {
chunks: string[];
tokens: number;
overlapRatio?: number;
} => {
const { text = '', chunkLen, overlapRatio = 0.2 } = props;
let { text = '', chunkLen, overlapRatio = 0.2 } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio);
// replace every \n inside code blocks with codeBlockMarker so splitting cannot cut through them
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
return match.replace(/\n/g, codeBlockMarker);
});
// The larger maxLen is, the less likely the next sentence is to trigger a split
const stepReges: { reg: RegExp; maxLen: number }[] = [
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
@@ -27,8 +34,8 @@ export const splitText2Chunks = (props: {
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /([\n]{2})/g, maxLen: chunkLen * 1.4 },
{ reg: /([\n](?![\*\-|>`0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n](`))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?![\*\-|>0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>0-9]): markdown special chars
{ reg: /([\n])/g, maxLen: chunkLen * 1.4 },
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.4 },
@@ -38,9 +45,15 @@ export const splitText2Chunks = (props: {
{ reg: /([，]|,\s)/g, maxLen: chunkLen * 2 }
];
// when splitting on markdown titles, record each segment's title separately
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
return [text];
return [
{
text,
title: ''
}
];
}
const isMarkdownSplit = step <= 3;
const { reg } = stepReges[step];
@@ -49,7 +62,17 @@ export const splitText2Chunks = (props: {
.replace(reg, isMarkdownSplit ? `${splitMarker}$1` : `$1${splitMarker}`)
.split(`${splitMarker}`)
.filter((part) => part.trim());
return splitTexts;
return splitTexts
.map((text) => {
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
return {
text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
title: matchTitle
};
})
.filter((item) => item.text.trim());
};
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
@@ -63,7 +86,7 @@ export const splitText2Chunks = (props: {
let overlayText = '';
for (let i = splitTexts.length - 1; i >= 0; i--) {
const currentText = splitTexts[i];
const currentText = splitTexts[i].text;
const newText = currentText + overlayText;
const newTextLen = newText.length;
@@ -83,12 +106,16 @@ export const splitText2Chunks = (props: {
const splitTextRecursively = ({
text = '',
step,
lastText
lastText,
mdTitle = ''
}: {
text: string;
step: number;
lastText: string;
mdTitle: string;
}): string[] => {
const isMarkdownSplit = step <= 3;
// text already fits within a single chunk
if (text.length <= chunkLen) {
return [text];
@@ -102,7 +129,7 @@ export const splitText2Chunks = (props: {
// fall back to fixed-size slicing with overlap
const chunks: string[] = [];
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
chunks.push(text.slice(i, i + chunkLen));
chunks.push(`${mdTitle}${text.slice(i, i + chunkLen)}`);
}
return chunks;
}
@@ -115,7 +142,10 @@ export const splitText2Chunks = (props: {
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const currentText = splitTexts[i];
const item = splitTexts[i];
const currentTitle = `${mdTitle}${item.title}`;
const currentText = item.text;
const currentTextLen = currentText.length;
const lastTextLen = lastText.length;
const newText = lastText + currentText;
@@ -125,9 +155,10 @@ export const splitText2Chunks = (props: {
if (newTextLen > maxLen) {
// lastText exceeds minChunkLen: push it directly to chunks rather than carrying it into the next chunk
if (lastTextLen > minChunkLen) {
chunks.push(lastText);
chunks.push(`${currentTitle}${lastText}`);
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
i--;
continue;
}
@@ -135,11 +166,12 @@ export const splitText2Chunks = (props: {
const innerChunks = splitTextRecursively({
text: newText,
step: step + 1,
lastText: ''
lastText: '',
mdTitle: currentTitle
});
const lastChunk = innerChunks[innerChunks.length - 1];
// the last inner chunk is too small: carry it over as lastText
if (lastChunk.length < minChunkLen) {
if (!isMarkdownSplit && lastChunk.length < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1));
lastText = lastChunk;
} else {
@@ -156,10 +188,11 @@ export const splitText2Chunks = (props: {
// size is under chunkLen: merge text into lastText (newText is guaranteed under maxLen here)
lastText = newText;
// If the chunk size reaches, add a chunk
if (newTextLen >= chunkLen) {
chunks.push(lastText);
lastText = getOneTextOverlapText({ text: lastText, step });
// markdown heading blocks are pushed directly; otherwise a chunk is pushed once chunkLen is reached
if (isMarkdownSplit || newTextLen >= chunkLen) {
chunks.push(`${currentTitle}${lastText}`);
lastText = isMarkdownSplit ? '' : getOneTextOverlapText({ text: lastText, step });
}
}
@@ -168,7 +201,7 @@ export const splitText2Chunks = (props: {
if (lastText.length < chunkLen * 0.4) {
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
} else {
chunks.push(lastText);
chunks.push(`${mdTitle}${lastText}`);
}
}
@@ -179,8 +212,9 @@ export const splitText2Chunks = (props: {
const chunks = splitTextRecursively({
text,
step: 0,
lastText: ''
});
lastText: '',
mdTitle: ''
}).map((chunk) => chunk.replaceAll(codeBlockMarker, '\n')); // restore code block
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
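
The codeBlockMarker round trip is the easiest part to see in isolation; a small self-contained sketch of the same technique:

const marker = 'CODE_BLOCK_LINE_MARKER';
let sample = 'intro\n\n```ts\nconst a = 1;\n```\n\noutro';
// hide newlines inside fenced blocks so the \n-based split rules cannot cut through them
sample = sample.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, (m) => m.replace(/\n/g, marker));
// ...splitting runs on `sample`...
const restored = sample.replaceAll(marker, '\n'); // the real code restores this per chunk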


@@ -0,0 +1,6 @@
export const delay = (ms: number) =>
new Promise((resolve) => {
setTimeout(() => {
resolve('');
}, ms);
});


@@ -5,7 +5,6 @@ export enum ChatRoleEnum {
Function = 'Function',
Tool = 'Tool'
}
export const ChatRoleMap = {
[ChatRoleEnum.System]: {
name: '系统提示词' // "System prompt"
@@ -30,7 +29,6 @@ export enum ChatSourceEnum {
share = 'share',
api = 'api'
}
export const ChatSourceMap = {
[ChatSourceEnum.test]: {
name: 'chat.logs.test'


@@ -1,8 +1,32 @@
import { DatasetDataIndexItemType } from './type';
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { DatasetCollectionTrainingModeEnum, DatasetCollectionTypeEnum } from './constant';
import type { LLMModelItemType } from '../ai/model.d';
/* ================= dataset ===================== */
export type DatasetUpdateBody = {
id: string;
parentId?: string;
tags?: string[];
name?: string;
avatar?: string;
permission?: DatasetSchemaType['permission'];
agentModel?: LLMModelItemType;
websiteConfig?: DatasetSchemaType['websiteConfig'];
status?: DatasetSchemaType['status'];
};
/* ================= collection ===================== */
export type CreateDatasetCollectionParams = {
datasetId: string;
parentId?: string;
name: string;
type: `${DatasetCollectionTypeEnum}`;
trainingType?: `${DatasetCollectionTrainingModeEnum}`;
chunkSize?: number;
fileId?: string;
rawLink?: string;
metadata?: Record<string, any>;
};
/* ================= data ===================== */
export type PgSearchRawType = {
@@ -18,3 +42,8 @@ export type PushDatasetDataChunkProps = {
a?: string; // bonus content
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
};
export type PostWebsiteSyncParams = {
datasetId: string;
billId: string;
};


@@ -3,15 +3,37 @@ export const PgDatasetTableName = 'modeldata';
/* ------------ dataset -------------- */
export enum DatasetTypeEnum {
folder = 'folder',
dataset = 'dataset'
dataset = 'dataset',
websiteDataset = 'websiteDataset' // deep link
}
export const DatasetTypeMap = {
[DatasetTypeEnum.folder]: {
name: 'folder'
icon: 'core/dataset/folderDataset',
label: 'core.dataset.Folder Dataset',
collectionLabel: 'common.Folder'
},
[DatasetTypeEnum.dataset]: {
name: 'dataset'
icon: 'core/dataset/commonDataset',
label: 'core.dataset.Common Dataset',
collectionLabel: 'common.File'
},
[DatasetTypeEnum.websiteDataset]: {
icon: 'core/dataset/websiteDataset',
label: 'core.dataset.Website Dataset',
collectionLabel: 'common.Website'
}
};
export enum DatasetStatusEnum {
active = 'active',
syncing = 'syncing'
}
export const DatasetStatusMap = {
[DatasetStatusEnum.active]: {
label: 'core.dataset.status.active'
},
[DatasetStatusEnum.syncing]: {
label: 'core.dataset.status.syncing'
}
};
@@ -19,7 +41,7 @@ export const DatasetTypeMap = {
export enum DatasetCollectionTypeEnum {
folder = 'folder',
file = 'file',
link = 'link',
link = 'link', // one link
virtual = 'virtual'
}
export const DatasetCollectionTypeMap = {


@@ -4,6 +4,7 @@ import { PushDatasetDataChunkProps } from './api';
import {
DatasetCollectionTypeEnum,
DatasetDataIndexTypeEnum,
DatasetStatusEnum,
DatasetTypeEnum,
TrainingModeEnum
} from './constant';
@@ -20,9 +21,14 @@ export type DatasetSchemaType = {
name: string;
vectorModel: string;
agentModel: string;
tags: string[];
intro: string;
type: `${DatasetTypeEnum}`;
status: `${DatasetStatusEnum}`;
permission: `${PermissionTypeEnum}`;
websiteConfig?: {
url: string;
selector: string;
};
};
export type DatasetCollectionSchemaType = {
@@ -39,6 +45,7 @@ export type DatasetCollectionSchemaType = {
chunkSize: number;
fileId?: string;
rawLink?: string;
metadata?: Record<string, any>;
};
export type DatasetDataIndexItemType = {
@@ -91,6 +98,18 @@ export type DatasetDataWithCollectionType = Omit<DatasetDataSchemaType, 'collect
};
/* ================= dataset ===================== */
export type DatasetListItemType = {
_id: string;
parentId: string;
avatar: string;
name: string;
intro: string;
type: `${DatasetTypeEnum}`;
isOwner: boolean;
canWrite: boolean;
permission: `${PermissionTypeEnum}`;
vectorModel: VectorModelItemType;
};
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
vectorModel: VectorModelItemType;
agentModel: LLMModelItemType;


@@ -3,13 +3,16 @@
"version": "1.0.0",
"dependencies": {
"axios": "^1.5.1",
"timezones-list": "^3.0.2",
"cheerio": "1.0.0-rc.12",
"dayjs": "^1.11.7",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"node-html-markdown": "^1.3.0",
"openai": "^4.16.1",
"js-tiktoken": "^1.0.7"
"timezones-list": "^3.0.2"
},
"devDependencies": {
"@types/node": "^20.8.5"
"@types/node": "^20.8.5",
"@types/turndown": "^5.0.4"
}
}


@@ -1,6 +1,7 @@
/* bill common */
import { PRICE_SCALE } from './constants';
import { BillItemType, BillSchema } from './type';
import { BillSourceEnum } from './constants';
import { AuthUserTypeEnum } from '../../permission/constant';
/**
* dataset price / PRICE_SCALE = real price
@@ -8,3 +9,15 @@ import { BillItemType, BillSchema } from './type';
export const formatPrice = (val = 0, multiple = 1) => {
return Number(((val / PRICE_SCALE) * multiple).toFixed(10));
};
export const getBillSourceByAuthType = ({
shareId,
authType
}: {
shareId?: string;
authType?: `${AuthUserTypeEnum}`;
}) => {
if (shareId) return BillSourceEnum.shareLink;
if (authType === AuthUserTypeEnum.apikey) return BillSourceEnum.api;
return BillSourceEnum.fastgpt;
};
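
The mapping is straightforward; for reference:

getBillSourceByAuthType({ shareId: 'abc' }); // BillSourceEnum.shareLink
getBillSourceByAuthType({ authType: AuthUserTypeEnum.apikey }); // BillSourceEnum.api
getBillSourceByAuthType({}); // BillSourceEnum.fastgpt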


@@ -101,18 +101,18 @@ export function request(url: string, data: any, config: ConfigType, method: Meth
* @param {Object} config
* @returns
*/
export function GET<T>(url: string, params = {}, config: ConfigType = {}): Promise<T> {
export function GET<T = undefined>(url: string, params = {}, config: ConfigType = {}): Promise<T> {
return request(url, params, config, 'GET');
}
export function POST<T>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
export function POST<T = undefined>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
return request(url, data, config, 'POST');
}
export function PUT<T>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
export function PUT<T = undefined>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
return request(url, data, config, 'PUT');
}
export function DELETE<T>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
export function DELETE<T = undefined>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
return request(url, data, config, 'DELETE');
}
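
With T defaulting to undefined, an unannotated call now types its result as Promise<undefined>, which pushes callers to declare the expected response shape; a hypothetical call (route assumed):

const datasets = await GET<DatasetListItemType[]>('/api/core/dataset/list');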


@@ -89,7 +89,7 @@ export async function delFileById({
return true;
}
export async function getDownloadBuf({
export async function getDownloadStream({
bucketName,
fileId
}: {
@@ -98,14 +98,5 @@ export async function getDownloadBuf({
}) {
const bucket = getGridBucket(bucketName);
const stream = bucket.openDownloadStream(new Types.ObjectId(fileId));
const buf: Buffer = await new Promise((resolve, reject) => {
const buffers: Buffer[] = [];
stream.on('data', (data) => buffers.push(data));
stream.on('error', reject);
stream.on('end', () => resolve(Buffer.concat(buffers)));
});
return buf;
return bucket.openDownloadStream(new Types.ObjectId(fileId));
}
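
Callers can now pipe the GridFS stream straight to a response instead of buffering the whole file in memory; a sketch assuming an HTTP response object res:

const stream = await getDownloadStream({
  bucketName: BucketNameEnum.dataset,
  fileId
});
stream.pipe(res); // stream the file without holding it all in memory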


@@ -3,6 +3,7 @@ import { ChatRoleEnum, IMG_BLOCK_KEY } from '@fastgpt/global/core/chat/constants
import { countMessagesTokens, countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import { adaptRole_Chat2Message } from '@fastgpt/global/core/chat/adapt';
import type { ChatCompletionContentPart } from '@fastgpt/global/core/ai/type.d';
import axios from 'axios';
/* slice chat context by tokens */
export function ChatContextFilter({
@@ -81,11 +82,13 @@ export function ChatContextFilter({
}
]
*/
export function formatStr2ChatContent(str: string) {
export async function formatStr2ChatContent(str: string) {
const content: ChatCompletionContentPart[] = [];
let lastIndex = 0;
const regex = new RegExp(`\`\`\`(${IMG_BLOCK_KEY})\\n([\\s\\S]*?)\`\`\``, 'g');
const imgKey: 'image_url' = 'image_url';
let match;
while ((match = regex.exec(str)) !== null) {
@@ -115,7 +118,7 @@ export function formatStr2ChatContent(str: string) {
content.push(
...jsonLines.map((item) => ({
type: 'image_url' as any,
type: imgKey,
image_url: {
url: item.src
}
@@ -148,5 +151,18 @@ export function formatStr2ChatContent(str: string) {
if (content.length === 1 && content[0].type === 'text') {
return content[0].text;
}
if (!content) return null;
// inline images as base64 data URLs
for await (const item of content) {
if (item.type === imgKey && item[imgKey]?.url) {
const response = await axios.get(item[imgKey].url, {
responseType: 'arraybuffer'
});
const base64 = Buffer.from(response.data).toString('base64');
item[imgKey].url = `data:${response.headers['content-type']};base64,${base64}`;
}
}
return content ? content : null;
}
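
formatStr2ChatContent is now async because it may fetch images and inline them as data URLs, so existing call sites must await it:

const content = await formatStr2ChatContent(chatText); // string | content-part array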


@@ -0,0 +1,73 @@
import {
DatasetCollectionTrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constant';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
export async function createOneCollection({
name,
parentId,
datasetId,
type,
trainingType = DatasetCollectionTrainingModeEnum.manual,
chunkSize = 0,
fileId,
rawLink,
teamId,
tmbId,
metadata = {}
}: CreateDatasetCollectionParams & { teamId: string; tmbId: string }) {
const { _id } = await MongoDatasetCollection.create({
name,
teamId,
tmbId,
datasetId,
parentId: parentId || null,
type,
trainingType,
chunkSize,
fileId,
rawLink,
metadata
});
// a new folder automatically receives a default collection
if (type === DatasetCollectionTypeEnum.folder) {
await createDefaultCollection({
datasetId,
parentId: _id,
teamId,
tmbId
});
}
return _id;
}
// create default collection
export function createDefaultCollection({
name = '手动录入', // "Manually entered"
datasetId,
parentId,
teamId,
tmbId
}: {
name?: '手动录入' | '手动标注'; // "Manually entered" | "Manually annotated"
datasetId: string;
parentId?: string;
teamId: string;
tmbId: string;
}) {
return MongoDatasetCollection.create({
name,
teamId,
tmbId,
datasetId,
parentId,
type: DatasetCollectionTypeEnum.virtual,
trainingType: DatasetCollectionTrainingModeEnum.manual,
chunkSize: 0,
updateTime: new Date('2099') // far-future sentinel date
});
}
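
A hypothetical call creating a link collection (identifiers in scope are assumed; trainingType falls back to manual):

const collectionId = await createOneCollection({
  datasetId,
  teamId,
  tmbId,
  name: 'https://example.com',
  type: DatasetCollectionTypeEnum.link,
  rawLink: 'https://example.com',
  chunkSize: 512
});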


@@ -39,15 +39,16 @@ const DatasetCollectionSchema = new Schema({
ref: DatasetCollectionName,
required: true
},
name: {
type: String,
required: true
},
type: {
type: String,
enum: Object.keys(DatasetCollectionTypeMap),
required: true
},
name: {
type: String,
required: true
},
createTime: {
type: Date,
default: () => new Date()


@@ -0,0 +1,75 @@
import { MongoDatasetData } from './schema';
import { deletePgDataById } from './pg';
import { MongoDatasetTraining } from '../training/schema';
import { delFileById } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { MongoDatasetCollection } from '../collection/schema';
import { delDatasetFiles } from '../file/controller';
import { delay } from '@fastgpt/global/common/system/utils';
/* delete all data by datasetIds */
export async function delDatasetRelevantData({ datasetIds }: { datasetIds: string[] }) {
datasetIds = datasetIds.map((item) => String(item));
// delete training data (there may be in-flight training jobs)
await MongoDatasetTraining.deleteMany({
datasetId: { $in: datasetIds }
});
// delete related files
await Promise.all(datasetIds.map((id) => delDatasetFiles({ datasetId: id })));
await delay(1000);
// delete pg data
await deletePgDataById(`dataset_id IN ('${datasetIds.join("','")}')`);
// delete dataset.datas
await MongoDatasetData.deleteMany({ datasetId: { $in: datasetIds } });
// delete collections
await MongoDatasetCollection.deleteMany({
datasetId: { $in: datasetIds }
});
}
/**
* delete all data by collectionIds
*/
export async function delCollectionRelevantData({
collectionIds,
fileIds
}: {
collectionIds: string[];
fileIds: string[];
}) {
collectionIds = collectionIds.map((item) => String(item));
const filterFileIds = fileIds.filter(Boolean);
// delete training data
await MongoDatasetTraining.deleteMany({
collectionId: { $in: collectionIds }
});
// delete file
await Promise.all(
filterFileIds.map((fileId) => {
return delFileById({
bucketName: BucketNameEnum.dataset,
fileId
});
})
);
await delay(1000);
// delete pg data
await deletePgDataById(`collection_id IN ('${collectionIds.join("','")}')`);
// delete dataset.datas
await MongoDatasetData.deleteMany({ collectionId: { $in: collectionIds } });
}
/**
* delete one data by mongoDataId
*/
export async function delDatasetDataByDataId(mongoDataId: string) {
await deletePgDataById(['data_id', mongoDataId]);
await MongoDatasetData.findByIdAndDelete(mongoDataId);
}


@@ -0,0 +1,28 @@
import { PgDatasetTableName } from '@fastgpt/global/core/dataset/constant';
import { delay } from '@fastgpt/global/common/system/utils';
import { PgClient } from '../../../common/pg';
export async function deletePgDataById(
where: ['id' | 'dataset_id' | 'collection_id' | 'data_id', string] | string
) {
let retry = 2;
async function deleteData(): Promise<any> {
try {
await PgClient.delete(PgDatasetTableName, {
where: [where]
});
} catch (error) {
if (--retry < 0) {
return Promise.reject(error);
}
await delay(500);
return deleteData();
}
}
await deleteData();
return {
tokenLen: 0
};
}
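
Both call forms from the controllers above, for reference; the helper retries twice with a 500 ms delay before rejecting:

await deletePgDataById(['data_id', mongoDataId]); // tuple form for single-row deletes
await deletePgDataById(`collection_id IN ('${collectionIds.join("','")}')`); // raw WHERE string for bulk deletes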


@@ -79,6 +79,9 @@ const DatasetDataSchema = new Schema({
chunkIndex: {
type: Number,
default: 0
},
inited: {
type: Boolean
}
});
@@ -88,7 +91,7 @@ try {
DatasetDataSchema.index({ collectionId: 1 });
// full text index
DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
DatasetDataSchema.index({ fullTextToken: 1 });
DatasetDataSchema.index({ inited: 1 });
} catch (error) {
console.log(error);
}


@@ -1,7 +1,11 @@
import { connectionMongo, type Model } from '../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { DatasetTypeMap } from '@fastgpt/global/core/dataset/constant';
import {
DatasetStatusEnum,
DatasetStatusMap,
DatasetTypeMap
} from '@fastgpt/global/core/dataset/constant';
import {
TeamCollectionName,
TeamMemberCollectionName
@@ -31,9 +35,16 @@ const DatasetSchema = new Schema({
ref: TeamMemberCollectionName,
required: true
},
updateTime: {
type: Date,
default: () => new Date()
type: {
type: String,
enum: Object.keys(DatasetTypeMap),
required: true,
default: 'dataset'
},
status: {
type: String,
enum: Object.keys(DatasetStatusMap),
default: DatasetStatusEnum.active
},
avatar: {
type: String,
@@ -43,6 +54,10 @@ const DatasetSchema = new Schema({
type: String,
required: true
},
updateTime: {
type: Date,
default: () => new Date()
},
vectorModel: {
type: String,
required: true,
@@ -53,24 +68,26 @@ const DatasetSchema = new Schema({
required: true,
default: 'gpt-3.5-turbo-16k'
},
type: {
intro: {
type: String,
enum: Object.keys(DatasetTypeMap),
required: true,
default: 'dataset'
},
tags: {
type: [String],
default: [],
set(val: string | string[]) {
if (Array.isArray(val)) return val;
return val.split(' ').filter((item) => item);
}
default: ''
},
permission: {
type: String,
enum: Object.keys(PermissionTypeMap),
default: PermissionTypeEnum.private
},
websiteConfig: {
type: {
url: {
type: String,
required: true
},
selector: {
type: String,
default: 'body'
}
}
}
});
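
A sketch of the new document shape (the exported model name MongoDataset and the remaining required fields are assumptions):

await MongoDataset.create({
  teamId,
  tmbId,
  name: 'Docs site',
  type: DatasetTypeEnum.websiteDataset,
  status: DatasetStatusEnum.syncing,
  websiteConfig: {
    url: 'https://example.com',
    selector: 'article' // defaults to 'body'
  }
});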