4.6.3-website dataset (#532)

Archer
2023-12-03 20:45:57 +08:00
committed by GitHub
parent b916183848
commit a9ae270335
122 changed files with 3793 additions and 1360 deletions

packages/global/common/file/api.d.ts

@@ -0,0 +1,8 @@
export type UrlFetchParams = {
urlList: string[];
selector?: string;
};
export type UrlFetchResponse = {
url: string;
content: string;
}[];


@@ -1,3 +1,8 @@
import axios from 'axios';
import { UrlFetchParams, UrlFetchResponse } from './api.d';
import { htmlToMarkdown } from '../string/markdown';
import * as cheerio from 'cheerio';
export const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
@@ -7,3 +12,84 @@ export const formatFileSize = (bytes: number): string => {
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
};
export const cheerioToHtml = ({
fetchUrl,
$,
selector
}: {
fetchUrl: string;
$: cheerio.CheerioAPI;
selector?: string;
}) => {
// get origin url
const originUrl = new URL(fetchUrl).origin;
// remove i and script elements
$('i,script').remove();
// remove empty anchor elements
$('a')
.filter((i, el) => {
return $(el).text().trim() === '' && $(el).children().length === 0;
})
.remove();
// prefix root-relative link and image URLs with the origin url
$('a').each((i, el) => {
const href = $(el).attr('href');
if (href && href.startsWith('/')) {
$(el).attr('href', originUrl + href);
}
});
$('img').each((i, el) => {
const src = $(el).attr('src');
if (src && src.startsWith('/')) {
$(el).attr('src', originUrl + src);
}
});
return $(selector || 'body').html();
};
export const urlsFetch = async ({
urlList,
selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
urlList = urlList.filter((url) => /^(http|https):\/\/[^ "]+$/.test(url));
const response = (
await Promise.all(
urlList.map(async (url) => {
try {
const fetchRes = await axios.get(url, {
timeout: 30000
});
const $ = cheerio.load(fetchRes.data);
const md = htmlToMarkdown(
cheerioToHtml({
fetchUrl: url,
$,
selector
})
);
return {
url,
content: md
};
} catch (error) {
console.log(error, 'fetch error');
return {
url,
content: ''
};
}
})
)
).filter((item) => item.content);
return response;
};
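
A minimal usage sketch for urlsFetch (the import path and async calling context are assumed; invalid URLs are filtered out up front, and failed fetches yield empty content that is dropped from the result):

import { urlsFetch } from './tools'; // assumed path

const pages = await urlsFetch({
  urlList: ['https://example.com/docs', 'not-a-url'], // the second entry fails the URL filter
  selector: 'article' // optional; cheerioToHtml falls back to 'body'
});
// pages: [{ url: 'https://example.com/docs', content: '# Docs ...' }]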


@@ -1,4 +0,0 @@
export type FetchResultItem = {
url: string;
content: string;
};

File diff suppressed because it is too large


@@ -0,0 +1,97 @@
import { simpleText } from './tools';
import { NodeHtmlMarkdown } from 'node-html-markdown';
/* Delete redundant text in markdown */
export const simpleMarkdownText = (rawText: string) => {
rawText = simpleText(rawText);
// Remove line breaks inside hyperlink or image link text
rawText = rawText.replace(/\[([^\]]+)\]\((.+?)\)/g, (match, linkText, url) => {
const cleanedLinkText = linkText.replace(/\n/g, ' ').trim();
if (!url) {
return '';
}
return `[${cleanedLinkText}](${url})`;
});
// unescape characters that were needlessly backslash-escaped, e.g. \. \* \(
const reg1 = /\\([-.!`_(){}\[\]])/g;
if (reg1.test(rawText)) {
rawText = rawText.replace(/\\([`!*()+\-_\[\]{}\\.])/g, '$1');
}
}
// collapse escaped "\\n" sequences into "\n"
rawText = rawText.replace(/\\\\n/g, '\\n');
// Remove leading spaces before headings and code fences
['####', '###', '##', '#', '```', '~~~'].forEach((item) => {
const reg = new RegExp(`\\n\\s*${item}`, 'g');
if (reg.test(rawText)) {
rawText = rawText.replace(new RegExp(`\\n\\s*(${item})`, 'g'), '\n$1');
}
});
return rawText.trim();
};
/* html string to markdown */
export const htmlToMarkdown = (html?: string | null) => {
if (!html) return '';
const surround = (source: string, surroundStr: string) => `${surroundStr}${source}${surroundStr}`;
const nhm = new NodeHtmlMarkdown(
{
codeFence: '```',
codeBlockStyle: 'fenced',
ignore: ['i', 'script']
},
{
code: ({ node, parent, options: { codeFence, codeBlockStyle }, visitor }) => {
const isCodeBlock = ['PRE', 'WRAPPED-PRE'].includes(parent?.tagName!);
if (!isCodeBlock) {
return {
spaceIfRepeatingChar: true,
noEscape: true,
postprocess: ({ content }) => {
// Find longest occurring sequence of running backticks and add one more (so content is escaped)
const delimiter =
'`' + (content.match(/`+/g)?.sort((a, b) => b.length - a.length)?.[0] || '');
const padding = delimiter.length > 1 ? ' ' : '';
return surround(surround(content, padding), delimiter);
}
};
}
/* Handle code block */
if (codeBlockStyle === 'fenced') {
const language =
node.getAttribute('class')?.match(/language-(\S+)/)?.[1] ||
parent?.getAttribute('class')?.match(/language-(\S+)/)?.[1] ||
'';
return {
noEscape: true,
prefix: `${codeFence}${language}\n`,
postfix: `\n${codeFence}\n`,
childTranslators: visitor.instance.codeBlockTranslators
};
}
return {
noEscape: true,
postprocess: ({ content }) => content.replace(/^/gm, ' '),
childTranslators: visitor.instance.codeBlockTranslators
};
}
}
);
const markdown = nhm.translate(html).trim();
return simpleMarkdownText(markdown);
};
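
For illustration, a rough sketch of what the translator produces (exact whitespace may differ):

const md = htmlToMarkdown(
  '<h1>Title</h1><pre><code class="language-ts">const a = 1;</code></pre>'
);
// md ≈ '# Title\n\n```ts\nconst a = 1;\n```'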


@@ -15,11 +15,18 @@ export const splitText2Chunks = (props: {
}): {
chunks: string[];
tokens: number;
overlapRatio?: number;
} => {
const { text = '', chunkLen, overlapRatio = 0.2 } = props;
let { text = '', chunkLen, overlapRatio = 0.2 } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio);
// replace every \n inside code blocks with codeBlockMarker so splitting cannot cut through them
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
return match.replace(/\n/g, codeBlockMarker);
});
// The larger maxLen is, the less likely the next sentence is to trigger a split
const stepReges: { reg: RegExp; maxLen: number }[] = [
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
@@ -27,8 +34,8 @@ export const splitText2Chunks = (props: {
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /([\n]{2})/g, maxLen: chunkLen * 1.4 },
{ reg: /([\n](?![\*\-|>`0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n](`))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?![\*\-|>0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>0-9]): markdown special chars
{ reg: /([\n])/g, maxLen: chunkLen * 1.4 },
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.4 },
@@ -38,9 +45,15 @@ export const splitText2Chunks = (props: {
{ reg: /([，]|,\s)/g, maxLen: chunkLen * 2 }
];
// when splitting on markdown titles, record each segment's title separately
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
return [text];
return [
{
text,
title: ''
}
];
}
const isMarkdownSplit = step <= 3;
const { reg } = stepReges[step];
@@ -49,7 +62,17 @@ export const splitText2Chunks = (props: {
.replace(reg, isMarkdownSplit ? `${splitMarker}$1` : `$1${splitMarker}`)
.split(`${splitMarker}`)
.filter((part) => part.trim());
return splitTexts;
return splitTexts
.map((text) => {
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
return {
text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
title: matchTitle
};
})
.filter((item) => item.text.trim());
};
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
@@ -63,7 +86,7 @@ export const splitText2Chunks = (props: {
let overlayText = '';
for (let i = splitTexts.length - 1; i >= 0; i--) {
const currentText = splitTexts[i];
const currentText = splitTexts[i].text;
const newText = currentText + overlayText;
const newTextLen = newText.length;
@@ -83,12 +106,16 @@ export const splitText2Chunks = (props: {
const splitTextRecursively = ({
text = '',
step,
lastText
lastText,
mdTitle = ''
}: {
text: string;
step: number;
lastText: string;
mdTitle: string;
}): string[] => {
const isMarkdownSplit = step <= 3;
// text already fits within a single chunk
if (text.length <= chunkLen) {
return [text];
@@ -102,7 +129,7 @@ export const splitText2Chunks = (props: {
// fall back to fixed-size slicing with overlap
const chunks: string[] = [];
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
chunks.push(text.slice(i, i + chunkLen));
chunks.push(`${mdTitle}${text.slice(i, i + chunkLen)}`);
}
return chunks;
}
@@ -115,7 +142,10 @@ export const splitText2Chunks = (props: {
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const currentText = splitTexts[i];
const item = splitTexts[i];
const currentTitle = `${mdTitle}${item.title}`;
const currentText = item.text;
const currentTextLen = currentText.length;
const lastTextLen = lastText.length;
const newText = lastText + currentText;
@@ -125,9 +155,10 @@ export const splitText2Chunks = (props: {
if (newTextLen > maxLen) {
// lastText exceeds minChunkLen: push it directly to chunks rather than carrying it into the next chunk
if (lastTextLen > minChunkLen) {
chunks.push(lastText);
chunks.push(`${currentTitle}${lastText}`);
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
i--;
continue;
}
@@ -135,11 +166,12 @@ export const splitText2Chunks = (props: {
const innerChunks = splitTextRecursively({
text: newText,
step: step + 1,
lastText: ''
lastText: '',
mdTitle: currentTitle
});
const lastChunk = innerChunks[innerChunks.length - 1];
// the last inner chunk is too small: carry it over as lastText
if (lastChunk.length < minChunkLen) {
if (!isMarkdownSplit && lastChunk.length < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1));
lastText = lastChunk;
} else {
@@ -156,10 +188,11 @@ export const splitText2Chunks = (props: {
// size is under chunkLen: merge text into lastText (newText is guaranteed under maxLen here)
lastText = newText;
// If the chunk size reaches, add a chunk
if (newTextLen >= chunkLen) {
chunks.push(lastText);
lastText = getOneTextOverlapText({ text: lastText, step });
// markdown heading blocks are pushed directly; otherwise a chunk is pushed once chunkLen is reached
if (isMarkdownSplit || newTextLen >= chunkLen) {
chunks.push(`${currentTitle}${lastText}`);
lastText = isMarkdownSplit ? '' : getOneTextOverlapText({ text: lastText, step });
}
}
@@ -168,7 +201,7 @@ export const splitText2Chunks = (props: {
if (lastText.length < chunkLen * 0.4) {
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
} else {
chunks.push(lastText);
chunks.push(`${mdTitle}${lastText}`);
}
}
@@ -179,8 +212,9 @@ export const splitText2Chunks = (props: {
const chunks = splitTextRecursively({
text,
step: 0,
lastText: ''
});
lastText: '',
mdTitle: ''
}).map((chunk) => chunk.replaceAll(codeBlockMarker, '\n')); // restore code block
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
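
The codeBlockMarker round trip is the easiest part to see in isolation; a small self-contained sketch of the same technique:

const marker = 'CODE_BLOCK_LINE_MARKER';
let sample = 'intro\n\n```ts\nconst a = 1;\n```\n\noutro';
// hide newlines inside fenced blocks so the \n-based split rules cannot cut through them
sample = sample.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, (m) => m.replace(/\n/g, marker));
// ...splitting runs on `sample`...
const restored = sample.replaceAll(marker, '\n'); // the real code restores this per chunk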


@@ -0,0 +1,6 @@
export const delay = (ms: number) =>
new Promise((resolve) => {
setTimeout(() => {
resolve('');
}, ms);
});


@@ -5,7 +5,6 @@ export enum ChatRoleEnum {
Function = 'Function',
Tool = 'Tool'
}
export const ChatRoleMap = {
[ChatRoleEnum.System]: {
name: '系统提示词' // "System prompt"
@@ -30,7 +29,6 @@ export enum ChatSourceEnum {
share = 'share',
api = 'api'
}
export const ChatSourceMap = {
[ChatSourceEnum.test]: {
name: 'chat.logs.test'


@@ -1,8 +1,32 @@
import { DatasetDataIndexItemType } from './type';
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { DatasetCollectionTrainingModeEnum, DatasetCollectionTypeEnum } from './constant';
import type { LLMModelItemType } from '../ai/model.d';
/* ================= dataset ===================== */
export type DatasetUpdateBody = {
id: string;
parentId?: string;
tags?: string[];
name?: string;
avatar?: string;
permission?: DatasetSchemaType['permission'];
agentModel?: LLMModelItemType;
websiteConfig?: DatasetSchemaType['websiteConfig'];
status?: DatasetSchemaType['status'];
};
/* ================= collection ===================== */
export type CreateDatasetCollectionParams = {
datasetId: string;
parentId?: string;
name: string;
type: `${DatasetCollectionTypeEnum}`;
trainingType?: `${DatasetCollectionTrainingModeEnum}`;
chunkSize?: number;
fileId?: string;
rawLink?: string;
metadata?: Record<string, any>;
};
/* ================= data ===================== */
export type PgSearchRawType = {
@@ -18,3 +42,8 @@ export type PushDatasetDataChunkProps = {
a?: string; // bonus content
indexes?: Omit<DatasetDataIndexItemType, 'dataId'>[];
};
export type PostWebsiteSyncParams = {
datasetId: string;
billId: string;
};


@@ -3,15 +3,37 @@ export const PgDatasetTableName = 'modeldata';
/* ------------ dataset -------------- */
export enum DatasetTypeEnum {
folder = 'folder',
dataset = 'dataset'
dataset = 'dataset',
websiteDataset = 'websiteDataset' // deep link
}
export const DatasetTypeMap = {
[DatasetTypeEnum.folder]: {
name: 'folder'
icon: 'core/dataset/folderDataset',
label: 'core.dataset.Folder Dataset',
collectionLabel: 'common.Folder'
},
[DatasetTypeEnum.dataset]: {
name: 'dataset'
icon: 'core/dataset/commonDataset',
label: 'core.dataset.Common Dataset',
collectionLabel: 'common.File'
},
[DatasetTypeEnum.websiteDataset]: {
icon: 'core/dataset/websiteDataset',
label: 'core.dataset.Website Dataset',
collectionLabel: 'common.Website'
}
};
export enum DatasetStatusEnum {
active = 'active',
syncing = 'syncing'
}
export const DatasetStatusMap = {
[DatasetStatusEnum.active]: {
label: 'core.dataset.status.active'
},
[DatasetStatusEnum.syncing]: {
label: 'core.dataset.status.syncing'
}
};
@@ -19,7 +41,7 @@ export const DatasetTypeMap = {
export enum DatasetCollectionTypeEnum {
folder = 'folder',
file = 'file',
link = 'link',
link = 'link', // one link
virtual = 'virtual'
}
export const DatasetCollectionTypeMap = {


@@ -4,6 +4,7 @@ import { PushDatasetDataChunkProps } from './api';
import {
DatasetCollectionTypeEnum,
DatasetDataIndexTypeEnum,
DatasetStatusEnum,
DatasetTypeEnum,
TrainingModeEnum
} from './constant';
@@ -20,9 +21,14 @@ export type DatasetSchemaType = {
name: string;
vectorModel: string;
agentModel: string;
tags: string[];
intro: string;
type: `${DatasetTypeEnum}`;
status: `${DatasetStatusEnum}`;
permission: `${PermissionTypeEnum}`;
websiteConfig?: {
url: string;
selector: string;
};
};
export type DatasetCollectionSchemaType = {
@@ -39,6 +45,7 @@ export type DatasetCollectionSchemaType = {
chunkSize: number;
fileId?: string;
rawLink?: string;
metadata?: Record<string, any>;
};
export type DatasetDataIndexItemType = {
@@ -91,6 +98,18 @@ export type DatasetDataWithCollectionType = Omit<DatasetDataSchemaType, 'collect
};
/* ================= dataset ===================== */
export type DatasetListItemType = {
_id: string;
parentId: string;
avatar: string;
name: string;
intro: string;
type: `${DatasetTypeEnum}`;
isOwner: boolean;
canWrite: boolean;
permission: `${PermissionTypeEnum}`;
vectorModel: VectorModelItemType;
};
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
vectorModel: VectorModelItemType;
agentModel: LLMModelItemType;


@@ -3,13 +3,16 @@
"version": "1.0.0",
"dependencies": {
"axios": "^1.5.1",
"timezones-list": "^3.0.2",
"cheerio": "1.0.0-rc.12",
"dayjs": "^1.11.7",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"node-html-markdown": "^1.3.0",
"openai": "^4.16.1",
"js-tiktoken": "^1.0.7"
"timezones-list": "^3.0.2"
},
"devDependencies": {
"@types/node": "^20.8.5"
"@types/node": "^20.8.5",
"@types/turndown": "^5.0.4"
}
}


@@ -1,6 +1,7 @@
/* bill common */
import { PRICE_SCALE } from './constants';
import { BillItemType, BillSchema } from './type';
import { BillSourceEnum } from './constants';
import { AuthUserTypeEnum } from '../../permission/constant';
/**
* dataset price / PRICE_SCALE = real price
@@ -8,3 +9,15 @@ import { BillItemType, BillSchema } from './type';
export const formatPrice = (val = 0, multiple = 1) => {
return Number(((val / PRICE_SCALE) * multiple).toFixed(10));
};
export const getBillSourceByAuthType = ({
shareId,
authType
}: {
shareId?: string;
authType?: `${AuthUserTypeEnum}`;
}) => {
if (shareId) return BillSourceEnum.shareLink;
if (authType === AuthUserTypeEnum.apikey) return BillSourceEnum.api;
return BillSourceEnum.fastgpt;
};
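
The mapping is straightforward; for reference:

getBillSourceByAuthType({ shareId: 'abc' }); // BillSourceEnum.shareLink
getBillSourceByAuthType({ authType: AuthUserTypeEnum.apikey }); // BillSourceEnum.api
getBillSourceByAuthType({}); // BillSourceEnum.fastgpt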


@@ -101,18 +101,18 @@ export function request(url: string, data: any, config: ConfigType, method: Meth
* @param {Object} config
* @returns
*/
export function GET<T>(url: string, params = {}, config: ConfigType = {}): Promise<T> {
export function GET<T = undefined>(url: string, params = {}, config: ConfigType = {}): Promise<T> {
return request(url, params, config, 'GET');
}
export function POST<T>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
export function POST<T = undefined>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
return request(url, data, config, 'POST');
}
export function PUT<T>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
export function PUT<T = undefined>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
return request(url, data, config, 'PUT');
}
export function DELETE<T>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
export function DELETE<T = undefined>(url: string, data = {}, config: ConfigType = {}): Promise<T> {
return request(url, data, config, 'DELETE');
}
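
With T defaulting to undefined, an unannotated call now types its result as Promise<undefined>, which pushes callers to declare the expected response shape; a hypothetical call (route assumed):

const datasets = await GET<DatasetListItemType[]>('/api/core/dataset/list');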


@@ -89,7 +89,7 @@ export async function delFileById({
return true;
}
export async function getDownloadBuf({
export async function getDownloadStream({
bucketName,
fileId
}: {
@@ -98,14 +98,5 @@ export async function getDownloadBuf({
}) {
const bucket = getGridBucket(bucketName);
const stream = bucket.openDownloadStream(new Types.ObjectId(fileId));
const buf: Buffer = await new Promise((resolve, reject) => {
const buffers: Buffer[] = [];
stream.on('data', (data) => buffers.push(data));
stream.on('error', reject);
stream.on('end', () => resolve(Buffer.concat(buffers)));
});
return buf;
return bucket.openDownloadStream(new Types.ObjectId(fileId));
}
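
Callers can now pipe the GridFS stream straight to a response instead of buffering the whole file in memory; a sketch assuming an HTTP response object res:

const stream = await getDownloadStream({
  bucketName: BucketNameEnum.dataset,
  fileId
});
stream.pipe(res); // stream the file without holding it all in memory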


@@ -3,6 +3,7 @@ import { ChatRoleEnum, IMG_BLOCK_KEY } from '@fastgpt/global/core/chat/constants
import { countMessagesTokens, countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import { adaptRole_Chat2Message } from '@fastgpt/global/core/chat/adapt';
import type { ChatCompletionContentPart } from '@fastgpt/global/core/ai/type.d';
import axios from 'axios';
/* slice chat context by tokens */
export function ChatContextFilter({
@@ -81,11 +82,13 @@ export function ChatContextFilter({
}
]
*/
export function formatStr2ChatContent(str: string) {
export async function formatStr2ChatContent(str: string) {
const content: ChatCompletionContentPart[] = [];
let lastIndex = 0;
const regex = new RegExp(`\`\`\`(${IMG_BLOCK_KEY})\\n([\\s\\S]*?)\`\`\``, 'g');
const imgKey: 'image_url' = 'image_url';
let match;
while ((match = regex.exec(str)) !== null) {
@@ -115,7 +118,7 @@ export function formatStr2ChatContent(str: string) {
content.push(
...jsonLines.map((item) => ({
type: 'image_url' as any,
type: imgKey,
image_url: {
url: item.src
}
@@ -148,5 +151,18 @@ export function formatStr2ChatContent(str: string) {
if (content.length === 1 && content[0].type === 'text') {
return content[0].text;
}
if (!content) return null;
// inline images as base64 data URLs
for await (const item of content) {
if (item.type === imgKey && item[imgKey]?.url) {
const response = await axios.get(item[imgKey].url, {
responseType: 'arraybuffer'
});
const base64 = Buffer.from(response.data).toString('base64');
item[imgKey].url = `data:${response.headers['content-type']};base64,${base64}`;
}
}
return content ? content : null;
}
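
formatStr2ChatContent is now async because it may fetch images and inline them as data URLs, so existing call sites must await it:

const content = await formatStr2ChatContent(chatText); // string | content-part array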


@@ -0,0 +1,73 @@
import {
DatasetCollectionTrainingModeEnum,
DatasetCollectionTypeEnum
} from '@fastgpt/global/core/dataset/constant';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
export async function createOneCollection({
name,
parentId,
datasetId,
type,
trainingType = DatasetCollectionTrainingModeEnum.manual,
chunkSize = 0,
fileId,
rawLink,
teamId,
tmbId,
metadata = {}
}: CreateDatasetCollectionParams & { teamId: string; tmbId: string }) {
const { _id } = await MongoDatasetCollection.create({
name,
teamId,
tmbId,
datasetId,
parentId: parentId || null,
type,
trainingType,
chunkSize,
fileId,
rawLink,
metadata
});
// a new folder automatically receives a default collection
if (type === DatasetCollectionTypeEnum.folder) {
await createDefaultCollection({
datasetId,
parentId: _id,
teamId,
tmbId
});
}
return _id;
}
// create default collection
export function createDefaultCollection({
name = '手动录入', // "Manually entered"
datasetId,
parentId,
teamId,
tmbId
}: {
name?: '手动录入' | '手动标注'; // "Manually entered" | "Manually annotated"
datasetId: string;
parentId?: string;
teamId: string;
tmbId: string;
}) {
return MongoDatasetCollection.create({
name,
teamId,
tmbId,
datasetId,
parentId,
type: DatasetCollectionTypeEnum.virtual,
trainingType: DatasetCollectionTrainingModeEnum.manual,
chunkSize: 0,
updateTime: new Date('2099') // far-future sentinel date
});
}
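
A hypothetical call creating a link collection (identifiers in scope are assumed; trainingType falls back to manual):

const collectionId = await createOneCollection({
  datasetId,
  teamId,
  tmbId,
  name: 'https://example.com',
  type: DatasetCollectionTypeEnum.link,
  rawLink: 'https://example.com',
  chunkSize: 512
});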


@@ -39,15 +39,16 @@ const DatasetCollectionSchema = new Schema({
ref: DatasetCollectionName,
required: true
},
name: {
type: String,
required: true
},
type: {
type: String,
enum: Object.keys(DatasetCollectionTypeMap),
required: true
},
name: {
type: String,
required: true
},
createTime: {
type: Date,
default: () => new Date()


@@ -0,0 +1,75 @@
import { MongoDatasetData } from './schema';
import { deletePgDataById } from './pg';
import { MongoDatasetTraining } from '../training/schema';
import { delFileById } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { MongoDatasetCollection } from '../collection/schema';
import { delDatasetFiles } from '../file/controller';
import { delay } from '@fastgpt/global/common/system/utils';
/* delete all data by datasetIds */
export async function delDatasetRelevantData({ datasetIds }: { datasetIds: string[] }) {
datasetIds = datasetIds.map((item) => String(item));
// delete training data (there may be in-flight training jobs)
await MongoDatasetTraining.deleteMany({
datasetId: { $in: datasetIds }
});
// delete related files
await Promise.all(datasetIds.map((id) => delDatasetFiles({ datasetId: id })));
await delay(1000);
// delete pg data
await deletePgDataById(`dataset_id IN ('${datasetIds.join("','")}')`);
// delete dataset.datas
await MongoDatasetData.deleteMany({ datasetId: { $in: datasetIds } });
// delete collections
await MongoDatasetCollection.deleteMany({
datasetId: { $in: datasetIds }
});
}
/**
* delete all data by collectionIds
*/
export async function delCollectionRelevantData({
collectionIds,
fileIds
}: {
collectionIds: string[];
fileIds: string[];
}) {
collectionIds = collectionIds.map((item) => String(item));
const filterFileIds = fileIds.filter(Boolean);
// delete training data
await MongoDatasetTraining.deleteMany({
collectionId: { $in: collectionIds }
});
// delete file
await Promise.all(
filterFileIds.map((fileId) => {
return delFileById({
bucketName: BucketNameEnum.dataset,
fileId
});
})
);
await delay(1000);
// delete pg data
await deletePgDataById(`collection_id IN ('${collectionIds.join("','")}')`);
// delete dataset.datas
await MongoDatasetData.deleteMany({ collectionId: { $in: collectionIds } });
}
/**
* delete one data by mongoDataId
*/
export async function delDatasetDataByDataId(mongoDataId: string) {
await deletePgDataById(['data_id', mongoDataId]);
await MongoDatasetData.findByIdAndDelete(mongoDataId);
}


@@ -0,0 +1,28 @@
import { PgDatasetTableName } from '@fastgpt/global/core/dataset/constant';
import { delay } from '@fastgpt/global/common/system/utils';
import { PgClient } from '../../../common/pg';
export async function deletePgDataById(
where: ['id' | 'dataset_id' | 'collection_id' | 'data_id', string] | string
) {
let retry = 2;
async function deleteData(): Promise<any> {
try {
await PgClient.delete(PgDatasetTableName, {
where: [where]
});
} catch (error) {
if (--retry < 0) {
return Promise.reject(error);
}
await delay(500);
return deleteData();
}
}
await deleteData();
return {
tokenLen: 0
};
}
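
Both call forms from the controllers above, for reference; the helper retries twice with a 500 ms delay before rejecting:

await deletePgDataById(['data_id', mongoDataId]); // tuple form for single-row deletes
await deletePgDataById(`collection_id IN ('${collectionIds.join("','")}')`); // raw WHERE string for bulk deletes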


@@ -79,6 +79,9 @@ const DatasetDataSchema = new Schema({
chunkIndex: {
type: Number,
default: 0
},
inited: {
type: Boolean
}
});
@@ -88,7 +91,7 @@ try {
DatasetDataSchema.index({ collectionId: 1 });
// full text index
DatasetDataSchema.index({ datasetId: 1, fullTextToken: 'text' });
DatasetDataSchema.index({ fullTextToken: 1 });
DatasetDataSchema.index({ inited: 1 });
} catch (error) {
console.log(error);
}


@@ -1,7 +1,11 @@
import { connectionMongo, type Model } from '../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { DatasetTypeMap } from '@fastgpt/global/core/dataset/constant';
import {
DatasetStatusEnum,
DatasetStatusMap,
DatasetTypeMap
} from '@fastgpt/global/core/dataset/constant';
import {
TeamCollectionName,
TeamMemberCollectionName
@@ -31,9 +35,16 @@ const DatasetSchema = new Schema({
ref: TeamMemberCollectionName,
required: true
},
updateTime: {
type: Date,
default: () => new Date()
type: {
type: String,
enum: Object.keys(DatasetTypeMap),
required: true,
default: 'dataset'
},
status: {
type: String,
enum: Object.keys(DatasetStatusMap),
default: DatasetStatusEnum.active
},
avatar: {
type: String,
@@ -43,6 +54,10 @@ const DatasetSchema = new Schema({
type: String,
required: true
},
updateTime: {
type: Date,
default: () => new Date()
},
vectorModel: {
type: String,
required: true,
@@ -53,24 +68,26 @@ const DatasetSchema = new Schema({
required: true,
default: 'gpt-3.5-turbo-16k'
},
type: {
intro: {
type: String,
enum: Object.keys(DatasetTypeMap),
required: true,
default: 'dataset'
},
tags: {
type: [String],
default: [],
set(val: string | string[]) {
if (Array.isArray(val)) return val;
return val.split(' ').filter((item) => item);
}
default: ''
},
permission: {
type: String,
enum: Object.keys(PermissionTypeMap),
default: PermissionTypeEnum.private
},
websiteConfig: {
type: {
url: {
type: String,
required: true
},
selector: {
type: String,
default: 'body'
}
}
}
});
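
A sketch of the new document shape (the exported model name MongoDataset and the remaining required fields are assumptions):

await MongoDataset.create({
  teamId,
  tmbId,
  name: 'Docs site',
  type: DatasetTypeEnum.websiteDataset,
  status: DatasetStatusEnum.syncing,
  websiteConfig: {
    url: 'https://example.com',
    selector: 'article' // defaults to 'body'
  }
});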