4.6.4-alpha (#582)

Archer
2023-12-08 15:01:11 +08:00
committed by GitHub
parent 54d52d8d25
commit b58249fc3a
66 changed files with 962 additions and 527 deletions

View File

@@ -0,0 +1,24 @@
import { ErrType } from '../errorCode';
/* dataset: 507000 */
const startCode = 507000;
export enum CommonErrEnum {
fileNotFound = 'fileNotFound'
}
const datasetErr = [
{
statusText: CommonErrEnum.fileNotFound,
message: 'error.fileNotFound'
}
];
export default datasetErr.reduce((acc, cur, index) => {
return {
...acc,
[cur.statusText]: {
code: startCode + index,
statusText: cur.statusText,
message: cur.message,
data: null
}
};
}, {} as ErrType<`${CommonErrEnum}`>);
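
A minimal usage sketch for the map this reducer produces (the lookup below is illustrative and not part of the commit):

import commonErr, { CommonErrEnum } from './code/common';

// Each entry carries a numeric code offset from startCode, plus the
// statusText and an i18n message key.
const notFound = commonErr[CommonErrEnum.fileNotFound];
console.log(notFound.code); // 507000 (startCode + index 0)
console.log(notFound.message); // 'error.fileNotFound'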

View File

@@ -13,23 +13,23 @@ export enum DatasetErrEnum {
const datasetErr = [
{
statusText: DatasetErrEnum.unAuthDataset,
message: 'No permission to operate this dataset'
message: 'core.dataset.error.unAuthDataset'
},
{
statusText: DatasetErrEnum.unAuthDatasetCollection,
message: 'No permission to operate this collection'
message: 'core.dataset.error.unAuthDatasetCollection'
},
{
statusText: DatasetErrEnum.unAuthDatasetData,
message: 'No permission to operate this data'
message: 'core.dataset.error.unAuthDatasetData'
},
{
statusText: DatasetErrEnum.unAuthDatasetFile,
message: 'No permission to operate this file'
message: 'core.dataset.error.unAuthDatasetFile'
},
{
statusText: DatasetErrEnum.unCreateCollection,
message: 'No permission to create a collection'
message: 'core.dataset.error.unCreateCollection'
},
{
statusText: DatasetErrEnum.unLinkCollection,

View File

@@ -6,6 +6,7 @@ import pluginErr from './code/plugin';
import outLinkErr from './code/outLink';
import teamErr from './code/team';
import userErr from './code/user';
import commonErr from './code/common';
export const ERROR_CODE: { [key: number]: string } = {
400: 'Request failed',
@@ -96,5 +97,6 @@ export const ERROR_RESPONSE: Record<
...outLinkErr,
...teamErr,
...userErr,
...pluginErr
...pluginErr,
...commonErr
};
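
With commonErr spread in, the new fileNotFound entry resolves through the shared table; a small illustrative lookup (assuming the merged record shape built by the reducers above):

const item = ERROR_RESPONSE[CommonErrEnum.fileNotFound];
// -> { code: 507000, statusText: 'fileNotFound', message: 'error.fileNotFound', data: null }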

View File

@@ -1,3 +1,10 @@
export type UploadImgProps = {
base64Img: string;
expiredTime?: Date;
metadata?: Record<string, any>;
shareId?: string;
};
export type UrlFetchParams = {
urlList: string[];
selector?: string;
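
A sketch of the extended UploadImgProps, e.g. tagging an image with the GridFS file it was extracted from so it can be cleaned up later (the metadata.fileId key is inferred from delImgByFileIdList below; IDs are placeholders):

const props: UploadImgProps = {
  base64Img: 'data:image/png;base64,...',
  expiredTime: new Date(Date.now() + 7 * 24 * 60 * 60 * 1000), // optional TTL
  metadata: { fileId: '<fileId>' } // enables cleanup via delImgByFileIdList
};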

View File

@@ -49,7 +49,14 @@ export const cheerioToHtml = ({
}
});
return $(selector || 'body').html();
const html = $(selector || 'body')
.map((item, dom) => {
return $(dom).html();
})
.get()
.join('\n');
return html;
};
export const urlsFetch = async ({
urlList,
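
The old implementation returned only the first element matched by the selector; .map().get().join('\n') keeps every match. A standalone cheerio sketch of the pattern (sample HTML is illustrative):

import * as cheerio from 'cheerio';

const $ = cheerio.load('<div class="a">1</div><div class="a">2</div>');
// Collect the inner HTML of every matched node, not just the first.
const html = $('.a')
  .map((i, dom) => $(dom).html())
  .get()
  .join('\n');
// html === '1\n2'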

View File

@@ -26,10 +26,14 @@ export const simpleMarkdownText = (rawText: string) => {
rawText = rawText.replace(/\\\\n/g, '\\n');
// Remove headings and code blocks front spaces
['####', '###', '##', '#', '```', '~~~'].forEach((item) => {
['####', '###', '##', '#', '```', '~~~'].forEach((item, i) => {
const isMarkdown = i <= 3;
const reg = new RegExp(`\\n\\s*${item}`, 'g');
if (reg.test(rawText)) {
rawText = rawText.replace(new RegExp(`\\n\\s*(${item})`, 'g'), '\n$1');
rawText = rawText.replace(
new RegExp(`(\\n)\\s*(${item})`, 'g'),
isMarkdown ? '\n$1$2' : '$1$2'
);
}
});
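
To illustrate the new branch (assuming the loop above): markdown headings keep a blank line via the captured $1, while code fences only lose their leading indent:

const input = 'para\n   ## Heading\n   ```\ncode\n   ```';
// After the loop:
// 'para\n\n## Heading\n```\ncode\n```'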

View File

@@ -12,12 +12,13 @@ export const splitText2Chunks = (props: {
text: string;
chunkLen: number;
overlapRatio?: number;
customReg?: string[];
}): {
chunks: string[];
tokens: number;
overlapRatio?: number;
} => {
let { text = '', chunkLen, overlapRatio = 0.2 } = props;
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio);
@@ -29,22 +30,29 @@ export const splitText2Chunks = (props: {
// The larger maxLen is, the less likely the next sentence is to trigger a split
const stepReges: { reg: RegExp; maxLen: number }[] = [
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /([\n](`))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?![\*\-|>0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n])/g, maxLen: chunkLen * 1.4 },
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([！]|!\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([？]|\?\s)/g, maxLen: chunkLen * 1.6 },
{ reg: /([；]|;\s)/g, maxLen: chunkLen * 1.8 },
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([！]|!\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([？]|\?\s)/g, maxLen: chunkLen * 1.4 },
{ reg: /([；]|;\s)/g, maxLen: chunkLen * 1.6 },
{ reg: /([，]|,\s)/g, maxLen: chunkLen * 2 }
];
const customRegLen = customReg.length;
const checkIsCustomStep = (step: number) => step < customRegLen;
const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen;
const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
// If using markdown title split, record the title separately
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
@@ -55,11 +63,13 @@ export const splitText2Chunks = (props: {
}
];
}
const isMarkdownSplit = step <= 3;
const isMarkdownSplit = checkIsMarkdownSplit(step);
const independentChunk = checkIndependentChunk(step);
const { reg } = stepReges[step];
const splitTexts = text
.replace(reg, isMarkdownSplit ? `${splitMarker}$1` : `$1${splitMarker}`)
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
.split(`${splitMarker}`)
.filter((part) => part.trim());
@@ -76,7 +86,7 @@ export const splitText2Chunks = (props: {
};
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = step <= 6;
const forbidOverlap = checkForbidOverlap(step);
const maxOverlapLen = chunkLen * 0.4;
// step >= stepReges.length: Do not overlap incomplete sentences
@@ -114,7 +124,8 @@ export const splitText2Chunks = (props: {
lastText: string;
mdTitle: string;
}): string[] => {
const isMarkdownSplit = step <= 3;
const independentChunk = checkIndependentChunk(step);
const isCustomStep = checkIsCustomStep(step);
// mini text
if (text.length <= chunkLen) {
@@ -134,12 +145,13 @@ export const splitText2Chunks = (props: {
return chunks;
}
const { maxLen } = stepReges[step];
const minChunkLen = chunkLen * 0.7;
// split text by special char
const splitTexts = getSplitTexts({ text, step });
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
const minChunkLen = chunkLen * 0.7;
const miniChunkLen = 30;
const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) {
const item = splitTexts[i];
@@ -170,8 +182,8 @@ export const splitText2Chunks = (props: {
mdTitle: currentTitle
});
const lastChunk = innerChunks[innerChunks.length - 1];
// last chunk is too small, concat it to lastText
if (!isMarkdownSplit && lastChunk.length < minChunkLen) {
// last chunk is too small, concat it to lastText (the start of the next chunk)
if (!independentChunk && lastChunk.length < minChunkLen) {
chunks.push(...innerChunks.slice(0, -1));
lastText = lastChunk;
} else {
@@ -189,10 +201,14 @@ export const splitText2Chunks = (props: {
lastText = newText;
// markdown paragraph block: add directly; if the chunk size is reached, push a chunk
if (isMarkdownSplit || newTextLen >= chunkLen) {
if (
isCustomStep ||
(independentChunk && newTextLen > miniChunkLen) ||
newTextLen >= chunkLen
) {
chunks.push(`${currentTitle}${lastText}`);
lastText = isMarkdownSplit ? '' : getOneTextOverlapText({ text: lastText, step });
lastText = getOneTextOverlapText({ text: lastText, step });
}
}
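
A hedged usage sketch of the new customReg option (input is illustrative; note the pattern is interpolated into a character class, so single-character markers behave most predictably):

const doc = 'intro§section one§section two';
const { chunks, tokens } = splitText2Chunks({
  text: doc,
  chunkLen: 512,
  customReg: ['§'] // tried first, before the markdown-title and newline steps
});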

View File

@@ -24,7 +24,7 @@ export const getDefaultAppForm = (templateId = 'fastgpt-universal'): AppSimpleEd
dataset: {
datasets: [],
similarity: 0.4,
limit: 5,
limit: 1500,
searchEmptyText: '',
searchMode: DatasetSearchModeEnum.embedding
},
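
Note: the default limit jumps from 5 to 1500 because it is now measured in tokens rather than in record count, matching the DatasetSearchModule template change below.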

View File

@@ -55,3 +55,5 @@ export const LOGO_ICON = `/icon/logo.svg`;
export const IMG_BLOCK_KEY = 'img-block';
export const FILE_BLOCK_KEY = 'file-block';
export const MARKDOWN_QUOTE_SIGN = 'QUOTE SIGN';

View File

@@ -54,17 +54,10 @@ export const DatasetSearchModule: FlowModuleTemplateType = {
{
key: ModuleInputKeyEnum.datasetLimit,
type: FlowNodeInputTypeEnum.hidden,
label: 'Single search limit',
description: 'Take at most n records as references for this question',
value: 5,
label: 'Reference limit',
description: 'Maximum number of tokens for a single search. Chinese: ~1 character = 1.7 tokens; English: ~1 character = 1 token',
value: 1500,
valueType: ModuleDataTypeEnum.number,
min: 1,
max: 20,
step: 1,
markList: [
{ label: '1', value: 1 },
{ label: '20', value: 20 }
],
showTargetInApp: false,
showTargetInPlugin: false
},

View File

@@ -3,6 +3,7 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { delImgByFileIdList } from '../image/controller';
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
return connectionMongo.connection.db.collection(`${bucket}.files`);
@@ -69,24 +70,65 @@ export async function getFileById({
_id: new Types.ObjectId(fileId)
});
if (!file) {
return Promise.reject('File not found');
}
// if (!file) {
// return Promise.reject('File not found');
// }
return file;
return file || undefined;
}
export async function delFileById({
export async function delFileByFileIdList({
bucketName,
fileId
fileIdList,
retry = 3
}: {
bucketName: `${BucketNameEnum}`;
fileId: string;
fileIdList: string[];
retry?: number;
}): Promise<any> {
try {
const bucket = getGridBucket(bucketName);
await Promise.all(fileIdList.map((id) => bucket.delete(new Types.ObjectId(id))));
} catch (error) {
if (retry > 0) {
return delFileByFileIdList({ bucketName, fileIdList, retry: retry - 1 });
}
}
}
// delete files by metadata (datasetId)
export async function delFileByMetadata({
bucketName,
datasetId
}: {
bucketName: `${BucketNameEnum}`;
datasetId?: string;
}) {
const bucket = getGridBucket(bucketName);
await bucket.delete(new Types.ObjectId(fileId));
return true;
const files = await bucket
.find(
{
...(datasetId && { 'metadata.datasetId': datasetId })
},
{
projection: {
_id: 1
}
}
)
.toArray();
const idList = files.map((item) => String(item._id));
// delete img
await delImgByFileIdList(idList);
// delete file
await delFileByFileIdList({
bucketName,
fileIdList: idList
});
}
export async function getDownloadStream({

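A hedged sketch of the two new deletion entry points (IDs are placeholders):

// Remove every dataset file, and the images extracted from them:
await delFileByMetadata({
  bucketName: BucketNameEnum.dataset,
  datasetId: '<datasetId>'
});

// Or remove a known list of files, with up to 3 retries on failure:
await delFileByFileIdList({
  bucketName: BucketNameEnum.dataset,
  fileIdList: ['<fileId1>', '<fileId2>']
});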
View File

@@ -1,3 +1,4 @@
import { UploadImgProps } from '@fastgpt/global/common/file/api';
import { imageBaseUrl } from './constant';
import { MongoImage } from './schema';
@@ -9,11 +10,10 @@ export const maxImgSize = 1024 * 1024 * 12;
export async function uploadMongoImg({
base64Img,
teamId,
expiredTime
}: {
base64Img: string;
expiredTime,
metadata
}: UploadImgProps & {
teamId: string;
expiredTime?: Date;
}) {
if (base64Img.length > maxImgSize) {
return Promise.reject('Image too large');
@@ -24,7 +24,8 @@ export async function uploadMongoImg({
const { _id } = await MongoImage.create({
teamId,
binary: Buffer.from(base64Data, 'base64'),
expiredTime
expiredTime: expiredTime,
metadata
});
return getMongoImgUrl(String(_id));
@@ -37,3 +38,9 @@ export async function readMongoImg({ id }: { id: string }) {
}
return data?.binary;
}
export async function delImgByFileIdList(fileIds: string[]) {
return MongoImage.deleteMany({
'metadata.fileId': { $in: fileIds.map((item) => String(item)) }
});
}
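
delImgByFileIdList pairs with the new metadata field on uploadMongoImg; a sketch of tagging an image at upload time so it is removed together with its parent file (placeholder IDs):

const url = await uploadMongoImg({
  teamId: '<teamId>',
  base64Img: 'data:image/png;base64,...',
  metadata: { fileId: '<fileId>' }
});

// Later, clearing the file also clears its images:
await delImgByFileIdList(['<fileId>']);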

View File

@@ -5,13 +5,17 @@ const { Schema, model, models } = connectionMongo;
const ImageSchema = new Schema({
teamId: {
type: Schema.Types.ObjectId,
ref: TeamCollectionName
ref: TeamCollectionName,
required: true
},
binary: {
type: Buffer
},
expiredTime: {
type: Date
},
metadata: {
type: Object
}
});
@@ -21,7 +25,7 @@ try {
console.log(error);
}
export const MongoImage: Model<{ teamId: string; binary: Buffer }> =
export const MongoImage: Model<{ teamId: string; binary: Buffer; metadata?: Record<string, any> }> =
models['image'] || model('image', ImageSchema);
MongoImage.syncIndexes();

View File

@@ -82,7 +82,7 @@ export const sseErrRes = (res: NextApiResponse, error: any) => {
} else if (error?.response?.data?.error?.message) {
msg = error?.response?.data?.error?.message;
} else if (error?.error?.message) {
msg = error?.error?.message;
msg = `${error?.error?.code} ${error?.error?.message}`;
}
addLog.error(`sse error: ${msg}`, error);

View File

@@ -1,11 +1,11 @@
import { MongoDatasetData } from './schema';
import { deletePgDataById } from './pg';
import { MongoDatasetTraining } from '../training/schema';
import { delFileById } from '../../../common/file/gridfs/controller';
import { delFileByFileIdList, delFileByMetadata } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { MongoDatasetCollection } from '../collection/schema';
import { delDatasetFiles } from '../file/controller';
import { delay } from '@fastgpt/global/common/system/utils';
import { delImgByFileIdList } from '../../../common/file/image/controller';
/* delete all data by datasetIds */
export async function delDatasetRelevantData({ datasetIds }: { datasetIds: string[] }) {
@@ -17,9 +17,11 @@ export async function delDatasetRelevantData({ datasetIds }: { datasetIds: strin
});
// delete related files
await Promise.all(datasetIds.map((id) => delDatasetFiles({ datasetId: id })));
await Promise.all(
datasetIds.map((id) => delFileByMetadata({ bucketName: BucketNameEnum.dataset, datasetId: id }))
);
await delay(1000);
await delay(500);
// delete pg data
await deletePgDataById(`dataset_id IN ('${datasetIds.join("','")}')`);
@@ -49,17 +51,16 @@ export async function delCollectionRelevantData({
collectionId: { $in: collectionIds }
});
// delete file
await Promise.all(
filterFileIds.map((fileId) => {
return delFileById({
bucketName: BucketNameEnum.dataset,
fileId
});
// delete file and imgs
await Promise.all([
delImgByFileIdList(filterFileIds),
delFileByFileIdList({
bucketName: BucketNameEnum.dataset,
fileIdList: filterFileIds
})
);
]);
await delay(1000);
await delay(500);
// delete pg data
await deletePgDataById(`collection_id IN ('${collectionIds.join("','")}')`);

View File

@@ -1,9 +0,0 @@
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { getGFSCollection } from '../../../common/file/gridfs/controller';
export async function delDatasetFiles({ datasetId }: { datasetId: string }) {
const db = getGFSCollection(BucketNameEnum.dataset);
await db.deleteMany({
'metadata.datasetId': String(datasetId)
});
}
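
This removed helper deleted documents from the {bucket}.files collection directly, which leaves the corresponding {bucket}.chunks entries (and any extracted images) behind; it is superseded by delFileByMetadata above, which deletes through the GridFS bucket API and also calls delImgByFileIdList.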

View File

@@ -12,7 +12,7 @@ export const authCert = async (props: AuthModeType) => {
canWrite: true
};
};
export async function authCertAndShareId({
export async function authCertOrShareId({
shareId,
...props
}: AuthModeType & { shareId?: string }) {

View File

@@ -14,6 +14,7 @@ import {
import { getFileById } from '../../../common/file/gridfs/controller';
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { getTeamInfoByTmbId } from '../../user/team/controller';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
export async function authDatasetByTmbId({
teamId,
@@ -167,6 +168,10 @@ export async function authDatasetFile({
const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId });
if (!file) {
return Promise.reject(CommonErrEnum.fileNotFound);
}
if (file.metadata.teamId !== teamId) {
return Promise.reject(DatasetErrEnum.unAuthDatasetFile);
}
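
A sketch of how the tightened check surfaces to a caller (the surrounding handler is an assumption; the rejected status texts map to codes via ERROR_RESPONSE):

try {
  await authDatasetFile({ fileId /* ...other auth props */ });
} catch (err) {
  // CommonErrEnum.fileNotFound       -> file no longer exists (new in this commit)
  // DatasetErrEnum.unAuthDatasetFile -> team does not own the file
}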