mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
4.6.4-alpha (#582)
This commit is contained in:
24
packages/global/common/error/code/common.ts
Normal file
24
packages/global/common/error/code/common.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
import { ErrType } from '../errorCode';
|
||||
|
||||
/* common: 507000 */
|
||||
// First numeric code of this module's error range; each entry below is assigned startCode + its array index.
const startCode = 507000;
|
||||
/**
 * Identifiers of the errors declared in this module.
 *
 * Each member's string value equals its name; the value is used as the
 * `statusText` of the corresponding error entry and as its key in the
 * exported error map.
 */
export enum CommonErrEnum {
  fileNotFound = 'fileNotFound'
}
|
||||
/**
 * Ordered list of the errors declared in this module.
 *
 * The position of an entry determines its numeric code (`startCode + index`),
 * so new entries should be appended rather than inserted, otherwise existing
 * codes shift.
 * `message` looks like a translation key rather than display text — confirm
 * against the locale files.
 */
const commonErr = [
  {
    statusText: CommonErrEnum.fileNotFound,
    message: 'error.fileNotFound'
  }
];

/**
 * Error map keyed by status text.
 *
 * Every entry of `commonErr` becomes `{ code, statusText, message, data: null }`
 * where `code` is `startCode` plus the entry's index in the list.
 */
export default commonErr.reduce((acc, cur, index) => {
  return {
    ...acc,
    [cur.statusText]: {
      code: startCode + index,
      statusText: cur.statusText,
      message: cur.message,
      data: null
    }
  };
}, {} as ErrType<`${CommonErrEnum}`>);
|
@@ -13,23 +13,23 @@ export enum DatasetErrEnum {
|
||||
const datasetErr = [
|
||||
{
|
||||
statusText: DatasetErrEnum.unAuthDataset,
|
||||
message: '无权操作该知识库'
|
||||
message: 'core.dataset.error.unAuthDataset'
|
||||
},
|
||||
{
|
||||
statusText: DatasetErrEnum.unAuthDatasetCollection,
|
||||
message: '无权操作该数据集'
|
||||
message: 'core.dataset.error.unAuthDatasetCollection'
|
||||
},
|
||||
{
|
||||
statusText: DatasetErrEnum.unAuthDatasetData,
|
||||
message: '无权操作该数据'
|
||||
message: 'core.dataset.error.unAuthDatasetData'
|
||||
},
|
||||
{
|
||||
statusText: DatasetErrEnum.unAuthDatasetFile,
|
||||
message: '无权操作该文件'
|
||||
message: 'core.dataset.error.unAuthDatasetFile'
|
||||
},
|
||||
{
|
||||
statusText: DatasetErrEnum.unCreateCollection,
|
||||
message: '无权创建数据集'
|
||||
message: 'core.dataset.error.unCreateCollection'
|
||||
},
|
||||
{
|
||||
statusText: DatasetErrEnum.unLinkCollection,
|
||||
|
@@ -6,6 +6,7 @@ import pluginErr from './code/plugin';
|
||||
import outLinkErr from './code/outLink';
|
||||
import teamErr from './code/team';
|
||||
import userErr from './code/user';
|
||||
import commonErr from './code/common';
|
||||
|
||||
export const ERROR_CODE: { [key: number]: string } = {
|
||||
400: '请求失败',
|
||||
@@ -96,5 +97,6 @@ export const ERROR_RESPONSE: Record<
|
||||
...outLinkErr,
|
||||
...teamErr,
|
||||
...userErr,
|
||||
...pluginErr
|
||||
...pluginErr,
|
||||
...commonErr
|
||||
};
|
||||
|
7
packages/global/common/file/api.d.ts
vendored
7
packages/global/common/file/api.d.ts
vendored
@@ -1,3 +1,10 @@
|
||||
/**
 * Parameters for an image upload request.
 * NOTE(review): field semantics below are inferred from the names only —
 * confirm against the upload handler.
 */
export type UploadImgProps = {
  /** Image payload; presumably a base64-encoded string. */
  base64Img: string;
  /** Optional expiry date; presumably when the stored image may be removed. */
  expiredTime?: Date;
  /** Optional free-form data attached to the image. */
  metadata?: Record<string, any>;
  /** Optional share identifier; presumably links the upload to a shared link. */
  shareId?: string;
};
|
||||
|
||||
export type UrlFetchParams = {
|
||||
urlList: string[];
|
||||
selector?: string;
|
||||
|
@@ -49,7 +49,14 @@ export const cheerioToHtml = ({
|
||||
}
|
||||
});
|
||||
|
||||
return $(selector || 'body').html();
|
||||
const html = $(selector || 'body')
|
||||
.map((item, dom) => {
|
||||
return $(dom).html();
|
||||
})
|
||||
.get()
|
||||
.join('\n');
|
||||
|
||||
return html;
|
||||
};
|
||||
export const urlsFetch = async ({
|
||||
urlList,
|
||||
|
@@ -26,10 +26,14 @@ export const simpleMarkdownText = (rawText: string) => {
|
||||
rawText = rawText.replace(/\\\\n/g, '\\n');
|
||||
|
||||
// Remove headings and code blocks front spaces
|
||||
['####', '###', '##', '#', '```', '~~~'].forEach((item) => {
|
||||
['####', '###', '##', '#', '```', '~~~'].forEach((item, i) => {
|
||||
const isMarkdown = i <= 3;
|
||||
const reg = new RegExp(`\\n\\s*${item}`, 'g');
|
||||
if (reg.test(rawText)) {
|
||||
rawText = rawText.replace(new RegExp(`\\n\\s*(${item})`, 'g'), '\n$1');
|
||||
rawText = rawText.replace(
|
||||
new RegExp(`(\\n)\\s*(${item})`, 'g'),
|
||||
isMarkdown ? '\n$1$2' : '$1$2'
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
|
@@ -12,12 +12,13 @@ export const splitText2Chunks = (props: {
|
||||
text: string;
|
||||
chunkLen: number;
|
||||
overlapRatio?: number;
|
||||
customReg?: string[];
|
||||
}): {
|
||||
chunks: string[];
|
||||
tokens: number;
|
||||
overlapRatio?: number;
|
||||
} => {
|
||||
let { text = '', chunkLen, overlapRatio = 0.2 } = props;
|
||||
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;
|
||||
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
|
||||
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
|
||||
const overlapLen = Math.round(chunkLen * overlapRatio);
|
||||
@@ -29,22 +30,29 @@ export const splitText2Chunks = (props: {
|
||||
|
||||
// The larger maxLen is, the next sentence is less likely to trigger splitting
|
||||
const stepReges: { reg: RegExp; maxLen: number }[] = [
|
||||
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
|
||||
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
|
||||
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
|
||||
{ reg: /([\n](`))/g, maxLen: chunkLen * 4 }, // code block
|
||||
{ reg: /([\n](?![\*\-|>0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
|
||||
{ reg: /([\n])/g, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
|
||||
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
|
||||
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
|
||||
|
||||
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /([!]|!\s)/g, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.6 },
|
||||
{ reg: /([;]|;\s)/g, maxLen: chunkLen * 1.8 },
|
||||
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /([;]|;\s)/g, maxLen: chunkLen * 1.6 },
|
||||
{ reg: /([,]|,\s)/g, maxLen: chunkLen * 2 }
|
||||
];
|
||||
|
||||
const customRegLen = customReg.length;
|
||||
const checkIsCustomStep = (step: number) => step < customRegLen;
|
||||
const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen;
|
||||
const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
|
||||
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
|
||||
|
||||
// If splitting on markdown titles, record the title separately
|
||||
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
|
||||
if (step >= stepReges.length) {
|
||||
@@ -55,11 +63,13 @@ export const splitText2Chunks = (props: {
|
||||
}
|
||||
];
|
||||
}
|
||||
const isMarkdownSplit = step <= 3;
|
||||
const isMarkdownSplit = checkIsMarkdownSplit(step);
|
||||
const independentChunk = checkIndependentChunk(step);
|
||||
|
||||
const { reg } = stepReges[step];
|
||||
|
||||
const splitTexts = text
|
||||
.replace(reg, isMarkdownSplit ? `${splitMarker}$1` : `$1${splitMarker}`)
|
||||
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
|
||||
.split(`${splitMarker}`)
|
||||
.filter((part) => part.trim());
|
||||
|
||||
@@ -76,7 +86,7 @@ export const splitText2Chunks = (props: {
|
||||
};
|
||||
|
||||
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
|
||||
const forbidOverlap = step <= 6;
|
||||
const forbidOverlap = checkForbidOverlap(step);
|
||||
const maxOverlapLen = chunkLen * 0.4;
|
||||
|
||||
// step >= stepReges.length: Do not overlap incomplete sentences
|
||||
@@ -114,7 +124,8 @@ export const splitText2Chunks = (props: {
|
||||
lastText: string;
|
||||
mdTitle: string;
|
||||
}): string[] => {
|
||||
const isMarkdownSplit = step <= 3;
|
||||
const independentChunk = checkIndependentChunk(step);
|
||||
const isCustomStep = checkIsCustomStep(step);
|
||||
|
||||
// mini text
|
||||
if (text.length <= chunkLen) {
|
||||
@@ -134,12 +145,13 @@ export const splitText2Chunks = (props: {
|
||||
return chunks;
|
||||
}
|
||||
|
||||
const { maxLen } = stepReges[step];
|
||||
const minChunkLen = chunkLen * 0.7;
|
||||
|
||||
// split text by special char
|
||||
const splitTexts = getSplitTexts({ text, step });
|
||||
|
||||
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
|
||||
const minChunkLen = chunkLen * 0.7;
|
||||
const miniChunkLen = 30;
|
||||
|
||||
const chunks: string[] = [];
|
||||
for (let i = 0; i < splitTexts.length; i++) {
|
||||
const item = splitTexts[i];
|
||||
@@ -170,8 +182,8 @@ export const splitText2Chunks = (props: {
|
||||
mdTitle: currentTitle
|
||||
});
|
||||
const lastChunk = innerChunks[innerChunks.length - 1];
|
||||
// last chunk is too small, concat it to lastText
|
||||
if (!isMarkdownSplit && lastChunk.length < minChunkLen) {
|
||||
// last chunk is too small, concat it to lastText(next chunk start)
|
||||
if (!independentChunk && lastChunk.length < minChunkLen) {
|
||||
chunks.push(...innerChunks.slice(0, -1));
|
||||
lastText = lastChunk;
|
||||
} else {
|
||||
@@ -189,10 +201,14 @@ export const splitText2Chunks = (props: {
|
||||
lastText = newText;
|
||||
|
||||
// markdown paragraph block: Direct addition; If the chunk size reaches, add a chunk
|
||||
if (isMarkdownSplit || newTextLen >= chunkLen) {
|
||||
if (
|
||||
isCustomStep ||
|
||||
(independentChunk && newTextLen > miniChunkLen) ||
|
||||
newTextLen >= chunkLen
|
||||
) {
|
||||
chunks.push(`${currentTitle}${lastText}`);
|
||||
|
||||
lastText = isMarkdownSplit ? '' : getOneTextOverlapText({ text: lastText, step });
|
||||
lastText = getOneTextOverlapText({ text: lastText, step });
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -24,7 +24,7 @@ export const getDefaultAppForm = (templateId = 'fastgpt-universal'): AppSimpleEd
|
||||
dataset: {
|
||||
datasets: [],
|
||||
similarity: 0.4,
|
||||
limit: 5,
|
||||
limit: 1500,
|
||||
searchEmptyText: '',
|
||||
searchMode: DatasetSearchModeEnum.embedding
|
||||
},
|
||||
|
@@ -55,3 +55,5 @@ export const LOGO_ICON = `/icon/logo.svg`;
|
||||
|
||||
export const IMG_BLOCK_KEY = 'img-block';
|
||||
export const FILE_BLOCK_KEY = 'file-block';
|
||||
|
||||
export const MARKDOWN_QUOTE_SIGN = 'QUOTE SIGN';
|
||||
|
@@ -54,17 +54,10 @@ export const DatasetSearchModule: FlowModuleTemplateType = {
|
||||
{
|
||||
key: ModuleInputKeyEnum.datasetLimit,
|
||||
type: FlowNodeInputTypeEnum.hidden,
|
||||
label: '单次搜索上限',
|
||||
description: '最多取 n 条记录作为本次问题引用',
|
||||
value: 5,
|
||||
label: '引用上限',
|
||||
description: '单次搜索最大的 Tokens 数量,中文约1字=1.7Tokens,英文约1字=1Tokens',
|
||||
value: 1500,
|
||||
valueType: ModuleDataTypeEnum.number,
|
||||
min: 1,
|
||||
max: 20,
|
||||
step: 1,
|
||||
markList: [
|
||||
{ label: '1', value: 1 },
|
||||
{ label: '20', value: 20 }
|
||||
],
|
||||
showTargetInApp: false,
|
||||
showTargetInPlugin: false
|
||||
},
|
||||
|
Reference in New Issue
Block a user