mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
V4.6.6-1 (#656)
This commit is contained in:
62
packages/global/common/file/read/index.ts
Normal file
62
packages/global/common/file/read/index.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
/* read file to txt */
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
|
||||
export const readPdfFile = async ({ pdf }: { pdf: string | URL | ArrayBuffer }) => {
|
||||
pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';
|
||||
|
||||
type TokenType = {
|
||||
str: string;
|
||||
dir: string;
|
||||
width: number;
|
||||
height: number;
|
||||
transform: number[];
|
||||
fontName: string;
|
||||
hasEOL: boolean;
|
||||
};
|
||||
|
||||
const readPDFPage = async (doc: any, pageNo: number) => {
|
||||
const page = await doc.getPage(pageNo);
|
||||
const tokenizedText = await page.getTextContent();
|
||||
|
||||
const viewport = page.getViewport({ scale: 1 });
|
||||
const pageHeight = viewport.height;
|
||||
const headerThreshold = pageHeight * 0.07; // 假设页头在页面顶部5%的区域内
|
||||
const footerThreshold = pageHeight * 0.93; // 假设页脚在页面底部5%的区域内
|
||||
|
||||
const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
|
||||
return (
|
||||
!token.transform ||
|
||||
(token.transform[5] > headerThreshold && token.transform[5] < footerThreshold)
|
||||
);
|
||||
});
|
||||
|
||||
// concat empty string 'hasEOL'
|
||||
for (let i = 0; i < pageTexts.length; i++) {
|
||||
const item = pageTexts[i];
|
||||
if (item.str === '' && pageTexts[i - 1]) {
|
||||
pageTexts[i - 1].hasEOL = item.hasEOL;
|
||||
pageTexts.splice(i, 1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
page.cleanup();
|
||||
|
||||
return pageTexts
|
||||
.map((token) => {
|
||||
const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
|
||||
|
||||
return paragraphEnd ? `${token.str}\n` : token.str;
|
||||
})
|
||||
.join('');
|
||||
};
|
||||
|
||||
const doc = await pdfjsLib.getDocument(pdf).promise;
|
||||
const pageTextPromises = [];
|
||||
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
|
||||
pageTextPromises.push(readPDFPage(doc, pageNo));
|
||||
}
|
||||
const pageTexts = await Promise.all(pageTextPromises);
|
||||
|
||||
return pageTexts.join('');
|
||||
};
|
@@ -34,3 +34,41 @@ export const simpleMarkdownText = (rawText: string) => {
|
||||
|
||||
return rawText.trim();
|
||||
};
|
||||
|
||||
/**
|
||||
* format markdown
|
||||
* 1. upload base64
|
||||
* 2. replace \
|
||||
*/
|
||||
export const uploadMarkdownBase64 = async ({
|
||||
rawText,
|
||||
uploadImgController
|
||||
}: {
|
||||
rawText: string;
|
||||
uploadImgController: (base64: string) => Promise<string>;
|
||||
}) => {
|
||||
// match base64, upload and replace it
|
||||
const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
|
||||
const base64Arr = rawText.match(base64Regex) || [];
|
||||
// upload base64 and replace it
|
||||
await Promise.all(
|
||||
base64Arr.map(async (base64Img) => {
|
||||
try {
|
||||
const str = await uploadImgController(base64Img);
|
||||
|
||||
rawText = rawText.replace(base64Img, str);
|
||||
} catch (error) {
|
||||
rawText = rawText.replace(base64Img, '');
|
||||
rawText = rawText.replace(/!\[.*\]\(\)/g, '');
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
// Remove white space on both sides of the picture
|
||||
const trimReg = /(!\[.*\]\(.*\))\s*/g;
|
||||
if (trimReg.test(rawText)) {
|
||||
rawText = rawText.replace(trimReg, '$1');
|
||||
}
|
||||
|
||||
return simpleMarkdownText(rawText);
|
||||
};
|
||||
|
@@ -31,7 +31,7 @@ export const splitText2Chunks = (props: {
|
||||
|
||||
// The larger maxLen is, the next sentence is less likely to trigger splitting
|
||||
const stepReges: { reg: RegExp; maxLen: number }[] = [
|
||||
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
|
||||
...customReg.map((text) => ({ reg: new RegExp(`(${text})`, 'g'), maxLen: chunkLen * 1.4 })),
|
||||
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
@@ -64,13 +64,22 @@ export const splitText2Chunks = (props: {
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
const isCustomSteep = checkIsCustomStep(step);
|
||||
const isMarkdownSplit = checkIsMarkdownSplit(step);
|
||||
const independentChunk = checkIndependentChunk(step);
|
||||
|
||||
const { reg } = stepReges[step];
|
||||
|
||||
const splitTexts = text
|
||||
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
|
||||
.replace(
|
||||
reg,
|
||||
(() => {
|
||||
if (isCustomSteep) return splitMarker;
|
||||
if (independentChunk) return `${splitMarker}$1`;
|
||||
return `$1${splitMarker}`;
|
||||
})()
|
||||
)
|
||||
.split(`${splitMarker}`)
|
||||
.filter((part) => part.trim());
|
||||
|
||||
@@ -128,11 +137,6 @@ export const splitText2Chunks = (props: {
|
||||
const independentChunk = checkIndependentChunk(step);
|
||||
const isCustomStep = checkIsCustomStep(step);
|
||||
|
||||
// mini text
|
||||
if (text.length <= chunkLen) {
|
||||
return [text];
|
||||
}
|
||||
|
||||
// oversize
|
||||
if (step >= stepReges.length) {
|
||||
if (text.length < chunkLen * 3) {
|
||||
@@ -221,6 +225,8 @@ export const splitText2Chunks = (props: {
|
||||
} else {
|
||||
chunks.push(`${mdTitle}${lastText}`);
|
||||
}
|
||||
} else if (lastText && chunks.length === 0) {
|
||||
chunks.push(lastText);
|
||||
}
|
||||
|
||||
return chunks;
|
||||
|
29
packages/global/common/system/types/index.d.ts
vendored
29
packages/global/common/system/types/index.d.ts
vendored
@@ -1,4 +1,29 @@
|
||||
export type FeConfigsType = {
|
||||
import type {
|
||||
ChatModelItemType,
|
||||
FunctionModelItemType,
|
||||
LLMModelItemType,
|
||||
VectorModelItemType,
|
||||
AudioSpeechModels,
|
||||
WhisperModelType,
|
||||
ReRankModelItemType
|
||||
} from '../../../core/ai/model.d';
|
||||
|
||||
/* fastgpt main */
|
||||
export type FastGPTConfigFileType = {
|
||||
feConfigs: FastGPTFeConfigsType;
|
||||
systemEnv: SystemEnvType;
|
||||
chatModels: ChatModelItemType[];
|
||||
qaModels: LLMModelItemType[];
|
||||
cqModels: FunctionModelItemType[];
|
||||
extractModels: FunctionModelItemType[];
|
||||
qgModels: LLMModelItemType[];
|
||||
vectorModels: VectorModelItemType[];
|
||||
reRankModels: ReRankModelItemType[];
|
||||
audioSpeechModels: AudioSpeechModelType[];
|
||||
whisperModel: WhisperModelType;
|
||||
};
|
||||
|
||||
export type FastGPTFeConfigsType = {
|
||||
show_emptyChat?: boolean;
|
||||
show_register?: boolean;
|
||||
show_appStore?: boolean;
|
||||
@@ -34,6 +59,6 @@ export type SystemEnvType = {
|
||||
};
|
||||
|
||||
declare global {
|
||||
var feConfigs: FeConfigsType;
|
||||
var feConfigs: FastGPTFeConfigsType;
|
||||
var systemEnv: SystemEnvType;
|
||||
}
|
||||
|
1
packages/global/core/ai/model.d.ts
vendored
1
packages/global/core/ai/model.d.ts
vendored
@@ -24,6 +24,7 @@ export type VectorModelItemType = {
|
||||
defaultToken: number;
|
||||
price: number;
|
||||
maxToken: number;
|
||||
weight: number;
|
||||
};
|
||||
|
||||
export type ReRankModelItemType = {
|
||||
|
@@ -16,6 +16,7 @@ export const defaultVectorModels: VectorModelItemType[] = [
|
||||
name: 'Embedding-2',
|
||||
price: 0,
|
||||
defaultToken: 500,
|
||||
maxToken: 3000
|
||||
maxToken: 3000,
|
||||
weight: 100
|
||||
}
|
||||
];
|
||||
|
1
packages/global/core/dataset/type.d.ts
vendored
1
packages/global/core/dataset/type.d.ts
vendored
@@ -89,6 +89,7 @@ export type DatasetTrainingSchemaType = {
|
||||
q: string;
|
||||
a: string;
|
||||
chunkIndex: number;
|
||||
weight: number;
|
||||
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
|
||||
};
|
||||
|
||||
|
@@ -36,10 +36,11 @@ export const ContextExtractModule: FlowModuleTemplateType = {
|
||||
type: FlowNodeInputTypeEnum.textarea,
|
||||
valueType: ModuleIOValueTypeEnum.string,
|
||||
label: '提取要求描述',
|
||||
description: '给AI一些对应的背景知识或要求描述,引导AI更好的完成任务',
|
||||
description:
|
||||
'给AI一些对应的背景知识或要求描述,引导AI更好的完成任务。\n该输入框可使用全局变量。',
|
||||
required: true,
|
||||
placeholder:
|
||||
'例如: \n1. 你是一个实验室预约助手,你的任务是帮助用户预约实验室。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
|
||||
'例如: \n1. 当前时间为: {{cTime}}。你是一个实验室预约助手,你的任务是帮助用户预约实验室,从文本中获取对应的预约信息。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
|
||||
showTargetInApp: true,
|
||||
showTargetInPlugin: true
|
||||
},
|
||||
|
@@ -2,11 +2,12 @@
|
||||
"name": "@fastgpt/global",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"axios": "^1.5.1",
|
||||
"dayjs": "^1.11.7",
|
||||
"openai": "4.23.0",
|
||||
"encoding": "^0.1.13",
|
||||
"js-tiktoken": "^1.0.7",
|
||||
"axios": "^1.5.1",
|
||||
"openai": "4.23.0",
|
||||
"pdfjs-dist": "^4.0.269",
|
||||
"timezones-list": "^3.0.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
Reference in New Issue
Block a user