mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
V4.6.6-1 (#656)
This commit is contained in:
62
packages/global/common/file/read/index.ts
Normal file
62
packages/global/common/file/read/index.ts
Normal file
@@ -0,0 +1,62 @@
|
||||
/* read file to txt */
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
|
||||
export const readPdfFile = async ({ pdf }: { pdf: string | URL | ArrayBuffer }) => {
|
||||
pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';
|
||||
|
||||
type TokenType = {
|
||||
str: string;
|
||||
dir: string;
|
||||
width: number;
|
||||
height: number;
|
||||
transform: number[];
|
||||
fontName: string;
|
||||
hasEOL: boolean;
|
||||
};
|
||||
|
||||
const readPDFPage = async (doc: any, pageNo: number) => {
|
||||
const page = await doc.getPage(pageNo);
|
||||
const tokenizedText = await page.getTextContent();
|
||||
|
||||
const viewport = page.getViewport({ scale: 1 });
|
||||
const pageHeight = viewport.height;
|
||||
const headerThreshold = pageHeight * 0.07; // 假设页头在页面顶部5%的区域内
|
||||
const footerThreshold = pageHeight * 0.93; // 假设页脚在页面底部5%的区域内
|
||||
|
||||
const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
|
||||
return (
|
||||
!token.transform ||
|
||||
(token.transform[5] > headerThreshold && token.transform[5] < footerThreshold)
|
||||
);
|
||||
});
|
||||
|
||||
// concat empty string 'hasEOL'
|
||||
for (let i = 0; i < pageTexts.length; i++) {
|
||||
const item = pageTexts[i];
|
||||
if (item.str === '' && pageTexts[i - 1]) {
|
||||
pageTexts[i - 1].hasEOL = item.hasEOL;
|
||||
pageTexts.splice(i, 1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
page.cleanup();
|
||||
|
||||
return pageTexts
|
||||
.map((token) => {
|
||||
const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
|
||||
|
||||
return paragraphEnd ? `${token.str}\n` : token.str;
|
||||
})
|
||||
.join('');
|
||||
};
|
||||
|
||||
const doc = await pdfjsLib.getDocument(pdf).promise;
|
||||
const pageTextPromises = [];
|
||||
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
|
||||
pageTextPromises.push(readPDFPage(doc, pageNo));
|
||||
}
|
||||
const pageTexts = await Promise.all(pageTextPromises);
|
||||
|
||||
return pageTexts.join('');
|
||||
};
|
@@ -34,3 +34,41 @@ export const simpleMarkdownText = (rawText: string) => {
|
||||
|
||||
return rawText.trim();
|
||||
};
|
||||
|
||||
/**
|
||||
* format markdown
|
||||
* 1. upload base64
|
||||
* 2. replace \
|
||||
*/
|
||||
export const uploadMarkdownBase64 = async ({
|
||||
rawText,
|
||||
uploadImgController
|
||||
}: {
|
||||
rawText: string;
|
||||
uploadImgController: (base64: string) => Promise<string>;
|
||||
}) => {
|
||||
// match base64, upload and replace it
|
||||
const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
|
||||
const base64Arr = rawText.match(base64Regex) || [];
|
||||
// upload base64 and replace it
|
||||
await Promise.all(
|
||||
base64Arr.map(async (base64Img) => {
|
||||
try {
|
||||
const str = await uploadImgController(base64Img);
|
||||
|
||||
rawText = rawText.replace(base64Img, str);
|
||||
} catch (error) {
|
||||
rawText = rawText.replace(base64Img, '');
|
||||
rawText = rawText.replace(/!\[.*\]\(\)/g, '');
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
// Remove white space on both sides of the picture
|
||||
const trimReg = /(!\[.*\]\(.*\))\s*/g;
|
||||
if (trimReg.test(rawText)) {
|
||||
rawText = rawText.replace(trimReg, '$1');
|
||||
}
|
||||
|
||||
return simpleMarkdownText(rawText);
|
||||
};
|
||||
|
@@ -31,7 +31,7 @@ export const splitText2Chunks = (props: {
|
||||
|
||||
// The larger maxLen is, the next sentence is less likely to trigger splitting
|
||||
const stepReges: { reg: RegExp; maxLen: number }[] = [
|
||||
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
|
||||
...customReg.map((text) => ({ reg: new RegExp(`(${text})`, 'g'), maxLen: chunkLen * 1.4 })),
|
||||
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
@@ -64,13 +64,22 @@ export const splitText2Chunks = (props: {
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
const isCustomSteep = checkIsCustomStep(step);
|
||||
const isMarkdownSplit = checkIsMarkdownSplit(step);
|
||||
const independentChunk = checkIndependentChunk(step);
|
||||
|
||||
const { reg } = stepReges[step];
|
||||
|
||||
const splitTexts = text
|
||||
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
|
||||
.replace(
|
||||
reg,
|
||||
(() => {
|
||||
if (isCustomSteep) return splitMarker;
|
||||
if (independentChunk) return `${splitMarker}$1`;
|
||||
return `$1${splitMarker}`;
|
||||
})()
|
||||
)
|
||||
.split(`${splitMarker}`)
|
||||
.filter((part) => part.trim());
|
||||
|
||||
@@ -128,11 +137,6 @@ export const splitText2Chunks = (props: {
|
||||
const independentChunk = checkIndependentChunk(step);
|
||||
const isCustomStep = checkIsCustomStep(step);
|
||||
|
||||
// mini text
|
||||
if (text.length <= chunkLen) {
|
||||
return [text];
|
||||
}
|
||||
|
||||
// oversize
|
||||
if (step >= stepReges.length) {
|
||||
if (text.length < chunkLen * 3) {
|
||||
@@ -221,6 +225,8 @@ export const splitText2Chunks = (props: {
|
||||
} else {
|
||||
chunks.push(`${mdTitle}${lastText}`);
|
||||
}
|
||||
} else if (lastText && chunks.length === 0) {
|
||||
chunks.push(lastText);
|
||||
}
|
||||
|
||||
return chunks;
|
||||
|
29
packages/global/common/system/types/index.d.ts
vendored
29
packages/global/common/system/types/index.d.ts
vendored
@@ -1,4 +1,29 @@
|
||||
export type FeConfigsType = {
|
||||
import type {
|
||||
ChatModelItemType,
|
||||
FunctionModelItemType,
|
||||
LLMModelItemType,
|
||||
VectorModelItemType,
|
||||
AudioSpeechModels,
|
||||
WhisperModelType,
|
||||
ReRankModelItemType
|
||||
} from '../../../core/ai/model.d';
|
||||
|
||||
/* fastgpt main */
|
||||
export type FastGPTConfigFileType = {
|
||||
feConfigs: FastGPTFeConfigsType;
|
||||
systemEnv: SystemEnvType;
|
||||
chatModels: ChatModelItemType[];
|
||||
qaModels: LLMModelItemType[];
|
||||
cqModels: FunctionModelItemType[];
|
||||
extractModels: FunctionModelItemType[];
|
||||
qgModels: LLMModelItemType[];
|
||||
vectorModels: VectorModelItemType[];
|
||||
reRankModels: ReRankModelItemType[];
|
||||
audioSpeechModels: AudioSpeechModelType[];
|
||||
whisperModel: WhisperModelType;
|
||||
};
|
||||
|
||||
export type FastGPTFeConfigsType = {
|
||||
show_emptyChat?: boolean;
|
||||
show_register?: boolean;
|
||||
show_appStore?: boolean;
|
||||
@@ -34,6 +59,6 @@ export type SystemEnvType = {
|
||||
};
|
||||
|
||||
declare global {
|
||||
var feConfigs: FeConfigsType;
|
||||
var feConfigs: FastGPTFeConfigsType;
|
||||
var systemEnv: SystemEnvType;
|
||||
}
|
||||
|
1
packages/global/core/ai/model.d.ts
vendored
1
packages/global/core/ai/model.d.ts
vendored
@@ -24,6 +24,7 @@ export type VectorModelItemType = {
|
||||
defaultToken: number;
|
||||
price: number;
|
||||
maxToken: number;
|
||||
weight: number;
|
||||
};
|
||||
|
||||
export type ReRankModelItemType = {
|
||||
|
@@ -16,6 +16,7 @@ export const defaultVectorModels: VectorModelItemType[] = [
|
||||
name: 'Embedding-2',
|
||||
price: 0,
|
||||
defaultToken: 500,
|
||||
maxToken: 3000
|
||||
maxToken: 3000,
|
||||
weight: 100
|
||||
}
|
||||
];
|
||||
|
1
packages/global/core/dataset/type.d.ts
vendored
1
packages/global/core/dataset/type.d.ts
vendored
@@ -89,6 +89,7 @@ export type DatasetTrainingSchemaType = {
|
||||
q: string;
|
||||
a: string;
|
||||
chunkIndex: number;
|
||||
weight: number;
|
||||
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
|
||||
};
|
||||
|
||||
|
@@ -36,10 +36,11 @@ export const ContextExtractModule: FlowModuleTemplateType = {
|
||||
type: FlowNodeInputTypeEnum.textarea,
|
||||
valueType: ModuleIOValueTypeEnum.string,
|
||||
label: '提取要求描述',
|
||||
description: '给AI一些对应的背景知识或要求描述,引导AI更好的完成任务',
|
||||
description:
|
||||
'给AI一些对应的背景知识或要求描述,引导AI更好的完成任务。\n该输入框可使用全局变量。',
|
||||
required: true,
|
||||
placeholder:
|
||||
'例如: \n1. 你是一个实验室预约助手,你的任务是帮助用户预约实验室。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
|
||||
'例如: \n1. 当前时间为: {{cTime}}。你是一个实验室预约助手,你的任务是帮助用户预约实验室,从文本中获取对应的预约信息。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
|
||||
showTargetInApp: true,
|
||||
showTargetInPlugin: true
|
||||
},
|
||||
|
@@ -2,11 +2,12 @@
|
||||
"name": "@fastgpt/global",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"axios": "^1.5.1",
|
||||
"dayjs": "^1.11.7",
|
||||
"openai": "4.23.0",
|
||||
"encoding": "^0.1.13",
|
||||
"js-tiktoken": "^1.0.7",
|
||||
"axios": "^1.5.1",
|
||||
"openai": "4.23.0",
|
||||
"pdfjs-dist": "^4.0.269",
|
||||
"timezones-list": "^3.0.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
@@ -20,12 +20,12 @@ export async function connectMongo({
|
||||
console.log('mongo start connect');
|
||||
try {
|
||||
mongoose.set('strictQuery', true);
|
||||
const maxConnecting = Math.max(20, Number(process.env.DB_MAX_LINK || 20));
|
||||
const maxConnecting = Math.max(30, Number(process.env.DB_MAX_LINK || 20));
|
||||
await mongoose.connect(process.env.MONGODB_URI as string, {
|
||||
bufferCommands: true,
|
||||
maxConnecting: maxConnecting,
|
||||
maxPoolSize: maxConnecting,
|
||||
minPoolSize: Math.max(5, Math.round(Number(process.env.DB_MAX_LINK || 5) * 0.1)),
|
||||
minPoolSize: 20,
|
||||
connectTimeoutMS: 60000,
|
||||
waitQueueTimeoutMS: 60000,
|
||||
socketTimeoutMS: 60000,
|
||||
|
@@ -9,7 +9,8 @@ export const connectPg = async (): Promise<Pool> => {
|
||||
|
||||
global.pgClient = new Pool({
|
||||
connectionString: process.env.PG_URL,
|
||||
max: Number(process.env.DB_MAX_LINK || 5),
|
||||
max: Number(process.env.DB_MAX_LINK || 20),
|
||||
min: 10,
|
||||
keepAlive: true,
|
||||
idleTimeoutMillis: 60000,
|
||||
connectionTimeoutMillis: 20000
|
||||
|
@@ -1,15 +1,15 @@
|
||||
import { SystemConfigsTypeEnum } from '@fastgpt/global/common/system/config/constants';
|
||||
import { MongoSystemConfigs } from './schema';
|
||||
import { FeConfigsType } from '@fastgpt/global/common/system/types';
|
||||
import { FastGPTConfigFileType } from '@fastgpt/global/common/system/types';
|
||||
|
||||
export const getFastGPTFeConfig = async () => {
|
||||
export const getFastGPTConfigFromDB = async () => {
|
||||
const res = await MongoSystemConfigs.findOne({
|
||||
type: SystemConfigsTypeEnum.fastgpt
|
||||
}).sort({
|
||||
createTime: -1
|
||||
});
|
||||
|
||||
const config: FeConfigsType = res?.value?.FeConfig || {};
|
||||
const config = res?.value || {};
|
||||
|
||||
return config;
|
||||
return config as Omit<FastGPTConfigFileType, 'systemEnv'>;
|
||||
};
|
||||
|
@@ -22,7 +22,6 @@ const systemConfigSchema = new Schema({
|
||||
});
|
||||
|
||||
try {
|
||||
systemConfigSchema.index({ createTime: -1 }, { expireAfterSeconds: 90 * 24 * 60 * 60 });
|
||||
systemConfigSchema.index({ type: 1 });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
|
@@ -79,6 +79,10 @@ const TrainingDataSchema = new Schema({
|
||||
type: Number,
|
||||
default: 0
|
||||
},
|
||||
weight: {
|
||||
type: Number,
|
||||
default: 0
|
||||
},
|
||||
indexes: {
|
||||
type: [
|
||||
{
|
||||
|
@@ -56,7 +56,7 @@ export async function parseHeaderCert({
|
||||
async function authCookieToken(cookie?: string, token?: string) {
|
||||
// 获取 cookie
|
||||
const cookies = Cookie.parse(cookie || '');
|
||||
const cookieToken = cookies.token || token;
|
||||
const cookieToken = token || cookies.token;
|
||||
|
||||
if (!cookieToken) {
|
||||
return Promise.reject(ERROR_ENUM.unAuthorization);
|
||||
@@ -127,7 +127,7 @@ export async function parseHeaderCert({
|
||||
authType: AuthUserTypeEnum.apikey
|
||||
};
|
||||
}
|
||||
if (authToken && (cookie || token)) {
|
||||
if (authToken && (token || cookie)) {
|
||||
// user token(from fastgpt web)
|
||||
const res = await authCookieToken(cookie, token);
|
||||
return {
|
||||
@@ -182,7 +182,7 @@ export async function parseHeaderCert({
|
||||
export const setCookie = (res: NextApiResponse, token: string) => {
|
||||
res.setHeader(
|
||||
'Set-Cookie',
|
||||
`token=${token}; Path=/; HttpOnly; Max-Age=604800; Samesite=None; Secure;`
|
||||
`token=${token}; Path=/; HttpOnly; Max-Age=604800; Samesite=Strict; Secure;`
|
||||
);
|
||||
};
|
||||
/* clear cookie */
|
||||
|
66
packages/web/common/file/img.ts
Normal file
66
packages/web/common/file/img.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
export type CompressImgProps = {
|
||||
maxW?: number;
|
||||
maxH?: number;
|
||||
maxSize?: number;
|
||||
};
|
||||
|
||||
export const compressBase64ImgAndUpload = ({
|
||||
base64Img,
|
||||
maxW = 1080,
|
||||
maxH = 1080,
|
||||
maxSize = 1024 * 500, // 300kb
|
||||
uploadController
|
||||
}: CompressImgProps & {
|
||||
base64Img: string;
|
||||
uploadController: (base64: string) => Promise<string>;
|
||||
}) => {
|
||||
return new Promise<string>((resolve, reject) => {
|
||||
const fileType =
|
||||
/^data:([a-zA-Z0-9]+\/[a-zA-Z0-9-.+]+).*,/.exec(base64Img)?.[1] || 'image/jpeg';
|
||||
|
||||
const img = new Image();
|
||||
img.src = base64Img;
|
||||
img.onload = async () => {
|
||||
let width = img.width;
|
||||
let height = img.height;
|
||||
|
||||
if (width > height) {
|
||||
if (width > maxW) {
|
||||
height *= maxW / width;
|
||||
width = maxW;
|
||||
}
|
||||
} else {
|
||||
if (height > maxH) {
|
||||
width *= maxH / height;
|
||||
height = maxH;
|
||||
}
|
||||
}
|
||||
|
||||
const canvas = document.createElement('canvas');
|
||||
canvas.width = width;
|
||||
canvas.height = height;
|
||||
const ctx = canvas.getContext('2d');
|
||||
|
||||
if (!ctx) {
|
||||
return reject('压缩图片异常');
|
||||
}
|
||||
|
||||
ctx.drawImage(img, 0, 0, width, height);
|
||||
const compressedDataUrl = canvas.toDataURL(fileType, 1);
|
||||
// 移除 canvas 元素
|
||||
canvas.remove();
|
||||
|
||||
if (compressedDataUrl.length > maxSize) {
|
||||
return reject('图片太大了');
|
||||
}
|
||||
|
||||
try {
|
||||
const src = await uploadController(compressedDataUrl);
|
||||
resolve(src);
|
||||
} catch (error) {
|
||||
reject(error);
|
||||
}
|
||||
};
|
||||
img.onerror = reject;
|
||||
});
|
||||
};
|
53
packages/web/common/file/read.ts
Normal file
53
packages/web/common/file/read.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
import { uploadMarkdownBase64 } from '@fastgpt/global/common/string/markdown';
|
||||
import { htmlStr2Md } from '../string/markdown';
|
||||
/**
|
||||
* read file raw text
|
||||
*/
|
||||
export const readFileRawText = (file: File) => {
|
||||
return new Promise((resolve: (_: string) => void, reject) => {
|
||||
try {
|
||||
const reader = new FileReader();
|
||||
reader.onload = () => {
|
||||
resolve(reader.result as string);
|
||||
};
|
||||
reader.onerror = (err) => {
|
||||
console.log('error txt read:', err);
|
||||
reject('Read file error');
|
||||
};
|
||||
reader.readAsText(file);
|
||||
} catch (error) {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
export const readMdFile = async ({
|
||||
file,
|
||||
uploadImgController
|
||||
}: {
|
||||
file: File;
|
||||
uploadImgController: (base64: string) => Promise<string>;
|
||||
}) => {
|
||||
const md = await readFileRawText(file);
|
||||
const rawText = await uploadMarkdownBase64({
|
||||
rawText: md,
|
||||
uploadImgController
|
||||
});
|
||||
return rawText;
|
||||
};
|
||||
|
||||
export const readHtmlFile = async ({
|
||||
file,
|
||||
uploadImgController
|
||||
}: {
|
||||
file: File;
|
||||
uploadImgController: (base64: string) => Promise<string>;
|
||||
}) => {
|
||||
const md = htmlStr2Md(await readFileRawText(file));
|
||||
const rawText = await uploadMarkdownBase64({
|
||||
rawText: md,
|
||||
uploadImgController
|
||||
});
|
||||
|
||||
return rawText;
|
||||
};
|
@@ -2,6 +2,7 @@
|
||||
"name": "@fastgpt/web",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"@fastgpt/global": "workspace:*",
|
||||
"joplin-turndown-plugin-gfm": "^1.0.12",
|
||||
"turndown": "^7.1.2"
|
||||
},
|
||||
|
Reference in New Issue
Block a user