This commit is contained in:
Archer
2023-12-27 11:07:39 +08:00
committed by GitHub
parent 86286efb54
commit 759a2330e6
182 changed files with 3099 additions and 81685 deletions

View File

@@ -0,0 +1,62 @@
/* read a PDF file to plain text */
import * as pdfjsLib from 'pdfjs-dist';
export const readPdfFile = async ({ pdf }: { pdf: string | URL | ArrayBuffer }) => {
  // pdf.js parses documents in a web worker; point it at the bundled worker script.
  pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';

  // Shape of one text item returned by pdf.js page.getTextContent().
  type TokenType = {
    str: string;
    dir: string;
    width: number;
    height: number;
    transform: number[]; // pdf.js transform matrix; index 5 is the token's y position on the page
    fontName: string;
    hasEOL: boolean;
  };

  // Read one page: drop header/footer tokens by vertical position, then
  // flatten the remaining tokens to text, adding '\n' at paragraph ends.
  const readPDFPage = async (doc: any, pageNo: number) => {
    const page = await doc.getPage(pageNo);
    const tokenizedText = await page.getTextContent();
    const viewport = page.getViewport({ scale: 1 });
    const pageHeight = viewport.height;
    const headerThreshold = pageHeight * 0.07; // assume the header sits in the top 7% of the page
    const footerThreshold = pageHeight * 0.93; // assume the footer sits in the bottom 7% of the page
    const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
      // Keep tokens without position info, plus tokens strictly between header and footer.
      return (
        !token.transform ||
        (token.transform[5] > headerThreshold && token.transform[5] < footerThreshold)
      );
    });

    // Merge empty-string tokens into the previous token: carry the empty
    // token's 'hasEOL' flag backwards so end-of-line info survives removal.
    for (let i = 0; i < pageTexts.length; i++) {
      const item = pageTexts[i];
      if (item.str === '' && pageTexts[i - 1]) {
        pageTexts[i - 1].hasEOL = item.hasEOL;
        pageTexts.splice(i, 1);
        i--;
      }
    }

    page.cleanup();

    return pageTexts
      .map((token) => {
        // A paragraph ends when the token has an EOL and finishes with
        // sentence-ending punctuation (CJK or ASCII) or a line break.
        const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
        return paragraphEnd ? `${token.str}\n` : token.str;
      })
      .join('');
  };

  const doc = await pdfjsLib.getDocument(pdf).promise;
  // Kick off all pages concurrently; Promise.all keeps page order in the result.
  const pageTextPromises = [];
  for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
    pageTextPromises.push(readPDFPage(doc, pageNo));
  }
  const pageTexts = await Promise.all(pageTextPromises);
  return pageTexts.join('');
};

View File

@@ -34,3 +34,41 @@ export const simpleMarkdownText = (rawText: string) => {
return rawText.trim();
};
/**
 * Format markdown:
 * 1. Upload every inline base64 image and replace it with the returned URL.
 * 2. Trim whitespace after image tags, then run the common markdown cleanup.
 *
 * @param rawText markdown that may contain `data:image/...;base64,...` images
 * @param uploadImgController uploads one base64 data URL, resolves to its hosted URL
 * @returns cleaned markdown with images re-hosted
 */
export const uploadMarkdownBase64 = async ({
  rawText,
  uploadImgController
}: {
  rawText: string;
  uploadImgController: (base64: string) => Promise<string>;
}) => {
  // Match each base64 data URL. Non-greedy mime match so two images on the
  // same line are matched separately instead of being swallowed into one
  // bogus match by a greedy '.*'.
  const base64Regex = /data:image\/.*?;base64,([^\)]+)/g;
  const base64Arr = rawText.match(base64Regex) || [];

  // Upload all images in parallel; each callback replaces its own occurrence.
  await Promise.all(
    base64Arr.map(async (base64Img) => {
      try {
        const str = await uploadImgController(base64Img);
        rawText = rawText.replace(base64Img, str);
      } catch (error) {
        // Upload failed: drop the data URL and remove the now-empty image tag.
        rawText = rawText.replace(base64Img, '');
        rawText = rawText.replace(/!\[.*\]\(\)/g, '');
      }
    })
  );

  // Remove whitespace after image tags. Replace unconditionally: calling
  // .test() on a /g regex mutates its lastIndex and is a classic pitfall;
  // replace() is already a no-op when there is no match.
  rawText = rawText.replace(/(!\[.*\]\(.*\))\s*/g, '$1');

  return simpleMarkdownText(rawText);
};

View File

@@ -31,7 +31,7 @@ export const splitText2Chunks = (props: {
// The larger maxLen is, the next sentence is less likely to trigger splitting
const stepReges: { reg: RegExp; maxLen: number }[] = [
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
...customReg.map((text) => ({ reg: new RegExp(`(${text})`, 'g'), maxLen: chunkLen * 1.4 })),
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
@@ -64,13 +64,22 @@ export const splitText2Chunks = (props: {
}
];
}
const isCustomSteep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const independentChunk = checkIndependentChunk(step);
const { reg } = stepReges[step];
const splitTexts = text
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
.replace(
reg,
(() => {
if (isCustomSteep) return splitMarker;
if (independentChunk) return `${splitMarker}$1`;
return `$1${splitMarker}`;
})()
)
.split(`${splitMarker}`)
.filter((part) => part.trim());
@@ -128,11 +137,6 @@ export const splitText2Chunks = (props: {
const independentChunk = checkIndependentChunk(step);
const isCustomStep = checkIsCustomStep(step);
// mini text
if (text.length <= chunkLen) {
return [text];
}
// oversize
if (step >= stepReges.length) {
if (text.length < chunkLen * 3) {
@@ -221,6 +225,8 @@ export const splitText2Chunks = (props: {
} else {
chunks.push(`${mdTitle}${lastText}`);
}
} else if (lastText && chunks.length === 0) {
chunks.push(lastText);
}
return chunks;

View File

@@ -1,4 +1,29 @@
export type FeConfigsType = {
import type {
ChatModelItemType,
FunctionModelItemType,
LLMModelItemType,
VectorModelItemType,
AudioSpeechModels,
WhisperModelType,
ReRankModelItemType
} from '../../../core/ai/model.d';
/* FastGPT main config file shape: front-end flags, system env, and model lists. */
export type FastGPTConfigFileType = {
  feConfigs: FastGPTFeConfigsType;
  systemEnv: SystemEnvType;
  chatModels: ChatModelItemType[];
  qaModels: LLMModelItemType[];
  cqModels: FunctionModelItemType[];
  extractModels: FunctionModelItemType[];
  qgModels: LLMModelItemType[];
  vectorModels: VectorModelItemType[];
  reRankModels: ReRankModelItemType[];
  // NOTE(review): the import above brings in 'AudioSpeechModels', but this
  // field references 'AudioSpeechModelType' — confirm which name is actually
  // exported from core/ai/model.d; as written this relies on a global type.
  audioSpeechModels: AudioSpeechModelType[];
  whisperModel: WhisperModelType;
};
export type FastGPTFeConfigsType = {
show_emptyChat?: boolean;
show_register?: boolean;
show_appStore?: boolean;
@@ -34,6 +59,6 @@ export type SystemEnvType = {
};
declare global {
var feConfigs: FeConfigsType;
var feConfigs: FastGPTFeConfigsType;
var systemEnv: SystemEnvType;
}

View File

@@ -24,6 +24,7 @@ export type VectorModelItemType = {
defaultToken: number;
price: number;
maxToken: number;
weight: number;
};
export type ReRankModelItemType = {

View File

@@ -16,6 +16,7 @@ export const defaultVectorModels: VectorModelItemType[] = [
name: 'Embedding-2',
price: 0,
defaultToken: 500,
maxToken: 3000
maxToken: 3000,
weight: 100
}
];

View File

@@ -89,6 +89,7 @@ export type DatasetTrainingSchemaType = {
q: string;
a: string;
chunkIndex: number;
weight: number;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
};

View File

@@ -36,10 +36,11 @@ export const ContextExtractModule: FlowModuleTemplateType = {
type: FlowNodeInputTypeEnum.textarea,
valueType: ModuleIOValueTypeEnum.string,
label: '提取要求描述',
description: '给AI一些对应的背景知识或要求描述引导AI更好的完成任务',
description:
'给AI一些对应的背景知识或要求描述引导AI更好的完成任务。\n该输入框可使用全局变量。',
required: true,
placeholder:
'例如: \n1. 你是一个实验室预约助手,你的任务是帮助用户预约实验室。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
'例如: \n1. 当前时间为: {{cTime}}。你是一个实验室预约助手,你的任务是帮助用户预约实验室,从文本中获取对应的预约信息。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
showTargetInApp: true,
showTargetInPlugin: true
},

View File

@@ -2,11 +2,12 @@
"name": "@fastgpt/global",
"version": "1.0.0",
"dependencies": {
"axios": "^1.5.1",
"dayjs": "^1.11.7",
"openai": "4.23.0",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"axios": "^1.5.1",
"openai": "4.23.0",
"pdfjs-dist": "^4.0.269",
"timezones-list": "^3.0.2"
},
"devDependencies": {

View File

@@ -20,12 +20,12 @@ export async function connectMongo({
console.log('mongo start connect');
try {
mongoose.set('strictQuery', true);
const maxConnecting = Math.max(20, Number(process.env.DB_MAX_LINK || 20));
const maxConnecting = Math.max(30, Number(process.env.DB_MAX_LINK || 20));
await mongoose.connect(process.env.MONGODB_URI as string, {
bufferCommands: true,
maxConnecting: maxConnecting,
maxPoolSize: maxConnecting,
minPoolSize: Math.max(5, Math.round(Number(process.env.DB_MAX_LINK || 5) * 0.1)),
minPoolSize: 20,
connectTimeoutMS: 60000,
waitQueueTimeoutMS: 60000,
socketTimeoutMS: 60000,

View File

@@ -9,7 +9,8 @@ export const connectPg = async (): Promise<Pool> => {
global.pgClient = new Pool({
connectionString: process.env.PG_URL,
max: Number(process.env.DB_MAX_LINK || 5),
max: Number(process.env.DB_MAX_LINK || 20),
min: 10,
keepAlive: true,
idleTimeoutMillis: 60000,
connectionTimeoutMillis: 20000

View File

@@ -1,15 +1,15 @@
import { SystemConfigsTypeEnum } from '@fastgpt/global/common/system/config/constants';
import { MongoSystemConfigs } from './schema';
import { FeConfigsType } from '@fastgpt/global/common/system/types';
import { FastGPTConfigFileType } from '@fastgpt/global/common/system/types';
export const getFastGPTFeConfig = async () => {
export const getFastGPTConfigFromDB = async () => {
const res = await MongoSystemConfigs.findOne({
type: SystemConfigsTypeEnum.fastgpt
}).sort({
createTime: -1
});
const config: FeConfigsType = res?.value?.FeConfig || {};
const config = res?.value || {};
return config;
return config as Omit<FastGPTConfigFileType, 'systemEnv'>;
};

View File

@@ -22,7 +22,6 @@ const systemConfigSchema = new Schema({
});
try {
systemConfigSchema.index({ createTime: -1 }, { expireAfterSeconds: 90 * 24 * 60 * 60 });
systemConfigSchema.index({ type: 1 });
} catch (error) {
console.log(error);

View File

@@ -79,6 +79,10 @@ const TrainingDataSchema = new Schema({
type: Number,
default: 0
},
weight: {
type: Number,
default: 0
},
indexes: {
type: [
{

View File

@@ -56,7 +56,7 @@ export async function parseHeaderCert({
async function authCookieToken(cookie?: string, token?: string) {
// 获取 cookie
const cookies = Cookie.parse(cookie || '');
const cookieToken = cookies.token || token;
const cookieToken = token || cookies.token;
if (!cookieToken) {
return Promise.reject(ERROR_ENUM.unAuthorization);
@@ -127,7 +127,7 @@ export async function parseHeaderCert({
authType: AuthUserTypeEnum.apikey
};
}
if (authToken && (cookie || token)) {
if (authToken && (token || cookie)) {
// user token(from fastgpt web)
const res = await authCookieToken(cookie, token);
return {
@@ -182,7 +182,7 @@ export async function parseHeaderCert({
export const setCookie = (res: NextApiResponse, token: string) => {
res.setHeader(
'Set-Cookie',
`token=${token}; Path=/; HttpOnly; Max-Age=604800; Samesite=None; Secure;`
`token=${token}; Path=/; HttpOnly; Max-Age=604800; Samesite=Strict; Secure;`
);
};
/* clear cookie */

View File

@@ -0,0 +1,66 @@
// Options for client-side image compression before upload.
export type CompressImgProps = {
  maxW?: number; // max output width in px
  maxH?: number; // max output height in px
  maxSize?: number; // max accepted data-URL length after compression
};
/**
 * Compress a base64 image in the browser and upload it.
 *
 * Scales the image down (preserving aspect ratio) so it fits inside
 * maxW x maxH, re-encodes it through a canvas, rejects if still too large,
 * then hands the data URL to uploadController.
 *
 * @param base64Img source image as a data URL
 * @param maxW maximum output width in px
 * @param maxH maximum output height in px
 * @param maxSize rejection threshold on the compressed data-URL length
 * @param uploadController uploads the compressed data URL, resolves to its hosted src
 * @returns the uploaded image src
 */
export const compressBase64ImgAndUpload = ({
  base64Img,
  maxW = 1080,
  maxH = 1080,
  maxSize = 1024 * 500, // 500KB (the previous comment said 300kb, which was wrong)
  uploadController
}: CompressImgProps & {
  base64Img: string;
  uploadController: (base64: string) => Promise<string>;
}) => {
  return new Promise<string>((resolve, reject) => {
    // Recover the mime type from the data-URL prefix; default to jpeg.
    const fileType =
      /^data:([a-zA-Z0-9]+\/[a-zA-Z0-9-.+]+).*,/.exec(base64Img)?.[1] || 'image/jpeg';

    const img = new Image();
    img.src = base64Img;
    img.onload = async () => {
      // Uniform scale so BOTH dimensions fit their limits (never upscale).
      // The old branch-on-orientation logic could leave one side over its
      // limit when maxW !== maxH; a single factor handles every case and is
      // identical to the old behavior when maxW === maxH.
      const scale = Math.min(1, maxW / img.width, maxH / img.height);
      const width = img.width * scale;
      const height = img.height * scale;

      const canvas = document.createElement('canvas');
      canvas.width = width;
      canvas.height = height;
      const ctx = canvas.getContext('2d');
      if (!ctx) {
        return reject('压缩图片异常');
      }
      ctx.drawImage(img, 0, 0, width, height);
      const compressedDataUrl = canvas.toDataURL(fileType, 1);
      // Drop the scratch canvas element.
      canvas.remove();

      if (compressedDataUrl.length > maxSize) {
        return reject('图片太大了');
      }

      try {
        const src = await uploadController(compressedDataUrl);
        resolve(src);
      } catch (error) {
        reject(error);
      }
    };
    img.onerror = reject;
  });
};

View File

@@ -0,0 +1,53 @@
import { uploadMarkdownBase64 } from '@fastgpt/global/common/string/markdown';
import { htmlStr2Md } from '../string/markdown';
/**
* read file raw text
*/
/**
 * Read a browser File object as plain text.
 *
 * Resolves with the file content; rejects with 'Read file error' when the
 * FileReader fails, or with the thrown error if setup itself throws.
 *
 * @param file the File to read (FileReader.readAsText decodes as UTF-8 by default)
 */
export const readFileRawText = (file: File) => {
  // Promise<string> generic instead of annotating the resolve parameter —
  // same runtime behavior, idiomatic TypeScript.
  return new Promise<string>((resolve, reject) => {
    try {
      const reader = new FileReader();
      reader.onload = () => {
        resolve(reader.result as string);
      };
      reader.onerror = (err) => {
        console.log('error txt read:', err);
        reject('Read file error');
      };
      reader.readAsText(file);
    } catch (error) {
      reject(error);
    }
  });
};
/**
 * Read a markdown File and re-host any inline base64 images it contains.
 *
 * @param file markdown file to read
 * @param uploadImgController uploads one base64 data URL, resolves to its hosted URL
 * @returns the processed markdown text
 */
export const readMdFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController: (base64: string) => Promise<string>;
}) => {
  const markdown = await readFileRawText(file);
  return uploadMarkdownBase64({ rawText: markdown, uploadImgController });
};
/**
 * Read an HTML File, convert it to markdown, and re-host any inline base64
 * images the conversion produced.
 *
 * @param file HTML file to read
 * @param uploadImgController uploads one base64 data URL, resolves to its hosted URL
 * @returns the processed markdown text
 */
export const readHtmlFile = async ({
  file,
  uploadImgController
}: {
  file: File;
  uploadImgController: (base64: string) => Promise<string>;
}) => {
  const html = await readFileRawText(file);
  const markdown = htmlStr2Md(html);
  return uploadMarkdownBase64({ rawText: markdown, uploadImgController });
};

View File

@@ -2,6 +2,7 @@
"name": "@fastgpt/web",
"version": "1.0.0",
"dependencies": {
"@fastgpt/global": "workspace:*",
"joplin-turndown-plugin-gfm": "^1.0.12",
"turndown": "^7.1.2"
},