This commit is contained in:
Archer
2023-12-27 11:07:39 +08:00
committed by GitHub
parent 86286efb54
commit 759a2330e6
182 changed files with 3099 additions and 81685 deletions

View File

@@ -0,0 +1,62 @@
/* read file to txt */
import * as pdfjsLib from 'pdfjs-dist';
export const readPdfFile = async ({ pdf }: { pdf: string | URL | ArrayBuffer }) => {
pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';
type TokenType = {
str: string;
dir: string;
width: number;
height: number;
transform: number[];
fontName: string;
hasEOL: boolean;
};
const readPDFPage = async (doc: any, pageNo: number) => {
const page = await doc.getPage(pageNo);
const tokenizedText = await page.getTextContent();
const viewport = page.getViewport({ scale: 1 });
const pageHeight = viewport.height;
const headerThreshold = pageHeight * 0.07; // 假设页头在页面顶部5%的区域内
const footerThreshold = pageHeight * 0.93; // 假设页脚在页面底部5%的区域内
const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
return (
!token.transform ||
(token.transform[5] > headerThreshold && token.transform[5] < footerThreshold)
);
});
// concat empty string 'hasEOL'
for (let i = 0; i < pageTexts.length; i++) {
const item = pageTexts[i];
if (item.str === '' && pageTexts[i - 1]) {
pageTexts[i - 1].hasEOL = item.hasEOL;
pageTexts.splice(i, 1);
i--;
}
}
page.cleanup();
return pageTexts
.map((token) => {
const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
return paragraphEnd ? `${token.str}\n` : token.str;
})
.join('');
};
const doc = await pdfjsLib.getDocument(pdf).promise;
const pageTextPromises = [];
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
pageTextPromises.push(readPDFPage(doc, pageNo));
}
const pageTexts = await Promise.all(pageTextPromises);
return pageTexts.join('');
};

View File

@@ -34,3 +34,41 @@ export const simpleMarkdownText = (rawText: string) => {
return rawText.trim();
};
/**
* format markdown
* 1. upload base64
* 2. replace \
*/
export const uploadMarkdownBase64 = async ({
rawText,
uploadImgController
}: {
rawText: string;
uploadImgController: (base64: string) => Promise<string>;
}) => {
// match base64, upload and replace it
const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
const base64Arr = rawText.match(base64Regex) || [];
// upload base64 and replace it
await Promise.all(
base64Arr.map(async (base64Img) => {
try {
const str = await uploadImgController(base64Img);
rawText = rawText.replace(base64Img, str);
} catch (error) {
rawText = rawText.replace(base64Img, '');
rawText = rawText.replace(/!\[.*\]\(\)/g, '');
}
})
);
// Remove white space on both sides of the picture
const trimReg = /(!\[.*\]\(.*\))\s*/g;
if (trimReg.test(rawText)) {
rawText = rawText.replace(trimReg, '$1');
}
return simpleMarkdownText(rawText);
};

View File

@@ -31,7 +31,7 @@ export const splitText2Chunks = (props: {
// The larger maxLen is, the next sentence is less likely to trigger splitting
const stepReges: { reg: RegExp; maxLen: number }[] = [
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
...customReg.map((text) => ({ reg: new RegExp(`(${text})`, 'g'), maxLen: chunkLen * 1.4 })),
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
@@ -64,13 +64,22 @@ export const splitText2Chunks = (props: {
}
];
}
const isCustomSteep = checkIsCustomStep(step);
const isMarkdownSplit = checkIsMarkdownSplit(step);
const independentChunk = checkIndependentChunk(step);
const { reg } = stepReges[step];
const splitTexts = text
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
.replace(
reg,
(() => {
if (isCustomSteep) return splitMarker;
if (independentChunk) return `${splitMarker}$1`;
return `$1${splitMarker}`;
})()
)
.split(`${splitMarker}`)
.filter((part) => part.trim());
@@ -128,11 +137,6 @@ export const splitText2Chunks = (props: {
const independentChunk = checkIndependentChunk(step);
const isCustomStep = checkIsCustomStep(step);
// mini text
if (text.length <= chunkLen) {
return [text];
}
// oversize
if (step >= stepReges.length) {
if (text.length < chunkLen * 3) {
@@ -221,6 +225,8 @@ export const splitText2Chunks = (props: {
} else {
chunks.push(`${mdTitle}${lastText}`);
}
} else if (lastText && chunks.length === 0) {
chunks.push(lastText);
}
return chunks;

View File

@@ -1,4 +1,29 @@
export type FeConfigsType = {
import type {
ChatModelItemType,
FunctionModelItemType,
LLMModelItemType,
VectorModelItemType,
AudioSpeechModels,
WhisperModelType,
ReRankModelItemType
} from '../../../core/ai/model.d';
/* fastgpt main */
export type FastGPTConfigFileType = {
feConfigs: FastGPTFeConfigsType;
systemEnv: SystemEnvType;
chatModels: ChatModelItemType[];
qaModels: LLMModelItemType[];
cqModels: FunctionModelItemType[];
extractModels: FunctionModelItemType[];
qgModels: LLMModelItemType[];
vectorModels: VectorModelItemType[];
reRankModels: ReRankModelItemType[];
audioSpeechModels: AudioSpeechModelType[];
whisperModel: WhisperModelType;
};
export type FastGPTFeConfigsType = {
show_emptyChat?: boolean;
show_register?: boolean;
show_appStore?: boolean;
@@ -34,6 +59,6 @@ export type SystemEnvType = {
};
declare global {
var feConfigs: FeConfigsType;
var feConfigs: FastGPTFeConfigsType;
var systemEnv: SystemEnvType;
}

View File

@@ -24,6 +24,7 @@ export type VectorModelItemType = {
defaultToken: number;
price: number;
maxToken: number;
weight: number;
};
export type ReRankModelItemType = {

View File

@@ -16,6 +16,7 @@ export const defaultVectorModels: VectorModelItemType[] = [
name: 'Embedding-2',
price: 0,
defaultToken: 500,
maxToken: 3000
maxToken: 3000,
weight: 100
}
];

View File

@@ -89,6 +89,7 @@ export type DatasetTrainingSchemaType = {
q: string;
a: string;
chunkIndex: number;
weight: number;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
};

View File

@@ -36,10 +36,11 @@ export const ContextExtractModule: FlowModuleTemplateType = {
type: FlowNodeInputTypeEnum.textarea,
valueType: ModuleIOValueTypeEnum.string,
label: '提取要求描述',
description: '给AI一些对应的背景知识或要求描述引导AI更好的完成任务',
description:
'给AI一些对应的背景知识或要求描述引导AI更好的完成任务。\n该输入框可使用全局变量。',
required: true,
placeholder:
'例如: \n1. 你是一个实验室预约助手,你的任务是帮助用户预约实验室。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
'例如: \n1. 当前时间为: {{cTime}}。你是一个实验室预约助手,你的任务是帮助用户预约实验室,从文本中获取对应的预约信息。\n2. 你是谷歌搜索助手,需要从文本中提取出合适的搜索词。',
showTargetInApp: true,
showTargetInPlugin: true
},

View File

@@ -2,11 +2,12 @@
"name": "@fastgpt/global",
"version": "1.0.0",
"dependencies": {
"axios": "^1.5.1",
"dayjs": "^1.11.7",
"openai": "4.23.0",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"axios": "^1.5.1",
"openai": "4.23.0",
"pdfjs-dist": "^4.0.269",
"timezones-list": "^3.0.2"
},
"devDependencies": {