This commit is contained in:
Archer
2023-12-18 16:24:50 +08:00
committed by GitHub
parent d33c99f564
commit 703583fff7
130 changed files with 3418 additions and 2579 deletions

View File

@@ -1,8 +1,3 @@
import axios from 'axios';
import { UrlFetchParams, UrlFetchResponse } from './api.d';
import { htmlToMarkdown } from '../string/markdown';
import * as cheerio from 'cheerio';
export const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
@@ -12,91 +7,3 @@ export const formatFileSize = (bytes: number): string => {
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
};
export const cheerioToHtml = ({
fetchUrl,
$,
selector
}: {
fetchUrl: string;
$: cheerio.CheerioAPI;
selector?: string;
}) => {
// get origin url
const originUrl = new URL(fetchUrl).origin;
// remove i element
$('i,script').remove();
// remove empty a element
$('a')
.filter((i, el) => {
return $(el).text().trim() === '' && $(el).children().length === 0;
})
.remove();
// if link,img startWith /, add origin url
$('a').each((i, el) => {
const href = $(el).attr('href');
if (href && href.startsWith('/')) {
$(el).attr('href', originUrl + href);
}
});
$('img').each((i, el) => {
const src = $(el).attr('src');
if (src && src.startsWith('/')) {
$(el).attr('src', originUrl + src);
}
});
const html = $(selector || 'body')
.map((item, dom) => {
return $(dom).html();
})
.get()
.join('\n');
return html;
};
export const urlsFetch = async ({
urlList,
selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
urlList = urlList.filter((url) => /^(http|https):\/\/[^ "]+$/.test(url));
const response = (
await Promise.all(
urlList.map(async (url) => {
try {
const fetchRes = await axios.get(url, {
timeout: 30000
});
const $ = cheerio.load(fetchRes.data);
const md = htmlToMarkdown(
cheerioToHtml({
fetchUrl: url,
$,
selector
})
);
return {
url,
content: md
};
} catch (error) {
console.log(error, 'fetch error');
return {
url,
content: ''
};
}
})
)
).filter((item) => item.content);
return response;
};

View File

@@ -1,5 +1,4 @@
import { simpleText } from './tools';
import { NodeHtmlMarkdown } from 'node-html-markdown';
/* Delete redundant text in markdown */
export const simpleMarkdownText = (rawText: string) => {
@@ -27,75 +26,11 @@ export const simpleMarkdownText = (rawText: string) => {
// Remove headings and code blocks front spaces
['####', '###', '##', '#', '```', '~~~'].forEach((item, i) => {
const isMarkdown = i <= 3;
const reg = new RegExp(`\\n\\s*${item}`, 'g');
if (reg.test(rawText)) {
rawText = rawText.replace(
new RegExp(`(\\n)\\s*(${item})`, 'g'),
isMarkdown ? '\n$1$2' : '$1$2'
);
rawText = rawText.replace(new RegExp(`(\\n)\\s*(${item})`, 'g'), '$1$2');
}
});
return rawText.trim();
};
/* html string to markdown */
export const htmlToMarkdown = (html?: string | null) => {
if (!html) return '';
const surround = (source: string, surroundStr: string) => `${surroundStr}${source}${surroundStr}`;
const nhm = new NodeHtmlMarkdown(
{
codeFence: '```',
codeBlockStyle: 'fenced',
ignore: ['i', 'script']
},
{
code: ({ node, parent, options: { codeFence, codeBlockStyle }, visitor }) => {
const isCodeBlock = ['PRE', 'WRAPPED-PRE'].includes(parent?.tagName!);
if (!isCodeBlock) {
return {
spaceIfRepeatingChar: true,
noEscape: true,
postprocess: ({ content }) => {
// Find longest occurring sequence of running backticks and add one more (so content is escaped)
const delimiter =
'`' + (content.match(/`+/g)?.sort((a, b) => b.length - a.length)?.[0] || '');
const padding = delimiter.length > 1 ? ' ' : '';
return surround(surround(content, padding), delimiter);
}
};
}
/* Handle code block */
if (codeBlockStyle === 'fenced') {
const language =
node.getAttribute('class')?.match(/language-(\S+)/)?.[1] ||
parent?.getAttribute('class')?.match(/language-(\S+)/)?.[1] ||
'';
return {
noEscape: true,
prefix: `${codeFence}${language}\n`,
postfix: `\n${codeFence}\n`,
childTranslators: visitor.instance.codeBlockTranslators
};
}
return {
noEscape: true,
postprocess: ({ content }) => content.replace(/^/gm, ' '),
childTranslators: visitor.instance.codeBlockTranslators
};
}
}
);
const markdown = nhm.translate(html).trim();
return simpleMarkdownText(markdown);
};

View File

@@ -13,12 +13,13 @@ export const splitText2Chunks = (props: {
chunkLen: number;
overlapRatio?: number;
customReg?: string[];
countTokens?: boolean;
}): {
chunks: string[];
tokens: number;
overlapRatio?: number;
} => {
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [], countTokens = true } = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio);
@@ -233,7 +234,9 @@ export const splitText2Chunks = (props: {
mdTitle: ''
}).map((chunk) => chunk.replaceAll(codeBlockMarker, '\n')); // restore code block
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
const tokens = countTokens
? chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0)
: 0;
return {
chunks,

View File

@@ -40,7 +40,7 @@ export type ChatItemSchema = {
value: string;
userGoodFeedback?: string;
userBadFeedback?: string;
robotBadFeedback?: string;
customFeedbacks?: string[];
adminFeedback?: AdminFbkType;
[ModuleOutputKeyEnum.responseData]?: ChatHistoryItemResType[];
};
@@ -60,6 +60,7 @@ export type ChatItemType = {
value: any;
userGoodFeedback?: string;
userBadFeedback?: string;
customFeedbacks?: ChatItemSchema['customFeedbacks'];
adminFeedback?: ChatItemSchema['feedback'];
[ModuleOutputKeyEnum.responseData]?: ChatHistoryItemResType[];
};

View File

@@ -5,12 +5,14 @@ export type SelectedDatasetType = { datasetId: string; vectorModel: VectorModelI
export type HttpBodyType<T = any> = {
appId: string;
chatId?: string;
responseChatItemId?: string;
variables: Record<string, any>;
data: T;
};
export type HttpQueryType = {
appId: string;
chatId?: string;
responseChatItemId?: string;
variables: Record<string, any>;
[key: string]: any;
};

View File

@@ -23,5 +23,6 @@ export type PluginTemplateType = {
name: string;
avatar: string;
intro: string;
showStatus?: boolean;
modules: ModuleItemType[];
};

View File

@@ -2,17 +2,14 @@
"name": "@fastgpt/global",
"version": "1.0.0",
"dependencies": {
"axios": "^1.5.1",
"cheerio": "1.0.0-rc.12",
"dayjs": "^1.11.7",
"openai": "4.23.0",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"node-html-markdown": "^1.3.0",
"openai": "^4.20.1",
"axios": "^1.5.1",
"timezones-list": "^3.0.2"
},
"devDependencies": {
"@types/node": "^20.8.5",
"@types/turndown": "^5.0.4"
"@types/node": "^20.8.5"
}
}