mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
4.6.3-website dataset (#532)
This commit is contained in:
8
packages/global/common/file/api.d.ts
vendored
Normal file
8
packages/global/common/file/api.d.ts
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
export type UrlFetchParams = {
|
||||
urlList: string[];
|
||||
selector?: string;
|
||||
};
|
||||
export type UrlFetchResponse = {
|
||||
url: string;
|
||||
content: string;
|
||||
}[];
|
@@ -1,3 +1,8 @@
|
||||
import axios from 'axios';
|
||||
import { UrlFetchParams, UrlFetchResponse } from './api.d';
|
||||
import { htmlToMarkdown } from '../string/markdown';
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
export const formatFileSize = (bytes: number): string => {
|
||||
if (bytes === 0) return '0 B';
|
||||
|
||||
@@ -7,3 +12,84 @@ export const formatFileSize = (bytes: number): string => {
|
||||
|
||||
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
|
||||
};
|
||||
|
||||
export const cheerioToHtml = ({
|
||||
fetchUrl,
|
||||
$,
|
||||
selector
|
||||
}: {
|
||||
fetchUrl: string;
|
||||
$: cheerio.CheerioAPI;
|
||||
selector?: string;
|
||||
}) => {
|
||||
// get origin url
|
||||
const originUrl = new URL(fetchUrl).origin;
|
||||
|
||||
// remove i element
|
||||
$('i,script').remove();
|
||||
|
||||
// remove empty a element
|
||||
$('a')
|
||||
.filter((i, el) => {
|
||||
return $(el).text().trim() === '' && $(el).children().length === 0;
|
||||
})
|
||||
.remove();
|
||||
|
||||
// if link,img startWith /, add origin url
|
||||
$('a').each((i, el) => {
|
||||
const href = $(el).attr('href');
|
||||
if (href && href.startsWith('/')) {
|
||||
$(el).attr('href', originUrl + href);
|
||||
}
|
||||
});
|
||||
$('img').each((i, el) => {
|
||||
const src = $(el).attr('src');
|
||||
if (src && src.startsWith('/')) {
|
||||
$(el).attr('src', originUrl + src);
|
||||
}
|
||||
});
|
||||
|
||||
return $(selector || 'body').html();
|
||||
};
|
||||
export const urlsFetch = async ({
|
||||
urlList,
|
||||
selector
|
||||
}: UrlFetchParams): Promise<UrlFetchResponse> => {
|
||||
urlList = urlList.filter((url) => /^(http|https):\/\/[^ "]+$/.test(url));
|
||||
|
||||
const response = (
|
||||
await Promise.all(
|
||||
urlList.map(async (url) => {
|
||||
try {
|
||||
const fetchRes = await axios.get(url, {
|
||||
timeout: 30000
|
||||
});
|
||||
|
||||
const $ = cheerio.load(fetchRes.data);
|
||||
|
||||
const md = htmlToMarkdown(
|
||||
cheerioToHtml({
|
||||
fetchUrl: url,
|
||||
$,
|
||||
selector
|
||||
})
|
||||
);
|
||||
|
||||
return {
|
||||
url,
|
||||
content: md
|
||||
};
|
||||
} catch (error) {
|
||||
console.log(error, 'fetch error');
|
||||
|
||||
return {
|
||||
url,
|
||||
content: ''
|
||||
};
|
||||
}
|
||||
})
|
||||
)
|
||||
).filter((item) => item.content);
|
||||
|
||||
return response;
|
||||
};
|
||||
|
@@ -1,4 +0,0 @@
|
||||
export type FetchResultItem = {
|
||||
url: string;
|
||||
content: string;
|
||||
};
|
1508
packages/global/common/string/jieba.ts
Normal file
1508
packages/global/common/string/jieba.ts
Normal file
File diff suppressed because it is too large
Load Diff
97
packages/global/common/string/markdown.ts
Normal file
97
packages/global/common/string/markdown.ts
Normal file
@@ -0,0 +1,97 @@
|
||||
import { simpleText } from './tools';
|
||||
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
||||
|
||||
/* Delete redundant text in markdown */
|
||||
export const simpleMarkdownText = (rawText: string) => {
|
||||
rawText = simpleText(rawText);
|
||||
|
||||
// Remove a line feed from a hyperlink or picture
|
||||
rawText = rawText.replace(/\[([^\]]+)\]\((.+?)\)/g, (match, linkText, url) => {
|
||||
const cleanedLinkText = linkText.replace(/\n/g, ' ').trim();
|
||||
|
||||
if (!url) {
|
||||
return '';
|
||||
}
|
||||
|
||||
return `[${cleanedLinkText}](${url})`;
|
||||
});
|
||||
|
||||
// replace special \.* ……
|
||||
const reg1 = /\\([-.!`_(){}\[\]])/g;
|
||||
if (reg1.test(rawText)) {
|
||||
rawText = rawText.replace(/\\([`!*()+-_\[\]{}\\.])/g, '$1');
|
||||
}
|
||||
|
||||
// replace \\n
|
||||
rawText = rawText.replace(/\\\\n/g, '\\n');
|
||||
|
||||
// Remove headings and code blocks front spaces
|
||||
['####', '###', '##', '#', '```', '~~~'].forEach((item) => {
|
||||
const reg = new RegExp(`\\n\\s*${item}`, 'g');
|
||||
if (reg.test(rawText)) {
|
||||
rawText = rawText.replace(new RegExp(`\\n\\s*(${item})`, 'g'), '\n$1');
|
||||
}
|
||||
});
|
||||
|
||||
return rawText.trim();
|
||||
};
|
||||
|
||||
/* html string to markdown */
|
||||
export const htmlToMarkdown = (html?: string | null) => {
|
||||
if (!html) return '';
|
||||
|
||||
const surround = (source: string, surroundStr: string) => `${surroundStr}${source}${surroundStr}`;
|
||||
|
||||
const nhm = new NodeHtmlMarkdown(
|
||||
{
|
||||
codeFence: '```',
|
||||
codeBlockStyle: 'fenced',
|
||||
ignore: ['i', 'script']
|
||||
},
|
||||
{
|
||||
code: ({ node, parent, options: { codeFence, codeBlockStyle }, visitor }) => {
|
||||
const isCodeBlock = ['PRE', 'WRAPPED-PRE'].includes(parent?.tagName!);
|
||||
|
||||
if (!isCodeBlock) {
|
||||
return {
|
||||
spaceIfRepeatingChar: true,
|
||||
noEscape: true,
|
||||
postprocess: ({ content }) => {
|
||||
// Find longest occurring sequence of running backticks and add one more (so content is escaped)
|
||||
const delimiter =
|
||||
'`' + (content.match(/`+/g)?.sort((a, b) => b.length - a.length)?.[0] || '');
|
||||
const padding = delimiter.length > 1 ? ' ' : '';
|
||||
|
||||
return surround(surround(content, padding), delimiter);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/* Handle code block */
|
||||
if (codeBlockStyle === 'fenced') {
|
||||
const language =
|
||||
node.getAttribute('class')?.match(/language-(\S+)/)?.[1] ||
|
||||
parent?.getAttribute('class')?.match(/language-(\S+)/)?.[1] ||
|
||||
'';
|
||||
|
||||
return {
|
||||
noEscape: true,
|
||||
prefix: `${codeFence}${language}\n`,
|
||||
postfix: `\n${codeFence}\n`,
|
||||
childTranslators: visitor.instance.codeBlockTranslators
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
noEscape: true,
|
||||
postprocess: ({ content }) => content.replace(/^/gm, ' '),
|
||||
childTranslators: visitor.instance.codeBlockTranslators
|
||||
};
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
const markdown = nhm.translate(html).trim();
|
||||
|
||||
return simpleMarkdownText(markdown);
|
||||
};
|
@@ -15,11 +15,18 @@ export const splitText2Chunks = (props: {
|
||||
}): {
|
||||
chunks: string[];
|
||||
tokens: number;
|
||||
overlapRatio?: number;
|
||||
} => {
|
||||
const { text = '', chunkLen, overlapRatio = 0.2 } = props;
|
||||
let { text = '', chunkLen, overlapRatio = 0.2 } = props;
|
||||
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
|
||||
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
|
||||
const overlapLen = Math.round(chunkLen * overlapRatio);
|
||||
|
||||
// replace code block all \n to codeBlockMarker
|
||||
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
|
||||
return match.replace(/\n/g, codeBlockMarker);
|
||||
});
|
||||
|
||||
// The larger maxLen is, the next sentence is less likely to trigger splitting
|
||||
const stepReges: { reg: RegExp; maxLen: number }[] = [
|
||||
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
|
||||
@@ -27,8 +34,8 @@ export const splitText2Chunks = (props: {
|
||||
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
|
||||
|
||||
{ reg: /([\n]{2})/g, maxLen: chunkLen * 1.4 },
|
||||
{ reg: /([\n](?![\*\-|>`0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
|
||||
{ reg: /([\n](`))/g, maxLen: chunkLen * 4 }, // code block
|
||||
{ reg: /([\n](?![\*\-|>0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
|
||||
{ reg: /([\n])/g, maxLen: chunkLen * 1.4 },
|
||||
|
||||
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.4 },
|
||||
@@ -38,9 +45,15 @@ export const splitText2Chunks = (props: {
|
||||
{ reg: /([,]|,\s)/g, maxLen: chunkLen * 2 }
|
||||
];
|
||||
|
||||
// if use markdown title split, Separate record title title
|
||||
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
|
||||
if (step >= stepReges.length) {
|
||||
return [text];
|
||||
return [
|
||||
{
|
||||
text,
|
||||
title: ''
|
||||
}
|
||||
];
|
||||
}
|
||||
const isMarkdownSplit = step <= 3;
|
||||
const { reg } = stepReges[step];
|
||||
@@ -49,7 +62,17 @@ export const splitText2Chunks = (props: {
|
||||
.replace(reg, isMarkdownSplit ? `${splitMarker}$1` : `$1${splitMarker}`)
|
||||
.split(`${splitMarker}`)
|
||||
.filter((part) => part.trim());
|
||||
return splitTexts;
|
||||
|
||||
return splitTexts
|
||||
.map((text) => {
|
||||
const matchTitle = isMarkdownSplit ? text.match(reg)?.[0] || '' : '';
|
||||
|
||||
return {
|
||||
text: isMarkdownSplit ? text.replace(matchTitle, '') : text,
|
||||
title: matchTitle
|
||||
};
|
||||
})
|
||||
.filter((item) => item.text.trim());
|
||||
};
|
||||
|
||||
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
|
||||
@@ -63,7 +86,7 @@ export const splitText2Chunks = (props: {
|
||||
let overlayText = '';
|
||||
|
||||
for (let i = splitTexts.length - 1; i >= 0; i--) {
|
||||
const currentText = splitTexts[i];
|
||||
const currentText = splitTexts[i].text;
|
||||
const newText = currentText + overlayText;
|
||||
const newTextLen = newText.length;
|
||||
|
||||
@@ -83,12 +106,16 @@ export const splitText2Chunks = (props: {
|
||||
const splitTextRecursively = ({
|
||||
text = '',
|
||||
step,
|
||||
lastText
|
||||
lastText,
|
||||
mdTitle = ''
|
||||
}: {
|
||||
text: string;
|
||||
step: number;
|
||||
lastText: string;
|
||||
mdTitle: string;
|
||||
}): string[] => {
|
||||
const isMarkdownSplit = step <= 3;
|
||||
|
||||
// mini text
|
||||
if (text.length <= chunkLen) {
|
||||
return [text];
|
||||
@@ -102,7 +129,7 @@ export const splitText2Chunks = (props: {
|
||||
// use slice-chunkLen to split text
|
||||
const chunks: string[] = [];
|
||||
for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
|
||||
chunks.push(text.slice(i, i + chunkLen));
|
||||
chunks.push(`${mdTitle}${text.slice(i, i + chunkLen)}`);
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
@@ -115,7 +142,10 @@ export const splitText2Chunks = (props: {
|
||||
|
||||
const chunks: string[] = [];
|
||||
for (let i = 0; i < splitTexts.length; i++) {
|
||||
const currentText = splitTexts[i];
|
||||
const item = splitTexts[i];
|
||||
const currentTitle = `${mdTitle}${item.title}`;
|
||||
|
||||
const currentText = item.text;
|
||||
const currentTextLen = currentText.length;
|
||||
const lastTextLen = lastText.length;
|
||||
const newText = lastText + currentText;
|
||||
@@ -125,9 +155,10 @@ export const splitText2Chunks = (props: {
|
||||
if (newTextLen > maxLen) {
|
||||
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
|
||||
if (lastTextLen > minChunkLen) {
|
||||
chunks.push(lastText);
|
||||
chunks.push(`${currentTitle}${lastText}`);
|
||||
lastText = getOneTextOverlapText({ text: lastText, step }); // next chunk will start with overlayText
|
||||
i--;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -135,11 +166,12 @@ export const splitText2Chunks = (props: {
|
||||
const innerChunks = splitTextRecursively({
|
||||
text: newText,
|
||||
step: step + 1,
|
||||
lastText: ''
|
||||
lastText: '',
|
||||
mdTitle: currentTitle
|
||||
});
|
||||
const lastChunk = innerChunks[innerChunks.length - 1];
|
||||
// last chunk is too small, concat it to lastText
|
||||
if (lastChunk.length < minChunkLen) {
|
||||
if (!isMarkdownSplit && lastChunk.length < minChunkLen) {
|
||||
chunks.push(...innerChunks.slice(0, -1));
|
||||
lastText = lastChunk;
|
||||
} else {
|
||||
@@ -156,10 +188,11 @@ export const splitText2Chunks = (props: {
|
||||
// size less than chunkLen, push text to last chunk. now, text definitely less than maxLen
|
||||
lastText = newText;
|
||||
|
||||
// If the chunk size reaches, add a chunk
|
||||
if (newTextLen >= chunkLen) {
|
||||
chunks.push(lastText);
|
||||
lastText = getOneTextOverlapText({ text: lastText, step });
|
||||
// markdown paragraph block: Direct addition; If the chunk size reaches, add a chunk
|
||||
if (isMarkdownSplit || newTextLen >= chunkLen) {
|
||||
chunks.push(`${currentTitle}${lastText}`);
|
||||
|
||||
lastText = isMarkdownSplit ? '' : getOneTextOverlapText({ text: lastText, step });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,7 +201,7 @@ export const splitText2Chunks = (props: {
|
||||
if (lastText.length < chunkLen * 0.4) {
|
||||
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
|
||||
} else {
|
||||
chunks.push(lastText);
|
||||
chunks.push(`${mdTitle}${lastText}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,8 +212,9 @@ export const splitText2Chunks = (props: {
|
||||
const chunks = splitTextRecursively({
|
||||
text,
|
||||
step: 0,
|
||||
lastText: ''
|
||||
});
|
||||
lastText: '',
|
||||
mdTitle: ''
|
||||
}).map((chunk) => chunk.replaceAll(codeBlockMarker, '\n')); // restore code block
|
||||
|
||||
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
|
||||
|
||||
|
6
packages/global/common/system/utils.ts
Normal file
6
packages/global/common/system/utils.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
export const delay = (ms: number) =>
|
||||
new Promise((resolve) => {
|
||||
setTimeout(() => {
|
||||
resolve('');
|
||||
}, ms);
|
||||
});
|
Reference in New Issue
Block a user