Files
FastGPT/packages/global/common/string/markdown.ts
Archer e25d7efb5b feature: V4.11.1 (#5350)
* perf: system toolset & mcp (#5200)

* feat: support system toolset

* fix: type

* fix: system tool config

* chore: mcptool config migrate

* refactor: mcp toolset

* fix: fe type error

* fix: type error

* fix: show version

* chore: support extract tool's secretInputConfig out of inputs

* chore: compatible with old version mcp

* chore: adjust

* deps: update dependency @fastgpt-skd/plugin

* fix: version

* fix: some bug (#5316)

* chore: compatible with old version mcp

* fix: version

* fix: compatible bug

* fix: mcp object params

* fix: type error

* chore: update test cases

* chore: remove log

* fix: toolset node name

* optimize app logs sort (#5310)

* log keys config modal

* multiple select

* api

* fontsize

* code

* chatid

* fix build

* fix

* fix component

* change name

* log keys config

* fix

* delete unused

* fix

* perf: log code

* perf: send auth code modal enter press

* fix log (#5328)

* perf: mcp toolset comment

* perf: log ui

* remove log (#5347)

* doc

* fix: action

* remove log

* fix: Table Optimization (#5319)

* feat: table test: 1

* feat: table test: 2

* feat: table test: 3

* feat: table test: 4

* feat: table test : 5 把maxSize改回chunkSize

* feat: table test : 6 都删了,只看maxSize

* feat: table test : 7 恢复初始,接下来删除标签功能

* feat: table test : 8 删除标签功能

* feat: table test : 9 删除标签功能成功

* feat: table test : 10 继续调试,修改trainingStates

* feat: table test : 11 修改第一步

* feat: table test : 12 修改第二步

* feat: table test : 13 修改了HtmlTable2Md

* feat: table test : 14 修改表头分块规则

* feat: table test : 15 前面表格分的太细了

* feat: table test : 16 改着改着表头又不加了

* feat: table test : 17 用CUSTOM_SPLIT_SIGN不行,重新改

* feat: table test : 18 表头仍然还会多加,但现在分块搞的合理了终于

* feat: table test : 19 还是需要搞好表头问题,先保存一下调试情况

* feat: table test : 20 调试结束,看一下replace有没有问题,没问题就pr

* feat: table test : 21 先把注释删了

* feat: table test : 21 注释replace都改了,下面切main分支看看情况

* feat: table test : 22 修改旧文件

* feat: table test : 23 修改测试文件

* feat: table test : 24 xlsx表格处理

* feat: table test : 25 刚才没保存先com了

* feat: table test : 26 fix

* feat: table test : 27 先com一版调试

* feat: table test : 28 试试放format2csv里

* feat: table test : 29 xlsx解决

* feat: table test : 30 tablesplit解决

* feat: table test : 31

* feat: table test : 32

* perf: table split

* perf: mcp old version compatibility (#5342)

* fix: system-tool secret inputs

* fix: rewrite runtime node i18n for system tool

* perf: mcp old version compatibility

* fix: splitPluginId

* fix: old mcp toolId

* fix: filter secret key

* feat: support system toolset activation

* chore: remove log

* perf: mcp update

* perf: rewrite toolset

* fix:delete variable id (#5335)

* perf: variable update

* fix: multiple select ui

* perf: model config move to plugin

* fix: var conflit

* perf: variable checker

* Avoid empty number

* update doc time

* fix: test

* fix: mcp object

* update count app

* update count app

---------

Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: heheer <zhiyu44@qq.com>
Co-authored-by: colnii <1286949794@qq.com>
Co-authored-by: dreamer6680 <1468683855@qq.com>
2025-08-01 16:08:20 +08:00

199 lines
5.5 KiB
TypeScript

import { batchRun } from '../system/utils';
import { getNanoid, simpleText } from './tools';
import type { ImageType } from '../../../service/worker/readFile/type';
/* Delete redundant text in markdown */
export const simpleMarkdownText = (rawText: string) => {
rawText = simpleText(rawText);
// Remove a line feed from a hyperlink or picture
rawText = rawText.replace(/\[([^\]]+)\]\((.+?)\)/g, (match, linkText, url) => {
const cleanedLinkText = linkText.replace(/\n/g, ' ').trim();
if (!url) {
return '';
}
return `[${cleanedLinkText}](${url})`;
});
// replace special #\.* ……
const reg1 = /\\([#`!*()+-_\[\]{}\\.])/g;
if (reg1.test(rawText)) {
rawText = rawText.replace(reg1, '$1');
}
// replace \\n
rawText = rawText.replace(/\\\\n/g, '\\n');
// Remove headings and code blocks front spaces
['####', '###', '##', '#', '```', '~~~'].forEach((item, i) => {
const reg = new RegExp(`\\n\\s*${item}`, 'g');
if (reg.test(rawText)) {
rawText = rawText.replace(new RegExp(`(\\n)( *)(${item})`, 'g'), '$1$3');
}
});
return rawText.trim();
};
export const htmlTable2Md = (content: string): string => {
return content.replace(/<table>[\s\S]*?<\/table>/g, (htmlTable) => {
try {
// Clean up whitespace and newlines
const cleanHtml = htmlTable.replace(/\n\s*/g, '');
const rows = cleanHtml.match(/<tr>(.*?)<\/tr>/g);
if (!rows) return htmlTable;
// Parse table data
let tableData: string[][] = [];
let maxColumns = 0;
// Try to convert to markdown table
rows.forEach((row, rowIndex) => {
if (!tableData[rowIndex]) {
tableData[rowIndex] = [];
}
let colIndex = 0;
const cells = row.match(/<td[^>]*\/>|<td[^>]*>.*?<\/td>/g) || [];
cells.forEach((cell) => {
while (tableData[rowIndex][colIndex]) {
colIndex++;
}
const colspan = parseInt(cell.match(/colspan="(\d+)"/)?.[1] || '1');
const rowspan = parseInt(cell.match(/rowspan="(\d+)"/)?.[1] || '1');
let content = '';
if (cell.endsWith('/>')) {
content = '';
} else {
content = cell.replace(/<td[^>]*>|<\/td>/g, '').trim();
}
for (let i = 0; i < rowspan; i++) {
for (let j = 0; j < colspan; j++) {
if (!tableData[rowIndex + i]) {
tableData[rowIndex + i] = [];
}
tableData[rowIndex + i][colIndex + j] = i === 0 && j === 0 ? content : '^^';
}
}
colIndex += colspan;
maxColumns = Math.max(maxColumns, colIndex);
});
for (let i = 0; i < maxColumns; i++) {
if (!tableData[rowIndex][i]) {
tableData[rowIndex][i] = ' ';
}
}
});
const chunks: string[] = [];
const headerCells = tableData[0]
.slice(0, maxColumns)
.map((cell) => (cell === '^^' ? ' ' : cell || ' '));
const headerRow = '| ' + headerCells.join(' | ') + ' |';
chunks.push(headerRow);
const separator = '| ' + Array(headerCells.length).fill('---').join(' | ') + ' |';
chunks.push(separator);
tableData.slice(1).forEach((row) => {
const paddedRow = row
.slice(0, maxColumns)
.map((cell) => (cell === '^^' ? ' ' : cell || ' '));
while (paddedRow.length < maxColumns) {
paddedRow.push(' ');
}
chunks.push('| ' + paddedRow.join(' | ') + ' |');
});
return chunks.join('\n');
} catch (error) {
return htmlTable;
}
});
};
/**
* format markdown
* 1. upload base64
* 2. replace \
*/
export const uploadMarkdownBase64 = async ({
rawText,
uploadImgController
}: {
rawText: string;
uploadImgController?: (base64: string) => Promise<string>;
}) => {
if (uploadImgController) {
// match base64, upload and replace it
const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
const base64Arr = rawText.match(base64Regex) || [];
// upload base64 and replace it
await batchRun(
base64Arr,
async (base64Img) => {
try {
const str = await uploadImgController(base64Img);
rawText = rawText.replace(base64Img, str);
} catch (error) {
rawText = rawText.replace(base64Img, '');
rawText = rawText.replace(/!\[.*\]\(\)/g, '');
}
},
20
);
}
// Remove white space on both sides of the picture
// const trimReg = /(!\[.*\]\(.*\))\s*/g;
// if (trimReg.test(rawText)) {
// rawText = rawText.replace(trimReg, '$1');
// }
return rawText;
};
export const markdownProcess = async ({
rawText,
uploadImgController
}: {
rawText: string;
uploadImgController?: (base64: string) => Promise<string>;
}) => {
const imageProcess = await uploadMarkdownBase64({
rawText,
uploadImgController
});
return simpleMarkdownText(imageProcess);
};
export const matchMdImg = (text: string) => {
const base64Regex = /!\[([^\]]*)\]\((data:image\/[^;]+;base64[^)]+)\)/g;
const imageList: ImageType[] = [];
text = text.replace(base64Regex, (match, altText, base64Url) => {
const uuid = `IMAGE_${getNanoid(12)}_IMAGE`;
const mime = base64Url.split(';')[0].split(':')[1];
const base64 = base64Url.split(',')[1];
imageList.push({
uuid,
base64,
mime
});
// 保持原有的 alt 文本,只替换 base64 部分
return `![${altText}](${uuid})`;
});
return {
text,
imageList
};
};