diff --git a/docSite/content/zh-cn/docs/development/upgrading/498.md b/docSite/content/zh-cn/docs/development/upgrading/498.md index d288914bb..5a34c15fb 100644 --- a/docSite/content/zh-cn/docs/development/upgrading/498.md +++ b/docSite/content/zh-cn/docs/development/upgrading/498.md @@ -23,6 +23,7 @@ weight: 792 1. Chat log list 优化,避免大数据时超出内存限制。 2. 预加载 token 计算 worker,避免主任务中并发创建导致线程阻塞。 3. 工作流节点版本控制交互优化。 +4. 网络获取以及 html2md 优化,支持视频和音频标签的转换。 ## 🐛 修复 diff --git a/packages/service/common/string/cheerio.ts b/packages/service/common/string/cheerio.ts index 08df3c695..763a4888d 100644 --- a/packages/service/common/string/cheerio.ts +++ b/packages/service/common/string/cheerio.ts @@ -42,7 +42,7 @@ export const cheerioToHtml = ({ } } }); - selectDom.find('img').each((i, el) => { + selectDom.find('img, video, source, audio, iframe').each((i, el) => { const src = $(el).attr('src'); if (src) { if (src.startsWith('//')) { diff --git a/packages/service/worker/htmlStr2Md/utils.ts b/packages/service/worker/htmlStr2Md/utils.ts index f1c690640..4445f1143 100644 --- a/packages/service/worker/htmlStr2Md/utils.ts +++ b/packages/service/worker/htmlStr2Md/utils.ts @@ -43,6 +43,24 @@ export const html2md = ( turndownService.remove(['i', 'script', 'iframe', 'style']); turndownService.use(turndownPluginGfm.gfm); + // add custom handling for media tag + turndownService.addRule('media', { + filter: ['video', 'source', 'audio'], + replacement: function (content, node) { + const mediaNode = node as HTMLVideoElement | HTMLAudioElement | HTMLSourceElement; + const src = mediaNode.getAttribute('src'); + const sources = mediaNode.getElementsByTagName('source'); + const firstSourceSrc = sources.length > 0 ? sources[0].getAttribute('src') : null; + const mediaSrc = src || firstSourceSrc; + + if (mediaSrc) { + return `[${mediaSrc}](${mediaSrc}) `; + } + + return content; + } + }); + // Base64 img to id, otherwise it will occupy memory when going to md const { processedHtml, images } = processBase64Images(html); const md = turndownService.turndown(processedHtml);