Files
FastGPT/packages/service/common/string/cheerio.ts
T
Archer c93c3937e1 S3 sdk (#6215)
* refactor: fastgpt object storage & global proxy (#6155)

* feat: migrate to fastgpt storage sdk

* chore: rename env variable

* chore: move to sdk dir

* docs: object storage

* CHORE

* chore: storage mocks

* chore: update docker-compose

* fix: global proxy agent

* fix: update COS proxy

* refactor: use fetch instead of http.request

* fix: axios request base url

* fix: axios proxy request behavior

* fix: bumps axios

* fix: patch axios for proxy

* fix: replace axios with proxied axios

* fix: upload txt file encoding

* clean code

* fix: use "minio" for minio adapter (#6205)

* fix: use minio client to delete files when using minio vendor (#6206)

* doc

* feat: filter citations and add response button control (#6170)

* feat: filter citations and add response button control

* i18n

* fix

* fix test

* perf: chat api code

* fix: workflow edge overlap and auto-align in folded loop nodes (#6204)

* fix: workflow edge overlap and auto-align in folded loop nodes

* sort

* fix

* fix edge

* fix icon

* perf: s3 file name

* perf: admin get app api

* perf: catch user error

* fix: refactor useOrg hook to use debounced search key (#6180)

* chore: comment minio adapter (#6207)

* chore: filename with suffix random id

* perf: s3 storage code

* fix: encode filename when copy object

---------

Co-authored-by: archer <545436317@qq.com>

* fix: node card link

* json

* perf: chat index;

* index

* chat item soft delete (#6216)

* chat item soft delete

* temp

* fix

* remove code

* perf: delete chat item

---------

Co-authored-by: archer <545436317@qq.com>

* feat: select whether to filter sensitive info when exporting apps (#6222)

* fix some bugs (#6210)

* fix v4.14.5 bugs

* type

* fix

* fix

* custom feedback

* fix

* code

* fix

* remove invalid function

---------

Co-authored-by: archer <545436317@qq.com>

* perf: test

* fix file default local upload (#6223)

* docs: improve object storage introduction (#6224)

* doc

---------

Co-authored-by: roy <whoeverimf5@gmail.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
2026-01-09 18:25:02 +08:00

126 lines
2.9 KiB
TypeScript

import { type UrlFetchParams, type UrlFetchResponse } from '@fastgpt/global/common/file/api';
import * as cheerio from 'cheerio';
import { axios } from '../api/axios';
import { htmlToMarkdown } from './utils';
import { isInternalAddress } from '../system/utils';
/**
 * Extract the HTML of the selected region of an already-loaded page and clean it up
 * so it can be converted to markdown.
 *
 * Steps, in order:
 *  1. remove every `i`, `script` and `style` element inside the selection;
 *  2. remove anchors that have neither text nor child elements;
 *  3. rewrite `href` (anchors) and `src` (img/video/source/audio/iframe) so that
 *     relative references become absolute URLs based on `fetchUrl`.
 *
 * Note: the loaded document behind `$` is mutated in place.
 *
 * @param fetchUrl - Absolute URL the document was fetched from; used as the base for
 *   resolving relative references. `new URL(fetchUrl)` throws if it is not a valid URL.
 * @param $ - Cheerio instance holding the loaded document.
 * @param selector - Optional CSS selector of the region to extract; falls back to `body`.
 * @returns `html` (inner HTML of every matched element joined with `\n`), `title`
 *   (`<title>` text, else the first `h1` text, else `fetchUrl`) and `usedSelector`.
 */
export const cheerioToHtml = ({
  fetchUrl,
  $,
  selector
}: {
  fetchUrl: string;
  $: cheerio.CheerioAPI;
  selector?: string;
}) => {
  // Parse the page URL once; `protocol` is "http:" or "https:".
  const { origin: originUrl, protocol } = new URL(fetchUrl);

  const usedSelector = selector || 'body';
  const selectDom = $(usedSelector);

  // Turn a link/media reference into an absolute URL where possible.
  const toAbsoluteUrl = (value: string): string => {
    // Protocol-relative: reuse the protocol of the fetched page.
    if (value.startsWith('//')) return protocol + value;
    // Root-relative: prefix the origin of the fetched page.
    if (value.startsWith('/')) return originUrl + value;
    // In-page anchors and anything that already carries a scheme
    // (https:, mailto:, javascript:, data:, ...) are left untouched.
    if (value.startsWith('#') || /^[a-zA-Z][a-zA-Z\d+.-]*:/.test(value)) return value;
    // Document-relative reference (e.g. "guide/intro.html", "../a.png", "?page=2"):
    // resolve against the page URL; keep the raw value if it cannot be parsed.
    try {
      return new URL(value, fetchUrl).href;
    } catch {
      return value;
    }
  };

  // Apply `toAbsoluteUrl` to one attribute of every element matching `elements`.
  const absolutizeAttr = (elements: string, attrName: string) => {
    selectDom.find(elements).each((_index, el) => {
      const node = $(el);
      const current = node.attr(attrName);
      if (!current) return;

      const absolute = toAbsoluteUrl(current);
      if (absolute !== current) {
        node.attr(attrName, absolute);
      }
    });
  };

  // Remove icon, script and style elements
  selectDom.find('i,script,style').remove();

  // Remove anchors with no text and no child elements
  selectDom
    .find('a')
    .filter((_index, el) => {
      return $(el).text().trim() === '' && $(el).children().length === 0;
    })
    .remove();

  // Make link and media references absolute
  absolutizeAttr('a', 'href');
  absolutizeAttr('img, video, source, audio, iframe', 'src');

  const html = selectDom
    .map((_index, dom) => {
      return $(dom).html();
    })
    .get()
    .join('\n');

  const title = $('head title').text() || $('h1:first').text() || fetchUrl;

  return {
    html,
    title,
    usedSelector
  };
};
/**
 * Fetch a list of web pages and convert the selected part of each page to markdown.
 *
 * Entries that are not plain `http://` / `https://` URLs (or that contain a space or a
 * double quote) are dropped up front. The remaining URLs are processed concurrently and
 * each yields exactly one result entry, in the same order:
 *  - URLs reported as internal by `isInternalAddress` are not requested and carry the
 *    content `'Cannot fetch internal url'`;
 *  - a failed request or conversion is logged and yields empty `title`/`content`/`selector`;
 *  - otherwise the entry holds the page title, the markdown and the selector actually used.
 *
 * @param urlList - Candidate URLs.
 * @param selector - Optional CSS selector forwarded to `cheerioToHtml`.
 * @returns One result object per accepted URL; this function itself never rejects
 *   because of a single page failing.
 */
export const urlsFetch = async ({
  urlList,
  selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
  const httpUrlPattern = /^(http|https):\/\/[^ "]+$/;

  // Load one page and turn it into a result entry; never throws.
  const fetchSingleUrl = async (url: string) => {
    if (isInternalAddress(url)) {
      return {
        url,
        title: '',
        content: 'Cannot fetch internal url',
        selector: ''
      };
    }

    try {
      const { data } = await axios.get(url, {
        timeout: 30000
      });

      const page = cheerioToHtml({
        fetchUrl: url,
        $: cheerio.load(data),
        selector
      });
      const markdown = await htmlToMarkdown(page.html);

      return {
        url,
        title: page.title,
        content: markdown,
        selector: page.usedSelector
      };
    } catch (error) {
      console.log(error, 'fetch error');

      return {
        url,
        title: '',
        content: '',
        selector: ''
      };
    }
  };

  const validUrls = urlList.filter((url) => httpUrlPattern.test(url));

  return Promise.all(validUrls.map((url) => fetchSingleUrl(url)));
};
/**
 * Parse an HTML string with cheerio.
 *
 * @param content - Markup to load.
 * @returns A promise resolving to the value returned by `cheerio.load(content)`.
 */
export const loadContentByCheerio = async (content: string) => {
  return cheerio.load(content);
};