mirror of
https://github.com/labring/FastGPT.git
synced 2026-05-07 01:02:55 +08:00
c93c3937e1
* refactor: fastgpt object storage & global proxy (#6155) * feat: migrate to fastgpt storage sdk * chore: rename env variable * chore: move to sdk dir * docs: object storage * CHORE * chore: storage mocks * chore: update docker-compose * fix: global proxy agent * fix: update COS proxy * refactor: use fetch instead of http.request * fix: axios request base url * fix: axios proxy request behavior * fix: bumps axios * fix: patch axios for proxy * fix: replace axios with proxied axios * fix: upload txt file encoding * clean code * fix: use "minio" for minio adapter (#6205) * fix: use minio client to delete files when using minio vendor (#6206) * doc * feat: filter citations and add response button control (#6170) * feat: filter citations and add response button control * i18n * fix * fix test * perf: chat api code * fix: workflow edge overlap and auto-align in folded loop nodes (#6204) * fix: workflow edge overlap and auto-align in folded loop nodes * sort * fix * fix edge * fix icon * perf: s3 file name * perf: admin get app api * perf: catch user error * fix: refactor useOrg hook to use debounced search key (#6180) * chore: comment minio adapter (#6207) * chore: filename with suffix random id * perf: s3 storage code * fix: encode filename when copy object --------- Co-authored-by: archer <545436317@qq.com> * fix: node card link * json * perf: chat index; * index * chat item soft delete (#6216) * chat item soft delete * temp * fix * remove code * perf: delete chat item --------- Co-authored-by: archer <545436317@qq.com> * feat: select wheather filter sensitive info when export apps (#6222) * fix some bugs (#6210) * fix v4.14.5 bugs * type * fix * fix * custom feedback * fix * code * fix * remove invalid function --------- Co-authored-by: archer <545436317@qq.com> * perf: test * fix file default local upload (#6223) * docs: improve object storage introduction (#6224) * doc --------- Co-authored-by: roy <whoeverimf5@gmail.com> Co-authored-by: heheer <heheer@sealos.io> 
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
126 lines
2.9 KiB
TypeScript
import { type UrlFetchParams, type UrlFetchResponse } from '@fastgpt/global/common/file/api';
|
|
import * as cheerio from 'cheerio';
|
|
import { axios } from '../api/axios';
|
|
import { htmlToMarkdown } from './utils';
|
|
import { isInternalAddress } from '../system/utils';
|
|
|
|
/**
 * Extract the HTML of the selected region of a parsed page and prepare it for
 * further conversion.
 *
 * Steps, in order:
 * 1. Select `selector` (falls back to `body` when omitted or empty).
 * 2. Remove `<i>`, `<script>` and `<style>` descendants.
 * 3. Remove `<a>` descendants that have neither text nor child elements.
 * 4. Rewrite `href` on `<a>` and `src` on `<img>/<video>/<source>/<audio>/<iframe>`
 *    to absolute URLs.
 * 5. Join the inner HTML of every matched root element with `\n`.
 *
 * The document held by `$` is modified in place (elements removed, attributes rewritten).
 *
 * @param fetchUrl - URL the page was loaded from. It is parsed with `new URL`, so an
 *   unparseable value makes this function throw.
 * @param $ - Cheerio instance holding the parsed page.
 * @param selector - Optional CSS selector of the region to extract.
 * @returns `html` (joined inner HTML), `title` (`<title>` text, else the first `<h1>`
 *   text, else `fetchUrl`) and `usedSelector` (the selector actually applied).
 */
export const cheerioToHtml = ({
  fetchUrl,
  $,
  selector
}: {
  fetchUrl: string;
  $: cheerio.CheerioAPI;
  selector?: string;
}) => {
  // Parse the page URL once; `protocol` keeps its trailing colon (e.g. "https:").
  const { origin: originUrl, protocol } = new URL(fetchUrl);

  // Matches values that already carry their own scheme (https:, mailto:, data:, javascript:, ...).
  const schemePattern = /^[a-z][a-z0-9+.-]*:/i;

  // Turn a raw href/src value into an absolute URL where that is meaningful.
  const toAbsoluteUrl = (raw: string): string => {
    // Protocol-relative: reuse the page's protocol.
    if (raw.startsWith('//')) return protocol + raw;
    // Root-relative: prefix the page's origin.
    if (raw.startsWith('/')) return originUrl + raw;
    // In-page anchors and values with an explicit scheme are left untouched.
    if (raw.startsWith('#') || schemePattern.test(raw)) return raw;
    // Document-relative ("a/b.html", "../x.png", "?page=2"): resolve against the page URL.
    // NOTE(review): a `<base href>` in the page is not consulted here.
    try {
      return new URL(raw, fetchUrl).href;
    } catch {
      // Keep the original value if it cannot be resolved.
      return raw;
    }
  };

  const usedSelector = selector || 'body';
  const selectDom = $(usedSelector);

  // Rewrite one attribute on every matching descendant of the selection.
  const absolutize = (tags: string, attrName: 'href' | 'src') => {
    selectDom.find(tags).each((_index, el) => {
      const node = $(el);
      const raw = node.attr(attrName);
      if (!raw) return;

      const resolved = toAbsoluteUrl(raw);
      if (resolved !== raw) {
        node.attr(attrName, resolved);
      }
    });
  };

  // remove i, script and style elements
  selectDom.find('i,script,style').remove();

  // remove empty a elements (no text and no child elements)
  selectDom
    .find('a')
    .filter((_index, el) => {
      const anchor = $(el);
      return anchor.text().trim() === '' && anchor.children().length === 0;
    })
    .remove();

  // make link and media URLs absolute
  absolutize('a', 'href');
  absolutize('img, video, source, audio, iframe', 'src');

  const html = selectDom
    .map((_index, dom) => {
      return $(dom).html();
    })
    .get()
    .join('\n');

  const title = $('head title').text() || $('h1:first').text() || fetchUrl;

  return {
    html,
    title,
    usedSelector
  };
};
|
|
/**
 * Download a list of web pages and convert each one to Markdown.
 *
 * Entries that are not `http://` / `https://` URLs, or that contain a space or a
 * double quote, are dropped before any request is made. The remaining URLs are
 * processed concurrently and the results keep their order.
 *
 * Per URL:
 * - if `isInternalAddress` flags it, nothing is requested and the content is the
 *   fixed text `Cannot fetch internal url`;
 * - otherwise the page is requested with a 30000 ms timeout, reduced with
 *   `cheerioToHtml` and converted with `htmlToMarkdown`;
 * - any error on that path is logged and yields an entry with empty title,
 *   content and selector, so one failing URL never rejects the whole call.
 *
 * @param urlList - Candidate URLs.
 * @param selector - Optional CSS selector forwarded to `cheerioToHtml`.
 * @returns One `{ url, title, content, selector }` entry per accepted URL.
 */
export const urlsFetch = async ({
  urlList,
  selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
  const httpUrlPattern = /^(http|https):\/\/[^ "]+$/;
  const targets = urlList.filter((candidate) => httpUrlPattern.test(candidate));

  // Produce the result entry for a single URL; never throws.
  const fetchOne = async (url: string) => {
    if (isInternalAddress(url)) {
      return {
        url,
        title: '',
        content: 'Cannot fetch internal url',
        selector: ''
      };
    }

    try {
      const { data } = await axios.get(url, {
        timeout: 30000
      });

      const page = cheerioToHtml({
        fetchUrl: url,
        $: cheerio.load(data),
        selector
      });

      const content = await htmlToMarkdown(page.html);

      return {
        url,
        title: page.title,
        content,
        selector: page.usedSelector
      };
    } catch (error) {
      console.log(error, 'fetch error');

      return {
        url,
        title: '',
        content: '',
        selector: ''
      };
    }
  };

  const results = await Promise.all(targets.map((url) => fetchOne(url)));

  return results;
};
|
|
|
|
/**
 * Parse a markup string with cheerio.
 *
 * @param content - Markup to load.
 * @returns A promise resolving to the value returned by `cheerio.load(content)`.
 */
export const loadContentByCheerio = async (content: string) => {
  const loaded = cheerio.load(content);
  return loaded;
};
|