mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
v4.6.5 (#620)
This commit is contained in:
@@ -1,8 +1,3 @@
|
||||
import axios from 'axios';
|
||||
import { UrlFetchParams, UrlFetchResponse } from './api.d';
|
||||
import { htmlToMarkdown } from '../string/markdown';
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
export const formatFileSize = (bytes: number): string => {
|
||||
if (bytes === 0) return '0 B';
|
||||
|
||||
@@ -12,91 +7,3 @@ export const formatFileSize = (bytes: number): string => {
|
||||
|
||||
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
|
||||
};
|
||||
|
||||
/**
 * Clean a loaded page and serialise the part matched by `selector` to an HTML string.
 *
 * The clean-up is applied to the whole document held by `$`:
 *  - every `<i>` and `<script>` element is removed;
 *  - `<a>` elements with no text and no child elements are removed;
 *  - `href` (on `<a>`) and `src` (on `<img>`) values that start with `/` are made absolute.
 *
 * @param fetchUrl - URL the page was fetched from; must be parseable by `new URL`.
 * @param $ - cheerio instance holding the page. It is mutated by this call.
 * @param selector - CSS selector of the element(s) to export; defaults to `body`.
 * @returns Inner HTML of every matched element, joined with `\n`.
 */
export const cheerioToHtml = ({
  fetchUrl,
  $,
  selector
}: {
  fetchUrl: string;
  $: cheerio.CheerioAPI;
  selector?: string;
}) => {
  const { origin, protocol } = new URL(fetchUrl);

  // Root-relative links ("/a/b") get the origin; protocol-relative links ("//host/a")
  // already carry a host, so they only need the protocol. Anything else is left alone.
  const toAbsolute = (link?: string) => {
    if (!link || !link.startsWith('/')) return undefined;
    return link.startsWith('//') ? protocol + link : origin + link;
  };

  // remove i and script elements
  $('i,script').remove();

  // remove empty a elements
  $('a')
    .filter((_, el) => $(el).text().trim() === '' && $(el).children().length === 0)
    .remove();

  $('a').each((_, el) => {
    const absolute = toAbsolute($(el).attr('href'));
    if (absolute) {
      $(el).attr('href', absolute);
    }
  });
  $('img').each((_, el) => {
    const absolute = toAbsolute($(el).attr('src'));
    if (absolute) {
      $(el).attr('src', absolute);
    }
  });

  return $(selector || 'body')
    .map((_, dom) => $(dom).html())
    .get()
    .join('\n');
};
|
||||
/**
 * Download a list of pages in parallel and convert each one to markdown.
 *
 * Entries of `urlList` that do not look like an http(s) URL are dropped up front.
 * A page whose download or conversion throws is logged and treated as empty;
 * pages with empty content are left out of the result.
 *
 * @param urlList - candidate URLs
 * @param selector - optional CSS selector forwarded to `cheerioToHtml`
 * @returns `{ url, content }` for every page that produced non-empty markdown
 */
export const urlsFetch = async ({
  urlList,
  selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
  const httpUrlReg = /^(http|https):\/\/[^ "]+$/;
  const validUrls = urlList.filter((item) => httpUrlReg.test(item));

  // Never rejects: a failure is reported as empty content so one bad page
  // cannot fail the whole batch.
  const fetchOne = async (url: string) => {
    try {
      const { data } = await axios.get(url, {
        timeout: 30000
      });

      const content = htmlToMarkdown(
        cheerioToHtml({
          fetchUrl: url,
          $: cheerio.load(data),
          selector
        })
      );

      return { url, content };
    } catch (error) {
      console.log(error, 'fetch error');

      return { url, content: '' };
    }
  };

  const pages = await Promise.all(validUrls.map((url) => fetchOne(url)));

  return pages.filter((page) => page.content);
};
|
||||
|
@@ -1,5 +1,4 @@
|
||||
import { simpleText } from './tools';
|
||||
import { NodeHtmlMarkdown } from 'node-html-markdown';
|
||||
|
||||
/* Delete redundant text in markdown */
|
||||
export const simpleMarkdownText = (rawText: string) => {
|
||||
@@ -27,75 +26,11 @@ export const simpleMarkdownText = (rawText: string) => {
|
||||
|
||||
// Remove headings and code blocks front spaces
|
||||
['####', '###', '##', '#', '```', '~~~'].forEach((item, i) => {
|
||||
const isMarkdown = i <= 3;
|
||||
const reg = new RegExp(`\\n\\s*${item}`, 'g');
|
||||
if (reg.test(rawText)) {
|
||||
rawText = rawText.replace(
|
||||
new RegExp(`(\\n)\\s*(${item})`, 'g'),
|
||||
isMarkdown ? '\n$1$2' : '$1$2'
|
||||
);
|
||||
rawText = rawText.replace(new RegExp(`(\\n)\\s*(${item})`, 'g'), '$1$2');
|
||||
}
|
||||
});
|
||||
|
||||
return rawText.trim();
|
||||
};
|
||||
|
||||
/**
 * Translate an HTML string to markdown with NodeHtmlMarkdown and pass the trimmed
 * result through `simpleMarkdownText`.
 *
 * `<i>` and `<script>` elements are ignored, and `<code>` gets a custom translator
 * (inline code vs. code inside a `PRE` / `WRAPPED-PRE` parent).
 *
 * @param html - HTML to convert; `null`, `undefined` and `''` yield `''`.
 * @returns the resulting markdown string
 */
export const htmlToMarkdown = (html?: string | null) => {
  if (!html) return '';

  const wrap = (inner: string, edge: string) => `${edge}${inner}${edge}`;
  const languageOf = (className?: string | null) => className?.match(/language-(\S+)/)?.[1];

  const converter = new NodeHtmlMarkdown(
    {
      codeFence: '```',
      codeBlockStyle: 'fenced',
      ignore: ['i', 'script']
    },
    {
      code: ({ node, parent, options: { codeFence, codeBlockStyle }, visitor }) => {
        const parentTag = parent?.tagName;
        const insidePre = parentTag === 'PRE' || parentTag === 'WRAPPED-PRE';

        /* Inline code */
        if (!insidePre) {
          return {
            spaceIfRepeatingChar: true,
            noEscape: true,
            postprocess: ({ content }) => {
              // The delimiter is one backtick longer than the longest backtick run
              // inside the content, so the content cannot close it early.
              const runs: string[] = content.match(/`+/g) ?? [];
              const longestRun = runs.reduce(
                (longest, run) => (run.length > longest.length ? run : longest),
                ''
              );
              const fence = '`' + longestRun;
              const pad = fence.length > 1 ? ' ' : '';

              return wrap(wrap(content, pad), fence);
            }
          };
        }

        /* Code block */
        const childTranslators = visitor.instance.codeBlockTranslators;

        if (codeBlockStyle !== 'fenced') {
          // Non-fenced style: prefix every line of the block instead of fencing it.
          return {
            noEscape: true,
            postprocess: ({ content }) => content.replace(/^/gm, ' '),
            childTranslators
          };
        }

        // The language hint comes from a `language-xxx` class on the code node, else on its parent.
        const language =
          languageOf(node.getAttribute('class')) ||
          languageOf(parent?.getAttribute('class')) ||
          '';

        return {
          noEscape: true,
          prefix: `${codeFence}${language}\n`,
          postfix: `\n${codeFence}\n`,
          childTranslators
        };
      }
    }
  );

  return simpleMarkdownText(converter.translate(html).trim());
};
|
||||
|
@@ -13,12 +13,13 @@ export const splitText2Chunks = (props: {
|
||||
chunkLen: number;
|
||||
overlapRatio?: number;
|
||||
customReg?: string[];
|
||||
countTokens?: boolean;
|
||||
}): {
|
||||
chunks: string[];
|
||||
tokens: number;
|
||||
overlapRatio?: number;
|
||||
} => {
|
||||
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;
|
||||
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [], countTokens = true } = props;
|
||||
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
|
||||
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
|
||||
const overlapLen = Math.round(chunkLen * overlapRatio);
|
||||
@@ -233,7 +234,9 @@ export const splitText2Chunks = (props: {
|
||||
mdTitle: ''
|
||||
}).map((chunk) => chunk.replaceAll(codeBlockMarker, '\n')); // restore code block
|
||||
|
||||
const tokens = chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0);
|
||||
const tokens = countTokens
|
||||
? chunks.reduce((sum, chunk) => sum + countPromptTokens(chunk, 'system'), 0)
|
||||
: 0;
|
||||
|
||||
return {
|
||||
chunks,
|
||||
|
3
packages/global/core/chat/type.d.ts
vendored
3
packages/global/core/chat/type.d.ts
vendored
@@ -40,7 +40,7 @@ export type ChatItemSchema = {
|
||||
value: string;
|
||||
userGoodFeedback?: string;
|
||||
userBadFeedback?: string;
|
||||
robotBadFeedback?: string;
|
||||
customFeedbacks?: string[];
|
||||
adminFeedback?: AdminFbkType;
|
||||
[ModuleOutputKeyEnum.responseData]?: ChatHistoryItemResType[];
|
||||
};
|
||||
@@ -60,6 +60,7 @@ export type ChatItemType = {
|
||||
value: any;
|
||||
userGoodFeedback?: string;
|
||||
userBadFeedback?: string;
|
||||
customFeedbacks?: ChatItemSchema['customFeedbacks'];
|
||||
adminFeedback?: ChatItemSchema['feedback'];
|
||||
[ModuleOutputKeyEnum.responseData]?: ChatHistoryItemResType[];
|
||||
};
|
||||
|
2
packages/global/core/module/api.d.ts
vendored
2
packages/global/core/module/api.d.ts
vendored
@@ -5,12 +5,14 @@ export type SelectedDatasetType = { datasetId: string; vectorModel: VectorModelI
|
||||
export type HttpBodyType<T = any> = {
|
||||
appId: string;
|
||||
chatId?: string;
|
||||
responseChatItemId?: string;
|
||||
variables: Record<string, any>;
|
||||
data: T;
|
||||
};
|
||||
export type HttpQueryType = {
|
||||
appId: string;
|
||||
chatId?: string;
|
||||
responseChatItemId?: string;
|
||||
variables: Record<string, any>;
|
||||
[key: string]: any;
|
||||
};
|
||||
|
1
packages/global/core/plugin/type.d.ts
vendored
1
packages/global/core/plugin/type.d.ts
vendored
@@ -23,5 +23,6 @@ export type PluginTemplateType = {
|
||||
name: string;
|
||||
avatar: string;
|
||||
intro: string;
|
||||
showStatus?: boolean;
|
||||
modules: ModuleItemType[];
|
||||
};
|
||||
|
@@ -2,17 +2,14 @@
|
||||
"name": "@fastgpt/global",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"axios": "^1.5.1",
|
||||
"cheerio": "1.0.0-rc.12",
|
||||
"dayjs": "^1.11.7",
|
||||
"openai": "4.23.0",
|
||||
"encoding": "^0.1.13",
|
||||
"js-tiktoken": "^1.0.7",
|
||||
"node-html-markdown": "^1.3.0",
|
||||
"openai": "^4.20.1",
|
||||
"axios": "^1.5.1",
|
||||
"timezones-list": "^3.0.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.8.5",
|
||||
"@types/turndown": "^5.0.4"
|
||||
"@types/node": "^20.8.5"
|
||||
}
|
||||
}
|
||||
|
@@ -3,6 +3,8 @@
|
||||
"version": "1.0.0",
|
||||
"dependencies": {},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.8.5"
|
||||
"@types/node": "^20.8.5",
|
||||
"@fastgpt/global": "workspace:*",
|
||||
"@fastgpt/service": "workspace:*"
|
||||
}
|
||||
}
|
||||
|
@@ -71,7 +71,7 @@ instance.interceptors.response.use(responseSuccess, (err) => Promise.reject(err)
|
||||
|
||||
export function request(url: string, data: any, config: ConfigType, method: Method): any {
|
||||
if (!global.systemEnv || !global.systemEnv?.pluginBaseUrl) {
|
||||
console.log('未部署商业版接口');
|
||||
console.log('未部署商业版接口', url);
|
||||
return Promise.reject('The The request was denied...');
|
||||
}
|
||||
|
||||
|
95
packages/service/common/string/cheerio.ts
Normal file
95
packages/service/common/string/cheerio.ts
Normal file
@@ -0,0 +1,95 @@
|
||||
import { UrlFetchParams, UrlFetchResponse } from '@fastgpt/global/common/file/api';
|
||||
import * as cheerio from 'cheerio';
|
||||
import axios from 'axios';
|
||||
import { htmlToMarkdown } from './markdown';
|
||||
|
||||
/**
 * Clean the part of a loaded page matched by `selector` and serialise it to an HTML string.
 *
 * Only descendants of the selected element(s) are touched:
 *  - `<i>` and `<script>` elements are removed;
 *  - `<a>` elements with no text and no child elements are removed;
 *  - `href` (on `<a>`) and `src` (on `<img>`) values that start with `/` are made absolute.
 *
 * @param fetchUrl - URL the page was fetched from; must be parseable by `new URL`.
 * @param $ - cheerio instance holding the page. The selected subtree is mutated.
 * @param selector - CSS selector of the element(s) to export; defaults to `body`.
 * @returns Inner HTML of every matched element, joined with `\n`.
 */
export const cheerioToHtml = ({
  fetchUrl,
  $,
  selector
}: {
  fetchUrl: string;
  $: cheerio.CheerioAPI;
  selector?: string;
}) => {
  const { origin, protocol } = new URL(fetchUrl);

  const selectDom = $(selector || 'body');

  // remove i and script elements
  selectDom.find('i,script').remove();

  // remove empty a elements
  selectDom
    .find('a')
    .filter((_, el) => $(el).text().trim() === '' && $(el).children().length === 0)
    .remove();

  // Rewrite one attribute on every matching descendant when its value starts with "/".
  const absolutize = (tag: string, attr: string) => {
    selectDom.find(tag).each((_, el) => {
      const value = $(el).attr(attr);
      if (!value || !value.startsWith('/')) return;

      // "//host/path" is protocol-relative and already names a host: prepending the
      // origin would corrupt it, so only the protocol is added. "/path" gets the origin.
      $(el).attr(attr, value.startsWith('//') ? protocol + value : origin + value);
    });
  };

  absolutize('a', 'href');
  absolutize('img', 'src');

  return selectDom
    .map((_, dom) => $(dom).html())
    .get()
    .join('\n');
};
|
||||
/**
 * Fetch every valid URL of `urlList` concurrently and turn each page into markdown.
 *
 * Strings that fail the http(s) URL pattern are discarded before any request is made.
 * Any error while fetching or converting a page is logged and that page gets empty
 * content; entries with empty content are filtered out of the returned list.
 *
 * @param urlList - candidate URLs
 * @param selector - optional CSS selector forwarded to `cheerioToHtml`
 * @returns `{ url, content }` for every page that produced non-empty markdown
 */
export const urlsFetch = async ({
  urlList,
  selector
}: UrlFetchParams): Promise<UrlFetchResponse> => {
  const isHttpUrl = (candidate: string) => /^(http|https):\/\/[^ "]+$/.test(candidate);

  // Download one page and convert it; resolves with empty content on any failure.
  const loadPage = async (url: string) => {
    let content = '';

    try {
      const { data } = await axios.get(url, {
        timeout: 30000
      });
      const $ = cheerio.load(data);
      const html = cheerioToHtml({
        fetchUrl: url,
        $,
        selector
      });

      content = await htmlToMarkdown(html);
    } catch (error) {
      console.log(error, 'fetch error');
    }

    return { url, content };
  };

  const results = await Promise.all(urlList.filter(isHttpUrl).map((url) => loadPage(url)));

  return results.filter((result) => result.content);
};
|
23
packages/service/common/string/markdown.ts
Normal file
23
packages/service/common/string/markdown.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import { simpleMarkdownText } from '@fastgpt/global/common/string/markdown';
|
||||
import { Worker } from 'worker_threads';
|
||||
import { getWorkerPath } from './utils';
|
||||
|
||||
/**
 * Convert an HTML string to markdown in a dedicated worker thread.
 *
 * A new `html2md` worker is started for every call. The HTML is posted to it, and the
 * string it posts back is passed through `simpleMarkdownText` before resolving.
 *
 * @param html - HTML to convert; `null`, `undefined` and `''` resolve to `''` without starting a worker.
 * @returns Promise of the resulting markdown. It rejects when the worker emits an error,
 *          when post-processing throws, or when the worker exits before posting a result.
 */
export const htmlToMarkdown = (html?: string | null) =>
  new Promise<string>((resolve, reject) => {
    if (!html) return resolve('');

    const worker = new Worker(getWorkerPath('html2md'));

    // One worker per call: stop it as soon as the call is settled so threads do not pile up.
    const stopWorker = () => {
      void worker.terminate();
    };

    worker.on('message', (md: string) => {
      try {
        resolve(simpleMarkdownText(md));
      } catch (err) {
        // Without this, a throw inside the listener would surface as an uncaught exception.
        reject(err);
      } finally {
        stopWorker();
      }
    });
    worker.on('error', (err) => {
      reject(err);
      stopWorker();
    });
    worker.on('exit', (code) => {
      // Keeps the promise from hanging if the worker dies silently.
      // This is a no-op when the promise has already been settled above.
      reject(new Error(`html2md worker exited with code ${code} before returning a result`));
    });

    worker.postMessage(html);
  });
|
9
packages/service/common/string/utils.ts
Normal file
9
packages/service/common/string/utils.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
/**
 * Build the path of a worker script from its base name.
 *
 * - When `NODE_ENV` is `production`: `/app/worker/<name>.js`.
 * - Otherwise: a path relative to `../../`, with an extra `FastGPT/` segment when
 *   `global.systemConfig` is set (the code treats that as "running as a sub module").
 *
 * @param name - worker file name without the `.js` extension
 * @returns the path to hand to the `Worker` constructor
 */
export const getWorkerPath = (name: string) => {
  if (process.env.NODE_ENV === 'production') {
    return `/app/worker/${name}.js`;
  }

  // @ts-ignore
  const isSubModule = !!global?.systemConfig;

  // The optional segment carries its own trailing slash, so no separator is written
  // after it (a second one would produce "..//worker").
  const subModuleDir = isSubModule ? 'FastGPT/' : '';

  return `../../${subModuleDir}worker/${name}.js`;
};
|
@@ -61,8 +61,8 @@ const ChatItemSchema = new Schema({
|
||||
userBadFeedback: {
|
||||
type: String
|
||||
},
|
||||
robotBadFeedback: {
|
||||
type: String
|
||||
customFeedbacks: {
|
||||
type: [String]
|
||||
},
|
||||
adminFeedback: {
|
||||
type: {
|
||||
@@ -86,7 +86,7 @@ try {
|
||||
ChatItemSchema.index({ chatId: 1 });
|
||||
ChatItemSchema.index({ userGoodFeedback: 1 });
|
||||
ChatItemSchema.index({ userBadFeedback: 1 });
|
||||
ChatItemSchema.index({ robotBadFeedback: 1 });
|
||||
ChatItemSchema.index({ customFeedbacks: 1 });
|
||||
ChatItemSchema.index({ adminFeedback: 1 });
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
|
@@ -1,5 +1,6 @@
|
||||
import type { ChatItemType } from '@fastgpt/global/core/chat/type';
|
||||
import { MongoChatItem } from './chatItemSchema';
|
||||
import { addLog } from '../../common/system/log';
|
||||
|
||||
export async function getChatItems({
|
||||
chatId,
|
||||
@@ -20,3 +21,29 @@ export async function getChatItems({
|
||||
|
||||
return { history };
|
||||
}
|
||||
|
||||
/**
 * Append custom feedback strings to one chat item.
 *
 * The item is looked up by `chatId` plus `dataId === chatItemId`. Nothing happens when
 * either id is missing. A failing update is logged through `addLog.error` and not rethrown.
 *
 * @param chatId - id of the chat the item belongs to
 * @param chatItemId - value matched against the item's `dataId`
 * @param feedbacks - strings pushed onto the item's `customFeedbacks` array
 */
export const addCustomFeedbacks = async ({
  chatId,
  chatItemId,
  feedbacks
}: {
  chatId?: string;
  chatItemId?: string;
  feedbacks: string[];
}) => {
  if (chatId && chatItemId) {
    const filter = { chatId, dataId: chatItemId };
    const update = { $push: { customFeedbacks: { $each: feedbacks } } };

    try {
      await MongoChatItem.findOneAndUpdate(filter, update);
    } catch (error) {
      addLog.error('addCustomFeedbacks error', error);
    }
  }
};
|
||||
|
@@ -4,7 +4,7 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder
|
||||
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
|
||||
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { MongoDatasetTraining } from '../training/schema';
|
||||
import { urlsFetch } from '@fastgpt/global/common/file/tools';
|
||||
import { urlsFetch } from '../../../common/string/cheerio';
|
||||
import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
|
||||
|
||||
/**
|
||||
@@ -105,7 +105,8 @@ export const loadingOneChunkCollection = async ({
|
||||
// split data
|
||||
const { chunks } = splitText2Chunks({
|
||||
text: newRawText,
|
||||
chunkLen: collection.chunkSize || 512
|
||||
chunkLen: collection.chunkSize || 512,
|
||||
countTokens: false
|
||||
});
|
||||
|
||||
// insert to training queue
|
||||
|
@@ -44,6 +44,7 @@ const getPluginTemplateById = async (id: string): Promise<PluginTemplateType> =>
|
||||
name: item.name,
|
||||
avatar: item.avatar,
|
||||
intro: item.intro,
|
||||
showStatus: true,
|
||||
source: PluginSourceEnum.personal,
|
||||
modules: item.modules,
|
||||
templateType: ModuleTemplateTypeEnum.personalPlugin
|
||||
@@ -67,7 +68,7 @@ export async function getPluginPreviewModule({
|
||||
avatar: plugin.avatar,
|
||||
name: plugin.name,
|
||||
intro: plugin.intro,
|
||||
showStatus: true,
|
||||
showStatus: plugin.showStatus,
|
||||
...plugin2ModuleIO(plugin.id, plugin.modules)
|
||||
};
|
||||
}
|
||||
|
@@ -3,22 +3,24 @@
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"@fastgpt/global": "workspace:*",
|
||||
"axios": "^1.5.1",
|
||||
"cookie": "^0.5.0",
|
||||
"encoding": "^0.1.13",
|
||||
"jsonwebtoken": "^9.0.2",
|
||||
"mongoose": "^7.0.2",
|
||||
"nanoid": "^4.0.1",
|
||||
"dayjs": "^1.11.7",
|
||||
"next": "13.5.2",
|
||||
"multer": "1.4.5-lts.1",
|
||||
"axios": "^1.5.1",
|
||||
"cheerio": "1.0.0-rc.12",
|
||||
"nextjs-cors": "^2.1.2",
|
||||
"pg": "^8.10.0",
|
||||
"tunnel": "^0.0.6",
|
||||
"dayjs": "^1.11.7"
|
||||
"tunnel": "^0.0.6"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/cookie": "^0.5.2",
|
||||
"@types/jsonwebtoken": "^9.0.3",
|
||||
"@types/node": "^20.8.5",
|
||||
"@types/multer": "^1.4.10",
|
||||
"@types/pg": "^8.6.6",
|
||||
"@types/tunnel": "^0.0.4"
|
||||
}
|
||||
|
@@ -31,3 +31,11 @@ export async function authCertOrShareId({
|
||||
canWrite: false
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Check that a request is addressed to this service's own host and port.
 *
 * The expected value is `<HOSTNAME or 'localhost'>:<PORT or 3000>`, read from the
 * environment, and it is compared with the request's `Host` header.
 *
 * NOTE(review): the `Host` header is supplied by the caller. Confirm this check is
 * only relied on where that header can be trusted.
 *
 * @returns `undefined` when the header matches, otherwise a promise rejected with
 *          the string `'Invalid request'`.
 */
export const authRequestFromLocal = ({ req }: AuthModeType) => {
  const hostname = process.env.HOSTNAME || 'localhost';
  const port = process.env.PORT || 3000;
  const isLocal = req.headers.host === `${hostname}:${port}`;

  if (isLocal) return;

  return Promise.reject('Invalid request');
};
|
||||
|
@@ -1,8 +1,6 @@
|
||||
{
|
||||
"name": "@fastgpt/web",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"axios": "^1.5.1"
|
||||
},
|
||||
"dependencies": {},
|
||||
"devDependencies": {}
|
||||
}
|
||||
|
Reference in New Issue
Block a user