Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-27 00:17:31 +00:00)
feat: Text check before synchronization (#689)
* fix: icon
* fix: web selector
* fix: web selector
* perf: link sync
* dev doc
* chomd doc
* perf: git intro
* 466 intro
* intro img
* add json editor (#5)
* team limit
* websync limit
* json editor
* text editor
* perf: search test
* change cq value type
* doc
* intro img

---------

Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
@@ -15,7 +15,8 @@ export const cheerioToHtml = ({
   // get origin url
   const originUrl = new URL(fetchUrl).origin;

-  const selectDom = $(selector || 'body');
+  const usedSelector = selector || 'body';
+  const selectDom = $(usedSelector);

   // remove i element
   selectDom.find('i,script').remove();
@@ -49,7 +50,10 @@ export const cheerioToHtml = ({
     .get()
     .join('\n');

-  return html;
+  return {
+    html,
+    usedSelector
+  };
 };
 export const urlsFetch = async ({
   urlList,
@@ -66,25 +70,25 @@ export const urlsFetch = async ({
       });

       const $ = cheerio.load(fetchRes.data);
-
-      const md = await htmlToMarkdown(
-        cheerioToHtml({
-          fetchUrl: url,
-          $,
-          selector
-        })
-      );
+      const { html, usedSelector } = cheerioToHtml({
+        fetchUrl: url,
+        $,
+        selector
+      });
+      const md = await htmlToMarkdown(html);

       return {
         url,
-        content: md
+        content: md,
+        selector: usedSelector
       };
     } catch (error) {
       console.log(error, 'fetch error');

       return {
         url,
-        content: ''
+        content: '',
+        selector: ''
       };
     }
   })
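The change above threads the actually-applied CSS selector (`usedSelector`) back to callers of `urlsFetch`, so each crawl result records how the page was extracted. A minimal, self-contained sketch of the same fallback-and-report pattern (assumes the `cheerio` package; `extractWithSelector` is an illustrative helper, not FastGPT code):

```ts
import * as cheerio from 'cheerio';

// Illustrative helper (not FastGPT code): fall back to 'body' and report
// which selector was actually applied, mirroring cheerioToHtml above.
const extractWithSelector = (rawHtml: string, selector?: string) => {
  const $ = cheerio.load(rawHtml);
  const usedSelector = selector || 'body';
  const selectDom = $(usedSelector);
  selectDom.find('i,script').remove(); // same cleanup as cheerioToHtml
  return { html: selectDom.html() ?? '', usedSelector };
};

// No selector given, so 'body' is used and reported back to the caller.
console.log(extractWithSelector('<body><p>hi</p></body>').usedSelector); // 'body'
```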
@@ -21,6 +21,9 @@ export const htmlToMarkdown = (html?: string | null) =>
       worker.terminate();
       reject(err);
     });
+    worker.on('exit', (code) => {
+      console.log('html 2 md finish', code);
+    });

     worker.postMessage(html);
   });
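The added `worker.on('exit', ...)` handler is the standard `worker_threads` lifecycle hook. A minimal sketch of the same promise-wrapped, one-shot worker pattern used by `htmlToMarkdown` (the `./html2md.worker.js` path and the string message shape are assumptions for illustration):

```ts
import { Worker } from 'worker_threads';

// One-shot worker wrapped in a promise, mirroring htmlToMarkdown's structure.
// './html2md.worker.js' is a hypothetical worker script path.
const html2mdInWorker = (html: string) =>
  new Promise<string>((resolve, reject) => {
    const worker = new Worker('./html2md.worker.js');
    worker.on('message', (md: string) => {
      worker.terminate();
      resolve(md);
    });
    worker.on('error', (err) => {
      worker.terminate();
      reject(err);
    });
    worker.on('exit', (code) => {
      // 0 is a clean exit; a non-zero code usually means the worker died
      // before posting a result, which the 'error' handler has surfaced.
      console.log('html 2 md finish', code);
    });
    worker.postMessage(html);
  });
```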
@@ -19,14 +19,16 @@ export async function createOneCollection({
   qaPrompt,
   hashRawText,
   rawTextLength,
-  metadata = {}
-}: CreateDatasetCollectionParams & { teamId: string; tmbId: string }) {
+  metadata = {},
+  ...props
+}: CreateDatasetCollectionParams & { teamId: string; tmbId: string; [key: string]: any }) {
   const { _id } = await MongoDatasetCollection.create({
-    name,
+    ...props,
     teamId,
     tmbId,
-    datasetId,
     parentId: parentId || null,
+    datasetId,
+    name,
     type,
     trainingType,
     chunkSize,
@@ -75,6 +75,7 @@ const DatasetCollectionSchema = new Schema({
   qaPrompt: {
     type: String
   },
+  rawTextLength: {
+    type: Number
+  },
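The `...props` rest parameter lets `createOneCollection` accept and persist new optional fields (such as `rawTextLength` and `hashRawText` from this commit) without widening its explicit signature. Spread order matters here; a small sketch under hypothetical types:

```ts
// Hypothetical shapes for illustration only.
type CollectionDoc = { name: string; teamId: string; rawTextLength?: number };

// Explicit fields placed after `...props` win on key conflicts, so passthrough
// values (rawTextLength, hashRawText, ...) can never override teamId or name.
const buildDoc = ({ name, teamId, ...props }: CollectionDoc & { [key: string]: any }) => ({
  ...props,
  teamId,
  name
});

console.log(buildDoc({ name: 'docs', teamId: 't1', rawTextLength: 1024 }));
// { rawTextLength: 1024, teamId: 't1', name: 'docs' }
```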
@@ -1,11 +1,11 @@
 import type { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type.d';
 import { MongoDatasetCollection } from './schema';
 import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder/type.d';
-import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { MongoDatasetTraining } from '../training/schema';
 import { urlsFetch } from '../../../common/string/cheerio';
 import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
+import { hashStr } from '@fastgpt/global/common/string/tools';

 /**
  * get all collection by top collectionId
@@ -65,64 +65,114 @@ export function getCollectionUpdateTime({ name, time }: { time?: Date; name: string })
   return new Date();
 }

-/* link collection start load data */
-export const loadingOneChunkCollection = async ({
+/**
+ * Get collection raw text by Collection or collectionId
+ */
+export const getCollectionAndRawText = async ({
   collectionId,
-  tmbId,
-  billId,
-  rawText
+  collection,
+  newRawText
 }: {
-  collectionId: string;
-  tmbId: string;
-  billId?: string;
-  rawText?: string;
+  collectionId?: string;
+  collection?: CollectionWithDatasetType;
+  newRawText?: string;
 }) => {
-  const collection = (await MongoDatasetCollection.findById(collectionId).populate(
-    'datasetId'
-  )) as CollectionWithDatasetType;
+  const col = await (async () => {
+    if (collection) return collection;
+    if (collectionId) {
+      return (await MongoDatasetCollection.findById(collectionId).populate(
+        'datasetId'
+      )) as CollectionWithDatasetType;
+    }

-  if (!collection) {
-    return Promise.reject(DatasetErrEnum.unCreateCollection);
+    return null;
+  })();
+
+  if (!col) {
+    return Promise.reject('Collection not found');
   }

-  const newRawText = await (async () => {
-    if (rawText) return rawText;
+  const rawText = await (async () => {
+    if (newRawText) return newRawText;
     // link
-    if (collection.type === DatasetCollectionTypeEnum.link && collection.rawLink) {
+    if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
       // crawl new data
       const result = await urlsFetch({
-        urlList: [collection.rawLink],
-        selector: collection.datasetId?.websiteConfig?.selector
+        urlList: [col.rawLink],
+        selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
       });

       return result[0].content;
     }

     // file

     return '';
   })();

+  const hashRawText = hashStr(rawText);
+  const isSameRawText = col.hashRawText === hashRawText;
+
+  return {
+    collection: col,
+    rawText,
+    isSameRawText
+  };
+};
+
+/* link collection start load data */
+export const reloadCollectionChunks = async ({
+  collectionId,
+  collection,
+  tmbId,
+  billId,
+  rawText
+}: {
+  collectionId?: string;
+  collection?: CollectionWithDatasetType;
+  tmbId: string;
+  billId?: string;
+  rawText?: string;
+}) => {
+  const {
+    rawText: newRawText,
+    collection: col,
+    isSameRawText
+  } = await getCollectionAndRawText({
+    collection,
+    collectionId,
+    newRawText: rawText
+  });
+
+  if (isSameRawText) return;
+
   // split data
   const { chunks } = splitText2Chunks({
     text: newRawText,
-    chunkLen: collection.chunkSize || 512,
+    chunkLen: col.chunkSize || 512,
     countTokens: false
   });

   // insert to training queue
   await MongoDatasetTraining.insertMany(
     chunks.map((item, i) => ({
-      teamId: collection.teamId,
+      teamId: col.teamId,
       tmbId,
-      datasetId: collection.datasetId._id,
-      collectionId: collection._id,
+      datasetId: col.datasetId._id,
+      collectionId: col._id,
       billId,
-      mode: collection.trainingType,
+      mode: col.trainingType,
       prompt: '',
-      model: collection.datasetId.vectorModel,
+      model: col.datasetId.vectorModel,
       q: item,
       a: '',
       chunkIndex: i
     }))
   );
+
+  // update raw text
+  await MongoDatasetCollection.findByIdAndUpdate(col._id, {
+    rawTextLength: newRawText.length,
+    hashRawText: hashStr(newRawText)
+  });
 };
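This hunk is the core of the feature: `getCollectionAndRawText` hashes the freshly crawled text and compares it against the stored `hashRawText`, and `reloadCollectionChunks` bails out early when `isSameRawText` is true, so unchanged pages are never re-chunked or re-queued for training. A minimal sketch of that change check, with Node's built-in `crypto` standing in for `@fastgpt/global`'s `hashStr` (whose actual algorithm may differ):

```ts
import { createHash } from 'crypto';

// Stand-in for @fastgpt/global's hashStr; the real helper may hash differently.
const hashStr = (text: string) => createHash('sha256').update(text).digest('hex');

// Re-sync only when the stored hash differs from the hash of the new crawl.
const shouldResync = (storedHashRawText: string | undefined, newRawText: string) =>
  storedHashRawText !== hashStr(newRawText);

console.log(shouldResync(hashStr('hello'), 'hello')); // false: unchanged, skip
console.log(shouldResync(hashStr('hello'), 'hello!')); // true: re-chunk and retrain
```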
@@ -47,15 +47,6 @@ const UserSchema = new Schema({
     type: Number,
     default: 15
   },
-  limit: {
-    exportKbTime: {
-      // Every half hour
-      type: Date
-    },
-    datasetMaxCount: {
-      type: Number
-    }
-  },
   openaiAccount: {
     type: {
       key: String,
@@ -32,6 +32,14 @@ const TeamSchema = new Schema({
   },
   lastDatasetBillTime: {
     type: Date
   },
+  limit: {
+    lastExportDatasetTime: {
+      type: Date
+    },
+    lastWebsiteSyncTime: {
+      type: Date
+    }
+  }
 });
packages/service/support/user/utils.ts (new file, 69 lines)
@@ -0,0 +1,69 @@
+import { MongoTeam } from './team/teamSchema';
+
+/* export dataset limit */
+export const updateExportDatasetLimit = async (teamId: string) => {
+  try {
+    await MongoTeam.findByIdAndUpdate(teamId, {
+      'limit.lastExportDatasetTime': new Date()
+    });
+  } catch (error) {}
+};
+export const checkExportDatasetLimit = async ({
+  teamId,
+  limitMinutes = 0
+}: {
+  teamId: string;
+  limitMinutes?: number;
+}) => {
+  const limitMinutesAgo = new Date(Date.now() - limitMinutes * 60 * 1000);
+
+  // auth export times
+  const authTimes = await MongoTeam.findOne(
+    {
+      _id: teamId,
+      $or: [
+        { 'limit.lastExportDatasetTime': { $exists: false } },
+        { 'limit.lastExportDatasetTime': { $lte: limitMinutesAgo } }
+      ]
+    },
+    '_id limit'
+  );
+
+  if (!authTimes) {
+    return Promise.reject(`每个团队,每 ${limitMinutes} 分钟仅可导出一次。`);
+  }
+};
+
+/* web sync limit */
+export const updateWebSyncLimit = async (teamId: string) => {
+  try {
+    await MongoTeam.findByIdAndUpdate(teamId, {
+      'limit.lastWebsiteSyncTime': new Date()
+    });
+  } catch (error) {}
+};
+export const checkWebSyncLimit = async ({
+  teamId,
+  limitMinutes = 0
+}: {
+  teamId: string;
+  limitMinutes?: number;
+}) => {
+  const limitMinutesAgo = new Date(Date.now() - limitMinutes * 60 * 1000);
+
+  // auth web sync times
+  const authTimes = await MongoTeam.findOne(
+    {
+      _id: teamId,
+      $or: [
+        { 'limit.lastWebsiteSyncTime': { $exists: false } },
+        { 'limit.lastWebsiteSyncTime': { $lte: limitMinutesAgo } }
+      ]
+    },
+    '_id limit'
+  );
+
+  if (!authTimes) {
+    return Promise.reject(`每个团队,每 ${limitMinutes} 分钟仅使用一次同步功能。`);
+  }
+};
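The two rejection messages translate roughly as "each team may export only once every ${limitMinutes} minutes" and "each team may use the sync feature only once every ${limitMinutes} minutes". A sketch of how a caller might pair the check/update helpers around a website sync (the 10-minute window, the import path, and `handleWebsiteSync` are illustrative assumptions, not values from this commit):

```ts
// Hypothetical caller; the import path and 10-minute window are assumptions.
import { checkWebSyncLimit, updateWebSyncLimit } from '@fastgpt/service/support/user/utils';

async function handleWebsiteSync(teamId: string) {
  // Rejects when the team synced within the last 10 minutes.
  await checkWebSyncLimit({ teamId, limitMinutes: 10 });

  // Stamp the timestamp first so a concurrent request is throttled too.
  await updateWebSyncLimit(teamId);

  // ...kick off the crawl / reloadCollectionChunks work here.
}
```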