V4.8.15 feature (#3331)

* feat: add customize toolkit (#3205)

* chaoyang

* fix-auth

* add toolkit

* add order

* plugin usage

* fix

* delete console:

* Fix: Fix fullscreen preview top positioning and improve Markdown rendering logic (#3247)

* 完成任务:修复全屏预览顶部固定问题,优化 Markdown 渲染逻辑

* 有问题修改

* 问题再修改

* 修正问题

* fix: plugin standalone display issue (#3254)

* 4.8.15 test (#3246)

* o1 config

* perf: system plugin code

* 调整系统插件代码。增加html 渲染安全配置。 (#3258)

* perf: base64 picker

* perf: list app or dataset

* perf: plugin config code

* 小窗适配等问题 (#3257)

* 小窗适配等问题

* git问题

* 小窗剩余问题

* feat: system plugin auth and lock version (#3265)

* feat: system plugin auth and lock version

* update comment

* 4.8.15 test (#3267)

* tmp log

* perf: login direct

* perf: iframe html code

* remove log

* fix: plugin standalone display (#3277)

* refactor: 页面拆分&i18n拆分 (#3281)

* refactor: account组件拆成独立页面

* script: 新增i18n json文件创建脚本

* refactor: 页面i18n拆分

* i18n: add en&hant

* 4.8.15 test (#3285)

* tmp log

* remove log

* fix: watch avatar refresh

* perf: i18n code

* fix(plugin): use intro instead of userguide (#3290)

* Universal SSO (#3292)

* tmp log

* remove log

* feat: common oauth

* readme

* perf: sso provider

* remove sso code

* perf: refresh plugins

* feat: add api dataset (#3272)

* add api-dataset

* fix api-dataset

* fix api dataset

* fix ts

* perf: create collection code (#3301)

* tmp log

* remove log

* perf: i18n change

* update version doc

* feat: question guide from chatId

* perf: create collection code

* fix: request api

* fix: request api

* fix: tts auth and response type (#3303)

* perf: md splitter

* fix: tts auth and response type

* fix: api file dataset (#3307)

* perf: api dataset init (#3310)

* perf: collection schema

* perf: api dataset init

* refactor: 团队管理独立页面 (#3302)

* ui: 团队管理独立页面

* 代码优化

* fix

* perf: sync collection and ui check (#3314)

* perf: sync collection

* remove script

* perf: update api server

* perf: api dataset parent

* perf: team ui

* perf: team i18n

* update team ui

* perf: ui check

* perf: i18n

* fix: debug variables & cronjob & system plugin callback load (#3315)

* fix: debug variables & cronjob & system plugin callback load

* fix type

* fix

* fix

* fix: plugin dataset quote;perf: system variables init (#3316)

* fix: plugin dataset quote

* perf: system variables init

* perf: node templates ui;fix: dataset import ui (#3318)

* fix: dataset import ui

* perf: node templates ui

* perf: ui refresh

* feat:套餐改名和套餐跳转配置 (#3309)

* fixing:except Sidebar

* 去除了多余的代码

* 修正了套餐说明的代码

* 修正了误删除的show_git代码

* 修正了名字部分等代码

* 修正了问题,遗留了其他和ui讨论不一致的部分

* 4.8.15 test (#3319)

* remove log

* perf: bill ui

* perf: bill ui

* perf: log

* html渲染文档 (#3270)

* html渲染文档

* 文档有点小问题

* feat: doc (#3322)

* 集合重训练 (#3282)

* rebaser

* 一点补充

* 小问题

* 其他问题修正,删除集合保留文件的参数还没找到...

* reTraining

* delete useless

* 删除了一行错误代码

* 集合重训练部分

* fixing

* 删除console代码

* feat: navbar item config (#3326)

* perf: custom navbar code;perf: retraining code;feat: api dataset and dataset api doc (#3329)

* feat: api dataset and dataset api doc

* perf: retraining code

* perf: custom navbar code

* fix: ts (#3330)

* fix: ts

* fix: ts

* retraining ui

* perf: api collection filter

* perf: retraining button

---------

Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Jiangween <145003935+Jiangween@users.noreply.github.com>
Co-authored-by: papapatrick <109422393+Patrickill@users.noreply.github.com>
This commit is contained in:
Archer
2024-12-06 10:56:53 +08:00
committed by GitHub
parent b188544386
commit 1aebe5f185
307 changed files with 7383 additions and 3981 deletions

View File

@@ -1,17 +1,19 @@
import type { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type.d';
import { MongoDatasetCollection } from './schema';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { MongoDatasetTraining } from '../training/schema';
import { urlsFetch } from '../../../common/string/cheerio';
import {
DatasetCollectionTypeEnum,
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { ClientSession } from '../../../common/mongo';
import { PushDatasetDataResponse } from '@fastgpt/global/core/dataset/api';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { readFromSecondary } from '../../../common/mongo/utils';
import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
import {
DatasetCollectionSyncResultEnum,
DatasetCollectionTypeEnum,
DatasetSourceReadTypeEnum,
DatasetTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { readDatasetSourceRawText } from '../read';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createCollectionAndInsertData, delCollection } from './controller';
/**
* get all collection by top collectionId
@@ -61,148 +63,6 @@ export function getCollectionUpdateTime({ name, time }: { time?: Date; name: str
return new Date();
}
/**
* Get collection raw text by Collection or collectionId
*/
export const getCollectionAndRawText = async ({
collectionId,
collection,
newRawText
}: {
collectionId?: string;
collection?: CollectionWithDatasetType;
newRawText?: string;
}) => {
const col = await (async () => {
if (collection) return collection;
if (collectionId) {
return (await MongoDatasetCollection.findById(collectionId).populate(
'datasetId'
)) as CollectionWithDatasetType;
}
return null;
})();
if (!col) {
return Promise.reject('Collection not found');
}
const { title, rawText } = await (async () => {
if (newRawText)
return {
title: '',
rawText: newRawText
};
// link
if (col.type === DatasetCollectionTypeEnum.link && col.rawLink) {
// crawl new data
const result = await urlsFetch({
urlList: [col.rawLink],
selector: col.datasetId?.websiteConfig?.selector || col?.metadata?.webPageSelector
});
return {
title: result[0]?.title,
rawText: result[0]?.content
};
}
// file
return {
title: '',
rawText: ''
};
})();
const hashRawText = hashStr(rawText);
const isSameRawText = rawText && col.hashRawText === hashRawText;
return {
collection: col,
title,
rawText,
isSameRawText
};
};
/* link collection start load data */
export const reloadCollectionChunks = async ({
collection,
tmbId,
billId,
rawText,
session
}: {
collection: CollectionWithDatasetType;
tmbId: string;
billId?: string;
rawText?: string;
session: ClientSession;
}): Promise<PushDatasetDataResponse> => {
const {
title,
rawText: newRawText,
collection: col,
isSameRawText
} = await getCollectionAndRawText({
collection,
newRawText: rawText
});
if (isSameRawText)
return {
insertLen: 0
};
// split data
const { chunks } = splitText2Chunks({
text: newRawText,
chunkLen: col.chunkSize || 512,
customReg: col.chunkSplitter ? [col.chunkSplitter] : []
});
// insert to training queue
const model = await (() => {
if (col.trainingType === TrainingModeEnum.chunk) return col.datasetId.vectorModel;
if (col.trainingType === TrainingModeEnum.qa) return col.datasetId.agentModel;
return Promise.reject('Training model error');
})();
const result = await MongoDatasetTraining.insertMany(
chunks.map((item, i) => ({
teamId: col.teamId,
tmbId,
datasetId: col.datasetId._id,
collectionId: col._id,
billId,
mode: col.trainingType,
prompt: '',
model,
q: item,
a: '',
chunkIndex: i
})),
{ session }
);
// update raw text
await MongoDatasetCollection.findByIdAndUpdate(
col._id,
{
...(title && { name: title }),
rawTextLength: newRawText.length,
hashRawText: hashStr(newRawText)
},
{ session }
);
return {
insertLen: result.length
};
};
export const createOrGetCollectionTags = async ({
tags,
datasetId,
@@ -268,3 +128,88 @@ export const collectionTagsToTagLabel = async ({
})
.filter(Boolean);
};
export const syncCollection = async (collection: CollectionWithDatasetType) => {
const dataset = collection.datasetId;
if (
collection.type !== DatasetCollectionTypeEnum.link &&
dataset.type !== DatasetTypeEnum.apiDataset
) {
return Promise.reject(DatasetErrEnum.notSupportSync);
}
// Get new text
const sourceReadType = await (async () => {
if (collection.type === DatasetCollectionTypeEnum.link) {
if (!collection.rawLink) return Promise.reject('rawLink is missing');
return {
type: DatasetSourceReadTypeEnum.link,
sourceId: collection.rawLink,
selector: collection.metadata?.webPageSelector
};
}
if (!collection.apiFileId) return Promise.reject('apiFileId is missing');
if (!dataset.apiServer) return Promise.reject('apiServer not found');
return {
type: DatasetSourceReadTypeEnum.apiFile,
sourceId: collection.apiFileId,
apiServer: dataset.apiServer
};
})();
const rawText = await readDatasetSourceRawText({
teamId: collection.teamId,
...sourceReadType
});
// Check if the original text is the same: skip if same
const hashRawText = hashStr(rawText);
if (collection.hashRawText && hashRawText === collection.hashRawText) {
return DatasetCollectionSyncResultEnum.sameRaw;
}
await mongoSessionRun(async (session) => {
// Create new collection
await createCollectionAndInsertData({
session,
dataset,
rawText: rawText,
createCollectionParams: {
teamId: collection.teamId,
tmbId: collection.tmbId,
datasetId: collection.datasetId._id,
name: collection.name,
type: collection.type,
fileId: collection.fileId,
rawLink: collection.rawLink,
externalFileId: collection.externalFileId,
externalFileUrl: collection.externalFileUrl,
apiFileId: collection.apiFileId,
rawTextLength: rawText.length,
hashRawText,
tags: collection.tags,
createTime: collection.createTime,
parentId: collection.parentId,
trainingType: collection.trainingType,
chunkSize: collection.chunkSize,
chunkSplitter: collection.chunkSplitter,
qaPrompt: collection.qaPrompt,
metadata: collection.metadata
}
});
// Delete old collection
await delCollection({
collections: [collection],
delRelatedSource: false,
session
});
});
return DatasetCollectionSyncResultEnum.success;
};