mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 21:13:50 +00:00
perf: text splitter (#4313)
* sync collection * remove lock * perf: text splitter * update comment
This commit is contained in:
@@ -196,7 +196,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
|
||||
isLoading={isLoadingOrgs}
|
||||
>
|
||||
<Box mb={3}>
|
||||
<Path paths={paths} rootName={userInfo?.team?.teamName} onClick={setPath} />
|
||||
<Path paths={paths} rootName={userInfo?.team?.teamName} />
|
||||
</Box>
|
||||
<Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}>
|
||||
<MemberScrollData flex="1">
|
||||
@@ -420,7 +420,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
|
||||
<ActionButton
|
||||
icon="common/administrator"
|
||||
text={t('account_team:manage_member')}
|
||||
onClick={() => setManageMemberOrg(currentOrg ?? rootOrg)}
|
||||
onClick={() => setManageMemberOrg(currentOrg)}
|
||||
/>
|
||||
{currentOrg && currentOrg?.path !== '' && (
|
||||
<>
|
||||
|
@@ -94,7 +94,7 @@ async function handler(
|
||||
per: WritePermissionVal
|
||||
});
|
||||
|
||||
if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
|
||||
if (fileAuthRes && String(fileAuthRes.tmbId) !== String(tmbId) && !fileAuthRes.isRoot) {
|
||||
return Promise.reject(CommonErrEnum.unAuthFile);
|
||||
}
|
||||
|
||||
|
@@ -15,18 +15,19 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
|
||||
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
|
||||
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
|
||||
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
|
||||
|
||||
const formatIndexes = async ({
|
||||
indexes,
|
||||
q,
|
||||
a = '',
|
||||
indexSize
|
||||
indexSize,
|
||||
maxIndexSize
|
||||
}: {
|
||||
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
|
||||
q: string;
|
||||
a?: string;
|
||||
indexSize: number;
|
||||
maxIndexSize: number;
|
||||
}): Promise<
|
||||
{
|
||||
type: `${DatasetDataIndexTypeEnum}`;
|
||||
@@ -46,9 +47,12 @@ const formatIndexes = async ({
|
||||
}) => {
|
||||
const qChunks = splitText2Chunks({
|
||||
text: q,
|
||||
chunkSize: indexSize
|
||||
chunkSize: indexSize,
|
||||
maxSize: maxIndexSize
|
||||
}).chunks;
|
||||
const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
|
||||
const aChunks = a
|
||||
? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
|
||||
: [];
|
||||
|
||||
return [
|
||||
...qChunks.map((text) => ({
|
||||
@@ -100,7 +104,11 @@ const formatIndexes = async ({
|
||||
// If oversize tokens, split it
|
||||
const tokens = await countPromptTokens(item.text);
|
||||
if (tokens > indexSize) {
|
||||
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
|
||||
const splitText = splitText2Chunks({
|
||||
text: item.text,
|
||||
chunkSize: 512,
|
||||
maxSize: maxIndexSize
|
||||
}).chunks;
|
||||
return splitText.map((text) => ({
|
||||
text,
|
||||
type: item.type
|
||||
@@ -151,7 +159,8 @@ export async function insertData2Dataset({
|
||||
indexes,
|
||||
q,
|
||||
a,
|
||||
indexSize
|
||||
indexSize,
|
||||
maxIndexSize: embModel.maxToken
|
||||
});
|
||||
|
||||
// insert to vector store
|
||||
@@ -236,7 +245,13 @@ export async function updateData2Dataset({
|
||||
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
|
||||
|
||||
// 2. Compute indexes
|
||||
const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
|
||||
const formatIndexesResult = await formatIndexes({
|
||||
indexes,
|
||||
q,
|
||||
a,
|
||||
indexSize,
|
||||
maxIndexSize: getEmbeddingModel(model).maxToken
|
||||
});
|
||||
|
||||
// 3. Patch indexes, create, update, delete
|
||||
const patchResult: PatchIndexesProps[] = [];
|
||||
|
Reference in New Issue
Block a user