perf: text splitter (#4313)

* sync collection

* remove lock

* perf: text splitter

* update comment
This commit is contained in:
Archer
2025-03-25 17:44:38 +08:00
committed by archer
parent 826a53dcb6
commit 37b4a1919b
8 changed files with 716 additions and 70 deletions

View File

@@ -196,7 +196,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
isLoading={isLoadingOrgs}
>
<Box mb={3}>
<Path paths={paths} rootName={userInfo?.team?.teamName} onClick={setPath} />
<Path paths={paths} rootName={userInfo?.team?.teamName} />
</Box>
<Flex flex={'1 0 0'} h={0} w={'100%'} gap={'4'}>
<MemberScrollData flex="1">
@@ -420,7 +420,7 @@ function OrgTable({ Tabs }: { Tabs: React.ReactNode }) {
<ActionButton
icon="common/administrator"
text={t('account_team:manage_member')}
onClick={() => setManageMemberOrg(currentOrg ?? rootOrg)}
onClick={() => setManageMemberOrg(currentOrg)}
/>
{currentOrg && currentOrg?.path !== '' && (
<>

View File

@@ -94,7 +94,7 @@ async function handler(
per: WritePermissionVal
});
if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
if (fileAuthRes && String(fileAuthRes.tmbId) !== String(tmbId) && !fileAuthRes.isRoot) {
return Promise.reject(CommonErrEnum.unAuthFile);
}

View File

@@ -15,18 +15,19 @@ import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTex
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const formatIndexes = async ({
indexes,
q,
a = '',
indexSize
indexSize,
maxIndexSize
}: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string;
a?: string;
indexSize: number;
maxIndexSize: number;
}): Promise<
{
type: `${DatasetDataIndexTypeEnum}`;
@@ -46,9 +47,12 @@ const formatIndexes = async ({
}) => {
const qChunks = splitText2Chunks({
text: q,
chunkSize: indexSize
chunkSize: indexSize,
maxSize: maxIndexSize
}).chunks;
const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
const aChunks = a
? splitText2Chunks({ text: a, chunkSize: indexSize, maxSize: maxIndexSize }).chunks
: [];
return [
...qChunks.map((text) => ({
@@ -100,7 +104,11 @@ const formatIndexes = async ({
// If the text exceeds the index token limit, split it into smaller chunks
const tokens = await countPromptTokens(item.text);
if (tokens > indexSize) {
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
const splitText = splitText2Chunks({
text: item.text,
chunkSize: 512,
maxSize: maxIndexSize
}).chunks;
return splitText.map((text) => ({
text,
type: item.type
@@ -151,7 +159,8 @@ export async function insertData2Dataset({
indexes,
q,
a,
indexSize
indexSize,
maxIndexSize: embModel.maxToken
});
// insert to vector store
@@ -236,7 +245,13 @@ export async function updateData2Dataset({
if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// 2. Compute indexes
const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
const formatIndexesResult = await formatIndexes({
indexes,
q,
a,
indexSize,
maxIndexSize: getEmbeddingModel(model).maxToken
});
// 3. Patch indexes, create, update, delete
const patchResult: PatchIndexesProps[] = [];