perf: backup import (#4866)

* i18n

* remove invalid code

* perf: backup import

* backup tip

* fix: indexsize invalid
Archer
2025-05-22 15:53:51 +08:00
committed by GitHub
parent dd3c251603
commit 88bd3aaa9e
67 changed files with 751 additions and 388 deletions

View File

@@ -146,7 +146,8 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
tmbId,
url: previewUrl,
relatedId: apiFileId,
customPdfParse
customPdfParse,
getFormatText: true
});
return {
title,
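
Here the API-dataset preview read starts passing getFormatText: true down to the shared file reader (the reader's signature change appears further down in this diff). The flag's exact semantics are not part of this diff; below is a minimal sketch of the assumed behaviour, namely that it chooses between the parser's formatted output and the plain-text fallback:

// Assumption, not shown in this diff: the buffer reader produces a plain rawText and,
// for some parsers, a formatText (e.g. markdown from customPdfParse); getFormatText
// decides which of the two is returned to the caller.
const pickText = (rawText: string, formatText: string | undefined, getFormatText?: boolean) =>
  getFormatText ? formatText || rawText : rawText;

pickText('plain text', '# markdown text', true); // => '# markdown text'
pickText('plain text', undefined, true); // => 'plain text' (falls back when no formatted output exists)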

View File

@@ -36,13 +36,14 @@ import {
computeChunkSplitter,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
export const createCollectionAndInsertData = async ({
dataset,
rawText,
relatedId,
createCollectionParams,
isQAImport = false,
backupParse = false,
billId,
session
}: {
@@ -50,8 +51,8 @@ export const createCollectionAndInsertData = async ({
rawText: string;
relatedId?: string;
createCollectionParams: CreateOneCollectionParams;
backupParse?: boolean;
isQAImport?: boolean;
billId?: string;
session?: ClientSession;
}) => {
@@ -81,7 +82,7 @@ export const createCollectionAndInsertData = async ({
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport
backupParse
});
// 2. auth limit
@@ -157,6 +158,10 @@ export const createCollectionAndInsertData = async ({
billId: traingBillId,
data: chunks.map((item, index) => ({
...item,
indexes: item.indexes?.map((text) => ({
type: DatasetDataIndexTypeEnum.custom,
text
})),
chunkIndex: index
})),
session
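
With the new DatasetDataIndexTypeEnum import, any indexes strings a chunk carries (the extra columns of a backup CSV row) are turned into custom dataset indexes before the training data is queued. A minimal standalone sketch of that mapping; the sample chunk and the BackupChunk type are invented, while the import and the map call mirror the hunk:

import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';

// Illustrative shape matching the new rawText2Chunks output (q / a / indexes).
type BackupChunk = { q: string; a: string; indexes?: string[] };

const chunk: BackupChunk = {
  q: 'What is FastGPT?',
  a: 'An LLM knowledge-base platform.',
  indexes: ['fastgpt intro', 'knowledge base']
};

// Same mapping as in the hunk: every extra index string becomes a custom index object.
const trainingItem = {
  ...chunk,
  indexes: chunk.indexes?.map((text) => ({
    type: DatasetDataIndexTypeEnum.custom,
    text
  })),
  chunkIndex: 0
};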

View File

@@ -2,7 +2,6 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { parseCsvTable2Chunks } from './training/utils';
import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
@@ -13,18 +12,21 @@ import {
type YuqueServer
} from '@fastgpt/global/core/dataset/apiDataset';
import { useApiDatasetRequest } from './apiDataset/api';
import Papa from 'papaparse';
export const readFileRawTextByUrl = async ({
teamId,
tmbId,
url,
customPdfParse,
getFormatText,
relatedId
}: {
teamId: string;
tmbId: string;
url: string;
customPdfParse?: boolean;
getFormatText?: boolean;
relatedId: string; // externalFileId / apiFileId
}) => {
const response = await axios({
@@ -38,7 +40,7 @@ export const readFileRawTextByUrl = async ({
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
isQAImport: false,
getFormatText,
extension,
teamId,
tmbId,
@@ -62,21 +64,21 @@ export const readDatasetSourceRawText = async ({
tmbId,
type,
sourceId,
isQAImport,
selector,
externalFileId,
apiServer,
feishuServer,
yuqueServer,
customPdfParse
customPdfParse,
getFormatText
}: {
teamId: string;
tmbId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
customPdfParse?: boolean;
getFormatText?: boolean;
isQAImport?: boolean; // csv data
selector?: string; // link selector
externalFileId?: string; // external file dataset
apiServer?: APIFileServer; // api dataset
@@ -92,8 +94,8 @@ export const readDatasetSourceRawText = async ({
tmbId,
bucketName: BucketNameEnum.dataset,
fileId: sourceId,
isQAImport,
customPdfParse
customPdfParse,
getFormatText
});
return {
title: filename,
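
readDatasetSourceRawText drops isQAImport and threads getFormatText through instead, here into readFileContentFromMongo for files stored in GridFS. A hedged sketch of a caller after the change; the stub implementation, the ids, the 'fileLocal' value, and the return shape are illustrative, and only the parameter names come from this diff:

// Stand-in stub so the sketch runs on its own; the real function is the one changed above.
const readDatasetSourceRawText = async (params: {
  teamId: string;
  tmbId: string;
  type: string; // DatasetSourceReadTypeEnum in the repo
  sourceId: string;
  customPdfParse?: boolean;
  getFormatText?: boolean;
}) => ({ title: 'example.md', rawText: '# example' });

const { title, rawText } = await readDatasetSourceRawText({
  teamId: 'team-id', // illustrative ids
  tmbId: 'tmb-id',
  type: 'fileLocal', // assumed enum member for a dataset-bucket file
  sourceId: 'gridfs-file-id',
  customPdfParse: true,
  getFormatText: true // new flag; isQAImport is no longer accepted
});
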
@@ -183,16 +185,38 @@ export const readApiServerFileContent = async ({
export const rawText2Chunks = ({
rawText,
isQAImport,
backupParse,
chunkSize = 512,
...splitProps
}: {
rawText: string;
isQAImport?: boolean;
} & TextSplitProps) => {
if (isQAImport) {
const { chunks } = parseCsvTable2Chunks(rawText);
return chunks;
backupParse?: boolean;
tableParse?: boolean;
} & TextSplitProps): {
q: string;
a: string;
indexes?: string[];
}[] => {
const parseDatasetBackup2Chunks = (rawText: string) => {
const csvArr = Papa.parse(rawText).data as string[][];
console.log(rawText, csvArr);
const chunks = csvArr
.slice(1)
.map((item) => ({
q: item[0] || '',
a: item[1] || '',
indexes: item.slice(2)
}))
.filter((item) => item.q || item.a);
return {
chunks
};
};
if (backupParse) {
return parseDatasetBackup2Chunks(rawText).chunks;
}
const { chunks } = splitText2Chunks({
@@ -203,6 +227,7 @@ export const rawText2Chunks = ({
return chunks.map((item) => ({
q: item,
a: ''
a: '',
indexes: []
}));
};
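
Putting the new branch together: row 1 of the backup CSV is treated as a header, column 1 becomes q, column 2 becomes a, and any further columns become index texts; rows with neither q nor a are dropped. A self-contained sketch with an invented sample CSV, mirroring the Papa.parse / slice / map / filter steps above:

import Papa from 'papaparse';

// The sample backup CSV is invented; the header row is skipped as in the code above.
const backupCsv = [
  'q,a,index1',
  '"What is FastGPT?","An LLM knowledge-base platform.","fastgpt intro"',
  '"How do I restore a backup?","Upload the exported CSV file."'
].join('\n');

const rows = Papa.parse(backupCsv).data as string[][];

const chunks = rows
  .slice(1) // drop the header row
  .map((item) => ({
    q: item[0] || '',
    a: item[1] || '',
    indexes: item.slice(2) // remaining columns become custom index texts
  }))
  .filter((item) => item.q || item.a);

// chunks[0] => { q: 'What is FastGPT?', a: 'An LLM knowledge-base platform.', indexes: ['fastgpt intro'] }
// chunks[1] => { q: 'How do I restore a backup?', a: 'Upload the exported CSV file.', indexes: [] }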

View File

@@ -1,6 +1,5 @@
export enum ImportDataSourceEnum {
fileLocal = 'fileLocal',
fileLink = 'fileLink',
fileCustom = 'fileCustom',
tableLocal = 'tableLocal'
fileCustom = 'fileCustom'
}

View File

@@ -1,16 +0,0 @@
import Papa from 'papaparse';
export const parseCsvTable2Chunks = (rawText: string) => {
const csvArr = Papa.parse(rawText).data as string[][];
const chunks = csvArr
.map((item) => ({
q: item[0] || '',
a: item[1] || ''
}))
.filter((item) => item.q || item.a);
return {
chunks
};
};

View File

@@ -223,28 +223,29 @@ const toolChoice = async (props: ActionProps) => {
}
];
const body = llmCompletionsBodyFormat(
{
stream: true,
model: extractModel.model,
temperature: 0.01,
messages: filterMessages,
tools,
tool_choice: { type: 'function', function: { name: agentFunName } }
},
extractModel
);
const { response } = await createChatCompletion({
body: llmCompletionsBodyFormat(
{
stream: true,
model: extractModel.model,
temperature: 0.01,
messages: filterMessages,
tools,
tool_choice: { type: 'function', function: { name: agentFunName } }
},
extractModel
),
body,
userKey: externalProvider.openaiAccount
});
const { toolCalls, usage } = await formatLLMResponse(response);
const { text, toolCalls, usage } = await formatLLMResponse(response);
const arg: Record<string, any> = (() => {
try {
return json5.parse(toolCalls?.[0]?.function?.arguments || '');
} catch (error) {
console.log(agentFunction.parameters);
console.log(toolCalls?.[0]?.function);
console.log('body', body);
console.log('AI response', text, toolCalls?.[0]?.function);
console.log('Your model may not support tool_call', error);
return {};
}
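
The reason body is hoisted out of the createChatCompletion call shows up in the catch branch: the exact request can now be logged next to the plain-text answer that formatLLMResponse now also returns, which makes models that cannot do tool calls easy to spot. A self-contained sketch of that fallback; parseToolArgs is an illustrative name, not something from the repo:

import json5 from 'json5';

// Same parse-or-log fallback as the hunk above, with the request body kept in scope.
const parseToolArgs = (
  text: string,
  toolCalls: { function?: { name?: string; arguments?: string } }[] | undefined,
  body: unknown
): Record<string, any> => {
  try {
    return json5.parse(toolCalls?.[0]?.function?.arguments || '');
  } catch (error) {
    console.log('body', body);
    console.log('AI response', text, toolCalls?.[0]?.function);
    console.log('Your model may not support tool_call', error);
    return {};
  }
};

// e.g. a model that answers in plain text instead of emitting a tool call:
parseToolArgs('Sorry, I cannot call functions.', undefined, { model: 'some-model' }); // => {} after logging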

View File

@@ -211,12 +211,12 @@ export const getFileContentFromLinks = async ({
// Read file
const { rawText } = await readRawContentByFileBuffer({
extension,
isQAImport: false,
teamId,
tmbId,
buffer,
encoding,
customPdfParse
customPdfParse,
getFormatText: true
});
// Add to buffer