Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-22 04:06:18 +00:00)
perf: backup import (#4866)

* i18n
* remove invalid code
* perf: backup import
* backup tip
* fix: indexsize invalid
@@ -146,7 +146,8 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
     tmbId,
     url: previewUrl,
     relatedId: apiFileId,
-    customPdfParse
+    customPdfParse,
+    getFormatText: true
   });
   return {
     title,
@@ -36,13 +36,14 @@ import {
   computeChunkSplitter,
   getLLMMaxChunkSize
 } from '@fastgpt/global/core/dataset/training/utils';
+import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';

 export const createCollectionAndInsertData = async ({
   dataset,
   rawText,
   relatedId,
   createCollectionParams,
-  isQAImport = false,
+  backupParse = false,
   billId,
   session
 }: {
@@ -50,8 +51,8 @@ export const createCollectionAndInsertData = async ({
   rawText: string;
   relatedId?: string;
   createCollectionParams: CreateOneCollectionParams;
+  backupParse?: boolean;

-  isQAImport?: boolean;
   billId?: string;
   session?: ClientSession;
 }) => {
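As a quick orientation for callers, here is a minimal TypeScript sketch of the option shape after this change; the parameter names come from the hunk above, while the local type name and concrete values are hypothetical.

// Hypothetical caller-side options: the CSV-specific isQAImport flag is gone
// and a generic backupParse flag marks dataset-backup imports instead.
type CreateCollectionAndInsertDataOptions = {
  rawText: string;
  relatedId?: string;
  backupParse?: boolean; // parse rawText as an exported dataset backup (CSV)
  billId?: string;
};

const options: CreateCollectionAndInsertDataOptions = {
  rawText: 'q,a,index1\nWhat is FastGPT?,A knowledge-base platform.,intro',
  backupParse: true // new flag; isQAImport is no longer accepted
};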
@@ -81,7 +82,7 @@ export const createCollectionAndInsertData = async ({
     maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
     overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
     customReg: chunkSplitter ? [chunkSplitter] : [],
-    isQAImport
+    backupParse
   });

   // 2. auth limit
@@ -157,6 +158,10 @@ export const createCollectionAndInsertData = async ({
       billId: traingBillId,
       data: chunks.map((item, index) => ({
         ...item,
+        indexes: item.indexes?.map((text) => ({
+          type: DatasetDataIndexTypeEnum.custom,
+          text
+        })),
         chunkIndex: index
       })),
       session
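To illustrate what the added indexes mapping produces, a standalone sketch that mirrors the hunk above with local types; representing DatasetDataIndexTypeEnum.custom by the literal 'custom' is an assumption made only for this example.

// Each chunk's extra index strings become { type, text } index objects and the
// array position is recorded as chunkIndex before the data is queued for training.
type Chunk = { q: string; a: string; indexes?: string[] };

const chunks: Chunk[] = [
  { q: 'What is FastGPT?', a: 'A knowledge-base platform.', indexes: ['fastgpt intro'] }
];

const trainingData = chunks.map((item, index) => ({
  ...item,
  indexes: item.indexes?.map((text) => ({ type: 'custom' as const, text })),
  chunkIndex: index
}));

console.log(trainingData[0]);
// { q: '...', a: '...', indexes: [{ type: 'custom', text: 'fastgpt intro' }], chunkIndex: 0 }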
@@ -2,7 +2,6 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
 import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
 import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
 import { urlsFetch } from '../../common/string/cheerio';
-import { parseCsvTable2Chunks } from './training/utils';
 import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import axios from 'axios';
 import { readRawContentByFileBuffer } from '../../common/file/read/utils';
@@ -13,18 +12,21 @@ import {
   type YuqueServer
 } from '@fastgpt/global/core/dataset/apiDataset';
 import { useApiDatasetRequest } from './apiDataset/api';
+import Papa from 'papaparse';

 export const readFileRawTextByUrl = async ({
   teamId,
   tmbId,
   url,
   customPdfParse,
+  getFormatText,
   relatedId
 }: {
   teamId: string;
   tmbId: string;
   url: string;
   customPdfParse?: boolean;
+  getFormatText?: boolean;
   relatedId: string; // externalFileId / apiFileId
 }) => {
   const response = await axios({
@@ -38,7 +40,7 @@ export const readFileRawTextByUrl = async ({

   const { rawText } = await readRawContentByFileBuffer({
     customPdfParse,
-    isQAImport: false,
+    getFormatText,
     extension,
     teamId,
     tmbId,
@@ -62,21 +64,21 @@ export const readDatasetSourceRawText = async ({
   tmbId,
   type,
   sourceId,
-  isQAImport,
   selector,
   externalFileId,
   apiServer,
   feishuServer,
   yuqueServer,
-  customPdfParse
+  customPdfParse,
+  getFormatText
 }: {
   teamId: string;
   tmbId: string;
   type: DatasetSourceReadTypeEnum;
   sourceId: string;
   customPdfParse?: boolean;
+  getFormatText?: boolean;

-  isQAImport?: boolean; // csv data
   selector?: string; // link selector
   externalFileId?: string; // external file dataset
   apiServer?: APIFileServer; // api dataset
@@ -92,8 +94,8 @@ export const readDatasetSourceRawText = async ({
       tmbId,
       bucketName: BucketNameEnum.dataset,
       fileId: sourceId,
-      isQAImport,
-      customPdfParse
+      customPdfParse,
+      getFormatText
     });
     return {
       title: filename,
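For reference, a minimal sketch of the options a caller of readDatasetSourceRawText now passes; the field names follow the signature above, the values are placeholders, and the comment on getFormatText is an assumption about its intent.

// isQAImport is gone from the read path; getFormatText is threaded through instead
// (presumably controlling whether the reader returns formatted rather than plain text).
const readOptions = {
  teamId: 'team-id',
  tmbId: 'tmb-id',
  sourceId: 'mongo-file-id',
  customPdfParse: false,
  getFormatText: true
};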
@@ -183,16 +185,38 @@ export const readApiServerFileContent = async ({

 export const rawText2Chunks = ({
   rawText,
-  isQAImport,
+  backupParse,
   chunkSize = 512,
   ...splitProps
 }: {
   rawText: string;
-  isQAImport?: boolean;
-} & TextSplitProps) => {
-  if (isQAImport) {
-    const { chunks } = parseCsvTable2Chunks(rawText);
-    return chunks;
+  backupParse?: boolean;
+  tableParse?: boolean;
+} & TextSplitProps): {
+  q: string;
+  a: string;
+  indexes?: string[];
+}[] => {
+  const parseDatasetBackup2Chunks = (rawText: string) => {
+    const csvArr = Papa.parse(rawText).data as string[][];
+    console.log(rawText, csvArr);
+
+    const chunks = csvArr
+      .slice(1)
+      .map((item) => ({
+        q: item[0] || '',
+        a: item[1] || '',
+        indexes: item.slice(2)
+      }))
+      .filter((item) => item.q || item.a);
+
+    return {
+      chunks
+    };
+  };
+
+  if (backupParse) {
+    return parseDatasetBackup2Chunks(rawText).chunks;
   }

   const { chunks } = splitText2Chunks({
@@ -203,6 +227,7 @@ export const rawText2Chunks = ({

   return chunks.map((item) => ({
     q: item,
-    a: ''
+    a: '',
+    indexes: []
   }));
 };
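To make the new backup branch concrete, here is a runnable standalone sketch that mirrors parseDatasetBackup2Chunks from the hunk above; the sample CSV content is invented for illustration, and only papaparse is assumed as a dependency.

import Papa from 'papaparse';

// The backup export is a CSV whose first row is a header; column 0 is q,
// column 1 is a, and any remaining columns become custom index strings.
const backupCsv = [
  'q,a,indexes',
  'What is FastGPT?,A knowledge-base QA platform.,fastgpt overview',
  'How do I restore a backup?,Upload the exported CSV file.,backup import,csv'
].join('\n');

const csvArr = Papa.parse(backupCsv).data as string[][];

const chunks = csvArr
  .slice(1) // drop the header row
  .map((item) => ({
    q: item[0] || '',
    a: item[1] || '',
    indexes: item.slice(2)
  }))
  .filter((item) => item.q || item.a);

console.log(chunks);
// [
//   { q: 'What is FastGPT?', a: 'A knowledge-base QA platform.', indexes: ['fastgpt overview'] },
//   { q: 'How do I restore a backup?', a: 'Upload the exported CSV file.', indexes: ['backup import', 'csv'] }
// ]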
@@ -1,6 +1,5 @@
 export enum ImportDataSourceEnum {
   fileLocal = 'fileLocal',
   fileLink = 'fileLink',
-  fileCustom = 'fileCustom',
-  tableLocal = 'tableLocal'
+  fileCustom = 'fileCustom'
 }
@@ -1,16 +0,0 @@
-import Papa from 'papaparse';
-
-export const parseCsvTable2Chunks = (rawText: string) => {
-  const csvArr = Papa.parse(rawText).data as string[][];
-
-  const chunks = csvArr
-    .map((item) => ({
-      q: item[0] || '',
-      a: item[1] || ''
-    }))
-    .filter((item) => item.q || item.a);
-
-  return {
-    chunks
-  };
-};
@@ -223,28 +223,29 @@ const toolChoice = async (props: ActionProps) => {
     }
   ];

+  const body = llmCompletionsBodyFormat(
+    {
+      stream: true,
+      model: extractModel.model,
+      temperature: 0.01,
+      messages: filterMessages,
+      tools,
+      tool_choice: { type: 'function', function: { name: agentFunName } }
+    },
+    extractModel
+  );
   const { response } = await createChatCompletion({
-    body: llmCompletionsBodyFormat(
-      {
-        stream: true,
-        model: extractModel.model,
-        temperature: 0.01,
-        messages: filterMessages,
-        tools,
-        tool_choice: { type: 'function', function: { name: agentFunName } }
-      },
-      extractModel
-    ),
+    body,
     userKey: externalProvider.openaiAccount
   });
-  const { toolCalls, usage } = await formatLLMResponse(response);
+  const { text, toolCalls, usage } = await formatLLMResponse(response);

   const arg: Record<string, any> = (() => {
     try {
       return json5.parse(toolCalls?.[0]?.function?.arguments || '');
     } catch (error) {
-      console.log(agentFunction.parameters);
-      console.log(toolCalls?.[0]?.function);
+      console.log('body', body);
+      console.log('AI response', text, toolCalls?.[0]?.function);
       console.log('Your model may not support tool_call', error);
       return {};
     }
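The change above hoists the request body into a local so the catch branch can log exactly what was sent along with the model's raw text; a generic sketch of that pattern follows, with hypothetical names (callLLM is a stand-in, not a FastGPT API).

import json5 from 'json5';

// Build the body once, reuse it for the request, and dump it together with the
// raw answer when the tool-call arguments fail to parse.
async function extractWithToolCall(
  callLLM: (body: unknown) => Promise<{ text: string; toolArgs?: string }>,
  body: unknown
): Promise<Record<string, any>> {
  const { text, toolArgs } = await callLLM(body);
  try {
    return json5.parse(toolArgs || '');
  } catch (error) {
    console.log('body', body); // the exact payload sent to the model
    console.log('AI response', text); // what the model returned instead of valid tool arguments
    console.log('Your model may not support tool_call', error);
    return {};
  }
}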
@@ -211,12 +211,12 @@ export const getFileContentFromLinks = async ({
       // Read file
       const { rawText } = await readRawContentByFileBuffer({
         extension,
-        isQAImport: false,
         teamId,
         tmbId,
         buffer,
         encoding,
-        customPdfParse
+        customPdfParse,
+        getFormatText: true
       });

       // Add to buffer