Mirror of https://github.com/labring/FastGPT.git (synced 2025-08-07 16:30:40 +00:00)
4.6.7 first pr (#726)
@@ -4,14 +4,8 @@ import { useSelectFile } from '@/web/common/file/hooks/useSelectFile';
 import { useToast } from '@/web/common/hooks/useToast';
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { simpleText } from '@fastgpt/global/common/string/tools';
-import {
-  fileDownload,
-  readCsvContent,
-  readPdfContent,
-  readDocContent
-} from '@/web/common/file/utils';
-import { readFileRawText, readMdFile, readHtmlFile } from '@fastgpt/web/common/file/read';
-import { getUploadMdImgController, uploadFiles } from '@/web/common/file/controller';
+import { fileDownload, readCsvContent } from '@/web/common/file/utils';
+import { getUploadBase64ImgController, uploadFiles } from '@/web/common/file/controller';
 import { Box, Flex, useDisclosure, type BoxProps } from '@chakra-ui/react';
 import React, { DragEvent, useCallback, useState } from 'react';
 import { useTranslation } from 'next-i18next';
@@ -25,6 +19,8 @@ import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
 import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
 import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
 import { UrlFetchResponse } from '@fastgpt/global/common/file/api.d';
+import { readFileRawContent } from '@fastgpt/web/common/file/read/index';
+import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';

 const UrlFetchModal = dynamic(() => import('./UrlFetchModal'));
 const CreateFileModal = dynamic(() => import('./CreateFileModal'));
@@ -168,36 +164,22 @@ const FileSelect = ({
       }

       // parse and upload files
-      let text = await (async () => {
-        switch (extension) {
-          case 'txt':
-            return readFileRawText(file);
-          case 'md':
-            return readMdFile({
-              file,
-              uploadImgController: (base64Img) =>
-                getUploadMdImgController({ base64Img, metadata: { fileId } })
-            });
-          case 'html':
-            return readHtmlFile({
-              file,
-              uploadImgController: (base64Img) =>
-                getUploadMdImgController({ base64Img, metadata: { fileId } })
-            });
-          case 'pdf':
-            return readPdfContent(file);
-          case 'docx':
-            return readDocContent(file, {
-              fileId
-            });
-        }
-        return '';
-      })();
+      let { rawText } = await readFileRawContent({
+        file,
+        uploadBase64Controller: (base64Img) =>
+          getUploadBase64ImgController({
+            base64Img,
+            type: MongoImageTypeEnum.docImage,
+            metadata: {
+              fileId
+            }
+          })
+      });

-      if (text) {
-        text = simpleText(text);
+      if (rawText) {
+        rawText = simpleText(rawText);
         const { chunks, tokens } = splitText2Chunks({
-          text,
+          text: rawText,
           chunkLen,
           overlapRatio,
           customReg: customSplitChar ? [customSplitChar] : []
@@ -207,7 +189,7 @@ const FileSelect = ({
         id: nanoid(),
         filename: file.name,
         icon,
-        rawText: text,
+        rawText,
         tokens,
         type: DatasetCollectionTypeEnum.file,
         fileId,
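For reference, a minimal sketch of how the new unified read path fits together. It assumes only the signatures visible in the hunks above (readFileRawContent, getUploadBase64ImgController, simpleText, splitText2Chunks); the wrapper name parseAndChunkFile and the literal chunk settings are illustrative and not part of this commit:

import { readFileRawContent } from '@fastgpt/web/common/file/read/index';
import { getUploadBase64ImgController } from '@/web/common/file/controller';
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

// Sketch only: one reader for every supported extension replaces the old
// per-extension switch; embedded images go through the base64 upload controller.
async function parseAndChunkFile(file: File, fileId: string) {
  let { rawText } = await readFileRawContent({
    file,
    uploadBase64Controller: (base64Img: string) =>
      getUploadBase64ImgController({
        base64Img,
        type: MongoImageTypeEnum.docImage,
        metadata: { fileId }
      })
  });

  if (!rawText) return { rawText: '', chunks: [], tokens: 0 };

  // Normalize whitespace, then split the raw text into training chunks.
  rawText = simpleText(rawText);
  const { chunks, tokens } = splitText2Chunks({
    text: rawText,
    chunkLen: 700, // illustrative values; the component derives these from user settings
    overlapRatio: 0.2,
    customReg: []
  });

  return { rawText, chunks, tokens };
}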
@@ -10,10 +10,7 @@ const CsvImport = dynamic(() => import('./Csv'), {});
 import MyModal from '@/components/MyModal';
 import Provider from './Provider';
 import { useDatasetStore } from '@/web/core/dataset/store/dataset';
-import {
-  DatasetCollectionTrainingModeEnum,
-  TrainingModeEnum
-} from '@fastgpt/global/core/dataset/constant';
+import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';

 export enum ImportTypeEnum {
   chunk = 'chunk',
@@ -46,24 +43,21 @@ const ImportData = ({
       chunkOverlapRatio: 0.2,
       inputPrice: vectorModel?.inputPrice || 0,
       outputPrice: 0,
-      mode: TrainingModeEnum.chunk,
-      collectionTrainingType: DatasetCollectionTrainingModeEnum.chunk
+      collectionTrainingType: TrainingModeEnum.chunk
     },
     [ImportTypeEnum.qa]: {
       defaultChunkLen: agentModel?.maxContext * 0.55 || 8000,
       chunkOverlapRatio: 0,
       inputPrice: agentModel?.inputPrice || 0,
       outputPrice: agentModel?.outputPrice || 0,
-      mode: TrainingModeEnum.qa,
-      collectionTrainingType: DatasetCollectionTrainingModeEnum.qa
+      collectionTrainingType: TrainingModeEnum.qa
     },
     [ImportTypeEnum.csv]: {
       defaultChunkLen: 0,
       chunkOverlapRatio: 0,
       inputPrice: vectorModel?.inputPrice || 0,
       outputPrice: 0,
-      mode: TrainingModeEnum.chunk,
-      collectionTrainingType: DatasetCollectionTrainingModeEnum.manual
+      collectionTrainingType: TrainingModeEnum.chunk
     }
   };
   return map[importType];
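For context, a condensed sketch of the per-type defaults after this change, keyed on a single TrainingModeEnum for the collection training type. The field names come from the hunk above; the string values of the qa and csv enum members, the type alias, and the literal numbers are illustrative:

import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';

enum ImportTypeEnum {
  chunk = 'chunk',
  qa = 'qa',
  csv = 'csv'
}

type ImportDefaults = {
  defaultChunkLen: number;
  chunkOverlapRatio: number;
  inputPrice: number;
  outputPrice: number;
  collectionTrainingType: `${TrainingModeEnum}`;
};

// Illustrative values; the real map derives prices and chunk length from the
// selected vector / agent models.
const defaultsByType: Record<ImportTypeEnum, ImportDefaults> = {
  [ImportTypeEnum.chunk]: {
    defaultChunkLen: 700,
    chunkOverlapRatio: 0.2,
    inputPrice: 0,
    outputPrice: 0,
    collectionTrainingType: TrainingModeEnum.chunk
  },
  [ImportTypeEnum.qa]: {
    defaultChunkLen: 8000,
    chunkOverlapRatio: 0,
    inputPrice: 0,
    outputPrice: 0,
    collectionTrainingType: TrainingModeEnum.qa
  },
  [ImportTypeEnum.csv]: {
    defaultChunkLen: 0,
    chunkOverlapRatio: 0,
    inputPrice: 0,
    outputPrice: 0,
    collectionTrainingType: TrainingModeEnum.chunk
  }
};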
@@ -16,10 +16,7 @@ import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { useToast } from '@/web/common/hooks/useToast';
 import { getErrText } from '@fastgpt/global/common/error/utils';
-import {
-  DatasetCollectionTrainingModeEnum,
-  TrainingModeEnum
-} from '@fastgpt/global/core/dataset/constant';
+import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
 import { Box, Flex, Image, useTheme } from '@chakra-ui/react';
 import { CloseIcon } from '@chakra-ui/icons';
 import DeleteIcon, { hoverDeleteStyles } from '@fastgpt/web/components/common/Icon/delete';
@@ -104,7 +101,6 @@ const Provider = ({
   parentId,
   inputPrice,
   outputPrice,
-  mode,
   collectionTrainingType,
   vectorModel,
   agentModel,
@@ -118,8 +114,7 @@ const Provider = ({
   parentId: string;
   inputPrice: number;
   outputPrice: number;
-  mode: `${TrainingModeEnum}`;
-  collectionTrainingType: `${DatasetCollectionTrainingModeEnum}`;
+  collectionTrainingType: `${TrainingModeEnum}`;
   vectorModel: string;
   agentModel: string;
   defaultChunkLen: number;
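A note on the `${TrainingModeEnum}` prop type kept here: a template-literal type over an enum widens to the union of its string values, so callers can pass either enum members or plain strings. A tiny self-contained sketch (the enum is redeclared locally just for illustration):

enum TrainingModeEnum {
  chunk = 'chunk',
  qa = 'qa'
}

// `${TrainingModeEnum}` resolves to the union 'chunk' | 'qa'.
type TrainingMode = `${TrainingModeEnum}`;

const fromEnum: TrainingMode = TrainingModeEnum.qa; // ok
const fromString: TrainingMode = 'chunk'; // ok
// const invalid: TrainingMode = 'manual'; // type error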
@@ -147,14 +142,14 @@ const Provider = ({
   const totalTokens = useMemo(() => files.reduce((sum, file) => sum + file.tokens, 0), [files]);

   const price = useMemo(() => {
-    if (mode === TrainingModeEnum.qa) {
+    if (collectionTrainingType === TrainingModeEnum.qa) {
       const inputTotal = totalTokens * inputPrice;
       const outputTotal = totalTokens * 0.5 * outputPrice;

       return formatModelPrice2Read(inputTotal + outputTotal);
     }
     return formatModelPrice2Read(totalTokens * inputPrice);
-  }, [inputPrice, mode, outputPrice, totalTokens]);
+  }, [collectionTrainingType, inputPrice, outputPrice, totalTokens]);

   /*
     start upload data
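The pricing rule above, written out as a standalone function. The 0.5x output-token estimate for QA training comes straight from the hunk; the function name, the plain-number return (formatModelPrice2Read is omitted), and the example figures are illustrative:

// QA training bills input tokens plus an estimated 0.5x of them as output tokens;
// chunk training bills input tokens only.
function estimateTrainingPrice(
  totalTokens: number,
  inputPrice: number,
  outputPrice: number,
  collectionTrainingType: 'chunk' | 'qa'
): number {
  if (collectionTrainingType === 'qa') {
    const inputTotal = totalTokens * inputPrice;
    const outputTotal = totalTokens * 0.5 * outputPrice;
    return inputTotal + outputTotal;
  }
  return totalTokens * inputPrice;
}

// Example: 10,000 tokens at inputPrice 0.002 and outputPrice 0.004 per token unit
// -> qa: 10000 * 0.002 + 10000 * 0.5 * 0.004 = 40; chunk: 20.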
@@ -169,7 +164,7 @@ const Provider = ({
     for await (const file of files) {
       // create training bill
       const billId = await postCreateTrainingBill({
-        name: t('dataset.collections.Create Training Data', { filename: file.filename }),
+        name: file.filename,
         vectorModel,
         agentModel
       });
@@ -180,11 +175,15 @@ const Provider = ({
         parentId,
         name: file.filename,
         type: file.type,
+
+        trainingType: collectionTrainingType,
+        chunkSize: chunkLen,
+        chunkSplitter: customSplitChar,
+        qaPrompt: collectionTrainingType === TrainingModeEnum.qa ? prompt : '',
+
         fileId: file.fileId,
         rawLink: file.rawLink,
-        chunkSize: chunkLen,
-        trainingType: collectionTrainingType,
-        qaPrompt: mode === TrainingModeEnum.qa ? prompt : '',
+
         rawTextLength: file.rawText.length,
         hashRawText: hashStr(file.rawText),
         metadata: file.metadata
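For readability, a sketch of the reordered create-collection payload grouped the way the hunk does. Only the field names come from the diff; the type name and the field types are assumptions:

// Assumed shape of the per-file collection-create payload after this change.
type CreateFileCollectionBody = {
  parentId: string;
  name: string;
  type: string; // DatasetCollectionTypeEnum value, e.g. 'file'

  // training settings
  trainingType: 'chunk' | 'qa';
  chunkSize: number;
  chunkSplitter: string;
  qaPrompt: string; // only filled when trainingType === 'qa'

  // file source
  fileId?: string;
  rawLink?: string;

  // raw-text stats
  rawTextLength: number;
  hashRawText: string;
  metadata?: Record<string, any>;
};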
@@ -195,8 +194,8 @@ const Provider = ({
       const { insertLen } = await chunksUpload({
         collectionId,
         billId,
+        trainingMode: collectionTrainingType,
         chunks,
-        mode,
         onUploading: (insertLen) => {
           setSuccessChunks((state) => state + insertLen);
         },
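Finally, a minimal usage sketch of the renamed trainingMode parameter on chunksUpload. The call shape mirrors the hunk above; the declare stub and wrapper function are illustrative, and the chunk type reuses the PushDatasetDataChunkProps import already present in this PR:

import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';

// Assumed signature, matching the call shape in the hunk above.
declare function chunksUpload(params: {
  collectionId: string;
  billId: string;
  trainingMode: 'chunk' | 'qa';
  chunks: PushDatasetDataChunkProps[];
  onUploading: (insertLen: number) => void;
}): Promise<{ insertLen: number }>;

// Upload the split chunks and report incremental progress to the caller.
async function uploadFileChunks(
  collectionId: string,
  billId: string,
  trainingMode: 'chunk' | 'qa',
  chunks: PushDatasetDataChunkProps[],
  onProgress: (inserted: number) => void
) {
  const { insertLen } = await chunksUpload({
    collectionId,
    billId,
    trainingMode,
    chunks,
    onUploading: (n) => onProgress(n)
  });
  return insertLen;
}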