4.6.7 first pr (#726)

Author: Archer
Date: 2024-01-10 23:35:04 +08:00
Committed by: GitHub
Parent: 414b693303
Commit: 006ad17c6a
186 changed files with 2996 additions and 1838 deletions


@@ -4,14 +4,8 @@ import { useSelectFile } from '@/web/common/file/hooks/useSelectFile';
 import { useToast } from '@/web/common/hooks/useToast';
 import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { simpleText } from '@fastgpt/global/common/string/tools';
-import {
-  fileDownload,
-  readCsvContent,
-  readPdfContent,
-  readDocContent
-} from '@/web/common/file/utils';
-import { readFileRawText, readMdFile, readHtmlFile } from '@fastgpt/web/common/file/read';
-import { getUploadMdImgController, uploadFiles } from '@/web/common/file/controller';
+import { fileDownload, readCsvContent } from '@/web/common/file/utils';
+import { getUploadBase64ImgController, uploadFiles } from '@/web/common/file/controller';
 import { Box, Flex, useDisclosure, type BoxProps } from '@chakra-ui/react';
 import React, { DragEvent, useCallback, useState } from 'react';
 import { useTranslation } from 'next-i18next';
@@ -25,6 +19,8 @@ import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
 import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant';
 import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d';
 import { UrlFetchResponse } from '@fastgpt/global/common/file/api.d';
+import { readFileRawContent } from '@fastgpt/web/common/file/read/index';
+import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';

 const UrlFetchModal = dynamic(() => import('./UrlFetchModal'));
 const CreateFileModal = dynamic(() => import('./CreateFileModal'));
@@ -168,36 +164,22 @@ const FileSelect = ({
       }

       // parse and upload files
-      let text = await (async () => {
-        switch (extension) {
-          case 'txt':
-            return readFileRawText(file);
-          case 'md':
-            return readMdFile({
-              file,
-              uploadImgController: (base64Img) =>
-                getUploadMdImgController({ base64Img, metadata: { fileId } })
-            });
-          case 'html':
-            return readHtmlFile({
-              file,
-              uploadImgController: (base64Img) =>
-                getUploadMdImgController({ base64Img, metadata: { fileId } })
-            });
-          case 'pdf':
-            return readPdfContent(file);
-          case 'docx':
-            return readDocContent(file, {
-              fileId
-            });
-        }
-        return '';
-      })();
+      let { rawText } = await readFileRawContent({
+        file,
+        uploadBase64Controller: (base64Img) =>
+          getUploadBase64ImgController({
+            base64Img,
+            type: MongoImageTypeEnum.docImage,
+            metadata: {
+              fileId
+            }
+          })
+      });

-      if (text) {
-        text = simpleText(text);
+      if (rawText) {
+        rawText = simpleText(rawText);
         const { chunks, tokens } = splitText2Chunks({
-          text,
+          text: rawText,
           chunkLen,
           overlapRatio,
           customReg: customSplitChar ? [customSplitChar] : []
@@ -207,7 +189,7 @@ const FileSelect = ({
           id: nanoid(),
           filename: file.name,
           icon,
-          rawText: text,
+          rawText,
           tokens,
           type: DatasetCollectionTypeEnum.file,
           fileId,
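
Note on the hunk above: the per-extension switch (txt, md, html, pdf, docx, each wiring up its own reader and image-upload callback) collapses into a single readFileRawContent call that returns { rawText } and takes one uploadBase64Controller, with the stored image type pinned to MongoImageTypeEnum.docImage. Below is a minimal sketch of the shape such a helper could take; apart from the ({ file, uploadBase64Controller }) => Promise<{ rawText }> contract visible in the diff, the extension parsing and per-format branches are illustrative assumptions, and the real dispatch lives in @fastgpt/web/common/file/read.

// Sketch only: the per-format branches here are hypothetical stand-ins.
type UploadImgController = (base64Img: string) => Promise<string>;

async function readFileRawContentSketch({
  file,
  uploadBase64Controller
}: {
  file: File;
  uploadBase64Controller?: UploadImgController;
}): Promise<{ rawText: string }> {
  // Assumption: the extension is the lower-cased suffix of the file name.
  const extension = file.name.split('.').pop()?.toLowerCase() ?? '';
  switch (extension) {
    case 'txt':
      return { rawText: await file.text() };
    case 'md':
    case 'html':
      // A real reader would swap inline base64 images for URLs returned by
      // the injected uploadBase64Controller before returning the text.
      return { rawText: await file.text() };
    default:
      return Promise.reject(`Unsupported file extension: ${extension}`);
  }
}

Centralizing the dispatch this way means callers configure the image-upload policy once instead of repeating it per format, which is exactly what the FileSelect hunk above now does.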


@@ -10,10 +10,7 @@ const CsvImport = dynamic(() => import('./Csv'), {});
 import MyModal from '@/components/MyModal';
 import Provider from './Provider';
 import { useDatasetStore } from '@/web/core/dataset/store/dataset';
-import {
-  DatasetCollectionTrainingModeEnum,
-  TrainingModeEnum
-} from '@fastgpt/global/core/dataset/constant';
+import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';

 export enum ImportTypeEnum {
   chunk = 'chunk',
@@ -46,24 +43,21 @@ const ImportData = ({
       chunkOverlapRatio: 0.2,
       inputPrice: vectorModel?.inputPrice || 0,
       outputPrice: 0,
-      mode: TrainingModeEnum.chunk,
-      collectionTrainingType: DatasetCollectionTrainingModeEnum.chunk
+      collectionTrainingType: TrainingModeEnum.chunk
     },
     [ImportTypeEnum.qa]: {
       defaultChunkLen: agentModel?.maxContext * 0.55 || 8000,
       chunkOverlapRatio: 0,
       inputPrice: agentModel?.inputPrice || 0,
       outputPrice: agentModel?.outputPrice || 0,
-      mode: TrainingModeEnum.qa,
-      collectionTrainingType: DatasetCollectionTrainingModeEnum.qa
+      collectionTrainingType: TrainingModeEnum.qa
     },
     [ImportTypeEnum.csv]: {
       defaultChunkLen: 0,
       chunkOverlapRatio: 0,
       inputPrice: vectorModel?.inputPrice || 0,
       outputPrice: 0,
-      mode: TrainingModeEnum.chunk,
-      collectionTrainingType: DatasetCollectionTrainingModeEnum.manual
+      collectionTrainingType: TrainingModeEnum.chunk
     }
   };
   return map[importType];
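
The hunks above retire DatasetCollectionTrainingModeEnum: each import type now carries a single collectionTrainingType typed against TrainingModeEnum, rather than a (mode, collectionTrainingType) pair that could drift apart. One behavioral nuance worth flagging: the csv entry previously used DatasetCollectionTrainingModeEnum.manual and now maps to TrainingModeEnum.chunk. A reduced sketch of the resulting map follows; the diff only shows chunk = 'chunk' for ImportTypeEnum, so the qa and csv members (and the enum bodies generally) are inferred from the keys and values used above.

// Reduced sketch: only the field touched by this diff is kept per entry.
enum TrainingModeEnum {
  chunk = 'chunk',
  qa = 'qa'
}

enum ImportTypeEnum {
  chunk = 'chunk',
  qa = 'qa',
  csv = 'csv'
}

const map: Record<ImportTypeEnum, { collectionTrainingType: TrainingModeEnum }> = {
  [ImportTypeEnum.chunk]: { collectionTrainingType: TrainingModeEnum.chunk },
  [ImportTypeEnum.qa]: { collectionTrainingType: TrainingModeEnum.qa },
  // csv previously mapped to the removed enum's `manual`; it now trains as chunk.
  [ImportTypeEnum.csv]: { collectionTrainingType: TrainingModeEnum.chunk }
};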


@@ -16,10 +16,7 @@ import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { useToast } from '@/web/common/hooks/useToast';
 import { getErrText } from '@fastgpt/global/common/error/utils';
-import {
-  DatasetCollectionTrainingModeEnum,
-  TrainingModeEnum
-} from '@fastgpt/global/core/dataset/constant';
+import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant';
 import { Box, Flex, Image, useTheme } from '@chakra-ui/react';
 import { CloseIcon } from '@chakra-ui/icons';
 import DeleteIcon, { hoverDeleteStyles } from '@fastgpt/web/components/common/Icon/delete';
@@ -104,7 +101,6 @@ const Provider = ({
   parentId,
   inputPrice,
   outputPrice,
-  mode,
   collectionTrainingType,
   vectorModel,
   agentModel,
@@ -118,8 +114,7 @@ const Provider = ({
   parentId: string;
   inputPrice: number;
   outputPrice: number;
-  mode: `${TrainingModeEnum}`;
-  collectionTrainingType: `${DatasetCollectionTrainingModeEnum}`;
+  collectionTrainingType: `${TrainingModeEnum}`;
   vectorModel: string;
   agentModel: string;
   defaultChunkLen: number;
@@ -147,14 +142,14 @@ const Provider = ({
   const totalTokens = useMemo(() => files.reduce((sum, file) => sum + file.tokens, 0), [files]);

   const price = useMemo(() => {
-    if (mode === TrainingModeEnum.qa) {
+    if (collectionTrainingType === TrainingModeEnum.qa) {
       const inputTotal = totalTokens * inputPrice;
       const outputTotal = totalTokens * 0.5 * outputPrice;
       return formatModelPrice2Read(inputTotal + outputTotal);
     }
     return formatModelPrice2Read(totalTokens * inputPrice);
-  }, [inputPrice, mode, outputPrice, totalTokens]);
+  }, [collectionTrainingType, inputPrice, outputPrice, totalTokens]);

   /*
     start upload data
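
With the mode prop gone, the price estimate keys off collectionTrainingType directly. Restated as standalone arithmetic (formatModelPrice2Read, FastGPT's display formatter, is left out here): QA training bills every token as model input plus an estimated output of half the input length, while chunk training bills input tokens only.

// Standalone restatement of the pricing rule in the hunk above.
const estimatePrice = (
  trainingType: 'chunk' | 'qa',
  totalTokens: number,
  inputPrice: number, // per-token price for model input
  outputPrice: number // per-token price for model output
): number =>
  trainingType === 'qa'
    ? totalTokens * inputPrice + totalTokens * 0.5 * outputPrice // QA: full input + ~half-length output
    : totalTokens * inputPrice; // chunk: embedding input only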
@@ -169,7 +164,7 @@ const Provider = ({
     for await (const file of files) {
       // create training bill
       const billId = await postCreateTrainingBill({
-        name: t('dataset.collections.Create Training Data', { filename: file.filename }),
+        name: file.filename,
         vectorModel,
         agentModel
       });
@@ -180,11 +175,15 @@ const Provider = ({
         parentId,
         name: file.filename,
         type: file.type,
+        trainingType: collectionTrainingType,
+        chunkSize: chunkLen,
+        chunkSplitter: customSplitChar,
+        qaPrompt: collectionTrainingType === TrainingModeEnum.qa ? prompt : '',
         fileId: file.fileId,
         rawLink: file.rawLink,
-        chunkSize: chunkLen,
-        trainingType: collectionTrainingType,
-        qaPrompt: mode === TrainingModeEnum.qa ? prompt : '',
+        rawTextLength: file.rawText.length,
+        hashRawText: hashStr(file.rawText),
         metadata: file.metadata
@@ -195,8 +194,8 @@ const Provider = ({
       const { insertLen } = await chunksUpload({
         collectionId,
         billId,
+        trainingMode: collectionTrainingType,
         chunks,
-        mode,
         onUploading: (insertLen) => {
           setSuccessChunks((state) => state + insertLen);
         },
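
The remaining hunks finish removing mode from the upload path: the training bill is named after the file itself, the collection-creation call now records trainingType, chunkSplitter, rawTextLength, and hashRawText up front, and chunksUpload receives trainingMode: collectionTrainingType. Storing length and hash at creation reads like a content fingerprint; a sketch of that reading follows, with the caveat that the motive is inferred, not stated in the diff.

// Sketch: hashStr is imported from @fastgpt/global/common/string/tools in
// the diff above; treating (length, hash) as a change-detection fingerprint
// is an inference about intent, not something the diff confirms.
const fingerprint = (rawText: string) => ({
  rawTextLength: rawText.length, // cheap first-pass inequality check
  hashRawText: hashStr(rawText) // stable hash for exact comparison
});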