perf: dataset import params code (#4875)

* perf: dataset import params code

* perf: api dataset code

* model
This commit is contained in:
Archer
2025-05-23 10:40:25 +08:00
committed by GitHub
parent 9af92d1eae
commit fae76e887a
23 changed files with 366 additions and 295 deletions

View File

@@ -1,6 +1,54 @@
{
"provider": "Claude",
"list": [
{
"model": "claude-sonnet-4-20250514",
"name": "claude-sonnet-4-20250514",
"maxContext": 200000,
"maxResponse": 8000,
"quoteMaxToken": 100000,
"maxTemperature": 1,
"showTopP": true,
"showStopSign": true,
"vision": true,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm"
},
{
"model": "claude-opus-4-20250514",
"name": "claude-opus-4-20250514",
"maxContext": 200000,
"maxResponse": 4096,
"quoteMaxToken": 100000,
"maxTemperature": 1,
"showTopP": true,
"showStopSign": true,
"vision": true,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm"
},
{
"model": "claude-3-7-sonnet-20250219",
"name": "claude-3-7-sonnet-20250219",

View File

@@ -25,6 +25,30 @@
"showTopP": true,
"showStopSign": true
},
{
"model": "gemini-2.5-flash-preview-04-17",
"name": "gemini-2.5-flash-preview-04-17",
"maxContext": 1000000,
"maxResponse": 8000,
"quoteMaxToken": 60000,
"maxTemperature": 1,
"vision": true,
"toolChoice": true,
"functionCall": false,
"defaultSystemChatPrompt": "",
"datasetProcess": true,
"usedInClassify": true,
"customCQPrompt": "",
"usedInExtractFields": true,
"usedInQueryExtension": true,
"customExtractPrompt": "",
"usedInToolCall": true,
"defaultConfig": {},
"fieldMap": {},
"type": "llm",
"showTopP": true,
"showStopSign": true
},
{
"model": "gemini-2.0-flash",
"name": "gemini-2.0-flash",

View File

@@ -74,6 +74,15 @@ export const createCollectionAndInsertData = async ({
llmModel: getLLMModel(dataset.agentModel)
});
const chunkSplitter = computeChunkSplitter(createCollectionParams);
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
delete createCollectionParams.chunkTriggerType;
delete createCollectionParams.chunkTriggerMinSize;
delete createCollectionParams.dataEnhanceCollectionName;
delete createCollectionParams.imageIndex;
delete createCollectionParams.autoIndexes;
delete createCollectionParams.indexSize;
delete createCollectionParams.qaPrompt;
}
// 1. split chunks
const chunks = rawText2Chunks({

View File

@@ -163,7 +163,7 @@ export const readApiServerFileContent = async ({
title?: string;
rawText: string;
}> => {
const data = (
return (
await getApiDatasetRequest({
apiServer,
yuqueServer,
@@ -175,10 +175,6 @@ export const readApiServerFileContent = async ({
apiFileId,
customPdfParse
});
if (data) {
return data;
}
return Promise.reject(Error);
};
export const rawText2Chunks = ({

View File

@@ -1,10 +1,12 @@
import { getMongoModel, Schema } from '../../common/mongo';
import {
ChunkSettingModeEnum,
ChunkTriggerConfigTypeEnum,
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum,
DatasetTypeEnum,
DatasetTypeMap
DatasetTypeMap,
ParagraphChunkAIModeEnum
} from '@fastgpt/global/core/dataset/constants';
import {
TeamCollectionName,
@@ -15,12 +17,22 @@ import type { DatasetSchemaType } from '@fastgpt/global/core/dataset/type.d';
export const DatasetCollectionName = 'datasets';
export const ChunkSettings = {
imageIndex: Boolean,
autoIndexes: Boolean,
trainingType: {
type: String,
enum: Object.values(DatasetCollectionDataProcessModeEnum)
},
chunkTriggerType: {
type: String,
enum: Object.values(ChunkTriggerConfigTypeEnum)
},
chunkTriggerMinSize: Number,
dataEnhanceCollectionName: Boolean,
imageIndex: Boolean,
autoIndexes: Boolean,
chunkSettingMode: {
type: String,
enum: Object.values(ChunkSettingModeEnum)
@@ -29,6 +41,13 @@ export const ChunkSettings = {
type: String,
enum: Object.values(DataChunkSplitModeEnum)
},
paragraphChunkAIMode: {
type: String,
enum: Object.values(ParagraphChunkAIModeEnum)
},
paragraphChunkDeep: Number,
paragraphChunkMinSize: Number,
paragraphChunkMaxSize: Number,
chunkSize: Number,
chunkSplitter: String,
@@ -115,9 +134,7 @@ const DatasetSchema = new Schema({
// abandoned
autoSync: Boolean,
externalReadUrl: {
type: String
},
externalReadUrl: String,
defaultPermission: Number
});