Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 13:03:50 +00:00)
Add image index and pdf parse (#3956)
* feat: think tag parse
* feat: parse think tag test
* feat: pdf parse ux
* feat: doc2x parse
* perf: rewrite training mode setting
* feat: image parse queue
* perf: image index
* feat: image parse process
* feat: add init sh
* fix: ts
@@ -186,20 +186,25 @@ export async function getDownloadStream({

export const readFileContentFromMongo = async ({
  teamId,
  tmbId,
  bucketName,
  fileId,
  isQAImport = false
  isQAImport = false,
  customPdfParse = false
}: {
  teamId: string;
  tmbId: string;
  bucketName: `${BucketNameEnum}`;
  fileId: string;
  isQAImport?: boolean;
  customPdfParse?: boolean;
}): Promise<{
  rawText: string;
  filename: string;
}> => {
  const bufferId = `${fileId}-${customPdfParse}`;
  // read buffer
  const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: fileId }, undefined, {
  const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: bufferId }, undefined, {
    ...readFromSecondary
  }).lean();
  if (fileBuffer) {
@@ -227,9 +232,11 @@ export const readFileContentFromMongo = async ({

  // Get raw text
  const { rawText } = await readRawContentByFileBuffer({
    customPdfParse,
    extension,
    isQAImport,
    teamId,
    tmbId,
    buffer: fileBuffers,
    encoding,
    metadata: {
@@ -240,7 +247,7 @@ export const readFileContentFromMongo = async ({
    // < 14M
    if (fileBuffers.length < 14 * 1024 * 1024 && rawText.trim()) {
      MongoRawTextBuffer.create({
        sourceId: fileId,
        sourceId: bufferId,
        rawText,
        metadata: {
          filename: file.filename
packages/service/common/file/image/utils.ts (new file, 27 lines)
@@ -0,0 +1,27 @@
import axios from 'axios';
import { addLog } from '../../system/log';
import { serverRequestBaseUrl } from '../../api/serverRequest';
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../utils';

export const getImageBase64 = async (url: string) => {
  addLog.debug(`Load image to base64: ${url}`);

  try {
    const response = await axios.get(url, {
      baseURL: serverRequestBaseUrl,
      responseType: 'arraybuffer',
      proxy: false
    });

    const base64 = Buffer.from(response.data, 'binary').toString('base64');
    const imageType =
      getFileContentTypeFromHeader(response.headers['content-type']) ||
      guessBase64ImageType(base64);

    return `data:${imageType};base64,${base64}`;
  } catch (error) {
    addLog.debug(`Load image to base64 failed: ${url}`);
    console.log(error);
    return Promise.reject(error);
  }
};
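For orientation, a minimal usage sketch of the new helper (the call site, import path, and image URL are hypothetical; the data-URI return shape and the rejection on failure come from the function above):

import { getImageBase64 } from '../../common/file/image/utils'; // path assumed from the callers in this commit

// Sketch: inline a locally served image as a data URI for an LLM vision request.
const embedLocalImage = async () => {
  try {
    const dataUri = await getImageBase64('/api/system/img/example.png'); // hypothetical path
    console.log(dataUri.slice(0, 30)); // e.g. "data:image/png;base64,iVBOR..."
  } catch (err) {
    // getImageBase64 rejects with the original axios error
    console.error('Failed to inline image', err);
  }
};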
@@ -1,18 +1,23 @@
import { uploadMongoImg } from '../image/controller';
import FormData from 'form-data';

import { WorkerNameEnum, runWorker } from '../../../worker/utils';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import type { ImageType, ReadFileResponse } from '../../../worker/readFile/type';
import axios from 'axios';
import { addLog } from '../../system/log';
import { batchRun } from '@fastgpt/global/common/fn/utils';
import { matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { htmlTable2Md, matchMdImgTextAndUpload } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { getErrText } from '@fastgpt/global/common/error/utils';
import { delay } from '@fastgpt/global/common/system/utils';
import { getNanoid } from '@fastgpt/global/common/string/tools';

export type readRawTextByLocalFileParams = {
  teamId: string;
  tmbId: string;
  path: string;
  encoding: string;
  customPdfParse?: boolean;
  metadata?: Record<string, any>;
};
export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParams) => {
@@ -22,46 +27,51 @@ export const readRawTextByLocalFile = async (params: readRawTextByLocalFileParam

  const buffer = await fs.promises.readFile(path);

  const { rawText } = await readRawContentByFileBuffer({
  return readRawContentByFileBuffer({
    extension,
    isQAImport: false,
    customPdfParse: params.customPdfParse,
    teamId: params.teamId,
    tmbId: params.tmbId,
    encoding: params.encoding,
    buffer,
    metadata: params.metadata
  });

  return {
    rawText
  };
};

export const readRawContentByFileBuffer = async ({
  extension,
  isQAImport,
  teamId,
  tmbId,

  extension,
  buffer,
  encoding,
  metadata
  metadata,
  customPdfParse = false,
  isQAImport = false
}: {
  isQAImport?: boolean;
  extension: string;
  teamId: string;
  tmbId: string;

  extension: string;
  buffer: Buffer;
  encoding: string;
  metadata?: Record<string, any>;
}) => {
  // Custom read file service
  const customReadfileUrl = process.env.CUSTOM_READ_FILE_URL;
  const customReadFileExtension = process.env.CUSTOM_READ_FILE_EXTENSION || '';
  const ocrParse = process.env.CUSTOM_READ_FILE_OCR || 'false';
  const readFileFromCustomService = async (): Promise<ReadFileResponse | undefined> => {
    if (
      !customReadfileUrl ||
      !customReadFileExtension ||
      !customReadFileExtension.includes(extension)
    )
      return;

  customPdfParse?: boolean;
  isQAImport: boolean;
}): Promise<ReadFileResponse> => {
  const systemParse = () =>
    runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
      extension,
      encoding,
      buffer,
      teamId
    });
  const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
    const url = global.systemEnv.customPdfParse?.url;
    const token = global.systemEnv.customPdfParse?.key;
    if (!url) return systemParse();

    const start = Date.now();
    addLog.info('Parsing files from an external service');
@@ -70,20 +80,18 @@ export const readRawContentByFileBuffer = async ({
    data.append('file', buffer, {
      filename: `file.${extension}`
    });
    data.append('extension', extension);
    data.append('ocr', ocrParse);
    const { data: response } = await axios.post<{
      success: boolean;
      message: string;
      data: {
        page: number;
        markdown: string;
        duration: number;
      };
    }>(customReadfileUrl, data, {
    }>(url, data, {
      timeout: 600000,
      headers: {
        ...data.getHeaders()
        ...data.getHeaders(),
        Authorization: token ? `Bearer ${token}` : undefined
      }
    });
@@ -92,21 +100,208 @@ export const readRawContentByFileBuffer = async ({
    const rawText = response.data.markdown;
    const { text, imageList } = matchMdImgTextAndUpload(rawText);

    createPdfParseUsage({
      teamId,
      tmbId,
      pages: response.data.page
    });

    return {
      rawText: text,
      formatText: rawText,
      imageList
    };
  };
  const parsePdfFromDoc2x = async (): Promise<ReadFileResponse> => {
    const doc2xKey = global.systemEnv.customPdfParse?.doc2xKey;
    if (!doc2xKey) return systemParse();

  let { rawText, formatText, imageList } =
    (await readFileFromCustomService()) ||
    (await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
      extension,
      encoding,
      buffer,
      teamId
    }));
    const parseTextImage = async (text: string) => {
      // Extract image links and convert to base64
      const imageList: { id: string; url: string }[] = [];
      const processedText = text.replace(/!\[.*?\]\((http[^)]+)\)/g, (match, url) => {
        const id = getNanoid();
        imageList.push({
          id,
          url
        });
        return ``;
      });

      let resultImageList: ImageType[] = [];
      await Promise.all(
        imageList.map(async (item) => {
          try {
            const response = await axios.get(item.url, { responseType: 'arraybuffer' });
            const mime = response.headers['content-type'] || 'image/jpeg';
            const base64 = response.data.toString('base64');
            resultImageList.push({
              uuid: item.id,
              mime,
              base64
            });
          } catch (error) {
            addLog.warn(`Failed to get image from ${item.url}: ${getErrText(error)}`);
          }
        })
      );

      return {
        text: processedText,
        imageList: resultImageList
      };
    };

    let startTime = Date.now();

    // 1. Get pre-upload URL first
    const { data: preupload_data } = await axios
      .post<{ code: string; data: { uid: string; url: string } }>(
        'https://v2.doc2x.noedgeai.com/api/v2/parse/preupload',
        null,
        {
          headers: {
            Authorization: `Bearer ${doc2xKey}`
          }
        }
      )
      .catch((error) => {
        return Promise.reject(
          `[Pre-upload Error] Failed to get pre-upload URL: ${getErrText(error)}`
        );
      });
    if (preupload_data?.code !== 'success') {
      return Promise.reject(`Failed to get pre-upload URL: ${JSON.stringify(preupload_data)}`);
    }

    const upload_url = preupload_data.data.url;
    const uid = preupload_data.data.uid;

    // 2. Upload file to pre-signed URL with binary stream
    const blob = new Blob([buffer], { type: 'application/pdf' });
    const response = await axios
      .put(upload_url, blob, {
        headers: {
          'Content-Type': 'application/pdf'
        }
      })
      .catch((error) => {
        return Promise.reject(`[Upload Error] Failed to upload file: ${getErrText(error)}`);
      });
    if (response.status !== 200) {
      return Promise.reject(`Upload failed with status ${response.status}: ${response.statusText}`);
    }

    await delay(5000);
    addLog.debug(`Uploaded file to Doc2x, uid: ${uid}`);
    // 3. Get the result by uid
    const checkResult = async (retry = 30) => {
      if (retry <= 0) {
        return Promise.reject(
          `[Parse Timeout Error] Failed to get result (uid: ${uid}): Process timeout`
        );
      }

      try {
        const { data: result_data } = await axios
          .get<{
            code: string;
            data: {
              progress: number;
              status: 'processing' | 'failed' | 'success';
              result: {
                pages: {
                  md: string;
                }[];
              };
            };
          }>(`https://v2.doc2x.noedgeai.com/api/v2/parse/status?uid=${uid}`, {
            headers: {
              Authorization: `Bearer ${doc2xKey}`
            }
          })
          .catch((error) => {
            return Promise.reject(
              `[Parse Status Error] Failed to get parse status: ${getErrText(error)}`
            );
          });

        // Error
        if (!['ok', 'success'].includes(result_data.code)) {
          return Promise.reject(
            `Failed to get result (uid: ${uid}): ${JSON.stringify(result_data)}`
          );
        }

        // Process
        if (['ready', 'processing'].includes(result_data.data.status)) {
          addLog.debug(`Waiting for the result, uid: ${uid}`);
          await delay(5000);
          return checkResult(retry - 1);
        }

        // Finish
        if (result_data.data.status === 'success') {
          const result = result_data.data.result.pages
            .map((page) => page.md)
            .join('\n')
            // Do some post-processing
            .replace(/\\[\(\)]/g, '$')
            .replace(/\\[\[\]]/g, '$$')
            .replace(/<img\s+src="([^"]+)"(?:\s*\?[^>]*)?(?:\s*\/>|>)/g, '')
            .replace(/<!-- Media -->/g, '')
            .replace(/<!-- Footnote -->/g, '')
            .replace(/\$(.+?)\s+\\tag\{(.+?)\}\$/g, '$$$1 \\qquad \\qquad ($2)$$')
            .replace(/\\text\{([^}]*?)(\b\w+)_(\w+\b)([^}]*?)\}/g, '\\text{$1$2\\_$3$4}');

          const { text, imageList } = await parseTextImage(htmlTable2Md(result));

          return {
            pages: result_data.data.result.pages.length,
            text,
            imageList
          };
        }
        return checkResult(retry - 1);
      } catch (error) {
        if (retry > 1) {
          await delay(100);
          return checkResult(retry - 1);
        }
        return Promise.reject(error);
      }
    };

    const { pages, text, imageList } = await checkResult();

    createPdfParseUsage({
      teamId,
      tmbId,
      pages
    });

    addLog.info(`Doc2x parse success, time: ${Date.now() - startTime}ms`);
    return {
      rawText: text,
      formatText: text,
      imageList
    };
  };
  // Custom read file service
  const pdfParseFn = async (): Promise<ReadFileResponse> => {
    if (!customPdfParse) return systemParse();
    if (global.systemEnv.customPdfParse?.url) return parsePdfFromCustomService();
    if (global.systemEnv.customPdfParse?.doc2xKey) return parsePdfFromDoc2x();

    return systemParse();
  };

  let { rawText, formatText, imageList } = await (async () => {
    if (extension === 'pdf') {
      return await pdfParseFn();
    }
    return await systemParse();
  })();

  // markdown data format
  if (imageList) {
@@ -142,5 +337,5 @@ export const readRawContentByFileBuffer = async ({
    }
  }

  return { rawText };
  return { rawText, formatText, imageList };
};
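For orientation, a minimal (hypothetical) call into the rewritten reader — the caller, file path, and option values are illustrative; only the parameter and result names come from the signature above. Whether Doc2x or the custom HTTP service handles the PDF is decided at runtime by pdfParseFn from global.systemEnv.customPdfParse:

import fs from 'fs';
import { readRawContentByFileBuffer } from '../read/utils'; // import path assumed for illustration

const demo = async () => {
  const buffer = await fs.promises.readFile('./example.pdf'); // hypothetical file
  const { rawText, formatText, imageList } = await readRawContentByFileBuffer({
    extension: 'pdf',
    buffer,
    encoding: 'utf-8',
    teamId: 'team-id-placeholder',
    tmbId: 'tmb-id-placeholder',
    customPdfParse: true, // opt in to the external PDF parser
    isQAImport: false
  });
  console.log(rawText.length, imageList?.length ?? 0);
};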
@@ -10,6 +10,11 @@ export const SERVICE_LOCAL_HOST =
export const initFastGPTConfig = (config?: FastGPTConfigFileType) => {
  if (!config) return;

  // Special config computed
  config.feConfigs.showCustomPdfParse =
    !!config.systemEnv.customPdfParse?.url || !!config.systemEnv.customPdfParse?.doc2xKey;
  config.feConfigs.customPdfParsePrice = config.systemEnv.customPdfParse?.price || 0;

  global.feConfigs = config.feConfigs;
  global.systemEnv = config.systemEnv;
  global.subPlans = config.subPlans;
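For reference, a sketch of the customPdfParse block this code reads from the system config. The field names (url, key, doc2xKey, price) are taken from the references in this commit; the values and surrounding shape are placeholders, not the project's documented config format:

// Hypothetical systemEnv excerpt driving the new PDF parse options
const systemEnvExample = {
  customPdfParse: {
    url: 'https://pdf-parser.example.com/v1/parse', // custom HTTP parse service (optional)
    key: 'sk-placeholder', // sent as a Bearer token to that service (optional)
    doc2xKey: 'doc2x-key-placeholder', // enables the Doc2x path (optional)
    price: 0 // points charged per parsed page
  }
};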
@@ -13,6 +13,11 @@ export const getDatasetModel = (model?: string) => {
    ?.find((item) => item.model === model || item.name === model) ?? getDefaultLLMModel()
  );
};
export const getVlmModel = (model?: string) => {
  return Array.from(global.llmModelMap.values())
    ?.filter((item) => item.vision)
    ?.find((item) => item.model === model || item.name === model);
};

export const getDefaultEmbeddingModel = () => global?.systemDefaultModel.embedding!;
export const getEmbeddingModel = (model?: string) => {
@@ -9,10 +9,9 @@ import type {
} from '@fastgpt/global/core/ai/type.d';
import axios from 'axios';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import { getFileContentTypeFromHeader, guessBase64ImageType } from '../../common/file/utils';
import { serverRequestBaseUrl } from '../../common/api/serverRequest';
import { i18nT } from '../../../web/i18n/utils';
import { addLog } from '../../common/system/log';
import { getImageBase64 } from '../../common/file/image/utils';

export const filterGPTMessageByMaxContext = async ({
  messages = [],
@@ -166,25 +165,13 @@ export const loadRequestMessages = async ({
        try {
          // If imgUrl is a local path, load image from local, and set url to base64
          if (imgUrl.startsWith('/') || process.env.MULTIPLE_DATA_TO_BASE64 === 'true') {
            addLog.debug('Load image from local server', {
              baseUrl: serverRequestBaseUrl,
              requestUrl: imgUrl
            });
            const response = await axios.get(imgUrl, {
              baseURL: serverRequestBaseUrl,
              responseType: 'arraybuffer',
              proxy: false
            });
            const base64 = Buffer.from(response.data, 'binary').toString('base64');
            const imageType =
              getFileContentTypeFromHeader(response.headers['content-type']) ||
              guessBase64ImageType(base64);
            const base64 = await getImageBase64(imgUrl);

            return {
              ...item,
              image_url: {
                ...item.image_url,
                url: `data:${imageType};base64,${base64}`
                url: base64
              }
            };
          }
@@ -223,7 +210,8 @@ export const loadRequestMessages = async ({
      await Promise.all(
        content.map(async (item) => {
          if (item.type === 'text') {
            if (item.text) return parseStringWithImages(item.text);
            // If it is an array, no need to parse images
            if (item.text) return item;
            return;
          }
          if (item.type === 'file_url') return; // LLM does not support file_url
@@ -108,7 +108,15 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
  return formattedFiles;
};

const getFileContent = async ({ teamId, apiFileId }: { teamId: string; apiFileId: string }) => {
const getFileContent = async ({
  teamId,
  tmbId,
  apiFileId
}: {
  teamId: string;
  tmbId: string;
  apiFileId: string;
}) => {
  const data = await request<APIFileContentResponse>(
    `/v1/file/content`,
    { id: apiFileId },
@@ -123,6 +131,7 @@ export const useApiDatasetRequest = ({ apiServer }: { apiServer: APIFileServer }
  if (previewUrl) {
    const rawText = await readFileRawTextByUrl({
      teamId,
      tmbId,
      url: previewUrl,
      relatedId: apiFileId
    });
@@ -1,6 +1,6 @@
import {
  DatasetCollectionTypeEnum,
  TrainingModeEnum
  DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d';
import { MongoDatasetCollection } from './schema';
@@ -19,13 +19,14 @@ import { predictDataLimitLength } from '../../../../global/core/dataset/utils';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { createTrainingUsage } from '../../../support/wallet/usage/controller';
import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { pushDataListToTrainingQueue } from '../training/controller';
import { MongoImage } from '../../../common/file/image/schema';
import { hashStr } from '@fastgpt/global/common/string/tools';
import { addDays } from 'date-fns';
import { MongoDatasetDataText } from '../data/dataTextSchema';
import { delay, retryFn } from '@fastgpt/global/common/system/utils';
import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils';

export const createCollectionAndInsertData = async ({
  dataset,
@@ -33,6 +34,7 @@ export const createCollectionAndInsertData = async ({
  relatedId,
  createCollectionParams,
  isQAImport = false,
  billId,
  session
}: {
  dataset: DatasetSchemaType;
@@ -41,13 +43,21 @@ export const createCollectionAndInsertData = async ({
  createCollectionParams: CreateOneCollectionParams;

  isQAImport?: boolean;
  billId?: string;
  session?: ClientSession;
}) => {
  // Adapter 4.9.0
  if (createCollectionParams.trainingType === DatasetCollectionDataProcessModeEnum.auto) {
    createCollectionParams.trainingType = DatasetCollectionDataProcessModeEnum.chunk;
    createCollectionParams.autoIndexes = true;
  }

  const teamId = createCollectionParams.teamId;
  const tmbId = createCollectionParams.tmbId;
  // Chunk split params
  const trainingType = createCollectionParams.trainingType || TrainingModeEnum.chunk;
  const chunkSize = createCollectionParams.chunkSize;
  const trainingType =
    createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
  const chunkSize = createCollectionParams.chunkSize || 512;
  const chunkSplitter = createCollectionParams.chunkSplitter;
  const qaPrompt = createCollectionParams.qaPrompt;
  const usageName = createCollectionParams.name;
@@ -56,7 +66,7 @@ export const createCollectionAndInsertData = async ({
  const chunks = rawText2Chunks({
    rawText,
    chunkLen: chunkSize,
    overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
    overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
    customReg: chunkSplitter ? [chunkSplitter] : [],
    isQAImport
  });
@@ -64,7 +74,14 @@ export const createCollectionAndInsertData = async ({
  // 2. auth limit
  await checkDatasetLimit({
    teamId,
    insertLen: predictDataLimitLength(trainingType, chunks)
    insertLen: predictDataLimitLength(
      getTrainingModeByCollection({
        trainingType,
        autoIndexes: createCollectionParams.autoIndexes,
        imageIndex: createCollectionParams.imageIndex
      }),
      chunks
    )
  });

  const fn = async (session: ClientSession) => {
@@ -89,15 +106,20 @@ export const createCollectionAndInsertData = async ({
    });

    // 4. create training bill
    const { billId } = await createTrainingUsage({
      teamId,
      tmbId,
      appName: usageName,
      billSource: UsageSourceEnum.training,
      vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
      agentModel: getLLMModel(dataset.agentModel)?.name,
      session
    });
    const traingBillId = await (async () => {
      if (billId) return billId;
      const { billId: newBillId } = await createTrainingUsage({
        teamId,
        tmbId,
        appName: usageName,
        billSource: UsageSourceEnum.training,
        vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
        agentModel: getLLMModel(dataset.agentModel)?.name,
        vllmModel: getVlmModel(dataset.vlmModel)?.name,
        session
      });
      return newBillId;
    })();

    // 5. insert to training queue
    const insertResults = await pushDataListToTrainingQueue({
@@ -107,9 +129,14 @@ export const createCollectionAndInsertData = async ({
      collectionId,
      agentModel: dataset.agentModel,
      vectorModel: dataset.vectorModel,
      trainingMode: trainingType,
      vlmModel: dataset.vlmModel,
      mode: getTrainingModeByCollection({
        trainingType,
        autoIndexes: createCollectionParams.autoIndexes,
        imageIndex: createCollectionParams.imageIndex
      }),
      prompt: qaPrompt,
      billId,
      billId: traingBillId,
      data: chunks.map((item, index) => ({
        ...item,
        chunkIndex: index
@@ -161,10 +188,15 @@ export async function createOneCollection({
  datasetId,
  type,

  trainingType = TrainingModeEnum.chunk,
  chunkSize = 512,
  chunkSplitter,
  qaPrompt,
  createTime,
  updateTime,

  hashRawText,
  rawTextLength,
  metadata = {},
  tags,

  nextSyncTime,

  fileId,
  rawLink,
@@ -172,15 +204,18 @@ export async function createOneCollection({
  externalFileUrl,
  apiFileId,

  hashRawText,
  rawTextLength,
  metadata = {},
  session,
  tags,
  // Parse settings
  customPdfParse,
  imageIndex,

  createTime,
  updateTime,
  nextSyncTime
  // Chunk settings
  trainingType = DatasetCollectionDataProcessModeEnum.chunk,
  autoIndexes,
  chunkSize = 512,
  chunkSplitter,
  qaPrompt,

  session
}: CreateOneCollectionParams) {
  // Create collection tags
  const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
@@ -196,25 +231,31 @@ export async function createOneCollection({
      name,
      type,

      trainingType,
      chunkSize,
      chunkSplitter,
      qaPrompt,
      rawTextLength,
      hashRawText,
      tags: collectionTags,
      metadata,

      createTime,
      updateTime,
      nextSyncTime,

      ...(fileId ? { fileId } : {}),
      ...(rawLink ? { rawLink } : {}),
      ...(externalFileId ? { externalFileId } : {}),
      ...(externalFileUrl ? { externalFileUrl } : {}),
      ...(apiFileId ? { apiFileId } : {}),

      rawTextLength,
      hashRawText,
      tags: collectionTags,
      // Parse settings
      customPdfParse,
      imageIndex,

      createTime,
      updateTime,
      nextSyncTime
      // Chunk settings
      trainingType,
      autoIndexes,
      chunkSize,
      chunkSplitter,
      qaPrompt
    }
  ],
  { session, ordered: true }
@@ -1,7 +1,10 @@
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { TrainingTypeMap, DatasetCollectionTypeMap } from '@fastgpt/global/core/dataset/constants';
import {
  DatasetCollectionTypeMap,
  DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionName } from '../schema';
import {
  TeamCollectionName,
@@ -31,6 +34,8 @@ const DatasetCollectionSchema = new Schema({
    ref: DatasetCollectionName,
    required: true
  },

  // Basic info
  type: {
    type: String,
    enum: Object.keys(DatasetCollectionTypeMap),
@@ -40,6 +45,11 @@ const DatasetCollectionSchema = new Schema({
    type: String,
    required: true
  },
  tags: {
    type: [String],
    default: []
  },

  createTime: {
    type: Date,
    default: () => new Date()
@@ -48,33 +58,8 @@ const DatasetCollectionSchema = new Schema({
    type: Date,
    default: () => new Date()
  },
  forbid: {
    type: Boolean,
    default: false
  },

  // chunk filed
  trainingType: {
    type: String,
    enum: Object.keys(TrainingTypeMap)
  },
  chunkSize: {
    type: Number,
    required: true
  },
  chunkSplitter: {
    type: String
  },
  qaPrompt: {
    type: String
  },
  ocrParse: Boolean,

  tags: {
    type: [String],
    default: []
  },

  // Metadata
  // local file collection
  fileId: {
    type: Schema.Types.ObjectId,
@@ -82,22 +67,39 @@ const DatasetCollectionSchema = new Schema({
  },
  // web link collection
  rawLink: String,
  // api collection
  // Api collection
  apiFileId: String,
  // external collection
  // external collection(Abandoned)
  externalFileId: String,
  externalFileUrl: String, // external import url

  // next sync time
  nextSyncTime: Date,

  // metadata
  rawTextLength: Number,
  hashRawText: String,
  metadata: {
    type: Object,
    default: {}
  }
},

  forbid: Boolean,
  // next sync time
  nextSyncTime: Date,

  // Parse settings
  customPdfParse: Boolean,

  // Chunk settings
  imageIndex: Boolean,
  autoIndexes: Boolean,
  trainingType: {
    type: String,
    enum: Object.values(DatasetCollectionDataProcessModeEnum)
  },
  chunkSize: {
    type: Number,
    required: true
  },
  chunkSplitter: String,
  qaPrompt: String
});

DatasetCollectionSchema.virtual('dataset', {
@@ -2,12 +2,17 @@ import { MongoDatasetCollection } from './schema';
import { ClientSession } from '../../../common/mongo';
import { MongoDatasetCollectionTags } from '../tag/schema';
import { readFromSecondary } from '../../../common/mongo/utils';
import { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type';
import {
  CollectionWithDatasetType,
  DatasetCollectionSchemaType
} from '@fastgpt/global/core/dataset/type';
import {
  DatasetCollectionDataProcessModeEnum,
  DatasetCollectionSyncResultEnum,
  DatasetCollectionTypeEnum,
  DatasetSourceReadTypeEnum,
  DatasetTypeEnum
  DatasetTypeEnum,
  TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants';
import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset';
import { readDatasetSourceRawText } from '../read';
@@ -160,6 +165,7 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {
  })();
  const rawText = await readDatasetSourceRawText({
    teamId: collection.teamId,
    tmbId: collection.tmbId,
    ...sourceReadType
  });

@@ -220,3 +226,24 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => {

  return DatasetCollectionSyncResultEnum.success;
};

/*
  QA: runs as a separate process
  Chunk: Image Index -> Auto index -> chunk index
*/
export const getTrainingModeByCollection = (collection: {
  trainingType: DatasetCollectionSchemaType['trainingType'];
  autoIndexes?: DatasetCollectionSchemaType['autoIndexes'];
  imageIndex?: DatasetCollectionSchemaType['imageIndex'];
}) => {
  if (collection.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    return TrainingModeEnum.qa;
  }
  if (collection.imageIndex && global.feConfigs?.isPlus) {
    return TrainingModeEnum.image;
  }
  if (collection.autoIndexes && global.feConfigs?.isPlus) {
    return TrainingModeEnum.auto;
  }
  return TrainingModeEnum.chunk;
};
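A small illustration of the mode resolution above. The collection object is hypothetical; the precedence (qa, then image, then auto, then chunk) comes directly from the function body:

import { getTrainingModeByCollection } from './utils'; // path as used elsewhere in this commit
import { DatasetCollectionDataProcessModeEnum } from '@fastgpt/global/core/dataset/constants';

// Hypothetical chunk-mode collection with image indexing enabled on a plus deployment
const mode = getTrainingModeByCollection({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  autoIndexes: true,
  imageIndex: true
});
// imageIndex takes precedence over autoIndexes, so this resolves to TrainingModeEnum.image;
// it falls back to TrainingModeEnum.chunk when global.feConfigs.isPlus is not set.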
@@ -7,6 +7,7 @@ import {
} from '@fastgpt/global/support/user/team/constant';
import { DatasetCollectionName } from '../schema';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';

export const DatasetDataCollectionName = 'dataset_datas';

@@ -42,10 +43,16 @@ const DatasetDataSchema = new Schema({
  indexes: {
    type: [
      {
        // Abandon
        defaultIndex: {
          type: Boolean,
          default: false
        },
        type: {
          type: String,
          enum: Object.values(DatasetDataIndexTypeEnum),
          default: DatasetDataIndexTypeEnum.custom
        },
        dataId: {
          type: String,
          required: true
@@ -13,11 +13,15 @@ import { POST } from '../../common/api/plusRequest';

export const readFileRawTextByUrl = async ({
  teamId,
  tmbId,
  url,
  customPdfParse,
  relatedId
}: {
  teamId: string;
  tmbId: string;
  url: string;
  customPdfParse?: boolean;
  relatedId: string; // externalFileId / apiFileId
}) => {
  const response = await axios({
@@ -30,8 +34,11 @@ export const readFileRawTextByUrl = async ({
  const buffer = Buffer.from(response.data, 'binary');

  const { rawText } = await readRawContentByFileBuffer({
    customPdfParse,
    isQAImport: false,
    extension,
    teamId,
    tmbId,
    buffer,
    encoding: 'utf-8',
    metadata: {
@@ -49,6 +56,7 @@ export const readFileRawTextByUrl = async ({
*/
export const readDatasetSourceRawText = async ({
  teamId,
  tmbId,
  type,
  sourceId,
  isQAImport,
@@ -56,11 +64,14 @@ export const readDatasetSourceRawText = async ({
  externalFileId,
  apiServer,
  feishuServer,
  yuqueServer
  yuqueServer,
  customPdfParse
}: {
  teamId: string;
  tmbId: string;
  type: DatasetSourceReadTypeEnum;
  sourceId: string;
  customPdfParse?: boolean;

  isQAImport?: boolean; // csv data
  selector?: string; // link selector
@@ -72,9 +83,11 @@ export const readDatasetSourceRawText = async ({
  if (type === DatasetSourceReadTypeEnum.fileLocal) {
    const { rawText } = await readFileContentFromMongo({
      teamId,
      tmbId,
      bucketName: BucketNameEnum.dataset,
      fileId: sourceId,
      isQAImport
      isQAImport,
      customPdfParse
    });
    return rawText;
  } else if (type === DatasetSourceReadTypeEnum.link) {
@@ -88,8 +101,10 @@ export const readDatasetSourceRawText = async ({
    if (!externalFileId) return Promise.reject('FileId not found');
    const rawText = await readFileRawTextByUrl({
      teamId,
      tmbId,
      url: sourceId,
      relatedId: externalFileId
      relatedId: externalFileId,
      customPdfParse
    });
    return rawText;
  } else if (type === DatasetSourceReadTypeEnum.apiFile) {
@@ -98,7 +113,8 @@ export const readDatasetSourceRawText = async ({
      feishuServer,
      yuqueServer,
      apiFileId: sourceId,
      teamId
      teamId,
      tmbId
    });
    return rawText;
  }
@@ -110,16 +126,18 @@ export const readApiServerFileContent = async ({
  feishuServer,
  yuqueServer,
  apiFileId,
  teamId
  teamId,
  tmbId
}: {
  apiServer?: APIFileServer;
  feishuServer?: FeishuServer;
  yuqueServer?: YuqueServer;
  apiFileId: string;
  teamId: string;
  tmbId: string;
}) => {
  if (apiServer) {
    return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, apiFileId });
    return useApiDatasetRequest({ apiServer }).getFileContent({ teamId, tmbId, apiFileId });
  }

  if (feishuServer || yuqueServer) {
@@ -67,6 +67,7 @@ const DatasetSchema = new Schema({
    required: true,
    default: 'gpt-4o-mini'
  },
  vlmModel: String,
  intro: {
    type: String,
    default: ''
@@ -1,16 +1,16 @@
import { MongoDatasetTraining } from './schema';
import type {
  PushDatasetDataChunkProps,
  PushDatasetDataProps,
  PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { ClientSession } from '../../../common/mongo';
import { getLLMModel, getEmbeddingModel } from '../../ai/model';
import { getLLMModel, getEmbeddingModel, getVlmModel } from '../../ai/model';
import { addLog } from '../../../common/system/log';
import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';

export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
  try {
@@ -28,20 +28,17 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
export const pushDataListToTrainingQueueByCollectionId = async ({
  collectionId,
  ...props
}: {
  teamId: string;
  tmbId: string;
  session?: ClientSession;
} & PushDatasetDataProps) => {
}: Omit<PushDataToTrainingQueueProps, 'datasetId' | 'agentModel' | 'vectorModel' | 'vlmModel'>) => {
  const {
    dataset: { _id: datasetId, agentModel, vectorModel }
    dataset: { _id: datasetId, agentModel, vectorModel, vlmModel }
  } = await getCollectionWithDataset(collectionId);
  return pushDataListToTrainingQueue({
    ...props,
    datasetId,
    collectionId,
    vectorModel,
    agentModel,
    vectorModel
    vlmModel
  });
};

@@ -52,30 +49,30 @@ export async function pushDataListToTrainingQueue({
  collectionId,
  agentModel,
  vectorModel,
  vlmModel,
  data,
  prompt,
  billId,
  trainingMode = TrainingModeEnum.chunk,
  mode = TrainingModeEnum.chunk,
  session
}: {
  teamId: string;
  tmbId: string;
  datasetId: string;
  agentModel: string;
  vectorModel: string;
  session?: ClientSession;
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
  const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
    if (mode !== TrainingModeEnum.image) return mode;
    // Check whether the content contains images in ![](...) markdown format
    const text = data.q + data.a || '';
    const regex = /!\[\]\((.*?)\)/g;
    const match = text.match(regex);
    if (match) {
      return TrainingModeEnum.image;
    }
    return mode;
  };
  const { model, maxToken, weight } = await (async () => {
    const agentModelData = getLLMModel(agentModel);
    if (!agentModelData) {
      return Promise.reject(`File model ${agentModel} is inValid`);
    }
    const vectorModelData = getEmbeddingModel(vectorModel);
    if (!vectorModelData) {
      return Promise.reject(`Vector model ${vectorModel} is inValid`);
    }

    if (trainingMode === TrainingModeEnum.chunk) {
    if (mode === TrainingModeEnum.chunk) {
      const vectorModelData = getEmbeddingModel(vectorModel);
      if (!vectorModelData) {
        return Promise.reject(`Vector model ${vectorModel} is inValid`);
      }
      return {
        maxToken: vectorModelData.maxToken * 1.5,
        model: vectorModelData.model,
@@ -83,7 +80,11 @@ export async function pushDataListToTrainingQueue({
      };
    }

    if (trainingMode === TrainingModeEnum.qa || trainingMode === TrainingModeEnum.auto) {
    if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
      const agentModelData = getLLMModel(agentModel);
      if (!agentModelData) {
        return Promise.reject(`File model ${agentModel} is inValid`);
      }
      return {
        maxToken: agentModelData.maxContext * 0.8,
        model: agentModelData.model,
@@ -91,8 +92,24 @@ export async function pushDataListToTrainingQueue({
      };
    }

    return Promise.reject(`Training mode "${trainingMode}" is inValid`);
    if (mode === TrainingModeEnum.image) {
      const vllmModelData = getVlmModel(vlmModel);
      if (!vllmModelData) {
        return Promise.reject(`Vlm model ${vlmModel} is inValid`);
      }
      return {
        maxToken: vllmModelData.maxContext * 0.8,
        model: vllmModelData.model,
        weight: 0
      };
    }

    return Promise.reject(`Training mode "${mode}" is inValid`);
  })();
  // Filter redundant params
  if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
    prompt = undefined;
  }

  // filter repeat or equal content
  const set = new Set();
@@ -158,7 +175,7 @@ export async function pushDataListToTrainingQueue({
      datasetId,
      collectionId,
      billId,
      mode: trainingMode,
      mode: getImageChunkMode(item, mode),
      prompt,
      model,
      q: item.q,
@@ -1,14 +1,15 @@
/* Model knowledge base (dataset) */
import { connectionMongo, getMongoModel, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { connectionMongo, getMongoModel } from '../../../common/mongo';
const { Schema } = connectionMongo;
import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
import { TrainingTypeMap } from '@fastgpt/global/core/dataset/constants';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { DatasetColCollectionName } from '../collection/schema';
import { DatasetCollectionName } from '../schema';
import {
  TeamCollectionName,
  TeamMemberCollectionName
} from '@fastgpt/global/support/user/team/constant';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';

export const DatasetTrainingCollectionName = 'dataset_trainings';

@@ -25,7 +26,6 @@ const TrainingDataSchema = new Schema({
  },
  datasetId: {
    type: Schema.Types.ObjectId,
    ref: DatasetCollectionName,
    required: true
  },
  collectionId: {
@@ -33,15 +33,13 @@ const TrainingDataSchema = new Schema({
    ref: DatasetColCollectionName,
    required: true
  },
  billId: {
    // concat bill
    type: String
  },
  billId: String,
  mode: {
    type: String,
    enum: Object.keys(TrainingTypeMap),
    enum: Object.values(TrainingModeEnum),
    required: true
  },

  expireAt: {
    // It will be deleted after 7 days
    type: Date,
@@ -88,6 +86,10 @@ const TrainingDataSchema = new Schema({
  indexes: {
    type: [
      {
        type: {
          type: String,
          enum: Object.values(DatasetDataIndexTypeEnum)
        },
        text: {
          type: String,
          required: true
@@ -98,6 +100,19 @@ const TrainingDataSchema = new Schema({
  }
});

TrainingDataSchema.virtual('dataset', {
  ref: DatasetCollectionName,
  localField: 'datasetId',
  foreignField: '_id',
  justOne: true
});
TrainingDataSchema.virtual('collection', {
  ref: DatasetColCollectionName,
  localField: 'collectionId',
  foreignField: '_id',
  justOne: true
});

try {
  // lock training data(teamId); delete training data
  TrainingDataSchema.index({ teamId: 1, datasetId: 1 });
@@ -1,6 +1,7 @@
import { NodeOutputKeyEnum } from '@fastgpt/global/core/workflow/constants';
import { DispatchNodeResponseKeyEnum } from '@fastgpt/global/core/workflow/runtime/constants';
import type {
  ChatDispatchProps,
  DispatchNodeResultType,
  RuntimeNodeItemType
} from '@fastgpt/global/core/workflow/runtime/type';
@@ -46,7 +47,7 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
    query,
    requestOrigin,
    chatConfig,
    runningAppInfo: { teamId },
    runningUserInfo,
    externalProvider,
    params: {
      model,
@@ -99,10 +100,10 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<

  const globalFiles = chatValue2RuntimePrompt(query).files;
  const { documentQuoteText, userFiles } = await getMultiInput({
    runningUserInfo,
    histories: chatHistories,
    requestOrigin,
    maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20,
    teamId,
    fileLinks,
    inputFiles: globalFiles,
    hasReadFilesTool
@@ -289,19 +290,19 @@ export const dispatchRunTools = async (props: DispatchToolModuleProps): Promise<
};

const getMultiInput = async ({
  runningUserInfo,
  histories,
  fileLinks,
  requestOrigin,
  maxFiles,
  teamId,
  inputFiles,
  hasReadFilesTool
}: {
  runningUserInfo: ChatDispatchProps['runningUserInfo'];
  histories: ChatItemType[];
  fileLinks?: string[];
  requestOrigin?: string;
  maxFiles: number;
  teamId: string;
  inputFiles: UserChatItemValueItemType['file'][];
  hasReadFilesTool: boolean;
}) => {
@@ -329,7 +330,8 @@ const getMultiInput = async ({
    urls,
    requestOrigin,
    maxFiles,
    teamId
    teamId: runningUserInfo.teamId,
    tmbId: runningUserInfo.tmbId
  });

  return {
@@ -11,7 +11,10 @@ import { formatModelChars2Points } from '../../../../support/wallet/usage/utils'
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import { postTextCensor } from '../../../../common/api/requestPlusApi';
import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
import type { DispatchNodeResultType } from '@fastgpt/global/core/workflow/runtime/type';
import type {
  ChatDispatchProps,
  DispatchNodeResultType
} from '@fastgpt/global/core/workflow/runtime/type';
import { countGptMessagesTokens } from '../../../../common/string/tiktoken/index';
import {
  chats2GPTMessages,
@@ -69,7 +72,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
    histories,
    node: { name },
    query,
    runningAppInfo: { teamId },
    runningUserInfo,
    workflowStreamResponse,
    chatConfig,
    params: {
@@ -121,7 +124,7 @@ export const dispatchChatCompletion = async (props: ChatProps): Promise<ChatResp
      stringQuoteText,
      requestOrigin,
      maxFiles: chatConfig?.fileSelectConfig?.maxFiles || 20,
      teamId
      runningUserInfo
    })
  ]);

@@ -355,7 +358,7 @@ async function getMultiInput({
  stringQuoteText,
  requestOrigin,
  maxFiles,
  teamId
  runningUserInfo
}: {
  histories: ChatItemType[];
  inputFiles: UserChatItemValueItemType['file'][];
@@ -363,7 +366,7 @@ async function getMultiInput({
  stringQuoteText?: string; // file quote
  requestOrigin?: string;
  maxFiles: number;
  teamId: string;
  runningUserInfo: ChatDispatchProps['runningUserInfo'];
}) {
  // Legacy version adaptation ====>
  if (stringQuoteText) {
@@ -400,7 +403,8 @@ async function getMultiInput({
    urls,
    requestOrigin,
    maxFiles,
    teamId
    teamId: runningUserInfo.teamId,
    tmbId: runningUserInfo.tmbId
  });

  return {
@@ -45,7 +45,7 @@ ${content.slice(0, 100)}${content.length > 100 ? '......' : ''}
export const dispatchReadFiles = async (props: Props): Promise<Response> => {
  const {
    requestOrigin,
    runningAppInfo: { teamId },
    runningUserInfo: { teamId, tmbId },
    histories,
    chatConfig,
    node: { version },
@@ -61,7 +61,8 @@ export const dispatchReadFiles = async (props: Props): Promise<Response> => {
    urls: [...fileUrlList, ...filesFromHistories],
    requestOrigin,
    maxFiles,
    teamId
    teamId,
    tmbId
  });

  return {
@@ -105,12 +106,14 @@ export const getFileContentFromLinks = async ({
  urls,
  requestOrigin,
  maxFiles,
  teamId
  teamId,
  tmbId
}: {
  urls: string[];
  requestOrigin?: string;
  maxFiles: number;
  teamId: string;
  tmbId: string;
}) => {
  const parseUrlList = urls
    // Remove invalid urls
@@ -205,6 +208,7 @@ export const getFileContentFromLinks = async ({
      extension,
      isQAImport: false,
      teamId,
      tmbId,
      buffer,
      encoding
    });
@@ -117,14 +117,16 @@ export const createTrainingUsage = async ({
  billSource,
  vectorModel,
  agentModel,
  vllmModel,
  session
}: {
  teamId: string;
  tmbId: string;
  appName: string;
  billSource: UsageSourceEnum;
  vectorModel: string;
  agentModel: string;
  vectorModel?: string;
  agentModel?: string;
  vllmModel?: string;
  session?: ClientSession;
}) => {
  const [{ _id }] = await MongoUsage.create(
@@ -136,27 +138,46 @@ export const createTrainingUsage = async ({
        source: billSource,
        totalPoints: 0,
        list: [
          {
            moduleName: i18nT('common:support.wallet.moduleName.index'),
            model: vectorModel,
            amount: 0,
            inputTokens: 0,
            outputTokens: 0
          },
          {
            moduleName: i18nT('common:support.wallet.moduleName.qa'),
            model: agentModel,
            amount: 0,
            inputTokens: 0,
            outputTokens: 0
          },
          {
            moduleName: i18nT('common:core.dataset.training.Auto mode'),
            model: agentModel,
            amount: 0,
            inputTokens: 0,
            outputTokens: 0
          }
          ...(vectorModel
            ? [
                {
                  moduleName: i18nT('account_usage:embedding_index'),
                  model: vectorModel,
                  amount: 0,
                  inputTokens: 0,
                  outputTokens: 0
                }
              ]
            : []),
          ...(agentModel
            ? [
                {
                  moduleName: i18nT('account_usage:qa'),
                  model: agentModel,
                  amount: 0,
                  inputTokens: 0,
                  outputTokens: 0
                },
                {
                  moduleName: i18nT('account_usage:auto_index'),
                  model: agentModel,
                  amount: 0,
                  inputTokens: 0,
                  outputTokens: 0
                }
              ]
            : []),
          ...(vllmModel
            ? [
                {
                  moduleName: i18nT('account_usage:image_parse'),
                  model: vllmModel,
                  amount: 0,
                  inputTokens: 0,
                  outputTokens: 0
                }
              ]
            : [])
        ]
      }
    ],
@@ -165,3 +186,31 @@ export const createTrainingUsage = async ({

  return { billId: String(_id) };
};

export const createPdfParseUsage = async ({
  teamId,
  tmbId,
  pages
}: {
  teamId: string;
  tmbId: string;
  pages: number;
}) => {
  const unitPrice = global.systemEnv?.customPdfParse?.price || 0;
  const totalPoints = pages * unitPrice;

  createUsage({
    teamId,
    tmbId,
    appName: i18nT('account_usage:pdf_enhanced_parse'),
    totalPoints,
    source: UsageSourceEnum.pdfParse,
    list: [
      {
        moduleName: i18nT('account_usage:pdf_enhanced_parse'),
        amount: totalPoints,
        pages
      }
    ]
  });
};
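A quick worked example of the billing above — the price value is illustrative; the formula (totalPoints = pages × customPdfParse.price) comes from the function body:

// Hypothetical: a 30-page PDF with customPdfParse.price set to 0.5 points per page
const pages = 30;
const unitPrice = 0.5; // global.systemEnv.customPdfParse.price
const totalPoints = pages * unitPrice; // 15 points recorded via createUsage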
@@ -9,7 +9,7 @@ import { readXlsxRawText } from './extension/xlsx';
import { readCsvRawText } from './extension/csv';

parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
  const readRawContentByFileBuffer = async (params: ReadRawTextByBuffer) => {
  const read = async (params: ReadRawTextByBuffer) => {
    switch (params.extension) {
      case 'txt':
      case 'md':
@@ -41,7 +41,7 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
  try {
    parentPort?.postMessage({
      type: 'success',
      data: await readRawContentByFileBuffer(newProps)
      data: await read(newProps)
    });
  } catch (error) {
    console.log(error);