Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -1,31 +0,0 @@
/**
 * Run a synchronous function, retrying on throw.
 * Makes up to `retry + 1` total attempts; rethrows the last error when all fail.
 */
export const retryRun = <T>(fn: () => T, retry = 2): T => {
  let remaining = retry;
  for (;;) {
    try {
      return fn();
    } catch (error) {
      // Out of attempts: surface the error from the final try.
      if (remaining <= 0) throw error;
      remaining--;
    }
  }
};
/**
 * Run `fn` over every element of `arr` with at most `batchSize` concurrent
 * executions. NOTE: consumes (mutates) `arr` via shift(), matching the
 * original contract. Results are collected in completion order.
 *
 * Bug fix: the previous `if (data)` check made each worker stop at the first
 * falsy item (0, '', false, null), silently dropping it and any work that
 * worker would have done. We now test queue length instead of item truthiness.
 */
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
  const batchArr = new Array(batchSize).fill(null);
  const result: any[] = [];

  const batchFn = async () => {
    // Safe: length is checked synchronously and shift() happens before any
    // await, so concurrent workers never double-consume an item.
    while (arr.length > 0) {
      const data = arr.shift() as T;
      result.push(await fn(data));
    }
  };

  await Promise.all(
    batchArr.map(async () => {
      await batchFn();
    })
  );
  return result;
};

View File

@@ -1,4 +1,4 @@
import { batchRun } from '../fn/utils';
import { batchRun } from '../system/utils';
import { getNanoid, simpleText } from './tools';
import type { ImageType } from '../../../service/worker/readFile/type';
@@ -37,6 +37,80 @@ export const simpleMarkdownText = (rawText: string) => {
return rawText.trim();
};
/**
 * Convert every <table>...</table> section inside `content` to a GFM markdown
 * table. Handles colspan/rowspan by writing the cell text into the anchor
 * position and marking spanned positions with the placeholder '^^' (rendered
 * as a blank markdown cell). On any parse error the original HTML table is
 * returned untouched.
 *
 * Fixes over the previous version:
 * - rows with attributes (e.g. <tr align="center">) are now matched;
 * - header cells written as <th> are now recognized alongside <td>;
 * - parseInt is given an explicit radix;
 * - cell-occupancy checks use `!== undefined` so empty-string cells are not
 *   treated as vacant and overwritten;
 * - the inner cell variable no longer shadows the `content` parameter.
 */
export const htmlTable2Md = (content: string): string => {
  return content.replace(/<table>[\s\S]*?<\/table>/g, (htmlTable) => {
    try {
      // Collapse newlines + indentation so the row/cell regexes see one line.
      const cleanHtml = htmlTable.replace(/\n\s*/g, '');
      // Allow attributes on <tr>.
      const rows = cleanHtml.match(/<tr[^>]*>(.*?)<\/tr>/g);
      if (!rows) return htmlTable;

      // tableData[row][col]: cell text, '^^' for a spanned position, ' ' pad.
      const tableData: string[][] = [];
      let maxColumns = 0;

      rows.forEach((row, rowIndex) => {
        if (!tableData[rowIndex]) {
          tableData[rowIndex] = [];
        }
        let colIndex = 0;
        // Accept both <td> and <th> cells.
        const cells = row.match(/<t[dh].*?>(.*?)<\/t[dh]>/g) || [];

        cells.forEach((cell) => {
          // Skip columns already claimed by an earlier rowspan/colspan.
          while (tableData[rowIndex][colIndex] !== undefined) {
            colIndex++;
          }

          const colspan = parseInt(cell.match(/colspan="(\d+)"/)?.[1] || '1', 10);
          const rowspan = parseInt(cell.match(/rowspan="(\d+)"/)?.[1] || '1', 10);
          const cellText = cell.replace(/<t[dh].*?>|<\/t[dh]>/g, '').trim();

          // Anchor cell gets the text; the rest of the span gets '^^'.
          for (let i = 0; i < rowspan; i++) {
            for (let j = 0; j < colspan; j++) {
              if (!tableData[rowIndex + i]) {
                tableData[rowIndex + i] = [];
              }
              tableData[rowIndex + i][colIndex + j] = i === 0 && j === 0 ? cellText : '^^';
            }
          }

          colIndex += colspan;
          maxColumns = Math.max(maxColumns, colIndex);
        });

        // Pad unclaimed columns so the row is rectangular so far.
        for (let i = 0; i < maxColumns; i++) {
          if (tableData[rowIndex][i] === undefined) {
            tableData[rowIndex][i] = ' ';
          }
        }
      });

      const chunks: string[] = [];

      // First row becomes the markdown header; '^^'/empty render as blanks.
      const headerCells = tableData[0]
        .slice(0, maxColumns)
        .map((cell) => (cell === '^^' ? ' ' : cell || ' '));
      chunks.push('| ' + headerCells.join(' | ') + ' |');
      chunks.push('| ' + Array(headerCells.length).fill('---').join(' | ') + ' |');

      tableData.slice(1).forEach((row) => {
        const paddedRow = row
          .slice(0, maxColumns)
          .map((cell) => (cell === '^^' ? ' ' : cell || ' '));
        while (paddedRow.length < maxColumns) {
          paddedRow.push(' ');
        }
        chunks.push('| ' + paddedRow.join(' | ') + ' |');
      });

      return chunks.join('\n');
    } catch (error) {
      // Best-effort conversion: keep the original HTML on failure.
      return htmlTable;
    }
  });
};
/**
* format markdown
* 1. upload base64

View File

@@ -43,10 +43,14 @@ export type FastGPTConfigFileType = {
export type FastGPTFeConfigsType = {
show_workorder?: boolean;
show_emptyChat?: boolean;
isPlus?: boolean;
register_method?: ['email' | 'phone' | 'sync'];
login_method?: ['email' | 'phone']; // Attention: login method is different from oauth
find_password_method?: ['email' | 'phone'];
bind_notification_method?: ['email' | 'phone'];
googleClientVerKey?: string;
show_emptyChat?: boolean;
show_appStore?: boolean;
show_git?: boolean;
show_pay?: boolean;
@@ -57,15 +61,19 @@ export type FastGPTFeConfigsType = {
show_aiproxy?: boolean;
concatMd?: string;
concatMd?: string;
docUrl?: string;
openAPIDocUrl?: string;
systemPluginCourseUrl?: string;
appTemplateCourse?: string;
customApiDomain?: string;
customSharePageDomain?: string;
systemTitle?: string;
systemDescription?: string;
googleClientVerKey?: string;
isPlus?: boolean;
scripts?: { [key: string]: string }[];
favicon?: string;
sso?: {
icon?: string;
title?: string;
@@ -91,13 +99,14 @@ export type FastGPTFeConfigsType = {
exportDatasetLimitMinutes?: number;
websiteSyncLimitMinuted?: number;
};
scripts?: { [key: string]: string }[];
favicon?: string;
customApiDomain?: string;
customSharePageDomain?: string;
uploadFileMaxAmount?: number;
uploadFileMaxSize?: number;
// Compute by systemEnv.customPdfParse
showCustomPdfParse?: boolean;
customPdfParsePrice?: number;
lafEnv?: string;
navbarItems?: NavbarItemType[];
externalProviderWorkflowVariables?: ExternalProviderWorkflowVarType[];
@@ -107,9 +116,18 @@ export type SystemEnvType = {
openapiPrefix?: string;
vectorMaxProcess: number;
qaMaxProcess: number;
vlmMaxProcess: number;
pgHNSWEfSearch: number;
tokenWorkers: number; // token count max worker
oneapiUrl?: string;
chatApiKey?: string;
customPdfParse?: {
url?: string;
key?: string;
doc2xKey?: string;
price?: number; // n points/1 page
};
};

View File

@@ -16,3 +16,24 @@ export const retryFn = async <T>(fn: () => Promise<T>, retryTimes = 3): Promise<
return Promise.reject(error);
}
};
/**
 * Run `fn` over every element of `arr` with at most `batchSize` concurrent
 * executions. NOTE: consumes (mutates) `arr` via shift(), matching the
 * original contract. Results are collected in completion order.
 *
 * Bug fix: the previous `if (data)` check made each worker stop at the first
 * falsy item (0, '', false, null), silently dropping it and any work that
 * worker would have done. We now test queue length instead of item truthiness.
 */
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
  const batchArr = new Array(batchSize).fill(null);
  const result: any[] = [];

  const batchFn = async () => {
    // Safe: length is checked synchronously and shift() happens before any
    // await, so concurrent workers never double-consume an item.
    while (arr.length > 0) {
      const data = arr.shift() as T;
      result.push(await fn(data));
    }
  };

  await Promise.all(
    batchArr.map(async () => {
      await batchFn();
    })
  );
  return result;
};

View File

@@ -22,7 +22,7 @@ export const defaultQAModels: LLMModelItemType[] = [
maxTemperature: 1.2,
charsPointsPrice: 0,
censor: false,
vision: false,
vision: true,
datasetProcess: true,
toolChoice: true,
functionCall: false,
@@ -59,10 +59,17 @@ export const defaultSTTModels: STTModelType[] = [
export const getModelFromList = (
modelList: { provider: ModelProviderIdType; name: string; model: string }[],
model: string
) => {
):
| {
avatar: string;
provider: ModelProviderIdType;
name: string;
model: string;
}
| undefined => {
const modelData = modelList.find((item) => item.model === model) ?? modelList[0];
if (!modelData) {
throw new Error('No Key model is configured');
return;
}
const provider = getModelProvider(modelData.provider);
return {

View File

@@ -1,5 +1,5 @@
import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
import type { LLMModelItemType } from '../ai/model.d';
import { ParentIdType } from 'common/parentFolder/type';
@@ -10,9 +10,11 @@ export type DatasetUpdateBody = {
name?: string;
avatar?: string;
intro?: string;
agentModel?: LLMModelItemType;
status?: DatasetSchemaType['status'];
agentModel?: string;
vlmModel?: string;
websiteConfig?: DatasetSchemaType['websiteConfig'];
externalReadUrl?: DatasetSchemaType['externalReadUrl'];
defaultPermission?: DatasetSchemaType['defaultPermission'];
@@ -27,7 +29,10 @@ export type DatasetUpdateBody = {
/* ================= collection ===================== */
export type DatasetCollectionChunkMetadataType = {
parentId?: string;
trainingType?: TrainingModeEnum;
customPdfParse?: boolean;
trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean;
autoIndexes?: boolean;
chunkSize?: number;
chunkSplitter?: string;
qaPrompt?: string;
@@ -131,9 +136,15 @@ export type PostWebsiteSyncParams = {
export type PushDatasetDataProps = {
collectionId: string;
data: PushDatasetDataChunkProps[];
trainingMode: TrainingModeEnum;
trainingType?: DatasetCollectionDataProcessModeEnum;
autoIndexes?: boolean;
imageIndex?: boolean;
prompt?: string;
billId?: string;
// Deprecated: kept for backward compatibility; use trainingType instead
trainingMode?: DatasetCollectionDataProcessModeEnum;
};
export type PushDatasetDataResponse = {
insertLen: number;

View File

@@ -1,4 +1,4 @@
import { DatasetCollectionTypeEnum, TrainingModeEnum, TrainingTypeMap } from '../constants';
import { DatasetCollectionTypeEnum } from '../constants';
import { DatasetCollectionSchemaType } from '../type';
export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType) => {
@@ -16,9 +16,3 @@ export const getCollectionSourceData = (collection?: DatasetCollectionSchemaType
export const checkCollectionIsFolder = (type: DatasetCollectionTypeEnum) => {
return type === DatasetCollectionTypeEnum.folder || type === DatasetCollectionTypeEnum.virtual;
};
/** Resolve the i18n label for a training mode; '' when the mode is missing or unmapped. */
export const getTrainingTypeLabel = (type?: TrainingModeEnum) => {
  const entry = type ? TrainingTypeMap[type] : undefined;
  return entry ? entry.label : '';
};

View File

@@ -109,6 +109,26 @@ export const DatasetCollectionSyncResultMap = {
}
};
// How a dataset collection's raw text is processed into training data.
export enum DatasetCollectionDataProcessModeEnum {
  chunk = 'chunk', // split text into chunks
  qa = 'qa', // generate question/answer pairs
  auto = 'auto' // abandoned legacy mode — presumably kept so stored values still resolve; confirm before removal
}
// UI metadata (i18n label + tooltip keys) for each data-process mode.
export const DatasetCollectionDataProcessModeMap = {
  [DatasetCollectionDataProcessModeEnum.chunk]: {
    label: i18nT('common:core.dataset.training.Chunk mode'),
    tooltip: i18nT('common:core.dataset.import.Chunk Split Tip')
  },
  [DatasetCollectionDataProcessModeEnum.qa]: {
    label: i18nT('common:core.dataset.training.QA mode'),
    tooltip: i18nT('common:core.dataset.import.QA Import Tip')
  },
  // 'auto' is marked abandoned in the enum, but existing collections may
  // still carry it, so it keeps display data.
  [DatasetCollectionDataProcessModeEnum.auto]: {
    label: i18nT('common:core.dataset.training.Auto mode'),
    tooltip: i18nT('common:core.dataset.training.Auto mode Tip')
  }
};
/* ------------ data -------------- */
/* ------------ training -------------- */
@@ -124,28 +144,11 @@ export enum ImportDataSourceEnum {
export enum TrainingModeEnum {
chunk = 'chunk',
qa = 'qa',
auto = 'auto',
qa = 'qa'
image = 'image'
}
export const TrainingTypeMap = {
[TrainingModeEnum.chunk]: {
label: i18nT('common:core.dataset.training.Chunk mode'),
tooltip: i18nT('common:core.dataset.import.Chunk Split Tip'),
openSource: true
},
[TrainingModeEnum.auto]: {
label: i18nT('common:core.dataset.training.Auto mode'),
tooltip: i18nT('common:core.dataset.training.Auto mode Tip'),
openSource: false
},
[TrainingModeEnum.qa]: {
label: i18nT('common:core.dataset.training.QA mode'),
tooltip: i18nT('common:core.dataset.import.QA Import Tip'),
openSource: true
}
};
/* ------------ search -------------- */
export enum DatasetSearchModeEnum {
embedding = 'embedding',

View File

@@ -20,9 +20,22 @@ export type UpdateDatasetDataProps = {
})[];
};
export type PatchIndexesProps = {
type: 'create' | 'update' | 'delete' | 'unChange';
index: Omit<DatasetDataIndexItemType, 'dataId'> & {
dataId?: string;
};
};
// Discriminated union on `type`: only 'create' may omit `dataId` (the index
// does not exist in storage yet); 'update'/'delete'/'unChange' operate on an
// existing index and therefore require the full DatasetDataIndexItemType.
export type PatchIndexesProps =
  | {
      type: 'create';
      index: Omit<DatasetDataIndexItemType, 'dataId'> & {
        dataId?: string;
      };
    }
  | {
      type: 'update';
      index: DatasetDataIndexItemType;
    }
  | {
      type: 'delete';
      index: DatasetDataIndexItemType;
    }
  | {
      type: 'unChange';
      index: DatasetDataIndexItemType;
    };

View File

@@ -0,0 +1,42 @@
import { i18nT } from '../../../../web/i18n/utils';
// Origin/category of a dataset data index entry.
export enum DatasetDataIndexTypeEnum {
  default = 'default', // auto-built from the record's q/a text
  custom = 'custom', // user-supplied index text
  summary = 'summary', // presumably an LLM-generated summary index — confirm
  question = 'question', // presumably an LLM-generated question index — confirm
  image = 'image' // index derived from image parsing
}
// Display metadata (i18n label + badge color) for every index type.
// NOTE(review): `label` is typed `any` because i18nT's return type is not
// visible here — consider tightening it once the i18nT signature is known.
export const DatasetDataIndexMap: Record<
  `${DatasetDataIndexTypeEnum}`,
  {
    label: any;
    color: string;
  }
> = {
  [DatasetDataIndexTypeEnum.default]: {
    label: i18nT('dataset:data_index_default'),
    color: 'gray'
  },
  [DatasetDataIndexTypeEnum.custom]: {
    label: i18nT('dataset:data_index_custom'),
    color: 'blue'
  },
  [DatasetDataIndexTypeEnum.summary]: {
    label: i18nT('dataset:data_index_summary'),
    color: 'green'
  },
  [DatasetDataIndexTypeEnum.question]: {
    label: i18nT('dataset:data_index_question'),
    color: 'red'
  },
  [DatasetDataIndexTypeEnum.image]: {
    label: i18nT('dataset:data_index_image'),
    color: 'purple'
  }
};
// Fallback metadata used when an index type has no explicit map entry.
export const defaultDatasetIndexData = DatasetDataIndexMap[DatasetDataIndexTypeEnum.custom];

/** Look up display metadata for an index type, falling back to the default. */
export const getDatasetIndexMapData = (type: `${DatasetDataIndexTypeEnum}`) =>
  DatasetDataIndexMap[type] ?? defaultDatasetIndexData;

View File

@@ -0,0 +1,20 @@
import { PushDatasetDataChunkProps } from '../api';
import { TrainingModeEnum } from '../constants';
// Payload for pushing parsed dataset chunks onto the training queue.
export type PushDataToTrainingQueueProps = {
  teamId: string;
  tmbId: string; // team member id
  datasetId: string;
  collectionId: string;
  mode?: TrainingModeEnum; // training queue mode for the pushed chunks
  data: PushDatasetDataChunkProps[]; // the chunks to enqueue
  prompt?: string; // presumably a custom prompt for QA generation — confirm
  // Model identifiers (string ids, resolved server-side).
  agentModel: string;
  vectorModel: string;
  vlmModel?: string; // presumably the vision-language model for image indexing — confirm
  billId?: string; // usage/billing record to attribute this work to
  // NOTE(review): `ClientSession` is not imported anywhere in this file —
  // presumably mongoose's ClientSession; add the import or this won't resolve.
  session?: ClientSession;
};

View File

@@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
import { PermissionTypeEnum } from '../../support/permission/constant';
import { PushDatasetDataChunkProps } from './api';
import {
DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum,
DatasetStatusEnum,
DatasetTypeEnum,
@@ -12,6 +13,7 @@ import { DatasetPermission } from '../../support/permission/dataset/controller';
import { Permission } from '../../support/permission/controller';
import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import { SourceMemberType } from 'support/user/type';
import { DatasetDataIndexTypeEnum } from './data/constants';
export type DatasetSchemaType = {
_id: string;
@@ -23,11 +25,14 @@ export type DatasetSchemaType = {
avatar: string;
name: string;
vectorModel: string;
agentModel: string;
intro: string;
type: `${DatasetTypeEnum}`;
status: `${DatasetStatusEnum}`;
vectorModel: string;
agentModel: string;
vlmModel?: string;
websiteConfig?: {
url: string;
selector: string;
@@ -52,26 +57,22 @@ export type DatasetCollectionSchemaType = {
parentId?: string;
name: string;
type: DatasetCollectionTypeEnum;
createTime: Date;
updateTime: Date;
forbid?: boolean;
trainingType: TrainingModeEnum;
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
ocrParse?: boolean;
tags?: string[];
createTime: Date;
updateTime: Date;
// Status
forbid?: boolean;
nextSyncTime?: Date;
// Collection metadata
fileId?: string; // local file id
rawLink?: string; // link url
externalFileId?: string; //external file id
apiFileId?: string; // api file id
externalFileUrl?: string; // external import url
nextSyncTime?: Date;
rawTextLength?: number;
hashRawText?: string;
metadata?: {
@@ -80,6 +81,16 @@ export type DatasetCollectionSchemaType = {
[key: string]: any;
};
// Parse settings
customPdfParse?: boolean;
// Chunk settings
autoIndexes?: boolean;
imageIndex?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSize: number;
chunkSplitter?: string;
qaPrompt?: string;
};
export type DatasetCollectionTagsSchemaType = {
@@ -90,7 +101,7 @@ export type DatasetCollectionTagsSchemaType = {
};
export type DatasetDataIndexItemType = {
defaultIndex: boolean;
type: `${DatasetDataIndexTypeEnum}`;
dataId: string; // pg data id
text: string;
};
@@ -141,6 +152,7 @@ export type DatasetTrainingSchemaType = {
chunkIndex: number;
weight: number;
indexes: Omit<DatasetDataIndexItemType, 'dataId'>[];
retryCount: number;
};
export type CollectionWithDatasetType = DatasetCollectionSchemaType & {
@@ -169,9 +181,10 @@ export type DatasetListItemType = {
sourceMember?: SourceMemberType;
};
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel'> & {
export type DatasetItemType = Omit<DatasetSchemaType, 'vectorModel' | 'agentModel' | 'vlmModel'> & {
vectorModel: EmbeddingModelItemType;
agentModel: LLMModelItemType;
vlmModel?: LLMModelItemType;
permission: DatasetPermission;
};

View File

@@ -1,6 +1,7 @@
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools';
import { DatasetDataIndexTypeEnum } from './data/constants';
export function getCollectionIcon(
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
@@ -38,14 +39,23 @@ export function getSourceNameIcon({
}
/* get dataset data default index */
export function getDefaultIndex(props?: { q?: string; a?: string; dataId?: string }) {
const { q = '', a, dataId } = props || {};
const qaStr = `${q}\n${a}`.trim();
return {
defaultIndex: true,
text: a ? qaStr : q,
dataId
};
/**
 * Build the default index list for a data record: always one entry for `q`
 * (empty string when absent), plus a second entry when a truthy `a` exists.
 */
export function getDefaultIndex(props?: { q?: string; a?: string }) {
  const { q = '', a } = props || {};
  const indexes = [
    {
      text: q,
      type: DatasetDataIndexTypeEnum.default
    }
  ];
  if (a) {
    indexes.push({
      text: a,
      type: DatasetDataIndexTypeEnum.default
    });
  }
  return indexes;
}
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {

View File

@@ -10,7 +10,8 @@ export enum UsageSourceEnum {
wecom = 'wecom',
feishu = 'feishu',
dingtalk = 'dingtalk',
official_account = 'official_account'
official_account = 'official_account',
pdfParse = 'pdfParse'
}
export const UsageSourceMap = {
@@ -43,5 +44,8 @@ export const UsageSourceMap = {
},
[UsageSourceEnum.dingtalk]: {
label: i18nT('account_usage:dingtalk')
},
[UsageSourceEnum.pdfParse]: {
label: i18nT('account_usage:pdf_parse')
}
};

View File

@@ -7,6 +7,7 @@ export type UsageListItemCountType = {
outputTokens?: number;
charsLength?: number;
duration?: number;
pages?: number;
// deprecated
tokens?: number;