Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -1,31 +0,0 @@
export const retryRun = <T>(fn: () => T, retry = 2): T => {
try {
return fn();
} catch (error) {
if (retry > 0) {
return retryRun(fn, retry - 1);
}
throw error;
}
};
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
const batchArr = new Array(batchSize).fill(null);
const result: any[] = [];
const batchFn = async () => {
const data = arr.shift();
if (data) {
result.push(await fn(data));
return batchFn();
}
};
await Promise.all(
batchArr.map(async () => {
await batchFn();
})
);
return result;
};

View File

@@ -1,4 +1,4 @@
import { batchRun } from '../fn/utils';
import { batchRun } from '../system/utils';
import { getNanoid, simpleText } from './tools';
import type { ImageType } from '../../../service/worker/readFile/type';
@@ -37,6 +37,80 @@ export const simpleMarkdownText = (rawText: string) => {
return rawText.trim();
};
export const htmlTable2Md = (content: string): string => {
return content.replace(/<table>[\s\S]*?<\/table>/g, (htmlTable) => {
try {
// Clean up whitespace and newlines
const cleanHtml = htmlTable.replace(/\n\s*/g, '');
const rows = cleanHtml.match(/<tr>(.*?)<\/tr>/g);
if (!rows) return htmlTable;
// Parse table data
let tableData: string[][] = [];
let maxColumns = 0;
// Try to convert to markdown table
rows.forEach((row, rowIndex) => {
if (!tableData[rowIndex]) {
tableData[rowIndex] = [];
}
let colIndex = 0;
const cells = row.match(/<td.*?>(.*?)<\/td>/g) || [];
cells.forEach((cell) => {
while (tableData[rowIndex][colIndex]) {
colIndex++;
}
const colspan = parseInt(cell.match(/colspan="(\d+)"/)?.[1] || '1');
const rowspan = parseInt(cell.match(/rowspan="(\d+)"/)?.[1] || '1');
const content = cell.replace(/<td.*?>|<\/td>/g, '').trim();
for (let i = 0; i < rowspan; i++) {
for (let j = 0; j < colspan; j++) {
if (!tableData[rowIndex + i]) {
tableData[rowIndex + i] = [];
}
tableData[rowIndex + i][colIndex + j] = i === 0 && j === 0 ? content : '^^';
}
}
colIndex += colspan;
maxColumns = Math.max(maxColumns, colIndex);
});
for (let i = 0; i < maxColumns; i++) {
if (!tableData[rowIndex][i]) {
tableData[rowIndex][i] = ' ';
}
}
});
const chunks: string[] = [];
const headerCells = tableData[0]
.slice(0, maxColumns)
.map((cell) => (cell === '^^' ? ' ' : cell || ' '));
const headerRow = '| ' + headerCells.join(' | ') + ' |';
chunks.push(headerRow);
const separator = '| ' + Array(headerCells.length).fill('---').join(' | ') + ' |';
chunks.push(separator);
tableData.slice(1).forEach((row) => {
const paddedRow = row
.slice(0, maxColumns)
.map((cell) => (cell === '^^' ? ' ' : cell || ' '));
while (paddedRow.length < maxColumns) {
paddedRow.push(' ');
}
chunks.push('| ' + paddedRow.join(' | ') + ' |');
});
return chunks.join('\n');
} catch (error) {
return htmlTable;
}
});
};
/**
* format markdown
* 1. upload base64

View File

@@ -43,10 +43,14 @@ export type FastGPTConfigFileType = {
export type FastGPTFeConfigsType = {
show_workorder?: boolean;
show_emptyChat?: boolean;
isPlus?: boolean;
register_method?: ['email' | 'phone' | 'sync'];
login_method?: ['email' | 'phone']; // Attention: login method is diffrent with oauth
find_password_method?: ['email' | 'phone'];
bind_notification_method?: ['email' | 'phone'];
googleClientVerKey?: string;
show_emptyChat?: boolean;
show_appStore?: boolean;
show_git?: boolean;
show_pay?: boolean;
@@ -57,15 +61,19 @@ export type FastGPTFeConfigsType = {
show_aiproxy?: boolean;
concatMd?: string;
concatMd?: string;
docUrl?: string;
openAPIDocUrl?: string;
systemPluginCourseUrl?: string;
appTemplateCourse?: string;
customApiDomain?: string;
customSharePageDomain?: string;
systemTitle?: string;
systemDescription?: string;
googleClientVerKey?: string;
isPlus?: boolean;
scripts?: { [key: string]: string }[];
favicon?: string;
sso?: {
icon?: string;
title?: string;
@@ -91,13 +99,14 @@ export type FastGPTFeConfigsType = {
exportDatasetLimitMinutes?: number;
websiteSyncLimitMinuted?: number;
};
scripts?: { [key: string]: string }[];
favicon?: string;
customApiDomain?: string;
customSharePageDomain?: string;
uploadFileMaxAmount?: number;
uploadFileMaxSize?: number;
// Compute by systemEnv.customPdfParse
showCustomPdfParse?: boolean;
customPdfParsePrice?: number;
lafEnv?: string;
navbarItems?: NavbarItemType[];
externalProviderWorkflowVariables?: ExternalProviderWorkflowVarType[];
@@ -107,9 +116,18 @@ export type SystemEnvType = {
openapiPrefix?: string;
vectorMaxProcess: number;
qaMaxProcess: number;
vlmMaxProcess: number;
pgHNSWEfSearch: number;
tokenWorkers: number; // token count max worker
oneapiUrl?: string;
chatApiKey?: string;
customPdfParse?: {
url?: string;
key?: string;
doc2xKey?: string;
price?: number; // n points/1 page
};
};

View File

@@ -16,3 +16,24 @@ export const retryFn = async <T>(fn: () => Promise<T>, retryTimes = 3): Promise<
return Promise.reject(error);
}
};
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
const batchArr = new Array(batchSize).fill(null);
const result: any[] = [];
const batchFn = async () => {
const data = arr.shift();
if (data) {
result.push(await fn(data));
return batchFn();
}
};
await Promise.all(
batchArr.map(async () => {
await batchFn();
})
);
return result;
};