mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-22 20:37:48 +00:00
Add image index and pdf parse (#3956)
* feat: think tag parse * feat: parse think tag test * feat: pdf parse ux * feat: doc2x parse * perf: rewrite training mode setting * feat: image parse queue * perf: image index * feat: image parse process * feat: add init sh * fix: ts
This commit is contained in:
@@ -1,31 +0,0 @@
|
||||
export const retryRun = <T>(fn: () => T, retry = 2): T => {
|
||||
try {
|
||||
return fn();
|
||||
} catch (error) {
|
||||
if (retry > 0) {
|
||||
return retryRun(fn, retry - 1);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
};
|
||||
|
||||
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
|
||||
const batchArr = new Array(batchSize).fill(null);
|
||||
const result: any[] = [];
|
||||
|
||||
const batchFn = async () => {
|
||||
const data = arr.shift();
|
||||
if (data) {
|
||||
result.push(await fn(data));
|
||||
return batchFn();
|
||||
}
|
||||
};
|
||||
|
||||
await Promise.all(
|
||||
batchArr.map(async () => {
|
||||
await batchFn();
|
||||
})
|
||||
);
|
||||
|
||||
return result;
|
||||
};
|
@@ -1,4 +1,4 @@
|
||||
import { batchRun } from '../fn/utils';
|
||||
import { batchRun } from '../system/utils';
|
||||
import { getNanoid, simpleText } from './tools';
|
||||
import type { ImageType } from '../../../service/worker/readFile/type';
|
||||
|
||||
@@ -37,6 +37,80 @@ export const simpleMarkdownText = (rawText: string) => {
|
||||
return rawText.trim();
|
||||
};
|
||||
|
||||
export const htmlTable2Md = (content: string): string => {
|
||||
return content.replace(/<table>[\s\S]*?<\/table>/g, (htmlTable) => {
|
||||
try {
|
||||
// Clean up whitespace and newlines
|
||||
const cleanHtml = htmlTable.replace(/\n\s*/g, '');
|
||||
const rows = cleanHtml.match(/<tr>(.*?)<\/tr>/g);
|
||||
if (!rows) return htmlTable;
|
||||
|
||||
// Parse table data
|
||||
let tableData: string[][] = [];
|
||||
let maxColumns = 0;
|
||||
|
||||
// Try to convert to markdown table
|
||||
rows.forEach((row, rowIndex) => {
|
||||
if (!tableData[rowIndex]) {
|
||||
tableData[rowIndex] = [];
|
||||
}
|
||||
let colIndex = 0;
|
||||
const cells = row.match(/<td.*?>(.*?)<\/td>/g) || [];
|
||||
|
||||
cells.forEach((cell) => {
|
||||
while (tableData[rowIndex][colIndex]) {
|
||||
colIndex++;
|
||||
}
|
||||
const colspan = parseInt(cell.match(/colspan="(\d+)"/)?.[1] || '1');
|
||||
const rowspan = parseInt(cell.match(/rowspan="(\d+)"/)?.[1] || '1');
|
||||
const content = cell.replace(/<td.*?>|<\/td>/g, '').trim();
|
||||
|
||||
for (let i = 0; i < rowspan; i++) {
|
||||
for (let j = 0; j < colspan; j++) {
|
||||
if (!tableData[rowIndex + i]) {
|
||||
tableData[rowIndex + i] = [];
|
||||
}
|
||||
tableData[rowIndex + i][colIndex + j] = i === 0 && j === 0 ? content : '^^';
|
||||
}
|
||||
}
|
||||
colIndex += colspan;
|
||||
maxColumns = Math.max(maxColumns, colIndex);
|
||||
});
|
||||
|
||||
for (let i = 0; i < maxColumns; i++) {
|
||||
if (!tableData[rowIndex][i]) {
|
||||
tableData[rowIndex][i] = ' ';
|
||||
}
|
||||
}
|
||||
});
|
||||
const chunks: string[] = [];
|
||||
|
||||
const headerCells = tableData[0]
|
||||
.slice(0, maxColumns)
|
||||
.map((cell) => (cell === '^^' ? ' ' : cell || ' '));
|
||||
const headerRow = '| ' + headerCells.join(' | ') + ' |';
|
||||
chunks.push(headerRow);
|
||||
|
||||
const separator = '| ' + Array(headerCells.length).fill('---').join(' | ') + ' |';
|
||||
chunks.push(separator);
|
||||
|
||||
tableData.slice(1).forEach((row) => {
|
||||
const paddedRow = row
|
||||
.slice(0, maxColumns)
|
||||
.map((cell) => (cell === '^^' ? ' ' : cell || ' '));
|
||||
while (paddedRow.length < maxColumns) {
|
||||
paddedRow.push(' ');
|
||||
}
|
||||
chunks.push('| ' + paddedRow.join(' | ') + ' |');
|
||||
});
|
||||
|
||||
return chunks.join('\n');
|
||||
} catch (error) {
|
||||
return htmlTable;
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
/**
|
||||
* format markdown
|
||||
* 1. upload base64
|
||||
|
30
packages/global/common/system/types/index.d.ts
vendored
30
packages/global/common/system/types/index.d.ts
vendored
@@ -43,10 +43,14 @@ export type FastGPTConfigFileType = {
|
||||
export type FastGPTFeConfigsType = {
|
||||
show_workorder?: boolean;
|
||||
show_emptyChat?: boolean;
|
||||
isPlus?: boolean;
|
||||
register_method?: ['email' | 'phone' | 'sync'];
|
||||
login_method?: ['email' | 'phone']; // Attention: login method is diffrent with oauth
|
||||
find_password_method?: ['email' | 'phone'];
|
||||
bind_notification_method?: ['email' | 'phone'];
|
||||
googleClientVerKey?: string;
|
||||
|
||||
show_emptyChat?: boolean;
|
||||
show_appStore?: boolean;
|
||||
show_git?: boolean;
|
||||
show_pay?: boolean;
|
||||
@@ -57,15 +61,19 @@ export type FastGPTFeConfigsType = {
|
||||
show_aiproxy?: boolean;
|
||||
concatMd?: string;
|
||||
|
||||
concatMd?: string;
|
||||
docUrl?: string;
|
||||
openAPIDocUrl?: string;
|
||||
systemPluginCourseUrl?: string;
|
||||
appTemplateCourse?: string;
|
||||
customApiDomain?: string;
|
||||
customSharePageDomain?: string;
|
||||
|
||||
systemTitle?: string;
|
||||
systemDescription?: string;
|
||||
googleClientVerKey?: string;
|
||||
isPlus?: boolean;
|
||||
scripts?: { [key: string]: string }[];
|
||||
favicon?: string;
|
||||
|
||||
sso?: {
|
||||
icon?: string;
|
||||
title?: string;
|
||||
@@ -91,13 +99,14 @@ export type FastGPTFeConfigsType = {
|
||||
exportDatasetLimitMinutes?: number;
|
||||
websiteSyncLimitMinuted?: number;
|
||||
};
|
||||
scripts?: { [key: string]: string }[];
|
||||
favicon?: string;
|
||||
customApiDomain?: string;
|
||||
customSharePageDomain?: string;
|
||||
|
||||
uploadFileMaxAmount?: number;
|
||||
uploadFileMaxSize?: number;
|
||||
|
||||
// Compute by systemEnv.customPdfParse
|
||||
showCustomPdfParse?: boolean;
|
||||
customPdfParsePrice?: number;
|
||||
|
||||
lafEnv?: string;
|
||||
navbarItems?: NavbarItemType[];
|
||||
externalProviderWorkflowVariables?: ExternalProviderWorkflowVarType[];
|
||||
@@ -107,9 +116,18 @@ export type SystemEnvType = {
|
||||
openapiPrefix?: string;
|
||||
vectorMaxProcess: number;
|
||||
qaMaxProcess: number;
|
||||
vlmMaxProcess: number;
|
||||
pgHNSWEfSearch: number;
|
||||
tokenWorkers: number; // token count max worker
|
||||
|
||||
oneapiUrl?: string;
|
||||
chatApiKey?: string;
|
||||
|
||||
customPdfParse?: {
|
||||
url?: string;
|
||||
key?: string;
|
||||
|
||||
doc2xKey?: string;
|
||||
price?: number; // n points/1 page
|
||||
};
|
||||
};
|
||||
|
@@ -16,3 +16,24 @@ export const retryFn = async <T>(fn: () => Promise<T>, retryTimes = 3): Promise<
|
||||
return Promise.reject(error);
|
||||
}
|
||||
};
|
||||
|
||||
export const batchRun = async <T>(arr: T[], fn: (arr: T) => any, batchSize = 10) => {
|
||||
const batchArr = new Array(batchSize).fill(null);
|
||||
const result: any[] = [];
|
||||
|
||||
const batchFn = async () => {
|
||||
const data = arr.shift();
|
||||
if (data) {
|
||||
result.push(await fn(data));
|
||||
return batchFn();
|
||||
}
|
||||
};
|
||||
|
||||
await Promise.all(
|
||||
batchArr.map(async () => {
|
||||
await batchFn();
|
||||
})
|
||||
);
|
||||
|
||||
return result;
|
||||
};
|
||||
|
Reference in New Issue
Block a user