Add image index and pdf parse (#3956)

* feat: think tag parse

* feat: parse think tag test

* feat: pdf parse ux

* feat: doc2x parse

* perf: rewrite training mode setting

* feat: image parse queue

* perf: image index

* feat: image parse process

* feat: add init sh

* fix: ts
This commit is contained in:
Archer
2025-03-03 23:08:29 +08:00
committed by archer
parent 08b6f594df
commit adf5377ebe
106 changed files with 2337 additions and 1454 deletions

View File

@@ -186,20 +186,25 @@ export async function getDownloadStream({
export const readFileContentFromMongo = async ({
teamId,
tmbId,
bucketName,
fileId,
-isQAImport = false
+isQAImport = false,
+customPdfParse = false
}: {
teamId: string;
tmbId: string;
bucketName: `${BucketNameEnum}`;
fileId: string;
isQAImport?: boolean;
customPdfParse?: boolean;
}): Promise<{
rawText: string;
filename: string;
}> => {
const bufferId = `${fileId}-${customPdfParse}`;
// read buffer
-const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: fileId }, undefined, {
+const fileBuffer = await MongoRawTextBuffer.findOne({ sourceId: bufferId }, undefined, {
...readFromSecondary
}).lean();
if (fileBuffer) {
@@ -227,9 +232,11 @@ export const readFileContentFromMongo = async ({
// Get raw text
const { rawText } = await readRawContentByFileBuffer({
customPdfParse,
extension,
isQAImport,
teamId,
tmbId,
buffer: fileBuffers,
encoding,
metadata: {
@@ -240,7 +247,7 @@ export const readFileContentFromMongo = async ({
// < 14M
if (fileBuffers.length < 14 * 1024 * 1024 && rawText.trim()) {
MongoRawTextBuffer.create({
-sourceId: fileId,
+sourceId: bufferId,
rawText,
metadata: {
filename: file.filename