mirror of
https://github.com/labring/FastGPT.git
synced 2025-10-18 09:24:03 +00:00
perf: password special chars; feat: llm paragraph; perf: chunk setting params; perf: text splitter worker (#4984)
* perf: password special chars
* feat: llm paragraph; perf: chunk setting params
* perf: text splitter worker
* perf: get rawtext buffer
* fix: test
* fix: test
* doc
* min chunk size
This commit is contained in:
@@ -5,6 +5,8 @@ import { addLog } from '../../system/log';
|
||||
import { setCron } from '../../system/cron';
|
||||
import { checkTimerLock } from '../../system/timerLock/utils';
|
||||
import { TimerIdEnum } from '../../system/timerLock/constants';
|
||||
import { gridFsStream2Buffer } from '../../file/gridfs/utils';
|
||||
import { readRawContentFromBuffer } from '../../../worker/function';
|
||||
|
||||
const getGridBucket = () => {
|
||||
return new connectionMongo.mongo.GridFSBucket(connectionMongo.connection.db!, {
|
||||
@@ -85,30 +87,27 @@ export const getRawTextBuffer = async (sourceId: string) => {
|
||||
|
||||
// Read file content
|
||||
const downloadStream = gridBucket.openDownloadStream(bufferData._id);
|
||||
const chunks: Buffer[] = [];
|
||||
|
||||
return new Promise<{
|
||||
text: string;
|
||||
sourceName: string;
|
||||
} | null>((resolve, reject) => {
|
||||
downloadStream.on('data', (chunk) => {
|
||||
chunks.push(chunk);
|
||||
});
|
||||
const fileBuffers = await gridFsStream2Buffer(downloadStream);
|
||||
|
||||
downloadStream.on('end', () => {
|
||||
const buffer = Buffer.concat(chunks);
|
||||
const text = buffer.toString('utf8');
|
||||
resolve({
|
||||
text,
|
||||
sourceName: bufferData.metadata?.sourceName || ''
|
||||
});
|
||||
});
|
||||
const rawText = await (async () => {
|
||||
if (fileBuffers.length < 10000000) {
|
||||
return fileBuffers.toString('utf8');
|
||||
} else {
|
||||
return (
|
||||
await readRawContentFromBuffer({
|
||||
extension: 'txt',
|
||||
encoding: 'utf8',
|
||||
buffer: fileBuffers
|
||||
})
|
||||
).rawText;
|
||||
}
|
||||
})();
|
||||
|
||||
downloadStream.on('error', (error) => {
|
||||
addLog.error('getRawTextBuffer error', error);
|
||||
resolve(null);
|
||||
});
|
||||
});
|
||||
return {
|
||||
text: rawText,
|
||||
sourceName: bufferData.metadata?.sourceName || ''
|
||||
};
|
||||
});
|
||||
};
|
||||
|
||||
|
@@ -55,13 +55,17 @@ export const createFileFromText = async ({
|
||||
|
||||
export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
|
||||
return new Promise<Buffer>((resolve, reject) => {
|
||||
if (!stream.readable) {
|
||||
return resolve(Buffer.from([]));
|
||||
}
|
||||
|
||||
const chunks: Uint8Array[] = [];
|
||||
|
||||
stream.on('data', (chunk) => {
|
||||
chunks.push(chunk);
|
||||
});
|
||||
stream.on('end', () => {
|
||||
const resultBuffer = Buffer.concat(chunks); // 一次性拼接
|
||||
const resultBuffer = Buffer.concat(chunks); // One-time splicing
|
||||
resolve(resultBuffer);
|
||||
});
|
||||
stream.on('error', (err) => {
|
||||
|
@@ -1,6 +1,5 @@
|
||||
import { uploadMongoImg } from '../image/controller';
|
||||
import FormData from 'form-data';
|
||||
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
||||
import fs from 'fs';
|
||||
import type { ReadFileResponse } from '../../../worker/readFile/type';
|
||||
import axios from 'axios';
|
||||
@@ -9,6 +8,7 @@ import { batchRun } from '@fastgpt/global/common/system/utils';
|
||||
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
|
||||
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
|
||||
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
|
||||
import { readRawContentFromBuffer } from '../../../worker/function';
|
||||
|
||||
export type readRawTextByLocalFileParams = {
|
||||
teamId: string;
|
||||
@@ -63,11 +63,10 @@ export const readRawContentByFileBuffer = async ({
|
||||
rawText: string;
|
||||
}> => {
|
||||
const systemParse = () =>
|
||||
runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
|
||||
readRawContentFromBuffer({
|
||||
extension,
|
||||
encoding,
|
||||
buffer,
|
||||
teamId
|
||||
buffer
|
||||
});
|
||||
const parsePdfFromCustomService = async (): Promise<ReadFileResponse> => {
|
||||
const url = global.systemEnv.customPdfParse?.url;
|
||||
|
@@ -1,3 +1,4 @@
|
||||
import { isTestEnv } from '@fastgpt/global/common/system/constants';
|
||||
import { addLog } from '../../common/system/log';
|
||||
import type { Model } from 'mongoose';
|
||||
import mongoose, { Mongoose } from 'mongoose';
|
||||
@@ -70,7 +71,7 @@ const addCommonMiddleware = (schema: mongoose.Schema) => {
|
||||
|
||||
export const getMongoModel = <T>(name: string, schema: mongoose.Schema) => {
|
||||
if (connectionMongo.models[name]) return connectionMongo.models[name] as Model<T>;
|
||||
if (process.env.NODE_ENV !== 'test') console.log('Load model======', name);
|
||||
if (!isTestEnv) console.log('Load model======', name);
|
||||
addCommonMiddleware(schema);
|
||||
|
||||
const model = connectionMongo.model<T>(name, schema);
|
||||
|
Reference in New Issue
Block a user