Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 13:03:50 +00:00)

Feat: pptx and xlsx loader (#1118)

* perf: plan tip
* perf: upload size controller
* feat: add image ttl index
* feat: new upload file ux
* remove file
* feat: support read pptx
* feat: support xlsx
* fix: rerank docker file
@@ -3,12 +3,17 @@ import { ErrType } from '../errorCode';
/* dataset: 507000 */
const startCode = 507000;
export enum CommonErrEnum {
  fileNotFound = 'fileNotFound'
  fileNotFound = 'fileNotFound',
  unAuthFile = 'unAuthFile'
}
const datasetErr = [
  {
    statusText: CommonErrEnum.fileNotFound,
    message: 'error.fileNotFound'
  },
  {
    statusText: CommonErrEnum.unAuthFile,
    message: 'error.unAuthFile'
  }
];
export default datasetErr.reduce((acc, cur, index) => {
@@ -40,9 +40,9 @@ export const splitText2Chunks = (props: {
    { reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },

    { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
    { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
    { reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // Enlarge the chunk so that, as far as possible, it is a complete paragraph. (?![\*\-|>`0-9]): markdown special char
    { reg: /([\n])/g, maxLen: chunkLen * 1.2 },

    // ------ There's no overlap on the top
    { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
    { reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 },
    { reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 },
@@ -56,7 +56,7 @@ export const splitText2Chunks = (props: {
  const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
  const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;

  // if use markdown title split, Separate record title title
  // if use markdown title split, Separate record title
  const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
    if (step >= stepReges.length) {
      return [
@@ -97,6 +97,7 @@ export const splitText2Chunks = (props: {
      .filter((item) => item.text.trim());
  };

  /* Gets the overlap at the end of a text as the beginning of the next block */
  const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
    const forbidOverlap = checkForbidOverlap(step);
    const maxOverlapLen = chunkLen * 0.4;
@@ -55,6 +55,7 @@ export type FastGPTFeConfigsType = {
  customApiDomain?: string;
  customSharePageDomain?: string;

  uploadFileMaxAmount?: number;
  uploadFileMaxSize?: number;
};
12 packages/global/core/dataset/api.d.ts vendored
@@ -44,14 +44,18 @@ export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
export type LinkCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  link: string;
};
export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  fileId: string;
};
export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
  name: string;
  rawTextLength: number;
  hashRawText: string;

  fileMetadata?: Record<string, any>;
  collectionMetadata?: Record<string, any>;
};
export type CsvTableCreateDatasetCollectionParams = {
  datasetId: string;
  parentId?: string;
  fileId: string;
};

/* ================= data ===================== */
export type PgSearchRawType = {
@@ -73,6 +73,13 @@ export const DatasetCollectionSyncResultMap = {
/* ------------ data -------------- */

/* ------------ training -------------- */
export enum ImportDataSourceEnum {
  fileLocal = 'fileLocal',
  fileLink = 'fileLink',
  fileCustom = 'fileCustom',
  csvTable = 'csvTable'
}

export enum TrainingModeEnum {
  chunk = 'chunk',
  auto = 'auto',
@@ -2,18 +2,18 @@
|
||||
"name": "@fastgpt/global",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"@apidevtools/swagger-parser": "^10.1.0",
|
||||
"axios": "^1.5.1",
|
||||
"dayjs": "^1.11.7",
|
||||
"encoding": "^0.1.13",
|
||||
"js-tiktoken": "^1.0.7",
|
||||
"openapi-types": "^12.1.3",
|
||||
"openai": "4.28.0",
|
||||
"nanoid": "^4.0.1",
|
||||
"js-yaml": "^4.1.0",
|
||||
"timezones-list": "^3.0.2",
|
||||
"next": "13.5.2",
|
||||
"jschardet": "3.1.1",
|
||||
"@apidevtools/swagger-parser": "^10.1.0"
|
||||
"nanoid": "^4.0.1",
|
||||
"next": "13.5.2",
|
||||
"openai": "4.28.0",
|
||||
"openapi-types": "^12.1.3",
|
||||
"timezones-list": "^3.0.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
|
33 packages/service/common/buffer/rawText/schema.ts Normal file
@@ -0,0 +1,33 @@
import { connectionMongo, type Model } from '../../mongo';
const { Schema, model, models } = connectionMongo;
import { RawTextBufferSchemaType } from './type';

export const collectionName = 'buffer.rawText';

const RawTextBufferSchema = new Schema({
  sourceId: {
    type: String,
    required: true
  },
  rawText: {
    type: String,
    default: ''
  },
  createTime: {
    type: Date,
    default: () => new Date()
  },
  metadata: Object
});

try {
  RawTextBufferSchema.index({ sourceId: 1 });
  // 20 minutes
  RawTextBufferSchema.index({ createTime: 1 }, { expireAfterSeconds: 20 * 60 });
} catch (error) {
  console.log(error);
}

export const MongoRwaTextBuffer: Model<RawTextBufferSchemaType> =
  models[collectionName] || model(collectionName, RawTextBufferSchema);
MongoRwaTextBuffer.syncIndexes();
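The collection above is a short-lived raw-text cache: the TTL index drops entries roughly 20 minutes after createTime, so repeated reads of the same file within that window skip re-parsing. A minimal sketch of the read-through pattern it enables (the helper name getCachedRawText is hypothetical; MongoRwaTextBuffer is the model exported above):

import { MongoRwaTextBuffer } from './schema';

// Return cached raw text for a source if present; otherwise read it and cache it.
async function getCachedRawText(sourceId: string, readSource: () => Promise<string>) {
  const hit = await MongoRwaTextBuffer.findOne({ sourceId }).lean();
  if (hit) return hit.rawText;

  const rawText = await readSource();
  if (rawText.trim()) {
    // MongoDB's TTL monitor removes this document about 20 minutes later.
    await MongoRwaTextBuffer.create({ sourceId, rawText });
  }
  return rawText;
}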
8 packages/service/common/buffer/rawText/type.d.ts vendored Normal file
@@ -0,0 +1,8 @@
export type RawTextBufferSchemaType = {
  sourceId: string;
  rawText: string;
  createTime: Date;
  metadata?: {
    filename: string;
  };
};
@@ -2,7 +2,7 @@ import { connectionMongo, type Model } from '../../../common/mongo';
const { Schema, model, models } = connectionMongo;
import { TTSBufferSchemaType } from './type.d';

export const collectionName = 'ttsbuffers';
export const collectionName = 'buffer.tts';

const TTSBufferSchema = new Schema({
  bufferId: {
@@ -4,6 +4,18 @@ import fsp from 'fs/promises';
import fs from 'fs';
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
import { MongoFileSchema } from './schema';
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { readFileRawText } from '../read/rawText';
import { ReadFileByBufferParams } from '../read/type';
import { readMarkdown } from '../read/markdown';
import { readHtmlRawText } from '../read/html';
import { readPdfFile } from '../read/pdf';
import { readWordFile } from '../read/word';
import { readCsvRawText } from '../read/csv';
import { MongoRwaTextBuffer } from '../../buffer/rawText/schema';
import { readPptxRawText } from '../read/pptx';
import { readXlsxRawText } from '../read/xlsx';

export function getGFSCollection(bucket: `${BucketNameEnum}`) {
  MongoFileSchema;
@@ -111,3 +123,139 @@ export async function getDownloadStream({

  return bucket.openDownloadStream(new Types.ObjectId(fileId));
}

export const readFileEncode = async ({
  bucketName,
  fileId
}: {
  bucketName: `${BucketNameEnum}`;
  fileId: string;
}) => {
  const encodeStream = await getDownloadStream({ bucketName, fileId });
  let buffers: Buffer = Buffer.from([]);
  for await (const chunk of encodeStream) {
    buffers = Buffer.concat([buffers, chunk]);
    if (buffers.length > 10) {
      encodeStream.abort();
      break;
    }
  }

  const encoding = detectFileEncoding(buffers);

  return encoding as BufferEncoding;
};

export const readFileContent = async ({
  teamId,
  bucketName,
  fileId,
  csvFormat = false
}: {
  teamId: string;
  bucketName: `${BucketNameEnum}`;
  fileId: string;
  csvFormat?: boolean;
}): Promise<{
  rawText: string;
  filename: string;
}> => {
  // read buffer
  const fileBuffer = await MongoRwaTextBuffer.findOne({ sourceId: fileId }).lean();
  if (fileBuffer) {
    return {
      rawText: fileBuffer.rawText,
      filename: fileBuffer.metadata?.filename || ''
    };
  }

  const [file, encoding, fileStream] = await Promise.all([
    getFileById({ bucketName, fileId }),
    readFileEncode({ bucketName, fileId }),
    getDownloadStream({ bucketName, fileId })
  ]);

  if (!file) {
    return Promise.reject(CommonErrEnum.fileNotFound);
  }

  const extension = file?.filename?.split('.')?.pop()?.toLowerCase() || '';

  const fileBuffers = await (() => {
    return new Promise<Buffer>((resolve, reject) => {
      let buffers = Buffer.from([]);
      fileStream.on('data', (chunk) => {
        buffers = Buffer.concat([buffers, chunk]);
      });
      fileStream.on('end', () => {
        resolve(buffers);
      });
      fileStream.on('error', (err) => {
        reject(err);
      });
    });
  })();

  const params: ReadFileByBufferParams = {
    teamId,
    buffer: fileBuffers,
    encoding,
    metadata: {
      relatedId: fileId
    }
  };

  const { rawText } = await (async () => {
    switch (extension) {
      case 'txt':
        return readFileRawText(params);
      case 'md':
        return readMarkdown(params);
      case 'html':
        return readHtmlRawText(params);
      case 'pdf':
        return readPdfFile(params);
      case 'docx':
        return readWordFile(params);
      case 'pptx':
        return readPptxRawText(params);
      case 'xlsx':
        const xlsxResult = await readXlsxRawText(params);
        if (csvFormat) {
          return {
            rawText: xlsxResult.formatText || ''
          };
        }
        return {
          rawText: xlsxResult.rawText
        };
      case 'csv':
        const csvResult = await readCsvRawText(params);
        if (csvFormat) {
          return {
            rawText: csvResult.formatText || ''
          };
        }
        return {
          rawText: csvResult.rawText
        };
      default:
        return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
    }
  })();

  if (rawText.trim()) {
    await MongoRwaTextBuffer.create({
      sourceId: fileId,
      rawText,
      metadata: {
        filename: file.filename
      }
    });
  }

  return {
    rawText,
    filename: file.filename
  };
};
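readFileContent above is now the single entry point for turning a stored dataset file into raw text, with the pptx and xlsx branches added by this PR. A minimal usage sketch (the wrapper function and import paths are illustrative, not part of the diff; readFileContent and BucketNameEnum are the exports shown above):

import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { readFileContent } from '../common/file/gridfs/controller';

// Read an uploaded .xlsx as "header:value" lines so each row can be chunked directly.
async function loadXlsxAsText(teamId: string, fileId: string) {
  const { rawText, filename } = await readFileContent({
    teamId,
    bucketName: BucketNameEnum.dataset,
    fileId,
    csvFormat: true // for xlsx/csv, return formatText instead of the plain CSV dump
  });
  return { filename, preview: rawText.slice(0, 200) };
}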
@@ -14,7 +14,6 @@ export async function uploadMongoImg({
|
||||
teamId,
|
||||
expiredTime,
|
||||
metadata,
|
||||
|
||||
shareId
|
||||
}: UploadImgProps & {
|
||||
teamId: string;
|
||||
@@ -30,9 +29,8 @@ export async function uploadMongoImg({
|
||||
type,
|
||||
teamId,
|
||||
binary,
|
||||
expiredTime: expiredTime,
|
||||
expiredTime,
|
||||
metadata,
|
||||
|
||||
shareId
|
||||
});
|
||||
|
||||
|
@@ -25,13 +25,13 @@ const ImageSchema = new Schema({
|
||||
enum: Object.keys(mongoImageTypeMap),
|
||||
required: true
|
||||
},
|
||||
|
||||
metadata: {
|
||||
type: Object
|
||||
}
|
||||
});
|
||||
|
||||
try {
|
||||
// tts expired
|
||||
ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
|
||||
ImageSchema.index({ type: 1 });
|
||||
ImageSchema.index({ createTime: 1 });
|
||||
|
21 packages/service/common/file/read/csv.ts Normal file
@@ -0,0 +1,21 @@
import Papa from 'papaparse';
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { readFileRawText } from './rawText';

// Load the raw content of the source file
export const readCsvRawText = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { rawText } = readFileRawText(params);

  const csvArr = Papa.parse(rawText).data as string[][];

  const header = csvArr[0];

  const formatText = header
    ? csvArr.map((item) => item.map((item, i) => `${header[i]}:${item}`).join('\n')).join('\n')
    : '';

  return {
    rawText,
    formatText
  };
};
23 packages/service/common/file/read/html.ts Normal file
@@ -0,0 +1,23 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { initMarkdownText } from './utils';
import { htmlToMarkdown } from '../../string/markdown';
import { readFileRawText } from './rawText';

export const readHtmlRawText = async (
  params: ReadFileByBufferParams
): Promise<ReadFileResponse> => {
  const { teamId, metadata } = params;
  const { rawText: html } = readFileRawText(params);

  const md = await htmlToMarkdown(html);

  const rawText = await initMarkdownText({
    teamId,
    md,
    metadata
  });

  return {
    rawText
  };
};
18 packages/service/common/file/read/markdown.ts Normal file
@@ -0,0 +1,18 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import { initMarkdownText } from './utils';
import { readFileRawText } from './rawText';

export const readMarkdown = async (params: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const { teamId, metadata } = params;
  const { rawText: md } = readFileRawText(params);

  const rawText = await initMarkdownText({
    teamId,
    md,
    metadata
  });

  return {
    rawText
  };
};
119 packages/service/common/file/read/parseOffice.ts Normal file
@@ -0,0 +1,119 @@
import { getNanoid } from '@fastgpt/global/common/string/tools';
import fs from 'fs';
import decompress from 'decompress';
import { DOMParser } from '@xmldom/xmldom';
import { clearDirFiles } from '../utils';
import { addLog } from '../../system/log';

const DEFAULTDECOMPRESSSUBLOCATION = '/tmp';

function getNewFileName(ext: string) {
  return `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}.${ext}`;
}

const parseString = (xml: string) => {
  let parser = new DOMParser();
  return parser.parseFromString(xml, 'text/xml');
};

const parsePowerPoint = async ({
  filepath,
  decompressPath,
  encoding
}: {
  filepath: string;
  decompressPath: string;
  encoding: BufferEncoding;
}) => {
  // Files regex that hold our content of interest
  const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g;
  const slidesRegex = /ppt\/slides\/slide\d+.xml/g;

  /** The decompress location which contains the filename in it */

  const files = await decompress(filepath, decompressPath, {
    filter: (x) => !!x.path.match(allFilesRegex)
  });

  // Verify that at least the slides xml files exist in the extracted files list.
  if (
    files.length == 0 ||
    !files.map((file) => file.path).some((filename) => filename.match(slidesRegex))
  ) {
    return Promise.reject('解析 PPT 失败');
  }

  // Returning an array of all the xml contents read using fs.readFileSync
  const xmlContentArray = files.map((file) =>
    fs.readFileSync(`${decompressPath}/${file.path}`, encoding)
  );

  let responseArr: string[] = [];

  xmlContentArray.forEach((xmlContent) => {
    /** Find text nodes with a:p tags */
    const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName('a:p');

    /** Store all the text content to respond */
    responseArr.push(
      Array.from(xmlParagraphNodesList)
        // Filter out paragraph nodes that do not have any text nodes, which are identifiable by a:t tags
        .filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0)
        .map((paragraphNode) => {
          /** Find text nodes with a:t tags */
          const xmlTextNodeList = paragraphNode.getElementsByTagName('a:t');
          return Array.from(xmlTextNodeList)
            .filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)
            .map((textNode) => textNode.childNodes[0].nodeValue)
            .join('');
        })
        .join('\n')
    );
  });

  return responseArr.join('\n');
};

export const parseOffice = async ({
  buffer,
  encoding,
  extension
}: {
  buffer: Buffer;
  encoding: BufferEncoding;
  extension: string;
}) => {
  // Prepare file for processing
  // create temp file subdirectory if it does not exist
  if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) {
    fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true });
  }

  // temp file name
  const filepath = getNewFileName(extension);
  const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`;
  // const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/test`;

  // write new file
  fs.writeFileSync(filepath, buffer, {
    encoding
  });

  const text = await (async () => {
    try {
      switch (extension) {
        case 'pptx':
          return parsePowerPoint({ filepath, decompressPath, encoding });
        default:
          return Promise.reject('只能读取 .pptx 文件');
      }
    } catch (error) {
      addLog.error(`Load ppt error`, { error });
    }
    return '';
  })();

  fs.unlinkSync(filepath);
  clearDirFiles(decompressPath);
  return text;
};
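parseOffice writes the buffer to a temp file under /tmp, unzips only the slide and notes XML entries, and concatenates the text found in a:t nodes. A minimal sketch of calling it directly on a local .pptx (the file path and the utf-8 encoding are assumptions for illustration; in the upload flow the encoding comes from detectFileEncoding):

import fs from 'fs';
import { parseOffice } from './parseOffice';

// Extract slide text from a local .pptx as a quick smoke test.
async function pptxToText(path: string) {
  const buffer = fs.readFileSync(path); // e.g. '/tmp/demo.pptx' (hypothetical)
  return parseOffice({ buffer, encoding: 'utf-8', extension: 'pptx' });
}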
@@ -1,5 +1,7 @@
|
||||
/* read file to txt */
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
// @ts-ignore
|
||||
import('pdfjs-dist/legacy/build/pdf.worker.min.mjs');
|
||||
import { ReadFileByBufferParams, ReadFileResponse } from './type';
|
||||
|
||||
type TokenType = {
|
||||
str: string;
|
||||
@@ -11,9 +13,9 @@ type TokenType = {
|
||||
hasEOL: boolean;
|
||||
};
|
||||
|
||||
export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
|
||||
pdfjsLib.GlobalWorkerOptions.workerSrc = '/js/pdf.worker.js';
|
||||
|
||||
export const readPdfFile = async ({
|
||||
buffer
|
||||
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
|
||||
const readPDFPage = async (doc: any, pageNo: number) => {
|
||||
const page = await doc.getPage(pageNo);
|
||||
const tokenizedText = await page.getTextContent();
|
||||
@@ -51,14 +53,19 @@ export const readPdfFile = async ({ pdf }: { pdf: ArrayBuffer }) => {
|
||||
.join('');
|
||||
};
|
||||
|
||||
const doc = await pdfjsLib.getDocument(pdf).promise;
|
||||
const loadingTask = pdfjs.getDocument(buffer.buffer);
|
||||
const doc = await loadingTask.promise;
|
||||
|
||||
const pageTextPromises = [];
|
||||
for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
|
||||
pageTextPromises.push(readPDFPage(doc, pageNo));
|
||||
}
|
||||
const pageTexts = await Promise.all(pageTextPromises);
|
||||
|
||||
loadingTask.destroy();
|
||||
|
||||
return {
|
||||
rawText: pageTexts.join('')
|
||||
rawText: pageTexts.join(''),
|
||||
metadata: {}
|
||||
};
|
||||
};
|
14 packages/service/common/file/read/pptx.ts Normal file
@@ -0,0 +1,14 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
// import { parseOfficeAsync } from 'officeparser';
import { parseOffice } from './parseOffice';

export const readPptxRawText = async ({
  buffer,
  encoding
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const result = await parseOffice({ buffer, encoding, extension: 'pptx' });

  return {
    rawText: result
  };
};
10 packages/service/common/file/read/rawText.ts Normal file
@@ -0,0 +1,10 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';

// Load the raw content of the source file
export const readFileRawText = ({ buffer, encoding }: ReadFileByBufferParams): ReadFileResponse => {
  const content = buffer.toString(encoding);

  return {
    rawText: content
  };
};
12 packages/service/common/file/read/type.d.ts vendored Normal file
@@ -0,0 +1,12 @@
export type ReadFileByBufferParams = {
  teamId: string;
  buffer: Buffer;
  encoding: BufferEncoding;
  metadata?: Record<string, any>;
};

export type ReadFileResponse = {
  rawText: string;
  formatText?: string;
  metadata?: Record<string, any>;
};
25 packages/service/common/file/read/utils.ts Normal file
@@ -0,0 +1,25 @@
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
import { uploadMongoImg } from '../image/controller';
import { MongoImageTypeEnum } from '@fastgpt/global/common/file/image/constants';
import { addHours } from 'date-fns';

export const initMarkdownText = ({
  teamId,
  md,
  metadata
}: {
  md: string;
  teamId: string;
  metadata?: Record<string, any>;
}) =>
  markdownProcess({
    rawText: md,
    uploadImgController: (base64Img) =>
      uploadMongoImg({
        type: MongoImageTypeEnum.collectionImage,
        base64Img,
        teamId,
        metadata,
        expiredTime: addHours(new Date(), 2)
      })
  });
35 packages/service/common/file/read/word.ts Normal file
@@ -0,0 +1,35 @@
import mammoth from 'mammoth';
import { htmlToMarkdown } from '../../string/markdown';
import { ReadFileByBufferParams, ReadFileResponse } from './type';
import { initMarkdownText } from './utils';

/**
 * read docx to markdown
 */
export const readWordFile = async ({
  teamId,
  buffer,
  metadata = {}
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  try {
    const { value: html } = await mammoth.convertToHtml({
      buffer
    });

    const md = await htmlToMarkdown(html);

    const rawText = await initMarkdownText({
      teamId,
      md,
      metadata
    });

    return {
      rawText,
      metadata: {}
    };
  } catch (error) {
    console.log('error doc read:', error);
    return Promise.reject('Can not read doc file, please convert to PDF');
  }
};
45 packages/service/common/file/read/xlsx.ts Normal file
@@ -0,0 +1,45 @@
import { ReadFileByBufferParams, ReadFileResponse } from './type.d';
import xlsx from 'node-xlsx';
import Papa from 'papaparse';

export const readXlsxRawText = async ({
  buffer
}: ReadFileByBufferParams): Promise<ReadFileResponse> => {
  const result = xlsx.parse(buffer, {
    skipHidden: false,
    defval: ''
  });

  const format2Csv = result.map(({ name, data }) => {
    return {
      title: `#${name}`,
      csvText: data.map((item) => item.join(',')).join('\n')
    };
  });

  const rawText = format2Csv.map((item) => item.csvText).join('\n');
  const formatText = format2Csv
    .map((item) => {
      const csvArr = Papa.parse(item.csvText).data as string[][];
      const header = csvArr[0];

      const formatText = header
        ? csvArr
            .map((item) =>
              item
                .map((item, i) => (item ? `${header[i]}:${item}` : ''))
                .filter(Boolean)
                .join('\n')
            )
            .join('\n')
        : '';

      return `${item.title}\n${formatText}`;
    })
    .join('\n');

  return {
    rawText: rawText,
    formatText
  };
};
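readXlsxRawText returns two views of the same workbook: rawText is a plain per-sheet CSV dump, while formatText prefixes every cell with its column header and labels each sheet with "#name". A quick illustration for a hypothetical one-sheet workbook (values invented for the example):

// Workbook: sheet "Sheet1" with header row [name, price] and data row [apple, 3]
// const { rawText, formatText } = await readXlsxRawText({ teamId, buffer, encoding: 'utf-8' });
// rawText    === 'name,price\napple,3'
// formatText === '#Sheet1\nname:name\nprice:price\nname:apple\nprice:3'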
@@ -35,13 +35,8 @@ export const clearDirFiles = (dirPath: string) => {
|
||||
return;
|
||||
}
|
||||
|
||||
fs.readdirSync(dirPath).forEach((file) => {
|
||||
const curPath = `${dirPath}/${file}`;
|
||||
if (fs.lstatSync(curPath).isDirectory()) {
|
||||
clearDirFiles(curPath);
|
||||
} else {
|
||||
fs.unlinkSync(curPath);
|
||||
}
|
||||
fs.rmdirSync(dirPath, {
|
||||
recursive: true
|
||||
});
|
||||
};
|
||||
|
||||
|
@@ -9,7 +9,6 @@ import {
|
||||
DatasetCollectionSchemaType
|
||||
} from '@fastgpt/global/core/dataset/type';
|
||||
import { MongoDatasetTraining } from '../training/schema';
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { MongoDatasetData } from '../data/schema';
|
||||
import { delImgByRelatedId } from '../../../common/file/image/controller';
|
||||
import { deleteDatasetDataVector } from '../../../common/vectorStore/controller';
|
||||
|
6
packages/service/core/dataset/training/constants.ts
Normal file
6
packages/service/core/dataset/training/constants.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
export enum ImportDataSourceEnum {
|
||||
fileLocal = 'fileLocal',
|
||||
fileLink = 'fileLink',
|
||||
fileCustom = 'fileCustom',
|
||||
tableLocal = 'tableLocal'
|
||||
}
|
@@ -1,14 +1,16 @@
|
||||
import { delay } from '@fastgpt/global/common/system/utils';
|
||||
import { MongoDatasetTraining } from './schema';
|
||||
import type {
|
||||
PushDatasetDataChunkProps,
|
||||
PushDatasetDataProps,
|
||||
PushDatasetDataResponse
|
||||
} from '@fastgpt/global/core/dataset/api.d';
|
||||
import { getCollectionWithDataset } from '../controller';
|
||||
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
|
||||
import { simpleText } from '@fastgpt/global/common/string/tools';
|
||||
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
|
||||
import { ClientSession } from '../../../common/mongo';
|
||||
import { getLLMModel, getVectorModel } from '../../ai/model';
|
||||
import { addLog } from '../../../common/system/log';
|
||||
import { getCollectionWithDataset } from '../controller';
|
||||
|
||||
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
|
||||
try {
|
||||
@@ -23,31 +25,52 @@ export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> =>
|
||||
} catch (error) {}
|
||||
};
|
||||
|
||||
export async function pushDataListToTrainingQueue({
|
||||
teamId,
|
||||
tmbId,
|
||||
export const pushDataListToTrainingQueueByCollectionId = async ({
|
||||
collectionId,
|
||||
data,
|
||||
prompt,
|
||||
billId,
|
||||
trainingMode = TrainingModeEnum.chunk
|
||||
...props
|
||||
}: {
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
|
||||
const vectorModelList = global.vectorModels;
|
||||
const datasetModelList = global.llmModels;
|
||||
|
||||
session?: ClientSession;
|
||||
} & PushDatasetDataProps) => {
|
||||
const {
|
||||
datasetId: { _id: datasetId, vectorModel, agentModel }
|
||||
datasetId: { _id: datasetId, agentModel, vectorModel }
|
||||
} = await getCollectionWithDataset(collectionId);
|
||||
return pushDataListToTrainingQueue({
|
||||
...props,
|
||||
datasetId,
|
||||
collectionId,
|
||||
agentModel,
|
||||
vectorModel
|
||||
});
|
||||
};
|
||||
|
||||
export async function pushDataListToTrainingQueue({
|
||||
teamId,
|
||||
tmbId,
|
||||
datasetId,
|
||||
collectionId,
|
||||
agentModel,
|
||||
vectorModel,
|
||||
data,
|
||||
prompt,
|
||||
billId,
|
||||
trainingMode = TrainingModeEnum.chunk,
|
||||
session
|
||||
}: {
|
||||
teamId: string;
|
||||
tmbId: string;
|
||||
datasetId: string;
|
||||
agentModel: string;
|
||||
vectorModel: string;
|
||||
session?: ClientSession;
|
||||
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
|
||||
const checkModelValid = async () => {
|
||||
const agentModelData = datasetModelList?.find((item) => item.model === agentModel);
|
||||
const agentModelData = getLLMModel(agentModel);
|
||||
if (!agentModelData) {
|
||||
return Promise.reject(`File model ${agentModel} is inValid`);
|
||||
}
|
||||
const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
|
||||
const vectorModelData = getVectorModel(vectorModel);
|
||||
if (!vectorModelData) {
|
||||
return Promise.reject(`Vector model ${vectorModel} is inValid`);
|
||||
}
|
||||
@@ -124,52 +147,43 @@ export async function pushDataListToTrainingQueue({
|
||||
});
|
||||
|
||||
// insert data to db
|
||||
const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
|
||||
try {
|
||||
const results = await MongoDatasetTraining.insertMany(
|
||||
dataList.map((item, i) => ({
|
||||
teamId,
|
||||
tmbId,
|
||||
datasetId,
|
||||
collectionId,
|
||||
billId,
|
||||
mode: trainingMode,
|
||||
prompt,
|
||||
model,
|
||||
q: item.q,
|
||||
a: item.a,
|
||||
chunkIndex: item.chunkIndex ?? 0,
|
||||
weight: weight ?? 0,
|
||||
indexes: item.indexes
|
||||
}))
|
||||
);
|
||||
await delay(500);
|
||||
return results.length;
|
||||
} catch (error) {
|
||||
if (retry > 0) {
|
||||
await delay(500);
|
||||
return insertData(dataList, retry - 1);
|
||||
}
|
||||
return Promise.reject(error);
|
||||
}
|
||||
};
|
||||
const insertLen = filterResult.success.length;
|
||||
const failedDocuments: PushDatasetDataChunkProps[] = [];
|
||||
|
||||
let insertLen = 0;
|
||||
const chunkSize = 50;
|
||||
const chunkList = filterResult.success.reduce(
|
||||
(acc, cur) => {
|
||||
const lastChunk = acc[acc.length - 1];
|
||||
if (lastChunk.length < chunkSize) {
|
||||
lastChunk.push(cur);
|
||||
} else {
|
||||
acc.push([cur]);
|
||||
// Batch insert with insertMany
|
||||
try {
|
||||
await MongoDatasetTraining.insertMany(
|
||||
filterResult.success.map((item) => ({
|
||||
teamId,
|
||||
tmbId,
|
||||
datasetId,
|
||||
collectionId,
|
||||
billId,
|
||||
mode: trainingMode,
|
||||
prompt,
|
||||
model,
|
||||
q: item.q,
|
||||
a: item.a,
|
||||
chunkIndex: item.chunkIndex ?? 0,
|
||||
weight: weight ?? 0,
|
||||
indexes: item.indexes
|
||||
})),
|
||||
{
|
||||
session
|
||||
}
|
||||
return acc;
|
||||
},
|
||||
[[]] as PushDatasetDataChunkProps[][]
|
||||
);
|
||||
for await (const chunks of chunkList) {
|
||||
insertLen += await insertData(chunks);
|
||||
);
|
||||
} catch (error: any) {
|
||||
addLog.error(`Insert error`, error);
|
||||
// If there are errors, add the failed documents to the failure list
|
||||
error.writeErrors.forEach((writeError: any) => {
|
||||
failedDocuments.push(data[writeError.index]);
|
||||
});
|
||||
console.log('failed', failedDocuments);
|
||||
}
|
||||
|
||||
// For the failed documents, try inserting them one by one
|
||||
for await (const item of failedDocuments) {
|
||||
await MongoDatasetTraining.create(item);
|
||||
}
|
||||
|
||||
delete filterResult.success;
|
||||
|
@@ -2,6 +2,7 @@ import { DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
|
||||
import { addLog } from '../../../common/system/log';
|
||||
import { getErrText } from '@fastgpt/global/common/error/utils';
|
||||
import { MongoDatasetTraining } from './schema';
|
||||
import Papa from 'papaparse';
|
||||
|
||||
export const checkInvalidChunkAndLock = async ({
|
||||
err,
|
||||
@@ -39,3 +40,18 @@ export const checkInvalidChunkAndLock = async ({
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
export const parseCsvTable2Chunks = (rawText: string) => {
|
||||
const csvArr = Papa.parse(rawText).data as string[][];
|
||||
|
||||
const chunks = csvArr
|
||||
.map((item) => ({
|
||||
q: item[0] || '',
|
||||
a: item[1] || ''
|
||||
}))
|
||||
.filter((item) => item.q || item.a);
|
||||
|
||||
return {
|
||||
chunks
|
||||
};
|
||||
};
|
||||
|
@@ -4,27 +4,36 @@
|
||||
"dependencies": {
|
||||
"@fastgpt/global": "workspace:*",
|
||||
"@node-rs/jieba": "1.10.0",
|
||||
"@xmldom/xmldom": "^0.8.10",
|
||||
"axios": "^1.5.1",
|
||||
"cheerio": "1.0.0-rc.12",
|
||||
"cookie": "^0.5.0",
|
||||
"date-fns": "2.30.0",
|
||||
"dayjs": "^1.11.7",
|
||||
"decompress": "^4.2.1",
|
||||
"encoding": "^0.1.13",
|
||||
"file-type": "^19.0.0",
|
||||
"json5": "^2.2.3",
|
||||
"jsonwebtoken": "^9.0.2",
|
||||
"mammoth": "^1.6.0",
|
||||
"mongoose": "^7.0.2",
|
||||
"multer": "1.4.5-lts.1",
|
||||
"next": "13.5.2",
|
||||
"nextjs-cors": "^2.1.2",
|
||||
"node-cron": "^3.0.3",
|
||||
"node-xlsx": "^0.23.0",
|
||||
"papaparse": "5.4.1",
|
||||
"pdfjs-dist": "4.0.269",
|
||||
"pg": "^8.10.0",
|
||||
"tunnel": "^0.0.6"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/cookie": "^0.5.2",
|
||||
"@types/decompress": "^4.2.7",
|
||||
"@types/jsonwebtoken": "^9.0.3",
|
||||
"@types/multer": "^1.4.10",
|
||||
"@types/node-cron": "^3.0.11",
|
||||
"@types/papaparse": "5.3.7",
|
||||
"@types/pg": "^8.6.6",
|
||||
"@types/tunnel": "^0.0.4"
|
||||
}
|
||||
|
42
packages/service/support/permission/auth/file.ts
Normal file
42
packages/service/support/permission/auth/file.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
import { AuthResponseType } from '@fastgpt/global/support/permission/type';
|
||||
import { AuthModeType } from '../type';
|
||||
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
|
||||
import { parseHeaderCert } from '../controller';
|
||||
import { getFileById } from '../../../common/file/gridfs/controller';
|
||||
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
|
||||
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
|
||||
|
||||
export async function authFile({
|
||||
fileId,
|
||||
per = 'owner',
|
||||
...props
|
||||
}: AuthModeType & {
|
||||
fileId: string;
|
||||
}): Promise<
|
||||
AuthResponseType & {
|
||||
file: DatasetFileSchema;
|
||||
}
|
||||
> {
|
||||
const authRes = await parseHeaderCert(props);
|
||||
const { teamId, tmbId } = authRes;
|
||||
|
||||
const file = await getFileById({ bucketName: BucketNameEnum.dataset, fileId });
|
||||
|
||||
if (!file) {
|
||||
return Promise.reject(CommonErrEnum.fileNotFound);
|
||||
}
|
||||
|
||||
if (file.metadata?.teamId !== teamId) {
|
||||
return Promise.reject(CommonErrEnum.unAuthFile);
|
||||
}
|
||||
if (per === 'owner' && file.metadata?.tmbId !== tmbId) {
|
||||
return Promise.reject(CommonErrEnum.unAuthFile);
|
||||
}
|
||||
|
||||
return {
|
||||
...authRes,
|
||||
isOwner: per === 'owner',
|
||||
canWrite: per === 'owner',
|
||||
file
|
||||
};
|
||||
}
|
@@ -1,40 +0,0 @@
|
||||
import Papa from 'papaparse';
|
||||
import { readFileRawText } from './rawText';
|
||||
|
||||
/**
|
||||
* read csv to json
|
||||
* @response {
|
||||
* header: string[],
|
||||
* data: string[][]
|
||||
* }
|
||||
*/
|
||||
export const readCsvContent = async ({ file }: { file: File }) => {
|
||||
try {
|
||||
const { rawText: textArr } = await readFileRawText(file);
|
||||
const csvArr = Papa.parse(textArr).data as string[][];
|
||||
if (csvArr.length === 0) {
|
||||
throw new Error('csv 解析失败');
|
||||
}
|
||||
|
||||
const header = csvArr.shift() as string[];
|
||||
|
||||
// add title to data
|
||||
const rawText = csvArr
|
||||
.map((item) =>
|
||||
item.map((value, index) => {
|
||||
if (!header[index]) return value;
|
||||
return `${header[index]}: ${value}`;
|
||||
})
|
||||
)
|
||||
.flat()
|
||||
.join('\n');
|
||||
|
||||
return {
|
||||
rawText,
|
||||
header,
|
||||
data: csvArr.map((item) => item)
|
||||
};
|
||||
} catch (error) {
|
||||
return Promise.reject('解析 csv 文件失败');
|
||||
}
|
||||
};
|
@@ -1,21 +0,0 @@
|
||||
import { htmlStr2Md } from '../../string/markdown';
|
||||
import { readFileRawText } from './rawText';
|
||||
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
|
||||
|
||||
export const readHtmlFile = async ({
|
||||
file,
|
||||
uploadImgController
|
||||
}: {
|
||||
file: File;
|
||||
uploadImgController?: (base64: string) => Promise<string>;
|
||||
}) => {
|
||||
const { rawText } = await readFileRawText(file);
|
||||
const md = htmlStr2Md(rawText);
|
||||
|
||||
const simpleMd = await markdownProcess({
|
||||
rawText: md,
|
||||
uploadImgController
|
||||
});
|
||||
|
||||
return { rawText: simpleMd };
|
||||
};
|
@@ -1,49 +0,0 @@
|
||||
import { loadFile2Buffer } from '../utils';
|
||||
import { readCsvContent } from './csv';
|
||||
import { readHtmlFile } from './html';
|
||||
import { readMdFile } from './md';
|
||||
import { readPdfFile } from './pdf';
|
||||
import { readFileRawText } from './rawText';
|
||||
import { readWordFile } from './word';
|
||||
|
||||
export const readFileRawContent = async ({
|
||||
file,
|
||||
uploadBase64Controller
|
||||
}: {
|
||||
file: File;
|
||||
uploadBase64Controller?: (base64: string) => Promise<string>;
|
||||
}): Promise<{
|
||||
rawText: string;
|
||||
}> => {
|
||||
const extension = file?.name?.split('.')?.pop()?.toLowerCase();
|
||||
|
||||
switch (extension) {
|
||||
case 'txt':
|
||||
return readFileRawText(file);
|
||||
case 'md':
|
||||
return readMdFile({
|
||||
file,
|
||||
uploadImgController: uploadBase64Controller
|
||||
});
|
||||
case 'html':
|
||||
return readHtmlFile({
|
||||
file,
|
||||
uploadImgController: uploadBase64Controller
|
||||
});
|
||||
case 'csv':
|
||||
return readCsvContent({ file });
|
||||
case 'pdf':
|
||||
const pdf = await loadFile2Buffer({ file });
|
||||
return readPdfFile({ pdf });
|
||||
case 'docx':
|
||||
return readWordFile({
|
||||
file,
|
||||
uploadImgController: uploadBase64Controller
|
||||
});
|
||||
|
||||
default:
|
||||
return {
|
||||
rawText: ''
|
||||
};
|
||||
}
|
||||
};
|
@@ -1,17 +0,0 @@
|
||||
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
|
||||
import { readFileRawText } from './rawText';
|
||||
|
||||
export const readMdFile = async ({
|
||||
file,
|
||||
uploadImgController
|
||||
}: {
|
||||
file: File;
|
||||
uploadImgController?: (base64: string) => Promise<string>;
|
||||
}) => {
|
||||
const { rawText: md } = await readFileRawText(file);
|
||||
const simpleMd = await markdownProcess({
|
||||
rawText: md,
|
||||
uploadImgController
|
||||
});
|
||||
return { rawText: simpleMd };
|
||||
};
|
@@ -1,36 +0,0 @@
|
||||
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
|
||||
|
||||
/**
|
||||
* read file raw text
|
||||
*/
|
||||
export const readFileRawText = (file: File) => {
|
||||
return new Promise<{ rawText: string }>((resolve, reject) => {
|
||||
try {
|
||||
const reader = new FileReader();
|
||||
reader.onload = () => {
|
||||
//@ts-ignore
|
||||
const encode = detectFileEncoding(reader.result);
|
||||
|
||||
// Read the file again, this time using the detected encoding
|
||||
const reader2 = new FileReader();
|
||||
reader2.onload = () => {
|
||||
resolve({
|
||||
rawText: reader2.result as string
|
||||
});
|
||||
};
|
||||
reader2.onerror = (err) => {
|
||||
console.log('Error reading file with detected encoding:', err);
|
||||
reject('Read file error with detected encoding');
|
||||
};
|
||||
reader2.readAsText(file, encode);
|
||||
};
|
||||
reader.onerror = (err) => {
|
||||
console.log('error txt read:', err);
|
||||
reject('Read file error');
|
||||
};
|
||||
reader.readAsBinaryString(file);
|
||||
} catch (error) {
|
||||
reject(error);
|
||||
}
|
||||
});
|
||||
};
|
@@ -1,28 +0,0 @@
|
||||
import { markdownProcess } from '@fastgpt/global/common/string/markdown';
|
||||
import { htmlStr2Md } from '../../string/markdown';
|
||||
import { loadFile2Buffer } from '../utils';
|
||||
import mammoth from 'mammoth';
|
||||
|
||||
export const readWordFile = async ({
|
||||
file,
|
||||
uploadImgController
|
||||
}: {
|
||||
file: File;
|
||||
uploadImgController?: (base64: string) => Promise<string>;
|
||||
}) => {
|
||||
const buffer = await loadFile2Buffer({ file });
|
||||
|
||||
const { value: html } = await mammoth.convertToHtml({
|
||||
arrayBuffer: buffer
|
||||
});
|
||||
const md = htmlStr2Md(html);
|
||||
|
||||
const rawText = await markdownProcess({
|
||||
rawText: md,
|
||||
uploadImgController: uploadImgController
|
||||
});
|
||||
|
||||
return {
|
||||
rawText
|
||||
};
|
||||
};
|
@@ -101,6 +101,7 @@ export const iconPaths = {
|
||||
'core/dataset/mixedRecall': () => import('./icons/core/dataset/mixedRecall.svg'),
|
||||
'core/dataset/modeEmbedding': () => import('./icons/core/dataset/modeEmbedding.svg'),
|
||||
'core/dataset/rerank': () => import('./icons/core/dataset/rerank.svg'),
|
||||
'core/dataset/splitLight': () => import('./icons/core/dataset/splitLight.svg'),
|
||||
'core/dataset/tableCollection': () => import('./icons/core/dataset/tableCollection.svg'),
|
||||
'core/dataset/websiteDataset': () => import('./icons/core/dataset/websiteDataset.svg'),
|
||||
'core/modules/basicNode': () => import('./icons/core/modules/basicNode.svg'),
|
||||
|
@@ -0,0 +1,6 @@
|
||||
<svg t="1711938287623" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg"
|
||||
p-id="5143">
|
||||
<path
|
||||
d="M153.6 153.6h716.8a51.2 51.2 0 0 1 0 102.4H153.6a51.2 51.2 0 1 1 0-102.4z m0 614.4h716.8a51.2 51.2 0 0 1 0 102.4H153.6a51.2 51.2 0 0 1 0-102.4z m0-307.2h131.6352a51.2 51.2 0 1 1 0 102.4H153.6a51.2 51.2 0 0 1 0-102.4z m292.5568 0h131.6864a51.2 51.2 0 0 1 0 102.4H446.1568a51.2 51.2 0 0 1 0-102.4z m292.608 0H870.4a51.2 51.2 0 0 1 0 102.4h-131.6352a51.2 51.2 0 0 1 0-102.4z"
|
||||
p-id="5144"></path>
|
||||
</svg>
|
70
packages/web/components/common/MyDrawer/MyRightDrawer.tsx
Normal file
70
packages/web/components/common/MyDrawer/MyRightDrawer.tsx
Normal file
@@ -0,0 +1,70 @@
|
||||
import React from 'react';
|
||||
import MyIcon from '../Icon';
|
||||
import {
|
||||
Drawer,
|
||||
DrawerBody,
|
||||
DrawerHeader,
|
||||
DrawerOverlay,
|
||||
DrawerContent,
|
||||
DrawerCloseButton,
|
||||
DrawerContentProps,
|
||||
Flex,
|
||||
Image
|
||||
} from '@chakra-ui/react';
|
||||
import { useLoading } from '../../../hooks/useLoading';
|
||||
|
||||
type Props = DrawerContentProps & {
|
||||
onClose: () => void;
|
||||
iconSrc?: string;
|
||||
title?: any;
|
||||
isLoading?: boolean;
|
||||
};
|
||||
|
||||
const MyRightDrawer = ({
|
||||
onClose,
|
||||
iconSrc,
|
||||
title,
|
||||
maxW = ['90vw', '30vw'],
|
||||
children,
|
||||
isLoading,
|
||||
...props
|
||||
}: Props) => {
|
||||
const { Loading } = useLoading();
|
||||
return (
|
||||
<Drawer isOpen placement="right" onClose={onClose}>
|
||||
<DrawerOverlay />
|
||||
<DrawerContent
|
||||
maxW={maxW}
|
||||
{...props}
|
||||
h={'94%'}
|
||||
mt={'2%'}
|
||||
borderLeftRadius={'lg'}
|
||||
overflow={'hidden'}
|
||||
>
|
||||
<DrawerCloseButton />
|
||||
<DrawerHeader>
|
||||
<Flex alignItems={'center'} pr={2}>
|
||||
{iconSrc && (
|
||||
<>
|
||||
{iconSrc.startsWith('/') ? (
|
||||
<Image mr={3} objectFit={'contain'} alt="" src={iconSrc} w={'20px'} />
|
||||
) : (
|
||||
<MyIcon mr={3} name={iconSrc as any} w={'20px'} />
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
{title}
|
||||
</Flex>
|
||||
<DrawerCloseButton zIndex={1} />
|
||||
</DrawerHeader>
|
||||
|
||||
<DrawerBody>
|
||||
{children}
|
||||
<Loading loading={isLoading} fixed={false} />
|
||||
</DrawerBody>
|
||||
</DrawerContent>
|
||||
</Drawer>
|
||||
);
|
||||
};
|
||||
|
||||
export default MyRightDrawer;
|
@@ -2,6 +2,8 @@ import React from 'react';
|
||||
import { Box, Flex, useTheme, Grid, type GridProps } from '@chakra-ui/react';
|
||||
import { useTranslation } from 'next-i18next';
|
||||
import MyTooltip from '../MyTooltip';
|
||||
import { QuestionOutlineIcon } from '@chakra-ui/icons';
|
||||
import QuestionTip from '../MyTooltip/QuestionTip';
|
||||
|
||||
// @ts-ignore
|
||||
interface Props extends GridProps {
|
||||
@@ -36,58 +38,59 @@ const LeftRadio = ({
|
||||
return (
|
||||
<Grid gridGap={[3, 5]} fontSize={['sm', 'md']} {...props}>
|
||||
{list.map((item) => (
|
||||
<MyTooltip key={item.value} label={item.tooltip}>
|
||||
<Flex
|
||||
alignItems={item.desc ? align : 'center'}
|
||||
cursor={'pointer'}
|
||||
userSelect={'none'}
|
||||
px={px}
|
||||
py={py}
|
||||
border={theme.borders.sm}
|
||||
borderWidth={'1px'}
|
||||
borderRadius={'md'}
|
||||
position={'relative'}
|
||||
{...(value === item.value
|
||||
? {
|
||||
borderColor: 'primary.400',
|
||||
bg: activeBg,
|
||||
boxShadow: 'focus'
|
||||
<Flex
|
||||
alignItems={item.desc ? align : 'center'}
|
||||
key={item.value}
|
||||
cursor={'pointer'}
|
||||
userSelect={'none'}
|
||||
px={px}
|
||||
py={py}
|
||||
border={theme.borders.sm}
|
||||
borderWidth={'1px'}
|
||||
borderRadius={'md'}
|
||||
position={'relative'}
|
||||
{...(value === item.value
|
||||
? {
|
||||
borderColor: 'primary.400',
|
||||
bg: activeBg,
|
||||
boxShadow: 'focus'
|
||||
}
|
||||
: {
|
||||
bg: defaultBg,
|
||||
_hover: {
|
||||
borderColor: 'primary.300'
|
||||
}
|
||||
: {
|
||||
bg: defaultBg,
|
||||
_hover: {
|
||||
borderColor: 'primary.300'
|
||||
}
|
||||
})}
|
||||
onClick={() => onChange(item.value)}
|
||||
})}
|
||||
onClick={() => onChange(item.value)}
|
||||
>
|
||||
<Box
|
||||
w={'18px'}
|
||||
h={'18px'}
|
||||
borderWidth={'2.4px'}
|
||||
borderColor={value === item.value ? 'primary.015' : 'transparent'}
|
||||
borderRadius={'50%'}
|
||||
mr={3}
|
||||
>
|
||||
<Box
|
||||
w={'18px'}
|
||||
h={'18px'}
|
||||
borderWidth={'2.4px'}
|
||||
borderColor={value === item.value ? 'primary.015' : 'transparent'}
|
||||
<Flex
|
||||
w={'100%'}
|
||||
h={'100%'}
|
||||
borderWidth={'1px'}
|
||||
borderColor={value === item.value ? 'primary.600' : 'borderColor.high'}
|
||||
bg={value === item.value ? 'primary.1' : 'transparent'}
|
||||
borderRadius={'50%'}
|
||||
mr={3}
|
||||
alignItems={'center'}
|
||||
justifyContent={'center'}
|
||||
>
|
||||
<Flex
|
||||
w={'100%'}
|
||||
h={'100%'}
|
||||
borderWidth={'1px'}
|
||||
borderColor={value === item.value ? 'primary.600' : 'borderColor.high'}
|
||||
bg={value === item.value ? 'primary.1' : 'transparent'}
|
||||
<Box
|
||||
w={'5px'}
|
||||
h={'5px'}
|
||||
borderRadius={'50%'}
|
||||
alignItems={'center'}
|
||||
justifyContent={'center'}
|
||||
>
|
||||
<Box
|
||||
w={'5px'}
|
||||
h={'5px'}
|
||||
borderRadius={'50%'}
|
||||
bg={value === item.value ? 'primary.600' : 'transparent'}
|
||||
></Box>
|
||||
</Flex>
|
||||
</Box>
|
||||
<Box flex={'1 0 0'}>
|
||||
bg={value === item.value ? 'primary.600' : 'transparent'}
|
||||
></Box>
|
||||
</Flex>
|
||||
</Box>
|
||||
<Box flex={'1 0 0'}>
|
||||
<Flex alignItems={'center'}>
|
||||
<Box
|
||||
color={'myGray.900'}
|
||||
fontWeight={item.desc ? '500' : 'normal'}
|
||||
@@ -95,15 +98,16 @@ const LeftRadio = ({
|
||||
>
|
||||
{typeof item.title === 'string' ? t(item.title) : item.title}
|
||||
</Box>
|
||||
{!!item.desc && (
|
||||
<Box fontSize={'xs'} color={'myGray.500'} lineHeight={1.2}>
|
||||
{t(item.desc)}
|
||||
</Box>
|
||||
)}
|
||||
{item?.children}
|
||||
</Box>
|
||||
</Flex>
|
||||
</MyTooltip>
|
||||
{!!item.tooltip && <QuestionTip label={item.tooltip} ml={1} color={'myGray.600'} />}
|
||||
</Flex>
|
||||
{!!item.desc && (
|
||||
<Box fontSize={'xs'} color={'myGray.500'} lineHeight={1.2}>
|
||||
{t(item.desc)}
|
||||
</Box>
|
||||
)}
|
||||
{item?.children}
|
||||
</Box>
|
||||
</Flex>
|
||||
))}
|
||||
</Grid>
|
||||
);
|
||||
|
@@ -12,31 +12,31 @@
|
||||
"@emotion/styled": "^11.11.0",
|
||||
"@fastgpt/global": "workspace:*",
|
||||
"@fingerprintjs/fingerprintjs": "^4.2.1",
|
||||
"@lexical/react": "0.12.6",
|
||||
"@lexical/text": "0.12.6",
|
||||
"@lexical/utils": "0.12.6",
|
||||
"@monaco-editor/react": "^4.6.0",
|
||||
"mammoth": "^1.6.0",
|
||||
"@tanstack/react-query": "^4.24.10",
|
||||
"date-fns": "2.30.0",
|
||||
"dayjs": "^1.11.7",
|
||||
"i18next": "23.10.0",
|
||||
"joplin-turndown-plugin-gfm": "^1.0.12",
|
||||
"lexical": "0.12.6",
|
||||
"lodash": "^4.17.21",
|
||||
"mammoth": "^1.6.0",
|
||||
"next-i18next": "15.2.0",
|
||||
"papaparse": "^5.4.1",
|
||||
"pdfjs-dist": "4.0.269",
|
||||
"react": "18.2.0",
|
||||
"react-day-picker": "^8.7.1",
|
||||
"react-dom": "18.2.0",
|
||||
"react-i18next": "13.5.0",
|
||||
"turndown": "^7.1.2",
|
||||
"lexical": "0.12.6",
|
||||
"@lexical/react": "0.12.6",
|
||||
"papaparse": "^5.4.1",
|
||||
"@lexical/utils": "0.12.6",
|
||||
"@lexical/text": "0.12.6",
|
||||
"date-fns": "2.30.0",
|
||||
"react-day-picker": "^8.7.1",
|
||||
"lodash": "^4.17.21",
|
||||
"@tanstack/react-query": "^4.24.10",
|
||||
"dayjs": "^1.11.7"
|
||||
"turndown": "^7.1.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/lodash": "^4.14.191",
|
||||
"@types/react": "18.2.0",
|
||||
"@types/papaparse": "^5.3.7",
|
||||
"@types/react": "18.2.0",
|
||||
"@types/react-dom": "18.2.0",
|
||||
"@types/turndown": "^5.0.4"
|
||||
}
|
||||
|