Files
FastGPT/packages/service/worker/readFile/parseOffice.ts
Archer 10d8c56e23 V4.8.18 feature (#3565)
* feat: org CRUD (#3380)

* feat: add org schema

* feat: org manage UI

* feat: OrgInfoModal

* feat: org tree view

* feat: org management

* fix: init root org

* feat: org permission for app

* feat: org support for dataset

* fix: disable org role control

* styles: opt type signatures

* fix: remove unused permission

* feat: delete org collaborator

* perf: Team org ui (#3499)

* perf: org ui

* perf: org ui

* feat: org auth for app & dataset (#3498)

* feat: auth org resource permission

* feat: org auth support for app & dataset

* perf: org permission check (#3500)

* i18n (#3501)

* name

* i18n

* feat: support dataset changeOwner (#3483)

* feat: support dataset changeOwner

* chore: update dataset change owner api

* feat: permission manage UI for org (#3503)

* perf: password check;perf: image upload check;perf: sso login check (#3509)

* perf: password check

* perf: image upload check

* perf: sso login check

* force show update notification modal & fix login page text (#3512)

* fix login page English text

* update notification modal

* perf: notify account (#3515)

* perf(plugin): improve searXNG empty result handling and documentation (#3507)

* perf(plugin): improve searXNG empty result handling and documentation

* 修改了文档和代码部分无搜索的结果的反馈

* refactor: org pathId (#3516)

* optimize payment process (#3517)

* feat: support wecom sso (#3518)

* feat: support wecom sso

* chore: remove unused wecom js-sdk dependency

* fix qrcode script (#3520)

* fix qrcode script

* i18n

* perf: full text collection and search code;perf: rename function (#3519)

* perf: full text collection and search code

* perf: rename function

* perf: notify modal

* remove invalid code

* perf: sso login

* perf: pay process

* 4.8.18 test (#3524)

* perf: remove local token

* perf: index

* perf: file encoding;perf: leave team code;@c121914yu perf: full text search code (#3528)

* perf: text encoding

* perf: leave team code

* perf: full text search code

* fix: http status

* perf: embedding search and vector avatar

* perf: async read file (#3531)

* refactor: team permission  manager (#3535)

* perf: classify org, group and member

* refactor: team per manager

* fix: missing functions

* 4.8.18 test (#3543)

* perf: login check

* doc

* perf: llm model config

* perf: team clb config

* fix: MemberModal UI (#3553)

* fix: adapt MemberModal title and icon

* fix: adapt member modal

* fix: search input placeholder

* fix: add button text

* perf: org permission (#3556)

* docs:用户答疑的官方文档补充 (#3540)

* docs:用户答疑的官方文档补充

* 问题回答的内容修补

* share link random avatar (#3541)

* share link random avatar

* fix

* delete unused code

* share page avatar (#3558)

* feat: init 4818

* share page avatar

* feat: tmp upgrade code (#3559)

* feat: tmp upgrade code

* fulltext search test

* update action

* full text tmp code (#3561)

* full text tmp code

* fix: init

* fix: init

* remove tmp code

* remove tmp code

* 4818-alpha

* 4.8.18 test (#3562)

* full text tmp code

* fix: init

* upgrade code

* account log

* account log

* perf: dockerfile

* upgrade code

* chore: update docs app template submission (#3564)

---------

Co-authored-by: a.e. <49438478+I-Info@users.noreply.github.com>
Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com>
Co-authored-by: heheer <heheer@sealos.io>
Co-authored-by: Jiangween <145003935+Jiangween@users.noreply.github.com>
2025-01-11 15:15:38 +08:00

132 lines
3.8 KiB
TypeScript

import { getNanoid } from '@fastgpt/global/common/string/tools';
import fs from 'fs';
import decompress from 'decompress';
import { DOMParser } from '@xmldom/xmldom';
import { clearDirFiles } from '../../common/file/utils';
import { addLog } from '../../common/system/log';
const DEFAULTDECOMPRESSSUBLOCATION = '/tmp';
function getNewFileName(ext: string) {
return `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}.${ext}`;
}
const parseString = (xml: string) => {
let parser = new DOMParser();
return parser.parseFromString(xml, 'text/xml');
};
const parsePowerPoint = async ({
filepath,
decompressPath,
encoding
}: {
filepath: string;
decompressPath: string;
encoding: BufferEncoding;
}) => {
// Files regex that hold our content of interest
const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g;
const slidesRegex = /ppt\/slides\/slide\d+.xml/g;
/** The decompress location which contains the filename in it */
const files = await decompress(filepath, decompressPath, {
filter: (x) => !!x.path.match(allFilesRegex)
});
// Verify if atleast the slides xml files exist in the extracted files list.
if (
files.length == 0 ||
!files.map((file) => file.path).some((filename) => filename.match(slidesRegex))
) {
return Promise.reject('解析 PPT 失败');
}
// Returning an array of all the xml contents read using fs.readFileSync
const xmlContentArray = await Promise.all(
files.map((file) => {
try {
return fs.promises.readFile(`${decompressPath}/${file.path}`, encoding);
} catch (err) {
return fs.promises.readFile(`${decompressPath}/${file.path}`, 'utf-8');
}
})
);
let responseArr: string[] = [];
xmlContentArray.forEach((xmlContent) => {
/** Find text nodes with a:p tags */
const xmlParagraphNodesList = parseString(xmlContent).getElementsByTagName('a:p');
/** Store all the text content to respond */
responseArr.push(
Array.from(xmlParagraphNodesList)
// Filter paragraph nodes than do not have any text nodes which are identifiable by a:t tag
.filter((paragraphNode) => paragraphNode.getElementsByTagName('a:t').length != 0)
.map((paragraphNode) => {
/** Find text nodes with a:t tags */
const xmlTextNodeList = paragraphNode.getElementsByTagName('a:t');
return Array.from(xmlTextNodeList)
.filter((textNode) => textNode.childNodes[0] && textNode.childNodes[0].nodeValue)
.map((textNode) => textNode.childNodes[0].nodeValue)
.join('');
})
.join('\n')
);
});
return responseArr.join('\n');
};
export const parseOffice = async ({
buffer,
encoding,
extension
}: {
buffer: Buffer;
encoding: BufferEncoding;
extension: string;
}) => {
// Prepare file for processing
// create temp file subdirectory if it does not exist
if (!fs.existsSync(DEFAULTDECOMPRESSSUBLOCATION)) {
fs.mkdirSync(DEFAULTDECOMPRESSSUBLOCATION, { recursive: true });
}
// temp file name
const filepath = getNewFileName(extension);
const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/${getNanoid()}`;
// const decompressPath = `${DEFAULTDECOMPRESSSUBLOCATION}/test`;
// write new file
try {
fs.writeFileSync(filepath, buffer, {
encoding
});
} catch (err) {
fs.writeFileSync(filepath, buffer, {
encoding: 'utf-8'
});
}
const text = await (async () => {
try {
switch (extension) {
case 'pptx':
return parsePowerPoint({ filepath, decompressPath, encoding });
default:
return Promise.reject('只能读取 .pptx 文件');
}
} catch (error) {
addLog.error(`Load ppt error`, { error });
}
return '';
})();
fs.unlinkSync(filepath);
clearDirFiles(decompressPath);
return text;
};