mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 13:03:50 +00:00
External dataset (#1497)
* perf: read rawText and chunk code * perf: read raw text * perf: read rawtext * perf: token count * log
This commit is contained in:
@@ -5,6 +5,7 @@ import { addHours } from 'date-fns';
|
||||
|
||||
import { WorkerNameEnum, runWorker } from '../../../worker/utils';
|
||||
import { ReadFileResponse } from '../../../worker/file/type';
|
||||
import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
|
||||
|
||||
export const initMarkdownText = ({
|
||||
teamId,
|
||||
@@ -29,36 +30,44 @@ export const initMarkdownText = ({
|
||||
|
||||
/**
 * Read a file buffer through the `readFile` worker and return its text.
 *
 * Post-processing depends on the file extension:
 * - `md` / `html` / `docx`: the worker's raw text is passed through
 *   `initMarkdownText` together with `teamId` and `metadata`.
 * - `csv` / `xlsx`: returns the worker's `rawText` when `isQAImport` is set,
 *   otherwise the worker's `formatText`; a missing value becomes `''`.
 * - anything else: the worker's `rawText` is returned unchanged.
 *
 * @param extension - File extension without the dot (e.g. `'md'`, `'csv'`).
 * @param csvFormat - Forwarded to the worker as-is.
 *   NOTE(review): the diff this block comes from appears to be retiring this
 *   flag in favour of `formatText`; it is kept so existing callers still
 *   type-check — confirm the worker still reads it before removing.
 * @param isQAImport - For csv/xlsx, select the unformatted `rawText` instead
 *   of `formatText`.
 * @param teamId - Forwarded to `initMarkdownText`.
 * @param buffer - File content handed to the worker.
 * @param encoding - Text encoding handed to the worker.
 * @param metadata - Forwarded to `initMarkdownText`.
 * @returns An object holding the resulting `rawText`.
 */
export const readFileRawContent = async ({
  extension,
  csvFormat,
  isQAImport,
  teamId,
  buffer,
  encoding,
  metadata
}: {
  csvFormat?: boolean;
  isQAImport?: boolean;
  extension: string;
  teamId: string;
  buffer: Buffer;
  encoding: string;
  metadata?: Record<string, any>;
}) => {
  const { rawText: workerRawText, formatText } = await runWorker<ReadFileResponse>(
    WorkerNameEnum.readFile,
    {
      extension,
      csvFormat,
      encoding,
      buffer
    }
  );

  // markdown data format
  if (['md', 'html', 'docx'].includes(extension)) {
    const rawText = await initMarkdownText({
      teamId: teamId,
      md: workerRawText,
      metadata: metadata
    });
    return { rawText };
  }

  // table data: QA imports use the unformatted text, everything else the formatted one
  if (['csv', 'xlsx'].includes(extension)) {
    const rawText = (isQAImport ? workerRawText : formatText) || '';
    return { rawText };
  }

  return { rawText: workerRawText };
};
|
||||
|
||||
export const htmlToMarkdown = async (html?: string | null) => {
|
||||
|
Reference in New Issue
Block a user