External dataset (#1497)

* perf: read rawText and chunk code

* perf: read raw text

* perf: read rawtext

* perf: token count

* log
This commit is contained in:
Archer
2024-05-16 11:47:53 +08:00
committed by GitHub
parent d5073f98ab
commit c6d9b15897
36 changed files with 531 additions and 267 deletions

View File

@@ -0,0 +1,99 @@
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
import { parseCsvTable2Chunks } from './training/utils';
import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readFileRawContent } from '../../common/file/read/utils';
export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; url: string }) => {
const response = await axios({
method: 'get',
url: url,
responseType: 'arraybuffer'
});
const extension = url.split('.')?.pop()?.toLowerCase() || '';
const buffer = Buffer.from(response.data, 'binary');
const { rawText } = await readFileRawContent({
extension,
teamId,
buffer,
encoding: 'utf-8'
});
return rawText;
};
/*
fileId - local file, read from mongo
link - request
externalFile = request read
*/
export const readDatasetSourceRawText = async ({
teamId,
type,
sourceId,
isQAImport,
selector
}: {
teamId: string;
type: DatasetSourceReadTypeEnum;
sourceId: string;
isQAImport?: boolean;
selector?: string;
}): Promise<string> => {
if (type === DatasetSourceReadTypeEnum.fileLocal) {
const { rawText } = await readFileContentFromMongo({
teamId,
bucketName: BucketNameEnum.dataset,
fileId: sourceId,
isQAImport
});
return rawText;
} else if (type === DatasetSourceReadTypeEnum.link) {
const result = await urlsFetch({
urlList: [sourceId],
selector
});
return result[0]?.content || '';
} else if (type === DatasetSourceReadTypeEnum.externalFile) {
const rawText = await readFileRawTextByUrl({
teamId,
url: sourceId
});
return rawText;
}
return '';
};
export const rawText2Chunks = ({
rawText,
isQAImport,
chunkLen = 512,
...splitProps
}: {
rawText: string;
isQAImport?: boolean;
} & TextSplitProps) => {
if (isQAImport) {
const { chunks } = parseCsvTable2Chunks(rawText);
return chunks;
}
const { chunks } = splitText2Chunks({
text: rawText,
chunkLen,
...splitProps
});
return chunks.map((item) => ({
q: item,
a: ''
}));
};

View File

@@ -71,7 +71,7 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise<H
chatId,
responseChatItemId,
...variables,
histories: histories.slice(-10),
histories: histories?.slice(-10) || [],
...body,
...dynamicInput
};

View File

@@ -62,7 +62,10 @@ export const valueTypeFormat = (value: any, type?: WorkflowIOValueTypeEnum) => {
return JSON.stringify(value);
}
if (type === 'number') return Number(value);
if (type === 'boolean') return value === 'true' ? true : false;
if (type === 'boolean') {
if (typeof value === 'string') return value === 'true';
return Boolean(value);
}
try {
if (type === WorkflowIOValueTypeEnum.datasetQuote && !Array.isArray(value)) {
return JSON.parse(value);