External dataset (#1497)

* perf: read rawText and chunk code

* perf: read raw text

* perf: read rawtext

* perf: token count

* log
Author: Archer
Date: 2024-05-16 11:47:53 +08:00 (committed by GitHub)
Parent: d5073f98ab
Commit: c6d9b15897
36 changed files with 531 additions and 267 deletions


@@ -10,6 +10,7 @@ import { useToast } from '@fastgpt/web/hooks/useToast';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { useContextSelector } from 'use-context-selector';
 import { DatasetImportContext } from '../Context';
+import { importType2ReadType } from '@fastgpt/global/core/dataset/read';

 const PreviewChunks = ({
   previewSource,
@@ -27,19 +28,7 @@ const PreviewChunks = ({
   const { data = [], isLoading } = useQuery(
     ['previewSource'],
     () => {
-      if (
-        importSource === ImportDataSourceEnum.fileLocal ||
-        importSource === ImportDataSourceEnum.csvTable ||
-        importSource === ImportDataSourceEnum.fileLink
-      ) {
-        return getPreviewChunks({
-          type: importSource,
-          sourceId: previewSource.dbFileId || previewSource.link || '',
-          chunkSize,
-          overlapRatio: chunkOverlapRatio,
-          customSplitChar: processParamsForm.getValues('customSplitChar')
-        });
-      } else if (importSource === ImportDataSourceEnum.fileCustom) {
+      if (importSource === ImportDataSourceEnum.fileCustom) {
         const customSplitChar = processParamsForm.getValues('customSplitChar');
         const { chunks } = splitText2Chunks({
           text: previewSource.rawText || '',
@@ -52,7 +41,27 @@ const PreviewChunks = ({
           a: ''
         }));
       }
-      return [];
+      if (importSource === ImportDataSourceEnum.csvTable) {
+        return getPreviewChunks({
+          type: importType2ReadType(importSource),
+          sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
+          chunkSize,
+          overlapRatio: chunkOverlapRatio,
+          customSplitChar: processParamsForm.getValues('customSplitChar'),
+          selector: processParamsForm.getValues('webSelector'),
+          isQAImport: true
+        });
+      }
+
+      return getPreviewChunks({
+        type: importType2ReadType(importSource),
+        sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
+        chunkSize,
+        overlapRatio: chunkOverlapRatio,
+        customSplitChar: processParamsForm.getValues('customSplitChar'),
+        selector: processParamsForm.getValues('webSelector'),
+        isQAImport: false
+      });
     },
     {
       onError(err) {
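Note: both preview components in this diff now resolve their source read type through the new importType2ReadType helper imported above. A minimal sketch of what that helper plausibly looks like, with the enum and its values inferred from the call sites in this diff rather than copied from the repo:

// Sketch of @fastgpt/global/core/dataset/read (names are assumptions, not confirmed)
import { ImportDataSourceEnum } from '../dataset/constants'; // hypothetical import path

export enum DatasetSourceReadTypeEnum {
  fileLocal = 'fileLocal',
  link = 'link',
  externalFile = 'externalFile'
}

export const importType2ReadType = (type: ImportDataSourceEnum): DatasetSourceReadTypeEnum => {
  // fileLocal and csvTable are both backed by an uploaded file (dbFileId)
  if (type === ImportDataSourceEnum.fileLocal || type === ImportDataSourceEnum.csvTable) {
    return DatasetSourceReadTypeEnum.fileLocal;
  }
  // fileLink reads a URL, optionally narrowed by the webSelector form value
  if (type === ImportDataSourceEnum.fileLink) {
    return DatasetSourceReadTypeEnum.link;
  }
  // remaining sources (e.g. the external file source this PR adds) read from sourceUrl
  return DatasetSourceReadTypeEnum.externalFile;
};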


@@ -9,6 +9,7 @@ import { useToast } from '@fastgpt/web/hooks/useToast';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { useContextSelector } from 'use-context-selector';
 import { DatasetImportContext } from '../Context';
+import { importType2ReadType } from '@fastgpt/global/core/dataset/read';

 const PreviewRawText = ({
   previewSource,
@@ -18,32 +19,30 @@ const PreviewRawText = ({
   onClose: () => void;
 }) => {
   const { toast } = useToast();
-  const { importSource } = useContextSelector(DatasetImportContext, (v) => v);
+  const { importSource, processParamsForm } = useContextSelector(DatasetImportContext, (v) => v);

   const { data, isLoading } = useQuery(
-    ['previewSource', previewSource?.dbFileId],
+    ['previewSource', previewSource.dbFileId, previewSource.link, previewSource.sourceUrl],
     () => {
-      if (importSource === ImportDataSourceEnum.fileLocal && previewSource.dbFileId) {
-        return getPreviewFileContent({
-          fileId: previewSource.dbFileId,
-          csvFormat: true
-        });
-      }
+      if (importSource === ImportDataSourceEnum.fileCustom && previewSource.rawText) {
+        return {
+          previewContent: previewSource.rawText.slice(0, 3000)
+        };
+      }
       if (importSource === ImportDataSourceEnum.csvTable && previewSource.dbFileId) {
         return getPreviewFileContent({
-          fileId: previewSource.dbFileId,
-          csvFormat: false
+          type: importType2ReadType(importSource),
+          sourceId: previewSource.dbFileId,
+          isQAImport: true
         });
       }
-      if (importSource === ImportDataSourceEnum.fileCustom) {
-        return {
-          previewContent: (previewSource.rawText || '').slice(0, 3000)
-        };
-      }
-      return {
-        previewContent: ''
-      };
+      return getPreviewFileContent({
+        type: importType2ReadType(importSource),
+        sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
+        isQAImport: false,
+        selector: processParamsForm.getValues('webSelector')
+      });
     },
     {
       onError(err) {
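Note: getPreviewChunks and getPreviewFileContent now take the same source-addressing fields. A hypothetical reconstruction of their request shapes, inferred only from the call sites in this diff:

// Hypothetical request types, reconstructed from the calls above (not the repo's definitions)
type PreviewFileContentParams = {
  type: DatasetSourceReadTypeEnum; // result of importType2ReadType(importSource)
  sourceId: string; // dbFileId for uploads, link or sourceUrl otherwise
  isQAImport: boolean; // true only on the csvTable path
  selector?: string; // optional CSS selector applied when reading a web page
};

type PreviewChunksParams = PreviewFileContentParams & {
  chunkSize: number;
  overlapRatio: number; // chunkOverlapRatio from the process params form
  customSplitChar?: string;
};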


@@ -162,7 +162,7 @@ const CustomLinkInput = () => {
           {commonT('Add new')}
         </Button>
         <Button
-          isDisabled={list.length === 0}
+          isDisabled={list.filter((item) => !!item.sourceUrl).length === 0}
           onClick={handleSubmit((data) => {
             setSources(
               data.list
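Note: the submit button now stays disabled until at least one row has a non-empty sourceUrl, instead of only checking that rows exist. If the guard were factored out it could look like this (hypothetical refactor, not part of the diff):

// Hypothetical helper: true when at least one row has a usable URL
const hasValidLink = (list: { sourceUrl?: string }[]): boolean =>
  list.some((item) => !!item.sourceUrl);

// usage in the button: isDisabled={!hasValidLink(list)}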


@@ -23,7 +23,7 @@ const LinkCollection = () => {
   return (
     <>
       {activeStep === 0 && <CustomLinkImport />}
-      {activeStep === 1 && <DataProcess showPreviewChunks={false} />}
+      {activeStep === 1 && <DataProcess showPreviewChunks />}
       {activeStep === 2 && <Upload />}
     </>
   );


@@ -29,7 +29,8 @@ const FileLocal = () => {
 export default React.memo(FileLocal);

-const csvTemplate = `"第一列内容","第二列内容"
+const csvTemplate = `index,content
+"第一列内容","第二列内容"
 "必填列","可选列。CSV 中请注意内容不能包含双引号,双引号是列分割符号"
 "只会将第一和第二列内容导入,其余列会被忽略",""
 "结合人工智能的演进历程,AIGC的发展大致可以分为三个阶段即:早期萌芽阶段(20世纪50年代至90年代中期)、沉淀积累阶段(20世纪90年代中期至21世纪10年代中期),以及快速发展阶段(21世纪10年代中期至今)。",""