Mirror of https://github.com/labring/FastGPT.git, synced 2025-08-03 05:19:51 +00:00.
External dataset (#1497)
* perf: read rawText and chunk code
* perf: read raw text
* perf: read rawtext
* perf: token count
* log
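This commit unifies how dataset-import previews read their source content. Instead of branching on each ImportDataSourceEnum value and calling source-specific endpoints (a fileId plus a csvFormat flag), the preview components now map the import source to a generic read type via importType2ReadType and pass a single sourceId, which may be a stored file id (dbFileId), a link, or an external sourceUrl. The hunks below touch the PreviewChunks and PreviewRawText components, the custom-link importer, the link-collection stepper, and the local-file CSV template.

As a reading aid, here is a minimal sketch of the shape such a mapping plausibly takes. The enum members are taken from the diff below (externalFile is an assumption suggested by the commit title and the sourceUrl field); the read-type names and the function body are assumptions, not the verbatim FastGPT source:

// Sketch only: the real mapping lives in '@fastgpt/global/core/dataset/read'.
enum ImportDataSourceEnum {
  fileLocal = 'fileLocal',
  fileLink = 'fileLink',
  fileCustom = 'fileCustom',
  csvTable = 'csvTable',
  externalFile = 'externalFile' // assumed, per the commit title
}

enum DatasetSourceReadTypeEnum {
  fileLocal = 'fileLocal',
  link = 'link',
  externalFile = 'externalFile'
}

// One shared mapping replaces the per-source if/else chains deleted below.
function importType2ReadType(type: ImportDataSourceEnum): DatasetSourceReadTypeEnum {
  switch (type) {
    case ImportDataSourceEnum.fileLocal:
    case ImportDataSourceEnum.csvTable:
      return DatasetSourceReadTypeEnum.fileLocal; // both are stored as uploaded files
    case ImportDataSourceEnum.fileLink:
      return DatasetSourceReadTypeEnum.link; // fetched from a URL
    default:
      return DatasetSourceReadTypeEnum.externalFile;
  }
}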
@@ -10,6 +10,7 @@ import { useToast } from '@fastgpt/web/hooks/useToast';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { useContextSelector } from 'use-context-selector';
 import { DatasetImportContext } from '../Context';
+import { importType2ReadType } from '@fastgpt/global/core/dataset/read';
 
 const PreviewChunks = ({
   previewSource,
@@ -27,19 +28,7 @@ const PreviewChunks = ({
   const { data = [], isLoading } = useQuery(
     ['previewSource'],
     () => {
-      if (
-        importSource === ImportDataSourceEnum.fileLocal ||
-        importSource === ImportDataSourceEnum.csvTable ||
-        importSource === ImportDataSourceEnum.fileLink
-      ) {
-        return getPreviewChunks({
-          type: importSource,
-          sourceId: previewSource.dbFileId || previewSource.link || '',
-          chunkSize,
-          overlapRatio: chunkOverlapRatio,
-          customSplitChar: processParamsForm.getValues('customSplitChar')
-        });
-      } else if (importSource === ImportDataSourceEnum.fileCustom) {
+      if (importSource === ImportDataSourceEnum.fileCustom) {
         const customSplitChar = processParamsForm.getValues('customSplitChar');
         const { chunks } = splitText2Chunks({
           text: previewSource.rawText || '',
@@ -52,7 +41,27 @@ const PreviewChunks = ({
           a: ''
         }));
       }
-      return [];
+      if (importSource === ImportDataSourceEnum.csvTable) {
+        return getPreviewChunks({
+          type: importType2ReadType(importSource),
+          sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
+          chunkSize,
+          overlapRatio: chunkOverlapRatio,
+          customSplitChar: processParamsForm.getValues('customSplitChar'),
+          selector: processParamsForm.getValues('webSelector'),
+          isQAImport: true
+        });
+      }
+
+      return getPreviewChunks({
+        type: importType2ReadType(importSource),
+        sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
+        chunkSize,
+        overlapRatio: chunkOverlapRatio,
+        customSplitChar: processParamsForm.getValues('customSplitChar'),
+        selector: processParamsForm.getValues('webSelector'),
+        isQAImport: false
+      });
     },
     {
       onError(err) {
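Two things stand out in the PreviewChunks rewrite: fileCustom text is still chunked client-side with splitText2Chunks, while every other source, including csvTable (now flagged isQAImport: true), goes through a single getPreviewChunks call keyed by importType2ReadType. To make the chunking parameters concrete, here is a deliberately naive, illustrative chunker; FastGPT's real splitText2Chunks is structure-aware, so treat this purely as a reading aid:

// Illustrative only: shows how chunkSize, overlapRatio, and customSplitChar interact.
function naiveSplit(
  text: string,
  chunkSize: number,
  overlapRatio: number,
  customSplitChar?: string
): string[] {
  // A custom split character takes priority over size-based splitting.
  if (customSplitChar) {
    return text.split(customSplitChar).filter((s) => s.trim().length > 0);
  }
  const overlap = Math.floor(chunkSize * overlapRatio); // chars shared by neighboring chunks
  const step = Math.max(1, chunkSize - overlap);
  const chunks: string[] = [];
  for (let start = 0; start < text.length; start += step) {
    chunks.push(text.slice(start, start + chunkSize));
    if (start + chunkSize >= text.length) break; // final chunk reached the end
  }
  return chunks;
}

With chunkSize = 500 and overlapRatio = 0.2, consecutive chunks share roughly 100 characters, which helps retrieval when an answer straddles a chunk boundary.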
@@ -9,6 +9,7 @@ import { useToast } from '@fastgpt/web/hooks/useToast';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { useContextSelector } from 'use-context-selector';
 import { DatasetImportContext } from '../Context';
+import { importType2ReadType } from '@fastgpt/global/core/dataset/read';
 
 const PreviewRawText = ({
   previewSource,
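PreviewRawText picks up the same shared import; the substantive changes follow in its query body.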
@@ -18,32 +19,30 @@ const PreviewRawText = ({
   onClose: () => void;
 }) => {
   const { toast } = useToast();
-  const { importSource } = useContextSelector(DatasetImportContext, (v) => v);
+  const { importSource, processParamsForm } = useContextSelector(DatasetImportContext, (v) => v);
 
   const { data, isLoading } = useQuery(
-    ['previewSource', previewSource?.dbFileId],
+    ['previewSource', previewSource.dbFileId, previewSource.link, previewSource.sourceUrl],
     () => {
-      if (importSource === ImportDataSourceEnum.fileLocal && previewSource.dbFileId) {
-        return getPreviewFileContent({
-          fileId: previewSource.dbFileId,
-          csvFormat: true
-        });
+      if (importSource === ImportDataSourceEnum.fileCustom && previewSource.rawText) {
+        return {
+          previewContent: previewSource.rawText.slice(0, 3000)
+        };
       }
       if (importSource === ImportDataSourceEnum.csvTable && previewSource.dbFileId) {
         return getPreviewFileContent({
-          fileId: previewSource.dbFileId,
-          csvFormat: false
+          type: importType2ReadType(importSource),
+          sourceId: previewSource.dbFileId,
+          isQAImport: true
         });
       }
-      if (importSource === ImportDataSourceEnum.fileCustom) {
-        return {
-          previewContent: (previewSource.rawText || '').slice(0, 3000)
-        };
-      }
 
-      return {
-        previewContent: ''
-      };
+      return getPreviewFileContent({
+        type: importType2ReadType(importSource),
+        sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
+        isQAImport: false,
+        selector: processParamsForm.getValues('webSelector')
+      });
     },
     {
       onError(err) {
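The net effect is that getPreviewFileContent now accepts one uniform payload for every source, and the react-query cache key includes link and sourceUrl as well as dbFileId, so switching between sources reliably invalidates the cached preview. Here is a sketch of the parameter shape implied by the calls above; the field names come from the diff, but the type declaration itself is my own annotation rather than FastGPT's:

// Assumed shape; field names are taken from the calls in the diff above.
type PreviewFileContentParams = {
  type: string; // result of importType2ReadType(importSource)
  sourceId: string; // dbFileId, link, or sourceUrl, whichever the source provides
  isQAImport: boolean; // true only on the csvTable (Q&A) path
  selector?: string; // optional CSS selector applied when reading web pages
};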
@@ -162,7 +162,7 @@ const CustomLinkInput = () => {
           {commonT('Add new')}
         </Button>
         <Button
-          isDisabled={list.length === 0}
+          isDisabled={list.filter((item) => !!item.sourceUrl).length === 0}
           onClick={handleSubmit((data) => {
             setSources(
               data.list
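The submit button for custom links is now disabled only when no row has a non-empty sourceUrl, so freshly added blank rows no longer count as submittable input.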
@@ -23,7 +23,7 @@ const LinkCollection = () => {
   return (
     <>
       {activeStep === 0 && <CustomLinkImport />}
-      {activeStep === 1 && <DataProcess showPreviewChunks={false} />}
+      {activeStep === 1 && <DataProcess showPreviewChunks />}
       {activeStep === 2 && <Upload />}
     </>
   );
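Because link sources can now be read through the shared preview API, the link-collection flow re-enables chunk preview (showPreviewChunks) at the data-processing step.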
@@ -29,7 +29,8 @@ const FileLocal = () => {
 
 export default React.memo(FileLocal);
 
-const csvTemplate = `"第一列内容","第二列内容"
+const csvTemplate = `index,content
+"第一列内容","第二列内容"
 "必填列","可选列。CSV 中请注意内容不能包含双引号,双引号是列分割符号"
 "只会将第一和第二列内容导入,其余列会被忽略",""
 "结合人工智能的演进历程,AIGC的发展大致可以分为三个阶段,即:早期萌芽阶段(20世纪50年代至90年代中期)、沉淀积累阶段(20世纪90年代中期至21世纪10年代中期),以及快速发展阶段(21世纪10年代中期至今)。",""