External dataset (#1497)

* perf: read rawText and chunk code

* perf: read raw text

* perf: read rawtext

* perf: token count

* log
This commit is contained in:
Archer
2024-05-16 11:47:53 +08:00
committed by GitHub
parent d5073f98ab
commit c6d9b15897
36 changed files with 531 additions and 267 deletions

View File

@@ -9,6 +9,9 @@ type SplitProps = {
overlapRatio?: number;
customReg?: string[];
};
/**
 * Splitter options for callers that do not pass the text itself:
 * `text` and `chunkLen` are removed from `SplitProps`, and `chunkLen` is
 * added back as an optional number.
 */
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
chunkLen?: number;
};
type SplitResponse = {
chunks: string[];
@@ -49,6 +52,7 @@ const strIsMdTable = (str: string) => {
return false;
}
}
return true;
};
const markdownTableSplit = (props: SplitProps): SplitResponse => {
@@ -77,6 +81,10 @@ ${mdSplitString}
chunk += `${splitText2Lines[i]}\n`;
}
if (chunk) {
chunks.push(chunk);
}
return {
chunks,
chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0)

View File

@@ -66,6 +66,8 @@ export type SystemEnvType = {
vectorMaxProcess: number;
qaMaxProcess: number;
pgHNSWEfSearch: number;
tokenWorkers: number; // token count max worker
oneapiUrl?: string;
chatApiKey?: string;
};

View File

@@ -170,3 +170,10 @@ export const SearchScoreTypeMap = {
// Icon keys for collections (presumably resolved by an icon component — TODO confirm).
// NOTE(review): both constants hold the same value 'common/linkBlue'; confirm that the
// custom-collection icon is intentionally identical to the link-collection icon.
export const CustomCollectionIcon = 'common/linkBlue';
export const LinkCollectionIcon = 'common/linkBlue';
/* source prefix */
/**
 * String enum of dataset source read types: `fileLocal`, `link` and
 * `externalFile`. Every member's runtime value is identical to its name.
 *
 * NOTE(review): presumably selects how a source's raw text is fetched
 * (local file, URL, external file) — confirm against the callers.
 */
export enum DatasetSourceReadTypeEnum {
fileLocal = 'fileLocal',
link = 'link',
externalFile = 'externalFile'
}

View File

@@ -0,0 +1,16 @@
import { DatasetSourceReadTypeEnum, ImportDataSourceEnum } from './constants';
// NOTE(review): looks like a CSV header row (`index,content`) used to recognise
// raw-text backup content — confirm against the code that reads or writes it.
export const rawTextBackupPrefix = 'index,content';
/**
 * Translate an import data source into the read type used to load it.
 *
 * - `csvTable` and `fileLocal` map to `DatasetSourceReadTypeEnum.fileLocal`
 * - `fileLink` maps to `DatasetSourceReadTypeEnum.link`
 * - `externalFile` maps to `DatasetSourceReadTypeEnum.externalFile`
 * - any other value falls back to `DatasetSourceReadTypeEnum.link`
 *
 * @param type - the import data source to translate
 * @returns the matching read type, or `link` when nothing matches
 */
export const importType2ReadType = (type: ImportDataSourceEnum) => {
  switch (type) {
    case ImportDataSourceEnum.csvTable:
    case ImportDataSourceEnum.fileLocal:
      return DatasetSourceReadTypeEnum.fileLocal;
    case ImportDataSourceEnum.fileLink:
      return DatasetSourceReadTypeEnum.link;
    case ImportDataSourceEnum.externalFile:
      return DatasetSourceReadTypeEnum.externalFile;
    default:
      // Unrecognised sources are read the same way as links.
      return DatasetSourceReadTypeEnum.link;
  }
};