mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-24 22:03:54 +00:00
External dataset (#1497)
* perf: read rawText and chunk code * perf: read raw text * perf: read rawtext * perf: token count * log
This commit is contained in:
@@ -9,6 +9,9 @@ type SplitProps = {
|
||||
overlapRatio?: number;
|
||||
customReg?: string[];
|
||||
};
|
||||
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
|
||||
chunkLen?: number;
|
||||
};
|
||||
|
||||
type SplitResponse = {
|
||||
chunks: string[];
|
||||
@@ -49,6 +52,7 @@ const strIsMdTable = (str: string) => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
const markdownTableSplit = (props: SplitProps): SplitResponse => {
|
||||
@@ -77,6 +81,10 @@ ${mdSplitString}
|
||||
chunk += `${splitText2Lines[i]}\n`;
|
||||
}
|
||||
|
||||
if (chunk) {
|
||||
chunks.push(chunk);
|
||||
}
|
||||
|
||||
return {
|
||||
chunks,
|
||||
chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0)
|
||||
|
@@ -66,6 +66,8 @@ export type SystemEnvType = {
|
||||
vectorMaxProcess: number;
|
||||
qaMaxProcess: number;
|
||||
pgHNSWEfSearch: number;
|
||||
tokenWorkers: number; // token count max worker
|
||||
|
||||
oneapiUrl?: string;
|
||||
chatApiKey?: string;
|
||||
};
|
||||
|
@@ -170,3 +170,10 @@ export const SearchScoreTypeMap = {
|
||||
|
||||
export const CustomCollectionIcon = 'common/linkBlue';
|
||||
export const LinkCollectionIcon = 'common/linkBlue';
|
||||
|
||||
/* source prefix */
|
||||
export enum DatasetSourceReadTypeEnum {
|
||||
fileLocal = 'fileLocal',
|
||||
link = 'link',
|
||||
externalFile = 'externalFile'
|
||||
}
|
||||
|
16
packages/global/core/dataset/read.ts
Normal file
16
packages/global/core/dataset/read.ts
Normal file
@@ -0,0 +1,16 @@
|
||||
import { DatasetSourceReadTypeEnum, ImportDataSourceEnum } from './constants';
|
||||
|
||||
export const rawTextBackupPrefix = 'index,content';
|
||||
|
||||
export const importType2ReadType = (type: ImportDataSourceEnum) => {
|
||||
if (type === ImportDataSourceEnum.csvTable || type === ImportDataSourceEnum.fileLocal) {
|
||||
return DatasetSourceReadTypeEnum.fileLocal;
|
||||
}
|
||||
if (type === ImportDataSourceEnum.fileLink) {
|
||||
return DatasetSourceReadTypeEnum.link;
|
||||
}
|
||||
if (type === ImportDataSourceEnum.externalFile) {
|
||||
return DatasetSourceReadTypeEnum.externalFile;
|
||||
}
|
||||
return DatasetSourceReadTypeEnum.link;
|
||||
};
|
Reference in New Issue
Block a user