mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-23 05:12:39 +00:00
External dataset (#1497)
* perf: read rawText and chunk code
* perf: read raw text
* perf: read rawtext
* perf: token count
* log
This commit is contained in:
@@ -9,6 +9,9 @@ type SplitProps = {
|
||||
overlapRatio?: number;
|
||||
customReg?: string[];
|
||||
};
|
||||
/**
 * Split options with `text` and `chunkLen` removed from `SplitProps`,
 * and `chunkLen` added back as an optional field.
 */
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
|
||||
chunkLen?: number;
|
||||
};
|
||||
|
||||
type SplitResponse = {
|
||||
chunks: string[];
|
||||
@@ -49,6 +52,7 @@ const strIsMdTable = (str: string) => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
const markdownTableSplit = (props: SplitProps): SplitResponse => {
|
||||
@@ -77,6 +81,10 @@ ${mdSplitString}
|
||||
chunk += `${splitText2Lines[i]}\n`;
|
||||
}
|
||||
|
||||
if (chunk) {
|
||||
chunks.push(chunk);
|
||||
}
|
||||
|
||||
return {
|
||||
chunks,
|
||||
chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0)
|
||||
|
@@ -66,6 +66,8 @@ export type SystemEnvType = {
|
||||
vectorMaxProcess: number;
|
||||
qaMaxProcess: number;
|
||||
pgHNSWEfSearch: number;
|
||||
tokenWorkers: number; // token count max worker
|
||||
|
||||
oneapiUrl?: string;
|
||||
chatApiKey?: string;
|
||||
};
|
||||
|
Reference in New Issue
Block a user