External dataset (#1497)

* perf: read rawText and chunk code

* perf: read raw text

* perf: read rawtext

* perf: token count

* log
This commit is contained in:
Archer
2024-05-16 11:47:53 +08:00
committed by GitHub
parent d5073f98ab
commit c6d9b15897
36 changed files with 531 additions and 267 deletions

View File

@@ -9,6 +9,9 @@ type SplitProps = {
overlapRatio?: number;
customReg?: string[];
};
/**
 * Exported variant of `SplitProps` for callers that do not supply `text`
 * and may leave `chunkLen` unset.
 *
 * Built by omitting the `text` and `chunkLen` keys from `SplitProps` and
 * re-adding `chunkLen` as an optional `number`; every other `SplitProps`
 * field is carried over unchanged.
 */
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
chunkLen?: number;
};
type SplitResponse = {
chunks: string[];
@@ -49,6 +52,7 @@ const strIsMdTable = (str: string) => {
return false;
}
}
return true;
};
const markdownTableSplit = (props: SplitProps): SplitResponse => {
@@ -77,6 +81,10 @@ ${mdSplitString}
chunk += `${splitText2Lines[i]}\n`;
}
if (chunk) {
chunks.push(chunk);
}
return {
chunks,
chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0)