Feat: pptx and xlsx loader (#1118)

* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support read pptx

* feat: support xlsx

* fix: rerank docker flie
This commit is contained in:
Archer
2024-04-01 19:01:26 +08:00
committed by GitHub
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions

View File

@@ -40,9 +40,9 @@ export const splitText2Chunks = (props: {
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
// ------ There's no overlap on the top
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([]|!\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([]|\?\s)/g, maxLen: chunkLen * 1.4 },
@@ -56,7 +56,7 @@ export const splitText2Chunks = (props: {
const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
// if use markdown title split, Separate record title title
// if use markdown title split, Separate record title
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
return [
@@ -97,6 +97,7 @@ export const splitText2Chunks = (props: {
.filter((item) => item.text.trim());
};
/* Gets the overlap at the end of a text as the beginning of the next block */
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = checkForbidOverlap(step);
const maxOverlapLen = chunkLen * 0.4;