Feat: pptx and xlsx loader (#1118)

* perf: plan tip

* perf: upload size controller

* feat: add image ttl index

* feat: new upload file ux

* remove file

* feat: support read pptx

* feat: support xlsx

* fix: rerank docker flie
This commit is contained in:
Archer
2024-04-01 19:01:26 +08:00
committed by GitHub
parent f9d266a6af
commit 21288d1736
90 changed files with 2707 additions and 1678 deletions

View File

@@ -3,12 +3,17 @@ import { ErrType } from '../errorCode';
/* dataset: 507000 */
const startCode = 507000;
export enum CommonErrEnum {
fileNotFound = 'fileNotFound'
fileNotFound = 'fileNotFound',
unAuthFile = 'unAuthFile'
}
const datasetErr = [
{
statusText: CommonErrEnum.fileNotFound,
message: 'error.fileNotFound'
},
{
statusText: CommonErrEnum.unAuthFile,
message: 'error.unAuthFile'
}
];
export default datasetErr.reduce((acc, cur, index) => {

View File

@@ -40,9 +40,9 @@ export const splitText2Chunks = (props: {
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
// ------ There's no overlap on the top
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([]|!\s)/g, maxLen: chunkLen * 1.2 },
{ reg: /([]|\?\s)/g, maxLen: chunkLen * 1.4 },
@@ -56,7 +56,7 @@ export const splitText2Chunks = (props: {
const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
// if use markdown title split, Separate record title title
// if use markdown title split, Separate record title
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
if (step >= stepReges.length) {
return [
@@ -97,6 +97,7 @@ export const splitText2Chunks = (props: {
.filter((item) => item.text.trim());
};
/* Gets the overlap at the end of a text as the beginning of the next block */
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = checkForbidOverlap(step);
const maxOverlapLen = chunkLen * 0.4;

View File

@@ -55,6 +55,7 @@ export type FastGPTFeConfigsType = {
customApiDomain?: string;
customSharePageDomain?: string;
uploadFileMaxAmount?: number;
uploadFileMaxSize?: number;
};

View File

@@ -44,14 +44,18 @@ export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
export type LinkCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
link: string;
};
export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
fileId: string;
};
export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
name: string;
rawTextLength: number;
hashRawText: string;
fileMetadata?: Record<string, any>;
collectionMetadata?: Record<string, any>;
};
export type CsvTableCreateDatasetCollectionParams = {
datasetId: string;
parentId?: string;
fileId: string;
};
/* ================= data ===================== */
export type PgSearchRawType = {

View File

@@ -73,6 +73,13 @@ export const DatasetCollectionSyncResultMap = {
/* ------------ data -------------- */
/* ------------ training -------------- */
export enum ImportDataSourceEnum {
fileLocal = 'fileLocal',
fileLink = 'fileLink',
fileCustom = 'fileCustom',
csvTable = 'csvTable'
}
export enum TrainingModeEnum {
chunk = 'chunk',
auto = 'auto',

View File

@@ -2,18 +2,18 @@
"name": "@fastgpt/global",
"version": "1.0.0",
"dependencies": {
"@apidevtools/swagger-parser": "^10.1.0",
"axios": "^1.5.1",
"dayjs": "^1.11.7",
"encoding": "^0.1.13",
"js-tiktoken": "^1.0.7",
"openapi-types": "^12.1.3",
"openai": "4.28.0",
"nanoid": "^4.0.1",
"js-yaml": "^4.1.0",
"timezones-list": "^3.0.2",
"next": "13.5.2",
"jschardet": "3.1.1",
"@apidevtools/swagger-parser": "^10.1.0"
"nanoid": "^4.0.1",
"next": "13.5.2",
"openai": "4.28.0",
"openapi-types": "^12.1.3",
"timezones-list": "^3.0.2"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",