mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-22 20:37:48 +00:00
Feat: pptx and xlsx loader (#1118)
* perf: plan tip * perf: upload size controller * feat: add image ttl index * feat: new upload file ux * remove file * feat: support read pptx * feat: support xlsx * fix: rerank docker flie
This commit is contained in:
@@ -3,12 +3,17 @@ import { ErrType } from '../errorCode';
|
||||
/* dataset: 507000 */
|
||||
const startCode = 507000;
|
||||
export enum CommonErrEnum {
|
||||
fileNotFound = 'fileNotFound'
|
||||
fileNotFound = 'fileNotFound',
|
||||
unAuthFile = 'unAuthFile'
|
||||
}
|
||||
const datasetErr = [
|
||||
{
|
||||
statusText: CommonErrEnum.fileNotFound,
|
||||
message: 'error.fileNotFound'
|
||||
},
|
||||
{
|
||||
statusText: CommonErrEnum.unAuthFile,
|
||||
message: 'error.unAuthFile'
|
||||
}
|
||||
];
|
||||
export default datasetErr.reduce((acc, cur, index) => {
|
||||
|
@@ -40,9 +40,9 @@ export const splitText2Chunks = (props: {
|
||||
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
|
||||
|
||||
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
|
||||
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?![\*\-|>`0-9]): markdown special char
|
||||
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
|
||||
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
|
||||
|
||||
// ------ There's no overlap on the top
|
||||
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 },
|
||||
{ reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 },
|
||||
@@ -56,7 +56,7 @@ export const splitText2Chunks = (props: {
|
||||
const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
|
||||
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
|
||||
|
||||
// if use markdown title split, Separate record title title
|
||||
// if use markdown title split, Separate record title
|
||||
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
|
||||
if (step >= stepReges.length) {
|
||||
return [
|
||||
@@ -97,6 +97,7 @@ export const splitText2Chunks = (props: {
|
||||
.filter((item) => item.text.trim());
|
||||
};
|
||||
|
||||
/* Gets the overlap at the end of a text as the beginning of the next block */
|
||||
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
|
||||
const forbidOverlap = checkForbidOverlap(step);
|
||||
const maxOverlapLen = chunkLen * 0.4;
|
||||
|
@@ -55,6 +55,7 @@ export type FastGPTFeConfigsType = {
|
||||
customApiDomain?: string;
|
||||
customSharePageDomain?: string;
|
||||
|
||||
uploadFileMaxAmount?: number;
|
||||
uploadFileMaxSize?: number;
|
||||
};
|
||||
|
||||
|
12
packages/global/core/dataset/api.d.ts
vendored
12
packages/global/core/dataset/api.d.ts
vendored
@@ -44,14 +44,18 @@ export type TextCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
|
||||
export type LinkCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
|
||||
link: string;
|
||||
};
|
||||
export type FileIdCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
|
||||
fileId: string;
|
||||
};
|
||||
export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & {
|
||||
name: string;
|
||||
rawTextLength: number;
|
||||
hashRawText: string;
|
||||
|
||||
fileMetadata?: Record<string, any>;
|
||||
collectionMetadata?: Record<string, any>;
|
||||
};
|
||||
export type CsvTableCreateDatasetCollectionParams = {
|
||||
datasetId: string;
|
||||
parentId?: string;
|
||||
fileId: string;
|
||||
};
|
||||
|
||||
/* ================= data ===================== */
|
||||
export type PgSearchRawType = {
|
||||
|
@@ -73,6 +73,13 @@ export const DatasetCollectionSyncResultMap = {
|
||||
/* ------------ data -------------- */
|
||||
|
||||
/* ------------ training -------------- */
|
||||
export enum ImportDataSourceEnum {
|
||||
fileLocal = 'fileLocal',
|
||||
fileLink = 'fileLink',
|
||||
fileCustom = 'fileCustom',
|
||||
csvTable = 'csvTable'
|
||||
}
|
||||
|
||||
export enum TrainingModeEnum {
|
||||
chunk = 'chunk',
|
||||
auto = 'auto',
|
||||
|
@@ -2,18 +2,18 @@
|
||||
"name": "@fastgpt/global",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"@apidevtools/swagger-parser": "^10.1.0",
|
||||
"axios": "^1.5.1",
|
||||
"dayjs": "^1.11.7",
|
||||
"encoding": "^0.1.13",
|
||||
"js-tiktoken": "^1.0.7",
|
||||
"openapi-types": "^12.1.3",
|
||||
"openai": "4.28.0",
|
||||
"nanoid": "^4.0.1",
|
||||
"js-yaml": "^4.1.0",
|
||||
"timezones-list": "^3.0.2",
|
||||
"next": "13.5.2",
|
||||
"jschardet": "3.1.1",
|
||||
"@apidevtools/swagger-parser": "^10.1.0"
|
||||
"nanoid": "^4.0.1",
|
||||
"next": "13.5.2",
|
||||
"openai": "4.28.0",
|
||||
"openapi-types": "^12.1.3",
|
||||
"timezones-list": "^3.0.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
|
Reference in New Issue
Block a user