mirror of
https://github.com/labring/FastGPT.git
synced 2025-07-27 00:17:31 +00:00

* fix: remove DefaultTeam (#4037) * fix :Get application bound knowledge base information logical rewrite (#4057) * fix :Get application bound knowledge base information logical rewrite * fix :Get application bound knowledge base information logical rewrite * fix :Get application bound knowledge base information logical rewrite * fix :Get application bound knowledge base information logical rewrite * update package * fix: import dataset step error;perf: ai proxy avatar (#4074) * perf: pg config params * perf: ai proxy avatar * fix: import dataset step error * feat: data input ux * perf: app dataset rewite * fix: 文本提取不支持arrayString,arrayNumber等jsonSchema (#4079) * update doc ;perf: model test (#4098) * perf: extract array * update doc * perf: model test * perf: model test * perf: think tag parse (#4102) * chat quote reader (#3912) * init chat quote full text reader * linked structure * dataset data linked * optimize code * fix ts build * test finish * delete log * fix * fix ts * fix ts * remove nextId * initial scroll * fix * fix * perf: chunk read (#4109) * package * perf: chunk read * feat: api dataset support pdf parse;fix: chunk reader auth (#4117) * feat: api dataset support pdf parse * fix: chunk reader auth * feat: invitation link (#3979) * feat: invitation link schema and apis * feat: add invitation link * feat: member status: active, leave, forbidden * fix: expires show hours and minutes * feat: invalid invitation link hint * fix: typo * chore: fix typo & i18n * fix * pref: fe * feat: add ttl index for 30-day-clean-up * perf: invite member code (#4118) * perf: invite member code * fix: ts * fix: model test channel id;fix: quote reader (#4123) * fix: model test channel id * fix: quote reader * fix chat quote reader (#4125) * perf: model test;perf: sidebar trigger (#4127) * fix: import dataset step error;perf: ai proxy avatar (#4074) * perf: pg config params * perf: ai proxy avatar * fix: import dataset step error * feat: data input ux * perf: app dataset 
rewite * perf: model test * perf: sidebar trigger * lock * update nanoid version * fix: select component ux * fix: ts * fix: vitest * remove test * fix: prompt toolcall ui (#4139) * load log error adapt * fix: prompt toolcall ui * perf: commercial function tip * update package * pref: copy link (#4147) * fix(i18n): namespace (#4143) * hiden dataset source (#4152) * hiden dataset source * perf: reader * chore: move all tests into a single folder (#4160) * fix modal close scroll (#4162) * fix modal close scroll * update refresh * feat: rerank modal select and weight (#4164) * fix loadInitData refresh (#4169) * fix * fix * form input number default & api dataset max token * feat: mix search weight (#4170) * feat: mix search weight * feat: svg render * fix: avatar error remove (#4173) * fix: avatar error remove * fix: index * fix: guide * fix: auth * update package;fix: input data model ui (#4181) * update package * fix: ts * update config * update jieba package * add type sign * fix: input data ui * fix: page title refresh (#4186) * fix: ts * update jieba package * fix: page title refresh * fix: remove member length check when opening invite create modal (#4193) * add env to check internal ip (#4187) * fix: ts * update jieba package * add env to check internal ip * package * fix: jieba * reset package * update config * fix: jieba package * init shell * init version * change team reload * update jieba package (#4200) * update jieba package * package * update package * remove invalid code * action * package (#4201) * package * update package * remove invalid code * package * remove i18n tip (#4202) * doc (#4205) * fix: i18n (#4208) * fix: next config (#4207) * reset package * i18n * update config * i18n * remove log --------- Co-authored-by: Finley Ge <32237950+FinleyGe@users.noreply.github.com> Co-authored-by: gggaaallleee <91131304+gggaaallleee@users.noreply.github.com> Co-authored-by: shilin <39396378+shilin66@users.noreply.github.com> Co-authored-by: heheer 
<heheer@sealos.io>
187 lines
4.4 KiB
TypeScript
187 lines
4.4 KiB
TypeScript
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
import { urlsFetch } from '../../common/string/cheerio';
import { parseCsvTable2Chunks } from './training/utils';
import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import axios from 'axios';
import { readRawContentByFileBuffer } from '../../common/file/read/utils';
import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools';
import { APIFileServer, FeishuServer, YuqueServer } from '@fastgpt/global/core/dataset/apiDataset';
import { useApiDatasetRequest } from './apiDataset/api';
import { POST } from '../../common/api/plusRequest';
export const readFileRawTextByUrl = async ({
|
|
teamId,
|
|
tmbId,
|
|
url,
|
|
customPdfParse,
|
|
relatedId
|
|
}: {
|
|
teamId: string;
|
|
tmbId: string;
|
|
url: string;
|
|
customPdfParse?: boolean;
|
|
relatedId: string; // externalFileId / apiFileId
|
|
}) => {
|
|
const response = await axios({
|
|
method: 'get',
|
|
url: url,
|
|
responseType: 'arraybuffer'
|
|
});
|
|
const extension = parseFileExtensionFromUrl(url);
|
|
|
|
const buffer = Buffer.from(response.data, 'binary');
|
|
|
|
const { rawText } = await readRawContentByFileBuffer({
|
|
customPdfParse,
|
|
isQAImport: false,
|
|
extension,
|
|
teamId,
|
|
tmbId,
|
|
buffer,
|
|
encoding: 'utf-8',
|
|
metadata: {
|
|
relatedId
|
|
}
|
|
});
|
|
|
|
return rawText;
|
|
};
|
|
|
|
/*
|
|
fileId - local file, read from mongo
|
|
link - request
|
|
externalFile/apiFile = request read
|
|
*/
|
|
export const readDatasetSourceRawText = async ({
|
|
teamId,
|
|
tmbId,
|
|
type,
|
|
sourceId,
|
|
isQAImport,
|
|
selector,
|
|
externalFileId,
|
|
apiServer,
|
|
feishuServer,
|
|
yuqueServer,
|
|
customPdfParse
|
|
}: {
|
|
teamId: string;
|
|
tmbId: string;
|
|
type: DatasetSourceReadTypeEnum;
|
|
sourceId: string;
|
|
customPdfParse?: boolean;
|
|
|
|
isQAImport?: boolean; // csv data
|
|
selector?: string; // link selector
|
|
externalFileId?: string; // external file dataset
|
|
apiServer?: APIFileServer; // api dataset
|
|
feishuServer?: FeishuServer; // feishu dataset
|
|
yuqueServer?: YuqueServer; // yuque dataset
|
|
}): Promise<string> => {
|
|
if (type === DatasetSourceReadTypeEnum.fileLocal) {
|
|
const { rawText } = await readFileContentFromMongo({
|
|
teamId,
|
|
tmbId,
|
|
bucketName: BucketNameEnum.dataset,
|
|
fileId: sourceId,
|
|
isQAImport,
|
|
customPdfParse
|
|
});
|
|
return rawText;
|
|
} else if (type === DatasetSourceReadTypeEnum.link) {
|
|
const result = await urlsFetch({
|
|
urlList: [sourceId],
|
|
selector
|
|
});
|
|
|
|
return result[0]?.content || '';
|
|
} else if (type === DatasetSourceReadTypeEnum.externalFile) {
|
|
if (!externalFileId) return Promise.reject('FileId not found');
|
|
const rawText = await readFileRawTextByUrl({
|
|
teamId,
|
|
tmbId,
|
|
url: sourceId,
|
|
relatedId: externalFileId,
|
|
customPdfParse
|
|
});
|
|
return rawText;
|
|
} else if (type === DatasetSourceReadTypeEnum.apiFile) {
|
|
const rawText = await readApiServerFileContent({
|
|
apiServer,
|
|
feishuServer,
|
|
yuqueServer,
|
|
apiFileId: sourceId,
|
|
teamId,
|
|
tmbId
|
|
});
|
|
return rawText;
|
|
}
|
|
return '';
|
|
};
|
|
|
|
export const readApiServerFileContent = async ({
|
|
apiServer,
|
|
feishuServer,
|
|
yuqueServer,
|
|
apiFileId,
|
|
teamId,
|
|
tmbId,
|
|
customPdfParse
|
|
}: {
|
|
apiServer?: APIFileServer;
|
|
feishuServer?: FeishuServer;
|
|
yuqueServer?: YuqueServer;
|
|
apiFileId: string;
|
|
teamId: string;
|
|
tmbId: string;
|
|
customPdfParse?: boolean;
|
|
}) => {
|
|
if (apiServer) {
|
|
return useApiDatasetRequest({ apiServer }).getFileContent({
|
|
teamId,
|
|
tmbId,
|
|
apiFileId,
|
|
customPdfParse
|
|
});
|
|
}
|
|
|
|
if (feishuServer || yuqueServer) {
|
|
return POST<string>(`/core/dataset/systemApiDataset`, {
|
|
type: 'content',
|
|
feishuServer,
|
|
yuqueServer,
|
|
apiFileId
|
|
});
|
|
}
|
|
|
|
return Promise.reject('No apiServer or feishuServer or yuqueServer');
|
|
};
|
|
|
|
export const rawText2Chunks = ({
|
|
rawText,
|
|
isQAImport,
|
|
chunkLen = 512,
|
|
...splitProps
|
|
}: {
|
|
rawText: string;
|
|
isQAImport?: boolean;
|
|
} & TextSplitProps) => {
|
|
if (isQAImport) {
|
|
const { chunks } = parseCsvTable2Chunks(rawText);
|
|
return chunks;
|
|
}
|
|
|
|
const { chunks } = splitText2Chunks({
|
|
text: rawText,
|
|
chunkLen,
|
|
...splitProps
|
|
});
|
|
|
|
return chunks.map((item) => ({
|
|
q: item,
|
|
a: ''
|
|
}));
|
|
};
|