Mirror of https://github.com/labring/FastGPT.git (synced 2025-10-21 03:10:50 +00:00)
External dataset (#1497)
* perf: read rawText and chunk code
* perf: read raw text
* perf: read rawtext
* perf: token count
* log
.vscode/nextapi.code-snippets | 4 (vendored)
@@ -20,9 +20,9 @@
   "export type ${TM_FILENAME_BASE}Response = {};",
   "",
   "async function handler(",
-  " req: ApiRequestProps<getDatasetTrainingQueueBody, getDatasetTrainingQueueQuery>,",
+  " req: ApiRequestProps<${TM_FILENAME_BASE}Body, ${TM_FILENAME_BASE}Query>,",
   " res: ApiResponseType<any>",
-  "): Promise<getDatasetTrainingQueueResponse> {",
+  "): Promise<${TM_FILENAME_BASE}Response> {",
   " $1",
   " return {}",
   "}",
@@ -9,6 +9,9 @@ type SplitProps = {
   overlapRatio?: number;
   customReg?: string[];
 };
+export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
+  chunkLen?: number;
+};
 
 type SplitResponse = {
   chunks: string[];
@@ -49,6 +52,7 @@ const strIsMdTable = (str: string) => {
       return false;
     }
   }
 
   return true;
 };
 const markdownTableSplit = (props: SplitProps): SplitResponse => {
@@ -77,6 +81,10 @@ ${mdSplitString}
     chunk += `${splitText2Lines[i]}\n`;
   }
 
+  if (chunk) {
+    chunks.push(chunk);
+  }
+
   return {
     chunks,
     chars: chunks.reduce((sum, chunk) => sum + chunk.length, 0)
@@ -66,6 +66,8 @@ export type SystemEnvType = {
   vectorMaxProcess: number;
   qaMaxProcess: number;
   pgHNSWEfSearch: number;
+  tokenWorkers: number; // token count max worker
+
   oneapiUrl?: string;
   chatApiKey?: string;
 };
@@ -170,3 +170,10 @@ export const SearchScoreTypeMap = {
 
 export const CustomCollectionIcon = 'common/linkBlue';
 export const LinkCollectionIcon = 'common/linkBlue';
+
+/* source prefix */
+export enum DatasetSourceReadTypeEnum {
+  fileLocal = 'fileLocal',
+  link = 'link',
+  externalFile = 'externalFile'
+}
packages/global/core/dataset/read.ts | 16 (new file)
@@ -0,0 +1,16 @@
+import { DatasetSourceReadTypeEnum, ImportDataSourceEnum } from './constants';
+
+export const rawTextBackupPrefix = 'index,content';
+
+export const importType2ReadType = (type: ImportDataSourceEnum) => {
+  if (type === ImportDataSourceEnum.csvTable || type === ImportDataSourceEnum.fileLocal) {
+    return DatasetSourceReadTypeEnum.fileLocal;
+  }
+  if (type === ImportDataSourceEnum.fileLink) {
+    return DatasetSourceReadTypeEnum.link;
+  }
+  if (type === ImportDataSourceEnum.externalFile) {
+    return DatasetSourceReadTypeEnum.externalFile;
+  }
+  return DatasetSourceReadTypeEnum.link;
+};
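As an illustrative aside, the mapping above means csvTable and fileLocal imports are both read through the local-file path, fileLink goes through the link reader, externalFile through the external URL reader, and anything else falls back to link. A minimal sketch, assuming both modules are imported from the packages shown in this diff:

import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
import { importType2ReadType } from '@fastgpt/global/core/dataset/read';

importType2ReadType(ImportDataSourceEnum.csvTable); // DatasetSourceReadTypeEnum.fileLocal
importType2ReadType(ImportDataSourceEnum.fileLink); // DatasetSourceReadTypeEnum.link
importType2ReadType(ImportDataSourceEnum.externalFile); // DatasetSourceReadTypeEnum.externalFile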
@@ -151,12 +151,12 @@ export const readFileContentFromMongo = async ({
   teamId,
   bucketName,
   fileId,
-  csvFormat = false
+  isQAImport = false
 }: {
   teamId: string;
   bucketName: `${BucketNameEnum}`;
   fileId: string;
-  csvFormat?: boolean;
+  isQAImport?: boolean;
 }): Promise<{
   rawText: string;
   filename: string;
@@ -198,7 +198,7 @@ export const readFileContentFromMongo = async ({
 
   const { rawText } = await readFileRawContent({
     extension,
-    csvFormat,
+    isQAImport,
     teamId,
     buffer: fileBuffers,
     encoding,
@@ -5,6 +5,7 @@ import { addHours } from 'date-fns';
 
 import { WorkerNameEnum, runWorker } from '../../../worker/utils';
 import { ReadFileResponse } from '../../../worker/file/type';
+import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
 
 export const initMarkdownText = ({
   teamId,
@@ -29,36 +30,44 @@ export const initMarkdownText = ({
 
 export const readFileRawContent = async ({
   extension,
-  csvFormat,
+  isQAImport,
   teamId,
   buffer,
   encoding,
   metadata
 }: {
-  csvFormat?: boolean;
+  isQAImport?: boolean;
   extension: string;
   teamId: string;
   buffer: Buffer;
   encoding: string;
   metadata?: Record<string, any>;
 }) => {
-  const result = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
+  let { rawText, formatText } = await runWorker<ReadFileResponse>(WorkerNameEnum.readFile, {
     extension,
-    csvFormat,
     encoding,
     buffer
   });
 
   // markdown data format
   if (['md', 'html', 'docx'].includes(extension)) {
-    result.rawText = await initMarkdownText({
+    rawText = await initMarkdownText({
       teamId: teamId,
-      md: result.rawText,
+      md: rawText,
       metadata: metadata
     });
   }
 
-  return result;
+  if (['csv', 'xlsx'].includes(extension)) {
+    // qa data
+    if (isQAImport) {
+      rawText = rawText || '';
+    } else {
+      rawText = formatText || '';
+    }
+  }
+
+  return { rawText };
 };
 
 export const htmlToMarkdown = async (html?: string | null) => {
@@ -77,9 +77,8 @@ export const urlsFetch = async ({
     $,
     selector
   });
-  console.log('html====', html);
+
   const md = await htmlToMarkdown(html);
-  console.log('html====', md);
 
   return {
     url,
@@ -12,27 +12,34 @@ import { getNanoid } from '@fastgpt/global/common/string/tools';
 import { addLog } from '../../system/log';
 
 export const getTiktokenWorker = () => {
-  if (global.tiktokenWorker) {
-    return global.tiktokenWorker;
+  const maxWorkers = global.systemEnv?.tokenWorkers || 20;
+
+  if (!global.tiktokenWorkers) {
+    global.tiktokenWorkers = [];
+  }
+
+  if (global.tiktokenWorkers.length >= maxWorkers) {
+    return global.tiktokenWorkers[Math.floor(Math.random() * global.tiktokenWorkers.length)];
   }
 
   const worker = getWorker(WorkerNameEnum.countGptMessagesTokens);
 
+  const i = global.tiktokenWorkers.push({
+    index: global.tiktokenWorkers.length,
+    worker,
+    callbackMap: {}
+  });
+
   worker.on('message', ({ id, data }: { id: string; data: number }) => {
-    const callback = global.tiktokenWorker?.callbackMap?.[id];
+    const callback = global.tiktokenWorkers[i - 1]?.callbackMap?.[id];
 
     if (callback) {
       callback?.(data);
-      delete global.tiktokenWorker.callbackMap[id];
+      delete global.tiktokenWorkers[i - 1].callbackMap[id];
     }
   });
 
-  global.tiktokenWorker = {
-    worker,
-    callbackMap: {}
-  };
-
-  return global.tiktokenWorker;
+  return global.tiktokenWorkers[i - 1];
 };
 
 export const countGptMessagesTokens = (
@@ -44,20 +51,29 @@ export const countGptMessagesTokens = (
     const start = Date.now();
 
     const { worker, callbackMap } = getTiktokenWorker();
 
     const id = getNanoid();
 
     const timer = setTimeout(() => {
-      resolve(0);
+      console.log('Count token Time out');
+      resolve(
+        messages.reduce((sum, item) => {
+          if (item.content) {
+            return sum + item.content.length * 0.5;
+          }
+          return sum;
+        }, 0)
+      );
       delete callbackMap[id];
-    }, 300);
+    }, 60000);
 
     callbackMap[id] = (data) => {
+      // 检测是否有内存泄漏
+      addLog.info(`Count token time: ${Date.now() - start}, token: ${data}`);
+      // console.log(process.memoryUsage());
+
       resolve(data);
       clearTimeout(timer);
-
-      // 检测是否有内存泄漏
-      // addLog.info(`Count token time: ${Date.now() - start}, token: ${data}`);
-      // console.log(process.memoryUsage());
     };
 
     worker.postMessage({
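As an aside on the change above: the timeout fallback no longer resolves to 0 after 300 ms; it now waits 60 s and then falls back to a rough length-based guess of half a token per character of string content, while worker selection is spread across a pool capped by systemEnv.tokenWorkers (default 20). A minimal sketch of that estimate, assuming message content is a plain string (multimodal content parts are ignored here, as in the fallback):

const estimateTokensByLength = (messages: { content?: string | null }[]) =>
  messages.reduce((sum, item) => (item.content ? sum + item.content.length * 0.5 : sum), 0);

estimateTokensByLength([{ content: 'hello world' }]); // 5.5, only used when the worker misses the deadline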
packages/service/core/dataset/read.ts | 99 (new file)
@@ -0,0 +1,99 @@
+import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
+import { urlsFetch } from '../../common/string/cheerio';
+import { rawTextBackupPrefix } from '@fastgpt/global/core/dataset/read';
+import { parseCsvTable2Chunks } from './training/utils';
+import { TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
+import axios from 'axios';
+import { readFileRawContent } from '../../common/file/read/utils';
+
+export const readFileRawTextByUrl = async ({ teamId, url }: { teamId: string; url: string }) => {
+  const response = await axios({
+    method: 'get',
+    url: url,
+    responseType: 'arraybuffer'
+  });
+  const extension = url.split('.')?.pop()?.toLowerCase() || '';
+
+  const buffer = Buffer.from(response.data, 'binary');
+
+  const { rawText } = await readFileRawContent({
+    extension,
+    teamId,
+    buffer,
+    encoding: 'utf-8'
+  });
+
+  return rawText;
+};
+
+/*
+  fileId - local file, read from mongo
+  link - request
+  externalFile = request read
+*/
+export const readDatasetSourceRawText = async ({
+  teamId,
+  type,
+  sourceId,
+  isQAImport,
+  selector
+}: {
+  teamId: string;
+  type: DatasetSourceReadTypeEnum;
+  sourceId: string;
+  isQAImport?: boolean;
+  selector?: string;
+}): Promise<string> => {
+  if (type === DatasetSourceReadTypeEnum.fileLocal) {
+    const { rawText } = await readFileContentFromMongo({
+      teamId,
+      bucketName: BucketNameEnum.dataset,
+      fileId: sourceId,
+      isQAImport
+    });
+    return rawText;
+  } else if (type === DatasetSourceReadTypeEnum.link) {
+    const result = await urlsFetch({
+      urlList: [sourceId],
+      selector
+    });
+
+    return result[0]?.content || '';
+  } else if (type === DatasetSourceReadTypeEnum.externalFile) {
+    const rawText = await readFileRawTextByUrl({
+      teamId,
+      url: sourceId
+    });
+    return rawText;
+  }
+
+  return '';
+};
+
+export const rawText2Chunks = ({
+  rawText,
+  isQAImport,
+  chunkLen = 512,
+  ...splitProps
+}: {
+  rawText: string;
+  isQAImport?: boolean;
+} & TextSplitProps) => {
+  if (isQAImport) {
+    const { chunks } = parseCsvTable2Chunks(rawText);
+    return chunks;
+  }
+
+  const { chunks } = splitText2Chunks({
+    text: rawText,
+    chunkLen,
+    ...splitProps
+  });
+
+  return chunks.map((item) => ({
+    q: item,
+    a: ''
+  }));
+};
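The two helpers above are designed to be chained, which is how the preview and import API routes later in this commit use them. An illustrative sketch (the teamId and URL are placeholders; overlapRatio 0.2 mirrors the chunk-mode value used elsewhere in the diff):

import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';

const rawText = await readDatasetSourceRawText({
  teamId: 'team-id-placeholder',
  type: DatasetSourceReadTypeEnum.link,
  sourceId: 'https://example.com/article', // for link sources the sourceId is the URL
  selector: 'article'
});

const chunks = rawText2Chunks({ rawText, chunkLen: 512, overlapRatio: 0.2 });
// => [{ q: '...', a: '' }, ...]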
@@ -71,7 +71,7 @@ export const dispatchHttp468Request = async (props: HttpRequestProps): Promise<H
     chatId,
     responseChatItemId,
     ...variables,
-    histories: histories.slice(-10),
+    histories: histories?.slice(-10) || [],
     ...body,
     ...dynamicInput
   };
@@ -62,7 +62,10 @@ export const valueTypeFormat = (value: any, type?: WorkflowIOValueTypeEnum) => {
     return JSON.stringify(value);
   }
   if (type === 'number') return Number(value);
-  if (type === 'boolean') return value === 'true' ? true : false;
+  if (type === 'boolean') {
+    if (typeof value === 'string') return value === 'true';
+    return Boolean(value);
+  }
   try {
     if (type === WorkflowIOValueTypeEnum.datasetQuote && !Array.isArray(value)) {
       return JSON.parse(value);
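A quick worked example of the boolean branch: the old expression (value === 'true' ? true : false) coerced a real boolean true to false, while the new version only does string comparison for strings and runs everything else through Boolean(). Illustrative calls, with the enum member passed as its string value:

valueTypeFormat(true, 'boolean' as WorkflowIOValueTypeEnum); // before: false, after: true
valueTypeFormat('true', 'boolean' as WorkflowIOValueTypeEnum); // true before and after
valueTypeFormat('false', 'boolean' as WorkflowIOValueTypeEnum); // false before and after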
@@ -13,10 +13,10 @@
     "decompress": "^4.2.1",
     "domino-ext": "^2.1.4",
     "encoding": "^0.1.13",
+    "fastgpt-js-tiktoken": "^1.0.12",
     "file-type": "^19.0.0",
     "iconv-lite": "^0.6.3",
     "joplin-turndown-plugin-gfm": "^1.0.12",
-    "js-tiktoken": "^1.0.7",
     "json5": "^2.2.3",
     "jsonwebtoken": "^9.0.2",
     "mammoth": "^1.6.0",
packages/service/type.d.ts | 5 (vendored)
@@ -20,8 +20,9 @@ declare global {
   var whisperModel: WhisperModelType;
   var reRankModels: ReRankModelItemType[];
 
-  var tiktokenWorker: {
+  var tiktokenWorkers: {
+    index: number;
     worker: Worker;
     callbackMap: Record<string, (e: number) => void>;
-  };
+  }[];
 }
@@ -15,40 +15,45 @@ type TokenType = {
 
 export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise<ReadFileResponse> => {
   const readPDFPage = async (doc: any, pageNo: number) => {
-    const page = await doc.getPage(pageNo);
-    const tokenizedText = await page.getTextContent();
+    try {
+      const page = await doc.getPage(pageNo);
+      const tokenizedText = await page.getTextContent();
 
       const viewport = page.getViewport({ scale: 1 });
       const pageHeight = viewport.height;
       const headerThreshold = pageHeight * 0.95;
       const footerThreshold = pageHeight * 0.05;
 
       const pageTexts: TokenType[] = tokenizedText.items.filter((token: TokenType) => {
         return (
           !token.transform ||
           (token.transform[5] < headerThreshold && token.transform[5] > footerThreshold)
         );
       });
 
       // concat empty string 'hasEOL'
       for (let i = 0; i < pageTexts.length; i++) {
         const item = pageTexts[i];
         if (item.str === '' && pageTexts[i - 1]) {
           pageTexts[i - 1].hasEOL = item.hasEOL;
           pageTexts.splice(i, 1);
           i--;
         }
       }
 
+      page.cleanup();
+
+      return pageTexts
+        .map((token) => {
+          const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
+
+          return paragraphEnd ? `${token.str}\n` : token.str;
+        })
+        .join('');
+    } catch (error) {
+      console.log('pdf read error', error);
+      return '';
     }
-
-    page.cleanup();
-
-    return pageTexts
-      .map((token) => {
-        const paragraphEnd = token.hasEOL && /([。?!.?!\n\r]|(\r\n))$/.test(token.str);
-
-        return paragraphEnd ? `${token.str}\n` : token.str;
-      })
-      .join('');
   };
 
   const loadingTask = pdfjs.getDocument(buffer.buffer);
@@ -58,6 +63,7 @@ export const readPdfFile = async ({ buffer }: ReadRawTextByBuffer): Promise<Read
   for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
     pageTextPromises.push(readPDFPage(doc, pageNo));
   }
+
   const pageTexts = await Promise.all(pageTextPromises);
 
   loadingTask.destroy();
@@ -23,25 +23,9 @@ parentPort?.on('message', async (props: ReadRawTextProps<Uint8Array>) => {
     case 'pptx':
       return readPptxRawText(params);
     case 'xlsx':
-      const xlsxResult = await readXlsxRawText(params);
-      if (params.csvFormat) {
-        return {
-          rawText: xlsxResult.formatText || ''
-        };
-      }
-      return {
-        rawText: xlsxResult.rawText
-      };
+      return readXlsxRawText(params);
     case 'csv':
-      const csvResult = await readCsvRawText(params);
-      if (params.csvFormat) {
-        return {
-          rawText: csvResult.formatText || ''
-        };
-      }
-      return {
-        rawText: csvResult.rawText
-      };
+      return readCsvRawText(params);
     default:
       return Promise.reject('Only support .txt, .md, .html, .pdf, .docx, pptx, .csv, .xlsx');
   }
packages/service/worker/file/type.d.ts | 1 (vendored)
@@ -1,7 +1,6 @@
 import { ReadFileByBufferParams } from '../../common/file/read/type';
 
 export type ReadRawTextProps<T> = {
-  csvFormat?: boolean;
   extension: string;
   buffer: T;
   encoding: string;
@@ -1,6 +1,6 @@
 /* Only the token of gpt-3.5-turbo is used */
-import { Tiktoken } from 'js-tiktoken/lite';
-import encodingJson from './cl100k_base.json';
+import { Tiktoken } from 'fastgpt-js-tiktoken/lite';
+import cl100k_base from './cl100k_base.json';
 import {
   ChatCompletionMessageParam,
   ChatCompletionContentPart,
@@ -10,7 +10,7 @@ import {
 import { ChatCompletionRequestMessageRoleEnum } from '@fastgpt/global/core/ai/constants';
 import { parentPort } from 'worker_threads';
 
-const enc = new Tiktoken(encodingJson);
+const enc = new Tiktoken(cl100k_base);
 
 /* count messages tokens */
 parentPort?.on(
pnpm-lock.yaml | 81 (generated)
@@ -126,6 +126,9 @@ importers:
       encoding:
         specifier: ^0.1.13
         version: 0.1.13
+      fastgpt-js-tiktoken:
+        specifier: ^1.0.12
+        version: registry.npmjs.org/fastgpt-js-tiktoken@1.0.12
       file-type:
         specifier: ^19.0.0
         version: 19.0.0
@@ -135,9 +138,6 @@ importers:
       joplin-turndown-plugin-gfm:
         specifier: ^1.0.12
         version: 1.0.12
-      js-tiktoken:
-        specifier: ^1.0.7
-        version: 1.0.7
       json5:
         specifier: ^2.2.3
         version: 2.2.3
@@ -155,7 +155,7 @@ importers:
         version: 1.4.5-lts.1
       next:
         specifier: 13.5.2
-        version: 13.5.2(@babel/core@7.24.4)(react-dom@18.2.0)(react@18.2.0)(sass@1.58.3)
+        version: 13.5.2(react-dom@18.2.0)(react@18.2.0)
       nextjs-cors:
         specifier: ^2.1.2
        version: 2.1.2(next@13.5.2)
@@ -8722,12 +8722,6 @@ packages:
     resolution: {integrity: sha512-dwXFwByc/ajSV6m5bcKAPwe4yDDF6D614pxmIi5odytzxRlwqF6nwoiCek80Ixc7Cvma5awClxrzFtxCQvcM8w==}
     dev: true
 
-  /js-tiktoken@1.0.7:
-    resolution: {integrity: sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==}
-    dependencies:
-      base64-js: 1.5.1
-    dev: false
-
   /js-tokens@4.0.0:
     resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==}
@@ -9933,13 +9927,53 @@ packages:
       - '@babel/core'
       - babel-plugin-macros
 
+  /next@13.5.2(react-dom@18.2.0)(react@18.2.0):
+    resolution: {integrity: sha512-vog4UhUaMYAzeqfiAAmgB/QWLW7p01/sg+2vn6bqc/CxHFYizMzLv6gjxKzl31EVFkfl/F+GbxlKizlkTE9RdA==}
+    engines: {node: '>=16.14.0'}
+    hasBin: true
+    peerDependencies:
+      '@opentelemetry/api': ^1.1.0
+      react: ^18.2.0
+      react-dom: ^18.2.0
+      sass: ^1.3.0
+    peerDependenciesMeta:
+      '@opentelemetry/api':
+        optional: true
+      sass:
+        optional: true
+    dependencies:
+      '@next/env': 13.5.2
+      '@swc/helpers': 0.5.2
+      busboy: 1.6.0
+      caniuse-lite: 1.0.30001603
+      postcss: 8.4.14
+      react: 18.2.0
+      react-dom: 18.2.0(react@18.2.0)
+      styled-jsx: 5.1.1(react@18.2.0)
+      watchpack: 2.4.0
+      zod: 3.21.4
+    optionalDependencies:
+      '@next/swc-darwin-arm64': 13.5.2
+      '@next/swc-darwin-x64': 13.5.2
+      '@next/swc-linux-arm64-gnu': 13.5.2
+      '@next/swc-linux-arm64-musl': 13.5.2
+      '@next/swc-linux-x64-gnu': 13.5.2
+      '@next/swc-linux-x64-musl': 13.5.2
+      '@next/swc-win32-arm64-msvc': 13.5.2
+      '@next/swc-win32-ia32-msvc': 13.5.2
+      '@next/swc-win32-x64-msvc': 13.5.2
+    transitivePeerDependencies:
+      - '@babel/core'
+      - babel-plugin-macros
+    dev: false
+
   /nextjs-cors@2.1.2(next@13.5.2):
     resolution: {integrity: sha512-2yOVivaaf2ILe4f/qY32hnj3oC77VCOsUQJQfhVMGsXE/YMEWUY2zy78sH9FKUCM7eG42/l3pDofIzMD781XGA==}
     peerDependencies:
       next: ^8.1.1-canary.54 || ^9.0.0 || ^10.0.0-0 || ^11.0.0 || ^12.0.0 || ^13.0.0
     dependencies:
       cors: 2.8.5
-      next: 13.5.2(@babel/core@7.24.4)(react-dom@18.2.0)(react@18.2.0)(sass@1.58.3)
+      next: 13.5.2(react-dom@18.2.0)(react@18.2.0)
     dev: false
 
   /nextjs-node-loader@1.1.5(webpack@5.91.0):
@@ -11725,6 +11759,23 @@ packages:
       client-only: 0.0.1
       react: 18.2.0
 
+  /styled-jsx@5.1.1(react@18.2.0):
+    resolution: {integrity: sha512-pW7uC1l4mBZ8ugbiZrcIsiIvVx1UmTfw7UkC3Um2tmfUq9Bhk8IiyEIPl6F8agHgjzku6j0xQEZbfA5uSgSaCw==}
+    engines: {node: '>= 12.0.0'}
+    peerDependencies:
+      '@babel/core': '*'
+      babel-plugin-macros: '*'
+      react: '>= 16.8.0 || 17.x.x || ^18.0.0-0'
+    peerDependenciesMeta:
+      '@babel/core':
+        optional: true
+      babel-plugin-macros:
+        optional: true
+    dependencies:
+      client-only: 0.0.1
+      react: 18.2.0
+    dev: false
+
   /stylis@4.2.0:
     resolution: {integrity: sha512-Orov6g6BB1sDfYgzWfTHDOxamtX1bE/zo104Dh9e6fqJ3PooipYyfJ0pUmrZO2wAvO8YbEyeFrkV91XTsGMSrw==}
     dev: false
@@ -12799,3 +12850,11 @@ packages:
     engines: {node: '>=0.8'}
     hasBin: true
     dev: false
+
+  registry.npmjs.org/fastgpt-js-tiktoken@1.0.12:
+    resolution: {integrity: sha512-93UQM9h267PFQqnaJjcc+tqbKRZuipRbi+ASxVcE1FBzXOVb4GKfOMlsxXKCsSDdP+Luv8Fgul7F3HXKITXjYQ==, registry: https://registry.npmmirror.com/, tarball: https://registry.npmjs.org/fastgpt-js-tiktoken/-/fastgpt-js-tiktoken-1.0.12.tgz}
+    name: fastgpt-js-tiktoken
+    version: 1.0.12
+    dependencies:
+      base64-js: 1.5.1
+    dev: false
@@ -6,7 +6,8 @@
     "openapiPrefix": "fastgpt",
     "vectorMaxProcess": 15,
     "qaMaxProcess": 15,
-    "pgHNSWEfSearch": 100
+    "pgHNSWEfSearch": 100,
+    "tokenWorkers": 20
   },
   "llmModels": [
     {
projects/app/src/global/core/dataset/api.d.ts | 20 (vendored)
@@ -1,6 +1,7 @@
 import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
 import {
   DatasetSearchModeEnum,
+  DatasetSourceReadTypeEnum,
   DatasetTypeEnum,
   ImportDataSourceEnum,
   TrainingModeEnum
@@ -75,22 +76,3 @@ export type SearchTestResponse = {
 };
 
 /* =========== training =========== */
-export type PostPreviewFilesChunksProps = {
-  type: ImportDataSourceEnum;
-  sourceId: string;
-  chunkSize: number;
-  overlapRatio: number;
-  customSplitChar?: string;
-};
-
-export type PostPreviewFilesChunksResponse = {
-  fileId: string;
-  rawTextLength: number;
-  chunks: string[];
-}[];
-export type PostPreviewTableChunksResponse = {
-  fileId: string;
-  totalChunks: number;
-  chunks: { q: string; a: string; chunkIndex: number }[];
-  errorText?: string;
-}[];
projects/app/src/middleware.ts | 18 (new file)
@@ -0,0 +1,18 @@
+import { addLog } from '@fastgpt/service/common/system/log';
+import { NextResponse } from 'next/server';
+import type { NextRequest } from 'next/server';
+
+export function middleware(request: NextRequest) {
+  const response = NextResponse.next();
+
+  addLog.info(`Request URL: ${request.url}`, {
+    body: request.body
+  });
+
+  return response;
+}
+
+// See "Matching Paths" below to learn more
+export const config = {
+  matcher: '/api/:path*'
+};
@@ -1,41 +1,50 @@
 /*
   Read db file content and response 3000 words
 */
-import type { NextApiRequest, NextApiResponse } from 'next';
+import type { NextApiResponse } from 'next';
 import { jsonRes } from '@fastgpt/service/common/response';
-import { connectToDatabase } from '@/service/mongo';
-import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/controller';
 import { authFile } from '@fastgpt/service/support/permission/auth/file';
-import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import { NextAPI } from '@/service/middle/entry';
+import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import { readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
+import { ApiRequestProps } from '@fastgpt/service/type/next';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
 
-export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
-  try {
-    await connectToDatabase();
-    const { fileId, csvFormat } = req.body as { fileId: string; csvFormat?: boolean };
+export type PreviewContextProps = {
+  type: DatasetSourceReadTypeEnum;
+  sourceId: string;
+  isQAImport?: boolean;
+  selector?: string;
+};
 
-    if (!fileId) {
-      throw new Error('fileId is empty');
-    }
+async function handler(req: ApiRequestProps<PreviewContextProps>, res: NextApiResponse<any>) {
+  const { type, sourceId, isQAImport, selector } = req.body;
 
-    const { teamId } = await authFile({ req, authToken: true, fileId });
+  if (!sourceId) {
+    throw new Error('fileId is empty');
+  }
 
-    const { rawText } = await readFileContentFromMongo({
-      teamId,
-      bucketName: BucketNameEnum.dataset,
-      fileId,
-      csvFormat
-    });
+  const { teamId } = await (async () => {
+    if (type === DatasetSourceReadTypeEnum.fileLocal) {
+      return authFile({ req, authToken: true, authApiKey: true, fileId: sourceId });
+    }
+    return authCert({ req, authApiKey: true, authToken: true });
+  })();
 
-    jsonRes(res, {
-      data: {
-        previewContent: rawText.slice(0, 3000),
-        totalLength: rawText.length
-      }
-    });
-  } catch (error) {
-    jsonRes(res, {
-      code: 500,
-      error
-    });
-  }
+  const rawText = await readDatasetSourceRawText({
+    teamId,
+    type,
+    sourceId: sourceId,
+    isQAImport,
+    selector
+  });
+
+  jsonRes(res, {
+    data: {
+      previewContent: rawText.slice(0, 3000),
+      totalLength: rawText.length
+    }
+  });
 }
+
+export default NextAPI(handler);
projects/app/src/pages/api/core/ai/token.ts | 41 (new file)
@@ -0,0 +1,41 @@
+import type { ApiRequestProps, ApiResponseType } from '@fastgpt/service/type/next';
+import { NextAPI } from '@/service/middle/entry';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type';
+import { countGptMessagesTokens } from '@fastgpt/service/common/string/tiktoken';
+
+export type tokenQuery = {};
+
+export type tokenBody = {
+  messages: ChatCompletionMessageParam[];
+};
+
+export type tokenResponse = {};
+
+async function handler(
+  req: ApiRequestProps<tokenBody, tokenQuery>,
+  res: ApiResponseType<any>
+): Promise<tokenResponse> {
+  await authCert({ req, authRoot: true });
+  const start = Date.now();
+
+  const tokens = await countGptMessagesTokens(req.body.messages);
+
+  return {
+    tokens,
+    time: Date.now() - start,
+
+    memory: process.memoryUsage()
+  };
+}
+
+export default NextAPI(handler);
+
+export const config = {
+  api: {
+    bodyParser: {
+      sizeLimit: '20mb'
+    },
+    responseLimit: '20mb'
+  }
+};
@@ -19,6 +19,7 @@ import { UsageSourceEnum } from '@fastgpt/global/support/wallet/usage/constants'
 import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
 import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
 import { startTrainingQueue } from '@/service/core/dataset/training/utils';
+import { rawText2Chunks } from '@fastgpt/service/core/dataset/read';
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   const { datasetId, parentId, fileId } = req.body as FileIdCreateDatasetCollectionParams;
@@ -39,10 +40,15 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
   const { rawText, filename } = await readFileContentFromMongo({
     teamId,
     bucketName: BucketNameEnum.dataset,
-    fileId
+    fileId,
+    isQAImport: true
   });
+  console.log(rawText);
   // 2. split chunks
-  const { chunks = [] } = parseCsvTable2Chunks(rawText);
+  const chunks = rawText2Chunks({
+    rawText,
+    isQAImport: true
+  });
 
   // 3. auth limit
   await checkDatasetLimit({
@@ -22,6 +22,7 @@ import { getLLMModel, getVectorModel } from '@fastgpt/service/core/ai/model';
 import { hashStr } from '@fastgpt/global/common/string/tools';
 import { startTrainingQueue } from '@/service/core/dataset/training/utils';
 import { MongoRawTextBuffer } from '@fastgpt/service/common/buffer/rawText/schema';
+import { rawText2Chunks } from '@fastgpt/service/core/dataset/read';
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
   const {
@@ -51,8 +52,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
     fileId
   });
   // 2. split chunks
-  const { chunks } = splitText2Chunks({
-    text: rawText,
+  const chunks = rawText2Chunks({
+    rawText,
     chunkLen: chunkSize,
     overlapRatio: trainingType === TrainingModeEnum.chunk ? 0.2 : 0,
     customReg: chunkSplitter ? [chunkSplitter] : []
@@ -110,8 +111,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
       trainingMode: trainingType,
       prompt: qaPrompt,
       billId,
-      data: chunks.map((text, index) => ({
-        q: text,
+      data: chunks.map((item, index) => ({
+        ...item,
         chunkIndex: index
       })),
       session
@@ -1,79 +1,60 @@
-import type { NextApiRequest, NextApiResponse } from 'next';
-import { jsonRes } from '@fastgpt/service/common/response';
-import { connectToDatabase } from '@/service/mongo';
-import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
+import type { NextApiResponse } from 'next';
 import { authFile } from '@fastgpt/service/support/permission/auth/file';
-import { PostPreviewFilesChunksProps } from '@/global/core/dataset/api';
-import { readFileContentFromMongo } from '@fastgpt/service/common/file/gridfs/controller';
-import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
-import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
-import { parseCsvTable2Chunks } from '@fastgpt/service/core/dataset/training/utils';
+import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
+import { authCert } from '@fastgpt/service/support/permission/auth/common';
+import { NextAPI } from '@/service/middle/entry';
+import { ApiRequestProps } from '@fastgpt/service/type/next';
 
-export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
-  try {
-    await connectToDatabase();
+export type PostPreviewFilesChunksProps = {
+  type: DatasetSourceReadTypeEnum;
+  sourceId: string;
+  chunkSize: number;
+  overlapRatio: number;
+  customSplitChar?: string;
+  selector?: string;
+  isQAImport?: boolean;
+};
+export type PreviewChunksResponse = {
+  q: string;
+  a: string;
+}[];
 
-    const { type, sourceId, chunkSize, customSplitChar, overlapRatio } =
-      req.body as PostPreviewFilesChunksProps;
+async function handler(
+  req: ApiRequestProps<PostPreviewFilesChunksProps>,
+  res: NextApiResponse<any>
+): Promise<PreviewChunksResponse> {
+  const { type, sourceId, chunkSize, customSplitChar, overlapRatio, selector, isQAImport } =
+    req.body;
 
-    if (!sourceId) {
-      throw new Error('fileIdList is empty');
-    }
-    if (chunkSize > 30000) {
-      throw new Error('chunkSize is too large, should be less than 30000');
-    }
-
-    const { chunks } = await (async () => {
-      if (type === ImportDataSourceEnum.fileLocal) {
-        const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
-        const fileId = String(file._id);
-
-        const { rawText } = await readFileContentFromMongo({
-          teamId,
-          bucketName: BucketNameEnum.dataset,
-          fileId,
-          csvFormat: true
-        });
-        // split chunks (5 chunk)
-        const { chunks } = splitText2Chunks({
-          text: rawText,
-          chunkLen: chunkSize,
-          overlapRatio,
-          customReg: customSplitChar ? [customSplitChar] : []
-        });
-
-        return {
-          chunks: chunks.map((item) => ({
-            q: item,
-            a: ''
-          }))
-        };
-      }
-      if (type === ImportDataSourceEnum.csvTable) {
-        const { file, teamId } = await authFile({ req, authToken: true, fileId: sourceId });
-        const fileId = String(file._id);
-        const { rawText } = await readFileContentFromMongo({
-          teamId,
-          bucketName: BucketNameEnum.dataset,
-          fileId,
-          csvFormat: false
-        });
-        const { chunks } = parseCsvTable2Chunks(rawText);
-
-        return {
-          chunks: chunks || []
-        };
-      }
-      return { chunks: [] };
-    })();
-
-    jsonRes<{ q: string; a: string }[]>(res, {
-      data: chunks.slice(0, 5)
-    });
-  } catch (error) {
-    jsonRes(res, {
-      code: 500,
-      error
-    });
+  if (!sourceId) {
+    throw new Error('sourceId is empty');
   }
+  if (chunkSize > 30000) {
+    throw new Error('chunkSize is too large, should be less than 30000');
+  }
+
+  const { teamId } = await (async () => {
+    if (type === DatasetSourceReadTypeEnum.fileLocal) {
+      return authFile({ req, authToken: true, authApiKey: true, fileId: sourceId });
+    }
+    return authCert({ req, authApiKey: true, authToken: true });
+  })();
+
+  const rawText = await readDatasetSourceRawText({
+    teamId,
+    type,
+    sourceId: sourceId,
+    selector,
+    isQAImport
+  });
+
+  return rawText2Chunks({
+    rawText,
+    chunkLen: chunkSize,
+    overlapRatio,
+    customReg: customSplitChar ? [customSplitChar] : [],
+    isQAImport: isQAImport
+  }).slice(0, 5);
 }
+export default NextAPI(handler);
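An illustrative request body for the reworked preview endpoint (values are placeholders; the web client posts this to /core/dataset/file/getPreviewChunks, as the api wrapper further down shows):

const body: PostPreviewFilesChunksProps = {
  type: DatasetSourceReadTypeEnum.link,
  sourceId: 'https://example.com/doc', // placeholder URL
  chunkSize: 512,
  overlapRatio: 0.2,
  selector: 'article',
  isQAImport: false
};
// response: PreviewChunksResponse, at most the first 5 { q, a } chunks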
@@ -16,8 +16,10 @@ import { useAppStore } from '@/web/core/app/store/useAppStore';
 import PermissionIconText from '@/components/support/permission/IconText';
 import { useUserStore } from '@/web/support/user/useUserStore';
 import { useI18n } from '@/web/context/I18n';
+import { useTranslation } from 'next-i18next';
 
 const MyApps = () => {
+  const { t } = useTranslation();
   const { toast } = useToast();
   const { appT, commonT } = useI18n();
 
@@ -46,12 +48,12 @@ const MyApps = () => {
         loadMyApps(true);
       } catch (err: any) {
         toast({
-          title: err?.message || '删除失败',
+          title: err?.message || t('common.Delete Failed'),
           status: 'error'
         });
       }
     },
-    [toast, loadMyApps]
+    [toast, loadMyApps, t]
   );
 
   /* 加载模型 */
|
|||||||
import { getErrText } from '@fastgpt/global/common/error/utils';
|
import { getErrText } from '@fastgpt/global/common/error/utils';
|
||||||
import { useContextSelector } from 'use-context-selector';
|
import { useContextSelector } from 'use-context-selector';
|
||||||
import { DatasetImportContext } from '../Context';
|
import { DatasetImportContext } from '../Context';
|
||||||
|
import { importType2ReadType } from '@fastgpt/global/core/dataset/read';
|
||||||
|
|
||||||
const PreviewChunks = ({
|
const PreviewChunks = ({
|
||||||
previewSource,
|
previewSource,
|
||||||
@@ -27,19 +28,7 @@ const PreviewChunks = ({
|
|||||||
const { data = [], isLoading } = useQuery(
|
const { data = [], isLoading } = useQuery(
|
||||||
['previewSource'],
|
['previewSource'],
|
||||||
() => {
|
() => {
|
||||||
if (
|
if (importSource === ImportDataSourceEnum.fileCustom) {
|
||||||
importSource === ImportDataSourceEnum.fileLocal ||
|
|
||||||
importSource === ImportDataSourceEnum.csvTable ||
|
|
||||||
importSource === ImportDataSourceEnum.fileLink
|
|
||||||
) {
|
|
||||||
return getPreviewChunks({
|
|
||||||
type: importSource,
|
|
||||||
sourceId: previewSource.dbFileId || previewSource.link || '',
|
|
||||||
chunkSize,
|
|
||||||
overlapRatio: chunkOverlapRatio,
|
|
||||||
customSplitChar: processParamsForm.getValues('customSplitChar')
|
|
||||||
});
|
|
||||||
} else if (importSource === ImportDataSourceEnum.fileCustom) {
|
|
||||||
const customSplitChar = processParamsForm.getValues('customSplitChar');
|
const customSplitChar = processParamsForm.getValues('customSplitChar');
|
||||||
const { chunks } = splitText2Chunks({
|
const { chunks } = splitText2Chunks({
|
||||||
text: previewSource.rawText || '',
|
text: previewSource.rawText || '',
|
||||||
@@ -52,7 +41,27 @@ const PreviewChunks = ({
|
|||||||
a: ''
|
a: ''
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
return [];
|
if (importSource === ImportDataSourceEnum.csvTable) {
|
||||||
|
return getPreviewChunks({
|
||||||
|
type: importType2ReadType(importSource),
|
||||||
|
sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
|
||||||
|
chunkSize,
|
||||||
|
overlapRatio: chunkOverlapRatio,
|
||||||
|
customSplitChar: processParamsForm.getValues('customSplitChar'),
|
||||||
|
selector: processParamsForm.getValues('webSelector'),
|
||||||
|
isQAImport: true
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return getPreviewChunks({
|
||||||
|
type: importType2ReadType(importSource),
|
||||||
|
sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
|
||||||
|
chunkSize,
|
||||||
|
overlapRatio: chunkOverlapRatio,
|
||||||
|
customSplitChar: processParamsForm.getValues('customSplitChar'),
|
||||||
|
selector: processParamsForm.getValues('webSelector'),
|
||||||
|
isQAImport: false
|
||||||
|
});
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
onError(err) {
|
onError(err) {
|
||||||
|
@@ -9,6 +9,7 @@ import { useToast } from '@fastgpt/web/hooks/useToast';
 import { getErrText } from '@fastgpt/global/common/error/utils';
 import { useContextSelector } from 'use-context-selector';
 import { DatasetImportContext } from '../Context';
+import { importType2ReadType } from '@fastgpt/global/core/dataset/read';
 
 const PreviewRawText = ({
   previewSource,
@@ -18,32 +19,30 @@ const PreviewRawText = ({
   onClose: () => void;
 }) => {
   const { toast } = useToast();
-  const { importSource } = useContextSelector(DatasetImportContext, (v) => v);
+  const { importSource, processParamsForm } = useContextSelector(DatasetImportContext, (v) => v);
 
   const { data, isLoading } = useQuery(
-    ['previewSource', previewSource?.dbFileId],
+    ['previewSource', previewSource.dbFileId, previewSource.link, previewSource.sourceUrl],
     () => {
-      if (importSource === ImportDataSourceEnum.fileLocal && previewSource.dbFileId) {
-        return getPreviewFileContent({
-          fileId: previewSource.dbFileId,
-          csvFormat: true
-        });
+      if (importSource === ImportDataSourceEnum.fileCustom && previewSource.rawText) {
+        return {
+          previewContent: previewSource.rawText.slice(0, 3000)
+        };
       }
       if (importSource === ImportDataSourceEnum.csvTable && previewSource.dbFileId) {
         return getPreviewFileContent({
-          fileId: previewSource.dbFileId,
-          csvFormat: false
+          type: importType2ReadType(importSource),
+          sourceId: previewSource.dbFileId,
+          isQAImport: true
         });
       }
-      if (importSource === ImportDataSourceEnum.fileCustom) {
-        return {
-          previewContent: (previewSource.rawText || '').slice(0, 3000)
-        };
-      }
 
-      return {
-        previewContent: ''
-      };
+      return getPreviewFileContent({
+        type: importType2ReadType(importSource),
+        sourceId: previewSource.dbFileId || previewSource.link || previewSource.sourceUrl || '',
+        isQAImport: false,
+        selector: processParamsForm.getValues('webSelector')
+      });
     },
     {
       onError(err) {
@@ -162,7 +162,7 @@ const CustomLinkInput = () => {
         {commonT('Add new')}
       </Button>
       <Button
-        isDisabled={list.length === 0}
+        isDisabled={list.filter((item) => !!item.sourceUrl).length === 0}
         onClick={handleSubmit((data) => {
           setSources(
             data.list
@@ -23,7 +23,7 @@ const LinkCollection = () => {
   return (
     <>
       {activeStep === 0 && <CustomLinkImport />}
-      {activeStep === 1 && <DataProcess showPreviewChunks={false} />}
+      {activeStep === 1 && <DataProcess showPreviewChunks />}
       {activeStep === 2 && <Upload />}
     </>
   );
@@ -29,7 +29,8 @@ const FileLocal = () => {
 
 export default React.memo(FileLocal);
 
-const csvTemplate = `"第一列内容","第二列内容"
+const csvTemplate = `index,content
+"第一列内容","第二列内容"
 "必填列","可选列。CSV 中请注意内容不能包含双引号,双引号是列分割符号"
 "只会将第一和第二列内容导入,其余列会被忽略",""
 "结合人工智能的演进历程,AIGC的发展大致可以分为三个阶段,即:早期萌芽阶段(20世纪50年代至90年代中期)、沉淀积累阶段(20世纪90年代中期至21世纪10年代中期),以及快速发展展阶段(21世纪10年代中期至今)。",""
@@ -123,7 +123,9 @@ export async function checkInvalidDatasetData(start: Date, end: Date) {
         continue;
       }
     } catch (error) {}
-    console.log(++index);
+    if (++index % 100 === 0) {
+      console.log(index);
+    }
   }
 }
 
@@ -1,3 +1,4 @@
+import type { PreviewContextProps } from '@/pages/api/common/file/previewContent';
 import { GET, POST } from '@/web/common/api/request';
 import type { UploadImgProps } from '@fastgpt/global/common/file/api.d';
 import { AxiosProgressEvent } from 'axios';
@@ -16,7 +17,7 @@ export const postUploadFiles = (
     }
   });
 
-export const getPreviewFileContent = (data: { fileId: string; csvFormat: boolean }) =>
+export const getPreviewFileContent = (data: PreviewContextProps) =>
   POST<{
     previewContent: string;
     totalLength: number;
@@ -22,7 +22,6 @@ import type {
 import type {
   GetTrainingQueueProps,
   GetTrainingQueueResponse,
-  PostPreviewFilesChunksProps,
   SearchTestProps,
   SearchTestResponse
 } from '@/global/core/dataset/api.d';
@@ -41,6 +40,10 @@ import type { DatasetCollectionsListItemType } from '@/global/core/dataset/type.
 import { PagingData } from '@/types';
 import type { getDatasetTrainingQueueResponse } from '@/pages/api/core/dataset/training/getDatasetTrainingQueue';
 import type { rebuildEmbeddingBody } from '@/pages/api/core/dataset/training/rebuildEmbedding';
+import type {
+  PostPreviewFilesChunksProps,
+  PreviewChunksResponse
+} from '@/pages/api/core/dataset/file/getPreviewChunks';
 
 /* ======================== dataset ======================= */
 export const getDatasets = (data: { parentId?: string; type?: DatasetTypeEnum }) =>
@@ -139,7 +142,7 @@ export const getDatasetTrainingQueue = (datasetId: string) =>
   });
 
 export const getPreviewChunks = (data: PostPreviewFilesChunksProps) =>
-  POST<{ q: string; a: string }[]>('/core/dataset/file/getPreviewChunks', data);
+  POST<PreviewChunksResponse>('/core/dataset/file/getPreviewChunks', data);
 
 /* ================== file ======================== */
 export const getFileViewUrl = (fileId: string) =>