4.6.7 fix (#752)

Archer
2024-01-19 20:16:08 +08:00
committed by GitHub
parent c031e6dcc9
commit 5e2adb22f0
37 changed files with 420 additions and 293 deletions

View File

@@ -17,6 +17,11 @@ weight: 707
| 5M vector groups | 8c32g | 16c64g 200GB |
{{< /table >}}
+ ## Deployment architecture
+ ![](/imgs/sealos-fastgpt.webp)
### 1. Prepare a proxy environment (servers outside mainland China can skip this)
Make sure OpenAI is reachable. See the [proxy guide](/docs/development/proxy/) for options, or [deploy OneAPI](/docs/development/one-api) directly on Sealos, which solves the proxy problem and also provides multi-key rotation and access to other large models.

View File

@@ -19,6 +19,10 @@ images: []
## General questions
+ ### Can it run fully locally?
+ Yes. You need to have a vector model and an LLM model ready.
### insufficient_user_quota user quota is not enough
The OneAPI account balance is insufficient. By default the root user only has $200, which can be edited manually.
@@ -105,7 +109,7 @@ mongo连接失败检查
### TypeError: Cannot read properties of null (reading 'useMemo')
- Try Node 18; the latest Node may have problems. Local development flow:
+ Delete all `node_modules` and reinstall with Node 18; the latest Node may have problems. Local development flow:
1. In the project root: `pnpm i`
2. Copy `config.json` -> `config.local.json`

View File

@@ -3,10 +3,12 @@ export const fileImgs = [
  { suffix: 'csv', src: 'file/fill/csv' },
  { suffix: '(doc|docs)', src: 'file/fill/doc' },
  { suffix: 'txt', src: 'file/fill/txt' },
- { suffix: 'md', src: 'file/fill/markdown' }
+ { suffix: 'md', src: 'file/fill/markdown' },
+ { suffix: 'html', src: 'file/fill/html' }
  // { suffix: '.', src: '/imgs/files/file.svg' }
];
- export function getFileIcon(name = '', defaultImg = '/imgs/files/file.svg') {
+ export function getFileIcon(name = '', defaultImg = 'file/fill/file') {
  return fileImgs.find((item) => new RegExp(item.suffix, 'gi').test(name))?.src || defaultImg;
}
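A minimal usage sketch of the updated helper; the file names and the import path are hypothetical, the results follow the entries and default shown above:

```ts
import { getFileIcon } from './fileImgs'; // hypothetical path to the module above

getFileIcon('notes.md'); // -> 'file/fill/markdown'
getFileIcon('index.html'); // -> 'file/fill/html' (new entry)
getFileIcon('archive.zip'); // -> 'file/fill/file' (new default instead of '/imgs/files/file.svg')
```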

View File

@@ -51,19 +51,18 @@ export const uploadMarkdownBase64 = async ({
  // match base64, upload and replace it
  const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
  const base64Arr = rawText.match(base64Regex) || [];
-   // upload base64 and replace it
-   await Promise.all(
-     base64Arr.map(async (base64Img) => {
-       try {
-         const str = await uploadImgController(base64Img);
-         rawText = rawText.replace(base64Img, str);
-       } catch (error) {
-         rawText = rawText.replace(base64Img, '');
-         rawText = rawText.replace(/!\[.*\]\(\)/g, '');
-       }
-     })
-   );
+   // upload base64 and replace it
+   for await (const base64Img of base64Arr) {
+     try {
+       const str = await uploadImgController(base64Img);
+       rawText = rawText.replace(base64Img, str);
+     } catch (error) {
+       rawText = rawText.replace(base64Img, '');
+       rawText = rawText.replace(/!\[.*\]\(\)/g, '');
+     }
+   }
}

// Remove white space on both sides of the picture

View File

@@ -48,10 +48,6 @@ export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
  name: string;
  rawTextLength: number;
  hashRawText: string;
-   trainingType: `${TrainingModeEnum}`;
-   chunkSize: number;
-   chunkSplitter: string;
-   qaPrompt: string;
  fileMetadata?: Record<string, any>;
  collectionMetadata?: Record<string, any>;
@@ -74,3 +70,14 @@ export type PostWebsiteSyncParams = {
  datasetId: string;
  billId: string;
};
export type PushDatasetDataProps = {
collectionId: string;
data: PushDatasetDataChunkProps[];
trainingMode: `${TrainingModeEnum}`;
prompt?: string;
billId?: string;
};
export type PushDatasetDataResponse = {
insertLen: number;
};
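A hypothetical request payload matching the relocated type; the shape of the chunk items (q, a, chunkIndex) is inferred from how the training queue consumes them later in this commit:

```ts
const body: PushDatasetDataProps = {
  collectionId: '65aa0b1f2c3d4e5f6a7b8c9d', // hypothetical collection id
  trainingMode: 'chunk',
  data: [
    { q: 'What is FastGPT?', a: 'A knowledge-base QA platform built on LLMs.', chunkIndex: 0 }
  ]
};
```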

View File

@@ -21,7 +21,7 @@ export type UpdateDatasetDataProps = {
};

export type PatchIndexesProps = {
-   type: 'create' | 'update' | 'delete';
+   type: 'create' | 'update' | 'delete' | 'unChange';
  index: Omit<DatasetDataIndexItemType, 'dataId'> & {
    dataId?: string;
  };

View File

@@ -46,8 +46,17 @@ export async function readMongoImg({ id }: { id: string }) {
  return data?.binary;
}

- export async function delImgByRelatedId(relateIds: string[]) {
+ export async function delImgByRelatedId({
+   teamId,
+   relateIds
+ }: {
+   teamId: string;
+   relateIds: string[];
+ }) {
+   if (relateIds.length === 0) return;
  return MongoImage.deleteMany({
+     teamId,
    'metadata.relatedId': { $in: relateIds.map((id) => String(id)) }
  });
}

View File

@@ -34,9 +34,8 @@ const ImageSchema = new Schema({
try {
  ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
  ImageSchema.index({ type: 1 });
-   ImageSchema.index({ teamId: 1 });
  ImageSchema.index({ createTime: 1 });
-   ImageSchema.index({ 'metadata.relatedId': 1 });
+   ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 });
} catch (error) {
  console.log(error);
}

View File

@@ -28,12 +28,16 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
      // },
      filename: async (req, file, cb) => {
        const { ext } = path.parse(decodeURIComponent(file.originalname));
-         cb(null, `${getNanoid(32)}${ext}`);
+         cb(null, `${getNanoid()}${ext}`);
      }
    })
  }).single('file');

-   async doUpload<T = Record<string, any>>(req: NextApiRequest, res: NextApiResponse) {
+   async doUpload<T = Record<string, any>>(
+     req: NextApiRequest,
+     res: NextApiResponse,
+     originBuckerName?: `${BucketNameEnum}`
+   ) {
    return new Promise<{
      file: FileType;
      metadata: Record<string, any>;
@@ -47,7 +51,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
        }

        // check bucket name
-         const bucketName = req.body?.bucketName as `${BucketNameEnum}`;
+         const bucketName = (req.body?.bucketName || originBuckerName) as `${BucketNameEnum}`;
        if (bucketName && !bucketNameMap[bucketName]) {
          return reject('BucketName is invalid');
        }
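A hedged sketch of a call site for the extended doUpload signature; the handler context is assumed, and the third argument only acts as a fallback when the multipart body carries no bucketName field:

```ts
// Sketch: upload a file into the dataset bucket by default.
const upload = getUploadModel({ maxSize: 500 });
const { file, metadata } = await upload.doUpload(req, res, BucketNameEnum.dataset);
```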

View File

@@ -39,14 +39,15 @@ export const insertDatasetDataVector = async (
  }
): Promise<{ insertId: string }> => {
  const { teamId, datasetId, collectionId, vectors, retry = 3 } = props;

  try {
    const { rows } = await PgClient.insert(PgDatasetTableName, {
      values: [
        [
          { key: 'vector', value: `[${vectors[0]}]` },
          { key: 'team_id', value: String(teamId) },
-           { key: 'dataset_id', value: datasetId },
-           { key: 'collection_id', value: collectionId }
+           { key: 'dataset_id', value: String(datasetId) },
+           { key: 'collection_id', value: String(collectionId) }
        ]
      ]
    });
@@ -176,8 +177,8 @@ export const getVectorDataByTime = async (start: Date, end: Date) => {
  `);

  return rows.map((item) => ({
-     id: item.id,
-     datasetId: item.dataset_id,
-     teamId: item.team_id
+     id: String(item.id),
+     teamId: item.team_id,
+     datasetId: item.dataset_id
  }));
};

View File

@@ -89,6 +89,7 @@ try {
  close custom feedback;
*/
ChatItemSchema.index({ appId: 1, chatId: 1, dataId: 1 }, { background: true });
+ ChatItemSchema.index({ time: -1 }, { background: true });
ChatItemSchema.index({ userGoodFeedback: 1 }, { background: true });
ChatItemSchema.index({ userBadFeedback: 1 }, { background: true });
ChatItemSchema.index({ customFeedbacks: 1 }, { background: true });

View File

@@ -25,7 +25,7 @@ export async function createOneCollection({
  type,
  trainingType = TrainingModeEnum.chunk,
-   chunkSize = 0,
+   chunkSize = 512,
  chunkSplitter,
  qaPrompt,
@@ -134,7 +134,10 @@ export async function delCollectionAndRelatedSources({
  // delete file and imgs
  await Promise.all([
-     delImgByRelatedId(relatedImageIds),
+     delImgByRelatedId({
+       teamId,
+       relateIds: relatedImageIds
+     }),
    delFileByFileIdList({
      bucketName: BucketNameEnum.dataset,
      fileIdList

View File

@@ -1,5 +1,15 @@
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetTraining } from './schema';
import type {
PushDatasetDataChunkProps,
PushDatasetDataProps,
PushDatasetDataResponse
} from '@fastgpt/global/core/dataset/api.d';
import { getCollectionWithDataset } from '../controller';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { simpleText } from '@fastgpt/global/common/string/tools';
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
import type { VectorModelItemType, LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promise<any> => {
  try {
@@ -19,3 +29,165 @@ export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promi
    return Promise.reject(error);
  }
};
export async function pushDataListToTrainingQueue({
teamId,
tmbId,
collectionId,
data,
prompt,
billId,
trainingMode = TrainingModeEnum.chunk,
vectorModelList = [],
qaModelList = []
}: {
teamId: string;
tmbId: string;
vectorModelList: VectorModelItemType[];
qaModelList: LLMModelItemType[];
} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
const {
datasetId: { _id: datasetId, vectorModel, agentModel }
} = await getCollectionWithDataset(collectionId);
const checkModelValid = async ({ collectionId }: { collectionId: string }) => {
if (!collectionId) return Promise.reject(`CollectionId is empty`);
if (trainingMode === TrainingModeEnum.chunk) {
const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
if (!vectorModelData) {
return Promise.reject(`Model ${vectorModel} is inValid`);
}
return {
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model,
weight: vectorModelData.weight
};
}
if (trainingMode === TrainingModeEnum.qa) {
const qaModelData = qaModelList?.find((item) => item.model === agentModel);
if (!qaModelData) {
return Promise.reject(`Model ${agentModel} is inValid`);
}
return {
maxToken: qaModelData.maxContext * 0.8,
model: qaModelData.model,
weight: 0
};
}
return Promise.reject(`Training mode "${trainingMode}" is inValid`);
};
const { model, maxToken, weight } = await checkModelValid({
collectionId
});
// format q and a, remove empty char
data.forEach((item) => {
item.q = simpleText(item.q);
item.a = simpleText(item.a);
item.indexes = item.indexes
?.map((index) => {
return {
...index,
text: simpleText(index.text)
};
})
.filter(Boolean);
});
// filter repeat or equal content
const set = new Set();
const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
success: [],
overToken: [],
repeat: [],
error: []
};
// filter repeat content
data.forEach((item) => {
if (!item.q) {
filterResult.error.push(item);
return;
}
const text = item.q + item.a;
// count q token
const token = countPromptTokens(item.q);
if (token > maxToken) {
filterResult.overToken.push(item);
return;
}
if (set.has(text)) {
console.log('repeat', item);
filterResult.repeat.push(item);
} else {
filterResult.success.push(item);
set.add(text);
}
});
// insert data to db
const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
try {
const results = await MongoDatasetTraining.insertMany(
dataList.map((item, i) => ({
teamId,
tmbId,
datasetId,
collectionId,
billId,
mode: trainingMode,
prompt,
model,
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex ?? i,
weight: weight ?? 0,
indexes: item.indexes
}))
);
await delay(500);
return results.length;
} catch (error) {
if (retry > 0) {
await delay(500);
return insertData(dataList, retry - 1);
}
return Promise.reject(error);
}
};
let insertLen = 0;
const chunkSize = 50;
const chunkList = filterResult.success.reduce(
(acc, cur) => {
const lastChunk = acc[acc.length - 1];
if (lastChunk.length < chunkSize) {
lastChunk.push(cur);
} else {
acc.push([cur]);
}
return acc;
},
[[]] as PushDatasetDataChunkProps[][]
);
for await (const chunks of chunkList) {
insertLen += await insertData(chunks);
}
delete filterResult.success;
return {
insertLen,
...filterResult
};
}
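The tail of pushDataListToTrainingQueue batches the accepted rows into groups of 50 before calling insertMany. A standalone sketch of the same reduce, run on plain numbers to make the batch sizes visible:

```ts
const items = Array.from({ length: 120 }, (_, i) => i);
const batchSize = 50;

// Same shape as the reduce above: start with one empty batch and
// open a new one whenever the current batch reaches batchSize.
const batches = items.reduce(
  (acc, cur) => {
    const last = acc[acc.length - 1];
    if (last.length < batchSize) {
      last.push(cur);
    } else {
      acc.push([cur]);
    }
    return acc;
  },
  [[]] as number[][]
);

console.log(batches.map((b) => b.length)); // [50, 50, 20]
```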

View File

@@ -52,7 +52,7 @@ const BillSchema = new Schema({
});

try {
-   BillSchema.index({ teamId: 1, tmbId: 1, time: -1 });
+   BillSchema.index({ teamId: 1, time: -1 });
  BillSchema.index({ time: 1 }, { expireAfterSeconds: 180 * 24 * 60 * 60 });
} catch (error) {
  console.log(error);

View File

@@ -0,0 +1,40 @@
import Papa from 'papaparse';
import { readFileRawText } from './rawText';
/**
* read csv to json
* @response {
* header: string[],
* data: string[][]
* }
*/
export const readCsvContent = async ({ file }: { file: File }) => {
try {
const { rawText: textArr } = await readFileRawText(file);
const csvArr = Papa.parse(textArr).data as string[][];
if (csvArr.length === 0) {
throw new Error('csv 解析失败');
}
const header = csvArr.shift() as string[];
// add title to data
const rawText = csvArr
.map((item) =>
item.map((value, index) => {
if (!header[index]) return value;
return `${header[index]}: ${value}`;
})
)
.flat()
.join('\n');
return {
rawText,
header,
data: csvArr.map((item) => item)
};
} catch (error) {
return Promise.reject('解析 csv 文件失败');
}
};
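A worked example of readCsvContent, assuming a small in-memory CSV built with the browser File constructor (data is illustrative):

```ts
const csvFile = new File(
  ['question,answer\nWhat is FastGPT?,An LLM knowledge-base platform'],
  'demo.csv',
  { type: 'text/csv' }
);

const { header, data, rawText } = await readCsvContent({ file: csvFile });
// header  -> ['question', 'answer']
// data    -> [['What is FastGPT?', 'An LLM knowledge-base platform']]
// rawText -> 'question: What is FastGPT?\nanswer: An LLM knowledge-base platform'
```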

View File

@@ -1,4 +1,5 @@
import { loadFile2Buffer } from '../utils';
+ import { readCsvContent } from './csv';
import { readHtmlFile } from './html';
import { readMdFile } from './md';
import { readPdfFile } from './pdf';
@@ -29,6 +30,8 @@ export const readFileRawContent = async ({
        file,
        uploadImgController: uploadBase64Controller
      });
+     case 'csv':
+       return readCsvContent({ file });
    case 'pdf':
      const pdf = await loadFile2Buffer({ file });
      return readPdfFile({ pdf });

View File

@@ -74,7 +74,7 @@ const JSONEditor = ({ defaultValue, value, onChange, resize, ...props }: Props)
    <Box
      borderWidth={'1px'}
-       borderRadius={'base'}
+       borderRadius={'md'}
      borderColor={'myGray.200'}
      py={2}
      {...props}

View File

@@ -22,13 +22,15 @@
"react-dom": "18.2.0", "react-dom": "18.2.0",
"react-i18next": "^12.3.1", "react-i18next": "^12.3.1",
"turndown": "^7.1.2", "turndown": "^7.1.2",
"lexical":"0.12.6", "lexical": "0.12.6",
"@lexical/react": "0.12.6", "@lexical/react": "0.12.6",
"papaparse": "^5.4.1",
"@lexical/utils": "0.12.6", "@lexical/utils": "0.12.6",
"@lexical/text": "0.12.6" "@lexical/text": "0.12.6"
}, },
"devDependencies": { "devDependencies": {
"@types/react": "18.2.0", "@types/react": "18.2.0",
"@types/papaparse": "^5.3.7",
"@types/react-dom": "18.2.0", "@types/react-dom": "18.2.0",
"@types/turndown": "^5.0.4" "@types/turndown": "^5.0.4"
} }

pnpm-lock.yaml generated
View File

@@ -196,6 +196,9 @@ importers:
      next-i18next:
        specifier: ^13.3.0
        version: registry.npmmirror.com/next-i18next@13.3.0(i18next@22.5.1)(next@13.5.2)(react-i18next@12.3.1)(react@18.2.0)
+       papaparse:
+         specifier: ^5.4.1
+         version: registry.npmmirror.com/papaparse@5.4.1
      pdfjs-dist:
        specifier: ^4.0.269
        version: registry.npmmirror.com/pdfjs-dist@4.0.269
@@ -212,6 +215,9 @@ importers:
        specifier: ^7.1.2
        version: registry.npmmirror.com/turndown@7.1.2
    devDependencies:
+       '@types/papaparse':
+         specifier: ^5.3.7
+         version: registry.npmmirror.com/@types/papaparse@5.3.7
      '@types/react':
        specifier: 18.2.0
        version: registry.npmmirror.com/@types/react@18.2.0
@@ -323,9 +329,6 @@ importers:
      nprogress:
        specifier: ^0.2.0
        version: registry.npmmirror.com/nprogress@0.2.0
-       papaparse:
-         specifier: ^5.4.1
-         version: registry.npmmirror.com/papaparse@5.4.1
      react:
        specifier: 18.2.0
        version: registry.npmmirror.com/react@18.2.0
@@ -390,9 +393,6 @@ importers:
      '@types/node':
        specifier: ^20.8.5
        version: registry.npmmirror.com/@types/node@20.8.5
-       '@types/papaparse':
-         specifier: ^5.3.7
-         version: registry.npmmirror.com/@types/papaparse@5.3.7
      '@types/react':
        specifier: 18.2.0
        version: registry.npmmirror.com/@types/react@18.2.0

View File

@@ -42,7 +42,6 @@
"next": "13.5.2", "next": "13.5.2",
"next-i18next": "^13.3.0", "next-i18next": "^13.3.0",
"nprogress": "^0.2.0", "nprogress": "^0.2.0",
"papaparse": "^5.4.1",
"react": "18.2.0", "react": "18.2.0",
"react-day-picker": "^8.7.1", "react-day-picker": "^8.7.1",
"react-dom": "18.2.0", "react-dom": "18.2.0",
@@ -66,7 +65,6 @@
"@types/jsonwebtoken": "^9.0.3", "@types/jsonwebtoken": "^9.0.3",
"@types/lodash": "^4.14.191", "@types/lodash": "^4.14.191",
"@types/node": "^20.8.5", "@types/node": "^20.8.5",
"@types/papaparse": "^5.3.7",
"@types/react": "18.2.0", "@types/react": "18.2.0",
"@types/react-dom": "18.2.0", "@types/react-dom": "18.2.0",
"@types/react-syntax-highlighter": "^15.5.6", "@types/react-syntax-highlighter": "^15.5.6",

View File

@@ -226,7 +226,7 @@
"Chat test": "测试对话", "Chat test": "测试对话",
"Max Token": "单条数据上限", "Max Token": "单条数据上限",
"Start chat": "立即对话", "Start chat": "立即对话",
"Total chars": "总字数: {{total}}", "Total chars": "总字数: {{total}}",
"Total tokens": "总 Tokens: {{total}}", "Total tokens": "总 Tokens: {{total}}",
"ai": { "ai": {
"Model": "AI 模型", "Model": "AI 模型",
@@ -541,8 +541,7 @@
"success": "开始同步" "success": "开始同步"
} }
}, },
"training": { "training": {}
}
}, },
"data": { "data": {
"Auxiliary Data": "辅助数据", "Auxiliary Data": "辅助数据",

View File

@@ -17,7 +17,7 @@ const ButtonEdge = (props: EdgeProps) => {
    style = {}
  } = props;

-   const [labelX, labelY] = getBezierPath({
+   const [, labelX, labelY] = getBezierPath({
    sourceX,
    sourceY,
    sourcePosition,

View File

@@ -8,6 +8,3 @@ import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type';
/* ======= collection =========== */

/* ==== data ===== */
- export type PushDataResponse = {
-   insertLen: number;
- };

View File

@@ -27,13 +27,7 @@ export type CreateDatasetParams = {
export type InsertOneDatasetDataProps = PushDatasetDataChunkProps & {
  collectionId: string;
};

- export type PushDatasetDataProps = {
-   collectionId: string;
-   data: PushDatasetDataChunkProps[];
-   trainingMode: `${TrainingModeEnum}`;
-   prompt?: string;
-   billId?: string;
- };

export type UpdateDatasetDataProps = {
  id: string;
  q?: string; // embedding content

View File

@@ -16,11 +16,15 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dat
import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
import { pushDataToTrainingQueue } from '@/service/core/dataset/data/controller';
import { hashStr } from '@fastgpt/global/common/string/tools';
+ import { createTrainingBill } from '@fastgpt/service/support/wallet/bill/controller';
+ import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants';
+ import { getQAModel, getVectorModel } from '@/service/core/ai/model';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  try {
    await connectToDatabase();
    const {
+       name,
      text,
      trainingType = TrainingModeEnum.chunk,
      chunkSize = 512,
@@ -29,7 +33,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
      ...body
    } = req.body as TextCreateDatasetCollectionParams;

-     const { teamId, tmbId } = await authDataset({
+     const { teamId, tmbId, dataset } = await authDataset({
      req,
      authToken: true,
      authApiKey: true,
@@ -52,21 +56,32 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
      insertLen: predictDataLimitLength(trainingType, chunks)
    });

-     // 3. create collection
-     const collectionId = await createOneCollection({
-       ...body,
-       teamId,
-       tmbId,
-       type: DatasetCollectionTypeEnum.virtual,
-       trainingType,
-       chunkSize,
-       chunkSplitter,
-       qaPrompt,
-       hashRawText: hashStr(text),
-       rawTextLength: text.length
-     });
+     // 3. create collection and training bill
+     const [collectionId, { billId }] = await Promise.all([
+       createOneCollection({
+         ...body,
+         teamId,
+         tmbId,
+         type: DatasetCollectionTypeEnum.virtual,
+         name,
+         trainingType,
+         chunkSize,
+         chunkSplitter,
+         qaPrompt,
+         hashRawText: hashStr(text),
+         rawTextLength: text.length
+       }),
+       createTrainingBill({
+         teamId,
+         tmbId,
+         appName: name,
+         billSource: BillSourceEnum.training,
+         vectorModel: getVectorModel(dataset.vectorModel)?.name,
+         agentModel: getQAModel(dataset.agentModel)?.name
+       })
+     ]);

    // 4. push chunks to training queue
    const insertResults = await pushDataToTrainingQueue({
@@ -74,6 +89,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
      tmbId,
      collectionId,
      trainingMode: trainingType,
+       prompt: qaPrompt,
+       billId,
      data: chunks.map((text, index) => ({
        q: text,
        chunkIndex: index
@@ -90,3 +107,11 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
    });
  }
}
export const config = {
api: {
bodyParser: {
sizeLimit: '10mb'
}
}
};

View File

@@ -3,8 +3,10 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { withNextCors } from '@fastgpt/service/common/middle/cors';
- import type { PushDataResponse } from '@/global/core/api/datasetRes.d';
- import type { PushDatasetDataProps } from '@/global/core/dataset/api.d';
+ import type {
+   PushDatasetDataProps,
+   PushDatasetDataResponse
+ } from '@fastgpt/global/core/dataset/api.d';
import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset';
import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dataset';
import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
@@ -39,7 +41,7 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex
      insertLen: predictDataLimitLength(collection.trainingType, data)
    });

-     jsonRes<PushDataResponse>(res, {
+     jsonRes<PushDatasetDataResponse>(res, {
      data: await pushDataToTrainingQueue({
        ...req.body,
        teamId,

View File

@@ -12,16 +12,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
  const method = (req.method || 'POST') as Method;
  const { path = [], ...query } = req.query as any;

-   const url = `/${path?.join('/')}`;
+   const url = `/${path?.join('/')}?${new URLSearchParams(query).toString()}`;

  if (!url) {
    throw new Error('url is empty');
  }

-   const data = {
-     ...req.body,
-     ...query
-   };
+   const data = req.body || query;

  const repose = await request(
    url,
@@ -56,3 +53,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
    });
  }
}
export const config = {
api: {
bodyParser: {
sizeLimit: '10mb'
},
responseLimit: '10mb'
}
};
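A small sketch of how the forwarded URL is now assembled; the path segments and query values are hypothetical:

```ts
const path = ['v1', 'chat', 'completions'];
const query = { stream: 'false' };

const url = `/${path.join('/')}?${new URLSearchParams(query).toString()}`;
// -> '/v1/chat/completions?stream=false'
```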

View File

@@ -27,7 +27,7 @@ const Upload = dynamic(() => import('../commonProgress/Upload'));
const PreviewRawText = dynamic(() => import('../components/PreviewRawText'));

type FileItemType = ImportSourceItemType & { file: File };

- const fileType = '.txt, .docx, .pdf, .md, .html';
+ const fileType = '.txt, .docx, .csv, .pdf, .md, .html';
const maxSelectFileCount = 1000;

const FileLocal = ({ activeStep, goToNext }: ImportDataComponentProps) => {

View File

@@ -14,7 +14,8 @@ import { useImportStore } from '../Provider';
import { feConfigs } from '@/web/common/system/staticData';
import dynamic from 'next/dynamic';
- import { fileDownload, readCsvContent } from '@/web/common/file/utils';
+ import { fileDownload } from '@/web/common/file/utils';
+ import { readCsvContent } from '@fastgpt/web/common/file/read/csv';

const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
const Upload = dynamic(() => import('../commonProgress/Upload'));
@@ -56,7 +57,7 @@ const SelectFile = React.memo(function SelectFile({ goToNext }: { goToNext: () =
      {
        for await (const selectFile of files) {
          const { file, folderPath } = selectFile;
-           const { header, data } = await readCsvContent(file);
+           const { header, data } = await readCsvContent({ file });
          const filterData: FileItemType['chunks'] = data
            .filter((item) => item[0])

View File

@@ -193,7 +193,10 @@ const InputDataModal = ({
      // not exactly same
      await putDatasetDataById({
        id: dataId,
-         ...e
+         ...e,
+         indexes: e.indexes.map((index) =>
+           index.defaultIndex ? getDefaultIndex({ q: e.q, a: e.a }) : index
+         )
      });

      return {

View File

@@ -35,7 +35,8 @@ import dynamic from 'next/dynamic';
import { useForm } from 'react-hook-form';
import MySelect from '@/components/Select';
import { useSelectFile } from '@/web/common/file/hooks/useSelectFile';
- import { fileDownload, readCsvContent } from '@/web/common/file/utils';
+ import { fileDownload } from '@/web/common/file/utils';
+ import { readCsvContent } from '@fastgpt/web/common/file/read/csv';
import { delay } from '@fastgpt/global/common/system/utils';
import QuoteItem from '@/components/core/dataset/QuoteItem';
@@ -125,7 +126,7 @@ const Test = ({ datasetId }: { datasetId: string }) => {
  const { mutate: onFileTest, isLoading: fileTestIsLoading } = useRequest({
    mutationFn: async ({ searchParams }: FormType) => {
      if (!selectFile) return Promise.reject('File is not selected');
-       const { data } = await readCsvContent(selectFile);
+       const { data } = await readCsvContent({ file: selectFile });
      const testList = data.slice(0, 100);
      const results: SearchTestResponse[] = [];

View File

@@ -3,6 +3,11 @@ import { generateQA } from '@/service/events/generateQA';
import { generateVector } from '@/service/events/generateVector';
import { setCron } from '@fastgpt/service/common/system/cron';

+ export const startCron = () => {
+   setUpdateSystemConfigCron();
+   setTrainingQueueCron();
+ };

export const setUpdateSystemConfigCron = () => {
  setCron('*/5 * * * *', () => {
    initSystemConfig();
@@ -11,7 +16,7 @@ export const setUpdateSystemConfigCron = () => {
};

export const setTrainingQueueCron = () => {
-   setCron('*/3 * * * *', () => {
+   setCron('*/1 * * * *', () => {
    generateVector();
    generateQA();
  });
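For readers unfamiliar with cron syntax, a comment-only sketch of the two schedules above; startCron itself is wired up from the MongoDB init path later in this commit:

```ts
// '*/5 * * * *' -> every 5 minutes (refresh system config)
// '*/1 * * * *' -> every minute    (drive the vector / QA training queues, previously every 3 minutes)
startCron();
```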

View File

@@ -9,13 +9,11 @@ import {
  recallFromVectorStore,
  updateDatasetDataVector
} from '@fastgpt/service/common/vectorStore/controller';
- import { Types } from 'mongoose';
import {
  DatasetDataIndexTypeEnum,
  DatasetSearchModeEnum,
  DatasetSearchModeMap,
-   SearchScoreTypeEnum,
-   TrainingModeEnum
+   SearchScoreTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
import { jiebaSplit } from '@/service/common/string/jieba';
@@ -29,172 +27,26 @@ import {
} from '@fastgpt/global/core/dataset/type';
import { reRankRecall } from '../../ai/rerank';
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
- import { hashStr, simpleText } from '@fastgpt/global/common/string/tools';
- import type { PushDatasetDataProps } from '@/global/core/dataset/api.d';
- import type { PushDataResponse } from '@/global/core/api/datasetRes';
- import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
- import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
- import { startQueue } from '@/service/utils/tools';
- import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller';
- import { getQAModel, getVectorModel } from '../../ai/model';
- import { delay } from '@fastgpt/global/common/system/utils';
+ import { hashStr } from '@fastgpt/global/common/string/tools';
+ import type {
+   PushDatasetDataProps,
+   PushDatasetDataResponse
+ } from '@fastgpt/global/core/dataset/api.d';
+ import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';

+ export async function pushDataToTrainingQueue(
+   props: {
+     teamId: string;
+     tmbId: string;
+   } & PushDatasetDataProps
+ ): Promise<PushDatasetDataResponse> {
+   const result = await pushDataListToTrainingQueue({
+     ...props,
+     vectorModelList: global.vectorModels,
+     qaModelList: global.qaModels
+   });
+   return result;
+ }

- export async function pushDataToTrainingQueue({
-   teamId,
-   tmbId,
-   collectionId,
-   data,
-   prompt,
-   billId,
-   trainingMode
- }: {
-   teamId: string;
tmbId: string;
} & PushDatasetDataProps): Promise<PushDataResponse> {
const checkModelValid = async ({ collectionId }: { collectionId: string }) => {
const {
datasetId: { _id: datasetId, vectorModel, agentModel }
} = await getCollectionWithDataset(collectionId);
if (trainingMode === TrainingModeEnum.chunk) {
if (!collectionId) return Promise.reject(`CollectionId is empty`);
const vectorModelData = getVectorModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(`Model ${vectorModel} is inValid`);
}
return {
datasetId,
maxToken: vectorModelData.maxToken * 1.5,
model: vectorModelData.model,
weight: vectorModelData.weight
};
}
if (trainingMode === TrainingModeEnum.qa) {
const qaModelData = getQAModel(agentModel);
if (!qaModelData) {
return Promise.reject(`Model ${agentModel} is inValid`);
}
return {
datasetId,
maxToken: qaModelData.maxContext * 0.8,
model: qaModelData.model,
weight: 0
};
}
return Promise.reject(`Mode ${trainingMode} is inValid`);
};
const { datasetId, model, maxToken, weight } = await checkModelValid({
collectionId
  });

  // format q and a, remove empty char
data.forEach((item) => {
item.q = simpleText(item.q);
item.a = simpleText(item.a);
item.indexes = item.indexes
?.map((index) => {
return {
...index,
text: simpleText(index.text)
};
})
.filter(Boolean);
});
// filter repeat or equal content
const set = new Set();
const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
success: [],
overToken: [],
repeat: [],
error: []
};
data.forEach((item) => {
if (!item.q) {
filterResult.error.push(item);
return;
}
const text = item.q + item.a;
// count q token
const token = countPromptTokens(item.q);
if (token > maxToken) {
filterResult.overToken.push(item);
return;
}
if (set.has(text)) {
console.log('repeat', item);
filterResult.repeat.push(item);
} else {
filterResult.success.push(item);
set.add(text);
}
});
  // insert records
const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
try {
const results = await MongoDatasetTraining.insertMany(
dataList.map((item, i) => ({
teamId,
tmbId,
datasetId,
collectionId,
billId,
mode: trainingMode,
prompt,
model,
q: item.q,
a: item.a,
chunkIndex: item.chunkIndex ?? i,
weight: weight ?? 0,
indexes: item.indexes
}))
);
await delay(500);
return results.length;
} catch (error) {
if (retry > 0) {
await delay(1000);
return insertData(dataList, retry - 1);
}
return Promise.reject(error);
}
};
let insertLen = 0;
const chunkSize = 50;
const chunkList = filterResult.success.reduce(
(acc, cur) => {
const lastChunk = acc[acc.length - 1];
if (lastChunk.length < chunkSize) {
lastChunk.push(cur);
} else {
acc.push([cur]);
}
return acc;
},
[[]] as PushDatasetDataChunkProps[][]
);
for await (const chunks of chunkList) {
insertLen += await insertData(chunks);
}
startQueue();
delete filterResult.success;
return {
insertLen,
...filterResult
};
}
/* insert data.
@@ -341,6 +193,11 @@ export async function updateData2Dataset({
          text: qaStr
        }
      });
+     } else {
+       patchResult.push({
+         type: 'unChange',
+         index: item
+       });
    }
  } else {
    // not in database, create
@@ -379,6 +236,7 @@ export async function updateData2Dataset({
        model
      });
      item.index.dataId = result.insertId;
      return result;
    }
    if (item.type === 'delete' && item.index.dataId) {
@@ -397,13 +255,14 @@ export async function updateData2Dataset({
  );

  const charsLength = result.reduce((acc, cur) => acc + cur.charsLength, 0);
+   const newIndexes = patchResult.filter((item) => item.type !== 'delete').map((item) => item.index);

  // update mongo other data
  mongoData.q = q || mongoData.q;
  mongoData.a = a ?? mongoData.a;
  mongoData.fullTextToken = jiebaSplit({ text: mongoData.q + mongoData.a });
  // @ts-ignore
-   mongoData.indexes = indexes;
+   mongoData.indexes = newIndexes;
  await mongoData.save();

  return {

View File

@@ -7,7 +7,7 @@ import { createDefaultTeam } from '@fastgpt/service/support/user/team/controller
import { exit } from 'process';
import { initVectorStore } from '@fastgpt/service/common/vectorStore/controller';
import { getInitConfig } from '@/pages/api/common/system/getInitData';
- import { setUpdateSystemConfigCron, setTrainingQueueCron } from './common/system/cron';
+ import { startCron } from './common/system/cron';

/**
 * connect MongoDB and init data
@@ -23,8 +23,7 @@ export function connectToDatabase(): Promise<void> {
    getInitConfig();

    // cron
-     setUpdateSystemConfigCron();
-     setTrainingQueueCron();
+     startCron();

    initRootUser();
  }

View File

@@ -32,13 +32,24 @@ export const uploadFiles = ({
  });
};

- export const getUploadBase64ImgController = (props: CompressImgProps & UploadImgProps) =>
-   compressBase64ImgAndUpload({
-     maxW: 4000,
-     maxH: 4000,
-     maxSize: 1024 * 1024 * 5,
-     ...props
-   });
+ export const getUploadBase64ImgController = (
+   props: CompressImgProps & UploadImgProps,
+   retry = 3
+ ): Promise<string> => {
+   try {
+     return compressBase64ImgAndUpload({
+       maxW: 4000,
+       maxH: 4000,
+       maxSize: 1024 * 1024 * 5,
+       ...props
+     });
+   } catch (error) {
+     if (retry > 0) {
+       return getUploadBase64ImgController(props, retry - 1);
+     }
+     return Promise.reject(error);
+   }
+ };
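Note that the try/catch above only sees synchronous throws, since the promise is returned without being awaited. A hedged sketch of a variant that also retries on asynchronous failures, under the same assumed types and renamed to mark it as illustrative:

```ts
// Sketch only: awaiting the upload lets the catch block observe async
// rejections, so the retry path actually runs.
export const getUploadBase64ImgControllerWithRetry = async (
  props: CompressImgProps & UploadImgProps,
  retry = 3
): Promise<string> => {
  try {
    return await compressBase64ImgAndUpload({
      maxW: 4000,
      maxH: 4000,
      maxSize: 1024 * 1024 * 5,
      ...props
    });
  } catch (error) {
    if (retry > 0) {
      return getUploadBase64ImgControllerWithRetry(props, retry - 1);
    }
    return Promise.reject(error);
  }
};
```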
/**
 * compress image. response base64

View File

@@ -1,29 +1,3 @@
import Papa from 'papaparse';
import { readFileRawText } from '@fastgpt/web/common/file/read/rawText';
/**
* read csv to json
* @response {
* header: string[],
* data: string[][]
* }
*/
export const readCsvContent = async (file: File) => {
try {
const { rawText: textArr } = await readFileRawText(file);
const csvArr = Papa.parse(textArr).data as string[][];
if (csvArr.length === 0) {
throw new Error('csv 解析失败');
}
return {
header: csvArr.shift() as string[],
data: csvArr.map((item) => item)
};
} catch (error) {
return Promise.reject('解析 csv 文件失败');
}
};
/*
 * file download by text
 */

View File

@@ -19,12 +19,14 @@ import type {
  SearchTestResponse
} from '@/global/core/dataset/api.d';
import type {
-   PushDatasetDataProps,
  UpdateDatasetDataProps,
  CreateDatasetParams,
  InsertOneDatasetDataProps
} from '@/global/core/dataset/api.d';
- import type { PushDataResponse } from '@/global/core/api/datasetRes.d';
+ import type {
+   PushDatasetDataProps,
+   PushDatasetDataResponse
+ } from '@fastgpt/global/core/dataset/api.d';
import type { DatasetCollectionItemType } from '@fastgpt/global/core/dataset/type';
import {
  DatasetCollectionSyncResultEnum,
@@ -97,7 +99,7 @@ export const getDatasetDataItemById = (id: string) =>
 * push data to training queue
 */
export const postChunks2Dataset = (data: PushDatasetDataProps) =>
-   POST<PushDataResponse>(`/core/dataset/data/pushData`, data);
+   POST<PushDatasetDataResponse>(`/core/dataset/data/pushData`, data);

/**
 * insert one data to dataset (immediately insert)