Mirror of https://github.com/labring/FastGPT.git (synced 2025-07-23 05:12:39 +00:00)

Commit: 4.6.7 fix (#752)
@@ -17,6 +17,11 @@ weight: 707
| 5M vector groups | 8c32g | 16c64g 200GB |
{{< /table >}}

+
+## Deployment architecture diagram
+
+
+
### 1. Prepare a proxy environment (can be skipped for servers outside mainland China)

Make sure OpenAI is reachable. See the [proxy guide](/docs/development/proxy/) for options, or [deploy OneAPI](/docs/development/one-api) on Sealos directly, which solves the proxy problem and also gives you multi-key rotation and access to other LLMs.
@@ -19,6 +19,10 @@ images: []

## General questions

+### Can it run fully locally?
+
+Yes. You need to have a vector (embedding) model and an LLM prepared locally.
+
### insufficient_user_quota user quota is not enough

The OneAPI account balance is insufficient. The default root user only has $200 of quota; it can be edited manually.
@@ -105,7 +109,7 @@ (mongo connection failed — things to check)

### TypeError: Cannot read properties of null (reading 'useMemo')

-Try Node 18; the newest Node releases may have problems. Local development workflow:
+Delete every `node_modules` directory and reinstall with Node 18; the newest Node releases may have problems. Local development workflow:

1. Repo root: `pnpm i`
2. Copy `config.json` -> `config.local.json`
@@ -3,10 +3,12 @@ export const fileImgs = [
  { suffix: 'csv', src: 'file/fill/csv' },
  { suffix: '(doc|docs)', src: 'file/fill/doc' },
  { suffix: 'txt', src: 'file/fill/txt' },
-  { suffix: 'md', src: 'file/fill/markdown' }
+  { suffix: 'md', src: 'file/fill/markdown' },
+  { suffix: 'html', src: 'file/fill/html' }
  // { suffix: '.', src: '/imgs/files/file.svg' }
];

-export function getFileIcon(name = '', defaultImg = '/imgs/files/file.svg') {
+export function getFileIcon(name = '', defaultImg = 'file/fill/file') {
  return fileImgs.find((item) => new RegExp(item.suffix, 'gi').test(name))?.src || defaultImg;
}
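For illustration only (not part of this commit): how the suffix regexes resolve an icon. The file names and the import path are invented.

```ts
// Hypothetical usage of getFileIcon; the module path is an assumption.
import { getFileIcon } from './fileImgs';

console.log(getFileIcon('report.docx')); // matches '(doc|docs)' -> 'file/fill/doc'
console.log(getFileIcon('notes.md'));    // matches 'md'         -> 'file/fill/markdown'
console.log(getFileIcon('index.html'));  // matches 'html'       -> 'file/fill/html' (new entry)
console.log(getFileIcon('archive.zip')); // no match             -> 'file/fill/file' (new default)
```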
@@ -51,19 +51,18 @@ export const uploadMarkdownBase64 = async ({
  // match base64, upload and replace it
  const base64Regex = /data:image\/.*;base64,([^\)]+)/g;
  const base64Arr = rawText.match(base64Regex) || [];
-  // upload base64 and replace it
-  await Promise.all(
-    base64Arr.map(async (base64Img) => {
-      try {
-        const str = await uploadImgController(base64Img);
-
-        rawText = rawText.replace(base64Img, str);
-      } catch (error) {
-        rawText = rawText.replace(base64Img, '');
-        rawText = rawText.replace(/!\[.*\]\(\)/g, '');
-      }
-    })
-  );
+
+  // upload base64 and replace it
+  for await (const base64Img of base64Arr) {
+    try {
+      const str = await uploadImgController(base64Img);
+
+      rawText = rawText.replace(base64Img, str);
+    } catch (error) {
+      rawText = rawText.replace(base64Img, '');
+      rawText = rawText.replace(/!\[.*\]\(\)/g, '');
+    }
+  }

  // Remove white space on both sides of the picture
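The change above swaps a `Promise.all` over all matched images for a sequential `for await` loop, so only one base64 image is uploaded at a time. A minimal self-contained sketch of the same pattern (the `uploadImg` callback is a stand-in, not FastGPT's API):

```ts
// Illustration only: the general shape of the sequential rewrite used above.
async function replaceBase64Images(
  rawText: string,
  uploadImg: (base64: string) => Promise<string>
): Promise<string> {
  const matches = rawText.match(/data:image\/.*;base64,([^\)]+)/g) || [];
  for (const base64Img of matches) {
    try {
      // awaiting inside the loop keeps only one upload in flight at a time
      const url = await uploadImg(base64Img);
      rawText = rawText.replace(base64Img, url);
    } catch {
      // on failure, drop the image and any now-empty markdown image tag
      rawText = rawText.replace(base64Img, '').replace(/!\[.*\]\(\)/g, '');
    }
  }
  return rawText;
}
```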
packages/global/core/dataset/api.d.ts (vendored, 15 changed lines)

@@ -48,10 +48,6 @@ export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams
  name: string;
  rawTextLength: number;
  hashRawText: string;
-  trainingType: `${TrainingModeEnum}`;
-  chunkSize: number;
-  chunkSplitter: string;
-  qaPrompt: string;

  fileMetadata?: Record<string, any>;
  collectionMetadata?: Record<string, any>;
@@ -74,3 +70,14 @@ export type PostWebsiteSyncParams = {
  datasetId: string;
  billId: string;
};
+
+export type PushDatasetDataProps = {
+  collectionId: string;
+  data: PushDatasetDataChunkProps[];
+  trainingMode: `${TrainingModeEnum}`;
+  prompt?: string;
+  billId?: string;
+};
+
+export type PushDatasetDataResponse = {
+  insertLen: number;
+};
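For illustration only (not in this diff): a request body that satisfies the relocated `PushDatasetDataProps` shape. All IDs and text are invented.

```ts
// Hypothetical payload matching PushDatasetDataProps.
const payload = {
  collectionId: '65a0000000000000000000ff',
  trainingMode: 'chunk',
  billId: '65a0000000000000000000aa',
  data: [
    { q: 'What is FastGPT?', a: 'A knowledge-base QA platform built on LLMs.', chunkIndex: 0 },
    { q: 'How do I import a CSV?', a: 'Use the CSV import source on the dataset page.', chunkIndex: 1 }
  ]
};
// A successful push responds with PushDatasetDataResponse, e.g. { insertLen: 2 }.
```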
packages/global/core/dataset/controller.d.ts (vendored, 2 changed lines)

@@ -21,7 +21,7 @@ export type UpdateDatasetDataProps = {
};

export type PatchIndexesProps = {
-  type: 'create' | 'update' | 'delete';
+  type: 'create' | 'update' | 'delete' | 'unChange';
  index: Omit<DatasetDataIndexItemType, 'dataId'> & {
    dataId?: string;
  };
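The new `'unChange'` variant lets the data-update path keep indexes that were neither edited nor deleted (see `updateData2Dataset` further down, which pushes `{ type: 'unChange', index: item }` and rebuilds the stored indexes from everything that is not a delete). A sketch of such a patch list; the field values are invented and only the shape follows `PatchIndexesProps`:

```ts
// Illustrative patch list (values invented).
const patchResult = [
  { type: 'update', index: { dataId: 'pg-id-1', defaultIndex: true, text: 'edited text' } },
  { type: 'unChange', index: { dataId: 'pg-id-2', defaultIndex: false, text: 'untouched text' } },
  { type: 'create', index: { defaultIndex: false, text: 'brand new index text' } }
];

// Everything except deletes survives into the saved indexes:
const newIndexes = patchResult.filter((i) => i.type !== 'delete').map((i) => i.index);
```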
@@ -46,8 +46,17 @@ export async function readMongoImg({ id }: { id: string }) {
  return data?.binary;
}

-export async function delImgByRelatedId(relateIds: string[]) {
+export async function delImgByRelatedId({
+  teamId,
+  relateIds
+}: {
+  teamId: string;
+  relateIds: string[];
+}) {
+  if (relateIds.length === 0) return;
+
  return MongoImage.deleteMany({
+    teamId,
    'metadata.relatedId': { $in: relateIds.map((id) => String(id)) }
  });
}
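Callers now pass the owning team, which scopes the delete and matches the compound `teamId` + `metadata.relatedId` index added in the schema hunk below. Illustrative call (IDs invented):

```ts
// Before: delImgByRelatedId(relatedImageIds);
// After (illustrative IDs):
await delImgByRelatedId({
  teamId: '65a0000000000000000000bb',
  relateIds: ['img-rel-1', 'img-rel-2']
});
```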
@@ -34,9 +34,8 @@ const ImageSchema = new Schema({
try {
  ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
  ImageSchema.index({ type: 1 });
-  ImageSchema.index({ teamId: 1 });
  ImageSchema.index({ createTime: 1 });
-  ImageSchema.index({ 'metadata.relatedId': 1 });
+  ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 });
} catch (error) {
  console.log(error);
}
@@ -28,12 +28,16 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
      // },
      filename: async (req, file, cb) => {
        const { ext } = path.parse(decodeURIComponent(file.originalname));
-        cb(null, `${getNanoid(32)}${ext}`);
+        cb(null, `${getNanoid()}${ext}`);
      }
    })
  }).single('file');

-  async doUpload<T = Record<string, any>>(req: NextApiRequest, res: NextApiResponse) {
+  async doUpload<T = Record<string, any>>(
+    req: NextApiRequest,
+    res: NextApiResponse,
+    originBuckerName?: `${BucketNameEnum}`
+  ) {
    return new Promise<{
      file: FileType;
      metadata: Record<string, any>;

@@ -47,7 +51,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
      }

      // check bucket name
-      const bucketName = req.body?.bucketName as `${BucketNameEnum}`;
+      const bucketName = (req.body?.bucketName || originBuckerName) as `${BucketNameEnum}`;
      if (bucketName && !bucketNameMap[bucketName]) {
        return reject('BucketName is invalid');
      }
@@ -39,14 +39,15 @@ export const insertDatasetDataVector = async (
  }
): Promise<{ insertId: string }> => {
  const { teamId, datasetId, collectionId, vectors, retry = 3 } = props;

  try {
    const { rows } = await PgClient.insert(PgDatasetTableName, {
      values: [
        [
          { key: 'vector', value: `[${vectors[0]}]` },
          { key: 'team_id', value: String(teamId) },
-          { key: 'dataset_id', value: datasetId },
-          { key: 'collection_id', value: collectionId }
+          { key: 'dataset_id', value: String(datasetId) },
+          { key: 'collection_id', value: String(collectionId) }
        ]
      ]
    });

@@ -176,8 +177,8 @@ export const getVectorDataByTime = async (start: Date, end: Date) => {
  `);

  return rows.map((item) => ({
-    id: item.id,
-    datasetId: item.dataset_id,
-    teamId: item.team_id
+    id: String(item.id),
+    teamId: item.team_id,
+    datasetId: item.dataset_id
  }));
};
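The `String()` wrapping matters because these IDs typically arrive as Mongo ObjectId instances; converting explicitly keeps the PostgreSQL columns plain strings instead of relying on implicit `toString()`. A tiny illustration (not FastGPT code):

```ts
// Why String() on an ObjectId is explicit and safe.
import { Types } from 'mongoose';

const datasetId = new Types.ObjectId();
console.log(typeof datasetId);         // 'object'
console.log(typeof String(datasetId)); // 'string', e.g. '65a1b2c3d4e5f60718293a4b'
```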
@@ -89,6 +89,7 @@ try {
  close custom feedback;
  */
  ChatItemSchema.index({ appId: 1, chatId: 1, dataId: 1 }, { background: true });
+  ChatItemSchema.index({ time: -1 }, { background: true });
  ChatItemSchema.index({ userGoodFeedback: 1 }, { background: true });
  ChatItemSchema.index({ userBadFeedback: 1 }, { background: true });
  ChatItemSchema.index({ customFeedbacks: 1 }, { background: true });
@@ -25,7 +25,7 @@ export async function createOneCollection({
  type,

  trainingType = TrainingModeEnum.chunk,
-  chunkSize = 0,
+  chunkSize = 512,
  chunkSplitter,
  qaPrompt,

@@ -134,7 +134,10 @@ export async function delCollectionAndRelatedSources({

  // delete file and imgs
  await Promise.all([
-    delImgByRelatedId(relatedImageIds),
+    delImgByRelatedId({
+      teamId,
+      relateIds: relatedImageIds
+    }),
    delFileByFileIdList({
      bucketName: BucketNameEnum.dataset,
      fileIdList
@@ -1,5 +1,15 @@
import { delay } from '@fastgpt/global/common/system/utils';
import { MongoDatasetTraining } from './schema';
+import type {
+  PushDatasetDataChunkProps,
+  PushDatasetDataProps,
+  PushDatasetDataResponse
+} from '@fastgpt/global/core/dataset/api.d';
+import { getCollectionWithDataset } from '../controller';
+import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
+import { simpleText } from '@fastgpt/global/common/string/tools';
+import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
+import type { VectorModelItemType, LLMModelItemType } from '@fastgpt/global/core/ai/model.d';

export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promise<any> => {
  try {

@@ -19,3 +29,165 @@ export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promise<any> => {
    return Promise.reject(error);
  }
};
+
+export async function pushDataListToTrainingQueue({
+  teamId,
+  tmbId,
+  collectionId,
+  data,
+  prompt,
+  billId,
+  trainingMode = TrainingModeEnum.chunk,
+
+  vectorModelList = [],
+  qaModelList = []
+}: {
+  teamId: string;
+  tmbId: string;
+  vectorModelList: VectorModelItemType[];
+  qaModelList: LLMModelItemType[];
+} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
+  const {
+    datasetId: { _id: datasetId, vectorModel, agentModel }
+  } = await getCollectionWithDataset(collectionId);
+
+  const checkModelValid = async ({ collectionId }: { collectionId: string }) => {
+    if (!collectionId) return Promise.reject(`CollectionId is empty`);
+
+    if (trainingMode === TrainingModeEnum.chunk) {
+      const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
+      if (!vectorModelData) {
+        return Promise.reject(`Model ${vectorModel} is inValid`);
+      }
+
+      return {
+        maxToken: vectorModelData.maxToken * 1.5,
+        model: vectorModelData.model,
+        weight: vectorModelData.weight
+      };
+    }
+
+    if (trainingMode === TrainingModeEnum.qa) {
+      const qaModelData = qaModelList?.find((item) => item.model === agentModel);
+      if (!qaModelData) {
+        return Promise.reject(`Model ${agentModel} is inValid`);
+      }
+      return {
+        maxToken: qaModelData.maxContext * 0.8,
+        model: qaModelData.model,
+        weight: 0
+      };
+    }
+    return Promise.reject(`Training mode "${trainingMode}" is inValid`);
+  };
+
+  const { model, maxToken, weight } = await checkModelValid({
+    collectionId
+  });
+
+  // format q and a, remove empty char
+  data.forEach((item) => {
+    item.q = simpleText(item.q);
+    item.a = simpleText(item.a);
+
+    item.indexes = item.indexes
+      ?.map((index) => {
+        return {
+          ...index,
+          text: simpleText(index.text)
+        };
+      })
+      .filter(Boolean);
+  });
+
+  // filter repeat or equal content
+  const set = new Set();
+  const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
+    success: [],
+    overToken: [],
+    repeat: [],
+    error: []
+  };
+
+  // filter repeat content
+  data.forEach((item) => {
+    if (!item.q) {
+      filterResult.error.push(item);
+      return;
+    }
+
+    const text = item.q + item.a;
+
+    // count q token
+    const token = countPromptTokens(item.q);
+
+    if (token > maxToken) {
+      filterResult.overToken.push(item);
+      return;
+    }
+
+    if (set.has(text)) {
+      console.log('repeat', item);
+      filterResult.repeat.push(item);
+    } else {
+      filterResult.success.push(item);
+      set.add(text);
+    }
+  });
+
+  // insert data to db
+  const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
+    try {
+      const results = await MongoDatasetTraining.insertMany(
+        dataList.map((item, i) => ({
+          teamId,
+          tmbId,
+          datasetId,
+          collectionId,
+          billId,
+          mode: trainingMode,
+          prompt,
+          model,
+          q: item.q,
+          a: item.a,
+          chunkIndex: item.chunkIndex ?? i,
+          weight: weight ?? 0,
+          indexes: item.indexes
+        }))
+      );
+      await delay(500);
+      return results.length;
+    } catch (error) {
+      if (retry > 0) {
+        await delay(500);
+        return insertData(dataList, retry - 1);
+      }
+      return Promise.reject(error);
+    }
+  };
+
+  let insertLen = 0;
+  const chunkSize = 50;
+  const chunkList = filterResult.success.reduce(
+    (acc, cur) => {
+      const lastChunk = acc[acc.length - 1];
+      if (lastChunk.length < chunkSize) {
+        lastChunk.push(cur);
+      } else {
+        acc.push([cur]);
+      }
+      return acc;
+    },
+    [[]] as PushDatasetDataChunkProps[][]
+  );
+  for await (const chunks of chunkList) {
+    insertLen += await insertData(chunks);
+  }
+
+  delete filterResult.success;
+
+  return {
+    insertLen,
+    ...filterResult
+  };
+}
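The reduce near the end batches the validated rows into groups of at most 50 before each `insertMany`, so a large import becomes a series of small writes with a short delay between them. A self-contained sketch of the same batching idea, with plain numbers instead of training rows:

```ts
// Illustration only: the same "split into batches of N" reduce, on plain numbers.
function toBatches<T>(items: T[], batchSize = 50): T[][] {
  return items.reduce(
    (acc, cur) => {
      const last = acc[acc.length - 1];
      if (last.length < batchSize) {
        last.push(cur);
      } else {
        acc.push([cur]);
      }
      return acc;
    },
    [[]] as T[][]
  );
}

console.log(toBatches([1, 2, 3, 4, 5], 2)); // [[1, 2], [3, 4], [5]]
// Note: an empty input yields [[]], matching the behaviour of the original reduce.
```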
@@ -52,7 +52,7 @@ const BillSchema = new Schema({
});

try {
-  BillSchema.index({ teamId: 1, tmbId: 1, time: -1 });
+  BillSchema.index({ teamId: 1, time: -1 });
  BillSchema.index({ time: 1 }, { expireAfterSeconds: 180 * 24 * 60 * 60 });
} catch (error) {
  console.log(error);
packages/web/common/file/read/csv.ts (new file, 40 lines)

@@ -0,0 +1,40 @@
+import Papa from 'papaparse';
+import { readFileRawText } from './rawText';
+
+/**
+ * read csv to json
+ * @response {
+ *   header: string[],
+ *   data: string[][]
+ * }
+ */
+export const readCsvContent = async ({ file }: { file: File }) => {
+  try {
+    const { rawText: textArr } = await readFileRawText(file);
+    const csvArr = Papa.parse(textArr).data as string[][];
+    if (csvArr.length === 0) {
+      throw new Error('csv 解析失败');
+    }
+
+    const header = csvArr.shift() as string[];
+
+    // add title to data
+    const rawText = csvArr
+      .map((item) =>
+        item.map((value, index) => {
+          if (!header[index]) return value;
+          return `${header[index]}: ${value}`;
+        })
+      )
+      .flat()
+      .join('\n');
+
+    return {
+      rawText,
+      header,
+      data: csvArr.map((item) => item)
+    };
+  } catch (error) {
+    return Promise.reject('解析 csv 文件失败');
+  }
+};
@@ -1,4 +1,5 @@
import { loadFile2Buffer } from '../utils';
+import { readCsvContent } from './csv';
import { readHtmlFile } from './html';
import { readMdFile } from './md';
import { readPdfFile } from './pdf';

@@ -29,6 +30,8 @@ export const readFileRawContent = async ({
        file,
        uploadImgController: uploadBase64Controller
      });
+    case 'csv':
+      return readCsvContent({ file });
    case 'pdf':
      const pdf = await loadFile2Buffer({ file });
      return readPdfFile({ pdf });
@@ -74,7 +74,7 @@ const JSONEditor = ({ defaultValue, value, onChange, resize, ...props }: Props)

    <Box
      borderWidth={'1px'}
-      borderRadius={'base'}
+      borderRadius={'md'}
      borderColor={'myGray.200'}
      py={2}
      {...props}
@@ -22,13 +22,15 @@
    "react-dom": "18.2.0",
    "react-i18next": "^12.3.1",
    "turndown": "^7.1.2",
-    "lexical":"0.12.6",
+    "lexical": "0.12.6",
    "@lexical/react": "0.12.6",
+    "papaparse": "^5.4.1",
    "@lexical/utils": "0.12.6",
    "@lexical/text": "0.12.6"
  },
  "devDependencies": {
    "@types/react": "18.2.0",
+    "@types/papaparse": "^5.3.7",
    "@types/react-dom": "18.2.0",
    "@types/turndown": "^5.0.4"
  }
pnpm-lock.yaml (generated, 12 changed lines)

@@ -196,6 +196,9 @@ importers:
      next-i18next:
        specifier: ^13.3.0
        version: registry.npmmirror.com/next-i18next@13.3.0(i18next@22.5.1)(next@13.5.2)(react-i18next@12.3.1)(react@18.2.0)
+      papaparse:
+        specifier: ^5.4.1
+        version: registry.npmmirror.com/papaparse@5.4.1
      pdfjs-dist:
        specifier: ^4.0.269
        version: registry.npmmirror.com/pdfjs-dist@4.0.269

@@ -212,6 +215,9 @@ importers:
        specifier: ^7.1.2
        version: registry.npmmirror.com/turndown@7.1.2
    devDependencies:
+      '@types/papaparse':
+        specifier: ^5.3.7
+        version: registry.npmmirror.com/@types/papaparse@5.3.7
      '@types/react':
        specifier: 18.2.0
        version: registry.npmmirror.com/@types/react@18.2.0

@@ -323,9 +329,6 @@ importers:
      nprogress:
        specifier: ^0.2.0
        version: registry.npmmirror.com/nprogress@0.2.0
-      papaparse:
-        specifier: ^5.4.1
-        version: registry.npmmirror.com/papaparse@5.4.1
      react:
        specifier: 18.2.0
        version: registry.npmmirror.com/react@18.2.0

@@ -390,9 +393,6 @@ importers:
      '@types/node':
        specifier: ^20.8.5
        version: registry.npmmirror.com/@types/node@20.8.5
-      '@types/papaparse':
-        specifier: ^5.3.7
-        version: registry.npmmirror.com/@types/papaparse@5.3.7
      '@types/react':
        specifier: 18.2.0
        version: registry.npmmirror.com/@types/react@18.2.0
@@ -42,7 +42,6 @@
    "next": "13.5.2",
    "next-i18next": "^13.3.0",
    "nprogress": "^0.2.0",
-    "papaparse": "^5.4.1",
    "react": "18.2.0",
    "react-day-picker": "^8.7.1",
    "react-dom": "18.2.0",

@@ -66,7 +65,6 @@
    "@types/jsonwebtoken": "^9.0.3",
    "@types/lodash": "^4.14.191",
    "@types/node": "^20.8.5",
-    "@types/papaparse": "^5.3.7",
    "@types/react": "18.2.0",
    "@types/react-dom": "18.2.0",
    "@types/react-syntax-highlighter": "^15.5.6",
@@ -226,7 +226,7 @@
    "Chat test": "测试对话",
    "Max Token": "单条数据上限",
    "Start chat": "立即对话",
-    "Total chars": "总字符数: {{total}}",
+    "Total chars": "总字数: {{total}}",
    "Total tokens": "总 Tokens: {{total}}",
    "ai": {
      "Model": "AI 模型",

@@ -541,8 +541,7 @@
        "success": "开始同步"
      }
    },
-    "training": {
-    }
+    "training": {}
  },
  "data": {
    "Auxiliary Data": "辅助数据",
@@ -17,7 +17,7 @@ const ButtonEdge = (props: EdgeProps) => {
    style = {}
  } = props;

-  const [labelX, labelY] = getBezierPath({
+  const [, labelX, labelY] = getBezierPath({
    sourceX,
    sourceY,
    sourcePosition,
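`getBezierPath` returns a tuple whose first element is the SVG path string, followed by the label coordinates, so the old destructuring was reading the path string as `labelX`; skipping the first slot fixes the label position. A sketch, assuming the reactflow package that this component appears to use:

```ts
// Illustration only: shape of reactflow's getBezierPath return value (coordinates invented).
import { getBezierPath, Position } from 'reactflow';

const [path, labelX, labelY] = getBezierPath({
  sourceX: 0,
  sourceY: 0,
  sourcePosition: Position.Right,
  targetX: 100,
  targetY: 50,
  targetPosition: Position.Left
});
// path   -> SVG path string, e.g. 'M0,0 C...'
// labelX -> x coordinate for the midpoint label
// labelY -> y coordinate for the midpoint label
```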
@@ -8,6 +8,3 @@ import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type';
/* ======= collection =========== */

/* ==== data ===== */
-export type PushDataResponse = {
-  insertLen: number;
-};
@@ -27,13 +27,7 @@ export type CreateDatasetParams = {
export type InsertOneDatasetDataProps = PushDatasetDataChunkProps & {
  collectionId: string;
};
-export type PushDatasetDataProps = {
-  collectionId: string;
-  data: PushDatasetDataChunkProps[];
-  trainingMode: `${TrainingModeEnum}`;
-  prompt?: string;
-  billId?: string;
-};
export type UpdateDatasetDataProps = {
  id: string;
  q?: string; // embedding content
@@ -16,11 +16,15 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dataset';
import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';
import { pushDataToTrainingQueue } from '@/service/core/dataset/data/controller';
import { hashStr } from '@fastgpt/global/common/string/tools';
+import { createTrainingBill } from '@fastgpt/service/support/wallet/bill/controller';
+import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants';
+import { getQAModel, getVectorModel } from '@/service/core/ai/model';

export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
  try {
    await connectToDatabase();
    const {
+      name,
      text,
      trainingType = TrainingModeEnum.chunk,
      chunkSize = 512,

@@ -29,7 +33,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
      ...body
    } = req.body as TextCreateDatasetCollectionParams;

-    const { teamId, tmbId } = await authDataset({
+    const { teamId, tmbId, dataset } = await authDataset({
      req,
      authToken: true,
      authApiKey: true,
@@ -52,21 +56,32 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
      insertLen: predictDataLimitLength(trainingType, chunks)
    });

-    // 3. create collection
-    const collectionId = await createOneCollection({
-      ...body,
-      teamId,
-      tmbId,
-      type: DatasetCollectionTypeEnum.virtual,
-
-      trainingType,
-      chunkSize,
-      chunkSplitter,
-      qaPrompt,
-
-      hashRawText: hashStr(text),
-      rawTextLength: text.length
-    });
+    // 3. create collection and training bill
+    const [collectionId, { billId }] = await Promise.all([
+      createOneCollection({
+        ...body,
+        teamId,
+        tmbId,
+        type: DatasetCollectionTypeEnum.virtual,
+
+        name,
+        trainingType,
+        chunkSize,
+        chunkSplitter,
+        qaPrompt,
+
+        hashRawText: hashStr(text),
+        rawTextLength: text.length
+      }),
+      createTrainingBill({
+        teamId,
+        tmbId,
+        appName: name,
+        billSource: BillSourceEnum.training,
+        vectorModel: getVectorModel(dataset.vectorModel)?.name,
+        agentModel: getQAModel(dataset.agentModel)?.name
+      })
+    ]);

    // 4. push chunks to training queue
    const insertResults = await pushDataToTrainingQueue({

@@ -74,6 +89,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
      tmbId,
      collectionId,
      trainingMode: trainingType,
+      prompt: qaPrompt,
+      billId,
      data: chunks.map((text, index) => ({
        q: text,
        chunkIndex: index

@@ -90,3 +107,11 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
    });
  }
}
+
+export const config = {
+  api: {
+    bodyParser: {
+      sizeLimit: '10mb'
+    }
+  }
+};
@@ -3,8 +3,10 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@fastgpt/service/common/response';
import { connectToDatabase } from '@/service/mongo';
import { withNextCors } from '@fastgpt/service/common/middle/cors';
-import type { PushDataResponse } from '@/global/core/api/datasetRes.d';
-import type { PushDatasetDataProps } from '@/global/core/dataset/api.d';
+import type {
+  PushDatasetDataProps,
+  PushDatasetDataResponse
+} from '@fastgpt/global/core/dataset/api.d';
import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset';
import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dataset';
import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils';

@@ -39,7 +41,7 @@ export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse) {
    insertLen: predictDataLimitLength(collection.trainingType, data)
  });

-  jsonRes<PushDataResponse>(res, {
+  jsonRes<PushDatasetDataResponse>(res, {
    data: await pushDataToTrainingQueue({
      ...req.body,
      teamId,
@@ -12,16 +12,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  const method = (req.method || 'POST') as Method;
  const { path = [], ...query } = req.query as any;

-  const url = `/${path?.join('/')}`;
+  const url = `/${path?.join('/')}?${new URLSearchParams(query).toString()}`;

  if (!url) {
    throw new Error('url is empty');
  }

-  const data = {
-    ...req.body,
-    ...query
-  };
+  const data = req.body || query;

  const repose = await request(
    url,

@@ -56,3 +53,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) {
    });
  }
}
+
+export const config = {
+  api: {
+    bodyParser: {
+      sizeLimit: '10mb'
+    },
+    responseLimit: '10mb'
+  }
+};
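Query parameters are now carried on the proxied URL instead of being merged into the request body. A sketch of just the URL construction; the path segments and query values are invented:

```ts
// Illustration only: how the forwarded URL is built from the catch-all route parts.
const path = ['v1', 'chat', 'completions'];
const query = { foo: '1', bar: 'baz' };

const url = `/${path.join('/')}?${new URLSearchParams(query).toString()}`;
// -> '/v1/chat/completions?foo=1&bar=baz'
```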
@@ -27,7 +27,7 @@ const Upload = dynamic(() => import('../commonProgress/Upload'));
const PreviewRawText = dynamic(() => import('../components/PreviewRawText'));

type FileItemType = ImportSourceItemType & { file: File };
-const fileType = '.txt, .docx, .pdf, .md, .html';
+const fileType = '.txt, .docx, .csv, .pdf, .md, .html';
const maxSelectFileCount = 1000;

const FileLocal = ({ activeStep, goToNext }: ImportDataComponentProps) => {
|
|||||||
import { feConfigs } from '@/web/common/system/staticData';
|
import { feConfigs } from '@/web/common/system/staticData';
|
||||||
|
|
||||||
import dynamic from 'next/dynamic';
|
import dynamic from 'next/dynamic';
|
||||||
import { fileDownload, readCsvContent } from '@/web/common/file/utils';
|
import { fileDownload } from '@/web/common/file/utils';
|
||||||
|
import { readCsvContent } from '@fastgpt/web/common/file/read/csv';
|
||||||
|
|
||||||
const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
|
const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
|
||||||
const Upload = dynamic(() => import('../commonProgress/Upload'));
|
const Upload = dynamic(() => import('../commonProgress/Upload'));
|
||||||
@@ -56,7 +57,7 @@ const SelectFile = React.memo(function SelectFile({ goToNext }: { goToNext: () =
|
|||||||
{
|
{
|
||||||
for await (const selectFile of files) {
|
for await (const selectFile of files) {
|
||||||
const { file, folderPath } = selectFile;
|
const { file, folderPath } = selectFile;
|
||||||
const { header, data } = await readCsvContent(file);
|
const { header, data } = await readCsvContent({ file });
|
||||||
|
|
||||||
const filterData: FileItemType['chunks'] = data
|
const filterData: FileItemType['chunks'] = data
|
||||||
.filter((item) => item[0])
|
.filter((item) => item[0])
|
||||||
|
@@ -193,7 +193,10 @@ const InputDataModal = ({
      // not exactly same
      await putDatasetDataById({
        id: dataId,
-        ...e
+        ...e,
+        indexes: e.indexes.map((index) =>
+          index.defaultIndex ? getDefaultIndex({ q: e.q, a: e.a }) : index
+        )
      });

      return {
@@ -35,7 +35,8 @@ import dynamic from 'next/dynamic';
import { useForm } from 'react-hook-form';
import MySelect from '@/components/Select';
import { useSelectFile } from '@/web/common/file/hooks/useSelectFile';
-import { fileDownload, readCsvContent } from '@/web/common/file/utils';
+import { fileDownload } from '@/web/common/file/utils';
+import { readCsvContent } from '@fastgpt/web/common/file/read/csv';
import { delay } from '@fastgpt/global/common/system/utils';
import QuoteItem from '@/components/core/dataset/QuoteItem';

@@ -125,7 +126,7 @@ const Test = ({ datasetId }: { datasetId: string }) => {
  const { mutate: onFileTest, isLoading: fileTestIsLoading } = useRequest({
    mutationFn: async ({ searchParams }: FormType) => {
      if (!selectFile) return Promise.reject('File is not selected');
-      const { data } = await readCsvContent(selectFile);
+      const { data } = await readCsvContent({ file: selectFile });
      const testList = data.slice(0, 100);
      const results: SearchTestResponse[] = [];
@@ -3,6 +3,11 @@ import { generateQA } from '@/service/events/generateQA';
import { generateVector } from '@/service/events/generateVector';
import { setCron } from '@fastgpt/service/common/system/cron';

+export const startCron = () => {
+  setUpdateSystemConfigCron();
+  setTrainingQueueCron();
+};
+
export const setUpdateSystemConfigCron = () => {
  setCron('*/5 * * * *', () => {
    initSystemConfig();

@@ -11,7 +16,7 @@ export const setUpdateSystemConfigCron = () => {
};

export const setTrainingQueueCron = () => {
-  setCron('*/3 * * * *', () => {
+  setCron('*/1 * * * *', () => {
    generateVector();
    generateQA();
  });
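The schedule strings are standard five-field cron expressions, so this hunk moves the training-queue kick from every 3 minutes to every minute while the config refresh stays at every 5 minutes. A brief annotation of the expressions used here (the wrapper behind `setCron` is not shown in this diff):

```ts
// ┌ minute ┬ hour ┬ day-of-month ┬ month ┬ day-of-week
// '*/1 * * * *' -> run every minute    (new training-queue schedule)
// '*/3 * * * *' -> run every 3 minutes (old training-queue schedule)
// '*/5 * * * *' -> run every 5 minutes (system-config refresh)
setCron('*/1 * * * *', () => {
  generateVector();
  generateQA();
});
```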
@@ -9,13 +9,11 @@ import {
  recallFromVectorStore,
  updateDatasetDataVector
} from '@fastgpt/service/common/vectorStore/controller';
-import { Types } from 'mongoose';
import {
  DatasetDataIndexTypeEnum,
  DatasetSearchModeEnum,
  DatasetSearchModeMap,
-  SearchScoreTypeEnum,
-  TrainingModeEnum
+  SearchScoreTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
import { jiebaSplit } from '@/service/common/string/jieba';

@@ -29,172 +27,26 @@ import {
} from '@fastgpt/global/core/dataset/type';
import { reRankRecall } from '../../ai/rerank';
import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
-import { hashStr, simpleText } from '@fastgpt/global/common/string/tools';
-import type { PushDatasetDataProps } from '@/global/core/dataset/api.d';
-import type { PushDataResponse } from '@/global/core/api/datasetRes';
-import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
-import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';
-import { startQueue } from '@/service/utils/tools';
-import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller';
-import { getQAModel, getVectorModel } from '../../ai/model';
-import { delay } from '@fastgpt/global/common/system/utils';
+import { hashStr } from '@fastgpt/global/common/string/tools';
+import type {
+  PushDatasetDataProps,
+  PushDatasetDataResponse
+} from '@fastgpt/global/core/dataset/api.d';
+import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller';

-export async function pushDataToTrainingQueue({
-  teamId,
-  tmbId,
-  collectionId,
-  data,
-  prompt,
-  billId,
-  trainingMode
-}: {
-  teamId: string;
-  tmbId: string;
-} & PushDatasetDataProps): Promise<PushDataResponse> {
-  // ... (the rest of the old in-app implementation, removed; see the note below)
+export async function pushDataToTrainingQueue(
+  props: {
+    teamId: string;
+    tmbId: string;
+  } & PushDatasetDataProps
+): Promise<PushDatasetDataResponse> {
+  const result = await pushDataListToTrainingQueue({
+    ...props,
+    vectorModelList: global.vectorModels,
+    qaModelList: global.qaModels
+  });
+
+  return result;
}

/* insert data.

The removed body elided above is, line for line, the implementation this commit adds as `pushDataListToTrainingQueue` in `@fastgpt/service/core/dataset/training/controller` (shown in the earlier hunk), apart from three details: it resolved models through `getVectorModel()`/`getQAModel()` instead of the passed-in model lists, waited 1000 ms before an insert retry instead of 500 ms, and called `startQueue()` after the inserts.
@@ -341,6 +193,11 @@ export async function updateData2Dataset({
          text: qaStr
        }
      });
+    } else {
+      patchResult.push({
+        type: 'unChange',
+        index: item
+      });
    }
  } else {
    // not in database, create

@@ -379,6 +236,7 @@ export async function updateData2Dataset({
        model
      });
      item.index.dataId = result.insertId;
+
      return result;
    }
    if (item.type === 'delete' && item.index.dataId) {

@@ -397,13 +255,14 @@ export async function updateData2Dataset({
  );

  const charsLength = result.reduce((acc, cur) => acc + cur.charsLength, 0);
+  const newIndexes = patchResult.filter((item) => item.type !== 'delete').map((item) => item.index);

  // update mongo other data
  mongoData.q = q || mongoData.q;
  mongoData.a = a ?? mongoData.a;
  mongoData.fullTextToken = jiebaSplit({ text: mongoData.q + mongoData.a });
  // @ts-ignore
-  mongoData.indexes = indexes;
+  mongoData.indexes = newIndexes;
  await mongoData.save();

  return {
@@ -7,7 +7,7 @@ import { createDefaultTeam } from '@fastgpt/service/support/user/team/controller';
import { exit } from 'process';
import { initVectorStore } from '@fastgpt/service/common/vectorStore/controller';
import { getInitConfig } from '@/pages/api/common/system/getInitData';
-import { setUpdateSystemConfigCron, setTrainingQueueCron } from './common/system/cron';
+import { startCron } from './common/system/cron';

/**
 * connect MongoDB and init data

@@ -23,8 +23,7 @@ export function connectToDatabase(): Promise<void> {
    getInitConfig();

    // cron
-    setUpdateSystemConfigCron();
-    setTrainingQueueCron();
+    startCron();

    initRootUser();
  }
@@ -32,13 +32,24 @@ export const uploadFiles = ({
  });
};

-export const getUploadBase64ImgController = (props: CompressImgProps & UploadImgProps) =>
-  compressBase64ImgAndUpload({
-    maxW: 4000,
-    maxH: 4000,
-    maxSize: 1024 * 1024 * 5,
-    ...props
-  });
+export const getUploadBase64ImgController = (
+  props: CompressImgProps & UploadImgProps,
+  retry = 3
+): Promise<string> => {
+  try {
+    return compressBase64ImgAndUpload({
+      maxW: 4000,
+      maxH: 4000,
+      maxSize: 1024 * 1024 * 5,
+      ...props
+    });
+  } catch (error) {
+    if (retry > 0) {
+      return getUploadBase64ImgController(props, retry - 1);
+    }
+    return Promise.reject(error);
+  }
+};

/**
 * compress image. response base64
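One reading note: because the promise is returned rather than awaited, the `catch` in the new wrapper only fires if `compressBase64ImgAndUpload` throws synchronously; an asynchronous rejection passes straight through to the caller. A sketch (not part of this commit) of a variant that awaits inside the `try`, so rejections are retried too; it assumes `compressBase64ImgAndUpload` keeps its current signature:

```ts
// Sketch only: awaiting inside the try makes async rejections retryable as well.
export const getUploadBase64ImgController = async (
  props: CompressImgProps & UploadImgProps,
  retry = 3
): Promise<string> => {
  try {
    return await compressBase64ImgAndUpload({
      maxW: 4000,
      maxH: 4000,
      maxSize: 1024 * 1024 * 5,
      ...props
    });
  } catch (error) {
    if (retry > 0) {
      return getUploadBase64ImgController(props, retry - 1);
    }
    return Promise.reject(error);
  }
};
```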
@@ -1,29 +1,3 @@
-import Papa from 'papaparse';
-import { readFileRawText } from '@fastgpt/web/common/file/read/rawText';
-
-/**
- * read csv to json
- * @response {
- *   header: string[],
- *   data: string[][]
- * }
- */
-export const readCsvContent = async (file: File) => {
-  try {
-    const { rawText: textArr } = await readFileRawText(file);
-    const csvArr = Papa.parse(textArr).data as string[][];
-    if (csvArr.length === 0) {
-      throw new Error('csv 解析失败');
-    }
-    return {
-      header: csvArr.shift() as string[],
-      data: csvArr.map((item) => item)
-    };
-  } catch (error) {
-    return Promise.reject('解析 csv 文件失败');
-  }
-};
-
/**
 * file download by text
 */
@@ -19,12 +19,14 @@ import type {
  SearchTestResponse
} from '@/global/core/dataset/api.d';
import type {
-  PushDatasetDataProps,
  UpdateDatasetDataProps,
  CreateDatasetParams,
  InsertOneDatasetDataProps
} from '@/global/core/dataset/api.d';
-import type { PushDataResponse } from '@/global/core/api/datasetRes.d';
+import type {
+  PushDatasetDataProps,
+  PushDatasetDataResponse
+} from '@fastgpt/global/core/dataset/api.d';
import type { DatasetCollectionItemType } from '@fastgpt/global/core/dataset/type';
import {
  DatasetCollectionSyncResultEnum,

@@ -97,7 +99,7 @@ export const getDatasetDataItemById = (id: string) =>
 * push data to training queue
 */
export const postChunks2Dataset = (data: PushDatasetDataProps) =>
-  POST<PushDataResponse>(`/core/dataset/data/pushData`, data);
+  POST<PushDatasetDataResponse>(`/core/dataset/data/pushData`, data);

/**
 * insert one data to dataset (immediately insert)