4.6.7 fix (#752)

This commit is contained in:
Archer
2024-01-19 20:16:08 +08:00
committed by GitHub
parent c031e6dcc9
commit 5e2adb22f0
37 changed files with 420 additions and 293 deletions

View File

@@ -46,8 +46,17 @@ export async function readMongoImg({ id }: { id: string }) {
   return data?.binary;
 }
+
-export async function delImgByRelatedId(relateIds: string[]) {
+export async function delImgByRelatedId({
+  teamId,
+  relateIds
+}: {
+  teamId: string;
+  relateIds: string[];
+}) {
   if (relateIds.length === 0) return;
+
   return MongoImage.deleteMany({
+    teamId,
     'metadata.relatedId': { $in: relateIds.map((id) => String(id)) }
   });
 }
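
A minimal sketch of a migrated call site, assuming the controller's export path; the id values are hypothetical:

import { delImgByRelatedId } from '@fastgpt/service/common/file/image/controller';

// Deletes are now scoped to the owning team, so one team's cleanup can no
// longer touch another team's image documents.
const relatedImageIds = ['img-1', 'img-2']; // hypothetical related ids
await delImgByRelatedId({
  teamId: '65a1f2c3d4e5f6a7b8c9d0e1', // hypothetical team ObjectId string
  relateIds: relatedImageIds
});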

View File

@@ -34,9 +34,8 @@ const ImageSchema = new Schema({
try {
ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 });
ImageSchema.index({ type: 1 });
ImageSchema.index({ teamId: 1 });
ImageSchema.index({ createTime: 1 });
ImageSchema.index({ 'metadata.relatedId': 1 });
ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 });
} catch (error) {
console.log(error);
}
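
The two single-field indexes on teamId and 'metadata.relatedId' are folded into one compound index that matches the new team-scoped delete filter exactly; by MongoDB's index-prefix rule it also still serves plain teamId lookups. Illustrative queries, both covered by { teamId: 1, 'metadata.relatedId': 1 }:

// Exact match for the team-scoped delete above:
await MongoImage.deleteMany({ teamId, 'metadata.relatedId': { $in: relateIds } });

// Prefix match, so a separate { teamId: 1 } index is no longer needed:
await MongoImage.find({ teamId });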

View File

@@ -28,12 +28,16 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
// },
filename: async (req, file, cb) => {
const { ext } = path.parse(decodeURIComponent(file.originalname));
cb(null, `${getNanoid(32)}${ext}`);
cb(null, `${getNanoid()}${ext}`);
}
})
}).single('file');
async doUpload<T = Record<string, any>>(req: NextApiRequest, res: NextApiResponse) {
async doUpload<T = Record<string, any>>(
req: NextApiRequest,
res: NextApiResponse,
originBuckerName?: `${BucketNameEnum}`
) {
return new Promise<{
file: FileType;
metadata: Record<string, any>;
@@ -47,7 +51,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => {
}
// check bucket name
const bucketName = req.body?.bucketName as `${BucketNameEnum}`;
const bucketName = (req.body?.bucketName || originBuckerName) as `${BucketNameEnum}`;
if (bucketName && !bucketNameMap[bucketName]) {
return reject('BucketName is invalid');
}
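
doUpload now takes an optional server-side default bucket, so an API route can pin the bucket itself instead of relying on the client's form body, and getNanoid() falls back to its default id length. A hedged sketch of a route using the new parameter (the handler shape is illustrative):

const upload = getUploadModel({ maxSize: 500 });

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  // Falls back to the dataset bucket when the multipart body omits bucketName.
  const { file, metadata } = await upload.doUpload(req, res, BucketNameEnum.dataset);
  // ... persist `file` and respond
}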

View File

@@ -39,14 +39,15 @@ export const insertDatasetDataVector = async (
}
): Promise<{ insertId: string }> => {
const { teamId, datasetId, collectionId, vectors, retry = 3 } = props;
try {
const { rows } = await PgClient.insert(PgDatasetTableName, {
values: [
[
{ key: 'vector', value: `[${vectors[0]}]` },
{ key: 'team_id', value: String(teamId) },
{ key: 'dataset_id', value: datasetId },
{ key: 'collection_id', value: collectionId }
{ key: 'dataset_id', value: String(datasetId) },
{ key: 'collection_id', value: String(collectionId) }
]
]
});
@@ -176,8 +177,8 @@ export const getVectorDataByTime = async (start: Date, end: Date) => {
`);
return rows.map((item) => ({
id: item.id,
datasetId: item.dataset_id,
teamId: item.team_id
id: String(item.id),
teamId: item.team_id,
datasetId: item.dataset_id
}));
};
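
datasetId and collectionId arrive as Mongo ObjectIds, so they are coerced with String() before being written into the Postgres row, and the Postgres id is coerced to a string on the way out for the same reason: both sides then always compare as plain strings. A small illustration, assuming mongoose:

import { Types } from 'mongoose';

const datasetId = new Types.ObjectId();
const asColumn = String(datasetId); // stable 24-char hex string for the PG column
// Without the coercion, serializing the raw ObjectId is left to the SQL layer
// and can differ between drivers.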

View File

@@ -89,6 +89,7 @@ try {
     close custom feedback;
   */
   ChatItemSchema.index({ appId: 1, chatId: 1, dataId: 1 }, { background: true });
+  ChatItemSchema.index({ time: -1 }, { background: true });
   ChatItemSchema.index({ userGoodFeedback: 1 }, { background: true });
   ChatItemSchema.index({ userBadFeedback: 1 }, { background: true });
   ChatItemSchema.index({ customFeedbacks: 1 }, { background: true });
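
The new descending time index lets newest-first scans over chat items (history views, time-window cleanups) avoid a blocking in-memory sort. A hypothetical query it serves, assuming the schema is exported as MongoChatItem:

const latest = await MongoChatItem.find({})
  .sort({ time: -1 }) // walks the { time: -1 } index directly
  .limit(30);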

View File

@@ -25,7 +25,7 @@ export async function createOneCollection({
type,
trainingType = TrainingModeEnum.chunk,
chunkSize = 0,
chunkSize = 512,
chunkSplitter,
qaPrompt,
@@ -134,7 +134,10 @@ export async function delCollectionAndRelatedSources({
// delete file and imgs
await Promise.all([
delImgByRelatedId(relatedImageIds),
delImgByRelatedId({
teamId,
relateIds: relatedImageIds
}),
delFileByFileIdList({
bucketName: BucketNameEnum.dataset,
fileIdList
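
Two behavioral changes land in this file: collections created without an explicit chunkSize now default to 512 tokens rather than 0, and related images are deleted through the team-scoped API. A sketch of the former, with the argument list abbreviated and hypothetical:

await createOneCollection({
  teamId,
  tmbId,
  datasetId,
  name: 'faq.md',
  type: DatasetCollectionTypeEnum.file
  // chunkSize omitted, so it now defaults to 512 instead of 0
});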

View File

@@ -1,5 +1,15 @@
 import { delay } from '@fastgpt/global/common/system/utils';
 import { MongoDatasetTraining } from './schema';
+import type {
+  PushDatasetDataChunkProps,
+  PushDatasetDataProps,
+  PushDatasetDataResponse
+} from '@fastgpt/global/core/dataset/api.d';
+import { getCollectionWithDataset } from '../controller';
+import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
+import { simpleText } from '@fastgpt/global/common/string/tools';
+import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken';
+import type { VectorModelItemType, LLMModelItemType } from '@fastgpt/global/core/ai/model.d';

 export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promise<any> => {
   try {

@@ -19,3 +29,165 @@ export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promise<any> => {
     return Promise.reject(error);
   }
 };
+
+export async function pushDataListToTrainingQueue({
+  teamId,
+  tmbId,
+  collectionId,
+  data,
+  prompt,
+  billId,
+  trainingMode = TrainingModeEnum.chunk,
+  vectorModelList = [],
+  qaModelList = []
+}: {
+  teamId: string;
+  tmbId: string;
+  vectorModelList: VectorModelItemType[];
+  qaModelList: LLMModelItemType[];
+} & PushDatasetDataProps): Promise<PushDatasetDataResponse> {
+  const {
+    datasetId: { _id: datasetId, vectorModel, agentModel }
+  } = await getCollectionWithDataset(collectionId);
+
+  const checkModelValid = async ({ collectionId }: { collectionId: string }) => {
+    if (!collectionId) return Promise.reject(`CollectionId is empty`);
+
+    if (trainingMode === TrainingModeEnum.chunk) {
+      const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel);
+      if (!vectorModelData) {
+        return Promise.reject(`Model ${vectorModel} is inValid`);
+      }
+
+      return {
+        maxToken: vectorModelData.maxToken * 1.5,
+        model: vectorModelData.model,
+        weight: vectorModelData.weight
+      };
+    }
+
+    if (trainingMode === TrainingModeEnum.qa) {
+      const qaModelData = qaModelList?.find((item) => item.model === agentModel);
+      if (!qaModelData) {
+        return Promise.reject(`Model ${agentModel} is inValid`);
+      }
+
+      return {
+        maxToken: qaModelData.maxContext * 0.8,
+        model: qaModelData.model,
+        weight: 0
+      };
+    }
+
+    return Promise.reject(`Training mode "${trainingMode}" is inValid`);
+  };
+
+  const { model, maxToken, weight } = await checkModelValid({
+    collectionId
+  });
+
+  // format q and a, remove empty char
+  data.forEach((item) => {
+    item.q = simpleText(item.q);
+    item.a = simpleText(item.a);
+    item.indexes = item.indexes
+      ?.map((index) => {
+        return {
+          ...index,
+          text: simpleText(index.text)
+        };
+      })
+      .filter(Boolean);
+  });
+
+  // filter repeat or equal content
+  const set = new Set();
+  const filterResult: Record<string, PushDatasetDataChunkProps[]> = {
+    success: [],
+    overToken: [],
+    repeat: [],
+    error: []
+  };
+
+  // filter repeat content
+  data.forEach((item) => {
+    if (!item.q) {
+      filterResult.error.push(item);
+      return;
+    }
+
+    const text = item.q + item.a;
+
+    // count q token
+    const token = countPromptTokens(item.q);
+
+    if (token > maxToken) {
+      filterResult.overToken.push(item);
+      return;
+    }
+
+    if (set.has(text)) {
+      console.log('repeat', item);
+      filterResult.repeat.push(item);
+    } else {
+      filterResult.success.push(item);
+      set.add(text);
+    }
+  });
+
+  // insert data to db
+  const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise<number> => {
+    try {
+      const results = await MongoDatasetTraining.insertMany(
+        dataList.map((item, i) => ({
+          teamId,
+          tmbId,
+          datasetId,
+          collectionId,
+          billId,
+          mode: trainingMode,
+          prompt,
+          model,
+          q: item.q,
+          a: item.a,
+          chunkIndex: item.chunkIndex ?? i,
+          weight: weight ?? 0,
+          indexes: item.indexes
+        }))
+      );
+      await delay(500);
+      return results.length;
+    } catch (error) {
+      if (retry > 0) {
+        await delay(500);
+        return insertData(dataList, retry - 1);
+      }
+      return Promise.reject(error);
+    }
+  };
+
+  let insertLen = 0;
+  const chunkSize = 50;
+  const chunkList = filterResult.success.reduce(
+    (acc, cur) => {
+      const lastChunk = acc[acc.length - 1];
+      if (lastChunk.length < chunkSize) {
+        lastChunk.push(cur);
+      } else {
+        acc.push([cur]);
+      }
+      return acc;
+    },
+    [[]] as PushDatasetDataChunkProps[][]
+  );
+  for await (const chunks of chunkList) {
+    insertLen += await insertData(chunks);
+  }
+
+  delete filterResult.success;
+
+  return {
+    insertLen,
+    ...filterResult
+  };
+}
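
pushDataListToTrainingQueue, added to the service layer here, validates the model for the chosen training mode (embedding models get a 1.5x maxToken allowance, QA models 0.8x of maxContext), normalizes q/a text with simpleText, buckets out empty, over-limit, and duplicate chunks, then bulk-inserts the survivors in batches of 50 with a 3-attempt retry. A hedged usage sketch; the ids and the model-registry globals are illustrative, not taken from this commit:

const result = await pushDataListToTrainingQueue({
  teamId,
  tmbId,
  collectionId,
  trainingMode: TrainingModeEnum.chunk,
  vectorModelList: global.vectorModels, // assumed registry of embedding models
  qaModelList: global.llmModels, // assumed registry of chat models
  data: [
    { q: 'What is FastGPT?', a: 'A knowledge-base QA platform.', chunkIndex: 0 }
  ]
});
// result: { insertLen, overToken, repeat, error }; the success bucket is
// deleted before returning, so only rejected chunks are reported back.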

View File

@@ -52,7 +52,7 @@ const BillSchema = new Schema({
});
try {
BillSchema.index({ teamId: 1, tmbId: 1, time: -1 });
BillSchema.index({ teamId: 1, time: -1 });
BillSchema.index({ time: 1 }, { expireAfterSeconds: 180 * 24 * 60 * 60 });
} catch (error) {
console.log(error);
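
Dropping tmbId narrows the compound index to the query the billing list most plausibly runs (all of a team's bills, newest first), while the 180-day TTL index (180 * 24 * 60 * 60 seconds) is unchanged. A hypothetical query the slimmer index serves, assuming the model is exported as MongoBill:

const bills = await MongoBill.find({ teamId })
  .sort({ time: -1 }) // matches { teamId: 1, time: -1 } exactly
  .limit(20);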