perf: vector generate (#1748)

Archer
2024-06-12 16:42:46 +08:00
committed by GitHub
parent d0085a23e6
commit 05611df056
2 changed files with 109 additions and 109 deletions

View File

@@ -81,14 +81,18 @@ async function handler(
   });
   // get 10 init dataset.data
-  const arr = new Array(10).fill(0);
+  const max = global.systemEnv?.vectorMaxProcess || 10;
+  const arr = new Array(max * 2).fill(0);
   for await (const _ of arr) {
-    await mongoSessionRun(async (session) => {
+    try {
+      const hasNext = await mongoSessionRun(async (session) => {
+        // get next dataset.data
         const data = await MongoDatasetData.findOneAndUpdate(
           {
+            rebuilding: true,
             teamId,
-            datasetId,
-            rebuilding: true
+            datasetId
           },
           {
             $unset: {
@@ -124,7 +128,14 @@ async function handler(
             }
           );
         }
+        return !!data;
       });
+      if (!hasNext) {
+        break;
+      }
+    } catch (error) {}
   }
   return {};
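The first file changes the handler that seeds the rebuild queue: instead of always claiming 10 documents, it now claims up to vectorMaxProcess * 2, wraps each claim in a try/catch, and breaks as soon as findOneAndUpdate finds nothing left to rebuild. Below is a minimal standalone sketch of that claim-and-break pattern using plain mongoose rather than FastGPT's mongoSessionRun/MongoDatasetData helpers; the model, collection name, and enqueueTraining are hypothetical, and transactions assume MongoDB is running as a replica set.

import mongoose, { Schema, model } from 'mongoose';

// Hypothetical stand-in for MongoDatasetData: documents still flagged
// `rebuilding: true` need their vectors regenerated.
const DatasetData = model(
  'dataset_data_sketch',
  new Schema({ teamId: String, datasetId: String, rebuilding: Boolean })
);

// Hypothetical helper: create one training-queue record for a claimed document.
async function enqueueTraining(data: unknown, session: mongoose.ClientSession) {
  // ...create the training record here, passing { session }
}

// Seed up to max * 2 rebuild jobs. Each iteration atomically claims one
// document inside a transaction, so concurrent workers running the same
// loop never pick the same document twice.
async function seedRebuildQueue(teamId: string, datasetId: string, max = 10) {
  for (let i = 0; i < max * 2; i++) {
    try {
      const session = await mongoose.startSession();
      let hasNext = false;
      try {
        await session.withTransaction(async () => {
          // Claim the next rebuilding document and clear its flag in one step.
          const data = await DatasetData.findOneAndUpdate(
            { rebuilding: true, teamId, datasetId },
            { $unset: { rebuilding: null } },
            { session, new: true }
          );
          if (data) {
            await enqueueTraining(data, session);
          }
          hasNext = !!data;
        });
      } finally {
        await session.endSession();
      }
      if (!hasNext) break; // queue drained, stop early
    } catch (error) {
      // ignore and try the next slot, mirroring the empty catch in the diff
    }
  }
}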

View File

@@ -158,43 +158,8 @@ const rebuildData = async ({
   const deleteVectorIdList = mongoData.indexes.map((index) => index.dataId);
-  const { tokens } = await mongoSessionRun(async (session) => {
-    // update vector, update dataset.data rebuilding status, delete data from training
-    const updateResult = await Promise.all(
-      mongoData.indexes.map(async (index, i) => {
-        const result = await insertDatasetDataVector({
-          query: index.text,
-          model: getVectorModel(trainingData.model),
-          teamId: mongoData.teamId,
-          datasetId: mongoData.datasetId,
-          collectionId: mongoData.collectionId
-        });
-        mongoData.indexes[i].dataId = result.insertId;
-        return result;
-      })
-    );
-    // Ensure that the training data is deleted after the Mongo update is successful
-    await mongoData.save({ session });
-    await trainingData.deleteOne({ session });
-    // delete old vector
-    await deleteDatasetDataVector({
-      teamId: mongoData.teamId,
-      idList: deleteVectorIdList
-    });
-    return {
-      tokens: updateResult.reduce((acc, cur) => acc + cur.tokens, 0)
-    };
-  });
-  // find next data insert to training queue
-  const arr = new Array(5).fill(0);
-  for await (const _ of arr) {
-    try {
-      const hasNextData = await mongoSessionRun(async (session) => {
-        // get new mongoData insert to training
-        const newRebuildingData = await MongoDatasetData.findOneAndUpdate(
-          {
+  // Find next rebuilding data to insert training queue
+  await mongoSessionRun(async (session) => {
+    // get new mongoData insert to training
+    const newRebuildingData = await MongoDatasetData.findOneAndUpdate(
+      {
@@ -232,15 +197,39 @@
         { session }
       );
     }
-        return !!newRebuildingData;
   });
-      if (!hasNextData) {
-        break;
-      }
-    } catch (error) {}
-  }
+  // update vector, update dataset_data rebuilding status, delete data from training
+  // 1. Insert new vector to dataset_data
+  const updateResult = await Promise.all(
+    mongoData.indexes.map(async (index, i) => {
+      const result = await insertDatasetDataVector({
+        query: index.text,
+        model: getVectorModel(trainingData.model),
+        teamId: mongoData.teamId,
+        datasetId: mongoData.datasetId,
+        collectionId: mongoData.collectionId
+      });
+      mongoData.indexes[i].dataId = result.insertId;
+      return result;
+    })
+  );
+  const { tokens } = await mongoSessionRun(async (session) => {
+    // 2. Ensure that the training data is deleted after the Mongo update is successful
+    await mongoData.save({ session });
+    // 3. Delete the training data
+    await trainingData.deleteOne({ session });
+    // 4. Delete old vector
+    await deleteDatasetDataVector({
+      teamId: mongoData.teamId,
+      idList: deleteVectorIdList
+    });
+    return {
+      tokens: updateResult.reduce((acc, cur) => acc + cur.tokens, 0)
+    };
+  });
   return { tokens };
 };
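The second file reorders rebuildData: claiming the next rebuilding document gets its own small transaction up front (replacing the old five-iteration loop), the slow insertDatasetDataVector calls run outside any transaction, and only the cheap bookkeeping — saving the updated indexes, deleting the training record, dropping the old vectors — stays inside mongoSessionRun. Below is a rough sketch of that final shape; the document interfaces and the insertVector/deleteVectors helpers are hypothetical placeholders for the real vector-store functions.

import mongoose from 'mongoose';

type DataIndex = { text: string; dataId: string };
type VectorResult = { insertId: string; tokens: number };

// Hypothetical vector-store helpers standing in for insertDatasetDataVector /
// deleteDatasetDataVector.
async function insertVector(text: string): Promise<VectorResult> {
  // ...call the embedding model and write the vector row here
  return { insertId: `vec_${Math.random().toString(36).slice(2)}`, tokens: text.length };
}
async function deleteVectors(idList: string[]): Promise<void> {
  // ...drop the old vector rows here
}

interface RebuildableDoc {
  indexes: DataIndex[];
  save(opts?: { session?: mongoose.ClientSession }): Promise<unknown>;
}
interface TrainingDoc {
  deleteOne(opts?: { session?: mongoose.ClientSession }): Promise<unknown>;
}

async function rebuildOne(mongoData: RebuildableDoc, trainingData: TrainingDoc) {
  // Remember the old vector ids so they can be dropped at the end.
  const deleteVectorIdList = mongoData.indexes.map((index) => index.dataId);

  // 1. Insert the new vectors first, outside any transaction: this is the
  //    slow external call, so keeping it out keeps the transaction short.
  const updateResult = await Promise.all(
    mongoData.indexes.map(async (index, i) => {
      const result = await insertVector(index.text);
      mongoData.indexes[i].dataId = result.insertId;
      return result;
    })
  );

  // 2-4. Short transaction: persist the new index ids, delete the training
  //      record, then drop the old vectors.
  const session = await mongoose.startSession();
  try {
    await session.withTransaction(async () => {
      await mongoData.save({ session });
      await trainingData.deleteOne({ session });
      await deleteVectors(deleteVectorIdList);
    });
  } finally {
    await session.endSession();
  }

  return { tokens: updateResult.reduce((acc, cur) => acc + cur.tokens, 0) };
}

The trade-off of moving the vector inserts out of the transaction is that a failure before the Mongo save can leave orphaned vectors; in exchange, the transaction window stays short while the embedding calls run.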