Mirror of https://github.com/labring/FastGPT.git (synced 2025-08-02 12:48:30 +00:00)

4.6.7-alpha commit (#743)

Co-authored-by: Archer <545436317@qq.com>
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>
@@ -16,7 +16,7 @@ import {
   DatasetSearchModeMap,
   SearchScoreTypeEnum,
   TrainingModeEnum
-} from '@fastgpt/global/core/dataset/constant';
+} from '@fastgpt/global/core/dataset/constants';
 import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
 import { jiebaSplit } from '@/service/common/string/jieba';
 import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
@@ -24,6 +24,7 @@ import { getVectorsByText } from '@fastgpt/service/core/ai/embedding';
 import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
 import {
   DatasetDataSchemaType,
+  DatasetDataWithCollectionType,
   SearchDataResponseItemType
 } from '@fastgpt/global/core/dataset/type';
 import { reRankRecall } from '../../ai/rerank';
@@ -38,7 +39,7 @@ import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controll
 import { getQAModel, getVectorModel } from '../../ai/model';
 import { delay } from '@fastgpt/global/common/system/utils';

-export async function pushDataToDatasetCollection({
+export async function pushDataToTrainingQueue({
   teamId,
   tmbId,
   collectionId,
@@ -222,7 +223,6 @@ export async function insertData2Dataset({
     return Promise.reject("teamId and tmbId can't be the same");
   }

-  const id = new Types.ObjectId();
   const qaStr = `${q}\n${a}`.trim();

   // empty indexes check, if empty, create default index
@@ -242,17 +242,14 @@ export async function insertData2Dataset({
         query: item.text,
         model,
         teamId,
         tmbId,
         datasetId,
-        collectionId,
-        dataId: String(id)
+        collectionId
       })
     )
   );

-  // create mongo
+  // create mongo data
   const { _id } = await MongoDatasetData.create({
-    _id: id,
     teamId,
     tmbId,
     datasetId,
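
Note: the pre-allocated ObjectId and the `dataId: String(id)` pass-through are gone, so vector rows no longer point back at the mongo record. Judging by the recall changes further down, the linkage is inverted: each index entry on the mongo record stores the vector row's `insertId`. A minimal self-contained sketch of that flow, where `VectorInsertResult`, `embedAndLink`, and `insertVector` are stand-ins rather than the repo's API:

```ts
type VectorInsertResult = { insertId: string; charsLength: number };

type DatasetDataIndex = {
  text: string;
  dataId?: string; // now filled with the vector row's insertId after embedding
};

// Hypothetical embed-then-link step mirroring the new insertData2Dataset order:
// vectors are written first, the mongo record second, pointers stored mongo-side.
async function embedAndLink(
  indexes: DatasetDataIndex[],
  insertVector: (text: string) => Promise<VectorInsertResult>
) {
  const results = await Promise.all(indexes.map((item) => insertVector(item.text)));
  results.forEach((result, i) => {
    indexes[i].dataId = result.insertId; // mongo index -> vector row pointer
  });
  // billing switches from model tokens to raw character length
  const charsLength = results.reduce((acc, cur) => acc + cur.charsLength, 0);
  return { indexes, charsLength };
}
```
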
@@ -269,7 +266,7 @@ export async function insertData2Dataset({

   return {
     insertId: _id,
-    tokens: result.reduce((acc, cur) => acc + cur.tokens, 0)
+    charsLength: result.reduce((acc, cur) => acc + cur.charsLength, 0)
   };
 }

@@ -293,7 +290,7 @@ export async function updateData2Dataset({

   // patch index and update pg
   const mongoData = await MongoDatasetData.findById(dataId);
-  if (!mongoData) return Promise.reject('Data not found');
+  if (!mongoData) return Promise.reject('core.dataset.error.Data not found');

   // make sure have one index
   if (indexes.length === 0) {
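
Note: the rejection message changes from a plain English string to what looks like an i18n key (`core.dataset.error.Data not found`), presumably resolved to a localized message in the web client; the diff itself does not show the consumer. A guess at the client-side lookup, purely illustrative:

```ts
// Illustrative only; the real translation table lives in the web client.
const translations: Record<string, string> = {
  'core.dataset.error.Data not found': 'Data not found'
};

const toUserMessage = (err: unknown): string =>
  typeof err === 'string' ? translations[err] ?? err : 'Unknown error';
```
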
@@ -354,6 +351,11 @@ export async function updateData2Dataset({
     }
   }

+  // update mongo updateTime
+  mongoData.updateTime = new Date();
+  await mongoData.save();
+
   // update vector
   const result = await Promise.all(
     patchResult.map(async (item) => {
       if (item.type === 'create') {
@@ -361,38 +363,42 @@ export async function updateData2Dataset({
           query: item.index.text,
           model,
           teamId: mongoData.teamId,
           tmbId: mongoData.tmbId,
           datasetId: mongoData.datasetId,
-          collectionId: mongoData.collectionId,
-          dataId
+          collectionId: mongoData.collectionId
         });
+        item.index.dataId = result.insertId;
         return result;
       }
       if (item.type === 'update' && item.index.dataId) {
-        return updateDatasetDataVector({
+        const result = await updateDatasetDataVector({
           teamId: mongoData.teamId,
           datasetId: mongoData.datasetId,
           collectionId: mongoData.collectionId,
           id: item.index.dataId,
           query: item.index.text,
           model
         });
+        item.index.dataId = result.insertId;
+        return result;
       }
       if (item.type === 'delete' && item.index.dataId) {
         await deleteDatasetDataVector({
           teamId: mongoData.teamId,
           id: item.index.dataId
         });
         return {
-          tokens: 0
+          charsLength: 0
         };
       }
       return {
-        tokens: 0
+        charsLength: 0
       };
     })
   );

-  const tokens = result.reduce((acc, cur) => acc + cur.tokens, 0);
+  const charsLength = result.reduce((acc, cur) => acc + cur.charsLength, 0);

-  // update mongo
+  // update mongo other data
   mongoData.q = q || mongoData.q;
   mongoData.a = a ?? mongoData.a;
   mongoData.fullTextToken = jiebaSplit({ text: mongoData.q + mongoData.a });
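
Note: both the `create` and `update` branches now write the returned `insertId` back into `item.index.dataId` before `mongoData.save()`, which suggests an update re-inserts the vector row under a new id rather than mutating it in place (an inference, not stated in the diff). A self-contained sketch of the dispatch, with stand-in types and operations:

```ts
// Stand-in types; the real PatchItem/vector ops live in the repo.
type PatchItem = {
  type: 'create' | 'update' | 'delete';
  index: { text: string; dataId?: string };
};

type VectorResult = { insertId: string; charsLength: number };

async function applyPatch(
  item: PatchItem,
  ops: {
    insert: (text: string) => Promise<VectorResult>;
    update: (id: string, text: string) => Promise<VectorResult>;
    remove: (id: string) => Promise<void>;
  }
): Promise<{ charsLength: number }> {
  if (item.type === 'create') {
    const result = await ops.insert(item.index.text);
    item.index.dataId = result.insertId; // point the mongo index at the new row
    return result;
  }
  if (item.type === 'update' && item.index.dataId) {
    const result = await ops.update(item.index.dataId, item.index.text);
    item.index.dataId = result.insertId; // the id may change on re-embedding
    return result;
  }
  if (item.type === 'delete' && item.index.dataId) {
    await ops.remove(item.index.dataId);
  }
  return { charsLength: 0 }; // deletes and no-ops bill nothing
}
```
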
@@ -401,11 +407,12 @@ export async function updateData2Dataset({
   await mongoData.save();

   return {
-    tokens
+    charsLength
   };
 }

 export async function searchDatasetData(props: {
+  teamId: string;
   model: string;
   similarity?: number; // min distance
   limit: number; // max Token limit
@@ -416,6 +423,7 @@ export async function searchDatasetData(props: {
   queries: string[];
 }) {
   let {
+    teamId,
     rawQuery,
     queries,
     model,
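
Note: `teamId` is now threaded through `searchDatasetData`'s props and destructuring, and (in the hunks below) into both the embedding and full-text recall queries, so every recall is tenant-scoped at the database level. A sketch of the filter shape the embedding recall now builds, where `embeddingRecallFilter` and `hitIds` are illustrative names:

```ts
// Stand-in sketch of the tenant-scoped recall filter (field names from the diff).
type RecallFilter = Record<string, unknown>;

function embeddingRecallFilter(
  teamId: string,
  datasetIds: string[],
  hitIds: string[]
): RecallFilter {
  return {
    teamId, // added by this commit: scopes the query to one team
    datasetId: { $in: datasetIds },
    'indexes.dataId': { $in: hitIds } // map vector-store hits back to mongo rows
  };
}
```
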
@@ -460,7 +468,7 @@ export async function searchDatasetData(props: {
     };
   };
   const embeddingRecall = async ({ query, limit }: { query: string; limit: number }) => {
-    const { vectors, tokens } = await getVectorsByText({
+    const { vectors, charsLength } = await getVectorsByText({
       model,
       input: query
     });
@@ -472,41 +480,45 @@ export async function searchDatasetData(props: {
     });

     // get q and a
-    const [collections, dataList] = await Promise.all([
-      MongoDatasetCollection.find(
-        {
-          _id: { $in: results.map((item) => item.collectionId) }
-        },
-        'name fileId rawLink'
-      ).lean(),
-      MongoDatasetData.find(
-        {
-          _id: { $in: results.map((item) => item.dataId?.trim()) }
-        },
-        'datasetId collectionId q a chunkIndex indexes'
-      ).lean()
-    ]);
+    const dataList = (await MongoDatasetData.find(
+      {
+        teamId,
+        datasetId: { $in: datasetIds },
+        'indexes.dataId': { $in: results.map((item) => item.id?.trim()) }
+      },
+      'datasetId collectionId q a chunkIndex indexes'
+    )
+      .populate('collectionId', 'name fileId rawLink')
+      .lean()) as DatasetDataWithCollectionType[];

-    const formatResult = results
-      .map((item, index) => {
-        const collection = collections.find(
-          (collection) => String(collection._id) === item.collectionId
-        );
-        const data = dataList.find((data) => String(data._id) === item.dataId);
-
-        // if collection or data UnExist, the relational mongo data already deleted
-        if (!collection || !data) return null;
+    // add score to data(It's already sorted. The first one is the one with the most points)
+    const concatResults = dataList.map((data) => {
+      const dataIdList = data.indexes.map((item) => item.dataId);
+
+      const maxScoreResult = results.find((item) => {
+        return dataIdList.includes(item.id);
+      });
+
+      return {
+        ...data,
+        score: maxScoreResult?.score || 0
+      };
+    });
+
+    concatResults.sort((a, b) => b.score - a.score);
+
+    const formatResult = concatResults
+      .map((data, index) => {
         const result: SearchDataResponseItemType = {
           id: String(data._id),
           q: data.q,
           a: data.a,
           chunkIndex: data.chunkIndex,
           datasetId: String(data.datasetId),
-          collectionId: String(data.collectionId),
-          sourceName: collection.name || '',
-          sourceId: collection?.fileId || collection?.rawLink,
-          score: [{ type: SearchScoreTypeEnum.embedding, value: item.score, index }]
+          collectionId: String(data.collectionId._id),
+          sourceName: data.collectionId.name || '',
+          sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
+          score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
         };

         return result;
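
Note: one data record can carry several indexes, so several vector hits may map to the same mongo row. The rewritten recall collapses them: each row takes the score of its best hit (the hit list arrives sorted best-first), then the rows are re-sorted by that score. Querying on `'indexes.dataId'` with a single populated `find` also replaces the old two-query join against `MongoDatasetCollection`. A self-contained sketch of the collapse step, with stand-in types:

```ts
type VectorHit = { id: string; score: number }; // already sorted best-first
type DataRow = { _id: string; indexes: { dataId: string }[] };

function scoreRows(rows: DataRow[], hits: VectorHit[]) {
  const scored = rows.map((row) => {
    const dataIdList = row.indexes.map((i) => i.dataId);
    // hits are sorted descending, so the first match is the row's best score
    const best = hits.find((h) => dataIdList.includes(h.id));
    return { ...row, score: best?.score ?? 0 };
  });
  scored.sort((a, b) => b.score - a.score);
  return scored;
}
```
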
@@ -515,7 +527,7 @@ export async function searchDatasetData(props: {

     return {
       embeddingRecallResults: formatResult,
-      tokens
+      charsLength
     };
   };
   const fullTextRecall = async ({
@@ -540,6 +552,7 @@ export async function searchDatasetData(props: {
       datasetIds.map((id) =>
         MongoDatasetData.find(
           {
+            teamId,
             datasetId: id,
             $text: { $search: jiebaSplit({ text: query }) }
           },
@@ -688,26 +701,27 @@ export async function searchDatasetData(props: {
     // multi query recall
     const embeddingRecallResList: SearchDataResponseItemType[][] = [];
     const fullTextRecallResList: SearchDataResponseItemType[][] = [];
-    let embTokens = 0;
+    let totalCharsLength = 0;
     for await (const query of queries) {
-      const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
-        embeddingRecall({
-          query,
-          limit: embeddingLimit
-        }),
-        fullTextRecall({
-          query,
-          limit: fullTextLimit
-        })
-      ]);
-      embTokens += tokens;
+      const [{ charsLength, embeddingRecallResults }, { fullTextRecallResults }] =
+        await Promise.all([
+          embeddingRecall({
+            query,
+            limit: embeddingLimit
+          }),
+          fullTextRecall({
+            query,
+            limit: fullTextLimit
+          })
+        ]);
+      totalCharsLength += charsLength;

       embeddingRecallResList.push(embeddingRecallResults);
       fullTextRecallResList.push(fullTextRecallResults);
     }

     return {
-      tokens: embTokens,
+      charsLength: totalCharsLength,
       embeddingRecallResults: embeddingRecallResList[0],
       fullTextRecallResults: fullTextRecallResList[0]
     };
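
Note: the accumulator renames from `embTokens` to `totalCharsLength`, summing embedding character usage across all rewritten queries; as before, only the first query's recall lists are returned from this helper. A stub sketch of the per-query accounting pattern, with stand-in recall functions:

```ts
type RecallItem = { id: string; score: number };

async function multiQueryRecallSketch(
  queries: string[],
  embeddingRecall: (q: string) => Promise<{ charsLength: number; results: RecallItem[] }>,
  fullTextRecall: (q: string) => Promise<{ results: RecallItem[] }>
) {
  const embeddingResList: RecallItem[][] = [];
  const fullTextResList: RecallItem[][] = [];
  let totalCharsLength = 0;

  for (const query of queries) {
    // both recall paths run concurrently for each query
    const [emb, fts] = await Promise.all([embeddingRecall(query), fullTextRecall(query)]);
    totalCharsLength += emb.charsLength; // only embedding calls bill characters
    embeddingResList.push(emb.results);
    fullTextResList.push(fts.results);
  }

  return { charsLength: totalCharsLength, embeddingResList, fullTextResList };
}
```
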
@@ -778,7 +792,7 @@ export async function searchDatasetData(props: {
   const { embeddingLimit, fullTextLimit } = countRecallLimit();

   // recall
-  const { embeddingRecallResults, fullTextRecallResults, tokens } = await multiQueryRecall({
+  const { embeddingRecallResults, fullTextRecallResults, charsLength } = await multiQueryRecall({
     embeddingLimit,
     fullTextLimit
   });
@@ -851,7 +865,7 @@ export async function searchDatasetData(props: {

   return {
     searchRes: filterResultsByMaxTokens(scoreFilter, maxTokens),
-    tokens,
+    charsLength,
     searchMode,
     limit: maxTokens,
     similarity,
@@ -4,18 +4,21 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
 * Same value judgment
 */
 export async function hasSameValue({
+  teamId,
   collectionId,
   q,
   a = ''
 }: {
+  teamId: string;
   collectionId: string;
   q: string;
   a?: string;
 }) {
   const count = await MongoDatasetData.countDocuments({
+    teamId,
+    collectionId,
     q,
-    a,
-    collectionId
+    a
   });

   if (count > 0) {
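
Note: the duplicate probe is now team-scoped too, and the filter keys are reordered to lead with `teamId`/`collectionId`, plausibly so the query can use a team-first compound index (an inference, not stated in the diff). A minimal sketch of the equivalent probe, with a stand-in model:

```ts
import type { Model } from 'mongoose';

// Stand-in duplicate probe; MongoDatasetData's real schema lives in the repo.
async function countSameValue(
  MongoDatasetData: Model<Record<string, unknown>>,
  args: { teamId: string; collectionId: string; q: string; a?: string }
): Promise<number> {
  const { teamId, collectionId, q, a = '' } = args;
  return MongoDatasetData.countDocuments({ teamId, collectionId, q, a });
}
```
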