4.6.7-alpha commit (#743)

commit c031e6dcc9 (parent 8ee7407c4c)
Author: Archer
Date: 2024-01-19 11:17:28 +08:00 (committed by GitHub)

Co-authored-by: Archer <545436317@qq.com>
Co-authored-by: heheer <71265218+newfish-cmyk@users.noreply.github.com>

324 changed files with 8509 additions and 4757 deletions

View File

@@ -16,7 +16,7 @@ import {
DatasetSearchModeMap,
SearchScoreTypeEnum,
TrainingModeEnum
-} from '@fastgpt/global/core/dataset/constant';
+} from '@fastgpt/global/core/dataset/constants';
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
import { jiebaSplit } from '@/service/common/string/jieba';
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
@@ -24,6 +24,7 @@ import { getVectorsByText } from '@fastgpt/service/core/ai/embedding';
import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection/schema';
import {
DatasetDataSchemaType,
+DatasetDataWithCollectionType,
SearchDataResponseItemType
} from '@fastgpt/global/core/dataset/type';
import { reRankRecall } from '../../ai/rerank';
@@ -38,7 +39,7 @@ import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controll
import { getQAModel, getVectorModel } from '../../ai/model';
import { delay } from '@fastgpt/global/common/system/utils';
-export async function pushDataToDatasetCollection({
+export async function pushDataToTrainingQueue({
teamId,
tmbId,
collectionId,
@@ -222,7 +223,6 @@ export async function insertData2Dataset({
return Promise.reject("teamId and tmbId can't be the same");
}
-const id = new Types.ObjectId();
const qaStr = `${q}\n${a}`.trim();
// empty indexes check, if empty, create default index
@@ -242,17 +242,14 @@ export async function insertData2Dataset({
query: item.text,
model,
teamId,
tmbId,
datasetId,
-collectionId,
-dataId: String(id)
+collectionId
})
)
);
-// create mongo
+// create mongo data
const { _id } = await MongoDatasetData.create({
-_id: id,
teamId,
tmbId,
datasetId,
@@ -269,7 +266,7 @@ export async function insertData2Dataset({
return {
insertId: _id,
-tokens: result.reduce((acc, cur) => acc + cur.tokens, 0)
+charsLength: result.reduce((acc, cur) => acc + cur.charsLength, 0)
};
}
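Billing for vector writes switches here from model tokens to the character length of the embedded text. A minimal TypeScript sketch of the new accumulation, with insertDatasetDataVector reduced to a hypothetical stub (the real helper also writes to the vector store):

    type VectorInsertResult = { insertId: string; charsLength: number };

    // Stub standing in for the real embedding call plus vector-store insert
    async function insertDatasetDataVector(args: {
      query: string;
      model: string;
    }): Promise<VectorInsertResult> {
      return { insertId: Date.now().toString(36), charsLength: args.query.length };
    }

    async function insertIndexes(indexes: { text: string }[], model: string) {
      const result = await Promise.all(
        indexes.map((item) => insertDatasetDataVector({ query: item.text, model }))
      );
      // Sum input characters across all index vectors; this replaces the old token sum
      return { charsLength: result.reduce((acc, cur) => acc + cur.charsLength, 0) };
    }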
@@ -293,7 +290,7 @@ export async function updateData2Dataset({
// patch index and update pg
const mongoData = await MongoDatasetData.findById(dataId);
-if (!mongoData) return Promise.reject('Data not found');
+if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// make sure have one index
if (indexes.length === 0) {
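The rejection message becomes an i18n key rather than a literal string, so the client can localize it. A hedged sketch of the lookup side; the translation table below is invented for illustration, only the key itself comes from the diff:

    // Hypothetical translation table
    const translations: Record<string, string> = {
      'core.dataset.error.Data not found': 'Data not found'
    };

    function t(key: string): string {
      // Fall back to the raw key so untranslated errors still surface
      return translations[key] ?? key;
    }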
@@ -354,6 +351,11 @@ export async function updateData2Dataset({
}
}
+// update mongo updateTime
+mongoData.updateTime = new Date();
+await mongoData.save();
// update vector
const result = await Promise.all(
patchResult.map(async (item) => {
if (item.type === 'create') {
@@ -361,38 +363,42 @@ export async function updateData2Dataset({
query: item.index.text,
model,
teamId: mongoData.teamId,
tmbId: mongoData.tmbId,
datasetId: mongoData.datasetId,
-collectionId: mongoData.collectionId,
-dataId
+collectionId: mongoData.collectionId
});
+item.index.dataId = result.insertId;
return result;
}
if (item.type === 'update' && item.index.dataId) {
-return updateDatasetDataVector({
+const result = await updateDatasetDataVector({
teamId: mongoData.teamId,
datasetId: mongoData.datasetId,
collectionId: mongoData.collectionId,
id: item.index.dataId,
query: item.index.text,
model
});
+item.index.dataId = result.insertId;
+return result;
}
if (item.type === 'delete' && item.index.dataId) {
await deleteDatasetDataVector({
teamId: mongoData.teamId,
id: item.index.dataId
});
return {
-tokens: 0
+charsLength: 0
};
}
return {
-tokens: 0
+charsLength: 0
};
})
);
-const tokens = result.reduce((acc, cur) => acc + cur.tokens, 0);
+const charsLength = result.reduce((acc, cur) => acc + cur.charsLength, 0);
-// update mongo
+// update mongo other data
mongoData.q = q || mongoData.q;
mongoData.a = a ?? mongoData.a;
mongoData.fullTextToken = jiebaSplit({ text: mongoData.q + mongoData.a });
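After this change the vector store no longer receives a mongo dataId; instead, each index keeps the id returned by the vector store (item.index.dataId = result.insertId). A self-contained sketch of the three patch branches, with the vector helpers stubbed out under assumed result shapes:

    type VectorResult = { insertId: string; charsLength: number };
    type PatchItem = {
      type: 'create' | 'update' | 'delete';
      index: { text: string; dataId?: string };
    };

    // Stubs standing in for insertDatasetDataVector / updateDatasetDataVector /
    // deleteDatasetDataVector; the real helpers talk to the vector store
    const insertVector = async (text: string): Promise<VectorResult> => ({
      insertId: Date.now().toString(36),
      charsLength: text.length
    });
    const updateVector = async (id: string, text: string): Promise<VectorResult> => ({
      insertId: id,
      charsLength: text.length
    });
    const deleteVector = async (id: string): Promise<void> => {};

    async function applyPatch(item: PatchItem): Promise<{ charsLength: number }> {
      if (item.type === 'create') {
        const result = await insertVector(item.index.text);
        item.index.dataId = result.insertId; // link the mongo index to its vector
        return result;
      }
      if (item.type === 'update' && item.index.dataId) {
        const result = await updateVector(item.index.dataId, item.index.text);
        item.index.dataId = result.insertId; // an update may re-insert, so refresh the id
        return result;
      }
      if (item.type === 'delete' && item.index.dataId) {
        await deleteVector(item.index.dataId);
      }
      return { charsLength: 0 }; // deletes and no-ops add nothing to the bill
    }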
@@ -401,11 +407,12 @@ export async function updateData2Dataset({
await mongoData.save();
return {
-tokens
+charsLength
};
}
export async function searchDatasetData(props: {
+teamId: string;
model: string;
similarity?: number; // min distance
limit: number; // max Token limit
@@ -416,6 +423,7 @@ export async function searchDatasetData(props: {
queries: string[];
}) {
let {
+teamId,
rawQuery,
queries,
model,
@@ -460,7 +468,7 @@ export async function searchDatasetData(props: {
};
};
const embeddingRecall = async ({ query, limit }: { query: string; limit: number }) => {
-const { vectors, tokens } = await getVectorsByText({
+const { vectors, charsLength } = await getVectorsByText({
model,
input: query
});
@@ -472,41 +480,45 @@ export async function searchDatasetData(props: {
});
// get q and a
-const [collections, dataList] = await Promise.all([
-MongoDatasetCollection.find(
-{
-_id: { $in: results.map((item) => item.collectionId) }
-},
-'name fileId rawLink'
-).lean(),
-MongoDatasetData.find(
-{
-_id: { $in: results.map((item) => item.dataId?.trim()) }
-},
-'datasetId collectionId q a chunkIndex indexes'
-).lean()
-]);
+const dataList = (await MongoDatasetData.find(
+{
+teamId,
+datasetId: { $in: datasetIds },
+'indexes.dataId': { $in: results.map((item) => item.id?.trim()) }
+},
+'datasetId collectionId q a chunkIndex indexes'
+)
+.populate('collectionId', 'name fileId rawLink')
+.lean()) as DatasetDataWithCollectionType[];
-const formatResult = results
-.map((item, index) => {
-const collection = collections.find(
-(collection) => String(collection._id) === item.collectionId
-);
-const data = dataList.find((data) => String(data._id) === item.dataId);
+// add score to data(It's already sorted. The first one is the one with the most points)
+const concatResults = dataList.map((data) => {
+const dataIdList = data.indexes.map((item) => item.dataId);
-// if collection or data UnExist, the relational mongo data already deleted
-if (!collection || !data) return null;
+const maxScoreResult = results.find((item) => {
+return dataIdList.includes(item.id);
+});
+return {
+...data,
+score: maxScoreResult?.score || 0
+};
+});
+concatResults.sort((a, b) => b.score - a.score);
+const formatResult = concatResults
+.map((data, index) => {
const result: SearchDataResponseItemType = {
id: String(data._id),
q: data.q,
a: data.a,
chunkIndex: data.chunkIndex,
datasetId: String(data.datasetId),
-collectionId: String(data.collectionId),
-sourceName: collection.name || '',
-sourceId: collection?.fileId || collection?.rawLink,
-score: [{ type: SearchScoreTypeEnum.embedding, value: item.score, index }]
+collectionId: String(data.collectionId._id),
+sourceName: data.collectionId.name || '',
+sourceId: data.collectionId?.fileId || data.collectionId?.rawLink,
+score: [{ type: SearchScoreTypeEnum.embedding, value: data.score, index }]
};
return result;
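The recall path now looks mongo data up by index-level vector ids (indexes.dataId) and pulls the collection in via populate, instead of two parallel finds keyed on the vector store's dataId. Each row then takes the score of its best-ranked index; because the vector store returns results sorted by score descending, the first match found is the maximum. A sketch of that merge with simplified stand-in types:

    type VectorHit = { id: string; score: number };
    type DataRow = { _id: string; indexes: { dataId: string }[] };

    // hits must arrive sorted best-first, as the vector recall above guarantees
    function mergeScores<T extends DataRow>(dataList: T[], hits: VectorHit[]) {
      return dataList
        .map((data) => {
          const dataIdList = data.indexes.map((i) => i.dataId);
          const best = hits.find((h) => dataIdList.includes(h.id)); // first match = max score
          return { ...data, score: best?.score ?? 0 };
        })
        .sort((a, b) => b.score - a.score);
    }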
@@ -515,7 +527,7 @@ export async function searchDatasetData(props: {
return {
embeddingRecallResults: formatResult,
-tokens
+charsLength
};
};
const fullTextRecall = async ({
@@ -540,6 +552,7 @@ export async function searchDatasetData(props: {
datasetIds.map((id) =>
MongoDatasetData.find(
{
+teamId,
datasetId: id,
$text: { $search: jiebaSplit({ text: query }) }
},
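teamId now appears in every mongo filter on dataset data, so recall cannot cross a team boundary even if dataset ids leak. A minimal sketch of the scoped full-text filter; the tokens argument stands for the space-joined output of jiebaSplit used in the diff:

    function buildFullTextFilter(teamId: string, datasetId: string, tokens: string) {
      return {
        teamId, // tenant guard, applied alongside the $text match
        datasetId,
        $text: { $search: tokens }
      };
    }

If the collection carries a compound index that leads with teamId, this filter shape should also let mongo use it, though that index layout is an assumption here.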
@@ -688,26 +701,27 @@ export async function searchDatasetData(props: {
// multi query recall
const embeddingRecallResList: SearchDataResponseItemType[][] = [];
const fullTextRecallResList: SearchDataResponseItemType[][] = [];
-let embTokens = 0;
+let totalCharsLength = 0;
for await (const query of queries) {
-const [{ tokens, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([
-embeddingRecall({
-query,
-limit: embeddingLimit
-}),
-fullTextRecall({
-query,
-limit: fullTextLimit
-})
-]);
-embTokens += tokens;
+const [{ charsLength, embeddingRecallResults }, { fullTextRecallResults }] =
+await Promise.all([
+embeddingRecall({
+query,
+limit: embeddingLimit
+}),
+fullTextRecall({
+query,
+limit: fullTextLimit
+})
+]);
+totalCharsLength += charsLength;
embeddingRecallResList.push(embeddingRecallResults);
fullTextRecallResList.push(fullTextRecallResults);
}
return {
-tokens: embTokens,
+charsLength: totalCharsLength,
embeddingRecallResults: embeddingRecallResList[0],
fullTextRecallResults: fullTextRecallResList[0]
};
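Per-query charges are summed across all rewritten queries, while only the first query's recall lists are returned, mirroring the loop above. A compact sketch of that shape, with the recall function injected so it runs standalone:

    async function recallAll(
      queries: string[],
      recallOne: (query: string) => Promise<{ charsLength: number; items: string[] }>
    ) {
      let totalCharsLength = 0;
      const lists: string[][] = [];
      for (const query of queries) {
        const { charsLength, items } = await recallOne(query);
        totalCharsLength += charsLength; // every query is billed...
        lists.push(items);
      }
      return { charsLength: totalCharsLength, items: lists[0] }; // ...but only the first list is returned
    }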
@@ -778,7 +792,7 @@ export async function searchDatasetData(props: {
const { embeddingLimit, fullTextLimit } = countRecallLimit();
// recall
-const { embeddingRecallResults, fullTextRecallResults, tokens } = await multiQueryRecall({
+const { embeddingRecallResults, fullTextRecallResults, charsLength } = await multiQueryRecall({
embeddingLimit,
fullTextLimit
});
@@ -851,7 +865,7 @@ export async function searchDatasetData(props: {
return {
searchRes: filterResultsByMaxTokens(scoreFilter, maxTokens),
-tokens,
+charsLength,
searchMode,
limit: maxTokens,
similarity,

View File

@@ -4,18 +4,21 @@ import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema';
* Same value judgment
*/
export async function hasSameValue({
+teamId,
collectionId,
q,
a = ''
}: {
+teamId: string;
collectionId: string;
q: string;
a?: string;
}) {
const count = await MongoDatasetData.countDocuments({
+teamId,
+collectionId,
q,
-a,
-collectionId
+a
});
if (count > 0) {
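The duplicate check is now team-scoped as well. A usage sketch with the count function injected so it runs standalone; the rejection message is illustrative, not from this hunk:

    async function assertNoDuplicate(
      countDocuments: (filter: Record<string, string>) => Promise<number>,
      args: { teamId: string; collectionId: string; q: string; a?: string }
    ) {
      const { teamId, collectionId, q, a = '' } = args;
      const count = await countDocuments({ teamId, collectionId, q, a });
      if (count > 0) {
        return Promise.reject('already exists'); // illustrative message
      }
    }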