rrf_weight: replace the per-list RRF `k` parameter with a score `weight` (fixed k = 60) (#5551)

Co-authored-by: xxYyh <xxyyh@xxYyhdeMacBook-Pro.local>
2025-10-13 14:29:40 +00:00 · 2025-08-29 00:54:29 +08:00
parent 486d791b94
commit e4756c76dd
4 changed files with 16 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -37,4 +37,6 @@ files/helm/fastgpt/charts/*.tgz

 tmp/
 coverage
-document/.source
+document/.source
+
+bun.lock
--- a/packages/global/core/dataset/search/utils.ts
+++ b/packages/global/core/dataset/search/utils.ts
@@ -3,7 +3,7 @@ import { type SearchDataResponseItemType } from '../type';

 /* dataset search result concat */
 export const datasetSearchResultConcat = (
-  arr: { k: number; list: SearchDataResponseItemType[] }[]
+  arr: { weight: number; list: SearchDataResponseItemType[] }[]
 ): SearchDataResponseItemType[] => {
  arr = arr.filter((item) => item.list.length > 0);

@@ -14,12 +14,11 @@ export const datasetSearchResultConcat = (

  // rrf
  arr.forEach((item) => {
-    const k = item.k;
+    const weight = item.weight;

    item.list.forEach((data, index) => {
      const rank = index + 1;
-      const score = 1 / (k + rank);
-
+      const score = (weight * 1) / (60 + rank);
      const record = map.get(data.id);
      if (record) {
        // 合并两个score,有相同type的score,取最大值
--- a/packages/service/core/dataset/search/controller.ts
+++ b/packages/service/core/dataset/search/controller.ts
@@ -784,10 +784,10 @@ export async function searchDatasetData(

    // rrf concat
    const rrfEmbRecall = datasetSearchResultConcat(
-      embeddingRecallResults.map((list) => ({ k: 60, list }))
+      embeddingRecallResults.map((list) => ({ weight: 1, list }))
    ).slice(0, embeddingLimit);
    const rrfFTRecall = datasetSearchResultConcat(
-      fullTextRecallResults.map((list) => ({ k: 60, list }))
+      fullTextRecallResults.map((list) => ({ weight: 1, list }))
    ).slice(0, fullTextLimit);

    return {
@@ -850,24 +850,22 @@ export async function searchDatasetData(
  })();

  // embedding recall and fullText recall rrf concat
-  const baseK = 120;
-  const embK = Math.round(baseK * (1 - embeddingWeight)); // 搜索结果的 k 值
-  const fullTextK = Math.round(baseK * embeddingWeight); // rerank 结果的 k 值
+  const embWeight = embeddingWeight; // 向量索引的 weight 大小
+  const fullTextWeight = 1 - embeddingWeight; // 全文索引的 weight 大小

  const rrfSearchResult = datasetSearchResultConcat([
-    { k: embK, list: embeddingRecallResults },
-    { k: fullTextK, list: fullTextRecallResults }
+    { weight: embWeight, list: embeddingRecallResults },
+    { weight: fullTextWeight, list: fullTextRecallResults }
  ]);
  const rrfConcatResults = (() => {
    if (reRankResults.length === 0) return rrfSearchResult;
    if (rerankWeight === 1) return reRankResults;

-    const searchK = Math.round(baseK * rerankWeight); // 搜索结果的 k 值
-    const rerankK = Math.round(baseK * (1 - rerankWeight)); // rerank 结果的 k 值
+    const searchWeight = 1 - rerankWeight; // 搜索结果的 weight 大小

    return datasetSearchResultConcat([
-      { k: searchK, list: rrfSearchResult },
-      { k: rerankK, list: reRankResults }
+      { weight: searchWeight, list: rrfSearchResult },
+      { weight: rerankWeight, list: reRankResults }
    ]);
  })();

--- a/packages/service/core/workflow/dispatch/dataset/concat.ts
+++ b/packages/service/core/workflow/dispatch/dataset/concat.ts
@@ -29,7 +29,7 @@ export async function dispatchDatasetConcat(

  const rrfConcatResults = datasetSearchResultConcat(
    quoteList.map((list) => ({
-      k: 60,
+      weight: 1,
      list
    }))
  );