Hybrid Search Normalization for Nested Queries

Versions (relevant - OpenSearch/Dashboard/Server OS/Browser):

2.19.0

Describe the issue:

I have a dataset where I want to test different weightings of hybrid vector/keyword search for a nested field, but the search pipelines are not working as I would have expected. It seems like the scores for the keyword + KNN search are being directly added, rather than being normalized + averaged. I tried using the new hybrid search explanation processor but was still unable to debug my issue.

My query:

POST knowledge_base/_search?search_pipeline=hybrid_search_explain&explain=true
{
  "size": 3,
  "_source": {
    "excludes": ["chunks"]
  },
  "query": {
    "nested": {
      "inner_hits": {
        "_source": {
          "excludes": [
            "chunks.embedding"
          ]
        }
      },
      "path": "chunks",
      "query": {
        "hybrid": {
          "queries": [
            {
              "knn": {
                "chunks.embedding": {
                  "filter": {
                    "bool": {
                      "should": {
                        "match_all": {}
                      }
                    }
                  },
                  "k": 10000,
                  "method_parameters": {
                    "ef_search": 10000
                  },
                  "vector": [1, 1, 1, 1 ...]
                }
              }
            },
            {
              "match": {
                "chunks.text": "Hello World"
              }
            }
          ]
        }
      }
    }
  }
}

A snippet from the response I got (please let me know if anything else would be useful here):

...
  "hits": {
    "total": {
      "value": 10000,
      "relation": "gte"
    },
    "max_score": 17.144974,
    "hits": [
      {
        "_score": 17.144974,
        "_explanation": {
          "value": 17.144974,
          "description": "Score based on 1 child docs in range from 55175 to 55175, using score mode Avg",
          "details": [
            {
              "value": 17.144974,
              "description": "sum of:",
              "details": [
                {
                  "value": 16.618610382080078,
                  "description": "combined score of:",
                  "details": [
                    {
                      "value": 0.52636397,
                      "description": "within top 10000",
                      "details": []
                    },
                    {
                      "value": 16.61861,
                      "description": "sum of:",
                      "details": [
                        {
                          "value": 6.8290243,
                          "description": "weight(chunks.text:request in 6897) [PerFieldSimilarity], result of:",
...

So in this case the vector search had a score of 0.526 while the keyword search had a score of 16.618, and the total score is 17.145. What I would like instead is:

  1. The keyword score to be normalized to the 0-1 range, rather than being around 16
  2. The final score to be the average of the vector/keyword scores, not the sum

Configuration:

Index mapping (some fields removed):

{
  "index_name": {
    "mappings": {
      "properties": {
        "aggregate_id": {
          "type": "keyword"
        },
        "chunks": {
          "type": "nested",
          "properties": {
            "chunk_id": {
              "type": "keyword"
            },
            "embedding": {
              "type": "knn_vector",
              "dimension": 1024,
              "method": {
                "engine": "lucene",
                "space_type": "cosinesimil",
                "name": "hnsw",
                "parameters": {
                  "ef_construction": 64,
                  "m": 16
                }
              }
            },
            "text": {
              "type": "text"
            }
          }
        }
      }
    }
  }
}

Pipeline:

  "hybrid_search_explain": {
    "description": "Post processor for hybrid search",
    "phase_results_processors": [
      {
        "normalization-processor": {
          "normalization": {
            "technique": "min_max"
          },
          "combination": {
            "technique": "arithmetic_mean"
          }
        }
      }
    ],
    "response_processors": [
      {
        "hybrid_score_explanation": {}
      }
    ]
  },