Hybrid search returning duplicate docs

dswitzer2 · May 6, 2024, 2:22pm

So it turns out my problem was caused by not specifying a normalization-processor in my search. If I alter my query to include a search pipeline, such as the following, it fixes the problem:

			, "search_pipeline" : {
				"phase_results_processors": [
					{
						"normalization-processor": {
							"normalization": {
								"technique": "min_max"
							}
							, "combination": {
									"technique": "arithmetic_mean"
								, "parameters": {
									"weights": [0.3, 0.7]
								}
							}
							, "ignore_failure": false
						}
					}
				]
			}

So if I change my DSL to the following, I no longer see the duplicate results:

{
  "query": {
    "hybrid": {
      "queries": [
        {
          "bool": {
            "must": [
              {
                "function_score": {
                  "query": {
                    "bool": {
                      "should": [
                        {
                          "simple_query_string": {
                            "query": "my search phrase",
                            "fields": ["keywords^4", "title^2", "article.basic", "article.intermediate", "article.advanced"],
                            "quote_field_suffix": "exact",
                            "default_operator": "OR",
                            "minimum_should_match": "2<75%"
                          }
                        },
                        {
                          "nested": {
                            "path": "attachments",
                            "query": {
                              "simple_query_string": {
                                "query": "my search phrase",
                                "fields": [
                                  "attachments.attachment.content"
                                ],
                                "quote_field_suffix": "exact",
                                "default_operator": "OR",
                                "minimum_should_match": "2<75%"
                              }
                            },
                            "inner_hits": {
                              "_source": false,
                              "highlight": {
                                "type": "unified",
                                "fragmenter": "span",
                                "number_of_fragments": 1,
                                "encoder": "html",
                                "pre_tags": [
                                  "<span class=\"search-results-highlight\">"
                                ],
                                "post_tags": [
                                  "</span>"
                                ],
                                "fragment_size": 200,
                                "no_match_size": 0,
                                "order": "score",
                                "fields": {
                                  "attachments.*": {}
                                }
                              }
                            }
                          }
                        }
                      ]
                    }
                  },
                  "functions": [
                    {
                      "field_value_factor": {
                        "field": "rating",
                        "factor": 0.25,
                        "modifier": "log1p",
                        "missing": 0
                      }
                    },
                    {
                      "field_value_factor": {
                        "field": "usedcount",
                        "factor": 0.375,
                        "modifier": "log1p",
                        "missing": 0
                      }
                    },
                    {
                      "gauss": {
                        "modifieddate": {
                          "origin": "now",
                          "scale": "30d",
                          "offset": "5d",
                          "decay": 0.5
                        }
                      }
                    }
                  ],
                  "boost_mode": "sum",
                  "score_mode": "sum"
                }
              }
            ],
            "filter": [
              {
                "bool": {
                  "must": [
                    {
                      "bool": {
                        "should": [
                          {
                            "terms": {
                              "statusid": [
                                3
                              ]
                            }
                          },
                          {
                            "bool": {
                              "must": [
                                {
                                  "term": {
                                    "ishot": true
                                  }
                                },
                                {
                                  "terms": {
                                    "statusid": [
                                      1,
                                      2
                                    ]
                                  }
                                }
                              ]
                            }
                          }
                        ]
                      }
                    },
                    {
                      "terms": {
                        "servicedeskid": [
                          1
                        ]
                      }
                    },
                    {
                      "terms": {
                        "securitylevelid": [
                          -1,
                          6,
                          7,
                          8
                        ]
                      }
                    }
                  ]
                }
              }
            ]
          }
        },
        {
          "bool": {
            "must": [
              {
                "function_score": {
                  "query": {
                    "bool": {
                      "should": [
                        {
                          "neural_sparse": {
                            "embeddings.title": {
                              "query_text": "my search phrase",
                              "model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
                            }
                          }
                        },
                        {
                          "neural_sparse": {
                            "embeddings.article.basic": {
                              "query_text": "my search phrase",
                              "model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
                            }
                          }
                        },
                        {
                          "neural_sparse": {
                            "embeddings.article.intermediate": {
                              "query_text": "my search phrase",
                              "model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
                            }
                          }
                        },
                        {
                          "neural_sparse": {
                            "embeddings.article.advanced": {
                              "query_text": "my search phrase",
                              "model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
                            }
                          }
                        }
                      ]
                    }
                  },
                  "functions": [
                    {
                      "field_value_factor": {
                        "field": "rating",
                        "factor": 0.25,
                        "modifier": "log1p",
                        "missing": 0
                      }
                    },
                    {
                      "field_value_factor": {
                        "field": "usedcount",
                        "factor": 0.375,
                        "modifier": "log1p",
                        "missing": 0
                      }
                    },
                    {
                      "gauss": {
                        "modifieddate": {
                          "origin": "now",
                          "scale": "30d",
                          "offset": "5d",
                          "decay": 0.5
                        }
                      }
                    }
                  ],
                  "boost_mode": "sum",
                  "score_mode": "sum"
                }
              }
            ],
            "filter": [
              {
                "bool": {
                  "must": [
                    {
                      "bool": {
                        "should": [
                          {
                            "terms": {
                              "statusid": [
                                3
                              ]
                            }
                          },
                          {
                            "bool": {
                              "must": [
                                {
                                  "term": {
                                    "ishot": true
                                  }
                                },
                                {
                                  "terms": {
                                    "statusid": [
                                      1,
                                      2
                                    ]
                                  }
                                }
                              ]
                            }
                          }
                        ]
                      }
                    },
                    {
                      "terms": {
                        "servicedeskid": [
                          1
                        ]
                      }
                    },
                    {
                      "terms": {
                        "securitylevelid": [
                          -1,
                          6,
                          7,
                          8
                        ]
                      }
                    }
                  ]
                }
              }
            ]
          }
        }
      ]
    }
  },
  "search_pipeline" : {
    "phase_results_processors": [
      {
        "normalization-processor": {
          "normalization": {
            "technique": "min_max"
          }
          , "combination": {
              "technique": "arithmetic_mean"
            , "parameters": {
              "weights": [0.3, 0.7]
            }
          }
          , "ignore_failure": false
        }
      }
    ]
  },
  "from": 0,
  "size": 50,
  "explain": false,
  "highlight": {
    "type": "unified",
    "fragmenter": "span",
    "number_of_fragments": 1,
    "encoder": "html",
    "pre_tags": [
      "<span class=\"search-results-highlight\">"
    ],
    "post_tags": [
      "</span>"
    ],
    "fragment_size": 200,
    "no_match_size": 0,
    "order": "score",
    "fields": {
      "article.basic": {
        "no_match_size": 200
      },
      "article.intermediate": {},
      "article.advanced": {},
      "attachments.*": {}
    }
  },
  "sort": [
    {
      "_score": "desc"
    },
    {
      "title.sort": "asc"
    }
  ]
}

Now, obviously you can create a persistent search pipeline instead, but for this use case I’m just defining a temporary pipeline inline to show the fix.

Topic	Replies	Views	Activity
Hybrid searches missing inner_hits results Machine Learning	2	261	July 7, 2024
Negative scores and duplicated results using Hybrid search OpenSearch troubleshoot	5	846	July 2, 2024
Poor search quality over semantically equivalent searches k-NN discuss	1	23	April 13, 2025
Unexpected Document Retrieval in Hybrid Search: Beyond BM25 and kNN OpenSearch	0	23	February 14, 2025
Hybrid Search Normalization for Nested Queries OpenSearch troubleshoot , configure	3	94	March 10, 2025

Hybrid search returning duplicate docs

Related topics