Versions (relevant - OpenSearch/Dashboard/Server OS/Browser):
"version" : {
"distribution" : "opensearch",
"number" : "2.13.0",
"build_type" : "tar",
"build_hash" : "7ec678d1b7c87d6e779fdef94e33623e1f1e2647",
"build_date" : "2024-03-26T00:02:39.659767978Z",
"build_snapshot" : false,
"lucene_version" : "9.10.0",
"minimum_wire_compatibility_version" : "7.10.0",
"minimum_index_compatibility_version" : "7.0.0"
}
Describe the issue:
When using a “hybrid” search, I’m seeing duplicate documents returned (or perhaps it’s only the first result that gets duplicated). When I run each of the two queries individually (outside of the hybrid search), no duplicates are returned.
The problem appears to be with the `neural_sparse` search. If I run the hybrid search with only the `neural_sparse` search, I still get duplicates, but if I remove the hybrid search, I just get one result. What I do see is that the score for each of the duplicate documents is different, so I think there’s some kind of issue with the way it’s unifying the scores.
Here’s the query I’m using:
{
"query": {
"hybrid": {
"queries": [
{
"bool": {
"must": [
{
"function_score": {
"query": {
"bool": {
"should": [
{
"simple_query_string": {
"query": "my search phrase",
"fields": ["keywords^4", "title^2", "article.basic", "article.intermediate", "article.advanced"],
"quote_field_suffix": "exact",
"default_operator": "OR",
"minimum_should_match": "2<75%"
}
},
{
"nested": {
"path": "attachments",
"query": {
"simple_query_string": {
"query": "my search phrase",
"fields": [
"attachments.attachment.content"
],
"quote_field_suffix": "exact",
"default_operator": "OR",
"minimum_should_match": "2<75%"
}
},
"inner_hits": {
"_source": false,
"highlight": {
"type": "unified",
"fragmenter": "span",
"number_of_fragments": 1,
"encoder": "html",
"pre_tags": [
"<span class=\"search-results-highlight\">"
],
"post_tags": [
"</span>"
],
"fragment_size": 200,
"no_match_size": 0,
"order": "score",
"fields": {
"attachments.*": {}
}
}
}
}
}
]
}
},
"functions": [
{
"field_value_factor": {
"field": "rating",
"factor": 0.25,
"modifier": "log1p",
"missing": 0
}
},
{
"field_value_factor": {
"field": "usedcount",
"factor": 0.375,
"modifier": "log1p",
"missing": 0
}
},
{
"gauss": {
"modifieddate": {
"origin": "now",
"scale": "30d",
"offset": "5d",
"decay": 0.5
}
}
}
],
"boost_mode": "sum",
"score_mode": "sum"
}
}
],
"filter": [
{
"bool": {
"must": [
{
"bool": {
"should": [
{
"terms": {
"statusid": [
3
]
}
},
{
"bool": {
"must": [
{
"term": {
"ishot": true
}
},
{
"terms": {
"statusid": [
1,
2
]
}
}
]
}
}
]
}
},
{
"terms": {
"servicedeskid": [
1
]
}
},
{
"terms": {
"securitylevelid": [
-1,
6,
7,
8
]
}
}
]
}
}
]
}
},
{
"bool": {
"must": [
{
"function_score": {
"query": {
"bool": {
"should": [
{
"neural_sparse": {
"embeddings.title": {
"query_text": "my search phrase",
"model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
}
}
},
{
"neural_sparse": {
"embeddings.article.basic": {
"query_text": "my search phrase",
"model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
}
}
},
{
"neural_sparse": {
"embeddings.article.intermediate": {
"query_text": "my search phrase",
"model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
}
}
},
{
"neural_sparse": {
"embeddings.article.advanced": {
"query_text": "my search phrase",
"model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
}
}
}
]
}
},
"functions": [
{
"field_value_factor": {
"field": "rating",
"factor": 0.25,
"modifier": "log1p",
"missing": 0
}
},
{
"field_value_factor": {
"field": "usedcount",
"factor": 0.375,
"modifier": "log1p",
"missing": 0
}
},
{
"gauss": {
"modifieddate": {
"origin": "now",
"scale": "30d",
"offset": "5d",
"decay": 0.5
}
}
}
],
"boost_mode": "sum",
"score_mode": "sum"
}
}
],
"filter": [
{
"bool": {
"must": [
{
"bool": {
"should": [
{
"terms": {
"statusid": [
3
]
}
},
{
"bool": {
"must": [
{
"term": {
"ishot": true
}
},
{
"terms": {
"statusid": [
1,
2
]
}
}
]
}
}
]
}
},
{
"terms": {
"servicedeskid": [
1
]
}
},
{
"terms": {
"securitylevelid": [
-1,
6,
7,
8
]
}
}
]
}
}
]
}
}
]
}
},
"from": 0,
"size": 50,
"explain": false,
"highlight": {
"type": "unified",
"fragmenter": "span",
"number_of_fragments": 1,
"encoder": "html",
"pre_tags": [
"<span class=\"search-results-highlight\">"
],
"post_tags": [
"</span>"
],
"fragment_size": 200,
"no_match_size": 0,
"order": "score",
"fields": {
"article.basic": {
"no_match_size": 200
},
"article.intermediate": {},
"article.advanced": {},
"attachments.*": {}
}
},
"sort": [
{
"_score": "desc"
},
{
"title.sort": "asc"
}
]
}
If I remove the first entry from the hybrid `queries` array and run only the `neural_sparse` query inside the `hybrid`, I still see the duplicates. However, if I run the query without the `hybrid` wrapper, there are no duplicates:
{
"query": {
"bool": {
"must": [
{
"function_score": {
"query": {
"bool": {
"should": [
{
"neural_sparse": {
"embeddings.title": {
"query_text": "my search phrase",
"model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
}
}
},
{
"neural_sparse": {
"embeddings.article.basic": {
"query_text": "my search phrase",
"model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
}
}
},
{
"neural_sparse": {
"embeddings.article.intermediate": {
"query_text": "my search phrase",
"model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
}
}
},
{
"neural_sparse": {
"embeddings.article.advanced": {
"query_text": "my search phrase",
"model_id": "Zjo6tY4Bo7jGFiU7IP8Q"
}
}
}
]
}
},
"functions": [
{
"field_value_factor": {
"field": "rating",
"factor": 0.25,
"modifier": "log1p",
"missing": 0
}
},
{
"field_value_factor": {
"field": "usedcount",
"factor": 0.375,
"modifier": "log1p",
"missing": 0
}
},
{
"gauss": {
"modifieddate": {
"origin": "now",
"scale": "30d",
"offset": "5d",
"decay": 0.5
}
}
}
],
"boost_mode": "sum",
"score_mode": "sum"
}
}
],
"filter": [
{
"bool": {
"must": [
{
"bool": {
"should": [
{
"terms": {
"statusid": [
3
]
}
},
{
"bool": {
"must": [
{
"term": {
"ishot": true
}
},
{
"terms": {
"statusid": [
1,
2
]
}
}
]
}
}
]
}
},
{
"terms": {
"servicedeskid": [
1
]
}
},
{
"terms": {
"securitylevelid": [
-1,
6,
7,
8
]
}
}
]
}
}
]
}
},
"from": 0,
"size": 50,
"explain": false,
"highlight": {
"type": "unified",
"fragmenter": "span",
"number_of_fragments": 1,
"encoder": "html",
"pre_tags": [
"<span class=\"search-results-highlight\">"
],
"post_tags": [
"</span>"
],
"fragment_size": 200,
"no_match_size": 0,
"order": "score",
"fields": {
"article.basic": {
"no_match_size": 200
},
"article.intermediate": {},
"article.advanced": {},
"attachments.*": {}
}
},
"sort": [
{
"_score": "desc"
},
{
"title.sort": "asc"
}
]
}
Is this expected or am I doing something wrong?