Versions (relevant - OpenSearch/Dashboard/Server OS/Browser): 2.13
Describe the issue:
I have an OpenSearch index named my_html with the following mapping:
{
  "my_html": {
    "mappings": {
      "properties": {
        "content": {
          "type": "text",
          "fields": {
            "ngram": {
              "type": "text",
              "analyzer": "default_my_ngram"
            }
          },
          "analyzer": "html_content"
        },
        "crawl_completed": {
          "type": "date",
          "format": "date_optional_time"
        },
        "index_name": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            },
            "ngram": {
              "type": "text",
              "analyzer": "default_my_ngram"
            }
          },
          "analyzer": "my_normalized_keyword"
        },
        "minimized_byte_size": {
          "type": "integer"
        },
        "original_byte_size": {
          "type": "integer"
        },
        "url": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            },
            "ngram": {
              "type": "text",
              "analyzer": "default_my_ngram"
            }
          },
          "analyzer": "my_normalized_keyword"
        },
        "url_submitted": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            },
            "ngram": {
              "type": "text",
              "analyzer": "default_my_ngram"
            }
          },
          "analyzer": "my_normalized_keyword"
        },
        "url_submitted_hash": {
          "type": "keyword",
          "store": true
        }
      }
    }
  }
}
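In case the custom ngram analyzer matters here, this is how I can inspect what the content.ngram sub-field produces with the _analyze API (the sample text is just an example, not real document content):

GET my_html/_analyze
{
  "field": "content.ngram",
  "text": "Hello"
}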
I am sending the following query to the cluster:
{
  "size": 10000,
  "_source": false,
  "stored_fields": "url_submitted_hash",
  "query": {
    "bool": {
      "filter": {
        "bool": {
          "should": [
            {
              "match_phrase": {
                "content.ngram": "Hello"
              }
            },
            {
              "match_phrase": {
                "content.ngram": "World"
              }
            },
            {
              "match_phrase": {
                "content.ngram": "Goodbye"
              }
            },
            {
              "match_phrase": {
                "content.ngram": "Story"
              }
            }
          ],
          "minimum_should_match": 1
        }
      }
    }
  }
}
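For reference, I'm sending that body to the standard _search endpoint, roughly like this (host, port, and credentials are placeholders; query.json holds the body above):

curl -s -X POST "https://localhost:9200/my_html/_search" \
  -H "Content-Type: application/json" \
  -d @query.json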
While I am getting some results, I am also getting the following failures in the response:
{
  "took": 3981,
  "timed_out": false,
  "_shards": {
    "total": 300,
    "successful": 298,
    "skipped": 0,
    "failed": 2,
    "failures": [
      {
        "shard": 0,
        "index": "my_html",
        "node": "AOz4jZjPTP6aCKoCzCLK4g",
        "reason": {
          "type": "rejected_execution_exception",
          "reason": "cancelled task with reason: heap usage exceeded [634mb >= 38.4mb]"
        }
      },
      {
        "shard": 0,
        "index": "my_html",
        "node": "AOz4jZjPTP6aCKoCzCLK4g",
        "reason": {
          "type": "rejected_execution_exception",
          "reason": "cancelled task with reason: heap usage exceeded [645.7mb >= 38.4mb]"
        }
      }
    ]
  }
. . .
This error appears randomly, so apparently the query isn't always using the same amount of heap? Here are the jvm stats of that node:
"jvm": {
"timestamp": 1725384222017,
"uptime_in_millis": 1119230629,
"mem": {
"heap_used_in_bytes": 2353246688,
"heap_used_percent": 28,
"heap_committed_in_bytes": 8187281408,
"heap_max_in_bytes": 8187281408,
"non_heap_used_in_bytes": 508853184,
"non_heap_committed_in_bytes": 516882432,
"pools": {
"young": {
"used_in_bytes": 1707081728,
"max_in_bytes": 0,
"peak_used_in_bytes": 4907335680,
"peak_max_in_bytes": 0,
"last_gc_stats": {
"used_in_bytes": 0,
"max_in_bytes": 0,
"usage_percent": -1
}
},
"old": {
"used_in_bytes": 633675264,
"max_in_bytes": 8187281408,
"peak_used_in_bytes": 6907823104,
"peak_max_in_bytes": 8187281408,
"last_gc_stats": {
"used_in_bytes": 608272384,
"max_in_bytes": 8187281408,
"usage_percent": 7
}
},
"survivor": {
"used_in_bytes": 12489696,
"max_in_bytes": 0,
"peak_used_in_bytes": 616562688,
"peak_max_in_bytes": 0,
"last_gc_stats": {
"used_in_bytes": 12489696,
"max_in_bytes": 0,
"usage_percent": -1
}
}
}
},
"threads": {
"count": 278,
"peak_count": 278
},
"gc": {
"collectors": {
"young": {
"collection_count": 12144,
"collection_time_in_millis": 1008431
},
"old": {
"collection_count": 0,
"collection_time_in_millis": 0
}
}
},
"buffer_pools": {
"mapped": {
"count": 4716,
"used_in_bytes": 58081248997,
"total_capacity_in_bytes": 58081248997
},
"direct": {
"count": 284,
"used_in_bytes": 6439270,
"total_capacity_in_bytes": 6439268
},
"mapped - 'non-volatile memory'": {
"count": 0,
"used_in_bytes": 0,
"total_capacity_in_bytes": 0
}
},
"classes": {
"current_loaded_count": 54918,
"total_loaded_count": 55469,
"total_unloaded_count": 551
}
}
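For reference, that jvm block is what the node stats API returns for the node listed in the failures, e.g.:

GET _nodes/AOz4jZjPTP6aCKoCzCLK4g/stats/jvm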
Based on this info, I wouldn't expect to hit a heap limit: the data node has an ~8 GB heap and is only at 28% heap usage, yet the failure reports a limit of just 38.4mb. Any suggestions on where to look?
Configuration:
Relevant Logs or Screenshots: