Versions (relevant - OpenSearch/Dashboard/Server OS/Browser):
2.17
Describe the issue:
The circuit breaker is tripped during queries. The error message says that the “parent” circuit breaker was tripped. According to the documentation, the limits of these breakers should not be modified; instead, the root cause should be found.
When looking at search_backpressure, the cancellation count is 0 and the mode is “monitor_only”. How does this relate to the circuit breaker?
We seem to hit this issue every 3 months or so. Is it related to the heap not being cleaned up properly? What can I check or configure to tune the system and avoid this situation?
So far our solution is to restart the cluster. Doing that every 3 months is not too problematic, but it does seem to point to a larger problem where free heap is gradually lost over time. Are there other settings to clean the heap more aggressively?
This error can also be seen in the Data Prepper logs:
2024-12-10T11:51:33,196 [raw-pipeline-processor-worker-9-thread-1] ERROR org.opensearch.dataprepper.plugins.processor.oteltracegroup.OTelTraceGroupProcessor - Search request for traceGroup failed for traceIds: due to OpenSearch exception [type=circuit_breaking_exception, reason=[parent] Data too large, data for [<http_request>] would be [1037622004/989.5mb], which is larger than the limit of [1020054732/972.7mb], real usage: [1037621232/989.5mb], new bytes reserved: [772/772b], usages [request=0/0b, fielddata=7539/7.3kb, in_flight_requests=10484692/9.9mb]]
Configuration:
GET _nodes/stats/breaker
{
"_nodes": {
"total": 2,
"successful": 1,
"failed": 1,
"failures": [
{
"type": "failed_node_exception",
"reason": "Failed node [LGr7ErNcQM2h44_Y_4WewQ]",
"node_id": "LGr7ErNcQM2h44_Y_4WewQ",
"caused_by": {
"type": "circuit_breaking_exception",
"reason": "[parent] Data too large, data for [cluster:monitor/nodes/stats[n]] would be [1031312560/983.5mb], which is larger than the limit of [1020054732/972.7mb], real usage: [1031312432/983.5mb], new bytes reserved: [128/128b], usages [request=0/0b, fielddata=14209/13.8kb, in_flight_requests=128/128b]",
"bytes_wanted": 1031312560,
"bytes_limit": 1020054732,
"durability": "PERMANENT"
}
}
]
},
"cluster_name": "opensearch-cluster",
"nodes": {
"MvQWodTNQLG2xnzsFuU9qQ": {
"timestamp": 1733830114047,
"name": "opensearch-node1",
"transport_address": "192.168.64.4:9300",
"host": "192.168.64.4",
"ip": "192.168.64.4:9300",
"roles": [
"cluster_manager",
"data",
"ingest",
"remote_cluster_client"
],
"attributes": {
"shard_indexing_pressure_enabled": "true"
},
"breakers": {
"request": {
"limit_size_in_bytes": 644245094,
"limit_size": "614.3mb",
"estimated_size_in_bytes": 0,
"estimated_size": "0b",
"overhead": 1,
"tripped": 0
},
"fielddata": {
"limit_size_in_bytes": 429496729,
"limit_size": "409.5mb",
"estimated_size_in_bytes": 7320,
"estimated_size": "7.1kb",
"overhead": 1.03,
"tripped": 0
},
"in_flight_requests": {
"limit_size_in_bytes": 1073741824,
"limit_size": "1gb",
"estimated_size_in_bytes": 5241960,
"estimated_size": "4.9mb",
"overhead": 2,
"tripped": 0
},
"parent": {
"limit_size_in_bytes": 1020054732,
"limit_size": "972.7mb",
"estimated_size_in_bytes": 1037126984,
"estimated_size": "989mb",
"overhead": 1,
"tripped": 83594
}
}
}
}
}
GET _nodes/stats/search_backpressure
{
"_nodes": {
"total": 2,
"successful": 2,
"failed": 0
},
"cluster_name": "opensearch-cluster",
"nodes": {
"LGr7ErNcQM2h44_Y_4WewQ": {
"timestamp": 1733830444618,
"name": "opensearch-node2",
"transport_address": "192.168.64.3:9300",
"host": "192.168.64.3",
"ip": "192.168.64.3:9300",
"roles": [
"cluster_manager",
"data",
"ingest",
"remote_cluster_client"
],
"attributes": {
"shard_indexing_pressure_enabled": "true"
},
"search_backpressure": {
"search_task": {
"resource_tracker_stats": {
"cpu_usage_tracker": {
"cancellation_count": 0,
"current_max_millis": 0,
"current_avg_millis": 0
},
"heap_usage_tracker": {
"cancellation_count": 0,
"current_max_bytes": 0,
"current_avg_bytes": 0,
"rolling_avg_bytes": 0
},
"elapsed_time_tracker": {
"cancellation_count": 0,
"current_max_millis": 0,
"current_avg_millis": 0
}
},
"cancellation_stats": {
"cancellation_count": 0,
"cancellation_limit_reached_count": 0
}
},
"search_shard_task": {
"resource_tracker_stats": {
"cpu_usage_tracker": {
"cancellation_count": 0,
"current_max_millis": 0,
"current_avg_millis": 0
},
"heap_usage_tracker": {
"cancellation_count": 0,
"current_max_bytes": 0,
"current_avg_bytes": 0,
"rolling_avg_bytes": 854
},
"elapsed_time_tracker": {
"cancellation_count": 0,
"current_max_millis": 0,
"current_avg_millis": 0
}
},
"cancellation_stats": {
"cancellation_count": 0,
"cancellation_limit_reached_count": 0
}
},
"mode": "monitor_only"
}
},
"MvQWodTNQLG2xnzsFuU9qQ": {
"timestamp": 1733830444618,
"name": "opensearch-node1",
"transport_address": "192.168.64.4:9300",
"host": "192.168.64.4",
"ip": "192.168.64.4:9300",
"roles": [
"cluster_manager",
"data",
"ingest",
"remote_cluster_client"
],
"attributes": {
"shard_indexing_pressure_enabled": "true"
},
"search_backpressure": {
"search_task": {
"resource_tracker_stats": {
"elapsed_time_tracker": {
"cancellation_count": 0,
"current_max_millis": 0,
"current_avg_millis": 0
},
"heap_usage_tracker": {
"cancellation_count": 0,
"current_max_bytes": 0,
"current_avg_bytes": 0,
"rolling_avg_bytes": 790112
},
"cpu_usage_tracker": {
"cancellation_count": 0,
"current_max_millis": 0,
"current_avg_millis": 0
}
},
"cancellation_stats": {
"cancellation_count": 0,
"cancellation_limit_reached_count": 0
}
},
"search_shard_task": {
"resource_tracker_stats": {
"elapsed_time_tracker": {
"cancellation_count": 0,
"current_max_millis": 0,
"current_avg_millis": 0
},
"heap_usage_tracker": {
"cancellation_count": 0,
"current_max_bytes": 0,
"current_avg_bytes": 0,
"rolling_avg_bytes": 35527
},
"cpu_usage_tracker": {
"cancellation_count": 0,
"current_max_millis": 0,
"current_avg_millis": 0
}
},
"cancellation_stats": {
"cancellation_count": 0,
"cancellation_limit_reached_count": 0
}
},
"mode": "monitor_only"
}
}
}
}
Relevant Logs or Screenshots: