OpenSearch Version: 2.8.0
OpenSearch cluster using 100% CPU intermittently, it’s happens once in a day all of sudden. This issue started after upgrading the cluster from 2.4 to 2.8.0.
Thanks in advance
Configuration:
Using Managed Aiven cluster:
path: /my-index-name/_search, params: {index=my-index-name}
org.opensearch.action.search.SearchPhaseExecutionException: all shards failed|
at org.opensearch.action.search.AbstractSearchAsyncAction.onPhaseFailure
(AbstractSearchAsyncAction.java:664) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.AbstractSearchAsyncAction.executeNextPhase
(AbstractSearchAsyncAction.java:372) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.AbstractSearchAsyncAction.onPhaseDone
(AbstractSearchAsyncAction.java:699) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.AbstractSearchAsyncAction.onShardFailure
(AbstractSearchAsyncAction.java:472) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.AbstractSearchAsyncAction$1.onFailure
(AbstractSearchAsyncAction.java:294) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.SearchExecutionStatsCollector.onFailure
(SearchExecutionStatsCollector.java:104) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.ActionListenerResponseHandler.handleException
(ActionListenerResponseHandler.java:74) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.SearchTransportService$ConnectionCountingHandler.
handleException(SearchTransportService.java:755) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.transport.TransportService.sendRequest(TransportService.java:892)
[opensearch-2.8.0.jar:2.8.0]| at
org.opensearch.transport.TransportService.sendChildRequest
(TransportService.java:958) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.transport.TransportService.sendChildRequest
(TransportService.java:946) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.SearchTransportService.sendExecuteQuery
(SearchTransportService.java:248) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.SearchQueryThenFetchAsyncAction.executePhaseOnShard
(SearchQueryThenFetchAsyncAction.java:133) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.AbstractSearchAsyncAction.
lambda$performPhaseOnShard$3(AbstractSearchAsyncAction.java:281)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.AbstractSearchAsyncAction.
performPhaseOnShard(AbstractSearchAsyncAction.java:324) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.AbstractSearchAsyncAction.onShardFailure(AbstractSearchAsyncAction.java:483)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.
AbstractSearchAsyncAction$1.onFailure(AbstractSearchAsyncAction.java:294)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.SearchExecutionStatsCollector.
onFailure(SearchExecutionStatsCollector.java:104) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.ActionListenerResponseHandler.handleException(ActionListenerResponseHandler.java:74)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.SearchTransportService$ConnectionCountingHandler.
handleException(SearchTransportService.java:755) [opensearch-2.8.0.jar:2.8.0]| at
org.opensearch.transport.TransportService.sendRequest(TransportService.java:892)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.transport.TransportService.sendChildRequest(TransportService.java:958)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.transport.TransportService.sendChildRequest(TransportService.java:946)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.SearchTransportService.sendExecuteQuery
(SearchTransportService.java:248) [opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.
SearchQueryThenFetchAsyncAction.executePhaseOnShard(SearchQueryThenFetchAsyncAction.java:133)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.AbstractSearchAsyncAction.
lambda$performPhaseOnShard$3(AbstractSearchAsyncAction.java:281) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.AbstractSearchAsyncAction.performPhaseOnShard(AbstractSearchAsyncAction.java:324)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.AbstractSearchAsyncAction.onShardFailure(AbstractSearchAsyncAction.java:483)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.search.AbstractSearchAsyncAction$1.
onFailure(AbstractSearchAsyncAction.java:294) [opensearch-2.8.0.jar:2.8.0]| at
org.opensearch.action.search.SearchExecutionStatsCollector.onFailure(SearchExecutionStatsCollector.java:104)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.ActionListenerResponseHandler.
handleException(ActionListenerResponseHandler.java:74) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.search.SearchTransportService$ConnectionCountingHandler.
handleException(SearchTransportService.java:755) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.transport.TransportService$6.handleException(TransportService.java:862)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.transport.TransportService$ContextRestoreResponseHandler.
handleException(TransportService.java:1482) [opensearch-2.8.0.jar:2.8.0]| at org.opensearch.
transport.TransportService$DirectResponseChannel.processException(TransportService.java:1596)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.transport.TransportService$DirectResponseChannel
.sendResponse(TransportService.java:1570) [opensearch-2.8.0.jar:2.8.0]| at
org.opensearch.transport.TaskTransportChannel.sendResponse(TaskTransportChannel.java:79)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.transport.TransportChannel.sendErrorResponse
(TransportChannel.java:71) [opensearch-2.8.0.jar:2.8.0]| at org.opensearch.action.support.
ChannelActionListener.onFailure(ChannelActionListener.java:70) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.action.ActionRunnable.onFailure(ActionRunnable.java:103) [opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:54)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.threadpool.TaskAwareRunnable.doRun(TaskAwareRunnable.java:78)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:59)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.common.util.concurrent.
ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:806)
[opensearch-2.8.0.jar:2.8.0]| at org.opensearch.common.util.concurrent.AbstractRunnable.run
(AbstractRunnable.java:52) [opensearch-2.8.0.jar:2.8.0]| at java.util.concurrent.
ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) [?:?]| at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
[?:?]| at java.lang.Thread.run(Thread.java:833) [?:?]|Caused by: org.opensearch.tasks.TaskCancelledException:
The parent task was cancelled, shouldn't start any child tasks| at org.opensearch.tasks.
TaskManager$CancellableTaskHolder.registerChildNode(TaskManager.java:632) ~[opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.tasks.TaskManager.registerChildNode(TaskManager.java:311) ~[opensearch-2.8.0.jar:2.8.0]|
at org.opensearch.transport.TransportService.sendRequest(TransportService.java:851) ~[opensearch-2.8.0.jar:2.8.0]|
Hot-thread api response during the incident:
Hot threads at 2023-08-24T14:20:41.673Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
74.4% (372.2ms out of 500ms) cpu usage by thread 'opensearch[hostname][search][T#13]'
3/10 snapshots sharing following 40 elements
app//org.apache.lucene.index.SingletonSortedNumericDocValues.nextDoc(SingletonSortedNumericDocValues.java:52)
app//org.apache.lucene.search.DisjunctionDISIApproximation.nextDoc(DisjunctionDISIApproximation.java:56)
app//org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll(Weight.java:303)
app//org.apache.lucene.search.Weight$DefaultBulkScorer.score(Weight.java:247)
app//org.apache.lucene.search.BulkScorer.score(BulkScorer.java:38)
app//org.apache.lucene.search.LRUQueryCache.cacheIntoBitSet(LRUQueryCache.java:513)
app//org.apache.lucene.search.LRUQueryCache.cacheImpl(LRUQueryCache.java:504)
app//org.apache.lucene.search.LRUQueryCache$CachingWrapperWeight$1.get(LRUQueryCache.java:776)
app//org.apache.lucene.search.Boolean2ScorerSupplier.req(Boolean2ScorerSupplier.java:196)
app//org.apache.lucene.search.Boolean2ScorerSupplier.getInternal(Boolean2ScorerSupplier.java:137)
app//org.apache.lucene.search.Boolean2ScorerSupplier.get(Boolean2ScorerSupplier.java:96)
app//org.apache.lucene.search.BooleanWeight.scorer(BooleanWeight.java:511)
app//org.apache.lucene.search.Weight.bulkScorer(Weight.java:166)
app//org.apache.lucene.search.BooleanWeight.bulkScorer(BooleanWeight.java:395)
app//org.opensearch.search.internal.ContextIndexSearcher$1.bulkScorer(ContextIndexSearcher.java:367)
app//org.opensearch.search.internal.ContextIndexSearcher.searchLeaf(ContextIndexSearcher.java:318)
app//org.opensearch.search.internal.ContextIndexSearcher.search(ContextIndexSearcher.java:286)
app//org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:551)
app//org.opensearch.search.query.QueryPhase.searchWithCollector(QueryPhase.java:336)
app//org.opensearch.search.query.QueryPhase$DefaultQueryPhaseSearcher.searchWithCollector(QueryPhase.java:414)
app//org.opensearch.search.query.QueryPhase$DefaultQueryPhaseSearcher.searchWith(QueryPhase.java:403)
app//org.opensearch.search.query.QueryPhase.executeInternal(QueryPhase.java:262)
app//org.opensearch.search.query.QueryPhase.execute(QueryPhase.java:149)
app//org.opensearch.search.SearchService.loadOrExecuteQueryPhase(SearchService.java:526)
app//org.opensearch.search.SearchService.executeQueryPhase(SearchService.java:590)
app//org.opensearch.search.SearchService$2.lambda$onResponse$0(SearchService.java:559)
app//org.opensearch.search.SearchService$2$$Lambda$5849/0x0000000801feb668.get(Unknown Source)
app//org.opensearch.search.SearchService$$Lambda$5850/0x0000000801feb890.get(Unknown Source)
app//org.opensearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:73)
app//org.opensearch.action.ActionRunnable$$Lambda$4817/0x0000000801dddf18.accept(Unknown Source)
app//org.opensearch.action.ActionRunnable$2.doRun(ActionRunnable.java:88)
app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
app//org.opensearch.threadpool.TaskAwareRunnable.doRun(TaskAwareRunnable.java:78)
app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
app//org.opensearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:59)
app//org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:806)
app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
java.base@17.0.7/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
java.base@17.0.7/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
java.base@17.0.7/java.lang.Thread.run(Thread.java:833)
7/10 snapshots sharing following 39 elements
app//org.apache.lucene.search.DisjunctionDISIApproximation.nextDoc(DisjunctionDISIApproximation.java:56)
app//org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll(Weight.java:303)
app//org.apache.lucene.search.Weight$DefaultBulkScorer.score(Weight.java:247)
app//org.apache.lucene.search.BulkScorer.score(BulkScorer.java:38)
app//org.apache.lucene.search.LRUQueryCache.cacheIntoBitSet(LRUQueryCache.java:513)
app//org.apache.lucene.search.LRUQueryCache.cacheImpl(LRUQueryCache.java:504)
app//org.apache.lucene.search.LRUQueryCache$CachingWrapperWeight$1.get(LRUQueryCache.java:776)
app//org.apache.lucene.search.Boolean2ScorerSupplier.req(Boolean2ScorerSupplier.java:196)
app//org.apache.lucene.search.Boolean2ScorerSupplier.getInternal(Boolean2ScorerSupplier.java:137)
app//org.apache.lucene.search.Boolean2ScorerSupplier.get(Boolean2ScorerSupplier.java:96)
app//org.apache.lucene.search.BooleanWeight.scorer(BooleanWeight.java:511)
app//org.apache.lucene.search.Weight.bulkScorer(Weight.java:166)
app//org.apache.lucene.search.BooleanWeight.bulkScorer(BooleanWeight.java:395)
app//org.opensearch.search.internal.ContextIndexSearcher$1.bulkScorer(ContextIndexSearcher.java:367)
app//org.opensearch.search.internal.ContextIndexSearcher.searchLeaf(ContextIndexSearcher.java:318)
app//org.opensearch.search.internal.ContextIndexSearcher.search(ContextIndexSearcher.java:286)
app//org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:551)
app//org.opensearch.search.query.QueryPhase.searchWithCollector(QueryPhase.java:336)
app//org.opensearch.search.query.QueryPhase$DefaultQueryPhaseSearcher.searchWithCollector(QueryPhase.java:414)
app//org.opensearch.search.query.QueryPhase$DefaultQueryPhaseSearcher.searchWith(QueryPhase.java:403)
app//org.opensearch.search.query.QueryPhase.executeInternal(QueryPhase.java:262)
app//org.opensearch.search.query.QueryPhase.execute(QueryPhase.java:149)
app//org.opensearch.search.SearchService.loadOrExecuteQueryPhase(SearchService.java:526)
app//org.opensearch.search.SearchService.executeQueryPhase(SearchService.java:590)
app//org.opensearch.search.SearchService$2.lambda$onResponse$0(SearchService.java:559)
app//org.opensearch.search.SearchService$2$$Lambda$5849/0x0000000801feb668.get(Unknown Source)
app//org.opensearch.search.SearchService$$Lambda$5850/0x0000000801feb890.get(Unknown Source)
app//org.opensearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:73)
app//org.opensearch.action.ActionRunnable$$Lambda$4817/0x0000000801dddf18.accept(Unknown Source)
app//org.opensearch.action.ActionRunnable$2.doRun(ActionRunnable.java:88)
app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
app//org.opensearch.threadpool.TaskAwareRunnable.doRun(TaskAwareRunnable.java:78)
app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
app//org.opensearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:59)
app//org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:806)
app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
java.base@17.0.7/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
java.base@17.0.7/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
java.base@17.0.7/java.lang.Thread.run(Thread.java:833)
But cannot figure it out which query leads to this issue
CPU Usage: