CPU usage intermittently spikes to 100% in OpenSearch cluster, causing all search operations to fail

OpenSearch Version: 2.8.0

The OpenSearch cluster intermittently hits 100% CPU, roughly once a day and all of a sudden. This started after upgrading the cluster from 2.4 to 2.8.0.

Thanks in advance :pray:

Configuration: managed Aiven cluster.

Stack trace from the failed search during the spike:

    path: /my-index-name/_search, params: {index=my-index-name}
    org.opensearch.action.search.SearchPhaseExecutionException: all shards failed
        at org.opensearch.action.search.AbstractSearchAsyncAction.onPhaseFailure(AbstractSearchAsyncAction.java:664) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.executeNextPhase(AbstractSearchAsyncAction.java:372) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.onPhaseDone(AbstractSearchAsyncAction.java:699) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.onShardFailure(AbstractSearchAsyncAction.java:472) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction$1.onFailure(AbstractSearchAsyncAction.java:294) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchExecutionStatsCollector.onFailure(SearchExecutionStatsCollector.java:104) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.ActionListenerResponseHandler.handleException(ActionListenerResponseHandler.java:74) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchTransportService$ConnectionCountingHandler.handleException(SearchTransportService.java:755) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService.sendRequest(TransportService.java:892) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService.sendChildRequest(TransportService.java:958) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService.sendChildRequest(TransportService.java:946) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchTransportService.sendExecuteQuery(SearchTransportService.java:248) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchQueryThenFetchAsyncAction.executePhaseOnShard(SearchQueryThenFetchAsyncAction.java:133) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.lambda$performPhaseOnShard$3(AbstractSearchAsyncAction.java:281) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.performPhaseOnShard(AbstractSearchAsyncAction.java:324) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.onShardFailure(AbstractSearchAsyncAction.java:483) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction$1.onFailure(AbstractSearchAsyncAction.java:294) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchExecutionStatsCollector.onFailure(SearchExecutionStatsCollector.java:104) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.ActionListenerResponseHandler.handleException(ActionListenerResponseHandler.java:74) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchTransportService$ConnectionCountingHandler.handleException(SearchTransportService.java:755) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService.sendRequest(TransportService.java:892) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService.sendChildRequest(TransportService.java:958) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService.sendChildRequest(TransportService.java:946) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchTransportService.sendExecuteQuery(SearchTransportService.java:248) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchQueryThenFetchAsyncAction.executePhaseOnShard(SearchQueryThenFetchAsyncAction.java:133) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.lambda$performPhaseOnShard$3(AbstractSearchAsyncAction.java:281) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.performPhaseOnShard(AbstractSearchAsyncAction.java:324) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction.onShardFailure(AbstractSearchAsyncAction.java:483) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.AbstractSearchAsyncAction$1.onFailure(AbstractSearchAsyncAction.java:294) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchExecutionStatsCollector.onFailure(SearchExecutionStatsCollector.java:104) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.ActionListenerResponseHandler.handleException(ActionListenerResponseHandler.java:74) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.search.SearchTransportService$ConnectionCountingHandler.handleException(SearchTransportService.java:755) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService$6.handleException(TransportService.java:862) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService$ContextRestoreResponseHandler.handleException(TransportService.java:1482) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService$DirectResponseChannel.processException(TransportService.java:1596) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService$DirectResponseChannel.sendResponse(TransportService.java:1570) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TaskTransportChannel.sendResponse(TaskTransportChannel.java:79) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportChannel.sendErrorResponse(TransportChannel.java:71) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.support.ChannelActionListener.onFailure(ChannelActionListener.java:70) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.action.ActionRunnable.onFailure(ActionRunnable.java:103) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:54) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.threadpool.TaskAwareRunnable.doRun(TaskAwareRunnable.java:78) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:59) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:806) [opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) [opensearch-2.8.0.jar:2.8.0]
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) [?:?]
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) [?:?]
        at java.lang.Thread.run(Thread.java:833) [?:?]
    Caused by: org.opensearch.tasks.TaskCancelledException: The parent task was cancelled, shouldn't start any child tasks
        at org.opensearch.tasks.TaskManager$CancellableTaskHolder.registerChildNode(TaskManager.java:632) ~[opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.tasks.TaskManager.registerChildNode(TaskManager.java:311) ~[opensearch-2.8.0.jar:2.8.0]
        at org.opensearch.transport.TransportService.sendRequest(TransportService.java:851) ~[opensearch-2.8.0.jar:2.8.0]

Hot threads API response during the incident:

   Hot threads at 2023-08-24T14:20:41.673Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:

   74.4% (372.2ms out of 500ms) cpu usage by thread 'opensearch[hostname][search][T#13]'
     3/10 snapshots sharing following 40 elements
       app//org.apache.lucene.index.SingletonSortedNumericDocValues.nextDoc(SingletonSortedNumericDocValues.java:52)
       app//org.apache.lucene.search.DisjunctionDISIApproximation.nextDoc(DisjunctionDISIApproximation.java:56)
       app//org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll(Weight.java:303)
       app//org.apache.lucene.search.Weight$DefaultBulkScorer.score(Weight.java:247)
       app//org.apache.lucene.search.BulkScorer.score(BulkScorer.java:38)
       app//org.apache.lucene.search.LRUQueryCache.cacheIntoBitSet(LRUQueryCache.java:513)
       app//org.apache.lucene.search.LRUQueryCache.cacheImpl(LRUQueryCache.java:504)
       app//org.apache.lucene.search.LRUQueryCache$CachingWrapperWeight$1.get(LRUQueryCache.java:776)
       app//org.apache.lucene.search.Boolean2ScorerSupplier.req(Boolean2ScorerSupplier.java:196)
       app//org.apache.lucene.search.Boolean2ScorerSupplier.getInternal(Boolean2ScorerSupplier.java:137)
       app//org.apache.lucene.search.Boolean2ScorerSupplier.get(Boolean2ScorerSupplier.java:96)
       app//org.apache.lucene.search.BooleanWeight.scorer(BooleanWeight.java:511)
       app//org.apache.lucene.search.Weight.bulkScorer(Weight.java:166)
       app//org.apache.lucene.search.BooleanWeight.bulkScorer(BooleanWeight.java:395)
       app//org.opensearch.search.internal.ContextIndexSearcher$1.bulkScorer(ContextIndexSearcher.java:367)
       app//org.opensearch.search.internal.ContextIndexSearcher.searchLeaf(ContextIndexSearcher.java:318)
       app//org.opensearch.search.internal.ContextIndexSearcher.search(ContextIndexSearcher.java:286)
       app//org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:551)
       app//org.opensearch.search.query.QueryPhase.searchWithCollector(QueryPhase.java:336)
       app//org.opensearch.search.query.QueryPhase$DefaultQueryPhaseSearcher.searchWithCollector(QueryPhase.java:414)
       app//org.opensearch.search.query.QueryPhase$DefaultQueryPhaseSearcher.searchWith(QueryPhase.java:403)
       app//org.opensearch.search.query.QueryPhase.executeInternal(QueryPhase.java:262)
       app//org.opensearch.search.query.QueryPhase.execute(QueryPhase.java:149)
       app//org.opensearch.search.SearchService.loadOrExecuteQueryPhase(SearchService.java:526)
       app//org.opensearch.search.SearchService.executeQueryPhase(SearchService.java:590)
       app//org.opensearch.search.SearchService$2.lambda$onResponse$0(SearchService.java:559)
       app//org.opensearch.search.SearchService$2$$Lambda$5849/0x0000000801feb668.get(Unknown Source)
       app//org.opensearch.search.SearchService$$Lambda$5850/0x0000000801feb890.get(Unknown Source)
       app//org.opensearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:73)
       app//org.opensearch.action.ActionRunnable$$Lambda$4817/0x0000000801dddf18.accept(Unknown Source)
       app//org.opensearch.action.ActionRunnable$2.doRun(ActionRunnable.java:88)
       app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
       app//org.opensearch.threadpool.TaskAwareRunnable.doRun(TaskAwareRunnable.java:78)
       app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
       app//org.opensearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:59)
       app//org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:806)
       app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
       java.base@17.0.7/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
       java.base@17.0.7/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
       java.base@17.0.7/java.lang.Thread.run(Thread.java:833)
     7/10 snapshots sharing following 39 elements
       app//org.apache.lucene.search.DisjunctionDISIApproximation.nextDoc(DisjunctionDISIApproximation.java:56)
       app//org.apache.lucene.search.Weight$DefaultBulkScorer.scoreAll(Weight.java:303)
       app//org.apache.lucene.search.Weight$DefaultBulkScorer.score(Weight.java:247)
       app//org.apache.lucene.search.BulkScorer.score(BulkScorer.java:38)
       app//org.apache.lucene.search.LRUQueryCache.cacheIntoBitSet(LRUQueryCache.java:513)
       app//org.apache.lucene.search.LRUQueryCache.cacheImpl(LRUQueryCache.java:504)
       app//org.apache.lucene.search.LRUQueryCache$CachingWrapperWeight$1.get(LRUQueryCache.java:776)
       app//org.apache.lucene.search.Boolean2ScorerSupplier.req(Boolean2ScorerSupplier.java:196)
       app//org.apache.lucene.search.Boolean2ScorerSupplier.getInternal(Boolean2ScorerSupplier.java:137)
       app//org.apache.lucene.search.Boolean2ScorerSupplier.get(Boolean2ScorerSupplier.java:96)
       app//org.apache.lucene.search.BooleanWeight.scorer(BooleanWeight.java:511)
       app//org.apache.lucene.search.Weight.bulkScorer(Weight.java:166)
       app//org.apache.lucene.search.BooleanWeight.bulkScorer(BooleanWeight.java:395)
       app//org.opensearch.search.internal.ContextIndexSearcher$1.bulkScorer(ContextIndexSearcher.java:367)
       app//org.opensearch.search.internal.ContextIndexSearcher.searchLeaf(ContextIndexSearcher.java:318)
       app//org.opensearch.search.internal.ContextIndexSearcher.search(ContextIndexSearcher.java:286)
       app//org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:551)
       app//org.opensearch.search.query.QueryPhase.searchWithCollector(QueryPhase.java:336)
       app//org.opensearch.search.query.QueryPhase$DefaultQueryPhaseSearcher.searchWithCollector(QueryPhase.java:414)
       app//org.opensearch.search.query.QueryPhase$DefaultQueryPhaseSearcher.searchWith(QueryPhase.java:403)
       app//org.opensearch.search.query.QueryPhase.executeInternal(QueryPhase.java:262)
       app//org.opensearch.search.query.QueryPhase.execute(QueryPhase.java:149)
       app//org.opensearch.search.SearchService.loadOrExecuteQueryPhase(SearchService.java:526)
       app//org.opensearch.search.SearchService.executeQueryPhase(SearchService.java:590)
       app//org.opensearch.search.SearchService$2.lambda$onResponse$0(SearchService.java:559)
       app//org.opensearch.search.SearchService$2$$Lambda$5849/0x0000000801feb668.get(Unknown Source)
       app//org.opensearch.search.SearchService$$Lambda$5850/0x0000000801feb890.get(Unknown Source)
       app//org.opensearch.action.ActionRunnable.lambda$supply$0(ActionRunnable.java:73)
       app//org.opensearch.action.ActionRunnable$$Lambda$4817/0x0000000801dddf18.accept(Unknown Source)
       app//org.opensearch.action.ActionRunnable$2.doRun(ActionRunnable.java:88)
       app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
       app//org.opensearch.threadpool.TaskAwareRunnable.doRun(TaskAwareRunnable.java:78)
       app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
       app//org.opensearch.common.util.concurrent.TimedRunnable.doRun(TimedRunnable.java:59)
       app//org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:806)
       app//org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52)
       java.base@17.0.7/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
       java.base@17.0.7/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
       java.base@17.0.7/java.lang.Thread.run(Thread.java:833)
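
(For reference, output like the above comes from the nodes hot threads API; a request matching the parameters shown in the header would look roughly like this:)

    GET /_nodes/hot_threads?threads=3&interval=500ms&ignore_idle_threads=true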

But I cannot figure out which query leads to this issue :thinking:
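
If it helps, I can try to capture the offending query with the search slow log and the tasks API. A rough sketch (the index name and thresholds below are placeholders, not what I currently have configured):

    # enable the search slow log on the suspect index so the query source gets logged
    PUT /my-index-name/_settings
    {
      "index.search.slowlog.threshold.query.warn": "5s",
      "index.search.slowlog.threshold.query.info": "2s"
    }

    # list in-flight search tasks with their descriptions while the CPU is pegged
    GET /_tasks?actions=*search*&detailed=true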

CPU usage: screenshot from 2023-08-28 at 19:37:57 (attached).
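
(Node-level CPU can also be sampled from the cluster itself to correlate with the graph above; these are standard _cat/nodes columns:)

    GET /_cat/nodes?v&h=name,cpu,load_1m,load_5m,load_15m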

Interestingly enough, I am doing a similar investigation on one of my clusters, where the whole cluster hits 100% CPU (mostly the data nodes) for a decent stretch of time.

I have been testing over the last few days with the Logs - OpenSearch documentation, but I'm coming up short: it's a little more difficult to correlate what's happening across this cluster, and there is no easy way to ingest the logs (working on it).

I’ll add anything I come up with for further debugging here. Hopefully it will help.

In case it is useful for anyone, I wrote a quick Logstash config that parses the logs for the task details:

input {
  file {
    path => ["/var/log/opensearch/*_task_detailslog.log"]
    type => "opensearch-debug-log"
    tags => ["opensearch"]
  }
}

filter {
  if [type] == "opensearch-debug-log" {

    # pull the task fields (taskId, action, shard, timings, resource stats, query source)
    # out of each task details log line
    grok {
      match => { "message" =>
        "\[(?<timestamp>%{YEAR}-%{MONTHNUM2}-%{MONTHDAY}T%{TIME})\]\[%{DATA:logLevel}\]\[%{DATA:thread}\] \[%{DATA:hostname}\] taskId:\[%{DATA:taskId}\], type:\[%{DATA:thread_type}\], action:\[%{DATA:action}\], description:\[shardId\[\[%{DATA:index_name}\]\[%{NUMBER:shardId}\]\]\], start_time_millis:\[%{NUMBER:start_time}\], resource_stats:\[\{?(%{GREEDYDATA:resource_stats})\}\], metadata:\[source\[%{GREEDYDATA:query_details}\]\]"
      }
    }

    # break the resource_stats blob into individual numeric fields
    if [resource_stats] {
      grok {
        match => { "resource_stats" =>
          "%{NUMBER}=\[\{cpu_time_in_nanos=%{NUMBER:cpu_time_in_nanos}, memory_in_bytes=%{NUMBER:memory_in_bytes}\}, stats_type=%{DATA:stats_type}, is_active=%{DATA:is_active}, threadId=%{NUMBER:threadId}\]"
        }
      }
    }

    # tag the event and drop the raw fields we no longer need
    mutate {
      add_field => { "logtype" => "opensearch-debug-log" }
      remove_field => ["message", "resource_stats"]
    }
  }
}

output {
    stdout { codec => "rubydebug" } # change to whatever output you need to ingest
}
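
To run it locally (the config filename here is just whatever you saved the above as):

    bin/logstash -f opensearch-task-details.conf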

I had to use grok to parse the .log file instead of the .json file: because the query details are included in each entry, Logstash's json codec has trouble parsing those logs.

YMMV