Hey everyone,
I am having issues with traces not showing up in the OpenSearch Dashboards Observability plugin, even though they appear to be ingested into the trace index. See below for details:
Versions (relevant - OpenSearch/Dashboard/Server OS/Browser):
OpenSearch 2.17 (Amazon OpenSearch Service)
Browser:
Chrome Version 133.0.6943.54 (Official Build) (arm64)
Safari Version 18.2
Data Prepper: 2.9
Describe the issue:
I have set up the OpenTelemetry Collector and Data Prepper to ingest traces into OpenSearch. Data Prepper has created the otel-v1-apm-span-000001 and otel-v1-apm-service-map indexes, and traces are arriving in otel-v1-apm-span-000001, but I don't see any traces or services in the plugin.
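For context, the Collector exports the traces to Data Prepper with the standard OTLP exporter pointing at the otel_trace_source gRPC port. This is only a simplified sketch of that part of the config: the endpoint hostname is a placeholder, the port is the otel_trace_source default, and TLS is disabled to match ssl: false in the pipeline below.

receivers:
  otlp:
    protocols:
      grpc:

exporters:
  otlp/data-prepper:
    # Data Prepper's otel_trace_source listens on gRPC port 21890 by default
    endpoint: data-prepper:21890
    tls:
      insecure: true

service:
  pipelines:
    traces:
      receivers: [otlp]
      exporters: [otlp/data-prepper]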
In my browser's developer tools I can see that the query for the Traces page fails. This is the request:
{"size":0,"query":{"bool":{"must":[{"range":{"startTime":{"gte":"now-5m","lte":"now"}}},{"range":{"startTime":{"gte":"now-5m","lte":"now"}}}],"should":[],"must_not":[],"filter":[{"terms":{"serviceName":["redacted-service1","redacted-service2","redacted-service3","redacted-service4","redacted-service5"]}},{"bool":{"should":[{"bool":{"filter":[{"bool":{"must_not":{"term":{"parentSpanId":{"value":""}}}}},{"terms":{"name":[]}}]}},{"bool":{"must":{"term":{"parentSpanId":{"value":""}}}}}],"adjust_pure_negative":true,"boost":1}}]}},"aggregations":{"service_name":{"terms":{"field":"serviceName","size":500,"min_doc_count":1,"shard_min_doc_count":0,"show_term_doc_count_error":false,"order":[{"_count":"desc"},{"_key":"asc"}]},"aggregations":{"average_latency_nanos":{"avg":{"field":"durationInNanos"}},"average_latency":{"bucket_script":{"buckets_path":{"count":"_count","latency":"average_latency_nanos.value"},"script":"Math.round(params.latency / 10000) / 100.0"}},"error_count":{"filter":{"term":{"status.code":"2"}}},"error_rate":{"bucket_script":{"buckets_path":{"total":"_count","errors":"error_count._count"},"script":"params.errors / params.total * 100"}}}}}}
And this is the response:
{
  "statusCode": 400,
  "error": "Bad Request",
  "message": "[illegal_argument_exception] Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [serviceName] in order to load field data by uninverting the inverted index. Note that this can use significant memory."
}
Traces were visible in the UI for a few hours after I first set everything up, but at some point new traces stopped appearing and the requests started failing with the error shown above.
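My working theory is that serviceName ended up mapped as text instead of keyword in the span index, which would explain why the terms aggregation is rejected, but I still need to confirm that with a field-mapping request along these lines:

GET otel-v1-apm-span-000001/_mapping/field/serviceName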
Here is a sample document from the trace index:
{
  "_index" : "otel-v1-apm-span-000001",
  "_id" : "fd9ab0351d240794",
  "_score" : 1.0,
  "_source" : {
    "traceId" : "0eb4239a3677f2e344d66b623c0db8eb",
    "droppedLinksCount" : 0,
    "kind" : "SPAN_KIND_CLIENT",
    "droppedEventsCount" : 0,
    "traceGroupFields" : {
      "endTime" : "2025-02-10T14:04:09.833302330Z",
      "durationInNanos" : 10024170551,
      "statusCode" : 0
    },
    "traceGroup" : "POST",
    "serviceName" : "redacted-service1",
    "parentSpanId" : "",
    "spanId" : "fd9ab0351d240794",
    "traceState" : "",
    "name" : "POST",
    "startTime" : "2025-02-10T14:03:59.809131779Z",
    "links" : [ ],
    "endTime" : "2025-02-10T14:04:09.833302330Z",
    "droppedAttributesCount" : 0,
    "durationInNanos" : 10024170551,
    "events" : [ ],
    "span.attributes.http@method" : "POST",
    "span.attributes.http@url" : "https://sqs.eu-central-1.amazonaws.com/",
    "instrumentationScope.name" : "opentelemetry.instrumentation.aiohttp_client",
    "resource.attributes.telemetry@sdk@name" : "opentelemetry",
    "resource.attributes.telemetry@sdk@language" : "python",
    "resource.attributes.telemetry@sdk@version" : "1.30.0",
    "resource.attributes.telemetry@auto@version" : "0.51b0",
    "resource.attributes.service@name" : "redacted-service1",
    "span.attributes.http@status_code" : 200,
    "status.code" : 0,
    "instrumentationScope.version" : "0.51b0"
  }
}
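Since Data Prepper is supposed to install an index template that maps serviceName as a keyword when index_type is trace-analytics-raw, I also want to check whether that template is actually present on the domain, for example with:

GET _cat/templates?v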
And finally, here is the pipeline.yaml that Data Prepper is using:
# based on: https://github.com/opensearch-project/data-prepper/blob/main/docs/trace_analytics.md
otel-trace-pipeline:
  workers: 8
  delay: "100"
  source:
    otel_trace_source:
      ssl: false
      authentication:
        unauthenticated:
      health_check_service: true
      proto_reflection_service: true
  buffer:
    bounded_blocking:
      buffer_size: 25600
      batch_size: 400
  sink:
    - stdout:
    - pipeline:
        name: "raw-pipeline"
    - pipeline:
        name: "service-map-pipeline"

raw-pipeline:
  workers: 8
  delay: "3000"
  source:
    pipeline:
      name: "otel-trace-pipeline"
  buffer:
    bounded_blocking:
      buffer_size: 25600
      batch_size: 3200
  processor:
    - otel_traces:
    - otel_trace_group:
        hosts: [ "${opensearch_host}" ]
        insecure: true
        aws_sigv4: true
        aws_region: ${aws_region}
  sink:
    - opensearch:
        hosts: [ "${opensearch_host}" ]
        index_type: trace-analytics-raw
        insecure: true
        aws_sigv4: true
        aws_region: ${aws_region}

service-map-pipeline:
  workers: 8
  delay: "100"
  source:
    pipeline:
      name: "otel-trace-pipeline"
  processor:
    - service_map:
        window_duration: 180
  buffer:
    bounded_blocking:
      buffer_size: 25600
      batch_size: 400
  sink:
    - opensearch:
        hosts: [ "${opensearch_host}" ]
        index_type: trace-analytics-service-map
        insecure: true
        aws_sigv4: true
        aws_region: ${aws_region}
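For completeness, this is roughly what I would expect the serviceName field to look like in the index mapping if the template had been applied (based on my reading of the Data Prepper template, so the exact shape is an assumption on my part):

"serviceName" : {
  "type" : "keyword",
  "ignore_above" : 1024
}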
Does anyone have any idea what might be wrong?