Skip to content

Prometheus shows spans with latency >2s but no matching traces in Tempo #7500

Description

@baoyu05

I send span latency metrics to Prometheus and use the following alert rule:

# Per (service, span_name, span_kind) over the last 5m:
# all observed spans (le="+Inf" bucket) minus spans that fell in the le="2"
# bucket; a positive remainder is treated as "spans slower than 2s".
# NOTE(review): increase() extrapolates, so each side can be fractional and the
# difference may be a small non-zero value — confirm before treating any
# result > 0 as a real slow span.
sum(increase(traces_spanmetrics_latency_bucket{le="+Inf"}[5m])) by (service,span_name,span_kind)
-
sum(increase(traces_spanmetrics_latency_bucket{le="2"}[5m])) by (service,span_name,span_kind)
> 0

Prometheus returns results showing spans with latency greater than 2 seconds.
However, when I query Tempo for traces exceeding 2 seconds, no results are returned.
It looks like the metrics exist, but the corresponding traces are missing.

My questions are:

  1. Is this caused by sampling configuration?
  2. Could Tempo be dropping slow spans?
  3. Or is there a mismatch between span metrics and actual trace export?

Any guidance would be appreciated.

Image Image

Tempo version: 2.8.1
Tempo configuration:

# Deployment target and the shared HTTP/gRPC server settings.
target: scalable-single-binary
server:
  # HTTP on 3200; gRPC on 9095, the same port used by the ring instance_port
  # and frontend_address entries elsewhere in this config.
  http_listen_port: 3200
  grpc_listen_port: 9095
  # 107374182400 bytes = 100 GiB in each direction.
  # NOTE(review): this is 10x the querier gRPC client limits (10737418240 =
  # 10 GiB) set under querier.frontend_worker — confirm this is intentional.
  grpc_server_max_recv_msg_size: 107374182400
  grpc_server_max_send_msg_size: 107374182400
  # 600s matches querier.search.query_timeout.
  http_server_read_timeout: 600s
  http_server_write_timeout: 600s
  # NOTE(review): debug level — presumably enabled for troubleshooting.
  log_level: debug

# Distributor: ring membership via memberlist and the OTLP ingest endpoints.
distributor:
  ring:
    kvstore:
      store: memberlist
    # ${HOSTNAME} is an environment placeholder.
    # NOTE(review): presumably requires env expansion to be enabled when the
    # config is loaded — confirm how Tempo is started.
    instance_addr: ${HOSTNAME}
    instance_port: 9095
  receivers:
    otlp:
      protocols:
        # OTLP over gRPC (4317) and HTTP (4318), bound to all interfaces.
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
  retry_after_on_resource_exhausted: 5s

# Ingester: ring lifecycle timings and block cut/flush thresholds.
ingester:
  lifecycler:
    ring:
      kvstore:
        store: memberlist
      # NOTE(review): replication_factor 1 presumably means no redundant copy
      # of in-flight traces — confirm this is acceptable.
      replication_factor: 1
      # Ring-level heartbeat_timeout (1m). This is a different key from the
      # lifecycler-level heartbeat_timeout (3s) a few lines below; the two sit
      # at different nesting levels and are not duplicates.
      heartbeat_timeout: 1m
    address: ${HOSTNAME}
    port: 9095
    heartbeat_period: 2s
    heartbeat_timeout: 3s
    observe_period: 5s
    join_after: 5s
    final_sleep: 5s
    min_ready_duration: 3s
  flush_all_on_shutdown: true
  flush_check_period: 60s
  complete_block_timeout: 600s
  # 5242880 bytes = 5 MiB; same value as compactor.compaction.max_block_bytes.
  max_block_bytes: 5242880
  trace_idle_period: 60s
  max_block_duration: 600s

# Query-frontend: per-query-type limits and SLO thresholds.
query_frontend:
  search:
    default_result_limit: 100
    max_duration: 24h
    # Search SLO: 5s duration, 1073741824 bytes (1 GiB) throughput.
    duration_slo: 5s
    throughput_bytes_slo: 1073741824
    # Metadata SLO uses the same two thresholds as search.
    metadata_slo:
      duration_slo: 5s
      throughput_bytes_slo: 1073741824
  trace_by_id:
    duration_slo: 5s
  metrics:
    # 720h equals compactor.compaction.block_retention.
    max_duration: 720h
    concurrent_jobs: 100
  multi_tenant_queries_enabled: false

# Metrics-generator: ring membership, the three processors (service graphs,
# span metrics, local blocks) and where generated metrics are written.
metrics_generator:
  ring:
    kvstore:
      store: memberlist
    instance_addr: ${HOSTNAME}
    instance_port: 9095
  processor:
    service_graphs:
      # Same dimension list as span_metrics below.
      dimensions:
        - server.address
        - client.address
        - network.peer.address
        - ip.address
        - protocol
        - host.name
      enable_client_server_prefix: true
      enable_messaging_system_latency_histogram: true
      # Explicit empty list (not null).
      peer_attributes: []
      enable_virtual_node_label: true
      histogram_buckets: [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 6, 7, 8, 9, 10, 30]
    span_metrics:
      dimensions:
        - server.address
        - client.address
        - network.peer.address
        - ip.address
        - protocol
        - host.name
      intrinsic_dimensions:
        status_message: true
      enable_target_info: true
      # Includes a bucket boundary at 2, which the alert expression in this
      # report selects with le="2". The largest finite boundary is 30.
      histogram_buckets: [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 6, 7, 8, 9, 10, 30]
    local_blocks:
      block:
        version: vParquet3
      # NOTE(review): presumably limits this processor to server-kind spans;
      # confirm against the Tempo docs before relying on it.
      filter_server_spans: true
      flush_to_storage: true
  storage:
    path: /export/tempo_data/metrics_storage
    remote_write:
      # Generated metrics are remote-written here, with exemplars attached.
      - url: http://prome.d.vb.local/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /export/tempo_data/metrics_traces_storage
    version: vParquet3

# Querier: connection to the query-frontend and query limits.
querier:
  frontend_worker:
    # Same host placeholder and gRPC port (9095) as server.grpc_listen_port.
    frontend_address: ${HOSTNAME}:9095
    grpc_client_config:
      # 10737418240 bytes = 10 GiB in each direction.
      max_recv_msg_size: 10737418240
      max_send_msg_size: 10737418240
  search:
    # 600s matches the server HTTP read/write timeouts.
    query_timeout: 600s
  max_concurrent_queries: 30

# Compactor: ring membership plus compaction and retention settings.
compactor:
  ring:
    kvstore:
      store: memberlist
    instance_addr: ${HOSTNAME}
    instance_port: 9095
  compaction:
    compaction_cycle: 120s
    compaction_window: 300s
    # 720h = 30 days; equals query_frontend.metrics.max_duration.
    block_retention: 720h
    compacted_block_retention: 1h
    retention_concurrency: 10
    # 20971520 bytes = 20 MiB.
    v2_out_buffer_bytes: 20971520
    # 5242880 bytes = 5 MiB; same value as ingester.max_block_bytes.
    max_block_bytes: 5242880

# Trace storage: Azure backend, blocklist polling, WAL and block format.
storage:
  trace:
    backend: azure
    azure:
      # <account_name> is a redacted placeholder in this report.
      storage_account_name: <account_name>
      container_name: tempo
    blocklist_poll: 5m
    blocklist_poll_tolerate_consecutive_errors: 10
    blocklist_poll_tolerate_tenant_failures: 10
    wal:
      path: /export/tempo_data/storage_wal
      version: vParquet3
    block:
      # vParquet3 here matches the WAL and the metrics_generator block versions.
      version: vParquet3

# Memberlist gossip settings backing every ring's "memberlist" kvstore.
memberlist:
  randomize_node_name: false
  retransmit_factor: 2
  gossip_nodes: 2
  # NOTE(review): 1000 bytes looks small for a history buffer — confirm.
  message_history_buffer_bytes: 1000
  gossip_to_dead_nodes_time: 3s
  pull_push_interval: 3s
  leave_timeout: 3s
  bind_addr: ["0.0.0.0"]
  bind_port: 7946
  # Startup continues even if joining the cluster fails.
  abort_if_cluster_join_fails: false
  # Four seed members, all on the gossip port 7946; <IPn> are redacted
  # placeholders in this report.
  join_members:
    - <IP1>:7946
    - <IP2>:7946
    - <IP3>:7946
    - <IP4>:7946

# Default per-tenant overrides: ingestion limits and enabled generator processors.
overrides:
  defaults:
    ingestion:
      rate_strategy: global
      # 10^12 — NOTE(review): presumably meant as "effectively unlimited".
      max_traces_per_user: 1000000000000
    metrics_generator:
      # Enables all three processors configured under metrics_generator.processor.
      processors:
        - local-blocks
        - service-graphs
        - span-metrics

# Usage reporting is turned off.
usage_report:
  reporting_enabled: false

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions