# OpenTelemetry Collector configuration
# Receives metrics, logs, and traces and exports them to various backends.

receivers:
  # Receivers are the entry points through which telemetry enters the
  # collector. Every endpoint below binds to 0.0.0.0 (all interfaces).

  # OTLP receiver (standard OpenTelemetry protocol): gRPC on 4317, HTTP on 4318.
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

  # Prometheus receiver (scrape Prometheus endpoints).
  # The single job scrapes localhost:8888, the same port that
  # service.telemetry.metrics.address is set to in this file.
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 30s
          static_configs:
            - targets: ['localhost:8888']

  # Host metrics (CPU, memory, disk, network).
  # Scrapers are listed with no per-scraper options.
  hostmetrics:
    collection_interval: 30s
    scrapers:
      cpu:
      memory:
      disk:
      network:
      filesystem:
      load:

  # Kubernetes receiver (cluster metrics), authenticating with the pod's
  # service account.
  k8s_cluster:
    auth_type: serviceAccount
    node_conditions_to_report: [Ready, MemoryPressure, DiskPressure]
    distribution: kubernetes

  # Zipkin receiver (legacy tracing).
  zipkin:
    endpoint: 0.0.0.0:9411

processors:
  # Processors are only defined here; the order in which they run is set
  # per pipeline under service.pipelines.

  # Batch processor (improves performance).
  batch:
    timeout: 10s
    send_batch_size: 1024
    send_batch_max_size: 2048

  # Memory limiter (prevent OOM).
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
    spike_limit_mib: 128

  # Resource processor (add resource attributes).
  # `insert` is used for both keys.
  resource:
    attributes:
      - key: environment
        value: production
        action: insert
      - key: cluster.name
        value: prod-cluster
        action: insert

  # Attributes processor (modify span/metric attributes).
  attributes:
    actions:
      - key: http.url
        action: delete  # Remove potentially sensitive URLs
      - key: db.statement
        action: hash  # Hash SQL queries for privacy

  # Filter processor (drop unwanted data).
  filter:
    metrics:
      # Drop metrics whose names match any of these regular expressions.
      exclude:
        match_type: regexp
        metric_names:
          - '^go_.*'  # Drop Go runtime metrics
          - '^process_.*'  # Drop process metrics

  # Tail sampling (intelligent trace sampling).
  # NOTE(review): num_traces is set to 100 while decision_wait is 10s;
  # confirm this is large enough for the expected trace rate.
  tail_sampling:
    decision_wait: 10s
    num_traces: 100
    policies:
      # Always sample errors
      - name: error-policy
        type: status_code
        status_code:
          status_codes: [ERROR]

      # Sample slow traces (threshold in milliseconds)
      - name: latency-policy
        type: latency
        latency:
          threshold_ms: 1000

      # Sample 10% of others
      - name: probabilistic-policy
        type: probabilistic
        probabilistic:
          sampling_percentage: 10

  # Span processor (modify spans).
  # NOTE(review): both to_attributes and from_attributes are configured;
  # confirm the order in which the processor applies them, since a renamed
  # span may no longer match the rule below.
  span:
    name:
      to_attributes:
        rules:
          # Single-quoted so the backslashes stay literal.
          - '^\/api\/v1\/users\/(?P<user_id>.*)$'
      from_attributes:
        - db.name
        - http.method

exporters:
  # Exporters are only defined here; they take effect when listed in a
  # pipeline under service.pipelines. `datadog` and `file` are defined but
  # not referenced by any pipeline in this file.

  # Prometheus exporter (expose metrics endpoint).
  prometheus:
    endpoint: 0.0.0.0:8889
    namespace: otel

  # OTLP exporters (send to backends). TLS is disabled (insecure: true).
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  # NOTE(review): confirm Mimir accepts OTLP over gRPC on this port in your
  # deployment.
  otlp/mimir:
    endpoint: mimir:4317
    tls:
      insecure: true

  # Loki exporter (for logs).
  # NOTE(review): confirm the `labels` block is supported by the exporter
  # version you run.
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    labels:
      resource:
        service.name: "service_name"
        service.namespace: "service_namespace"
      attributes:
        level: "level"

  # Jaeger exporter (alternative tracing backend).
  # NOTE(review): confirm your collector distribution still ships a
  # dedicated jaeger exporter; if not, Jaeger can be fed through an otlp
  # exporter instead.
  jaeger:
    endpoint: jaeger:14250
    tls:
      insecure: true

  # Elasticsearch exporter (for logs).
  elasticsearch:
    endpoints:
      - http://elasticsearch:9200
    logs_index: otel-logs
    traces_index: otel-traces

  # CloudWatch exporter (AWS).
  # NOTE(review): confirm `awscloudwatch` is a valid exporter type in your
  # distribution. This entry mixes a metrics-style `namespace` with log
  # group/stream names and is used by both the metrics and logs pipelines.
  awscloudwatch:
    region: us-east-1
    namespace: MyApp
    log_group_name: /aws/otel/logs
    log_stream_name: otel-collector

  # Datadog exporter. The API key is read from ${DD_API_KEY}.
  datadog:
    api:
      key: ${DD_API_KEY}
      site: datadoghq.com

  # File exporter (debugging).
  file:
    path: /tmp/otel-output.json

  # Logging exporter (console output for debugging).
  # NOTE(review): confirm this exporter name is still accepted by your
  # collector version.
  logging:
    verbosity: detailed
    sampling_initial: 5
    sampling_thereafter: 200

extensions:
  # Extensions are enabled through the service.extensions list.
  # NOTE(review): all three endpoints bind to 0.0.0.0; confirm that exposing
  # pprof and zpages beyond localhost is intended.

  # Health check endpoint
  health_check:
    endpoint: 0.0.0.0:13133

  # Pprof endpoint (for profiling)
  pprof:
    endpoint: 0.0.0.0:1777

  # ZPages (internal diagnostics)
  zpages:
    endpoint: 0.0.0.0:55679

service:
  # Wires the components defined above into running pipelines.
  extensions: [health_check, pprof, zpages]

  # Processors run in the order listed. memory_limiter stays first, and
  # batch is placed last so that batches are built only from data that
  # survived sampling and filtering.
  pipelines:
    # Traces pipeline
    traces:
      receivers: [otlp, zipkin]
      processors: [memory_limiter, tail_sampling, resource, span, batch]
      exporters: [otlp/tempo, jaeger, logging]

    # Metrics pipeline
    metrics:
      receivers: [otlp, prometheus, hostmetrics, k8s_cluster]
      processors: [memory_limiter, filter, resource, batch]
      exporters: [otlp/mimir, prometheus, awscloudwatch]

    # Logs pipeline
    logs:
      receivers: [otlp]
      processors: [memory_limiter, resource, attributes, batch]
      exporters: [loki, elasticsearch, awscloudwatch]

  # Telemetry (collector's own logs and metrics).
  # Port 8888 is the one the prometheus receiver in this file scrapes.
  telemetry:
    logs:
      level: info
    metrics:
      address: 0.0.0.0:8888

# Notes:
# 1. Provide DD_API_KEY through an environment variable (preferred), or
#    replace ${DD_API_KEY} with the actual API key; never commit a real key.
# 2. Adjust endpoints to match your infrastructure.
# 3. Comment out exporters you don't use, and remove them from the pipelines.
# 4. Adjust sampling rates based on your volume and needs.
# 5. Add TLS configuration for production deployments.