# OpenTelemetry Collector Configuration
# Receives metrics, logs, and traces and exports them to various backends.

receivers:
  # OTLP receiver (standard OpenTelemetry protocol)
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

  # Prometheus receiver (scrapes Prometheus endpoints)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 30s
          static_configs:
            - targets: ['localhost:8888']

  # Host metrics (CPU, memory, disk, network)
  hostmetrics:
    collection_interval: 30s
    scrapers:
      cpu:
      memory:
      disk:
      network:
      filesystem:
      load:

  # Kubernetes receiver (cluster metrics)
  k8s_cluster:
    auth_type: serviceAccount
    node_conditions_to_report: [Ready, MemoryPressure, DiskPressure]
    distribution: kubernetes

  # Zipkin receiver (legacy tracing)
  zipkin:
    endpoint: 0.0.0.0:9411

processors:
  # Batch processor (improves throughput by batching before export)
  batch:
    timeout: 10s
    send_batch_size: 1024
    send_batch_max_size: 2048

  # Memory limiter (prevents OOM by refusing data when limits are hit)
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
    spike_limit_mib: 128

  # Resource processor (adds resource attributes)
  resource:
    attributes:
      - key: environment
        value: production
        action: insert
      - key: cluster.name
        value: prod-cluster
        action: insert

  # Attributes processor (modifies span/metric/log attributes)
  attributes:
    actions:
      - key: http.url
        action: delete  # Remove potentially sensitive URLs
      - key: db.statement
        action: hash    # Hash SQL queries for privacy

  # Filter processor (drops unwanted data)
  filter:
    metrics:
      # Drop metrics matching these patterns
      exclude:
        match_type: regexp
        metric_names:
          - ^go_.*       # Drop Go runtime metrics
          - ^process_.*  # Drop process metrics

  # Tail sampling (intelligent trace sampling)
  tail_sampling:
    decision_wait: 10s
    num_traces: 100  # traces held in memory while awaiting a decision
    policies:
      # Always sample errors
      - name: error-policy
        type: status_code
        status_code:
          status_codes: [ERROR]
      # Sample slow traces
      - name: latency-policy
        type: latency
        latency:
          threshold_ms: 1000
      # Sample 10% of the rest
      - name: probabilistic-policy
        type: probabilistic
        probabilistic:
          sampling_percentage: 10

  # Span processor (modifies spans)
  span:
    name:
      to_attributes:
        rules:
          # The capture-group name (user_id) is illustrative; the original
          # rule was missing the group name, which Go's regexp requires.
          - '^\/api\/v1\/users\/(?P<user_id>.*)$'
      from_attributes:
        - db.name
        - http.method

exporters:
  # Prometheus exporter (exposes a scrapeable metrics endpoint)
  prometheus:
    endpoint: 0.0.0.0:8889
    namespace: otel

  # OTLP exporters (send to backends)
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true
  otlp/mimir:
    endpoint: mimir:4317
    tls:
      insecure: true

  # Loki exporter (for logs)
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    labels:
      resource:
        service.name: "service_name"
        service.namespace: "service_namespace"
      attributes:
        level: "level"

  # Jaeger exporter (alternative tracing backend)
  jaeger:
    endpoint: jaeger:14250
    tls:
      insecure: true

  # Elasticsearch exporter (for logs)
  elasticsearch:
    endpoints:
      - http://elasticsearch:9200
    logs_index: otel-logs
    traces_index: otel-traces

  # CloudWatch (AWS): collector-contrib has no combined CloudWatch exporter;
  # metrics go through the EMF exporter and logs through CloudWatch Logs.
  awsemf:
    region: us-east-1
    namespace: MyApp
  awscloudwatchlogs:
    region: us-east-1
    log_group_name: /aws/otel/logs
    log_stream_name: otel-collector

  # Datadog exporter
  datadog:
    api:
      key: ${DD_API_KEY}
      site: datadoghq.com

  # File exporter (debugging)
  file:
    path: /tmp/otel-output.json

  # Logging exporter (console output for debugging)
  logging:
    verbosity: detailed
    sampling_initial: 5
    sampling_thereafter: 200

extensions:
  # Health check endpoint
  health_check:
    endpoint: 0.0.0.0:13133

  # pprof endpoint (for profiling)
  pprof:
    endpoint: 0.0.0.0:1777

  # zPages (internal diagnostics)
  zpages:
    endpoint: 0.0.0.0:55679

service:
  extensions: [health_check, pprof, zpages]
  pipelines:
    # Processor order follows the recommended placement: memory_limiter
    # first, batch last, with sampling/filtering in between.
    # Traces pipeline
    traces:
      receivers: [otlp, zipkin]
      processors: [memory_limiter, tail_sampling, resource, span, batch]
      exporters: [otlp/tempo, jaeger, logging]

    # Metrics pipeline
    metrics:
      receivers: [otlp, prometheus, hostmetrics, k8s_cluster]
      processors: [memory_limiter, filter, resource, batch]
      exporters: [otlp/mimir, prometheus, awsemf]

    # Logs pipeline
    logs:
      receivers: [otlp]
      processors: [memory_limiter, resource, attributes, batch]
      exporters: [loki, elasticsearch, awscloudwatchlogs]

  # Telemetry (the collector's own logs and metrics)
  telemetry:
    logs:
      level: info
    metrics:
      address: 0.0.0.0:8888

# Notes:
# 1. ${DD_API_KEY} is expanded from the environment at startup; export the
#    variable before running the collector rather than hard-coding the key.
# 2. Adjust endpoints to match your infrastructure.
# 3. Comment out the exporters you don't use.
# 4. Adjust sampling rates based on your volume and needs.
# 5. Add TLS configuration for production deployments (see the commented
#    sketch at the end of this file).
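
# ---------------------------------------------------------------------------
# Minimal TLS sketch for note 5. This block is commented out and is an
# illustrative assumption, not part of the configuration above: the
# certificate paths are hypothetical and only the Tempo exporter is shown.
# The tls fields (insecure, ca_file, cert_file, key_file) are the collector's
# standard TLS client settings.
#
# exporters:
#   otlp/tempo:
#     endpoint: tempo:4317
#     tls:
#       insecure: false
#       ca_file: /etc/otel/certs/ca.crt        # hypothetical path
#       cert_file: /etc/otel/certs/client.crt  # hypothetical path
#       key_file: /etc/otel/certs/client.key   # hypothetical path
#
# One way to run the collector with this file (a sketch assuming the contrib
# distribution's Docker image and its default config path):
#
#   export DD_API_KEY=<your-key>
#   docker run --rm -e DD_API_KEY \
#     -v "$(pwd)/config.yaml:/etc/otelcol-contrib/config.yaml" \
#     -p 4317:4317 -p 4318:4318 -p 9411:9411 \
#     otel/opentelemetry-collector-contrib:latest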