Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 17:51:22 +08:00
commit 23753b435e
24 changed files with 9837 additions and 0 deletions

View File

@@ -0,0 +1,227 @@
# OpenTelemetry Collector Configuration
# Receives metrics, logs, and traces and exports to various backends.
---
receivers:
  # OTLP receiver (standard OpenTelemetry protocol, gRPC + HTTP)
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

  # Prometheus receiver (scrape Prometheus endpoints)
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 30s
          static_configs:
            - targets: ['localhost:8888']

  # Host metrics (CPU, memory, disk, network)
  # Bare keys enable each scraper with its default settings.
  hostmetrics:
    collection_interval: 30s
    scrapers:
      cpu:
      memory:
      disk:
      network:
      filesystem:
      load:

  # Kubernetes receiver (cluster metrics)
  k8s_cluster:
    auth_type: serviceAccount
    node_conditions_to_report: [Ready, MemoryPressure, DiskPressure]
    distribution: kubernetes

  # Zipkin receiver (legacy tracing)
  zipkin:
    endpoint: 0.0.0.0:9411

processors:
  # Batch processor (improves export throughput by batching telemetry)
  batch:
    timeout: 10s
    send_batch_size: 1024
    send_batch_max_size: 2048

  # Memory limiter (prevent OOM; must run first in every pipeline)
  memory_limiter:
    check_interval: 1s
    limit_mib: 512
    spike_limit_mib: 128

  # Resource processor (add resource attributes to all signals)
  resource:
    attributes:
      - key: environment
        value: production
        action: insert
      - key: cluster.name
        value: prod-cluster
        action: insert

  # Attributes processor (modify span/metric/log attributes)
  attributes:
    actions:
      - key: http.url
        action: delete  # Remove potentially sensitive URLs
      - key: db.statement
        action: hash  # Hash SQL queries for privacy

  # Filter processor (drop unwanted data before export)
  filter:
    metrics:
      # Drop metrics matching these regexps (quoted so the
      # inline comments are not folded into the pattern).
      exclude:
        match_type: regexp
        metric_names:
          - '^go_.*'       # Drop Go runtime metrics
          - '^process_.*'  # Drop process metrics

  # Tail sampling (intelligent trace sampling; policies are OR-ed)
  tail_sampling:
    decision_wait: 10s
    # NOTE(review): num_traces is the in-memory trace buffer size —
    # 100 is very low for production volume; confirm and tune.
    num_traces: 100
    policies:
      # Always sample errors
      - name: error-policy
        type: status_code
        status_code:
          status_codes: [ERROR]
      # Sample slow traces (> 1s)
      - name: latency-policy
        type: latency
        latency:
          threshold_ms: 1000
      # Sample 10% of everything else
      - name: probabilistic-policy
        type: probabilistic
        probabilistic:
          sampling_percentage: 10

  # Span processor (rename spans / extract attributes from span names)
  span:
    name:
      to_attributes:
        rules:
          - '^\/api\/v1\/users\/(?P<user_id>.*)$'
      from_attributes:
        - db.name
        - http.method

exporters:
  # Prometheus exporter (expose a scrapeable metrics endpoint)
  prometheus:
    endpoint: 0.0.0.0:8889
    namespace: otel

  # OTLP exporters (send to backends over gRPC)
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true
  otlp/mimir:
    endpoint: mimir:4317
    tls:
      insecure: true

  # Loki exporter (for logs)
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    labels:
      resource:
        service.name: "service_name"
        service.namespace: "service_namespace"
      attributes:
        level: "level"

  # Jaeger exporter (alternative tracing backend)
  # NOTE(review): the dedicated jaeger exporter was removed from
  # collector releases v0.86+; Jaeger now ingests OTLP natively.
  # Confirm your collector version, or switch to an otlp/jaeger exporter.
  jaeger:
    endpoint: jaeger:14250
    tls:
      insecure: true

  # Elasticsearch exporter (for logs and traces)
  elasticsearch:
    endpoints:
      - http://elasticsearch:9200
    logs_index: otel-logs
    traces_index: otel-traces

  # CloudWatch exporter (AWS)
  # NOTE(review): contrib ships `awscloudwatchlogs` (logs) and `awsemf`
  # (metrics); verify the exporter name against your collector build.
  awscloudwatch:
    region: us-east-1
    namespace: MyApp
    log_group_name: /aws/otel/logs
    log_stream_name: otel-collector

  # Datadog exporter (API key injected from the environment)
  datadog:
    api:
      key: ${DD_API_KEY}
      site: datadoghq.com

  # File exporter (debugging)
  file:
    path: /tmp/otel-output.json

  # Logging exporter (console output for debugging)
  # NOTE(review): deprecated in favor of the `debug` exporter on
  # newer collectors — confirm version before upgrading.
  logging:
    verbosity: detailed
    sampling_initial: 5
    sampling_thereafter: 200

extensions:
  # Health check endpoint
  health_check:
    endpoint: 0.0.0.0:13133
  # Pprof endpoint (for profiling)
  pprof:
    endpoint: 0.0.0.0:1777
  # ZPages (internal diagnostics)
  zpages:
    endpoint: 0.0.0.0:55679

service:
  extensions: [health_check, pprof, zpages]
  pipelines:
    # Traces pipeline
    traces:
      receivers: [otlp, zipkin]
      processors: [memory_limiter, batch, tail_sampling, resource, span]
      exporters: [otlp/tempo, jaeger, logging]
    # Metrics pipeline
    metrics:
      receivers: [otlp, prometheus, hostmetrics, k8s_cluster]
      processors: [memory_limiter, batch, filter, resource]
      exporters: [otlp/mimir, prometheus, awscloudwatch]
    # Logs pipeline
    logs:
      receivers: [otlp]
      processors: [memory_limiter, batch, resource, attributes]
      exporters: [loki, elasticsearch, awscloudwatch]
  # Telemetry (the collector's own logs and metrics)
  telemetry:
    logs:
      level: info
    metrics:
      address: 0.0.0.0:8888

# Notes:
# 1. Replace ${DD_API_KEY} with actual API key or use environment variable
# 2. Adjust endpoints to match your infrastructure
# 3. Comment out exporters you don't use
# 4. Adjust sampling rates based on your volume and needs
# 5. Add TLS configuration for production deployments