Initial commit

2025-11-29 18:28:04 +08:00
commit bd47b24e8d
12 changed files with 5227 additions and 0 deletions
--- a/skills/go-optimization/SKILL.md
+++ b/skills/go-optimization/SKILL.md
@@ -0,0 +1,654 @@
+---
+name: go-optimization
+description: Performance optimization techniques including profiling, memory management, benchmarking, and runtime tuning. Use when optimizing Go code performance, reducing memory usage, or analyzing bottlenecks.
+---
+
+# Go Optimization Skill
+
+This skill provides expert guidance on Go performance optimization, covering profiling, benchmarking, memory management, and runtime tuning for building high-performance applications.
+
+## When to Use
+
+Activate this skill when:
+- Profiling application performance
+- Optimizing CPU-intensive operations
+- Reducing memory allocations
+- Tuning garbage collection
+- Writing benchmarks
+- Analyzing performance bottlenecks
+- Optimizing hot paths
+- Reducing lock contention
+
+## Profiling
+
+### CPU Profiling
+
+```go
+import (
+    "os"
+    "runtime/pprof"
+)
+
+func main() {
+    // Start CPU profiling
+    f, err := os.Create("cpu.prof")
+    if err != nil {
+        log.Fatal(err)
+    }
+    defer f.Close()
+
+    if err := pprof.StartCPUProfile(f); err != nil {
+        log.Fatal(err)
+    }
+    defer pprof.StopCPUProfile()
+
+    // Your code here
+    runApplication()
+}
+
+// Analyze:
+// go tool pprof cpu.prof
+// (pprof) top10
+// (pprof) list functionName
+// (pprof) web
+```
+
+### Memory Profiling
+
+```go
+import (
+    "os"
+    "runtime"
+    "runtime/pprof"
+)
+
+func writeMemProfile(filename string) {
+    f, err := os.Create(filename)
+    if err != nil {
+        log.Fatal(err)
+    }
+    defer f.Close()
+
+    runtime.GC() // Force GC before snapshot
+    if err := pprof.WriteHeapProfile(f); err != nil {
+        log.Fatal(err)
+    }
+}
+
+// Analyze:
+// go tool pprof -alloc_space mem.prof
+// go tool pprof -inuse_space mem.prof
+```
+
+### HTTP Profiling
+
+```go
+import (
+    _ "net/http/pprof"
+    "net/http"
+)
+
+func main() {
+    // Enable pprof endpoints
+    go func() {
+        log.Println(http.ListenAndServe("localhost:6060", nil))
+    }()
+
+    // Your application
+    runServer()
+}
+
+// Access profiles:
+// http://localhost:6060/debug/pprof/
+// go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30
+// go tool pprof http://localhost:6060/debug/pprof/heap
+```
+
+### Execution Tracing
+
+```go
+import (
+    "os"
+    "runtime/trace"
+)
+
+func main() {
+    f, err := os.Create("trace.out")
+    if err != nil {
+        log.Fatal(err)
+    }
+    defer f.Close()
+
+    if err := trace.Start(f); err != nil {
+        log.Fatal(err)
+    }
+    defer trace.Stop()
+
+    // Your code
+    runApplication()
+}
+
+// View trace:
+// go tool trace trace.out
+```
+
+## Benchmarking
+
+### Basic Benchmarks
+
+```go
+func BenchmarkStringConcat(b *testing.B) {
+    for i := 0; i < b.N; i++ {
+        _ = "hello" + " " + "world"
+    }
+}
+
+func BenchmarkStringBuilder(b *testing.B) {
+    for i := 0; i < b.N; i++ {
+        var sb strings.Builder
+        sb.WriteString("hello")
+        sb.WriteString(" ")
+        sb.WriteString("world")
+        _ = sb.String()
+    }
+}
+
+// Run: go test -bench=. -benchmem
+```
+
+### Sub-benchmarks
+
+```go
+func BenchmarkEncode(b *testing.B) {
+    data := generateTestData()
+
+    b.Run("JSON", func(b *testing.B) {
+        b.ReportAllocs()
+        for i := 0; i < b.N; i++ {
+            json.Marshal(data)
+        }
+    })
+
+    b.Run("MessagePack", func(b *testing.B) {
+        b.ReportAllocs()
+        for i := 0; i < b.N; i++ {
+            msgpack.Marshal(data)
+        }
+    })
+}
+```
+
+### Parallel Benchmarks
+
+```go
+func BenchmarkConcurrentAccess(b *testing.B) {
+    cache := NewCache()
+
+    b.RunParallel(func(pb *testing.PB) {
+        for pb.Next() {
+            cache.Get("key")
+        }
+    })
+}
+```
+
+### Benchmark Comparison
+
+```bash
+# Run benchmarks and save results
+go test -bench=. -benchmem > old.txt
+
+# Make optimizations
+
+# Run again and compare
+go test -bench=. -benchmem > new.txt
+benchstat old.txt new.txt
+```
+
+## Memory Optimization
+
+### Escape Analysis
+
+```go
+// Check what escapes to heap
+// go build -gcflags="-m" main.go
+
+// ✅ GOOD: Stack allocation
+func stackAlloc() int {
+    x := 42
+    return x
+}
+
+// ❌ BAD: Heap escape
+func heapEscape() *int {
+    x := 42
+    return &x // x escapes to heap
+}
+
+// ✅ GOOD: Interface without allocation
+func noAlloc(w io.Writer, data []byte) {
+    w.Write(data)
+}
+
+// ❌ BAD: Interface causes allocation
+func withAlloc() io.Writer {
+    var b bytes.Buffer
+    return &b // &b escapes
+}
+```
+
+### Pre-allocation
+
+```go
+// ❌ BAD: Growing slice
+func badAppend(n int) []int {
+    var result []int
+    for i := 0; i < n; i++ {
+        result = append(result, i) // Multiple allocations
+    }
+    return result
+}
+
+// ✅ GOOD: Pre-allocate
+func goodAppend(n int) []int {
+    result := make([]int, 0, n) // Single allocation
+    for i := 0; i < n; i++ {
+        result = append(result, i)
+    }
+    return result
+}
+
+// ✅ GOOD: Known length
+func knownLength(n int) []int {
+    result := make([]int, n)
+    for i := 0; i < n; i++ {
+        result[i] = i
+    }
+    return result
+}
+
+// ❌ BAD: String concatenation
+func badConcat(strs []string) string {
+    result := ""
+    for _, s := range strs {
+        result += s // New allocation each time
+    }
+    return result
+}
+
+// ✅ GOOD: strings.Builder
+func goodConcat(strs []string) string {
+    var sb strings.Builder
+    sb.Grow(estimateSize(strs))
+    for _, s := range strs {
+        sb.WriteString(s)
+    }
+    return sb.String()
+}
+```
+
+### sync.Pool
+
+```go
+var bufferPool = sync.Pool{
+    New: func() interface{} {
+        return new(bytes.Buffer)
+    },
+}
+
+// processData runs data through a pooled scratch buffer and returns the
+// result in a slice owned by the caller.
+func processData(data []byte) []byte {
+    // Get buffer from pool
+    buf := bufferPool.Get().(*bytes.Buffer)
+    buf.Reset()
+    defer bufferPool.Put(buf)
+
+    // Use buffer
+    buf.Write(data)
+    // Process...
+
+    // buf.Bytes() aliases the buffer's internal storage, and the deferred Put
+    // hands that storage to the next caller. Copy the result out so a later
+    // user of the pooled buffer cannot overwrite the bytes returned here.
+    return append([]byte(nil), buf.Bytes()...)
+}
+
+// String builder pool
+// NOTE(review): strings.Builder.Reset discards the builder's underlying
+// buffer, so a Builder taken from this pool starts with no retained capacity;
+// the pool recycles only the small Builder struct, not the string storage.
+// For real buffer reuse, pool *bytes.Buffer (see bufferPool) or call Grow on
+// a local strings.Builder.
+var sbPool = sync.Pool{
+    New: func() interface{} {
+        return &strings.Builder{}
+    },
+}
+
+// buildString concatenates parts in order and returns the result, borrowing a
+// *strings.Builder from sbPool for the duration of the call.
+func buildString(parts []string) string {
+    sb := sbPool.Get().(*strings.Builder)
+    sb.Reset() // clear any content left by a previous user
+    defer sbPool.Put(sb)
+
+    for _, part := range parts {
+        sb.WriteString(part)
+    }
+    return sb.String()
+}
+```
+
+### Zero-Copy Techniques
+
+```go
+// Use byte slices instead of strings
+func parseHeader(header []byte) (key, value []byte) {
+    i := bytes.IndexByte(header, ':')
+    if i < 0 {
+        return nil, nil
+    }
+    return header[:i], header[i+1:]
+}
+
+// Reuse buffers
+type Parser struct {
+    buf []byte
+}
+
+func (p *Parser) Parse(data []byte) error {
+    p.buf = p.buf[:0] // Reset length, keep capacity
+    p.buf = append(p.buf, data...)
+    // Process p.buf...
+    return nil
+}
+
+// Direct writing
+func writeResponse(w io.Writer, data interface{}) error {
+    enc := json.NewEncoder(w) // Write directly to w
+    return enc.Encode(data)
+}
+```
+
+## Garbage Collection Tuning
+
+### GC Control
+
+```go
+import "runtime/debug"
+
+// Adjust GC target percentage
+debug.SetGCPercent(100) // Default
+// Higher = less frequent GC, more memory
+// Lower = more frequent GC, less memory
+
+// Force GC (use sparingly!)
+runtime.GC()
+
+// Monitor GC stats
+var stats runtime.MemStats
+runtime.ReadMemStats(&stats)
+fmt.Printf("Alloc = %v MB\n", stats.Alloc/1024/1024)
+fmt.Printf("TotalAlloc = %v MB\n", stats.TotalAlloc/1024/1024)
+fmt.Printf("Sys = %v MB\n", stats.Sys/1024/1024)
+fmt.Printf("NumGC = %v\n", stats.NumGC)
+```
+
+### GOGC Environment Variable
+
+```bash
+# Default (100%)
+GOGC=100 ./myapp
+
+# More aggressive GC (uses less memory)
+GOGC=50 ./myapp
+
+# Less frequent GC (uses more memory)
+GOGC=200 ./myapp
+
+# Disable GC (for debugging)
+GOGC=off ./myapp
+```
+
+## Concurrency Optimization
+
+### Reduce Lock Contention
+
+```go
+// ❌ BAD: Single lock
+type BadCache struct {
+    mu    sync.Mutex
+    items map[string]interface{}
+}
+
+// ✅ GOOD: RWMutex
+type GoodCache struct {
+    mu    sync.RWMutex
+    items map[string]interface{}
+}
+
+func (c *GoodCache) Get(key string) interface{} {
+    c.mu.RLock()
+    defer c.mu.RUnlock()
+    return c.items[key]
+}
+
+// ✅ BETTER: Sharded locks
+type ShardedCache struct {
+    shards [256]*shard
+}
+
+type shard struct {
+    mu    sync.RWMutex
+    items map[string]interface{}
+}
+
+func (c *ShardedCache) Get(key string) interface{} {
+    shard := c.getShard(key)
+    shard.mu.RLock()
+    defer shard.mu.RUnlock()
+    return shard.items[key]
+}
+```
+
+### Channel Buffering
+
+```go
+// Unbuffered channel: every send blocks until a receiver is ready (a synchronous hand-off and a sound default)
+ch := make(chan int)
+
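+// ✅ Buffered channel: sends block only when the buffer is full; add one when measurements show producers stalling on consumers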
+ch := make(chan int, 100)
+
+// Optimal buffer size depends on:
+// - Producer/consumer rates
+// - Memory constraints
+// - Latency requirements
+```
+
+### Atomic Operations
+
+```go
+import "sync/atomic"
+
+type Counter struct {
+    value int64
+}
+
+func (c *Counter) Increment() {
+    atomic.AddInt64(&c.value, 1)
+}
+
+func (c *Counter) Value() int64 {
+    return atomic.LoadInt64(&c.value)
+}
+
+// ✅ Faster than mutex for simple operations
+// ❌ Limited to basic types and operations
+```
+
+## Algorithmic Optimization
+
+### Map Pre-sizing
+
+```go
+// ❌ BAD: Growing map
+func badMap(items []Item) map[string]Item {
+    m := make(map[string]Item)
+    for _, item := range items {
+        m[item.ID] = item
+    }
+    return m
+}
+
+// ✅ GOOD: Pre-sized map
+func goodMap(items []Item) map[string]Item {
+    m := make(map[string]Item, len(items))
+    for _, item := range items {
+        m[item.ID] = item
+    }
+    return m
+}
+```
+
+### Avoid Unnecessary Work
+
+```go
+// ❌ BAD: Deeply nested conditions (does the same work as the version below, but is harder to follow)
+func process(items []Item) {
+    for _, item := range items {
+        if isValid(item) {
+            result := expensiveComputation(item)
+            if result > threshold {
+                handleResult(result)
+            }
+        }
+    }
+}
+
+// ✅ GOOD: Early returns
+func process(items []Item) {
+    for _, item := range items {
+        if !isValid(item) {
+            continue // Skip early
+        }
+        result := expensiveComputation(item)
+        if result <= threshold {
+            continue // Skip early
+        }
+        handleResult(result)
+    }
+}
+
+// ✅ BETTER: Fast path
+func process(items []Item) {
+    for _, item := range items {
+        // Fast path for common case
+        if item.IsSimple() {
+            handleSimple(item)
+            continue
+        }
+        // Slow path for complex case
+        handleComplex(item)
+    }
+}
+```
+
+## Runtime Tuning
+
+### GOMAXPROCS
+
+```go
+import "runtime"
+
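+// Cap how many OS threads may execute Go code at the same time.
+// The default is already the number of available CPUs, so this call is normally redundant.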
+runtime.GOMAXPROCS(runtime.NumCPU())
+
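+// The default suits both CPU-bound and I/O-bound programs: goroutines blocked
+// in system calls or network I/O do not count against GOMAXPROCS, so raising
+// it above the CPU count rarely helps. Lower it only to match a CPU quota.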
+```
+
+### Environment Variables
+
+```bash
+# Max OS threads running Go code simultaneously (default: number of CPUs)
+GOMAXPROCS=8 ./myapp
+
+# GC aggressiveness
+GOGC=100 ./myapp
+
+# Memory limit (Go 1.19+)
+GOMEMLIMIT=4GiB ./myapp
+
+# Print a summary line for every garbage collection (GC trace, not an execution trace)
+GODEBUG=gctrace=1 ./myapp
+```
+
+## Performance Patterns
+
+### Inline Functions
+
+```go
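+// The compiler inlines small, simple functions automatically.
+// Go has no //go:inline directive; the only inlining directive is
+// //go:noinline, which prevents inlining. Inspect the compiler's decisions
+// with: go build -gcflags="-m"
+
+// add is a small leaf function, so it is a candidate for automatic inlining.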
+func add(a, b int) int {
+    return a + b
+}
+
+// Keep hot-path functions small for inlining
+```
+
+### Avoid Interface Allocations
+
+```go
+import (
+    "fmt"
+    "os"
+    "strconv"
+)
+
+// ❌ BAD: Interface allocation
+func badPrint(value interface{}) {
+    fmt.Println(value) // value escapes
+}
+
+// ✅ GOOD: Type-specific functions that format without boxing.
+// Passing an int or string to fmt.Println converts it to interface{} exactly
+// as badPrint does, so these helpers bypass fmt entirely.
+
+// printInt writes value in decimal, followed by a newline, to standard output.
+func printInt(value int) {
+    var scratch [24]byte // room for a 64-bit integer with sign plus '\n'
+    line := strconv.AppendInt(scratch[:0], int64(value), 10)
+    line = append(line, '\n')
+    os.Stdout.Write(line) // write error ignored, matching the best-effort fmt.Println it replaces
+}
+
+// printString writes value, followed by a newline, to standard output.
+func printString(value string) {
+    // One write keeps the line intact when several goroutines print at once.
+    os.Stdout.WriteString(value + "\n") // write error ignored, as above
+}
+
+### Batch Operations
+
+```go
+// ❌ BAD: Individual operations
+for _, item := range items {
+    db.Insert(item) // N database calls
+}
+
+// ✅ GOOD: Batch operations
+db.BatchInsert(items) // 1 database call
+```
+
+## Best Practices
+
+1. **Profile before optimizing** - Measure, don't guess
+2. **Focus on hot paths** - Optimize the 20% that matters
+3. **Reduce allocations** - Reuse objects, pre-allocate
+4. **Use appropriate data structures** - Map vs slice vs array
+5. **Minimize lock contention** - Use RWMutex, sharding
+6. **Benchmark changes** - Use benchstat for comparisons
+7. **Test with race detector** - `go test -race`
+8. **Monitor in production** - Use profiling endpoints
+9. **Balance readability and performance** - Don't over-optimize
+10. **Use PGO** - Profile-guided optimization (preview in Go 1.20, generally available since Go 1.21)
+
+## Profile-Guided Optimization (PGO)
+
+```bash
+# 1. Build normally (PGO needs no special instrumentation build)
+go build -o myapp
+
+# 2. Run under representative load and collect a CPU profile (assumes myapp implements a -cpuprofile flag using runtime/pprof)
+./myapp -cpuprofile=default.pgo
+
+# 3. Rebuild with PGO
+go build -pgo=default.pgo -o myapp-optimized
+
+# Performance improvement: roughly 2-14% on representative programs (per the Go PGO guide)
+```
+
+## Resources
+
+Additional resources in:
+- `assets/examples/` - Performance optimization examples
+- `assets/benchmarks/` - Benchmark templates
+- `references/` - Links to profiling guides and performance papers