
Go Performance Optimization

Load this file when: Optimizing performance in Go projects

Profiling Tools

Built-in pprof

# CPU profiling
go test -cpuprofile=cpu.prof -bench=.
go tool pprof cpu.prof

# Memory profiling
go test -memprofile=mem.prof -bench=.
go tool pprof mem.prof

# Web UI for profiles
go tool pprof -http=:8080 cpu.prof

# Goroutine profiling (requires the net/http/pprof endpoint shown under Runtime Metrics)
go tool pprof http://localhost:6060/debug/pprof/goroutine

# Heap profiling (same endpoint)
go tool pprof http://localhost:6060/debug/pprof/heap

Benchmarking

// Basic benchmark
func BenchmarkFibonacci(b *testing.B) {
    for i := 0; i < b.N; i++ {
        fibonacci(20)
    }
}

// With sub-benchmarks
func BenchmarkSizes(b *testing.B) {
    sizes := []int{10, 100, 1000}
    for _, size := range sizes {
        b.Run(fmt.Sprintf("size=%d", size), func(b *testing.B) {
            for i := 0; i < b.N; i++ {
                process(size)
            }
        })
    }
}

// Reset timer for setup
func BenchmarkWithSetup(b *testing.B) {
    data := setupExpensiveData()
    b.ResetTimer()  // Don't count setup time

    for i := 0; i < b.N; i++ {
        process(data)
    }
}
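
Allocation counts can also be reported from inside a benchmark with b.ReportAllocs(), which is equivalent to running that one benchmark with -benchmem. A minimal sketch (process and data are placeholders):

// Report allocations per op for this benchmark, even without -benchmem
func BenchmarkProcessAllocs(b *testing.B) {
    b.ReportAllocs()
    for i := 0; i < b.N; i++ {
        process(data)
    }
}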

Runtime Metrics

import (
    "fmt"
    "log"
    "net/http"
    _ "net/http/pprof"  // Registers /debug/pprof handlers as a side effect
    "runtime"
)

func init() {
    // Expose the profiling endpoint on localhost only
    go func() {
        log.Println(http.ListenAndServe("localhost:6060", nil))
    }()
}

// Monitor goroutines and memory
func printStats() {
    fmt.Printf("Goroutines: %d\n", runtime.NumGoroutine())

    var m runtime.MemStats
    runtime.ReadMemStats(&m)
    fmt.Printf("Alloc: %d MB\n", m.Alloc/1024/1024)
    fmt.Printf("TotalAlloc: %d MB\n", m.TotalAlloc/1024/1024)
}

Memory Management

Avoiding Allocations

// Bad: Allocates on every call
func process(data []byte) []byte {
    result := make([]byte, len(data))  // New allocation
    copy(result, data)
    return result
}

// Good: Reuse buffers from a pool
var bufferPool = sync.Pool{
    New: func() interface{} {
        b := make([]byte, 1024)
        return &b  // Pool pointers, not slices, so Put doesn't allocate
    },
}

func process(data []byte) {
    bufp := bufferPool.Get().(*[]byte)
    defer bufferPool.Put(bufp)
    buf := *bufp
    // Process with buf
}

Preallocate Slices

// Bad: Multiple allocations as slice grows
items := []Item{}
for i := 0; i < 1000; i++ {
    items = append(items, Item{i})  // Reallocates when cap exceeded
}

// Good: Single allocation
items := make([]Item, 0, 1000)
for i := 0; i < 1000; i++ {
    items = append(items, Item{i})  // No reallocation
}

// Or if final size is known
items := make([]Item, 1000)
for i := 0; i < 1000; i++ {
    items[i] = Item{i}
}
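
When appending to a slice that already exists, the standard slices package (Go 1.21+) can reserve capacity up front instead; a small sketch (needs import "slices"):

// Ensure capacity for 1000 more appends without reallocating
items = slices.Grow(items, 1000)
for i := 0; i < 1000; i++ {
    items = append(items, Item{i})
}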

String vs []byte

// Bad: String concatenation allocates
var result string
for _, s := range parts {
    result += s  // New string allocated each iteration
}

// Good: Use strings.Builder
var builder strings.Builder
builder.Grow(estimatedSize)  // Preallocate once
for _, s := range parts {
    builder.WriteString(s)
}
result := builder.String()

// For byte operations, work with []byte
data := []byte("hello")
data = append(data, " world"...)  // Efficient
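
In the same spirit, the strconv Append* functions format values directly into a byte slice, avoiding intermediate strings; a sketch (needs import "strconv"):

// Format values straight into a reusable buffer
buf := make([]byte, 0, 64)
buf = strconv.AppendInt(buf, 42, 10)  // "42"
buf = append(buf, ':')
buf = strconv.AppendBool(buf, true)   // "42:true"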

Goroutine Optimization

Worker Pool Pattern

// Bad: Unlimited goroutines
for _, task := range tasks {
    go process(task)  // Could spawn millions!
}

// Good: Limited worker pool
func workerPool(tasks <-chan Task, workers int) {
    var wg sync.WaitGroup
    for i := 0; i < workers; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for task := range tasks {
                process(task)
            }
        }()
    }
    wg.Wait()
}

// Usage
taskChan := make(chan Task, 100)
go func() {
    for _, t := range tasks {
        taskChan <- t
    }
    close(taskChan)  // Workers exit once the channel drains
}()
workerPool(taskChan, 10)  // 10 workers; blocks until all tasks finish

Channel Patterns

// Buffered channels reduce blocking
ch := make(chan int, 100)  // Buffer of 100

// Fan-out pattern for parallel work
func fanOut(in <-chan int, n int) []<-chan int {
    outs := make([]<-chan int, n)
    for i := 0; i < n; i++ {
        out := make(chan int)
        outs[i] = out
        go func() {
            for v := range in {
                out <- process(v)
            }
            close(out)
        }()
    }
    return outs
}

// Fan-in pattern to merge results
func fanIn(channels ...<-chan int) <-chan int {
    out := make(chan int)
    var wg sync.WaitGroup

    for _, ch := range channels {
        wg.Add(1)
        go func(c <-chan int) {
            defer wg.Done()
            for v := range c {
                out <- v
            }
        }(ch)
    }

    go func() {
        wg.Wait()
        close(out)
    }()

    return out
}
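
The two patterns compose: fan work out to several goroutines, then merge their results back into one channel. A sketch wiring the functions above together (process is the same placeholder):

in := make(chan int)
go func() {
    for i := 0; i < 100; i++ {
        in <- i
    }
    close(in)  // Lets the fan-out workers finish
}()

for v := range fanIn(fanOut(in, 4)...) {
    fmt.Println(v)
}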

Data Structure Optimization

Map Preallocation

// Bad: Map grows as needed
m := make(map[string]int)
for i := 0; i < 10000; i++ {
    m[fmt.Sprint(i)] = i  // Triggers periodic growth and rehashing
}

// Good: Preallocate with a size hint
m := make(map[string]int, 10000)
for i := 0; i < 10000; i++ {
    m[fmt.Sprint(i)] = i  // No rehashing during inserts
}

Struct Field Alignment

// Bad: Poor alignment (40 bytes due to padding)
type BadLayout struct {
    a bool   // 1 byte + 7 padding
    b int64  // 8 bytes
    c bool   // 1 byte + 7 padding
    d int64  // 8 bytes
    e bool   // 1 byte + 7 padding
}

// Good: Optimal alignment (24 bytes)
type GoodLayout struct {
    b int64  // 8 bytes
    d int64  // 8 bytes
    a bool   // 1 byte
    c bool   // 1 byte
    e bool   // 1 byte + 5 padding
}
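
unsafe.Sizeof confirms the layouts above, and the fieldalignment analyzer in golang.org/x/tools can flag suboptimal orderings automatically. A quick check (needs imports "fmt" and "unsafe"):

// Struct sizes on a 64-bit platform
fmt.Println(unsafe.Sizeof(BadLayout{}))   // 40
fmt.Println(unsafe.Sizeof(GoodLayout{}))  // 24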

I/O Optimization

Buffered I/O

// Bad: Default buffering can issue many small reads on large files
file, _ := os.Open("file.txt")
scanner := bufio.NewScanner(file)

// Good: Larger read buffer means fewer syscalls
file, _ := os.Open("file.txt")
reader := bufio.NewReaderSize(file, 64*1024)  // 64KB buffer
scanner := bufio.NewScanner(reader)

Connection Pooling

// HTTP client with connection pooling
client := &http.Client{
    Transport: &http.Transport{
        MaxIdleConns:        100,
        MaxIdleConnsPerHost: 10,
        IdleConnTimeout:     90 * time.Second,
    },
    Timeout: 10 * time.Second,
}

// Database connection pool
db, _ := sql.Open("postgres", dsn)
db.SetMaxOpenConns(25)
db.SetMaxIdleConns(5)
db.SetConnMaxLifetime(5 * time.Minute)

Performance Anti-Patterns

Unnecessary Interface Conversions

// Bad: Interface conversion in hot path
func process(items []interface{}) {
    for _, item := range items {
        v := item.(MyType)  // Type assertion overhead
        use(v)
    }
}

// Good: Use concrete types
func process(items []MyType) {
    for _, item := range items {
        use(item)  // Direct access
    }
}
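
On Go 1.18+, generics are another way to keep a hot loop type-safe without boxing every element into interface{}; performance still depends on how the compiler instantiates the generic code, so benchmark before relying on it. A sketch (use is a placeholder):

// Generic version: no per-element type assertion
func processAll[T any](items []T, use func(T)) {
    for _, item := range items {
        use(item)
    }
}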

Defer in Loops

// Bad: Defers accumulate in loop
for _, file := range files {
    f, _ := os.Open(file)
    defer f.Close()  // All close calls deferred until function returns!
}

// Good: Close immediately, or wrap each iteration in a closure
for _, file := range files {
    func() {
        f, _ := os.Open(file)
        defer f.Close()  // Deferred to end of this closure
        process(f)
    }()
}

Lock Contention

// Bad: Lock held during expensive operation
mu.Lock()
result := expensiveComputation(data)
cache[key] = result
mu.Unlock()

// Good: Minimize lock time
result := expensiveComputation(data)
mu.Lock()
cache[key] = result
mu.Unlock()

// Better for read-heavy workloads: sync.Map avoids lock contention on reads
var cache sync.Map
cache.Store(key, value)
val, ok := cache.Load(key)
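
For read-mostly maps, a sync.RWMutex-guarded map is a common alternative to sync.Map; a minimal sketch (Result is a placeholder type, needs import "sync"):

type Cache struct {
    mu sync.RWMutex
    m  map[string]Result
}

func (c *Cache) Get(key string) (Result, bool) {
    c.mu.RLock()  // Readers proceed in parallel
    v, ok := c.m[key]
    c.mu.RUnlock()
    return v, ok
}

func (c *Cache) Set(key string, v Result) {
    c.mu.Lock()  // Writers get exclusive access
    c.m[key] = v
    c.mu.Unlock()
}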

Compiler Optimizations

Escape Analysis

// Bad: Escapes to heap
func newPoint() *Point {
    p := Point{X: 1, Y: 2}
    return &p  // Returned pointer outlives the function; p is heap-allocated
}

// Good: Stays on stack
func makePoint() Point {
    p := Point{X: 1, Y: 2}
    return p  // Returned by value; p can stay on the stack
}

// Check with: go build -gcflags='-m'

Inline Functions

// Small functions are inlined automatically
func add(a, b int) int {
    return a + b  // Will be inlined
}

// Prevent inlining if needed: //go:noinline

Performance Checklist

Before Optimizing:

  • Profile with pprof to identify bottlenecks
  • Write benchmarks for hot paths
  • Measure allocations with -benchmem
  • Check for goroutine leaks (see the goleak sketch below)
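
A common way to catch leaks in tests is go.uber.org/goleak (listed under Tools below); a minimal sketch:

// Fails the test binary if goroutines are still running at exit
func TestMain(m *testing.M) {
    goleak.VerifyTestMain(m)
}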

Go-Specific Optimizations:

  • Preallocate slices and maps with known capacity
  • Use strings.Builder for string concatenation
  • Implement worker pools instead of unlimited goroutines
  • Use buffered channels to reduce blocking
  • Reuse buffers with sync.Pool
  • Minimize allocations in hot paths
  • Order struct fields by size (largest first)
  • Use concrete types instead of interfaces in hot paths
  • Avoid defer in tight loops
  • Use sync.Map for concurrent read-heavy maps

After Optimizing:

  • Re-profile to verify improvements
  • Compare benchmarks: benchstat old.txt new.txt (workflow sketch below)
  • Check memory allocations decreased
  • Monitor goroutine count in production
  • Use go test -race to check for race conditions
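
A typical before/after comparison, assuming benchstat is installed (go install golang.org/x/perf/cmd/benchstat@latest):

# Capture stable numbers before and after the change
go test -bench=. -benchmem -count=10 > old.txt
# ...apply the optimization...
go test -bench=. -benchmem -count=10 > new.txt
benchstat old.txt new.txt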

Tools and Packages

Profiling:

  • pprof - Built-in profiler
  • go-torch - Flamegraph generation (deprecated; go tool pprof's web UI now renders flame graphs)
  • benchstat - Compare benchmark results
  • trace - Execution tracer

Optimization:

  • sync.Pool - Object pooling
  • sync.Map - Concurrent map
  • strings.Builder - Efficient string building
  • bufio - Buffered I/O

Analysis:

  • -gcflags='-m' - Escape analysis
  • go test -race - Race detector
  • go test -benchmem - Memory allocations
  • goleak - Goroutine leak detection
