# Go Performance Optimization **Load this file when:** Optimizing performance in Go projects ## Profiling Tools ### Built-in pprof ```bash # CPU profiling go test -cpuprofile=cpu.prof -bench=. go tool pprof cpu.prof # Memory profiling go test -memprofile=mem.prof -bench=. go tool pprof mem.prof # Web UI for profiles go tool pprof -http=:8080 cpu.prof # Goroutine profiling go tool pprof http://localhost:6060/debug/pprof/goroutine # Heap profiling go tool pprof http://localhost:6060/debug/pprof/heap ``` ### Benchmarking ```go // Basic benchmark func BenchmarkFibonacci(b *testing.B) { for i := 0; i < b.N; i++ { fibonacci(20) } } // With sub-benchmarks func BenchmarkSizes(b *testing.B) { sizes := []int{10, 100, 1000} for _, size := range sizes { b.Run(fmt.Sprintf("size=%d", size), func(b *testing.B) { for i := 0; i < b.N; i++ { process(size) } }) } } // Reset timer for setup func BenchmarkWithSetup(b *testing.B) { data := setupExpensiveData() b.ResetTimer() // Don't count setup time for i := 0; i < b.N; i++ { process(data) } } ``` ### Runtime Metrics ```go import ( "net/http" _ "net/http/pprof" // Import for side effects "runtime" ) func init() { // Enable profiling endpoint go func() { http.ListenAndServe("localhost:6060", nil) }() } // Monitor goroutines func printStats() { fmt.Printf("Goroutines: %d\n", runtime.NumGoroutine()) var m runtime.MemStats runtime.ReadMemStats(&m) fmt.Printf("Alloc: %d MB\n", m.Alloc/1024/1024) fmt.Printf("TotalAlloc: %d MB\n", m.TotalAlloc/1024/1024) } ``` ## Memory Management ### Avoiding Allocations ```go // Bad: Allocates on every call func process(data []byte) []byte { result := make([]byte, len(data)) // New allocation copy(result, data) return result } // Good: Reuse buffer (pool a *[]byte; putting a bare slice in the pool allocates to box its header) var bufferPool = sync.Pool{ New: func() interface{} { buf := make([]byte, 1024) return &buf }, } func process(data []byte) { bufPtr := bufferPool.Get().(*[]byte) defer bufferPool.Put(bufPtr) buf := *bufPtr // Process with buf } ``` ### Preallocate Slices ```go // Bad: Multiple allocations as 
slice grows items := []Item{} for i := 0; i < 1000; i++ { items = append(items, Item{i}) // Reallocates when cap exceeded } // Good: Single allocation items := make([]Item, 0, 1000) for i := 0; i < 1000; i++ { items = append(items, Item{i}) // No reallocation } // Or if final size is known items := make([]Item, 1000) for i := 0; i < 1000; i++ { items[i] = Item{i} } ``` ### String vs []byte ```go // Bad: String concatenation allocates var result string for _, s := range parts { result += s // New allocation each time } // Good: Use strings.Builder var builder strings.Builder builder.Grow(estimatedSize) // Preallocate for _, s := range parts { builder.WriteString(s) } result := builder.String() // For byte operations, work with []byte data := []byte("hello") data = append(data, " world"...) // Efficient ``` ## Goroutine Optimization ### Worker Pool Pattern ```go // Bad: Unlimited goroutines for _, task := range tasks { go process(task) // Could spawn millions! } // Good: Limited worker pool func workerPool(tasks <-chan Task, workers int) { var wg sync.WaitGroup for i := 0; i < workers; i++ { wg.Add(1) go func() { defer wg.Done() for task := range tasks { process(task) } }() } wg.Wait() } // Usage taskChan := make(chan Task, 100) go workerPool(taskChan, 10) // 10 workers ``` ### Channel Patterns ```go // Buffered channels reduce blocking ch := make(chan int, 100) // Buffer of 100 // Fan-out pattern for parallel work func fanOut(in <-chan int, n int) []<-chan int { outs := make([]<-chan int, n) for i := 0; i < n; i++ { out := make(chan int) outs[i] = out go func() { for v := range in { out <- process(v) } close(out) }() } return outs } // Fan-in pattern to merge results func fanIn(channels ...<-chan int) <-chan int { out := make(chan int) var wg sync.WaitGroup for _, ch := range channels { wg.Add(1) go func(c <-chan int) { defer wg.Done() for v := range c { out <- v } }(ch) } go func() { wg.Wait() close(out) }() return out } ``` ## Data Structure Optimization ### 
Map Preallocation ```go // Bad: Map grows as needed m := make(map[string]int) for i := 0; i < 10000; i++ { m[fmt.Sprint(i)] = i // Reallocates periodically } // Good: Preallocate m := make(map[string]int, 10000) for i := 0; i < 10000; i++ { m[fmt.Sprint(i)] = i // No reallocation } ``` ### Struct Field Alignment ```go // Bad: Poor alignment (40 bytes due to padding) type BadLayout struct { a bool // 1 byte + 7 padding b int64 // 8 bytes c bool // 1 byte + 7 padding d int64 // 8 bytes e bool // 1 byte + 7 padding } // Good: Optimal alignment (24 bytes) type GoodLayout struct { b int64 // 8 bytes d int64 // 8 bytes a bool // 1 byte c bool // 1 byte e bool // 1 byte + 5 padding } ``` ## I/O Optimization ### Buffered I/O ```go // Bad: Small default buffer (bufio.Scanner starts with a 4KB read buffer) file, _ := os.Open("file.txt") scanner := bufio.NewScanner(file) // Good: Buffered with custom size file, _ := os.Open("file.txt") reader := bufio.NewReaderSize(file, 64*1024) // 64KB buffer scanner := bufio.NewScanner(reader) ``` ### Connection Pooling ```go // HTTP client with connection pooling client := &http.Client{ Transport: &http.Transport{ MaxIdleConns: 100, MaxIdleConnsPerHost: 10, IdleConnTimeout: 90 * time.Second, }, Timeout: 10 * time.Second, } // Database connection pool db, _ := sql.Open("postgres", dsn) db.SetMaxOpenConns(25) db.SetMaxIdleConns(5) db.SetConnMaxLifetime(5 * time.Minute) ``` ## Performance Anti-Patterns ### Unnecessary Interface Conversions ```go // Bad: Interface conversion in hot path func process(items []interface{}) { for _, item := range items { v := item.(MyType) // Type assertion overhead use(v) } } // Good: Use concrete types func process(items []MyType) { for _, item := range items { use(item) // Direct access } } ``` ### Defer in Loops ```go // Bad: Defers accumulate in loop for _, file := range files { f, _ := os.Open(file) defer f.Close() // All close calls deferred until function returns! 
} // Good: Close immediately or use function for _, file := range files { func() { f, _ := os.Open(file) defer f.Close() // Deferred to end of this closure process(f) }() } ``` ### Lock Contention ```go // Bad: Lock held during expensive operation mu.Lock() result := expensiveComputation(data) cache[key] = result mu.Unlock() // Good: Minimize lock time result := expensiveComputation(data) mu.Lock() cache[key] = result mu.Unlock() // Better: Use sync.Map for concurrent reads var cache sync.Map cache.Store(key, value) val, ok := cache.Load(key) ``` ## Compiler Optimizations ### Escape Analysis ```go // Bad: Escapes to heap func makeSlice() *[]int { s := make([]int, 1000) return &s // Pointer returned, allocates on heap } // Good: Return by value so the slice header is not heap-allocated func makeSlice() []int { s := make([]int, 1000) return s // Value returned; backing array can stay on stack only if the call is inlined and the slice does not escape the caller } // Check with: go build -gcflags='-m' ``` ### Inline Functions ```go // Small functions are inlined automatically func add(a, b int) int { return a + b // Will be inlined } // Prevent inlining if needed: //go:noinline ``` ## Performance Checklist **Before Optimizing:** - [ ] Profile with pprof to identify bottlenecks - [ ] Write benchmarks for hot paths - [ ] Measure allocations with `-benchmem` - [ ] Check for goroutine leaks **Go-Specific Optimizations:** - [ ] Preallocate slices and maps with known capacity - [ ] Use `strings.Builder` for string concatenation - [ ] Implement worker pools instead of unlimited goroutines - [ ] Use buffered channels to reduce blocking - [ ] Reuse buffers with `sync.Pool` - [ ] Minimize allocations in hot paths - [ ] Order struct fields by size (largest first) - [ ] Use concrete types instead of interfaces in hot paths - [ ] Avoid `defer` in tight loops - [ ] Use `sync.Map` for concurrent read-heavy maps **After Optimizing:** - [ ] Re-profile to verify improvements - [ ] Compare benchmarks: `benchstat old.txt new.txt` - [ ] Check memory allocations decreased - [ ] Monitor goroutine count in 
production - [ ] Use `go test -race` to check for race conditions ## Tools and Packages **Profiling:** - `pprof` - Built-in profiler - `go-torch` - Flamegraph generation (deprecated; `go tool pprof -http` has a built-in flame graph view since Go 1.11) - `benchstat` - Compare benchmark results - `trace` - Execution tracer **Optimization:** - `sync.Pool` - Object pooling - `sync.Map` - Concurrent map - `strings.Builder` - Efficient string building - `bufio` - Buffered I/O **Analysis:** - `-gcflags='-m'` - Escape analysis - `go test -race` - Race detector - `go test -benchmem` - Memory allocations - `goleak` - Goroutine leak detection --- *Go-specific performance optimization with goroutines, channels, and profiling*