Initial commit
This commit is contained in:
654
skills/go-optimization/SKILL.md
Normal file
654
skills/go-optimization/SKILL.md
Normal file
@@ -0,0 +1,654 @@
|
||||
---
|
||||
name: go-optimization
|
||||
description: Performance optimization techniques including profiling, memory management, benchmarking, and runtime tuning. Use when optimizing Go code performance, reducing memory usage, or analyzing bottlenecks.
|
||||
---
|
||||
|
||||
# Go Optimization Skill
|
||||
|
||||
This skill provides expert guidance on Go performance optimization, covering profiling, benchmarking, memory management, and runtime tuning for building high-performance applications.
|
||||
|
||||
## When to Use
|
||||
|
||||
Activate this skill when:
|
||||
- Profiling application performance
|
||||
- Optimizing CPU-intensive operations
|
||||
- Reducing memory allocations
|
||||
- Tuning garbage collection
|
||||
- Writing benchmarks
|
||||
- Analyzing performance bottlenecks
|
||||
- Optimizing hot paths
|
||||
- Reducing lock contention
|
||||
|
||||
## Profiling
|
||||
|
||||
### CPU Profiling
|
||||
|
||||
```go
|
||||
import (
|
||||
"os"
|
||||
"runtime/pprof"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Start CPU profiling
|
||||
f, err := os.Create("cpu.prof")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if err := pprof.StartCPUProfile(f); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer pprof.StopCPUProfile()
|
||||
|
||||
// Your code here
|
||||
runApplication()
|
||||
}
|
||||
|
||||
// Analyze:
|
||||
// go tool pprof cpu.prof
|
||||
// (pprof) top10
|
||||
// (pprof) list functionName
|
||||
// (pprof) web
|
||||
```
|
||||
|
||||
### Memory Profiling
|
||||
|
||||
```go
|
||||
import (
|
||||
"os"
|
||||
"runtime"
|
||||
"runtime/pprof"
|
||||
)
|
||||
|
||||
func writeMemProfile(filename string) {
|
||||
f, err := os.Create(filename)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
runtime.GC() // Force GC before snapshot
|
||||
if err := pprof.WriteHeapProfile(f); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// Analyze:
|
||||
// go tool pprof -alloc_space mem.prof
|
||||
// go tool pprof -inuse_space mem.prof
|
||||
```
|
||||
|
||||
### HTTP Profiling
|
||||
|
||||
```go
|
||||
import (
|
||||
_ "net/http/pprof"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Enable pprof endpoints
|
||||
go func() {
|
||||
log.Println(http.ListenAndServe("localhost:6060", nil))
|
||||
}()
|
||||
|
||||
// Your application
|
||||
runServer()
|
||||
}
|
||||
|
||||
// Access profiles:
|
||||
// http://localhost:6060/debug/pprof/
|
||||
// go tool pprof http://localhost:6060/debug/pprof/profile?seconds=30
|
||||
// go tool pprof http://localhost:6060/debug/pprof/heap
|
||||
```
|
||||
|
||||
### Execution Tracing
|
||||
|
||||
```go
|
||||
import (
|
||||
"os"
|
||||
"runtime/trace"
|
||||
)
|
||||
|
||||
func main() {
|
||||
f, err := os.Create("trace.out")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if err := trace.Start(f); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer trace.Stop()
|
||||
|
||||
// Your code
|
||||
runApplication()
|
||||
}
|
||||
|
||||
// View trace:
|
||||
// go tool trace trace.out
|
||||
```
|
||||
|
||||
## Benchmarking
|
||||
|
||||
### Basic Benchmarks
|
||||
|
||||
```go
|
||||
// concatParts and concatSink live at package level so the compiler can
// neither constant-fold the concatenation nor discard its result.
var (
	concatParts = [3]string{"hello", " ", "world"}
	concatSink  string
)

// BenchmarkStringConcat measures concatenating three strings with the +
// operator.
//
// Concatenating string literals directly ("hello" + " " + "world") is a
// constant expression evaluated at compile time, which would leave the loop
// body empty. Reading the operands from a package-level variable and storing
// the result in a package-level sink keeps the work in the timed region.
func BenchmarkStringConcat(b *testing.B) {
	for i := 0; i < b.N; i++ {
		concatSink = concatParts[0] + concatParts[1] + concatParts[2]
	}
}
|
||||
|
||||
func BenchmarkStringBuilder(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
var sb strings.Builder
|
||||
sb.WriteString("hello")
|
||||
sb.WriteString(" ")
|
||||
sb.WriteString("world")
|
||||
_ = sb.String()
|
||||
}
|
||||
}
|
||||
|
||||
// Run: go test -bench=. -benchmem
|
||||
```
|
||||
|
||||
### Sub-benchmarks
|
||||
|
||||
```go
|
||||
func BenchmarkEncode(b *testing.B) {
|
||||
data := generateTestData()
|
||||
|
||||
b.Run("JSON", func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
json.Marshal(data)
|
||||
}
|
||||
})
|
||||
|
||||
b.Run("MessagePack", func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
for i := 0; i < b.N; i++ {
|
||||
msgpack.Marshal(data)
|
||||
}
|
||||
})
|
||||
}
|
||||
```
|
||||
|
||||
### Parallel Benchmarks
|
||||
|
||||
```go
|
||||
func BenchmarkConcurrentAccess(b *testing.B) {
|
||||
cache := NewCache()
|
||||
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
cache.Get("key")
|
||||
}
|
||||
})
|
||||
}
|
||||
```
|
||||
|
||||
### Benchmark Comparison
|
||||
|
||||
```bash
|
||||
# Run benchmarks and save results
|
||||
go test -bench=. -benchmem > old.txt
|
||||
|
||||
# Make optimizations
|
||||
|
||||
# Run again and compare
|
||||
go test -bench=. -benchmem > new.txt
|
||||
benchstat old.txt new.txt
|
||||
```
|
||||
|
||||
## Memory Optimization
|
||||
|
||||
### Escape Analysis
|
||||
|
||||
```go
|
||||
// Check what escapes to heap
|
||||
// go build -gcflags="-m" main.go
|
||||
|
||||
// ✅ GOOD: Stack allocation
|
||||
func stackAlloc() int {
|
||||
x := 42
|
||||
return x
|
||||
}
|
||||
|
||||
// ❌ BAD: Heap escape
|
||||
func heapEscape() *int {
|
||||
x := 42
|
||||
return &x // x escapes to heap
|
||||
}
|
||||
|
||||
// ✅ GOOD: Interface without allocation
|
||||
func noAlloc(w io.Writer, data []byte) {
|
||||
w.Write(data)
|
||||
}
|
||||
|
||||
// ❌ BAD: Interface causes allocation
|
||||
func withAlloc() io.Writer {
|
||||
var b bytes.Buffer
|
||||
return &b // &b escapes
|
||||
}
|
||||
```
|
||||
|
||||
### Pre-allocation
|
||||
|
||||
```go
|
||||
// ❌ BAD: Growing slice
|
||||
func badAppend(n int) []int {
|
||||
var result []int
|
||||
for i := 0; i < n; i++ {
|
||||
result = append(result, i) // Multiple allocations
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ✅ GOOD: Pre-allocate
|
||||
// goodAppend returns the slice [0, 1, ..., n-1]. The backing array is
// allocated once with capacity n, so none of the appends has to grow it.
func goodAppend(n int) []int {
	out := make([]int, 0, n) // Single allocation
	for len(out) < n {
		// The next value to store is always the current length.
		out = append(out, len(out))
	}
	return out
}
|
||||
|
||||
// ✅ GOOD: Known length
|
||||
// knownLength returns the slice [0, 1, ..., n-1]. Because the final length is
// known up front, the slice is created at full length and filled by index
// instead of being appended to.
func knownLength(n int) []int {
	seq := make([]int, n)
	for idx := range seq {
		seq[idx] = idx
	}
	return seq
}
|
||||
|
||||
// ❌ BAD: String concatenation
|
||||
func badConcat(strs []string) string {
|
||||
result := ""
|
||||
for _, s := range strs {
|
||||
result += s // New allocation each time
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ✅ GOOD: strings.Builder
|
||||
func goodConcat(strs []string) string {
|
||||
var sb strings.Builder
|
||||
sb.Grow(estimateSize(strs))
|
||||
for _, s := range strs {
|
||||
sb.WriteString(s)
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
```
|
||||
|
||||
### sync.Pool
|
||||
|
||||
```go
|
||||
// bufferPool recycles *bytes.Buffer scratch space between processData calls.
var bufferPool = sync.Pool{
	New: func() interface{} {
		return new(bytes.Buffer)
	},
}

// processData stages data in a pooled buffer, processes it, and returns the
// result.
//
// The returned slice is a private copy. buf.Bytes() aliases the buffer's
// internal storage, and that buffer goes back to the pool when this function
// returns; handing the alias to the caller would let the next processData
// call overwrite the caller's data.
func processData(data []byte) []byte {
	// Get buffer from pool
	buf := bufferPool.Get().(*bytes.Buffer)
	buf.Reset()
	defer bufferPool.Put(buf)

	// Use buffer
	buf.Write(data)
	// Process...

	// Copy out before the deferred Put releases buf for reuse.
	return append([]byte(nil), buf.Bytes()...)
}
|
||||
|
||||
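// String builder pool (caution: strings.Builder.Reset discards its buffer, so pooling reuses only the small struct, not the capacity)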
|
||||
var sbPool = sync.Pool{
|
||||
New: func() interface{} {
|
||||
return &strings.Builder{}
|
||||
},
|
||||
}
|
||||
|
||||
// buildString concatenates parts into a single string.
//
// A local strings.Builder is used rather than one taken from sbPool.
// Builder.Reset discards the underlying buffer (String hands that memory to
// the returned string), so a pooled Builder never carries capacity over from
// an earlier call. Sizing the buffer once with Grow is what removes the
// repeated allocations.
func buildString(parts []string) string {
	total := 0
	for _, part := range parts {
		total += len(part)
	}

	var sb strings.Builder
	sb.Grow(total) // one allocation for the whole result
	for _, part := range parts {
		sb.WriteString(part)
	}
	return sb.String()
}
|
||||
```
|
||||
|
||||
### Zero-Copy Techniques
|
||||
|
||||
```go
|
||||
// Use byte slices instead of strings
|
||||
// parseHeader splits header at its first ':' and returns the bytes before and
// after it. Both results are sub-slices of header, so nothing is copied. If
// header contains no ':', both results are nil.
func parseHeader(header []byte) (key, value []byte) {
	name, rest, found := bytes.Cut(header, []byte{':'})
	if !found {
		return nil, nil
	}
	return name, rest
}
|
||||
|
||||
// Reuse buffers
|
||||
// Parser keeps a scratch buffer that is reused across Parse calls.
type Parser struct {
	buf []byte
}

// Parse copies data into the parser's scratch buffer, replacing any previous
// contents. It always returns nil.
func (p *Parser) Parse(data []byte) error {
	// Truncating to length zero keeps the existing capacity, so append only
	// allocates when data is larger than anything seen before.
	p.buf = append(p.buf[:0], data...)
	// Process p.buf...
	return nil
}
|
||||
|
||||
// Direct writing
|
||||
func writeResponse(w io.Writer, data interface{}) error {
|
||||
enc := json.NewEncoder(w) // Write directly to w
|
||||
return enc.Encode(data)
|
||||
}
|
||||
```
|
||||
|
||||
## Garbage Collection Tuning
|
||||
|
||||
### GC Control
|
||||
|
||||
```go
|
||||
import "runtime/debug"
|
||||
|
||||
// Adjust GC target percentage
|
||||
debug.SetGCPercent(100) // Default
|
||||
// Higher = less frequent GC, more memory
|
||||
// Lower = more frequent GC, less memory
|
||||
|
||||
// Force GC (use sparingly!)
|
||||
runtime.GC()
|
||||
|
||||
// Monitor GC stats
|
||||
var stats runtime.MemStats
|
||||
runtime.ReadMemStats(&stats)
|
||||
fmt.Printf("Alloc = %v MB\n", stats.Alloc/1024/1024)
|
||||
fmt.Printf("TotalAlloc = %v MB\n", stats.TotalAlloc/1024/1024)
|
||||
fmt.Printf("Sys = %v MB\n", stats.Sys/1024/1024)
|
||||
fmt.Printf("NumGC = %v\n", stats.NumGC)
|
||||
```
|
||||
|
||||
### GOGC Environment Variable
|
||||
|
||||
```bash
|
||||
# Default (100%)
|
||||
GOGC=100 ./myapp
|
||||
|
||||
# More aggressive GC (uses less memory)
|
||||
GOGC=50 ./myapp
|
||||
|
||||
# Less frequent GC (uses more memory)
|
||||
GOGC=200 ./myapp
|
||||
|
||||
# Disable GC (for debugging)
|
||||
GOGC=off ./myapp
|
||||
```
|
||||
|
||||
## Concurrency Optimization
|
||||
|
||||
### Reduce Lock Contention
|
||||
|
||||
```go
|
||||
// ❌ BAD: Single lock
|
||||
type BadCache struct {
|
||||
mu sync.Mutex
|
||||
items map[string]interface{}
|
||||
}
|
||||
|
||||
// ✅ GOOD: RWMutex (when reads vastly outnumber writes; otherwise a plain Mutex is often faster)
|
||||
// GoodCache is a map guarded by a read/write mutex.
type GoodCache struct {
	mu    sync.RWMutex
	items map[string]interface{}
}

// Get returns the value stored under key, or nil if the key is absent. It
// holds only the read lock while looking up the value.
func (c *GoodCache) Get(key string) interface{} {
	c.mu.RLock()
	item := c.items[key]
	c.mu.RUnlock()
	return item
}
|
||||
|
||||
// ✅ BETTER: Sharded locks
|
||||
type ShardedCache struct {
|
||||
shards [256]*shard
|
||||
}
|
||||
|
||||
type shard struct {
|
||||
mu sync.RWMutex
|
||||
items map[string]interface{}
|
||||
}
|
||||
|
||||
func (c *ShardedCache) Get(key string) interface{} {
|
||||
shard := c.getShard(key)
|
||||
shard.mu.RLock()
|
||||
defer shard.mu.RUnlock()
|
||||
return shard.items[key]
|
||||
}
|
||||
```
|
||||
|
||||
### Channel Buffering
|
||||
|
||||
```go
|
||||
// ⚠️ Unbuffered channel: every send blocks until a receiver is ready (right for synchronization, costly for bursty producers)
|
||||
ch := make(chan int)
|
||||
|
||||
// ✅ Buffered channel: absorbs bursts, so the producer blocks only when the buffer is full
|
||||
ch := make(chan int, 100)
|
||||
|
||||
// Optimal buffer size depends on:
|
||||
// - Producer/consumer rates
|
||||
// - Memory constraints
|
||||
// - Latency requirements
|
||||
```
|
||||
|
||||
### Atomic Operations
|
||||
|
||||
```go
|
||||
import "sync/atomic"
|
||||
|
||||
// Counter is a lock-free counter that is safe for concurrent use.
//
// The value is stored in an atomic.Int64 rather than a bare int64. The typed
// atomic is always 64-bit aligned, including on 32-bit platforms when Counter
// is embedded in a larger struct, and it cannot be read or written
// non-atomically by accident.
type Counter struct {
	value atomic.Int64
}

// Increment atomically adds 1 to the counter.
func (c *Counter) Increment() {
	c.value.Add(1)
}

// Value atomically loads and returns the current count.
func (c *Counter) Value() int64 {
	return c.value.Load()
}
|
||||
|
||||
// ✅ Faster than mutex for simple operations
|
||||
// ❌ Limited to basic types and operations
|
||||
```
|
||||
|
||||
## Algorithmic Optimization
|
||||
|
||||
### Map Pre-sizing
|
||||
|
||||
```go
|
||||
// ❌ BAD: Growing map
|
||||
func badMap(items []Item) map[string]Item {
|
||||
m := make(map[string]Item)
|
||||
for _, item := range items {
|
||||
m[item.ID] = item
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// ✅ GOOD: Pre-sized map
|
||||
func goodMap(items []Item) map[string]Item {
|
||||
m := make(map[string]Item, len(items))
|
||||
for _, item := range items {
|
||||
m[item.ID] = item
|
||||
}
|
||||
return m
|
||||
}
|
||||
```
|
||||
|
||||
### Avoid Unnecessary Work
|
||||
|
||||
```go
|
||||
// ❌ BAD: Deeply nested conditionals (same work, but the hot path is harder to follow)
|
||||
func process(items []Item) {
|
||||
for _, item := range items {
|
||||
if isValid(item) {
|
||||
result := expensiveComputation(item)
|
||||
if result > threshold {
|
||||
handleResult(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ✅ GOOD: Early returns
|
||||
func process(items []Item) {
|
||||
for _, item := range items {
|
||||
if !isValid(item) {
|
||||
continue // Skip early
|
||||
}
|
||||
result := expensiveComputation(item)
|
||||
if result <= threshold {
|
||||
continue // Skip early
|
||||
}
|
||||
handleResult(result)
|
||||
}
|
||||
}
|
||||
|
||||
// ✅ BETTER: Fast path
|
||||
func process(items []Item) {
|
||||
for _, item := range items {
|
||||
// Fast path for common case
|
||||
if item.IsSimple() {
|
||||
handleSimple(item)
|
||||
continue
|
||||
}
|
||||
// Slow path for complex case
|
||||
handleComplex(item)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Runtime Tuning
|
||||
|
||||
### GOMAXPROCS
|
||||
|
||||
```go
|
||||
import "runtime"
|
||||
|
||||
// Set the maximum number of OS threads that may execute Go code simultaneously
|
||||
runtime.GOMAXPROCS(runtime.NumCPU())
|
||||
|
||||
// For CPU-bound work: NumCPU, which is already the default, so this call is usually unnecessary
|
||||
// For I/O-bound work: raising it rarely helps, because goroutines blocked on I/O do not occupy a GOMAXPROCS slot
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
```bash
|
||||
# Max OS threads executing Go code simultaneously
|
||||
GOMAXPROCS=8 ./myapp
|
||||
|
||||
# GC aggressiveness
|
||||
GOGC=100 ./myapp
|
||||
|
||||
# Memory limit (Go 1.19+)
|
||||
GOMEMLIMIT=4GiB ./myapp
|
||||
|
||||
# Print a summary line to stderr for every GC cycle
|
||||
GODEBUG=gctrace=1 ./myapp
|
||||
```
|
||||
|
||||
## Performance Patterns
|
||||
|
||||
### Inline Functions
|
||||
|
||||
```go
|
||||
// Compiler inlines small functions automatically
|
||||
|
||||
// add returns a + b.
//
// No directive is needed to request inlining, and Go has no "go:inline"
// pragma: the compiler inlines small leaf functions like this one on its own.
// The only related directive is go:noinline, which prevents inlining. Check
// the compiler's decisions with: go build -gcflags=-m
func add(a, b int) int {
	return a + b
}
|
||||
|
||||
// Keep hot-path functions small for inlining
|
||||
```
|
||||
|
||||
### Avoid Interface Allocations
|
||||
|
||||
```go
|
||||
// ❌ BAD: Interface allocation
|
||||
func badPrint(value interface{}) {
|
||||
fmt.Println(value) // value escapes
|
||||
}
|
||||
|
||||
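// ✅ GOOD: Type-specific functions keep interface{} out of your own API (fmt.Println still boxes its argument; use strconv on true hot paths)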
|
||||
func printInt(value int) {
|
||||
fmt.Println(value)
|
||||
}
|
||||
|
||||
func printString(value string) {
|
||||
fmt.Println(value)
|
||||
}
|
||||
```
|
||||
|
||||
### Batch Operations
|
||||
|
||||
```go
|
||||
// ❌ BAD: Individual operations
|
||||
for _, item := range items {
|
||||
db.Insert(item) // N database calls
|
||||
}
|
||||
|
||||
// ✅ GOOD: Batch operations
|
||||
db.BatchInsert(items) // 1 database call
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Profile before optimizing** - Measure, don't guess
|
||||
2. **Focus on hot paths** - Optimize the 20% that matters
|
||||
3. **Reduce allocations** - Reuse objects, pre-allocate
|
||||
4. **Use appropriate data structures** - Map vs slice vs array
|
||||
5. **Minimize lock contention** - Use RWMutex, sharding
|
||||
6. **Benchmark changes** - Use benchstat for comparisons
|
||||
7. **Test with race detector** - `go test -race`
|
||||
8. **Monitor in production** - Use profiling endpoints
|
||||
9. **Balance readability and performance** - Don't over-optimize
|
||||
10. **Use PGO** - Profile-guided optimization (preview in Go 1.20, generally available since Go 1.21)
|
||||
|
||||
## Profile-Guided Optimization (PGO)
|
||||
|
||||
```bash
|
||||
# 1. Build the initial binary (no profile yet)
|
||||
go build -o myapp
|
||||
|
||||
# 2. Run under representative load and collect a CPU profile (assumes the app implements a -cpuprofile flag via runtime/pprof)
|
||||
./myapp -cpuprofile=default.pgo
|
||||
|
||||
# 3. Rebuild with PGO
|
||||
go build -pgo=default.pgo -o myapp-optimized
|
||||
|
||||
# Performance improvement: roughly 2-14% on representative programs (per the Go PGO user guide)
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
Additional resources in:
|
||||
- `assets/examples/` - Performance optimization examples
|
||||
- `assets/benchmarks/` - Benchmark templates
|
||||
- `references/` - Links to profiling guides and performance papers
|
||||
Reference in New Issue
Block a user