Initial commit
.claude-plugin/plugin.json (Normal file, 12 lines)
@@ -0,0 +1,12 @@
{
  "name": "ordis-quality-engineering",
  "description": "Comprehensive quality engineering for full-stack engineers: E2E, API, integration, performance, chaos, contracts, automation, observability, flaky tests, testing-in-production, load testing, test data, visual regression, mutation testing, static analysis (SAST), dependency scanning, fuzz testing, property-based testing, test maintenance, quality metrics - 21 production-ready skills covering traditional QA + modern quality engineering",
  "version": "1.1.0",
  "author": {
    "name": "tachyon-beep",
    "url": "https://github.com/tachyon-beep"
  },
  "skills": [
    "./skills"
  ]
}
README.md (Normal file, 3 lines)
@@ -0,0 +1,3 @@
# ordis-quality-engineering

Comprehensive quality engineering for full-stack engineers: E2E, API, integration, performance, chaos, contracts, automation, observability, flaky tests, testing-in-production, load testing, test data, visual regression, mutation testing, static analysis (SAST), dependency scanning, fuzz testing, property-based testing, test maintenance, quality metrics - 21 production-ready skills covering traditional QA + modern quality engineering
plugin.lock.json (Normal file, 129 lines)
@@ -0,0 +1,129 @@
{
  "$schema": "internal://schemas/plugin.lock.v1.json",
  "pluginId": "gh:tachyon-beep/skillpacks:plugins/ordis-quality-engineering",
  "normalized": {
    "repo": null,
    "ref": "refs/tags/v20251128.0",
    "commit": "122ce5c2755e3944460caaf8b4583428c9c87503",
    "treeHash": "d715062cb7abff988df8982950f9d774913ff29d953353682496f545660101bc",
    "generatedAt": "2025-11-28T10:28:33.007693Z",
    "toolVersion": "publish_plugins.py@0.2.0"
  },
  "origin": {
    "remote": "git@github.com:zhongweili/42plugin-data.git",
    "branch": "master",
    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
  },
  "manifest": {
    "name": "ordis-quality-engineering",
    "description": "Comprehensive quality engineering for full-stack engineers: E2E, API, integration, performance, chaos, contracts, automation, observability, flaky tests, testing-in-production, load testing, test data, visual regression, mutation testing, static analysis (SAST), dependency scanning, fuzz testing, property-based testing, test maintenance, quality metrics - 21 production-ready skills covering traditional QA + modern quality engineering",
    "version": "1.1.0"
  },
  "content": {
    "files": [
      {
        "path": "README.md",
        "sha256": "5837cbad1040a2d7a50b5a6a007288a1e14644c2d37bb99a767e82c143f00223"
      },
      {
        "path": ".claude-plugin/plugin.json",
        "sha256": "c66ed5503e8a5994583f1ed0ef030a40404d87e606c6e82b9a6b1c15fe33a4f7"
      },
      {
        "path": "skills/integration-testing-patterns/SKILL.md",
        "sha256": "2c9ccf9309d85e477be74545dc348d110614cbbdc9f3cc43ad0de22068278a46"
      },
      {
        "path": "skills/test-data-management/SKILL.md",
        "sha256": "94b4318b4ba28e79689721d2a1dba4513f164c1f37ca55fc6a2c030280b9290d"
      },
      {
        "path": "skills/load-testing-patterns/SKILL.md",
        "sha256": "2d4674b0a8a1df9176e48f239a99209593c399aa801aeceb2b292c2cc4758da3"
      },
      {
        "path": "skills/contract-testing/SKILL.md",
        "sha256": "be6f479529a72dc90a6a017851eb9833f21eaef0a5a45d6949f03feb57a961dd"
      },
      {
        "path": "skills/testing-in-production/SKILL.md",
        "sha256": "32377a605456bbbd65a593a4747ed0c08a15c5e6c9fca91a5807df30a40bd243"
      },
      {
        "path": "skills/quality-metrics-and-kpis/SKILL.md",
        "sha256": "1ff87eafb6b9c55de68f5d5906fcde9a504f265108ec5ba7fa758e1760c674bc"
      },
      {
        "path": "skills/property-based-testing/SKILL.md",
        "sha256": "7ba389b788e7542303ef8bc1ca78facd727c9950c8651f9480fc0995e87e5c20"
      },
      {
        "path": "skills/performance-testing-fundamentals/SKILL.md",
        "sha256": "0d4a2ceec3849d4b62297c38faa6fe82269dd70979f6a9b15e8a9f60671b8ae4"
      },
      {
        "path": "skills/using-quality-engineering/SKILL.md",
        "sha256": "4c161880229fc6bab75995347cd9e0c56c58112aa46b078d15dc19fa7ae58723"
      },
      {
        "path": "skills/flaky-test-prevention/SKILL.md",
        "sha256": "f27b4f48a1286274d9bea42420f71c1ea75424f88c01923aea8fc1ccbe171296"
      },
      {
        "path": "skills/chaos-engineering-principles/SKILL.md",
        "sha256": "8fb6a7b65076df493edeac34c94c2ea96bd81beb4c13fcc476d402011708847b"
      },
      {
        "path": "skills/test-automation-architecture/SKILL.md",
        "sha256": "8cab74d4ac90519870f1fa59db0ecd916590cc3b211dbf44c4df617eebfbf1c6"
      },
      {
        "path": "skills/observability-and-monitoring/SKILL.md",
        "sha256": "5e846e906095edfc7bf1c9f22d82b45b78d0316aee722d8330e08befe598674a"
      },
      {
        "path": "skills/test-isolation-fundamentals/SKILL.md",
        "sha256": "07088c9703b8b4c7c5561858dd23de114a0aa6a901122340fdee99605e397fc1"
      },
      {
        "path": "skills/static-analysis-integration/SKILL.md",
        "sha256": "d4dd718ccdc395c3320aabe4c1fc6f9da776987a52fad0968383038f2d402f38"
      },
      {
        "path": "skills/visual-regression-testing/SKILL.md",
        "sha256": "7b98694df2d8b60ddb0f0e2cca40de67a694a2af10050b86edae22d14c0c46ae"
      },
      {
        "path": "skills/fuzz-testing/SKILL.md",
        "sha256": "9a033fadb257f8a34fa5be6527b63946ff5c5028818e476c31a8d1612bcf967c"
      },
      {
        "path": "skills/api-testing-strategies/SKILL.md",
        "sha256": "b900c427e32057915e291d2f09d1384875850e5248d0953f19760b709cc4b050"
      },
      {
        "path": "skills/mutation-testing/SKILL.md",
        "sha256": "31ada6e2ec2a4048b65cb12f81a02bd26be101ba2f9fcaab8841bb08ce948ad3"
      },
      {
        "path": "skills/test-maintenance-patterns/SKILL.md",
        "sha256": "036f0afa9cc22c27c8f22ed645b40c986647e368a55ef88fe7fa49082dfec124"
      },
      {
        "path": "skills/e2e-testing-strategies/SKILL.md",
        "sha256": "d9e76b1f4b0130f2dec98d1a62f25ab6ebd4bb1f7465c05316ca41a098692bd3"
      },
      {
        "path": "skills/dependency-scanning/SKILL.md",
        "sha256": "b722da2839c78c65107be5539ba602a4c8569ce14e01c700f6cf2ebf3ac89982"
      }
    ],
    "dirSha256": "d715062cb7abff988df8982950f9d774913ff29d953353682496f545660101bc"
  },
  "security": {
    "scannedAt": null,
    "scannerVersion": null,
    "flags": []
  }
}
skills/api-testing-strategies/SKILL.md (Normal file, 471 lines)
@@ -0,0 +1,471 @@
---
name: api-testing-strategies
description: Use when testing REST/GraphQL APIs, designing API test suites, validating request/response contracts, testing authentication/authorization, handling API versioning, or choosing API testing tools - provides test pyramid placement, schema validation, and anti-patterns distinct from E2E browser testing
---

# API Testing Strategies

## Overview

**Core principle:** API tests sit between unit tests and E2E tests - faster than browser tests, more realistic than mocks.

**Rule:** Test APIs directly via HTTP/GraphQL, not through the UI. Browser tests are 10x slower and more flaky.

## API Testing vs E2E Testing

| Aspect | API Testing | E2E Browser Testing |
|--------|-------------|---------------------|
| **Speed** | Fast (10-100ms per test) | Slow (1-10s per test) |
| **Flakiness** | Low (no browser/JS) | High (timing, rendering) |
| **Coverage** | Business logic, data | Full user workflow |
| **Tools** | REST Client, Postman, pytest | Playwright, Cypress |
| **When to use** | Most backend testing | Critical user flows only |

**Test Pyramid placement:**
- **Unit tests (70%):** Individual functions/classes
- **API tests (20%):** Endpoints, business logic, integrations
- **E2E tests (10%):** Critical user workflows through browser

---

## Tool Selection Decision Tree

| Your Stack | Team Skills | Use | Why |
|-----------|-------------|-----|-----|
| **Python backend** | pytest familiar | **pytest + requests** | Best integration, fixtures |
| **Node.js/JavaScript** | Jest/Mocha | **supertest** | Express/Fastify native |
| **Any language, REST** | Prefer GUI | **Postman + Newman** | GUI for design, CLI for CI |
| **GraphQL** | Any | **pytest + gql** (Python) or **apollo-client** (JS) | Query validation |
| **Contract testing** | Microservices | **Pact** | Consumer-driven contracts |

**First choice:** Use your existing test framework (pytest/Jest) + HTTP client. Don't add new tools unnecessarily.

---

## Test Structure Pattern

### Basic REST API Test

```python
import pytest
import requests

@pytest.fixture
def api_client():
    """Base API client with auth."""
    return requests.Session()

def test_create_order(api_client):
    # Arrange: Set up test data
    payload = {
        "user_id": 123,
        "items": [{"sku": "WIDGET", "quantity": 2}],
        "shipping_address": "123 Main St"
    }

    # Act: Make API call
    response = api_client.post(
        "https://api.example.com/orders",
        json=payload,
        headers={"Authorization": "Bearer test_token"}
    )

    # Assert: Validate response
    assert response.status_code == 201
    data = response.json()
    assert data["id"] is not None
    assert data["status"] == "pending"
    assert data["total"] > 0
```

---

### GraphQL API Test

```python
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

def test_user_query():
    transport = RequestsHTTPTransport(url="https://api.example.com/graphql")
    client = Client(transport=transport)

    query = gql('''
        query GetUser($id: ID!) {
            user(id: $id) {
                id
                name
                email
            }
        }
    ''')

    result = client.execute(query, variable_values={"id": "123"})

    assert result["user"]["id"] == "123"
    assert result["user"]["email"] is not None
```

---

## What to Test

### 1. Happy Path (Required)

**Test successful requests with valid data.**

```python
def test_get_user_success():
    response = api.get("/users/123")
    assert response.status_code == 200
    assert response.json()["name"] == "Alice"
```

---

### 2. Validation Errors (Required)

**Test API rejects invalid input.**

```python
def test_create_user_invalid_email():
    response = api.post("/users", json={"email": "invalid"})

    assert response.status_code == 400
    assert "email" in response.json()["errors"]
```

---

### 3. Authentication & Authorization (Required)

**Test auth failures.**

```python
def test_unauthorized_without_token():
    response = api.get("/orders", headers={})  # No auth token

    assert response.status_code == 401

def test_forbidden_different_user():
    response = api.get(
        "/orders/999",
        headers={"Authorization": "Bearer user_123_token"}
    )

    assert response.status_code == 403  # Can't access other user's orders
```

---

### 4. Edge Cases (Important)

```python
def test_pagination_last_page():
    response = api.get("/users?page=999")

    assert response.status_code == 200
    assert response.json()["results"] == []

def test_large_payload():
    items = [{"sku": f"ITEM_{i}", "quantity": 1} for i in range(1000)]
    response = api.post("/orders", json={"items": items})

    assert response.status_code in [201, 413]  # Created or payload too large
```

---

### 5. Idempotency (For POST/PUT/DELETE)

**Test same request twice produces same result.**

```python
def test_create_user_idempotent():
    payload = {"email": "alice@example.com", "name": "Alice"}

    # First request
    response1 = api.post("/users", json=payload)
    user_id_1 = response1.json()["id"]

    # Second identical request
    response2 = api.post("/users", json=payload)

    # Should return existing user, not create duplicate
    assert response2.status_code in [200, 409]  # OK or Conflict
    if response2.status_code == 200:
        assert response2.json()["id"] == user_id_1
```

---

## Schema Validation

**Use JSON Schema to validate response structure.**

```python
import jsonschema

USER_SCHEMA = {
    "type": "object",
    "properties": {
        "id": {"type": "integer"},
        "name": {"type": "string"},
        "email": {"type": "string", "format": "email"}
    },
    "required": ["id", "name", "email"]
}

def test_user_response_schema():
    response = api.get("/users/123")

    data = response.json()
    jsonschema.validate(instance=data, schema=USER_SCHEMA)  # Raises if invalid
```

**Why it matters:** Prevents regressions where fields are removed or types change.

---

## API Versioning Tests

**Test multiple API versions simultaneously.**

```python
@pytest.mark.parametrize("version,expected_fields", [
    ("v1", ["id", "name"]),
    ("v2", ["id", "name", "email", "created_at"]),
])
def test_user_endpoint_version(version, expected_fields):
    response = api.get(f"/{version}/users/123")

    data = response.json()
    for field in expected_fields:
        assert field in data
```

---

## Anti-Patterns Catalog

### ❌ Testing Through the UI

**Symptom:** Using browser automation to test API functionality

```python
# ❌ BAD: Testing API via browser
def test_create_order():
    page.goto("/orders/new")
    page.fill("#item", "Widget")
    page.click("#submit")
    assert page.locator(".success").is_visible()
```

**Why bad:**
- 10x slower than API test
- Flaky (browser timing issues)
- Couples API test to UI changes

**Fix:** Test API directly

```python
# ✅ GOOD: Direct API test
def test_create_order():
    response = api.post("/orders", json={"item": "Widget"})
    assert response.status_code == 201
```

---

### ❌ Testing Implementation Details

**Symptom:** Asserting on database queries, internal logic

```python
# ❌ BAD: Testing implementation
def test_get_user():
    with patch('database.execute') as mock_db:
        api.get("/users/123")
        assert mock_db.called_with("SELECT * FROM users WHERE id = 123")
```

**Why bad:** Couples test to implementation, not contract

**Fix:** Test only request/response contract

```python
# ✅ GOOD: Test contract only
def test_get_user():
    response = api.get("/users/123")
    assert response.status_code == 200
    assert response.json()["id"] == 123
```

---

### ❌ No Test Data Isolation

**Symptom:** Tests interfere with each other

```python
# ❌ BAD: Shared test data
def test_update_user():
    api.put("/users/123", json={"name": "Bob"})
    assert api.get("/users/123").json()["name"] == "Bob"

def test_get_user():
    # Fails if previous test ran!
    assert api.get("/users/123").json()["name"] == "Alice"
```

**Fix:** Each test creates/cleans its own data (see test-isolation-fundamentals skill)

---

### ❌ Hardcoded URLs and Tokens

**Symptom:** Production URLs or real credentials in tests

```python
# ❌ BAD: Hardcoded production URL
def test_api():
    response = requests.get("https://api.production.com/users")
```

**Fix:** Use environment variables or fixtures

```python
# ✅ GOOD: Configurable environment
import os

@pytest.fixture
def api_base_url():
    return os.getenv("API_URL", "http://localhost:8000")

def test_api(api_base_url):
    response = requests.get(f"{api_base_url}/users")
```

---

## Mocking External APIs

**When testing service A that calls service B:**

```python
import responses

@responses.activate
def test_payment_success():
    # Mock Stripe API
    responses.add(
        responses.POST,
        "https://api.stripe.com/v1/charges",
        json={"id": "ch_123", "status": "succeeded"},
        status=200
    )

    # Test your API
    response = api.post("/checkout", json={"amount": 1000})

    assert response.status_code == 200
    assert response.json()["payment_status"] == "succeeded"
```

**When to mock:**
- External service costs money (Stripe, Twilio)
- External service is slow
- External service is unreliable
- Testing error handling (simulate failures)

**When NOT to mock:**
- Integration tests (use separate test suite with real services)
- Contract tests (use Pact to verify integration)

---

## Performance Testing APIs

**Use load testing for APIs separately from E2E:**

```python
# locust load test
from locust import HttpUser, task, between

class APIUser(HttpUser):
    wait_time = between(1, 3)

    @task
    def get_users(self):
        self.client.get("/users")

    @task(3)  # 3x more frequent
    def get_user(self):
        self.client.get("/users/123")
```

**Run with:**
```bash
locust -f locustfile.py --headless -u 100 -r 10 --run-time 60s
```

See load-testing-patterns skill for comprehensive guidance.

---

## CI/CD Integration

**API tests should run on every commit:**

```yaml
# .github/workflows/api-tests.yml
name: API Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Run API tests
        # Assumes the API under test is reachable at API_URL
        # (e.g., started in an earlier step or as a service container).
        run: |
          pytest tests/api/ -v
        env:
          API_URL: http://localhost:8000
          API_TOKEN: ${{ secrets.TEST_API_TOKEN }}
```

**Test stages:**
- Commit: Smoke tests (5-10 critical endpoints, <1 min)
- PR: Full API suite (all endpoints, <5 min)
- Merge: API + integration tests (<15 min)
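One lightweight way to implement this split is with pytest markers. This is a sketch, not part of the skill above: the `smoke` marker name and the `/health` endpoint are illustrative, and the marker should be registered in `pytest.ini` to avoid warnings.

```python
# Tag a handful of critical-endpoint tests as "smoke" so CI can run
# `pytest -m smoke` at the commit stage and the full suite on PRs.
# Register the marker in pytest.ini:
#   markers = smoke: fast critical-path API tests
import pytest

@pytest.mark.smoke
def test_health_endpoint(api_client, api_base_url):
    # Hypothetical critical endpoint; reuses the fixtures defined earlier.
    response = api_client.get(f"{api_base_url}/health")
    assert response.status_code == 200
```

Run `pytest -m smoke` on every commit and the unfiltered `pytest tests/api/` suite at the PR stage.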
---

## Quick Reference: API Test Checklist

For each endpoint, test:

- [ ] **Happy path** (valid request → 200/201)
- [ ] **Validation** (invalid input → 400)
- [ ] **Authentication** (no token → 401)
- [ ] **Authorization** (wrong user → 403)
- [ ] **Not found** (missing resource → 404)
- [ ] **Idempotency** (duplicate request → same result)
- [ ] **Schema** (response matches expected structure)
- [ ] **Edge cases** (empty lists, large payloads, pagination)
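The not-found case is the only checklist item without an example above; a minimal sketch in the same style as the other snippets (the ID is assumed not to exist):

```python
def test_get_user_not_found():
    response = api.get("/users/999999")  # hypothetical non-existent ID

    assert response.status_code == 404
```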
---

## Bottom Line

**API tests are faster, more reliable, and provide better coverage than E2E browser tests for backend logic.**

- Test APIs directly, not through the browser
- Use your existing test framework (pytest/Jest) + HTTP client
- Validate schemas to catch breaking changes
- Mock external services to avoid flakiness and cost
- Run API tests on every commit (they're fast enough)

**If you're using browser automation to test API functionality, you're doing it wrong. Test APIs directly.**
skills/chaos-engineering-principles/SKILL.md (Normal file, 242 lines)
@@ -0,0 +1,242 @@
---
name: chaos-engineering-principles
description: Use when starting chaos engineering, designing fault injection experiments, choosing chaos tools, testing system resilience, or recovering from chaos incidents - provides hypothesis-driven testing, blast radius control, and anti-patterns for safe chaos
---

# Chaos Engineering Principles

## Overview

**Core principle:** Chaos engineering validates resilience through controlled experiments, not random destruction.

**Rule:** Start in staging, with monitoring, with rollback, with small blast radius. No exceptions.

## When NOT to Do Chaos

Don't run chaos experiments if ANY of these are missing:
- ❌ No comprehensive monitoring (APM, metrics, logs, alerts)
- ❌ No automated rollback capability
- ❌ No baseline metrics documented
- ❌ No incident response team available
- ❌ System already unstable (fix stability first)
- ❌ No staging environment to practice

**Fix these prerequisites BEFORE chaos testing.**

## Tool Selection Decision Tree

| Your Constraint | Choose | Why |
|----------------|--------|-----|
| Kubernetes-native, CNCF preference | **LitmusChaos** | Cloud-native, operator-based, excellent K8s integration |
| Kubernetes-focused, visualization needs | **Chaos Mesh** | Fine-grained control, dashboards, low overhead |
| Want managed service, quick start | **Gremlin** | Commercial, guided experiments, built-in best practices |
| Vendor-neutral, maximum flexibility | **Chaos Toolkit** | Open source, plugin ecosystem, any infrastructure |
| AWS-specific, cost-sensitive | **AWS FIS** | Native AWS integration, pay-per-experiment |

**For most teams:** Chaos Toolkit (flexible, free) or Gremlin (fast, managed)

## Prerequisites Checklist

Before FIRST experiment:

**Monitoring (Required):**
- [ ] Real-time dashboards for key metrics (latency, error rate, throughput)
- [ ] Distributed tracing for request flows
- [ ] Log aggregation with timeline correlation
- [ ] Alerts configured with thresholds

**Rollback (Required):**
- [ ] Automated rollback based on metrics (e.g., error rate > 5% → abort)
- [ ] Manual kill switch everyone can activate
- [ ] Rollback tested and documented (< 30 sec recovery)

**Baseline (Required):**
- [ ] Documented normal metrics (P50/P95/P99 latency, error rate %)
- [ ] Known dependencies and critical paths
- [ ] System architecture diagram

**Team (Required):**
- [ ] Designated observer monitoring experiment
- [ ] On-call engineer available
- [ ] Communication channel established (war room, Slack)
- [ ] Post-experiment debrief scheduled

## Anti-Patterns Catalog

### ❌ Production First Chaos

**Symptom:** "Let's start chaos testing in production to see what breaks"

**Why bad:** No practice, no muscle memory, production incidents guaranteed

**Fix:** Run 5-10 experiments in staging FIRST. Graduate to production only after proving: experiments work as designed, rollback functions, team can execute response

---

### ❌ Chaos Without Monitoring

**Symptom:** "We injected latency but we're not sure what happened"

**Why bad:** Blind chaos = no learning. You can't validate resilience without seeing system behavior

**Fix:** Set up comprehensive monitoring BEFORE first experiment. Must be able to answer "What changed?" within 30 seconds

---

### ❌ Unlimited Blast Radius

**Symptom:** Affecting 100% of traffic/all services on first run

**Why bad:** Cascading failures, actual outages, customer impact

**Fix:** Start at 0.1-1% traffic. Progression: 0.1% → 1% → 5% → 10% → (stop or 50%). Each step validates before expanding

---

### ❌ Chaos Without Rollback

**Symptom:** "The experiment broke everything and we can't stop it"

**Why bad:** Chaos becomes real incident, 2+ hour recovery, lost trust

**Fix:** Automated abort criteria (error rate threshold, latency threshold, manual kill switch). Test rollback before injecting failures

---

### ❌ Random Chaos (No Hypothesis)

**Symptom:** "Let's inject some failures and see what happens"

**Why bad:** No learning objective, can't validate resilience, wasted time

**Fix:** Every experiment needs a hypothesis: "System will [expected behavior] when [failure injected]"

## Failure Types Catalog

Priority order for microservices:

| Failure Type | Priority | Why Test This | Example |
|--------------|----------|---------------|---------|
| **Network Latency** | HIGH | Most common production issue | 500ms delay service A → B |
| **Service Timeout** | HIGH | Tests circuit breakers, retry logic | Service B unresponsive |
| **Connection Loss** | HIGH | Tests failover, graceful degradation | TCP connection drops |
| **Resource Exhaustion** | MEDIUM | Tests resource limits, scaling | Memory limit, connection pool full |
| **Packet Loss** | MEDIUM | Tests retry strategies | 1-10% packet loss |
| **DNS Failure** | MEDIUM | Tests service discovery resilience | DNS resolution delays |
| **Cache Failure** | MEDIUM | Tests fallback behavior | Redis down |
| **Database Errors** | LOW (start) | High risk - test after basics work | Connection refused, query timeout |

**Start with network latency** - safest, most informative, easiest rollback.

## Experiment Template

Use this for every chaos experiment:

**1. Hypothesis**
"If [failure injected], system will [expected behavior], and [metric] will remain [threshold]"

Example: "If service-payment experiences 2s latency, circuit breaker will open within 10s, and P99 latency will stay < 500ms"

**2. Baseline Metrics**
- Current P50/P95/P99 latency:
- Current error rate:
- Current throughput:

**3. Experiment Config**
- Failure type: [latency / packet loss / service down / etc.]
- Target: [specific service / % of traffic]
- Blast radius: [0.1% traffic, single region, canary pods]
- Duration: [2-5 minutes initial]
- Abort criteria: [error rate > 5% OR P99 > 1s OR manual stop]

**4. Execution**
- Observer: [name] monitoring dashboards
- Runner: [name] executing experiment
- Kill switch: [procedure]
- Start time: [timestamp]

**5. Observation**
- What happened vs hypothesis:
- Actual metrics during chaos:
- System behavior notes:

**6. Validation**
- ✓ Hypothesis validated / ✗ Hypothesis failed
- Unexpected findings:
- Action items:

## Blast Radius Progression

Safe scaling path:

| Step | Traffic Affected | Duration | Abort If |
|------|------------------|----------|----------|
| **1. Staging** | 100% staging | 5 min | Any production impact |
| **2. Canary** | 0.1% production | 2 min | Error rate > 1% |
| **3. Small** | 1% production | 5 min | Error rate > 2% |
| **4. Medium** | 5% production | 10 min | Error rate > 5% |
| **5. Large** | 10% production | 15 min | Error rate > 5% |

**Never skip steps.** Each step validates before expanding.

**Stop at 10-20% for most experiments** - no need to chaos 100% of production traffic.

**Low-traffic services (< 1000 req/day):** Use absolute request counts instead of percentages. Minimum 5-10 affected requests per step. Example: a 100 req/day service should still start with 5-10 affected requests (roughly 1-2 hours of traffic), not 0.1% (one request every 10 days).

## Your First Experiment (Staging)

**Goal:** Build confidence, validate monitoring, test rollback

**Experiment:** Network latency on a non-critical service (example plan, with Chaos Toolkit or similar; see the sketch below):

1. Pick the least critical service (e.g., recommendation engine, not payment)
2. Inject 500ms latency into 100% of staging traffic
3. Duration: 5 minutes
4. Expected: Timeouts handled gracefully, fallback behavior activates
5. Monitor: Error rate, latency, downstream services
6. Abort if: Error rate > 10% or cascading failures
7. Debrief: What did we learn? Did monitoring catch it? Did rollback work?
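To make this concrete, here is a sketch of what that first experiment could look like as a Chaos Toolkit definition (YAML shown; plain JSON works too). The probe URL and the two injection scripts are hypothetical placeholders, not something the skill prescribes - real latency injection typically goes through a chaos extension, your service mesh, or a tool like Toxiproxy.

```yaml
# Sketch of a Chaos Toolkit experiment for the first staging run.
# The probe URL and the scripts below are placeholders; swap in the
# extension or mesh/toxiproxy mechanism you actually use.
title: "500ms latency on recommendation-service (staging)"
description: >
  Hypothesis: with 500ms added latency, callers time out gracefully
  and the error rate stays below 10%.
steady-state-hypothesis:
  title: "Recommendation endpoint healthy before and after"
  probes:
    - type: probe
      name: recommendations-endpoint-responds
      tolerance: 200            # expect HTTP 200
      provider:
        type: http
        url: https://staging.example.com/recommendations/health
        timeout: 3
method:
  - type: action
    name: inject-500ms-latency
    provider:
      type: process
      path: ./scripts/inject_latency.sh     # placeholder injection script
      arguments: "recommendation-service 500ms 300s"
rollbacks:
  - type: action
    name: remove-latency
    provider:
      type: process
      path: ./scripts/remove_latency.sh     # always undo, even on abort
      arguments: "recommendation-service"
```

Run it with `chaos run experiment.yaml` while the designated observer watches the dashboards.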
**Success criteria:** You can answer "Did our hypothesis hold?" within 5 minutes of experiment completion.

## Common Mistakes

### ❌ Testing During Incidents

**Fix:** Only chaos test during stable periods, business hours, with extra staffing

---

### ❌ Network Latency Underestimation

**Fix:** Latency cascades - 500ms can become 5s downstream. Start with 100-200ms, observe, then increase

---

### ❌ No Post-Experiment Review

**Fix:** Every experiment gets a 15-min debrief: What worked? What broke? What did we learn?

## Quick Reference

**Prerequisites Before First Chaos:**
1. Monitoring + alerts
2. Automated rollback
3. Baseline metrics documented
4. Team coordinated

**Experiment Steps:**
1. Write hypothesis
2. Document baseline
3. Define blast radius (start 0.1%)
4. Set abort criteria
5. Execute with observer
6. Validate hypothesis
7. Debrief team

**Blast Radius Progression:**
Staging → 0.1% → 1% → 5% → 10% (stop for most experiments)

**First Experiment:**
Network latency (500ms) on a non-critical service in staging for 5 minutes

## Bottom Line

**Chaos engineering is hypothesis-driven science, not random destruction.**

Start small (staging, 0.1% traffic), with monitoring, with rollback. Graduate slowly.
skills/contract-testing/SKILL.md (Normal file, 524 lines)
@@ -0,0 +1,524 @@
---
name: contract-testing
description: Use when implementing Pact contracts, choosing consumer-driven vs provider-driven approaches, handling breaking API changes, setting up contract brokers, or preventing service integration issues - provides tool selection, anti-patterns, and workflow patterns
---

# Contract Testing

## Overview

**Core principle:** Test the contract, not the implementation. Verify integration points independently.

**Rule:** Contract tests catch breaking changes before deployment, not in production.

## Tool Selection Decision Tree

| Your Stack | Team Structure | Use | Why |
|-----------|----------------|-----|-----|
| Polyglot microservices | Multiple teams | **Pact** | Language-agnostic, mature broker |
| Java Spring ecosystem | Coordinated teams | **Spring Cloud Contract** | Spring integration, code-first |
| GraphQL APIs | Known consumers | **Pact + GraphQL** | Query validation |
| OpenAPI/REST | Public/many consumers | **OpenAPI Spec Testing** | Schema-first, documentation |

**First choice:** Pact (most mature ecosystem, widest language support)

**Why contract testing:** Catches API breaking changes in CI, not production. Teams test independently without running dependencies.

## Contract Type Decision Framework

| Scenario | Approach | Tools |
|----------|----------|-------|
| **Internal microservices, known consumers** | Consumer-Driven (CDC) | Pact, Spring Cloud Contract |
| **Public API, many unknown consumers** | Provider-Driven (Schema-First) | OpenAPI validation, Spectral |
| **Both internal and external consumers** | Bi-Directional | Pact + OpenAPI |
| **Event-driven/async messaging** | Message Pact | Pact (message provider/consumer) |

**Default:** Consumer-driven for internal services, schema-first for public APIs

## Anti-Patterns Catalog

### ❌ Over-Specification

**Symptom:** Contract tests verify exact response format, including fields the consumer doesn't use

**Why bad:** Brittle tests, provider can't evolve API, false positives

**Fix:** Only specify what the consumer actually uses

```javascript
// ❌ Bad - over-specified
.willRespondWith({
  status: 200,
  body: {
    id: 123,
    name: 'John',
    email: 'john@example.com',
    created_at: '2023-01-01',
    updated_at: '2023-01-02',
    phone: '555-1234',
    address: {...} // Consumer doesn't use these
  }
})

// ✅ Good - specify only what's used
.willRespondWith({
  status: 200,
  body: {
    id: Matchers.integer(123),
    name: Matchers.string('John')
  }
})
```

---

### ❌ Testing Implementation Details

**Symptom:** Contract tests verify database queries, internal logic, or response timing

**Why bad:** Couples tests to implementation, not contract

**Fix:** Test only the request/response contract, not how the provider implements it

```javascript
// ❌ Bad - testing implementation
expect(provider.database.queryCalled).toBe(true)

// ✅ Good - testing contract only
expect(response.status).toBe(200)
expect(response.body.name).toBe('John')
```

---

### ❌ Brittle Provider States

**Symptom:** Provider states hardcode IDs, dates, or specific data that changes

**Why bad:** Tests fail randomly, high maintenance

**Fix:** Use matchers, generate data in state setup

```javascript
// ❌ Bad - hardcoded state
.given('user 123 exists')
.uponReceiving('request for user 123')
.withRequest({ path: '/users/123' })

// ✅ Good - flexible state
.given('a user exists')
.uponReceiving('request for user')
.withRequest({ path: Matchers.regex('/users/\\d+', '/users/123') })
.willRespondWith({
  body: {
    id: Matchers.integer(123),
    name: Matchers.string('John')
  }
})
```

---

### ❌ No Contract Versioning

**Symptom:** Breaking changes deployed without consumer coordination

**Why bad:** Runtime failures, production incidents

**Fix:** Use can-i-deploy, tag contracts by environment

```bash
# ✅ Good - check before deploying
pact-broker can-i-deploy \
  --pacticipant UserService \
  --version 2.0.0 \
  --to production
```

---

### ❌ Missing Can-I-Deploy

**Symptom:** Deploying without checking whether all consumers are compatible

**Why bad:** Deploy provider changes that break consumers

**Fix:** Run can-i-deploy in CI before deployment

## Pact Broker Workflow

**Core workflow:**

1. **Consumer:** Write contract test → Generate pact file
2. **Consumer CI:** Publish pact to broker with version tag
3. **Provider CI:** Fetch contracts → Verify → Publish results
4. **Provider CD:** Run can-i-deploy → Deploy if compatible

### Publishing Contracts

```bash
# Consumer publishes pact with version and branch
pact-broker publish pacts/ \
  --consumer-app-version ${GIT_SHA} \
  --branch ${GIT_BRANCH} \
  --tag ${ENV}
```

### Verifying Contracts

```javascript
// Provider verifies against broker
const { Verifier } = require('@pact-foundation/pact')

new Verifier({
  providerBaseUrl: 'http://localhost:8080',
  pactBrokerUrl: process.env.PACT_BROKER_URL,
  provider: 'UserService',
  publishVerificationResult: true,
  providerVersion: process.env.GIT_SHA,
  consumerVersionSelectors: [
    { mainBranch: true },        // Latest from main
    { deployed: 'production' },  // Currently in production
    { deployed: 'staging' }      // Currently in staging
  ]
}).verifyProvider()
```

### Can-I-Deploy Check

```yaml
# CI/CD pipeline (GitHub Actions example)
- name: Check if can deploy
  run: |
    pact-broker can-i-deploy \
      --pacticipant UserService \
      --version ${{ github.sha }} \
      --to-environment production
```

**Rule:** Never deploy without can-i-deploy passing

## Breaking Change Taxonomy

| Change Type | Breaking? | Migration Strategy |
|-------------|-----------|-------------------|
| Add optional field | No | Deploy provider first |
| Add required field | Yes | Use expand/contract pattern |
| Remove field | Yes | Deprecate → verify no consumers use → remove |
| Change field type | Yes | Add new field → migrate consumers → remove old |
| Rename field | Yes | Add new → deprecate old → remove old |
| Change status code | Yes | Version API or expand responses |

### Expand/Contract Pattern

**For adding a required field:**

**Expand (Week 1-2):**
```javascript
// Provider adds NEW field (optional), keeps OLD field
{
  user_name: "John",  // Old field (deprecated)
  name: "John"        // New field
}
```

**Migrate (Week 3-4):**
- Consumers update to use new field
- Update contracts
- Verify all consumers migrated

**Contract (Week 5):**
```javascript
// Provider removes old field
{
  name: "John"  // Only new field remains
}
```

## Provider State Patterns

**Purpose:** Set up test data before verification

**Pattern:** Use state handlers to create/clean up data

```javascript
// Provider state setup
const { Verifier } = require('@pact-foundation/pact')

new Verifier({
  stateHandlers: {
    'a user exists': async () => {
      // Setup: Create test user
      await db.users.create({
        id: 123,
        name: 'John Doe'
      })
    },
    'no users exist': async () => {
      // Setup: Clear users
      await db.users.deleteAll()
    }
  },
  afterEach: async () => {
    // Cleanup after each verification
    await db.users.deleteAll()
  }
}).verifyProvider()
```

**Best practices:**
- States should be independent
- Clean up after each verification
- Use transactions for database tests
- Don't hardcode IDs (use matchers)

## Async/Event-Driven Messaging Contracts

**For Kafka, RabbitMQ, SNS/SQS:** Use Message Pact (different API than HTTP Pact)

### Consumer Message Contract

```javascript
const { MessageConsumerPact, MatchersV3 } = require('@pact-foundation/pact')

describe('User Event Consumer', () => {
  const messagePact = new MessageConsumerPact({
    consumer: 'NotificationService',
    provider: 'UserService'
  })

  it('processes user created events', () => {
    return messagePact
      .expectsToReceive('user created event')
      .withContent({
        userId: MatchersV3.integer(123),
        email: MatchersV3.string('user@example.com'),
        eventType: 'USER_CREATED'
      })
      .withMetadata({
        'content-type': 'application/json'
      })
      .verify((message) => {
        processUserCreatedEvent(message.contents)
      })
  })
})
```

### Provider Message Verification

```javascript
// Provider verifies it can produce matching messages
const { MessageProviderPact } = require('@pact-foundation/pact')

describe('User Event Producer', () => {
  it('publishes user created events matching contracts', () => {
    return new MessageProviderPact({
      messageProviders: {
        'user created event': () => ({
          contents: {
            userId: 123,
            email: 'test@example.com',
            eventType: 'USER_CREATED'
          },
          metadata: {
            'content-type': 'application/json'
          }
        })
      }
    }).verify()
  })
})
```

### Key Differences from HTTP Contracts

- **No request/response:** Only message payload
- **Metadata:** Headers, content-type, message keys
- **Ordering:** Don't test message ordering in contracts (infrastructure concern)
- **Delivery:** Don't test delivery guarantees (wrong layer)

**Workflow:** Same as HTTP (publish pact → verify → can-i-deploy)

## CI/CD Integration Quick Reference

### GitHub Actions

```yaml
# Consumer publishes contracts
- name: Run Pact tests
  run: npm test

- name: Publish pacts
  run: |
    npm run pact:publish
  env:
    PACT_BROKER_URL: ${{ secrets.PACT_BROKER_URL }}
    PACT_BROKER_TOKEN: ${{ secrets.PACT_BROKER_TOKEN }}

# Provider verifies and checks deployment
- name: Verify contracts
  run: npm run pact:verify

- name: Can I deploy?
  run: |
    pact-broker can-i-deploy \
      --pacticipant UserService \
      --version ${{ github.sha }} \
      --to-environment production
```

### GitLab CI

```yaml
pact_test:
  script:
    - npm test
    - npm run pact:publish

pact_verify:
  script:
    - npm run pact:verify
    - pact-broker can-i-deploy --pacticipant UserService --version $CI_COMMIT_SHA --to-environment production
```

## Your First Contract Test

**Goal:** Prevent breaking changes between two services in one week

**Day 1-2: Consumer Side**

```javascript
// Install Pact first:
//   npm install --save-dev @pact-foundation/pact

// Consumer contract test (order-service)
const { PactV3, MatchersV3 } = require('@pact-foundation/pact')
const { getUserById } = require('./userClient')

describe('User API', () => {
  const provider = new PactV3({
    consumer: 'OrderService',
    provider: 'UserService'
  })

  it('gets user by id', () => {
    provider
      .given('a user exists')
      .uponReceiving('a request for user')
      .withRequest({
        method: 'GET',
        path: '/users/123'
      })
      .willRespondWith({
        status: 200,
        body: {
          id: MatchersV3.integer(123),
          name: MatchersV3.string('John')
        }
      })

    return provider.executeTest(async (mockServer) => {
      const user = await getUserById(mockServer.url, 123)
      expect(user.name).toBe('John')
    })
  })
})
```

**Day 3-4: Set Up Pact Broker**

```bash
# Docker Compose
docker-compose up -d

# Or use hosted Pactflow (SaaS)
# https://pactflow.io
```
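The step above assumes a docker-compose.yml for the broker is already in place; a minimal sketch is below. The image tags, credentials, and port are placeholders - check the pact-broker image documentation for the exact environment variables your version expects.

```yaml
# docker-compose.yml - minimal Pact Broker + Postgres for local/CI use (sketch)
version: "3"
services:
  postgres:
    image: postgres:15
    environment:
      POSTGRES_USER: pact
      POSTGRES_PASSWORD: pact          # placeholder credentials
      POSTGRES_DB: pact_broker
  pact-broker:
    image: pactfoundation/pact-broker:latest
    ports:
      - "9292:9292"
    depends_on:
      - postgres
    environment:
      PACT_BROKER_DATABASE_URL: "postgres://pact:pact@postgres/pact_broker"
      PACT_BROKER_BASIC_AUTH_USERNAME: pact   # optional basic auth
      PACT_BROKER_BASIC_AUTH_PASSWORD: pact
```

With this running, `PACT_BROKER_URL` in the CI snippets above would point at `http://localhost:9292`.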
**Day 5-6: Provider Side**

```javascript
// Provider verification (user-service)
const { Verifier } = require('@pact-foundation/pact')
const app = require('./app')

describe('Pact Verification', () => {
  it('validates contracts from broker', () => {
    return new Verifier({
      provider: 'UserService',
      providerBaseUrl: 'http://localhost:8080',
      pactBrokerUrl: process.env.PACT_BROKER_URL,
      publishVerificationResult: true,
      providerVersion: '1.0.0',

      stateHandlers: {
        'a user exists': async () => {
          await db.users.create({ id: 123, name: 'John' })
        }
      }
    }).verifyProvider()
  })
})
```

**Day 7: Add to CI**

```yaml
# Add can-i-deploy before deployment
- pact-broker can-i-deploy --pacticipant UserService --version $VERSION --to production
```

## Common Mistakes

### ❌ Testing Business Logic in Contracts

**Fix:** Contract tests verify integration only. Test business logic separately.

---

### ❌ Not Using Matchers

**Fix:** Use `Matchers.string()`, `Matchers.integer()` for flexible matching

---

### ❌ Skipping Can-I-Deploy

**Fix:** Always run can-i-deploy before deployment. Automate in CI.

---

### ❌ Hardcoding Test Data

**Fix:** Generate data in provider states, use matchers in contracts

## Quick Reference

**Tool Selection:**
- Polyglot/multiple teams: Pact
- Java Spring only: Spring Cloud Contract
- Public API: OpenAPI validation

**Contract Type:**
- Internal services: Consumer-driven (Pact)
- Public API: Provider-driven (OpenAPI)
- Both: Bi-directional

**Pact Broker Workflow:**
1. Consumer publishes pact
2. Provider verifies
3. Can-i-deploy checks compatibility
4. Deploy if compatible

**Breaking Changes:**
- Add optional field: Safe
- Add required field: Expand/contract pattern
- Remove/rename field: Deprecate → migrate → remove

**Provider States:**
- Set up test data
- Clean up after each test
- Use transactions for DB
- Don't hardcode IDs

**CI/CD:**
- Consumer: Test → publish pacts
- Provider: Verify → can-i-deploy → deploy

## Bottom Line

**Contract testing prevents API breaking changes by testing integration points independently. Use Pact for internal microservices, publish contracts to the broker, and run can-i-deploy before deployment.**

Test the contract (request/response), not the implementation. Use consumer-driven contracts for known consumers, schema-first for public APIs.
skills/dependency-scanning/SKILL.md (Normal file, 429 lines)
@@ -0,0 +1,429 @@
---
name: dependency-scanning
description: Use when integrating SCA tools (Snyk, Dependabot, OWASP Dependency-Check), automating vulnerability management, handling license compliance, setting up automated dependency updates, or managing security advisories - provides tool selection, PR automation workflows, and false positive management
---

# Dependency Scanning

## Overview

**Core principle:** Third-party dependencies introduce security vulnerabilities and license risks. Automate scanning to catch them early.

**Rule:** Block merges on critical/high vulnerabilities in direct dependencies. Monitor and plan fixes for transitive dependencies.

## Why Dependency Scanning Matters

**Security vulnerabilities:**
- 80% of codebases contain at least one vulnerable dependency
- Log4Shell (CVE-2021-44228) affected millions of applications
- Attackers actively scan GitHub for known vulnerabilities

**License compliance:**
- GPL dependencies in proprietary software = legal risk
- Some licenses require source code disclosure
- Incompatible license combinations

---

## Tool Selection

| Tool | Use Case | Cost | Best For |
|------|----------|------|----------|
| **Dependabot** | Automated PRs for updates | Free (GitHub) | GitHub projects, basic scanning |
| **Snyk** | Comprehensive security + license scanning | Free tier, paid plans | Production apps, detailed remediation |
| **OWASP Dependency-Check** | Security-focused, self-hosted | Free | Privacy-sensitive, custom workflows |
| **npm audit** | JavaScript quick scan | Free | Quick local checks |
| **pip-audit** | Python quick scan | Free | Quick local checks |
| **bundler-audit** | Ruby quick scan | Free | Quick local checks |

**Recommended setup:**
- **GitHub repos:** Dependabot (automated) + Snyk (security focus)
- **Self-hosted:** OWASP Dependency-Check
- **Quick local checks:** npm audit / pip-audit (see the commands below)
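For the quick local checks, a one-liner per ecosystem is enough (a sketch; `pip-audit` must be installed first, e.g. `pip install pip-audit`):

```bash
# JavaScript: fail on high-severity issues and above
npm audit --audit-level=high

# Python: scan the current environment, or a requirements file
pip-audit
pip-audit -r requirements.txt
```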
---
|
||||||
|
|
||||||
|
## Dependabot Configuration
|
||||||
|
|
||||||
|
### Enable Dependabot (GitHub)
|
||||||
|
|
||||||
|
``yaml
|
||||||
|
# .github/dependabot.yml
|
||||||
|
version: 2
|
||||||
|
updates:
|
||||||
|
- package-ecosystem: "npm"
|
||||||
|
directory: "/"
|
||||||
|
schedule:
|
||||||
|
interval: "weekly"
|
||||||
|
day: "monday"
|
||||||
|
open-pull-requests-limit: 5
|
||||||
|
labels:
|
||||||
|
- "dependencies"
|
||||||
|
reviewers:
|
||||||
|
- "security-team"
|
||||||
|
|
||||||
|
- package-ecosystem: "pip"
|
||||||
|
directory: "/"
|
||||||
|
schedule:
|
||||||
|
interval: "weekly"
|
||||||
|
target-branch: "develop"
|
||||||
|
```
|
||||||
|
|
||||||
|
**What Dependabot does:**
|
||||||
|
- Scans dependencies weekly
|
||||||
|
- Creates PRs for vulnerabilities
|
||||||
|
- Updates to safe versions
|
||||||
|
- Provides CVE details
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Snyk Integration
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install -g snyk
|
||||||
|
snyk auth # Authenticate with Snyk account
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Scan Local Project
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test for vulnerabilities
|
||||||
|
snyk test
|
||||||
|
|
||||||
|
# Monitor project (continuous scanning)
|
||||||
|
snyk monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### CI/CD Integration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# .github/workflows/snyk.yml
|
||||||
|
name: Snyk Security Scan
|
||||||
|
|
||||||
|
on: [pull_request, push]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
security:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Run Snyk
|
||||||
|
uses: snyk/actions/node@master
|
||||||
|
env:
|
||||||
|
SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
|
||||||
|
with:
|
||||||
|
args: --severity-threshold=high # Fail on high+ severity
|
||||||
|
```
|
||||||
|
|
||||||
|
**Severity thresholds:**
|
||||||
|
- **Critical:** Block merge immediately
|
||||||
|
- **High:** Block merge, fix within 7 days
|
||||||
|
- **Medium:** Create issue, fix within 30 days
|
||||||
|
- **Low:** Monitor, fix opportunistically
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## OWASP Dependency-Check
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Download latest release
|
||||||
|
wget https://github.com/jeremylong/DependencyCheck/releases/download/v8.0.0/dependency-check-8.0.0-release.zip
|
||||||
|
unzip dependency-check-8.0.0-release.zip
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Run Scan
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Scan project
|
||||||
|
./dependency-check/bin/dependency-check.sh \
|
||||||
|
--scan ./src \
|
||||||
|
--format HTML \
|
||||||
|
--out ./reports \
|
||||||
|
--suppression ./dependency-check-suppressions.xml
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Suppression File (False Positives)
|
||||||
|
|
||||||
|
```xml
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
<!-- dependency-check-suppressions.xml -->
|
||||||
|
<suppressions xmlns="https://jeremylong.github.io/DependencyCheck/dependency-suppression.1.3.xsd">
|
||||||
|
<suppress>
|
||||||
|
<notes>False positive - CVE applies to server mode only, we use client mode</notes>
|
||||||
|
<cve>CVE-2021-12345</cve>
|
||||||
|
</suppress>
|
||||||
|
</suppressions>
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## License Compliance
|
||||||
|
|
||||||
|
### Checking Licenses (npm)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List all licenses
|
||||||
|
npx license-checker
|
||||||
|
|
||||||
|
# Filter incompatible licenses
|
||||||
|
npx license-checker --onlyAllow 'MIT;Apache-2.0;BSD-3-Clause'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Blocking Incompatible Licenses
|
||||||
|
|
||||||
|
```json
|
||||||
|
// package.json
|
||||||
|
{
|
||||||
|
"scripts": {
|
||||||
|
"license-check": "license-checker --onlyAllow 'MIT;Apache-2.0;BSD-3-Clause;ISC' --production"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# CI: Fail if incompatible licenses detected
|
||||||
|
- name: Check licenses
|
||||||
|
run: npm run license-check
|
||||||
|
```
|
||||||
|
|
||||||
|
**Common license risks:**
|
||||||
|
- **GPL/AGPL:** Requires source code disclosure
|
||||||
|
- **SSPL:** Restrictive for SaaS
|
||||||
|
- **Proprietary:** May prohibit commercial use
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Automated Dependency Updates
|
||||||
|
|
||||||
|
### Auto-Merge Strategy
|
||||||
|
|
||||||
|
**Safe to auto-merge:**
|
||||||
|
- Patch versions (1.2.3 → 1.2.4)
|
||||||
|
- No breaking changes
|
||||||
|
- Passing all tests
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# .github/workflows/auto-merge-dependabot.yml
|
||||||
|
name: Auto-merge Dependabot PRs
|
||||||
|
|
||||||
|
on: pull_request
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
auto-merge:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
if: github.actor == 'dependabot[bot]'
|
||||||
|
steps:
|
||||||
|
- name: Check if patch update
|
||||||
|
id: check
|
||||||
|
run: |
|
||||||
|
# Only auto-merge patch/minor, not major
|
||||||
|
if [[ "${{ github.event.pull_request.title }}" =~ ^Bump.*from.*\.[0-9]+$ ]]; then
|
||||||
|
echo "auto_merge=true" >> $GITHUB_OUTPUT
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Enable auto-merge
|
||||||
|
if: steps.check.outputs.auto_merge == 'true'
|
||||||
|
run: gh pr merge --auto --squash "$PR_URL"
|
||||||
|
env:
|
||||||
|
PR_URL: ${{ github.event.pull_request.html_url }}
|
||||||
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Vulnerability Remediation Workflow
|
||||||
|
|
||||||
|
### 1. Triage (Within 24 hours)
|
||||||
|
|
||||||
|
**For each vulnerability:**
|
||||||
|
- **Assess severity:** Critical → immediate, High → 7 days, Medium → 30 days
|
||||||
|
- **Check exploitability:** Is it reachable in our code?
|
||||||
|
- **Verify patch availability:** Is there a fixed version?
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Remediation Options
|
||||||
|
|
||||||
|
| Option | When to Use | Example |
|
||||||
|
|--------|-------------|---------|
|
||||||
|
| **Update dependency** | Patch available | `npm update lodash` |
|
||||||
|
| **Update lockfile only** | Transitive dependency | `npm audit fix` |
|
||||||
|
| **Replace dependency** | No patch, actively exploited | Replace `request` with `axios` |
|
||||||
|
| **Apply workaround** | No patch, low risk | Disable vulnerable feature |
|
||||||
|
| **Accept risk** | False positive, not exploitable | Document in suppression file |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Verification
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# After fix, verify vulnerability is resolved
|
||||||
|
npm audit
|
||||||
|
snyk test
|
||||||
|
|
||||||
|
# Run full test suite
|
||||||
|
npm test
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Ignoring Transitive Dependencies
|
||||||
|
|
||||||
|
**Symptom:** "We don't use that library directly, so it's fine"
|
||||||
|
|
||||||
|
**Why bad:** Transitive dependencies are still in your app
|
||||||
|
|
||||||
|
```
|
||||||
|
Your App
|
||||||
|
└─ express@4.18.0
|
||||||
|
└─ body-parser@1.19.0
|
||||||
|
└─ qs@6.7.0 (vulnerable!)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Update parent dependency or override version
|
||||||
|
|
||||||
|
```json
|
||||||
|
// package.json - force safe version
|
||||||
|
{
|
||||||
|
"overrides": {
|
||||||
|
"qs": "^6.11.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Auto-Merging All Updates
|
||||||
|
|
||||||
|
**Symptom:** Dependabot PRs merged without review
|
||||||
|
|
||||||
|
**Why bad:**
|
||||||
|
- Major versions can break functionality
|
||||||
|
- Updates may introduce new bugs
|
||||||
|
- No verification tests run
|
||||||
|
|
||||||
|
**Fix:** Auto-merge only patch versions, review major/minor
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Suppressing Without Investigation
|
||||||
|
|
||||||
|
**Symptom:** Marking all vulnerabilities as false positives
|
||||||
|
|
||||||
|
```xml
|
||||||
|
<!-- ❌ BAD: No justification -->
|
||||||
|
<suppress>
|
||||||
|
<cve>CVE-2021-12345</cve>
|
||||||
|
</suppress>
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Document WHY it's suppressed
|
||||||
|
|
||||||
|
```xml
|
||||||
|
<!-- ✅ GOOD: Clear justification -->
|
||||||
|
<suppress>
|
||||||
|
<notes>
|
||||||
|
False positive: CVE applies to XML parsing feature.
|
||||||
|
We only use JSON parsing (verified in code review).
|
||||||
|
Tracking issue: #1234
|
||||||
|
</notes>
|
||||||
|
<cve>CVE-2021-12345</cve>
|
||||||
|
</suppress>
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ No SLA for Fixes
|
||||||
|
|
||||||
|
**Symptom:** Vulnerabilities sit unfixed for months
|
||||||
|
|
||||||
|
**Fix:** Define SLAs by severity
|
||||||
|
|
||||||
|
**Example SLA:**
|
||||||
|
- **Critical:** Fix within 24 hours
|
||||||
|
- **High:** Fix within 7 days
|
||||||
|
- **Medium:** Fix within 30 days
|
||||||
|
- **Low:** Fix within 90 days or next release
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Monitoring & Alerting
|
||||||
|
|
||||||
|
### Slack Notifications
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# .github/workflows/security-alerts.yml
|
||||||
|
name: Security Alerts
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 9 * * *' # Daily at 9 AM
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
scan:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Run Snyk
|
||||||
|
id: snyk
|
||||||
|
        continue-on-error: true  # keep the job running so the alert step below can fire
        run: |
          snyk test --json > snyk-results.json
|
||||||
|
|
||||||
|
- name: Send Slack alert
|
||||||
|
if: steps.snyk.outcome == 'failure'
|
||||||
|
uses: slackapi/slack-github-action@v1
|
||||||
|
with:
|
||||||
|
payload: |
|
||||||
|
{
|
||||||
|
"text": "🚨 Security vulnerabilities detected!",
|
||||||
|
"blocks": [
|
||||||
|
{
|
||||||
|
"type": "section",
|
||||||
|
"text": {
|
||||||
|
"type": "mrkdwn",
|
||||||
|
"text": "*Critical vulnerabilities found in dependencies*\nView details: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
env:
|
||||||
|
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}
|
||||||
|
```
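
The workflow above saves `snyk-results.json` but only links to the run. A minimal sketch of a helper that could summarize it for the alert; the field names (`vulnerabilities`, `severity`) are assumptions about `snyk test --json` output, so verify them against your Snyk CLI version:

```python
# scripts/summarize_snyk.py -- hypothetical helper, not part of the workflow above.
# Assumes `snyk test --json` emits a top-level "vulnerabilities" list with a
# "severity" field on each entry; adjust to the schema your CLI version produces.
import json
import sys
from collections import Counter

def main(path: str) -> int:
    with open(path) as fh:
        report = json.load(fh)
    counts = Counter(v.get("severity", "unknown") for v in report.get("vulnerabilities", []))
    print("Vulnerabilities by severity:", dict(counts))
    # Mirror the SLA above: critical/high should trigger the alert immediately
    return 1 if (counts["critical"] or counts["high"]) else 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1] if len(sys.argv) > 1 else "snyk-results.json"))
```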
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Automate dependency scanning to catch vulnerabilities and license issues early. Block merges on critical issues, monitor and plan fixes for others.**
|
||||||
|
|
||||||
|
**Setup:**
|
||||||
|
- Enable Dependabot (automated PRs)
|
||||||
|
- Add Snyk or OWASP Dependency-Check (security scanning)
|
||||||
|
- Check licenses (license-checker)
|
||||||
|
- Define SLAs (Critical: 24h, High: 7d, Medium: 30d)
|
||||||
|
|
||||||
|
**Remediation:**
|
||||||
|
- Update dependencies to patched versions
|
||||||
|
- Override transitive dependencies if needed
|
||||||
|
- Document suppressions with justification
|
||||||
|
- Verify fixes with tests
|
||||||
|
|
||||||
|
**If you're not scanning dependencies, you're shipping known vulnerabilities. Automate it in CI/CD.**
|
||||||
290
skills/e2e-testing-strategies/SKILL.md
Normal file
@@ -0,0 +1,290 @@
|
|||||||
|
---
|
||||||
|
name: e2e-testing-strategies
|
||||||
|
description: Use when designing E2E test architecture, choosing between Cypress/Playwright/Selenium, prioritizing which flows to test, fixing flaky E2E tests, or debugging slow E2E test suites - provides production-tested patterns and anti-patterns
|
||||||
|
---
|
||||||
|
|
||||||
|
# E2E Testing Strategies
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** E2E tests are expensive. Use them sparingly for critical multi-system flows. Everything else belongs lower in the test pyramid.
|
||||||
|
|
||||||
|
**Test pyramid target:** 5-10% E2E, 20-25% integration, 65-75% unit
|
||||||
|
|
||||||
|
**Scope:** This skill focuses on web application E2E testing (browser-based). For mobile app testing (iOS/Android), decision tree points to Appium, but patterns/anti-patterns here are web-specific. Mobile testing requires different strategies for device capabilities, native selectors, and app lifecycle.
|
||||||
|
|
||||||
|
## Framework Selection Decision Tree
|
||||||
|
|
||||||
|
Choose framework based on constraints:
|
||||||
|
|
||||||
|
| Your Constraint | Choose | Why |
|
||||||
|
|----------------|--------|-----|
|
||||||
|
| Need cross-browser (Chrome/Firefox/Safari) | **Playwright** | Native multi-browser, auto-wait, trace viewer |
|
||||||
|
| Team unfamiliar with testing | **Cypress** | Simpler API, better DX, larger community |
|
||||||
|
| Enterprise/W3C standard requirement | **WebdriverIO** | Full W3C WebDriver protocol |
|
||||||
|
| Headless Chrome only, fine-grained control | **Puppeteer** | Lower-level, faster for Chrome-only |
|
||||||
|
| Testing Electron apps | **Spectron** or **Playwright** | Native Electron support |
|
||||||
|
| Mobile apps (iOS/Android) | **Appium** | Mobile-specific protocol (Note: rest of this skill is web-focused) |
|
||||||
|
|
||||||
|
**For most web apps:** Playwright (modern, reliable) or Cypress (simpler DX)
|
||||||
|
|
||||||
|
## Flow Prioritization Matrix
|
||||||
|
|
||||||
|
When you have 50 flows but can only test 10 E2E:
|
||||||
|
|
||||||
|
| Score | Criteria | Weight |
|
||||||
|
|-------|----------|--------|
|
||||||
|
| +3 | Revenue impact (checkout, payment, subscription) | High |
|
||||||
|
| +3 | Multi-system integration (API + DB + email + payment) | High |
|
||||||
|
| +2 | Historical production failures (has broken before) | Medium |
|
||||||
|
| +2 | Complex state management (auth, sessions, caching) | Medium |
|
||||||
|
| +1 | User entry point (login, signup, search) | Medium |
|
||||||
|
| +1 | Regulatory/compliance requirement | Medium |
|
||||||
|
| -2 | Can be tested at integration level | Penalty |
|
||||||
|
| -3 | Mostly UI interaction, no backend | Penalty |
|
||||||
|
|
||||||
|
**Score flows 0-10, test top 10.** Everything else → integration/unit tests.
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
- "User checkout flow" = +3 revenue +3 multi-system +2 historical +2 state = **10** → E2E
|
||||||
|
- "User changes email preference" = +1 entry -2 integration level = **-1** → Integration test
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Pyramid Inversion
|
||||||
|
**Symptom:** 200 E2E tests, 50 integration tests, 100 unit tests
|
||||||
|
|
||||||
|
**Why bad:** E2E tests are slow (30min CI), brittle (UI changes break tests), hard to debug
|
||||||
|
|
||||||
|
**Fix:** Invert back - move 150 E2E tests down to integration/unit
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Testing Through the UI
|
||||||
|
**Symptom:** E2E test creates 10 users through signup form to test one admin feature
|
||||||
|
|
||||||
|
**Why bad:** Slow, couples unrelated features
|
||||||
|
|
||||||
|
**Fix:** Seed data via API/database, test only the admin feature flow
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Arbitrary Timeouts
|
||||||
|
**Symptom:** `wait(5000)` sprinkled throughout tests
|
||||||
|
|
||||||
|
**Why bad:** Flaky - sometimes too short, sometimes wastes time
|
||||||
|
|
||||||
|
**Fix:** Explicit waits for conditions
|
||||||
|
```javascript
|
||||||
|
// ❌ Bad
|
||||||
|
await page.click('button');
|
||||||
|
await page.waitForTimeout(5000);
|
||||||
|
|
||||||
|
// ✅ Good
|
||||||
|
await page.click('button');
|
||||||
|
await page.waitForSelector('.success-message');
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ God Page Objects
|
||||||
|
**Symptom:** Single `PageObject` class with 50 methods for entire app
|
||||||
|
|
||||||
|
**Why bad:** Tight coupling, hard to maintain, unclear responsibilities
|
||||||
|
|
||||||
|
**Fix:** One page object per logical page/component
|
||||||
|
```javascript
|
||||||
|
// ❌ Bad: God object
|
||||||
|
class AppPage {
|
||||||
|
async login() {}
|
||||||
|
async createPost() {}
|
||||||
|
async deleteUser() {}
|
||||||
|
async exportReport() {}
|
||||||
|
// ... 50 more methods
|
||||||
|
}
|
||||||
|
|
||||||
|
// ✅ Good: Focused page objects
|
||||||
|
class AuthPage {
|
||||||
|
async login() {}
|
||||||
|
async logout() {}
|
||||||
|
}
|
||||||
|
|
||||||
|
class PostsPage {
|
||||||
|
async create() {}
|
||||||
|
async delete() {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Brittle Selectors
|
||||||
|
**Symptom:** `page.click('.btn-primary.mt-4.px-3')`
|
||||||
|
|
||||||
|
**Why bad:** Breaks when CSS changes
|
||||||
|
|
||||||
|
**Fix:** Use `data-testid` attributes
|
||||||
|
```javascript
|
||||||
|
// ❌ Bad
|
||||||
|
await page.click('.submit-button.btn.btn-primary');
|
||||||
|
|
||||||
|
// ✅ Good
|
||||||
|
await page.click('[data-testid="submit"]');
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Test Interdependence
|
||||||
|
**Symptom:** Test 5 fails if Test 3 doesn't run first
|
||||||
|
|
||||||
|
**Why bad:** Can't run tests in parallel, hard to debug
|
||||||
|
|
||||||
|
**Fix:** Each test sets up own state
|
||||||
|
```javascript
|
||||||
|
// ❌ Bad
|
||||||
|
test('create user', async () => {
|
||||||
|
// creates user "test@example.com"
|
||||||
|
});
|
||||||
|
|
||||||
|
test('login user', async () => {
|
||||||
|
// assumes user from previous test exists
|
||||||
|
});
|
||||||
|
|
||||||
|
// ✅ Good
|
||||||
|
test('login user', async ({ page }) => {
|
||||||
|
await createUserViaAPI('test@example.com'); // independent setup
|
||||||
|
await page.goto('/login');
|
||||||
|
// test login flow
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
## Flakiness Patterns Catalog
|
||||||
|
|
||||||
|
Common flake sources and fixes:
|
||||||
|
|
||||||
|
| Pattern | Symptom | Fix |
|
||||||
|
|---------|---------|-----|
|
||||||
|
| **Network Race** | "Element not found" intermittently | `await page.waitForLoadState('networkidle')` |
|
||||||
|
| **Animation Race** | "Element not clickable" | `await page.waitForSelector('.element', { state: 'visible' })` or disable animations |
|
||||||
|
| **Async State** | "Expected 'success' but got ''" | Wait for specific state, not timeout |
|
||||||
|
| **Test Data Pollution** | Test passes alone, fails in suite | Isolate data per test (unique IDs, cleanup) |
|
||||||
|
| **Browser Caching** | Different results first vs second run | Clear cache/cookies between tests |
|
||||||
|
| **Date/Time Sensitivity** | Test fails at midnight, passes during day | Mock system time in tests |
|
||||||
|
| **External Service** | Third-party API occasionally down | Mock external dependencies |
|
||||||
|
|
||||||
|
**Rule:** Even a low failure rate counts - if a test fails intermittently (say, 1-5% of runs), it's flaky. Fix it before adding more tests.
|
||||||
|
|
||||||
|
## Page Object Anti-Patterns
|
||||||
|
|
||||||
|
### ❌ Business Logic in Page Objects
|
||||||
|
```javascript
|
||||||
|
// ❌ Bad
|
||||||
|
class CheckoutPage {
|
||||||
|
async calculateTotal(items) {
|
||||||
|
return items.reduce((sum, item) => sum + item.price, 0); // business logic
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ✅ Good
|
||||||
|
class CheckoutPage {
|
||||||
|
async getTotal() {
|
||||||
|
    return await this.page.textContent('[data-testid="total"]'); // UI interaction only
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### ❌ Assertions in Page Objects
|
||||||
|
```javascript
|
||||||
|
// ❌ Bad
|
||||||
|
class LoginPage {
|
||||||
|
async login(email, password) {
|
||||||
|
await this.page.fill('[data-testid="email"]', email);
|
||||||
|
await this.page.fill('[data-testid="password"]', password);
|
||||||
|
await this.page.click('[data-testid="submit"]');
|
||||||
|
expect(this.page.url()).toContain('/dashboard'); // assertion
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ✅ Good
|
||||||
|
class LoginPage {
|
||||||
|
async login(email, password) {
|
||||||
|
await this.page.fill('[data-testid="email"]', email);
|
||||||
|
await this.page.fill('[data-testid="password"]', password);
|
||||||
|
await this.page.click('[data-testid="submit"]');
|
||||||
|
}
|
||||||
|
|
||||||
|
async isOnDashboard() {
|
||||||
|
return this.page.url().includes('/dashboard');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test file handles assertions
|
||||||
|
test('login', async () => {
|
||||||
|
await loginPage.login('user@test.com', 'password');
|
||||||
|
expect(await loginPage.isOnDashboard()).toBe(true);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
### When to Use E2E vs Integration vs Unit
|
||||||
|
|
||||||
|
| Scenario | Test Level | Reasoning |
|
||||||
|
|----------|-----------|-----------|
|
||||||
|
| Form validation logic | Unit | Pure function, no UI needed |
|
||||||
|
| API error handling | Integration | Test API contract, no browser |
|
||||||
|
| Multi-step checkout | E2E | Crosses systems, critical revenue |
|
||||||
|
| Button hover states | Visual regression | Not functional behavior |
|
||||||
|
| Login → dashboard redirect | E2E | Auth critical, multi-system |
|
||||||
|
| Database query performance | Integration | No UI, just DB |
|
||||||
|
| User can filter search results | E2E (1 test) + Integration (variations) | 1 E2E for happy path, rest integration |
|
||||||
|
|
||||||
|
### Test Data Strategies
|
||||||
|
|
||||||
|
| Approach | When to Use | Pros | Cons |
|
||||||
|
|----------|-------------|------|------|
|
||||||
|
| **API Seeding** | Most tests | Fast, consistent | Requires API access |
|
||||||
|
| **Database Seeding** | Integration tests | Complete control | Slow, requires DB access |
|
||||||
|
| **UI Creation** | Testing creation flow itself | Tests real user path | Slow, couples tests |
|
||||||
|
| **Mocking** | External services | Fast, reliable | Misses real integration issues |
|
||||||
|
| **Fixtures** | Consistent test data | Reusable, version-controlled | Stale if schema changes |
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
### ❌ Running Full Suite on Every Commit
|
||||||
|
**Symptom:** 30-minute CI blocking every PR
|
||||||
|
|
||||||
|
**Fix:** Smoke tests (5-10 critical flows) on PR, full suite on merge/nightly
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Not Capturing Failure Artifacts
|
||||||
|
**Symptom:** "Test failed in CI but I can't reproduce"
|
||||||
|
|
||||||
|
**Fix:** Save video + trace on failure
|
||||||
|
```javascript
|
||||||
|
// playwright.config.js
|
||||||
|
use: {
|
||||||
|
video: 'retain-on-failure',
|
||||||
|
trace: 'retain-on-failure',
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Testing Implementation Details
|
||||||
|
**Symptom:** Tests assert internal component state
|
||||||
|
|
||||||
|
**Fix:** Test user-visible behavior only
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ One Assert Per Test
|
||||||
|
**Symptom:** 50 E2E tests all navigate to same page, test one thing
|
||||||
|
|
||||||
|
**Fix:** Group related assertions in one flow test (but keep focused)
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**E2E tests verify critical multi-system flows work for real users.**
|
||||||
|
|
||||||
|
If you can test it faster/more reliably at a lower level, do that instead.
|
||||||
493
skills/flaky-test-prevention/SKILL.md
Normal file
@@ -0,0 +1,493 @@
|
|||||||
|
---
|
||||||
|
name: flaky-test-prevention
|
||||||
|
description: Use when debugging intermittent test failures, choosing between retries vs fixes, quarantining flaky tests, calculating flakiness rates, or preventing non-deterministic behavior - provides root cause diagnosis, anti-patterns, and systematic debugging
|
||||||
|
---
|
||||||
|
|
||||||
|
# Flaky Test Prevention
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Fix root causes, don't mask symptoms.
|
||||||
|
|
||||||
|
**Rule:** Flaky tests indicate real problems - in test design, application code, or infrastructure.
|
||||||
|
|
||||||
|
## Flakiness Decision Tree
|
||||||
|
|
||||||
|
| Symptom | Root Cause Category | Diagnostic | Fix |
|
||||||
|
|---------|---------------------|------------|-----|
|
||||||
|
| Passes alone, fails in suite | Test Interdependence | Run tests in random order | Use test isolation (transactions, unique IDs) |
|
||||||
|
| Fails randomly ~10% | Timing/Race Condition | Add logging, run 100x | Replace sleeps with explicit waits |
|
||||||
|
| Fails only in CI, not locally | Environment Difference | Compare CI vs local env | Match environments, use containers |
|
||||||
|
| Fails at specific times | Time Dependency | Check for date/time usage | Mock system time |
|
||||||
|
| Fails under load | Resource Contention | Run in parallel locally | Add resource isolation, increase limits |
|
||||||
|
| Different results each run | Non-Deterministic Code | Check for randomness | Seed random generators, use fixtures |
|
||||||
|
|
||||||
|
**First step:** Identify symptom, trace to root cause category.
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Sleepy Assertion
|
||||||
|
**Symptom:** Using fixed `sleep()` or `wait()` instead of condition-based waits
|
||||||
|
|
||||||
|
**Why bad:** Wastes time on fast runs, still fails on slow runs, brittle
|
||||||
|
|
||||||
|
**Fix:** Explicit waits for conditions
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad
|
||||||
|
time.sleep(5) # Hope 5 seconds is enough
|
||||||
|
assert element.text == "Loaded"
|
||||||
|
|
||||||
|
# ✅ Good
|
||||||
|
WebDriverWait(driver, 10).until(
|
||||||
|
    lambda d: d.find_element(By.ID, "status").text == "Loaded"  # requires: from selenium.webdriver.common.by import By
|
||||||
|
)
|
||||||
|
assert element.text == "Loaded"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Test Interdependence
|
||||||
|
**Symptom:** Tests pass when run in specific order, fail when shuffled
|
||||||
|
|
||||||
|
**Why bad:** Hidden dependencies, can't run in parallel, breaks test isolation
|
||||||
|
|
||||||
|
**Fix:** Each test creates its own data, no shared state
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad
|
||||||
|
def test_create_user():
|
||||||
|
user = create_user("test_user") # Shared ID
|
||||||
|
|
||||||
|
def test_update_user():
|
||||||
|
update_user("test_user") # Depends on test_create_user
|
||||||
|
|
||||||
|
# ✅ Good
|
||||||
|
def test_create_user():
|
||||||
|
user_id = f"user_{uuid4()}"
|
||||||
|
user = create_user(user_id)
|
||||||
|
|
||||||
|
def test_update_user():
|
||||||
|
user_id = f"user_{uuid4()}"
|
||||||
|
user = create_user(user_id) # Independent
|
||||||
|
update_user(user_id)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Hidden Dependencies
|
||||||
|
**Symptom:** Tests fail due to external state (network, database, file system) beyond test control
|
||||||
|
|
||||||
|
**Why bad:** Unpredictable failures, environment-specific issues
|
||||||
|
|
||||||
|
**Fix:** Mock external dependencies
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad
|
||||||
|
def test_weather_api():
|
||||||
|
response = requests.get("https://api.weather.com/...")
|
||||||
|
assert response.json()["temp"] > 0 # Fails if API is down
|
||||||
|
|
||||||
|
# ✅ Good
|
||||||
|
@mock.patch('requests.get')
|
||||||
|
def test_weather_api(mock_get):
|
||||||
|
mock_get.return_value.json.return_value = {"temp": 75}
|
||||||
|
response = get_weather("Seattle")
|
||||||
|
assert response["temp"] == 75
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Time Bomb
|
||||||
|
**Symptom:** Tests that depend on current date/time and fail at specific moments (midnight, month boundaries, DST)
|
||||||
|
|
||||||
|
**Why bad:** Fails unpredictably based on when tests run
|
||||||
|
|
||||||
|
**Fix:** Mock system time
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad
|
||||||
|
def test_expiration():
|
||||||
|
created_at = datetime.now()
|
||||||
|
assert is_expired(created_at) == False # Fails at midnight
|
||||||
|
|
||||||
|
# ✅ Good
|
||||||
|
@freeze_time("2025-11-15 12:00:00")
|
||||||
|
def test_expiration():
|
||||||
|
created_at = datetime(2025, 11, 15, 12, 0, 0)
|
||||||
|
assert is_expired(created_at) == False
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Timeout Inflation
|
||||||
|
**Symptom:** Continuously increasing timeouts to "fix" flaky tests (5s → 10s → 30s)
|
||||||
|
|
||||||
|
**Why bad:** Masks root cause, slows test suite, doesn't guarantee reliability
|
||||||
|
|
||||||
|
**Fix:** Investigate why operation is slow, use explicit waits
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad
|
||||||
|
page.wait_for_timeout(30000)  # Increased from 5s hoping it helps
|
||||||
|
|
||||||
|
# ✅ Good
|
||||||
|
page.wait_for_selector('.data-loaded', timeout=10000)
|
||||||
|
page.wait_for_load_state('networkidle')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Detection Strategies
|
||||||
|
|
||||||
|
### Proactive Identification
|
||||||
|
|
||||||
|
**Run tests multiple times (statistical detection):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# pytest with repeat plugin
|
||||||
|
pip install pytest-repeat
|
||||||
|
pytest --count=50 test_flaky.py
|
||||||
|
|
||||||
|
# Track pass rate
|
||||||
|
# 50/50 = 100% reliable
|
||||||
|
# 45/50 = 90% pass rate, i.e. 10% flake rate (investigate immediately)
# <99% pass rate = quarantine
|
||||||
|
```
|
||||||
|
|
||||||
|
**CI Integration (automatic tracking):**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# GitHub Actions example
|
||||||
|
- name: Run tests with flakiness detection
|
||||||
|
run: |
|
||||||
|
pytest --count=3 --junit-xml=results.xml
|
||||||
|
python scripts/calculate_flakiness.py results.xml
|
||||||
|
```
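
The `scripts/calculate_flakiness.py` helper above is referenced but not shown. A minimal sketch of what it could look like, assuming JUnit XML from `pytest --count` (pytest-repeat encodes repeats as parametrized test names, which the script collapses):

```python
# scripts/calculate_flakiness.py -- hypothetical helper for the workflow above.
# Parses JUnit XML from `pytest --count=N --junit-xml=results.xml` and prints
# the pass rate per test, exiting non-zero if any test falls below the threshold.
import re
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict

THRESHOLD = 99.0  # target pass rate (%)

def main(path: str) -> int:
    runs = defaultdict(lambda: [0, 0])  # name -> [total, failed]
    for case in ET.parse(path).getroot().iter("testcase"):
        # pytest-repeat names iterations test_x[1-3], test_x[2-3], ...; group them
        name = re.sub(r"\[\d+-\d+\]$", "", case.get("name", ""))
        runs[name][0] += 1
        if case.find("failure") is not None or case.find("error") is not None:
            runs[name][1] += 1

    flaky = False
    for name, (total, failed) in sorted(runs.items()):
        pass_rate = 100.0 * (total - failed) / total
        print(f"{name}: {pass_rate:.1f}% over {total} runs")
        flaky |= pass_rate < THRESHOLD
    return 1 if flaky else 0  # non-zero exit fails the CI step

if __name__ == "__main__":
    sys.exit(main(sys.argv[1]))
```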
|
||||||
|
|
||||||
|
**Flakiness metrics to track:**
|
||||||
|
- Pass rate per test (target: >99%)
|
||||||
|
- Mean Time Between Failures (MTBF)
|
||||||
|
- Failure clustering (same test failing together)
|
||||||
|
|
||||||
|
### Systematic Debugging
|
||||||
|
|
||||||
|
**When a test fails intermittently:**
|
||||||
|
|
||||||
|
1. **Reproduce consistently** - Run 100x to establish failure rate
|
||||||
|
2. **Isolate** - Run alone, with subset, with full suite (find interdependencies)
|
||||||
|
3. **Add logging** - Capture state before assertion, screenshot on failure (see the sketch after this list)
|
||||||
|
4. **Bisect** - If fails in suite, binary search which other test causes it
|
||||||
|
5. **Environment audit** - Compare CI vs local (env vars, resources, timing)
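
A minimal sketch of step 3's "screenshot on failure", assuming pytest and a Playwright `page` fixture; swap in your own driver or logging calls:

```python
# conftest.py -- capture an artifact whenever a test fails (sketch for step 3).
import os
import pytest

@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
    outcome = yield
    report = outcome.get_result()
    if report.when == "call" and report.failed:
        os.makedirs("artifacts", exist_ok=True)
        page = item.funcargs.get("page")  # only present if the test used the fixture
        if page is not None:
            page.screenshot(path=f"artifacts/{item.name}.png")
        # dump the failure text next to the screenshot for later comparison
        with open(f"artifacts/{item.name}.txt", "w") as fh:
            fh.write(report.longreprtext or "")
```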
|
||||||
|
|
||||||
|
## Flakiness Metrics Guide
|
||||||
|
|
||||||
|
**Calculating flake rate:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Flakiness formula
|
||||||
|
flake_rate = (failed_runs / total_runs) * 100
|
||||||
|
|
||||||
|
# Example
|
||||||
|
# Test run 100 times: 7 failures
|
||||||
|
# Flake rate = 7/100 = 7%
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thresholds:**
|
||||||
|
|
||||||
|
| Flake Rate | Action | Priority |
|
||||||
|
|------------|--------|----------|
|
||||||
|
| 0% (100% pass) | Reliable | Monitor |
|
||||||
|
| 0.1-1% | Investigate | Low |
|
||||||
|
| 1-5% | Quarantine + Fix | Medium |
|
||||||
|
| 5-10% | Quarantine + Fix Urgently | High |
|
||||||
|
| >10% | Disable immediately | Critical |
|
||||||
|
|
||||||
|
**Target:** All tests should maintain >99% pass rate (< 1% flake rate)
|
||||||
|
|
||||||
|
## Quarantine Workflow
|
||||||
|
|
||||||
|
**Purpose:** Keep CI green while fixing flaky tests systematically
|
||||||
|
|
||||||
|
**Process:**
|
||||||
|
|
||||||
|
1. **Detect** - Test fails >1% of runs
|
||||||
|
2. **Quarantine** - Mark with `@pytest.mark.quarantine`, exclude from CI
|
||||||
|
3. **Track** - Create issue with flake rate, failure logs, reproduction steps
|
||||||
|
4. **Fix** - Assign owner, set SLA (e.g., 2 weeks to fix or delete)
|
||||||
|
5. **Validate** - Run fixed test 100x, must achieve >99% pass rate
|
||||||
|
6. **Re-Enable** - Remove quarantine mark, monitor for 1 week
|
||||||
|
|
||||||
|
**Marking quarantined tests:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
@pytest.mark.quarantine(reason="Flaky due to timing issue #1234")
|
||||||
|
@pytest.mark.skip("Quarantined")
|
||||||
|
def test_flaky_feature():
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
**CI configuration:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run all tests except quarantined
|
||||||
|
pytest -m "not quarantine"
|
||||||
|
```
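
If you prefer the quarantine marker to be self-contained (no separate `skip` mark per test), a `conftest.py` hook can enforce it. A minimal sketch; the `RUN_QUARANTINED` switch is illustrative:

```python
# conftest.py -- make @pytest.mark.quarantine skip by default.
# Set RUN_QUARANTINED=1 locally to exercise quarantined tests while fixing them.
import os
import pytest

def pytest_configure(config):
    config.addinivalue_line(
        "markers", "quarantine(reason): flaky test excluded from CI until fixed"
    )

def pytest_collection_modifyitems(config, items):
    if os.environ.get("RUN_QUARANTINED") == "1":
        return
    skip = pytest.mark.skip(reason="quarantined flaky test")
    for item in items:
        if item.get_closest_marker("quarantine"):
            item.add_marker(skip)
```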
|
||||||
|
|
||||||
|
**SLA:** Quarantined tests must be fixed within 2 weeks or deleted. No test stays quarantined indefinitely.
|
||||||
|
|
||||||
|
## Tool Ecosystem Quick Reference
|
||||||
|
|
||||||
|
| Tool | Purpose | When to Use |
|
||||||
|
|------|---------|-------------|
|
||||||
|
| **pytest-repeat** | Run test N times | Statistical detection |
|
||||||
|
| **pytest-xdist** | Parallel execution | Expose race conditions |
|
||||||
|
| **pytest-rerunfailures** | Auto-retry on failure | Temporary mitigation during fix |
|
||||||
|
| **pytest-randomly** | Randomize test order | Detect test interdependence |
|
||||||
|
| **freezegun** | Mock system time | Fix time bombs |
|
||||||
|
| **pytest-timeout** | Prevent hanging tests | Catch infinite loops |
|
||||||
|
|
||||||
|
**Installation:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install pytest-repeat pytest-xdist pytest-rerunfailures pytest-randomly freezegun pytest-timeout
|
||||||
|
```
|
||||||
|
|
||||||
|
**Usage examples:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Detect flakiness (run 50x)
|
||||||
|
pytest --count=50 test_suite.py
|
||||||
|
|
||||||
|
# Detect interdependence (random order)
|
||||||
|
pytest --randomly-seed=12345 test_suite.py
|
||||||
|
|
||||||
|
# Expose race conditions (parallel)
|
||||||
|
pytest -n 4 test_suite.py
|
||||||
|
|
||||||
|
# Temporary mitigation (reruns, not a fix!)
|
||||||
|
pytest --reruns 2 --reruns-delay 1 test_suite.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Prevention Checklist
|
||||||
|
|
||||||
|
**Use during test authoring to prevent flakiness:**
|
||||||
|
|
||||||
|
- [ ] No fixed `time.sleep()` - use explicit waits for conditions
|
||||||
|
- [ ] Each test creates its own data (UUID-based IDs)
|
||||||
|
- [ ] No shared global state between tests
|
||||||
|
- [ ] External dependencies mocked (APIs, network, databases)
|
||||||
|
- [ ] Time/date frozen with `@freeze_time` if time-dependent
|
||||||
|
- [ ] Random values seeded (`random.seed(42)`)
|
||||||
|
- [ ] Tests pass when run in any order (`pytest --randomly-seed`)
|
||||||
|
- [ ] Tests pass when run in parallel (`pytest -n 4`)
|
||||||
|
- [ ] Tests pass 100/100 times (`pytest --count=100`)
|
||||||
|
- [ ] Teardown cleans up all resources (files, database, cache)
|
||||||
|
|
||||||
|
## Common Fixes Quick Reference
|
||||||
|
|
||||||
|
| Problem | Fix Pattern | Example |
|
||||||
|
|---------|-------------|---------|
|
||||||
|
| **Timing issues** | Explicit waits | `WebDriverWait(driver, 10).until(condition)` |
|
||||||
|
| **Test interdependence** | Unique IDs per test | `user_id = f"test_{uuid4()}"` |
|
||||||
|
| **External dependencies** | Mock/stub | `@mock.patch('requests.get')` |
|
||||||
|
| **Time dependency** | Freeze time | `@freeze_time("2025-11-15")` |
|
||||||
|
| **Random behavior** | Seed randomness | `random.seed(42)` |
|
||||||
|
| **Shared state** | Test isolation | Transactions, teardown fixtures |
|
||||||
|
| **Resource contention** | Unique resources | Separate temp dirs, DB namespaces |
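
A minimal pytest sketch of the last two rows - explicit teardown plus per-test resources (names are illustrative):

```python
# Per-test isolation: pytest's tmp_path gives each test a unique directory;
# the fixture below adds a unique namespace and explicit teardown.
import uuid
import pytest

@pytest.fixture
def workspace(tmp_path):
    ns = f"test_{uuid.uuid4().hex}"   # no collisions when running with pytest -n 4
    cache = {}                        # stand-in for a shared resource (cache, queue, ...)
    yield {"dir": tmp_path, "namespace": ns, "cache": cache}
    cache.clear()                     # teardown runs even when the test fails

def test_writes_are_isolated(workspace):
    report = workspace["dir"] / "report.txt"
    report.write_text(f"run {workspace['namespace']}")
    assert report.read_text().startswith("run test_")
```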
|
||||||
|
|
||||||
|
## Your First Flaky Test Fix
|
||||||
|
|
||||||
|
**Systematic approach for first fix:**
|
||||||
|
|
||||||
|
**Step 1: Reproduce (Day 1)**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run test 100 times, capture failures
|
||||||
|
pytest --count=100 --verbose test_flaky.py | tee output.log
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Categorize (Day 1)**
|
||||||
|
|
||||||
|
Check output.log:
|
||||||
|
- Same failure message? → Likely timing/race condition
|
||||||
|
- Different failures? → Likely test interdependence
|
||||||
|
- Only fails in CI? → Environment difference
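
A quick way to apply these heuristics to `output.log` from step 1 - a rough sketch, adjust the pattern to your failure output:

```python
# Rough triage helper: count distinct failure messages in output.log.
# One dominant message usually means a timing/race issue; many different
# messages point at test interdependence or environment problems.
import re
from collections import Counter

messages = Counter()
with open("output.log") as fh:
    for line in fh:
        match = re.search(r"\b\w+(Error|Exception)\b: .*", line)
        if match:
            messages[match.group(0)[:100]] += 1

for message, count in messages.most_common(5):
    print(f"{count:4d}x {message}")
```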
|
||||||
|
|
||||||
|
**Step 3: Fix Based on Category (Day 2)**
|
||||||
|
|
||||||
|
**If timing issue:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Before
|
||||||
|
time.sleep(2)
|
||||||
|
assert element.text == "Loaded"
|
||||||
|
|
||||||
|
# After
|
||||||
|
wait.until(lambda: element.text == "Loaded")
|
||||||
|
```
|
||||||
|
|
||||||
|
**If interdependence:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Before
|
||||||
|
user = User.objects.get(id=1) # Assumes user exists
|
||||||
|
|
||||||
|
# After
|
||||||
|
user = create_test_user(id=f"test_{uuid4()}") # Creates own data
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Validate (Day 2)**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Must pass 100/100 times
|
||||||
|
pytest --count=100 test_flaky.py
|
||||||
|
# Expected: 100 passed
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 5: Monitor (Week 1)**
|
||||||
|
|
||||||
|
Track in CI - test should maintain >99% pass rate for 1 week before considering it fixed.
|
||||||
|
|
||||||
|
## CI-Only Flakiness (Can't Reproduce Locally)
|
||||||
|
|
||||||
|
**Symptom:** Test fails intermittently in CI but passes 100% locally
|
||||||
|
|
||||||
|
**Root cause:** Environment differences between CI and local (resources, parallelization, timing)
|
||||||
|
|
||||||
|
### Systematic CI Debugging
|
||||||
|
|
||||||
|
**Step 1: Environment Fingerprinting**
|
||||||
|
|
||||||
|
Capture exact environment in both CI and locally:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Add to conftest.py
|
||||||
|
import os, sys, platform, tempfile
|
||||||
|
|
||||||
|
def pytest_configure(config):
|
||||||
|
print(f"Python: {sys.version}")
|
||||||
|
print(f"Platform: {platform.platform()}")
|
||||||
|
print(f"CPU count: {os.cpu_count()}")
|
||||||
|
print(f"TZ: {os.environ.get('TZ', 'not set')}")
|
||||||
|
print(f"Temp dir: {tempfile.gettempdir()}")
|
||||||
|
print(f"Parallel: {os.environ.get('PYTEST_XDIST_WORKER', 'not parallel')}")
|
||||||
|
```
|
||||||
|
|
||||||
|
Run in both environments, compare all outputs.
|
||||||
|
|
||||||
|
**Step 2: Increase CI Observation Window**
|
||||||
|
|
||||||
|
For low-probability failures (<5%), run more iterations:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# GitHub Actions example
|
||||||
|
- name: Run test 200x to catch 1% flake
|
||||||
|
run: pytest --count=200 --verbose --log-cli-level=DEBUG test.py
|
||||||
|
|
||||||
|
- name: Upload failure artifacts
|
||||||
|
if: failure()
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: failure-logs
|
||||||
|
path: |
|
||||||
|
*.log
|
||||||
|
screenshots/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Check CI-Specific Factors**
|
||||||
|
|
||||||
|
| Factor | Diagnostic | Fix |
|
||||||
|
|--------|------------|-----|
|
||||||
|
| **Parallelization** | Run `pytest -n 4` locally | Add test isolation (unique IDs, transactions) |
|
||||||
|
| **Resource limits** | Compare CI RAM/CPU to local | Mock expensive operations, add retries |
|
||||||
|
| **Cold starts** | First run vs warm runs | Check caching assumptions |
|
||||||
|
| **Disk I/O speed** | CI may use slower disks | Mock file operations |
|
||||||
|
| **Network latency** | CI network may be slower/different | Mock external calls |
|
||||||
|
|
||||||
|
**Step 4: Replicate CI Environment Locally**
|
||||||
|
|
||||||
|
Use exact CI container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# GitHub Actions uses Ubuntu 22.04
|
||||||
|
docker run -it ubuntu:22.04 bash
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
apt-get update && apt-get install python3.11
|
||||||
|
|
||||||
|
# Run test in container
|
||||||
|
pytest --count=500 test.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 5: Enable CI Debug Mode**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# GitHub Actions - Interactive debugging
|
||||||
|
- name: Setup tmate session (on failure)
|
||||||
|
if: failure()
|
||||||
|
uses: mxschmitt/action-tmate@v3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Quick CI Debugging Checklist
|
||||||
|
|
||||||
|
When test fails only in CI:
|
||||||
|
|
||||||
|
- [ ] Capture environment fingerprint in both CI and local
|
||||||
|
- [ ] Run test with parallelization locally (`pytest -n auto`)
|
||||||
|
- [ ] Check for resource contention (CPU, memory, disk)
|
||||||
|
- [ ] Compare timezone settings (`TZ` env var)
|
||||||
|
- [ ] Upload CI artifacts (logs, screenshots) on failure
|
||||||
|
- [ ] Replicate CI environment with Docker
|
||||||
|
- [ ] Check for cold start issues (first vs subsequent runs)
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
### ❌ Using Retries as Permanent Solution
|
||||||
|
**Fix:** Retries (@pytest.mark.flaky or --reruns) are temporary mitigation during investigation, not fixes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ No Flakiness Tracking
|
||||||
|
**Fix:** Track pass rates in CI, set up alerts for tests dropping below 99%
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Fixing Flaky Tests by Making Them Slower
|
||||||
|
**Fix:** Diagnose root cause - don't just add more wait time
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Ignoring Flaky Tests
|
||||||
|
**Fix:** Quarantine workflow - either fix or delete, never ignore indefinitely
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
**Flakiness Thresholds:**
|
||||||
|
- <1% flake rate: Monitor
|
||||||
|
- 1-5%: Quarantine + fix (medium priority)
|
||||||
|
- 5-10%: Quarantine + fix urgently (high priority)
- >10%: Disable immediately (critical priority)
|
||||||
|
|
||||||
|
**Root Cause Categories:**
|
||||||
|
1. Timing/race conditions → Explicit waits
|
||||||
|
2. Test interdependence → Unique IDs, test isolation
|
||||||
|
3. External dependencies → Mocking
|
||||||
|
4. Time bombs → Freeze time
|
||||||
|
5. Resource contention → Unique resources
|
||||||
|
|
||||||
|
**Detection Tools:**
|
||||||
|
- pytest-repeat (statistical detection)
|
||||||
|
- pytest-randomly (interdependence)
|
||||||
|
- pytest-xdist (race conditions)
|
||||||
|
|
||||||
|
**Quarantine Process:**
|
||||||
|
1. Detect (>1% flake rate)
|
||||||
|
2. Quarantine (mark, exclude from CI)
|
||||||
|
3. Track (create issue)
|
||||||
|
4. Fix (assign owner, 2-week SLA)
|
||||||
|
5. Validate (100/100 passes)
|
||||||
|
6. Re-enable (monitor 1 week)
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Flaky tests are fixable - find the root cause, don't mask with retries.**
|
||||||
|
|
||||||
|
Use detection tools to find flaky tests early. Categorize by symptom, diagnose root cause, apply pattern-based fix. Quarantine if needed, but always with SLA to fix or delete.
|
||||||
445
skills/fuzz-testing/SKILL.md
Normal file
@@ -0,0 +1,445 @@
|
|||||||
|
---
|
||||||
|
name: fuzz-testing
|
||||||
|
description: Use when testing input validation, discovering edge cases, finding security vulnerabilities, testing parsers/APIs with random inputs, or integrating fuzzing tools (AFL, libFuzzer, Atheris) - provides fuzzing strategies, tool selection, and crash triage workflows
|
||||||
|
---
|
||||||
|
|
||||||
|
# Fuzz Testing
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Fuzz testing feeds random/malformed inputs to find crashes, hangs, and security vulnerabilities that manual tests miss.
|
||||||
|
|
||||||
|
**Rule:** Fuzzing finds bugs you didn't know to test for. Use it for security-critical code (parsers, validators, APIs).
|
||||||
|
|
||||||
|
## Fuzz Testing vs Other Testing
|
||||||
|
|
||||||
|
| Test Type | Input | Goal |
|
||||||
|
|-----------|-------|------|
|
||||||
|
| **Unit Testing** | Known valid/invalid inputs | Verify expected behavior |
|
||||||
|
| **Property-Based Testing** | Generated valid inputs | Verify invariants hold |
|
||||||
|
| **Fuzz Testing** | Random/malformed inputs | Find crashes, hangs, memory issues |
|
||||||
|
|
||||||
|
**Fuzzing finds:** Buffer overflows, null pointer dereferences, infinite loops, unhandled exceptions
|
||||||
|
|
||||||
|
**Fuzzing does NOT find:** Logic bugs, performance issues
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## When to Use Fuzz Testing
|
||||||
|
|
||||||
|
**Good candidates:**
|
||||||
|
- Input parsers (JSON, XML, CSV, binary formats)
|
||||||
|
- Network protocol handlers
|
||||||
|
- Image/video codecs
|
||||||
|
- Cryptographic functions
|
||||||
|
- User input validators (file uploads, form data)
|
||||||
|
- APIs accepting untrusted data
|
||||||
|
|
||||||
|
**Poor candidates:**
|
||||||
|
- Business logic (use property-based testing)
|
||||||
|
- UI interactions (use E2E tests)
|
||||||
|
- Database queries (use integration tests)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tool Selection
|
||||||
|
|
||||||
|
| Tool | Language | Type | When to Use |
|
||||||
|
|------|----------|------|-------------|
|
||||||
|
| **Atheris** | Python | Coverage-guided | Python applications, libraries |
|
||||||
|
| **AFL (American Fuzzy Lop)** | C/C++ | Coverage-guided | Native code, high performance |
|
||||||
|
| **libFuzzer** | C/C++/Rust | Coverage-guided | Integrated with LLVM/Clang |
|
||||||
|
| **Jazzer** | Java/JVM | Coverage-guided | Java applications |
|
||||||
|
| **go-fuzz** | Go | Coverage-guided | Go applications |
|
||||||
|
|
||||||
|
**Coverage-guided:** Tracks which code paths are executed, generates inputs to explore new paths
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Basic Fuzzing Example (Python + Atheris)
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install atheris
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Simple Fuzz Test
|
||||||
|
|
||||||
|
```python
|
||||||
|
import atheris
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def parse_email(data):
|
||||||
|
"""Function to fuzz - finds bugs we didn't know about."""
|
||||||
|
if "@" not in data:
|
||||||
|
raise ValueError("Invalid email")
|
||||||
|
|
||||||
|
local, domain = data.split("@", 1)
|
||||||
|
|
||||||
|
if "." not in domain:
|
||||||
|
raise ValueError("Invalid domain")
|
||||||
|
|
||||||
|
# BUG: Crashes on multiple @ symbols!
|
||||||
|
# "user@@example.com" → crashes with ValueError
|
||||||
|
|
||||||
|
return (local, domain)
|
||||||
|
|
||||||
|
@atheris.instrument_func
|
||||||
|
def TestOneInput(data):
|
||||||
|
"""Fuzz harness - called repeatedly with random inputs."""
|
||||||
|
try:
|
||||||
|
parse_email(data.decode('utf-8', errors='ignore'))
|
||||||
|
except (ValueError, UnicodeDecodeError):
|
||||||
|
# Expected exceptions - not crashes
|
||||||
|
pass
|
||||||
|
# Any other exception = crash found!
|
||||||
|
|
||||||
|
atheris.Setup(sys.argv, TestOneInput)
|
||||||
|
atheris.Fuzz()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Run:**
|
||||||
|
```bash
|
||||||
|
python fuzz_email.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
```
|
||||||
|
INFO: Seed: 1234567890
|
||||||
|
INFO: -max_len is not provided; libFuzzer will not generate inputs larger than 4096 bytes
|
||||||
|
#1: NEW coverage: 10 exec/s: 1000
|
||||||
|
#100: NEW coverage: 15 exec/s: 5000
|
||||||
|
CRASH: input was 'user@@example.com'
|
||||||
|
```
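
For contrast with the property-based row in the earlier table, the same `parse_email` can be checked with Hypothesis, which asserts invariants over generated inputs instead of hunting for crashes. A sketch; assumes the `hypothesis` package and that `parse_email` from the example above is importable:

```python
# Property-based counterpart: an invariant check over generated inputs.
# This targets the logic bug (multiple @ accepted) that crash-only fuzzing misses.
from hypothesis import given, strategies as st

@given(st.text())
def test_parse_email_invariant(data):
    try:
        local, domain = parse_email(data)
    except ValueError:
        return  # rejecting input is fine; the property only constrains accepted inputs
    assert data.count("@") == 1, "accepted address must contain exactly one @"
    assert "." in domain, "accepted domain must contain a dot"
```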
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Advanced Fuzzing Patterns
|
||||||
|
|
||||||
|
### Structured Fuzzing (JSON)
|
||||||
|
|
||||||
|
**Problem:** Random bytes rarely form valid JSON
|
||||||
|
|
||||||
|
```python
|
||||||
|
import atheris
|
||||||
|
import json
|
||||||
|
|
||||||
|
@atheris.instrument_func
|
||||||
|
def TestOneInput(data):
|
||||||
|
try:
|
||||||
|
# Parse as JSON
|
||||||
|
obj = json.loads(data.decode('utf-8', errors='ignore'))
|
||||||
|
|
||||||
|
# Fuzz your JSON handler
|
||||||
|
process_user_data(obj)
|
||||||
|
except (json.JSONDecodeError, ValueError, KeyError):
|
||||||
|
pass # Expected for invalid JSON
|
||||||
|
|
||||||
|
def process_user_data(data):
|
||||||
|
"""Crashes on: {"name": "", "age": -1}"""
|
||||||
|
if len(data["name"]) == 0:
|
||||||
|
raise ValueError("Name cannot be empty")
|
||||||
|
if data["age"] < 0:
|
||||||
|
raise ValueError("Age cannot be negative")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Fuzzing with Corpus (Seed Inputs)
|
||||||
|
|
||||||
|
**Corpus:** Collection of valid inputs to start from
|
||||||
|
|
||||||
|
```python
|
||||||
|
import atheris
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Seed corpus: Valid examples
|
||||||
|
CORPUS_DIR = "./corpus"
|
||||||
|
os.makedirs(CORPUS_DIR, exist_ok=True)
|
||||||
|
|
||||||
|
# Create seed files
|
||||||
|
with open(f"{CORPUS_DIR}/valid1.txt", "wb") as f:
|
||||||
|
f.write(b"user@example.com")
|
||||||
|
with open(f"{CORPUS_DIR}/valid2.txt", "wb") as f:
|
||||||
|
f.write(b"alice+tag@subdomain.example.org")
|
||||||
|
|
||||||
|
@atheris.instrument_func
|
||||||
|
def TestOneInput(data):
|
||||||
|
try:
|
||||||
|
parse_email(data.decode('utf-8'))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
atheris.Setup(sys.argv + [CORPUS_DIR], TestOneInput)  # corpus dir is passed as a libFuzzer positional argument
|
||||||
|
atheris.Fuzz()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits:** Faster convergence to interesting inputs
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Crash Triage Workflow
|
||||||
|
|
||||||
|
### 1. Reproduce Crash
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Atheris outputs crash input
|
||||||
|
CRASH: input was b'user@@example.com'
|
||||||
|
|
||||||
|
# Save to file
|
||||||
|
echo "user@@example.com" > crash.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Minimize Input
|
||||||
|
|
||||||
|
**Find smallest input that triggers crash:**
|
||||||
|
|
||||||
|
```bash
# Original crashing input: "user@@example.com"  (17 bytes)
# Minimized:               "@@"                 (2 bytes)

# Reproduce by re-running the harness on the saved input
python fuzz_email.py crash.txt

# Ask libFuzzer to shrink the input automatically
python fuzz_email.py -minimize_crash=1 crash.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Root Cause Analysis
|
||||||
|
|
||||||
|
```python
|
||||||
|
def parse_email(data):
|
||||||
|
# Crash: data = "@@"
|
||||||
|
local, domain = data.split("@", 1)
|
||||||
|
# local = "", domain = "@"
|
||||||
|
|
||||||
|
if "." not in domain:
|
||||||
|
# domain = "@" → no "." → raises ValueError
|
||||||
|
raise ValueError("Invalid domain")
|
||||||
|
|
||||||
|
# FIX: Validate before splitting
|
||||||
|
if data.count("@") != 1:
|
||||||
|
raise ValueError("Email must have exactly one @")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4. Write Regression Test
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_email_multiple_at_symbols():
|
||||||
|
"""Regression test for fuzz-found bug."""
|
||||||
|
with pytest.raises(ValueError, match="exactly one @"):
|
||||||
|
parse_email("user@@example.com")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Integration with CI/CD
|
||||||
|
|
||||||
|
### Continuous Fuzzing (GitHub Actions)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# .github/workflows/fuzz.yml
|
||||||
|
name: Fuzz Testing
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 2 * * *' # Nightly at 2 AM
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
fuzz:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 60 # Run for 1 hour
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: pip install atheris
|
||||||
|
|
||||||
|
- name: Run fuzzing
|
||||||
|
run: |
|
||||||
|
timeout 3600 python fuzz_email.py || true
|
||||||
|
|
||||||
|
- name: Upload crashes
|
||||||
|
if: failure()
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: fuzz-crashes
|
||||||
|
path: crash-*
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why nightly:** Fuzzing is CPU-intensive, not suitable for every PR
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## AFL (C/C++) Example
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Ubuntu/Debian
|
||||||
|
sudo apt-get install afl++
|
||||||
|
|
||||||
|
# macOS
|
||||||
|
brew install afl++
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Fuzz Target
|
||||||
|
|
||||||
|
```c
|
||||||
|
// fuzz_target.c
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
void parse_command(const char *input) {
|
||||||
|
char buffer[64];
|
||||||
|
|
||||||
|
// BUG: Buffer overflow if input > 64 bytes!
|
||||||
|
strcpy(buffer, input);
|
||||||
|
|
||||||
|
if (strcmp(buffer, "exit") == 0) {
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if (argc < 2) return 1;
|
||||||
|
|
||||||
|
FILE *f = fopen(argv[1], "rb");
|
||||||
|
if (!f) return 1;
|
||||||
|
|
||||||
|
char buffer[1024];
|
||||||
|
    size_t len = fread(buffer, 1, sizeof(buffer) - 1, f);  // leave room for the terminator below
|
||||||
|
fclose(f);
|
||||||
|
|
||||||
|
buffer[len] = '\0';
|
||||||
|
parse_command(buffer);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Compile and Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Compile with AFL instrumentation
|
||||||
|
afl-gcc fuzz_target.c -o fuzz_target
|
||||||
|
|
||||||
|
# Create corpus directory
|
||||||
|
mkdir -p corpus
|
||||||
|
echo "exit" > corpus/input1.txt
|
||||||
|
|
||||||
|
# Run fuzzer
|
||||||
|
afl-fuzz -i corpus -o findings -- ./fuzz_target @@
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
```
|
||||||
|
american fuzzy lop 4.00a
|
||||||
|
path : findings/queue
|
||||||
|
crashes : 1
|
||||||
|
hangs : 0
|
||||||
|
execs done : 1000000
|
||||||
|
```
|
||||||
|
|
||||||
|
**Crashes found in:** `findings/crashes/`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Fuzzing Without Sanitizers
|
||||||
|
|
||||||
|
**Symptom:** Memory bugs don't crash, just corrupt silently
|
||||||
|
|
||||||
|
**Fix:** Compile with AddressSanitizer (ASan)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# C/C++: Compile with ASan
|
||||||
|
afl-gcc -fsanitize=address fuzz_target.c -o fuzz_target
|
||||||
|
|
||||||
|
# Python: for native extensions, run Atheris under ASan (see the Atheris docs)
|
||||||
|
```
|
||||||
|
|
||||||
|
**What ASan catches:** Buffer overflows, use-after-free, memory leaks
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Ignoring Hangs
|
||||||
|
|
||||||
|
**Symptom:** Fuzzer reports hangs, not investigated
|
||||||
|
|
||||||
|
**What hangs mean:** Infinite loops, algorithmic complexity attacks
|
||||||
|
|
||||||
|
**Fix:** Investigate and add timeout checks
|
||||||
|
|
||||||
|
```python
|
||||||
|
import signal
|
||||||
|
|
||||||
|
def timeout_handler(signum, frame):
|
||||||
|
raise TimeoutError("Operation timed out")
|
||||||
|
|
||||||
|
@atheris.instrument_func
|
||||||
|
def TestOneInput(data):
|
||||||
|
signal.signal(signal.SIGALRM, timeout_handler)
|
||||||
|
signal.alarm(1) # 1-second timeout
|
||||||
|
|
||||||
|
try:
|
||||||
|
parse_data(data.decode('utf-8'))
|
||||||
|
except (ValueError, TimeoutError):
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
signal.alarm(0)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ No Regression Tests
|
||||||
|
|
||||||
|
**Symptom:** Same bugs found repeatedly
|
||||||
|
|
||||||
|
**Fix:** Add regression test for every crash
|
||||||
|
|
||||||
|
```python
|
||||||
|
# After fuzzing finds crash on input "@@"
|
||||||
|
def test_regression_double_at():
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
parse_email("@@")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Fuzz testing finds crashes and security vulnerabilities by feeding random/malformed inputs. Use it for security-critical code (parsers, validators, APIs).**
|
||||||
|
|
||||||
|
**Setup:**
|
||||||
|
- Use Atheris (Python), AFL (C/C++), or language-specific fuzzer
|
||||||
|
- Start with corpus (valid examples)
|
||||||
|
- Run nightly in CI (1-24 hours)
|
||||||
|
|
||||||
|
**Workflow:**
|
||||||
|
1. Fuzzer finds crash
|
||||||
|
2. Minimize crashing input
|
||||||
|
3. Root cause analysis
|
||||||
|
4. Fix bug
|
||||||
|
5. Add regression test
|
||||||
|
|
||||||
|
**If your code accepts untrusted input (files, network data, user input), you should be fuzzing it. Fuzzing finds bugs that manual testing misses.**
|
||||||
478
skills/integration-testing-patterns/SKILL.md
Normal file
@@ -0,0 +1,478 @@
|
|||||||
|
---
|
||||||
|
name: integration-testing-patterns
|
||||||
|
description: Use when testing component integration, database testing, external service integration, test containers, testing message queues, microservices testing, or designing integration test suites - provides boundary testing patterns and anti-patterns between unit and E2E tests
|
||||||
|
---
|
||||||
|
|
||||||
|
# Integration Testing Patterns
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Integration tests verify that multiple components work together correctly, testing at system boundaries.
|
||||||
|
|
||||||
|
**Rule:** Integration tests sit between unit tests (isolated) and E2E tests (full system). Test the integration points, not full user workflows.
|
||||||
|
|
||||||
|
## Integration Testing vs Unit vs E2E
|
||||||
|
|
||||||
|
| Aspect | Unit Test | Integration Test | E2E Test |
|
||||||
|
|--------|-----------|------------------|----------|
|
||||||
|
| **Scope** | Single function/class | 2-3 components + boundaries | Full system |
|
||||||
|
| **Speed** | Fastest (<1ms) | Medium (10-500ms) | Slowest (1-10s) |
|
||||||
|
| **Dependencies** | All mocked | Real DB/services | Everything real |
|
||||||
|
| **When** | Every commit | Every PR | Before release |
|
||||||
|
| **Coverage** | Business logic | Integration points | Critical workflows |
|
||||||
|
|
||||||
|
**Test Pyramid:**
|
||||||
|
- **70% Unit:** Pure logic, no I/O
|
||||||
|
- **20% Integration:** Database, APIs, message queues
|
||||||
|
- **10% E2E:** Browser tests, full workflows
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What to Integration Test
|
||||||
|
|
||||||
|
### 1. Database Integration
|
||||||
|
|
||||||
|
**Test: Repository/DAO layer with real database**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
|
||||||
|
@pytest.fixture(scope="function")
|
||||||
|
def db_session():
|
||||||
|
"""Each test gets fresh DB with rollback."""
|
||||||
|
engine = create_engine("postgresql://localhost/test_db")
|
||||||
|
Session = sessionmaker(bind=engine)
|
||||||
|
session = Session()
|
||||||
|
|
||||||
|
yield session
|
||||||
|
|
||||||
|
session.rollback()  # Undo the test's uncommitted changes
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
def test_user_repository_create(db_session):
|
||||||
|
"""Integration test: Repository + Database."""
|
||||||
|
repo = UserRepository(db_session)
|
||||||
|
|
||||||
|
user = repo.create(email="alice@example.com", name="Alice")
|
||||||
|
|
||||||
|
assert user.id is not None
|
||||||
|
assert repo.get_by_email("alice@example.com").id == user.id
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why integration test:**
|
||||||
|
- Verifies SQL queries work
|
||||||
|
- Catches FK constraint violations
|
||||||
|
- Tests database-specific features (JSON columns, full-text search)
|
||||||
|
|
||||||
|
**NOT unit test because:** Uses real database
|
||||||
|
**NOT E2E test because:** Doesn't test full user workflow
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. External API Integration
|
||||||
|
|
||||||
|
**Test: Service layer calling third-party API**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
import responses
|
||||||
|
|
||||||
|
@responses.activate
|
||||||
|
def test_payment_service_integration():
|
||||||
|
"""Integration test: PaymentService + Stripe API (mocked)."""
|
||||||
|
# Mock Stripe API response
|
||||||
|
responses.add(
|
||||||
|
responses.POST,
|
||||||
|
"https://api.stripe.com/v1/charges",
|
||||||
|
json={"id": "ch_123", "status": "succeeded"},
|
||||||
|
status=200
|
||||||
|
)
|
||||||
|
|
||||||
|
service = PaymentService(api_key="test_key")
|
||||||
|
result = service.charge(amount=1000, token="tok_visa")
|
||||||
|
|
||||||
|
assert result.status == "succeeded"
|
||||||
|
assert result.charge_id == "ch_123"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why integration test:**
|
||||||
|
- Tests HTTP client configuration
|
||||||
|
- Validates request/response parsing
|
||||||
|
- Verifies error handling
|
||||||
|
|
||||||
|
**When to use real API:**
|
||||||
|
- Separate integration test suite (nightly; a gating sketch follows below)
|
||||||
|
- Contract tests (see contract-testing skill)
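
One lightweight way to keep that real-API suite out of the per-PR run is an environment-variable gate. A sketch using the `PaymentService` class from the example above; the `RUN_REAL_API_TESTS` flag and `STRIPE_TEST_API_KEY` variable are illustrative assumptions:

```python
import os
import pytest

# Only run against the real sandbox API when explicitly requested,
# e.g. in a nightly job: RUN_REAL_API_TESTS=1 pytest tests/integration/real_api
requires_real_api = pytest.mark.skipif(
    not os.getenv("RUN_REAL_API_TESTS"),
    reason="set RUN_REAL_API_TESTS=1 to run tests against the real sandbox API",
)

@requires_real_api
def test_payment_service_against_sandbox():
    service = PaymentService(api_key=os.environ["STRIPE_TEST_API_KEY"])  # assumed env var
    result = service.charge(amount=1000, token="tok_visa")
    assert result.status == "succeeded"
```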
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Message Queue Integration
|
||||||
|
|
||||||
|
**Test: Producer/Consumer with real queue**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
from kombu import Connection
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def rabbitmq_connection():
|
||||||
|
"""Real RabbitMQ connection for integration tests."""
|
||||||
|
conn = Connection("amqp://localhost")
|
||||||
|
yield conn
|
||||||
|
conn.release()
|
||||||
|
|
||||||
|
def test_order_queue_integration(rabbitmq_connection):
|
||||||
|
"""Integration test: OrderService + RabbitMQ."""
|
||||||
|
publisher = OrderPublisher(rabbitmq_connection)
|
||||||
|
consumer = OrderConsumer(rabbitmq_connection)
|
||||||
|
|
||||||
|
# Publish message
|
||||||
|
publisher.publish({"order_id": 123, "status": "pending"})
|
||||||
|
|
||||||
|
# Consume message
|
||||||
|
message = consumer.get(timeout=5)
|
||||||
|
|
||||||
|
assert message["order_id"] == 123
|
||||||
|
assert message["status"] == "pending"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why integration test:**
|
||||||
|
- Verifies serialization/deserialization
|
||||||
|
- Tests queue configuration (exchanges, routing keys)
|
||||||
|
- Validates message durability
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4. Microservices Integration
|
||||||
|
|
||||||
|
**Test: Service A → Service B communication**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_user_service():
|
||||||
|
"""Mock User Service for integration tests."""
|
||||||
|
with responses.RequestsMock() as rsps:
|
||||||
|
rsps.add(
|
||||||
|
responses.GET,
|
||||||
|
"http://user-service/users/123",
|
||||||
|
json={"id": 123, "name": "Alice"},
|
||||||
|
status=200
|
||||||
|
)
|
||||||
|
yield rsps
|
||||||
|
|
||||||
|
def test_order_service_integration(mock_user_service):
|
||||||
|
"""Integration test: OrderService + UserService."""
|
||||||
|
order_service = OrderService(user_service_url="http://user-service")
|
||||||
|
|
||||||
|
order = order_service.create_order(user_id=123, items=[...])
|
||||||
|
|
||||||
|
assert order.user_name == "Alice"
|
||||||
|
```
|
||||||
|
|
||||||
|
**For real service integration:** Use contract tests (see contract-testing skill)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Containers Pattern
|
||||||
|
|
||||||
|
**Use Docker containers for integration tests.**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
from testcontainers.postgres import PostgresContainer
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def postgres_container():
|
||||||
|
"""Start PostgreSQL container for tests."""
|
||||||
|
with PostgresContainer("postgres:15") as postgres:
|
||||||
|
yield postgres
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def db_connection(postgres_container):
|
||||||
|
"""Database connection from test container."""
|
||||||
|
engine = create_engine(postgres_container.get_connection_url())
|
||||||
|
return engine.connect()
|
||||||
|
|
||||||
|
def test_user_repository(db_connection):
|
||||||
|
repo = UserRepository(db_connection)
|
||||||
|
user = repo.create(email="alice@example.com")
|
||||||
|
assert user.id is not None
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
- Clean database per test run
|
||||||
|
- Matches production environment
|
||||||
|
- No manual setup required
|
||||||
|
|
||||||
|
**When NOT to use:**
|
||||||
|
- Unit tests (too slow)
|
||||||
|
- CI without Docker support
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Boundary Testing Strategy
|
||||||
|
|
||||||
|
**Test at system boundaries, not internal implementation.**
|
||||||
|
|
||||||
|
**Boundaries to test:**
|
||||||
|
1. **Application → Database** (SQL queries, ORMs)
|
||||||
|
2. **Application → External API** (HTTP clients, SDKs)
|
||||||
|
3. **Application → File System** (File I/O, uploads)
|
||||||
|
4. **Application → Message Queue** (Producers/consumers)
|
||||||
|
5. **Service A → Service B** (Microservice calls)
|
||||||
|
|
||||||
|
**Example: Boundary test for file upload**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_file_upload_integration(tmp_path):
|
||||||
|
"""Integration test: FileService + File System."""
|
||||||
|
service = FileService(storage_path=str(tmp_path))
|
||||||
|
|
||||||
|
# Upload file
|
||||||
|
file_id = service.upload(filename="test.txt", content=b"Hello")
|
||||||
|
|
||||||
|
# Verify file exists on disk
|
||||||
|
file_path = tmp_path / file_id / "test.txt"
|
||||||
|
assert file_path.exists()
|
||||||
|
assert file_path.read_bytes() == b"Hello"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Testing Internal Implementation
|
||||||
|
|
||||||
|
**Symptom:** Integration test verifies internal method calls
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: Testing implementation, not integration
|
||||||
|
def test_order_service():
|
||||||
|
with patch('order_service._calculate_tax') as mock_tax:
|
||||||
|
service.create_order(...)
|
||||||
|
assert mock_tax.called
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why bad:** Not testing integration point, just internal logic
|
||||||
|
|
||||||
|
**Fix:** Test actual boundary (database, API, etc.)
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ GOOD: Test database integration
|
||||||
|
def test_order_service(db_session):
|
||||||
|
service = OrderService(db_session)
|
||||||
|
order = service.create_order(...)
|
||||||
|
|
||||||
|
# Verify data was persisted
|
||||||
|
saved_order = db_session.query(Order).get(order.id)
|
||||||
|
assert saved_order.total == order.total
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Full System Tests Disguised as Integration Tests
|
||||||
|
|
||||||
|
**Symptom:** "Integration test" requires entire system running
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: This is an E2E test, not integration test
|
||||||
|
def test_checkout_flow():
|
||||||
|
# Requires: Web server, database, Redis, Stripe, email service
|
||||||
|
browser.goto("http://localhost:8000/checkout")
|
||||||
|
browser.fill("#card", "4242424242424242")
|
||||||
|
browser.click("#submit")
|
||||||
|
assert "Success" in browser.content()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why bad:** Slow, fragile, hard to debug
|
||||||
|
|
||||||
|
**Fix:** Test individual integration points
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ GOOD: Integration test for payment component only
|
||||||
|
def test_payment_integration(mock_stripe):
|
||||||
|
service = PaymentService()
|
||||||
|
result = service.charge(amount=1000, token="tok_visa")
|
||||||
|
assert result.status == "succeeded"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Shared Test Data Across Integration Tests
|
||||||
|
|
||||||
|
**Symptom:** Tests fail when run in different orders
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: Relies on shared database state
|
||||||
|
def test_get_user():
|
||||||
|
user = db.query(User).filter_by(email="test@example.com").first()
|
||||||
|
assert user.name == "Test User"
|
||||||
|
|
||||||
|
def test_update_user():
|
||||||
|
user = db.query(User).filter_by(email="test@example.com").first()
|
||||||
|
user.name = "Updated"
|
||||||
|
db.commit()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Each test creates its own data (see test-isolation-fundamentals skill)
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ GOOD: Isolated test data
|
||||||
|
def test_get_user(db_session):
|
||||||
|
user = create_test_user(db_session, email="test@example.com")
|
||||||
|
retrieved = db_session.query(User).get(user.id)
|
||||||
|
assert retrieved.name == user.name
|
||||||
|
```
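
The `create_test_user` helper above is assumed rather than defined in this skill; a minimal sketch, using the same SQLAlchemy-style session and the application's `User` model, might look like this:

```python
import uuid

def create_test_user(db_session, email=None, name="Test User"):
    """Create and persist a user with unique-by-default data for one test."""
    user = User(
        email=email or f"user-{uuid.uuid4().hex[:8]}@example.com",  # unique per test
        name=name,
    )
    db_session.add(user)
    db_session.flush()  # assign an id without committing, so rollback still cleans up
    return user
```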
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Testing Too Many Layers
|
||||||
|
|
||||||
|
**Symptom:** Integration test includes business logic validation
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: Testing logic + integration in same test
|
||||||
|
def test_order_calculation(db_session):
|
||||||
|
order = OrderService(db_session).create_order(...)
|
||||||
|
|
||||||
|
# Integration: DB save
|
||||||
|
assert order.id is not None
|
||||||
|
|
||||||
|
# Logic: Tax calculation (should be unit test!)
|
||||||
|
assert order.tax == order.subtotal * 0.08
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Separate concerns
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ GOOD: Unit test for logic
|
||||||
|
def test_order_tax_calculation():
|
||||||
|
order = Order(subtotal=100)
|
||||||
|
assert order.calculate_tax() == 8.0
|
||||||
|
|
||||||
|
# ✅ GOOD: Integration test for persistence
|
||||||
|
def test_order_persistence(db_session):
|
||||||
|
repo = OrderRepository(db_session)
|
||||||
|
order = repo.create(subtotal=100, tax=8.0)
|
||||||
|
assert repo.get(order.id).tax == 8.0
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Integration Test Environments
|
||||||
|
|
||||||
|
### Local Development
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# docker-compose.test.yml
|
||||||
|
version: '3.8'
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: postgres:15
|
||||||
|
environment:
|
||||||
|
POSTGRES_DB: test_db
|
||||||
|
POSTGRES_USER: test
|
||||||
|
POSTGRES_PASSWORD: test
|
||||||
|
|
||||||
|
redis:
|
||||||
|
image: redis:7
|
||||||
|
|
||||||
|
rabbitmq:
|
||||||
|
image: rabbitmq:3-management
|
||||||
|
```
|
||||||
|
|
||||||
|
**Run tests:**
|
||||||
|
```bash
|
||||||
|
docker-compose -f docker-compose.test.yml up -d
|
||||||
|
pytest tests/integration/
|
||||||
|
docker-compose -f docker-compose.test.yml down
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### CI/CD
|
||||||
|
|
||||||
|
```yaml
# .github/workflows/integration-tests.yml
name: Integration Tests

on: [pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    services:
      postgres:
        image: postgres:15
        env:
          POSTGRES_PASSWORD: test
          POSTGRES_DB: test
        ports:
          - 5432:5432
        options: >-
          --health-cmd pg_isready
          --health-interval 10s

    steps:
      - uses: actions/checkout@v3
      - name: Run integration tests
        run: pytest tests/integration/
        env:
          DATABASE_URL: postgresql://postgres:test@localhost/test
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
**Integration tests are slower than unit tests.**
|
||||||
|
|
||||||
|
**Optimization strategies:**
|
||||||
|
|
||||||
|
1. **Use transactions:** Rollback instead of truncating tables (100x faster)
|
||||||
|
2. **Parallelize:** Run integration tests in parallel (`pytest -n 4`)
|
||||||
|
3. **Minimize I/O:** Only test integration points, not full workflows
|
||||||
|
4. **Cache containers:** Reuse test containers across tests (scope="module")
|
||||||
|
|
||||||
|
**Example: Fast integration tests**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Slow: 5 seconds per test
|
||||||
|
@pytest.fixture
|
||||||
|
def db():
|
||||||
|
engine = create_engine(...)
|
||||||
|
Base.metadata.create_all(engine) # Recreate schema every test
|
||||||
|
yield engine
|
||||||
|
Base.metadata.drop_all(engine)
|
||||||
|
|
||||||
|
# Fast: 10ms per test
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def db_engine():
|
||||||
|
engine = create_engine(...)
|
||||||
|
Base.metadata.create_all(engine) # Once per module
|
||||||
|
yield engine
|
||||||
|
Base.metadata.drop_all(engine)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def db_session(db_engine):
|
||||||
|
connection = db_engine.connect()
|
||||||
|
transaction = connection.begin()
|
||||||
|
session = Session(bind=connection)
|
||||||
|
yield session
|
||||||
|
transaction.rollback() # Fast cleanup
|
||||||
|
connection.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Integration tests verify that components work together at system boundaries.**
|
||||||
|
|
||||||
|
- Test at boundaries (DB, API, queue), not internal logic
|
||||||
|
- Use real dependencies (DB, queue) or realistic mocks (external APIs)
|
||||||
|
- Keep tests isolated (transactions, test containers, unique data)
|
||||||
|
- Run on every PR (they're slower than unit tests but faster than E2E)
|
||||||
|
|
||||||
|
**If your "integration test" requires the entire system running, it's an E2E test. Test integration points individually.**
|
||||||
843
skills/load-testing-patterns/SKILL.md
Normal file
@@ -0,0 +1,843 @@
|
|||||||
|
---
|
||||||
|
name: load-testing-patterns
|
||||||
|
description: Use when designing load tests, choosing tools (k6, JMeter, Gatling), calculating concurrent users from DAU, interpreting latency degradation, identifying bottlenecks, or running spike/soak/stress tests - provides test patterns, anti-patterns, and load calculation frameworks
|
||||||
|
---
|
||||||
|
|
||||||
|
# Load Testing Patterns
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Test realistic load patterns, not constant artificial load. Find limits before users do.
|
||||||
|
|
||||||
|
**Rule:** Load testing reveals system behavior under stress. Without it, production is your load test.
|
||||||
|
|
||||||
|
## Tool Selection Decision Tree
|
||||||
|
|
||||||
|
| Your Need | Protocol | Team Skills | Use | Why |
|
||||||
|
|-----------|----------|-------------|-----|-----|
|
||||||
|
| Modern API testing | HTTP/REST/GraphQL | JavaScript | **k6** | Best dev experience, CI/CD friendly |
|
||||||
|
| Enterprise/complex protocols | HTTP/SOAP/JMS/JDBC | Java/GUI comfort | **JMeter** | Mature, comprehensive protocols |
|
||||||
|
| Python team | HTTP/WebSocket | Python | **Locust** | Pythonic, easy scripting |
|
||||||
|
| High performance/complex scenarios | HTTP/gRPC | Scala/Java | **Gatling** | Best reports, high throughput |
|
||||||
|
| Cloud-native at scale | HTTP/WebSocket | Any (SaaS) | **Artillery, Flood.io** | Managed, distributed |
|
||||||
|
|
||||||
|
**First choice:** k6 (modern, scriptable, excellent CI/CD integration)
|
||||||
|
|
||||||
|
**Why not ApacheBench/wrk:** Too simple for realistic scenarios, no complex user flows
|
||||||
|
|
||||||
|
## Test Pattern Library
|
||||||
|
|
||||||
|
| Pattern | Purpose | Duration | When to Use |
|
||||||
|
|---------|---------|----------|-------------|
|
||||||
|
| **Smoke Test** | Verify test works | 1-2 min | Before every test run |
|
||||||
|
| **Load Test** | Normal/peak capacity | 10-30 min | Regular capacity validation |
|
||||||
|
| **Stress Test** | Find breaking point | 20-60 min | Understand limits |
|
||||||
|
| **Spike Test** | Sudden traffic surge | 5-15 min | Black Friday, launch events |
|
||||||
|
| **Soak Test** | Memory leaks, stability | 1-8 hours | Pre-release validation |
|
||||||
|
| **Capacity Test** | Max sustainable load | Variable | Capacity planning |
|
||||||
|
|
||||||
|
### Smoke Test
|
||||||
|
|
||||||
|
**Goal:** Verify test script works with minimal load
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 smoke test
|
||||||
|
export let options = {
|
||||||
|
vus: 1,
|
||||||
|
duration: '1m',
|
||||||
|
thresholds: {
|
||||||
|
http_req_duration: ['p(95)<500'], // 95% < 500ms
|
||||||
|
http_req_failed: ['rate<0.01'], // <1% errors
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Purpose:** Catch test script bugs before running expensive full tests
|
||||||
|
|
||||||
|
### Load Test (Ramp-Up Pattern)
|
||||||
|
|
||||||
|
**Goal:** Test normal and peak expected load
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 load test with ramp-up
|
||||||
|
export let options = {
|
||||||
|
stages: [
|
||||||
|
{ duration: '5m', target: 100 }, // Ramp to normal load
|
||||||
|
{ duration: '10m', target: 100 }, // Hold at normal
|
||||||
|
{ duration: '5m', target: 200 }, // Ramp to peak
|
||||||
|
{ duration: '10m', target: 200 }, // Hold at peak
|
||||||
|
{ duration: '5m', target: 0 }, // Ramp down
|
||||||
|
],
|
||||||
|
thresholds: {
|
||||||
|
http_req_duration: ['p(95)<500', 'p(99)<1000'],
|
||||||
|
http_req_failed: ['rate<0.05'],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Pattern:** Gradual ramp-up → sustain → ramp down. Never start at peak.
|
||||||
|
|
||||||
|
### Stress Test (Breaking Point)
|
||||||
|
|
||||||
|
**Goal:** Find system limits
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 stress test
|
||||||
|
export let options = {
|
||||||
|
stages: [
|
||||||
|
{ duration: '5m', target: 100 }, // Normal
|
||||||
|
{ duration: '5m', target: 300 }, // Above peak
|
||||||
|
{ duration: '5m', target: 600 }, // 2x peak
|
||||||
|
{ duration: '5m', target: 900 }, // 3x peak (expect failure)
|
||||||
|
{ duration: '10m', target: 0 }, // Recovery
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Success:** Identify at what load system degrades (not necessarily breaking completely)
|
||||||
|
|
||||||
|
### Spike Test (Sudden Surge)
|
||||||
|
|
||||||
|
**Goal:** Test sudden traffic bursts (viral post, email campaign)
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 spike test
|
||||||
|
export let options = {
|
||||||
|
stages: [
|
||||||
|
{ duration: '1m', target: 100 }, // Normal
|
||||||
|
{ duration: '30s', target: 1000 }, // SPIKE to 10x
|
||||||
|
{ duration: '5m', target: 1000 }, // Hold spike
|
||||||
|
{ duration: '2m', target: 100 }, // Back to normal
|
||||||
|
{ duration: '5m', target: 100 }, // Recovery check
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tests:** Auto-scaling, circuit breakers, rate limiting
|
||||||
|
|
||||||
|
### Soak Test (Endurance)
|
||||||
|
|
||||||
|
**Goal:** Find memory leaks, resource exhaustion over time
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 soak test
|
||||||
|
export let options = {
|
||||||
|
stages: [
|
||||||
|
{ duration: '5m', target: 100 }, // Ramp
|
||||||
|
{ duration: '4h', target: 100 }, // Soak (sustained load)
|
||||||
|
{ duration: '5m', target: 0 }, // Ramp down
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Monitor:** Memory growth, connection leaks, disk space, file descriptors
|
||||||
|
|
||||||
|
**Duration:** Minimum 1 hour, ideally 4-8 hours
|
||||||
|
|
||||||
|
## Load Calculation Framework
|
||||||
|
|
||||||
|
**Problem:** Convert "10,000 daily active users" to concurrent load
|
||||||
|
|
||||||
|
### Step 1: DAU to Concurrent Users
|
||||||
|
|
||||||
|
```
|
||||||
|
Concurrent Users = DAU × Concurrency Ratio × Peak Multiplier
|
||||||
|
|
||||||
|
Concurrency Ratios by App Type:
|
||||||
|
- Web apps: 5-10%
|
||||||
|
- Social media: 10-20%
|
||||||
|
- Business apps: 20-30% (work hours)
|
||||||
|
- Gaming: 15-25%
|
||||||
|
|
||||||
|
Peak Multiplier: 1.5-2x for safety margin
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```
|
||||||
|
DAU = 10,000
|
||||||
|
Concurrency = 10% (web app)
|
||||||
|
Peak Multiplier = 1.5
|
||||||
|
|
||||||
|
Concurrent Users = 10,000 × 0.10 × 1.5 = 1,500 concurrent users
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Concurrent Users to Requests/Second
|
||||||
|
|
||||||
|
```
|
||||||
|
RPS = (Concurrent Users × Requests per Session) / (Session Duration × Think Time Ratio)
|
||||||
|
|
||||||
|
Think Time Ratio:
|
||||||
|
- Active browsing: 0.3-0.5 (30-50% time clicking/typing)
|
||||||
|
- Reading-heavy: 0.1-0.2 (10-20% active)
|
||||||
|
- API clients: 0.8-1.0 (80-100% active)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```
|
||||||
|
Concurrent Users = 1,500
|
||||||
|
Requests per Session = 20
|
||||||
|
Session Duration = 10 minutes = 600 seconds
|
||||||
|
Think Time Ratio = 0.3 (web browsing)
|
||||||
|
|
||||||
|
RPS = (1,500 × 20) / (600 × 0.3) = 30,000 / 180 = 167 RPS
|
||||||
|
```
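
The same arithmetic as a small helper, so you can plug in your own numbers. A sketch following the two formulas above; the defaults are the web-app values from the tables:

```python
def estimate_load(dau, concurrency_ratio=0.10, peak_multiplier=1.5,
                  requests_per_session=20, session_duration_s=600,
                  think_time_ratio=0.3):
    """Convert daily active users into concurrent users and requests/second."""
    concurrent_users = dau * concurrency_ratio * peak_multiplier
    rps = (concurrent_users * requests_per_session) / (session_duration_s * think_time_ratio)
    return round(concurrent_users), round(rps)

# Matches the worked example: 10,000 DAU -> 1,500 concurrent users, ~167 RPS
print(estimate_load(10_000))  # (1500, 167)
```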
|
||||||
|
|
||||||
|
### Step 3: Model Realistic Patterns
|
||||||
|
|
||||||
|
Don't use constant load. Use realistic traffic patterns:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// Realistic daily pattern
|
||||||
|
export let options = {
|
||||||
|
stages: [
|
||||||
|
// Morning ramp
|
||||||
|
{ duration: '2h', target: 500 }, // 08:00-10:00
|
||||||
|
{ duration: '2h', target: 1000 }, // 10:00-12:00 (peak)
|
||||||
|
// Lunch dip
|
||||||
|
{ duration: '1h', target: 600 }, // 12:00-13:00
|
||||||
|
// Afternoon peak
|
||||||
|
{ duration: '2h', target: 1200 }, // 13:00-15:00 (peak)
|
||||||
|
{ duration: '2h', target: 800 }, // 15:00-17:00
|
||||||
|
// Evening drop
|
||||||
|
{ duration: '2h', target: 300 }, // 17:00-19:00
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Coordinated Omission
|
||||||
|
**Symptom:** Fixed rate load generation ignores slow responses, underestimating latency
|
||||||
|
|
||||||
|
**Why bad:** Hides real latency impact when system slows down
|
||||||
|
|
||||||
|
**Fix:** Use arrival rate (requests/sec) not iteration rate
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// ❌ Bad - coordinated omission
|
||||||
|
export default function() {
|
||||||
|
http.get('https://api.example.com')
|
||||||
|
sleep(1) // Wait 1s between requests
|
||||||
|
}
|
||||||
|
|
||||||
|
// ✅ Good - arrival rate pacing
|
||||||
|
export let options = {
|
||||||
|
scenarios: {
|
||||||
|
constant_arrival_rate: {
|
||||||
|
executor: 'constant-arrival-rate',
|
||||||
|
rate: 100, // 100 RPS regardless of response time
|
||||||
|
timeUnit: '1s',
|
||||||
|
duration: '10m',
|
||||||
|
preAllocatedVUs: 50,
|
||||||
|
maxVUs: 200,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Cold Start Testing
|
||||||
|
**Symptom:** Running load test immediately after deployment without warm-up
|
||||||
|
|
||||||
|
**Why bad:** JIT compilation, cache warming, connection pooling haven't stabilized
|
||||||
|
|
||||||
|
**Fix:** Warm-up phase before measurement
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// ✅ Good - warm-up phase
|
||||||
|
export let options = {
|
||||||
|
stages: [
|
||||||
|
{ duration: '2m', target: 50 }, // Warm-up (not measured)
|
||||||
|
{ duration: '10m', target: 100 }, // Actual test
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Unrealistic Test Data
|
||||||
|
**Symptom:** Using same user ID, same query parameters for all virtual users
|
||||||
|
|
||||||
|
**Why bad:** Caches give unrealistic performance, doesn't test real database load
|
||||||
|
|
||||||
|
**Fix:** Parameterized, realistic data
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// ❌ Bad - same data
|
||||||
|
http.get('https://api.example.com/users/123')
|
||||||
|
|
||||||
|
// ✅ Good - parameterized data
|
||||||
|
import { SharedArray } from 'k6/data'
|
||||||
|
import papaparse from 'https://jslib.k6.io/papaparse/5.1.1/index.js'
|
||||||
|
|
||||||
|
const csvData = new SharedArray('users', function () {
|
||||||
|
return papaparse.parse(open('./users.csv'), { header: true }).data
|
||||||
|
})
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
const user = csvData[__VU % csvData.length]
|
||||||
|
http.get(`https://api.example.com/users/${user.id}`)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Constant Load Pattern
|
||||||
|
**Symptom:** Running with constant VUs instead of realistic traffic pattern
|
||||||
|
|
||||||
|
**Why bad:** Real traffic has peaks, valleys, not flat line
|
||||||
|
|
||||||
|
**Fix:** Use realistic daily/hourly patterns (see the daily pattern example under Step 3 of the load calculation framework)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Ignoring Think Time
|
||||||
|
**Symptom:** No delays between requests, hammering API as fast as possible
|
||||||
|
|
||||||
|
**Why bad:** Unrealistic user behavior, overestimates load
|
||||||
|
|
||||||
|
**Fix:** Add realistic think time based on user behavior
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// ✅ Good - realistic think time
|
||||||
|
import { sleep } from 'k6'
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
http.get('https://api.example.com/products')
|
||||||
|
sleep(Math.random() * 3 + 2) // 2-5 seconds browsing
|
||||||
|
|
||||||
|
http.post('https://api.example.com/cart', {...})
|
||||||
|
sleep(Math.random() * 5 + 5) // 5-10 seconds deciding
|
||||||
|
|
||||||
|
http.post('https://api.example.com/checkout', {...})
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Result Interpretation Guide
|
||||||
|
|
||||||
|
### Latency Degradation Patterns
|
||||||
|
|
||||||
|
| Pattern | Cause | What to Check |
|
||||||
|
|---------|-------|---------------|
|
||||||
|
| **Linear growth** (2x users → 2x latency) | CPU-bound | Thread pool, CPU usage |
|
||||||
|
| **Exponential growth** (2x users → 10x latency) | Resource saturation | Connection pools, locks, queues |
|
||||||
|
| **Sudden cliff** (works until X, then fails) | Hard limit hit | Max connections, memory, file descriptors |
|
||||||
|
| **Gradual degradation** (slow increase over time) | Memory leak, cache pollution | Memory trends, GC activity |
|
||||||
|
|
||||||
|
### Bottleneck Classification
|
||||||
|
|
||||||
|
**Symptom: p95 latency 10x at 2x load**
|
||||||
|
→ **Resource saturation** (database connection pool, thread pool, queue)
|
||||||
|
|
||||||
|
**Symptom: Errors increase with load**
|
||||||
|
→ **Hard limit** (connection limit, rate limiting, timeout)
|
||||||
|
|
||||||
|
**Symptom: Latency grows over time at constant load**
|
||||||
|
→ **Memory leak** or **cache pollution**
|
||||||
|
|
||||||
|
**Symptom: High variance (p50 good, p99 terrible)**
|
||||||
|
→ **GC pauses**, **lock contention**, or **slow queries**
|
||||||
|
|
||||||
|
### What to Monitor
|
||||||
|
|
||||||
|
| Layer | Metrics to Track |
|
||||||
|
|-------|------------------|
|
||||||
|
| **Application** | Request rate, error rate, p50/p95/p99 latency, active requests |
|
||||||
|
| **Runtime** | GC pauses (JVM, .NET), thread pool usage, heap/memory |
|
||||||
|
| **Database** | Connection pool usage, query latency, lock waits, slow queries |
|
||||||
|
| **Infrastructure** | CPU %, memory %, disk I/O, network throughput |
|
||||||
|
| **External** | Third-party API latency, rate limit hits |
|
||||||
|
|
||||||
|
### Capacity Planning Formula
|
||||||
|
|
||||||
|
```
|
||||||
|
Safe Capacity = (Breaking Point × Degradation Factor) × Safety Margin
|
||||||
|
|
||||||
|
Breaking Point = VUs where p95 latency > threshold
|
||||||
|
Degradation Factor = 0.7 (start degradation before break)
|
||||||
|
Safety Margin = 0.5-0.7 (handle traffic spikes)
|
||||||
|
|
||||||
|
Example:
|
||||||
|
- System breaks at 1000 VUs (p95 > 1s)
|
||||||
|
- Start seeing degradation at 700 VUs (70%)
|
||||||
|
- Safe capacity: 700 × 0.7 = 490 VUs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Authentication and Session Management
|
||||||
|
|
||||||
|
**Problem:** Real APIs require authentication. Can't use same token for all virtual users.
|
||||||
|
|
||||||
|
### Token Strategy Decision Framework
|
||||||
|
|
||||||
|
| Scenario | Strategy | Why |
|
||||||
|
|----------|----------|-----|
|
||||||
|
| **Short test (<10 min)** | Pre-generate tokens | Fast, simple, no login load |
|
||||||
|
| **Long test (soak)** | Login during test + refresh | Realistic, tests auth system |
|
||||||
|
| **Testing auth system** | Simulate login flow | Auth is part of load |
|
||||||
|
| **Read-only testing** | Shared token (single user) | Simplest, adequate for API-only tests |
|
||||||
|
|
||||||
|
**Default:** Pre-generate tokens for load tests, simulate login for auth system tests
|
||||||
|
|
||||||
|
### Pre-Generated Tokens Pattern
|
||||||
|
|
||||||
|
**Best for:** API testing where auth system isn't being tested
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 with pre-generated JWT tokens
|
||||||
|
import http from 'k6/http'
|
||||||
|
import { SharedArray } from 'k6/data'
|
||||||
|
|
||||||
|
// Load tokens from file (generated externally)
|
||||||
|
const tokens = new SharedArray('auth tokens', function () {
|
||||||
|
return JSON.parse(open('./tokens.json'))
|
||||||
|
})
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
const token = tokens[__VU % tokens.length]
|
||||||
|
|
||||||
|
const headers = {
|
||||||
|
'Authorization': `Bearer ${token}`
|
||||||
|
}
|
||||||
|
|
||||||
|
http.get('https://api.example.com/protected', { headers })
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Generate tokens externally:**
|
||||||
|
|
||||||
|
```bash
# Generate 1000 tokens as a JSON array (what the k6 script's JSON.parse expects)
for i in {1..1000}; do
  curl -s -X POST https://api.example.com/login \
    -d "username=loadtest_user_$i&password=test" \
    | jq '.token'
done | jq -s '.' > tokens.json
```
|
||||||
|
|
||||||
|
**Pros:** No login load, fast test setup
|
||||||
|
**Cons:** Tokens may expire during long tests, not testing auth flow
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Login Flow Simulation Pattern
|
||||||
|
|
||||||
|
**Best for:** Testing auth system, soak tests where tokens expire
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 with login simulation
|
||||||
|
import http from 'k6/http'
|
||||||
|
import { SharedArray } from 'k6/data'
|
||||||
|
|
||||||
|
const users = new SharedArray('users', function () {
|
||||||
|
return JSON.parse(open('./users.json')) // [{username, password}, ...]
|
||||||
|
})
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
const user = users[__VU % users.length]
|
||||||
|
|
||||||
|
// Login to get token
|
||||||
|
const loginRes = http.post('https://api.example.com/login', {
|
||||||
|
username: user.username,
|
||||||
|
password: user.password
|
||||||
|
})
|
||||||
|
|
||||||
|
const token = loginRes.json('token')
|
||||||
|
|
||||||
|
// Use token for subsequent requests
|
||||||
|
const headers = { 'Authorization': `Bearer ${token}` }
|
||||||
|
|
||||||
|
http.get('https://api.example.com/protected', { headers })
|
||||||
|
http.post('https://api.example.com/data', {}, { headers })
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Token refresh for long tests:**
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 with token refresh
|
||||||
|
import http from 'k6/http'
import { sleep } from 'k6'
|
||||||
|
|
||||||
|
let token = null
|
||||||
|
let tokenExpiry = 0
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
const now = Date.now() / 1000
|
||||||
|
|
||||||
|
// Refresh token if expired or about to expire
|
||||||
|
if (!token || now > tokenExpiry - 300) { // Refresh 5 min before expiry
|
||||||
|
const loginRes = http.post('https://api.example.com/login', {...})
|
||||||
|
token = loginRes.json('token')
|
||||||
|
tokenExpiry = loginRes.json('expires_at')
|
||||||
|
}
|
||||||
|
|
||||||
|
http.get('https://api.example.com/protected', {
|
||||||
|
headers: { 'Authorization': `Bearer ${token}` }
|
||||||
|
})
|
||||||
|
|
||||||
|
sleep(1)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Session Cookie Management
|
||||||
|
|
||||||
|
**For cookie-based auth:**
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 with session cookies
|
||||||
|
import http from 'k6/http'
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
// k6 automatically handles cookies with jar
|
||||||
|
const jar = http.cookieJar()
|
||||||
|
|
||||||
|
// Login (sets session cookie)
|
||||||
|
http.post('https://example.com/login', {
|
||||||
|
username: 'user',
|
||||||
|
password: 'pass'
|
||||||
|
})
|
||||||
|
|
||||||
|
// Subsequent requests use session cookie automatically
|
||||||
|
http.get('https://example.com/dashboard')
|
||||||
|
http.get('https://example.com/profile')
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Rate Limiting Detection
|
||||||
|
|
||||||
|
**Pattern:** Detect when hitting rate limits during load test
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 rate limit detection
|
||||||
|
import { check } from 'k6'
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
const res = http.get('https://api.example.com/data')
|
||||||
|
|
||||||
|
check(res, {
|
||||||
|
'not rate limited': (r) => r.status !== 429
|
||||||
|
})
|
||||||
|
|
||||||
|
if (res.status === 429) {
|
||||||
|
console.warn(`Rate limited at VU ${__VU}, iteration ${__ITER}`)
|
||||||
|
const retryAfter = res.headers['Retry-After']
|
||||||
|
console.warn(`Retry-After: ${retryAfter} seconds`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thresholds for rate limiting:**
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
export let options = {
|
||||||
|
thresholds: {
|
||||||
|
'http_req_failed{status:429}': ['rate<0.01'] // <1% rate limited
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Third-Party Dependency Handling
|
||||||
|
|
||||||
|
**Problem:** APIs call external services (payment, email, third-party APIs). Should you mock them?
|
||||||
|
|
||||||
|
### Mock vs Real Decision Framework
|
||||||
|
|
||||||
|
| External Service | Mock or Real? | Why |
|
||||||
|
|------------------|---------------|-----|
|
||||||
|
| **Payment gateway** | Real (sandbox) | Need to test integration, has sandbox mode |
|
||||||
|
| **Email provider** | Mock | Cost ($0.001/email × 1000 VUs = expensive), no value testing |
|
||||||
|
| **Third-party API (has staging)** | Real (staging) | Test integration, realistic latency |
|
||||||
|
| **Third-party API (no staging)** | Mock | Can't load test production, rate limits |
|
||||||
|
| **Internal microservices** | Real | Testing real integration points |
|
||||||
|
| **Analytics/tracking** | Mock | High volume, no functional impact |
|
||||||
|
|
||||||
|
**Rule:** Use real services if they have sandbox/staging. Mock if expensive, rate-limited, or no test environment.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Service Virtualization with WireMock
|
||||||
|
|
||||||
|
**Best for:** Mocking HTTP APIs with realistic responses
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 test pointing to WireMock
|
||||||
|
export default function() {
|
||||||
|
// WireMock running on localhost:8080 mocks external API
|
||||||
|
const res = http.get('http://localhost:8080/api/payment/process')
|
||||||
|
|
||||||
|
check(res, {
|
||||||
|
'payment mock responds': (r) => r.status === 200
|
||||||
|
})
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**WireMock stub setup:**
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"request": {
|
||||||
|
"method": "POST",
|
||||||
|
"url": "/api/payment/process"
|
||||||
|
},
|
||||||
|
"response": {
|
||||||
|
"status": 200,
|
||||||
|
"jsonBody": {
|
||||||
|
"transaction_id": "{{randomValue type='UUID'}}",
|
||||||
|
"status": "approved"
|
||||||
|
},
|
||||||
|
"headers": {
|
||||||
|
"Content-Type": "application/json"
|
||||||
|
},
|
||||||
|
"fixedDelayMilliseconds": 200
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why WireMock:** Realistic latency simulation, dynamic responses, stateful mocking
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Partial Mocking Pattern
|
||||||
|
|
||||||
|
**Pattern:** Mock some services, use real for others
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// k6 with partial mocking
|
||||||
|
import http from 'k6/http'
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
// Real API (points to staging)
|
||||||
|
const productRes = http.get('https://staging-api.example.com/products')
|
||||||
|
|
||||||
|
// Mock email service (points to WireMock)
|
||||||
|
http.post('http://localhost:8080/mock/email/send', {
|
||||||
|
to: 'user@example.com',
|
||||||
|
subject: 'Order confirmation'
|
||||||
|
})
|
||||||
|
|
||||||
|
// Real payment sandbox
|
||||||
|
http.post('https://sandbox-payment.stripe.com/charge', {
|
||||||
|
amount: 1000,
|
||||||
|
currency: 'usd',
|
||||||
|
source: 'tok_visa'
|
||||||
|
})
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Decision criteria:**
|
||||||
|
- Real: Services with sandbox, need integration validation, low cost
|
||||||
|
- Mock: No sandbox, expensive, rate-limited, testing failure scenarios
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Testing External Service Failures
|
||||||
|
|
||||||
|
**Use mocks to simulate failures:**
|
||||||
|
|
||||||
|
```json
{
  "request": {
    "method": "POST",
    "url": "/api/payment/process"
  },
  "response": {
    "status": 503,
    "jsonBody": {
      "error": "Service temporarily unavailable"
    },
    "fixedDelayMilliseconds": 5000
  }
}
```

The 5-second `fixedDelayMilliseconds` simulates a slow failure.
|
||||||
|
|
||||||
|
**k6 test for resilience:**
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
export default function() {
|
||||||
|
const res = http.post('http://localhost:8080/api/payment/process', {})
|
||||||
|
|
||||||
|
// Verify app handles payment failures gracefully
|
||||||
|
check(res, {
|
||||||
|
'handles payment failure': (r) => r.status === 503,
|
||||||
|
'returns within timeout': (r) => r.timings.duration < 6000
|
||||||
|
})
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Cost and Compliance Guardrails
|
||||||
|
|
||||||
|
**Before testing with real external services:**
|
||||||
|
|
||||||
|
| Check | Why |
|
||||||
|
|-------|-----|
|
||||||
|
| **Sandbox mode exists?** | Avoid production costs/rate limits |
|
||||||
|
| **Cost per request?** | 1000 VUs × 10 req/s × 600s = 6M requests |
|
||||||
|
| **Rate limits?** | Will you hit external service limits? |
|
||||||
|
| **Terms of service?** | Does load testing violate TOS? |
|
||||||
|
| **Data privacy?** | Using real user emails/PII? |
|
||||||
|
|
||||||
|
**Example cost calculation:**
|
||||||
|
|
||||||
|
```
|
||||||
|
Email service: $0.001/email
|
||||||
|
Load test: 100 VUs × 5 emails/session × 600s = 300,000 emails
|
||||||
|
Cost: 300,000 × $0.001 = $300
|
||||||
|
|
||||||
|
Decision: Mock email service, use real payment sandbox (free)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Compliance:**
|
||||||
|
- Don't use real user data in load tests (GDPR, privacy)
|
||||||
|
- Check third-party TOS (some prohibit load testing)
|
||||||
|
- Use synthetic test data only
|
||||||
|
|
||||||
|
## Your First Load Test
|
||||||
|
|
||||||
|
**Goal:** Basic load test in one day
|
||||||
|
|
||||||
|
**Hour 1-2: Install tool and write smoke test**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install k6
|
||||||
|
brew install k6 # macOS
|
||||||
|
# or snap install k6 # Linux
|
||||||
|
|
||||||
|
# Create test.js
|
||||||
|
cat > test.js <<'EOF'
|
||||||
|
import http from 'k6/http'
|
||||||
|
import { check, sleep } from 'k6'
|
||||||
|
|
||||||
|
export let options = {
|
||||||
|
vus: 1,
|
||||||
|
duration: '30s'
|
||||||
|
}
|
||||||
|
|
||||||
|
export default function() {
|
||||||
|
let res = http.get('https://your-api.com/health')
|
||||||
|
check(res, {
|
||||||
|
'status is 200': (r) => r.status === 200,
|
||||||
|
'response < 500ms': (r) => r.timings.duration < 500
|
||||||
|
})
|
||||||
|
sleep(1)
|
||||||
|
}
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Run smoke test
|
||||||
|
k6 run test.js
|
||||||
|
```
|
||||||
|
|
||||||
|
**Hour 3-4: Calculate target load**
|
||||||
|
|
||||||
|
```
|
||||||
|
Your DAU: 10,000
|
||||||
|
Concurrency: 10%
|
||||||
|
Peak multiplier: 1.5
|
||||||
|
Target: 10,000 × 0.10 × 1.5 = 1,500 VUs
|
||||||
|
```
|
||||||
|
|
||||||
|
**Hour 5-6: Write load test with ramp-up**
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
export let options = {
|
||||||
|
stages: [
|
||||||
|
{ duration: '5m', target: 750 }, // Ramp to normal (50%)
|
||||||
|
{ duration: '10m', target: 750 }, // Hold normal
|
||||||
|
{ duration: '5m', target: 1500 }, // Ramp to peak
|
||||||
|
{ duration: '10m', target: 1500 }, // Hold peak
|
||||||
|
{ duration: '5m', target: 0 }, // Ramp down
|
||||||
|
],
|
||||||
|
thresholds: {
|
||||||
|
http_req_duration: ['p(95)<500', 'p(99)<1000'],
|
||||||
|
http_req_failed: ['rate<0.05'] // < 5% errors
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Hour 7-8: Run test and analyze**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run load test
|
||||||
|
k6 run --out json=results.json test.js
|
||||||
|
|
||||||
|
# Check summary output for:
|
||||||
|
# - p95/p99 latency trends
|
||||||
|
# - Error rates
|
||||||
|
# - When degradation started
|
||||||
|
```
|
||||||
|
|
||||||
|
**If test fails:** Check thresholds, adjust targets, investigate bottlenecks
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
### ❌ Testing Production Without Safeguards
|
||||||
|
**Fix:** Use feature flags, test environment, or controlled percentage
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ No Baseline Performance Metrics
|
||||||
|
**Fix:** Run smoke test first to establish baseline before load testing
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Using Iteration Duration Instead of Arrival Rate
|
||||||
|
**Fix:** Use `constant-arrival-rate` executor in k6
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Not Warming Up Caches/JIT
|
||||||
|
**Fix:** 2-5 minute warm-up phase before measurement
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
**Tool Selection:**
|
||||||
|
- Modern API: k6
|
||||||
|
- Enterprise: JMeter
|
||||||
|
- Python team: Locust
|
||||||
|
|
||||||
|
**Test Patterns:**
|
||||||
|
- Smoke: 1 VU, 1 min
|
||||||
|
- Load: Ramp-up → peak → ramp-down
|
||||||
|
- Stress: Increase until break
|
||||||
|
- Spike: Sudden 10x surge
|
||||||
|
- Soak: 4-8 hours constant
|
||||||
|
|
||||||
|
**Load Calculation:**
|
||||||
|
```
|
||||||
|
Concurrent = DAU × 0.10 × 1.5
|
||||||
|
RPS = (Concurrent × Requests/Session) / (Duration × Think Time)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Anti-Patterns:**
|
||||||
|
- Coordinated omission (use arrival rate)
|
||||||
|
- Cold start (warm-up first)
|
||||||
|
- Unrealistic data (parameterize)
|
||||||
|
- Constant load (use realistic patterns)
|
||||||
|
|
||||||
|
**Result Interpretation:**
|
||||||
|
- Linear growth → CPU-bound
|
||||||
|
- Exponential growth → Resource saturation
|
||||||
|
- Sudden cliff → Hard limit
|
||||||
|
- Gradual degradation → Memory leak
|
||||||
|
|
||||||
|
**Authentication:**
|
||||||
|
- Short tests: Pre-generate tokens
|
||||||
|
- Long tests: Login + refresh
|
||||||
|
- Testing auth: Simulate login flow
|
||||||
|
|
||||||
|
**Third-Party Dependencies:**
|
||||||
|
- Has sandbox: Use real (staging/sandbox)
|
||||||
|
- Expensive/rate-limited: Mock (WireMock)
|
||||||
|
- No sandbox: Mock
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Start with smoke test (1 VU). Calculate realistic load from DAU. Use ramp-up pattern (never start at peak). Monitor p95/p99 latency. Find breaking point before users do.**
|
||||||
|
|
||||||
|
Test realistic scenarios with think time, not hammer tests.
|
||||||
348
skills/mutation-testing/SKILL.md
Normal file
@@ -0,0 +1,348 @@
|
|||||||
|
---
|
||||||
|
name: mutation-testing
|
||||||
|
description: Use when validating test effectiveness, measuring test quality beyond coverage, choosing mutation testing tools (Stryker, PITest, mutmut), interpreting mutation scores, or improving test suites - provides mutation operators, score interpretation, and integration patterns
|
||||||
|
---
|
||||||
|
|
||||||
|
# Mutation Testing
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Mutation testing validates that your tests actually test something by introducing bugs and checking if tests catch them.
|
||||||
|
|
||||||
|
**Rule:** 100% code coverage doesn't mean good tests. Mutation score measures if tests detect bugs.
|
||||||
|
|
||||||
|
## Code Coverage vs Mutation Score
|
||||||
|
|
||||||
|
| Metric | What It Measures | Example |
|
||||||
|
|--------|------------------|---------|
|
||||||
|
| **Code Coverage** | Lines executed by tests | `calculate_tax(100)` executes code = 100% coverage |
|
||||||
|
| **Mutation Score** | Bugs detected by tests | Change `*` to `/` → test still passes = poor tests |
|
||||||
|
|
||||||
|
**Problem with coverage:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def calculate_tax(amount):
|
||||||
|
return amount * 0.08
|
||||||
|
|
||||||
|
def test_calculate_tax():
|
||||||
|
calculate_tax(100) # 100% coverage, but asserts nothing!
|
||||||
|
```
|
||||||
|
|
||||||
|
**Mutation testing catches this:**
|
||||||
|
1. Mutates `* 0.08` to `/ 0.08`
|
||||||
|
2. Runs test
|
||||||
|
3. Test still passes → **Survived mutation** (bad test!)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How Mutation Testing Works
|
||||||
|
|
||||||
|
**Process:**
|
||||||
|
1. **Create mutant:** Change code slightly (e.g., `+` → `-`, `<` → `<=`)
|
||||||
|
2. **Run tests:** Do tests fail?
|
||||||
|
3. **Classify:**
|
||||||
|
- **Killed:** Test failed → Good test!
|
||||||
|
- **Survived:** Test passed → Test doesn't verify this logic
|
||||||
|
- **Timeout:** Test hung → Usually killed
|
||||||
|
- **No coverage:** Not executed → Add test
|
||||||
|
|
||||||
|
**Mutation Score:**
|
||||||
|
```
|
||||||
|
Mutation Score = (Killed Mutants / Total Mutants) × 100
|
||||||
|
```
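
For example, a suite that kills 42 of 50 generated mutants scores 42 / 50 × 100 = 84%.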
|
||||||
|
|
||||||
|
**Thresholds:**
|
||||||
|
- **> 80%:** Excellent test quality
|
||||||
|
- **60-80%:** Acceptable
|
||||||
|
- **< 60%:** Tests are weak
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tool Selection
|
||||||
|
|
||||||
|
| Language | Tool | Why |
|
||||||
|
|----------|------|-----|
|
||||||
|
| **JavaScript/TypeScript** | **Stryker** | Best JS support, framework-agnostic |
|
||||||
|
| **Java** | **PITest** | Industry standard, Maven/Gradle integration |
|
||||||
|
| **Python** | **mutmut** | Simple, fast, pytest integration |
|
||||||
|
| **C#** | **Stryker.NET** | .NET ecosystem integration |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Example: Python with mutmut
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install mutmut
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Basic Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run mutation testing
|
||||||
|
mutmut run
|
||||||
|
|
||||||
|
# View results
|
||||||
|
mutmut results
|
||||||
|
|
||||||
|
# Show survived mutants (bugs your tests missed)
|
||||||
|
mutmut show
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
```toml
|
||||||
|
# setup.cfg
|
||||||
|
[mutmut]
|
||||||
|
paths_to_mutate=src/
|
||||||
|
backup=False
|
||||||
|
runner=python -m pytest -x
|
||||||
|
tests_dir=tests/
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/calculator.py
|
||||||
|
def calculate_discount(price, percent):
|
||||||
|
if percent > 100:
|
||||||
|
raise ValueError("Percent cannot exceed 100")
|
||||||
|
return price * (1 - percent / 100)
|
||||||
|
|
||||||
|
# tests/test_calculator.py
|
||||||
|
def test_calculate_discount():
|
||||||
|
result = calculate_discount(100, 20)
|
||||||
|
assert result == 80
|
||||||
|
```
|
||||||
|
|
||||||
|
**Run mutmut:**
|
||||||
|
```bash
|
||||||
|
mutmut run
|
||||||
|
```
|
||||||
|
|
||||||
|
**Possible mutations:**
|
||||||
|
1. `percent > 100` → `percent >= 100` (boundary)
|
||||||
|
2. `1 - percent` → `1 + percent` (operator)
|
||||||
|
3. `percent / 100` → `percent * 100` (operator)
|
||||||
|
4. `price * (...)` → `price / (...)` (operator)
|
||||||
|
|
||||||
|
**Results:**
|
||||||
|
- Mutation 1 **survived** (test doesn't check boundary)
|
||||||
|
- Mutation 2, 3, 4 **killed** (test catches these)
|
||||||
|
|
||||||
|
**Improvement:**
|
||||||
|
```python
|
||||||
|
def test_calculate_discount_boundary():
|
||||||
|
# Catch mutation 1
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
calculate_discount(100, 101)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Mutation Operators
|
||||||
|
|
||||||
|
| Operator | Original | Mutated | What It Tests |
|
||||||
|
|----------|----------|---------|---------------|
|
||||||
|
| **Arithmetic** | `a + b` | `a - b` | Calculation logic |
|
||||||
|
| **Relational** | `a < b` | `a <= b` | Boundary conditions |
|
||||||
|
| **Logical** | `a and b` | `a or b` | Boolean logic |
|
||||||
|
| **Unary** | `+x` | `-x` | Sign handling |
|
||||||
|
| **Constant** | `return 0` | `return 1` | Magic numbers |
|
||||||
|
| **Return** | `return x` | `return None` | Return value validation |
|
||||||
|
| **Statement deletion** | `x = 5` | (deleted) | Side effects |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Interpreting Mutation Score
|
||||||
|
|
||||||
|
### High Score (> 80%)
|
||||||
|
|
||||||
|
**Good tests that catch most bugs.**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def add(a, b):
|
||||||
|
return a + b
|
||||||
|
|
||||||
|
def test_add():
|
||||||
|
assert add(2, 3) == 5
|
||||||
|
assert add(-1, 1) == 0
|
||||||
|
assert add(0, 0) == 0
|
||||||
|
|
||||||
|
# Mutations killed:
|
||||||
|
# - a - b (returns -1, test expects 5)
|
||||||
|
# - a * b (returns 6, test expects 5)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Low Score (< 60%)
|
||||||
|
|
||||||
|
**Weak tests that don't verify logic.**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def validate_email(email):
|
||||||
|
return "@" in email and "." in email
|
||||||
|
|
||||||
|
def test_validate_email():
|
||||||
|
validate_email("user@example.com") # No assertion!
|
||||||
|
|
||||||
|
# Mutations survived:
|
||||||
|
# - "@" in email → "@" not in email
|
||||||
|
# - "and" → "or"
|
||||||
|
# - (All mutations survive because test asserts nothing)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Survived Mutants to Investigate
|
||||||
|
|
||||||
|
**Priority order:**
|
||||||
|
1. **Business logic mutations** (calculations, validations)
|
||||||
|
2. **Boundary conditions** (`<` → `<=`, `>` → `>=`)
|
||||||
|
3. **Error handling** (exception raising)
|
||||||
|
|
||||||
|
**Low priority:**
|
||||||
|
4. **Logging statements**
|
||||||
|
5. **Constants that don't affect behavior**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Integration with CI/CD
|
||||||
|
|
||||||
|
### GitHub Actions (Python)
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# .github/workflows/mutation-testing.yml
|
||||||
|
name: Mutation Testing
|
||||||
|
|
||||||
|
on:
|
||||||
|
schedule:
|
||||||
|
- cron: '0 2 * * 0' # Weekly on Sunday 2 AM
|
||||||
|
workflow_dispatch: # Manual trigger
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
mutmut:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install mutmut pytest
|
||||||
|
|
||||||
|
- name: Run mutation testing
|
||||||
|
run: mutmut run
|
||||||
|
|
||||||
|
- name: Generate report
|
||||||
|
run: |
|
||||||
|
mutmut results
|
||||||
|
mutmut html # Generate HTML report
|
||||||
|
|
||||||
|
- name: Upload report
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: mutation-report
|
||||||
|
path: html/
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why weekly, not every PR:**
|
||||||
|
- Mutation testing is slow (10-100x slower than regular tests)
|
||||||
|
- Runs every possible mutation
|
||||||
|
- Not needed for every change
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Chasing 100% Mutation Score
|
||||||
|
|
||||||
|
**Symptom:** Writing tests just to kill surviving mutants
|
||||||
|
|
||||||
|
**Why bad:**
|
||||||
|
- Some mutations are equivalent (don't change behavior)
|
||||||
|
- Diminishing returns after 85%
|
||||||
|
- Time better spent on integration tests
|
||||||
|
|
||||||
|
**Fix:** Target 80-85%, focus on business logic
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Ignoring Equivalent Mutants
|
||||||
|
|
||||||
|
**Symptom:** "95% mutation score, still have survived mutants"
|
||||||
|
|
||||||
|
**Equivalent mutants:** Changes that don't affect behavior
|
||||||
|
|
||||||
|
```python
|
||||||
|
def is_positive(x):
|
||||||
|
return x > 0
|
||||||
|
|
||||||
|
# Mutation: x > 0 → x >= 0
|
||||||
|
# If input is never exactly 0, this mutation is equivalent
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Mark as equivalent in tool config
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# mutmut - review surviving mutants
mutmut results

# For a mutant judged equivalent, stop mutating that line
# by annotating the source line with:  # pragma: no mutate
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Running Mutation Tests on Every Commit
|
||||||
|
|
||||||
|
**Symptom:** CI takes 2 hours
|
||||||
|
|
||||||
|
**Why bad:** Mutation testing is 10-100x slower than regular tests
|
||||||
|
|
||||||
|
**Fix:**
|
||||||
|
- Run weekly or nightly
|
||||||
|
- Run on core modules only (not entire codebase)
|
||||||
|
- Use as quality metric, not blocker
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Incremental Mutation Testing
|
||||||
|
|
||||||
|
**Test only changed code:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# mutmut - test only modified files
|
||||||
|
mutmut run --paths-to-mutate "$(git diff --name-only main -- '*.py' | paste -sd, -)"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
- Faster feedback (minutes instead of hours)
|
||||||
|
- Can run on PRs
|
||||||
|
- Focuses on new code
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Mutation testing measures if your tests actually detect bugs. High code coverage doesn't mean good tests.**
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
- Run weekly/nightly, not on every commit (too slow)
|
||||||
|
- Target 80-85% mutation score for business logic
|
||||||
|
- Use mutmut (Python), Stryker (JS), PITest (Java)
|
||||||
|
- Focus on killed vs survived mutants
|
||||||
|
- Ignore equivalent mutants
|
||||||
|
|
||||||
|
**If your tests have 95% coverage but 40% mutation score, your tests aren't testing anything meaningful. Fix the tests, not the coverage metric.**
|
||||||
479
skills/observability-and-monitoring/SKILL.md
Normal file
@@ -0,0 +1,479 @@
|
|||||||
|
---
|
||||||
|
name: observability-and-monitoring
|
||||||
|
description: Use when implementing metrics/logs/traces, defining SLIs/SLOs, designing alerts, choosing observability tools, debugging alert fatigue, or optimizing observability costs - provides SRE frameworks, anti-patterns, and implementation patterns
|
||||||
|
---
|
||||||
|
|
||||||
|
# Observability and Monitoring
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Measure what users care about, alert on symptoms not causes, make alerts actionable.
|
||||||
|
|
||||||
|
**Rule:** Observability without actionability is just expensive logging.
|
||||||
|
|
||||||
|
**Already have observability tools (CloudWatch, Datadog, etc.)?** Optimize what you have first. Most observability problems are usage/process issues, not tooling. Implement SLIs/SLOs, clean up alerts, add runbooks with existing tools. Migrate only if you hit concrete tool limitations (cost, features, multi-cloud). Tool migration is expensive - make sure it solves a real problem.
|
||||||
|
|
||||||
|
## Getting Started Decision Tree
|
||||||
|
|
||||||
|
| Team Size | Scale | Starting Point | Tools |
|
||||||
|
|-----------|-------|----------------|-------|
|
||||||
|
| 1-5 engineers | <10 services | Metrics + logs | Prometheus + Grafana + Loki |
|
||||||
|
| 5-20 engineers | 10-50 services | Metrics + logs + basic traces | Add Jaeger, OpenTelemetry |
|
||||||
|
| 20+ engineers | 50+ services | Full observability + SLOs | Managed platform (Datadog, Grafana Cloud) |
|
||||||
|
|
||||||
|
**First step:** Implement metrics with OpenTelemetry + Prometheus
|
||||||
|
|
||||||
|
**Why this order:** Metrics give you fastest time-to-value (detect issues), logs help debug (understand what happened), traces solve complex distributed problems (debug cross-service issues)
|
||||||
|
|
||||||
|
## Three Pillars Quick Reference
|
||||||
|
|
||||||
|
### Metrics (Quantitative, aggregated)
|
||||||
|
|
||||||
|
**When to use:** Alerting, dashboards, trend analysis
|
||||||
|
|
||||||
|
**What to collect:**
|
||||||
|
- **RED method** (services): Rate, Errors, Duration
|
||||||
|
- **USE method** (resources): Utilization, Saturation, Errors
|
||||||
|
- **Four Golden Signals**: Latency, traffic, errors, saturation
|
||||||
|
|
||||||
|
**Implementation:**
|
||||||
|
```python
|
||||||
|
# OpenTelemetry metrics
|
||||||
|
from opentelemetry import metrics
|
||||||
|
|
||||||
|
meter = metrics.get_meter(__name__)
|
||||||
|
request_counter = meter.create_counter(
|
||||||
|
"http_requests_total",
|
||||||
|
description="Total HTTP requests"
|
||||||
|
)
|
||||||
|
request_duration = meter.create_histogram(
|
||||||
|
"http_request_duration_seconds",
|
||||||
|
description="HTTP request duration"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Instrument request
|
||||||
|
request_counter.add(1, {"method": "GET", "endpoint": "/api/users"})
|
||||||
|
request_duration.record(duration, {"method": "GET", "endpoint": "/api/users"})
|
||||||
|
```
|
||||||
|
|
||||||
|
### Logs (Discrete events)
|
||||||
|
|
||||||
|
**When to use:** Debugging, audit trails, error investigation
|
||||||
|
|
||||||
|
**Best practices:**
|
||||||
|
- Structured logging (JSON)
|
||||||
|
- Include correlation IDs
|
||||||
|
- Don't log sensitive data (PII, secrets); see the scrubbing sketch below
|
||||||
|
|
||||||
|
**Implementation:**
|
||||||
|
```python
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
log = structlog.get_logger()
|
||||||
|
log.info(
|
||||||
|
"user_login",
|
||||||
|
user_id=user_id,
|
||||||
|
correlation_id=correlation_id,
|
||||||
|
ip_address=ip,
|
||||||
|
duration_ms=duration
|
||||||
|
)
|
||||||
|
```
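
To enforce the "no sensitive data" rule rather than rely on discipline, a structlog processor can scrub known-sensitive keys before rendering. A minimal sketch; the key list is an assumption you would extend for your domain:

```python
import structlog

SENSITIVE_KEYS = {"password", "token", "ssn", "credit_card"}

def scrub_sensitive(logger, method_name, event_dict):
    """structlog processor: redact values for known-sensitive keys."""
    for key in SENSITIVE_KEYS & event_dict.keys():
        event_dict[key] = "[REDACTED]"
    return event_dict

structlog.configure(
    processors=[scrub_sensitive, structlog.processors.JSONRenderer()],
)

structlog.get_logger().info("user_login", user_id=42, password="hunter2")
# password is emitted as "[REDACTED]"; user_id is unchanged
```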
|
||||||
|
|
||||||
|
### Traces (Request flows)
|
||||||
|
|
||||||
|
**When to use:** Debugging distributed systems, latency investigation
|
||||||
|
|
||||||
|
**Implementation:**
|
||||||
|
```python
|
||||||
|
from opentelemetry import trace
|
||||||
|
|
||||||
|
tracer = trace.get_tracer(__name__)
|
||||||
|
|
||||||
|
with tracer.start_as_current_span("process_order") as span:
|
||||||
|
span.set_attribute("order.id", order_id)
|
||||||
|
span.set_attribute("user.id", user_id)
|
||||||
|
# Process order logic
|
||||||
|
```
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Vanity Metrics
|
||||||
|
**Symptom:** Tracking metrics that look impressive but don't inform decisions
|
||||||
|
|
||||||
|
**Why bad:** Wastes resources, distracts from actionable metrics
|
||||||
|
|
||||||
|
**Fix:** Only collect metrics that answer "should I page someone?" or inform business decisions
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad - vanity metric
|
||||||
|
total_requests_all_time_counter.inc()
|
||||||
|
|
||||||
|
# ✅ Good - actionable metric
|
||||||
|
request_error_rate.labels(service="api", endpoint="/users").observe(error_rate)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Alert on Everything
|
||||||
|
**Symptom:** Hundreds of alerts per day, team ignores most of them
|
||||||
|
|
||||||
|
**Why bad:** Alert fatigue, real issues get missed, on-call burnout
|
||||||
|
|
||||||
|
**Fix:** Alert only on user-impacting symptoms that require immediate action
|
||||||
|
|
||||||
|
**Test:** "If this alert fires at 2am, should someone wake up to fix it?" If no, it's not an alert.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ No Runbooks
|
||||||
|
**Symptom:** Alerts fire with no guidance on how to respond
|
||||||
|
|
||||||
|
**Why bad:** Increased MTTR, inconsistent responses, on-call stress
|
||||||
|
|
||||||
|
**Fix:** Every alert must link to a runbook with investigation steps
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# ✅ Good alert with runbook
|
||||||
|
alert: HighErrorRate
|
||||||
|
annotations:
|
||||||
|
summary: "Error rate >5% on {{$labels.service}}"
|
||||||
|
description: "Current: {{$value}}%"
|
||||||
|
runbook: "https://wiki.company.com/runbooks/high-error-rate"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Cardinality Explosion
|
||||||
|
**Symptom:** Metrics with unbounded labels (user IDs, timestamps, UUIDs) cause storage/performance issues
|
||||||
|
|
||||||
|
**Why bad:** Expensive storage, slow queries, potential system failure
|
||||||
|
|
||||||
|
**Fix:** Use fixed cardinality labels, aggregate high-cardinality dimensions
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad - unbounded cardinality
|
||||||
|
request_counter.labels(user_id=user_id).inc() # Millions of unique series
|
||||||
|
|
||||||
|
# ✅ Good - bounded cardinality
|
||||||
|
request_counter.labels(user_type="premium", region="us-east").inc()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Missing Correlation IDs
|
||||||
|
**Symptom:** Can't trace requests across services, debugging takes hours
|
||||||
|
|
||||||
|
**Why bad:** High MTTR, frustrated engineers, customer impact
|
||||||
|
|
||||||
|
**Fix:** Generate correlation ID at entry point, propagate through all services
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ Good - correlation ID propagation
|
||||||
|
import uuid
|
||||||
|
from contextvars import ContextVar
|
||||||
|
|
||||||
|
correlation_id_var = ContextVar("correlation_id", default=None)
|
||||||
|
|
||||||
|
def handle_request():
|
||||||
|
correlation_id = request.headers.get("X-Correlation-ID") or str(uuid.uuid4())
|
||||||
|
correlation_id_var.set(correlation_id)
|
||||||
|
|
||||||
|
# All logs and traces include it automatically
|
||||||
|
log.info("processing_request", extra={"correlation_id": correlation_id})
|
||||||
|
```
|
||||||
|
|
||||||
|
## SLI Selection Framework
|
||||||
|
|
||||||
|
**Principle:** Measure user experience, not system internals
|
||||||
|
|
||||||
|
### Four Golden Signals
|
||||||
|
|
||||||
|
| Signal | Definition | Example SLI |
|
||||||
|
|--------|------------|-------------|
|
||||||
|
| **Latency** | Request response time | p99 latency < 200ms |
|
||||||
|
| **Traffic** | Demand on system | Requests per second |
|
||||||
|
| **Errors** | Failed requests | Error rate < 0.1% |
|
||||||
|
| **Saturation** | Resource fullness | CPU < 80%, queue depth < 100 |
|
||||||
|
|
||||||
|
### RED Method (for services)
|
||||||
|
|
||||||
|
- **Rate**: Requests per second
|
||||||
|
- **Errors**: Error rate (%)
|
||||||
|
- **Duration**: Response time (p50, p95, p99)
|
||||||
|
|
||||||
|
### USE Method (for resources)
|
||||||
|
|
||||||
|
- **Utilization**: % time resource busy (CPU %, disk I/O %)
|
||||||
|
- **Saturation**: Queue depth, wait time
|
||||||
|
- **Errors**: Error count
|
||||||
|
|
||||||
|
**Decision framework:**
|
||||||
|
|
||||||
|
| Service Type | Recommended SLIs |
|
||||||
|
|--------------|------------------|
|
||||||
|
| **User-facing API** | Availability (%), p95 latency, error rate |
|
||||||
|
| **Background jobs** | Freshness (time since last run), success rate, processing time |
|
||||||
|
| **Data pipeline** | Data freshness, completeness (%), processing latency |
|
||||||
|
| **Storage** | Availability, durability, latency percentiles |
|
||||||
|
|
||||||
|
## SLO Definition Guide
|
||||||
|
|
||||||
|
**SLO = Target value for SLI**
|
||||||
|
|
||||||
|
**Formula:** `SLO = (good events / total events) >= target`
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```
|
||||||
|
SLI: Request success rate
|
||||||
|
SLO: 99.9% of requests succeed (measured over 30 days)
|
||||||
|
Error budget: 0.1% = ~43 minutes downtime/month
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error Budget
|
||||||
|
|
||||||
|
**Definition:** Amount of unreliability you can tolerate
|
||||||
|
|
||||||
|
**Calculation:**
|
||||||
|
```
|
||||||
|
Error budget = 1 - SLO target
|
||||||
|
If SLO = 99.9%, error budget = 0.1%
|
||||||
|
For 1M requests/month: 1,000 requests can fail
|
||||||
|
```
|
||||||
|
|
||||||
|
**Usage:** Balance reliability vs feature velocity
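
As a sanity check, the budget arithmetic above is easy to script. A minimal sketch, assuming a 30-day window and 1M requests/month:

```python
def error_budget(slo_target, window_days=30, monthly_requests=1_000_000):
    """Translate an SLO target into an error budget (allowed downtime and failed requests)."""
    budget_fraction = 1 - slo_target                      # 99.9% -> 0.001
    budget_minutes = budget_fraction * window_days * 24 * 60
    budget_requests = budget_fraction * monthly_requests
    return budget_minutes, budget_requests

minutes, failures = error_budget(0.999)
print(f"~{minutes:.0f} min downtime, {failures:.0f} failed requests allowed per month")
# -> ~43 min downtime, 1000 failed requests allowed per month
```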
|
||||||
|
|
||||||
|
### Multi-Window Multi-Burn-Rate Alerting
|
||||||
|
|
||||||
|
**Problem:** Simple threshold alerts are either too noisy or too slow
|
||||||
|
|
||||||
|
**Solution:** Alert based on how fast you're burning error budget
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Alert if burning budget 14.4x faster than acceptable (2% of a 30-day budget in 1 hour)
|
||||||
|
alert: ErrorBudgetBurnRateHigh
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
rate(errors[1h]) / rate(requests[1h])
|
||||||
|
) > (14.4 * (1 - 0.999))
|
||||||
|
annotations:
|
||||||
|
summary: "Burning error budget at 14.4x rate"
|
||||||
|
runbook: "https://wiki/runbooks/error-budget-burn"
|
||||||
|
```
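
Where does 14.4 come from? Burn rate is "how many multiples of the even-spend rate" you are failing at; consuming 2% of a 30-day budget in one hour works out to 14.4. A small sketch of the arithmetic behind the thresholds:

```python
def burn_rate(budget_fraction_consumed, window_hours, slo_window_hours=30 * 24):
    """Multiple of the 'even spend' rate that consumes the given budget fraction in the window."""
    return budget_fraction_consumed * slo_window_hours / window_hours

print(burn_rate(0.02, 1))        # ~14.4 -> fast burn, page immediately
print(burn_rate(0.05, 6))        # ~6.0  -> slower burn, still page
print(14.4 * (1 - 0.999))        # ~0.0144 -> the error-rate threshold used in the alert above
```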
|
||||||
|
|
||||||
|
## Alert Design Patterns
|
||||||
|
|
||||||
|
**Principle:** Alert on symptoms (user impact) not causes (CPU high)
|
||||||
|
|
||||||
|
### Symptom-Based Alerting
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# ❌ Bad - alert on cause
|
||||||
|
alert: HighCPU
|
||||||
|
expr: cpu_usage > 80%
|
||||||
|
|
||||||
|
# ✅ Good - alert on symptom
|
||||||
|
alert: HighLatency
|
||||||
|
expr: http_request_duration_p99 > 1.0
|
||||||
|
```
|
||||||
|
|
||||||
|
### Alert Severity Levels
|
||||||
|
|
||||||
|
| Level | When | Response Time | Example |
|
||||||
|
|-------|------|---------------|---------|
|
||||||
|
| **Critical** | User-impacting | Immediate (page) | Error rate >5%, service down |
|
||||||
|
| **Warning** | Will become critical | Next business day | Error rate >1%, disk 85% full |
|
||||||
|
| **Info** | Informational | No action needed | Deploy completed, scaling event |
|
||||||
|
|
||||||
|
**Rule:** Only page for critical. Everything else goes to dashboard/Slack.
|
||||||
|
|
||||||
|
## Cost Optimization Quick Reference
|
||||||
|
|
||||||
|
**Observability can cost 5-15% of infrastructure spend. Optimize:**
|
||||||
|
|
||||||
|
### Sampling Strategies
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Trace sampling - collect 10% of traces
|
||||||
|
from opentelemetry.sdk.trace.sampling import TraceIdRatioBased
|
||||||
|
|
||||||
|
sampler = TraceIdRatioBased(0.1) # 10% sampling
|
||||||
|
```
|
||||||
|
|
||||||
|
**When to sample:**
|
||||||
|
- Traces: 1-10% for high-traffic services
|
||||||
|
- Logs: Sample debug/info logs, keep all errors
|
||||||
|
- Metrics: Don't sample (they're already aggregated)
|
||||||
|
|
||||||
|
### Retention Policies
|
||||||
|
|
||||||
|
| Data Type | Recommended Retention | Rationale |
|
||||||
|
|-----------|----------------------|-----------|
|
||||||
|
| **Metrics** | 15 days (raw), 13 months (aggregated) | Trend analysis |
|
||||||
|
| **Logs** | 7-30 days | Debugging, compliance |
|
||||||
|
| **Traces** | 7 days | Debugging recent issues |
|
||||||
|
|
||||||
|
### Cardinality Control
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad - high cardinality
|
||||||
|
http_requests.labels(
|
||||||
|
method=method,
|
||||||
|
url=full_url, # Unbounded!
|
||||||
|
user_id=user_id # Unbounded!
|
||||||
|
)
|
||||||
|
|
||||||
|
# ✅ Good - controlled cardinality
|
||||||
|
http_requests.labels(
|
||||||
|
method=method,
|
||||||
|
endpoint=route_pattern, # /users/:id not /users/12345
|
||||||
|
status_code=status
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tool Ecosystem Quick Reference
|
||||||
|
|
||||||
|
| Category | Open Source | Managed/Commercial |
|
||||||
|
|----------|-------------|-------------------|
|
||||||
|
| **Metrics** | Prometheus, VictoriaMetrics | Datadog, New Relic, Grafana Cloud |
|
||||||
|
| **Logs** | Loki, ELK Stack | Datadog, Splunk, Sumo Logic |
|
||||||
|
| **Traces** | Jaeger, Zipkin | Datadog, Honeycomb, Lightstep |
|
||||||
|
| **All-in-One** | Grafana + Loki + Tempo + Mimir | Datadog, New Relic, Dynatrace |
|
||||||
|
| **Instrumentation** | OpenTelemetry | (vendor SDKs) |
|
||||||
|
|
||||||
|
**Recommendation:**
|
||||||
|
- **Starting out**: Prometheus + Grafana + OpenTelemetry
|
||||||
|
- **Growing (10-50 services)**: Add Loki (logs) + Jaeger (traces)
|
||||||
|
- **Scale (50+ services)**: Consider managed (Datadog, Grafana Cloud)
|
||||||
|
|
||||||
|
**Why OpenTelemetry:** Vendor-neutral, future-proof, single instrumentation for all signals
|
||||||
|
|
||||||
|
## Your First Observability Setup
|
||||||
|
|
||||||
|
**Goal:** Metrics + alerting in one week
|
||||||
|
|
||||||
|
**Day 1-2: Instrument application**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Add OpenTelemetry
|
||||||
|
from opentelemetry import metrics, trace
|
||||||
|
from opentelemetry.sdk.metrics import MeterProvider
|
||||||
|
from opentelemetry.sdk.trace import TracerProvider
|
||||||
|
from opentelemetry.exporter.prometheus import PrometheusMetricReader
|
||||||
|
|
||||||
|
# Initialize
|
||||||
|
meter_provider = MeterProvider(
|
||||||
|
metric_readers=[PrometheusMetricReader()]
|
||||||
|
)
|
||||||
|
metrics.set_meter_provider(meter_provider)
|
||||||
|
|
||||||
|
# Instrument HTTP framework (auto-instrumentation)
|
||||||
|
from opentelemetry.instrumentation.flask import FlaskInstrumentor
|
||||||
|
FlaskInstrumentor().instrument_app(app)  # app = your existing Flask application
|
||||||
|
```
|
||||||
|
|
||||||
|
**Day 3-4: Deploy Prometheus + Grafana**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# docker-compose.yml
|
||||||
|
version: '3'
|
||||||
|
services:
|
||||||
|
prometheus:
|
||||||
|
image: prom/prometheus
|
||||||
|
ports:
|
||||||
|
- "9090:9090"
|
||||||
|
volumes:
|
||||||
|
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||||
|
|
||||||
|
grafana:
|
||||||
|
image: grafana/grafana
|
||||||
|
ports:
|
||||||
|
- "3000:3000"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Day 5: Define SLIs and SLOs**
|
||||||
|
|
||||||
|
```
|
||||||
|
SLI: HTTP request success rate
|
||||||
|
SLO: 99.9% of requests succeed (30-day window)
|
||||||
|
Error budget: 0.1% = 43 minutes downtime/month
|
||||||
|
```
|
||||||
|
|
||||||
|
**Day 6: Create alerts**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# prometheus-alerts.yml
|
||||||
|
groups:
|
||||||
|
- name: slo_alerts
|
||||||
|
rules:
|
||||||
|
- alert: HighErrorRate
|
||||||
|
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
|
||||||
|
for: 5m
|
||||||
|
annotations:
|
||||||
|
summary: "Error rate >5% on {{$labels.service}}"
|
||||||
|
runbook: "https://wiki/runbooks/high-error-rate"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Day 7: Build dashboard**
|
||||||
|
|
||||||
|
**Panels to include:**
|
||||||
|
- Error rate (%)
|
||||||
|
- Request rate (req/s)
|
||||||
|
- p50/p95/p99 latency
|
||||||
|
- CPU/memory utilization
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
### ❌ Logging in Production == Debugging in Production
|
||||||
|
**Fix:** Use structured logging with correlation IDs, not print statements
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Alerting on Predictions, Not Reality
|
||||||
|
**Fix:** Alert on actual user impact (errors, latency) not predicted issues (disk 70% full)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Dashboard Sprawl
|
||||||
|
**Fix:** One main dashboard per service showing SLIs. Delete dashboards unused for 3 months.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Ignoring Alert Feedback Loop
|
||||||
|
**Fix:** Track alert precision (% that led to action). Delete alerts with <50% precision.
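
A minimal sketch of that precision calculation, assuming you tag each resolved alert with whether it required action (the data structure here is hypothetical):

```python
def alert_precision(alerts):
    """Fraction of fired alerts that actually required human action."""
    if not alerts:
        return None
    return sum(a["actionable"] for a in alerts) / len(alerts)

history = [
    {"name": "HighErrorRate", "actionable": True},
    {"name": "HighCPU", "actionable": False},
    {"name": "HighCPU", "actionable": False},
]
print(alert_precision(history))  # ~0.33 -> below 50%, candidate for deletion
```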
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
**Getting Started:**
|
||||||
|
- Start with metrics (Prometheus + OpenTelemetry)
|
||||||
|
- Add logs when debugging is hard (Loki)
|
||||||
|
- Add traces when issues span services (Jaeger)
|
||||||
|
|
||||||
|
**SLI Selection:**
|
||||||
|
- User-facing: Availability, latency, error rate
|
||||||
|
- Background: Freshness, success rate, processing time
|
||||||
|
|
||||||
|
**SLO Targets:**
|
||||||
|
- Start with 99% (achievable)
|
||||||
|
- Increase to 99.9% only if business requires it
|
||||||
|
- 99.99% is very expensive (4 nines = 52 min/year downtime)
|
||||||
|
|
||||||
|
**Alerting:**
|
||||||
|
- Critical only = page
|
||||||
|
- Warning = next business day
|
||||||
|
- Info = dashboard only
|
||||||
|
|
||||||
|
**Cost Control:**
|
||||||
|
- Sample traces (1-10%)
|
||||||
|
- Control metric cardinality (no unbounded labels)
|
||||||
|
- Set retention policies (7-30 days logs, 15 days metrics)
|
||||||
|
|
||||||
|
**Tools:**
|
||||||
|
- Small: Prometheus + Grafana + Loki
|
||||||
|
- Medium: Add Jaeger
|
||||||
|
- Large: Consider Datadog, Grafana Cloud
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Start with metrics using OpenTelemetry + Prometheus. Define 3-5 SLIs based on user experience. Alert only on symptoms that require immediate action. Add logs and traces when metrics aren't enough.**
|
||||||
|
|
||||||
|
Measure what users care about, not what's easy to measure.
|
||||||
242
skills/performance-testing-fundamentals/SKILL.md
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
---
|
||||||
|
name: performance-testing-fundamentals
|
||||||
|
description: Use when starting performance testing, choosing load testing tools, interpreting performance metrics, debugging slow applications, or establishing performance baselines - provides decision frameworks and anti-patterns for load, stress, spike, and soak testing
|
||||||
|
---
|
||||||
|
|
||||||
|
# Performance Testing Fundamentals
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Diagnose first, test second. Performance testing without understanding your bottlenecks wastes time.
|
||||||
|
|
||||||
|
**Rule:** Define SLAs before testing. You can't judge "good" performance without requirements.
|
||||||
|
|
||||||
|
## When NOT to Performance Test
|
||||||
|
|
||||||
|
Performance test only AFTER:
|
||||||
|
- ✅ Defining performance SLAs (latency, throughput, error rate targets)
|
||||||
|
- ✅ Profiling current bottlenecks (APM, database logs, profiling)
|
||||||
|
- ✅ Fixing obvious issues (missing indexes, N+1 queries, inefficient algorithms)
|
||||||
|
|
||||||
|
**Don't performance test to find problems** - use profiling/APM for that. Performance test to verify fixes and validate capacity.
|
||||||
|
|
||||||
|
## Tool Selection Decision Tree
|
||||||
|
|
||||||
|
| Your Constraint | Choose | Why |
|
||||||
|
|----------------|--------|-----|
|
||||||
|
| CI/CD integration, JavaScript team | **k6** | Modern, code-as-config, easy CI integration |
|
||||||
|
| Complex scenarios, enterprise, mature ecosystem | **JMeter** | GUI, plugins, every protocol |
|
||||||
|
| High throughput (10k+ RPS), Scala team | **Gatling** | Built for scale, excellent reports |
|
||||||
|
| Quick HTTP benchmark, no complex scenarios | **Apache Bench (ab)** or **wrk** | Command-line, no setup |
|
||||||
|
| Cloud-based, don't want infrastructure | **BlazeMeter**, **Loader.io** | SaaS, pay-per-use |
|
||||||
|
| Realistic browser testing (JS rendering) | **Playwright** + **k6** | Hybrid: Playwright for UX, k6 for load |
|
||||||
|
|
||||||
|
**For most teams:** k6 (modern, scriptable) or JMeter (mature, GUI)
|
||||||
|
|
||||||
|
## Test Type Quick Reference
|
||||||
|
|
||||||
|
| Test Type | Purpose | Duration | Load Pattern | Use When |
|
||||||
|
|-----------|---------|----------|--------------|----------|
|
||||||
|
| **Load Test** | Verify normal operations under expected load | 15-30 min | Steady (ramp to target, sustain) | Baseline validation, regression testing |
|
||||||
|
| **Stress Test** | Find breaking point | 5-15 min | Increasing (ramp until failure) | Capacity planning, finding limits |
|
||||||
|
| **Spike Test** | Test sudden traffic surge | 2-5 min | Instant jump (0 → peak) | Black Friday prep, auto-scaling validation |
|
||||||
|
| **Soak Test** | Find memory leaks, connection pool exhaustion | 2-8 hours | Steady sustained load | Pre-production validation, stability check |
|
||||||
|
|
||||||
|
**Start with Load Test** (validates baseline), then Stress/Spike (finds limits), finally Soak (validates stability).
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Premature Load Testing
|
||||||
|
**Symptom:** "App is slow, let's load test it"
|
||||||
|
|
||||||
|
**Why bad:** Load testing reveals "it's slow under load" but not WHY or WHERE
|
||||||
|
|
||||||
|
**Fix:** Profile first (APM, database slow query logs, profiler), fix obvious bottlenecks, THEN load test to validate
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Testing Without SLAs
|
||||||
|
**Symptom:** "My API handles 100 RPS with 200ms average latency. Is that good?"
|
||||||
|
|
||||||
|
**Why bad:** Can't judge "good" without requirements. A gaming API needs <50ms; batch processing tolerates 2s.
|
||||||
|
|
||||||
|
**Fix:** Define SLAs first:
|
||||||
|
- Target latency: P95 < 300ms, P99 < 500ms
|
||||||
|
- Target throughput: 500 RPS at peak
|
||||||
|
- Max error rate: < 0.1%
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Unrealistic SLAs
|
||||||
|
**Symptom:** "Our database-backed CRUD API with complex joins must have P95 < 10ms"
|
||||||
|
|
||||||
|
**Why bad:** Sets impossible targets. Database round-trip alone is often 5-20ms. Forces wasted optimization or architectural rewrites.
|
||||||
|
|
||||||
|
**Fix:** Compare against Performance Benchmarks table (see below). If target is 10x better than benchmark, profile current performance first, then negotiate realistic SLA based on what's achievable vs cost of optimization.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Vanity Metrics
|
||||||
|
**Symptom:** Reporting only average response time
|
||||||
|
|
||||||
|
**Why bad:** Average hides tail latency. 99% of requests at 100ms + 1% at 10s = "average 200ms" looks fine, but users experience 10s delays.
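
A quick stdlib check of that arithmetic, using the hypothetical distribution from the example:

```python
import statistics

latencies_ms = [100] * 990 + [10_000] * 10   # 99% at 100 ms, 1% at 10 s

print(statistics.mean(latencies_ms))   # 199.0 -> "average looks fine"
print(max(latencies_ms))               # 10000 -> what the unlucky 1% actually experience
```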
|
||||||
|
|
||||||
|
**Fix:** Always report percentiles:
|
||||||
|
- P50 (median) - typical user experience
|
||||||
|
- P95 - most users
|
||||||
|
- P99 - worst-case for significant minority
|
||||||
|
- Max - outliers
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Load Testing in Production First
|
||||||
|
**Symptom:** "Let's test capacity by running load tests against production"
|
||||||
|
|
||||||
|
**Why bad:** Risks outages, contaminates real metrics, can trigger alerts/costs
|
||||||
|
|
||||||
|
**Fix:** Test in staging environment that mirrors production (same DB size, network latency, resource limits)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Single-User "Load" Tests
|
||||||
|
**Symptom:** Running one user hitting the API as fast as possible
|
||||||
|
|
||||||
|
**Why bad:** Doesn't simulate realistic concurrency, misses resource contention (database connections, thread pools)
|
||||||
|
|
||||||
|
**Fix:** Simulate realistic concurrent users with realistic think time between requests
|
||||||
|
|
||||||
|
## Metrics Glossary
|
||||||
|
|
||||||
|
| Metric | Definition | Good Threshold (typical web API) |
|
||||||
|
|--------|------------|----------------------------------|
|
||||||
|
| **RPS** (Requests/Second) | Throughput - how many requests processed | Varies by app; know your peak |
|
||||||
|
| **Latency** | Time from request to response | P95 < 300ms, P99 < 500ms |
|
||||||
|
| **P50 (Median)** | 50% of requests faster than this | P50 < 100ms |
|
||||||
|
| **P95** | 95% of requests faster than this | P95 < 300ms |
|
||||||
|
| **P99** | 99% of requests faster than this | P99 < 500ms |
|
||||||
|
| **Error Rate** | % of 4xx/5xx responses | < 0.1% |
|
||||||
|
| **Throughput** | Data transferred per second (MB/s) | Depends on payload size |
|
||||||
|
| **Concurrent Users** | Active users at same time | Calculate from traffic patterns |
|
||||||
|
|
||||||
|
**Focus on P95/P99, not average.** Tail latency kills user experience.
|
||||||
|
|
||||||
|
## Diagnostic-First Workflow
|
||||||
|
|
||||||
|
Before load testing slow applications, follow this workflow:
|
||||||
|
|
||||||
|
**Step 1: Measure Current State**
|
||||||
|
- Install APM (DataDog, New Relic, Grafana) or logging
|
||||||
|
- Identify slowest 10 endpoints/operations
|
||||||
|
- Check database slow query logs
|
||||||
|
|
||||||
|
**Step 2: Common Quick Wins** (90% of performance issues)
|
||||||
|
- Missing database indexes
|
||||||
|
- N+1 query problem
|
||||||
|
- Unoptimized images/assets
|
||||||
|
- Missing caching (Redis, CDN)
|
||||||
|
- Synchronous operations that should be async
|
||||||
|
- Inefficient serialization (JSON parsing bottlenecks)
|
||||||
|
|
||||||
|
**Step 3: Profile Specific Bottleneck** (a minimal profiling sketch follows this workflow)
|
||||||
|
- Use profiler to see CPU/memory hotspots
|
||||||
|
- Trace requests to find where time is spent (DB? external API? computation?)
|
||||||
|
- Check for resource limits (max connections, thread pool exhaustion)
|
||||||
|
|
||||||
|
**Step 4: Fix and Measure**
|
||||||
|
- Apply fix (add index, cache layer, async processing)
|
||||||
|
- Measure improvement in production
|
||||||
|
- Document before/after metrics
|
||||||
|
|
||||||
|
**Step 5: THEN Load Test** (if needed)
|
||||||
|
- Validate fixes handle expected load
|
||||||
|
- Find new capacity limits
|
||||||
|
- Establish regression baseline
|
||||||
|
|
||||||
|
**Anti-pattern to avoid:** Skipping to Step 5 without Steps 1-4.
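
For Step 3, a minimal profiling sketch using the Python stdlib; `handle_request` is a hypothetical code path you suspect is slow:

```python
import cProfile
import pstats

def handle_request():
    ...  # hypothetical slow code path under investigation

profiler = cProfile.Profile()
profiler.enable()
handle_request()
profiler.disable()

# Show the 10 functions with the most cumulative time (likely bottlenecks)
pstats.Stats(profiler).sort_stats("cumulative").print_stats(10)
```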
|
||||||
|
|
||||||
|
## Performance Benchmarks (Reference)
|
||||||
|
|
||||||
|
What "good" looks like by application type:
|
||||||
|
|
||||||
|
| Application Type | Typical P95 Latency | Typical Throughput | Notes |
|
||||||
|
|------------------|---------------------|-------------------|-------|
|
||||||
|
| **REST API (CRUD)** | < 200ms | 500-2000 RPS | Database-backed, simple queries |
|
||||||
|
| **Search API** | < 500ms | 100-500 RPS | Complex queries, ranking algorithms |
|
||||||
|
| **Payment Gateway** | < 1s | 50-200 RPS | External service calls, strict consistency |
|
||||||
|
| **Real-time Gaming** | < 50ms | 1000-10000 RPS | Low latency critical |
|
||||||
|
| **Batch Processing** | 2-10s/job | 10-100 jobs/min | Throughput > latency |
|
||||||
|
| **Static CDN** | < 100ms | 10000+ RPS | Edge-cached, minimal computation |
|
||||||
|
|
||||||
|
**Use as rough guide, not absolute targets.** Your SLAs depend on user needs.
|
||||||
|
|
||||||
|
## Results Interpretation Framework
|
||||||
|
|
||||||
|
After running a load test:
|
||||||
|
|
||||||
|
**Pass Criteria** (a minimal checker sketch follows these lists):
|
||||||
|
- ✅ All requests meet latency SLA (e.g., P95 < 300ms)
|
||||||
|
- ✅ Error rate under threshold (< 0.1%)
|
||||||
|
- ✅ No resource exhaustion (CPU < 80%, memory stable, no connection pool saturation)
|
||||||
|
- ✅ Sustained load for test duration without degradation
|
||||||
|
|
||||||
|
**Fail Criteria:**
|
||||||
|
- ❌ Latency exceeds SLA
|
||||||
|
- ❌ Error rate spikes
|
||||||
|
- ❌ Gradual degradation over time (memory leak, connection leak)
|
||||||
|
- ❌ Resource exhaustion (CPU pegged, OOM errors)
|
||||||
|
|
||||||
|
**Next Steps:**
|
||||||
|
- **If passing:** Establish this as regression baseline, run periodically in CI
|
||||||
|
- **If failing:** Profile to find bottleneck, optimize, re-test
|
||||||
|
- **If borderline:** Test at higher load (stress test) to find safety margin
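
A minimal checker sketch for these criteria; the `results` dict is hypothetical and the default thresholds are taken from the example SLAs above:

```python
def check_slas(results, p95_ms=300, p99_ms=500, max_error_rate=0.001):
    """Return a list of SLA violations from aggregated load-test results."""
    violations = []
    if results["p95_ms"] > p95_ms:
        violations.append(f"P95 {results['p95_ms']} ms > {p95_ms} ms")
    if results["p99_ms"] > p99_ms:
        violations.append(f"P99 {results['p99_ms']} ms > {p99_ms} ms")
    if results["error_rate"] > max_error_rate:
        violations.append(f"error rate {results['error_rate']:.2%} > {max_error_rate:.2%}")
    return violations

print(check_slas({"p95_ms": 280, "p99_ms": 620, "error_rate": 0.0004}))
# ['P99 620 ms > 500 ms']
```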
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
### ❌ Not Ramping Load Gradually
|
||||||
|
**Symptom:** Instant 0 → 1000 users, everything fails
|
||||||
|
|
||||||
|
**Fix:** Ramp over 2-5 minutes to let auto-scaling/caching warm up (except spike tests, where instant jump is the point)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Testing With Empty Database
|
||||||
|
**Symptom:** Tests pass with 100 records, fail with 1M records in production
|
||||||
|
|
||||||
|
**Fix:** Seed staging database with production-scale data
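
A minimal seeding sketch, using stdlib `sqlite3` purely for illustration; adapt the schema, row count, and target database to your staging environment:

```python
import random
import sqlite3
import string

conn = sqlite3.connect("staging.db")
conn.execute("CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY, email TEXT, plan TEXT)")

def random_email():
    name = "".join(random.choices(string.ascii_lowercase, k=12))
    return f"{name}@example.com"

# Production-scale volume, inserted in batches to keep memory flat
for _ in range(100):  # 100 batches of 10,000 rows = 1M users
    batch = [(random_email(), random.choice(["free", "pro", "enterprise"]))
             for _ in range(10_000)]
    conn.executemany("INSERT INTO users (email, plan) VALUES (?, ?)", batch)
conn.commit()
```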
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Ignoring External Dependencies
|
||||||
|
**Symptom:** Your API is fast, but third-party payment gateway times out under load
|
||||||
|
|
||||||
|
**Fix:** Include external service latency in SLAs, or mock them for isolated API testing
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
**Getting Started Checklist:**
|
||||||
|
1. Define SLAs (latency P95/P99, throughput, error rate)
|
||||||
|
2. Choose tool (k6 or JMeter for most cases)
|
||||||
|
3. Start with Load Test (baseline validation)
|
||||||
|
4. Run Stress Test (find capacity limits)
|
||||||
|
5. Establish regression baseline
|
||||||
|
6. Run in CI on major changes
|
||||||
|
|
||||||
|
**When Debugging Slow App:**
|
||||||
|
1. Profile first (APM, database logs)
|
||||||
|
2. Fix obvious issues (indexes, N+1, caching)
|
||||||
|
3. Measure improvement
|
||||||
|
4. THEN load test to validate
|
||||||
|
|
||||||
|
**Interpreting Results:**
|
||||||
|
- Report P95/P99, not just average
|
||||||
|
- Compare against SLAs
|
||||||
|
- Check for resource exhaustion
|
||||||
|
- Look for degradation over time (soak tests)
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Performance testing validates capacity and catches regressions.**
|
||||||
|
|
||||||
|
**Profiling finds bottlenecks.**
|
||||||
|
|
||||||
|
Don't confuse the two - diagnose first, test second.
|
||||||
504
skills/property-based-testing/SKILL.md
Normal file
@@ -0,0 +1,504 @@
|
|||||||
|
---
|
||||||
|
name: property-based-testing
|
||||||
|
description: Use when testing invariants, validating properties across many inputs, using Hypothesis (Python) or fast-check (JavaScript), defining test strategies, handling shrinking, or finding edge cases - provides property definition patterns and integration strategies
|
||||||
|
---
|
||||||
|
|
||||||
|
# Property-Based Testing
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Instead of testing specific examples, test properties that should hold for all inputs.
|
||||||
|
|
||||||
|
**Rule:** Property-based tests generate hundreds of inputs automatically. One property test replaces dozens of example tests.
|
||||||
|
|
||||||
|
## Property-Based vs Example-Based Testing
|
||||||
|
|
||||||
|
| Aspect | Example-Based | Property-Based |
|
||||||
|
|--------|---------------|----------------|
|
||||||
|
| **Test input** | Hardcoded examples | Generated inputs |
|
||||||
|
| **Coverage** | Few specific cases | Hundreds of random cases |
|
||||||
|
| **Maintenance** | Add new examples manually | Properties automatically tested |
|
||||||
|
| **Edge cases** | Must think of them | Automatically discovered |
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Example-based: Test 3 specific inputs
|
||||||
|
def test_reverse():
|
||||||
|
assert reverse([1, 2, 3]) == [3, 2, 1]
|
||||||
|
assert reverse([]) == []
|
||||||
|
assert reverse([1]) == [1]
|
||||||
|
|
||||||
|
# Property-based: Test ALL inputs
|
||||||
|
@given(lists(integers()))
|
||||||
|
def test_reverse_property(lst):
|
||||||
|
# Property: Reversing twice returns original
|
||||||
|
assert reverse(reverse(lst)) == lst
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tool Selection
|
||||||
|
|
||||||
|
| Language | Tool | Why |
|
||||||
|
|----------|------|-----|
|
||||||
|
| **Python** | **Hypothesis** | Most mature, excellent shrinking |
|
||||||
|
| **JavaScript/TypeScript** | **fast-check** | TypeScript support, good integration |
|
||||||
|
| **Java** | **jqwik** | JUnit 5 integration |
|
||||||
|
| **Haskell** | **QuickCheck** | Original property-based testing library |
|
||||||
|
|
||||||
|
**First choice:** Hypothesis (Python) or fast-check (JavaScript)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Basic Property Test (Python + Hypothesis)
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install hypothesis
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hypothesis import given
|
||||||
|
from hypothesis.strategies import integers, lists
|
||||||
|
|
||||||
|
def reverse(lst):
|
||||||
|
"""Reverse a list."""
|
||||||
|
return lst[::-1]
|
||||||
|
|
||||||
|
@given(lists(integers()))
|
||||||
|
def test_reverse_twice(lst):
|
||||||
|
"""Property: Reversing twice returns original."""
|
||||||
|
assert reverse(reverse(lst)) == lst
|
||||||
|
```
|
||||||
|
|
||||||
|
**Run:**
|
||||||
|
```bash
|
||||||
|
pytest test_reverse.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output:**
|
||||||
|
```
|
||||||
|
Trying example: lst=[]
|
||||||
|
Trying example: lst=[0]
|
||||||
|
Trying example: lst=[1, -2, 3]
|
||||||
|
... (100 examples tested)
|
||||||
|
PASSED
|
||||||
|
```
|
||||||
|
|
||||||
|
**If property fails:**
|
||||||
|
```
|
||||||
|
Falsifying example: lst=[0, 0, 1]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Properties
|
||||||
|
|
||||||
|
### 1. Inverse Functions
|
||||||
|
|
||||||
|
**Property:** `f(g(x)) == x`
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hypothesis import given
|
||||||
|
from hypothesis.strategies import text
|
||||||
|
|
||||||
|
@given(text())
|
||||||
|
def test_encode_decode(s):
|
||||||
|
"""Property: Decoding encoded string returns original."""
|
||||||
|
assert decode(encode(s)) == s
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Idempotence
|
||||||
|
|
||||||
|
**Property:** `f(f(x)) == f(x)`
|
||||||
|
|
||||||
|
```python
|
||||||
|
@given(lists(integers()))
|
||||||
|
def test_sort_idempotent(lst):
|
||||||
|
"""Property: Sorting twice gives same result as sorting once."""
|
||||||
|
assert sorted(sorted(lst)) == sorted(lst)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Invariants
|
||||||
|
|
||||||
|
**Property:** Some fact remains true after operation
|
||||||
|
|
||||||
|
```python
|
||||||
|
@given(lists(integers()))
|
||||||
|
def test_reverse_length(lst):
|
||||||
|
"""Property: Reversing doesn't change length."""
|
||||||
|
assert len(reverse(lst)) == len(lst)
|
||||||
|
|
||||||
|
@given(lists(integers()))
|
||||||
|
def test_reverse_elements(lst):
|
||||||
|
"""Property: Reversing doesn't change elements."""
|
||||||
|
assert set(reverse(lst)) == set(lst)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4. Commutativity
|
||||||
|
|
||||||
|
**Property:** `f(x, y) == f(y, x)`
|
||||||
|
|
||||||
|
```python
|
||||||
|
@given(integers(), integers())
|
||||||
|
def test_addition_commutative(a, b):
|
||||||
|
"""Property: Addition is commutative."""
|
||||||
|
assert a + b == b + a
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5. Associativity
|
||||||
|
|
||||||
|
**Property:** `f(f(x, y), z) == f(x, f(y, z))`
|
||||||
|
|
||||||
|
```python
|
||||||
|
@given(integers(), integers(), integers())
|
||||||
|
def test_addition_associative(a, b, c):
|
||||||
|
"""Property: Addition is associative."""
|
||||||
|
assert (a + b) + c == a + (b + c)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Strategies (Generating Inputs)
|
||||||
|
|
||||||
|
### Built-In Strategies
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hypothesis.strategies import (
|
||||||
|
integers,
|
||||||
|
floats,
|
||||||
|
text,
|
||||||
|
lists,
|
||||||
|
dictionaries,
|
||||||
|
booleans,
|
||||||
|
)
|
||||||
|
|
||||||
|
@given(integers())
|
||||||
|
def test_with_int(x):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@given(integers(min_value=0, max_value=100))
|
||||||
|
def test_with_bounded_int(x):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@given(text(min_size=1, max_size=10))
|
||||||
|
def test_with_short_string(s):
|
||||||
|
pass
|
||||||
|
|
||||||
|
@given(lists(integers(), min_size=1))
|
||||||
|
def test_with_nonempty_list(lst):
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Composite Strategies
|
||||||
|
|
||||||
|
**Generate complex objects:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hypothesis import strategies as st
|
||||||
|
from hypothesis.strategies import composite
|
||||||
|
|
||||||
|
@composite
|
||||||
|
def users(draw):
|
||||||
|
"""Generate user objects."""
|
||||||
|
return {
|
||||||
|
"name": draw(st.text(min_size=1, max_size=50)),
|
||||||
|
"age": draw(st.integers(min_value=0, max_value=120)),
|
||||||
|
"email": draw(st.emails()),
|
||||||
|
}
|
||||||
|
|
||||||
|
@given(users())
|
||||||
|
def test_user_validation(user):
|
||||||
|
assert validate_user(user) is True
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Filtering Strategies
|
||||||
|
|
||||||
|
**Exclude invalid inputs:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
@given(integers().filter(lambda x: x != 0))
|
||||||
|
def test_division(x):
|
||||||
|
"""Test division (x != 0)."""
|
||||||
|
assert 10 / x == 10 / x
|
||||||
|
|
||||||
|
# Better: Use assume
|
||||||
|
from hypothesis import assume
|
||||||
|
|
||||||
|
@given(integers())
|
||||||
|
def test_division_better(x):
|
||||||
|
assume(x != 0)
|
||||||
|
assert 10 / x == 10 / x
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Shrinking (Finding Minimal Failing Example)
|
||||||
|
|
||||||
|
**When a property fails, Hypothesis automatically shrinks the input to the smallest failing case.**
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
@given(lists(integers()))
|
||||||
|
def test_all_positive(lst):
|
||||||
|
"""Fails if any negative number."""
|
||||||
|
assert all(x > 0 for x in lst)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Initial failure:**
|
||||||
|
```
|
||||||
|
Falsifying example: lst=[-5, 3, -2, 0, 1, 7, -9]
|
||||||
|
```
|
||||||
|
|
||||||
|
**After shrinking:**
|
||||||
|
```
|
||||||
|
Falsifying example: lst=[-1]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why it matters:** Minimal examples are easier to debug
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Integration with pytest
|
||||||
|
|
||||||
|
```python
|
||||||
|
# test_properties.py
|
||||||
|
from hypothesis import given, settings
|
||||||
|
from hypothesis.strategies import integers
|
||||||
|
|
||||||
|
@settings(max_examples=1000) # Run 1000 examples (default: 100)
|
||||||
|
@given(integers(min_value=1))
|
||||||
|
def test_factorial_positive(n):
|
||||||
|
"""Property: Factorial of positive number is positive."""
|
||||||
|
assert factorial(n) > 0
|
||||||
|
```
|
||||||
|
|
||||||
|
**Run:**
|
||||||
|
```bash
|
||||||
|
pytest test_properties.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## JavaScript Example (fast-check)
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install --save-dev fast-check
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
import fc from 'fast-check';
|
||||||
|
|
||||||
|
function reverse(arr) {
|
||||||
|
return arr.slice().reverse();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Property: Reversing twice returns original
|
||||||
|
test('reverse twice', () => {
|
||||||
|
fc.assert(
|
||||||
|
fc.property(fc.array(fc.integer()), (arr) => {
|
||||||
|
expect(reverse(reverse(arr))).toEqual(arr);
|
||||||
|
})
|
||||||
|
);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Advanced Patterns
|
||||||
|
|
||||||
|
### Stateful Testing
|
||||||
|
|
||||||
|
**Test state machines:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hypothesis.stateful import RuleBasedStateMachine, rule
from hypothesis.strategies import integers
|
||||||
|
|
||||||
|
class QueueMachine(RuleBasedStateMachine):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.queue = []
|
||||||
|
self.model = []
|
||||||
|
|
||||||
|
@rule(value=integers())
|
||||||
|
def enqueue(self, value):
|
||||||
|
self.queue.append(value)
|
||||||
|
self.model.append(value)
|
||||||
|
|
||||||
|
@rule()
|
||||||
|
def dequeue(self):
|
||||||
|
if self.queue:
|
||||||
|
actual = self.queue.pop(0)
|
||||||
|
expected = self.model.pop(0)
|
||||||
|
assert actual == expected
|
||||||
|
|
||||||
|
TestQueue = QueueMachine.TestCase
|
||||||
|
```
|
||||||
|
|
||||||
|
**Finds:** Race conditions, state corruption, invalid state transitions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Testing Examples, Not Properties
|
||||||
|
|
||||||
|
**Symptom:** Property test with hardcoded checks
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: Not a property
|
||||||
|
@given(integers())
|
||||||
|
def test_double(x):
|
||||||
|
if x == 2:
|
||||||
|
assert double(x) == 4
|
||||||
|
elif x == 3:
|
||||||
|
assert double(x) == 6
|
||||||
|
# This is just example testing!
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Test actual property
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ GOOD: Real property
|
||||||
|
@given(integers())
|
||||||
|
def test_double(x):
|
||||||
|
assert double(x) == x * 2
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Overly Restrictive Assumptions
|
||||||
|
|
||||||
|
**Symptom:** Filtering out most generated inputs
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: Rejects 99% of inputs
|
||||||
|
@given(integers())
|
||||||
|
def test_specific_range(x):
|
||||||
|
    assume(1000 <= x <= 1001)  # Accepts only 2 values out of all integers!
|
||||||
|
assert process(x) is not None
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Use strategy bounds
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ GOOD
|
||||||
|
@given(integers(min_value=1000, max_value=1001))
|
||||||
|
def test_specific_range(x):
|
||||||
|
assert process(x) is not None
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ No Assertions
|
||||||
|
|
||||||
|
**Symptom:** Property test that doesn't assert anything
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: No assertion
|
||||||
|
@given(integers())
|
||||||
|
def test_no_crash(x):
|
||||||
|
calculate(x) # Just checks it doesn't crash
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Assert a property
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ GOOD
|
||||||
|
@given(integers())
|
||||||
|
def test_output_type(x):
|
||||||
|
result = calculate(x)
|
||||||
|
assert isinstance(result, int)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CI/CD Integration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# .github/workflows/property-tests.yml
|
||||||
|
name: Property Tests
|
||||||
|
|
||||||
|
on: [pull_request]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: pip install hypothesis pytest
|
||||||
|
|
||||||
|
- name: Run property tests
|
||||||
|
run: pytest tests/properties/ -v --hypothesis-show-statistics
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Reference: Property Patterns
|
||||||
|
|
||||||
|
| Pattern | Example Property |
|
||||||
|
|---------|------------------|
|
||||||
|
| **Inverse** | `decode(encode(x)) == x` |
|
||||||
|
| **Idempotence** | `f(f(x)) == f(x)` |
|
||||||
|
| **Invariant** | `len(filter(lst, f)) <= len(lst)` |
|
||||||
|
| **Commutativity** | `add(a, b) == add(b, a)` |
|
||||||
|
| **Associativity** | `(a + b) + c == a + (b + c)` |
|
||||||
|
| **Identity** | `x + 0 == x` |
|
||||||
|
| **Consistency** | `sort(lst)[0] <= sort(lst)[-1]` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Property-based testing generates hundreds of inputs automatically to test properties that should hold for all inputs. One property test replaces dozens of example tests.**
|
||||||
|
|
||||||
|
**Use for:**
|
||||||
|
- Pure functions (no side effects)
|
||||||
|
- Data transformations
|
||||||
|
- Invariants (sorting, reversing, encoding/decoding)
|
||||||
|
- State machines
|
||||||
|
|
||||||
|
**Tools:**
|
||||||
|
- Hypothesis (Python) - most mature
|
||||||
|
- fast-check (JavaScript) - TypeScript support
|
||||||
|
|
||||||
|
**Process:**
|
||||||
|
1. Identify property (e.g., "reversing twice returns original")
|
||||||
|
2. Write property test with generator
|
||||||
|
3. Run test (generates 100-1000 examples)
|
||||||
|
4. If failure, Hypothesis shrinks to minimal example
|
||||||
|
5. Fix bug, add regression test
|
||||||
|
|
||||||
|
**If you're writing tests like "assert reverse([1,2,3]) == [3,2,1]" for every possible input, use property-based testing instead. Test the property, not examples.**
|
||||||
448
skills/quality-metrics-and-kpis/SKILL.md
Normal file
@@ -0,0 +1,448 @@
|
|||||||
|
---
|
||||||
|
name: quality-metrics-and-kpis
|
||||||
|
description: Use when setting up quality dashboards, defining test coverage targets, tracking quality trends, configuring CI/CD quality gates, or reporting quality metrics to stakeholders - provides metric selection, threshold strategies, and dashboard design patterns
|
||||||
|
---
|
||||||
|
|
||||||
|
# Quality Metrics & KPIs
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Measure what matters. Track trends, not absolutes. Use metrics to drive action, not for vanity.
|
||||||
|
|
||||||
|
**Rule:** Every metric must have a defined threshold and action plan. If a metric doesn't change behavior, stop tracking it.
|
||||||
|
|
||||||
|
## Quality Metrics vs Vanity Metrics
|
||||||
|
|
||||||
|
| Type | Example | Problem | Better Metric |
|
||||||
|
|------|---------|---------|---------------|
|
||||||
|
| **Vanity** | "We have 10,000 tests!" | Doesn't indicate quality | Pass rate, flakiness rate |
|
||||||
|
| **Vanity** | "95% code coverage!" | Can be gamed, doesn't mean tests are good | Coverage delta (new code), mutation score |
|
||||||
|
| **Actionable** | "Test flakiness: 5% → 2%" | Drives action | Track trend, set target |
|
||||||
|
| **Actionable** | "P95 build time: 15 min" | Identifies bottleneck | Optimize slow tests |
|
||||||
|
|
||||||
|
**Actionable metrics answer:** "What should I fix next?"
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Core Quality Metrics
|
||||||
|
|
||||||
|
### 1. Test Pass Rate
|
||||||
|
|
||||||
|
**Definition:** % of tests that pass on first run
|
||||||
|
|
||||||
|
```
|
||||||
|
Pass Rate = (Passing Tests / Total Tests) × 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thresholds:**
|
||||||
|
- **> 98%:** Healthy
|
||||||
|
- **95-98%:** Investigate failures
|
||||||
|
- **< 95%:** Critical (tests are unreliable)
|
||||||
|
|
||||||
|
**Why it matters:** Low pass rate means flaky tests or broken code
|
||||||
|
|
||||||
|
**Action:** If < 98%, run flaky-test-prevention skill
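
A minimal sketch computing pass rate from a JUnit-style report (as produced by `pytest --junitxml=report.xml`); the report path is an assumption:

```python
import xml.etree.ElementTree as ET

def pass_rate(junit_xml_path):
    """Pass rate (%) from a JUnit-style XML report."""
    root = ET.parse(junit_xml_path).getroot()
    total = failed = 0
    for suite in root.iter("testsuite"):
        total += int(suite.get("tests", 0))
        failed += int(suite.get("failures", 0)) + int(suite.get("errors", 0))
    return 100.0 * (total - failed) / total if total else None

print(f"{pass_rate('report.xml'):.1f}%")
```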
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Test Flakiness Rate
|
||||||
|
|
||||||
|
**Definition:** % of tests that fail intermittently
|
||||||
|
|
||||||
|
```
|
||||||
|
Flakiness Rate = (Flaky Tests / Total Tests) × 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**How to measure:**
|
||||||
|
```bash
|
||||||
|
# Run each test 100 times (requires the pytest-repeat plugin)
|
||||||
|
pytest --count=100 test_checkout.py
|
||||||
|
|
||||||
|
# Flaky if passes 1-99 times (not 0 or 100)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thresholds:**
|
||||||
|
- **< 1%:** Healthy
|
||||||
|
- **1-5%:** Moderate (fix soon)
|
||||||
|
- **> 5%:** Critical (CI is unreliable)
|
||||||
|
|
||||||
|
**Action:** Fix flaky tests before adding new tests
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. Code Coverage
|
||||||
|
|
||||||
|
**Definition:** % of code lines executed by tests
|
||||||
|
|
||||||
|
```
|
||||||
|
Coverage = (Executed Lines / Total Lines) × 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thresholds (by test type):**
|
||||||
|
- **Unit tests:** 80-90% of business logic
|
||||||
|
- **Integration tests:** 60-70% of integration points
|
||||||
|
- **E2E tests:** 40-50% of critical paths
|
||||||
|
|
||||||
|
**Configuration (pytest):**
|
||||||
|
```ini
|
||||||
|
# .coveragerc
|
||||||
|
[run]
|
||||||
|
source = src
|
||||||
|
omit = */tests/*, */migrations/*
|
||||||
|
|
||||||
|
[report]
|
||||||
|
fail_under = 80 # Fail if coverage < 80%
|
||||||
|
show_missing = True
|
||||||
|
```
|
||||||
|
|
||||||
|
**Anti-pattern:** 100% coverage as goal
|
||||||
|
|
||||||
|
**Why it's wrong:** Easy to game (tests that execute code without asserting anything)
|
||||||
|
|
||||||
|
**Better metric:** Coverage + mutation score (see mutation-testing skill)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4. Coverage Delta (New Code)
|
||||||
|
|
||||||
|
**Definition:** Coverage of newly added code
|
||||||
|
|
||||||
|
**Why it matters:** More actionable than absolute coverage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Measure coverage, then report only on lines changed vs main
# (diff-cover combines the coverage report with the git diff)
pytest --cov=src --cov-report=xml
diff-cover coverage.xml --compare-branch=origin/main --fail-under=90
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold:** 90% for new code (stricter than legacy)
|
||||||
|
|
||||||
|
**Action:** Block PR if new code coverage < 90%
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5. Build Time (CI/CD)
|
||||||
|
|
||||||
|
**Definition:** Time from commit to merge-ready
|
||||||
|
|
||||||
|
**Track by stage:**
|
||||||
|
- **Lint/format:** < 30s
|
||||||
|
- **Unit tests:** < 2 min
|
||||||
|
- **Integration tests:** < 5 min
|
||||||
|
- **E2E tests:** < 15 min
|
||||||
|
- **Total PR pipeline:** < 20 min
|
||||||
|
|
||||||
|
**Why it matters:** Slow CI blocks developer productivity
|
||||||
|
|
||||||
|
**Action:** If build > 20 min, see test-automation-architecture for optimization patterns
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 6. Test Execution Time Trend
|
||||||
|
|
||||||
|
**Definition:** How test suite duration changes over time
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Track in CI
|
||||||
|
import time
|
||||||
|
import json
import pytest
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
pytest.main()
|
||||||
|
duration = time.time() - start
|
||||||
|
|
||||||
|
metrics = {"test_duration_seconds": duration, "timestamp": time.time()}
|
||||||
|
with open("metrics.json", "w") as f:
|
||||||
|
json.dump(metrics, f)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold:** < 5% growth per month
|
||||||
|
|
||||||
|
**Action:** If growth > 5%/month, parallelize tests or refactor slow tests
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 7. Defect Escape Rate
|
||||||
|
|
||||||
|
**Definition:** Bugs found in production that should have been caught by tests
|
||||||
|
|
||||||
|
```
|
||||||
|
Defect Escape Rate = (Bugs Found in Production / Total Bugs Found) × 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thresholds:**
|
||||||
|
- **< 2%:** Excellent
|
||||||
|
- **2-5%:** Acceptable
|
||||||
|
- **> 5%:** Tests are missing critical scenarios
|
||||||
|
|
||||||
|
**Action:** For each escape, write regression test to prevent recurrence
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 8. Mean Time to Detection (MTTD)
|
||||||
|
|
||||||
|
**Definition:** Time from bug introduction to discovery
|
||||||
|
|
||||||
|
```
|
||||||
|
MTTD = Bug Detection Time - Bug Introduction Time
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thresholds:**
|
||||||
|
- **< 1 hour:** Excellent (caught in CI)
|
||||||
|
- **1-24 hours:** Good (caught in staging/canary)
|
||||||
|
- **> 24 hours:** Poor (caught in production)
|
||||||
|
|
||||||
|
**Action:** If MTTD > 24h, improve observability (see observability-and-monitoring skill)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 9. Mean Time to Recovery (MTTR)
|
||||||
|
|
||||||
|
**Definition:** Time from bug detection to fix deployed
|
||||||
|
|
||||||
|
```
|
||||||
|
MTTR = Fix Deployment Time - Bug Detection Time
|
||||||
|
```
|
||||||
|
|
||||||
|
**Thresholds:**
|
||||||
|
- **< 1 hour:** Excellent
|
||||||
|
- **1-8 hours:** Acceptable
|
||||||
|
- **> 8 hours:** Poor
|
||||||
|
|
||||||
|
**Action:** If MTTR > 8h, improve rollback procedures (see testing-in-production skill)
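
A minimal sketch computing MTTD and MTTR from incident records; the timestamp fields and incident data are hypothetical:

```python
from datetime import datetime
from statistics import mean

incidents = [
    {"introduced": datetime(2025, 1, 10, 9, 0), "detected": datetime(2025, 1, 10, 9, 40),
     "fixed": datetime(2025, 1, 10, 11, 0)},
    {"introduced": datetime(2025, 1, 12, 14, 0), "detected": datetime(2025, 1, 13, 8, 0),
     "fixed": datetime(2025, 1, 13, 10, 30)},
]

mttd_hours = mean((i["detected"] - i["introduced"]).total_seconds() / 3600 for i in incidents)
mttr_hours = mean((i["fixed"] - i["detected"]).total_seconds() / 3600 for i in incidents)
print(f"MTTD: {mttd_hours:.1f} h, MTTR: {mttr_hours:.1f} h")  # MTTD: 9.3 h, MTTR: 1.9 h
```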
---

## Dashboard Design

### Grafana Dashboard Example

```yaml
# grafana-dashboard.json
{
  "panels": [
    {
      "title": "Test Pass Rate (7 days)",
      "targets": [{
        "expr": "sum(tests_passed) / sum(tests_total) * 100"
      }],
      "thresholds": [
        {"value": 95, "color": "red"},
        {"value": 98, "color": "yellow"},
        {"value": 100, "color": "green"}
      ]
    },
    {
      "title": "Build Time Trend (30 days)",
      "targets": [{
        "expr": "avg_over_time(ci_build_duration_seconds[30d])"
      }]
    },
    {
      "title": "Coverage Delta (per PR)",
      "targets": [{
        "expr": "coverage_new_code_percent"
      }],
      "thresholds": [
        {"value": 90, "color": "green"},
        {"value": 80, "color": "yellow"},
        {"value": 0, "color": "red"}
      ]
    }
  ]
}
```

---

### CI/CD Quality Gates

**GitHub Actions example:**

```yaml
# .github/workflows/quality-gates.yml
name: Quality Gates

on: [pull_request]

jobs:
  quality-check:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Run tests with coverage
        run: pytest --cov=src --cov-report=json

      - name: Check coverage threshold
        run: |
          COVERAGE=$(jq '.totals.percent_covered' coverage.json)
          if (( $(echo "$COVERAGE < 80" | bc -l) )); then
            echo "Coverage $COVERAGE% below 80% threshold"
            exit 1
          fi

      - name: Check build time
        run: |
          DURATION=$(jq '.duration' test-results.json)
          if (( $(echo "$DURATION > 300" | bc -l) )); then
            echo "Build time ${DURATION}s exceeds 5-minute threshold"
            exit 1
          fi
```

---

## Reporting Patterns

### Weekly Quality Report

**Template:**

```markdown
# Quality Report - Week of 2025-01-13

## Summary
- **Test pass rate:** 98.5% (+0.5% from last week)
- **Flakiness rate:** 2.1% (-1.3% from last week) ✅
- **Coverage:** 85.2% (+2.1% from last week) ✅
- **Build time:** 18 min (-2 min from last week) ✅

## Actions Taken
- Fixed 8 flaky tests in checkout flow
- Added integration tests for payment service (+5% coverage)
- Parallelized slow E2E tests (reduced build time by 2 min)

## Action Items
- [ ] Fix remaining 3 flaky tests in user registration
- [ ] Increase coverage of order service (currently 72%)
- [ ] Investigate why staging MTTD increased to 4 hours
```

---

### Stakeholder Dashboard (Executive View)

**Metrics to show:**
1. **Quality trend (6 months):** Pass rate over time
2. **Velocity impact:** How long does CI take per PR?
3. **Production stability:** Defect escape rate
4. **Recovery time:** MTTR for incidents

**What NOT to show:**
- Absolute test count (vanity metric)
- Lines of code (meaningless)
- Individual developer metrics (creates wrong incentives)

---

## Anti-Patterns Catalog

### ❌ Coverage as the Only Metric

**Symptom:** "We need 100% coverage!"

**Why bad:** Easy to game with meaningless tests

```python
# ❌ BAD: 100% coverage, 0% value
def calculate_tax(amount):
    return amount * 0.08


def test_calculate_tax():
    calculate_tax(100)  # Executes code, asserts nothing!
```

**Fix:** Use coverage + mutation score
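
Mutation testing (tools such as mutmut or Stryker) rewards assertions: a mutated implementation (e.g. `0.08` changed to `0.0`) is only "killed" if some test fails. A minimal sketch of the same test rewritten so it actually kills mutants:

```python
# ✅ GOOD: the assertion fails if the tax rate or formula is mutated
def test_calculate_tax():
    assert calculate_tax(100) == 8.0
```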
---

### ❌ Tracking Metrics Without Thresholds

**Symptom:** Dashboard shows metrics but no action taken

**Why bad:** Metrics become noise

**Fix:** Every metric needs (see the sketch below):
- **Target threshold** (e.g., flakiness < 1%)
- **Alert level** (e.g., alert if flakiness > 5%)
- **Action plan** (e.g., "Fix flaky tests before adding new features")
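
A hedged sketch of wiring all three into CI so a breach fails the job instead of sitting unread on a dashboard (the metric names, levels, and `metrics.json` layout are assumptions):

```python
import json

# metric: (alert_level, higher_is_worse, action) - values are illustrative
THRESHOLDS = {
    "flakiness_rate": (0.05, True, "Fix flaky tests before adding new features"),
    "test_pass_rate": (0.95, False, "Quarantine or fix failing tests"),
}

with open("metrics.json") as f:
    metrics = json.load(f)

for name, (alert, higher_is_worse, action) in THRESHOLDS.items():
    value = metrics[name]
    breached = value > alert if higher_is_worse else value < alert
    if breached:
        raise SystemExit(f"{name}={value:.2%} breached its alert level: {action}")
```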
---

### ❌ Optimizing for Metrics, Not Quality

**Symptom:** Gaming metrics to hit targets

**Example:** Removing tests to increase pass rate

**Fix:** Track multiple complementary metrics (pass rate + flakiness + coverage)

---

### ❌ Measuring Individual Developer Productivity

**Symptom:** "Developer A writes more tests than Developer B"

**Why bad:** Creates wrong incentives (quantity over quality)

**Fix:** Measure team metrics, not individual

---

## Tool Integration

### SonarQube Metrics

**Quality Gate:**
```properties
# sonar-project.properties
sonar.qualitygate.wait=true

# Metrics tracked:
# - Bugs (target: 0)
# - Vulnerabilities (target: 0)
# - Code smells (target: < 100)
# - Coverage (target: > 80%)
# - Duplications (target: < 3%)
```

---

### Codecov Integration

```yaml
# codecov.yml
coverage:
  status:
    project:
      default:
        target: 80%    # Overall coverage target
        threshold: 2%  # Allow 2% drop

    patch:
      default:
        target: 90%    # New code must have 90% coverage
        threshold: 0%  # No drops allowed
```

---

## Bottom Line

**Track actionable metrics with defined thresholds. Use metrics to drive improvement, not for vanity.**

**Core dashboard:**
- Test pass rate (> 98%)
- Flakiness rate (< 1%)
- Coverage delta on new code (> 90%)
- Build time (< 20 min)
- Defect escape rate (< 2%)

**Weekly actions:**
- Review metrics against thresholds
- Identify trends (improving/degrading)
- Create action items for violations
- Track progress on improvements

**If you're tracking a metric but not acting on it, stop tracking it. Metrics exist to drive action, not to fill dashboards.**
521
skills/static-analysis-integration/SKILL.md
Normal file
@@ -0,0 +1,521 @@
---
name: static-analysis-integration
description: Use when integrating SAST tools (SonarQube, ESLint, Pylint, Checkstyle), setting up security scanning, configuring code quality gates, managing false positives, or building CI/CD quality pipelines - provides tool selection, configuration patterns, and quality threshold strategies
---

# Static Analysis Integration

## Overview

**Core principle:** Static analysis catches bugs, security vulnerabilities, and code quality issues before code review. Automate it in CI/CD.

**Rule:** Block merges on critical issues, warn on moderate issues, ignore noise. Configure thresholds carefully.

## Static Analysis vs Other Quality Checks

| Check Type | When | What It Finds | Speed |
|------------|------|---------------|-------|
| **Static Analysis** | Pre-commit/PR | Bugs, security, style | Fast (seconds) |
| **Unit Tests** | Every commit | Logic errors | Fast (seconds) |
| **Integration Tests** | PR | Integration bugs | Medium (minutes) |
| **Security Scanning** | PR/Nightly | Dependencies, secrets | Medium (minutes) |
| **Manual Code Review** | PR | Design, readability | Slow (hours) |

**Static analysis finds:** Null pointer bugs, SQL injection, unused variables, complexity issues

**Static analysis does NOT find:** Business logic errors, performance issues (use profiling)

---
## Tool Selection by Language

### Python

| Tool | Purpose | When to Use |
|------|---------|-------------|
| **Pylint** | Code quality, style, bugs | General-purpose, comprehensive |
| **Flake8** | Style, simple bugs | Faster than Pylint, less strict |
| **mypy** | Type checking | Type-safe codebases |
| **Bandit** | Security vulnerabilities | Security-critical code |
| **Black** | Code formatting | Enforce consistent style |

**Recommended combo:** Black (formatting) + Flake8 (linting) + mypy (types) + Bandit (security)

---

### JavaScript/TypeScript

| Tool | Purpose | When to Use |
|------|---------|-------------|
| **ESLint** | Code quality, style, bugs | All JavaScript projects |
| **TypeScript** | Type checking | Type-safe codebases |
| **Prettier** | Code formatting | Enforce consistent style |
| **SonarQube** | Security, bugs, code smells | Enterprise, comprehensive |

**Recommended combo:** Prettier (formatting) + ESLint (linting) + TypeScript (types)

---

### Java

| Tool | Purpose | When to Use |
|------|---------|-------------|
| **Checkstyle** | Code style | Enforce coding standards |
| **PMD** | Bug detection, code smells | General-purpose |
| **SpotBugs** | Bug detection | Bytecode analysis |
| **SonarQube** | Comprehensive analysis | Enterprise, dashboards |

**Recommended combo:** Checkstyle (style) + SpotBugs (bugs) + SonarQube (comprehensive)

---

## Configuration Patterns

### ESLint Configuration (JavaScript)

```javascript
// .eslintrc.js
module.exports = {
  extends: [
    'eslint:recommended',
    'plugin:@typescript-eslint/recommended',
    'plugin:security/recommended'
  ],
  rules: {
    // Error: Block merge
    'no-console': 'error',
    'no-debugger': 'error',
    '@typescript-eslint/no-explicit-any': 'error',

    // Warning: Allow merge, but warn
    'complexity': ['warn', 10],
    'max-lines': ['warn', 500],

    // Off: Too noisy
    'no-unused-vars': 'off', // TypeScript handles this
  }
};
```

**Run in CI:**
```bash
eslint src/ --max-warnings 0  # Fail if any warnings
```

---

### Pylint Configuration (Python)

```ini
# .pylintrc
[MESSAGES CONTROL]
disable=
    missing-docstring,              # Too noisy for small projects
    too-few-public-methods,         # Design choice
    logging-fstring-interpolation   # False positives

[DESIGN]
max-line-length=100
max-args=7
max-locals=15

[BASIC]
good-names=i,j,k,_,id,db,pk
```

**Run in CI:**
```bash
pylint src/ --fail-under=8.0  # Minimum score 8.0/10
```

---

### SonarQube Quality Gates

```properties
# sonar-project.properties
sonar.projectKey=my-project
sonar.sources=src
sonar.tests=tests

# Quality gate thresholds
sonar.qualitygate.wait=true
sonar.coverage.exclusions=**/*_test.py,**/migrations/**

# Fail conditions
sonar.qualitygate.timeout=300
```
**Quality Gate Criteria:**
- **Blocker/Critical issues:** 0 (block merge)
- **Major issues:** < 5 (block merge)
- **Code coverage:** > 80% (warn if lower)
- **Duplicated lines:** < 3%
- **Maintainability rating:** A or B

---

## CI/CD Integration

### GitHub Actions (Python)

```yaml
# .github/workflows/static-analysis.yml
name: Static Analysis

on: [pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install pylint flake8 mypy bandit black

      - name: Check formatting
        run: black --check src/

      - name: Run Flake8
        run: flake8 src/ --max-line-length=100

      - name: Run Pylint
        run: pylint src/ --fail-under=8.0

      - name: Run mypy
        run: mypy src/ --strict

      - name: Run Bandit (security)
        run: bandit -r src/ -ll  # Only medium/high severity
```

---

### GitHub Actions (JavaScript)

```yaml
# .github/workflows/static-analysis.yml
name: Static Analysis

on: [pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Node
        uses: actions/setup-node@v3
        with:
          node-version: '18'

      - name: Install dependencies
        run: npm ci

      - name: Check formatting
        run: npm run format:check  # prettier --check

      - name: Run ESLint
        run: npm run lint  # eslint --max-warnings 0

      - name: Run TypeScript
        run: npm run typecheck  # tsc --noEmit
```

---

## Managing False Positives

**Strategy: Suppress selectively, document why**

### Inline Suppression (ESLint)

```javascript
// eslint-disable-next-line no-console
console.log("Debugging production issue"); // TODO: Remove after fix

// Better: Explain WHY
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const legacyData: any = externalLibrary.getData(); // Library has no types
```

---

### File-Level Suppression (Pylint)

```python
# pylint: disable=too-many-arguments
def complex_function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8):
    """Legacy API - cannot change signature for backward compatibility."""
    pass
```

---

### Configuration Suppression

```ini
# .pylintrc
[MESSAGES CONTROL]
disable=
    fixme,             # Allow TODO comments
    missing-docstring  # Too noisy for this codebase
```

**Rule:** Every suppression needs a comment explaining WHY.
---

## Security-Focused Static Analysis

### Bandit (Python Security)

```yaml
# .bandit.yml
exclude_dirs:
  - /tests
  - /migrations

tests:
  - B201  # Flask debug mode
  - B601  # Parameterized shell calls
  - B602  # Shell injection
  - B608  # SQL injection
```

**Run:**
```bash
bandit -r src/ -ll -x tests/  # Only high/medium severity
```

---

### ESLint Security Plugin (JavaScript)

```javascript
// .eslintrc.js
module.exports = {
  plugins: ['security'],
  extends: ['plugin:security/recommended'],
  rules: {
    'security/detect-object-injection': 'error',
    'security/detect-non-literal-regexp': 'warn',
    'security/detect-unsafe-regex': 'error'
  }
};
```

---

## Code Quality Metrics

### Complexity Analysis

**Cyclomatic complexity:** Measures decision paths through code

```python
# Simple function: Complexity = 1
def add(a, b):
    return a + b


# Complex function: Complexity = 4 (if + 2 elif = 3 decision points + 1 base)
def process_order(order):
    if order.status == "pending":
        return validate(order)
    elif order.status == "confirmed":
        return ship(order)
    elif order.status == "cancelled":
        return refund(order)
    else:
        return reject(order)
```

**Threshold:**
- **< 10:** Acceptable
- **10-20:** Consider refactoring
- **> 20:** Must refactor (untestable)

**Configure:**
```ini
# Pylint
[DESIGN]
max-complexity=10

# ESLint
complexity: ['warn', 10]
```
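
When a function crosses the threshold, the usual fix is to flatten the branching. A minimal refactoring sketch reusing the handlers from the example above:

```python
# Dispatch table: each handler stays trivially testable, and adding a new
# status no longer adds a branch to process_order.
HANDLERS = {
    "pending": validate,
    "confirmed": ship,
    "cancelled": refund,
}


def process_order(order):
    handler = HANDLERS.get(order.status, reject)
    return handler(order)
```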
---

### Duplication Detection

**SonarQube duplication threshold:** < 3%

**Find duplicates (Python):**
```bash
pylint src/ --disable=all --enable=duplicate-code
```

**Find duplicates (JavaScript):**
```bash
jscpd src/  # JavaScript Copy/Paste Detector
```

---

## Anti-Patterns Catalog

### ❌ Suppressing All Warnings

**Symptom:** Config disables most rules

```javascript
// ❌ BAD
module.exports = {
  rules: {
    'no-console': 'off',
    'no-debugger': 'off',
    '@typescript-eslint/no-explicit-any': 'off',
    // ... 50 more disabled rules
  }
};
```

**Why bad:** Static analysis becomes useless

**Fix:** Address root causes, suppress selectively

---

### ❌ No Quality Gates

**Symptom:** Static analysis runs but doesn't block merges

```yaml
# ❌ BAD: Linting failures don't block merge
- name: Run ESLint
  run: eslint src/ || true  # Always succeeds!
```

**Fix:** Fail CI on critical issues

```yaml
# ✅ GOOD
- name: Run ESLint
  run: eslint src/ --max-warnings 0
```

---

### ❌ Ignoring Security Warnings

**Symptom:** Security findings marked as false positives without investigation

```python
# ❌ BAD
cursor.execute(f"SELECT * FROM users WHERE id = {user_id}")  # nosec
```

**Why bad:** Real SQL injection vulnerability ignored

**Fix:** Fix the issue, don't suppress

```python
# ✅ GOOD
cursor.execute("SELECT * FROM users WHERE id = %s", (user_id,))
```

---

### ❌ Running Static Analysis Only on Main Branch

**Symptom:** Issues discovered after merge

**Fix:** Run on every PR

```yaml
on: [pull_request]  # Not just 'push' to main
```

---

## Quality Dashboard Setup

### SonarQube Dashboard

**Key metrics to track:**
1. **Bugs:** Code issues likely to cause failures
2. **Vulnerabilities:** Security issues
3. **Code Smells:** Maintainability issues
4. **Coverage:** Test coverage %
5. **Duplications:** Duplicated code blocks

**Quality Gate Example:**
- Bugs (Blocker/Critical): **0**
- Vulnerabilities (Blocker/Critical): **0**
- Code Smells (Blocker/Critical): **< 5**
- Coverage on new code: **> 80%**
- Duplicated lines on new code: **< 3%**

---

## Gradual Adoption Strategy

**For legacy codebases with thousands of issues:**

### Phase 1: Baseline (Week 1)
```bash
# Run analysis, capture current state
pylint src/ > baseline.txt

# Configure to only fail on NEW issues
# (Track baseline, don't enforce on old code)
```

---

### Phase 2: Block New Issues (Week 2)
```yaml
# Block PRs that introduce NEW issues
- name: Run incremental lint
  run: |
    pylint $(git diff --name-only origin/main...HEAD | grep '\.py$') --fail-under=8.0
```

---

### Phase 3: Fix High-Priority Old Issues (Weeks 3-8)
- Security vulnerabilities first
- Bugs second
- Code smells third

---

### Phase 4: Full Enforcement (Week 9+)
```yaml
# Enforce on entire codebase
- name: Run lint
  run: pylint src/ --fail-under=8.0
```

---

## Bottom Line

**Static analysis catches bugs and security issues before code review. Automate it in CI/CD with quality gates.**

- Choose tools for your language: ESLint (JS), Pylint (Python), Checkstyle (Java)
- Configure thresholds: Block critical issues, warn on moderate, ignore noise
- Run on every PR, fail CI on violations
- Manage false positives selectively with documented suppressions
- Track quality metrics: complexity, duplication, coverage

**If static analysis isn't blocking merges, you're just generating reports nobody reads. Use quality gates.**
255
skills/test-automation-architecture/SKILL.md
Normal file
@@ -0,0 +1,255 @@
---
name: test-automation-architecture
description: Use when organizing test suites, setting up CI/CD testing pipelines, choosing test levels (unit vs integration vs E2E), fixing slow CI feedback, or migrating from inverted test pyramid - provides test pyramid guidance and anti-patterns
---

# Test Automation Architecture

## Overview

**Core principle:** Test pyramid - many fast unit tests, fewer integration tests, fewest E2E tests.

**Target distribution:** 70% unit, 20% integration, 10% E2E

**Flexibility:** Ratios can vary based on constraints (e.g., 80/15/5 if E2E infrastructure is expensive, 60/30/10 for microservices). Key is maintaining pyramid shape - more unit than integration than E2E.

**Starting from zero tests:** Don't try to reach target distribution immediately. Start with unit tests only (Phase 1), add integration (Phase 2), add E2E last (Phase 3). Distribute organically over 6-12 months.

## Test Pyramid Quick Reference

| Test Level | Purpose | Speed | When to Use |
|------------|---------|-------|-------------|
| **Unit** | Test individual functions/methods in isolation | Milliseconds | Business logic, utilities, calculations, error handling |
| **Integration** | Test components working together | Seconds | API contracts, database operations, service interactions |
| **E2E** | Test full user workflows through UI | Minutes | Critical user journeys, revenue flows, compliance paths |

**Rule:** If you can test it at a lower level, do that instead.

## Test Level Selection Guide

| What You're Testing | Test Level | Why |
|---------------------|-----------|-----|
| Function returns correct value | Unit | No external dependencies |
| API endpoint response format | Integration | Tests API contract, not full workflow |
| Database query performance | Integration | Tests DB interaction, not UI |
| User signup → payment flow | E2E | Crosses multiple systems, critical revenue |
| Form validation logic | Unit | Pure function, no UI needed |
| Service A calls Service B correctly | Integration | Tests contract, not user workflow |
| Button click updates state | Unit | Component behavior, no backend |
| Multi-step checkout process | E2E | Critical user journey, revenue impact |

**Guideline:** Unit tests verify "did I build it right?", E2E tests verify "did I build the right thing?"

## Anti-Patterns Catalog

### ❌ Inverted Pyramid
**Symptom:** 500 E2E tests, 100 unit tests

**Why bad:** Slow CI (30min+), brittle tests, hard to debug, expensive maintenance

**Fix:** Migrate 70% of E2E tests down to unit/integration. Use Migration Strategy below.

---

### ❌ All Tests on Every Commit
**Symptom:** Running full 30-minute test suite on every PR

**Why bad:** Slow feedback kills productivity, wastes CI resources

**Fix:** Progressive testing - unit tests on PR, integration on merge, E2E nightly/weekly

---

### ❌ No Test Categorization
**Symptom:** All tests in one folder, one command, one 30-minute run

**Why bad:** Can't run subsets, no fail-fast, poor organization

**Fix:** Separate by level (unit/, integration/, e2e/) with independent configs

---

### ❌ Slow CI Feedback Loop
**Symptom:** Waiting 20+ minutes for test results on every commit

**Why bad:** Context switching, delayed bug detection, reduced productivity

**Fix:** Fail fast - run fastest tests first, parallelize, cache dependencies

---

### ❌ No Fail Fast
**Symptom:** Running all 500 tests even after first test fails

**Why bad:** Wastes CI time, delays feedback

**Fix:** Configure test runner to stop on first failure in CI (not locally)
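
A hedged sketch of one way to do that with pytest: a small wrapper that adds fail-fast options only when a CI environment variable is set (the script name and test path are illustrative):

```python
# run_tests.py - fail fast in CI, run the full suite locally
import os
import sys

import pytest

args = ["tests/"]
if os.environ.get("CI"):
    args.append("--maxfail=1")  # stop after the first failure to free CI resources

sys.exit(pytest.main(args))
```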
## CI/CD Pipeline Patterns

| Event | Run These Tests | Duration Target | Why |
|-------|----------------|-----------------|-----|
| **Every Commit (Pre-Push)** | Lint + unit tests | < 5 min | Fast local feedback |
| **Pull Request** | Lint + unit + integration | < 15 min | Gate before merge, balance speed/coverage |
| **Merge to Main** | All tests (unit + integration + E2E) | < 30 min | Full validation before deployment |
| **Nightly/Scheduled** | Full suite + performance tests | < 60 min | Catch regressions, performance drift |
| **Pre-Deployment** | Smoke tests only (5-10 critical E2E) | < 5 min | Fast production validation |

**Progressive complexity:** Start with just unit tests on PR, add integration after mastering that, add E2E last.

## Folder Structure Patterns

### Basic (Small Projects)
```
tests/
├── unit/
├── integration/
└── e2e/
```

### Mirrored (Medium Projects)
```
src/
├── components/
├── services/
└── utils/
tests/
├── unit/
│   ├── components/
│   ├── services/
│   └── utils/
├── integration/
└── e2e/
```

### Feature-Based (Large Projects)
```
features/
├── auth/
│   ├── src/
│   └── tests/
│       ├── unit/
│       ├── integration/
│       └── e2e/
└── payment/
    ├── src/
    └── tests/
```

**Choose based on:** Team size (<5: Basic, 5-20: Mirrored, 20+: Feature-Based)
## Migration Strategy (Fixing Inverted Pyramid)

If you have 500 E2E tests and 100 unit tests:

**Week 1-2: Audit**
- [ ] Categorize each E2E test: Critical (keep) vs Redundant (migrate)
- [ ] Identify 10-20 critical user journeys
- [ ] Target: Keep 50-100 E2E tests maximum

**Week 3-4: Move High-Value Tests Down**
- [ ] Convert 200 E2E tests → integration tests (test API/services without UI)
- [ ] Convert 100 E2E tests → unit tests (pure logic tests)
- [ ] Delete 100 truly redundant E2E tests

**Week 5-6: Build Unit Test Coverage**
- [ ] Add 200-300 unit tests for untested business logic
- [ ] Target: 400+ unit tests total

**Week 7-8: Reorganize**
- [ ] Split tests into folders (unit/, integration/, e2e/)
- [ ] Create separate test configs
- [ ] Update CI to run progressively

**Expected result:** 400 unit, 200 integration, 100 E2E (roughly 57/29/14 - the pyramid shape is restored, and it converges on 70/20/10 as unit coverage keeps growing)
## Your First CI Pipeline

**Start simple, add complexity progressively:**

**Phase 1 (Week 1):** Unit tests only
```yaml
on: [pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - run: npm run test:unit
```

**Phase 2 (Week 2-3):** Add lint + integration
```yaml
jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - run: npm run lint

  test:
    needs: lint
    runs-on: ubuntu-latest
    steps:
      - run: npm run test:unit
      - run: npm run test:integration
```

**Phase 3 (Week 4+):** Add E2E on main branch
```yaml
jobs:
  e2e:
    if: github.ref == 'refs/heads/main'
    needs: [lint, test]
    runs-on: ubuntu-latest
    steps:
      - run: npm run test:e2e
```

**Don't start with full complexity** - master each phase before adding next.

## Common Mistakes

### ❌ Testing Everything at E2E Level
**Fix:** Use Test Level Selection Guide above. Most tests belong at unit level.

---

### ❌ No Parallel Execution
**Symptom:** Tests run sequentially, taking 30min when they could run in 10min

**Fix:** Run independent test suites in parallel (unit + lint simultaneously)

---

### ❌ No Caching
**Symptom:** Re-downloading dependencies on every CI run (5min wasted)

**Fix:** Cache node_modules, .m2, .gradle based on lock file hash

## Quick Reference

**Test Distribution Target:**
- 70% unit tests (fast, isolated)
- 20% integration tests (component interaction)
- 10% E2E tests (critical user journeys)

**CI Pipeline Events:**
- PR: unit + integration (< 15min)
- Main: all tests (< 30min)
- Deploy: smoke tests only (< 5min)

**Folder Organization:**
- Small team: tests/unit, tests/integration, tests/e2e
- Large team: feature-based with embedded test folders

**Migration Path:**
1. Audit E2E tests
2. Move 70% down to unit/integration
3. Add missing unit tests
4. Reorganize folders
5. Update CI pipeline

## Bottom Line

**Many fast tests beat few slow tests.**

Test pyramid exists because it balances confidence (E2E) with speed (unit). Organize tests by level, run progressively in CI, fail fast.
419
skills/test-data-management/SKILL.md
Normal file
@@ -0,0 +1,419 @@
---
name: test-data-management
description: Use when fixing flaky tests from data pollution, choosing between fixtures and factories, setting up test data isolation, handling PII in tests, or seeding test databases - provides isolation strategies and anti-patterns
---

# Test Data Management

## Overview

**Core principle:** Test isolation first. Each test should work independently regardless of execution order.

**Rule:** Never use production data in tests without anonymization.

## Test Isolation Decision Tree

| Symptom | Root Cause | Solution |
|---------|------------|----------|
| Tests pass alone, fail together | Shared database state | Use transactions with rollback |
| Tests fail intermittently | Race conditions on shared data | Use unique IDs per test |
| Tests leave data behind | No cleanup | Add explicit teardown fixtures |
| Slow test setup/teardown | Creating too much data | Use factories, minimal data |
| Can't reproduce failures | Non-deterministic data | Use fixtures with static data |

**Primary strategy:** Database transactions (wrap test in transaction, rollback after). Fastest and most reliable.

## Fixtures vs Factories Quick Guide

| Use Fixtures (Static Files) | Use Factories (Code Generators) |
|------------------------------|----------------------------------|
| Integration/contract tests | Unit tests |
| Realistic complex scenarios | Need many variations |
| Specific edge cases to verify | Simple "valid object" needed |
| Team needs to review data | Randomized/parameterized tests |
| Data rarely changes | Frequent maintenance |

**Decision:** Static, complex, reviewable → Fixtures. Dynamic, simple, variations → Factories.

**Hybrid (recommended):** Fixtures for integration tests, factories for unit tests.

## Anti-Patterns Catalog

### ❌ Shared Test Data
**Symptom:** All tests use same "test_user_123" in database

**Why bad:** Tests pollute each other, fail when run in parallel, can't isolate failures

**Fix:** Each test creates its own data with unique IDs or uses transactions

---

### ❌ No Cleanup Strategy
**Symptom:** Database grows with every test run, tests fail on second run

**Why bad:** Leftover data causes unique constraint violations, flaky tests

**Fix:** Use transaction rollback or explicit teardown fixtures

---

### ❌ Production Data in Tests
**Symptom:** Copying production database to test environment

**Why bad:** Privacy violations (GDPR, CCPA), security risk, compliance issues

**Fix:** Use synthetic data generation or anonymized/masked data

---

### ❌ Hardcoded Test Data
**Symptom:** Every test creates `User(name="John", email="john@test.com")`

**Why bad:** Violates DRY, maintenance nightmare when schema changes, no variations

**Fix:** Use factories to generate test data programmatically

---

### ❌ Copy-Paste Fixtures
**Symptom:** 50 nearly-identical JSON fixture files

**Why bad:** Hard to maintain, changes require updating all copies

**Fix:** Use fixture inheritance or factory-generated fixtures
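
A hedged sketch of the factory-generated approach: one canonical payload plus per-test overrides replaces dozens of nearly identical JSON files (the invoice fields are illustrative):

```python
import copy

BASE_INVOICE = {
    "id": "inv-001",
    "currency": "USD",
    "items": [{"sku": "A-1", "qty": 1, "price": 99.00}],
}


def make_invoice(**overrides):
    """Return a deep copy of the base fixture with per-test overrides applied."""
    invoice = copy.deepcopy(BASE_INVOICE)
    invoice.update(overrides)
    return invoice


# Variations stay one-liners instead of separate files:
empty_invoice = make_invoice(items=[])
eur_invoice = make_invoice(currency="EUR")
```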
## Isolation Strategies Quick Reference

| Strategy | Speed | Use When | Pros | Cons |
|----------|-------|----------|------|------|
| **Transactions (Rollback)** | Fast | Database tests | No cleanup code, bulletproof | DB only |
| **Unique IDs (UUID/timestamp)** | Fast | Parallel tests, external APIs | No conflicts | Still needs cleanup |
| **Explicit Cleanup (Teardown)** | Medium | Files, caches, APIs | Works for anything | Manual code |
| **In-Memory Database** | Fastest | Unit tests | Complete isolation | Not production-like |
| **Test Containers** | Medium | Integration tests | Production-like | Slower startup |

**Recommended order:** Try transactions first, add unique IDs for parallelization, explicit cleanup as last resort.

## Data Privacy Quick Guide

| Data Type | Strategy | Why |
|-----------|----------|-----|
| **PII (names, emails, addresses)** | Synthetic generation (Faker) | Avoid legal risk |
| **Payment data** | NEVER use production | PCI-DSS compliance |
| **Health data** | Anonymize + subset | HIPAA compliance |
| **Sensitive business data** | Mask or synthesize | Protect IP |
| **Non-sensitive metadata** | Can use production | ID mappings, timestamps OK if no PII |

**Default rule:** When in doubt, use synthetic data.
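
A minimal sketch of synthetic PII with Faker (the customer schema is an assumption - shape it to match your own models):

```python
from faker import Faker

fake = Faker()


def synthetic_customer():
    """Generate a realistic-looking customer record containing no real PII."""
    return {
        "name": fake.name(),
        "email": fake.email(),
        "address": fake.address(),
        "phone": fake.phone_number(),
    }
```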
## Your First Test Data Setup

**Start minimal, add complexity only when needed:**

**Phase 1: Transactions (Week 1)**
```python
@pytest.fixture
def db_session(db_engine):
    connection = db_engine.connect()
    transaction = connection.begin()
    session = Session(bind=connection)

    yield session

    transaction.rollback()
    connection.close()
```

**Phase 2: Add Factories (Week 2)**
```python
class UserFactory:
    @staticmethod
    def create(**overrides):
        defaults = {
            "id": str(uuid4()),
            "email": f"test_{uuid4()}@example.com",
            "created_at": datetime.now()
        }
        return {**defaults, **overrides}
```

**Phase 3: Add Fixtures for Complex Cases (Week 3+)**
```json
// tests/fixtures/valid_invoice.json
{
  "id": "inv-001",
  "items": [/* complex nested data */],
  "total": 107.94
}
```

**Don't start with full complexity.** Master transactions first.

## Non-Database Resource Isolation

Database transactions don't work for files, caches, message queues, or external services. Use **explicit cleanup with unique namespacing**.

### Temporary Files Strategy

**Recommended:** Python's `tempfile` module (automatic cleanup)

```python
import tempfile
from pathlib import Path

@pytest.fixture
def temp_workspace():
    """Isolated temporary directory for test"""
    with tempfile.TemporaryDirectory(prefix="test_") as tmp_dir:
        yield Path(tmp_dir)
    # Automatic cleanup on exit
```

**Alternative (manual control):**
```python
from uuid import uuid4
import shutil

@pytest.fixture
def temp_dir():
    test_dir = Path(f"/tmp/test_{uuid4()}")
    test_dir.mkdir(parents=True)

    yield test_dir

    shutil.rmtree(test_dir, ignore_errors=True)
```

### Redis/Cache Isolation Strategy

**Option 1: Unique key namespace per test (lightweight)**

```python
@pytest.fixture
def redis_namespace(redis_client):
    """Namespaced Redis keys with automatic cleanup"""
    namespace = f"test:{uuid4()}"

    yield namespace

    # Cleanup: Delete all keys with this namespace
    for key in redis_client.scan_iter(f"{namespace}:*"):
        redis_client.delete(key)


def test_caching(redis_namespace, redis_client):
    key = f"{redis_namespace}:user:123"
    redis_client.set(key, "value")
    # Automatic cleanup after test
```

**Option 2: Separate Redis database per test (stronger isolation)**

```python
@pytest.fixture
def isolated_redis():
    """Use Redis DB 1-15 for tests (DB 0 for dev)"""
    import random
    test_db = random.randint(1, 15)
    client = redis.Redis(db=test_db)

    yield client

    client.flushdb()  # Clear entire test database
```

**Option 3: Test containers (best isolation, slower)**

```python
from testcontainers.redis import RedisContainer

@pytest.fixture(scope="session")
def redis_container():
    with RedisContainer() as container:
        yield container

@pytest.fixture
def redis_client(redis_container):
    client = redis.from_url(redis_container.get_connection_url())
    yield client
    client.flushdb()
```

### Combined Resource Cleanup

When tests use database + files + cache:

```python
@pytest.fixture
def isolated_test_env(db_session, temp_workspace, redis_namespace):
    """Combined isolation for all resources"""
    yield {
        "db": db_session,
        "files": temp_workspace,
        "cache_ns": redis_namespace
    }
    # Teardown automatic via dependent fixtures
    # Order: External resources first, DB last
```

### Quick Decision Guide

| Resource Type | Isolation Strategy | Cleanup Method |
|---------------|-------------------|----------------|
| **Temporary files** | Unique directory per test | `tempfile.TemporaryDirectory()` |
| **Redis cache** | Unique key namespace | Delete by pattern in teardown |
| **Message queues** | Unique queue name | Delete queue in teardown |
| **External APIs** | Unique resource IDs | DELETE requests in teardown |
| **Test containers** | Per-test container | Container auto-cleanup |

**Rule:** If transactions don't work, use unique IDs + explicit cleanup.

## Test Containers Pattern

**Core principle:** Session-scoped container + transaction rollback per test.

**Don't recreate containers per test** - startup overhead kills performance.

### SQL Database Containers (PostgreSQL, MySQL)

**Recommended:** Session-scoped container + transactional fixtures

```python
from testcontainers.postgres import PostgresContainer
import pytest

@pytest.fixture(scope="session")
def postgres_container():
    """Container lives for entire test run"""
    with PostgresContainer("postgres:15") as container:
        yield container
    # Auto-cleanup after all tests

@pytest.fixture
def db_session(postgres_container):
    """Transaction per test - fast isolation"""
    engine = create_engine(postgres_container.get_connection_url())
    connection = engine.connect()
    transaction = connection.begin()
    session = Session(bind=connection)

    yield session

    transaction.rollback()  # <1ms cleanup
    connection.close()
```

**Performance:**
- Container startup: 5-10 seconds (once per test run)
- Transaction rollback: <1ms per test
- 100 tests: ~10 seconds total vs 8-16 minutes if recreating container per test

**When to recreate container:**
- Testing database migrations (need clean schema each time)
- Testing database extensions/configuration changes
- Container state itself is under test

**For data isolation:** Transactions within shared container always win.

### NoSQL/Cache Containers (Redis, MongoDB)

Use session-scoped container + flush per test:

```python
from testcontainers.redis import RedisContainer

@pytest.fixture(scope="session")
def redis_container():
    """Container lives for entire test run"""
    with RedisContainer() as container:
        yield container

@pytest.fixture
def redis_client(redis_container):
    """Fresh client per test"""
    client = redis.from_url(redis_container.get_connection_url())
    yield client
    client.flushdb()  # Clear after test
```

### Container Scope Decision

| Use Case | Container Scope | Data Isolation Strategy |
|----------|-----------------|------------------------|
| SQL database tests | `scope="session"` | Transaction rollback per test |
| NoSQL cache tests | `scope="session"` | Flush database per test |
| Migration testing | `scope="function"` | Fresh schema per test |
| Service integration | `scope="session"` | Unique IDs + cleanup per test |

**Default:** Session scope + transaction/flush per test (100x faster).

## Common Mistakes

### ❌ Creating Full Objects When Partial Works
**Symptom:** Test needs user ID, creates full user with 20 fields

**Fix:** Create minimal valid object:
```python
# ❌ Bad
user = UserFactory.create(
    name="Test", email="test@example.com",
    address="123 St", phone="555-1234",
    # ... 15 more fields
)

# ✅ Good
user = {"id": str(uuid4())}  # If only ID needed
```

---

### ❌ No Transaction Isolation for Database Tests
**Symptom:** Writing manual cleanup code for every database test

**Fix:** Use transactional fixtures. Wrap in transaction, automatic rollback.

---

### ❌ Testing With Timestamps That Fail at Midnight
**Symptom:** Tests pass during day, fail at exactly midnight

**Fix:** Mock system time or use relative dates:
```python
# ❌ Bad
assert created_at.date() == datetime.now().date()

# ✅ Good
from freezegun import freeze_time
@freeze_time("2025-11-15 12:00:00")
def test_timestamp():
    assert created_at.date() == date(2025, 11, 15)
```

## Quick Reference

**Test Isolation Priority:**
1. Database tests → Transactions (rollback)
2. Parallel execution → Unique IDs (UUID)
3. External services → Explicit cleanup
4. Files/caches → Teardown fixtures

**Fixtures vs Factories:**
- Complex integration scenario → Fixture
- Simple unit test → Factory
- Need variations → Factory
- Specific edge case → Fixture

**Data Privacy:**
- PII/sensitive → Synthetic data (Faker, custom generators)
- Never production payment/health data
- Mask if absolutely need production structure

**Getting Started:**
1. Add transaction fixtures (Week 1)
2. Add factory for common objects (Week 2)
3. Add complex fixtures as needed (Week 3+)

## Bottom Line

**Test isolation prevents flaky tests.**

Use transactions for database tests (fastest, cleanest). Use factories for unit tests (flexible, DRY). Use fixtures for complex integration scenarios (realistic, reviewable). Never use production data without anonymization.
663
skills/test-isolation-fundamentals/SKILL.md
Normal file
@@ -0,0 +1,663 @@
---
name: test-isolation-fundamentals
description: Use when tests fail together but pass alone, diagnosing test pollution, ensuring test independence and idempotence, managing shared state, or designing parallel-safe tests - provides isolation principles, database/file/service patterns, and cleanup strategies
---

# Test Isolation Fundamentals

## Overview

**Core principle:** Each test must work independently, regardless of execution order or parallel execution.

**Rule:** If a test fails when run with other tests but passes alone, you have an isolation problem. Fix it before adding more tests.

## When You Have Isolation Problems

**Symptoms:**
- Tests pass individually: `pytest test_checkout.py` ✓
- Tests fail in full suite: `pytest` ✗
- Errors like "User already exists", "Expected empty but found data"
- Tests fail randomly or only in CI
- Different results when tests run in different orders

**Root cause:** Tests share mutable state without cleanup.

## The Five Principles

### 1. Order-Independence

**Tests must pass regardless of execution order.**

```bash
# All of these must produce identical results
pytest tests/                 # alphabetical order
pytest tests/ --random-order  # random order
pytest tests/ --reverse       # reverse order
```

**Anti-pattern:**
```python
# ❌ BAD: Test B depends on Test A running first
def test_create_user():
    db.users.insert({"id": 1, "name": "Alice"})


def test_update_user():
    db.users.update({"id": 1}, {"name": "Bob"})  # Assumes Alice exists!
```

**Fix:** Each test creates its own data.
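
A minimal corrected sketch, using the same illustrative `db` API as the anti-pattern above (the `find_one` read is equally illustrative):

```python
# ✅ GOOD: the test arranges the row it needs, so execution order no longer matters
def test_update_user():
    db.users.insert({"id": 42, "name": "Alice"})
    db.users.update({"id": 42}, {"name": "Bob"})
    assert db.users.find_one({"id": 42})["name"] == "Bob"
```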
---

### 2. Idempotence

**Running a test twice produces the same result both times.**

```bash
# Both runs must pass
pytest test_checkout.py  # First run
pytest test_checkout.py  # Second run (same result)
```

**Anti-pattern:**
```python
# ❌ BAD: Second run fails on unique constraint
def test_signup():
    user = create_user(email="test@example.com")
    assert user.id is not None
    # No cleanup - second run fails: "email already exists"
```

**Fix:** Clean up data after test OR use unique data per run.
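
A hedged sketch of both options, reusing `create_user`/`delete_user` from the patterns later in this skill:

```python
import uuid

import pytest


# Option A: unique data per run - the test never collides with itself
def test_signup_unique_email():
    user = create_user(email=f"test-{uuid.uuid4()}@example.com")
    assert user.id is not None


# Option B: clean up after the test so a rerun starts from the same state
@pytest.fixture
def signup_user():
    user = create_user(email="test@example.com")
    yield user
    delete_user(user.id)
```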
---

### 3. Fresh State

**Each test starts with a clean slate.**

**What needs to be fresh:**
- Database records
- Files and directories
- In-memory caches
- Global variables
- Module-level state
- Environment variables
- Network sockets/ports
- Background processes

**Anti-pattern:**
```python
# ❌ BAD: Shared mutable global state
cache = {}  # Module-level global

def test_cache_miss():
    assert get_from_cache("key1") is None  # Passes first time
    cache["key1"] = "value"  # Pollutes global state

def test_cache_lookup():
    assert get_from_cache("key1") is None  # Fails if previous test ran!
```
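
A minimal fix sketch: reset the module-level cache around every test so neither test can see the other's writes.

```python
import pytest


@pytest.fixture(autouse=True)
def fresh_cache():
    cache.clear()  # start each test with an empty cache
    yield
    cache.clear()  # and leave nothing behind for the next test
```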
---

### 4. Explicit Scope

**Know what state is shared vs isolated.**

**Test scopes (pytest):**
- `scope="function"` - Fresh per test (default, safest)
- `scope="class"` - Shared across test class
- `scope="module"` - Shared across file
- `scope="session"` - Shared across entire test run

**Rule:** Default to `scope="function"`. Only use broader scopes for expensive resources that are READ-ONLY.

```python
# ✅ GOOD: Expensive read-only data can be shared
@pytest.fixture(scope="session")
def large_config_file():
    return load_config("data.json")  # Expensive, never modified

# ❌ BAD: Mutable data shared across tests
@pytest.fixture(scope="session")
def database():
    return Database()  # Tests will pollute each other!

# ✅ GOOD: Mutable data fresh per test
@pytest.fixture(scope="function")
def database():
    db = Database()
    yield db
    db.cleanup()  # Fresh per test
```

---

### 5. Parallel Safety

**Tests must work when run concurrently.**

```bash
pytest -n 4  # Run tests across 4 parallel workers with pytest-xdist
```

**Parallel-unsafe patterns:**
- Shared files without unique names
- Fixed network ports
- Singleton databases
- Global module state
- Fixed temp directories

**Fix:** Use unique identifiers per test (UUIDs, process IDs, random ports).
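
For the fixed-port case, a hedged sketch: ask the OS for an unused port per test instead of hard-coding one.

```python
import socket

import pytest


@pytest.fixture
def free_port():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(("127.0.0.1", 0))  # port 0 = let the OS pick an unused port
    port = s.getsockname()[1]
    s.close()  # small race window, but the standard pattern for test servers
    return port
```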
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Isolation Patterns by Resource Type
|
||||||
|
|
||||||
|
### Database Isolation
|
||||||
|
|
||||||
|
**Pattern 1: Transactions with Rollback (Fastest, Recommended)**
|
||||||
|
|
||||||
|
```python
import pytest
from sqlalchemy.orm import Session


@pytest.fixture
def db_session(db_engine):
    """Each test gets a fresh DB session whose changes are rolled back.

    Assumes a session-scoped `db_engine` fixture providing the SQLAlchemy engine.
    """
    connection = db_engine.connect()
    transaction = connection.begin()
    session = Session(bind=connection)

    yield session

    session.close()
    transaction.rollback()  # Undo all changes made during the test
    connection.close()
```
|
||||||
|
|
||||||
|
**Why it works:**
|
||||||
|
- No cleanup code needed - rollback is automatic
|
||||||
|
- Fast (<1ms per test)
|
||||||
|
- Works with ANY database (PostgreSQL, MySQL, SQLite, Oracle)
|
||||||
|
- Handles FK relationships automatically
|
||||||
|
|
||||||
|
**When NOT to use:**
|
||||||
|
- Testing actual commits
|
||||||
|
- Testing transaction isolation levels
|
||||||
|
- Multi-database transactions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Pattern 2: Unique Data Per Test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import uuid
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def unique_user():
|
||||||
|
"""Each test gets a unique user."""
|
||||||
|
email = f"test-{uuid.uuid4()}@example.com"
|
||||||
|
user = create_user(email=email, name="Test User")
|
||||||
|
|
||||||
|
yield user
|
||||||
|
|
||||||
|
# Optional cleanup (or rely on test DB being dropped)
|
||||||
|
delete_user(user.id)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why it works:**
|
||||||
|
- Tests don't interfere (different users)
|
||||||
|
- Can run in parallel
|
||||||
|
- Idempotent (UUID ensures uniqueness)
|
||||||
|
|
||||||
|
**When to use:**
|
||||||
|
- Testing with real databases
|
||||||
|
- Parallel test execution
|
||||||
|
- Integration tests that need real commits
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Pattern 3: Test Database Per Test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
@pytest.fixture
|
||||||
|
def isolated_db():
|
||||||
|
"""Each test gets its own temporary database."""
|
||||||
|
db_name = f"test_db_{uuid.uuid4().hex}"
|
||||||
|
create_database(db_name)
|
||||||
|
|
||||||
|
yield get_connection(db_name)
|
||||||
|
|
||||||
|
drop_database(db_name)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why it works:**
|
||||||
|
- Complete isolation
|
||||||
|
- Can test schema migrations
|
||||||
|
- No cross-test pollution
|
||||||
|
|
||||||
|
**When NOT to use:**
|
||||||
|
- Unit tests (too slow)
|
||||||
|
- Large test suites (overhead adds up)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### File System Isolation
|
||||||
|
|
||||||
|
**Pattern: Temporary Directories**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def temp_workspace():
|
||||||
|
"""Each test gets a fresh temporary directory."""
|
||||||
|
tmpdir = tempfile.mkdtemp(prefix="test_")
|
||||||
|
|
||||||
|
yield tmpdir
|
||||||
|
|
||||||
|
shutil.rmtree(tmpdir) # Clean up
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parallel-safe version:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
@pytest.fixture
|
||||||
|
def temp_workspace(tmp_path):
|
||||||
|
"""pytest's tmp_path is automatically unique per test."""
|
||||||
|
workspace = tmp_path / "workspace"
|
||||||
|
workspace.mkdir()
|
||||||
|
|
||||||
|
yield workspace
|
||||||
|
|
||||||
|
# No cleanup needed - pytest handles it
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why it works:**
|
||||||
|
- Each test writes to different directory
|
||||||
|
- Parallel-safe (unique paths)
|
||||||
|
- Automatic cleanup
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Service/API Isolation
|
||||||
|
|
||||||
|
**Pattern: Mocking External Services**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_stripe():
|
||||||
|
"""Mock Stripe API for all tests."""
|
||||||
|
with patch('stripe.Charge.create') as mock:
|
||||||
|
mock.return_value = MagicMock(id="ch_test123", status="succeeded")
|
||||||
|
yield mock
|
||||||
|
```
|
||||||
|
|
||||||
|
**When to use:**
|
||||||
|
- External APIs (Stripe, Twilio, SendGrid)
|
||||||
|
- Slow services
|
||||||
|
- Non-deterministic responses
|
||||||
|
- Services that cost money per call
|
||||||
|
|
||||||
|
**When NOT to use:**
|
||||||
|
- Testing integration with real service (use separate integration test suite)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### In-Memory Cache Isolation
|
||||||
|
|
||||||
|
**Pattern: Clear Cache Before Each Test**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def clear_cache():
|
||||||
|
"""Automatically clear cache before each test."""
|
||||||
|
cache.clear()
|
||||||
|
yield
|
||||||
|
# Optional: clear after test too
|
||||||
|
cache.clear()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why `autouse=True`:** Runs automatically for every test without explicit declaration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Process/Port Isolation
|
||||||
|
|
||||||
|
**Pattern: Dynamic Port Allocation**
|
||||||
|
|
||||||
|
```python
|
||||||
|
import socket
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
def get_free_port():
|
||||||
|
"""Find an available port."""
|
||||||
|
sock = socket.socket()
|
||||||
|
sock.bind(('', 0))
|
||||||
|
port = sock.getsockname()[1]
|
||||||
|
sock.close()
|
||||||
|
return port
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def test_server():
|
||||||
|
"""Each test gets a server on a unique port."""
|
||||||
|
port = get_free_port()
|
||||||
|
server = start_server(port=port)
|
||||||
|
|
||||||
|
yield f"http://localhost:{port}"
|
||||||
|
|
||||||
|
server.stop()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why it works:**
|
||||||
|
- Tests can run in parallel (different ports)
|
||||||
|
- No port conflicts
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Doubles: When to Use What
|
||||||
|
|
||||||
|
| Type | Purpose | Example |
|
||||||
|
|------|---------|---------|
|
||||||
|
| **Stub** | Returns hardcoded values | `getUser() → {id: 1, name: "Alice"}` |
|
||||||
|
| **Mock** | Verifies calls were made | `assert emailService.send.called` |
|
||||||
|
| **Fake** | Working implementation, simplified | In-memory database instead of PostgreSQL |
|
||||||
|
| **Spy** | Records calls for later inspection | Logs all method calls |
|
||||||
|
|
||||||
|
**Decision tree:**
|
||||||
|
|
||||||
|
```
|
||||||
|
Do you need to verify the call was made?
|
||||||
|
YES → Use Mock
|
||||||
|
NO → Do you need a working implementation?
|
||||||
|
YES → Use Fake
|
||||||
|
NO → Use Stub
|
||||||
|
```
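A compact illustration of the three most common doubles in Python; `send_welcome` and the repositories are hypothetical stand-ins:

```python
from unittest.mock import MagicMock


def send_welcome(email_service, user):
    """Hypothetical code under test."""
    email_service.send(to=user["email"], subject="Welcome!")


# Stub: hardcoded return value, no verification
stub_repo = MagicMock()
stub_repo.get_user.return_value = {"id": 1, "email": "alice@example.com"}


# Mock: verify the call was made
def test_welcome_email_sent():
    mock_email = MagicMock()
    send_welcome(mock_email, stub_repo.get_user(1))
    mock_email.send.assert_called_once()


# Fake: simplified working implementation
class FakeUserRepo:
    def __init__(self):
        self._users = {}

    def add(self, user):
        self._users[user["id"]] = user

    def get_user(self, user_id):
        return self._users.get(user_id)
```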
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Diagnosing Isolation Problems
|
||||||
|
|
||||||
|
### Step 1: Identify Flaky Tests
|
||||||
|
|
||||||
|
```bash
# Run the test 100 times to expose flakiness (requires the pytest-repeat plugin)
pytest --count=100 test_checkout.py

# Run in random order (requires the pytest-random-order plugin)
pytest --random-order
```
|
||||||
|
|
||||||
|
**Interpretation:**
|
||||||
|
- Passes 100/100 → No flakiness detected (a strong signal, not proof)
|
||||||
|
- Passes 95/100 → Flaky (5% failure rate)
|
||||||
|
- Failures are random → Parallel unsafe OR order-dependent
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Step 2: Find Which Tests Interfere
|
||||||
|
|
||||||
|
**Run tests in isolation:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test A alone
|
||||||
|
pytest test_a.py # ✓ Passes
|
||||||
|
|
||||||
|
# Test B alone
|
||||||
|
pytest test_b.py # ✓ Passes
|
||||||
|
|
||||||
|
# Both together
|
||||||
|
pytest test_a.py test_b.py # ✗ Test B fails
|
||||||
|
|
||||||
|
# Conclusion: Test A pollutes state that Test B depends on
|
||||||
|
```
|
||||||
|
|
||||||
|
**Reverse the order:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pytest test_b.py test_a.py # Does Test A fail now?
|
||||||
|
```
|
||||||
|
|
||||||
|
- If YES: Bidirectional pollution
|
||||||
|
- If NO: Test A pollutes, Test B is victim
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Step 3: Identify Shared State
|
||||||
|
|
||||||
|
**Add diagnostic logging:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def log_state():
|
||||||
|
"""Log state before/after each test."""
|
||||||
|
print(f"Before: DB has {db.count()} records")
|
||||||
|
yield
|
||||||
|
print(f"After: DB has {db.count()} records")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Look for:**
|
||||||
|
- Record count increasing over time (no cleanup)
|
||||||
|
- Files accumulating
|
||||||
|
- Cache growing
|
||||||
|
- Ports in use
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Step 4: Audit for Global State
|
||||||
|
|
||||||
|
**Search codebase for isolation violations:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Module-level globals
|
||||||
|
grep -r "^[A-Z_]* = " app/
|
||||||
|
|
||||||
|
# Global caches
|
||||||
|
grep -r "cache = " app/
|
||||||
|
|
||||||
|
# Singletons
|
||||||
|
grep -r "@singleton" app/
|
||||||
|
grep -r "class.*Singleton" app/
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Cleanup Code Instead of Structural Isolation
|
||||||
|
|
||||||
|
**Symptom:** Every test has teardown code to clean up
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_checkout():
|
||||||
|
user = create_user()
|
||||||
|
cart = create_cart(user)
|
||||||
|
|
||||||
|
checkout(cart)
|
||||||
|
|
||||||
|
# Teardown
|
||||||
|
delete_cart(cart.id)
|
||||||
|
delete_user(user.id)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why bad:**
|
||||||
|
- If test fails before cleanup, state pollutes
|
||||||
|
- If cleanup has bugs, state pollutes
|
||||||
|
- Forces sequential execution (no parallelism)
|
||||||
|
|
||||||
|
**Fix:** Use transactions, unique IDs, or dependency injection
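For contrast, a sketch of the same test leaning on structural isolation instead of teardown, assuming the transactional `db_session` and `unique_user` fixtures shown earlier and session-aware helpers (hypothetical signatures):

```python
def test_checkout(db_session, unique_user):
    cart = create_cart(unique_user, session=db_session)  # hypothetical helper
    checkout(cart, session=db_session)

    assert cart.status == "completed"
    # No teardown needed: the fixture rolls the transaction back
```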
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Shared Test Fixtures
|
||||||
|
|
||||||
|
**Symptom:** Fixtures modify mutable state
|
||||||
|
|
||||||
|
```python
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def user():
|
||||||
|
return create_user(email="test@example.com")
|
||||||
|
|
||||||
|
def test_update_name(user):
|
||||||
|
user.name = "Alice" # Modifies shared fixture!
|
||||||
|
save(user)
|
||||||
|
|
||||||
|
def test_update_email(user):
|
||||||
|
# Expects name to be original, but Test 1 changed it!
|
||||||
|
assert user.name == "Test User" # FAILS
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why bad:** Tests interfere when fixture is modified
|
||||||
|
|
||||||
|
**Fix:** Use `scope="function"` for mutable fixtures
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Hidden Dependencies on Execution Order
|
||||||
|
|
||||||
|
**Symptom:** Test suite has implicit execution order
|
||||||
|
|
||||||
|
```python
|
||||||
|
# test_a.py
|
||||||
|
def test_create_admin():
|
||||||
|
create_user(email="admin@example.com", role="admin")
|
||||||
|
|
||||||
|
# test_b.py
|
||||||
|
def test_admin_permissions():
|
||||||
|
admin = get_user("admin@example.com") # Assumes test_a ran!
|
||||||
|
assert admin.has_permission("delete_users")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why bad:** Breaks when tests run in different order or in parallel
|
||||||
|
|
||||||
|
**Fix:** Each test creates its own dependencies
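A sketch of the fix, reusing the unique-data idea so the test is also parallel-safe:

```python
import uuid


def test_admin_permissions():
    admin = create_user(
        email=f"admin-{uuid.uuid4()}@example.com", role="admin"
    )
    assert admin.has_permission("delete_users")
```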
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Testing on Production-Like State
|
||||||
|
|
||||||
|
**Symptom:** Tests run against shared database with existing data
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_user_count():
|
||||||
|
assert db.users.count() == 100 # Assumes specific state!
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why bad:**
|
||||||
|
- Tests fail when data changes
|
||||||
|
- Can't run in parallel
|
||||||
|
- Can't run idempotently
|
||||||
|
|
||||||
|
**Fix:** Use isolated test database or count relative to test's own data
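A sketch of counting relative to the test's own inserts, assuming a `db` fixture exposing the same `db.users.count()` interface as the anti-pattern above plus the `create_user` helper:

```python
import uuid


def test_user_count(db):
    before = db.users.count()
    create_user(email=f"count-{uuid.uuid4()}@example.com")
    assert db.users.count() == before + 1  # relative, not absolute
```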
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Scenarios
|
||||||
|
|
||||||
|
### Scenario 1: "Tests pass locally, fail in CI"
|
||||||
|
|
||||||
|
**Likely causes:**
|
||||||
|
1. **Timing issues** - CI is slower/faster, race conditions exposed
|
||||||
|
2. **Parallel execution** - CI runs tests in parallel, local doesn't
|
||||||
|
3. **Missing cleanup** - Local has leftover state, CI is fresh
|
||||||
|
|
||||||
|
**Diagnosis:**
|
||||||
|
```bash
|
||||||
|
# Test parallel execution locally
|
||||||
|
pytest -n 4
|
||||||
|
|
||||||
|
# Test with clean state
|
||||||
|
rm -rf .pytest_cache && pytest
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Scenario 2: "Random test failures that disappear on retry"
|
||||||
|
|
||||||
|
**Likely causes:**
|
||||||
|
1. **Race conditions** - Async operations not awaited
|
||||||
|
2. **Shared mutable state** - Global variables polluted
|
||||||
|
3. **External service flakiness** - Real APIs being called
|
||||||
|
|
||||||
|
**Diagnosis:**
|
||||||
|
```bash
# Run the same test 100 times (requires the pytest-repeat plugin)
pytest --count=100 test_flaky.py

# If the failure rate is consistent (e.g., 5/100), suspect shared state
# If the failure rate varies wildly between runs, suspect a race condition
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Scenario 3: "Database unique constraint violations"
|
||||||
|
|
||||||
|
**Symptom:** `IntegrityError: duplicate key value violates unique constraint`
|
||||||
|
|
||||||
|
**Cause:** Tests reuse same email/username/ID
|
||||||
|
|
||||||
|
**Fix:**
|
||||||
|
```python
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def unique_user():
|
||||||
|
email = f"test-{uuid.uuid4()}@example.com"
|
||||||
|
return create_user(email=email)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Reference: Isolation Strategy Decision Tree
|
||||||
|
|
||||||
|
```
|
||||||
|
What resource needs isolation?
|
||||||
|
|
||||||
|
DATABASE
|
||||||
|
├─ Can you use transactions? → Transaction Rollback (fastest)
|
||||||
|
├─ Need real commits? → Unique Data Per Test
|
||||||
|
└─ Need schema changes? → Test Database Per Test
|
||||||
|
|
||||||
|
FILES
|
||||||
|
├─ Few files? → pytest's tmp_path
|
||||||
|
└─ Complex directories? → tempfile.mkdtemp()
|
||||||
|
|
||||||
|
EXTERNAL SERVICES
|
||||||
|
├─ Testing integration? → Separate integration test suite
|
||||||
|
└─ Testing business logic? → Mock the service
|
||||||
|
|
||||||
|
IN-MEMORY STATE
|
||||||
|
├─ Caches → Clear before each test (autouse fixture)
|
||||||
|
├─ Globals → Dependency injection (refactor)
|
||||||
|
└─ Module-level → Reset in fixture or avoid entirely
|
||||||
|
|
||||||
|
PROCESSES/PORTS
|
||||||
|
└─ Dynamic port allocation per test
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Test isolation is structural, not reactive.**
|
||||||
|
|
||||||
|
- ❌ **Reactive:** Write cleanup code after each test
|
||||||
|
- ✅ **Structural:** Design tests so cleanup isn't needed
|
||||||
|
|
||||||
|
**The hierarchy:**
|
||||||
|
1. **Best:** Dependency injection (no shared state)
|
||||||
|
2. **Good:** Transactions/tmp_path (automatic cleanup)
|
||||||
|
3. **Acceptable:** Unique data per test (explicit isolation)
|
||||||
|
4. **Last resort:** Manual cleanup (fragile, error-prone)
|
||||||
|
|
||||||
|
**If your tests fail together but pass alone, you have an isolation problem. Stop adding tests and fix isolation first.**
|
||||||
500
skills/test-maintenance-patterns/SKILL.md
Normal file
@@ -0,0 +1,500 @@
|
|||||||
|
---
|
||||||
|
name: test-maintenance-patterns
|
||||||
|
description: Use when reducing test duplication, refactoring flaky tests, implementing page object patterns, managing test helpers, reducing test debt, or scaling test suites - provides refactoring strategies and maintainability patterns for long-term test sustainability
|
||||||
|
---
|
||||||
|
|
||||||
|
# Test Maintenance Patterns
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Test code is production code. Apply the same quality standards: DRY, SOLID, refactoring.
|
||||||
|
|
||||||
|
**Rule:** If you can't understand a test in 30 seconds, refactor it. If a test is flaky, fix or delete it.
|
||||||
|
|
||||||
|
## Test Maintenance vs Writing Tests
|
||||||
|
|
||||||
|
| Activity | When | Goal |
|
||||||
|
|----------|------|------|
|
||||||
|
| **Writing tests** | New features, bug fixes | Add coverage |
|
||||||
|
| **Maintaining tests** | Test suite grows, flakiness increases | Reduce duplication, improve clarity, fix flakiness |
|
||||||
|
|
||||||
|
**Test debt indicators:**
|
||||||
|
- Tests take > 15 minutes to run
|
||||||
|
- Flakiness rate above 5%
|
||||||
|
- Duplicate setup code across 10+ tests
|
||||||
|
- Tests break on unrelated changes
|
||||||
|
- Nobody understands old tests
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Page Object Pattern (E2E Tests)
|
||||||
|
|
||||||
|
**Problem:** Duplicated selectors across tests
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// ❌ BAD: Selectors duplicated everywhere
|
||||||
|
test('login', async ({ page }) => {
|
||||||
|
await page.fill('#email', 'user@example.com');
|
||||||
|
await page.fill('#password', 'password');
|
||||||
|
await page.click('button[type="submit"]');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('forgot password', async ({ page }) => {
|
||||||
|
await page.fill('#email', 'user@example.com'); // Duplicated!
|
||||||
|
await page.click('a.forgot-password');
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Page Object Pattern
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// pages/LoginPage.js
|
||||||
|
export class LoginPage {
|
||||||
|
constructor(page) {
|
||||||
|
this.page = page;
|
||||||
|
this.emailInput = page.locator('#email');
|
||||||
|
this.passwordInput = page.locator('#password');
|
||||||
|
this.submitButton = page.locator('button[type="submit"]');
|
||||||
|
this.forgotPasswordLink = page.locator('a.forgot-password');
|
||||||
|
}
|
||||||
|
|
||||||
|
async goto() {
|
||||||
|
await this.page.goto('/login');
|
||||||
|
}
|
||||||
|
|
||||||
|
async login(email, password) {
|
||||||
|
await this.emailInput.fill(email);
|
||||||
|
await this.passwordInput.fill(password);
|
||||||
|
await this.submitButton.click();
|
||||||
|
}
|
||||||
|
|
||||||
|
async clickForgotPassword() {
|
||||||
|
await this.forgotPasswordLink.click();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// tests/login.spec.js
|
||||||
|
import { LoginPage } from '../pages/LoginPage';
|
||||||
|
|
||||||
|
test('login', async ({ page }) => {
|
||||||
|
const loginPage = new LoginPage(page);
|
||||||
|
await loginPage.goto();
|
||||||
|
await loginPage.login('user@example.com', 'password');
|
||||||
|
|
||||||
|
await expect(page).toHaveURL('/dashboard');
|
||||||
|
});
|
||||||
|
|
||||||
|
test('forgot password', async ({ page }) => {
|
||||||
|
const loginPage = new LoginPage(page);
|
||||||
|
await loginPage.goto();
|
||||||
|
await loginPage.clickForgotPassword();
|
||||||
|
|
||||||
|
await expect(page).toHaveURL('/reset-password');
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
- Selectors in one place
|
||||||
|
- Tests read like documentation
|
||||||
|
- Changes to UI require one-line fix
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Data Builders (Integration/Unit Tests)
|
||||||
|
|
||||||
|
**Problem:** Duplicate test data setup
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: Duplicated setup
|
||||||
|
def test_order_total():
|
||||||
|
order = Order(
|
||||||
|
id=1,
|
||||||
|
user_id=123,
|
||||||
|
items=[Item(sku="WIDGET", quantity=2, price=10.0)],
|
||||||
|
shipping=5.0,
|
||||||
|
tax=1.5
|
||||||
|
)
|
||||||
|
assert order.total() == 26.5
|
||||||
|
|
||||||
|
def test_order_discounts():
|
||||||
|
order = Order( # Same setup!
|
||||||
|
id=2,
|
||||||
|
user_id=123,
|
||||||
|
items=[Item(sku="WIDGET", quantity=2, price=10.0)],
|
||||||
|
shipping=5.0,
|
||||||
|
tax=1.5
|
||||||
|
)
|
||||||
|
order.apply_discount(10)
|
||||||
|
assert order.total() == 24.0
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Builder Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
# test_builders.py
|
||||||
|
class OrderBuilder:
|
||||||
|
def __init__(self):
|
||||||
|
self._id = 1
|
||||||
|
self._user_id = 123
|
||||||
|
self._items = []
|
||||||
|
self._shipping = 0.0
|
||||||
|
self._tax = 0.0
|
||||||
|
|
||||||
|
def with_id(self, id):
|
||||||
|
self._id = id
|
||||||
|
return self
|
||||||
|
|
||||||
|
def with_items(self, *items):
|
||||||
|
self._items = list(items)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def with_shipping(self, amount):
|
||||||
|
self._shipping = amount
|
||||||
|
return self
|
||||||
|
|
||||||
|
def with_tax(self, amount):
|
||||||
|
self._tax = amount
|
||||||
|
return self
|
||||||
|
|
||||||
|
def build(self):
|
||||||
|
return Order(
|
||||||
|
id=self._id,
|
||||||
|
user_id=self._user_id,
|
||||||
|
items=self._items,
|
||||||
|
shipping=self._shipping,
|
||||||
|
tax=self._tax
|
||||||
|
)
|
||||||
|
|
||||||
|
# tests/test_orders.py
|
||||||
|
def test_order_total():
|
||||||
|
order = (OrderBuilder()
|
||||||
|
.with_items(Item(sku="WIDGET", quantity=2, price=10.0))
|
||||||
|
.with_shipping(5.0)
|
||||||
|
.with_tax(1.5)
|
||||||
|
.build())
|
||||||
|
|
||||||
|
assert order.total() == 26.5
|
||||||
|
|
||||||
|
def test_order_discounts():
|
||||||
|
order = (OrderBuilder()
|
||||||
|
.with_items(Item(sku="WIDGET", quantity=2, price=10.0))
|
||||||
|
.with_shipping(5.0)
|
||||||
|
.with_tax(1.5)
|
||||||
|
.build())
|
||||||
|
|
||||||
|
order.apply_discount(10)
|
||||||
|
assert order.total() == 24.0
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
- Readable test data creation
|
||||||
|
- Easy to customize per test
|
||||||
|
- Defaults handle common cases
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Shared Fixtures (pytest)
|
||||||
|
|
||||||
|
**Problem:** Setup code duplicated across tests
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD
|
||||||
|
def test_user_creation():
|
||||||
|
db = setup_database()
|
||||||
|
user_repo = UserRepository(db)
|
||||||
|
user = user_repo.create(email="alice@example.com")
|
||||||
|
assert user.id is not None
|
||||||
|
cleanup_database(db)
|
||||||
|
|
||||||
|
def test_user_deletion():
|
||||||
|
db = setup_database() # Duplicated!
|
||||||
|
user_repo = UserRepository(db)
|
||||||
|
user = user_repo.create(email="bob@example.com")
|
||||||
|
user_repo.delete(user.id)
|
||||||
|
assert user_repo.get(user.id) is None
|
||||||
|
cleanup_database(db)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Fixtures
|
||||||
|
|
||||||
|
```python
|
||||||
|
# conftest.py
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def db():
|
||||||
|
"""Provide database connection with auto-cleanup."""
|
||||||
|
database = setup_database()
|
||||||
|
yield database
|
||||||
|
cleanup_database(database)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def user_repo(db):
|
||||||
|
"""Provide user repository."""
|
||||||
|
return UserRepository(db)
|
||||||
|
|
||||||
|
# tests/test_users.py
|
||||||
|
def test_user_creation(user_repo):
|
||||||
|
user = user_repo.create(email="alice@example.com")
|
||||||
|
assert user.id is not None
|
||||||
|
|
||||||
|
def test_user_deletion(user_repo):
|
||||||
|
user = user_repo.create(email="bob@example.com")
|
||||||
|
user_repo.delete(user.id)
|
||||||
|
assert user_repo.get(user.id) is None
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Reducing Test Duplication
|
||||||
|
|
||||||
|
### Custom Matchers/Assertions
|
||||||
|
|
||||||
|
**Problem:** Complex assertions repeated
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: Repeated validation logic
|
||||||
|
def test_valid_user():
|
||||||
|
user = create_user()
|
||||||
|
assert user.id is not None
|
||||||
|
assert '@' in user.email
|
||||||
|
assert len(user.name) > 0
|
||||||
|
assert user.created_at is not None
|
||||||
|
|
||||||
|
def test_another_valid_user():
|
||||||
|
user = create_admin()
|
||||||
|
assert user.id is not None # Same validations!
|
||||||
|
assert '@' in user.email
|
||||||
|
assert len(user.name) > 0
|
||||||
|
assert user.created_at is not None
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Custom assertion helpers
|
||||||
|
|
||||||
|
```python
|
||||||
|
# test_helpers.py
|
||||||
|
def assert_valid_user(user):
|
||||||
|
"""Assert user object is valid."""
|
||||||
|
assert user.id is not None, "User must have ID"
|
||||||
|
assert '@' in user.email, "Email must contain @"
|
||||||
|
assert len(user.name) > 0, "Name cannot be empty"
|
||||||
|
assert user.created_at is not None, "User must have creation timestamp"
|
||||||
|
|
||||||
|
# tests/test_users.py
|
||||||
|
def test_valid_user():
|
||||||
|
user = create_user()
|
||||||
|
assert_valid_user(user)
|
||||||
|
|
||||||
|
def test_another_valid_user():
|
||||||
|
user = create_admin()
|
||||||
|
assert_valid_user(user)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Handling Flaky Tests
|
||||||
|
|
||||||
|
### Strategy 1: Fix the Root Cause
|
||||||
|
|
||||||
|
**Flaky test symptoms:**
|
||||||
|
- Passes 95/100 runs
|
||||||
|
- Fails with different errors
|
||||||
|
- Fails only in CI
|
||||||
|
|
||||||
|
**Root causes:**
|
||||||
|
- Race conditions (see flaky-test-prevention skill)
|
||||||
|
- Shared state (see test-isolation-fundamentals skill)
|
||||||
|
- Timing assumptions
|
||||||
|
|
||||||
|
**Fix:** Use condition-based waiting, isolate state
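A minimal condition-based waiting helper (a sketch; the polling interval and timeout values are illustrative):

```python
import time


def wait_until(condition, timeout=5.0, interval=0.05):
    """Poll `condition` until it returns True or the timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if condition():
            return True
        time.sleep(interval)
    raise TimeoutError("condition not met within timeout")


# Usage (hypothetical repository):
# wait_until(lambda: order_repo.get(order_id).status == "paid")
```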
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Strategy 2: Quarantine Pattern
|
||||||
|
|
||||||
|
**For tests that can't be fixed immediately:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Mark as flaky, run separately
|
||||||
|
@pytest.mark.flaky
|
||||||
|
def test_sometimes_fails():
|
||||||
|
# Test code
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
# Run stable tests only
pytest -m "not flaky"

# Run flaky tests separately so they don't block CI
# (retries require the pytest-rerunfailures plugin)
pytest -m flaky --reruns 3
```
|
||||||
|
|
||||||
|
**Rule:** Quarantined tests must have tracking issue. Fix within 30 days or delete.
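Registering the marker keeps `pytest -m "not flaky"` from emitting unknown-marker warnings; one way to do it in `conftest.py` (sketch):

```python
# conftest.py
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "flaky: quarantined flaky test (see tracking issue)"
    )
```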
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Strategy 3: Delete If Unfixable
|
||||||
|
|
||||||
|
**When to delete:**
|
||||||
|
- Test is flaky AND nobody understands it
|
||||||
|
- Test has been disabled for > 90 days
|
||||||
|
- Test duplicates coverage from other tests
|
||||||
|
|
||||||
|
**Better to have:** 100 reliable tests than 150 tests with 10 flaky ones
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Refactoring Test Suites
|
||||||
|
|
||||||
|
### Identify Slow Tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# pytest: Show slowest 10 tests
|
||||||
|
pytest --durations=10
|
||||||
|
|
||||||
|
# Output:
|
||||||
|
# 10.23s call test_integration_checkout.py::test_full_checkout
|
||||||
|
# 8.45s call test_api.py::test_payment_flow
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Action:** Optimize or split into integration/E2E categories
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Parallelize Tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# pytest: Run tests in parallel
|
||||||
|
pytest -n 4 # Use 4 CPU cores
|
||||||
|
|
||||||
|
# Jest: Run tests in parallel (default)
|
||||||
|
jest --maxWorkers=4
|
||||||
|
```
|
||||||
|
|
||||||
|
**Requirements:**
|
||||||
|
- Tests must be isolated (no shared state)
|
||||||
|
- See test-isolation-fundamentals skill
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Split Test Suites
|
||||||
|
|
||||||
|
```ini
|
||||||
|
# pytest.ini
|
||||||
|
[pytest]
|
||||||
|
markers =
|
||||||
|
unit: Unit tests (fast, isolated)
|
||||||
|
integration: Integration tests (medium speed, real DB)
|
||||||
|
e2e: End-to-end tests (slow, full system)
|
||||||
|
```
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# CI: Run test categories separately
|
||||||
|
jobs:
|
||||||
|
unit:
|
||||||
|
run: pytest -m unit # Fast, every commit
|
||||||
|
|
||||||
|
integration:
|
||||||
|
run: pytest -m integration # Medium, every PR
|
||||||
|
|
||||||
|
e2e:
|
||||||
|
run: pytest -m e2e # Slow, before merge
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ God Test
|
||||||
|
|
||||||
|
**Symptom:** One test does everything
|
||||||
|
|
||||||
|
```python
|
||||||
|
def test_entire_checkout_flow():
|
||||||
|
# 300 lines testing: login, browse, add to cart, checkout, payment, email
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why bad:** Failure doesn't indicate what broke
|
||||||
|
|
||||||
|
**Fix:** Split into focused tests
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Testing Implementation Details
|
||||||
|
|
||||||
|
**Symptom:** Tests break when refactoring internal code
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ BAD: Testing internal method
|
||||||
|
def test_order_calculation():
|
||||||
|
order = Order()
|
||||||
|
order._calculate_subtotal() # Private method!
|
||||||
|
assert order.subtotal == 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Test public interface only
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ✅ GOOD
|
||||||
|
def test_order_total():
|
||||||
|
order = Order(items=[...])
|
||||||
|
assert order.total() == 108 # Public method
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Commented-Out Tests
|
||||||
|
|
||||||
|
**Symptom:** Tests disabled with comments
|
||||||
|
|
||||||
|
```python
|
||||||
|
# def test_something():
|
||||||
|
# # This test is broken, commented out for now
|
||||||
|
# pass
|
||||||
|
```
|
||||||
|
|
||||||
|
**Fix:** Delete or fix. Create GitHub issue if needs fixing later.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Test Maintenance Checklist
|
||||||
|
|
||||||
|
**Monthly:**
|
||||||
|
- [ ] Review flaky test rate (should be < 1%)
|
||||||
|
- [ ] Check build time trend (should not increase > 5%/month)
|
||||||
|
- [ ] Identify duplicate setup code (refactor into fixtures)
|
||||||
|
- [ ] Run mutation testing (validate test quality)
|
||||||
|
|
||||||
|
**Quarterly:**
|
||||||
|
- [ ] Review test coverage (identify gaps)
|
||||||
|
- [ ] Audit for commented-out tests (delete)
|
||||||
|
- [ ] Check for unused fixtures (delete)
|
||||||
|
- [ ] Refactor slowest 10 tests
|
||||||
|
|
||||||
|
**Annually:**
|
||||||
|
- [ ] Review entire test architecture
|
||||||
|
- [ ] Update testing strategy for new patterns
|
||||||
|
- [ ] Train team on new testing practices
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Treat test code as production code. Refactor duplication, fix flakiness, delete dead tests.**
|
||||||
|
|
||||||
|
**Key patterns:**
|
||||||
|
- Page Objects (E2E tests)
|
||||||
|
- Builder Pattern (test data)
|
||||||
|
- Shared Fixtures (setup/teardown)
|
||||||
|
- Custom Assertions (complex validations)
|
||||||
|
|
||||||
|
**Maintenance rules:**
|
||||||
|
- Fix flaky tests immediately or quarantine
|
||||||
|
- Refactor duplicated code
|
||||||
|
- Delete commented-out tests
|
||||||
|
- Split slow test suites
|
||||||
|
|
||||||
|
**If your tests are flaky, slow, or nobody understands them, invest in maintenance before adding more tests. Test debt compounds like technical debt.**
|
||||||
363
skills/testing-in-production/SKILL.md
Normal file
@@ -0,0 +1,363 @@
|
|||||||
|
---
|
||||||
|
name: testing-in-production
|
||||||
|
description: Use when implementing feature flags, canary deployments, shadow traffic, A/B testing, choosing blast radius limits, defining rollback criteria, or monitoring production experiments - provides technique selection, anti-patterns, and kill switch frameworks
|
||||||
|
---
|
||||||
|
|
||||||
|
# Testing in Production
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**Core principle:** Minimize blast radius, maximize observability, always have a kill switch.
|
||||||
|
|
||||||
|
**Rule:** Testing in production is safe when you control exposure and can roll back instantly.
|
||||||
|
|
||||||
|
**Regulated industries (healthcare, finance, government):** Production testing is still possible but requires additional controls - compliance review before experiments, audit trails for flag changes, avoiding PHI/PII in logs, Business Associate Agreements for third-party tools, and restricted techniques (shadow traffic may create prohibited data copies). Consult compliance team before first production test.
|
||||||
|
|
||||||
|
## Technique Selection Decision Tree
|
||||||
|
|
||||||
|
| Your Goal | Risk Tolerance | Infrastructure Needed | Use |
|
||||||
|
|-----------|----------------|----------------------|-----|
|
||||||
|
| Test feature with specific users | Low | Feature flag service | **Feature Flags** |
|
||||||
|
| Validate deployment safety | Medium | Load balancer, multiple instances | **Canary Deployment** |
|
||||||
|
| Compare old vs new performance | Low | Traffic duplication | **Shadow Traffic** |
|
||||||
|
| Measure business impact | Medium | A/B testing framework, analytics | **A/B Testing** |
|
||||||
|
| Test without any user impact | Lowest | Service mesh, traffic mirroring | **Dark Launch** |
|
||||||
|
|
||||||
|
**First technique:** Feature flags (lowest infrastructure requirement, highest control)
|
||||||
|
|
||||||
|
## Anti-Patterns Catalog
|
||||||
|
|
||||||
|
### ❌ Nested Feature Flags
|
||||||
|
**Symptom:** Flags controlling other flags, creating combinatorial complexity
|
||||||
|
|
||||||
|
**Why bad:** 2^N combinations to test, impossible to validate all paths, technical debt accumulates
|
||||||
|
|
||||||
|
**Fix:** Maximum 1 level of flag nesting, delete flags after rollout
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ Bad
|
||||||
|
if feature_flags.enabled("new_checkout"):
|
||||||
|
if feature_flags.enabled("express_shipping"):
|
||||||
|
if feature_flags.enabled("gift_wrap"):
|
||||||
|
# 8 possible combinations for 3 flags
|
||||||
|
|
||||||
|
# ✅ Good
|
||||||
|
if feature_flags.enabled("new_checkout_v2"): # Single flag for full feature
|
||||||
|
return new_checkout_with_all_options()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Canary with Sticky Sessions
|
||||||
|
**Symptom:** Users switch between old and new versions across requests due to session affinity
|
||||||
|
|
||||||
|
**Why bad:** Inconsistent experience, state corruption, false negative metrics
|
||||||
|
|
||||||
|
**Fix:** Route user to same version for entire session
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
# ✅ Good - Consistent routing
|
||||||
|
upstream backend {
|
||||||
|
hash $cookie_user_id consistent; # Sticky by user ID
|
||||||
|
server backend-v1:8080 weight=95;
|
||||||
|
server backend-v2:8080 weight=5;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ No Statistical Validation
|
||||||
|
**Symptom:** Making rollout decisions on small sample sizes without confidence intervals
|
||||||
|
|
||||||
|
**Why bad:** Random variance mistaken for real effects, premature rollback or expansion
|
||||||
|
|
||||||
|
**Fix:** Minimum sample size, statistical significance testing
|
||||||
|
|
||||||
|
```python
# ✅ Good - Statistical validation
# proportions_ztest lives in statsmodels, not scipy
from statsmodels.stats.proportion import proportions_ztest


def is_safe_to_rollout(control_errors, treatment_errors, min_sample=1000):
    """control_errors / treatment_errors: arrays of 0/1 error flags per request."""
    if len(treatment_errors) < min_sample:
        return False, "Insufficient data"

    # Two-proportion z-test on error rates
    _, p_value = proportions_ztest(
        [control_errors.sum(), treatment_errors.sum()],
        [len(control_errors), len(treatment_errors)],
    )

    # Safe only if no statistically significant difference in errors was detected
    return p_value > 0.05, f"p-value: {p_value:.4f}"
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Testing Without Rollback
|
||||||
|
**Symptom:** Deploying feature flags or canaries without instant kill switch
|
||||||
|
|
||||||
|
**Why bad:** When issues detected, can't stop impact immediately
|
||||||
|
|
||||||
|
**Fix:** Kill switch tested before first production test
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Insufficient Monitoring
|
||||||
|
**Symptom:** Monitoring only error rates, missing business/user metrics
|
||||||
|
|
||||||
|
**Why bad:** Technical success but business failure (e.g., lower conversion)
|
||||||
|
|
||||||
|
**Fix:** Monitor technical + business + user experience metrics
|
||||||
|
|
||||||
|
## Blast Radius Control Framework
|
||||||
|
|
||||||
|
**Progressive rollout schedule:**
|
||||||
|
|
||||||
|
| Phase | Exposure | Duration | Abort If | Continue If |
|
||||||
|
|-------|----------|----------|----------|-------------|
|
||||||
|
| **1. Internal** | 10-50 internal users | 1-2 days | Any errors | 0 errors, good UX feedback |
|
||||||
|
| **2. Canary** | 1% production traffic | 4-24 hours | Error rate > +2%, latency > +10% | Metrics stable |
|
||||||
|
| **3. Small** | 5% production | 1-2 days | Error rate > +5%, latency > +25% | Metrics stable or improved |
|
||||||
|
| **4. Medium** | 25% production | 2-3 days | Error rate > +5%, latency > +25% | Metrics stable or improved |
|
||||||
|
| **5. Majority** | 50% production | 3-7 days | Error rate > +5%, business metrics down | Metrics improved |
|
||||||
|
| **6. Full** | 100% production | Monitor indefinitely | Business metrics drop | Cleanup old code |
|
||||||
|
|
||||||
|
**Minimum dwell time:** Each phase needs minimum observation period to catch delayed issues
|
||||||
|
|
||||||
|
**Rollback at any phase:** If metrics degrade, revert to previous phase
|
||||||
|
|
||||||
|
## Kill Switch Criteria
|
||||||
|
|
||||||
|
**Immediate rollback triggers (automated):**
|
||||||
|
|
||||||
|
| Metric | Threshold | Why |
|
||||||
|
|--------|-----------|-----|
|
||||||
|
| Error rate increase | > 5% above baseline | User impact |
|
||||||
|
| p99 latency increase | > 50% above baseline | Performance degradation |
|
||||||
|
| Critical errors (5xx) | > 0.1% of requests | Service failure |
|
||||||
|
| Business metric drop | > 10% (conversion, revenue) | Revenue impact |
|
||||||
|
|
||||||
|
**Warning triggers (manual investigation):**
|
||||||
|
|
||||||
|
| Metric | Threshold | Action |
|
||||||
|
|--------|-----------|--------|
|
||||||
|
| Error rate increase | 2-5% above baseline | Halt rollout, investigate |
|
||||||
|
| p95 latency increase | 25-50% above baseline | Monitor closely |
|
||||||
|
| User complaints | >3 similar reports | Halt rollout, investigate |
|
||||||
|
|
||||||
|
**Statistical validation:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Sample size for 95% confidence, 80% power
|
||||||
|
# Minimum 1000 samples per variant for most A/B tests
|
||||||
|
# For low-traffic features: wait 24-48 hours regardless
|
||||||
|
```
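If you want the per-variant sample size rather than a rule of thumb, the standard two-proportion formula is easy to compute directly (a sketch; the baseline rate and minimum detectable effect below are illustrative):

```python
from math import ceil, sqrt


def samples_per_variant(p_baseline, min_detectable_effect,
                        z_alpha=1.96, z_beta=0.84):
    """Two-proportion sample size at ~95% confidence, ~80% power."""
    p1 = p_baseline
    p2 = p_baseline + min_detectable_effect
    p_bar = (p1 + p2) / 2
    numerator = (
        z_alpha * sqrt(2 * p_bar * (1 - p_bar))
        + z_beta * sqrt(p1 * (1 - p1) + p2 * (1 - p2))
    ) ** 2
    return ceil(numerator / (p2 - p1) ** 2)


# e.g. detecting a 1-point shift from a 5% baseline rate:
# samples_per_variant(0.05, 0.01)  -> roughly 8,000 per variant
```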
|
||||||
|
|
||||||
|
## Monitoring Quick Reference
|
||||||
|
|
||||||
|
**Required metrics (all tests):**
|
||||||
|
|
||||||
|
| Category | Metrics | Alert Threshold |
|
||||||
|
|----------|---------|-----------------|
|
||||||
|
| **Errors** | Error rate, exception count, 5xx responses | > +5% vs baseline |
|
||||||
|
| **Performance** | p50/p95/p99 latency, request duration | p99 > +50% vs baseline |
|
||||||
|
| **Business** | Conversion rate, transaction completion, revenue | > -10% vs baseline |
|
||||||
|
| **User Experience** | Client errors, page load, bounce rate | > +20% vs baseline |
|
||||||
|
|
||||||
|
**Baseline calculation:**
|
||||||
|
|
||||||
|
```python
import numpy as np

# Collect baseline from the previous 7-14 days of latency samples
baseline_p99 = np.percentile(historical_latencies, 99)
current_p99 = np.percentile(current_latencies, 99)

if current_p99 > baseline_p99 * 1.5:  # more than a 50% increase
    rollback()
```
|
||||||
|
|
||||||
|
## Implementation Patterns
|
||||||
|
|
||||||
|
### Feature Flags Pattern
|
||||||
|
|
||||||
|
```python
# Using LaunchDarkly, Split.io, or similar
# (LaunchDarkly's Python server SDK is imported as `ldclient`)
import ldclient
from ldclient import Context
from ldclient.config import Config

ldclient.set_config(Config("sdk-key"))
client = ldclient.get()


def handle_request(user_id):
    context = Context.builder(user_id).build()

    if client.variation("new-checkout", context, False):
        return new_checkout_flow(user_id)
    else:
        return old_checkout_flow(user_id)
```
|
||||||
|
|
||||||
|
**Best practices:**
|
||||||
|
- Default to `False` (old behavior) for safety
|
||||||
|
- Pass user context for targeting
|
||||||
|
- Log flag evaluations for debugging
|
||||||
|
- Delete flags within 30 days of full rollout
|
||||||
|
|
||||||
|
### Canary Deployment Pattern
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Kubernetes with Istio
|
||||||
|
apiVersion: networking.istio.io/v1alpha3
|
||||||
|
kind: VirtualService
|
||||||
|
metadata:
|
||||||
|
name: my-service
|
||||||
|
spec:
|
||||||
|
hosts:
|
||||||
|
- my-service
|
||||||
|
http:
|
||||||
|
- match:
|
||||||
|
- headers:
|
||||||
|
x-canary:
|
||||||
|
exact: "true"
|
||||||
|
route:
|
||||||
|
- destination:
|
||||||
|
host: my-service
|
||||||
|
subset: v2
|
||||||
|
- route:
|
||||||
|
- destination:
|
||||||
|
host: my-service
|
||||||
|
subset: v1
|
||||||
|
weight: 95
|
||||||
|
- destination:
|
||||||
|
host: my-service
|
||||||
|
subset: v2
|
||||||
|
weight: 5
|
||||||
|
```
|
||||||
|
|
||||||
|
### Shadow Traffic Pattern
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Duplicate requests to new service, ignore responses
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
async def handle_request(request):
|
||||||
|
# Primary: serve user from old service
|
||||||
|
response = await old_service(request)
|
||||||
|
|
||||||
|
# Shadow: send to new service, don't wait
|
||||||
|
asyncio.create_task(new_service(request.copy())) # Fire and forget
|
||||||
|
|
||||||
|
return response # User sees old service response
|
||||||
|
```
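An optional extension of the same idea (sketch): compare shadow responses off the request path and record mismatches, so the new service can be validated without ever affecting the user. The `.status` attribute and `request.copy()` follow the hypothetical interfaces above.

```python
import asyncio
import logging


async def shadow_and_compare(request, old_service, new_service):
    primary = await old_service(request)

    async def _compare():
        try:
            shadow = await new_service(request.copy())
            if shadow.status != primary.status:
                logging.warning(
                    "shadow mismatch: primary=%s shadow=%s",
                    primary.status, shadow.status,
                )
        except Exception:
            # Shadow failures must never propagate to the user
            logging.exception("shadow call failed")

    asyncio.create_task(_compare())
    return primary  # user always gets the old service's response
```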
|
||||||
|
|
||||||
|
## Tool Ecosystem Quick Reference
|
||||||
|
|
||||||
|
| Tool Category | Options | When to Use |
|
||||||
|
|---------------|---------|-------------|
|
||||||
|
| **Feature Flags** | LaunchDarkly, Split.io, Flagsmith, Unleash | User-level targeting, instant rollback |
|
||||||
|
| **Canary/Blue-Green** | Istio, Linkerd, AWS App Mesh, Flagger | Service mesh, traffic shifting |
|
||||||
|
| **A/B Testing** | Optimizely, VWO, Google Optimize | Business metric validation |
|
||||||
|
| **Observability** | DataDog, New Relic, Honeycomb, Grafana | Metrics, traces, logs correlation |
|
||||||
|
| **Statistical Analysis** | Statsig, Eppo, GrowthBook | Automated significance testing |
|
||||||
|
|
||||||
|
**Recommendation for starting:** Feature flags (Flagsmith for self-hosted, LaunchDarkly for SaaS) + existing observability
|
||||||
|
|
||||||
|
## Your First Production Test
|
||||||
|
|
||||||
|
**Goal:** Safely test a small feature with feature flags
|
||||||
|
|
||||||
|
**Week 1: Setup**
|
||||||
|
|
||||||
|
1. **Choose feature flag tool**
|
||||||
|
- Self-hosted: Flagsmith (free, open source)
|
||||||
|
- SaaS: LaunchDarkly (free tier: 1000 MAU)
|
||||||
|
|
||||||
|
2. **Instrument code**
|
||||||
|
```python
|
||||||
|
if feature_flags.enabled("my-first-test", user_id):
|
||||||
|
return new_feature(user_id)
|
||||||
|
else:
|
||||||
|
return old_feature(user_id)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Set up monitoring**
|
||||||
|
- Error rate dashboard
|
||||||
|
- Latency percentiles (p50, p95, p99)
|
||||||
|
- Business metric (conversion, completion rate)
|
||||||
|
|
||||||
|
4. **Define rollback criteria**
|
||||||
|
- Error rate > +5%
|
||||||
|
- p99 latency > +50%
|
||||||
|
- Business metric < -10%
|
||||||
|
|
||||||
|
**Week 2: Test Execution**
|
||||||
|
|
||||||
|
**Day 1-2:** Internal users (10 people)
|
||||||
|
- Enable flag for 10 employee user IDs
|
||||||
|
- Monitor for errors, gather feedback
|
||||||
|
|
||||||
|
**Day 3-5:** Canary (1% of users)
|
||||||
|
- Enable for 1% random sample
|
||||||
|
- Monitor metrics every hour
|
||||||
|
- Rollback if any threshold exceeded
|
||||||
|
|
||||||
|
**Day 6-8:** Small rollout (5%)
|
||||||
|
- If canary successful, increase to 5%
|
||||||
|
- Continue monitoring
|
||||||
|
|
||||||
|
**Day 9-14:** Full rollout (100%)
|
||||||
|
- Gradual increase: 25% → 50% → 100%
|
||||||
|
- Monitor for 7 days at 100%
|
||||||
|
|
||||||
|
**Week 3: Cleanup**
|
||||||
|
|
||||||
|
- Remove flag from code
|
||||||
|
- Archive flag in dashboard
|
||||||
|
- Document learnings
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
### ❌ Expanding Rollout Too Fast
|
||||||
|
**Fix:** Follow minimum dwell times (24 hours per phase)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Monitoring Only After Issues
|
||||||
|
**Fix:** Dashboard ready before first rollout, alerts configured
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ No Rollback Practice
|
||||||
|
**Fix:** Test rollback in staging before production
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ❌ Ignoring Business Metrics
|
||||||
|
**Fix:** Technical metrics AND business metrics required for go/no-go decisions
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
**Technique Selection:**
|
||||||
|
- User-specific: Feature flags
|
||||||
|
- Deployment safety: Canary
|
||||||
|
- Performance comparison: Shadow traffic
|
||||||
|
- Business validation: A/B testing
|
||||||
|
|
||||||
|
**Blast Radius Progression:**
|
||||||
|
Internal → 1% → 5% → 25% → 50% → 100%
|
||||||
|
|
||||||
|
**Kill Switch Thresholds:**
|
||||||
|
- Error rate: > +5%
|
||||||
|
- p99 latency: > +50%
|
||||||
|
- Business metrics: > -10%
|
||||||
|
|
||||||
|
**Minimum Sample Sizes:**
|
||||||
|
- A/B test: 1000 samples per variant
|
||||||
|
- Canary: 24 hours observation
|
||||||
|
|
||||||
|
**Tool Recommendations:**
|
||||||
|
- Feature flags: LaunchDarkly, Flagsmith
|
||||||
|
- Canary: Istio, Flagger
|
||||||
|
- Observability: DataDog, Grafana
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Production testing is safe with three controls: exposure limits, observability, instant rollback.**
|
||||||
|
|
||||||
|
Start with feature flags, use progressive rollout (1% → 5% → 25% → 100%), monitor technical + business metrics, and always have a kill switch.
|
||||||
153
skills/using-quality-engineering/SKILL.md
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
---
|
||||||
|
name: using-quality-engineering
|
||||||
|
description: Use when user asks about E2E testing, performance testing, chaos engineering, test automation, flaky tests, test data management, or quality practices - routes to specialist skills with deep expertise instead of providing general guidance
|
||||||
|
---
|
||||||
|
|
||||||
|
# Using Quality Engineering
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
**This is a router skill** - it directs you to the appropriate specialist quality engineering skill based on the user's question.
|
||||||
|
|
||||||
|
**Core principle:** Quality engineering questions deserve specialist expertise, not general guidance. Always route to the appropriate specialist skill.
|
||||||
|
|
||||||
|
## Routing Guide
|
||||||
|
|
||||||
|
When the user asks about quality engineering topics, route to the appropriate specialist skill:
|
||||||
|
|
||||||
|
| User's Question Topic | Route To Skill |
|
||||||
|
|----------------------|----------------|
|
||||||
|
| **Test Fundamentals & Isolation** | |
|
||||||
|
| Test independence, idempotence, order-independence, isolation | `test-isolation-fundamentals` |
|
||||||
|
| **API & Integration Testing** | |
|
||||||
|
| REST/GraphQL API testing, request validation, API mocking | `api-testing-strategies` |
|
||||||
|
| Component integration, database testing, test containers | `integration-testing-patterns` |
|
||||||
|
| **End-to-End & UI Testing** | |
|
||||||
|
| End-to-end test design, E2E anti-patterns, browser automation | `e2e-testing-strategies` |
|
||||||
|
| Screenshot comparison, visual bugs, responsive testing | `visual-regression-testing` |
|
||||||
|
| **Performance & Load Testing** | |
|
||||||
|
| Load testing, benchmarking, performance regression | `performance-testing-fundamentals` |
|
||||||
|
| Stress testing, spike testing, soak testing, capacity planning | `load-testing-patterns` |
|
||||||
|
| **Test Quality & Maintenance** | |
|
||||||
|
| Test coverage, quality dashboards, CI/CD quality gates | `quality-metrics-and-kpis` |
|
||||||
|
| Test refactoring, page objects, reducing test debt | `test-maintenance-patterns` |
|
||||||
|
| Mutation testing, test effectiveness, mutation score | `mutation-testing` |
|
||||||
|
| **Static Analysis & Security** | |
|
||||||
|
| SAST tools, ESLint, Pylint, code quality gates | `static-analysis-integration` |
|
||||||
|
| Dependency scanning, Snyk, Dependabot, vulnerability management | `dependency-scanning` |
|
||||||
|
| Fuzzing, random inputs, security vulnerabilities | `fuzz-testing` |
|
||||||
|
| **Advanced Testing Techniques** | |
|
||||||
|
| Property-based testing, Hypothesis, fast-check, invariants | `property-based-testing` |
|
||||||
|
| **Production Testing & Monitoring** | |
|
||||||
|
| Feature flags, canary testing, dark launches, prod monitoring | `testing-in-production` |
|
||||||
|
| Metrics, tracing, alerting, quality signals | `observability-and-monitoring` |
|
||||||
|
| Fault injection, resilience testing, failure scenarios | `chaos-engineering-principles` |
|
||||||
|
| **Test Infrastructure** | |
|
||||||
|
| Test pyramid, CI/CD integration, test organization | `test-automation-architecture` |
|
||||||
|
| Fixtures, factories, seeding, test isolation, data pollution | `test-data-management` |
|
||||||
|
| Flaky tests, race conditions, timing issues, non-determinism | `flaky-test-prevention` |
|
||||||
|
| API contracts, schema validation, consumer-driven contracts | `contract-testing` |
|
||||||
|
|
||||||
|
## When NOT to Route
|
||||||
|
|
||||||
|
Only answer directly (without routing) for:
|
||||||
|
- Meta questions about this plugin ("What skills are available?")
|
||||||
|
- Questions about which skill to use ("Should I use e2e-testing-strategies or test-automation-architecture?")
|
||||||
|
|
||||||
|
**User demands "just answer, don't route" is NOT an exception** - still route. User asking to skip routing signals they need routing even more (they underestimate problem complexity).
|
||||||
|
|
||||||
|
## Red Flags - Route Instead
|
||||||
|
|
||||||
|
If you catch yourself thinking:
|
||||||
|
- "I have general knowledge about this topic" → **Specialist skill has deeper expertise**
|
||||||
|
- "Developer needs help RIGHT NOW" → **Routing is faster than partial help**
|
||||||
|
- "I can provide useful guidance" → **Partial help < complete specialist guidance**
|
||||||
|
- "This is a standard problem" → **Standard problems need specialist patterns**
|
||||||
|
- "They're experienced" → **Experienced users benefit most from specialists**
|
||||||
|
|
||||||
|
**All of these mean: Route to the specialist skill.**
|
||||||
|
|
||||||
|
## Why Routing is Better
|
||||||
|
|
||||||
|
1. **Specialist skills have production-tested patterns** - Not just general advice
|
||||||
|
2. **Routing is faster** - Specialist skill loads once, answers completely
|
||||||
|
3. **Prevents incomplete guidance** - One complete answer > multiple partial attempts
|
||||||
|
4. **Scales better** - User gets expertise, you avoid back-and-forth
|
||||||
|
|
||||||
|
## Multi-Domain Questions
|
||||||
|
|
||||||
|
When user's question spans multiple specialist domains:
|
||||||
|
|
||||||
|
1. **Identify all relevant specialists** (2-3 max)
|
||||||
|
2. **Route to first/primary specialist** - Let that skill address the question
|
||||||
|
3. **Keep routing response brief** - Don't explain cross-domain dependencies yourself
|
||||||
|
|
||||||
|
Example: "My E2E tests are flaky AND we have test data pollution issues - which should I fix first?"
|
||||||
|
|
||||||
|
✅ Good: "This spans test-data-management and flaky-test-prevention. Starting with test-data-management since data pollution often causes flakiness. Routing you there now."
|
||||||
|
|
||||||
|
❌ Bad: *Writes 200 words explaining dependency relationships, root cause analysis, and strategic prioritization*
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
### ❌ Answering Directly Under Time Pressure
|
||||||
|
|
||||||
|
User: "Quick - how do I stop my E2E tests from being flaky?"
|
||||||
|
|
||||||
|
Bad response: *Provides 6 generic strategies about waits and race conditions*
|
||||||
|
|
||||||
|
**Why bad:** General advice without specialist patterns from `flaky-test-prevention`
|
||||||
|
|
||||||
|
### ❌ Providing Tool Recommendations
|
||||||
|
|
||||||
|
User: "I'm experienced with testing. Just need chaos engineering tool recommendations."
|
||||||
|
|
||||||
|
Bad response: *Recommends Gremlin, Chaos Toolkit, compares features*
|
||||||
|
|
||||||
|
**Why bad:** Tool selection needs context from `chaos-engineering-principles`
|
||||||
|
|
||||||
|
### ❌ Giving Technical Patterns Directly
|
||||||
|
|
||||||
|
User: "What's the best way to manage test data across integration tests?"
|
||||||
|
|
||||||
|
Bad response: *Explains Testcontainers, unique keys, transactions*
|
||||||
|
|
||||||
|
**Why bad:** Missing architecture-specific patterns from `test-data-management`
|
||||||
|
|
||||||
|
### ❌ Complying With "Don't Route Me"
|
||||||
|
|
||||||
|
User: "Don't route me to another skill - just give me a quick yes/no: Should I use Testcontainers?"
|
||||||
|
|
||||||
|
Bad response: "Yes, Testcontainers is the gold standard for test isolation"
|
||||||
|
|
||||||
|
**Why bad:** User demanding direct answer signals they underestimate complexity. Route anyway.
|
||||||
|
|
||||||
|
## Correct Usage
|
||||||
|
|
||||||
|
User: "Quick - how do I stop my E2E tests from being flaky?"
|
||||||
|
|
||||||
|
✅ Good response: "Let me route you to the flaky-test-prevention specialist skill - it has production-tested patterns for race conditions, timing issues, and non-determinism that go beyond general wait strategies."
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
User: "I'm experienced with testing. Just need chaos engineering tool recommendations."
|
||||||
|
|
||||||
|
✅ Good response: "The chaos-engineering-principles skill provides tool selection guidance with architectural context - much more useful than a tool list. Let me route you there."
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
User: "What's the best way to manage test data across integration tests?"
|
||||||
|
|
||||||
|
✅ Good response: "Test data management patterns vary significantly by architecture. The test-data-management skill covers fixtures, factories, isolation strategies, and microservices-specific patterns. Routing you there now."
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
User: "Don't route me - just yes/no: Should I use Testcontainers?"
|
||||||
|
|
||||||
|
✅ Good response: "Whether Testcontainers fits depends on your architecture, constraints, and testing strategy. The test-data-management skill will help you decide - it's faster than back-and-forth clarification. Routing you there."
|
||||||
|
|
||||||
|
## Bottom Line
|
||||||
|
|
||||||
|
**Your general knowledge < Specialist skill expertise**
|
||||||
|
|
||||||
|
When in doubt: Route. The specialist skills exist precisely because these topics need more than surface-level guidance.
|
||||||
509
skills/visual-regression-testing/SKILL.md
Normal file
@@ -0,0 +1,509 @@

---
name: visual-regression-testing
description: Use when testing UI changes, preventing visual bugs, setting up screenshot comparison, handling flaky visual tests, testing responsive layouts, or choosing visual testing tools (Percy, Chromatic, BackstopJS) - provides anti-flakiness strategies and component visual testing patterns
---

# Visual Regression Testing

## Overview

**Core principle:** Visual regression tests catch UI changes that automated functional tests miss (layout shifts, styling bugs, rendering issues).

**Rule:** Visual tests complement functional tests, don't replace them. Test critical pages only.

## Visual vs Functional Testing

| Aspect | Functional Testing | Visual Regression Testing |
|--------|-------------------|---------------------------|
| **What** | Behavior (clicks work, data saves) | Appearance (layout, styling) |
| **How** | Assert on DOM/data | Compare screenshots |
| **Catches** | Logic bugs, broken interactions | CSS bugs, layout shifts, visual breaks |
| **Speed** | Fast (100-500ms/test) | Slower (1-5s/test) |
| **Flakiness** | Low | High (rendering differences) |

**Use both:** Functional tests verify logic, visual tests verify appearance

---

## Tool Selection Decision Tree

| Your Need | Team Setup | Use | Why |
|-----------|------------|-----|-----|
| **Component testing** | React/Vue/Angular | **Chromatic** | Storybook integration, CI-friendly |
| **Full page testing** | Any framework | **Percy** | Easy setup, cross-browser |
| **Self-hosted** | Budget constraints | **BackstopJS** | Open source, no cloud costs |
| **Playwright-native** | Already using Playwright | **Playwright Screenshots** | Built-in, no extra tool |
| **Budget-free** | Small projects | **Playwright + pixelmatch** | DIY, full control |

**First choice for teams:** Chromatic (components) or Percy (pages)

**First choice for individuals:** Playwright + pixelmatch (free, simple)
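
The DIY option never appears elsewhere in this skill, so here is a minimal sketch of what "Playwright + pixelmatch" can look like. It assumes the `pixelmatch` and `pngjs` packages are installed and that a baseline image is already committed at `baselines/homepage.png` - the paths, tolerance, and file name are illustrative:

```javascript
// visual-diy.spec.js - hypothetical file name, adjust paths to your project
import fs from 'node:fs';
import { test, expect } from '@playwright/test';
import { PNG } from 'pngjs';
import pixelmatch from 'pixelmatch';

test('homepage matches committed baseline', async ({ page }) => {
  await page.goto('https://example.com');
  await page.waitForLoadState('networkidle');

  // Capture the current screenshot into memory
  const actual = PNG.sync.read(await page.screenshot({ fullPage: true }));

  // Load the committed baseline (create it on the first run)
  // Note: pixelmatch requires identically sized images, so keep viewport settings stable
  const baseline = PNG.sync.read(fs.readFileSync('baselines/homepage.png'));
  const { width, height } = baseline;
  const diff = new PNG({ width, height });

  // pixelmatch returns the number of differing pixels
  const diffPixels = pixelmatch(
    baseline.data, actual.data, diff.data, width, height,
    { threshold: 0.1 } // per-pixel color sensitivity
  );

  // Write the diff image for debugging, then assert a tolerance
  fs.writeFileSync('homepage-diff.png', PNG.sync.write(diff));
  expect(diffPixels).toBeLessThan(100);
});
```

In practice, Playwright's built-in `toHaveScreenshot` (next section) does all of this for you; the DIY version mainly buys full control over diff artifacts and thresholds.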

---

## Basic Visual Test Pattern (Playwright)

```javascript
import { test, expect } from '@playwright/test';

test('homepage visual regression', async ({ page }) => {
  await page.goto('https://example.com');

  // Wait for page to be fully loaded
  await page.waitForLoadState('networkidle');

  // Take screenshot
  await expect(page).toHaveScreenshot('homepage.png', {
    fullPage: true, // Capture entire page, not just viewport
    animations: 'disabled', // Disable animations for stability
  });
});
```

**First run:** Creates baseline screenshot
**Subsequent runs:** Compares against baseline, fails if different

---

## Anti-Flakiness Strategies

**Visual tests are inherently flaky. Reduce flakiness with these techniques:**

### 1. Disable Animations

```javascript
test('button hover state', async ({ page }) => {
  await page.goto('/buttons');

  // Disable ALL animations/transitions
  await page.addStyleTag({
    content: `
      *, *::before, *::after {
        animation-duration: 0s !important;
        transition-duration: 0s !important;
      }
    `
  });

  await expect(page).toHaveScreenshot();
});
```

---

### 2. Mask Dynamic Content

**Problem:** Timestamps, dates, random data cause false positives

```javascript
test('dashboard', async ({ page }) => {
  await page.goto('/dashboard');

  await expect(page).toHaveScreenshot({
    mask: [
      page.locator('.timestamp'),    // Hide timestamps
      page.locator('.user-avatar'),  // Hide dynamic avatars
      page.locator('.live-counter'), // Hide live updating counters
    ],
  });
});
```

---

### 3. Wait for Fonts to Load

**Problem:** Tests run before web fonts load, causing inconsistent rendering

```javascript
test('typography page', async ({ page }) => {
  await page.goto('/typography');

  // Wait for fonts to load
  await page.evaluate(() => document.fonts.ready);

  await expect(page).toHaveScreenshot();
});
```

---

### 4. Freeze Time

**Problem:** "Posted 5 minutes ago" changes every run

```javascript
import { test, expect } from '@playwright/test';

test('posts with timestamps', async ({ page }) => {
  // Mock system time before any page script runs
  await page.addInitScript(() => {
    const fixedDate = new Date('2025-01-13T12:00:00Z');
    // Override the global Date: constructor arguments are ignored, so
    // every `new Date()` and `Date.now()` returns the fixed instant
    Date = class extends Date {
      constructor() {
        super();
        return fixedDate;
      }
      static now() {
        return fixedDate.getTime();
      }
    };
  });

  await page.goto('/posts');
  await expect(page).toHaveScreenshot();
});
```

---

### 5. Use Test Data Fixtures

**Problem:** Real data changes (new users, products, orders)

```javascript
test('product catalog', async ({ page }) => {
  // Seed database with fixed test data
  // (seedDatabase is a project-specific helper - replace with your own seeding utility)
  await seedDatabase([
    { id: 1, name: 'Widget', price: 9.99 },
    { id: 2, name: 'Gadget', price: 19.99 },
  ]);

  await page.goto('/products');
  await expect(page).toHaveScreenshot();
});
```

---

## Component Visual Testing (Storybook + Chromatic)

### Storybook Story

```javascript
// Button.stories.jsx
import { Button } from './Button';

export default {
  title: 'Components/Button',
  component: Button,
};

export const Primary = {
  args: {
    variant: 'primary',
    children: 'Click me',
  },
};

export const Disabled = {
  args: {
    variant: 'primary',
    disabled: true,
    children: 'Disabled',
  },
};

export const LongText = {
  args: {
    children: 'This is a very long button text that might wrap',
  },
};
```
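
The stories above only pin static `args`. Interaction-driven states (a focused control, an open tooltip) can be snapshotted too by driving them in a play function before the snapshot is taken - a hedged sketch, assuming Storybook 8 with the `@storybook/test` package available and a `Button` that renders a native button element:

```javascript
// Button.stories.jsx (continued) - illustrative focus-state story
import { within } from '@storybook/test';

export const PrimaryFocused = {
  args: {
    variant: 'primary',
    children: 'Focus me',
  },
  // Chromatic waits for the play function to finish, so the snapshot
  // captures the button with its :focus styling applied
  play: async ({ canvasElement }) => {
    within(canvasElement).getByRole('button').focus();
  },
};
```

Pure CSS `:hover` is the one state synthetic events can't reach; that's typically covered with a pseudo-states addon (e.g. `storybook-addon-pseudo-states`) rather than a play function.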

---

### Chromatic Configuration

```javascript
// .storybook/main.js
module.exports = {
  stories: ['../src/**/*.stories.@(js|jsx|ts|tsx)'],
  addons: ['@storybook/addon-essentials', '@chromatic-com/storybook'],
};
```

```yaml
# .github/workflows/chromatic.yml
name: Chromatic

on: [push]

jobs:
  chromatic:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0 # Required for Chromatic

      - name: Install dependencies
        run: npm ci

      - name: Run Chromatic
        uses: chromaui/action@v1
        with:
          projectToken: ${{ secrets.CHROMATIC_PROJECT_TOKEN }}
```

**Benefits:**
- Isolates component testing
- Tests all states (hover, focus, disabled)
- No full app deployment needed

---

## Responsive Design Testing

**Test multiple viewports:**

```javascript
const viewports = [
  { name: 'mobile', width: 375, height: 667 },
  { name: 'tablet', width: 768, height: 1024 },
  { name: 'desktop', width: 1920, height: 1080 },
];

viewports.forEach(({ name, width, height }) => {
  test(`homepage ${name}`, async ({ page }) => {
    await page.setViewportSize({ width, height });
    await page.goto('https://example.com');

    await expect(page).toHaveScreenshot(`homepage-${name}.png`);
  });
});
```
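
An alternative to looping inside the test file is to declare one Playwright project per viewport, so every visual test in the suite runs at each size. A sketch of what that could look like (project names and dimensions are illustrative):

```javascript
// playwright.config.js - illustrative viewport-per-project setup
const { defineConfig } = require('@playwright/test');

module.exports = defineConfig({
  projects: [
    { name: 'mobile',  use: { viewport: { width: 375,  height: 667 } } },
    { name: 'tablet',  use: { viewport: { width: 768,  height: 1024 } } },
    { name: 'desktop', use: { viewport: { width: 1920, height: 1080 } } },
  ],
});
```

Run a single size with `npx playwright test --project=mobile`; by default Playwright includes the project name in each snapshot's file name, so the sizes keep separate baselines.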

---

## Threshold Configuration

**Allow small pixel differences (reduces false positives):**

```javascript
await expect(page).toHaveScreenshot({
  maxDiffPixels: 100, // Allow up to 100 pixels to differ
  // OR
  maxDiffPixelRatio: 0.01, // Allow 1% of pixels to differ
});
```
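
These tolerances can also be set once as suite-wide defaults instead of being repeated in every test - a sketch of the corresponding `expect.toHaveScreenshot` block in the Playwright config (values are illustrative):

```javascript
// playwright.config.js - suite-wide screenshot tolerances
const { defineConfig } = require('@playwright/test');

module.exports = defineConfig({
  expect: {
    toHaveScreenshot: {
      maxDiffPixelRatio: 0.01, // allow 1% of pixels to differ by default
      animations: 'disabled',  // disable animations for every screenshot
    },
  },
});
```

Per-test options, like the call above, still override these defaults.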

**Thresholds:**
- **Exact match (0%):** Critical branding pages (homepage, landing)
- **1-2% tolerance:** Most pages (handles minor font rendering differences)
- **5% tolerance:** Pages with dynamic content (dashboards with charts)

---

## Updating Baselines

**When to update:**
- Intentional UI changes
- Design system updates
- Framework upgrades

**How to update:**

```bash
# Playwright: Update all baselines
npx playwright test --update-snapshots

# Percy: Accept changes in web UI
# Visit percy.io, review changes, click "Approve"

# Chromatic: Accept changes in web UI
# Visit chromatic.com, review changes, click "Accept"
```

**Process:**
1. Run visual tests
2. Review diffs manually (for Playwright, see the report command below)
3. Approve if changes are intentional
4. Investigate if changes are unexpected
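
For Playwright, the manual review in step 2 is easiest in the HTML report, which attaches the expected, actual, and diff images for every failed screenshot assertion:

```bash
# After a failed visual run, open the HTML report to review image diffs
npx playwright show-report
```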

---

## Anti-Patterns Catalog

### ❌ Testing Every Page

**Symptom:** Hundreds of visual tests for every page variant

**Why bad:**
- Slow CI (visual tests are expensive)
- High maintenance (baselines update frequently)
- False positives from minor rendering differences

**Fix:** Test critical pages only

**Criteria for visual testing:**
- Customer-facing pages (homepage, pricing, checkout)
- Reusable components (buttons, forms, cards)
- Pages with complex layouts (dashboards, admin panels)

**Don't test:**
- Internal admin pages with frequent changes
- Error pages
- Pages with highly dynamic content

---

### ❌ No Flakiness Prevention

**Symptom:** Visual tests fail randomly

```javascript
// ❌ BAD: No stability measures
test('homepage', async ({ page }) => {
  await page.goto('/');
  await expect(page).toHaveScreenshot();
  // Fails due to: animations, fonts not loaded, timestamps, etc.
});
```

**Fix:** Apply all anti-flakiness strategies

```javascript
// ✅ GOOD: Stable visual test
test('homepage', async ({ page }) => {
  await page.goto('/');

  // Disable animations
  await page.addStyleTag({ content: '* { animation: none !important; }' });

  // Wait for fonts
  await page.evaluate(() => document.fonts.ready);

  // Wait for images
  await page.waitForLoadState('networkidle');

  await expect(page).toHaveScreenshot({
    animations: 'disabled',
    mask: [page.locator('.timestamp')],
  });
});
```

---

### ❌ Ignoring Baseline Drift

**Symptom:** Baselines diverge between local and CI

**Why it happens:**
- Different OS (macOS vs Linux)
- Different browser versions
- Different screen resolutions

**Fix:** Always generate baselines in CI

```yaml
# .github/workflows/update-baselines.yml
name: Update Visual Baselines

on:
  workflow_dispatch: # Manual trigger

jobs:
  update:
    runs-on: ubuntu-latest # Same as test CI
    steps:
      - uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          npm ci
          npx playwright install --with-deps

      - name: Update snapshots
        run: npx playwright test --update-snapshots

      - name: Commit baselines
        run: |
          git config user.name "GitHub Actions"
          git config user.email "actions@github.com"
          git add tests/**/*.png
          git commit -m "Update visual baselines"
          git push
```
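
If baselines do have to be regenerated locally, running the update inside the official Playwright Docker image keeps the OS and browser rendering aligned with the Linux CI runner - a sketch, assuming Docker is installed; pin the image tag to the Playwright version in your package.json:

```bash
# Run the snapshot update in the same Linux environment CI uses
docker run --rm -v "$PWD":/work -w /work \
  mcr.microsoft.com/playwright:v1.48.0-jammy \
  npx playwright test --update-snapshots
```

Treat this as a stopgap - the CI workflow above remains the source of truth for baselines.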

---

### ❌ Using Visual Tests for Functional Assertions

**Symptom:** Only visual tests, no functional tests

```javascript
// ❌ BAD: Only checking visually
test('login form', async ({ page }) => {
  await page.goto('/login');
  await expect(page).toHaveScreenshot();
  // Doesn't verify login actually works!
});
```

**Fix:** Use both

```javascript
// ✅ GOOD: Functional + visual
test('login form functionality', async ({ page }) => {
  await page.goto('/login');
  await page.fill('#email', 'user@example.com');
  await page.fill('#password', 'password123');
  await page.click('button[type="submit"]');

  // Functional assertion
  await expect(page).toHaveURL('/dashboard');
});

test('login form appearance', async ({ page }) => {
  await page.goto('/login');

  // Visual assertion
  await expect(page).toHaveScreenshot();
});
```

---

## CI/CD Integration

### GitHub Actions (Playwright)

```yaml
# .github/workflows/visual-tests.yml
name: Visual Tests

on: [pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Install Playwright
        run: |
          npm ci
          npx playwright install --with-deps

      - name: Run visual tests
        run: npx playwright test tests/visual/

      - name: Upload failures
        if: failure()
        uses: actions/upload-artifact@v3
        with:
          name: visual-test-failures
          path: test-results/
```

---

## Bottom Line

**Visual regression tests catch UI bugs that functional tests miss. Test critical pages only, apply anti-flakiness strategies religiously.**

**Best practices:**
- Use Chromatic (components) or Percy (pages) for teams
- Use Playwright + pixelmatch for solo developers
- Disable animations, mask dynamic content, wait for fonts
- Test responsive layouts (mobile, tablet, desktop)
- Allow small thresholds (1-2%) to reduce false positives
- Update baselines in CI, not locally

**If your visual tests are flaky, you're doing it wrong. Apply flakiness prevention first, then add tests.**