commit 966ef521f7c40ae6d06bcdc79cc4a1828cae3dd3 Author: Zhongwei Li Date: Sun Nov 30 08:59:43 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..bb43972 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "ordis-quality-engineering", + "description": "Comprehensive quality engineering for full-stack engineers: E2E, API, integration, performance, chaos, contracts, automation, observability, flaky tests, testing-in-production, load testing, test data, visual regression, mutation testing, static analysis (SAST), dependency scanning, fuzz testing, property-based testing, test maintenance, quality metrics - 21 production-ready skills covering traditional QA + modern quality engineering", + "version": "1.1.0", + "author": { + "name": "tachyon-beep", + "url": "https://github.com/tachyon-beep" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..13395ac --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# ordis-quality-engineering + +Comprehensive quality engineering for full-stack engineers: E2E, API, integration, performance, chaos, contracts, automation, observability, flaky tests, testing-in-production, load testing, test data, visual regression, mutation testing, static analysis (SAST), dependency scanning, fuzz testing, property-based testing, test maintenance, quality metrics - 21 production-ready skills covering traditional QA + modern quality engineering diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..c75cd95 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,129 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:tachyon-beep/skillpacks:plugins/ordis-quality-engineering", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "122ce5c2755e3944460caaf8b4583428c9c87503", + "treeHash": "d715062cb7abff988df8982950f9d774913ff29d953353682496f545660101bc", + "generatedAt": "2025-11-28T10:28:33.007693Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "ordis-quality-engineering", + "description": "Comprehensive quality engineering for full-stack engineers: E2E, API, integration, performance, chaos, contracts, automation, observability, flaky tests, testing-in-production, load testing, test data, visual regression, mutation testing, static analysis (SAST), dependency scanning, fuzz testing, property-based testing, test maintenance, quality metrics - 21 production-ready skills covering traditional QA + modern quality engineering", + "version": "1.1.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "5837cbad1040a2d7a50b5a6a007288a1e14644c2d37bb99a767e82c143f00223" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "c66ed5503e8a5994583f1ed0ef030a40404d87e606c6e82b9a6b1c15fe33a4f7" + }, + { + "path": "skills/integration-testing-patterns/SKILL.md", + "sha256": "2c9ccf9309d85e477be74545dc348d110614cbbdc9f3cc43ad0de22068278a46" + }, + { + "path": "skills/test-data-management/SKILL.md", + "sha256": "94b4318b4ba28e79689721d2a1dba4513f164c1f37ca55fc6a2c030280b9290d" + }, + { + "path": "skills/load-testing-patterns/SKILL.md", + "sha256": 
"2d4674b0a8a1df9176e48f239a99209593c399aa801aeceb2b292c2cc4758da3" + }, + { + "path": "skills/contract-testing/SKILL.md", + "sha256": "be6f479529a72dc90a6a017851eb9833f21eaef0a5a45d6949f03feb57a961dd" + }, + { + "path": "skills/testing-in-production/SKILL.md", + "sha256": "32377a605456bbbd65a593a4747ed0c08a15c5e6c9fca91a5807df30a40bd243" + }, + { + "path": "skills/quality-metrics-and-kpis/SKILL.md", + "sha256": "1ff87eafb6b9c55de68f5d5906fcde9a504f265108ec5ba7fa758e1760c674bc" + }, + { + "path": "skills/property-based-testing/SKILL.md", + "sha256": "7ba389b788e7542303ef8bc1ca78facd727c9950c8651f9480fc0995e87e5c20" + }, + { + "path": "skills/performance-testing-fundamentals/SKILL.md", + "sha256": "0d4a2ceec3849d4b62297c38faa6fe82269dd70979f6a9b15e8a9f60671b8ae4" + }, + { + "path": "skills/using-quality-engineering/SKILL.md", + "sha256": "4c161880229fc6bab75995347cd9e0c56c58112aa46b078d15dc19fa7ae58723" + }, + { + "path": "skills/flaky-test-prevention/SKILL.md", + "sha256": "f27b4f48a1286274d9bea42420f71c1ea75424f88c01923aea8fc1ccbe171296" + }, + { + "path": "skills/chaos-engineering-principles/SKILL.md", + "sha256": "8fb6a7b65076df493edeac34c94c2ea96bd81beb4c13fcc476d402011708847b" + }, + { + "path": "skills/test-automation-architecture/SKILL.md", + "sha256": "8cab74d4ac90519870f1fa59db0ecd916590cc3b211dbf44c4df617eebfbf1c6" + }, + { + "path": "skills/observability-and-monitoring/SKILL.md", + "sha256": "5e846e906095edfc7bf1c9f22d82b45b78d0316aee722d8330e08befe598674a" + }, + { + "path": "skills/test-isolation-fundamentals/SKILL.md", + "sha256": "07088c9703b8b4c7c5561858dd23de114a0aa6a901122340fdee99605e397fc1" + }, + { + "path": "skills/static-analysis-integration/SKILL.md", + "sha256": "d4dd718ccdc395c3320aabe4c1fc6f9da776987a52fad0968383038f2d402f38" + }, + { + "path": "skills/visual-regression-testing/SKILL.md", + "sha256": "7b98694df2d8b60ddb0f0e2cca40de67a694a2af10050b86edae22d14c0c46ae" + }, + { + "path": "skills/fuzz-testing/SKILL.md", + "sha256": "9a033fadb257f8a34fa5be6527b63946ff5c5028818e476c31a8d1612bcf967c" + }, + { + "path": "skills/api-testing-strategies/SKILL.md", + "sha256": "b900c427e32057915e291d2f09d1384875850e5248d0953f19760b709cc4b050" + }, + { + "path": "skills/mutation-testing/SKILL.md", + "sha256": "31ada6e2ec2a4048b65cb12f81a02bd26be101ba2f9fcaab8841bb08ce948ad3" + }, + { + "path": "skills/test-maintenance-patterns/SKILL.md", + "sha256": "036f0afa9cc22c27c8f22ed645b40c986647e368a55ef88fe7fa49082dfec124" + }, + { + "path": "skills/e2e-testing-strategies/SKILL.md", + "sha256": "d9e76b1f4b0130f2dec98d1a62f25ab6ebd4bb1f7465c05316ca41a098692bd3" + }, + { + "path": "skills/dependency-scanning/SKILL.md", + "sha256": "b722da2839c78c65107be5539ba602a4c8569ce14e01c700f6cf2ebf3ac89982" + } + ], + "dirSha256": "d715062cb7abff988df8982950f9d774913ff29d953353682496f545660101bc" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/api-testing-strategies/SKILL.md b/skills/api-testing-strategies/SKILL.md new file mode 100644 index 0000000..ed7c067 --- /dev/null +++ b/skills/api-testing-strategies/SKILL.md @@ -0,0 +1,471 @@ +--- +name: api-testing-strategies +description: Use when testing REST/GraphQL APIs, designing API test suites, validating request/response contracts, testing authentication/authorization, handling API versioning, or choosing API testing tools - provides test pyramid placement, schema validation, and anti-patterns distinct from E2E browser testing +--- + +# API Testing Strategies 
+ +## Overview + +**Core principle:** API tests sit between unit tests and E2E tests - faster than browser tests, more realistic than mocks. + +**Rule:** Test APIs directly via HTTP/GraphQL, not through the UI. Browser tests are 10x slower and more flaky. + +## API Testing vs E2E Testing + +| Aspect | API Testing | E2E Browser Testing | +|--------|-------------|---------------------| +| **Speed** | Fast (10-100ms per test) | Slow (1-10s per test) | +| **Flakiness** | Low (no browser/JS) | High (timing, rendering) | +| **Coverage** | Business logic, data | Full user workflow | +| **Tools** | REST Client, Postman, pytest | Playwright, Cypress | +| **When to use** | Most backend testing | Critical user flows only | + +**Test Pyramid placement:** +- **Unit tests (70%):** Individual functions/classes +- **API tests (20%):** Endpoints, business logic, integrations +- **E2E tests (10%):** Critical user workflows through browser + +--- + +## Tool Selection Decision Tree + +| Your Stack | Team Skills | Use | Why | +|-----------|-------------|-----|-----| +| **Python backend** | pytest familiar | **pytest + requests** | Best integration, fixtures | +| **Node.js/JavaScript** | Jest/Mocha | **supertest** | Express/Fastify native | +| **Any language, REST** | Prefer GUI | **Postman + Newman** | GUI for design, CLI for CI | +| **GraphQL** | Any | **pytest + gql** (Python) or **apollo-client** (JS) | Query validation | +| **Contract testing** | Microservices | **Pact** | Consumer-driven contracts | + +**First choice:** Use your existing test framework (pytest/Jest) + HTTP client. Don't add new tools unnecessarily. + +--- + +## Test Structure Pattern + +### Basic REST API Test + +```python +import pytest +import requests + +@pytest.fixture +def api_client(): + """Base API client with auth.""" + return requests.Session() + +def test_create_order(api_client): + # Arrange: Set up test data + payload = { + "user_id": 123, + "items": [{"sku": "WIDGET", "quantity": 2}], + "shipping_address": "123 Main St" + } + + # Act: Make API call + response = api_client.post( + "https://api.example.com/orders", + json=payload, + headers={"Authorization": "Bearer test_token"} + ) + + # Assert: Validate response + assert response.status_code == 201 + data = response.json() + assert data["id"] is not None + assert data["status"] == "pending" + assert data["total"] > 0 +``` + +--- + +### GraphQL API Test + +```python +from gql import gql, Client +from gql.transport.requests import RequestsHTTPTransport + +def test_user_query(): + transport = RequestsHTTPTransport(url="https://api.example.com/graphql") + client = Client(transport=transport) + + query = gql(''' + query GetUser($id: ID!) { + user(id: $id) { + id + name + email + } + } + ''') + + result = client.execute(query, variable_values={"id": "123"}) + + assert result["user"]["id"] == "123" + assert result["user"]["email"] is not None +``` + +--- + +## What to Test + +### 1. Happy Path (Required) + +**Test successful requests with valid data.** + +```python +def test_get_user_success(): + response = api.get("/users/123") + assert response.status_code == 200 + assert response.json()["name"] == "Alice" +``` + +--- + +### 2. Validation Errors (Required) + +**Test API rejects invalid input.** + +```python +def test_create_user_invalid_email(): + response = api.post("/users", json={"email": "invalid"}) + + assert response.status_code == 400 + assert "email" in response.json()["errors"] +``` + +--- + +### 3. 
Authentication & Authorization (Required) + +**Test auth failures.** + +```python +def test_unauthorized_without_token(): + response = api.get("/orders", headers={}) # No auth token + + assert response.status_code == 401 + +def test_forbidden_different_user(): + response = api.get( + "/orders/999", + headers={"Authorization": "Bearer user_123_token"} + ) + + assert response.status_code == 403 # Can't access other user's orders +``` + +--- + +### 4. Edge Cases (Important) + +```python +def test_pagination_last_page(): + response = api.get("/users?page=999") + + assert response.status_code == 200 + assert response.json()["results"] == [] + +def test_large_payload(): + items = [{"sku": f"ITEM_{i}", "quantity": 1} for i in range(1000)] + response = api.post("/orders", json={"items": items}) + + assert response.status_code in [201, 413] # Created or payload too large +``` + +--- + +### 5. Idempotency (For POST/PUT/DELETE) + +**Test same request twice produces same result.** + +```python +def test_create_user_idempotent(): + payload = {"email": "alice@example.com", "name": "Alice"} + + # First request + response1 = api.post("/users", json=payload) + user_id_1 = response1.json()["id"] + + # Second identical request + response2 = api.post("/users", json=payload) + + # Should return existing user, not create duplicate + assert response2.status_code in [200, 409] # OK or Conflict + if response2.status_code == 200: + assert response2.json()["id"] == user_id_1 +``` + +--- + +## Schema Validation + +**Use JSON Schema to validate response structure.** + +```python +import jsonschema + +USER_SCHEMA = { + "type": "object", + "properties": { + "id": {"type": "integer"}, + "name": {"type": "string"}, + "email": {"type": "string", "format": "email"} + }, + "required": ["id", "name", "email"] +} + +def test_user_response_schema(): + response = api.get("/users/123") + + data = response.json() + jsonschema.validate(instance=data, schema=USER_SCHEMA) # Raises if invalid +``` + +**Why it matters:** Prevents regressions where fields are removed or types change. 
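To keep these checks consistent across a suite, one option is a small helper that wraps `jsonschema` and turns validation errors into readable test failures. A minimal sketch, reusing the `USER_SCHEMA` and `api` client from above (the `/users` list shape with a `results` array is an assumption):

```python
import jsonschema
import pytest

def assert_matches_schema(payload: dict, schema: dict) -> None:
    """Fail with a readable message when a response drifts from its schema."""
    try:
        jsonschema.validate(
            instance=payload,
            schema=schema,
            format_checker=jsonschema.FormatChecker(),  # enforces "format": "email"
        )
    except jsonschema.ValidationError as exc:
        pytest.fail(f"Schema drift at {list(exc.absolute_path)}: {exc.message}")

def test_user_list_response_schema():
    response = api.get("/users")

    assert response.status_code == 200
    for user in response.json()["results"]:
        assert_matches_schema(user, USER_SCHEMA)
```

The same helper drops straight into the versioning tests below, so type changes are caught as well as missing fields.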
+ +--- + +## API Versioning Tests + +**Test multiple API versions simultaneously.** + +```python +@pytest.mark.parametrize("version,expected_fields", [ + ("v1", ["id", "name"]), + ("v2", ["id", "name", "email", "created_at"]), +]) +def test_user_endpoint_version(version, expected_fields): + response = api.get(f"/{version}/users/123") + + data = response.json() + for field in expected_fields: + assert field in data +``` + +--- + +## Anti-Patterns Catalog + +### ❌ Testing Through the UI + +**Symptom:** Using browser automation to test API functionality + +```python +# ❌ BAD: Testing API via browser +def test_create_order(): + page.goto("/orders/new") + page.fill("#item", "Widget") + page.click("#submit") + assert page.locator(".success").is_visible() +``` + +**Why bad:** +- 10x slower than API test +- Flaky (browser timing issues) +- Couples API test to UI changes + +**Fix:** Test API directly + +```python +# ✅ GOOD: Direct API test +def test_create_order(): + response = api.post("/orders", json={"item": "Widget"}) + assert response.status_code == 201 +``` + +--- + +### ❌ Testing Implementation Details + +**Symptom:** Asserting on database queries, internal logic + +```python +# ❌ BAD: Testing implementation +def test_get_user(): + with patch('database.execute') as mock_db: + api.get("/users/123") + assert mock_db.called_with("SELECT * FROM users WHERE id = 123") +``` + +**Why bad:** Couples test to implementation, not contract + +**Fix:** Test only request/response contract + +```python +# ✅ GOOD: Test contract only +def test_get_user(): + response = api.get("/users/123") + assert response.status_code == 200 + assert response.json()["id"] == 123 +``` + +--- + +### ❌ No Test Data Isolation + +**Symptom:** Tests interfere with each other + +```python +# ❌ BAD: Shared test data +def test_update_user(): + api.put("/users/123", json={"name": "Bob"}) + assert api.get("/users/123").json()["name"] == "Bob" + +def test_get_user(): + # Fails if previous test ran! 
+ assert api.get("/users/123").json()["name"] == "Alice" +``` + +**Fix:** Each test creates/cleans its own data (see test-isolation-fundamentals skill) + +--- + +### ❌ Hardcoded URLs and Tokens + +**Symptom:** Production URLs or real credentials in tests + +```python +# ❌ BAD: Hardcoded production URL +def test_api(): + response = requests.get("https://api.production.com/users") +``` + +**Fix:** Use environment variables or fixtures + +```python +# ✅ GOOD: Configurable environment +import os + +@pytest.fixture +def api_base_url(): + return os.getenv("API_URL", "http://localhost:8000") + +def test_api(api_base_url): + response = requests.get(f"{api_base_url}/users") +``` + +--- + +## Mocking External APIs + +**When testing service A that calls service B:** + +```python +import responses + +@responses.activate +def test_payment_success(): + # Mock Stripe API + responses.add( + responses.POST, + "https://api.stripe.com/v1/charges", + json={"id": "ch_123", "status": "succeeded"}, + status=200 + ) + + # Test your API + response = api.post("/checkout", json={"amount": 1000}) + + assert response.status_code == 200 + assert response.json()["payment_status"] == "succeeded" +``` + +**When to mock:** +- External service costs money (Stripe, Twilio) +- External service is slow +- External service is unreliable +- Testing error handling (simulate failures) + +**When NOT to mock:** +- Integration tests (use separate test suite with real services) +- Contract tests (use Pact to verify integration) + +--- + +## Performance Testing APIs + +**Use load testing for APIs separately from E2E:** + +```python +# locust load test +from locust import HttpUser, task, between + +class APIUser(HttpUser): + wait_time = between(1, 3) + + @task + def get_users(self): + self.client.get("/users") + + @task(3) # 3x more frequent + def get_user(self): + self.client.get("/users/123") +``` + +**Run with:** +```bash +locust -f locustfile.py --headless -u 100 -r 10 --run-time 60s +``` + +See load-testing-patterns skill for comprehensive guidance. 
+ +--- + +## CI/CD Integration + +**API tests should run on every commit:** + +```yaml +# .github/workflows/api-tests.yml +name: API Tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Run API tests + run: | + pytest tests/api/ -v + env: + API_URL: http://localhost:8000 + API_TOKEN: ${{ secrets.TEST_API_TOKEN }} +``` + +**Test stages:** +- Commit: Smoke tests (5-10 critical endpoints, <1 min) +- PR: Full API suite (all endpoints, <5 min) +- Merge: API + integration tests (<15 min) + +--- + +## Quick Reference: API Test Checklist + +For each endpoint, test: + +- [ ] **Happy path** (valid request → 200/201) +- [ ] **Validation** (invalid input → 400) +- [ ] **Authentication** (no token → 401) +- [ ] **Authorization** (wrong user → 403) +- [ ] **Not found** (missing resource → 404) +- [ ] **Idempotency** (duplicate request → same result) +- [ ] **Schema** (response matches expected structure) +- [ ] **Edge cases** (empty lists, large payloads, pagination) + +--- + +## Bottom Line + +**API tests are faster, more reliable, and provide better coverage than E2E browser tests for backend logic.** + +- Test APIs directly, not through the browser +- Use your existing test framework (pytest/Jest) + HTTP client +- Validate schemas to catch breaking changes +- Mock external services to avoid flakiness and cost +- Run API tests on every commit (they're fast enough) + +**If you're using browser automation to test API functionality, you're doing it wrong. Test APIs directly.** diff --git a/skills/chaos-engineering-principles/SKILL.md b/skills/chaos-engineering-principles/SKILL.md new file mode 100644 index 0000000..34e0cf5 --- /dev/null +++ b/skills/chaos-engineering-principles/SKILL.md @@ -0,0 +1,242 @@ +--- +name: chaos-engineering-principles +description: Use when starting chaos engineering, designing fault injection experiments, choosing chaos tools, testing system resilience, or recovering from chaos incidents - provides hypothesis-driven testing, blast radius control, and anti-patterns for safe chaos +--- + +# Chaos Engineering Principles + +## Overview + +**Core principle:** Chaos engineering validates resilience through controlled experiments, not random destruction. + +**Rule:** Start in staging, with monitoring, with rollback, with small blast radius. No exceptions. 
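Chaos Toolkit and similar tools encode this discipline, a hypothesis, a bounded injection, and an automatic rollback, in the experiment definition itself. A minimal Chaos Toolkit-style sketch (service names, URLs, thresholds, and the `tc` commands are illustrative assumptions, not a recommended production config):

```json
{
  "version": "1.0.0",
  "title": "Recommendation service tolerates 500ms upstream latency",
  "description": "Hypothesis: the health endpoint keeps returning 200 while latency is injected in staging.",
  "steady-state-hypothesis": {
    "title": "Service is healthy",
    "probes": [
      {
        "type": "probe",
        "name": "health-endpoint-responds",
        "tolerance": 200,
        "provider": {
          "type": "http",
          "url": "https://staging.example.com/recommendations/health",
          "timeout": 3
        }
      }
    ]
  },
  "method": [
    {
      "type": "action",
      "name": "inject-500ms-latency",
      "provider": {
        "type": "process",
        "path": "tc",
        "arguments": "qdisc add dev eth0 root netem delay 500ms"
      },
      "pauses": { "after": 120 }
    }
  ],
  "rollbacks": [
    {
      "type": "action",
      "name": "remove-injected-latency",
      "provider": {
        "type": "process",
        "path": "tc",
        "arguments": "qdisc del dev eth0 root netem"
      }
    }
  ]
}
```

The steady-state hypothesis is checked before and after the injection, and the rollback runs regardless of outcome, which is exactly the shape the rest of this skill asks for.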
+ +## When NOT to Do Chaos + +Don't run chaos experiments if ANY of these are missing: +- ❌ No comprehensive monitoring (APM, metrics, logs, alerts) +- ❌ No automated rollback capability +- ❌ No baseline metrics documented +- ❌ No incident response team available +- ❌ System already unstable (fix stability first) +- ❌ No staging environment to practice + +**Fix these prerequisites BEFORE chaos testing.** + +## Tool Selection Decision Tree + +| Your Constraint | Choose | Why | +|----------------|--------|-----| +| Kubernetes-native, CNCF preference | **LitmusChaos** | Cloud-native, operator-based, excellent K8s integration | +| Kubernetes-focused, visualization needs | **Chaos Mesh** | Fine-grained control, dashboards, low overhead | +| Want managed service, quick start | **Gremlin** | Commercial, guided experiments, built-in best practices | +| Vendor-neutral, maximum flexibility | **Chaos Toolkit** | Open source, plugin ecosystem, any infrastructure | +| AWS-specific, cost-sensitive | **AWS FIS** | Native AWS integration, pay-per-experiment | + +**For most teams:** Chaos Toolkit (flexible, free) or Gremlin (fast, managed) + +## Prerequisites Checklist + +Before FIRST experiment: + +**Monitoring (Required):** +- [ ] Real-time dashboards for key metrics (latency, error rate, throughput) +- [ ] Distributed tracing for request flows +- [ ] Log aggregation with timeline correlation +- [ ] Alerts configured with thresholds + +**Rollback (Required):** +- [ ] Automated rollback based on metrics (e.g., error rate > 5% → abort) +- [ ] Manual kill switch everyone can activate +- [ ] Rollback tested and documented (< 30 sec recovery) + +**Baseline (Required):** +- [ ] Documented normal metrics (P50/P95/P99 latency, error rate %) +- [ ] Known dependencies and critical paths +- [ ] System architecture diagram + +**Team (Required):** +- [ ] Designated observer monitoring experiment +- [ ] On-call engineer available +- [ ] Communication channel established (war room, Slack) +- [ ] Post-experiment debrief scheduled + +## Anti-Patterns Catalog + +### ❌ Production First Chaos +**Symptom:** "Let's start chaos testing in production to see what breaks" + +**Why bad:** No practice, no muscle memory, production incidents guaranteed + +**Fix:** Run 5-10 experiments in staging FIRST. Graduate to production only after proving: experiments work as designed, rollback functions, team can execute response + +--- + +### ❌ Chaos Without Monitoring +**Symptom:** "We injected latency but we're not sure what happened" + +**Why bad:** Blind chaos = no learning. You can't validate resilience without seeing system behavior + +**Fix:** Set up comprehensive monitoring BEFORE first experiment. Must be able to answer "What changed?" within 30 seconds + +--- + +### ❌ Unlimited Blast Radius +**Symptom:** Affecting 100% of traffic/all services on first run + +**Why bad:** Cascading failures, actual outages, customer impact + +**Fix:** Start at 0.1-1% traffic. Progression: 0.1% → 1% → 5% → 10% → (stop or 50%). Each step validates before expanding + +--- + +### ❌ Chaos Without Rollback +**Symptom:** "The experiment broke everything and we can't stop it" + +**Why bad:** Chaos becomes real incident, 2+ hour recovery, lost trust + +**Fix:** Automated abort criteria (error rate threshold, latency threshold, manual kill switch). 
Test rollback before injecting failures + +--- + +### ❌ Random Chaos (No Hypothesis) +**Symptom:** "Let's inject some failures and see what happens" + +**Why bad:** No learning objective, can't validate resilience, wasted time + +**Fix:** Every experiment needs hypothesis: "System will [expected behavior] when [failure injected]" + +## Failure Types Catalog + +Priority order for microservices: + +| Failure Type | Priority | Why Test This | Example | +|--------------|----------|---------------|---------| +| **Network Latency** | HIGH | Most common production issue | 500ms delay service A → B | +| **Service Timeout** | HIGH | Tests circuit breakers, retry logic | Service B unresponsive | +| **Connection Loss** | HIGH | Tests failover, graceful degradation | TCP connection drops | +| **Resource Exhaustion** | MEDIUM | Tests resource limits, scaling | Memory limit, connection pool full | +| **Packet Loss** | MEDIUM | Tests retry strategies | 1-10% packet loss | +| **DNS Failure** | MEDIUM | Tests service discovery resilience | DNS resolution delays | +| **Cache Failure** | MEDIUM | Tests fallback behavior | Redis down | +| **Database Errors** | LOW (start) | High risk - test after basics work | Connection refused, query timeout | + +**Start with network latency** - safest, most informative, easiest rollback. + +## Experiment Template + +Use this for every chaos experiment: + +**1. Hypothesis** +"If [failure injected], system will [expected behavior], and [metric] will remain [threshold]" + +Example: "If service-payment experiences 2s latency, circuit breaker will open within 10s, and P99 latency will stay < 500ms" + +**2. Baseline Metrics** +- Current P50/P95/P99 latency: +- Current error rate: +- Current throughput: + +**3. Experiment Config** +- Failure type: [latency / packet loss / service down / etc.] +- Target: [specific service / % of traffic] +- Blast radius: [0.1% traffic, single region, canary pods] +- Duration: [2-5 minutes initial] +- Abort criteria: [error rate > 5% OR P99 > 1s OR manual stop] + +**4. Execution** +- Observer: [name] monitoring dashboards +- Runner: [name] executing experiment +- Kill switch: [procedure] +- Start time: [timestamp] + +**5. Observation** +- What happened vs hypothesis: +- Actual metrics during chaos: +- System behavior notes: + +**6. Validation** +- ✓ Hypothesis validated / ✗ Hypothesis failed +- Unexpected findings: +- Action items: + +## Blast Radius Progression + +Safe scaling path: + +| Step | Traffic Affected | Duration | Abort If | +|------|------------------|----------|----------| +| **1. Staging** | 100% staging | 5 min | Any production impact | +| **2. Canary** | 0.1% production | 2 min | Error rate > 1% | +| **3. Small** | 1% production | 5 min | Error rate > 2% | +| **4. Medium** | 5% production | 10 min | Error rate > 5% | +| **5. Large** | 10% production | 15 min | Error rate > 5% | + +**Never skip steps.** Each step validates before expanding. + +**Stop at 10-20% for most experiments** - no need to chaos 100% of production traffic. + +**Low-traffic services (< 1000 req/day):** Use absolute request counts instead of percentages. Minimum 5-10 affected requests per step. Example: 100 req/day service should still start with 5-10 requests (6 hours), not 0.1% (1 request every 10 days). + +## Your First Experiment (Staging) + +**Goal:** Build confidence, validate monitoring, test rollback + +**Experiment:** Network latency on non-critical service + +```bash +# Example with Chaos Toolkit +1. 
Pick least critical service (e.g., recommendation engine, not payment) +2. Inject 500ms latency to 100% of staging traffic +3. Duration: 5 minutes +4. Expected: Timeouts handled gracefully, fallback behavior activates +5. Monitor: Error rate, latency, downstream services +6. Abort if: Error rate > 10% or cascading failures +7. Debrief: What did we learn? Did monitoring catch it? Did rollback work? +``` + +**Success criteria:** You can answer "Did our hypothesis hold?" within 5 minutes of experiment completion. + +## Common Mistakes + +### ❌ Testing During Incidents +**Fix:** Only chaos test during stable periods, business hours, with extra staffing + +--- + +### ❌ Network Latency Underestimation +**Fix:** Latency cascades - 500ms can become 5s downstream. Start with 100-200ms, observe, then increase + +--- + +### ❌ No Post-Experiment Review +**Fix:** Every experiment gets 15-min debrief: What worked? What broke? What did we learn? + +## Quick Reference + +**Prerequisites Before First Chaos:** +1. Monitoring + alerts +2. Automated rollback +3. Baseline metrics documented +4. Team coordinated + +**Experiment Steps:** +1. Write hypothesis +2. Document baseline +3. Define blast radius (start 0.1%) +4. Set abort criteria +5. Execute with observer +6. Validate hypothesis +7. Debrief team + +**Blast Radius Progression:** +Staging → 0.1% → 1% → 5% → 10% (stop for most experiments) + +**First Experiment:** +Network latency (500ms) on non-critical service in staging for 5 minutes + +## Bottom Line + +**Chaos engineering is hypothesis-driven science, not random destruction.** + +Start small (staging, 0.1% traffic), with monitoring, with rollback. Graduate slowly. diff --git a/skills/contract-testing/SKILL.md b/skills/contract-testing/SKILL.md new file mode 100644 index 0000000..188fcca --- /dev/null +++ b/skills/contract-testing/SKILL.md @@ -0,0 +1,524 @@ +--- +name: contract-testing +description: Use when implementing Pact contracts, choosing consumer-driven vs provider-driven approaches, handling breaking API changes, setting up contract brokers, or preventing service integration issues - provides tool selection, anti-patterns, and workflow patterns +--- + +# Contract Testing + +## Overview + +**Core principle:** Test the contract, not the implementation. Verify integration points independently. + +**Rule:** Contract tests catch breaking changes before deployment, not in production. + +## Tool Selection Decision Tree + +| Your Stack | Team Structure | Use | Why | +|-----------|----------------|-----|-----| +| Polyglot microservices | Multiple teams | **Pact** | Language-agnostic, mature broker | +| Java Spring ecosystem | Coordinated teams | **Spring Cloud Contract** | Spring integration, code-first | +| GraphQL APIs | Known consumers | **Pact + GraphQL** | Query validation | +| OpenAPI/REST | Public/many consumers | **OpenAPI Spec Testing** | Schema-first, documentation | + +**First choice:** Pact (most mature ecosystem, widest language support) + +**Why contract testing:** Catches API breaking changes in CI, not production. Teams test independently without running dependencies. 
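The sections below focus on the consumer-driven (Pact) workflow. For the schema-first route in the next table, the OpenAPI document itself is the contract, and a lightweight CI check covers a lot. A sketch, assuming an `openapi.yaml` in the repo and a locally running provider (the Schemathesis flags shown are from its 3.x CLI):

```bash
# Lint the contract itself (style rules, plus custom rules for breaking changes)
npx @stoplight/spectral-cli lint openapi.yaml

# Exercise the running provider against its own published schema
schemathesis run http://localhost:8000/openapi.json --checks all
```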
+ +## Contract Type Decision Framework + +| Scenario | Approach | Tools | +|----------|----------|-------| +| **Internal microservices, known consumers** | Consumer-Driven (CDC) | Pact, Spring Cloud Contract | +| **Public API, many unknown consumers** | Provider-Driven (Schema-First) | OpenAPI validation, Spectral | +| **Both internal and external consumers** | Bi-Directional | Pact + OpenAPI | +| **Event-driven/async messaging** | Message Pact | Pact (message provider/consumer) | + +**Default:** Consumer-driven for internal services, schema-first for public APIs + +## Anti-Patterns Catalog + +### ❌ Over-Specification +**Symptom:** Contract tests verify exact response format, including fields consumer doesn't use + +**Why bad:** Brittle tests, provider can't evolve API, false positives + +**Fix:** Only specify what consumer actually uses + +```javascript +// ❌ Bad - over-specified +.willRespondWith({ + status: 200, + body: { + id: 123, + name: 'John', + email: 'john@example.com', + created_at: '2023-01-01', + updated_at: '2023-01-02', + phone: '555-1234', + address: {...} // Consumer doesn't use these + } +}) + +// ✅ Good - specify only what's used +.willRespondWith({ + status: 200, + body: { + id: Matchers.integer(123), + name: Matchers.string('John') + } +}) +``` + +--- + +### ❌ Testing Implementation Details +**Symptom:** Contract tests verify database queries, internal logic, or response timing + +**Why bad:** Couples tests to implementation, not contract + +**Fix:** Test only request/response contract, not how provider implements it + +```javascript +// ❌ Bad - testing implementation +expect(provider.database.queryCalled).toBe(true) + +// ✅ Good - testing contract only +expect(response.status).toBe(200) +expect(response.body.name).toBe('John') +``` + +--- + +### ❌ Brittle Provider States +**Symptom:** Provider states hardcode IDs, dates, or specific data that changes + +**Why bad:** Tests fail randomly, high maintenance + +**Fix:** Use matchers, generate data in state setup + +```javascript +// ❌ Bad - hardcoded state +.given('user 123 exists') +.uponReceiving('request for user 123') +.withRequest({ path: '/users/123' }) + +// ✅ Good - flexible state +.given('a user exists') +.uponReceiving('request for user') +.withRequest({ path: Matchers.regex('/users/\\d+', '/users/123') }) +.willRespondWith({ + body: { + id: Matchers.integer(123), + name: Matchers.string('John') + } +}) +``` + +--- + +### ❌ No Contract Versioning +**Symptom:** Breaking changes deployed without consumer coordination + +**Why bad:** Runtime failures, production incidents + +**Fix:** Use can-i-deploy, tag contracts by environment + +```bash +# ✅ Good - check before deploying +pact-broker can-i-deploy \ + --pacticipant UserService \ + --version 2.0.0 \ + --to production +``` + +--- + +### ❌ Missing Can-I-Deploy +**Symptom:** Deploying without checking if all consumers compatible + +**Why bad:** Deploy provider changes that break consumers + +**Fix:** Run can-i-deploy in CI before deployment + +## Pact Broker Workflow + +**Core workflow:** + +1. **Consumer:** Write contract test → Generate pact file +2. **Consumer CI:** Publish pact to broker with version tag +3. **Provider CI:** Fetch contracts → Verify → Publish results +4. 
**Provider CD:** Run can-i-deploy → Deploy if compatible + +### Publishing Contracts + +```bash +# Consumer publishes pact with version and branch +pact-broker publish pacts/ \ + --consumer-app-version ${GIT_SHA} \ + --branch ${GIT_BRANCH} \ + --tag ${ENV} +``` + +### Verifying Contracts + +```javascript +// Provider verifies against broker +const { Verifier } = require('@pact-foundation/pact') + +new Verifier({ + providerBaseUrl: 'http://localhost:8080', + pactBrokerUrl: process.env.PACT_BROKER_URL, + provider: 'UserService', + publishVerificationResult: true, + providerVersion: process.env.GIT_SHA, + consumerVersionSelectors: [ + { mainBranch: true }, // Latest from main + { deployed: 'production' }, // Currently in production + { deployed: 'staging' } // Currently in staging + ] +}).verifyProvider() +``` + +### Can-I-Deploy Check + +```yaml +# CI/CD pipeline (GitHub Actions example) +- name: Check if can deploy + run: | + pact-broker can-i-deploy \ + --pacticipant UserService \ + --version ${{ github.sha }} \ + --to-environment production +``` + +**Rule:** Never deploy without can-i-deploy passing + +## Breaking Change Taxonomy + +| Change Type | Breaking? | Migration Strategy | +|-------------|-----------|-------------------| +| Add optional field | No | Deploy provider first | +| Add required field | Yes | Use expand/contract pattern | +| Remove field | Yes | Deprecate → verify no consumers use → remove | +| Change field type | Yes | Add new field → migrate consumers → remove old | +| Rename field | Yes | Add new → deprecate old → remove old | +| Change status code | Yes | Version API or expand responses | + +### Expand/Contract Pattern + +**For adding required field:** + +**Expand (Week 1-2):** +```javascript +// Provider adds NEW field (optional), keeps OLD field +{ + user_name: "John", // Old field (deprecated) + name: "John" // New field +} +``` + +**Migrate (Week 3-4):** +- Consumers update to use new field +- Update contracts +- Verify all consumers migrated + +**Contract (Week 5):** +```javascript +// Provider removes old field +{ + name: "John" // Only new field remains +} +``` + +## Provider State Patterns + +**Purpose:** Set up test data before verification + +**Pattern:** Use state handlers to create/clean up data + +```javascript +// Provider state setup +const { Verifier } = require('@pact-foundation/pact') + +new Verifier({ + stateHandlers: { + 'a user exists': async () => { + // Setup: Create test user + await db.users.create({ + id: 123, + name: 'John Doe' + }) + }, + 'no users exist': async () => { + // Setup: Clear users + await db.users.deleteAll() + } + }, + afterEach: async () => { + // Cleanup after each verification + await db.users.deleteAll() + } +}).verifyProvider() +``` + +**Best practices:** +- States should be independent +- Clean up after each verification +- Use transactions for database tests +- Don't hardcode IDs (use matchers) + +## Async/Event-Driven Messaging Contracts + +**For Kafka, RabbitMQ, SNS/SQS:** Use Message Pact (different API than HTTP Pact) + +### Consumer Message Contract + +```javascript +const { MessageConsumerPact, MatchersV3 } = require('@pact-foundation/pact') + +describe('User Event Consumer', () => { + const messagePact = new MessageConsumerPact({ + consumer: 'NotificationService', + provider: 'UserService' + }) + + it('processes user created events', () => { + return messagePact + .expectsToReceive('user created event') + .withContent({ + userId: MatchersV3.integer(123), + email: MatchersV3.string('user@example.com'), + 
eventType: 'USER_CREATED' + }) + .withMetadata({ + 'content-type': 'application/json' + }) + .verify((message) => { + processUserCreatedEvent(message.contents) + }) + }) +}) +``` + +### Provider Message Verification + +```javascript +// Provider verifies it can produce matching messages +const { MessageProviderPact } = require('@pact-foundation/pact') + +describe('User Event Producer', () => { + it('publishes user created events matching contracts', () => { + return new MessageProviderPact({ + messageProviders: { + 'user created event': () => ({ + contents: { + userId: 123, + email: 'test@example.com', + eventType: 'USER_CREATED' + }, + metadata: { + 'content-type': 'application/json' + } + }) + } + }).verify() + }) +}) +``` + +### Key Differences from HTTP Contracts + +- **No request/response:** Only message payload +- **Metadata:** Headers, content-type, message keys +- **Ordering:** Don't test message ordering in contracts (infrastructure concern) +- **Delivery:** Don't test delivery guarantees (wrong layer) + +**Workflow:** Same as HTTP (publish pact → verify → can-i-deploy) + +## CI/CD Integration Quick Reference + +### GitHub Actions + +```yaml +# Consumer publishes contracts +- name: Run Pact tests + run: npm test + +- name: Publish pacts + run: | + npm run pact:publish + env: + PACT_BROKER_URL: ${{ secrets.PACT_BROKER_URL }} + PACT_BROKER_TOKEN: ${{ secrets.PACT_BROKER_TOKEN }} + +# Provider verifies and checks deployment +- name: Verify contracts + run: npm run pact:verify + +- name: Can I deploy? + run: | + pact-broker can-i-deploy \ + --pacticipant UserService \ + --version ${{ github.sha }} \ + --to-environment production +``` + +### GitLab CI + +```yaml +pact_test: + script: + - npm test + - npm run pact:publish + +pact_verify: + script: + - npm run pact:verify + - pact-broker can-i-deploy --pacticipant UserService --version $CI_COMMIT_SHA --to-environment production +``` + +## Your First Contract Test + +**Goal:** Prevent breaking changes between two services in one week + +**Day 1-2: Consumer Side** + +```javascript +// Install Pact +npm install --save-dev @pact-foundation/pact + +// Consumer contract test (order-service) +const { PactV3, MatchersV3 } = require('@pact-foundation/pact') +const { getUserById } = require('./userClient') + +describe('User API', () => { + const provider = new PactV3({ + consumer: 'OrderService', + provider: 'UserService' + }) + + it('gets user by id', () => { + provider + .given('a user exists') + .uponReceiving('a request for user') + .withRequest({ + method: 'GET', + path: '/users/123' + }) + .willRespondWith({ + status: 200, + body: { + id: MatchersV3.integer(123), + name: MatchersV3.string('John') + } + }) + + return provider.executeTest(async (mockServer) => { + const user = await getUserById(mockServer.url, 123) + expect(user.name).toBe('John') + }) + }) +}) +``` + +**Day 3-4: Set Up Pact Broker** + +```bash +# Docker Compose +docker-compose up -d + +# Or use hosted Pactflow (SaaS) +# https://pactflow.io +``` + +**Day 5-6: Provider Side** + +```javascript +// Provider verification (user-service) +const { Verifier } = require('@pact-foundation/pact') +const app = require('./app') + +describe('Pact Verification', () => { + it('validates contracts from broker', () => { + return new Verifier({ + provider: 'UserService', + providerBaseUrl: 'http://localhost:8080', + pactBrokerUrl: process.env.PACT_BROKER_URL, + publishVerificationResult: true, + providerVersion: '1.0.0', + + stateHandlers: { + 'a user exists': async () => { + await 
db.users.create({ id: 123, name: 'John' }) + } + } + }).verifyProvider() + }) +}) +``` + +**Day 7: Add to CI** + +```yaml +# Add can-i-deploy before deployment +- pact-broker can-i-deploy --pacticipant UserService --version $VERSION --to production +``` + +## Common Mistakes + +### ❌ Testing Business Logic in Contracts +**Fix:** Contract tests verify integration only. Test business logic separately. + +--- + +### ❌ Not Using Matchers +**Fix:** Use `Matchers.string()`, `Matchers.integer()` for flexible matching + +--- + +### ❌ Skipping Can-I-Deploy +**Fix:** Always run can-i-deploy before deployment. Automate in CI. + +--- + +### ❌ Hardcoding Test Data +**Fix:** Generate data in provider states, use matchers in contracts + +## Quick Reference + +**Tool Selection:** +- Polyglot/multiple teams: Pact +- Java Spring only: Spring Cloud Contract +- Public API: OpenAPI validation + +**Contract Type:** +- Internal services: Consumer-driven (Pact) +- Public API: Provider-driven (OpenAPI) +- Both: Bi-directional + +**Pact Broker Workflow:** +1. Consumer publishes pact +2. Provider verifies +3. Can-i-deploy checks compatibility +4. Deploy if compatible + +**Breaking Changes:** +- Add optional field: Safe +- Add required field: Expand/contract pattern +- Remove/rename field: Deprecate → migrate → remove + +**Provider States:** +- Set up test data +- Clean up after each test +- Use transactions for DB +- Don't hardcode IDs + +**CI/CD:** +- Consumer: Test → publish pacts +- Provider: Verify → can-i-deploy → deploy + +## Bottom Line + +**Contract testing prevents API breaking changes by testing integration points independently. Use Pact for internal microservices, publish contracts to broker, run can-i-deploy before deployment.** + +Test the contract (request/response), not the implementation. Use consumer-driven contracts for known consumers, schema-first for public APIs. diff --git a/skills/dependency-scanning/SKILL.md b/skills/dependency-scanning/SKILL.md new file mode 100644 index 0000000..c3f2b01 --- /dev/null +++ b/skills/dependency-scanning/SKILL.md @@ -0,0 +1,429 @@ +--- +name: dependency-scanning +description: Use when integrating SCA tools (Snyk, Dependabot, OWASP Dependency-Check), automating vulnerability management, handling license compliance, setting up automated dependency updates, or managing security advisories - provides tool selection, PR automation workflows, and false positive management +--- + +# Dependency Scanning + +## Overview + +**Core principle:** Third-party dependencies introduce security vulnerabilities and license risks. Automate scanning to catch them early. + +**Rule:** Block merges on critical/high vulnerabilities in direct dependencies. Monitor and plan fixes for transitive dependencies. 
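As a concrete starting point, that merge-blocking rule can be a single CI step that fails when the ecosystem's audit tool reports anything at or above your threshold. A minimal GitHub Actions sketch (the tool and threshold are assumptions; swap in whatever matches your stack):

```yaml
# One-line merge gate: fails the job on high/critical advisories,
# including those pulled in through transitive dependencies.
- name: Audit dependencies
  run: npm audit --audit-level=high
  # Python: pip-audit    Ruby: bundle exec bundler-audit check --update
```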
+ +## Why Dependency Scanning Matters + +**Security vulnerabilities:** +- 80% of codebases contain at least one vulnerable dependency +- Log4Shell (CVE-2021-44228) affected millions of applications +- Attackers actively scan GitHub for known vulnerabilities + +**License compliance:** +- GPL dependencies in proprietary software = legal risk +- Some licenses require source code disclosure +- Incompatible license combinations + +--- + +## Tool Selection + +| Tool | Use Case | Cost | Best For | +|------|----------|------|----------| +| **Dependabot** | Automated PRs for updates | Free (GitHub) | GitHub projects, basic scanning | +| **Snyk** | Comprehensive security + license scanning | Free tier, paid plans | Production apps, detailed remediation | +| **OWASP Dependency-Check** | Security-focused, self-hosted | Free | Privacy-sensitive, custom workflows | +| **npm audit** | JavaScript quick scan | Free | Quick local checks | +| **pip-audit** | Python quick scan | Free | Quick local checks | +| **bundler-audit** | Ruby quick scan | Free | Quick local checks | + +**Recommended setup:** +- **GitHub repos:** Dependabot (automated) + Snyk (security focus) +- **Self-hosted:** OWASP Dependency-Check +- **Quick local checks:** npm audit / pip-audit + +--- + +## Dependabot Configuration + +### Enable Dependabot (GitHub) + +``yaml +# .github/dependabot.yml +version: 2 +updates: + - package-ecosystem: "npm" + directory: "/" + schedule: + interval: "weekly" + day: "monday" + open-pull-requests-limit: 5 + labels: + - "dependencies" + reviewers: + - "security-team" + + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + target-branch: "develop" +``` + +**What Dependabot does:** +- Scans dependencies weekly +- Creates PRs for vulnerabilities +- Updates to safe versions +- Provides CVE details + +--- + +## Snyk Integration + +### Installation + +```bash +npm install -g snyk +snyk auth # Authenticate with Snyk account +``` + +--- + +### Scan Local Project + +```bash +# Test for vulnerabilities +snyk test + +# Monitor project (continuous scanning) +snyk monitor +``` + +--- + +### CI/CD Integration + +```yaml +# .github/workflows/snyk.yml +name: Snyk Security Scan + +on: [pull_request, push] + +jobs: + security: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Run Snyk + uses: snyk/actions/node@master + env: + SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} + with: + args: --severity-threshold=high # Fail on high+ severity +``` + +**Severity thresholds:** +- **Critical:** Block merge immediately +- **High:** Block merge, fix within 7 days +- **Medium:** Create issue, fix within 30 days +- **Low:** Monitor, fix opportunistically + +--- + +## OWASP Dependency-Check + +### Installation + +```bash +# Download latest release +wget https://github.com/jeremylong/DependencyCheck/releases/download/v8.0.0/dependency-check-8.0.0-release.zip +unzip dependency-check-8.0.0-release.zip +``` + +--- + +### Run Scan + +```bash +# Scan project +./dependency-check/bin/dependency-check.sh \ + --scan ./src \ + --format HTML \ + --out ./reports \ + --suppression ./dependency-check-suppressions.xml +``` + +--- + +### Suppression File (False Positives) + +```xml + + + + + False positive - CVE applies to server mode only, we use client mode + CVE-2021-12345 + + +``` + +--- + +## License Compliance + +### Checking Licenses (npm) + +```bash +# List all licenses +npx license-checker + +# Filter incompatible licenses +npx license-checker --onlyAllow 'MIT;Apache-2.0;BSD-3-Clause' +``` + +--- + +### 
Blocking Incompatible Licenses + +```json +// package.json +{ + "scripts": { + "license-check": "license-checker --onlyAllow 'MIT;Apache-2.0;BSD-3-Clause;ISC' --production" + } +} +``` + +```yaml +# CI: Fail if incompatible licenses detected +- name: Check licenses + run: npm run license-check +``` + +**Common license risks:** +- **GPL/AGPL:** Requires source code disclosure +- **SSPL:** Restrictive for SaaS +- **Proprietary:** May prohibit commercial use + +--- + +## Automated Dependency Updates + +### Auto-Merge Strategy + +**Safe to auto-merge:** +- Patch versions (1.2.3 → 1.2.4) +- No breaking changes +- Passing all tests + +```yaml +# .github/workflows/auto-merge-dependabot.yml +name: Auto-merge Dependabot PRs + +on: pull_request + +jobs: + auto-merge: + runs-on: ubuntu-latest + if: github.actor == 'dependabot[bot]' + steps: + - name: Check if patch update + id: check + run: | + # Only auto-merge patch/minor, not major + if [[ "${{ github.event.pull_request.title }}" =~ ^Bump.*from.*\.[0-9]+$ ]]; then + echo "auto_merge=true" >> $GITHUB_OUTPUT + fi + + - name: Enable auto-merge + if: steps.check.outputs.auto_merge == 'true' + run: gh pr merge --auto --squash "$PR_URL" + env: + PR_URL: ${{ github.event.pull_request.html_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +``` + +--- + +## Vulnerability Remediation Workflow + +### 1. Triage (Within 24 hours) + +**For each vulnerability:** +- **Assess severity:** Critical → immediate, High → 7 days, Medium → 30 days +- **Check exploitability:** Is it reachable in our code? +- **Verify patch availability:** Is there a fixed version? + +--- + +### 2. Remediation Options + +| Option | When to Use | Example | +|--------|-------------|---------| +| **Update dependency** | Patch available | `npm update lodash` | +| **Update lockfile only** | Transitive dependency | `npm audit fix` | +| **Replace dependency** | No patch, actively exploited | Replace `request` with `axios` | +| **Apply workaround** | No patch, low risk | Disable vulnerable feature | +| **Accept risk** | False positive, not exploitable | Document in suppression file | + +--- + +### 3. Verification + +```bash +# After fix, verify vulnerability is resolved +npm audit +snyk test + +# Run full test suite +npm test +``` + +--- + +## Anti-Patterns Catalog + +### ❌ Ignoring Transitive Dependencies + +**Symptom:** "We don't use that library directly, so it's fine" + +**Why bad:** Transitive dependencies are still in your app + +``` +Your App + └─ express@4.18.0 + └─ body-parser@1.19.0 + └─ qs@6.7.0 (vulnerable!) +``` + +**Fix:** Update parent dependency or override version + +```json +// package.json - force safe version +{ + "overrides": { + "qs": "^6.11.0" + } +} +``` + +--- + +### ❌ Auto-Merging All Updates + +**Symptom:** Dependabot PRs merged without review + +**Why bad:** +- Major versions can break functionality +- Updates may introduce new bugs +- No verification tests run + +**Fix:** Auto-merge only patch versions, review major/minor + +--- + +### ❌ Suppressing Without Investigation + +**Symptom:** Marking all vulnerabilities as false positives + +```xml + + + CVE-2021-12345 + +``` + +**Fix:** Document WHY it's suppressed + +```xml + + + + False positive: CVE applies to XML parsing feature. + We only use JSON parsing (verified in code review). 
+ Tracking issue: #1234 + + CVE-2021-12345 + +``` + +--- + +### ❌ No SLA for Fixes + +**Symptom:** Vulnerabilities sit unfixed for months + +**Fix:** Define SLAs by severity + +**Example SLA:** +- **Critical:** Fix within 24 hours +- **High:** Fix within 7 days +- **Medium:** Fix within 30 days +- **Low:** Fix within 90 days or next release + +--- + +## Monitoring & Alerting + +### Slack Notifications + +```yaml +# .github/workflows/security-alerts.yml +name: Security Alerts + +on: + schedule: + - cron: '0 9 * * *' # Daily at 9 AM + +jobs: + scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Run Snyk + id: snyk + run: | + snyk test --json > snyk-results.json || true + + - name: Send Slack alert + if: steps.snyk.outcome == 'failure' + uses: slackapi/slack-github-action@v1 + with: + payload: | + { + "text": "🚨 Security vulnerabilities detected!", + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Critical vulnerabilities found in dependencies*\nView details: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + } + } + ] + } + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }} +``` + +--- + +## Bottom Line + +**Automate dependency scanning to catch vulnerabilities and license issues early. Block merges on critical issues, monitor and plan fixes for others.** + +**Setup:** +- Enable Dependabot (automated PRs) +- Add Snyk or OWASP Dependency-Check (security scanning) +- Check licenses (license-checker) +- Define SLAs (Critical: 24h, High: 7d, Medium: 30d) + +**Remediation:** +- Update dependencies to patched versions +- Override transitive dependencies if needed +- Document suppressions with justification +- Verify fixes with tests + +**If you're not scanning dependencies, you're shipping known vulnerabilities. Automate it in CI/CD.** diff --git a/skills/e2e-testing-strategies/SKILL.md b/skills/e2e-testing-strategies/SKILL.md new file mode 100644 index 0000000..14ff6ce --- /dev/null +++ b/skills/e2e-testing-strategies/SKILL.md @@ -0,0 +1,290 @@ +--- +name: e2e-testing-strategies +description: Use when designing E2E test architecture, choosing between Cypress/Playwright/Selenium, prioritizing which flows to test, fixing flaky E2E tests, or debugging slow E2E test suites - provides production-tested patterns and anti-patterns +--- + +# E2E Testing Strategies + +## Overview + +**Core principle:** E2E tests are expensive. Use them sparingly for critical multi-system flows. Everything else belongs lower in the test pyramid. + +**Test pyramid target:** 5-10% E2E, 20-25% integration, 65-75% unit + +**Scope:** This skill focuses on web application E2E testing (browser-based). For mobile app testing (iOS/Android), decision tree points to Appium, but patterns/anti-patterns here are web-specific. Mobile testing requires different strategies for device capabilities, native selectors, and app lifecycle. 
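To ground the patterns that follow, this is the shape of a single critical-flow E2E test the skill is steering toward: state seeded via API, `data-testid` selectors, condition-based assertions. A Playwright sketch (the `seedCartViaAPI` helper, route, and selectors are illustrative assumptions):

```javascript
// checkout.spec.js - one E2E test for a top-scoring flow (see prioritization matrix below)
const { test, expect } = require('@playwright/test');
const { seedCartViaAPI } = require('./helpers/seed'); // hypothetical API-seeding helper

test('checkout completes and shows confirmation', async ({ page, request }) => {
  // Arrange: create state through the API, not by clicking through unrelated UI
  const cart = await seedCartViaAPI(request, { items: [{ sku: 'WIDGET', quantity: 2 }] });

  // Act: exercise only the flow under test
  await page.goto(`/checkout/${cart.id}`);
  await page.fill('[data-testid="card-number"]', '4242424242424242');
  await page.click('[data-testid="pay"]');

  // Assert: wait on a visible condition, never a fixed timeout
  await expect(page.locator('[data-testid="order-confirmation"]')).toBeVisible();
});
```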
+ +## Framework Selection Decision Tree + +Choose framework based on constraints: + +| Your Constraint | Choose | Why | +|----------------|--------|-----| +| Need cross-browser (Chrome/Firefox/Safari) | **Playwright** | Native multi-browser, auto-wait, trace viewer | +| Team unfamiliar with testing | **Cypress** | Simpler API, better DX, larger community | +| Enterprise/W3C standard requirement | **WebdriverIO** | Full W3C WebDriver protocol | +| Headless Chrome only, fine-grained control | **Puppeteer** | Lower-level, faster for Chrome-only | +| Testing Electron apps | **Spectron** or **Playwright** | Native Electron support | +| Mobile apps (iOS/Android) | **Appium** | Mobile-specific protocol (Note: rest of this skill is web-focused) | + +**For most web apps:** Playwright (modern, reliable) or Cypress (simpler DX) + +## Flow Prioritization Matrix + +When you have 50 flows but can only test 10 E2E: + +| Score | Criteria | Weight | +|-------|----------|--------| +| +3 | Revenue impact (checkout, payment, subscription) | High | +| +3 | Multi-system integration (API + DB + email + payment) | High | +| +2 | Historical production failures (has broken before) | Medium | +| +2 | Complex state management (auth, sessions, caching) | Medium | +| +1 | User entry point (login, signup, search) | Medium | +| +1 | Regulatory/compliance requirement | Medium | +| -2 | Can be tested at integration level | Penalty | +| -3 | Mostly UI interaction, no backend | Penalty | + +**Score flows 0-10, test top 10.** Everything else → integration/unit tests. + +**Example:** +- "User checkout flow" = +3 revenue +3 multi-system +2 historical +2 state = **10** → E2E +- "User changes email preference" = +1 entry -2 integration level = **-1** → Integration test + +## Anti-Patterns Catalog + +### ❌ Pyramid Inversion +**Symptom:** 200 E2E tests, 50 integration tests, 100 unit tests + +**Why bad:** E2E tests are slow (30min CI), brittle (UI changes break tests), hard to debug + +**Fix:** Invert back - move 150 E2E tests down to integration/unit + +--- + +### ❌ Testing Through the UI +**Symptom:** E2E test creates 10 users through signup form to test one admin feature + +**Why bad:** Slow, couples unrelated features + +**Fix:** Seed data via API/database, test only the admin feature flow + +--- + +### ❌ Arbitrary Timeouts +**Symptom:** `wait(5000)` sprinkled throughout tests + +**Why bad:** Flaky - sometimes too short, sometimes wastes time + +**Fix:** Explicit waits for conditions +```javascript +// ❌ Bad +await page.click('button'); +await page.waitForTimeout(5000); + +// ✅ Good +await page.click('button'); +await page.waitForSelector('.success-message'); +``` + +--- + +### ❌ God Page Objects +**Symptom:** Single `PageObject` class with 50 methods for entire app + +**Why bad:** Tight coupling, hard to maintain, unclear responsibilities + +**Fix:** One page object per logical page/component +```javascript +// ❌ Bad: God object +class AppPage { + async login() {} + async createPost() {} + async deleteUser() {} + async exportReport() {} + // ... 
50 more methods +} + +// ✅ Good: Focused page objects +class AuthPage { + async login() {} + async logout() {} +} + +class PostsPage { + async create() {} + async delete() {} +} +``` + +--- + +###❌ Brittle Selectors +**Symptom:** `page.click('.btn-primary.mt-4.px-3')` + +**Why bad:** Breaks when CSS changes + +**Fix:** Use `data-testid` attributes +```javascript +// ❌ Bad +await page.click('.submit-button.btn.btn-primary'); + +// ✅ Good +await page.click('[data-testid="submit"]'); +``` + +--- + +### ❌ Test Interdependence +**Symptom:** Test 5 fails if Test 3 doesn't run first + +**Why bad:** Can't run tests in parallel, hard to debug + +**Fix:** Each test sets up own state +```javascript +// ❌ Bad +test('create user', async () => { + // creates user "test@example.com" +}); + +test('login user', async () => { + // assumes user from previous test exists +}); + +// ✅ Good +test('login user', async ({ page }) => { + await createUserViaAPI('test@example.com'); // independent setup + await page.goto('/login'); + // test login flow +}); +``` + +## Flakiness Patterns Catalog + +Common flake sources and fixes: + +| Pattern | Symptom | Fix | +|---------|---------|-----| +| **Network Race** | "Element not found" intermittently | `await page.waitForLoadState('networkidle')` | +| **Animation Race** | "Element not clickable" | `await page.waitForSelector('.element', { state: 'visible' })` or disable animations | +| **Async State** | "Expected 'success' but got ''" | Wait for specific state, not timeout | +| **Test Data Pollution** | Test passes alone, fails in suite | Isolate data per test (unique IDs, cleanup) | +| **Browser Caching** | Different results first vs second run | Clear cache/cookies between tests | +| **Date/Time Sensitivity** | Test fails at midnight, passes during day | Mock system time in tests | +| **External Service** | Third-party API occasionally down | Mock external dependencies | + +**Rule:** If test fails <5% of time, it's flaky. Fix it before adding more tests. 
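A practical way to apply that rule: before attempting a fix, reproduce the flake deliberately by re-running the suspect test many times with retries disabled. A sketch using Playwright's CLI (the file name is an assumption; pytest has an equivalent via the `pytest-repeat` plugin's `--count` option):

```bash
# A ~2% failure rate needs dozens of runs to show up reliably
npx playwright test tests/checkout.spec.js --repeat-each=50 --retries=0 --workers=4
```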
+ +## Page Object Anti-Patterns + +### ❌ Business Logic in Page Objects +```javascript +// ❌ Bad +class CheckoutPage { + async calculateTotal(items) { + return items.reduce((sum, item) => sum + item.price, 0); // business logic + } +} + +// ✅ Good +class CheckoutPage { + async getTotal() { + return await page.textContent('[data-testid="total"]'); // UI interaction only + } +} +``` + +### ❌ Assertions in Page Objects +```javascript +// ❌ Bad +class LoginPage { + async login(email, password) { + await this.page.fill('[data-testid="email"]', email); + await this.page.fill('[data-testid="password"]', password); + await this.page.click('[data-testid="submit"]'); + expect(this.page.url()).toContain('/dashboard'); // assertion + } +} + +// ✅ Good +class LoginPage { + async login(email, password) { + await this.page.fill('[data-testid="email"]', email); + await this.page.fill('[data-testid="password"]', password); + await this.page.click('[data-testid="submit"]'); + } + + async isOnDashboard() { + return this.page.url().includes('/dashboard'); + } +} + +// Test file handles assertions +test('login', async () => { + await loginPage.login('user@test.com', 'password'); + expect(await loginPage.isOnDashboard()).toBe(true); +}); +``` + +## Quick Reference + +### When to Use E2E vs Integration vs Unit + +| Scenario | Test Level | Reasoning | +|----------|-----------|-----------| +| Form validation logic | Unit | Pure function, no UI needed | +| API error handling | Integration | Test API contract, no browser | +| Multi-step checkout | E2E | Crosses systems, critical revenue | +| Button hover states | Visual regression | Not functional behavior | +| Login → dashboard redirect | E2E | Auth critical, multi-system | +| Database query performance | Integration | No UI, just DB | +| User can filter search results | E2E (1 test) + Integration (variations) | 1 E2E for happy path, rest integration | + +### Test Data Strategies + +| Approach | When to Use | Pros | Cons | +|----------|-------------|------|------| +| **API Seeding** | Most tests | Fast, consistent | Requires API access | +| **Database Seeding** | Integration tests | Complete control | Slow, requires DB access | +| **UI Creation** | Testing creation flow itself | Tests real user path | Slow, couples tests | +| **Mocking** | External services | Fast, reliable | Misses real integration issues | +| **Fixtures** | Consistent test data | Reusable, version-controlled | Stale if schema changes | + +## Common Mistakes + +### ❌ Running Full Suite on Every Commit +**Symptom:** 30-minute CI blocking every PR + +**Fix:** Smoke tests (5-10 critical flows) on PR, full suite on merge/nightly + +--- + +### ❌ Not Capturing Failure Artifacts +**Symptom:** "Test failed in CI but I can't reproduce" + +**Fix:** Save video + trace on failure +```javascript +// playwright.config.js +use: { + video: 'retain-on-failure', + trace: 'retain-on-failure', +} +``` + +--- + +### ❌ Testing Implementation Details +**Symptom:** Tests assert internal component state + +**Fix:** Test user-visible behavior only + +--- + +### ❌ One Assert Per Test +**Symptom:** 50 E2E tests all navigate to same page, test one thing + +**Fix:** Group related assertions in one flow test (but keep focused) + +## Bottom Line + +**E2E tests verify critical multi-system flows work for real users.** + +If you can test it faster/more reliably at a lower level, do that instead. 
diff --git a/skills/flaky-test-prevention/SKILL.md b/skills/flaky-test-prevention/SKILL.md new file mode 100644 index 0000000..16cdddb --- /dev/null +++ b/skills/flaky-test-prevention/SKILL.md @@ -0,0 +1,493 @@ +--- +name: flaky-test-prevention +description: Use when debugging intermittent test failures, choosing between retries vs fixes, quarantining flaky tests, calculating flakiness rates, or preventing non-deterministic behavior - provides root cause diagnosis, anti-patterns, and systematic debugging +--- + +# Flaky Test Prevention + +## Overview + +**Core principle:** Fix root causes, don't mask symptoms. + +**Rule:** Flaky tests indicate real problems - in test design, application code, or infrastructure. + +## Flakiness Decision Tree + +| Symptom | Root Cause Category | Diagnostic | Fix | +|---------|---------------------|------------|-----| +| Passes alone, fails in suite | Test Interdependence | Run tests in random order | Use test isolation (transactions, unique IDs) | +| Fails randomly ~10% | Timing/Race Condition | Add logging, run 100x | Replace sleeps with explicit waits | +| Fails only in CI, not locally | Environment Difference | Compare CI vs local env | Match environments, use containers | +| Fails at specific times | Time Dependency | Check for date/time usage | Mock system time | +| Fails under load | Resource Contention | Run in parallel locally | Add resource isolation, increase limits | +| Different results each run | Non-Deterministic Code | Check for randomness | Seed random generators, use fixtures | + +**First step:** Identify symptom, trace to root cause category. + +## Anti-Patterns Catalog + +### ❌ Sleepy Assertion +**Symptom:** Using fixed `sleep()` or `wait()` instead of condition-based waits + +**Why bad:** Wastes time on fast runs, still fails on slow runs, brittle + +**Fix:** Explicit waits for conditions + +```python +# ❌ Bad +time.sleep(5) # Hope 5 seconds is enough +assert element.text == "Loaded" + +# ✅ Good +WebDriverWait(driver, 10).until( + lambda d: d.find_element_by_id("status").text == "Loaded" +) +assert element.text == "Loaded" +``` + +--- + +### ❌ Test Interdependence +**Symptom:** Tests pass when run in specific order, fail when shuffled + +**Why bad:** Hidden dependencies, can't run in parallel, breaks test isolation + +**Fix:** Each test creates its own data, no shared state + +```python +# ❌ Bad +def test_create_user(): + user = create_user("test_user") # Shared ID + +def test_update_user(): + update_user("test_user") # Depends on test_create_user + +# ✅ Good +def test_create_user(): + user_id = f"user_{uuid4()}" + user = create_user(user_id) + +def test_update_user(): + user_id = f"user_{uuid4()}" + user = create_user(user_id) # Independent + update_user(user_id) +``` + +--- + +### ❌ Hidden Dependencies +**Symptom:** Tests fail due to external state (network, database, file system) beyond test control + +**Why bad:** Unpredictable failures, environment-specific issues + +**Fix:** Mock external dependencies + +```python +# ❌ Bad +def test_weather_api(): + response = requests.get("https://api.weather.com/...") + assert response.json()["temp"] > 0 # Fails if API is down + +# ✅ Good +@mock.patch('requests.get') +def test_weather_api(mock_get): + mock_get.return_value.json.return_value = {"temp": 75} + response = get_weather("Seattle") + assert response["temp"] == 75 +``` + +--- + +### ❌ Time Bomb +**Symptom:** Tests that depend on current date/time and fail at specific moments (midnight, month boundaries, DST) + +**Why bad:** Fails 
unpredictably based on when tests run + +**Fix:** Mock system time + +```python +# ❌ Bad +def test_expiration(): + created_at = datetime.now() + assert is_expired(created_at) == False # Fails at midnight + +# ✅ Good +@freeze_time("2025-11-15 12:00:00") +def test_expiration(): + created_at = datetime(2025, 11, 15, 12, 0, 0) + assert is_expired(created_at) == False +``` + +--- + +### ❌ Timeout Inflation +**Symptom:** Continuously increasing timeouts to "fix" flaky tests (5s → 10s → 30s) + +**Why bad:** Masks root cause, slows test suite, doesn't guarantee reliability + +**Fix:** Investigate why operation is slow, use explicit waits + +```python +# ❌ Bad +await page.waitFor(30000) # Increased from 5s hoping it helps + +# ✅ Good +await page.waitForSelector('.data-loaded', {timeout: 10000}) +await page.waitForNetworkIdle() +``` + +## Detection Strategies + +### Proactive Identification + +**Run tests multiple times (statistical detection):** + +```bash +# pytest with repeat plugin +pip install pytest-repeat +pytest --count=50 test_flaky.py + +# Track pass rate +# 50/50 = 100% reliable +# 45/50 = 90% flaky (investigate immediately) +# <95% = quarantine +``` + +**CI Integration (automatic tracking):** + +```yaml +# GitHub Actions example +- name: Run tests with flakiness detection + run: | + pytest --count=3 --junit-xml=results.xml + python scripts/calculate_flakiness.py results.xml +``` + +**Flakiness metrics to track:** +- Pass rate per test (target: >99%) +- Mean Time Between Failures (MTBF) +- Failure clustering (same test failing together) + +### Systematic Debugging + +**When a test fails intermittently:** + +1. **Reproduce consistently** - Run 100x to establish failure rate +2. **Isolate** - Run alone, with subset, with full suite (find interdependencies) +3. **Add logging** - Capture state before assertion, screenshot on failure +4. **Bisect** - If fails in suite, binary search which other test causes it +5. **Environment audit** - Compare CI vs local (env vars, resources, timing) + +## Flakiness Metrics Guide + +**Calculating flake rate:** + +```python +# Flakiness formula +flake_rate = (failed_runs / total_runs) * 100 + +# Example +# Test run 100 times: 7 failures +# Flake rate = 7/100 = 7% +``` + +**Thresholds:** + +| Flake Rate | Action | Priority | +|------------|--------|----------| +| 0% (100% pass) | Reliable | Monitor | +| 0.1-1% | Investigate | Low | +| 1-5% | Quarantine + Fix | Medium | +| 5-10% | Quarantine + Fix Urgently | High | +| >10% | Disable immediately | Critical | + +**Target:** All tests should maintain >99% pass rate (< 1% flake rate) + +## Quarantine Workflow + +**Purpose:** Keep CI green while fixing flaky tests systematically + +**Process:** + +1. **Detect** - Test fails >1% of runs +2. **Quarantine** - Mark with `@pytest.mark.quarantine`, exclude from CI +3. **Track** - Create issue with flake rate, failure logs, reproduction steps +4. **Fix** - Assign owner, set SLA (e.g., 2 weeks to fix or delete) +5. **Validate** - Run fixed test 100x, must achieve >99% pass rate +6. **Re-Enable** - Remove quarantine mark, monitor for 1 week + +**Marking quarantined tests:** + +```python +@pytest.mark.quarantine(reason="Flaky due to timing issue #1234") +@pytest.mark.skip("Quarantined") +def test_flaky_feature(): + pass +``` + +**CI configuration:** + +```bash +# Run all tests except quarantined +pytest -m "not quarantine" +``` + +**SLA:** Quarantined tests must be fixed within 2 weeks or deleted. No test stays quarantined indefinitely. 
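+
+One practical detail: `quarantine` is a custom mark, so register it (a minimal `conftest.py` sketch below) or pytest will emit `PytestUnknownMarkWarning` and `--strict-markers` runs will fail:
+
+```python
+# conftest.py - register the custom quarantine marker used above
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers",
+        "quarantine(reason): flaky test excluded from the main CI run pending a fix",
+    )
+```
+
+With the marker registered, `pytest -m "not quarantine"` deselects quarantined tests reliably.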
+ +## Tool Ecosystem Quick Reference + +| Tool | Purpose | When to Use | +|------|---------|-------------| +| **pytest-repeat** | Run test N times | Statistical detection | +| **pytest-xdist** | Parallel execution | Expose race conditions | +| **pytest-rerunfailures** | Auto-retry on failure | Temporary mitigation during fix | +| **pytest-randomly** | Randomize test order | Detect test interdependence | +| **freezegun** | Mock system time | Fix time bombs | +| **pytest-timeout** | Prevent hanging tests | Catch infinite loops | + +**Installation:** + +```bash +pip install pytest-repeat pytest-xdist pytest-rerunfailures pytest-randomly freezegun pytest-timeout +``` + +**Usage examples:** + +```bash +# Detect flakiness (run 50x) +pytest --count=50 test_suite.py + +# Detect interdependence (random order) +pytest --randomly-seed=12345 test_suite.py + +# Expose race conditions (parallel) +pytest -n 4 test_suite.py + +# Temporary mitigation (reruns, not a fix!) +pytest --reruns 2 --reruns-delay 1 test_suite.py +``` + +## Prevention Checklist + +**Use during test authoring to prevent flakiness:** + +- [ ] No fixed `time.sleep()` - use explicit waits for conditions +- [ ] Each test creates its own data (UUID-based IDs) +- [ ] No shared global state between tests +- [ ] External dependencies mocked (APIs, network, databases) +- [ ] Time/date frozen with `@freeze_time` if time-dependent +- [ ] Random values seeded (`random.seed(42)`) +- [ ] Tests pass when run in any order (`pytest --randomly-seed`) +- [ ] Tests pass when run in parallel (`pytest -n 4`) +- [ ] Tests pass 100/100 times (`pytest --count=100`) +- [ ] Teardown cleans up all resources (files, database, cache) + +## Common Fixes Quick Reference + +| Problem | Fix Pattern | Example | +|---------|-------------|---------| +| **Timing issues** | Explicit waits | `WebDriverWait(driver, 10).until(condition)` | +| **Test interdependence** | Unique IDs per test | `user_id = f"test_{uuid4()}"` | +| **External dependencies** | Mock/stub | `@mock.patch('requests.get')` | +| **Time dependency** | Freeze time | `@freeze_time("2025-11-15")` | +| **Random behavior** | Seed randomness | `random.seed(42)` | +| **Shared state** | Test isolation | Transactions, teardown fixtures | +| **Resource contention** | Unique resources | Separate temp dirs, DB namespaces | + +## Your First Flaky Test Fix + +**Systematic approach for first fix:** + +**Step 1: Reproduce (Day 1)** + +```bash +# Run test 100 times, capture failures +pytest --count=100 --verbose test_flaky.py | tee output.log +``` + +**Step 2: Categorize (Day 1)** + +Check output.log: +- Same failure message? → Likely timing/race condition +- Different failures? → Likely test interdependence +- Only fails in CI? → Environment difference + +**Step 3: Fix Based on Category (Day 2)** + +**If timing issue:** + +```python +# Before +time.sleep(2) +assert element.text == "Loaded" + +# After +wait.until(lambda: element.text == "Loaded") +``` + +**If interdependence:** + +```python +# Before +user = User.objects.get(id=1) # Assumes user exists + +# After +user = create_test_user(id=f"test_{uuid4()}") # Creates own data +``` + +**Step 4: Validate (Day 2)** + +```bash +# Must pass 100/100 times +pytest --count=100 test_flaky.py +# Expected: 100 passed +``` + +**Step 5: Monitor (Week 1)** + +Track in CI - test should maintain >99% pass rate for 1 week before considering it fixed. 
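+
+For the CI tracking in Step 5, a rough sketch of the pass-rate check referenced earlier (`calculate_flakiness.py` is illustrative, and the element names assume the standard JUnit XML layout that `pytest --junit-xml` produces):
+
+```python
+# calculate_flakiness.py - rough flake-rate check over a JUnit XML report
+import sys
+import xml.etree.ElementTree as ET
+
+def flake_rate(junit_xml_path):
+    root = ET.parse(junit_xml_path).getroot()
+    total = failed = 0
+    for case in root.iter("testcase"):
+        if case.find("skipped") is not None:
+            continue  # quarantined/skipped runs don't count toward the rate
+        total += 1
+        if case.find("failure") is not None or case.find("error") is not None:
+            failed += 1
+    return (failed / total) * 100 if total else 0.0
+
+if __name__ == "__main__":
+    rate = flake_rate(sys.argv[1])
+    print(f"Flake rate: {rate:.2f}%")
+    sys.exit(1 if rate > 1.0 else 0)  # fail the job above the 1% threshold
+```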
+ +## CI-Only Flakiness (Can't Reproduce Locally) + +**Symptom:** Test fails intermittently in CI but passes 100% locally + +**Root cause:** Environment differences between CI and local (resources, parallelization, timing) + +### Systematic CI Debugging + +**Step 1: Environment Fingerprinting** + +Capture exact environment in both CI and locally: + +```python +# Add to conftest.py +import os, sys, platform, tempfile + +def pytest_configure(config): + print(f"Python: {sys.version}") + print(f"Platform: {platform.platform()}") + print(f"CPU count: {os.cpu_count()}") + print(f"TZ: {os.environ.get('TZ', 'not set')}") + print(f"Temp dir: {tempfile.gettempdir()}") + print(f"Parallel: {os.environ.get('PYTEST_XDIST_WORKER', 'not parallel')}") +``` + +Run in both environments, compare all outputs. + +**Step 2: Increase CI Observation Window** + +For low-probability failures (<5%), run more iterations: + +```yaml +# GitHub Actions example +- name: Run test 200x to catch 1% flake + run: pytest --count=200 --verbose --log-cli-level=DEBUG test.py + +- name: Upload failure artifacts + if: failure() + uses: actions/upload-artifact@v3 + with: + name: failure-logs + path: | + *.log + screenshots/ +``` + +**Step 3: Check CI-Specific Factors** + +| Factor | Diagnostic | Fix | +|--------|------------|-----| +| **Parallelization** | Run `pytest -n 4` locally | Add test isolation (unique IDs, transactions) | +| **Resource limits** | Compare CI RAM/CPU to local | Mock expensive operations, add retries | +| **Cold starts** | First run vs warm runs | Check caching assumptions | +| **Disk I/O speed** | CI may use slower disks | Mock file operations | +| **Network latency** | CI network may be slower/different | Mock external calls | + +**Step 4: Replicate CI Environment Locally** + +Use exact CI container: + +```bash +# GitHub Actions uses Ubuntu 22.04 +docker run -it ubuntu:22.04 bash + +# Install dependencies +apt-get update && apt-get install python3.11 + +# Run test in container +pytest --count=500 test.py +``` + +**Step 5: Enable CI Debug Mode** + +```yaml +# GitHub Actions - Interactive debugging +- name: Setup tmate session (on failure) + if: failure() + uses: mxschmitt/action-tmate@v3 +``` + +### Quick CI Debugging Checklist + +When test fails only in CI: + +- [ ] Capture environment fingerprint in both CI and local +- [ ] Run test with parallelization locally (`pytest -n auto`) +- [ ] Check for resource contention (CPU, memory, disk) +- [ ] Compare timezone settings (`TZ` env var) +- [ ] Upload CI artifacts (logs, screenshots) on failure +- [ ] Replicate CI environment with Docker +- [ ] Check for cold start issues (first vs subsequent runs) + +## Common Mistakes + +### ❌ Using Retries as Permanent Solution +**Fix:** Retries (@pytest.mark.flaky or --reruns) are temporary mitigation during investigation, not fixes + +--- + +### ❌ No Flakiness Tracking +**Fix:** Track pass rates in CI, set up alerts for tests dropping below 99% + +--- + +### ❌ Fixing Flaky Tests by Making Them Slower +**Fix:** Diagnose root cause - don't just add more wait time + +--- + +### ❌ Ignoring Flaky Tests +**Fix:** Quarantine workflow - either fix or delete, never ignore indefinitely + +## Quick Reference + +**Flakiness Thresholds:** +- <1% flake rate: Monitor +- 1-5%: Quarantine + fix (medium priority) +- >5%: Disable + fix urgently (high priority) + +**Root Cause Categories:** +1. Timing/race conditions → Explicit waits +2. Test interdependence → Unique IDs, test isolation +3. External dependencies → Mocking +4. 
Time bombs → Freeze time +5. Resource contention → Unique resources + +**Detection Tools:** +- pytest-repeat (statistical detection) +- pytest-randomly (interdependence) +- pytest-xdist (race conditions) + +**Quarantine Process:** +1. Detect (>1% flake rate) +2. Quarantine (mark, exclude from CI) +3. Track (create issue) +4. Fix (assign owner, 2-week SLA) +5. Validate (100/100 passes) +6. Re-enable (monitor 1 week) + +## Bottom Line + +**Flaky tests are fixable - find the root cause, don't mask with retries.** + +Use detection tools to find flaky tests early. Categorize by symptom, diagnose root cause, apply pattern-based fix. Quarantine if needed, but always with SLA to fix or delete. diff --git a/skills/fuzz-testing/SKILL.md b/skills/fuzz-testing/SKILL.md new file mode 100644 index 0000000..61147f8 --- /dev/null +++ b/skills/fuzz-testing/SKILL.md @@ -0,0 +1,445 @@ +--- +name: fuzz-testing +description: Use when testing input validation, discovering edge cases, finding security vulnerabilities, testing parsers/APIs with random inputs, or integrating fuzzing tools (AFL, libFuzzer, Atheris) - provides fuzzing strategies, tool selection, and crash triage workflows +--- + +# Fuzz Testing + +## Overview + +**Core principle:** Fuzz testing feeds random/malformed inputs to find crashes, hangs, and security vulnerabilities that manual tests miss. + +**Rule:** Fuzzing finds bugs you didn't know to test for. Use it for security-critical code (parsers, validators, APIs). + +## Fuzz Testing vs Other Testing + +| Test Type | Input | Goal | +|-----------|-------|------| +| **Unit Testing** | Known valid/invalid inputs | Verify expected behavior | +| **Property-Based Testing** | Generated valid inputs | Verify invariants hold | +| **Fuzz Testing** | Random/malformed inputs | Find crashes, hangs, memory issues | + +**Fuzzing finds:** Buffer overflows, null pointer dereferences, infinite loops, unhandled exceptions + +**Fuzzing does NOT find:** Logic bugs, performance issues + +--- + +## When to Use Fuzz Testing + +**Good candidates:** +- Input parsers (JSON, XML, CSV, binary formats) +- Network protocol handlers +- Image/video codecs +- Cryptographic functions +- User input validators (file uploads, form data) +- APIs accepting untrusted data + +**Poor candidates:** +- Business logic (use property-based testing) +- UI interactions (use E2E tests) +- Database queries (use integration tests) + +--- + +## Tool Selection + +| Tool | Language | Type | When to Use | +|------|----------|------|-------------| +| **Atheris** | Python | Coverage-guided | Python applications, libraries | +| **AFL (American Fuzzy Lop)** | C/C++ | Coverage-guided | Native code, high performance | +| **libFuzzer** | C/C++/Rust | Coverage-guided | Integrated with LLVM/Clang | +| **Jazzer** | Java/JVM | Coverage-guided | Java applications | +| **go-fuzz** | Go | Coverage-guided | Go applications | + +**Coverage-guided:** Tracks which code paths are executed, generates inputs to explore new paths + +--- + +## Basic Fuzzing Example (Python + Atheris) + +### Installation + +```bash +pip install atheris +``` + +--- + +### Simple Fuzz Test + +```python +import atheris +import sys + +def parse_email(data): + """Function to fuzz - finds bugs we didn't know about.""" + if "@" not in data: + raise ValueError("Invalid email") + + local, domain = data.split("@", 1) + + if "." not in domain: + raise ValueError("Invalid domain") + + # BUG: Crashes on multiple @ symbols! 
+ # "user@@example.com" → crashes with ValueError + + return (local, domain) + +@atheris.instrument_func +def TestOneInput(data): + """Fuzz harness - called repeatedly with random inputs.""" + try: + parse_email(data.decode('utf-8', errors='ignore')) + except (ValueError, UnicodeDecodeError): + # Expected exceptions - not crashes + pass + # Any other exception = crash found! + +atheris.Setup(sys.argv, TestOneInput) +atheris.Fuzz() +``` + +**Run:** +```bash +python fuzz_email.py +``` + +**Output:** +``` +INFO: Seed: 1234567890 +INFO: -max_len is not provided; libFuzzer will not generate inputs larger than 4096 bytes +#1: NEW coverage: 10 exec/s: 1000 +#100: NEW coverage: 15 exec/s: 5000 +CRASH: input was 'user@@example.com' +``` + +--- + +## Advanced Fuzzing Patterns + +### Structured Fuzzing (JSON) + +**Problem:** Random bytes rarely form valid JSON + +```python +import atheris +import json + +@atheris.instrument_func +def TestOneInput(data): + try: + # Parse as JSON + obj = json.loads(data.decode('utf-8', errors='ignore')) + + # Fuzz your JSON handler + process_user_data(obj) + except (json.JSONDecodeError, ValueError, KeyError): + pass # Expected for invalid JSON + +def process_user_data(data): + """Crashes on: {"name": "", "age": -1}""" + if len(data["name"]) == 0: + raise ValueError("Name cannot be empty") + if data["age"] < 0: + raise ValueError("Age cannot be negative") +``` + +--- + +### Fuzzing with Corpus (Seed Inputs) + +**Corpus:** Collection of valid inputs to start from + +```python +import atheris +import sys +import os + +# Seed corpus: Valid examples +CORPUS_DIR = "./corpus" +os.makedirs(CORPUS_DIR, exist_ok=True) + +# Create seed files +with open(f"{CORPUS_DIR}/valid1.txt", "wb") as f: + f.write(b"user@example.com") +with open(f"{CORPUS_DIR}/valid2.txt", "wb") as f: + f.write(b"alice+tag@subdomain.example.org") + +@atheris.instrument_func +def TestOneInput(data): + try: + parse_email(data.decode('utf-8')) + except ValueError: + pass + +atheris.Setup(sys.argv, TestOneInput, corpus_dir=CORPUS_DIR) +atheris.Fuzz() +``` + +**Benefits:** Faster convergence to interesting inputs + +--- + +## Crash Triage Workflow + +### 1. Reproduce Crash + +```bash +# Atheris outputs crash input +CRASH: input was b'user@@example.com' + +# Save to file +echo "user@@example.com" > crash.txt +``` + +--- + +### 2. Minimize Input + +**Find smallest input that triggers crash:** + +```python +# Original: "user@@example.com" (19 bytes) +# Minimized: "@@" (2 bytes) + +# Atheris does this automatically +python fuzz_email.py crash.txt +``` + +--- + +### 3. Root Cause Analysis + +```python +def parse_email(data): + # Crash: data = "@@" + local, domain = data.split("@", 1) + # local = "", domain = "@" + + if "." not in domain: + # domain = "@" → no "." → raises ValueError + raise ValueError("Invalid domain") + + # FIX: Validate before splitting + if data.count("@") != 1: + raise ValueError("Email must have exactly one @") +``` + +--- + +### 4. 
Write Regression Test + +```python +def test_email_multiple_at_symbols(): + """Regression test for fuzz-found bug.""" + with pytest.raises(ValueError, match="exactly one @"): + parse_email("user@@example.com") +``` + +--- + +## Integration with CI/CD + +### Continuous Fuzzing (GitHub Actions) + +```yaml +# .github/workflows/fuzz.yml +name: Fuzz Testing + +on: + schedule: + - cron: '0 2 * * *' # Nightly at 2 AM + workflow_dispatch: + +jobs: + fuzz: + runs-on: ubuntu-latest + timeout-minutes: 60 # Run for 1 hour + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install atheris + + - name: Run fuzzing + run: | + timeout 3600 python fuzz_email.py || true + + - name: Upload crashes + if: failure() + uses: actions/upload-artifact@v3 + with: + name: fuzz-crashes + path: crash-* +``` + +**Why nightly:** Fuzzing is CPU-intensive, not suitable for every PR + +--- + +## AFL (C/C++) Example + +### Installation + +```bash +# Ubuntu/Debian +sudo apt-get install afl++ + +# macOS +brew install afl++ +``` + +--- + +### Fuzz Target + +```c +// fuzz_target.c +#include +#include +#include + +void parse_command(const char *input) { + char buffer[64]; + + // BUG: Buffer overflow if input > 64 bytes! + strcpy(buffer, input); + + if (strcmp(buffer, "exit") == 0) { + exit(0); + } +} + +int main(int argc, char **argv) { + if (argc < 2) return 1; + + FILE *f = fopen(argv[1], "rb"); + if (!f) return 1; + + char buffer[1024]; + size_t len = fread(buffer, 1, sizeof(buffer), f); + fclose(f); + + buffer[len] = '\0'; + parse_command(buffer); + + return 0; +} +``` + +--- + +### Compile and Run + +```bash +# Compile with AFL instrumentation +afl-gcc fuzz_target.c -o fuzz_target + +# Create corpus directory +mkdir -p corpus +echo "exit" > corpus/input1.txt + +# Run fuzzer +afl-fuzz -i corpus -o findings -- ./fuzz_target @@ +``` + +**Output:** +``` +american fuzzy lop 4.00a + path : findings/queue + crashes : 1 + hangs : 0 + execs done : 1000000 +``` + +**Crashes found in:** `findings/crashes/` + +--- + +## Anti-Patterns Catalog + +### ❌ Fuzzing Without Sanitizers + +**Symptom:** Memory bugs don't crash, just corrupt silently + +**Fix:** Compile with AddressSanitizer (ASan) + +```bash +# C/C++: Compile with ASan +afl-gcc -fsanitize=address fuzz_target.c -o fuzz_target + +# Python: Use PyASan (if available) +``` + +**What ASan catches:** Buffer overflows, use-after-free, memory leaks + +--- + +### ❌ Ignoring Hangs + +**Symptom:** Fuzzer reports hangs, not investigated + +**What hangs mean:** Infinite loops, algorithmic complexity attacks + +**Fix:** Investigate and add timeout checks + +```python +import signal + +def timeout_handler(signum, frame): + raise TimeoutError("Operation timed out") + +@atheris.instrument_func +def TestOneInput(data): + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(1) # 1-second timeout + + try: + parse_data(data.decode('utf-8')) + except (ValueError, TimeoutError): + pass + finally: + signal.alarm(0) +``` + +--- + +### ❌ No Regression Tests + +**Symptom:** Same bugs found repeatedly + +**Fix:** Add regression test for every crash + +```python +# After fuzzing finds crash on input "@@" +def test_regression_double_at(): + with pytest.raises(ValueError): + parse_email("@@") +``` + +--- + +## Bottom Line + +**Fuzz testing finds crashes and security vulnerabilities by feeding random/malformed inputs. 
Use it for security-critical code (parsers, validators, APIs).** + +**Setup:** +- Use Atheris (Python), AFL (C/C++), or language-specific fuzzer +- Start with corpus (valid examples) +- Run nightly in CI (1-24 hours) + +**Workflow:** +1. Fuzzer finds crash +2. Minimize crashing input +3. Root cause analysis +4. Fix bug +5. Add regression test + +**If your code accepts untrusted input (files, network data, user input), you should be fuzzing it. Fuzzing finds bugs that manual testing misses.** diff --git a/skills/integration-testing-patterns/SKILL.md b/skills/integration-testing-patterns/SKILL.md new file mode 100644 index 0000000..a35ad21 --- /dev/null +++ b/skills/integration-testing-patterns/SKILL.md @@ -0,0 +1,478 @@ +--- +name: integration-testing-patterns +description: Use when testing component integration, database testing, external service integration, test containers, testing message queues, microservices testing, or designing integration test suites - provides boundary testing patterns and anti-patterns between unit and E2E tests +--- + +# Integration Testing Patterns + +## Overview + +**Core principle:** Integration tests verify that multiple components work together correctly, testing at system boundaries. + +**Rule:** Integration tests sit between unit tests (isolated) and E2E tests (full system). Test the integration points, not full user workflows. + +## Integration Testing vs Unit vs E2E + +| Aspect | Unit Test | Integration Test | E2E Test | +|--------|-----------|------------------|----------| +| **Scope** | Single function/class | 2-3 components + boundaries | Full system | +| **Speed** | Fastest (<1ms) | Medium (10-500ms) | Slowest (1-10s) | +| **Dependencies** | All mocked | Real DB/services | Everything real | +| **When** | Every commit | Every PR | Before release | +| **Coverage** | Business logic | Integration points | Critical workflows | + +**Test Pyramid:** +- **70% Unit:** Pure logic, no I/O +- **20% Integration:** Database, APIs, message queues +- **10% E2E:** Browser tests, full workflows + +--- + +## What to Integration Test + +### 1. Database Integration + +**Test: Repository/DAO layer with real database** + +```python +import pytest +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +@pytest.fixture(scope="function") +def db_session(): + """Each test gets fresh DB with rollback.""" + engine = create_engine("postgresql://localhost/test_db") + Session = sessionmaker(bind=engine) + session = Session() + + yield session + + session.rollback() # Undo all changes + session.close() + +def test_user_repository_create(db_session): + """Integration test: Repository + Database.""" + repo = UserRepository(db_session) + + user = repo.create(email="alice@example.com", name="Alice") + + assert user.id is not None + assert repo.get_by_email("alice@example.com").id == user.id +``` + +**Why integration test:** +- Verifies SQL queries work +- Catches FK constraint violations +- Tests database-specific features (JSON columns, full-text search) + +**NOT unit test because:** Uses real database +**NOT E2E test because:** Doesn't test full user workflow + +--- + +### 2. 
External API Integration + +**Test: Service layer calling third-party API** + +```python +import pytest +import responses + +@responses.activate +def test_payment_service_integration(): + """Integration test: PaymentService + Stripe API (mocked).""" + # Mock Stripe API response + responses.add( + responses.POST, + "https://api.stripe.com/v1/charges", + json={"id": "ch_123", "status": "succeeded"}, + status=200 + ) + + service = PaymentService(api_key="test_key") + result = service.charge(amount=1000, token="tok_visa") + + assert result.status == "succeeded" + assert result.charge_id == "ch_123" +``` + +**Why integration test:** +- Tests HTTP client configuration +- Validates request/response parsing +- Verifies error handling + +**When to use real API:** +- Separate integration test suite (nightly) +- Contract tests (see contract-testing skill) + +--- + +### 3. Message Queue Integration + +**Test: Producer/Consumer with real queue** + +```python +import pytest +from kombu import Connection + +@pytest.fixture +def rabbitmq_connection(): + """Real RabbitMQ connection for integration tests.""" + conn = Connection("amqp://localhost") + yield conn + conn.release() + +def test_order_queue_integration(rabbitmq_connection): + """Integration test: OrderService + RabbitMQ.""" + publisher = OrderPublisher(rabbitmq_connection) + consumer = OrderConsumer(rabbitmq_connection) + + # Publish message + publisher.publish({"order_id": 123, "status": "pending"}) + + # Consume message + message = consumer.get(timeout=5) + + assert message["order_id"] == 123 + assert message["status"] == "pending" +``` + +**Why integration test:** +- Verifies serialization/deserialization +- Tests queue configuration (exchanges, routing keys) +- Validates message durability + +--- + +### 4. 
Microservices Integration + +**Test: Service A → Service B communication** + +```python +import pytest + +@pytest.fixture +def mock_user_service(): + """Mock User Service for integration tests.""" + with responses.RequestsMock() as rsps: + rsps.add( + responses.GET, + "http://user-service/users/123", + json={"id": 123, "name": "Alice"}, + status=200 + ) + yield rsps + +def test_order_service_integration(mock_user_service): + """Integration test: OrderService + UserService.""" + order_service = OrderService(user_service_url="http://user-service") + + order = order_service.create_order(user_id=123, items=[...]) + + assert order.user_name == "Alice" +``` + +**For real service integration:** Use contract tests (see contract-testing skill) + +--- + +## Test Containers Pattern + +**Use Docker containers for integration tests.** + +```python +import pytest +from testcontainers.postgres import PostgresContainer + +@pytest.fixture(scope="module") +def postgres_container(): + """Start PostgreSQL container for tests.""" + with PostgresContainer("postgres:15") as postgres: + yield postgres + +@pytest.fixture +def db_connection(postgres_container): + """Database connection from test container.""" + engine = create_engine(postgres_container.get_connection_url()) + return engine.connect() + +def test_user_repository(db_connection): + repo = UserRepository(db_connection) + user = repo.create(email="alice@example.com") + assert user.id is not None +``` + +**Benefits:** +- Clean database per test run +- Matches production environment +- No manual setup required + +**When NOT to use:** +- Unit tests (too slow) +- CI without Docker support + +--- + +## Boundary Testing Strategy + +**Test at system boundaries, not internal implementation.** + +**Boundaries to test:** +1. **Application → Database** (SQL queries, ORMs) +2. **Application → External API** (HTTP clients, SDKs) +3. **Application → File System** (File I/O, uploads) +4. **Application → Message Queue** (Producers/consumers) +5. **Service A → Service B** (Microservice calls) + +**Example: Boundary test for file upload** + +```python +def test_file_upload_integration(tmp_path): + """Integration test: FileService + File System.""" + service = FileService(storage_path=str(tmp_path)) + + # Upload file + file_id = service.upload(filename="test.txt", content=b"Hello") + + # Verify file exists on disk + file_path = tmp_path / file_id / "test.txt" + assert file_path.exists() + assert file_path.read_bytes() == b"Hello" +``` + +--- + +## Anti-Patterns Catalog + +### ❌ Testing Internal Implementation + +**Symptom:** Integration test verifies internal method calls + +```python +# ❌ BAD: Testing implementation, not integration +def test_order_service(): + with patch('order_service._calculate_tax') as mock_tax: + service.create_order(...) + assert mock_tax.called +``` + +**Why bad:** Not testing integration point, just internal logic + +**Fix:** Test actual boundary (database, API, etc.) + +```python +# ✅ GOOD: Test database integration +def test_order_service(db_session): + service = OrderService(db_session) + order = service.create_order(...) 
+ + # Verify data was persisted + saved_order = db_session.query(Order).get(order.id) + assert saved_order.total == order.total +``` + +--- + +### ❌ Full System Tests Disguised as Integration Tests + +**Symptom:** "Integration test" requires entire system running + +```python +# ❌ BAD: This is an E2E test, not integration test +def test_checkout_flow(): + # Requires: Web server, database, Redis, Stripe, email service + browser.goto("http://localhost:8000/checkout") + browser.fill("#card", "4242424242424242") + browser.click("#submit") + assert "Success" in browser.content() +``` + +**Why bad:** Slow, fragile, hard to debug + +**Fix:** Test individual integration points + +```python +# ✅ GOOD: Integration test for payment component only +def test_payment_integration(mock_stripe): + service = PaymentService() + result = service.charge(amount=1000, token="tok_visa") + assert result.status == "succeeded" +``` + +--- + +### ❌ Shared Test Data Across Integration Tests + +**Symptom:** Tests fail when run in different orders + +```python +# ❌ BAD: Relies on shared database state +def test_get_user(): + user = db.query(User).filter_by(email="test@example.com").first() + assert user.name == "Test User" + +def test_update_user(): + user = db.query(User).filter_by(email="test@example.com").first() + user.name = "Updated" + db.commit() +``` + +**Fix:** Each test creates its own data (see test-isolation-fundamentals skill) + +```python +# ✅ GOOD: Isolated test data +def test_get_user(db_session): + user = create_test_user(db_session, email="test@example.com") + retrieved = db_session.query(User).get(user.id) + assert retrieved.name == user.name +``` + +--- + +### ❌ Testing Too Many Layers + +**Symptom:** Integration test includes business logic validation + +```python +# ❌ BAD: Testing logic + integration in same test +def test_order_calculation(db_session): + order = OrderService(db_session).create_order(...) + + # Integration: DB save + assert order.id is not None + + # Logic: Tax calculation (should be unit test!) 
+ assert order.tax == order.subtotal * 0.08 +``` + +**Fix:** Separate concerns + +```python +# ✅ GOOD: Unit test for logic +def test_order_tax_calculation(): + order = Order(subtotal=100) + assert order.calculate_tax() == 8.0 + +# ✅ GOOD: Integration test for persistence +def test_order_persistence(db_session): + repo = OrderRepository(db_session) + order = repo.create(subtotal=100, tax=8.0) + assert repo.get(order.id).tax == 8.0 +``` + +--- + +## Integration Test Environments + +### Local Development + +```yaml +# docker-compose.test.yml +version: '3.8' +services: + postgres: + image: postgres:15 + environment: + POSTGRES_DB: test_db + POSTGRES_USER: test + POSTGRES_PASSWORD: test + + redis: + image: redis:7 + + rabbitmq: + image: rabbitmq:3-management +``` + +**Run tests:** +```bash +docker-compose -f docker-compose.test.yml up -d +pytest tests/integration/ +docker-compose -f docker-compose.test.yml down +``` + +--- + +### CI/CD + +```yaml +# .github/workflows/integration-tests.yml +name: Integration Tests + +on: [pull_request] + +jobs: + test: + runs-on: ubuntu-latest + services: + postgres: + image: postgres:15 + env: + POSTGRES_PASSWORD: test + options: >- + --health-cmd pg_isready + --health-interval 10s + + steps: + - uses: actions/checkout@v3 + - name: Run integration tests + run: pytest tests/integration/ + env: + DATABASE_URL: postgresql://postgres:test@localhost/test +``` + +--- + +## Performance Considerations + +**Integration tests are slower than unit tests.** + +**Optimization strategies:** + +1. **Use transactions:** Rollback instead of truncating tables (100x faster) +2. **Parallelize:** Run integration tests in parallel (`pytest -n 4`) +3. **Minimize I/O:** Only test integration points, not full workflows +4. **Cache containers:** Reuse test containers across tests (scope="module") + +**Example: Fast integration tests** + +```python +# Slow: 5 seconds per test +@pytest.fixture +def db(): + engine = create_engine(...) + Base.metadata.create_all(engine) # Recreate schema every test + yield engine + Base.metadata.drop_all(engine) + +# Fast: 10ms per test +@pytest.fixture(scope="module") +def db_engine(): + engine = create_engine(...) + Base.metadata.create_all(engine) # Once per module + yield engine + Base.metadata.drop_all(engine) + +@pytest.fixture +def db_session(db_engine): + connection = db_engine.connect() + transaction = connection.begin() + session = Session(bind=connection) + yield session + transaction.rollback() # Fast cleanup + connection.close() +``` + +--- + +## Bottom Line + +**Integration tests verify that components work together at system boundaries.** + +- Test at boundaries (DB, API, queue), not internal logic +- Use real dependencies (DB, queue) or realistic mocks (external APIs) +- Keep tests isolated (transactions, test containers, unique data) +- Run on every PR (they're slower than unit tests but faster than E2E) + +**If your "integration test" requires the entire system running, it's an E2E test. 
Test integration points individually.** diff --git a/skills/load-testing-patterns/SKILL.md b/skills/load-testing-patterns/SKILL.md new file mode 100644 index 0000000..3063596 --- /dev/null +++ b/skills/load-testing-patterns/SKILL.md @@ -0,0 +1,843 @@ +--- +name: load-testing-patterns +description: Use when designing load tests, choosing tools (k6, JMeter, Gatling), calculating concurrent users from DAU, interpreting latency degradation, identifying bottlenecks, or running spike/soak/stress tests - provides test patterns, anti-patterns, and load calculation frameworks +--- + +# Load Testing Patterns + +## Overview + +**Core principle:** Test realistic load patterns, not constant artificial load. Find limits before users do. + +**Rule:** Load testing reveals system behavior under stress. Without it, production is your load test. + +## Tool Selection Decision Tree + +| Your Need | Protocol | Team Skills | Use | Why | +|-----------|----------|-------------|-----|-----| +| Modern API testing | HTTP/REST/GraphQL | JavaScript | **k6** | Best dev experience, CI/CD friendly | +| Enterprise/complex protocols | HTTP/SOAP/JMS/JDBC | Java/GUI comfort | **JMeter** | Mature, comprehensive protocols | +| Python team | HTTP/WebSocket | Python | **Locust** | Pythonic, easy scripting | +| High performance/complex scenarios | HTTP/gRPC | Scala/Java | **Gatling** | Best reports, high throughput | +| Cloud-native at scale | HTTP/WebSocket | Any (SaaS) | **Artillery, Flood.io** | Managed, distributed | + +**First choice:** k6 (modern, scriptable, excellent CI/CD integration) + +**Why not ApacheBench/wrk:** Too simple for realistic scenarios, no complex user flows + +## Test Pattern Library + +| Pattern | Purpose | Duration | When to Use | +|---------|---------|----------|-------------| +| **Smoke Test** | Verify test works | 1-2 min | Before every test run | +| **Load Test** | Normal/peak capacity | 10-30 min | Regular capacity validation | +| **Stress Test** | Find breaking point | 20-60 min | Understand limits | +| **Spike Test** | Sudden traffic surge | 5-15 min | Black Friday, launch events | +| **Soak Test** | Memory leaks, stability | 1-8 hours | Pre-release validation | +| **Capacity Test** | Max sustainable load | Variable | Capacity planning | + +### Smoke Test + +**Goal:** Verify test script works with minimal load + +```javascript +// k6 smoke test +export let options = { + vus: 1, + duration: '1m', + thresholds: { + http_req_duration: ['p(95)<500'], // 95% < 500ms + http_req_failed: ['rate<0.01'], // <1% errors + } +} +``` + +**Purpose:** Catch test script bugs before running expensive full tests + +### Load Test (Ramp-Up Pattern) + +**Goal:** Test normal and peak expected load + +```javascript +// k6 load test with ramp-up +export let options = { + stages: [ + { duration: '5m', target: 100 }, // Ramp to normal load + { duration: '10m', target: 100 }, // Hold at normal + { duration: '5m', target: 200 }, // Ramp to peak + { duration: '10m', target: 200 }, // Hold at peak + { duration: '5m', target: 0 }, // Ramp down + ], + thresholds: { + http_req_duration: ['p(95)<500', 'p(99)<1000'], + http_req_failed: ['rate<0.05'], + } +} +``` + +**Pattern:** Gradual ramp-up → sustain → ramp down. Never start at peak. 
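+
+The stages above only shape the traffic; they still need a scenario body to execute. A minimal one to pair with them (endpoint and think-time values are illustrative):
+
+```javascript
+import http from 'k6/http'
+import { check, sleep } from 'k6'
+
+export default function() {
+  const res = http.get('https://api.example.com/products')
+
+  check(res, {
+    'status is 200': (r) => r.status === 200,
+    'response < 500ms': (r) => r.timings.duration < 500,
+  })
+
+  sleep(Math.random() * 2 + 1) // 1-3 seconds of think time between page views
+}
+```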
+ +### Stress Test (Breaking Point) + +**Goal:** Find system limits + +```javascript +// k6 stress test +export let options = { + stages: [ + { duration: '5m', target: 100 }, // Normal + { duration: '5m', target: 300 }, // Above peak + { duration: '5m', target: 600 }, // 2x peak + { duration: '5m', target: 900 }, // 3x peak (expect failure) + { duration: '10m', target: 0 }, // Recovery + ] +} +``` + +**Success:** Identify at what load system degrades (not necessarily breaking completely) + +### Spike Test (Sudden Surge) + +**Goal:** Test sudden traffic bursts (viral post, email campaign) + +```javascript +// k6 spike test +export let options = { + stages: [ + { duration: '1m', target: 100 }, // Normal + { duration: '30s', target: 1000 }, // SPIKE to 10x + { duration: '5m', target: 1000 }, // Hold spike + { duration: '2m', target: 100 }, // Back to normal + { duration: '5m', target: 100 }, // Recovery check + ] +} +``` + +**Tests:** Auto-scaling, circuit breakers, rate limiting + +### Soak Test (Endurance) + +**Goal:** Find memory leaks, resource exhaustion over time + +```javascript +// k6 soak test +export let options = { + stages: [ + { duration: '5m', target: 100 }, // Ramp + { duration: '4h', target: 100 }, // Soak (sustained load) + { duration: '5m', target: 0 }, // Ramp down + ] +} +``` + +**Monitor:** Memory growth, connection leaks, disk space, file descriptors + +**Duration:** Minimum 1 hour, ideally 4-8 hours + +## Load Calculation Framework + +**Problem:** Convert "10,000 daily active users" to concurrent load + +### Step 1: DAU to Concurrent Users + +``` +Concurrent Users = DAU × Concurrency Ratio × Peak Multiplier + +Concurrency Ratios by App Type: +- Web apps: 5-10% +- Social media: 10-20% +- Business apps: 20-30% (work hours) +- Gaming: 15-25% + +Peak Multiplier: 1.5-2x for safety margin +``` + +**Example:** +``` +DAU = 10,000 +Concurrency = 10% (web app) +Peak Multiplier = 1.5 + +Concurrent Users = 10,000 × 0.10 × 1.5 = 1,500 concurrent users +``` + +### Step 2: Concurrent Users to Requests/Second + +``` +RPS = (Concurrent Users × Requests per Session) / (Session Duration × Think Time Ratio) + +Think Time Ratio: +- Active browsing: 0.3-0.5 (30-50% time clicking/typing) +- Reading-heavy: 0.1-0.2 (10-20% active) +- API clients: 0.8-1.0 (80-100% active) +``` + +**Example:** +``` +Concurrent Users = 1,500 +Requests per Session = 20 +Session Duration = 10 minutes = 600 seconds +Think Time Ratio = 0.3 (web browsing) + +RPS = (1,500 × 20) / (600 × 0.3) = 30,000 / 180 = 167 RPS +``` + +### Step 3: Model Realistic Patterns + +Don't use constant load. 
Use realistic traffic patterns: + +```javascript +// Realistic daily pattern +export let options = { + stages: [ + // Morning ramp + { duration: '2h', target: 500 }, // 08:00-10:00 + { duration: '2h', target: 1000 }, // 10:00-12:00 (peak) + // Lunch dip + { duration: '1h', target: 600 }, // 12:00-13:00 + // Afternoon peak + { duration: '2h', target: 1200 }, // 13:00-15:00 (peak) + { duration: '2h', target: 800 }, // 15:00-17:00 + // Evening drop + { duration: '2h', target: 300 }, // 17:00-19:00 + ] +} +``` + +## Anti-Patterns Catalog + +### ❌ Coordinated Omission +**Symptom:** Fixed rate load generation ignores slow responses, underestimating latency + +**Why bad:** Hides real latency impact when system slows down + +**Fix:** Use arrival rate (requests/sec) not iteration rate + +```javascript +// ❌ Bad - coordinated omission +export default function() { + http.get('https://api.example.com') + sleep(1) // Wait 1s between requests +} + +// ✅ Good - arrival rate pacing +export let options = { + scenarios: { + constant_arrival_rate: { + executor: 'constant-arrival-rate', + rate: 100, // 100 RPS regardless of response time + timeUnit: '1s', + duration: '10m', + preAllocatedVUs: 50, + maxVUs: 200, + } + } +} +``` + +--- + +### ❌ Cold Start Testing +**Symptom:** Running load test immediately after deployment without warm-up + +**Why bad:** JIT compilation, cache warming, connection pooling haven't stabilized + +**Fix:** Warm-up phase before measurement + +```javascript +// ✅ Good - warm-up phase +export let options = { + stages: [ + { duration: '2m', target: 50 }, // Warm-up (not measured) + { duration: '10m', target: 100 }, // Actual test + ] +} +``` + +--- + +### ❌ Unrealistic Test Data +**Symptom:** Using same user ID, same query parameters for all virtual users + +**Why bad:** Caches give unrealistic performance, doesn't test real database load + +**Fix:** Parameterized, realistic data + +```javascript +// ❌ Bad - same data +http.get('https://api.example.com/users/123') + +// ✅ Good - parameterized data +import { SharedArray } from 'k6/data' +import papaparse from 'https://jslib.k6.io/papaparse/5.1.1/index.js' + +const csvData = new SharedArray('users', function () { + return papaparse.parse(open('./users.csv'), { header: true }).data +}) + +export default function() { + const user = csvData[__VU % csvData.length] + http.get(`https://api.example.com/users/${user.id}`) +} +``` + +--- + +### ❌ Constant Load Pattern +**Symptom:** Running with constant VUs instead of realistic traffic pattern + +**Why bad:** Real traffic has peaks, valleys, not flat line + +**Fix:** Use realistic daily/hourly patterns + +--- + +### ❌ Ignoring Think Time +**Symptom:** No delays between requests, hammering API as fast as possible + +**Why bad:** Unrealistic user behavior, overestimates load + +**Fix:** Add realistic think time based on user behavior + +```javascript +// ✅ Good - realistic think time +import { sleep } from 'k6' + +export default function() { + http.get('https://api.example.com/products') + sleep(Math.random() * 3 + 2) // 2-5 seconds browsing + + http.post('https://api.example.com/cart', {...}) + sleep(Math.random() * 5 + 5) // 5-10 seconds deciding + + http.post('https://api.example.com/checkout', {...}) +} +``` + +## Result Interpretation Guide + +### Latency Degradation Patterns + +| Pattern | Cause | What to Check | +|---------|-------|---------------| +| **Linear growth** (2x users → 2x latency) | CPU-bound | Thread pool, CPU usage | +| **Exponential growth** (2x users → 10x latency) | Resource 
saturation | Connection pools, locks, queues | +| **Sudden cliff** (works until X, then fails) | Hard limit hit | Max connections, memory, file descriptors | +| **Gradual degradation** (slow increase over time) | Memory leak, cache pollution | Memory trends, GC activity | + +### Bottleneck Classification + +**Symptom: p95 latency 10x at 2x load** +→ **Resource saturation** (database connection pool, thread pool, queue) + +**Symptom: Errors increase with load** +→ **Hard limit** (connection limit, rate limiting, timeout) + +**Symptom: Latency grows over time at constant load** +→ **Memory leak** or **cache pollution** + +**Symptom: High variance (p50 good, p99 terrible)** +→ **GC pauses**, **lock contention**, or **slow queries** + +### What to Monitor + +| Layer | Metrics to Track | +|-------|------------------| +| **Application** | Request rate, error rate, p50/p95/p99 latency, active requests | +| **Runtime** | GC pauses (JVM, .NET), thread pool usage, heap/memory | +| **Database** | Connection pool usage, query latency, lock waits, slow queries | +| **Infrastructure** | CPU %, memory %, disk I/O, network throughput | +| **External** | Third-party API latency, rate limit hits | + +### Capacity Planning Formula + +``` +Safe Capacity = (Breaking Point × Degradation Factor) × Safety Margin + +Breaking Point = VUs where p95 latency > threshold +Degradation Factor = 0.7 (start degradation before break) +Safety Margin = 0.5-0.7 (handle traffic spikes) + +Example: +- System breaks at 1000 VUs (p95 > 1s) +- Start seeing degradation at 700 VUs (70%) +- Safe capacity: 700 × 0.7 = 490 VUs +``` + +## Authentication and Session Management + +**Problem:** Real APIs require authentication. Can't use same token for all virtual users. + +### Token Strategy Decision Framework + +| Scenario | Strategy | Why | +|----------|----------|-----| +| **Short test (<10 min)** | Pre-generate tokens | Fast, simple, no login load | +| **Long test (soak)** | Login during test + refresh | Realistic, tests auth system | +| **Testing auth system** | Simulate login flow | Auth is part of load | +| **Read-only testing** | Shared token (single user) | Simplest, adequate for API-only tests | + +**Default:** Pre-generate tokens for load tests, simulate login for auth system tests + +### Pre-Generated Tokens Pattern + +**Best for:** API testing where auth system isn't being tested + +```javascript +// k6 with pre-generated JWT tokens +import http from 'k6/http' +import { SharedArray } from 'k6/data' + +// Load tokens from file (generated externally) +const tokens = new SharedArray('auth tokens', function () { + return JSON.parse(open('./tokens.json')) +}) + +export default function() { + const token = tokens[__VU % tokens.length] + + const headers = { + 'Authorization': `Bearer ${token}` + } + + http.get('https://api.example.com/protected', { headers }) +} +``` + +**Generate tokens externally:** + +```bash +# Script to generate 1000 tokens +for i in {1..1000}; do + curl -X POST https://api.example.com/login \ + -d "username=loadtest_user_$i&password=test" \ + | jq -r '.token' +done > tokens.json +``` + +**Pros:** No login load, fast test setup +**Cons:** Tokens may expire during long tests, not testing auth flow + +--- + +### Login Flow Simulation Pattern + +**Best for:** Testing auth system, soak tests where tokens expire + +```javascript +// k6 with login simulation +import http from 'k6/http' +import { SharedArray } from 'k6/data' + +const users = new SharedArray('users', function () { + return 
JSON.parse(open('./users.json')) // [{username, password}, ...] +}) + +export default function() { + const user = users[__VU % users.length] + + // Login to get token + const loginRes = http.post('https://api.example.com/login', { + username: user.username, + password: user.password + }) + + const token = loginRes.json('token') + + // Use token for subsequent requests + const headers = { 'Authorization': `Bearer ${token}` } + + http.get('https://api.example.com/protected', { headers }) + http.post('https://api.example.com/data', {}, { headers }) +} +``` + +**Token refresh for long tests:** + +```javascript +// k6 with token refresh +import { sleep } from 'k6' + +let token = null +let tokenExpiry = 0 + +export default function() { + const now = Date.now() / 1000 + + // Refresh token if expired or about to expire + if (!token || now > tokenExpiry - 300) { // Refresh 5 min before expiry + const loginRes = http.post('https://api.example.com/login', {...}) + token = loginRes.json('token') + tokenExpiry = loginRes.json('expires_at') + } + + http.get('https://api.example.com/protected', { + headers: { 'Authorization': `Bearer ${token}` } + }) + + sleep(1) +} +``` + +--- + +### Session Cookie Management + +**For cookie-based auth:** + +```javascript +// k6 with session cookies +import http from 'k6/http' + +export default function() { + // k6 automatically handles cookies with jar + const jar = http.cookieJar() + + // Login (sets session cookie) + http.post('https://example.com/login', { + username: 'user', + password: 'pass' + }) + + // Subsequent requests use session cookie automatically + http.get('https://example.com/dashboard') + http.get('https://example.com/profile') +} +``` + +--- + +### Rate Limiting Detection + +**Pattern:** Detect when hitting rate limits during load test + +```javascript +// k6 rate limit detection +import { check } from 'k6' + +export default function() { + const res = http.get('https://api.example.com/data') + + check(res, { + 'not rate limited': (r) => r.status !== 429 + }) + + if (res.status === 429) { + console.warn(`Rate limited at VU ${__VU}, iteration ${__ITER}`) + const retryAfter = res.headers['Retry-After'] + console.warn(`Retry-After: ${retryAfter} seconds`) + } +} +``` + +**Thresholds for rate limiting:** + +```javascript +export let options = { + thresholds: { + 'http_req_failed{status:429}': ['rate<0.01'] // <1% rate limited + } +} +``` + +## Third-Party Dependency Handling + +**Problem:** APIs call external services (payment, email, third-party APIs). Should you mock them? + +### Mock vs Real Decision Framework + +| External Service | Mock or Real? | Why | +|------------------|---------------|-----| +| **Payment gateway** | Real (sandbox) | Need to test integration, has sandbox mode | +| **Email provider** | Mock | Cost ($0.001/email × 1000 VUs = expensive), no value testing | +| **Third-party API (has staging)** | Real (staging) | Test integration, realistic latency | +| **Third-party API (no staging)** | Mock | Can't load test production, rate limits | +| **Internal microservices** | Real | Testing real integration points | +| **Analytics/tracking** | Mock | High volume, no functional impact | + +**Rule:** Use real services if they have sandbox/staging. Mock if expensive, rate-limited, or no test environment. 
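+
+To keep one script usable for both cases, parameterize the dependency's base URL. k6 exposes environment variables through `__ENV` (the variable name and sandbox URL below are illustrative):
+
+```javascript
+import http from 'k6/http'
+
+// Default to a local mock; override for real-service runs:
+//   k6 run -e PAYMENT_BASE_URL=https://sandbox-payment.example.com test.js
+const PAYMENT_BASE = __ENV.PAYMENT_BASE_URL || 'http://localhost:8080'
+
+export default function() {
+  http.post(
+    `${PAYMENT_BASE}/api/payment/process`,
+    JSON.stringify({ amount: 1000, currency: 'usd' }),
+    { headers: { 'Content-Type': 'application/json' } }
+  )
+}
+```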
+ +--- + +### Service Virtualization with WireMock + +**Best for:** Mocking HTTP APIs with realistic responses + +```javascript +// k6 test pointing to WireMock +export default function() { + // WireMock running on localhost:8080 mocks external API + const res = http.get('http://localhost:8080/api/payment/process') + + check(res, { + 'payment mock responds': (r) => r.status === 200 + }) +} +``` + +**WireMock stub setup:** + +```json +{ + "request": { + "method": "POST", + "url": "/api/payment/process" + }, + "response": { + "status": 200, + "jsonBody": { + "transaction_id": "{{randomValue type='UUID'}}", + "status": "approved" + }, + "headers": { + "Content-Type": "application/json" + }, + "fixedDelayMilliseconds": 200 + } +} +``` + +**Why WireMock:** Realistic latency simulation, dynamic responses, stateful mocking + +--- + +### Partial Mocking Pattern + +**Pattern:** Mock some services, use real for others + +```javascript +// k6 with partial mocking +import http from 'k6/http' + +export default function() { + // Real API (points to staging) + const productRes = http.get('https://staging-api.example.com/products') + + // Mock email service (points to WireMock) + http.post('http://localhost:8080/mock/email/send', { + to: 'user@example.com', + subject: 'Order confirmation' + }) + + // Real payment sandbox + http.post('https://sandbox-payment.stripe.com/charge', { + amount: 1000, + currency: 'usd', + source: 'tok_visa' + }) +} +``` + +**Decision criteria:** +- Real: Services with sandbox, need integration validation, low cost +- Mock: No sandbox, expensive, rate-limited, testing failure scenarios + +--- + +### Testing External Service Failures + +**Use mocks to simulate failures:** + +```javascript +// WireMock stub for failure scenarios +{ + "request": { + "method": "POST", + "url": "/api/payment/process" + }, + "response": { + "status": 503, + "jsonBody": { + "error": "Service temporarily unavailable" + }, + "fixedDelayMilliseconds": 5000 // Slow failure + } +} +``` + +**k6 test for resilience:** + +```javascript +export default function() { + const res = http.post('http://localhost:8080/api/payment/process', {}) + + // Verify app handles payment failures gracefully + check(res, { + 'handles payment failure': (r) => r.status === 503, + 'returns within timeout': (r) => r.timings.duration < 6000 + }) +} +``` + +--- + +### Cost and Compliance Guardrails + +**Before testing with real external services:** + +| Check | Why | +|-------|-----| +| **Sandbox mode exists?** | Avoid production costs/rate limits | +| **Cost per request?** | 1000 VUs × 10 req/s × 600s = 6M requests | +| **Rate limits?** | Will you hit external service limits? | +| **Terms of service?** | Does load testing violate TOS? | +| **Data privacy?** | Using real user emails/PII? 
| + +**Example cost calculation:** + +``` +Email service: $0.001/email +Load test: 100 VUs × 5 emails/session × 600s = 300,000 emails +Cost: 300,000 × $0.001 = $300 + +Decision: Mock email service, use real payment sandbox (free) +``` + +**Compliance:** +- Don't use real user data in load tests (GDPR, privacy) +- Check third-party TOS (some prohibit load testing) +- Use synthetic test data only + +## Your First Load Test + +**Goal:** Basic load test in one day + +**Hour 1-2: Install tool and write smoke test** + +```bash +# Install k6 +brew install k6 # macOS +# or snap install k6 # Linux + +# Create test.js +cat > test.js <<'EOF' +import http from 'k6/http' +import { check, sleep } from 'k6' + +export let options = { + vus: 1, + duration: '30s' +} + +export default function() { + let res = http.get('https://your-api.com/health') + check(res, { + 'status is 200': (r) => r.status === 200, + 'response < 500ms': (r) => r.timings.duration < 500 + }) + sleep(1) +} +EOF + +# Run smoke test +k6 run test.js +``` + +**Hour 3-4: Calculate target load** + +``` +Your DAU: 10,000 +Concurrency: 10% +Peak multiplier: 1.5 +Target: 10,000 × 0.10 × 1.5 = 1,500 VUs +``` + +**Hour 5-6: Write load test with ramp-up** + +```javascript +export let options = { + stages: [ + { duration: '5m', target: 750 }, // Ramp to normal (50%) + { duration: '10m', target: 750 }, // Hold normal + { duration: '5m', target: 1500 }, // Ramp to peak + { duration: '10m', target: 1500 }, // Hold peak + { duration: '5m', target: 0 }, // Ramp down + ], + thresholds: { + http_req_duration: ['p(95)<500', 'p(99)<1000'], + http_req_failed: ['rate<0.05'] // < 5% errors + } +} +``` + +**Hour 7-8: Run test and analyze** + +```bash +# Run load test +k6 run --out json=results.json test.js + +# Check summary output for: +# - p95/p99 latency trends +# - Error rates +# - When degradation started +``` + +**If test fails:** Check thresholds, adjust targets, investigate bottlenecks + +## Common Mistakes + +### ❌ Testing Production Without Safeguards +**Fix:** Use feature flags, test environment, or controlled percentage + +--- + +### ❌ No Baseline Performance Metrics +**Fix:** Run smoke test first to establish baseline before load testing + +--- + +### ❌ Using Iteration Duration Instead of Arrival Rate +**Fix:** Use `constant-arrival-rate` executor in k6 + +--- + +### ❌ Not Warming Up Caches/JIT +**Fix:** 2-5 minute warm-up phase before measurement + +## Quick Reference + +**Tool Selection:** +- Modern API: k6 +- Enterprise: JMeter +- Python team: Locust + +**Test Patterns:** +- Smoke: 1 VU, 1 min +- Load: Ramp-up → peak → ramp-down +- Stress: Increase until break +- Spike: Sudden 10x surge +- Soak: 4-8 hours constant + +**Load Calculation:** +``` +Concurrent = DAU × 0.10 × 1.5 +RPS = (Concurrent × Requests/Session) / (Duration × Think Time) +``` + +**Anti-Patterns:** +- Coordinated omission (use arrival rate) +- Cold start (warm-up first) +- Unrealistic data (parameterize) +- Constant load (use realistic patterns) + +**Result Interpretation:** +- Linear growth → CPU-bound +- Exponential growth → Resource saturation +- Sudden cliff → Hard limit +- Gradual degradation → Memory leak + +**Authentication:** +- Short tests: Pre-generate tokens +- Long tests: Login + refresh +- Testing auth: Simulate login flow + +**Third-Party Dependencies:** +- Has sandbox: Use real (staging/sandbox) +- Expensive/rate-limited: Mock (WireMock) +- No sandbox: Mock + +## Bottom Line + +**Start with smoke test (1 VU). Calculate realistic load from DAU. 
Use ramp-up pattern (never start at peak). Monitor p95/p99 latency. Find breaking point before users do.** + +Test realistic scenarios with think time, not hammer tests. diff --git a/skills/mutation-testing/SKILL.md b/skills/mutation-testing/SKILL.md new file mode 100644 index 0000000..85e8ccb --- /dev/null +++ b/skills/mutation-testing/SKILL.md @@ -0,0 +1,348 @@ +--- +name: mutation-testing +description: Use when validating test effectiveness, measuring test quality beyond coverage, choosing mutation testing tools (Stryker, PITest, mutmut), interpreting mutation scores, or improving test suites - provides mutation operators, score interpretation, and integration patterns +--- + +# Mutation Testing + +## Overview + +**Core principle:** Mutation testing validates that your tests actually test something by introducing bugs and checking if tests catch them. + +**Rule:** 100% code coverage doesn't mean good tests. Mutation score measures if tests detect bugs. + +## Code Coverage vs Mutation Score + +| Metric | What It Measures | Example | +|--------|------------------|---------| +| **Code Coverage** | Lines executed by tests | `calculate_tax(100)` executes code = 100% coverage | +| **Mutation Score** | Bugs detected by tests | Change `*` to `/` → test still passes = poor tests | + +**Problem with coverage:** + +```python +def calculate_tax(amount): + return amount * 0.08 + +def test_calculate_tax(): + calculate_tax(100) # 100% coverage, but asserts nothing! +``` + +**Mutation testing catches this:** +1. Mutates `* 0.08` to `/ 0.08` +2. Runs test +3. Test still passes → **Survived mutation** (bad test!) + +--- + +## How Mutation Testing Works + +**Process:** +1. **Create mutant:** Change code slightly (e.g., `+` → `-`, `<` → `<=`) +2. **Run tests:** Do tests fail? +3. **Classify:** + - **Killed:** Test failed → Good test! + - **Survived:** Test passed → Test doesn't verify this logic + - **Timeout:** Test hung → Usually killed + - **No coverage:** Not executed → Add test + +**Mutation Score:** +``` +Mutation Score = (Killed Mutants / Total Mutants) × 100 +``` + +**Thresholds:** +- **> 80%:** Excellent test quality +- **60-80%:** Acceptable +- **< 60%:** Tests are weak + +--- + +## Tool Selection + +| Language | Tool | Why | +|----------|------|-----| +| **JavaScript/TypeScript** | **Stryker** | Best JS support, framework-agnostic | +| **Java** | **PITest** | Industry standard, Maven/Gradle integration | +| **Python** | **mutmut** | Simple, fast, pytest integration | +| **C#** | **Stryker.NET** | .NET ecosystem integration | + +--- + +## Example: Python with mutmut + +### Installation + +```bash +pip install mutmut +``` + +--- + +### Basic Usage + +```bash +# Run mutation testing +mutmut run + +# View results +mutmut results + +# Show survived mutants (bugs your tests missed) +mutmut show +``` + +--- + +### Configuration + +```toml +# setup.cfg +[mutmut] +paths_to_mutate=src/ +backup=False +runner=python -m pytest -x +tests_dir=tests/ +``` + +--- + +### Example + +```python +# src/calculator.py +def calculate_discount(price, percent): + if percent > 100: + raise ValueError("Percent cannot exceed 100") + return price * (1 - percent / 100) + +# tests/test_calculator.py +def test_calculate_discount(): + result = calculate_discount(100, 20) + assert result == 80 +``` + +**Run mutmut:** +```bash +mutmut run +``` + +**Possible mutations:** +1. `percent > 100` → `percent >= 100` (boundary) +2. `1 - percent` → `1 + percent` (operator) +3. `percent / 100` → `percent * 100` (operator) +4. 
`price * (...)` → `price / (...)` (operator) + +**Results:** +- Mutation 1 **survived** (test doesn't check boundary) +- Mutation 2, 3, 4 **killed** (test catches these) + +**Improvement:** +```python +def test_calculate_discount_boundary(): + # Catch mutation 1 + with pytest.raises(ValueError): + calculate_discount(100, 101) +``` + +--- + +## Common Mutation Operators + +| Operator | Original | Mutated | What It Tests | +|----------|----------|---------|---------------| +| **Arithmetic** | `a + b` | `a - b` | Calculation logic | +| **Relational** | `a < b` | `a <= b` | Boundary conditions | +| **Logical** | `a and b` | `a or b` | Boolean logic | +| **Unary** | `+x` | `-x` | Sign handling | +| **Constant** | `return 0` | `return 1` | Magic numbers | +| **Return** | `return x` | `return None` | Return value validation | +| **Statement deletion** | `x = 5` | (deleted) | Side effects | + +--- + +## Interpreting Mutation Score + +### High Score (> 80%) + +**Good tests that catch most bugs.** + +```python +def add(a, b): + return a + b + +def test_add(): + assert add(2, 3) == 5 + assert add(-1, 1) == 0 + assert add(0, 0) == 0 + +# Mutations killed: +# - a - b (returns -1, test expects 5) +# - a * b (returns 6, test expects 5) +``` + +--- + +### Low Score (< 60%) + +**Weak tests that don't verify logic.** + +```python +def validate_email(email): + return "@" in email and "." in email + +def test_validate_email(): + validate_email("user@example.com") # No assertion! + +# Mutations survived: +# - "@" in email → "@" not in email +# - "and" → "or" +# - (All mutations survive because test asserts nothing) +``` + +--- + +### Survived Mutants to Investigate + +**Priority order:** +1. **Business logic mutations** (calculations, validations) +2. **Boundary conditions** (`<` → `<=`, `>` → `>=`) +3. **Error handling** (exception raising) + +**Low priority:** +4. **Logging statements** +5. 
**Constants that don't affect behavior** + +--- + +## Integration with CI/CD + +### GitHub Actions (Python) + +```yaml +# .github/workflows/mutation-testing.yml +name: Mutation Testing + +on: + schedule: + - cron: '0 2 * * 0' # Weekly on Sunday 2 AM + workflow_dispatch: # Manual trigger + +jobs: + mutmut: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + pip install mutmut pytest + + - name: Run mutation testing + run: mutmut run + + - name: Generate report + run: | + mutmut results + mutmut html # Generate HTML report + + - name: Upload report + uses: actions/upload-artifact@v3 + with: + name: mutation-report + path: html/ +``` + +**Why weekly, not every PR:** +- Mutation testing is slow (10-100x slower than regular tests) +- Runs every possible mutation +- Not needed for every change + +--- + +## Anti-Patterns Catalog + +### ❌ Chasing 100% Mutation Score + +**Symptom:** Writing tests just to kill surviving mutants + +**Why bad:** +- Some mutations are equivalent (don't change behavior) +- Diminishing returns after 85% +- Time better spent on integration tests + +**Fix:** Target 80-85%, focus on business logic + +--- + +### ❌ Ignoring Equivalent Mutants + +**Symptom:** "95% mutation score, still have survived mutants" + +**Equivalent mutants:** Changes that don't affect behavior + +```python +def is_positive(x): + return x > 0 + +# Mutation: x > 0 → x >= 0 +# If input is never exactly 0, this mutation is equivalent +``` + +**Fix:** Mark as equivalent in tool config + +```bash +# mutmut - mark mutant as equivalent +mutmut results +# Choose mutant ID +mutmut apply 42 --mark-as-equivalent +``` + +--- + +### ❌ Running Mutation Tests on Every Commit + +**Symptom:** CI takes 2 hours + +**Why bad:** Mutation testing is 10-100x slower than regular tests + +**Fix:** +- Run weekly or nightly +- Run on core modules only (not entire codebase) +- Use as quality metric, not blocker + +--- + +## Incremental Mutation Testing + +**Test only changed code:** + +```bash +# mutmut - test only modified files +git diff --name-only main | grep '\.py$' | mutmut run --paths-to-mutate - +``` + +**Benefits:** +- Faster feedback (minutes instead of hours) +- Can run on PRs +- Focuses on new code + +--- + +## Bottom Line + +**Mutation testing measures if your tests actually detect bugs. High code coverage doesn't mean good tests.** + +**Usage:** +- Run weekly/nightly, not on every commit (too slow) +- Target 80-85% mutation score for business logic +- Use mutmut (Python), Stryker (JS), PITest (Java) +- Focus on killed vs survived mutants +- Ignore equivalent mutants + +**If your tests have 95% coverage but 40% mutation score, your tests aren't testing anything meaningful. 
Fix the tests, not the coverage metric.** diff --git a/skills/observability-and-monitoring/SKILL.md b/skills/observability-and-monitoring/SKILL.md new file mode 100644 index 0000000..a902d62 --- /dev/null +++ b/skills/observability-and-monitoring/SKILL.md @@ -0,0 +1,479 @@ +--- +name: observability-and-monitoring +description: Use when implementing metrics/logs/traces, defining SLIs/SLOs, designing alerts, choosing observability tools, debugging alert fatigue, or optimizing observability costs - provides SRE frameworks, anti-patterns, and implementation patterns +--- + +# Observability and Monitoring + +## Overview + +**Core principle:** Measure what users care about, alert on symptoms not causes, make alerts actionable. + +**Rule:** Observability without actionability is just expensive logging. + +**Already have observability tools (CloudWatch, Datadog, etc.)?** Optimize what you have first. Most observability problems are usage/process issues, not tooling. Implement SLIs/SLOs, clean up alerts, add runbooks with existing tools. Migrate only if you hit concrete tool limitations (cost, features, multi-cloud). Tool migration is expensive - make sure it solves a real problem. + +## Getting Started Decision Tree + +| Team Size | Scale | Starting Point | Tools | +|-----------|-------|----------------|-------| +| 1-5 engineers | <10 services | Metrics + logs | Prometheus + Grafana + Loki | +| 5-20 engineers | 10-50 services | Metrics + logs + basic traces | Add Jaeger, OpenTelemetry | +| 20+ engineers | 50+ services | Full observability + SLOs | Managed platform (Datadog, Grafana Cloud) | + +**First step:** Implement metrics with OpenTelemetry + Prometheus + +**Why this order:** Metrics give you fastest time-to-value (detect issues), logs help debug (understand what happened), traces solve complex distributed problems (debug cross-service issues) + +## Three Pillars Quick Reference + +### Metrics (Quantitative, aggregated) + +**When to use:** Alerting, dashboards, trend analysis + +**What to collect:** +- **RED method** (services): Rate, Errors, Duration +- **USE method** (resources): Utilization, Saturation, Errors +- **Four Golden Signals**: Latency, traffic, errors, saturation + +**Implementation:** +```python +# OpenTelemetry metrics +from opentelemetry import metrics + +meter = metrics.get_meter(__name__) +request_counter = meter.create_counter( + "http_requests_total", + description="Total HTTP requests" +) +request_duration = meter.create_histogram( + "http_request_duration_seconds", + description="HTTP request duration" +) + +# Instrument request +request_counter.add(1, {"method": "GET", "endpoint": "/api/users"}) +request_duration.record(duration, {"method": "GET", "endpoint": "/api/users"}) +``` + +### Logs (Discrete events) + +**When to use:** Debugging, audit trails, error investigation + +**Best practices:** +- Structured logging (JSON) +- Include correlation IDs +- Don't log sensitive data (PII, secrets) + +**Implementation:** +```python +import structlog + +log = structlog.get_logger() +log.info( + "user_login", + user_id=user_id, + correlation_id=correlation_id, + ip_address=ip, + duration_ms=duration +) +``` + +### Traces (Request flows) + +**When to use:** Debugging distributed systems, latency investigation + +**Implementation:** +```python +from opentelemetry import trace + +tracer = trace.get_tracer(__name__) + +with tracer.start_as_current_span("process_order") as span: + span.set_attribute("order.id", order_id) + span.set_attribute("user.id", user_id) + # Process order 
logic +``` + +## Anti-Patterns Catalog + +### ❌ Vanity Metrics +**Symptom:** Tracking metrics that look impressive but don't inform decisions + +**Why bad:** Wastes resources, distracts from actionable metrics + +**Fix:** Only collect metrics that answer "should I page someone?" or inform business decisions + +```python +# ❌ Bad - vanity metric +total_requests_all_time_counter.inc() + +# ✅ Good - actionable metric +request_error_rate.labels(service="api", endpoint="/users").observe(error_rate) +``` + +--- + +### ❌ Alert on Everything +**Symptom:** Hundreds of alerts per day, team ignores most of them + +**Why bad:** Alert fatigue, real issues get missed, on-call burnout + +**Fix:** Alert only on user-impacting symptoms that require immediate action + +**Test:** "If this alert fires at 2am, should someone wake up to fix it?" If no, it's not an alert. + +--- + +### ❌ No Runbooks +**Symptom:** Alerts fire with no guidance on how to respond + +**Why bad:** Increased MTTR, inconsistent responses, on-call stress + +**Fix:** Every alert must link to a runbook with investigation steps + +```yaml +# ✅ Good alert with runbook +alert: HighErrorRate +annotations: + summary: "Error rate >5% on {{$labels.service}}" + description: "Current: {{$value}}%" + runbook: "https://wiki.company.com/runbooks/high-error-rate" +``` + +--- + +### ❌ Cardinality Explosion +**Symptom:** Metrics with unbounded labels (user IDs, timestamps, UUIDs) cause storage/performance issues + +**Why bad:** Expensive storage, slow queries, potential system failure + +**Fix:** Use fixed cardinality labels, aggregate high-cardinality dimensions + +```python +# ❌ Bad - unbounded cardinality +request_counter.labels(user_id=user_id).inc() # Millions of unique series + +# ✅ Good - bounded cardinality +request_counter.labels(user_type="premium", region="us-east").inc() +``` + +--- + +### ❌ Missing Correlation IDs +**Symptom:** Can't trace requests across services, debugging takes hours + +**Why bad:** High MTTR, frustrated engineers, customer impact + +**Fix:** Generate correlation ID at entry point, propagate through all services + +```python +# ✅ Good - correlation ID propagation +import uuid +from contextvars import ContextVar + +correlation_id_var = ContextVar("correlation_id", default=None) + +def handle_request(): + correlation_id = request.headers.get("X-Correlation-ID") or str(uuid.uuid4()) + correlation_id_var.set(correlation_id) + + # All logs and traces include it automatically + log.info("processing_request", extra={"correlation_id": correlation_id}) +``` + +## SLI Selection Framework + +**Principle:** Measure user experience, not system internals + +### Four Golden Signals + +| Signal | Definition | Example SLI | +|--------|------------|-------------| +| **Latency** | Request response time | p99 latency < 200ms | +| **Traffic** | Demand on system | Requests per second | +| **Errors** | Failed requests | Error rate < 0.1% | +| **Saturation** | Resource fullness | CPU < 80%, queue depth < 100 | + +### RED Method (for services) + +- **Rate**: Requests per second +- **Errors**: Error rate (%) +- **Duration**: Response time (p50, p95, p99) + +### USE Method (for resources) + +- **Utilization**: % time resource busy (CPU %, disk I/O %) +- **Saturation**: Queue depth, wait time +- **Errors**: Error count + +**Decision framework:** + +| Service Type | Recommended SLIs | +|--------------|------------------| +| **User-facing API** | Availability (%), p95 latency, error rate | +| **Background jobs** | Freshness (time since last run), 
success rate, processing time | +| **Data pipeline** | Data freshness, completeness (%), processing latency | +| **Storage** | Availability, durability, latency percentiles | + +## SLO Definition Guide + +**SLO = Target value for SLI** + +**Formula:** `SLO = (good events / total events) >= target` + +**Example:** +``` +SLI: Request success rate +SLO: 99.9% of requests succeed (measured over 30 days) +Error budget: 0.1% = ~43 minutes downtime/month +``` + +### Error Budget + +**Definition:** Amount of unreliability you can tolerate + +**Calculation:** +``` +Error budget = 1 - SLO target +If SLO = 99.9%, error budget = 0.1% +For 1M requests/month: 1,000 requests can fail +``` + +**Usage:** Balance reliability vs feature velocity + +### Multi-Window Multi-Burn-Rate Alerting + +**Problem:** Simple threshold alerts are either too noisy or too slow + +**Solution:** Alert based on how fast you're burning error budget + +```yaml +# Alert if burning budget 14.4x faster than acceptable (5% in 1 hour) +alert: ErrorBudgetBurnRateHigh +expr: | + ( + rate(errors[1h]) / rate(requests[1h]) + ) > (14.4 * (1 - 0.999)) +annotations: + summary: "Burning error budget at 14.4x rate" + runbook: "https://wiki/runbooks/error-budget-burn" +``` + +## Alert Design Patterns + +**Principle:** Alert on symptoms (user impact) not causes (CPU high) + +### Symptom-Based Alerting + +```python +# ❌ Bad - alert on cause +alert: HighCPU +expr: cpu_usage > 80% + +# ✅ Good - alert on symptom +alert: HighLatency +expr: http_request_duration_p99 > 1.0 +``` + +### Alert Severity Levels + +| Level | When | Response Time | Example | +|-------|------|---------------|---------| +| **Critical** | User-impacting | Immediate (page) | Error rate >5%, service down | +| **Warning** | Will become critical | Next business day | Error rate >1%, disk 85% full | +| **Info** | Informational | No action needed | Deploy completed, scaling event | + +**Rule:** Only page for critical. Everything else goes to dashboard/Slack. + +## Cost Optimization Quick Reference + +**Observability can cost 5-15% of infrastructure spend. Optimize:** + +### Sampling Strategies + +```python +# Trace sampling - collect 10% of traces +from opentelemetry.sdk.trace.sampling import TraceIdRatioBased + +sampler = TraceIdRatioBased(0.1) # 10% sampling +``` + +**When to sample:** +- Traces: 1-10% for high-traffic services +- Logs: Sample debug/info logs, keep all errors +- Metrics: Don't sample (they're already aggregated) + +### Retention Policies + +| Data Type | Recommended Retention | Rationale | +|-----------|----------------------|-----------| +| **Metrics** | 15 days (raw), 13 months (aggregated) | Trend analysis | +| **Logs** | 7-30 days | Debugging, compliance | +| **Traces** | 7 days | Debugging recent issues | + +### Cardinality Control + +```python +# ❌ Bad - high cardinality +http_requests.labels( + method=method, + url=full_url, # Unbounded! + user_id=user_id # Unbounded! 
+) + +# ✅ Good - controlled cardinality +http_requests.labels( + method=method, + endpoint=route_pattern, # /users/:id not /users/12345 + status_code=status +) +``` + +## Tool Ecosystem Quick Reference + +| Category | Open Source | Managed/Commercial | +|----------|-------------|-------------------| +| **Metrics** | Prometheus, VictoriaMetrics | Datadog, New Relic, Grafana Cloud | +| **Logs** | Loki, ELK Stack | Datadog, Splunk, Sumo Logic | +| **Traces** | Jaeger, Zipkin | Datadog, Honeycomb, Lightstep | +| **All-in-One** | Grafana + Loki + Tempo + Mimir | Datadog, New Relic, Dynatrace | +| **Instrumentation** | OpenTelemetry | (vendor SDKs) | + +**Recommendation:** +- **Starting out**: Prometheus + Grafana + OpenTelemetry +- **Growing (10-50 services)**: Add Loki (logs) + Jaeger (traces) +- **Scale (50+ services)**: Consider managed (Datadog, Grafana Cloud) + +**Why OpenTelemetry:** Vendor-neutral, future-proof, single instrumentation for all signals + +## Your First Observability Setup + +**Goal:** Metrics + alerting in one week + +**Day 1-2: Instrument application** + +```python +# Add OpenTelemetry +from opentelemetry import metrics, trace +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.exporter.prometheus import PrometheusMetricReader + +# Initialize +meter_provider = MeterProvider( + metric_readers=[PrometheusMetricReader()] +) +metrics.set_meter_provider(meter_provider) + +# Instrument HTTP framework (auto-instrumentation) +from opentelemetry.instrumentation.flask import FlaskInstrumentor +FlaskInstrumentor().instrument_app(app) +``` + +**Day 3-4: Deploy Prometheus + Grafana** + +```yaml +# docker-compose.yml +version: '3' +services: + prometheus: + image: prom/prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + + grafana: + image: grafana/grafana + ports: + - "3000:3000" +``` + +**Day 5: Define SLIs and SLOs** + +``` +SLI: HTTP request success rate +SLO: 99.9% of requests succeed (30-day window) +Error budget: 0.1% = 43 minutes downtime/month +``` + +**Day 6: Create alerts** + +```yaml +# prometheus-alerts.yml +groups: + - name: slo_alerts + rules: + - alert: HighErrorRate + expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05 + for: 5m + annotations: + summary: "Error rate >5% on {{$labels.service}}" + runbook: "https://wiki/runbooks/high-error-rate" +``` + +**Day 7: Build dashboard** + +**Panels to include:** +- Error rate (%) +- Request rate (req/s) +- p50/p95/p99 latency +- CPU/memory utilization + +## Common Mistakes + +### ❌ Logging in Production == Debugging in Production +**Fix:** Use structured logging with correlation IDs, not print statements + +--- + +### ❌ Alerting on Predictions, Not Reality +**Fix:** Alert on actual user impact (errors, latency) not predicted issues (disk 70% full) + +--- + +### ❌ Dashboard Sprawl +**Fix:** One main dashboard per service showing SLIs. Delete dashboards unused for 3 months. + +--- + +### ❌ Ignoring Alert Feedback Loop +**Fix:** Track alert precision (% that led to action). Delete alerts with <50% precision. 
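As a rough illustration of the alert-precision feedback loop above, a small script can compute per-alert precision from post-incident annotations. The data shape (`alert_outcomes` with an `actionable` flag) is a hypothetical example of an on-call review export, not any specific tool's format:

```python
from collections import defaultdict

# Hypothetical export of alert firings, annotated during on-call review with
# whether the firing led to real action (actionable=True) or was noise.
alert_outcomes = [
    {"alert": "HighErrorRate", "actionable": True},
    {"alert": "HighErrorRate", "actionable": True},
    {"alert": "HighCPU", "actionable": False},
    {"alert": "HighCPU", "actionable": False},
    {"alert": "HighCPU", "actionable": True},
]

def alert_precision(outcomes):
    """Return {alert_name: fraction of firings that led to action}."""
    fired = defaultdict(int)
    acted = defaultdict(int)
    for outcome in outcomes:
        fired[outcome["alert"]] += 1
        acted[outcome["alert"]] += outcome["actionable"]
    return {name: acted[name] / fired[name] for name in fired}

for name, precision in alert_precision(alert_outcomes).items():
    if precision < 0.5:
        print(f"{name}: precision {precision:.0%} - candidate for deletion or tuning")
```

Reviewing this weekly alongside the regular quality report keeps the alert set small and actionable.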
+ +## Quick Reference + +**Getting Started:** +- Start with metrics (Prometheus + OpenTelemetry) +- Add logs when debugging is hard (Loki) +- Add traces when issues span services (Jaeger) + +**SLI Selection:** +- User-facing: Availability, latency, error rate +- Background: Freshness, success rate, processing time + +**SLO Targets:** +- Start with 99% (achievable) +- Increase to 99.9% only if business requires it +- 99.99% is very expensive (4 nines = 52 min/year downtime) + +**Alerting:** +- Critical only = page +- Warning = next business day +- Info = dashboard only + +**Cost Control:** +- Sample traces (1-10%) +- Control metric cardinality (no unbounded labels) +- Set retention policies (7-30 days logs, 15 days metrics) + +**Tools:** +- Small: Prometheus + Grafana + Loki +- Medium: Add Jaeger +- Large: Consider Datadog, Grafana Cloud + +## Bottom Line + +**Start with metrics using OpenTelemetry + Prometheus. Define 3-5 SLIs based on user experience. Alert only on symptoms that require immediate action. Add logs and traces when metrics aren't enough.** + +Measure what users care about, not what's easy to measure. diff --git a/skills/performance-testing-fundamentals/SKILL.md b/skills/performance-testing-fundamentals/SKILL.md new file mode 100644 index 0000000..36fd60f --- /dev/null +++ b/skills/performance-testing-fundamentals/SKILL.md @@ -0,0 +1,242 @@ +--- +name: performance-testing-fundamentals +description: Use when starting performance testing, choosing load testing tools, interpreting performance metrics, debugging slow applications, or establishing performance baselines - provides decision frameworks and anti-patterns for load, stress, spike, and soak testing +--- + +# Performance Testing Fundamentals + +## Overview + +**Core principle:** Diagnose first, test second. Performance testing without understanding your bottlenecks wastes time. + +**Rule:** Define SLAs before testing. You can't judge "good" performance without requirements. + +## When NOT to Performance Test + +Performance test only AFTER: +- ✅ Defining performance SLAs (latency, throughput, error rate targets) +- ✅ Profiling current bottlenecks (APM, database logs, profiling) +- ✅ Fixing obvious issues (missing indexes, N+1 queries, inefficient algorithms) + +**Don't performance test to find problems** - use profiling/APM for that. Performance test to verify fixes and validate capacity. 
+ +## Tool Selection Decision Tree + +| Your Constraint | Choose | Why | +|----------------|--------|-----| +| CI/CD integration, JavaScript team | **k6** | Modern, code-as-config, easy CI integration | +| Complex scenarios, enterprise, mature ecosystem | **JMeter** | GUI, plugins, every protocol | +| High throughput (10k+ RPS), Scala team | **Gatling** | Built for scale, excellent reports | +| Quick HTTP benchmark, no complex scenarios | **Apache Bench (ab)** or **wrk** | Command-line, no setup | +| Cloud-based, don't want infrastructure | **BlazeMeter**, **Loader.io** | SaaS, pay-per-use | +| Realistic browser testing (JS rendering) | **Playwright** + **k6** | Hybrid: Playwright for UX, k6 for load | + +**For most teams:** k6 (modern, scriptable) or JMeter (mature, GUI) + +## Test Type Quick Reference + +| Test Type | Purpose | Duration | Load Pattern | Use When | +|-----------|---------|----------|--------------|----------| +| **Load Test** | Verify normal operations under expected load | 15-30 min | Steady (ramp to target, sustain) | Baseline validation, regression testing | +| **Stress Test** | Find breaking point | 5-15 min | Increasing (ramp until failure) | Capacity planning, finding limits | +| **Spike Test** | Test sudden traffic surge | 2-5 min | Instant jump (0 → peak) | Black Friday prep, auto-scaling validation | +| **Soak Test** | Find memory leaks, connection pool exhaustion | 2-8 hours | Steady sustained load | Pre-production validation, stability check | + +**Start with Load Test** (validates baseline), then Stress/Spike (finds limits), finally Soak (validates stability). + +## Anti-Patterns Catalog + +### ❌ Premature Load Testing +**Symptom:** "App is slow, let's load test it" + +**Why bad:** Load testing reveals "it's slow under load" but not WHY or WHERE + +**Fix:** Profile first (APM, database slow query logs, profiler), fix obvious bottlenecks, THEN load test to validate + +--- + +### ❌ Testing Without SLAs +**Symptom:** "My API handles 100 RPS with 200ms average latency. Is that good?" + +**Why bad:** Can't judge "good" without requirements. A gaming API needs <50ms; batch processing tolerates 2s. + +**Fix:** Define SLAs first: +- Target latency: P95 < 300ms, P99 < 500ms +- Target throughput: 500 RPS at peak +- Max error rate: < 0.1% + +--- + +### ❌ Unrealistic SLAs +**Symptom:** "Our database-backed CRUD API with complex joins must have P95 < 10ms" + +**Why bad:** Sets impossible targets. Database round-trip alone is often 5-20ms. Forces wasted optimization or architectural rewrites. + +**Fix:** Compare against Performance Benchmarks table (see below). If target is 10x better than benchmark, profile current performance first, then negotiate realistic SLA based on what's achievable vs cost of optimization. + +--- + +### ❌ Vanity Metrics +**Symptom:** Reporting only average response time + +**Why bad:** Average hides tail latency. 99% of requests at 100ms + 1% at 10s = "average 200ms" looks fine, but users experience 10s delays. 
+ +**Fix:** Always report percentiles: +- P50 (median) - typical user experience +- P95 - most users +- P99 - worst-case for significant minority +- Max - outliers + +--- + +### ❌ Load Testing in Production First +**Symptom:** "Let's test capacity by running load tests against production" + +**Why bad:** Risks outages, contaminates real metrics, can trigger alerts/costs + +**Fix:** Test in staging environment that mirrors production (same DB size, network latency, resource limits) + +--- + +### ❌ Single-User "Load" Tests +**Symptom:** Running one user hitting the API as fast as possible + +**Why bad:** Doesn't simulate realistic concurrency, misses resource contention (database connections, thread pools) + +**Fix:** Simulate realistic concurrent users with realistic think time between requests + +## Metrics Glossary + +| Metric | Definition | Good Threshold (typical web API) | +|--------|------------|----------------------------------| +| **RPS** (Requests/Second) | Throughput - how many requests processed | Varies by app; know your peak | +| **Latency** | Time from request to response | P95 < 300ms, P99 < 500ms | +| **P50 (Median)** | 50% of requests faster than this | P50 < 100ms | +| **P95** | 95% of requests faster than this | P95 < 300ms | +| **P99** | 99% of requests faster than this | P99 < 500ms | +| **Error Rate** | % of 4xx/5xx responses | < 0.1% | +| **Throughput** | Data transferred per second (MB/s) | Depends on payload size | +| **Concurrent Users** | Active users at same time | Calculate from traffic patterns | + +**Focus on P95/P99, not average.** Tail latency kills user experience. + +## Diagnostic-First Workflow + +Before load testing slow applications, follow this workflow: + +**Step 1: Measure Current State** +- Install APM (DataDog, New Relic, Grafana) or logging +- Identify slowest 10 endpoints/operations +- Check database slow query logs + +**Step 2: Common Quick Wins** (90% of performance issues) +- Missing database indexes +- N+1 query problem +- Unoptimized images/assets +- Missing caching (Redis, CDN) +- Synchronous operations that should be async +- Inefficient serialization (JSON parsing bottlenecks) + +**Step 3: Profile Specific Bottleneck** +- Use profiler to see CPU/memory hotspots +- Trace requests to find where time is spent (DB? external API? computation?) +- Check for resource limits (max connections, thread pool exhaustion) + +**Step 4: Fix and Measure** +- Apply fix (add index, cache layer, async processing) +- Measure improvement in production +- Document before/after metrics + +**Step 5: THEN Load Test** (if needed) +- Validate fixes handle expected load +- Find new capacity limits +- Establish regression baseline + +**Anti-pattern to avoid:** Skipping to Step 5 without Steps 1-4. 
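The percentile guidance above is easy to wire into a script that post-processes load-test or APM output before you compare against SLAs. A minimal sketch, assuming you already have raw response times as a list of milliseconds (the samples and SLA targets below are illustrative):

```python
def percentile(samples, pct):
    """Nearest-rank percentile of a list of latency samples."""
    ordered = sorted(samples)
    index = max(0, int(round(pct / 100 * len(ordered))) - 1)
    return ordered[index]

# Illustrative response times in ms; in practice these come from your load tool or APM export.
latencies_ms = [60, 70, 80, 90, 95, 120, 150, 180, 240, 900]

sla = {"p50": 100, "p95": 300, "p99": 500}  # example SLA targets in milliseconds

avg = sum(latencies_ms) / len(latencies_ms)
print(f"average: {avg:.0f}ms  (looks fine, hides the tail)")

for name, target in sla.items():
    value = percentile(latencies_ms, int(name[1:]))
    status = "PASS" if value <= target else "FAIL"
    print(f"{name}: {value:.0f}ms (target {target}ms) -> {status}")
```

Here the average (~199ms) looks healthy while P95/P99 blow past their targets, which is exactly why the tail percentiles, not the average, should gate results. Once a baseline passes, the same check can run in CI against exported results as a regression gate.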
+ +## Performance Benchmarks (Reference) + +What "good" looks like by application type: + +| Application Type | Typical P95 Latency | Typical Throughput | Notes | +|------------------|---------------------|-------------------|-------| +| **REST API (CRUD)** | < 200ms | 500-2000 RPS | Database-backed, simple queries | +| **Search API** | < 500ms | 100-500 RPS | Complex queries, ranking algorithms | +| **Payment Gateway** | < 1s | 50-200 RPS | External service calls, strict consistency | +| **Real-time Gaming** | < 50ms | 1000-10000 RPS | Low latency critical | +| **Batch Processing** | 2-10s/job | 10-100 jobs/min | Throughput > latency | +| **Static CDN** | < 100ms | 10000+ RPS | Edge-cached, minimal computation | + +**Use as rough guide, not absolute targets.** Your SLAs depend on user needs. + +## Results Interpretation Framework + +After running a load test: + +**Pass Criteria:** +- ✅ All requests meet latency SLA (e.g., P95 < 300ms) +- ✅ Error rate under threshold (< 0.1%) +- ✅ No resource exhaustion (CPU < 80%, memory stable, no connection pool saturation) +- ✅ Sustained load for test duration without degradation + +**Fail Criteria:** +- ❌ Latency exceeds SLA +- ❌ Error rate spikes +- ❌ Gradual degradation over time (memory leak, connection leak) +- ❌ Resource exhaustion (CPU pegged, OOM errors) + +**Next Steps:** +- **If passing:** Establish this as regression baseline, run periodically in CI +- **If failing:** Profile to find bottleneck, optimize, re-test +- **If borderline:** Test at higher load (stress test) to find safety margin + +## Common Mistakes + +### ❌ Not Ramping Load Gradually +**Symptom:** Instant 0 → 1000 users, everything fails + +**Fix:** Ramp over 2-5 minutes to let auto-scaling/caching warm up (except spike tests, where instant jump is the point) + +--- + +### ❌ Testing With Empty Database +**Symptom:** Tests pass with 100 records, fail with 1M records in production + +**Fix:** Seed staging database with production-scale data + +--- + +### ❌ Ignoring External Dependencies +**Symptom:** Your API is fast, but third-party payment gateway times out under load + +**Fix:** Include external service latency in SLAs, or mock them for isolated API testing + +## Quick Reference + +**Getting Started Checklist:** +1. Define SLAs (latency P95/P99, throughput, error rate) +2. Choose tool (k6 or JMeter for most cases) +3. Start with Load Test (baseline validation) +4. Run Stress Test (find capacity limits) +5. Establish regression baseline +6. Run in CI on major changes + +**When Debugging Slow App:** +1. Profile first (APM, database logs) +2. Fix obvious issues (indexes, N+1, caching) +3. Measure improvement +4. THEN load test to validate + +**Interpreting Results:** +- Report P95/P99, not just average +- Compare against SLAs +- Check for resource exhaustion +- Look for degradation over time (soak tests) + +## Bottom Line + +**Performance testing validates capacity and catches regressions.** + +**Profiling finds bottlenecks.** + +Don't confuse the two - diagnose first, test second. 
diff --git a/skills/property-based-testing/SKILL.md b/skills/property-based-testing/SKILL.md new file mode 100644 index 0000000..c4ceece --- /dev/null +++ b/skills/property-based-testing/SKILL.md @@ -0,0 +1,504 @@ +--- +name: property-based-testing +description: Use when testing invariants, validating properties across many inputs, using Hypothesis (Python) or fast-check (JavaScript), defining test strategies, handling shrinking, or finding edge cases - provides property definition patterns and integration strategies +--- + +# Property-Based Testing + +## Overview + +**Core principle:** Instead of testing specific examples, test properties that should hold for all inputs. + +**Rule:** Property-based tests generate hundreds of inputs automatically. One property test replaces dozens of example tests. + +## Property-Based vs Example-Based Testing + +| Aspect | Example-Based | Property-Based | +|--------|---------------|----------------| +| **Test input** | Hardcoded examples | Generated inputs | +| **Coverage** | Few specific cases | Hundreds of random cases | +| **Maintenance** | Add new examples manually | Properties automatically tested | +| **Edge cases** | Must think of them | Automatically discovered | + +**Example:** + +```python +# Example-based: Test 3 specific inputs +def test_reverse(): + assert reverse([1, 2, 3]) == [3, 2, 1] + assert reverse([]) == [] + assert reverse([1]) == [1] + +# Property-based: Test ALL inputs +@given(lists(integers())) +def test_reverse_property(lst): + # Property: Reversing twice returns original + assert reverse(reverse(lst)) == lst +``` + +--- + +## Tool Selection + +| Language | Tool | Why | +|----------|------|-----| +| **Python** | **Hypothesis** | Most mature, excellent shrinking | +| **JavaScript/TypeScript** | **fast-check** | TypeScript support, good integration | +| **Java** | **jqwik** | JUnit 5 integration | +| **Haskell** | **QuickCheck** | Original property-based testing library | + +**First choice:** Hypothesis (Python) or fast-check (JavaScript) + +--- + +## Basic Property Test (Python + Hypothesis) + +### Installation + +```bash +pip install hypothesis +``` + +--- + +### Example + +```python +from hypothesis import given +from hypothesis.strategies import integers, lists + +def reverse(lst): + """Reverse a list.""" + return lst[::-1] + +@given(lists(integers())) +def test_reverse_twice(lst): + """Property: Reversing twice returns original.""" + assert reverse(reverse(lst)) == lst +``` + +**Run:** +```bash +pytest test_reverse.py +``` + +**Output:** +``` +Trying example: lst=[] +Trying example: lst=[0] +Trying example: lst=[1, -2, 3] +... (100 examples tested) +PASSED +``` + +**If property fails:** +``` +Falsifying example: lst=[0, 0, 1] +``` + +--- + +## Common Properties + +### 1. Inverse Functions + +**Property:** `f(g(x)) == x` + +```python +from hypothesis import given +from hypothesis.strategies import text + +@given(text()) +def test_encode_decode(s): + """Property: Decoding encoded string returns original.""" + assert decode(encode(s)) == s +``` + +--- + +### 2. Idempotence + +**Property:** `f(f(x)) == f(x)` + +```python +@given(lists(integers())) +def test_sort_idempotent(lst): + """Property: Sorting twice gives same result as sorting once.""" + assert sorted(sorted(lst)) == sorted(lst) +``` + +--- + +### 3. 
Invariants + +**Property:** Some fact remains true after operation + +```python +@given(lists(integers())) +def test_reverse_length(lst): + """Property: Reversing doesn't change length.""" + assert len(reverse(lst)) == len(lst) + +@given(lists(integers())) +def test_reverse_elements(lst): + """Property: Reversing doesn't change elements.""" + assert set(reverse(lst)) == set(lst) +``` + +--- + +### 4. Commutativity + +**Property:** `f(x, y) == f(y, x)` + +```python +@given(integers(), integers()) +def test_addition_commutative(a, b): + """Property: Addition is commutative.""" + assert a + b == b + a +``` + +--- + +### 5. Associativity + +**Property:** `f(f(x, y), z) == f(x, f(y, z))` + +```python +@given(integers(), integers(), integers()) +def test_addition_associative(a, b, c): + """Property: Addition is associative.""" + assert (a + b) + c == a + (b + c) +``` + +--- + +## Test Strategies (Generating Inputs) + +### Built-In Strategies + +```python +from hypothesis.strategies import ( + integers, + floats, + text, + lists, + dictionaries, + booleans, +) + +@given(integers()) +def test_with_int(x): + pass + +@given(integers(min_value=0, max_value=100)) +def test_with_bounded_int(x): + pass + +@given(text(min_size=1, max_size=10)) +def test_with_short_string(s): + pass + +@given(lists(integers(), min_size=1)) +def test_with_nonempty_list(lst): + pass +``` + +--- + +### Composite Strategies + +**Generate complex objects:** + +```python +from hypothesis import strategies as st +from hypothesis.strategies import composite + +@composite +def users(draw): + """Generate user objects.""" + return { + "name": draw(st.text(min_size=1, max_size=50)), + "age": draw(st.integers(min_value=0, max_value=120)), + "email": draw(st.emails()), + } + +@given(users()) +def test_user_validation(user): + assert validate_user(user) is True +``` + +--- + +### Filtering Strategies + +**Exclude invalid inputs:** + +```python +@given(integers().filter(lambda x: x != 0)) +def test_division(x): + """Test division (x != 0).""" + assert 10 / x == 10 / x + +# Better: Use assume +from hypothesis import assume + +@given(integers()) +def test_division_better(x): + assume(x != 0) + assert 10 / x == 10 / x +``` + +--- + +## Shrinking (Finding Minimal Failing Example) + +**When a property fails, Hypothesis automatically shrinks the input to the smallest failing case.** + +**Example:** + +```python +@given(lists(integers())) +def test_all_positive(lst): + """Fails if any negative number.""" + assert all(x > 0 for x in lst) +``` + +**Initial failure:** +``` +Falsifying example: lst=[-5, 3, -2, 0, 1, 7, -9] +``` + +**After shrinking:** +``` +Falsifying example: lst=[-1] +``` + +**Why it matters:** Minimal examples are easier to debug + +--- + +## Integration with pytest + +```python +# test_properties.py +from hypothesis import given, settings +from hypothesis.strategies import integers + +@settings(max_examples=1000) # Run 1000 examples (default: 100) +@given(integers(min_value=1)) +def test_factorial_positive(n): + """Property: Factorial of positive number is positive.""" + assert factorial(n) > 0 +``` + +**Run:** +```bash +pytest test_properties.py -v +``` + +--- + +## JavaScript Example (fast-check) + +### Installation + +```bash +npm install --save-dev fast-check +``` + +--- + +### Example + +```javascript +import fc from 'fast-check'; + +function reverse(arr) { + return arr.slice().reverse(); +} + +// Property: Reversing twice returns original +test('reverse twice', () => { + fc.assert( + fc.property(fc.array(fc.integer()), 
(arr) => { + expect(reverse(reverse(arr))).toEqual(arr); + }) + ); +}); +``` + +--- + +## Advanced Patterns + +### Stateful Testing + +**Test state machines:** + +```python +from hypothesis.stateful import RuleBasedStateMachine, rule, initialize + +class QueueMachine(RuleBasedStateMachine): + def __init__(self): + super().__init__() + self.queue = [] + self.model = [] + + @rule(value=integers()) + def enqueue(self, value): + self.queue.append(value) + self.model.append(value) + + @rule() + def dequeue(self): + if self.queue: + actual = self.queue.pop(0) + expected = self.model.pop(0) + assert actual == expected + +TestQueue = QueueMachine.TestCase +``` + +**Finds:** Race conditions, state corruption, invalid state transitions + +--- + +## Anti-Patterns Catalog + +### ❌ Testing Examples, Not Properties + +**Symptom:** Property test with hardcoded checks + +```python +# ❌ BAD: Not a property +@given(integers()) +def test_double(x): + if x == 2: + assert double(x) == 4 + elif x == 3: + assert double(x) == 6 + # This is just example testing! +``` + +**Fix:** Test actual property + +```python +# ✅ GOOD: Real property +@given(integers()) +def test_double(x): + assert double(x) == x * 2 +``` + +--- + +### ❌ Overly Restrictive Assumptions + +**Symptom:** Filtering out most generated inputs + +```python +# ❌ BAD: Rejects 99% of inputs +@given(integers()) +def test_specific_range(x): + assume(x > 1000 and x < 1001) # Only accepts 1 value! + assert process(x) is not None +``` + +**Fix:** Use strategy bounds + +```python +# ✅ GOOD +@given(integers(min_value=1000, max_value=1001)) +def test_specific_range(x): + assert process(x) is not None +``` + +--- + +### ❌ No Assertions + +**Symptom:** Property test that doesn't assert anything + +```python +# ❌ BAD: No assertion +@given(integers()) +def test_no_crash(x): + calculate(x) # Just checks it doesn't crash +``` + +**Fix:** Assert a property + +```python +# ✅ GOOD +@given(integers()) +def test_output_type(x): + result = calculate(x) + assert isinstance(result, int) +``` + +--- + +## CI/CD Integration + +```yaml +# .github/workflows/property-tests.yml +name: Property Tests + +on: [pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install hypothesis pytest + + - name: Run property tests + run: pytest tests/properties/ -v --hypothesis-show-statistics +``` + +--- + +## Quick Reference: Property Patterns + +| Pattern | Example Property | +|---------|------------------| +| **Inverse** | `decode(encode(x)) == x` | +| **Idempotence** | `f(f(x)) == f(x)` | +| **Invariant** | `len(filter(lst, f)) <= len(lst)` | +| **Commutativity** | `add(a, b) == add(b, a)` | +| **Associativity** | `(a + b) + c == a + (b + c)` | +| **Identity** | `x + 0 == x` | +| **Consistency** | `sort(lst)[0] <= sort(lst)[-1]` | + +--- + +## Bottom Line + +**Property-based testing generates hundreds of inputs automatically to test properties that should hold for all inputs. One property test replaces dozens of example tests.** + +**Use for:** +- Pure functions (no side effects) +- Data transformations +- Invariants (sorting, reversing, encoding/decoding) +- State machines + +**Tools:** +- Hypothesis (Python) - most mature +- fast-check (JavaScript) - TypeScript support + +**Process:** +1. Identify property (e.g., "reversing twice returns original") +2. Write property test with generator +3. 
Run test (generates 100-1000 examples) +4. If failure, Hypothesis shrinks to minimal example +5. Fix bug, add regression test + +**If you're writing tests like "assert reverse([1,2,3]) == [3,2,1]" for every possible input, use property-based testing instead. Test the property, not examples.** diff --git a/skills/quality-metrics-and-kpis/SKILL.md b/skills/quality-metrics-and-kpis/SKILL.md new file mode 100644 index 0000000..257edaa --- /dev/null +++ b/skills/quality-metrics-and-kpis/SKILL.md @@ -0,0 +1,448 @@ +--- +name: quality-metrics-and-kpis +description: Use when setting up quality dashboards, defining test coverage targets, tracking quality trends, configuring CI/CD quality gates, or reporting quality metrics to stakeholders - provides metric selection, threshold strategies, and dashboard design patterns +--- + +# Quality Metrics & KPIs + +## Overview + +**Core principle:** Measure what matters. Track trends, not absolutes. Use metrics to drive action, not for vanity. + +**Rule:** Every metric must have a defined threshold and action plan. If a metric doesn't change behavior, stop tracking it. + +## Quality Metrics vs Vanity Metrics + +| Type | Example | Problem | Better Metric | +|------|---------|---------|---------------| +| **Vanity** | "We have 10,000 tests!" | Doesn't indicate quality | Pass rate, flakiness rate | +| **Vanity** | "95% code coverage!" | Can be gamed, doesn't mean tests are good | Coverage delta (new code), mutation score | +| **Actionable** | "Test flakiness: 5% → 2%" | Drives action | Track trend, set target | +| **Actionable** | "P95 build time: 15 min" | Identifies bottleneck | Optimize slow tests | + +**Actionable metrics answer:** "What should I fix next?" + +--- + +## Core Quality Metrics + +### 1. Test Pass Rate + +**Definition:** % of tests that pass on first run + +``` +Pass Rate = (Passing Tests / Total Tests) × 100 +``` + +**Thresholds:** +- **> 98%:** Healthy +- **95-98%:** Investigate failures +- **< 95%:** Critical (tests are unreliable) + +**Why it matters:** Low pass rate means flaky tests or broken code + +**Action:** If < 98%, run flaky-test-prevention skill + +--- + +### 2. Test Flakiness Rate + +**Definition:** % of tests that fail intermittently + +``` +Flakiness Rate = (Flaky Tests / Total Tests) × 100 +``` + +**How to measure:** +```bash +# Run each test 100 times +pytest --count=100 test_checkout.py + +# Flaky if passes 1-99 times (not 0 or 100) +``` + +**Thresholds:** +- **< 1%:** Healthy +- **1-5%:** Moderate (fix soon) +- **> 5%:** Critical (CI is unreliable) + +**Action:** Fix flaky tests before adding new tests + +--- + +### 3. Code Coverage + +**Definition:** % of code lines executed by tests + +``` +Coverage = (Executed Lines / Total Lines) × 100 +``` + +**Thresholds (by test type):** +- **Unit tests:** 80-90% of business logic +- **Integration tests:** 60-70% of integration points +- **E2E tests:** 40-50% of critical paths + +**Configuration (pytest):** +```ini +# .coveragerc +[run] +source = src +omit = */tests/*, */migrations/* + +[report] +fail_under = 80 # Fail if coverage < 80% +show_missing = True +``` + +**Anti-pattern:** 100% coverage as goal + +**Why it's wrong:** Easy to game (tests that execute code without asserting anything) + +**Better metric:** Coverage + mutation score (see mutation-testing skill) + +--- + +### 4. 
Coverage Delta (New Code) + +**Definition:** Coverage of newly added code + +**Why it matters:** More actionable than absolute coverage + +```bash +# Measure coverage on changed files only +pytest --cov=src --cov-report=term-missing \ + $(git diff --name-only origin/main...HEAD | grep '\.py$') +``` + +**Threshold:** 90% for new code (stricter than legacy) + +**Action:** Block PR if new code coverage < 90% + +--- + +### 5. Build Time (CI/CD) + +**Definition:** Time from commit to merge-ready + +**Track by stage:** +- **Lint/format:** < 30s +- **Unit tests:** < 2 min +- **Integration tests:** < 5 min +- **E2E tests:** < 15 min +- **Total PR pipeline:** < 20 min + +**Why it matters:** Slow CI blocks developer productivity + +**Action:** If build > 20 min, see test-automation-architecture for optimization patterns + +--- + +### 6. Test Execution Time Trend + +**Definition:** How test suite duration changes over time + +```python +# Track in CI +import time +import json + +start = time.time() +pytest.main() +duration = time.time() - start + +metrics = {"test_duration_seconds": duration, "timestamp": time.time()} +with open("metrics.json", "w") as f: + json.dump(metrics, f) +``` + +**Threshold:** < 5% growth per month + +**Action:** If growth > 5%/month, parallelize tests or refactor slow tests + +--- + +### 7. Defect Escape Rate + +**Definition:** Bugs found in production that should have been caught by tests + +``` +Defect Escape Rate = (Production Bugs / Total Releases) × 100 +``` + +**Thresholds:** +- **< 2%:** Excellent +- **2-5%:** Acceptable +- **> 5%:** Tests are missing critical scenarios + +**Action:** For each escape, write regression test to prevent recurrence + +--- + +### 8. Mean Time to Detection (MTTD) + +**Definition:** Time from bug introduction to discovery + +``` +MTTD = Deployment Time - Bug Introduction Time +``` + +**Thresholds:** +- **< 1 hour:** Excellent (caught in CI) +- **1-24 hours:** Good (caught in staging/canary) +- **> 24 hours:** Poor (caught in production) + +**Action:** If MTTD > 24h, improve observability (see observability-and-monitoring skill) + +--- + +### 9. 
Mean Time to Recovery (MTTR) + +**Definition:** Time from bug detection to fix deployed + +``` +MTTR = Fix Deployment Time - Bug Detection Time +``` + +**Thresholds:** +- **< 1 hour:** Excellent +- **1-8 hours:** Acceptable +- **> 8 hours:** Poor + +**Action:** If MTTR > 8h, improve rollback procedures (see testing-in-production skill) + +--- + +## Dashboard Design + +### Grafana Dashboard Example + +```yaml +# grafana-dashboard.json +{ + "panels": [ + { + "title": "Test Pass Rate (7 days)", + "targets": [{ + "expr": "sum(tests_passed) / sum(tests_total) * 100" + }], + "thresholds": [ + {"value": 95, "color": "red"}, + {"value": 98, "color": "yellow"}, + {"value": 100, "color": "green"} + ] + }, + { + "title": "Build Time Trend (30 days)", + "targets": [{ + "expr": "avg_over_time(ci_build_duration_seconds[30d])" + }] + }, + { + "title": "Coverage Delta (per PR)", + "targets": [{ + "expr": "coverage_new_code_percent" + }], + "thresholds": [ + {"value": 90, "color": "green"}, + {"value": 80, "color": "yellow"}, + {"value": 0, "color": "red"} + ] + } + ] +} +``` + +--- + +### CI/CD Quality Gates + +**GitHub Actions example:** + +```yaml +# .github/workflows/quality-gates.yml +name: Quality Gates + +on: [pull_request] + +jobs: + quality-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Run tests with coverage + run: pytest --cov=src --cov-report=json + + - name: Check coverage threshold + run: | + COVERAGE=$(jq '.totals.percent_covered' coverage.json) + if (( $(echo "$COVERAGE < 80" | bc -l) )); then + echo "Coverage $COVERAGE% below 80% threshold" + exit 1 + fi + + - name: Check build time + run: | + DURATION=$(jq '.duration' test-results.json) + if (( $(echo "$DURATION > 300" | bc -l) )); then + echo "Build time ${DURATION}s exceeds 5-minute threshold" + exit 1 + fi +``` + +--- + +## Reporting Patterns + +### Weekly Quality Report + +**Template:** + +```markdown +# Quality Report - Week of 2025-01-13 + +## Summary +- **Test pass rate:** 98.5% (+0.5% from last week) +- **Flakiness rate:** 2.1% (-1.3% from last week) ✅ +- **Coverage:** 85.2% (+2.1% from last week) ✅ +- **Build time:** 18 min (-2 min from last week) ✅ + +## Actions Taken +- Fixed 8 flaky tests in checkout flow +- Added integration tests for payment service (+5% coverage) +- Parallelized slow E2E tests (reduced build time by 2 min) + +## Action Items +- [ ] Fix remaining 3 flaky tests in user registration +- [ ] Increase coverage of order service (currently 72%) +- [ ] Investigate why staging MTTD increased to 4 hours +``` + +--- + +### Stakeholder Dashboard (Executive View) + +**Metrics to show:** +1. **Quality trend (6 months):** Pass rate over time +2. **Velocity impact:** How long does CI take per PR? +3. **Production stability:** Defect escape rate +4. **Recovery time:** MTTR for incidents + +**What NOT to show:** +- Absolute test count (vanity metric) +- Lines of code (meaningless) +- Individual developer metrics (creates wrong incentives) + +--- + +## Anti-Patterns Catalog + +### ❌ Coverage as the Only Metric + +**Symptom:** "We need 100% coverage!" + +**Why bad:** Easy to game with meaningless tests + +```python +# ❌ BAD: 100% coverage, 0% value +def calculate_tax(amount): + return amount * 0.08 + +def test_calculate_tax(): + calculate_tax(100) # Executes code, asserts nothing! 
+``` + +**Fix:** Use coverage + mutation score + +--- + +### ❌ Tracking Metrics Without Thresholds + +**Symptom:** Dashboard shows metrics but no action taken + +**Why bad:** Metrics become noise + +**Fix:** Every metric needs: +- **Target threshold** (e.g., flakiness < 1%) +- **Alert level** (e.g., alert if flakiness > 5%) +- **Action plan** (e.g., "Fix flaky tests before adding new features") + +--- + +### ❌ Optimizing for Metrics, Not Quality + +**Symptom:** Gaming metrics to hit targets + +**Example:** Removing tests to increase pass rate + +**Fix:** Track multiple complementary metrics (pass rate + flakiness + coverage) + +--- + +### ❌ Measuring Individual Developer Productivity + +**Symptom:** "Developer A writes more tests than Developer B" + +**Why bad:** Creates wrong incentives (quantity over quality) + +**Fix:** Measure team metrics, not individual + +--- + +## Tool Integration + +### SonarQube Metrics + +**Quality Gate:** +```properties +# sonar-project.properties +sonar.qualitygate.wait=true + +# Metrics tracked: +# - Bugs (target: 0) +# - Vulnerabilities (target: 0) +# - Code smells (target: < 100) +# - Coverage (target: > 80%) +# - Duplications (target: < 3%) +``` + +--- + +### Codecov Integration + +```yaml +# codecov.yml +coverage: + status: + project: + default: + target: 80% # Overall coverage target + threshold: 2% # Allow 2% drop + + patch: + default: + target: 90% # New code must have 90% coverage + threshold: 0% # No drops allowed +``` + +--- + +## Bottom Line + +**Track actionable metrics with defined thresholds. Use metrics to drive improvement, not for vanity.** + +**Core dashboard:** +- Test pass rate (> 98%) +- Flakiness rate (< 1%) +- Coverage delta on new code (> 90%) +- Build time (< 20 min) +- Defect escape rate (< 2%) + +**Weekly actions:** +- Review metrics against thresholds +- Identify trends (improving/degrading) +- Create action items for violations +- Track progress on improvements + +**If you're tracking a metric but not acting on it, stop tracking it. Metrics exist to drive action, not to fill dashboards.** diff --git a/skills/static-analysis-integration/SKILL.md b/skills/static-analysis-integration/SKILL.md new file mode 100644 index 0000000..81f0d8a --- /dev/null +++ b/skills/static-analysis-integration/SKILL.md @@ -0,0 +1,521 @@ +--- +name: static-analysis-integration +description: Use when integrating SAST tools (SonarQube, ESLint, Pylint, Checkstyle), setting up security scanning, configuring code quality gates, managing false positives, or building CI/CD quality pipelines - provides tool selection, configuration patterns, and quality threshold strategies +--- + +# Static Analysis Integration + +## Overview + +**Core principle:** Static analysis catches bugs,security vulnerabilities, and code quality issues before code review. Automate it in CI/CD. + +**Rule:** Block merges on critical issues, warn on moderate issues, ignore noise. Configure thresholds carefully. 
+ +## Static Analysis vs Other Quality Checks + +| Check Type | When | What It Finds | Speed | +|------------|------|---------------|-------| +| **Static Analysis** | Pre-commit/PR | Bugs, security, style | Fast (seconds) | +| **Unit Tests** | Every commit | Logic errors | Fast (seconds) | +| **Integration Tests** | PR | Integration bugs | Medium (minutes) | +| **Security Scanning** | PR/Nightly | Dependencies, secrets | Medium (minutes) | +| **Manual Code Review** | PR | Design, readability | Slow (hours) | + +**Static analysis finds:** Null pointer bugs, SQL injection, unused variables, complexity issues + +**Static analysis does NOT find:** Business logic errors, performance issues (use profiling) + +--- + +## Tool Selection by Language + +### Python + +| Tool | Purpose | When to Use | +|------|---------|-------------| +| **Pylint** | Code quality, style, bugs | General-purpose, comprehensive | +| **Flake8** | Style, simple bugs | Faster than Pylint, less strict | +| **mypy** | Type checking | Type-safe codebases | +| **Bandit** | Security vulnerabilities | Security-critical code | +| **Black** | Code formatting | Enforce consistent style | + +**Recommended combo:** Black (formatting) + Flake8 (linting) + mypy (types) + Bandit (security) + +--- + +### JavaScript/TypeScript + +| Tool | Purpose | When to Use | +|------|---------|-------------| +| **ESLint** | Code quality, style, bugs | All JavaScript projects | +| **TypeScript** | Type checking | Type-safe codebases | +| **Prettier** | Code formatting | Enforce consistent style | +| **SonarQube** | Security, bugs, code smells | Enterprise, comprehensive | + +**Recommended combo:** Prettier (formatting) + ESLint (linting) + TypeScript (types) + +--- + +### Java + +| Tool | Purpose | When to Use | +|------|---------|-------------| +| **Checkstyle** | Code style | Enforce coding standards | +| **PMD** | Bug detection, code smells | General-purpose | +| **SpotBugs** | Bug detection | Bytecode analysis | +| **SonarQube** | Comprehensive analysis | Enterprise, dashboards | + +**Recommended combo:** Checkstyle (style) + SpotBugs (bugs) + SonarQube (comprehensive) + +--- + +## Configuration Patterns + +### ESLint Configuration (JavaScript) + +```javascript +// .eslintrc.js +module.exports = { + extends: [ + 'eslint:recommended', + 'plugin:@typescript-eslint/recommended', + 'plugin:security/recommended' + ], + rules: { + // Error: Block merge + 'no-console': 'error', + 'no-debugger': 'error', + '@typescript-eslint/no-explicit-any': 'error', + + // Warning: Allow merge, but warn + 'complexity': ['warn', 10], + 'max-lines': ['warn', 500], + + // Off: Too noisy + 'no-unused-vars': 'off', // TypeScript handles this + } +}; +``` + +**Run in CI:** +```bash +eslint src/ --max-warnings 0 # Fail if any warnings +``` + +--- + +### Pylint Configuration (Python) + +```ini +# .pylintrc +[MESSAGES CONTROL] +disable= + missing-docstring, # Too noisy for small projects + too-few-public-methods, # Design choice + logging-fstring-interpolation # False positives + +[DESIGN] +max-line-length=100 +max-args=7 +max-locals=15 + +[BASIC] +good-names=i,j,k,_,id,db,pk +``` + +**Run in CI:** +```bash +pylint src/ --fail-under=8.0 # Minimum score 8.0/10 +``` + +--- + +### SonarQube Quality Gates + +```yaml +# sonar-project.properties +sonar.projectKey=my-project +sonar.sources=src +sonar.tests=tests + +# Quality gate thresholds +sonar.qualitygate.wait=true +sonar.coverage.exclusions=**/*_test.py,**/migrations/** + +# Fail conditions +sonar.qualitygate.timeout=300 +``` + 
+**Quality Gate Criteria:** +- **Blocker/Critical issues:** 0 (block merge) +- **Major issues:** < 5 (block merge) +- **Code coverage:** > 80% (warn if lower) +- **Duplicated lines:** < 3% +- **Maintainability rating:** A or B + +--- + +## CI/CD Integration + +### GitHub Actions (Python) + +```yaml +# .github/workflows/static-analysis.yml +name: Static Analysis + +on: [pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + pip install pylint flake8 mypy bandit black + + - name: Check formatting + run: black --check src/ + + - name: Run Flake8 + run: flake8 src/ --max-line-length=100 + + - name: Run Pylint + run: pylint src/ --fail-under=8.0 + + - name: Run mypy + run: mypy src/ --strict + + - name: Run Bandit (security) + run: bandit -r src/ -ll # Only high severity +``` + +--- + +### GitHub Actions (JavaScript) + +```yaml +# .github/workflows/static-analysis.yml +name: Static Analysis + +on: [pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Node + uses: actions/setup-node@v3 + with: + node-version: '18' + + - name: Install dependencies + run: npm ci + + - name: Check formatting + run: npm run format:check # prettier --check + + - name: Run ESLint + run: npm run lint # eslint --max-warnings 0 + + - name: Run TypeScript + run: npm run typecheck # tsc --noEmit +``` + +--- + +## Managing False Positives + +**Strategy: Suppress selectively, document why** + +### Inline Suppression (ESLint) + +```javascript +// eslint-disable-next-line no-console +console.log("Debugging production issue"); // TODO: Remove after fix + +// Better: Explain WHY +// eslint-disable-next-line @typescript-eslint/no-explicit-any +const legacyData: any = externalLibrary.getData(); // Library has no types +``` + +--- + +### File-Level Suppression (Pylint) + +```python +# pylint: disable=too-many-arguments +def complex_function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8): + """Legacy API - cannot change signature for backward compatibility.""" + pass +``` + +--- + +### Configuration Suppression + +```ini +# .pylintrc +[MESSAGES CONTROL] +disable= + fixme, # Allow TODO comments + missing-docstring # Too noisy for this codebase +``` + +**Rule:** Every suppression needs a comment explaining WHY. 
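+
+The same single-line convention works in Python; the function below is purely illustrative:
+
+```python
+# WHY: the framework requires this exact callback signature; "context" is unused here.
+def on_event(event, context):  # pylint: disable=unused-argument
+    return {"status": "ok", "event_id": event["id"]}
+```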
+ +--- + +## Security-Focused Static Analysis + +### Bandit (Python Security) + +```yaml +# .bandit.yml +exclude_dirs: + - /tests + - /migrations + +tests: + - B201 # Flask debug mode + - B601 # Parameterized shell calls + - B602 # Shell injection + - B608 # SQL injection +``` + +**Run:** +```bash +bandit -r src/ -ll -x tests/ # Only high/medium severity +``` + +--- + +### ESLint Security Plugin (JavaScript) + +```javascript +// .eslintrc.js +module.exports = { + plugins: ['security'], + extends: ['plugin:security/recommended'], + rules: { + 'security/detect-object-injection': 'error', + 'security/detect-non-literal-regexp': 'warn', + 'security/detect-unsafe-regex': 'error' + } +}; +``` + +--- + +## Code Quality Metrics + +### Complexity Analysis + +**Cyclomatic complexity:** Measures decision paths through code + +```python +# Simple function: Complexity = 1 +def add(a, b): + return a + b + +# Complex function: Complexity = 5 (if/elif/else = 4 paths + 1 base) +def process_order(order): + if order.status == "pending": + return validate(order) + elif order.status == "confirmed": + return ship(order) + elif order.status == "cancelled": + return refund(order) + else: + return reject(order) +``` + +**Threshold:** +- **< 10:** Acceptable +- **10-20:** Consider refactoring +- **> 20:** Must refactor (untestable) + +**Configure:** +```ini +# Pylint +[DESIGN] +max-complexity=10 + +# ESLint +complexity: ['warn', 10] +``` + +--- + +### Duplication Detection + +**SonarQube duplication threshold:** < 3% + +**Find duplicates (Python):** +```bash +pylint src/ --disable=all --enable=duplicate-code +``` + +**Find duplicates (JavaScript):** +```bash +jscpd src/ # JavaScript Copy/Paste Detector +``` + +--- + +## Anti-Patterns Catalog + +### ❌ Suppressing All Warnings + +**Symptom:** Config disables most rules + +```javascript +// ❌ BAD +module.exports = { + rules: { + 'no-console': 'off', + 'no-debugger': 'off', + '@typescript-eslint/no-explicit-any': 'off', + // ... 50 more disabled rules + } +}; +``` + +**Why bad:** Static analysis becomes useless + +**Fix:** Address root causes, suppress selectively + +--- + +###❌ No Quality Gates + +**Symptom:** Static analysis runs but doesn't block merges + +```yaml +# ❌ BAD: Linting failures don't block merge +- name: Run ESLint + run: eslint src/ || true # Always succeeds! +``` + +**Fix:** Fail CI on critical issues + +```yaml +# ✅ GOOD +- name: Run ESLint + run: eslint src/ --max-warnings 0 +``` + +--- + +### ❌ Ignoring Security Warnings + +**Symptom:** Security findings marked as false positives without investigation + +```python +# ❌ BAD +cursor.execute(f"SELECT * FROM users WHERE id = {user_id}") # nosec +``` + +**Why bad:** Real SQL injection vulnerability ignored + +**Fix:** Fix the issue, don't suppress + +```python +# ✅ GOOD +cursor.execute("SELECT * FROM users WHERE id = %s", (user_id,)) +``` + +--- + +### ❌ Running Static Analysis Only on Main Branch + +**Symptom:** Issues discovered after merge + +**Fix:** Run on every PR + +```yaml +on: [pull_request] # Not just 'push' to main +``` + +--- + +## Quality Dashboard Setup + +### SonarQube Dashboard + +**Key metrics to track:** +1. **Bugs:** Code issues likely to cause failures +2. **Vulnerabilities:** Security issues +3. **Code Smells:** Maintainability issues +4. **Coverage:** Test coverage % +5. 
**Duplications:** Duplicated code blocks + +**Quality Gate Example:** +- Bugs (Blocker/Critical): **0** +- Vulnerabilities (Blocker/Critical): **0** +- Code Smells (Blocker/Critical): **< 5** +- Coverage on new code: **> 80%** +- Duplicated lines on new code: **< 3%** + +--- + +## Gradual Adoption Strategy + +**For legacy codebases with thousands of issues:** + +### Phase 1: Baseline (Week 1) +```bash +# Run analysis, capture current state +pylint src/ > baseline.txt + +# Configure to only fail on NEW issues +# (Track baseline, don't enforce on old code) +``` + +--- + +### Phase 2: Block New Issues (Week 2) +```yaml +# Block PRs that introduce NEW issues +- name: Run incremental lint + run: | + pylint $(git diff --name-only origin/main...HEAD | grep '\.py$') --fail-under=8.0 +``` + +--- + +### Phase 3: Fix High-Priority Old Issues (Weeks 3-8) +- Security vulnerabilities first +- Bugs second +- Code smells third + +--- + +### Phase 4: Full Enforcement (Week 9+) +```yaml +# Enforce on entire codebase +- name: Run lint + run: pylint src/ --fail-under=8.0 +``` + +--- + +## Bottom Line + +**Static analysis catches bugs and security issues before code review. Automate it in CI/CD with quality gates.** + +- Choose tools for your language: ESLint (JS), Pylint (Python), Checkstyle (Java) +- Configure thresholds: Block critical issues, warn on moderate, ignore noise +- Run on every PR, fail CI on violations +- Manage false positives selectively with documented suppressions +- Track quality metrics: complexity, duplication, coverage + +**If static analysis isn't blocking merges, you're just generating reports nobody reads. Use quality gates.** diff --git a/skills/test-automation-architecture/SKILL.md b/skills/test-automation-architecture/SKILL.md new file mode 100644 index 0000000..d67d07a --- /dev/null +++ b/skills/test-automation-architecture/SKILL.md @@ -0,0 +1,255 @@ +--- +name: test-automation-architecture +description: Use when organizing test suites, setting up CI/CD testing pipelines, choosing test levels (unit vs integration vs E2E), fixing slow CI feedback, or migrating from inverted test pyramid - provides test pyramid guidance and anti-patterns +--- + +# Test Automation Architecture + +## Overview + +**Core principle:** Test pyramid - many fast unit tests, fewer integration tests, fewest E2E tests. + +**Target distribution:** 70% unit, 20% integration, 10% E2E + +**Flexibility:** Ratios can vary based on constraints (e.g., 80/15/5 if E2E infrastructure is expensive, 60/30/10 for microservices). Key is maintaining pyramid shape - more unit than integration than E2E. + +**Starting from zero tests:** Don't try to reach target distribution immediately. Start with unit tests only (Phase 1), add integration (Phase 2), add E2E last (Phase 3). Distribute organically over 6-12 months. + +## Test Pyramid Quick Reference + +| Test Level | Purpose | Speed | When to Use | +|------------|---------|-------|-------------| +| **Unit** | Test individual functions/methods in isolation | Milliseconds | Business logic, utilities, calculations, error handling | +| **Integration** | Test components working together | Seconds | API contracts, database operations, service interactions | +| **E2E** | Test full user workflows through UI | Minutes | Critical user journeys, revenue flows, compliance paths | + +**Rule:** If you can test it at a lower level, do that instead. 
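+
+One way to keep the distribution honest is to count collected tests per level. A rough sketch, assuming tests carry `unit`, `integration`, and `e2e` markers (a project convention, not a pytest built-in):
+
+```python
+# pyramid_check.py - rough sketch; relies on level markers being applied consistently
+import subprocess
+
+def count_tests(marker: str) -> int:
+    out = subprocess.run(
+        ["pytest", "--collect-only", "-q", "-m", marker],
+        capture_output=True, text=True,
+    ).stdout
+    return sum(1 for line in out.splitlines() if "::" in line)
+
+counts = {m: count_tests(m) for m in ("unit", "integration", "e2e")}
+print(counts)
+
+# Pyramid shape: more unit than integration, more integration than E2E
+assert counts["unit"] >= counts["integration"] >= counts["e2e"], "Test pyramid looks inverted"
+```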
+ +## Test Level Selection Guide + +| What You're Testing | Test Level | Why | +|---------------------|-----------|-----| +| Function returns correct value | Unit | No external dependencies | +| API endpoint response format | Integration | Tests API contract, not full workflow | +| Database query performance | Integration | Tests DB interaction, not UI | +| User signup → payment flow | E2E | Crosses multiple systems, critical revenue | +| Form validation logic | Unit | Pure function, no UI needed | +| Service A calls Service B correctly | Integration | Tests contract, not user workflow | +| Button click updates state | Unit | Component behavior, no backend | +| Multi-step checkout process | E2E | Critical user journey, revenue impact | + +**Guideline:** Unit tests verify "did I build it right?", E2E tests verify "did I build the right thing?" + +## Anti-Patterns Catalog + +### ❌ Inverted Pyramid +**Symptom:** 500 E2E tests, 100 unit tests + +**Why bad:** Slow CI (30min+), brittle tests, hard to debug, expensive maintenance + +**Fix:** Migrate 70% of E2E tests down to unit/integration. Use Migration Strategy below. + +--- + +### ❌ All Tests on Every Commit +**Symptom:** Running full 30-minute test suite on every PR + +**Why bad:** Slow feedback kills productivity, wastes CI resources + +**Fix:** Progressive testing - unit tests on PR, integration on merge, E2E nightly/weekly + +--- + +### ❌ No Test Categorization +**Symptom:** All tests in one folder, one command, one 30-minute run + +**Why bad:** Can't run subsets, no fail-fast, poor organization + +**Fix:** Separate by level (unit/, integration/, e2e/) with independent configs + +--- + +### ❌ Slow CI Feedback Loop +**Symptom:** Waiting 20+ minutes for test results on every commit + +**Why bad:** Context switching, delayed bug detection, reduced productivity + +**Fix:** Fail fast - run fastest tests first, parallelize, cache dependencies + +--- + +### ❌ No Fail Fast +**Symptom:** Running all 500 tests even after first test fails + +**Why bad:** Wastes CI time, delays feedback + +**Fix:** Configure test runner to stop on first failure in CI (not locally) + +## CI/CD Pipeline Patterns + +| Event | Run These Tests | Duration Target | Why | +|-------|----------------|-----------------|-----| +| **Every Commit (Pre-Push)** | Lint + unit tests | < 5 min | Fast local feedback | +| **Pull Request** | Lint + unit + integration | < 15 min | Gate before merge, balance speed/coverage | +| **Merge to Main** | All tests (unit + integration + E2E) | < 30 min | Full validation before deployment | +| **Nightly/Scheduled** | Full suite + performance tests | < 60 min | Catch regressions, performance drift | +| **Pre-Deployment** | Smoke tests only (5-10 critical E2E) | < 5 min | Fast production validation | + +**Progressive complexity:** Start with just unit tests on PR, add integration after mastering that, add E2E last. 
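+
+The "No Test Categorization" fix above can be automated once the folders exist (see the layouts in the next section). A minimal conftest.py sketch - the directory names are assumptions matching the Basic layout below - tags each test by location so CI can select levels with `-m`:
+
+```python
+# conftest.py - auto-mark tests by directory (register the markers in pytest.ini to silence warnings)
+import pytest
+
+def pytest_collection_modifyitems(items):
+    for item in items:
+        path = str(item.fspath)
+        if "/tests/unit/" in path:
+            item.add_marker(pytest.mark.unit)
+        elif "/tests/integration/" in path:
+            item.add_marker(pytest.mark.integration)
+        elif "/tests/e2e/" in path:
+            item.add_marker(pytest.mark.e2e)
+```
+
+With that in place, `pytest -m "unit or integration"` covers the PR row of the table above, and the unfiltered run covers merge to main.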
+ +## Folder Structure Patterns + +### Basic (Small Projects) +``` +tests/ +├── unit/ +├── integration/ +└── e2e/ +``` + +### Mirrored (Medium Projects) +``` +src/ +├── components/ +├── services/ +└── utils/ +tests/ +├── unit/ +│ ├── components/ +│ ├── services/ +│ └── utils/ +├── integration/ +└── e2e/ +``` + +### Feature-Based (Large Projects) +``` +features/ +├── auth/ +│ ├── src/ +│ └── tests/ +│ ├── unit/ +│ ├── integration/ +│ └── e2e/ +└── payment/ + ├── src/ + └── tests/ +``` + +**Choose based on:** Team size (<5: Basic, 5-20: Mirrored, 20+: Feature-Based) + +## Migration Strategy (Fixing Inverted Pyramid) + +If you have 500 E2E tests and 100 unit tests: + +**Week 1-2: Audit** +- [ ] Categorize each E2E test: Critical (keep) vs Redundant (migrate) +- [ ] Identify 10-20 critical user journeys +- [ ] Target: Keep 50-100 E2E tests maximum + +**Week 3-4: Move High-Value Tests Down** +- [ ] Convert 200 E2E tests → integration tests (test API/services without UI) +- [ ] Convert 100 E2E tests → unit tests (pure logic tests) +- [ ] Delete 100 truly redundant E2E tests + +**Week 5-6: Build Unit Test Coverage** +- [ ] Add 200-300 unit tests for untested business logic +- [ ] Target: 400+ unit tests total + +**Week 7-8: Reorganize** +- [ ] Split tests into folders (unit/, integration/, e2e/) +- [ ] Create separate test configs +- [ ] Update CI to run progressively + +**Expected result:** 400 unit, 200 integration, 100 E2E (~70/20/10 distribution) + +## Your First CI Pipeline + +**Start simple, add complexity progressively:** + +**Phase 1 (Week 1):** Unit tests only +```yaml +on: [pull_request] +jobs: + test: + runs-on: ubuntu-latest + steps: + - run: npm run test:unit +``` + +**Phase 2 (Week 2-3):** Add lint + integration +```yaml +jobs: + lint: + runs-on: ubuntu-latest + steps: + - run: npm run lint + + test: + needs: lint + runs-on: ubuntu-latest + steps: + - run: npm run test:unit + - run: npm run test:integration +``` + +**Phase 3 (Week 4+):** Add E2E on main branch +```yaml +jobs: + e2e: + if: github.ref == 'refs/heads/main' + needs: [lint, test] + runs-on: ubuntu-latest + steps: + - run: npm run test:e2e +``` + +**Don't start with full complexity** - master each phase before adding next. + +## Common Mistakes + +### ❌ Testing Everything at E2E Level +**Fix:** Use Test Level Selection Guide above. Most tests belong at unit level. + +--- + +### ❌ No Parallel Execution +**Symptom:** Tests run sequentially, taking 30min when they could run in 10min + +**Fix:** Run independent test suites in parallel (unit + lint simultaneously) + +--- + +### ❌ No Caching +**Symptom:** Re-downloading dependencies on every CI run (5min wasted) + +**Fix:** Cache node_modules, .m2, .gradle based on lock file hash + +## Quick Reference + +**Test Distribution Target:** +- 70% unit tests (fast, isolated) +- 20% integration tests (component interaction) +- 10% E2E tests (critical user journeys) + +**CI Pipeline Events:** +- PR: unit + integration (< 15min) +- Main: all tests (< 30min) +- Deploy: smoke tests only (< 5min) + +**Folder Organization:** +- Small team: tests/unit, tests/integration, tests/e2e +- Large team: feature-based with embedded test folders + +**Migration Path:** +1. Audit E2E tests +2. Move 70% down to unit/integration +3. Add missing unit tests +4. Reorganize folders +5. Update CI pipeline + +## Bottom Line + +**Many fast tests beat few slow tests.** + +Test pyramid exists because it balances confidence (E2E) with speed (unit). Organize tests by level, run progressively in CI, fail fast. 
diff --git a/skills/test-data-management/SKILL.md b/skills/test-data-management/SKILL.md new file mode 100644 index 0000000..7db447e --- /dev/null +++ b/skills/test-data-management/SKILL.md @@ -0,0 +1,419 @@ +--- +name: test-data-management +description: Use when fixing flaky tests from data pollution, choosing between fixtures and factories, setting up test data isolation, handling PII in tests, or seeding test databases - provides isolation strategies and anti-patterns +--- + +# Test Data Management + +## Overview + +**Core principle:** Test isolation first. Each test should work independently regardless of execution order. + +**Rule:** Never use production data in tests without anonymization. + +## Test Isolation Decision Tree + +| Symptom | Root Cause | Solution | +|---------|------------|----------| +| Tests pass alone, fail together | Shared database state | Use transactions with rollback | +| Tests fail intermittently | Race conditions on shared data | Use unique IDs per test | +| Tests leave data behind | No cleanup | Add explicit teardown fixtures | +| Slow test setup/teardown | Creating too much data | Use factories, minimal data | +| Can't reproduce failures | Non-deterministic data | Use fixtures with static data | + +**Primary strategy:** Database transactions (wrap test in transaction, rollback after). Fastest and most reliable. + +## Fixtures vs Factories Quick Guide + +| Use Fixtures (Static Files) | Use Factories (Code Generators) | +|------------------------------|----------------------------------| +| Integration/contract tests | Unit tests | +| Realistic complex scenarios | Need many variations | +| Specific edge cases to verify | Simple "valid object" needed | +| Team needs to review data | Randomized/parameterized tests | +| Data rarely changes | Frequent maintenance | + +**Decision:** Static, complex, reviewable → Fixtures. Dynamic, simple, variations → Factories. + +**Hybrid (recommended):** Fixtures for integration tests, factories for unit tests. 
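+
+A minimal illustration of the hybrid approach - the fixture path and factory fields are placeholders, not project conventions:
+
+```python
+import json
+from pathlib import Path
+from uuid import uuid4
+
+# Fixture: static, reviewable data for an integration test
+def load_fixture(name: str) -> dict:
+    return json.loads(Path("tests/fixtures", f"{name}.json").read_text())
+
+# Factory: generated, varied data for a unit test
+def make_user(**overrides) -> dict:
+    defaults = {"id": str(uuid4()), "email": f"user-{uuid4().hex[:8]}@example.com"}
+    return {**defaults, **overrides}
+```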
+ +## Anti-Patterns Catalog + +### ❌ Shared Test Data +**Symptom:** All tests use same "test_user_123" in database + +**Why bad:** Tests pollute each other, fail when run in parallel, can't isolate failures + +**Fix:** Each test creates its own data with unique IDs or uses transactions + +--- + +### ❌ No Cleanup Strategy +**Symptom:** Database grows with every test run, tests fail on second run + +**Why bad:** Leftover data causes unique constraint violations, flaky tests + +**Fix:** Use transaction rollback or explicit teardown fixtures + +--- + +### ❌ Production Data in Tests +**Symptom:** Copying production database to test environment + +**Why bad:** Privacy violations (GDPR, CCPA), security risk, compliance issues + +**Fix:** Use synthetic data generation or anonymized/masked data + +--- + +### ❌ Hardcoded Test Data +**Symptom:** Every test creates `User(name="John", email="john@test.com")` + +**Why bad:** Violates DRY, maintenance nightmare when schema changes, no variations + +**Fix:** Use factories to generate test data programmatically + +--- + +### ❌ Copy-Paste Fixtures +**Symptom:** 50 nearly-identical JSON fixture files + +**Why bad:** Hard to maintain, changes require updating all copies + +**Fix:** Use fixture inheritance or factory-generated fixtures + +## Isolation Strategies Quick Reference + +| Strategy | Speed | Use When | Pros | Cons | +|----------|-------|----------|------|------| +| **Transactions (Rollback)** | Fast | Database tests | No cleanup code, bulletproof | DB only | +| **Unique IDs (UUID/timestamp)** | Fast | Parallel tests, external APIs | No conflicts | Still needs cleanup | +| **Explicit Cleanup (Teardown)** | Medium | Files, caches, APIs | Works for anything | Manual code | +| **In-Memory Database** | Fastest | Unit tests | Complete isolation | Not production-like | +| **Test Containers** | Medium | Integration tests | Production-like | Slower startup | + +**Recommended order:** Try transactions first, add unique IDs for parallelization, explicit cleanup as last resort. + +## Data Privacy Quick Guide + +| Data Type | Strategy | Why | +|-----------|----------|-----| +| **PII (names, emails, addresses)** | Synthetic generation (Faker) | Avoid legal risk | +| **Payment data** | NEVER use production | PCI-DSS compliance | +| **Health data** | Anonymize + subset | HIPAA compliance | +| **Sensitive business data** | Mask or synthesize | Protect IP | +| **Non-sensitive metadata** | Can use production | ID mappings, timestamps OK if no PII | + +**Default rule:** When in doubt, use synthetic data. + +## Your First Test Data Setup + +**Start minimal, add complexity only when needed:** + +**Phase 1: Transactions (Week 1)** +```python +@pytest.fixture +def db_session(db_engine): + connection = db_engine.connect() + transaction = connection.begin() + session = Session(bind=connection) + + yield session + + transaction.rollback() + connection.close() +``` + +**Phase 2: Add Factories (Week 2)** +```python +class UserFactory: + @staticmethod + def create(**overrides): + defaults = { + "id": str(uuid4()), + "email": f"test_{uuid4()}@example.com", + "created_at": datetime.now() + } + return {**defaults, **overrides} +``` + +**Phase 3: Add Fixtures for Complex Cases (Week 3+)** +```json +// tests/fixtures/valid_invoice.json +{ + "id": "inv-001", + "items": [/* complex nested data */], + "total": 107.94 +} +``` + +**Don't start with full complexity.** Master transactions first. 
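+
+The Data Privacy guide above defaults to synthetic data; a minimal sketch using the Faker library (assuming `faker` is installed):
+
+```python
+from faker import Faker
+
+fake = Faker()
+
+def synthetic_person(**overrides) -> dict:
+    """Realistic-looking but entirely fake PII - safe to load into test databases."""
+    record = {
+        "name": fake.name(),
+        "email": fake.email(),
+        "address": fake.address(),
+        "phone": fake.phone_number(),
+    }
+    return {**record, **overrides}
+```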
+ +## Non-Database Resource Isolation + +Database transactions don't work for files, caches, message queues, or external services. Use **explicit cleanup with unique namespacing**. + +### Temporary Files Strategy + +**Recommended:** Python's `tempfile` module (automatic cleanup) + +```python +import tempfile +from pathlib import Path + +@pytest.fixture +def temp_workspace(): + """Isolated temporary directory for test""" + with tempfile.TemporaryDirectory(prefix="test_") as tmp_dir: + yield Path(tmp_dir) + # Automatic cleanup on exit +``` + +**Alternative (manual control):** +```python +from uuid import uuid4 +import shutil + +@pytest.fixture +def temp_dir(): + test_dir = Path(f"/tmp/test_{uuid4()}") + test_dir.mkdir(parents=True) + + yield test_dir + + shutil.rmtree(test_dir, ignore_errors=True) +``` + +### Redis/Cache Isolation Strategy + +**Option 1: Unique key namespace per test (lightweight)** + +```python +@pytest.fixture +def redis_namespace(redis_client): + """Namespaced Redis keys with automatic cleanup""" + namespace = f"test:{uuid4()}" + + yield namespace + + # Cleanup: Delete all keys with this namespace + for key in redis_client.scan_iter(f"{namespace}:*"): + redis_client.delete(key) + +def test_caching(redis_namespace, redis_client): + key = f"{redis_namespace}:user:123" + redis_client.set(key, "value") + # Automatic cleanup after test +``` + +**Option 2: Separate Redis database per test (stronger isolation)** + +```python +@pytest.fixture +def isolated_redis(): + """Use Redis DB 1-15 for tests (DB 0 for dev)""" + import random + test_db = random.randint(1, 15) + client = redis.Redis(db=test_db) + + yield client + + client.flushdb() # Clear entire test database +``` + +**Option 3: Test containers (best isolation, slower)** + +```python +from testcontainers.redis import RedisContainer + +@pytest.fixture(scope="session") +def redis_container(): + with RedisContainer() as container: + yield container + +@pytest.fixture +def redis_client(redis_container): + client = redis.from_url(redis_container.get_connection_url()) + yield client + client.flushdb() +``` + +### Combined Resource Cleanup + +When tests use database + files + cache: + +```python +@pytest.fixture +def isolated_test_env(db_session, temp_workspace, redis_namespace): + """Combined isolation for all resources""" + yield { + "db": db_session, + "files": temp_workspace, + "cache_ns": redis_namespace + } + # Teardown automatic via dependent fixtures + # Order: External resources first, DB last +``` + +### Quick Decision Guide + +| Resource Type | Isolation Strategy | Cleanup Method | +|---------------|-------------------|----------------| +| **Temporary files** | Unique directory per test | `tempfile.TemporaryDirectory()` | +| **Redis cache** | Unique key namespace | Delete by pattern in teardown | +| **Message queues** | Unique queue name | Delete queue in teardown | +| **External APIs** | Unique resource IDs | DELETE requests in teardown | +| **Test containers** | Per-test container | Container auto-cleanup | + +**Rule:** If transactions don't work, use unique IDs + explicit cleanup. + +## Test Containers Pattern + +**Core principle:** Session-scoped container + transaction rollback per test. + +**Don't recreate containers per test** - startup overhead kills performance. 
+ +### SQL Database Containers (PostgreSQL, MySQL) + +**Recommended:** Session-scoped container + transactional fixtures + +```python +from testcontainers.postgres import PostgresContainer +import pytest + +@pytest.fixture(scope="session") +def postgres_container(): + """Container lives for entire test run""" + with PostgresContainer("postgres:15") as container: + yield container + # Auto-cleanup after all tests + +@pytest.fixture +def db_session(postgres_container): + """Transaction per test - fast isolation""" + engine = create_engine(postgres_container.get_connection_url()) + connection = engine.connect() + transaction = connection.begin() + session = Session(bind=connection) + + yield session + + transaction.rollback() # <1ms cleanup + connection.close() +``` + +**Performance:** +- Container startup: 5-10 seconds (once per test run) +- Transaction rollback: <1ms per test +- 100 tests: ~10 seconds total vs 8-16 minutes if recreating container per test + +**When to recreate container:** +- Testing database migrations (need clean schema each time) +- Testing database extensions/configuration changes +- Container state itself is under test + +**For data isolation:** Transactions within shared container always win. + +### NoSQL/Cache Containers (Redis, MongoDB) + +Use session-scoped container + flush per test: + +```python +from testcontainers.redis import RedisContainer + +@pytest.fixture(scope="session") +def redis_container(): + """Container lives for entire test run""" + with RedisContainer() as container: + yield container + +@pytest.fixture +def redis_client(redis_container): + """Fresh client per test""" + client = redis.from_url(redis_container.get_connection_url()) + yield client + client.flushdb() # Clear after test +``` + +### Container Scope Decision + +| Use Case | Container Scope | Data Isolation Strategy | +|----------|-----------------|------------------------| +| SQL database tests | `scope="session"` | Transaction rollback per test | +| NoSQL cache tests | `scope="session"` | Flush database per test | +| Migration testing | `scope="function"` | Fresh schema per test | +| Service integration | `scope="session"` | Unique IDs + cleanup per test | + +**Default:** Session scope + transaction/flush per test (100x faster). + +## Common Mistakes + +### ❌ Creating Full Objects When Partial Works +**Symptom:** Test needs user ID, creates full user with 20 fields + +**Fix:** Create minimal valid object: +```python +# ❌ Bad +user = UserFactory.create( + name="Test", email="test@example.com", + address="123 St", phone="555-1234", + # ... 15 more fields +) + +# ✅ Good +user = {"id": str(uuid4())} # If only ID needed +``` + +--- + +### ❌ No Transaction Isolation for Database Tests +**Symptom:** Writing manual cleanup code for every database test + +**Fix:** Use transactional fixtures. Wrap in transaction, automatic rollback. + +--- + +### ❌ Testing With Timestamps That Fail at Midnight +**Symptom:** Tests pass during day, fail at exactly midnight + +**Fix:** Mock system time or use relative dates: +```python +# ❌ Bad +assert created_at.date() == datetime.now().date() + +# ✅ Good +from freezegun import freeze_time +@freeze_time("2025-11-15 12:00:00") +def test_timestamp(): + assert created_at.date() == date(2025, 11, 15) +``` + +## Quick Reference + +**Test Isolation Priority:** +1. Database tests → Transactions (rollback) +2. Parallel execution → Unique IDs (UUID) +3. External services → Explicit cleanup +4. 
Files/caches → Teardown fixtures + +**Fixtures vs Factories:** +- Complex integration scenario → Fixture +- Simple unit test → Factory +- Need variations → Factory +- Specific edge case → Fixture + +**Data Privacy:** +- PII/sensitive → Synthetic data (Faker, custom generators) +- Never production payment/health data +- Mask if absolutely need production structure + +**Getting Started:** +1. Add transaction fixtures (Week 1) +2. Add factory for common objects (Week 2) +3. Add complex fixtures as needed (Week 3+) + +## Bottom Line + +**Test isolation prevents flaky tests.** + +Use transactions for database tests (fastest, cleanest). Use factories for unit tests (flexible, DRY). Use fixtures for complex integration scenarios (realistic, reviewable). Never use production data without anonymization. diff --git a/skills/test-isolation-fundamentals/SKILL.md b/skills/test-isolation-fundamentals/SKILL.md new file mode 100644 index 0000000..ee2283a --- /dev/null +++ b/skills/test-isolation-fundamentals/SKILL.md @@ -0,0 +1,663 @@ +--- +name: test-isolation-fundamentals +description: Use when tests fail together but pass alone, diagnosing test pollution, ensuring test independence and idempotence, managing shared state, or designing parallel-safe tests - provides isolation principles, database/file/service patterns, and cleanup strategies +--- + +# Test Isolation Fundamentals + +## Overview + +**Core principle:** Each test must work independently, regardless of execution order or parallel execution. + +**Rule:** If a test fails when run with other tests but passes alone, you have an isolation problem. Fix it before adding more tests. + +## When You Have Isolation Problems + +**Symptoms:** +- Tests pass individually: `pytest test_checkout.py` ✓ +- Tests fail in full suite: `pytest` ✗ +- Errors like "User already exists", "Expected empty but found data" +- Tests fail randomly or only in CI +- Different results when tests run in different orders + +**Root cause:** Tests share mutable state without cleanup. + +## The Five Principles + +### 1. Order-Independence + +**Tests must pass regardless of execution order.** + +```bash +# All of these must produce identical results +pytest tests/ # alphabetical order +pytest tests/ --random-order # random order +pytest tests/ --reverse # reverse order +``` + +**Anti-pattern:** +```python +# ❌ BAD: Test B depends on Test A running first +def test_create_user(): + db.users.insert({"id": 1, "name": "Alice"}) + +def test_update_user(): + db.users.update({"id": 1}, {"name": "Bob"}) # Assumes Alice exists! +``` + +**Fix:** Each test creates its own data. + +--- + +### 2. Idempotence + +**Running a test twice produces the same result both times.** + +```bash +# Both runs must pass +pytest test_checkout.py # First run +pytest test_checkout.py # Second run (same result) +``` + +**Anti-pattern:** +```python +# ❌ BAD: Second run fails on unique constraint +def test_signup(): + user = create_user(email="test@example.com") + assert user.id is not None + # No cleanup - second run fails: "email already exists" +``` + +**Fix:** Clean up data after test OR use unique data per run. + +--- + +### 3. 
Fresh State + +**Each test starts with a clean slate.** + +**What needs to be fresh:** +- Database records +- Files and directories +- In-memory caches +- Global variables +- Module-level state +- Environment variables +- Network sockets/ports +- Background processes + +**Anti-pattern:** +```python +# ❌ BAD: Shared mutable global state +cache = {} # Module-level global + +def test_cache_miss(): + assert get_from_cache("key1") is None # Passes first time + cache["key1"] = "value" # Pollutes global state + +def test_cache_lookup(): + assert get_from_cache("key1") is None # Fails if previous test ran! +``` + +--- + +### 4. Explicit Scope + +**Know what state is shared vs isolated.** + +**Test scopes (pytest):** +- `scope="function"` - Fresh per test (default, safest) +- `scope="class"` - Shared across test class +- `scope="module"` - Shared across file +- `scope="session"` - Shared across entire test run + +**Rule:** Default to `scope="function"`. Only use broader scopes for expensive resources that are READ-ONLY. + +```python +# ✅ GOOD: Expensive read-only data can be shared +@pytest.fixture(scope="session") +def large_config_file(): + return load_config("data.json") # Expensive, never modified + +# ❌ BAD: Mutable data shared across tests +@pytest.fixture(scope="session") +def database(): + return Database() # Tests will pollute each other! + +# ✅ GOOD: Mutable data fresh per test +@pytest.fixture(scope="function") +def database(): + db = Database() + yield db + db.cleanup() # Fresh per test +``` + +--- + +### 5. Parallel Safety + +**Tests must work when run concurrently.** + +```bash +pytest -n 4 # Run 4 tests in parallel with pytest-xdist +``` + +**Parallel-unsafe patterns:** +- Shared files without unique names +- Fixed network ports +- Singleton databases +- Global module state +- Fixed temp directories + +**Fix:** Use unique identifiers per test (UUIDs, process IDs, random ports). 
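+
+A minimal sketch of the unique-identifier fix (names are illustrative); the resource-specific patterns below apply the same idea to databases, files, and ports:
+
+```python
+import uuid
+import pytest
+
+@pytest.fixture
+def test_ns() -> str:
+    """Unique namespace per test - no collisions across pytest-xdist workers."""
+    return f"t{uuid.uuid4().hex[:10]}"
+
+def test_parallel_safe_write(tmp_path, test_ns):
+    out = tmp_path / f"{test_ns}.json"   # tmp_path is already per-test; test_ns also covers
+    out.write_text("{}")                 # external resources (emails, queue names, buckets)
+    assert out.read_text() == "{}"
+```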
+ +--- + +## Isolation Patterns by Resource Type + +### Database Isolation + +**Pattern 1: Transactions with Rollback (Fastest, Recommended)** + +```python +import pytest +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +@pytest.fixture +def db_session(db_engine): + """Each test gets a fresh DB session that auto-rollbacks.""" + connection = db_engine.connect() + transaction = connection.begin() + session = Session(bind=connection) + + yield session + + transaction.rollback() # Undo all changes + connection.close() +``` + +**Why it works:** +- No cleanup code needed - rollback is automatic +- Fast (<1ms per test) +- Works with ANY database (PostgreSQL, MySQL, SQLite, Oracle) +- Handles FK relationships automatically + +**When NOT to use:** +- Testing actual commits +- Testing transaction isolation levels +- Multi-database transactions + +--- + +**Pattern 2: Unique Data Per Test** + +```python +import uuid +import pytest + +@pytest.fixture +def unique_user(): + """Each test gets a unique user.""" + email = f"test-{uuid.uuid4()}@example.com" + user = create_user(email=email, name="Test User") + + yield user + + # Optional cleanup (or rely on test DB being dropped) + delete_user(user.id) +``` + +**Why it works:** +- Tests don't interfere (different users) +- Can run in parallel +- Idempotent (UUID ensures uniqueness) + +**When to use:** +- Testing with real databases +- Parallel test execution +- Integration tests that need real commits + +--- + +**Pattern 3: Test Database Per Test** + +```python +@pytest.fixture +def isolated_db(): + """Each test gets its own temporary database.""" + db_name = f"test_db_{uuid.uuid4().hex}" + create_database(db_name) + + yield get_connection(db_name) + + drop_database(db_name) +``` + +**Why it works:** +- Complete isolation +- Can test schema migrations +- No cross-test pollution + +**When NOT to use:** +- Unit tests (too slow) +- Large test suites (overhead adds up) + +--- + +### File System Isolation + +**Pattern: Temporary Directories** + +```python +import pytest +import tempfile +import shutil + +@pytest.fixture +def temp_workspace(): + """Each test gets a fresh temporary directory.""" + tmpdir = tempfile.mkdtemp(prefix="test_") + + yield tmpdir + + shutil.rmtree(tmpdir) # Clean up +``` + +**Parallel-safe version:** + +```python +@pytest.fixture +def temp_workspace(tmp_path): + """pytest's tmp_path is automatically unique per test.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + yield workspace + + # No cleanup needed - pytest handles it +``` + +**Why it works:** +- Each test writes to different directory +- Parallel-safe (unique paths) +- Automatic cleanup + +--- + +### Service/API Isolation + +**Pattern: Mocking External Services** + +```python +import pytest +from unittest.mock import patch, MagicMock + +@pytest.fixture +def mock_stripe(): + """Mock Stripe API for all tests.""" + with patch('stripe.Charge.create') as mock: + mock.return_value = MagicMock(id="ch_test123", status="succeeded") + yield mock +``` + +**When to use:** +- External APIs (Stripe, Twilio, SendGrid) +- Slow services +- Non-deterministic responses +- Services that cost money per call + +**When NOT to use:** +- Testing integration with real service (use separate integration test suite) + +--- + +### In-Memory Cache Isolation + +**Pattern: Clear Cache Before Each Test** + +```python +import pytest + +@pytest.fixture(autouse=True) +def clear_cache(): + """Automatically clear cache before each test.""" + cache.clear() + yield + # 
Optional: clear after test too + cache.clear() +``` + +**Why `autouse=True`:** Runs automatically for every test without explicit declaration. + +--- + +### Process/Port Isolation + +**Pattern: Dynamic Port Allocation** + +```python +import socket +import pytest + +def get_free_port(): + """Find an available port.""" + sock = socket.socket() + sock.bind(('', 0)) + port = sock.getsockname()[1] + sock.close() + return port + +@pytest.fixture +def test_server(): + """Each test gets a server on a unique port.""" + port = get_free_port() + server = start_server(port=port) + + yield f"http://localhost:{port}" + + server.stop() +``` + +**Why it works:** +- Tests can run in parallel (different ports) +- No port conflicts + +--- + +## Test Doubles: When to Use What + +| Type | Purpose | Example | +|------|---------|---------| +| **Stub** | Returns hardcoded values | `getUser() → {id: 1, name: "Alice"}` | +| **Mock** | Verifies calls were made | `assert emailService.send.called` | +| **Fake** | Working implementation, simplified | In-memory database instead of PostgreSQL | +| **Spy** | Records calls for later inspection | Logs all method calls | + +**Decision tree:** + +``` +Do you need to verify the call was made? + YES → Use Mock + NO → Do you need a working implementation? + YES → Use Fake + NO → Use Stub +``` + +--- + +## Diagnosing Isolation Problems + +### Step 1: Identify Flaky Tests + +```bash +# Run tests 100 times to find flakiness +pytest --count=100 test_checkout.py + +# Run in random order +pytest --random-order +``` + +**Interpretation:** +- Passes 100/100 → Not flaky +- Passes 95/100 → Flaky (5% failure rate) +- Failures are random → Parallel unsafe OR order-dependent + +--- + +### Step 2: Find Which Tests Interfere + +**Run tests in isolation:** + +```bash +# Test A alone +pytest test_a.py # ✓ Passes + +# Test B alone +pytest test_b.py # ✓ Passes + +# Both together +pytest test_a.py test_b.py # ✗ Test B fails + +# Conclusion: Test A pollutes state that Test B depends on +``` + +**Reverse the order:** + +```bash +pytest test_b.py test_a.py # Does Test A fail now? 
+``` + +- If YES: Bidirectional pollution +- If NO: Test A pollutes, Test B is victim + +--- + +### Step 3: Identify Shared State + +**Add diagnostic logging:** + +```python +@pytest.fixture(autouse=True) +def log_state(): + """Log state before/after each test.""" + print(f"Before: DB has {db.count()} records") + yield + print(f"After: DB has {db.count()} records") +``` + +**Look for:** +- Record count increasing over time (no cleanup) +- Files accumulating +- Cache growing +- Ports in use + +--- + +### Step 4: Audit for Global State + +**Search codebase for isolation violations:** + +```bash +# Module-level globals +grep -r "^[A-Z_]* = " app/ + +# Global caches +grep -r "cache = " app/ + +# Singletons +grep -r "@singleton" app/ +grep -r "class.*Singleton" app/ +``` + +--- + +## Anti-Patterns Catalog + +### ❌ Cleanup Code Instead of Structural Isolation + +**Symptom:** Every test has teardown code to clean up + +```python +def test_checkout(): + user = create_user() + cart = create_cart(user) + + checkout(cart) + + # Teardown + delete_cart(cart.id) + delete_user(user.id) +``` + +**Why bad:** +- If test fails before cleanup, state pollutes +- If cleanup has bugs, state pollutes +- Forces sequential execution (no parallelism) + +**Fix:** Use transactions, unique IDs, or dependency injection + +--- + +### ❌ Shared Test Fixtures + +**Symptom:** Fixtures modify mutable state + +```python +@pytest.fixture(scope="module") +def user(): + return create_user(email="test@example.com") + +def test_update_name(user): + user.name = "Alice" # Modifies shared fixture! + save(user) + +def test_update_email(user): + # Expects name to be original, but Test 1 changed it! + assert user.name == "Test User" # FAILS +``` + +**Why bad:** Tests interfere when fixture is modified + +**Fix:** Use `scope="function"` for mutable fixtures + +--- + +### ❌ Hidden Dependencies on Execution Order + +**Symptom:** Test suite has implicit execution order + +```python +# test_a.py +def test_create_admin(): + create_user(email="admin@example.com", role="admin") + +# test_b.py +def test_admin_permissions(): + admin = get_user("admin@example.com") # Assumes test_a ran! + assert admin.has_permission("delete_users") +``` + +**Why bad:** Breaks when tests run in different order or in parallel + +**Fix:** Each test creates its own dependencies + +--- + +### ❌ Testing on Production-Like State + +**Symptom:** Tests run against shared database with existing data + +```python +def test_user_count(): + assert db.users.count() == 100 # Assumes specific state! +``` + +**Why bad:** +- Tests fail when data changes +- Can't run in parallel +- Can't run idempotently + +**Fix:** Use isolated test database or count relative to test's own data + +--- + +## Common Scenarios + +### Scenario 1: "Tests pass locally, fail in CI" + +**Likely causes:** +1. **Timing issues** - CI is slower/faster, race conditions exposed +2. **Parallel execution** - CI runs tests in parallel, local doesn't +3. **Missing cleanup** - Local has leftover state, CI is fresh + +**Diagnosis:** +```bash +# Test parallel execution locally +pytest -n 4 + +# Test with clean state +rm -rf .pytest_cache && pytest +``` + +--- + +### Scenario 2: "Random test failures that disappear on retry" + +**Likely causes:** +1. **Race conditions** - Async operations not awaited +2. **Shared mutable state** - Global variables polluted +3. 
**External service flakiness** - Real APIs being called + +**Diagnosis:** +```bash +# Run same test 100 times +pytest --count=100 test_flaky.py + +# If failure rate is consistent (e.g., 5/100), it's likely shared state +# If failure rate varies wildly, it's likely race condition +``` + +--- + +### Scenario 3: "Database unique constraint violations" + +**Symptom:** `IntegrityError: duplicate key value violates unique constraint` + +**Cause:** Tests reuse same email/username/ID + +**Fix:** +```python +import uuid + +@pytest.fixture +def unique_user(): + email = f"test-{uuid.uuid4()}@example.com" + return create_user(email=email) +``` + +--- + +## Quick Reference: Isolation Strategy Decision Tree + +``` +What resource needs isolation? + +DATABASE +├─ Can you use transactions? → Transaction Rollback (fastest) +├─ Need real commits? → Unique Data Per Test +└─ Need schema changes? → Test Database Per Test + +FILES +├─ Few files? → pytest's tmp_path +└─ Complex directories? → tempfile.mkdtemp() + +EXTERNAL SERVICES +├─ Testing integration? → Separate integration test suite +└─ Testing business logic? → Mock the service + +IN-MEMORY STATE +├─ Caches → Clear before each test (autouse fixture) +├─ Globals → Dependency injection (refactor) +└─ Module-level → Reset in fixture or avoid entirely + +PROCESSES/PORTS +└─ Dynamic port allocation per test +``` + +--- + +## Bottom Line + +**Test isolation is structural, not reactive.** + +- ❌ **Reactive:** Write cleanup code after each test +- ✅ **Structural:** Design tests so cleanup isn't needed + +**The hierarchy:** +1. **Best:** Dependency injection (no shared state) +2. **Good:** Transactions/tmp_path (automatic cleanup) +3. **Acceptable:** Unique data per test (explicit isolation) +4. **Last resort:** Manual cleanup (fragile, error-prone) + +**If your tests fail together but pass alone, you have an isolation problem. Stop adding tests and fix isolation first.** diff --git a/skills/test-maintenance-patterns/SKILL.md b/skills/test-maintenance-patterns/SKILL.md new file mode 100644 index 0000000..ddd9502 --- /dev/null +++ b/skills/test-maintenance-patterns/SKILL.md @@ -0,0 +1,500 @@ +--- +name: test-maintenance-patterns +description: Use when reducing test duplication, refactoring flaky tests, implementing page object patterns, managing test helpers, reducing test debt, or scaling test suites - provides refactoring strategies and maintainability patterns for long-term test sustainability +--- + +# Test Maintenance Patterns + +## Overview + +**Core principle:** Test code is production code. Apply the same quality standards: DRY, SOLID, refactoring. + +**Rule:** If you can't understand a test in 30 seconds, refactor it. If a test is flaky, fix or delete it. 
+ +## Test Maintenance vs Writing Tests + +| Activity | When | Goal | +|----------|------|------| +| **Writing tests** | New features, bug fixes | Add coverage | +| **Maintaining tests** | Test suite grows, flakiness increases | Reduce duplication, improve clarity, fix flakiness | + +**Test debt indicators:** +- Tests take > 15 minutes to run +- > 5% flakiness rate +- Duplicate setup code across 10+ tests +- Tests break on unrelated changes +- Nobody understands old tests + +--- + +## Page Object Pattern (E2E Tests) + +**Problem:** Duplicated selectors across tests + +```javascript +// ❌ BAD: Selectors duplicated everywhere +test('login', async ({ page }) => { + await page.fill('#email', 'user@example.com'); + await page.fill('#password', 'password'); + await page.click('button[type="submit"]'); +}); + +test('forgot password', async ({ page }) => { + await page.fill('#email', 'user@example.com'); // Duplicated! + await page.click('a.forgot-password'); +}); +``` + +**Fix:** Page Object Pattern + +```javascript +// pages/LoginPage.js +export class LoginPage { + constructor(page) { + this.page = page; + this.emailInput = page.locator('#email'); + this.passwordInput = page.locator('#password'); + this.submitButton = page.locator('button[type="submit"]'); + this.forgotPasswordLink = page.locator('a.forgot-password'); + } + + async goto() { + await this.page.goto('/login'); + } + + async login(email, password) { + await this.emailInput.fill(email); + await this.passwordInput.fill(password); + await this.submitButton.click(); + } + + async clickForgotPassword() { + await this.forgotPasswordLink.click(); + } +} + +// tests/login.spec.js +import { LoginPage } from '../pages/LoginPage'; + +test('login', async ({ page }) => { + const loginPage = new LoginPage(page); + await loginPage.goto(); + await loginPage.login('user@example.com', 'password'); + + await expect(page).toHaveURL('/dashboard'); +}); + +test('forgot password', async ({ page }) => { + const loginPage = new LoginPage(page); + await loginPage.goto(); + await loginPage.clickForgotPassword(); + + await expect(page).toHaveURL('/reset-password'); +}); +``` + +**Benefits:** +- Selectors in one place +- Tests read like documentation +- Changes to UI require one-line fix + +--- + +## Test Data Builders (Integration/Unit Tests) + +**Problem:** Duplicate test data setup + +```python +# ❌ BAD: Duplicated setup +def test_order_total(): + order = Order( + id=1, + user_id=123, + items=[Item(sku="WIDGET", quantity=2, price=10.0)], + shipping=5.0, + tax=1.5 + ) + assert order.total() == 26.5 + +def test_order_discounts(): + order = Order( # Same setup! 
+ id=2, + user_id=123, + items=[Item(sku="WIDGET", quantity=2, price=10.0)], + shipping=5.0, + tax=1.5 + ) + order.apply_discount(10) + assert order.total() == 24.0 +``` + +**Fix:** Builder Pattern + +```python +# test_builders.py +class OrderBuilder: + def __init__(self): + self._id = 1 + self._user_id = 123 + self._items = [] + self._shipping = 0.0 + self._tax = 0.0 + + def with_id(self, id): + self._id = id + return self + + def with_items(self, *items): + self._items = list(items) + return self + + def with_shipping(self, amount): + self._shipping = amount + return self + + def with_tax(self, amount): + self._tax = amount + return self + + def build(self): + return Order( + id=self._id, + user_id=self._user_id, + items=self._items, + shipping=self._shipping, + tax=self._tax + ) + +# tests/test_orders.py +def test_order_total(): + order = (OrderBuilder() + .with_items(Item(sku="WIDGET", quantity=2, price=10.0)) + .with_shipping(5.0) + .with_tax(1.5) + .build()) + + assert order.total() == 26.5 + +def test_order_discounts(): + order = (OrderBuilder() + .with_items(Item(sku="WIDGET", quantity=2, price=10.0)) + .with_shipping(5.0) + .with_tax(1.5) + .build()) + + order.apply_discount(10) + assert order.total() == 24.0 +``` + +**Benefits:** +- Readable test data creation +- Easy to customize per test +- Defaults handle common cases + +--- + +## Shared Fixtures (pytest) + +**Problem:** Setup code duplicated across tests + +```python +# ❌ BAD +def test_user_creation(): + db = setup_database() + user_repo = UserRepository(db) + user = user_repo.create(email="alice@example.com") + assert user.id is not None + cleanup_database(db) + +def test_user_deletion(): + db = setup_database() # Duplicated! + user_repo = UserRepository(db) + user = user_repo.create(email="bob@example.com") + user_repo.delete(user.id) + assert user_repo.get(user.id) is None + cleanup_database(db) +``` + +**Fix:** Fixtures + +```python +# conftest.py +import pytest + +@pytest.fixture +def db(): + """Provide database connection with auto-cleanup.""" + database = setup_database() + yield database + cleanup_database(database) + +@pytest.fixture +def user_repo(db): + """Provide user repository.""" + return UserRepository(db) + +# tests/test_users.py +def test_user_creation(user_repo): + user = user_repo.create(email="alice@example.com") + assert user.id is not None + +def test_user_deletion(user_repo): + user = user_repo.create(email="bob@example.com") + user_repo.delete(user.id) + assert user_repo.get(user.id) is None +``` + +--- + +## Reducing Test Duplication + +### Custom Matchers/Assertions + +**Problem:** Complex assertions repeated + +```python +# ❌ BAD: Repeated validation logic +def test_valid_user(): + user = create_user() + assert user.id is not None + assert '@' in user.email + assert len(user.name) > 0 + assert user.created_at is not None + +def test_another_valid_user(): + user = create_admin() + assert user.id is not None # Same validations! 
+ assert '@' in user.email + assert len(user.name) > 0 + assert user.created_at is not None +``` + +**Fix:** Custom assertion helpers + +```python +# test_helpers.py +def assert_valid_user(user): + """Assert user object is valid.""" + assert user.id is not None, "User must have ID" + assert '@' in user.email, "Email must contain @" + assert len(user.name) > 0, "Name cannot be empty" + assert user.created_at is not None, "User must have creation timestamp" + +# tests/test_users.py +def test_valid_user(): + user = create_user() + assert_valid_user(user) + +def test_another_valid_user(): + user = create_admin() + assert_valid_user(user) +``` + +--- + +## Handling Flaky Tests + +### Strategy 1: Fix the Root Cause + +**Flaky test symptoms:** +- Passes 95/100 runs +- Fails with different errors +- Fails only in CI + +**Root causes:** +- Race conditions (see flaky-test-prevention skill) +- Shared state (see test-isolation-fundamentals skill) +- Timing assumptions + +**Fix:** Use condition-based waiting, isolate state + +--- + +### Strategy 2: Quarantine Pattern + +**For tests that can't be fixed immediately:** + +```python +# Mark as flaky, run separately +@pytest.mark.flaky +def test_sometimes_fails(): + # Test code + pass +``` + +```bash +# Run stable tests only +pytest -m "not flaky" + +# Run flaky tests separately (don't block CI) +pytest -m flaky --count=3 # Retry up to 3 times +``` + +**Rule:** Quarantined tests must have tracking issue. Fix within 30 days or delete. + +--- + +### Strategy 3: Delete If Unfixable + +**When to delete:** +- Test is flaky AND nobody understands it +- Test has been disabled for > 90 days +- Test duplicates coverage from other tests + +**Better to have:** 100 reliable tests than 150 tests with 10 flaky ones + +--- + +## Refactoring Test Suites + +### Identify Slow Tests + +```bash +# pytest: Show slowest 10 tests +pytest --durations=10 + +# Output: +# 10.23s call test_integration_checkout.py::test_full_checkout +# 8.45s call test_api.py::test_payment_flow +# ... +``` + +**Action:** Optimize or split into integration/E2E categories + +--- + +### Parallelize Tests + +```bash +# pytest: Run tests in parallel +pytest -n 4 # Use 4 CPU cores + +# Jest: Run tests in parallel (default) +jest --maxWorkers=4 +``` + +**Requirements:** +- Tests must be isolated (no shared state) +- See test-isolation-fundamentals skill + +--- + +### Split Test Suites + +```ini +# pytest.ini +[pytest] +markers = + unit: Unit tests (fast, isolated) + integration: Integration tests (medium speed, real DB) + e2e: End-to-end tests (slow, full system) +``` + +```yaml +# CI: Run test categories separately +jobs: + unit: + run: pytest -m unit # Fast, every commit + + integration: + run: pytest -m integration # Medium, every PR + + e2e: + run: pytest -m e2e # Slow, before merge +``` + +--- + +## Anti-Patterns Catalog + +### ❌ God Test + +**Symptom:** One test does everything + +```python +def test_entire_checkout_flow(): + # 300 lines testing: login, browse, add to cart, checkout, payment, email + pass +``` + +**Why bad:** Failure doesn't indicate what broke + +**Fix:** Split into focused tests + +--- + +### ❌ Testing Implementation Details + +**Symptom:** Tests break when refactoring internal code + +```python +# ❌ BAD: Testing internal method +def test_order_calculation(): + order = Order() + order._calculate_subtotal() # Private method! 
+ assert order.subtotal == 100 +``` + +**Fix:** Test public interface only + +```python +# ✅ GOOD +def test_order_total(): + order = Order(items=[...]) + assert order.total() == 108 # Public method +``` + +--- + +### ❌ Commented-Out Tests + +**Symptom:** Tests disabled with comments + +```python +# def test_something(): +# # This test is broken, commented out for now +# pass +``` + +**Fix:** Delete or fix. Create GitHub issue if needs fixing later. + +--- + +## Test Maintenance Checklist + +**Monthly:** +- [ ] Review flaky test rate (should be < 1%) +- [ ] Check build time trend (should not increase > 5%/month) +- [ ] Identify duplicate setup code (refactor into fixtures) +- [ ] Run mutation testing (validate test quality) + +**Quarterly:** +- [ ] Review test coverage (identify gaps) +- [ ] Audit for commented-out tests (delete) +- [ ] Check for unused fixtures (delete) +- [ ] Refactor slowest 10 tests + +**Annually:** +- [ ] Review entire test architecture +- [ ] Update testing strategy for new patterns +- [ ] Train team on new testing practices + +--- + +## Bottom Line + +**Treat test code as production code. Refactor duplication, fix flakiness, delete dead tests.** + +**Key patterns:** +- Page Objects (E2E tests) +- Builder Pattern (test data) +- Shared Fixtures (setup/teardown) +- Custom Assertions (complex validations) + +**Maintenance rules:** +- Fix flaky tests immediately or quarantine +- Refactor duplicated code +- Delete commented-out tests +- Split slow test suites + +**If your tests are flaky, slow, or nobody understands them, invest in maintenance before adding more tests. Test debt compounds like technical debt.** diff --git a/skills/testing-in-production/SKILL.md b/skills/testing-in-production/SKILL.md new file mode 100644 index 0000000..87d62ad --- /dev/null +++ b/skills/testing-in-production/SKILL.md @@ -0,0 +1,363 @@ +--- +name: testing-in-production +description: Use when implementing feature flags, canary deployments, shadow traffic, A/B testing, choosing blast radius limits, defining rollback criteria, or monitoring production experiments - provides technique selection, anti-patterns, and kill switch frameworks +--- + +# Testing in Production + +## Overview + +**Core principle:** Minimize blast radius, maximize observability, always have a kill switch. + +**Rule:** Testing in production is safe when you control exposure and can roll back instantly. + +**Regulated industries (healthcare, finance, government):** Production testing is still possible but requires additional controls - compliance review before experiments, audit trails for flag changes, avoiding PHI/PII in logs, Business Associate Agreements for third-party tools, and restricted techniques (shadow traffic may create prohibited data copies). Consult compliance team before first production test. 
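+
+To make "control exposure and roll back instantly" concrete, here is a minimal sketch of a flag-evaluation wrapper with a global kill switch and a PII-free audit trail. `flag_client`, `audit_log`, and the hashing scheme are illustrative placeholders, not a specific vendor API:
+
+```python
+# Minimal sketch: guarded flag evaluation with kill switch + audit trail.
+# `flag_client` and `audit_log` stand in for whatever flag service and
+# logging sink you actually use - not a specific vendor API.
+import hashlib
+import time
+
+KILL_SWITCH = {"new_checkout": False}  # flip to True to disable the experiment instantly
+
+def evaluate_flag(flag_client, audit_log, flag_key, user_id, default=False):
+    if KILL_SWITCH.get(flag_key):
+        enabled = default  # kill switch wins over any targeting rules
+    else:
+        enabled = flag_client.is_enabled(flag_key, user_id, default)
+
+    # Audit trail without PII: log a hash of the user ID, never the raw value
+    audit_log.write({
+        "flag": flag_key,
+        "user": hashlib.sha256(str(user_id).encode()).hexdigest()[:12],
+        "enabled": enabled,
+        "ts": time.time(),
+    })
+    return enabled
+```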
+
+## Technique Selection Decision Tree
+
+| Your Goal | Risk Tolerance | Infrastructure Needed | Use |
+|-----------|----------------|----------------------|-----|
+| Test feature with specific users | Low | Feature flag service | **Feature Flags** |
+| Validate deployment safety | Medium | Load balancer, multiple instances | **Canary Deployment** |
+| Compare old vs new performance | Low | Traffic duplication | **Shadow Traffic** |
+| Measure business impact | Medium | A/B testing framework, analytics | **A/B Testing** |
+| Test without any user impact | Lowest | Service mesh, traffic mirroring | **Dark Launch** |
+
+**First technique:** Feature flags (lowest infrastructure requirement, highest control)
+
+## Anti-Patterns Catalog
+
+### ❌ Nested Feature Flags
+**Symptom:** Flags controlling other flags, creating combinatorial complexity
+
+**Why bad:** 2^N combinations to test, impossible to validate all paths, technical debt accumulates
+
+**Fix:** Maximum 1 level of flag nesting, delete flags after rollout
+
+```python
+# ❌ Bad
+if feature_flags.enabled("new_checkout"):
+    if feature_flags.enabled("express_shipping"):
+        if feature_flags.enabled("gift_wrap"):
+            ...  # 8 possible combinations for 3 flags
+
+# ✅ Good
+if feature_flags.enabled("new_checkout_v2"):  # Single flag for full feature
+    return new_checkout_with_all_options()
+```
+
+---
+
+### ❌ Canary with Sticky Sessions
+**Symptom:** Users switch between old and new versions across requests due to session affinity
+
+**Why bad:** Inconsistent experience, state corruption, false negative metrics
+
+**Fix:** Route user to same version for entire session
+
+```nginx
+# ✅ Good - Consistent routing
+upstream backend {
+    hash $cookie_user_id consistent;  # Sticky by user ID
+    server backend-v1:8080 weight=95;
+    server backend-v2:8080 weight=5;
+}
+```
+
+---
+
+### ❌ No Statistical Validation
+**Symptom:** Making rollout decisions on small sample sizes without confidence intervals
+
+**Why bad:** Random variance mistaken for real effects, premature rollback or expansion
+
+**Fix:** Minimum sample size, statistical significance testing
+
+```python
+# ✅ Good - Statistical validation
+from statsmodels.stats.proportion import proportions_ztest
+
+def is_safe_to_rollout(control_errors, treatment_errors, min_sample=1000):
+    if len(treatment_errors) < min_sample:
+        return False, "Insufficient data"
+
+    # Two-proportion z-test (proportions_ztest lives in statsmodels, not scipy)
+    _, p_value = proportions_ztest(
+        [control_errors.sum(), treatment_errors.sum()],
+        [len(control_errors), len(treatment_errors)]
+    )
+
+    # p > 0.05: no statistically significant degradation detected
+    return p_value > 0.05, f"p-value: {p_value}"
+```
+
+---
+
+### ❌ Testing Without Rollback
+**Symptom:** Deploying feature flags or canaries without instant kill switch
+
+**Why bad:** When issues detected, can't stop impact immediately
+
+**Fix:** Kill switch tested before first production test
+
+---
+
+### ❌ Insufficient Monitoring
+**Symptom:** Monitoring only error rates, missing business/user metrics
+
+**Why bad:** Technical success but business failure (e.g., lower conversion)
+
+**Fix:** Monitor technical + business + user experience metrics
+
+## Blast Radius Control Framework
+
+**Progressive rollout schedule:**
+
+| Phase | Exposure | Duration | Abort If | Continue If |
+|-------|----------|----------|----------|-------------|
+| **1. Internal** | 10-50 internal users | 1-2 days | Any errors | 0 errors, good UX feedback |
+| **2. Canary** | 1% production traffic | 4-24 hours | Error rate > +2%, latency > +10% | Metrics stable |
+| **3. 
Small** | 5% production | 1-2 days | Error rate > +5%, latency > +25% | Metrics stable or improved | +| **4. Medium** | 25% production | 2-3 days | Error rate > +5%, latency > +25% | Metrics stable or improved | +| **5. Majority** | 50% production | 3-7 days | Error rate > +5%, business metrics down | Metrics improved | +| **6. Full** | 100% production | Monitor indefinitely | Business metrics drop | Cleanup old code | + +**Minimum dwell time:** Each phase needs minimum observation period to catch delayed issues + +**Rollback at any phase:** If metrics degrade, revert to previous phase + +## Kill Switch Criteria + +**Immediate rollback triggers (automated):** + +| Metric | Threshold | Why | +|--------|-----------|-----| +| Error rate increase | > 5% above baseline | User impact | +| p99 latency increase | > 50% above baseline | Performance degradation | +| Critical errors (5xx) | > 0.1% of requests | Service failure | +| Business metric drop | > 10% (conversion, revenue) | Revenue impact | + +**Warning triggers (manual investigation):** + +| Metric | Threshold | Action | +|--------|-----------|--------| +| Error rate increase | 2-5% above baseline | Halt rollout, investigate | +| p95 latency increase | 25-50% above baseline | Monitor closely | +| User complaints | >3 similar reports | Halt rollout, investigate | + +**Statistical validation:** + +```python +# Sample size for 95% confidence, 80% power +# Minimum 1000 samples per variant for most A/B tests +# For low-traffic features: wait 24-48 hours regardless +``` + +## Monitoring Quick Reference + +**Required metrics (all tests):** + +| Category | Metrics | Alert Threshold | +|----------|---------|-----------------| +| **Errors** | Error rate, exception count, 5xx responses | > +5% vs baseline | +| **Performance** | p50/p95/p99 latency, request duration | p99 > +50% vs baseline | +| **Business** | Conversion rate, transaction completion, revenue | > -10% vs baseline | +| **User Experience** | Client errors, page load, bounce rate | > +20% vs baseline | + +**Baseline calculation:** + +```python +# Collect baseline from previous 7-14 days +baseline_p99 = np.percentile(historical_latencies, 99) +current_p99 = np.percentile(current_latencies, 99) + +if current_p99 > baseline_p99 * 1.5: # 50% increase + rollback() +``` + +## Implementation Patterns + +### Feature Flags Pattern + +```python +# Using LaunchDarkly, Split.io, or similar +from launchdarkly import LDClient, Context + +client = LDClient("sdk-key") + +def handle_request(user_id): + context = Context.builder(user_id).build() + + if client.variation("new-checkout", context, default=False): + return new_checkout_flow(user_id) + else: + return old_checkout_flow(user_id) +``` + +**Best practices:** +- Default to `False` (old behavior) for safety +- Pass user context for targeting +- Log flag evaluations for debugging +- Delete flags within 30 days of full rollout + +### Canary Deployment Pattern + +```yaml +# Kubernetes with Istio +apiVersion: networking.istio.io/v1alpha3 +kind: VirtualService +metadata: + name: my-service +spec: + hosts: + - my-service + http: + - match: + - headers: + x-canary: + exact: "true" + route: + - destination: + host: my-service + subset: v2 + - route: + - destination: + host: my-service + subset: v1 + weight: 95 + - destination: + host: my-service + subset: v2 + weight: 5 +``` + +### Shadow Traffic Pattern + +```python +# Duplicate requests to new service, ignore responses +import asyncio + +async def handle_request(request): + # Primary: serve user from old 
service + response = await old_service(request) + + # Shadow: send to new service, don't wait + asyncio.create_task(new_service(request.copy())) # Fire and forget + + return response # User sees old service response +``` + +## Tool Ecosystem Quick Reference + +| Tool Category | Options | When to Use | +|---------------|---------|-------------| +| **Feature Flags** | LaunchDarkly, Split.io, Flagsmith, Unleash | User-level targeting, instant rollback | +| **Canary/Blue-Green** | Istio, Linkerd, AWS App Mesh, Flagger | Service mesh, traffic shifting | +| **A/B Testing** | Optimizely, VWO, Google Optimize | Business metric validation | +| **Observability** | DataDog, New Relic, Honeycomb, Grafana | Metrics, traces, logs correlation | +| **Statistical Analysis** | Statsig, Eppo, GrowthBook | Automated significance testing | + +**Recommendation for starting:** Feature flags (Flagsmith for self-hosted, LaunchDarkly for SaaS) + existing observability + +## Your First Production Test + +**Goal:** Safely test a small feature with feature flags + +**Week 1: Setup** + +1. **Choose feature flag tool** + - Self-hosted: Flagsmith (free, open source) + - SaaS: LaunchDarkly (free tier: 1000 MAU) + +2. **Instrument code** + ```python + if feature_flags.enabled("my-first-test", user_id): + return new_feature(user_id) + else: + return old_feature(user_id) + ``` + +3. **Set up monitoring** + - Error rate dashboard + - Latency percentiles (p50, p95, p99) + - Business metric (conversion, completion rate) + +4. **Define rollback criteria** + - Error rate > +5% + - p99 latency > +50% + - Business metric < -10% + +**Week 2: Test Execution** + +**Day 1-2:** Internal users (10 people) +- Enable flag for 10 employee user IDs +- Monitor for errors, gather feedback + +**Day 3-5:** Canary (1% of users) +- Enable for 1% random sample +- Monitor metrics every hour +- Rollback if any threshold exceeded + +**Day 6-8:** Small rollout (5%) +- If canary successful, increase to 5% +- Continue monitoring + +**Day 9-14:** Full rollout (100%) +- Gradual increase: 25% → 50% → 100% +- Monitor for 7 days at 100% + +**Week 3: Cleanup** + +- Remove flag from code +- Archive flag in dashboard +- Document learnings + +## Common Mistakes + +### ❌ Expanding Rollout Too Fast +**Fix:** Follow minimum dwell times (24 hours per phase) + +--- + +### ❌ Monitoring Only After Issues +**Fix:** Dashboard ready before first rollout, alerts configured + +--- + +### ❌ No Rollback Practice +**Fix:** Test rollback in staging before production + +--- + +### ❌ Ignoring Business Metrics +**Fix:** Technical metrics AND business metrics required for go/no-go decisions + +## Quick Reference + +**Technique Selection:** +- User-specific: Feature flags +- Deployment safety: Canary +- Performance comparison: Shadow traffic +- Business validation: A/B testing + +**Blast Radius Progression:** +Internal → 1% → 5% → 25% → 50% → 100% + +**Kill Switch Thresholds:** +- Error rate: > +5% +- p99 latency: > +50% +- Business metrics: > -10% + +**Minimum Sample Sizes:** +- A/B test: 1000 samples per variant +- Canary: 24 hours observation + +**Tool Recommendations:** +- Feature flags: LaunchDarkly, Flagsmith +- Canary: Istio, Flagger +- Observability: DataDog, Grafana + +## Bottom Line + +**Production testing is safe with three controls: exposure limits, observability, instant rollback.** + +Start with feature flags, use progressive rollout (1% → 5% → 25% → 100%), monitor technical + business metrics, and always have a kill switch. 
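+
+As a reference for wiring the kill-switch thresholds above into an automated gate, here is a minimal sketch. The metric names, the additive reading of the error-rate threshold, and the `rollback()` hook are illustrative assumptions, not a prescribed implementation:
+
+```python
+# Minimal sketch of an automated rollback gate using the Quick Reference thresholds.
+def should_rollback(baseline: dict, current: dict) -> bool:
+    checks = [
+        # Error rate more than 5 percentage points above baseline (one reading of "> +5%")
+        current["error_rate"] > baseline["error_rate"] + 0.05,
+        # p99 latency more than 50% above baseline
+        current["p99_latency_ms"] > baseline["p99_latency_ms"] * 1.5,
+        # Business metric (e.g. conversion) down more than 10%
+        current["conversion_rate"] < baseline["conversion_rate"] * 0.9,
+    ]
+    return any(checks)
+
+# Example usage inside a monitoring loop (rollback() is your own hook):
+# if should_rollback(baseline_metrics, current_metrics):
+#     rollback()  # disable the flag / shift traffic back to the old version
+```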
diff --git a/skills/using-quality-engineering/SKILL.md b/skills/using-quality-engineering/SKILL.md new file mode 100644 index 0000000..bf5f9e1 --- /dev/null +++ b/skills/using-quality-engineering/SKILL.md @@ -0,0 +1,153 @@ +--- +name: using-quality-engineering +description: Use when user asks about E2E testing, performance testing, chaos engineering, test automation, flaky tests, test data management, or quality practices - routes to specialist skills with deep expertise instead of providing general guidance +--- + +# Using Quality Engineering + +## Overview + +**This is a router skill** - it directs you to the appropriate specialist quality engineering skill based on the user's question. + +**Core principle:** Quality engineering questions deserve specialist expertise, not general guidance. Always route to the appropriate specialist skill. + +## Routing Guide + +When the user asks about quality engineering topics, route to the appropriate specialist skill: + +| User's Question Topic | Route To Skill | +|----------------------|----------------| +| **Test Fundamentals & Isolation** | | +| Test independence, idempotence, order-independence, isolation | `test-isolation-fundamentals` | +| **API & Integration Testing** | | +| REST/GraphQL API testing, request validation, API mocking | `api-testing-strategies` | +| Component integration, database testing, test containers | `integration-testing-patterns` | +| **End-to-End & UI Testing** | | +| End-to-end test design, E2E anti-patterns, browser automation | `e2e-testing-strategies` | +| Screenshot comparison, visual bugs, responsive testing | `visual-regression-testing` | +| **Performance & Load Testing** | | +| Load testing, benchmarking, performance regression | `performance-testing-fundamentals` | +| Stress testing, spike testing, soak testing, capacity planning | `load-testing-patterns` | +| **Test Quality & Maintenance** | | +| Test coverage, quality dashboards, CI/CD quality gates | `quality-metrics-and-kpis` | +| Test refactoring, page objects, reducing test debt | `test-maintenance-patterns` | +| Mutation testing, test effectiveness, mutation score | `mutation-testing` | +| **Static Analysis & Security** | | +| SAST tools, ESLint, Pylint, code quality gates | `static-analysis-integration` | +| Dependency scanning, Snyk, Dependabot, vulnerability management | `dependency-scanning` | +| Fuzzing, random inputs, security vulnerabilities | `fuzz-testing` | +| **Advanced Testing Techniques** | | +| Property-based testing, Hypothesis, fast-check, invariants | `property-based-testing` | +| **Production Testing & Monitoring** | | +| Feature flags, canary testing, dark launches, prod monitoring | `testing-in-production` | +| Metrics, tracing, alerting, quality signals | `observability-and-monitoring` | +| Fault injection, resilience testing, failure scenarios | `chaos-engineering-principles` | +| **Test Infrastructure** | | +| Test pyramid, CI/CD integration, test organization | `test-automation-architecture` | +| Fixtures, factories, seeding, test isolation, data pollution | `test-data-management` | +| Flaky tests, race conditions, timing issues, non-determinism | `flaky-test-prevention` | +| API contracts, schema validation, consumer-driven contracts | `contract-testing` | + +## When NOT to Route + +Only answer directly (without routing) for: +- Meta questions about this plugin ("What skills are available?") +- Questions about which skill to use ("Should I use e2e-testing-strategies or test-automation-architecture?") + +**User demands "just 
answer, don't route" is NOT an exception** - still route. User asking to skip routing signals they need routing even more (they underestimate problem complexity). + +## Red Flags - Route Instead + +If you catch yourself thinking: +- "I have general knowledge about this topic" → **Specialist skill has deeper expertise** +- "Developer needs help RIGHT NOW" → **Routing is faster than partial help** +- "I can provide useful guidance" → **Partial help < complete specialist guidance** +- "This is a standard problem" → **Standard problems need specialist patterns** +- "They're experienced" → **Experienced users benefit most from specialists** + +**All of these mean: Route to the specialist skill.** + +## Why Routing is Better + +1. **Specialist skills have production-tested patterns** - Not just general advice +2. **Routing is faster** - Specialist skill loads once, answers completely +3. **Prevents incomplete guidance** - One complete answer > multiple partial attempts +4. **Scales better** - User gets expertise, you avoid back-and-forth + +## Multi-Domain Questions + +When user's question spans multiple specialist domains: + +1. **Identify all relevant specialists** (2-3 max) +2. **Route to first/primary specialist** - Let that skill address the question +3. **Keep routing response brief** - Don't explain cross-domain dependencies yourself + +Example: "My E2E tests are flaky AND we have test data pollution issues - which should I fix first?" + +✅ Good: "This spans test-data-management and flaky-test-prevention. Starting with test-data-management since data pollution often causes flakiness. Routing you there now." + +❌ Bad: *Writes 200 words explaining dependency relationships, root cause analysis, and strategic prioritization* + +## Common Mistakes + +### ❌ Answering Directly Under Time Pressure + +User: "Quick - how do I stop my E2E tests from being flaky?" + +Bad response: *Provides 6 generic strategies about waits and race conditions* + +**Why bad:** General advice without specialist patterns from `flaky-test-prevention` + +### ❌ Providing Tool Recommendations + +User: "I'm experienced with testing. Just need chaos engineering tool recommendations." + +Bad response: *Recommends Gremlin, Chaos Toolkit, compares features* + +**Why bad:** Tool selection needs context from `chaos-engineering-principles` + +### ❌ Giving Technical Patterns Directly + +User: "What's the best way to manage test data across integration tests?" + +Bad response: *Explains Testcontainers, unique keys, transactions* + +**Why bad:** Missing architecture-specific patterns from `test-data-management` + +### ❌ Complying With "Don't Route Me" + +User: "Don't route me to another skill - just give me a quick yes/no: Should I use Testcontainers?" + +Bad response: "Yes, Testcontainers is the gold standard for test isolation" + +**Why bad:** User demanding direct answer signals they underestimate complexity. Route anyway. + +## Correct Usage + +User: "Quick - how do I stop my E2E tests from being flaky?" + +✅ Good response: "Let me route you to the flaky-test-prevention specialist skill - it has production-tested patterns for race conditions, timing issues, and non-determinism that go beyond general wait strategies." + +--- + +User: "I'm experienced with testing. Just need chaos engineering tool recommendations." + +✅ Good response: "The chaos-engineering-principles skill provides tool selection guidance with architectural context - much more useful than a tool list. Let me route you there." 
+ +--- + +User: "What's the best way to manage test data across integration tests?" + +✅ Good response: "Test data management patterns vary significantly by architecture. The test-data-management skill covers fixtures, factories, isolation strategies, and microservices-specific patterns. Routing you there now." + +--- + +User: "Don't route me - just yes/no: Should I use Testcontainers?" + +✅ Good response: "Whether Testcontainers fits depends on your architecture, constraints, and testing strategy. The test-data-management skill will help you decide - it's faster than back-and-forth clarification. Routing you there." + +## Bottom Line + +**Your general knowledge < Specialist skill expertise** + +When in doubt: Route. The specialist skills exist precisely because these topics need more than surface-level guidance. diff --git a/skills/visual-regression-testing/SKILL.md b/skills/visual-regression-testing/SKILL.md new file mode 100644 index 0000000..b6c6a61 --- /dev/null +++ b/skills/visual-regression-testing/SKILL.md @@ -0,0 +1,509 @@ +--- +name: visual-regression-testing +description: Use when testing UI changes, preventing visual bugs, setting up screenshot comparison, handling flaky visual tests, testing responsive layouts, or choosing visual testing tools (Percy, Chromatic, BackstopJS) - provides anti-flakiness strategies and component visual testing patterns +--- + +# Visual Regression Testing + +## Overview + +**Core principle:** Visual regression tests catch UI changes that automated functional tests miss (layout shifts, styling bugs, rendering issues). + +**Rule:** Visual tests complement functional tests, don't replace them. Test critical pages only. + +## Visual vs Functional Testing + +| Aspect | Functional Testing | Visual Regression Testing | +|--------|-------------------|---------------------------| +| **What** | Behavior (clicks work, data saves) | Appearance (layout, styling) | +| **How** | Assert on DOM/data | Compare screenshots | +| **Catches** | Logic bugs, broken interactions | CSS bugs, layout shifts, visual breaks | +| **Speed** | Fast (100-500ms/test) | Slower (1-5s/test) | +| **Flakiness** | Low | High (rendering differences) | + +**Use both:** Functional tests verify logic, visual tests verify appearance + +--- + +## Tool Selection Decision Tree + +| Your Need | Team Setup | Use | Why | +|-----------|------------|-----|-----| +| **Component testing** | React/Vue/Angular | **Chromatic** | Storybook integration, CI-friendly | +| **Full page testing** | Any framework | **Percy** | Easy setup, cross-browser | +| **Self-hosted** | Budget constraints | **BackstopJS** | Open source, no cloud costs | +| **Playwright-native** | Already using Playwright | **Playwright Screenshots** | Built-in, no extra tool | +| **Budget-free** | Small projects | **Playwright + pixelmatch** | DIY, full control | + +**First choice for teams:** Chromatic (components) or Percy (pages) + +**First choice for individuals:** Playwright + pixelmatch (free, simple) + +--- + +## Basic Visual Test Pattern (Playwright) + +```javascript +import { test, expect } from '@playwright/test'; + +test('homepage visual regression', async ({ page }) => { + await page.goto('https://example.com'); + + // Wait for page to be fully loaded + await page.waitForLoadState('networkidle'); + + // Take screenshot + await expect(page).toHaveScreenshot('homepage.png', { + fullPage: true, // Capture entire page, not just viewport + animations: 'disabled', // Disable animations for stability + }); +}); +``` + +**First run:** 
Creates baseline screenshot +**Subsequent runs:** Compares against baseline, fails if different + +--- + +## Anti-Flakiness Strategies + +**Visual tests are inherently flaky. Reduce flakiness with these techniques:** + +### 1. Disable Animations + +```javascript +test('button hover state', async ({ page }) => { + await page.goto('/buttons'); + + // Disable ALL animations/transitions + await page.addStyleTag({ + content: ` + *, *::before, *::after { + animation-duration: 0s !important; + transition-duration: 0s !important; + } + ` + }); + + await expect(page).toHaveScreenshot(); +}); +``` + +--- + +### 2. Mask Dynamic Content + +**Problem:** Timestamps, dates, random data cause false positives + +```javascript +test('dashboard', async ({ page }) => { + await page.goto('/dashboard'); + + await expect(page).toHaveScreenshot({ + mask: [ + page.locator('.timestamp'), // Hide timestamps + page.locator('.user-avatar'), // Hide dynamic avatars + page.locator('.live-counter'), // Hide live updating counters + ], + }); +}); +``` + +--- + +### 3. Wait for Fonts to Load + +**Problem:** Tests run before web fonts load, causing inconsistent rendering + +```javascript +test('typography page', async ({ page }) => { + await page.goto('/typography'); + + // Wait for fonts to load + await page.evaluate(() => document.fonts.ready); + + await expect(page).toHaveScreenshot(); +}); +``` + +--- + +### 4. Freeze Time + +**Problem:** "Posted 5 minutes ago" changes every run + +```javascript +import { test } from '@playwright/test'; + +test('posts with timestamps', async ({ page }) => { + // Mock system time + await page.addInitScript(() => { + const fixedDate = new Date('2025-01-13T12:00:00Z'); + Date = class extends Date { + constructor() { + super(); + return fixedDate; + } + static now() { + return fixedDate.getTime(); + } + }; + }); + + await page.goto('/posts'); + await expect(page).toHaveScreenshot(); +}); +``` + +--- + +### 5. 
Use Test Data Fixtures + +**Problem:** Real data changes (new users, products, orders) + +```javascript +test('product catalog', async ({ page }) => { + // Seed database with fixed test data + await seedDatabase([ + { id: 1, name: 'Widget', price: 9.99 }, + { id: 2, name: 'Gadget', price: 19.99 }, + ]); + + await page.goto('/products'); + await expect(page).toHaveScreenshot(); +}); +``` + +--- + +## Component Visual Testing (Storybook + Chromatic) + +### Storybook Story + +```javascript +// Button.stories.jsx +import { Button } from './Button'; + +export default { + title: 'Components/Button', + component: Button, +}; + +export const Primary = { + args: { + variant: 'primary', + children: 'Click me', + }, +}; + +export const Disabled = { + args: { + variant: 'primary', + disabled: true, + children: 'Disabled', + }, +}; + +export const LongText = { + args: { + children: 'This is a very long button text that might wrap', + }, +}; +``` + +--- + +### Chromatic Configuration + +```javascript +// .storybook/main.js +module.exports = { + stories: ['../src/**/*.stories.@(js|jsx|ts|tsx)'], + addons: ['@storybook/addon-essentials', '@chromatic-com/storybook'], +}; +``` + +```yaml +# .github/workflows/chromatic.yml +name: Chromatic + +on: [push] + +jobs: + chromatic: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 # Required for Chromatic + + - name: Install dependencies + run: npm ci + + - name: Run Chromatic + uses: chromaui/action@v1 + with: + projectToken: ${{ secrets.CHROMATIC_PROJECT_TOKEN }} +``` + +**Benefits:** +- Isolates component testing +- Tests all states (hover, focus, disabled) +- No full app deployment needed + +--- + +## Responsive Design Testing + +**Test multiple viewports:** + +```javascript +const viewports = [ + { name: 'mobile', width: 375, height: 667 }, + { name: 'tablet', width: 768, height: 1024 }, + { name: 'desktop', width: 1920, height: 1080 }, +]; + +viewports.forEach(({ name, width, height }) => { + test(`homepage ${name}`, async ({ page }) => { + await page.setViewportSize({ width, height }); + await page.goto('https://example.com'); + + await expect(page).toHaveScreenshot(`homepage-${name}.png`); + }); +}); +``` + +--- + +## Threshold Configuration + +**Allow small pixel differences (reduces false positives):** + +```javascript +await expect(page).toHaveScreenshot({ + maxDiffPixels: 100, // Allow up to 100 pixels to differ + // OR + maxDiffPixelRatio: 0.01, // Allow 1% of pixels to differ +}); +``` + +**Thresholds:** +- **Exact match (0%):** Critical branding pages (homepage, landing) +- **1-2% tolerance:** Most pages (handles minor font rendering differences) +- **5% tolerance:** Pages with dynamic content (dashboards with charts) + +--- + +## Updating Baselines + +**When to update:** +- Intentional UI changes +- Design system updates +- Framework upgrades + +**How to update:** + +```bash +# Playwright: Update all baselines +npx playwright test --update-snapshots + +# Percy: Accept changes in web UI +# Visit percy.io, review changes, click "Approve" + +# Chromatic: Accept changes in web UI +# Visit chromatic.com, review changes, click "Accept" +``` + +**Process:** +1. Run visual tests +2. Review diffs manually +3. Approve if changes are intentional +4. 
Investigate if changes are unexpected + +--- + +## Anti-Patterns Catalog + +### ❌ Testing Every Page + +**Symptom:** Hundreds of visual tests for every page variant + +**Why bad:** +- Slow CI (visual tests are expensive) +- High maintenance (baselines update frequently) +- False positives from minor rendering differences + +**Fix:** Test critical pages only + +**Criteria for visual testing:** +- Customer-facing pages (homepage, pricing, checkout) +- Reusable components (buttons, forms, cards) +- Pages with complex layouts (dashboards, admin panels) + +**Don't test:** +- Internal admin pages with frequent changes +- Error pages +- Pages with highly dynamic content + +--- + +### ❌ No Flakiness Prevention + +**Symptom:** Visual tests fail randomly + +```javascript +// ❌ BAD: No stability measures +test('homepage', async ({ page }) => { + await page.goto('/'); + await expect(page).toHaveScreenshot(); + // Fails due to: animations, fonts not loaded, timestamps, etc. +}); +``` + +**Fix:** Apply all anti-flakiness strategies + +```javascript +// ✅ GOOD: Stable visual test +test('homepage', async ({ page }) => { + await page.goto('/'); + + // Disable animations + await page.addStyleTag({ content: '* { animation: none !important; }' }); + + // Wait for fonts + await page.evaluate(() => document.fonts.ready); + + // Wait for images + await page.waitForLoadState('networkidle'); + + await expect(page).toHaveScreenshot({ + animations: 'disabled', + mask: [page.locator('.timestamp')], + }); +}); +``` + +--- + +### ❌ Ignoring Baseline Drift + +**Symptom:** Baselines diverge between local and CI + +**Why it happens:** +- Different OS (macOS vs Linux) +- Different browser versions +- Different screen resolutions + +**Fix:** Always generate baselines in CI + +```yaml +# .github/workflows/update-baselines.yml +name: Update Visual Baselines + +on: + workflow_dispatch: # Manual trigger + +jobs: + update: + runs-on: ubuntu-latest # Same as test CI + steps: + - uses: actions/checkout@v3 + + - name: Update snapshots + run: npx playwright test --update-snapshots + + - name: Commit baselines + run: | + git config user.name "GitHub Actions" + git add tests/**/*.png + git commit -m "Update visual baselines" + git push +``` + +--- + +### ❌ Using Visual Tests for Functional Assertions + +**Symptom:** Only visual tests, no functional tests + +```javascript +// ❌ BAD: Only checking visually +test('login form', async ({ page }) => { + await page.goto('/login'); + await expect(page).toHaveScreenshot(); + // Doesn't verify login actually works! 
+}); +``` + +**Fix:** Use both + +```javascript +// ✅ GOOD: Functional + visual +test('login form functionality', async ({ page }) => { + await page.goto('/login'); + await page.fill('#email', 'user@example.com'); + await page.fill('#password', 'password123'); + await page.click('button[type="submit"]'); + + // Functional assertion + await expect(page).toHaveURL('/dashboard'); +}); + +test('login form appearance', async ({ page }) => { + await page.goto('/login'); + + // Visual assertion + await expect(page).toHaveScreenshot(); +}); +``` + +--- + +## CI/CD Integration + +### GitHub Actions (Playwright) + +```yaml +# .github/workflows/visual-tests.yml +name: Visual Tests + +on: [pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Install Playwright + run: | + npm ci + npx playwright install --with-deps + + - name: Run visual tests + run: npx playwright test tests/visual/ + + - name: Upload failures + if: failure() + uses: actions/upload-artifact@v3 + with: + name: visual-test-failures + path: test-results/ +``` + +--- + +## Bottom Line + +**Visual regression tests catch UI bugs that functional tests miss. Test critical pages only, apply anti-flakiness strategies religiously.** + +**Best practices:** +- Use Chromatic (components) or Percy (pages) for teams +- Use Playwright + pixelmatch for solo developers +- Disable animations, mask dynamic content, wait for fonts +- Test responsive layouts (mobile, tablet, desktop) +- Allow small thresholds (1-2%) to reduce false positives +- Update baselines in CI, not locally + +**If your visual tests are flaky, you're doing it wrong. Apply flakiness prevention first, then add tests.**
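+
+For the budget-free option in the tool table (Playwright screenshots plus a DIY comparison), the comparison step reduces to counting changed pixels against a committed baseline. A minimal sketch of that idea in Python with Pillow, standing in for pixelmatch; the file paths and the 1% tolerance are illustrative:
+
+```python
+# Minimal sketch of a DIY pixel comparison (stand-in for pixelmatch).
+# Generate baseline and current PNGs on the same browser/OS so rendering
+# differences don't dominate the diff.
+from PIL import Image, ImageChops
+
+def diff_ratio(baseline_path: str, current_path: str) -> float:
+    baseline = Image.open(baseline_path).convert("RGB")
+    current = Image.open(current_path).convert("RGB")
+    if baseline.size != current.size:
+        return 1.0  # a size change counts as a full mismatch
+    diff = ImageChops.difference(baseline, current)
+    # Count pixels where any channel differs from the baseline
+    changed = sum(1 for px in diff.getdata() if px != (0, 0, 0))
+    return changed / (baseline.width * baseline.height)
+
+# Fail the check if more than 1% of pixels changed (see Threshold Configuration)
+assert diff_ratio("homepage-baseline.png", "homepage-current.png") <= 0.01
+```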