From adfd3add64127e90a946a8ea98900e97d1e4a9db Mon Sep 17 00:00:00 2001
From: Zhongwei Li
Date: Sun, 30 Nov 2025 08:59:27 +0800
Subject: [PATCH] Initial commit

---
 .claude-plugin/plugin.json | 12 +
 README.md | 3 +
 plugin.lock.json | 89 ++
 skills/using-web-backend/SKILL.md | 135 ++
 .../using-web-backend/api-authentication.md | 1381 +++++++++++++++++
 skills/using-web-backend/api-documentation.md | 944 +++++++++++
 skills/using-web-backend/api-testing.md | 1013 ++++++++++++
 .../using-web-backend/database-integration.md | 1117 +++++++++++++
 .../using-web-backend/django-development.md | 890 +++++++++++
 .../using-web-backend/express-development.md | 872 +++++++++++
 .../using-web-backend/fastapi-development.md | 500 ++++++
 .../using-web-backend/graphql-api-design.md | 954 ++++++++++++
 skills/using-web-backend/message-queues.md | 993 ++++++++++++
 .../microservices-architecture.md | 592 +++++++
 skills/using-web-backend/rest-api-design.md | 523 +++++++
 15 files changed, 10018 insertions(+)
 create mode 100644 .claude-plugin/plugin.json
 create mode 100644 README.md
 create mode 100644 plugin.lock.json
 create mode 100644 skills/using-web-backend/SKILL.md
 create mode 100644 skills/using-web-backend/api-authentication.md
 create mode 100644 skills/using-web-backend/api-documentation.md
 create mode 100644 skills/using-web-backend/api-testing.md
 create mode 100644 skills/using-web-backend/database-integration.md
 create mode 100644 skills/using-web-backend/django-development.md
 create mode 100644 skills/using-web-backend/express-development.md
 create mode 100644 skills/using-web-backend/fastapi-development.md
 create mode 100644 skills/using-web-backend/graphql-api-design.md
 create mode 100644 skills/using-web-backend/message-queues.md
 create mode 100644 skills/using-web-backend/microservices-architecture.md
 create mode 100644 skills/using-web-backend/rest-api-design.md

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
new file mode 100644
index 0000000..82d14f1
--- /dev/null
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,12 @@
+{
+  "name": "axiom-web-backend",
+  "description": "Web backend development expertise across FastAPI, Django, Express, REST/GraphQL APIs, microservices, and production deployment patterns",
+  "version": "1.0.2",
+  "author": {
+    "name": "tachyon-beep",
+    "email": "zhongweili@tubi.tv"
+  },
+  "skills": [
+    "./skills"
+  ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a750686
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# axiom-web-backend
+
+Web backend development expertise across FastAPI, Django, Express, REST/GraphQL APIs, microservices, and production deployment patterns
diff --git a/plugin.lock.json b/plugin.lock.json
new file mode 100644
index 0000000..d9902b4
--- /dev/null
+++ b/plugin.lock.json
@@ -0,0 +1,89 @@
+{
+  "$schema": "internal://schemas/plugin.lock.v1.json",
+  "pluginId": "gh:tachyon-beep/skillpacks:plugins/axiom-web-backend",
+  "normalized": {
+    "repo": null,
+    "ref": "refs/tags/v20251128.0",
+    "commit": "0764e5b504aacbd4da2f48bdec21358663278d74",
+    "treeHash": "364f2c0aaeb1af51d1ccf2720bc551e971fce5f23cd7f2c2b18f63f76a1f6ac7",
+    "generatedAt": "2025-11-28T10:28:31.553642Z",
+    "toolVersion": "publish_plugins.py@0.2.0"
+  },
+  "origin": {
+    "remote": "git@github.com:zhongweili/42plugin-data.git",
+    "branch": "master",
+    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
+    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
+  },
+  "manifest": {
+    "name": "axiom-web-backend",
+    "description": "Web backend development expertise across FastAPI, Django, Express, REST/GraphQL APIs, microservices, and production deployment patterns",
+    "version": "1.0.2"
+  },
+  "content": {
+    "files": [
+      {
+        "path": "README.md",
+        "sha256": "f3e380170674e70ad093e5df7e66f9da0491072355d9e909bd2739bf674f46a2"
+      },
+      {
+        "path": ".claude-plugin/plugin.json",
+        "sha256": "041ef2dbea7ec879a9b9b0ad4c19907669881b319c0833a96ca9ff4c618f6cf8"
+      },
+      {
+        "path": "skills/using-web-backend/database-integration.md",
+        "sha256": "67d46b6ba3e8c054d5b5e2f0a7834de8b8449f2a3f02522bc47176f928efb41a"
+      },
+      {
+        "path": "skills/using-web-backend/api-authentication.md",
+        "sha256": "a39ba1396e008ab506196585cbac3ca1b05d22890db6dce3fee1a74ea346330f"
+      },
+      {
+        "path": "skills/using-web-backend/fastapi-development.md",
+        "sha256": "5625b5728fe03930ddc68fd37457eee2e86037e78b08fed0dd319b11d05eb40d"
+      },
+      {
+        "path": "skills/using-web-backend/rest-api-design.md",
+        "sha256": "612c2c0e04c74868b2f623c1802bcd83b4a4d10958e599a30260eb0be1f477e1"
+      },
+      {
+        "path": "skills/using-web-backend/microservices-architecture.md",
+        "sha256": "b0deaa7b2652ce38dd653402db0feb6c41dc584b3013f2d44d1e68466158d75b"
+      },
+      {
+        "path": "skills/using-web-backend/api-documentation.md",
+        "sha256": "b385ed6a4b78ac2c43cafe90cba81e944574073023a4c6b10d82cf8d48729987"
+      },
+      {
+        "path": "skills/using-web-backend/api-testing.md",
+        "sha256": "bdd989e1ce000b8cff10bfb7a60e68beb12278cb8161805e90855dca16bf4946"
+      },
+      {
+        "path": "skills/using-web-backend/message-queues.md",
+        "sha256": "82069fb6e4d2fea25c60c1ea762a5addf5a8649b67f451fa2c7f626b6601c670"
+      },
+      {
+        "path": "skills/using-web-backend/graphql-api-design.md",
+        "sha256": "5415ec8cec79237f4c60cb1c7949b7091028f669f95853af3baf15c25bf83e55"
+      },
+      {
+        "path": "skills/using-web-backend/SKILL.md",
+        "sha256": "eb0a96a46e863f347512c29ef57624657c3380f7f056023339d3679771659541"
+      },
+      {
+        "path": "skills/using-web-backend/express-development.md",
+        "sha256": "af60cd4f592ea5ba20ff4749e2337932278fb3f4b21fe36834b9a07c6c481fb5"
+      },
+      {
+        "path": "skills/using-web-backend/django-development.md",
+        "sha256": "338e5ac3e7d6cda4138618abe42c0af640497b26e74b1f0c934cf97a493ab078"
+      }
+    ],
+    "dirSha256": "364f2c0aaeb1af51d1ccf2720bc551e971fce5f23cd7f2c2b18f63f76a1f6ac7"
+  },
+  "security": {
+    "scannedAt": null,
+    "scannerVersion": null,
+    "flags": []
+  }
+}
\ No newline at end of file
diff --git a/skills/using-web-backend/SKILL.md b/skills/using-web-backend/SKILL.md
new file mode 100644
index 0000000..aad2b1e
--- /dev/null
+++ b/skills/using-web-backend/SKILL.md
@@ -0,0 +1,135 @@
+---
+name: using-web-backend
+description: Use when building web APIs, backend services, or encountering FastAPI/Django/Express/GraphQL questions, microservices architecture, authentication, or message queues - routes to 11 specialist skills rather than giving surface-level generic advice
+---
+
+# Using Web Backend Skills
+
+## Overview
+
+**This router directs you to specialized web backend skills. Each specialist provides deep expertise in their domain.**
+
+**Core principle:** Different backend challenges require different specialist knowledge. Routing to the right skill gives better results than generic advice.
+
+## When to Use
+
+Use this router when encountering:
+
+- **Framework-specific questions**: FastAPI, Django, Express implementation details
+- **API design**: REST or GraphQL architecture, versioning, schema design
+- **Architecture patterns**: Microservices, message queues, event-driven systems
+- **Backend infrastructure**: Authentication, database integration, deployment
+- **Testing & documentation**: API testing strategies, documentation approaches
+
+## Quick Reference - Routing Table
+
+| User Question Contains | Route To | Why |
+|------------------------|----------|-----|
+| FastAPI, Pydantic, async Python APIs | [fastapi-development.md](fastapi-development.md) | FastAPI-specific patterns, dependency injection, async |
+| Django, ORM, views, middleware | [django-development.md](django-development.md) | Django conventions, ORM optimization, settings |
+| Express, Node.js backend, middleware | [express-development.md](express-development.md) | Express patterns, error handling, async flow |
+| REST API, endpoints, versioning, pagination | [rest-api-design.md](rest-api-design.md) | REST principles, resource design, hypermedia |
+| GraphQL, schema, resolvers, N+1 | [graphql-api-design.md](graphql-api-design.md) | Schema design, query optimization, federation |
+| Microservices, service mesh, boundaries | [microservices-architecture.md](microservices-architecture.md) | Service design, communication, consistency |
+| Message queues, RabbitMQ, Kafka, events | [message-queues.md](message-queues.md) | Queue patterns, reliability, event-driven |
+| JWT, OAuth2, API keys, auth | [api-authentication.md](api-authentication.md) | Auth patterns, token management, security |
+| Database connections, ORM, migrations | [database-integration.md](database-integration.md) | Connection pooling, query optimization, migrations |
+| API testing, integration tests, mocking | [api-testing.md](api-testing.md) | Testing strategies, contract testing, mocking |
+| OpenAPI, Swagger, API docs | [api-documentation.md](api-documentation.md) | API docs (also see: muna-technical-writer) |
+
+## Cross-References to Other Packs
+
+**Before routing, check if these packs are more appropriate:**
+
+- **Security concerns** → `ordis-security-architect` (threat modeling, OWASP, security patterns)
+- **API usability/UX** → `lyra-ux-designer` (error messages, API ergonomics)
+- **Python code patterns** → `axiom-python-engineering` (Python-specific engineering)
+- **Documentation writing** → `muna-technical-writer` (technical writing, clarity)
+
+## How to Route
+
+**STOP: Do not attempt to answer web backend questions yourself.**
+
+**Instead:**
+
+1. **Identify the specialist domain** from the routing table above
+2. **State which specialist skill you're using**: "I'll use the `[skill-name]` skill for this"
+3. **Apply the specialist skill** to provide deep, expert guidance
+
+## Red Flags - Using Generic Knowledge Instead of Specialists
+
+If you catch yourself doing any of these, STOP and route to a specialist:
+
+- ❌ "I'll answer all these diverse backend questions myself"
+- ❌ Giving code examples from "general backend knowledge"
+- ❌ Providing "competent but broad" answers across multiple domains
+- ❌ Not mentioning that specialized skills would give better guidance
+- ❌ "The specialist skill isn't available right now, so I'll answer myself"
+
+**All of these mean: You're being a jack-of-all-trades. Route to the specialist.**
+
+## Rationalization Table
+
+| Excuse | Reality |
+|--------|---------|
+| "I can handle this backend question myself" | Specialists provide deeper expertise than generic knowledge |
+| "These are all backend topics" | Backend spans 11 distinct specialist domains |
+| "I'll just give a quick answer" | Quick answers miss edge cases and production patterns |
+| "The user didn't ask for specialized help" | Better results come from routing to specialists proactively |
+| "Specialist skill not available, I'll answer instead" | State which skill would help, explain routing, guide user to install if needed |
+
+## Example Routing
+
+**User**: "I need to build a REST API with FastAPI. How should I structure my dependency injection?"
+
+**Your response**:
+"I'll use the [fastapi-development.md](fastapi-development.md) skill to help you structure dependency injection properly. This specialist skill covers FastAPI patterns in depth."
+
+[Then apply fastapi-development skill]
+
+**User**: "My GraphQL API has N+1 problems and we're splitting our monolith into microservices."
+
+**Your response**:
+"You have two distinct specialist areas:
+1. GraphQL optimization → I'll use [graphql-api-design.md](graphql-api-design.md)
+2. Microservices architecture → I'll use [microservices-architecture.md](microservices-architecture.md)
+
+Let me address the GraphQL N+1 problem first with the graphql-api-design skill..."
+
+[Apply each specialist skill to its domain]
+
+## Why This Matters
+
+**Without routing**: Surface-level answers covering multiple domains broadly
+**With routing**: Deep expertise addressing edge cases, production patterns, and domain-specific best practices
+
+Specialist skills = better results.
+
+---
+
+## Web Backend Specialist Skills Catalog
+
+After routing, load the appropriate specialist skill for detailed guidance:
+
+### Framework-Specific Skills
+
+1. [fastapi-development.md](fastapi-development.md) - FastAPI patterns, dependency injection, async/await, Pydantic validation, background tasks
+2. [django-development.md](django-development.md) - Django conventions, ORM optimization, middleware, settings, management commands
+3. [express-development.md](express-development.md) - Express patterns, middleware chains, error handling, async flow control
+
+### API Design Skills
+
+4. [rest-api-design.md](rest-api-design.md) - REST principles, resource design, versioning, pagination, HATEOAS, HTTP semantics
+5. [graphql-api-design.md](graphql-api-design.md) - GraphQL schema design, resolver patterns, N+1 query optimization, federation
+
+### Architecture & Infrastructure
+
+6. [microservices-architecture.md](microservices-architecture.md) - Service boundaries, communication patterns, distributed consistency, service mesh
+7. [message-queues.md](message-queues.md) - Queue patterns, reliability guarantees, event-driven architecture, RabbitMQ/Kafka
+
+### Cross-Cutting Concerns
+
+8. [api-authentication.md](api-authentication.md) - JWT, OAuth2, API keys, token management, auth patterns
+9. [database-integration.md](database-integration.md) - Connection pooling, query optimization, migrations, ORM patterns
+10. [api-testing.md](api-testing.md) - Testing strategies, contract testing, integration tests, mocking
+11. [api-documentation.md](api-documentation.md) - OpenAPI/Swagger, API documentation patterns, schema generation
diff --git a/skills/using-web-backend/api-authentication.md b/skills/using-web-backend/api-authentication.md
new file mode 100644
index 0000000..ba0bff0
--- /dev/null
+++ b/skills/using-web-backend/api-authentication.md
@@ -0,0 +1,1381 @@
+
+# API Authentication
+
+## Overview
+
+**API authentication specialist covering token patterns, OAuth2 flows, security hardening, compliance, monitoring, and production operations.**
+
+**Core principle**: Authentication proves identity; authorization controls access - implement defense-in-depth with short-lived tokens, secure storage, rotation, monitoring, and assume breach to minimize blast radius.
+
+## When to Use This Skill
+
+Use when encountering:
+
+- **Authentication strategy**: JWT vs sessions vs OAuth2 vs API keys
+- **OAuth2 flows**: Authorization Code, PKCE, Client Credentials, token exchange
+- **Token security**: Storage, rotation, revocation, theft detection
+- **Service-to-service**: mTLS, service mesh, zero-trust
+- **Mobile auth**: Secure storage, biometrics, certificate pinning
+- **Security hardening**: Rate limiting, abuse prevention, anomaly detection
+- **Monitoring**: Auth metrics, distributed tracing, audit logs
+- **Compliance**: GDPR, PCI-DSS, SOC 2, audit trails
+- **Multi-tenancy**: Tenant isolation, per-tenant policies
+- **Testing**: Mock auth, development workflows
+
+**Do NOT use for**:
+- Application-specific business logic → Use domain skills
+- Infrastructure security (firewalls, IDS) → `ordis-security-architect`
+- Frontend auth UI → `lyra-ux-designer`
+
+## Quick Reference - Authentication Patterns
+
+| Pattern | Use Case | Security | Complexity | Revocation |
+|---------|----------|----------|------------|------------|
+| **JWT** | Mobile apps, APIs | Medium | Low | Hard (requires blacklist) |
+| **Sessions** | Web apps, admin panels | High | Medium | Easy (delete session) |
+| **OAuth2** | Third-party access, SSO | High | High | Medium (refresh rotation) |
+| **API Keys** | Service-to-service, webhooks | Medium | Low | Easy (rotate keys) |
+| **mTLS** | Service mesh, zero-trust | Very High | High | Medium (cert revocation) |
+
+## JWT vs Sessions Decision Matrix
+
+| Factor | JWT | Server-Side Sessions | Winner |
+|--------|-----|---------------------|--------|
+| **Mobile apps** | Excellent (stateless) | Poor (sticky sessions needed) | JWT |
+| **Horizontal scaling** | Excellent (no shared state) | Requires sticky sessions or Redis | JWT |
+| **Revocation** | Poor (need blacklist or short TTL) | Excellent (delete session) | Sessions |
+| **Payload size** | Large (sent every request) | Small (session ID only) | Sessions |
+| **Server memory** | None (stateless) | High (session store) | JWT |
+| **XSS vulnerability** | High (if stored in localStorage) | Low (httpOnly cookies) | Sessions |
+| **CSRF vulnerability** | None (bearer token) | High (requires CSRF tokens) | JWT |
+
+**Production Recommendation**: **Hybrid Approach**
+
+```
+Architecture:
+- Short-lived JWTs (15 min) for API access
+- Long-lived refresh tokens stored server-side (session-like)
+- Refresh endpoint returns new JWT + rotates refresh token
+
+Benefits:
+- Stateless API access (JWT)
+- Secure revocation (server-side refresh tokens)
+- Mobile-friendly (no cookies required)
+- Horizontal scaling (minimal session state)
+```
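+
+A minimal sketch of the hybrid issue flow, assuming an RS256 key pair and a server-side `db.refreshTokens` store (both names are illustrative, not a specific library API):
+
+```javascript
+const crypto = require('crypto');
+const jwt = require('jsonwebtoken');
+
+async function issueTokens(user, privateKey) {
+  // Short-lived, stateless access token (15 min)
+  const accessToken = jwt.sign(
+    { sub: user.id, scopes: user.scopes },
+    privateKey,
+    { algorithm: 'RS256', expiresIn: '15m' }
+  );
+
+  // Opaque refresh token, persisted server-side so it can be revoked
+  const refreshToken = crypto.randomBytes(32).toString('base64url');
+  await db.refreshTokens.insert({ // assumption: your token store
+    tokenHash: crypto.createHash('sha256').update(refreshToken).digest('hex'),
+    userId: user.id,
+    expiresAt: new Date(Date.now() + 7 * 24 * 60 * 60 * 1000) // 7 days
+  });
+
+  return { access_token: accessToken, refresh_token: refreshToken, expires_in: 900 };
+}
+```
+
+The refresh endpoint then validates the stored hash, rotates the refresh token, and mints a new JWT - see the rotation pattern later in this skill.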
+|-------------|-----------|----------|----------| +| **Web app (server-side)** | Authorization Code + PKCE | High | User login with backend | +| **SPA** | Authorization Code + PKCE | Medium-High | React/Vue/Angular apps | +| **Mobile app** | Authorization Code + PKCE | High | iOS/Android apps | +| **Service-to-service** | Client Credentials | High | Background jobs, APIs | +| **Device** | Device Authorization Grant | Medium | Smart TV, IoT devices | +| **Legacy** | ~~Password Grant~~ | DEPRECATED | Don't use | + +### Authorization Code + PKCE (RFC 7636) + +**Why PKCE?** Prevents authorization code interception attacks + +```javascript +// Step 1: Generate PKCE challenge +const codeVerifier = crypto.randomBytes(32).toString('base64url'); +const codeChallenge = crypto + .createHash('sha256') + .update(codeVerifier) + .digest('base64url'); + +// Step 2: Redirect to authorization endpoint +const authUrl = new URL('https://auth.example.com/authorize'); +authUrl.searchParams.set('response_type', 'code'); +authUrl.searchParams.set('client_id', 'your_client_id'); +authUrl.searchParams.set('redirect_uri', 'https://yourapp.com/callback'); +authUrl.searchParams.set('scope', 'read write offline_access'); +authUrl.searchParams.set('code_challenge', codeChallenge); +authUrl.searchParams.set('code_challenge_method', 'S256'); +authUrl.searchParams.set('state', generateStateToken()); // CSRF protection + +// Step 3: Exchange code for token +const tokenResponse = await fetch('https://auth.example.com/token', { + method: 'POST', + headers: { 'Content-Type': 'application/x-www-form-urlencoded' }, + body: new URLSearchParams({ + grant_type: 'authorization_code', + code: receivedCode, + redirect_uri: 'https://yourapp.com/callback', + client_id: 'your_client_id', + code_verifier: codeVerifier // Proves you initiated the flow + }) +}); + +// Response +{ + "access_token": "eyJhbGc...", + "token_type": "Bearer", + "expires_in": 900, + "refresh_token": "zxcvbnm...", + "scope": "read write offline_access" +} +``` + +### Client Credentials (Service-to-Service) + +```javascript +const tokenResponse = await fetch('https://auth.example.com/token', { + method: 'POST', + headers: { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Authorization': `Basic ${base64(client_id + ':' + client_secret)}` + }, + body: new URLSearchParams({ + grant_type: 'client_credentials', + scope: 'api.read api.write', + audience: 'https://api.example.com' + }) +}); +``` + +## Token Storage Security + +### Storage Security Matrix + +| Storage Location | XSS Risk | CSRF Risk | Accessible to JS | Production Use | +|------------------|----------|-----------|------------------|----------------| +| **localStorage** | ❌ HIGH | ✅ None | Yes | NEVER for tokens | +| **sessionStorage** | ❌ HIGH | ✅ None | Yes | NEVER for tokens | +| **Memory only** | ✅ None | ✅ None | Yes (in-app) | ✅ Access tokens (SPA) | +| **httpOnly cookie** | ✅ None | ❌ HIGH | No | ✅ Refresh tokens (+SameSite) | +| **Secure + httpOnly + SameSite=Strict** | ✅ None | ✅ Low | No | ✅ BEST for web | +| **iOS Keychain** | ✅ None | ✅ N/A | No (secure enclave) | ✅ Mobile apps | +| **Android Keystore** | ✅ None | ✅ N/A | No (hardware-backed) | ✅ Mobile apps | + +### Web App Pattern (BFF - Backend For Frontend) + +```javascript +// Frontend - access token in memory only +class AuthService { + #accessToken = null; // Private field, lost on refresh + + async callAPI(endpoint) { + if (!this.#accessToken || this.isExpired(this.#accessToken)) { + this.#accessToken = await 
this.refreshAccessToken(); + } + + return fetch(endpoint, { + headers: { 'Authorization': `Bearer ${this.#accessToken}` } + }); + } + + async refreshAccessToken() { + // Calls BFF, which reads httpOnly cookie + const response = await fetch('/api/auth/refresh', { + method: 'POST', + credentials: 'include' // Send httpOnly cookie + }); + + const { access_token } = await response.json(); + return access_token; + } +} + +// Backend (BFF) - refresh endpoint +app.post('/api/auth/refresh', async (req, res) => { + const refreshToken = req.cookies.refresh_token; // httpOnly cookie + + // Validate and rotate refresh token + const newTokens = await rotateRefreshToken(refreshToken); + + // Set new httpOnly cookie + res.cookie('refresh_token', newTokens.refresh_token, { + httpOnly: true, + secure: true, + sameSite: 'strict', + maxAge: 7 * 24 * 60 * 60 * 1000 // 7 days + }); + + res.json({ access_token: newTokens.access_token, expires_in: 900 }); +}); +``` + +### Mobile App Pattern + +```swift +// iOS - Keychain storage +import Security + +class TokenStorage { + func saveToken(_ token: String, forKey key: String) { + let data = token.data(using: .utf8)! + + let query: [String: Any] = [ + kSecClass as String: kSecClassGenericPassword, + kSecAttrAccount as String: key, + kSecValueData as String: data, + kSecAttrAccessible as String: kSecAttrAccessibleWhenUnlockedThisDeviceOnly + ] + + SecItemDelete(query as CFDictionary) // Delete old + SecItemAdd(query as CFDictionary, nil) // Add new + } + + func getToken(forKey key: String) -> String? { + let query: [String: Any] = [ + kSecClass as String: kSecClassGenericPassword, + kSecAttrAccount as String: key, + kSecReturnData as String: true + ] + + var result: AnyObject? + SecItemCopyMatching(query as CFDictionary, &result) + + guard let data = result as? Data else { return nil } + return String(data: data, encoding: .utf8) + } +} +``` + +```kotlin +// Android - EncryptedSharedPreferences +import androidx.security.crypto.EncryptedSharedPreferences +import androidx.security.crypto.MasterKey + +class TokenStorage(context: Context) { + private val masterKey = MasterKey.Builder(context) + .setKeyScheme(MasterKey.KeyScheme.AES256_GCM) + .build() + + private val prefs = EncryptedSharedPreferences.create( + context, + "secure_prefs", + masterKey, + EncryptedSharedPreferences.PrefKeyEncryptionScheme.AES256_SIV, + EncryptedSharedPreferences.PrefValueEncryptionScheme.AES256_GCM + ) + + fun saveToken(key: String, token: String) { + prefs.edit().putString(key, token).apply() + } + + fun getToken(key: String): String? 
{ + return prefs.getString(key, null) + } +} +``` + +## Refresh Token Rotation + +### Pattern: Token Families with Replay Detection + +```javascript +// Database schema +CREATE TABLE refresh_tokens ( + token_hash VARCHAR(64) PRIMARY KEY, + user_id UUID NOT NULL, + family_id UUID NOT NULL, + parent_token_hash VARCHAR(64), + device_id VARCHAR(255), + ip_address INET, + user_agent TEXT, + created_at TIMESTAMP NOT NULL, + expires_at TIMESTAMP NOT NULL, + revoked BOOLEAN DEFAULT false, + revoked_at TIMESTAMP, + revoked_reason TEXT, + INDEX idx_family (family_id), + INDEX idx_user (user_id), + INDEX idx_expires (expires_at) +); + +// Refresh endpoint with rotation +async function refreshTokens(refreshToken, clientInfo) { + const tokenHash = sha256(refreshToken); + const dbToken = await db.query( + 'SELECT * FROM refresh_tokens WHERE token_hash = $1', + [tokenHash] + ); + + // Case 1: Token not found or already revoked + if (!dbToken || dbToken.revoked) { + // Check if this token existed in history + const historical = await db.query( + 'SELECT family_id FROM refresh_tokens WHERE token_hash = $1', + [tokenHash] + ); + + if (historical.length > 0) { + // REPLAY ATTACK DETECTED! + // Revoke entire token family + await db.query( + 'UPDATE refresh_tokens SET revoked = true, revoked_at = NOW(), ' + + 'revoked_reason = $1 WHERE family_id = $2', + ['Replay attack detected', historical[0].family_id] + ); + + await auditLog.critical({ + event: 'token_replay_attack', + user_id: historical[0].user_id, + family_id: historical[0].family_id, + ip: clientInfo.ip + }); + + throw new SecurityError('Token reuse detected - all sessions revoked'); + } + + throw new AuthError('Invalid refresh token'); + } + + // Case 2: Token expired + if (dbToken.expires_at < new Date()) { + throw new AuthError('Refresh token expired'); + } + + // Case 3: Valid token - rotate it + const newRefreshToken = crypto.randomBytes(32).toString('base64url'); + const newAccessToken = generateJWT({ + sub: dbToken.user_id, + scopes: ['read', 'write'], + exp: Math.floor(Date.now() / 1000) + 900 // 15 min + }); + + // Revoke current token + await db.query( + 'UPDATE refresh_tokens SET revoked = true WHERE token_hash = $1', + [tokenHash] + ); + + // Create new token in same family + await db.query( + 'INSERT INTO refresh_tokens ' + + '(token_hash, user_id, family_id, parent_token_hash, device_id, ' + + 'ip_address, user_agent, created_at, expires_at) ' + + 'VALUES ($1, $2, $3, $4, $5, $6, $7, NOW(), NOW() + INTERVAL \'7 days\')', + [ + sha256(newRefreshToken), + dbToken.user_id, + dbToken.family_id, // Same family + tokenHash, // Track lineage + clientInfo.device_id, + clientInfo.ip, + clientInfo.user_agent + ] + ); + + return { + access_token: newAccessToken, + refresh_token: newRefreshToken, + expires_in: 900, + token_type: 'Bearer' + }; +} +``` + +### Advanced Refresh Patterns + +**Absolute expiry** (max lifetime regardless of rotation): + +```javascript +// Add max_family_age to family tracking +CREATE TABLE token_families ( + family_id UUID PRIMARY KEY, + user_id UUID NOT NULL, + created_at TIMESTAMP NOT NULL, + max_lifetime_hours INT DEFAULT 720, // 30 days max + INDEX idx_user (user_id) +); + +// Check absolute expiry +const familyAge = Date.now() - family.created_at; +const maxAge = family.max_lifetime_hours * 60 * 60 * 1000; + +if (familyAge > maxAge) { + throw new AuthError('Session expired - please re-authenticate'); +} +``` + +**Grace period for concurrent requests**: + +```javascript +// Allow small window for race conditions +const 
ROTATION_GRACE_PERIOD_MS = 5000; // 5 seconds + +if (dbToken.revoked && dbToken.revoked_at) { + const timeSinceRevocation = Date.now() - dbToken.revoked_at; + + if (timeSinceRevocation < ROTATION_GRACE_PERIOD_MS) { + // Within grace period - might be concurrent refresh + // Return cached new tokens instead of replay alert + const newTokens = await getChildToken(tokenHash); + if (newTokens) return newTokens; + } + + // Outside grace period - likely replay attack + await revokeTokenFamily(dbToken.family_id); +} +``` + +## Rate Limiting & Abuse Prevention + +### Authentication Endpoint Rate Limits + +```javascript +const rateLimit = require('express-rate-limit'); +const RedisStore = require('rate-limit-redis'); + +// Login endpoint - strict limits +const loginLimiter = rateLimit({ + store: new RedisStore({ client: redisClient }), + windowMs: 15 * 60 * 1000, // 15 minutes + max: 5, // Max 5 attempts + message: 'Too many login attempts, please try again later', + keyGenerator: (req) => { + // Rate limit by IP + username combination + return `login:${req.ip}:${req.body.username}`; + }, + handler: (req, res) => { + auditLog.warning({ + event: 'rate_limit_exceeded', + endpoint: '/auth/login', + ip: req.ip, + username: req.body.username + }); + + res.status(429).json({ + error: 'rate_limit_exceeded', + retry_after: res.getHeader('Retry-After') + }); + } +}); + +app.post('/auth/login', loginLimiter, async (req, res) => { + // Login logic +}); + +// Refresh endpoint - moderate limits +const refreshLimiter = rateLimit({ + store: new RedisStore({ client: redisClient }), + windowMs: 60 * 1000, // 1 minute + max: 10, // 10 refreshes per minute + keyGenerator: (req) => `refresh:${req.ip}` +}); + +app.post('/auth/refresh', refreshLimiter, async (req, res) => { + // Refresh logic +}); +``` + +### Account Lockout After Failed Attempts + +```javascript +async function attemptLogin(username, password, clientInfo) { + const lockoutKey = `lockout:${username}`; + const attemptsKey = `attempts:${username}`; + + // Check if account is locked + const lockedUntil = await redis.get(lockoutKey); + if (lockedUntil && Date.now() < parseInt(lockedUntil)) { + throw new AuthError('Account temporarily locked due to failed login attempts'); + } + + // Verify credentials + const user = await db.findUser(username); + const valid = await bcrypt.compare(password, user.password_hash); + + if (!valid) { + // Increment failed attempts + const attempts = await redis.incr(attemptsKey); + await redis.expire(attemptsKey, 15 * 60); // 15 min window + + if (attempts >= 5) { + // Lock account for 30 minutes + const lockUntil = Date.now() + 30 * 60 * 1000; + await redis.set(lockoutKey, lockUntil.toString(), 'EX', 30 * 60); + + await auditLog.warning({ + event: 'account_locked', + user_id: user.id, + attempts, + ip: clientInfo.ip + }); + + throw new AuthError('Account locked due to too many failed attempts'); + } + + throw new AuthError('Invalid credentials'); + } + + // Success - clear attempts + await redis.del(attemptsKey); + + // Check for anomalies + await detectAnomalies(user.id, clientInfo); + + return generateTokens(user); +} +``` + +### Anomaly Detection + +```javascript +async function detectAnomalies(userId, clientInfo) { + // Get user's login history + const recentLogins = await db.query( + 'SELECT ip_address, country, city FROM login_history ' + + 'WHERE user_id = $1 AND created_at > NOW() - INTERVAL \'30 days\' ' + + 'ORDER BY created_at DESC LIMIT 100', + [userId] + ); + + // Check for new location + const knownLocations = new 
Set(recentLogins.map(l => `${l.country}:${l.city}`)); + const currentLocation = `${clientInfo.country}:${clientInfo.city}`; + + if (!knownLocations.has(currentLocation)) { + // New location - require additional verification + await sendSecurityAlert(userId, { + type: 'new_location', + location: currentLocation, + ip: clientInfo.ip + }); + + // Could require: + // - Email verification + // - 2FA challenge + // - Security question + // - Temporary session with limited access + } + + // Check for impossible travel + if (recentLogins.length > 0) { + const lastLogin = recentLogins[0]; + const timeDiff = Date.now() - lastLogin.created_at; + const distance = calculateDistance( + lastLogin.country, + clientInfo.country + ); + + // If 500+ km traveled in < 1 hour, flag as suspicious + if (distance > 500 && timeDiff < 60 * 60 * 1000) { + await auditLog.warning({ + event: 'impossible_travel', + user_id: userId, + from: lastLogin.country, + to: clientInfo.country, + time_diff_minutes: timeDiff / 60000 + }); + + // Require step-up authentication + return { require_2fa: true }; + } + } +} +``` + +## Monitoring & Observability + +### Key Metrics to Track + +| Metric | Alert Threshold | Why It Matters | +|--------|----------------|----------------| +| **Login success rate** | < 80% | Credentials issues, attacks | +| **Token refresh failures** | > 5% | Rotation bugs, clock skew | +| **Rate limit hits** | > 100/hour | Brute force attempts | +| **Account lockouts** | > 10/hour | Credential stuffing attack | +| **Token replay attempts** | > 0 | Security breach | +| **Failed 2FA attempts** | > 3/user/day | Account compromise | +| **New device logins** | Monitor trends | Unusual activity | +| **p99 auth latency** | > 500ms | Performance degradation | + +### Distributed Tracing for Auth Flows + +```javascript +const { trace, context } = require('@opentelemetry/api'); + +const tracer = trace.getTracer('auth-service'); + +async function handleLogin(req, res) { + return tracer.startActiveSpan('auth.login', async (span) => { + span.setAttribute('user.username', req.body.username); + span.setAttribute('client.ip', req.ip); + span.setAttribute('client.user_agent', req.headers['user-agent']); + + try { + // Nested span for credential validation + const user = await tracer.startActiveSpan('auth.validate_credentials', async (validateSpan) => { + const result = await validateCredentials(req.body.username, req.body.password); + validateSpan.setAttribute('validation.success', !!result); + validateSpan.end(); + return result; + }); + + if (!user) { + span.setAttribute('auth.result', 'invalid_credentials'); + throw new AuthError('Invalid credentials'); + } + + // Nested span for token generation + const tokens = await tracer.startActiveSpan('auth.generate_tokens', async (tokenSpan) => { + const result = await generateTokens(user); + tokenSpan.setAttribute('tokens.access_expiry', result.expires_in); + tokenSpan.end(); + return result; + }); + + span.setAttribute('auth.result', 'success'); + span.setAttribute('user.id', user.id); + + res.json(tokens); + } catch (error) { + span.recordException(error); + span.setAttribute('auth.result', 'error'); + throw error; + } finally { + span.end(); + } + }); +} + +// Trace shows: +// auth.login (500ms) +// ├── auth.validate_credentials (300ms) // DB query +// ├── auth.generate_tokens (50ms) // JWT signing +// └── auth.audit_log (150ms) // Logging + +// Can identify bottlenecks: +// - Slow password hashing (increase bcrypt rounds?) +// - Slow DB queries (add indexes?) 
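+
+One way to emit these metrics, sketched with the `prom-client` library (metric and label names are illustrative; `attemptLogin` is the function from the lockout example above and `clientInfo` is an assumed request-parsing helper):
+
+```javascript
+const client = require('prom-client');
+
+const loginAttempts = new client.Counter({
+  name: 'auth_login_attempts_total',
+  help: 'Login attempts by result',
+  labelNames: ['result'] // success | failure
+});
+
+const authLatency = new client.Histogram({
+  name: 'auth_request_duration_seconds',
+  help: 'Auth endpoint latency',
+  buckets: [0.05, 0.1, 0.25, 0.5, 1, 2] // p99 alert threshold sits at 0.5
+});
+
+async function timedLogin(req, res) {
+  const end = authLatency.startTimer(); // observe on completion
+  try {
+    const tokens = await attemptLogin(req.body.username, req.body.password, clientInfo(req));
+    loginAttempts.inc({ result: 'success' });
+    res.json(tokens);
+  } catch (err) {
+    loginAttempts.inc({ result: 'failure' });
+    throw err;
+  } finally {
+    end();
+  }
+}
+```
+
+Alerting rules (success rate below 80%, replay attempts above zero) can then be expressed over these series in your monitoring stack.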
+
+### Distributed Tracing for Auth Flows
+
+```javascript
+const { trace, context } = require('@opentelemetry/api');
+
+const tracer = trace.getTracer('auth-service');
+
+async function handleLogin(req, res) {
+  return tracer.startActiveSpan('auth.login', async (span) => {
+    span.setAttribute('user.username', req.body.username);
+    span.setAttribute('client.ip', req.ip);
+    span.setAttribute('client.user_agent', req.headers['user-agent']);
+
+    try {
+      // Nested span for credential validation
+      const user = await tracer.startActiveSpan('auth.validate_credentials', async (validateSpan) => {
+        const result = await validateCredentials(req.body.username, req.body.password);
+        validateSpan.setAttribute('validation.success', !!result);
+        validateSpan.end();
+        return result;
+      });
+
+      if (!user) {
+        span.setAttribute('auth.result', 'invalid_credentials');
+        throw new AuthError('Invalid credentials');
+      }
+
+      // Nested span for token generation
+      const tokens = await tracer.startActiveSpan('auth.generate_tokens', async (tokenSpan) => {
+        const result = await generateTokens(user);
+        tokenSpan.setAttribute('tokens.access_expiry', result.expires_in);
+        tokenSpan.end();
+        return result;
+      });
+
+      span.setAttribute('auth.result', 'success');
+      span.setAttribute('user.id', user.id);
+
+      res.json(tokens);
+    } catch (error) {
+      span.recordException(error);
+      span.setAttribute('auth.result', 'error');
+      throw error;
+    } finally {
+      span.end();
+    }
+  });
+}
+
+// Trace shows:
+// auth.login (500ms)
+// ├── auth.validate_credentials (300ms)  // DB query
+// ├── auth.generate_tokens (50ms)        // JWT signing
+// └── auth.audit_log (150ms)             // Logging
+
+// Can identify bottlenecks:
+// - Slow password hashing (increase bcrypt rounds?)
+// - Slow DB queries (add indexes?)
+// - Network latency to Redis
+```
+
+### Audit Logging
+
+```javascript
+class AuditLogger {
+  async log(event) {
+    const entry = {
+      timestamp: new Date().toISOString(),
+      event_type: event.type,
+      user_id: event.user_id,
+      ip_address: event.ip,
+      user_agent: event.user_agent,
+      resource: event.resource,
+      action: event.action,
+      result: event.result,
+      metadata: event.metadata,
+      trace_id: trace.getActiveSpan()?.spanContext().traceId // via @opentelemetry/api
+    };
+
+    // Write to multiple destinations
+    await Promise.all([
+      // 1. Append-only audit table (compliance)
+      db.query('INSERT INTO audit_log (...) VALUES (...)', entry),
+
+      // 2. Time-series database (analytics)
+      influxdb.write('auth_events', entry),
+
+      // 3. SIEM (security monitoring)
+      siem.send(entry),
+
+      // 4. Compliance log (immutable, encrypted)
+      complianceLog.append(encrypt(entry))
+    ]);
+  }
+
+  async critical(event) {
+    await this.log({ ...event, severity: 'critical' });
+
+    // Alert on critical events
+    await alerting.send({
+      title: `Critical Auth Event: ${event.event_type}`,
+      details: event,
+      severity: 'critical'
+    });
+  }
+}
+
+// Usage
+await auditLog.log({
+  type: 'login_success',
+  user_id: user.id,
+  ip: req.ip,
+  user_agent: req.headers['user-agent'],
+  result: 'success'
+});
+
+await auditLog.critical({
+  type: 'token_replay_attack',
+  user_id: user.id,
+  family_id: token.family_id,
+  ip: req.ip
+});
+```
+
+## Multi-Tenancy Patterns
+
+### Tenant Isolation in Tokens
+
+```javascript
+// JWT with tenant claim
+const accessToken = jwt.sign({
+  sub: user.id,
+  tenant_id: user.tenant_id, // Tenant isolation
+  tenant_tier: tenant.tier, // For rate limiting
+  roles: user.roles, // ['admin', 'user']
+  scopes: ['read:orders', 'write:orders'],
+  iss: 'https://auth.example.com',
+  aud: 'https://api.example.com',
+  exp: Math.floor(Date.now() / 1000) + 900
+}, privateKey, { algorithm: 'RS256' });
+
+// Middleware to enforce tenant isolation
+function tenantIsolation(req, res, next) {
+  const token = verifyJWT(req.headers.authorization);
+
+  // Extract tenant from token
+  req.tenant_id = token.tenant_id;
+
+  // Add tenant filter to all DB queries
+  req.dbFilter = { tenant_id: req.tenant_id };
+
+  next();
+}
+
+// All queries automatically filtered
+app.get('/orders', tenantIsolation, async (req, res) => {
+  // Automatically filtered by tenant
+  const orders = await db.query(
+    'SELECT * FROM orders WHERE tenant_id = $1',
+    [req.tenant_id]
+  );
+  res.json(orders);
+});
+```
+
+### Per-Tenant Rate Limits
+
+```javascript
+const getTenantRateLimit = (tier) => {
+  const limits = {
+    free: { windowMs: 60000, max: 100 }, // 100/min
+    pro: { windowMs: 60000, max: 1000 }, // 1000/min
+    enterprise: { windowMs: 60000, max: 10000 } // 10k/min
+  };
+  return limits[tier] || limits.free;
+};
+
+app.use(async (req, res, next) => {
+  const token = verifyJWT(req.headers.authorization);
+  const tenant = await getTenant(token.tenant_id);
+
+  const limit = getTenantRateLimit(tenant.tier);
+
+  // Apply tenant-specific rate limit
+  // (in production, cache one limiter instance per tier rather than
+  // constructing a new one on every request)
+  const limiter = rateLimit({
+    ...limit,
+    keyGenerator: () => `api:${tenant.id}`
+  });
+
+  limiter(req, res, next);
+});
+```
+
+## Service-to-Service Authentication
+
+### Zero-Trust Architecture
+
+```
+Principles:
+1. Never trust, always verify
+2. Assume breach
+3. Verify explicitly (identity + device + location)
+4. Least privilege access
+5. Micro-segmentation
+```
+
+### Mutual TLS (mTLS) Pattern
+
+```yaml
+# Kubernetes with cert-manager
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: service-a-cert
+spec:
+  secretName: service-a-tls
+  issuerRef:
+    name: internal-ca
+    kind: ClusterIssuer
+  dnsNames:
+    - service-a.default.svc.cluster.local
+  usages:
+    - digital signature
+    - key encipherment
+    - client auth # Client authentication
+    - server auth # Server authentication
+---
+# Service configuration
+apiVersion: v1
+kind: Service
+metadata:
+  name: service-b
+  annotations:
+    service.alpha.kubernetes.io/app-protocols: '{"https":"HTTPS"}'
+spec:
+  ports:
+    - port: 443
+      protocol: TCP
+      targetPort: 8443
+---
+# Pod configuration
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: service-a
+spec:
+  template:
+    spec:
+      containers:
+        - name: app
+          volumeMounts:
+            - name: tls
+              mountPath: /etc/tls
+              readOnly: true
+      volumes:
+        - name: tls
+          secret:
+            secretName: service-a-tls
+```
+
+```javascript
+// Node.js client with mTLS
+const https = require('https');
+const fs = require('fs');
+
+const options = {
+  hostname: 'service-b.default.svc.cluster.local',
+  port: 443,
+  path: '/api/orders',
+  method: 'GET',
+
+  // Client certificate
+  cert: fs.readFileSync('/etc/tls/tls.crt'),
+  key: fs.readFileSync('/etc/tls/tls.key'),
+
+  // CA certificate to verify server
+  ca: fs.readFileSync('/etc/tls/ca.crt'),
+
+  // Verify server identity
+  checkServerIdentity: (hostname, cert) => {
+    // Custom verification logic
+    if (cert.subject.CN !== 'service-b.default.svc.cluster.local') {
+      throw new Error('Server identity mismatch');
+    }
+  }
+};
+
+https.get(options, (res) => {
+  // Handle response
+});
+```
+
+### Service Mesh (Istio) Pattern
+
+```yaml
+# Automatic mTLS for all services
+apiVersion: security.istio.io/v1beta1
+kind: PeerAuthentication
+metadata:
+  name: default
+  namespace: default
+spec:
+  mtls:
+    mode: STRICT # Require mTLS
+---
+# Authorization policy
+apiVersion: security.istio.io/v1beta1
+kind: AuthorizationPolicy
+metadata:
+  name: service-b-policy
+spec:
+  selector:
+    matchLabels:
+      app: service-b
+  rules:
+    - from:
+        - source:
+            principals: ["cluster.local/ns/default/sa/service-a"]
+      to:
+        - operation:
+            methods: ["GET", "POST"]
+            paths: ["/api/orders/*"]
+    - from:
+        - source:
+            principals: ["cluster.local/ns/default/sa/service-c"]
+      to:
+        - operation:
+            methods: ["GET"]
+            paths: ["/api/orders/*/status"]
+---
+# Request authentication (JWT validation)
+apiVersion: security.istio.io/v1beta1
+kind: RequestAuthentication
+metadata:
+  name: jwt-auth
+spec:
+  selector:
+    matchLabels:
+      app: service-b
+  jwtRules:
+    - issuer: "https://auth.example.com"
+      jwksUri: "https://auth.example.com/.well-known/jwks.json"
+      audiences:
+        - "service-b"
+```
+
+## Mobile-Specific Patterns
+
+### Certificate Pinning
+
+```swift
+// iOS - Certificate pinning with URLSession
+class CertificatePinner: NSObject, URLSessionDelegate {
+    let pinnedCertificates: [SecCertificate]
+
+    init(pinnedCertificates: [SecCertificate]) {
+        self.pinnedCertificates = pinnedCertificates
+    }
+
+    func urlSession(
+        _ session: URLSession,
+        didReceive challenge: URLAuthenticationChallenge,
+        completionHandler: @escaping (URLSession.AuthChallengeDisposition, URLCredential?) -> Void
+    ) {
+        guard challenge.protectionSpace.authenticationMethod == NSURLAuthenticationMethodServerTrust,
+              let serverTrust = challenge.protectionSpace.serverTrust else {
+            completionHandler(.cancelAuthenticationChallenge, nil)
+            return
+        }
+
+        // Get server certificate
+        guard let serverCertificate = SecTrustGetCertificateAtIndex(serverTrust, 0) else {
+            completionHandler(.cancelAuthenticationChallenge, nil)
+            return
+        }
+
+        // Check if server cert matches any pinned cert
+        let serverCertData = SecCertificateCopyData(serverCertificate) as Data
+
+        for pinnedCert in pinnedCertificates {
+            let pinnedCertData = SecCertificateCopyData(pinnedCert) as Data
+
+            if serverCertData == pinnedCertData {
+                // Certificate matches - allow connection
+                let credential = URLCredential(trust: serverTrust)
+                completionHandler(.useCredential, credential)
+                return
+            }
+        }
+
+        // Certificate not pinned - reject connection
+        completionHandler(.cancelAuthenticationChallenge, nil)
+    }
+}
+```
+
+### Biometric Authentication
+
+```swift
+// iOS - Biometric auth (Face ID / Touch ID)
+import LocalAuthentication
+
+class BiometricAuth {
+    func authenticate(reason: String, completion: @escaping (Bool, Error?) -> Void) {
+        let context = LAContext()
+        var error: NSError?
+
+        // Check if biometric auth is available
+        guard context.canEvaluatePolicy(.deviceOwnerAuthenticationWithBiometrics, error: &error) else {
+            completion(false, error)
+            return
+        }
+
+        // Attempt biometric authentication
+        context.evaluatePolicy(
+            .deviceOwnerAuthenticationWithBiometrics,
+            localizedReason: reason
+        ) { success, error in
+            DispatchQueue.main.async {
+                if success {
+                    // Biometric auth successful - retrieve token from Keychain
+                    let token = TokenStorage().getToken(forKey: "refresh_token")
+                    completion(token != nil, nil)
+                } else {
+                    completion(false, error)
+                }
+            }
+        }
+    }
+}
+```
+
+## Compliance & Regulations
+
+### GDPR Considerations
+
+```javascript
+// Right to be forgotten - token revocation
+async function deleteUserData(userId) {
+  await db.transaction(async (tx) => {
+    // 1. Revoke all active tokens
+    await tx.query(
+      'UPDATE refresh_tokens SET revoked = true, ' +
+      'revoked_reason = $1 WHERE user_id = $2',
+      ['GDPR deletion request', userId]
+    );
+
+    // 2. Anonymize audit logs (keep for compliance)
+    await tx.query(
+      'UPDATE audit_log SET user_id = NULL, ' +
+      'ip_address = NULL, user_agent = NULL WHERE user_id = $1',
+      [userId]
+    );
+
+    // 3. Delete user data
+    await tx.query('DELETE FROM users WHERE id = $1', [userId]);
+  });
+}
+
+// Data portability - export auth history
+async function exportAuthData(userId) {
+  const data = {
+    login_history: await db.query(
+      'SELECT created_at, ip_address, user_agent, result ' +
+      'FROM login_history WHERE user_id = $1',
+      [userId]
+    ),
+    active_sessions: await db.query(
+      'SELECT created_at, device_id, ip_address, expires_at ' +
+      'FROM refresh_tokens WHERE user_id = $1 AND revoked = false',
+      [userId]
+    )
+  };
+
+  return JSON.stringify(data, null, 2);
+}
+```
+
+### PCI-DSS for Payment Systems
+
+```javascript
+// Requirements for authentication in payment systems
+
+// 1. Strong access control (8.2)
+const PASSWORD_REQUIREMENTS = {
+  minLength: 12,
+  requireUppercase: true,
+  requireLowercase: true,
+  requireNumbers: true,
+  requireSpecialChars: true,
+  preventReuse: 4, // Can't reuse last 4 passwords
+  maxAge: 90 * 24 * 60 * 60 * 1000 // 90 days
+};
+
+// 2. Multi-factor authentication (8.3)
+async function loginWithMFA(username, password, mfaCode) {
+  const user = await validateCredentials(username, password);
+  if (!user) throw new AuthError('Invalid credentials');
+
+  // Require MFA for all administrative access
+  if (user.roles.includes('admin')) {
+    const validMFA = await validateTOTP(user.id, mfaCode);
+    if (!validMFA) throw new AuthError('Invalid MFA code');
+  }
+
+  return generateTokens(user);
+}
+
+// 3. Session timeout (8.1.8)
+const SESSION_TIMEOUT = 15 * 60 * 1000; // 15 minutes idle
+
+// 4. Audit logging (10.2)
+await auditLog.log({
+  type: 'cardholder_data_access',
+  user_id: user.id,
+  resource: 'payment_methods',
+  action: 'read',
+  result: 'success',
+  timestamp: new Date().toISOString()
+});
+```
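+
+A sketch of actually enforcing the idle timeout above, using a per-session last-activity timestamp in Redis (the `req.session_id` field and `revokeSession` helper are assumptions, not a specific framework API):
+
+```javascript
+async function enforceIdleTimeout(req, res, next) {
+  const sessionId = req.session_id; // assumption: set by upstream auth middleware
+  const key = `last_activity:${sessionId}`;
+
+  const last = await redis.get(key);
+  if (last && Date.now() - parseInt(last, 10) > SESSION_TIMEOUT) {
+    await revokeSession(sessionId); // assumption: invalidates the refresh token family
+    return res.status(401).json({ error: 'session_timeout' });
+  }
+
+  // Record activity; expire the marker after 24h of silence
+  await redis.set(key, Date.now().toString(), 'EX', 24 * 60 * 60);
+  next();
+}
+```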
expect(res1.status).toBe(200); + refreshToken1 = res1.body.refresh_token; + + // Second refresh with new token + const res2 = await request(app) + .post('/auth/refresh') + .send({ refresh_token: refreshToken1 }); + + expect(res2.status).toBe(200); + refreshToken2 = res2.body.refresh_token; + + expect(refreshToken1).not.toBe(refreshToken2); + }); + + it('should detect refresh token replay', async () => { + // Try to reuse first refresh token (already rotated) + const res = await request(app) + .post('/auth/refresh') + .send({ refresh_token: refreshToken1 }); + + expect(res.status).toBe(401); + expect(res.body.error).toContain('replay'); + + // Entire family should be revoked + const familyCheck = await request(app) + .post('/auth/refresh') + .send({ refresh_token: refreshToken2 }); + + expect(familyCheck.status).toBe(401); // Also revoked + }); +}); +``` + +## Token Validation Patterns + +### JWT Validation with Caching + +```javascript +const jwt = require('jsonwebtoken'); +const { NodeCache } = require('node-cache'); + +const publicKeyCache = new NodeCache({ stdTTL: 3600 }); // 1 hour + +async function validateJWT(token) { + // Decode without verification to get header + const decoded = jwt.decode(token, { complete: true }); + if (!decoded) throw new AuthError('Invalid token format'); + + const keyId = decoded.header.kid; + + // Try cache first + let publicKey = publicKeyCache.get(keyId); + + if (!publicKey) { + // Fetch from JWKS endpoint + const jwks = await fetch('https://auth.example.com/.well-known/jwks.json'); + const keys = await jwks.json(); + + const key = keys.keys.find(k => k.kid === keyId); + if (!key) throw new AuthError('Public key not found'); + + publicKey = jwkToPem(key); + publicKeyCache.set(keyId, publicKey); + } + + // Verify signature and claims + try { + const payload = jwt.verify(token, publicKey, { + algorithms: ['RS256'], + issuer: 'https://auth.example.com', + audience: 'https://api.example.com' + }); + + // Additional validation + if (!payload.sub) throw new AuthError('Missing subject claim'); + if (!payload.scopes || !Array.isArray(payload.scopes)) { + throw new AuthError('Missing or invalid scopes'); + } + + return payload; + } catch (error) { + if (error.name === 'TokenExpiredError') { + throw new AuthError('Token expired'); + } + throw new AuthError('Token validation failed'); + } +} +``` + +### Key Rotation Without Downtime + +```javascript +// Support multiple signing keys simultaneously +const CURRENT_KEY_ID = 'key-2024-11'; +const PREVIOUS_KEY_ID = 'key-2024-10'; + +const signingKeys = new Map([ + [CURRENT_KEY_ID, fs.readFileSync('/keys/current-private.pem')], + [PREVIOUS_KEY_ID, fs.readFileSync('/keys/previous-private.pem')] +]); + +// Sign with current key +function generateJWT(payload) { + return jwt.sign(payload, signingKeys.get(CURRENT_KEY_ID), { + algorithm: 'RS256', + keyid: CURRENT_KEY_ID, + expiresIn: '15m' + }); +} + +// Validate with either key (grace period) +function validateJWT(token) { + const decoded = jwt.decode(token, { complete: true }); + const keyId = decoded.header.kid; + + if (!signingKeys.has(keyId)) { + throw new AuthError('Unknown signing key'); + } + + return jwt.verify(token, signingKeys.get(keyId), { + algorithms: ['RS256'] + }); +} + +// Key rotation process: +// 1. Generate new key pair → key-2024-12 +// 2. Add to signingKeys map (validation now accepts 3 keys) +// 3. Update CURRENT_KEY_ID to key-2024-12 (new tokens use new key) +// 4. Wait for old tokens to expire (15 min) +// 5. 
Remove key-2024-10 from signingKeys map +``` + +## Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| **Long-lived JWTs** | Can't revoke, security risk | Max 15-60 min, use refresh tokens | +| **Tokens in localStorage** | XSS vulnerability | httpOnly cookies or memory-only | +| **No refresh rotation** | Stolen token = permanent access | Rotate on every use, detect replay | +| **Password Grant** | App handles credentials, no MFA | Authorization Code + PKCE | +| **Shared secrets across services** | One breach = all compromised | Per-service secrets, rotate regularly | +| **No rate limiting** | Brute force attacks | Rate limit login, refresh, sensitive endpoints | +| **Ignoring anomalies** | Account takeover undetected | Monitor location, device, behavior | +| **No audit logging** | Can't investigate breaches | Log all auth events, immutable storage | +| **Weak password requirements** | Easy to crack | 12+ chars, complexity, no common passwords | +| **No MFA for admins** | Privileged account compromise | Require MFA for elevated access | + +## Cross-References + +**Related skills**: +- **Security architecture** → `ordis-security-architect` (threat modeling, defense-in-depth) +- **FastAPI implementation** → `fastapi-development` (FastAPI auth middleware) +- **REST API design** → `rest-api-design` (Bearer tokens, auth headers) +- **GraphQL auth** → `graphql-api-design` (context-based auth, directives) +- **Microservices** → `microservices-architecture` (service mesh, mTLS) + +## Further Reading + +- **OAuth 2.1**: Latest OAuth spec (consolidates best practices) +- **RFC 7636**: PKCE specification +- **RFC 8693**: Token exchange for delegation +- **OWASP Auth Cheat Sheet**: https://cheatsheetseries.owasp.org/cheatsheets/Authentication_Cheat_Sheet.html +- **JWT Best Practices**: https://datatracker.ietf.org/doc/html/rfc8725 +- **Zero Trust Architecture**: NIST SP 800-207 diff --git a/skills/using-web-backend/api-documentation.md b/skills/using-web-backend/api-documentation.md new file mode 100644 index 0000000..458d1ba --- /dev/null +++ b/skills/using-web-backend/api-documentation.md @@ -0,0 +1,944 @@ + +# API Documentation + +## Overview + +**API documentation specialist covering OpenAPI specs, documentation-as-code, testing docs, SDK generation, and preventing documentation debt.** + +**Core principle**: Documentation is a product feature that directly impacts developer adoption - invest in keeping it accurate, tested, and discoverable. 
+ +## When to Use This Skill + +Use when encountering: + +- **OpenAPI/Swagger**: Auto-generating docs, customizing Swagger UI, maintaining specs +- **Documentation testing**: Ensuring examples work, preventing stale docs +- **Versioning**: Managing multi-version docs, deprecation notices +- **Documentation-as-code**: Keeping docs in sync with code changes +- **SDK generation**: Generating client libraries from OpenAPI specs +- **Documentation debt**: Detecting and preventing outdated documentation +- **Metrics**: Tracking documentation usage and effectiveness +- **Community docs**: Managing contributions, improving discoverability + +**Do NOT use for**: +- General technical writing (see `muna-technical-writer` skill) +- API design principles (see `rest-api-design`, `graphql-api-design`) +- Authentication implementation (see `api-authentication`) + +## OpenAPI Specification Best Practices + +### Production-Quality OpenAPI Specs + +**Complete FastAPI example**: + +```python +from fastapi import FastAPI, Path, Query, Body +from pydantic import BaseModel, Field +from typing import Optional, List + +app = FastAPI( + title="Payment Processing API", + description=""" + # Payment API + + Process payments with PCI-DSS compliance. + + ## Features + - Multiple payment methods (cards, ACH, digital wallets) + - Fraud detection + - Webhook notifications + - Test mode for development + + ## Rate Limits + - Standard: 100 requests/minute + - Premium: 1000 requests/minute + + ## Support + - Documentation: https://docs.example.com + - Status: https://status.example.com + - Support: api-support@example.com + """, + version="2.1.0", + terms_of_service="https://example.com/terms", + contact={ + "name": "API Support", + "url": "https://example.com/support", + "email": "api-support@example.com" + }, + license_info={ + "name": "Apache 2.0", + "url": "https://www.apache.org/licenses/LICENSE-2.0.html" + }, + servers=[ + {"url": "https://api.example.com", "description": "Production"}, + {"url": "https://sandbox-api.example.com", "description": "Sandbox"} + ] +) + +# Tag organization +tags_metadata = [ + { + "name": "payments", + "description": "Payment operations", + "externalDocs": { + "description": "Payment Guide", + "url": "https://docs.example.com/guides/payments" + } + } +] + +app = FastAPI(openapi_tags=tags_metadata) + +# Rich schema with examples +class PaymentRequest(BaseModel): + amount: float = Field( + ..., + gt=0, + le=999999.99, + description="Payment amount in USD", + example=99.99 + ) + currency: str = Field( + default="USD", + pattern="^[A-Z]{3}$", + description="ISO 4217 currency code", + example="USD" + ) + + class Config: + schema_extra = { + "examples": [ + { + "amount": 149.99, + "currency": "USD", + "payment_method": "card_visa_4242", + "description": "Premium subscription" + }, + { + "amount": 29.99, + "currency": "EUR", + "payment_method": "paypal_account", + "description": "Monthly plan" + } + ] + } + +# Comprehensive error documentation +@app.post( + "/payments", + summary="Create payment", + description=""" + Creates a new payment transaction. + + ## Processing Time + Typically 2-5 seconds for card payments. + + ## Idempotency + Use `Idempotency-Key` header to prevent duplicates. + + ## Test Mode + Use test payment methods in sandbox environment. 
+ """, + responses={ + 201: {"description": "Payment created", "model": PaymentResponse}, + 400: { + "description": "Invalid request", + "content": { + "application/json": { + "examples": { + "invalid_amount": { + "summary": "Amount validation failed", + "value": { + "error_code": "INVALID_AMOUNT", + "message": "Amount must be between 0.01 and 999999.99" + } + } + } + } + } + }, + 402: {"description": "Payment declined"}, + 429: {"description": "Rate limit exceeded"} + }, + tags=["payments"] +) +async def create_payment(payment: PaymentRequest): + pass +``` + +### Custom OpenAPI Generation + +**Add security schemes, custom extensions**: + +```python +from fastapi.openapi.utils import get_openapi + +def custom_openapi(): + if app.openapi_schema: + return app.openapi_schema + + openapi_schema = get_openapi( + title=app.title, + version=app.version, + description=app.description, + routes=app.routes, + ) + + # Security schemes + openapi_schema["components"]["securitySchemes"] = { + "ApiKeyAuth": { + "type": "apiKey", + "in": "header", + "name": "X-API-Key", + "description": "Get your API key at https://dashboard.example.com/api-keys" + }, + "OAuth2": { + "type": "oauth2", + "flows": { + "authorizationCode": { + "authorizationUrl": "https://auth.example.com/oauth/authorize", + "tokenUrl": "https://auth.example.com/oauth/token", + "scopes": { + "payments:read": "Read payment data", + "payments:write": "Create payments" + } + }, + "clientCredentials": { + "tokenUrl": "https://auth.example.com/oauth/token", + "scopes": { + "payments:read": "Read payment data", + "payments:write": "Create payments" + } + } + } + } + } + + # Global security requirement + openapi_schema["security"] = [{"ApiKeyAuth": []}] + + # Custom extensions for tooling + openapi_schema["x-api-id"] = "payments-api-v2" + openapi_schema["x-audience"] = "external" + openapi_schema["x-ratelimit-default"] = 100 + + # Add code samples extension (for Swagger UI) + for path_data in openapi_schema["paths"].values(): + for operation in path_data.values(): + if isinstance(operation, dict) and "operationId" in operation: + operation["x-code-samples"] = [ + { + "lang": "curl", + "source": generate_curl_example(operation) + }, + { + "lang": "python", + "source": generate_python_example(operation) + } + ] + + app.openapi_schema = openapi_schema + return app.openapi_schema + +app.openapi = custom_openapi +``` + +## Documentation-as-Code + +### Keep Docs in Sync with Code + +**Anti-pattern**: Docs in separate repo, manually updated, always stale + +**Pattern**: Co-locate docs with code, auto-generate from source + +**Implementation**: + +```python +# Source of truth: Pydantic models +class PaymentRequest(BaseModel): + """ + Payment request model. 
+
+    Examples:
+        Basic payment:
+        ```python
+        payment = PaymentRequest(
+            amount=99.99,
+            currency="USD",
+            payment_method="pm_card_visa"
+        )
+        ```
+    """
+    amount: float = Field(..., description="Amount in USD")
+    currency: str = Field(default="USD", description="ISO 4217 currency code")
+
+    class Config:
+        schema_extra = {
+            "examples": [
+                {"amount": 99.99, "currency": "USD", "payment_method": "pm_card_visa"}
+            ]
+        }
+
+# Docs auto-generated from model
+# - OpenAPI spec from Field descriptions
+# - Examples from schema_extra
+# - Code samples from docstring examples
+```
+
+**Prevent schema drift**:
+
+```python
+import json
+
+import pytest
+from fastapi.testclient import TestClient
+
+def test_openapi_schema_matches_committed():
+    """Ensure OpenAPI spec is committed and up-to-date"""
+    client = TestClient(app)
+
+    # Get current OpenAPI spec
+    current_spec = client.get("/openapi.json").json()
+
+    # Load committed spec
+    with open("docs/openapi.json") as f:
+        committed_spec = json.load(f)
+
+    # Fail if specs don't match
+    assert current_spec == committed_spec, \
+        "OpenAPI spec has changed. Run 'make update-openapi-spec' and commit"
+
+def test_all_endpoints_have_examples():
+    """Ensure all endpoints have request/response examples"""
+    client = TestClient(app)
+    spec = client.get("/openapi.json").json()
+
+    for path, methods in spec["paths"].items():
+        for method, details in methods.items():
+            if method in ["get", "post", "put", "patch", "delete"]:
+                # Check request body has example
+                if "requestBody" in details:
+                    assert "examples" in details["requestBody"]["content"]["application/json"], \
+                        f"{method.upper()} {path} missing request examples"
+
+                # Check responses have examples
+                for status_code, response in details.get("responses", {}).items():
+                    if "content" in response and "application/json" in response["content"]:
+                        assert "examples" in response["content"]["application/json"] or \
+                            "example" in response["content"]["application/json"]["schema"], \
+                            f"{method.upper()} {path} response {status_code} missing examples"
+```
+
+### Documentation Pre-Commit Hook
+
+```bash
+#!/bin/bash
+# .git/hooks/pre-commit
+set -e  # abort the commit if any step below fails
+
+# Regenerate OpenAPI spec
+python -c "
+from app.main import app
+import json
+
+with open('docs/openapi.json', 'w') as f:
+    json.dump(app.openapi(), f, indent=2)
+"
+
+# Stage the regenerated spec
+git add docs/openapi.json
+
+# Validate spec
+npm run validate:openapi
+
+# Run doc tests
+pytest tests/test_documentation.py
+```
+
+## Documentation Testing
+
+### Ensure Examples Actually Work
+
+**Problem**: Examples in docs become stale, don't work
+
+**Solution**: Test every code example automatically
+
+```python
+# Extract examples from OpenAPI spec
+import pytest
+from app.main import app
+
+def get_all_examples_from_openapi():
+    """Extract all examples from OpenAPI spec"""
+    spec = app.openapi()
+    examples = []
+
+    for path, methods in spec["paths"].items():
+        for method, details in methods.items():
+            if "examples" in details.get("requestBody", {}).get("content", {}).get("application/json", {}):
+                for example_name, example_data in details["requestBody"]["content"]["application/json"]["examples"].items():
+                    examples.append({
+                        "path": path,
+                        "method": method,
+                        "example_name": example_name,
+                        "data": example_data["value"]
+                    })
+
+    return examples
+
+@pytest.mark.parametrize("example", get_all_examples_from_openapi(), ids=lambda e: f"{e['method']}_{e['path']}_{e['example_name']}")
+def test_openapi_examples_are_valid(example, client):
+    """Test that all OpenAPI examples are
valid requests""" + method = example["method"] + path = example["path"] + data = example["data"] + + response = client.request(method, path, json=data) + + # Examples should either succeed or fail with expected error + assert response.status_code in [200, 201, 400, 401, 402, 403, 404], \ + f"Example {example['example_name']} for {method.upper()} {path} returned unexpected status {response.status_code}" +``` + +**Test markdown code samples**: + +```python +import pytest +import re +import tempfile +import subprocess + +def extract_code_blocks_from_markdown(markdown_file): + """Extract code blocks from markdown""" + with open(markdown_file) as f: + content = f.read() + + # Find code blocks with language + pattern = r'```(\w+)\n(.*?)```' + return re.findall(pattern, content, re.DOTALL) + +def test_python_examples_in_quickstart(): + """Test that Python examples in quickstart.md execute without errors""" + code_blocks = extract_code_blocks_from_markdown("docs/quickstart.md") + + for lang, code in code_blocks: + if lang == "python": + # Write code to temp file + with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: + # Replace placeholders + code = code.replace("sk_test_abc123...", "test_api_key") + code = code.replace("https://api.example.com", "http://localhost:8000") + f.write(code) + f.flush() + + # Run code + result = subprocess.run( + ["python", f.name], + capture_output=True, + text=True, + timeout=5 + ) + + assert result.returncode == 0, \ + f"Python example failed:\n{code}\n\nError:\n{result.stderr}" +``` + +### Documentation Coverage Metrics + +```python +def test_documentation_coverage(): + """Ensure all endpoints are documented""" + from fastapi.openapi.utils import get_openapi + + spec = get_openapi(title="Test", version="1.0.0", routes=app.routes) + + missing_docs = [] + + for path, methods in spec["paths"].items(): + for method, details in methods.items(): + # Check summary + if not details.get("summary"): + missing_docs.append(f"{method.upper()} {path}: Missing summary") + + # Check description + if not details.get("description"): + missing_docs.append(f"{method.upper()} {path}: Missing description") + + # Check examples + if "requestBody" in details: + content = details["requestBody"].get("content", {}).get("application/json", {}) + if "examples" not in content and "example" not in content.get("schema", {}): + missing_docs.append(f"{method.upper()} {path}: Missing request example") + + assert not missing_docs, \ + f"Documentation incomplete:\n" + "\n".join(missing_docs) +``` + +## Interactive Documentation + +### Swagger UI Customization + +**Custom Swagger UI with branding**: + +```python +from fastapi import FastAPI +from fastapi.openapi.docs import get_swagger_ui_html +from fastapi.staticfiles import StaticFiles + +app = FastAPI(docs_url=None) # Disable default docs +app.mount("/static", StaticFiles(directory="static"), name="static") + +@app.get("/docs", include_in_schema=False) +async def custom_swagger_ui_html(): + return get_swagger_ui_html( + openapi_url=app.openapi_url, + title=f"{app.title} - API Documentation", + oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url, + swagger_js_url="/static/swagger-ui-bundle.js", + swagger_css_url="/static/swagger-ui.css", + swagger_favicon_url="/static/favicon.png", + swagger_ui_parameters={ + "deepLinking": True, + "displayRequestDuration": True, + "filter": True, + "showExtensions": True, + "tryItOutEnabled": True, + "persistAuthorization": True, + "defaultModelsExpandDepth": 1, + 
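+            # defaultModelsExpandDepth: -1 hides the "Models" section entirely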
"defaultModelExpandDepth": 1 + } + ) +``` + +**Add "Try It Out" authentication**: + +```python +from fastapi.openapi.docs import get_swagger_ui_html + +@app.get("/docs") +async def custom_swagger_ui(): + return get_swagger_ui_html( + openapi_url="/openapi.json", + title="API Docs", + init_oauth={ + "clientId": "swagger-ui-client", + "appName": "API Documentation", + "usePkceWithAuthorizationCodeGrant": True + } + ) +``` + +### ReDoc Customization + +```python +from fastapi.openapi.docs import get_redoc_html + +@app.get("/redoc", include_in_schema=False) +async def redoc_html(): + return get_redoc_html( + openapi_url="/openapi.json", + title="API Documentation - ReDoc", + redoc_js_url="/static/redoc.standalone.js", + redoc_favicon_url="/static/favicon.png", + with_google_fonts=True + ) +``` + +**ReDoc configuration options**: + +```html + + +``` + +## SDK Generation + +### Generate Client SDKs from OpenAPI + +**OpenAPI Generator**: + +```bash +# Install openapi-generator +npm install -g @openapitools/openapi-generator-cli + +# Generate Python SDK +openapi-generator-cli generate \ + -i docs/openapi.json \ + -g python \ + -o sdks/python \ + --additional-properties=packageName=payment_api,projectName=payment-api-python + +# Generate TypeScript SDK +openapi-generator-cli generate \ + -i docs/openapi.json \ + -g typescript-fetch \ + -o sdks/typescript \ + --additional-properties=npmName=@example/payment-api,supportsES6=true + +# Generate Go SDK +openapi-generator-cli generate \ + -i docs/openapi.json \ + -g go \ + -o sdks/go \ + --additional-properties=packageName=paymentapi +``` + +**Automate SDK generation in CI**: + +```yaml +# .github/workflows/generate-sdks.yml +name: Generate SDKs + +on: + push: + branches: [main] + paths: + - 'docs/openapi.json' + +jobs: + generate: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Generate Python SDK + run: | + docker run --rm \ + -v ${PWD}:/local \ + openapitools/openapi-generator-cli generate \ + -i /local/docs/openapi.json \ + -g python \ + -o /local/sdks/python + + - name: Test Python SDK + run: | + cd sdks/python + pip install -e . 
+          pytest
+
+      - name: Publish to PyPI
+        if: github.ref == 'refs/heads/main'
+        run: |
+          cd sdks/python
+          python -m build
+          twine upload dist/*
+        env:
+          TWINE_USERNAME: __token__
+          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
+```
+
+**Custom SDK templates**:
+
+```
+templates/
+├── python/
+│   ├── api.mustache       # Custom API client template
+│   ├── model.mustache     # Custom model template
+│   └── README.mustache    # Custom README
+```
+
+```bash
+# Generate with custom templates
+openapi-generator-cli generate \
+  -i docs/openapi.json \
+  -g python \
+  -o sdks/python \
+  -t templates/python \
+  --additional-properties=packageName=payment_api
+```
+
+## Documentation Versioning
+
+### Version Documentation Separately from API
+
+**Documentation versions**:
+
+```
+docs/
+├── v1/
+│   ├── quickstart.md
+│   ├── api-reference.md
+│   └── migration-to-v2.md    ← Deprecation notice
+├── v2/
+│   ├── quickstart.md
+│   ├── api-reference.md
+│   └── whats-new.md
+└── latest -> v2/             # Symlink to current version
+```
+
+**Documentation routing**:
+
+```python
+from fastapi import HTTPException
+from fastapi.responses import HTMLResponse, RedirectResponse
+from jinja2 import Environment, FileSystemLoader
+
+env = Environment(loader=FileSystemLoader("docs"))
+
+@app.get("/docs")
+async def docs_redirect():
+    """Redirect to latest docs"""
+    return RedirectResponse(url="/docs/v2/")
+
+@app.get("/docs/{version}/{page}")
+async def serve_docs(version: str, page: str):
+    """Serve versioned documentation"""
+    if version not in ["v1", "v2"]:
+        raise HTTPException(404)
+
+    # Add deprecation warning for v1
+    deprecated = version == "v1"
+
+    template = env.get_template(f"{version}/{page}.md")
+    content = template.render(deprecated=deprecated)
+
+    return HTMLResponse(content)
+```
+
+**Deprecation banner**:
+
+```html
+{% if deprecated %}
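+<!-- `deprecated` is set by serve_docs() above whenever version == "v1" -->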
+<div class="deprecation-banner">
+  ⚠️ <strong>Deprecated:</strong> This documentation is for API v1,
+  which will be sunset on June 1, 2025.
+  <a href="/docs/v2/">Migrate to v2</a>
+</div>
+{% endif %}
+```
+
+## Documentation Debt Detection
+
+### Prevent Stale Documentation
+
+**Detect outdated docs**:
+
+```python
+import os
+from datetime import datetime, timedelta
+
+import pytest
+
+def test_documentation_freshness():
+    """Ensure docs have been updated recently"""
+    docs_modified = datetime.fromtimestamp(
+        os.path.getmtime("docs/api-reference.md")
+    )
+
+    # Fail if docs haven't been updated in 90 days
+    max_age = timedelta(days=90)
+    age = datetime.now() - docs_modified
+
+    assert age < max_age, \
+        f"API docs are {age.days} days old. Review and update or add exemption comment."
+```
+
+**Track documentation TODOs**:
+
+```python
+def test_no_documentation_todos():
+    """Ensure no TODO comments in docs"""
+    import glob
+    import re
+
+    doc_files = glob.glob("docs/**/*.md", recursive=True)
+    todos = []
+
+    for doc_file in doc_files:
+        with open(doc_file) as f:
+            for line_num, line in enumerate(f, 1):
+                if re.search(r'TODO|FIXME|XXX', line):
+                    todos.append(f"{doc_file}:{line_num}: {line.strip()}")
+
+    assert not todos, \
+        f"Documentation has {len(todos)} TODOs:\n" + "\n".join(todos)
+```
+
+**Broken link detection**:
+
+```python
+import glob
+import re
+
+import pytest
+import requests
+
+def extract_links_from_markdown(markdown_file):
+    """Extract all HTTP(S) links from markdown"""
+    with open(markdown_file) as f:
+        content = f.read()
+
+    # Find markdown links [text](url)
+    links = re.findall(r'\[([^\]]+)\]\(([^)]+)\)', content)
+    return [(text, url) for text, url in links if url.startswith('http')]
+
+def test_no_broken_links_in_docs():
+    """Ensure all external links in docs are valid"""
+    doc_files = glob.glob("docs/**/*.md", recursive=True)
+    broken_links = []
+
+    for doc_file in doc_files:
+        for text, url in extract_links_from_markdown(doc_file):
+            try:
+                response = requests.head(url, timeout=5, allow_redirects=True)
+                if response.status_code >= 400:
+                    broken_links.append(f"{doc_file}: {url} ({response.status_code})")
+            except requests.RequestException as e:
+                broken_links.append(f"{doc_file}: {url} (error: {e})")
+
+    assert not broken_links, \
+        f"Found {len(broken_links)} broken links:\n" + "\n".join(broken_links)
+```
+
+## Documentation Metrics
+
+### Track Documentation Usage
+
+**Analytics integration**:
+
+```python
+from fastapi import Request
+import analytics
+
+@app.middleware("http")
+async def track_doc_views(request: Request, call_next):
+    if request.url.path.startswith("/docs"):
+        # Track page view
+        analytics.track(
+            user_id="anonymous",
+            event="Documentation Viewed",
+            properties={
+                "page": request.url.path,
+                "version": request.url.path.split("/")[2] if len(request.url.path.split("/")) > 2 else "latest",
+                "referrer": request.headers.get("referer")
+            }
+        )
+
+    return await call_next(request)
+```
+
+**Track "Try It Out" usage**:
+
+```javascript
+// Inject into Swagger UI
+const originalExecute = swagger.presets.apis.execute;
+swagger.presets.apis.execute = function(spec) {
+    // Track API call from docs
+    analytics.track('API Call from Docs', {
+        endpoint: spec.path,
+        method: spec.method,
+        success: spec.response.status < 400
+    });
+
+    return originalExecute(spec);
+};
+```
+
+**Documentation health dashboard**:
+
+```python
+from datetime import datetime, timedelta
+
+import analytics
+from fastapi import APIRouter, Depends
+from sqlalchemy.orm import Session
+
+# get_db is the app's session dependency (defined elsewhere in this skill)
+router = APIRouter()
+
+@router.get("/admin/docs-metrics")
+async def get_doc_metrics(db: Session = Depends(get_db)):
+    """Dashboard for documentation health"""
+
+    # Page views by version
+    views_by_version = analytics.query(
+        "Documentation Viewed",
+
group_by="version", + since=datetime.now() - timedelta(days=30) + ) + + # Most viewed pages + top_pages = analytics.query( + "Documentation Viewed", + group_by="page", + since=datetime.now() - timedelta(days=30), + limit=10 + ) + + # Try it out usage + api_calls = analytics.query( + "API Call from Docs", + since=datetime.now() - timedelta(days=30) + ) + + # Documentation freshness + freshness = { + "quickstart.md": get_file_age("docs/quickstart.md"), + "api-reference.md": get_file_age("docs/api-reference.md") + } + + return { + "views_by_version": views_by_version, + "top_pages": top_pages, + "api_calls_from_docs": api_calls, + "freshness": freshness, + "health_score": calculate_doc_health_score() + } + +def calculate_doc_health_score(): + """Calculate documentation health (0-100)""" + score = 100 + + # Deduct for stale docs (>90 days old) + for doc_file in glob.glob("docs/**/*.md", recursive=True): + age_days = (datetime.now() - datetime.fromtimestamp(os.path.getmtime(doc_file))).days + if age_days > 90: + score -= 10 + + # Deduct for broken links + broken_links = count_broken_links() + score -= min(broken_links * 5, 30) + + # Deduct for missing examples + endpoints_without_examples = count_endpoints_without_examples() + score -= min(endpoints_without_examples * 3, 20) + + return max(score, 0) +``` + +## Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| **Docs in separate repo** | Always out of sync | Co-locate with code | +| **Manual example updates** | Examples become stale | Test examples in CI | +| **No deprecation notices** | Breaking changes surprise users | Document deprecation 6+ months ahead | +| **Generic descriptions** | Doesn't help developers | Specific use cases, edge cases | +| **No versioned docs** | Can't reference old versions | Version docs separately | +| **Untested SDKs** | Generated SDKs don't work | Test generated SDKs in CI | +| **No documentation metrics** | Can't measure effectiveness | Track page views, usage | +| **Single example per endpoint** | Doesn't show edge cases | Multiple examples (success, errors) | + +## Cross-References + +**Related skills**: +- **Technical writing** → `muna-technical-writer` (writing style, organization) +- **API design** → `rest-api-design`, `graphql-api-design` (design patterns) +- **API testing** → `api-testing` (contract testing, examples) +- **Authentication** → `api-authentication` (auth flow documentation) + +## Further Reading + +- **OpenAPI Specification**: https://spec.openapis.org/oas/v3.1.0 +- **FastAPI docs**: https://fastapi.tiangolo.com/tutorial/metadata/ +- **Swagger UI**: https://swagger.io/docs/open-source-tools/swagger-ui/ +- **ReDoc**: https://redoc.ly/docs/ +- **Write the Docs**: https://www.writethedocs.org/ diff --git a/skills/using-web-backend/api-testing.md b/skills/using-web-backend/api-testing.md new file mode 100644 index 0000000..6c09b41 --- /dev/null +++ b/skills/using-web-backend/api-testing.md @@ -0,0 +1,1013 @@ + +# API Testing + +## Overview + +**API testing specialist covering test organization, integration testing, performance testing, security testing, and production test strategies.** + +**Core principle**: Tests are executable documentation that verify correctness, prevent regressions, and enable confident refactoring - invest in test quality as you would production code. 
+
+## When to Use This Skill
+
+Use when encountering:
+
+- **Test organization**: Structuring test suites, fixtures, test discovery
+- **Integration testing**: Testing with databases, external APIs, authentication
+- **Performance testing**: Load testing, stress testing, benchmarking
+- **Security testing**: Auth testing, injection testing, CORS validation
+- **Test quality**: Coverage analysis, mutation testing, flaky test detection
+- **CI/CD integration**: Running tests in pipelines, test reporting
+- **Test debugging**: Debugging failing tests, using pytest features
+
+**Do NOT use for**:
+- Unit testing business logic (use general Python testing resources)
+- Frontend testing (use frontend testing tools)
+- Database-specific patterns (see `database-integration` skill)
+
+## Test Organization
+
+### Test Structure Conventions
+
+**Directory layout**:
+
+```
+project/
+├── app/
+│   ├── __init__.py
+│   ├── main.py
+│   ├── routes/
+│   │   ├── users.py
+│   │   └── orders.py
+│   └── services/
+│       └── payment.py
+└── tests/
+    ├── __init__.py
+    ├── conftest.py              # Shared fixtures
+    ├── unit/                    # Fast, isolated tests
+    │   ├── test_services.py
+    │   └── test_schemas.py
+    ├── integration/             # Tests with database/external deps
+    │   ├── test_users_api.py
+    │   └── test_orders_api.py
+    ├── e2e/                     # End-to-end tests
+    │   └── test_checkout_flow.py
+    ├── performance/             # Load/stress tests
+    │   └── test_load.py
+    └── security/                # Security-specific tests
+        └── test_auth.py
+```
+
+**Naming conventions**:
+- Test files: `test_*.py` or `*_test.py`
+- Test functions: `test_<action>_<condition>_<expected_result>`
+- Test classes: `Test<FeatureName>`
+
+```python
+# Good naming
+def test_create_user_with_valid_data_returns_201():
+    pass
+
+def test_create_user_with_duplicate_email_returns_409():
+    pass
+
+def test_get_user_when_not_found_returns_404():
+    pass
+
+# Bad naming
+def test_user():  # Too vague
+    pass
+
+def test_1():  # No context
+    pass
+```
+
+### Test Markers for Organization
+
+**Define markers in pytest.ini**:
+
+```ini
+# pytest.ini
+[pytest]
+markers =
+    unit: Unit tests (fast, no external dependencies)
+    integration: Integration tests (database, external APIs)
+    e2e: End-to-end tests (full system)
+    slow: Tests that take > 1 second
+    security: Security-focused tests
+    smoke: Critical smoke tests (run first)
+    wip: Work in progress (skip in CI)
+```
+
+**Apply markers**:
+
+```python
+import pytest
+
+@pytest.mark.unit
+def test_calculate_discount():
+    """Unit test - no dependencies"""
+    assert calculate_discount(100, 0.1) == 90
+
+@pytest.mark.integration
+@pytest.mark.slow
+def test_create_order_end_to_end(client, test_db):
+    """Integration test with database"""
+    response = client.post("/orders", json={...})
+    assert response.status_code == 201
+
+@pytest.mark.security
+def test_unauthorized_access_returns_401(client):
+    """Security test for auth"""
+    response = client.get("/admin/users")
+    assert response.status_code == 401
+
+@pytest.mark.smoke
+def test_health_endpoint(client):
+    """Critical smoke test"""
+    response = client.get("/health")
+    assert response.status_code == 200
+```
+
+**Run specific test categories**:
+
+```bash
+# Run only unit tests (fast)
+pytest -m unit
+
+# Run only integration tests
+pytest -m integration
+
+# Run everything except slow tests
+pytest -m "not slow"
+
+# Run smoke tests first, then rest
+pytest -m smoke && pytest -m "not smoke"
+
+# Run security tests
+pytest -m security
+
+# Skip work-in-progress tests
+pytest -m "not wip"
+```
+
+### Parametrized Testing
+
+**Test same logic with multiple inputs**:
+
+```python
+import
pytest + +@pytest.mark.parametrize("email,expected_valid", [ + ("user@example.com", True), + ("user+tag@example.co.uk", True), + ("invalid.email", False), + ("@example.com", False), + ("user@", False), + ("", False), +]) +def test_email_validation(email, expected_valid): + """Test email validation with multiple cases""" + assert is_valid_email(email) == expected_valid + +@pytest.mark.parametrize("status_code,expected_retry", [ + (500, True), # Internal error - retry + (502, True), # Bad gateway - retry + (503, True), # Service unavailable - retry + (400, False), # Bad request - don't retry + (401, False), # Unauthorized - don't retry + (404, False), # Not found - don't retry +]) +def test_should_retry_request(status_code, expected_retry): + """Test retry logic for different status codes""" + assert should_retry(status_code) == expected_retry + +@pytest.mark.parametrize("role,endpoint,expected_status", [ + ("admin", "/admin/users", 200), + ("user", "/admin/users", 403), + ("guest", "/admin/users", 401), + ("admin", "/users/me", 200), + ("user", "/users/me", 200), + ("guest", "/users/me", 401), +]) +def test_authorization_matrix(client, role, endpoint, expected_status): + """Test authorization for different role/endpoint combinations""" + token = create_token_with_role(role) + response = client.get(endpoint, headers={"Authorization": f"Bearer {token}"}) + assert response.status_code == expected_status +``` + +**Parametrize with IDs for readability**: + +```python +@pytest.mark.parametrize("input_data,expected_error", [ + ({"email": ""}, "Email is required"), + ({"email": "invalid"}, "Invalid email format"), + ({"email": "user@example.com", "age": -1}, "Age must be positive"), +], ids=["missing_email", "invalid_email", "negative_age"]) +def test_validation_errors(input_data, expected_error): + with pytest.raises(ValidationError, match=expected_error): + validate_user(input_data) +``` + +## Test Doubles: Mocks, Stubs, Fakes, Spies + +### Taxonomy and When to Use Each + +| Type | Purpose | Use When | Example | +|------|---------|----------|---------| +| **Mock** | Verify interactions (method calls) | Testing behavior, not state | Verify email service was called | +| **Stub** | Return predefined responses | Testing with controlled inputs | Return fake user data | +| **Fake** | Working implementation (simpler) | Need real behavior without dependencies | In-memory database | +| **Spy** | Record calls while preserving real behavior | Testing interactions + real logic | Count cache hits | + +### Mocks (Verify Behavior) + +```python +from unittest.mock import Mock, patch, call + +def test_send_welcome_email_called_on_registration(client, mocker): + """Mock to verify email service was called""" + mock_send_email = mocker.patch("app.services.email.send_email") + + response = client.post("/register", json={ + "email": "user@example.com", + "name": "Alice" + }) + + assert response.status_code == 201 + + # Verify email service was called with correct arguments + mock_send_email.assert_called_once_with( + to="user@example.com", + template="welcome", + context={"name": "Alice"} + ) + +def test_payment_failure_triggers_rollback(client, mocker): + """Mock to verify rollback is called on payment failure""" + mock_payment = mocker.patch("app.services.payment.charge") + mock_payment.side_effect = PaymentError("Card declined") + + mock_rollback = mocker.patch("app.database.rollback") + + response = client.post("/orders", json={"total": 100}) + + assert response.status_code == 402 + 
mock_rollback.assert_called_once() +``` + +### Stubs (Return Predefined Data) + +```python +def test_user_profile_with_stubbed_external_api(client, mocker): + """Stub external API to return controlled data""" + # Stub returns predefined response + mock_external_api = mocker.patch("app.services.profile.fetch_profile_data") + mock_external_api.return_value = { + "avatar_url": "https://example.com/avatar.jpg", + "bio": "Test bio" + } + + response = client.get("/users/123/profile") + + assert response.status_code == 200 + data = response.json() + assert data["avatar_url"] == "https://example.com/avatar.jpg" + +def test_payment_processing_with_different_responses(client, mocker): + """Test different payment responses using stubs""" + mock_payment = mocker.patch("app.services.payment.charge") + + # Test success + mock_payment.return_value = {"status": "success", "id": "pay_123"} + response = client.post("/orders", json={"total": 100}) + assert response.status_code == 201 + + # Test failure + mock_payment.return_value = {"status": "declined", "reason": "insufficient_funds"} + response = client.post("/orders", json={"total": 100}) + assert response.status_code == 402 +``` + +### Fakes (Working Implementation) + +```python +class FakePaymentGateway: + """Fake payment gateway with working implementation""" + def __init__(self): + self.charges = [] + self.fail_next = False + + def charge(self, amount, customer_id): + """Fake charge that tracks calls""" + if self.fail_next: + self.fail_next = False + raise PaymentError("Simulated failure") + + charge_id = f"fake_charge_{len(self.charges) + 1}" + self.charges.append({ + "id": charge_id, + "amount": amount, + "customer_id": customer_id, + "status": "success" + }) + return {"id": charge_id, "status": "success"} + + def refund(self, charge_id): + """Fake refund""" + for charge in self.charges: + if charge["id"] == charge_id: + charge["status"] = "refunded" + return True + return False + +@pytest.fixture +def fake_payment(): + return FakePaymentGateway() + +def test_order_with_fake_payment(client, fake_payment): + """Test using fake payment gateway""" + app.dependency_overrides[get_payment_gateway] = lambda: fake_payment + + # Create order + response = client.post("/orders", json={"total": 100}) + assert response.status_code == 201 + + # Verify payment was charged + assert len(fake_payment.charges) == 1 + assert fake_payment.charges[0]["amount"] == 100 + + # Test refund + charge_id = fake_payment.charges[0]["id"] + response = client.post(f"/orders/{charge_id}/refund") + + assert response.status_code == 200 + assert fake_payment.charges[0]["status"] == "refunded" +``` + +### Spies (Record Calls + Real Behavior) + +```python +def test_cache_hit_rate_with_spy(client, mocker): + """Spy on cache to measure hit rate""" + real_cache_get = cache.get + + call_count = {"hits": 0, "misses": 0} + + def spy_cache_get(key): + result = real_cache_get(key) + if result is not None: + call_count["hits"] += 1 + else: + call_count["misses"] += 1 + return result + + mocker.patch("app.cache.get", side_effect=spy_cache_get) + + # Make requests + for _ in range(10): + client.get("/users/123") + + # Verify cache behavior + assert call_count["hits"] > 5 # Most should hit cache + assert call_count["misses"] <= 1 # Only first miss +``` + +## Performance Testing + +### Load Testing with Locust + +**Setup Locust test**: + +```python +# tests/performance/locustfile.py +from locust import HttpUser, task, between +import random + +class APIUser(HttpUser): + """Simulate API user 
behavior""" + wait_time = between(1, 3) # Wait 1-3 seconds between requests + + def on_start(self): + """Login once per user""" + response = self.client.post("/login", json={ + "email": "test@example.com", + "password": "password123" + }) + self.token = response.json()["access_token"] + + @task(3) # Weight: 3x more likely than other tasks + def get_users(self): + """GET /users (most common operation)""" + self.client.get( + "/users", + headers={"Authorization": f"Bearer {self.token}"} + ) + + @task(2) + def get_user_detail(self): + """GET /users/{id}""" + user_id = random.randint(1, 1000) + self.client.get( + f"/users/{user_id}", + headers={"Authorization": f"Bearer {self.token}"} + ) + + @task(1) + def create_order(self): + """POST /orders (less common)""" + self.client.post( + "/orders", + json={"total": 99.99, "items": ["item1", "item2"]}, + headers={"Authorization": f"Bearer {self.token}"} + ) +``` + +**Run load test**: + +```bash +# Start Locust +locust -f tests/performance/locustfile.py --host=http://localhost:8000 + +# Command-line load test (no web UI) +locust -f tests/performance/locustfile.py \ + --host=http://localhost:8000 \ + --users 100 \ + --spawn-rate 10 \ + --run-time 60s \ + --headless +``` + +**Performance thresholds in tests**: + +```python +import pytest +from locust import stats +from locust.env import Environment + +def test_api_handles_load(): + """Test API handles 100 concurrent users""" + env = Environment(user_classes=[APIUser]) + runner = env.create_local_runner() + + # Run load test + runner.start(user_count=100, spawn_rate=10) + runner.greenlet.join(timeout=60) + + # Assert performance requirements + stats_dict = runner.stats.total + + assert stats_dict.avg_response_time < 200, "Average response time too high" + assert stats_dict.fail_ratio < 0.01, "Error rate above 1%" + assert stats_dict.get_response_time_percentile(0.95) < 500, "95th percentile too high" +``` + +### Benchmark Testing with pytest-benchmark + +```python +import pytest + +def test_user_query_performance(benchmark, test_db): + """Benchmark user query performance""" + # Setup test data + UserFactory.create_batch(1000) + + # Benchmark the query + result = benchmark(lambda: test_db.query(User).filter(User.is_active == True).all()) + + # Assertions on benchmark + assert len(result) == 1000 + assert benchmark.stats["mean"] < 0.1, "Query too slow (>100ms)" + +def test_endpoint_response_time(benchmark, client): + """Benchmark endpoint response time""" + def make_request(): + return client.get("/users") + + result = benchmark(make_request) + + assert result.status_code == 200 + assert benchmark.stats["mean"] < 0.050, "Endpoint too slow (>50ms)" +``` + +**Benchmark comparison** (track performance over time): + +```bash +# Save benchmark results +pytest tests/performance/ --benchmark-save=baseline + +# Compare against baseline +pytest tests/performance/ --benchmark-compare=baseline + +# Fail if performance degrades >10% +pytest tests/performance/ --benchmark-compare=baseline --benchmark-compare-fail=mean:10% +``` + +## Security Testing + +### Authentication Testing + +**Test auth flows**: + +```python +import pytest + +def test_login_with_valid_credentials(client): + """Test successful login""" + response = client.post("/login", json={ + "email": "user@example.com", + "password": "correct_password" + }) + + assert response.status_code == 200 + data = response.json() + assert "access_token" in data + assert "refresh_token" in data + +def test_login_with_invalid_credentials(client): + """Test failed 
login""" + response = client.post("/login", json={ + "email": "user@example.com", + "password": "wrong_password" + }) + + assert response.status_code == 401 + assert "invalid credentials" in response.json()["detail"].lower() + +def test_access_protected_endpoint_without_token(client): + """Test unauthorized access""" + response = client.get("/users/me") + assert response.status_code == 401 + +def test_access_protected_endpoint_with_valid_token(client, auth_token): + """Test authorized access""" + response = client.get( + "/users/me", + headers={"Authorization": f"Bearer {auth_token}"} + ) + assert response.status_code == 200 + +def test_access_with_expired_token(client): + """Test expired token rejection""" + expired_token = create_expired_token(user_id=1) + + response = client.get( + "/users/me", + headers={"Authorization": f"Bearer {expired_token}"} + ) + + assert response.status_code == 401 + assert "expired" in response.json()["detail"].lower() + +def test_token_refresh(client, refresh_token): + """Test refresh token flow""" + response = client.post("/refresh", json={ + "refresh_token": refresh_token + }) + + assert response.status_code == 200 + data = response.json() + assert "access_token" in data + assert data["access_token"] != refresh_token +``` + +### Authorization Testing + +```python +@pytest.mark.parametrize("role,endpoint,expected_status", [ + ("admin", "/admin/users", 200), + ("admin", "/admin/settings", 200), + ("user", "/admin/users", 403), + ("user", "/admin/settings", 403), + ("user", "/users/me", 200), + ("guest", "/users/me", 401), +]) +def test_role_based_access_control(client, role, endpoint, expected_status): + """Test RBAC for different roles""" + if role == "guest": + response = client.get(endpoint) + else: + token = create_token_with_role(role) + response = client.get(endpoint, headers={"Authorization": f"Bearer {token}"}) + + assert response.status_code == expected_status +``` + +### Injection Testing + +**SQL injection testing**: + +```python +def test_sql_injection_in_query_params(client): + """Test SQL injection is prevented""" + malicious_input = "1' OR '1'='1" + + response = client.get(f"/users?name={malicious_input}") + + # Should return empty or error, not all users + assert response.status_code in [200, 400] + if response.status_code == 200: + assert len(response.json()) == 0 + +def test_sql_injection_in_json_body(client): + """Test SQL injection in request body""" + response = client.post("/users", json={ + "name": "'; DROP TABLE users; --", + "email": "test@example.com" + }) + + # Should succeed (string is escaped) or fail validation + assert response.status_code in [201, 400] + + # Verify table still exists + verify_response = client.get("/users") + assert verify_response.status_code == 200 +``` + +**Command injection testing**: + +```python +def test_command_injection_in_file_path(client): + """Test command injection is prevented""" + malicious_path = "../../etc/passwd" + + response = client.get(f"/files/{malicious_path}") + + assert response.status_code in [400, 404] + assert "etc/passwd" not in response.text +``` + +### CORS Testing + +```python +def test_cors_headers_present(client): + """Test CORS headers are set""" + response = client.options( + "/users", + headers={"Origin": "https://example.com"} + ) + + assert response.headers.get("Access-Control-Allow-Origin") == "https://example.com" + assert "GET" in response.headers.get("Access-Control-Allow-Methods", "") + assert "POST" in response.headers.get("Access-Control-Allow-Methods", "") + +def 
test_cors_blocks_unauthorized_origin(client): + """Test CORS blocks unauthorized origins""" + response = client.options( + "/users", + headers={"Origin": "https://malicious.com"} + ) + + # Should not include CORS headers for unauthorized origin + assert response.headers.get("Access-Control-Allow-Origin") is None +``` + +### Rate Limiting Testing + +```python +def test_rate_limit_enforced(client): + """Test rate limiting blocks excessive requests""" + # Make requests up to limit (e.g., 100/minute) + for _ in range(100): + response = client.get("/users") + assert response.status_code == 200 + + # Next request should be rate limited + response = client.get("/users") + assert response.status_code == 429 + assert "rate limit" in response.json()["detail"].lower() + +def test_rate_limit_reset_after_window(client, mocker): + """Test rate limit resets after time window""" + # Exhaust rate limit + for _ in range(100): + client.get("/users") + + # Fast-forward time + mocker.patch("time.time", return_value=time.time() + 61) # 61 seconds + + # Should work again + response = client.get("/users") + assert response.status_code == 200 +``` + +## Test Quality and Coverage + +### Coverage Analysis + +**Run tests with coverage**: + +```bash +# Generate coverage report +pytest --cov=app --cov-report=html --cov-report=term + +# Fail if coverage below threshold +pytest --cov=app --cov-fail-under=80 + +# Show missing lines +pytest --cov=app --cov-report=term-missing +``` + +**Coverage configuration (.coveragerc)**: + +```ini +[run] +source = app +omit = + */tests/* + */migrations/* + */__pycache__/* + */venv/* + +[report] +exclude_lines = + pragma: no cover + def __repr__ + raise AssertionError + raise NotImplementedError + if __name__ == .__main__.: + if TYPE_CHECKING: + @abstractmethod +``` + +**Branch coverage** (more thorough): + +```bash +# Test both branches of conditionals +pytest --cov=app --cov-branch +``` + +### Mutation Testing + +**Install mutation testing tools**: + +```bash +pip install mutpy cosmic-ray +``` + +**Run mutation tests** (test the tests): + +```bash +# Using mutpy +mut.py --target app.services.payment \ + --unit-test tests.test_payment \ + --report-html mutation-report + +# Using cosmic-ray +cosmic-ray init cosmic-ray.conf payment_session +cosmic-ray exec payment_session +cosmic-ray report payment_session +``` + +**Mutation testing concept**: +- Introduces bugs into code (mutations) +- Runs tests against mutated code +- If tests still pass, they didn't catch the mutation (weak tests) +- Goal: 100% mutation score (all mutations caught) + +### Detecting Flaky Tests + +**Repeat tests to find flakiness**: + +```bash +# Run tests 100 times to detect flaky tests +pytest --count=100 tests/test_orders.py + +# Run with pytest-flakefinder +pytest --flake-finder --flake-runs=50 +``` + +**Common flaky test causes**: +- Non-deterministic data (random, timestamps) +- Async race conditions +- Test order dependencies +- External service dependencies +- Shared test state + +**Fix flaky tests**: + +```python +# BAD: Non-deterministic timestamp +def test_user_creation_time(client): + response = client.post("/users", json={...}) + # Flaky: timestamp might differ by milliseconds + assert response.json()["created_at"] == datetime.now().isoformat() + +# GOOD: Relative time check +def test_user_creation_time(client): + before = datetime.now() + response = client.post("/users", json={...}) + after = datetime.now() + + created_at = datetime.fromisoformat(response.json()["created_at"]) + assert before <= 
created_at <= after + +# BAD: Random data without seed +def test_user_name(): + name = random.choice(["Alice", "Bob", "Charlie"]) + # Flaky: different name each run + assert create_user(name).name == name + +# GOOD: Seeded random or fixed data +def test_user_name(): + random.seed(42) # Deterministic + name = random.choice(["Alice", "Bob", "Charlie"]) + assert create_user(name).name == name +``` + +## CI/CD Integration + +### GitHub Actions Workflow + +```yaml +# .github/workflows/test.yml +name: Tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + + services: + postgres: + image: postgres:15 + env: + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test_db + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + redis: + image: redis:7 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 6379:6379 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install -r requirements-dev.txt + + - name: Run migrations + run: alembic upgrade head + env: + DATABASE_URL: postgresql://postgres:postgres@localhost:5432/test_db + + - name: Run unit tests + run: pytest -m unit --cov=app --cov-report=xml + + - name: Run integration tests + run: pytest -m integration --cov=app --cov-append --cov-report=xml + env: + DATABASE_URL: postgresql://postgres:postgres@localhost:5432/test_db + REDIS_URL: redis://localhost:6379 + + - name: Run security tests + run: pytest -m security + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + files: ./coverage.xml + fail_ci_if_error: true + + - name: Check coverage threshold + run: pytest --cov=app --cov-fail-under=80 +``` + +### Test Reporting + +**Generate JUnit XML for CI**: + +```bash +pytest --junitxml=test-results.xml +``` + +**HTML test report**: + +```bash +pytest --html=test-report.html --self-contained-html +``` + +## Debugging Failing Tests + +### pytest Debugging Flags + +```bash +# Stop on first failure +pytest -x + +# Show local variables on failure +pytest -l + +# Enter debugger on failure +pytest --pdb + +# Enter debugger on first failure +pytest -x --pdb + +# Show print statements +pytest -s + +# Verbose output +pytest -v + +# Very verbose output (show full diff) +pytest -vv + +# Run last failed tests only +pytest --lf + +# Run failed tests first, then rest +pytest --ff + +# Show slowest 10 tests +pytest --durations=10 +``` + +### Using pdb for Interactive Debugging + +```python +import pytest + +def test_complex_calculation(client): + """Debug this test interactively""" + response = client.post("/calculate", json={"x": 10, "y": 20}) + + # Set breakpoint + import pdb; pdb.set_trace() + + # Interactive debugging from here + result = response.json() + assert result["sum"] == 30 +``` + +**pdb commands**: +- `n` (next): Execute next line +- `s` (step): Step into function +- `c` (continue): Continue execution +- `p variable`: Print variable value +- `pp variable`: Pretty-print variable +- `l` (list): Show current location in code +- `w` (where): Show stack trace +- `q` (quit): Exit debugger + +### Debugging with pytest-timeout + +```python +import pytest + +@pytest.mark.timeout(5) # Fail if test takes >5 seconds +def test_slow_operation(client): + """This test might hang - timeout prevents infinite 
wait""" + response = client.get("/slow-endpoint") + assert response.status_code == 200 +``` + +## Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| **Tests depend on each other** | Brittle, can't run in parallel | Use fixtures for shared setup | +| **Testing implementation details** | Breaks when refactoring | Test behavior/outcomes, not internals | +| **No test isolation** | One test affects another | Use transaction rollback, clean state | +| **Mocking too much** | Tests don't reflect reality | Use real dependencies where feasible | +| **No performance tests** | Production slowdowns surprise you | Add load/benchmark tests | +| **Ignoring flaky tests** | Erodes trust in test suite | Fix or remove flaky tests | +| **Low coverage with poor tests** | False confidence | Focus on quality, not just coverage | +| **Testing private methods** | Couples tests to implementation | Test public interface only | + +## Cross-References + +**Related skills**: +- **Database testing** → `database-integration` (test database setup, query testing) +- **FastAPI patterns** → `fastapi-development` (dependency injection for tests) +- **Security** → `ordis-security-architect` (security testing strategies) +- **Authentication** → `api-authentication` (auth testing patterns) + +## Further Reading + +- **pytest documentation**: https://docs.pytest.org/ +- **Testing FastAPI**: https://fastapi.tiangolo.com/tutorial/testing/ +- **Locust load testing**: https://docs.locust.io/ +- **Test Driven Development** by Kent Beck +- **Growing Object-Oriented Software, Guided by Tests** by Freeman & Pryce diff --git a/skills/using-web-backend/database-integration.md b/skills/using-web-backend/database-integration.md new file mode 100644 index 0000000..656ee8b --- /dev/null +++ b/skills/using-web-backend/database-integration.md @@ -0,0 +1,1117 @@ + +# Database Integration + +## Overview + +**Database integration specialist covering SQLAlchemy, connection pooling, query optimization, migrations, transactions, and production patterns.** + +**Core principle**: Databases are stateful, high-latency external systems requiring careful connection management, query optimization, and migration strategies to maintain performance and reliability at scale. 
+ +## When to Use This Skill + +Use when encountering: + +- **Connection pooling**: Pool exhaustion, "too many connections" errors, pool configuration +- **Query optimization**: N+1 queries, slow endpoints, eager loading strategies +- **Migrations**: Schema changes, zero-downtime deployments, data backfills +- **Transactions**: Multi-step operations, rollback strategies, isolation levels +- **ORM vs Raw SQL**: Complex queries, performance optimization, query readability +- **Testing**: Database test strategies, fixtures, test isolation +- **Monitoring**: Query performance tracking, connection pool health + +**Do NOT use for**: +- Database selection (PostgreSQL vs MySQL vs MongoDB) +- Database administration (backup, replication, sharding) +- Schema design principles (see general architecture resources) + +## Connection Pool Configuration + +### Pool Sizing Formula + +**Calculate pool size based on deployment architecture**: + +```python +# Formula: pool_size × num_workers ≤ (postgres_max_connections - reserved) +# Example: 10 workers × 5 connections = 50 total ≤ (100 - 10) reserved + +from sqlalchemy import create_engine +from sqlalchemy.pool import QueuePool + +DATABASE_URL = "postgresql://user:pass@host/db" + +engine = create_engine( + DATABASE_URL, + poolclass=QueuePool, + pool_size=5, # Connections per worker + max_overflow=10, # Additional connections during spikes + pool_pre_ping=True, # CRITICAL: Verify connection before use + pool_recycle=3600, # Recycle after 1 hour (prevent stale connections) + pool_timeout=30, # Wait max 30s for connection from pool + echo_pool=False, # Enable for debugging pool issues + connect_args={ + "connect_timeout": 10, + "options": "-c statement_timeout=30000" # 30s query timeout + } +) +``` + +**Environment-based configuration**: + +```python +import os +from pydantic import BaseSettings + +class DatabaseSettings(BaseSettings): + database_url: str + pool_size: int = 5 + max_overflow: int = 10 + pool_pre_ping: bool = True + pool_recycle: int = 3600 + + class Config: + env_file = ".env" + +settings = DatabaseSettings() + +engine = create_engine( + settings.database_url, + pool_size=settings.pool_size, + max_overflow=settings.max_overflow, + pool_pre_ping=settings.pool_pre_ping, + pool_recycle=settings.pool_recycle +) +``` + +**Async configuration** (asyncpg): + +```python +from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession +from sqlalchemy.orm import sessionmaker + +engine = create_async_engine( + "postgresql+asyncpg://user:pass@host/db", + pool_size=20, # Async handles more concurrent connections + max_overflow=0, # No overflow - fail fast + pool_pre_ping=False, # asyncpg handles internally + pool_recycle=3600 +) + +async_session = sessionmaker( + engine, class_=AsyncSession, expire_on_commit=False +) +``` + +### Pool Health Monitoring + +**Health check endpoint**: + +```python +from fastapi import FastAPI, HTTPException +from sqlalchemy import text + +app = FastAPI() + +@app.get("/health/database") +async def database_health(db: Session = Depends(get_db)): + """Check database connectivity and pool status""" + try: + # Simple query to verify connection + result = db.execute(text("SELECT 1")) + + # Check pool statistics + pool = db.get_bind().pool + pool_status = { + "size": pool.size(), + "checked_in": pool.checkedin(), + "checked_out": pool.checkedout(), + "overflow": pool.overflow(), + "total_connections": pool.size() + pool.overflow() + } + + return { + "status": "healthy", + "pool": pool_status + } + except Exception as e: + raise 
HTTPException(status_code=503, detail=f"Database unhealthy: {e}") +``` + +**Pool exhaustion debugging**: + +```python +import logging + +logger = logging.getLogger(__name__) + +# Enable pool event logging +from sqlalchemy import event + +@event.listens_for(engine, "connect") +def receive_connect(dbapi_conn, connection_record): + logger.info(f"New connection created: {id(dbapi_conn)}") + +@event.listens_for(engine, "checkout") +def receive_checkout(dbapi_conn, connection_record, connection_proxy): + logger.debug(f"Connection checked out: {id(dbapi_conn)}") + + pool = connection_proxy._pool + logger.debug( + f"Pool status - size: {pool.size()}, " + f"checked_out: {pool.checkedout()}, " + f"overflow: {pool.overflow()}" + ) + +@event.listens_for(engine, "checkin") +def receive_checkin(dbapi_conn, connection_record): + logger.debug(f"Connection checked in: {id(dbapi_conn)}") +``` + +### Testing with NullPool + +**Disable pooling in tests**: + +```python +from sqlalchemy.pool import NullPool + +# Test configuration - no connection pooling +test_engine = create_engine( + "postgresql://user:pass@localhost/test_db", + poolclass=NullPool, # No pooling - each query gets new connection + echo=True # Log all SQL queries +) +``` + +## Query Optimization + +### N+1 Query Detection + +**Automatic detection in tests**: + +```python +from sqlalchemy import event +from sqlalchemy.engine import Engine +import pytest + +class QueryCounter: + """Count queries executed during test""" + def __init__(self): + self.queries = [] + + def __enter__(self): + event.listen(Engine, "before_cursor_execute", self._before_cursor_execute) + return self + + def __exit__(self, *args): + event.remove(Engine, "before_cursor_execute", self._before_cursor_execute) + + def _before_cursor_execute(self, conn, cursor, statement, *args): + self.queries.append(statement) + + @property + def count(self): + return len(self.queries) + +# Test usage +def test_no_n_plus_1(): + with QueryCounter() as counter: + users = get_users_with_posts() # Should use eager loading + + # Access posts (should not trigger additional queries) + for user in users: + _ = [post.title for post in user.posts] + + # Should be 1-2 queries, not 101 + assert counter.count <= 2, f"N+1 detected: {counter.count} queries" +``` + +### Eager Loading Strategies + +**Decision matrix**: + +| Pattern | Queries | Use When | Example | +|---------|---------|----------|---------| +| `joinedload()` | 1 (JOIN) | One-to-one, small one-to-many | User → Profile | +| `selectinload()` | 2 (IN clause) | One-to-many with many rows | User → Posts | +| `subqueryload()` | 2 (subquery) | Legacy alternative | Use selectinload instead | +| `raiseload()` | 0 (raises error) | Prevent lazy loading | Production safety | + +**joinedload() - Single query with JOIN**: + +```python +from sqlalchemy.orm import joinedload + +# Single query: SELECT * FROM users LEFT OUTER JOIN posts ON ... +users = db.query(User).options( + joinedload(User.posts) +).all() + +# Best for one-to-one or small one-to-many +user = db.query(User).options( + joinedload(User.profile) # One-to-one +).filter(User.id == user_id).first() +``` + +**selectinload() - Two queries (more efficient for many rows)**: + +```python +from sqlalchemy.orm import selectinload + +# Query 1: SELECT * FROM users +# Query 2: SELECT * FROM posts WHERE user_id IN (1, 2, 3, ...) 
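+# Two round trips, but no duplicated parent rows from a JOIN -
+# the result set stays compact even when each user has many posts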
+users = db.query(User).options(
+    selectinload(User.posts)
+).all()
+
+# Best for one-to-many with many related rows
+```
+
+**Nested eager loading**:
+
+```python
+# Load users → posts → comments (3 queries total)
+users = db.query(User).options(
+    selectinload(User.posts).selectinload(Post.comments)
+).all()
+```
+
+**Conditional eager loading**:
+
+```python
+from sqlalchemy.orm import selectinload, with_loader_criteria
+
+# Only load published posts - with_loader_criteria applies the filter
+# to the eager-load query (SQLAlchemy 1.4+)
+users = db.query(User).options(
+    selectinload(User.posts),
+    with_loader_criteria(Post, Post.published == True)
+).all()
+```
+
+**Prevent lazy loading in production** (raiseload):
+
+```python
+from sqlalchemy.orm import raiseload
+
+# Raise error if any relationship accessed without eager loading
+users = db.query(User).options(
+    raiseload('*')  # Disable all lazy loading
+).all()
+
+# This will raise an error:
+# user.posts  # InvalidRequestError: 'User.posts' is not available due to lazy='raise'
+```
+
+### Query Performance Measurement
+
+**Log slow queries**:
+
+```python
+from sqlalchemy import event
+from sqlalchemy.engine import Engine
+import time
+import logging
+
+logger = logging.getLogger(__name__)
+
+SLOW_QUERY_THRESHOLD = 1.0  # seconds
+
+@event.listens_for(Engine, "before_cursor_execute")
+def before_cursor_execute(conn, cursor, statement, parameters, context, executemany):
+    conn.info.setdefault('query_start_time', []).append(time.time())
+
+@event.listens_for(Engine, "after_cursor_execute")
+def after_cursor_execute(conn, cursor, statement, parameters, context, executemany):
+    total_time = time.time() - conn.info['query_start_time'].pop()
+
+    if total_time > SLOW_QUERY_THRESHOLD:
+        logger.warning(
+            f"Slow query ({total_time:.2f}s): {statement[:200]}",
+            extra={
+                "duration": total_time,
+                "statement": statement,
+                "parameters": parameters
+            }
+        )
+```
+
+**EXPLAIN ANALYZE for query optimization**:
+
+```python
+from sqlalchemy import text
+
+def explain_query(db: Session, query):
+    """Get query execution plan"""
+    compiled = query.statement.compile(
+        compile_kwargs={"literal_binds": True}
+    )
+
+    explain_result = db.execute(
+        text(f"EXPLAIN ANALYZE {compiled}")
+    ).fetchall()
+
+    return "\n".join([row[0] for row in explain_result])
+
+# Usage
+query = db.query(User).join(Post).filter(Post.published == True)
+plan = explain_query(db, query)
+print(plan)
+```
+
+### Deferred Column Loading
+
+**Exclude large columns from initial query**:
+
+```python
+from sqlalchemy.orm import defer, undefer
+
+# Don't load large 'bio' column initially
+users = db.query(User).options(
+    defer(User.bio),            # Skip this column
+    defer(User.profile_image)   # Skip binary data
+).all()
+
+# Load specific user's bio when needed
+user = db.query(User).options(
+    undefer(User.bio)  # Load this column
+).filter(User.id == user_id).first()
+```
+
+**Load only specific columns**:
+
+```python
+from sqlalchemy.orm import load_only
+
+# Only load id and name (ignore all other columns)
+users = db.query(User).options(
+    load_only(User.id, User.name)
+).all()
+```
+
+## Zero-Downtime Migrations
+
+### Migration Decision Matrix
+
+| Operation | Locking | Approach | Downtime |
+|-----------|---------|----------|----------|
+| Add nullable column | None | Single migration | No |
+| Add NOT NULL column | Table lock | Multi-phase (nullable → backfill → NOT NULL) | No |
+| Add index | Share lock | `CREATE INDEX CONCURRENTLY` | No |
+| Add foreign key | Share lock | `NOT VALID` → `VALIDATE` | No |
+| Drop column | None | Multi-phase (stop using → drop) | No |
+| Rename
 column | None | Multi-phase (add new → dual write → drop old) | No |
+| Alter column type | Table lock | Multi-phase or rebuild table | Maybe |
+
+### Multi-Phase NOT NULL Migration
+
+**Phase 1: Add nullable column**:
+
+```python
+# migrations/versions/001_add_email_verified.py
+def upgrade():
+    # Fast: no table rewrite
+    op.add_column('users', sa.Column('email_verified', sa.Boolean(), nullable=True))
+
+    # Set default for new rows
+    op.execute("ALTER TABLE users ALTER COLUMN email_verified SET DEFAULT false")
+
+def downgrade():
+    op.drop_column('users', 'email_verified')
+```
+
+**Phase 2: Backfill in batches**:
+
+```python
+# migrations/versions/002_backfill_email_verified.py
+from alembic import op
+import sqlalchemy as sa
+
+def upgrade():
+    """Backfill existing rows in batches"""
+    connection = op.get_bind()
+
+    # Process in batches to avoid long transactions
+    batch_size = 10000
+    total_updated = 0
+
+    while True:
+        result = connection.execute(sa.text("""
+            UPDATE users
+            SET email_verified = false
+            WHERE email_verified IS NULL
+            AND id IN (
+                SELECT id FROM users
+                WHERE email_verified IS NULL
+                ORDER BY id
+                LIMIT :batch_size
+            )
+        """), {"batch_size": batch_size})
+
+        rows_updated = result.rowcount
+        total_updated += rows_updated
+
+        if rows_updated == 0:
+            break
+
+    print(f"Backfilled {total_updated} rows")
+
+def downgrade():
+    pass  # No rollback needed
+```
+
+**Phase 3: Add NOT NULL constraint**:
+
+```python
+# migrations/versions/003_make_email_verified_not_null.py
+def upgrade():
+    # Verify no NULLs remain
+    connection = op.get_bind()
+    result = connection.execute(sa.text(
+        "SELECT COUNT(*) FROM users WHERE email_verified IS NULL"
+    ))
+    null_count = result.scalar()
+
+    if null_count > 0:
+        raise Exception(f"Cannot add NOT NULL: {null_count} NULL values remain")
+
+    # Add NOT NULL constraint (fast since all values are set)
+    op.alter_column('users', 'email_verified', nullable=False)
+
+def downgrade():
+    op.alter_column('users', 'email_verified', nullable=True)
+```
+
+### Concurrent Index Creation
+
+**Without CONCURRENTLY (blocks writes)**:
+
+```python
+# BAD: Locks table during index creation
+def upgrade():
+    op.create_index('idx_users_email', 'users', ['email'])
+```
+
+**With CONCURRENTLY (no locks)**:
+
+```python
+# GOOD: No blocking, safe for production
+def upgrade():
+    # CONCURRENTLY cannot run inside a transaction block, and Alembic
+    # wraps migrations in one by default - use an autocommit block
+    with op.get_context().autocommit_block():
+        op.execute("""
+            CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_users_email
+            ON users (email)
+        """)
+
+def downgrade():
+    with op.get_context().autocommit_block():
+        op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_users_email")
+```
+
+**Partial index for efficiency**:
+
+```python
+def upgrade():
+    with op.get_context().autocommit_block():
+        op.execute("""
+            CREATE INDEX CONCURRENTLY idx_users_active_email
+            ON users (email)
+            WHERE deleted_at IS NULL
+        """)
+```
+
+### Adding Foreign Keys Without Blocking
+
+**Using NOT VALID constraint**:
+
+```python
+# migrations/versions/004_add_foreign_key.py
+def upgrade():
+    # Phase 1: Add constraint without validating existing rows (fast)
+    op.execute("""
+        ALTER TABLE posts
+        ADD CONSTRAINT fk_posts_user_id
+        FOREIGN KEY (user_id)
+        REFERENCES users (id)
+        NOT VALID
+    """)
+
+    # Phase 2: Validate constraint in background (can be canceled/restarted)
+    op.execute("""
+        ALTER TABLE posts
+        VALIDATE CONSTRAINT fk_posts_user_id
+    """)
+
+def downgrade():
+    op.drop_constraint('fk_posts_user_id', 'posts', type_='foreignkey')
+```
+
+### Migration Monitoring
+
+**Track migration progress**:
+
+```sql
+-- Check backfill progress
+SELECT
+    COUNT(*) FILTER (WHERE email_verified IS NULL) as null_count,
+
COUNT(*) as total_count, + ROUND(100.0 * COUNT(*) FILTER (WHERE email_verified IS NOT NULL) / COUNT(*), 2) as pct_complete +FROM users; + +-- Check index creation progress (PostgreSQL 12+) +SELECT + phase, + ROUND(100.0 * blocks_done / NULLIF(blocks_total, 0), 2) as pct_complete +FROM pg_stat_progress_create_index +WHERE relid = 'users'::regclass; +``` + +## Transaction Management + +### Basic Transaction Pattern + +**Context manager with automatic rollback**: + +```python +from contextlib import contextmanager +from sqlalchemy.orm import Session + +@contextmanager +def transactional_session(db: Session): + """Context manager for automatic rollback on error""" + try: + yield db + db.commit() + except Exception as e: + db.rollback() + raise + finally: + db.close() + +# Usage +with transactional_session(db) as session: + user = User(name="Alice") + session.add(user) + # Automatic commit on success, rollback on exception +``` + +### Savepoints for Partial Rollback + +**Nested transactions with savepoints**: + +```python +def create_order_with_retry(db: Session, order_data: dict): + """Use savepoints to retry failed steps without losing entire transaction""" + # Start main transaction + order = Order(**order_data) + db.add(order) + db.flush() # Get order.id + + # Try payment with savepoint + sp = db.begin_nested() # Create savepoint + try: + payment = process_payment(order.total) + order.payment_id = payment.id + except PaymentError as e: + sp.rollback() # Rollback to savepoint (keep order) + + # Try alternative payment method + sp = db.begin_nested() + try: + payment = process_backup_payment(order.total) + order.payment_id = payment.id + except PaymentError: + sp.rollback() + raise HTTPException(status_code=402, detail="All payment methods failed") + + db.commit() # Commit entire transaction + return order +``` + +### Locking Strategies + +**Optimistic locking with version column**: + +```python +from sqlalchemy import Column, Integer, String + +class Product(Base): + __tablename__ = "products" + id = Column(Integer, primary_key=True) + name = Column(String) + inventory = Column(Integer) + version = Column(Integer, nullable=False, default=1) # Version column + +# Usage +def decrement_inventory(db: Session, product_id: int, quantity: int): + product = db.query(Product).filter(Product.id == product_id).first() + + if product.inventory < quantity: + raise ValueError("Insufficient inventory") + + # Update with version check + rows_updated = db.execute( + sa.update(Product) + .where(Product.id == product_id) + .where(Product.version == product.version) # Check version hasn't changed + .values( + inventory=Product.inventory - quantity, + version=Product.version + 1 + ) + ).rowcount + + if rows_updated == 0: + # Version mismatch - another transaction modified this row + raise HTTPException(status_code=409, detail="Product was modified by another transaction") + + db.commit() +``` + +**Pessimistic locking with SELECT FOR UPDATE**: + +```python +def decrement_inventory_with_lock(db: Session, product_id: int, quantity: int): + """Acquire row lock to prevent concurrent modifications""" + # Lock the row (blocks other transactions) + product = db.query(Product).filter( + Product.id == product_id + ).with_for_update().first() # SELECT ... 
FOR UPDATE + + if not product: + raise HTTPException(status_code=404, detail="Product not found") + + if product.inventory < quantity: + raise HTTPException(status_code=400, detail="Insufficient inventory") + + product.inventory -= quantity + db.commit() + # Lock released after commit +``` + +**Lock timeout to prevent deadlocks**: + +```python +from sqlalchemy import text + +def with_lock_timeout(db: Session, timeout_ms: int = 5000): + """Set lock timeout for this transaction""" + db.execute(text(f"SET LOCAL lock_timeout = '{timeout_ms}ms'")) + +# Usage +try: + with_lock_timeout(db, 3000) # 3 second timeout + product = db.query(Product).with_for_update().filter(...).first() +except Exception as e: + if "lock timeout" in str(e).lower(): + raise HTTPException(status_code=409, detail="Resource locked by another transaction") + raise +``` + +### Isolation Levels + +**Configure isolation level**: + +```python +from sqlalchemy import create_engine + +# Default: READ COMMITTED +engine = create_engine( + DATABASE_URL, + isolation_level="REPEATABLE READ" # Options: READ UNCOMMITTED, READ COMMITTED, REPEATABLE READ, SERIALIZABLE +) + +# Per-transaction isolation +from sqlalchemy.orm import Session + +with Session(engine) as session: + session.connection(execution_options={"isolation_level": "SERIALIZABLE"}) + # ... transaction logic ... +``` + +## Raw SQL vs ORM + +### Decision Matrix + +| Use ORM When | Use Raw SQL When | +|--------------|------------------| +| CRUD operations | Complex CTEs (Common Table Expressions) | +| Simple joins (<3 tables) | Window functions with PARTITION BY | +| Type safety critical | Performance-critical queries | +| Database portability needed | Database-specific optimizations (PostgreSQL arrays, JSONB) | +| Code readability with ORM is good | ORM query becomes unreadable (>10 lines) | + +### Raw SQL with Type Safety + +**Parameterized queries with Pydantic results**: + +```python +from sqlalchemy import text +from pydantic import BaseModel +from typing import List + +class CustomerReport(BaseModel): + id: int + name: str + region: str + total_spent: float + order_count: int + rank_in_region: int + +@app.get("/reports/top-customers") +def get_top_customers( + db: Session = Depends(get_db), + region: str = None, + limit: int = 100 +) -> List[CustomerReport]: + """Complex report with CTEs and window functions""" + query = text(""" + WITH customer_totals AS ( + SELECT + u.id, + u.name, + u.region, + COUNT(o.id) as order_count, + COALESCE(SUM(o.total), 0) as total_spent + FROM users u + LEFT JOIN orders o ON u.id = o.user_id + WHERE u.deleted_at IS NULL + AND (:region IS NULL OR u.region = :region) + GROUP BY u.id, u.name, u.region + ), + ranked AS ( + SELECT + *, + ROW_NUMBER() OVER ( + PARTITION BY region + ORDER BY total_spent DESC + ) as rank_in_region + FROM customer_totals + ) + SELECT * FROM ranked + WHERE total_spent > 0 + ORDER BY total_spent DESC + LIMIT :limit + """) + + result = db.execute(query, {"region": region, "limit": limit}) + + # Type-safe results with Pydantic + return [CustomerReport(**dict(row._mapping)) for row in result] +``` + +### Hybrid Approach + +**Combine ORM and raw SQL**: + +```python +def get_user_analytics(db: Session, user_id: int): + """Use raw SQL for complex aggregation, ORM for simple queries""" + + # Complex aggregation in raw SQL + analytics_query = text(""" + SELECT + COUNT(*) as total_orders, + SUM(total) as lifetime_value, + AVG(total) as avg_order_value, + MAX(created_at) as last_order_date, + MIN(created_at) as first_order_date 
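            -- SUM/AVG/MAX/MIN return NULL when the user has no orders; the caller maps these to 0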
+ FROM orders + WHERE user_id = :user_id + """) + + analytics = db.execute(analytics_query, {"user_id": user_id}).first() + + # Simple ORM query for user details + user = db.query(User).filter(User.id == user_id).first() + + return { + "user": { + "id": user.id, + "name": user.name, + "email": user.email + }, + "analytics": { + "total_orders": analytics.total_orders, + "lifetime_value": float(analytics.lifetime_value or 0), + "avg_order_value": float(analytics.avg_order_value or 0), + "first_order": analytics.first_order_date, + "last_order": analytics.last_order_date + } + } +``` + +### Query Optimization Checklist + +**Before optimizing**: + +1. **Measure with EXPLAIN ANALYZE**: + ```sql + EXPLAIN ANALYZE + SELECT * FROM users JOIN orders ON users.id = orders.user_id; + ``` + +2. **Look for**: + - Sequential scans on large tables → Add index + - High loop counts → N+1 query problem + - Hash joins on small tables → Consider nested loop + - Sort operations → Consider index on ORDER BY columns + +3. **Optimize**: + - Add indexes on foreign keys, WHERE clauses, ORDER BY columns + - Use LIMIT for pagination + - Use EXISTS instead of IN for large subqueries + - Denormalize for read-heavy workloads + +**Index usage verification**: + +```sql +-- Check if index is being used +EXPLAIN SELECT * FROM users WHERE email = 'test@example.com'; +-- Look for "Index Scan using idx_users_email" + +-- Check index statistics +SELECT + schemaname, + tablename, + indexname, + idx_scan as index_scans, + idx_tup_read as tuples_read, + idx_tup_fetch as tuples_fetched +FROM pg_stat_user_indexes +WHERE tablename = 'users'; +``` + +## Testing Strategies + +### Test Database Setup + +**Separate test database with fixtures**: + +```python +import pytest +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from sqlalchemy.pool import NullPool + +@pytest.fixture(scope="session") +def test_engine(): + """Create test database engine""" + engine = create_engine( + "postgresql://user:pass@localhost/test_db", + poolclass=NullPool, # No pooling in tests + echo=True # Log all queries + ) + + # Create all tables + Base.metadata.create_all(engine) + + yield engine + + # Drop all tables after tests + Base.metadata.drop_all(engine) + +@pytest.fixture(scope="function") +def db_session(test_engine): + """Create fresh database session for each test""" + connection = test_engine.connect() + transaction = connection.begin() + + Session = sessionmaker(bind=connection) + session = Session() + + yield session + + # Rollback transaction (undo all changes) + session.close() + transaction.rollback() + connection.close() +``` + +### Factory Pattern for Test Data + +**Use factories for consistent test data**: + +```python +from factory import Factory, Faker, SubFactory +from factory.alchemy import SQLAlchemyModelFactory + +class UserFactory(SQLAlchemyModelFactory): + class Meta: + model = User + sqlalchemy_session = db_session + + name = Faker('name') + email = Faker('email') + created_at = Faker('date_time') + +class PostFactory(SQLAlchemyModelFactory): + class Meta: + model = Post + sqlalchemy_session = db_session + + title = Faker('sentence') + content = Faker('text') + user = SubFactory(UserFactory) # Auto-create related user + +# Test usage +def test_get_user_posts(db_session): + user = UserFactory.create() + PostFactory.create_batch(5, user=user) # Create 5 posts for user + + posts = db_session.query(Post).filter(Post.user_id == user.id).all() + assert len(posts) == 5 +``` + +### Testing Transactions + +**Test 
rollback behavior**:

```python
from sqlalchemy.exc import IntegrityError

def test_transaction_rollback(db_session):
    """Verify rollback on error"""
    user = User(name="Alice", email="alice@example.com")
    db_session.add(user)

    with pytest.raises(IntegrityError):
        # This should fail (duplicate email)
        user2 = User(name="Bob", email="alice@example.com")
        db_session.add(user2)
        db_session.commit()

    # Verify rollback occurred
    db_session.rollback()
    assert db_session.query(User).count() == 0
```

### Testing Migrations

**Test migration up and down**:

```python
from alembic import command
from alembic.config import Config

def test_migration_upgrade_downgrade():
    """Test migration can be applied and reversed"""
    alembic_cfg = Config("alembic.ini")
    alembic_cfg.set_main_option("sqlalchemy.url", TEST_DATABASE_URL)

    # Apply migration
    command.upgrade(alembic_cfg, "head")

    # Verify schema changes
    # ... assertions ...

    # Rollback migration
    command.downgrade(alembic_cfg, "-1")

    # Verify rollback
    # ... assertions ...
```

## Monitoring and Observability

### Query Performance Tracking

**Track slow queries with middleware**:

```python
from fastapi import Request
from sqlalchemy import event
from sqlalchemy.engine import Engine
import time
import logging

logger = logging.getLogger(__name__)

@app.middleware("http")
async def track_db_queries(request: Request, call_next):
    """Track database query performance per request"""
    query_count = 0
    total_query_time = 0.0

    def before_query(conn, cursor, statement, parameters, context, executemany):
        # Record start time only - the Engine executes the statement itself;
        # calling cursor.execute() in a listener would run every query twice
        conn.info.setdefault('request_query_start', []).append(time.time())

    def after_query(conn, cursor, statement, parameters, context, executemany):
        nonlocal query_count, total_query_time
        duration = time.time() - conn.info['request_query_start'].pop()
        query_count += 1
        total_query_time += duration

        if duration > 1.0:  # Log slow queries
            logger.warning(
                f"Slow query ({duration:.2f}s): {statement[:200]}",
                extra={
                    "duration": duration,
                    "path": request.url.path
                }
            )

    # Attach listeners (Engine-wide, so under concurrent requests the
    # per-request counts can include queries from other in-flight requests)
    event.listen(Engine, "before_cursor_execute", before_query)
    event.listen(Engine, "after_cursor_execute", after_query)

    response = await call_next(request)

    # Remove listeners
    event.remove(Engine, "before_cursor_execute", before_query)
    event.remove(Engine, "after_cursor_execute", after_query)

    # Add headers
    response.headers["X-DB-Query-Count"] = str(query_count)
    response.headers["X-DB-Query-Time"] = f"{total_query_time:.3f}s"

    return response
```

### Connection Pool Metrics

**Expose pool metrics for monitoring**:

```python
from prometheus_client import Gauge

pool_size_gauge = Gauge('db_pool_size', 'Number of connections in pool')
pool_checked_out_gauge = Gauge('db_pool_checked_out', 'Connections currently checked out')
pool_overflow_gauge = Gauge('db_pool_overflow', 'Overflow connections')

@app.on_event("startup")
async def start_pool_metrics():
    """Collect pool metrics periodically"""
    import asyncio

    async def collect_metrics():
        while True:
            pool = engine.pool
            pool_size_gauge.set(pool.size())
            pool_checked_out_gauge.set(pool.checkedout())
            pool_overflow_gauge.set(pool.overflow())

            await asyncio.sleep(10)  # Every 10 seconds

    asyncio.create_task(collect_metrics())
```

## Anti-Patterns

| Anti-Pattern | Why Bad | Fix |
|--------------|---------|-----|
| **No connection pooling** | Creates new connection per request (slow) | Use `create_engine()` with pool |
| **pool_pre_ping=False** | Fails on stale connections | Always `pool_pre_ping=True` in production |
| **Lazy loading in loops** | N+1 query problem | Use `joinedload()` or `selectinload()` |
| **No query timeout** | Slow queries block workers | Set `statement_timeout` in connect_args |
| 
**Large transactions** | Locks held too long, blocking | Break into smaller transactions | +| **No migration rollback** | Can't undo bad migrations | Always test downgrade path | +| **String interpolation in SQL** | SQL injection vulnerability | Use parameterized queries with `text()` | +| **No index on foreign keys** | Slow joins | Add index on all foreign key columns | +| **Blocking migrations** | Downtime during deployment | Use `CONCURRENTLY`, `NOT VALID` patterns | + +## Cross-References + +**Related skills**: +- **FastAPI dependency injection** → `fastapi-development` (database dependencies) +- **API testing** → `api-testing` (testing database code) +- **Microservices** → `microservices-architecture` (per-service databases) +- **Security** → `ordis-security-architect` (SQL injection, connection security) + +## Further Reading + +- **SQLAlchemy docs**: https://docs.sqlalchemy.org/ +- **Alembic migrations**: https://alembic.sqlalchemy.org/ +- **PostgreSQL performance**: https://www.postgresql.org/docs/current/performance-tips.html +- **Database Reliability Engineering** by Laine Campbell diff --git a/skills/using-web-backend/django-development.md b/skills/using-web-backend/django-development.md new file mode 100644 index 0000000..ce33b6e --- /dev/null +++ b/skills/using-web-backend/django-development.md @@ -0,0 +1,890 @@ + +# Django Development + +## Overview + +**Django development specialist covering Django ORM optimization, DRF best practices, caching strategies, migrations, testing, and production deployment.** + +**Core principle**: Django's "batteries included" philosophy is powerful but requires understanding which battery to use when - master Django's tools to avoid reinventing wheels or choosing wrong patterns. + +## When to Use This Skill + +Use when encountering: + +- **ORM optimization**: N+1 queries, select_related vs prefetch_related, query performance +- **DRF patterns**: Serializers, ViewSets, permissions, nested relationships +- **Caching**: Cache framework, per-view caching, template fragment caching +- **Migrations**: Zero-downtime migrations, data migrations, squashing +- **Testing**: Django TestCase, fixtures, factories, mocking +- **Deployment**: Gunicorn, static files, database pooling +- **Async Django**: Channels, async views, WebSockets +- **Admin customization**: Custom admin actions, list filters, inlines + +**Do NOT use for**: +- General Python patterns (use `axiom-python-engineering`) +- API design principles (use `rest-api-design`) +- Database-agnostic patterns (use `database-integration`) +- Authentication flows (use `api-authentication`) + +## Django ORM Optimization + +### select_related vs prefetch_related + +**Decision matrix**: + +| Relationship | Method | SQL Strategy | Use When | +|--------------|--------|--------------|----------| +| ForeignKey (many-to-one) | `select_related` | JOIN | Book → Author | +| OneToOneField | `select_related` | JOIN | User → Profile | +| Reverse ForeignKey (one-to-many) | `prefetch_related` | Separate query + IN | Author → Books | +| ManyToManyField | `prefetch_related` | Separate query + IN | Book → Tags | + +**Example - select_related (JOIN)**: + +```python +# BAD: N+1 queries (1 + N) +books = Book.objects.all() +for book in books: + print(book.author.name) # Additional query per book + +# GOOD: Single JOIN query +books = Book.objects.select_related('author').all() +for book in books: + print(book.author.name) # No additional queries + +# SQL generated: +# SELECT book.*, author.* FROM book JOIN author ON 
book.author_id = author.id +``` + +**Example - prefetch_related (IN query)**: + +```python +# BAD: N+1 queries +authors = Author.objects.all() +for author in authors: + print(author.books.count()) # Query per author + +# GOOD: 2 queries total +authors = Author.objects.prefetch_related('books').all() +for author in authors: + print(author.books.count()) # No additional queries + +# SQL generated: +# Query 1: SELECT * FROM author +# Query 2: SELECT * FROM book WHERE author_id IN (1, 2, 3, ...) +``` + +**Nested prefetching**: + +```python +from django.db.models import Prefetch + +# Fetch authors → books → reviews (3 queries) +authors = Author.objects.prefetch_related( + Prefetch('books', queryset=Book.objects.prefetch_related('reviews')) +) + +# Custom filtering on prefetch +recent_books = Book.objects.filter( + published_date__gte=timezone.now() - timedelta(days=30) +).order_by('-published_date') + +authors = Author.objects.prefetch_related( + Prefetch('books', queryset=recent_books, to_attr='recent_books') +) + +# Access via custom attribute +for author in authors: + for book in author.recent_books: # Only recent books + print(book.title) +``` + +### Query Debugging + +```python +from django.db import connection, reset_queries +from django.conf import settings + +# Enable in settings.py: DEBUG = True +# Or use django-debug-toolbar + +def debug_queries(func): + """Decorator to debug query counts""" + def wrapper(*args, **kwargs): + reset_queries() + result = func(*args, **kwargs) + print(f"Queries: {len(connection.queries)}") + for query in connection.queries: + print(f" {query['time']}s: {query['sql'][:100]}") + return result + return wrapper + +@debug_queries +def get_books(): + return list(Book.objects.select_related('author').prefetch_related('tags')) +``` + +**Django Debug Toolbar** (production alternative - django-silk): + +```python +# settings.py +INSTALLED_APPS = [ + 'debug_toolbar', + # ... +] + +MIDDLEWARE = [ + 'debug_toolbar.middleware.DebugToolbarMiddleware', + # ... 
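    # note: keep DebugToolbarMiddleware as early as possible, after any
    # middleware that encodes the response body (e.g. GZipMiddleware)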
+] + +INTERNAL_IPS = ['127.0.0.1'] + +# For production: use django-silk for profiling +INSTALLED_APPS += ['silk'] +MIDDLEWARE += ['silk.middleware.SilkyMiddleware'] +``` + +### Annotation and Aggregation + +**Annotate** (add computed fields): + +```python +from django.db.models import Count, Avg, Sum, F, Q + +# Add book count to each author +authors = Author.objects.annotate( + book_count=Count('books'), + avg_rating=Avg('books__rating'), + total_sales=Sum('books__sales') +) + +for author in authors: + print(f"{author.name}: {author.book_count} books, avg rating {author.avg_rating}") +``` + +**Aggregate** (single value across queryset): + +```python +from django.db.models import Avg + +# Get average rating across all books +avg_rating = Book.objects.aggregate(Avg('rating')) +# Returns: {'rating__avg': 4.2} + +# Multiple aggregations +stats = Book.objects.aggregate( + avg_rating=Avg('rating'), + total_sales=Sum('sales'), + book_count=Count('id') +) +``` + +**Conditional aggregation with Q**: + +```python +from django.db.models import Q, Count + +# Count books by rating category +Author.objects.annotate( + high_rated_books=Count('books', filter=Q(books__rating__gte=4.0)), + low_rated_books=Count('books', filter=Q(books__rating__lt=3.0)) +) +``` + +## Django REST Framework Patterns + +### ViewSet vs APIView + +**Decision matrix**: + +| Use | Pattern | When | +|-----|---------|------| +| Standard CRUD | `ModelViewSet` | Full REST API for model | +| Custom actions only | `ViewSet` | Non-standard endpoints | +| Read-only API | `ReadOnlyModelViewSet` | GET/LIST only | +| Fine control | `APIView` or `@api_view` | Custom business logic | + +**ModelViewSet** (full CRUD): + +```python +from rest_framework import viewsets, filters +from rest_framework.decorators import action +from rest_framework.response import Response + +class BookViewSet(viewsets.ModelViewSet): + """ + Provides: list, create, retrieve, update, partial_update, destroy + """ + queryset = Book.objects.select_related('author').prefetch_related('tags') + serializer_class = BookSerializer + permission_classes = [IsAuthenticatedOrReadOnly] + filter_backends = [filters.SearchFilter, filters.OrderingFilter] + search_fields = ['title', 'author__name'] + ordering_fields = ['published_date', 'rating'] + + def get_queryset(self): + """Optimize queryset based on action""" + queryset = super().get_queryset() + + if self.action == 'list': + # List doesn't need full detail + return queryset.only('id', 'title', 'author__name') + + return queryset + + @action(detail=True, methods=['post']) + def publish(self, request, pk=None): + """Custom action: POST /books/123/publish/""" + book = self.get_object() + book.status = 'published' + book.published_date = timezone.now() + book.save() + return Response({'status': 'published'}) + + @action(detail=False, methods=['get']) + def bestsellers(self, request): + """Custom list action: GET /books/bestsellers/""" + books = self.get_queryset().filter(sales__gte=10000).order_by('-sales')[:10] + serializer = self.get_serializer(books, many=True) + return Response(serializer.data) +``` + +### Serializer Patterns + +**Basic serializer with validation**: + +```python +from rest_framework import serializers +from django.contrib.auth.password_validation import validate_password + +class UserSerializer(serializers.ModelSerializer): + password = serializers.CharField( + write_only=True, + required=True, + validators=[validate_password] + ) + password_confirm = serializers.CharField(write_only=True, required=True) + + class 
Meta: + model = User + fields = ['id', 'username', 'email', 'password', 'password_confirm'] + read_only_fields = ['id'] + + # Field-level validation + def validate_email(self, value): + if User.objects.filter(email__iexact=value).exists(): + raise serializers.ValidationError("Email already in use") + return value.lower() + + # Object-level validation (cross-field) + def validate(self, attrs): + if attrs['password'] != attrs['password_confirm']: + raise serializers.ValidationError({ + 'password_confirm': "Passwords don't match" + }) + attrs.pop('password_confirm') + return attrs + + def create(self, validated_data): + password = validated_data.pop('password') + user = User.objects.create(**validated_data) + user.set_password(password) + user.save() + return user +``` + +**Nested serializers (read-only)**: + +```python +class AuthorSerializer(serializers.ModelSerializer): + book_count = serializers.IntegerField(read_only=True) + + class Meta: + model = Author + fields = ['id', 'name', 'bio', 'book_count'] + +class BookSerializer(serializers.ModelSerializer): + author = AuthorSerializer(read_only=True) + author_id = serializers.PrimaryKeyRelatedField( + queryset=Author.objects.all(), + source='author', + write_only=True + ) + + class Meta: + model = Book + fields = ['id', 'title', 'author', 'author_id', 'published_date'] +``` + +**Dynamic fields** (include/exclude fields via query params): + +```python +class DynamicFieldsModelSerializer(serializers.ModelSerializer): + """ + Usage: /api/books/?fields=id,title,author + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + request = self.context.get('request') + if request: + fields = request.query_params.get('fields') + if fields: + fields = fields.split(',') + allowed = set(fields) + existing = set(self.fields.keys()) + for field_name in existing - allowed: + self.fields.pop(field_name) + +class BookSerializer(DynamicFieldsModelSerializer): + class Meta: + model = Book + fields = '__all__' +``` + +## Django Caching + +### Cache Framework Setup + +```python +# settings.py + +# Redis cache (production) +CACHES = { + 'default': { + 'BACKEND': 'django_redis.cache.RedisCache', + 'LOCATION': 'redis://127.0.0.1:6379/1', + 'OPTIONS': { + 'CLIENT_CLASS': 'django_redis.client.DefaultClient', + 'CONNECTION_POOL_KWARGS': {'max_connections': 50}, + 'PARSER_CLASS': 'redis.connection.HiredisParser', + }, + 'KEY_PREFIX': 'myapp', + 'TIMEOUT': 300, # Default 5 minutes + } +} + +# Memcached (alternative) +CACHES = { + 'default': { + 'BACKEND': 'django.core.cache.backends.memcached.PyMemcacheCache', + 'LOCATION': '127.0.0.1:11211', + } +} + +# Local memory (development only) +CACHES = { + 'default': { + 'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', + 'LOCATION': 'unique-snowflake', + } +} +``` + +### Per-View Caching + +```python +from django.views.decorators.cache import cache_page +from django.utils.decorators import method_decorator + +# Function-based view +@cache_page(60 * 15) # Cache for 15 minutes +def book_list(request): + books = Book.objects.all() + return render(request, 'books/list.html', {'books': books}) + +# Class-based view +class BookListView(ListView): + model = Book + + @method_decorator(cache_page(60 * 15)) + def dispatch(self, *args, **kwargs): + return super().dispatch(*args, **kwargs) + +# DRF ViewSet +from rest_framework_extensions.cache.decorators import cache_response + +class BookViewSet(viewsets.ModelViewSet): + @cache_response(timeout=60*15, key_func='calculate_cache_key') + def list(self, 
request, *args, **kwargs):
        return super().list(request, *args, **kwargs)

    def calculate_cache_key(self, view_instance, view_method, request, args, kwargs):
        # Custom cache key including user, filters
        return f"books:list:{request.user.id}:{request.GET.urlencode()}"
```

### Low-Level Cache API

```python
from django.core.cache import cache

# Set cache
cache.set('my_key', 'my_value', timeout=300)

# Get cache
value = cache.get('my_key')
if value is None:
    value = expensive_computation()
    cache.set('my_key', value, timeout=300)

# Get or set (atomic)
value = cache.get_or_set('my_key', lambda: expensive_computation(), timeout=300)

# Delete cache
cache.delete('my_key')

# Clear all
cache.clear()

# Multiple keys
cache.set_many({'key1': 'value1', 'key2': 'value2'}, timeout=300)
values = cache.get_many(['key1', 'key2'])

# Increment/decrement
cache.set('counter', 0)
cache.incr('counter')  # 1
cache.incr('counter', delta=5)  # 6
```

### Cache Invalidation Patterns

```python
from django.db.models.signals import post_save, post_delete
from django.dispatch import receiver

@receiver([post_save, post_delete], sender=Book)
def invalidate_book_cache(sender, instance, **kwargs):
    """Invalidate cache when book changes"""
    cache.delete(f'book:{instance.id}')
    cache.delete('books:list')  # Invalidate list cache
    cache.delete(f'author:{instance.author_id}:books')

# Pattern: Cache with version tags
def get_books():
    version = cache.get('books:version', 0)
    cache_key = f'books:list:v{version}'
    books = cache.get(cache_key)

    if books is None:
        books = list(Book.objects.all())
        cache.set(cache_key, books, timeout=3600)

    return books

def invalidate_books():
    """Bump version to invalidate all book caches"""
    version = cache.get('books:version', 0)
    cache.set('books:version', version + 1)
```

## Django Migrations

### Zero-Downtime Migration Pattern

**Adding NOT NULL column to large table**:

```python
# Step 1: Add nullable field (migration 0002)
class Migration(migrations.Migration):
    operations = [
        migrations.AddField(
            model_name='user',
            name='department',
            field=models.CharField(max_length=100, null=True, blank=True),
        ),
    ]

# Step 2: Populate data in batches (migration 0003)
from django.db import migrations

def populate_department(apps, schema_editor):
    User = apps.get_model('myapp', 'User')

    # Batch update for performance. Always take the FIRST batch of
    # still-NULL rows: updating them shrinks the filtered queryset,
    # so offset-based paging would skip records.
    batch_size = 10000

    while True:
        users = list(User.objects.filter(department__isnull=True)[:batch_size])
        if not users:
            break
        for user in users:
            user.department = determine_department(user)  # Your logic
        User.objects.bulk_update(users, ['department'], batch_size=batch_size)

class Migration(migrations.Migration):
    dependencies = [('myapp', '0002_add_department')]
    operations = [
        migrations.RunPython(populate_department, migrations.RunPython.noop),
    ]

# Step 3: Make NOT NULL (migration 0004)
class Migration(migrations.Migration):
    dependencies = [('myapp', '0003_populate_department')]
    operations = [
        migrations.AlterField(
            model_name='user',
            name='department',
            field=models.CharField(max_length=100),  # NOT NULL
        ),
    ]
```

### Concurrent Index Creation (PostgreSQL)

```python
from django.contrib.postgres.operations import AddIndexConcurrently
from django.db import migrations, models

class Migration(migrations.Migration):
    atomic = False  # Required for 
CONCURRENTLY operations + + operations = [ + AddIndexConcurrently( + model_name='book', + index=models.Index(fields=['published_date'], name='book_published_idx'), + ), + ] +``` + +### Squashing Migrations + +```bash +# Squash migrations 0001 through 0020 into single migration +python manage.py squashmigrations myapp 0001 0020 + +# This creates migrations/0001_squashed_0020.py +# After deploying squashed migration, delete originals: +# migrations/0001.py through migrations/0020.py +``` + +## Django Testing + +### TestCase vs TransactionTestCase + +| Feature | TestCase | TransactionTestCase | +|---------|----------|---------------------| +| Speed | Fast (no DB reset between tests) | Slow (resets DB each test) | +| Transactions | Wrapped in transaction, rolled back | No automatic transaction | +| Use for | Most tests | Testing transaction behavior, signals | + +**Example - TestCase**: + +```python +from django.test import TestCase +from myapp.models import Book + +class BookModelTest(TestCase): + @classmethod + def setUpTestData(cls): + """Run once for entire test class (fast)""" + cls.author = Author.objects.create(name="Test Author") + + def setUp(self): + """Run before each test method""" + self.book = Book.objects.create( + title="Test Book", + author=self.author + ) + + def test_book_str(self): + self.assertEqual(str(self.book), "Test Book") + + def test_book_author_relationship(self): + self.assertEqual(self.book.author.name, "Test Author") +``` + +### API Testing with DRF + +```python +from rest_framework.test import APITestCase, APIClient +from rest_framework import status +from django.contrib.auth.models import User + +class BookAPITest(APITestCase): + def setUp(self): + self.client = APIClient() + self.user = User.objects.create_user( + username='testuser', + password='testpass123' + ) + self.book = Book.objects.create(title="Test Book") + + def test_list_books_unauthenticated(self): + response = self.client.get('/api/books/') + self.assertEqual(response.status_code, status.HTTP_200_OK) + + def test_create_book_authenticated(self): + self.client.force_authenticate(user=self.user) + data = {'title': 'New Book', 'author': self.author.id} + response = self.client.post('/api/books/', data) + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertEqual(Book.objects.count(), 2) + + def test_update_book_unauthorized(self): + other_user = User.objects.create_user(username='other', password='pass') + self.client.force_authenticate(user=other_user) + data = {'title': 'Updated Title'} + response = self.client.patch(f'/api/books/{self.book.id}/', data) + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) +``` + +### Factory Pattern with factory_boy + +```python +# tests/factories.py +import factory +from myapp.models import Author, Book + +class AuthorFactory(factory.django.DjangoModelFactory): + class Meta: + model = Author + + name = factory.Faker('name') + bio = factory.Faker('text', max_nb_chars=200) + +class BookFactory(factory.django.DjangoModelFactory): + class Meta: + model = Book + + title = factory.Faker('sentence', nb_words=4) + author = factory.SubFactory(AuthorFactory) + published_date = factory.Faker('date_this_decade') + isbn = factory.Sequence(lambda n: f'978-0-{n:09d}') + +# Usage in tests +class BookTest(TestCase): + def test_book_creation(self): + book = BookFactory.create() # Creates Author too + self.assertIsNotNone(book.id) + + def test_multiple_books(self): + books = BookFactory.create_batch(10) # Create 10 books + 
self.assertEqual(len(books), 10) + + def test_author_with_books(self): + author = AuthorFactory.create() + BookFactory.create_batch(5, author=author) + self.assertEqual(author.books.count(), 5) +``` + +## Django Settings Organization + +### Multiple Environment Configs + +``` +myproject/ +└── settings/ + ├── __init__.py + ├── base.py # Common settings + ├── development.py # Dev overrides + ├── production.py # Prod overrides + └── test.py # Test overrides +``` + +**settings/base.py**: + +```python +import os +from pathlib import Path + +BASE_DIR = Path(__file__).resolve().parent.parent.parent + +SECRET_KEY = os.environ.get('DJANGO_SECRET_KEY') + +INSTALLED_APPS = [ + 'django.contrib.admin', + # ... + 'rest_framework', + 'myapp', +] + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.postgresql', + 'NAME': os.environ.get('DB_NAME'), + 'USER': os.environ.get('DB_USER'), + 'PASSWORD': os.environ.get('DB_PASSWORD'), + 'HOST': os.environ.get('DB_HOST', 'localhost'), + 'PORT': os.environ.get('DB_PORT', '5432'), + } +} +``` + +**settings/development.py**: + +```python +from .base import * + +DEBUG = True + +ALLOWED_HOSTS = ['localhost', '127.0.0.1'] + +# Use console email backend +EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend' + +# Local cache +CACHES = { + 'default': { + 'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', + } +} + +# Debug toolbar +INSTALLED_APPS += ['debug_toolbar'] +MIDDLEWARE += ['debug_toolbar.middleware.DebugToolbarMiddleware'] +INTERNAL_IPS = ['127.0.0.1'] +``` + +**settings/production.py**: + +```python +from .base import * + +DEBUG = False + +ALLOWED_HOSTS = [os.environ.get('ALLOWED_HOST')] + +# Security settings +SECURE_SSL_REDIRECT = True +SESSION_COOKIE_SECURE = True +CSRF_COOKIE_SECURE = True +SECURE_HSTS_SECONDS = 31536000 +SECURE_HSTS_INCLUDE_SUBDOMAINS = True +SECURE_HSTS_PRELOAD = True + +# Redis cache +CACHES = { + 'default': { + 'BACKEND': 'django_redis.cache.RedisCache', + 'LOCATION': os.environ.get('REDIS_URL'), + } +} + +# Real email +EMAIL_BACKEND = 'django.core.mail.backends.smtp.EmailBackend' +EMAIL_HOST = os.environ.get('EMAIL_HOST') +EMAIL_PORT = int(os.environ.get('EMAIL_PORT', 587)) +EMAIL_USE_TLS = True +``` + +**Usage**: + +```bash +# Development +export DJANGO_SETTINGS_MODULE=myproject.settings.development +python manage.py runserver + +# Production +export DJANGO_SETTINGS_MODULE=myproject.settings.production +gunicorn myproject.wsgi:application +``` + +## Django Deployment + +### Gunicorn Configuration + +```python +# gunicorn_config.py +import multiprocessing + +bind = "0.0.0.0:8000" +workers = multiprocessing.cpu_count() * 2 + 1 +worker_class = "sync" # or "gevent" for async +worker_connections = 1000 +max_requests = 1000 # Restart workers after N requests (prevent memory leaks) +max_requests_jitter = 100 +timeout = 30 +keepalive = 2 + +# Logging +accesslog = "-" # stdout +errorlog = "-" # stderr +loglevel = "info" + +# Process naming +proc_name = "myproject" + +# Server mechanics +daemon = False +pidfile = "/var/run/gunicorn.pid" +``` + +**Systemd service**: + +```ini +# /etc/systemd/system/myproject.service +[Unit] +Description=MyProject Django Application +After=network.target + +[Service] +Type=notify +User=www-data +Group=www-data +WorkingDirectory=/var/www/myproject +Environment="DJANGO_SETTINGS_MODULE=myproject.settings.production" +ExecStart=/var/www/myproject/venv/bin/gunicorn \ + --config /var/www/myproject/gunicorn_config.py \ + myproject.wsgi:application +ExecReload=/bin/kill -s HUP $MAINPID 
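# SIGHUP makes gunicorn re-read its config and gracefully restart workers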
+Restart=always + +[Install] +WantedBy=multi-user.target +``` + +### Static and Media Files + +```python +# settings/production.py +STATIC_URL = '/static/' +STATIC_ROOT = BASE_DIR / 'staticfiles' + +MEDIA_URL = '/media/' +MEDIA_ROOT = BASE_DIR / 'media' + +# Use WhiteNoise for static files +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'whitenoise.middleware.WhiteNoiseMiddleware', # After SecurityMiddleware + # ... +] + +STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage' +``` + +**Collect static files**: + +```bash +python manage.py collectstatic --noinput +``` + +## Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| **Lazy loading in loops** | N+1 queries | Use `select_related`/`prefetch_related` | +| **No database indexing** | Slow queries | Add `db_index=True` or Meta indexes | +| **Signals for async work** | Blocks requests | Use Celery tasks instead | +| **Generic serializers for everything** | Over-fetching data | Create optimized serializers per use case | +| **No caching** | Repeated expensive queries | Cache querysets, views, template fragments | +| **Migrations in production without testing** | Downtime, data loss | Test on production-sized datasets first | +| **DEBUG=True in production** | Security risk, slow | Always DEBUG=False in production | +| **No connection pooling** | Exhausts DB connections | Use pgBouncer or django-db-geventpool | + +## Cross-References + +**Related skills**: +- **Database optimization** → `database-integration` (connection pooling, migrations) +- **API testing** → `api-testing` (DRF testing patterns) +- **Authentication** → `api-authentication` (DRF token auth, JWT) +- **REST API design** → `rest-api-design` (API patterns) + +## Further Reading + +- **Django docs**: https://docs.djangoproject.com/ +- **DRF docs**: https://www.django-rest-framework.org/ +- **Two Scoops of Django**: Best practices book +- **Classy Class-Based Views**: https://ccbv.co.uk/ +- **Classy Django REST Framework**: https://www.cdrf.co/ diff --git a/skills/using-web-backend/express-development.md b/skills/using-web-backend/express-development.md new file mode 100644 index 0000000..ee1e673 --- /dev/null +++ b/skills/using-web-backend/express-development.md @@ -0,0 +1,872 @@ + +# Express Development + +## Overview + +**Express.js development specialist covering middleware organization, error handling, validation, database integration, testing, and production deployment.** + +**Core principle**: Express's minimalist philosophy requires disciplined patterns - without structure, Express apps become tangled middleware chains with inconsistent error handling and poor testability. 
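
For orientation, a minimal sketch of the layered layout the sections below assume - the file names (`routes/userRoutes`, `middleware/errorHandler`) are illustrative, not prescribed:

```typescript
// src/app.ts - composition root: parsers, routes, then error handling
import express from 'express';
import userRoutes from './routes/userRoutes';              // route definitions only
import { errorHandler } from './middleware/errorHandler';  // centralized error handling

const app = express();

app.use(express.json());            // body parsing before any route
app.use('/api/users', userRoutes);  // routes delegate business logic to services
app.use(errorHandler);              // error handler registered last

export default app;
```

Keeping composition in one place makes the middleware order visible and each layer testable in isolation.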
+ +## When to Use This Skill + +Use when encountering: + +- **Middleware organization**: Ordering, async error handling, custom middleware +- **Error handling**: Centralized handlers, custom error classes, async/await errors +- **Request validation**: Zod, express-validator, type-safe validation +- **Database patterns**: Connection pooling, transactions, graceful shutdown +- **Testing**: Supertest, mocking, middleware isolation +- **Production deployment**: PM2, clustering, Docker, environment management +- **Performance**: Compression, caching, clustering +- **Security**: Helmet, rate limiting, CORS, input sanitization + +**DO NOT use for**: +- General TypeScript patterns (use `axiom-python-engineering` equivalents) +- API design principles (use `rest-api-design`) +- Database-agnostic patterns (use `database-integration`) + +## Middleware Organization + +### Correct Middleware Order + +**Order matters** - middleware executes top to bottom: + +```typescript +import express from 'express'; +import helmet from 'helmet'; +import cors from 'cors'; +import compression from 'compression'; + +const app = express(); + +// 1. Security (FIRST - before any parsing) +app.use(helmet({ + contentSecurityPolicy: { + directives: { + defaultSrc: ["'self'"], + styleSrc: ["'self'", "'unsafe-inline'"], + }, + }, +})); + +// 2. CORS (before routes) +app.use(cors({ + origin: process.env.ALLOWED_ORIGINS?.split(','), + credentials: true, + maxAge: 86400, // 24 hours +})); + +// 3. Parsing +app.use(express.json({ limit: '10mb' })); +app.use(express.urlencoded({ extended: true, limit: '10mb' })); + +// 4. Compression +app.use(compression()); + +// 5. Logging +app.use(morgan('combined', { stream: logger.stream })); + +// 6. Authentication (before routes that need it) +app.use('/api', authenticationMiddleware); + +// 7. Routes +app.use('/api/users', userRoutes); +app.use('/api/posts', postRoutes); + +// 8. 404 handler (AFTER all routes) +app.use((req, res) => { + res.status(404).json({ + status: 'error', + message: 'Route not found', + path: req.path, + }); +}); + +// 9. 
Error handler (LAST) +app.use(errorHandler); +``` + +### Async Error Wrapper + +**Problem**: Express doesn't catch async errors automatically + +```typescript +// src/middleware/asyncHandler.ts +import { Request, Response, NextFunction } from 'express'; + +export const asyncHandler = ( + fn: (req: Request, res: Response, next: NextFunction) => Promise +) => { + return (req: Request, res: Response, next: NextFunction) => { + Promise.resolve(fn(req, res, next)).catch(next); + }; +}; + +// Usage +router.get('/:id', asyncHandler(async (req, res) => { + const user = await userService.findById(req.params.id); + if (!user) throw new NotFoundError('User not found'); + res.json(user); +})); +``` + +**Alternative**: Use express-async-errors (automatic) + +```typescript +// At top of app.ts (BEFORE routes) +import 'express-async-errors'; + +// Now all async route handlers auto-catch errors +router.get('/:id', async (req, res) => { + const user = await userService.findById(req.params.id); + res.json(user); +}); // Errors automatically forwarded to error handler +``` + +## Error Handling + +### Custom Error Classes + +```typescript +// src/errors/AppError.ts +export class AppError extends Error { + constructor( + public readonly message: string, + public readonly statusCode: number, + public readonly isOperational: boolean = true + ) { + super(message); + Error.captureStackTrace(this, this.constructor); + } +} + +// src/errors/HttpErrors.ts +export class BadRequestError extends AppError { + constructor(message: string) { + super(message, 400); + } +} + +export class UnauthorizedError extends AppError { + constructor(message = 'Unauthorized') { + super(message, 401); + } +} + +export class ForbiddenError extends AppError { + constructor(message = 'Forbidden') { + super(message, 403); + } +} + +export class NotFoundError extends AppError { + constructor(message: string) { + super(message, 404); + } +} + +export class ConflictError extends AppError { + constructor(message: string) { + super(message, 409); + } +} + +export class TooManyRequestsError extends AppError { + constructor(message = 'Too many requests', public retryAfter?: number) { + super(message, 429); + } +} +``` + +### Centralized Error Handler + +```typescript +// src/middleware/errorHandler.ts +import { Request, Response, NextFunction } from 'express'; +import { AppError } from '../errors/AppError'; +import { logger } from '../config/logger'; + +export const errorHandler = ( + err: Error, + req: Request, + res: Response, + next: NextFunction +) => { + // Log error with context + logger.error('Error occurred', { + error: { + message: err.message, + stack: err.stack, + name: err.name, + }, + request: { + method: req.method, + url: req.url, + ip: req.ip, + userAgent: req.get('user-agent'), + }, + }); + + // Operational errors (expected) + if (err instanceof AppError && err.isOperational) { + const response: any = { + status: 'error', + message: err.message, + }; + + // Add retry-after for rate limiting + if (err instanceof TooManyRequestsError && err.retryAfter) { + res.setHeader('Retry-After', err.retryAfter); + response.retryAfter = err.retryAfter; + } + + return res.status(err.statusCode).json(response); + } + + // Validation errors (Zod, express-validator) + if (err.name === 'ZodError') { + return res.status(400).json({ + status: 'error', + message: 'Validation failed', + errors: (err as any).errors, + }); + } + + // Database constraint violations + if ((err as any).code === '23505') { // PostgreSQL unique violation + return 
res.status(409).json({
      status: 'error',
      message: 'Resource already exists',
    });
  }

  if ((err as any).code === '23503') { // Foreign key violation
    return res.status(400).json({
      status: 'error',
      message: 'Invalid reference',
    });
  }

  // Unexpected errors (don't leak details in production)
  res.status(500).json({
    status: 'error',
    message: process.env.NODE_ENV === 'production'
      ? 'Internal server error'
      : err.message,
    ...(process.env.NODE_ENV !== 'production' && { stack: err.stack }),
  });
};
```

### Global Error Handlers

```typescript
// src/server.ts
process.on('unhandledRejection', (reason: Error) => {
  logger.error('Unhandled Rejection', { reason });
  // Graceful shutdown
  server.close(() => process.exit(1));
});

process.on('uncaughtException', (error: Error) => {
  logger.error('Uncaught Exception', { error });
  process.exit(1);
});
```

## Request Validation

### Zod Integration (Type-Safe)

```typescript
// src/schemas/userSchema.ts
import { z } from 'zod';

export const createUserSchema = z.object({
  body: z.object({
    email: z.string().email('Invalid email'),
    password: z.string()
      .min(8, 'Password must be at least 8 characters')
      .regex(/[A-Z]/, 'Password must contain uppercase')
      .regex(/[0-9]/, 'Password must contain number'),
    name: z.string().min(2).max(100),
    age: z.number().int().positive().max(150).optional(),
  }),
});

export const getUserSchema = z.object({
  params: z.object({
    id: z.string().regex(/^\d+$/, 'ID must be numeric'),
  }),
});

export const getUsersSchema = z.object({
  query: z.object({
    page: z.string().regex(/^\d+$/).transform(Number).default('1'),
    limit: z.string().regex(/^\d+$/).transform(Number).default('10'),
    search: z.string().optional(),
    sortBy: z.enum(['name', 'created_at', 'updated_at']).optional(),
    order: z.enum(['asc', 'desc']).optional(),
  }),
});

// Type inference
export type CreateUserInput = z.infer<typeof createUserSchema>['body'];
export type GetUserParams = z.infer<typeof getUserSchema>['params'];
export type GetUsersQuery = z.infer<typeof getUsersSchema>['query'];
```

**Validation middleware**:

```typescript
// src/middleware/validate.ts
import { Request, Response, NextFunction } from 'express';
import { AnyZodObject, ZodError } from 'zod';

export const validate = (schema: AnyZodObject) => {
  return async (req: Request, res: Response, next: NextFunction) => {
    try {
      const validated = await schema.parseAsync({
        body: req.body,
        query: req.query,
        params: req.params,
      });

      // Replace with validated data (transforms applied)
      req.body = validated.body || req.body;
      req.query = validated.query || req.query;
      req.params = validated.params || req.params;

      next();
    } catch (error) {
      if (error instanceof ZodError) {
        return res.status(400).json({
          status: 'error',
          message: 'Validation failed',
          errors: error.errors.map(err => ({
            field: err.path.join('.'),
            message: err.message,
            code: err.code,
          })),
        });
      }
      next(error);
    }
  };
};
```

**Usage in routes**:

```typescript
import { Router } from 'express';
import { validate } from '../middleware/validate';
import * as schemas from '../schemas/userSchema';

const router = Router();

router.post('/', validate(schemas.createUserSchema), async (req, res) => {
  // req.body is now typed as CreateUserInput
  const user = await userService.create(req.body);
  res.status(201).json(user);
});

router.get('/:id', validate(schemas.getUserSchema), async (req, res) => {
  // req.params.id is validated
  const user = await 
userService.findById(req.params.id); + if (!user) throw new NotFoundError('User not found'); + res.json(user); +}); +``` + +## Database Connection Pooling + +### PostgreSQL with pg + +```typescript +// src/config/database.ts +import { Pool, PoolConfig } from 'pg'; +import { logger } from './logger'; + +const config: PoolConfig = { + host: process.env.DB_HOST || 'localhost', + port: Number(process.env.DB_PORT) || 5432, + database: process.env.DB_NAME, + user: process.env.DB_USER, + password: process.env.DB_PASSWORD, + max: Number(process.env.DB_POOL_MAX) || 20, + idleTimeoutMillis: 30000, + connectionTimeoutMillis: 2000, + statement_timeout: 30000, // 30s query timeout +}; + +export const pool = new Pool(config); + +// Event handlers +pool.on('connect', (client) => { + logger.debug('Database client connected'); +}); + +pool.on('acquire', (client) => { + logger.debug('Client acquired from pool'); +}); + +pool.on('error', (err, client) => { + logger.error('Unexpected pool error', { error: err }); + process.exit(-1); +}); + +// Health check +export const testConnection = async () => { + try { + const client = await pool.connect(); + const result = await client.query('SELECT NOW()'); + client.release(); + logger.info('Database connection successful', { + serverTime: result.rows[0].now, + }); + } catch (err) { + logger.error('Database connection failed', { error: err }); + throw err; + } +}; + +// Graceful shutdown +export const closePool = async () => { + logger.info('Closing database pool'); + await pool.end(); + logger.info('Database pool closed'); +}; +``` + +### Transaction Helper + +```typescript +// src/utils/transaction.ts +import { Pool, PoolClient } from 'pg'; + +export async function withTransaction( + pool: Pool, + callback: (client: PoolClient) => Promise +): Promise { + const client = await pool.connect(); + + try { + await client.query('BEGIN'); + const result = await callback(client); + await client.query('COMMIT'); + return result; + } catch (error) { + await client.query('ROLLBACK'); + throw error; + } finally { + client.release(); + } +} + +// Usage +import { pool } from '../config/database'; + +async function createUserWithProfile(userData, profileData) { + return withTransaction(pool, async (client) => { + const userResult = await client.query( + 'INSERT INTO users (email, name) VALUES ($1, $2) RETURNING id', + [userData.email, userData.name] + ); + const userId = userResult.rows[0].id; + + await client.query( + 'INSERT INTO profiles (user_id, bio) VALUES ($1, $2)', + [userId, profileData.bio] + ); + + return userId; + }); +} +``` + +## Testing + +### Integration Tests with Supertest + +```typescript +// tests/integration/userRoutes.test.ts +import request from 'supertest'; +import app from '../../src/app'; +import { pool } from '../../src/config/database'; + +describe('User Routes', () => { + beforeAll(async () => { + await pool.query('CREATE TABLE IF NOT EXISTS users (...)'); + }); + + afterEach(async () => { + await pool.query('TRUNCATE TABLE users CASCADE'); + }); + + afterAll(async () => { + await pool.end(); + }); + + describe('POST /api/users', () => { + it('should create user with valid data', async () => { + const response = await request(app) + .post('/api/users') + .send({ + email: 'test@example.com', + name: 'Test User', + password: 'Password123', + }) + .expect(201); + + expect(response.body).toHaveProperty('id'); + expect(response.body.email).toBe('test@example.com'); + expect(response.body).not.toHaveProperty('password'); + }); + + it('should return 400 for 
invalid email', async () => { + const response = await request(app) + .post('/api/users') + .send({ + email: 'invalid', + name: 'Test', + password: 'Password123', + }) + .expect(400); + + expect(response.body.status).toBe('error'); + expect(response.body.errors).toContainEqual( + expect.objectContaining({ + field: 'body.email', + message: expect.stringContaining('email'), + }) + ); + }); + }); + + describe('GET /api/users/:id', () => { + it('should return user by ID', async () => { + const createRes = await request(app) + .post('/api/users') + .send({ + email: 'test@example.com', + name: 'Test User', + password: 'Password123', + }); + + const response = await request(app) + .get(`/api/users/${createRes.body.id}`) + .expect(200); + + expect(response.body.id).toBe(createRes.body.id); + }); + + it('should return 404 for non-existent user', async () => { + await request(app) + .get('/api/users/99999') + .expect(404); + }); + }); +}); +``` + +### Unit Tests with Mocks + +```typescript +// tests/unit/userService.test.ts +import { userService } from '../../src/services/userService'; +import { pool } from '../../src/config/database'; + +jest.mock('../../src/config/database'); + +const mockPool = pool as jest.Mocked; + +describe('UserService', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + describe('findById', () => { + it('should return user when found', async () => { + mockPool.query.mockResolvedValue({ + rows: [{ id: 1, email: 'test@example.com', name: 'Test' }], + command: 'SELECT', + rowCount: 1, + oid: 0, + fields: [], + }); + + const result = await userService.findById('1'); + + expect(result).toEqual( + expect.objectContaining({ id: 1, email: 'test@example.com' }) + ); + }); + + it('should return null when not found', async () => { + mockPool.query.mockResolvedValue({ + rows: [], + command: 'SELECT', + rowCount: 0, + oid: 0, + fields: [], + }); + + const result = await userService.findById('999'); + expect(result).toBeNull(); + }); + }); +}); +``` + +## Production Deployment + +### PM2 Configuration + +```javascript +// ecosystem.config.js +module.exports = { + apps: [{ + name: 'api', + script: './dist/server.js', + instances: 'max', // Use all CPU cores + exec_mode: 'cluster', + env: { + NODE_ENV: 'production', + PORT: 3000, + }, + error_file: './logs/err.log', + out_file: './logs/out.log', + log_date_format: 'YYYY-MM-DD HH:mm:ss Z', + merge_logs: true, + max_memory_restart: '500M', + wait_ready: true, + listen_timeout: 10000, + kill_timeout: 5000, + }], +}; +``` + +**Graceful shutdown with PM2**: + +```typescript +// src/server.ts +const server = app.listen(PORT, () => { + logger.info(`Server started on port ${PORT}`); + + // Signal PM2 ready + if (process.send) { + process.send('ready'); + } +}); + +// Graceful shutdown +process.on('SIGINT', async () => { + logger.info('SIGINT received, closing server'); + + server.close(async () => { + await closePool(); + logger.info('Server closed'); + process.exit(0); + }); + + // Force shutdown after 10s + setTimeout(() => { + logger.error('Forcing shutdown'); + process.exit(1); + }, 10000); +}); +``` + +### Dockerfile + +```dockerfile +# Multi-stage build +FROM node:18-alpine AS builder + +WORKDIR /app + +# Copy package files +COPY package*.json ./ +COPY tsconfig.json ./ + +# Install dependencies +RUN npm ci + +# Copy source +COPY src ./src + +# Build TypeScript +RUN npm run build + +# Production image +FROM node:18-alpine + +WORKDIR /app + +# Install production dependencies only +COPY package*.json ./ +RUN npm ci --omit=dev && npm 
cache clean --force + +# Copy built files +COPY --from=builder /app/dist ./dist + +# Create non-root user +RUN addgroup -g 1001 -S nodejs && \ + adduser -S nodejs -u 1001 + +USER nodejs + +EXPOSE 3000 + +CMD ["node", "dist/server.js"] +``` + +### Health Check Endpoint + +```typescript +// src/routes/healthRoutes.ts +import { Router } from 'express'; +import { pool } from '../config/database'; + +const router = Router(); + +router.get('/health', async (req, res) => { + const health = { + uptime: process.uptime(), + message: 'OK', + timestamp: Date.now(), + }; + + try { + await pool.query('SELECT 1'); + health.database = 'connected'; + } catch (error) { + health.database = 'disconnected'; + return res.status(503).json(health); + } + + res.json(health); +}); + +router.get('/health/ready', async (req, res) => { + // Readiness check + try { + await pool.query('SELECT 1'); + res.status(200).json({ status: 'ready' }); + } catch (error) { + res.status(503).json({ status: 'not ready' }); + } +}); + +router.get('/health/live', (req, res) => { + // Liveness check (simpler) + res.status(200).json({ status: 'alive' }); +}); + +export default router; +``` + +## Performance Optimization + +### Response Caching + +```typescript +import Redis from 'ioredis'; + +const redis = new Redis({ + host: process.env.REDIS_HOST, + port: Number(process.env.REDIS_PORT), +}); + +export const cacheMiddleware = (duration: number) => { + return async (req: Request, res: Response, next: NextFunction) => { + if (req.method !== 'GET') return next(); + + const key = `cache:${req.originalUrl}`; + + try { + const cached = await redis.get(key); + if (cached) { + return res.json(JSON.parse(cached)); + } + + // Capture response + const originalJson = res.json.bind(res); + res.json = (body: any) => { + redis.setex(key, duration, JSON.stringify(body)); + return originalJson(body); + }; + + next(); + } catch (error) { + next(); + } + }; +}; + +// Usage +router.get('/users', cacheMiddleware(300), async (req, res) => { + const users = await userService.findAll(); + res.json(users); +}); +``` + +## Security + +### Rate Limiting + +```typescript +import rateLimit from 'express-rate-limit'; +import RedisStore from 'rate-limit-redis'; +import Redis from 'ioredis'; + +const redis = new Redis(); + +export const apiLimiter = rateLimit({ + store: new RedisStore({ client: redis }), + windowMs: 15 * 60 * 1000, // 15 minutes + max: 100, // 100 requests per window + message: 'Too many requests, please try again later', + standardHeaders: true, + legacyHeaders: false, +}); + +export const authLimiter = rateLimit({ + store: new RedisStore({ client: redis }), + windowMs: 15 * 60 * 1000, + max: 5, // 5 attempts + skipSuccessfulRequests: true, +}); + +// Usage +app.use('/api/', apiLimiter); +app.use('/api/auth/login', authLimiter); +``` + +## Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| **No async error handling** | Crashes server | Use asyncHandler or express-async-errors | +| **Inconsistent error responses** | Poor DX | Centralized error handler | +| **New DB connection per request** | Exhausts connections | Use connection pool | +| **No graceful shutdown** | Data loss, broken requests | Handle SIGTERM/SIGINT | +| **Logging to console in production** | Lost logs, no structure | Use Winston/Pino with transports | +| **No request validation** | Security vulnerabilities | Zod/express-validator | +| **Synchronous operations in routes** | Blocks event loop | Use async/await | +| **No health checks** | Can't monitor 

## Cross-References

**Related skills**:
- **Database patterns** → `database-integration` (pooling, transactions)
- **API testing** → `api-testing` (supertest patterns)
- **REST design** → `rest-api-design` (endpoint patterns)
- **Authentication** → `api-authentication` (JWT, sessions)

## Further Reading

- **Express docs**: https://expressjs.com/
- **Express.js Best Practices**: https://expressjs.com/en/advanced/best-practice-performance.html
- **Node.js Production Best Practices**: https://github.com/goldbergyoni/nodebestpractices
diff --git a/skills/using-web-backend/fastapi-development.md b/skills/using-web-backend/fastapi-development.md
new file mode 100644
index 0000000..ccade1a
--- /dev/null
+++ b/skills/using-web-backend/fastapi-development.md
@@ -0,0 +1,500 @@

# FastAPI Development

## Overview

**FastAPI specialist skill providing production-ready patterns, anti-patterns to avoid, and testing strategies.**

**Core principle**: FastAPI's type hints, dependency injection, and async-first design enable fast, maintainable APIs - but require understanding async/sync boundaries, proper dependency management, and production hardening patterns.

## When to Use This Skill

Use when encountering:

- **Dependency injection**: Database connections, auth, shared resources, testing overrides
- **Async/sync boundaries**: Mixing blocking I/O with async endpoints, performance issues
- **Background tasks**: Choosing between BackgroundTasks, Celery, or other task queues
- **File uploads**: Streaming large files, memory management
- **Testing**: Dependency overrides, async test clients, fixture patterns
- **Production deployment**: ASGI servers, lifespan management, connection pooling
- **Security**: SQL injection, CORS, authentication patterns
- **Performance**: Connection pooling, query optimization, caching

## Quick Reference - Common Patterns

| Pattern | Use Case | Code Snippet |
|---------|----------|--------------|
| **DB dependency with pooling** | Per-request database access | `def get_db(): db = SessionLocal(); try: yield db; finally: db.close()` |
| **Dependency override for testing** | Test with mock/test DB | `app.dependency_overrides[get_db] = override_get_db` |
| **Lifespan events** | Startup/shutdown resources | `@asynccontextmanager async def lifespan(app): ... yield ...` |
| **Streaming file upload** | Large files without memory issues | `async with aiofiles.open(...) as f: while chunk := await file.read(CHUNK_SIZE): await f.write(chunk)` |
| **Background tasks (short)** | < 30 sec tasks | `background_tasks.add_task(func, args)` |
| **Task queue (long)** | > 1 min tasks, retries needed | Use Celery/Arq with Redis |
| **Parameterized queries** | Prevent SQL injection | `cursor.execute("SELECT * FROM users WHERE id = %s", (user_id,))` |

## Core Patterns

### 1. 
Dependency Injection Architecture + +**Pattern: Connection pooling with yield dependencies** + +```python +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker, Session +from fastapi import Depends, FastAPI + +# One-time pool creation at module level +engine = create_engine( + "postgresql://user:pass@localhost/db", + pool_size=20, # Max connections + max_overflow=0, # No overflow beyond pool_size + pool_pre_ping=True, # Verify connection health before use + pool_recycle=3600 # Recycle connections every hour +) +SessionLocal = sessionmaker(bind=engine, expire_on_commit=False) + +# Dependency pattern with automatic cleanup +def get_db() -> Session: + """ + Yields database session from pool. + Ensures cleanup even if endpoint raises exception. + """ + db = SessionLocal() + try: + yield db + finally: + db.close() + +# Usage in endpoints +@app.get("/items/{item_id}") +def get_item(item_id: int, db: Session = Depends(get_db)): + return db.query(Item).filter(Item.id == item_id).first() +``` + +**Why this pattern**: +- Pool created once (expensive operation) +- Per-request connections from pool (cheap) +- `yield` ensures cleanup on success AND exceptions +- `pool_pre_ping` prevents stale connection errors +- `pool_recycle` prevents long-lived connection issues + +**Testing pattern**: + +```python +# conftest.py +import pytest +from fastapi.testclient import TestClient + +@pytest.fixture +def test_db(): + """Test database fixture""" + db = TestSessionLocal() + try: + yield db + finally: + db.rollback() + db.close() + +@pytest.fixture +def client(test_db): + """Test client with overridden dependencies""" + def override_get_db(): + yield test_db + + app.dependency_overrides[get_db] = override_get_db + with TestClient(app) as c: + yield c + app.dependency_overrides.clear() + +# test_items.py +def test_get_item(client, test_db): + # Setup test data + test_db.add(Item(id=1, name="Test")) + test_db.commit() + + # Test endpoint + response = client.get("/items/1") + assert response.status_code == 200 +``` + +### 2. Async/Sync Boundary Management + +**❌ Anti-pattern: Blocking calls in async endpoints** + +```python +# BAD - Blocks event loop +@app.get("/users/{user_id}") +async def get_user(user_id: int): + conn = psycopg2.connect(...) # Blocking! + cursor = conn.cursor() + cursor.execute(...) # Blocking! + return cursor.fetchone() +``` + +**✅ Pattern: Use async libraries or run_in_threadpool** + +```python +# GOOD Option 1: Async database library +from databases import Database + +database = Database("postgresql://...") + +@app.get("/users/{user_id}") +async def get_user(user_id: int): + query = "SELECT * FROM users WHERE id = :user_id" + return await database.fetch_one(query=query, values={"user_id": user_id}) + +# GOOD Option 2: Run blocking code in thread pool +from fastapi.concurrency import run_in_threadpool + +def blocking_db_call(user_id: int): + conn = psycopg2.connect(...) 
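    # Opens a fresh connection per call to keep the example short; production
    # code would draw from a connection pool instead of reconnecting each time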
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM users WHERE id = %s", (user_id,))
    return cursor.fetchone()

@app.get("/users/{user_id}")
async def get_user(user_id: int):
    return await run_in_threadpool(blocking_db_call, user_id)
```

**Decision table**:

| Scenario | Use |
|----------|-----|
| PostgreSQL with async needed | `asyncpg` or `databases` library |
| PostgreSQL, sync is fine | `psycopg2` with `def` (not `async def`) endpoints |
| MySQL with async | `aiomysql` |
| SQLite | `aiosqlite` (async) or sync with `def` endpoints |
| External API calls | `httpx.AsyncClient` |
| CPU-intensive work | `run_in_threadpool` or Celery |

### 3. Lifespan Management (Modern Pattern)

**✅ Use lifespan context manager** (replaces deprecated `@app.on_event`)

```python
from contextlib import asynccontextmanager

import aioredis
import asyncpg
from fastapi import FastAPI

# Global resources
resources = {}

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup
    resources["db_pool"] = await asyncpg.create_pool(  # asyncpg shown; any async driver's pool works
        "postgresql://...",
        min_size=10,
        max_size=20
    )
    resources["redis"] = await aioredis.create_redis_pool("redis://...")
    resources["ml_model"] = load_ml_model()  # Can be sync or async

    yield  # Application runs

    # Shutdown
    await resources["db_pool"].close()
    resources["redis"].close()
    await resources["redis"].wait_closed()
    resources.clear()

app = FastAPI(lifespan=lifespan)

# Access resources in endpoints
@app.get("/predict")
async def predict(data: dict):
    model = resources["ml_model"]
    return {"prediction": model.predict(data)}
```

### 4. File Upload Patterns

**For 100MB+ files: Stream to disk, never load into memory**

```python
import os
import uuid

import aiofiles
from fastapi import UploadFile, File, HTTPException

UPLOAD_DIR = "/var/uploads"
CHUNK_SIZE = 1024 * 1024  # 1MB chunks
MAX_FILE_SIZE = 500 * 1024 * 1024  # 500MB

@app.post("/upload")
async def upload_large_file(file: UploadFile = File(...)):
    # Validate content type
    if not file.content_type.startswith("video/"):
        raise HTTPException(400, "Only video files accepted")

    filepath = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}_{file.filename}")
    size = 0

    try:
        async with aiofiles.open(filepath, 'wb') as f:
            while chunk := await file.read(CHUNK_SIZE):
                size += len(chunk)
                if size > MAX_FILE_SIZE:
                    raise HTTPException(413, "File too large")
                await f.write(chunk)
    except Exception as e:
        # Cleanup on failure
        if os.path.exists(filepath):
            os.remove(filepath)
        raise

    return {"filename": file.filename, "size": size}
```

**For very large files (1GB+): Direct S3 upload with presigned URLs**

```python
import uuid

import boto3

@app.post("/upload/presigned-url")
async def get_presigned_upload_url(filename: str):
    s3_client = boto3.client('s3')
    presigned_post = s3_client.generate_presigned_post(
        Bucket='my-bucket',
        Key=f'uploads/{uuid.uuid4()}_{filename}',
        ExpiresIn=3600
    )
    return presigned_post  # Client uploads directly to S3
```

### 5. Background Task Decision Matrix

| Task Duration | Needs Retries? | Needs Monitoring? | Solution |
|---------------|----------------|-------------------|----------|
| < 30 seconds | No | No | `BackgroundTasks` |
| < 30 seconds | Yes | Maybe | Celery/Arq |
| > 1 minute | Don't care | Don't care | Celery/Arq |
| Any | Yes | Yes | Celery/Arq with monitoring |

**BackgroundTasks pattern** (simple, in-process):

```python
import asyncio

from fastapi import BackgroundTasks

async def send_email(email: str):
    await asyncio.sleep(2)  # Async work
    print(f"Email sent to {email}")

@app.post("/register")
async def register(email: str, background_tasks: BackgroundTasks):
    # ... save user ...
    background_tasks.add_task(send_email, email)
    return {"status": "registered"}  # Returns immediately
```

**Celery pattern** (distributed, persistent):

```python
# celery_app.py
from celery import Celery

celery_app = Celery('tasks', broker='redis://localhost:6379/0')

@celery_app.task(bind=True, max_retries=3)
def process_video(self, filepath: str):
    try:
        # Long-running work
        extract_frames(filepath)
    except Exception as exc:
        raise self.retry(exc=exc, countdown=60)

# main.py
from celery_app import process_video

@app.post("/upload")
async def upload(file: UploadFile):
    filepath = await save_file(file)
    task = process_video.delay(filepath)
    return {"task_id": task.id}

@app.get("/status/{task_id}")
async def get_status(task_id: str):
    from celery_app import celery_app
    result = celery_app.AsyncResult(task_id)
    return {"status": result.state, "result": result.result}
```

## Security Patterns

### SQL Injection Prevention

**❌ NEVER use f-strings or string concatenation**

```python
# DANGEROUS
cursor.execute(f"SELECT * FROM users WHERE id = {user_id}")
cursor.execute("SELECT * FROM users WHERE email = '" + email + "'")
```

**✅ ALWAYS use parameterized queries**

```python
# SQLAlchemy ORM (safe)
db.query(User).filter(User.id == user_id).first()

# Raw SQL (safe with parameters)
cursor.execute("SELECT * FROM users WHERE id = %s", (user_id,))
cursor.execute("SELECT * FROM users WHERE email = :email", {"email": email})
```

### CORS Configuration

```python
from fastapi.middleware.cors import CORSMiddleware

app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://yourdomain.com"],  # Specific origins, not "*" in production
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE"],
    allow_headers=["*"],
)
```

### Authentication Pattern

```python
from fastapi import Depends, HTTPException, status
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from jose import jwt  # python-jose; exposes jwt.JWTError

security = HTTPBearer()

async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)):
    token = credentials.credentials
    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=["HS256"])
        user_id = payload.get("sub")
        if not user_id:
            raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid token")
        return await get_user_by_id(user_id)
    except jwt.JWTError:
        raise HTTPException(status.HTTP_401_UNAUTHORIZED, "Invalid token")

@app.get("/protected")
async def protected_route(current_user = Depends(get_current_user)):
    return {"user": current_user}
```

## Middleware Ordering

**Critical: Middleware wraps in order added, executes in reverse for responses**

```python
# Correct order:
app.add_middleware(CORSMiddleware, ...)        # 1. FIRST - handles preflight
app.add_middleware(RequestLoggingMiddleware)   # 2. Logs entire request
app.add_middleware(ErrorHandlingMiddleware)    # 3. 
Catches errors from auth/routes +app.add_middleware(AuthenticationMiddleware) # 4. LAST - closest to routes +``` + +## Common Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| Global database connection | Not thread-safe, connection leaks | Use connection pool with dependency injection | +| `async def` with blocking I/O | Blocks event loop, kills performance | Use async libraries or `run_in_threadpool` | +| `time.sleep()` in async code | Blocks entire event loop | Use `asyncio.sleep()` | +| Loading large files into memory | Memory exhaustion, OOM crashes | Stream with `aiofiles` and chunks | +| BackgroundTasks for long work | Lost on restart, no retries | Use Celery/Arq | +| String formatting in SQL | SQL injection vulnerability | Parameterized queries only | +| `allow_origins=["*"]` with credentials | Security vulnerability | Specify exact origins | +| Not closing database connections | Connection pool exhaustion | Use `yield` in dependencies | + +## Testing Best Practices + +```python +import pytest +from fastapi.testclient import TestClient +from httpx import AsyncClient + +# Sync tests (simpler, faster for most cases) +def test_read_item(client): + response = client.get("/items/1") + assert response.status_code == 200 + +# Async tests (needed for testing async endpoints with real async operations) +@pytest.mark.asyncio +async def test_async_endpoint(): + async with AsyncClient(app=app, base_url="http://test") as ac: + response = await ac.get("/items/1") + assert response.status_code == 200 + +# Dependency override pattern +def test_with_mock_db(client): + def override_get_db(): + yield mock_db + + app.dependency_overrides[get_db] = override_get_db + response = client.get("/items/1") + app.dependency_overrides.clear() + assert response.status_code == 200 +``` + +## Production Deployment + +**ASGI server configuration** (Uvicorn + Gunicorn): + +```bash +# gunicorn with uvicorn workers (production) +gunicorn main:app \ + --workers 4 \ + --worker-class uvicorn.workers.UvicornWorker \ + --bind 0.0.0.0:8000 \ + --timeout 120 \ + --graceful-timeout 30 \ + --keep-alive 5 +``` + +**Environment-based configuration**: + +```python +from pydantic_settings import BaseSettings + +class Settings(BaseSettings): + database_url: str + redis_url: str + secret_key: str + debug: bool = False + + class Config: + env_file = ".env" + +settings = Settings() + +# Use in app +engine = create_engine(settings.database_url) +``` + +## Cross-References + +**Related skills**: +- **Security** → `ordis-security-architect` (threat modeling, OWASP top 10) +- **Python patterns** → `axiom-python-engineering` (async patterns, type hints) +- **API testing** → `api-testing` (contract testing, integration tests) +- **API documentation** → `api-documentation` or `muna-technical-writer` +- **Database optimization** → `database-integration` (query optimization, migrations) +- **Authentication deep dive** → `api-authentication` (OAuth2, JWT patterns) +- **GraphQL alternative** → `graphql-api-design` + +## Performance Tips + +1. **Use connection pooling** - Create pool once, not per-request +2. **Enable response caching** - Use `fastapi-cache2` for expensive queries +3. **Limit response size** - Paginate large result sets +4. **Use async for I/O** - Database, HTTP calls, file operations +5. **Profile slow endpoints** - Use `starlette-prometheus` for monitoring +6. 
**Enable gzip compression** - `GZipMiddleware` for large JSON responses + +## When NOT to Use FastAPI + +- **Simple CRUD with admin panel** → Django (has built-in admin) +- **Heavy template rendering** → Django or Flask +- **Mature ecosystem needed** → Django (more third-party packages) +- **Team unfamiliar with async** → Flask or Django (simpler mental model) + +FastAPI excels at: Modern APIs, microservices, ML model serving, real-time features, high performance requirements. diff --git a/skills/using-web-backend/graphql-api-design.md b/skills/using-web-backend/graphql-api-design.md new file mode 100644 index 0000000..97f4c95 --- /dev/null +++ b/skills/using-web-backend/graphql-api-design.md @@ -0,0 +1,954 @@ + +# GraphQL API Design + +## Overview + +**GraphQL API specialist covering schema design, query optimization, real-time subscriptions, federation, and production patterns.** + +**Core principle**: GraphQL enables clients to request exactly the data they need in a single query - but requires careful schema design, batching strategies, and security measures to prevent performance and security issues. + +## When to Use This Skill + +Use when encountering: + +- **N+1 query problems**: Too many database queries for nested resolvers +- **Schema design**: Types, interfaces, unions, input types, directives +- **Pagination**: Connections, cursors, offset patterns +- **Performance**: Query complexity, caching, batching, persisted queries +- **Real-time**: Subscriptions, WebSocket patterns, live queries +- **Federation**: Splitting schema across multiple services +- **Security**: Query depth limiting, cost analysis, allowlisting +- **Testing**: Schema validation, resolver testing, integration tests +- **Migrations**: Schema evolution, deprecation, versioning + +**Do NOT use for**: +- REST API design → `rest-api-design` +- Framework-specific implementation → `fastapi-development`, `express-development` +- Microservices architecture → `microservices-architecture` (use with Federation) + +## GraphQL vs REST Decision Matrix + +| Factor | Choose GraphQL | Choose REST | +|--------|----------------|-------------| +| **Client needs** | Mobile apps, varying data needs | Uniform data requirements | +| **Over/under-fetching** | Problem | Not a problem | +| **Real-time features** | Subscriptions built-in | Need SSE/WebSockets separately | +| **Schema-first** | Strong typing required | Flexible, schema optional | +| **Caching** | Complex (field-level) | Simple (HTTP caching) | +| **File uploads** | Non-standard (multipart) | Native (multipart/form-data) | +| **Team expertise** | GraphQL experience | REST experience | +| **API consumers** | Known clients | Public/third-party | +| **Rate limiting** | Complex (field-level) | Simple (endpoint-level) | + +**Hybrid approach**: GraphQL for internal/mobile, REST for public APIs + +## Quick Reference - Core Patterns + +| Pattern | Use Case | Key Concept | +|---------|----------|-------------| +| **DataLoader** | N+1 queries | Batch and cache within request | +| **Connection** | Pagination | Cursor-based with edges/nodes | +| **Union** | Heterogeneous results | Search, activity feeds | +| **Interface** | Shared fields | Polymorphic types with guarantees | +| **Directive** | Field behavior | @auth, @deprecated, custom logic | +| **Input types** | Mutations | Type-safe input validation | +| **Federation** | Microservices | Distributed schema composition | +| **Subscription** | Real-time | WebSocket-based live updates | + +## N+1 Query Optimization + +### The Problem + 
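A minimal sketch of the failure mode, assuming Apollo-style resolvers over a hypothetical `db` access layer: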
+```javascript +// Schema +type Post { + id: ID! + title: String! + author: User! // Requires fetching user +} + +type Query { + posts: [Post!]! +} + +// Naive resolver (N+1 problem) +const resolvers = { + Query: { + posts: () => db.posts.findAll() // 1 query + }, + Post: { + author: (post) => db.users.findOne(post.authorId) // N queries! + } +}; + +// Result: 100 posts = 101 database queries +``` + +### DataLoader Solution + +```javascript +const DataLoader = require('dataloader'); + +// Batch loading function +const batchUsers = async (userIds) => { + const users = await db.users.findMany({ + where: { id: { in: userIds } } + }); + + // CRITICAL: Return in same order as requested IDs + const userMap = new Map(users.map(u => [u.id, u])); + return userIds.map(id => userMap.get(id) || null); +}; + +// Create loader per-request (avoid stale cache) +const createLoaders = () => ({ + user: new DataLoader(batchUsers), + post: new DataLoader(batchPosts), + // ... other loaders +}); + +// Add to context +const server = new ApolloServer({ + typeDefs, + resolvers, + context: () => ({ + loaders: createLoaders(), + db, + user: getCurrentUser() + }) +}); + +// Use in resolver +const resolvers = { + Post: { + author: (post, args, { loaders }) => { + return loaders.user.load(post.authorId); // Batched! + } + } +}; +``` + +**Result**: 100 posts = 2 queries (1 for posts, 1 batched for unique authors) + +### Advanced DataLoader Patterns + +**Composite Keys**: + +```javascript +// For multi-field lookups +const batchUsersByEmail = async (keys) => { + // keys = [{domain: 'example.com', email: 'user@example.com'}, ...] + const users = await db.users.findMany({ + where: { + OR: keys.map(k => ({ email: k.email, domain: k.domain })) + } + }); + + const userMap = new Map( + users.map(u => [`${u.domain}:${u.email}`, u]) + ); + + return keys.map(k => userMap.get(`${k.domain}:${k.email}`)); +}; + +const userByEmailLoader = new DataLoader(batchUsersByEmail, { + cacheKeyFn: (key) => `${key.domain}:${key.email}` +}); +``` + +**Priming Cache**: + +```javascript +// After fetching posts, prime user loader +const posts = await db.posts.findAll(); +posts.forEach(post => { + if (post.authorData) { + loaders.user.prime(post.authorId, post.authorData); + } +}); +return posts; +``` + +**Error Handling in Batch**: + +```javascript +const batchUsers = async (userIds) => { + const users = await db.users.findMany({ + where: { id: { in: userIds } } + }); + + const userMap = new Map(users.map(u => [u.id, u])); + + return userIds.map(id => { + const user = userMap.get(id); + if (!user) { + return new Error(`User ${id} not found`); // Per-item error + } + return user; + }); +}; +``` + +## Schema Design Patterns + +### Interface vs Union + +**Interface** (shared fields enforced): + +```graphql +interface Node { + id: ID! +} + +interface Timestamped { + createdAt: DateTime! + updatedAt: DateTime! +} + +type User implements Node & Timestamped { + id: ID! + createdAt: DateTime! + updatedAt: DateTime! + email: String! + name: String! +} + +type Post implements Node & Timestamped { + id: ID! + createdAt: DateTime! + updatedAt: DateTime! + title: String! + content: String! +} + +type Query { + node(id: ID!): Node # Can return any Node implementer + nodes(ids: [ID!]!): [Node!]! +} +``` + +**Query**: +```graphql +{ + node(id: "user_123") { + id + ... on User { + email + name + } + ... 
on Post { + title + } + } +} +``` + +**Union** (no shared fields required): + +```graphql +union SearchResult = User | Post | Comment + +type Query { + search(query: String!): [SearchResult!]! +} +``` + +**When to use each**: + +| Use Case | Pattern | Why | +|----------|---------|-----| +| Global ID lookup | Interface (Node) | Guarantees `id` field | +| Polymorphic lists with shared fields | Interface | Can query shared fields without fragments | +| Heterogeneous results | Union | No shared field requirements | +| Activity feeds | Union | Different event types | +| Search results | Union | Mixed content types | + +### Input Types and Validation + +```graphql +input CreatePostInput { + title: String! + content: String! + tags: [String!] + publishedAt: DateTime +} + +input UpdatePostInput { + title: String + content: String + tags: [String!] +} + +type Mutation { + createPost(input: CreatePostInput!): Post! + updatePost(id: ID!, input: UpdatePostInput!): Post! +} +``` + +**Benefits**: +- Reusable across multiple mutations +- Clear separation of create vs update requirements +- Type-safe in generated code +- Can add descriptions per field + +### Custom Directives + +```graphql +directive @auth(requires: Role = USER) on FIELD_DEFINITION +directive @rateLimit(limit: Int!, window: Int!) on FIELD_DEFINITION +directive @deprecated(reason: String) on FIELD_DEFINITION | ENUM_VALUE + +enum Role { + USER + ADMIN + SUPER_ADMIN +} + +type Query { + publicData: String + userData: User @auth(requires: USER) + adminData: String @auth(requires: ADMIN) + expensiveQuery: Result @rateLimit(limit: 10, window: 60) +} + +type User { + id: ID! + email: String! @auth(requires: USER) # Only authenticated users + internalId: String @deprecated(reason: "Use `id` instead") +} +``` + +## Pagination Patterns + +### Relay Connection Specification + +**Standard connection pattern**: + +```graphql +type PostConnection { + edges: [PostEdge!]! + pageInfo: PageInfo! + totalCount: Int # Optional +} + +type PostEdge { + node: Post! + cursor: String! +} + +type PageInfo { + hasNextPage: Boolean! + hasPreviousPage: Boolean! + startCursor: String + endCursor: String +} + +type Query { + posts( + first: Int + after: String + last: Int + before: String + ): PostConnection! +} +``` + +**Implementation**: + +```javascript +const resolvers = { + Query: { + posts: async (parent, { first, after, last, before }) => { + const limit = first || last || 10; + const cursor = after || before; + + // Decode cursor + const offset = cursor ? decodeCursor(cursor) : 0; + + // Fetch one extra to determine hasNextPage + const posts = await db.posts.findMany({ + skip: offset, + take: limit + 1, + orderBy: { createdAt: 'desc' } + }); + + const hasNextPage = posts.length > limit; + const edges = posts.slice(0, limit).map((post, index) => ({ + node: post, + cursor: encodeCursor(offset + index) + })); + + return { + edges, + pageInfo: { + hasNextPage, + hasPreviousPage: offset > 0, + startCursor: edges[0]?.cursor, + endCursor: edges[edges.length - 1]?.cursor + } + }; + } + } +}; + +// Opaque cursor encoding +const encodeCursor = (offset) => + Buffer.from(`arrayconnection:${offset}`).toString('base64'); +const decodeCursor = (cursor) => + parseInt(Buffer.from(cursor, 'base64').toString().split(':')[1]); +``` + +**Alternative: Offset pagination** (simpler but less robust): + +```graphql +type PostPage { + items: [Post!]! + total: Int! + page: Int! + pageSize: Int! +} + +type Query { + posts(page: Int = 1, pageSize: Int = 20): PostPage! 
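  # Offset pages drift when rows are inserted mid-pagination; prefer the
  # cursor-based connection above for user-facing feeds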
+} +``` + +## Performance Optimization + +### Query Complexity Analysis + +**Prevent expensive queries**: + +```javascript +const depthLimit = require('graphql-depth-limit'); +const { createComplexityLimitRule } = require('graphql-validation-complexity'); + +const server = new ApolloServer({ + typeDefs, + resolvers, + validationRules: [ + depthLimit(10), // Max 10 levels deep + createComplexityLimitRule(1000, { + scalarCost: 1, + objectCost: 2, + listFactor: 10 + }) + ] +}); +``` + +**Custom complexity**: + +```graphql +type Query { + posts(first: Int!): [Post!]! @cost(complexity: 10, multipliers: ["first"]) + expensiveAnalytics: AnalyticsReport! @cost(complexity: 1000) +} +``` + +### Automatic Persisted Queries (APQ) + +**Client sends hash instead of full query**: + +```javascript +// Client +const query = gql` + query GetUser($id: ID!) { + user(id: $id) { name email } + } +`; + +const queryHash = sha256(query); + +// First request: Send hash only +fetch('/graphql', { + body: JSON.stringify({ + extensions: { + persistedQuery: { + version: 1, + sha256Hash: queryHash + } + }, + variables: { id: '123' } + }) +}); + +// If server doesn't have it (PersistedQueryNotFound) +// Second request: Send full query + hash +fetch('/graphql', { + body: JSON.stringify({ + query, + extensions: { + persistedQuery: { + version: 1, + sha256Hash: queryHash + } + }, + variables: { id: '123' } + }) +}); + +// Future requests: Just send hash +``` + +**Benefits**: +- Reduced bandwidth (hash << full query) +- CDN caching of GET requests +- Query allowlisting (if configured) + +### Field-Level Caching + +```javascript +const resolvers = { + Query: { + user: async (parent, { id }, { cache }) => { + const cacheKey = `user:${id}`; + const cached = await cache.get(cacheKey); + if (cached) return JSON.parse(cached); + + const user = await db.users.findOne(id); + await cache.set(cacheKey, JSON.stringify(user), { ttl: 300 }); + return user; + } + } +}; +``` + +## Subscriptions (Real-Time) + +### Basic Subscription + +```graphql +type Subscription { + postAdded: Post! + commentAdded(postId: ID!): Comment! +} + +type Mutation { + createPost(input: CreatePostInput!): Post! 
+} +``` + +**Implementation (Apollo Server)**: + +```javascript +const { PubSub } = require('graphql-subscriptions'); +const pubsub = new PubSub(); + +const resolvers = { + Mutation: { + createPost: async (parent, { input }) => { + const post = await db.posts.create(input); + pubsub.publish('POST_ADDED', { postAdded: post }); + return post; + } + }, + Subscription: { + postAdded: { + subscribe: () => pubsub.asyncIterator(['POST_ADDED']) + }, + commentAdded: { + subscribe: (parent, { postId }) => + pubsub.asyncIterator([`COMMENT_ADDED_${postId}`]) + } + } +}; + +// Client +subscription { + postAdded { + id + title + author { name } + } +} +``` + +### Scaling Subscriptions + +**Problem**: In-memory PubSub doesn't work across servers + +**Solution**: Redis PubSub + +```javascript +const { RedisPubSub } = require('graphql-redis-subscriptions'); +const Redis = require('ioredis'); + +const pubsub = new RedisPubSub({ + publisher: new Redis(), + subscriber: new Redis() +}); + +// Now works across multiple server instances +``` + +### Subscription Authorization + +```javascript +const resolvers = { + Subscription: { + secretDataUpdated: { + subscribe: withFilter( + () => pubsub.asyncIterator(['SECRET_DATA']), + (payload, variables, context) => { + // Only admin users can subscribe + return context.user?.role === 'ADMIN'; + } + ) + } + } +}; +``` + +## Federation (Distributed Schema) + +**Split schema across multiple services**: + +### User Service + +```graphql +# user-service schema +type User @key(fields: "id") { + id: ID! + email: String! + name: String! +} + +type Query { + user(id: ID!): User +} +``` + +### Post Service + +```graphql +# post-service schema +extend type User @key(fields: "id") { + id: ID! @external + posts: [Post!]! +} + +type Post { + id: ID! + title: String! + content: String! + authorId: ID! + author: User! +} +``` + +### Gateway + +Composes schemas and routes requests: + +```javascript +const { ApolloGateway } = require('@apollo/gateway'); + +const gateway = new ApolloGateway({ + serviceList: [ + { name: 'users', url: 'http://user-service:4001/graphql' }, + { name: 'posts', url: 'http://post-service:4002/graphql' } + ] +}); + +const server = new ApolloServer({ + gateway, + subscriptions: false // Not yet supported in federation +}); +``` + +**Reference Resolver** (fetch extended fields): + +```javascript +// post-service resolvers +const resolvers = { + User: { + __resolveReference: async (user) => { + // Receive { __typename: 'User', id: '123' } + // Don't need to fetch user, just return it for field resolution + return user; + }, + posts: async (user) => { + return db.posts.findMany({ where: { authorId: user.id } }); + } + } +}; +``` + +## Security Patterns + +### Query Depth Limiting + +```javascript +const depthLimit = require('graphql-depth-limit'); + +const server = new ApolloServer({ + validationRules: [depthLimit(7)] // Max 7 levels deep +}); + +// Prevents: user { posts { author { posts { author { ... } } } } +``` + +### Query Allowlisting (Production) + +```javascript +const allowedQueries = new Map([ + ['GetUser', 'query GetUser($id: ID!) 
{ user(id: $id) { name } }'], + ['ListPosts', 'query ListPosts { posts { title } }'] +]); + +const server = new ApolloServer({ + validationRules: [ + (context) => ({ + Document(node) { + const queryName = node.definitions[0]?.name?.value; + if (!allowedQueries.has(queryName)) { + context.reportError( + new GraphQLError('Query not allowed') + ); + } + } + }) + ] +}); +``` + +### Rate Limiting (Field-Level) + +```javascript +const { shield, rule, and } = require('graphql-shield'); + +const isRateLimited = rule({ cache: 'contextual' })( + async (parent, args, ctx, info) => { + const key = `rate:${ctx.user.id}:${info.fieldName}`; + const count = await redis.incr(key); + if (count === 1) { + await redis.expire(key, 60); // 1 minute window + } + return count <= 10; // 10 requests per minute + } +); + +const permissions = shield({ + Query: { + expensiveQuery: isRateLimited + } +}); +``` + +## Schema Evolution + +### Deprecation + +```graphql +type User { + id: ID! + username: String @deprecated(reason: "Use `name` instead") + name: String! +} +``` + +**Tooling shows warnings to clients** + +### Breaking Changes (Avoid) + +❌ **Breaking**: +- Removing fields +- Changing field types +- Making nullable → non-nullable +- Removing enum values +- Changing arguments + +✅ **Non-breaking**: +- Adding fields +- Adding types +- Deprecating fields +- Making non-nullable → nullable +- Adding arguments with defaults + +### Versioning Strategy + +**Don't version schema** - evolve incrementally: + +1. Add new field +2. Deprecate old field +3. Monitor usage +4. Remove old field in next major version (if removing) + +## Testing Strategies + +### Schema Validation + +```javascript +const { buildSchema, validateSchema } = require('graphql'); + +test('schema is valid', () => { + const schema = buildSchema(typeDefs); + const errors = validateSchema(schema); + expect(errors).toHaveLength(0); +}); +``` + +### Resolver Testing + +```javascript +const resolvers = require('./resolvers'); + +test('user resolver fetches user', async () => { + const mockDb = { + users: { findOne: jest.fn().mockResolvedValue({ id: '1', name: 'Alice' }) } + }; + + const result = await resolvers.Query.user( + null, + { id: '1' }, + { db: mockDb, loaders: { user: mockDataLoader() } } + ); + + expect(result).toEqual({ id: '1', name: 'Alice' }); + expect(mockDb.users.findOne).toHaveBeenCalledWith('1'); +}); +``` + +### Integration Testing + +```javascript +const { ApolloServer } = require('apollo-server'); +const { createTestClient } = require('apollo-server-testing'); + +const server = new ApolloServer({ typeDefs, resolvers }); +const { query } = createTestClient(server); + +test('GetUser query', async () => { + const GET_USER = gql` + query GetUser($id: ID!) 
{ + user(id: $id) { + name + email + } + } + `; + + const res = await query({ query: GET_USER, variables: { id: '1' } }); + + expect(res.errors).toBeUndefined(); + expect(res.data.user).toMatchObject({ + name: 'Alice', + email: 'alice@example.com' + }); +}); +``` + +## Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| **No DataLoader** | N+1 queries kill performance | Use DataLoader for all entity fetching | +| **Offset pagination** | Breaks with real-time data | Use cursor-based connections | +| **No query complexity** | DoS via deeply nested queries | Set depth/complexity limits | +| **Shared DataLoader instances** | Stale cache across requests | Create new loaders per request | +| **No error masking** | Leaks internal errors to clients | Mask in production, log internally | +| **mutations returning Boolean** | Can't extend response | Return object type | +| **Nullable IDs** | IDs should never be null | Use `ID!` not `ID` | +| **Over-fetching in resolvers** | Selecting * wastes bandwidth | Select only requested fields | + +## Common Mistakes + +### 1. DataLoader Return Order + +```javascript +// ❌ WRONG - Returns in database order +const batchUsers = async (ids) => { + return await db.users.findMany({ where: { id: { in: ids } } }); +}; + +// ✅ CORRECT - Returns in requested order +const batchUsers = async (ids) => { + const users = await db.users.findMany({ where: { id: { in: ids } } }); + const userMap = new Map(users.map(u => [u.id, u])); + return ids.map(id => userMap.get(id)); +}; +``` + +### 2. Mutations Returning Primitives + +```graphql +# ❌ BAD - Can't extend +type Mutation { + deletePost(id: ID!): Boolean! +} + +# ✅ GOOD - Extensible +type DeletePostPayload { + success: Boolean! + deletedPostId: ID + message: String +} + +type Mutation { + deletePost(id: ID!): DeletePostPayload! +} +``` + +### 3. No Context in Subscriptions + +```javascript +// ❌ Missing auth context +const server = new ApolloServer({ + subscriptions: { + onConnect: () => { + return {}; // No user context! 
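      // WebSocket connections bypass HTTP auth middleware entirely,
      // so the user must be resolved here in onConnect (see fix below)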
+ } + } +}); + +// ✅ Include auth +const server = new ApolloServer({ + subscriptions: { + onConnect: (connectionParams) => { + const token = connectionParams.authToken; + const user = verifyToken(token); + return { user }; + } + } +}); +``` + +## Tooling Ecosystem + +**Schema Management**: +- **Apollo Studio**: Schema registry, operation tracking, metrics +- **GraphQL Inspector**: Schema diffing, breaking change detection +- **Graphql-eslint**: Linting for schema and queries + +**Code Generation**: +- **GraphQL Code Generator**: TypeScript types from schema +- **Apollo Codegen**: Client types for queries + +**Development**: +- **GraphiQL**: In-browser IDE +- **Apollo Sandbox**: Modern GraphQL explorer +- **Altair**: Desktop GraphQL client + +**Testing**: +- **EasyGraphQL Test**: Schema mocking +- **GraphQL Tools**: Schema stitching, mocking + +## Cross-References + +**Related skills**: +- **REST comparison** → `rest-api-design` (when to use each) +- **FastAPI implementation** → `fastapi-development` (Strawberry, Graphene) +- **Express implementation** → `express-development` (Apollo Server, GraphQL Yoga) +- **Microservices** → `microservices-architecture` (use with Federation) +- **Security** → `ordis-security-architect` (OWASP API Security) +- **Testing** → `api-testing` (integration testing strategies) +- **Authentication** → `api-authentication` (JWT, OAuth2 with GraphQL) + +## Further Reading + +- **GraphQL Spec**: https://spec.graphql.org/ +- **Apollo Docs**: Federation, caching, tooling +- **Relay Spec**: Connection specification +- **DataLoader GitHub**: facebook/dataloader +- **Production Ready GraphQL**: Book by Marc-André Giroux diff --git a/skills/using-web-backend/message-queues.md b/skills/using-web-backend/message-queues.md new file mode 100644 index 0000000..75b44db --- /dev/null +++ b/skills/using-web-backend/message-queues.md @@ -0,0 +1,993 @@ + +# Message Queues + +## Overview + +**Message queue specialist covering technology selection, reliability patterns, ordering guarantees, schema evolution, and production operations.** + +**Core principle**: Message queues decouple producers from consumers, enabling async processing, load leveling, and resilience - but require careful design for reliability, ordering, monitoring, and operational excellence. 
+ +## When to Use This Skill + +Use when encountering: + +- **Technology selection**: RabbitMQ vs Kafka vs SQS vs SNS +- **Reliability**: Guaranteed delivery, acknowledgments, retries, DLQ +- **Ordering**: Partition keys, FIFO queues, ordered processing +- **Scaling**: Consumer groups, parallelism, backpressure +- **Schema evolution**: Message versioning, Avro, Protobuf +- **Monitoring**: Lag tracking, alerting, distributed tracing +- **Advanced patterns**: Outbox, saga, CQRS, event sourcing +- **Security**: Encryption, IAM, Kafka authentication +- **Testing**: Local testing, chaos engineering, load testing + +**Do NOT use for**: +- Request/response APIs → Use REST or GraphQL instead +- Strong consistency required → Use database transactions +- Real-time streaming analytics → See if streaming-specific skill exists + +## Technology Selection Matrix + +| Factor | RabbitMQ | Apache Kafka | AWS SQS | AWS SNS | +|--------|----------|--------------|---------|---------| +| **Use Case** | Task queues, routing | Event streaming, logs | Simple queues | Pub/sub fanout | +| **Throughput** | 10k-50k msg/s | 100k+ msg/s | 3k msg/s (std), 300 msg/s (FIFO) | 100k+ msg/s | +| **Ordering** | Queue-level | Partition-level (strong) | FIFO queues only | None | +| **Persistence** | Durable queues | Log-based (default) | Managed | Ephemeral (SNS → SQS for durability) | +| **Retention** | Until consumed | Days to weeks | 4 days (std), 14 days max | None (delivery only) | +| **Routing** | Exchanges (topic, fanout, headers) | Topics only | None | Topic-based filtering | +| **Message size** | Up to 128 MB | Up to 1 MB (configurable) | 256 KB | 256 KB | +| **Ops complexity** | Medium (clustering) | High (partitions, replication) | Low (managed) | Low (managed) | +| **Cost** | EC2 self-hosted | Self-hosted or MSK | Pay-per-request | Pay-per-request | + +### Decision Tree + +``` +Are you on AWS and need simple async processing? + → Yes → **AWS SQS** (start simple) + → No → Continue... + +Do you need event replay or stream processing? + → Yes → **Kafka** (log-based, replayable) + → No → Continue... + +Do you need complex routing (topic exchange, headers)? + → Yes → **RabbitMQ** (rich exchange types) + → No → Continue... + +Do you need pub/sub fanout to multiple subscribers? 
+ → Yes → **SNS** (or Kafka topics with multiple consumer groups) + → No → **SQS** or **RabbitMQ** for task queues +``` + +### Migration Path + +| Current State | Next Step | Why | +|---------------|-----------|-----| +| No queue | Start with SQS (if AWS) or RabbitMQ | Lowest operational complexity | +| SQS → 1k+ msg/s | Consider Kafka or sharded SQS | SQS throttles at 3k msg/s | +| RabbitMQ → Event sourcing needed | Migrate to Kafka | Kafka's log retention enables replay | +| Kafka → Simple task queue | Consider RabbitMQ or SQS | Kafka is overkill for simple queues | + +## Reliability Patterns + +### Acknowledgment Modes + +| Mode | When Ack Sent | Reliability | Performance | Use Case | +|------|---------------|-------------|-------------|----------| +| **Auto-ack** | On receive | Low (lost on crash) | High | Logs, analytics, best-effort | +| **Manual ack (after processing)** | After success | High (at-least-once) | Medium | Standard production pattern | +| **Transactional** | In transaction | Highest (exactly-once) | Low | Financial, critical data | + +### At-Least-Once Delivery Pattern + +**SQS**: + +```python +# WRONG: Delete before processing +message = sqs.receive_message(QueueUrl=queue_url)['Messages'][0] +sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=message['ReceiptHandle']) +process(message['Body']) # ❌ If this fails, message is lost + +# CORRECT: Process, then delete +message = sqs.receive_message( + QueueUrl=queue_url, + VisibilityTimeout=300 # 5 minutes to process +)['Messages'][0] + +try: + process(json.loads(message['Body'])) + sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=message['ReceiptHandle']) +except Exception as e: + # Message becomes visible again after timeout + logger.error(f"Processing failed, will retry: {e}") +``` + +**Kafka**: + +```python +# WRONG: Auto-commit before processing +consumer = KafkaConsumer( + 'orders', + enable_auto_commit=True, # ❌ Commits offset before processing + auto_commit_interval_ms=5000 +) + +for msg in consumer: + process(msg.value) # Crash here = message lost + +# CORRECT: Manual commit after processing +consumer = KafkaConsumer( + 'orders', + enable_auto_commit=False +) + +for msg in consumer: + try: + process(msg.value) + consumer.commit() # ✓ Commit only after success + except Exception as e: + logger.error(f"Processing failed, will retry: {e}") + # Don't commit - message will be reprocessed +``` + +**RabbitMQ**: + +```python +import pika + +connection = pika.BlockingConnection(pika.ConnectionParameters('localhost')) +channel = connection.channel() + +def callback(ch, method, properties, body): + try: + process(json.loads(body)) + ch.basic_ack(delivery_tag=method.delivery_tag) # ✓ Ack after success + except Exception as e: + logger.error(f"Processing failed: {e}") + ch.basic_nack(delivery_tag=method.delivery_tag, requeue=True) # Requeue + +channel.basic_consume( + queue='orders', + on_message_callback=callback, + auto_ack=False # ✓ Manual acknowledgment +) + +channel.start_consuming() +``` + +### Idempotency (Critical for At-Least-Once) + +Since at-least-once delivery guarantees duplicates, **all processing must be idempotent**: + +```python +# Pattern 1: Database unique constraint +def process_order(order_id, data): + db.execute( + "INSERT INTO orders (id, user_id, amount, created_at) " + "VALUES (%s, %s, %s, NOW()) " + "ON CONFLICT (id) DO NOTHING", # Idempotent + (order_id, data['user_id'], data['amount']) + ) + +# Pattern 2: Distributed lock (Redis) +def process_order_with_lock(order_id, data): + lock_key = 
f"lock:order:{order_id}" + + # Try to acquire lock (60s TTL) + if not redis.set(lock_key, "1", nx=True, ex=60): + logger.info(f"Order {order_id} already being processed") + return # Duplicate, skip + + try: + # Process order + create_order(data) + charge_payment(data['amount']) + finally: + redis.delete(lock_key) + +# Pattern 3: Idempotency key table +def process_with_idempotency_key(message_id, data): + with db.transaction(): + # Check if already processed + result = db.execute( + "SELECT 1 FROM processed_messages WHERE message_id = %s FOR UPDATE", + (message_id,) + ) + + if result: + return # Already processed + + # Process + record atomically + process_order(data) + db.execute( + "INSERT INTO processed_messages (message_id, processed_at) VALUES (%s, NOW())", + (message_id,) + ) +``` + +## Ordering Guarantees + +### Kafka: Partition-Level Ordering + +**Kafka guarantees ordering within a partition**, not across partitions. + +```python +from kafka import KafkaProducer + +producer = KafkaProducer( + bootstrap_servers=['kafka:9092'], + key_serializer=str.encode, + value_serializer=lambda v: json.dumps(v).encode() +) + +# ✓ Partition key ensures ordering +def publish_order_event(user_id, event_type, data): + producer.send( + 'orders', + key=str(user_id), # All user_id events go to same partition + value={ + 'event_type': event_type, + 'user_id': user_id, + 'data': data, + 'timestamp': time.time() + } + ) + +# User 123's events all go to partition 2 → strict ordering +publish_order_event(123, 'order_placed', {...}) +publish_order_event(123, 'payment_processed', {...}) +publish_order_event(123, 'shipped', {...}) +``` + +**Partition count determines max parallelism**: + +``` +Topic: orders (4 partitions) +Consumer group: order-processors + +2 consumers → Each processes 2 partitions +4 consumers → Each processes 1 partition (max parallelism) +5 consumers → 1 consumer idle (wasted) + +Rule: partition_count >= max_consumers_needed +``` + +### SQS FIFO: MessageGroupId Ordering + +```python +import boto3 + +sqs = boto3.client('sqs') + +# FIFO queue guarantees ordering per MessageGroupId +sqs.send_message( + QueueUrl='orders.fifo', + MessageBody=json.dumps(event), + MessageGroupId=f"user-{user_id}", # Like Kafka partition key + MessageDeduplicationId=f"{event_id}-{timestamp}" # Prevent duplicates +) + +# Throughput limit: 300 msg/s per MessageGroupId +# Workaround: Use multiple MessageGroupIds if possible +``` + +### RabbitMQ: Single Consumer Ordering + +```python +# RabbitMQ guarantees ordering if single consumer +channel.basic_qos(prefetch_count=1) # Process one at a time + +channel.basic_consume( + queue='orders', + on_message_callback=callback, + auto_ack=False +) + +# Multiple consumers break ordering unless using consistent hashing +``` + +## Dead Letter Queues (DLQ) + +### Retry Strategy with Exponential Backoff + +**SQS with DLQ**: + +```python +# Infrastructure setup +main_queue = sqs.create_queue( + QueueName='orders', + Attributes={ + 'RedrivePolicy': json.dumps({ + 'deadLetterTargetArn': dlq_arn, + 'maxReceiveCount': '3' # After 3 failures → DLQ + }), + 'VisibilityTimeout': '300' + } +) + +# Consumer with retry logic +def process_with_retry(message): + attempt = int(message.attributes.get('ApproximateReceiveCount', 0)) + + try: + process_order(json.loads(message.body)) + message.delete() + + except RetriableError as e: + # Exponential backoff: 10s, 20s, 40s, 80s, ... 
+ backoff = min(300, 2 ** attempt * 10) + message.change_visibility(VisibilityTimeout=backoff) + logger.warning(f"Retriable error (attempt {attempt}), retry in {backoff}s") + + except PermanentError as e: + # Send to DLQ immediately + logger.error(f"Permanent error: {e}") + send_to_dlq(message, error=str(e)) + message.delete() + +# Error classification +class RetriableError(Exception): + """Network timeout, rate limit, DB unavailable""" + pass + +class PermanentError(Exception): + """Invalid data, missing field, business rule violation""" + pass +``` + +**Kafka DLQ Pattern**: + +```python +from kafka import KafkaConsumer, KafkaProducer + +consumer = KafkaConsumer('orders', group_id='processor') +dlq_producer = KafkaProducer(bootstrap_servers=['kafka:9092']) + +def process_with_dlq(message): + retry_count = message.headers.get('retry_count', 0) + + try: + process_order(message.value) + consumer.commit() + + except RetriableError as e: + if retry_count < 3: + # Send to retry topic with delay + delay_minutes = 2 ** retry_count # 1min, 2min, 4min + retry_producer.send( + f'orders-retry-{delay_minutes}min', + value=message.value, + headers={'retry_count': retry_count + 1} + ) + else: + # Max retries → DLQ + dlq_producer.send( + 'orders-dlq', + value=message.value, + headers={'error': str(e), 'retry_count': retry_count} + ) + consumer.commit() # Don't reprocess from main topic + + except PermanentError as e: + # Immediate DLQ + dlq_producer.send('orders-dlq', value=message.value, headers={'error': str(e)}) + consumer.commit() +``` + +### DLQ Monitoring & Recovery + +```python +# Alert on DLQ depth +def check_dlq_depth(): + attrs = sqs.get_queue_attributes( + QueueUrl=dlq_url, + AttributeNames=['ApproximateNumberOfMessages'] + ) + depth = int(attrs['Attributes']['ApproximateNumberOfMessages']) + + if depth > 10: + alert(f"DLQ has {depth} messages - investigate!") + +# Manual recovery +def replay_from_dlq(): + """Fix root cause, then replay""" + messages = dlq.receive_messages(MaxNumberOfMessages=10) + + for msg in messages: + data = json.loads(msg.body) + + # Fix data issue + if 'customer_email' not in data: + data['customer_email'] = lookup_email(data['user_id']) + + # Replay to main queue + main_queue.send_message(MessageBody=json.dumps(data)) + msg.delete() +``` + +## Message Schema Evolution + +### Versioning Strategies + +**Pattern 1: Version field in message**: + +```python +# v1 message +{ + "version": "1.0", + "order_id": "123", + "amount": 99.99 +} + +# v2 message (added currency) +{ + "version": "2.0", + "order_id": "123", + "amount": 99.99, + "currency": "USD" +} + +# Consumer handles both versions +def process_order(message): + if message['version'] == "1.0": + amount = message['amount'] + currency = "USD" # Default for v1 + elif message['version'] == "2.0": + amount = message['amount'] + currency = message['currency'] + else: + raise ValueError(f"Unsupported version: {message['version']}") +``` + +**Pattern 2: Apache Avro (Kafka best practice)**: + +```python +from confluent_kafka import avro +from confluent_kafka.avro import AvroProducer, AvroConsumer + +# Define schema +value_schema = avro.loads(''' +{ + "type": "record", + "name": "Order", + "fields": [ + {"name": "order_id", "type": "string"}, + {"name": "amount", "type": "double"}, + {"name": "currency", "type": "string", "default": "USD"} # Backward compatible + ] +} +''') + +# Producer +producer = AvroProducer({ + 'bootstrap.servers': 'kafka:9092', + 'schema.registry.url': 'http://schema-registry:8081' +}, 
default_value_schema=value_schema) + +producer.produce(topic='orders', value={ + 'order_id': '123', + 'amount': 99.99, + 'currency': 'USD' +}) + +# Consumer automatically validates schema +consumer = AvroConsumer({ + 'bootstrap.servers': 'kafka:9092', + 'group.id': 'processor', + 'schema.registry.url': 'http://schema-registry:8081' +}) +``` + +**Avro Schema Evolution Rules**: + +| Change | Compatible? | Notes | +|--------|-------------|-------| +| Add field with default | ✓ Backward compatible | Old consumers ignore new field | +| Remove field | ✓ Forward compatible | New consumers must handle missing field | +| Rename field | ❌ Breaking | Requires migration | +| Change field type | ❌ Breaking | Requires new topic or migration | + +**Pattern 3: Protobuf (alternative to Avro)**: + +```protobuf +syntax = "proto3"; + +message Order { + string order_id = 1; + double amount = 2; + string currency = 3; // New field, backward compatible +} +``` + +### Schema Registry (Kafka) + +``` +Producer → Schema Registry (validate) → Kafka +Consumer → Kafka → Schema Registry (deserialize) + +Benefits: +- Centralized schema management +- Automatic validation +- Schema evolution enforcement +- Type safety +``` + +## Monitoring & Observability + +### Key Metrics + +| Metric | Alert Threshold | Why It Matters | +|--------|----------------|----------------| +| **Queue depth** | > 1000 (or 5min processing time) | Consumers can't keep up | +| **Consumer lag** (Kafka) | > 100k messages or > 5 min | Consumers falling behind | +| **DLQ depth** | > 10 | Messages failing repeatedly | +| **Processing time p99** | > 5 seconds | Slow processing blocks queue | +| **Error rate** | > 5% | Widespread failures | +| **Redelivery rate** | > 10% | Idempotency issues or transient errors | + +### Consumer Lag Monitoring (Kafka) + +```python +from kafka import KafkaAdminClient, TopicPartition + +admin = KafkaAdminClient(bootstrap_servers=['kafka:9092']) + +def check_consumer_lag(group_id, topic): + # Get committed offsets + committed = admin.list_consumer_group_offsets(group_id) + + # Get latest offsets (highwater mark) + consumer = KafkaConsumer(bootstrap_servers=['kafka:9092']) + partitions = [TopicPartition(topic, p) for p in range(partition_count)] + latest = consumer.end_offsets(partitions) + + # Calculate lag + total_lag = 0 + for partition in partitions: + committed_offset = committed[partition].offset + latest_offset = latest[partition] + lag = latest_offset - committed_offset + total_lag += lag + + if lag > 10000: + alert(f"Partition {partition.partition} lag: {lag}") + + return total_lag + +# Alert if total lag > 100k +if check_consumer_lag('order-processor', 'orders') > 100000: + alert("Consumer lag critical!") +``` + +### Distributed Tracing Across Queues + +```python +from opentelemetry import trace +from opentelemetry.propagate import inject, extract + +tracer = trace.get_tracer(__name__) + +# Producer: Inject trace context +def publish_with_trace(topic, message): + with tracer.start_as_current_span("publish-order") as span: + headers = {} + inject(headers) # Inject trace context into headers + + producer.send( + topic, + value=message, + headers=list(headers.items()) + ) + +# Consumer: Extract trace context +def consume_with_trace(message): + context = extract(dict(message.headers)) + + with tracer.start_as_current_span("process-order", context=context) as span: + process_order(message.value) + span.set_attribute("order.id", message.value['order_id']) + +# Trace spans: API → Producer → Queue → Consumer → DB +# Shows 
end-to-end latency including queue wait time +``` + +## Backpressure & Circuit Breakers + +### Rate Limiting Consumers + +```python +import time +from collections import deque + +class RateLimitedConsumer: + def __init__(self, max_per_second=100): + self.max_per_second = max_per_second + self.requests = deque() + + def consume(self, message): + now = time.time() + + # Remove requests older than 1 second + while self.requests and self.requests[0] < now - 1: + self.requests.popleft() + + # Check rate limit + if len(self.requests) >= self.max_per_second: + sleep_time = 1 - (now - self.requests[0]) + time.sleep(sleep_time) + + self.requests.append(time.time()) + process(message) +``` + +### Circuit Breaker for Downstream Dependencies + +```python +from circuitbreaker import circuit + +@circuit(failure_threshold=5, recovery_timeout=60) +def call_payment_service(order_id, amount): + response = requests.post( + 'https://payment-service/charge', + json={'order_id': order_id, 'amount': amount}, + timeout=5 + ) + + if response.status_code >= 500: + raise ServiceUnavailableError() + + return response.json() + +def process_order(message): + try: + result = call_payment_service(message['order_id'], message['amount']) + # ... continue processing + except CircuitBreakerError: + # Circuit open - don't overwhelm failing service + logger.warning("Payment service circuit open, requeueing message") + raise RetriableError("Circuit breaker open") +``` + +## Advanced Patterns + +### Outbox Pattern (Reliable Publishing) + +**Problem**: How to atomically update database AND publish message? + +```python +# ❌ WRONG: Dual write (can fail between DB and queue) +def create_order(data): + db.execute("INSERT INTO orders (...) VALUES (...)") + producer.send('orders', data) # ❌ If this fails, DB updated but no event + +# ✓ CORRECT: Outbox pattern +def create_order_with_outbox(data): + with db.transaction(): + # 1. Insert order + db.execute("INSERT INTO orders (id, user_id, amount) VALUES (%s, %s, %s)", + (data['id'], data['user_id'], data['amount'])) + + # 2. Insert into outbox (same transaction) + db.execute("INSERT INTO outbox (event_type, payload) VALUES (%s, %s)", + ('order.created', json.dumps(data))) + + # Separate process reads outbox and publishes + +# Outbox processor (separate worker) +def process_outbox(): + while True: + events = db.execute("SELECT * FROM outbox WHERE published_at IS NULL LIMIT 10") + + for event in events: + try: + producer.send(event['event_type'], json.loads(event['payload'])) + db.execute("UPDATE outbox SET published_at = NOW() WHERE id = %s", (event['id'],)) + except Exception as e: + logger.error(f"Failed to publish event {event['id']}: {e}") + # Will retry on next iteration + + time.sleep(1) +``` + +### Saga Pattern (Distributed Transactions) + +See `microservices-architecture` skill for full saga patterns (choreography vs orchestration). 
### Saga Pattern (Distributed Transactions)

See `microservices-architecture` skill for full saga patterns (choreography vs orchestration).

**Quick reference for message-based saga**:

```python
# Order saga coordinator publishes commands
def create_order_saga(order_data):
    saga_id = str(uuid.uuid4())

    # Step 1: Reserve inventory
    producer.send('inventory-commands', {
        'command': 'reserve',
        'saga_id': saga_id,
        'order_id': order_data['order_id'],
        'items': order_data['items']
    })

    # Inventory service responds on 'inventory-events'
    # If success → proceed to step 2
    # If failure → compensate (cancel order)
```

## Security

### Message Encryption

**SQS**: Server-side encryption (SSE) with KMS

```python
sqs.create_queue(
    QueueName='orders-encrypted',
    Attributes={
        'KmsMasterKeyId': 'alias/my-key',  # AWS KMS
        'KmsDataKeyReusePeriodSeconds': '300'
    }
)
```

**Kafka**: Encryption in transit + at rest

```python
# SSL/TLS for in-transit encryption
producer = KafkaProducer(
    bootstrap_servers=['kafka:9093'],
    security_protocol='SSL',
    ssl_cafile='/path/to/ca-cert',
    ssl_certfile='/path/to/client-cert',
    ssl_keyfile='/path/to/client-key'
)

# Encryption at rest (Kafka broker config)
# log.dirs=/encrypted-volume  # Use encrypted EBS volumes
```

### Authentication & Authorization

**SQS**: IAM policies

```json
{
  "Version": "2012-10-17",
  "Statement": [{
    "Effect": "Allow",
    "Principal": {"AWS": "arn:aws:iam::123456789012:role/OrderService"},
    "Action": ["sqs:SendMessage"],
    "Resource": "arn:aws:sqs:us-east-1:123456789012:orders"
  }]
}
```

**Kafka**: SASL/SCRAM authentication

```python
producer = KafkaProducer(
    bootstrap_servers=['kafka:9093'],
    security_protocol='SASL_SSL',
    sasl_mechanism='SCRAM-SHA-512',
    sasl_plain_username='order-service',
    sasl_plain_password='secret'
)
```

**Kafka ACLs** (authorization):

```bash
# Grant order-service permission to write to orders topic
kafka-acls --add \
  --allow-principal User:order-service \
  --operation Write \
  --topic orders
```

## Testing Strategies

### Local Testing

**LocalStack for SQS/SNS**:

```yaml
# docker-compose.yml
services:
  localstack:
    image: localstack/localstack
    environment:
      - SERVICES=sqs,sns
```

```python
# Test code
import boto3

sqs = boto3.client(
    'sqs',
    endpoint_url='http://localhost:4566',  # LocalStack
    region_name='us-east-1'
)

queue_url = sqs.create_queue(QueueName='test-orders')['QueueUrl']
sqs.send_message(QueueUrl=queue_url, MessageBody='test')
```

**Kafka in Docker**:

```yaml
# docker-compose.yml
services:
  zookeeper:
    image: confluentinc/cp-zookeeper:latest
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181

  kafka:
    image: confluentinc/cp-kafka:latest
    ports:
      - "9092:9092"
    environment:
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092
```

### Integration Testing

```python
import json

import pytest
from kafka import KafkaConsumer, KafkaProducer
from testcontainers.kafka import KafkaContainer

@pytest.fixture
def kafka():
    with KafkaContainer() as container:
        yield container.get_bootstrap_server()

def test_order_processing(kafka):
    producer = KafkaProducer(bootstrap_servers=kafka)
    consumer = KafkaConsumer('orders', bootstrap_servers=kafka, auto_offset_reset='earliest')

    # Publish message
    producer.send('orders', value=b'{"order_id": "123"}')
    producer.flush()

    # Consume and verify
    message = next(consumer)
    assert json.loads(message.value)['order_id'] == '123'
```
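The broker round-trip above proves delivery; idempotency itself is cheaper to pin down at the handler level. A sketch, where `handle_order` and `db` are hypothetical stand-ins for your consumer handler and database helper:

```python
def test_duplicate_delivery_is_idempotent():
    message = {'order_id': '123', 'amount': 99.99, 'currency': 'USD'}

    handle_order(message)  # hypothetical handler under test
    handle_order(message)  # simulate at-least-once redelivery

    # Second delivery must not create a second row
    count = db.execute("SELECT COUNT(*) FROM orders WHERE id = %s",
                       (message['order_id'],))[0]
    assert count == 1
```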
### Chaos Engineering

```python
import subprocess
import time

# Test consumer failure recovery
# Assumes `producer`, `test_order`, and `db` come from the surrounding test module
def test_consumer_crash_recovery():
    # Start consumer
    consumer_process = subprocess.Popen(['python', 'consumer.py'])
    time.sleep(2)

    # Publish message
    producer.send('orders', value=test_order)
    producer.flush()

    # Kill consumer mid-processing
    consumer_process.kill()

    # Restart consumer
    consumer_process = subprocess.Popen(['python', 'consumer.py'])
    time.sleep(5)

    # Verify message was reprocessed exactly once (idempotency!)
    assert db.execute("SELECT COUNT(*) FROM orders WHERE id = %s",
                      (test_order['id'],))[0] == 1
```

## Anti-Patterns

| Anti-Pattern | Why Bad | Fix |
|--------------|---------|-----|
| **Auto-ack before processing** | Messages lost on crash | Manual ack after processing |
| **No idempotency** | Duplicates cause data corruption | Unique constraints, locks, or idempotency keys |
| **No DLQ** | Poison messages block queue | Configure DLQ with maxReceiveCount |
| **No monitoring** | Can't detect consumer lag or failures | Monitor lag, depth, error rate |
| **Synchronous message processing** | Low throughput | Batch processing, parallel consumers |
| **Large messages** | Exceeds queue limits, slow transfer | Store in S3, send reference in message (sketched below) |
| **No schema versioning** | Breaking changes break consumers | Use Avro/Protobuf with schema registry |
| **Shared consumer instances** | Race conditions, duplicate processing | Use consumer groups (Kafka) or visibility timeout (SQS) |
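The **Large messages** row points at the claim-check pattern: park the payload in object storage and enqueue only a reference. A minimal sketch, assuming boto3 and a pre-created `order-payloads` bucket (both illustrative):

```python
import json
import uuid

import boto3

s3 = boto3.client('s3')
BUCKET = 'order-payloads'  # assumed: bucket created out of band

def publish_large_message(producer, topic, payload: bytes):
    # Store the payload in S3; the queue only carries a pointer
    key = f'{topic}/{uuid.uuid4()}'
    s3.put_object(Bucket=BUCKET, Key=key, Body=payload)
    producer.send(topic, value=json.dumps(
        {'s3_bucket': BUCKET, 's3_key': key}).encode())

def consume_large_message(message) -> bytes:
    # Resolve the pointer back to the real payload
    ref = json.loads(message.value)
    obj = s3.get_object(Bucket=ref['s3_bucket'], Key=ref['s3_key'])
    return obj['Body'].read()
```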
## Technology-Specific Patterns

### RabbitMQ Exchanges

```python
# Topic exchange for routing
channel.exchange_declare(exchange='orders', exchange_type='topic')

# Bind queues with patterns
channel.queue_bind(exchange='orders', queue='us-orders', routing_key='order.us.*')
channel.queue_bind(exchange='orders', queue='eu-orders', routing_key='order.eu.*')

# Publish with routing key
channel.basic_publish(
    exchange='orders',
    routing_key='order.us.california',  # Goes to us-orders queue
    body=json.dumps(order)
)

# Fanout exchange for pub/sub
channel.exchange_declare(exchange='analytics', exchange_type='fanout')
# All bound queues receive every message
```

### Kafka Connect (Data Integration)

```json
{
  "name": "mysql-source",
  "config": {
    "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
    "connection.url": "jdbc:mysql://localhost:3306/mydb",
    "table.whitelist": "orders",
    "mode": "incrementing",
    "incrementing.column.name": "id",
    "topic.prefix": "mysql-"
  }
}
```

**Use cases**:
- Stream DB changes to Kafka (CDC)
- Sink Kafka to Elasticsearch, S3, databases
- No custom code needed for common integrations

## Batching Optimizations

### Batch Size Tuning

```python
# SQS batch receiving (boto3 client API: receive_message, up to 10 per call)
response = sqs.receive_message(
    QueueUrl=queue_url,
    MaxNumberOfMessages=10,  # Fetch 10 at once
    WaitTimeSeconds=20       # Long polling (reduces empty receives)
)
messages = response.get('Messages', [])

# Process in parallel
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process, msg) for msg in messages]
    for future in futures:
        future.result()

# Kafka batch consuming: iterating a KafkaConsumer yields one record at a
# time, so use poll(), which returns {TopicPartition: [records]}
consumer = KafkaConsumer(
    'orders',
    bootstrap_servers=['kafka:9092'],
    max_poll_records=500,  # Fetch up to 500 messages per poll
    fetch_min_bytes=1024   # Wait for at least 1KB before returning
)

while True:
    records = consumer.poll(timeout_ms=1000)
    for partition, batch in records.items():
        batch_process(batch)  # Process up to 500 at once
```

**Batch size tradeoffs**:

| Batch Size | Throughput | Latency | Memory |
|------------|------------|---------|--------|
| 1 | Low | Low | Low |
| 10-100 | Medium | Medium | Medium |
| 500+ | High | High | High |

**Recommendation**: Start with 10-100, increase for higher throughput if latency allows.

## Cross-References

**Related skills**:
- **Microservices communication** → `microservices-architecture` (saga, event-driven)
- **FastAPI async** → `fastapi-development` (consuming queues in FastAPI)
- **REST vs async** → `rest-api-design` (when to use queues vs HTTP)
- **Security** → `ordis-security-architect` (encryption, IAM, compliance)
- **Testing** → `api-testing` (integration testing strategies)

## Further Reading

- **Enterprise Integration Patterns** by Gregor Hohpe (message patterns)
- **Designing Data-Intensive Applications** by Martin Kleppmann (Kafka internals)
- **RabbitMQ in Action** by Alvaro Videla
- **Kafka: The Definitive Guide** by Neha Narkhede, Gwen Shapira, and Todd Palino
- **AWS SQS Best Practices**: https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-best-practices.html
diff --git a/skills/using-web-backend/microservices-architecture.md b/skills/using-web-backend/microservices-architecture.md
new file mode 100644
index 0000000..645513e
--- /dev/null
+++ b/skills/using-web-backend/microservices-architecture.md
@@ -0,0 +1,592 @@

# Microservices Architecture

## Overview

**Microservices architecture specialist covering service boundaries, communication patterns, data consistency, and operational concerns.**

**Core principle**: Microservices decompose applications into independently deployable services organized around business capabilities - enabling team autonomy and technology diversity at the cost of operational complexity and distributed system challenges.

## When to Use This Skill

Use when encountering:

- **Service boundaries**: Defining service scope, applying domain-driven design
- **Monolith decomposition**: Strategies for splitting existing systems
- **Data consistency**: Sagas, event sourcing, eventual consistency patterns
- **Communication**: Sync (REST/gRPC) vs async (events/messages)
- **API gateways**: Routing, authentication, rate limiting
- **Service discovery**: Registry patterns, DNS, configuration
- **Resilience**: Circuit breakers, retries, timeouts, bulkheads
- **Observability**: Distributed tracing, logging aggregation, metrics
- **Deployment**: Containers, orchestration, blue-green deployments

**Do NOT use for**:
- Monolithic architectures (microservices aren't always better)
- Single-team projects < 5 services (overhead exceeds benefits)
- Simple CRUD applications (microservices add unnecessary complexity)

## When NOT to Use Microservices

**Stay monolithic if**:
- Team < 10 engineers
- Domain is not well understood yet
- Strong consistency required everywhere
- Network latency is critical
- You can't invest in observability/DevOps infrastructure

**Microservices require**: Mature DevOps, monitoring, distributed systems expertise, organizational support.

## Service Boundary Patterns (Domain-Driven Design)

### 1. Bounded Contexts

**Pattern: One microservice = One bounded context**

```
❌ Too fine-grained (anemic services):
- UserService (just CRUD)
- OrderService (just CRUD)
- PaymentService (just CRUD)

✅ Business capability alignment:
- CustomerManagementService (user profiles, preferences, history)
- OrderFulfillmentService (order lifecycle, inventory, shipping)
- PaymentProcessingService (payment, billing, invoicing, refunds)
```

**Identifying boundaries**:
1. **Ubiquitous language** - Different terms for the same concept = different contexts
2. **Change patterns** - Services that change together should stay together
3. **Team ownership** - One team should own one service
4. **Data autonomy** - Each service owns its data, no shared databases
### 2. Strategic DDD Patterns

| Pattern | Use When | Example |
|---------|----------|---------|
| **Separate Ways** | Contexts are independent | Analytics service, main app service |
| **Partnership** | Teams must collaborate closely | Order + Inventory services |
| **Customer-Supplier** | Upstream/downstream relationship | Payment gateway (upstream) → Order service |
| **Conformist** | Accept upstream model as-is | Third-party API integration |
| **Anti-Corruption Layer** | Isolate from legacy/external systems | ACL between new microservices and legacy monolith |

### 3. Service Sizing Guidelines

**Too small (Nanoservices)**:
- Excessive network calls
- Distributed monolith
- Coordination overhead exceeds benefits

**Too large (Minimonoliths)**:
- Multiple teams modifying same service
- Mixed deployment frequencies
- Tight coupling re-emerges

**Right size indicators**:
- Single team can own it
- Deployable independently
- Changes don't ripple to other services
- Clear business capability
- 100-10,000 LOC (highly variable)

## Communication Patterns

### Synchronous Communication

**REST APIs**:

```python
# Order service calling Payment service
async def create_order(order: Order):
    # Synchronous REST call
    payment = await payment_service.charge(
        amount=order.total,
        customer_id=order.customer_id
    )

    if payment.status == "success":
        order.status = "confirmed"
        await db.save(order)
        return order
    else:
        raise PaymentFailedException()
```

**Pros**: Simple, request-response, easy to debug
**Cons**: Tight coupling, availability dependency, latency cascades

**gRPC**:

```protobuf
// Proto definition
service OrderService {
  rpc CreateOrder (OrderRequest) returns (OrderResponse);
}
```

```python
# Implementation
class OrderServicer(order_pb2_grpc.OrderServiceServicer):
    async def CreateOrder(self, request, context):
        # Type-safe, efficient binary protocol
        payment = await payment_stub.Charge(
            PaymentRequest(amount=request.total)
        )
        return OrderResponse(order_id=order.id)
```

**Pros**: Type-safe, efficient, streaming support
**Cons**: HTTP/2 required, less human-readable, proto dependencies

### Asynchronous Communication

**Event-Driven (Pub/Sub)**:

```python
# Order service publishes event
await event_bus.publish("order.created", {
    "order_id": order.id,
    "customer_id": customer.id,
    "total": order.total
})

# Inventory service subscribes
@event_bus.subscribe("order.created")
async def reserve_inventory(event):
    await inventory.reserve(event["order_id"])
    await event_bus.publish("inventory.reserved", {...})

# Notification service subscribes
@event_bus.subscribe("order.created")
async def send_confirmation(event):
    await email.send_order_confirmation(event)
```

**Pros**: Loose coupling, services independent, scalable
**Cons**: Eventual consistency, harder to trace, ordering challenges

**Message Queues (Point-to-Point)**:

```python
# Producer
await queue.send("payment-processing", {
    "order_id": order.id,
    "amount": order.total
})

# Consumer
@queue.consumer("payment-processing")
async def process_payment(message):
    result = await payment_gateway.charge(message["amount"])
    if result.success:
        await message.ack()
    else:
        await message.nack(requeue=True)
```

**Pros**:
Guaranteed delivery, work distribution, retry handling +**Cons**: Queue becomes bottleneck, requires message broker + +### Communication Pattern Decision Matrix + +| Scenario | Pattern | Why | +|----------|---------|-----| +| User-facing request/response | Sync (REST/gRPC) | Low latency, immediate feedback | +| Background processing | Async (queue) | Don't block user, retry support | +| Cross-service notifications | Async (pub/sub) | Loose coupling, multiple consumers | +| Real-time updates | WebSocket/SSE | Bidirectional, streaming | +| Data replication | Event sourcing | Audit trail, rebuild state | +| High throughput | Async (messaging) | Buffer spikes, backpressure | + +## Data Consistency Patterns + +### 1. Saga Pattern (Distributed Transactions) + +**Choreography (Event-Driven)**: + +```python +# Order Service +async def create_order(order): + order.status = "pending" + await db.save(order) + await events.publish("order.created", order) + +# Payment Service +@events.subscribe("order.created") +async def handle_order(event): + try: + await charge_customer(event["total"]) + await events.publish("payment.completed", event) + except PaymentError: + await events.publish("payment.failed", event) + +# Inventory Service +@events.subscribe("payment.completed") +async def reserve_items(event): + try: + await reserve(event["items"]) + await events.publish("inventory.reserved", event) + except InventoryError: + await events.publish("inventory.failed", event) + +# Order Service (Compensation) +@events.subscribe("payment.failed") +async def cancel_order(event): + order = await db.get(event["order_id"]) + order.status = "cancelled" + await db.save(order) + +@events.subscribe("inventory.failed") +async def refund_payment(event): + await payment.refund(event["order_id"]) + await cancel_order(event) +``` + +**Orchestration (Coordinator)**: + +```python +class OrderSaga: + def __init__(self, order): + self.order = order + self.completed_steps = [] + + async def execute(self): + try: + # Step 1: Reserve inventory + await self.reserve_inventory() + self.completed_steps.append("inventory") + + # Step 2: Process payment + await self.process_payment() + self.completed_steps.append("payment") + + # Step 3: Confirm order + await self.confirm_order() + + except Exception as e: + # Compensate in reverse order + await self.compensate() + raise + + async def compensate(self): + for step in reversed(self.completed_steps): + if step == "inventory": + await inventory_service.release(self.order.id) + elif step == "payment": + await payment_service.refund(self.order.id) +``` + +**Choreography vs Orchestration**: + +| Aspect | Choreography | Orchestration | +|--------|--------------|---------------| +| Coordination | Decentralized (events) | Centralized (orchestrator) | +| Coupling | Loose | Tight to orchestrator | +| Complexity | Distributed across services | Concentrated in orchestrator | +| Tracing | Harder (follow events) | Easier (single coordinator) | +| Failure handling | Implicit (event handlers) | Explicit (orchestrator logic) | +| Best for | Simple workflows | Complex workflows | + +### 2. 
Event Sourcing

**Pattern: Store events, not state**

```python
# Traditional approach (storing state)
class Order:
    id: int
    status: str  # "pending" → "confirmed" → "shipped"
    total: float

# Event sourcing (storing events)
class OrderCreated(Event):
    order_id: int
    total: float

class OrderConfirmed(Event):
    order_id: int

class OrderShipped(Event):
    order_id: int

# Rebuild state from events
def rebuild_order(order_id):
    events = event_store.get_events(order_id)
    order = Order()
    for event in events:
        order.apply(event)  # Apply each event to rebuild state
    return order
```

**Pros**: Complete audit trail, time travel, event replay
**Cons**: Complexity, eventual consistency, schema evolution challenges

### 3. CQRS (Command Query Responsibility Segregation)

**Separate read and write models**:

```python
# Write model (commands)
class CreateOrder:
    async def execute(self, data):  # async so the awaits below are valid
        order = Order(**data)
        await db.save(order)
        await event_bus.publish("order.created", order)

# Read model (projections)
class OrderReadModel:
    # Denormalized for fast reads
    def __init__(self):
        self.cache = {}

    @event_bus.subscribe("order.created")
    async def on_order_created(self, event):
        self.cache[event["order_id"]] = {
            "id": event["order_id"],
            "customer_name": await get_customer_name(event["customer_id"]),
            "status": "pending",
            "total": event["total"]
        }

    def get_order(self, order_id):
        return self.cache.get(order_id)  # Fast read, no joins
```

**Use when**: Read/write patterns differ significantly (e.g., analytics dashboards)

## Resilience Patterns

### 1. Circuit Breaker

```python
from circuitbreaker import circuit

@circuit(failure_threshold=5, recovery_timeout=60)
async def call_payment_service(amount):
    response = await http.post("http://payment-service/charge", json={"amount": amount})
    if response.status >= 500:
        raise PaymentServiceError()
    return response.json()

# Circuit states:
# CLOSED → normal operation
# OPEN → fails fast after threshold
# HALF_OPEN → test if service recovered
```

### 2. Retry with Exponential Backoff

```python
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10)
)
async def call_with_retry(url):
    return await http.get(url)

# Waits between attempts: 2s → 4s (3 attempts total, capped at 10s)
```

### 3. Timeout

```python
import asyncio

async def call_with_timeout(url):
    try:
        return await asyncio.wait_for(
            http.get(url),
            timeout=5.0  # 5 second timeout
        )
    except asyncio.TimeoutError:
        return {"error": "Service timeout"}
```

### 4.
Bulkhead + +**Isolate resources to prevent cascade failures**: + +```python +# Separate thread pools for different services +payment_pool = ThreadPoolExecutor(max_workers=10) +inventory_pool = ThreadPoolExecutor(max_workers=5) + +async def call_payment(): + return await asyncio.get_event_loop().run_in_executor( + payment_pool, + payment_service.call + ) + +# If payment service is slow, it only exhausts payment_pool, +# inventory calls still work +``` + +## API Gateway Pattern + +**Centralized entry point for client requests**: + +``` +Client → API Gateway → [Order, Payment, Inventory services] +``` + +**Responsibilities**: +- Routing requests to services +- Authentication/authorization +- Rate limiting +- Request/response transformation +- Caching +- Logging/monitoring + +**Example (Kong, AWS API Gateway, Nginx)**: + +```yaml +# API Gateway config +routes: + - path: /orders + service: order-service + auth: jwt + ratelimit: 100/minute + + - path: /payments + service: payment-service + auth: oauth2 + ratelimit: 50/minute +``` + +**Backend for Frontend (BFF) Pattern**: + +``` +Web Client → Web BFF → Services +Mobile App → Mobile BFF → Services +``` + +Each client type has optimized gateway. + +## Service Discovery + +### 1. Client-Side Discovery + +```python +# Service registry (Consul, Eureka) +registry = ServiceRegistry("http://consul:8500") + +# Client looks up service +instances = registry.get_instances("payment-service") +instance = load_balancer.choose(instances) +response = await http.get(f"http://{instance.host}:{instance.port}/charge") +``` + +### 2. Server-Side Discovery (Load Balancer) + +``` +Client → Load Balancer → [Service Instance 1, Instance 2, Instance 3] +``` + +**DNS-based**: Kubernetes services, AWS ELB + +## Observability + +### Distributed Tracing + +```python +from opentelemetry import trace + +tracer = trace.get_tracer(__name__) + +async def create_order(order): + with tracer.start_as_current_span("create-order") as span: + span.set_attribute("order.id", order.id) + span.set_attribute("order.total", order.total) + + # Trace propagates to payment service + payment = await payment_service.charge( + amount=order.total, + trace_context=span.context + ) + + span.add_event("payment-completed") + return order +``` + +**Tools**: Jaeger, Zipkin, AWS X-Ray, Datadog APM + +### Log Aggregation + +**Structured logging with correlation IDs**: + +```python +import logging +import uuid + +logger = logging.getLogger(__name__) + +async def handle_request(request): + correlation_id = request.headers.get("X-Correlation-ID") or str(uuid.uuid4()) + + logger.info("Processing request", extra={ + "correlation_id": correlation_id, + "service": "order-service", + "user_id": request.user_id + }) +``` + +**Tools**: ELK stack (Elasticsearch, Logstash, Kibana), Splunk, Datadog + +## Monolith Decomposition Strategies + +### 1. Strangler Fig Pattern + +**Gradually replace monolith with microservices**: + +``` +Phase 1: Monolith handles everything +Phase 2: Extract service, proxy some requests to it +Phase 3: More services extracted, proxy more requests +Phase 4: Monolith retired +``` + +### 2. Branch by Abstraction + +1. Create abstraction layer in monolith +2. Implement new service +3. Gradually migrate code behind abstraction +4. Remove old implementation +5. Extract as microservice + +### 3. Extract by Bounded Context + +Priority order: +1. Services with clear boundaries (authentication, payments) +2. Services changing frequently +3. Services with different scaling needs +4. 
Services with technology mismatches (e.g., Java monolith, Python ML service) + +## Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| **Distributed Monolith** | Services share database, deploy together | One DB per service, independent deployment | +| **Nanoservices** | Too fine-grained, excessive network calls | Merge related services, follow DDD | +| **Shared Database** | Tight coupling, schema changes break multiple services | Database per service | +| **Synchronous Chains** | A→B→C→D, latency adds up, cascading failures | Async events, parallelize where possible | +| **Chatty Services** | N+1 calls, excessive network overhead | Batch APIs, caching, coarser boundaries | +| **No Circuit Breakers** | Cascading failures bring down system | Circuit breakers + timeouts + retries | +| **No Distributed Tracing** | Impossible to debug cross-service issues | OpenTelemetry, correlation IDs | + +## Cross-References + +**Related skills**: +- **Message queues** → `message-queues` (RabbitMQ, Kafka patterns) +- **REST APIs** → `rest-api-design` (service interface design) +- **gRPC** → Check if gRPC skill exists +- **Security** → `ordis-security-architect` (service-to-service auth, zero trust) +- **Database** → `database-integration` (per-service databases, migrations) +- **Testing** → `api-testing` (contract testing, integration testing) + +## Further Reading + +- **Building Microservices** by Sam Newman +- **Domain-Driven Design** by Eric Evans +- **Release It!** by Michael Nygard (resilience patterns) +- **Microservices Patterns** by Chris Richardson diff --git a/skills/using-web-backend/rest-api-design.md b/skills/using-web-backend/rest-api-design.md new file mode 100644 index 0000000..7106a14 --- /dev/null +++ b/skills/using-web-backend/rest-api-design.md @@ -0,0 +1,523 @@ + +# REST API Design + +## Overview + +**REST API design specialist covering resource modeling, HTTP semantics, versioning, pagination, and API evolution.** + +**Core principle**: REST is an architectural style based on resources, HTTP semantics, and stateless communication. Good REST API design makes resources discoverable, operations predictable, and evolution manageable. + +## When to Use This Skill + +Use when encountering: + +- **Resource modeling**: Designing URL structures, choosing singular vs plural, handling relationships +- **HTTP methods**: GET, POST, PUT, PATCH, DELETE semantics and idempotency +- **Status codes**: Choosing correct 2xx, 4xx, 5xx codes +- **Versioning**: URI vs header versioning, managing API evolution +- **Pagination**: Offset, cursor, or page-based pagination strategies +- **Filtering/sorting**: Query parameter design for collections +- **Error responses**: Standardized error formats +- **HATEOAS**: Hypermedia-driven APIs and discoverability + +**Do NOT use for**: +- GraphQL API design → `graphql-api-design` +- Framework-specific implementation → `fastapi-development`, `django-development`, `express-development` +- Authentication patterns → `api-authentication` + +## Quick Reference - HTTP Methods + +| Method | Semantics | Idempotent? | Safe? 
| Request Body | Response Body | +|--------|-----------|-------------|-------|--------------|---------------| +| GET | Retrieve resource | ✅ Yes | ✅ Yes | ❌ No | ✅ Yes | +| POST | Create resource | ❌ No | ❌ No | ✅ Yes | ✅ Yes | +| PUT | Replace resource | ✅ Yes | ❌ No | ✅ Yes | ✅ Optional | +| PATCH | Partial update | ❌ No* | ❌ No | ✅ Yes | ✅ Optional | +| DELETE | Remove resource | ✅ Yes | ❌ No | ❌ Optional | ✅ Optional | +| HEAD | Retrieve headers | ✅ Yes | ✅ Yes | ❌ No | ❌ No | +| OPTIONS | Supported methods | ✅ Yes | ✅ Yes | ❌ No | ✅ Yes | + +*PATCH can be designed to be idempotent but often isn't + +## Quick Reference - Status Codes + +| Code | Meaning | Use When | +|------|---------|----------| +| 200 OK | Success | GET, PUT, PATCH succeeded with response body | +| 201 Created | Resource created | POST created new resource | +| 202 Accepted | Async processing | Request accepted, processing continues async | +| 204 No Content | Success, no body | DELETE succeeded, PUT/PATCH succeeded without response | +| 400 Bad Request | Invalid input | Validation failed, malformed request | +| 401 Unauthorized | Authentication failed | Missing or invalid credentials | +| 403 Forbidden | Authorization failed | User authenticated but lacks permission | +| 404 Not Found | Resource missing | Resource doesn't exist | +| 409 Conflict | State conflict | Resource already exists, version conflict | +| 422 Unprocessable Entity | Semantic error | Valid syntax but business logic failed | +| 429 Too Many Requests | Rate limited | User exceeded rate limit | +| 500 Internal Server Error | Server error | Unexpected server failure | +| 503 Service Unavailable | Temporary outage | Maintenance, overload | + +## Resource Modeling Patterns + +### 1. URL Structure + +**✅ Good patterns**: + +``` +GET /users # List users +POST /users # Create user +GET /users/{id} # Get specific user +PUT /users/{id} # Replace user +PATCH /users/{id} # Update user +DELETE /users/{id} # Delete user + +GET /users/{id}/orders # User's orders (nested resource) +POST /users/{id}/orders # Create order for user +GET /orders/{id} # Get specific order (top-level for direct access) + +GET /search/users?q=john # Search endpoint +``` + +**❌ Anti-patterns**: + +``` +GET /getUsers # Verb in URL (use HTTP method instead) +POST /users/create # Redundant verb +GET /users/123/delete # DELETE operation via GET +POST /api?action=createUser # RPC-style, not REST +GET /users/{id}/orders/{id} # Ambiguous - which {id}? +``` + +### 2. Singular vs Plural + +**Convention: Use plural for collections, even for single-item endpoints** + +``` +✅ /users/{id} # Consistent plural +✅ /orders/{id} # Consistent plural + +❌ /user/{id} # Inconsistent singular +❌ /users/{id}/order/{id} # Mixed singular/plural +``` + +**Exception**: Non-countable resources can be singular + +``` +✅ /me # Current user context +✅ /config # Application config (single resource) +✅ /health # Health check endpoint +``` + +### 3. 
Nested Resources vs Top-Level

**Nested when showing relationship**:

```
GET /users/{userId}/orders          # "Orders belonging to this user"
POST /users/{userId}/orders         # "Create order for this user"
```

**Top-level when resource has independent identity**:

```
GET /orders/{orderId}               # Direct access to order
DELETE /orders/{orderId}            # Delete order directly
```

**Guidelines**:
- Nest ≤ 2 levels deep (`/users/{id}/orders/{id}` is max)
- Provide top-level access for resources that exist independently
- Use query parameters for filtering instead of deep nesting

```
✅ GET /orders?userId=123                    # Better than /users/123/orders/{id}
❌ GET /users/{id}/orders/{id}/items/{id}    # Too deep
```

## Pagination Patterns

### Offset Pagination

**Good for**: Small datasets, page numbers, SQL databases

```
GET /users?limit=20&offset=40

Response:
{
  "data": [...],
  "pagination": {
    "limit": 20,
    "offset": 40,
    "total": 1000,
    "hasMore": true
  }
}
```

**Pros**: Simple, allows jumping to any page
**Cons**: Performance degrades with large offsets, inconsistent with concurrent modifications

### Cursor Pagination

**Good for**: Large datasets, real-time data, NoSQL databases

```
GET /users?limit=20&after=eyJpZCI6MTIzfQ

Response:
{
  "data": [...],
  "pagination": {
    "nextCursor": "eyJpZCI6MTQzfQ",
    "hasMore": true
  }
}
```

**Pros**: Consistent results, efficient for large datasets
**Cons**: Can't jump to arbitrary page, cursors are opaque

### Page-Based Pagination

**Good for**: UIs with page numbers

```
GET /users?page=3&pageSize=20

Response:
{
  "data": [...],
  "pagination": {
    "page": 3,
    "pageSize": 20,
    "totalPages": 50,
    "totalCount": 1000
  }
}
```

**Choice matrix**:

| Use Case | Pattern |
|----------|---------|
| Admin dashboards, small datasets | Offset or Page |
| Infinite scroll feeds | Cursor |
| Real-time data (chat, notifications) | Cursor |
| Need page numbers in UI | Page |
| Large datasets (millions of rows) | Cursor |
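An opaque cursor is usually just an encoded keyset position; the example cursor above, `eyJpZCI6MTIzfQ`, is base64 for `{"id":123}`. A minimal sketch of issuing and consuming such cursors, assuming integer primary keys and an illustrative `db` helper:

```python
import base64
import json

def encode_cursor(last_id):
    return base64.urlsafe_b64encode(json.dumps({'id': last_id}).encode()).decode()

def decode_cursor(cursor):
    return json.loads(base64.urlsafe_b64decode(cursor))['id']

def list_users(limit=20, after=None):
    last_id = decode_cursor(after) if after else 0

    # Keyset query: seek past the cursor instead of OFFSET-scanning
    rows = db.execute(
        "SELECT * FROM users WHERE id > %s ORDER BY id LIMIT %s",
        (last_id, limit + 1),  # fetch one extra row to compute hasMore
    )

    has_more = len(rows) > limit
    rows = rows[:limit]
    next_cursor = encode_cursor(rows[-1]['id']) if has_more else None
    return {'data': rows,
            'pagination': {'nextCursor': next_cursor, 'hasMore': has_more}}
```

Because the query seeks on the indexed key, page N+1 costs the same as page 1, which is why the choice matrix favors cursors for large datasets.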
## Filtering and Sorting

### Query Parameter Conventions

```
GET /users?status=active&role=admin          # Simple filtering
GET /users?createdAfter=2024-01-01           # Date filtering
GET /users?search=john                       # Full-text search
GET /users?sort=createdAt&order=desc         # Sorting
GET /users?sort=-createdAt                   # Alternative: prefix for descending
GET /users?fields=id,name,email              # Sparse fieldsets
GET /users?include=orders,profile            # Relationship inclusion
```

### Advanced Filtering Patterns

**LHS Brackets (Rails-style)**:

```
GET /users?filter[status]=active&filter[role]=admin
```

**RHS Colon (JSON API style)**:

```
GET /users?filter=status:active,role:admin
```

**Comparison operators**:

```
GET /products?price[gte]=100&price[lte]=500   # Price between 100-500
GET /users?createdAt[gt]=2024-01-01           # Created after date
```

## API Versioning Strategies

### 1. URI Versioning

```
GET /v1/users
GET /v2/users
```

**Pros**: Explicit, easy to route, clear in logs
**Cons**: Violates REST principles (resource identity changes), URL proliferation

**Best for**: Public APIs, major breaking changes

### 2. Header Versioning

```
GET /users
Accept: application/vnd.myapi.v2+json
```

**Pros**: Clean URLs, follows REST principles
**Cons**: Less visible, harder to test in browser

**Best for**: Internal APIs, clients with header control

### 3. Query Parameter Versioning

```
GET /users?version=2
```

**Pros**: Easy to test, optional (can default to latest)
**Cons**: Pollutes query parameters, not semantic

**Best for**: Minor version variants, opt-in features

### Version Deprecation Process

1. **Announce**: Document deprecation timeline (6-12 months recommended)
2. **Warn**: Add `Deprecated` header to responses
3. **Sunset**: Add `Sunset` header with end date (RFC 8594)
4. **Migrate**: Provide migration guides and tooling
5. **Remove**: After sunset date, return 410 Gone

```
HTTP/1.1 200 OK
Deprecated: true
Sunset: Sat, 31 Dec 2024 23:59:59 GMT
Link: <https://api.example.com/v2/users>; rel="successor-version"
```

## Error Response Format

**Standard JSON error format**:

```json
{
  "error": {
    "code": "VALIDATION_ERROR",
    "message": "One or more fields failed validation",
    "details": [
      {
        "field": "email",
        "message": "Invalid email format",
        "code": "INVALID_FORMAT"
      },
      {
        "field": "age",
        "message": "Must be at least 18",
        "code": "OUT_OF_RANGE"
      }
    ],
    "requestId": "req_abc123",
    "timestamp": "2024-11-14T10:30:00Z"
  }
}
```

**Problem Details (RFC 7807)**:

```json
{
  "type": "https://api.example.com/errors/validation-error",
  "title": "Validation Error",
  "status": 400,
  "detail": "The request body contains invalid data",
  "instance": "/users",
  "invalid-params": [
    {
      "name": "email",
      "reason": "Invalid email format"
    }
  ]
}
```

## HATEOAS (Hypermedia)

**Level 3 REST includes hypermedia links**:

```json
{
  "id": 123,
  "name": "John Doe",
  "status": "active",
  "_links": {
    "self": { "href": "/users/123" },
    "orders": { "href": "/users/123/orders" },
    "deactivate": {
      "href": "/users/123/deactivate",
      "method": "POST"
    }
  }
}
```

**Benefits**:
- Self-documenting API
- Clients discover available actions
- Server controls workflow
- Reduces client-server coupling

**Tradeoffs**:
- Increased response size
- Complexity for simple APIs
- Limited client library support

**When to use**: Complex workflows, long-lived APIs, discoverability requirements

## Idempotency Keys

**For POST operations that should be safely retryable**:

```
POST /orders
Idempotency-Key: key_abc123xyz

{
  "items": [...],
  "total": 99.99
}
```

**Server behavior**:
1. First request with key → Process and store result
2. Duplicate request with same key → Return stored result (do not reprocess)
3. Different request with same key → Return 409 Conflict

**Implementation**:

```python
import json

from fastapi import Header
from fastapi.responses import JSONResponse

@app.post("/orders")
def create_order(order: Order, idempotency_key: str = Header(None)):
    if idempotency_key:
        # Check if key was used before
        cached = redis.get(f"idempotency:{idempotency_key}")
        if cached:
            return JSONResponse(content=json.loads(cached), status_code=200)

    # Process order
    result = process_order(order)

    if idempotency_key:
        # Cache result for 24 hours (store as JSON string)
        redis.setex(f"idempotency:{idempotency_key}", 86400, json.dumps(result))

    return result
```

## API Evolution Patterns

### Adding Fields (Non-Breaking)

**✅ Safe changes**:
- Add optional request fields
- Add response fields
- Add new endpoints
- Add new query parameters

**Client requirements**: Ignore unknown fields

### Removing Fields (Breaking)

**Strategies**:
1. **Deprecation period**: Mark field as deprecated, remove in next major version
2. **Versioning**: Create v2 without field
3.
**Optional → Required**: Never safe, always breaking + +### Changing Field Types (Breaking) + +**❌ Breaking**: +- String → Number +- Number → String +- Boolean → String +- Flat → Nested object + +**✅ Non-breaking**: +- Number → String (if client coerces) +- Adding nullability (required → optional) + +**Strategy**: Add new field with correct type, deprecate old field + +## Richardson Maturity Model + +| Level | Description | Example | +|-------|-------------|---------| +| 0 | POX (Plain Old XML) | Single endpoint, all operations via POST | +| 1 | Resources | Multiple endpoints, still using POST for everything | +| 2 | HTTP Verbs | Proper HTTP methods (GET, POST, PUT, DELETE) | +| 3 | Hypermedia (HATEOAS) | Responses include links to related resources | + +**Most APIs target Level 2** (HTTP verbs + status codes). +**Level 3 is optional** but valuable for complex domains. + +## Common Anti-Patterns + +| Anti-Pattern | Why Bad | Fix | +|--------------|---------|-----| +| Verbs in URLs (`/createUser`) | Not RESTful, redundant with HTTP methods | Use POST /users | +| GET with side effects | Violates HTTP semantics, not safe | Use POST/PUT/DELETE | +| POST for everything | Loses HTTP semantics, not idempotent | Use appropriate method | +| 200 for errors | Breaks HTTP contract | Use correct 4xx/5xx codes | +| Deeply nested URLs | Hard to navigate, brittle | Max 2 levels, use query params | +| Binary response flags | Unclear semantics | Use proper HTTP status codes | +| Timestamps without timezone | Ambiguous | Use ISO 8601 with timezone | +| Pagination without total | Can't show "Page X of Y" | Include total count or hasMore | + +## Best Practices Checklist + +**Resource Design**: +- [ ] Resources are nouns, not verbs +- [ ] Plural names for collections +- [ ] Max 2 levels of nesting +- [ ] Consistent naming conventions (snake_case or camelCase) + +**HTTP Semantics**: +- [ ] Correct HTTP methods for operations +- [ ] Proper status codes (not just 200/500) +- [ ] Idempotent operations are actually idempotent +- [ ] GET/HEAD have no side effects + +**API Evolution**: +- [ ] Versioning strategy defined +- [ ] Backward compatibility maintained within version +- [ ] Deprecation headers for sunset features +- [ ] Migration guides for breaking changes + +**Error Handling**: +- [ ] Consistent error response format +- [ ] Detailed field-level validation errors +- [ ] Request IDs for tracing +- [ ] Human-readable error messages + +**Performance**: +- [ ] Pagination for large collections +- [ ] ETags for caching +- [ ] Gzip compression enabled +- [ ] Rate limiting implemented + +## Cross-References + +**Related skills**: +- **GraphQL alternative** → `graphql-api-design` +- **FastAPI implementation** → `fastapi-development` +- **Django implementation** → `django-development` +- **Express implementation** → `express-development` +- **Authentication** → `api-authentication` +- **API testing** → `api-testing` +- **API documentation** → `api-documentation` or `muna-technical-writer` +- **Security** → `ordis-security-architect` (OWASP API Security) + +## Further Reading + +- **REST Dissertation**: Roy Fielding's original thesis +- **RFC 7807**: Problem Details for HTTP APIs +- **RFC 8594**: Sunset HTTP Header +- **JSON:API**: Opinionated REST specification +- **OpenAPI 3.0**: API documentation standard