commit 57bed020cd8f97a08d8bbb06170197c3c62286dc Author: Zhongwei Li Date: Sun Nov 30 08:59:40 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..f3ed9cc --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "muna-technical-writer", + "description": "Documentation structure, clarity, security-aware docs - 9 technical writing skills", + "version": "1.0.1", + "author": { + "name": "tachyon-beep", + "url": "https://github.com/tachyon-beep" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..0534eda --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# muna-technical-writer + +Documentation structure, clarity, security-aware docs - 9 technical writing skills diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..8649dab --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,77 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:tachyon-beep/skillpacks:plugins/muna-technical-writer", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "534b811e66088b5e4b97f90b62ad89bb6cd7bcc8", + "treeHash": "0d57a33a65df0bc0d6eb7de36bbe0c109c14ed363a4c1a180dc0866d8da8e59e", + "generatedAt": "2025-11-28T10:28:32.765889Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "muna-technical-writer", + "description": "Documentation structure, clarity, security-aware docs - 9 technical writing skills", + "version": "1.0.1" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "58fe4d567cd1f6f79fbd5454e92455af3571bf9bf6317d5e3011eb6c36ded7f3" + }, + { + "path": 
".claude-plugin/plugin.json", + "sha256": "258c4fca8990f086fc0e4c13062dbdc2fb950f43cdf9943851f06f55f4e6934b" + }, + { + "path": "skills/using-technical-writer/documentation-testing.md", + "sha256": "7f13d9c20752b8417b17bb11e717374c3e7016dc220e4c617d15c76f9727f4c0" + }, + { + "path": "skills/using-technical-writer/clarity-and-style.md", + "sha256": "8f3895cdc455bf1978c44b1249c679546dd40e05d3adae2dac8c1ceba806e4b7" + }, + { + "path": "skills/using-technical-writer/itil-and-governance-documentation.md", + "sha256": "7737337796c54dc84b073ab1d35de0e979072be0e4a41bb1f51e51f317b864e2" + }, + { + "path": "skills/using-technical-writer/documentation-structure.md", + "sha256": "6bbc0b0a171ffbd290752dc56e817f88e3342e9ff06d5239dc370bcff6ee1a75" + }, + { + "path": "skills/using-technical-writer/security-aware-documentation.md", + "sha256": "f93d990e39cc53f27f36c490a6d1cead54de6fa2ae7c6ae189e5218692c5742e" + }, + { + "path": "skills/using-technical-writer/diagram-conventions.md", + "sha256": "84b49d47ac9c85f4dc59e774b24837efbcdddadc3febd5ab7de0faca7a87f283" + }, + { + "path": "skills/using-technical-writer/SKILL.md", + "sha256": "e1800e03e43fd6363cee17d678033b58ec7f7274b2e755a78b25563771ee1532" + }, + { + "path": "skills/using-technical-writer/operational-acceptance-documentation.md", + "sha256": "5d583ebcba1f0861c4a90da4f3a614af9fcb215db5ad5beb5f0b872e9e5ce6b1" + }, + { + "path": "skills/using-technical-writer/incident-response-documentation.md", + "sha256": "93279518a5a86e2cf6febb06ac0048b9c1d110df33216acd27cfcaea27f24b37" + } + ], + "dirSha256": "0d57a33a65df0bc0d6eb7de36bbe0c109c14ed363a4c1a180dc0866d8da8e59e" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/using-technical-writer/SKILL.md b/skills/using-technical-writer/SKILL.md new file mode 100644 index 0000000..c166b28 --- /dev/null +++ b/skills/using-technical-writer/SKILL.md @@ -0,0 +1,443 @@ +--- +name: using-technical-writer 
+description: Router for documentation tasks - routes to ADRs, APIs, runbooks, security docs, or governance docs +mode: true +--- + +# Using Technical Writer + +## Overview + +This meta-skill routes you to the right technical writing skills based on your documentation task. Load this skill when you need to create, improve, or organize documentation but aren't sure which specific writing skill to use. + +**Core Principle**: Different document types and audiences require different skills. Match your situation to the appropriate skill, load only what you need. + +## When to Use + +Load this skill when: +- Starting any documentation task +- User mentions: "document", "write docs", "README", "API docs", "ADR", "runbook" +- Creating technical content for any audience +- Improving or reorganizing existing documentation + +**Don't use for**: Non-technical writing (marketing copy, blog posts, creative content) + +## Routing by Document Type + +### Architecture Decisions (ADRs) + +**Symptoms**: "Document why we chose X", "Record this architectural decision", "Explain technology choice" + +**Route to**: [documentation-structure.md](documentation-structure.md) + +**Key Pattern**: ADRs document Context → Decision → Consequences + +**Example**: "Document why we chose PostgreSQL over MongoDB" → Load [documentation-structure.md](documentation-structure.md) + +--- + +### API Documentation + +**Symptoms**: "Document this API", "Create API reference", "Explain endpoints" + +**Route to**: +1. [documentation-structure.md](documentation-structure.md) (API reference pattern) +2. [clarity-and-style.md](clarity-and-style.md) (examples, precision) + +**Example**: "Document REST API for user management" → Load both skills + +--- + +### Runbooks & Procedures + +**Symptoms**: "Write deployment procedure", "Create incident runbook", "Document how to..." + +**Route to**: +1. [documentation-structure.md](documentation-structure.md) (runbook pattern) +2. 
[clarity-and-style.md](clarity-and-style.md) (step-by-step clarity) + +**Example**: "Create deployment runbook" → Load both skills + +--- + +### README Files + +**Symptoms**: "Add a README", "Quick start guide", "Installation instructions" + +**For Complex Projects**: +Route to: [documentation-structure.md](documentation-structure.md) (README pattern) + +**For Simple Utilities**: +Route to: **NONE** - basic technical writing sufficient + +**Decision Point**: Complex (>100 lines, multiple features, deployment) vs Simple (script, single function) + +--- + +### Security Documentation + +**Symptoms**: "Document threat model", "Security controls", "Document security decisions" + +**Route to (Cross-Faction)**: +1. `ordis/security-architect/documenting-threats-and-controls` (security content) +2. [documentation-structure.md](documentation-structure.md) (ADR format, organization) +3. [clarity-and-style.md](clarity-and-style.md) (explain to non-experts) + +**Key Insight**: Security docs need BOTH content expertise (Ordis) AND writing skills (Muna) + +**Example**: "Document authentication security decisions" → Load all three skills + +--- + +## Routing by Audience + +### Developer Audience + +**What They Need**: Architecture diagrams, code examples, API references, technical depth + +**Route to**: +- [documentation-structure.md](documentation-structure.md) (architecture docs, API patterns) +- [diagram-conventions.md](diagram-conventions.md) (system diagrams, data flows) +- [clarity-and-style.md](clarity-and-style.md) (concrete examples, precision) + +**Example**: "Write docs for internal developers" → Load all three + +--- + +### Operator Audience + +**What They Need**: Runbooks, troubleshooting, deployment procedures, configuration guides + +**Route to**: +- [documentation-structure.md](documentation-structure.md) (runbook pattern) +- [clarity-and-style.md](clarity-and-style.md) (step-by-step, scannable) + +**Example**: "Create SRE runbook" → Load both skills + +--- + +### 
Executive Audience + +**What They Need**: High-level summaries, business impact, risks, costs (minimal technical detail) + +**Route to**: +- [clarity-and-style.md](clarity-and-style.md) (progressive disclosure, audience adaptation) + +**Example**: "Executive summary of migration plan" → Load [clarity-and-style.md](clarity-and-style.md) only + +--- + +### Mixed/Public Audience + +**What They Need**: Progressive disclosure (quick start → advanced topics), multiple entry points + +**Route to**: +- [documentation-structure.md](documentation-structure.md) (README pattern, API docs) +- [clarity-and-style.md](clarity-and-style.md) (progressive disclosure, audience adaptation) +- [diagram-conventions.md](diagram-conventions.md) (high-level overviews) + +**Example**: "Public documentation for open-source project" → Load all three + +--- + +## Cross-Faction Documentation + +### Security + Documentation + +**When**: Documenting threat models, security controls, classified systems, compliance + +**Route to**: +- `ordis/security-architect/documenting-threats-and-controls` +- `ordis/security-architect/threat-modeling` (for threat content) +- [documentation-structure.md](documentation-structure.md) (for organization) +- [security-aware-documentation.md](security-aware-documentation.md) (if handling sensitive info) + +**Example**: "Document MLS security architecture" → Load all four skills + +--- + +### Compliance + Documentation + +**When**: Audit documentation, SSP/SAR, compliance mappings + +**Route to**: +- `ordis/security-architect/compliance-awareness-and-mapping` (compliance content) +- [operational-acceptance-documentation.md](operational-acceptance-documentation.md) (SSP/SAR structure) +- [documentation-structure.md](documentation-structure.md) (organization) + +**Example**: "Create SOC2 compliance documentation" → Load all three skills + +--- + +### Incident Response + Documentation + +**When**: Post-mortem reports, incident runbooks, response procedures + +**Route to**: 
+- [incident-response-documentation.md](incident-response-documentation.md) (incident-specific patterns) +- [documentation-structure.md](documentation-structure.md) (runbook pattern) +- [clarity-and-style.md](clarity-and-style.md) (clarity under pressure) + +**Example**: "Write post-mortem for outage" → Load all three skills + +--- + +## Documentation Workflow + +### Standard Flow + +``` +1. Determine document type → Route to structure skill +2. Write content → Apply clarity/style skill +3. Add diagrams if needed → Use diagram conventions +4. Test documentation → Use documentation-testing +``` + +### Quick Reference + +| Task | Load | +|------|------| +| "Why did we choose X?" | documentation-structure (ADR) | +| "Document API" | documentation-structure + clarity-and-style | +| "Deployment runbook" | documentation-structure + clarity-and-style | +| "README for utility" | NONE (simple) or documentation-structure (complex) | +| "Security docs" | documenting-threats + documentation-structure + clarity | +| "Developer guide" | documentation-structure + diagram-conventions + clarity | +| "Executive summary" | clarity-and-style only | + +--- + +## When NOT to Load Documentation Skills + +**Don't load skills for**: +- Simple utility README (<50 lines, single purpose, obvious usage) +- Code comments (use standard practices) +- Commit messages (use conventional commits) +- Chat/email (conversational writing) +- First drafts where you're exploring (capture ideas first, structure later) + +**Example**: "Add README to hello-world script" → No special skills needed + +--- + +## Core vs Extension Skills + +### Core Skills (Universal - Any Project) + +Use for **any** project: +- [documentation-structure.md](documentation-structure.md) - ADRs, APIs, runbooks, READMEs, architecture docs +- [clarity-and-style.md](clarity-and-style.md) - Active voice, concrete examples, audience adaptation +- [diagram-conventions.md](diagram-conventions.md) - System diagrams, data flows, architecture 
visuals +- [documentation-testing.md](documentation-testing.md) - Verify docs are accurate, complete, findable + +### Extension Skills (Specialized Contexts) + +Use **only** when context requires: +- [security-aware-documentation.md](security-aware-documentation.md) - Sanitizing examples with sensitive data, classification marking +- [incident-response-documentation.md](incident-response-documentation.md) - Post-mortems, incident runbooks, RCA templates +- [itil-and-governance-documentation.md](itil-and-governance-documentation.md) - ITIL processes, change management, governance frameworks +- [operational-acceptance-documentation.md](operational-acceptance-documentation.md) - SSP, SAR, POA&M for government authorization + +**Decision**: If unsure whether context is "specialized", start with core skills. Specialized needs will be explicit. + +--- + +## Common Routing Patterns + +### Pattern 1: ADR for Architecture Decision + +``` +User: "We chose to use REST instead of GraphQL. Document this." +You: Loading [documentation-structure.md](documentation-structure.md) (ADR pattern) +``` + +### Pattern 2: API Documentation + +``` +User: "Document our user management API." +You: Loading [documentation-structure.md](documentation-structure.md) + [clarity-and-style.md](clarity-and-style.md) +``` + +### Pattern 3: Security Documentation (Cross-Faction) + +``` +User: "Document the threat model for authentication." +You: Loading ordis/security-architect/documenting-threats-and-controls + + [documentation-structure.md](documentation-structure.md) + + [clarity-and-style.md](clarity-and-style.md) +``` + +### Pattern 4: Simple README + +``` +User: "Add README to this backup script." +You: [Check script complexity] + Simple utility → No skills needed + OR + Complex tool → Loading [documentation-structure.md](documentation-structure.md) +``` + +### Pattern 5: Operator Runbook + +``` +User: "Create runbook for database failover." 
+You: Loading [documentation-structure.md](documentation-structure.md) (runbook) + + [clarity-and-style.md](clarity-and-style.md) (step-by-step clarity) +``` + +--- + +## Decision Tree + +``` +Starting documentation task? +├─ What type? +│ ├─ Architecture decision → documentation-structure (ADR) +│ ├─ API documentation → documentation-structure + clarity-and-style +│ ├─ Runbook/procedure → documentation-structure + clarity-and-style +│ ├─ README → Complex? documentation-structure : None +│ └─ Security/compliance → Cross-faction (Ordis + Muna) +│ +├─ Who's the audience? +│ ├─ Developers → Add diagram-conventions +│ ├─ Operators → Focus on runbook patterns +│ ├─ Executives → clarity-and-style only (progressive disclosure) +│ └─ Mixed → All core skills +│ +└─ Specialized context? + ├─ Sensitive data → ADD: security-aware-documentation + ├─ Incident response → ADD: incident-response-documentation + ├─ Government/compliance → ADD: operational-acceptance-documentation + └─ None → Core skills sufficient +``` + +--- + +## Quick Reference Table + +| Document Type | Primary Skill | Additional Skills | Cross-Faction? 
| +|---------------|---------------|-------------------|----------------| +| **ADR** | documentation-structure | clarity-and-style | No | +| **API docs** | documentation-structure | clarity-and-style | No | +| **Runbook** | documentation-structure | clarity-and-style | No | +| **README (complex)** | documentation-structure | clarity-and-style, diagram-conventions | No | +| **README (simple)** | NONE | NONE | No | +| **Security docs** | documenting-threats-and-controls | documentation-structure, clarity-and-style | **Yes (Ordis)** | +| **Compliance** | operational-acceptance-documentation | documentation-structure | **Yes (Ordis)** | +| **Developer guide** | documentation-structure | diagram-conventions, clarity-and-style | No | +| **Operator guide** | documentation-structure | clarity-and-style | No | +| **Executive summary** | clarity-and-style | NONE | No | +| **Post-mortem** | incident-response-documentation | documentation-structure, clarity-and-style | No | + +--- + +## Common Mistakes + +### ❌ Loading All Skills for Every Task +**Wrong**: Load all 8 Muna skills for every documentation task +**Right**: Load only skills your situation needs (use decision tree) + +### ❌ Missing Cross-Faction Needs +**Wrong**: Document security with only Muna skills (missing security content expertise) +**Right**: Load Ordis skills for content + Muna skills for structure/clarity + +### ❌ Over-Engineering Simple Docs +**Wrong**: Load documentation-structure for 10-line README +**Right**: Simple docs don't need special skills (just write clearly) + +### ❌ Not Considering Audience +**Wrong**: Same documentation for developers and executives +**Right**: Adapt content and skills based on audience needs + +--- + +## Examples + +### Example 1: Documenting Database Choice + +``` +User: "We decided on PostgreSQL. Document why." + +Your routing: +1. Recognize: Architecture decision → ADR format +2. Load: [documentation-structure.md](documentation-structure.md) +3. 
Create: ADR with Context, Decision, Consequences +``` + +### Example 2: Security Threat Model Documentation + +``` +User: "Document the threat model for our API gateway." + +Your routing: +1. Recognize: Security content (need Ordis) + Documentation (need Muna) +2. Load: ordis/security-architect/documenting-threats-and-controls (threats content) +3. Load: [documentation-structure.md](documentation-structure.md) (ADR for security decisions) +4. Load: [clarity-and-style.md](clarity-and-style.md) (explain to non-security team) +5. Create: Threat model doc with STRIDE analysis + mitigations + clear explanations +``` + +### Example 3: Simple Utility README + +``` +User: "Add README to this file-copy script." + +Your routing: +1. Recognize: Simple utility (single function, obvious usage) +2. Decision: No special skills needed +3. Create: Basic README with usage example, no complex structure +``` + +--- + +## Phase 1 Note + +**Currently Available** (Phase 1): +- ✅ `using-technical-writer` (this skill) +- ✅ `documentation-structure` (in progress) + +**Coming Soon** (Phases 2-3): +- `clarity-and-style` +- `diagram-conventions` +- `documentation-testing` +- `security-aware-documentation` +- `incident-response-documentation` +- `itil-and-governance-documentation` +- `operational-acceptance-documentation` + +**For Phase 1**: Focus on documentation-structure as primary skill. Reference other skills by name even though not implemented yet - this tests routing logic. + +--- + +## Summary + +**This skill maps documentation tasks → specific writing skills to load.** + +1. Identify document type (ADR, API, runbook, README, security) +2. Use decision tree to find applicable skills +3. Load core skills for universal needs +4. Add extension skills for specialized contexts +5. Cross-reference Ordis for security/compliance content +6. 
Don't load skills when not needed (simple docs) + +**Meta-rule**: When in doubt about document type, start with [documentation-structure.md](documentation-structure.md) - it covers most common patterns (ADR, API, runbook, README). + +--- + +## Technical Writer Specialist Skills Catalog + +After routing, load the appropriate specialist skill for detailed guidance: + +### Core Skills (Universal) + +1. [documentation-structure.md](documentation-structure.md) - ADR format, API reference patterns, runbook templates, README structure, architecture documentation +2. [clarity-and-style.md](clarity-and-style.md) - Active voice, concrete examples, progressive disclosure, audience adaptation, step-by-step clarity +3. [diagram-conventions.md](diagram-conventions.md) - System diagrams, data flow visuals, architecture overviews, C4 model, consistent notation +4. [documentation-testing.md](documentation-testing.md) - Verify accuracy, completeness, findability, test examples, validate links + +### Extension Skills (Specialized Contexts) + +5. [security-aware-documentation.md](security-aware-documentation.md) - Sanitizing sensitive data, classification marking, redacting examples, security-conscious writing +6. [incident-response-documentation.md](incident-response-documentation.md) - Post-mortem templates, incident runbooks, RCA structure, timeline documentation +7. [itil-and-governance-documentation.md](itil-and-governance-documentation.md) - ITIL processes, change management, governance frameworks, policy documentation +8. 
[operational-acceptance-documentation.md](operational-acceptance-documentation.md) - SSP/SAR structure, POA&M templates, government authorization, compliance artifacts diff --git a/skills/using-technical-writer/clarity-and-style.md b/skills/using-technical-writer/clarity-and-style.md new file mode 100644 index 0000000..14f2a1f --- /dev/null +++ b/skills/using-technical-writer/clarity-and-style.md @@ -0,0 +1,559 @@ + +# Clarity and Style + +## Overview + +Write documentation that readers can **immediately act on**. Core principle: Every abstract concept needs a concrete, runnable example. Every audience needs information in their language. + +**Key insight**: Good writing = easy scanning + clear actions + adapted to reader's context. + +## When to Use + +Load this skill when: +- Writing documentation (README, API docs, runbooks, ADRs) +- Reviewing documentation for clarity +- Explaining technical concepts to different audiences +- Creating user guides or tutorials + +**Symptoms you need this**: +- Documentation says "configure appropriately" without showing how +- Passive voice everywhere ("tests should be run" vs "run tests") +- Same explanation for developers and executives +- Wall-of-text paragraphs without headings +- Jargon without definitions + +**Don't use for**: +- Code comments (use standard practices) +- Commit messages (use conventional commits) +- Chat/email (conversational style different) + +## Core Patterns + +### Pattern 1: Active Voice (Who Does What) + +**Rule**: Subject performs action directly. "X does Y", not "Y is done by X". 
+ +| Passive (❌) | Active (✅) | +|-------------|-----------| +| "The token is validated by the system" | "The system validates the token" | +| "Tests should be run with pytest" | "Run tests with pytest" | +| "The configuration file is read at startup" | "The application reads the config file at startup" | +| "Rate limiting can be configured" | "Configure rate limiting with environment variables" | +| "Errors are logged to CloudWatch" | "The service logs errors to CloudWatch" | + +**Why**: Active voice shows WHO/WHAT does the action, making responsibilities clear. + +**Common passive constructions to avoid**: +- "should be done" → "do X" +- "is processed by" → "X processes" +- "can be configured" → "configure X with Y" +- "is validated" → "the service validates" + +**When passive is okay**: When actor is unknown or irrelevant: +- "The server was compromised" (attacker unknown) +- "The file was deleted" (focus on state change, not actor) + + +### Pattern 2: Concrete Examples (Show, Don't Tell) + +**Rule**: Every instruction needs a runnable example. Never say "configure" without showing exact config. + +| Abstract (❌) | Concrete (✅) | +|--------------|-------------| +| "Set the timeout appropriately" | "Set `API_TIMEOUT=30` in `.env` for 30-second timeout" | +| "Configure the database connection" | "Set `DATABASE_URL=postgresql://user:pass@localhost:5432/dbname`" | +| "Run the tests" | "Run `pytest tests/ -v` from project root" | +| "Increase the rate limit" | "Set `RATE_LIMIT=1000` (requests per hour) in `config.yml`" | +| "Handle errors properly" | "Wrap API calls in try/except and log to CloudWatch:\n```python\ntry:\n response = api.call()\nexcept APIError as e:\n logger.error(f\"API failed: {e}\")\n```" | + +**Pattern**: [Abstract concept] + [Concrete example] + [Expected outcome] + +**Example**: +```markdown +Configure rate limiting (concept) by setting `RATE_LIMIT=1000` in `.env` (example). +The API will reject requests after 1000/hour per client (outcome). 
+``` + +**When to provide examples**: +- Commands to run +- Config values to set +- API calls to make +- Error messages you'll see +- File paths to check + +**Example formats**: +- Code blocks for commands: `` `pytest tests/` `` +- File snippets for config +- API request/response pairs +- Before/after comparisons + + +### Pattern 3: Progressive Disclosure (Essentials First, Details On-Demand) + +**Rule**: Start with minimum viable information. Provide detail progressively, not all at once. + +**Structure**: +``` +1. One-sentence summary (what it is) +2. Minimal quick start (get started in <5 min) +3. Common use cases (cover 80% of users) +4. Advanced topics (expandable sections or separate pages) +5. Complete reference (link to API docs/spec) +``` + +**Example: Rate Limiting Documentation** + +❌ **Bad (Everything Upfront)**: +```markdown +# Rate Limiting + +Rate limiting is implemented using a token bucket algorithm with distributed +state management via Redis Cluster. The system tracks requests per client using +API keys extracted from the Authorization header or IP addresses for unauthenticated +requests. Limits are enforced using sliding windows with configurable window sizes +(1 hour, 1 day, 1 month) and multiple tiers (Free: 100/hour, Pro: 10k/hour, +Enterprise: custom). When limits are exceeded, the API returns 429 with Retry-After +header calculated based on the token bucket refill rate. The system supports +distributed deployments with eventual consistency guarantees and graceful degradation +when Redis is unavailable by falling back to in-memory rate limiting... +``` + +✅ **Good (Progressive Disclosure)**: +```markdown +# Rate Limiting + +The API limits requests to prevent abuse. Free tier: 100 requests/hour. + +## Quick Start + +Check your remaining quota: +\`\`\`bash +curl -i https://api.example.com/status +# See X-RateLimit-Remaining header +\`\`\` + +## What Happens When Limited + +API returns 429 status. 
Wait time shown in `Retry-After` header (seconds). + +## Rate Limit Tiers + +| Tier | Limit | Use Case | +|------|-------|----------| +| Free | 100/hour | Development | +| Pro | 10k/hour | Production | +| Enterprise | Custom | High volume | + +<details>
+<summary>Advanced: How It Works</summary> + +Uses token bucket algorithm with Redis. Sliding windows, distributed state... +[Technical details here] +</details>
+ +<details>
+<summary>Advanced: Custom Limits</summary> + +Contact sales@example.com for Enterprise tier with negotiated limits... +</details>
+``` + +**Benefits**: +- New users get started in 30 seconds (check header) +- 80% of users find answer in main sections (tiers, what happens when limited) +- 20% power users access advanced details (expandable) +- Nobody is overwhelmed with token buckets upfront + + +### Pattern 4: Audience Adaptation (Write for Your Reader) + +**Rule**: Same information, different framing for different audiences. + +**Three primary audiences**: + +#### Developer Audience +**What they need**: HOW it works (architecture, APIs, code examples, data flows) + +**Style**: +- Technical precision +- Code examples first +- Architecture diagrams +- API reference details +- Error codes and debugging + +**Example**: +```markdown +## Authentication (For Developers) + +API uses JWT with RS256 signing. + +**Request**: +\`\`\`bash +curl -H "Authorization: Bearer eyJhbG..." https://api.example.com/users +\`\`\` + +**Token structure**: +\`\`\`json +{ + "sub": "user_12345", + "scope": ["read:users", "write:posts"], + "exp": 1730145600 +} +\`\`\` + +**Validation process**: +1. Extract token from Authorization header +2. Verify signature using public key (fetch from `/keys`) +3. Check expiration (`exp` claim > current time) +4. Verify scopes match endpoint requirements + +**Errors**: +- 401: Invalid signature or expired token +- 403: Valid token but insufficient scopes +``` + + +#### Operator Audience +**What they need**: HOW to run it (deployment, configuration, monitoring, troubleshooting) + +**Style**: +- Step-by-step procedures +- Config file examples +- Monitoring queries +- Troubleshooting checklists +- Runbooks for incidents + +**Example**: +```markdown +## Authentication (For Operators) + +### Deployment + +1. Generate RSA keypair: + \`\`\`bash + ssh-keygen -t rsa -b 4096 -f jwt-key + \`\`\` + +2. Set environment variables: + \`\`\`bash + JWT_PUBLIC_KEY_PATH=/etc/app/jwt-key.pub + JWT_ALGORITHM=RS256 + TOKEN_EXPIRY_SECONDS=3600 + \`\`\` + +3. 
Restart service: + \`\`\`bash + systemctl restart api-service + \`\`\` + +### Monitoring + +**Alert on high auth failures**: +\`\`\`promql +rate(auth_failures_total[5m]) > 10 +\`\`\` + +### Troubleshooting + +**Symptom**: All requests returning 401 + +**Check**: +1. Public key readable? `ls -la /etc/app/jwt-key.pub` +2. Service logs: `journalctl -u api-service | grep JWT` +3. Key format correct? Should be PEM format, not binary +``` + + +#### Executive Audience +**What they need**: WHY it matters (business value, risks, costs, timelines) + +**Style**: +- High-level summaries (no technical jargon) +- Business impact (revenue, risk, customer satisfaction) +- Costs and ROI +- Timeline and milestones +- No implementation details + +**Example**: +```markdown +## Authentication (For Executives) + +### Business Impact + +**Security**: JWT authentication prevents unauthorized access to customer data, +reducing breach risk and regulatory liability. + +**Cost**: Industry-standard implementation, no licensing fees. Scales to millions +of users with existing infrastructure. + +**Customer Experience**: Users stay logged in for 1 hour without re-authentication, +reducing friction while maintaining security. + +### Risk Mitigation + +- **Before**: API keys in URLs, logged in plaintext, exposed in browser history +- **After**: Short-lived tokens, signed cryptographically, revocable + +### Timeline + +- Implementation: 2 weeks +- Migration: 1 week (parallel run with old system) +- Full rollout: 1 week + +**Investment**: 4 engineering weeks ($40k) +**Risk reduction**: Avoid potential $2M+ breach costs (industry average) +``` + + +### Pattern 5: Precision Without Jargon + +**Rule**: Be technically accurate using accessible language. Define acronyms on first use. 
+ +| Jargon-Heavy (❌) | Precise & Clear (✅) | +|------------------|---------------------| +| "Utilize the ingress controller to facilitate external traffic ingress" | "Use the ingress controller to route external traffic into the cluster" | +| "Implement idempotency semantics" | "Make requests safe to retry - calling twice produces same result" | +| "Leverage the ORM abstraction layer" | "Use the ORM (Object-Relational Mapping) to query the database with Python code instead of SQL" | +| "Instantiate a singleton factory pattern" | "Create one shared instance that all code uses (singleton pattern)" | + +**Pattern for acronyms**: Full term (Acronym) on first use, acronym thereafter. + +**Examples**: +- First use: "JWT (JSON Web Token)" +- Later: "JWT" +- First use: "SLA (Service Level Agreement)" +- Later: "SLA" + +**Simplification checklist**: +- Replace "utilize" → "use" +- Replace "facilitate" → "help" or "enable" +- Replace "instantiate" → "create" +- Replace "leverage" → "use" +- Define domain terms: "idempotency means safe to retry" + +**When jargon is okay**: When writing for technical audience and term is standard. +- "JWT" in developer docs (industry standard) +- "Kubernetes Pod" in operator docs (specific technical concept) +- "SQL injection" in security docs (precise attack name) + + +### Pattern 6: Scannable Structure + +**Rule**: Use headings, bullets, tables, code blocks. Make key information findable in <10 seconds. 
+ +**Scannable elements**: +- ✅ Headings (H2, H3) for sections +- ✅ Bullet points for lists +- ✅ Tables for comparisons +- ✅ Code blocks for commands/examples +- ✅ **Bold** for key terms (use sparingly) +- ✅ Short paragraphs (3-5 sentences max) + +**Anti-patterns**: +- ❌ Wall-of-text paragraphs (>10 lines) +- ❌ **Everything in bold** (loses emphasis) +- ❌ No headings (can't scan) +- ❌ Inline code for long examples (use blocks) + +**Example: Scannable vs Not** + +❌ **Not Scannable**: +```markdown +When you need to deploy the application you should first make sure that Docker +is installed on your system and then you need to clone the repository from GitHub +and after that you should copy the .env.example file to .env and edit it to set +your database credentials and API keys and then you can run docker-compose up -d +to start the containers in detached mode and then wait for the database to +initialize which usually takes about 30 seconds and then you can run the migrations +with docker-compose exec app python manage.py migrate... +``` + +✅ **Scannable**: +```markdown +## Deployment Steps + +### Prerequisites +- Docker installed +- GitHub access + +### Setup + +1. Clone repository: + \`\`\`bash + git clone https://github.com/org/app.git + \`\`\` + +2. Configure environment: + \`\`\`bash + cp .env.example .env + # Edit .env: Set DATABASE_URL and API_KEY + \`\`\` + +3. Start services: + \`\`\`bash + docker-compose up -d + # Wait 30 seconds for database initialization + \`\`\` + +4. 
Run migrations: + \`\`\`bash + docker-compose exec app python manage.py migrate + \`\`\` + +### Verification + +Check services are running: +\`\`\`bash +docker-compose ps +# All services should show "Up" +\`\`\` +``` + +**Why scannable works**: +- Find information in <10 seconds (headings) +- Copy commands directly (code blocks) +- See prerequisites before starting (avoids mid-process failures) +- Numbered steps (clear sequence) +- Verification step (know when done) + + +## Quick Reference: Clarity Checklist + +Use this checklist when writing or reviewing documentation: + +| Check | Pattern | Example | +|-------|---------|---------| +| ✅ Active voice | "X does Y" not "Y is done" | "Run tests" not "tests should be run" | +| ✅ Concrete examples | Every instruction has runnable example | "Set `API_KEY=abc123`" not "configure API key" | +| ✅ Progressive disclosure | Essentials first, details expandable | Quick start → Use cases → Advanced (collapsed) | +| ✅ Audience adapted | Sections for Dev/Ops/Exec as needed | "For Developers: API details" / "For Execs: Business impact" | +| ✅ Acronyms defined | Acronym (Full term) on first use | "JWT (JSON Web Token)" then "JWT" | +| ✅ Scannable structure | Headings, bullets, tables, code blocks | H2 sections, bullet lists, comparison tables | +| ✅ Short paragraphs | 3-5 sentences max | Break up walls of text | +| ✅ Bold for emphasis | Key terms only, not whole paragraphs | **Important:** not **everything** | + + +## Common Mistakes + +### ❌ Passive Voice Throughout + +**Wrong**: +```markdown +The database should be configured before the application is started. +Tests can be run after deployment is completed. +``` + +**Right**: +```markdown +Configure the database before starting the application. +Run tests after completing deployment. +``` + +**Why**: Active voice shows WHO does WHAT, making actions clear. 
+ + +### ❌ Abstract Instructions Without Examples + +**Wrong**: +```markdown +Configure rate limiting appropriately for your use case. +``` + +**Right**: +```markdown +Configure rate limiting by setting `rate_limiting.limit` to `1000` (requests/hour) in `config.yml`: +\`\`\`yaml +rate_limiting: + limit: 1000 # requests per hour + window: 3600 # seconds +\`\`\` +``` + +**Why**: Readers need concrete examples to act on. + + +### ❌ Same Content for All Audiences + +**Wrong**: +```markdown +# Authentication + +Uses JWT with RS256 signature algorithm, 3600-second expiry, and refresh token rotation. +``` + +**Right**: +```markdown +# Authentication + +**For Developers**: Uses JWT with RS256. Token expires in 1 hour. Refresh tokens rotate on use. + +**For Operators**: Requires RSA keypair. Set `JWT_PUBLIC_KEY_PATH` in config. + +**For Executives**: Industry-standard auth. Users stay logged in for 1 hour. Low cost, high security. +``` + +**Why**: Developers need technical details, operators need config, executives need business impact. + + +### ❌ Jargon Without Definition + +**Wrong**: +```markdown +Implement idempotency semantics using distributed locks with TTL. +``` + +**Right**: +```markdown +Make requests safe to retry (idempotency) using distributed locks that expire after a set time (TTL). + +**Idempotency**: Calling an API twice produces the same result. +**TTL**: Time To Live - how long a lock lasts before expiring. +``` + +**Why**: Not everyone knows domain-specific terms. + + +### ❌ Wall-of-Text Paragraphs + +**Wrong**: +```markdown +When you encounter rate limit errors you should first check the X-RateLimit-Remaining +header to see how many requests you have left and then wait for the time specified +in the Retry-After header before making another request and if you continue to hit +rate limits you should consider upgrading to a higher tier or implementing exponential +backoff in your client code to reduce the request rate automatically... 
+``` + +**Right**: +```markdown +## Handling Rate Limits + +1. Check `X-RateLimit-Remaining` header (requests left) +2. Wait for the time shown in the `Retry-After` header (seconds) +3. If limits persist: + - Upgrade to higher tier, OR + - Implement exponential backoff in client +``` + +**Why**: Scannable structure with numbered steps is easier to follow. + + +## Cross-References + +**Use BEFORE this skill**: +- `muna/technical-writer/using-technical-writer` - Determine document type first + +**Use WITH this skill**: +- `muna/technical-writer/documentation-structure` - Structure AND clarity + +**Use AFTER this skill**: +- `muna/technical-writer/documentation-testing` - Verify documentation is clear + +## Real-World Impact + +**Well-written documentation using these patterns**: +- **38% reduction in support tickets** after rewriting config docs with concrete examples (removed "configure appropriately", added exact `.env` values) +- **Developer onboarding time from 2 days → 4 hours** after adding progressive disclosure quick start (get running in <10 min, details later) +- **Executive buy-in achieved in single meeting** after splitting technical docs into "For Executives" section showing business impact, not implementation details + +**Key lesson**: **Clarity = concrete examples + active voice + adapted to audience + scannable structure.** diff --git a/skills/using-technical-writer/diagram-conventions.md b/skills/using-technical-writer/diagram-conventions.md new file mode 100644 index 0000000..c3a184b --- /dev/null +++ b/skills/using-technical-writer/diagram-conventions.md @@ -0,0 +1,478 @@ + +# Diagram Conventions + +## Overview + +Choose the right diagram type for what you're documenting. Core principle: **Diagram type matches what you're showing** (time-based interactions → sequence, static structure → component, etc.). + +**Key insight**: Wrong diagram type obscures meaning. Right diagram type makes it obvious. 
+ +## When to Use + +Load this skill when: +- Creating diagrams for documentation +- Choosing between diagram types +- Labeling components and relationships +- Reviewing diagrams for clarity + +**Symptoms you need this**: +- "Should I use a flowchart or sequence diagram?" +- Creating architecture documentation +- Documenting API flows, system interactions +- Explaining complex decision logic + +**Don't use for**: +- Writing code (not documentation) +- Non-technical diagrams (org charts, process flows) + +## Decision Tree: Choosing Diagram Type + +``` +What are you documenting? +│ +├─ Interactions between systems over time? +│ (API calls, message exchanges, request-response flows) +│ └─→ Use SEQUENCE DIAGRAM +│ +├─ System components and their relationships? +│ (Services, databases, queues, static architecture) +│ └─→ Use COMPONENT/ARCHITECTURE DIAGRAM +│ +├─ Data movement through transformations? +│ (ETL pipelines, data processing, input→output) +│ └─→ Use DATA FLOW DIAGRAM +│ +├─ State changes over lifecycle? +│ (Order states: pending→paid→shipped, connection states) +│ └─→ Use STATE DIAGRAM +│ +└─ Simple decision logic with branches? + (2-4 conditions, clear branching) + ├─ Small (≤3 conditions) → FLOWCHART acceptable + └─ Large (>3 conditions) → Use DECISION TABLE or PSEUDO-CODE instead +``` + + +## Diagram Type 1: Sequence Diagram + +**Use for**: Interactions between systems/actors over time. + +**When to use**: +- API request-response flows +- Authentication sequences +- Message exchanges between services +- Anything with temporal ordering (this happens THEN that happens) + +### Structure + +``` +Actor/System 1 Actor/System 2 Actor/System 3 + | | | + |-- message 1 -->| | + | |-- message 2 -->| + | |<-- response ---| + |<-- response ---| | + | | | +``` + +**Time flows downward**. Each arrow = message/call with label showing WHAT is sent. 
+ +### Example: OAuth Authentication + +``` +User Frontend Google Auth Backend + | | | | + |-- Click Login → | | + | |-- Redirect -->| | + |<------------- Redirect to Google Auth --------| + |-- Enter credentials -------->| | + |<-- Auth code ----------------| | + | |<-- Redirect with code --------| + | |-- POST /auth/callback ------->| + | | |<-- Exchange code for token + | | |--- Access token + | |<-- Session token -------------| + |<-- Redirect to dashboard ----| | +``` + +### Labeling Rules + +**Arrow labels** = What is sent/requested: +- ✅ "POST /users with user_data" +- ✅ "Return 200 OK with user_id" +- ✅ "Publish OrderCreated event" +- ❌ "Request" (too vague) +- ❌ "Step 3" (not semantic) + +**Actor/System names** = Specific entities: +- ✅ "API Gateway", "Auth Service", "Users Database" +- ❌ "Service1", "Database" (too generic) + + +## Diagram Type 2: Component/Architecture Diagram + +**Use for**: Static system structure and relationships. + +**When to use**: +- Microservices architecture +- System components and dependencies +- Database relationships +- Infrastructure layout +- No temporal aspect (not "then what happens", just "what connects to what") + +### Structure + +``` +┌─────────────┐ +│ Component A │ +└──────┬──────┘ + │ relationship_label + ↓ +┌─────────────┐ +│ Component B │ +└─────────────┘ +``` + +**Components** = boxes with names. **Relationships** = arrows with meaningful labels. 
+ +### Example: Microservices Architecture + +``` + ┌──────────────────┐ + │ API Gateway │ + │ (Routes requests)│ + └────┬────────┬────┘ + │ │ + authenticates │ │ queries orders + │ │ + ┌────▼───┐ ┌─▼──────────────┐ + │ Auth │ │ Order Service │ + │Service │ └────┬───────────┘ + └────┬───┘ │ + │ │ publishes OrderCreated + │ queries │ + ↓ ↓ + ┌──────────┐ ┌────────────┐ + │ Users DB │ │ Queue │ + └──────────┘ └─────┬──────┘ + │ consumes + ↓ + ┌──────────────────┐ + │ Notification │ + │ Service │ + └──────────────────┘ +``` + +### Labeling Rules + +**Component names** = What they are + brief function: +- ✅ "Auth Service (validates tokens)" +- ✅ "Users Database (PostgreSQL)" +- ✅ "Message Queue (RabbitMQ)" +- ❌ "Service", "DB", "Queue" (too generic) + +**Relationship labels** = Specific action: +- ✅ "authenticates user", "queries orders", "publishes OrderCreated" +- ✅ "reads from", "writes to", "subscribes to" +- ❌ "uses", "talks to", "connects" (too vague) + +**Consistency**: Use same terminology as code/documentation. + + +## Diagram Type 3: Data Flow Diagram + +**Use for**: Data movement and transformations. + +**When to use**: +- ETL pipelines +- Data processing workflows +- Input → transformation → output flows + +### Structure + +``` +[Input Source] → [Transform] → [Transform] → [Output Destination] +``` + +### Example: Data Pipeline + +``` +CSV Files Parse CSV Validate Enrich with Write to +(S3 Bucket) → (extract) → (check) → Metadata → Database + │ │ │ (Postgres) + ↓ ↓ ↓ + JSON objects Valid records Records + + timestamps +``` + +### Labeling Rules + +**Transformation steps** = What happens to data: +- ✅ "Parse CSV to JSON" +- ✅ "Validate schema" +- ✅ "Enrich with timestamps" +- ❌ "Process", "Handle" (not specific) + +**Data labels** = What format/content: +- ✅ "CSV records", "JSON objects", "Valid records" +- ✅ Show intermediate formats if they change + + +## Diagram Type 4: State Diagram + +**Use for**: State changes over entity lifecycle. 
+ +**When to use**: +- Order states (pending → paid → shipped) +- Connection states (disconnected → connecting → connected) +- Workflow states (draft → review → approved) + +### Structure + +``` +[State 1] --event/condition--> [State 2] --event/condition--> [State 3] +``` + +### Example: Order Lifecycle + +``` +┌─────────┐ payment ┌──────────┐ fulfill ┌──────────┐ +│ Pending │ received │ Paid │ order │ Shipped │ +└─────────┘ ───────────→ └──────────┘ ──────────→ └──────────┘ + │ │ │ + │ cancel │ refund │ deliver + ↓ ↓ ↓ +┌──────────┐ ┌──────────┐ ┌──────────┐ +│Cancelled │ │Refunded │ │Delivered │ +└──────────┘ └──────────┘ └──────────┘ +``` + +### Labeling Rules + +**States** = Adjective or noun describing entity status: +- ✅ "Pending", "Paid", "Shipped" +- ✅ "Connected", "Disconnected" +- ❌ "Processing" (too vague - processing what?) + +**Transitions** = Event or condition causing change: +- ✅ "payment received", "cancel order", "fulfill order" +- ✅ "timeout expires", "user clicks submit" +- ❌ "go to next state" (not semantic) + + +## When Flowcharts Become Anti-Patterns + +Flowcharts are overused. Use alternatives for: + +### Anti-Pattern 1: Complex Business Logic + +❌ **Wrong**: Flowchart with 15+ decision diamonds + +✅ **Right**: Decision table or pseudo-code + +**Example**: Authorization logic (authenticated? admin? owns resource?) + +**Better as decision table**: +| Authenticated | Admin | Owns Resource | Result | +|---|---|---|---| +| No | - | - | Deny (401 Unauthorized) | +| Yes | Yes | - | Allow | +| Yes | No | Yes | Allow | +| Yes | No | No | Deny (403 Forbidden) | + +**Why**: Flowchart with 4+ conditions becomes spaghetti. Table is scannable. + + +### Anti-Pattern 2: Long Procedures + +❌ **Wrong**: Flowchart showing deployment steps (20 boxes) + +✅ **Right**: Numbered list + +**Example**: +```markdown +## Deployment Steps + +1. Build Docker image: `docker build -t app:v1.0 .` +2. Push to registry: `docker push registry/app:v1.0` +3. 
Update Kubernetes: `kubectl set image deployment/app app=registry/app:v1.0` +4. Verify pods running: `kubectl get pods -l app=app` +5. Check logs: `kubectl logs -f deployment/app` +``` + +**Why**: Sequential steps don't need visual diagram. Numbered list is clearer. + + +### Anti-Pattern 3: Duplicating Code + +❌ **Wrong**: Flowchart replicating function logic that exists in code + +✅ **Right**: Link to code, don't duplicate + +**Example**: +```markdown +## Token Validation + +See `validate_token()` in `auth/token_validator.py:45-78` + +High-level: Checks signature, expiration, scopes. +``` + +**Why**: Flowchart duplicates code. Gets out of sync when code changes. + + +## Flowchart Usage Checklist + +**Use flowchart ONLY if all these are true:** +- [ ] Fewer than 4 decision points +- [ ] Not duplicating existing code +- [ ] Branching logic is core to understanding (not just procedural steps) +- [ ] No simpler alternative (decision table, list, pseudo-code) + +**If any are false**: Use alternative format. + + +## Semantic Labeling Standards + +### Rule 1: No Generic Names + +❌ **Wrong**: +- "Service1", "Service2" +- "Step1", "Step2" +- "Database", "Queue" +- "Process", "Handle" + +✅ **Right**: +- "Auth Service", "Order Service" +- "Parse CSV", "Validate Schema" +- "Users Database (PostgreSQL)", "Message Queue (RabbitMQ)" +- "Authenticate user", "Publish OrderCreated event" + +**Principle**: Names should have semantic meaning. If you removed the diagram and only saw labels, you'd understand what they do. + + +### Rule 2: Consistent Terminology + +**Use same terms as code/documentation.** + +If code has `AuthenticationService`, diagram should say "Authentication Service", not "Login Handler". + +If code publishes `OrderCreatedEvent`, diagram should say "publishes OrderCreated", not "sends message". + +**Why**: Readers switching between diagram and code should see same concepts. 
+ + +### Rule 3: Meaningful Relationships + +❌ **Wrong**: +- Arrow with no label +- "connects to", "uses" +- "talks to", "calls" + +✅ **Right**: +- "authenticates user with JWT" +- "queries orders by user_id" +- "publishes OrderCreated event to queue" +- "consumes from notifications topic" + +**Pattern**: `[Verb] [Object] [with/via/using] [Details]` + +**Examples**: +- "queries users with SQL SELECT" +- "publishes to orders_topic via Kafka" +- "validates signature using RSA public key" + + +## Quick Reference: Diagram Selection + +| What You're Documenting | Use This Diagram | Key Feature | +|---|---|---| +| **API calls between services** | Sequence | Shows temporal order (time flows down) | +| **Microservices architecture** | Component | Shows static structure (boxes and relationships) | +| **ETL pipeline** | Data Flow | Shows transformations (input → process → output) | +| **Order/connection states** | State | Shows lifecycle (state → event → state) | +| **Simple decision (≤3 conditions)** | Flowchart | Shows branching logic | +| **Complex decision (>3 conditions)** | Decision Table | Scannable conditions and outcomes | +| **Sequential steps (deployment)** | Numbered List | No visual needed for linear steps | + + +## Common Mistakes + +### ❌ Wrong Diagram Type for Content + +**Wrong**: Sequence diagram for system architecture (no temporal aspect) + +**Right**: Component diagram (static structure) + +**Why**: Sequence diagrams imply ordering over time. Architecture is static. + + +### ❌ Generic Labels + +**Wrong**: +``` +Service1 → Service2 → Database +``` + +**Right**: +``` +Auth Service (validates JWT) + → User Service (queries user data) + → Users Database (PostgreSQL) +``` + +**Why**: Generic labels force reader to guess. Semantic labels explain. + + +### ❌ Flowchart for Complex Logic + +**Wrong**: Flowchart with 10+ decision diamonds (authentication logic) + +**Right**: Decision table showing all auth outcomes + +**Why**: Large flowcharts are spaghetti. 
Tables are scannable. + + +### ❌ Missing Relationship Labels + +**Wrong**: +``` +Auth Service → Database +(arrow with no label) +``` + +**Right**: +``` +Auth Service → Database + "queries users by email" +``` + +**Why**: Unlabeled arrows are ambiguous. Does it read? Write? Both? + + +### ❌ Inconsistent Terminology + +**Wrong**: Code calls it `OrderService`, diagram says "Purchase Handler" + +**Right**: Code and diagram both say "Order Service" + +**Why**: Different terms confuse readers switching between diagram and code. + + +## Cross-References + +**Use WITH this skill**: +- `muna/technical-writer/documentation-structure` - Diagrams go in specific sections (architecture docs, API flows) +- `muna/technical-writer/clarity-and-style` - Diagrams should be scannable, well-labeled + +**Use AFTER this skill**: +- `muna/technical-writer/documentation-testing` - Verify diagrams are understandable + +## Real-World Impact + +**Well-chosen diagrams using these conventions:** +- **Sequence diagram for OAuth flow**: Onboarding developers understood flow in 5 minutes (vs 30 minutes reading prose) +- **Decision table replacing 12-branch flowchart**: Authorization logic bugs reduced from 8 to 0 (scannable table caught missed cases) +- **Component diagram with semantic labels**: New engineers could navigate codebase without asking "what is Service2?" (eliminated 15+ Slack questions per week) + +**Key lesson**: **Right diagram type + semantic labels = immediate understanding. Wrong type or generic labels = confusion.** diff --git a/skills/using-technical-writer/documentation-structure.md b/skills/using-technical-writer/documentation-structure.md new file mode 100644 index 0000000..7ccc665 --- /dev/null +++ b/skills/using-technical-writer/documentation-structure.md @@ -0,0 +1,986 @@ + +# Documentation Structure + +## Overview + +Proven documentation patterns for common technical content. Use these templates to create consistent, complete, findable documentation. 
+ +**Core Principle**: Structure determines findability. Well-structured docs get used; poorly structured docs get ignored. + +## When to Use + +Load this skill when: +- Creating new documentation (ADR, API docs, runbook, README) +- Choosing documentation format +- Organizing existing scattered documentation +- User mentions: "document decision", "API reference", "runbook", "README" + +## ADR (Architecture Decision Record) + +### When to Use ADRs + +**Use ADRs for**: +- Technology choices (database, framework, library) +- Architecture patterns (microservices vs monolith, REST vs GraphQL) +- Design decisions with long-term consequences +- Trade-off decisions (performance vs simplicity) + +**Don't use ADRs for**: +- Implementation details (how to write a function) +- Temporary decisions (which bug to fix first) +- Obvious choices (use version control, write tests) + +### Complete ADR Template + +```markdown +# ADR-NNN: [Short Title of Decision] + +**Status**: [Proposed | Accepted | Deprecated | Superseded by ADR-XXX] +**Date**: YYYY-MM-DD +**Deciders**: [Names or roles of people who made decision] +**Context**: [What prompted this decision] + +## Summary + +[One-paragraph summary of the decision and its impact] + +## Context + +[Describe the problem you're solving] + +- What constraints exist? (technical, business, time, people) +- What requirements must be met? +- What assumptions are we making? +- What's the current state (if replacing something)? + +## Decision + +[State the decision clearly and concisely] + +We will [decision statement]. 
+ +## Alternatives Considered + +### Alternative 1: [Name] + +**Description**: [What this alternative involves] + +**Pros**: +- [Advantage 1] +- [Advantage 2] + +**Cons**: +- [Disadvantage 1] +- [Disadvantage 2] + +**Why rejected**: [Specific reason this wasn't chosen] + +### Alternative 2: [Name] + +[Same format as Alternative 1] + +## Consequences + +### Positive + +- [Good outcome 1] +- [Good outcome 2] + +### Negative + +- [Trade-off 1] +- [Trade-off 2] + +### Neutral + +- [Change that's neither good nor bad, just different] + +## Implementation Notes + +[Optional: Technical details, migration steps, timeline] + +## Related Decisions + +- **Supersedes**: ADR-XXX (if applicable) +- **Superseded by**: ADR-YYY (if applicable) +- **Related to**: ADR-ZZZ, ADR-AAA (decisions that interact with this one) + +## References + +- [Links to relevant documentation, RFCs, blog posts, research papers] + +``` + +### ADR Numbering Convention + +- **Sequential numbering**: ADR-001, ADR-002, etc. +- **Never reuse numbers** (even if decision is deprecated) +- **Pad with zeros**: ADR-007 not ADR-7 (sorts correctly) + +### ADR Location + +``` +docs/architecture/decisions/ +├── README.md (index of all ADRs) +├── ADR-001-use-postgresql.md +├── ADR-002-mls-enforcement.md +├── ADR-003-plugin-registry.md +└── ADR-004-abc-over-protocol.md +``` + +### Example: Real ADR (BasePlugin ABC) + +```markdown +# ADR-004: Use Abstract Base Class Instead of Protocol for Plugin System + +**Status**: Accepted +**Date**: 2025-10-28 +**Deciders**: Security Architecture Team +**Context**: Multi-Level Security enforcement requires reliable type checking + +## Summary + +We will use Abstract Base Class (ABC) instead of Protocol for the BasePlugin interface +to enable runtime type verification critical for security level enforcement. + +## Context + +The plugin system requires security level validation before plugins can execute. 
We need +to verify that all plugins inherit from BasePlugin to ensure they implement mandatory +security methods and properties. + +Constraints: +- Security level must be immutable and verifiable at runtime +- Plugin registration must confirm plugin type before allowing execution +- Need to prevent duck-typed plugins from bypassing security checks + +Python offers two approaches for defining plugin interfaces: +1. Protocol (PEP 544) - structural subtyping (duck typing) +2. Abstract Base Class - nominal typing with inheritance + +## Decision + +We will use Abstract Base Class (ABC) with @abstractmethod for the BasePlugin interface. + +## Alternatives Considered + +### Alternative 1: Protocol-based Interface + +**Description**: Define BasePlugin as a Protocol, allowing any class implementing +the required methods to be considered a valid plugin. + +**Pros**: +- More flexible - no inheritance required +- Easier for third-party plugins +- More "Pythonic" for general use + +**Cons**: +- isinstance() checks don't work reliably with Protocol +- Security bypass risk: attacker creates duck-typed plugin without BasePlugin +- Can't seal security-critical methods +- Type checking is structural, not nominal + +**Why rejected**: Security level verification requires isinstance() to confirm plugin +inheritance. Protocol duck typing allows security bypasses (see threat model THREAT-003). + +### Alternative 2: Manual Registration Without Type Checks + +**Description**: Don't enforce type at all - rely on plugin registry and runtime checks. + +**Pros**: +- Maximum flexibility +- No inheritance requirements + +**Cons**: +- No compile-time safety +- Easy to bypass registration +- Higher runtime overhead for checks + +**Why rejected**: Defense-in-depth principle requires type system + registry + runtime. +Single-layer validation is insufficient. 
+ +## Consequences + +### Positive + +- isinstance(plugin, BasePlugin) provides reliable runtime type checking +- Sealed methods prevent subclasses from overriding security-critical code +- Nominal typing makes security boundaries explicit +- Compile-time type safety via mypy + +### Negative + +- Third-party plugins must inherit from BasePlugin (less flexible) +- Tighter coupling between plugins and framework +- Slightly more boilerplate for plugin authors + +### Neutral + +- Plugins must be registered AND inherit from BasePlugin (defense-in-depth) + +## Implementation Notes + +- BasePlugin declared as ABC with frozen dataclass +- security_level property marked as @abstractmethod +- Plugin factory verifies isinstance() before instantiation +- Mypy configured to require nominal types for plugins + +## Related Decisions + +- **Related to**: ADR-002 (MLS enforcement - requires type checking) +- **Related to**: ADR-003 (Plugin registry - ABC + registry = defense-in-depth) +- **Related to**: ADR-005 (Frozen plugin capability - ABC enables sealed methods) + +## References + +- PEP 544: Protocols - https://peps.python.org/pep-0544/ +- Bell-LaPadula MLS model requirements +- Threat model THREAT-003: Type system bypass via duck typing +``` + + +## API Reference Documentation + +### When to Use API Reference Pattern + +**Use for**: +- REST APIs +- GraphQL APIs +- Library/SDK public interfaces +- Internal service APIs + +### Complete API Documentation Structure + +```markdown +# [Service/API Name] API Reference + +## Overview + +**Base URL**: `https://api.example.com/v1` +**Protocol**: HTTPS only +**Format**: JSON + +[One-paragraph description of what this API does] + +## Authentication + +### Method + +[OAuth 2.0 | API Key | JWT | etc.] + +### Obtaining Credentials + +[How to get API key/token] + +### Using Authentication + +**Header Format**: +``` +Authorization: Bearer {token} +``` + +**Example**: +```bash +curl -H "Authorization: Bearer abc123..." 
https://api.example.com/v1/users +``` + +### Token Expiration + +- **Access tokens**: 1 hour +- **Refresh tokens**: 30 days + +## Rate Limiting + +- **Limit**: 1000 requests per hour per API key +- **Headers**: + - `X-RateLimit-Limit`: Your rate limit ceiling + - `X-RateLimit-Remaining`: Requests remaining in window + - `X-RateLimit-Reset`: UTC epoch seconds when limit resets + +**Example Response** (429 Too Many Requests): +```json +{ + "error": "rate_limit_exceeded", + "message": "Rate limit of 1000 requests per hour exceeded", + "retry_after": 1800 +} +``` + +## Pagination + +### Parameters + +- `page`: Page number (default: 1) +- `limit`: Items per page (default: 20, max: 100) + +### Response Format + +```json +{ + "data": [ /* items */ ], + "pagination": { + "page": 1, + "limit": 20, + "total": 157, + "pages": 8 + } +} +``` + +### Navigation Links + +```json +{ + "links": { + "first": "https://api.example.com/v1/users?page=1", + "prev": null, + "next": "https://api.example.com/v1/users?page=2", + "last": "https://api.example.com/v1/users?page=8" + } +} +``` + +## Versioning + +- **URL-based versioning**: `/v1/`, `/v2/` +- **Current version**: v1 +- **Deprecation policy**: 12 months notice before version sunset + +## Endpoints + +### [Resource Name] + +#### List [Resources] + +**Endpoint**: `GET /[resource]` + +**Description**: [What this endpoint does] + +**Authentication**: Required + +**Query Parameters**: +- `param1` (string, optional): [Description] +- `param2` (integer, optional): [Description] +- `page` (integer, optional): Page number +- `limit` (integer, optional): Items per page + +**Example Request**: +```bash +curl -X GET "https://api.example.com/v1/users?role=admin&page=1&limit=20" \ + -H "Authorization: Bearer abc123..." 
+``` + +**Success Response** (200 OK): +```json +{ + "data": [ + { + "id": "550e8400-e29b-41d4-a716-446655440000", + "username": "jdoe", + "email": "jdoe@example.com", + "role": "admin", + "created_at": "2025-10-28T14:30:00Z" + } + ], + "pagination": { + "page": 1, + "limit": 20, + "total": 1 + } +} +``` + +**Error Responses**: +- `401 Unauthorized`: Missing or invalid authentication token +- `403 Forbidden`: Authenticated but lacks permission +- `429 Too Many Requests`: Rate limit exceeded + + +#### Get [Resource] + +**Endpoint**: `GET /[resource]/{id}` + +[Similar format as above] + + +#### Create [Resource] + +**Endpoint**: `POST /[resource]` + +[Similar format as above] + + +#### Update [Resource] + +**Endpoint**: `PUT /[resource]/{id}` or `PATCH /[resource]/{id}` + +[Similar format as above] + + +#### Delete [Resource] + +**Endpoint**: `DELETE /[resource]/{id}` + +[Similar format as above] + + +## Error Codes + +| Code | Name | Description | +|------|------|-------------| +| 400 | Bad Request | Invalid request format or parameters | +| 401 | Unauthorized | Missing or invalid authentication | +| 403 | Forbidden | Authenticated but lacks permission | +| 404 | Not Found | Resource doesn't exist | +| 409 | Conflict | Resource already exists or version conflict | +| 422 | Unprocessable Entity | Validation failed | +| 429 | Too Many Requests | Rate limit exceeded | +| 500 | Internal Server Error | Server error (contact support) | +| 503 | Service Unavailable | Temporary outage or maintenance | + +### Error Response Format + +```json +{ + "error": "error_code_identifier", + "message": "Human-readable error message", + "details": { + "field": "specific_field_with_error", + "reason": "why_it_failed" + }, + "request_id": "req_abc123", + "timestamp": "2025-10-28T14:30:00Z" +} +``` + +## SDKs and Client Libraries + +[Links to official SDKs for different languages] + +## Webhooks + +[If applicable - webhook registration, event types, payload formats] + +## Changelog + +### 
v1.2.0 (2025-10-15) +- Added: Webhook support for user events +- Changed: Increased rate limit from 500 to 1000 req/hour + +### v1.1.0 (2025-09-01) +- Added: PATCH support for partial updates +- Fixed: Pagination links for empty results + +``` + + +## Runbook Pattern + +### When to Use Runbooks + +**Use runbooks for**: +- Deployment procedures +- Incident response playbooks +- Maintenance operations +- Recovery procedures +- Regular operational tasks + +### Complete Runbook Template + +```markdown +# Runbook: [Operation Name] + +**Purpose**: [One-sentence description of what this runbook achieves] +**Owner**: [Team or person responsible] +**Last Updated**: YYYY-MM-DD +**Frequency**: [On-demand | Weekly | Monthly | During incidents] + +## Overview + +[2-3 sentences describing when to use this runbook and what it accomplishes] + +## Prerequisites + +### Required Access + +- [ ] Production database access (role: `db-operator`) +- [ ] Kubernetes cluster access (namespace: `production`) +- [ ] PagerDuty access (for incident updates) +- [ ] VPN connection to production network + +### Required Tools + +- [ ] `kubectl` v1.28+ +- [ ] `psql` PostgreSQL client +- [ ] `aws-cli` configured with production profile +- [ ] SSH key for bastion host + +### Required Knowledge + +- Basic Kubernetes concepts +- SQL query syntax +- Understanding of [specific system architecture] + +### Verification + +Run these commands to verify prerequisites: +```bash +# Check kubectl access +kubectl get nodes + +# Check database access +psql -h db.production.example.com -U operator -c "SELECT 1" + +# Check AWS access +aws sts get-caller-identity +``` + +## Safety Checks + +**STOP if any of these are true**: +- [ ] Active incident in progress (check PagerDuty) +- [ ] Scheduled maintenance window not started +- [ ] Change request not approved +- [ ] Backup not verified (see "Step 1: Create Backup" below) + +## Procedure + +### Step 1: Create Backup + +**Purpose**: Ensure rollback is possible if 
operation fails + +```bash +# Create database backup +pg_dump -h db.production.example.com -U operator \ + -Fc production_db > backup-$(date +%Y%m%d-%H%M%S).dump + +# Verify backup +ls -lh backup-*.dump +``` + +**Expected Result**: Backup file created, size > 0 bytes + +**If this fails**: [What to do if backup fails] + + +### Step 2: [Operation Step] + +**Purpose**: [What this step does] + +```bash +# Commands to run +command1 +command2 +``` + +**Expected Result**: [What you should see] + +**If this fails**: [Troubleshooting steps] + + +[Repeat for each step] + + +### Final Step: Verify Operation + +**Purpose**: Confirm operation succeeded + +```bash +# Verification commands +``` + +**Success Criteria**: +- [ ] Service responds with 200 OK +- [ ] No errors in logs (last 5 minutes) +- [ ] Metrics show normal traffic + +## Post-Operation + +### Update Tracking + +- [ ] Update change request ticket with completion time +- [ ] Update runbook if procedure changed +- [ ] Document any deviations from standard procedure + +### Monitoring + +Monitor these for 30 minutes after operation: +- Application logs: `kubectl logs -f deployment/app -n production` +- Error rate: [Link to monitoring dashboard] +- Response time: [Link to metrics] + +## Rollback Procedure + +**When to rollback**: +- Operation failed at any step +- Post-operation verification failed +- Unexpected behavior observed + +**Steps**: +```bash +# Restore from backup +pg_restore -h db.production.example.com -U operator \ + -d production_db backup-YYYYMMDD-HHMMSS.dump +``` + +[Additional rollback steps] + +**Verification**: +- [ ] Service restored to pre-operation state +- [ ] No data loss confirmed +- [ ] Application functioning normally + +## Troubleshooting + +### Problem: [Common Issue 1] + +**Symptoms**: [What you see] + +**Cause**: [Why this happens] + +**Solution**: +```bash +# Commands to fix +``` + + +### Problem: [Common Issue 2] + +[Same format] + + +## Escalation + +**When to escalate**: +- Rollback 
failed +- Data integrity concerns +- Incident severity increases +- Unsure how to proceed + +**Who to contact**: +1. **On-call engineer**: [PagerDuty rotation or phone] +2. **Database team**: [Contact method] +3. **Security team** (if data breach suspected): [Contact method] + +## References + +- [Link to architecture diagram] +- [Link to related runbooks] +- [Link to incident post-mortems] +- [Link to system documentation] + +``` + + +## README Patterns + +### When to Use Each README Type + +**Simple README** (<100 lines): +- Single-purpose utilities +- Scripts +- Small libraries + +**Standard README** (100-300 lines): +- Applications +- Multi-feature libraries +- Services + +**Comprehensive README** (300+ lines): +- Open-source projects +- Complex systems +- Projects with many contributors + +### Simple README Template + +For small utilities and scripts: + +```markdown +# [Project Name] + +[One-sentence description of what it does] + +## Installation + +```bash +pip install project-name +``` + +## Usage + +```bash +# Basic example +project-name input.txt output.txt + +# With options +project-name --verbose input.txt output.txt +``` + +## Options + +- `--verbose`: Print detailed progress +- `--output FILE`: Specify output file + +## Requirements + +- Python 3.8+ +- No external dependencies + +## License + +MIT +``` + +### Standard README Template + +For most projects: + +```markdown +# [Project Name] + +[2-3 sentence description of what the project does and why it exists] + +## Features + +- Feature 1 +- Feature 2 +- Feature 3 + +## Installation + +### Prerequisites + +- [Dependency 1] version X.Y+ +- [Dependency 2] + +### Install from Source + +```bash +git clone https://github.com/user/project.git +cd project +pip install -r requirements.txt +``` + +### Install from Package Manager + +```bash +pip install project-name +``` + +## Quick Start + +```bash +# Minimal example to get started +project-name --help +``` + +## Usage + +### Basic Usage + +```bash +# Example 
1 +project-name command arg1 arg2 + +# Example 2 +project-name --option value +``` + +### Advanced Usage + +[More complex examples] + +## Configuration + +Configuration file location: `~/.project/config.yml` + +```yaml +# Example configuration +option1: value1 +option2: value2 +``` + +## Documentation + +- [User Guide](docs/user-guide.md) +- [API Reference](docs/api-reference.md) +- [Architecture](docs/architecture/README.md) + +## Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) + +## License + +[License name] - see [LICENSE](LICENSE) + +## Support + +- [Issue tracker](https://github.com/user/project/issues) +- [Discussions](https://github.com/user/project/discussions) +- [Email](support@example.com) +``` + +### Comprehensive README Template + +For open-source and complex projects: + +```markdown +# [Project Name] + +[![Build Status](badge-url)](build-url) +[![Coverage](badge-url)](coverage-url) +[![License](badge-url)](license-url) + +[3-4 sentence description of the project, its purpose, and key benefits] + +## Table of Contents + +- [Features](#features) +- [Installation](#installation) +- [Quick Start](#quick-start) +- [Usage](#usage) +- [Configuration](#configuration) +- [Architecture](#architecture) +- [API Reference](#api-reference) +- [Contributing](#contributing) +- [Testing](#testing) +- [Deployment](#deployment) +- [Troubleshooting](#troubleshooting) +- [FAQ](#faq) +- [Roadmap](#roadmap) +- [License](#license) +- [Acknowledgements](#acknowledgements) + +[Rest of content follows standard README template but with more depth] + +## Architecture + +High-level overview with diagram: + +``` +[ASCII diagram or link to docs/architecture/] +``` + +## Performance + +- Benchmark results +- Scalability characteristics +- Resource requirements + +## Security + +See [SECURITY.md](SECURITY.md) for security policy and vulnerability reporting. + +## Changelog + +See [CHANGELOG.md](CHANGELOG.md) for version history. 
+ +## Roadmap + +- [x] Feature 1 (completed) +- [ ] Feature 2 (in progress) +- [ ] Feature 3 (planned) + +See [full roadmap](ROADMAP.md) +``` + + +## Architecture Documentation Structure + +### Directory Organization + +``` +docs/ +├── README.md (navigation hub) +├── architecture/ +│ ├── README.md (system overview) +│ ├── decisions/ (ADRs) +│ │ ├── README.md (ADR index) +│ │ └── ADR-NNN-*.md +│ ├── diagrams/ +│ │ ├── system-overview.png +│ │ ├── data-flow.png +│ │ └── deployment.png +│ ├── components/ +│ │ ├── authentication.md +│ │ ├── database.md +│ │ └── api-gateway.md +│ └── security/ +│ ├── threat-model.md +│ ├── access-control.md +│ └── encryption.md +├── api/ +│ └── reference.md +├── guides/ +│ ├── getting-started.md +│ ├── contributing.md +│ └── deployment.md +└── runbooks/ + ├── deployment.md + ├── backup-restore.md + └── incident-response.md +``` + + +## Common Mistakes + +### ❌ Incomplete ADRs +**Wrong**: ADR with only "We chose X" and no alternatives/consequences +**Right**: Complete ADR with Context, Alternatives Considered, Consequences, Related Decisions + +### ❌ Scattered Documentation +**Wrong**: Decisions in README, code comments, wiki, Slack +**Right**: Single source of truth - decisions in ADRs, linked from other locations + +### ❌ Missing API Details +**Wrong**: API docs with only endpoints and examples +**Right**: API docs with auth, rate limiting, pagination, versioning, error codes + +### ❌ Incomplete Runbooks +**Wrong**: Runbook with only procedure steps +**Right**: Runbook with prerequisites, safety checks, verification, rollback, troubleshooting + +### ❌ Generic README +**Wrong**: README saying "This is a project that does things" +**Right**: README with concrete features, runnable examples, clear installation steps + + +## Quick Reference + +| Document Type | Use When | Key Sections | +|---------------|----------|--------------| +| **ADR** | Architecture/technology decisions with long-term impact | Context, Alternatives, Consequences, 
Related Decisions | +| **API Reference** | Documenting REST/GraphQL APIs | Auth, Rate Limiting, Pagination, Endpoints, Errors | +| **Runbook** | Operational procedures | Prerequisites, Safety, Procedure, Verification, Rollback | +| **README (Simple)** | Small utilities (<100 lines) | Installation, Usage, Options | +| **README (Standard)** | Most projects | Features, Installation, Quick Start, Usage, Config | +| **README (Comprehensive)** | Open-source/complex projects | All standard + Architecture, Performance, Roadmap | + + +## Real-World Example: Elspeth Documentation Evolution + +**Before** (Scattered narratives): +- README: 8 sections explaining architecture decisions +- Code comments: "// We chose ABC because..." +- No traceability or findability + +**After** (Structured with ADRs): +- 14 ADRs documenting key decisions +- README: Quick start + links to ADRs +- Code comments: `// See ADR-004 for rationale` +- Clear decision trail: ADR-002 (MLS) → ADR-003 (Registry) → ADR-004 (ABC) → ADR-005 (Frozen) + +**Key Improvement**: "Can't find why we chose X" → "ADR-004 documents ABC vs Protocol decision with full context" + + +## Summary + +**Use the right structure for the content type:** + +- **ADRs**: Architecture decisions → Complete template with alternatives and consequences +- **API docs**: REST/GraphQL → Auth, rate limiting, pagination, versioning, errors +- **Runbooks**: Operations → Prerequisites, safety, procedure, verification, rollback +- **READMEs**: Project overview → Match complexity (simple/standard/comprehensive) + +**Meta-rule**: Good structure makes docs findable. If readers can't find it, you haven't documented it. 
diff --git a/skills/using-technical-writer/documentation-testing.md b/skills/using-technical-writer/documentation-testing.md new file mode 100644 index 0000000..02bc615 --- /dev/null +++ b/skills/using-technical-writer/documentation-testing.md @@ -0,0 +1,549 @@ + +# Documentation Testing + +## Overview + +Test documentation like you test code. Core principle: **If you haven't tried it, it's broken**. + +**Key insight**: Untested documentation always has issues. Copy-paste-run test finds them before users do. + +## When to Use + +Load this skill when: +- Finalizing documentation before release +- Reviewing documentation quality +- Creating documentation quality gates +- After major doc updates + +**Symptoms you need this**: +- "Is this documentation good enough to ship?" +- Preparing installation guides, quick starts, tutorials +- Documentation quality review +- Pre-release documentation checklist + +**Don't use for**: +- Writing documentation (use `muna/technical-writer/clarity-and-style`) +- Structuring documentation (use `muna/technical-writer/documentation-structure`) + +## Five Testing Dimensions + +Test documentation across 5 dimensions: + +### 1. Completeness Testing +**Question**: Can reader accomplish the task with ONLY this documentation? + +### 2. Accuracy Testing +**Question**: Do all examples, commands, and instructions actually work? + +### 3. Findability Testing +**Question**: Can users find this documentation when they need it? + +### 4. Example Verification +**Question**: Can you copy-paste every example and have it run without modification? + +### 5. Walkthrough Testing +**Question**: Can a new user follow this successfully on a clean system? + + +## Dimension 1: Completeness Testing + +**Goal**: Verify reader can complete task without external resources. 
+ +### Checklist + +- [ ] **All prerequisites listed** - OS, language versions, required accounts, tools +- [ ] **All configuration options documented** - Every setting, not just defaults +- [ ] **Error cases covered** - What can go wrong and how to fix +- [ ] **Troubleshooting section** - Common issues with solutions +- [ ] **Success criteria** - "How do I know it worked?" +- [ ] **Next steps** - What to do after completing this doc + +### Example: Installation Guide + +❌ **Incomplete**: +```markdown +## Installation + +Run: +\`\`\`bash +npm install our-app +\`\`\` + +You're done! +``` + +**Missing**: Prerequisites, error handling, verification + +✅ **Complete**: +```markdown +## Installation + +### Prerequisites +- Node.js 18+ (`node --version` to check) +- npm 9+ (`npm --version` to check) +- Active internet connection + +### Install + +\`\`\`bash +npm install our-app +\`\`\` + +### Verify Installation + +Check installed version: +\`\`\`bash +npx our-app --version +# Expected output: our-app v2.1.0 +\`\`\` + +### Troubleshooting + +**Error: "EACCES: permission denied"** +- Solution: Run with sudo: `sudo npm install -g our-app` + +**Error: "Unsupported engine"** +- Solution: Upgrade Node.js to 18+ + +### Next Steps +- [Quick Start Guide](./quick-start.md) +- [Configuration Reference](./config.md) +``` + +### Testing Method + +**The "Clean Slate Test"**: +1. Can someone with ZERO context complete this? +2. Read doc, follow instructions +3. Note every moment you had to Google or guess + + +## Dimension 2: Accuracy Testing + +**Goal**: Verify all examples, commands, and instructions work. 
+ +### Checklist + +- [ ] **Code examples run** - Copy-paste into environment, executes without errors +- [ ] **Commands correct** - No typos, correct options, work on stated OS +- [ ] **Version numbers current** - Not referencing outdated versions +- [ ] **Screenshots up-to-date** - Match current UI +- [ ] **Links work** - No 404s, links go to correct pages +- [ ] **Output matches examples** - Documented output = actual output + +### Example: API Documentation + +❌ **Inaccurate**: +```markdown +Make a request: +\`\`\`bash +curl https://api.example.com/users +\`\`\` + +Returns user list. +``` + +**Issues**: No authentication, vague output + +✅ **Accurate**: +```markdown +Make a request: +\`\`\`bash +curl -H "Authorization: Bearer YOUR_API_KEY" \\ + https://api.example.com/v1/users +\`\`\` + +Response: +\`\`\`json +{ + "users": [ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"} + ], + "total": 2 +} +\`\`\` + +Status: 200 OK +``` + +### Testing Method + +**Copy-Paste-Run Test**: +1. Copy EVERY code example +2. Paste into clean environment +3. Run without modifications +4. Verify output matches documentation +5. If ANY example fails, documentation is inaccurate + + +## Dimension 3: Findability Testing + +**Goal**: Verify users can find documentation when needed. + +### Checklist + +- [ ] **Keywords present** - Terms users would search for +- [ ] **Linked from related pages** - Cross-references in both directions +- [ ] **In navigation/TOC** - Appears in sidebar, index, sitemap +- [ ] **Search engine optimized** - Title, headers, meta description +- [ ] **Clear title** - Describes content accurately + +### Example: Deployment Guide + +❌ **Not Findable**: +```markdown +# Guide + +Deploy the app... 
+``` + +**Issues**: Generic title, no keywords + +✅ **Findable**: +```markdown +# Deploying to AWS ECS with Docker (Production) + +**Keywords**: AWS, ECS, Fargate, Docker, deployment, production, continuous deployment, CI/CD + +Deploy our application to AWS Elastic Container Service (ECS) using Docker containers... + +**Related**: +- [Docker Configuration](./docker.md) +- [CI/CD Pipeline](./cicd.md) +- [Environment Variables](./env-vars.md) +``` + +### Testing Method + +**Search Simulation**: +1. What would user search for? ("deploy to AWS", "ECS deployment", "Docker production") +2. Search your docs with those terms +3. Does this page appear in top 3 results? + + +## Dimension 4: Example Verification + +**Goal**: All examples work without modification. + +### Checklist + +- [ ] **Examples are complete** - Include all necessary imports, setup +- [ ] **No placeholders without explanation** - If using `YOUR_API_KEY`, explain how to get it +- [ ] **Environment specified** - Language version, OS, dependencies +- [ ] **Async/await correct** - Don't forget await on promises +- [ ] **Error handling shown** - Not just happy path + +### Example: Code Example + +❌ **Unverified**: +```javascript +const users = client.get('/users'); +console.log(users); +``` + +**Issues**: Missing await, won't work + +✅ **Verified**: +```javascript +// Prerequisites: npm install api-client +// Environment: Node.js 18+ + +import APIClient from 'api-client'; + +// Get API key from https://dashboard.example.com/settings +const client = new APIClient(process.env.API_KEY); + +async function getUsers() { + try { + const users = await client.get('/users'); + console.log('Users:', users); + // Output: Users: [{id: 1, name: 'Alice'}, {id: 2, name: 'Bob'}] + } catch (error) { + console.error('Failed to fetch users:', error.message); + } +} + +getUsers(); +``` + +### Testing Method + +**Literal Copy-Paste Test**: +1. Copy example +2. Create new file +3. Paste (no modifications) +4. Run +5. Does it work? 
If not, example is broken. + + +## Dimension 5: Walkthrough Testing + +**Goal**: New user can follow successfully on clean system. + +### Checklist + +- [ ] **Test on clean system** - Fresh VM/container, not your dev machine +- [ ] **Follow every step literally** - No shortcuts, no cached knowledge +- [ ] **Note confusion points** - Every time you have to guess or Google +- [ ] **Verify timing claims** - "5 minute setup" actually takes 5 minutes? +- [ ] **Test with beginner** - Colleague unfamiliar with project + +### Example: Quick Start + +❌ **Untested**: +```markdown +# Quick Start (5 minutes) + +1. Install the CLI +2. Configure your credentials +3. Deploy your first app + +Done! +``` + +**Issues**: Vague steps, unverified timing + +✅ **Walkthrough-Tested**: +```markdown +# Quick Start (15 minutes) + +## Prerequisites (2 min) +- [ ] Ubuntu 22.04 or macOS 12+ +- [ ] 2GB free disk space +- [ ] Internet connection + +## Step 1: Install CLI (5 min) + +\`\`\`bash +curl -L https://releases.example.com/cli.sh | bash +\`\`\` + +Verify: +\`\`\`bash +our-cli --version +# Should output: our-cli v2.1.0 +\`\`\` + +**Troubleshooting**: If command not found, close and reopen terminal. + +## Step 2: Configure Credentials (3 min) + +Get API key: https://dashboard.example.com/settings + +\`\`\`bash +our-cli auth login YOUR_API_KEY_HERE +\`\`\` + +Success message: "✓ Authenticated as user@example.com" + +## Step 3: Deploy First App (5 min) + +\`\`\`bash +mkdir my-app && cd my-app +our-cli init +our-cli deploy +\`\`\` + +**Success criteria**: URL shown: "✓ Deployed to https://my-app-abc123.example.com" + +Visit URL in browser - you should see "Hello World" + +## Next Steps +- [Add custom domain](./custom-domains.md) +- [Configure environment variables](./env-vars.md) +``` + +### Testing Method + +**New User Walkthrough**: +1. Spin up clean VM/container +2. Follow guide step-by-step as written +3. Don't use any cached knowledge +4. Note EVERY point of confusion +5. 
Time how long it actually takes + +**Or**: Give to colleague who's never used the product. Watch them follow it. Note every question they ask. + + +## Testing Workflow + +Use this workflow before releasing documentation: + +### Phase 1: Quick Checks (10 min) + +- [ ] Read through once - obvious errors? +- [ ] Check all links - do they work? +- [ ] Scan for placeholders - any unexplained `YOUR_X_HERE`? +- [ ] Verify versions - are version numbers current? + +### Phase 2: Example Verification (30 min) + +- [ ] Copy EVERY code example +- [ ] Paste into clean environment +- [ ] Run without modifications +- [ ] Verify output matches docs + +### Phase 3: Completeness Check (15 min) + +- [ ] Prerequisites listed? +- [ ] Error cases covered? +- [ ] Troubleshooting section? +- [ ] Success criteria ("how do I know it worked")? + +### Phase 4: Walkthrough Test (60 min) + +- [ ] Fresh VM/container +- [ ] Follow as new user +- [ ] Note confusion points +- [ ] Verify timing claims + +### Phase 5: Findability Check (10 min) + +- [ ] Search docs with user keywords +- [ ] Check cross-references +- [ ] Verify in navigation/TOC + +**Total time**: ~2 hours for thorough documentation testing + + +## Common Issues Found By Testing + +### Issue: Async/Await Missing + +**Example**: +```javascript +const data = api.get('/endpoint'); // Missing await +console.log(data); // Prints Promise object, not data +``` + +**Found by**: Copy-paste-run test + + +### Issue: Prerequisites Not Listed + +**Example**: +```markdown +Run: `docker-compose up` +``` + +**Missing**: Docker installed, docker-compose.yml file exists + +**Found by**: Clean system walkthrough + + +### Issue: Environment Variables Not Explained + +**Example**: +```javascript +const key = process.env.API_KEY; // How do I set this? +``` + +**Found by**: New user walkthrough (where do I get API key?) 
+ + +### Issue: Timing Claims Unverified + +**Example**: "Setup in 5 minutes" actually takes 20 minutes (npm install, account creation, key generation) + +**Found by**: Walkthrough testing with timer + + +### Issue: Success Criteria Missing + +**Example**: +```markdown +Deploy your app: +\`\`\`bash +deploy.sh +\`\`\` +``` + +**Missing**: How do I know it worked? What URL? What should I see? + +**Found by**: Completeness testing + + +## Documentation Testing Report Template + +```markdown +# Documentation Testing Report + +**Document**: [Quick Start Guide / API Reference / etc.] +**Tester**: [Name] +**Date**: [Date] +**Environment**: [Clean Ubuntu 22.04 VM / macOS 13 / etc.] + +## Test Results + +### Completeness ✅ / ❌ +- [ ] Prerequisites listed +- [ ] Error cases covered +- [ ] Troubleshooting included +- [ ] Success criteria present + +**Issues Found**: [List any gaps] + +### Accuracy ✅ / ❌ +- [ ] All code examples run +- [ ] All commands correct +- [ ] Links work +- [ ] Output matches docs + +**Issues Found**: [List inaccuracies] + +### Findability ✅ / ❌ +- [ ] Keywords present +- [ ] Linked from related pages +- [ ] In navigation + +**Issues Found**: [List findability gaps] + +### Examples ✅ / ❌ +**Copy-Paste-Run Results**: +- Example 1: ✅ Works / ❌ Failed - [error] +- Example 2: ✅ Works / ❌ Failed - [error] + +### Walkthrough ✅ / ❌ +**Confusion Points**: [List every point where you had to guess or Google] +**Actual Time**: [X minutes] vs Claimed: [Y minutes] +**Success**: ✅ Completed task / ❌ Got stuck at step X + +## Recommendations +1. [Fix async/await in example 2] +2. [Add prerequisites section] +3. [Update timing claim from 5 to 15 minutes] + +## Overall: Ready for Release ✅ / Needs Work ❌ +``` + + +## Quick Reference: Testing Checklist + +| Dimension | Key Question | Quick Test | +|-----------|--------------|------------| +| **Completeness** | Can task be done with ONLY this doc? | List everything needed - is it in the doc? 
| +| **Accuracy** | Do examples run? | Copy-paste every example, run it | +| **Findability** | Can users find this? | Search with user keywords - does it appear? | +| **Examples** | Copy-paste-run works? | Literally copy-paste, no modifications, run | +| **Walkthrough** | Does it work for new user? | Fresh VM, follow as beginner, time it | + + +## Cross-References + +**Use BEFORE this skill**: +- `muna/technical-writer/clarity-and-style` - Write clear docs +- `muna/technical-writer/documentation-structure` - Structure docs properly + +**Use AFTER this skill**: +- Fix issues found, then re-test + +## Real-World Impact + +**Documentation tested with this framework:** +- **Quick start guide testing caught async/await bug** in 3/5 examples (would have broken for every user) +- **Walkthrough testing revealed "5 minute setup" actually took 22 minutes** (including account creation and key generation not mentioned in docs) +- **Copy-paste-run test found missing `import` statement** that prevented example from running (developer's IDE auto-imported it) +- **Clean system test revealed missing prerequisite** (Docker Compose not documented) that blocked 40% of users + +**Key lesson**: **Untested documentation always has issues. 2 hours of testing prevents weeks of user confusion and support tickets.** diff --git a/skills/using-technical-writer/incident-response-documentation.md b/skills/using-technical-writer/incident-response-documentation.md new file mode 100644 index 0000000..76feae6 --- /dev/null +++ b/skills/using-technical-writer/incident-response-documentation.md @@ -0,0 +1,391 @@ + +# Incident Response Documentation + +## Overview + +Create actionable runbooks for high-stress incidents. Core principle: **Clear, numbered steps with decision trees - no ambiguity under pressure**. + +**Key insight**: Incidents are chaotic. Good runbooks provide structure when thinking is hardest. 
+ +## When to Use + +Load this skill when: +- Creating security incident runbooks +- Documenting operational emergency procedures +- Writing post-incident reports +- Establishing escalation paths + +**Symptoms you need this**: +- "What do we do if [security incident/outage]?" +- Creating runbooks for PII exposure, DDoS, database failure +- Documenting on-call procedures +- Writing post-mortems + +**Don't use for**: +- General documentation (use `muna/technical-writer/documentation-structure`) +- Non-urgent procedures + +## Response Template: 5 Phases + +### Phase 1: Detection + +**What to document**: +```markdown +## Detection + +### Symptoms +- [What you see: alerts, error messages, user reports] +- [Monitoring dashboard links] + +### Severity Classification +**P1 (Critical)**: [Description, e.g., PII exposure, complete outage] +**P2 (High)**: [Description, e.g., degraded performance, partial outage] +**P3 (Medium)**: [Description, e.g., minor issue affecting few users] +**P4 (Low)**: [Description, e.g., cosmetic issue, no user impact] + +### Initial Triage +1. Check monitoring dashboard: [link] +2. Run diagnostic query: + \```bash + [Query to check system health] + \``` +3. Expected output: [What healthy looks like] +4. If [symptom X], proceed to Containment +``` + +### Phase 2: Containment + +**What to document**: +```markdown +## Containment + +### Goal +Stop the bleeding. Prevent further damage. + +###Critical Actions (Do First) +1. **[Action 1]**: [Command/procedure] + - Why: [Rationale] + - Success criteria: [How to verify] + +2. 
**[Action 2]**: [Command/procedure] + - Why: [Rationale] + - Success criteria: [How to verify] + +### Communication Holds +❌ **DO NOT**: [Actions that tip off attacker or cause panic] +- Don't post to public Slack before containment +- Don't email affected users before scope known +- Don't restart services that destroy forensic evidence + +✅ **DO**: [Immediate notifications] +- Alert security team via [pager] +- Notify incident commander via [phone] +``` + +### Phase 3: Investigation + +**What to document**: +```markdown +## Investigation + +### Log Collection +Collect logs for forensic analysis: +\```bash +# Application logs (last 24 hours) +aws logs filter-log-events --log-group-name /aws/app \ + --start-time $(date -d '24 hours ago' +%s)000 \ + --output json > incident-logs.json + +# Access logs +aws s3 cp s3://logs/access-logs/ ./access-logs/ --recursive +\``` + +### Forensic Procedures +1. **Preserve evidence**: Take snapshots before changes + \```bash + aws ec2 create-snapshot --volume-id vol-abc123 + \``` + +2. **Timeline reconstruction**: When did compromise occur? + - Check authentication logs for unauthorized access + - Review deployment history for recent changes + - Identify first appearance of anomaly + +3. **Impact assessment**: + - How many users/records affected? + - What data was accessed/modified? + - Did attacker establish persistence? + +### Investigation Checklist +- [ ] Logs collected and preserved +- [ ] Timeline reconstructed (first compromise to detection) +- [ ] Scope determined (affected users, data, systems) +- [ ] Attack vector identified (how did they get in?) +- [ ] Persistence mechanisms found (backdoors, cron jobs, etc.) +``` + +### Phase 4: Recovery + +**What to document**: +```markdown +## Recovery + +### Restoration Procedure +1. 
**Remove attacker access**: + \```bash + # Rotate all credentials + aws secretsmanager rotate-secret --secret-id prod/db/password + + # Revoke suspicious sessions + redis-cli KEYS "session:suspicious_*" | xargs redis-cli DEL + \``` + +2. **Patch vulnerability**: + - Deploy fix: [git commit hash, deployment command] + - Verify patch: [test procedure] + +3. **Restore service**: + - Bring systems back online + - Verify functionality + - Monitor for recurrence + +### Verification Steps +- [ ] Vulnerability patched and verified +- [ ] All malicious access removed +- [ ] Service restored to normal operation +- [ ] Monitoring shows no anomalies for [duration] + +### Monitoring for Recurrence +\```bash +# Watch for suspicious activity +watch -n 60 'aws logs tail /aws/app --follow --filter-pattern "[attack-pattern]"' +\``` + +Continue monitoring for 24-48 hours post-recovery. +``` + +### Phase 5: Lessons Learned + +**What to document**: +```markdown +## Lessons Learned (Post-Incident Report) + +### Timeline +| Time | Event | Actor | +|------|-------|-------| +| 10:23 | First compromise detected in logs | Attacker | +| 10:45 | Alert triggered, pager sent | Monitoring | +| 10:50 | On-call engineer acknowledged | John Doe | +| 11:05 | Containment actions completed | John Doe | +| 11:30 | Investigation confirmed SQL injection | Jane Smith | +| 12:15 | Patch deployed to production | DevOps | +| 12:30 | Service fully restored | Team | + +### Root Cause +[Single-sentence root cause: "SQL injection in /api/users endpoint due to unparameterized query"] + +### Impact +- **Users affected**: 1,247 users +- **Data exposed**: Email addresses and usernames (no passwords or payment data) +- **Downtime**: 2 hours (10:45-12:30) +- **Revenue impact**: ~$5,000 (estimated) + +### What Went Well +- ✅ Alert fired within 20 minutes of compromise +- ✅ On-call responded in 5 minutes +- ✅ Containment completed within 20 minutes +- ✅ Clear runbook followed, no confusion + +### What Could Improve +- ❌ 
Initial alert lacked severity context (delayed triage) +- ❌ Log retention only 7 days (lost pre-compromise forensics) +- ❌ No automated rollback procedure (manual steps delayed recovery) + +### Action Items +| Action | Owner | Due Date | Status | +|--------|-------|----------|--------| +| Implement parameterized queries in all endpoints | Dev Team | 2024-04-01 | ✅ Done | +| Extend log retention to 90 days | Platform | 2024-04-15 | 🔄 In Progress | +| Add severity to alert messages | SRE | 2024-04-05 | ✅ Done | +| Create automated rollback procedure | DevOps | 2024-05-01 | 📋 Planned | + +### Prevention +- Implement SAST scan in CI/CD to catch SQL injection +- Quarterly penetration testing +- Code review checklist updated with parameterized query requirement +``` + + +## Escalation Paths + +### Severity-Based Escalation + +```markdown +## Escalation Matrix + +### P1 (Critical): Immediate Escalation +**Definition**: Complete outage, security breach, data loss +**Response Time**: 15 minutes +**Escalation Path**: +1. On-call engineer (immediately via pager) +2. If no response in 5 min → Escalate to backup on-call +3. If no resolution in 15 min → Page incident commander +4. If ongoing after 30 min → Notify VP Engineering + +**External Notifications**: +- Customer communication: Status page update within 30 min +- Regulatory notification (if PII breach): Within 72 hours +- Media inquiry response: Refer to PR team + +**Contacts**: +- On-call: [pagerduty-link] +- Backup: Jane Doe (+1-555-0100) +- Incident Commander: John Smith (+1-555-0200) +- VP Engineering: Alice Johnson (+1-555-0300) +- PR Team: pr@example.com + +### P2 (High): Escalate if Not Resolved +**Definition**: Degraded performance, partial outage, security concern +**Response Time**: 1 hour +**Escalation Path**: +1. On-call engineer (page) +2. If no resolution in 2 hours → Page incident commander +3. 
If ongoing after 4 hours → Notify VP Engineering + +### P3 (Medium): Standard Response +**Definition**: Minor issue, workaround available +**Response Time**: 4 hours +**Escalation Path**: +1. Create ticket in incident system +2. On-call reviews and assigns +3. If no progress in 8 hours → Escalate to team lead + +### P4 (Low): Track for Later +**Definition**: Cosmetic issue, no user impact +**Response Time**: Next business day +**Escalation Path**: +1. Create ticket +2. Addressed in next sprint planning +``` + + +## Time-Critical Clarity Patterns + +### Rule 1: Numbered Steps (Not Paragraphs) + +❌ **WRONG**: +``` +To respond to a database outage, you should first check if the primary is down by running a connection test. If the primary is unresponsive, you might want to consider promoting the replica to primary, but first make sure replication is up-to-date by checking the lag metrics. After promoting, update the connection string in the application configuration and restart the application servers. +``` + +**Problem**: Paragraph format, vague ("you might want to"), no clear sequence. + +✅ **RIGHT**: +``` +## Database Outage Response + +1. **Check primary status**: + \```bash + pg_isready -h primary.db.internal -p 5432 + \``` + - If returns "accepting connections" → Primary is healthy, check replica + - If returns "no response" → Primary is down, proceed to step 2 + +2. **Verify replica health**: + \```bash + psql -h replica.db.internal -c "SELECT pg_last_wal_replay_lsn();" + \``` + - If replication lag < 1MB → Safe to promote + - If lag > 1MB → Wait or accept data loss, document decision + +3. **Promote replica to primary**: + \```bash + ssh replica.db.internal + sudo pg_ctl promote -D /var/lib/postgresql/data + \``` + - Success message: "server promoting" + - Verify: `psql -c "SELECT pg_is_in_recovery();"` returns `f` (false = primary) + +4. 
**Update application config**: + \```bash + kubectl set env deployment/app DATABASE_URL=postgresql://replica.db.internal:5432/app + kubectl rollout restart deployment/app + \``` + - Wait for rollout: `kubectl rollout status deployment/app` + - Success: "successfully rolled out" + +5. **Verify application health**: + - Check monitoring dashboard: [link] + - Test query: `curl https://api.example.com/health` + - Expected: `{"status": "healthy", "database": "connected"}` +``` + +**Better**: Numbered steps, specific commands, success criteria, decision points. + + +### Rule 2: Decision Trees for Triage + +❌ **WRONG**: +``` +If you see high latency, check the database. If the database is slow, check for lock contention. If there's no lock contention, check for slow queries. Also check if the cache is working. +``` + +**Problem**: Unstructured, reader doesn't know priority. + +✅ **RIGHT**: +``` +## High Latency Triage + +**Symptom**: API response time > 2 seconds + +### Decision Tree + +1. **Check cache hit rate**: + \```bash + redis-cli INFO stats | grep keyspace_hits + \``` + - If hit rate < 80% → Cache miss issue, see [Cache Troubleshooting](#cache) + - If hit rate ≥ 80% → Proceed to step 2 + +2. **Check database connection pool**: + \```bash + psql -c "SELECT count(*) FROM pg_stat_activity WHERE state = 'active';" + \``` + - If active connections > 90 → Connection pool exhausted, see [Pool Tuning](#pool) + - If active connections ≤ 90 → Proceed to step 3 + +3. **Check for slow queries**: + \```bash + psql -c "SELECT query, state, query_start FROM pg_stat_activity WHERE state != 'idle' AND (now() - query_start) > interval '5 seconds';" + \``` + - If slow queries found → See [Query Optimization](#queries) + - If no slow queries → Proceed to step 4 + +4. 
**Check for lock contention**: + \```bash + psql -c "SELECT * FROM pg_locks WHERE NOT granted;" + \``` + - If locks found → See [Lock Resolution](#locks) + - If no locks → Escalate to database team +``` + +**Better**: Clear decision tree, priority order, specific thresholds. + + +## Cross-References + +**Use WITH this skill**: +- `ordis/security-architect/security-controls-design` - Understand control failure scenarios +- `muna/technical-writer/clarity-and-style` - Write clear steps under stress + +**Use AFTER this skill**: +- `muna/technical-writer/documentation-testing` - Test runbooks with tabletop exercises + +## Real-World Impact + +**Runbooks using this framework**: +- **PII Exposure Response**: Detection→Containment→Investigation→Recovery structure enabled 45-minute response (vs 2-hour average without runbook). Forensic evidence preserved. +- **Database Outage**: Decision tree (primary down → check replica lag → promote if <1MB lag) reduced promotion decision from 15 minutes (ad-hoc discussion) to 2 minutes (follow runbook). +- **DDoS Response**: Numbered steps with success criteria enabled junior engineer to respond effectively during P1 incident without senior engineer (first time). + +**Key lesson**: **Time-critical clarity (numbered steps, decision trees, success criteria) enables effective response under stress. Paragraphs fail during incidents.** diff --git a/skills/using-technical-writer/itil-and-governance-documentation.md b/skills/using-technical-writer/itil-and-governance-documentation.md new file mode 100644 index 0000000..b750c03 --- /dev/null +++ b/skills/using-technical-writer/itil-and-governance-documentation.md @@ -0,0 +1,413 @@ + +# ITIL and Governance Documentation + +## Overview + +Document systems for formal IT service management environments. Core principle: **Governance documentation enables controlled change and accountability**. 
+ +**Key insight**: Enterprise operations require structured documentation for change management, service delivery, and business continuity. + +## When to Use + +Load this skill when: +- Working in ITIL/ITSM environments +- Preparing change requests (RFC) +- Documenting enterprise services +- Creating disaster recovery plans + +**Symptoms you need this**: +- "How do I document a production change?" +- Creating service catalog entries +- Writing DR/BCP documentation +- Formal change advisory board (CAB) processes + +**Don't use for**: +- Informal/startup environments (too heavyweight) +- Quick fixes without governance + +## Change Request (RFC) Documentation + +### Pattern: Request for Change + +```markdown +# RFC-1234: Database Schema Migration + +## Change Type +**NORMAL** (Standard | Normal | Emergency) + +## Summary +Migrate user authentication schema from legacy format to OAuth2-compatible format. Adds `oauth_provider` and `oauth_token` columns to `users` table. + +## Business Justification +- Enable SSO integration with enterprise identity providers +- Customer requirement for Fortune 500 clients +- Revenue impact: Unblocks $500k in contracts + +## Impact Analysis + +### Affected Services +- User Authentication Service (primary impact) +- API Gateway (configuration change required) +- Mobile App (token format change) + +### Affected Users/Teams +- All active users (15,000 users) - No visible impact +- Engineering team - Deployment coordination required +- Customer support - FAQs updated + +### Dependencies +- Requires database maintenance window +- OAuth provider configuration must be complete +- Mobile app v2.5+ must be deployed first (rollout complete as of 2024-03-10) + +### Risk Assessment +| Risk | Likelihood | Impact | Mitigation | +|------|------------|--------|------------| +| Migration fails mid-process | Low | High | Transaction-based migration, automatic rollback | +| Users locked out post-migration | Medium | High | Keep legacy auth fallback for 30 
days | +| Performance degradation | Low | Medium | Load testing completed, indexes added | + +**Overall Risk**: MEDIUM + +## Implementation Plan + +### Pre-Implementation (1 day before) +1. **2024-03-19 10:00**: Send user notification ("scheduled maintenance 2024-03-20 02:00-04:00 UTC, no action required") +2. **2024-03-19 14:00**: Final QA testing in staging environment +3. **2024-03-19 16:00**: Freeze code deployments (change freeze begins) + +### Implementation Window (2024-03-20 02:00-04:00 UTC) +**Duration**: 2 hours +**Teams required**: 2x SRE, 1x Database Admin, 1x Backend Engineer + +1. **02:00**: Enable maintenance mode + \```bash + kubectl scale deployment/api-gateway --replicas=0 + \``` + - Success: No active API requests + +2. **02:05**: Backup database + \```bash + pg_dump production_db > backup-pre-migration-20240320.sql + aws s3 cp backup-pre-migration-20240320.sql s3://backups/ + \``` + - Success: Backup file uploaded (verify size ~5GB) + +3. **02:15**: Run migration + \```bash + psql production_db < migration-20240320-oauth-schema.sql + \``` + - Success: "ALTER TABLE" messages, no errors + - Rollback if: Any ERROR message → Restore from backup + +4. **02:45**: Deploy new API version + \```bash + kubectl set image deployment/api-gateway api=registry/api:v2.6.0 + kubectl scale deployment/api-gateway --replicas=6 + kubectl rollout status deployment/api-gateway + \``` + - Success: "successfully rolled out" + +5. **03:00**: Smoke tests + \```bash + # Test legacy auth (backward compatibility) + curl -X POST https://api.example.com/auth/legacy -d '{"user":"test@example.com","pass":"fake_password"}' + # Expected: 200 OK, token returned + + # Test OAuth auth (new feature) + curl -X POST https://api.example.com/auth/oauth -d '{"provider":"google","code":"fake_code"}' + # Expected: 200 OK, token returned + \``` + +6. 
**03:15**: Monitor for 15 minutes + - Check error rate dashboard: [link] + - Check authentication success rate: [link] + - Expected: Error rate <0.1%, auth success >99% + +7. **03:30**: Disable maintenance mode + \```bash + # Restore normal operation + \``` + +8. **03:45**: Post-deployment verification + - Verify mobile app can authenticate + - Verify web app can authenticate + - Check customer support queue (any authentication issues?) + +### Post-Implementation +1. **2024-03-20 08:00**: Team standup - review deployment, any issues? +2. **2024-03-20 09:00**: Send "maintenance complete" notification +3. **2024-03-21**: Monitor for 24 hours, watch for late-breaking issues +4. **2024-04-19**: Remove legacy auth fallback (30 days post-migration, per the risk mitigation) + +## Rollback Plan + +### Trigger Conditions +Rollback if ANY of these occur: +- Migration script errors +- Auth success rate drops below 95% +- Critical bugs reported by >5 users +- Performance degradation >20% + +### Rollback Procedure (30 minutes) +1. **Enable maintenance mode** (same as step 1 above) + +2. **Restore database from backup**: + \```bash + aws s3 cp s3://backups/backup-pre-migration-20240320.sql ./ + psql production_db < backup-pre-migration-20240320.sql + \``` + - Duration: ~15 minutes for 5GB database + +3. **Revert to previous API version**: + \```bash + kubectl set image deployment/api-gateway api=registry/api:v2.5.9 + kubectl rollout restart deployment/api-gateway + \``` + +4. **Verify rollback success**: Run smoke tests (same as implementation step 5) + +5. **Disable maintenance mode** + +6.
**Post-rollback**: Investigate failure, update RFC with findings, reschedule + +## Testing Plan + +### Pre-Production Testing (Completed) +- [x] Unit tests pass (2024-03-15) +- [x] Integration tests pass (2024-03-16) +- [x] Load testing: 10,000 concurrent users, <200ms latency (2024-03-17) +- [x] Staging deployment: Migration successful (2024-03-18) + +### Production Verification +- Smoke tests (included in implementation plan step 5) +- 24-hour monitoring period +- User acceptance: No critical issues reported within 7 days + +## Approval Chain + +| Role | Name | Approval Date | Status | +|------|------|---------------|--------| +| Backend Engineer | John Doe | 2024-03-14 | ✅ Approved | +| Database Admin | Jane Smith | 2024-03-14 | ✅ Approved | +| SRE Lead | Bob Johnson | 2024-03-15 | ✅ Approved | +| Product Manager | Alice Williams | 2024-03-15 | ✅ Approved | +| Change Advisory Board (CAB) | CAB Chair | 2024-03-18 | ✅ Approved | + +**Change Authorized by**: CAB Chair (Alice Johnson) on 2024-03-18 + +## Communication Plan + +| Audience | Message | Channel | Timing | +|----------|---------|---------|--------| +| All users | "Scheduled maintenance 2024-03-20 02:00-04:00 UTC. No action required." | Email, status page | 1 day before | +| Engineering team | Implementation details, on-call requirements | Slack #engineering | 1 day before | +| Customer support | FAQ updates, escalation path | Support portal | 1 day before | +| Executive team | Change summary, business impact | Email | 1 day before | +| All users | "Maintenance complete. New SSO features available." | Email, status page | After completion | +``` + + +## Service Documentation + +### Service Catalog Entry + +```markdown +# Service: User Authentication API + +## Service Overview +**Service ID**: SVC-0042 +**Service Name**: User Authentication API +**Description**: Provides authentication and authorization for all company applications. Supports password-based login, OAuth SSO, and API token management. 
+ +## Service Owner +**Team**: Platform Security Team +**Primary Contact**: security-team@example.com +**Escalation**: VP Engineering (vp-eng@example.com) + +## Service Details + +### Purpose +Enable secure user authentication across all company products. + +### Scope +**Included**: +- User login (password, OAuth) +- Session management +- API token generation/validation +- MFA enforcement + +**Not Included**: +- User registration (owned by User Management Service) +- Password reset (owned by User Management Service) + +### Users +- **Primary Users**: All application users (15,000 active users) +- **Internal Users**: API Gateway, Mobile App, Web App + +## Service Level Agreement (SLA) + +### Availability +**Target**: 99.9% uptime +**Measurement**: Monthly uptime excluding planned maintenance +**Planned Maintenance**: Max 4 hours/month, communicated 7 days in advance + +### Performance +**Target**: 95th percentile latency <100ms +**Measurement**: API response time from request to auth token response + +### Support Hours +**P1 (Critical)**: 24/7/365, 15-minute response time +**P2 (High)**: Business hours (9am-5pm PT), 2-hour response time +**P3/P4**: Business hours, next business day response + +## Operational Level Agreements (OLA) + +### Database Team OLA +**Commitment**: Database backup every 6 hours, 30-day retention +**Response Time**: 1-hour response for database issues affecting auth service + +### Network Team OLA +**Commitment**: 99.95% network availability for auth service subnet +**Response Time**: 30-minute response for network issues + +## Monitoring and Alerts + +**Dashboard**: https://grafana.example.com/d/auth-service +**Key Metrics**: +- Uptime: Target 99.9% +- Latency (p95): Target <100ms +- Error rate: Target <0.1% +- Auth success rate: Target >99% + +**Alerts**: +- P1: Service down (5-minute window), latency >500ms, error rate >1% +- P2: Elevated errors (>0.5%), auth success rate <95% + +## Dependencies +- **Database**: PostgreSQL (SVC-0010) +- 
**Cache**: Redis (SVC-0015) +- **Secrets Management**: Vault (SVC-0020) + +## Disaster Recovery +**RTO**: 1 hour (service restored within 1 hour of total failure) +**RPO**: 15 minutes (max 15 minutes of data loss acceptable) +**DR Site**: us-west-2 (failover from us-east-1) +**Testing**: Quarterly DR drills +``` + + +## Business Continuity Documentation + +### Disaster Recovery Plan + +```markdown +# Disaster Recovery Plan: User Authentication Service + +## RTO and RPO + +**RTO (Recovery Time Objective)**: 1 hour +- Time to restore service after total regional failure + +**RPO (Recovery Point Objective)**: 15 minutes +- Maximum acceptable data loss (last database backup) + +## Disaster Scenarios + +### Scenario 1: Complete Regional Outage (AWS us-east-1) +**Trigger**: All us-east-1 availability zones unavailable +**Impact**: CRITICAL - Service completely unavailable +**Recovery**: Failover to us-west-2 (DR region) + +### Scenario 2: Database Corruption +**Trigger**: Database integrity check fails, data corruption detected +**Impact**: CRITICAL - Service degraded or unavailable +**Recovery**: Restore from most recent backup (15-minute data loss) + +### Scenario 3: Critical Security Incident +**Trigger**: Confirmed breach, attacker has system access +**Impact**: CRITICAL - Service must be taken offline immediately +**Recovery**: Forensic analysis, patch vulnerabilities, restore from known-good state + +## Failover Procedure (Scenario 1: Regional Outage) + +### Prerequisites +- DR region (us-west-2) database in standby replication mode +- DR region compute resources pre-provisioned (scaled to zero, can scale up instantly) + +### Failover Steps (45 minutes) + +1. **Declare disaster** (5 minutes) + - Incident commander confirms regional outage + - Notify stakeholders: "Initiating failover to DR region" + +2.
**Promote DR database to primary** (10 minutes) + \```bash + # SSH to DR database server + ssh db-dr.us-west-2.internal + + # Promote standby to primary + sudo -u postgres pg_ctl promote -D /var/lib/postgresql/data + + # Verify promotion + psql -c "SELECT pg_is_in_recovery();" # Should return 'f' (not in recovery = primary) + \``` + +3. **Scale up DR compute** (10 minutes) + \```bash + # Scale API servers from 0 to 6 replicas + kubectl --context us-west-2 scale deployment/auth-api --replicas=6 + + # Wait for readiness + kubectl --context us-west-2 rollout status deployment/auth-api + \``` + +4. **Update DNS to DR region** (10 minutes) + \```bash + # Update Route53 to point to DR load balancer + aws route53 change-resource-record-sets --hosted-zone-id Z123 --change-batch file://dns-failover.json + + # Verify DNS propagation (may take 5-10 minutes for full propagation) + dig auth-api.example.com + \``` + +5. **Verify service health** (5 minutes) + - Smoke test: Authenticate test user + - Check monitoring dashboard: Error rate, latency + - Success criteria: Error rate <1%, latency <200ms + +6. **Communicate completion** (5 minutes) + - Status page: "Service restored, operating from backup region" + - Engineering team: "Failover complete, monitor for issues" + - Customer support: "Service operational, escalate any issues to incident channel" + +### Failback Procedure (After Primary Region Restored) + +**Wait period**: 24 hours of stable DR operation before failback + +1. Restore primary region database from DR backup +2. Establish replication DR → Primary (reverse direction) +3. 
During maintenance window: Failover back to primary (same procedure as above) + +## Testing Schedule + +**DR Drills**: Quarterly (January, April, July, October) +**Procedure**: Simulate regional failure, execute failover, measure RTO/RPO +**Success Criteria**: Complete failover within 1-hour RTO, data loss <15 minutes +``` + + +## Cross-References + +**Use WITH this skill**: +- `muna/technical-writer/documentation-structure` - RFC and service docs follow structured patterns +- `muna/technical-writer/incident-response-documentation` - DR procedures are incident responses + +## Real-World Impact + +**Governance documentation using these patterns**: +- **Database Migration RFC**: Change Advisory Board approved in first review (vs 3-review avg) due to comprehensive impact analysis, rollback plan, and testing documentation. +- **Service Catalog**: SLA documentation enabled proactive capacity planning. When auth service approached 99.9% SLA threshold, automatic scaling prevented SLA breach. +- **DR Plan**: Quarterly DR drill revealed 2-hour RTO vs documented 1-hour. Plan updated with pre-provisioned DR resources, actual RTO now 45 minutes. + +**Key lesson**: **Structured governance documentation (RFC, SLA, DR plans) enables controlled change, accountability, and measurable service delivery.** diff --git a/skills/using-technical-writer/operational-acceptance-documentation.md b/skills/using-technical-writer/operational-acceptance-documentation.md new file mode 100644 index 0000000..ab37b9a --- /dev/null +++ b/skills/using-technical-writer/operational-acceptance-documentation.md @@ -0,0 +1,361 @@ + +# Operational Acceptance Documentation + +## Overview + +Prepare complete acceptance packages for production deployment. Core principle: **Document readiness, risks, and acceptance criteria for informed go-live decisions**. + +**Key insight**: Acceptance documentation enables stakeholders to make informed risk decisions about production deployment. 
+ +## When to Use + +Load this skill when: +- Preparing systems for production launch +- Seeking executive go-live approval +- Completing operational handover +- Government/defense system authorization + +**Symptoms you need this**: +- "How do I get approval to launch?" +- Preparing production readiness checklist +- Creating go-live approval package +- Operational handover to support team + +**Don't use for**: +- Development/staging deployments +- Internal-only tools (unless high-risk) + +## Production Readiness Checklist + +### Infrastructure Readiness + +```markdown +## Infrastructure Readiness + +### Compute Resources +- [ ] Production servers provisioned (6x API servers, 2x database servers) +- [ ] Auto-scaling configured (scale 2-20 instances based on CPU >70%) +- [ ] Load balancer configured with health checks +- [ ] SSL/TLS certificates installed and valid (expires 2025-12-01) + +### Storage +- [ ] Database provisioned (PostgreSQL 14, 500GB storage) +- [ ] Database backups configured (automated hourly backups, 30-day retention) +- [ ] Backup restoration tested (RTO: 1 hour, RPO: 1 hour) + +### Network +- [ ] VPC configured with public/private subnets +- [ ] Firewall rules implemented (allow HTTPS 443, deny all other inbound) +- [ ] DNS configured (api.example.com → load balancer) + +### Monitoring and Logging +- [ ] Application metrics instrumented (Prometheus) +- [ ] Logs centralized (CloudWatch Logs, 90-day retention) +- [ ] Dashboards created ([Grafana dashboard link]) +- [ ] Alerts configured (error rate, latency, uptime) + +### Security +- [ ] Secrets stored in secrets manager (not environment variables) +- [ ] TLS 1.3 enforced +- [ ] Authentication implemented (MFA for admins) +- [ ] Security scan completed (no HIGH/CRITICAL findings) + +**Infrastructure Status**: ✅ READY (all criteria met) +``` + + +## Operational Readiness Checklist + +```markdown +## Operational Readiness + +### Monitoring Coverage +- [ ] **Availability**: Uptime monitoring 
([UptimeRobot link]) +- [ ] **Performance**: Latency tracking (p50, p95, p99) +- [ ] **Errors**: Error rate monitoring (<0.1% threshold) +- [ ] **Business metrics**: User signups, API calls, revenue + +**Success criteria**: All critical metrics have dashboards + alerts + +### Alerting Configuration +- [ ] **P1 alerts** (PagerDuty): Service down, error rate >1%, security incident +- [ ] **P2 alerts** (Slack #ops): Elevated errors >0.5%, latency >500ms +- [ ] **P3 alerts** (Email): Performance degradation, capacity warnings + +**Success criteria**: Alerts tested and verified to fire correctly + +### Backup and Recovery +- [ ] **Backup procedure**: Automated hourly PostgreSQL dumps to S3 +- [ ] **Backup testing**: Restored from backup on 2024-03-10 (successful) +- [ ] **Recovery time**: 1-hour RTO verified +- [ ] **Recovery point**: 1-hour RPO (acceptable data loss) + +**Success criteria**: Restore from backup completes within RTO + +### Runbooks and Documentation +- [ ] **Incident response runbooks**: Database outage, API errors, security incidents +- [ ] **Operational procedures**: Deployment, rollback, scaling +- [ ] **Architecture documentation**: System diagram, data flows, integrations +- [ ] **API documentation**: Endpoint reference, authentication guide + +**Success criteria**: On-call engineer can respond to P1 incident using runbooks alone + +**Operational Status**: ✅ READY (all criteria met) +``` + + +## Test and Evaluation Documentation + +### Test Summary Report + +```markdown +# Test Summary Report: Customer Portal Launch + +## Test Objectives +1. Verify functional requirements (user registration, login, profile management) +2. Validate performance requirements (p95 latency <200ms, support 1000 concurrent users) +3. 
Confirm security requirements (authentication, authorization, data encryption) + +## Test Methodology + +### Functional Testing +- **Unit tests**: 487 tests, 100% pass rate +- **Integration tests**: 156 tests, 100% pass rate +- **End-to-end tests**: 45 scenarios, 44 passed, 1 defect (LOW severity, workaround available) + +### Performance Testing +- **Load test**: 1000 concurrent users, 10,000 requests/min + - p50 latency: 45ms ✅ + - p95 latency: 180ms ✅ (target: <200ms) + - p99 latency: 350ms ✅ (target: <500ms) + - Error rate: 0.02% ✅ (target: <0.1%) + +### Security Testing +- **Vulnerability scan**: Nessus scan completed 2024-03-15 + - Critical: 0 ✅ + - High: 0 ✅ + - Medium: 3 (remediated) + - Low: 8 (accepted risk) +- **Penetration test**: External pentest completed 2024-03-18 + - HIGH findings: 1 (SQL injection, fixed on 2024-03-19) + - MEDIUM findings: 2 (remediated) + +## Defect Summary + +| Defect ID | Severity | Description | Status | Disposition | +|-----------|----------|-------------|--------|-------------| +| DEF-001 | LOW | Profile image upload fails for files >10MB | Open | Workaround: Resize before upload (documented) | +| DEF-002 | MEDIUM | Password reset email delayed 5-10 minutes | Fixed | Fixed on 2024-03-20 | +| DEF-003 | HIGH | SQL injection in /api/users | Fixed | Fixed on 2024-03-19, re-tested | + +## Test Completion Criteria + +- [ ] ✅ All HIGH/CRITICAL defects fixed +- [ ] ✅ All MEDIUM defects fixed or have workarounds +- [ ] ✅ LOW defects documented (1 open, workaround available) +- [ ] ✅ Performance requirements met +- [ ] ✅ Security requirements met (no open HIGH/CRITICAL findings) + +**Test Status**: ✅ PASSED (all criteria met, 1 LOW defect acceptable) +``` + + +## Go-Live Approval Package + +### Executive Summary + +```markdown +# Go-Live Approval Request: Customer Portal + +## System Overview +**System Name**: Customer Portal +**Purpose**: Enable customers to self-serve account management, reducing support tickets by 40% +**Business
Value**: $2M annual revenue enabler (enterprise customers require self-service portal) +**Target Launch**: 2024-04-01 + +## Readiness Status + +### Infrastructure: ✅ READY +- All production servers provisioned and tested +- Auto-scaling configured +- Backups automated and tested (1-hour RTO/RPO) + +### Operations: ✅ READY +- Monitoring and alerting configured +- Runbooks complete +- On-call rotation staffed (3 SREs, 2 backend engineers) + +### Testing: ✅ PASSED +- Functional tests: 100% pass (1 LOW defect with workaround) +- Performance tests: p95 latency 180ms (target: <200ms) +- Security tests: 0 HIGH/CRITICAL findings + +### Security Authorization: ✅ AUTHORIZED +- ATO granted on 2024-03-25 (valid for 3 years) +- POA&M with 2 LOW-risk items (tracked, non-blocking) + +## Residual Risks + +### Risk 1: Performance Degradation Above 1000 Users (MEDIUM) +**Description**: Load testing validated 1000 concurrent users. Performance above 1000 users unknown. +**Mitigation**: +- Auto-scaling configured to add capacity at 70% CPU +- Gradual rollout plan (100 users week 1, 500 week 2, all users week 4) +- Performance monitoring with alerts at 800ms latency threshold +**Accepted by**: CTO on 2024-03-28 + +### Risk 2: Profile Image Upload Limitation (LOW) +**Description**: Images >10MB fail to upload (DEF-001) +**Mitigation**: +- Workaround documented in user help center +- Fix planned for v1.1 release (2024-05-01) +**Accepted by**: Product Manager on 2024-03-28 + +## Launch Criteria + +### Success Metrics +**Immediate (Week 1)**: +- Uptime: >99% (target: 99.9%) +- Error rate: <0.5% (target: <0.1%) +- p95 latency: <300ms (target: <200ms) + +**Medium-term (Month 1)**: +- User adoption: 30% of customers use portal +- Support ticket reduction: 20% decrease + +### Abort Criteria +**Immediate rollback if**: +- Uptime drops below 95% for >1 hour +- Error rate exceeds 5% +- Data breach or security incident +- Critical functionality broken for >50% of users + +### Monitoring Plan +- 
**Real-time**: Grafana dashboard monitored by on-call +- **Daily**: Morning standup reviews previous 24 hours +- **Weekly**: Executive summary report (metrics vs targets) + +## Rollback Plan + +**Trigger**: Any abort criterion met + +**Rollback Procedure** (30 minutes): +1. Enable maintenance page +2. Scale production deployment to 0 replicas +3. Restore database from pre-launch backup (if data changes occurred) +4. Re-enable previous customer support workflow +5. Communicate to customers via email + +**Testing**: Rollback procedure tested in staging on 2024-03-27 (successful, 25-minute duration) + +## Recommendation + +**Status**: ✅ APPROVED FOR LAUNCH + +All readiness criteria met. Residual risks identified and accepted by stakeholders. Launch criteria defined with clear success metrics and abort criteria. Rollback plan tested and ready. + +**Requested Approval**: Executive Go-Live Approval + +**Approvals Required**: +- [ ] VP Engineering (technical readiness) +- [ ] CTO (security and risk acceptance) +- [ ] VP Product (business value and user experience) +- [ ] CFO (budget and revenue impact) +``` + + +## Operational Handover Checklist + +```markdown +# Operational Handover: Customer Portal + +## Knowledge Transfer + +### Documentation Delivered +- [ ] ✅ Architecture documentation (`/docs/architecture.md`) +- [ ] ✅ API reference (`/docs/api-reference.md`) +- [ ] ✅ Runbooks (`/runbooks/` - 12 runbooks) +- [ ] ✅ Deployment procedures (`/docs/deployment.md`) +- [ ] ✅ Troubleshooting guide (`/docs/troubleshooting.md`) + +### Training Completed +- [ ] ✅ On-call training (2024-03-20): 3 SREs, 2 backend engineers +- [ ] ✅ Runbook walkthrough (2024-03-22): All on-call staff +- [ ] ✅ Incident response drill (2024-03-25): Simulated database outage, responded successfully + +### Handoff Meeting +- **Date**: 2024-03-28 +- **Attendees**: Development team (6), Operations team (5), Product (2) +- **Agenda**: + 1. System overview and architecture + 2. 
Common issues and troubleshooting + 3. Escalation paths and contact information + 4. Q&A session +- **Outcome**: ✅ Operations team confident in supporting system + +## Support Model + +### On-Call Rotation +**Primary On-Call**: Rotating weekly schedule (3 SREs) +**Backup On-Call**: Backend engineer (2-person rotation) + +**Schedule**: https://pagerduty.example.com/schedules/customer-portal + +### Escalation Paths + +**P1 (Critical)**: +1. Primary on-call (page immediately) +2. If no response in 5 min → Backup on-call +3. If no resolution in 30 min → Incident commander +4. If ongoing after 1 hour → VP Engineering + +**P2 (High)**: +1. Primary on-call (page) +2. If no response in 15 min → Backup on-call +3. If no resolution in 4 hours → Team lead + +**Contacts**: +- Primary on-call: [PagerDuty link] +- Incident commander: John Doe (+1-555-0100) +- Team lead: Jane Smith (+1-555-0200) +- VP Engineering: Bob Johnson (+1-555-0300) + +### SLA Commitments +**Uptime**: 99.9% (measured monthly) +**Performance**: p95 latency <200ms +**Support Response**: +- P1: 15-minute response, 4-hour resolution target +- P2: 2-hour response, 1-day resolution target +- P3: Next business day response + +## Acceptance Criteria Met + +- [ ] ✅ All documentation delivered and reviewed +- [ ] ✅ Operations team trained +- [ ] ✅ Incident response drill successful +- [ ] ✅ On-call rotation staffed +- [ ] ✅ Escalation paths defined +- [ ] ✅ SLA commitments documented + +**Handover Status**: ✅ COMPLETE + +**Signed Off**: +- Development Team Lead: John Doe (2024-03-28) +- Operations Team Lead: Jane Smith (2024-03-28) +``` + + +## Cross-References + +**Use WITH this skill**: +- `ordis/security-architect/security-authorization-and-accreditation` - For government/defense ATO requirements +- `muna/technical-writer/itil-and-governance-documentation` - For RFC and service documentation + +## Real-World Impact + +**Systems using operational acceptance documentation**: +- **Customer Portal Launch**: Go-live 
approval package enabled same-day executive approval (vs 1-week review cycle). Clear risk acceptance + rollback plan gave confidence to approve. +- **Government System**: Complete readiness checklist (infrastructure, operations, testing, security authorization) passed IRAP assessment on first attempt. Assessor: "Most comprehensive readiness documentation in 5 years". +- **Operational Handover**: Training + runbooks + incident drill enabled junior SRE to respond to P1 database outage successfully within 45 minutes (first week post-handover). + +**Key lesson**: **Comprehensive acceptance documentation (readiness, risks, criteria, handover) enables informed go-live decisions and smooth operational transitions.** diff --git a/skills/using-technical-writer/security-aware-documentation.md b/skills/using-technical-writer/security-aware-documentation.md new file mode 100644 index 0000000..6df9fb9 --- /dev/null +++ b/skills/using-technical-writer/security-aware-documentation.md @@ -0,0 +1,503 @@ + +# Security-Aware Documentation + +## Overview + +Document systems without compromising security. Core principle: **Documentation should inform, not expose**. + +**Key insight**: Examples with real credentials/PII leak secrets. Obviously fake examples are safe and clear. + +## When to Use + +Load this skill when: +- Documenting authentication/authorization +- Creating API examples with credentials +- Writing about systems handling PII or classified data +- Documenting security features + +**Symptoms you need this**: +- "How do I show API key example without exposing real keys?" 
+- Writing documentation for healthcare/finance systems +- Creating examples with user data +- Documenting security configurations + +**Don't use for**: +- General documentation without security concerns +- Internal-only docs (though still good practice) + +## Sanitizing Examples + +### Rule 1: Never Use Real Credentials + +❌ **WRONG**: +```bash +# Don't mask real secrets +curl -H "Authorization: Bearer sk_live_51Hx***REDACTED***" \ + https://api.example.com/users +``` + +**Problem**: Pattern `sk_live_51Hx...` suggests real Stripe key structure. Reader might think they should unmask it. + +✅ **RIGHT**: +```bash +# Generate obviously fake credentials +curl -H "Authorization: Bearer fake_key_abc123_for_docs_only" \ + https://api.example.com/users +``` + +**Better**: Clearly fake, no confusion possible. + + +### Rule 2: Use Obviously Fake Values + +**Fake Credentials Pattern**: +``` +fake_[type]_[random]_for_docs_only +``` + +**Examples**: +- API key: `fake_api_key_abc123_for_docs_only` +- JWT token: `eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.FAKE_TOKEN_FOR_DOCUMENTATION.fake_signature_do_not_use` +- Database password: `fake_password_p@ssw0rd_example_only` +- SSH key: `fake_ssh_key_AAAAB3NzaC1yc2EAAAADAQABAAABAQ... (truncated for docs)` + +**Fake PII Pattern**: +``` +[common_name]@example.com +000-00-0000 (SSN) ++1-555-0123 (phone - 555-0100 through 555-0199 are reserved for fiction) +``` + +**Examples**: +- Email: `jane.doe@example.com`, `user@example.org` +- SSN: `000-00-0000`, `666-00-0000` (never issued: area numbers 000 and 666 are invalid) +- Phone: `+1-555-0100`, `+1-555-0199` +- Address: `123 Main Street, Anytown, ST 12345` + + +### Rule 3: Use Reserved Domains + +**Reserved for documentation** (RFC 2606): +- `example.com` +- `example.net` +- `example.org` +- `*.test`, `*.example`, `*.invalid` (reserved top-level domains; note that `test.com` is a real registered domain, not a reserved one) + +❌ **WRONG**: +```javascript +const API_URL = 'https://api.acmecorp.com'; // Real company?
+``` + +✅ **RIGHT**: +```javascript +const API_URL = 'https://api.example.com'; // Obviously fake +``` + + +### Rule 4: Complete Fake Examples (Not Partial) + +❌ **WRONG**: +```python +# Partial example - reader must guess +api_key = "YOUR_API_KEY_HERE" +client = APIClient(api_key) +``` + +**Problem**: Reader doesn't know what `YOUR_API_KEY_HERE` should look like. + +✅ **RIGHT**: +```python +# Complete fake example - copy-paste-run (with fake backend) +api_key = "fake_api_key_abc123_for_docs_only" +client = APIClient(api_key) + +# For real usage: +# 1. Get API key from https://dashboard.example.com/settings +# 2. Replace fake_api_key_abc123_for_docs_only with your real key +# 3. Never commit real keys to git +``` + +**Better**: Complete example + clear instructions for real usage. + + +## Threat Disclosure Decisions + +### Document: Security Features Users Must Configure + +✅ **DO document**: +```markdown +## Security Configuration + +### Enable MFA (Required for Production) + +Multi-factor authentication prevents unauthorized access even if passwords are compromised. + +Enable MFA for all admin accounts: +\```bash +user-admin mfa enable --user admin@example.com --method totp +\``` + +**Security impact**: Without MFA, stolen passwords grant full access. +``` + + +### Document: Security Best Practices + +✅ **DO document**: +```markdown +## Hardening Guide + +### Disable Unused Services + +**Threat**: Unused services increase attack surface. + +Disable SSH if not needed: +\```bash +systemctl disable sshd +systemctl stop sshd +\``` + +**Verification**: `systemctl status sshd` should show "inactive (dead)" +``` + + +### Don't Document: Specific Vulnerabilities + +❌ **DON'T document**: +```markdown +## Known Issues + +### CVE-2024-12345: SQL Injection in /api/users Endpoint + +Vulnerable code in `user_controller.py:45`: +\```python +query = f"SELECT * FROM users WHERE id = {user_id}" # Vulnerable! 
+\``` + +Attacker can inject SQL via: `/api/users?id=1 OR 1=1` +``` + +**Problem**: Provides exploit guide to attackers. + +✅ **DO instead**: Coordinate with security team for disclosure. Document fix after patch released: +```markdown +## Security Updates + +### Version 2.1.5 (2024-03-15) + +**Security fix**: Resolved input validation issue in user API (CVE-2024-12345). +Upgrade immediately. + +For technical details, see our security advisory: [link] +``` + + +### Don't Document: Internal Security Architecture (Unless Necessary) + +❌ **DON'T document** (in public docs): +```markdown +## Internal Security Architecture + +Our secrets vault runs on ec2-10-0-1-50.internal with: +- Port 8200 (HTTP API) +- Port 8201 (cluster communication) +- Root token stored in S3 bucket: company-secrets-prod +- Unsealing keys split across: admin1@company.com, admin2@company.com, admin3@company.com +``` + +**Problem**: Reveals infrastructure details aiding attackers (IP addresses, ports, bucket names, key custodians). + +✅ **DO instead**: Document what users need, abstract internals: +```markdown +## Secrets Management + +Secrets are stored in an encrypted vault. To access secrets: + +1. Request access via [access-request-form] +2. Use provided vault token +3. Tokens expire after 8 hours + +See [secrets-access-guide] for details. +``` + + +## Compliance Sensitivity + +### Rule: Document Controls Without Revealing Weaknesses + +❌ **WRONG**: +```markdown +## SOC2 Compliance + +### Access Control (CC6.1) + +**Control**: Role-based access control (RBAC) + +**Current gaps**: +- Admin users can bypass RBAC via debug mode (known issue #245) +- No access reviews conducted (planned for Q3) +- 3 dormant admin accounts still active (cleanup delayed) +``` + +**Problem**: Audit report publicly reveals control weaknesses. 
+ +✅ **RIGHT**: +```markdown +## SOC2 Compliance + +### Access Control (CC6.1) + +**Control**: Role-based access control (RBAC) + +**Implementation**: +- Authentication: MFA required for admin accounts +- Authorization: Permissions enforced at API layer and database layer +- Access reviews: Quarterly review of all privileged accounts +- Account lifecycle: Automated disablement after 30 days inactivity + +**Audit evidence**: Available to authorized auditors via [compliance-portal] +``` + +**Better**: Focus on what exists, keep gaps internal. + + +## Redaction Patterns + +### Logs: Redact Sensitive Data + +❌ **WRONG**: +``` +[2024-03-15 10:23:45] User login: user=john.smith@acme.com, password=MyP@ssw0rd123, token=eyJhbGci... +[2024-03-15 10:24:12] API call: GET /users/12345, auth_token=sk_live_51HxAbC123... +``` + +✅ **RIGHT**: +``` +[2024-03-15 10:23:45] User login: user=john.smith@acme.com, password=[REDACTED], token=[REDACTED] +[2024-03-15 10:24:12] API call: GET /users/12345, auth_token=[REDACTED] +``` + +**Even better** (for docs): Redacting credentials still leaves a real username and user ID in the log, so replace those with fake data too: +``` +[2024-03-15 10:23:45] User login: user=jane.doe@example.com, password=[REDACTED], token=[REDACTED] +[2024-03-15 10:24:12] API call: GET /users/67890, auth_token=[REDACTED] +``` + + +### Screenshots: Blur Sensitive Data + +**Before sharing screenshot**: +1. Use test account (user@example.com, fake data) +2. Blur any real data (names, emails, IDs) +3. 
Use browser extensions: "Redact for Screenshot" + +❌ **WRONG**: Screenshot with production data visible + +✅ **RIGHT**: Screenshot with: +- Fake user names (Jane Doe, John Smith) +- Fake emails (jane@example.com) +- Blurred real data if any leaked + + +### Diagrams: Anonymize Infrastructure + +❌ **WRONG**: +``` +[Load Balancer: lb-prod-01.company.com (52.12.34.56)] + ↓ +[API Servers: api-01.internal (10.0.1.10), api-02.internal (10.0.1.11)] + ↓ +[Database: postgres-master.internal (10.0.2.50)] +``` + +✅ **RIGHT**: +``` +[Load Balancer: lb.example.com (203.0.113.10)] # RFC 5737 documentation IP + ↓ +[API Servers: api-01, api-02 (192.168.1.10-11)] # RFC 1918 private IP + ↓ +[Database: postgres-master (192.168.2.50)] +``` + + +### Database Schemas: Use Synthetic Data + +❌ **WRONG**: +```sql +-- Example users table +SELECT * FROM users LIMIT 3; + +| id | email | ssn | credit_card | +|----|--------------------------|-------------|------------------| +| 1 | alice@acmecorp.com | 123-45-6789 | 4532-1234-5678-9012 | +| 2 | bob@acmecorp.com | 987-65-4321 | 5105-1051-0510-5100 | +| 3 | charlie@acmecorp.com | 456-78-9123 | 3782-822463-10005 | +``` + +✅ **RIGHT**: +```sql +-- Example users table (synthetic data) +SELECT * FROM users LIMIT 3; + +| id | email | ssn | credit_card | +|----|------------------------|---------------|------------------| +| 1 | alice@example.com | 000-00-0001 | 0000-0000-0000-0001 | +| 2 | bob@example.com | 000-00-0002 | 0000-0000-0000-0002 | +| 3 | charlie@example.com | 000-00-0003 | 0000-0000-0000-0003 | +``` + + +## Security Feature Documentation + +### Pattern: Threat + Configuration + Impact + +```markdown +## Security Feature: [Name] + +### Threat Prevented +[What attack does this prevent?] + +### Configuration +[How to enable/configure?] +\```bash +[Example commands] +\``` + +### Security Impact +**If enabled**: [What protection do you get?] +**If disabled**: [What risk remains?] + +### Verification +[How to verify it's working?] 
+\```bash +[Test commands] +\``` +``` + +### Example: Rate Limiting + +```markdown +## Security Feature: API Rate Limiting + +### Threat Prevented +**Brute force attacks**: Attacker attempts thousands of login requests to guess passwords. +**DoS attacks**: Attacker overwhelms API with excessive requests. + +### Configuration + +Enable rate limiting in `config.yaml`: +\```yaml +rate_limiting: + enabled: true + max_requests: 100 # per minute per IP + window: 60 # seconds + block_duration: 300 # 5 minutes +\``` + +Restart API server: +\```bash +systemctl restart api-server +\``` + +### Security Impact +**If enabled**: +- Brute force attack limited to 100 attempts/minute (vs unlimited) +- Single IP cannot DoS entire service +- Legitimate users unaffected (typical usage: 10-20 req/min) + +**If disabled**: +- Attacker can attempt 1000s of passwords per minute +- Single attacker can exhaust server resources +- No protection against credential stuffing attacks + +### Verification + +Test rate limit: +\```bash +# Attempt 101 requests in 1 minute +for i in {1..101}; do + curl https://api.example.com/login -d "user=test&pass=fake" +done + +# Expected: First 100 succeed, 101st returns: +# HTTP 429 Too Many Requests +# Retry-After: 60 +\``` + +Check logs: +\```bash +grep "rate_limit_exceeded" /var/log/api-server.log +# Should show: [2024-03-15 10:25:45] Rate limit exceeded: IP 203.0.113.10, endpoint /login +\``` +``` + + +## Quick Reference: Sanitization Checklist + +| Data Type | ❌ Wrong | ✅ Right | +|-----------|---------|---------| +| **API Key** | `sk_live_***REDACTED***` | `fake_api_key_abc123_for_docs_only` | +| **JWT Token** | `eyJhbGci...` (real masked) | `eyJhbGci...FAKE_TOKEN_FOR_DOCUMENTATION...` | +| **Email** | `john.smith@acme.com` | `jane.doe@example.com` | +| **SSN** | `***-**-1234` | `000-00-0000` | +| **Phone** | `(555) ***-1234` | `+1-555-0100` | +| **IP Address** | `52.12.34.56` (real AWS) | `203.0.113.10` (RFC 5737 docs) | +| **Domain** | `api.acme.com` | 
`api.example.com` | +| **Database Password** | `p@ssw***` | `fake_password_example_only` | + + +## Common Mistakes + +### ❌ Masking Real Secrets + +**Wrong**: `sk_live_***REDACTED***` (pattern suggests real key) + +**Right**: `fake_api_key_abc123_for_docs_only` (obviously fake) + +**Why**: Masked secrets still leak structure. Readers might try to unmask or think it's production data. + + +### ❌ Using Real Company Names + +**Wrong**: `curl https://api.acmecorp.com` (might be real company) + +**Right**: `curl https://api.example.com` (reserved for docs) + +**Why**: Avoid accidental real company references. Use RFC-designated example domains. + + +### ❌ Documenting Exploits Before Patch + +**Wrong**: Publish CVE details with exploit code before customers patch + +**Right**: Coordinate disclosure with security team, publish after patch available + +**Why**: Responsible disclosure prevents weaponizing vulnerabilities before users can protect themselves. + + +### ❌ Incomplete Redaction + +**Wrong**: Redact password but leave username + server IP + +**Right**: Redact all PII/credentials and use example IPs + +**Why**: Partial redaction still enables attacks. Usernames + server IPs = reconnaissance. + + +## Cross-References + +**Use WITH this skill**: +- `muna/technical-writer/clarity-and-style` - Write clear security documentation +- `ordis/security-architect/threat-modeling` - Understand threats documentation might expose + +**Use AFTER this skill**: +- `muna/technical-writer/documentation-testing` - Verify examples work (with fake credentials) + +## Real-World Impact + +**Projects using security-aware documentation**: +- **API Documentation (Healthcare)**: Sanitized all examples with `jane.doe@example.com`, `fake_api_key_...`. Prevented accidental PII exposure in publicly-accessible docs. +- **OAuth Flow Tutorial**: Used complete fake examples (`client_id=fake_client_abc123`) vs placeholders (`YOUR_CLIENT_ID_HERE`). 
Support tickets reduced 60% ("I don't know what client ID looks like"). +- **Database Migration Guide**: Used synthetic data (SSN: 000-00-0000) vs redacted real data (SSN: ***-**-1234). Compliance audit passed with "exemplary PII handling in documentation". + +**Key lesson**: **Obviously fake examples are clearer and safer than masked real data. Complete fake examples enable copy-paste-run testing without security risk.**