commit 31df8711f2798fe154aba640a7069146a526d192 Author: Zhongwei Li Date: Sat Nov 29 18:17:15 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..ff41333 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,11 @@ +{ + "name": "ReVa", + "description": "ReVa (Reverse Engineering Assistant) - AI-assisted binary analysis and reverse engineering with Ghidra integration", + "version": "0.1.0", + "author": { + "name": "CyberKaida" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..994f995 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# ReVa + +ReVa (Reverse Engineering Assistant) - AI-assisted binary analysis and reverse engineering with Ghidra integration diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..248465c --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,81 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:cyberkaida/reverse-engineering-assistant:ReVa", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "56888177ef0ed432400989e30ea56de6fc360475", + "treeHash": "a829b46c2d1e4cbe5ab9f54828617c9349a4152d132ee488e0230b337f1c98a7", + "generatedAt": "2025-11-28T10:15:59.650961Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "ReVa", + "description": "ReVa (Reverse Engineering Assistant) - AI-assisted binary analysis and reverse engineering with Ghidra integration", + "version": "0.1.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "c1bed8b57181a1566c8e5acc443b1b1888b39c2dc958cb261d71411562fea3f3" + }, + { + "path": 
".claude-plugin/plugin.json", + "sha256": "4ed4eefa86199601e122a65d1b82c0bc8ef7f185eff69d8d217fca2407079347" + }, + { + "path": "skills/binary-triage/SKILL.md", + "sha256": "1a79f775c1d114ba2cad0da4f9bc7d2875a424cd7f368b89ee61063e2faf50ad" + }, + { + "path": "skills/deep-analysis/examples.md", + "sha256": "873165c68cc1494e952f569b7d04cafdfa0bdd6969f14ca9a884f4bca8bb98d3" + }, + { + "path": "skills/deep-analysis/patterns.md", + "sha256": "19426f103d89b05780c61ff75b968dc60a28fab9512df2aceb128c14f258b4cb" + }, + { + "path": "skills/deep-analysis/SKILL.md", + "sha256": "4ca4e2ce49187c484dcc667ff0a71fc9043de9e8c7609247f328bf3bbd104b58" + }, + { + "path": "skills/ctf-pwn/patterns.md", + "sha256": "279ab032ad2632706026a511a72d43149626b3955de4d4afcd08c712e03cc990" + }, + { + "path": "skills/ctf-pwn/SKILL.md", + "sha256": "d762a5f133bee75656dcd5fc37da378922cb13c7ab8e7b47f416be158d044836" + }, + { + "path": "skills/ctf-rev/patterns.md", + "sha256": "94d799060dff9828963e108bc41963f4a490c7296a8e11d5b2d763fdb973f3cc" + }, + { + "path": "skills/ctf-rev/SKILL.md", + "sha256": "b917b8d9d0bf646cf9ef6c557c3c6b9a8487c485575281456c2f3370d8b5048c" + }, + { + "path": "skills/ctf-crypto/patterns.md", + "sha256": "2c624bafeb0684faf9932707189e5bbbc52f4f751faeba88b7904a0d686d1fac" + }, + { + "path": "skills/ctf-crypto/SKILL.md", + "sha256": "edd806cef8356a5fb1c9787f8a37aa31366968c7873dceaab05019050c7e2d61" + } + ], + "dirSha256": "a829b46c2d1e4cbe5ab9f54828617c9349a4152d132ee488e0230b337f1c98a7" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/binary-triage/SKILL.md b/skills/binary-triage/SKILL.md new file mode 100644 index 0000000..055bba2 --- /dev/null +++ b/skills/binary-triage/SKILL.md @@ -0,0 +1,155 @@ +--- +name: binary-triage +description: Performs initial binary triage by surveying memory layout, strings, imports/exports, and functions to quickly understand what a binary does and identify 
suspicious behavior. Use when first examining a binary, when user asks to triage/survey/analyze a program, or wants an overview before deeper reverse engineering. +--- + +# Binary Triage + +## Instructions +We are triaging a binary to quickly understand what it does. This is an initial survey, not deep analysis. Our goal is to: +1. Identify key components and behaviors +2. Flag suspicious or interesting areas +3. Create a task list of next steps for deeper investigation + +## Binary triage with ReVa + +Follow this systematic workflow using ReVa's MCP tools: + +### 1. Identify the Program +- Use `get-current-program` to see the active program +- Or use `list-project-files` to see available programs in the project +- Note the `programPath` (e.g., "/Hatchery.exe") for use in subsequent tools + +### 2. Survey Memory Layout +- Use `get-memory-blocks` to understand the binary structure +- Examine key sections: + - `.text` - executable code + - `.data` - initialized data + - `.rodata` - read-only data (strings, constants) + - `.bss` - uninitialized data +- Flag unusual characteristics: + - Unusually large sections + - Packed/encrypted sections + - Executable data sections + - Writable code sections + +### 3. Survey Strings +- Use `get-strings-count` to see total string count +- Use `get-strings` with pagination (100-200 strings at a time) +- Look for indicators of functionality or malicious behavior: + - **Network**: URLs, IP addresses, domain names, API endpoints + - **File System**: File paths, registry keys, configuration files + - **APIs**: Function names, library references + - **Messages**: Error messages, debug strings, log messages + - **Suspicious Keywords**: admin, password, credential, token, crypto, encrypt, decrypt, download, execute, inject, shellcode, payload + +### 4. 
Survey Symbols and Imports +- Use `get-symbols-count` with `includeExternal=true` to count imports +- Use `get-symbols` with `includeExternal=true` and `filterDefaultNames=true` +- Focus on external symbols (imports from libraries) +- Flag interesting/suspicious imports by category: + - **Network APIs**: connect, send, recv, WSAStartup, getaddrinfo, curl_*, socket + - **File I/O**: CreateFile, WriteFile, ReadFile, fopen, fwrite, fread + - **Process Manipulation**: CreateProcess, exec, fork, system, WinExec, ShellExecute + - **Memory Operations**: VirtualAlloc, VirtualProtect, mmap, mprotect + - **Crypto**: CryptEncrypt, CryptDecrypt, EVP_*, AES_*, bcrypt, RC4 + - **Anti-Analysis**: IsDebuggerPresent, CheckRemoteDebuggerPresent, ptrace + - **Registry**: RegOpenKey, RegSetValue, RegQueryValue +- Note the ratio of imports to total symbols (heavy import usage may indicate reliance on libraries) + +### 5. Survey Functions +- Use `get-function-count` with `filterDefaultNames=true` to count named functions +- Use `get-function-count` with `filterDefaultNames=false` to count all functions +- Calculate ratio of named vs unnamed functions (high unnamed ratio = stripped binary) +- Use `get-functions` with `filterDefaultNames=true` to list named functions +- Identify key functions: + - **Entry points**: `entry`, `start`, `_start` + - **Main functions**: `main`, `WinMain`, `DllMain`, `_main` + - **Suspicious names**: If not stripped, look for revealing function names + +### 6. Cross-Reference Analysis for Key Findings +- For interesting strings found in Step 3: + - Use `find-cross-references` with `direction="to"` and `includeContext=true` + - Identify which functions reference suspicious strings +- For suspicious imports found in Step 4: + - Use `find-cross-references` with `direction="to"` and `includeContext=true` + - Identify which functions call suspicious APIs +- This helps prioritize which functions need detailed examination + +### 7. 
Selective Initial Decompilation +- Use `get-decompilation` on entry point or main function + - Set `limit=30` to get ~30 lines initially + - Set `includeIncomingReferences=true` to see callers + - Set `includeReferenceContext=true` for context snippets +- Use `get-decompilation` on 1-2 suspicious functions identified in Step 6 + - Set `limit=20-30` for quick overview +- Look for high-level patterns: + - Loops (encryption/decryption routines) + - Network operations + - File operations + - Process creation + - Suspicious control flow (obfuscation indicators) +- **Do not do deep analysis yet** - this is just to understand general behavior + +### 8. Document Findings and Create Task List +- Use the `TodoWrite` tool to create an actionable task list with items like: + - "Investigate string 'http://malicious-c2.com' (referenced at 0x00401234)" + - "Decompile function sub_401000 (calls VirtualAlloc + memcpy + CreateThread)" + - "Analyze crypto usage in function encrypt_payload (uses CryptEncrypt)" + - "Trace anti-debugging checks (IsDebuggerPresent at 0x00402000)" + - "Examine packed section .UPX0 for unpacking routine" +- Each todo should be: + - Specific (include addresses, function names, strings) + - Actionable (what needs to be investigated) + - Prioritized (most suspicious first) + +## Output Format + +Present triage findings to the user in this structured format: + +### Program Overview +- **Name**: [Program name from programPath] +- **Type**: [Executable type - PE, ELF, Mach-O, etc.] +- **Platform**: [Windows, Linux, macOS, etc.] 
+ +### Memory Layout +- **Total Size**: [Size in bytes/KB/MB] +- **Key Sections**: [List main sections with sizes and permissions] +- **Unusual Characteristics**: [Any packed/encrypted/suspicious sections] + +### String Analysis +- **Total Strings**: [Count from get-strings-count] +- **Notable Findings**: [Bullet list of interesting strings with context] +- **Suspicious Indicators**: [URLs, IPs, suspicious keywords found] + +### Import Analysis +- **Total Symbols**: [Count from get-symbols-count] +- **External Imports**: [Count of external symbols] +- **Key Libraries**: [Main libraries imported] +- **Suspicious APIs**: [Categorized list of concerning imports] + +### Function Analysis +- **Total Functions**: [Count with filterDefaultNames=false] +- **Named Functions**: [Count with filterDefaultNames=true] +- **Stripped Status**: [Yes/No based on ratio] +- **Entry Point**: [Address and name] +- **Main Function**: [Address and name] +- **Key Functions**: [List of important functions identified] + +### Suspicious Indicators +[Bulleted list of red flags discovered, prioritized by severity] + +### Recommended Next Steps +[Present the task list created in Step 8] +- Each item should be specific and actionable +- Prioritize by severity/importance +- Include addresses, function names, and context + +## Important Notes + +- **Speed over depth**: This is triage, not full analysis. Move quickly through steps. +- **Use pagination**: Don't request thousands of strings/functions at once. Use chunks of 100-200. +- **Focus on anomalies**: Flag things that are unusual, suspicious, or interesting. +- **Context is key**: When using cross-references, enable `includeContext=true` for code snippets. +- **Create actionable todos**: Each next step should be specific enough for another agent to execute. +- **Be systematic**: Follow all 8 steps in order for comprehensive coverage. 
diff --git a/skills/ctf-crypto/SKILL.md b/skills/ctf-crypto/SKILL.md new file mode 100644 index 0000000..4254f5b --- /dev/null +++ b/skills/ctf-crypto/SKILL.md @@ -0,0 +1,323 @@ +--- +name: ctf-crypto +description: Solve CTF cryptography challenges by identifying, analyzing, and exploiting weak crypto implementations in binaries to extract keys or decrypt data. Use for custom ciphers, weak crypto, key extraction, or algorithm identification. +--- + +# CTF Cryptography + +## Purpose + +You are a cryptographic implementation investigator for CTF challenges. Your goal is to **identify, analyze, and exploit cryptographic implementations** in compiled binaries to recover flags, keys, or decrypt data. + +Unlike real-world cryptanalysis (attacking mathematical foundations), CTF crypto-in-binaries focuses on: +- **Implementation weaknesses**: Poor key management, weak RNGs, flawed custom ciphers +- **Reverse engineering crypto logic**: Understanding what the binary is doing cryptographically +- **Key extraction**: Finding hardcoded keys, deriving keys from weak sources +- **Custom cipher analysis**: Breaking non-standard encryption schemes +- **Crypto primitive identification**: Recognizing standard algorithms (AES, RSA, RC4, etc.) + +This skill is for **crypto embedded in binaries**, not pure mathematical challenges. + +## Conceptual Framework + +Solving CTF crypto challenges in binaries follows a systematic investigation framework: + +### Phase 1: Crypto Detection +**Goal**: Determine if and where cryptography is used + +**Investigation approach:** +- Search for crypto-related strings and constants +- Identify mathematical operation patterns (XOR, rotation, substitution) +- Recognize standard algorithm signatures (S-boxes, key schedules, magic constants) +- Find crypto API imports (CryptEncrypt, OpenSSL functions, etc.) + +**Key question**: "Is there crypto, and if so, what kind?" 
+ +### Phase 2: Algorithm Identification +**Goal**: Determine what cryptographic algorithm is being used + +**Investigation approach:** +- Compare constants to known crypto constants (initialization vectors, S-boxes) +- Analyze operation patterns (rounds, block sizes, data flow) +- Match code structure to known algorithm patterns +- Check for library usage vs. custom implementation + +**Key question**: "What algorithm is this, or is it custom?" + +### Phase 3: Implementation Analysis +**Goal**: Understand how the crypto is implemented and find weaknesses + +**Investigation approach:** +- Trace key material sources (hardcoded, derived, user input) +- Analyze key generation/derivation logic +- Identify mode of operation (ECB, CBC, CTR, etc.) +- Look for implementation mistakes (IV reuse, weak RNG, etc.) +- Check for custom modifications to standard algorithms + +**Key question**: "How is it implemented, and where are the weaknesses?" + +### Phase 4: Key Extraction or Breaking +**Goal**: Recover the key or break the implementation to decrypt data + +**Investigation approach:** +- Extract hardcoded keys from binary data +- Exploit weak key derivation (predictable RNG, poor entropy) +- Break custom ciphers (frequency analysis, known-plaintext, etc.) +- Leverage implementation flaws (timing, side channels, logic errors) +- Reverse engineer decryption routines to understand transformation + +**Key question**: "How do I recover the plaintext or key?" + +## Core Methodologies + +### Methodology 1: String and Constant Analysis + +**When to use**: Initial discovery phase + +**Approach**: +1. Search for crypto keywords in strings +2. Search for URLs, API endpoints that might receive encrypted data +3. Locate large constant arrays (potential S-boxes, lookup tables) +4. Compare constants to known crypto constants databases +5. 
Follow cross-references from strings/constants to crypto functions + +**Tools**: +- `search-strings-regex` for crypto keywords +- `get-strings-by-similarity` for algorithm names +- `read-memory` to inspect constant arrays +- `find-cross-references` to trace usage + +### Methodology 2: Pattern Recognition + +**When to use**: Identifying algorithm type + +**Approach**: +1. Look for characteristic loop structures (round counts) +2. Identify substitution operations (table lookups) +3. Recognize permutation patterns (bit shuffling) +4. Spot modular arithmetic (public-key crypto) +5. Match to known algorithm patterns (see patterns.md) + +**Tools**: +- `get-decompilation` with context to see algorithm structure +- `search-decompilation` for operation patterns +- Pattern reference (patterns.md) for recognition + +### Methodology 3: Data Flow Analysis + +**When to use**: Understanding key management and data flow + +**Approach**: +1. Trace where plaintext/ciphertext enters the system +2. Follow key material from source to usage +3. Identify transformation steps (encrypt, decrypt, derive) +4. Map data dependencies between functions +5. Find where decrypted output is used or stored + +**Tools**: +- `find-cross-references` with context for data flow +- `rename-variables` to clarify data roles (plaintext, key, iv) +- `change-variable-datatypes` to reflect crypto types (uint8_t*, etc.) + +### Methodology 4: Weakness Discovery + +**When to use**: Finding exploitable flaws in implementation + +**Common implementation weaknesses in CTF challenges**: +- Hardcoded keys in binary (directly extractable) +- Weak key derivation (time-based seeds, simple XOR) +- Poor random number generation (predictable, seeded with constant) +- ECB mode (enables block analysis and manipulation) +- IV reuse or predictable IVs +- Custom ciphers with mathematical weaknesses +- Incomplete key schedules or reduced rounds +- Debug/test modes that bypass crypto + +**Investigation strategy**: +1. 
Check if key is hardcoded (read memory at key pointer) +2. Analyze RNG initialization (is seed predictable?) +3. Check for mode of operation weaknesses (ECB patterns) +4. Look for test/debug backdoors +5. Identify custom modifications to standard algorithms + +### Methodology 5: Reverse Engineering Decryption + +**When to use**: When you need to understand or replicate crypto logic + +**Approach**: +1. Find decryption routine (may be encryption run backwards) +2. Rename variables systematically (key, plaintext, ciphertext, state) +3. Apply correct data types (byte arrays, word arrays) +4. Document each transformation step with comments +5. Replicate logic in Python script to test understanding +6. Use binary's own decryption routine if possible + +**Tools**: +- `rename-variables` for clarity +- `change-variable-datatypes` for correctness +- `set-decompilation-comment` to document understanding +- `set-bookmark` to mark important crypto functions + +## Flexible Workflow + +CTF crypto challenges vary widely, so adapt this workflow to your specific challenge: + +### Quick Triage (5 minutes) +1. **Detect**: Search for crypto strings, imports, constants +2. **Identify**: Quick pattern match to known algorithms +3. **Assess**: Is it standard crypto or custom? Strong or weak? + +### Deep Investigation (15-30 minutes) +4. **Understand**: Decompile crypto functions, trace data flow +5. **Improve**: Rename variables, fix types, document behavior +6. **Analyze**: Find key sources, check for weaknesses +7. **Exploit**: Extract keys, break weak implementations, or replicate logic + +### Exploitation (varies) +8. **Extract**: Pull hardcoded keys from binary data +9. **Break**: Exploit weak RNG, custom cipher flaws, or poor key derivation +10. **Decrypt**: Use recovered keys or replicated logic to get flag + +### Verification +11. **Test**: Verify decryption produces readable flag +12. 
**Document**: Save findings in bookmarks and comments + +## Pattern Recognition + +For detailed cryptographic algorithm patterns and recognition techniques, see **patterns.md**. + +Key pattern categories: +- **Block ciphers**: AES, DES, Blowfish (S-boxes, rounds, key schedules) +- **Stream ciphers**: RC4, ChaCha (state evolution, keystream generation) +- **Public key**: RSA, ECC (modular arithmetic, large integers) +- **Hash functions**: MD5, SHA family (compression, magic constants) +- **Simple schemes**: XOR, substitution, custom ciphers + +## CTF-Specific Considerations + +### CTF Challenge Design Patterns + +**Common CTF crypto scenarios**: +1. **Weak custom cipher**: Break via cryptanalysis (frequency, known-plaintext) +2. **Hardcoded key**: Extract from .data section +3. **Weak RNG**: Predict key from time-based or constant seed +4. **Standard crypto, weak key**: Brute-force small keyspace +5. **Implementation bug**: Exploit logic error to bypass crypto +6. **Obfuscated standard**: Recognize despite code obfuscation + +**What CTF crypto is NOT**: +- Pure mathematical cryptanalysis (breaking AES-256 mathematically) +- Side-channel attacks on hardware (timing, power analysis) +- Network protocol attacks (though may combine with binary crypto) +- Breaking modern TLS/SSL implementations + +### Time Management + +**Prioritize based on difficulty**: +1. Hardcoded keys (minutes): Search .data, extract bytes +2. Weak RNG (10-15 min): Analyze seed, predict sequence +3. Simple custom cipher (20-30 min): Frequency analysis, known-plaintext +4. Implementation bugs (15-30 min): Find logic errors, test edge cases +5. Complex custom cipher (30-60 min): Full reverse engineering and breaking + +**Know when to move on**: If you've spent 30 minutes without progress, step back and reassess or try a different challenge. 
+ +## Tool Usage Patterns + +### Discovery Phase +``` +search-strings-regex pattern="(AES|RSA|encrypt|decrypt|crypto|cipher|key)" +get-symbols includeExternal=true → Check for crypto API imports +search-decompilation pattern="(xor|sbox|round|block)" +``` + +### Analysis Phase +``` +get-decompilation includeIncomingReferences=true includeReferenceContext=true +find-cross-references direction="both" includeContext=true +read-memory at suspected key/S-box locations +``` + +### Improvement Phase +``` +rename-variables: {"var_1": "key", "var_2": "plaintext", "var_3": "sbox"} +change-variable-datatypes: {"key": "uint8_t*", "block": "uint8_t[16]"} +apply-data-type: uint8_t[256] to S-box constants +set-decompilation-comment: Document crypto operations +``` + +### Documentation Phase +``` +set-bookmark type="Analysis" category="Crypto" → Mark crypto functions +set-bookmark type="Note" category="Key" → Mark key locations +set-comment → Document assumptions and findings +``` + +## Integration with Other Skills + +### After Binary Triage +If binary-triage identified crypto indicators, start from the addresses and functions in its task list, and check for any bookmarks left at those locations: +``` +search-bookmarks type="Warning" category="Crypto" +search-bookmarks type="TODO" category="Crypto" +``` + +### With Deep Analysis +Use the deep-analysis investigation loop for systematic crypto function analysis: +- READ → Get decompilation +- UNDERSTAND → Match to crypto patterns +- IMPROVE → Rename/retype for clarity +- VERIFY → Re-read to confirm +- FOLLOW → Trace key sources +- TRACK → Document findings + +### Standalone Usage +Use this skill directly when the user explicitly asks about crypto: +- "What encryption is used?" +- "Find the hardcoded key" +- "How does the custom cipher work?" +- "Extract the encryption key" + +## Output Format + +Return structured findings: + +``` +Crypto Analysis Summary: +- Algorithm: [Identified algorithm or "custom cipher"] +- Confidence: [high/medium/low] +- Key Size: [bits/bytes] +- Mode: [ECB, CBC, CTR, etc.
if applicable] + +Evidence: +- [Specific addresses, constants, code patterns] + +Key Material: +- Location: [address of key] +- Source: [hardcoded/derived/user-input] +- Value: [key bytes if extracted] + +Weaknesses Found: +- [List of exploitable weaknesses] + +Exploitation Strategy: +- [How to break/bypass crypto to get flag] + +Database Improvements: +- [Variables renamed, types fixed, comments added] + +Unanswered Questions: +- [Further investigation needed] +``` + +## Remember + +- **Generic approach**: Apply conceptual framework to any crypto implementation +- **Pattern matching**: Use patterns.md for algorithm recognition +- **Implementation focus**: Look for weaknesses in implementation, not mathematical breaks +- **Key extraction**: Most CTF challenges have extractable or derivable keys +- **Document as you go**: Crypto analysis benefits from clear variable naming +- **Time-box your work**: Don't spend hours on cryptanalysis if key extraction is simpler +- **Test assumptions**: Verify your understanding by replicating crypto logic + +Your goal is to **extract the flag**, not to become a cryptographer. Use implementation weaknesses, not mathematical attacks. diff --git a/skills/ctf-crypto/patterns.md b/skills/ctf-crypto/patterns.md new file mode 100644 index 0000000..f34f17b --- /dev/null +++ b/skills/ctf-crypto/patterns.md @@ -0,0 +1,630 @@ +# Cryptographic Pattern Recognition Reference + +This document provides pattern recognition guides for identifying and analyzing cryptographic implementations in compiled binaries. These patterns are generic and apply across different algorithms - focus on conceptual characteristics, not specific implementations. 
+ +## General Crypto Recognition + +### Crypto Presence Indicators + +**High-confidence indicators:** +- Crypto-related strings: "encrypt", "decrypt", "cipher", "AES", "RSA", "key", "hash" +- Crypto library imports: CryptEncrypt, OpenSSL functions, libcrypto +- Large constant arrays (256+ bytes) with seemingly random data +- Heavy use of XOR operations +- Bitwise rotation patterns: `(x << n) | (x >> (32-n))` +- Fixed iteration counts (rounds): 8, 10, 12, 14, 16, 32, 64, 80 +- Modular arithmetic on large integers + +**Medium-confidence indicators:** +- Nested loops with array manipulations +- Byte-level array indexing patterns +- S-box style lookups: `output = table[input]` +- State transformation in fixed-size blocks + +**What to check:** +``` +search-strings-regex pattern="(encrypt|decrypt|crypto|cipher|AES|DES|RSA|RC4|key|hash|salt|iv)" +get-symbols includeExternal=true → Look for crypto API imports +search-decompilation pattern="(xor|sbox|round|permut)" +``` + +## Block Cipher Patterns + +### Conceptual Characteristics + +**Core concept**: Transform fixed-size data blocks through multiple rounds of substitution and permutation. + +**Key identifying features:** +1. **Fixed block size**: Data processed in chunks (64 bits, 128 bits, etc.) +2. **Round structure**: Outer loop with fixed iteration count +3. **Substitution**: Table lookups (S-boxes) replacing input bytes +4. **Permutation**: Bit shuffling, rotation, mixing operations +5. 
**Key schedule**: Function deriving per-round keys from master key + +**Generic code structure:** +```c +// Simplified conceptual pattern +void block_cipher_encrypt(uint8_t* data, uint8_t* key) { + uint8_t round_keys[NUM_ROUNDS][KEY_SIZE]; + generate_round_keys(key, round_keys); + + for (int round = 0; round < NUM_ROUNDS; round++) { + substitute_bytes(data); // S-box lookups + permute_bits(data); // Bit shuffling + mix_columns(data); // Linear transformation + add_round_key(data, round_keys[round]); // XOR with round key + } +} +``` + +### Substitution-Permutation Network (SPN) + +**What it is**: The design used by most modern block ciphers (AES, PRESENT, etc.) + +**Recognition pattern:** +``` +Loop structure: + for round in 0..NUM_ROUNDS: + 1. SubBytes (S-box lookup) + 2. ShiftRows/PermuteBits (positional change) + 3. MixColumns (linear transformation) + 4. AddRoundKey (XOR with round key) + +Characteristics: + - Large constant arrays (S-boxes, typically 256 bytes) + - Heavy XOR usage + - Byte/word array indexing + - State array (16+ bytes) +``` + +**AES-specific signatures:** +- S-box starting with: 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5...
+- Round counts: 10 (AES-128), 12 (AES-192), 14 (AES-256) +- 128-bit state (16 bytes, often as 4x4 matrix) +- Rcon (round constant) array for key expansion + +**DES-specific signatures:** +- 64-bit blocks (8 bytes) +- 16 rounds +- Permutation tables (IP, FP) +- S-box arrays (8 boxes of 64 entries each) +- Feistel structure (see below) + +### Feistel Network + +**What it is**: Older block cipher design (DES, Blowfish, TEA) + +**Recognition pattern:** +``` +Loop structure: + Split data into left and right halves + for round in 0..NUM_ROUNDS: + temp = right + right = left XOR F(right, round_key[round]) + left = temp + Swap and combine halves + +Characteristics: + - Data split in half + - Swap operation each round + - F-function (round function) operating on half the data + - Other half XORed with F-function output +``` + +**Telltale code patterns:** +```c +// Feistel structure +uint32_t left = data[0]; +uint32_t right = data[1]; + +for (int i = 0; i < rounds; i++) { + uint32_t temp = right; + right = left ^ f_function(right, key[i]); + left = temp; +} +``` + +### Block Cipher Investigation Strategy + +1. **Count rounds**: Outer loop iterations → Indicates cipher type and key size +2. **Measure block size**: How much data processed per iteration → 64-bit (DES) or 128-bit (AES) +3. **Identify S-boxes**: Large constant arrays → `read-memory` and compare to known S-boxes +4. **Check key schedule**: Look for function deriving multiple round keys from master key +5. **Recognize structure**: SPN (parallel operations) vs Feistel (swap pattern) + +**Useful tools:** +``` +get-decompilation limit=50 includeIncomingReferences=true +read-memory at constant array addresses +rename-variables: var_1 → sbox, var_2 → state, var_3 → round_key +``` + +## Stream Cipher Patterns + +### Conceptual Characteristics + +**Core concept**: Generate pseudo-random keystream from key, then XOR with plaintext. + +**Key identifying features:** +1. 
**State-based generation**: Internal state evolves to produce keystream bytes (PRGA - Pseudo-Random Generation Algorithm) +2. **Simple combination**: `ciphertext = plaintext XOR keystream` +3. **No fixed blocks**: Can encrypt arbitrary lengths +4. **Smaller code**: Less complex than block ciphers (no large S-boxes) +5. **Initialization**: State setup from key (KSA - Key Scheduling Algorithm) + +**Generic code structure:** +```c +// Simplified conceptual pattern +void stream_cipher(uint8_t* data, size_t len, uint8_t* key) { + uint8_t state[STATE_SIZE]; + initialize_state(state, key); // KSA + + for (size_t i = 0; i < len; i++) { + uint8_t keystream_byte = generate_next_byte(state); // PRGA + data[i] ^= keystream_byte; + } +} +``` + +### RC4 Pattern (Most Common in CTFs) + +**Recognition pattern:** +``` +Initialization (KSA): + state = [0, 1, 2, ..., 255] // 256-byte array + j = 0 + for i in 0..255: + j = (j + state[i] + key[i % key_len]) % 256 + swap(state[i], state[j]) + +Keystream generation (PRGA): + i = 0; j = 0 + for each byte: + i = (i + 1) % 256 + j = (j + state[i]) % 256 + swap(state[i], state[j]) + keystream_byte = state[(state[i] + state[j]) % 256] + output ^= keystream_byte +``` + +**Telltale signs:** +- 256-byte state array +- Swap operations: `temp = a[i]; a[i] = a[j]; a[j] = temp` +- Modulo 256 (`% 256` or `& 0xFF`) +- Index computations with running totals +- Two-phase structure (init, then generate) + +### ChaCha/Salsa Pattern + +**Recognition pattern:** +- 512-bit state (16 words of 32 bits) +- Quarter-round function (ARX: Add-Rotate-XOR) +- Magic constants: "expand 32-byte k" or "expand 16-byte k" +- 20 rounds (10 double-rounds) for ChaCha20 +- Heavy use of 32-bit rotation + +### Stream Cipher Investigation Strategy + +1. **Find state initialization**: Look for array setup from key +2. **Identify update function**: How state evolves (swap, ARX, LCG, etc.) +3. **Locate XOR operation**: Simple `output = input ^ keystream` +4. **Check for reuse**: Is same keystream used multiple times?
(weakness) +5. **Analyze state size**: 256 bytes (RC4), 64 bytes (ChaCha), variable (custom) + +**Useful tools:** +``` +search-decompilation pattern="swap|xor" +get-decompilation to see state evolution loop +rename-variables: var_1 → state, var_2 → keystream, var_3 → index +``` + +## Public Key Cryptography Patterns + +### Conceptual Characteristics + +**Core concept**: Asymmetric encryption using mathematical trapdoor functions. + +**Key identifying features:** +1. **Large integer arithmetic**: Numbers hundreds or thousands of bits +2. **Modular exponentiation**: `result = base^exponent mod modulus` +3. **Very slow**: Orders of magnitude slower than symmetric crypto +4. **Multi-precision arithmetic**: Arrays representing big integers + +**Generic code structure:** +```c +// Simplified modular exponentiation (square-and-multiply) +bigint modexp(bigint base, bigint exponent, bigint modulus) { + bigint result = 1; + while (exponent > 0) { + if (exponent & 1) { + result = (result * base) % modulus; // Multiply + } + base = (base * base) % modulus; // Square + exponent >>= 1; + } + return result; +} +``` + +### RSA Pattern + +**Recognition pattern:** +``` +Key components: + - Large modulus N (1024, 2048, 4096+ bits) + - Public exponent e (often 65537 = 0x10001) + - Private exponent d + +Encryption: c = m^e mod N +Decryption: m = c^d mod N + +Operations: + - Modular exponentiation (square-and-multiply) + - Multi-precision multiplication + - Barrett or Montgomery reduction for modulo +``` + +**Telltale signs:** +- Very large buffers (128, 256, 512 bytes+) +- Magic constant 0x10001 (common RSA public exponent) +- Bit-by-bit processing of exponent +- Slow execution (many iterations) +- Functions for add/subtract/multiply on arrays + +### Elliptic Curve Pattern + +**Recognition pattern:** +- Point addition/doubling operations +- Affine or projective coordinates (x, y) or (x, y, z) +- Field arithmetic (modular arithmetic over prime field) +- Curve parameters (a, b, p, G, n) 
+- Scalar multiplication (point added to itself k times) + +### Public Key Investigation Strategy + +1. **Identify big integer operations**: Look for array-based arithmetic +2. **Find exponentiation pattern**: Square-and-multiply loop +3. **Extract parameters**: Modulus, exponent values from constants +4. **Check key size**: Buffer sizes indicate security level +5. **Look for weak parameters**: Small exponents, factorable moduli (CTF tricks) + +**CTF-specific weaknesses:** +- Small modulus (factorizable) +- Small private exponent (Wiener's attack) +- Reused primes across multiple keys +- Textbook RSA (no padding, malleable) + +## Hash Function Patterns + +### Conceptual Characteristics + +**Core concept**: One-way compression of arbitrary data to fixed-size digest. + +**Key identifying features:** +1. **Initialization constants**: Fixed magic numbers unique to algorithm +2. **Block processing**: Data processed in chunks (512 bits typical) +3. **State accumulation**: Running state updated with each block +4. **Padding**: Append bits to make input multiple of block size +5. 
**Heavy mixing**: Lots of bitwise operations (irreversible) + +**Generic code structure:** +```c +// Simplified hash structure +void hash(uint8_t* data, size_t len, uint8_t* digest) { + uint32_t state[STATE_SIZE]; + initialize_state(state); // Magic constants + + // Process each block + for (each block in data) { + process_block(state, block); // Compression function + } + + finalize(state, digest); // Output transformation +} +``` + +### MD5/SHA Recognition + +**MD5 initialization constants:** +```c +state[0] = 0x67452301; +state[1] = 0xefcdab89; +state[2] = 0x98badcfe; +state[3] = 0x10325476; +``` + +**SHA-1 initialization constants:** +```c +state[0] = 0x67452301; +state[1] = 0xefcdab89; +state[2] = 0x98badcfe; +state[3] = 0x10325476; +state[4] = 0xc3d2e1f0; +``` + +**SHA-256 initialization constants:** +```c +// First 32 bits of fractional parts of square roots of first 8 primes +state[0] = 0x6a09e667; +state[1] = 0xbb67ae85; +state[2] = 0x3c6ef372; +// ... 5 more +``` + +**Telltale signs:** +- Characteristic initialization constants (search for these!) +- Fixed round counts: 64 (MD5, SHA-256), 80 (SHA-1, SHA-512) +- Bitwise rotations: `(x << n) | (x >> (32-n))` +- Message schedule expansion (W array) +- Mixing functions (F, G, H functions in MD5) + +### Hash Investigation Strategy + +1. **Search for magic constants**: Hash functions have unique initializers +2. **Count rounds**: 64 or 80 iterations → Specific hash function +3. **Check block size**: 512 bits (MD5, SHA-1, SHA-256) or 1024 bits (SHA-512) +4. **Identify mixing operations**: AND, OR, XOR, NOT, rotation patterns +5. **Find padding logic**: Append 0x80, then zeros, then length + +**Useful tools:** +``` +search-decompilation pattern="0x67452301|0xefcdab89|0x98badcfe" +get-decompilation to see round structure +read-memory at initialization constants +``` + +## Simple Obfuscation Patterns + +### XOR Cipher + +**What it is**: Trivial encryption used for obfuscation, not security. 
+ +**Recognition pattern:** +``` +Single-byte key: + for (i = 0; i < len; i++) + data[i] ^= 0x42; // Fixed constant + +Multi-byte key: + for (i = 0; i < len; i++) + data[i] ^= key[i % keylen]; // Repeating key + +Rolling key (LCG-based): + key = seed; + for (i = 0; i < len; i++) { + data[i] ^= key; + key = (key * A + C) % M; // Linear congruential generator + } +``` + +**Telltale signs:** +- Very short functions (5-10 lines) +- XOR with constants or simple patterns +- Often applied to strings or config data +- No complex state or multiple rounds + +**Breaking approach:** +- Single-byte: Brute-force (256 possibilities) +- Multi-byte: Frequency analysis or known-plaintext +- Rolling key: If LCG parameters known, reproduce sequence + +### Substitution Cipher + +**Recognition pattern:** +``` +Simple substitution: + for (i = 0; i < len; i++) + output[i] = substitution_table[input[i]]; + +Caesar cipher (special case): + for (i = 0; i < len; i++) + output[i] = (input[i] + shift) % 256; +``` + +**Breaking approach:** +- Frequency analysis (if sufficient ciphertext) +- Known-plaintext attack +- Brute-force substitution table + +### Custom Cipher Pattern + +**What it is**: Challenge-specific encryption scheme not based on standards. + +**Recognition indicators:** +- No match to known crypto patterns +- Unusual operations or data flow +- Mix of arithmetic, XOR, bit shifts in non-standard way +- Often simpler than real crypto (for solvability) + +**Investigation strategy:** +1. **Document operations**: What transformations are applied, in what order? +2. **Identify invertibility**: Can operations be reversed? +3. **Look for weaknesses**: + - Reduced keyspace (brute-forceable) + - Linear operations (algebraically solvable) + - Repeated patterns (exploitable structure) +4. **Known-plaintext**: If you have plaintext-ciphertext pairs, work backwards +5. 
**Replicate in Python**: Reproduce encryption logic, then reverse it + +**Common CTF custom cipher weaknesses:** +- Insufficient mixing (partially recoverable plaintext) +- Weak key derivation (predictable) +- Reversible operations (decrypt by inverting) +- Small state space (brute-forceable) + +## Recognition Workflow + +### Step 1: Initial Detection +``` +1. Search for crypto strings + search-strings-regex pattern="(encrypt|decrypt|aes|rsa|md5|sha|key)" + +2. Check for crypto API imports + get-symbols includeExternal=true → Look for OpenSSL, Windows Crypto API + +3. Search for crypto patterns in code + search-decompilation pattern="(xor|sbox|round)" +``` + +### Step 2: Pattern Matching +``` +4. Get decompilation of suspected function + get-decompilation includeIncomingReferences=true + +5. Compare to pattern categories: + - Block cipher? (rounds, S-boxes, fixed blocks) + - Stream cipher? (state, swap, XOR) + - Hash? (magic constants, compression) + - Public key? (big integers, modexp) + - Simple obfuscation? (short, simple XOR) +``` + +### Step 3: Detailed Analysis +``` +6. Read constant arrays + read-memory at suspected S-box/constant locations + +7. Compare to known values + - AES S-box: 63 7c 77 7b... + - MD5 init: 67452301 efcdab89... + - RSA exponent: 0x10001 + +8. Count iterations + - 10/12/14 rounds → AES + - 16 rounds → DES + - 64/80 rounds → Hash function +``` + +### Step 4: Verification +``` +9. Rename variables for clarity + rename-variables: var_1 → sbox, var_2 → key, var_3 → state + +10. Document findings + set-bookmark type="Analysis" category="Crypto" + set-decompilation-comment line=N "AES encryption round" + +11. 
Cross-check with usage + find-cross-references → See where crypto is called, what data it processes +``` + +## CTF-Specific Patterns + +### Key Management Anti-Patterns + +**Hardcoded keys (most common):** +```c +uint8_t key[] = {0x41, 0x42, 0x43, ...}; // Key in .data section +encrypt(data, key); +``` +**Finding**: `read-memory` at key array address + +**Weak derivation:** +```c +// Time-based (predictable) +srand(time(NULL)); +for (i = 0; i < keylen; i++) + key[i] = rand() % 256; + +// Constant seed (always same key) +srand(12345); +... +``` +**Finding**: Analyze RNG initialization, predict or replicate + +**User input as key (brute-force candidate):** +```c +scanf("%s", key); // Short password +if (strlen(key) < 8) ... +``` +**Finding**: Small keyspace, brute-forceable + +### Implementation Bugs to Exploit + +**ECB mode (block patterns visible):** +```c +for (i = 0; i < len; i += BLOCK_SIZE) + encrypt_block(data + i, key); // No chaining +``` +**Weakness**: Identical plaintext blocks → identical ciphertext blocks + +**IV reuse or zero IV:** +```c +uint8_t iv[16] = {0}; // Should be random! +``` +**Weakness**: Breaks CBC security, enables attacks + +**Reduced rounds (weak variant):** +```c +#define ROUNDS 4 // Should be 10+ for AES +``` +**Weakness**: May be breakable with cryptanalysis tools + +**Debug backdoor:** +```c +if (strcmp(password, "DEBUG") == 0) + return decrypt_without_key(data); +``` +**Finding**: Search for debug strings, test/admin backdoors + +## Using This Reference + +### Quick Lookup Process + +1. **Identify general category**: Block/stream/hash/public-key/simple +2. **Match to specific pattern**: Compare code structure to examples +3. **Verify with evidence**: Check constants, round counts, operations +4. **Document in Ghidra**: Rename, retype, comment for clarity +5. **Investigate weaknesses**: Look for CTF-specific anti-patterns + +### Example Investigation Flow + +``` +Observation: Function with loop, array access, XOR + +1. 
Compare to patterns: + - Block cipher? (Check for S-boxes, rounds) + - Stream cipher? (Check for swap, state evolution) + - Simple XOR? (Check function length) + +2. Verify: + - Read memory at constant array (if exists) + - Count loop iterations + - Check for characteristic operations + +3. Identify: + - Found 256-byte array with specific pattern + - Swap operations in initialization + - Simple XOR in second phase + +4. Conclude: RC4 stream cipher + +5. Improve: + rename-variables: state, keystream, plaintext + set-comment: "RC4 encryption with hardcoded key" + +6. Exploit: + Extract key from initialization + Replicate RC4 in Python to decrypt +``` + +### Progressive Refinement + +**First pass**: "This looks like crypto (XOR, loops, constants)" +**Second pass**: "Probably a block cipher (rounds, S-box pattern)" +**Third pass**: "Matches AES pattern (S-box signature, 10/12/14 rounds)" +**Fourth pass**: "AES-128 with hardcoded key at 0x405000" +**Fifth pass**: "Extracted key, successfully decrypted flag" + +Each pass narrows down understanding and guides next investigation steps. + +## Remember + +- **Patterns are guidelines**, not rigid rules - CTF challenges may have variations +- **Constants are your friends** - Magic numbers uniquely identify algorithms +- **Structure reveals intent** - Loop patterns indicate algorithm type +- **CTF crypto is about implementation** - Look for weaknesses, not mathematical breaks +- **Document as you learn** - Rename variables to reflect your understanding +- **Verify with evidence** - Don't guess - compare constants, count rounds, check operations + +Use this reference alongside the conceptual framework in SKILL.md to systematically identify and analyze cryptographic implementations. 
diff --git a/skills/ctf-pwn/SKILL.md b/skills/ctf-pwn/SKILL.md new file mode 100644 index 0000000..3c165b6 --- /dev/null +++ b/skills/ctf-pwn/SKILL.md @@ -0,0 +1,532 @@ +--- +name: ctf-pwn +description: Solve CTF binary exploitation challenges by discovering and exploiting memory corruption vulnerabilities to read flags. Use for buffer overflows, format strings, heap exploits, ROP challenges, or any pwn/exploitation task. +--- + +# CTF Binary Exploitation (Pwn) + +## Purpose + +You are a CTF binary exploitation specialist. Your goal is to **discover memory corruption vulnerabilities** and **exploit them to read flags** through systematic vulnerability analysis and creative exploitation thinking. + +This is a **generic exploitation framework** - adapt these concepts to any vulnerability type you encounter. Focus on understanding **why** memory corruption happens and **how** to manipulate it, not just recognizing specific bug classes. + +## Conceptual Framework + +### The Exploitation Mindset + +**Think in three layers:** + +1. **Data Flow Layer**: Where does attacker-controlled data go? + - Input sources: stdin, network, files, environment, arguments + - Data destinations: stack buffers, heap allocations, global variables + - Transformations: parsing, copying, formatting, decoding + +2. **Memory Safety Layer**: What assumptions does the program make? + - Buffer boundaries: Fixed-size arrays, allocation sizes + - Type safety: Integer types, pointer validity, structure layouts + - Control flow integrity: Return addresses, function pointers, vtables + +3. **Exploitation Layer**: How can we violate trust boundaries? 
+ - Memory writes: Overwrite critical data (return addresses, function pointers, flags) + - Memory reads: Leak information (addresses, canaries, pointer values) + - Control flow hijacking: Redirect execution to attacker-controlled locations + - Logic manipulation: Change program state to skip checks or trigger unintended paths + +### Core Question Sequence + +For every CTF pwn challenge, ask these questions **in order**: + +1. **What data do I control?** + - Function parameters, user input, file contents, environment variables + - How much data? What format? Any restrictions (printable chars, null bytes)? + +2. **Where does my data go in memory?** + - Stack buffers? Heap allocations? Global variables? + - What's the size of the destination? Is it checked? + +3. **What interesting data is nearby in memory?** + - Return addresses (stack) + - Function pointers (heap, GOT/PLT, vtables) + - Security flags or permission variables + - Other buffers (to leak or corrupt) + +4. **What happens if I send more data than expected?** + - Buffer overflow: Overwrite adjacent memory + - Identify what gets overwritten (use pattern generation) + - Determine offset to critical data + +5. **What can I overwrite to change program behavior?** + - Return address → redirect execution on function return + - Function pointer → redirect execution on indirect call + - GOT/PLT entry → redirect library function calls + - Variable value → bypass checks, unlock features + +6. **Where can I redirect execution?** + - Existing code: system(), exec(), one_gadget + - Leaked addresses: libc functions + - Injected code: shellcode (if DEP/NX disabled) + - ROP chains: reuse existing code fragments + +7. 
**How do I read the flag?** + - Direct: Call system("/bin/cat flag.txt") or open()/read()/write() + - Shell: Call system("/bin/sh") and interact + - Leak: Read flag into buffer, leak buffer contents + +## Core Methodologies + +### Vulnerability Discovery + +**Unsafe API Pattern Recognition:** + +Identify dangerous functions that don't enforce bounds: +- **Unbounded copies**: strcpy, strcat, sprintf, gets +- **Underspecified bounds**: read(), recv(), scanf("%s"), strncpy (no null termination) +- **Format string bugs**: printf(user_input), fprintf(fp, user_input) +- **Integer overflows**: malloc(user_size), buffer[user_index], length calculations + +**Investigation strategy:** +1. `get-symbols` includeExternal=true → Find unsafe API imports +2. `find-cross-references` to unsafe functions → Locate usage points +3. `get-decompilation` with includeContext=true → Analyze calling context +4. Trace data flow from input to unsafe operation + +**Stack Layout Analysis:** + +Understand memory organization: +``` +High addresses +├── Function arguments +├── Return address ← Critical target for overflow +├── Saved frame pointer +├── Stack canary ← Stack protection (if enabled); sits between locals and saved registers +├── Local variables ← Vulnerable buffers here +└── Padding/alignment +Low addresses +``` + +**Investigation strategy:** +1. `get-decompilation` of vulnerable function → See local variable layout +2. Estimate offsets: buffer → saved registers → return address +3. `set-bookmark` type="Analysis" category="Vulnerability" at overflow site +4. `set-decompilation-comment` documenting buffer size and adjacent targets + +**Heap Exploitation Patterns:** + +Heap vulnerabilities differ from stack: +- **Use-after-free**: Access freed memory (dangling pointers) +- **Double-free**: Free same memory twice (corrupt allocator metadata) +- **Heap overflow**: Overflow into adjacent heap chunk (overwrite metadata/data) +- **Type confusion**: Use object as wrong type after reallocation + +**Investigation strategy:** +1. 
`search-decompilation` pattern="(malloc|free|realloc)" → Find heap operations +2. Trace pointer lifecycle: allocation → use → free +3. Look for dangling pointer usage after free +4. Identify adjacent allocations (overflow targets) + +### Memory Layout Understanding + +**Address Space Discovery:** + +Map the binary's memory: +1. `get-memory-blocks` → See sections (.text, .data, .bss, heap, stack) +2. Note executable sections (shellcode candidates if NX disabled) +3. Note writable sections (data corruption targets) +4. Identify ASLR status (addresses randomized each run?) + +**Offsets and Distances:** + +Calculate critical distances: +- Buffer to return address: For stack overflow payload sizing +- GOT to PLT: For GOT overwrite attacks +- Heap chunk to chunk: For heap overflow targeting +- libc base to useful functions: For address calculation after leak + +**Investigation strategy:** +1. `get-data` or `read-memory` at known addresses → Sample memory layout +2. `find-cross-references` direction="both" → Map relationships +3. Calculate offsets manually from decompilation +4. 
`set-comment` at key offsets documenting distances + +### Exploitation Planning + +**Constraint Analysis:** + +Identify exploitation constraints: +- **Bad bytes**: Null bytes (\x00) terminate C strings → avoid in address/payload +- **Input size limits**: Truncation, buffering, network MTU +- **Character restrictions**: Printable-only, alphanumeric, no special chars +- **Protection mechanisms**: Detect via `search-decompilation` pattern="(canary|__stack_chk)" + +**Bypass Strategies:** + +Common protections and bypass techniques: +- **Stack canaries**: Leak canary value, brute-force byte-by-byte (forking servers reuse the same canary), or write past the canary without touching it (non-linear/indexed write) +- **ASLR**: Leak addresses (format strings, uninitialized data), partial overwrite (the low 12 bits of an address are not randomized) +- **NX/DEP**: ROP (Return-Oriented Programming), ret2libc, JOP (Jump-Oriented Programming) +- **PIE**: Leak code addresses, relative offsets within binary, partial overwrites + +**Exploitation Primitives:** + +Build these fundamental capabilities: +- **Arbitrary write**: Write controlled data to chosen address (format string, heap overflow) +- **Arbitrary read**: Read from chosen address (format string, uninitialized data, overflow into pointer) +- **Control flow hijack**: Redirect execution (overwrite return address, function pointer, GOT entry) +- **Information leak**: Obtain addresses, canaries, pointers (uninitialized variables, format strings) + +**Chain multiple primitives when needed:** +- Leak → Calculate addresses → Overwrite function pointer → Exploit +- Partial overwrite → Leak full address → Calculate libc base → ret2libc +- Heap overflow → Overwrite function pointer → Arbitrary write → GOT overwrite → Shell + +## Flexible Workflow + +This is a **thinking framework**, not a rigid checklist. Adapt to the challenge: + +### Phase 1: Binary Reconnaissance (5-10 tool calls) + +**Understand the challenge:** + +1. `get-current-program` or `list-project-files` → Identify target binary +2. 
`get-memory-blocks` → Map sections, identify protections +3. `get-functions` filterDefaultNames=false → Count functions (stripped vs. symbolic) +4. `search-strings-regex` pattern="flag" → Find flag-related strings +5. `get-symbols` includeExternal=true → List imported functions + +**Identify entry points and input vectors:** + +6. `get-decompilation` functionNameOrAddress="main" limit=50 → See program flow +7. Look for input functions: read(), recv(), gets(), scanf(), fgets() +8. `find-cross-references` to input functions → Map input flow +9. `set-bookmark` type="TODO" category="Input Vector" at each input point + +**Flag suspicious patterns:** +- Unsafe functions (strcpy, sprintf, gets) +- Small stack buffers filled by larger read operations (read size > buffer size) +- Format string vulnerabilities (user-controlled format) +- Unbounded loops or recursion + +### Phase 2: Vulnerability Analysis (10-15 tool calls) + +**Trace data flow from input to vulnerability:** + +1. `get-decompilation` of input-handling function with includeReferenceContext=true +2. Identify buffer sizes: char buf[64], malloc(size), etc. +3. Identify write operations: strcpy(dest, src), read(fd, buf, 1024) +4. **Calculate vulnerability**: Write size > buffer size? + +**Analyze vulnerable function context:** + +5. `rename-variables` → Clarify data flow (user_input, buffer, size, etc.) +6. `change-variable-datatypes` → Fix types for clarity +7. `set-decompilation-comment` → Document vulnerability location and type + +**Map memory layout around vulnerability:** + +8. Identify local variables and their stack positions +9. Calculate offset from buffer start to return address +10. `read-memory` at nearby addresses → Sample stack layout (if debugging available) +11. `set-bookmark` type="Warning" category="Overflow" → Mark vulnerability + +**Cross-reference analysis:** + +12. `find-cross-references` to vulnerable function → How is it called? +13. Check for exploitation helpers: system(), exec(), "/bin/sh" string +14. 
`search-strings-regex` pattern="/bin/(sh|bash)" → Find shell strings +15. `search-decompilation` pattern="system|exec" → Find execution functions + +### Phase 3: Exploitation Strategy (5-10 tool calls) + +**Determine exploitation approach:** + +Based on protections and available primitives: + +**If no protections (NX disabled, no canary, no ASLR):** +- Stack overflow → overwrite return address → jump to shellcode +- Inject shellcode in buffer, jump to buffer address + +**If NX enabled but no ASLR:** +- ret2libc: Overwrite return address → chain to system() with "/bin/sh" +- ROP chain: Chain gadgets to build system("/bin/sh") call +- GOT overwrite: Overwrite GOT entry to redirect library call + +**If ASLR enabled:** +- Leak addresses first (format string, uninitialized data) +- Calculate libc base from leaked address +- Use leak to build ROP chain or ret2libc with correct addresses + +**If stack canary present:** +- Leak canary value (format string, sequential overflow) +- Preserve canary in overflow payload +- Or use heap exploitation instead + +**Investigation for each strategy:** + +1. `search-strings-regex` pattern="(\\x2f|/)bin/(sh|bash)" → Find shell strings +2. `find-cross-references` to "/bin/sh" → Get string address +3. `get-symbols` includeExternal=true → Find system/exec imports +4. `get-decompilation` of system → Get address (if not PIE) + +**For ROP:** +5. `search-decompilation` pattern="(pop|ret)" → Find gadget candidates +6. Manual ROP gadget discovery (use external tools like ROPgadget) +7. Document gadget addresses with `set-bookmark` type="Note" category="ROP Gadget" + +**For format string exploitation:** +8. `get-decompilation` of printf call → Analyze format string control +9. Test format string primitives: %x (leak), %n (write), %s (arbitrary read) +10. 
`set-comment` documenting exploitation primitive + +### Phase 4: Payload Construction (Conceptual) + +**Build the exploit payload:** + +This happens **outside Ghidra** using Python/pwntools, but plan it here: + +1. **Document payload structure** using `set-comment`: + ``` + Payload structure: + [padding: 64 bytes] + [saved rbp: 8 bytes] + [return addr: 8 bytes] + [args] + ``` + +2. **Record critical addresses** with `set-bookmark`: + - Buffer address: 0x7fffffffdd00 + - Return address location: 0x7fffffffdd48 (offset +72: 64-byte buffer plus 8-byte saved rbp) + - system() address: 0x7ffff7e14410 + - "/bin/sh" string: 0x00404030 + +3. **Document exploitation steps** with `set-bookmark` type="Analysis" category="Exploit Plan": + ``` + Step 1: Send 64 bytes padding plus 8 bytes to cover saved rbp + Step 2: Overwrite return address with system() address + Step 3: Supply "/bin/sh" pointer as first argument (RDI on x86-64, e.g. via a pop rdi; ret gadget; on the stack for x86) + Step 4: Trigger return to execute system("/bin/sh") + ``` + +4. **Track assumptions** with `set-bookmark` type="Warning" category="Assumption": + - "Assuming stack addresses are stable (no ASLR)" + - "Assuming no canary based on decompilation (verify runtime)" + +### Phase 5: Exploitation Validation (Iterative) + +**This phase happens outside Ghidra**, but document findings: + +1. Test exploit against local binary +2. Adjust offsets based on crash analysis +3. Handle bad bytes or character restrictions +4. 
Refine payload until successful + +**Update Ghidra database with findings:** +- `set-comment` with actual working offsets +- `set-bookmark` documenting successful exploitation +- `checkin-program` message="Documented successful exploitation of buffer overflow in function_X" + +## Pattern Recognition + +See `patterns.md` for detailed vulnerability patterns: +- Unsafe API usage patterns +- Buffer overflow indicators +- Format string vulnerability signatures +- Heap exploitation patterns +- Integer overflow scenarios +- Control flow hijacking opportunities + +## Exploitation Techniques Reference + +### Stack Buffer Overflow + +**Concept**: Write beyond buffer bounds to overwrite return address or function pointers on stack. + +**Discovery**: +1. Find unsafe copy: strcpy, gets, scanf("%s"), read with large size +2. Identify buffer size from decompilation +3. Compare buffer size to maximum input size +4. Calculate offset to return address (buffer size + saved registers) + +**Exploitation**: +- Payload: [padding to return address] + [new return address] + [optional arguments/ROP chain] +- Target: Overwrite return address to redirect execution + +### Format String Vulnerability + +**Concept**: User-controlled format string allows arbitrary memory read/write. + +**Discovery**: +1. `search-decompilation` pattern="printf|fprintf|sprintf" +2. Check if format string comes from user input: printf(user_buffer) +3. Vulnerable pattern: printf(input) instead of printf("%s", input) + +**Exploitation**: +- Read: %x, %p (leak stack values), %s (arbitrary read via pointer on stack) +- Write: %n (write number of bytes printed to pointer on stack) +- Position: %N$x (access Nth argument directly) + +**Investigation**: +4. `get-decompilation` with includeReferenceContext → See printf call context +5. `set-decompilation-comment` documenting format string control +6. 
`set-bookmark` type="Warning" category="Format String" + +### Return-Oriented Programming (ROP) + +**Concept**: Chain existing code fragments (gadgets) ending in 'ret' to build arbitrary computation without injecting code. + +**Discovery**: +1. Find gadgets: `pop reg; ret`, `mov [addr], reg; ret`, `syscall; ret` +2. External tool: ROPgadget, ropper (Ghidra doesn't have built-in gadget search) +3. Document gadgets in Ghidra with `set-bookmark` type="Note" category="ROP Gadget" + +**Exploitation**: +- Chain gadgets by placing addresses on stack +- Each gadget executes, then 'ret' pops next gadget address +- Build syscall with proper registers: execve("/bin/sh", NULL, NULL) + +**Workflow**: +4. Identify required gadgets for goal (e.g., execve syscall) +5. `set-comment` at gadget addresses documenting purpose +6. Plan ROP chain structure with `set-bookmark` type="Analysis" category="ROP Chain" + +### ret2libc + +**Concept**: Redirect execution to libc functions (system, exec, one_gadget) instead of shellcode. + +**Discovery**: +1. `get-symbols` includeExternal=true → Find libc imports +2. `find-cross-references` to system, execve → Get addresses +3. `search-strings-regex` pattern="/bin/sh" → Find shell string + +**Exploitation** (no ASLR): +- Overwrite return address → system function address +- Set first argument → pointer to "/bin/sh" string +- Calling convention: x86-64 uses RDI for first arg, x86 uses stack + +**Exploitation** (with ASLR): +- Leak libc address (format string, uninitialized pointer) +- Calculate system/exec address = libc_base + offset +- Build ROP chain with calculated addresses + +**Investigation**: +4. `get-data` at GOT entries → See libc function addresses +5. Calculate libc base from known offset +6. `set-bookmark` documenting calculated addresses + +### Heap Exploitation + +**Concept**: Corrupt heap metadata or overflow between heap chunks to achieve arbitrary write or control flow hijack. + +**Discovery**: +1. 
`search-decompilation` pattern="malloc|free|realloc" +2. Trace allocation and free patterns +3. Look for use-after-free: pointer used after free() +4. Look for heap overflow: write beyond allocated size + +**Exploitation techniques**: +- **Use-after-free**: Free object, allocate new object in same slot, use old pointer to access new object (type confusion) +- **Double-free**: Free same pointer twice, corrupt allocator metadata +- **Heap overflow**: Overflow into next chunk, overwrite metadata (size, pointers) or data (function pointers) +- **Fastbin/tcache poisoning**: Corrupt freelist pointers to allocate arbitrary memory + +**Investigation**: +5. `rename-variables` for heap pointers (heap_ptr, freed_ptr, chunk1, chunk2) +6. `set-decompilation-comment` at allocation/free sites +7. `set-bookmark` type="Warning" category="Use-After-Free" + +### Integer Overflow + +**Concept**: Integer overflow/underflow leads to incorrect buffer size calculation or bounds check bypass. + +**Discovery**: +1. Find size calculations: size = user_input * sizeof(element) +2. Check for overflow: What if user_input is very large? +3. Find bounds checks: if (index < size) → What if index is a negative signed value? + +**Exploitation**: +- Overflow allocation size → heap buffer too small → heap overflow +- Signed/unsigned confusion → negative length passes a signed bounds check, then is used as a huge unsigned size → buffer overflow +- Wrap-around arithmetic → bypass length checks + +**Investigation**: +4. `change-variable-datatypes` to proper integer types (uint32_t, size_t) +5. Identify overflow scenarios in comments +6. 
`set-bookmark` type="Warning" category="Integer Overflow" + +## Tool Integration + +**Use ReVa tools systematically:** + +### Discovery Tools +- `get-symbols` → Find unsafe API imports +- `search-strings-regex` → Find interesting strings (flag, shell, paths) +- `search-decompilation` → Find vulnerability patterns (unsafe functions) +- `get-functions-by-similarity` → Find functions similar to known vulnerable pattern + +### Analysis Tools +- `get-decompilation` with `includeIncomingReferences=true` and `includeReferenceContext=true` +- `find-cross-references` with `includeContext=true` → Trace data flow +- `get-data` → Examine global variables, GOT entries, constant data +- `read-memory` → Sample memory layout + +### Database Improvement Tools +- `rename-variables` → Clarify exploitation-relevant variables (buffer, user_input, return_addr) +- `change-variable-datatypes` → Fix types for proper understanding +- `set-decompilation-comment` → Document vulnerabilities inline +- `set-comment` → Document exploitation strategy at key addresses +- `set-bookmark` → Track vulnerabilities, gadgets, exploit plan + +### Organization Tools +- `set-bookmark` type="Warning" category="Vulnerability" → Mark vulnerabilities +- `set-bookmark` type="Note" category="ROP Gadget" → Track gadgets +- `set-bookmark` type="Analysis" category="Exploit Plan" → Document strategy +- `set-bookmark` type="TODO" category="Verify" → Track assumptions to verify +- `checkin-program` → Save progress + +## Success Criteria + +You've successfully completed the challenge when: + +1. **Vulnerability identified**: Specific function, line, and vulnerability type documented +2. **Memory layout understood**: Buffer sizes, offsets, adjacent data mapped +3. **Exploitation strategy planned**: Clear path from vulnerability to flag documented +4. **Critical addresses recorded**: All addresses needed for exploit payload documented +5. **Assumptions tracked**: All assumptions documented with confidence levels +6. 
**Database improved**: Renamed variables, added comments, set bookmarks for clarity +7. **Exploit plan ready**: Sufficient information to write exploit code outside Ghidra + +**Return to user:** +- Vulnerability description with evidence +- Exploitation approach explanation +- Critical addresses and offsets +- Payload structure plan +- Assumptions and verification needs +- Follow-up tasks if needed (e.g., "Test exploit against binary") + +## Anti-Patterns + +**Don't**: +- Assume vulnerability without evidence (check buffer sizes!) +- Forget about protections (canaries, NX, ASLR, PIE) +- Overlook input restrictions (bad bytes, size limits) +- Get stuck on one approach (try different exploitation techniques) +- Ignore calling conventions (x86 vs x64 argument passing) +- Forget null byte termination (C string functions) + +**Do**: +- Verify buffer sizes from decompilation +- Check for stack canaries: `__stack_chk_fail` references +- Calculate offsets precisely (buffer to return address) +- Document all assumptions with `set-bookmark` type="Warning" +- Adapt exploitation technique to protections present +- Think creatively (chain primitives, use unconventional targets) + +## Remember + +Binary exploitation is **creative problem-solving**: +- Understand **why** vulnerabilities exist (unsafe assumptions) +- Think **how** to manipulate memory (data flow analysis) +- Plan **what** to overwrite (control flow, data, pointers) +- Determine **where** to redirect (existing code, injected code, ROP) +- Execute **step-by-step** (leak, calculate, overwrite, trigger) + +Every CTF challenge is different. Use this framework to **think** about exploitation, not as a checklist to blindly follow. + +**Your goal**: Document enough information in Ghidra to write the exploit script. The actual exploitation happens outside, but the analysis happens here. 
diff --git a/skills/ctf-pwn/patterns.md b/skills/ctf-pwn/patterns.md new file mode 100644 index 0000000..55e7da6 --- /dev/null +++ b/skills/ctf-pwn/patterns.md @@ -0,0 +1,948 @@ +# CTF Binary Exploitation Patterns + +This document contains patterns for recognizing common vulnerability classes and exploitation primitives in CTF challenges. Focus on **conceptual understanding** rather than specific exploits. + +## Vulnerability Recognition Patterns + +### Unsafe String Operation Patterns + +**Conceptual characteristics**: +- Functions that don't check destination buffer size +- Unbounded copying from source to destination +- Reliance on null terminator without size validation +- No length parameter or ignored length parameter + +**Dangerous API patterns**: + +```c +// Unbounded copy (no size checking) +strcpy(dest, user_input); // Copies until null byte +strcat(dest, user_input); // Appends until null byte +sprintf(dest, "%s", user_input); // Formats without bounds +gets(buffer); // Reads unlimited from stdin + +// Underspecified bounds +strncpy(dest, src, sizeof(dest)); // Doesn't guarantee null termination +scanf("%s", buffer); // No size limit specified +read(fd, buffer, 1024); // May exceed buffer size if buffer < 1024 +recv(sock, buffer, MAX, 0); // May exceed buffer capacity +``` + +**What to look for in decompiled code**: +``` +Buffer declaration: + char buffer[64]; // Fixed-size local array + +Unsafe operation on same buffer: + strcpy(buffer, user_input); // No size check + read(fd, buffer, 256); // Reads more than buffer holds + +Distance to critical data: + buffer[64] // Local variable at stack offset + saved_rbp // Usually at buffer + buffer_size + return_address // Usually at buffer + buffer_size + 8 +``` + +**Investigation strategy**: +1. `get-symbols` includeExternal=true → Find strcpy, strcat, gets, scanf, sprintf imports +2. `find-cross-references` to unsafe functions → Locate call sites +3. 
`get-decompilation` with includeContext=true → Analyze buffer size vs. input size +4. Calculate: input_max_size > buffer_size? → Buffer overflow exists +5. `set-bookmark` type="Warning" category="Buffer Overflow" at vulnerability + +**Telltale signs**: +- Local char arrays with small sizes (64, 128, 256 bytes) +- Unbounded string functions called on those arrays +- User input directly passed to unsafe function +- No explicit size checking before copy operation + +### Format String Vulnerability Patterns + +**Conceptual characteristics**: +- User controls the format string parameter +- Format specifiers allow memory read (%x, %s, %p) and write (%n) +- Stack-based exploitation (format string reads stack arguments) +- Arbitrary read/write primitive when exploited + +**Vulnerable patterns**: + +```c +// VULNERABLE: User input as format string +printf(user_input); +fprintf(fp, user_input); +sprintf(buffer, user_input); +snprintf(buffer, size, user_input); +syslog(priority, user_input); + +// SAFE: Format string is literal +printf("%s", user_input); +fprintf(fp, "Input: %s\n", user_input); +sprintf(buffer, "Data: %s", user_input); +``` + +**What to look for in decompiled code**: +``` +Direct user input to format function: + read(0, buffer, 256); + printf(buffer); // VULNERABLE + +Variable format string: + char* fmt = get_format_string(); // Source from user + printf(fmt, args); // VULNERABLE if fmt user-controlled + +Missing format string: + fprintf(stderr, error_msg); // VULNERABLE if error_msg from user +``` + +**Exploitation primitives**: + +``` +%x or %p → Leak stack values (addresses, canaries, pointers) +%s → Arbitrary read (if pointer on stack) +%n → Arbitrary write (writes byte count to pointer) +%N$x → Direct parameter access (Nth argument) +%N$n → Write to Nth argument pointer + +Example attack: + printf("AAAA%10$x"); → Leak 10th stack parameter + printf("AAAA%7$n"); → Write to pointer at 7th stack position +``` + +**Investigation strategy**: +1. 
`search-decompilation` pattern="printf|fprintf|sprintf|snprintf|syslog" +2. `get-decompilation` at each match with includeContext=true +3. Check format string argument: Is it a constant string or variable? +4. If variable, trace source: Does it come from user input? +5. `set-bookmark` type="Warning" category="Format String" at vulnerability + +**Telltale signs**: +- printf/fprintf with single argument (no format string literal) +- Format string stored in writable buffer +- User input copied into format string variable +- Error message formatted with user-supplied data + +### Buffer Size vs. Operation Mismatch Patterns + +**Conceptual characteristics**: +- Buffer allocated with one size +- Operation assumes different (larger) size +- Off-by-one errors +- Mismatched size calculations + +**Common mismatch patterns**: + +```c +// Wrong size constant +char buffer[64]; +read(fd, buffer, 128); // Reads 128 into 64-byte buffer + +// Off-by-one +char buffer[64]; +for (i = 0; i <= 64; i++) // Loop goes to 64 (65 iterations) + buffer[i] = input[i]; // Writes one byte past end + +// Null terminator forgotten +char buffer[64]; +strncpy(buffer, input, 64); // May not null-terminate +printf("%s", buffer); // Reads past end if not terminated + +// Size calculation error +char buffer[64]; +memcpy(buffer, src, strlen(src)); // strlen doesn't include null byte + // But may overflow if strlen(src) >= 64 +``` + +**What to look for in decompiled code**: +``` +Size declaration: + local_48 = buffer (char array, size 64) + +Operation size: + read(0, local_48, 0x80); // 0x80 = 128 > 64 + +Offset calculation: + local_48[iVar1] = input[iVar1]; // Check iVar1 bounds + +Loop bounds: + for (i = 0; i < size; i++) // Is size validated? + buffer[i] = input[i]; // Does size match buffer capacity? +``` + +**Investigation strategy**: +1. `get-decompilation` → Identify buffer size from local variable declaration +2. Find operations on buffer (read, memcpy, strcpy, loops) +3. 
Compare buffer size to operation size +4. `rename-variables` → buffer, buffer_size, read_size for clarity +5. `set-decompilation-comment` → "Buffer overflow: reads 128 into 64-byte buffer" +6. `set-bookmark` type="Warning" category="Size Mismatch" + +**Telltale signs**: +- Magic constants in read/copy operations that don't match buffer size +- sizeof() used incorrectly (sizeof(pointer) vs. sizeof(array)) +- Off-by-one in loop bounds (<= instead of <) +- Missing null terminator checks + +### Integer Overflow Leading to Memory Corruption + +**Conceptual characteristics**: +- Integer arithmetic wraps around at type bounds +- Overflow in size calculation leads to small allocation +- Small allocation leads to buffer overflow +- Underflow in bounds check bypasses security + +**Vulnerable patterns**: + +```c +// Allocation size overflow +uint32_t count = user_input; // User controls this +uint32_t size = count * sizeof(element); // May overflow +buffer = malloc(size); // Allocates small buffer due to overflow +for (i = 0; i < count; i++) // Loop uses original count + buffer[i] = data[i]; // Heap overflow + +// Bounds check underflow +size_t len = user_input; +if (len < MAX_SIZE) { // Passes when len == 0 + memcpy(buffer, src, len - 1); // len - 1 wraps to SIZE_MAX if len == 0 (unsigned) +} + +// Sign confusion +int size = user_size; // User controls, may be negative +if (size < MAX_SIZE) { // Passes check if negative + memcpy(buffer, src, size); // Converted to size_t (huge number) +} +``` + +**What to look for in decompiled code**: +``` +Size calculation: + size = user_count * 16; // Multiplication may overflow + +Wraparound check missing: + if (user_count < 1000) { // Doesn't check for overflow + size = user_count * 16; + buf = malloc(size); + } + +Unsigned underflow: + if (len - 1 < 1024) { // What if len == 0?
+ +Sign conversion: + int signed_size = user_input; // Signed integer + malloc(signed_size); // Casted to size_t (unsigned) + // Negative becomes huge positive +``` + +**Investigation strategy**: +1. `search-decompilation` pattern="malloc|calloc|realloc" +2. Trace size parameter back to source +3. Check for multiplication/addition in size calculation +4. `change-variable-datatypes` to proper types (uint32_t, size_t, ssize_t) +5. Look for overflow checks (or lack thereof) +6. `set-decompilation-comment` → "Integer overflow: count * size may wrap" +7. `set-bookmark` type="Warning" category="Integer Overflow" + +**Telltale signs**: +- Multiplication in allocation size without overflow check +- Unsigned subtraction in bounds check +- Signed/unsigned type confusion +- Missing validation for very large user-supplied sizes + +### Use-After-Free Patterns + +**Conceptual characteristics**: +- Memory freed but pointer still accessible (dangling pointer) +- Dangling pointer dereferenced (use after free) +- Heap allocator may reuse freed memory for new allocation +- Type confusion when old pointer accesses new object + +**Vulnerable patterns**: + +```c +// Classic use-after-free +object* ptr = malloc(sizeof(object)); +use_object(ptr); +free(ptr); +// ... later in code ... +use_object(ptr); // Use after free! + +// Double-free (special case) +free(ptr); +free(ptr); // Corrupts heap metadata + +// Use-after-free via aliasing +object* ptr1 = malloc(sizeof(object)); +object* ptr2 = ptr1; // Aliased pointer +free(ptr1); +use_object(ptr2); // Use after free via alias +``` + +**What to look for in decompiled code**: +``` +Allocation and free: + heap_ptr = malloc(0x40); + // ... use heap_ptr ... + free(heap_ptr); + +Later usage (use-after-free): + // ... some code ... 
+ *heap_ptr = value; // Write to freed memory + function(heap_ptr); // Pass freed pointer + +Conditional free (double-free risk): + if (condition1) free(ptr); + if (condition2) free(ptr); // May free twice if both true + +No pointer nulling: + free(ptr); + // ptr not set to NULL, can be reused +``` + +**Investigation strategy**: +1. `search-decompilation` pattern="free" +2. For each free(), trace pointer usage after free +3. `find-cross-references` to pointer variable → See all uses +4. Check if pointer is nulled after free (ptr = NULL) +5. Check if pointer is checked before use (if (ptr != NULL)) +6. `rename-variables` → freed_ptr, dangling_ptr for clarity +7. `set-decompilation-comment` at use site → "Use-after-free" +8. `set-bookmark` type="Warning" category="Use-After-Free" + +**Telltale signs**: +- free() call without setting pointer to NULL +- Pointer dereferenced after free() in any code path +- Multiple free() calls on same pointer +- Pointer used in different contexts (freed as type A, used as type B) + +### Heap Overflow Patterns + +**Conceptual characteristics**: +- Allocation with one size +- Write operation exceeds allocated size +- Overflows into adjacent heap chunk +- Can corrupt heap metadata or adjacent object data + +**Vulnerable patterns**: + +```c +// Allocation too small +buffer = malloc(64); +read(fd, buffer, 128); // Heap overflow + +// Calculation error +buffer = malloc(count * sizeof(element)); +for (i = 0; i <= count; i++) // Off-by-one (should be <, not <=) + buffer[i] = data[i]; // Overflows by one element + +// Unchecked string operation on heap +buffer = malloc(64); +strcpy(buffer, user_input); // Overflow if user_input > 63 bytes +``` + +**What to look for in decompiled code**: +``` +Heap allocation: + heap_buf = malloc(0x40); // Allocates 64 bytes + +Write operation: + read(0, heap_buf, 0x100); // Reads 256 bytes → overflow + +Adjacent allocations: + buf1 = malloc(0x40); + buf2 = malloc(0x40); // buf2 likely adjacent to buf1 + 
strcpy(buf1, user_input); // May overflow into buf2 + +Metadata corruption risk: + chunk = malloc(size); + overflow_write(chunk, large_size); // May corrupt next chunk's metadata +``` + +**Investigation strategy**: +1. `search-decompilation` pattern="malloc" +2. Trace allocated buffer through code +3. Find write operations on buffer (strcpy, memcpy, read, loops) +4. Compare allocation size to write size +5. Check for adjacent allocations (exploitation targets) +6. `set-decompilation-comment` → "Heap overflow: writes 256 into 64-byte allocation" +7. `set-bookmark` type="Warning" category="Heap Overflow" + +**Telltale signs**: +- Small malloc() followed by large read/write +- String operations on heap buffers without bounds +- Loop writing to heap array without bounds check +- Multiple sequential allocations (heap layout predictable) + +--- + +## Exploitation Primitive Patterns + +### Arbitrary Memory Write Primitives + +**Conceptual characteristics**: +- Ability to write controlled data to chosen address +- Achieved through various vulnerability classes +- Foundation for control flow hijacking and data corruption + +**Primitive construction patterns**: + +**Format string arbitrary write**: +``` +// Concept: %n writes byte count to pointer argument +printf("AAAA%7$n"); +// If stack[7] is controlled pointer, writes to *stack[7] + +Technique: +1. Place target address on stack +2. Position format string to access it (%N$n) +3. Adjust byte count with padding to write desired value +4. 
Use width specifiers: %200c%7$n → writes 200+4=204 +``` + +**Buffer overflow arbitrary write**: +``` +// Concept: Overflow to overwrite pointer, then use pointer + +Step 1: Overflow to corrupt pointer +[buffer overflow] → [overwrite ptr variable] + +Step 2: Trigger write through pointer +*ptr = value; // Writes to attacker-controlled address +``` + +**Heap overflow arbitrary write**: +``` +// Concept: Overflow heap chunk to corrupt adjacent chunk's pointers + +Chunk layout: +[chunk1 metadata][chunk1 data][chunk2 metadata][chunk2 data] + +Overflow chunk1 data → overwrite chunk2 metadata → corrupt pointers +When chunk2 used, writes to attacker-controlled addresses +``` + +**Investigation strategy**: +1. Identify vulnerability (format string, overflow, use-after-free) +2. Analyze what can be overwritten +3. Trace pointer dereferencing after corruption +4. `set-bookmark` type="Analysis" category="Arbitrary Write" → Document primitive + +**What enables arbitrary write**: +- Controlled pointer value (overflow, format string) +- Dereference of controlled pointer (assignment, function call) +- Heap metadata corruption (unlink exploitation) + +### Arbitrary Memory Read Primitives + +**Conceptual characteristics**: +- Ability to read from chosen memory address +- Used to leak addresses, canaries, code/data +- Critical for defeating ASLR and other protections + +**Primitive construction patterns**: + +**Format string arbitrary read**: +``` +// Concept: %s reads string from pointer argument +printf("AAAA%10$s"); +// If stack[10] is controlled pointer, prints string at *stack[10] + +Technique: +1. Place target address on stack +2. Position format string to access it (%N$s) +3. 
Read output to obtain memory contents +``` + +**Uninitialized data read**: +``` +// Concept: Uninitialized variables contain previous stack/heap data + +Pattern in decompiled code: + char buffer[64]; + // No initialization + send(socket, buffer, 64, 0); // Leaks stack contents + +Investigation: + Look for send/write without initialization + Check if data used before written +``` + +**Buffer over-read**: +``` +// Concept: Read past end of buffer into adjacent memory + +Pattern: + char buffer[64]; + strncpy(buffer, input, 64); // No null termination + printf("%s", buffer); // Reads past end until null byte + +Result: Leaks adjacent stack data +``` + +**Investigation strategy**: +1. Find format string vulnerabilities (user-controlled format) +2. Find uninitialized variables sent to output +3. Find string operations missing null termination +4. `set-bookmark` type="Analysis" category="Info Leak" → Document primitive +5. Calculate what can be leaked (addresses, canaries, pointers) + +**What enables arbitrary read**: +- Format string with %s and controlled pointer +- Uninitialized buffer sent to network/file +- Missing null terminator allows over-read +- Heap use-after-free with read operations + +### Control Flow Hijack Primitives + +**Conceptual characteristics**: +- Redirect program execution to attacker-controlled location +- Achieved by overwriting function pointers or return addresses +- Goal: Execute shellcode, ROP chain, or existing functions + +**Hijack target patterns**: + +**Return address overwrite (stack overflow)**: +``` +Stack layout: +[buffer][saved rbp][return address] + +Overflow buffer → overwrite return address → redirect on function return + +What to look for: + Local buffer vulnerable to overflow + Return address at predictable offset (buffer_size + 8 on x64) + Calculate offset: buffer start to return address location +``` + +**Function pointer overwrite**: +``` +// Global or heap-allocated function pointer +void (*callback)(void) = default_handler; 
+ +// Overflow to overwrite callback +buffer_overflow → overwrite callback pointer + +// Trigger hijack +callback(); // Calls attacker-controlled address +``` + +**GOT/PLT overwrite**: +``` +// Global Offset Table contains addresses of library functions +// Overwrite GOT entry to redirect library call + +Example: + Overwrite GOT[puts] with system address + Next call to puts() actually calls system() + +Requirement: Arbitrary write primitive to GOT address +``` + +**Virtual table (vtable) overwrite**: +``` +// C++ objects have vtable pointers +// Overwrite vtable pointer to fake vtable + +Object layout: +[vtable ptr][member1][member2]... + +Overflow → overwrite vtable ptr → point to attacker-controlled memory +Virtual function call → uses fake vtable → hijacks control flow +``` + +**Investigation strategy**: +1. Identify overflow vulnerability +2. Determine what's adjacent in memory (return address, function pointer, vtable) +3. Calculate offset from buffer to target +4. `get-data` at GOT/PLT addresses → Get function pointer locations +5. `set-bookmark` type="Analysis" category="Control Flow Hijack" +6. 
Document target address and offset + +**Telltale signs**: +- Function pointers in global variables or structures +- Indirect calls through function pointers +- Virtual function calls (C++ code) +- GOT/PLT entries for library functions + +### Information Leak Primitives (Defeating ASLR) + +**Conceptual characteristics**: +- Leak address from memory to defeat address randomization +- Calculate base addresses from leaked pointers +- Use leaked addresses in subsequent exploitation + +**Leak source patterns**: + +**Stack address leak**: +``` +// Stack addresses often present on stack itself +Format string: printf("%p %p %p %p") // Leak stack pointers +Uninitialized: Stack variable contains previous stack frame address + +Use: Calculate stack layout, predict buffer addresses +``` + +**Code address leak (PIE bypass)**: +``` +// Return addresses on stack point to code section +Format string leak of return address → code address +Calculate code base: leaked_addr - known_offset (result must be page-aligned: low 12 bits zero) + +Use: Calculate gadget addresses, function addresses +``` + +**Libc address leak (ASLR bypass)**: +``` +// GOT contains resolved libc function addresses +Arbitrary read of GOT entry → libc function address +Calculate libc base: leaked_addr - function_offset + +Use: Calculate system(), one_gadget, useful function addresses +``` + +**Heap address leak**: +``` +// Heap pointers often in freed chunks or stack +Use-after-free leak: Read freed chunk (contains fd/bk pointers) +Format string: Leak heap pointer from stack + +Use: Predict heap layout, target heap objects +``` + +**Investigation strategy**: +1. Identify leak primitive (format string, uninitialized data, over-read) +2. Determine what's leaked (stack, code, heap, libc addresses) +3. Calculate offsets to useful addresses +4. `set-bookmark` type="Note" category="Address Leak" → Document leak +5.
`set-comment` → "Leaks libc address, calculate system() as libc_base + 0x4f4e0" + +**Telltale signs**: +- printf with user-controlled format string +- Send/write with uninitialized buffer +- String operations without null termination +- Heap metadata visible to program (freed chunks) + +--- + +## Common Exploitation Workflows + +### Stack Overflow to Shell + +**Attack flow**: +``` +1. Find buffer overflow on stack +2. Calculate offset to return address +3. Identify target for hijack: + a. Shellcode address (if NX disabled) + b. system() address (if no ASLR) + c. ROP chain address (if protections enabled) +4. Construct payload: [padding][return address][arguments/ROP] +5. Trigger overflow, return redirects to attacker code +6. Execute shellcode/system("/bin/sh") to get shell +``` + +**Investigation steps**: +1. `get-decompilation` of vulnerable function → Find buffer overflow +2. `rename-variables` → buffer, user_input, size +3. Calculate offset: buffer to return address (usually buffer_size + 8) +4. `search-strings-regex` pattern="/bin/sh" → Find shell string +5. `get-symbols` includeExternal=true → Find system() import +6. `set-bookmark` type="Analysis" category="Exploit Plan" +7. Document payload structure in comment + +### Format String to Arbitrary Write + +**Attack flow**: +``` +1. Find printf(user_input) vulnerability +2. Test format string: Send "%x %x %x" → leak stack values +3. Find offset to controlled data on stack +4. Construct format string to write to arbitrary address: + - Place target address on stack + - Use %N$n to write to address at stack[N] +5. Target: Overwrite GOT entry, return address, or function pointer +6. Redirect execution to attacker code +``` + +**Investigation steps**: +1. `search-decompilation` pattern="printf|sprintf" → Find format string calls +2. `get-decompilation` with includeContext → Verify format string from user +3. `get-data` at GOT addresses → Identify targets for overwrite +4. 
Calculate stack offset to controlled buffer +5. `set-bookmark` type="Warning" category="Format String" +6. Document exploitation: "%7$n writes to address at stack[7]" + +### Heap Exploitation to Code Execution + +**Attack flow**: +``` +1. Find heap vulnerability (use-after-free, heap overflow, double-free) +2. Understand heap layout (chunk sizes, allocation order) +3. Exploit heap corruption: + a. Use-after-free: Free object, allocate new, use old pointer (type confusion) + b. Heap overflow: Overflow chunk to corrupt adjacent chunk metadata + c. Double-free: Corrupt freelist to allocate arbitrary address +4. Gain arbitrary write or control flow hijack primitive +5. Overwrite function pointer, GOT entry, or return address +6. Execute attacker code +``` + +**Investigation steps**: +1. `search-decompilation` pattern="malloc|free" +2. Trace allocation and free patterns +3. Identify vulnerability (use-after-free, overflow, double-free) +4. `rename-variables` → chunk1, chunk2, freed_ptr, size +5. Analyze adjacent allocations (overflow targets) +6. `set-bookmark` type="Warning" category="Heap Vulnerability" +7. Document exploitation primitive achieved + +### Ret2libc (Return-to-libc) + +**Attack flow**: +``` +1. Find stack overflow vulnerability +2. Cannot use shellcode (NX enabled) +3. Redirect to existing libc function: system() +4. Set up arguments: First arg points to "/bin/sh" +5. Payload structure: + - Overflow to return address + - Overwrite return address → system() address + - Set first argument → pointer to "/bin/sh" string +6. Function returns, calls system("/bin/sh"), spawns shell +``` + +**Investigation steps**: +1. `get-decompilation` → Find buffer overflow +2. `search-strings-regex` pattern="/bin/sh" → Get shell string address +3. `get-symbols` includeExternal=true → Find system import +4. Check calling convention (x86: stack args, x64: RDI register) +5. Calculate ROP gadgets if needed: pop rdi; ret +6. 
`set-bookmark` type="Note" category="Ret2libc Plan" +7. Document payload: [padding][system_addr][ret_addr]["/bin/sh"_ptr] + +### ROP Chain Construction + +**Attack flow**: +``` +1. Find code execution vulnerability (overflow, etc.) +2. Protections prevent direct shellcode/ret2libc +3. Build ROP chain: Sequence of gadget addresses +4. Each gadget: Small code fragment ending in 'ret' +5. Chain gadgets to build desired operation (e.g., execve syscall) +6. Place chain on stack, trigger vulnerability +7. Execution flows through gadgets, performs desired operation +``` + +**Investigation steps**: +1. Identify required gadgets (pop rdi; ret, pop rsi; ret, syscall; ret, etc.) +2. Use external tool (ROPgadget) to find gadgets in binary/libc +3. `set-bookmark` type="Note" category="ROP Gadget" at each gadget address +4. `set-comment` at gadget address → "pop rdi; ret" +5. Document ROP chain structure: + - [gadget1_addr] → pop rdi; ret + - ["/bin/sh"_ptr] → argument for rdi + - [gadget2_addr] → pop rsi; ret + - [NULL] → argument for rsi + - [syscall_addr] → execve syscall +6. `set-bookmark` type="Analysis" category="ROP Chain Plan" + +--- + +## Protection Mechanism Bypass Patterns + +### Stack Canary Bypass + +**Canary mechanism**: +``` +Stack layout with canary: +[buffer][stack canary][saved rbp][return address] + +On function return: + if (canary != expected_canary) + __stack_chk_fail(); // Abort on corruption +``` + +**Bypass techniques**: + +**1. Leak canary value (format string, uninitialized data)**: +``` +printf(user_input); // Format string leak +Send "%7$p" → leak canary from stack position 7 +Include leaked canary in overflow payload to preserve it +``` + +**2. Brute-force canary (fork server)**: +``` +If server forks instead of exiting: + Canary same across fork + Brute-force one byte at a time + 256 attempts per byte, 1024 total for 32-bit canary +``` + +**3. 
Overwrite without corrupting canary**: +``` +Partial overwrite: Overflow only up to return address +Don't touch canary if it's not in the way +Or overwrite saved rbp and return address precisely +``` + +**Investigation**: +1. `search-decompilation` pattern="__stack_chk_fail" → Detect canary presence +2. `get-decompilation` → See canary check in code +3. Identify canary position on stack +4. `set-bookmark` type="Note" category="Stack Canary" → Document location +5. Plan bypass: leak, brute-force, or avoid + +### NX/DEP Bypass (No Execute) + +**Protection mechanism**: +``` +Stack/heap marked non-executable +Shellcode injection doesn't work (causes segfault) +``` + +**Bypass techniques**: + +**1. Return-to-libc (ret2libc)**: +``` +Don't inject code, reuse existing code +Redirect to system(), execve(), etc. +Set up arguments properly +``` + +**2. Return-Oriented Programming (ROP)**: +``` +Chain existing code fragments (gadgets) +Build complex operations from simple gadgets +No new code introduced +``` + +**3. mprotect/VirtualProtect ROP**: +``` +Use ROP to call mprotect(shellcode_addr, RWX) +Change shellcode memory to executable +Jump to now-executable shellcode +``` + +**Investigation**: +1. `get-memory-blocks` → Check stack/heap permissions (look for 'x' flag) +2. If NX enabled, plan ROP or ret2libc +3. `get-symbols` includeExternal=true → Find usable functions +4. `set-bookmark` type="Analysis" category="NX Bypass" + +### ASLR Bypass (Address Space Layout Randomization) + +**Protection mechanism**: +``` +Addresses randomized each execution +Code base, libc base, stack base, heap base all randomized +Exploit addresses must be dynamically calculated +``` + +**Bypass techniques**: + +**1. Information leak**: +``` +Leak address from memory (format string, uninitialized data) +Calculate base address from leaked pointer +Use base + offset to find desired functions +``` + +**2. 
Partial overwrite**: +``` +Only lowest 12 bits (page offset) are not randomized +Overwrite only last byte of address +Reduces entropy, enables brute-force or partial redirect +``` + +**3. Heap spraying (rarely applicable in CTF)**: +``` +Fill heap with controlled data +Increase probability of hitting controlled memory +``` + +**Investigation**: +1. Identify leak primitive (format string, over-read, uninitialized) +2. Calculate what's leaked (code, stack, heap, libc) +3. Determine offsets: leaked_addr to target_addr +4. `set-comment` → "Leak libc: system = libc_base + 0x4f4e0" +5. `set-bookmark` type="Analysis" category="ASLR Bypass" + +### PIE Bypass (Position Independent Executable) + +**Protection mechanism**: +``` +Code section randomized (in addition to ASLR) +Function addresses, gadget addresses randomized +Cannot hardcode code addresses +``` + +**Bypass techniques**: + +**1. Leak code address**: +``` +Leak return address from stack → points to code +Calculate code base: leaked_addr - known_offset (sanity check: low 12 bits of base are zero) +Calculate function/gadget addresses: code_base + offset +``` + +**2. Partial overwrite**: +``` +Overwrite only last byte of return address +Redirect within same function or nearby functions +Useful for redirecting to existing win() function +``` + +**Investigation**: +1. Identify if PIE enabled (check binary properties) +2. Find code address leak (stack return address) +3. Calculate offsets from code base to targets +4. `set-bookmark` type="Analysis" category="PIE Bypass" + +--- + +## Using This Reference + +### Pattern Recognition Workflow + +1. **Identify vulnerability class** → Match decompiled code to vulnerability patterns +2. **Determine exploitation primitive** → What capability does vulnerability provide? +3. **Check protections** → What bypass techniques are needed? +4. **Plan exploitation workflow** → Chain primitives to achieve goal +5. **Document in Ghidra** → Bookmarks, comments, renamed variables + +### Investigation Priority + +**Start with:** +1.
Unsafe API recognition (strcpy, printf, etc.) +2. Buffer size vs. operation size comparison +3. Input flow tracing (where does user data go?) + +**Then analyze:** +4. Memory layout (what's adjacent to vulnerable buffer?) +5. Available exploitation targets (return address, function pointers, GOT) +6. Protection mechanisms (canary, NX, ASLR, PIE) + +**Finally plan:** +7. Exploitation primitive construction +8. Protection bypass strategy +9. Payload structure +10. Exploit execution plan + +### Progressive Understanding + +**First pass**: "Unsafe strcpy in main() on buffer[64]" +**Second pass**: "Overflow of 64 bytes to reach return address at offset +72" +**Third pass**: "Can redirect to system@plt, need '/bin/sh' string address" +**Fourth pass**: "Full ret2libc: overflow → system('/bin/sh') → shell" + +Each iteration refines the exploitation plan. + +### Evidence-Based Exploitation + +Every claim needs evidence: +- "Buffer overflow exists" → Show buffer size < input size +- "Return address at offset 72" → Show stack layout calculation +- "Can call system()" → Show system@plt address or import +- "ASLR bypass possible" → Show leak primitive and calculation + +Document all evidence with bookmarks and comments in Ghidra. diff --git a/skills/ctf-rev/SKILL.md b/skills/ctf-rev/SKILL.md new file mode 100644 index 0000000..02f4ac6 --- /dev/null +++ b/skills/ctf-rev/SKILL.md @@ -0,0 +1,548 @@ +--- +name: ctf-rev +description: Solve CTF reverse engineering challenges using systematic analysis to find flags, keys, or passwords. Use for crackmes, binary bombs, key validators, obfuscated code, algorithm recovery, or any challenge requiring program comprehension to extract hidden information. +--- + +# CTF Reverse Engineering + +## Purpose + +You are a CTF reverse engineering solver. Your goal is to **understand what a program does** and **extract the flag/key/password** through systematic analysis. 
+ +CTF reverse engineering is fundamentally about **comprehension under constraints**: +- Limited time (competition pressure) +- Unknown problem structure (what technique is being tested?) +- Minimal documentation (that's the challenge!) +- Goal-oriented (find the flag, not perfect understanding) + +Unlike malware analysis or vulnerability research, CTF reversing tests your ability to: +1. **Quickly identify the core challenge** (crypto? obfuscation? algorithm recovery?) +2. **Trace critical data flow** (where does input go? how is it validated?) +3. **Recognize patterns** (standard algorithms, common tricks) +4. **Adapt your approach** (static vs dynamic, top-down vs bottom-up) + +## Conceptual Framework + +### The Three Questions + +Every reverse engineering challenge boils down to answering: + +**1. What does the program EXPECT?** +- Input format (string, number, binary data?) +- Input structure (length, format, encoding?) +- Validation criteria (checks, comparisons, constraints?) + +**2. What does the program DO?** +- Transformation (encrypt, hash, encode, compute?) +- Comparison (against hardcoded value, derived value?) +- Algorithm (standard crypto, custom logic, mathematical?) + +**3. How do I REVERSE it?** +- Is the operation reversible? (encryption vs hashing) +- Can I brute force? (keyspace size, performance) +- Can I derive the answer? (solve equations, trace backwards) +- Can I bypass? (patch, debug, manipulate state) + +### Understanding vs Solving + +**You don't need to understand everything** - focus on what gets you to the flag: + +**Full Understanding** (often unnecessary): +- Every function's purpose +- Complete program flow +- All edge cases and error handling +- Library implementation details + +**Sufficient Understanding** (what you need): +- Entry point to flag validation +- Core transformation logic +- Input-to-output relationship +- Comparison or success criteria + +**Example:** +``` +Program has 50 functions. 
You identify: +- main() calls validate_key() +- validate_key() calls transform_input() then compare_result() +- transform_input() does AES encryption +- compare_result() checks against hardcoded bytes + +Sufficient understanding: "Input is AES-encrypted and compared to constant" +You don't need to reverse the other 46 functions! +``` + +## Core Methodologies + +### Static Analysis: Code Comprehension + +**Goal:** Understand program logic by reading decompiled/disassembled code + +**When to use:** +- Small, focused programs (crackmes, keygens) +- Algorithm identification challenges +- When dynamic analysis is hindered (anti-debugging, complex state) +- When you need to understand transformation logic + +**Approach:** +1. **Find the critical path** - Entry point → flag validation → success +2. **Trace input flow** - Where does user input go? How is it used? +3. **Identify operations** - What transformations occur? (XOR, loops, comparisons) +4. **Recognize patterns** - Does this match known algorithms? (see patterns.md) + +**ReVa workflow:** +``` +1. get-decompilation of entry/main function + - includeIncomingReferences=true to see program structure + +2. Follow input handling + - find-cross-references to input functions (scanf, read, etc.) + - Trace data flow from input to validation + +3. Analyze transformations + - rename-variables to clarify data flow + - change-variable-datatypes to understand operations + - set-decompilation-comment to document logic + +4. Identify success criteria + - Find comparison or validation logic + - Extract expected values or patterns +``` + +### Dynamic Analysis: Runtime Observation + +**Goal:** Observe program behavior during execution + +**When to use:** +- Complex control flow (hard to follow statically) +- Obfuscated or packed code +- When you need to see intermediate values +- Time-based or environmental checks + +**Approach:** +1.
**Set breakpoints at key locations** + - Input processing + - Transformations + - Comparisons + - Success/failure branches + +2. **Observe state changes** + - Register/variable values + - Memory contents + - Function arguments/returns + +3. **Test hypotheses** + - "If I input X, does Y happen?" + - "What value is being compared here?" + +**Note:** ReVa focuses on static analysis. For dynamic analysis, use external debuggers (gdb, x64dbg, etc.) + +### Hybrid Approach: Best of Both Worlds + +**Most effective for CTF challenges** + +**Workflow:** +1. **Static: Identify structure** (find validation function, success path) +2. **Dynamic: Observe runtime** (breakpoint at validation, see expected value) +3. **Static: Understand transformation** (reverse the algorithm) +4. **Dynamic: Verify solution** (test your derived key/flag) + +**Example:** +``` +Static: "Input is transformed by function sub_401234 then compared" +Dynamic: Run with test input, breakpoint at comparison → see expected value +Static: Decompile sub_401234 → recognize as base64 encoding +Solve: base64_decode(expected_value) = flag +Dynamic: Verify flag works +``` + +## Problem-Solving Strategies + +### Strategy 1: Top-Down (Goal-Oriented) + +**Start from the win condition, work backwards** + +**When to use:** +- Clear success/failure indicators (prints "Correct!" or "Wrong!") +- Simple program structure +- When you want to understand the minimum necessary + +**Workflow:** +``` +1. Find success message/function +2. find-cross-references direction="to" → What calls this? +3. get-decompilation of validation function +4. Identify what conditions lead to success +5. Work backwards to understand required input +``` + +**Example:** +``` +1. String "Congratulations!" at 0x402000 +2. Referenced by function validate_flag at 0x401500 +3. Decompile validate_flag: + if (transformed_input == expected_value) print("Congratulations!"); +4. Now focus on: What's expected_value? How is input transformed? 
+``` + +### Strategy 2: Bottom-Up (Data Flow) + +**Start from input, trace forward to validation** + +**When to use:** +- Complex program structure (many functions) +- When win condition is unclear +- When you want to understand transformations + +**Workflow:** +``` +1. search-strings-regex pattern="(scanf|read|fgets|input)" +2. find-cross-references to input function +3. Trace data flow: input → storage → transformation → usage +4. Follow transformations until you reach comparison/validation +``` + +**Example:** +``` +1. scanf at 0x401000 reads into buffer +2. buffer passed to process_input(buffer) +3. process_input calls encrypt(buffer, key) +4. Encrypted result compared to hardcoded bytes +5. Now analyze: What's the encryption? Can we reverse it? +``` + +### Strategy 3: Pattern Recognition + +**Identify standard algorithms or common techniques** + +**When to use:** +- Crypto challenges (encryption, hashing) +- Encoding challenges (base64, custom encodings) +- Algorithm implementation challenges + +**Workflow:** +``` +1. Look for algorithmic patterns (see patterns.md): + - Loop structures (rounds, iterations) + - Constant arrays (S-boxes, tables) + - Characteristic operations (XOR, rotations, substitutions) + +2. Compare to known implementations: + - read-memory at constant arrays → compare to standard tables + - Count loop iterations → indicates algorithm variant + - search-decompilation for crypto patterns + +3. Once identified, apply standard solutions: + - AES → decrypt with known/derived key + - RC4 → decrypt with extracted key + - Custom XOR → reverse the XOR operation +``` + +### Strategy 4: Constraint Solving + +**Frame the problem as mathematical constraints** + +**When to use:** +- Serial/key validation (input must satisfy equations) +- Mathematical puzzles +- Multiple related checks + +**Workflow:** +``` +1. Identify all constraints on input: + input[0] + input[1] == 0x42 + input[0] ^ input[2] == 0x13 + input[1] * 2 == input[3] + +2. 
Extract to external solver (z3, constraint solver) + +3. Solve for input values + +4. Verify solution in program +``` + +**Example:** +``` +Decompiled validation: + if (flag[0] + flag[1] != 100) return 0; + if (flag[0] - flag[1] != 20) return 0; + if ((flag[2] ^ 0x42) != 0x33) return 0; + +Solve: + flag[0] + flag[1] = 100 + flag[0] - flag[1] = 20 + → flag[0] = 60, flag[1] = 40 + + flag[2] ^ 0x42 = 0x33 + → flag[2] = 0x33 ^ 0x42 = 0x71 = 'q' +``` + +## Flexible Workflow + +CTF challenges vary widely - adapt your approach: + +### Initial Assessment (5-10 minutes) + +**Understand the challenge:** +- What's provided? (binary, source, description?) +- What's the goal? (find flag, generate key, bypass check?) +- What's the constraint? (time limit, black box?) + +**ReVa reconnaissance:** +``` +1. get-current-program or list-project-files +2. get-strings-count and sample strings (100-200) + - Look for: flag format, hints, library names +3. get-symbols with includeExternal=true + - Check for suspicious imports (crypto APIs, anti-debug) +4. get-function-count to gauge complexity +``` + +### Focused Investigation (15-45 minutes) + +**Follow the most promising lead:** + +**If you found flag format in strings:** +→ Top-down from flag string + +**If you found crypto APIs:** +→ Pattern recognition (identify algorithm) + +**If you found input validation:** +→ Data flow tracing (input to validation) + +**If program is simple (< 10 functions):** +→ Comprehensive static analysis + +**If program is complex or obfuscated:** +→ Hybrid approach (dynamic to find key points, static to understand) + +### Solution Extraction (10-20 minutes) + +**Once you understand the mechanism:** + +1. **Can you reverse it?** + - Decryption, decoding, mathematical inverse + +2. **Can you derive it?** + - Solve constraints, extract from comparison + +3. **Can you brute force it?** + - Small keyspace, fast validation + +4.
**Can you bypass it?** + - Patch comparison, manipulate state + +**Verify your solution:** +- Test with actual program (if possible) +- Check flag format (usually flag{...} or CTF{...}) + +## Pattern Recognition + +CTF challenges often test recognition of standard patterns. See `patterns.md` for detailed guides on: + +**Cryptographic Patterns:** +- Block ciphers (AES, DES, custom) +- Stream ciphers (RC4, custom) +- Hash functions (MD5, SHA, custom) +- XOR obfuscation + +**Algorithm Patterns:** +- Encoding schemes (base64, custom alphabets) +- Mathematical operations (modular arithmetic, matrix operations) +- State machines (input validation via states) + +**Code Patterns:** +- Input validation loops +- Character-by-character comparisons +- Transformation + comparison structures +- Anti-debugging tricks (for CTF context) + +**Data Structure Patterns:** +- Lookup tables (substitution ciphers) +- Hardcoded arrays (expected values) +- Buffer transformations + +## ReVa Tool Usage for CTF + +### Discovery Tools + +**Find the interesting parts quickly:** + +``` +search-strings-regex pattern="(flag|key|password|correct|wrong|success)" +→ Find win/lose conditions + +search-decompilation pattern="(scanf|read|input|strcmp|memcmp)" +→ Find input/comparison functions + +get-functions-by-similarity searchString="check" +→ Find validation functions +``` + +### Analysis Tools + +**Understand the core logic:** + +``` +get-decompilation with includeIncomingReferences=true, includeReferenceContext=true +→ Get full context of validation logic + +find-cross-references direction="both" includeContext=true +→ Trace data flow and function relationships + +read-memory to extract constants, tables, expected values +→ Get hardcoded comparison targets +``` + +### Improvement Tools + +**Make code readable as you work:** + +``` +rename-variables to track data flow +→ input_buffer, encrypted_data, expected_hash + +change-variable-datatypes to clarify operations +→ uint8_t* for byte buffers, 
uint32_t for crypto state + +set-decompilation-comment to document findings +→ "AES round function", "Compares against flag" + +set-bookmark for important locations +→ type="Analysis" for key findings +→ type="TODO" for things to investigate +``` + +## Key Principles + +### 1. Goal Focus +**Don't analyze everything - focus on getting the flag** +- Identify critical path (input → validation → success) +- Ignore unrelated functions +- Sufficient understanding > complete understanding + +### 2. Adapt Quickly +**Switch strategies if stuck** +- Static not working? Try dynamic +- Too complex? Look for simpler approach (bypass, brute force) +- Pattern not matching? Could be custom algorithm + +### 3. Leverage Knowledge +**CTF challenges reuse concepts** +- Standard crypto algorithms +- Common obfuscation tricks +- Typical validation patterns +- Recognize and apply known solutions + +### 4. Document Progress +**Track what you learn** +``` +set-bookmark type="Analysis" category="Finding" + → Document what you've confirmed + +set-bookmark type="TODO" category="Investigate" + → Track unanswered questions + +set-decompilation-comment + → Preserve understanding for later reference +``` + +### 5. Verify Incrementally +**Test your understanding as you go** +- "If this is AES, I should see S-box constants" → Check +- "If input is XORed with 0x42, output[0] should be..." → Verify with example +- "If this is the flag comparison, changing this byte should..." 
→ Test hypothesis + +## Common CTF Challenge Types + +### Crackme / Serial Validation +**Challenge:** Find input that passes validation +**Approach:** Data flow tracing (input → validation logic) +**Key insight:** Focus on validation function, extract constraints + +### Algorithm Recovery +**Challenge:** Implement or reverse unknown algorithm +**Approach:** Pattern recognition, understand operations +**Key insight:** Look for mathematical patterns, trace transformations + +### Crypto Challenge +**Challenge:** Decrypt ciphertext or find key +**Approach:** Identify algorithm, extract key/IV, decrypt +**Key insight:** Recognize standard crypto patterns (see patterns.md) + +### Code Obfuscation +**Challenge:** Understand obfuscated/packed code +**Approach:** Dynamic analysis to observe deobfuscated state +**Key insight:** Let program do the work, observe result + +### Binary Bomb +**Challenge:** Defuse "bomb" by providing correct inputs for each phase +**Approach:** Phase-by-phase analysis, mixed static/dynamic +**Key insight:** Each phase typically tests different concept + +### Custom Encoding +**Challenge:** Decode encoded flag or encode input correctly +**Approach:** Identify encoding scheme, reverse or replicate +**Key insight:** Look for transformation loops, character mappings + +## Integration with Other Skills + +### After Binary Triage +**Triage identified suspicious areas → Deep dive with CTF focus** + +``` +From triage bookmarks: +- "Crypto function at 0x401234" → Identify algorithm, extract key +- "Input validation at 0x402000" → Understand constraints, solve +- "Suspicious string XOR" → Decode to find flag or hint +``` + +### Using Deep Analysis +**When you need detailed function understanding** + +``` +CTF skill identifies: "Validation at validate_key function" +Deep analysis answers: "What exactly does validate_key do?" +CTF skill uses result: Apply findings to extract flag +``` + +**Workflow:** +1. 
CTF skill: High-level strategy, identify critical functions +2. Deep analysis: Detailed investigation of specific functions +3. CTF skill: Synthesize findings, extract solution + +## Success Criteria + +**You've solved the challenge when you can:** + +1. **Demonstrate understanding:** + - Explain how input becomes output + - Identify the validation mechanism + - Recognize the core algorithm/technique + +2. **Extract the solution:** + - Provide the flag/key/password + - Explain how you derived it + - Verify it works (if testable) + +3. **Document the path:** + - Key functions and addresses + - Critical transformations or comparisons + - Solution method (reverse, derive, brute force, bypass) + +## Remember + +CTF reverse engineering is **problem-solving under constraints**: +- You have limited time +- You need sufficient, not perfect, understanding +- The goal is the flag, not comprehensive analysis +- Adapt your strategy based on what you find +- Leverage patterns and prior knowledge +- Switch between static and dynamic as needed + +**Focus on answering:** +1. What does the program expect? (input format/structure) +2. What does the program do? (transformation/validation) +3. How do I reverse it? (derive/decrypt/solve/bypass) + +When you answer these three questions, you have your flag. diff --git a/skills/ctf-rev/patterns.md b/skills/ctf-rev/patterns.md new file mode 100644 index 0000000..1d031c4 --- /dev/null +++ b/skills/ctf-rev/patterns.md @@ -0,0 +1,906 @@ +# CTF Reverse Engineering Pattern Recognition + +This document provides pattern recognition guides for common CTF reverse engineering challenges. Focus on **identifying patterns quickly** to guide your solution strategy. 
+ +## Cryptographic Patterns + +### Simple XOR Patterns + +**Recognition Signature:** +``` +Single-byte XOR: + for (i = 0; i < len; i++) + output[i] = input[i] ^ 0xKEY; + +Multi-byte XOR (repeating key): + for (i = 0; i < len; i++) + output[i] = input[i] ^ key[i % keylen]; + +Rolling XOR: + xor_val = seed; + for (i = 0; i < len; i++) { + output[i] = input[i] ^ xor_val; + xor_val = next_value(xor_val); // Linear congruential or similar + } +``` + +**What to look for:** +- Very short functions (5-15 lines decompiled) +- XOR operation in loop +- Constant value or small array +- Modulo operation for key index (`i % keylen`) + +**ReVa detection:** +``` +search-decompilation pattern="\\^" caseSensitive=false +→ Find XOR operations + +get-decompilation of suspicious function +→ Look for loop with XOR + +read-memory at key location +→ Extract XOR key +``` + +**Solution approach:** +- XOR is self-inverse: `decrypt(x) = encrypt(x)` +- If you have ciphertext + key: plaintext = ciphertext XOR key +- If you have plaintext + ciphertext: key = plaintext XOR ciphertext +- If you have partial known plaintext: derive key, decrypt rest + +### Base64 and Variants + +**Recognition Signature:** +``` +Character lookup table (64-character alphabet): + Standard: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/ + Custom: May use different alphabet + +Bit manipulation: + 3 bytes → 4 encoded characters + Shifting and masking: (data >> 18) & 0x3F + +Padding: + '=' characters or custom padding +``` + +**What to look for:** +- 64-character string constant (lookup table) +- Bit shifting: `>> 6`, `>> 12`, `>> 18` +- Masking: `& 0x3F` (6 bits) +- 3-to-4 or 4-to-3 byte conversion ratio +- Padding logic + +**ReVa detection:** +``` +search-strings-regex pattern="[A-Za-z0-9+/]{64}" +→ Find base64 alphabet + +search-decompilation pattern="& 0x3f" +→ Find 6-bit masking (base64 characteristic) + +get-decompilation of encoding function +→ Confirm 3→4 byte transformation +``` + +**Solution 
approach:** +- If standard base64: use standard decoder +- If custom alphabet: map custom → standard, then decode +- Reverse engineering: identify alphabet, implement decoder + +### Block Cipher Patterns (AES, DES, etc.) + +**Recognition Signature:** +``` +AES characteristics: + - 128-bit (16-byte) blocks + - 10, 12, or 14 rounds (for 128, 192, 256-bit keys) + - S-box: 256-byte constant array starting 63 7c 77 7b f2 6b 6f c5... + - Mix columns, shift rows operations + - Key schedule expansion + +DES characteristics: + - 64-bit (8-byte) blocks + - 16 rounds + - Permutation tables (IP, FP, E, P, S-boxes) + - Feistel structure (split, swap, repeat) +``` + +**What to look for:** +``` +Nested loops: + for (round = 0; round < NUM_ROUNDS; round++) + for (i = 0; i < BLOCK_SIZE; i++) + state[i] = transform(state[i], key[round]); + +Large constant arrays: + uint8_t sbox[256] = {0x63, 0x7c, 0x77, ...}; + +Block processing: + Fixed-size chunks (16 bytes for AES, 8 for DES) + +Key schedule: + Function deriving round keys from master key +``` + +**ReVa detection:** +``` +search-decompilation pattern="(for.*round|for.*0x10)" +→ Find round loops + +read-memory at constant arrays +→ Compare first bytes to known S-boxes: + AES: 63 7c 77 7b f2 6b 6f c5 + DES S1: 0e 04 0d 01 02 0f 0b 08 + +get-decompilation with focus on nested loops +→ Count iterations (round count indicates key size) +``` + +**Solution approach:** +- Identify algorithm by S-box or constants +- Extract key from memory or key schedule +- Use standard implementation to decrypt +- For custom implementations, replicate in Python/C + +### Stream Cipher Patterns (RC4, etc.) 
+ +**Recognition Signature:** +``` +RC4 characteristics: + KSA (Key Scheduling Algorithm): + for i = 0 to 255: S[i] = i + j = 0 + for i = 0 to 255: j = (j + S[i] + key[i % keylen]) % 256; swap S[i] with S[j] + + PRGA (Pseudo-Random Generation Algorithm): + i = 0, j = 0 + while generating: + i = (i + 1) % 256 + j = (j + S[i]) % 256 + swap(S[i], S[j]) + output = S[(S[i] + S[j]) % 256] +``` + +**What to look for:** +``` +State array initialization: + for (i = 0; i < 256; i++) state[i] = i; + +Swap operations: + temp = arr[i]; + arr[i] = arr[j]; + arr[j] = temp; + +Modulo arithmetic: + (i + 1) % 256 + index & 0xFF (equivalent to % 256) + +Simple XOR with keystream: + output[i] = input[i] ^ keystream[i]; +``` + +**ReVa detection:** +``` +search-decompilation pattern="(swap|temp.*=.*\\[)" +→ Find array swap operations + +get-decompilation of initialization +→ Look for 0-255 loop filling array + +find-cross-references to state array +→ Trace usage through KSA and PRGA +``` + +**Solution approach:** +- Extract key from initialization +- Replicate KSA to generate initial state +- Replicate PRGA to generate keystream +- XOR ciphertext with keystream to decrypt + +### Hash Function Patterns + +**Recognition Signature:** +``` +MD5/SHA characteristics: + - Fixed initialization vectors (magic constants) + - Block processing (512 bits / 64 bytes) + - Multiple rounds (64 for MD5/SHA-256, 80 for SHA-1) + - Bitwise operations: rotations, XOR, AND, OR, NOT + - Padding: append 0x80, then zeros, then length + +Magic constants: + MD5: 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 + SHA-1: adds 0xc3d2e1f0 + SHA-256: Eight 32-bit constants derived from square roots +``` + +**What to look for:** +``` +Characteristic constants: + Search for 0x67452301 (MD5/SHA-1 IV) + +Fixed round counts: + for (round = 0; round < 64; round++) // MD5, SHA-256 + for (round = 0; round < 80; round++) // SHA-1 + +Bitwise rotation macros: + ROTL(x, n) = (x << n) | (x >> (32-n)) + +Message schedule (W array): + Expands 16 input
words to 64/80 words + +Padding logic: + Append 0x80, zeros, then 64-bit length +``` + +**ReVa detection:** +``` +search-decompilation pattern="0x67452301" +→ Find MD5/SHA initialization + +read-memory at round constants +→ Identify specific hash variant + +get-decompilation of hash function +→ Count rounds, identify structure +``` + +**Solution approach:** +- Hash functions are one-way (cannot decrypt) +- If you find hash of flag: need to brute force or use known input +- If you find comparison: extract expected hash, try common flags +- Check for weak hash (MD5, SHA-1) or short input (brute-forceable) + +## Encoding Patterns + +### Character Substitution + +**Recognition Signature:** +``` +Lookup table mapping: + output[i] = table[input[i]]; + +Caesar cipher (shift): + output[i] = (input[i] - 'A' + shift) % 26 + 'A'; + +Custom alphabet: + const char* alphabet = "ZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjihgfedcba"; + output[i] = alphabet[input[i] - 'A']; +``` + +**What to look for:** +- Character array constants (alphabets, substitution tables) +- Character-by-character processing loops +- Range checks: `if (c >= 'A' && c <= 'Z')` +- Arithmetic on character codes: `c - 'A'`, `c + shift` + +**ReVa detection:** +``` +search-strings-regex pattern="[A-Z]{26}" +→ Find alphabet strings + +search-decompilation pattern="(- 'A'|% 26)" +→ Find character arithmetic + +get-decompilation of encoding function +→ Identify substitution pattern +``` + +**Solution approach:** +- Extract substitution table or shift value +- Build reverse mapping +- Apply to encoded data + +### Binary-to-Text Encodings + +**Recognition Signature:** +``` +Hex encoding: + "0123456789abcdef" + nibble_high = (byte >> 4) & 0xF; + nibble_low = byte & 0xF; + +Binary/ASCII: + Converting to "01011010" strings + +Custom encodings: + Mapping bytes to multi-character sequences +``` + +**What to look for:** +- Hex digit strings +- Bit extraction: `>> 4`, `& 0xF`, `& 1` +- Character code generation loops +- 
1-to-2 or 1-to-8 byte expansion + +**ReVa detection:** +``` +search-decompilation pattern="(>> 4|& 0xf)" +→ Find nibble extraction (hex encoding) + +get-strings to find encoding alphabets +→ Check for hex, binary digit strings +``` + +**Solution approach:** +- Identify encoding scheme +- Implement decoder +- Apply to encoded flag + +## Input Validation Patterns + +### Character-by-Character Comparison + +**Recognition Signature:** +``` +Direct comparison: + for (i = 0; i < len; i++) + if (input[i] != expected[i]) + return 0; + return 1; + +Comparison with transformation: + for (i = 0; i < len; i++) + if (transform(input[i]) != expected[i]) + return 0; +``` + +**What to look for:** +- Loop over input length +- Comparison inside loop: `!=`, `==` +- Early return on mismatch +- Success after full loop completion + +**ReVa detection:** +``` +search-decompilation pattern="(if.*!=|if.*==)" +→ Find comparison operations + +get-decompilation of validation function +→ Identify loop structure + +read-memory at expected value array +→ Extract expected bytes +``` + +**Solution approach:** +- If direct comparison: read expected array, that's the flag +- If transformed comparison: reverse transformation +- If complex transformation: trace each character + +### Checksum Validation + +**Recognition Signature:** +``` +Sum check: + sum = 0; + for (i = 0; i < len; i++) + sum += input[i]; + return (sum == EXPECTED_SUM); + +XOR check: + xor = 0; + for (i = 0; i < len; i++) + xor ^= input[i]; + return (xor == EXPECTED_XOR); + +Custom accumulation: + result = SEED; + for (i = 0; i < len; i++) + result = (result * MULT + input[i]) % MOD; + return (result == EXPECTED); +``` + +**What to look for:** +- Accumulator variable (sum, product, xor) +- Loop updating accumulator +- Final comparison to constant +- May be combined with other checks + +**ReVa detection:** +``` +search-decompilation pattern="(\\+=|\\*=|\\^=)" +→ Find accumulator updates + +get-decompilation of validation +→ Identify 
accumulation pattern + +read-memory at expected value +→ Extract target checksum +``` + +**Solution approach:** +- Single checksum: underconstrained (many solutions) +- Multiple checksums: may uniquely identify input +- Extract all constraints, solve as system of equations + +### Constraint-Based Validation + +**Recognition Signature:** +``` +Multiple independent checks: + if (input[0] + input[1] != 0x64) return 0; + if (input[0] - input[1] != 0x14) return 0; + if ((input[2] ^ 0x42) != 0x33) return 0; + if (input[3] * 2 != input[4]) return 0; + return 1; + +Relational constraints: + if (input[i] != input[j] + 5) return 0; +``` + +**What to look for:** +- Multiple if-statements with comparisons +- Arithmetic operations on input elements +- Relationships between different input positions +- Constants in comparisons + +**ReVa detection:** +``` +get-decompilation of validation function +→ Identify all comparison statements + +set-decompilation-comment on each constraint +→ Document relationships + +Extract to external solver: +→ List all constraints, solve with z3 or similar +``` + +**Solution approach:** +- Extract all constraints +- Frame as system of equations +- Solve using constraint solver (z3, SMT) +- Verify solution satisfies all constraints + +## Algorithm Patterns + +### Mathematical Sequences + +**Recognition Signature:** +``` +Fibonacci: + a = 0, b = 1; + while (...)
{ + next = a + b; + a = b; + b = next; + } + +Factorial: + result = 1; + for (i = 1; i <= n; i++) + result *= i; + +Prime checking: + for (i = 2; i <= sqrt(n); i++) + if (n % i == 0) return 0; + return 1; +``` + +**What to look for:** +- Iterative or recursive patterns +- Arithmetic progressions +- Number theory operations (modulo, divisibility) +- Known sequence generation + +**ReVa detection:** +``` +search-decompilation pattern="(fibonacci|factorial|prime)" +→ Find named functions (if not stripped) + +get-decompilation of suspicious function +→ Identify mathematical pattern + +Recognize by structure: +→ Two-variable update (Fibonacci) +→ Multiplication accumulator (factorial) +→ Modulo divisibility (prime check) +``` + +**Solution approach:** +- Recognize the algorithm +- Understand how it validates input +- Derive required input or replicate logic + +### Matrix Operations + +**Recognition Signature:** +``` +Matrix multiplication: + for (i = 0; i < rows; i++) + for (j = 0; j < cols; j++) + for (k = 0; k < inner; k++) + result[i][j] += a[i][k] * b[k][j]; + +Linear transformations: + output[i] = matrix[i][0] * input[0] + matrix[i][1] * input[1] + ...; +``` + +**What to look for:** +- Triple-nested loops (matrix multiply) +- 2D array indexing: `array[i][j]` or `array[i * width + j]` +- Accumulator in inner loop +- Linear combinations of input + +**ReVa detection:** +``` +search-decompilation pattern="\\[.*\\]\\[.*\\]" +→ Find 2D array access + +get-decompilation showing nested loops +→ Count loop depth (3 = likely matrix multiply) + +read-memory at matrix constants +→ Extract transformation matrix +``` + +**Solution approach:** +- Extract matrix +- Invert matrix (if square and invertible) +- Apply inverse to expected output to get required input + +### State Machine Patterns + +**Recognition Signature:** +``` +Explicit state variable: + int state = STATE_INIT; + while (running) { + switch (state) { + case STATE_INIT: /* ...
*/ state = STATE_READY; break; + case STATE_READY: /* ... */ state = STATE_PROCESS; break; + case STATE_PROCESS: /* ... */ state = STATE_DONE; break; + } + } + +Implicit state (position in input): + for (i = 0; i < len; i++) { + if (/* condition based on i and input */) + /* different processing for different positions */ + } +``` + +**What to look for:** +- State variable with multiple values +- Large switch statement on state +- State transitions (state = NEW_STATE) +- Different behavior based on current state + +**ReVa detection:** +``` +search-decompilation pattern="(case|switch)" +→ Find switch statements + +get-decompilation of state machine +→ Map state transitions + +rename-variables to clarify states +→ current_state, next_state, etc. +``` + +**Solution approach:** +- Map state transition graph +- Identify accepting states (success) +- Determine input sequence that reaches accepting state + +## Obfuscation Patterns + +### Control Flow Obfuscation + +**Recognition Signature:** +``` +Opaque predicates (always true/false): + if (x * x >= 0) // Always true + real_code(); + else + never_executed(); + +Dispatcher loops: + while (1) { + switch (dispatch_value) { + case 0: /* block A */; dispatch_value = 5; break; + case 5: /* block B */; dispatch_value = 2; break; + case 2: /* block C */; dispatch_value = -1; break; + case -1: return; + } + } +``` + +**What to look for:** +- Unnecessary conditionals +- Complex control flow with simple logic +- Dispatcher-based execution (case jumps) +- Dead code branches + +**ReVa detection:** +``` +get-decompilation of obfuscated function +→ Look for unusual control flow + +set-bookmark type="Warning" for suspicious patterns +→ Mark opaque predicates, dispatchers + +Focus on data flow, ignore control flow complexity +→ Track input transformation regardless of jumps +``` + +**Solution approach:** +- Ignore obfuscation, trace data flow +- Use dynamic analysis to observe actual execution path +- Simplify manually or with 
deobfuscation tools + +### String Obfuscation + +**Recognition Signature:** +``` +Stack strings (character-by-character): + str[0] = 'f'; str[1] = 'l'; str[2] = 'a'; str[3] = 'g'; + +Encrypted strings (decrypted at runtime): + decrypt_string(encrypted_data, key, output); + +Computed strings: + for (i = 0; i < len; i++) + str[i] = base[i] ^ key; +``` + +**What to look for:** +- Character assignments to array +- String decryption functions +- XOR or arithmetic on character arrays +- Strings not visible in static string list + +**ReVa detection:** +``` +get-strings may not show obfuscated strings +→ Use decompilation to find construction + +search-decompilation pattern="\\[0\\] = " +→ Find character-by-character assignments + +find-cross-references to decryption functions +→ Locate where strings are revealed +``` + +**Solution approach:** +- Identify deobfuscation routine +- Extract encrypted data and key +- Decrypt manually or use dynamic analysis to observe decrypted string + +### Anti-Debugging (CTF Context) + +**Recognition Signature:** +``` +Debugger detection: + if (ptrace(PTRACE_TRACEME, 0, 1, 0) < 0) exit(1); // Linux + if (IsDebuggerPresent()) exit(1); // Windows + +Timing checks: + start = time(); + /* short operation */ + end = time(); + if (end - start > THRESHOLD) exit(1); // Detected breakpoint delay + +Self-modification: + Decrypt code section at runtime + Execute decrypted code + Re-encrypt afterwards +``` + +**What to look for:** +- Debugger detection APIs +- Timing measurements +- Memory protection changes +- Code modification at runtime + +**ReVa detection:** +``` +get-symbols includeExternal=true +→ Look for: ptrace, IsDebuggerPresent, time, gettimeofday + +search-decompilation pattern="(ptrace|IsDebugger|time)" +→ Find anti-debug checks + +find-cross-references to VirtualProtect, mprotect +→ Identify self-modifying code +``` + +**Solution approach:** +- Patch out anti-debug checks (NOP the exit) +- Use anti-anti-debugging tools +- Analyze in 
sandbox that hides debugger +- For CTF, often acceptable to patch binary + +## Common CTF Tricks + +### Flag Format Validation + +**Pattern:** +``` +Check prefix: + if (strncmp(input, "flag{", 5) != 0) return 0; + +Check suffix: + if (input[len-1] != '}') return 0; + +Check length: + if (strlen(input) != EXPECTED_LEN) return 0; +``` + +**What to look for:** +- String comparison with literal "flag{" or "CTF{" +- Bracket/brace checks +- Length validation + +**ReVa detection:** +``` +search-strings-regex pattern="(flag\\{|CTF\\{)" +→ Find flag format strings + +get-decompilation of validation +→ Extract format requirements +``` + +**Solution approach:** +- Note format requirements +- Focus on solving for content between delimiters +- Reconstruct full flag with proper format + +### Multi-Stage Validation + +**Pattern:** +``` +Stage 1: Check format (flag{...}) +Stage 2: Check length (must be 32 characters) +Stage 3: Check checksum (sum must equal X) +Stage 4: Check encryption (encrypted content matches Y) +``` + +**What to look for:** +- Multiple validation functions called in sequence +- Early exits on failure +- Progressive constraints + +**ReVa detection:** +``` +find-cross-references to validation function +→ See if called from multi-stage validator + +get-decompilation of main validator +→ Identify call sequence + +Analyze each stage separately +→ Understand cumulative constraints +``` + +**Solution approach:** +- Solve each stage's constraints +- Combine solutions (flag must satisfy ALL stages) +- Work backwards from most constrained to least + +### Hidden Success Path + +**Pattern:** +``` +Obvious failure message: + printf("Wrong!\n"); + +Hidden success logic: + if (/* complex condition */) + system("cat /flag.txt"); // No message, just action +``` + +**What to look for:** +- Success action without visible message +- File access (cat flag, open flag.txt) +- Network communication of flag +- Success indicated by lack of "Wrong" message + +**ReVa detection:** +``` 
+search-strings-regex pattern="(flag|/flag|flag\\.txt)" +→ Find flag file references + +find-cross-references to flag file +→ Locate success path + +get-decompilation of success condition +→ Understand requirements +``` + +**Solution approach:** +- Don't rely on "Correct!" message +- Look for flag output actions +- Check for file reads, network sends +- Success may be silent + +## Using These Patterns + +### Pattern Matching Workflow + +1. **Observe code structure** + - Loops, conditionals, function calls + - Data types, array sizes + - Constants and literals + +2. **Compare to pattern catalog** + - Does this match a crypto pattern? + - Is this an encoding scheme? + - Looks like input validation? + +3. **Verify with specific checks** + ``` + Hypothesis: This is AES + Check 1: read-memory at constant array → Matches AES S-box? ✓ + Check 2: Count loop iterations → 10, 12, or 14? ✓ + Check 3: Block size 16 bytes? ✓ + Conclusion: AES confirmed + ``` + +4. **Apply pattern-specific solution** + - AES → Extract key, decrypt + - XOR → Extract key, XOR again + - Constraint validation → Extract constraints, solve + +### Quick Reference Decision Tree + +``` +Does it have loops with XOR? + → Check Simple XOR Patterns + +Does it have large constant arrays? + → Check Block Cipher or Hash Patterns + +Does it have swap operations and modulo? + → Check Stream Cipher Patterns + +Does it have character-by-character comparison? + → Check Input Validation Patterns + +Does it have 64-character lookup table? + → Check Base64 Pattern + +Does it have mathematical operations (factorial, fibonacci)? + → Check Algorithm Patterns + +Is control flow overly complex? + → Check Obfuscation Patterns +``` + +### Combining Patterns + +Real challenges often combine multiple patterns: + +**Example: Crypto + Validation** +``` +Input → Format Check (flag{...}) → XOR Decode → AES Decrypt → Compare to Expected +``` + +**Solve:** +1. Extract format requirements +2. Identify XOR key +3. Identify AES key +4. 
Extract expected value +5. Work backwards by inverting each stage in reverse order: input = XOR_encode(AES_encrypt(expected)) with known keys (XOR with the same key is its own inverse) + +**Example: Encoding + Constraint** +``` +Input → Base64 Decode → Constraint Check (sum == X, product == Y) +``` + +**Solve:** +1. Extract constraints on decoded values +2. Solve constraints +3. Base64 encode solution + +## Remember + +Patterns are **recognition shortcuts**, not rigid rules: +- Use patterns to quickly identify challenge type +- Adapt pattern solutions to specific implementation +- If pattern doesn't fit, analyze from first principles +- Document your pattern matches with bookmarks/comments +- Build your own pattern library from experience + +When you recognize a pattern, you skip hours of analysis and jump directly to solution strategy. diff --git a/skills/deep-analysis/SKILL.md b/skills/deep-analysis/SKILL.md new file mode 100644 index 0000000..25a39e3 --- /dev/null +++ b/skills/deep-analysis/SKILL.md @@ -0,0 +1,607 @@ +--- +name: deep-analysis +description: Performs focused, depth-first investigation of specific reverse engineering questions through iterative analysis and database improvement. Answers questions like "What does this function do?", "Does this use crypto?", "What's the C2 address?", "Fix types in this function". Makes incremental improvements (renaming, retyping, commenting) to aid understanding. Returns evidence-based answers with new investigation threads. Use after binary-triage for investigating specific suspicious areas or when user asks focused questions about binary behavior. +--- + +# Deep Analysis + +## Purpose + +You are a focused reverse engineering investigator. Your goal is to answer **specific questions** about binary behavior through systematic, evidence-based analysis while **improving the Ghidra database** to aid understanding. 
+ +Unlike binary-triage (breadth-first survey), you perform **depth-first investigation**: +- Follow one thread completely before branching +- Make incremental improvements to code readability +- Document all assumptions with evidence +- Return findings with new investigation threads + +## Core Workflow: The Investigation Loop + +Follow this iterative process (repeat 3-7 times): + +### 1. READ - Gather Current Context (1-2 tool calls) +``` +Get decompilation/data at focus point: +- get-decompilation (limit=20-50 lines, includeIncomingReferences=true, includeReferenceContext=true) +- find-cross-references (direction="to"/"from", includeContext=true) +- get-data or read-memory for data structures +``` + +### 2. UNDERSTAND - Analyze What You See +Ask yourself: +- What is unclear? (variable names, types, logic flow) +- What operations are being performed? +- What APIs/strings/data are referenced? +- What assumptions am I making? + +### 3. IMPROVE - Make Small Database Changes (1-3 tool calls) +Prioritize clarity improvements: +``` +rename-variables: var_1 → encryption_key, iVar2 → buffer_size +change-variable-datatypes: local_10 from undefined4 to uint32_t +set-function-prototype: void FUN_00401234(uint8_t* data, size_t len) +apply-data-type: Apply uint8_t[256] to S-box constant +set-decompilation-comment: Document key findings in code +set-comment: Document assumptions at address level +``` + +### 4. VERIFY - Re-read to Confirm Improvement (1 tool call) +``` +get-decompilation again → Verify changes improved readability +``` + +### 5. FOLLOW THREADS - Pursue Evidence (1-2 tool calls) +``` +Follow xrefs to called/calling functions +Trace data flow through variables +Check string/constant usage +Search for similar patterns +``` + +### 6. 
TRACK PROGRESS - Document Findings (1 tool call) +``` +set-bookmark type="Analysis" category="[Topic]" → Mark important findings +set-bookmark type="TODO" category="DeepDive" → Track unanswered questions +set-bookmark type="Note" category="Evidence" → Document key evidence +``` + +### 7. ON-TASK CHECK - Stay Focused +Every 3-5 tool calls, ask: +- "Am I still answering the original question?" +- "Is this lead productive or a distraction?" +- "Do I have enough evidence to conclude?" +- "Should I return partial results now?" + +## Question Type Strategies + +### "What does function X do?" + +**Discovery:** +1. `get-decompilation` with `includeIncomingReferences=true` +2. `find-cross-references` direction="to" to see who calls it + +**Investigation:** +3. Identify key operations (loops, conditionals, API calls) +4. Check strings/constants referenced: `get-data`, `read-memory` +5. `rename-variables` based on usage patterns +6. `change-variable-datatypes` where evident from operations +7. `set-decompilation-comment` to document behavior + +**Synthesis:** +8. Summarize function behavior with evidence +9. Return threads: "What calls this?", "What does it do with results?" + +### "Does this use cryptography?" + +**Discovery:** +1. `search-strings-regex` pattern="(AES|RSA|encrypt|decrypt|crypto|cipher)" +2. `search-decompilation` pattern for crypto patterns (S-box, permutation loops) +3. `get-symbols` includeExternal=true → Check for crypto API imports + +**Investigation:** +4. `find-cross-references` to crypto strings/constants +5. `get-decompilation` of functions referencing crypto indicators +6. Look for crypto patterns: substitution boxes, key schedules, rounds +7. `read-memory` at constants to check for S-boxes (0x63, 0x7c, 0x77, 0x7b...) + +**Improvement:** +8. `rename-variables`: key, plaintext, ciphertext, sbox +9. `apply-data-type`: uint8_t[256] for S-boxes, uint32_t[60] for key schedules +10. 
`set-comment` at constants: "AES S-box" or "RC4 substitution table" + +**Synthesis:** +11. Return: Algorithm type, mode, key size with specific evidence +12. Threads: "Where does key originate?", "What data is encrypted?" + +### "What is the C2 address?" + +**Discovery:** +1. `search-strings-regex` pattern="(http|https|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+|\.com|\.net|\.org)" +2. `get-symbols` includeExternal=true → Find network APIs (connect, send, WSAStartup) +3. `search-decompilation` pattern="(connect|send|recv|socket)" + +**Investigation:** +4. `find-cross-references` to network strings (URLs, IPs) +5. `get-decompilation` of network functions +6. Trace data flow from strings to network calls +7. Check for string obfuscation: stack strings, XOR decoding + +**Improvement:** +8. `rename-variables`: c2_url, server_ip, port +9. `set-decompilation-comment`: "Connects to C2 server" +10. `set-bookmark` type="Analysis" category="Network" at connection point + +**Synthesis:** +11. Return: All potential C2 indicators with evidence +12. Threads: "How is C2 address selected?", "What protocol is used?" + +### "Fix types in this function" + +**Discovery:** +1. `get-decompilation` to see current state +2. Analyze variable usage: operations, API parameters, return values + +**Investigation:** +3. For each unclear type, check: + - What operations? (arithmetic → int, pointer deref → pointer) + - What APIs called with it? (check API signature) + - What's returned/passed? (trace data flow) + +**Improvement:** +4. `change-variable-datatypes` based on usage evidence +5. Check for structure patterns: repeated field access at fixed offsets +6. `apply-structure` or `apply-data-type` for complex types +7. `set-function-prototype` to fix parameter/return types + +**Verification:** +8. `get-decompilation` again → Verify code makes more sense +9. Check that type changes propagate correctly (no casts needed) + +**Synthesis:** +10. Return: List of type changes with rationale +11. 
Threads: "Are these structure fields correct?", "Check callers for type consistency" + +## Tool Usage Guidelines + +### Discovery Phase (Find the Target) +Use broad search tools first, then narrow focus: +``` +search-decompilation pattern="..." → Find functions doing X +search-strings-regex pattern="..." → Find strings matching pattern +get-strings-by-similarity searchString="..." → Find similar strings +get-functions-by-similarity searchString="..." → Find similar functions +find-cross-references location="..." direction="to" → Who references this? +``` + +### Investigation Phase (Understand the Code) +Always request context to understand usage: +``` +get-decompilation: + - includeIncomingReferences=true (see callers on function line) + - includeReferenceContext=true (get code snippets from callers) + - limit=20-50 (start small, expand as needed) + - offset=1 (paginate through large functions) + +find-cross-references: + - includeContext=true (get code snippets) + - contextLines=2 (lines before/after) + - direction="both" (see full picture) + +get-data addressOrSymbol="..." → Inspect data structures +read-memory addressOrSymbol="..." length=... 
→ Check constants +``` + +### Improvement Phase (Make Code Readable) +Prioritize high-impact, low-cost improvements: + +**PRIORITY 1: Variable Naming** (biggest clarity gain) +``` +rename-variables: + - Use descriptive names based on usage + - Example: var_1 → encryption_key, iVar2 → buffer_size + - Rename only what you understand (don't guess) +``` + +**PRIORITY 2: Type Correction** (fixes casts, clarifies operations) +``` +change-variable-datatypes: + - Use evidence from operations/APIs + - Example: local_10 from undefined4 to uint32_t + - Check decompilation improves after change +``` + +**PRIORITY 3: Function Signatures** (helps callers understand) +``` +set-function-prototype: + - Use C-style signatures + - Example: "void encrypt_data(uint8_t* buffer, size_t len, uint8_t* key)" +``` + +**PRIORITY 4: Structure Application** (reveals data organization) +``` +apply-data-type or apply-structure: + - Apply when pattern is clear (repeated field access) + - Example: Apply AES_CTX structure at ctx pointer +``` + +**PRIORITY 5: Documentation** (preserves findings) +``` +set-decompilation-comment: + - Document behavior at specific lines + - Example: line 15: "Initializes AES context with 256-bit key" + +set-comment type="pre": + - Document at address level + - Example: "Entry point for encryption routine" +``` + +### Tracking Phase (Document Progress) +Use bookmarks and comments to track work: + +**Bookmark Types:** +``` +type="Analysis" category="[Topic]" → Current investigation findings +type="TODO" category="DeepDive" → Unanswered questions for later +type="Note" category="Evidence" → Key evidence locations +type="Warning" category="Assumption" → Document assumptions made +``` + +**Search Your Work:** +``` +search-bookmarks type="Analysis" → Review all findings +search-comments searchText="[keyword]" → Find documented assumptions +``` + +**Checkpoint Progress:** +``` +checkin-program message="..." 
→ Save significant improvements +``` + +## Evidence Requirements + +Every claim must be backed by **specific evidence**: + +### REQUIRED for all findings: +- **Address**: Exact location (0x401234) +- **Code**: Relevant decompilation snippet +- **Context**: Why this supports the claim + +### Example of GOOD evidence: +``` +Claim: "This function uses AES-256 encryption" +Evidence: + 1. String "AES-256-CBC" at 0x404010 (referenced in function) + 2. S-box constant at 0x404100 (matches standard AES S-box) + 3. 14-round loop at 0x401245:15 (AES-256 uses 14 rounds) + 4. 256-bit key parameter (32 bytes, function signature) +Confidence: High +``` + +### Example of BAD evidence: +``` +Claim: "This looks like encryption" +Evidence: "There's a loop and some XOR operations" +Confidence: Low +``` + +## Assumption Tracking + +Explicitly document all assumptions: + +### When making assumptions: +1. **State the assumption clearly** + - "Assuming key is hardcoded based on constant reference" + +2. **Provide supporting evidence** + - "Key pointer (0x401250:8) loads from .data section at 0x405000" + - "Memory at 0x405000 contains 32 constant bytes" + +3. **Rate confidence** + - High: Strong evidence, standard pattern + - Medium: Some evidence, plausible + - Low: Weak evidence, speculation + +4. 
**Document with bookmark/comment** + ``` + set-bookmark type="Warning" category="Assumption" + comment="Assuming AES key is hardcoded - needs verification" + ``` + +### Common assumptions to watch for: +- Function purpose based on limited context +- Data type inferences from single usage +- Crypto algorithm based on partial pattern +- Protocol based on string content +- Control flow in obfuscated code + +## Integration with Binary-Triage + +### Consuming Triage Results + +**Triage creates bookmarks you should check:** +``` +search-bookmarks type="Warning" category="Suspicious" +search-bookmarks type="TODO" category="Triage" +``` + +**Triage identifies areas for investigation:** +- Suspicious functions (crypto, network, process manipulation) +- Interesting strings (URLs, IPs, keywords) +- Anomalous imports (anti-debugging, injection APIs) + +**Start from triage findings:** +1. User: "Investigate the crypto function from triage" +2. `search-bookmarks` type="Warning" category="Crypto" +3. Navigate to bookmarked address +4. 
Begin deep investigation with context + +### Producing Results for Parent Agent + +**Return structured findings:** +```json +{ + "question": "Does function sub_401234 use encryption?", + "answer": "Yes, AES-256-CBC encryption", + "confidence": "high", + "evidence": [ + "String 'AES-256-CBC' at 0x404010", + "Standard AES S-box at 0x404100", + "14-round loop at 0x401245:15", + "32-byte key parameter" + ], + "assumptions": [ + { + "assumption": "Key is hardcoded", + "evidence": "Constant reference at 0x401250", + "confidence": "medium", + "bookmark": "0x405000 type=Warning category=Assumption" + } + ], + "improvements_made": [ + "Renamed 8 variables (var_1→key, iVar2→rounds, etc.)", + "Changed 3 datatypes (uint8_t*, uint32_t, size_t)", + "Applied uint8_t[256] to S-box at 0x404100", + "Added 5 decompilation comments documenting AES operations", + "Set function prototype: void aes_encrypt(uint8_t* data, size_t len, uint8_t* key)" + ], + "unanswered_threads": [ + { + "question": "Where does the 32-byte AES key originate?", + "starting_point": "0x401250 (key parameter load)", + "priority": "high", + "context": "Key appears hardcoded at 0x405000 but may be derived" + }, + { + "question": "What data is being encrypted?", + "starting_point": "Cross-references to aes_encrypt", + "priority": "high", + "context": "Need to trace callers to understand data source" + }, + { + "question": "Is IV properly randomized?", + "starting_point": "0x401260 (IV initialization)", + "priority": "medium", + "context": "IV appears to use time-based seed, check entropy" + } + ] +} +``` + +**Key components:** +1. **Direct answer** to the question +2. **Confidence level** (high/medium/low) +3. **Specific evidence** (addresses, code, data) +4. **Documented assumptions** with confidence +5. **Database improvements** made during investigation +6. 
**Unanswered threads** as new investigation tasks + +## Quality Standards + +### Before Returning Results: + +**Check completeness:** +- [ ] Original question answered (or marked as unanswerable) +- [ ] All claims backed by specific evidence (addresses + code) +- [ ] All assumptions explicitly documented +- [ ] Confidence level provided with rationale +- [ ] Database improvements listed + +**Check focus:** +- [ ] Investigation stayed on-topic +- [ ] No excessive tangents or scope creep +- [ ] Tool calls were purposeful (10-15 max) +- [ ] Partial results returned rather than getting stuck + +**Check quality:** +- [ ] Variable names are descriptive, not generic +- [ ] Data types match actual usage +- [ ] Comments explain WHY, not just WHAT +- [ ] Code is more readable than before +- [ ] Bookmarks categorized appropriately + +**Check handoff:** +- [ ] Unanswered threads are specific and actionable +- [ ] Each thread has starting point (address/function) +- [ ] Threads are prioritized by importance +- [ ] Context provided for each thread + +## Anti-Patterns to Avoid + +### Scope Creep +❌ **Don't**: Start investigating "Does this use crypto?" 
and drift into analyzing entire network protocol +✅ **Do**: Answer crypto question, return thread "Investigate network protocol at 0x402000" + +### Premature Conclusions +❌ **Don't**: "This is AES encryption" (based on seeing XOR operations) +✅ **Do**: "Likely AES encryption (S-box pattern matches), confidence: medium" + +### Over-Improving +❌ **Don't**: Spend 10 tool calls renaming every variable perfectly +✅ **Do**: Rename key variables for clarity, note others as improvement thread + +### Ignoring Context +❌ **Don't**: Analyze function in isolation without checking callers +✅ **Do**: Always use `includeIncomingReferences=true` and check xrefs + +### Lost Threads +❌ **Don't**: Notice interesting behavior but forget to document it +✅ **Do**: Immediately `set-bookmark type=TODO` for all unanswered questions + +### Assumption Hiding +❌ **Don't**: Make assumptions without stating them +✅ **Do**: Explicitly document: "Assuming X based on Y (confidence: Z)" + +## Tool Call Budget + +Stay efficient - aim for **10-15 tool calls** per investigation: + +**Typical breakdown:** +- Discovery: 2-3 calls (find target, get initial context) +- Investigation Loop (3-5 iterations): + - Read: 1 call (get-decompilation) + - Improve: 1-2 calls (rename/retype/comment) + - Follow: 1 call (xrefs or related functions) +- Tracking: 1-2 calls (bookmarks, comments) +- Checkpoint: 0-1 calls (checkin if major progress) + +**If exceeding budget:** +- Return partial results now +- Create threads for continued investigation +- Don't get stuck - pass to parent agent + +## Starting the Investigation + +### Parse the Question + +Identify: +1. **Target**: Function, string, address, behavior +2. **Type**: "What does", "Does it", "Where is", "Fix" +3. **Scope**: Single function vs. system-wide behavior +4. **Depth**: Quick check vs. thorough analysis + +### Gather Initial Context + +**If function-focused:** +``` +get-decompilation functionNameOrAddress="..." 
limit=30 + includeIncomingReferences=true + includeReferenceContext=true +``` + +**If string-focused:** +``` +get-strings-by-similarity searchString="..." +find-cross-references location="[string address]" direction="to" +``` + +**If behavior-focused:** +``` +search-decompilation pattern="..." +search-strings-regex pattern="..." +``` + +### Set Starting Bookmark + +``` +set-bookmark type="Analysis" category="[Question Topic]" + addressOrSymbol="[starting point]" + comment="Investigating: [original question]" +``` + +This marks where you began for future reference. + +## Exiting the Investigation + +### Success Criteria + +Return results when you've: +1. **Answered the question** (or determined it's unanswerable) +2. **Gathered sufficient evidence** (3+ specific supporting facts) +3. **Improved the database** (code is clearer than before) +4. **Documented assumptions** (nothing hidden) +5. **Identified threads** (next steps are clear) + +### Partial Results Are OK + +Return partial results if: +- You've hit the tool call budget (10-15 calls) +- Investigation is blocked (need external info) +- Question requires multiple investigations (split into threads) +- Confidence is low but some findings exist + +**Better to return:** +``` +"Partially answered: Likely uses AES (medium confidence), needs verification" +Threads: ["Verify S-box matches AES standard", "Confirm key schedule"] +``` + +**Than to:** +- Keep investigating without progress +- Make unsupported claims +- Never return results + +## Example Investigation Flow + +``` +User: "Does function FUN_00401234 use encryption?" 
+ +[Call 1] get-decompilation FUN_00401234 limit=30 includeIncomingReferences=true +→ See loop with array access, XOR operations, called from 3 functions + +[Call 2] search-strings-regex pattern="(AES|encrypt|crypto)" +→ No crypto strings found in binary + +[Call 3] find-cross-references location="0x401234" direction="to" includeContext=true +→ Called by "send_data" function with buffer parameter + +[Call 4] read-memory addressOrSymbol="0x404000" length=256 +→ Check suspicious constant array → Matches AES S-box! + +[Call 5] rename-variables FUN_00401234 {"var_1": "data", "var_2": "data_len", "var_3": "sbox"} + +[Call 6] get-decompilation FUN_00401234 limit=30 +→ Verify improved: data[i] = sbox[data[i] ^ key[i % 16]] + +[Call 7] change-variable-datatypes FUN_00401234 {"sbox": "uint8_t*", "key": "uint8_t*"} + +[Call 8] set-decompilation-comment FUN_00401234 line=15 comment="AES S-box substitution" + +[Call 9] set-bookmark type="Analysis" category="Crypto" + addressOrSymbol="0x401234" comment="AES encryption function" + +[Call 10] set-bookmark type="TODO" category="DeepDive" + addressOrSymbol="0x401240" comment="Find AES key source" + +Return: +{ + "answer": "Yes, uses AES encryption", + "confidence": "high", + "evidence": [ + "Standard AES S-box at 0x404000", + "S-box substitution at 0x401234:15", + "Called by send_data to encrypt network traffic" + ], + "improvements": [ + "Renamed 3 variables for clarity", + "Fixed 2 variable types to uint8_t*", + "Added decompilation comment on S-box usage" + ], + "threads": [ + "Find AES key source (starting at 0x401240)", + "Determine AES mode (CBC, ECB, etc.)", + "Check if IV is properly randomized" + ] +} +``` + +## Remember + +You are a **focused investigator**, not a comprehensive analyzer: +- Answer the specific question asked +- Follow evidence, not hunches +- Improve code incrementally as you work +- Document everything explicitly +- Return threads for continued investigation +- Stay on task, stay efficient + +The goal is 
**evidence-based answers with improved code**, not perfect understanding of the entire binary. diff --git a/skills/deep-analysis/examples.md b/skills/deep-analysis/examples.md new file mode 100644 index 0000000..e083788 --- /dev/null +++ b/skills/deep-analysis/examples.md @@ -0,0 +1,733 @@ +# Deep Analysis Skill - Investigation Examples + +This document provides concrete examples of how to use the deep-analysis skill to answer specific reverse engineering questions. + +## Example 1: "What does this function do?" + +### Initial Question +User: "What does function `FUN_00401850` do?" + +### Investigation Flow + +**Phase 1: Initial Context (2 calls)** + +1. Get decompilation with context: +``` +get-decompilation FUN_00401850 limit=30 + includeIncomingReferences=true + includeReferenceContext=true +``` + +Observe: +- Loop iterating 32 times +- Bitwise operations (XOR, shifts) +- Array access with computed indices +- Called by `process_buffer` and `send_encrypted_data` + +2. Check cross-references: +``` +find-cross-references location="0x401850" direction="to" + includeContext=true contextLines=3 +``` + +Observe: +- Called with 3 parameters: data buffer, length, key buffer +- Return value not checked +- Always passes 32-byte key buffer + +**Phase 2: Pattern Recognition (2 calls)** + +3. Read memory at constant data: +``` +read-memory addressOrSymbol="0x404100" length=256 +``` + +Find: Starts with `63 7c 77 7b f2 6b 6f c5...` + +Compare: Matches AES S-box from patterns.md + +4. Set initial bookmark: +``` +set-bookmark type="Analysis" category="Crypto" + addressOrSymbol="0x401850" + comment="Investigating encryption function - appears to be AES" +``` + +**Phase 3: Iterative Improvement (6 calls)** + +5. Rename key variables: +``` +rename-variables FUN_00401850 { + "param_1": "data", + "param_2": "data_len", + "param_3": "key", + "local_10": "round", + "local_14": "sbox_ptr" +} +``` + +6. 
Re-read to verify: +``` +get-decompilation FUN_00401850 limit=30 +``` + +Much clearer! Now see: `data[i] = sbox_ptr[data[i] ^ key[round]]` + +7. Fix data types: +``` +change-variable-datatypes FUN_00401850 { + "data": "uint8_t*", + "data_len": "size_t", + "key": "uint8_t*", + "sbox_ptr": "uint8_t*" +} +``` + +8. Set function prototype: +``` +set-function-prototype location="0x401850" + signature="void aes_encrypt_block(uint8_t* data, size_t len, uint8_t* key)" +``` + +9. Add documentation comment: +``` +set-decompilation-comment FUN_00401850 line=15 + comment="AES S-box substitution - confirms AES encryption" +``` + +10. Document findings: +``` +set-comment programPath="/malware.exe" addressOrSymbol="0x401850" + type="pre" + comment="AES encryption function: encrypts data in place using AES (32-iteration loop, 32-byte key) with S-box at 0x404100" +``` + +**Phase 4: Return Results** + +### Answer +```json +{ + "question": "What does function FUN_00401850 do?", + "answer": "Performs AES encryption on a data buffer in-place", + "confidence": "high", + "evidence": [ + "Uses standard AES S-box at 0x404100 (verified match: 63 7c 77 7b...)", + "32-iteration loop matching the 32-byte (256-bit) key; AES-256 itself uses 14 rounds, so the round count still needs verification", + "S-box substitution pattern at 0x401850:15", + "Called with data, length, and 32-byte key parameters", + "Invoked by send_encrypted_data and process_buffer functions" + ], + "assumptions": [ + { + "assumption": "Function performs full AES encryption, not just one component", + "evidence": "Complete 32-iteration loop structure with key schedule", + "confidence": "high" + } + ], + "improvements_made": [ + "Renamed 5 variables for clarity (data, data_len, key, round, sbox_ptr)", + "Fixed 4 data types (uint8_t* for buffers, size_t for length)", + "Set function prototype: void aes_encrypt_block(uint8_t* data, size_t len, uint8_t* key)", + "Added decompilation comment on S-box substitution line", + "Added address-level comment documenting AES usage" + ], + "unanswered_threads": [ + { + "question": "Where does the 32-byte 
encryption key come from?", + "starting_point": "Cross-references to aes_encrypt_block - check callers", + "priority": "high", + "context": "Key appears to be passed from calling functions, need to trace origin" + }, + { + "question": "What data is being encrypted?", + "starting_point": "Functions process_buffer (0x402100) and send_encrypted_data (0x402500)", + "priority": "high", + "context": "These callers determine what data gets encrypted" + }, + { + "question": "Is this using proper AES mode (CBC, CTR, etc.) or just ECB?", + "starting_point": "Examine calling context for IV usage or block chaining", + "priority": "medium", + "context": "Need to check if callers implement proper mode of operation" + } + ] +} +``` + +--- + +## Example 2: "Does this binary use encryption?" + +### Initial Question +User: "Does this binary use encryption?" + +### Investigation Flow + +**Phase 1: Broad Discovery (3 calls)** + +1. Search for crypto-related strings: +``` +search-strings-regex pattern="(AES|RSA|encrypt|decrypt|crypto|cipher|key)" + caseSensitive=false +``` + +Result: No obvious crypto strings found + +2. Search decompiled code for patterns: +``` +search-decompilation pattern="(\^|\&\s*0x(FF|ff)|<<|>>).*loop" + maxResults=20 +``` + +Find: Multiple functions with XOR and bitwise operations in loops + +3. Check for suspicious imports: +``` +get-symbols includeExternal=true filterDefaultNames=true startIndex=0 maxCount=200 +``` + +Find: No obvious crypto library imports + +**Phase 2: Investigate Candidates (4 calls)** + +4. Check most suspicious function from search: +``` +get-decompilation FUN_00402340 limit=40 + includeIncomingReferences=true +``` + +Observe: Nested loops, array indexing, modulo operations + +5. Check for constant arrays: +``` +find-cross-references location="0x402340" direction="from" limit=50 +``` + +Find: References to 0x405000 (large data section) + +6. 
Read suspicious memory region: +``` +read-memory addressOrSymbol="0x405000" length=256 +``` + +Find: Initialized array 0-255 in sequence, then modified + +Pattern match: Looks like RC4 initialization! + +7. Bookmark finding: +``` +set-bookmark type="Analysis" category="Crypto" + addressOrSymbol="0x402340" + comment="Suspected RC4 cipher - 256-byte state array" +``` + +**Phase 3: Confirm Pattern (3 calls)** + +8. Improve understanding: +``` +rename-variables FUN_00402340 { + "local_10": "state_array", + "local_14": "i", + "local_18": "j", + "param_1": "key", + "param_2": "key_len" +} +``` + +9. Re-examine: +``` +get-decompilation FUN_00402340 limit=40 +``` + +Clear RC4 key scheduling algorithm (KSA) pattern: +```c +for (i = 0; i < 256; i++) { + state_array[i] = i; +} +j = 0; +for (i = 0; i < 256; i++) { + j = (j + state_array[i] + key[i % key_len]) % 256; + // swap state_array[i] and state_array[j] +} +``` + +10. Document: +``` +set-comment programPath="/binary.exe" addressOrSymbol="0x402340" + type="pre" + comment="RC4 Key Scheduling Algorithm (KSA) - initializes cipher state from key" +``` + +**Phase 4: Return Results** + +### Answer +```json +{ + "question": "Does this binary use encryption?", + "answer": "Yes, uses RC4 stream cipher", + "confidence": "high", + "evidence": [ + "RC4 KSA (Key Scheduling Algorithm) at 0x402340", + "256-byte state array initialization (0-255 sequence)", + "Characteristic RC4 swap operations with modulo arithmetic", + "State array modified based on key input", + "Pattern matches RC4 specification exactly" + ], + "assumptions": [ + { + "assumption": "RC4 PRGA (encryption loop) exists in separate function", + "evidence": "KSA alone doesn't encrypt, must have companion function", + "confidence": "medium" + } + ], + "improvements_made": [ + "Renamed 5 variables in RC4 KSA function", + "Documented RC4 KSA at 0x402340", + "Set bookmark for crypto analysis" + ], + "unanswered_threads": [ + { + "question": "Where is the RC4 PRGA (actual 
encryption) function?", + "starting_point": "Check cross-references from KSA, likely called next", + "priority": "high", + "context": "KSA initializes state, PRGA uses it to encrypt" + }, + { + "question": "What is the RC4 key and where does it come from?", + "starting_point": "Analyze callers of RC4 KSA at 0x402340", + "priority": "high", + "context": "Key parameter origin determines if hardcoded or derived" + }, + { + "question": "What data is being encrypted with RC4?", + "starting_point": "Find PRGA function, then check its callers", + "priority": "medium", + "context": "Need to identify what sensitive data is being protected" + } + ] +} +``` + +--- + +## Example 3: "What is the C2 server address?" + +### Initial Question +User: "What is the command and control server address?" + +### Investigation Flow + +**Phase 1: Search for Network Indicators (3 calls)** + +1. Search for URLs and IPs: +``` +search-strings-regex pattern="(https?://|[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|\.com|\.net)" + caseSensitive=false +``` + +Find: No obvious domain/IP strings + +2. Search for network-related strings: +``` +search-strings-regex pattern="(connect|server|host|port|callback|beacon)" + caseSensitive=false +``` + +Find: String "host" at 0x404500 + +3. Check memory around "host" string: +``` +read-memory addressOrSymbol="0x404500" length=64 +``` + +Find: Encoded data following "host" string + +**Phase 2: Find String Usage (3 calls)** + +4. Find cross-references to suspicious string: +``` +find-cross-references location="0x404500" direction="to" + includeContext=true contextLines=4 +``` + +Find: Referenced in FUN_00403000, looks like string decoding + +5. Investigate decode function: +``` +get-decompilation FUN_00403000 limit=30 + includeIncomingReferences=true +``` + +Observe: XOR loop decoding strings from 0x404500 region + +6. 
Bookmark decode function: +``` +set-bookmark type="Analysis" category="Obfuscation" + addressOrSymbol="0x403000" + comment="String decoding function - XOR with 0x42" +``` + +**Phase 3: Understand Decoding (4 calls)** + +7. Improve decode function: +``` +rename-variables FUN_00403000 { + "param_1": "encoded_str", + "param_2": "output_buf", + "local_8": "i", + "local_c": "str_len" +} +``` + +8. Add comment on decode logic: +``` +set-decompilation-comment FUN_00403000 line=8 + comment="XOR decode with key 0x42 - reveals hidden strings" +``` + +9. Find where the decode function is called: +``` +find-cross-references location="0x403000" direction="to" + includeContext=true +``` + +Find: Called by FUN_00403500 (network initialization function) + +10. Check network function: +``` +get-decompilation FUN_00403500 limit=40 + includeIncomingReferences=true + includeReferenceContext=true +``` + +Observe: +```c +decode_string(encoded_c2, hostname_buf); // 0x403000 +connect_to_server(hostname_buf, 443); +``` + +**Phase 4: Decode C2 Address (3 calls)** + +11. Read encoded C2 address: +``` +read-memory addressOrSymbol="0x404520" length=32 +``` + +Get bytes: `2a 36 36 32 31 78 6d 6d 27 34 2b 2e 6f 2f 23 2e 35 23 30 27 6c 21 2d 2f 00` + +Manual decode (XOR with 0x42): +- 0x2a ^ 0x42 = 'h' +- 0x36 ^ 0x42 = 't' +- 0x36 ^ 0x42 = 't' +- Result: "https://evil-malware.com" + +12. Document C2: +``` +set-comment programPath="/malware.exe" addressOrSymbol="0x404520" + type="pre" + comment="Encoded C2 address (XOR 0x42): https://evil-malware.com" +``` + +13. 
Document network function: +``` +set-comment programPath="/malware.exe" addressOrSymbol="0x403500" + type="pre" + comment="C2 connection function: decodes and connects to https://evil-malware.com:443" +``` + +**Phase 5: Return Results** + +### Answer +```json +{ + "question": "What is the command and control server address?", + "answer": "https://evil-malware.com (port 443)", + "confidence": "high", + "evidence": [ + "Encoded string at 0x404520 decodes to domain", + "String decode function at 0x403000 uses XOR key 0x42", + "Network init function at 0x403500 decodes and uses domain", + "Connection established on port 443 (HTTPS)", + "Manual decode confirmed: bytes at 0x404520 XOR 0x42 → https://evil-malware.com" + ], + "assumptions": [ + { + "assumption": "This is the only C2 address", + "evidence": "Only encoded network string found so far", + "confidence": "medium", + "note": "May have fallback/backup C2 addresses" + } + ], + "improvements_made": [ + "Renamed variables in decode function (0x403000)", + "Added comments on XOR decode logic", + "Documented decoded C2 address at storage location", + "Documented C2 connection function" + ], + "unanswered_threads": [ + { + "question": "Are there backup/fallback C2 addresses?", + "starting_point": "Search for other encoded strings, check for C2 lists", + "priority": "high", + "context": "Malware often has multiple C2 servers for redundancy" + }, + { + "question": "What protocol is used for C2 communication?", + "starting_point": "Analyze functions called after connect_to_server", + "priority": "medium", + "context": "Connects on port 443, likely HTTPS but may be custom" + }, + { + "question": "Is the C2 address hardcoded or can it be updated?", + "starting_point": "Check for config file reading or domain generation", + "priority": "medium", + "context": "Dynamic C2 would indicate more sophisticated malware" + } + ] +} +``` + +--- + +## Example 4: "Fix the types in this function" + +### Initial Question +User: "Fix the types in function 
`FUN_00402800` - the decompilation is confusing" + +### Investigation Flow + +**Phase 1: Understand Current State (2 calls)** + +1. Get current decompilation: +``` +get-decompilation FUN_00402800 limit=50 +``` + +Observe confusing code: +```c +undefined4 FUN_00402800(int param_1, undefined4 param_2) { + int iVar1; + undefined4 *puVar2; + undefined4 uVar3; + + iVar1 = *(int *)(param_1 + 4); + puVar2 = (undefined4 *)(param_1 + 8); + uVar3 = *puVar2; + // ... more confusing code ... +} +``` + +2. Check cross-references for usage context: +``` +find-cross-references location="0x402800" direction="to" + includeContext=true contextLines=5 +``` + +Observe: Called with pointer to structure, second param looks like a size + +**Phase 2: Analyze Usage Patterns (3 calls)** + +3. Check what fields are accessed: +- `param_1 + 0`: read as int +- `param_1 + 4`: read as int +- `param_1 + 8`: read as pointer +- Pattern: Structure with int, int, pointer fields + +4. Check second parameter usage: +``` +Search in decompilation for param_2 usage +``` + +Find: Used in comparison `if (iVar1 < param_2)` and loop counter +Conclusion: param_2 is a count/size, should be `size_t` or `uint32_t` + +5. Check return value usage: +``` +Check caller context from xrefs +``` + +Find: Return value compared to 0 and 1 +Conclusion: Returns success/failure (bool or int status) + +**Phase 3: Iterative Type Fixing (5 calls)** + +6. Start with function signature: +``` +set-function-prototype location="0x402800" + signature="int process_items(struct item_list* list, uint32_t max_count)" + createIfNotExists=false +``` + +7. Create structure for param_1: +``` +parse-c-structure programPath="/binary.exe" + cDefinition="struct item_list { uint32_t current_count; uint32_t total_count; void** items; };" +``` + +8. Apply structure to callers (check one caller): +``` +Get address of a caller from xrefs: 0x403100 +``` + +9. 
Fix variable types in function: +``` +change-variable-datatypes FUN_00402800 { + "iVar1": "uint32_t", + "puVar2": "void**" +} +``` + +10. Rename variables for clarity: +``` +rename-variables FUN_00402800 { + "param_1": "list", + "param_2": "max_count", + "iVar1": "current_count", + "puVar2": "items_array", + "uVar3": "current_item" +} +``` + +**Phase 4: Verify and Document (3 calls)** + +11. Re-read decompilation: +``` +get-decompilation FUN_00402800 limit=50 +``` + +Much better: +```c +int process_items(item_list* list, uint32_t max_count) { + uint32_t current_count; + void** items_array; + void* current_item; + + current_count = list->total_count; + items_array = list->items; + current_item = *items_array; + // ... clearer logic now ... +} +``` + +12. Add documentation: +``` +set-decompilation-comment FUN_00402800 line=5 + comment="Iterates through item list up to max_count limit" +``` + +13. Document structure usage: +``` +set-comment programPath="/binary.exe" addressOrSymbol="0x402800" + type="pre" + comment="Processes item_list structure: validates count and iterates items array" +``` + +**Phase 5: Return Results** + +### Answer +```json +{ + "question": "Fix the types in function FUN_00402800", + "answer": "Fixed - function processes item_list structure with count validation", + "confidence": "high", + "evidence": [ + "Structure pattern identified: int + int + pointer at offsets 0, 4, 8", + "Second parameter used as loop bound (confirmed uint32_t)", + "Return value used as boolean (0/1 success code)", + "Cross-references confirm structure pointer usage", + "Decompilation much clearer after type fixes" + ], + "improvements_made": [ + "Created item_list structure: { uint32_t current_count; uint32_t total_count; void** items; }", + "Set function prototype: int process_items(item_list* list, uint32_t max_count)", + "Fixed 2 variable types (uint32_t, void**)", + "Renamed 5 variables for clarity (list, max_count, current_count, items_array, current_item)", + "Added 
decompilation comment documenting behavior", + "Added address-level comment on structure usage" + ], + "assumptions": [ + { + "assumption": "items array contains void pointers (generic items)", + "evidence": "No type-specific operations on items, just pointer manipulation", + "confidence": "medium", + "note": "Actual item type may be more specific" + }, + { + "assumption": "Structure is consistently used across codebase", + "evidence": "Multiple callers use same access pattern", + "confidence": "high" + } + ], + "unanswered_threads": [ + { + "question": "What is the actual type of items in the array?", + "starting_point": "Check where items are allocated and how they're used", + "priority": "medium", + "context": "Currently typed as void** but may be more specific struct" + }, + { + "question": "Should item_list structure be applied at allocation sites?", + "starting_point": "Find where item_list structures are created (malloc calls)", + "priority": "low", + "context": "Applying structure type at allocation improves consistency" + }, + { + "question": "Are there other functions using this structure that need fixing?", + "starting_point": "Search for similar offset access patterns (param+0, param+4, param+8)", + "priority": "medium", + "context": "Consistent type usage across codebase aids understanding" + } + ] +} +``` + +--- + +## Key Takeaways from Examples + +### Common Patterns Across Investigations + +1. **Start broad, narrow focus** + - Search/scan first + - Identify candidates + - Zoom into specific functions + +2. **Iterate: Read → Improve → Verify** + - Get decompilation + - Rename/retype + - Re-read to confirm improvement + +3. **Follow the evidence** + - Cross-references show usage + - Memory reads reveal constants + - Pattern matching confirms algorithms + +4. **Document as you go** + - Bookmarks for waypoints + - Comments for findings + - Keeps investigation organized + +5. 
**Return actionable threads** + - Always have next steps + - Specific starting points + - Prioritized by importance + +### Tool Call Efficiency + +Each example stayed within 10-15 tool calls: +- Example 1: 10 calls +- Example 2: 10 calls +- Example 3: 13 calls +- Example 4: 13 calls + +This demonstrates staying focused and efficient while still gathering sufficient evidence and making meaningful improvements. + +### Evidence-Based Conclusions + +Every answer includes: +- Specific addresses +- Code patterns or constants found +- Cross-reference evidence +- Confidence level with rationale + +This makes findings verifiable and trustworthy. diff --git a/skills/deep-analysis/patterns.md b/skills/deep-analysis/patterns.md new file mode 100644 index 0000000..8371f7a --- /dev/null +++ b/skills/deep-analysis/patterns.md @@ -0,0 +1,720 @@ +# Reverse Engineering Patterns Reference + +This document contains higher-level patterns and concepts to recognize during deep analysis. Focus on algorithmic patterns, behavioral patterns, and code structure rather than platform-specific implementation details. 
+ +## Cryptographic Algorithm Patterns + +### Block Cipher Recognition + +**Conceptual characteristics:** +- **Substitution-Permutation Network (SPN)**: Repeated rounds of substitution (S-boxes) and permutation (bit shuffling) +- **Feistel Network**: Split data in half, operate on one half using the other as key input, swap halves, repeat +- **Fixed block size**: Typically 64 bits (DES, Blowfish) or 128 bits (AES) +- **Multiple rounds**: 8-16+ iterations of core transformation +- **Key schedule**: Derive round keys from master key + +**What to look for in decompiled code:** +``` +Nested loops: + Outer: rounds (8, 10, 12, 14, 16, 32 iterations) + Inner: processing blocks of fixed size + +Array lookups (S-boxes): + result = table[input_byte] + Often 256-element arrays (0x100 size) + +Bit manipulation: + XOR, rotation (>> combined with <<), permutation + +State updates: + Array or struct representing current cipher state + Transformed each round +``` + +**Telltale signs:** +- Large constant arrays (256+ bytes) that look like random data +- Fixed iteration counts (not data-dependent) +- Heavy use of XOR operations +- Byte-level array indexing: `array[data[i]]` + +**Investigation strategy:** +1. `read-memory` at constant arrays - compare to known S-boxes +2. Count loop iterations - indicates cipher type/key size +3. `search-strings-regex` for algorithm names +4. 
Check cross-references to constants - find cipher initialization + +### Stream Cipher Recognition + +**Conceptual characteristics:** +- **Keystream generation**: Produce pseudo-random byte stream from key +- **Simple combination**: XOR plaintext with keystream +- **State-based**: Internal state evolves as keystream is produced +- **No fixed blocks**: Can encrypt arbitrary lengths + +**What to look for:** +``` +State initialization: + Array or struct setup from key + Often 256-byte arrays + +Keystream generation loop: + State updates via modular arithmetic + Index computations: i = (i + 1) % N + Swap operations common + +XOR combination: + output[i] = input[i] ^ keystream[i] + Simple, obvious pattern +``` + +**Telltale signs:** +- Array swap operations: `temp = a[i]; a[i] = a[j]; a[j] = temp` +- Modulo operations: `% 256` or `& 0xFF` +- XOR in simple loop +- Smaller code footprint than block ciphers (no large constants) + +### Public Key Cryptography Recognition + +**Conceptual characteristics:** +- **Large integer arithmetic**: Numbers hundreds or thousands of bits +- **Modular exponentiation**: `result = base^exponent mod modulus` +- **Performance**: Very slow compared to symmetric crypto (indicates usage for key exchange, not bulk data) + +**What to look for:** +``` +Multi-precision arithmetic: + Arrays representing big integers + Functions for add/subtract/multiply on arrays + +Square-and-multiply pattern: + Loop over exponent bits + Square operation each iteration + Conditional multiply based on bit value + +Modulo operations on large numbers: + Division with large divisors + Barrett reduction or Montgomery multiplication +``` + +**Telltale signs:** +- Very large buffers (128, 256, 512 bytes+) +- Bit-by-bit exponent processing +- Characteristic magic constants (e.g., 0x10001 = 65537 for RSA) +- Slow execution (thousands of operations per byte) + +### Hash Function Recognition + +**Conceptual characteristics:** +- **Compression function**: Transform fixed-size 
input to fixed-size output +- **Block processing**: Process data in chunks (512 bits typical) +- **State accumulation**: Running state updated with each block +- **Padding**: Add bytes to make input multiple of block size +- **One-way**: Lots of mixing, no reversibility + +**What to look for:** +``` +Initialization: + Fixed magic constants + MD5: 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 + SHA-1: 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 + SHA-256: 8 initial values (0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, ...) + +Round function: + Fixed iteration count (64, 80 rounds) + Lots of bitwise operations (rotations, XOR, AND, OR) + State mixing (each output bit depends on many input bits) + +Padding logic: + Append 0x80 byte + Length encoding at end +``` + +**Telltale signs:** +- Characteristic initialization constants +- Fixed 64 or 80 round loops +- Bitwise rotation: `(x << n) | (x >> (32-n))` +- Message schedule computation (W array expansion) + +### Simple XOR Obfuscation + +**Conceptual characteristics:** +- **Trivial operation**: `output = input XOR key` +- **Symmetric**: Encryption and decryption identical +- **Weak security**: Easy to break, used for obfuscation not protection + +**What to look for:** +``` +Single-byte key: + for (i = 0; i < len; i++) + data[i] ^= 0x42; + +Multi-byte key: + for (i = 0; i < len; i++) + data[i] ^= key[i % keylen]; + +Rolling key: + key = seed; + for (i = 0; i < len; i++) { + data[i] ^= key; + key = update_key(key); // LCG or similar + } +``` + +**Telltale signs:** +- Very short functions (5-10 lines) +- XOR with constants or simple patterns +- Often applied to strings or config data +- Paired with static data arrays that need decoding + +--- + +## Control Flow Patterns + +### State Machine Recognition + +**Conceptual characteristics:** +- **Explicit states**: Enumeration or integer representing current state +- **State transitions**: Switch/if-else on state variable +- **Event-driven**: External input triggers transitions + +**What to look for:** +``` 
+State variable: + int state = INITIAL_STATE; + +Dispatch loop: + while (running) { + switch (state) { + case STATE_A: /* handle A, maybe transition to B */ + case STATE_B: /* handle B, maybe transition to C */ + ... + } + } + +State tables (more advanced): + next_state = transition_table[current_state][input]; + action = action_table[current_state][input]; +``` + +**Telltale signs:** +- Large switch statements with many cases +- State variable repeatedly assigned new values +- Enumeration or #define constants for states +- Patterns like IDLE, CONNECTING, CONNECTED, DISCONNECTED + +**Common uses:** +- Network protocol handling +- Parser implementation +- UI event handling +- Command processing + +### Command Dispatcher Recognition + +**Conceptual characteristics:** +- **Command codes**: Numeric identifiers for operations +- **Handler lookup**: Map command ID to handler function +- **Extensibility**: Easy to add new commands + +**What to look for:** +``` +Command dispatch table: + switch (command_id) { + case CMD_EXECUTE: handle_execute(params); break; + case CMD_UPLOAD: handle_upload(params); break; + case CMD_DOWNLOAD: handle_download(params); break; + ... + } + +Function pointer table: + handler = command_table[command_id]; + handler(params); + +String-based dispatch: + if (strcmp(cmd, "exec") == 0) handle_execute(); + else if (strcmp(cmd, "upload") == 0) handle_upload(); +``` + +**Telltale signs:** +- Large switch on integer or string +- Array of function pointers +- Command ID constants or strings +- Common command names: exec, upload, download, shell, sleep, etc. 
+ +**Common uses:** +- Remote access tools (RAT) +- Backdoor command handling +- Plugin systems +- IPC/RPC mechanisms + +### Callback Pattern Recognition + +**Conceptual characteristics:** +- **Inversion of control**: Library calls your code, not you calling library +- **Function pointers**: Pass address of your function to framework +- **Asynchronous**: Often used for async operations + +**What to look for:** +``` +Callback registration: + library_set_callback(MY_EVENT, my_handler_function); + +Callback function signature: + void my_callback(event_type, data, user_context) + +Common callback contexts: + - Network data received + - Timer expired + - File I/O complete + - User interaction +``` + +**Telltale signs:** +- Function pointers passed as parameters +- Functions with generic names like "handler", "callback", "on_event" +- Often have opaque pointer parameter (void* user_data) + +### Loop Patterns + +**Simple iteration:** +``` +for (i = 0; i < count; i++) + - Linear processing + - Transform/encrypt each element +``` + +**Nested loops (2D processing):** +``` +for (i = 0; i < height; i++) + for (j = 0; j < width; j++) + - Image processing + - Matrix operations + - Block cipher on 2D state +``` + +**Do-while patterns:** +``` +do { + read_chunk(); + process_chunk(); +} while (more_data); + - File/network processing + - Guaranteed first execution +``` + +**While-true with break:** +``` +while (1) { + if (condition) break; + process(); +} + - Server loops + - State machines + - Event loops +``` + +--- + +## Data Structure Patterns + +### Buffer Management + +**Fixed-size buffers:** +``` +char buffer[1024]; +read(fd, buffer, sizeof(buffer)); + - Stack-allocated + - Size known at compile time + - Often seen with unsafe functions (strcpy, sprintf) +``` + +**Dynamic buffers:** +``` +size = calculate_size(); +buffer = malloc(size); + - Heap-allocated + - Size determined at runtime + - Look for malloc/free pairs or memory leaks +``` + +**Ring buffers (circular):** +``` 
+write_pos = (write_pos + 1) % BUFFER_SIZE; +read_pos = (read_pos + 1) % BUFFER_SIZE; + - Fixed-size, reusable + - Modulo arithmetic for wrap-around + - Used in queues, streaming +``` + +### Linked Structures + +**Linked list:** +``` +struct node { + data_type data; + struct node* next; // singly-linked + struct node* prev; // doubly-linked (optional) +}; +``` + +**Recognition:** +- Pointer fields in structures +- Traversal loops: `while (node != NULL) { node = node->next; }` +- Insertion/deletion operations + +**Tree structures:** +``` +struct tree_node { + data_type data; + struct tree_node* left; + struct tree_node* right; +}; +``` + +**Recognition:** +- Two pointer fields (left/right) +- Recursive functions +- Comparison operations for ordering + +### String Handling Patterns + +**Length-prefixed strings:** +``` +struct { + uint32_t length; + char data[]; +} +``` + +**Null-terminated strings:** +``` +while (*str != '\0') str++; // strlen pattern +``` + +**Wide strings:** +``` +wchar_t* wstr; +uint16_t* utf16_str; + - 2 or 4 bytes per character + - String operations work on larger units +``` + +**Detection:** +- Character-by-character loops +- Null byte checks +- String manipulation function calls +- UTF-8/UTF-16 encoding/decoding + +--- + +## Network Protocol Patterns + +### Protocol Structure Recognition + +**Request-Response:** +``` +send_request(command, params); +response = receive_response(); +process_response(response); +``` + +**Characteristics:** +- Client initiates +- Server responds +- Blocking or polling wait for response +- Examples: HTTP, DNS, RPC + +**Continuous Stream:** +``` +while (connected) { + data = receive_data(); + process_chunk(data); +} +``` + +**Characteristics:** +- Persistent connection +- Data flows continuously +- No strict request-response pairing +- Examples: video streaming, log shipping + +**Message-Oriented:** +``` +while (true) { + message = receive_message(); // reads length, then payload + dispatch_message(message); +} +``` 
+ +**Characteristics:** +- Discrete messages with boundaries +- Length prefix or delimiter +- Message type/ID field +- Examples: custom C2 protocols, message queues + +### Serialization Patterns + +**Binary serialization:** +``` +Write primitives in sequence: + write_uint32(length); + write_bytes(data, length); + write_uint8(flags); +``` + +**Characteristics:** +- Dense, efficient +- Fixed byte order (endianness) +- Magic numbers for structure identification +- Version fields for compatibility + +**Text-based serialization:** +``` +JSON: {"key": "value", "num": 42} +XML: <key>value</key> +``` + +**Characteristics:** +- Human-readable +- Delimiter characters ({}, <>, quotes) +- String parsing and generation code +- Less efficient but more flexible + +**Detection strategies:** +1. Look for sprintf/snprintf for text generation +2. Check for JSON/XML parsing libraries +3. Find memcpy sequences for binary packing +4. Identify byte-swapping (htonl/ntohl pattern) + +### Connection Management + +**Connection establishment pattern:** +``` +Create socket +→ Connect to server +→ Send handshake/authentication +→ Receive acknowledgment +→ Enter main communication loop +``` + +**Connection pooling pattern:** +``` +maintain pool of N connections +when request arrives: + if free_connection available: + use it + else: + create new connection (up to max) +after request: + return connection to pool +``` + +**Reconnection pattern:** +``` +max_retries = 5; +while (retries < max_retries) { + if (connect_success) break; + sleep(backoff_time); + backoff_time *= 2; // exponential backoff + retries++; +} +``` + +**Telltale signs:** +- Retry loops with delays +- Connection state checking +- Timeout handling +- Fallback server lists + +--- + +## Behavioral Patterns + +### Encryption + Network (Data Exfiltration) + +**Pattern sequence:** +``` +1. Collect files/data +2. Compress (optional) +3. Encrypt +4. Send over network +5. 
Clean up local copies +``` + +**What to look for:** +- File enumeration → encryption function → network send +- Temporary file creation → processing → deletion +- Cross-reference encryption function to network functions + +### Decrypt + Execute (Payload Loading) + +**Pattern sequence:** +``` +1. Read encrypted payload from resource/file/network +2. Decrypt in memory +3. Execute (direct call, injection, or create process) +``` + +**What to look for:** +- Buffer allocated with execute permissions +- Decryption function → function pointer cast → indirect call +- XOR loop → memory copy → execution transfer + +### Time-Based Triggering + +**Pattern:** +``` +while (true) { + current_time = get_time(); + if (current_time >= trigger_time) { + execute_payload(); + break; + } + sleep(check_interval); +} +``` + +**What to look for:** +- Time/date API calls +- Comparison with specific dates +- Sleep/delay in loops +- Activation conditions based on temporal logic + +### Polymorphic Behavior + +**Pattern:** +``` +code_variant = select_variant(seed); +decrypt_code(code_variant); +execute_decrypted_code(); +re-encrypt_code(new_seed); +``` + +**What to look for:** +- Self-modifying code +- Multiple code variants +- Decryption before execution +- Encryption after execution +- Memory protection changes (read/write/execute toggling) + +--- + +## Code Quality Indicators + +### Hand-Written vs. 
Generated Code + +**Hand-written characteristics:** +- Inconsistent formatting +- Comments (if not stripped) +- Meaningful variable names (if symbols present) +- Idiomatic patterns for the language +- Error handling mixed with logic + +**Generated/compiled characteristics:** +- Very consistent structure +- Compiler optimization patterns +- Systematic variable naming (if stripped) +- Uniform error handling +- Recognizable library code patterns + +### Obfuscated Code Indicators + +**Deliberately obscured:** +- Meaningless variable/function names +- Unnecessary complexity +- Dead code branches +- Opaque predicates (always true/false conditions) +- Indirect calls through pointer manipulations +- String obfuscation + +**Compiler optimizations (benign):** +- Loop unrolling +- Function inlining +- Constant folding +- Dead code elimination +- Register allocation patterns + +**Distinction:** Obfuscation creates complexity without performance benefit; optimization creates complexity for performance. + +### Library Code vs. Custom Code + +**Library code:** +- Standard algorithms (qsort, hash functions) +- Consistent with open-source implementations +- Well-structured, parameterized +- Minimal dependencies on surrounding code + +**Custom code:** +- Unique patterns +- Integrated with application logic +- Application-specific data structures +- More likely to have bugs/vulnerabilities + +**Investigation priority:** Focus on custom code - that's where unique behavior lives. + +--- + +## Using This Reference + +### Pattern Matching Workflow + +1. **Observe structure** - What loops, branches, data structures appear? +2. **Compare to patterns** - Does this match known algorithmic patterns? +3. **Verify with evidence** - Check for characteristic constants, operations, structure +4. **Document pattern** - Bookmark with pattern name for reference +5. 
**Improve code** - Rename variables/functions to reflect pattern (e.g., `aes_encrypt`, `rc4_keystream`) + +### Example Investigation + +``` +Observation: Function with nested loops, array lookups, XOR operations + +Compare: Matches "Block Cipher" or "Stream Cipher" patterns + +Verify: + - Check for large constant array (S-box?) + - Count outer loop iterations (rounds?) + - Look for key schedule function + +Find: 256-byte array starting 63 7c 77 7b... + 14 iterations in outer loop + +Conclusion: AES-256 (14 rounds, standard S-box) + +Improve: + rename-variables: state→aes_state, table→aes_sbox + set-function-prototype: void aes_encrypt(uint8_t* data, uint8_t* key) + set-comment: "AES-256 encryption using standard S-box" +``` + +### Pattern Combination + +Real-world code combines multiple patterns: + +**Example: Malware C2 Communication** +``` +[Command Dispatcher] receives command from network + ↓ +[State Machine] tracks connection state + ↓ +[Callback Functions] handle specific commands + ↓ +[Buffer Management] manages received data + ↓ +[Encryption] protects command payloads +``` + +When you identify one pattern, look for related patterns in: +- Functions that call this one (higher-level orchestration) +- Functions called by this one (lower-level primitives) +- Cross-references to shared data structures + +### Progressive Understanding + +You don't need to identify every pattern perfectly on the first pass; each pass can narrow it down: + +**First pass:** "This looks like crypto (lots of XOR and loops)" +**Second pass:** "Probably a stream cipher (simple state, no large tables)" +**Third pass:** "Matches RC4 pattern (256-byte init, swap operations)" +**Fourth pass:** "Confirmed RC4 (found KSA and PRGA pattern)" + +Each pass refines understanding and guides further investigation.