Initial commit

Zhongwei Li
2025-11-30 08:38:26 +08:00
commit 41d9f6b189
304 changed files with 98322 additions and 0 deletions

@@ -0,0 +1,150 @@
{
"name": "Prototyping & Pretotyping Evaluator",
"description": "Evaluates prototype experiments for assumption clarity, appropriate fidelity, rigorous measurement, and actionable results",
"criteria": [
{
"name": "Assumption Clarity and Risk Assessment",
"weight": 1.4,
"scale": {
"1": "Vague or missing assumption, no risk assessment",
"2": "Assumption stated but not specific, weak risk rationale",
"3": "Clear assumption with basic risk assessment (high/medium/low)",
"4": "Specific testable assumption with quantified risk (probability × impact)",
"5": "Exemplary: Riskiest assumption identified from ranked list, risk score calculated, clear rationale for testing this assumption first"
}
},
{
"name": "Fidelity Appropriateness",
"weight": 1.4,
"scale": {
"1": "Severe mismatch (coded prototype for demand question, or pretotype for technical feasibility)",
"2": "Overbuilt (higher fidelity than needed) or underbuilt (too low to answer question)",
"3": "Appropriate fidelity for most questions, minor mismatch",
"4": "Well-matched fidelity with clear rationale for choice",
"5": "Exemplary: Fidelity ladder approach (started low, climbed only when validated), cost-benefit analysis for fidelity choice documented"
}
},
{
"name": "Success Criteria Definition",
"weight": 1.3,
"scale": {
"1": "No success criteria or vague ('see if users like it')",
"2": "Basic criteria but not quantitative, no thresholds",
"3": "Quantitative metric stated (e.g., '10% conversion') but no decision rule",
"4": "Clear metric with decision thresholds (persevere ≥X, pivot <Y)",
"5": "Exemplary: Criteria set before testing (documented), clear decision rule (persevere/pivot/iterate thresholds), benchmarked against industry standards"
}
},
{
"name": "User Recruitment Quality",
"weight": 1.2,
"scale": {
"1": "No target user definition or tested with friends/family",
"2": "Target defined but convenience sample (not representative)",
"3": "Recruited from target segment but small sample (n<5 qualitative, n<50 quantitative)",
"4": "Appropriate sample from target segment (n=5-10 qualitative, n=100+ quantitative), screening used",
"5": "Exemplary: Target user persona documented, screener questions used, appropriate sample size with statistical justification, diverse sub-segments represented"
}
},
{
"name": "Measurement Rigor (Behavior over Opinion)",
"weight": 1.3,
"scale": {
"1": "Only opinions ('users said they liked it'), no behavioral data",
"2": "Mix of opinions and weak behavioral signals (page views)",
"3": "Behavioral data collected (clicks, task completion) but also relying on opinions",
"4": "Strong behavioral focus (conversions, payments, retention), opinions secondary for context",
"5": "Exemplary: Pre-commitment signals measured (payments, credit card, waitlist email), behavioral metrics primary, qualitative for understanding why, clear distinction between reliable/unreliable signals"
}
},
{
"name": "Build Quality (Minimum Viable for Question)",
"weight": 1.1,
"scale": {
"1": "Severely overbuilt (months on prototype) or unusable (can't test question)",
"2": "Overbuilt (weeks when days suffice) or missing key components",
"3": "Appropriate build scope, minor overbuilding or gaps",
"4": "Minimum build for question, all necessary components present, nothing extra",
"5": "Exemplary: Time-boxed build (e.g., 1 week max), clear fake vs real components, disposable mindset (won't ship prototype code), iterated quickly on feedback"
}
},
{
"name": "Analysis and Decision Quality",
"weight": 1.2,
"scale": {
"1": "No analysis or decision, or ignored negative results",
"2": "Basic analysis but unclear decision (ambiguous results, no action)",
"3": "Analysis completed, decision stated (persevere/pivot/iterate) but weak rationale",
"4": "Rigorous analysis comparing results to criteria, clear decision with rationale",
"5": "Exemplary: Results compared to pre-set criteria, decision follows decision rule, learnings documented, next steps defined (build MVP / test alternative / iterate on X), negative results respected"
}
},
{
"name": "Ethical Transparency",
"weight": 1.0,
"scale": {
"1": "Deceptive (fake features advertised as real, charging for non-existent product)",
"2": "Misleading (implies fully functional when fake)",
"3": "Basic transparency (states 'beta' or 'early access') but could be clearer",
"4": "Transparent about limitations ('launching soon', 'early access', 'waitlist')",
"5": "Exemplary: Honest framing ('sign up for early access', 'join waitlist to be notified'), users understand they're testing concept, no promises of features that won't be built"
}
}
],
"guidance": {
"by_assumption_type": {
"demand": {
"recommended_method": "Pretotype: Fake door, landing page with sign-up, pre-order",
"success_metric": "Conversion rate (visitors → sign-ups/orders), absolute numbers (100+ sign-ups)",
"red_flags": ["Using coded prototype (overbuilt)", "Only opinions (survey), no behavior", "Testing with existing customers (not new market)"]
},
"pricing": {
"recommended_method": "Pretotype: Price on landing page, A/B test price tiers, pre-order at target price",
"success_metric": "Conversion at target price, revenue collected, willingness to pay distribution",
"red_flags": ["Asking 'would you pay $X' (opinions unreliable)", "Testing single price (no comparison)", "Free beta then hoping to charge later (different behavior)"]
},
"workflow": {
"recommended_method": "Paper or clickable prototype (depends on complexity)",
"success_metric": "Task completion rate, time on task, error rate, qualitative confusion points",
"red_flags": ["Coded prototype (overbuilt for workflow question)", "No task-based testing (just showing screens)", "Sample size <5 (pattern unclear)"]
},
"feasibility": {
"recommended_method": "Coded prototype (technical spike), manual concierge (learn before automating)",
"success_metric": "Performance (latency, throughput), cost per transaction, error rate, scalability limit",
"red_flags": ["Paper/clickable (can't test technical constraints)", "Not using real data (integration issues hidden)", "No performance benchmarks"]
}
}
},
"common_failure_modes": {
"overbuilding": {
"symptom": "Weeks/months on prototype, or coded when landing page would suffice",
"root_cause": "Excitement to build, perfectionism, uncomfortable with 'faking it'",
"fix": "Force fidelity ladder (start lowest, justify climbing), time-box builds (e.g., 1 week max)"
},
"no_success_criteria": {
"symptom": "Tested but unclear if validated, moving goalposts after results",
"root_cause": "Confirmation bias, didn't think through metrics before",
"fix": "Write success criteria doc before building, get stakeholder sign-off, commit to decision rule"
},
"wrong_users": {
"symptom": "Positive test results, market launch flops",
"root_cause": "Friends/family (polite), convenience sample (not target segment)",
"fix": "Define target persona, screen with qualifying questions, recruit from actual target market"
},
"opinion_over_behavior": {
"symptom": "'Users loved it' but no usage/retention",
"root_cause": "Social desirability bias, hypothetical bias",
"fix": "Measure behavior (clicks, payments, retention) as primary, use opinions only for context"
}
},
"excellence_indicators": [
"Riskiest assumption tested first (ranked by prob wrong × impact)",
"Fidelity matched to question (not overbuilt)",
"Success criteria set before testing (documented)",
"Behavioral metrics measured (conversions, payments, not just opinions)",
"Appropriate sample size (n=5-10 qualitative, n=100+ quantitative)",
"Clear decision made (persevere/pivot/iterate) based on pre-set criteria",
"Learnings documented and shared",
"Prototype treated as disposable (won't ship prototype code)"
]
}

@@ -0,0 +1,245 @@
# Prototyping & Pretotyping: Advanced Methodologies
## Table of Contents
1. [Pretotyping Techniques](#1-pretotyping-techniques)
2. [Fidelity Selection Framework](#2-fidelity-selection-framework)
3. [Experiment Design Principles](#3-experiment-design-principles)
4. [Measurement and Validation](#4-measurement-and-validation)
5. [Common Failure Patterns](#5-common-failure-patterns)
## 1. Pretotyping Techniques
### Fake Door Test
**What**: Feature appears in UI but doesn't exist yet
**Setup**: Add a "New Feature X" button or link, track clicks, show "Coming Soon" when clicked
**Measures**: Click-through rate (interest), wait list sign-ups (intent)
**Example**: Amazon tested new category by showing link, measuring clicks before building inventory
**When**: Test demand for new feature/product before building
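To make the setup concrete, here is a minimal sketch of the server side of a fake door, written in Python with Flask purely as an example stack; the route, log file, and waitlist link are illustrative assumptions, not a prescribed implementation. The endpoint logs each click (the interest metric) and shows an honest "Coming Soon" page with a waitlist link (the intent metric).
```python
import csv
from datetime import datetime, timezone

from flask import Flask, request

app = Flask(__name__)
LOG_PATH = "fake_door_clicks.csv"   # analyzed later for click-through rate

@app.route("/features/new-feature-x")
def fake_door():
    # Log the click before showing the "Coming Soon" message.
    with open(LOG_PATH, "a", newline="") as f:
        csv.writer(f).writerow([
            datetime.now(timezone.utc).isoformat(),
            request.remote_addr,
            request.headers.get("Referer", ""),
        ])
    return (
        "<h1>Coming Soon</h1>"
        "<p>New Feature X isn't ready yet. "
        '<a href="/waitlist">Join the waitlist</a> to be notified.</p>'
    )
```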
### Concierge MVP
**What**: Manually deliver service that will eventually be automated
**Setup**: Humans do work (curation, matching, analysis) as if algorithm did it
**Measures**: Customer satisfaction, willingness to pay, time/cost to deliver manually
**Example**: Food delivery app founders manually taking orders/delivering before building platform
**When**: Learn what "good" looks like before automating, validate service value proposition
### Wizard of Oz
**What**: System appears automated but humans power it behind scenes
**Setup**: Build UI, users interact thinking it's automated, humans respond in real-time
**Measures**: User acceptance of automated experience, performance expectations, edge cases
**Example**: IBM's speech-recognition studies: a hidden typist transcribed what the user said, so the system appeared to perform automatic transcription
**When**: Test if users accept automated interface before building complex AI/automation
### Painted Door
**What**: Feature shown in UI as "Beta" or "Early Access" but not built yet
**Setup**: Badge/flag on fake feature, measure attempts to access
**Measures**: Click rate, request rate for access
**Example**: Slack showed "Calls" feature as "Beta", measured requests before building voice infrastructure
**When**: Test interest in feature when UI space is limited (avoiding clutter)
### Single-Feature MVP
**What**: Build one feature extremely well, ignore everything else
**Setup**: Identify core value hypothesis, build only that feature
**Measures**: Retention (do users come back?), engagement (how often used?), WTP (will they pay?)
**Example**: Twitter v1 - just 140-char posts, no replies/retweets/hashtags/DMs
**When**: Test if core value alone is enough before adding features
### Pre-Order / Crowdfunding
**What**: Collect money before building product
**Setup**: Landing page with product description, pre-order button, collect payments
**Measures**: Conversion rate (visitors → buyers), funding amount vs target
**Example**: Pebble smartwatch raised $10M on Kickstarter before manufacturing
**When**: Test willingness to pay and validate demand with financial commitment
### Explainer Video
**What**: Video showing product in use before building it
**Setup**: 2-3 min video demonstrating value prop, post to landing page, measure sign-ups
**Measures**: View-to-signup conversion, qualitative feedback in comments
**Example**: Dropbox's 3-minute demo video grew the beta waiting list from roughly 5K to 75K sign-ups overnight (10% conversion)
**When**: Complex product hard to explain in text, want viral sharing
### Manual-First Approach
**What**: Do work manually before building tools/automation
**Setup**: Spreadsheets, email, manual processes instead of software
**Measures**: Feasibility (can we do it manually?), bottlenecks (what takes time?), quality (is the output good enough?)
**Example**: Zapier founders manually connecting APIs for first customers before building platform
**When**: Learn workflow requirements before automation, validate service value before tooling
## 2. Fidelity Selection Framework
### Decision Matrix
| Question | Recommended Fidelity | Timeline | Cost |
|----------|---------------------|----------|------|
| Do people want this? | Pretotype (Fake Door) | Hours-Days | $0-100 |
| Will they pay $X? | Pretotype (Pricing on landing page) | Days | $0-500 |
| Is workflow intuitive? | Paper Prototype | Hours-Days | $0-50 |
| Do interactions feel right? | Clickable Prototype | Days-Week | $100-500 |
| Can we build technically? | Coded Prototype | Weeks | $1K-10K |
| Will they retain/engage? | MVP | Months | $10K-100K+ |
### Climbing the Fidelity Ladder
Start low fidelity, climb only if validated:
1. **Pretotype** (Fake Door): 5% conversion → demand validated → climb to prototype
2. **Paper Prototype**: 8/10 users complete workflow → UX validated → climb to clickable
3. **Clickable Prototype**: 85% task completion in <2 min → flow validated → climb to coded
4. **Coded Prototype**: <500ms latency at 100 req/sec → technical validated → build MVP
5. **MVP**: 40% week-1 retention → value validated → build full product
**Don't skip steps**: Each rung de-risks the next, larger investment; one way to enforce this is sketched below.
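One way to keep the ladder honest is to record each rung with its validation gate and refuse to climb until the current gate is met. A minimal sketch, using the example thresholds above (names and structure are illustrative):
```python
from dataclasses import dataclass

@dataclass
class Rung:
    name: str
    gate: str            # what must be true before climbing past this rung
    passed: bool = False

ladder = [
    Rung("Pretotype (fake door)", "conversion >= 5%"),
    Rung("Paper prototype", ">= 8/10 users complete the workflow"),
    Rung("Clickable prototype", ">= 85% task completion in < 2 min"),
    Rung("Coded prototype", "< 500 ms latency at 100 req/s"),
    Rung("MVP", ">= 40% week-1 retention"),
]

def next_step(ladder):
    """Return the first rung whose gate has not been met yet."""
    for rung in ladder:
        if not rung.passed:
            return rung
    return None  # every gate met: build the full product

ladder[0].passed = True                 # fake door validated demand
print(next_step(ladder).name)           # -> Paper prototype
```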
### Cost-Benefit Analysis
**Example: should we build a coded prototype or stick with the clickable one?**
Clickable prototype cost: $500 (1 week designer)
Coded prototype cost: $8K (1 month engineer)
**Delta**: $7.5K, 3 weeks
Information gained from coded vs clickable:
- Performance data (real latency, not estimated)
- Integration complexity (real API issues, not mocked)
- Scalability constraints (actual database limits)
**Is $7.5K worth it?**
- If performance/integration unknown and high risk: Yes (de-risking worth cost)
- If performance/integration well-understood: No (clickable sufficient)
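One hedged way to put numbers on that question is to treat the extra spend as insurance against building an MVP on a wrong feasibility assumption. The sketch below assumes you can roughly estimate the probability the assumption is wrong and the cost of only finding out after the MVP is built; both inputs are illustrative guesses, not figures from this document.
```python
def worth_climbing(extra_cost, prob_assumption_wrong, cost_of_late_failure):
    """Rough expected-value check: is the coded prototype worth its extra cost?

    extra_cost            -- delta between coded and clickable (e.g. 7_500)
    prob_assumption_wrong -- estimated chance the feasibility assumption fails (0..1)
    cost_of_late_failure  -- rework/waste if that is only discovered post-MVP
    """
    expected_loss_avoided = prob_assumption_wrong * cost_of_late_failure
    return expected_loss_avoided > extra_cost

# High technical uncertainty: 30% chance the approach fails, $80K MVP at risk.
print(worth_climbing(7_500, 0.30, 80_000))   # True  -> de-risking is worth $7.5K
# Well-understood stack: only a 5% chance of failure.
print(worth_climbing(7_500, 0.05, 80_000))   # False -> clickable is sufficient
```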
## 3. Experiment Design Principles
### Minimum Viable Data
**Qualitative**: n=5-10 for pattern identification (Nielsen Norman Group: 5 users find 85% of usability issues)
**Quantitative**: n=100+ for statistical confidence (conversions, A/B tests)
**Don't over-collect**: More users = more time/cost. Stop when pattern clear.
### Success Criteria Template
**Good criteria** (set before testing):
- Specific: "10% landing page conversion"
- Measurable: Can be tracked with analytics
- Actionable: Tells you to pivot or persevere
- Realistic: Based on industry benchmarks
- Time-bound: "In 2 weeks"
**Decision thresholds**:
- **Persevere**: ≥10% conversion → validated, build it
- **Pivot**: <5% conversion → assumption wrong, change direction
- **Iterate**: 5-10% conversion → unclear, refine and re-test
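Written down as a small function, the rule leaves no room for moving goalposts once the data arrives; the thresholds below are the example values above:
```python
def decide(conversion, persevere_at=0.10, pivot_below=0.05):
    """Map an observed conversion rate to a pre-committed decision."""
    if conversion >= persevere_at:
        return "persevere"   # validated: build it
    if conversion < pivot_below:
        return "pivot"       # assumption wrong: change direction
    return "iterate"         # ambiguous: refine and re-test

print(decide(0.12))  # persevere
print(decide(0.07))  # iterate
print(decide(0.03))  # pivot
```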
### Bias Mitigation
**Confirmation bias**: Seeing what we want to see
- Fix: Set success criteria before testing, blind analysis (analyst doesn't know hypothesis)
**Sampling bias**: Testing wrong users
- Fix: Screen participants (e.g., "Do you currently use X?"), recruit from target segment
**Social desirability bias**: Users say what's polite
- Fix: Observe behavior (clicks, time), don't just ask opinions
**Leading questions**: "Wouldn't you love feature X?"
- Fix: Neutral framing: "How would you solve problem Y?"
## 4. Measurement and Validation
### Behavioral Metrics (Reliable)
**Pre-commitment signals** (ranked by strength):
1. **Paid**: Actual payment (strongest signal)
2. **Pre-ordered**: Credit card on file, will be charged later
3. **Waitlist with phone/email**: Provided contact info
4. **Clicked "Buy"**: Navigated to checkout (even if abandoned)
5. **Clicked feature**: Showed interest by interaction
**Engagement metrics**:
- Task completion rate: % who finished workflow
- Time on task: How long (too long = confusing)
- Error rate: Mis-clicks, form errors
- Return visits: Came back without prompt
- Referrals: Told others without being asked (strongest advocacy signal)
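Read together, these signals form a funnel, and the drop-off between stages shows how much commitment decays between curiosity and payment. A small sketch with invented counts (the numbers are placeholders, not benchmarks):
```python
# Hypothetical funnel from a pretotype test; counts are illustrative only.
funnel = [
    ("visited landing page", 2000),
    ("clicked the feature", 400),
    ("clicked 'Buy' (reached checkout)", 120),
    ("paid", 12),
]

for (_, prev_n), (stage, n) in zip(funnel, funnel[1:]):
    step_rate = n / prev_n            # conversion from the previous stage
    overall = n / funnel[0][1]        # conversion from first touch
    print(f"{stage:35s} {n:5d}   step {step_rate:5.1%}   overall {overall:5.1%}")
```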
### Opinion Metrics (Less Reliable)
**Survey responses**: "Would you pay $X?" (70% say yes, 10% actually pay → 7× overestimate)
**Net Promoter Score**: "Would you recommend?" (aspirational, not predictive)
**Satisfaction ratings**: "How satisfied?" (grade inflation, social desirability)
**Use opinions for context, not decisions**: "Why did you abandon checkout?" (explains behavior) not "Would you buy this?" (unreliable prediction)
### Statistical Confidence
**Sample size for conversions**:
- Baseline conversion: 10%
- Want to detect: 2% change (10% → 12%)
- Confidence: 95%
- **Required sample**: ~3,800 per variant at 80% power (confirm with an online calculator)
**Too small sample**: False confidence (random noise looks like signal)
**Too large sample**: Wasted time/money (if the effect turns out much larger than planned, the pattern may be clear well before the full sample)
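As a sanity check on that required sample, the standard two-proportion power calculation can be run with statsmodels; the sketch assumes a two-sided test at 80% power, which is what most online A/B calculators default to.
```python
# Sample size per variant to detect a 10% -> 12% conversion lift
# at alpha = 0.05 (95% confidence) and 80% power, two-sided test.
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize

effect = proportion_effectsize(0.12, 0.10)      # Cohen's h for the two rates
n_per_variant = NormalIndPower().solve_power(
    effect_size=effect, alpha=0.05, power=0.80, alternative="two-sided"
)
print(round(n_per_variant))   # ~3,800 per variant; online calculators give similar values
```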
### Qualitative Analysis
**Thematic coding**:
1. Collect observations/quotes (n=10 interviews)
2. Identify recurring themes (e.g., "confused by pricing", "wanted export feature")
3. Count frequency (7/10 mentioned pricing confusion)
4. Prioritize by frequency + severity
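The counting step can be as simple as tagging each interview with theme codes and tallying them; a small sketch with invented tags:
```python
from collections import Counter

# Theme codes assigned to each of 10 interviews (illustrative data).
coded_interviews = [
    {"pricing_confusion", "wants_export"},
    {"pricing_confusion"},
    {"pricing_confusion", "onboarding_friction"},
    {"wants_export"},
    {"pricing_confusion", "wants_export"},
    {"onboarding_friction"},
    {"pricing_confusion"},
    {"pricing_confusion", "wants_export"},
    {"pricing_confusion"},
    {"wants_export"},
]

counts = Counter(tag for interview in coded_interviews for tag in interview)
for theme, n in counts.most_common():
    print(f"{theme:20s} {n}/{len(coded_interviews)}")
# pricing_confusion 7/10, wants_export 5/10, onboarding_friction 2/10
```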
**Think-aloud protocol**:
- Users narrate thoughts while completing task
- Reveals mental model mismatches: "I expected X here but saw Y"
- Uncovers unspoken assumptions: "I assume this button does..."
## 5. Common Failure Patterns
### Overbuilding
**Symptom**: Coded prototype for question answerable with landing page
**Root cause**: Excitement to build, uncomfortable with "fakery", underestimating learning from cheap tests
**Fix**: Force fidelity ladder (start low, justify climbing), set "maximum time to first test" (e.g., 1 week)
### No Success Criteria
**Symptom**: Ran test, got data, unclear what it means
**Root cause**: Didn't define success before testing, moving goalposts
**Fix**: Write success criteria document before building prototype, get stakeholder sign-off
### Testing with Wrong Users
**Symptom**: Positive feedback from test, market launch flops
**Root cause**: Tested with friends/family (not target), convenience sample (not representative)
**Fix**: Screen participants (qualifying questions), recruit from target segment (ads, outreach)
### Opinion over Behavior
**Symptom**: "Users loved it in interviews" but no one uses product
**Root cause**: Relying on what users say, not what they do (social desirability, hypothetical bias)
**Fix**: Measure behavior (clicks, payments, retention) as primary, opinions as secondary context
### Single Test Overconfidence
**Symptom**: One test shows X, assume validated forever
**Root cause**: Confirmation bias, small sample, didn't test alternatives
**Fix**: Multiple tests, test variations, update beliefs with new evidence
### Prototype Becomes Product
**Symptom**: Shipped prototype code, now have technical debt/security issues
**Root cause**: Pressure to ship fast, reluctance to "throw away" working code
**Fix**: Treat prototypes as disposable (document learnings, rebuild properly for production)
### Analysis Paralysis
**Symptom**: Months refining prototype before testing
**Root cause**: Perfectionism, fear of negative feedback, unclear scope
**Fix**: Time-box prototype building (e.g., 1 week max), test with "good enough" version
### Ignoring Negative Results
**Symptom**: Test shows assumption wrong, but team proceeds anyway (sunk cost fallacy)
**Root cause**: Ego, sunk cost, optimism bias ("this time will be different")
**Fix**: Pre-commit to decision rule ("if conversion <5%, we pivot"), make pivoting psychologically safe

@@ -0,0 +1,175 @@
# Prototyping & Pretotyping Experiment Template
## Workflow
```
Prototyping Progress:
- [ ] Step 1: Identify riskiest assumption to test
- [ ] Step 2: Choose pretotype/prototype approach
- [ ] Step 3: Design and build minimum test
- [ ] Step 4: Run experiment and collect data
- [ ] Step 5: Analyze results and decide
```
## Experiment Design Template
### 1. Assumption to Test
**Assumption**: [What are we assuming? E.g., "Users will pay $49/mo for AI-powered analytics"]
**Why risky**: [Why might this be wrong? Impact if wrong?]
**Risk score**: [Probability wrong (1-5) × Impact if wrong (1-5) = Risk (1-25)]
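If several assumptions are competing for the first test, the scores can be ranked mechanically; a minimal sketch (the assumptions and scores below are placeholders showing the mechanics, not recommendations):
```python
# Rank assumptions by risk = probability wrong (1-5) x impact if wrong (1-5).
assumptions = [
    {"name": "Users will pay $49/mo for AI-powered analytics", "p_wrong": 4, "impact": 5},
    {"name": "Users can self-onboard without a sales call", "p_wrong": 3, "impact": 3},
    {"name": "We can score accounts in under 1 second", "p_wrong": 2, "impact": 4},
]

for a in assumptions:
    a["risk"] = a["p_wrong"] * a["impact"]

for a in sorted(assumptions, key=lambda a: a["risk"], reverse=True):
    print(f'{a["risk"]:>2}  {a["name"]}')
# The top row (risk 20) is the riskiest assumption -> test it first.
```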
### 2. Test Method
**Approach**: [Pretotype / Paper / Clickable / Coded / MVP]
**Fidelity choice rationale**: [Why this fidelity level? What question does it answer?]
**Estimated cost**: [$X or X hours]
**Timeline**: [X days to build, Y days to test]
### 3. Success Criteria
**Primary metric**: [E.g., "10% landing page → sign-up conversion"]
**Secondary metrics**: [E.g., "50% complete onboarding, 5 min avg session"]
**Minimum sample**: [n=X users/observations]
**Decision rule**:
- **Persevere** (build it): [Metric ≥ X means validated]
- **Pivot** (change direction): [Metric < Y means assumption wrong]
- **Iterate** (refine and re-test): [X > Metric ≥ Y means unclear, need more data]
### 4. Experiment Build
**What we're building**: [Landing page, paper prototype, working feature, etc.]
**Components needed**:
- [ ] [Component 1, e.g., Landing page copy/design]
- [ ] [Component 2, e.g., Sign-up form]
- [ ] [Component 3, e.g., Analytics tracking]
**Fake vs Real**:
- **Faking**: [What appears real but isn't? E.g., "Buy Now button shows 'Coming Soon'"]
- **Real**: [What must actually work? E.g., "Email capture must work"]
### 5. Participant Recruitment
**Target users**: [Who are we testing with? Demographics, behaviors, context]
**Sample size**: [n=X, reasoning: qualitative vs quantitative]
**Recruitment method**: [Ads, existing users, outreach, intercepts]
**Screening**: [How do we ensure target users? Screener questions]
### 6. Data Collection Plan
**Quantitative data**:
| Metric | How measured | Tool | Target |
|--------|--------------|------|--------|
| [Sign-ups] | [Form submissions] | [Google Analytics] | [≥100] |
| [Conversion] | [Sign-ups / Visitors] | [GA] | [≥10%] |
**Qualitative data**:
| Method | N | Questions/Tasks |
|--------|---|-----------------|
| [User interview] | [5-10] | [What problem were you trying to solve? Did prototype help?] |
| [Task observation] | [10] | [Complete checkout, note errors/confusion] |
### 7. Results
**Quantitative**:
| Metric | Target | Actual | Status |
|--------|--------|--------|--------|
| [Sign-ups] | [≥100] | [X] | [✓ / ✗] |
| [Conversion] | [≥10%] | [Y%] | [✓ / ✗] |
**Qualitative**:
- **Observation 1**: [E.g., "7/10 users confused by pricing page"]
- **Observation 2**: [E.g., "All users expected 'Export' feature"]
- **Quote 1**: [User said...]
- **Quote 2**: [User said...]
### 8. Decision
**Decision**: [Persevere / Pivot / Iterate]
**Rationale**: [Why? Which criteria met/not met?]
**Next steps**:
- [ ] [If Persevere: Build MVP with features X, Y, Z]
- [ ] [If Pivot: Test alternative approach A]
- [ ] [If Iterate: Refine prototype addressing issues 1, 2, 3, re-test in 2 weeks]
**Learnings**:
1. [What we learned about assumption]
2. [What surprised us]
3. [What to test next]
---
## Quick Patterns
### Pretotype Methods
**Fake Door Test** (Test demand):
- Build: Landing page "New Feature X - Coming Soon" with "Notify Me" button
- Measure: Click rate, email sign-ups
- Example: "500 visitors, 50 sign-ups (10%) → validates demand"
**Concierge MVP** (Test workflow manually before automating):
- Build: Manual service delivery (no automation)
- Measure: Customer satisfaction, willingness to pay, time spent
- Example: "Manually curate recommendations for 10 users → learn what good looks like before building algorithm"
**Wizard of Oz** (Appear automated, human-powered):
- Build: UI looks automated, humans behind scenes
- Measure: User perception, task success, performance expectations
- Example: "Chatbot UI, humans answering questions → test if users accept chatbot interaction before building NLP"
**Single-Feature MVP** (Test one feature well):
- Build: One core feature, ignore rest
- Measure: Usage, retention, WTP
- Example: "Instagram v1: photo filters only → test if core value enough before building stories/reels"
### Prototype Methods
**Paper Prototype** (Test workflow):
- Build: Hand-drawn screens on paper/cards
- Test: Users "click" on paper, swap screens, observe
- Measure: Task completion, errors, confusion points
- Example: "10 users complete checkout, 3 confused by shipping step → redesign before coding"
**Clickable Prototype** (Test UI/UX):
- Build: Interactive mockup in Figma/InVision (no real code)
- Test: Users complete tasks, measure success/time
- Measure: Completion rate, time, errors, satisfaction
- Example: "20 users, 85% complete task <3 min → validates flow"
**Coded Prototype** (Test feasibility):
- Build: Working code, limited features/data
- Test: Real users, real tasks, measure performance
- Measure: Latency, error rate, scalability, cost
- Example: "Search 10K docs <500ms → validates approach, ready to scale to 10M docs"
### Measurement Approaches
**Quantitative (n=100+)**:
- Conversion rates (landing page → sign-up, sign-up → payment)
- Task completion rates (% who finish checkout)
- Time on task (how long to complete)
- Error rates (clicks on wrong element, form errors)
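When reporting any of these rates, attaching a confidence interval helps avoid over-reading a small sample; a sketch using statsmodels (the counts are illustrative):
```python
# 95% confidence interval for an observed conversion rate.
from statsmodels.stats.proportion import proportion_confint

signups, visitors = 50, 500        # e.g. fake-door sign-ups out of visitors
rate = signups / visitors
low, high = proportion_confint(signups, visitors, alpha=0.05, method="wilson")
print(f"conversion {rate:.1%} (95% CI {low:.1%}-{high:.1%})")
# -> conversion 10.0% (95% CI 7.7%-12.9%)
```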
**Qualitative (n=5-10)**:
- Think-aloud protocol (users narrate thought process)
- Retrospective interview (after task, ask about confusion/delight)
- Observation notes (where they pause, retry, look confused)
- Open-ended feedback (what worked, what didn't)
**Behavioral > Opinions**:
- ✓ "50 clicked 'Buy', 5 completed payment" (behavior)
- ❌ "Users said they'd pay $99" (opinion, unreliable)
---
## Quality Checklist
- [ ] Assumption is risky (high probability wrong × high impact if wrong)
- [ ] Fidelity matches question (not overbuilt)
- [ ] Success criteria set before testing (no moving goalposts)
- [ ] Recruited real target users (not friends/family)
- [ ] Sample size appropriate (n=5-10 qualitative, n=100+ quantitative)
- [ ] Measuring behavior (clicks, conversions), not just opinions
- [ ] Clear decision rule (persevere/pivot/iterate thresholds)
- [ ] Results documented and shared with team