Initial commit

Zhongwei Li
2025-11-30 08:38:26 +08:00
commit 41d9f6b189
304 changed files with 98322 additions and 0 deletions

@@ -0,0 +1,150 @@
{
"name": "Prototyping & Pretotyping Evaluator",
"description": "Evaluates prototype experiments for assumption clarity, appropriate fidelity, rigorous measurement, and actionable results",
"criteria": [
{
"name": "Assumption Clarity and Risk Assessment",
"weight": 1.4,
"scale": {
"1": "Vague or missing assumption, no risk assessment",
"2": "Assumption stated but not specific, weak risk rationale",
"3": "Clear assumption with basic risk assessment (high/medium/low)",
"4": "Specific testable assumption with quantified risk (probability × impact)",
"5": "Exemplary: Riskiest assumption identified from ranked list, risk score calculated, clear rationale for testing this assumption first"
}
},
{
"name": "Fidelity Appropriateness",
"weight": 1.4,
"scale": {
"1": "Severe mismatch (coded prototype for demand question, or pretotype for technical feasibility)",
"2": "Overbuilt (higher fidelity than needed) or underbuilt (too low to answer question)",
"3": "Appropriate fidelity for most questions, minor mismatch",
"4": "Well-matched fidelity with clear rationale for choice",
"5": "Exemplary: Fidelity ladder approach (started low, climbed only when validated), cost-benefit analysis for fidelity choice documented"
}
},
{
"name": "Success Criteria Definition",
"weight": 1.3,
"scale": {
"1": "No success criteria or vague ('see if users like it')",
"2": "Basic criteria but not quantitative, no thresholds",
"3": "Quantitative metric stated (e.g., '10% conversion') but no decision rule",
"4": "Clear metric with decision thresholds (persevere ≥X, pivot <Y)",
"5": "Exemplary: Criteria set before testing (documented), clear decision rule (persevere/pivot/iterate thresholds), benchmarked against industry standards"
}
},
{
"name": "User Recruitment Quality",
"weight": 1.2,
"scale": {
"1": "No target user definition or tested with friends/family",
"2": "Target defined but convenience sample (not representative)",
"3": "Recruited from target segment but small sample (n<5 qualitative, n<50 quantitative)",
"4": "Appropriate sample from target segment (n=5-10 qualitative, n=100+ quantitative), screening used",
"5": "Exemplary: Target user persona documented, screener questions used, appropriate sample size with statistical justification, diverse sub-segments represented"
}
},
{
"name": "Measurement Rigor (Behavior over Opinion)",
"weight": 1.3,
"scale": {
"1": "Only opinions ('users said they liked it'), no behavioral data",
"2": "Mix of opinions and weak behavioral signals (page views)",
"3": "Behavioral data collected (clicks, task completion) but also relying on opinions",
"4": "Strong behavioral focus (conversions, payments, retention), opinions secondary for context",
"5": "Exemplary: Pre-commitment signals measured (payments, credit card, waitlist email), behavioral metrics primary, qualitative for understanding why, clear distinction between reliable/unreliable signals"
}
},
{
"name": "Build Quality (Minimum Viable for Question)",
"weight": 1.1,
"scale": {
"1": "Severely overbuilt (months on prototype) or unusable (can't test question)",
"2": "Overbuilt (weeks when days suffice) or missing key components",
"3": "Appropriate build scope, minor overbuilding or gaps",
"4": "Minimum build for question, all necessary components present, nothing extra",
"5": "Exemplary: Time-boxed build (e.g., 1 week max), clear fake vs real components, disposable mindset (won't ship prototype code), iterated quickly on feedback"
}
},
{
"name": "Analysis and Decision Quality",
"weight": 1.2,
"scale": {
"1": "No analysis or decision, or ignored negative results",
"2": "Basic analysis but unclear decision (ambiguous results, no action)",
"3": "Analysis completed, decision stated (persevere/pivot/iterate) but weak rationale",
"4": "Rigorous analysis comparing results to criteria, clear decision with rationale",
"5": "Exemplary: Results compared to pre-set criteria, decision follows decision rule, learnings documented, next steps defined (build MVP / test alternative / iterate on X), negative results respected"
}
},
{
"name": "Ethical Transparency",
"weight": 1.0,
"scale": {
"1": "Deceptive (fake features advertised as real, charging for non-existent product)",
"2": "Misleading (implies fully functional when fake)",
"3": "Basic transparency (states 'beta' or 'early access') but could be clearer",
"4": "Transparent about limitations ('launching soon', 'early access', 'waitlist')",
"5": "Exemplary: Honest framing ('sign up for early access', 'join waitlist to be notified'), users understand they're testing concept, no promises of features that won't be built"
}
}
],
"guidance": {
"by_assumption_type": {
"demand": {
"recommended_method": "Pretotype: Fake door, landing page with sign-up, pre-order",
"success_metric": "Conversion rate (visitors → sign-ups/orders), absolute numbers (100+ sign-ups)",
"red_flags": ["Using coded prototype (overbuilt)", "Only opinions (survey), no behavior", "Testing with existing customers (not new market)"]
},
"pricing": {
"recommended_method": "Pretotype: Price on landing page, A/B test price tiers, pre-order at target price",
"success_metric": "Conversion at target price, revenue collected, willingness to pay distribution",
"red_flags": ["Asking 'would you pay $X' (opinions unreliable)", "Testing single price (no comparison)", "Free beta then hoping to charge later (different behavior)"]
},
"workflow": {
"recommended_method": "Paper or clickable prototype (depends on complexity)",
"success_metric": "Task completion rate, time on task, error rate, qualitative confusion points",
"red_flags": ["Coded prototype (overbuilt for workflow question)", "No task-based testing (just showing screens)", "Sample size <5 (pattern unclear)"]
},
"feasibility": {
"recommended_method": "Coded prototype (technical spike), manual concierge (learn before automating)",
"success_metric": "Performance (latency, throughput), cost per transaction, error rate, scalability limit",
"red_flags": ["Paper/clickable (can't test technical constraints)", "Not using real data (integration issues hidden)", "No performance benchmarks"]
}
}
},
"common_failure_modes": {
"overbuilding": {
"symptom": "Weeks/months on prototype, or coded when landing page would suffice",
"root_cause": "Excitement to build, perfectionism, uncomfortable with 'faking it'",
"fix": "Force fidelity ladder (start lowest, justify climbing), time-box builds (e.g., 1 week max)"
},
"no_success_criteria": {
"symptom": "Tested but unclear if validated, moving goalposts after results",
"root_cause": "Confirmation bias, didn't think through metrics before",
"fix": "Write success criteria doc before building, get stakeholder sign-off, commit to decision rule"
},
"wrong_users": {
"symptom": "Positive test results, market launch flops",
"root_cause": "Friends/family (polite), convenience sample (not target segment)",
"fix": "Define target persona, screen with qualifying questions, recruit from actual target market"
},
"opinion_over_behavior": {
"symptom": "'Users loved it' but no usage/retention",
"root_cause": "Social desirability bias, hypothetical bias",
"fix": "Measure behavior (clicks, payments, retention) as primary, use opinions only for context"
}
},
"excellence_indicators": [
"Riskiest assumption tested first (ranked by prob wrong × impact)",
"Fidelity matched to question (not overbuilt)",
"Success criteria set before testing (documented)",
"Behavioral metrics measured (conversions, payments, not just opinions)",
"Appropriate sample size (n=5-10 qualitative, n=100+ quantitative)",
"Clear decision made (persevere/pivot/iterate) based on pre-set criteria",
"Learnings documented and shared",
"Prototype treated as disposable (won't ship prototype code)"
]
}

@@ -0,0 +1,245 @@
# Prototyping & Pretotyping: Advanced Methodologies
## Table of Contents
1. [Pretotyping Techniques](#1-pretotyping-techniques)
2. [Fidelity Selection Framework](#2-fidelity-selection-framework)
3. [Experiment Design Principles](#3-experiment-design-principles)
4. [Measurement and Validation](#4-measurement-and-validation)
5. [Common Failure Patterns](#5-common-failure-patterns)
## 1. Pretotyping Techniques
### Fake Door Test
**What**: Feature appears in UI but doesn't exist yet
**Setup**: Add a "New Feature X" button or link, track clicks, show "Coming Soon" when clicked
**Measures**: Click-through rate (interest), wait list sign-ups (intent)
**Example**: Amazon tested new category by showing link, measuring clicks before building inventory
**When**: Test demand for new feature/product before building
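To make the setup concrete, here is a minimal sketch of the server side of a fake door, written in Python with Flask purely as an example stack; the route, log file, and waitlist link are illustrative assumptions, not a prescribed implementation. The endpoint logs each click (the interest metric) and shows an honest "Coming Soon" page with a waitlist link (the intent metric).
```python
import csv
from datetime import datetime, timezone

from flask import Flask, request

app = Flask(__name__)
LOG_PATH = "fake_door_clicks.csv"   # analyzed later for click-through rate

@app.route("/features/new-feature-x")
def fake_door():
    # Log the click before showing the "Coming Soon" message.
    with open(LOG_PATH, "a", newline="") as f:
        csv.writer(f).writerow([
            datetime.now(timezone.utc).isoformat(),
            request.remote_addr,
            request.headers.get("Referer", ""),
        ])
    return (
        "<h1>Coming Soon</h1>"
        "<p>New Feature X isn't ready yet. "
        '<a href="/waitlist">Join the waitlist</a> to be notified.</p>'
    )
```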
### Concierge MVP
**What**: Manually deliver service that will eventually be automated
**Setup**: Humans do work (curation, matching, analysis) as if algorithm did it
**Measures**: Customer satisfaction, willingness to pay, time/cost to deliver manually
**Example**: Food delivery app founders manually taking orders/delivering before building platform
**When**: Learn what "good" looks like before automating, validate service value proposition
### Wizard of Oz
**What**: System appears automated but humans power it behind scenes
**Setup**: Build UI, users interact thinking it's automated, humans respond in real-time
**Measures**: User acceptance of automated experience, performance expectations, edge cases
**Example**: IBM's speech-recognition studies: a hidden typist transcribed what the user said, so the system appeared to perform automatic transcription
**When**: Test if users accept automated interface before building complex AI/automation
### Painted Door
**What**: Feature shown in UI as "Beta" or "Early Access" but not built yet
**Setup**: Badge/flag on fake feature, measure attempts to access
**Measures**: Click rate, request rate for access
**Example**: Slack showed "Calls" feature as "Beta", measured requests before building voice infrastructure
**When**: Test interest in feature when UI space is limited (avoiding clutter)
### Single-Feature MVP
**What**: Build one feature extremely well, ignore everything else
**Setup**: Identify core value hypothesis, build only that feature
**Measures**: Retention (do users come back?), engagement (how often used?), WTP (will they pay?)
**Example**: Twitter v1 - just 140-char posts, no replies/retweets/hashtags/DMs
**When**: Test if core value alone is enough before adding features
### Pre-Order / Crowdfunding
**What**: Collect money before building product
**Setup**: Landing page with product description, pre-order button, collect payments
**Measures**: Conversion rate (visitors → buyers), funding amount vs target
**Example**: Pebble smartwatch raised $10M on Kickstarter before manufacturing
**When**: Test willingness to pay and validate demand with financial commitment
### Explainer Video
**What**: Video showing product in use before building it
**Setup**: 2-3 min video demonstrating value prop, post to landing page, measure sign-ups
**Measures**: View-to-signup conversion, qualitative feedback in comments
**Example**: Dropbox's 3-minute demo video grew the beta waiting list from roughly 5K to 75K sign-ups overnight (10% conversion)
**When**: Complex product hard to explain in text, want viral sharing
### Manual-First Approach
**What**: Do work manually before building tools/automation
**Setup**: Spreadsheets, email, manual processes instead of software
**Measures**: Feasibility (can we do it manually?), bottlenecks (what takes time?), quality (is the output good enough?)
**Example**: Zapier founders manually connecting APIs for first customers before building platform
**When**: Learn workflow requirements before automation, validate service value before tooling
## 2. Fidelity Selection Framework
### Decision Matrix
| Question | Recommended Fidelity | Timeline | Cost |
|----------|---------------------|----------|------|
| Do people want this? | Pretotype (Fake Door) | Hours-Days | $0-100 |
| Will they pay $X? | Pretotype (Pricing on landing page) | Days | $0-500 |
| Is workflow intuitive? | Paper Prototype | Hours-Days | $0-50 |
| Do interactions feel right? | Clickable Prototype | Days-Week | $100-500 |
| Can we build technically? | Coded Prototype | Weeks | $1K-10K |
| Will they retain/engage? | MVP | Months | $10K-100K+ |
### Climbing the Fidelity Ladder
Start low fidelity, climb only if validated:
1. **Pretotype** (Fake Door): 5% conversion → demand validated → climb to prototype
2. **Paper Prototype**: 8/10 users complete workflow → UX validated → climb to clickable
3. **Clickable Prototype**: 85% task completion in <2 min → flow validated → climb to coded
4. **Coded Prototype**: <500ms latency at 100 req/sec → technical validated → build MVP
5. **MVP**: 40% week-1 retention → value validated → build full product
**Don't skip steps**: Each rung de-risks the next, larger investment; one way to enforce this is sketched below.
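One way to keep the ladder honest is to record each rung with its validation gate and refuse to climb until the current gate is met. A minimal sketch, using the example thresholds above (names and structure are illustrative):
```python
from dataclasses import dataclass

@dataclass
class Rung:
    name: str
    gate: str            # what must be true before climbing past this rung
    passed: bool = False

ladder = [
    Rung("Pretotype (fake door)", "conversion >= 5%"),
    Rung("Paper prototype", ">= 8/10 users complete the workflow"),
    Rung("Clickable prototype", ">= 85% task completion in < 2 min"),
    Rung("Coded prototype", "< 500 ms latency at 100 req/s"),
    Rung("MVP", ">= 40% week-1 retention"),
]

def next_step(ladder):
    """Return the first rung whose gate has not been met yet."""
    for rung in ladder:
        if not rung.passed:
            return rung
    return None  # every gate met: build the full product

ladder[0].passed = True                 # fake door validated demand
print(next_step(ladder).name)           # -> Paper prototype
```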
### Cost-Benefit Analysis
**Example: should we build a coded prototype or stick with the clickable one?**
Clickable prototype cost: $500 (1 week designer)
Coded prototype cost: $8K (1 month engineer)
**Delta**: $7.5K, 3 weeks
Information gained from coded vs clickable:
- Performance data (real latency, not estimated)
- Integration complexity (real API issues, not mocked)
- Scalability constraints (actual database limits)
**Is $7.5K worth it?**
- If performance/integration unknown and high risk: Yes (de-risking worth cost)
- If performance/integration well-understood: No (clickable sufficient)
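One hedged way to put numbers on that question is to treat the extra spend as insurance against building an MVP on a wrong feasibility assumption. The sketch below assumes you can roughly estimate the probability the assumption is wrong and the cost of only finding out after the MVP is built; both inputs are illustrative guesses, not figures from this document.
```python
def worth_climbing(extra_cost, prob_assumption_wrong, cost_of_late_failure):
    """Rough expected-value check: is the coded prototype worth its extra cost?

    extra_cost            -- delta between coded and clickable (e.g. 7_500)
    prob_assumption_wrong -- estimated chance the feasibility assumption fails (0..1)
    cost_of_late_failure  -- rework/waste if that is only discovered post-MVP
    """
    expected_loss_avoided = prob_assumption_wrong * cost_of_late_failure
    return expected_loss_avoided > extra_cost

# High technical uncertainty: 30% chance the approach fails, $80K MVP at risk.
print(worth_climbing(7_500, 0.30, 80_000))   # True  -> de-risking is worth $7.5K
# Well-understood stack: only a 5% chance of failure.
print(worth_climbing(7_500, 0.05, 80_000))   # False -> clickable is sufficient
```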
## 3. Experiment Design Principles
### Minimum Viable Data
**Qualitative**: n=5-10 for pattern identification (Nielsen Norman Group: 5 users find 85% of usability issues)
**Quantitative**: n=100+ for statistical confidence (conversions, A/B tests)
**Don't over-collect**: More users = more time/cost. Stop when pattern clear.
### Success Criteria Template
**Good criteria** (set before testing):
- Specific: "10% landing page conversion"
- Measurable: Can be tracked with analytics
- Actionable: Tells you to pivot or persevere
- Realistic: Based on industry benchmarks
- Time-bound: "In 2 weeks"
**Decision thresholds**:
- **Persevere**: ≥10% conversion → validated, build it
- **Pivot**: <5% conversion → assumption wrong, change direction
- **Iterate**: 5-10% conversion → unclear, refine and re-test
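Written down as a small function, the rule leaves no room for moving goalposts once the data arrives; the thresholds below are the example values above:
```python
def decide(conversion, persevere_at=0.10, pivot_below=0.05):
    """Map an observed conversion rate to a pre-committed decision."""
    if conversion >= persevere_at:
        return "persevere"   # validated: build it
    if conversion < pivot_below:
        return "pivot"       # assumption wrong: change direction
    return "iterate"         # ambiguous: refine and re-test

print(decide(0.12))  # persevere
print(decide(0.07))  # iterate
print(decide(0.03))  # pivot
```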
### Bias Mitigation
**Confirmation bias**: Seeing what we want to see
- Fix: Set success criteria before testing, blind analysis (analyst doesn't know hypothesis)
**Sampling bias**: Testing wrong users
- Fix: Screen participants (e.g., "Do you currently use X?"), recruit from target segment
**Social desirability bias**: Users say what's polite
- Fix: Observe behavior (clicks, time), don't just ask opinions
**Leading questions**: "Wouldn't you love feature X?"
- Fix: Neutral framing: "How would you solve problem Y?"
## 4. Measurement and Validation
### Behavioral Metrics (Reliable)
**Pre-commitment signals** (ranked by strength):
1. **Paid**: Actual payment (strongest signal)
2. **Pre-ordered**: Credit card on file, will be charged later
3. **Waitlist with phone/email**: Provided contact info
4. **Clicked "Buy"**: Navigated to checkout (even if abandoned)
5. **Clicked feature**: Showed interest by interaction
**Engagement metrics**:
- Task completion rate: % who finished workflow
- Time on task: How long (too long = confusing)
- Error rate: Mis-clicks, form errors
- Return visits: Came back without prompt
- Referrals: Told others without being asked (strongest advocacy signal)
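Read together, these signals form a funnel, and the drop-off between stages shows how much commitment decays between curiosity and payment. A small sketch with invented counts (the numbers are placeholders, not benchmarks):
```python
# Hypothetical funnel from a pretotype test; counts are illustrative only.
funnel = [
    ("visited landing page", 2000),
    ("clicked the feature", 400),
    ("clicked 'Buy' (reached checkout)", 120),
    ("paid", 12),
]

for (_, prev_n), (stage, n) in zip(funnel, funnel[1:]):
    step_rate = n / prev_n            # conversion from the previous stage
    overall = n / funnel[0][1]        # conversion from first touch
    print(f"{stage:35s} {n:5d}   step {step_rate:5.1%}   overall {overall:5.1%}")
```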
### Opinion Metrics (Less Reliable)
**Survey responses**: "Would you pay $X?" (70% say yes, 10% actually pay → 7× overestimate)
**Net Promoter Score**: "Would you recommend?" (aspirational, not predictive)
**Satisfaction ratings**: "How satisfied?" (grade inflation, social desirability)
**Use opinions for context, not decisions**: "Why did you abandon checkout?" (explains behavior) not "Would you buy this?" (unreliable prediction)
### Statistical Confidence
**Sample size for conversions**:
- Baseline conversion: 10%
- Want to detect: 2% change (10% → 12%)
- Confidence: 95%
- **Required sample**: ~3,800 per variant at 80% power (confirm with an online calculator)
**Too small sample**: False confidence (random noise looks like signal)
**Too large sample**: Wasted time/money (if the effect turns out much larger than planned, the pattern may be clear well before the full sample)
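As a sanity check on that required sample, the standard two-proportion power calculation can be run with statsmodels; the sketch assumes a two-sided test at 80% power, which is what most online A/B calculators default to.
```python
# Sample size per variant to detect a 10% -> 12% conversion lift
# at alpha = 0.05 (95% confidence) and 80% power, two-sided test.
from statsmodels.stats.power import NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize

effect = proportion_effectsize(0.12, 0.10)      # Cohen's h for the two rates
n_per_variant = NormalIndPower().solve_power(
    effect_size=effect, alpha=0.05, power=0.80, alternative="two-sided"
)
print(round(n_per_variant))   # ~3,800 per variant; online calculators give similar values
```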
### Qualitative Analysis
**Thematic coding**:
1. Collect observations/quotes (n=10 interviews)
2. Identify recurring themes (e.g., "confused by pricing", "wanted export feature")
3. Count frequency (7/10 mentioned pricing confusion)
4. Prioritize by frequency + severity
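The counting step can be as simple as tagging each interview with theme codes and tallying them; a small sketch with invented tags:
```python
from collections import Counter

# Theme codes assigned to each of 10 interviews (illustrative data).
coded_interviews = [
    {"pricing_confusion", "wants_export"},
    {"pricing_confusion"},
    {"pricing_confusion", "onboarding_friction"},
    {"wants_export"},
    {"pricing_confusion", "wants_export"},
    {"onboarding_friction"},
    {"pricing_confusion"},
    {"pricing_confusion", "wants_export"},
    {"pricing_confusion"},
    {"wants_export"},
]

counts = Counter(tag for interview in coded_interviews for tag in interview)
for theme, n in counts.most_common():
    print(f"{theme:20s} {n}/{len(coded_interviews)}")
# pricing_confusion 7/10, wants_export 5/10, onboarding_friction 2/10
```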
**Think-aloud protocol**:
- Users narrate thoughts while completing task
- Reveals mental model mismatches: "I expected X here but saw Y"
- Uncovers unspoken assumptions: "I assume this button does..."
## 5. Common Failure Patterns
### Overbuilding
**Symptom**: Coded prototype for question answerable with landing page
**Root cause**: Excitement to build, uncomfortable with "fakery", underestimating learning from cheap tests
**Fix**: Force fidelity ladder (start low, justify climbing), set "maximum time to first test" (e.g., 1 week)
### No Success Criteria
**Symptom**: Ran test, got data, unclear what it means
**Root cause**: Didn't define success before testing, moving goalposts
**Fix**: Write success criteria document before building prototype, get stakeholder sign-off
### Testing with Wrong Users
**Symptom**: Positive feedback from test, market launch flops
**Root cause**: Tested with friends/family (not target), convenience sample (not representative)
**Fix**: Screen participants (qualifying questions), recruit from target segment (ads, outreach)
### Opinion over Behavior
**Symptom**: "Users loved it in interviews" but no one uses product
**Root cause**: Relying on what users say, not what they do (social desirability, hypothetical bias)
**Fix**: Measure behavior (clicks, payments, retention) as primary, opinions as secondary context
### Single Test Overconfidence
**Symptom**: One test shows X, assume validated forever
**Root cause**: Confirmation bias, small sample, didn't test alternatives
**Fix**: Multiple tests, test variations, update beliefs with new evidence
### Prototype Becomes Product
**Symptom**: Shipped prototype code, now have technical debt/security issues
**Root cause**: Pressure to ship fast, reluctance to "throw away" working code
**Fix**: Treat prototypes as disposable (document learnings, rebuild properly for production)
### Analysis Paralysis
**Symptom**: Months refining prototype before testing
**Root cause**: Perfectionism, fear of negative feedback, unclear scope
**Fix**: Time-box prototype building (e.g., 1 week max), test with "good enough" version
### Ignoring Negative Results
**Symptom**: Test shows assumption wrong, but team proceeds anyway (sunk cost fallacy)
**Root cause**: Ego, sunk cost, optimism bias ("this time will be different")
**Fix**: Pre-commit to decision rule ("if conversion <5%, we pivot"), make pivoting psychologically safe

@@ -0,0 +1,175 @@
# Prototyping & Pretotyping Experiment Template
## Workflow
```
Prototyping Progress:
- [ ] Step 1: Identify riskiest assumption to test
- [ ] Step 2: Choose pretotype/prototype approach
- [ ] Step 3: Design and build minimum test
- [ ] Step 4: Run experiment and collect data
- [ ] Step 5: Analyze results and decide
```
## Experiment Design Template
### 1. Assumption to Test
**Assumption**: [What are we assuming? E.g., "Users will pay $49/mo for AI-powered analytics"]
**Why risky**: [Why might this be wrong? Impact if wrong?]
**Risk score**: [Probability wrong (1-5) × Impact if wrong (1-5) = Risk (1-25)]
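If several assumptions are competing for the first test, the scores can be ranked mechanically; a minimal sketch (the assumptions and scores below are placeholders showing the mechanics, not recommendations):
```python
# Rank assumptions by risk = probability wrong (1-5) x impact if wrong (1-5).
assumptions = [
    {"name": "Users will pay $49/mo for AI-powered analytics", "p_wrong": 4, "impact": 5},
    {"name": "Users can self-onboard without a sales call", "p_wrong": 3, "impact": 3},
    {"name": "We can score accounts in under 1 second", "p_wrong": 2, "impact": 4},
]

for a in assumptions:
    a["risk"] = a["p_wrong"] * a["impact"]

for a in sorted(assumptions, key=lambda a: a["risk"], reverse=True):
    print(f'{a["risk"]:>2}  {a["name"]}')
# The top row (risk 20) is the riskiest assumption -> test it first.
```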
### 2. Test Method
**Approach**: [Pretotype / Paper / Clickable / Coded / MVP]
**Fidelity choice rationale**: [Why this fidelity level? What question does it answer?]
**Estimated cost**: [$X or X hours]
**Timeline**: [X days to build, Y days to test]
### 3. Success Criteria
**Primary metric**: [E.g., "10% landing page → sign-up conversion"]
**Secondary metrics**: [E.g., "50% complete onboarding, 5 min avg session"]
**Minimum sample**: [n=X users/observations]
**Decision rule**:
- **Persevere** (build it): [Metric ≥ X means validated]
- **Pivot** (change direction): [Metric < Y means assumption wrong]
- **Iterate** (refine and re-test): [X > Metric ≥ Y means unclear, need more data]
### 4. Experiment Build
**What we're building**: [Landing page, paper prototype, working feature, etc.]
**Components needed**:
- [ ] [Component 1, e.g., Landing page copy/design]
- [ ] [Component 2, e.g., Sign-up form]
- [ ] [Component 3, e.g., Analytics tracking]
**Fake vs Real**:
- **Faking**: [What appears real but isn't? E.g., "Buy Now button shows 'Coming Soon'"]
- **Real**: [What must actually work? E.g., "Email capture must work"]
### 5. Participant Recruitment
**Target users**: [Who are we testing with? Demographics, behaviors, context]
**Sample size**: [n=X, reasoning: qualitative vs quantitative]
**Recruitment method**: [Ads, existing users, outreach, intercepts]
**Screening**: [How do we ensure target users? Screener questions]
### 6. Data Collection Plan
**Quantitative data**:
| Metric | How measured | Tool | Target |
|--------|--------------|------|--------|
| [Sign-ups] | [Form submissions] | [Google Analytics] | [≥100] |
| [Conversion] | [Sign-ups / Visitors] | [GA] | [≥10%] |
**Qualitative data**:
| Method | N | Questions/Tasks |
|--------|---|-----------------|
| [User interview] | [5-10] | [What problem were you trying to solve? Did prototype help?] |
| [Task observation] | [10] | [Complete checkout, note errors/confusion] |
### 7. Results
**Quantitative**:
| Metric | Target | Actual | Status |
|--------|--------|--------|--------|
| [Sign-ups] | [≥100] | [X] | [✓ / ✗] |
| [Conversion] | [≥10%] | [Y%] | [✓ / ✗] |
**Qualitative**:
- **Observation 1**: [E.g., "7/10 users confused by pricing page"]
- **Observation 2**: [E.g., "All users expected 'Export' feature"]
- **Quote 1**: [User said...]
- **Quote 2**: [User said...]
### 8. Decision
**Decision**: [Persevere / Pivot / Iterate]
**Rationale**: [Why? Which criteria met/not met?]
**Next steps**:
- [ ] [If Persevere: Build MVP with features X, Y, Z]
- [ ] [If Pivot: Test alternative approach A]
- [ ] [If Iterate: Refine prototype addressing issues 1, 2, 3, re-test in 2 weeks]
**Learnings**:
1. [What we learned about assumption]
2. [What surprised us]
3. [What to test next]
---
## Quick Patterns
### Pretotype Methods
**Fake Door Test** (Test demand):
- Build: Landing page "New Feature X - Coming Soon" with "Notify Me" button
- Measure: Click rate, email sign-ups
- Example: "500 visitors, 50 sign-ups (10%) → validates demand"
**Concierge MVP** (Test workflow manually before automating):
- Build: Manual service delivery (no automation)
- Measure: Customer satisfaction, willingness to pay, time spent
- Example: "Manually curate recommendations for 10 users → learn what good looks like before building algorithm"
**Wizard of Oz** (Appear automated, human-powered):
- Build: UI looks automated, humans behind scenes
- Measure: User perception, task success, performance expectations
- Example: "Chatbot UI, humans answering questions → test if users accept chatbot interaction before building NLP"
**Single-Feature MVP** (Test one feature well):
- Build: One core feature, ignore rest
- Measure: Usage, retention, WTP
- Example: "Instagram v1: photo filters only → test if core value enough before building stories/reels"
### Prototype Methods
**Paper Prototype** (Test workflow):
- Build: Hand-drawn screens on paper/cards
- Test: Users "click" on paper, swap screens, observe
- Measure: Task completion, errors, confusion points
- Example: "10 users complete checkout, 3 confused by shipping step → redesign before coding"
**Clickable Prototype** (Test UI/UX):
- Build: Interactive mockup in Figma/InVision (no real code)
- Test: Users complete tasks, measure success/time
- Measure: Completion rate, time, errors, satisfaction
- Example: "20 users, 85% complete task <3 min → validates flow"
**Coded Prototype** (Test feasibility):
- Build: Working code, limited features/data
- Test: Real users, real tasks, measure performance
- Measure: Latency, error rate, scalability, cost
- Example: "Search 10K docs <500ms → validates approach, ready to scale to 10M docs"
### Measurement Approaches
**Quantitative (n=100+)**:
- Conversion rates (landing page → sign-up, sign-up → payment)
- Task completion rates (% who finish checkout)
- Time on task (how long to complete)
- Error rates (clicks on wrong element, form errors)
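When reporting any of these rates, attaching a confidence interval helps avoid over-reading a small sample; a sketch using statsmodels (the counts are illustrative):
```python
# 95% confidence interval for an observed conversion rate.
from statsmodels.stats.proportion import proportion_confint

signups, visitors = 50, 500        # e.g. fake-door sign-ups out of visitors
rate = signups / visitors
low, high = proportion_confint(signups, visitors, alpha=0.05, method="wilson")
print(f"conversion {rate:.1%} (95% CI {low:.1%}-{high:.1%})")
# -> conversion 10.0% (95% CI 7.7%-12.9%)
```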
**Qualitative (n=5-10)**:
- Think-aloud protocol (users narrate thought process)
- Retrospective interview (after task, ask about confusion/delight)
- Observation notes (where they pause, retry, look confused)
- Open-ended feedback (what worked, what didn't)
**Behavioral > Opinions**:
- ✓ "50 clicked 'Buy', 5 completed payment" (behavior)
- ❌ "Users said they'd pay $99" (opinion, unreliable)
---
## Quality Checklist
- [ ] Assumption is risky (high probability wrong × high impact if wrong)
- [ ] Fidelity matches question (not overbuilt)
- [ ] Success criteria set before testing (no moving goalposts)
- [ ] Recruited real target users (not friends/family)
- [ ] Sample size appropriate (n=5-10 qualitative, n=100+ quantitative)
- [ ] Measuring behavior (clicks, conversions), not just opinions
- [ ] Clear decision rule (persevere/pivot/iterate thresholds)
- [ ] Results documented and shared with team