{
"name": "Metrics Tree Evaluator",
"description": "Evaluate metrics trees for North Star selection, decomposition quality, causal clarity, and actionability. Assess whether the metrics tree will drive effective decision-making and experimentation.",
"version": "1.0.0",
"criteria": [
{
"name": "North Star Selection",
"description": "Evaluates whether the chosen North Star metric appropriately captures value and business success",
"weight": 1.3,
"scale": {
"1": {
"label": "Poor North Star choice",
"description": "Vanity metric (registered users, pageviews) that doesn't reflect value delivered or business health. Not actionable or measurable."
},
"2": {
"label": "Weak North Star",
"description": "Metric somewhat related to value but indirect or lagging. Example: Revenue for early-stage product (reflects pricing not product-market fit)."
},
"3": {
"label": "Acceptable North Star",
"description": "Metric captures some value but missing key criteria. For example, measures usage but not business model alignment, or actionable but not predictive of revenue."
},
"4": {
"label": "Good North Star",
"description": "Metric captures value delivered to customers, is measurable and actionable, but relationship to business success could be stronger or rationale could be clearer."
},
"5": {
"label": "Excellent North Star",
"description": "Metric perfectly captures value delivered to customers, predicts business success (revenue/retention), is measurable and actionable by teams. Clear rationale provided. Examples: Slack's 'teams sending 100+ messages/week', Airbnb's 'nights booked'."
}
}
},
{
"name": "Decomposition Completeness",
"description": "Evaluates whether North Star is fully decomposed into mutually exclusive, collectively exhaustive drivers",
"weight": 1.2,
"scale": {
"1": {
"label": "No decomposition",
"description": "North Star stated but not broken down into component drivers. No input metrics (L2)."
},
"2": {
"label": "Incomplete decomposition",
"description": "1-2 input metrics identified but major drivers missing. Components overlap (not mutually exclusive) or gaps exist (not collectively exhaustive)."
},
"3": {
"label": "Basic decomposition",
"description": "3-5 input metrics cover major drivers but some gaps or overlaps exist. Mathematical relationship unclear (additive vs multiplicative)."
},
"4": {
"label": "Complete decomposition",
"description": "3-5 input metrics are mutually exclusive and collectively exhaustive. Clear mathematical relationship (e.g., sum or product). Minor gaps acceptable."
},
"5": {
"label": "Rigorous decomposition",
"description": "3-5 input metrics provably decompose North Star with explicit formula. MECE (mutually exclusive, collectively exhaustive). Each input can be owned by a team. Validated with data that components sum/multiply to North Star."
}
}
},
{
"name": "Causal Clarity",
"description": "Evaluates whether causal relationships between metrics are clearly specified and validated",
"weight": 1.2,
"scale": {
"1": {
"label": "No causal reasoning",
"description": "Metrics listed without explaining how they relate to each other or to North Star."
},
"2": {
"label": "Assumed causation",
"description": "Relationships implied but not validated. Possible confusion between correlation and causation. Direction unclear (does A cause B or B cause A?)."
},
"3": {
"label": "Plausible causation",
"description": "Causal relationships stated with reasoning but not validated with data. Direction clear. Lag times not specified."
},
"4": {
"label": "Validated causation",
"description": "Causal relationships supported by correlation data or past experiments. Direction and approximate lag times specified. Some relationships tested."
},
"5": {
"label": "Proven causation",
"description": "Causal relationships validated through experiments or strong observational data (cohort analysis, regression). Effect sizes quantified (e.g., 10% increase in X → 5% increase in Y). Lag times specified. Confounds controlled."
}
}
},
{
"name": "Actionability",
"description": "Evaluates whether metrics can actually be moved by teams through specific actions",
"weight": 1.1,
"scale": {
"1": {
"label": "Not actionable",
"description": "Metrics are outcomes outside team control (market conditions, competitor actions) or too abstract to act on."
},
"2": {
"label": "Weakly actionable",
"description": "Metrics are high-level (e.g., 'engagement') without specific user behaviors identified. Teams unsure what to do."
},
"3": {
"label": "Moderately actionable",
"description": "Some action metrics (L3) identified but not comprehensive. Clear which metrics each team owns but specific actions to move them are vague."
},
"4": {
"label": "Actionable",
"description": "Action metrics (L3) specified as concrete user behaviors for each input metric. Teams know what actions to encourage. Current rates measured."
},
"5": {
"label": "Highly actionable",
"description": "Action metrics are specific, observable behaviors with clear measurement (events tracked). Each input metric has 3-5 actions identified. Teams have explicit experiments to test moving actions. Ownership clear."
}
}
},
{
"name": "Leading Indicator Quality",
"description": "Evaluates whether true leading indicators are identified that predict North Star movement",
"weight": 1.0,
"scale": {
"1": {
"label": "No leading indicators",
"description": "Only lagging indicators provided (same time or after North Star changes)."
},
"2": {
"label": "Weak leading indicators",
"description": "Indicators proposed but timing unclear (do they actually predict?) or correlation weak/untested."
},
"3": {
"label": "Plausible leading indicators",
"description": "2-3 indicators identified that logically should predict North Star. Timing estimates provided but not validated. Correlation not measured."
},
"4": {
"label": "Validated leading indicators",
"description": "2-3 leading indicators with timing specified (e.g., 'predicts 7-day retention') and correlation measured (r > 0.6). Tested on historical data."
},
"5": {
"label": "High-quality leading indicators",
"description": "2-4 leading indicators with proven predictive power (r > 0.7), clear timing (days/weeks ahead), and actionable (teams can move them). Includes propensity models or cohort analysis showing predictive strength."
}
}
},
{
"name": "Prioritization Rigor",
"description": "Evaluates whether experiments and opportunities are prioritized using sound reasoning",
"weight": 1.0,
"scale": {
"1": {
"label": "No prioritization",
"description": "Metrics and experiments listed without ranking or rationale."
},
"2": {
"label": "Subjective prioritization",
"description": "Ranking provided but based on gut feel or opinion without framework or data."
},
"3": {
"label": "Framework-based prioritization",
"description": "ICE or RICE framework applied but scores are estimates without data support. Top 3 experiments identified."
},
"4": {
"label": "Data-informed prioritization",
"description": "ICE/RICE scores based on historical data or analysis. Impact estimates grounded in past experiments or correlations. Top 1-3 experiments have clear hypotheses and success criteria."
},
"5": {
"label": "Rigorous prioritization",
"description": "ICE/RICE scores validated with data. Tradeoffs considered (e.g., impact vs effort, short-term vs long-term). Sensitivity analysis performed (\"what if impact is half?\"). Top experiments have quantified hypotheses, clear metrics, and decision criteria. Portfolio approach if multiple experiments."
}
}
},
{
"name": "Guardrails & Counter-Metrics",
"description": "Evaluates whether risks, tradeoffs, and negative externalities are considered",
"weight": 0.9,
"scale": {
"1": {
"label": "No risk consideration",
"description": "Only positive metrics. No mention of potential downsides, gaming, or tradeoffs."
},
"2": {
"label": "Risks mentioned",
"description": "Potential issues noted but no concrete counter-metrics or guardrails defined."
},
"3": {
"label": "Some guardrails",
"description": "1-2 counter-metrics identified (e.g., quality, satisfaction) but no thresholds set. Tradeoffs acknowledged but not quantified."
},
"4": {
"label": "Clear guardrails",
"description": "2-4 counter-metrics with minimum acceptable thresholds (e.g., NPS must stay ≥40). Gaming risks identified. Monitoring plan included."
},
"5": {
"label": "Comprehensive risk framework",
"description": "Counter-metrics for each major risk (quality, trust, satisfaction, ecosystem health). Guardrail thresholds set based on data or policy. Gaming prevention mechanisms specified. Tradeoff analysis included (e.g., short-term growth vs long-term retention)."
}
}
},
{
"name": "Overall Usefulness",
"description": "Evaluates whether the metrics tree will effectively guide decision-making and experimentation",
"weight": 1.0,
"scale": {
"1": {
"label": "Not useful",
"description": "Missing critical components or so flawed that teams cannot use it for decisions."
},
"2": {
"label": "Limited usefulness",
"description": "Provides some structure but too many gaps, unclear relationships, or impractical to implement."
},
"3": {
"label": "Moderately useful",
"description": "Covers basics (North Star, input metrics, some actions) but lacks depth in actionability or prioritization. Teams can use it with significant additional work."
},
"4": {
"label": "Useful",
"description": "Complete metrics tree with clear structure. Teams can identify what to measure, understand relationships, and select experiments. Minor improvements needed."
},
"5": {
"label": "Highly useful",
"description": "Decision-ready artifact. Teams can immediately use it to align on goals, prioritize experiments, instrument dashboards, and make metric-driven decisions. Well-documented assumptions and data gaps. Review cadence specified."
}
}
}
],
"guidance": {
"by_business_model": {
"saas_subscription": {
"north_star_options": "MRR, WAU/MAU for engaged users, Net Revenue Retention (NRR) for mature",
"key_inputs": "New users, retained users, expansion revenue, churn",
"leading_indicators": "Activation rate, feature adoption, usage frequency, product qualified leads (PQLs)",
"guardrails": "Customer satisfaction (NPS/CSAT), support ticket volume, technical reliability"
},
"marketplace": {
"north_star_options": "GMV, successful transactions, nights booked (supply × demand balanced metric)",
"key_inputs": "Supply-side (active suppliers), demand-side (buyers/searches), match rate/liquidity",
"leading_indicators": "New supplier activation, buyer intent signals, supply utilization rate",
"guardrails": "Supply/demand balance ratio, trust/safety metrics, quality scores"
},
"ecommerce": {
"north_star_options": "Revenue, orders per customer, customer LTV",
"key_inputs": "Traffic, conversion rate, AOV, repeat purchase rate",
"leading_indicators": "Add-to-cart rate, wishlist additions, email engagement, product page depth",
"guardrails": "Return rate, customer satisfaction, shipping time, product quality ratings"
},
"social_content": {
"north_star_options": "Engaged time, content created and consumed, network density (connections per user)",
"key_inputs": "Content creation rate, content consumption, social interactions, retention",
"leading_indicators": "Profile completion, first content post, first social interaction, 7-day activation",
"guardrails": "Content quality, user wellbeing, toxicity/moderation metrics, creator retention"
},
"mobile_app": {
"north_star_options": "DAU (for high-frequency) or WAU (for moderate-frequency), session frequency × duration",
"key_inputs": "New installs, activated users, retained users, resurrected users",
"leading_indicators": "Day 1 retention, tutorial completion, push notification opt-in, first core action",
"guardrails": "App rating, uninstall rate, crash-free rate, user-reported satisfaction"
}
},
"by_stage": {
"pre_pmf": {
"focus": "Finding product-market fit through retention and satisfaction signals",
"north_star": "Week-over-week retention (>40% is strong signal)",
"key_metrics": "Retention curves, NPS, 'very disappointed' score (>40%), organic usage frequency",
"experiments": "Rapid iteration on core value prop, onboarding, early activation"
},
"post_pmf_pre_scale": {
"focus": "Validating unit economics and early growth loops",
"north_star": "New activated users per week or month",
"key_metrics": "LTV/CAC ratio (>3), payback period (<12 months), month-over-month growth (>10%)",
"experiments": "Channel optimization, conversion funnel improvements, early retention tactics"
},
"growth": {
"focus": "Efficient scaling of acquisition, activation, and retention",
"north_star": "Revenue, ARR, or transaction volume",
"key_metrics": "Net revenue retention (>100%), magic number (>0.75), efficient growth",
"experiments": "Systematic A/B testing, multi-channel optimization, retention programs, expansion revenue"
},
"maturity": {
"focus": "Profitability, market share, operational efficiency",
"north_star": "Free cash flow, EBITDA, or market share",
"key_metrics": "Operating margin (>20%), customer concentration, competitive position",
"experiments": "Operational efficiency, new market expansion, product line extension, M&A"
}
}
},
"common_failure_modes": {
"vanity_north_star": "Chose metric that looks good but doesn't reflect value (total registered users, app downloads). Fix: Select metric tied to usage and business model.",
"incomplete_decomposition": "Input metrics don't fully explain North Star. Missing key drivers. Fix: Validate that inputs sum/multiply to North Star mathematically.",
"correlation_not_causation": "Assumed causation without validation. Metrics move together but one doesn't cause the other. Fix: Run experiments or use causal inference methods.",
"not_actionable": "Metrics are outcomes without clear actions. Teams don't know what to do. Fix: Add action metrics (L3) as specific user behaviors.",
"no_leading_indicators": "Only lagging metrics that react slowly. Can't make proactive decisions. Fix: Find early signals through cohort analysis or propensity modeling.",
"ignoring_tradeoffs": "Optimizing one metric hurts another. No guardrails set. Fix: Add counter-metrics with minimum thresholds.",
"gaming_risk": "Metric can be easily gamed without delivering real value. Fix: Add quality signals and combination metrics.",
"no_prioritization": "Too many metrics to focus on. No clear experiments. Fix: Use ICE/RICE framework to rank top 1-3 experiments."
},
"excellence_indicators": [
"North Star clearly captures value delivered to customers and predicts business success with explicit rationale",
"Decomposition is provably MECE (mutually exclusive, collectively exhaustive) with mathematical formula",
"Causal relationships validated through experiments or strong observational data with effect sizes quantified",
"Each input metric has 3-5 specific action metrics (observable user behaviors) with measurement defined",
"2-4 leading indicators identified with proven predictive power (r > 0.7) and clear timing",
"Top 1-3 experiments prioritized using data-informed ICE/RICE scores with quantified hypotheses",
"Counter-metrics and guardrails defined for major risks (quality, gaming, ecosystem health) with thresholds",
"Assumptions documented, data gaps identified, review cadence specified",
"Metrics tree diagram clearly shows relationships and hierarchy",
"Decision-ready artifact that teams can immediately use for alignment and experimentation"
],
"evaluation_notes": {
"scoring": "Calculate weighted average across all criteria. Minimum passing score: 3.0 (basic quality). Production-ready target: 3.5+. Excellence threshold: 4.2+.",
"context": "Adjust expectations based on business stage, data availability, and complexity. Early-stage with limited data may score 3.0-3.5 and be acceptable. Growth-stage with resources should target 4.0+.",
"iteration": "Low scores indicate specific improvement areas. Prioritize fixing North Star selection and causal clarity first (highest weights), then improve actionability and prioritization. Revalidate after changes."
}
}
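
The scoring rule in `evaluation_notes` (a weighted average across all criteria, with thresholds at 3.0, 3.5, and 4.2) can be applied mechanically against the weights defined above. Below is a minimal sketch, not part of the rubric file itself, of how an evaluator script might compute that score; the file path and the `ratings` values are assumptions for illustration only.

```python
import json

# Load the rubric (path assumed; adjust to wherever this file lives in the repo).
with open("rubric_metrics_tree.json") as f:
    rubric = json.load(f)

# Hypothetical 1-5 ratings, keyed by criterion name exactly as defined in the rubric.
ratings = {
    "North Star Selection": 4,
    "Decomposition Completeness": 3,
    "Causal Clarity": 3,
    "Actionability": 4,
    "Leading Indicator Quality": 3,
    "Prioritization Rigor": 4,
    "Guardrails & Counter-Metrics": 3,
    "Overall Usefulness": 4,
}

# Weighted average per evaluation_notes: sum(weight * score) / sum(weight).
total_weight = sum(c["weight"] for c in rubric["criteria"])
weighted_score = sum(
    c["weight"] * ratings[c["name"]] for c in rubric["criteria"]
) / total_weight

print(f"Weighted score: {weighted_score:.2f}")
print("Passes minimum (3.0):", weighted_score >= 3.0)
print("Production-ready (3.5+):", weighted_score >= 3.5)
print("Excellence (4.2+):", weighted_score >= 4.2)
```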