Initial commit

Zhongwei Li
2025-11-30 08:43:40 +08:00
commit d6cdda3f30
13 changed files with 4664 additions and 0 deletions


@@ -0,0 +1,345 @@
#!/usr/bin/env python3
"""
Databento OHLCV Data Fetcher
Standard pattern for fetching OHLCV data with built-in best practices:
- Automatic cost estimation before fetch
- Error handling with retries
- Post-fetch data validation
- Export options (CSV/JSON)
Usage:
python fetch_ohlcv.py --symbol ES.c.0 --schema ohlcv-1h --start 2024-01-01 --end 2024-01-31
python fetch_ohlcv.py --symbol NQ.c.0 --schema ohlcv-1d --start 2024-01-01 --limit 100
python fetch_ohlcv.py --symbol ES.c.0,NQ.c.0 --schema ohlcv-1h --start 2024-01-01 --output data.csv
"""
import argparse
import json
import sys
from datetime import datetime
from typing import Optional, Dict, Any, List
import time
class DatabentoOHLCVFetcher:
"""Fetches OHLCV data from Databento with best practices built-in."""
def __init__(self, dataset: str = "GLBX.MDP3", stype_in: str = "continuous"):
"""
Initialize fetcher.
Args:
dataset: Dataset code (default: GLBX.MDP3 for ES/NQ)
stype_in: Input symbol type (default: continuous)
"""
self.dataset = dataset
self.stype_in = stype_in
self.max_retries = 3
self.retry_delay = 2 # seconds
def estimate_cost(
self,
symbols: str,
schema: str,
start: str,
end: Optional[str] = None
) -> Dict[str, Any]:
"""
Estimate cost before fetching data.
Args:
symbols: Comma-separated symbol list
schema: Data schema (e.g., ohlcv-1h)
start: Start date (YYYY-MM-DD)
end: End date (optional)
Returns:
Cost estimation result
"""
print(f"[COST CHECK] Estimating cost for {symbols} ({schema})...")
# NOTE: In actual usage, this would call the MCP tool:
# mcp__databento__metadata_get_cost(
# dataset=self.dataset,
# start=start,
# end=end,
# symbols=symbols,
# schema=schema,
# stype_in=self.stype_in
# )
# For this template, we simulate the response
print("[NOTE] This template script demonstrates the pattern.")
print("[NOTE] In actual usage, integrate with MCP tools directly.")
return {
"estimated_cost_usd": 0.0,
"estimated_size_mb": 0.0,
"note": "Call mcp__databento__metadata_get_cost here"
}
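# HEDGED SKETCH: if you use the official databento Python client instead of the
# MCP tool noted above (an assumption of this sketch, not part of the template;
# requires `pip install databento` and a DATABENTO_API_KEY environment variable),
# the same check could look roughly like:
#
#   import databento as db
#   client = db.Historical()  # reads DATABENTO_API_KEY from the environment
#   cost_usd = client.metadata.get_cost(
#       dataset=self.dataset,
#       symbols=symbols,
#       schema=schema,
#       start=start,
#       end=end,
#   )
#   return {"estimated_cost_usd": cost_usd}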
def validate_dataset_range(self) -> Dict[str, str]:
"""
Validate dataset availability.
Returns:
Dataset date range
"""
print(f"[VALIDATION] Checking dataset availability for {self.dataset}...")
# NOTE: In actual usage, this would call:
# mcp__databento__metadata_get_dataset_range(dataset=self.dataset)
return {
"start_date": "2000-01-01",
"end_date": datetime.now().strftime("%Y-%m-%d"),
"note": "Call mcp__databento__metadata_get_dataset_range here"
}
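# HEDGED SKETCH: with the official databento client (same assumption as above),
# the range lookup is roughly:
#
#   import databento as db
#   client = db.Historical()
#   available = client.metadata.get_dataset_range(dataset=self.dataset)
#   # `available` reports the earliest and latest dates covered by the dataset,
#   # which can be compared against the requested start/end before fetching.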
def fetch_data(
self,
symbols: str,
schema: str,
start: str,
end: Optional[str] = None,
limit: Optional[int] = None,
check_cost: bool = True
) -> Dict[str, Any]:
"""
Fetch OHLCV data with retries and error handling.
Args:
symbols: Comma-separated symbol list
schema: Data schema (e.g., ohlcv-1h, ohlcv-1d)
start: Start date (YYYY-MM-DD)
end: End date (optional)
limit: Maximum number of records (optional)
check_cost: Whether to check cost before fetching (default: True)
Returns:
Fetched data
"""
# Step 1: Cost check (if enabled)
if check_cost:
cost_info = self.estimate_cost(symbols, schema, start, end)
print(f"[COST] Estimated cost: ${cost_info.get('estimated_cost_usd', 0):.2f}")
print(f"[COST] Estimated size: {cost_info.get('estimated_size_mb', 0):.2f} MB")
# Prompt for confirmation if cost is high
estimated_cost = cost_info.get('estimated_cost_usd', 0)
if estimated_cost > 10:
response = input(f"\nEstimated cost is ${estimated_cost:.2f}. Continue? (y/n): ")
if response.lower() != 'y':
print("[CANCELLED] Data fetch cancelled by user.")
sys.exit(0)
# Step 2: Validate dataset
dataset_range = self.validate_dataset_range()
print(f"[DATASET] Available range: {dataset_range.get('start_date')} to {dataset_range.get('end_date')}")
# Step 3: Fetch data with retries
for attempt in range(self.max_retries):
try:
print(f"\n[FETCH] Attempt {attempt + 1}/{self.max_retries}")
print(f"[FETCH] Fetching {symbols} ({schema}) from {start} to {end or 'now'}...")
# NOTE: In actual usage, this would call:
# data = mcp__databento__timeseries_get_range(
# dataset=self.dataset,
# symbols=symbols,
# schema=schema,
# start=start,
# end=end,
# stype_in=self.stype_in,
# stype_out="instrument_id",
# limit=limit
# )
# Simulate successful fetch
print("[SUCCESS] Data fetched successfully!")
return {
"data": [],
"record_count": 0,
"note": "Call mcp__databento__timeseries_get_range here"
}
except Exception as e:
print(f"[ERROR] Attempt {attempt + 1} failed: {str(e)}")
if attempt < self.max_retries - 1:
print(f"[RETRY] Waiting {self.retry_delay} seconds before retry...")
time.sleep(self.retry_delay)
else:
print("[FAILED] All retry attempts exhausted.")
raise
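# HEDGED SKETCH: the real fetch, using the official databento client in place of
# the MCP call noted above (assumes `pip install databento` and DATABENTO_API_KEY):
#
#   import databento as db
#   client = db.Historical()
#   store = client.timeseries.get_range(
#       dataset=self.dataset,
#       symbols=symbols,          # list or comma-separated string
#       schema=schema,            # e.g. "ohlcv-1h"
#       start=start,
#       end=end,
#       stype_in=self.stype_in,   # e.g. "continuous" for ES.c.0 / NQ.c.0
#       limit=limit,
#   )
#   df = store.to_df()            # pandas DataFrame, one row per bar
#   return {"data": df.to_dict("records"), "record_count": len(df)}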
def validate_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate fetched data quality.
Args:
data: Fetched data
Returns:
Validation report
"""
print("\n[VALIDATION] Running data quality checks...")
# NOTE: Actual validation would:
# - Check for timestamp gaps
# - Verify record counts
# - Validate price ranges
# - Check for duplicates
# Use scripts/validate_data.py for comprehensive validation
return {
"valid": True,
"record_count": data.get("record_count", 0),
"issues": [],
"note": "Use scripts/validate_data.py for detailed validation"
}
def export_csv(self, data: Dict[str, Any], output_path: str):
"""
Export data to CSV.
Args:
data: Data to export
output_path: Output file path
"""
print(f"\n[EXPORT] Saving data to {output_path}...")
# NOTE: Actual export would convert data to CSV format
# and write to file
print(f"[SUCCESS] Data saved to {output_path}")
def export_json(self, data: Dict[str, Any], output_path: str):
"""
Export data to JSON.
Args:
data: Data to export
output_path: Output file path
"""
print(f"\n[EXPORT] Saving data to {output_path}...")
with open(output_path, 'w') as f:
json.dump(data, f, indent=2)
print(f"[SUCCESS] Data saved to {output_path}")
def main():
"""Main entry point for CLI usage."""
parser = argparse.ArgumentParser(
description="Fetch OHLCV data from Databento with best practices"
)
parser.add_argument(
"--symbol",
"-s",
required=True,
help="Symbol or comma-separated symbols (e.g., ES.c.0 or ES.c.0,NQ.c.0)"
)
parser.add_argument(
"--schema",
choices=["ohlcv-1s", "ohlcv-1m", "ohlcv-1h", "ohlcv-1d", "ohlcv-eod"],
default="ohlcv-1h",
help="OHLCV schema (default: ohlcv-1h)"
)
parser.add_argument(
"--start",
required=True,
help="Start date (YYYY-MM-DD)"
)
parser.add_argument(
"--end",
help="End date (YYYY-MM-DD, optional)"
)
parser.add_argument(
"--limit",
type=int,
help="Maximum number of records (optional)"
)
parser.add_argument(
"--dataset",
default="GLBX.MDP3",
help="Dataset code (default: GLBX.MDP3)"
)
parser.add_argument(
"--stype-in",
default="continuous",
choices=["continuous", "raw_symbol", "instrument_id"],
help="Input symbol type (default: continuous)"
)
parser.add_argument(
"--output",
"-o",
help="Output file path (CSV or JSON based on extension)"
)
parser.add_argument(
"--no-cost-check",
action="store_true",
help="Skip cost estimation (not recommended)"
)
args = parser.parse_args()
# Create fetcher
fetcher = DatabentoOHLCVFetcher(
dataset=args.dataset,
stype_in=args.stype_in
)
try:
# Fetch data
data = fetcher.fetch_data(
symbols=args.symbol,
schema=args.schema,
start=args.start,
end=args.end,
limit=args.limit,
check_cost=not args.no_cost_check
)
# Validate data
validation = fetcher.validate_data(data)
print(f"\n[VALIDATION] Data is valid: {validation['valid']}")
print(f"[VALIDATION] Record count: {validation['record_count']}")
if validation['issues']:
print(f"[WARNING] Issues found: {validation['issues']}")
# Export if output specified
if args.output:
if args.output.endswith('.csv'):
fetcher.export_csv(data, args.output)
elif args.output.endswith('.json'):
fetcher.export_json(data, args.output)
else:
print("[WARNING] Unknown output format. Saving as JSON.")
fetcher.export_json(data, args.output + '.json')
print("\n[DONE] Fetch complete!")
except KeyboardInterrupt:
print("\n[CANCELLED] Fetch cancelled by user.")
sys.exit(1)
except Exception as e:
print(f"\n[ERROR] Fetch failed: {str(e)}")
sys.exit(1)
if __name__ == "__main__":
main()


@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
Databento Trading Session Filter
Filter market data by trading session (Asian/London/NY):
- Session detection using get_session_info
- Historical data filtering by session
- Session transition handling
- Session-specific statistics
Usage:
python session_filter.py --input data.json --session NY --output ny_session.json
python session_filter.py --input data.json --session London --stats
python session_filter.py --input data.json --sessions Asian,London --output combined.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone, timedelta
from typing import Dict, List, Any, Optional, Tuple
from enum import Enum
class TradingSession(Enum):
"""Trading session definitions (in ET)."""
ASIAN = ("Asian", 18, 2) # 6pm - 2am ET
LONDON = ("London", 2, 8) # 2am - 8am ET
NY = ("NY", 8, 16) # 8am - 4pm ET
class SessionFilter:
"""Filters Databento market data by trading session."""
def __init__(self):
"""Initialize session filter."""
self.sessions = {
"Asian": TradingSession.ASIAN,
"London": TradingSession.LONDON,
"NY": TradingSession.NY
}
def get_current_session(self, timestamp: Optional[str] = None) -> str:
"""
Get trading session for a timestamp.
Args:
timestamp: ISO timestamp (optional, defaults to now)
Returns:
Session name (Asian, London, or NY)
"""
# NOTE: In actual usage, this would call:
# session_info = mcp__databento__get_session_info(timestamp=timestamp)
# return session_info["session"]
# For this template, simulate session detection
if timestamp:
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
else:
dt = datetime.now(timezone.utc)
# Convert to ET
et_hour = (dt.hour - 5) % 24 # Simplified ET conversion
# Determine session
if 18 <= et_hour or et_hour < 2:
return "Asian"
elif 2 <= et_hour < 8:
return "London"
else:
return "NY"
def is_in_session(
self,
timestamp_ns: int,
session: TradingSession
) -> bool:
"""
Check if timestamp falls within trading session.
Args:
timestamp_ns: Timestamp in nanoseconds
session: Trading session to check
Returns:
True if timestamp is in session
"""
# Convert nanoseconds to datetime
ts_seconds = timestamp_ns / 1_000_000_000
dt = datetime.fromtimestamp(ts_seconds, tz=timezone.utc)
# Convert to ET (simplified, doesn't handle DST)
et_offset = timedelta(hours=-5)
dt_et = dt + et_offset
hour = dt_et.hour
# Check if hour falls within session
_, start_hour, end_hour = session.value
if start_hour < end_hour:
# Session doesn't cross midnight
return start_hour <= hour < end_hour
else:
# Session crosses midnight (Asian session)
return hour >= start_hour or hour < end_hour
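# HEDGED SKETCH: the fixed -5h offset above ignores daylight saving time. A
# DST-aware variant using the standard-library zoneinfo module (Python 3.9+)
# would look roughly like this (illustrative helper name, not used elsewhere):
#
#   from zoneinfo import ZoneInfo
#
#   def is_in_session_dst(self, timestamp_ns: int, session: TradingSession) -> bool:
#       dt = datetime.fromtimestamp(timestamp_ns / 1_000_000_000, tz=timezone.utc)
#       hour = dt.astimezone(ZoneInfo("America/New_York")).hour
#       _, start_hour, end_hour = session.value
#       if start_hour < end_hour:
#           return start_hour <= hour < end_hour
#       return hour >= start_hour or hour < end_hour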
def filter_by_session(
self,
data: List[Dict[str, Any]],
sessions: List[str]
) -> List[Dict[str, Any]]:
"""
Filter data to include only specified sessions.
Args:
data: List of records
sessions: List of session names to include
Returns:
Filtered data
"""
print(f"[FILTER] Filtering {len(data)} records for sessions: {', '.join(sessions)}")
session_enums = [self.sessions[s] for s in sessions]
filtered = []
for record in data:
# Extract timestamp
ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
if not ts_ns:
continue
# Check if in any of the specified sessions
for session in session_enums:
if self.is_in_session(int(ts_ns), session):
filtered.append(record)
break
print(f"[FILTER] Kept {len(filtered)} records ({len(filtered)/len(data)*100:.1f}%)")
return filtered
def calculate_session_stats(
self,
data: List[Dict[str, Any]]
) -> Dict[str, Any]:
"""
Calculate statistics by trading session.
Args:
data: List of records
Returns:
Session statistics
"""
print(f"[STATS] Calculating session statistics for {len(data)} records...")
stats = {
"Asian": {"count": 0, "volume": 0, "trades": 0},
"London": {"count": 0, "volume": 0, "trades": 0},
"NY": {"count": 0, "volume": 0, "trades": 0}
}
for record in data:
ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
if not ts_ns:
continue
# Determine session
for session_name, session_enum in self.sessions.items():
if self.is_in_session(int(ts_ns), session_enum):
stats[session_name]["count"] += 1
# Add volume if available
if "volume" in record:
stats[session_name]["volume"] += record["volume"]
# Count trades
if "size" in record: # Trade record
stats[session_name]["trades"] += 1
break
# Calculate percentages
total_count = sum(s["count"] for s in stats.values())
for session_stats in stats.values():
if total_count > 0:
session_stats["percentage"] = (session_stats["count"] / total_count) * 100
else:
session_stats["percentage"] = 0
return stats
def filter_session_transitions(
self,
data: List[Dict[str, Any]],
minutes_before: int = 30,
minutes_after: int = 30
) -> List[Dict[str, Any]]:
"""
Filter data to include only session transitions (handoffs).
Args:
data: List of records
minutes_before: Minutes before transition to include
minutes_after: Minutes after transition to include
Returns:
Filtered data around session transitions
"""
print(f"[FILTER] Extracting session transitions ({minutes_before}m before, {minutes_after}m after)...")
# Session transition times (in ET)
transitions = [
2, # Asian → London (2am ET)
8, # London → NY (8am ET)
16, # NY → Post-market
18, # Post-market → Asian (6pm ET)
]
filtered = []
for record in data:
ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
if not ts_ns:
continue
# Convert to ET hour
ts_seconds = int(ts_ns) / 1_000_000_000
dt = datetime.fromtimestamp(ts_seconds, tz=timezone.utc)
et_offset = timedelta(hours=-5)
dt_et = dt + et_offset
# Check if near any transition
for transition_hour in transitions:
transition_dt = dt_et.replace(hour=transition_hour, minute=0, second=0, microsecond=0)
# Signed offset from the transition (negative = before it, positive = after it)
offset_seconds = (dt_et - transition_dt).total_seconds()
# Include if within the asymmetric window around the transition
if -minutes_before * 60 <= offset_seconds <= minutes_after * 60:
filtered.append(record)
break
print(f"[FILTER] Found {len(filtered)} records near session transitions")
return filtered
def print_session_stats(self, stats: Dict[str, Any]):
"""Print session statistics to console."""
print("\n" + "=" * 60)
print("SESSION STATISTICS")
print("=" * 60)
for session_name in ["Asian", "London", "NY"]:
session_stats = stats[session_name]
print(f"\n{session_name} Session:")
print(f" Records: {session_stats['count']:,} ({session_stats['percentage']:.1f}%)")
if session_stats['volume'] > 0:
print(f" Volume: {session_stats['volume']:,}")
if session_stats['trades'] > 0:
print(f" Trades: {session_stats['trades']:,}")
print("\n" + "=" * 60)
def main():
"""Main entry point for CLI usage."""
parser = argparse.ArgumentParser(
description="Filter Databento data by trading session"
)
parser.add_argument(
"--input",
"-i",
required=True,
help="Input data file (JSON)"
)
parser.add_argument(
"--session",
"--sessions",
help="Session(s) to filter (Asian, London, NY). Comma-separated for multiple."
)
parser.add_argument(
"--transitions",
action="store_true",
help="Filter for session transition periods only"
)
parser.add_argument(
"--minutes-before",
type=int,
default=30,
help="Minutes before transition (default: 30)"
)
parser.add_argument(
"--minutes-after",
type=int,
default=30,
help="Minutes after transition (default: 30)"
)
parser.add_argument(
"--stats",
action="store_true",
help="Calculate and display session statistics"
)
parser.add_argument(
"--output",
"-o",
help="Output file for filtered data (JSON)"
)
args = parser.parse_args()
# Load data
print(f"[LOAD] Loading data from {args.input}...")
with open(args.input, 'r') as f:
data = json.load(f)
# Handle different data formats
if isinstance(data, dict) and "data" in data:
data = data["data"]
# Create filter
session_filter = SessionFilter()
# Calculate stats if requested
if args.stats:
stats = session_filter.calculate_session_stats(data)
session_filter.print_session_stats(stats)
# Filter data
filtered_data = data
if args.transitions:
# Filter for session transitions
filtered_data = session_filter.filter_session_transitions(
filtered_data,
minutes_before=args.minutes_before,
minutes_after=args.minutes_after
)
elif args.session:
# Filter by specific session(s)
sessions = [s.strip() for s in args.session.split(',')]
# Validate sessions
for session in sessions:
if session not in ["Asian", "London", "NY"]:
print(f"[ERROR] Invalid session: {session}")
print("[ERROR] Valid sessions: Asian, London, NY")
sys.exit(1)
filtered_data = session_filter.filter_by_session(filtered_data, sessions)
# Save filtered data if output specified
if args.output:
print(f"\n[SAVE] Saving {len(filtered_data)} filtered records to {args.output}...")
output_data = {
"data": filtered_data,
"metadata": {
"original_count": len(data),
"filtered_count": len(filtered_data),
"filter_type": "transitions" if args.transitions else "sessions",
"sessions": args.session.split(',') if args.session else None
}
}
with open(args.output, 'w') as f:
json.dump(output_data, f, indent=2)
print(f"[SUCCESS] Filtered data saved!")
print("\n[DONE] Session filtering complete!")
if __name__ == "__main__":
main()


@@ -0,0 +1,496 @@
#!/usr/bin/env python3
"""
Databento Data Quality Validator
Validates market data quality to catch issues early:
- Timestamp gap detection
- Record count verification
- Price range validation (no negative prices, outliers)
- Duplicate timestamp detection
- Summary quality report
Usage:
python validate_data.py --input data.json
python validate_data.py --input data.csv --schema ohlcv-1h
python validate_data.py --input data.json --max-gap-minutes 60 --report report.json
"""
import argparse
import json
import sys
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple
from collections import defaultdict
class DataValidator:
"""Validates Databento market data quality."""
def __init__(
self,
schema: str,
max_gap_minutes: int = 60,
price_outlier_std: float = 10.0
):
"""
Initialize validator.
Args:
schema: Data schema (ohlcv-1h, trades, mbp-1, etc.)
max_gap_minutes: Maximum acceptable gap in minutes
price_outlier_std: Standard deviations for outlier detection
"""
self.schema = schema
self.max_gap_seconds = max_gap_minutes * 60
self.price_outlier_std = price_outlier_std
self.issues: List[Dict[str, Any]] = []
def validate(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Run all validation checks on data.
Args:
data: List of records to validate
Returns:
Validation report
"""
print(f"[VALIDATION] Running quality checks on {len(data)} records...")
report = {
"total_records": len(data),
"valid": True,
"checks": {}
}
if not data:
print("[WARNING] No data to validate!")
report["valid"] = False
return report
# Run all validation checks
report["checks"]["timestamp_gaps"] = self.check_timestamp_gaps(data)
report["checks"]["duplicates"] = self.check_duplicates(data)
report["checks"]["price_range"] = self.check_price_range(data)
report["checks"]["record_count"] = self.check_record_count(data)
report["checks"]["data_completeness"] = self.check_completeness(data)
# Overall validity
report["valid"] = all(
check.get("valid", True)
for check in report["checks"].values()
)
report["issues"] = self.issues
return report
def check_timestamp_gaps(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Check for unexpected gaps in timestamps.
Args:
data: List of records
Returns:
Gap check report
"""
print("[CHECK] Checking for timestamp gaps...")
gaps = []
timestamps = self._extract_timestamps(data)
if len(timestamps) < 2:
return {"valid": True, "gaps": [], "note": "Insufficient data for gap detection"}
# Sort timestamps
sorted_ts = sorted(timestamps)
# Check gaps between consecutive timestamps
for i in range(len(sorted_ts) - 1):
gap_ns = sorted_ts[i + 1] - sorted_ts[i]
gap_seconds = gap_ns / 1_000_000_000
if gap_seconds > self.max_gap_seconds:
gap_info = {
"index": i,
"gap_seconds": gap_seconds,
"gap_minutes": gap_seconds / 60,
"before": self._format_timestamp(sorted_ts[i]),
"after": self._format_timestamp(sorted_ts[i + 1])
}
gaps.append(gap_info)
self.issues.append({
"type": "timestamp_gap",
"severity": "warning",
"message": f"Gap of {gap_seconds / 60:.1f} minutes detected",
**gap_info
})
valid = len(gaps) == 0
print(f"[CHECK] Found {len(gaps)} gaps > {self.max_gap_seconds / 60} minutes")
return {
"valid": valid,
"gaps_found": len(gaps),
"gaps": gaps[:10] if gaps else [], # Limit to first 10 for report
"total_gaps": len(gaps)
}
def check_duplicates(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Check for duplicate timestamps.
Args:
data: List of records
Returns:
Duplicate check report
"""
print("[CHECK] Checking for duplicate timestamps...")
timestamps = self._extract_timestamps(data)
timestamp_counts = defaultdict(int)
for ts in timestamps:
timestamp_counts[ts] += 1
duplicates = {ts: count for ts, count in timestamp_counts.items() if count > 1}
if duplicates:
for ts, count in list(duplicates.items())[:10]: # Limit to first 10
self.issues.append({
"type": "duplicate_timestamp",
"severity": "error",
"timestamp": self._format_timestamp(ts),
"count": count,
"message": f"Timestamp appears {count} times"
})
valid = len(duplicates) == 0
print(f"[CHECK] Found {len(duplicates)} duplicate timestamps")
return {
"valid": valid,
"duplicates_found": len(duplicates),
"duplicate_timestamps": len(duplicates)
}
def check_price_range(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Check for invalid or outlier prices.
Args:
data: List of records
Returns:
Price range check report
"""
print("[CHECK] Checking price ranges...")
prices = self._extract_prices(data)
if not prices:
return {"valid": True, "note": "No price data to validate"}
# Check for negative prices
negative_prices = [p for p in prices if p < 0]
# Check for zero prices (unusual for ES/NQ)
zero_prices = [p for p in prices if p == 0]
# Calculate statistics for outlier detection
if len(prices) > 1:
mean_price = sum(prices) / len(prices)
variance = sum((p - mean_price) ** 2 for p in prices) / len(prices)
std_dev = variance ** 0.5
# Detect outliers (> N standard deviations from mean)
outliers = []
for p in prices:
if abs(p - mean_price) > (self.price_outlier_std * std_dev):
outliers.append(p)
if len(outliers) <= 10: # Limit issues
self.issues.append({
"type": "price_outlier",
"severity": "warning",
"price": p,
"mean": mean_price,
"std_dev": std_dev,
"message": f"Price {p:.2f} is {abs(p - mean_price) / std_dev:.1f} std devs from mean"
})
else:
outliers = []
mean_price = prices[0] if prices else 0
std_dev = 0
# Report negative prices as errors
for p in negative_prices[:10]: # Limit to first 10
self.issues.append({
"type": "negative_price",
"severity": "error",
"price": p,
"message": f"Negative price detected: {p}"
})
valid = len(negative_prices) == 0 and len(zero_prices) == 0
print(f"[CHECK] Price range: {min(prices):.2f} to {max(prices):.2f}")
print(f"[CHECK] Negative prices: {len(negative_prices)}, Zero prices: {len(zero_prices)}, Outliers: {len(outliers)}")
return {
"valid": valid,
"min_price": min(prices),
"max_price": max(prices),
"mean_price": mean_price,
"std_dev": std_dev,
"negative_prices": len(negative_prices),
"zero_prices": len(zero_prices),
"outliers": len(outliers)
}
def check_record_count(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Verify expected record count.
Args:
data: List of records
Returns:
Record count check report
"""
print(f"[CHECK] Verifying record count: {len(data)} records")
# For OHLCV data, can estimate expected count based on timeframe
expected_count = self._estimate_expected_count(data)
valid = True
if expected_count and abs(len(data) - expected_count) > (expected_count * 0.1):
# More than 10% deviation
valid = False
self.issues.append({
"type": "unexpected_record_count",
"severity": "warning",
"actual": len(data),
"expected": expected_count,
"message": f"Expected ~{expected_count} records, got {len(data)}"
})
return {
"valid": valid,
"actual_count": len(data),
"expected_count": expected_count,
"note": "Expected count is estimated based on schema and date range"
}
def check_completeness(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Check data completeness (required fields present).
Args:
data: List of records
Returns:
Completeness check report
"""
print("[CHECK] Checking data completeness...")
if not data:
return {"valid": False, "note": "No data"}
# Check required fields based on schema
required_fields = self._get_required_fields()
missing_fields = defaultdict(int)
for record in data[:100]: # Sample first 100 records
for field in required_fields:
if field not in record or record[field] is None:
missing_fields[field] += 1
if missing_fields:
for field, count in missing_fields.items():
self.issues.append({
"type": "missing_field",
"severity": "error",
"field": field,
"missing_count": count,
"message": f"Field '{field}' missing in {count} records (sampled)"
})
valid = len(missing_fields) == 0
return {
"valid": valid,
"missing_fields": dict(missing_fields) if missing_fields else {}
}
def _extract_timestamps(self, data: List[Dict[str, Any]]) -> List[int]:
"""Extract timestamps from records."""
timestamps = []
for record in data:
# Try different timestamp field names
ts = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
if ts:
timestamps.append(int(ts))
return timestamps
def _extract_prices(self, data: List[Dict[str, Any]]) -> List[float]:
"""Extract prices from records."""
prices = []
for record in data:
# For OHLCV, use close price
if "close" in record:
# Convert from fixed-point if needed
price = record["close"]
if isinstance(price, int) and price > 1_000_000:
price = price / 1_000_000_000 # Fixed-point conversion
prices.append(float(price))
# For trades/mbp, use price field
elif "price" in record:
price = record["price"]
if isinstance(price, int) and price > 1_000_000:
price = price / 1_000_000_000
prices.append(float(price))
return prices
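# Databento encodes prices as fixed-point integers with 1e-9 scale, so e.g.
# 4_500_250_000_000 decodes to 4500.25. The `> 1_000_000` test above is only a
# heuristic to tell encoded integers from already-decoded floats; data exported
# via a DataFrame is usually decoded already.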
def _format_timestamp(self, ts_ns: int) -> str:
"""Format nanosecond timestamp to readable string."""
ts_seconds = ts_ns / 1_000_000_000
dt = datetime.fromtimestamp(ts_seconds)
return dt.strftime("%Y-%m-%d %H:%M:%S")
def _estimate_expected_count(self, data: List[Dict[str, Any]]) -> Optional[int]:
"""Estimate expected record count based on schema and date range."""
# This is a simplified estimation
# In practice, would calculate based on actual date range
if "ohlcv" in self.schema:
if "1h" in self.schema:
return None # ~24 records per day per symbol
elif "1d" in self.schema:
return None # ~1 record per day per symbol
return None
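# HEDGED SKETCH: a data-driven estimate derived from the observed timestamp span
# rather than hard-coded per-schema counts (assumes continuous trading; real
# futures sessions have daily and weekend breaks, so treat it as an upper bound):
#
#   timestamps = self._extract_timestamps(data)
#   if len(timestamps) >= 2:
#       span_seconds = (max(timestamps) - min(timestamps)) / 1_000_000_000
#       bar_seconds = {"ohlcv-1s": 1, "ohlcv-1m": 60, "ohlcv-1h": 3600,
#                      "ohlcv-1d": 86400}.get(self.schema)
#       if bar_seconds:
#           return int(span_seconds / bar_seconds) + 1
#   return None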
def _get_required_fields(self) -> List[str]:
"""Get required fields for schema."""
base_fields = ["ts_event", "ts_recv"]
if "ohlcv" in self.schema:
return base_fields + ["open", "high", "low", "close", "volume"]
elif self.schema == "trades":
return base_fields + ["price", "size"]
elif "mbp" in self.schema:
return base_fields + ["bid_px_00", "ask_px_00", "bid_sz_00", "ask_sz_00"]
else:
return base_fields
def print_report(self, report: Dict[str, Any]):
"""Print validation report to console."""
print("\n" + "=" * 60)
print("DATA VALIDATION REPORT")
print("=" * 60)
print(f"\nTotal Records: {report['total_records']}")
print(f"Overall Valid: {'✓ YES' if report['valid'] else '✗ NO'}")
print("\n" + "-" * 60)
print("CHECK RESULTS")
print("-" * 60)
for check_name, check_result in report["checks"].items():
status = "" if check_result.get("valid", True) else ""
print(f"\n{status} {check_name.replace('_', ' ').title()}")
for key, value in check_result.items():
if key != "valid" and key != "gaps":
print(f" {key}: {value}")
if report["issues"]:
print("\n" + "-" * 60)
print(f"ISSUES FOUND ({len(report['issues'])})")
print("-" * 60)
for i, issue in enumerate(report["issues"][:20], 1): # Limit to 20
print(f"\n{i}. [{issue['severity'].upper()}] {issue['type']}")
print(f" {issue['message']}")
if len(report["issues"]) > 20:
print(f"\n... and {len(report['issues']) - 20} more issues")
print("\n" + "=" * 60)
def main():
"""Main entry point for CLI usage."""
parser = argparse.ArgumentParser(
description="Validate Databento market data quality"
)
parser.add_argument(
"--input",
"-i",
required=True,
help="Input data file (JSON or CSV)"
)
parser.add_argument(
"--schema",
default="ohlcv-1h",
help="Data schema (default: ohlcv-1h)"
)
parser.add_argument(
"--max-gap-minutes",
type=int,
default=60,
help="Maximum acceptable gap in minutes (default: 60)"
)
parser.add_argument(
"--price-outlier-std",
type=float,
default=10.0,
help="Standard deviations for outlier detection (default: 10.0)"
)
parser.add_argument(
"--report",
"-r",
help="Save report to JSON file"
)
args = parser.parse_args()
# Load data
print(f"[LOAD] Loading data from {args.input}...")
with open(args.input, 'r') as f:
data = json.load(f)
# Handle different data formats
if isinstance(data, dict) and "data" in data:
data = data["data"]
# Create validator
validator = DataValidator(
schema=args.schema,
max_gap_minutes=args.max_gap_minutes,
price_outlier_std=args.price_outlier_std
)
# Run validation
report = validator.validate(data)
# Print report
validator.print_report(report)
# Save report if requested
if args.report:
print(f"\n[SAVE] Saving report to {args.report}...")
with open(args.report, 'w') as f:
json.dump(report, f, indent=2)
print(f"[SUCCESS] Report saved!")
# Exit with appropriate code
sys.exit(0 if report["valid"] else 1)
if __name__ == "__main__":
main()