# Source: gh-nice-wolf-studio-wolf-sk…/skills/databento/scripts/validate_data.py
# Retrieved 2025-11-30 08:43:40 +08:00 — 497 lines, 16 KiB, Python.
#!/usr/bin/env python3
"""
Databento Data Quality Validator
Validates market data quality to catch issues early:
- Timestamp gap detection
- Record count verification
- Price range validation (no negative prices, outliers)
- Duplicate timestamp detection
- Summary quality report
Usage:
python validate_data.py --input data.json
python validate_data.py --input data.csv --schema ohlcv-1h
python validate_data.py --input data.json --max-gap-minutes 60 --report report.json
"""
import argparse
import csv
import json
import sys
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple
class DataValidator:
    """Validates Databento market data quality.

    Checks run by :meth:`validate`:
      - timestamp gap detection
      - duplicate timestamp detection
      - price sanity (negative/zero prices, statistical outliers)
      - record count verification (estimated, currently schema-dependent stub)
      - required-field completeness (sampled)

    Issues found by any check accumulate in ``self.issues`` and are embedded
    in the report returned by :meth:`validate`.
    """

    def __init__(
        self,
        schema: str,
        max_gap_minutes: int = 60,
        price_outlier_std: float = 10.0,
    ):
        """
        Initialize validator.

        Args:
            schema: Data schema (ohlcv-1h, trades, mbp-1, etc.)
            max_gap_minutes: Maximum acceptable gap between consecutive
                timestamps, in minutes.
            price_outlier_std: Number of standard deviations from the mean
                beyond which a price is flagged as an outlier.
        """
        self.schema = schema
        self.max_gap_seconds = max_gap_minutes * 60
        self.price_outlier_std = price_outlier_std
        # Issue dicts appended by the individual check methods.
        self.issues: List[Dict[str, Any]] = []

    def validate(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Run all validation checks on data.

        Args:
            data: List of records to validate.

        Returns:
            Validation report: total record count, per-check results,
            overall validity, and the accumulated issue list.
        """
        print(f"[VALIDATION] Running quality checks on {len(data)} records...")
        # Reset per run so a reused validator does not carry stale issues.
        self.issues = []
        report: Dict[str, Any] = {
            "total_records": len(data),
            "valid": True,
            "checks": {},
        }
        if not data:
            print("[WARNING] No data to validate!")
            report["valid"] = False
            return report
        # Run all validation checks
        report["checks"]["timestamp_gaps"] = self.check_timestamp_gaps(data)
        report["checks"]["duplicates"] = self.check_duplicates(data)
        report["checks"]["price_range"] = self.check_price_range(data)
        report["checks"]["record_count"] = self.check_record_count(data)
        report["checks"]["data_completeness"] = self.check_completeness(data)
        # Overall validity: every individual check must pass.
        report["valid"] = all(
            check.get("valid", True)
            for check in report["checks"].values()
        )
        report["issues"] = self.issues
        return report

    def check_timestamp_gaps(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for unexpected gaps between consecutive (sorted) timestamps.

        Args:
            data: List of records.

        Returns:
            Gap check report; ``gaps`` is truncated to the first 10 entries.
        """
        print("[CHECK] Checking for timestamp gaps...")
        gaps: List[Dict[str, Any]] = []
        timestamps = self._extract_timestamps(data)
        if len(timestamps) < 2:
            return {"valid": True, "gaps": [], "note": "Insufficient data for gap detection"}
        sorted_ts = sorted(timestamps)
        # Compare each adjacent pair; timestamps are in nanoseconds.
        for i in range(len(sorted_ts) - 1):
            gap_ns = sorted_ts[i + 1] - sorted_ts[i]
            gap_seconds = gap_ns / 1_000_000_000
            if gap_seconds > self.max_gap_seconds:
                gap_info = {
                    "index": i,
                    "gap_seconds": gap_seconds,
                    "gap_minutes": gap_seconds / 60,
                    "before": self._format_timestamp(sorted_ts[i]),
                    "after": self._format_timestamp(sorted_ts[i + 1]),
                }
                gaps.append(gap_info)
                self.issues.append({
                    "type": "timestamp_gap",
                    "severity": "warning",
                    "message": f"Gap of {gap_seconds / 60:.1f} minutes detected",
                    **gap_info,
                })
        valid = len(gaps) == 0
        print(f"[CHECK] Found {len(gaps)} gaps > {self.max_gap_seconds / 60} minutes")
        return {
            "valid": valid,
            "gaps_found": len(gaps),
            "gaps": gaps[:10],  # Limit to first 10 for report
            "total_gaps": len(gaps),
        }

    def check_duplicates(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for duplicate timestamps.

        Args:
            data: List of records.

        Returns:
            Duplicate check report.
        """
        print("[CHECK] Checking for duplicate timestamps...")
        timestamps = self._extract_timestamps(data)
        timestamp_counts: Dict[int, int] = defaultdict(int)
        for ts in timestamps:
            timestamp_counts[ts] += 1
        duplicates = {ts: count for ts, count in timestamp_counts.items() if count > 1}
        # Report at most 10 duplicated timestamps to keep the issue list bounded.
        for ts, count in list(duplicates.items())[:10]:
            self.issues.append({
                "type": "duplicate_timestamp",
                "severity": "error",
                "timestamp": self._format_timestamp(ts),
                "count": count,
                "message": f"Timestamp appears {count} times",
            })
        valid = len(duplicates) == 0
        print(f"[CHECK] Found {len(duplicates)} duplicate timestamps")
        return {
            "valid": valid,
            "duplicates_found": len(duplicates),
            "duplicate_timestamps": len(duplicates),
        }

    def check_price_range(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for invalid (negative/zero) or statistical-outlier prices.

        Args:
            data: List of records.

        Returns:
            Price range check report with summary statistics.
        """
        print("[CHECK] Checking price ranges...")
        prices = self._extract_prices(data)
        if not prices:
            return {"valid": True, "note": "No price data to validate"}
        # Negative prices are hard errors; zero prices are unusual for ES/NQ.
        negative_prices = [p for p in prices if p < 0]
        zero_prices = [p for p in prices if p == 0]
        if len(prices) > 1:
            # Population mean/std for outlier detection.
            mean_price = sum(prices) / len(prices)
            variance = sum((p - mean_price) ** 2 for p in prices) / len(prices)
            std_dev = variance ** 0.5
            threshold = self.price_outlier_std * std_dev
            outliers = [p for p in prices if abs(p - mean_price) > threshold]
            for p in outliers[:10]:  # Limit issues
                self.issues.append({
                    "type": "price_outlier",
                    "severity": "warning",
                    "price": p,
                    "mean": mean_price,
                    "std_dev": std_dev,
                    "message": f"Price {p:.2f} is {abs(p - mean_price) / std_dev:.1f} std devs from mean",
                })
        else:
            # A single price has no dispersion to test against.
            outliers = []
            mean_price = prices[0]
            std_dev = 0
        # Report negative prices as errors
        for p in negative_prices[:10]:  # Limit to first 10
            self.issues.append({
                "type": "negative_price",
                "severity": "error",
                "price": p,
                "message": f"Negative price detected: {p}",
            })
        # Zero prices also fail validation, so record an explanatory issue
        # (previously they flipped `valid` without any issue entry).
        for p in zero_prices[:10]:
            self.issues.append({
                "type": "zero_price",
                "severity": "error",
                "price": p,
                "message": f"Zero price detected: {p}",
            })
        valid = len(negative_prices) == 0 and len(zero_prices) == 0
        print(f"[CHECK] Price range: {min(prices):.2f} to {max(prices):.2f}")
        print(f"[CHECK] Negative prices: {len(negative_prices)}, Zero prices: {len(zero_prices)}, Outliers: {len(outliers)}")
        return {
            "valid": valid,
            "min_price": min(prices),
            "max_price": max(prices),
            "mean_price": mean_price,
            "std_dev": std_dev,
            "negative_prices": len(negative_prices),
            "zero_prices": len(zero_prices),
            "outliers": len(outliers),
        }

    def check_record_count(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Verify the record count against an estimated expectation.

        Note: `_estimate_expected_count` currently always returns None,
        so this check never fails; the plumbing is in place for a future
        date-range-based estimate.

        Args:
            data: List of records.

        Returns:
            Record count check report.
        """
        print(f"[CHECK] Verifying record count: {len(data)} records")
        expected_count = self._estimate_expected_count(data)
        valid = True
        # Flag deviations greater than 10% of the expected count.
        if expected_count and abs(len(data) - expected_count) > (expected_count * 0.1):
            valid = False
            self.issues.append({
                "type": "unexpected_record_count",
                "severity": "warning",
                "actual": len(data),
                "expected": expected_count,
                "message": f"Expected ~{expected_count} records, got {len(data)}",
            })
        return {
            "valid": valid,
            "actual_count": len(data),
            "expected_count": expected_count,
            "note": "Expected count is estimated based on schema and date range",
        }

    def check_completeness(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check data completeness (required fields present and non-null).

        Only the first 100 records are sampled.

        Args:
            data: List of records.

        Returns:
            Completeness check report.
        """
        print("[CHECK] Checking data completeness...")
        if not data:
            return {"valid": False, "note": "No data"}
        required_fields = self._get_required_fields()
        missing_fields: Dict[str, int] = defaultdict(int)
        for record in data[:100]:  # Sample first 100 records
            for field in required_fields:
                if field not in record or record[field] is None:
                    missing_fields[field] += 1
        for field, count in missing_fields.items():
            self.issues.append({
                "type": "missing_field",
                "severity": "error",
                "field": field,
                "missing_count": count,
                "message": f"Field '{field}' missing in {count} records (sampled)",
            })
        valid = len(missing_fields) == 0
        return {
            "valid": valid,
            "missing_fields": dict(missing_fields) if missing_fields else {},
        }

    def _extract_timestamps(self, data: List[Dict[str, Any]]) -> List[int]:
        """Extract timestamps (ns) from records, trying common field names.

        Uses explicit None checks so a legitimate timestamp of 0 is kept
        (the previous `a or b` chaining silently discarded falsy values).
        """
        timestamps = []
        for record in data:
            for key in ("ts_event", "ts_recv", "timestamp"):
                ts = record.get(key)
                if ts is not None:
                    timestamps.append(int(ts))
                    break
        return timestamps

    @staticmethod
    def _normalize_price(raw: Any) -> float:
        """Convert a raw price to float, undoing Databento's 1e-9 fixed-point
        integer encoding when the value looks like a scaled integer."""
        if isinstance(raw, int) and raw > 1_000_000:
            raw = raw / 1_000_000_000  # Fixed-point conversion
        return float(raw)

    def _extract_prices(self, data: List[Dict[str, Any]]) -> List[float]:
        """Extract prices: OHLCV close when present, else trade/mbp price."""
        prices: List[float] = []
        for record in data:
            if "close" in record:
                prices.append(self._normalize_price(record["close"]))
            elif "price" in record:
                prices.append(self._normalize_price(record["price"]))
        return prices

    def _format_timestamp(self, ts_ns: int) -> str:
        """Format a nanosecond epoch timestamp as a readable UTC string.

        Databento timestamps are UTC; the previous naive `fromtimestamp`
        rendered them in the machine's local timezone.
        """
        dt = datetime.fromtimestamp(ts_ns / 1_000_000_000, tz=timezone.utc)
        return dt.strftime("%Y-%m-%d %H:%M:%S")

    def _estimate_expected_count(self, data: List[Dict[str, Any]]) -> Optional[int]:
        """Estimate expected record count based on schema and date range.

        Currently a stub that always returns None (estimation disabled);
        a real implementation would use the actual date range.
        """
        if "ohlcv" in self.schema:
            if "1h" in self.schema:
                return None  # ~24 records per day per symbol
            elif "1d" in self.schema:
                return None  # ~1 record per day per symbol
        return None

    def _get_required_fields(self) -> List[str]:
        """Return the required record fields for the configured schema."""
        base_fields = ["ts_event", "ts_recv"]
        if "ohlcv" in self.schema:
            return base_fields + ["open", "high", "low", "close", "volume"]
        elif self.schema == "trades":
            return base_fields + ["price", "size"]
        elif "mbp" in self.schema:
            return base_fields + ["bid_px_00", "ask_px_00", "bid_sz_00", "ask_sz_00"]
        else:
            return base_fields

    def print_report(self, report: Dict[str, Any]):
        """Print a human-readable validation report to the console."""
        print("\n" + "=" * 60)
        print("DATA VALIDATION REPORT")
        print("=" * 60)
        print(f"\nTotal Records: {report['total_records']}")
        print(f"Overall Valid: {'✓ YES' if report['valid'] else '✗ NO'}")
        print("\n" + "-" * 60)
        print("CHECK RESULTS")
        print("-" * 60)
        for check_name, check_result in report["checks"].items():
            # Pass/fail marker (was blank in both branches — glyphs restored
            # to match the Overall Valid line above).
            status = "✓" if check_result.get("valid", True) else "✗"
            print(f"\n{status} {check_name.replace('_', ' ').title()}")
            for key, value in check_result.items():
                if key != "valid" and key != "gaps":
                    print(f"  {key}: {value}")
        if report["issues"]:
            print("\n" + "-" * 60)
            print(f"ISSUES FOUND ({len(report['issues'])})")
            print("-" * 60)
            for i, issue in enumerate(report["issues"][:20], 1):  # Limit to 20
                print(f"\n{i}. [{issue['severity'].upper()}] {issue['type']}")
                print(f"   {issue['message']}")
            if len(report["issues"]) > 20:
                print(f"\n... and {len(report['issues']) - 20} more issues")
        print("\n" + "=" * 60)
def _coerce_scalar(value: Any) -> Any:
    """Best-effort conversion of a CSV cell to int, then float, else as-is."""
    try:
        return int(value)
    except (TypeError, ValueError):
        pass
    try:
        return float(value)
    except (TypeError, ValueError):
        return value


def _load_records(path: str) -> List[Dict[str, Any]]:
    """Load records from a JSON or CSV file.

    The CLI help advertises CSV input, but the loader previously only
    called json.load and crashed on .csv files; CSV is now parsed with
    csv.DictReader, with numeric-looking cells coerced to int/float so
    the validators can do arithmetic on them.
    """
    if path.lower().endswith(".csv"):
        with open(path, "r", newline="", encoding="utf-8") as f:
            return [
                {key: _coerce_scalar(val) for key, val in row.items()}
                for row in csv.DictReader(f)
            ]
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Handle API-style envelopes: {"data": [...]}
    if isinstance(data, dict) and "data" in data:
        data = data["data"]
    return data


def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description="Validate Databento market data quality"
    )
    parser.add_argument(
        "--input",
        "-i",
        required=True,
        help="Input data file (JSON or CSV)"
    )
    parser.add_argument(
        "--schema",
        default="ohlcv-1h",
        help="Data schema (default: ohlcv-1h)"
    )
    parser.add_argument(
        "--max-gap-minutes",
        type=int,
        default=60,
        help="Maximum acceptable gap in minutes (default: 60)"
    )
    parser.add_argument(
        "--price-outlier-std",
        type=float,
        default=10.0,
        help="Standard deviations for outlier detection (default: 10.0)"
    )
    parser.add_argument(
        "--report",
        "-r",
        help="Save report to JSON file"
    )
    args = parser.parse_args()

    # Load data (JSON or CSV, by file extension)
    print(f"[LOAD] Loading data from {args.input}...")
    data = _load_records(args.input)

    # Create validator
    validator = DataValidator(
        schema=args.schema,
        max_gap_minutes=args.max_gap_minutes,
        price_outlier_std=args.price_outlier_std
    )

    # Run validation and print the report
    report = validator.validate(data)
    validator.print_report(report)

    # Save report if requested
    if args.report:
        print(f"\n[SAVE] Saving report to {args.report}...")
        with open(args.report, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print("[SUCCESS] Report saved!")

    # Exit code reflects validity: 0 = clean data, 1 = issues found.
    sys.exit(0 if report["valid"] else 1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()