Initial commit

skills/databento/scripts/validate_data.py (new file, 496 lines)
#!/usr/bin/env python3
"""
Databento Data Quality Validator

Validates market data quality to catch issues early:
- Timestamp gap detection
- Record count verification
- Price range validation (no negative prices, outliers)
- Duplicate timestamp detection
- Summary quality report

Usage:
    python validate_data.py --input data.json
    python validate_data.py --input data.csv --schema ohlcv-1h
    python validate_data.py --input data.json --max-gap-minutes 60 --report report.json
"""

import argparse
import csv
import json
import sys
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional


class DataValidator:
    """Validates Databento market data quality."""

    def __init__(
        self,
        schema: str,
        max_gap_minutes: int = 60,
        price_outlier_std: float = 10.0
    ):
        """
        Initialize validator.

        Args:
            schema: Data schema (ohlcv-1h, trades, mbp-1, etc.)
            max_gap_minutes: Maximum acceptable gap in minutes
            price_outlier_std: Standard deviations for outlier detection
        """
        self.schema = schema
        self.max_gap_seconds = max_gap_minutes * 60
        self.price_outlier_std = price_outlier_std
        self.issues: List[Dict[str, Any]] = []
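
    # Example construction (a sketch; names and values are hypothetical):
    #   validator = DataValidator(schema="ohlcv-1h", max_gap_minutes=60)
    #   report = validator.validate(records)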

    def validate(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Run all validation checks on data.

        Args:
            data: List of records to validate

        Returns:
            Validation report
        """
        print(f"[VALIDATION] Running quality checks on {len(data)} records...")

        report = {
            "total_records": len(data),
            "valid": True,
            "checks": {},
            "issues": []  # Present even on the empty-data early return below
        }

        if not data:
            print("[WARNING] No data to validate!")
            report["valid"] = False
            return report

        # Run all validation checks
        report["checks"]["timestamp_gaps"] = self.check_timestamp_gaps(data)
        report["checks"]["duplicates"] = self.check_duplicates(data)
        report["checks"]["price_range"] = self.check_price_range(data)
        report["checks"]["record_count"] = self.check_record_count(data)
        report["checks"]["data_completeness"] = self.check_completeness(data)

        # Overall validity
        report["valid"] = all(
            check.get("valid", True)
            for check in report["checks"].values()
        )

        report["issues"] = self.issues

        return report
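
    # Illustrative shape of a validate() report (a sketch; the values are
    # hypothetical, not captured output):
    #
    #   {
    #       "total_records": 168,
    #       "valid": False,
    #       "checks": {
    #           "timestamp_gaps": {"valid": False, "gaps_found": 2, ...},
    #           "duplicates": {"valid": True, ...},
    #           ...
    #       },
    #       "issues": [
    #           {"type": "timestamp_gap", "severity": "warning", ...},
    #       ],
    #   }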

    def check_timestamp_gaps(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for unexpected gaps in timestamps.

        Args:
            data: List of records

        Returns:
            Gap check report
        """
        print("[CHECK] Checking for timestamp gaps...")

        gaps = []
        timestamps = self._extract_timestamps(data)

        if len(timestamps) < 2:
            return {"valid": True, "gaps": [], "note": "Insufficient data for gap detection"}

        # Sort timestamps
        sorted_ts = sorted(timestamps)

        # Check gaps between consecutive timestamps
        for i in range(len(sorted_ts) - 1):
            gap_ns = sorted_ts[i + 1] - sorted_ts[i]
            gap_seconds = gap_ns / 1_000_000_000

            if gap_seconds > self.max_gap_seconds:
                gap_info = {
                    "index": i,
                    "gap_seconds": gap_seconds,
                    "gap_minutes": gap_seconds / 60,
                    "before": self._format_timestamp(sorted_ts[i]),
                    "after": self._format_timestamp(sorted_ts[i + 1])
                }
                gaps.append(gap_info)

                self.issues.append({
                    "type": "timestamp_gap",
                    "severity": "warning",
                    "message": f"Gap of {gap_seconds / 60:.1f} minutes detected",
                    **gap_info
                })

        valid = len(gaps) == 0
        print(f"[CHECK] Found {len(gaps)} gaps > {self.max_gap_seconds / 60} minutes")

        return {
            "valid": valid,
            "gaps_found": len(gaps),
            "gaps": gaps[:10]  # Limit to first 10 to keep the report readable
        }
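
    # Worked example (hypothetical values): two consecutive hourly bars at
    # 13:00 and 16:00 UTC are 3 * 3600 * 1e9 ns apart, so gap_seconds is
    # 10800. With the default max_gap_minutes=60 (max_gap_seconds == 3600),
    # 10800 > 3600 and the gap is flagged as a 180.0-minute warning.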

    def check_duplicates(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for duplicate timestamps.

        Args:
            data: List of records

        Returns:
            Duplicate check report
        """
        print("[CHECK] Checking for duplicate timestamps...")

        timestamps = self._extract_timestamps(data)
        timestamp_counts = defaultdict(int)

        for ts in timestamps:
            timestamp_counts[ts] += 1

        duplicates = {ts: count for ts, count in timestamp_counts.items() if count > 1}

        for ts, count in list(duplicates.items())[:10]:  # Limit to first 10
            self.issues.append({
                "type": "duplicate_timestamp",
                "severity": "error",
                "timestamp": self._format_timestamp(ts),
                "count": count,
                "message": f"Timestamp appears {count} times"
            })

        valid = len(duplicates) == 0
        print(f"[CHECK] Found {len(duplicates)} duplicate timestamps")

        return {
            "valid": valid,
            "duplicates_found": len(duplicates)
        }

    def check_price_range(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for invalid or outlier prices.

        Args:
            data: List of records

        Returns:
            Price range check report
        """
        print("[CHECK] Checking price ranges...")

        prices = self._extract_prices(data)

        if not prices:
            return {"valid": True, "note": "No price data to validate"}

        # Check for negative prices
        negative_prices = [p for p in prices if p < 0]

        # Check for zero prices (unusual for ES/NQ)
        zero_prices = [p for p in prices if p == 0]

        # Calculate statistics for outlier detection
        if len(prices) > 1:
            mean_price = sum(prices) / len(prices)
            variance = sum((p - mean_price) ** 2 for p in prices) / len(prices)
            std_dev = variance ** 0.5

            # Detect outliers (> N standard deviations from mean)
            outliers = []
            for p in prices:
                if abs(p - mean_price) > (self.price_outlier_std * std_dev):
                    outliers.append(p)
                    if len(outliers) <= 10:  # Limit issues
                        self.issues.append({
                            "type": "price_outlier",
                            "severity": "warning",
                            "price": p,
                            "mean": mean_price,
                            "std_dev": std_dev,
                            "message": f"Price {p:.2f} is {abs(p - mean_price) / std_dev:.1f} std devs from mean"
                        })
        else:
            outliers = []
            mean_price = prices[0]
            std_dev = 0

        # Report negative prices as errors
        for p in negative_prices[:10]:  # Limit to first 10
            self.issues.append({
                "type": "negative_price",
                "severity": "error",
                "price": p,
                "message": f"Negative price detected: {p}"
            })

        # Report zero prices as errors too, since they also fail this check
        for p in zero_prices[:10]:  # Limit to first 10
            self.issues.append({
                "type": "zero_price",
                "severity": "error",
                "price": p,
                "message": "Zero price detected"
            })

        valid = len(negative_prices) == 0 and len(zero_prices) == 0

        print(f"[CHECK] Price range: {min(prices):.2f} to {max(prices):.2f}")
        print(f"[CHECK] Negative prices: {len(negative_prices)}, Zero prices: {len(zero_prices)}, Outliers: {len(outliers)}")

        return {
            "valid": valid,
            "min_price": min(prices),
            "max_price": max(prices),
            "mean_price": mean_price,
            "std_dev": std_dev,
            "negative_prices": len(negative_prices),
            "zero_prices": len(zero_prices),
            "outliers": len(outliers)
        }
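
    # Worked example (hypothetical values): with mean_price 4500.0,
    # std_dev 5.0, and the default price_outlier_std=10.0, any price
    # farther than 10 * 5.0 = 50.0 from the mean (below 4450.0 or above
    # 4550.0) is flagged as an outlier warning.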

    def check_record_count(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Verify expected record count.

        Args:
            data: List of records

        Returns:
            Record count check report
        """
        print(f"[CHECK] Verifying record count: {len(data)} records")

        # For OHLCV data, the expected count can be estimated from the timeframe
        expected_count = self._estimate_expected_count(data)

        valid = True
        if expected_count and abs(len(data) - expected_count) > (expected_count * 0.1):
            # More than 10% deviation
            valid = False
            self.issues.append({
                "type": "unexpected_record_count",
                "severity": "warning",
                "actual": len(data),
                "expected": expected_count,
                "message": f"Expected ~{expected_count} records, got {len(data)}"
            })

        return {
            "valid": valid,
            "actual_count": len(data),
            "expected_count": expected_count,
            "note": "Expected count is estimated based on schema and date range"
        }
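
    # Worked example (hypothetical): if _estimate_expected_count returned 168
    # (7 days of ohlcv-1h bars for a 24-hour product), the 10% tolerance
    # above means counts below ~152 or above ~184 would be flagged.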

    def check_completeness(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check data completeness (required fields present).

        Args:
            data: List of records

        Returns:
            Completeness check report
        """
        print("[CHECK] Checking data completeness...")

        if not data:
            return {"valid": False, "note": "No data"}

        # Check required fields based on schema
        required_fields = self._get_required_fields()

        missing_fields = defaultdict(int)
        for record in data[:100]:  # Sample first 100 records
            for field in required_fields:
                if field not in record or record[field] is None:
                    missing_fields[field] += 1

        for field, count in missing_fields.items():
            self.issues.append({
                "type": "missing_field",
                "severity": "error",
                "field": field,
                "missing_count": count,
                "message": f"Field '{field}' missing in {count} records (sampled)"
            })

        valid = len(missing_fields) == 0

        return {
            "valid": valid,
            "missing_fields": dict(missing_fields)
        }

    def _extract_timestamps(self, data: List[Dict[str, Any]]) -> List[int]:
        """Extract timestamps from records."""
        timestamps = []
        for record in data:
            # Try different timestamp field names; compare against None
            # explicitly so a legitimate timestamp of 0 is not dropped
            for field in ("ts_event", "ts_recv", "timestamp"):
                ts = record.get(field)
                if ts is not None:
                    timestamps.append(int(ts))
                    break
        return timestamps

    def _extract_prices(self, data: List[Dict[str, Any]]) -> List[float]:
        """Extract prices from records."""
        prices = []
        for record in data:
            # For OHLCV use the close price; for trades/mbp use the price field
            raw = record.get("close", record.get("price"))
            if raw is None:
                continue
            # Heuristic fixed-point conversion: large integers are assumed
            # to be prices scaled by 1e9
            if isinstance(raw, int) and raw > 1_000_000:
                raw = raw / 1_000_000_000
            prices.append(float(raw))
        return prices
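
    # Example of the fixed-point heuristic above (hypothetical value): a raw
    # int close of 4_500_250_000_000 becomes
    # 4_500_250_000_000 / 1_000_000_000 == 4500.25.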

    def _format_timestamp(self, ts_ns: int) -> str:
        """Format nanosecond timestamp to readable string (UTC)."""
        ts_seconds = ts_ns / 1_000_000_000
        # Format in UTC rather than local time, matching the epoch-based input
        dt = datetime.fromtimestamp(ts_seconds, tz=timezone.utc)
        return dt.strftime("%Y-%m-%d %H:%M:%S")
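
    # Example: 1_704_067_200_000_000_000 ns -> "2024-01-01 00:00:00" (UTC).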

    def _estimate_expected_count(self, data: List[Dict[str, Any]]) -> Optional[int]:
        """Estimate expected record count based on schema and date range."""
        # Simplified placeholder: a full implementation would derive the
        # count from the actual date range, e.g. ~24 records per day per
        # symbol for ohlcv-1h and ~1 per day per symbol for ohlcv-1d.
        # Returning None skips the record-count deviation check.
        return None

    def _get_required_fields(self) -> List[str]:
        """Get required fields for schema."""
        base_fields = ["ts_event", "ts_recv"]

        if "ohlcv" in self.schema:
            return base_fields + ["open", "high", "low", "close", "volume"]
        elif self.schema == "trades":
            return base_fields + ["price", "size"]
        elif "mbp" in self.schema:
            return base_fields + ["bid_px_00", "ask_px_00", "bid_sz_00", "ask_sz_00"]
        else:
            return base_fields

    def print_report(self, report: Dict[str, Any]):
        """Print validation report to console."""
        print("\n" + "=" * 60)
        print("DATA VALIDATION REPORT")
        print("=" * 60)

        print(f"\nTotal Records: {report['total_records']}")
        print(f"Overall Valid: {'✓ YES' if report['valid'] else '✗ NO'}")

        print("\n" + "-" * 60)
        print("CHECK RESULTS")
        print("-" * 60)

        for check_name, check_result in report["checks"].items():
            status = "✓" if check_result.get("valid", True) else "✗"
            print(f"\n{status} {check_name.replace('_', ' ').title()}")
            for key, value in check_result.items():
                if key not in ("valid", "gaps"):
                    print(f"  {key}: {value}")

        if report["issues"]:
            print("\n" + "-" * 60)
            print(f"ISSUES FOUND ({len(report['issues'])})")
            print("-" * 60)
            for i, issue in enumerate(report["issues"][:20], 1):  # Limit to 20
                print(f"\n{i}. [{issue['severity'].upper()}] {issue['type']}")
                print(f"   {issue['message']}")

            if len(report["issues"]) > 20:
                print(f"\n... and {len(report['issues']) - 20} more issues")

        print("\n" + "=" * 60)


def load_records(path: str) -> List[Dict[str, Any]]:
    """Load records from a JSON or CSV input file."""
    if path.lower().endswith(".csv"):
        with open(path, newline="") as f:
            records = []
            for row in csv.DictReader(f):
                # CSV values arrive as strings; coerce numerics so the
                # timestamp and price checks can compare values
                parsed: Dict[str, Any] = {}
                for key, value in row.items():
                    for cast in (int, float):
                        try:
                            parsed[key] = cast(value)
                            break
                        except (TypeError, ValueError):
                            parsed[key] = value
                records.append(parsed)
            return records

    with open(path) as f:
        data = json.load(f)
    # Handle wrapped formats like {"data": [...]}
    if isinstance(data, dict) and "data" in data:
        data = data["data"]
    return data


def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description="Validate Databento market data quality"
    )

    parser.add_argument(
        "--input",
        "-i",
        required=True,
        help="Input data file (JSON or CSV)"
    )

    parser.add_argument(
        "--schema",
        default="ohlcv-1h",
        help="Data schema (default: ohlcv-1h)"
    )

    parser.add_argument(
        "--max-gap-minutes",
        type=int,
        default=60,
        help="Maximum acceptable gap in minutes (default: 60)"
    )

    parser.add_argument(
        "--price-outlier-std",
        type=float,
        default=10.0,
        help="Standard deviations for outlier detection (default: 10.0)"
    )

    parser.add_argument(
        "--report",
        "-r",
        help="Save report to JSON file"
    )

    args = parser.parse_args()

    # Load data
    print(f"[LOAD] Loading data from {args.input}...")
    data = load_records(args.input)

    # Create validator
    validator = DataValidator(
        schema=args.schema,
        max_gap_minutes=args.max_gap_minutes,
        price_outlier_std=args.price_outlier_std
    )

    # Run validation
    report = validator.validate(data)

    # Print report
    validator.print_report(report)

    # Save report if requested
    if args.report:
        print(f"\n[SAVE] Saving report to {args.report}...")
        with open(args.report, 'w') as f:
            json.dump(report, f, indent=2)
        print("[SUCCESS] Report saved!")

    # Exit with appropriate code
    sys.exit(0 if report["valid"] else 1)


if __name__ == "__main__":
    main()