#!/usr/bin/env python3 """ Databento Data Quality Validator Validates market data quality to catch issues early: - Timestamp gap detection - Record count verification - Price range validation (no negative prices, outliers) - Duplicate timestamp detection - Summary quality report Usage: python validate_data.py --input data.json python validate_data.py --input data.csv --schema ohlcv-1h python validate_data.py --input data.json --max-gap-minutes 60 --report report.json """ import argparse import json import sys from datetime import datetime, timedelta from typing import Dict, List, Any, Optional, Tuple from collections import defaultdict class DataValidator: """Validates Databento market data quality.""" def __init__( self, schema: str, max_gap_minutes: int = 60, price_outlier_std: float = 10.0 ): """ Initialize validator. Args: schema: Data schema (ohlcv-1h, trades, mbp-1, etc.) max_gap_minutes: Maximum acceptable gap in minutes price_outlier_std: Standard deviations for outlier detection """ self.schema = schema self.max_gap_seconds = max_gap_minutes * 60 self.price_outlier_std = price_outlier_std self.issues: List[Dict[str, Any]] = [] def validate(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Run all validation checks on data. Args: data: List of records to validate Returns: Validation report """ print(f"[VALIDATION] Running quality checks on {len(data)} records...") report = { "total_records": len(data), "valid": True, "checks": {} } if not data: print("[WARNING] No data to validate!") report["valid"] = False return report # Run all validation checks report["checks"]["timestamp_gaps"] = self.check_timestamp_gaps(data) report["checks"]["duplicates"] = self.check_duplicates(data) report["checks"]["price_range"] = self.check_price_range(data) report["checks"]["record_count"] = self.check_record_count(data) report["checks"]["data_completeness"] = self.check_completeness(data) # Overall validity report["valid"] = all( check.get("valid", True) for check in report["checks"].values() ) report["issues"] = self.issues return report def check_timestamp_gaps(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Check for unexpected gaps in timestamps. Args: data: List of records Returns: Gap check report """ print("[CHECK] Checking for timestamp gaps...") gaps = [] timestamps = self._extract_timestamps(data) if len(timestamps) < 2: return {"valid": True, "gaps": [], "note": "Insufficient data for gap detection"} # Sort timestamps sorted_ts = sorted(timestamps) # Check gaps between consecutive timestamps for i in range(len(sorted_ts) - 1): gap_ns = sorted_ts[i + 1] - sorted_ts[i] gap_seconds = gap_ns / 1_000_000_000 if gap_seconds > self.max_gap_seconds: gap_info = { "index": i, "gap_seconds": gap_seconds, "gap_minutes": gap_seconds / 60, "before": self._format_timestamp(sorted_ts[i]), "after": self._format_timestamp(sorted_ts[i + 1]) } gaps.append(gap_info) self.issues.append({ "type": "timestamp_gap", "severity": "warning", "message": f"Gap of {gap_seconds / 60:.1f} minutes detected", **gap_info }) valid = len(gaps) == 0 print(f"[CHECK] Found {len(gaps)} gaps > {self.max_gap_seconds / 60} minutes") return { "valid": valid, "gaps_found": len(gaps), "gaps": gaps[:10] if gaps else [], # Limit to first 10 for report "total_gaps": len(gaps) } def check_duplicates(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Check for duplicate timestamps. Args: data: List of records Returns: Duplicate check report """ print("[CHECK] Checking for duplicate timestamps...") timestamps = self._extract_timestamps(data) timestamp_counts = defaultdict(int) for ts in timestamps: timestamp_counts[ts] += 1 duplicates = {ts: count for ts, count in timestamp_counts.items() if count > 1} if duplicates: for ts, count in list(duplicates.items())[:10]: # Limit to first 10 self.issues.append({ "type": "duplicate_timestamp", "severity": "error", "timestamp": self._format_timestamp(ts), "count": count, "message": f"Timestamp appears {count} times" }) valid = len(duplicates) == 0 print(f"[CHECK] Found {len(duplicates)} duplicate timestamps") return { "valid": valid, "duplicates_found": len(duplicates), "duplicate_timestamps": len(duplicates) } def check_price_range(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Check for invalid or outlier prices. Args: data: List of records Returns: Price range check report """ print("[CHECK] Checking price ranges...") prices = self._extract_prices(data) if not prices: return {"valid": True, "note": "No price data to validate"} # Check for negative prices negative_prices = [p for p in prices if p < 0] # Check for zero prices (unusual for ES/NQ) zero_prices = [p for p in prices if p == 0] # Calculate statistics for outlier detection if len(prices) > 1: mean_price = sum(prices) / len(prices) variance = sum((p - mean_price) ** 2 for p in prices) / len(prices) std_dev = variance ** 0.5 # Detect outliers (> N standard deviations from mean) outliers = [] for p in prices: if abs(p - mean_price) > (self.price_outlier_std * std_dev): outliers.append(p) if len(outliers) <= 10: # Limit issues self.issues.append({ "type": "price_outlier", "severity": "warning", "price": p, "mean": mean_price, "std_dev": std_dev, "message": f"Price {p:.2f} is {abs(p - mean_price) / std_dev:.1f} std devs from mean" }) else: outliers = [] mean_price = prices[0] if prices else 0 std_dev = 0 # Report negative prices as errors for p in negative_prices[:10]: # Limit to first 10 self.issues.append({ "type": "negative_price", "severity": "error", "price": p, "message": f"Negative price detected: {p}" }) valid = len(negative_prices) == 0 and len(zero_prices) == 0 print(f"[CHECK] Price range: {min(prices):.2f} to {max(prices):.2f}") print(f"[CHECK] Negative prices: {len(negative_prices)}, Zero prices: {len(zero_prices)}, Outliers: {len(outliers)}") return { "valid": valid, "min_price": min(prices), "max_price": max(prices), "mean_price": mean_price, "std_dev": std_dev, "negative_prices": len(negative_prices), "zero_prices": len(zero_prices), "outliers": len(outliers) } def check_record_count(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Verify expected record count. Args: data: List of records Returns: Record count check report """ print(f"[CHECK] Verifying record count: {len(data)} records") # For OHLCV data, can estimate expected count based on timeframe expected_count = self._estimate_expected_count(data) valid = True if expected_count and abs(len(data) - expected_count) > (expected_count * 0.1): # More than 10% deviation valid = False self.issues.append({ "type": "unexpected_record_count", "severity": "warning", "actual": len(data), "expected": expected_count, "message": f"Expected ~{expected_count} records, got {len(data)}" }) return { "valid": valid, "actual_count": len(data), "expected_count": expected_count, "note": "Expected count is estimated based on schema and date range" } def check_completeness(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Check data completeness (required fields present). Args: data: List of records Returns: Completeness check report """ print("[CHECK] Checking data completeness...") if not data: return {"valid": False, "note": "No data"} # Check required fields based on schema required_fields = self._get_required_fields() missing_fields = defaultdict(int) for record in data[:100]: # Sample first 100 records for field in required_fields: if field not in record or record[field] is None: missing_fields[field] += 1 if missing_fields: for field, count in missing_fields.items(): self.issues.append({ "type": "missing_field", "severity": "error", "field": field, "missing_count": count, "message": f"Field '{field}' missing in {count} records (sampled)" }) valid = len(missing_fields) == 0 return { "valid": valid, "missing_fields": dict(missing_fields) if missing_fields else {} } def _extract_timestamps(self, data: List[Dict[str, Any]]) -> List[int]: """Extract timestamps from records.""" timestamps = [] for record in data: # Try different timestamp field names ts = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp") if ts: timestamps.append(int(ts)) return timestamps def _extract_prices(self, data: List[Dict[str, Any]]) -> List[float]: """Extract prices from records.""" prices = [] for record in data: # For OHLCV, use close price if "close" in record: # Convert from fixed-point if needed price = record["close"] if isinstance(price, int) and price > 1_000_000: price = price / 1_000_000_000 # Fixed-point conversion prices.append(float(price)) # For trades/mbp, use price field elif "price" in record: price = record["price"] if isinstance(price, int) and price > 1_000_000: price = price / 1_000_000_000 prices.append(float(price)) return prices def _format_timestamp(self, ts_ns: int) -> str: """Format nanosecond timestamp to readable string.""" ts_seconds = ts_ns / 1_000_000_000 dt = datetime.fromtimestamp(ts_seconds) return dt.strftime("%Y-%m-%d %H:%M:%S") def _estimate_expected_count(self, data: List[Dict[str, Any]]) -> Optional[int]: """Estimate expected record count based on schema and date range.""" # This is a simplified estimation # In practice, would calculate based on actual date range if "ohlcv" in self.schema: if "1h" in self.schema: return None # ~24 records per day per symbol elif "1d" in self.schema: return None # ~1 record per day per symbol return None def _get_required_fields(self) -> List[str]: """Get required fields for schema.""" base_fields = ["ts_event", "ts_recv"] if "ohlcv" in self.schema: return base_fields + ["open", "high", "low", "close", "volume"] elif self.schema == "trades": return base_fields + ["price", "size"] elif "mbp" in self.schema: return base_fields + ["bid_px_00", "ask_px_00", "bid_sz_00", "ask_sz_00"] else: return base_fields def print_report(self, report: Dict[str, Any]): """Print validation report to console.""" print("\n" + "=" * 60) print("DATA VALIDATION REPORT") print("=" * 60) print(f"\nTotal Records: {report['total_records']}") print(f"Overall Valid: {'✓ YES' if report['valid'] else '✗ NO'}") print("\n" + "-" * 60) print("CHECK RESULTS") print("-" * 60) for check_name, check_result in report["checks"].items(): status = "✓" if check_result.get("valid", True) else "✗" print(f"\n{status} {check_name.replace('_', ' ').title()}") for key, value in check_result.items(): if key != "valid" and key != "gaps": print(f" {key}: {value}") if report["issues"]: print("\n" + "-" * 60) print(f"ISSUES FOUND ({len(report['issues'])})") print("-" * 60) for i, issue in enumerate(report["issues"][:20], 1): # Limit to 20 print(f"\n{i}. [{issue['severity'].upper()}] {issue['type']}") print(f" {issue['message']}") if len(report["issues"]) > 20: print(f"\n... and {len(report['issues']) - 20} more issues") print("\n" + "=" * 60) def main(): """Main entry point for CLI usage.""" parser = argparse.ArgumentParser( description="Validate Databento market data quality" ) parser.add_argument( "--input", "-i", required=True, help="Input data file (JSON or CSV)" ) parser.add_argument( "--schema", default="ohlcv-1h", help="Data schema (default: ohlcv-1h)" ) parser.add_argument( "--max-gap-minutes", type=int, default=60, help="Maximum acceptable gap in minutes (default: 60)" ) parser.add_argument( "--price-outlier-std", type=float, default=10.0, help="Standard deviations for outlier detection (default: 10.0)" ) parser.add_argument( "--report", "-r", help="Save report to JSON file" ) args = parser.parse_args() # Load data print(f"[LOAD] Loading data from {args.input}...") with open(args.input, 'r') as f: data = json.load(f) # Handle different data formats if isinstance(data, dict) and "data" in data: data = data["data"] # Create validator validator = DataValidator( schema=args.schema, max_gap_minutes=args.max_gap_minutes, price_outlier_std=args.price_outlier_std ) # Run validation report = validator.validate(data) # Print report validator.print_report(report) # Save report if requested if args.report: print(f"\n[SAVE] Saving report to {args.report}...") with open(args.report, 'w') as f: json.dump(report, f, indent=2) print(f"[SUCCESS] Report saved!") # Exit with appropriate code sys.exit(0 if report["valid"] else 1) if __name__ == "__main__": main()