# Source: gh-nice-wolf-studio-wolf-sk…/skills/databento/scripts/validate_data.py
# Retrieved 2025-11-30 08:43:40 +08:00 — 497 lines, 16 KiB, Python.
#!/usr/bin/env python3
"""
Databento Data Quality Validator
Validates market data quality to catch issues early:
- Timestamp gap detection
- Record count verification
- Price range validation (no negative prices, outliers)
- Duplicate timestamp detection
- Summary quality report
Usage:
python validate_data.py --input data.json
python validate_data.py --input data.csv --schema ohlcv-1h
python validate_data.py --input data.json --max-gap-minutes 60 --report report.json
"""
import argparse
import csv
import json
import sys
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple
class DataValidator:
    """Validates Databento market data quality.

    Checks run by :meth:`validate`:
      - timestamp gap detection
      - duplicate timestamp detection
      - price sanity (negative/zero prices, statistical outliers)
      - record count verification (estimated, currently schema-dependent stub)
      - required-field completeness (sampled)

    Issues found by any check accumulate in ``self.issues`` and are embedded
    in the report returned by :meth:`validate`.
    """

    def __init__(
        self,
        schema: str,
        max_gap_minutes: int = 60,
        price_outlier_std: float = 10.0,
    ):
        """
        Initialize validator.

        Args:
            schema: Data schema (ohlcv-1h, trades, mbp-1, etc.)
            max_gap_minutes: Maximum acceptable gap between consecutive
                timestamps, in minutes.
            price_outlier_std: Number of standard deviations from the mean
                beyond which a price is flagged as an outlier.
        """
        self.schema = schema
        self.max_gap_seconds = max_gap_minutes * 60
        self.price_outlier_std = price_outlier_std
        # Issue dicts appended by the individual check methods.
        self.issues: List[Dict[str, Any]] = []

    def validate(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Run all validation checks on data.

        Args:
            data: List of records to validate.

        Returns:
            Validation report: total record count, per-check results,
            overall validity, and the accumulated issue list.
        """
        print(f"[VALIDATION] Running quality checks on {len(data)} records...")
        # Reset per run so a reused validator does not carry stale issues.
        self.issues = []
        report: Dict[str, Any] = {
            "total_records": len(data),
            "valid": True,
            "checks": {},
        }
        if not data:
            print("[WARNING] No data to validate!")
            report["valid"] = False
            return report
        # Run all validation checks
        report["checks"]["timestamp_gaps"] = self.check_timestamp_gaps(data)
        report["checks"]["duplicates"] = self.check_duplicates(data)
        report["checks"]["price_range"] = self.check_price_range(data)
        report["checks"]["record_count"] = self.check_record_count(data)
        report["checks"]["data_completeness"] = self.check_completeness(data)
        # Overall validity: every individual check must pass.
        report["valid"] = all(
            check.get("valid", True)
            for check in report["checks"].values()
        )
        report["issues"] = self.issues
        return report

    def check_timestamp_gaps(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for unexpected gaps between consecutive (sorted) timestamps.

        Args:
            data: List of records.

        Returns:
            Gap check report; ``gaps`` is truncated to the first 10 entries.
        """
        print("[CHECK] Checking for timestamp gaps...")
        gaps: List[Dict[str, Any]] = []
        timestamps = self._extract_timestamps(data)
        if len(timestamps) < 2:
            return {"valid": True, "gaps": [], "note": "Insufficient data for gap detection"}
        sorted_ts = sorted(timestamps)
        # Compare each adjacent pair; timestamps are in nanoseconds.
        for i in range(len(sorted_ts) - 1):
            gap_ns = sorted_ts[i + 1] - sorted_ts[i]
            gap_seconds = gap_ns / 1_000_000_000
            if gap_seconds > self.max_gap_seconds:
                gap_info = {
                    "index": i,
                    "gap_seconds": gap_seconds,
                    "gap_minutes": gap_seconds / 60,
                    "before": self._format_timestamp(sorted_ts[i]),
                    "after": self._format_timestamp(sorted_ts[i + 1]),
                }
                gaps.append(gap_info)
                self.issues.append({
                    "type": "timestamp_gap",
                    "severity": "warning",
                    "message": f"Gap of {gap_seconds / 60:.1f} minutes detected",
                    **gap_info,
                })
        valid = len(gaps) == 0
        print(f"[CHECK] Found {len(gaps)} gaps > {self.max_gap_seconds / 60} minutes")
        return {
            "valid": valid,
            "gaps_found": len(gaps),
            "gaps": gaps[:10],  # Limit to first 10 for report
            "total_gaps": len(gaps),
        }

    def check_duplicates(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for duplicate timestamps.

        Args:
            data: List of records.

        Returns:
            Duplicate check report.
        """
        print("[CHECK] Checking for duplicate timestamps...")
        timestamps = self._extract_timestamps(data)
        timestamp_counts: Dict[int, int] = defaultdict(int)
        for ts in timestamps:
            timestamp_counts[ts] += 1
        duplicates = {ts: count for ts, count in timestamp_counts.items() if count > 1}
        # Report at most 10 duplicated timestamps to keep the issue list bounded.
        for ts, count in list(duplicates.items())[:10]:
            self.issues.append({
                "type": "duplicate_timestamp",
                "severity": "error",
                "timestamp": self._format_timestamp(ts),
                "count": count,
                "message": f"Timestamp appears {count} times",
            })
        valid = len(duplicates) == 0
        print(f"[CHECK] Found {len(duplicates)} duplicate timestamps")
        return {
            "valid": valid,
            "duplicates_found": len(duplicates),
            "duplicate_timestamps": len(duplicates),
        }

    def check_price_range(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for invalid (negative/zero) or statistical-outlier prices.

        Args:
            data: List of records.

        Returns:
            Price range check report with summary statistics.
        """
        print("[CHECK] Checking price ranges...")
        prices = self._extract_prices(data)
        if not prices:
            return {"valid": True, "note": "No price data to validate"}
        # Negative prices are hard errors; zero prices are unusual for ES/NQ.
        negative_prices = [p for p in prices if p < 0]
        zero_prices = [p for p in prices if p == 0]
        if len(prices) > 1:
            # Population mean/std for outlier detection.
            mean_price = sum(prices) / len(prices)
            variance = sum((p - mean_price) ** 2 for p in prices) / len(prices)
            std_dev = variance ** 0.5
            threshold = self.price_outlier_std * std_dev
            outliers = [p for p in prices if abs(p - mean_price) > threshold]
            for p in outliers[:10]:  # Limit issues
                self.issues.append({
                    "type": "price_outlier",
                    "severity": "warning",
                    "price": p,
                    "mean": mean_price,
                    "std_dev": std_dev,
                    "message": f"Price {p:.2f} is {abs(p - mean_price) / std_dev:.1f} std devs from mean",
                })
        else:
            # A single price has no dispersion to test against.
            outliers = []
            mean_price = prices[0]
            std_dev = 0
        # Report negative prices as errors
        for p in negative_prices[:10]:  # Limit to first 10
            self.issues.append({
                "type": "negative_price",
                "severity": "error",
                "price": p,
                "message": f"Negative price detected: {p}",
            })
        # Zero prices also fail validation, so record an explanatory issue
        # (previously they flipped `valid` without any issue entry).
        for p in zero_prices[:10]:
            self.issues.append({
                "type": "zero_price",
                "severity": "error",
                "price": p,
                "message": f"Zero price detected: {p}",
            })
        valid = len(negative_prices) == 0 and len(zero_prices) == 0
        print(f"[CHECK] Price range: {min(prices):.2f} to {max(prices):.2f}")
        print(f"[CHECK] Negative prices: {len(negative_prices)}, Zero prices: {len(zero_prices)}, Outliers: {len(outliers)}")
        return {
            "valid": valid,
            "min_price": min(prices),
            "max_price": max(prices),
            "mean_price": mean_price,
            "std_dev": std_dev,
            "negative_prices": len(negative_prices),
            "zero_prices": len(zero_prices),
            "outliers": len(outliers),
        }

    def check_record_count(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Verify the record count against an estimated expectation.

        Note: `_estimate_expected_count` currently always returns None,
        so this check never fails; the plumbing is in place for a future
        date-range-based estimate.

        Args:
            data: List of records.

        Returns:
            Record count check report.
        """
        print(f"[CHECK] Verifying record count: {len(data)} records")
        expected_count = self._estimate_expected_count(data)
        valid = True
        # Flag deviations greater than 10% of the expected count.
        if expected_count and abs(len(data) - expected_count) > (expected_count * 0.1):
            valid = False
            self.issues.append({
                "type": "unexpected_record_count",
                "severity": "warning",
                "actual": len(data),
                "expected": expected_count,
                "message": f"Expected ~{expected_count} records, got {len(data)}",
            })
        return {
            "valid": valid,
            "actual_count": len(data),
            "expected_count": expected_count,
            "note": "Expected count is estimated based on schema and date range",
        }

    def check_completeness(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check data completeness (required fields present and non-null).

        Only the first 100 records are sampled.

        Args:
            data: List of records.

        Returns:
            Completeness check report.
        """
        print("[CHECK] Checking data completeness...")
        if not data:
            return {"valid": False, "note": "No data"}
        required_fields = self._get_required_fields()
        missing_fields: Dict[str, int] = defaultdict(int)
        for record in data[:100]:  # Sample first 100 records
            for field in required_fields:
                if field not in record or record[field] is None:
                    missing_fields[field] += 1
        for field, count in missing_fields.items():
            self.issues.append({
                "type": "missing_field",
                "severity": "error",
                "field": field,
                "missing_count": count,
                "message": f"Field '{field}' missing in {count} records (sampled)",
            })
        valid = len(missing_fields) == 0
        return {
            "valid": valid,
            "missing_fields": dict(missing_fields) if missing_fields else {},
        }

    def _extract_timestamps(self, data: List[Dict[str, Any]]) -> List[int]:
        """Extract timestamps (ns) from records, trying common field names.

        Uses explicit None checks so a legitimate timestamp of 0 is kept
        (the previous `a or b` chaining silently discarded falsy values).
        """
        timestamps = []
        for record in data:
            for key in ("ts_event", "ts_recv", "timestamp"):
                ts = record.get(key)
                if ts is not None:
                    timestamps.append(int(ts))
                    break
        return timestamps

    @staticmethod
    def _normalize_price(raw: Any) -> float:
        """Convert a raw price to float, undoing Databento's 1e-9 fixed-point
        integer encoding when the value looks like a scaled integer."""
        if isinstance(raw, int) and raw > 1_000_000:
            raw = raw / 1_000_000_000  # Fixed-point conversion
        return float(raw)

    def _extract_prices(self, data: List[Dict[str, Any]]) -> List[float]:
        """Extract prices: OHLCV close when present, else trade/mbp price."""
        prices: List[float] = []
        for record in data:
            if "close" in record:
                prices.append(self._normalize_price(record["close"]))
            elif "price" in record:
                prices.append(self._normalize_price(record["price"]))
        return prices

    def _format_timestamp(self, ts_ns: int) -> str:
        """Format a nanosecond epoch timestamp as a readable UTC string.

        Databento timestamps are UTC; the previous naive `fromtimestamp`
        rendered them in the machine's local timezone.
        """
        dt = datetime.fromtimestamp(ts_ns / 1_000_000_000, tz=timezone.utc)
        return dt.strftime("%Y-%m-%d %H:%M:%S")

    def _estimate_expected_count(self, data: List[Dict[str, Any]]) -> Optional[int]:
        """Estimate expected record count based on schema and date range.

        Currently a stub that always returns None (estimation disabled);
        a real implementation would use the actual date range.
        """
        if "ohlcv" in self.schema:
            if "1h" in self.schema:
                return None  # ~24 records per day per symbol
            elif "1d" in self.schema:
                return None  # ~1 record per day per symbol
        return None

    def _get_required_fields(self) -> List[str]:
        """Return the required record fields for the configured schema."""
        base_fields = ["ts_event", "ts_recv"]
        if "ohlcv" in self.schema:
            return base_fields + ["open", "high", "low", "close", "volume"]
        elif self.schema == "trades":
            return base_fields + ["price", "size"]
        elif "mbp" in self.schema:
            return base_fields + ["bid_px_00", "ask_px_00", "bid_sz_00", "ask_sz_00"]
        else:
            return base_fields

    def print_report(self, report: Dict[str, Any]):
        """Print a human-readable validation report to the console."""
        print("\n" + "=" * 60)
        print("DATA VALIDATION REPORT")
        print("=" * 60)
        print(f"\nTotal Records: {report['total_records']}")
        print(f"Overall Valid: {'✓ YES' if report['valid'] else '✗ NO'}")
        print("\n" + "-" * 60)
        print("CHECK RESULTS")
        print("-" * 60)
        for check_name, check_result in report["checks"].items():
            # Pass/fail marker (was blank in both branches — glyphs restored
            # to match the Overall Valid line above).
            status = "✓" if check_result.get("valid", True) else "✗"
            print(f"\n{status} {check_name.replace('_', ' ').title()}")
            for key, value in check_result.items():
                if key != "valid" and key != "gaps":
                    print(f"  {key}: {value}")
        if report["issues"]:
            print("\n" + "-" * 60)
            print(f"ISSUES FOUND ({len(report['issues'])})")
            print("-" * 60)
            for i, issue in enumerate(report["issues"][:20], 1):  # Limit to 20
                print(f"\n{i}. [{issue['severity'].upper()}] {issue['type']}")
                print(f"   {issue['message']}")
            if len(report["issues"]) > 20:
                print(f"\n... and {len(report['issues']) - 20} more issues")
        print("\n" + "=" * 60)
def _coerce_scalar(value: Any) -> Any:
    """Best-effort conversion of a CSV cell to int, then float, else as-is."""
    try:
        return int(value)
    except (TypeError, ValueError):
        pass
    try:
        return float(value)
    except (TypeError, ValueError):
        return value


def _load_records(path: str) -> List[Dict[str, Any]]:
    """Load records from a JSON or CSV file.

    The CLI help advertises CSV input, but the loader previously only
    called json.load and crashed on .csv files; CSV is now parsed with
    csv.DictReader, with numeric-looking cells coerced to int/float so
    the validators can do arithmetic on them.
    """
    if path.lower().endswith(".csv"):
        with open(path, "r", newline="", encoding="utf-8") as f:
            return [
                {key: _coerce_scalar(val) for key, val in row.items()}
                for row in csv.DictReader(f)
            ]
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Handle API-style envelopes: {"data": [...]}
    if isinstance(data, dict) and "data" in data:
        data = data["data"]
    return data


def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description="Validate Databento market data quality"
    )
    parser.add_argument(
        "--input",
        "-i",
        required=True,
        help="Input data file (JSON or CSV)"
    )
    parser.add_argument(
        "--schema",
        default="ohlcv-1h",
        help="Data schema (default: ohlcv-1h)"
    )
    parser.add_argument(
        "--max-gap-minutes",
        type=int,
        default=60,
        help="Maximum acceptable gap in minutes (default: 60)"
    )
    parser.add_argument(
        "--price-outlier-std",
        type=float,
        default=10.0,
        help="Standard deviations for outlier detection (default: 10.0)"
    )
    parser.add_argument(
        "--report",
        "-r",
        help="Save report to JSON file"
    )
    args = parser.parse_args()

    # Load data (JSON or CSV, by file extension)
    print(f"[LOAD] Loading data from {args.input}...")
    data = _load_records(args.input)

    # Create validator
    validator = DataValidator(
        schema=args.schema,
        max_gap_minutes=args.max_gap_minutes,
        price_outlier_std=args.price_outlier_std
    )

    # Run validation and print the report
    report = validator.validate(data)
    validator.print_report(report)

    # Save report if requested
    if args.report:
        print(f"\n[SAVE] Saving report to {args.report}...")
        with open(args.report, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print("[SUCCESS] Report saved!")

    # Exit code reflects validity: 0 = clean data, 1 = issues found.
    sys.exit(0 if report["valid"] else 1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()