Initial commit

Zhongwei Li
2025-11-30 08:43:40 +08:00
commit d6cdda3f30
13 changed files with 4664 additions and 0 deletions


@@ -0,0 +1,345 @@
#!/usr/bin/env python3
"""
Databento OHLCV Data Fetcher
Standard pattern for fetching OHLCV data with built-in best practices:
- Automatic cost estimation before fetch
- Error handling with retries
- Post-fetch data validation
- Export options (CSV/JSON)
Usage:
python fetch_ohlcv.py --symbol ES.c.0 --schema ohlcv-1h --start 2024-01-01 --end 2024-01-31
python fetch_ohlcv.py --symbol NQ.c.0 --schema ohlcv-1d --start 2024-01-01 --limit 100
python fetch_ohlcv.py --symbol ES.c.0,NQ.c.0 --schema ohlcv-1h --start 2024-01-01 --output data.csv
"""
import argparse
import json
import sys
from datetime import datetime
from typing import Optional, Dict, Any, List
import time
class DatabentoOHLCVFetcher:
"""Fetches OHLCV data from Databento with best practices built-in."""
def __init__(self, dataset: str = "GLBX.MDP3", stype_in: str = "continuous"):
"""
Initialize fetcher.
Args:
dataset: Dataset code (default: GLBX.MDP3 for ES/NQ)
stype_in: Input symbol type (default: continuous)
"""
self.dataset = dataset
self.stype_in = stype_in
self.max_retries = 3
self.retry_delay = 2 # seconds
def estimate_cost(
self,
symbols: str,
schema: str,
start: str,
end: Optional[str] = None
) -> Dict[str, Any]:
"""
Estimate cost before fetching data.
Args:
symbols: Comma-separated symbol list
schema: Data schema (e.g., ohlcv-1h)
start: Start date (YYYY-MM-DD)
end: End date (optional)
Returns:
Cost estimation result
"""
print(f"[COST CHECK] Estimating cost for {symbols} ({schema})...")
# NOTE: In actual usage, this would call the MCP tool:
# mcp__databento__metadata_get_cost(
# dataset=self.dataset,
# start=start,
# end=end,
# symbols=symbols,
# schema=schema,
# stype_in=self.stype_in
# )
# For this template, we simulate the response
print("[NOTE] This template script demonstrates the pattern.")
print("[NOTE] In actual usage, integrate with MCP tools directly.")
return {
"estimated_cost_usd": 0.0,
"estimated_size_mb": 0.0,
"note": "Call mcp__databento__metadata_get_cost here"
}
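# HEDGED SKETCH: if you use the official databento Python client instead of the
# MCP tool noted above (an assumption of this sketch, not part of the template;
# requires `pip install databento` and a DATABENTO_API_KEY environment variable),
# the same check could look roughly like:
#
#   import databento as db
#   client = db.Historical()  # reads DATABENTO_API_KEY from the environment
#   cost_usd = client.metadata.get_cost(
#       dataset=self.dataset,
#       symbols=symbols,
#       schema=schema,
#       start=start,
#       end=end,
#   )
#   return {"estimated_cost_usd": cost_usd}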
def validate_dataset_range(self) -> Dict[str, str]:
"""
Validate dataset availability.
Returns:
Dataset date range
"""
print(f"[VALIDATION] Checking dataset availability for {self.dataset}...")
# NOTE: In actual usage, this would call:
# mcp__databento__metadata_get_dataset_range(dataset=self.dataset)
return {
"start_date": "2000-01-01",
"end_date": datetime.now().strftime("%Y-%m-%d"),
"note": "Call mcp__databento__metadata_get_dataset_range here"
}
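# HEDGED SKETCH: with the official databento client (same assumption as above),
# the range lookup is roughly:
#
#   import databento as db
#   client = db.Historical()
#   available = client.metadata.get_dataset_range(dataset=self.dataset)
#   # `available` reports the earliest and latest dates covered by the dataset,
#   # which can be compared against the requested start/end before fetching.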
def fetch_data(
self,
symbols: str,
schema: str,
start: str,
end: Optional[str] = None,
limit: Optional[int] = None,
check_cost: bool = True
) -> Dict[str, Any]:
"""
Fetch OHLCV data with retries and error handling.
Args:
symbols: Comma-separated symbol list
schema: Data schema (e.g., ohlcv-1h, ohlcv-1d)
start: Start date (YYYY-MM-DD)
end: End date (optional)
limit: Maximum number of records (optional)
check_cost: Whether to check cost before fetching (default: True)
Returns:
Fetched data
"""
# Step 1: Cost check (if enabled)
if check_cost:
cost_info = self.estimate_cost(symbols, schema, start, end)
print(f"[COST] Estimated cost: ${cost_info.get('estimated_cost_usd', 0):.2f}")
print(f"[COST] Estimated size: {cost_info.get('estimated_size_mb', 0):.2f} MB")
# Prompt for confirmation if cost is high
estimated_cost = cost_info.get('estimated_cost_usd', 0)
if estimated_cost > 10:
response = input(f"\nEstimated cost is ${estimated_cost:.2f}. Continue? (y/n): ")
if response.lower() != 'y':
print("[CANCELLED] Data fetch cancelled by user.")
sys.exit(0)
# Step 2: Validate dataset
dataset_range = self.validate_dataset_range()
print(f"[DATASET] Available range: {dataset_range.get('start_date')} to {dataset_range.get('end_date')}")
# Step 3: Fetch data with retries
for attempt in range(self.max_retries):
try:
print(f"\n[FETCH] Attempt {attempt + 1}/{self.max_retries}")
print(f"[FETCH] Fetching {symbols} ({schema}) from {start} to {end or 'now'}...")
# NOTE: In actual usage, this would call:
# data = mcp__databento__timeseries_get_range(
# dataset=self.dataset,
# symbols=symbols,
# schema=schema,
# start=start,
# end=end,
# stype_in=self.stype_in,
# stype_out="instrument_id",
# limit=limit
# )
# Simulate successful fetch
print("[SUCCESS] Data fetched successfully!")
return {
"data": [],
"record_count": 0,
"note": "Call mcp__databento__timeseries_get_range here"
}
except Exception as e:
print(f"[ERROR] Attempt {attempt + 1} failed: {str(e)}")
if attempt < self.max_retries - 1:
print(f"[RETRY] Waiting {self.retry_delay} seconds before retry...")
time.sleep(self.retry_delay)
else:
print("[FAILED] All retry attempts exhausted.")
raise
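# HEDGED SKETCH: the real fetch, using the official databento client in place of
# the MCP call noted above (assumes `pip install databento` and DATABENTO_API_KEY):
#
#   import databento as db
#   client = db.Historical()
#   store = client.timeseries.get_range(
#       dataset=self.dataset,
#       symbols=symbols,          # list or comma-separated string
#       schema=schema,            # e.g. "ohlcv-1h"
#       start=start,
#       end=end,
#       stype_in=self.stype_in,   # e.g. "continuous" for ES.c.0 / NQ.c.0
#       limit=limit,
#   )
#   df = store.to_df()            # pandas DataFrame, one row per bar
#   return {"data": df.to_dict("records"), "record_count": len(df)}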
def validate_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
"""
Validate fetched data quality.
Args:
data: Fetched data
Returns:
Validation report
"""
print("\n[VALIDATION] Running data quality checks...")
# NOTE: Actual validation would:
# - Check for timestamp gaps
# - Verify record counts
# - Validate price ranges
# - Check for duplicates
# Use scripts/validate_data.py for comprehensive validation
return {
"valid": True,
"record_count": data.get("record_count", 0),
"issues": [],
"note": "Use scripts/validate_data.py for detailed validation"
}
def export_csv(self, data: Dict[str, Any], output_path: str):
"""
Export data to CSV.
Args:
data: Data to export
output_path: Output file path
"""
print(f"\n[EXPORT] Saving data to {output_path}...")
# NOTE: Actual export would convert data to CSV format
# and write to file
print(f"[SUCCESS] Data saved to {output_path}")
def export_json(self, data: Dict[str, Any], output_path: str):
"""
Export data to JSON.
Args:
data: Data to export
output_path: Output file path
"""
print(f"\n[EXPORT] Saving data to {output_path}...")
with open(output_path, 'w') as f:
json.dump(data, f, indent=2)
print(f"[SUCCESS] Data saved to {output_path}")
def main():
"""Main entry point for CLI usage."""
parser = argparse.ArgumentParser(
description="Fetch OHLCV data from Databento with best practices"
)
parser.add_argument(
"--symbol",
"-s",
required=True,
help="Symbol or comma-separated symbols (e.g., ES.c.0 or ES.c.0,NQ.c.0)"
)
parser.add_argument(
"--schema",
choices=["ohlcv-1s", "ohlcv-1m", "ohlcv-1h", "ohlcv-1d", "ohlcv-eod"],
default="ohlcv-1h",
help="OHLCV schema (default: ohlcv-1h)"
)
parser.add_argument(
"--start",
required=True,
help="Start date (YYYY-MM-DD)"
)
parser.add_argument(
"--end",
help="End date (YYYY-MM-DD, optional)"
)
parser.add_argument(
"--limit",
type=int,
help="Maximum number of records (optional)"
)
parser.add_argument(
"--dataset",
default="GLBX.MDP3",
help="Dataset code (default: GLBX.MDP3)"
)
parser.add_argument(
"--stype-in",
default="continuous",
choices=["continuous", "raw_symbol", "instrument_id"],
help="Input symbol type (default: continuous)"
)
parser.add_argument(
"--output",
"-o",
help="Output file path (CSV or JSON based on extension)"
)
parser.add_argument(
"--no-cost-check",
action="store_true",
help="Skip cost estimation (not recommended)"
)
args = parser.parse_args()
# Create fetcher
fetcher = DatabentoOHLCVFetcher(
dataset=args.dataset,
stype_in=args.stype_in
)
try:
# Fetch data
data = fetcher.fetch_data(
symbols=args.symbol,
schema=args.schema,
start=args.start,
end=args.end,
limit=args.limit,
check_cost=not args.no_cost_check
)
# Validate data
validation = fetcher.validate_data(data)
print(f"\n[VALIDATION] Data is valid: {validation['valid']}")
print(f"[VALIDATION] Record count: {validation['record_count']}")
if validation['issues']:
print(f"[WARNING] Issues found: {validation['issues']}")
# Export if output specified
if args.output:
if args.output.endswith('.csv'):
fetcher.export_csv(data, args.output)
elif args.output.endswith('.json'):
fetcher.export_json(data, args.output)
else:
print("[WARNING] Unknown output format. Saving as JSON.")
fetcher.export_json(data, args.output + '.json')
print("\n[DONE] Fetch complete!")
except KeyboardInterrupt:
print("\n[CANCELLED] Fetch cancelled by user.")
sys.exit(1)
except Exception as e:
print(f"\n[ERROR] Fetch failed: {str(e)}")
sys.exit(1)
if __name__ == "__main__":
main()


@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
Databento Trading Session Filter
Filter market data by trading session (Asian/London/NY):
- Session detection using get_session_info
- Historical data filtering by session
- Session transition handling
- Session-specific statistics
Usage:
python session_filter.py --input data.json --session NY --output ny_session.json
python session_filter.py --input data.json --session London --stats
python session_filter.py --input data.json --sessions Asian,London --output combined.json
"""
import argparse
import json
import sys
from datetime import datetime, timezone, timedelta
from typing import Dict, List, Any, Optional, Tuple
from enum import Enum
class TradingSession(Enum):
"""Trading session definitions (in ET)."""
ASIAN = ("Asian", 18, 2) # 6pm - 2am ET
LONDON = ("London", 2, 8) # 2am - 8am ET
NY = ("NY", 8, 16) # 8am - 4pm ET
class SessionFilter:
"""Filters Databento market data by trading session."""
def __init__(self):
"""Initialize session filter."""
self.sessions = {
"Asian": TradingSession.ASIAN,
"London": TradingSession.LONDON,
"NY": TradingSession.NY
}
def get_current_session(self, timestamp: Optional[str] = None) -> str:
"""
Get trading session for a timestamp.
Args:
timestamp: ISO timestamp (optional, defaults to now)
Returns:
Session name (Asian, London, or NY)
"""
# NOTE: In actual usage, this would call:
# session_info = mcp__databento__get_session_info(timestamp=timestamp)
# return session_info["session"]
# For this template, simulate session detection
if timestamp:
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
else:
dt = datetime.now(timezone.utc)
# Convert to ET
et_hour = (dt.hour - 5) % 24 # Simplified ET conversion
# Determine session
if 18 <= et_hour or et_hour < 2:
return "Asian"
elif 2 <= et_hour < 8:
return "London"
else:
return "NY"
def is_in_session(
self,
timestamp_ns: int,
session: TradingSession
) -> bool:
"""
Check if timestamp falls within trading session.
Args:
timestamp_ns: Timestamp in nanoseconds
session: Trading session to check
Returns:
True if timestamp is in session
"""
# Convert nanoseconds to datetime
ts_seconds = timestamp_ns / 1_000_000_000
dt = datetime.fromtimestamp(ts_seconds, tz=timezone.utc)
# Convert to ET (simplified, doesn't handle DST)
et_offset = timedelta(hours=-5)
dt_et = dt + et_offset
hour = dt_et.hour
# Check if hour falls within session
_, start_hour, end_hour = session.value
if start_hour < end_hour:
# Session doesn't cross midnight
return start_hour <= hour < end_hour
else:
# Session crosses midnight (Asian session)
return hour >= start_hour or hour < end_hour
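# HEDGED SKETCH: the fixed -5h offset above ignores daylight saving time. A
# DST-aware variant using the standard-library zoneinfo module (Python 3.9+)
# would look roughly like this (illustrative helper name, not used elsewhere):
#
#   from zoneinfo import ZoneInfo
#
#   def is_in_session_dst(self, timestamp_ns: int, session: TradingSession) -> bool:
#       dt = datetime.fromtimestamp(timestamp_ns / 1_000_000_000, tz=timezone.utc)
#       hour = dt.astimezone(ZoneInfo("America/New_York")).hour
#       _, start_hour, end_hour = session.value
#       if start_hour < end_hour:
#           return start_hour <= hour < end_hour
#       return hour >= start_hour or hour < end_hour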
def filter_by_session(
self,
data: List[Dict[str, Any]],
sessions: List[str]
) -> List[Dict[str, Any]]:
"""
Filter data to include only specified sessions.
Args:
data: List of records
sessions: List of session names to include
Returns:
Filtered data
"""
print(f"[FILTER] Filtering {len(data)} records for sessions: {', '.join(sessions)}")
session_enums = [self.sessions[s] for s in sessions]
filtered = []
for record in data:
# Extract timestamp
ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
if not ts_ns:
continue
# Check if in any of the specified sessions
for session in session_enums:
if self.is_in_session(int(ts_ns), session):
filtered.append(record)
break
print(f"[FILTER] Kept {len(filtered)} records ({len(filtered)/len(data)*100:.1f}%)")
return filtered
def calculate_session_stats(
self,
data: List[Dict[str, Any]]
) -> Dict[str, Any]:
"""
Calculate statistics by trading session.
Args:
data: List of records
Returns:
Session statistics
"""
print(f"[STATS] Calculating session statistics for {len(data)} records...")
stats = {
"Asian": {"count": 0, "volume": 0, "trades": 0},
"London": {"count": 0, "volume": 0, "trades": 0},
"NY": {"count": 0, "volume": 0, "trades": 0}
}
for record in data:
ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
if not ts_ns:
continue
# Determine session
for session_name, session_enum in self.sessions.items():
if self.is_in_session(int(ts_ns), session_enum):
stats[session_name]["count"] += 1
# Add volume if available
if "volume" in record:
stats[session_name]["volume"] += record["volume"]
# Count trades
if "size" in record: # Trade record
stats[session_name]["trades"] += 1
break
# Calculate percentages
total_count = sum(s["count"] for s in stats.values())
for session_stats in stats.values():
if total_count > 0:
session_stats["percentage"] = (session_stats["count"] / total_count) * 100
else:
session_stats["percentage"] = 0
return stats
def filter_session_transitions(
self,
data: List[Dict[str, Any]],
minutes_before: int = 30,
minutes_after: int = 30
) -> List[Dict[str, Any]]:
"""
Filter data to include only session transitions (handoffs).
Args:
data: List of records
minutes_before: Minutes before transition to include
minutes_after: Minutes after transition to include
Returns:
Filtered data around session transitions
"""
print(f"[FILTER] Extracting session transitions ({minutes_before}m before, {minutes_after}m after)...")
# Session transition times (in ET)
transitions = [
2, # Asian → London (2am ET)
8, # London → NY (8am ET)
16, # NY → Post-market
18, # Post-market → Asian (6pm ET)
]
filtered = []
for record in data:
ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
if not ts_ns:
continue
# Convert to ET hour
ts_seconds = int(ts_ns) / 1_000_000_000
dt = datetime.fromtimestamp(ts_seconds, tz=timezone.utc)
et_offset = timedelta(hours=-5)
dt_et = dt + et_offset
# Check if near any transition
for transition_hour in transitions:
transition_dt = dt_et.replace(hour=transition_hour, minute=0, second=0, microsecond=0)
# Signed offset from the transition (negative = before it, positive = after it)
offset_seconds = (dt_et - transition_dt).total_seconds()
# Include if within the asymmetric window around the transition
if -minutes_before * 60 <= offset_seconds <= minutes_after * 60:
filtered.append(record)
break
print(f"[FILTER] Found {len(filtered)} records near session transitions")
return filtered
def print_session_stats(self, stats: Dict[str, Any]):
"""Print session statistics to console."""
print("\n" + "=" * 60)
print("SESSION STATISTICS")
print("=" * 60)
for session_name in ["Asian", "London", "NY"]:
session_stats = stats[session_name]
print(f"\n{session_name} Session:")
print(f" Records: {session_stats['count']:,} ({session_stats['percentage']:.1f}%)")
if session_stats['volume'] > 0:
print(f" Volume: {session_stats['volume']:,}")
if session_stats['trades'] > 0:
print(f" Trades: {session_stats['trades']:,}")
print("\n" + "=" * 60)
def main():
"""Main entry point for CLI usage."""
parser = argparse.ArgumentParser(
description="Filter Databento data by trading session"
)
parser.add_argument(
"--input",
"-i",
required=True,
help="Input data file (JSON)"
)
parser.add_argument(
"--session",
"--sessions",
help="Session(s) to filter (Asian, London, NY). Comma-separated for multiple."
)
parser.add_argument(
"--transitions",
action="store_true",
help="Filter for session transition periods only"
)
parser.add_argument(
"--minutes-before",
type=int,
default=30,
help="Minutes before transition (default: 30)"
)
parser.add_argument(
"--minutes-after",
type=int,
default=30,
help="Minutes after transition (default: 30)"
)
parser.add_argument(
"--stats",
action="store_true",
help="Calculate and display session statistics"
)
parser.add_argument(
"--output",
"-o",
help="Output file for filtered data (JSON)"
)
args = parser.parse_args()
# Load data
print(f"[LOAD] Loading data from {args.input}...")
with open(args.input, 'r') as f:
data = json.load(f)
# Handle different data formats
if isinstance(data, dict) and "data" in data:
data = data["data"]
# Create filter
session_filter = SessionFilter()
# Calculate stats if requested
if args.stats:
stats = session_filter.calculate_session_stats(data)
session_filter.print_session_stats(stats)
# Filter data
filtered_data = data
if args.transitions:
# Filter for session transitions
filtered_data = session_filter.filter_session_transitions(
filtered_data,
minutes_before=args.minutes_before,
minutes_after=args.minutes_after
)
elif args.session:
# Filter by specific session(s)
sessions = [s.strip() for s in args.session.split(',')]
# Validate sessions
for session in sessions:
if session not in ["Asian", "London", "NY"]:
print(f"[ERROR] Invalid session: {session}")
print("[ERROR] Valid sessions: Asian, London, NY")
sys.exit(1)
filtered_data = session_filter.filter_by_session(filtered_data, sessions)
# Save filtered data if output specified
if args.output:
print(f"\n[SAVE] Saving {len(filtered_data)} filtered records to {args.output}...")
output_data = {
"data": filtered_data,
"metadata": {
"original_count": len(data),
"filtered_count": len(filtered_data),
"filter_type": "transitions" if args.transitions else "sessions",
"sessions": args.session.split(',') if args.session else None
}
}
with open(args.output, 'w') as f:
json.dump(output_data, f, indent=2)
print(f"[SUCCESS] Filtered data saved!")
print("\n[DONE] Session filtering complete!")
if __name__ == "__main__":
main()


@@ -0,0 +1,496 @@
#!/usr/bin/env python3
"""
Databento Data Quality Validator
Validates market data quality to catch issues early:
- Timestamp gap detection
- Record count verification
- Price range validation (no negative prices, outliers)
- Duplicate timestamp detection
- Summary quality report
Usage:
python validate_data.py --input data.json
python validate_data.py --input data.csv --schema ohlcv-1h
python validate_data.py --input data.json --max-gap-minutes 60 --report report.json
"""
import argparse
import json
import sys
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional, Tuple
from collections import defaultdict
class DataValidator:
"""Validates Databento market data quality."""
def __init__(
self,
schema: str,
max_gap_minutes: int = 60,
price_outlier_std: float = 10.0
):
"""
Initialize validator.
Args:
schema: Data schema (ohlcv-1h, trades, mbp-1, etc.)
max_gap_minutes: Maximum acceptable gap in minutes
price_outlier_std: Standard deviations for outlier detection
"""
self.schema = schema
self.max_gap_seconds = max_gap_minutes * 60
self.price_outlier_std = price_outlier_std
self.issues: List[Dict[str, Any]] = []
def validate(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Run all validation checks on data.
Args:
data: List of records to validate
Returns:
Validation report
"""
print(f"[VALIDATION] Running quality checks on {len(data)} records...")
report = {
"total_records": len(data),
"valid": True,
"checks": {}
}
if not data:
print("[WARNING] No data to validate!")
report["valid"] = False
return report
# Run all validation checks
report["checks"]["timestamp_gaps"] = self.check_timestamp_gaps(data)
report["checks"]["duplicates"] = self.check_duplicates(data)
report["checks"]["price_range"] = self.check_price_range(data)
report["checks"]["record_count"] = self.check_record_count(data)
report["checks"]["data_completeness"] = self.check_completeness(data)
# Overall validity
report["valid"] = all(
check.get("valid", True)
for check in report["checks"].values()
)
report["issues"] = self.issues
return report
def check_timestamp_gaps(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Check for unexpected gaps in timestamps.
Args:
data: List of records
Returns:
Gap check report
"""
print("[CHECK] Checking for timestamp gaps...")
gaps = []
timestamps = self._extract_timestamps(data)
if len(timestamps) < 2:
return {"valid": True, "gaps": [], "note": "Insufficient data for gap detection"}
# Sort timestamps
sorted_ts = sorted(timestamps)
# Check gaps between consecutive timestamps
for i in range(len(sorted_ts) - 1):
gap_ns = sorted_ts[i + 1] - sorted_ts[i]
gap_seconds = gap_ns / 1_000_000_000
if gap_seconds > self.max_gap_seconds:
gap_info = {
"index": i,
"gap_seconds": gap_seconds,
"gap_minutes": gap_seconds / 60,
"before": self._format_timestamp(sorted_ts[i]),
"after": self._format_timestamp(sorted_ts[i + 1])
}
gaps.append(gap_info)
self.issues.append({
"type": "timestamp_gap",
"severity": "warning",
"message": f"Gap of {gap_seconds / 60:.1f} minutes detected",
**gap_info
})
valid = len(gaps) == 0
print(f"[CHECK] Found {len(gaps)} gaps > {self.max_gap_seconds / 60} minutes")
return {
"valid": valid,
"gaps_found": len(gaps),
"gaps": gaps[:10] if gaps else [], # Limit to first 10 for report
"total_gaps": len(gaps)
}
def check_duplicates(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Check for duplicate timestamps.
Args:
data: List of records
Returns:
Duplicate check report
"""
print("[CHECK] Checking for duplicate timestamps...")
timestamps = self._extract_timestamps(data)
timestamp_counts = defaultdict(int)
for ts in timestamps:
timestamp_counts[ts] += 1
duplicates = {ts: count for ts, count in timestamp_counts.items() if count > 1}
if duplicates:
for ts, count in list(duplicates.items())[:10]: # Limit to first 10
self.issues.append({
"type": "duplicate_timestamp",
"severity": "error",
"timestamp": self._format_timestamp(ts),
"count": count,
"message": f"Timestamp appears {count} times"
})
valid = len(duplicates) == 0
print(f"[CHECK] Found {len(duplicates)} duplicate timestamps")
return {
"valid": valid,
"duplicates_found": len(duplicates),
"duplicate_timestamps": len(duplicates)
}
def check_price_range(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Check for invalid or outlier prices.
Args:
data: List of records
Returns:
Price range check report
"""
print("[CHECK] Checking price ranges...")
prices = self._extract_prices(data)
if not prices:
return {"valid": True, "note": "No price data to validate"}
# Check for negative prices
negative_prices = [p for p in prices if p < 0]
# Check for zero prices (unusual for ES/NQ)
zero_prices = [p for p in prices if p == 0]
# Calculate statistics for outlier detection
if len(prices) > 1:
mean_price = sum(prices) / len(prices)
variance = sum((p - mean_price) ** 2 for p in prices) / len(prices)
std_dev = variance ** 0.5
# Detect outliers (> N standard deviations from mean)
outliers = []
for p in prices:
if abs(p - mean_price) > (self.price_outlier_std * std_dev):
outliers.append(p)
if len(outliers) <= 10: # Limit issues
self.issues.append({
"type": "price_outlier",
"severity": "warning",
"price": p,
"mean": mean_price,
"std_dev": std_dev,
"message": f"Price {p:.2f} is {abs(p - mean_price) / std_dev:.1f} std devs from mean"
})
else:
outliers = []
mean_price = prices[0] if prices else 0
std_dev = 0
# Report negative prices as errors
for p in negative_prices[:10]: # Limit to first 10
self.issues.append({
"type": "negative_price",
"severity": "error",
"price": p,
"message": f"Negative price detected: {p}"
})
valid = len(negative_prices) == 0 and len(zero_prices) == 0
print(f"[CHECK] Price range: {min(prices):.2f} to {max(prices):.2f}")
print(f"[CHECK] Negative prices: {len(negative_prices)}, Zero prices: {len(zero_prices)}, Outliers: {len(outliers)}")
return {
"valid": valid,
"min_price": min(prices),
"max_price": max(prices),
"mean_price": mean_price,
"std_dev": std_dev,
"negative_prices": len(negative_prices),
"zero_prices": len(zero_prices),
"outliers": len(outliers)
}
def check_record_count(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Verify expected record count.
Args:
data: List of records
Returns:
Record count check report
"""
print(f"[CHECK] Verifying record count: {len(data)} records")
# For OHLCV data, can estimate expected count based on timeframe
expected_count = self._estimate_expected_count(data)
valid = True
if expected_count and abs(len(data) - expected_count) > (expected_count * 0.1):
# More than 10% deviation
valid = False
self.issues.append({
"type": "unexpected_record_count",
"severity": "warning",
"actual": len(data),
"expected": expected_count,
"message": f"Expected ~{expected_count} records, got {len(data)}"
})
return {
"valid": valid,
"actual_count": len(data),
"expected_count": expected_count,
"note": "Expected count is estimated based on schema and date range"
}
def check_completeness(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Check data completeness (required fields present).
Args:
data: List of records
Returns:
Completeness check report
"""
print("[CHECK] Checking data completeness...")
if not data:
return {"valid": False, "note": "No data"}
# Check required fields based on schema
required_fields = self._get_required_fields()
missing_fields = defaultdict(int)
for record in data[:100]: # Sample first 100 records
for field in required_fields:
if field not in record or record[field] is None:
missing_fields[field] += 1
if missing_fields:
for field, count in missing_fields.items():
self.issues.append({
"type": "missing_field",
"severity": "error",
"field": field,
"missing_count": count,
"message": f"Field '{field}' missing in {count} records (sampled)"
})
valid = len(missing_fields) == 0
return {
"valid": valid,
"missing_fields": dict(missing_fields) if missing_fields else {}
}
def _extract_timestamps(self, data: List[Dict[str, Any]]) -> List[int]:
"""Extract timestamps from records."""
timestamps = []
for record in data:
# Try different timestamp field names
ts = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
if ts:
timestamps.append(int(ts))
return timestamps
def _extract_prices(self, data: List[Dict[str, Any]]) -> List[float]:
"""Extract prices from records."""
prices = []
for record in data:
# For OHLCV, use close price
if "close" in record:
# Convert from fixed-point if needed
price = record["close"]
if isinstance(price, int) and price > 1_000_000:
price = price / 1_000_000_000 # Fixed-point conversion
prices.append(float(price))
# For trades/mbp, use price field
elif "price" in record:
price = record["price"]
if isinstance(price, int) and price > 1_000_000:
price = price / 1_000_000_000
prices.append(float(price))
return prices
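# Databento encodes prices as fixed-point integers with 1e-9 scale, so e.g.
# 4_500_250_000_000 decodes to 4500.25. The `> 1_000_000` test above is only a
# heuristic to tell encoded integers from already-decoded floats; data exported
# via a DataFrame is usually decoded already.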
def _format_timestamp(self, ts_ns: int) -> str:
"""Format nanosecond timestamp to readable string."""
ts_seconds = ts_ns / 1_000_000_000
dt = datetime.fromtimestamp(ts_seconds)
return dt.strftime("%Y-%m-%d %H:%M:%S")
def _estimate_expected_count(self, data: List[Dict[str, Any]]) -> Optional[int]:
"""Estimate expected record count based on schema and date range."""
# This is a simplified estimation
# In practice, would calculate based on actual date range
if "ohlcv" in self.schema:
if "1h" in self.schema:
return None # ~24 records per day per symbol
elif "1d" in self.schema:
return None # ~1 record per day per symbol
return None
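# HEDGED SKETCH: a data-driven estimate derived from the observed timestamp span
# rather than hard-coded per-schema counts (assumes continuous trading; real
# futures sessions have daily and weekend breaks, so treat it as an upper bound):
#
#   timestamps = self._extract_timestamps(data)
#   if len(timestamps) >= 2:
#       span_seconds = (max(timestamps) - min(timestamps)) / 1_000_000_000
#       bar_seconds = {"ohlcv-1s": 1, "ohlcv-1m": 60, "ohlcv-1h": 3600,
#                      "ohlcv-1d": 86400}.get(self.schema)
#       if bar_seconds:
#           return int(span_seconds / bar_seconds) + 1
#   return None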
def _get_required_fields(self) -> List[str]:
"""Get required fields for schema."""
base_fields = ["ts_event", "ts_recv"]
if "ohlcv" in self.schema:
return base_fields + ["open", "high", "low", "close", "volume"]
elif self.schema == "trades":
return base_fields + ["price", "size"]
elif "mbp" in self.schema:
return base_fields + ["bid_px_00", "ask_px_00", "bid_sz_00", "ask_sz_00"]
else:
return base_fields
def print_report(self, report: Dict[str, Any]):
"""Print validation report to console."""
print("\n" + "=" * 60)
print("DATA VALIDATION REPORT")
print("=" * 60)
print(f"\nTotal Records: {report['total_records']}")
print(f"Overall Valid: {'✓ YES' if report['valid'] else '✗ NO'}")
print("\n" + "-" * 60)
print("CHECK RESULTS")
print("-" * 60)
for check_name, check_result in report["checks"].items():
status = "" if check_result.get("valid", True) else ""
print(f"\n{status} {check_name.replace('_', ' ').title()}")
for key, value in check_result.items():
if key != "valid" and key != "gaps":
print(f" {key}: {value}")
if report["issues"]:
print("\n" + "-" * 60)
print(f"ISSUES FOUND ({len(report['issues'])})")
print("-" * 60)
for i, issue in enumerate(report["issues"][:20], 1): # Limit to 20
print(f"\n{i}. [{issue['severity'].upper()}] {issue['type']}")
print(f" {issue['message']}")
if len(report["issues"]) > 20:
print(f"\n... and {len(report['issues']) - 20} more issues")
print("\n" + "=" * 60)
def main():
"""Main entry point for CLI usage."""
parser = argparse.ArgumentParser(
description="Validate Databento market data quality"
)
parser.add_argument(
"--input",
"-i",
required=True,
help="Input data file (JSON or CSV)"
)
parser.add_argument(
"--schema",
default="ohlcv-1h",
help="Data schema (default: ohlcv-1h)"
)
parser.add_argument(
"--max-gap-minutes",
type=int,
default=60,
help="Maximum acceptable gap in minutes (default: 60)"
)
parser.add_argument(
"--price-outlier-std",
type=float,
default=10.0,
help="Standard deviations for outlier detection (default: 10.0)"
)
parser.add_argument(
"--report",
"-r",
help="Save report to JSON file"
)
args = parser.parse_args()
# Load data
print(f"[LOAD] Loading data from {args.input}...")
with open(args.input, 'r') as f:
data = json.load(f)
# Handle different data formats
if isinstance(data, dict) and "data" in data:
data = data["data"]
# Create validator
validator = DataValidator(
schema=args.schema,
max_gap_minutes=args.max_gap_minutes,
price_outlier_std=args.price_outlier_std
)
# Run validation
report = validator.validate(data)
# Print report
validator.print_report(report)
# Save report if requested
if args.report:
print(f"\n[SAVE] Saving report to {args.report}...")
with open(args.report, 'w') as f:
json.dump(report, f, indent=2)
print(f"[SUCCESS] Report saved!")
# Exit with appropriate code
sys.exit(0 if report["valid"] else 1)
if __name__ == "__main__":
main()