Initial commit
skills/databento/scripts/fetch_ohlcv.py | 345 lines (new file)
@@ -0,0 +1,345 @@
#!/usr/bin/env python3
"""
Databento OHLCV Data Fetcher

Standard pattern for fetching OHLCV data with built-in best practices:
- Automatic cost estimation before fetch
- Error handling with retries
- Post-fetch data validation
- Export options (CSV/pandas)

Usage:
    python fetch_ohlcv.py --symbol ES.c.0 --schema ohlcv-1h --start 2024-01-01 --end 2024-01-31
    python fetch_ohlcv.py --symbol NQ.c.0 --schema ohlcv-1d --start 2024-01-01 --limit 100
    python fetch_ohlcv.py --symbol ES.c.0,NQ.c.0 --schema ohlcv-1h --start 2024-01-01 --output data.csv
"""

import argparse
import csv
import json
import sys
import time
from datetime import datetime
from typing import Optional, Dict, Any


class DatabentoOHLCVFetcher:
    """Fetches OHLCV data from Databento with best practices built-in."""

    def __init__(self, dataset: str = "GLBX.MDP3", stype_in: str = "continuous"):
        """
        Initialize fetcher.

        Args:
            dataset: Dataset code (default: GLBX.MDP3 for ES/NQ)
            stype_in: Input symbol type (default: continuous)
        """
        self.dataset = dataset
        self.stype_in = stype_in
        self.max_retries = 3
        self.retry_delay = 2  # seconds

    def estimate_cost(
        self,
        symbols: str,
        schema: str,
        start: str,
        end: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Estimate cost before fetching data.

        Args:
            symbols: Comma-separated symbol list
            schema: Data schema (e.g., ohlcv-1h)
            start: Start date (YYYY-MM-DD)
            end: End date (optional)

        Returns:
            Cost estimation result
        """
        print(f"[COST CHECK] Estimating cost for {symbols} ({schema})...")

        # NOTE: In actual usage, this would call the MCP tool:
        # mcp__databento__metadata_get_cost(
        #     dataset=self.dataset,
        #     start=start,
        #     end=end,
        #     symbols=symbols,
        #     schema=schema,
        #     stype_in=self.stype_in
        # )

        # For this template, we simulate the response
        print("[NOTE] This template script demonstrates the pattern.")
        print("[NOTE] In actual usage, integrate with MCP tools directly.")

        return {
            "estimated_cost_usd": 0.0,
            "estimated_size_mb": 0.0,
            "note": "Call mcp__databento__metadata_get_cost here"
        }
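
    def estimate_cost_via_client(
        self,
        symbols: str,
        schema: str,
        start: str,
        end: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Illustrative sketch of the same cost check done with the official
        `databento` Python client instead of the MCP tool.

        Assumptions: the `databento` package is installed and
        DATABENTO_API_KEY is set in the environment; verify the exact
        metadata.get_cost() signature against the client documentation
        before relying on this.
        """
        import databento as db  # assumed optional dependency

        client = db.Historical()  # reads DATABENTO_API_KEY from the environment
        cost_usd = client.metadata.get_cost(
            dataset=self.dataset,
            symbols=symbols,
            schema=schema,
            start=start,
            end=end,
            stype_in=self.stype_in,
        )
        return {"estimated_cost_usd": float(cost_usd)}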

    def validate_dataset_range(self) -> Dict[str, str]:
        """
        Validate dataset availability.

        Returns:
            Dataset date range
        """
        print(f"[VALIDATION] Checking dataset availability for {self.dataset}...")

        # NOTE: In actual usage, this would call:
        # mcp__databento__metadata_get_dataset_range(dataset=self.dataset)

        return {
            "start_date": "2000-01-01",
            "end_date": datetime.now().strftime("%Y-%m-%d"),
            "note": "Call mcp__databento__metadata_get_dataset_range here"
        }

    def fetch_data(
        self,
        symbols: str,
        schema: str,
        start: str,
        end: Optional[str] = None,
        limit: Optional[int] = None,
        check_cost: bool = True
    ) -> Dict[str, Any]:
        """
        Fetch OHLCV data with retries and error handling.

        Args:
            symbols: Comma-separated symbol list
            schema: Data schema (e.g., ohlcv-1h, ohlcv-1d)
            start: Start date (YYYY-MM-DD)
            end: End date (optional)
            limit: Maximum number of records (optional)
            check_cost: Whether to check cost before fetching (default: True)

        Returns:
            Fetched data
        """
        # Step 1: Cost check (if enabled)
        if check_cost:
            cost_info = self.estimate_cost(symbols, schema, start, end)
            print(f"[COST] Estimated cost: ${cost_info.get('estimated_cost_usd', 0):.2f}")
            print(f"[COST] Estimated size: {cost_info.get('estimated_size_mb', 0):.2f} MB")

            # Prompt for confirmation if cost is high
            estimated_cost = cost_info.get('estimated_cost_usd', 0)
            if estimated_cost > 10:
                response = input(f"\nEstimated cost is ${estimated_cost:.2f}. Continue? (y/n): ")
                if response.lower() != 'y':
                    print("[CANCELLED] Data fetch cancelled by user.")
                    sys.exit(0)

        # Step 2: Validate dataset
        dataset_range = self.validate_dataset_range()
        print(f"[DATASET] Available range: {dataset_range.get('start_date')} to {dataset_range.get('end_date')}")

        # Step 3: Fetch data with retries
        for attempt in range(self.max_retries):
            try:
                print(f"\n[FETCH] Attempt {attempt + 1}/{self.max_retries}")
                print(f"[FETCH] Fetching {symbols} ({schema}) from {start} to {end or 'now'}...")

                # NOTE: In actual usage, this would call:
                # data = mcp__databento__timeseries_get_range(
                #     dataset=self.dataset,
                #     symbols=symbols,
                #     schema=schema,
                #     start=start,
                #     end=end,
                #     stype_in=self.stype_in,
                #     stype_out="instrument_id",
                #     limit=limit
                # )

                # Simulate successful fetch
                print("[SUCCESS] Data fetched successfully!")
                return {
                    "data": [],
                    "record_count": 0,
                    "note": "Call mcp__databento__timeseries_get_range here"
                }

            except Exception as e:
                print(f"[ERROR] Attempt {attempt + 1} failed: {str(e)}")

                if attempt < self.max_retries - 1:
                    print(f"[RETRY] Waiting {self.retry_delay} seconds before retry...")
                    time.sleep(self.retry_delay)
                else:
                    print("[FAILED] All retry attempts exhausted.")
                    raise
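
    def _backoff_delay(self, attempt: int) -> float:
        """
        Illustrative sketch: exponential backoff with jitter as an
        alternative to the fixed retry_delay used in fetch_data(). The
        helper is not wired into the retry loop; swapping it in is an
        assumption, not part of the MCP pattern itself.
        """
        import random

        return self.retry_delay * (2 ** attempt) + random.uniform(0, 1)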

    def validate_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate fetched data quality.

        Args:
            data: Fetched data

        Returns:
            Validation report
        """
        print("\n[VALIDATION] Running data quality checks...")

        # NOTE: Actual validation would:
        # - Check for timestamp gaps
        # - Verify record counts
        # - Validate price ranges
        # - Check for duplicates

        # Use scripts/validate_data.py for comprehensive validation

        return {
            "valid": True,
            "record_count": data.get("record_count", 0),
            "issues": [],
            "note": "Use scripts/validate_data.py for detailed validation"
        }

    def export_csv(self, data: Dict[str, Any], output_path: str):
        """
        Export data to CSV.

        Args:
            data: Data to export
            output_path: Output file path
        """
        print(f"\n[EXPORT] Saving data to {output_path}...")

        # Write one CSV row per record; header columns come from the first record
        records = data.get("data", [])
        with open(output_path, 'w', newline='') as f:
            if records:
                writer = csv.DictWriter(f, fieldnames=list(records[0].keys()))
                writer.writeheader()
                writer.writerows(records)

        print(f"[SUCCESS] Data saved to {output_path}")

    def export_json(self, data: Dict[str, Any], output_path: str):
        """
        Export data to JSON.

        Args:
            data: Data to export
            output_path: Output file path
        """
        print(f"\n[EXPORT] Saving data to {output_path}...")

        with open(output_path, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"[SUCCESS] Data saved to {output_path}")
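
    def to_dataframe(self, data: Dict[str, Any]):
        """
        Illustrative sketch of the "pandas" export option mentioned in the
        module docstring. Assumes pandas is installed and that data["data"]
        is a list of per-record dicts; not required by the CSV/JSON paths.
        """
        import pandas as pd  # assumed optional dependency

        df = pd.DataFrame(data.get("data", []))
        if "ts_event" in df.columns:
            # Databento timestamps are nanoseconds since the UNIX epoch
            df["ts_event"] = pd.to_datetime(df["ts_event"], unit="ns", utc=True)
        return df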


def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description="Fetch OHLCV data from Databento with best practices"
    )

    parser.add_argument(
        "--symbol",
        "-s",
        required=True,
        help="Symbol or comma-separated symbols (e.g., ES.c.0 or ES.c.0,NQ.c.0)"
    )

    parser.add_argument(
        "--schema",
        choices=["ohlcv-1s", "ohlcv-1m", "ohlcv-1h", "ohlcv-1d", "ohlcv-eod"],
        default="ohlcv-1h",
        help="OHLCV schema (default: ohlcv-1h)"
    )

    parser.add_argument(
        "--start",
        required=True,
        help="Start date (YYYY-MM-DD)"
    )

    parser.add_argument(
        "--end",
        help="End date (YYYY-MM-DD, optional)"
    )

    parser.add_argument(
        "--limit",
        type=int,
        help="Maximum number of records (optional)"
    )

    parser.add_argument(
        "--dataset",
        default="GLBX.MDP3",
        help="Dataset code (default: GLBX.MDP3)"
    )

    parser.add_argument(
        "--stype-in",
        default="continuous",
        choices=["continuous", "raw_symbol", "instrument_id"],
        help="Input symbol type (default: continuous)"
    )

    parser.add_argument(
        "--output",
        "-o",
        help="Output file path (CSV or JSON based on extension)"
    )

    parser.add_argument(
        "--no-cost-check",
        action="store_true",
        help="Skip cost estimation (not recommended)"
    )

    args = parser.parse_args()

    # Create fetcher
    fetcher = DatabentoOHLCVFetcher(
        dataset=args.dataset,
        stype_in=args.stype_in
    )

    try:
        # Fetch data
        data = fetcher.fetch_data(
            symbols=args.symbol,
            schema=args.schema,
            start=args.start,
            end=args.end,
            limit=args.limit,
            check_cost=not args.no_cost_check
        )

        # Validate data
        validation = fetcher.validate_data(data)
        print(f"\n[VALIDATION] Data is valid: {validation['valid']}")
        print(f"[VALIDATION] Record count: {validation['record_count']}")

        if validation['issues']:
            print(f"[WARNING] Issues found: {validation['issues']}")

        # Export if output specified
        if args.output:
            if args.output.endswith('.csv'):
                fetcher.export_csv(data, args.output)
            elif args.output.endswith('.json'):
                fetcher.export_json(data, args.output)
            else:
                print("[WARNING] Unknown output format. Saving as JSON.")
                fetcher.export_json(data, args.output + '.json')

        print("\n[DONE] Fetch complete!")

    except KeyboardInterrupt:
        print("\n[CANCELLED] Fetch cancelled by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\n[ERROR] Fetch failed: {str(e)}")
        sys.exit(1)


if __name__ == "__main__":
    main()
skills/databento/scripts/session_filter.py | 388 lines (new file)
@@ -0,0 +1,388 @@
#!/usr/bin/env python3
"""
Databento Trading Session Filter

Filter market data by trading session (Asian/London/NY):
- Session detection using get_session_info
- Historical data filtering by session
- Session transition handling
- Session-specific statistics

Usage:
    python session_filter.py --input data.json --session NY --output ny_session.json
    python session_filter.py --input data.json --session London --stats
    python session_filter.py --input data.json --sessions Asian,London --output combined.json
"""

import argparse
import json
import sys
from datetime import datetime, timezone, timedelta
from typing import Dict, List, Any, Optional
from enum import Enum


class TradingSession(Enum):
    """Trading session definitions (in ET)."""
    ASIAN = ("Asian", 18, 2)   # 6pm - 2am ET
    LONDON = ("London", 2, 8)  # 2am - 8am ET
    NY = ("NY", 8, 16)         # 8am - 4pm ET


class SessionFilter:
    """Filters Databento market data by trading session."""

    def __init__(self):
        """Initialize session filter."""
        self.sessions = {
            "Asian": TradingSession.ASIAN,
            "London": TradingSession.LONDON,
            "NY": TradingSession.NY
        }

    def get_current_session(self, timestamp: Optional[str] = None) -> str:
        """
        Get trading session for a timestamp.

        Args:
            timestamp: ISO timestamp (optional, defaults to now)

        Returns:
            Session name (Asian, London, or NY)
        """
        # NOTE: In actual usage, this would call:
        # session_info = mcp__databento__get_session_info(timestamp=timestamp)
        # return session_info["session"]

        # For this template, simulate session detection
        if timestamp:
            dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
        else:
            dt = datetime.now(timezone.utc)

        # Convert to ET
        et_hour = (dt.hour - 5) % 24  # Simplified ET conversion

        # Determine session
        if 18 <= et_hour or et_hour < 2:
            return "Asian"
        elif 2 <= et_hour < 8:
            return "London"
        else:
            return "NY"

    def is_in_session(
        self,
        timestamp_ns: int,
        session: TradingSession
    ) -> bool:
        """
        Check if timestamp falls within trading session.

        Args:
            timestamp_ns: Timestamp in nanoseconds
            session: Trading session to check

        Returns:
            True if timestamp is in session
        """
        # Convert nanoseconds to datetime
        ts_seconds = timestamp_ns / 1_000_000_000
        dt = datetime.fromtimestamp(ts_seconds, tz=timezone.utc)

        # Convert to ET (simplified, doesn't handle DST)
        et_offset = timedelta(hours=-5)
        dt_et = dt + et_offset

        hour = dt_et.hour

        # Check if hour falls within session
        _, start_hour, end_hour = session.value

        if start_hour < end_hour:
            # Session doesn't cross midnight
            return start_hour <= hour < end_hour
        else:
            # Session crosses midnight (Asian session)
            return hour >= start_hour or hour < end_hour
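
    @staticmethod
    def to_eastern(dt_utc: datetime) -> datetime:
        """
        Illustrative sketch: DST-aware conversion to US Eastern time. The
        fixed UTC-5 offset used above is off by one hour while daylight
        saving time is in effect; the stdlib zoneinfo module (Python 3.9+)
        accounts for it. Assumes IANA tzdata is available on the host.
        """
        from zoneinfo import ZoneInfo  # Python 3.9+ standard library

        return dt_utc.astimezone(ZoneInfo("America/New_York"))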

    def filter_by_session(
        self,
        data: List[Dict[str, Any]],
        sessions: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Filter data to include only specified sessions.

        Args:
            data: List of records
            sessions: List of session names to include

        Returns:
            Filtered data
        """
        print(f"[FILTER] Filtering {len(data)} records for sessions: {', '.join(sessions)}")

        session_enums = [self.sessions[s] for s in sessions]
        filtered = []

        for record in data:
            # Extract timestamp
            ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")

            if not ts_ns:
                continue

            # Check if in any of the specified sessions
            for session in session_enums:
                if self.is_in_session(int(ts_ns), session):
                    filtered.append(record)
                    break

        if data:
            print(f"[FILTER] Kept {len(filtered)} records ({len(filtered) / len(data) * 100:.1f}%)")
        return filtered

    def calculate_session_stats(
        self,
        data: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Calculate statistics by trading session.

        Args:
            data: List of records

        Returns:
            Session statistics
        """
        print(f"[STATS] Calculating session statistics for {len(data)} records...")

        stats = {
            "Asian": {"count": 0, "volume": 0, "trades": 0},
            "London": {"count": 0, "volume": 0, "trades": 0},
            "NY": {"count": 0, "volume": 0, "trades": 0}
        }

        for record in data:
            ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")

            if not ts_ns:
                continue

            # Determine session
            for session_name, session_enum in self.sessions.items():
                if self.is_in_session(int(ts_ns), session_enum):
                    stats[session_name]["count"] += 1

                    # Add volume if available
                    if "volume" in record:
                        stats[session_name]["volume"] += record["volume"]

                    # Count trades
                    if "size" in record:  # Trade record
                        stats[session_name]["trades"] += 1

                    break

        # Calculate percentages
        total_count = sum(s["count"] for s in stats.values())
        for session_stats in stats.values():
            if total_count > 0:
                session_stats["percentage"] = (session_stats["count"] / total_count) * 100
            else:
                session_stats["percentage"] = 0

        return stats

    def filter_session_transitions(
        self,
        data: List[Dict[str, Any]],
        minutes_before: int = 30,
        minutes_after: int = 30
    ) -> List[Dict[str, Any]]:
        """
        Filter data to include only session transitions (handoffs).

        Args:
            data: List of records
            minutes_before: Minutes before transition to include
            minutes_after: Minutes after transition to include

        Returns:
            Filtered data around session transitions
        """
        print(f"[FILTER] Extracting session transitions ({minutes_before}m before, {minutes_after}m after)...")

        # Session transition times (in ET)
        transitions = [
            2,   # Asian → London (2am ET)
            8,   # London → NY (8am ET)
            16,  # NY → Post-market
            18,  # Post-market → Asian (6pm ET)
        ]

        filtered = []

        for record in data:
            ts_ns = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")

            if not ts_ns:
                continue

            # Convert to ET hour
            ts_seconds = int(ts_ns) / 1_000_000_000
            dt = datetime.fromtimestamp(ts_seconds, tz=timezone.utc)
            et_offset = timedelta(hours=-5)
            dt_et = dt + et_offset

            # Check if near any transition
            for transition_hour in transitions:
                transition_dt = dt_et.replace(hour=transition_hour, minute=0, second=0, microsecond=0)

                # Offset from the transition: negative = before, positive = after
                offset_seconds = (dt_et - transition_dt).total_seconds()

                # Include if within the before/after window
                if -minutes_before * 60 <= offset_seconds <= minutes_after * 60:
                    filtered.append(record)
                    break

        print(f"[FILTER] Found {len(filtered)} records near session transitions")
        return filtered

    def print_session_stats(self, stats: Dict[str, Any]):
        """Print session statistics to console."""
        print("\n" + "=" * 60)
        print("SESSION STATISTICS")
        print("=" * 60)

        for session_name in ["Asian", "London", "NY"]:
            session_stats = stats[session_name]
            print(f"\n{session_name} Session:")
            print(f"  Records: {session_stats['count']:,} ({session_stats['percentage']:.1f}%)")
            if session_stats['volume'] > 0:
                print(f"  Volume: {session_stats['volume']:,}")
            if session_stats['trades'] > 0:
                print(f"  Trades: {session_stats['trades']:,}")

        print("\n" + "=" * 60)


def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description="Filter Databento data by trading session"
    )

    parser.add_argument(
        "--input",
        "-i",
        required=True,
        help="Input data file (JSON)"
    )

    parser.add_argument(
        "--session",
        "--sessions",
        help="Session(s) to filter (Asian, London, NY). Comma-separated for multiple."
    )

    parser.add_argument(
        "--transitions",
        action="store_true",
        help="Filter for session transition periods only"
    )

    parser.add_argument(
        "--minutes-before",
        type=int,
        default=30,
        help="Minutes before transition (default: 30)"
    )

    parser.add_argument(
        "--minutes-after",
        type=int,
        default=30,
        help="Minutes after transition (default: 30)"
    )

    parser.add_argument(
        "--stats",
        action="store_true",
        help="Calculate and display session statistics"
    )

    parser.add_argument(
        "--output",
        "-o",
        help="Output file for filtered data (JSON)"
    )

    args = parser.parse_args()

    # Load data
    print(f"[LOAD] Loading data from {args.input}...")
    with open(args.input, 'r') as f:
        data = json.load(f)

    # Handle different data formats
    if isinstance(data, dict) and "data" in data:
        data = data["data"]

    # Create filter
    session_filter = SessionFilter()

    # Calculate stats if requested
    if args.stats:
        stats = session_filter.calculate_session_stats(data)
        session_filter.print_session_stats(stats)

    # Filter data
    filtered_data = data

    if args.transitions:
        # Filter for session transitions
        filtered_data = session_filter.filter_session_transitions(
            filtered_data,
            minutes_before=args.minutes_before,
            minutes_after=args.minutes_after
        )
    elif args.session:
        # Filter by specific session(s)
        sessions = [s.strip() for s in args.session.split(',')]

        # Validate sessions
        for session in sessions:
            if session not in ["Asian", "London", "NY"]:
                print(f"[ERROR] Invalid session: {session}")
                print("[ERROR] Valid sessions: Asian, London, NY")
                sys.exit(1)

        filtered_data = session_filter.filter_by_session(filtered_data, sessions)

    # Save filtered data if output specified
    if args.output:
        print(f"\n[SAVE] Saving {len(filtered_data)} filtered records to {args.output}...")

        output_data = {
            "data": filtered_data,
            "metadata": {
                "original_count": len(data),
                "filtered_count": len(filtered_data),
                "filter_type": "transitions" if args.transitions else "sessions",
                "sessions": args.session.split(',') if args.session else None
            }
        }

        with open(args.output, 'w') as f:
            json.dump(output_data, f, indent=2)

        print("[SUCCESS] Filtered data saved!")

    print("\n[DONE] Session filtering complete!")


if __name__ == "__main__":
    main()
skills/databento/scripts/validate_data.py | 496 lines (new file)
@@ -0,0 +1,496 @@
#!/usr/bin/env python3
"""
Databento Data Quality Validator

Validates market data quality to catch issues early:
- Timestamp gap detection
- Record count verification
- Price range validation (no negative prices, outliers)
- Duplicate timestamp detection
- Summary quality report

Usage:
    python validate_data.py --input data.json
    python validate_data.py --input data.csv --schema ohlcv-1h
    python validate_data.py --input data.json --max-gap-minutes 60 --report report.json
"""

import argparse
import json
import sys
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from collections import defaultdict


class DataValidator:
    """Validates Databento market data quality."""

    def __init__(
        self,
        schema: str,
        max_gap_minutes: int = 60,
        price_outlier_std: float = 10.0
    ):
        """
        Initialize validator.

        Args:
            schema: Data schema (ohlcv-1h, trades, mbp-1, etc.)
            max_gap_minutes: Maximum acceptable gap in minutes
            price_outlier_std: Standard deviations for outlier detection
        """
        self.schema = schema
        self.max_gap_seconds = max_gap_minutes * 60
        self.price_outlier_std = price_outlier_std
        self.issues: List[Dict[str, Any]] = []

    def validate(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Run all validation checks on data.

        Args:
            data: List of records to validate

        Returns:
            Validation report
        """
        print(f"[VALIDATION] Running quality checks on {len(data)} records...")

        report = {
            "total_records": len(data),
            "valid": True,
            "checks": {}
        }

        if not data:
            print("[WARNING] No data to validate!")
            report["valid"] = False
            return report

        # Run all validation checks
        report["checks"]["timestamp_gaps"] = self.check_timestamp_gaps(data)
        report["checks"]["duplicates"] = self.check_duplicates(data)
        report["checks"]["price_range"] = self.check_price_range(data)
        report["checks"]["record_count"] = self.check_record_count(data)
        report["checks"]["data_completeness"] = self.check_completeness(data)

        # Overall validity
        report["valid"] = all(
            check.get("valid", True)
            for check in report["checks"].values()
        )

        report["issues"] = self.issues

        return report

    def check_timestamp_gaps(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for unexpected gaps in timestamps.

        Args:
            data: List of records

        Returns:
            Gap check report
        """
        print("[CHECK] Checking for timestamp gaps...")

        gaps = []
        timestamps = self._extract_timestamps(data)

        if len(timestamps) < 2:
            return {"valid": True, "gaps": [], "note": "Insufficient data for gap detection"}

        # Sort timestamps
        sorted_ts = sorted(timestamps)

        # Check gaps between consecutive timestamps
        for i in range(len(sorted_ts) - 1):
            gap_ns = sorted_ts[i + 1] - sorted_ts[i]
            gap_seconds = gap_ns / 1_000_000_000

            if gap_seconds > self.max_gap_seconds:
                gap_info = {
                    "index": i,
                    "gap_seconds": gap_seconds,
                    "gap_minutes": gap_seconds / 60,
                    "before": self._format_timestamp(sorted_ts[i]),
                    "after": self._format_timestamp(sorted_ts[i + 1])
                }
                gaps.append(gap_info)

                self.issues.append({
                    "type": "timestamp_gap",
                    "severity": "warning",
                    "message": f"Gap of {gap_seconds / 60:.1f} minutes detected",
                    **gap_info
                })

        valid = len(gaps) == 0
        print(f"[CHECK] Found {len(gaps)} gaps > {self.max_gap_seconds / 60} minutes")

        return {
            "valid": valid,
            "gaps_found": len(gaps),
            "gaps": gaps[:10] if gaps else [],  # Limit to first 10 for report
            "total_gaps": len(gaps)
        }
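
    def expected_bar_seconds(self) -> Optional[int]:
        """
        Illustrative sketch: nominal spacing between records for the
        configured schema, which could be used to choose max_gap_minutes
        per schema instead of the flat 60-minute default. The mapping is an
        assumption based on the OHLCV schemas used by this skill.
        """
        intervals = {
            "ohlcv-1s": 1,
            "ohlcv-1m": 60,
            "ohlcv-1h": 3_600,
            "ohlcv-1d": 86_400,
        }
        return intervals.get(self.schema)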

    def check_duplicates(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for duplicate timestamps.

        Args:
            data: List of records

        Returns:
            Duplicate check report
        """
        print("[CHECK] Checking for duplicate timestamps...")

        timestamps = self._extract_timestamps(data)
        timestamp_counts = defaultdict(int)

        for ts in timestamps:
            timestamp_counts[ts] += 1

        duplicates = {ts: count for ts, count in timestamp_counts.items() if count > 1}

        if duplicates:
            for ts, count in list(duplicates.items())[:10]:  # Limit to first 10
                self.issues.append({
                    "type": "duplicate_timestamp",
                    "severity": "error",
                    "timestamp": self._format_timestamp(ts),
                    "count": count,
                    "message": f"Timestamp appears {count} times"
                })

        valid = len(duplicates) == 0
        print(f"[CHECK] Found {len(duplicates)} duplicate timestamps")

        return {
            "valid": valid,
            "duplicates_found": len(duplicates),
            "duplicate_timestamps": len(duplicates)
        }

    def check_price_range(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check for invalid or outlier prices.

        Args:
            data: List of records

        Returns:
            Price range check report
        """
        print("[CHECK] Checking price ranges...")

        prices = self._extract_prices(data)

        if not prices:
            return {"valid": True, "note": "No price data to validate"}

        # Check for negative prices
        negative_prices = [p for p in prices if p < 0]

        # Check for zero prices (unusual for ES/NQ)
        zero_prices = [p for p in prices if p == 0]

        # Calculate statistics for outlier detection
        if len(prices) > 1:
            mean_price = sum(prices) / len(prices)
            variance = sum((p - mean_price) ** 2 for p in prices) / len(prices)
            std_dev = variance ** 0.5

            # Detect outliers (> N standard deviations from mean)
            outliers = []
            for p in prices:
                if abs(p - mean_price) > (self.price_outlier_std * std_dev):
                    outliers.append(p)
                    if len(outliers) <= 10:  # Limit issues
                        self.issues.append({
                            "type": "price_outlier",
                            "severity": "warning",
                            "price": p,
                            "mean": mean_price,
                            "std_dev": std_dev,
                            "message": f"Price {p:.2f} is {abs(p - mean_price) / std_dev:.1f} std devs from mean"
                        })
        else:
            outliers = []
            mean_price = prices[0] if prices else 0
            std_dev = 0

        # Report negative prices as errors
        for p in negative_prices[:10]:  # Limit to first 10
            self.issues.append({
                "type": "negative_price",
                "severity": "error",
                "price": p,
                "message": f"Negative price detected: {p}"
            })

        valid = len(negative_prices) == 0 and len(zero_prices) == 0

        print(f"[CHECK] Price range: {min(prices):.2f} to {max(prices):.2f}")
        print(f"[CHECK] Negative prices: {len(negative_prices)}, Zero prices: {len(zero_prices)}, Outliers: {len(outliers)}")

        return {
            "valid": valid,
            "min_price": min(prices),
            "max_price": max(prices),
            "mean_price": mean_price,
            "std_dev": std_dev,
            "negative_prices": len(negative_prices),
            "zero_prices": len(zero_prices),
            "outliers": len(outliers)
        }
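
    def _mad_outliers(self, prices: List[float], threshold: float = 10.0) -> List[float]:
        """
        Illustrative sketch: outlier detection via the median absolute
        deviation (MAD), which is less distorted by the outliers themselves
        than the mean/standard-deviation test in check_price_range. The
        threshold scale is an assumption and is not equivalent to a count
        of standard deviations.
        """
        import statistics

        if len(prices) < 2:
            return []
        median = statistics.median(prices)
        mad = statistics.median(abs(p - median) for p in prices)
        if mad == 0:
            return []
        return [p for p in prices if abs(p - median) / mad > threshold]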

    def check_record_count(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Verify expected record count.

        Args:
            data: List of records

        Returns:
            Record count check report
        """
        print(f"[CHECK] Verifying record count: {len(data)} records")

        # For OHLCV data, can estimate expected count based on timeframe
        expected_count = self._estimate_expected_count(data)

        valid = True
        if expected_count and abs(len(data) - expected_count) > (expected_count * 0.1):
            # More than 10% deviation
            valid = False
            self.issues.append({
                "type": "unexpected_record_count",
                "severity": "warning",
                "actual": len(data),
                "expected": expected_count,
                "message": f"Expected ~{expected_count} records, got {len(data)}"
            })

        return {
            "valid": valid,
            "actual_count": len(data),
            "expected_count": expected_count,
            "note": "Expected count is estimated based on schema and date range"
        }

    def check_completeness(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Check data completeness (required fields present).

        Args:
            data: List of records

        Returns:
            Completeness check report
        """
        print("[CHECK] Checking data completeness...")

        if not data:
            return {"valid": False, "note": "No data"}

        # Check required fields based on schema
        required_fields = self._get_required_fields()

        missing_fields = defaultdict(int)
        for record in data[:100]:  # Sample first 100 records
            for field in required_fields:
                if field not in record or record[field] is None:
                    missing_fields[field] += 1

        if missing_fields:
            for field, count in missing_fields.items():
                self.issues.append({
                    "type": "missing_field",
                    "severity": "error",
                    "field": field,
                    "missing_count": count,
                    "message": f"Field '{field}' missing in {count} records (sampled)"
                })

        valid = len(missing_fields) == 0

        return {
            "valid": valid,
            "missing_fields": dict(missing_fields) if missing_fields else {}
        }

    def _extract_timestamps(self, data: List[Dict[str, Any]]) -> List[int]:
        """Extract timestamps from records."""
        timestamps = []
        for record in data:
            # Try different timestamp field names
            ts = record.get("ts_event") or record.get("ts_recv") or record.get("timestamp")
            if ts:
                timestamps.append(int(ts))
        return timestamps

    def _extract_prices(self, data: List[Dict[str, Any]]) -> List[float]:
        """Extract prices from records."""
        prices = []
        for record in data:
            # For OHLCV, use close price
            if "close" in record:
                # Convert from fixed-point if needed (Databento encodes prices
                # as integers scaled by 1e-9)
                price = record["close"]
                if isinstance(price, int) and price > 1_000_000:
                    price = price / 1_000_000_000  # Fixed-point conversion
                prices.append(float(price))
            # For trades/mbp, use price field
            elif "price" in record:
                price = record["price"]
                if isinstance(price, int) and price > 1_000_000:
                    price = price / 1_000_000_000
                prices.append(float(price))
        return prices

    def _format_timestamp(self, ts_ns: int) -> str:
        """Format nanosecond timestamp to readable string."""
        ts_seconds = ts_ns / 1_000_000_000
        dt = datetime.fromtimestamp(ts_seconds)
        return dt.strftime("%Y-%m-%d %H:%M:%S")

    def _estimate_expected_count(self, data: List[Dict[str, Any]]) -> Optional[int]:
        """Estimate expected record count based on schema and date range."""
        # This is a simplified estimation
        # In practice, would calculate based on actual date range
        if "ohlcv" in self.schema:
            if "1h" in self.schema:
                return None  # ~24 records per day per symbol
            elif "1d" in self.schema:
                return None  # ~1 record per day per symbol
        return None
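
    def _estimate_expected_count_from_range(
        self, data: List[Dict[str, Any]]
    ) -> Optional[int]:
        """
        Illustrative sketch: derive an expected count from the data's own
        timestamp span and the schema's nominal bar interval, instead of
        returning None. Assumes a single symbol and continuous trading, so
        it overestimates when the session has closures.
        """
        intervals = {"ohlcv-1s": 1, "ohlcv-1m": 60, "ohlcv-1h": 3_600, "ohlcv-1d": 86_400}
        interval = intervals.get(self.schema)
        timestamps = self._extract_timestamps(data)
        if not interval or len(timestamps) < 2:
            return None
        span_seconds = (max(timestamps) - min(timestamps)) / 1_000_000_000
        return int(span_seconds / interval) + 1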

    def _get_required_fields(self) -> List[str]:
        """Get required fields for schema."""
        base_fields = ["ts_event", "ts_recv"]

        if "ohlcv" in self.schema:
            return base_fields + ["open", "high", "low", "close", "volume"]
        elif self.schema == "trades":
            return base_fields + ["price", "size"]
        elif "mbp" in self.schema:
            return base_fields + ["bid_px_00", "ask_px_00", "bid_sz_00", "ask_sz_00"]
        else:
            return base_fields

    def print_report(self, report: Dict[str, Any]):
        """Print validation report to console."""
        print("\n" + "=" * 60)
        print("DATA VALIDATION REPORT")
        print("=" * 60)

        print(f"\nTotal Records: {report['total_records']}")
        print(f"Overall Valid: {'✓ YES' if report['valid'] else '✗ NO'}")

        print("\n" + "-" * 60)
        print("CHECK RESULTS")
        print("-" * 60)

        for check_name, check_result in report["checks"].items():
            status = "✓" if check_result.get("valid", True) else "✗"
            print(f"\n{status} {check_name.replace('_', ' ').title()}")
            for key, value in check_result.items():
                if key != "valid" and key != "gaps":
                    print(f"  {key}: {value}")

        if report.get("issues"):
            print("\n" + "-" * 60)
            print(f"ISSUES FOUND ({len(report['issues'])})")
            print("-" * 60)
            for i, issue in enumerate(report["issues"][:20], 1):  # Limit to 20
                print(f"\n{i}. [{issue['severity'].upper()}] {issue['type']}")
                print(f"   {issue['message']}")

            if len(report["issues"]) > 20:
                print(f"\n... and {len(report['issues']) - 20} more issues")

        print("\n" + "=" * 60)


def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description="Validate Databento market data quality"
    )

    parser.add_argument(
        "--input",
        "-i",
        required=True,
        help="Input data file (JSON or CSV)"
    )

    parser.add_argument(
        "--schema",
        default="ohlcv-1h",
        help="Data schema (default: ohlcv-1h)"
    )

    parser.add_argument(
        "--max-gap-minutes",
        type=int,
        default=60,
        help="Maximum acceptable gap in minutes (default: 60)"
    )

    parser.add_argument(
        "--price-outlier-std",
        type=float,
        default=10.0,
        help="Standard deviations for outlier detection (default: 10.0)"
    )

    parser.add_argument(
        "--report",
        "-r",
        help="Save report to JSON file"
    )

    args = parser.parse_args()

    # Load data
    print(f"[LOAD] Loading data from {args.input}...")
    with open(args.input, 'r') as f:
        data = json.load(f)

    # Handle different data formats
    if isinstance(data, dict) and "data" in data:
        data = data["data"]

    # Create validator
    validator = DataValidator(
        schema=args.schema,
        max_gap_minutes=args.max_gap_minutes,
        price_outlier_std=args.price_outlier_std
    )

    # Run validation
    report = validator.validate(data)

    # Print report
    validator.print_report(report)

    # Save report if requested
    if args.report:
        print(f"\n[SAVE] Saving report to {args.report}...")
        with open(args.report, 'w') as f:
            json.dump(report, f, indent=2)
        print("[SUCCESS] Report saved!")

    # Exit with appropriate code
    sys.exit(0 if report["valid"] else 1)


if __name__ == "__main__":
    main()