#!/usr/bin/env python3
"""
Databento OHLCV Data Fetcher

Standard pattern for fetching OHLCV data with built-in best practices:
- Automatic cost estimation before fetch
- Error handling with retries
- Post-fetch data validation
- Export options (CSV/JSON)

Usage:
    python fetch_ohlcv.py --symbol ES.c.0 --schema ohlcv-1h --start 2024-01-01 --end 2024-01-31
    python fetch_ohlcv.py --symbol NQ.c.0 --schema ohlcv-1d --start 2024-01-01 --limit 100
    python fetch_ohlcv.py --symbol ES.c.0,NQ.c.0 --schema ohlcv-1h --start 2024-01-01 --output data.csv
"""

import argparse
import csv
import json
import sys
import time
from datetime import datetime
from typing import Optional, Dict, Any


class DatabentoOHLCVFetcher:
    """Fetches OHLCV data from Databento with best practices built in."""

    def __init__(self, dataset: str = "GLBX.MDP3", stype_in: str = "continuous"):
        """
        Initialize fetcher.

        Args:
            dataset: Dataset code (default: GLBX.MDP3 for ES/NQ)
            stype_in: Input symbol type (default: continuous)
        """
        self.dataset = dataset
        self.stype_in = stype_in
        self.max_retries = 3
        self.retry_delay = 2  # seconds

    def estimate_cost(
        self,
        symbols: str,
        schema: str,
        start: str,
        end: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Estimate cost before fetching data.

        Args:
            symbols: Comma-separated symbol list
            schema: Data schema (e.g., ohlcv-1h)
            start: Start date (YYYY-MM-DD)
            end: End date (optional)

        Returns:
            Cost estimation result
        """
        print(f"[COST CHECK] Estimating cost for {symbols} ({schema})...")

        # NOTE: In actual usage, this would call the MCP tool:
        # mcp__databento__metadata_get_cost(
        #     dataset=self.dataset,
        #     start=start,
        #     end=end,
        #     symbols=symbols,
        #     schema=schema,
        #     stype_in=self.stype_in
        # )
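
        # Alternative sketch without MCP, using the databento Python client
        # directly (assumes `pip install databento` and a DATABENTO_API_KEY
        # environment variable; verify names against your installed version):
        #
        #     import databento as db
        #     client = db.Historical()  # reads DATABENTO_API_KEY from env
        #     cost_usd = client.metadata.get_cost(
        #         dataset=self.dataset,
        #         symbols=symbols,
        #         schema=schema,
        #         start=start,
        #         end=end,
        #         stype_in=self.stype_in,
        #     )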

        # For this template, we simulate the response
        print("[NOTE] This template script demonstrates the pattern.")
        print("[NOTE] In actual usage, integrate with MCP tools directly.")

        return {
            "estimated_cost_usd": 0.0,
            "estimated_size_mb": 0.0,
            "note": "Call mcp__databento__metadata_get_cost here"
        }

    def validate_dataset_range(self) -> Dict[str, str]:
        """
        Validate dataset availability.

        Returns:
            Dataset date range
        """
        print(f"[VALIDATION] Checking dataset availability for {self.dataset}...")

        # NOTE: In actual usage, this would call:
        # mcp__databento__metadata_get_dataset_range(dataset=self.dataset)
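        #
        # With the databento client the equivalent is (hedged sketch; the
        # shape of the returned mapping may differ by client version):
        #
        #     client.metadata.get_dataset_range(dataset=self.dataset)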

        return {
            "start_date": "2000-01-01",
            "end_date": datetime.now().strftime("%Y-%m-%d"),
            "note": "Call mcp__databento__metadata_get_dataset_range here"
        }

    def fetch_data(
        self,
        symbols: str,
        schema: str,
        start: str,
        end: Optional[str] = None,
        limit: Optional[int] = None,
        check_cost: bool = True
    ) -> Dict[str, Any]:
        """
        Fetch OHLCV data with retries and error handling.

        Args:
            symbols: Comma-separated symbol list
            schema: Data schema (e.g., ohlcv-1h, ohlcv-1d)
            start: Start date (YYYY-MM-DD)
            end: End date (optional)
            limit: Maximum number of records (optional)
            check_cost: Whether to check cost before fetching (default: True)

        Returns:
            Fetched data
        """
        # Step 1: Cost check (if enabled)
        if check_cost:
            cost_info = self.estimate_cost(symbols, schema, start, end)
            print(f"[COST] Estimated cost: ${cost_info.get('estimated_cost_usd', 0):.2f}")
            print(f"[COST] Estimated size: {cost_info.get('estimated_size_mb', 0):.2f} MB")

            # Prompt for confirmation if cost is high
            estimated_cost = cost_info.get('estimated_cost_usd', 0)
            if estimated_cost > 10:
                response = input(f"\nEstimated cost is ${estimated_cost:.2f}. Continue? (y/n): ")
                if response.lower() != 'y':
                    print("[CANCELLED] Data fetch cancelled by user.")
                    sys.exit(0)

        # Step 2: Validate dataset
        dataset_range = self.validate_dataset_range()
        print(f"[DATASET] Available range: {dataset_range.get('start_date')} to {dataset_range.get('end_date')}")

        # Step 3: Fetch data with retries
        for attempt in range(self.max_retries):
            try:
                print(f"\n[FETCH] Attempt {attempt + 1}/{self.max_retries}")
                print(f"[FETCH] Fetching {symbols} ({schema}) from {start} to {end or 'now'}...")

                # NOTE: In actual usage, this would call:
                # data = mcp__databento__timeseries_get_range(
                #     dataset=self.dataset,
                #     symbols=symbols,
                #     schema=schema,
                #     start=start,
                #     end=end,
                #     stype_in=self.stype_in,
                #     stype_out="instrument_id",
                #     limit=limit
                # )
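                #
                # Direct databento-client equivalent (hedged sketch; see the
                # fetch_ohlcv_live() example below for a runnable version):
                #
                #     store = client.timeseries.get_range(
                #         dataset=self.dataset, symbols=symbols, schema=schema,
                #         start=start, end=end, stype_in=self.stype_in, limit=limit,
                #     )
                #     df = store.to_df()  # pandas DataFrame of OHLCV bars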

                # Simulate successful fetch
                print("[SUCCESS] Data fetched successfully!")
                return {
                    "data": [],
                    "record_count": 0,
                    "note": "Call mcp__databento__timeseries_get_range here"
                }

            except Exception as e:
                print(f"[ERROR] Attempt {attempt + 1} failed: {e}")

                if attempt < self.max_retries - 1:
                    print(f"[RETRY] Waiting {self.retry_delay} seconds before retry...")
                    time.sleep(self.retry_delay)
                else:
                    print("[FAILED] All retry attempts exhausted.")
                    raise

    def validate_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate fetched data quality.

        Args:
            data: Fetched data

        Returns:
            Validation report
        """
        print("\n[VALIDATION] Running data quality checks...")

        # NOTE: Actual validation would:
        # - Check for timestamp gaps
        # - Verify record counts
        # - Validate price ranges
        # - Check for duplicates

        # Use scripts/validate_data.py for comprehensive validation
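        #
        # A minimal pandas sketch of those checks (assumes a DataFrame `df`
        # indexed by bar timestamp, as returned by DBNStore.to_df();
        # `expected_interval` is a hypothetical pd.Timedelta for the bar size):
        #
        #     dupes = df.index.duplicated().sum()
        #     gaps = df.index.to_series().diff().dropna().gt(expected_interval).sum()
        #     bad_prices = (df["high"] < df["low"]).sum()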

        return {
            "valid": True,
            "record_count": data.get("record_count", 0),
            "issues": [],
            "note": "Use scripts/validate_data.py for detailed validation"
        }

    def export_csv(self, data: Dict[str, Any], output_path: str):
        """
        Export data to CSV.

        Args:
            data: Data to export
            output_path: Output file path
        """
        print(f"\n[EXPORT] Saving data to {output_path}...")

        # Write records as CSV rows (assumes data["data"] is a list of dicts,
        # the shape the fetch step above returns)
        records = data.get("data", [])
        with open(output_path, 'w', newline='') as f:
            if records:
                writer = csv.DictWriter(f, fieldnames=list(records[0].keys()))
                writer.writeheader()
                writer.writerows(records)

        print(f"[SUCCESS] Data saved to {output_path}")

    def export_json(self, data: Dict[str, Any], output_path: str):
        """
        Export data to JSON.

        Args:
            data: Data to export
            output_path: Output file path
        """
        print(f"\n[EXPORT] Saving data to {output_path}...")

        with open(output_path, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"[SUCCESS] Data saved to {output_path}")
def main():
    """Main entry point for CLI usage."""
    parser = argparse.ArgumentParser(
        description="Fetch OHLCV data from Databento with best practices"
    )

    parser.add_argument(
        "--symbol",
        "-s",
        required=True,
        help="Symbol or comma-separated symbols (e.g., ES.c.0 or ES.c.0,NQ.c.0)"
    )

    parser.add_argument(
        "--schema",
        choices=["ohlcv-1s", "ohlcv-1m", "ohlcv-1h", "ohlcv-1d", "ohlcv-eod"],
        default="ohlcv-1h",
        help="OHLCV schema (default: ohlcv-1h)"
    )

    parser.add_argument(
        "--start",
        required=True,
        help="Start date (YYYY-MM-DD)"
    )

    parser.add_argument(
        "--end",
        help="End date (YYYY-MM-DD, optional)"
    )

    parser.add_argument(
        "--limit",
        type=int,
        help="Maximum number of records (optional)"
    )

    parser.add_argument(
        "--dataset",
        default="GLBX.MDP3",
        help="Dataset code (default: GLBX.MDP3)"
    )

    parser.add_argument(
        "--stype-in",
        default="continuous",
        choices=["continuous", "raw_symbol", "instrument_id"],
        help="Input symbol type (default: continuous)"
    )

    parser.add_argument(
        "--output",
        "-o",
        help="Output file path (CSV or JSON based on extension)"
    )

    parser.add_argument(
        "--no-cost-check",
        action="store_true",
        help="Skip cost estimation (not recommended)"
    )

    args = parser.parse_args()

    # Create fetcher
    fetcher = DatabentoOHLCVFetcher(
        dataset=args.dataset,
        stype_in=args.stype_in
    )

    try:
        # Fetch data
        data = fetcher.fetch_data(
            symbols=args.symbol,
            schema=args.schema,
            start=args.start,
            end=args.end,
            limit=args.limit,
            check_cost=not args.no_cost_check
        )

        # Validate data
        validation = fetcher.validate_data(data)
        print(f"\n[VALIDATION] Data is valid: {validation['valid']}")
        print(f"[VALIDATION] Record count: {validation['record_count']}")

        if validation['issues']:
            print(f"[WARNING] Issues found: {validation['issues']}")

        # Export if output specified
        if args.output:
            if args.output.endswith('.csv'):
                fetcher.export_csv(data, args.output)
            elif args.output.endswith('.json'):
                fetcher.export_json(data, args.output)
            else:
                print("[WARNING] Unknown output format. Saving as JSON.")
                fetcher.export_json(data, args.output + '.json')

        print("\n[DONE] Fetch complete!")

    except KeyboardInterrupt:
        print("\n[CANCELLED] Fetch cancelled by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\n[ERROR] Fetch failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()