#!/usr/bin/env python3
"""
COSMIC Data Download Utility

This script provides functions to download data from the COSMIC database
(Catalogue of Somatic Mutations in Cancer).

Usage:
    from download_cosmic import download_cosmic_file, get_common_file_path

    # Download a specific file
    download_cosmic_file(
        email="user@example.com",
        password="password",
        filepath="GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz",
        output_filename="mutations.tsv.gz"
    )

Requirements:
    - requests library: pip install requests
    - Valid COSMIC account credentials (register at cancer.sanger.ac.uk/cosmic)
"""
|
|
|
|
import requests
|
|
import sys
|
|
import os
|
|
from typing import Optional
|
|
|
|
|
|
def _stream_response_to_file(response, output_filename: str, total_size: int) -> None:
    """Write a streamed HTTP response body to ``output_filename`` in chunks.

    Args:
        response: Response object exposing ``iter_content`` (opened with
            ``stream=True`` by the caller).
        output_filename: Destination path; opened in binary write mode.
        total_size: Expected size from the Content-Length header, or 0 when
            unknown. Progress is only printed when it is greater than 0.

    Raises:
        Whatever the open/read/write raises. If the failure happens after the
        file was opened, the partial file is removed before re-raising so a
        truncated download is never mistaken for a complete one.
    """
    # Open outside the try block: if open() itself fails we have not created
    # or truncated anything, so there is nothing to clean up.
    f = open(output_filename, 'wb')
    try:
        with f:
            downloaded = 0
            last_shown = None
            for chunk in response.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                f.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    # Only redraw when the displayed value changes; printing
                    # and flushing for every 8 KiB chunk is needlessly slow.
                    shown = f"{(downloaded / total_size) * 100:.1f}"
                    if shown != last_shown:
                        print(f"\rProgress: {shown}%", end='', flush=True)
                        last_shown = shown
        if total_size > 0:
            print()  # New line after progress
    except BaseException:
        # Includes KeyboardInterrupt: an aborted download must not leave a
        # truncated file behind.
        try:
            os.remove(output_filename)
        except OSError:
            pass
        raise


def download_cosmic_file(
    email: str,
    password: str,
    filepath: str,
    output_filename: Optional[str] = None,
    genome_assembly: str = "GRCh38"
) -> bool:
    """
    Download a file from COSMIC database.

    The download happens in two steps: a request to the file_download
    endpoint (sent with the email/password as credentials) returns JSON with
    a "url" entry, and the file is then streamed from that URL to disk.

    Args:
        email: COSMIC account email
        password: COSMIC account password
        filepath: Relative path to file (e.g., "GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz")
        output_filename: Optional custom output filename (default: last part of filepath)
        genome_assembly: Genome assembly version (GRCh37 or GRCh38, default: GRCh38).
            Not used by this function; kept so existing callers keep working.

    Returns:
        True if download successful, False otherwise. Errors are printed
        rather than raised.

    Example:
        download_cosmic_file(
            "user@email.com",
            "pass123",
            "GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz"
        )
    """
    base_url = "https://cancer.sanger.ac.uk/cosmic/file_download/"

    # Determine output filename
    if output_filename is None:
        output_filename = os.path.basename(filepath)
    if not output_filename:
        # filepath was empty or ended with "/": fail before any network
        # traffic instead of discovering it when opening the file.
        print("ERROR: Could not determine output filename; provide output_filename")
        return False

    try:
        # Step 1: Get the download URL
        print(f"Requesting download URL for: {filepath}")
        r = requests.get(
            base_url + filepath,
            auth=(email, password),
            timeout=30
        )

        if r.status_code == 401:
            print("ERROR: Authentication failed. Check email and password.")
            return False
        elif r.status_code == 404:
            print(f"ERROR: File not found: {filepath}")
            return False
        elif r.status_code != 200:
            print(f"ERROR: Request failed with status code {r.status_code}")
            print(f"Response: {r.text}")
            return False

        # Parse response to get download URL
        response_data = r.json()
        download_url = response_data.get("url")

        if not download_url:
            print("ERROR: No download URL in response")
            return False

        # Step 2: Download the file
        print(f"Downloading file from: {download_url}")
        file_response = requests.get(download_url, stream=True, timeout=300)
        try:
            if file_response.status_code != 200:
                print(f"ERROR: Download failed with status code {file_response.status_code}")
                return False

            # Step 3: Write to disk. Always stream in chunks, even when the
            # server sends no Content-Length, so large files are never held
            # in memory as a whole.
            print(f"Saving to: {output_filename}")
            total_size = int(file_response.headers.get('content-length', 0))
            _stream_response_to_file(file_response, output_filename, total_size)
        finally:
            # A streamed response keeps its connection open until closed.
            file_response.close()

        print(f"✓ Successfully downloaded: {output_filename}")
        return True

    except requests.exceptions.Timeout:
        print("ERROR: Request timed out")
        return False
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Request failed: {e}")
        return False
    except Exception as e:
        print(f"ERROR: Unexpected error: {e}")
        return False
|
|
|
|
|
|
def get_common_file_path(
    data_type: str,
    genome_assembly: str = "GRCh38",
    version: str = "latest"
) -> Optional[str]:
    """
    Map a shorthand data type to its COSMIC download filepath.

    Args:
        data_type: Shorthand name (e.g., 'mutations', 'gene_census', 'signatures')
        genome_assembly: GRCh37 or GRCh38
        version: COSMIC version (use 'latest' for most recent)

    Returns:
        The relative filepath, or None when data_type is not a known
        shorthand. The 'signatures' path does not depend on
        genome_assembly or version.
    """
    # Every entry except 'signatures' lives under the same release directory.
    release_dir = f'{genome_assembly}/cosmic/{version}/'
    release_files = {
        'mutations': 'CosmicMutantExport.tsv.gz',
        'mutations_vcf': 'VCF/CosmicCodingMuts.vcf.gz',
        'gene_census': 'cancer_gene_census.csv',
        'resistance_mutations': 'CosmicResistanceMutations.tsv.gz',
        'structural_variants': 'CosmicStructExport.tsv.gz',
        'gene_expression': 'CosmicCompleteGeneExpression.tsv.gz',
        'copy_number': 'CosmicCompleteCNA.tsv.gz',
        'fusion_genes': 'CosmicFusionExport.tsv.gz',
        'sample_info': 'CosmicSample.tsv.gz',
    }

    if data_type == 'signatures':
        return 'signatures/signatures.tsv'

    leaf = release_files.get(data_type)
    if leaf is None:
        return None
    return release_dir + leaf
|
|
|
|
|
|
def main():
    """Parse command-line arguments and download the requested COSMIC file.

    Returns:
        0 when the download succeeded, 1 on any failure (including missing
        or unknown file selection).
    """
    import argparse

    usage_examples = """
Examples:
# Download mutations file
%(prog)s user@email.com --filepath GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz

# Download using shorthand
%(prog)s user@email.com --data-type mutations

# Download for GRCh37
%(prog)s user@email.com --data-type gene_census --assembly GRCh37
"""

    cli = argparse.ArgumentParser(
        description='Download files from COSMIC database',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples
    )
    cli.add_argument('email', help='COSMIC account email')
    cli.add_argument('--password', help='COSMIC account password (will prompt if not provided)')
    cli.add_argument('--filepath', help='Full filepath to download')
    cli.add_argument(
        '--data-type',
        choices=['mutations', 'mutations_vcf', 'gene_census', 'resistance_mutations',
                 'structural_variants', 'gene_expression', 'copy_number',
                 'fusion_genes', 'signatures', 'sample_info'],
        help='Common data type shorthand'
    )
    cli.add_argument(
        '--assembly',
        default='GRCh38',
        choices=['GRCh37', 'GRCh38'],
        help='Genome assembly (default: GRCh38)'
    )
    cli.add_argument('--version', default='latest', help='COSMIC version (default: latest)')
    cli.add_argument('-o', '--output', help='Output filename')

    opts = cli.parse_args()

    # Prompt interactively only when no password was given on the command line.
    secret = opts.password
    if not secret:
        import getpass
        secret = getpass.getpass('COSMIC password: ')

    # An explicit --filepath takes precedence over the --data-type shorthand.
    target = opts.filepath
    if not target:
        if not opts.data_type:
            print("ERROR: Must provide either --filepath or --data-type")
            cli.print_help()
            return 1
        target = get_common_file_path(opts.data_type, opts.assembly, opts.version)
        if not target:
            print(f"ERROR: Unknown data type: {opts.data_type}")
            return 1

    ok = download_cosmic_file(
        email=opts.email,
        password=secret,
        filepath=target,
        output_filename=opts.output,
        genome_assembly=opts.assembly
    )

    if ok:
        return 0
    return 1
|
|
|
|
|
|
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
|