Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""
COSMIC Data Download Utility
This script provides functions to download data from the COSMIC database
(Catalogue of Somatic Mutations in Cancer).
Usage:
from download_cosmic import download_cosmic_file, get_common_file_path
# Download a specific file
download_cosmic_file(
email="user@example.com",
password="password",
filepath="GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz",
output_filename="mutations.tsv.gz"
)
Requirements:
- requests library: pip install requests
- Valid COSMIC account credentials (register at cancer.sanger.ac.uk/cosmic)
"""
import requests
import sys
import os
from typing import Optional
def download_cosmic_file(
    email: str,
    password: str,
    filepath: str,
    output_filename: Optional[str] = None,
    genome_assembly: str = "GRCh38"
) -> bool:
    """
    Download a file from COSMIC database.

    The download is a two-step exchange: an authenticated GET against the
    COSMIC ``file_download`` endpoint returns a JSON document with a "url"
    field, and the file is then fetched from that URL and streamed to disk
    in fixed-size chunks so that large exports never have to fit in memory.

    Args:
        email: COSMIC account email
        password: COSMIC account password
        filepath: Relative path to file (e.g., "GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz")
        output_filename: Optional custom output filename (default: last part of filepath)
        genome_assembly: Genome assembly version (GRCh37 or GRCh38, default: GRCh38).
            Kept for interface compatibility; the request path comes entirely
            from ``filepath`` and this value is not used by the function body.

    Returns:
        True if download successful, False otherwise. Failures are reported
        on stdout instead of being raised. If the transfer fails after the
        output file was opened, the incomplete file is removed.

    Example:
        download_cosmic_file(
            "user@email.com",
            "pass123",
            "GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz"
        )
    """
    base_url = "https://cancer.sanger.ac.uk/cosmic/file_download/"

    # Determine output filename
    if output_filename is None:
        output_filename = os.path.basename(filepath)
    if not output_filename:
        # basename() is empty for paths ending in "/"; fail before any
        # network traffic instead of at open() time.
        print("ERROR: Could not determine output filename; pass output_filename explicitly")
        return False

    # True only while a possibly-incomplete output file exists on disk.
    partial_on_disk = False

    try:
        # Step 1: Get the download URL
        print(f"Requesting download URL for: {filepath}")
        r = requests.get(
            base_url + filepath,
            auth=(email, password),
            timeout=30
        )

        if r.status_code == 401:
            print("ERROR: Authentication failed. Check email and password.")
            return False
        elif r.status_code == 404:
            print(f"ERROR: File not found: {filepath}")
            return False
        elif r.status_code != 200:
            print(f"ERROR: Request failed with status code {r.status_code}")
            print(f"Response: {r.text}")
            return False

        # Parse response to get download URL
        response_data = r.json()
        download_url = response_data.get("url")
        if not download_url:
            print("ERROR: No download URL in response")
            return False

        # Step 2: Download the file
        print(f"Downloading file from: {download_url}")
        # The context manager guarantees the streamed connection is released
        # on every exit path (early return, exception, success).
        with requests.get(download_url, stream=True, timeout=300) as file_response:
            if file_response.status_code != 200:
                print(f"ERROR: Download failed with status code {file_response.status_code}")
                return False

            # Step 3: Write to disk
            print(f"Saving to: {output_filename}")
            total_size = int(file_response.headers.get('content-length', 0))
            downloaded = 0
            last_shown = None

            with open(output_filename, 'wb') as f:
                partial_on_disk = True
                # Always stream, even when the size is unknown, so the whole
                # payload is never held in memory at once.
                for chunk in file_response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total_size:
                        # Show progress, but only redraw when the displayed
                        # value changes to avoid a flush per 8 KiB chunk.
                        shown = f"{(downloaded / total_size) * 100:.1f}"
                        if shown != last_shown:
                            print(f"\rProgress: {shown}%", end='', flush=True)
                            last_shown = shown

            if total_size:
                print()  # New line after progress

        partial_on_disk = False
        print(f"✓ Successfully downloaded: {output_filename}")
        return True

    except requests.exceptions.Timeout:
        print("ERROR: Request timed out")
        return False
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Request failed: {e}")
        return False
    except Exception as e:
        print(f"ERROR: Unexpected error: {e}")
        return False
    finally:
        # A truncated file is indistinguishable from a finished one by name,
        # so remove it rather than leave it for a later pipeline step.
        if partial_on_disk:
            try:
                os.remove(output_filename)
            except OSError as cleanup_error:
                print(f"WARNING: Could not remove incomplete file {output_filename}: {cleanup_error}")
def get_common_file_path(
    data_type: str,
    genome_assembly: str = "GRCh38",
    version: str = "latest"
) -> Optional[str]:
    """
    Map a shorthand data type to its COSMIC download filepath.

    Args:
        data_type: Shorthand key such as 'mutations', 'gene_census' or 'signatures'
        genome_assembly: Assembly used as the leading path component (GRCh37 or GRCh38)
        version: COSMIC version path component ('latest' for most recent)

    Returns:
        The relative filepath for the data type, or None when the key is
        not one of the known shorthands.
    """
    # Every entry except 'signatures' lives under <assembly>/cosmic/<version>/.
    release_prefix = f'{genome_assembly}/cosmic/{version}/'
    release_files = {
        'mutations': 'CosmicMutantExport.tsv.gz',
        'mutations_vcf': 'VCF/CosmicCodingMuts.vcf.gz',
        'gene_census': 'cancer_gene_census.csv',
        'resistance_mutations': 'CosmicResistanceMutations.tsv.gz',
        'structural_variants': 'CosmicStructExport.tsv.gz',
        'gene_expression': 'CosmicCompleteGeneExpression.tsv.gz',
        'copy_number': 'CosmicCompleteCNA.tsv.gz',
        'fusion_genes': 'CosmicFusionExport.tsv.gz',
        'sample_info': 'CosmicSample.tsv.gz',
    }

    path_by_type = {
        key: release_prefix + name for key, name in release_files.items()
    }
    # The signatures file has a fixed path independent of assembly/version.
    path_by_type['signatures'] = 'signatures/signatures.tsv'

    return path_by_type.get(data_type)
def main():
    """Run the command-line interface for downloading COSMIC files.

    Parses the arguments, prompts for the password when it was not given
    on the command line, resolves the filepath from either --filepath or
    --data-type (--filepath wins when both are given), and starts the
    download.

    Returns:
        0 when the download succeeded, 1 on any failure or when neither
        --filepath nor --data-type was supplied.
    """
    import argparse

    usage_examples = """
Examples:
# Download mutations file
%(prog)s user@email.com --filepath GRCh38/cosmic/latest/CosmicMutantExport.tsv.gz
# Download using shorthand
%(prog)s user@email.com --data-type mutations
# Download for GRCh37
%(prog)s user@email.com --data-type gene_census --assembly GRCh37
"""
    data_type_choices = [
        'mutations', 'mutations_vcf', 'gene_census', 'resistance_mutations',
        'structural_variants', 'gene_expression', 'copy_number',
        'fusion_genes', 'signatures', 'sample_info',
    ]

    cli = argparse.ArgumentParser(
        description='Download files from COSMIC database',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument('email', help='COSMIC account email')
    cli.add_argument(
        '--password',
        help='COSMIC account password (will prompt if not provided)',
    )
    cli.add_argument('--filepath', help='Full filepath to download')
    cli.add_argument(
        '--data-type',
        choices=data_type_choices,
        help='Common data type shorthand',
    )
    cli.add_argument(
        '--assembly',
        default='GRCh38',
        choices=['GRCh37', 'GRCh38'],
        help='Genome assembly (default: GRCh38)',
    )
    cli.add_argument(
        '--version',
        default='latest',
        help='COSMIC version (default: latest)',
    )
    cli.add_argument('-o', '--output', help='Output filename')
    opts = cli.parse_args()

    # Prompt interactively so the password need not appear in shell history.
    if not opts.password:
        import getpass
        opts.password = getpass.getpass('COSMIC password: ')

    # Resolve which file to fetch; an explicit --filepath takes precedence.
    if not opts.filepath and not opts.data_type:
        print("ERROR: Must provide either --filepath or --data-type")
        cli.print_help()
        return 1

    if opts.filepath:
        target_path = opts.filepath
    else:
        target_path = get_common_file_path(
            opts.data_type, opts.assembly, opts.version
        )
        if not target_path:
            print(f"ERROR: Unknown data type: {opts.data_type}")
            return 1

    downloaded_ok = download_cosmic_file(
        email=opts.email,
        password=opts.password,
        filepath=target_path,
        output_filename=opts.output,
        genome_assembly=opts.assembly,
    )
    if downloaded_ok:
        return 0
    return 1
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())