#!/usr/bin/env python3 """ Batch Identifier Converter This script converts multiple identifiers between biological databases using UniProt's mapping service. Supports batch processing with automatic chunking and error handling. Usage: python batch_id_converter.py INPUT_FILE --from DB1 --to DB2 [options] Examples: python batch_id_converter.py uniprot_ids.txt --from UniProtKB_AC-ID --to KEGG python batch_id_converter.py gene_ids.txt --from GeneID --to UniProtKB --output mapping.csv python batch_id_converter.py ids.txt --from UniProtKB_AC-ID --to Ensembl --chunk-size 50 Input file format: One identifier per line (plain text) Common database codes: UniProtKB_AC-ID - UniProt accession/ID KEGG - KEGG gene IDs GeneID - NCBI Gene (Entrez) IDs Ensembl - Ensembl gene IDs Ensembl_Protein - Ensembl protein IDs RefSeq_Protein - RefSeq protein IDs PDB - Protein Data Bank IDs HGNC - Human gene symbols GO - Gene Ontology IDs """ import sys import argparse import csv import time from bioservices import UniProt # Common database code mappings DATABASE_CODES = { 'uniprot': 'UniProtKB_AC-ID', 'uniprotkb': 'UniProtKB_AC-ID', 'kegg': 'KEGG', 'geneid': 'GeneID', 'entrez': 'GeneID', 'ensembl': 'Ensembl', 'ensembl_protein': 'Ensembl_Protein', 'ensembl_transcript': 'Ensembl_Transcript', 'refseq': 'RefSeq_Protein', 'refseq_protein': 'RefSeq_Protein', 'pdb': 'PDB', 'hgnc': 'HGNC', 'mgi': 'MGI', 'go': 'GO', 'pfam': 'Pfam', 'interpro': 'InterPro', 'reactome': 'Reactome', 'string': 'STRING', 'biogrid': 'BioGRID' } def normalize_database_code(code): """Normalize database code to official format.""" # Try exact match first if code in DATABASE_CODES.values(): return code # Try lowercase lookup lowercase = code.lower() if lowercase in DATABASE_CODES: return DATABASE_CODES[lowercase] # Return as-is if not found (may still be valid) return code def read_ids_from_file(filename): """Read identifiers from file (one per line).""" print(f"Reading identifiers from {filename}...") ids = [] with open(filename, 'r') as f: for line in f: line = line.strip() if line and not line.startswith('#'): ids.append(line) print(f"✓ Read {len(ids)} identifier(s)") return ids def batch_convert(ids, from_db, to_db, chunk_size=100, delay=0.5): """Convert IDs with automatic chunking and error handling.""" print(f"\nConverting {len(ids)} IDs:") print(f" From: {from_db}") print(f" To: {to_db}") print(f" Chunk size: {chunk_size}") print() u = UniProt(verbose=False) all_results = {} failed_ids = [] total_chunks = (len(ids) + chunk_size - 1) // chunk_size for i in range(0, len(ids), chunk_size): chunk = ids[i:i+chunk_size] chunk_num = (i // chunk_size) + 1 query = ",".join(chunk) try: print(f" [{chunk_num}/{total_chunks}] Processing {len(chunk)} IDs...", end=" ") results = u.mapping(fr=from_db, to=to_db, query=query) if results: all_results.update(results) mapped_count = len([v for v in results.values() if v]) print(f"✓ Mapped: {mapped_count}/{len(chunk)}") else: print(f"✗ No mappings returned") failed_ids.extend(chunk) # Rate limiting if delay > 0 and i + chunk_size < len(ids): time.sleep(delay) except Exception as e: print(f"✗ Error: {e}") # Try individual IDs in failed chunk print(f" Retrying individual IDs...") for single_id in chunk: try: result = u.mapping(fr=from_db, to=to_db, query=single_id) if result: all_results.update(result) print(f" ✓ {single_id}") else: failed_ids.append(single_id) print(f" ✗ {single_id} - no mapping") except Exception as e2: failed_ids.append(single_id) print(f" ✗ {single_id} - {e2}") time.sleep(0.2) # Add missing IDs to results (mark as failed) for id_ in ids: if id_ not in all_results: all_results[id_] = None print(f"\n✓ Conversion complete:") print(f" Total: {len(ids)}") print(f" Mapped: {len([v for v in all_results.values() if v])}") print(f" Failed: {len(failed_ids)}") return all_results, failed_ids def save_mapping_csv(mapping, output_file, from_db, to_db): """Save mapping results to CSV.""" print(f"\nSaving results to {output_file}...") with open(output_file, 'w', newline='') as f: writer = csv.writer(f) # Header writer.writerow(['Source_ID', 'Source_DB', 'Target_IDs', 'Target_DB', 'Mapping_Status']) # Data for source_id, target_ids in sorted(mapping.items()): if target_ids: target_str = ";".join(target_ids) status = "Success" else: target_str = "" status = "Failed" writer.writerow([source_id, from_db, target_str, to_db, status]) print(f"✓ Results saved") def save_failed_ids(failed_ids, output_file): """Save failed IDs to file.""" if not failed_ids: return print(f"\nSaving failed IDs to {output_file}...") with open(output_file, 'w') as f: for id_ in failed_ids: f.write(f"{id_}\n") print(f"✓ Saved {len(failed_ids)} failed ID(s)") def print_mapping_summary(mapping, from_db, to_db): """Print summary of mapping results.""" print(f"\n{'='*70}") print("MAPPING SUMMARY") print(f"{'='*70}") total = len(mapping) mapped = len([v for v in mapping.values() if v]) failed = total - mapped print(f"\nSource database: {from_db}") print(f"Target database: {to_db}") print(f"\nTotal identifiers: {total}") print(f"Successfully mapped: {mapped} ({mapped/total*100:.1f}%)") print(f"Failed to map: {failed} ({failed/total*100:.1f}%)") # Show some examples if mapped > 0: print(f"\nExample mappings (first 5):") count = 0 for source_id, target_ids in mapping.items(): if target_ids: target_str = ", ".join(target_ids[:3]) if len(target_ids) > 3: target_str += f" ... +{len(target_ids)-3} more" print(f" {source_id} → {target_str}") count += 1 if count >= 5: break # Show multiple mapping statistics multiple_mappings = [v for v in mapping.values() if v and len(v) > 1] if multiple_mappings: print(f"\nMultiple target mappings: {len(multiple_mappings)} ID(s)") print(f" (These source IDs map to multiple target IDs)") print(f"{'='*70}") def list_common_databases(): """Print list of common database codes.""" print("\nCommon Database Codes:") print("-" * 70) print(f"{'Alias':<20} {'Official Code':<30}") print("-" * 70) for alias, code in sorted(DATABASE_CODES.items()): if alias != code.lower(): print(f"{alias:<20} {code:<30}") print("-" * 70) print("\nNote: Many other database codes are supported.") print("See UniProt documentation for complete list.") def main(): """Main conversion workflow.""" parser = argparse.ArgumentParser( description="Batch convert biological identifiers between databases", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python batch_id_converter.py uniprot_ids.txt --from UniProtKB_AC-ID --to KEGG python batch_id_converter.py ids.txt --from GeneID --to UniProtKB -o mapping.csv python batch_id_converter.py ids.txt --from uniprot --to ensembl --chunk-size 50 Common database codes: UniProtKB_AC-ID, KEGG, GeneID, Ensembl, Ensembl_Protein, RefSeq_Protein, PDB, HGNC, GO, Pfam, InterPro, Reactome Use --list-databases to see all supported aliases. """ ) parser.add_argument("input_file", help="Input file with IDs (one per line)") parser.add_argument("--from", dest="from_db", required=True, help="Source database code") parser.add_argument("--to", dest="to_db", required=True, help="Target database code") parser.add_argument("-o", "--output", default=None, help="Output CSV file (default: mapping_results.csv)") parser.add_argument("--chunk-size", type=int, default=100, help="Number of IDs per batch (default: 100)") parser.add_argument("--delay", type=float, default=0.5, help="Delay between batches in seconds (default: 0.5)") parser.add_argument("--save-failed", action="store_true", help="Save failed IDs to separate file") parser.add_argument("--list-databases", action="store_true", help="List common database codes and exit") args = parser.parse_args() # List databases and exit if args.list_databases: list_common_databases() sys.exit(0) print("=" * 70) print("BIOSERVICES: Batch Identifier Converter") print("=" * 70) # Normalize database codes from_db = normalize_database_code(args.from_db) to_db = normalize_database_code(args.to_db) if from_db != args.from_db: print(f"\nNote: Normalized '{args.from_db}' → '{from_db}'") if to_db != args.to_db: print(f"Note: Normalized '{args.to_db}' → '{to_db}'") # Read input IDs try: ids = read_ids_from_file(args.input_file) except Exception as e: print(f"\n✗ Error reading input file: {e}") sys.exit(1) if not ids: print("\n✗ No IDs found in input file") sys.exit(1) # Perform conversion mapping, failed_ids = batch_convert( ids, from_db, to_db, chunk_size=args.chunk_size, delay=args.delay ) # Print summary print_mapping_summary(mapping, from_db, to_db) # Save results output_file = args.output or "mapping_results.csv" save_mapping_csv(mapping, output_file, from_db, to_db) # Save failed IDs if requested if args.save_failed and failed_ids: failed_file = output_file.replace(".csv", "_failed.txt") save_failed_ids(failed_ids, failed_file) print(f"\n✓ Done!") if __name__ == "__main__": main()