Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:02:37 +08:00
commit c1d9dee646
38 changed files with 11210 additions and 0 deletions

View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Rename genome files with clean, meaningful sample names for phylogenomics
This script helps create a mapping between genome files (often with cryptic
accession numbers) and clean species/sample names that will appear in the
final phylogenetic tree.
Usage:
# Interactive mode - prompts for names
python rename_genomes.py --interactive genome1.fasta genome2.fasta
# From mapping file (TSV: old_name<TAB>new_name)
python rename_genomes.py --mapping samples.tsv
# Create template mapping file
python rename_genomes.py --create-template *.fasta > samples.tsv
Author: Bruno de Medeiros (Field Museum)
Based on tutorials by Paul Frandsen (BYU)
"""
import argparse
import os
import sys
import shutil
from pathlib import Path
def sanitize_name(name):
"""
Sanitize a name to be phylogenomics-safe
- Replace spaces with underscores
- Remove special characters
- Keep only alphanumeric, underscore, hyphen
"""
# Replace spaces with underscores
name = name.replace(' ', '_')
# Remove special characters except underscore and hyphen
name = ''.join(c for c in name if c.isalnum() or c in '_-')
return name
def create_template(genome_files, output=sys.stdout):
"""Create a template mapping file"""
output.write("# Sample mapping file\n")
output.write("# Format: original_filename<TAB>new_sample_name\n")
output.write("# Edit the second column with meaningful species/sample names\n")
output.write("# Recommended format: [ACCESSION]_[NAME] (e.g., GCA000123456_Penstemon_eatonii)\n")
output.write("# This keeps accession for traceability while having readable names in trees\n")
output.write("# Names should contain only letters, numbers, underscores, and hyphens\n")
output.write("#\n")
for gfile in genome_files:
basename = Path(gfile).stem # Remove extension
output.write(f"{gfile}\t{basename}\n")
def read_mapping(mapping_file):
"""Read mapping from TSV file"""
mapping = {}
with open(mapping_file, 'r') as f:
for line in f:
line = line.strip()
# Skip comments and empty lines
if not line or line.startswith('#'):
continue
parts = line.split('\t')
if len(parts) != 2:
print(f"Warning: Skipping invalid line: {line}", file=sys.stderr)
continue
old_name, new_name = parts
new_name = sanitize_name(new_name)
mapping[old_name] = new_name
return mapping
def interactive_rename(genome_files):
"""Interactively ask for new names"""
mapping = {}
print("Enter new sample names for each genome file.")
print("Press Enter to keep the current name.")
print("Names will be sanitized (spaces→underscores, special chars removed)\n")
for gfile in genome_files:
current_name = Path(gfile).stem
new_name = input(f"{gfile} → [{current_name}]: ").strip()
if not new_name:
new_name = current_name
new_name = sanitize_name(new_name)
mapping[gfile] = new_name
print(f" Will rename to: {new_name}.fasta\n")
return mapping
def rename_files(mapping, dry_run=False, backup=True):
"""Rename genome files according to mapping"""
renamed = []
errors = []
for old_file, new_name in mapping.items():
if not os.path.exists(old_file):
errors.append(f"File not found: {old_file}")
continue
# Get extension from original file
ext = Path(old_file).suffix
if not ext:
ext = '.fasta'
new_file = f"{new_name}{ext}"
# Check if target exists
if os.path.exists(new_file) and new_file != old_file:
errors.append(f"Target exists: {new_file}")
continue
# Skip if names are the same
if old_file == new_file:
print(f"Skip (no change): {old_file}")
continue
if dry_run:
print(f"[DRY RUN] Would rename: {old_file}{new_file}")
else:
# Backup if requested
if backup:
backup_file = f"{old_file}.backup"
shutil.copy2(old_file, backup_file)
print(f"Backup created: {backup_file}")
# Rename
shutil.move(old_file, new_file)
print(f"Renamed: {old_file}{new_file}")
renamed.append((old_file, new_file))
return renamed, errors
def main():
parser = argparse.ArgumentParser(
description="Rename genome files with meaningful sample names for phylogenomics",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Create template mapping file
python rename_genomes.py --create-template *.fasta > samples.tsv
# Edit samples.tsv, then apply mapping
python rename_genomes.py --mapping samples.tsv
# Interactive renaming
python rename_genomes.py --interactive genome1.fasta genome2.fasta
# Dry run (preview changes)
python rename_genomes.py --mapping samples.tsv --dry-run
"""
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
'--create-template',
nargs='+',
metavar='GENOME',
help='Create a template mapping file from genome files'
)
group.add_argument(
'--mapping',
metavar='FILE',
help='TSV file with mapping (old_name<TAB>new_name)'
)
group.add_argument(
'--interactive',
nargs='+',
metavar='GENOME',
help='Interactively rename genome files'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Show what would be renamed without actually renaming'
)
parser.add_argument(
'--no-backup',
action='store_true',
help='Do not create backup files'
)
args = parser.parse_args()
# Create template
if args.create_template:
create_template(args.create_template)
return
# Interactive mode
if args.interactive:
mapping = interactive_rename(args.interactive)
# Mapping file mode
elif args.mapping:
mapping = read_mapping(args.mapping)
else:
parser.error("No mode specified")
if not mapping:
print("No files to rename", file=sys.stderr)
return
# Perform renaming
renamed, errors = rename_files(
mapping,
dry_run=args.dry_run,
backup=not args.no_backup
)
# Summary
print("\n" + "="*60)
if args.dry_run:
print("DRY RUN - No files were actually renamed")
else:
print(f"Successfully renamed {len(renamed)} file(s)")
if errors:
print(f"\nErrors ({len(errors)}):")
for error in errors:
print(f" - {error}")
sys.exit(1)
if __name__ == "__main__":
main()