Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:02:37 +08:00
commit c1d9dee646
38 changed files with 11210 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Convert FASconCAT info file to IQ-TREE partition format
Usage:
python convert_fasconcat_to_partition.py FcC_info.xls [output_file.txt]
Author: Bruno de Medeiros (Field Museum)
Based on tutorials by Paul Frandsen (BYU)
"""
import sys
def convert_fcc_to_partition(fcc_file, output_file="partition_def.txt", data_type="AA"):
    """
    Convert FASconCAT info file to IQ-TREE partition format

    The first two lines of the info file are treated as headers and skipped.
    Every following tab-separated row that provides a locus name, a start and
    an end position is written as ``<data_type>, <locus> = <start>-<end>``.
    Blank rows and rows with fewer than three non-empty columns are ignored.

    Args:
        fcc_file: Path to FcC_info.xls file from FASconCAT
        output_file: Path to output partition definition file
        data_type: Label written in front of every partition. Defaults to
            "AA", which is what earlier versions of this function always
            wrote; pass e.g. "DNA" for nucleotide alignments.

    Prints a short summary to stdout. Exits the process with status 1 when
    ``fcc_file`` does not exist.
    """
    try:
        with open(fcc_file, 'r') as f:
            lines = f.readlines()
    except FileNotFoundError:
        # Errors go to stderr so they never mix with regular output.
        print(f"Error: File '{fcc_file}' not found", file=sys.stderr)
        sys.exit(1)

    partitions = []
    # Skip first two header lines (FASconCAT INFO and column headers)
    for line in lines[2:]:
        fields = [field.strip() for field in line.strip().split('\t')]
        if len(fields) < 3 or not all(fields[:3]):
            continue
        locus, start, end = fields[:3]
        partitions.append(f"{data_type}, {locus} = {start}-{end}\n")

    with open(output_file, 'w') as out:
        out.writelines(partitions)

    print(f"Partition file created: {output_file}")
    print(f"Number of partitions: {len(partitions)}")
def main():
    """Command-line entry point.

    Expects the FASconCAT info file as first argument and, optionally, the
    name of the partition file to write as second argument. Prints the usage
    text and exits with status 1 when no argument is given.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python convert_fasconcat_to_partition.py FcC_info.xls [output_file.txt]")
        print("\nConverts FASconCAT info file to IQ-TREE partition format")
        sys.exit(1)

    source_path = cli_args[0]
    if len(cli_args) > 1:
        target_path = cli_args[1]
    else:
        target_path = "partition_def.txt"
    convert_fcc_to_partition(source_path, target_path)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
Download genomes from NCBI using BioProject or Assembly accessions
Usage:
python download_ncbi_genomes.py --bioprojects PRJNA12345 PRJEB67890
python download_ncbi_genomes.py --assemblies GCA_123456789.1 GCF_987654321.1
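Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib) for BioProject lookups,
and the NCBI 'datasets' command-line tool (conda install -c conda-forge ncbi-datasets-cli) for downloads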
Author: Bruno de Medeiros (Field Museum)
Based on tutorials by Paul Frandsen (BYU)
"""
import argparse
import sys
import subprocess
def download_using_cli(accessions, output_file="genomes.zip"):
    """
    Download genomes using NCBI datasets CLI

    Runs ``datasets download genome accession <accessions...> --filename
    <output_file>`` and echoes the captured output of the tool once it has
    finished.

    Args:
        accessions: List of BioProject or Assembly accessions
        output_file: Name of output zip file

    Returns:
        True when the command succeeded. False when no accession was given,
        when the command exited with an error, or when the ``datasets``
        executable could not be found.
    """
    accessions = list(accessions)
    if not accessions:
        # Without this guard the CLI would be started with no accession at
        # all and fail with a far less helpful message.
        print("Error: no accessions to download", file=sys.stderr)
        return False

    cmd = ["datasets", "download", "genome", "accession"] + accessions + ["--filename", output_file]
    print(f"Running: {' '.join(cmd)}")
    print("")
    try:
        # Argument list, no shell: accessions are never interpreted by a shell.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading genomes: {e}", file=sys.stderr)
        print(e.stderr, file=sys.stderr)
        return False
    except FileNotFoundError:
        print("Error: 'datasets' command not found", file=sys.stderr)
        print("Install with: conda install -c conda-forge ncbi-datasets-cli", file=sys.stderr)
        return False

    print(result.stdout)
    print(f"\nDownload complete: {output_file}")
    print("Extract with: unzip " + output_file)
    return True
def get_bioproject_assemblies(bioprojects):
    """
    Look up the assemblies that belong to the given BioProjects.

    The lookup helper is imported from ncbi-datasets-pylib at call time; if
    that package is missing, an installation hint is printed to stderr and
    the process exits with status 1. Every assembly found is echoed to
    stdout as it is collected.

    Args:
        bioprojects: List of BioProject accessions

    Returns:
        List of tuples (assembly_accession, organism_name)
    """
    try:
        from ncbi.datasets.metadata.genome import (
            get_assembly_metadata_by_bioproject_accessions as fetch_metadata,
        )
    except ImportError:
        print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
        print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
        sys.exit(1)

    print(f"Fetching assembly information for {len(bioprojects)} BioProject(s)...")
    print("")

    found = []
    for record in fetch_metadata(bioprojects):
        accession = record.accession
        organism = record.organism.organism_name
        found.append((accession, organism))
        print(f" {organism}: {accession}")

    print(f"\nFound {len(found)} assemblies")
    return found
def main():
    """Command-line entry point.

    Exactly one of ``--bioprojects`` or ``--assemblies`` must be given.
    BioProjects are first resolved to assembly accessions; with
    ``--list-only`` those accessions are printed and nothing is downloaded.
    Otherwise the accessions are downloaded into the zip file named by
    ``-o/--output``.

    ``--list-only`` is rejected in ``--assemblies`` mode instead of being
    silently ignored, so asking for a listing can never start a download.

    Exits with status 0 on success, 1 when nothing could be downloaded and
    2 (argparse default) on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Download genomes from NCBI using BioProject or Assembly accessions"
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--bioprojects",
        nargs="+",
        help="BioProject accessions (e.g., PRJNA12345 PRJEB67890)"
    )
    group.add_argument(
        "--assemblies",
        nargs="+",
        help="Assembly accessions (e.g., GCA_123456789.1 GCF_987654321.1)"
    )
    parser.add_argument(
        "-o", "--output",
        default="genomes.zip",
        help="Output zip file name (default: genomes.zip)"
    )
    parser.add_argument(
        "--list-only",
        action="store_true",
        help="List assemblies without downloading (BioProject mode only)"
    )
    args = parser.parse_args()

    if args.list_only and not args.bioprojects:
        parser.error("--list-only can only be used together with --bioprojects")

    if args.bioprojects:
        assemblies = get_bioproject_assemblies(args.bioprojects)
        if args.list_only:
            print("\nAssembly accessions (use with --assemblies to download):")
            for acc, _name in assemblies:
                print(acc)
            return
        accessions = [acc for acc, _name in assemblies]
        if not accessions:
            print("Error: no assemblies found for the given BioProject(s)", file=sys.stderr)
            sys.exit(1)
    else:
        accessions = args.assemblies

    # Download assemblies
    success = download_using_cli(accessions, args.output)
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,88 @@
#!/bin/bash
# Extract and reorganize single-copy orthologs from compleasm output
#
# Usage: bash extract_orthologs.sh LINEAGE_NAME
# Example: bash extract_orthologs.sh metazoa
#
# Reads:  01_busco_results/<genome>_compleasm/<LINEAGE>_odb*/gene_marker.fasta
# Writes: single_copy_orthologs/<genome>.fasta               (one per genome)
#         single_copy_orthologs/unaligned_aa/<ortholog>.fas  (one per locus)
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)

if [ $# -lt 1 ]; then
    echo "Usage: bash extract_orthologs.sh LINEAGE_NAME"
    echo " Example: bash extract_orthologs.sh metazoa"
    exit 1
fi

LINEAGE="$1"
echo "Extracting single-copy orthologs for lineage: ${LINEAGE}"

# Create directory for ortholog FASTA files
mkdir -p single_copy_orthologs

# Copy gene_marker.fasta files and rename by species
count=0
for dir in 01_busco_results/*_compleasm; do
    # An unmatched glob stays literal; the directory test filters it out
    if [ ! -d "${dir}" ]; then
        continue
    fi
    genome=$(basename "${dir}" _compleasm)

    # Auto-detect the OrthoDB version (odb10, odb11, odb12, etc.)
    odb_dirs=("${dir}/${LINEAGE}_odb"*)
    if [ ! -d "${odb_dirs[0]}" ]; then
        echo " Warning: No OrthoDB directory found for ${genome}" >&2
        continue
    fi
    marker_file="${odb_dirs[0]}/gene_marker.fasta"

    if [ -f "${marker_file}" ]; then
        cp "${marker_file}" "single_copy_orthologs/${genome}.fasta"
        echo " Extracted: ${genome}"
        count=$((count + 1))
    else
        echo " Warning: Marker file not found for ${genome}" >&2
    fi
done

if [ "${count}" -eq 0 ]; then
    echo "Error: No gene_marker.fasta files found. Check lineage name." >&2
    exit 1
fi
echo "Extracted ${count} genomes"
echo ""
echo "Now generating per-locus unaligned FASTA files..."

cd single_copy_orthologs || exit 1
mkdir -p unaligned_aa
cd unaligned_aa || exit 1

# The awk step below appends to the per-locus files, so files left over from
# an earlier run must go first; otherwise every taxon would be duplicated.
rm -f -- *.fas

# AWK script to split by ortholog ID.
# Records are separated by ">", so $1 is the header line and $2..$NF are the
# sequence lines. The ortholog ID is the header text before the first "_";
# the new header is the genome name taken from the input file name.
awk 'BEGIN { RS = ">"; FS = "\n" }
NF > 1 {
    split($1, id_parts, "_")
    locus_file = id_parts[1] ".fas"

    n_parts = split(FILENAME, path_parts, "/")
    species = path_parts[n_parts]
    sub(/\.fasta$/, "", species)    # literal, anchored suffix only

    # Join all sequence lines so wrapped FASTA records are kept complete
    sequence = ""
    for (i = 2; i <= NF; i++) {
        sequence = sequence $i
    }

    print ">" species "\n" sequence >> locus_file
    close(locus_file)               # avoid running out of open file handles
}' ../*.fasta

# No separate header clean-up pass is needed: the ".fasta" suffix is already
# removed above, and an in-place sed over whole files could also touch
# sequence lines.

num_loci=$(find . -maxdepth 1 -type f -name '*.fas' | wc -l | tr -d ' ')
echo "Unaligned ortholog files generated: ${num_loci} loci"
echo ""
echo "Output directory: single_copy_orthologs/unaligned_aa/"

View File

@@ -0,0 +1,59 @@
#!/bin/bash
# Quality control report generator for compleasm results
#
# Usage: bash generate_qc_report.sh [output_file.csv]
#
# Reads 01_busco_results/<genome>_compleasm/summary.txt for every genome and
# writes one CSV row per genome to the output file (default: qc_report.csv).
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)

OUTPUT_FILE="${1:-qc_report.csv}"

# Print the count of the first line starting with "<TAG>:" in a summary file.
# Expected line format: "S:80.93%, 2283" -> prints 2283.
# Usage: extract_count TAG FILE
extract_count() {
    awk -F',' -v prefix="$1:" '
        index($0, prefix) == 1 { gsub(/[[:space:]]/, "", $2); print $2; exit }
    ' "$2"
}

# Succeed only if the argument is a non-empty string of digits.
is_count() {
    [[ "$1" =~ ^[0-9]+$ ]]
}

echo "Genome,Complete_SCO,Fragmented,Duplicated,Missing,Completeness(%)" > "${OUTPUT_FILE}"

count=0
for dir in 01_busco_results/*_compleasm; do
    if [ ! -d "${dir}" ]; then
        continue
    fi
    genome=$(basename "${dir}" _compleasm)
    summary="${dir}/summary.txt"

    if [ ! -f "${summary}" ]; then
        echo "Warning: Summary file not found for ${genome}" >&2
        continue
    fi

    # Parse completeness statistics from compleasm format
    # compleasm uses: S: (single-copy), D: (duplicated), F: (fragmented), M: (missing)
    # NOTE(review): other categories that a summary may list are not part of
    # the total below - confirm against the compleasm version in use.
    complete=$(extract_count S "${summary}")
    duplicated=$(extract_count D "${summary}")
    fragmented=$(extract_count F "${summary}")
    missing=$(extract_count M "${summary}")

    # All four values feed the arithmetic below, so all four are validated
    if ! is_count "${complete}" || ! is_count "${duplicated}" ||
       ! is_count "${fragmented}" || ! is_count "${missing}"; then
        echo "Warning: Could not parse statistics for ${genome}" >&2
        continue
    fi

    total=$((complete + duplicated + fragmented + missing))
    if [ "${total}" -eq 0 ]; then
        echo "Warning: No genes reported for ${genome}" >&2
        continue
    fi

    # Completeness = (single-copy + duplicated) / total * 100.
    # Computed with awk only: "scale=2" in bc truncates the ratio to two
    # decimals before the multiplication (80.93 % would become 80.00 %).
    completeness=$(awk -v c="${complete}" -v d="${duplicated}" -v t="${total}" \
        'BEGIN { printf "%.2f", (c + d) / t * 100 }')

    echo "${genome},${complete},${fragmented},${duplicated},${missing},${completeness}" >> "${OUTPUT_FILE}"
    count=$((count + 1))
done

if [ "${count}" -eq 0 ]; then
    echo "Error: No compleasm output directories found (*_compleasm)" >&2
    exit 1
fi

echo "QC report generated: ${OUTPUT_FILE}"
echo "Genomes analyzed: ${count}"

View File

@@ -0,0 +1,742 @@
#!/usr/bin/perl
use strict ;
use File::Copy ;
use Tie::File ;
use Fcntl ;
use Term::Cap ;
use Term::ANSIColor qw(:constants);
use Getopt::Std ;
# updated on 13th february , 2009 by patrick kück
# updated on 2nd april , 2009 by patrick kück
# updated on 15th june , 2009 by patrick kück
# updated on 26th july , 2009 by patrick kück
# updated on 7th september, 2011 by patrick kück (alicut v2.3)
# updated on 22.2.2017, by patrick kück (alicut v2.31) -> correction of initial warning due to line 547, changed some terminal prints, argv handling commands
# Option state: element [0] of each list is the answer currently in effect;
# an option is toggled by reversing its list.
my @answer_remain_stems = qw( no yes ) ;
my @answer_codons       = qw( no yes ) ;
my @answer_third_pos    = qw( no yes ) ;

# Apply command line switches first, then open the interactive menu.
my @option_refs = ( \@answer_remain_stems, \@answer_codons, \@answer_third_pos ) ;
&argv_handling ( @option_refs ) ;
&menu          ( @option_refs ) ;
# Evaluate the command line switches (-r, -c, -3, -h, -p, -s).
#
# Arguments: references to the three option lists (remain stems, remove
# codon, remove 3rd position); element [0] of each list is the active answer.
#
# Switches are matched case-insensitively. The toggles are applied before
# the start command, whatever the order or letter case on the command line.
# Without a start command the interactive menu is opened afterwards.
sub argv_handling{

    my ( $aref_remain_stems, $aref_codons, $aref_third_pos ) = @_ ;

    my $commandline = join "", @ARGV ;
    $commandline    =~ s/\s+//g ;

    my @commands = split "-", $commandline ;
    shift @commands ;   # text in front of the first "-" is not a command

    # Order case-insensitively and force "s" to the very end. A plain sort
    # would put an upper-case "S" in front of "c" and "r", so the run would
    # start before those options had been switched on.
    my @ordered = sort {
        ( lc($a) eq 's' ) <=> ( lc($b) eq 's' ) or lc($a) cmp lc($b)
    } @commands ;

    for my $single_command ( @ordered ){

        if    ( $single_command =~ /^r$/i ) { @$aref_remain_stems = ( reverse @$aref_remain_stems ) }
        elsif ( $single_command =~ /^c$/i ) { @$aref_codons       = ( reverse @$aref_codons       ) }
        elsif ( $single_command =~ /^3$/i ) { @$aref_third_pos    = ( reverse @$aref_third_pos    ) }
        elsif ( $single_command =~ /^h$/i ) { &help    }   # "&name" without list forwards @_
        elsif ( $single_command =~ /^p$/i ) { &preface }
        elsif ( $single_command =~ /^s$/i ) {

            &header ;
            &commands( \$aref_remain_stems->[0], \$aref_codons->[0], \$aref_third_pos->[0] ) ;
            &start   ( \$aref_remain_stems->[0], \$aref_codons->[0], \$aref_third_pos->[0] )
        }
        else { print "\n\t!COMMAND-ERROR!: unknown command \"-", $single_command, "\"\n" }
    }

    &menu ( \@$aref_remain_stems, \@$aref_codons, \@$aref_third_pos )
}
# Print the right-aligned welcome banner.
sub header{

    my $rule = "------------------------------------------------------------" ;

    # Each entry: printf format and the text printed with it
    my @banner = (
        [ "\n%68s\n", $rule                                         ],
        [ "%49s\n"  , "Welcome to ALICUT V2.31 !"                   ],
        [ "%60s\n"  , "a Perlscript to cut ALISCORE identified RSS" ],
        [ "%57s\n"  , "written by Patrick Kueck (ZFMK, Bonn)"       ],
        [ "%68s\n\n", $rule                                         ],
    ) ;

    for my $row ( @banner ){

        my ( $format, $text ) = @$row ;
        printf $format, $text ;
    }
}
# Print the current setting of the three options.
# Arguments: scalar references to the active answer of each option.
sub commands{

    my ( $sref_rem_stems, $sref_reo_codon, $sref_th_posit ) = @_ ;

    my $rule = "\n\t------------------------------------------------------------" ;

    print $rule,
          "\n\tRemain Stem Position :\t", $$sref_rem_stems,
          "\n\tRemove Codon :\t",         $$sref_reo_codon,
          "\n\tRemove 3rd Position :\t",  $$sref_th_posit,
          $rule, "\n" ;
}
# Print the built-in usage text, wait for <return> on STDIN and go back to
# the main menu.
sub help{
# The usage text is emitted verbatim from the here-document below.
print
<<info;
-------------------------------------------------------------------
-------------------------------------------------------------------
General Information and Usage:
-------------------------------
ALICUT V2.31 removes ALISCORE identified RSS positions
in given FASTA file(s) which are listed in the FASTA file cor-
responding ALISCORE "List" outfile(s). If structure sequences
are implemented, ALICUT V2.3 automatically replaces brackets
of non rss positions by dots when they are paired with rss
identified positions.
Start ALICUT under default
-------------------------------------------------------------------
To remove all ALISCORE identified RSS positions:
Type <s> return (via Menu) or
Type <perl ALICUT_V2.3.pl -s> <enter> (via command line)
R-Option (Remain Stems)
-------------------------------------------------------------------
To remain all stem positions of identified rss within FASTA file(s):
Type <r> <return> <s> <enter> (via Menu)
Type <perl ALICUT_V2.3.pl -r -s> <enter> (via command line)
C-Option (Remove Codon)
-------------------------------------------------------------------
To translate ALISCORE identified RSS positions of amino-acid data
into nucleotide triplet positions before exclusion of randomised
sequence sections:
Type <c> return <s> return (via Menu) or
Type <perl ALICUT_V2.3.pl -c -s> <enter> (via command line)
Note:
This option is only useful if you have analysed amino-acid
data, but wish to exclude nucleotide positions from the amino-acid
data corresponding nucleotide data.
Be aware, that the name of the nucleotide data file has to be named
equal to the ALISCORE analysed amino-acid data file. The C-option
can not be applied on amino-acid sequences. Otherwise, ALICUT
excludes the original ALISCORE identified sequence sections.
3-Option (Remove 3rd position)
-------------------------------------------------------------------
To remove ALISCORE identified RSS only if its sequence position is
up to amultiple of 3:
Type <3> <return> <s> <return> (via Menu)
Type <perl ALICUT_V2.3.pl -3 -s> <enter> (via command line)
Note:
The 3-Option can be combined with the C-option. In this case,
positions of the ALISCORE "List" outfile(s) are translated into
codon positions from which only the 3rd positions are excluded.
The 3-Option can only be applied on nucleotide data. Otherwise,
ALICUT excludes the original ALISCORE identified sequence sections.
ALICUT IN and OUT files
-------------------------------------------------------------------
ALICUT V2.3 needs the original ALISCORE FASTA infile(s) and "List"
outfile(s) in the same folder as ALICUT V2.3.
The "List" outfile(s) must contain the identified RSS positions
in one single line, separated by whitespace.
e.g. 1 3 5 6 8 9 10 11 123 127 10000 10001
ALICUT V2.0 can handle unlimited FASTA files in one single run.
The sole condition is that the Prefix of the ALISCORE "List"
outfile(s) are identic with the associated FASTA infile(s).
ALICUT V2.3 first searches for the ALISCORE "List" outfile(s),
removes the Suffix "_List_random.txt" and searches for the
"List" associated FASTA file(s).
e.g. COI.fas_List_random.txt (ALISCORE "List" outfile)
COI.fas (Associated FASTA infile)
If both files are detected, ALICUT V2.3 excludes the RSS identified
positions of the "List" file(s) in the associated
FASTA file(s) and saves the changes in a new FASTA outfile,
named "ALICUT_FASTAinputname.fas".
Under the C- and 3-Option, removed sequence positions differ from
the original "List" position numbers. Under both options, ALICUT
prints the actually removed positions in separate "ALICUT_LIST"
outfile(s).
ALICUT V2.3 generates also an info file "ALICUT_info". This file
informs about the number and percentage of removed positions, number
of single sequences, single parameter settings, and sequence states
of each restricted FASTA file.
If structure sequences are identified by ALICUT, ALICUT generates
structure info file(s) which lists remaining stem pairs and loop
positions, as well as percentages of both structure elements.
-------------------------------------------------------------------
-------------------------------------------------------------------
info
;
print "\tBACK to ALICUT MAIN-Menu:\t\t type <return>\n" ;
print "\n\t------------------------------------------------------------\n\t" ;
# The typed text itself is not evaluated; the read only waits for <return>.
chomp ( my $answer_xy = <STDIN> );
# "&menu" without an argument list hands the current @_ on to menu, i.e.
# whatever arguments this sub itself was called with.
&menu ;
}
# Print version, author and licence information, wait for <return> on STDIN
# and go back to the main menu.
sub preface{

    # The here-document is printed verbatim (its text starts in column 0 on
    # purpose). The banner names this program; it used to read "FASconCAT".
    print
<<preface
-----------------------ALICUT PREFACE-----------------------
Version : 2.31
Language : PERL
Last Update : 22nd February, 2017
Author : Patrick Kueck, ZFMK Bonn GERMANY
e-mail : patrick_kueck\@web.de
Homepage : http://www.zfmk.de
This program is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public
License as published by the Free Software Foundation ;
either version 2 of the License, or (at your option) any
later version.
This program is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public
License along with this program; if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
USA.
For further free downloadable programs visit:
www.zfmk.de/web/Forschung/Abteilungen/AG_Wgele/index.en.html
------------------------------------------------------------
preface
    ;

    print "\tBACK to ALICUT MAIN-Menu:\t\t type <return>\n" ;
    print "\n\t------------------------------------------------------------\n\t" ;

    # The typed text itself is not evaluated; the read only waits for <return>.
    chomp ( my $answer_xy = <STDIN> );

    # "&menu" without an argument list hands the current @_ on to menu.
    &menu;
}
# Interactive main menu: show the available commands and the current option
# settings, read one command and dispatch it.
#
# Arguments: references to the three option lists (remain stems, remove
# codon, remove 3rd position). @_ is deliberately left untouched, because
# every "&name" call without an argument list forwards it unchanged.
sub menu{

    my ( $aref_remain_stems, $aref_remove_codon, $aref_third_posit ) = @_ ;

    &header ;

    print "\n\tSTART ALICUT:\t\ttype <s> <return>",
          "\n\tQUIT ALICUT:\t\ttype <q> <return>",
          "\n\tREMAIN STEMS:\t\ttype <r> <return>",
          "\n\tREMOVE CODON:\t\ttype <c> <return>",
          "\n\tREMOVE 3rd:\t\ttype <3> <return>",
          "\n\tHELP:\t\t\ttype <h> <return>",
          "\n\tPREFACE:\t\ttype <p> <return>" ;

    &commands ( \$aref_remain_stems->[0], \$aref_remove_codon->[0], \$aref_third_posit->[0] );

    # Ask again until a known command has been typed
    my $answer_opening = &commandline ;
    while ( $answer_opening !~ /^s$|^r$|^c$|^p$|^h$|^1$|^2$|^q$|^3$/i ){

        print "\n\t!COMMAND-ERROR!: unknown command \"$answer_opening\"!\n" ;
        $answer_opening = &commandline ;
    }

    # The patterns exclude each other, so exactly one branch is taken
    if    ( $answer_opening =~ /^s$/i ){ &start ( \$aref_remain_stems->[0], \$aref_remove_codon->[0], \$aref_third_posit->[0] ) }
    elsif ( $answer_opening =~ /^r$/i ){ @$aref_remain_stems = ( reverse @$aref_remain_stems ); &menu }
    elsif ( $answer_opening =~ /^c$/i ){ @$aref_remove_codon = ( reverse @$aref_remove_codon ); &menu }
    elsif ( $answer_opening =~ /^3$/i ){ @$aref_third_posit  = ( reverse @$aref_third_posit  ); &menu }
    elsif ( $answer_opening =~ /^q$/i ){ exit }
    elsif ( $answer_opening =~ /^h$/i ){ &help }
    # NOTE(review): error1 and error2 are not defined in the part of the
    # file visible here - confirm they exist before relying on "1" and "2".
    elsif ( $answer_opening =~ /^1$/  ){ &error1 }
    elsif ( $answer_opening =~ /^2$/  ){ &error2 }
    elsif ( $answer_opening =~ /^p$/i ){ &preface }
}
# Main routine: for every ALISCORE list file (*List_*.txt) in the current
# directory, cut the listed positions out of the FASTA file it belongs to.
#
# Arguments: scalar references to the active answer ('yes'/'no') of the
# options remain stems, remove codon and remove 3rd position.
#
# Writes one ALICUT_[codon_][3rd_]<fasta> file per handled FASTA file,
# appends one row per file to ALICUT_info.xls, prints a summary and then
# terminates the program.
sub start{

    my ( $sref_stems_remain, $sref_codon_remove, $sref_third_remove ) = @_ ;

    # Number of list/FASTA pairs that were cut successfully
    my $j = 0 ;

    open( OUTinfo, ">>ALICUT_info.xls" ) or die "\n\t!FILE-ERROR!: Can not open ALICUT_info.xls!\n" ;
    print OUTinfo "\nUsed List File\tUsed Fasta file\tremove triplets\tremove 3rd position\tnumber taxa\tbp before\tbp after\tremaining bp [%]\tsequence type\n" ;

    # Read IN of all List_random.txt files within the same folder as ALICUT and handle it
    READING:
    foreach my $file ( <*List_*.txt> ) {

        # Read in of the ALISCORE-list outfile
        &tie_linefeeds ( \$file ) ;
        ( open IN, "<$file" ) or die "\n\t!FILE-ERROR!: Can not open listfile $file!\n" ;
        my $line = <IN> ; chomp $line ;
        close IN ;

        # check for correct aliscore list format
        unless ( $line =~ /^(\d+ )+\d+$|^\d+$/ ) { warn "\t!FILE-WARN!: $file has no ALISCORE list format!\n" ; next READING }

        # Total number of randomized identified positions
        my @cut_positions = split " ", $line ;

        # "filename.fas_List_random.txt" to "filename.fas"
        ( my $file_fasta = $file ) =~ s/_List_.+// ;

        # Read in of the original ALISCORE fasta infile which belongs to the listfile
        &tie_linefeeds ( \$file_fasta ) ;
        ( open INfas, "<$file_fasta" ) or warn "\t!FILE-WARN!: Can not find $file_fasta!\n" and next READING ;
        chomp ( my @inputfile = <INfas> ) ; close INfas ;

        # Explicit block: written as one statement modifier, the jump to the
        # next file was taken before the warning could be printed.
        if ( 0 == @inputfile ){ warn "\t!FILE-WARN!: File $file_fasta is empty!\n" ; next READING }

        # Handle the FASTA file in the way that sequencename and sequence alternate in each line
        @inputfile = fas_bearbeiten ( @inputfile ) ;

        # Generate a hash: key=>taxon, value => sequence
        my %sequence = @inputfile ;
        my @values   = values %sequence ;

        # Determine basepositions before and after cut. Output of cuttings as total number and in percent
        my $number_sequences         = keys %sequence ;
        my $number_characters_before = length $values[0] ;

        # Check for correct FASTA format and handling of structure sequence
        my $sequence_state = 'nt' ;
        SEQUENCE_CHECK:
        for my $raw_taxon ( keys %sequence ){

            # if whitespace are between ">" and the next sign within a sequence name, delete these whitespaces
            $raw_taxon =~ s/^\>\s*/\>/g ;

            # if whitespaces between last sign and newline in sequence name, delete these whitespaces
            $raw_taxon =~ s/\s*$//g ;

            die "\n\t!FILE-ERROR!: $raw_taxon in $file_fasta is not in FASTA format!\n" if $raw_taxon !~ /^\>/ ;
            die "\n\t!FILE-ERROR!: Sequence name missing in $file_fasta!\n" if $raw_taxon =~ /^\>$/ ;
            die "\n\t!FILE-ERROR!: Sequence name $raw_taxon in $file_fasta involves forbidden signs!\n" if $raw_taxon !~ /\w/ ;
            die "\n\t!FILE-ERROR!: Sequences of $file_fasta have no equal length!\n" if length $sequence{$raw_taxon} != $number_characters_before ;
            die "\n\t!FILE-ERROR!: Sequence missing in $file_fasta!\n" if $sequence{$raw_taxon} =~ /^\n$|^$/ ;
            die "\n\t!FILE-ERROR!: Sequence length in $file_fasta is too short to cut all positions!\n" if $number_characters_before < $cut_positions[ $#cut_positions ] ;

            # Structure handling
            if ( $sequence{$raw_taxon} =~ /.*\(.*\).*/ ){

                $sequence{$raw_taxon} =~ s/-/./g ;
                my @strc_elements = split "" , $sequence{$raw_taxon} ;
                for my $str_sign ( @strc_elements ){

                    unless ( $str_sign =~ /\(|\)|\./ ){ die "\n\t!FILE-ERROR!: Structure string of $file_fasta involves forbidden signs in $raw_taxon!\n" }
                }

                my $structurestring = $sequence{$raw_taxon} ;
                $structurestring =~ s/-/./g ;
                $sequence{$raw_taxon} = &structure_handling ( \$structurestring, \$$sref_stems_remain, \@cut_positions, \$file_fasta ); next SEQUENCE_CHECK ;
            }

            # Check for correct sequence states
            $sequence{$raw_taxon} =~ s/(\w+)/\U$1/ig ;
            my @seq_elements = split "" , $sequence{$raw_taxon} ;
            for my $seq_sign ( @seq_elements ){

                unless ( $seq_sign =~ /A|C|G|T|U|-|N|Y|X|R|W|S|K|M|D|V|H|B|Q|E|I|L|F|P|\?/ ){ die "\n\t!FILE-ERROR!: Sequence of $file_fasta involves forbidden signs in $raw_taxon!\n" }
            }

            # Any of these letters marks the whole file as amino-acid data
            if ( $sequence{$raw_taxon} =~ /I|E|L|Q|F|P/ ) { $sequence_state = 'aa' }
        }

        # Translate cut positions; @fasta_cut receives the 0-based positions that remain
        my @fasta_cut;
        &translate_cut_positions( \$$sref_codon_remove, \$$sref_third_remove, \@cut_positions, \$number_characters_before, \@fasta_cut, \$sequence_state, \$file_fasta );

        # Calculate percent of remaining positions
        my $number_cut_positions    = @cut_positions ;
        my $number_characters_after = $number_characters_before-$number_cut_positions ;
        my $percent_left            = sprintf "%.1f", ( $number_characters_after / $number_characters_before ) * 100 ;
        $percent_left =~ s/\./,/g ;

        # Name of the outfile: ALICUT_[codon_][3rd_]<fasta name>
        my $out_prefix = 'ALICUT_' ;
        $out_prefix   .= 'codon_' if $$sref_codon_remove =~ /yes/ ;
        $out_prefix   .= '3rd_'   if $$sref_third_remove =~ /yes/ ;
        my $file_out   = $out_prefix.$file_fasta ;

        # Assume uncut positions to $final and print out to the outfile
        open( OUT, ">$file_out" ) or die "\n\t!FILE-ERROR!: Can not open $file_out!\n" ;
        for ( keys %sequence ){

            my @bases = split "", $sequence{$_} ;
            my @final = map { $bases[$_] } @fasta_cut ;
            my $final = $_."\n".( join "", @final )."\n" ;
            print OUT "$final" ;
        }
        close OUT;

        # Print Out of extra infos to ALICUT_info
        print OUTinfo "$file\t$file_fasta\t$$sref_codon_remove\t$$sref_third_remove\t$number_sequences\t$number_characters_before\t$number_characters_after\t$percent_left\t$sequence_state\n" ;
        print "\tDone : $file cut to $file_out\n" ;

        # Counted only here, so skipped list files are not reported as handled
        $j++ ;
    }

    close OUTinfo ;

    # Print OUT number of right handled FASTA files
    printf "\n%68s\n", "------------------------------------------------------------" ;
    printf "%42s\n",   "$j FASTA file(s) correctly handled!" ;
    printf "%57s\n",   "Further infos are printed out in ALICUT_info.xls!" ;
    printf "\n%64s\n", "ALICUT V2.31 Finished! Thank you and good bye!" ;
    printf "%68s\n",   "------------------------------------------------------------" ;

    &set_timer ;
    exit ;
}

# Rewrite the line ends of a file in place (carriage returns are replaced
# by line feeds).
# Argument: scalar reference to the file name.
# Must be called from inside the READING loop of start: an unreadable or
# empty file makes that loop continue with its next list file.
sub tie_linefeeds{

    my $sref_filename = $_[0] ;

    ( open IN , "<$$sref_filename" ) or warn "\tError: can not open $$sref_filename!\n" and next READING ;

    (tie ( my @data, 'Tie::File', $$sref_filename )) ;

    warn "\t!FILE-WARN!: $$sref_filename is empty!\n" and next READING if 0 == @data ;

    map { s/\r\n/\n/g } @data ;
    map { s/\r/\n/g   } @data ;

    untie @data ; close IN ;
}

# Print the user CPU time consumed so far.
sub set_timer{

    my ( $user, $system, $cuser, $csystem ) = times ;

    print <<TIME;
*** time used: $user sec ***
TIME
}

# Adjust the cut positions to the chosen options and work out which
# alignment positions remain.
#
# Arguments (all references):
#   [0] active answer "remove codon"
#   [1] active answer "remove 3rd position"
#   [2] list of 1-based cut positions (rewritten under the C-/3-option)
#   [3] alignment length
#   [4] list that receives the 0-based positions which are kept
#   [5] sequence state ('nt' or 'aa')
#   [6] FASTA file name (used in warnings only)
#
# Under the C- and 3-option the positions actually cut are also written to
# an ALICUT_cut_positions_*.txt file. For amino-acid data the positions are
# left unchanged and a warning is printed.
sub translate_cut_positions {

    my $sref_command_codon_remove = $_[0] ;
    my $sref_command_third_remove = $_[1] ;
    my $aref_cut_positions        = $_[2] ;
    my $sref_number_characters    = $_[3] ;
    my $aref_remaining_positions  = $_[4] ;
    my $sref_sequence_state       = $_[5] ;
    my $sref_filename             = $_[6] ;

    # Translate identified RSS aminoacid positions to nucleotide triplet positions
    if ( $$sref_command_codon_remove =~ /yes/ && $$sref_command_third_remove =~ /no/){

        unless ( $$sref_sequence_state =~ /aa/ ){

            my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
            for my $number( @fasta_old ){

                my $newno1 = ($number*3)-2;
                my $newno2 = $newno1+1;
                my $newno3 = $newno2+1;
                push @$aref_cut_positions, ( $newno1, $newno2, $newno3 )
            }

            my $string_cutnumbers = join " ", @$aref_cut_positions ;
            open OUTnewcut, ">ALICUT_cut_positions_codon.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_codon.txt" ;
            print OUTnewcut $string_cutnumbers ; close OUTnewcut ;
        }
        else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tCodon positions not translated!" }
    }

    # Translate identified RSS aminoacid positions to nucleotide triplet positions, but remove only third position
    elsif ( $$sref_command_codon_remove =~ /yes/ && $$sref_command_third_remove =~ /yes/){

        unless ( $$sref_sequence_state =~ /aa/ ){

            my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
            for my $number( @fasta_old ){

                push @$aref_cut_positions, ($number*3)
            }

            my $string_cutnumbers = join " ", @$aref_cut_positions ;
            open OUTnewcut, ">ALICUT_cut_positions_codon_3rd.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_codon_3rd.txt" ;
            print OUTnewcut $string_cutnumbers ; close OUTnewcut ;
        }
        else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tCodon positions not translated!\n\t3rd codon position not removed!" }
    }

    # Remove only identified RSS if third position of original sequence
    elsif ( $$sref_command_codon_remove =~ /no/ && $$sref_command_third_remove =~ /yes/){

        unless ( $$sref_sequence_state =~ /aa/ ){

            my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
            for my $number( @fasta_old ){

                if ( $number % 3 == 0 ){ push @$aref_cut_positions, $number }
            }

            my $string_cutnumbers = join " ", @$aref_cut_positions ;
            open OUTnewcut, ">ALICUT_cut_positions_3rd.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_3rd.txt" ;
            print OUTnewcut $string_cutnumbers ; close OUTnewcut
        }
        else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tNot only 3rd codon position removed!" }
    }

    # Examine remaining positions (0-based): every position that is not cut
    my ( %seen, @zahlenreihe ) ;
    for ( 1 .. $$sref_number_characters ) { push @zahlenreihe, $_-1 }
    for my $value ( @$aref_cut_positions ){ $seen{$value-1}++ }
    for ( @zahlenreihe ){ unless ( $seen{$_} ){ push @$aref_remaining_positions, $_ } }
}
# Normalise the chomped lines of a FASTA file.
# Returns a flat list in which sequence name and sequence alternate; the
# lines of a wrapped sequence are joined and all blanks are removed.
sub fas_bearbeiten{

    my @lines = @_ ;

    for my $entry ( @lines ){

        $entry =~ s/(\>.*)/$1\t/ ;  # mark the end of a header line with a tab
        $entry =~ s/ //g ;          # remove blanks
        $entry =~ s/\n//g ;         # remove line feeds that are still present
        $entry =~ s/\t/\n/g ;       # end of header      -> line feed
        $entry =~ s/\>/\n\>/g ;     # start of a header  -> on a line of its own
    }

    # Glue everything together and split again at the line feeds set above
    my @records = split "\n", join( "", @lines ) ;

    # Drop the field in front of the first header
    shift @records ;

    return @records ;
}
# Evaluate a dot-bracket structure string and adapt it to the cut positions.
#
# Arguments (all references):
#   [0] structure string consisting of "(", ")" and "."
#   [1] active answer of the option "remain stems" ('yes'/'no')
#   [2] list of 1-based cut positions; under "remain stems" all paired
#       positions are removed from this list
#   [3] FASTA file name (used for the name of the info file)
#
# Writes ALICUT_Struc_info_<file>.txt. Returns the structure string in its
# full length; unless stems remain, the partner of every cut stem position
# is replaced by a dot.
sub structure_handling{

    my ( $sref_string, $sref_answer_remain, $aref_cut_positions, $sref_filename ) = @_ ;

    my (
        @pair_infos ,               # "open:close" text of every stem pair
        @forward ,                  # stack of opening brackets still unpaired
        @loops ,                    # positions of unpaired characters
        @pairs ,                    # flat list: open1, close1, open2, close2, ...
        %structure_of_position ,    # 1-based position => structure sign
        %seen_struc
    );

    # Stem assignment
    my @structures = split "", $$sref_string ;
    my $i = 0 ;
    for my $sign ( @structures ){

        $i++ ;
        $structure_of_position{$i} = $sign ;

        if    ( $sign =~ /\(/ ){ push @forward, $i }
        elsif ( $sign =~ /\)/ ){

            my $pair_1 = pop @forward ;
            push @pairs     , ( $pair_1, $i ) ;
            push @pair_infos, ( $pair_1.":".$i ) ;
        }
        elsif ( $sign =~ /\./ ){ push @loops, $i }
    }

    @pair_infos = reverse @pair_infos ;

    # Generate listfiles for structure_info file
    my $pairlist = join "\n\t\t\t\t\t", @pair_infos ;
    my $looplist = join "\n\t\t\t\t\t", @loops ;

    # Number and proportion of stem and loop positions for structure info file
    my $N_total      = @structures ;
    my $N_stems      = @pair_infos ;        # number of pairs
    my $N_stem_chars = $N_stems * 2 ;       # two characters per pair
    my $N_loops      = $N_total - $N_stem_chars ;
    my $P_loops      = ( $N_loops / $N_total ) * 100 ;
    my $P_stems      = 100 - $P_loops ;

    # Open structure info outfile
    open OUTstruc, ">ALICUT_Struc_info_${$sref_filename}.txt" ;

    # Print out: every count goes together with its own percentage
    print OUTstruc "\nOriginal structure information identified in $$sref_filename:\n\n" ;
    print OUTstruc "- Number of characters:\t\t\t$N_total\n" ;
    print OUTstruc "- Number of single loop characters:\t$N_loops [$P_loops %]\n" ;
    print OUTstruc "- Number of paired stem characters:\t$N_stem_chars [$P_stems %]\n" ;
    print OUTstruc "\n- Paired stem positions:\t\t$pairlist\n\n" ;
    print OUTstruc "\n- Loop positions:\t\t\t$looplist\n" ;
    close OUTstruc;

    if ( $$sref_answer_remain =~ /yes/i ){

        # Remain rss identified stem positions within the MSA
        my @cut_positions2 = ();
        for ( @pairs ){ $seen_struc{$_} = 1 }
        for ( @$aref_cut_positions ){ unless ( $seen_struc{$_} ){ push @cut_positions2, $_ } }
        @$aref_cut_positions = @cut_positions2 ;
    }
    else{

        my %pair = @pairs;

        # Replace paired structure positions of rss identified positions by dots
        for my $bp_for ( keys %pair ){

            for my $rss ( @$aref_cut_positions ){

                if ( $bp_for        == $rss ){ $structure_of_position{$pair{$bp_for}} = "." ; last }
                if ( $pair{$bp_for} == $rss ){ $structure_of_position{$bp_for}        = "." ; last }
            }
        }
    }

    # Rebuild the string from ALL positions 1 .. N. Stopping at N-1 would
    # return a string that is one character shorter than the alignment.
    my $structure_string_neu = join "", map { $structure_of_position{$_} } ( 1 .. $N_total ) ;
    return $structure_string_neu ;
}
# Show the command prompt and return the line typed on STDIN without its
# trailing line feed.
sub commandline{

    print "\n\tCOMMAND:\t " ;

    my $sub_answer_opening = <STDIN> ;
    chomp $sub_answer_opening ;

    print "\n\t------------------------------------------------------------\n" ;

    return $sub_answer_opening;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
"""
Query NCBI for available genome assemblies by taxon name
Usage:
python query_ncbi_assemblies.py --taxon "Coleoptera"
python query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50
python query_ncbi_assemblies.py --taxon "Apis" --refseq-only
Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib)
Author: Bruno de Medeiros (Field Museum)
"""
import argparse
import sys
def query_assemblies_by_taxon(taxon, max_results=20, refseq_only=False):
    """
    Ask NCBI which genome assemblies exist for a taxon.

    The client classes are imported from ncbi-datasets-pylib at call time;
    if that package is missing, an installation hint is printed to stderr
    and the process exits with status 1. Any error raised while querying
    is reported on stderr and also ends the process with status 1.

    Args:
        taxon: Taxon name (e.g., "Coleoptera", "Drosophila melanogaster")
        max_results: Maximum number of results to return
        refseq_only: If True, only return RefSeq assemblies (GCF_*)

    Returns:
        List of dictionaries with the keys 'accession', 'organism',
        'assembly_level', 'assembly_name' and 'submission_date'; an empty
        list when the query reports nothing.
    """
    try:
        from ncbi.datasets import GenomeApi
        from ncbi.datasets.openapi import ApiClient, ApiException
    except ImportError:
        print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
        print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
        sys.exit(1)

    print(f"Querying NCBI for '{taxon}' genome assemblies...")
    print(f"(Limiting to {max_results} results)")
    if refseq_only:
        print("(RefSeq assemblies only)")
    print("")

    collected = []
    try:
        with ApiClient() as client:
            # Query genome assemblies for the taxon
            summary = GenomeApi(client).genome_summary_by_taxon(
                taxon=taxon,
                limit=str(max_results),
                filters_refseq_only=refseq_only
            )
            if not summary.reports:
                print(f"No assemblies found for taxon '{taxon}'")
                return []

            for report in summary.reports:
                collected.append({
                    'accession': report.accession,
                    'organism': report.organism.organism_name,
                    'assembly_level': report.assembly_info.assembly_level,
                    'assembly_name': report.assembly_info.assembly_name,
                    # Reports without a release date get a placeholder
                    'submission_date': getattr(report.assembly_info, 'release_date', 'N/A'),
                })
    except ApiException as e:
        print(f"Error querying NCBI: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)

    return collected
def format_table(assemblies):
    """
    Print the assemblies as a fixed-width table on stdout

    Nothing is printed when the list is empty.

    Args:
        assemblies: List of assembly dictionaries (keys 'accession',
            'organism', 'assembly_level' and 'assembly_name' are read)
    """
    if not assemblies:
        return

    def clip(text, width):
        # Text longer than the column is cut to width - 2 characters plus '..'
        if len(text) > width:
            return text[:width - 2] + '..'
        return text

    print(f"Found {len(assemblies)} assemblies:\n")

    # Header row followed by a rule spanning the table
    header = f"{'#':<4} {'Accession':<20} {'Organism':<40} {'Level':<15} {'Assembly Name':<30}"
    print(header)
    print("-" * 110)

    # One numbered row per assembly, counting from 1
    row_number = 0
    for asm in assemblies:
        row_number += 1
        organism = clip(asm['organism'], 40)
        assembly_name = clip(asm['assembly_name'], 30)
        print(f"{row_number:<4} {asm['accession']:<20} {organism:<40} {asm['assembly_level']:<15} {assembly_name:<30}")

    print("")
def save_accessions(assemblies, output_file):
    """
    Write one accession per line to a file and print a follow-up hint

    Args:
        assemblies: List of assembly dictionaries (only 'accession' is read)
        output_file: Output file path (overwritten if it exists)
    """
    with open(output_file, 'w') as handle:
        handle.writelines(f"{entry['accession']}\n" for entry in assemblies)

    # Tell the user where the list went and how to feed it to the downloader
    print(f"Accessions saved to: {output_file}")
    print("You can download these assemblies using:")
    print(f" python download_ncbi_genomes.py --assemblies $(cat {output_file})")
def main():
    """
    Command-line entry point: parse options, query NCBI, print the table
    and, when --save is given and results exist, write the accession list
    """
    parser = argparse.ArgumentParser(
        description="Query NCBI for available genome assemblies by taxon name",
        epilog="Example: python query_ncbi_assemblies.py --taxon 'Coleoptera' --max-results 50",
    )

    # (flags, argparse settings) pairs, registered in help-output order
    option_specs = [
        (
            ("--taxon",),
            {
                "required": True,
                "help": "Taxon name (e.g., 'Coleoptera', 'Drosophila melanogaster')",
            },
        ),
        (
            ("--max-results",),
            {
                "type": int,
                "default": 20,
                "help": "Maximum number of results to return (default: 20)",
            },
        ),
        (
            ("--refseq-only",),
            {
                "action": "store_true",
                "help": "Only return RefSeq assemblies (GCF_* accessions)",
            },
        ),
        (
            ("--save",),
            {
                "metavar": "FILE",
                "help": "Save accessions to a file for later download",
            },
        ),
    ]
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    opts = parser.parse_args()

    # Query NCBI
    found = query_assemblies_by_taxon(
        taxon=opts.taxon,
        max_results=opts.max_results,
        refseq_only=opts.refseq_only,
    )

    # Display results
    format_table(found)

    # Save only when a target file was requested and there is something to save
    if found and opts.save:
        save_accessions(found, opts.save)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Rename genome files with clean, meaningful sample names for phylogenomics
This script helps create a mapping between genome files (often with cryptic
accession numbers) and clean species/sample names that will appear in the
final phylogenetic tree.
Usage:
# Interactive mode - prompts for names
python rename_genomes.py --interactive genome1.fasta genome2.fasta
# From mapping file (TSV: old_name<TAB>new_name)
python rename_genomes.py --mapping samples.tsv
# Create template mapping file
python rename_genomes.py --create-template *.fasta > samples.tsv
Author: Bruno de Medeiros (Field Museum)
Based on tutorials by Paul Frandsen (BYU)
"""
import argparse
import os
import sys
import shutil
from pathlib import Path
def sanitize_name(name):
    """
    Reduce a name to characters that are safe in phylogenomics pipelines

    Spaces become underscores; afterwards every character that is neither
    alphanumeric (per str.isalnum) nor an underscore or hyphen is dropped.
    """
    underscored = name.replace(' ', '_')
    kept = [ch for ch in underscored if ch in '_-' or ch.isalnum()]
    return ''.join(kept)
def create_template(genome_files, output=None):
    """
    Create a template mapping file

    Writes explanatory '#' comment lines followed by one
    'original_filename<TAB>suggested_name' line per genome file, where the
    suggested name is the file name without its last extension.

    Args:
        genome_files: Iterable of genome file paths
        output: Writable text stream; defaults to the current sys.stdout
    """
    # Resolve stdout at call time: a sys.stdout default in the signature is
    # bound once at definition and would ignore any later redirection.
    if output is None:
        output = sys.stdout

    output.write("# Sample mapping file\n")
    output.write("# Format: original_filename<TAB>new_sample_name\n")
    output.write("# Edit the second column with meaningful species/sample names\n")
    output.write("# Recommended format: [ACCESSION]_[NAME] (e.g., GCA000123456_Penstemon_eatonii)\n")
    output.write("# This keeps accession for traceability while having readable names in trees\n")
    output.write("# Names should contain only letters, numbers, underscores, and hyphens\n")
    output.write("#\n")

    for gfile in genome_files:
        basename = Path(gfile).stem  # Remove extension
        output.write(f"{gfile}\t{basename}\n")
def read_mapping(mapping_file):
    """
    Load an old-name -> new-name mapping from a two-column TSV file

    Blank lines and lines starting with '#' are ignored. Any other line
    that does not split into exactly two tab-separated fields is reported
    on stderr and skipped. New names are passed through sanitize_name().

    Args:
        mapping_file: Path to the TSV file

    Returns:
        Dict of original file name -> sanitized new name
    """
    mapping = {}
    with open(mapping_file, 'r') as handle:
        for raw in handle:
            entry = raw.strip()

            # Nothing to parse on blank or comment lines
            if entry == '' or entry[0] == '#':
                continue

            fields = entry.split('\t')
            if len(fields) == 2:
                mapping[fields[0]] = sanitize_name(fields[1])
            else:
                print(f"Warning: Skipping invalid line: {entry}", file=sys.stderr)

    return mapping
def interactive_rename(genome_files):
    """
    Interactively ask for new names

    For each file the user is prompted on the terminal; an empty reply keeps
    the current name (the file name without its last extension). Every name
    is passed through sanitize_name() before it is stored.

    Args:
        genome_files: Iterable of genome file paths

    Returns:
        Dict of genome file path -> sanitized new sample name
    """
    mapping = {}

    print("Enter new sample names for each genome file.")
    print("Press Enter to keep the current name.")
    print("Names will be sanitized (spaces→underscores, special chars removed)\n")

    for gfile in genome_files:
        source = Path(gfile)
        current_name = source.stem
        new_name = input(f"{gfile} → [{current_name}]: ").strip()

        if not new_name:
            new_name = current_name

        new_name = sanitize_name(new_name)
        mapping[gfile] = new_name

        # Preview the extension that rename_files() will actually use: the
        # file's own suffix, with '.fasta' only as the fallback. Always
        # printing '.fasta' was wrong for inputs such as .fna or .fa.
        ext = source.suffix or '.fasta'
        print(f" Will rename to: {new_name}{ext}\n")

    return mapping
def rename_files(mapping, dry_run=False, backup=True):
    """
    Rename genome files according to mapping

    Each file stays in its own directory and keeps its extension ('.fasta'
    is used when the original has none); only the base name changes.

    Args:
        mapping: Dict of existing file path -> new sample name (no extension)
        dry_run: If True, only report what would be renamed
        backup: If True, copy each file to '<old_file>.backup' before renaming

    Returns:
        Tuple (renamed, errors). renamed is a list of (old_file, new_file)
        pairs (the planned pairs in a dry run); errors is a list of messages
        for mapping entries that were skipped.
    """
    renamed = []
    errors = []
    # Targets already handed out in this call, so two entries mapping to the
    # same name are reported in a dry run too (nothing exists on disk yet).
    planned_targets = set()

    for old_file, new_name in mapping.items():
        if not os.path.exists(old_file):
            errors.append(f"File not found: {old_file}")
            continue

        # An empty name would create a hidden file such as ".fasta"
        if not new_name:
            errors.append(f"Empty sample name for: {old_file}")
            continue

        # Get extension from original file
        ext = Path(old_file).suffix or '.fasta'

        # Build the target next to the source. Using the bare new name would
        # move a file given as "dir/x.fasta" into the current directory.
        new_file = os.path.join(os.path.dirname(old_file), f"{new_name}{ext}")

        # Skip if names are the same
        if old_file == new_file:
            print(f"Skip (no change): {old_file}")
            continue

        # Never overwrite an existing file
        if os.path.exists(new_file):
            errors.append(f"Target exists: {new_file}")
            continue

        if new_file in planned_targets:
            errors.append(f"Duplicate target: {new_file}")
            continue
        planned_targets.add(new_file)

        if dry_run:
            print(f"[DRY RUN] Would rename: {old_file} → {new_file}")
        else:
            # Backup if requested
            if backup:
                backup_file = f"{old_file}.backup"
                shutil.copy2(old_file, backup_file)
                print(f"Backup created: {backup_file}")

            # Rename
            shutil.move(old_file, new_file)
            print(f"Renamed: {old_file} → {new_file}")

        renamed.append((old_file, new_file))

    return renamed, errors
def main():
    """
    Command-line entry point for the three modes (template, mapping, interactive)

    Template mode writes the template and returns. The other two modes build
    a mapping, rename (or preview) the files, print a summary and exit with
    status 1 when any error was collected.
    """

    def build_parser():
        # Assemble the argument parser; exactly one mode option is required
        cli = argparse.ArgumentParser(
            description="Rename genome files with meaningful sample names for phylogenomics",
            formatter_class=argparse.RawDescriptionHelpFormatter,
            epilog="""
Examples:
# Create template mapping file
python rename_genomes.py --create-template *.fasta > samples.tsv
# Edit samples.tsv, then apply mapping
python rename_genomes.py --mapping samples.tsv
# Interactive renaming
python rename_genomes.py --interactive genome1.fasta genome2.fasta
# Dry run (preview changes)
python rename_genomes.py --mapping samples.tsv --dry-run
"""
        )

        modes = cli.add_mutually_exclusive_group(required=True)
        modes.add_argument(
            '--create-template',
            nargs='+',
            metavar='GENOME',
            help='Create a template mapping file from genome files'
        )
        modes.add_argument(
            '--mapping',
            metavar='FILE',
            help='TSV file with mapping (old_name<TAB>new_name)'
        )
        modes.add_argument(
            '--interactive',
            nargs='+',
            metavar='GENOME',
            help='Interactively rename genome files'
        )

        cli.add_argument(
            '--dry-run',
            action='store_true',
            help='Show what would be renamed without actually renaming'
        )
        cli.add_argument(
            '--no-backup',
            action='store_true',
            help='Do not create backup files'
        )
        return cli

    parser = build_parser()
    args = parser.parse_args()

    # Template mode only writes the template and stops
    if args.create_template:
        create_template(args.create_template)
        return

    # Build the mapping from the selected source
    if args.interactive:
        mapping = interactive_rename(args.interactive)
    elif args.mapping:
        mapping = read_mapping(args.mapping)
    else:
        parser.error("No mode specified")

    if not mapping:
        print("No files to rename", file=sys.stderr)
        return

    # Perform renaming
    keep_backup = not args.no_backup
    renamed, errors = rename_files(mapping, dry_run=args.dry_run, backup=keep_backup)

    # Summary
    print("\n" + "="*60)
    if not args.dry_run:
        print(f"Successfully renamed {len(renamed)} file(s)")
    else:
        print("DRY RUN - No files were actually renamed")

    if errors:
        print(f"\nErrors ({len(errors)}):")
        for message in errors:
            print(f" - {message}")
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,247 @@
#!/bin/bash
# run_alicut.sh
# Wrapper script for running ALICUT to remove Aliscore-identified RSS positions
# Removes randomly similar sequence sections from alignments
#
# Usage:
# bash run_alicut.sh [aliscore_dir] [options]
#
# Options:
# -r Remain stem positions (for RNA secondary structures)
# -c Remove codon (translate AA positions to nucleotide triplets)
# -3 Remove only 3rd codon positions
# -s Silent mode (non-interactive, use defaults)
#
# Requirements:
# - ALICUT_V2.31.pl in PATH or same directory
# - Perl with File::Copy, Tie::File, Term::Cap modules
# - Aliscore output directory with *_List_*.txt and original .fas file
set -euo pipefail

# Directory containing this wrapper, as an absolute path
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Locate ALICUT and store it as an ABSOLUTE path. The script is later run as
# "perl <path>" after a cd into the Aliscore directory: perl does not search
# PATH for a bare script name, and a "./" path stops resolving once the
# working directory has changed.
if ALICUT_SCRIPT="$(command -v ALICUT_V2.31.pl 2>/dev/null)"; then
    # command -v can report a relative path when PATH has relative entries
    case "${ALICUT_SCRIPT}" in
        /*) ;;
        *) ALICUT_SCRIPT="$(pwd)/${ALICUT_SCRIPT}" ;;
    esac
elif [ -f "${SCRIPT_DIR}/ALICUT_V2.31.pl" ]; then
    ALICUT_SCRIPT="${SCRIPT_DIR}/ALICUT_V2.31.pl"
elif [ -f "./ALICUT_V2.31.pl" ]; then
    ALICUT_SCRIPT="$(pwd)/ALICUT_V2.31.pl"
else
    echo "ERROR: ALICUT_V2.31.pl not found in PATH, script directory, or current directory"
    echo "Please download from: https://www.zfmk.de/en/research/research-centres-and-groups/alicut"
    exit 1
fi
# Function to display usage
# Prints the help text below to stdout and then terminates the script with
# exit status 0 (callers never regain control).
usage() {
# Unquoted heredoc: $0 expands to the name this script was invoked with,
# while the escaped \${dir} is printed literally as ${dir}.
cat <<EOF
Usage: $0 [aliscore_dir] [options]
Run ALICUT to remove Aliscore-identified randomly similar sequence sections.
Arguments:
aliscore_dir Directory containing Aliscore output files
Options:
-r Remain stem positions in RNA secondary structure alignments
-c Remove entire codon (translates AA RSS positions to nt triplets)
-3 Remove only 3rd codon position of identified RSS
-s Silent/scripted mode (non-interactive, use defaults)
-h Display this help message
Input Requirements:
The aliscore_dir must contain:
- Original FASTA alignment file (*.fas)
- Aliscore List file (*_List_random.txt or *_List_*.txt)
Examples:
# Basic usage (interactive mode)
bash run_alicut.sh aliscore_alignment1
# Silent mode with defaults
bash run_alicut.sh aliscore_alignment1 -s
# Remain RNA stem positions
bash run_alicut.sh aliscore_16S -r -s
# Remove entire codons (for back-translation)
bash run_alicut.sh aliscore_protein1 -c -s
# Process all Aliscore output directories
for dir in aliscore_*/; do
bash run_alicut.sh "\${dir}" -s
done
Output Files (in aliscore_dir):
- ALICUT_[alignment].fas : Trimmed alignment
- ALICUT_info.xls : Statistics (taxa, positions removed, etc.)
- ALICUT_Struc_info_*.txt : Structure information (if RNA detected)
Citation:
Kück P, Meusemann K, Dambach J, Thormann B, von Reumont BM, Wägele JW,
Misof B (2010) Parametric and non-parametric masking of randomness in
sequence alignments can be improved and leads to better resolved trees.
Front Zool 7:10. doi: 10.1186/1742-9994-7-10
EOF
# Always a success status, including when reached from an error path
exit 0
}
# Parse command line arguments
ALISCORE_DIR=""
ALICUT_OPTS=""
SILENT_MODE=false

if [ $# -eq 0 ]; then
    usage
fi

# Accept -h/--help in place of the directory argument
case "$1" in
    -h|--help)
        usage
        ;;
esac

ALISCORE_DIR="$1"
shift

# Validate directory exists
if [ ! -d "${ALISCORE_DIR}" ]; then
    echo "ERROR: Aliscore directory not found: ${ALISCORE_DIR}"
    exit 1
fi

# Parse ALICUT options (flags are forwarded to ALICUT unchanged)
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help)
            usage
            ;;
        -r|-c|-3)
            ALICUT_OPTS="${ALICUT_OPTS} $1"
            shift
            ;;
        -s|--silent)
            SILENT_MODE=true
            ALICUT_OPTS="${ALICUT_OPTS} -s"
            shift
            ;;
        *)
            echo "ERROR: Unknown option: $1"
            # usage() ends with "exit 0"; confine it to a subshell so that an
            # invalid invocation still terminates with a failure status.
            ( usage )
            exit 1
            ;;
    esac
done

# Print the length (whitespace excluded) of the first record of a FASTA file,
# joining wrapped sequence lines. Prints 0 when the file has no sequence.
first_sequence_length() {
    awk '
        /^>/ { if (seen) exit; seen = 1; next }
        seen { gsub(/[[:space:]]/, ""); len += length($0) }
        END  { print len + 0 }
    ' "$1"
}

# Change to Aliscore output directory
cd "${ALISCORE_DIR}"
echo "Processing Aliscore output in: ${ALISCORE_DIR}"

# Find List file. A glob loop is used instead of "ls | head": under
# "set -euo pipefail" a failing ls inside $(...) would abort the script
# before the explanatory error below could be printed.
LIST_FILE=""
for candidate in *_List_*.txt; do
    if [ -f "${candidate}" ]; then
        LIST_FILE="${candidate}"
        break
    fi
done
if [ -z "${LIST_FILE}" ]; then
    echo "ERROR: No Aliscore List file found (*_List_*.txt)"
    echo "Make sure Aliscore completed successfully"
    exit 1
fi
echo "Found List file: ${LIST_FILE}"

# Find original FASTA file, ignoring ALICUT_* output left by an earlier run
FASTA_FILE=""
for candidate in *.fas *.fasta; do
    case "${candidate}" in
        ALICUT_*) continue ;;
    esac
    if [ -f "${candidate}" ]; then
        FASTA_FILE="${candidate}"
        break
    fi
done
if [ -z "${FASTA_FILE}" ]; then
    echo "ERROR: No FASTA alignment file found (*.fas or *.fasta)"
    echo "ALICUT requires the original alignment file in the same directory as List file"
    exit 1
fi
echo "Found FASTA file: ${FASTA_FILE}"

# Check if List file contains RSS positions
RSS_COUNT=$(wc -w < "${LIST_FILE}" || echo "0")
# Some wc implementations pad the count with blanks
RSS_COUNT="${RSS_COUNT//[[:space:]]/}"
if [ "${RSS_COUNT}" -eq 0 ]; then
    echo "WARNING: List file is empty (no RSS positions identified)"
    echo "Aliscore found no randomly similar sequences to remove"
    echo "Skipping ALICUT - alignment is already clean"
    # Create a symbolic link to indicate no trimming was needed
    ln -sf "${FASTA_FILE}" "ALICUT_${FASTA_FILE}"
    echo "Created symbolic link: ALICUT_${FASTA_FILE} -> ${FASTA_FILE}"
    exit 0
fi
echo "Found ${RSS_COUNT} RSS positions to remove"

# Run ALICUT
echo ""
echo "Running ALICUT..."
echo "Options: ${ALICUT_OPTS}"
if [ "${SILENT_MODE}" = true ]; then
    echo "Command: perl ${ALICUT_SCRIPT} ${ALICUT_OPTS}"
else
    echo "Running ALICUT in interactive mode..."
    echo "Press 's' and Enter to start with current options"
    echo ""
fi

# The command is tested directly in the "if": with "set -e" a separate
# "$?" check afterwards would never be reached on failure.
# ALICUT_OPTS is deliberately unquoted; it holds space-separated flags.
if ! perl "${ALICUT_SCRIPT}" ${ALICUT_OPTS}; then
    echo "ERROR: ALICUT failed"
    exit 1
fi

echo ""
echo "ALICUT completed successfully"

# Find output file (usually only one of the two extensions exists)
OUTPUT_FILE=""
for candidate in ALICUT_*.fas ALICUT_*.fasta; do
    if [ -f "${candidate}" ]; then
        OUTPUT_FILE="${candidate}"
        break
    fi
done

if [ -n "${OUTPUT_FILE}" ]; then
    echo ""
    echo "Output files:"
    ls -lh ALICUT_* 2>/dev/null || true

    # Calculate and report trimming statistics (handle multi-line FASTA format)
    ORIGINAL_LENGTH=$(first_sequence_length "${FASTA_FILE}")
    TRIMMED_LENGTH=$(first_sequence_length "${OUTPUT_FILE}")
    REMOVED_LENGTH=$((ORIGINAL_LENGTH - TRIMMED_LENGTH))
    # Guard against an empty original alignment (division by zero)
    PERCENT_REMOVED=$(awk -v removed="${REMOVED_LENGTH}" -v total="${ORIGINAL_LENGTH}" \
        'BEGIN { printf "%.1f", (total > 0) ? (removed / total) * 100 : 0 }')
    echo ""
    echo "Trimming statistics:"
    echo " Original length: ${ORIGINAL_LENGTH} bp"
    echo " Trimmed length: ${TRIMMED_LENGTH} bp"
    echo " Removed: ${REMOVED_LENGTH} bp (${PERCENT_REMOVED}%)"

    # Check for info file
    if [ -f "ALICUT_info.xls" ]; then
        echo ""
        echo "Detailed statistics in: ALICUT_info.xls"
    fi
else
    echo "WARNING: Expected output file ALICUT_*.fas not found"
fi

echo ""
echo "Done: ${ALISCORE_DIR}"

View File

@@ -0,0 +1,248 @@
#!/bin/bash
# run_aliscore.sh
# Wrapper script for running Aliscore on aligned sequences
# Identifies randomly similar sequence sections (RSS) in multiple sequence alignments
#
# Usage:
# bash run_aliscore.sh [alignment.fas] [options]
#
# Options:
# -w INT Window size (default: 4)
# -r INT Number of random pairs to compare (default: 4*N taxa)
# -N Treat gaps as ambiguous characters (recommended for amino acids)
# -t TREE Tree file in Newick format for guided comparisons
# -l LEVEL Node level for tree-based comparisons
# -o TAXA Comma-separated list of outgroup taxa
#
# Array job usage:
# Set SLURM_ARRAY_TASK_ID or PBS_ARRAYID environment variable
# Create locus_list.txt with one alignment file per line
#
# Requirements:
# - Aliscore.02.2.pl in PATH or same directory
# - Perl with Tie::File and Fcntl modules
set -euo pipefail

# Directory containing this wrapper, as an absolute path
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Locate Aliscore and store it as an ABSOLUTE path. The script is later run
# as "perl <path>" after a cd into the output directory: perl does not search
# PATH for a bare script name, and a "./" path stops resolving once the
# working directory has changed.
if ALISCORE_SCRIPT="$(command -v Aliscore.02.2.pl 2>/dev/null)"; then
    # command -v can report a relative path when PATH has relative entries
    case "${ALISCORE_SCRIPT}" in
        /*) ;;
        *) ALISCORE_SCRIPT="$(pwd)/${ALISCORE_SCRIPT}" ;;
    esac
elif [ -f "${SCRIPT_DIR}/Aliscore.02.2.pl" ]; then
    ALISCORE_SCRIPT="${SCRIPT_DIR}/Aliscore.02.2.pl"
elif [ -f "./Aliscore.02.2.pl" ]; then
    ALISCORE_SCRIPT="$(pwd)/Aliscore.02.2.pl"
else
    echo "ERROR: Aliscore.02.2.pl not found in PATH, script directory, or current directory"
    echo "Please download from: https://www.zfmk.de/en/research/research-centres-and-groups/aliscore"
    exit 1
fi
# Function to display usage
# Prints the help text below to stdout and then terminates the script with
# exit status 0 (callers never regain control).
usage() {
# Unquoted heredoc: $0 expands to the name this script was invoked with,
# while the escaped \$(...) is printed literally instead of being executed.
cat <<EOF
Usage: $0 [alignment.fas] [options]
Run Aliscore to identify randomly similar sequence sections in alignments.
Options:
-d DIR Base output directory for all Aliscore results (default: aliscore_output)
-w INT Window size for sliding window analysis (default: 4)
-r INT Number of random sequence pairs to compare (default: 4*N taxa)
-N Treat gaps as ambiguous characters (recommended for amino acids)
-t FILE Tree file in Newick format for phylogeny-guided comparisons
-l LEVEL Node level limit for tree-based comparisons (default: all)
-o TAXA Comma-separated list of outgroup taxa for focused comparisons
-h Display this help message
Array Job Mode:
If SLURM_ARRAY_TASK_ID or PBS_ARRAYID is set, reads alignment from locus_list.txt
Create locus_list.txt with: ls *.fas > locus_list.txt
Examples:
# Basic run with defaults (outputs to aliscore_output/)
bash run_aliscore.sh alignment.fas
# Amino acid sequences with gaps as ambiguous
bash run_aliscore.sh protein_alignment.fas -N
# Custom output directory
bash run_aliscore.sh alignment.fas -d my_aliscore_results
# Custom window size and random pairs
bash run_aliscore.sh alignment.fas -w 6 -r 100
# Tree-guided analysis
bash run_aliscore.sh alignment.fas -t species.tre
# Array job on SLURM
ls aligned_aa/*.fas > locus_list.txt
sbatch --array=1-\$(wc -l < locus_list.txt) run_aliscore_array.job
Output Files (in aliscore_output/aliscore_[alignment]/):
- [alignment]_List_random.txt : Positions identified as RSS (for ALICUT)
- [alignment]_Profile_random.txt: Quality profile for each position
- [alignment].svg : Visual plot of scoring profiles
Citation:
Misof B, Misof K (2009) A Monte Carlo approach successfully identifies
randomness in multiple sequence alignments: a more objective means of data
exclusion. Syst Biol 58(1):21-34. doi: 10.1093/sysbio/syp006
EOF
# Always a success status, including when reached from an error path
exit 0
}
# Parse command line arguments
ALIGNMENT=""
ALISCORE_OPTS=()
BASE_OUTPUT_DIR="aliscore_output"
TREE_FILE=""

# Check for array job mode
ARRAY_MODE=false
ARRAY_ID=""
if [ -n "${SLURM_ARRAY_TASK_ID:-}" ]; then
    ARRAY_MODE=true
    ARRAY_ID="${SLURM_ARRAY_TASK_ID}"
elif [ -n "${PBS_ARRAYID:-}" ]; then
    ARRAY_MODE=true
    ARRAY_ID="${PBS_ARRAYID}"
fi

# With no arguments there is nothing to do, unless a scheduler supplies the
# alignment through the array index.
if [ $# -eq 0 ] && [ "${ARRAY_MODE}" != true ]; then
    usage
fi

# Accept -h/--help in place of the alignment argument
if [ $# -gt 0 ]; then
    case "$1" in
        -h|--help)
            usage
            ;;
    esac
fi

# If in array mode, get alignment from locus list
if [ "${ARRAY_MODE}" = true ]; then
    if [ ! -f "locus_list.txt" ]; then
        echo "ERROR: Array job mode requires locus_list.txt"
        echo "Create with: ls *.fas > locus_list.txt"
        exit 1
    fi
    ALIGNMENT=$(sed -n "${ARRAY_ID}p" locus_list.txt)
    if [ -z "${ALIGNMENT}" ]; then
        echo "ERROR: Could not read alignment for array index ${ARRAY_ID}"
        exit 1
    fi
    echo "Array job ${ARRAY_ID}: Processing ${ALIGNMENT}"
    # Remaining arguments are Aliscore options and must be kept (clearing
    # them would silently drop e.g. -N). A leading non-option word is an
    # alignment placeholder superseded by locus_list.txt, so it is dropped.
    if [ $# -gt 0 ] && [ "${1#-}" = "$1" ]; then
        shift
    fi
else
    # First argument is alignment file
    ALIGNMENT="$1"
    shift
fi

# Validate alignment file exists
if [ ! -f "${ALIGNMENT}" ]; then
    echo "ERROR: Alignment file not found: ${ALIGNMENT}"
    exit 1
fi

# Parse Aliscore options. "${2:?...}" reports a missing option value with a
# readable message instead of an unbound-variable error from "set -u".
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help)
            usage
            ;;
        -d|--output-dir)
            BASE_OUTPUT_DIR="${2:?ERROR: $1 requires a directory argument}"
            shift 2
            ;;
        -w|-r|-l|-o)
            ALISCORE_OPTS+=("$1" "${2:?ERROR: $1 requires a value}")
            shift 2
            ;;
        -N)
            ALISCORE_OPTS+=("-N")
            shift
            ;;
        -t)
            TREE_FILE="${2:?ERROR: -t requires a tree file}"
            if [ ! -f "${TREE_FILE}" ]; then
                echo "ERROR: Tree file not found: ${TREE_FILE}"
                exit 1
            fi
            # Aliscore runs inside the output directory, where a relative
            # tree path would no longer resolve. The tree is copied next to
            # the alignment below and referenced by its file name.
            ALISCORE_OPTS+=("-t" "$(basename "${TREE_FILE}")")
            shift 2
            ;;
        *)
            echo "ERROR: Unknown option: $1"
            # usage() ends with "exit 0"; confine it to a subshell so that an
            # invalid invocation still terminates with a failure status.
            ( usage )
            exit 1
            ;;
    esac
done

# Get alignment file name and its name without extension
ALIGNMENT_FILE=$(basename "${ALIGNMENT}")
ALIGNMENT_NAME=$(basename "${ALIGNMENT_FILE}" .fas)
ALIGNMENT_NAME=$(basename "${ALIGNMENT_NAME}" .fasta)

# Create base output directory and specific directory for this alignment
mkdir -p "${BASE_OUTPUT_DIR}"
OUTPUT_DIR="${BASE_OUTPUT_DIR}/aliscore_${ALIGNMENT_NAME}"
mkdir -p "${OUTPUT_DIR}"

# Copy alignment (and tree, if given) to output directory
cp "${ALIGNMENT}" "${OUTPUT_DIR}/"
if [ -n "${TREE_FILE}" ]; then
    cp "${TREE_FILE}" "${OUTPUT_DIR}/"
fi

# Change to output directory
cd "${OUTPUT_DIR}"

# Run Aliscore
echo "Running Aliscore on ${ALIGNMENT}..."
echo "Options: ${ALISCORE_OPTS[*]:-}"
echo "Aliscore script: ${ALISCORE_SCRIPT}"
echo "Command: perl -I${SCRIPT_DIR} ${ALISCORE_SCRIPT} -i ${ALIGNMENT_FILE} ${ALISCORE_OPTS[*]:-}"

# The command is tested directly in the "if": with "set -e" a separate "$?"
# check afterwards would never be reached on failure. The ${arr[@]+...} form
# keeps an empty option array from tripping "set -u" on older bash versions.
if ! perl -I"${SCRIPT_DIR}" "${ALISCORE_SCRIPT}" -i "${ALIGNMENT_FILE}" ${ALISCORE_OPTS[@]+"${ALISCORE_OPTS[@]}"}; then
    echo "ERROR: Aliscore failed for ${ALIGNMENT}"
    exit 1
fi

echo "Aliscore completed successfully for ${ALIGNMENT}"

# List output files
echo ""
echo "Output files in ${OUTPUT_DIR}:"
ls -lh *List*.txt *Profile*.txt *.svg 2>/dev/null || echo " (some expected files not generated)"

# Report RSS positions if found
RSS_LIST="${ALIGNMENT_FILE}_List_random.txt"
if [ -f "${RSS_LIST}" ]; then
    RSS_COUNT=$(wc -w < "${RSS_LIST}")
    # Some wc implementations pad the count with blanks
    RSS_COUNT="${RSS_COUNT//[[:space:]]/}"
    echo ""
    echo "Identified ${RSS_COUNT} randomly similar sequence positions"
    echo "See: ${OUTPUT_DIR}/${RSS_LIST}"
fi

echo "Done: ${ALIGNMENT} -> ${OUTPUT_DIR}"

View File

@@ -0,0 +1,270 @@
#!/bin/bash
# run_aliscore_alicut_batch.sh
# Batch processing script for Aliscore + ALICUT alignment trimming
# Processes all alignments in a directory through both tools sequentially
#
# Usage:
# bash run_aliscore_alicut_batch.sh [alignment_dir] [options]
#
# This script:
# 1. Runs Aliscore on all alignments to identify RSS
# 2. Runs ALICUT on each Aliscore output to remove RSS
# 3. Collects trimmed alignments in output directory
#
# Requirements:
# - run_aliscore.sh and run_alicut.sh in same directory or PATH
# - Aliscore.02.2.pl and ALICUT_V2.31.pl available
# Abort on command failure (-e), on use of an unset variable (-u) and when
# any stage of a pipeline fails (pipefail).
set -euo pipefail
# Script directory
# Absolute path of the directory holding this script; the wrapper scripts
# run_aliscore.sh and run_alicut.sh are looked up there further below.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Function to display usage
# Prints the help text below to stdout and then terminates the script with
# exit status 0 (callers never regain control).
usage() {
# Unquoted heredoc: $0 expands to the name this script was invoked with.
cat <<EOF
Usage: $0 [alignment_dir] [options]
Batch process multiple alignments through Aliscore and ALICUT.
Arguments:
alignment_dir Directory containing aligned FASTA files (*.fas)
Options:
-o DIR Output directory for trimmed alignments (default: aliscore_alicut_trimmed)
-d DIR Base directory for Aliscore outputs (default: aliscore_output)
-w INT Aliscore window size (default: 4)
-r INT Aliscore random pairs (default: 4*N)
-N Aliscore: treat gaps as ambiguous (recommended for AA)
--remain-stems ALICUT: remain RNA stem positions
--remove-codon ALICUT: remove entire codons (for back-translation)
--remove-3rd ALICUT: remove only 3rd codon positions
-h Display this help message
Examples:
# Basic usage for amino acid alignments
bash run_aliscore_alicut_batch.sh aligned_aa/ -N
# Custom window size
bash run_aliscore_alicut_batch.sh aligned_aa/ -w 6 -N
# With RNA structure preservation
bash run_aliscore_alicut_batch.sh aligned_rrna/ --remain-stems
Output:
- aliscore_output/aliscore_[locus]/ : Individual Aliscore results per locus
- aliscore_alicut_trimmed/ : Final trimmed alignments
- aliscore_alicut_trimmed/trimming_summary.txt : Statistics for all loci
EOF
# Always a success status, including when reached from an error path
exit 0
}
# Default parameters
ALIGNMENT_DIR=""
OUTPUT_DIR="aliscore_alicut_trimmed"
ALISCORE_BASE_DIR="aliscore_output"
ALISCORE_OPTS=""
ALICUT_OPTS="-s" # Silent mode by default

if [ $# -eq 0 ]; then
    usage
fi

# Accept -h/--help in place of the directory argument
case "$1" in
    -h|--help)
        usage
        ;;
esac

ALIGNMENT_DIR="$1"
shift

# Validate alignment directory
if [ ! -d "${ALIGNMENT_DIR}" ]; then
    echo "ERROR: Alignment directory not found: ${ALIGNMENT_DIR}"
    exit 1
fi

# Parse options. "${2:?...}" reports a missing option value with a readable
# message instead of an unbound-variable error from "set -u".
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help)
            usage
            ;;
        -o|--output)
            OUTPUT_DIR="${2:?ERROR: $1 requires a directory argument}"
            shift 2
            ;;
        -d|--aliscore-dir)
            ALISCORE_BASE_DIR="${2:?ERROR: $1 requires a directory argument}"
            shift 2
            ;;
        -w|-r)
            ALISCORE_OPTS="${ALISCORE_OPTS} $1 ${2:?ERROR: $1 requires a value}"
            shift 2
            ;;
        -N)
            ALISCORE_OPTS="${ALISCORE_OPTS} -N"
            shift
            ;;
        --remain-stems)
            ALICUT_OPTS="${ALICUT_OPTS} -r"
            shift
            ;;
        --remove-codon)
            ALICUT_OPTS="${ALICUT_OPTS} -c"
            shift
            ;;
        --remove-3rd)
            ALICUT_OPTS="${ALICUT_OPTS} -3"
            shift
            ;;
        *)
            echo "ERROR: Unknown option: $1"
            # usage() ends with "exit 0"; confine it to a subshell so that an
            # invalid invocation still terminates with a failure status.
            ( usage )
            exit 1
            ;;
    esac
done

# Check for wrapper scripts
RUN_ALISCORE="${SCRIPT_DIR}/run_aliscore.sh"
RUN_ALICUT="${SCRIPT_DIR}/run_alicut.sh"
if [ ! -f "${RUN_ALISCORE}" ]; then
    echo "ERROR: run_aliscore.sh not found: ${RUN_ALISCORE}"
    exit 1
fi
if [ ! -f "${RUN_ALICUT}" ]; then
    echo "ERROR: run_alicut.sh not found: ${RUN_ALICUT}"
    exit 1
fi

# Print the length (whitespace excluded) of the first record of a FASTA file,
# joining wrapped sequence lines. Prints 0 when the file has no sequence.
first_sequence_length() {
    awk '
        /^>/ { if (seen) exit; seen = 1; next }
        seen { gsub(/[[:space:]]/, ""); len += length($0) }
        END  { print len + 0 }
    ' "$1"
}

# Create output directory
mkdir -p "${OUTPUT_DIR}"

# Find all FASTA files. Globbing keeps each path as one array element;
# building the array from unquoted $(find ...) output would split paths that
# contain whitespace. An unmatched pattern stays literal and fails the -f test.
ALIGNMENTS=()
for candidate in "${ALIGNMENT_DIR%/}"/*.fas "${ALIGNMENT_DIR%/}"/*.fasta; do
    if [ -f "${candidate}" ]; then
        ALIGNMENTS+=("${candidate}")
    fi
done
if [ ${#ALIGNMENTS[@]} -eq 0 ]; then
    echo "ERROR: No FASTA files found in ${ALIGNMENT_DIR}"
    exit 1
fi
echo "Found ${#ALIGNMENTS[@]} alignments to process"
echo "Aliscore options: ${ALISCORE_OPTS}"
echo "ALICUT options: ${ALICUT_OPTS}"
echo ""

# Initialize summary file
SUMMARY_FILE="${OUTPUT_DIR}/trimming_summary.txt"
printf 'Locus\tOriginal_Length\tTrimmed_Length\tRemoved_Positions\tPercent_Removed\tRSS_Count\n' > "${SUMMARY_FILE}"

# Process each alignment
SUCCESS_COUNT=0
FAIL_COUNT=0
for ALIGNMENT in "${ALIGNMENTS[@]}"; do
    LOCUS=$(basename "${ALIGNMENT}" .fas)
    LOCUS=$(basename "${LOCUS}" .fasta)
    echo "=========================================="
    echo "Processing: ${LOCUS}"
    echo "=========================================="

    # Step 1: Run Aliscore
    # (option strings are deliberately unquoted: space-separated flags)
    echo ""
    echo "Step 1/2: Running Aliscore..."
    if bash "${RUN_ALISCORE}" "${ALIGNMENT}" -d "${ALISCORE_BASE_DIR}" ${ALISCORE_OPTS}; then
        echo "Aliscore completed for ${LOCUS}"
    else
        echo "ERROR: Aliscore failed for ${LOCUS}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # Step 2: Run ALICUT
    echo ""
    echo "Step 2/2: Running ALICUT..."
    ALISCORE_DIR="${ALISCORE_BASE_DIR}/aliscore_${LOCUS}"
    if [ ! -d "${ALISCORE_DIR}" ]; then
        echo "ERROR: Aliscore output directory not found: ${ALISCORE_DIR}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    if bash "${RUN_ALICUT}" "${ALISCORE_DIR}" ${ALICUT_OPTS}; then
        echo "ALICUT completed for ${LOCUS}"
    else
        echo "ERROR: ALICUT failed for ${LOCUS}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # Copy trimmed alignment to output directory
    TRIMMED_FILE=$(find "${ALISCORE_DIR}" \( -name "ALICUT_*.fas" -o -name "ALICUT_*.fasta" \) | head -n 1)
    if [ -n "${TRIMMED_FILE}" ] && [ -f "${TRIMMED_FILE}" ]; then
        cp "${TRIMMED_FILE}" "${OUTPUT_DIR}/${LOCUS}_trimmed.fas"
        echo "Trimmed alignment: ${OUTPUT_DIR}/${LOCUS}_trimmed.fas"

        # Calculate statistics (handle multi-line FASTA format)
        ORIGINAL_LENGTH=$(first_sequence_length "${ALIGNMENT}")
        TRIMMED_LENGTH=$(first_sequence_length "${TRIMMED_FILE}")
        REMOVED_LENGTH=$((ORIGINAL_LENGTH - TRIMMED_LENGTH))
        # Guard against an empty original alignment (division by zero)
        PERCENT_REMOVED=$(awk -v removed="${REMOVED_LENGTH}" -v total="${ORIGINAL_LENGTH}" \
            'BEGIN { printf "%.2f", (total > 0) ? (removed / total) * 100 : 0 }')

        # Count RSS positions
        LIST_FILE=$(find "${ALISCORE_DIR}" -name "*_List_*.txt" | head -n 1)
        if [ -n "${LIST_FILE}" ]; then
            RSS_COUNT=$(wc -w < "${LIST_FILE}" 2>/dev/null || echo "0")
            # Some wc implementations pad the count with blanks
            RSS_COUNT="${RSS_COUNT//[[:space:]]/}"
        else
            RSS_COUNT=0
        fi

        # Append to summary
        printf '%s\t%s\t%s\t%s\t%s\t%s\n' "${LOCUS}" "${ORIGINAL_LENGTH}" "${TRIMMED_LENGTH}" \
            "${REMOVED_LENGTH}" "${PERCENT_REMOVED}" "${RSS_COUNT}" >> "${SUMMARY_FILE}"
        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    else
        echo "WARNING: Trimmed file not found for ${LOCUS}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
    fi
    echo ""
done

# Final report
echo "=========================================="
echo "BATCH PROCESSING COMPLETE"
echo "=========================================="
echo ""
echo "Successfully processed: ${SUCCESS_COUNT}/${#ALIGNMENTS[@]} alignments"
echo "Failed: ${FAIL_COUNT}/${#ALIGNMENTS[@]} alignments"
echo ""
echo "Output directory: ${OUTPUT_DIR}"
echo "Trimmed alignments: ${OUTPUT_DIR}/*_trimmed.fas"
echo "Summary statistics: ${SUMMARY_FILE}"
echo ""

# Display summary statistics
if [ ${SUCCESS_COUNT} -gt 0 ]; then
    echo "Overall trimming statistics:"
    # The overall percentage is total removed / total original; the per-locus
    # average is the mean of the Percent_Removed column. These differ when
    # loci have different lengths, so they are computed separately.
    awk -F'\t' 'NR>1 {
        total_orig += $2;
        total_trim += $3;
        total_removed += $4;
        percent_sum += $5;
        count++
    }
    END {
        if (count > 0) {
            overall_removed = (total_orig > 0) ? (total_removed / total_orig) * 100 : 0;
            printf " Total positions before: %d\n", total_orig;
            printf " Total positions after: %d\n", total_trim;
            printf " Total removed: %d (%.2f%%)\n", total_removed, overall_removed;
            printf " Average per locus: %.2f%% removed\n", percent_sum / count;
        }
    }' "${SUMMARY_FILE}"
fi
echo ""
echo "Done!"