Initial commit
This commit is contained in:
63
skills/phylo_from_buscos/scripts/convert_fasconcat_to_partition.py
Executable file
63
skills/phylo_from_buscos/scripts/convert_fasconcat_to_partition.py
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert FASconCAT info file to IQ-TREE partition format
|
||||
|
||||
Usage:
|
||||
python convert_fasconcat_to_partition.py FcC_info.xls [output_file.txt]
|
||||
|
||||
Author: Bruno de Medeiros (Field Museum)
|
||||
Based on tutorials by Paul Frandsen (BYU)
|
||||
"""
|
||||
|
||||
import sys
|
||||
|
||||
|
||||
def convert_fcc_to_partition(fcc_file, output_file="partition_def.txt", data_type="AA"):
    """
    Convert FASconCAT info file to IQ-TREE partition format

    The first two lines of the input are treated as headers and skipped.
    Every following non-empty, tab-separated row with at least three columns
    (locus, start, end) becomes one output line of the form
    ``<data_type>, <locus> = <start>-<end>``. Rows whose start or end is not
    a plain integer are reported on stderr and skipped, because they would
    produce an invalid partition definition.

    Args:
        fcc_file: Path to FcC_info.xls file from FASconCAT
        output_file: Path to output partition definition file
        data_type: Label written at the start of every partition line
            (default "AA", the value that was previously hard-coded)

    Returns:
        Number of partitions written to ``output_file``.

    Exits with status 1 if ``fcc_file`` does not exist.
    """
    try:
        with open(fcc_file, 'r') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"Error: File '{fcc_file}' not found", file=sys.stderr)
        sys.exit(1)

    partition_lines = []

    # Skip first two header lines (FASconCAT INFO and column headers);
    # line numbers are 1-based so warnings point at the real input line.
    for line_number, line in enumerate(lines[2:], start=3):
        line = line.strip()
        if not line:
            continue

        parts = [part.strip() for part in line.split('\t')]
        if len(parts) < 3:
            continue

        locus, start, end = parts[:3]
        if not (start.isdigit() and end.isdigit()):
            print(
                f"Warning: skipping line {line_number} of '{fcc_file}': "
                f"non-numeric range '{start}-{end}'",
                file=sys.stderr,
            )
            continue

        partition_lines.append(f"{data_type}, {locus} = {start}-{end}\n")

    with open(output_file, 'w') as out:
        out.writelines(partition_lines)

    partitions_written = len(partition_lines)
    print(f"Partition file created: {output_file}")
    print(f"Number of partitions: {partitions_written}")
    if partitions_written == 0:
        print(f"Warning: no partitions found in '{fcc_file}'", file=sys.stderr)

    return partitions_written
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point.

    Expects the FASconCAT info file as the first argument and an optional
    output path as the second; prints usage and exits with status 1 when no
    argument is given.
    """
    cli_args = sys.argv[1:]

    if not cli_args:
        print("Usage: python convert_fasconcat_to_partition.py FcC_info.xls [output_file.txt]")
        print("\nConverts FASconCAT info file to IQ-TREE partition format")
        sys.exit(1)

    source_path = cli_args[0]
    if len(cli_args) > 1:
        target_path = cli_args[1]
    else:
        target_path = "partition_def.txt"

    convert_fcc_to_partition(source_path, target_path)


if __name__ == "__main__":
    main()
|
||||
133
skills/phylo_from_buscos/scripts/download_ncbi_genomes.py
Executable file
133
skills/phylo_from_buscos/scripts/download_ncbi_genomes.py
Executable file
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download genomes from NCBI using BioProject or Assembly accessions
|
||||
|
||||
Usage:
|
||||
python download_ncbi_genomes.py --bioprojects PRJNA12345 PRJEB67890
|
||||
python download_ncbi_genomes.py --assemblies GCA_123456789.1 GCF_987654321.1
|
||||
|
||||
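Requires: the NCBI `datasets` CLI for downloads (conda install -c conda-forge ncbi-datasets-cli); --bioprojects additionally needs ncbi-datasets-pylib (pip install ncbi-datasets-pylib)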
|
||||
|
||||
Author: Bruno de Medeiros (Field Museum)
|
||||
Based on tutorials by Paul Frandsen (BYU)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
|
||||
def download_using_cli(accessions, output_file="genomes.zip"):
    """
    Download genomes using NCBI datasets CLI

    Runs ``datasets download genome accession <accessions...> --filename
    <output_file>`` and echoes the tool's captured stdout.

    Args:
        accessions: BioProject or Assembly accessions. Any iterable of
            strings is accepted; a single string is treated as one accession.
        output_file: Name of output zip file

    Returns:
        True if the command succeeded. False if no accessions were given,
        the command exited with a non-zero status, or the ``datasets``
        executable could not be found.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(accessions, str):
        accessions = [accessions]
    accessions = list(accessions)

    # Without accessions the CLI call is guaranteed to fail; report it here
    # with a clear message instead of a confusing tool error.
    if not accessions:
        print("Error: no accessions provided; nothing to download", file=sys.stderr)
        return False

    cmd = ["datasets", "download", "genome", "accession", *accessions, "--filename", output_file]

    print(f"Running: {' '.join(cmd)}")
    print("")

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stdout)
        print(f"\nDownload complete: {output_file}")
        print("Extract with: unzip " + output_file)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error downloading genomes: {e}", file=sys.stderr)
        print(e.stderr, file=sys.stderr)
        return False
    except FileNotFoundError:
        # Raised by subprocess when the executable is not on PATH.
        print("Error: 'datasets' command not found", file=sys.stderr)
        print("Install with: conda install -c conda-forge ncbi-datasets-cli", file=sys.stderr)
        return False
|
||||
|
||||
|
||||
def get_bioproject_assemblies(bioprojects):
    """
    Look up the assemblies that belong to the given BioProjects.

    Uses the ncbi-datasets-pylib Python API; the package is imported lazily
    so the rest of the script works without it. Each assembly is printed as
    it is received.

    Args:
        bioprojects: List of BioProject accessions

    Returns:
        List of tuples (assembly_accession, organism_name)

    Exits with status 1 if ncbi-datasets-pylib is not installed.
    """
    try:
        from ncbi.datasets.metadata.genome import get_assembly_metadata_by_bioproject_accessions
    except ImportError:
        print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
        print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
        sys.exit(1)

    print(f"Fetching assembly information for {len(bioprojects)} BioProject(s)...")
    print("")

    found = []
    for record in get_assembly_metadata_by_bioproject_accessions(bioprojects):
        pair = (record.accession, record.organism.organism_name)
        found.append(pair)
        print(f"  {pair[1]}: {pair[0]}")

    print(f"\nFound {len(found)} assemblies")

    return found
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point.

    Exactly one of --bioprojects / --assemblies is required. In BioProject
    mode the assemblies are looked up first and then either listed
    (--list-only) or downloaded; in Assembly mode the given accessions are
    downloaded directly. The process exits with 0 on success and 1 on
    failure; invalid option combinations exit with argparse's status 2.
    """
    parser = argparse.ArgumentParser(
        description="Download genomes from NCBI using BioProject or Assembly accessions"
    )

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--bioprojects",
        nargs="+",
        help="BioProject accessions (e.g., PRJNA12345 PRJEB67890)"
    )
    group.add_argument(
        "--assemblies",
        nargs="+",
        help="Assembly accessions (e.g., GCA_123456789.1 GCF_987654321.1)"
    )

    parser.add_argument(
        "-o", "--output",
        default="genomes.zip",
        help="Output zip file name (default: genomes.zip)"
    )

    parser.add_argument(
        "--list-only",
        action="store_true",
        help="List assemblies without downloading (BioProject mode only)"
    )

    args = parser.parse_args()

    # --list-only promises "no download"; silently ignoring it in Assembly
    # mode would start a (potentially very large) download the user did not
    # ask for, so reject the combination up front.
    if args.list_only and not args.bioprojects:
        parser.error("--list-only can only be used together with --bioprojects")

    if args.bioprojects:
        assemblies = get_bioproject_assemblies(args.bioprojects)

        if args.list_only:
            print("\nAssembly accessions (use with --assemblies to download):")
            for acc, _name in assemblies:
                print(acc)
            return

        # Nothing to download: fail clearly instead of invoking the CLI
        # with an empty accession list.
        if not assemblies:
            print("Error: no assemblies found for the given BioProject(s)", file=sys.stderr)
            sys.exit(1)

        # Download assemblies
        assembly_accs = [acc for acc, _name in assemblies]
        success = download_using_cli(assembly_accs, args.output)
    else:
        success = download_using_cli(args.assemblies, args.output)

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
|
||||
88
skills/phylo_from_buscos/scripts/extract_orthologs.sh
Executable file
88
skills/phylo_from_buscos/scripts/extract_orthologs.sh
Executable file
@@ -0,0 +1,88 @@
|
||||
#!/bin/bash
# Extract and reorganize single-copy orthologs from compleasm output
#
# Usage: bash extract_orthologs.sh LINEAGE_NAME
#   Example: bash extract_orthologs.sh metazoa
#
# Reads:  01_busco_results/<genome>_compleasm/<LINEAGE>_odb*/gene_marker.fasta
# Writes: single_copy_orthologs/<genome>.fasta          (one per genome)
#         single_copy_orthologs/unaligned_aa/<id>.fas   (one per ortholog)
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)

if [ $# -lt 1 ]; then
    echo "Usage: bash extract_orthologs.sh LINEAGE_NAME"
    echo "  Example: bash extract_orthologs.sh metazoa"
    exit 1
fi

LINEAGE="$1"

echo "Extracting single-copy orthologs for lineage: ${LINEAGE}"

# Create directory for ortholog FASTA files
mkdir -p single_copy_orthologs

# Copy gene_marker.fasta files and rename by species
count=0
for dir in 01_busco_results/*_compleasm; do
    # Also skips the unexpanded pattern when the glob matches nothing.
    if [ ! -d "${dir}" ]; then
        continue
    fi

    genome=$(basename "${dir}" _compleasm)

    # Auto-detect the OrthoDB version (odb10, odb11, odb12, etc.);
    # the first match is used if several exist.
    odb_dirs=("${dir}/${LINEAGE}_odb"*)
    if [ -d "${odb_dirs[0]}" ]; then
        marker_file="${odb_dirs[0]}/gene_marker.fasta"
    else
        echo "  Warning: No OrthoDB directory found for ${genome}" >&2
        continue
    fi

    if [ -f "${marker_file}" ]; then
        cp "${marker_file}" "single_copy_orthologs/${genome}.fasta"
        echo "  Extracted: ${genome}"
        count=$((count + 1))
    else
        echo "  Warning: Marker file not found for ${genome}" >&2
    fi
done

if [ "${count}" -eq 0 ]; then
    echo "Error: No gene_marker.fasta files found. Check lineage name." >&2
    exit 1
fi

echo "Extracted ${count} genomes"
echo ""
echo "Now generating per-locus unaligned FASTA files..."

cd single_copy_orthologs || exit 1
mkdir -p unaligned_aa
cd unaligned_aa || exit 1

# The awk step below appends to the per-locus files, so leftovers from a
# previous run would end up with every sequence duplicated. Start clean.
rm -f -- ./*.fas

# AWK script to split by ortholog ID.
# Records are separated by ">", fields by newline: $1 is the FASTA header,
# $2..$NF are the sequence lines (the last field is empty because each record
# ends with a newline).
awk 'BEGIN { RS = ">"; FS = "\n" }
NF > 1 {
    # Ortholog ID = header text before the first underscore.
    split($1, id_parts, "_")
    locus_file = id_parts[1] ".fas"

    # Species name = input file name without directory and ".fasta" suffix.
    n = split(FILENAME, path_parts, "/")
    species = path_parts[n]
    sub(/\.fasta$/, "", species)

    # Join all sequence lines so wrapped sequences are not truncated.
    seq = ""
    for (i = 2; i <= NF; i++) {
        seq = seq $i
    }

    print ">" species "\n" seq >> locus_file
    # Close after each record to stay below the open-file limit.
    close(locus_file)
}' ../*.fasta

# No separate "fix headers" pass is needed: the headers written above already
# have the ".fasta" suffix removed, and an in-place sed over whole files could
# also alter sequence lines.

# Count generated files without parsing ls output.
shopt -s nullglob
locus_files=( *.fas )
shopt -u nullglob
num_loci=${#locus_files[@]}

echo "Unaligned ortholog files generated: ${num_loci} loci"
echo ""
echo "Output directory: single_copy_orthologs/unaligned_aa/"
|
||||
59
skills/phylo_from_buscos/scripts/generate_qc_report.sh
Executable file
59
skills/phylo_from_buscos/scripts/generate_qc_report.sh
Executable file
@@ -0,0 +1,59 @@
|
||||
#!/bin/bash
# Quality control report generator for compleasm results
#
# Usage: bash generate_qc_report.sh [output_file.csv]
#
# Reads 01_busco_results/<genome>_compleasm/summary.txt for every genome and
# writes one CSV row per genome to the output file (default: qc_report.csv).
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)

OUTPUT_FILE="${1:-qc_report.csv}"

# Print the count field of the first line in file $2 that starts with prefix
# $1. Lines look like "S:80.93%, 2283"; the count is the text after the comma
# with whitespace removed (2283). Prints nothing if no line matches.
extract_count() {
    awk -F',' -v prefix="$1" '
        index($0, prefix) == 1 { gsub(/[[:space:]]/, "", $2); print $2; exit }
    ' "$2"
}

# Succeed only if every argument is a non-empty string of digits.
all_integers() {
    local value
    for value in "$@"; do
        [[ "${value}" =~ ^[0-9]+$ ]] || return 1
    done
    return 0
}

echo "Genome,Complete_SCO,Fragmented,Duplicated,Missing,Completeness(%)" > "${OUTPUT_FILE}"

count=0
for dir in 01_busco_results/*_compleasm; do
    # Also skips the unexpanded pattern when the glob matches nothing.
    if [ ! -d "${dir}" ]; then
        continue
    fi

    genome=$(basename "${dir}" _compleasm)
    summary="${dir}/summary.txt"

    if [ -f "${summary}" ]; then
        # Parse completeness statistics from compleasm format
        # compleasm uses: S: (single-copy), D: (duplicated), F: (fragmented), M: (missing)
        complete=$(extract_count "S:" "${summary}")
        duplicated=$(extract_count "D:" "${summary}")
        fragmented=$(extract_count "F:" "${summary}")
        missing=$(extract_count "M:" "${summary}")

        # All four values must be plain integers before they are used in
        # arithmetic (this also covers values that are missing entirely).
        if ! all_integers "${complete}" "${duplicated}" "${fragmented}" "${missing}"; then
            echo "Warning: Could not parse statistics for ${genome}" >&2
            continue
        fi

        total=$((complete + duplicated + fragmented + missing))
        if [ "${total}" -eq 0 ]; then
            echo "Warning: No BUSCO counts reported for ${genome}" >&2
            continue
        fi

        # Completeness percentage = (single-copy + duplicated) / total * 100.
        # awk does the division in floating point; bc with scale=2 would
        # truncate the ratio before the multiplication and always yield "xx.00".
        completeness=$(awk -v c="${complete}" -v d="${duplicated}" -v t="${total}" \
            'BEGIN { printf "%.2f", (c + d) / t * 100 }')

        echo "${genome},${complete},${fragmented},${duplicated},${missing},${completeness}" >> "${OUTPUT_FILE}"
        count=$((count + 1))
    else
        echo "Warning: Summary file not found for ${genome}" >&2
    fi
done

if [ "${count}" -eq 0 ]; then
    echo "Error: No compleasm results could be summarized (expected 01_busco_results/*_compleasm/summary.txt)" >&2
    exit 1
fi

echo "QC report generated: ${OUTPUT_FILE}"
echo "Genomes analyzed: ${count}"
|
||||
742
skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/ALICUT_V2.31.pl
Executable file
742
skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/ALICUT_V2.31.pl
Executable file
@@ -0,0 +1,742 @@
|
||||
#!/usr/bin/perl
|
||||
use strict ;
|
||||
use File::Copy ;
|
||||
use Tie::File ;
|
||||
use Fcntl ;
|
||||
use Term::Cap ;
|
||||
use Term::ANSIColor qw(:constants);
|
||||
use Getopt::Std ;
|
||||
|
||||
# updated on 13th february , 2009 by patrick kueck
|
||||
# updated on 2nd april , 2009 by patrick kueck
|
||||
# updated on 15th june , 2009 by patrick kueck
|
||||
# updated on 26th july , 2009 by patrick kueck
|
||||
# updated on 7th september, 2011 by patrick kueck (alicut v2.3)
|
||||
# updated on 22.2.2017, by patrick kueck (alicut v2.31) -> correction of initial warning due to line 547, changed some terminal prints, argv handling commands
|
||||
|
||||
# Option switches. Element [0] of each list is the setting currently in
# effect; a switch is toggled by reversing its list.
my @answer_remain_stems = qw( no yes ) ;
my @answer_codons       = qw( no yes ) ;
my @answer_third_pos    = qw( no yes ) ;

# Apply command line switches first, then open the interactive menu.
&argv_handling ( \@answer_remain_stems, \@answer_codons, \@answer_third_pos ) ;
&menu          ( \@answer_remain_stems, \@answer_codons, \@answer_third_pos ) ;
|
||||
|
||||
|
||||
|
||||
sub argv_handling{

	# Apply the command line switches (-r, -c, -3, -h, -p, -s) and then open
	# the menu.
	#
	# Arguments: references to the three option lists (remain stems, remove
	# codon, remove 3rd position). @_ is left untouched because &help and
	# &preface are called without parentheses and therefore see this @_.
	my ( $aref_remain_stems, $aref_codons, $aref_third_pos ) = @_ ;

	# All arguments are glued together and split again at "-"; whatever
	# precedes the first "-" is discarded.
	my $argument_string = join "", @ARGV ;
	$argument_string    =~ s/ |\s+// ;

	my @switches = split "-", $argument_string ;
	shift @switches ;

	# Sorted order puts "s" (start) after the option toggles 3, c and r.
	for my $switch ( sort @switches ){

		if    ( $switch =~ /^r$/i ) { @$aref_remain_stems = reverse @$aref_remain_stems }
		elsif ( $switch =~ /^c$/i ) { @$aref_codons       = reverse @$aref_codons       }
		elsif ( $switch =~ /^3$/i ) { @$aref_third_pos    = reverse @$aref_third_pos    }
		elsif ( $switch =~ /^h$/i ) { &help }
		elsif ( $switch =~ /^p$/i ) { &preface }
		elsif ( $switch =~ /^s$/i ) {

			&header ;
			&commands ( \$aref_remain_stems->[0], \$aref_codons->[0], \$aref_third_pos->[0] ) ;
			&start    ( \$aref_remain_stems->[0], \$aref_codons->[0], \$aref_third_pos->[0] )
		}
		else { print "\n\t!COMMAND-ERROR!: unknown command \"-", $switch, "\"\n" }
	}

	&menu ( $aref_remain_stems, $aref_codons, $aref_third_pos )
}
|
||||
|
||||
sub header{

	# Print the right-aligned welcome banner framed by two rules.
	my $rule = "------------------------------------------------------------" ;

	printf "\n%68s\n", $rule ;
	printf "%49s\n"  , "Welcome to ALICUT V2.31 !" ;
	printf "%60s\n"  , "a Perlscript to cut ALISCORE identified RSS" ;
	printf "%57s\n"  , "written by Patrick Kueck (ZFMK, Bonn)" ;
	printf "%68s\n\n", $rule ;
}
|
||||
|
||||
sub commands{

	# Print the current value of the three options.
	# Arguments: scalar references to the active settings of remain stems,
	# remove codon and remove 3rd position.
	my ( $sref_rem_stems, $sref_reo_codon, $sref_th_posit ) = @_ ;

	print
		"\n\t------------------------------------------------------------",
		"\n\tRemain Stem Position :\t", $$sref_rem_stems,
		"\n\tRemove Codon :\t",         $$sref_reo_codon,
		"\n\tRemove 3rd Position :\t",  $$sref_th_posit,
		"\n\t------------------------------------------------------------\n" ;
}
|
||||
|
||||
sub help{

	# Print the help text, wait for <return> and go back to the main menu.
	# Takes no arguments of its own: it is called as "&help" and calls
	# "&menu" without parentheses, so the caller's @_ (the three option list
	# references) is passed straight through to the menu.

print
<<info;

-------------------------------------------------------------------
-------------------------------------------------------------------

General Information and Usage:
-------------------------------
ALICUT V2.31 removes ALISCORE identified RSS positions
in given FASTA file(s) which are listed in the FASTA file cor-
responding ALISCORE "List" outfile(s). If structure sequences
are implemented, ALICUT V2.31 automatically replaces brackets
of non rss positions by dots when they are paired with rss
identified positions.



Start ALICUT under default
-------------------------------------------------------------------
To remove all ALISCORE identified RSS positions:

Type <s> return (via Menu) or
Type <perl ALICUT_V2.31.pl -s> <enter> (via command line)



R-Option (Remain Stems)
-------------------------------------------------------------------
To remain all stem positions of identified rss within FASTA file(s):

Type <r> <return> <s> <enter> (via Menu)
Type <perl ALICUT_V2.31.pl -r -s> <enter> (via command line)



C-Option (Remove Codon)
-------------------------------------------------------------------
To translate ALISCORE identified RSS positions of amino-acid data
into nucleotide triplet positions before exclusion of randomised
sequence sections:

Type <c> return <s> return (via Menu) or
Type <perl ALICUT_V2.31.pl -c -s> <enter> (via command line)

Note:
This option is only useful if you have analysed amino-acid
data, but wish to exclude nucleotide positions from the amino-acid
data corresponding nucleotide data.
Be aware, that the name of the nucleotide data file has to be named
equal to the ALISCORE analysed amino-acid data file. The C-option
can not be applied on amino-acid sequences. Otherwise, ALICUT
excludes the original ALISCORE identified sequence sections.



3-Option (Remove 3rd position)
-------------------------------------------------------------------
To remove ALISCORE identified RSS only if its sequence position is
up to a multiple of 3:

Type <3> <return> <s> <return> (via Menu)
Type <perl ALICUT_V2.31.pl -3 -s> <enter> (via command line)

Note:
The 3-Option can be combined with the C-option. In this case,
positions of the ALISCORE "List" outfile(s) are translated into
codon positions from which only the 3rd positions are excluded.
The 3-Option can only be applied on nucleotide data. Otherwise,
ALICUT excludes the original ALISCORE identified sequence sections.



ALICUT IN and OUT files
-------------------------------------------------------------------
ALICUT V2.31 needs the original ALISCORE FASTA infile(s) and "List"
outfile(s) in the same folder as ALICUT V2.31.

The "List" outfile(s) must contain the identified RSS positions
in one single line, separated by whitespace.

e.g. 1 3 5 6 8 9 10 11 123 127 10000 10001

ALICUT V2.31 can handle unlimited FASTA files in one single run.
The sole condition is that the Prefix of the ALISCORE "List"
outfile(s) are identic with the associated FASTA infile(s).
ALICUT V2.31 first searches for the ALISCORE "List" outfile(s),
removes the Suffix "_List_random.txt" and searches for the
"List" associated FASTA file(s).

e.g. COI.fas_List_random.txt (ALISCORE "List" outfile)
COI.fas (Associated FASTA infile)

If both files are detected, ALICUT V2.31 excludes the RSS identified
positions of the "List" file(s) in the associated
FASTA file(s) and saves the changes in a new FASTA outfile,
named "ALICUT_FASTAinputname.fas".

Under the C- and 3-Option, removed sequence positions differ from
the original "List" position numbers. Under both options, ALICUT
prints the actually removed positions in separate
"ALICUT_cut_positions_*.txt" outfile(s).

ALICUT V2.31 generates also an info file "ALICUT_info.xls". This file
informs about the number and percentage of removed positions, number
of single sequences, single parameter settings, and sequence states
of each restricted FASTA file.
If structure sequences are identified by ALICUT, ALICUT generates
structure info file(s) which lists remaining stem pairs and loop
positions, as well as percentages of both structure elements.

-------------------------------------------------------------------
-------------------------------------------------------------------


info
;

	print "\tBACK to ALICUT MAIN-Menu:\t\t type <return>\n" ;
	print "\n\t------------------------------------------------------------\n\t" ;

	# The typed text is irrelevant; this only waits for <return>.
	chomp ( my $answer_xy = <STDIN> );

	&menu ;

}
|
||||
|
||||
sub preface{

	# Print version, author and licence information, wait for <return> and go
	# back to the main menu. Like &help it is called without parentheses and
	# calls "&menu" the same way, so the caller's @_ (the three option list
	# references) is passed straight through to the menu.

print
<<preface

-----------------------ALICUT PREFACE-----------------------

Version : 2.31
Language : PERL
Last Update : 22nd February, 2017
Author : Patrick Kueck, ZFMK Bonn GERMANY
e-mail : patrick_kueck\@web.de
Homepage : http://www.zfmk.de

This program is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public
License as published by the Free Software Foundation ;
either version 2 of the License, or (at your option) any
later version.

This program is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public
License along with this program; if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
USA.

For further free downloadable programs visit:
www.zfmk.de/web/Forschung/Abteilungen/AG_Wgele/index.en.html

------------------------------------------------------------

preface
;

	print "\tBACK to ALICUT MAIN-Menu:\t\t type <return>\n" ;
	print "\n\t------------------------------------------------------------\n\t" ;

	# The typed text is irrelevant; this only waits for <return>.
	chomp ( my $answer_xy = <STDIN> );

	&menu;
}
|
||||
|
||||
sub menu{

	# Show the main menu, read one command and dispatch it.
	#
	# Arguments: references to the three option lists (remain stems, remove
	# codon, remove 3rd position). @_ is left untouched because the calls
	# written without parentheses (&menu, &help, &preface, ...) pass this
	# sub's @_ on to the callee.
	my ( $aref_remain_stems, $aref_remove_codon, $aref_third_posit ) = @_ ;

	&header ;

	print
		"\n\tSTART ALICUT:\t\ttype <s> <return>",
		"\n\tQUIT ALICUT:\t\ttype <q> <return>",
		"\n\tREMAIN STEMS:\t\ttype <r> <return>",
		"\n\tREMOVE CODON:\t\ttype <c> <return>",
		"\n\tREMOVE 3rd:\t\ttype <3> <return>",
		"\n\tHELP:\t\t\ttype <h> <return>",
		"\n\tPREFACE:\t\ttype <p> <return>" ;

	&commands ( \$aref_remain_stems->[0], \$aref_remove_codon->[0], \$aref_third_posit->[0] );

	# Keep asking until one of the known commands is entered.
	my $choice = &commandline ;

	until ( $choice =~ /^s$|^r$|^c$|^p$|^h$|^1$|^2$|^q$|^3$/i ){

		print "\n\t!COMMAND-ERROR!: unknown command \"$choice\"!\n" ;

		$choice = &commandline ;
	}

	# The patterns are mutually exclusive, so at most one branch runs.
	# r, c and 3 toggle an option by reversing its list and redraw the menu.
	if    ( $choice =~ /^s$/i ){ &start ( \$aref_remain_stems->[0], \$aref_remove_codon->[0], \$aref_third_posit->[0] ) }
	elsif ( $choice =~ /^r$/i ){ @$aref_remain_stems = reverse @$aref_remain_stems ; &menu }
	elsif ( $choice =~ /^c$/i ){ @$aref_remove_codon = reverse @$aref_remove_codon ; &menu }
	elsif ( $choice =~ /^3$/i ){ @$aref_third_posit  = reverse @$aref_third_posit  ; &menu }
	elsif ( $choice =~ /^q$/i ){ exit }
	elsif ( $choice =~ /^h$/i ){ &help }
	elsif ( $choice =~ /^1$/  ){ &error1 }
	elsif ( $choice =~ /^2$/  ){ &error2 }
	elsif ( $choice =~ /^p$/i ){ &preface }
}
|
||||
|
||||
sub start{

	# Main routine: for every "*List_*.txt" file in the current directory,
	# cut the listed positions out of the FASTA file of the same prefix,
	# write the result to an "ALICUT_..." FASTA file and append one row to
	# ALICUT_info.xls. Prints a summary and exits the program when done.
	#
	# Arguments: scalar references to the active settings ('yes'/'no') of
	# remain stems, remove codon and remove 3rd position.
	#
	# NOTE: tie_linefeeds, set_timer and translate_cut_positions are written
	# inside this sub's braces, but named subs are package level in Perl.
	my $sref_stems_remain = $_[0] ;
	my $sref_codon_remove = $_[1] ;
	my $sref_third_remove = $_[2] ;

	# Number of list files that were processed completely
	my $j = 0 ;

	( open OUTinfo, ">>ALICUT_info.xls" ) or die "\n\t!FILE-ERROR!: Can not open info file ALICUT_info.xls!\n" ;
	print OUTinfo "\nUsed List File\tUsed Fasta file\tremove triplets\tremove 3rd position\tnumber taxa\tbp before\tbp after\tremaining bp [%]\tsequence type\n" ;



	# Read IN of all List_random.txt files within the same folder as ALICUT and handle it
	READING:
	foreach my $file ( <*List_*.txt> ) {

		# Read in of the ALISCORE-list outfile
		&tie_linefeeds ( \$file ) ;
		( open IN, "<$file" ) or die "\n\t!FILE-ERROR!: Can not open listfile $file!\n" ;
		my $line = <IN> ; chomp $line ;

		# check for correct aliscore list format
		unless ( $line =~ /^(\d+ )+\d+$|^\d+$/ ) { warn "\t!FILE-WARN!: $file has no ALISCORE list format!\n" ; next READING }

		# Total number of randomized identified positions
		my @cut_positions = split " ", $line ; close IN ;



		# "filename.fas_List_random.txt" to "filename.fas"
		( my $file_fasta = $file ) =~ s/_List_.+// ;

		# Read in of the original ALISCORE fasta infile which belongs to the listfile
		&tie_linefeeds ( \$file_fasta ) ;
		( open INfas, "<$file_fasta" ) or warn "\t!FILE-WARN!: Can not find $file_fasta!\n" and next READING ;

		chomp ( my @inputfile = <INfas> ) ; close INfas ;

		# Warn first, then skip. (Written as "warn ... if COND and next" the
		# skip ran as part of the condition and the warning never appeared.)
		if ( 0 == @inputfile ){ warn "\t!FILE-WARN!: File $file_fasta is empty!\n" ; next READING }

		# Handle the FASTA file in the way that sequencename and sequence alternate in each line
		@inputfile = fas_bearbeiten ( @inputfile ) ;

		# Generate a hash: key=>taxon, value => sequence
		my %sequence = @inputfile ;
		my @values   = values %sequence ;

		# Determine basepositions before and after cut. Output of cuttings as total number and in percent
		my $number_sequences         = keys %sequence ;
		my $number_characters_before = length $values[0] ;






		# Check for correct FASTA format and handling of structure sequence
		my $sequence_state = 'nt' ;
		SEQUENCE_CHECK:
		for my $raw_taxon ( keys %sequence ){

			# if whitespace are between ">" and the next sign within a sequence name, delete these whitespaces
			$raw_taxon =~ s/^\>\s*/\>/g ;

			# if whitespaces between last sign and newline in sequence name, delete these whitespaces
			$raw_taxon =~ s/\s*$//g ;

			die "\n\t!FILE-ERROR!: $raw_taxon in $file_fasta is not in FASTA format!\n" if $raw_taxon !~ /^\>/ ;
			die "\n\t!FILE-ERROR!: Sequence name missing in $file_fasta!\n" if $raw_taxon =~ /^\>$/ ;
			die "\n\t!FILE-ERROR!: Sequence name $raw_taxon in $file_fasta involves forbidden signs!\n" if $raw_taxon !~ /\w/ ;
			die "\n\t!FILE-ERROR!: Sequences of $file_fasta have no equal length!\n" if length $sequence{$raw_taxon} != $number_characters_before ;
			die "\n\t!FILE-ERROR!: Sequence missing in $file_fasta!\n" if $sequence{$raw_taxon} =~ /^\n$|^$/ ;
			die "\n\t!FILE-ERROR!: Sequence length in $file_fasta is too short to cut all positions!\n" if $number_characters_before < $cut_positions[ $#cut_positions ] ;



			# Structure handling: a sequence containing "(" ... ")" is treated as structure string
			if ( $sequence{$raw_taxon} =~ /.*\(.*\).*/ ){

				$sequence{$raw_taxon} =~ s/-/./g ;
				my @strc_elements = split "" , $sequence{$raw_taxon} ;

				for my $str_sign ( @strc_elements ){

					unless ( $str_sign =~ /\(|\)|\./ ){ die "\n\t!FILE-ERROR!: Structure string of $file_fasta involves forbidden signs in $raw_taxon!\n" }
				}

				my $structurestring = $sequence{$raw_taxon} ;
				$structurestring =~ s/-/./g ;
				$sequence{$raw_taxon} = &structure_handling ( \$structurestring, \$$sref_stems_remain, \@cut_positions, \$file_fasta ); next SEQUENCE_CHECK ;
			}



			# Check for correct sequence states (sequence is upper-cased first)
			$sequence{$raw_taxon} =~ s/(\w+)/\U$1/ig ;
			my @seq_elements = split "" , $sequence{$raw_taxon} ;

			for my $seq_sign ( @seq_elements ){

				unless ( $seq_sign =~ /A|C|G|T|U|-|N|Y|X|R|W|S|K|M|D|V|H|B|Q|E|I|L|F|P|\?/ ){ die "\n\t!FILE-ERROR!: Sequence of $file_fasta involves forbidden signs in $raw_taxon!\n" }
			}

			# Any of these letters marks the file as amino-acid data
			if ( $sequence{$raw_taxon} =~ /I|E|L|Q|F|P/ ) { $sequence_state = 'aa' }
		}








		# Translate cut positions; @fasta_cut receives the 0-based positions which remain
		my @fasta_cut;
		&translate_cut_positions( \$$sref_codon_remove, \$$sref_third_remove, \@cut_positions, \$number_characters_before, \@fasta_cut, \$sequence_state, \$file_fasta );


		# Calculate percent of remaining positions
		my $number_cut_positions    = @cut_positions ;
		my $number_characters_after = $number_characters_before-$number_cut_positions ;

		my $percent_left = sprintf "%.1f", ( $number_characters_after / $number_characters_before ) * 100 ;
		$percent_left    =~ s/\./,/g ;


		# Name of the outfile depends on the chosen options
		my $file_out ;
		if    ( $$sref_codon_remove =~ /yes/ && $$sref_third_remove =~ /yes/ ){ $file_out = "ALICUT_codon_3rd_$file_fasta" }
		elsif ( $$sref_codon_remove =~ /yes/ && $$sref_third_remove =~ /no/  ){ $file_out = "ALICUT_codon_$file_fasta" }
		elsif ( $$sref_codon_remove =~ /no/  && $$sref_third_remove =~ /yes/ ){ $file_out = "ALICUT_3rd_$file_fasta" }
		else                                                                   { $file_out = "ALICUT_$file_fasta" }

		# Assume uncut positions to $final and print out to the outfile
		( open OUT, ">$file_out" ) or die "\n\t!FILE-ERROR!: Can not open outfile $file_out!\n" ;

		for ( keys %sequence ){

			my @bases = split "", $sequence{$_} ;
			my @final = map { $bases[$_] } @fasta_cut ;
			my $final = $_."\n".( join "", @final )."\n" ;

			print OUT "$final" ;
		}
		close OUT;



		# Print Out of extra infos to ALICUT_info
		print OUTinfo "$file\t$file_fasta\t$$sref_codon_remove\t$$sref_third_remove\t$number_sequences\t$number_characters_before\t$number_characters_after\t$percent_left\t$sequence_state\n" ;
		print "\tDone : $file cut to $file_out\n" ;

		# Count only files which reached this point, i.e. were not skipped
		$j++
	}

	close OUTinfo ;


	# Print OUT number of right handled FASTA files
	printf "\n%68s\n", "------------------------------------------------------------" ;
	printf "%42s\n", "$j FASTA file(s) correctly handled!" ;
	printf "%57s\n", "Further infos are printed out in ALICUT_info.xls!" ;
	printf "\n%63s\n", "ALICUT V2.31 Finished! Thank you and good bye!" ;
	printf "%68s\n", "------------------------------------------------------------" ;


	&set_timer ;
	exit ;


	sub tie_linefeeds{

		# Convert "\r\n" and "\r" to "\n" inside the file itself (the file is
		# edited in place through Tie::File).
		# Argument: scalar reference to the file name.
		# If the file can not be opened or is empty, "next READING" leaves
		# this sub and continues with the next list file of the loop above.
		my $sref_filename = $_[0] ;

		( open IN , "<$$sref_filename" ) or warn "\tError: can not open $$sref_filename!\n" and next READING ;

		(tie ( my @data, 'Tie::File', $$sref_filename )) ;

		warn "\t!FILE-WARN!: $$sref_filename is empty!\n" and next READING if 0 == @data ;

		map { s/\r\n/\n/g } @data ;
		map { s/\r/\n/g } @data ;

		untie @data ; close IN ;

	}

	sub set_timer{

		# Print the user CPU time consumed so far.
		my ( $user, $system, $cuser, $csystem ) = times ;

print <<TIME;

*** time used: $user sec ***

TIME


	}

	sub translate_cut_positions {

		# Adapt the list of cut positions to the C- and 3-option and collect
		# the positions which remain.
		#
		# Arguments:
		#   [0] scalar ref : remove codon ('yes'/'no')
		#   [1] scalar ref : remove 3rd position ('yes'/'no')
		#   [2] array ref  : 1-based cut positions, replaced in place when an option applies
		#   [3] scalar ref : sequence length
		#   [4] array ref  : filled with the 0-based positions which are kept
		#   [5] scalar ref : sequence state ('nt' or 'aa')
		#   [6] scalar ref : FASTA file name (only used in warnings)
		my $sref_command_codon_remove = $_[0] ;
		my $sref_command_third_remove = $_[1] ;
		my $aref_cut_positions        = $_[2] ;
		my $sref_number_characters    = $_[3] ;
		my $aref_remaining_positions  = $_[4] ;
		my $sref_sequence_state       = $_[5] ;
		my $sref_filename             = $_[6] ;


		# Translate identified RSS aminoacid positions to nucleotide triplet positions
		if ( $$sref_command_codon_remove =~ /yes/ && $$sref_command_third_remove =~ /no/){

			unless ( $$sref_sequence_state =~ /aa/ ){

				my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
				for my $number( @fasta_old ){

					# Position n becomes the triplet 3n-2, 3n-1, 3n
					my $newno1 = ($number*3)-2;
					my $newno2 = $newno1+1;
					my $newno3 = $newno2+1;

					push @$aref_cut_positions, ( $newno1, $newno2, $newno3 )
				}

				my $string_cutnumbers = join " ", @$aref_cut_positions ;
				open OUTnewcut, ">ALICUT_cut_positions_codon.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_codon.txt" ;
				print OUTnewcut $string_cutnumbers ; close OUTnewcut ;
			}

			else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tCodon positions not translated!" }
		}

		# Translate identified RSS aminoacid positions to nucleotide triplet positions, but remove only third position
		elsif ( $$sref_command_codon_remove =~ /yes/ && $$sref_command_third_remove =~ /yes/){

			unless ( $$sref_sequence_state =~ /aa/ ){

				my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
				for my $number( @fasta_old ){

					push @$aref_cut_positions, ($number*3)
				}

				my $string_cutnumbers = join " ", @$aref_cut_positions ;
				open OUTnewcut, ">ALICUT_cut_positions_codon_3rd.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_codon_3rd.txt" ;
				print OUTnewcut $string_cutnumbers ; close OUTnewcut ;
			}

			else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tCodon positions not translated!\n\t3rd codon position not removed!" }
		}

		# Remove only identified RSS if third position of original sequence
		elsif ( $$sref_command_codon_remove =~ /no/ && $$sref_command_third_remove =~ /yes/){

			unless ( $$sref_sequence_state =~ /aa/ ){

				my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
				for my $number( @fasta_old ){

					if ( $number % 3 == 0 ){ push @$aref_cut_positions, $number }
				}

				my $string_cutnumbers = join " ", @$aref_cut_positions ;
				open OUTnewcut, ">ALICUT_cut_positions_3rd.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_3rd.txt" ;
				print OUTnewcut $string_cutnumbers ; close OUTnewcut
			}

			else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tNot only 3rd codon position removed!" }
		}


		# Examine remaining positions: every 0-based index which is not a cut position
		my ( %seen, @zahlenreihe ) ;
		for ( 1 .. $$sref_number_characters ) { push @zahlenreihe, $_-1 }

		for my $value ( @$aref_cut_positions ){ $seen{$value-1}++ }
		for ( @zahlenreihe ){ unless ( $seen{$_} ){ push @$aref_remaining_positions, $_ } }
	}
}
|
||||
|
||||
sub fas_bearbeiten{

	# Normalise the lines of a FASTA file so that sequence name and sequence
	# alternate: sequences spread over several lines are joined and all
	# blanks are removed.
	#
	# Arguments: the lines of the FASTA file.
	# Returns:   list of ( ">name", "sequence", ">name", "sequence", ... ).
	my @lines = @_ ;

	for ( @lines ){

		s/(\>.*)/$1\t/ ;	# mark the end of a name line with a tab
		s/ //g ;		# remove blanks
		s/\n//g ;		# remove newlines
		s/\t/\n/g ;		# the tab mark becomes the newline after the name
		s/\>/\n\>/g ;		# every name starts on a new line
	}

	my @records = split "\n", join ( "", @lines ) ;

	# Drop whatever precedes the first sequence name
	shift @records ;

	return @records ;
}
|
||||
|
||||
sub structure_handling{

	# Analyse a dot-bracket structure string, write a summary of its stem and
	# loop positions to "ALICUT_Struc_info_<filename>.txt" and reconcile the
	# structure with the RSS cut positions.
	#
	# Arguments (all references):
	#   $_[0]  scalar ref - structure string made of "(", ")" and "."
	#   $_[1]  scalar ref - answer matching /yes/i if stem positions remain
	#   $_[2]  array ref  - cut positions, compared against the 1-based
	#                       structure positions; rewritten in place when
	#                       stem positions remain
	#   $_[3]  scalar ref - file name used in the info file name and text
	#
	# Returns the structure string, with the partner of every cut stem
	# position replaced by "." when stem positions do not remain.
	my $sref_string			= $_[0] ;
	my $sref_answer_remain	= $_[1] ;
	my $aref_cut_positions	= $_[2] ;
	my $sref_filename		= $_[3] ;

	my (

		@pair_infos ,
		@forward ,
		@structurestring ,
		@loops ,
		@pairs ,
		%structure_of_position ,
		%seen_struc

	);


	# Stem assignment: positions are numbered from 1; every ")" is paired
	# with the most recent unmatched "("
	my @structures = split "", $$sref_string ;
	my $i = 0 ;

	CHECKING:
	for ( @structures ){ $i++ ;

		$structure_of_position{$i} = $_ ;

		if ( $_ =~ /\(/ ){ push @forward, $i and next CHECKING }
		if ( $_ =~ /\)/ ){ my $pair_1 = pop @forward; push @pairs, ( $pair_1, $i ); push @pair_infos, ( $pair_1.":".$i ); next CHECKING }
		if ( $_ =~ /\./ ){ push @loops, $i and next CHECKING }
	}

	@pair_infos = reverse @pair_infos ;


	# Generate listfiles for structure_info file
	my $pairlist = join "\n\t\t\t\t\t", @pair_infos ;
	my $looplist = join "\n\t\t\t\t\t", @loops ;


	# Number and proportion of stem and loop positions for structure info file
	# ($N_stems counts pairs, hence the factor 2 for the loop count)
	my $N_total = @structures ;
	my $N_stems = @pair_infos ;
	my $N_loops = $N_total - ( $N_stems * 2 ) ;

	# An empty structure string would otherwise divide by zero
	my $P_loops = $N_total ? ( $N_loops / $N_total ) * 100 : 0 ;
	my $P_stems = $N_total ? 100 - $P_loops : 0 ;


	# Open structure info outfile (fail loudly like the other outfiles do)
	open OUTstruc, ">ALICUT_Struc_info_${$sref_filename}.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_Struc_info_${$sref_filename}.txt" ;

	# Print out; each count is reported with its own percentage
	print OUTstruc "\nOriginal structure information identified in $$sref_filename:\n\n" ;
	print OUTstruc "- Number of characters:\t\t\t$N_total\n" ;
	print OUTstruc "- Number of single loop characters:\t$N_loops [$P_loops %]\n" ;
	print OUTstruc "- Number of paired stem characters:\t$N_stems [$P_stems %]\n" ;
	print OUTstruc "\n- Paired stem positions:\t\t$pairlist\n\n" ;
	print OUTstruc "\n- Loop positions:\t\t\t$looplist\n" ;

	close OUTstruc;


	if ( $$sref_answer_remain =~ /yes/i ){

		my @cut_positions2 = ();

		# Remain rss identified stem positions within the MSA:
		# drop every cut position that is part of a stem pair
		for ( @pairs ){ $seen_struc{$_} = 1 }
		for ( @$aref_cut_positions ){ unless ( $seen_struc{$_} ){ push @cut_positions2, $_ } }
		@$aref_cut_positions = @cut_positions2 ;
	}

	else{

		# @pairs is a flat ( opening, closing, ... ) list, so this maps
		# each opening position to its closing partner
		my %pair = @pairs;

		# Replace paired structure positions of rss identified positions by dots
		for my $bp_for ( keys %pair ){

			for my $rss ( @$aref_cut_positions ){

				if ( $bp_for        == $rss ){ $structure_of_position{$pair{$bp_for}} = "." ; last }
				if ( $pair{$bp_for} == $rss ){ $structure_of_position{$bp_for}        = "." ; last }
			}
		}
	}

	# NOTE(review): the loop stops at N-1, so the last structure character is
	# not part of the returned string - kept as in the original; confirm
	# whether the caller relies on this (e.g. a trailing terminator).
	for ( my $k=1; $k<=@structures-1; $k++ ){ push @structurestring, $structure_of_position{$k} }

	my $structure_string_neu = join "", @structurestring ;

	return $structure_string_neu ;
}
|
||||
|
||||
sub commandline{

	# Show the command prompt, read one line from STDIN and return it
	# without its trailing newline; a separator line is printed afterwards.
	print "\n\tCOMMAND:\t " ;

	my $sub_answer_opening = <STDIN> ;
	chomp $sub_answer_opening ;

	print "\n\t------------------------------------------------------------\n" ;

	return $sub_answer_opening ;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
1271
skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore.02.2.pl
Executable file
1271
skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore.02.2.pl
Executable file
File diff suppressed because it is too large
Load Diff
2081
skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore_module.pm
Executable file
2081
skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore_module.pm
Executable file
File diff suppressed because it is too large
Load Diff
174
skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py
Executable file
174
skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py
Executable file
@@ -0,0 +1,174 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Query NCBI for available genome assemblies by taxon name
|
||||
|
||||
Usage:
|
||||
python query_ncbi_assemblies.py --taxon "Coleoptera"
|
||||
python query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50
|
||||
python query_ncbi_assemblies.py --taxon "Apis" --refseq-only
|
||||
|
||||
Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib)
|
||||
|
||||
Author: Bruno de Medeiros (Field Museum)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
|
||||
def query_assemblies_by_taxon(taxon, max_results=20, refseq_only=False):
    """
    Look up genome assemblies of a taxon through the NCBI Datasets API

    Progress messages are printed to stdout, errors to stderr. The process
    exits with status 1 when ncbi-datasets-pylib cannot be imported or the
    query raises.

    Args:
        taxon: Taxon name (e.g., "Coleoptera", "Drosophila melanogaster")
        max_results: Maximum number of results to return
        refseq_only: If True, only return RefSeq assemblies (GCF_*)

    Returns:
        List of dictionaries with the keys 'accession', 'organism',
        'assembly_level', 'assembly_name' and 'submission_date'
        (empty list when the taxon has no assemblies)
    """
    # Imported lazily so a missing optional dependency gives a clear hint
    try:
        from ncbi.datasets import GenomeApi
        from ncbi.datasets.openapi import ApiClient, ApiException
    except ImportError:
        for hint in (
            "Error: ncbi-datasets-pylib not installed",
            "Install with: pip install ncbi-datasets-pylib",
        ):
            print(hint, file=sys.stderr)
        sys.exit(1)

    results = []

    print(f"Querying NCBI for '{taxon}' genome assemblies...")
    print(f"(Limiting to {max_results} results)")
    if refseq_only:
        print("(RefSeq assemblies only)")
    print("")

    try:
        with ApiClient() as client:
            # Query genome assemblies for the taxon
            summary = GenomeApi(client).genome_summary_by_taxon(
                taxon=taxon,
                limit=str(max_results),
                filters_refseq_only=refseq_only,
            )

            if not summary.reports:
                print(f"No assemblies found for taxon '{taxon}'")
                return []

            for report in summary.reports:
                results.append({
                    'accession': report.accession,
                    'organism': report.organism.organism_name,
                    'assembly_level': report.assembly_info.assembly_level,
                    'assembly_name': report.assembly_info.assembly_name,
                    # release_date may be absent from the report
                    'submission_date': getattr(report.assembly_info, 'release_date', 'N/A'),
                })

    except ApiException as e:
        print(f"Error querying NCBI: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)

    return results
|
||||
|
||||
|
||||
def format_table(assemblies):
    """
    Print the assemblies as a fixed-width table on stdout

    Organism and assembly names longer than their column (40 and 30
    characters) are shortened and terminated with '..'. Nothing is printed
    for an empty list.

    Args:
        assemblies: List of assembly dictionaries
    """
    if not assemblies:
        return

    def clip(text, width):
        # Text that fits is passed through unchanged; longer text keeps its
        # first width-2 characters followed by '..'
        return text if len(text) <= width else text[:width - 2] + '..'

    row = "{:<4} {:<20} {:<40} {:<15} {:<30}"

    print(f"Found {len(assemblies)} assemblies:\n")

    # Print header
    print(row.format('#', 'Accession', 'Organism', 'Level', 'Assembly Name'))
    print("-" * 110)

    # Print data rows
    for number, asm in enumerate(assemblies, 1):
        organism = clip(asm['organism'], 40)
        assembly_name = clip(asm['assembly_name'], 30)
        print(row.format(number, asm['accession'], organism, asm['assembly_level'], assembly_name))

    print("")
|
||||
|
||||
|
||||
def save_accessions(assemblies, output_file):
    """
    Write one assembly accession per line and print a download hint

    Args:
        assemblies: List of assembly dictionaries (only 'accession' is used)
        output_file: Output file path (overwritten if it exists)
    """
    with open(output_file, 'w') as handle:
        handle.writelines(f"{asm['accession']}\n" for asm in assemblies)

    for message in (
        f"Accessions saved to: {output_file}",
        "You can download these assemblies using:",
        f" python download_ncbi_genomes.py --assemblies $(cat {output_file})",
    ):
        print(message)
|
||||
|
||||
|
||||
def main():
    """Parse the command line, query NCBI, print the table and optionally save the accessions"""
    cli = argparse.ArgumentParser(
        description="Query NCBI for available genome assemblies by taxon name",
        epilog="Example: python query_ncbi_assemblies.py --taxon 'Coleoptera' --max-results 50",
    )
    cli.add_argument("--taxon", required=True,
                     help="Taxon name (e.g., 'Coleoptera', 'Drosophila melanogaster')")
    cli.add_argument("--max-results", type=int, default=20,
                     help="Maximum number of results to return (default: 20)")
    cli.add_argument("--refseq-only", action="store_true",
                     help="Only return RefSeq assemblies (GCF_* accessions)")
    cli.add_argument("--save", metavar="FILE",
                     help="Save accessions to a file for later download")
    options = cli.parse_args()

    # Query NCBI
    found = query_assemblies_by_taxon(
        taxon=options.taxon,
        max_results=options.max_results,
        refseq_only=options.refseq_only,
    )

    # Display results
    format_table(found)

    # Save only when a file was requested and there is something to write
    if not (options.save and found):
        return
    save_accessions(found, options.save)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
240
skills/phylo_from_buscos/scripts/rename_genomes.py
Executable file
240
skills/phylo_from_buscos/scripts/rename_genomes.py
Executable file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Rename genome files with clean, meaningful sample names for phylogenomics
|
||||
|
||||
This script helps create a mapping between genome files (often with cryptic
|
||||
accession numbers) and clean species/sample names that will appear in the
|
||||
final phylogenetic tree.
|
||||
|
||||
Usage:
|
||||
# Interactive mode - prompts for names
|
||||
python rename_genomes.py --interactive genome1.fasta genome2.fasta
|
||||
|
||||
# From mapping file (TSV: old_name<TAB>new_name)
|
||||
python rename_genomes.py --mapping samples.tsv
|
||||
|
||||
# Create template mapping file
|
||||
python rename_genomes.py --create-template *.fasta > samples.tsv
|
||||
|
||||
Author: Bruno de Medeiros (Field Museum)
|
||||
Based on tutorials by Paul Frandsen (BYU)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def sanitize_name(name):
    """
    Sanitize a name to be phylogenomics-safe
    - Replace spaces with underscores
    - Remove special characters
    - Keep only ASCII letters, ASCII digits, underscore, hyphen

    Args:
        name: Raw sample name

    Returns:
        The sanitized name (may be empty if no character is allowed)
    """
    # Replace spaces with underscores
    name = name.replace(' ', '_')
    # str.isalnum() alone also accepts accented letters and other Unicode
    # digits, so the character must additionally be ASCII to be kept
    return ''.join(
        c for c in name
        if c in '_-' or (c.isalnum() and ord(c) < 128)
    )
|
||||
|
||||
|
||||
def create_template(genome_files, output=sys.stdout):
    """
    Write a template mapping file: comment header plus one line per genome

    Args:
        genome_files: Paths of the genome files to list
        output: Writable text stream receiving the template (default: stdout)
    """
    header = (
        "# Sample mapping file\n",
        "# Format: original_filename<TAB>new_sample_name\n",
        "# Edit the second column with meaningful species/sample names\n",
        "# Recommended format: [ACCESSION]_[NAME] (e.g., GCA000123456_Penstemon_eatonii)\n",
        "# This keeps accession for traceability while having readable names in trees\n",
        "# Names should contain only letters, numbers, underscores, and hyphens\n",
        "#\n",
    )
    output.writelines(header)

    # Second column defaults to the file name without its last extension
    output.writelines(f"{gfile}\t{Path(gfile).stem}\n" for gfile in genome_files)
|
||||
|
||||
|
||||
def read_mapping(mapping_file):
    """
    Read mapping from TSV file

    Blank lines and lines starting with '#' are ignored. Lines that do not
    consist of exactly two tab-separated columns, or whose new name is empty
    after sanitizing, are skipped with a warning on stderr.

    Args:
        mapping_file: Path to a TSV file (old_name<TAB>new_name)

    Returns:
        Dict mapping each old name to its sanitized new name
    """
    mapping = {}
    with open(mapping_file, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip comments and empty lines
            if not line or line.startswith('#'):
                continue

            parts = line.split('\t')
            if len(parts) != 2:
                print(f"Warning: Skipping invalid line: {line}", file=sys.stderr)
                continue

            old_name, new_name = parts
            new_name = sanitize_name(new_name)
            if not new_name:
                # An empty name would later produce a hidden file such as
                # ".fasta", so refuse it here
                print(f"Warning: Skipping line with empty new name: {line}", file=sys.stderr)
                continue

            mapping[old_name] = new_name

    return mapping
|
||||
|
||||
|
||||
def interactive_rename(genome_files):
    """
    Interactively ask for new names

    For every file the user is prompted on stdin; an empty answer keeps the
    current base name. Answers are sanitized before being stored.

    Args:
        genome_files: Paths of the genome files to rename

    Returns:
        Dict mapping each given file path to its sanitized new name
    """
    mapping = {}

    print("Enter new sample names for each genome file.")
    print("Press Enter to keep the current name.")
    print("Names will be sanitized (spaces→underscores, special chars removed)\n")

    for gfile in genome_files:
        source = Path(gfile)
        current_name = source.stem
        new_name = input(f"{gfile} → [{current_name}]: ").strip()

        if not new_name:
            new_name = current_name

        new_name = sanitize_name(new_name)
        mapping[gfile] = new_name

        # The rename step keeps the original extension and only falls back
        # to '.fasta' when there is none, so preview exactly that name
        ext = source.suffix or '.fasta'
        print(f" Will rename to: {new_name}{ext}\n")

    return mapping
|
||||
|
||||
|
||||
def rename_files(mapping, dry_run=False, backup=True):
    """
    Rename genome files according to mapping

    Every file stays in its own directory and keeps its extension ('.fasta'
    is used when the original has none); only the base name changes.

    Args:
        mapping: Dict of existing file path -> new sample name
        dry_run: If True, only print what would be renamed
        backup: If True, copy each file to '<file>.backup' before renaming

    Returns:
        Tuple (renamed, errors): list of (old_file, new_file) pairs that
        were actually renamed, and list of error messages for skipped files
    """
    renamed = []
    errors = []

    for old_file, new_name in mapping.items():
        if not os.path.exists(old_file):
            errors.append(f"File not found: {old_file}")
            continue

        # Get extension from original file
        ext = Path(old_file).suffix or '.fasta'

        # Build the target next to the original: a bare file name would be
        # resolved against the working directory and move the file there
        new_file = os.path.join(os.path.dirname(old_file), f"{new_name}{ext}")

        # Check if target exists
        if os.path.exists(new_file) and new_file != old_file:
            errors.append(f"Target exists: {new_file}")
            continue

        # Skip if names are the same
        if old_file == new_file:
            print(f"Skip (no change): {old_file}")
            continue

        if dry_run:
            print(f"[DRY RUN] Would rename: {old_file} → {new_file}")
            continue

        # Backup if requested
        if backup:
            backup_file = f"{old_file}.backup"
            shutil.copy2(old_file, backup_file)
            print(f"Backup created: {backup_file}")

        # Rename
        shutil.move(old_file, new_file)
        print(f"Renamed: {old_file} → {new_file}")
        renamed.append((old_file, new_file))

    return renamed, errors
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point

    Exactly one mode is required: --create-template writes a template to
    stdout, --mapping and --interactive build a mapping and rename the
    files. Exits with status 1 when any file could not be renamed.
    """
    parser = argparse.ArgumentParser(
        description="Rename genome files with meaningful sample names for phylogenomics",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Create template mapping file
python rename_genomes.py --create-template *.fasta > samples.tsv
# Edit samples.tsv, then apply mapping
python rename_genomes.py --mapping samples.tsv

# Interactive renaming
python rename_genomes.py --interactive genome1.fasta genome2.fasta

# Dry run (preview changes)
python rename_genomes.py --mapping samples.tsv --dry-run
""",
    )

    modes = parser.add_mutually_exclusive_group(required=True)
    modes.add_argument('--create-template', nargs='+', metavar='GENOME',
                       help='Create a template mapping file from genome files')
    modes.add_argument('--mapping', metavar='FILE',
                       help='TSV file with mapping (old_name<TAB>new_name)')
    modes.add_argument('--interactive', nargs='+', metavar='GENOME',
                       help='Interactively rename genome files')

    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be renamed without actually renaming')
    parser.add_argument('--no-backup', action='store_true',
                        help='Do not create backup files')

    options = parser.parse_args()

    # Template mode only writes the template and stops
    if options.create_template:
        create_template(options.create_template)
        return

    # Build the mapping from the selected source
    if options.interactive:
        mapping = interactive_rename(options.interactive)
    elif options.mapping:
        mapping = read_mapping(options.mapping)
    else:
        parser.error("No mode specified")

    if not mapping:
        print("No files to rename", file=sys.stderr)
        return

    # Perform renaming
    done, problems = rename_files(
        mapping,
        dry_run=options.dry_run,
        backup=not options.no_backup,
    )

    # Summary
    print("\n" + "="*60)
    summary = (
        "DRY RUN - No files were actually renamed"
        if options.dry_run
        else f"Successfully renamed {len(done)} file(s)"
    )
    print(summary)

    if not problems:
        return

    print(f"\nErrors ({len(problems)}):")
    for problem in problems:
        print(f" - {problem}")
    sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
247
skills/phylo_from_buscos/scripts/run_alicut.sh
Executable file
247
skills/phylo_from_buscos/scripts/run_alicut.sh
Executable file
@@ -0,0 +1,247 @@
|
||||
#!/bin/bash
|
||||
|
||||
# run_alicut.sh
|
||||
# Wrapper script for running ALICUT to remove Aliscore-identified RSS positions
|
||||
# Removes randomly similar sequence sections from alignments
|
||||
#
|
||||
# Usage:
|
||||
# bash run_alicut.sh [aliscore_dir] [options]
|
||||
#
|
||||
# Options:
|
||||
# -r Remain stem positions (for RNA secondary structures)
|
||||
# -c Remove codon (translate AA positions to nucleotide triplets)
|
||||
# -3 Remove only 3rd codon positions
|
||||
# -s Silent mode (non-interactive, use defaults)
|
||||
#
|
||||
# Requirements:
|
||||
# - ALICUT_V2.31.pl in PATH or same directory
|
||||
# - Perl with File::Copy, Tie::File, Term::Cap modules
|
||||
# - Aliscore output directory with *_List_*.txt and original .fas file
|
||||
|
||||
set -euo pipefail

# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Check for ALICUT script
# The result is stored as a full path: this script later changes into the
# Aliscore directory, and `perl` does not search PATH for its program file,
# so a bare name or a "./"-relative path would no longer resolve there.
if command -v ALICUT_V2.31.pl &> /dev/null; then
    ALICUT_SCRIPT="$(command -v ALICUT_V2.31.pl)"
elif [ -f "${SCRIPT_DIR}/ALICUT_V2.31.pl" ]; then
    ALICUT_SCRIPT="${SCRIPT_DIR}/ALICUT_V2.31.pl"
elif [ -f "./ALICUT_V2.31.pl" ]; then
    ALICUT_SCRIPT="$(pwd)/ALICUT_V2.31.pl"
else
    echo "ERROR: ALICUT_V2.31.pl not found in PATH, script directory, or current directory"
    echo "Please download from: https://www.zfmk.de/en/research/research-centres-and-groups/alicut"
    exit 1
fi
|
||||
|
||||
# Function to display usage
|
||||
usage() {
# Print the help text below to stdout and terminate the script with exit
# status 0; this function never returns to its caller.
# The here-document is unquoted, so $0 expands to the script name while
# \${dir} is printed literally as ${dir}.
cat <<EOF
Usage: $0 [aliscore_dir] [options]

Run ALICUT to remove Aliscore-identified randomly similar sequence sections.

Arguments:
aliscore_dir Directory containing Aliscore output files

Options:
-r Remain stem positions in RNA secondary structure alignments
-c Remove entire codon (translates AA RSS positions to nt triplets)
-3 Remove only 3rd codon position of identified RSS
-s Silent/scripted mode (non-interactive, use defaults)
-h Display this help message

Input Requirements:
The aliscore_dir must contain:
- Original FASTA alignment file (*.fas)
- Aliscore List file (*_List_random.txt or *_List_*.txt)

Examples:
# Basic usage (interactive mode)
bash run_alicut.sh aliscore_alignment1

# Silent mode with defaults
bash run_alicut.sh aliscore_alignment1 -s

# Remain RNA stem positions
bash run_alicut.sh aliscore_16S -r -s

# Remove entire codons (for back-translation)
bash run_alicut.sh aliscore_protein1 -c -s

# Process all Aliscore output directories
for dir in aliscore_*/; do
bash run_alicut.sh "\${dir}" -s
done

Output Files (in aliscore_dir):
- ALICUT_[alignment].fas : Trimmed alignment
- ALICUT_info.xls : Statistics (taxa, positions removed, etc.)
- ALICUT_Struc_info_*.txt : Structure information (if RNA detected)

Citation:
Kück P, Meusemann K, Dambach J, Thormann B, von Reumont BM, Wägele JW,
Misof B (2010) Parametric and non-parametric masking of randomness in
sequence alignments can be improved and leads to better resolved trees.
Front Zool 7:10. doi: 10.1186/1742-9994-7-10

EOF
exit 0
}
|
||||
|
||||
# Parse command line arguments
ALISCORE_DIR=""
ALICUT_OPTS=""
SILENT_MODE=false

if [ $# -eq 0 ]; then
    usage
fi

# Allow asking for help without naming a directory first; otherwise "-h"
# would be taken for the Aliscore directory and rejected as missing
case "$1" in
    -h|--help)
        usage
        ;;
esac

ALISCORE_DIR="$1"
shift

# Validate directory exists
if [ ! -d "${ALISCORE_DIR}" ]; then
    echo "ERROR: Aliscore directory not found: ${ALISCORE_DIR}"
    exit 1
fi

# Parse ALICUT options
# ALICUT_OPTS is a space-separated string of flags passed on to ALICUT
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help)
            usage
            ;;
        -r)
            ALICUT_OPTS="${ALICUT_OPTS} -r"
            shift
            ;;
        -c)
            ALICUT_OPTS="${ALICUT_OPTS} -c"
            shift
            ;;
        -3)
            ALICUT_OPTS="${ALICUT_OPTS} -3"
            shift
            ;;
        -s|--silent)
            SILENT_MODE=true
            ALICUT_OPTS="${ALICUT_OPTS} -s"
            shift
            ;;
        *)
            # usage() exits with status 0, which would hide the mistake from
            # calling scripts; report the error and fail instead
            echo "ERROR: Unknown option: $1" >&2
            echo "Run '$0 -h' for usage information" >&2
            exit 1
            ;;
    esac
done
|
||||
|
||||
# Remember the starting directory so we can return to it no matter how
# deeply ALISCORE_DIR is nested
START_DIR="$(pwd)"

# Change to Aliscore output directory
cd "${ALISCORE_DIR}"

echo "Processing Aliscore output in: ${ALISCORE_DIR}"

# Find List file (first match in glob order).
# A glob loop is used instead of `ls | head`: under `set -o pipefail` a
# failing `ls` aborts the script before the error message can be printed.
LIST_FILE=""
for candidate in *_List_*.txt; do
    if [ -f "${candidate}" ]; then
        LIST_FILE="${candidate}"
        break
    fi
done
if [ -z "${LIST_FILE}" ]; then
    echo "ERROR: No Aliscore List file found (*_List_*.txt)"
    echo "Make sure Aliscore completed successfully"
    exit 1
fi

echo "Found List file: ${LIST_FILE}"

# Find original FASTA file, ignoring ALICUT_* outputs of a previous run
FASTA_FILE=""
for candidate in *.fas *.fasta; do
    case "${candidate}" in
        ALICUT_*) continue ;;
    esac
    if [ -f "${candidate}" ]; then
        FASTA_FILE="${candidate}"
        break
    fi
done
if [ -z "${FASTA_FILE}" ]; then
    echo "ERROR: No FASTA alignment file found (*.fas or *.fasta)"
    echo "ALICUT requires the original alignment file in the same directory as List file"
    exit 1
fi

echo "Found FASTA file: ${FASTA_FILE}"

# Check if List file contains RSS positions
# (tr strips the padding some wc implementations print around the number)
RSS_COUNT=$(wc -w < "${LIST_FILE}" | tr -d '[:space:]')
if [ "${RSS_COUNT}" -eq 0 ]; then
    echo "WARNING: List file is empty (no RSS positions identified)"
    echo "Aliscore found no randomly similar sequences to remove"
    echo "Skipping ALICUT - alignment is already clean"

    # Create a symbolic link to indicate no trimming was needed
    ln -sf "${FASTA_FILE}" "ALICUT_${FASTA_FILE}"
    echo "Created symbolic link: ALICUT_${FASTA_FILE} -> ${FASTA_FILE}"

    cd "${START_DIR}"
    exit 0
fi

echo "Found ${RSS_COUNT} RSS positions to remove"

# Run ALICUT
echo ""
echo "Running ALICUT..."
echo "Options: ${ALICUT_OPTS}"

if [ "${SILENT_MODE}" = true ]; then
    echo "Command: perl ${ALICUT_SCRIPT} ${ALICUT_OPTS}"
else
    echo "Running ALICUT in interactive mode..."
    echo "Press 's' and Enter to start with current options"
    echo ""
fi

# Print the length of the first sequence of a (possibly multi-line) FASTA
# file, not counting line breaks
first_sequence_length() {
    awk '/^>/ { if (seen++) exit; next } { len += length($0) } END { print len + 0 }' "$1"
}

# ALICUT runs as the condition of `if` so that a failure reaches the error
# branch instead of aborting the script immediately under `set -e`.
# ALICUT_OPTS is intentionally unquoted: it holds several separate flags.
if perl "${ALICUT_SCRIPT}" ${ALICUT_OPTS}; then
    echo ""
    echo "ALICUT completed successfully"

    # Find output file (glob loop: `ls` fails under pipefail as soon as one
    # of several patterns has no match)
    OUTPUT_FILE=""
    for candidate in ALICUT_*.fas ALICUT_*.fasta; do
        if [ -f "${candidate}" ]; then
            OUTPUT_FILE="${candidate}"
            break
        fi
    done

    if [ -n "${OUTPUT_FILE}" ]; then
        echo ""
        echo "Output files:"
        ls -lh ALICUT_* 2>/dev/null

        # Calculate and report trimming statistics (handle multi-line FASTA format)
        ORIGINAL_LENGTH=$(first_sequence_length "${FASTA_FILE}")
        TRIMMED_LENGTH=$(first_sequence_length "${OUTPUT_FILE}")
        REMOVED_LENGTH=$((ORIGINAL_LENGTH - TRIMMED_LENGTH))
        if [ "${ORIGINAL_LENGTH}" -gt 0 ]; then
            PERCENT_REMOVED=$(awk "BEGIN {printf \"%.1f\", (${REMOVED_LENGTH}/${ORIGINAL_LENGTH})*100}")
        else
            # Avoid a division by zero for an empty first sequence
            PERCENT_REMOVED="0.0"
        fi

        echo ""
        echo "Trimming statistics:"
        echo " Original length: ${ORIGINAL_LENGTH} bp"
        echo " Trimmed length: ${TRIMMED_LENGTH} bp"
        echo " Removed: ${REMOVED_LENGTH} bp (${PERCENT_REMOVED}%)"

        # Check for info file
        if [ -f "ALICUT_info.xls" ]; then
            echo ""
            echo "Detailed statistics in: ALICUT_info.xls"
        fi
    else
        echo "WARNING: Expected output file ALICUT_*.fas not found"
    fi
else
    echo "ERROR: ALICUT failed"
    cd "${START_DIR}"
    exit 1
fi

# Return to the starting directory
cd "${START_DIR}"

echo ""
echo "Done: ${ALISCORE_DIR}"
|
||||
248
skills/phylo_from_buscos/scripts/run_aliscore.sh
Executable file
248
skills/phylo_from_buscos/scripts/run_aliscore.sh
Executable file
@@ -0,0 +1,248 @@
|
||||
#!/bin/bash
|
||||
|
||||
# run_aliscore.sh
|
||||
# Wrapper script for running Aliscore on aligned sequences
|
||||
# Identifies randomly similar sequence sections (RSS) in multiple sequence alignments
|
||||
#
|
||||
# Usage:
|
||||
# bash run_aliscore.sh [alignment.fas] [options]
|
||||
#
|
||||
# Options:
|
||||
# -w INT Window size (default: 4)
|
||||
# -r INT Number of random pairs to compare (default: 4*N taxa)
|
||||
# -N Treat gaps as ambiguous characters (recommended for amino acids)
|
||||
# -t TREE Tree file in Newick format for guided comparisons
|
||||
# -l LEVEL Node level for tree-based comparisons
|
||||
# -o TAXA Comma-separated list of outgroup taxa
|
||||
#
|
||||
# Array job usage:
|
||||
# Set SLURM_ARRAY_TASK_ID or PBS_ARRAYID environment variable
|
||||
# Create locus_list.txt with one alignment file per line
|
||||
#
|
||||
# Requirements:
|
||||
# - Aliscore.02.2.pl in PATH or same directory
|
||||
# - Perl with Tie::File and Fcntl modules
|
||||
|
||||
set -euo pipefail

# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Check for Aliscore script
# The result is stored as a full path: Aliscore is later started from
# inside the output directory, and `perl` does not search PATH for its
# program file, so a bare name or a "./"-relative path would not resolve.
if command -v Aliscore.02.2.pl &> /dev/null; then
    ALISCORE_SCRIPT="$(command -v Aliscore.02.2.pl)"
elif [ -f "${SCRIPT_DIR}/Aliscore.02.2.pl" ]; then
    ALISCORE_SCRIPT="${SCRIPT_DIR}/Aliscore.02.2.pl"
elif [ -f "./Aliscore.02.2.pl" ]; then
    ALISCORE_SCRIPT="$(pwd)/Aliscore.02.2.pl"
else
    echo "ERROR: Aliscore.02.2.pl not found in PATH, script directory, or current directory"
    echo "Please download from: https://www.zfmk.de/en/research/research-centres-and-groups/aliscore"
    exit 1
fi
|
||||
|
||||
# Function to display usage
|
||||
usage() {
# Print the help text below to stdout and terminate the script with exit
# status 0; this function never returns to its caller.
# The here-document is unquoted, so $0 expands to the script name while
# \$(...) is printed literally as $(...).
cat <<EOF
Usage: $0 [alignment.fas] [options]

Run Aliscore to identify randomly similar sequence sections in alignments.

Options:
-d DIR Base output directory for all Aliscore results (default: aliscore_output)
-w INT Window size for sliding window analysis (default: 4)
-r INT Number of random sequence pairs to compare (default: 4*N taxa)
-N Treat gaps as ambiguous characters (recommended for amino acids)
-t FILE Tree file in Newick format for phylogeny-guided comparisons
-l LEVEL Node level limit for tree-based comparisons (default: all)
-o TAXA Comma-separated list of outgroup taxa for focused comparisons
-h Display this help message

Array Job Mode:
If SLURM_ARRAY_TASK_ID or PBS_ARRAYID is set, reads alignment from locus_list.txt
Create locus_list.txt with: ls *.fas > locus_list.txt

Examples:
# Basic run with defaults (outputs to aliscore_output/)
bash run_aliscore.sh alignment.fas

# Amino acid sequences with gaps as ambiguous
bash run_aliscore.sh protein_alignment.fas -N

# Custom output directory
bash run_aliscore.sh alignment.fas -d my_aliscore_results

# Custom window size and random pairs
bash run_aliscore.sh alignment.fas -w 6 -r 100

# Tree-guided analysis
bash run_aliscore.sh alignment.fas -t species.tre

# Array job on SLURM
ls aligned_aa/*.fas > locus_list.txt
sbatch --array=1-\$(wc -l < locus_list.txt) run_aliscore_array.job

Output Files (in aliscore_output/aliscore_[alignment]/):
- [alignment]_List_random.txt : Positions identified as RSS (for ALICUT)
- [alignment]_Profile_random.txt: Quality profile for each position
- [alignment].svg : Visual plot of scoring profiles

Citation:
Misof B, Misof K (2009) A Monte Carlo approach successfully identifies
randomness in multiple sequence alignments: a more objective means of data
exclusion. Syst Biol 58(1):21-34. doi: 10.1093/sysbio/syp006

EOF
exit 0
}
|
||||
|
||||
# Parse command line arguments
ALIGNMENT=""
ALISCORE_OPTS=""
BASE_OUTPUT_DIR="aliscore_output"

# Abort with a clear message when an option that takes a value is the last
# argument (a bare "$2" would trip `set -u` with a cryptic error).
#   $1 = option name, $2 = number of arguments left including the option
require_value() {
    if [ "$2" -lt 2 ]; then
        echo "ERROR: Option $1 requires a value" >&2
        exit 1
    fi
}

# Check for array job mode
ARRAY_MODE=false
ARRAY_ID=""

if [ -n "${SLURM_ARRAY_TASK_ID:-}" ]; then
    ARRAY_MODE=true
    ARRAY_ID="${SLURM_ARRAY_TASK_ID}"
elif [ -n "${PBS_ARRAYID:-}" ]; then
    ARRAY_MODE=true
    ARRAY_ID="${PBS_ARRAYID}"
fi

# If in array mode, get alignment from locus list
if [ "${ARRAY_MODE}" = true ]; then
    if [ ! -f "locus_list.txt" ]; then
        echo "ERROR: Array job mode requires locus_list.txt"
        echo "Create with: ls *.fas > locus_list.txt"
        exit 1
    fi

    # Line number ARRAY_ID of the list names the alignment for this task
    ALIGNMENT=$(sed -n "${ARRAY_ID}p" locus_list.txt)

    if [ -z "${ALIGNMENT}" ]; then
        echo "ERROR: Could not read alignment for array index ${ARRAY_ID}"
        exit 1
    fi

    echo "Array job ${ARRAY_ID}: Processing ${ALIGNMENT}"

    # Remaining arguments are Aliscore options: keep them for the option
    # loop below. A leading non-option word (a placeholder alignment name)
    # is dropped, because the alignment comes from locus_list.txt.
    if [ $# -gt 0 ] && [ "${1#-}" = "$1" ]; then
        shift
    fi
else
    if [ $# -eq 0 ]; then
        usage
    fi

    # Allow asking for help without naming an alignment first
    case "$1" in
        -h|--help)
            usage
            ;;
    esac

    # First argument is alignment file
    ALIGNMENT="$1"
    shift
fi

# Validate alignment file exists
if [ ! -f "${ALIGNMENT}" ]; then
    echo "ERROR: Alignment file not found: ${ALIGNMENT}"
    exit 1
fi

# Parse Aliscore options
# ALISCORE_OPTS is a space-separated string passed on to Aliscore
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help)
            usage
            ;;
        -d|--output-dir)
            require_value "$1" $#
            BASE_OUTPUT_DIR="$2"
            shift 2
            ;;
        -w)
            require_value "$1" $#
            ALISCORE_OPTS="${ALISCORE_OPTS} -w $2"
            shift 2
            ;;
        -r)
            require_value "$1" $#
            ALISCORE_OPTS="${ALISCORE_OPTS} -r $2"
            shift 2
            ;;
        -N)
            ALISCORE_OPTS="${ALISCORE_OPTS} -N"
            shift
            ;;
        -t)
            require_value "$1" $#
            if [ ! -f "$2" ]; then
                echo "ERROR: Tree file not found: $2"
                exit 1
            fi
            # Aliscore is started from inside the output directory, so a
            # relative tree path must be made absolute here
            TREE_FILE="$(cd "$(dirname "$2")" && pwd)/$(basename "$2")"
            ALISCORE_OPTS="${ALISCORE_OPTS} -t ${TREE_FILE}"
            shift 2
            ;;
        -l)
            require_value "$1" $#
            ALISCORE_OPTS="${ALISCORE_OPTS} -l $2"
            shift 2
            ;;
        -o)
            require_value "$1" $#
            ALISCORE_OPTS="${ALISCORE_OPTS} -o $2"
            shift 2
            ;;
        *)
            # usage() exits with status 0, which would hide the mistake from
            # job schedulers and calling scripts; report and fail instead
            echo "ERROR: Unknown option: $1" >&2
            echo "Run '$0 -h' for usage information" >&2
            exit 1
            ;;
    esac
done
|
||||
|
||||
# Get alignment file name and its name without extension
ALIGNMENT_FILE=$(basename "${ALIGNMENT}")
ALIGNMENT_NAME=$(basename "${ALIGNMENT}" .fas)
ALIGNMENT_NAME=$(basename "${ALIGNMENT_NAME}" .fasta)

# Remember the starting directory: the output directory is nested at least
# two levels deep, so a single `cd ..` would not lead back here
START_DIR="$(pwd)"

# Create base output directory and specific directory for this alignment
mkdir -p "${BASE_OUTPUT_DIR}"
OUTPUT_DIR="${BASE_OUTPUT_DIR}/aliscore_${ALIGNMENT_NAME}"
mkdir -p "${OUTPUT_DIR}"

# Copy alignment to output directory
cp "${ALIGNMENT}" "${OUTPUT_DIR}/"

# Change to output directory
cd "${OUTPUT_DIR}"

# Run Aliscore
echo "Running Aliscore on ${ALIGNMENT}..."
echo "Options: ${ALISCORE_OPTS}"
echo "Aliscore script: ${ALISCORE_SCRIPT}"

echo "Command: perl -I${SCRIPT_DIR} ${ALISCORE_SCRIPT} -i ${ALIGNMENT_FILE} ${ALISCORE_OPTS}"

# Aliscore runs as the condition of `if` so that a failure reaches the error
# branch instead of aborting the script immediately under `set -e`; calling
# perl directly (no eval) also keeps file names from being re-parsed by the
# shell. ALISCORE_OPTS is intentionally unquoted: it holds several words.
if perl -I"${SCRIPT_DIR}" "${ALISCORE_SCRIPT}" -i "${ALIGNMENT_FILE}" ${ALISCORE_OPTS}; then
    echo "Aliscore completed successfully for ${ALIGNMENT}"

    # List output files
    echo ""
    echo "Output files in ${OUTPUT_DIR}:"
    ls -lh *List*.txt *Profile*.txt *.svg 2>/dev/null || echo " (some expected files not generated)"

    # Report RSS positions if found
    if [ -f "${ALIGNMENT_FILE}_List_random.txt" ]; then
        RSS_COUNT=$(wc -w < "${ALIGNMENT_FILE}_List_random.txt")
        echo ""
        echo "Identified ${RSS_COUNT} randomly similar sequence positions"
        echo "See: ${OUTPUT_DIR}/${ALIGNMENT_FILE}_List_random.txt"
    fi
else
    echo "ERROR: Aliscore failed for ${ALIGNMENT}"
    cd "${START_DIR}"
    exit 1
fi

# Return to the starting directory
cd "${START_DIR}"

echo "Done: ${ALIGNMENT} -> ${OUTPUT_DIR}"
|
||||
270
skills/phylo_from_buscos/scripts/run_aliscore_alicut_batch.sh
Executable file
270
skills/phylo_from_buscos/scripts/run_aliscore_alicut_batch.sh
Executable file
@@ -0,0 +1,270 @@
|
||||
#!/bin/bash
|
||||
|
||||
# run_aliscore_alicut_batch.sh
|
||||
# Batch processing script for Aliscore + ALICUT alignment trimming
|
||||
# Processes all alignments in a directory through both tools sequentially
|
||||
#
|
||||
# Usage:
|
||||
# bash run_aliscore_alicut_batch.sh [alignment_dir] [options]
|
||||
#
|
||||
# This script:
|
||||
# 1. Runs Aliscore on all alignments to identify RSS
|
||||
# 2. Runs ALICUT on each Aliscore output to remove RSS
|
||||
# 3. Collects trimmed alignments in output directory
|
||||
#
|
||||
# Requirements:
|
||||
#   - run_aliscore.sh and run_alicut.sh in the same directory as this script
|
||||
# - Aliscore.02.2.pl and ALICUT_V2.31.pl available
|
||||
|
||||
# Strict mode: exit on any command failure (-e), treat unset variables as
# errors (-u), and make a pipeline fail if any of its stages fails (pipefail).
set -euo pipefail
|
||||
|
||||
# Absolute path of the directory containing this script; the run_aliscore.sh
# and run_alicut.sh wrappers are looked up relative to it.
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
# Print the help text and terminate the script.
#
# Arguments:
#   $1 - optional exit status (default: 0). With a non-zero status the help
#        text is written to stderr, so an error path neither pollutes
#        captured stdout nor reports success to the caller.
usage() {
    local status="${1:-0}"
    local fd=1
    if [ "${status}" -ne 0 ]; then
        fd=2
    fi

    cat >&"${fd}" <<EOF
Usage: $0 <alignment_dir> [options]

Batch process multiple alignments through Aliscore and ALICUT.

Arguments:
  alignment_dir           Directory containing aligned FASTA files (*.fas, *.fasta)

Options:
  -o, --output DIR        Output directory for trimmed alignments (default: aliscore_alicut_trimmed)
  -d, --aliscore-dir DIR  Base directory for Aliscore outputs (default: aliscore_output)
  -w INT                  Aliscore window size (default: 4)
  -r INT                  Aliscore random pairs (default: 4*N)
  -N                      Aliscore: treat gaps as ambiguous (recommended for AA)
  --remain-stems          ALICUT: remain RNA stem positions
  --remove-codon          ALICUT: remove entire codons (for back-translation)
  --remove-3rd            ALICUT: remove only 3rd codon positions
  -h, --help              Display this help message

Examples:
  # Basic usage for amino acid alignments
  bash run_aliscore_alicut_batch.sh aligned_aa/ -N

  # Custom window size
  bash run_aliscore_alicut_batch.sh aligned_aa/ -w 6 -N

  # With RNA structure preservation
  bash run_aliscore_alicut_batch.sh aligned_rrna/ --remain-stems

Output:
  - aliscore_output/aliscore_[locus]/ : Individual Aliscore results per locus
  - aliscore_alicut_trimmed/ : Final trimmed alignments
  - aliscore_alicut_trimmed/trimming_summary.txt : Statistics for all loci

EOF
    exit "${status}"
}
|
||||
|
||||
# Default parameters; the option-parsing loop may override them.
ALIGNMENT_DIR=""
OUTPUT_DIR="aliscore_alicut_trimmed"
ALISCORE_BASE_DIR="aliscore_output"
ALISCORE_OPTS=""
ALICUT_OPTS="-s"  # Silent mode by default

# Without any argument there is nothing to do: show the help text.
if [ $# -eq 0 ]; then
    usage
fi

# A help request given as the very first argument must not be mistaken for
# the alignment directory (it would otherwise fail with "directory not found").
case "$1" in
    -h|--help)
        usage
        ;;
esac

# First positional argument: directory holding the alignments.
ALIGNMENT_DIR="$1"
shift

# Validate alignment directory
if [ ! -d "${ALIGNMENT_DIR}" ]; then
    echo "ERROR: Alignment directory not found: ${ALIGNMENT_DIR}" >&2
    exit 1
fi
|
||||
|
||||
# Abort with a clear message when an option that takes a value is the last
# argument; otherwise `set -u` reports a cryptic "unbound variable" for $2.
#   $1 - option name as typed by the user
#   $2 - number of arguments still to be parsed, including the option itself
require_value() {
    if [ "$2" -lt 2 ]; then
        echo "ERROR: Option $1 requires a value" >&2
        exit 1
    fi
}

# Parse options. Aliscore and ALICUT flags are accumulated into
# whitespace-separated strings that are later passed on to the wrappers.
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help)
            usage
            ;;
        -o|--output)
            require_value "$1" $#
            OUTPUT_DIR="$2"
            shift 2
            ;;
        -d|--aliscore-dir)
            require_value "$1" $#
            ALISCORE_BASE_DIR="$2"
            shift 2
            ;;
        -w)
            require_value "$1" $#
            ALISCORE_OPTS="${ALISCORE_OPTS} -w $2"
            shift 2
            ;;
        -r)
            require_value "$1" $#
            ALISCORE_OPTS="${ALISCORE_OPTS} -r $2"
            shift 2
            ;;
        -N)
            ALISCORE_OPTS="${ALISCORE_OPTS} -N"
            shift
            ;;
        --remain-stems)
            ALICUT_OPTS="${ALICUT_OPTS} -r"
            shift
            ;;
        --remove-codon)
            ALICUT_OPTS="${ALICUT_OPTS} -c"
            shift
            ;;
        --remove-3rd)
            ALICUT_OPTS="${ALICUT_OPTS} -3"
            shift
            ;;
        *)
            echo "ERROR: Unknown option: $1" >&2
            # usage terminates the shell it runs in; calling it in a subshell
            # lets us print the help to stderr and still exit non-zero here.
            ( usage ) >&2
            exit 1
            ;;
    esac
done
|
||||
|
||||
# Both wrapper scripts are expected to sit next to this script.
RUN_ALISCORE="${SCRIPT_DIR}/run_aliscore.sh"
RUN_ALICUT="${SCRIPT_DIR}/run_alicut.sh"

# Stop at the first wrapper that is missing.
for wrapper in "${RUN_ALISCORE}" "${RUN_ALICUT}"; do
    [ -f "${wrapper}" ] && continue
    echo "ERROR: ${wrapper##*/} not found: ${wrapper}"
    exit 1
done

# Destination for the trimmed alignments and the summary table.
mkdir -p "${OUTPUT_DIR}"
|
||||
|
||||
# Collect all FASTA files (*.fas / *.fasta) located directly inside the
# alignment directory.
#
# The paths are read NUL-delimited instead of relying on word splitting of
# $(find ...), so file names containing spaces or glob characters are kept
# intact. The -name tests are grouped explicitly because -print0 would
# otherwise bind only to the second alternative of the -o expression.
ALIGNMENTS=()
while IFS= read -r -d '' alignment_path; do
    ALIGNMENTS+=("${alignment_path}")
done < <(find "${ALIGNMENT_DIR}" -maxdepth 1 \( -name "*.fas" -o -name "*.fasta" \) -print0)

if [ ${#ALIGNMENTS[@]} -eq 0 ]; then
    echo "ERROR: No FASTA files found in ${ALIGNMENT_DIR}" >&2
    exit 1
fi

echo "Found ${#ALIGNMENTS[@]} alignments to process"
echo "Aliscore options: ${ALISCORE_OPTS}"
echo "ALICUT options: ${ALICUT_OPTS}"
echo ""
|
||||
|
||||
# Start a fresh tab-separated summary table containing only the header row.
SUMMARY_FILE="${OUTPUT_DIR}/trimming_summary.txt"
printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
    Locus Original_Length Trimmed_Length Removed_Positions Percent_Removed RSS_Count \
    > "${SUMMARY_FILE}"

# Running tallies updated while the alignments are processed.
FAIL_COUNT=0
SUCCESS_COUNT=0
|
||||
|
||||
# Print the number of alignment columns of the first record in a FASTA file.
#   $1 - path to the FASTA file (sequences may be wrapped over several lines)
# The count is done entirely inside awk: whitespace (including a trailing CR)
# is dropped, the newline is never counted, and awk stops reading as soon as
# the second header is reached, so no `| head` stage can trigger SIGPIPE
# under `set -o pipefail`.
first_sequence_length() {
    awk '
        /^>/ { if (seen) exit; seen = 1; next }
        seen { gsub(/[[:space:]]/, ""); len += length($0) }
        END  { print len + 0 }
    ' "$1"
}

for ALIGNMENT in "${ALIGNMENTS[@]}"; do
    # Locus name = file name without .fas/.fasta. This mirrors how
    # run_aliscore.sh names its per-alignment output directory.
    LOCUS=$(basename "${ALIGNMENT}" .fas)
    LOCUS=$(basename "${LOCUS}" .fasta)

    echo "=========================================="
    echo "Processing: ${LOCUS}"
    echo "=========================================="

    # Step 1: Run Aliscore
    echo ""
    echo "Step 1/2: Running Aliscore..."

    # ALISCORE_OPTS is a whitespace-separated option string and is left
    # unquoted on purpose so that it splits into separate arguments.
    # shellcheck disable=SC2086
    if bash "${RUN_ALISCORE}" "${ALIGNMENT}" -d "${ALISCORE_BASE_DIR}" ${ALISCORE_OPTS}; then
        echo "Aliscore completed for ${LOCUS}"
    else
        echo "ERROR: Aliscore failed for ${LOCUS}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # Step 2: Run ALICUT
    echo ""
    echo "Step 2/2: Running ALICUT..."

    ALISCORE_DIR="${ALISCORE_BASE_DIR}/aliscore_${LOCUS}"

    if [ ! -d "${ALISCORE_DIR}" ]; then
        echo "ERROR: Aliscore output directory not found: ${ALISCORE_DIR}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # ALICUT_OPTS is intentionally unquoted for the same reason as above.
    # shellcheck disable=SC2086
    if bash "${RUN_ALICUT}" "${ALISCORE_DIR}" ${ALICUT_OPTS}; then
        echo "ALICUT completed for ${LOCUS}"
    else
        echo "ERROR: ALICUT failed for ${LOCUS}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # Locate the trimmed alignment written by ALICUT. Sorting makes the
    # choice deterministic should more than one candidate exist.
    TRIMMED_FILE=$(find "${ALISCORE_DIR}" \( -name "ALICUT_*.fas" -o -name "ALICUT_*.fasta" \) | sort | head -n 1)

    if [ -n "${TRIMMED_FILE}" ] && [ -f "${TRIMMED_FILE}" ]; then
        cp "${TRIMMED_FILE}" "${OUTPUT_DIR}/${LOCUS}_trimmed.fas"
        echo "Trimmed alignment: ${OUTPUT_DIR}/${LOCUS}_trimmed.fas"

        # Alignment lengths before and after trimming (first record of each file).
        ORIGINAL_LENGTH=$(first_sequence_length "${ALIGNMENT}")
        TRIMMED_LENGTH=$(first_sequence_length "${TRIMMED_FILE}")
        REMOVED_LENGTH=$((ORIGINAL_LENGTH - TRIMMED_LENGTH))
        # Values are passed with -v rather than spliced into the awk program;
        # an empty original alignment yields 0.00 instead of a division error.
        PERCENT_REMOVED=$(awk -v removed="${REMOVED_LENGTH}" -v total="${ORIGINAL_LENGTH}" \
            'BEGIN { printf "%.2f", (total > 0) ? (removed / total) * 100 : 0 }')

        # Count RSS positions (whitespace-separated entries of the list file);
        # report 0 when Aliscore produced no list file.
        LIST_FILE=$(find "${ALISCORE_DIR}" -name "*_List_*.txt" | sort | head -n 1)
        if [ -n "${LIST_FILE}" ] && [ -f "${LIST_FILE}" ]; then
            RSS_COUNT=$(wc -w < "${LIST_FILE}" | tr -d ' ')
        else
            RSS_COUNT=0
        fi

        # Append one tab-separated row to the summary table.
        printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
            "${LOCUS}" "${ORIGINAL_LENGTH}" "${TRIMMED_LENGTH}" \
            "${REMOVED_LENGTH}" "${PERCENT_REMOVED}" "${RSS_COUNT}" >> "${SUMMARY_FILE}"

        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    else
        echo "WARNING: Trimmed file not found for ${LOCUS}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
    fi

    echo ""
done
|
||||
|
||||
# Closing banner: success/failure tallies and where the results were written.
cat <<EOF
==========================================
BATCH PROCESSING COMPLETE
==========================================

Successfully processed: ${SUCCESS_COUNT}/${#ALIGNMENTS[@]} alignments
Failed: ${FAIL_COUNT}/${#ALIGNMENTS[@]} alignments

Output directory: ${OUTPUT_DIR}
Trimmed alignments: ${OUTPUT_DIR}/*_trimmed.fas
Summary statistics: ${SUMMARY_FILE}

EOF
|
||||
|
||||
# Display aggregate statistics computed from the summary table (header row
# skipped). Columns used: 2 = original length, 3 = trimmed length,
# 4 = removed positions, 5 = per-locus percent removed.
#
# Two different figures are reported:
#   - the pooled percentage (all removed positions / all original positions)
#   - the mean of the per-locus percentages
# The table is tab-separated, so the field separator is set explicitly to
# keep locus names containing spaces from shifting the columns.
if [ "${SUCCESS_COUNT}" -gt 0 ]; then
    echo "Overall trimming statistics:"
    awk -F'\t' 'NR > 1 {
        total_orig    += $2
        total_trim    += $3
        total_removed += $4
        pct_sum       += $5
        count++
    }
    END {
        if (count > 0) {
            # Guard against an all-empty input instead of dividing by zero.
            pooled_pct = (total_orig > 0) ? (total_removed / total_orig) * 100 : 0
            printf " Total positions before: %d\n", total_orig
            printf " Total positions after: %d\n", total_trim
            printf " Total removed: %d (%.2f%%)\n", total_removed, pooled_pct
            printf " Average per locus: %.2f%% removed\n", pct_sum / count
        }
    }' "${SUMMARY_FILE}"
fi

echo ""
echo "Done!"
|
||||
Reference in New Issue
Block a user