#!/bin/bash
# Extract and reorganize single-copy orthologs from compleasm output
#
# Usage: bash extract_orthologs.sh LINEAGE_NAME
# Example: bash extract_orthologs.sh metazoa
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)

# The lineage name is the only required argument. A missing or empty value is
# rejected up front because it is later used to build the OrthoDB directory
# glob ("<LINEAGE>_odb*").
if [[ $# -lt 1 || -z "$1" ]]; then
  # Usage text is a diagnostic here, so it goes to stderr.
  echo "Usage: bash extract_orthologs.sh LINEAGE_NAME" >&2
  echo " Example: bash extract_orthologs.sh metazoa" >&2
  exit 1
fi

# Lineage prefix of the OrthoDB result directory (e.g. "metazoa").
LINEAGE="$1"

echo "Extracting single-copy orthologs for lineage: ${LINEAGE}"
# Create directory for ortholog FASTA files
mkdir -p single_copy_orthologs || exit 1

# Copy each genome's gene_marker.fasta into single_copy_orthologs/, renamed to
# <genome>.fasta, where <genome> is the result directory name without its
# "_compleasm" suffix. Reads LINEAGE, which is set earlier in this script.
count=0
for dir in 01_busco_results/*_compleasm; do
  # Skip anything that is not a directory; this also covers the literal,
  # unexpanded pattern left behind when the glob matches nothing.
  [[ -d "${dir}" ]] || continue

  genome=$(basename "${dir}" _compleasm)

  # Auto-detect the OrthoDB version (odb10, odb11, odb12, etc.).
  # The trailing slash restricts the glob to directories, so a regular file
  # sharing the "<LINEAGE>_odb" prefix can never be picked as the first match.
  odb_dirs=("${dir}/${LINEAGE}_odb"*/)
  if [[ ! -d "${odb_dirs[0]}" ]]; then
    echo " Warning: No OrthoDB directory found for ${genome}" >&2
    continue
  fi
  if (( ${#odb_dirs[@]} > 1 )); then
    # Keep using the first match, but make the choice visible.
    echo " Warning: Multiple OrthoDB directories found for ${genome}; using ${odb_dirs[0]%/}" >&2
  fi

  marker_file="${odb_dirs[0]%/}/gene_marker.fasta"
  if [[ ! -f "${marker_file}" ]]; then
    echo " Warning: Marker file not found for ${genome}" >&2
    continue
  fi

  # Only count the genome when the copy actually succeeded.
  if cp -- "${marker_file}" "single_copy_orthologs/${genome}.fasta"; then
    echo " Extracted: ${genome}"
    count=$((count + 1))
  else
    echo " Warning: Could not copy marker file for ${genome}" >&2
  fi
done

# Nothing copied usually means the lineage name did not match any result
# directory, so stop before the per-locus step.
if (( count == 0 )); then
  echo "Error: No gene_marker.fasta files found. Check lineage name." >&2
  exit 1
fi

echo "Extracted ${count} genomes"
echo ""
echo "Now generating per-locus unaligned FASTA files..."

#######################################
# Split per-species FASTA files into one FASTA file per ortholog.
#
# For every record in the given files, the header text before the first
# underscore is used as the ortholog ID and the record is appended to
# "<ID>.fas" in the current directory. The record header is replaced by the
# species name, i.e. the input file's basename without its ".fasta" suffix.
# Sequences spanning several lines are joined into a single line.
#
# Existing *.fas files in the current directory are removed first, so a
# re-run regenerates the files instead of appending duplicate records.
#
# Arguments: $@ - per-species FASTA files (<species>.fasta)
# Outputs:   writes <ID>.fas files into the current directory
# Returns:   0 on success (or when no files are given), awk's status otherwise
#######################################
split_markers_by_locus() {
  # Without file operands awk would block reading stdin.
  (( $# > 0 )) || return 0

  rm -f -- ./*.fas

  awk 'BEGIN { RS = ">"; FS = "\n" }
  NF > 1 {
    # Ortholog ID: header text before the first underscore.
    split($1, id_parts, "_")
    locus_file = id_parts[1] ".fas"

    # Species: basename of the input file minus a literal, trailing ".fasta".
    n = split(FILENAME, path_parts, "/")
    species = path_parts[n]
    sub(/\.fasta$/, "", species)

    # Join every sequence line so wrapped sequences are not truncated.
    seq = ""
    for (i = 2; i <= NF; i++) {
      seq = seq $i
    }

    print ">" species "\n" seq >> locus_file
    # Close after each write to stay below the open-file limit.
    close(locus_file)
  }' "$@"
}

mkdir -p single_copy_orthologs/unaligned_aa || exit 1
cd single_copy_orthologs/unaligned_aa || exit 1

# Collect the per-species inputs explicitly so an unmatched glob is never
# handed to awk as a literal file name.
species_fastas=()
for fasta in ../*.fasta; do
  [[ -f "${fasta}" ]] && species_fastas+=("${fasta}")
done

if (( ${#species_fastas[@]} > 0 )); then
  split_markers_by_locus "${species_fastas[@]}" || exit 1
else
  echo "Warning: No per-species FASTA files found in single_copy_orthologs/" >&2
fi

# Count the generated locus files with a glob instead of parsing ls output.
num_loci=0
for locus in *.fas; do
  [[ -e "${locus}" ]] && num_loci=$((num_loci + 1))
done

echo "Unaligned ortholog files generated: ${num_loci} loci"
echo ""
echo "Output directory: single_copy_orthologs/unaligned_aa/"