Initial commit
This commit is contained in:
59
skills/phylo_from_buscos/scripts/generate_qc_report.sh
Executable file
59
skills/phylo_from_buscos/scripts/generate_qc_report.sh
Executable file
@@ -0,0 +1,59 @@
|
||||
#!/bin/bash
|
||||
# Quality control report generator for compleasm results
|
||||
#
|
||||
# Usage: bash generate_qc_report.sh [output_file.csv]
|
||||
#
|
||||
# Author: Bruno de Medeiros (Field Museum)
|
||||
# Based on tutorials by Paul Frandsen (BYU)
|
||||
|
||||
OUTPUT_FILE="${1:-qc_report.csv}"
|
||||
|
||||
echo "Genome,Complete_SCO,Fragmented,Duplicated,Missing,Completeness(%)" > "${OUTPUT_FILE}"
|
||||
|
||||
count=0
|
||||
for dir in 01_busco_results/*_compleasm; do
|
||||
if [ ! -d "${dir}" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
genome=$(basename "${dir}" _compleasm)
|
||||
summary="${dir}/summary.txt"
|
||||
|
||||
if [ -f "${summary}" ]; then
|
||||
# Parse completeness statistics from compleasm format
|
||||
# compleasm uses: S: (single-copy), D: (duplicated), F: (fragmented), M: (missing)
|
||||
# Format: "S:80.93%, 2283" where we need the count (2283)
|
||||
complete=$(grep "^S:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ')
|
||||
duplicated=$(grep "^D:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ')
|
||||
fragmented=$(grep "^F:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ')
|
||||
missing=$(grep "^M:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ')
|
||||
|
||||
# Check if all values were successfully extracted
|
||||
if [ -z "${complete}" ] || [ -z "${fragmented}" ] || [ -z "${missing}" ]; then
|
||||
echo "Warning: Could not parse statistics for ${genome}" >&2
|
||||
continue
|
||||
fi
|
||||
|
||||
# Calculate completeness percentage (Complete / Total * 100)
|
||||
total=$((complete + duplicated + fragmented + missing))
|
||||
if command -v bc &> /dev/null; then
|
||||
completeness=$(echo "scale=2; (${complete} + ${duplicated}) / ${total} * 100" | bc)
|
||||
else
|
||||
# Fallback if bc not available
|
||||
completeness=$(awk "BEGIN {printf \"%.2f\", (${complete} + ${duplicated}) / ${total} * 100}")
|
||||
fi
|
||||
|
||||
echo "${genome},${complete},${fragmented},${duplicated},${missing},${completeness}" >> "${OUTPUT_FILE}"
|
||||
count=$((count + 1))
|
||||
else
|
||||
echo "Warning: Summary file not found for ${genome}" >&2
|
||||
fi
|
||||
done
|
||||
|
||||
if [ ${count} -eq 0 ]; then
|
||||
echo "Error: No compleasm output directories found (*_compleasm)" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "QC report generated: ${OUTPUT_FILE}"
|
||||
echo "Genomes analyzed: ${count}"
|
||||
Reference in New Issue
Block a user