Files
2025-11-29 18:02:37 +08:00

60 lines
2.1 KiB
Bash
Executable File

#!/bin/bash
# Quality control report generator for compleasm results
#
# Usage: bash generate_qc_report.sh [output_file.csv]
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)
# Destination CSV; the first positional argument overrides the default name.
OUTPUT_FILE="${1:-qc_report.csv}"

#######################################
# Print the gene count for one category of a compleasm summary file.
# compleasm writes lines such as "S:80.93%, 2283"; the count is the field
# after the comma.
# Arguments: $1 - category letter (S, D, F or M)
#            $2 - path to the summary file
# Outputs:   the count of the FIRST line starting with "<letter>:", with
#            spaces, tabs and carriage returns removed; prints nothing when
#            no such line exists.
#######################################
extract_count() {
  local category="$1"
  local summary_file="$2"
  LC_ALL=C awk -F',' -v prefix="${category}:" '
    index($0, prefix) == 1 {
      gsub(/[ \t\r]/, "", $2)
      print $2
      exit
    }
  ' "${summary_file}"
}

#######################################
# Succeed only if every argument is a non-empty string of decimal digits.
# Arguments: $@ - values to validate
# Returns:   0 when all values are valid counts, 1 otherwise
#######################################
all_counts() {
  local value
  for value in "$@"; do
    [[ "${value}" =~ ^[0-9]+$ ]] || return 1
  done
  return 0
}

#######################################
# Print (single-copy + duplicated) / total * 100 with two decimals.
# Arguments: $1 single-copy, $2 duplicated, $3 fragmented, $4 missing
# Outputs:   the percentage, e.g. "81.92" (no trailing newline)
# Returns:   non-zero when the four counts sum to zero
#######################################
percent_complete() {
  # awk is already required for parsing, so it is used unconditionally here.
  # The previous bc path computed a / b * 100 under scale=2, which truncates
  # the quotient to two decimals BEFORE multiplying and therefore always
  # reported whole-number percentages. Multiplying first avoids that.
  # LC_ALL=C keeps "." as the decimal separator so the CSV stays valid.
  LC_ALL=C awk -v s="$1" -v d="$2" -v f="$3" -v m="$4" 'BEGIN {
    total = s + d + f + m
    if (total == 0) exit 1
    printf "%.2f", (s + d) * 100 / total
  }'
}

# Start the report with its header row; stop early if the path is unwritable.
if ! printf '%s\n' "Genome,Complete_SCO,Fragmented,Duplicated,Missing,Completeness(%)" > "${OUTPUT_FILE}"; then
  echo "Error: Cannot write to ${OUTPUT_FILE}" >&2
  exit 1
fi

count=0      # genomes successfully written to the report
dirs_seen=0  # *_compleasm directories encountered, parsed or not

for dir in 01_busco_results/*_compleasm; do
  # An unmatched glob is left as a literal string, so skip non-directories.
  [[ -d "${dir}" ]] || continue
  dirs_seen=$((dirs_seen + 1))

  genome=$(basename "${dir}" _compleasm)
  summary="${dir}/summary.txt"

  if [[ ! -f "${summary}" ]]; then
    echo "Warning: Summary file not found for ${genome}" >&2
    continue
  fi

  # compleasm uses: S: (single-copy), D: (duplicated), F: (fragmented),
  # M: (missing). Format: "S:80.93%, 2283" where we need the count (2283).
  complete=$(extract_count S "${summary}")
  duplicated=$(extract_count D "${summary}")
  fragmented=$(extract_count F "${summary}")
  missing=$(extract_count M "${summary}")

  # All four values must be plain integers; otherwise the row would be
  # malformed and the percentage could not be computed.
  if ! all_counts "${complete}" "${duplicated}" "${fragmented}" "${missing}"; then
    echo "Warning: Could not parse statistics for ${genome}" >&2
    continue
  fi

  # Completeness = (single-copy + duplicated) / total * 100.
  if ! completeness=$(percent_complete "${complete}" "${duplicated}" "${fragmented}" "${missing}"); then
    echo "Warning: Gene counts sum to zero for ${genome}" >&2
    continue
  fi

  printf '%s,%s,%s,%s,%s,%s\n' \
    "${genome}" "${complete}" "${fragmented}" "${duplicated}" "${missing}" "${completeness}" \
    >> "${OUTPUT_FILE}"
  count=$((count + 1))
done

if [[ "${count}" -eq 0 ]]; then
  if [[ "${dirs_seen}" -eq 0 ]]; then
    echo "Error: No compleasm output directories found (*_compleasm)" >&2
  else
    # Directories exist, but every one of them was skipped with a warning.
    echo "Error: None of the ${dirs_seen} compleasm directories yielded usable statistics" >&2
  fi
  exit 1
fi

echo "QC report generated: ${OUTPUT_FILE}"
echo "Genomes analyzed: ${count}"