#!/bin/bash
# Quality control report generator for compleasm results
#
# Usage: bash generate_qc_report.sh [output_file.csv]
#
# Reads 01_busco_results/<genome>_compleasm/summary.txt for every matching
# directory and writes one CSV row per genome with the single-copy,
# fragmented, duplicated and missing counts plus a completeness percentage,
# computed as (single-copy + duplicated) / (sum of the four counts) * 100.
#
# Exit status: 0 if at least one genome was written, 1 otherwise.
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)

# Catch typos in variable names. 'set -e' is deliberately not enabled:
# per-genome problems are reported as warnings and the genome is skipped.
set -u

#######################################
# Print the count field of a statistics line.
# The expected line format is "S:80.93%, 2283"; the count is the text after
# the first comma with all whitespace (including a trailing CR) removed.
# Only the first line starting with "LABEL:" is used.
# NOTE(review): if a summary can hold several lineages, only the first one
# is reported -- confirm this is the desired behaviour.
# Arguments: $1 - label (S, D, F or M), $2 - path to summary.txt
# Outputs:   the count on stdout, or nothing if no such line exists
#######################################
extract_count() {
  local label=$1
  local file=$2
  awk -F',' -v prefix="${label}:" '
    index($0, prefix) == 1 { gsub(/[[:space:]]/, "", $2); print $2; exit }
  ' "${file}"
}

#######################################
# Succeed only if $1 is a non-empty string of decimal digits.
#######################################
is_count() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

#######################################
# Print the completeness percentage with two decimals.
# Values are passed with -v rather than interpolated into the awk program,
# and the C locale keeps '.' as the decimal separator inside the CSV.
# Arguments: $1 single-copy, $2 duplicated, $3 fragmented, $4 missing
# Outputs:   e.g. "76.58"; nothing when the four counts sum to zero
#######################################
completeness_pct() {
  LC_ALL=C awk -v s="$1" -v d="$2" -v f="$3" -v m="$4" 'BEGIN {
    total = s + d + f + m
    if (total > 0) printf "%.2f", (s + d) / total * 100
  }'
}

#######################################
# Build the QC report.
# Arguments: $1 - output CSV path (default: qc_report.csv)
# Outputs:   CSV file; progress on stdout; warnings/errors on stderr
# Returns:   0 if at least one genome was written, 1 otherwise
#######################################
main() {
  local output_file="${1:-qc_report.csv}"
  local results_dir="01_busco_results"
  local count=0
  local dir genome summary
  local complete duplicated fragmented missing completeness

  if ! echo "Genome,Complete_SCO,Fragmented,Duplicated,Missing,Completeness(%)" > "${output_file}"; then
    echo "Error: Cannot write to ${output_file}" >&2
    return 1
  fi

  for dir in "${results_dir}"/*_compleasm; do
    # An unmatched glob stays literal, so this also covers the no-match case.
    [[ -d "${dir}" ]] || continue

    genome=$(basename "${dir}" _compleasm)
    summary="${dir}/summary.txt"

    if [[ ! -f "${summary}" ]]; then
      echo "Warning: Summary file not found for ${genome}" >&2
      continue
    fi

    # compleasm uses: S: (single-copy), D: (duplicated), F: (fragmented),
    # M: (missing)
    complete=$(extract_count S "${summary}")
    duplicated=$(extract_count D "${summary}")
    fragmented=$(extract_count F "${summary}")
    missing=$(extract_count M "${summary}")

    # All four values must be plain integers before any arithmetic is done.
    if ! is_count "${complete}" || ! is_count "${duplicated}" \
      || ! is_count "${fragmented}" || ! is_count "${missing}"; then
      echo "Warning: Could not parse statistics for ${genome}" >&2
      continue
    fi

    completeness=$(completeness_pct "${complete}" "${duplicated}" \
      "${fragmented}" "${missing}")
    if [[ -z "${completeness}" ]]; then
      # All counts are zero: a percentage is undefined, so skip the genome
      # instead of dividing by zero.
      echo "Warning: Could not parse statistics for ${genome}" >&2
      continue
    fi

    echo "${genome},${complete},${fragmented},${duplicated},${missing},${completeness}" >> "${output_file}"
    count=$((count + 1))
  done

  if [[ "${count}" -eq 0 ]]; then
    echo "Error: No compleasm output directories found (*_compleasm)" >&2
    return 1
  fi

  echo "QC report generated: ${output_file}"
  echo "Genomes analyzed: ${count}"
}

# main returns instead of exiting, so as the last command its status becomes
# the exit status of the script.
main "$@"