#!/bin/bash
# Quality control report generator for compleasm results
#
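# Usage: bash generate_qc_report.sh [output_file.csv]
#
# Run from the directory that contains 01_busco_results/; each
# 01_busco_results/<genome>_compleasm/summary.txt becomes one CSV row.
# The output file defaults to qc_report.csv and is overwritten.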
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)

# Destination CSV: the first positional argument, or qc_report.csv when that
# argument is unset or empty.
OUTPUT_FILE=${1:-qc_report.csv}

# Start a fresh report: truncate the destination and write the column header.
printf '%s\n' \
  "Genome,Complete_SCO,Fragmented,Duplicated,Missing,Completeness(%)" \
  > "${OUTPUT_FILE}"

# Print the gene count recorded for one category of a compleasm summary.
# Arguments: $1 - category letter (e.g. S, D, F, M)
#            $2 - path to the summary file
# Outputs:   the second comma-separated field of every line that starts with
#            "<letter>:", with spaces, tabs and carriage returns removed;
#            nothing when no line matches.
# Example:   the line "S:80.93%, 2283" yields 2283.
_qc_category_count() {
  awk -F',' -v tag="$1" '
    index($0, tag ":") == 1 {
      gsub(/[ \t\r]/, "", $2)
      print $2
    }
  ' "$2"
}

# Print (complete + duplicated) / total * 100 formatted with two decimals.
# Arguments: $1 - single-copy count, $2 - duplicated count, $3 - total count
# Returns:   non-zero, printing nothing, when the total is not positive.
# awk is used unconditionally: bc with scale=2 truncates the quotient to two
# decimals *before* the multiplication by 100 (81.99 was reported as 81.00),
# and awk is already required for parsing the summary.
_qc_completeness_pct() {
  awk -v c="$1" -v d="$2" -v n="$3" 'BEGIN {
    if (n <= 0) exit 1
    printf "%.2f", (c + d) / n * 100
  }'
}

# Build the CSV row for one compleasm result directory.
# Arguments: $1 - path to a "<genome>_compleasm" directory
# Outputs:   "genome,complete,fragmented,duplicated,missing,completeness" on
#            stdout; warnings on stderr.
# Returns:   0 when a row was printed, 1 when the directory was skipped.
_qc_report_row() {
  local dir=$1
  local genome summary value total completeness
  local complete duplicated fragmented missing interrupted

  genome=$(basename "${dir}" _compleasm)
  summary="${dir}/summary.txt"

  if [[ ! -f "${summary}" ]]; then
    echo "Warning: Summary file not found for ${genome}" >&2
    return 1
  fi

  # compleasm writes one line per category, e.g. "S:80.93%, 2283":
  # S (single-copy), D (duplicated), F (fragmented), M (missing).
  complete=$(_qc_category_count S "${summary}")
  duplicated=$(_qc_category_count D "${summary}")
  fragmented=$(_qc_category_count F "${summary}")
  missing=$(_qc_category_count M "${summary}")

  # Some summaries also carry an "I:" line (a second fragmented class). When
  # present it is added to the total so the percentage is taken over every
  # gene; it is not reported as its own column.
  # NOTE(review): confirm against the compleasm version that produced the data.
  interrupted=$(_qc_category_count I "${summary}")
  interrupted=${interrupted:-0}

  # Every count must be a plain non-negative integer. This also rejects a
  # missing line (empty value) and a category that matched more than one line.
  for value in "${complete}" "${duplicated}" "${fragmented}" \
    "${missing}" "${interrupted}"; do
    if [[ ! "${value}" =~ ^[0-9]+$ ]]; then
      echo "Warning: Could not parse statistics for ${genome}" >&2
      return 1
    fi
  done

  # 10# forces base 10 so a zero-padded count is never read as octal.
  total=$((10#${complete} + 10#${duplicated} + 10#${fragmented} \
    + 10#${interrupted} + 10#${missing}))

  if ! completeness=$(_qc_completeness_pct \
    "${complete}" "${duplicated}" "${total}"); then
    echo "Warning: Total gene count is zero for ${genome}" >&2
    return 1
  fi

  printf '%s\n' \
    "${genome},${complete},${fragmented},${duplicated},${missing},${completeness}"
}

# Append one row per parsable result directory; count holds the rows written.
count=0
for dir in 01_busco_results/*_compleasm; do
  # An unmatched glob stays literal, so anything that is not a directory is
  # skipped.
  [[ -d "${dir}" ]] || continue

  if qc_row=$(_qc_report_row "${dir}"); then
    printf '%s\n' "${qc_row}" >> "${OUTPUT_FILE}"
    count=$((count + 1))
  fi
done

# count is incremented once per row appended to the report, so zero means
# nothing was written beyond the header: report the failure and stop.
if [ "${count}" -eq 0 ]; then
  printf '%s\n' "Error: No compleasm output directories found (*_compleasm)" >&2
  exit 1
fi

# Success summary on stdout: where the report is and how many rows it holds.
printf 'QC report generated: %s\n' "${OUTPUT_FILE}"
printf 'Genomes analyzed: %s\n' "${count}"