Initial commit

2025-11-29 18:02:37 +08:00
commit c1d9dee646
38 changed files with 11210 additions and 0 deletions
--- a/skills/phylo_from_buscos/templates/slurm/02_compleasm_first.job
+++ b/skills/phylo_from_buscos/templates/slurm/02_compleasm_first.job
@@ -0,0 +1,41 @@
+#!/bin/bash
+#SBATCH --job-name=compleasm_first
+#SBATCH --cpus-per-task=TOTAL_THREADS  # Replace with total available CPUs (e.g., 64)
+#SBATCH --mem-per-cpu=6G
+#SBATCH --time=24:00:00
+#SBATCH --output=logs/compleasm_first.%j.out
+#SBATCH --error=logs/compleasm_first.%j.err
+#
+# Runs compleasm on the first genome listed in genome_list.txt using all CPUs
+# allocated to the job. Replace the TOTAL_THREADS and LINEAGE placeholders
+# before submitting.
+#
+# NOTE: Slurm opens the --output/--error files before this script starts and
+# does not create missing directories, so run `mkdir -p logs` before sbatch.
+
+source ~/.bashrc
+conda activate phylo
+
+# Strict mode is enabled only after the environment is loaded: ~/.bashrc and
+# conda activation hooks may not be clean under -e/-u.
+set -euo pipefail
+
+mkdir -p logs 01_busco_results
+
+# Process FIRST genome only (downloads lineage database)
+first_genome=$(head -n 1 genome_list.txt)
+# Abort with a clear message when the first line of the list is blank.
+: "${first_genome:?genome_list.txt has no genome on its first line}"
+genome_name=$(basename "${first_genome}" .fasta)
+echo "Processing first genome: ${genome_name} with ${SLURM_CPUS_PER_TASK} threads..."
+echo "This will download the BUSCO lineage database for subsequent runs."
+
+compleasm run \
+  -a "${first_genome}" \
+  -o "01_busco_results/${genome_name}_compleasm" \
+  -l LINEAGE \
+  -t "${SLURM_CPUS_PER_TASK}"
+
+# Only reached when compleasm exited successfully (set -e above).
+echo "First genome complete! Lineage database is now cached."
+echo "Submit the parallel job for remaining genomes: sbatch 02_compleasm_parallel.job"
--- a/skills/phylo_from_buscos/templates/slurm/02_compleasm_parallel.job
+++ b/skills/phylo_from_buscos/templates/slurm/02_compleasm_parallel.job
@@ -0,0 +1,38 @@
+#!/bin/bash
+#SBATCH --job-name=compleasm_parallel
+#SBATCH --array=2-NUM_GENOMES  # Start from genome 2 (first genome already processed)
+#SBATCH --cpus-per-task=THREADS_PER_JOB  # e.g., 16 for 64-core system with 4 concurrent jobs
+#SBATCH --mem-per-cpu=6G
+#SBATCH --time=48:00:00
+#SBATCH --output=logs/compleasm.%A_%a.out
+#SBATCH --error=logs/compleasm.%A_%a.err
+#
+# Array job: task N runs compleasm on line N of genome_list.txt. Replace the
+# NUM_GENOMES, THREADS_PER_JOB and LINEAGE placeholders before submitting.
+#
+# NOTE: Slurm does not create the logs/ directory named in --output/--error;
+# create it (mkdir -p logs) before running sbatch.
+
+source ~/.bashrc
+conda activate phylo
+
+# Strict mode is enabled only after the environment is loaded: ~/.bashrc and
+# conda activation hooks may not be clean under -e/-u.
+set -euo pipefail
+
+mkdir -p 01_busco_results
+
+# Get genome for this array task (skipping the first one)
+genome=$(sed -n "${SLURM_ARRAY_TASK_ID}p" genome_list.txt)
+# sed prints nothing when the task ID is past the end of the list (or the
+# line is blank); stop here instead of launching compleasm with no input.
+: "${genome:?no entry in genome_list.txt for array task ${SLURM_ARRAY_TASK_ID}}"
+genome_name=$(basename "${genome}" .fasta)
+
+echo "Processing ${genome_name} with ${SLURM_CPUS_PER_TASK} threads..."
+
+compleasm run \
+  -a "${genome}" \
+  -o "01_busco_results/${genome_name}_compleasm" \
+  -l LINEAGE \
+  -t "${SLURM_CPUS_PER_TASK}"
--- a/skills/phylo_from_buscos/templates/slurm/08a_partition_search.job
+++ b/skills/phylo_from_buscos/templates/slurm/08a_partition_search.job
@@ -0,0 +1,42 @@
+#!/bin/bash
+#SBATCH --job-name=iqtree_partition
+#SBATCH --cpus-per-task=18
+#SBATCH --mem-per-cpu=4G
+#SBATCH --time=72:00:00
+#SBATCH --output=logs/partition_search.out
+#SBATCH --error=logs/partition_search.err
+#
+# Runs IQ-TREE in TESTMERGEONLY mode on FcC_supermatrix.fas with the
+# partitions from partition_def.txt, both read from 06_concatenation/.
+# Replace the MODEL_SET placeholder before submitting.
+#
+# NOTE: logs/ is relative to the directory sbatch is run from, not
+# 06_concatenation/, and Slurm does not create it; run `mkdir -p logs` first.
+
+source ~/.bashrc
+conda activate phylo
+
+# Strict mode is enabled only after the environment is loaded: ~/.bashrc and
+# conda activation hooks may not be clean under -e/-u.
+set -euo pipefail
+
+# With set -e a missing directory aborts the job instead of letting iqtree
+# run from the wrong location.
+cd 06_concatenation  # Use organized directory structure
+
+# NOTE(review): TESTMERGEONLY is documented as a model-selection-only mode;
+# confirm whether -bb/-alrt have any effect here or belong in a later step.
+iqtree \
+  -s FcC_supermatrix.fas \
+  -spp partition_def.txt \
+  -nt "${SLURM_CPUS_PER_TASK}" \
+  -safe \
+  -pre partition_search \
+  -m TESTMERGEONLY \
+  -mset MODEL_SET \
+  -msub nuclear \
+  -rcluster 10 \
+  -bb 1000 \
+  -alrt 1000
+
+# Output: partition_search.best_scheme.nex
--- a/skills/phylo_from_buscos/templates/slurm/08c_gene_trees_array.job
+++ b/skills/phylo_from_buscos/templates/slurm/08c_gene_trees_array.job
@@ -0,0 +1,50 @@
+#!/bin/bash
+#SBATCH --job-name=iqtree_genes
+#SBATCH --array=1-NUM_LOCI
+#SBATCH --cpus-per-task=1
+#SBATCH --mem-per-cpu=4G
+#SBATCH --time=2:00:00
+#SBATCH --output=logs/%A_%a.genetree.out
+#
+# Array job: task N runs IQ-TREE on the N-th alignment listed in
+# trimmed_aa/locus_alignments.txt (created on first use from *_trimmed.fas).
+# Replace the NUM_LOCI placeholder before submitting.
+#
+# NOTE: Slurm does not create the logs/ directory named in --output; create
+# it (mkdir -p logs) in the submission directory before running sbatch.
+
+source ~/.bashrc
+conda activate phylo
+
+# Strict mode is enabled only after the environment is loaded: ~/.bashrc and
+# conda activation hooks may not be clean under -e/-u.
+set -euo pipefail
+
+cd trimmed_aa
+
+# Create list of alignments if not present
+# Many array tasks can reach this point at once, so each one builds the list
+# in its own temp file and renames it into place; the rename is atomic, so no
+# task ever reads a half-written list. The shell glob replaces parsing `ls`.
+if [[ ! -f locus_alignments.txt ]]; then
+    tmp_list="locus_alignments.txt.tmp.${SLURM_JOB_ID}"
+    printf '%s\n' *_trimmed.fas > "${tmp_list}"
+    mv -f "${tmp_list}" locus_alignments.txt
+fi
+
+locus=$(sed -n "${SLURM_ARRAY_TASK_ID}p" locus_alignments.txt)
+# Fail early when the task ID has no matching line or the file is missing
+# (e.g. NUM_LOCI too large, or no *_trimmed.fas files were found).
+if [[ ! -f "${locus}" ]]; then
+    echo "ERROR: no alignment for array task ${SLURM_ARRAY_TASK_ID}: '${locus}'" >&2
+    exit 1
+fi
+
+iqtree \
+  -s "${locus}" \
+  -m MFP \
+  -bb 1000 \
+  -bnni \
+  -czb \
+  -pre "$(basename "${locus}" _trimmed.fas)" \
+  -nt 1