Initial commit

Zhongwei Li
2025-11-29 18:02:37 +08:00
commit c1d9dee646
38 changed files with 11210 additions and 0 deletions

.claude-plugin/plugin.json Normal file

@@ -0,0 +1,13 @@
{
"name": "bioinfo-skills",
"description": "Bioinformatics skills",
"version": "0.0.0-2025.11.28",
"author": {
"name": "Bruno de Medeiros",
"email": "bdemedeiros@fieldmuseum.org"
},
"skills": [
"./skills/phylo_from_buscos",
"./skills/biogeobears"
]
}

README.md Normal file

@@ -0,0 +1,3 @@
# bioinfo-skills
Bioinformatics skills

plugin.lock.json Normal file

@@ -0,0 +1,180 @@
{
"$schema": "internal://schemas/plugin.lock.v1.json",
"pluginId": "gh:brunoasm/my_claude_skills:bioinfo-skills",
"normalized": {
"repo": null,
"ref": "refs/tags/v20251128.0",
"commit": "de6d76ce965f0f2f78c1321dfc7afa719683892a",
"treeHash": "08c77edc8edaaeaf8207d518c75c59bc13394fe42a8156c93304b12383037ffb",
"generatedAt": "2025-11-28T10:14:26.129461Z",
"toolVersion": "publish_plugins.py@0.2.0"
},
"origin": {
"remote": "git@github.com:zhongweili/42plugin-data.git",
"branch": "master",
"commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
"repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
},
"manifest": {
"name": "bioinfo-skills",
"description": "Bioinformatics skills"
},
"content": {
"files": [
{
"path": "README.md",
"sha256": "6cc26e7acb732e15c1a1a97a61ce783161dbb2236192c232da5c3c65847e328e"
},
{
"path": ".claude-plugin/plugin.json",
"sha256": "c55fa066bcc713f4ecbef376a15517d0732d3640b6c35edab8510e4c93748641"
},
{
"path": "skills/phylo_from_buscos/README.md",
"sha256": "3408e707f1e19a8924cd6e876f6c69b4a549a6d31826f0f9b587b93605996c1e"
},
{
"path": "skills/phylo_from_buscos/SKILL.md",
"sha256": "d58b91ef6554434d4b86879337546c92bb48223910303bb804136418730f7b9a"
},
{
"path": "skills/phylo_from_buscos/.skillignore",
"sha256": "0b1fece23c5bd3298ec9ce56df31f957c70b1716753f9acaa74bac61303273c0"
},
{
"path": "skills/phylo_from_buscos/references/REFERENCE.md",
"sha256": "1820a04978907df16ceed0dde829d0ef0655e1b4286b5f334661a6a17e9ff9ae"
},
{
"path": "skills/phylo_from_buscos/scripts/convert_fasconcat_to_partition.py",
"sha256": "69c02d641cbc719f12f859aca9285fdc5b0ac7d6e923e0b5bf0a2b36a297539d"
},
{
"path": "skills/phylo_from_buscos/scripts/extract_orthologs.sh",
"sha256": "a739cd47d5ba5dcb30081cc1f3303821dc98f1c02108005bee896b9a7062b6e7"
},
{
"path": "skills/phylo_from_buscos/scripts/run_aliscore_alicut_batch.sh",
"sha256": "a3926ec3bdf73d3f3cbc6af057275350951927b80301f7a145997b0291d6cde1"
},
{
"path": "skills/phylo_from_buscos/scripts/rename_genomes.py",
"sha256": "5dc102c4384aab940f3ce9348c954c7813f51bd4124c5f85b7b7a28ff215f2da"
},
{
"path": "skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py",
"sha256": "2f91b2564ecb6eee9c37ea9265ff666a5de321ac1d7f6abbdc70a2f3931c59a2"
},
{
"path": "skills/phylo_from_buscos/scripts/run_aliscore.sh",
"sha256": "53e679c3071cf3ed56279250bd731d9a3604e15526c9a0293e0dc1e34bda8931"
},
{
"path": "skills/phylo_from_buscos/scripts/generate_qc_report.sh",
"sha256": "872c5cecb4d9e25f8847baf060f4a8b37a86ab7c3c36f63536ba37be364194bc"
},
{
"path": "skills/phylo_from_buscos/scripts/download_ncbi_genomes.py",
"sha256": "e30960bb620626e52309a3930e0c7769d6b7fea589de4fc17d6f8e4c51832502"
},
{
"path": "skills/phylo_from_buscos/scripts/run_alicut.sh",
"sha256": "2d8762559c60b79924dcab5e0004626489c7aaaf9c9f7058ffb223d3edb1208a"
},
{
"path": "skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore.02.2.pl",
"sha256": "7e046000bb0834b0df8a135256fea6031e5ffdd21333182db18a0886c3bfdb82"
},
{
"path": "skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/ALICUT_V2.31.pl",
"sha256": "7e4a92710c840ea569458c7b6d97806dfe303c3346a2ae3310f0122f2f1496aa"
},
{
"path": "skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore_module.pm",
"sha256": "ee7e2690027925ac012bb097fe6ed367bf00da1425bae9e317f641048c765432"
},
{
"path": "skills/phylo_from_buscos/templates/README.md",
"sha256": "d9661f51a45aab151a07396e8ed4b0a353cf750ee55337ea938cf995469fc78c"
},
{
"path": "skills/phylo_from_buscos/templates/slurm/08a_partition_search.job",
"sha256": "98e58028c41abc524089cc547eabd48039185b4aba873d786be78630a45ab783"
},
{
"path": "skills/phylo_from_buscos/templates/slurm/08c_gene_trees_array.job",
"sha256": "6fb08ac7b60ef2a07b60521c01140aa922c21139d1738ec39978de4ea0bf1f76"
},
{
"path": "skills/phylo_from_buscos/templates/slurm/02_compleasm_parallel.job",
"sha256": "04f01f4d0692847793a78916c53bd98287da19d89f066e113de82772dd847197"
},
{
"path": "skills/phylo_from_buscos/templates/slurm/02_compleasm_first.job",
"sha256": "e6b24626d44f5f122a12cd14147b38b8f60a44ab0d29c38058153dba84bd7b0c"
},
{
"path": "skills/phylo_from_buscos/templates/local/08c_gene_trees_serial.sh",
"sha256": "a7dfb2056da7fa1a0b9d1e645ae34cb8170e0df6708c509b974c7e1e505c1848"
},
{
"path": "skills/phylo_from_buscos/templates/local/08a_partition_search.sh",
"sha256": "0f7e8de34e81fe93dec7940b9c1598b7156caf711e324a6f012748f2623ddd9a"
},
{
"path": "skills/phylo_from_buscos/templates/local/08c_gene_trees_parallel.sh",
"sha256": "aeefc03ff8396b7d28f40e4af726f4db1b2ebc036ba7bcfbcbdeea2cef3ea548"
},
{
"path": "skills/phylo_from_buscos/templates/local/02_compleasm_parallel.sh",
"sha256": "edb379d1f98b9e41326b9c1b82fd0be56a22fc63224393fd7c50642d3931be33"
},
{
"path": "skills/phylo_from_buscos/templates/local/02_compleasm_first.sh",
"sha256": "5be4bd66da9ba8e0e9c4b7b76361246ae162bb95718bd2a1a8a79a41869e9c94"
},
{
"path": "skills/phylo_from_buscos/templates/pbs/08a_partition_search.job",
"sha256": "bc98fa2abd41c07ab9a2608f803441d6c8e39359034d92a96eb46b9288a7929f"
},
{
"path": "skills/phylo_from_buscos/templates/pbs/08c_gene_trees_array.job",
"sha256": "aceb618e7fbd0164baf90142cabb35e9302756f7d74e99e68476e106d1ce3de2"
},
{
"path": "skills/phylo_from_buscos/templates/pbs/02_compleasm_parallel.job",
"sha256": "4d82508e3acada61f8999ed4c89e40f30ed00363a7477b925d6c5fb954d33e4e"
},
{
"path": "skills/phylo_from_buscos/templates/pbs/02_compleasm_first.job",
"sha256": "de597deea9b382b8dcd8663b3c497fe866d353882cd533d8223723653dbe1851"
},
{
"path": "skills/biogeobears/README.md",
"sha256": "85b0ea5e40d2eda96243f0c12c9c5c565b4ca358d20c04f10366a3e0a6ae1961"
},
{
"path": "skills/biogeobears/SKILL.md",
"sha256": "6a872dc71353f89651825f0bcdd14d9e226103345d08380c9d6b4ba7de2df10a"
},
{
"path": "skills/biogeobears/references/biogeobears_details.md",
"sha256": "3665966b9f4cc1b18d6f50806a48b6fbb94d5b4fd7777fc0cd758761ed0a10f2"
},
{
"path": "skills/biogeobears/scripts/validate_geography_file.py",
"sha256": "6559b6f1a9a4c1e06ca033ba792446eb20a2f94ddb5ea072906ac1674a9fa105"
},
{
"path": "skills/biogeobears/scripts/biogeobears_analysis_template.Rmd",
"sha256": "2ab1859e662023e94abae8a1019bfd0b70c5779fee0c608157855042d8573f8f"
}
],
"dirSha256": "08c77edc8edaaeaf8207d518c75c59bc13394fe42a8156c93304b12383037ffb"
},
"security": {
"scannedAt": null,
"scannerVersion": null,
"flags": []
}
}

skills/biogeobears/README.md Normal file

@@ -0,0 +1,222 @@
# BioGeoBEARS Biogeographic Analysis Skill
A Claude skill for setting up and executing phylogenetic biogeographic analyses using BioGeoBEARS in R.
## Overview
This skill automates the complete workflow for biogeographic analysis on phylogenetic trees, from raw data validation to publication-ready visualizations. It helps users reconstruct ancestral geographic ranges by:
- Validating and reformatting input files (phylogenetic tree + geographic distribution data)
- Setting up organized analysis folder structures
- Generating customized RMarkdown analysis scripts
- Guiding parameter selection (maximum range size, model choices)
- Producing visualizations with pie charts and text labels showing ancestral ranges
- Comparing multiple biogeographic models with statistical tests
## When to Use
Use this skill when you need to:
- Reconstruct ancestral geographic ranges on a phylogeny
- Test different biogeographic models (DEC, DIVALIKE, BAYAREALIKE)
- Analyze how species distributions evolved over time
- Determine whether founder-event speciation (+J parameter) is important
- Generate publication-ready biogeographic visualizations
## Required Inputs
Users must provide:
1. **Phylogenetic tree** (Newick format: .nwk, .tre, or .tree)
- Must be rooted
- Tip labels must match species in geography file
- Branch lengths required
2. **Geographic distribution data** (any tabular format)
- Species names matching tree tips
- Presence/absence data for different geographic areas
- Accepts CSV, TSV, Excel, or PHYLIP format
## What the Skill Does
### 1. Data Validation and Reformatting
The skill includes a Python script (`validate_geography_file.py`) that:
- Validates geography file format (PHYLIP-like with specific tab/spacing requirements)
- Checks for common errors (spaces in species names, tab delimiters, binary code length)
- Reformats CSV/TSV files to proper BioGeoBEARS format
- Cross-validates species names against tree tip labels
### 2. Analysis Setup
Creates an organized directory structure:
```
biogeobears_analysis/
├── input/
│ ├── tree.nwk # Phylogenetic tree
│ ├── geography.data # Validated geography file
│ └── original_data/ # Original input files
├── scripts/
│ └── run_biogeobears.Rmd # Customized RMarkdown script
├── results/ # Analysis outputs
│ ├── [MODEL]_result.Rdata # Saved model results
│ └── plots/ # Visualizations
│ ├── [MODEL]_pie.pdf
│ └── [MODEL]_text.pdf
└── README.md # Documentation
```
### 3. RMarkdown Analysis Template
Generates a complete RMarkdown script that:
- Loads and validates input data
- Fits 6 biogeographic models:
- DEC (Dispersal-Extinction-Cladogenesis)
- DEC+J (DEC with founder-event speciation)
- DIVALIKE (vicariance-focused)
- DIVALIKE+J
- BAYAREALIKE (sympatry-focused)
- BAYAREALIKE+J
- Compares models using AIC, AICc, and AIC weights
- Performs likelihood ratio tests for nested models
- Estimates parameters (d=dispersal, e=extinction, j=founder-event rates)
- Generates visualizations on the phylogeny
- Creates HTML report with all results
### 4. Visualization
Produces two types of plots:
- **Pie charts**: Show probability distributions for ancestral ranges (conveys uncertainty)
- **Text labels**: Show maximum likelihood ancestral states (cleaner, easier to read)
Colors represent geographic areas:
- Single areas: Bright primary colors
- Multi-area ranges: Blended colors
- All areas: White
## Workflow
1. **Gather information**: Ask user for tree file, geography file, and parameters
2. **Validate tree**: Check if rooted and extract tip labels
3. **Validate/reformat geography file**: Use validation script to check format or convert from CSV/TSV
4. **Set up analysis folder**: Create organized directory structure
5. **Generate RMarkdown script**: Customize template with user parameters
6. **Create documentation**: Generate README and run scripts
7. **Provide instructions**: Clear steps for running the analysis
## Analysis Parameters
The skill helps users choose:
### Maximum Range Size
- How many areas can a species occupy simultaneously?
- Options: Conservative (# areas - 1), Permissive (all areas), Data-driven (max observed)
- Larger values increase computation time exponentially
### Models to Compare
- Default: All 6 models (recommended for comprehensive comparison)
- Alternative: Only base models or only +J models
- Rationale: Model comparison is key to biogeographic inference
### Visualization Type
- Pie charts (show probabilities and uncertainty)
- Text labels (show most likely states, cleaner)
- Both (default in template)
## Bundled Resources
### scripts/
**validate_geography_file.py**
- Validates BioGeoBEARS geography file format
- Reformats from CSV/TSV to PHYLIP
- Cross-validates with tree tip labels
- Usage: `python validate_geography_file.py --help`
**biogeobears_analysis_template.Rmd**
- Complete RMarkdown analysis template
- Parameterized via YAML header
- Fits all models, compares, and visualizes
- Generates self-contained HTML report
### references/
**biogeobears_details.md**
- Detailed model descriptions (DEC, DIVALIKE, BAYAREALIKE, +J parameter)
- Input file format specifications with examples
- Parameter interpretation guidelines
- Plotting options and customization
- Complete citations for publications
- Computational considerations and troubleshooting
## Example Output
The analysis produces:
- `biogeobears_report.html` - Interactive HTML report with all results
- `[MODEL]_result.Rdata` - Saved R objects for each model
- `plots/[MODEL]_pie.pdf` - Ancestral ranges shown as pie charts on tree
- `plots/[MODEL]_text.pdf` - Ancestral ranges shown as text labels on tree
## Interpretation Guidance
The skill helps users understand:
### Model Selection
- **AIC weights**: Probability each model is best
- **ΔAIC thresholds**: <2 (equivalent), 2-7 (less support), >10 (no support)
### Parameter Estimates
- **d (dispersal)**: Rate of range expansion
- **e (extinction)**: Rate of local extinction
- **j (founder-event)**: Rate of jump dispersal at speciation
- **d/e ratio**: >1 favors expansion, <1 favors contraction
### Statistical Tests
- **LRT p < 0.05**: +J parameter significantly improves fit
- Model uncertainty: Report results from multiple models if weights similar
## Installation Requirements
Users must have:
- R (≥4.0)
- BioGeoBEARS R package
- Supporting R packages: ape, rmarkdown, knitr, kableExtra
- Python 3 (for validation script)
Installation instructions are included in generated README.md files.
## Expected Runtime
**Skill setup time**: 5-10 minutes (file validation and directory setup)
**Analysis runtime** (separate from skill execution):
- Small datasets (<50 tips, ≤5 areas): 10-30 minutes
- Medium datasets (50-100 tips, 5-6 areas): 30-90 minutes
- Large datasets (>100 tips, >5 areas): 1-6 hours
## Common Issues Handled
The skill troubleshoots:
- Species name mismatches between tree and geography file
- Unrooted trees (guides user to root with outgroup)
- Geography file formatting errors (tabs, spaces, binary codes)
- Optimization convergence failures
- Slow runtime with many areas/tips
## Citations
Based on:
- **BioGeoBEARS** package by Nicholas Matzke
- Tutorial resources from http://phylo.wikidot.com/biogeobears
- Example workflows from BioGeoBEARS GitHub repository
## Skill Details
- **Skill Type**: Workflow-based bioinformatics skill
- **Domain**: Phylogenetic biogeography, historical biogeography
- **Output**: Complete analysis setup with scripts, documentation, and ready-to-run workflow
- **Automation Level**: High (validates, reformats, generates all scripts)
- **User Input Required**: File paths and parameter choices via guided questions
## See Also
- [phylo_from_buscos](../phylo_from_buscos/README.md) - Complementary skill for generating phylogenies from genomes

skills/biogeobears/SKILL.md Normal file

@@ -0,0 +1,581 @@
---
name: biogeobears
description: Set up and execute phylogenetic biogeographic analyses using BioGeoBEARS in R. Use when users request biogeographic reconstruction, ancestral range estimation, or want to analyze species distributions on phylogenies. Handles input file validation, data reformatting, RMarkdown workflow generation, and result visualization.
---
# BioGeoBEARS Biogeographic Analysis
## Overview
BioGeoBEARS (BioGeography with Bayesian and Likelihood Evolutionary Analysis in R Scripts) performs probabilistic inference of ancestral geographic ranges on phylogenetic trees. This skill helps set up complete biogeographic analyses by:
1. Validating and reformatting input files (phylogenetic tree and geographic distribution data)
2. Generating organized analysis folder structure
3. Creating customized RMarkdown analysis scripts
4. Guiding users through parameter selection and model choices
5. Producing publication-ready visualizations
## When to Use This Skill
Use this skill when users request:
- "Analyze biogeography on my phylogeny"
- "Reconstruct ancestral ranges for my species"
- "Run BioGeoBEARS analysis"
- "Which areas did my ancestors occupy?"
- "Test biogeographic models (DEC, DIVALIKE, BAYAREALIKE)"
The skill triggers when users mention phylogenetic biogeography, ancestral area reconstruction, or provide tree + distribution data.
## Required Inputs
Users must provide:
1. **Phylogenetic tree** (Newick format, .nwk, .tre, or .tree file)
- Must be rooted
- Tip labels will be matched to geography file
- Branch lengths required
2. **Geographic distribution data** (any tabular format)
- Species names (matching tree tips)
- Presence/absence data for different geographic areas
- Can be CSV, TSV, Excel, or already in PHYLIP format
## Workflow
### Step 1: Gather Information
When a user requests a BioGeoBEARS analysis, ask for:
1. **Input file paths**:
- "What is the path to your phylogenetic tree file?"
- "What is the path to your geographic distribution file?"
2. **Analysis parameters** (if not specified):
- Maximum range size (how many areas can a species occupy simultaneously?)
- Which models to compare (default: all six - DEC, DEC+J, DIVALIKE, DIVALIKE+J, BAYAREALIKE, BAYAREALIKE+J)
- Output directory name (default: "biogeobears_analysis")
Use the AskUserQuestion tool to gather this information efficiently:
```
Example questions:
- "Maximum range size" - options based on number of areas (e.g., for 4 areas: "All 4 areas", "3 areas", "2 areas")
- "Models to compare" - options: "All 6 models (recommended)", "Only base models (DEC, DIVALIKE, BAYAREALIKE)", "Only +J models", "Custom selection"
- "Visualization type" - options: "Pie charts (show probabilities)", "Text labels (show most likely states)", "Both"
```
### Step 2: Validate and Prepare Input Files
#### Validate Tree File
Use the Read tool to check the tree file:
```r
# In R, basic validation:
library(ape)
tr <- read.tree("path/to/tree.nwk")
print(paste("Tips:", length(tr$tip.label)))
print(paste("Rooted:", is.rooted(tr)))
print(tr$tip.label) # Check species names
```
Verify:
- File can be parsed as Newick
- Tree is rooted (if not, ask user which outgroup to use)
- Note the tip labels for geography file validation
#### Validate and Reformat Geography File
Use `scripts/validate_geography_file.py` to validate or reformat the geography file.
**If file is already in PHYLIP format** (starts with numbers):
```bash
python scripts/validate_geography_file.py path/to/geography.txt --validate --tree path/to/tree.nwk
```
This checks:
- Correct tab delimiters
- Species names match tree tips
- Binary codes are correct length
- No spaces in species names or binary codes
**If file is in CSV/TSV format** (needs reformatting):
```bash
python scripts/validate_geography_file.py path/to/distribution.csv --reformat -o geography.data --delimiter ","
```
Or for tab-delimited:
```bash
python scripts/validate_geography_file.py path/to/distribution.txt --reformat -o geography.data --delimiter tab
```
The script will:
- Detect area names from header row
- Convert presence/absence data to binary (handles "1", "present", "TRUE", etc.)
- Remove spaces from species names (replace with underscores)
- Create properly formatted PHYLIP file
**Always validate the reformatted file** before proceeding:
```bash
python scripts/validate_geography_file.py geography.data --validate --tree path/to/tree.nwk
```
### Step 3: Set Up Analysis Folder Structure
Create an organized directory for the analysis:
```
biogeobears_analysis/
├── input/
│ ├── tree.nwk # Original or copied tree
│ ├── geography.data # Validated/reformatted geography file
│ └── original_data/ # Original input files
│ ├── original_tree.nwk
│ └── original_distribution.csv
├── scripts/
│ └── run_biogeobears.Rmd # Generated RMarkdown script
├── results/ # Created by analysis (output directory)
│ ├── [MODEL]_result.Rdata # Saved model results
│ └── plots/ # Visualization outputs
│ ├── [MODEL]_pie.pdf
│ └── [MODEL]_text.pdf
└── README.md # Analysis documentation
```
Create this structure programmatically:
```bash
mkdir -p biogeobears_analysis/input/original_data
mkdir -p biogeobears_analysis/scripts
mkdir -p biogeobears_analysis/results/plots
# Copy files
cp path/to/tree.nwk biogeobears_analysis/input/
cp geography.data biogeobears_analysis/input/
cp original_files biogeobears_analysis/input/original_data/
```
### Step 4: Generate RMarkdown Analysis Script
Use the template at `scripts/biogeobears_analysis_template.Rmd` and customize it with user parameters.
**Copy and customize the template**:
```bash
cp scripts/biogeobears_analysis_template.Rmd biogeobears_analysis/scripts/run_biogeobears.Rmd
```
**Create a parameter file** or modify the YAML header in the Rmd to use the user's specific settings:
Example customization via R code:
```r
# Edit YAML parameters programmatically or provide as params when rendering
rmarkdown::render(
"biogeobears_analysis/scripts/run_biogeobears.Rmd",
params = list(
tree_file = "../input/tree.nwk",
geog_file = "../input/geography.data",
max_range_size = 4,
models = "DEC,DEC+J,DIVALIKE,DIVALIKE+J,BAYAREALIKE,BAYAREALIKE+J",
output_dir = "../results"
),
output_file = "../results/biogeobears_report.html"
)
```
Or create a run script:
```bash
#!/bin/bash
# biogeobears_analysis/run_analysis.sh
cd "$(dirname "$0")/scripts"
R -e "rmarkdown::render('run_biogeobears.Rmd', params = list(
tree_file = '../input/tree.nwk',
geog_file = '../input/geography.data',
max_range_size = 4,
models = 'DEC,DEC+J,DIVALIKE,DIVALIKE+J,BAYAREALIKE,BAYAREALIKE+J',
output_dir = '../results'
), output_file = '../results/biogeobears_report.html')"
```
### Step 5: Create README Documentation
Generate a README.md in the analysis directory explaining:
- What files are present
- How to run the analysis
- What parameters were used
- How to interpret results
Example:
````markdown
# BioGeoBEARS Analysis
## Overview
Biogeographic analysis of [NUMBER] species across [NUMBER] geographic areas.
## Input Data
- **Tree**: `input/tree.nwk` ([NUMBER] tips)
- **Geography**: `input/geography.data` ([NUMBER] species × [NUMBER] areas)
- **Areas**: [A, B, C, ...]
## Parameters
- Maximum range size: [NUMBER]
- Models tested: [LIST]
## Running the Analysis
### Option 1: Using RMarkdown directly
```r
library(rmarkdown)
render("scripts/run_biogeobears.Rmd",
output_file = "../results/biogeobears_report.html")
```
### Option 2: Using the run script
```bash
bash run_analysis.sh
```
## Outputs
Results will be saved in `results/`:
- `biogeobears_report.html` - Full analysis report with visualizations
- `[MODEL]_result.Rdata` - Saved R objects for each model
- `plots/[MODEL]_pie.pdf` - Ancestral range reconstructions (pie charts)
- `plots/[MODEL]_text.pdf` - Ancestral range reconstructions (text labels)
## Interpreting Results
The HTML report includes:
1. **Model Comparison** - AIC scores, AIC weights, best-fit model
2. **Parameter Estimates** - Dispersal (d), extinction (e), founder-event (j) rates
3. **Likelihood Ratio Tests** - Statistical comparisons of nested models
4. **Ancestral Range Plots** - Visualizations on phylogeny
5. **Session Info** - R package versions for reproducibility
## Model Descriptions
- **DEC**: Dispersal-Extinction-Cladogenesis (general-purpose)
- **DIVALIKE**: Emphasizes vicariance
- **BAYAREALIKE**: Emphasizes sympatric speciation
- **+J**: Adds founder-event speciation parameter
See `references/biogeobears_details.md` for detailed model descriptions.
## Installation Requirements
```r
# Install BioGeoBEARS
install.packages("rexpokit")
install.packages("cladoRcpp")
library(devtools)
devtools::install_github(repo="nmatzke/BioGeoBEARS")
# Other packages
install.packages(c("ape", "rmarkdown", "knitr", "kableExtra"))
```
````
### Step 6: Provide User Instructions
After setting up the analysis, provide clear instructions to the user:
````
Analysis Setup Complete!
Directory structure created at: biogeobears_analysis/
📁 Files created:
✓ input/tree.nwk - Phylogenetic tree ([N] tips)
✓ input/geography.data - Geographic distribution data (validated)
✓ scripts/run_biogeobears.Rmd - RMarkdown analysis script
✓ README.md - Documentation and instructions
✓ run_analysis.sh - Convenience script to run analysis
📋 Next steps:
1. Review the README.md for analysis details
2. Install BioGeoBEARS if not already installed:
```r
install.packages("rexpokit")
install.packages("cladoRcpp")
library(devtools)
devtools::install_github(repo="nmatzke/BioGeoBEARS")
```
3. Run the analysis:
```bash
cd biogeobears_analysis
bash run_analysis.sh
```
Or in R:
```r
setwd("biogeobears_analysis")
rmarkdown::render("scripts/run_biogeobears.Rmd",
output_file = "../results/biogeobears_report.html")
```
4. View results:
- Open results/biogeobears_report.html in web browser
- Check results/plots/ for PDF visualizations
⏱️ Expected runtime: [ESTIMATE based on tree size]
- Small trees (<50 tips): 5-15 minutes
- Medium trees (50-100 tips): 15-60 minutes
- Large trees (>100 tips): 1-4 hours
💡 The HTML report includes model comparison, parameter estimates, and visualization of ancestral ranges on your phylogeny.
````
## Analysis Parameter Guidance
When users ask for guidance on parameters, consult `references/biogeobears_details.md` and provide recommendations:
### Maximum Range Size
**Ask**: "What's the maximum number of areas a species in your group can realistically occupy?"
Common approaches:
- **Conservative**: Number of areas - 1 (prevents unrealistic cosmopolitan ancestral ranges)
- **Permissive**: All areas (if biologically plausible)
- **Data-driven**: Maximum observed in extant species
**Impact**: Larger values increase computational time exponentially
### Model Selection
**Default recommendation**: Run all 6 models for comprehensive comparison
- DEC, DIVALIKE, BAYAREALIKE (base models)
- DEC+J, DIVALIKE+J, BAYAREALIKE+J (+J variants)
**Rationale**:
- Model comparison is key to inference
- +J parameter is often significant
- Small additional computational cost
If computation is a concern, suggest starting with DEC and DEC+J.
### Visualization Options
**Pie charts** (`plotwhat = "pie"`):
- Show probability distributions across all possible states
- Better for conveying uncertainty
- Can be cluttered with many areas
**Text labels** (`plotwhat = "text"`):
- Show only maximum likelihood state
- Cleaner, easier to read
- Doesn't show uncertainty
**Recommendation**: Generate both in the analysis (template does this automatically)
## Common Issues and Troubleshooting
### Species Name Mismatches
**Symptom**: Error about species in tree not in geography file (or vice versa)
**Solution**: Use the validation script with `--tree` option to identify mismatches (see the R sketch below), then either:
1. Edit the geography file to match tree tip labels
2. Edit tree tip labels to match geography file
3. Remove species that aren't in both
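For a quick look at the mismatches from R, a minimal sketch (it assumes the geography file is already in PHYLIP format so BioGeoBEARS can parse it; `tree.nwk` and `geography.data` are placeholder paths):
```r
library(ape)
library(BioGeoBEARS)

tr <- read.tree("tree.nwk")                                               # placeholder path
tipranges <- getranges_from_LagrangePHYLIP(lgdata_fn = "geography.data")  # placeholder path
geog_species <- rownames(tipranges@df)                                    # species are the row names

setdiff(tr$tip.label, geog_species)  # in the tree but missing from the geography file
setdiff(geog_species, tr$tip.label)  # in the geography file but missing from the tree
```
Any name returned by either `setdiff()` call must be renamed or dropped before the analysis will run.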
### Tree Not Rooted
**Symptom**: Error about unrooted tree
**Solution**:
```r
library(ape)
tr <- read.tree("tree.nwk")
tr <- root(tr, outgroup = "outgroup_species_name")
write.tree(tr, "tree_rooted.nwk")
```
Ask user which species to use as outgroup.
### Formatting Errors in Geography File
**Symptom**: Validation errors about tabs, spaces, or binary codes
**Solution**: Use the reformat option:
```bash
python scripts/validate_geography_file.py input.csv --reformat -o geography.data
```
### Optimization Fails to Converge
**Symptom**: NA values in parameter estimates or very negative log-likelihoods
**Possible causes**:
- Tree and geography data mismatch
- All species in same area (no variation)
- Unrealistic max_range_size
**Solution**: Check input data quality and try a simpler model first (DEC only); the sketch below shows quick checks for the data-related causes.
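A few quick R checks for the data-related causes, assuming the geography file has already been loaded into `tipranges` with `getranges_from_LagrangePHYLIP()` (species as rows, areas as columns, 0/1 coding):
```r
occ <- as.matrix(tipranges@df) == 1   # presence/absence matrix

colSums(occ)             # species per area; an area containing every species carries little signal
rowSums(occ)             # areas per species; each should be >= 1 and <= max_range_size
any(rowSums(occ) == 0)   # TRUE means a species with an all-zero (empty) range
length(unique(apply(occ, 1, paste, collapse = ""))) > 1   # FALSE means every species has the same range
```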
### Very Slow Runtime
**Causes**:
- Large number of areas (>6-7 areas gets slow)
- Large max_range_size
- Many tips (>200)
**Solutions**:
- Reduce max_range_size
- Combine geographic areas if appropriate
- Use `force_sparse = TRUE` in the run object (see the sketch below)
- Run on HPC cluster
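A minimal sketch of how the run-object options above are set (illustrative values; these are the same fields used in the bundled RMarkdown template):
```r
library(BioGeoBEARS)

BioGeoBEARS_run_object <- define_BioGeoBEARS_run()
BioGeoBEARS_run_object$max_range_size <- 3      # smaller state space than allowing all areas
BioGeoBEARS_run_object$force_sparse   <- TRUE   # sparse matrix operations; helps with many areas
BioGeoBEARS_run_object$speedup        <- TRUE
BioGeoBEARS_run_object$use_optimx     <- TRUE
```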
## Resources
This skill includes:
### scripts/
- **validate_geography_file.py** - Validates and reformats geography files
- Checks PHYLIP format compliance
- Validates against tree tip labels
- Reformats from CSV/TSV to PHYLIP
- Usage: `python validate_geography_file.py --help`
- **biogeobears_analysis_template.Rmd** - RMarkdown template for complete analysis
- Model fitting for DEC, DIVALIKE, BAYAREALIKE (with/without +J)
- Model comparison with AIC, AICc, weights
- Likelihood ratio tests
- Parameter visualization
- Ancestral range plotting
- Customizable via YAML parameters
### references/
- **biogeobears_details.md** - Comprehensive reference including:
- Detailed model descriptions
- Input file format specifications
- Parameter interpretation guidelines
- Plotting options and customization
- Citations and further reading
- Computational considerations
Load this reference when:
- Users ask about specific models
- Need to explain parameter estimates
- Troubleshooting complex issues
- Users want detailed methodology for publications
## Best Practices
1. **Always validate input files** before analysis - saves time debugging later
2. **Organize analysis in a dedicated directory** - keeps everything together and reproducible
3. **Run all 6 models by default** - model comparison is crucial for biogeographic inference
4. **Document parameters and decisions** - analysis README helps with reproducibility
5. **Generate both visualization types** - pie charts for uncertainty, text labels for clarity
6. **Save intermediate results** - the RMarkdown template does this automatically
7. **Check parameter estimates** - unrealistic values suggest data or model issues
8. **Provide context with visualizations** - explain what dispersal/extinction rates mean for the user's system
## Output Interpretation
When presenting results to users, explain:
### Model Selection
- **AIC weights** represent probability that each model is best
- **ΔAIC < 2**: Models essentially equivalent
- **ΔAIC 2-7**: Considerably less support
- **ΔAIC > 10**: Essentially no support
### Parameter Estimates
- **d (dispersal rate)**: Higher = more range expansions
- **e (extinction rate)**: Higher = more local extinctions
- **j (founder-event rate)**: Higher = more jump dispersal at speciation
- **Ratio d/e**: > 1 favors expansion, < 1 favors contraction
### Ancestral Ranges
- **Pie charts**: Larger slices = higher probability
- **Colors**: Represent areas (single area = bright color, multiple areas = blended)
- **Node labels**: Most likely ancestral range
- **Split events** (at corners): Range changes at speciation
### Statistical Tests
- **LRT p < 0.05**: +J parameter significantly improves fit
- **High AIC weight** (>0.7): Strong evidence for one model
- **Similar AIC weights**: Model uncertainty - report results from multiple models
## Example Usage
```
User: "I have a phylogeny of 30 bird species and their distributions across 5 islands. Can you help me figure out where their ancestors lived?"
Claude (using this skill):
1. Ask for tree and distribution file paths
2. Validate tree file (check 30 tips, rooted)
3. Validate/reformat geography file (5 areas)
4. Ask about max_range_size (suggest 4 areas)
5. Ask about models (suggest all 6)
6. Set up biogeobears_analysis/ directory structure
7. Copy template RMarkdown script with parameters
8. Generate README.md and run_analysis.sh
9. Provide clear instructions to run analysis
10. Explain expected outputs and how to interpret them
Result: User has complete, ready-to-run analysis with documentation
```
## Attribution
This skill was created based on:
- **BioGeoBEARS** package by Nicholas Matzke
- Tutorial resources from http://phylo.wikidot.com/biogeobears
- Example workflows from the BioGeoBEARS GitHub repository
## Additional Notes
**Time estimate for skill execution**:
- File validation: 1-2 minutes
- Directory setup: < 1 minute
- Total setup time: 5-10 minutes
**Analysis runtime** (separate from skill execution):
- Depends on tree size and number of areas
- Small datasets (<50 tips, ≤5 areas): 10-30 minutes
- Large datasets (>100 tips, >5 areas): 1-6 hours
**Installation requirements** (user must have):
- R (≥4.0)
- BioGeoBEARS R package
- Supporting packages: ape, rmarkdown, knitr, kableExtra
- Python 3 (for validation script)
**When to consult references/**:
- Load `biogeobears_details.md` when users need detailed explanations of models, parameters, or interpretation
- Reference it for troubleshooting complex issues
- Use it to help users write methods sections for publications

skills/biogeobears/references/biogeobears_details.md Normal file

@@ -0,0 +1,358 @@
# BioGeoBEARS Detailed Reference
## Overview
BioGeoBEARS (BioGeography with Bayesian and Likelihood Evolutionary Analysis in R Scripts) is an R package for probabilistic inference of historical biogeography on phylogenetic trees. It implements various models of range evolution and allows statistical comparison between them.
## Installation
```r
# Install dependencies
install.packages("rexpokit")
install.packages("cladoRcpp")
# Install from GitHub
library(devtools)
devtools::install_github(repo="nmatzke/BioGeoBEARS")
```
## Biogeographic Models
BioGeoBEARS implements several models that differ in their assumptions about how species ranges evolve:
### DEC (Dispersal-Extinction-Cladogenesis)
The DEC model is based on LAGRANGE and includes:
- **Anagenetic changes** (along branches):
- `d` (dispersal): Rate of range expansion into adjacent areas
- `e` (extinction): Rate of local extinction in an area
- **Cladogenetic events** (at speciation nodes):
- Vicariance: Ancestral range splits between daughter lineages
- Subset sympatry: One daughter inherits full range, other subset
- Range copying: Both daughters inherit full ancestral range
**Parameters**: 2 (d, e)
**Best for**: General-purpose biogeographic inference
### DIVALIKE (Vicariance-focused)
Similar to DIVA (Dispersal-Vicariance Analysis):
- Emphasizes vicariance at speciation events
- Fixes subset sympatry probability to 0
- Only allows vicariance and range copying at nodes
**Parameters**: 2 (d, e)
**Best for**: Systems where vicariance is the primary speciation mode
### BAYAREALIKE (Sympatry-focused)
Based on the BayArea model:
- Emphasizes sympatric speciation
- Fixes vicariance probability to 0
- Only allows subset sympatry and range copying
**Parameters**: 2 (d, e)
**Best for**: Systems where dispersal and sympatric speciation dominate
### +J Extension (Founder-event speciation)
Any of the above models can include a "+J" parameter:
- **j**: Jump dispersal / founder-event speciation rate
- Allows instantaneous dispersal to a new area at speciation
- Often significantly improves model fit
- Can be controversial (some argue it's biologically unrealistic)
**Examples**: DEC+J, DIVALIKE+J, BAYAREALIKE+J
**Additional parameters**: +1 (j)
## Model Comparison
### AIC (Akaike Information Criterion)
```
AIC = -2 × ln(L) + 2k
```
Where:
- ln(L) = log-likelihood
- k = number of parameters
**Lower AIC = better model**
### AICc (Corrected AIC)
Used when sample size is small relative to parameters:
```
AICc = AIC + (2k² + 2k)/(n - k - 1)
```
### AIC Weights
Probability that a model is the best among the set:
```
w_i = exp(-0.5 × Δ_i) / Σ exp(-0.5 × Δ_j)
```
Where Δ_i = AIC_i - AIC_min
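These quantities are easy to compute directly in R; a toy example with made-up log-likelihoods (the bundled template builds the same table from the fitted models):
```r
lnl <- c(DEC = -55.2, "DEC+J" = -50.1, DIVALIKE = -57.8)   # made-up log-likelihoods
k   <- c(2, 3, 2)                                          # free parameters per model
n   <- 30                                                  # sample size (here, number of tips)

aic     <- -2 * lnl + 2 * k
aicc    <- aic + (2 * k^2 + 2 * k) / (n - k - 1)
delta   <- aic - min(aic)
weights <- exp(-0.5 * delta) / sum(exp(-0.5 * delta))

round(cbind(LnL = lnl, k = k, AIC = aic, AICc = aicc, delta_AIC = delta, weight = weights), 3)
```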
### Likelihood Ratio Test (LRT)
For nested models (e.g., DEC vs DEC+J):
```
LRT = 2 × (ln(L_complex) - ln(L_simple))
```
- Test statistic follows χ² distribution
- df = difference in number of parameters
- p < 0.05 suggests complex model significantly better
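In R, using the same made-up log-likelihoods as above (this mirrors the test performed in the bundled template):
```r
lnl_simple  <- -55.2   # e.g. DEC
lnl_complex <- -50.1   # e.g. DEC+J (one extra parameter, so df = 1)

lrt  <- 2 * (lnl_complex - lnl_simple)
pval <- pchisq(lrt, df = 1, lower.tail = FALSE)
c(LRT = lrt, p = pval)   # p < 0.05 favours keeping the +J parameter
```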
## Input File Formats
### Phylogenetic Tree (Newick format)
Standard Newick format with:
- Branch lengths required
- Tip labels must match geography file
- Should be rooted and ultrametric (for time-stratified analyses)
Example:
```
((A:1.0,B:1.0):0.5,C:1.5);
```
### Geography File (PHYLIP-like format)
**Format structure:**
```
n_species [TAB] n_areas [TAB] (area1 area2 area3 ...)
species1 [TAB] 011
species2 [TAB] 110
species3 [TAB] 001
```
**Important formatting rules:**
1. **Line 1 (Header)**:
- Number of species (integer)
- TAB character
- Number of areas (integer)
- TAB character
- Area names in parentheses, separated by spaces
2. **Subsequent lines (Species data)**:
- Species name (must match tree tip label)
- TAB character
- Binary presence/absence code (1=present, 0=absent)
- NO SPACES in the binary code
- NO SPACES in species names (use underscores)
3. **Common errors to avoid**:
- Using spaces instead of tabs
- Spaces within binary codes
- Species names with spaces
- Mismatch between species names in tree and geography file
- Wrong number of digits in binary code
**Example file:**
```
5 3 (A B C)
Sp_alpha 011
Sp_beta 010
Sp_gamma 111
Sp_delta 100
Sp_epsilon 001
```
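One way to confirm that a file follows these rules is to let BioGeoBEARS parse it; a minimal R sketch that writes the example above to disk and reads it back (the file name is arbitrary):
```r
library(BioGeoBEARS)

example_lines <- c("5\t3\t(A B C)",
                   "Sp_alpha\t011",
                   "Sp_beta\t010",
                   "Sp_gamma\t111",
                   "Sp_delta\t100",
                   "Sp_epsilon\t001")
writeLines(example_lines, "example_geography.data")

# Parses without error only if the header and species lines are formatted correctly
tipranges <- getranges_from_LagrangePHYLIP(lgdata_fn = "example_geography.data")
tipranges@df   # species as rows, areas A/B/C as columns
```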
## Key Parameters and Settings
### max_range_size
Maximum number of areas a species can occupy simultaneously.
- **Default**: Often set to number of areas, or number of areas - 1
- **Impact**: Larger values = more possible states = longer computation (see the sketch below)
- **Recommendation**: Set based on biological realism
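The effect on the state space can be checked with the package's own helper; a small sketch for a hypothetical 6-area dataset:
```r
library(BioGeoBEARS)

# Number of possible range states for 6 areas at each allowed maximum range size
sapply(1:6, function(m) numstates_from_numareas(numareas = 6, maxareas = m,
                                                include_null_range = TRUE))
# The count grows steeply as maxareas approaches the number of areas,
# which is why max_range_size has such a large impact on runtime.
```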
### include_null_range
Whether to include the "null range" (species extinct everywhere).
- **Default**: TRUE
- **Purpose**: Allows extinction along branches
- **Recommendation**: Usually keep TRUE
### force_sparse
Use sparse matrix operations for speed.
- **Default**: FALSE
- **When to use**: Large state spaces (many areas)
- **Note**: May cause numerical issues
### speedup
Various speedup options.
- **Default**: TRUE
- **Recommendation**: Usually keep TRUE
### use_optimx
Use optimx for parameter optimization.
- **Default**: TRUE
- **Benefit**: More robust optimization
- **Recommendation**: Keep TRUE
### calc_ancprobs
Calculate ancestral state probabilities.
- **Default**: FALSE
- **Must set to TRUE** if you want ancestral range estimates
- **Impact**: Adds computational time
## Plotting Functions
### plot_BioGeoBEARS_results()
Main function for visualizing results.
**Key parameters:**
- `plotwhat`: "pie" (probability distributions) or "text" (ML states)
- `tipcex`: Tip label text size
- `statecex`: Node state text/pie chart size
- `splitcex`: Split state text/pie size (at corners)
- `titlecex`: Title text size
- `plotsplits`: Show cladogenetic events (TRUE/FALSE)
- `include_null_range`: Match analysis setting
- `label.offset`: Distance of tip labels from tree
- `cornercoords_loc`: Directory with corner coordinate files
**Color scheme:**
- Single areas: Bright primary colors
- Multi-area ranges: Blended colors
- All areas: White
- Colors automatically assigned and mixed
## Biogeographical Stochastic Mapping (BSM)
Extension of BioGeoBEARS that simulates stochastic histories:
- Generates multiple possible biogeographic histories
- Accounts for uncertainty in ancestral ranges
- Allows visualization of range evolution dynamics
- More computationally intensive
Not covered in basic workflow but available in package.
## Common Analysis Workflow
1. **Prepare inputs**
- Phylogenetic tree (Newick)
- Geography file (PHYLIP format)
- Validate both files
2. **Setup analysis**
- Define max_range_size
- Load tree and geography data
- Create state space
3. **Fit models**
- DEC, DIVALIKE, BAYAREALIKE
- With and without +J
- 6 models total is standard
4. **Compare models**
- AIC/AICc scores
- AIC weights
- LRT for nested comparisons
5. **Visualize best model**
- Pie charts for probabilities
- Text labels for ML states
- Annotate with split events
6. **Interpret results**
- Ancestral ranges
- Dispersal patterns
- Speciation modes (if using +J)
## Interpretation Guidelines
### Dispersal rate (d)
- **High d**: Frequent range expansions
- **Low d**: Species mostly stay in current ranges
- **Units**: Expected dispersal events per lineage per time unit
### Extinction rate (e)
- **High e**: Ranges frequently contract
- **Low e**: Stable occupancy once established
- **Relative to d**: d/e ratio indicates dispersal vs. contraction tendency
### Founder-event rate (j)
- **High j**: Jump dispersal important in clade evolution
- **Low j** (but model still better): Minor role but statistically supported
- **j = 0** (in +J model): Founder events not supported
### Model selection insights
- **DEC favored**: Balanced dispersal, extinction, and vicariance
- **DIVALIKE favored**: Vicariance-driven diversification
- **BAYAREALIKE favored**: Sympatric speciation and dispersal
- **+J improves fit**: Founder-event speciation may be important
## Computational Considerations
### Runtime factors
- **Number of tips**: Polynomial scaling
- **Number of areas**: Exponential scaling in state space
- **max_range_size**: Major impact (reduces state space)
- **Tree depth**: Linear scaling
### Memory usage
- Large trees + many areas can require substantial RAM
- Sparse matrices help but have trade-offs
### Optimization issues
- Complex likelihood surfaces
- Multiple local optima possible
- May need multiple optimization runs
- Check parameter estimates for sensibility
## Citations
**Main BioGeoBEARS reference:**
Matzke, N. J. (2013). Probabilistic historical biogeography: new models for founder-event speciation, imperfect detection, and fossils allow improved accuracy and model-testing. *Frontiers of Biogeography*, 5(4), 242-248.
**LAGRANGE (DEC model origin):**
Ree, R. H., & Smith, S. A. (2008). Maximum likelihood inference of geographic range evolution by dispersal, local extinction, and cladogenesis. *Systematic Biology*, 57(1), 4-14.
**+J parameter discussion:**
Ree, R. H., & Sanmartín, I. (2018). Conceptual and statistical problems with the DEC+J model of founder-event speciation and its comparison with DEC via model selection. *Journal of Biogeography*, 45(4), 741-749.
**Model comparison best practices:**
Burnham, K. P., & Anderson, D. R. (2002). *Model Selection and Multimodel Inference: A Practical Information-Theoretic Approach* (2nd ed.). Springer.
## Further Resources
- **BioGeoBEARS wiki**: http://phylo.wikidot.com/biogeobears
- **GitHub repository**: https://github.com/nmatzke/BioGeoBEARS
- **Google Group**: biogeobears@googlegroups.com
- **Tutorial scripts**: Available in package `inst/extdata/examples/`

skills/biogeobears/scripts/biogeobears_analysis_template.Rmd Normal file

@@ -0,0 +1,404 @@
---
title: "BioGeoBEARS Biogeographic Analysis"
author: "Generated by Claude Code"
date: "`r Sys.Date()`"
output:
html_document:
toc: true
toc_float: true
code_folding: show
theme: flatly
params:
tree_file: "tree.nwk"
geog_file: "geography.data"
max_range_size: 4
models: "DEC,DEC+J,DIVALIKE,DIVALIKE+J"
output_dir: "results"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(BioGeoBEARS)
library(ape)
library(knitr)
library(kableExtra)
```
# Analysis Parameters
```{r parameters, echo=FALSE}
params_df <- data.frame(
Parameter = c("Tree file", "Geography file", "Max range size", "Models to test", "Output directory"),
Value = c(params$tree_file, params$geog_file, params$max_range_size, params$models, params$output_dir)
)
kable(params_df, caption = "Analysis Parameters") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
```
# Input Data
## Phylogenetic Tree
```{r load-tree}
trfn <- params$tree_file
tr <- read.tree(trfn)
cat(paste("Number of tips:", length(tr$tip.label), "\n"))
cat(paste("Tree is rooted:", is.rooted(tr), "\n"))
cat(paste("Tree is ultrametric:", is.ultrametric(tr), "\n"))
# Plot tree
plot(tr, cex = 0.6, main = "Input Phylogeny")
```
## Geographic Distribution Data
```{r load-geography}
geogfn <- params$geog_file
tipranges <- getranges_from_LagrangePHYLIP(lgdata_fn = geogfn)
cat(paste("Number of species:", nrow(tipranges@df), "\n"))
cat(paste("Number of areas:", ncol(tipranges@df), "\n"))
cat(paste("Area names:", paste(names(tipranges@df), collapse = ", "), "\n\n"))
# Display geography matrix
kable(tipranges@df, caption = "Species Distribution Matrix (1 = present, 0 = absent)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), font_size = 10) %>%
scroll_box(height = "400px")
```
## State Space Setup
```{r state-space}
max_range_size <- params$max_range_size
numareas <- ncol(tipranges@df)
num_states <- numstates_from_numareas(numareas = numareas,
maxareas = max_range_size,
include_null_range = TRUE)
cat(paste("Maximum range size:", max_range_size, "\n"))
cat(paste("Number of possible states:", num_states, "\n"))
```
# Model Fitting
```{r setup-output}
# Create output directory
if (!dir.exists(params$output_dir)) {
dir.create(params$output_dir, recursive = TRUE)
}
# Parse models to run
models_to_run <- unlist(strsplit(params$models, ","))
models_to_run <- trimws(models_to_run)
cat("Models to fit:\n")
for (model in models_to_run) {
cat(paste(" -", model, "\n"))
}
```
```{r model-fitting, results='hide'}
# Storage for results
results_list <- list()
model_comparison <- data.frame(
Model = character(),
LnL = numeric(),
nParams = integer(),
AIC = numeric(),
AICc = numeric(),
d = numeric(),
e = numeric(),
j = numeric(),
stringsAsFactors = FALSE
)
# Helper function to setup and run a model
run_biogeobears_model <- function(model_name, BioGeoBEARS_run_object) {
  cat(paste("\n\nFitting model:", model_name, "\n"))
  # Configure the cladogenesis parameters based on the model name.
  # Note: these are simplified settings. DEC is the default of
  # define_BioGeoBEARS_run(), so it needs no changes; the DIVALIKE and
  # BAYAREALIKE variants below only fix the relevant cladogenetic parameter
  # to zero rather than reproducing every constraint of the canonical
  # BioGeoBEARS model setups.
  if (grepl("DIVALIKE", model_name)) {
    # DIVALIKE-style model: no subset sympatry
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["s","type"] = "fixed"
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["s","init"] = 0.0
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["s","est"] = 0.0
  } else if (grepl("BAYAREALIKE", model_name)) {
    # BAYAREALIKE-style model: no vicariance
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["v","type"] = "fixed"
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["v","init"] = 0.0
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["v","est"] = 0.0
  }
  # Add +J (founder-event speciation) parameter if specified
  if (grepl("\\+J", model_name)) {
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","type"] = "free"
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","init"] = 0.01
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","est"] = 0.01
  } else {
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","type"] = "fixed"
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","init"] = 0.0
    BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","est"] = 0.0
  }
  # Run optimization
  res <- bears_optim_run(BioGeoBEARS_run_object)
  return(res)
}
# Base run object setup
BioGeoBEARS_run_object <- define_BioGeoBEARS_run()
BioGeoBEARS_run_object$trfn <- trfn
BioGeoBEARS_run_object$geogfn <- geogfn
BioGeoBEARS_run_object$max_range_size <- max_range_size
BioGeoBEARS_run_object$min_branchlength <- 0.000001
BioGeoBEARS_run_object$include_null_range <- TRUE
BioGeoBEARS_run_object$force_sparse <- FALSE
BioGeoBEARS_run_object$speedup <- TRUE
BioGeoBEARS_run_object$use_optimx <- TRUE
BioGeoBEARS_run_object$calc_ancprobs <- TRUE
BioGeoBEARS_run_object <- readfiles_BioGeoBEARS_run(BioGeoBEARS_run_object)
check_BioGeoBEARS_run(BioGeoBEARS_run_object)  # sanity-check the run setup before fitting
# Fit each model
for (model in models_to_run) {
  tryCatch({
    res <- run_biogeobears_model(model, BioGeoBEARS_run_object)
    results_list[[model]] <- res
    # Save result
    save(res, file = file.path(params$output_dir, paste0(model, "_result.Rdata")))
    # Extract log-likelihood and parameter estimates for comparison.
    # AIC/AICc are computed from the standard formulas (see references/biogeobears_details.md).
    params_table <- res$outputs@params_table
    lnl <- get_LnL_from_BioGeoBEARS_results_object(res)
    n_free <- sum(params_table$type == "free")
    n_tips <- length(tr$tip.label)
    aic <- -2 * lnl + 2 * n_free
    aicc <- aic + (2 * n_free^2 + 2 * n_free) / (n_tips - n_free - 1)
    model_comparison <- rbind(model_comparison, data.frame(
      Model = model,
      LnL = lnl,
      nParams = n_free,
      AIC = aic,
      AICc = aicc,
      d = params_table["d", "est"],
      e = params_table["e", "est"],
      j = params_table["j", "est"],
      stringsAsFactors = FALSE
    ))
  }, error = function(e) {
    cat(paste("Error fitting model", model, ":", e$message, "\n"))
  })
}
```
# Model Comparison
```{r model-comparison}
# Calculate AIC weights
if (nrow(model_comparison) > 0) {
model_comparison$delta_AIC <- model_comparison$AIC - min(model_comparison$AIC)
model_comparison$AIC_weight <- exp(-0.5 * model_comparison$delta_AIC) /
sum(exp(-0.5 * model_comparison$delta_AIC))
# Sort by AIC
model_comparison <- model_comparison[order(model_comparison$AIC), ]
kable(model_comparison, digits = 3,
caption = "Model Comparison (sorted by AIC)") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
row_spec(1, bold = TRUE, background = "#d4edda") # Highlight best model
# Model selection summary
best_model <- model_comparison$Model[1]
cat(paste("\n\nBest model by AIC:", best_model, "\n"))
cat(paste("AIC weight:", round(model_comparison$AIC_weight[1], 3), "\n"))
}
```
# Ancestral Range Reconstruction
## Best Model: `r if(exists('best_model')) best_model else 'TBD'`
```{r plot-best-model, fig.width=10, fig.height=12}
if (exists('best_model') && best_model %in% names(results_list)) {
res_best <- results_list[[best_model]]
# Create plots directory
plots_dir <- file.path(params$output_dir, "plots")
if (!dir.exists(plots_dir)) {
dir.create(plots_dir, recursive = TRUE)
}
# Plot with pie charts
pdf(file.path(plots_dir, paste0(best_model, "_pie.pdf")), width = 10, height = 12)
analysis_titletxt <- paste("BioGeoBEARS:", best_model)
plot_BioGeoBEARS_results(
results_object = res_best,
analysis_titletxt = analysis_titletxt,
addl_params = list("j"),
plotwhat = "pie",
label.offset = 0.5,
tipcex = 0.7,
statecex = 0.7,
splitcex = 0.6,
titlecex = 0.8,
plotsplits = TRUE,
include_null_range = TRUE,
tr = tr,
tipranges = tipranges
)
dev.off()
# Also create text plot
pdf(file.path(plots_dir, paste0(best_model, "_text.pdf")), width = 10, height = 12)
plot_BioGeoBEARS_results(
results_object = res_best,
analysis_titletxt = analysis_titletxt,
addl_params = list("j"),
plotwhat = "text",
label.offset = 0.5,
tipcex = 0.7,
statecex = 0.7,
splitcex = 0.6,
titlecex = 0.8,
plotsplits = TRUE,
include_null_range = TRUE,
tr = tr,
tipranges = tipranges
)
dev.off()
# Display in notebook (pie chart version)
plot_BioGeoBEARS_results(
results_object = res_best,
analysis_titletxt = analysis_titletxt,
addl_params = list("j"),
plotwhat = "pie",
label.offset = 0.5,
tipcex = 0.7,
statecex = 0.7,
splitcex = 0.6,
titlecex = 0.8,
plotsplits = TRUE,
include_null_range = TRUE,
tr = tr,
tipranges = tipranges
)
cat(paste("\n\nPlots saved to:", plots_dir, "\n"))
}
```
# Parameter Estimates
```{r parameter-estimates, fig.width=10, fig.height=6}
if (nrow(model_comparison) > 0) {
# Extract base models (without +J)
base_models <- model_comparison[!grepl("\\+J", model_comparison$Model), ]
j_models <- model_comparison[grepl("\\+J", model_comparison$Model), ]
par(mfrow = c(1, 3))
# Plot d (dispersal) estimates
barplot(model_comparison$d, names.arg = model_comparison$Model,
main = "Dispersal Rate (d)", ylab = "Rate", las = 2, cex.names = 0.8,
col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
# Plot e (extinction) estimates
barplot(model_comparison$e, names.arg = model_comparison$Model,
main = "Extinction Rate (e)", ylab = "Rate", las = 2, cex.names = 0.8,
col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
# Plot j (founder-event) estimates for +J models
j_vals <- model_comparison$j
j_vals[j_vals == 0] <- NA
barplot(j_vals, names.arg = model_comparison$Model,
main = "Founder-event Rate (j)", ylab = "Rate", las = 2, cex.names = 0.8,
col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
}
```
# Likelihood Ratio Tests
```{r lrt-tests}
# Compare models with and without +J
if (nrow(model_comparison) > 0) {
lrt_results <- data.frame(
Comparison = character(),
Model1 = character(),
Model2 = character(),
LRT_statistic = numeric(),
df = integer(),
p_value = numeric(),
stringsAsFactors = FALSE
)
base_model_names <- c("DEC", "DIVALIKE", "BAYAREALIKE")
for (base in base_model_names) {
j_model <- paste0(base, "+J")
if (base %in% model_comparison$Model && j_model %in% model_comparison$Model) {
lnl_base <- model_comparison[model_comparison$Model == base, "LnL"]
lnl_j <- model_comparison[model_comparison$Model == j_model, "LnL"]
lrt_stat <- 2 * (lnl_j - lnl_base)
df <- 1 # One additional parameter (j)
p_val <- pchisq(lrt_stat, df = df, lower.tail = FALSE)
lrt_results <- rbind(lrt_results, data.frame(
Comparison = paste(base, "vs", j_model),
Model1 = base,
Model2 = j_model,
LRT_statistic = lrt_stat,
df = df,
p_value = p_val,
stringsAsFactors = FALSE
))
}
}
if (nrow(lrt_results) > 0) {
lrt_results$Significant <- ifelse(lrt_results$p_value < 0.05, "Yes*", "No")
kable(lrt_results, digits = 4,
caption = "Likelihood Ratio Tests (nested model comparisons)") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
cat("\n* p < 0.05 indicates significant improvement with +J parameter\n")
}
}
```
# Session Info
```{r session-info}
sessionInfo()
```
# Outputs
All results have been saved to: **`r params$output_dir`**
Files generated:
- `[MODEL]_result.Rdata` - R data files with complete model results
- `plots/[MODEL]_pie.pdf` - Phylogeny with pie charts showing ancestral range probabilities
- `plots/[MODEL]_text.pdf` - Phylogeny with text labels showing most likely ancestral ranges
- `biogeobears_report.html` - This HTML report (the file name is set by `output_file` when rendering)
To load a saved result in R:
```r
load("results/DEC+J_result.Rdata")
```

skills/biogeobears/scripts/validate_geography_file.py Normal file

@@ -0,0 +1,299 @@
#!/usr/bin/env python3
"""
Validates and optionally reformats a BioGeoBEARS geography file.
Geography files must follow the PHYLIP-like format:
Line 1: n_species [TAB] n_areas [TAB] (area1 area2 area3 ...)
Lines 2+: species_name [TAB] binary_string (e.g., 011 for absent in area1, present in area2 and area3)
Common errors:
- Spaces instead of tabs
- Spaces in species names
- Spaces within binary strings
- Species names not matching tree tip labels
"""
import sys
import argparse
import re
from pathlib import Path
def validate_geography_file(filepath, tree_tips=None):
    """
    Validate geography file format.

    Args:
        filepath: Path to geography file
        tree_tips: Optional set of tree tip labels to validate against

    Returns:
        dict with validation results and any errors/warnings
    """
    errors = []
    warnings = []
    info = {}

    with open(filepath, 'r') as f:
        lines = [line.rstrip('\n\r') for line in f.readlines()]

    if not lines:
        errors.append("File is empty")
        return {'valid': False, 'errors': errors, 'warnings': warnings, 'info': info}

    # Parse header line
    header = lines[0]
    if '\t' not in header:
        errors.append("Line 1: Missing tab delimiter (should be: n_species [TAB] n_areas [TAB] (area_names))")
    else:
        parts = header.split('\t')
        if len(parts) < 3:
            errors.append("Line 1: Expected format 'n_species [TAB] n_areas [TAB] (area_names)'")
        else:
            try:
                n_species = int(parts[0])
                n_areas = int(parts[1])

                # Parse area names
                area_part = parts[2].strip()
                if not (area_part.startswith('(') and area_part.endswith(')')):
                    errors.append("Line 1: Area names should be in parentheses: (A B C)")
                else:
                    areas = area_part[1:-1].split()
                    if len(areas) != n_areas:
                        errors.append(f"Line 1: Declared {n_areas} areas but found {len(areas)} area names")
                    info['n_species'] = n_species
                    info['n_areas'] = n_areas
                    info['areas'] = areas

                # Validate species lines
                species_found = []
                for i, line in enumerate(lines[1:], start=2):
                    if not line.strip():
                        continue
                    if '\t' not in line:
                        errors.append(f"Line {i}: Missing tab between species name and binary code")
                        continue
                    parts = line.split('\t')
                    if len(parts) != 2:
                        errors.append(f"Line {i}: Expected exactly one tab between species name and binary code")
                        continue
                    species_name = parts[0]
                    binary_code = parts[1]

                    # Check for spaces in species name
                    if ' ' in species_name:
                        errors.append(f"Line {i}: Species name '{species_name}' contains spaces (use underscores instead)")

                    # Check for spaces in binary code
                    if ' ' in binary_code or '\t' in binary_code:
                        errors.append(f"Line {i}: Binary code '{binary_code}' contains spaces or tabs (should be like '011' with no spaces)")

                    # Check binary code length
                    if len(binary_code) != n_areas:
                        errors.append(f"Line {i}: Binary code length ({len(binary_code)}) doesn't match number of areas ({n_areas})")

                    # Check binary code characters
                    if not all(c in '01' for c in binary_code):
                        errors.append(f"Line {i}: Binary code contains invalid characters (only 0 and 1 allowed)")

                    species_found.append(species_name)

                # Check species count
                if len(species_found) != n_species:
                    warnings.append(f"Header declares {n_species} species but found {len(species_found)} data lines")
                info['species'] = species_found

                # Check against tree tips if provided
                if tree_tips:
                    species_set = set(species_found)
                    tree_set = set(tree_tips)
                    missing_in_tree = species_set - tree_set
                    missing_in_geog = tree_set - species_set
                    if missing_in_tree:
                        errors.append(f"Species in geography file but not in tree: {', '.join(sorted(missing_in_tree))}")
                    if missing_in_geog:
                        errors.append(f"Species in tree but not in geography file: {', '.join(sorted(missing_in_geog))}")
            except ValueError:
                errors.append("Line 1: First two fields must be integers (n_species and n_areas)")

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'warnings': warnings,
        'info': info
    }
def reformat_geography_file(input_path, output_path, delimiter=','):
"""
Attempt to reformat a geography file from common formats.
Args:
input_path: Path to input file
output_path: Path for output file
delimiter: Delimiter used in input file (default: comma)
"""
with open(input_path, 'r') as f:
lines = [line.strip() for line in f.readlines()]
# Detect if first line is a header
header_line = lines[0]
has_header = not header_line[0].isdigit()
if has_header:
# Parse area names from header
parts = header_line.split(delimiter)
species_col = parts[0]
area_names = [p.strip() for p in parts[1:]]
data_lines = lines[1:]
else:
# No header, infer from first data line
parts = lines[0].split(delimiter)
n_areas = len(parts) - 1
area_names = [chr(65 + i) for i in range(n_areas)] # A, B, C, ...
data_lines = lines
# Parse species data
species_data = []
for line in data_lines:
if not line:
continue
parts = line.split(delimiter)
if len(parts) < 2:
continue
species_name = parts[0].strip().replace(' ', '_')
presence = ''.join(['1' if p.strip() in ['1', 'present', 'Present', 'TRUE', 'True'] else '0'
for p in parts[1:]])
species_data.append((species_name, presence))
# Write output
with open(output_path, 'w') as f:
# Header line
n_species = len(species_data)
n_areas = len(area_names)
f.write(f"{n_species}\t{n_areas}\t({' '.join(area_names)})\n")
# Species lines
for species_name, binary_code in species_data:
f.write(f"{species_name}\t{binary_code}\n")
print(f"Reformatted {n_species} species across {n_areas} areas")
print(f"Output written to: {output_path}")
def main():
parser = argparse.ArgumentParser(
description='Validate and reformat BioGeoBEARS geography files',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Validate a geography file
python validate_geography_file.py input.txt --validate
# Reformat from CSV to PHYLIP format
python validate_geography_file.py input.csv --reformat -o output.data
# Reformat with tab delimiter
python validate_geography_file.py input.txt --reformat --delimiter tab -o output.data
"""
)
parser.add_argument('input', help='Input geography file')
parser.add_argument('--validate', action='store_true',
help='Validate the file format')
parser.add_argument('--reformat', action='store_true',
help='Reformat file to BioGeoBEARS format')
parser.add_argument('-o', '--output',
help='Output file path (required for --reformat)')
parser.add_argument('--delimiter', default=',',
help='Delimiter in input file (default: comma). Use "tab" for tab-delimited.')
parser.add_argument('--tree',
help='Newick tree file to validate species names against')
args = parser.parse_args()
if args.delimiter.lower() == 'tab':
args.delimiter = '\t'
# Parse tree tips if provided
tree_tips = None
if args.tree:
try:
with open(args.tree, 'r') as f:
tree_string = f.read().strip()
# Extract tip labels using regex
tree_tips = re.findall(r'([^(),:\s]+):', tree_string)
if not tree_tips:
tree_tips = re.findall(r'([^(),:\s]+)[,)]', tree_string)
print(f"Found {len(tree_tips)} tips in tree file")
except Exception as e:
print(f"Warning: Could not parse tree file: {e}")
if args.validate:
result = validate_geography_file(args.input, tree_tips)
print(f"\nValidation Results for: {args.input}")
print("=" * 60)
if result['info']:
print(f"\nFile Info:")
print(f" Species: {result['info'].get('n_species', 'unknown')}")
print(f" Areas: {result['info'].get('n_areas', 'unknown')}")
if 'areas' in result['info']:
print(f" Area names: {', '.join(result['info']['areas'])}")
if result['warnings']:
print(f"\nWarnings ({len(result['warnings'])}):")
for warning in result['warnings']:
print(f" ⚠️ {warning}")
if result['errors']:
print(f"\nErrors ({len(result['errors'])}):")
for error in result['errors']:
print(f"{error}")
else:
print(f"\n✅ File is valid!")
return 0 if result['valid'] else 1
elif args.reformat:
if not args.output:
print("Error: --output required when using --reformat")
return 1
try:
reformat_geography_file(args.input, args.output, args.delimiter)
# Validate reformatted file
result = validate_geography_file(args.output, tree_tips)
if result['valid']:
print("✅ Reformatted file is valid!")
else:
print("\n⚠️ Reformatted file has validation errors:")
for error in result['errors']:
print(f"{error}")
return 1
except Exception as e:
print(f"Error during reformatting: {e}")
return 1
else:
parser.print_help()
return 1
return 0
if __name__ == '__main__':
sys.exit(main())

View File

@@ -0,0 +1,12 @@
# Exclude development materials from skill packaging
info_to_craft_skill/
# Exclude GitHub documentation (not needed in skill package)
README.md
# Exclude local settings
.claude/
# Exclude git files
.git/
.gitignore

View File

@@ -0,0 +1,99 @@
# BUSCO-based Phylogenomics Skill
A Claude Code skill for phylogenomic analyses, created by Bruno de Medeiros (Field Museum) based on code initially written by Paul Frandsen (Brigham Young University).
It generates a complete phylogenetic workflow from genome assemblies using BUSCO/compleasm-based single-copy orthologs.
**Features:**
- Supports local genome files and NCBI accessions (BioProjects/Assemblies)
- Generates scheduler-specific scripts (SLURM, PBS, cloud, local)
- Uses modern tools (compleasm, MAFFT, IQ-TREE, ASTRAL)
- Multiple alignment trimming options
- Both concatenation and coalescent approaches
- Quality control with recommendations
- Writes a draft methods paragraph describing the pipeline for publications
**Use when you need to:**
- Build phylogenetic trees from multiple genome assemblies
- Extract and align single-copy orthologs across genomes
- Download genomes from NCBI by accession
- Generate ready-to-run scripts for your computing environment
## Installation
See the README in the repository root folder for plugin installation.
## Usage
Once installed, simply describe your phylogenomics task:
```
I need to generate a phylogeny from 20 genome assemblies on a SLURM cluster
```
Claude Code will automatically activate the appropriate skill and guide you through the workflow.
## Workflow Overview
The complete phylogenomics pipeline:
1. **Input Preparation** - Download NCBI genomes if needed
2. **Ortholog Identification** - Run compleasm/BUSCO on all genomes
3. **Quality Control** - Assess genome completeness with recommendations
4. **Ortholog Extraction** - Generate per-locus unaligned FASTA files
5. **Alignment** - Align orthologs with MAFFT
6. **Trimming** - Remove poorly aligned regions (Aliscore/ALICUT, trimAl, BMGE, ClipKit)
7. **Concatenation** - Build supermatrix with partition scheme
8. **Phylogenetic Inference** - Generate ML concatenated tree (IQ-TREE), gene trees, and coalescent species tree (ASTRAL)
## Requirements
This skill works best in Claude Code rather than the web interface, since Claude Code can then help you install all requirements.
The skill generates scripts that install and use:
- **compleasm** or BUSCO - ortholog detection
- **MAFFT** - multiple sequence alignment
- **Aliscore/ALICUT, trimAl, BMGE, or ClipKit** - alignment trimming
- **FASconCAT** - alignment concatenation
- **IQ-TREE** - maximum likelihood phylogenetic inference
- **ASTRAL** - coalescent species tree estimation
- **NCBI Datasets CLI** - genome download (if using NCBI accessions)
## Computing Environments
The skill supports multiple computing environments:
- **SLURM clusters** - generates SBATCH array jobs
- **PBS/Torque clusters** - generates PBS array jobs
- **Local machines** - sequential execution scripts
## Attribution
Created by **Bruno de Medeiros** (Curator of Pollinating Insects, Field Museum) based on phylogenomics tutorials by **Paul Frandsen** (Brigham Young University).
## Citation
If you use this skill for published research, please cite this repository and also:
- **compleasm**: Huang, N., & Li, H. (2023). compleasm: a faster and more accurate reimplementation of BUSCO. *Bioinformatics*, 39(10), btad595.
- **MAFFT**: Katoh, K., & Standley, D. M. (2013). MAFFT multiple sequence alignment software version 7. *Molecular Biology and Evolution*, 30(4), 772-780.
- **IQ-TREE**: Minh, B. Q., et al. (2020). IQ-TREE 2: New models and efficient methods for phylogenetic inference. *Molecular Biology and Evolution*, 37(5), 1530-1534.
- **ASTRAL**: Zhang, C., et al. (2018). ASTRAL-III: polynomial time species tree reconstruction. *BMC Bioinformatics*, 19(6), 153.
Plus any trimming tool you use (Aliscore/ALICUT, trimAl, BMGE, or ClipKit).
## License
MIT License - see individual tool licenses for software dependencies.
## Support
For issues or questions:
- Open an issue in this repository
- Contact Bruno de Medeiros at the Field Museum (bdemedeiros@fieldmuseum.org)
## Acknowledgments
Special thanks to Paul Frandsen (BYU) for creating the excellent phylogenomics tutorials that form the foundation of this skill.

View File

@@ -0,0 +1,757 @@
---
name: busco-phylogeny
description: Generate phylogenies from genome assemblies using BUSCO/compleasm-based single-copy orthologs with scheduler-aware workflow generation
---
# BUSCO-based Phylogenomics Workflow Generator
This skill provides phylogenomics expertise for generating comprehensive, scheduler-aware workflows for phylogenetic inference from genome assemblies using single-copy orthologs.
## Purpose
This skill helps users generate phylogenies from genome assemblies by:
1. Handling mixed input (local files and NCBI accessions)
2. Creating scheduler-specific scripts (SLURM, PBS, cloud, local)
3. Setting up complete workflows from raw genomes to final trees
4. Providing quality control and recommendations
5. Supporting flexible software management (bioconda, Docker, custom)
## Available Resources
The skill provides access to these bundled resources:
### Scripts (`scripts/`)
- **`query_ncbi_assemblies.py`** - Query NCBI for available genome assemblies by taxon name (new!)
- **`download_ncbi_genomes.py`** - Download genomes from NCBI using BioProjects or Assembly accessions
- **`rename_genomes.py`** - Rename genome files with meaningful sample names (important!)
- **`generate_qc_report.sh`** - Generate quality control reports from compleasm results
- **`extract_orthologs.sh`** - Extract and reorganize single-copy orthologs
- **`run_aliscore.sh`** - Wrapper for Aliscore to identify randomly similar sequences (RSS)
- **`run_alicut.sh`** - Wrapper for ALICUT to remove RSS positions from alignments
- **`run_aliscore_alicut_batch.sh`** - Batch process all alignments through Aliscore + ALICUT
- **`convert_fasconcat_to_partition.py`** - Convert FASconCAT output to IQ-TREE partition format
- **`predownloaded_aliscore_alicut/`** - Pre-tested Aliscore and ALICUT Perl scripts
### Templates (`templates/`)
- **`slurm/`** - SLURM job scheduler templates
- **`pbs/`** - PBS/Torque job scheduler templates
- **`local/`** - Local machine templates (with GNU parallel)
- **`README.md`** - Complete template documentation
### References (`references/`)
- **`REFERENCE.md`** - Detailed technical reference including:
- Sample naming best practices
- BUSCO lineage datasets (complete list)
- Resource recommendations (memory, CPUs, walltime)
- Detailed step-by-step implementation guides
- Quality control guidelines
- Aliscore/ALICUT detailed guide
- Tool citations and download links
- Software installation guide
- Common issues and troubleshooting
## Workflow Overview
The complete phylogenomics pipeline follows this sequence:
**Input Preparation** → **Ortholog Identification** → **Quality Control** → **Ortholog Extraction** → **Alignment** → **Trimming** → **Concatenation** → **Phylogenetic Inference**
## Initial User Questions
When a user requests phylogeny generation, gather the following information systematically:
### Step 1: Detect Computing Environment
Before asking questions, attempt to detect the local computing environment:
```bash
# Check for job schedulers
command -v sbatch >/dev/null 2>&1 # SLURM
command -v qsub >/dev/null 2>&1 # PBS/Torque
command -v parallel >/dev/null 2>&1 # GNU parallel
```
Report findings to the user, then confirm: **"I detected [X] on this machine. Will you be running the scripts here or on a different system?"**
### Required Information
Ask these questions to gather essential workflow parameters:
1. **Computing Environment**
- Where will these scripts run? (SLURM cluster, PBS/Torque cluster, Cloud computing, Local machine)
2. **Input Data**
- Local genome files, NCBI accessions, or both?
- If NCBI: Do you already have Assembly accessions (GCA_*/GCF_*) or BioProject accessions (PRJNA*/PRJEB*/PRJDA*)?
- If user doesn't have accessions: Offer to help find assemblies using `query_ncbi_assemblies.py` (see "STEP 0A: Query NCBI for Assemblies" below)
- If local files: What are the file paths?
3. **Taxonomic Scope & Dataset Details**
- What taxonomic group? (determines BUSCO lineage dataset)
- How many taxa/genomes will be analyzed?
- What is the approximate phylogenetic breadth? (species-level, genus-level, family-level, order-level, etc.)
- See `references/REFERENCE.md` for complete lineage list
4. **Environment Management**
- Use unified conda environment (default, recommended), or separate environments per tool?
5. **Resource Constraints**
- How many CPU cores/threads to use in total? (Ask user to specify, do not auto-detect)
- Available memory (RAM) per node/machine?
- Maximum walltime for jobs?
- See `references/REFERENCE.md` for resource recommendations
6. **Parallelization Strategy**
Ask the user how they want to handle parallel processing:
- **For job schedulers (SLURM/PBS)**:
- Use array jobs for parallel steps? (Recommended: Yes)
- Which steps to parallelize? (Steps 2, 5, 6, 8C recommended)
- **For local machines**:
- Use GNU parallel for parallel steps? (requires `parallel` installed)
- How many concurrent jobs?
- **For all systems**:
- Optimize for maximum throughput or simplicity?
7. **Scheduler-Specific Configuration** (if using SLURM or PBS)
- Account/Username for compute time charges
- Partition/Queue to submit jobs to
- Email notifications? (address and when: START, END, FAIL, ALL)
- Job dependencies? (Recommended: Yes for linear workflow)
- Output log directory? (Default: `logs/`)
8. **Alignment Trimming Preference**
- Aliscore/ALICUT (traditional, thorough), trimAl (fast), BMGE (entropy-based), or ClipKit (modern)?
9. **Substitution Model Selection** (for IQ-TREE phylogenetic inference)
**Context needed**: Taxonomic breadth, number of taxa, evolutionary rates
**Action**: Fetch IQ-TREE model documentation and suggest appropriate amino acid substitution models based on dataset characteristics.
Use the substitution model recommendation system (see "Substitution Model Recommendation" section below).
10. **Educational Goals**
- Are you learning bioinformatics and would you like comprehensive explanations of each workflow step?
- If yes: After completing each major workflow stage, offer to explain what the step accomplishes, why certain choices were made, and what best practices are being followed.
- Store this preference to use throughout the workflow.
---
## Recommended Directory Structure
Organize analyses with dedicated folders for each pipeline step:
```
project_name/
├── logs/ # All log files
├── 00_genomes/ # Input genome assemblies
├── 01_busco_results/ # BUSCO/compleasm outputs
├── 02_qc/ # Quality control reports
├── 03_extracted_orthologs/ # Extracted single-copy orthologs
├── 04_alignments/ # Multiple sequence alignments
├── 05_trimmed/ # Trimmed alignments
├── 06_concatenation/ # Supermatrix and partition files
├── 07_partition_search/ # Partition model selection
├── 08_concatenated_tree/ # Concatenated ML tree
├── 09_gene_trees/ # Individual gene trees
├── 10_species_tree/ # ASTRAL species tree
└── scripts/ # All analysis scripts
```
**Benefits**: Easy debugging, clear workflow progression, reproducibility, prevents root directory clutter.
---
## Template System
This skill uses a template-based system to reduce token usage and improve maintainability. Script templates are stored in the `templates/` directory and organized by computing environment.
### How to Use Templates
When generating scripts for users:
1. **Read the appropriate template** for their computing environment:
```
Read("templates/slurm/02_compleasm_first.job")
```
2. **Replace placeholders** with user-specific values:
- `TOTAL_THREADS` → e.g., `64`
- `THREADS_PER_JOB` → e.g., `16`
- `NUM_GENOMES` → e.g., `20`
- `NUM_LOCI` → e.g., `2795`
- `LINEAGE` → e.g., `insecta_odb10`
- `MODEL_SET` → e.g., `LG,WAG,JTT,Q.pfam`
3. **Present the customized script** to the user with setup instructions
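A quick way to fill placeholders, shown here only as a sketch with illustrative values (the templates themselves are authoritative), is a `sed` substitution:
```bash
# Fill template placeholders with user-specific values (values illustrative)
sed -e 's/TOTAL_THREADS/64/g' \
    -e 's/THREADS_PER_JOB/16/g' \
    -e 's/NUM_GENOMES/20/g' \
    -e 's/LINEAGE/insecta_odb10/g' \
    templates/slurm/02_compleasm_first.job > scripts/02_compleasm_first.job
```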
### Available Templates
Key templates by workflow step:
- **Step 0 (setup)**: Environment setup script in `references/REFERENCE.md`
- **Step 2 (compleasm)**: `02_compleasm_first`, `02_compleasm_parallel`
- **Step 8A (partition search)**: `08a_partition_search`
- **Step 8C (gene trees)**: `08c_gene_trees_array`, `08c_gene_trees_parallel`, `08c_gene_trees_serial`
See `templates/README.md` for complete template documentation.
---
## Substitution Model Recommendation
When asked about substitution model selection (Question 9), use this systematic approach:
### Step 1: Fetch IQ-TREE Documentation
Use WebFetch to retrieve current model information:
```
WebFetch(url="https://iqtree.github.io/doc/Substitution-Models",
prompt="Extract all amino acid substitution models with descriptions and usage guidelines")
```
### Step 2: Analyze Dataset Characteristics
Consider these factors from user responses:
- **Taxonomic Scope**: Species/genus (shallow) vs. family/order (moderate) vs. class/phylum+ (deep)
- **Number of Taxa**: <20 (small), 20-50 (medium), >50 (large)
- **Evolutionary Rates**: Fast-evolving, moderate, or slow-evolving
- **Sequence Type**: Nuclear proteins, mitochondrial, or chloroplast
### Step 3: Recommend Models
Provide 3-5 appropriate models based on dataset characteristics. For detailed model recommendation matrices and taxonomically-targeted models, see `references/REFERENCE.md` section "Substitution Model Recommendation".
**General recommendations**:
- **Nuclear proteins (most common)**: LG, WAG, JTT, Q.pfam
- **Mitochondrial**: mtREV, mtZOA, mtMAM, mtART, mtVer, mtInv
- **Chloroplast**: cpREV
- **Taxonomically-targeted**: Q.bird, Q.mammal, Q.insect, Q.plant, Q.yeast (when applicable)
### Step 4: Present Recommendations
Format recommendations with justifications and explain how models will be used in IQ-TREE steps 8A and 8C.
### Step 5: Store Model Set
Store the final comma-separated model list (e.g., "LG,WAG,JTT,Q.pfam") for use in Step 8 template placeholders.
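As a rough illustration (the model set and resource values below are examples, and the 08a template remains authoritative), the stored list typically ends up restricting ModelFinder in a call along these lines:
```bash
# Partition-scheme search limited to the recommended model set (values illustrative)
iqtree -s FcC_supermatrix.fas -spp partition_def.txt \
    -m MF+MERGE -mset LG,WAG,JTT,Q.pfam \
    -nt 16 -safe -pre partition_search
```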
---
## Workflow Implementation
Once required information is gathered, guide the user through these steps. For each step, use templates where available and refer to `references/REFERENCE.md` for detailed implementation.
### STEP 0: Environment Setup
**ALWAYS start by generating a setup script** for the user's environment.
Use the unified conda environment setup script from `references/REFERENCE.md` (Section: "Software Installation Guide"). This creates a single conda environment with all necessary tools:
- compleasm, MAFFT, trimming tools (trimAl, ClipKit, BMGE)
- IQ-TREE, ASTRAL, Perl with BioPerl, GNU parallel
- Downloads and installs Aliscore/ALICUT Perl scripts
**Key points**:
- Users choose between mamba (faster) or conda
- Users choose between predownloaded Aliscore/ALICUT scripts (tested) or latest from GitHub
- All subsequent steps use `conda activate phylo` (the unified environment)
See `references/REFERENCE.md` for the complete setup script template.
---
### STEP 0A: Query NCBI for Assemblies (Optional)
**Use this step when**: User wants to use NCBI data but doesn't have specific assembly accessions yet.
This optional preliminary step helps users discover available genome assemblies by taxon name before proceeding with the main workflow.
#### When to Offer This Step
Offer this step when:
- User wants to analyze genomes from NCBI
- User doesn't have specific Assembly or BioProject accessions
- User mentions a taxonomic group (e.g., "I want to build a phylogeny for beetles")
#### Workflow
1. **Ask for focal taxon**: Request the taxonomic group of interest
- Examples: "Coleoptera", "Drosophila", "Apis mellifera"
- Can be at any taxonomic level (order, family, genus, species)
2. **Query NCBI using the script**: Use `scripts/query_ncbi_assemblies.py` to search for assemblies
```bash
# Basic query (returns 20 results by default)
python scripts/query_ncbi_assemblies.py --taxon "Coleoptera"
# Query with more results
python scripts/query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50
# Query for RefSeq assemblies only (higher quality, GCF_* accessions)
python scripts/query_ncbi_assemblies.py --taxon "Apis" --refseq-only
# Save accessions to file for later download
python scripts/query_ncbi_assemblies.py --taxon "Coleoptera" --save assembly_accessions.txt
```
3. **Present results to user**: The script displays:
- Assembly accession (GCA_* or GCF_*)
- Organism name
- Assembly level (Chromosome, Scaffold, Contig)
- Assembly name
4. **Help user select assemblies**: Ask user which assemblies they want to include
- Consider assembly level (Chromosome > Scaffold > Contig)
- Consider phylogenetic breadth (species coverage)
- Consider data quality (RefSeq > GenBank when available)
5. **Collect selected accessions**: Compile the list of chosen assembly accessions
6. **Proceed to STEP 1**: Use the selected accessions with `download_ncbi_genomes.py`
#### Tips for Assembly Selection
- **Assembly Level**: Chromosome-level assemblies are most complete, followed by Scaffold, then Contig
- **RefSeq vs GenBank**: RefSeq (GCF_*) assemblies undergo additional curation; GenBank (GCA_*) are submitter-provided
- **Taxonomic Sampling**: For phylogenetics, aim for representative sampling across the taxonomic group
- **Quality over Quantity**: Better to have 20 high-quality assemblies than 100 poor-quality ones
---
### STEP 1: Download NCBI Genomes (if applicable)
If user provided NCBI accessions, use `scripts/download_ncbi_genomes.py`:
**For BioProjects**:
```bash
python scripts/download_ncbi_genomes.py --bioprojects PRJNA12345 -o genomes.zip
unzip genomes.zip
```
**For Assembly Accessions**:
```bash
python scripts/download_ncbi_genomes.py --assemblies GCA_123456789.1 -o genomes.zip
unzip genomes.zip
```
**IMPORTANT**: After download, genomes must be renamed with meaningful sample names (format: `[ACCESSION]_[SPECIES_NAME]`). Sample names appear in final phylogenetic trees.
Generate a script that:
1. Finds all downloaded FASTA files in ncbi_dataset directory structure
2. Moves/renames files to main genomes directory with meaningful names
3. Includes any local genome files
4. Creates final genome_list.txt with ALL genomes (local + downloaded)
See `references/REFERENCE.md` section "Sample Naming Best Practices" for detailed guidelines.
---
### STEP 2: Ortholog Identification with compleasm
Activate the unified environment and run compleasm on all genomes to identify single-copy orthologs.
**Key considerations**:
- First genome must run alone to download lineage database
- Remaining genomes can run in parallel
- Thread allocation: Miniprot scales well up to ~16-32 threads per genome
**Threading guidelines**: See `references/REFERENCE.md` for recommended thread allocation table.
**Generate scripts using templates**:
- **SLURM**: Read templates `02_compleasm_first.job` and `02_compleasm_parallel.job`
- **PBS**: Read templates `02_compleasm_first.job` and `02_compleasm_parallel.job`
- **Local**: Read templates `02_compleasm_first.sh` and `02_compleasm_parallel.sh`
Replace placeholders: `TOTAL_THREADS`, `THREADS_PER_JOB`, `NUM_GENOMES`, `LINEAGE`
For detailed implementation examples, see `references/REFERENCE.md` section "Ortholog Identification Implementation".
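For orientation, a single compleasm run follows this pattern (paths, lineage, and thread count are illustrative; present the 02_compleasm_* templates to users rather than this sketch):
```bash
# Run compleasm on one genome (illustrative paths and lineage)
conda activate phylo
compleasm run -a 00_genomes/GCA_000000000.1_Example_species.fasta \
    -o 01_busco_results/GCA_000000000.1_Example_species_compleasm \
    -l insecta_odb10 -t 16
```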
---
### STEP 3: Quality Control
After compleasm completes, generate QC report using `scripts/generate_qc_report.sh`:
```bash
bash scripts/generate_qc_report.sh qc_report.csv
```
Provide interpretation:
- **>95% complete**: Excellent, retain
- **90-95% complete**: Good, retain
- **85-90% complete**: Acceptable, case-by-case
- **70-85% complete**: Questionable, consider excluding
- **<70% complete**: Poor, recommend excluding
See `references/REFERENCE.md` section "Quality Control Guidelines" for detailed assessment criteria.
---
### STEP 4: Ortholog Extraction
Use `scripts/extract_orthologs.sh` to extract single-copy orthologs:
```bash
bash scripts/extract_orthologs.sh LINEAGE_NAME
```
This generates per-locus unaligned FASTA files in `single_copy_orthologs/unaligned_aa/`.
---
### STEP 5: Alignment with MAFFT
Activate the unified environment (`conda activate phylo`) which contains MAFFT.
Create locus list, then generate alignment scripts:
```bash
cd single_copy_orthologs/unaligned_aa
ls *.fas > locus_names.txt
num_loci=$(wc -l < locus_names.txt)
```
**Generate scheduler-specific scripts**:
- **SLURM/PBS**: Array job with one task per locus
- **Local**: Sequential processing or GNU parallel
For detailed script templates, see `references/REFERENCE.md` section "Alignment Implementation".
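On a local machine, a minimal GNU parallel sketch (concurrency and output paths are illustrative; the scheduler templates remain the reference) looks like:
```bash
# Align every locus with MAFFT, 8 jobs at a time (run from unaligned_aa/; values illustrative)
mkdir -p ../../04_alignments
parallel -j 8 "mafft --auto {} > ../../04_alignments/{.}.aln.fas" :::: locus_names.txt
```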
---
### STEP 6: Alignment Trimming
Based on user's preference, provide appropriate trimming method. All tools are available in the unified conda environment.
**Options**:
- **trimAl**: Fast (`-automated1`), recommended for large datasets
- **ClipKit**: Modern, fast (default smart-gap mode)
- **BMGE**: Entropy-based (`-t AA`)
- **Aliscore/ALICUT**: Traditional, thorough (recommended for phylogenomics)
**For Aliscore/ALICUT**:
- Perl scripts were installed in STEP 0
- Use `scripts/run_aliscore_alicut_batch.sh` for batch processing
- Or use array jobs with `scripts/run_aliscore.sh` and `scripts/run_alicut.sh`
- Always use `-N` flag for amino acid sequences
**Generate scripts** using scheduler-appropriate templates (array jobs for SLURM/PBS, parallel or serial for local).
For detailed implementation of each trimming method, see `references/REFERENCE.md` section "Alignment Trimming Implementation".
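For the faster tools, the per-alignment commands are short; these sketches use illustrative file names (see `references/REFERENCE.md` for the full batch scripts):
```bash
# trimAl with its automated heuristic (file names illustrative)
trimal -in locus1.aln.fas -out locus1.trim.fas -automated1
# ClipKit with the default smart-gap mode
clipkit locus1.aln.fas -o locus1.trim.fas
```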
---
### STEP 7: Concatenation and Partition Definition
Download FASconCAT-G (Perl script) and run concatenation:
```bash
conda activate phylo # Has Perl installed
wget https://raw.githubusercontent.com/PatrickKueck/FASconCAT-G/master/FASconCAT-G_v1.06.1.pl -O FASconCAT-G.pl
chmod +x FASconCAT-G.pl
cd trimmed_aa
perl ../FASconCAT-G.pl -s -i
```
Convert to IQ-TREE format using `scripts/convert_fasconcat_to_partition.py`:
```bash
python ../scripts/convert_fasconcat_to_partition.py FcC_info.xls partition_def.txt
```
Outputs: `FcC_supermatrix.fas`, `FcC_info.xls`, `partition_def.txt`
---
### STEP 8: Phylogenetic Inference
IQ-TREE is already installed in the unified environment. Activate with `conda activate phylo`.
#### Part 8A: Partition Model Selection
Use the substitution models selected during initial setup (Question 9).
**Generate script using templates**:
- Read appropriate template: `templates/[slurm|pbs|local]/08a_partition_search.[job|sh]`
- Replace `MODEL_SET` placeholder with user's selected models (e.g., "LG,WAG,JTT,Q.pfam")
For detailed implementation, see `references/REFERENCE.md` section "Partition Model Selection Implementation".
#### Part 8B: Concatenated ML Tree
Run IQ-TREE using the best partition scheme from Part 8A:
```bash
iqtree -s FcC_supermatrix.fas -spp partition_search.best_scheme.nex \
-nt 18 -safe -pre concatenated_ML_tree -bb 1000 -bnni
```
Output: `concatenated_ML_tree.treefile`
#### Part 8C: Individual Gene Trees
Estimate gene trees for coalescent-based species tree inference.
**Generate scripts using templates**:
- **SLURM/PBS**: Read `08c_gene_trees_array.job` template
- **Local**: Read `08c_gene_trees_parallel.sh` or `08c_gene_trees_serial.sh` template
- Replace `NUM_LOCI` placeholder
For detailed implementation, see `references/REFERENCE.md` section "Gene Trees Implementation".
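The per-locus command inside those templates boils down to a small IQ-TREE run; a sketch for a single trimmed locus (file name, model set, and thread count are illustrative):
```bash
# Infer one gene tree so its .treefile lands next to the trimmed alignment (illustrative)
iqtree -s trimmed_aa/locus1.fas -m MFP -mset LG,WAG,JTT,Q.pfam \
    -nt 2 -safe -pre trimmed_aa/locus1
```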
#### Part 8D: ASTRAL Species Tree
ASTRAL is already installed in the unified conda environment.
```bash
conda activate phylo
# Concatenate all gene trees
cat trimmed_aa/*.treefile > all_gene_trees.tre
# Run ASTRAL
astral -i all_gene_trees.tre -o astral_species_tree.tre
```
Output: `astral_species_tree.tre`
---
### STEP 9: Generate Methods Paragraph
**ALWAYS generate a methods paragraph** to help users write their publication methods section.
Create `METHODS_PARAGRAPH.md` file with:
- Customized text based on tools and parameters used
- Complete citations for all software
- Placeholders for user-specific values (genome count, loci count, thresholds)
- Instructions for adapting to journal requirements
For the complete methods paragraph template, see `references/REFERENCE.md` section "Methods Paragraph Template".
Pre-fill known values when possible:
- Number of genomes
- BUSCO lineage
- Trimming method used
- Substitution models tested
---
## Final Outputs Summary
Provide users with a summary of outputs:
**Phylogenetic Results**:
1. `concatenated_ML_tree.treefile` - ML tree from concatenated supermatrix
2. `astral_species_tree.tre` - Coalescent species tree
3. `*.treefile` - Individual gene trees
**Data and Quality Control**:
4. `qc_report.csv` - Genome quality statistics
5. `FcC_supermatrix.fas` - Concatenated alignment
6. `partition_search.best_scheme.nex` - Selected partitioning scheme
**Publication Materials**:
7. `METHODS_PARAGRAPH.md` - Ready-to-use methods section with citations
**Visualization tools**: FigTree, iTOL, ggtree (R), ete3/toytree (Python)
---
## Script Validation
**ALWAYS perform validation checks** after generating scripts but before presenting them to the user. This ensures script accuracy, consistency, and proper resource allocation.
### Validation Workflow
For each generated script, perform these validation checks in order:
#### 1. Program Option Verification
**Purpose**: Detect hallucinated or incorrect command-line options that may cause scripts to fail.
**Procedure**:
1. **Extract all command invocations** from the generated script (e.g., `compleasm run`, `iqtree -s`, `mafft --auto`)
2. **Compare against reference sources**:
- First check: Compare against corresponding template in `templates/` directory
- Second check: Compare against examples in `references/REFERENCE.md`
- Third check: If options differ significantly or are uncertain, perform web search for official documentation
3. **Common tools to validate**:
- `compleasm run` - Check `-a`, `-o`, `-l`, `-t` options
- `iqtree` - Verify `-s`, `-p`, `-m`, `-bb`, `-alrt`, `-nt`, `-safe` options
- `mafft` - Check `--auto`, `--thread`, `--reorder` options
- `astral` - Verify `-i`, `-o` options
- Trimming tools (`trimal`, `clipkit`, `BMGE.jar`) - Validate options
**Action on issues**:
- If incorrect options found: Inform user of the issue and ask if they want you to correct it
- If uncertain: Ask user to verify with tool documentation before proceeding
#### 2. Pipeline Continuity Verification
**Purpose**: Ensure outputs from one step correctly feed into inputs of subsequent steps.
**Procedure**:
1. **Map input/output relationships**:
- Step 2 output (`01_busco_results/*_compleasm/`) → Step 3 input (QC script)
- Step 3 output (`single_copy_orthologs/`) → Step 5 input (MAFFT)
- Step 5 output (`04_alignments/*.fas`) → Step 6 input (trimming)
- Step 6 output (`05_trimmed/*.fas`) → Step 7 input (FASconCAT-G)
- Step 7 output (`FcC_supermatrix.fas`, partition file) → Step 8A input (IQ-TREE)
- Step 8C output (`*.treefile`) → Step 8D input (ASTRAL)
2. **Check for consistency**:
- File path references match across scripts
- Directory structure follows recommended layout
- Glob patterns correctly match expected files
- Required intermediate files are generated before being used
**Action on issues**:
- If path mismatches found: Inform user and ask if they want you to correct them
- If directory structure inconsistent: Suggest corrections aligned with recommended structure
#### 3. Resource Compatibility Check
**Purpose**: Ensure allocated computational resources are appropriate for the task.
**Procedure**:
1. **Verify resource allocations** against recommendations in `references/REFERENCE.md`:
- **Memory allocation**: Check if memory per CPU (typically 6GB for compleasm, 2-4GB for others) is adequate
- **Thread allocation**: Verify thread counts are reasonable for the number of genomes/loci
- **Walltime**: Ensure walltime is sufficient based on dataset size guidelines
- **Parallelization**: Check that threads per job × concurrent jobs ≤ total threads
2. **Common issues to check**:
- Compleasm: First job needs full thread allocation (downloads database)
- IQ-TREE: `-nt` should match allocated CPUs
- Gene trees: Ensure enough threads per tree × concurrent trees ≤ total available
- Memory: Concatenated tree inference may need 8-16GB per CPU for large datasets
3. **Validate against user-specified constraints**:
- Total CPUs specified by user
- Available memory per node
- Maximum walltime limits
- Scheduler-specific limits (if mentioned)
**Action on issues**:
- If resource allocation issues found: Inform user and suggest corrections with justification
- If uncertain about adequacy: Ask user about typical job performance in their environment
### Validation Reporting
After completing all validation checks:
1. **If all checks pass**: Inform user briefly: "Scripts validated successfully - options, pipeline flow, and resources verified."
2. **If issues found**: Present a structured report:
```
**Validation Results**
⚠️ Issues found during validation:
1. [Issue category]: [Description]
- Current: [What was generated]
- Suggested: [Recommended fix]
- Reason: [Why this is an issue]
Would you like me to apply these corrections?
```
3. **Always ask before correcting**: Never silently fix issues - always get user confirmation before applying changes.
4. **Document corrections**: If corrections are applied, explain what was changed and why.
---
## Communication Guidelines
- **Always start with STEP 0**: Generate the unified environment setup script
- **Always end with STEP 9**: Generate the customized methods paragraph
- **Always validate scripts**: Perform validation checks before presenting scripts to users
- **Use unified environment by default**: All scripts should use `conda activate phylo`
- **Always ask about CPU allocation**: Never auto-detect cores, always ask user
- **Recommend optimized workflows**: For users with adequate resources, recommend optimized parallel approaches over simple serial approaches
- **Be clear and pedagogical**: Explain why each step is necessary
- **Provide educational explanations when requested**: If user answered yes to educational goals (question 10):
- After completing each major workflow stage, ask: "Would you like me to explain this step?"
- If yes, provide moderate-length explanation (1-2 paragraphs) covering:
- What the step accomplishes biologically and computationally
- Significant choices made and their rationale
- Best practices being followed in the workflow
- Examples of "major workflow stages": STEP 0 (setup), STEP 1 (download), STEP 2 (BUSCO), STEP 3 (QC), STEP 5 (alignment), STEP 6 (trimming), STEP 7 (concatenation), STEP 8 (phylogenetic inference)
- **Provide complete, ready-to-run scripts**: Users should copy-paste and run
- **Adapt to user's environment**: Always generate scheduler-specific scripts
- **Reference supporting files**: Direct users to `references/REFERENCE.md` for details
- **Use helper scripts**: Leverage provided scripts in `scripts/` directory
- **Include error checking**: Add file existence checks and informative error messages
- **Be encouraging**: Phylogenomics is complex; maintain supportive tone
---
## Important Notes
### Mandatory Steps
1. **STEP 0 is mandatory**: Always generate the environment setup script first
2. **STEP 9 is mandatory**: Always generate the methods paragraph file at the end
### Template Usage (IMPORTANT!)
3. **Prefer templates over inline code**: Use `templates/` directory for major scripts
4. **Template workflow**:
- Read: `Read("templates/slurm/02_compleasm_first.job")`
- Replace placeholders: `TOTAL_THREADS`, `LINEAGE`, `NUM_GENOMES`, `MODEL_SET`, etc.
- Present customized script to user
5. **Available templates**: See `templates/README.md` for complete list
6. **Benefits**: Reduces token usage, easier maintenance, consistent structure
### Script Generation
7. **Always adapt scripts** to user's scheduler (SLURM/PBS/local)
8. **Replace all placeholders** before presenting scripts
9. **Never auto-detect CPU cores**: Always ask user to specify
10. **Provide parallelization options**: For each parallelizable step, offer array job, parallel, and serial options
11. **Scheduler-specific configuration**: For SLURM/PBS, always ask about account, partition, email, etc.
### Parallelization Strategy
12. **Ask about preferences**: Let user choose between throughput optimization vs. simplicity
13. **Compleasm optimization**: For ≥2 genomes and ≥16 cores, recommend two-phase approach
14. **Use threading guidelines**: Refer to `references/REFERENCE.md` for thread allocation recommendations
15. **Parallelizable steps**: Steps 2 (compleasm), 5 (MAFFT), 6 (trimming), 8C (gene trees)
### Substitution Model Selection
16. **Always recommend models**: Use the systematic model recommendation process
17. **Fetch current documentation**: Use WebFetch to get IQ-TREE model information
18. **Replace MODEL_SET placeholder**: In Step 8A templates with comma-separated list
19. **Taxonomically-targeted models**: Suggest Q.bird, Q.mammal, Q.insect, Q.plant when applicable
### Reference Material
20. **Direct users to references/REFERENCE.md** for:
- Detailed implementation guides
- BUSCO lineage datasets (complete list)
- Resource recommendations (memory, CPUs, walltime tables)
- Sample naming best practices
- Quality control assessment criteria
- Aliscore/ALICUT detailed guide and parameters
- Tool citations with DOIs
- Software installation instructions
- Common issues and troubleshooting
---
## Attribution
This skill was created by **Bruno de Medeiros** (Curator of Pollinating Insects, Field Museum) based on phylogenomics tutorials by **Paul Frandsen** (Brigham Young University).
## Workflow Entry Point
When a user requests phylogeny generation:
1. Gather required information using the "Initial User Questions" section
2. Generate STEP 0 setup script from `references/REFERENCE.md`
3. If user needs help finding NCBI assemblies, perform STEP 0A using `query_ncbi_assemblies.py`
4. Proceed step-by-step through workflow (STEPS 1-8), using templates and referring to `references/REFERENCE.md` for detailed implementation
5. All workflow scripts should use the unified conda environment (`conda activate phylo`)
6. Validate all generated scripts before presenting to user (see "Script Validation" section)
7. Generate STEP 9 methods paragraph from template in `references/REFERENCE.md`
8. Provide final outputs summary

File diff suppressed because it is too large

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""
Convert FASconCAT info file to IQ-TREE partition format
Usage:
python convert_fasconcat_to_partition.py FcC_info.xls [output_file.txt]
Author: Bruno de Medeiros (Field Museum)
Based on tutorials by Paul Frandsen (BYU)
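Illustration with a hypothetical locus (FcC_info.xls data row -> partition line):
input : locus123 [TAB] 1 [TAB] 344
output: AA, locus123 = 1-344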
"""
import sys
def convert_fcc_to_partition(fcc_file, output_file="partition_def.txt"):
"""
Convert FASconCAT info file to IQ-TREE partition format
Args:
fcc_file: Path to FcC_info.xls file from FASconCAT
output_file: Path to output partition definition file
"""
try:
with open(fcc_file, 'r') as f:
lines = f.readlines()
except FileNotFoundError:
print(f"Error: File '{fcc_file}' not found")
sys.exit(1)
partitions_written = 0
with open(output_file, 'w') as out:
# Skip first two header lines (FASconCAT INFO and column headers)
for line in lines[2:]:
line = line.strip()
if line:
parts = line.split('\t')
if len(parts) >= 3:
locus = parts[0]
start = parts[1]
end = parts[2]
out.write(f"AA, {locus} = {start}-{end}\n")
partitions_written += 1
print(f"Partition file created: {output_file}")
print(f"Number of partitions: {partitions_written}")
def main():
if len(sys.argv) < 2:
print("Usage: python convert_fasconcat_to_partition.py FcC_info.xls [output_file.txt]")
print("\nConverts FASconCAT info file to IQ-TREE partition format")
sys.exit(1)
fcc_file = sys.argv[1]
output_file = sys.argv[2] if len(sys.argv) > 2 else "partition_def.txt"
convert_fcc_to_partition(fcc_file, output_file)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
Download genomes from NCBI using BioProject or Assembly accessions
Usage:
python download_ncbi_genomes.py --bioprojects PRJNA12345 PRJEB67890
python download_ncbi_genomes.py --assemblies GCA_123456789.1 GCF_987654321.1
Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib)
Author: Bruno de Medeiros (Field Museum)
Based on tutorials by Paul Frandsen (BYU)
"""
import argparse
import sys
import subprocess
def download_using_cli(accessions, output_file="genomes.zip"):
"""
Download genomes using NCBI datasets CLI
Args:
accessions: List of BioProject or Assembly accessions
output_file: Name of output zip file
"""
cmd = ["datasets", "download", "genome", "accession"] + accessions + ["--filename", output_file]
print(f"Running: {' '.join(cmd)}")
print("")
try:
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
print(result.stdout)
print(f"\nDownload complete: {output_file}")
print("Extract with: unzip " + output_file)
return True
except subprocess.CalledProcessError as e:
print(f"Error downloading genomes: {e}", file=sys.stderr)
print(e.stderr, file=sys.stderr)
return False
except FileNotFoundError:
print("Error: 'datasets' command not found", file=sys.stderr)
print("Install with: conda install -c conda-forge ncbi-datasets-cli", file=sys.stderr)
return False
def get_bioproject_assemblies(bioprojects):
"""
Get assembly accessions for given BioProjects using Python API
Args:
bioprojects: List of BioProject accessions
Returns:
List of tuples (assembly_accession, organism_name)
"""
try:
from ncbi.datasets.metadata.genome import get_assembly_metadata_by_bioproject_accessions
except ImportError:
print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
sys.exit(1)
assemblies = []
print(f"Fetching assembly information for {len(bioprojects)} BioProject(s)...")
print("")
for assembly in get_assembly_metadata_by_bioproject_accessions(bioprojects):
acc = assembly.accession
name = assembly.organism.organism_name
assemblies.append((acc, name))
print(f" {name}: {acc}")
print(f"\nFound {len(assemblies)} assemblies")
return assemblies
def main():
parser = argparse.ArgumentParser(
description="Download genomes from NCBI using BioProject or Assembly accessions"
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
"--bioprojects",
nargs="+",
help="BioProject accessions (e.g., PRJNA12345 PRJEB67890)"
)
group.add_argument(
"--assemblies",
nargs="+",
help="Assembly accessions (e.g., GCA_123456789.1 GCF_987654321.1)"
)
parser.add_argument(
"-o", "--output",
default="genomes.zip",
help="Output zip file name (default: genomes.zip)"
)
parser.add_argument(
"--list-only",
action="store_true",
help="List assemblies without downloading (BioProject mode only)"
)
args = parser.parse_args()
if args.bioprojects:
assemblies = get_bioproject_assemblies(args.bioprojects)
if args.list_only:
print("\nAssembly accessions (use with --assemblies to download):")
for acc, name in assemblies:
print(acc)
return
# Download assemblies
assembly_accs = [acc for acc, name in assemblies]
success = download_using_cli(assembly_accs, args.output)
elif args.assemblies:
success = download_using_cli(args.assemblies, args.output)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,88 @@
#!/bin/bash
# Extract and reorganize single-copy orthologs from compleasm output
#
# Usage: bash extract_orthologs.sh LINEAGE_NAME
# Example: bash extract_orthologs.sh metazoa
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)
if [ $# -lt 1 ]; then
echo "Usage: bash extract_orthologs.sh LINEAGE_NAME"
echo " Example: bash extract_orthologs.sh metazoa"
exit 1
fi
LINEAGE="$1"
echo "Extracting single-copy orthologs for lineage: ${LINEAGE}"
# Create directory for ortholog FASTA files
mkdir -p single_copy_orthologs
# Copy gene_marker.fasta files and rename by species
count=0
for dir in 01_busco_results/*_compleasm; do
if [ ! -d "${dir}" ]; then
continue
fi
genome=$(basename "${dir}" _compleasm)
# Auto-detect the OrthoDB version (odb10, odb11, odb12, etc.)
odb_dirs=("${dir}/${LINEAGE}_odb"*)
if [ -d "${odb_dirs[0]}" ]; then
marker_file="${odb_dirs[0]}/gene_marker.fasta"
else
echo " Warning: No OrthoDB directory found for ${genome}" >&2
continue
fi
if [ -f "${marker_file}" ]; then
cp "${marker_file}" "single_copy_orthologs/${genome}.fasta"
echo " Extracted: ${genome}"
count=$((count + 1))
else
echo " Warning: Marker file not found for ${genome}" >&2
fi
done
if [ ${count} -eq 0 ]; then
echo "Error: No gene_marker.fasta files found. Check lineage name." >&2
exit 1
fi
echo "Extracted ${count} genomes"
echo ""
echo "Now generating per-locus unaligned FASTA files..."
cd single_copy_orthologs || exit 1
mkdir -p unaligned_aa
cd unaligned_aa || exit 1
# AWK script to split by ortholog ID
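# Illustration with hypothetical names: the record ">100at50557_0" and its sequence line,
# read from Genome_A.fasta, are appended to the per-locus file 100at50557.fas
# as ">Genome_A" followed by that sequence line.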
awk 'BEGIN{RS=">"; FS="\n"} {
if (NF > 1) {
split($1, b, "_");
fnme = b[1] ".fas";
n = split(FILENAME, a, "/");
species = a[length(a)];
gsub(".fasta", "", species);
print ">" species "\n" $2 >> fnme;
close(fnme);
}
}' ../*.fasta
# Fix headers
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS
sed -i '' -e 's/.fasta//g' *.fas
else
# Linux
sed -i -e 's/.fasta//g' *.fas
fi
num_loci=$(ls -1 *.fas 2>/dev/null | wc -l)
echo "Unaligned ortholog files generated: ${num_loci} loci"
echo ""
echo "Output directory: single_copy_orthologs/unaligned_aa/"

View File

@@ -0,0 +1,59 @@
#!/bin/bash
# Quality control report generator for compleasm results
#
# Usage: bash generate_qc_report.sh [output_file.csv]
#
# Author: Bruno de Medeiros (Field Museum)
# Based on tutorials by Paul Frandsen (BYU)
OUTPUT_FILE="${1:-qc_report.csv}"
echo "Genome,Complete_SCO,Fragmented,Duplicated,Missing,Completeness(%)" > "${OUTPUT_FILE}"
count=0
for dir in 01_busco_results/*_compleasm; do
if [ ! -d "${dir}" ]; then
continue
fi
genome=$(basename "${dir}" _compleasm)
summary="${dir}/summary.txt"
if [ -f "${summary}" ]; then
# Parse completeness statistics from compleasm format
# compleasm uses: S: (single-copy), D: (duplicated), F: (fragmented), M: (missing)
# Format: "S:80.93%, 2283" where we need the count (2283)
complete=$(grep "^S:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ')
duplicated=$(grep "^D:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ')
fragmented=$(grep "^F:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ')
missing=$(grep "^M:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ')
# Check if all values were successfully extracted
if [ -z "${complete}" ] || [ -z "${fragmented}" ] || [ -z "${missing}" ]; then
echo "Warning: Could not parse statistics for ${genome}" >&2
continue
fi
# Calculate completeness percentage (Complete / Total * 100)
total=$((complete + duplicated + fragmented + missing))
if command -v bc &> /dev/null; then
completeness=$(echo "scale=2; (${complete} + ${duplicated}) / ${total} * 100" | bc)
else
# Fallback if bc not available
completeness=$(awk "BEGIN {printf \"%.2f\", (${complete} + ${duplicated}) / ${total} * 100}")
fi
echo "${genome},${complete},${fragmented},${duplicated},${missing},${completeness}" >> "${OUTPUT_FILE}"
count=$((count + 1))
else
echo "Warning: Summary file not found for ${genome}" >&2
fi
done
if [ ${count} -eq 0 ]; then
echo "Error: No compleasm output directories found (*_compleasm)" >&2
exit 1
fi
echo "QC report generated: ${OUTPUT_FILE}"
echo "Genomes analyzed: ${count}"

View File

@@ -0,0 +1,742 @@
#!/usr/bin/perl
use strict ;
use File::Copy ;
use Tie::File ;
use Fcntl ;
use Term::Cap ;
use Term::ANSIColor qw(:constants);
use Getopt::Std ;
# updated on 13th february , 2009 by patrick kück
# updated on 2nd april , 2009 by patrick kück
# updated on 15th june , 2009 by patrick kück
# updated on 26th july , 2009 by patrick kück
# updated on 7th september, 2011 by patrick kück (alicut v2.3)
# updated on 22.2.2017, by patrick kück (alicut v2.31) -> correction of initial warning due to line 547, changed some terminal prints, argv handling commands
my @answer_remain_stems = ( 'no', 'yes' ) ;
my @answer_codons = ( 'no', 'yes' ) ;
my @answer_third_pos = ( 'no', 'yes' ) ;
&argv_handling ( \@answer_remain_stems, \@answer_codons, \@answer_third_pos ) ;
&menu ( \@answer_remain_stems, \@answer_codons, \@answer_third_pos ) ;
sub argv_handling{
my $aref_remain_stems = $_[0] ;
my $aref_codons = $_[1] ;
my $aref_third_pos = $_[2] ;
my ( $commandline ) = join "", @ARGV ;
$commandline =~ s/ |\s+// ;
my @commands = split "-", $commandline ;
shift @commands ;
for my $single_command ( sort @commands ){
if ( $single_command =~ /^r$/i ) { @$aref_remain_stems = ( reverse @$aref_remain_stems) }
elsif ( $single_command =~ /^c$/i ) { @$aref_codons = ( reverse @$aref_codons ) }
elsif ( $single_command =~ /^3$/i ) { @$aref_third_pos = ( reverse @$aref_third_pos ) }
elsif ( $single_command =~ /^h$/i ) { &help }
elsif ( $single_command =~ /^p$/i ) { &preface }
elsif ( $single_command =~ /^s$/i ) {
&header ;
&commands( \$aref_remain_stems->[0], \$aref_codons->[0], \$aref_third_pos->[0]) ;
&start (\$aref_remain_stems->[0], \$aref_codons->[0], \$aref_third_pos->[0])
}
else { print "\n\t!COMMAND-ERROR!: unknown command \"-", $single_command, "\"\n" }
}
&menu ( \@$aref_remain_stems, \@$aref_codons, \@$aref_third_pos)
}
sub header{
printf "\n%68s\n", "------------------------------------------------------------" ;
printf "%49s\n" , "Welcome to ALICUT V2.31 !" ;
printf "%60s\n" , "a Perlscript to cut ALISCORE identified RSS" ;
printf "%57s\n" , "written by Patrick Kueck (ZFMK, Bonn)" ;
printf "%68s\n\n", "------------------------------------------------------------" ;
}
sub commands{
my $sref_rem_stems = $_[0] ;
my $sref_reo_codon = $_[1] ;
my $sref_th_posit = $_[2] ;
print "\n\t------------------------------------------------------------" ;
print "\n\tRemain Stem Position :\t", $$sref_rem_stems ;
print "\n\tRemove Codon :\t", $$sref_reo_codon ;
print "\n\tRemove 3rd Position :\t", $$sref_th_posit ;
print "\n\t------------------------------------------------------------\n" ;
}
sub help{
print
<<info;
-------------------------------------------------------------------
-------------------------------------------------------------------
General Information and Usage:
-------------------------------
ALICUT V2.31 removes ALISCORE identified RSS positions
in given FASTA file(s) which are listed in the FASTA file cor-
responding ALISCORE "List" outfile(s). If structure sequences
are implemented, ALICUT V2.3 automatically replaces brackets
of non rss positions by dots when they are paired with rss
identified positions.
Start ALICUT under default
-------------------------------------------------------------------
To remove all ALISCORE identified RSS positions:
Type <s> return (via Menu) or
Type <perl ALICUT_V2.3.pl -s> <enter> (via command line)
R-Option (Remain Stems)
-------------------------------------------------------------------
To remain all stem positions of identified rss within FASTA file(s):
Type <r> <return> <s> <enter> (via Menu)
Type <perl ALICUT_V2.3.pl -r -s> <enter> (via command line)
C-Option (Remove Codon)
-------------------------------------------------------------------
To translate ALISCORE identified RSS positions of amino-acid data
into nucleotide triplet positions before exclusion of randomised
sequence sections:
Type <c> return <s> return (via Menu) or
Type <perl ALICUT_V2.3.pl -c -s> <enter> (via command line)
Note:
This option is only useful if you have analysed amino-acid
data, but wish to exclude nucleotide positions from the amino-acid
data corresponding nucleotide data.
Be aware, that the name of the nucleotide data file has to be named
equal to the ALISCORE analysed amino-acid data file. The C-option
can not be applied on amino-acid sequences. Otherwise, ALICUT
excludes the original ALISCORE identified sequence sections.
3-Option (Remove 3rd position)
-------------------------------------------------------------------
To remove ALISCORE identified RSS only if its sequence position is
up to a multiple of 3:
Type <3> <return> <s> <return> (via Menu)
Type <perl ALICUT_V2.3.pl -3 -s> <enter> (via command line)
Note:
The 3-Option can be combined with the C-option. In this case,
positions of the ALISCORE "List" outfile(s) are translated into
codon positions from which only the 3rd positions are excluded.
The 3-Option can only be applied on nucleotide data. Otherwise,
ALICUT excludes the original ALISCORE identified sequence sections.
ALICUT IN and OUT files
-------------------------------------------------------------------
ALICUT V2.3 needs the original ALISCORE FASTA infile(s) and "List"
outfile(s) in the same folder as ALICUT V2.3.
The "List" outfile(s) must contain the identified RSS positions
in one single line, separated by whitespace.
e.g. 1 3 5 6 8 9 10 11 123 127 10000 10001
ALICUT V2.0 can handle unlimited FASTA files in one single run.
The sole condition is that the Prefix of the ALISCORE "List"
outfile(s) is identical to that of the associated FASTA infile(s).
ALICUT V2.3 first searches for the ALISCORE "List" outfile(s),
removes the Suffix "_List_random.txt" and searches for the
"List" associated FASTA file(s).
e.g. COI.fas_List_random.txt (ALISCORE "List" outfile)
COI.fas (Associated FASTA infile)
If both files are detected, ALICUT V2.3 excludes the RSS identified
positions of the "List" file(s) in the associated
FASTA file(s) and saves the changes in a new FASTA outfile,
named "ALICUT_FASTAinputname.fas".
Under the C- and 3-Option, removed sequence positions differ from
the original "List" position numbers. Under both options, ALICUT
prints the actually removed positions in separate "ALICUT_LIST"
outfile(s).
ALICUT V2.3 generates also an info file "ALICUT_info". This file
informs about the number and percentage of removed positions, number
of single sequences, single parameter settings, and sequence states
of each restricted FASTA file.
If structure sequences are identified by ALICUT, ALICUT generates
structure info file(s) which lists remaining stem pairs and loop
positions, as well as percentages of both structure elements.
-------------------------------------------------------------------
-------------------------------------------------------------------
info
;
print "\tBACK to ALICUT MAIN-Menu:\t\t type <return>\n" ;
print "\n\t------------------------------------------------------------\n\t" ;
chomp ( my $answer_xy = <STDIN> );
&menu ;
}
sub preface{
print
<<preface
--------------------FASconCAT PREFACE---------------------
Version : 2.31
Language : PERL
Last Update : 22nd February, 2017
Author : Patrick Kueck, ZFMK Bonn GERMANY
e-mail : patrick_kueck\@web.de
Homepage : http://www.zfmk.de
This program is free software; you can redistribute it
and/or modify it under the terms of the GNU General Public
License as published by the Free Software Foundation ;
either version 2 of the License, or (at your option) any
later version.
This program is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public
License along with this program; if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
USA.
For further free downloadable programs visit:
www.zfmk.de/web/Forschung/Abteilungen/AG_Wgele/index.en.html
------------------------------------------------------------
preface
;
print "\tBACK to ALICUT MAIN-Menu:\t\t type <return>\n" ;
print "\n\t------------------------------------------------------------\n\t" ;
chomp ( my $answer_xy = <STDIN> );
&menu;
}
sub menu{
my $aref_remain_stems = $_[0] ;
my $aref_remove_codon = $_[1] ;
my $aref_third_posit = $_[2] ;
&header ;
print "\n\tSTART ALICUT:\t\ttype <s> <return>" ;
print "\n\tQUIT ALICUT:\t\ttype <q> <return>" ;
print "\n\tREMAIN STEMS:\t\ttype <r> <return>" ;
print "\n\tREMOVE CODON:\t\ttype <c> <return>" ;
print "\n\tREMOVE 3rd:\t\ttype <3> <return>" ;
print "\n\tHELP:\t\t\ttype <h> <return>" ;
print "\n\tPREFACE:\t\ttype <p> <return>" ;
&commands ( \$aref_remain_stems->[0], \$aref_remove_codon->[0], \$aref_third_posit->[0] );
my $answer_opening = &commandline ;
until ( $answer_opening =~ /^s$|^r$|^c$|^p$|^h$|^1$|^2$|^q$|^3$/i ){
print "\n\t!COMMAND-ERROR!: unknown command \"$answer_opening\"!\n" ;
$answer_opening = &commandline ;
}
$answer_opening =~ /^s$/i and do { &start ( \$aref_remain_stems->[0], \$aref_remove_codon->[0], \$aref_third_posit->[0] ) } ;
$answer_opening =~ /^r$/i and do { @$aref_remain_stems = (reverse @$aref_remain_stems ); &menu } ;
$answer_opening =~ /^c$/i and do { @$aref_remove_codon = (reverse @$aref_remove_codon ); &menu } ;
$answer_opening =~ /^3$/i and do { @$aref_third_posit = (reverse @$aref_third_posit ); &menu } ;
$answer_opening =~ /^q$/i and do { exit } ;
$answer_opening =~ /^h$/i and do { &help } ;
$answer_opening =~ /^1$/ and do { &error1 } ;
$answer_opening =~ /^2$/ and do { &error2 } ;
$answer_opening =~ /^p$/i and do { &preface }
}
sub start{
my $sref_stems_remain = $_[0] ;
my $sref_codon_remove = $_[1] ;
my $sref_third_remove = $_[2] ;
my $j = 0 ;
open OUTinfo, ">>ALICUT_info.xls" ;
print OUTinfo "\nUsed List File\tUsed Fasta file\tremove triplets\tremove 3rd position\tnumber taxa\tbp before\tbp after\tremaining bp [%]\tsequence type\n" ;
# Read IN of all List_random.txt files within the same folder as ALICUT and handle it
READING:
foreach my $file ( <*List_*.txt> ) {
# Set counter +1
$j++;
# Read in of the ALISCORE-list outfile
&tie_linefeeds ( \$file ) ;
( open IN, "<$file" ) or die "n\t!FILE-ERROR!: Can not open listfile $file!\n" ;
my $line = <IN> ; chomp $line ;
# check for correct aliscore list format
unless ( $line =~ /^(\d+ )+\d+$|^\d+$/ ) { warn "\t!FILE-WARN!: $file has no ALISCORE list format!\n" ; next READING }
# Total number of randomized identified positions
my @cut_positions = split " ", $line ; close IN ;
# "filename.fas_List_random.txt" to "filename.fas"
( my $file_fasta = $file ) =~ s/_List_.+// ;
# Read in of the original ALISCORE fasta infile which belongs to the listfile
&tie_linefeeds ( \$file_fasta ) ;
( open INfas, "<$file_fasta" ) or warn "\t!FILE-WARN!: Can not find $file_fasta!\n" and next READING ;
chomp ( my @inputfile = <INfas> ) ; close INfas ;
warn "\t!FILE-WARN!: File $file_fasta is empty!\n" if 0 == @inputfile and next READING ;
# Handle the FASTA file in the way that sequencename and sequence alternate in each line
@inputfile = fas_bearbeiten ( @inputfile ) ;
# Generate a hash: key => taxon, value => sequence
my %sequence = @inputfile ;
my @values = values %sequence ;
# Determine base positions before and after the cut. Report removed positions as a total number and as a percentage
my $number_sequences = keys %sequence ;
my $number_characters_before = length $values[0] ;
# Check for correct FASTA format and handling of structure sequence
my $sequence_state = 'nt' ;
SEQUENCE_CHECK:
for my $raw_taxon ( keys %sequence ){
# if whitespace are between ">" and the next sign within a sequence name, delete these whitespaces
$raw_taxon =~ s/^\>\s*/\>/g ;
# if whitespaces between last sign and newline in sequence name, delete these whitespaces
$raw_taxon =~ s/\s*$//g ;
die "\n\t!FILE-ERROR!: $raw_taxon in $file_fasta is not in FASTA format!\n" if $raw_taxon !~ /^\>/ ;
die "\n\t!FILE-ERROR!: Sequence name missing in $file_fasta!\n" if $raw_taxon =~ /^\>$/ ;
die "\n\t!FILE-ERROR!: Sequence name $raw_taxon in $file_fasta involves forbidden signs!\n" if $raw_taxon !~ /\w/ ;
die "\n\t!FILE-ERROR!: Sequences of $file_fasta have no equal length!\n" if length $sequence{$raw_taxon} != $number_characters_before ;
die "\n\t!FILE-ERROR!: Sequence missing in $file_fasta!\n" if $sequence{$raw_taxon} =~ /^\n$|^$/ ;
die "\n\t!FILE-ERROR!: Sequence length in $file_fasta is too short to cut all positions!\n" if $number_characters_before < $cut_positions[ $#cut_positions ] ;
# Structure handling
if ( $sequence{$raw_taxon} =~ /.*\(.*\).*/ ){
$sequence{$raw_taxon} =~ s/-/./g ;
my @strc_elements = split "" , $sequence{$raw_taxon} ;
for my $str_sign ( @strc_elements ){
unless ( $str_sign =~ /\(|\)|\./ ){ die "\n\t!FILE-ERROR!: Structure string of $file_fasta involves forbidden signs in $raw_taxon!\n" }
}
my $structurestring = $sequence{$raw_taxon} ;
$structurestring =~ s/-/./g ;
$sequence{$raw_taxon} = &structure_handling ( \$structurestring, \$$sref_stems_remain, \@cut_positions, \$file_fasta ); next SEQUENCE_CHECK ;
}
# Check for correct sequence states
$sequence{$raw_taxon} =~ s/(\w+)/\U$1/ig ;
my @seq_elements = split "" , $sequence{$raw_taxon} ;
for my $seq_sign ( @seq_elements ){
unless ( $seq_sign =~ /A|C|G|T|U|-|N|Y|X|R|W|S|K|M|D|V|H|B|Q|E|I|L|F|P|\?/ ){ die "\n\t!FILE-ERROR!: Sequence of $file_fasta involves forbidden signs in $raw_taxon!\n" }
}
if ( $sequence{$raw_taxon} =~ /I|E|L|Q|F|P/ ) { $sequence_state = 'aa' }
}
# Translate cut positions
my @fasta_cut;
&translate_cut_positions( \$$sref_codon_remove, \$$sref_third_remove, \@cut_positions, \$number_characters_before, \@fasta_cut, \$sequence_state, \$file_fasta );
# Calculate percent of remaining positions
my $number_cut_positions = @cut_positions ;
my $number_characters_after = $number_characters_before-$number_cut_positions ;
my $percent_left = sprintf "%.1f", ( $number_characters_after / $number_characters_before ) * 100 ;
$percent_left =~ s/\./,/g ;
# Assign uncut positions to $final and print them to the ALICUT outfile
if ( $$sref_codon_remove =~ /yes/ && $$sref_third_remove =~ /yes/ ){ open OUT, ">ALICUT_codon_3rd_$file_fasta" }
elsif ( $$sref_codon_remove =~ /yes/ && $$sref_third_remove =~ /no/ ){ open OUT, ">ALICUT_codon_$file_fasta" }
elsif ( $$sref_codon_remove =~ /no/ && $$sref_third_remove =~ /yes/ ){ open OUT, ">ALICUT_3rd_$file_fasta" }
else { open OUT, ">ALICUT_$file_fasta" }
for ( keys %sequence ){
my @bases = split "", $sequence{$_} ;
my @final = map { $bases[$_] } @fasta_cut ;
my $final = $_."\n".( join "", @final )."\n" ;
print OUT "$final" ;
}
close OUT;
# Print Out of extra infos to ALICUT_info
print OUTinfo "$file\t$file_fasta\t$$sref_codon_remove\t$$sref_third_remove\t$number_sequences\t$number_characters_before\t$number_characters_after\t$percent_left\t$sequence_state\n" ;
print "\tDone : $file cut to ALICUT_$file_fasta\n"
}
close OUTinfo ;
# Print OUT number of right handled FASTA files in relation to total number of files
printf "\n%68s\n", "------------------------------------------------------------" ;
printf "%42s\n", "$j FASTA file(s) correctly handled!" ;
printf "%57s\n", "Further infos are printed out in Alicut_info.txt!" ;
printf "\n%63s\n", "ALICUT V2.0 Finished! Thank you and good bye!" ;
printf "%68s\n", "------------------------------------------------------------" ;
&set_timer ;
exit ;
sub tie_linefeeds{
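# Normalize Mac/Windows line endings (\r and \r\n) to \n in place via Tie::File before the file is parsed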
my $sref_filename = $_[0] ;
( open IN , "<$$sref_filename" ) or warn "\tError: can not open $$sref_filename!\n" and next READING ;
(tie ( my @data, 'Tie::File', $$sref_filename )) ;
warn "\t!FILE-WARN!: $$sref_filename is empty!\n" and next READING if 0 == @data ;
map { s/\r\n/\n/g } @data ;
map { s/\r/\n/g } @data ;
untie @data ; close IN ;
}
sub set_timer{
my ( $user, $system, $cuser, $csystem ) = times ;
print <<TIME;
*** time used: $user sec ***
TIME
}
sub translate_cut_positions {
my $sref_command_codon_remove = $_[0] ;
my $sref_command_third_remove = $_[1] ;
my $aref_cut_positions = $_[2] ;
my $sref_number_characters = $_[3] ;
my $aref_remaining_positions = $_[4] ;
my $sref_sequence_state = $_[5] ;
my $sref_filename = $_[6] ;
# Translate identified RSS aminoacid positions to nucleotide triplet positions
if ( $$sref_command_codon_remove =~ /yes/ && $$sref_command_third_remove =~ /no/){
unless ( $$sref_sequence_state =~ /aa/ ){
my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
for my $number( @fasta_old ){
my $newno1 = ($number*3)-2;
my $newno2 = $newno1+1;
my $newno3 = $newno2+1;
push @$aref_cut_positions, ( $newno1, $newno2, $newno3 )
}
my $string_cutnumbers = join " ", @$aref_cut_positions ;
open OUTnewcut, ">ALICUT_cut_positions_codon.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_codon.txt" ;
print OUTnewcut $string_cutnumbers ; close OUTnewcut ;
}
else { warn "\n\t!FILE-WARN!: $$sref_filename includes aa sequences!\n\tCodon positions not translated!" }
}
# Translate identified RSS aminoacid positions to nucleotide triplet positions, but remove only third position
elsif ( $$sref_command_codon_remove =~ /yes/ && $$sref_command_third_remove =~ /yes/){
unless ( $$sref_sequence_state =~ /aa/ ){
my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
for my $number( @fasta_old ){
push @$aref_cut_positions, ($number*3)
}
my $string_cutnumbers = join " ", @$aref_cut_positions ;
open OUTnewcut, ">ALICUT_cut_positions_codon_3rd.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_codon_3rd.txt" ;
print OUTnewcut $string_cutnumbers ; close OUTnewcut ;
}
else { warn "\n\t!FILE-WARN!: $$sref_filename includes aa sequences!\n\tCodon positions not translated!\n\t3rd codon position not removed!" }
}
# Remove identified RSS positions only if they fall on a third codon position of the original sequence
elsif ( $$sref_command_codon_remove =~ /no/ && $$sref_command_third_remove =~ /yes/){
unless ( $$sref_sequence_state =~ /aa/ ){
my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = ();
for my $number( @fasta_old ){
if ( $number % 3 == 0 ){ push @$aref_cut_positions, $number }
}
my $string_cutnumbers = join " ", @$aref_cut_positions ;
open OUTnewcut, ">ALICUT_cut_positions_3rd.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_3rd.txt" ;
print OUTnewcut $string_cutnumbers ; close OUTnewcut
}
else { warn "\n\t!FILE-WARN!: $$sref_filename includes aa sequences!\n\tRemoval not restricted to 3rd codon positions!" }
}
# Examine remaining positions
my ( %seen, @zahlenreihe ) ;
for ( 1 .. $$sref_number_characters ) { push @zahlenreihe, $_-1 }
for my $value ( @$aref_cut_positions ){ $seen{$value-1}++ }
for ( @zahlenreihe ){ unless ( $seen{$_} ){ push @$aref_remaining_positions, $_ } }
}
}
sub fas_bearbeiten{
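# Linearize the FASTA records so that each sequence name and its concatenated sequence alternate on single lines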
my @infile = @_ ;
grep s/(\>.*)/$1\t/, @infile ;
grep s/ //g, @infile ;
grep s/\n//g, @infile ;
grep s/\t/\n/g, @infile ;
grep s/\>/\n\>/g, @infile ;
my $string = join "", @infile ;
@infile = split "\n", $string ;
shift @infile ;
return @infile ;
}
sub structure_handling{
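# Parse the bracket/dot structure string, write an ALICUT_Struc_info file, and either protect paired stem positions from cutting (remain option) or mark the partners of cut positions as unpaired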
my $sref_string = $_[0] ;
my $sref_answer_remain = $_[1] ;
my $aref_cut_positions = $_[2] ;
my $sref_filename = $_[3] ;
my (
@pair_infos ,
@forward ,
@structurestring ,
@loops ,
@pairs ,
%structure_of_position ,
%seen_struc
);
# Stem assignment
my @structures = split "", $$sref_string ;
my $i = 0 ;
CHECKING:
for ( @structures ){ $i++ ;
SWITCH:
$structure_of_position{$i} = $_ ;
if ( $_ =~ /\(/ ){ push @forward, $i and next CHECKING }
if ( $_ =~ /\)/ ){ my $pair_1 = pop @forward; push @pairs, ( $pair_1, $i ); push @pair_infos, ( $pair_1.":".$i ); next CHECKING }
if ( $_ =~ /\./ ){ push @loops, $i and next CHECKING }
}
@pair_infos = reverse @pair_infos ;
# Generate listfiles for structure_info file
my $pairlist = join "\n\t\t\t\t\t", @pair_infos ;
my $looplist = join "\n\t\t\t\t\t", @loops ;
# Number and proportion of stem and loop positions for structure info file
my $N_total = @structures ;
my $N_stems = @pair_infos ;
my $N_loops = $N_total - ( $N_stems * 2 ) ;
my $P_loops = ( $N_loops / $N_total ) * 100 ;
my $P_stems = 100 - $P_loops ;
# Open structure info outfile
open OUTstruc, ">ALICUT_Struc_info_${$sref_filename}.txt" ;
# Print out
print OUTstruc "\nOriginal structure information identified in $$sref_filename:\n\n" ;
print OUTstruc "- Number of characters:\t\t\t$N_total\n" ;
print OUTstruc "- Number of single loop characters:\t$N_loops [$P_stems %]\n" ;
print OUTstruc "- Number of paired stem characters:\t$N_stems [$P_loops %]\n" ;
print OUTstruc "\n- Paired stem positions:\t\t$pairlist\n\n" ;
print OUTstruc "\n- Loop positions:\t\t\t$looplist\n" ;
close OUTstruc;
if ( $$sref_answer_remain =~ /yes/i ){
my @cut_positions2 = ();
# Remain rss identified stem positions within the MSA
for ( @pairs ){ $seen_struc{$_} = 1 }
for ( @$aref_cut_positions ){ unless ( $seen_struc{$_} ){ push @cut_positions2, $_ } }
@$aref_cut_positions = @cut_positions2 ;
}
else{
my %pair = @pairs;
# Replace paired structure positions of rss identified positions by dots
for my $bp_for ( keys %pair ){
for my $rss ( @$aref_cut_positions ){
if ( $bp_for == $rss ){ $structure_of_position{$pair{$bp_for}} = "." ; last }
if ( $pair{$bp_for} == $rss ){ $structure_of_position{$bp_for} = "." ; last }
}
}
}
for ( my $k=1; $k<=@structures-1; $k++ ){ push @structurestring, $structure_of_position{$k} }
my $structure_string_neu = join "", @structurestring ;
return $structure_string_neu ;
}
sub commandline{
print "\n\tCOMMAND:\t " ;
chomp ( my $sub_answer_opening = <STDIN> );
print "\n\t------------------------------------------------------------\n" ;
return $sub_answer_opening;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
"""
Query NCBI for available genome assemblies by taxon name
Usage:
python query_ncbi_assemblies.py --taxon "Coleoptera"
python query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50
python query_ncbi_assemblies.py --taxon "Apis" --refseq-only
Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib)
Author: Bruno de Medeiros (Field Museum)
"""
import argparse
import sys
def query_assemblies_by_taxon(taxon, max_results=20, refseq_only=False):
"""
Query NCBI for genome assemblies of a given taxon
Args:
taxon: Taxon name (e.g., "Coleoptera", "Drosophila melanogaster")
max_results: Maximum number of results to return
refseq_only: If True, only return RefSeq assemblies (GCF_*)
Returns:
List of dictionaries with assembly information
"""
try:
from ncbi.datasets import GenomeApi
from ncbi.datasets.openapi import ApiClient, ApiException
except ImportError:
print("Error: ncbi-datasets-pylib not installed", file=sys.stderr)
print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr)
sys.exit(1)
assemblies = []
print(f"Querying NCBI for '{taxon}' genome assemblies...")
print(f"(Limiting to {max_results} results)")
if refseq_only:
print("(RefSeq assemblies only)")
print("")
try:
with ApiClient() as api_client:
api = GenomeApi(api_client)
# Query genome assemblies for the taxon
genome_summary = api.genome_summary_by_taxon(
taxon=taxon,
limit=str(max_results),
filters_refseq_only=refseq_only
)
if not genome_summary.reports:
print(f"No assemblies found for taxon '{taxon}'")
return []
for report in genome_summary.reports:
assembly_info = {
'accession': report.accession,
'organism': report.organism.organism_name,
'assembly_level': report.assembly_info.assembly_level,
'assembly_name': report.assembly_info.assembly_name,
'submission_date': report.assembly_info.release_date if hasattr(report.assembly_info, 'release_date') else 'N/A'
}
assemblies.append(assembly_info)
except ApiException as e:
print(f"Error querying NCBI: {e}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Unexpected error: {e}", file=sys.stderr)
sys.exit(1)
return assemblies
def format_table(assemblies):
"""
Format assemblies as a readable table
Args:
assemblies: List of assembly dictionaries
"""
if not assemblies:
return
print(f"Found {len(assemblies)} assemblies:\n")
# Print header
print(f"{'#':<4} {'Accession':<20} {'Organism':<40} {'Level':<15} {'Assembly Name':<30}")
print("-" * 110)
# Print data rows
for i, asm in enumerate(assemblies, 1):
organism = asm['organism'][:38] + '..' if len(asm['organism']) > 40 else asm['organism']
assembly_name = asm['assembly_name'][:28] + '..' if len(asm['assembly_name']) > 30 else asm['assembly_name']
print(f"{i:<4} {asm['accession']:<20} {organism:<40} {asm['assembly_level']:<15} {assembly_name:<30}")
print("")
def save_accessions(assemblies, output_file):
"""
Save assembly accessions to a file
Args:
assemblies: List of assembly dictionaries
output_file: Output file path
"""
with open(output_file, 'w') as f:
for asm in assemblies:
f.write(f"{asm['accession']}\n")
print(f"Accessions saved to: {output_file}")
print(f"You can download these assemblies using:")
print(f" python download_ncbi_genomes.py --assemblies $(cat {output_file})")
def main():
parser = argparse.ArgumentParser(
description="Query NCBI for available genome assemblies by taxon name",
epilog="Example: python query_ncbi_assemblies.py --taxon 'Coleoptera' --max-results 50"
)
parser.add_argument(
"--taxon",
required=True,
help="Taxon name (e.g., 'Coleoptera', 'Drosophila melanogaster')"
)
parser.add_argument(
"--max-results",
type=int,
default=20,
help="Maximum number of results to return (default: 20)"
)
parser.add_argument(
"--refseq-only",
action="store_true",
help="Only return RefSeq assemblies (GCF_* accessions)"
)
parser.add_argument(
"--save",
metavar="FILE",
help="Save accessions to a file for later download"
)
args = parser.parse_args()
# Query NCBI
assemblies = query_assemblies_by_taxon(
taxon=args.taxon,
max_results=args.max_results,
refseq_only=args.refseq_only
)
# Display results
format_table(assemblies)
# Save if requested
if args.save and assemblies:
save_accessions(assemblies, args.save)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Rename genome files with clean, meaningful sample names for phylogenomics
This script helps create a mapping between genome files (often with cryptic
accession numbers) and clean species/sample names that will appear in the
final phylogenetic tree.
Usage:
# Interactive mode - prompts for names
python rename_genomes.py --interactive genome1.fasta genome2.fasta
# From mapping file (TSV: old_name<TAB>new_name)
python rename_genomes.py --mapping samples.tsv
# Create template mapping file
python rename_genomes.py --create-template *.fasta > samples.tsv
Author: Bruno de Medeiros (Field Museum)
Based on tutorials by Paul Frandsen (BYU)
"""
import argparse
import os
import sys
import shutil
from pathlib import Path
def sanitize_name(name):
"""
Sanitize a name to be phylogenomics-safe
- Replace spaces with underscores
- Remove special characters
- Keep only alphanumeric, underscore, hyphen
"""
# Replace spaces with underscores
name = name.replace(' ', '_')
# Remove special characters except underscore and hyphen
name = ''.join(c for c in name if c.isalnum() or c in '_-')
return name
def create_template(genome_files, output=sys.stdout):
"""Create a template mapping file"""
output.write("# Sample mapping file\n")
output.write("# Format: original_filename<TAB>new_sample_name\n")
output.write("# Edit the second column with meaningful species/sample names\n")
output.write("# Recommended format: [ACCESSION]_[NAME] (e.g., GCA000123456_Penstemon_eatonii)\n")
output.write("# This keeps accession for traceability while having readable names in trees\n")
output.write("# Names should contain only letters, numbers, underscores, and hyphens\n")
output.write("#\n")
for gfile in genome_files:
basename = Path(gfile).stem # Remove extension
output.write(f"{gfile}\t{basename}\n")
def read_mapping(mapping_file):
"""Read mapping from TSV file"""
mapping = {}
with open(mapping_file, 'r') as f:
for line in f:
line = line.strip()
# Skip comments and empty lines
if not line or line.startswith('#'):
continue
parts = line.split('\t')
if len(parts) != 2:
print(f"Warning: Skipping invalid line: {line}", file=sys.stderr)
continue
old_name, new_name = parts
new_name = sanitize_name(new_name)
mapping[old_name] = new_name
return mapping
def interactive_rename(genome_files):
"""Interactively ask for new names"""
mapping = {}
print("Enter new sample names for each genome file.")
print("Press Enter to keep the current name.")
print("Names will be sanitized (spaces→underscores, special chars removed)\n")
for gfile in genome_files:
current_name = Path(gfile).stem
new_name = input(f"{gfile} → [{current_name}]: ").strip()
if not new_name:
new_name = current_name
new_name = sanitize_name(new_name)
mapping[gfile] = new_name
print(f" Will rename to: {new_name}.fasta\n")
return mapping
def rename_files(mapping, dry_run=False, backup=True):
"""Rename genome files according to mapping"""
renamed = []
errors = []
for old_file, new_name in mapping.items():
if not os.path.exists(old_file):
errors.append(f"File not found: {old_file}")
continue
# Get extension from original file
ext = Path(old_file).suffix
if not ext:
ext = '.fasta'
new_file = f"{new_name}{ext}"
# Check if target exists
if os.path.exists(new_file) and new_file != old_file:
errors.append(f"Target exists: {new_file}")
continue
# Skip if names are the same
if old_file == new_file:
print(f"Skip (no change): {old_file}")
continue
if dry_run:
print(f"[DRY RUN] Would rename: {old_file}{new_file}")
else:
# Backup if requested
if backup:
backup_file = f"{old_file}.backup"
shutil.copy2(old_file, backup_file)
print(f"Backup created: {backup_file}")
# Rename
shutil.move(old_file, new_file)
print(f"Renamed: {old_file}{new_file}")
renamed.append((old_file, new_file))
return renamed, errors
def main():
parser = argparse.ArgumentParser(
description="Rename genome files with meaningful sample names for phylogenomics",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Create template mapping file
python rename_genomes.py --create-template *.fasta > samples.tsv
# Edit samples.tsv, then apply mapping
python rename_genomes.py --mapping samples.tsv
# Interactive renaming
python rename_genomes.py --interactive genome1.fasta genome2.fasta
# Dry run (preview changes)
python rename_genomes.py --mapping samples.tsv --dry-run
"""
)
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
'--create-template',
nargs='+',
metavar='GENOME',
help='Create a template mapping file from genome files'
)
group.add_argument(
'--mapping',
metavar='FILE',
help='TSV file with mapping (old_name<TAB>new_name)'
)
group.add_argument(
'--interactive',
nargs='+',
metavar='GENOME',
help='Interactively rename genome files'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Show what would be renamed without actually renaming'
)
parser.add_argument(
'--no-backup',
action='store_true',
help='Do not create backup files'
)
args = parser.parse_args()
# Create template
if args.create_template:
create_template(args.create_template)
return
# Interactive mode
if args.interactive:
mapping = interactive_rename(args.interactive)
# Mapping file mode
elif args.mapping:
mapping = read_mapping(args.mapping)
else:
parser.error("No mode specified")
if not mapping:
print("No files to rename", file=sys.stderr)
return
# Perform renaming
renamed, errors = rename_files(
mapping,
dry_run=args.dry_run,
backup=not args.no_backup
)
# Summary
print("\n" + "="*60)
if args.dry_run:
print("DRY RUN - No files were actually renamed")
else:
print(f"Successfully renamed {len(renamed)} file(s)")
if errors:
print(f"\nErrors ({len(errors)}):")
for error in errors:
print(f" - {error}")
sys.exit(1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,247 @@
#!/bin/bash
# run_alicut.sh
# Wrapper script for running ALICUT to remove Aliscore-identified RSS positions
# Removes randomly similar sequence sections from alignments
#
# Usage:
# bash run_alicut.sh [aliscore_dir] [options]
#
# Options:
# -r Remain stem positions (for RNA secondary structures)
# -c Remove codon (translate AA positions to nucleotide triplets)
# -3 Remove only 3rd codon positions
# -s Silent mode (non-interactive, use defaults)
#
# Requirements:
# - ALICUT_V2.31.pl in PATH or same directory
# - Perl with File::Copy, Tie::File, Term::Cap modules
# - Aliscore output directory with *_List_*.txt and original .fas file
set -euo pipefail
# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Check for ALICUT script
if command -v ALICUT_V2.31.pl &> /dev/null; then
ALICUT_SCRIPT="ALICUT_V2.31.pl"
elif [ -f "${SCRIPT_DIR}/ALICUT_V2.31.pl" ]; then
ALICUT_SCRIPT="${SCRIPT_DIR}/ALICUT_V2.31.pl"
elif [ -f "./ALICUT_V2.31.pl" ]; then
ALICUT_SCRIPT="./ALICUT_V2.31.pl"
else
echo "ERROR: ALICUT_V2.31.pl not found in PATH, script directory, or current directory"
echo "Please download from: https://www.zfmk.de/en/research/research-centres-and-groups/alicut"
exit 1
fi
# Function to display usage
usage() {
cat <<EOF
Usage: $0 [aliscore_dir] [options]
Run ALICUT to remove Aliscore-identified randomly similar sequence sections.
Arguments:
aliscore_dir Directory containing Aliscore output files
Options:
-r Remain stem positions in RNA secondary structure alignments
-c Remove entire codon (translates AA RSS positions to nt triplets)
-3 Remove only 3rd codon position of identified RSS
-s Silent/scripted mode (non-interactive, use defaults)
-h Display this help message
Input Requirements:
The aliscore_dir must contain:
- Original FASTA alignment file (*.fas)
- Aliscore List file (*_List_random.txt or *_List_*.txt)
Examples:
# Basic usage (interactive mode)
bash run_alicut.sh aliscore_alignment1
# Silent mode with defaults
bash run_alicut.sh aliscore_alignment1 -s
# Remain RNA stem positions
bash run_alicut.sh aliscore_16S -r -s
# Remove entire codons (for back-translation)
bash run_alicut.sh aliscore_protein1 -c -s
# Process all Aliscore output directories
for dir in aliscore_*/; do
bash run_alicut.sh "\${dir}" -s
done
Output Files (in aliscore_dir):
- ALICUT_[alignment].fas : Trimmed alignment
- ALICUT_info.xls : Statistics (taxa, positions removed, etc.)
- ALICUT_Struc_info_*.txt : Structure information (if RNA detected)
Citation:
Kück P, Meusemann K, Dambach J, Thormann B, von Reumont BM, Wägele JW,
Misof B (2010) Parametric and non-parametric masking of randomness in
sequence alignments can be improved and leads to better resolved trees.
Front Zool 7:10. doi: 10.1186/1742-9994-7-10
EOF
exit 0
}
# Parse command line arguments
ALISCORE_DIR=""
ALICUT_OPTS=""
SILENT_MODE=false
if [ $# -eq 0 ]; then
usage
fi
ALISCORE_DIR="$1"
shift
# Validate directory exists
if [ ! -d "${ALISCORE_DIR}" ]; then
echo "ERROR: Aliscore directory not found: ${ALISCORE_DIR}"
exit 1
fi
# Parse ALICUT options
while [ $# -gt 0 ]; do
case "$1" in
-h|--help)
usage
;;
-r)
ALICUT_OPTS="${ALICUT_OPTS} -r"
shift
;;
-c)
ALICUT_OPTS="${ALICUT_OPTS} -c"
shift
;;
-3)
ALICUT_OPTS="${ALICUT_OPTS} -3"
shift
;;
-s|--silent)
SILENT_MODE=true
ALICUT_OPTS="${ALICUT_OPTS} -s"
shift
;;
*)
echo "ERROR: Unknown option: $1"
usage
;;
esac
done
# Change to Aliscore output directory
cd "${ALISCORE_DIR}"
echo "Processing Aliscore output in: ${ALISCORE_DIR}"
# Find List file
# Use find (not a bare glob with ls) so an absent List file does not abort the script under set -e/pipefail
LIST_FILE=$(find . -maxdepth 1 -name "*_List_*.txt" -type f | head -n 1 | sed 's|^\./||')
if [ -z "${LIST_FILE}" ]; then
echo "ERROR: No Aliscore List file found (*_List_*.txt)"
echo "Make sure Aliscore completed successfully"
exit 1
fi
echo "Found List file: ${LIST_FILE}"
# Find original FASTA file
FASTA_FILE=$(find . -maxdepth 1 \( -name "*.fas" -o -name "*.fasta" \) -type f | head -n 1 | sed 's|^\./||')
if [ -z "${FASTA_FILE}" ]; then
echo "ERROR: No FASTA alignment file found (*.fas or *.fasta)"
echo "ALICUT requires the original alignment file in the same directory as List file"
exit 1
fi
echo "Found FASTA file: ${FASTA_FILE}"
# Check if List file contains RSS positions
RSS_COUNT=$(wc -w < "${LIST_FILE}" || echo "0")
if [ "${RSS_COUNT}" -eq 0 ]; then
echo "WARNING: List file is empty (no RSS positions identified)"
echo "Aliscore found no randomly similar sequences to remove"
echo "Skipping ALICUT - alignment is already clean"
# Create a symbolic link to indicate no trimming was needed
ln -sf "${FASTA_FILE}" "ALICUT_${FASTA_FILE}"
echo "Created symbolic link: ALICUT_${FASTA_FILE} -> ${FASTA_FILE}"
cd ..
exit 0
fi
echo "Found ${RSS_COUNT} RSS positions to remove"
# Run ALICUT
echo ""
echo "Running ALICUT..."
echo "Options: ${ALICUT_OPTS}"
# Construct ALICUT command
ALICUT_CMD="perl ${ALICUT_SCRIPT} ${ALICUT_OPTS}"
if [ "${SILENT_MODE}" = true ]; then
echo "Command: ${ALICUT_CMD}"
eval ${ALICUT_CMD}
else
echo "Running ALICUT in interactive mode..."
echo "Press 's' and Enter to start with current options"
echo ""
perl "${ALICUT_SCRIPT}" ${ALICUT_OPTS}
fi
# Check if ALICUT completed successfully
if [ $? -eq 0 ]; then
echo ""
echo "ALICUT completed successfully"
# Find output file
OUTPUT_FILE=$(find . -maxdepth 1 \( -name "ALICUT_*.fas" -o -name "ALICUT_*.fasta" \) -type f | head -n 1 | sed 's|^\./||')
if [ -n "${OUTPUT_FILE}" ]; then
echo ""
echo "Output files:"
ls -lh ALICUT_* 2>/dev/null
# Calculate and report trimming statistics (handle multi-line FASTA format)
if [ -f "${OUTPUT_FILE}" ]; then
ORIGINAL_LENGTH=$(awk '/^>/ {if (seq) {print seq; seq=""}; next} {seq = seq $0} END {if (seq) print seq}' "${FASTA_FILE}" | head -n 1 | wc -c)
TRIMMED_LENGTH=$(awk '/^>/ {if (seq) {print seq; seq=""}; next} {seq = seq $0} END {if (seq) print seq}' "${OUTPUT_FILE}" | head -n 1 | wc -c)
REMOVED_LENGTH=$((ORIGINAL_LENGTH - TRIMMED_LENGTH))
PERCENT_REMOVED=$(awk "BEGIN {printf \"%.1f\", (${REMOVED_LENGTH}/${ORIGINAL_LENGTH})*100}")
echo ""
echo "Trimming statistics:"
echo " Original length: ${ORIGINAL_LENGTH} bp"
echo " Trimmed length: ${TRIMMED_LENGTH} bp"
echo " Removed: ${REMOVED_LENGTH} bp (${PERCENT_REMOVED}%)"
fi
# Check for info file
if [ -f "ALICUT_info.xls" ]; then
echo ""
echo "Detailed statistics in: ALICUT_info.xls"
fi
else
echo "WARNING: Expected output file ALICUT_*.fas not found"
fi
else
echo "ERROR: ALICUT failed"
cd ..
exit 1
fi
# Return to parent directory
cd ..
echo ""
echo "Done: ${ALISCORE_DIR}"

View File

@@ -0,0 +1,248 @@
#!/bin/bash
# run_aliscore.sh
# Wrapper script for running Aliscore on aligned sequences
# Identifies randomly similar sequence sections (RSS) in multiple sequence alignments
#
# Usage:
# bash run_aliscore.sh [alignment.fas] [options]
#
# Options:
# -w INT Window size (default: 4)
# -r INT Number of random pairs to compare (default: 4*N taxa)
# -N Treat gaps as ambiguous characters (recommended for amino acids)
# -t TREE Tree file in Newick format for guided comparisons
# -l LEVEL Node level for tree-based comparisons
# -o TAXA Comma-separated list of outgroup taxa
#
# Array job usage:
# Set SLURM_ARRAY_TASK_ID or PBS_ARRAYID environment variable
# Create locus_list.txt with one alignment file per line
#
# Requirements:
# - Aliscore.02.2.pl in PATH or same directory
# - Perl with Tie::File and Fcntl modules
set -euo pipefail
# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Check for Aliscore script
if command -v Aliscore.02.2.pl &> /dev/null; then
ALISCORE_SCRIPT="Aliscore.02.2.pl"
elif [ -f "${SCRIPT_DIR}/Aliscore.02.2.pl" ]; then
ALISCORE_SCRIPT="${SCRIPT_DIR}/Aliscore.02.2.pl"
elif [ -f "./Aliscore.02.2.pl" ]; then
ALISCORE_SCRIPT="./Aliscore.02.2.pl"
else
echo "ERROR: Aliscore.02.2.pl not found in PATH, script directory, or current directory"
echo "Please download from: https://www.zfmk.de/en/research/research-centres-and-groups/aliscore"
exit 1
fi
# Function to display usage
usage() {
cat <<EOF
Usage: $0 [alignment.fas] [options]
Run Aliscore to identify randomly similar sequence sections in alignments.
Options:
-d DIR Base output directory for all Aliscore results (default: aliscore_output)
-w INT Window size for sliding window analysis (default: 4)
-r INT Number of random sequence pairs to compare (default: 4*N taxa)
-N Treat gaps as ambiguous characters (recommended for amino acids)
-t FILE Tree file in Newick format for phylogeny-guided comparisons
-l LEVEL Node level limit for tree-based comparisons (default: all)
-o TAXA Comma-separated list of outgroup taxa for focused comparisons
-h Display this help message
Array Job Mode:
If SLURM_ARRAY_TASK_ID or PBS_ARRAYID is set, reads alignment from locus_list.txt
Create locus_list.txt with: ls *.fas > locus_list.txt
Examples:
# Basic run with defaults (outputs to aliscore_output/)
bash run_aliscore.sh alignment.fas
# Amino acid sequences with gaps as ambiguous
bash run_aliscore.sh protein_alignment.fas -N
# Custom output directory
bash run_aliscore.sh alignment.fas -d my_aliscore_results
# Custom window size and random pairs
bash run_aliscore.sh alignment.fas -w 6 -r 100
# Tree-guided analysis
bash run_aliscore.sh alignment.fas -t species.tre
# Array job on SLURM
ls aligned_aa/*.fas > locus_list.txt
sbatch --array=1-\$(wc -l < locus_list.txt) run_aliscore_array.job
Output Files (in aliscore_output/aliscore_[alignment]/):
- [alignment]_List_random.txt : Positions identified as RSS (for ALICUT)
- [alignment]_Profile_random.txt: Quality profile for each position
- [alignment].svg : Visual plot of scoring profiles
Citation:
Misof B, Misof K (2009) A Monte Carlo approach successfully identifies
randomness in multiple sequence alignments: a more objective means of data
exclusion. Syst Biol 58(1):21-34. doi: 10.1093/sysbio/syp006
EOF
exit 0
}
# Parse command line arguments
ALIGNMENT=""
ALISCORE_OPTS=""
BASE_OUTPUT_DIR="aliscore_output"
if [ $# -eq 0 ]; then
usage
fi
# Check for array job mode
ARRAY_MODE=false
ARRAY_ID=""
if [ -n "${SLURM_ARRAY_TASK_ID:-}" ]; then
ARRAY_MODE=true
ARRAY_ID="${SLURM_ARRAY_TASK_ID}"
elif [ -n "${PBS_ARRAYID:-}" ]; then
ARRAY_MODE=true
ARRAY_ID="${PBS_ARRAYID}"
fi
# If in array mode, get alignment from locus list
if [ "${ARRAY_MODE}" = true ]; then
if [ ! -f "locus_list.txt" ]; then
echo "ERROR: Array job mode requires locus_list.txt"
echo "Create with: ls *.fas > locus_list.txt"
exit 1
fi
ALIGNMENT=$(sed -n "${ARRAY_ID}p" locus_list.txt)
if [ -z "${ALIGNMENT}" ]; then
echo "ERROR: Could not read alignment for array index ${ARRAY_ID}"
exit 1
fi
echo "Array job ${ARRAY_ID}: Processing ${ALIGNMENT}"
# Remaining arguments (if any) are Aliscore options and are parsed below
else
# First argument is alignment file
ALIGNMENT="$1"
shift
fi
# Validate alignment file exists
if [ ! -f "${ALIGNMENT}" ]; then
echo "ERROR: Alignment file not found: ${ALIGNMENT}"
exit 1
fi
# Parse Aliscore options
while [ $# -gt 0 ]; do
case "$1" in
-h|--help)
usage
;;
-d|--output-dir)
BASE_OUTPUT_DIR="$2"
shift 2
;;
-w)
ALISCORE_OPTS="${ALISCORE_OPTS} -w $2"
shift 2
;;
-r)
ALISCORE_OPTS="${ALISCORE_OPTS} -r $2"
shift 2
;;
-N)
ALISCORE_OPTS="${ALISCORE_OPTS} -N"
shift
;;
-t)
if [ ! -f "$2" ]; then
echo "ERROR: Tree file not found: $2"
exit 1
fi
ALISCORE_OPTS="${ALISCORE_OPTS} -t $2"
shift 2
;;
-l)
ALISCORE_OPTS="${ALISCORE_OPTS} -l $2"
shift 2
;;
-o)
ALISCORE_OPTS="${ALISCORE_OPTS} -o $2"
shift 2
;;
*)
echo "ERROR: Unknown option: $1"
usage
;;
esac
done
# Get alignment name without extension
ALIGNMENT_NAME=$(basename "${ALIGNMENT}" .fas)
ALIGNMENT_NAME=$(basename "${ALIGNMENT_NAME}" .fasta)
# Create base output directory and specific directory for this alignment
mkdir -p "${BASE_OUTPUT_DIR}"
OUTPUT_DIR="${BASE_OUTPUT_DIR}/aliscore_${ALIGNMENT_NAME}"
mkdir -p "${OUTPUT_DIR}"
# Copy alignment to output directory
cp "${ALIGNMENT}" "${OUTPUT_DIR}/"
# Change to output directory
cd "${OUTPUT_DIR}"
# Run Aliscore
echo "Running Aliscore on ${ALIGNMENT}..."
echo "Options: ${ALISCORE_OPTS}"
echo "Aliscore script: ${ALISCORE_SCRIPT}"
# Construct and run Aliscore command
ALISCORE_CMD="perl -I${SCRIPT_DIR} ${ALISCORE_SCRIPT} -i $(basename ${ALIGNMENT}) ${ALISCORE_OPTS}"
echo "Command: ${ALISCORE_CMD}"
eval ${ALISCORE_CMD}
# Check if Aliscore completed successfully
if [ $? -eq 0 ]; then
echo "Aliscore completed successfully for ${ALIGNMENT}"
# List output files
echo ""
echo "Output files in ${OUTPUT_DIR}:"
ls -lh *List*.txt *Profile*.txt *.svg 2>/dev/null || echo " (some expected files not generated)"
# Report RSS positions if found
if [ -f "$(basename ${ALIGNMENT})_List_random.txt" ]; then
RSS_COUNT=$(wc -w < "$(basename ${ALIGNMENT})_List_random.txt")
echo ""
echo "Identified ${RSS_COUNT} randomly similar sequence positions"
echo "See: ${OUTPUT_DIR}/$(basename ${ALIGNMENT})_List_random.txt"
fi
else
echo "ERROR: Aliscore failed for ${ALIGNMENT}"
cd ..
exit 1
fi
# Return to parent directory
cd ..
echo "Done: ${ALIGNMENT} -> ${OUTPUT_DIR}"

View File

@@ -0,0 +1,270 @@
#!/bin/bash
# run_aliscore_alicut_batch.sh
# Batch processing script for Aliscore + ALICUT alignment trimming
# Processes all alignments in a directory through both tools sequentially
#
# Usage:
# bash run_aliscore_alicut_batch.sh [alignment_dir] [options]
#
# This script:
# 1. Runs Aliscore on all alignments to identify RSS
# 2. Runs ALICUT on each Aliscore output to remove RSS
# 3. Collects trimmed alignments in output directory
#
# Requirements:
# - run_aliscore.sh and run_alicut.sh in same directory or PATH
# - Aliscore.02.2.pl and ALICUT_V2.31.pl available
set -euo pipefail
# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Function to display usage
usage() {
cat <<EOF
Usage: $0 [alignment_dir] [options]
Batch process multiple alignments through Aliscore and ALICUT.
Arguments:
alignment_dir Directory containing aligned FASTA files (*.fas)
Options:
-o DIR Output directory for trimmed alignments (default: aliscore_alicut_trimmed)
-d DIR Base directory for Aliscore outputs (default: aliscore_output)
-w INT Aliscore window size (default: 4)
-r INT Aliscore random pairs (default: 4*N)
-N Aliscore: treat gaps as ambiguous (recommended for AA)
--remain-stems ALICUT: remain RNA stem positions
--remove-codon ALICUT: remove entire codons (for back-translation)
--remove-3rd ALICUT: remove only 3rd codon positions
-h Display this help message
Examples:
# Basic usage for amino acid alignments
bash run_aliscore_alicut_batch.sh aligned_aa/ -N
# Custom window size
bash run_aliscore_alicut_batch.sh aligned_aa/ -w 6 -N
# With RNA structure preservation
bash run_aliscore_alicut_batch.sh aligned_rrna/ --remain-stems
Output:
- aliscore_output/aliscore_[locus]/ : Individual Aliscore results per locus
- aliscore_alicut_trimmed/ : Final trimmed alignments
- aliscore_alicut_trimmed/trimming_summary.txt : Statistics for all loci
EOF
exit 0
}
# Default parameters
ALIGNMENT_DIR=""
OUTPUT_DIR="aliscore_alicut_trimmed"
ALISCORE_BASE_DIR="aliscore_output"
ALISCORE_OPTS=""
ALICUT_OPTS="-s" # Silent mode by default
if [ $# -eq 0 ]; then
usage
fi
ALIGNMENT_DIR="$1"
shift
# Validate alignment directory
if [ ! -d "${ALIGNMENT_DIR}" ]; then
echo "ERROR: Alignment directory not found: ${ALIGNMENT_DIR}"
exit 1
fi
# Parse options
while [ $# -gt 0 ]; do
case "$1" in
-h|--help)
usage
;;
-o|--output)
OUTPUT_DIR="$2"
shift 2
;;
-d|--aliscore-dir)
ALISCORE_BASE_DIR="$2"
shift 2
;;
-w)
ALISCORE_OPTS="${ALISCORE_OPTS} -w $2"
shift 2
;;
-r)
ALISCORE_OPTS="${ALISCORE_OPTS} -r $2"
shift 2
;;
-N)
ALISCORE_OPTS="${ALISCORE_OPTS} -N"
shift
;;
--remain-stems)
ALICUT_OPTS="${ALICUT_OPTS} -r"
shift
;;
--remove-codon)
ALICUT_OPTS="${ALICUT_OPTS} -c"
shift
;;
--remove-3rd)
ALICUT_OPTS="${ALICUT_OPTS} -3"
shift
;;
*)
echo "ERROR: Unknown option: $1"
usage
;;
esac
done
# Check for wrapper scripts
RUN_ALISCORE="${SCRIPT_DIR}/run_aliscore.sh"
RUN_ALICUT="${SCRIPT_DIR}/run_alicut.sh"
if [ ! -f "${RUN_ALISCORE}" ]; then
echo "ERROR: run_aliscore.sh not found: ${RUN_ALISCORE}"
exit 1
fi
if [ ! -f "${RUN_ALICUT}" ]; then
echo "ERROR: run_alicut.sh not found: ${RUN_ALICUT}"
exit 1
fi
# Create output directory
mkdir -p "${OUTPUT_DIR}"
# Find all FASTA files
ALIGNMENTS=($(find "${ALIGNMENT_DIR}" -maxdepth 1 -name "*.fas" -o -name "*.fasta"))
if [ ${#ALIGNMENTS[@]} -eq 0 ]; then
echo "ERROR: No FASTA files found in ${ALIGNMENT_DIR}"
exit 1
fi
echo "Found ${#ALIGNMENTS[@]} alignments to process"
echo "Aliscore options: ${ALISCORE_OPTS}"
echo "ALICUT options: ${ALICUT_OPTS}"
echo ""
# Initialize summary file
SUMMARY_FILE="${OUTPUT_DIR}/trimming_summary.txt"
echo -e "Locus\tOriginal_Length\tTrimmed_Length\tRemoved_Positions\tPercent_Removed\tRSS_Count" > "${SUMMARY_FILE}"
# Process each alignment
SUCCESS_COUNT=0
FAIL_COUNT=0
for ALIGNMENT in "${ALIGNMENTS[@]}"; do
LOCUS=$(basename "${ALIGNMENT}" .fas)
LOCUS=$(basename "${LOCUS}" .fasta)
echo "=========================================="
echo "Processing: ${LOCUS}"
echo "=========================================="
# Step 1: Run Aliscore
echo ""
echo "Step 1/2: Running Aliscore..."
if bash "${RUN_ALISCORE}" "${ALIGNMENT}" -d "${ALISCORE_BASE_DIR}" ${ALISCORE_OPTS}; then
echo "Aliscore completed for ${LOCUS}"
else
echo "ERROR: Aliscore failed for ${LOCUS}"
FAIL_COUNT=$((FAIL_COUNT + 1))
continue
fi
# Step 2: Run ALICUT
echo ""
echo "Step 2/2: Running ALICUT..."
ALISCORE_DIR="${ALISCORE_BASE_DIR}/aliscore_${LOCUS}"
if [ ! -d "${ALISCORE_DIR}" ]; then
echo "ERROR: Aliscore output directory not found: ${ALISCORE_DIR}"
FAIL_COUNT=$((FAIL_COUNT + 1))
continue
fi
if bash "${RUN_ALICUT}" "${ALISCORE_DIR}" ${ALICUT_OPTS}; then
echo "ALICUT completed for ${LOCUS}"
else
echo "ERROR: ALICUT failed for ${LOCUS}"
FAIL_COUNT=$((FAIL_COUNT + 1))
continue
fi
# Copy trimmed alignment to output directory
TRIMMED_FILE=$(find "${ALISCORE_DIR}" -name "ALICUT_*.fas" -o -name "ALICUT_*.fasta" | head -n 1)
if [ -n "${TRIMMED_FILE}" ] && [ -f "${TRIMMED_FILE}" ]; then
cp "${TRIMMED_FILE}" "${OUTPUT_DIR}/${LOCUS}_trimmed.fas"
echo "Trimmed alignment: ${OUTPUT_DIR}/${LOCUS}_trimmed.fas"
# Calculate statistics (handle multi-line FASTA format)
ORIGINAL_LENGTH=$(awk '/^>/ {if (seq) {print seq; seq=""}; next} {seq = seq $0} END {if (seq) print seq}' "${ALIGNMENT}" | head -n 1 | tr -d ' ' | wc -c)
TRIMMED_LENGTH=$(awk '/^>/ {if (seq) {print seq; seq=""}; next} {seq = seq $0} END {if (seq) print seq}' "${TRIMMED_FILE}" | head -n 1 | tr -d ' ' | wc -c)
REMOVED_LENGTH=$((ORIGINAL_LENGTH - TRIMMED_LENGTH))
PERCENT_REMOVED=$(awk "BEGIN {printf \"%.2f\", (${REMOVED_LENGTH}/${ORIGINAL_LENGTH})*100}")
# Count RSS positions
LIST_FILE=$(find "${ALISCORE_DIR}" -name "*_List_*.txt" | head -n 1)
RSS_COUNT=$(wc -w < "${LIST_FILE}" 2>/dev/null || echo "0")
# Append to summary
echo -e "${LOCUS}\t${ORIGINAL_LENGTH}\t${TRIMMED_LENGTH}\t${REMOVED_LENGTH}\t${PERCENT_REMOVED}\t${RSS_COUNT}" >> "${SUMMARY_FILE}"
SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
else
echo "WARNING: Trimmed file not found for ${LOCUS}"
FAIL_COUNT=$((FAIL_COUNT + 1))
fi
echo ""
done
# Final report
echo "=========================================="
echo "BATCH PROCESSING COMPLETE"
echo "=========================================="
echo ""
echo "Successfully processed: ${SUCCESS_COUNT}/${#ALIGNMENTS[@]} alignments"
echo "Failed: ${FAIL_COUNT}/${#ALIGNMENTS[@]} alignments"
echo ""
echo "Output directory: ${OUTPUT_DIR}"
echo "Trimmed alignments: ${OUTPUT_DIR}/*_trimmed.fas"
echo "Summary statistics: ${SUMMARY_FILE}"
echo ""
# Display summary statistics
if [ ${SUCCESS_COUNT} -gt 0 ]; then
echo "Overall trimming statistics:"
awk 'NR>1 {
total_orig += $2;
total_trim += $3;
total_removed += $4;
count++
}
END {
if (count > 0) {
avg_removed = (total_removed / total_orig) * 100;
printf " Total positions before: %d\n", total_orig;
printf " Total positions after: %d\n", total_trim;
printf " Total removed: %d (%.2f%%)\n", total_removed, avg_removed;
printf " Average per locus: %.2f%% removed\n", avg_removed;
}
}' "${SUMMARY_FILE}"
fi
echo ""
echo "Done!"

View File

@@ -0,0 +1,125 @@
# Phylogenomics Workflow Templates
This directory contains template scripts for running the phylogenomics pipeline across different computing environments.
## Directory Structure
```
templates/
├── slurm/ # SLURM job scheduler templates
├── pbs/ # PBS/Torque job scheduler templates
└── local/ # Local machine templates (with GNU parallel support)
```
## Template Naming Convention
Templates follow a consistent naming pattern: `NN_step_name[_variant].ext`
- `NN`: Step number (e.g., `02` for compleasm, `08a` for partition search)
- `step_name`: Descriptive name of the pipeline step
- `_variant`: Optional variant (e.g., `_first`, `_parallel`, `_serial`)
- `.ext`: File extension (`.job` for schedulers, `.sh` for local scripts)
## Available Templates
### Step 2: Ortholog Identification (compleasm)
**SLURM:**
- `02_compleasm_first.job` - Process first genome to download lineage database
- `02_compleasm_parallel.job` - Array job for remaining genomes
**PBS:**
- `02_compleasm_first.job` - Process first genome to download lineage database
- `02_compleasm_parallel.job` - Array job for remaining genomes
**Local:**
- `02_compleasm_first.sh` - Process first genome to download lineage database
- `02_compleasm_parallel.sh` - GNU parallel for remaining genomes
### Step 8A: Partition Model Selection
**SLURM:**
- `08a_partition_search.job` - IQ-TREE partition model search with TESTMERGEONLY
**PBS:**
- `08a_partition_search.job` - IQ-TREE partition model search with TESTMERGEONLY
**Local:**
- `08a_partition_search.sh` - IQ-TREE partition model search with TESTMERGEONLY
### Step 8C: Individual Gene Trees
**SLURM:**
- `08c_gene_trees_array.job` - Array job for parallel gene tree estimation
**PBS:**
- `08c_gene_trees_array.job` - Array job for parallel gene tree estimation
**Local:**
- `08c_gene_trees_parallel.sh` - GNU parallel for gene tree estimation
- `08c_gene_trees_serial.sh` - Serial processing (for debugging/limited resources)
## Placeholders
Templates contain placeholders that must be replaced with user-specific values:
| Placeholder | Description | Example |
|-------------|-------------|---------|
| `TOTAL_THREADS` | Total CPU cores available | `64` |
| `THREADS_PER_JOB` | Threads per concurrent job | `16` |
| `NUM_GENOMES` | Number of genomes in analysis | `20` |
| `NUM_LOCI` | Number of loci/alignments | `2795` |
| `LINEAGE` | BUSCO lineage dataset | `insecta_odb10` |
| `MODEL_SET` | Comma-separated substitution models | `LG,WAG,JTT,Q.pfam` |
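Placeholder substitution is plain text replacement, so a template can also be filled in by hand. A minimal sketch with `sed` (the template path and values below are examples only; substitute your own):
```bash
# Fill in the SLURM compleasm array template with example values
sed -e 's/THREADS_PER_JOB/16/g' \
    -e 's/NUM_GENOMES/20/g' \
    -e 's/LINEAGE/insecta_odb10/g' \
    slurm/02_compleasm_parallel.job > 02_compleasm_parallel_filled.job
```
Note that the local templates assign some placeholders to shell variables of the same name (e.g. `TOTAL_THREADS=TOTAL_THREADS`); for those files, edit the assignment lines directly rather than running a blanket substitution.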
## Usage
### For Claude (LLM)
When a user requests scripts for a specific computing environment:
1. **Read the appropriate template** using the Read tool
2. **Replace placeholders** with user-specified values
3. **Present the customized script** to the user
4. **Provide setup instructions** (e.g., how many genomes, how to calculate thread allocation)
Example:
```python
# Read template
template = Read("templates/slurm/02_compleasm_first.job")
# Replace placeholders
script = template.replace("TOTAL_THREADS", "64")
script = script.replace("LINEAGE", "insecta_odb10")
# Present to user
print(script)
```
### For Users
Templates are not meant to be used directly. Instead:
1. Follow the workflow in `SKILL.md`
2. Answer Claude's questions about your setup
3. Claude will fetch the appropriate template and customize it for you
4. Copy the customized script Claude provides
## Benefits of This Structure
1. **Reduced token usage**: Claude only reads templates when needed
2. **Easier maintenance**: Update one template file instead of multiple locations in SKILL.md
3. **Consistency**: All users get the same base template structure
4. **Clarity**: Separate files are easier to review than inline code
5. **Extensibility**: Easy to add new templates for additional tools or variants
## Adding New Templates
When adding new templates:
1. **Follow naming convention**: `NN_descriptive_name[_variant].ext`
2. **Include clear comments**: Explain what the script does
3. **Use consistent placeholders**: Match existing placeholder names
4. **Test thoroughly**: Ensure placeholders are complete and correct
5. **Update this README**: Add the new template to the "Available Templates" section
6. **Update SKILL.md**: Reference the new template in the appropriate workflow step

View File

@@ -0,0 +1,26 @@
#!/bin/bash
# run_compleasm_first.sh
source ~/.bashrc
conda activate phylo
# User-specified total CPU threads
TOTAL_THREADS=TOTAL_THREADS # Replace with total cores you want to use (e.g., 16, 32, 64)
echo "Processing first genome with ${TOTAL_THREADS} CPU threads to download lineage database..."
# Create output directory
mkdir -p 01_busco_results
# Process FIRST genome only
first_genome=$(head -n 1 genome_list.txt)
genome_name=$(basename ${first_genome} .fasta)
echo "Processing: ${genome_name}"
compleasm run \
-a ${first_genome} \
-o 01_busco_results/${genome_name}_compleasm \
-l LINEAGE \
-t ${TOTAL_THREADS}
echo ""
echo "First genome complete! Lineage database is now cached."
echo "Now run the parallel script for remaining genomes: bash run_compleasm_parallel.sh"

View File

@@ -0,0 +1,33 @@
#!/bin/bash
# run_compleasm_parallel.sh
source ~/.bashrc
conda activate phylo
# Threading configuration (adjust based on your system)
TOTAL_THREADS=TOTAL_THREADS # Total cores to use (e.g., 64)
THREADS_PER_JOB=THREADS_PER_JOB # Threads per genome (e.g., 16)
CONCURRENT_JOBS=$((TOTAL_THREADS / THREADS_PER_JOB)) # Calculated automatically
echo "Configuration:"
echo " Total threads: ${TOTAL_THREADS}"
echo " Threads per genome: ${THREADS_PER_JOB}"
echo " Concurrent genomes: ${CONCURRENT_JOBS}"
echo ""
# Create output directory
mkdir -p 01_busco_results
# Process remaining genomes (skip first one) in parallel
tail -n +2 genome_list.txt | parallel -j ${CONCURRENT_JOBS} '
genome_name=$(basename {} .fasta)
echo "Processing ${genome_name} with THREADS_PER_JOB threads..."
compleasm run \
-a {} \
-o 01_busco_results/${genome_name}_compleasm \
-l LINEAGE \
-t THREADS_PER_JOB
'
echo ""
echo "All genomes processed!"

View File

@@ -0,0 +1,20 @@
#!/bin/bash
source ~/.bashrc
conda activate phylo
cd 06_concatenation
iqtree \
-s FcC_supermatrix.fas \
-spp partition_def.txt \
-nt 18 \
-safe \
-pre partition_search \
-m TESTMERGEONLY \
-mset MODEL_SET \
-msub nuclear \
-rcluster 10 \
-bb 1000 \
-alrt 1000
echo "Partition search complete! Best scheme: partition_search.best_scheme.nex"

View File

@@ -0,0 +1,17 @@
#!/bin/bash
source ~/.bashrc
conda activate phylo
cd trimmed_aa
# Create list of alignments
ls *_trimmed.fas > locus_alignments.txt
# Run IQ-TREE in parallel (adjust -j for number of concurrent jobs)
cat locus_alignments.txt | parallel -j 4 '
prefix=$(basename {} _trimmed.fas)
iqtree -s {} -m MFP -bb 1000 -bnni -czb -pre ${prefix} -nt 1
echo "Tree complete: ${prefix}"
'
echo "All gene trees complete!"

View File

@@ -0,0 +1,13 @@
#!/bin/bash
source ~/.bashrc
conda activate phylo
cd trimmed_aa
for locus in *_trimmed.fas; do
prefix=$(basename ${locus} _trimmed.fas)
echo "Processing ${prefix}..."
iqtree -s ${locus} -m MFP -bb 1000 -bnni -czb -pre ${prefix} -nt 1
done
echo "All gene trees complete!"

View File

@@ -0,0 +1,27 @@
#!/bin/bash
#PBS -N compleasm_first
#PBS -l nodes=1:ppn=TOTAL_THREADS # Replace with total available CPUs (e.g., 64)
#PBS -l mem=384gb # Adjust based on ppn × 6GB
#PBS -l walltime=24:00:00
cd $PBS_O_WORKDIR
source ~/.bashrc
conda activate phylo
mkdir -p logs
mkdir -p 01_busco_results
# Process FIRST genome only (downloads lineage database)
first_genome=$(head -n 1 genome_list.txt)
genome_name=$(basename ${first_genome} .fasta)
echo "Processing first genome: ${genome_name} with $PBS_NUM_PPN threads..."
echo "This will download the BUSCO lineage database for subsequent runs."
compleasm run \
-a ${first_genome} \
-o 01_busco_results/${genome_name}_compleasm \
-l LINEAGE \
-t $PBS_NUM_PPN
echo "First genome complete! Lineage database is now cached."
echo "Submit the parallel job for remaining genomes: qsub run_compleasm_parallel.job"

View File

@@ -0,0 +1,24 @@
#!/bin/bash
#PBS -N compleasm_parallel
#PBS -t 2-NUM_GENOMES # Start from genome 2 (first genome already processed)
#PBS -l nodes=1:ppn=THREADS_PER_JOB # e.g., 16 for 64-core system
#PBS -l mem=96gb # Adjust based on ppn × 6GB
#PBS -l walltime=48:00:00
cd $PBS_O_WORKDIR
source ~/.bashrc
conda activate phylo
mkdir -p 01_busco_results
# Get genome for this array task
genome=$(sed -n "${PBS_ARRAYID}p" genome_list.txt)
genome_name=$(basename ${genome} .fasta)
echo "Processing ${genome_name} with $PBS_NUM_PPN threads..."
compleasm run \
-a ${genome} \
-o 01_busco_results/${genome_name}_compleasm \
-l LINEAGE \
-t $PBS_NUM_PPN

View File

@@ -0,0 +1,22 @@
#!/bin/bash
#PBS -N iqtree_partition
#PBS -l nodes=1:ppn=18
#PBS -l mem=72gb
#PBS -l walltime=72:00:00
cd $PBS_O_WORKDIR/06_concatenation
source ~/.bashrc
conda activate phylo
iqtree \
-s FcC_supermatrix.fas \
-spp partition_def.txt \
-nt 18 \
-safe \
-pre partition_search \
-m TESTMERGEONLY \
-mset MODEL_SET \
-msub nuclear \
-rcluster 10 \
-bb 1000 \
-alrt 1000

View File

@@ -0,0 +1,26 @@
#!/bin/bash
#PBS -N iqtree_genes
#PBS -t 1-NUM_LOCI
#PBS -l nodes=1:ppn=1
#PBS -l mem=4gb
#PBS -l walltime=2:00:00
cd $PBS_O_WORKDIR/trimmed_aa
source ~/.bashrc
conda activate phylo
# Create list of alignments if not present
if [ ! -f locus_alignments.txt ]; then
ls *_trimmed.fas > locus_alignments.txt
fi
locus=$(sed -n "${PBS_ARRAYID}p" locus_alignments.txt)
iqtree \
-s ${locus} \
-m MFP \
-bb 1000 \
-bnni \
-czb \
-pre $(basename ${locus} _trimmed.fas) \
-nt 1

View File

@@ -0,0 +1,28 @@
#!/bin/bash
#SBATCH --job-name=compleasm_first
#SBATCH --cpus-per-task=TOTAL_THREADS # Replace with total available CPUs (e.g., 64)
#SBATCH --mem-per-cpu=6G
#SBATCH --time=24:00:00
#SBATCH --output=logs/compleasm_first.%j.out
#SBATCH --error=logs/compleasm_first.%j.err
source ~/.bashrc
conda activate phylo
mkdir -p logs
mkdir -p 01_busco_results
# Process FIRST genome only (downloads lineage database)
first_genome=$(head -n 1 genome_list.txt)
genome_name=$(basename ${first_genome} .fasta)
echo "Processing first genome: ${genome_name} with ${SLURM_CPUS_PER_TASK} threads..."
echo "This will download the BUSCO lineage database for subsequent runs."
compleasm run \
-a ${first_genome} \
-o 01_busco_results/${genome_name}_compleasm \
-l LINEAGE \
-t ${SLURM_CPUS_PER_TASK}
echo "First genome complete! Lineage database is now cached."
echo "Submit the parallel job for remaining genomes: sbatch run_compleasm_parallel.job"

View File

@@ -0,0 +1,25 @@
#!/bin/bash
#SBATCH --job-name=compleasm_parallel
#SBATCH --array=2-NUM_GENOMES # Start from genome 2 (first genome already processed)
#SBATCH --cpus-per-task=THREADS_PER_JOB # e.g., 16 for 64-core system with 4 concurrent jobs
#SBATCH --mem-per-cpu=6G
#SBATCH --time=48:00:00
#SBATCH --output=logs/compleasm.%A_%a.out
#SBATCH --error=logs/compleasm.%A_%a.err
source ~/.bashrc
conda activate phylo
mkdir -p 01_busco_results
# Get genome for this array task (skipping the first one)
genome=$(sed -n "${SLURM_ARRAY_TASK_ID}p" genome_list.txt)
genome_name=$(basename ${genome} .fasta)
echo "Processing ${genome_name} with ${SLURM_CPUS_PER_TASK} threads..."
compleasm run \
-a ${genome} \
-o 01_busco_results/${genome_name}_compleasm \
-l LINEAGE \
-t ${SLURM_CPUS_PER_TASK}

View File

@@ -0,0 +1,27 @@
#!/bin/bash
#SBATCH --job-name=iqtree_partition
#SBATCH --cpus-per-task=18
#SBATCH --mem-per-cpu=4G
#SBATCH --time=72:00:00
#SBATCH --output=logs/partition_search.out
#SBATCH --error=logs/partition_search.err
source ~/.bashrc
conda activate phylo
cd 06_concatenation # Use organized directory structure
iqtree \
-s FcC_supermatrix.fas \
-spp partition_def.txt \
-nt ${SLURM_CPUS_PER_TASK} \
-safe \
-pre partition_search \
-m TESTMERGEONLY \
-mset MODEL_SET \
-msub nuclear \
-rcluster 10 \
-bb 1000 \
-alrt 1000
# Output: partition_search.best_scheme.nex

View File

@@ -0,0 +1,28 @@
#!/bin/bash
#SBATCH --job-name=iqtree_genes
#SBATCH --array=1-NUM_LOCI
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=4G
#SBATCH --time=2:00:00
#SBATCH --output=logs/%A_%a.genetree.out
source ~/.bashrc
conda activate phylo
cd trimmed_aa
# Create list of alignments if not present
if [ ! -f locus_alignments.txt ]; then
ls *_trimmed.fas > locus_alignments.txt
fi
locus=$(sed -n "${SLURM_ARRAY_TASK_ID}p" locus_alignments.txt)
iqtree \
-s ${locus} \
-m MFP \
-bb 1000 \
-bnni \
-czb \
-pre $(basename ${locus} _trimmed.fas) \
-nt 1