From c1d9dee646cd34dd9241005b06f1c6ea1e11c4a7 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 18:02:37 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 13 + README.md | 3 + plugin.lock.json | 180 ++ skills/biogeobears/README.md | 222 ++ skills/biogeobears/SKILL.md | 581 +++++ .../references/biogeobears_details.md | 358 +++ .../scripts/biogeobears_analysis_template.Rmd | 404 +++ .../scripts/validate_geography_file.py | 299 +++ skills/phylo_from_buscos/.skillignore | 12 + skills/phylo_from_buscos/README.md | 99 + skills/phylo_from_buscos/SKILL.md | 757 ++++++ .../phylo_from_buscos/references/REFERENCE.md | 2225 +++++++++++++++++ .../scripts/convert_fasconcat_to_partition.py | 63 + .../scripts/download_ncbi_genomes.py | 133 + .../scripts/extract_orthologs.sh | 88 + .../scripts/generate_qc_report.sh | 59 + .../ALICUT_V2.31.pl | 742 ++++++ .../Aliscore.02.2.pl | 1271 ++++++++++ .../Aliscore_module.pm | 2081 +++++++++++++++ .../scripts/query_ncbi_assemblies.py | 174 ++ .../scripts/rename_genomes.py | 240 ++ .../phylo_from_buscos/scripts/run_alicut.sh | 247 ++ .../phylo_from_buscos/scripts/run_aliscore.sh | 248 ++ .../scripts/run_aliscore_alicut_batch.sh | 270 ++ skills/phylo_from_buscos/templates/README.md | 125 + .../templates/local/02_compleasm_first.sh | 26 + .../templates/local/02_compleasm_parallel.sh | 33 + .../templates/local/08a_partition_search.sh | 20 + .../local/08c_gene_trees_parallel.sh | 17 + .../templates/local/08c_gene_trees_serial.sh | 13 + .../templates/pbs/02_compleasm_first.job | 27 + .../templates/pbs/02_compleasm_parallel.job | 24 + .../templates/pbs/08a_partition_search.job | 22 + .../templates/pbs/08c_gene_trees_array.job | 26 + .../templates/slurm/02_compleasm_first.job | 28 + .../templates/slurm/02_compleasm_parallel.job | 25 + .../templates/slurm/08a_partition_search.job | 27 + .../templates/slurm/08c_gene_trees_array.job | 28 + 38 files changed, 11210 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 plugin.lock.json create mode 100644 skills/biogeobears/README.md create mode 100644 skills/biogeobears/SKILL.md create mode 100644 skills/biogeobears/references/biogeobears_details.md create mode 100644 skills/biogeobears/scripts/biogeobears_analysis_template.Rmd create mode 100755 skills/biogeobears/scripts/validate_geography_file.py create mode 100644 skills/phylo_from_buscos/.skillignore create mode 100644 skills/phylo_from_buscos/README.md create mode 100644 skills/phylo_from_buscos/SKILL.md create mode 100644 skills/phylo_from_buscos/references/REFERENCE.md create mode 100755 skills/phylo_from_buscos/scripts/convert_fasconcat_to_partition.py create mode 100755 skills/phylo_from_buscos/scripts/download_ncbi_genomes.py create mode 100755 skills/phylo_from_buscos/scripts/extract_orthologs.sh create mode 100755 skills/phylo_from_buscos/scripts/generate_qc_report.sh create mode 100755 skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/ALICUT_V2.31.pl create mode 100755 skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore.02.2.pl create mode 100755 skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore_module.pm create mode 100755 skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py create mode 100755 skills/phylo_from_buscos/scripts/rename_genomes.py create mode 100755 skills/phylo_from_buscos/scripts/run_alicut.sh create mode 100755 skills/phylo_from_buscos/scripts/run_aliscore.sh create mode 100755 
skills/phylo_from_buscos/scripts/run_aliscore_alicut_batch.sh create mode 100644 skills/phylo_from_buscos/templates/README.md create mode 100644 skills/phylo_from_buscos/templates/local/02_compleasm_first.sh create mode 100644 skills/phylo_from_buscos/templates/local/02_compleasm_parallel.sh create mode 100644 skills/phylo_from_buscos/templates/local/08a_partition_search.sh create mode 100644 skills/phylo_from_buscos/templates/local/08c_gene_trees_parallel.sh create mode 100644 skills/phylo_from_buscos/templates/local/08c_gene_trees_serial.sh create mode 100644 skills/phylo_from_buscos/templates/pbs/02_compleasm_first.job create mode 100644 skills/phylo_from_buscos/templates/pbs/02_compleasm_parallel.job create mode 100644 skills/phylo_from_buscos/templates/pbs/08a_partition_search.job create mode 100644 skills/phylo_from_buscos/templates/pbs/08c_gene_trees_array.job create mode 100644 skills/phylo_from_buscos/templates/slurm/02_compleasm_first.job create mode 100644 skills/phylo_from_buscos/templates/slurm/02_compleasm_parallel.job create mode 100644 skills/phylo_from_buscos/templates/slurm/08a_partition_search.job create mode 100644 skills/phylo_from_buscos/templates/slurm/08c_gene_trees_array.job diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..b7a6a49 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,13 @@ +{ + "name": "bioinfo-skills", + "description": "Bioinformatics skills", + "version": "0.0.0-2025.11.28", + "author": { + "name": "Bruno de Medeiros", + "email": "bdemedeiros@fieldmuseum.org" + }, + "skills": [ + "./skills/phylo_from_buscos", + "./skills/biogeobears" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..4a67e69 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# bioinfo-skills + +Bioinformatics skills diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..708737d --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,180 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:brunoasm/my_claude_skills:bioinfo-skills", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "de6d76ce965f0f2f78c1321dfc7afa719683892a", + "treeHash": "08c77edc8edaaeaf8207d518c75c59bc13394fe42a8156c93304b12383037ffb", + "generatedAt": "2025-11-28T10:14:26.129461Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "bioinfo-skills", + "description": "Bioinformatics skills" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "6cc26e7acb732e15c1a1a97a61ce783161dbb2236192c232da5c3c65847e328e" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "c55fa066bcc713f4ecbef376a15517d0732d3640b6c35edab8510e4c93748641" + }, + { + "path": "skills/phylo_from_buscos/README.md", + "sha256": "3408e707f1e19a8924cd6e876f6c69b4a549a6d31826f0f9b587b93605996c1e" + }, + { + "path": "skills/phylo_from_buscos/SKILL.md", + "sha256": "d58b91ef6554434d4b86879337546c92bb48223910303bb804136418730f7b9a" + }, + { + "path": "skills/phylo_from_buscos/.skillignore", + "sha256": "0b1fece23c5bd3298ec9ce56df31f957c70b1716753f9acaa74bac61303273c0" + }, + { + "path": "skills/phylo_from_buscos/references/REFERENCE.md", + "sha256": 
"1820a04978907df16ceed0dde829d0ef0655e1b4286b5f334661a6a17e9ff9ae" + }, + { + "path": "skills/phylo_from_buscos/scripts/convert_fasconcat_to_partition.py", + "sha256": "69c02d641cbc719f12f859aca9285fdc5b0ac7d6e923e0b5bf0a2b36a297539d" + }, + { + "path": "skills/phylo_from_buscos/scripts/extract_orthologs.sh", + "sha256": "a739cd47d5ba5dcb30081cc1f3303821dc98f1c02108005bee896b9a7062b6e7" + }, + { + "path": "skills/phylo_from_buscos/scripts/run_aliscore_alicut_batch.sh", + "sha256": "a3926ec3bdf73d3f3cbc6af057275350951927b80301f7a145997b0291d6cde1" + }, + { + "path": "skills/phylo_from_buscos/scripts/rename_genomes.py", + "sha256": "5dc102c4384aab940f3ce9348c954c7813f51bd4124c5f85b7b7a28ff215f2da" + }, + { + "path": "skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py", + "sha256": "2f91b2564ecb6eee9c37ea9265ff666a5de321ac1d7f6abbdc70a2f3931c59a2" + }, + { + "path": "skills/phylo_from_buscos/scripts/run_aliscore.sh", + "sha256": "53e679c3071cf3ed56279250bd731d9a3604e15526c9a0293e0dc1e34bda8931" + }, + { + "path": "skills/phylo_from_buscos/scripts/generate_qc_report.sh", + "sha256": "872c5cecb4d9e25f8847baf060f4a8b37a86ab7c3c36f63536ba37be364194bc" + }, + { + "path": "skills/phylo_from_buscos/scripts/download_ncbi_genomes.py", + "sha256": "e30960bb620626e52309a3930e0c7769d6b7fea589de4fc17d6f8e4c51832502" + }, + { + "path": "skills/phylo_from_buscos/scripts/run_alicut.sh", + "sha256": "2d8762559c60b79924dcab5e0004626489c7aaaf9c9f7058ffb223d3edb1208a" + }, + { + "path": "skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore.02.2.pl", + "sha256": "7e046000bb0834b0df8a135256fea6031e5ffdd21333182db18a0886c3bfdb82" + }, + { + "path": "skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/ALICUT_V2.31.pl", + "sha256": "7e4a92710c840ea569458c7b6d97806dfe303c3346a2ae3310f0122f2f1496aa" + }, + { + "path": "skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore_module.pm", + "sha256": "ee7e2690027925ac012bb097fe6ed367bf00da1425bae9e317f641048c765432" + }, + { + "path": "skills/phylo_from_buscos/templates/README.md", + "sha256": "d9661f51a45aab151a07396e8ed4b0a353cf750ee55337ea938cf995469fc78c" + }, + { + "path": "skills/phylo_from_buscos/templates/slurm/08a_partition_search.job", + "sha256": "98e58028c41abc524089cc547eabd48039185b4aba873d786be78630a45ab783" + }, + { + "path": "skills/phylo_from_buscos/templates/slurm/08c_gene_trees_array.job", + "sha256": "6fb08ac7b60ef2a07b60521c01140aa922c21139d1738ec39978de4ea0bf1f76" + }, + { + "path": "skills/phylo_from_buscos/templates/slurm/02_compleasm_parallel.job", + "sha256": "04f01f4d0692847793a78916c53bd98287da19d89f066e113de82772dd847197" + }, + { + "path": "skills/phylo_from_buscos/templates/slurm/02_compleasm_first.job", + "sha256": "e6b24626d44f5f122a12cd14147b38b8f60a44ab0d29c38058153dba84bd7b0c" + }, + { + "path": "skills/phylo_from_buscos/templates/local/08c_gene_trees_serial.sh", + "sha256": "a7dfb2056da7fa1a0b9d1e645ae34cb8170e0df6708c509b974c7e1e505c1848" + }, + { + "path": "skills/phylo_from_buscos/templates/local/08a_partition_search.sh", + "sha256": "0f7e8de34e81fe93dec7940b9c1598b7156caf711e324a6f012748f2623ddd9a" + }, + { + "path": "skills/phylo_from_buscos/templates/local/08c_gene_trees_parallel.sh", + "sha256": "aeefc03ff8396b7d28f40e4af726f4db1b2ebc036ba7bcfbcbdeea2cef3ea548" + }, + { + "path": "skills/phylo_from_buscos/templates/local/02_compleasm_parallel.sh", + "sha256": "edb379d1f98b9e41326b9c1b82fd0be56a22fc63224393fd7c50642d3931be33" + }, + { + "path": 
"skills/phylo_from_buscos/templates/local/02_compleasm_first.sh", + "sha256": "5be4bd66da9ba8e0e9c4b7b76361246ae162bb95718bd2a1a8a79a41869e9c94" + }, + { + "path": "skills/phylo_from_buscos/templates/pbs/08a_partition_search.job", + "sha256": "bc98fa2abd41c07ab9a2608f803441d6c8e39359034d92a96eb46b9288a7929f" + }, + { + "path": "skills/phylo_from_buscos/templates/pbs/08c_gene_trees_array.job", + "sha256": "aceb618e7fbd0164baf90142cabb35e9302756f7d74e99e68476e106d1ce3de2" + }, + { + "path": "skills/phylo_from_buscos/templates/pbs/02_compleasm_parallel.job", + "sha256": "4d82508e3acada61f8999ed4c89e40f30ed00363a7477b925d6c5fb954d33e4e" + }, + { + "path": "skills/phylo_from_buscos/templates/pbs/02_compleasm_first.job", + "sha256": "de597deea9b382b8dcd8663b3c497fe866d353882cd533d8223723653dbe1851" + }, + { + "path": "skills/biogeobears/README.md", + "sha256": "85b0ea5e40d2eda96243f0c12c9c5c565b4ca358d20c04f10366a3e0a6ae1961" + }, + { + "path": "skills/biogeobears/SKILL.md", + "sha256": "6a872dc71353f89651825f0bcdd14d9e226103345d08380c9d6b4ba7de2df10a" + }, + { + "path": "skills/biogeobears/references/biogeobears_details.md", + "sha256": "3665966b9f4cc1b18d6f50806a48b6fbb94d5b4fd7777fc0cd758761ed0a10f2" + }, + { + "path": "skills/biogeobears/scripts/validate_geography_file.py", + "sha256": "6559b6f1a9a4c1e06ca033ba792446eb20a2f94ddb5ea072906ac1674a9fa105" + }, + { + "path": "skills/biogeobears/scripts/biogeobears_analysis_template.Rmd", + "sha256": "2ab1859e662023e94abae8a1019bfd0b70c5779fee0c608157855042d8573f8f" + } + ], + "dirSha256": "08c77edc8edaaeaf8207d518c75c59bc13394fe42a8156c93304b12383037ffb" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/biogeobears/README.md b/skills/biogeobears/README.md new file mode 100644 index 0000000..461ae0e --- /dev/null +++ b/skills/biogeobears/README.md @@ -0,0 +1,222 @@ +# BioGeoBEARS Biogeographic Analysis Skill + +A Claude skill for setting up and executing phylogenetic biogeographic analyses using BioGeoBEARS in R. + +## Overview + +This skill automates the complete workflow for biogeographic analysis on phylogenetic trees, from raw data validation to publication-ready visualizations. It helps users reconstruct ancestral geographic ranges by: + +- Validating and reformatting input files (phylogenetic tree + geographic distribution data) +- Setting up organized analysis folder structures +- Generating customized RMarkdown analysis scripts +- Guiding parameter selection (maximum range size, model choices) +- Producing visualizations with pie charts and text labels showing ancestral ranges +- Comparing multiple biogeographic models with statistical tests + +## When to Use + +Use this skill when you need to: +- Reconstruct ancestral geographic ranges on a phylogeny +- Test different biogeographic models (DEC, DIVALIKE, BAYAREALIKE) +- Analyze how species distributions evolved over time +- Determine whether founder-event speciation (+J parameter) is important +- Generate publication-ready biogeographic visualizations + +## Required Inputs + +Users must provide: + +1. **Phylogenetic tree** (Newick format: .nwk, .tre, or .tree) + - Must be rooted + - Tip labels must match species in geography file + - Branch lengths required + +2. **Geographic distribution data** (any tabular format) + - Species names matching tree tips + - Presence/absence data for different geographic areas + - Accepts CSV, TSV, Excel, or PHYLIP format + +## What the Skill Does + +### 1. 
Data Validation and Reformatting + +The skill includes a Python script (`validate_geography_file.py`) that: +- Validates geography file format (PHYLIP-like with specific tab/spacing requirements) +- Checks for common errors (spaces in species names, tab delimiters, binary code length) +- Reformats CSV/TSV files to proper BioGeoBEARS format +- Cross-validates species names against tree tip labels + +### 2. Analysis Setup + +Creates an organized directory structure: +``` +biogeobears_analysis/ +├── input/ +│ ├── tree.nwk # Phylogenetic tree +│ ├── geography.data # Validated geography file +│ └── original_data/ # Original input files +├── scripts/ +│ └── run_biogeobears.Rmd # Customized RMarkdown script +├── results/ # Analysis outputs +│ ├── [MODEL]_result.Rdata # Saved model results +│ └── plots/ # Visualizations +│ ├── [MODEL]_pie.pdf +│ └── [MODEL]_text.pdf +└── README.md # Documentation +``` + +### 3. RMarkdown Analysis Template + +Generates a complete RMarkdown script that: +- Loads and validates input data +- Fits 6 biogeographic models: + - DEC (Dispersal-Extinction-Cladogenesis) + - DEC+J (DEC with founder-event speciation) + - DIVALIKE (vicariance-focused) + - DIVALIKE+J + - BAYAREALIKE (sympatry-focused) + - BAYAREALIKE+J +- Compares models using AIC, AICc, and AIC weights +- Performs likelihood ratio tests for nested models +- Estimates parameters (d=dispersal, e=extinction, j=founder-event rates) +- Generates visualizations on the phylogeny +- Creates HTML report with all results + +### 4. Visualization + +Produces two types of plots: +- **Pie charts**: Show probability distributions for ancestral ranges (conveys uncertainty) +- **Text labels**: Show maximum likelihood ancestral states (cleaner, easier to read) + +Colors represent geographic areas: +- Single areas: Bright primary colors +- Multi-area ranges: Blended colors +- All areas: White + +## Workflow + +1. **Gather information**: Ask user for tree file, geography file, and parameters +2. **Validate tree**: Check if rooted and extract tip labels +3. **Validate/reformat geography file**: Use validation script to check format or convert from CSV/TSV +4. **Set up analysis folder**: Create organized directory structure +5. **Generate RMarkdown script**: Customize template with user parameters +6. **Create documentation**: Generate README and run scripts +7. **Provide instructions**: Clear steps for running the analysis + +## Analysis Parameters + +The skill helps users choose: + +### Maximum Range Size +- How many areas can a species occupy simultaneously? 
+- Options: Conservative (# areas - 1), Permissive (all areas), Data-driven (max observed) +- Larger values increase computation time exponentially + +### Models to Compare +- Default: All 6 models (recommended for comprehensive comparison) +- Alternative: Only base models or only +J models +- Rationale: Model comparison is key to biogeographic inference + +### Visualization Type +- Pie charts (show probabilities and uncertainty) +- Text labels (show most likely states, cleaner) +- Both (default in template) + +## Bundled Resources + +### scripts/ + +**validate_geography_file.py** +- Validates BioGeoBEARS geography file format +- Reformats from CSV/TSV to PHYLIP +- Cross-validates with tree tip labels +- Usage: `python validate_geography_file.py --help` + +**biogeobears_analysis_template.Rmd** +- Complete RMarkdown analysis template +- Parameterized via YAML header +- Fits all models, compares, and visualizes +- Generates self-contained HTML report + +### references/ + +**biogeobears_details.md** +- Detailed model descriptions (DEC, DIVALIKE, BAYAREALIKE, +J parameter) +- Input file format specifications with examples +- Parameter interpretation guidelines +- Plotting options and customization +- Complete citations for publications +- Computational considerations and troubleshooting + +## Example Output + +The analysis produces: +- `biogeobears_report.html` - Interactive HTML report with all results +- `[MODEL]_result.Rdata` - Saved R objects for each model +- `plots/[MODEL]_pie.pdf` - Ancestral ranges shown as pie charts on tree +- `plots/[MODEL]_text.pdf` - Ancestral ranges shown as text labels on tree + +## Interpretation Guidance + +The skill helps users understand: + +### Model Selection +- **AIC weights**: Probability each model is best +- **ΔAIC thresholds**: <2 (equivalent), 2-7 (less support), >10 (no support) + +### Parameter Estimates +- **d (dispersal)**: Rate of range expansion +- **e (extinction)**: Rate of local extinction +- **j (founder-event)**: Rate of jump dispersal at speciation +- **d/e ratio**: >1 favors expansion, <1 favors contraction + +### Statistical Tests +- **LRT p < 0.05**: +J parameter significantly improves fit +- Model uncertainty: Report results from multiple models if weights similar + +## Installation Requirements + +Users must have: +- R (≥4.0) +- BioGeoBEARS R package +- Supporting R packages: ape, rmarkdown, knitr, kableExtra +- Python 3 (for validation script) + +Installation instructions are included in generated README.md files. 
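+
+A quick way to confirm the R-side requirements before launching an analysis (a minimal sketch; the package list simply mirrors the requirements above):
+
+```r
+# Stop early if any package required by the generated workflow is missing
+required <- c("BioGeoBEARS", "ape", "rmarkdown", "knitr", "kableExtra")
+missing <- required[!vapply(required, requireNamespace, logical(1), quietly = TRUE)]
+if (length(missing) > 0) {
+  stop("Missing packages: ", paste(missing, collapse = ", "))
+}
+```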
+ +## Expected Runtime + +**Skill setup time**: 5-10 minutes (file validation and directory setup) + +**Analysis runtime** (separate from skill execution): +- Small datasets (<50 tips, ≤5 areas): 10-30 minutes +- Medium datasets (50-100 tips, 5-6 areas): 30-90 minutes +- Large datasets (>100 tips, >5 areas): 1-6 hours + +## Common Issues Handled + +The skill troubleshoots: +- Species name mismatches between tree and geography file +- Unrooted trees (guides user to root with outgroup) +- Geography file formatting errors (tabs, spaces, binary codes) +- Optimization convergence failures +- Slow runtime with many areas/tips + +## Citations + +Based on: +- **BioGeoBEARS** package by Nicholas Matzke +- Tutorial resources from http://phylo.wikidot.com/biogeobears +- Example workflows from BioGeoBEARS GitHub repository + +## Skill Details + +- **Skill Type**: Workflow-based bioinformatics skill +- **Domain**: Phylogenetic biogeography, historical biogeography +- **Output**: Complete analysis setup with scripts, documentation, and ready-to-run workflow +- **Automation Level**: High (validates, reformats, generates all scripts) +- **User Input Required**: File paths and parameter choices via guided questions + +## See Also + +- [phylo_from_buscos](../phylo_from_buscos/README.md) - Complementary skill for generating phylogenies from genomes diff --git a/skills/biogeobears/SKILL.md b/skills/biogeobears/SKILL.md new file mode 100644 index 0000000..4461f6a --- /dev/null +++ b/skills/biogeobears/SKILL.md @@ -0,0 +1,581 @@ +--- +name: biogeobears +description: Set up and execute phylogenetic biogeographic analyses using BioGeoBEARS in R. Use when users request biogeographic reconstruction, ancestral range estimation, or want to analyze species distributions on phylogenies. Handles input file validation, data reformatting, RMarkdown workflow generation, and result visualization. +--- + +# BioGeoBEARS Biogeographic Analysis + +## Overview + +BioGeoBEARS (BioGeography with Bayesian and Likelihood Evolutionary Analysis in R Scripts) performs probabilistic inference of ancestral geographic ranges on phylogenetic trees. This skill helps set up complete biogeographic analyses by: + +1. Validating and reformatting input files (phylogenetic tree and geographic distribution data) +2. Generating organized analysis folder structure +3. Creating customized RMarkdown analysis scripts +4. Guiding users through parameter selection and model choices +5. Producing publication-ready visualizations + +## When to Use This Skill + +Use this skill when users request: +- "Analyze biogeography on my phylogeny" +- "Reconstruct ancestral ranges for my species" +- "Run BioGeoBEARS analysis" +- "Which areas did my ancestors occupy?" +- "Test biogeographic models (DEC, DIVALIKE, BAYAREALIKE)" + +The skill triggers when users mention phylogenetic biogeography, ancestral area reconstruction, or provide tree + distribution data. + +## Required Inputs + +Users must provide: + +1. **Phylogenetic tree** (Newick format, .nwk, .tre, or .tree file) + - Must be rooted + - Tip labels will be matched to geography file + - Branch lengths required + +2. **Geographic distribution data** (any tabular format) + - Species names (matching tree tips) + - Presence/absence data for different geographic areas + - Can be CSV, TSV, Excel, or already in PHYLIP format + +## Workflow + +### Step 1: Gather Information + +When a user requests a BioGeoBEARS analysis, ask for: + +1. **Input file paths**: + - "What is the path to your phylogenetic tree file?" 
+ - "What is the path to your geographic distribution file?" + +2. **Analysis parameters** (if not specified): + - Maximum range size (how many areas can a species occupy simultaneously?) + - Which models to compare (default: all six - DEC, DEC+J, DIVALIKE, DIVALIKE+J, BAYAREALIKE, BAYAREALIKE+J) + - Output directory name (default: "biogeobears_analysis") + +Use the AskUserQuestion tool to gather this information efficiently: + +``` +Example questions: +- "Maximum range size" - options based on number of areas (e.g., for 4 areas: "All 4 areas", "3 areas", "2 areas") +- "Models to compare" - options: "All 6 models (recommended)", "Only base models (DEC, DIVALIKE, BAYAREALIKE)", "Only +J models", "Custom selection" +- "Visualization type" - options: "Pie charts (show probabilities)", "Text labels (show most likely states)", "Both" +``` + +### Step 2: Validate and Prepare Input Files + +#### Validate Tree File + +Use the Read tool to check the tree file: + +```r +# In R, basic validation: +library(ape) +tr <- read.tree("path/to/tree.nwk") +print(paste("Tips:", length(tr$tip.label))) +print(paste("Rooted:", is.rooted(tr))) +print(tr$tip.label) # Check species names +``` + +Verify: +- File can be parsed as Newick +- Tree is rooted (if not, ask user which outgroup to use) +- Note the tip labels for geography file validation + +#### Validate and Reformat Geography File + +Use `scripts/validate_geography_file.py` to validate or reformat the geography file. + +**If file is already in PHYLIP format** (starts with numbers): + +```bash +python scripts/validate_geography_file.py path/to/geography.txt --validate --tree path/to/tree.nwk +``` + +This checks: +- Correct tab delimiters +- Species names match tree tips +- Binary codes are correct length +- No spaces in species names or binary codes + +**If file is in CSV/TSV format** (needs reformatting): + +```bash +python scripts/validate_geography_file.py path/to/distribution.csv --reformat -o geography.data --delimiter "," +``` + +Or for tab-delimited: + +```bash +python scripts/validate_geography_file.py path/to/distribution.txt --reformat -o geography.data --delimiter tab +``` + +The script will: +- Detect area names from header row +- Convert presence/absence data to binary (handles "1", "present", "TRUE", etc.) 
+- Remove spaces from species names (replace with underscores)
+- Create properly formatted PHYLIP file
+
+**Always validate the reformatted file** before proceeding:
+
+```bash
+python scripts/validate_geography_file.py geography.data --validate --tree path/to/tree.nwk
+```
+
+### Step 3: Set Up Analysis Folder Structure
+
+Create an organized directory for the analysis:
+
+```
+biogeobears_analysis/
+├── input/
+│   ├── tree.nwk              # Original or copied tree
+│   ├── geography.data        # Validated/reformatted geography file
+│   └── original_data/        # Original input files
+│       ├── original_tree.nwk
+│       └── original_distribution.csv
+├── scripts/
+│   └── run_biogeobears.Rmd   # Generated RMarkdown script
+├── results/                  # Created by analysis (output directory)
+│   ├── [MODEL]_result.Rdata  # Saved model results
+│   └── plots/                # Visualization outputs
+│       ├── [MODEL]_pie.pdf
+│       └── [MODEL]_text.pdf
+└── README.md                 # Analysis documentation
+```
+
+Create this structure programmatically:
+
+```bash
+mkdir -p biogeobears_analysis/input/original_data
+mkdir -p biogeobears_analysis/scripts
+mkdir -p biogeobears_analysis/results/plots
+
+# Copy files
+cp path/to/tree.nwk biogeobears_analysis/input/
+cp geography.data biogeobears_analysis/input/
+cp original_files biogeobears_analysis/input/original_data/
+```
+
+### Step 4: Generate RMarkdown Analysis Script
+
+Use the template at `scripts/biogeobears_analysis_template.Rmd` and customize it with user parameters.
+
+**Copy and customize the template**:
+
+```bash
+cp scripts/biogeobears_analysis_template.Rmd biogeobears_analysis/scripts/run_biogeobears.Rmd
+```
+
+**Create a parameter file** or modify the YAML header in the Rmd to use the user's specific settings:
+
+Example customization via R code:
+
+```r
+# Edit YAML parameters programmatically or provide as params when rendering
+rmarkdown::render(
+  "biogeobears_analysis/scripts/run_biogeobears.Rmd",
+  params = list(
+    tree_file = "../input/tree.nwk",
+    geog_file = "../input/geography.data",
+    max_range_size = 4,
+    models = "DEC,DEC+J,DIVALIKE,DIVALIKE+J,BAYAREALIKE,BAYAREALIKE+J",
+    output_dir = "../results"
+  ),
+  output_file = "../results/biogeobears_report.html"
+)
+```
+
+Or create a run script:
+
+```bash
+#!/bin/bash
+# biogeobears_analysis/run_analysis.sh
+cd "$(dirname "$0")/scripts"
+
+R -e "rmarkdown::render('run_biogeobears.Rmd', params = list(
+  tree_file = '../input/tree.nwk',
+  geog_file = '../input/geography.data',
+  max_range_size = 4,
+  models = 'DEC,DEC+J,DIVALIKE,DIVALIKE+J,BAYAREALIKE,BAYAREALIKE+J',
+  output_dir = '../results'
+), output_file = '../results/biogeobears_report.html')"
+```
+
+### Step 5: Create README Documentation
+
+Generate a README.md in the analysis directory explaining:
+
+- What files are present
+- How to run the analysis
+- What parameters were used
+- How to interpret results
+
+Example:
+
+````markdown
+# BioGeoBEARS Analysis
+
+## Overview
+
+Biogeographic analysis of [NUMBER] species across [NUMBER] geographic areas.
+
+## Input Data
+
+- **Tree**: `input/tree.nwk` ([NUMBER] tips)
+- **Geography**: `input/geography.data` ([NUMBER] species × [NUMBER] areas)
+- **Areas**: [A, B, C, ...]
+
+## Parameters
+
+- Maximum range size: [NUMBER]
+- Models tested: [LIST]
+
+## Running the Analysis
+
+### Option 1: Using RMarkdown directly
+
+```r
+library(rmarkdown)
+render("scripts/run_biogeobears.Rmd",
+       output_file = "../results/biogeobears_report.html")
+```
+
+### Option 2: Using the run script
+
+```bash
+bash run_analysis.sh
+```
+
+## Outputs
+
+Results will be saved in `results/`:
+
+- `biogeobears_report.html` - Full analysis report with visualizations
+- `[MODEL]_result.Rdata` - Saved R objects for each model
+- `plots/[MODEL]_pie.pdf` - Ancestral range reconstructions (pie charts)
+- `plots/[MODEL]_text.pdf` - Ancestral range reconstructions (text labels)
+
+## Interpreting Results
+
+The HTML report includes:
+
+1. **Model Comparison** - AIC scores, AIC weights, best-fit model
+2. **Parameter Estimates** - Dispersal (d), extinction (e), founder-event (j) rates
+3. **Likelihood Ratio Tests** - Statistical comparisons of nested models
+4. **Ancestral Range Plots** - Visualizations on phylogeny
+5. **Session Info** - R package versions for reproducibility
+
+## Model Descriptions
+
+- **DEC**: Dispersal-Extinction-Cladogenesis (general-purpose)
+- **DIVALIKE**: Emphasizes vicariance
+- **BAYAREALIKE**: Emphasizes sympatric speciation
+- **+J**: Adds founder-event speciation parameter
+
+See `references/biogeobears_details.md` for detailed model descriptions.
+
+## Installation Requirements
+
+```r
+# Install BioGeoBEARS
+install.packages("rexpokit")
+install.packages("cladoRcpp")
+library(devtools)
+devtools::install_github(repo="nmatzke/BioGeoBEARS")
+
+# Other packages
+install.packages(c("ape", "rmarkdown", "knitr", "kableExtra"))
+```
+````
+
+### Step 6: Provide User Instructions
+
+After setting up the analysis, provide clear instructions to the user:
+
+````
+Analysis Setup Complete!
+
+Directory structure created at: biogeobears_analysis/
+
+📁 Files created:
+  ✓ input/tree.nwk - Phylogenetic tree ([N] tips)
+  ✓ input/geography.data - Geographic distribution data (validated)
+  ✓ scripts/run_biogeobears.Rmd - RMarkdown analysis script
+  ✓ README.md - Documentation and instructions
+  ✓ run_analysis.sh - Convenience script to run analysis
+
+📋 Next steps:
+
+1. Review the README.md for analysis details
+
+2. Install BioGeoBEARS if not already installed:
+   ```r
+   install.packages("rexpokit")
+   install.packages("cladoRcpp")
+   library(devtools)
+   devtools::install_github(repo="nmatzke/BioGeoBEARS")
+   ```
+
+3. Run the analysis:
+   ```bash
+   cd biogeobears_analysis
+   bash run_analysis.sh
+   ```
+
+   Or in R:
+   ```r
+   setwd("biogeobears_analysis")
+   rmarkdown::render("scripts/run_biogeobears.Rmd",
+                     output_file = "../results/biogeobears_report.html")
+   ```
+
+4. View results:
+   - Open results/biogeobears_report.html in web browser
+   - Check results/plots/ for PDF visualizations
+
+⏱️ Expected runtime: [ESTIMATE based on tree size]
+   - Small trees (<50 tips): 5-15 minutes
+   - Medium trees (50-100 tips): 15-60 minutes
+   - Large trees (>100 tips): 1-4 hours
+
+💡 The HTML report includes model comparison, parameter estimates, and visualization of ancestral ranges on your phylogeny.
+````
+
+## Analysis Parameter Guidance
+
+When users ask for guidance on parameters, consult `references/biogeobears_details.md` and provide recommendations:
+
+### Maximum Range Size
+
+**Ask**: "What's the maximum number of areas a species in your group can realistically occupy?"
+ +Common approaches: +- **Conservative**: Number of areas - 1 (prevents unrealistic cosmopolitan ancestral ranges) +- **Permissive**: All areas (if biologically plausible) +- **Data-driven**: Maximum observed in extant species + +**Impact**: Larger values increase computational time exponentially + +### Model Selection + +**Default recommendation**: Run all 6 models for comprehensive comparison + +- DEC, DIVALIKE, BAYAREALIKE (base models) +- DEC+J, DIVALIKE+J, BAYAREALIKE+J (+J variants) + +**Rationale**: +- Model comparison is key to inference +- +J parameter is often significant +- Small additional computational cost + +If computation is a concern, suggest starting with DEC and DEC+J. + +### Visualization Options + +**Pie charts** (`plotwhat = "pie"`): +- Show probability distributions across all possible states +- Better for conveying uncertainty +- Can be cluttered with many areas + +**Text labels** (`plotwhat = "text"`): +- Show only maximum likelihood state +- Cleaner, easier to read +- Doesn't show uncertainty + +**Recommendation**: Generate both in the analysis (template does this automatically) + +## Common Issues and Troubleshooting + +### Species Name Mismatches + +**Symptom**: Error about species in tree not in geography file (or vice versa) + +**Solution**: Use the validation script with `--tree` option to identify mismatches, then either: +1. Edit the geography file to match tree tip labels +2. Edit tree tip labels to match geography file +3. Remove species that aren't in both + +### Tree Not Rooted + +**Symptom**: Error about unrooted tree + +**Solution**: +```r +library(ape) +tr <- read.tree("tree.nwk") +tr <- root(tr, outgroup = "outgroup_species_name") +write.tree(tr, "tree_rooted.nwk") +``` + +Ask user which species to use as outgroup. 
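+
+Both issues above (mismatched labels and an unrooted tree) can be checked in one short R session before re-running the skill (a sketch using only `ape` and base R; file names are placeholders):
+
+```r
+library(ape)
+
+tr <- read.tree("tree.nwk")
+is.rooted(tr)  # FALSE means the tree must be rooted first
+
+# Species names are the first tab-separated field of each geography data line
+geog_lines <- readLines("geography.data")[-1]
+geog_lines <- geog_lines[nzchar(trimws(geog_lines))]
+geog_species <- sub("\t.*", "", geog_lines)
+
+setdiff(tr$tip.label, geog_species)  # in tree but not in geography file
+setdiff(geog_species, tr$tip.label)  # in geography file but not in tree
+```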
+ +### Formatting Errors in Geography File + +**Symptom**: Validation errors about tabs, spaces, or binary codes + +**Solution**: Use the reformat option: +```bash +python scripts/validate_geography_file.py input.csv --reformat -o geography.data +``` + +### Optimization Fails to Converge + +**Symptom**: NA values in parameter estimates or very negative log-likelihoods + +**Possible causes**: +- Tree and geography data mismatch +- All species in same area (no variation) +- Unrealistic max_range_size + +**Solution**: Check input data quality and try simpler model first (DEC only) + +### Very Slow Runtime + +**Causes**: +- Large number of areas (>6-7 areas gets slow) +- Large max_range_size +- Many tips (>200) + +**Solutions**: +- Reduce max_range_size +- Combine geographic areas if appropriate +- Use `force_sparse = TRUE` in run object +- Run on HPC cluster + +## Resources + +This skill includes: + +### scripts/ + +- **validate_geography_file.py** - Validates and reformats geography files + - Checks PHYLIP format compliance + - Validates against tree tip labels + - Reformats from CSV/TSV to PHYLIP + - Usage: `python validate_geography_file.py --help` + +- **biogeobears_analysis_template.Rmd** - RMarkdown template for complete analysis + - Model fitting for DEC, DIVALIKE, BAYAREALIKE (with/without +J) + - Model comparison with AIC, AICc, weights + - Likelihood ratio tests + - Parameter visualization + - Ancestral range plotting + - Customizable via YAML parameters + +### references/ + +- **biogeobears_details.md** - Comprehensive reference including: + - Detailed model descriptions + - Input file format specifications + - Parameter interpretation guidelines + - Plotting options and customization + - Citations and further reading + - Computational considerations + +Load this reference when: +- Users ask about specific models +- Need to explain parameter estimates +- Troubleshooting complex issues +- Users want detailed methodology for publications + +## Best Practices + +1. **Always validate input files** before analysis - saves time debugging later + +2. **Organize analysis in a dedicated directory** - keeps everything together and reproducible + +3. **Run all 6 models by default** - model comparison is crucial for biogeographic inference + +4. **Document parameters and decisions** - analysis README helps with reproducibility + +5. **Generate both visualization types** - pie charts for uncertainty, text labels for clarity + +6. **Save intermediate results** - the RMarkdown template does this automatically + +7. **Check parameter estimates** - unrealistic values suggest data or model issues + +8. 
**Provide context with visualizations** - explain what dispersal/extinction rates mean for the user's system + +## Output Interpretation + +When presenting results to users, explain: + +### Model Selection + +- **AIC weights** represent probability that each model is best +- **ΔAIC < 2**: Models essentially equivalent +- **ΔAIC 2-7**: Considerably less support +- **ΔAIC > 10**: Essentially no support + +### Parameter Estimates + +- **d (dispersal rate)**: Higher = more range expansions +- **e (extinction rate)**: Higher = more local extinctions +- **j (founder-event rate)**: Higher = more jump dispersal at speciation +- **Ratio d/e**: > 1 favors expansion, < 1 favors contraction + +### Ancestral Ranges + +- **Pie charts**: Larger slices = higher probability +- **Colors**: Represent areas (single area = bright color, multiple areas = blended) +- **Node labels**: Most likely ancestral range +- **Split events** (at corners): Range changes at speciation + +### Statistical Tests + +- **LRT p < 0.05**: +J parameter significantly improves fit +- **High AIC weight** (>0.7): Strong evidence for one model +- **Similar AIC weights**: Model uncertainty - report results from multiple models + +## Example Usage + +``` +User: "I have a phylogeny of 30 bird species and their distributions across 5 islands. Can you help me figure out where their ancestors lived?" + +Claude (using this skill): +1. Ask for tree and distribution file paths +2. Validate tree file (check 30 tips, rooted) +3. Validate/reformat geography file (5 areas) +4. Ask about max_range_size (suggest 4 areas) +5. Ask about models (suggest all 6) +6. Set up biogeobears_analysis/ directory structure +7. Copy template RMarkdown script with parameters +8. Generate README.md and run_analysis.sh +9. Provide clear instructions to run analysis +10. Explain expected outputs and how to interpret them + +Result: User has complete, ready-to-run analysis with documentation +``` + +## Attribution + +This skill was created based on: +- **BioGeoBEARS** package by Nicholas Matzke +- Tutorial resources from http://phylo.wikidot.com/biogeobears +- Example workflows from the BioGeoBEARS GitHub repository + +## Additional Notes + +**Time estimate for skill execution**: +- File validation: 1-2 minutes +- Directory setup: < 1 minute +- Total setup time: 5-10 minutes + +**Analysis runtime** (separate from skill execution): +- Depends on tree size and number of areas +- Small datasets (<50 tips, ≤5 areas): 10-30 minutes +- Large datasets (>100 tips, >5 areas): 1-6 hours + +**Installation requirements** (user must have): +- R (≥4.0) +- BioGeoBEARS R package +- Supporting packages: ape, rmarkdown, knitr, kableExtra +- Python 3 (for validation script) + +**When to consult references/**: +- Load `biogeobears_details.md` when users need detailed explanations of models, parameters, or interpretation +- Reference it for troubleshooting complex issues +- Use it to help users write methods sections for publications \ No newline at end of file diff --git a/skills/biogeobears/references/biogeobears_details.md b/skills/biogeobears/references/biogeobears_details.md new file mode 100644 index 0000000..410c782 --- /dev/null +++ b/skills/biogeobears/references/biogeobears_details.md @@ -0,0 +1,358 @@ +# BioGeoBEARS Detailed Reference + +## Overview + +BioGeoBEARS (BioGeography with Bayesian and Likelihood Evolutionary Analysis in R Scripts) is an R package for probabilistic inference of historical biogeography on phylogenetic trees. 
It implements various models of range evolution and allows statistical comparison between them. + +## Installation + +```r +# Install dependencies +install.packages("rexpokit") +install.packages("cladoRcpp") + +# Install from GitHub +library(devtools) +devtools::install_github(repo="nmatzke/BioGeoBEARS") +``` + +## Biogeographic Models + +BioGeoBEARS implements several models that differ in their assumptions about how species ranges evolve: + +### DEC (Dispersal-Extinction-Cladogenesis) + +The DEC model is based on LAGRANGE and includes: + +- **Anagenetic changes** (along branches): + - `d` (dispersal): Rate of range expansion into adjacent areas + - `e` (extinction): Rate of local extinction in an area + +- **Cladogenetic events** (at speciation nodes): + - Vicariance: Ancestral range splits between daughter lineages + - Subset sympatry: One daughter inherits full range, other subset + - Range copying: Both daughters inherit full ancestral range + +**Parameters**: 2 (d, e) +**Best for**: General-purpose biogeographic inference + +### DIVALIKE (Vicariance-focused) + +Similar to DIVA (Dispersal-Vicariance Analysis): + +- Emphasizes vicariance at speciation events +- Fixes subset sympatry probability to 0 +- Only allows vicariance and range copying at nodes + +**Parameters**: 2 (d, e) +**Best for**: Systems where vicariance is the primary speciation mode + +### BAYAREALIKE (Sympatry-focused) + +Based on the BayArea model: + +- Emphasizes sympatric speciation +- Fixes vicariance probability to 0 +- Only allows subset sympatry and range copying + +**Parameters**: 2 (d, e) +**Best for**: Systems where dispersal and sympatric speciation dominate + +### +J Extension (Founder-event speciation) + +Any of the above models can include a "+J" parameter: + +- **j**: Jump dispersal / founder-event speciation rate +- Allows instantaneous dispersal to a new area at speciation +- Often significantly improves model fit +- Can be controversial (some argue it's biologically unrealistic) + +**Examples**: DEC+J, DIVALIKE+J, BAYAREALIKE+J +**Additional parameters**: +1 (j) + +## Model Comparison + +### AIC (Akaike Information Criterion) + +``` +AIC = -2 × ln(L) + 2k +``` + +Where: +- ln(L) = log-likelihood +- k = number of parameters + +**Lower AIC = better model** + +### AICc (Corrected AIC) + +Used when sample size is small relative to parameters: + +``` +AICc = AIC + (2k² + 2k)/(n - k - 1) +``` + +### AIC Weights + +Probability that a model is the best among the set: + +``` +w_i = exp(-0.5 × Δ_i) / Σ exp(-0.5 × Δ_j) +``` + +Where Δ_i = AIC_i - AIC_min + +### Likelihood Ratio Test (LRT) + +For nested models (e.g., DEC vs DEC+J): + +``` +LRT = 2 × (ln(L_complex) - ln(L_simple)) +``` + +- Test statistic follows χ² distribution +- df = difference in number of parameters +- p < 0.05 suggests complex model significantly better + +## Input File Formats + +### Phylogenetic Tree (Newick format) + +Standard Newick format with: +- Branch lengths required +- Tip labels must match geography file +- Should be rooted and ultrametric (for time-stratified analyses) + +Example: +``` +((A:1.0,B:1.0):0.5,C:1.5); +``` + +### Geography File (PHYLIP-like format) + +**Format structure:** +``` +n_species [TAB] n_areas [TAB] (area1 area2 area3 ...) +species1 [TAB] 011 +species2 [TAB] 110 +species3 [TAB] 001 +``` + +**Important formatting rules:** + +1. **Line 1 (Header)**: + - Number of species (integer) + - TAB character + - Number of areas (integer) + - TAB character + - Area names in parentheses, separated by spaces + +2. 
**Subsequent lines (Species data)**: + - Species name (must match tree tip label) + - TAB character + - Binary presence/absence code (1=present, 0=absent) + - NO SPACES in the binary code + - NO SPACES in species names (use underscores) + +3. **Common errors to avoid**: + - Using spaces instead of tabs + - Spaces within binary codes + - Species names with spaces + - Mismatch between species names in tree and geography file + - Wrong number of digits in binary code + +**Example file:** +``` +5 3 (A B C) +Sp_alpha 011 +Sp_beta 010 +Sp_gamma 111 +Sp_delta 100 +Sp_epsilon 001 +``` + +## Key Parameters and Settings + +### max_range_size + +Maximum number of areas a species can occupy simultaneously. + +- **Default**: Often set to number of areas, or number of areas - 1 +- **Impact**: Larger values = more possible states = longer computation +- **Recommendation**: Set based on biological realism + +### include_null_range + +Whether to include the "null range" (species extinct everywhere). + +- **Default**: TRUE +- **Purpose**: Allows extinction along branches +- **Recommendation**: Usually keep TRUE + +### force_sparse + +Use sparse matrix operations for speed. + +- **Default**: FALSE +- **When to use**: Large state spaces (many areas) +- **Note**: May cause numerical issues + +### speedup + +Various speedup options. + +- **Default**: TRUE +- **Recommendation**: Usually keep TRUE + +### use_optimx + +Use optimx for parameter optimization. + +- **Default**: TRUE +- **Benefit**: More robust optimization +- **Recommendation**: Keep TRUE + +### calc_ancprobs + +Calculate ancestral state probabilities. + +- **Default**: FALSE +- **Must set to TRUE** if you want ancestral range estimates +- **Impact**: Adds computational time + +## Plotting Functions + +### plot_BioGeoBEARS_results() + +Main function for visualizing results. + +**Key parameters:** + +- `plotwhat`: "pie" (probability distributions) or "text" (ML states) +- `tipcex`: Tip label text size +- `statecex`: Node state text/pie chart size +- `splitcex`: Split state text/pie size (at corners) +- `titlecex`: Title text size +- `plotsplits`: Show cladogenetic events (TRUE/FALSE) +- `include_null_range`: Match analysis setting +- `label.offset`: Distance of tip labels from tree +- `cornercoords_loc`: Directory with corner coordinate files + +**Color scheme:** + +- Single areas: Bright primary colors +- Multi-area ranges: Blended colors +- All areas: White +- Colors automatically assigned and mixed + +## Biogeographical Stochastic Mapping (BSM) + +Extension of BioGeoBEARS that simulates stochastic histories: + +- Generates multiple possible biogeographic histories +- Accounts for uncertainty in ancestral ranges +- Allows visualization of range evolution dynamics +- More computationally intensive + +Not covered in basic workflow but available in package. + +## Common Analysis Workflow + +1. **Prepare inputs** + - Phylogenetic tree (Newick) + - Geography file (PHYLIP format) + - Validate both files + +2. **Setup analysis** + - Define max_range_size + - Load tree and geography data + - Create state space + +3. **Fit models** + - DEC, DIVALIKE, BAYAREALIKE + - With and without +J + - 6 models total is standard + +4. **Compare models** + - AIC/AICc scores + - AIC weights + - LRT for nested comparisons + +5. **Visualize best model** + - Pie charts for probabilities + - Text labels for ML states + - Annotate with split events + +6. 
**Interpret results** + - Ancestral ranges + - Dispersal patterns + - Speciation modes (if using +J) + +## Interpretation Guidelines + +### Dispersal rate (d) + +- **High d**: Frequent range expansions +- **Low d**: Species mostly stay in current ranges +- **Units**: Expected dispersal events per lineage per time unit + +### Extinction rate (e) + +- **High e**: Ranges frequently contract +- **Low e**: Stable occupancy once established +- **Relative to d**: d/e ratio indicates dispersal vs. contraction tendency + +### Founder-event rate (j) + +- **High j**: Jump dispersal important in clade evolution +- **Low j** (but model still better): Minor role but statistically supported +- **j = 0** (in +J model): Founder events not supported + +### Model selection insights + +- **DEC favored**: Balanced dispersal, extinction, and vicariance +- **DIVALIKE favored**: Vicariance-driven diversification +- **BAYAREALIKE favored**: Sympatric speciation and dispersal +- **+J improves fit**: Founder-event speciation may be important + +## Computational Considerations + +### Runtime factors + +- **Number of tips**: Polynomial scaling +- **Number of areas**: Exponential scaling in state space +- **max_range_size**: Major impact (reduces state space) +- **Tree depth**: Linear scaling + +### Memory usage + +- Large trees + many areas can require substantial RAM +- Sparse matrices help but have trade-offs + +### Optimization issues + +- Complex likelihood surfaces +- Multiple local optima possible +- May need multiple optimization runs +- Check parameter estimates for sensibility + +## Citations + +**Main BioGeoBEARS reference:** +Matzke, N. J. (2013). Probabilistic historical biogeography: new models for founder-event speciation, imperfect detection, and fossils allow improved accuracy and model-testing. *Frontiers of Biogeography*, 5(4), 242-248. + +**LAGRANGE (DEC model origin):** +Ree, R. H., & Smith, S. A. (2008). Maximum likelihood inference of geographic range evolution by dispersal, local extinction, and cladogenesis. *Systematic Biology*, 57(1), 4-14. + +**+J parameter discussion:** +Ree, R. H., & Sanmartín, I. (2018). Conceptual and statistical problems with the DEC+J model of founder-event speciation and its comparison with DEC via model selection. *Journal of Biogeography*, 45(4), 741-749. + +**Model comparison best practices:** +Burnham, K. P., & Anderson, D. R. (2002). *Model Selection and Multimodel Inference: A Practical Information-Theoretic Approach* (2nd ed.). Springer. 
+ +## Further Resources + +- **BioGeoBEARS wiki**: http://phylo.wikidot.com/biogeobears +- **GitHub repository**: https://github.com/nmatzke/BioGeoBEARS +- **Google Group**: biogeobears@googlegroups.com +- **Tutorial scripts**: Available in package `inst/extdata/examples/` diff --git a/skills/biogeobears/scripts/biogeobears_analysis_template.Rmd b/skills/biogeobears/scripts/biogeobears_analysis_template.Rmd new file mode 100644 index 0000000..3357cb8 --- /dev/null +++ b/skills/biogeobears/scripts/biogeobears_analysis_template.Rmd @@ -0,0 +1,404 @@ +--- +title: "BioGeoBEARS Biogeographic Analysis" +author: "Generated by Claude Code" +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_float: true + code_folding: show + theme: flatly +params: + tree_file: "tree.nwk" + geog_file: "geography.data" + max_range_size: 4 + models: "DEC,DEC+J,DIVALIKE,DIVALIKE+J" + output_dir: "results" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) +library(BioGeoBEARS) +library(ape) +library(knitr) +library(kableExtra) +``` + +# Analysis Parameters + +```{r parameters, echo=FALSE} +params_df <- data.frame( + Parameter = c("Tree file", "Geography file", "Max range size", "Models to test", "Output directory"), + Value = c(params$tree_file, params$geog_file, params$max_range_size, params$models, params$output_dir) +) + +kable(params_df, caption = "Analysis Parameters") %>% + kable_styling(bootstrap_options = c("striped", "hover")) +``` + +# Input Data + +## Phylogenetic Tree + +```{r load-tree} +trfn <- params$tree_file +tr <- read.tree(trfn) + +cat(paste("Number of tips:", length(tr$tip.label), "\n")) +cat(paste("Tree is rooted:", is.rooted(tr), "\n")) +cat(paste("Tree is ultrametric:", is.ultrametric(tr), "\n")) + +# Plot tree +plot(tr, cex = 0.6, main = "Input Phylogeny") +``` + +## Geographic Distribution Data + +```{r load-geography} +geogfn <- params$geog_file +tipranges <- getranges_from_LagrangePHYLIP(lgdata_fn = geogfn) + +cat(paste("Number of species:", nrow(tipranges@df), "\n")) +cat(paste("Number of areas:", ncol(tipranges@df), "\n")) +cat(paste("Area names:", paste(names(tipranges@df), collapse = ", "), "\n\n")) + +# Display geography matrix +kable(tipranges@df, caption = "Species Distribution Matrix (1 = present, 0 = absent)") %>% + kable_styling(bootstrap_options = c("striped", "hover"), font_size = 10) %>% + scroll_box(height = "400px") +``` + +## State Space Setup + +```{r state-space} +max_range_size <- params$max_range_size +numareas <- ncol(tipranges@df) + +num_states <- numstates_from_numareas(numareas = numareas, + maxareas = max_range_size, + include_null_range = TRUE) + +cat(paste("Maximum range size:", max_range_size, "\n")) +cat(paste("Number of possible states:", num_states, "\n")) +``` + +# Model Fitting + +```{r setup-output} +# Create output directory +if (!dir.exists(params$output_dir)) { + dir.create(params$output_dir, recursive = TRUE) +} + +# Parse models to run +models_to_run <- unlist(strsplit(params$models, ",")) +models_to_run <- trimws(models_to_run) + +cat("Models to fit:\n") +for (model in models_to_run) { + cat(paste(" -", model, "\n")) +} +``` + +```{r model-fitting, results='hide'} +# Storage for results +results_list <- list() +model_comparison <- data.frame( + Model = character(), + LnL = numeric(), + nParams = integer(), + AIC = numeric(), + AICc = numeric(), + d = numeric(), + e = numeric(), + j = numeric(), + stringsAsFactors = FALSE +) + +# Helper function to setup and run a model 
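+# Each model variant is specified by editing the params_table of the run
+# object's model: each row is a parameter (d, e, j, the cladogenesis weights
+# ysv/ys/y/s/v, and the mx01* range-inheritance weights) whose "type" is
+# "free", "fixed", or a formula tying it to the other rows.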
+run_biogeobears_model <- function(model_name, BioGeoBEARS_run_object) {
+  cat(paste("\n\nFitting model:", model_name, "\n"))
+
+  # Work on a local copy of the parameter table, then write it back
+  pt <- BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table
+
+  # Configure the cladogenesis model; settings follow the standard example
+  # script at http://phylo.wikidot.com/biogeobears
+  if (grepl("DIVALIKE", model_name)) {
+    # DIVALIKE: vicariance emphasized, subset sympatry disallowed
+    pt["s", "type"] <- "fixed"
+    pt["s", "init"] <- 0.0
+    pt["s", "est"] <- 0.0
+    pt["ysv", "type"] <- "2-j"
+    pt["ys", "type"] <- "ysv*1/2"
+    pt["y", "type"] <- "ysv*1/2"
+    pt["v", "type"] <- "ysv*1/2"
+    # Classic, widespread vicariance: all range-split sizes equiprobable
+    pt["mx01v", "type"] <- "fixed"
+    pt["mx01v", "init"] <- 0.5
+    pt["mx01v", "est"] <- 0.5
+  } else if (grepl("BAYAREALIKE", model_name)) {
+    # BAYAREALIKE: no vicariance and no subset sympatry; widespread
+    # sympatry (range copying) dominates
+    pt["s", "type"] <- "fixed"
+    pt["s", "init"] <- 0.0
+    pt["s", "est"] <- 0.0
+    pt["v", "type"] <- "fixed"
+    pt["v", "init"] <- 0.0
+    pt["v", "est"] <- 0.0
+    pt["ysv", "type"] <- "1-j"
+    pt["ys", "type"] <- "ysv*1/1"
+    pt["y", "type"] <- "1-j"
+    pt["mx01y", "type"] <- "fixed"
+    pt["mx01y", "init"] <- 0.9999
+    pt["mx01y", "est"] <- 0.9999
+  }
+  # DEC needs no changes: d and e are free and j is fixed at 0 by default
+
+  # Add +J (founder-event speciation) parameter if specified
+  if (grepl("\\+J", model_name)) {
+    pt["j", "type"] <- "free"
+    pt["j", "init"] <- 0.0001
+    pt["j", "est"] <- 0.0001
+    # Keep j within the bounds implied by the cladogenesis weights
+    if (grepl("DIVALIKE", model_name)) pt["j", "max"] <- 1.99999
+    if (grepl("BAYAREALIKE", model_name)) pt["j", "max"] <- 0.99999
+  } else {
+    pt["j", "type"] <- "fixed"
+    pt["j", "init"] <- 0.0
+    pt["j", "est"] <- 0.0
+  }
+
+  BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table <- pt
+
+  # Run optimization
+  res <- bears_optim_run(BioGeoBEARS_run_object)
+
+  return(res)
+}
+
+# Base run object setup
+BioGeoBEARS_run_object <- define_BioGeoBEARS_run()
+BioGeoBEARS_run_object$trfn <- trfn
+BioGeoBEARS_run_object$geogfn <- geogfn
+BioGeoBEARS_run_object$max_range_size <- max_range_size
+BioGeoBEARS_run_object$min_branchlength <- 0.000001
+BioGeoBEARS_run_object$include_null_range <- TRUE
+BioGeoBEARS_run_object$force_sparse <- FALSE
+BioGeoBEARS_run_object$speedup <- TRUE
+BioGeoBEARS_run_object$use_optimx <- TRUE
+BioGeoBEARS_run_object$calc_ancprobs <- TRUE
+BioGeoBEARS_run_object <- readfiles_BioGeoBEARS_run(BioGeoBEARS_run_object)
+# Sanity-check that the input files and settings are consistent
+check_BioGeoBEARS_run(BioGeoBEARS_run_object)
+
+# Fit each model
+for (model in models_to_run) {
+  tryCatch({
+    res <- run_biogeobears_model(model, BioGeoBEARS_run_object)
+    results_list[[model]] <- res
+
+    # Save result
+    save(res, file = file.path(params$output_dir, paste0(model, "_result.Rdata")))
+
+    # Extract fit statistics: AIC = -2*lnL + 2k, AICc adds the
+    # small-sample correction (n = number of tips)
+    params_table <- res$outputs@params_table
+    lnl <- get_LnL_from_BioGeoBEARS_results_object(res)
+    k <- sum(params_table$type == "free")
+    n <- length(tr$tip.label)
+    aic <- -2 * lnl + 2 * k
+    model_comparison <- rbind(model_comparison, data.frame(
+      Model = model,
+      LnL = lnl,
+      nParams = k,
+      AIC = aic,
+      AICc = aic + (2 * k^2 + 2 * k) / (n - k - 1),
+      d = params_table["d", "est"],
+      e = params_table["e", "est"],
+      j = params_table["j", "est"],
+      stringsAsFactors = FALSE
+    ))
+  }, error = function(e) {
+    cat(paste("Error fitting model", model, ":", e$message, "\n"))
+  })
+}
+```
+
+# Model Comparison
+
+```{r model-comparison}
+# Calculate AIC weights
+if (nrow(model_comparison) > 0) {
+  model_comparison$delta_AIC <- model_comparison$AIC - min(model_comparison$AIC)
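+  # Akaike weights: w_i = exp(-0.5 * delta_i) / sum_j(exp(-0.5 * delta_j))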
+  model_comparison$AIC_weight <- exp(-0.5 * model_comparison$delta_AIC) /
+    sum(exp(-0.5 * model_comparison$delta_AIC))
+
+  # Sort by AIC
+  model_comparison <- model_comparison[order(model_comparison$AIC), ]
+
+  # Model selection summary
+  best_model <- model_comparison$Model[1]
+  cat(paste("\n\nBest model by AIC:", best_model, "\n"))
+  cat(paste("AIC weight:", round(model_comparison$AIC_weight[1], 3), "\n"))
+
+  # The table must be the last expression in the block so knitr renders it
+  kable(model_comparison, digits = 3,
+        caption = "Model Comparison (sorted by AIC)") %>%
+    kable_styling(bootstrap_options = c("striped", "hover")) %>%
+    row_spec(1, bold = TRUE, background = "#d4edda") # Highlight best model
+}
+```
+
+# Ancestral Range Reconstruction
+
+## Best Model: `r if(exists('best_model')) best_model else 'TBD'`
+
+```{r plot-best-model, fig.width=10, fig.height=12}
+if (exists('best_model') && best_model %in% names(results_list)) {
+  res_best <- results_list[[best_model]]
+
+  # Create plots directory
+  plots_dir <- file.path(params$output_dir, "plots")
+  if (!dir.exists(plots_dir)) {
+    dir.create(plots_dir, recursive = TRUE)
+  }
+
+  # Directory holding the corner-coordinates scripts that
+  # plot_BioGeoBEARS_results() needs when plotsplits = TRUE
+  scriptdir <- system.file("extdata/a_scripts", package = "BioGeoBEARS")
+
+  analysis_titletxt <- paste("BioGeoBEARS:", best_model)
+
+  # Plot with pie charts
+  pdf(file.path(plots_dir, paste0(best_model, "_pie.pdf")), width = 10, height = 12)
+
+  plot_BioGeoBEARS_results(
+    results_object = res_best,
+    analysis_titletxt = analysis_titletxt,
+    addl_params = list("j"),
+    plotwhat = "pie",
+    label.offset = 0.5,
+    tipcex = 0.7,
+    statecex = 0.7,
+    splitcex = 0.6,
+    titlecex = 0.8,
+    plotsplits = TRUE,
+    cornercoords_loc = scriptdir,
+    include_null_range = TRUE,
+    tr = tr,
+    tipranges = tipranges
+  )
+
+  dev.off()
+
+  # Also create text plot
+  pdf(file.path(plots_dir, paste0(best_model, "_text.pdf")), width = 10, height = 12)
+
+  plot_BioGeoBEARS_results(
+    results_object = res_best,
+    analysis_titletxt = analysis_titletxt,
+    addl_params = list("j"),
+    plotwhat = "text",
+    label.offset = 0.5,
+    tipcex = 0.7,
+    statecex = 0.7,
+    splitcex = 0.6,
+    titlecex = 0.8,
+    plotsplits = TRUE,
+    cornercoords_loc = scriptdir,
+    include_null_range = TRUE,
+    tr = tr,
+    tipranges = tipranges
+  )
+
+  dev.off()
+
+  # Display in notebook (pie chart version)
+  plot_BioGeoBEARS_results(
+    results_object = res_best,
+    analysis_titletxt = analysis_titletxt,
+    addl_params = list("j"),
+    plotwhat = "pie",
+    label.offset = 0.5,
+    tipcex = 0.7,
+    statecex = 0.7,
+    splitcex = 0.6,
+    titlecex = 0.8,
+    plotsplits = TRUE,
+    cornercoords_loc = scriptdir,
+    include_null_range = TRUE,
+    tr = tr,
+    tipranges = tipranges
+  )
+
+  cat(paste("\n\nPlots saved to:", plots_dir, "\n"))
+}
+```
+
+# Parameter Estimates
+
+```{r parameter-estimates, fig.width=10, fig.height=6}
+if (nrow(model_comparison) > 0 && exists("best_model")) {
+  par(mfrow = c(1, 3))
+
+  # Plot d (dispersal) estimates
+  barplot(model_comparison$d, names.arg = model_comparison$Model,
+          main = "Dispersal Rate (d)", ylab = "Rate", las = 2, cex.names = 0.8,
+          col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
+
+  # Plot e (extinction) estimates
+  barplot(model_comparison$e, names.arg = model_comparison$Model,
+          main = "Extinction Rate (e)", ylab = "Rate", las = 2, cex.names = 0.8,
+          col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
+
+  # Plot j (founder-event) estimates; zeros are masked so models
+  # without +J show no bar
+  j_vals <- model_comparison$j
+  j_vals[j_vals == 0] <- NA
+  barplot(j_vals, names.arg = model_comparison$Model,
+          main = "Founder-event Rate (j)", ylab = "Rate", las = 2, cex.names = 0.8,
0.8,
+          col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
+}
+```
+
+# Likelihood Ratio Tests
+
+```{r lrt-tests}
+# Compare models with and without +J
+if (nrow(model_comparison) > 0) {
+  lrt_results <- data.frame(
+    Comparison = character(),
+    Model1 = character(),
+    Model2 = character(),
+    LRT_statistic = numeric(),
+    df = integer(),
+    p_value = numeric(),
+    stringsAsFactors = FALSE
+  )
+
+  base_model_names <- c("DEC", "DIVALIKE", "BAYAREALIKE")
+
+  for (base in base_model_names) {
+    j_model <- paste0(base, "+J")
+
+    if (base %in% model_comparison$Model && j_model %in% model_comparison$Model) {
+      lnl_base <- model_comparison[model_comparison$Model == base, "LnL"]
+      lnl_j <- model_comparison[model_comparison$Model == j_model, "LnL"]
+
+      lrt_stat <- 2 * (lnl_j - lnl_base)
+      df <- 1  # One additional parameter (j)
+      p_val <- pchisq(lrt_stat, df = df, lower.tail = FALSE)
+
+      lrt_results <- rbind(lrt_results, data.frame(
+        Comparison = paste(base, "vs", j_model),
+        Model1 = base,
+        Model2 = j_model,
+        LRT_statistic = lrt_stat,
+        df = df,
+        p_value = p_val,
+        stringsAsFactors = FALSE
+      ))
+    }
+  }
+
+  if (nrow(lrt_results) > 0) {
+    lrt_results$Significant <- ifelse(lrt_results$p_value < 0.05, "Yes*", "No")
+
+    # Keep the styled table as the last expression of the block so knitr renders
+    # it; the footnote is folded into the caption for the same reason
+    kable(lrt_results, digits = 4,
+          caption = "Likelihood Ratio Tests (nested model comparisons); * p < 0.05 indicates significant improvement with the +J parameter") %>%
+      kable_styling(bootstrap_options = c("striped", "hover"))
+  }
+}
+```
+
+# Session Info
+
+```{r session-info}
+sessionInfo()
+```
+
+# Outputs
+
+All results have been saved to: **`r params$output_dir`**
+
+Files generated:
+
+- `[MODEL]_result.Rdata` - R data files with complete model results
+- `plots/[MODEL]_pie.pdf` - Phylogeny with pie charts showing ancestral range probabilities
+- `plots/[MODEL]_text.pdf` - Phylogeny with text labels showing most likely ancestral ranges
+- `biogeobears_analysis_template.html` - This HTML report
+
+To load a saved result in R:
+```r
+load("results/DEC+J_result.Rdata")
+```
diff --git a/skills/biogeobears/scripts/validate_geography_file.py b/skills/biogeobears/scripts/validate_geography_file.py
new file mode 100755
index 0000000..cf87a8f
--- /dev/null
+++ b/skills/biogeobears/scripts/validate_geography_file.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+"""
+Validates and optionally reformats a BioGeoBEARS geography file.
+
+Geography files must follow the PHYLIP-like format:
+Line 1: n_species [TAB] n_areas [TAB] (area1 area2 area3 ...)
+Lines 2+: species_name [TAB] binary_string (e.g., 011 for absent in area1, present in area2 and area3)
+
+Common errors:
+- Spaces instead of tabs
+- Spaces in species names
+- Spaces within binary strings
+- Species names not matching tree tip labels
+"""
+
+import sys
+import argparse
+import re
+from pathlib import Path
+
+
+def validate_geography_file(filepath, tree_tips=None):
+    """
+    Validate geography file format.
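+
+    Example of a valid geography file (illustrative species names;
+    tabs shown as <TAB>):
+
+        3<TAB>2<TAB>(A B)
+        Homo_sapiens<TAB>10
+        Pan_troglodytes<TAB>11
+        Gorilla_gorilla<TAB>01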
+ + Args: + filepath: Path to geography file + tree_tips: Optional set of tree tip labels to validate against + + Returns: + dict with validation results and any errors/warnings + """ + errors = [] + warnings = [] + info = {} + + with open(filepath, 'r') as f: + lines = [line.rstrip('\n\r') for line in f.readlines()] + + if not lines: + errors.append("File is empty") + return {'valid': False, 'errors': errors, 'warnings': warnings, 'info': info} + + # Parse header line + header = lines[0] + if '\t' not in header: + errors.append("Line 1: Missing tab delimiter (should be: n_species [TAB] n_areas [TAB] (area_names))") + else: + parts = header.split('\t') + if len(parts) < 3: + errors.append("Line 1: Expected format 'n_species [TAB] n_areas [TAB] (area_names)'") + else: + try: + n_species = int(parts[0]) + n_areas = int(parts[1]) + + # Parse area names + area_part = parts[2].strip() + if not (area_part.startswith('(') and area_part.endswith(')')): + errors.append("Line 1: Area names should be in parentheses: (A B C)") + else: + areas = area_part[1:-1].split() + if len(areas) != n_areas: + errors.append(f"Line 1: Declared {n_areas} areas but found {len(areas)} area names") + + info['n_species'] = n_species + info['n_areas'] = n_areas + info['areas'] = areas + + # Validate species lines + species_found = [] + for i, line in enumerate(lines[1:], start=2): + if not line.strip(): + continue + + if '\t' not in line: + errors.append(f"Line {i}: Missing tab between species name and binary code") + continue + + parts = line.split('\t') + if len(parts) != 2: + errors.append(f"Line {i}: Expected exactly one tab between species name and binary code") + continue + + species_name = parts[0] + binary_code = parts[1] + + # Check for spaces in species name + if ' ' in species_name: + errors.append(f"Line {i}: Species name '{species_name}' contains spaces (use underscores instead)") + + # Check for spaces in binary code + if ' ' in binary_code or '\t' in binary_code: + errors.append(f"Line {i}: Binary code '{binary_code}' contains spaces or tabs (should be like '011' with no spaces)") + + # Check binary code length + if len(binary_code) != n_areas: + errors.append(f"Line {i}: Binary code length ({len(binary_code)}) doesn't match number of areas ({n_areas})") + + # Check binary code characters + if not all(c in '01' for c in binary_code): + errors.append(f"Line {i}: Binary code contains invalid characters (only 0 and 1 allowed)") + + species_found.append(species_name) + + # Check species count + if len(species_found) != n_species: + warnings.append(f"Header declares {n_species} species but found {len(species_found)} data lines") + + info['species'] = species_found + + # Check against tree tips if provided + if tree_tips: + species_set = set(species_found) + tree_set = set(tree_tips) + + missing_in_tree = species_set - tree_set + missing_in_geog = tree_set - species_set + + if missing_in_tree: + errors.append(f"Species in geography file but not in tree: {', '.join(sorted(missing_in_tree))}") + if missing_in_geog: + errors.append(f"Species in tree but not in geography file: {', '.join(sorted(missing_in_geog))}") + + except ValueError: + errors.append("Line 1: First two fields must be integers (n_species and n_areas)") + + return { + 'valid': len(errors) == 0, + 'errors': errors, + 'warnings': warnings, + 'info': info + } + + +def reformat_geography_file(input_path, output_path, delimiter=','): + """ + Attempt to reformat a geography file from common formats. 
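+
+    Illustrative sketch of the conversion, assuming comma-delimited input
+    with a header row (species names are hypothetical):
+
+        input.csv                    output (PHYLIP-like)
+        species,North,South          2<TAB>2<TAB>(North South)
+        Homo sapiens,1,0             Homo_sapiens<TAB>10
+        Pan troglodytes,1,1          Pan_troglodytes<TAB>11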
+ + Args: + input_path: Path to input file + output_path: Path for output file + delimiter: Delimiter used in input file (default: comma) + """ + with open(input_path, 'r') as f: + lines = [line.strip() for line in f.readlines()] + + # Detect if first line is a header + header_line = lines[0] + has_header = not header_line[0].isdigit() + + if has_header: + # Parse area names from header + parts = header_line.split(delimiter) + species_col = parts[0] + area_names = [p.strip() for p in parts[1:]] + data_lines = lines[1:] + else: + # No header, infer from first data line + parts = lines[0].split(delimiter) + n_areas = len(parts) - 1 + area_names = [chr(65 + i) for i in range(n_areas)] # A, B, C, ... + data_lines = lines + + # Parse species data + species_data = [] + for line in data_lines: + if not line: + continue + parts = line.split(delimiter) + if len(parts) < 2: + continue + + species_name = parts[0].strip().replace(' ', '_') + presence = ''.join(['1' if p.strip() in ['1', 'present', 'Present', 'TRUE', 'True'] else '0' + for p in parts[1:]]) + species_data.append((species_name, presence)) + + # Write output + with open(output_path, 'w') as f: + # Header line + n_species = len(species_data) + n_areas = len(area_names) + f.write(f"{n_species}\t{n_areas}\t({' '.join(area_names)})\n") + + # Species lines + for species_name, binary_code in species_data: + f.write(f"{species_name}\t{binary_code}\n") + + print(f"Reformatted {n_species} species across {n_areas} areas") + print(f"Output written to: {output_path}") + + +def main(): + parser = argparse.ArgumentParser( + description='Validate and reformat BioGeoBEARS geography files', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Validate a geography file + python validate_geography_file.py input.txt --validate + + # Reformat from CSV to PHYLIP format + python validate_geography_file.py input.csv --reformat -o output.data + + # Reformat with tab delimiter + python validate_geography_file.py input.txt --reformat --delimiter tab -o output.data + """ + ) + + parser.add_argument('input', help='Input geography file') + parser.add_argument('--validate', action='store_true', + help='Validate the file format') + parser.add_argument('--reformat', action='store_true', + help='Reformat file to BioGeoBEARS format') + parser.add_argument('-o', '--output', + help='Output file path (required for --reformat)') + parser.add_argument('--delimiter', default=',', + help='Delimiter in input file (default: comma). 
Use "tab" for tab-delimited.') + parser.add_argument('--tree', + help='Newick tree file to validate species names against') + + args = parser.parse_args() + + if args.delimiter.lower() == 'tab': + args.delimiter = '\t' + + # Parse tree tips if provided + tree_tips = None + if args.tree: + try: + with open(args.tree, 'r') as f: + tree_string = f.read().strip() + # Extract tip labels using regex + tree_tips = re.findall(r'([^(),:\s]+):', tree_string) + if not tree_tips: + tree_tips = re.findall(r'([^(),:\s]+)[,)]', tree_string) + print(f"Found {len(tree_tips)} tips in tree file") + except Exception as e: + print(f"Warning: Could not parse tree file: {e}") + + if args.validate: + result = validate_geography_file(args.input, tree_tips) + + print(f"\nValidation Results for: {args.input}") + print("=" * 60) + + if result['info']: + print(f"\nFile Info:") + print(f" Species: {result['info'].get('n_species', 'unknown')}") + print(f" Areas: {result['info'].get('n_areas', 'unknown')}") + if 'areas' in result['info']: + print(f" Area names: {', '.join(result['info']['areas'])}") + + if result['warnings']: + print(f"\nWarnings ({len(result['warnings'])}):") + for warning in result['warnings']: + print(f" ⚠️ {warning}") + + if result['errors']: + print(f"\nErrors ({len(result['errors'])}):") + for error in result['errors']: + print(f" ❌ {error}") + else: + print(f"\n✅ File is valid!") + + return 0 if result['valid'] else 1 + + elif args.reformat: + if not args.output: + print("Error: --output required when using --reformat") + return 1 + + try: + reformat_geography_file(args.input, args.output, args.delimiter) + + # Validate reformatted file + result = validate_geography_file(args.output, tree_tips) + if result['valid']: + print("✅ Reformatted file is valid!") + else: + print("\n⚠️ Reformatted file has validation errors:") + for error in result['errors']: + print(f" ❌ {error}") + return 1 + + except Exception as e: + print(f"Error during reformatting: {e}") + return 1 + + else: + parser.print_help() + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/skills/phylo_from_buscos/.skillignore b/skills/phylo_from_buscos/.skillignore new file mode 100644 index 0000000..adf62cf --- /dev/null +++ b/skills/phylo_from_buscos/.skillignore @@ -0,0 +1,12 @@ +# Exclude development materials from skill packaging +info_to_craft_skill/ + +# Exclude GitHub documentation (not needed in skill package) +README.md + +# Exclude local settings +.claude/ + +# Exclude git files +.git/ +.gitignore diff --git a/skills/phylo_from_buscos/README.md b/skills/phylo_from_buscos/README.md new file mode 100644 index 0000000..9e049e2 --- /dev/null +++ b/skills/phylo_from_buscos/README.md @@ -0,0 +1,99 @@ +# BUSCO-based Phylogenomics Skill + +A Claude Code skills for phylogenomic analyses, created by Bruno de Medeiros (Field Museum) based on code initially written by Paul Frandsen (Brigham Young University) + +It generate a complete phylogenetic workflow from genome assemblies using BUSCO/compleasm-based single-copy orthologs. 
+
+**Features:**
+- Supports local genome files and NCBI accessions (BioProjects/Assemblies)
+- Generates scheduler-specific scripts (SLURM, PBS, cloud, local)
+- Uses modern tools (compleasm, MAFFT, IQ-TREE, ASTRAL)
+- Multiple alignment trimming options
+- Both concatenation and coalescent approaches
+- Quality control with recommendations
+- Writes a draft methods paragraph describing the pipeline for publications
+
+**Use when you need to:**
+- Build phylogenetic trees from multiple genome assemblies
+- Extract and align single-copy orthologs across genomes
+- Download genomes from NCBI by accession
+- Generate ready-to-run scripts for your computing environment
+
+## Installation
+
+See the README in the repository root folder for plugin installation.
+
+## Usage
+
+Once installed, simply describe your phylogenomics task:
+
+```
+I need to generate a phylogeny from 20 genome assemblies on a SLURM cluster
+```
+
+Claude Code will automatically activate the appropriate skill and guide you through the workflow.
+
+## Workflow Overview
+
+The complete phylogenomics pipeline:
+
+1. **Input Preparation** - Download NCBI genomes if needed
+2. **Ortholog Identification** - Run compleasm/BUSCO on all genomes
+3. **Quality Control** - Assess genome completeness with recommendations
+4. **Ortholog Extraction** - Generate per-locus unaligned FASTA files
+5. **Alignment** - Align orthologs with MAFFT
+6. **Trimming** - Remove poorly aligned regions (Aliscore/ALICUT, trimAl, BMGE, ClipKit)
+7. **Concatenation** - Build supermatrix with partition scheme
+8. **Phylogenetic Inference** - Generate ML concatenated tree (IQ-TREE), gene trees, and coalescent species tree (ASTRAL)
+
+## Requirements
+
+Use this skill through Claude Code rather than the web interface: Claude can then help you install all requirements.
+
+The skill generates scripts that install and use:
+
+- **compleasm** or BUSCO - ortholog detection
+- **MAFFT** - multiple sequence alignment
+- **Aliscore/ALICUT, trimAl, BMGE, or ClipKit** - alignment trimming
+- **FASconCAT** - alignment concatenation
+- **IQ-TREE** - maximum likelihood phylogenetic inference
+- **ASTRAL** - coalescent species tree estimation
+- **NCBI Datasets CLI** - genome download (if using NCBI accessions)
+
+## Computing Environments
+
+The skill supports multiple computing environments:
+
+- **SLURM clusters** - generates SBATCH array jobs
+- **PBS/Torque clusters** - generates PBS array jobs
+- **Local machines** - sequential execution scripts
+
+## Attribution
+
+Created by **Bruno de Medeiros** (Curator of Pollinating Insects, Field Museum) based on phylogenomics tutorials by **Paul Frandsen** (Brigham Young University).
+
+## Citation
+
+If you use this skill for published research, please cite this repository and also:
+
+- **compleasm**: Huang, N., & Li, H. (2023). compleasm: a faster and more accurate reimplementation of BUSCO. *Bioinformatics*, 39(10), btad595.
+- **MAFFT**: Katoh, K., & Standley, D. M. (2013). MAFFT multiple sequence alignment software version 7: improvements in performance and usability. *Molecular Biology and Evolution*, 30(4), 772-780.
+- **IQ-TREE**: Minh, B. Q., et al. (2020). IQ-TREE 2: New models and efficient methods for phylogenetic inference in the genomic era. *Molecular Biology and Evolution*, 37(5), 1530-1534.
+- **ASTRAL**: Zhang, C., et al. (2018). ASTRAL-III: polynomial time species tree reconstruction from partially resolved gene trees. *BMC Bioinformatics*, 19(Suppl 6), 153.
+
+Plus any trimming tool you use (Aliscore/ALICUT, trimAl, BMGE, or ClipKit).
+
+## License
+
+MIT License - see individual tool licenses for software dependencies.
+ +## Support + +For issues or questions: +- Open an issue in this repository +- Contact Bruno de Medeiros at the Field Museum (bdemedeiros@fieldmuseum.org) + +## Acknowledgments + +Special thanks to Paul Frandsen (BYU) for creating the excellent phylogenomics tutorials that form the foundation of this skill. diff --git a/skills/phylo_from_buscos/SKILL.md b/skills/phylo_from_buscos/SKILL.md new file mode 100644 index 0000000..2bd2584 --- /dev/null +++ b/skills/phylo_from_buscos/SKILL.md @@ -0,0 +1,757 @@ +--- +name: busco-phylogeny +description: Generate phylogenies from genome assemblies using BUSCO/compleasm-based single-copy orthologs with scheduler-aware workflow generation +--- + +# BUSCO-based Phylogenomics Workflow Generator + +This skill provides phylogenomics expertise for generating comprehensive, scheduler-aware workflows for phylogenetic inference from genome assemblies using single-copy orthologs. + +## Purpose + +This skill helps users generate phylogenies from genome assemblies by: +1. Handling mixed input (local files and NCBI accessions) +2. Creating scheduler-specific scripts (SLURM, PBS, cloud, local) +3. Setting up complete workflows from raw genomes to final trees +4. Providing quality control and recommendations +5. Supporting flexible software management (bioconda, Docker, custom) + +## Available Resources + +The skill provides access to these bundled resources: + +### Scripts (`scripts/`) +- **`query_ncbi_assemblies.py`** - Query NCBI for available genome assemblies by taxon name (new!) +- **`download_ncbi_genomes.py`** - Download genomes from NCBI using BioProjects or Assembly accessions +- **`rename_genomes.py`** - Rename genome files with meaningful sample names (important!) +- **`generate_qc_report.sh`** - Generate quality control reports from compleasm results +- **`extract_orthologs.sh`** - Extract and reorganize single-copy orthologs +- **`run_aliscore.sh`** - Wrapper for Aliscore to identify randomly similar sequences (RSS) +- **`run_alicut.sh`** - Wrapper for ALICUT to remove RSS positions from alignments +- **`run_aliscore_alicut_batch.sh`** - Batch process all alignments through Aliscore + ALICUT +- **`convert_fasconcat_to_partition.py`** - Convert FASconCAT output to IQ-TREE partition format +- **`predownloaded_aliscore_alicut/`** - Pre-tested Aliscore and ALICUT Perl scripts + +### Templates (`templates/`) +- **`slurm/`** - SLURM job scheduler templates +- **`pbs/`** - PBS/Torque job scheduler templates +- **`local/`** - Local machine templates (with GNU parallel) +- **`README.md`** - Complete template documentation + +### References (`references/`) +- **`REFERENCE.md`** - Detailed technical reference including: + - Sample naming best practices + - BUSCO lineage datasets (complete list) + - Resource recommendations (memory, CPUs, walltime) + - Detailed step-by-step implementation guides + - Quality control guidelines + - Aliscore/ALICUT detailed guide + - Tool citations and download links + - Software installation guide + - Common issues and troubleshooting + +## Workflow Overview + +The complete phylogenomics pipeline follows this sequence: + +**Input Preparation** → **Ortholog Identification** → **Quality Control** → **Ortholog Extraction** → **Alignment** → **Trimming** → **Concatenation** → **Phylogenetic Inference** + +## Initial User Questions + +When a user requests phylogeny generation, gather the following information systematically: + +### Step 1: Detect Computing Environment + +Before asking questions, attempt to detect the local computing 
environment: + +```bash +# Check for job schedulers +command -v sbatch >/dev/null 2>&1 # SLURM +command -v qsub >/dev/null 2>&1 # PBS/Torque +command -v parallel >/dev/null 2>&1 # GNU parallel +``` + +Report findings to the user, then confirm: **"I detected [X] on this machine. Will you be running the scripts here or on a different system?"** + +### Required Information + +Ask these questions to gather essential workflow parameters: + +1. **Computing Environment** + - Where will these scripts run? (SLURM cluster, PBS/Torque cluster, Cloud computing, Local machine) + +2. **Input Data** + - Local genome files, NCBI accessions, or both? + - If NCBI: Do you already have Assembly accessions (GCA_*/GCF_*) or BioProject accessions (PRJNA*/PRJEB*/PRJDA*)? + - If user doesn't have accessions: Offer to help find assemblies using `query_ncbi_assemblies.py` (see "STEP 0A: Query NCBI for Assemblies" below) + - If local files: What are the file paths? + +3. **Taxonomic Scope & Dataset Details** + - What taxonomic group? (determines BUSCO lineage dataset) + - How many taxa/genomes will be analyzed? + - What is the approximate phylogenetic breadth? (species-level, genus-level, family-level, order-level, etc.) + - See `references/REFERENCE.md` for complete lineage list + +4. **Environment Management** + - Use unified conda environment (default, recommended), or separate environments per tool? + +5. **Resource Constraints** + - How many CPU cores/threads to use in total? (Ask user to specify, do not auto-detect) + - Available memory (RAM) per node/machine? + - Maximum walltime for jobs? + - See `references/REFERENCE.md` for resource recommendations + +6. **Parallelization Strategy** + + Ask the user how they want to handle parallel processing: + + - **For job schedulers (SLURM/PBS)**: + - Use array jobs for parallel steps? (Recommended: Yes) + - Which steps to parallelize? (Steps 2, 5, 6, 8C recommended) + + - **For local machines**: + - Use GNU parallel for parallel steps? (requires `parallel` installed) + - How many concurrent jobs? + + - **For all systems**: + - Optimize for maximum throughput or simplicity? + +7. **Scheduler-Specific Configuration** (if using SLURM or PBS) + - Account/Username for compute time charges + - Partition/Queue to submit jobs to + - Email notifications? (address and when: START, END, FAIL, ALL) + - Job dependencies? (Recommended: Yes for linear workflow) + - Output log directory? (Default: `logs/`) + +8. **Alignment Trimming Preference** + - Aliscore/ALICUT (traditional, thorough), trimAl (fast), BMGE (entropy-based), or ClipKit (modern)? + +9. **Substitution Model Selection** (for IQ-TREE phylogenetic inference) + + **Context needed**: Taxonomic breadth, number of taxa, evolutionary rates + + **Action**: Fetch IQ-TREE model documentation and suggest appropriate amino acid substitution models based on dataset characteristics. + + Use the substitution model recommendation system (see "Substitution Model Recommendation" section below). + +10. **Educational Goals** + - Are you learning bioinformatics and would you like comprehensive explanations of each workflow step? + - If yes: After completing each major workflow stage, offer to explain what the step accomplishes, why certain choices were made, and what best practices are being followed. + - Store this preference to use throughout the workflow. 
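+
+To make the parallelization arithmetic in questions 5-6 concrete, here is a minimal sketch (not one of the bundled templates): it assumes a local machine with 64 total threads, GNU parallel, the unified `phylo` environment from STEP 0, the recommended directory layout, and `insecta_odb10` as the lineage. After the first genome has run alone, the remaining genomes run as 4 concurrent jobs × 16 threads each (4 × 16 ≤ 64):
+
+```bash
+conda activate phylo
+# Skip the first genome (already run alone to download the lineage database)
+tail -n +2 genome_list.txt | parallel -j 4 \
+  compleasm run -a 00_genomes/{} -l insecta_odb10 -t 16 -o 01_busco_results/{.}_compleasm
+```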
+ +--- + +## Recommended Directory Structure + +Organize analyses with dedicated folders for each pipeline step: + +``` +project_name/ +├── logs/ # All log files +├── 00_genomes/ # Input genome assemblies +├── 01_busco_results/ # BUSCO/compleasm outputs +├── 02_qc/ # Quality control reports +├── 03_extracted_orthologs/ # Extracted single-copy orthologs +├── 04_alignments/ # Multiple sequence alignments +├── 05_trimmed/ # Trimmed alignments +├── 06_concatenation/ # Supermatrix and partition files +├── 07_partition_search/ # Partition model selection +├── 08_concatenated_tree/ # Concatenated ML tree +├── 09_gene_trees/ # Individual gene trees +├── 10_species_tree/ # ASTRAL species tree +└── scripts/ # All analysis scripts +``` + +**Benefits**: Easy debugging, clear workflow progression, reproducibility, prevents root directory clutter. + +--- + +## Template System + +This skill uses a template-based system to reduce token usage and improve maintainability. Script templates are stored in the `templates/` directory and organized by computing environment. + +### How to Use Templates + +When generating scripts for users: + +1. **Read the appropriate template** for their computing environment: + ``` + Read("templates/slurm/02_compleasm_first.job") + ``` + +2. **Replace placeholders** with user-specific values: + - `TOTAL_THREADS` → e.g., `64` + - `THREADS_PER_JOB` → e.g., `16` + - `NUM_GENOMES` → e.g., `20` + - `NUM_LOCI` → e.g., `2795` + - `LINEAGE` → e.g., `insecta_odb10` + - `MODEL_SET` → e.g., `LG,WAG,JTT,Q.pfam` + +3. **Present the customized script** to the user with setup instructions + +### Available Templates + +Key templates by workflow step: +- **Step 0 (setup)**: Environment setup script in `references/REFERENCE.md` +- **Step 2 (compleasm)**: `02_compleasm_first`, `02_compleasm_parallel` +- **Step 8A (partition search)**: `08a_partition_search` +- **Step 8C (gene trees)**: `08c_gene_trees_array`, `08c_gene_trees_parallel`, `08c_gene_trees_serial` + +See `templates/README.md` for complete template documentation. + +--- + +## Substitution Model Recommendation + +When asked about substitution model selection (Question 9), use this systematic approach: + +### Step 1: Fetch IQ-TREE Documentation + +Use WebFetch to retrieve current model information: +``` +WebFetch(url="https://iqtree.github.io/doc/Substitution-Models", + prompt="Extract all amino acid substitution models with descriptions and usage guidelines") +``` + +### Step 2: Analyze Dataset Characteristics + +Consider these factors from user responses: +- **Taxonomic Scope**: Species/genus (shallow) vs. family/order (moderate) vs. class/phylum+ (deep) +- **Number of Taxa**: <20 (small), 20-50 (medium), >50 (large) +- **Evolutionary Rates**: Fast-evolving, moderate, or slow-evolving +- **Sequence Type**: Nuclear proteins, mitochondrial, or chloroplast + +### Step 3: Recommend Models + +Provide 3-5 appropriate models based on dataset characteristics. For detailed model recommendation matrices and taxonomically-targeted models, see `references/REFERENCE.md` section "Substitution Model Recommendation". + +**General recommendations**: +- **Nuclear proteins (most common)**: LG, WAG, JTT, Q.pfam +- **Mitochondrial**: mtREV, mtZOA, mtMAM, mtART, mtVer, mtInv +- **Chloroplast**: cpREV +- **Taxonomically-targeted**: Q.bird, Q.mammal, Q.insect, Q.plant, Q.yeast (when applicable) + +### Step 4: Present Recommendations + +Format recommendations with justifications and explain how models will be used in IQ-TREE steps 8A and 8C. 
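+
+As an illustration of how the recommended set is consumed downstream, a Step 8A invocation built from the templates looks roughly like the sketch below. Exact flags should be checked against the actual template and the IQ-TREE documentation; `-mset` is assumed to restrict ModelFinder to the chosen models, and the prefix is chosen to match the `partition_search.best_scheme.nex` file used in Step 8B:
+
+```bash
+# Sketch only: partition merging + model selection restricted to the stored model set
+iqtree -s FcC_supermatrix.fas -spp partition_def.txt \
+  -m MF+MERGE -mset LG,WAG,JTT,Q.pfam \
+  -nt 18 -safe -pre partition_search
+```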
+ +### Step 5: Store Model Set + +Store the final comma-separated model list (e.g., "LG,WAG,JTT,Q.pfam") for use in Step 8 template placeholders. + +--- + +## Workflow Implementation + +Once required information is gathered, guide the user through these steps. For each step, use templates where available and refer to `references/REFERENCE.md` for detailed implementation. + +### STEP 0: Environment Setup + +**ALWAYS start by generating a setup script** for the user's environment. + +Use the unified conda environment setup script from `references/REFERENCE.md` (Section: "Software Installation Guide"). This creates a single conda environment with all necessary tools: +- compleasm, MAFFT, trimming tools (trimAl, ClipKit, BMGE) +- IQ-TREE, ASTRAL, Perl with BioPerl, GNU parallel +- Downloads and installs Aliscore/ALICUT Perl scripts + +**Key points**: +- Users choose between mamba (faster) or conda +- Users choose between predownloaded Aliscore/ALICUT scripts (tested) or latest from GitHub +- All subsequent steps use `conda activate phylo` (the unified environment) + +See `references/REFERENCE.md` for the complete setup script template. + +--- + +### STEP 0A: Query NCBI for Assemblies (Optional) + +**Use this step when**: User wants to use NCBI data but doesn't have specific assembly accessions yet. + +This optional preliminary step helps users discover available genome assemblies by taxon name before proceeding with the main workflow. + +#### When to Offer This Step + +Offer this step when: +- User wants to analyze genomes from NCBI +- User doesn't have specific Assembly or BioProject accessions +- User mentions a taxonomic group (e.g., "I want to build a phylogeny for beetles") + +#### Workflow + +1. **Ask for focal taxon**: Request the taxonomic group of interest + - Examples: "Coleoptera", "Drosophila", "Apis mellifera" + - Can be at any taxonomic level (order, family, genus, species) + +2. **Query NCBI using the script**: Use `scripts/query_ncbi_assemblies.py` to search for assemblies + + ```bash + # Basic query (returns 20 results by default) + python scripts/query_ncbi_assemblies.py --taxon "Coleoptera" + + # Query with more results + python scripts/query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50 + + # Query for RefSeq assemblies only (higher quality, GCF_* accessions) + python scripts/query_ncbi_assemblies.py --taxon "Apis" --refseq-only + + # Save accessions to file for later download + python scripts/query_ncbi_assemblies.py --taxon "Coleoptera" --save assembly_accessions.txt + ``` + +3. **Present results to user**: The script displays: + - Assembly accession (GCA_* or GCF_*) + - Organism name + - Assembly level (Chromosome, Scaffold, Contig) + - Assembly name + +4. **Help user select assemblies**: Ask user which assemblies they want to include + - Consider assembly level (Chromosome > Scaffold > Contig) + - Consider phylogenetic breadth (species coverage) + - Consider data quality (RefSeq > GenBank when available) + +5. **Collect selected accessions**: Compile the list of chosen assembly accessions + +6. 
**Proceed to STEP 1**: Use the selected accessions with `download_ncbi_genomes.py` + +#### Tips for Assembly Selection + +- **Assembly Level**: Chromosome-level assemblies are most complete, followed by Scaffold, then Contig +- **RefSeq vs GenBank**: RefSeq (GCF_*) assemblies undergo additional curation; GenBank (GCA_*) are submitter-provided +- **Taxonomic Sampling**: For phylogenetics, aim for representative sampling across the taxonomic group +- **Quality over Quantity**: Better to have 20 high-quality assemblies than 100 poor-quality ones + +--- + +### STEP 1: Download NCBI Genomes (if applicable) + +If user provided NCBI accessions, use `scripts/download_ncbi_genomes.py`: + +**For BioProjects**: +```bash +python scripts/download_ncbi_genomes.py --bioprojects PRJNA12345 -o genomes.zip +unzip genomes.zip +``` + +**For Assembly Accessions**: +```bash +python scripts/download_ncbi_genomes.py --assemblies GCA_123456789.1 -o genomes.zip +unzip genomes.zip +``` + +**IMPORTANT**: After download, genomes must be renamed with meaningful sample names (format: `[ACCESSION]_[SPECIES_NAME]`). Sample names appear in final phylogenetic trees. + +Generate a script that: +1. Finds all downloaded FASTA files in ncbi_dataset directory structure +2. Moves/renames files to main genomes directory with meaningful names +3. Includes any local genome files +4. Creates final genome_list.txt with ALL genomes (local + downloaded) + +See `references/REFERENCE.md` section "Sample Naming Best Practices" for detailed guidelines. + +--- + +### STEP 2: Ortholog Identification with compleasm + +Activate the unified environment and run compleasm on all genomes to identify single-copy orthologs. + +**Key considerations**: +- First genome must run alone to download lineage database +- Remaining genomes can run in parallel +- Thread allocation: Miniprot scales well up to ~16-32 threads per genome + +**Threading guidelines**: See `references/REFERENCE.md` for recommended thread allocation table. + +**Generate scripts using templates**: +- **SLURM**: Read templates `02_compleasm_first.job` and `02_compleasm_parallel.job` +- **PBS**: Read templates `02_compleasm_first.job` and `02_compleasm_parallel.job` +- **Local**: Read templates `02_compleasm_first.sh` and `02_compleasm_parallel.sh` + +Replace placeholders: `TOTAL_THREADS`, `THREADS_PER_JOB`, `NUM_GENOMES`, `LINEAGE` + +For detailed implementation examples, see `references/REFERENCE.md` section "Ortholog Identification Implementation". + +--- + +### STEP 3: Quality Control + +After compleasm completes, generate QC report using `scripts/generate_qc_report.sh`: + +```bash +bash scripts/generate_qc_report.sh qc_report.csv +``` + +Provide interpretation: +- **>95% complete**: Excellent, retain +- **90-95% complete**: Good, retain +- **85-90% complete**: Acceptable, case-by-case +- **70-85% complete**: Questionable, consider excluding +- **<70% complete**: Poor, recommend excluding + +See `references/REFERENCE.md` section "Quality Control Guidelines" for detailed assessment criteria. + +--- + +### STEP 4: Ortholog Extraction + +Use `scripts/extract_orthologs.sh` to extract single-copy orthologs: + +```bash +bash scripts/extract_orthologs.sh LINEAGE_NAME +``` + +This generates per-locus unaligned FASTA files in `single_copy_orthologs/unaligned_aa/`. + +--- + +### STEP 5: Alignment with MAFFT + +Activate the unified environment (`conda activate phylo`) which contains MAFFT. 
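+
+Per locus, the generated scripts boil down to a single MAFFT call; a minimal sketch (file paths assume the extraction output and the recommended `04_alignments/` directory; `LOCUS` is a placeholder):
+
+```bash
+# MAFFT writes the alignment to stdout; one thread per locus suits array jobs
+mafft --auto --thread 1 single_copy_orthologs/unaligned_aa/LOCUS.fas \
+  > 04_alignments/LOCUS_aligned.fas
+```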
+ +Create locus list, then generate alignment scripts: +```bash +cd single_copy_orthologs/unaligned_aa +ls *.fas > locus_names.txt +num_loci=$(wc -l < locus_names.txt) +``` + +**Generate scheduler-specific scripts**: +- **SLURM/PBS**: Array job with one task per locus +- **Local**: Sequential processing or GNU parallel + +For detailed script templates, see `references/REFERENCE.md` section "Alignment Implementation". + +--- + +### STEP 6: Alignment Trimming + +Based on user's preference, provide appropriate trimming method. All tools are available in the unified conda environment. + +**Options**: +- **trimAl**: Fast (`-automated1`), recommended for large datasets +- **ClipKit**: Modern, fast (default smart-gap mode) +- **BMGE**: Entropy-based (`-t AA`) +- **Aliscore/ALICUT**: Traditional, thorough (recommended for phylogenomics) + +**For Aliscore/ALICUT**: +- Perl scripts were installed in STEP 0 +- Use `scripts/run_aliscore_alicut_batch.sh` for batch processing +- Or use array jobs with `scripts/run_aliscore.sh` and `scripts/run_alicut.sh` +- Always use `-N` flag for amino acid sequences + +**Generate scripts** using scheduler-appropriate templates (array jobs for SLURM/PBS, parallel or serial for local). + +For detailed implementation of each trimming method, see `references/REFERENCE.md` section "Alignment Trimming Implementation". + +--- + +### STEP 7: Concatenation and Partition Definition + +Download FASconCAT-G (Perl script) and run concatenation: + +```bash +conda activate phylo # Has Perl installed +wget https://raw.githubusercontent.com/PatrickKueck/FASconCAT-G/master/FASconCAT-G_v1.06.1.pl -O FASconCAT-G.pl +chmod +x FASconCAT-G.pl + +cd trimmed_aa +perl ../FASconCAT-G.pl -s -i +``` + +Convert to IQ-TREE format using `scripts/convert_fasconcat_to_partition.py`: +```bash +python ../scripts/convert_fasconcat_to_partition.py FcC_info.xls partition_def.txt +``` + +Outputs: `FcC_supermatrix.fas`, `FcC_info.xls`, `partition_def.txt` + +--- + +### STEP 8: Phylogenetic Inference + +IQ-TREE is already installed in the unified environment. Activate with `conda activate phylo`. + +#### Part 8A: Partition Model Selection + +Use the substitution models selected during initial setup (Question 9). + +**Generate script using templates**: +- Read appropriate template: `templates/[slurm|pbs|local]/08a_partition_search.[job|sh]` +- Replace `MODEL_SET` placeholder with user's selected models (e.g., "LG,WAG,JTT,Q.pfam") + +For detailed implementation, see `references/REFERENCE.md` section "Partition Model Selection Implementation". + +#### Part 8B: Concatenated ML Tree + +Run IQ-TREE using the best partition scheme from Part 8A: + +```bash +iqtree -s FcC_supermatrix.fas -spp partition_search.best_scheme.nex \ + -nt 18 -safe -pre concatenated_ML_tree -bb 1000 -bnni +``` + +Output: `concatenated_ML_tree.treefile` + +#### Part 8C: Individual Gene Trees + +Estimate gene trees for coalescent-based species tree inference. + +**Generate scripts using templates**: +- **SLURM/PBS**: Read `08c_gene_trees_array.job` template +- **Local**: Read `08c_gene_trees_parallel.sh` or `08c_gene_trees_serial.sh` template +- Replace `NUM_LOCI` placeholder + +For detailed implementation, see `references/REFERENCE.md` section "Gene Trees Implementation". + +#### Part 8D: ASTRAL Species Tree + +ASTRAL is already installed in the unified conda environment. 
+ +```bash +conda activate phylo + +# Concatenate all gene trees +cat trimmed_aa/*.treefile > all_gene_trees.tre + +# Run ASTRAL +astral -i all_gene_trees.tre -o astral_species_tree.tre +``` + +Output: `astral_species_tree.tre` + +--- + +### STEP 9: Generate Methods Paragraph + +**ALWAYS generate a methods paragraph** to help users write their publication methods section. + +Create `METHODS_PARAGRAPH.md` file with: +- Customized text based on tools and parameters used +- Complete citations for all software +- Placeholders for user-specific values (genome count, loci count, thresholds) +- Instructions for adapting to journal requirements + +For the complete methods paragraph template, see `references/REFERENCE.md` section "Methods Paragraph Template". + +Pre-fill known values when possible: +- Number of genomes +- BUSCO lineage +- Trimming method used +- Substitution models tested + +--- + +## Final Outputs Summary + +Provide users with a summary of outputs: + +**Phylogenetic Results**: +1. `concatenated_ML_tree.treefile` - ML tree from concatenated supermatrix +2. `astral_species_tree.tre` - Coalescent species tree +3. `*.treefile` - Individual gene trees + +**Data and Quality Control**: +4. `qc_report.csv` - Genome quality statistics +5. `FcC_supermatrix.fas` - Concatenated alignment +6. `partition_search.best_scheme.nex` - Selected partitioning scheme + +**Publication Materials**: +7. `METHODS_PARAGRAPH.md` - Ready-to-use methods section with citations + +**Visualization tools**: FigTree, iTOL, ggtree (R), ete3/toytree (Python) + +--- + +## Script Validation + +**ALWAYS perform validation checks** after generating scripts but before presenting them to the user. This ensures script accuracy, consistency, and proper resource allocation. + +### Validation Workflow + +For each generated script, perform these validation checks in order: + +#### 1. Program Option Verification + +**Purpose**: Detect hallucinated or incorrect command-line options that may cause scripts to fail. + +**Procedure**: +1. **Extract all command invocations** from the generated script (e.g., `compleasm run`, `iqtree -s`, `mafft --auto`) +2. **Compare against reference sources**: + - First check: Compare against corresponding template in `templates/` directory + - Second check: Compare against examples in `references/REFERENCE.md` + - Third check: If options differ significantly or are uncertain, perform web search for official documentation +3. **Common tools to validate**: + - `compleasm run` - Check `-a`, `-o`, `-l`, `-t` options + - `iqtree` - Verify `-s`, `-p`, `-m`, `-bb`, `-alrt`, `-nt`, `-safe` options + - `mafft` - Check `--auto`, `--thread`, `--reorder` options + - `astral` - Verify `-i`, `-o` options + - Trimming tools (`trimal`, `clipkit`, `BMGE.jar`) - Validate options + +**Action on issues**: +- If incorrect options found: Inform user of the issue and ask if they want you to correct it +- If uncertain: Ask user to verify with tool documentation before proceeding + +#### 2. Pipeline Continuity Verification + +**Purpose**: Ensure outputs from one step correctly feed into inputs of subsequent steps. + +**Procedure**: +1. 
**Map input/output relationships**:
+   - Step 2 output (`01_busco_results/*_compleasm/`) → Step 3 input (QC script)
+   - Step 4 output (`single_copy_orthologs/`) → Step 5 input (MAFFT)
+   - Step 5 output (`04_alignments/*.fas`) → Step 6 input (trimming)
+   - Step 6 output (`05_trimmed/*.fas`) → Step 7 input (FASconCAT-G)
+   - Step 7 output (`FcC_supermatrix.fas`, partition file) → Step 8A input (IQ-TREE)
+   - Step 8C output (`*.treefile`) → Step 8D input (ASTRAL)
+
+2. **Check for consistency**:
+   - File path references match across scripts
+   - Directory structure follows recommended layout
+   - Glob patterns correctly match expected files
+   - Required intermediate files are generated before being used
+
+**Action on issues**:
+- If path mismatches found: Inform user and ask if they want you to correct them
+- If directory structure inconsistent: Suggest corrections aligned with recommended structure
+
+#### 3. Resource Compatibility Check
+
+**Purpose**: Ensure allocated computational resources are appropriate for the task.
+
+**Procedure**:
+1. **Verify resource allocations** against recommendations in `references/REFERENCE.md`:
+   - **Memory allocation**: Check if memory per CPU (typically 6GB for compleasm, 2-4GB for others) is adequate
+   - **Thread allocation**: Verify thread counts are reasonable for the number of genomes/loci
+   - **Walltime**: Ensure walltime is sufficient based on dataset size guidelines
+   - **Parallelization**: Check that threads per job × concurrent jobs ≤ total threads
+
+2. **Common issues to check**:
+   - Compleasm: First job needs full thread allocation (downloads database)
+   - IQ-TREE: `-nt` should match allocated CPUs
+   - Gene trees: Ensure enough threads per tree × concurrent trees ≤ total available
+   - Memory: Concatenated tree inference may need 8-16GB per CPU for large datasets
+
+3. **Validate against user-specified constraints**:
+   - Total CPUs specified by user
+   - Available memory per node
+   - Maximum walltime limits
+   - Scheduler-specific limits (if mentioned)
+
+**Action on issues**:
+- If resource allocation issues found: Inform user and suggest corrections with justification
+- If uncertain about adequacy: Ask user about typical job performance in their environment
+
+### Validation Reporting
+
+After completing all validation checks:
+
+1. **If all checks pass**: Inform user briefly: "Scripts validated successfully - options, pipeline flow, and resources verified."
+
+2. **If issues found**: Present a structured report:
+   ```
+   **Validation Results**
+
+   ⚠️ Issues found during validation:
+
+   1. [Issue category]: [Description]
+      - Current: [What was generated]
+      - Suggested: [Recommended fix]
+      - Reason: [Why this is an issue]
+
+   Would you like me to apply these corrections?
+   ```
+
+3. **Always ask before correcting**: Never silently fix issues - always get user confirmation before applying changes.
+
+4. **Document corrections**: If corrections are applied, explain what was changed and why.
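+
+A lightweight way to enforce pipeline continuity at run time is a pre-flight check at the top of each generated script; a minimal sketch for STEP 8A, assuming the STEP 7 outputs sit in the current working directory:
+
+```bash
+# Fail fast if STEP 7 outputs are missing or empty before launching IQ-TREE
+for f in FcC_supermatrix.fas partition_def.txt; do
+  [ -s "$f" ] || { echo "ERROR: required input '$f' is missing or empty" >&2; exit 1; }
+done
+```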
+ +--- + +## Communication Guidelines + +- **Always start with STEP 0**: Generate the unified environment setup script +- **Always end with STEP 9**: Generate the customized methods paragraph +- **Always validate scripts**: Perform validation checks before presenting scripts to users +- **Use unified environment by default**: All scripts should use `conda activate phylo` +- **Always ask about CPU allocation**: Never auto-detect cores, always ask user +- **Recommend optimized workflows**: For users with adequate resources, recommend optimized parallel approaches over simple serial approaches +- **Be clear and pedagogical**: Explain why each step is necessary +- **Provide educational explanations when requested**: If user answered yes to educational goals (question 10): + - After completing each major workflow stage, ask: "Would you like me to explain this step?" + - If yes, provide moderate-length explanation (1-2 paragraphs) covering: + - What the step accomplishes biologically and computationally + - Significant choices made and their rationale + - Best practices being followed in the workflow + - Examples of "major workflow stages": STEP 0 (setup), STEP 1 (download), STEP 2 (BUSCO), STEP 3 (QC), STEP 5 (alignment), STEP 6 (trimming), STEP 7 (concatenation), STEP 8 (phylogenetic inference) +- **Provide complete, ready-to-run scripts**: Users should copy-paste and run +- **Adapt to user's environment**: Always generate scheduler-specific scripts +- **Reference supporting files**: Direct users to `references/REFERENCE.md` for details +- **Use helper scripts**: Leverage provided scripts in `scripts/` directory +- **Include error checking**: Add file existence checks and informative error messages +- **Be encouraging**: Phylogenomics is complex; maintain supportive tone + +--- + +## Important Notes + +### Mandatory Steps +1. **STEP 0 is mandatory**: Always generate the environment setup script first +2. **STEP 9 is mandatory**: Always generate the methods paragraph file at the end + +### Template Usage (IMPORTANT!) +3. **Prefer templates over inline code**: Use `templates/` directory for major scripts +4. **Template workflow**: + - Read: `Read("templates/slurm/02_compleasm_first.job")` + - Replace placeholders: `TOTAL_THREADS`, `LINEAGE`, `NUM_GENOMES`, `MODEL_SET`, etc. + - Present customized script to user +5. **Available templates**: See `templates/README.md` for complete list +6. **Benefits**: Reduces token usage, easier maintenance, consistent structure + +### Script Generation +7. **Always adapt scripts** to user's scheduler (SLURM/PBS/local) +8. **Replace all placeholders** before presenting scripts +9. **Never auto-detect CPU cores**: Always ask user to specify +10. **Provide parallelization options**: For each parallelizable step, offer array job, parallel, and serial options +11. **Scheduler-specific configuration**: For SLURM/PBS, always ask about account, partition, email, etc. + +### Parallelization Strategy +12. **Ask about preferences**: Let user choose between throughput optimization vs. simplicity +13. **Compleasm optimization**: For ≥2 genomes and ≥16 cores, recommend two-phase approach +14. **Use threading guidelines**: Refer to `references/REFERENCE.md` for thread allocation recommendations +15. **Parallelizable steps**: Steps 2 (compleasm), 5 (MAFFT), 6 (trimming), 8C (gene trees) + +### Substitution Model Selection +16. **Always recommend models**: Use the systematic model recommendation process +17. 
**Fetch current documentation**: Use WebFetch to get IQ-TREE model information
+18. **Replace MODEL_SET placeholder**: In Step 8A templates with comma-separated list
+19. **Taxonomically-targeted models**: Suggest Q.bird, Q.mammal, Q.insect, Q.plant when applicable
+
+### Reference Material
+20. **Direct users to references/REFERENCE.md** for:
+    - Detailed implementation guides
+    - BUSCO lineage datasets (complete list)
+    - Resource recommendations (memory, CPUs, walltime tables)
+    - Sample naming best practices
+    - Quality control assessment criteria
+    - Aliscore/ALICUT detailed guide and parameters
+    - Tool citations with DOIs
+    - Software installation instructions
+    - Common issues and troubleshooting
+
+---
+
+## Attribution
+
+This skill was created by **Bruno de Medeiros** (Curator of Pollinating Insects, Field Museum) based on phylogenomics tutorials by **Paul Frandsen** (Brigham Young University).
+
+## Workflow Entry Point
+
+When a user requests phylogeny generation:
+
+1. Gather required information using the "Initial User Questions" section
+2. Generate STEP 0 setup script from `references/REFERENCE.md`
+3. If user needs help finding NCBI assemblies, perform STEP 0A using `query_ncbi_assemblies.py`
+4. Proceed step-by-step through workflow (STEPS 1-8), using templates and referring to `references/REFERENCE.md` for detailed implementation
+5. All workflow scripts should use the unified conda environment (`conda activate phylo`)
+6. Validate all generated scripts before presenting to user (see "Script Validation" section)
+7. Generate STEP 9 methods paragraph from template in `references/REFERENCE.md`
+8. Provide final outputs summary
diff --git a/skills/phylo_from_buscos/references/REFERENCE.md b/skills/phylo_from_buscos/references/REFERENCE.md
new file mode 100644
index 0000000..01aa5d2
--- /dev/null
+++ b/skills/phylo_from_buscos/references/REFERENCE.md
@@ -0,0 +1,2225 @@
+# BUSCO-based Phylogenomics - Technical Reference
+
+Detailed technical reference for implementing phylogenomic workflows.
+
+## Table of Contents
+
+1. [Sample Naming Best Practices](#sample-naming-best-practices)
+2. [BUSCO Lineage Datasets](#busco-lineage-datasets)
+3. [Resource Recommendations](#resource-recommendations)
+4. [Template Job Scripts](#template-job-scripts)
+5. [Common Issues](#common-issues)
+6. [Quality Control Guidelines](#quality-control-guidelines)
+7. [Aliscore/ALICUT: Detailed Guide](#aliscorealicut-detailed-guide)
+8. [Tool Citations](#tool-citations)
+9. [Software Installation Guide](#software-installation-guide)
+
+---
+
+## Sample Naming Best Practices
+
+**Sample names appear in your final phylogenetic trees**, so choose them carefully!
+
+### Recommended Format
+
+**`[ACCESSION]_[SPECIES_NAME]`**
+
+Examples:
+- `GCA000001735_Arabidopsis_thaliana`
+- `GCF009858895_Apis_mellifera`
+- `PRJNA12345_Drosophila_melanogaster_strain_w1118`
+
+### Why This Format?
+
+1. **Accession first** = Easy to trace back to original data
+2. **Species name** = Readable in phylogenetic trees
+3. **Underscore-separated** = Compatible with all phylogenetics software
+4.
**No spaces or special characters** = Prevents parsing errors + +### Rules for Sample Names + +**DO:** +- Use only letters, numbers, underscores, and hyphens +- Keep names reasonably short (<50 characters) +- Be consistent across your dataset +- Include strain/population info if relevant (e.g., `GCA123_Species_name_pop1`) + +**DON'T:** +- Use spaces (use underscores instead) +- Use special characters: `()[]{}|<>@#$%^&*+=;:'",./\` +- Start with numbers (some tools don't like this) +- Use periods except for version numbers +- Make names too cryptic (will appear in publications!) + +### Using the Rename Helper Script + +The `scripts/rename_genomes.py` helper can assist with renaming: + +```bash +# Create a template mapping file +python scripts/rename_genomes.py --create-template *.fasta > samples.tsv + +# Edit samples.tsv to add meaningful names: +# GCA_000001735.2.fasta GCA000001735_Arabidopsis_thaliana +# GCF_009858895.2.fasta GCF009858895_Apis_mellifera + +# Apply the mapping (with backup) +python scripts/rename_genomes.py --mapping samples.tsv + +# Or use interactive mode +python scripts/rename_genomes.py --interactive *.fasta +``` + +### For NCBI Downloaded Genomes + +When downloading from NCBI, genome files are typically in subdirectories like: +``` +ncbi_dataset/data/GCA_000001735.2/GCA_000001735.2_genomic.fna +``` + +You'll need to: +1. Extract assembly accessions and organism names +2. Create meaningful sample names +3. Copy and rename files to working directory + +Example workflow: +```bash +# List assemblies with organism names +for dir in ncbi_dataset/data/GCA_*; do + acc=$(basename $dir) + # Extract organism name from metadata + echo "$acc" +done + +# Create mapping file manually or with download_ncbi_genomes.py --list-only +``` + +--- + +## BUSCO Lineage Datasets + +### General Lineages + +- `eukaryota_odb10` - All eukaryotes (255 BUSCOs) +- `bacteria_odb10` - All bacteria (124 BUSCOs) +- `archaea_odb10` - All archaea (194 BUSCOs) + +### Eukaryotic Kingdoms + +- `metazoa_odb10` - Animals (954 BUSCOs) +- `viridiplantae_odb10` - Green plants (425 BUSCOs) +- `fungi_odb10` - Fungi (758 BUSCOs) + +### Animals (Metazoa) + +- `arthropoda_odb10` - Arthropods (1013 BUSCOs) + - `insecta_odb10` - Insects (1367 BUSCOs) + - `diptera_odb10` - Flies (3285 BUSCOs) + - `hymenoptera_odb10` - Bees, wasps, ants (5991 BUSCOs) + - `lepidoptera_odb10` - Moths, butterflies (5286 BUSCOs) + - `arachnida_odb10` - Spiders, mites (2934 BUSCOs) +- `vertebrata_odb10` - Vertebrates (3354 BUSCOs) + - `actinopterygii_odb10` - Ray-finned fish (3640 BUSCOs) + - `mammalia_odb10` - Mammals (9226 BUSCOs) + - `aves_odb10` - Birds (8338 BUSCOs) +- `mollusca_odb10` - Molluscs (5295 BUSCOs) +- `nematoda_odb10` - Roundworms (3131 BUSCOs) + +### Plants (Viridiplantae) + +- `eudicots_odb10` - Eudicots (2326 BUSCOs) +- `liliopsida_odb10` - Monocots (3278 BUSCOs) +- `embryophyta_odb10` - Land plants (1614 BUSCOs) + +### Fungi + +- `ascomycota_odb10` - Ascomycetes (1706 BUSCOs) +- `basidiomycota_odb10` - Basidiomycetes (1335 BUSCOs) + +*For complete list, see: https://busco-data.ezlab.org/v5/data/lineages/* + +--- + +## Resource Recommendations + +### SLURM/PBS Job Resource Allocations + +| Step | CPUs | RAM per CPU | Total RAM | Walltime | Notes | +|------|------|-------------|-----------|----------|-------| +| compleasm | 4 | 6 GB | 24 GB | 24h | Increase to 8-10 GB for large genomes (>2 Gbp) | +| MAFFT (per locus) | 1 | 4 GB | 4 GB | 24h | Can run as large array job | +| Aliscore | 1 | 4 GB | 4 GB | 24h | Array job | +| 
trimAl | 1 | 2 GB | 2 GB | 2h | Very fast | +| BMGE | 1 | 2 GB | 2 GB | 4h | Moderate speed | +| ClipKit | 1 | 2 GB | 2 GB | 2h | Very fast | +| IQ-TREE (gene) | 1 | 4 GB | 4 GB | 2h | Array job for all loci | +| IQ-TREE (concat) | 18-32 | 4 GB | 72-128 GB | 72h | Main phylogeny job | +| ASTRAL | 1 | 8 GB | 8 GB | <1h | Usually very fast | + +### Scaling Guidelines + +**Small dataset** (<20 genomes, <1000 loci): +- Can run on local machine +- Expect ~2-5 days total runtime + +**Medium dataset** (20-50 genomes, 1000-3000 loci): +- Cluster recommended +- Expect ~3-7 days with parallelization + +**Large dataset** (>50 genomes, >3000 loci): +- Cluster required +- Expect 1-2 weeks with good parallelization + +--- + +## Template Job Scripts + +### SLURM Array Template + +```bash +#!/bin/bash +#SBATCH --job-name=JOB_NAME +#SBATCH --array=1-N +#SBATCH --cpus-per-task=NCPUS +#SBATCH --mem-per-cpu=MEMORY +#SBATCH --time=WALLTIME +#SBATCH --output=logs/%A_%a.JOBNAME.out +#SBATCH --error=logs/%A_%a.JOBNAME.err +#SBATCH --mail-type=FAIL,END +#SBATCH --mail-user=YOUR_EMAIL + +source ~/.bashrc +conda activate ENV_NAME + +# Parse input file +input=$(sed -n "${SLURM_ARRAY_TASK_ID}p" input_list.txt) + +# Run command +COMMAND ${input} +``` + +### PBS Array Template + +```bash +#!/bin/bash +#PBS -N JOB_NAME +#PBS -t 1-N +#PBS -l nodes=1:ppn=NCPUS +#PBS -l mem=MEMORY +#PBS -l walltime=WALLTIME +#PBS -j oe +#PBS -m abe +#PBS -M YOUR_EMAIL + +cd $PBS_O_WORKDIR +source ~/.bashrc +conda activate ENV_NAME + +# Parse input file +input=$(sed -n "${PBS_ARRAYID}p" input_list.txt) + +# Run command +COMMAND ${input} +``` + +### Local Sequential Template + +```bash +#!/bin/bash +# Sequential execution for local machine + +source ~/.bashrc +conda activate ENV_NAME + +while read input; do + echo "Processing ${input}..." + COMMAND ${input} +done < input_list.txt + +echo "All jobs complete" +``` + +--- + +## Common Issues + +### Problem: compleasm runs out of memory + +**Solution:** +- Increase `--mem-per-cpu` to 8 GB or 10 GB +- Some large/complex genomes need more RAM + +### Problem: IQ-TREE stalls or runs extremely slowly + +**Solution:** +- Add `-safe` flag (enables safe numerical mode, slower but more stable) +- Reduce number of threads if on shared system +- Check for very long branches or problematic sequences + +### Problem: Array job exceeds cluster limits + +**Solution:** +- Split into batches (e.g., if limit is 1000, run arrays 1-1000, 1001-2000, etc.) +- Example: `#SBATCH --array=1-1000%50` (runs 1000 jobs, max 50 concurrent) + +### Problem: Missing orthologs in some genomes + +**Solution:** +- This is normal and expected +- FASconCAT and IQ-TREE handle missing data automatically +- If >20% orthologs missing, consider genome quality issues + +### Problem: Alignment looks poor/misaligned + +**Solution:** +- Visualize with AliView or Jalview +- MAFFT L-INS-i is accurate but slow; for very divergent sequences, try E-INS-i +- Consider stricter trimming parameters +- Very divergent sequences may not be suitable for phylogenomics + +### Problem: Gene trees conflict with concatenation tree + +**Solution:** +- This is common and expected (incomplete lineage sorting, gene flow) +- ASTRAL species tree accounts for discordance +- Compare both trees and branch support values +- Look for systematic vs. 
random conflicts + +### Problem: Low bootstrap/posterior support values + +**Solution:** +- Check alignment quality +- Try more stringent trimming +- Evaluate locus informativeness (some may be uninformative) +- Consider rapid diversification or conflicting signal +- More data doesn't always help if signal quality is poor + +--- + +## Quality Control Guidelines + +### Genome Completeness Assessment + +**Excellent** (>95% complete BUSCOs): +- Highly complete genomes +- Retain for phylogenomics +- Expected to contribute many orthologs + +**Good** (90-95% complete): +- Generally acceptable +- May be missing some loci +- Retain unless other quality concerns + +**Acceptable** (85-90% complete): +- Marginal quality +- Will have more missing orthologs +- Consider case-by-case based on biological importance + +**Questionable** (70-85% complete): +- Poor completeness +- May introduce noise +- Recommend excluding unless scientifically critical + +**Poor** (<70% complete): +- Very incomplete +- Strong recommend to exclude +- Likely contaminated, fragmented, or poor assembly + +### Fragmentation and Duplication + +**Fragmented BUSCOs:** +- <5%: Excellent +- 5-10%: Good +- >10%: Indicates assembly fragmentation issues + +**Duplicated BUSCOs:** +- <2%: Excellent +- 2-5%: Good (may indicate recent WGD or heterozygosity) +- >10%: Likely contamination or assembly issues + +--- + +## Aliscore/ALICUT: Detailed Guide + +### What is Aliscore/ALICUT? + +**Aliscore** (Alignment Sequence Conservancy Checker) uses Monte Carlo resampling to identify randomly similar sequence (RSS) sections in multiple sequence alignments. These are regions where observed similarity is not significantly different from random expectation, which can mislead phylogenetic inference. + +**ALICUT** (Alignment Cutter) removes the RSS positions identified by Aliscore, producing trimmed alignments suitable for phylogenetic analysis. + +### When to Use Aliscore/ALICUT + +**Recommended for:** +- Phylogenomic datasets (hundreds to thousands of loci) +- Amino acid alignments from single-copy orthologs +- Mixed-quality alignments with variable conservation +- Published phylogenomics studies (widely used and cited) + +**Consider alternatives for:** +- Very short alignments (<100 positions) - not enough data for statistics +- Perfectly conserved sequences - no trimming needed +- Time-sensitive analyses - Aliscore can be slow for very long alignments + +### Key Parameters Explained + +#### Window Size (`-w`) + +Controls the sliding window used for scoring alignment regions. + +- **Default: 4** (recommended for most analyses) +- **Smaller (3):** More sensitive, may over-trim well-aligned regions (Type I error) +- **Larger (6-8):** Less sensitive to short RSS sections, more conservative + +**Recommendation:** Use default `-w 4` unless you have specific concerns about over-trimming. + +#### Random Pairs (`-r`) + +Number of pairwise sequence comparisons to perform. + +- **Default: 4×N** (where N = number of taxa) +- **Higher values:** More comprehensive but slower; diminishing returns beyond 4×N +- **Lower values:** Faster but less reliable statistics + +**Recommendation:** Use default for most analyses; increase for small datasets (<10 taxa). + +#### Gap Treatment (`-N`) + +Controls how alignment gaps (indels) are interpreted. 
+ +- **Without `-N`:** Gaps treated as 5th character state (informative) + - Use for: Well-aligned conserved proteins where indels are rare + - Effect: Conserves long indel regions present in most taxa + +- **With `-N`:** Gaps treated as ambiguous/missing data + - **Use for: Amino acid sequences** (recommended) + - Effect: More stringent; removes poorly aligned gappy regions + +**Recommendation:** **Always use `-N` for amino acid data** from BUSCO/compleasm orthologs. + +#### Tree-Guided Mode (`-t`) + +Use a phylogenetic tree to guide comparisons (compares sister taxa first). + +- **Advantages:** More phylogenetically informed, can be faster +- **Disadvantages:** Requires pre-existing tree, assumes tree is accurate +- **When to use:** Large datasets (>100 taxa) where random sampling is slow + +**Recommendation:** Use random mode (default) for initial analyses; tree-guided mode for refinement. + +### Understanding Aliscore Output + +Each Aliscore run generates three files: + +1. **`[alignment]_List_random.txt`** + - Space-separated list of RSS position numbers + - Input file for ALICUT + - Empty file = no RSS detected (alignment is clean) + +2. **`[alignment]_Profile_random.txt`** + - Three columns: Position, Positive_Score, Negative_Score + - Shows quality profile across alignment + - Negative values indicate RSS positions + +3. **`[alignment].svg`** + - Visual plot of scoring profiles + - Y-axis: Score (positive = conserved, negative = random) + - X-axis: Alignment position + - Useful for manual inspection + +### Interpreting Trimming Results + +After running Aliscore + ALICUT, evaluate trimming statistics: + +#### Positions Removed + +- **<10%:** Excellent alignment quality, minimal trimming needed +- **10-20%:** Good alignment quality, reasonable trimming +- **20-35%:** Moderate quality, substantial but acceptable trimming +- **35-50%:** Poor alignment quality, consider manual inspection +- **>50%:** Very poor alignment, **consider excluding entire locus** + +#### What to Check + +```bash +# View summary statistics +cat trimmed_aa/trimming_summary.txt + +# Check specific locus +cd aliscore_output/aliscore_[locus] +cat ALICUT_info.xls +``` + +#### Red Flags + +- **Uniformly high trimming across all loci:** Check alignment quality (MAFFT parameters) +- **One locus with >50% trimmed:** Likely paralogous or contamination +- **No RSS detected for most loci:** Sequences may be too similar (recent divergence) + +### ALICUT Options + +#### `-r` (Remain Stems) + +For RNA secondary structure alignments, preserves paired stem positions. + +```bash +bash scripts/run_alicut.sh aliscore_output/aliscore_16S/ -r -s +``` + +**When to use:** rRNA genes (16S, 18S, 28S) with structure annotation + +#### `-c` (Remove Codon) + +Translates amino acid RSS positions to nucleotide triplets (back-translation). + +```bash +# After running Aliscore on protein alignment +bash scripts/run_alicut.sh aliscore_output/aliscore_protein/ -c -s +``` + +**When to use:** +- Analyzed protein alignment, want to trim corresponding nucleotide alignment +- Requires both protein and nucleotide files with identical names + +#### `-3` (Remove 3rd Position) + +Removes only 3rd codon positions of identified RSS. 
+ +```bash +bash scripts/run_alicut.sh aliscore_output/aliscore_protein/ -c -3 -s +``` + +**When to use:** +- Want to exclude fast-evolving 3rd codon positions +- Can combine with `-c` option + +### Workflow Integration + +#### Typical Usage (Batch Mode) + +```bash +# Process all aligned amino acids through Aliscore + ALICUT +bash scripts/run_aliscore_alicut_batch.sh aligned_aa/ -N -o trimmed_aa +``` + +This is the **recommended approach** for most phylogenomic analyses. + +#### Array Job Mode (HPC) + +For large datasets on compute clusters: + +```bash +# Step 1: Aliscore array job +cd aligned_aa +ls *.fas > locus_list.txt +# Submit array job (see SKILL.md for templates) + +# Step 2: After Aliscore completes, batch process ALICUT +for dir in aliscore_output/aliscore_*/; do + bash ../scripts/run_alicut.sh "${dir}" -s +done +``` + +### Quality Control After Trimming + +#### Check Alignment Lengths + +```bash +# Summary statistics +awk 'NR>1 {sum+=$3; count++} END {print "Mean trimmed length:", sum/count}' \ + trimmed_aa/trimming_summary.txt + +# Find very short alignments +awk 'NR>1 && $3<100 {print $1, $3}' trimmed_aa/trimming_summary.txt +``` + +**Minimum recommended:** 100 amino acids after trimming + +#### Visual Inspection + +For critical loci, view the SVG plots: + +```bash +# Open in browser +firefox aliscore_output/aliscore_[locus]/*svg +``` + +Look for: +- Large contiguous RSS regions → May indicate paralogy +- RSS at alignment ends → Common and acceptable +- Scattered RSS throughout → Normal for divergent sequences + +### Troubleshooting + +#### Aliscore Errors + +**"Can't locate Aliscore_module.pm"** +- Download both `Aliscore.02.2.pl` and `Aliscore_module.pm` +- Keep in same directory + +**"taxon names of tree and sequence files do not match"** +- Tree mode: ensure tree tip labels exactly match FASTA headers +- Solution: Use random mode instead (`-r` option) + +**"Sequence length too short"** +- Alignment has fewer positions than RSS identified +- Usually indicates corrupted input file + +#### ALICUT Errors + +**"Can not find List file"** +- Aliscore didn't complete successfully +- Check Aliscore logs for errors + +**"File [alignment] is empty"** +- Aliscore didn't copy alignment file +- Run from correct directory + +**All positions removed** +- Extremely poor alignment quality +- Exclude this locus from analysis + +### Performance Considerations + +**Memory:** Aliscore typically uses 1-2 GB per alignment + +**Runtime:** +- Small alignments (10 taxa, 1000 positions): 1-5 minutes +- Medium (50 taxa, 2000 positions): 10-30 minutes +- Large (100 taxa, 3000 positions): 30-60 minutes + +**Parallelization:** +- Aliscore itself is single-threaded +- Parallelize across loci using array jobs +- Typical dataset: 1000-2000 loci × 20 minutes = use array jobs + +### Comparison with Other Trimmers + +| Tool | Method | Speed | Stringency | Best For | +|------|--------|-------|------------|----------| +| **Aliscore/ALICUT** | Monte Carlo RSS detection | Slow | Moderate-High | Phylogenomics (gold standard) | +| **trimAl** | Gap/conservation thresholds | Very fast | Customizable | Large datasets, quick analyses | +| **BMGE** | Entropy-based | Fast | Moderate | Balanced speed/quality | +| **ClipKit** | Parsimony-informative sites | Very fast | Low-Moderate | Maximum data retention | + +**Recommendation:** Use Aliscore/ALICUT for final publication-quality analyses; trimAl for initial exploratory work. + +--- + +## Tool Citations + +### Required Citations + +**compleasm:** +Huang, N., & Li, H. 
(2023). compleasm: a faster and more accurate reimplementation of BUSCO. *Bioinformatics*, 39(10), btad595. +https://doi.org/10.1093/bioinformatics/btad595 + +**BUSCO (if used instead of compleasm):** +Manni, M., Berkeley, M. R., Seppey, M., Simão, F. A., & Zdobnov, E. M. (2021). BUSCO update: novel and streamlined workflows along with broader and deeper phylogenetic coverage for scoring of eukaryotic, prokaryotic, and viral genomes. *Molecular Biology and Evolution*, 38(10), 4647-4654. + +**MAFFT:** +Katoh, K., & Standley, D. M. (2013). MAFFT multiple sequence alignment software version 7: improvements in performance and usability. *Molecular Biology and Evolution*, 30(4), 772-780. + +**IQ-TREE:** +Minh, B. Q., Schmidt, H. A., Chernomor, O., Schrempf, D., Woodhams, M. D., von Haeseler, A., & Lanfear, R. (2020). IQ-TREE 2: new models and efficient methods for phylogenetic inference in the genomic era. *Molecular Biology and Evolution*, 37(5), 1530-1534. + +**ASTRAL:** +Zhang, C., Rabiee, M., Sayyari, E., & Mirarab, S. (2018). ASTRAL-III: polynomial time species tree reconstruction from partially resolved gene trees. *BMC Bioinformatics*, 19(6), 153. + +### Trimming Tool Citations + +**Aliscore/ALICUT:** +Kück, P., Meusemann, K., Dambach, J., Thormann, B., von Reumont, B. M., Wägele, J. W., & Misof, B. (2010). Parametric and non-parametric masking of randomness in sequence alignments can be improved and leads to better resolved trees. *Frontiers in Zoology*, 7(1), 10. + +**trimAl:** +Capella-Gutiérrez, S., Silla-Martínez, J. M., & Gabaldón, T. (2009). trimAl: a tool for automated alignment trimming in large-scale phylogenetic analyses. *Bioinformatics*, 25(15), 1972-1973. + +**BMGE:** +Criscuolo, A., & Gribaldo, S. (2010). BMGE (Block Mapping and Gathering with Entropy): a new software for selection of phylogenetic informative regions from multiple sequence alignments. *BMC Evolutionary Biology*, 10(1), 210. + +**ClipKit:** +Steenwyk, J. L., Buida III, T. J., Li, Y., Shen, X. X., & Rokas, A. (2020). ClipKIT: a multiple sequence alignment trimming software for accurate phylogenomic inference. *PLOS Biology*, 18(12), e3001007. + +### Software Download Links + +- **compleasm:** https://github.com/huangnengCSU/compleasm +- **BUSCO:** https://busco.ezlab.org/ +- **MAFFT:** https://mafft.cbrc.jp/alignment/software/ +- **Aliscore:** https://github.com/PatrickKueck/AliCUT (includes Aliscore.02.2.pl) +- **ALICUT:** https://github.com/PatrickKueck/AliCUT +- **trimAl:** https://github.com/inab/trimal +- **BMGE:** https://gitlab.pasteur.fr/GIPhy/BMGE +- **ClipKit:** https://github.com/JLSteenwyk/ClipKIT +- **IQ-TREE:** https://github.com/iqtree/iqtree2 +- **ASTRAL:** https://github.com/smirarab/ASTRAL +- **FASconCAT-G:** https://github.com/PatrickKueck/FASconCAT-G +- **NCBI Datasets:** https://www.ncbi.nlm.nih.gov/datasets/ + +--- + +## Software Installation Guide + +This section provides detailed installation instructions for all tools, with options for both conda-based and manual installations. All methods work without sudo/admin access. + +### Automated Setup Script (Recommended for Most Users) + +Use this automated bash script to set up a complete unified conda environment with all necessary tools. This is the script referenced in SKILL.md STEP 0. 
+ +**Generate `setup_phylo_env.sh` with the following content:** + +```bash +#!/bin/bash +# setup_phylo_env.sh +# Sets up unified conda environment for phylogenomics workflow +# Generated by Claude phylo_from_buscos skill + +set -e + +echo "==========================================" +echo "Phylogenomics Environment Setup" +echo "==========================================" +echo "" + +# Check if conda is available +if ! command -v conda &> /dev/null; then + echo "ERROR: conda not found. Please install Miniconda or Anaconda first." + echo "Visit: https://docs.conda.io/en/latest/miniconda.html" + exit 1 +fi + +# Ask user preference for conda vs mamba +echo "We will use Anaconda/Miniconda to set up the software environment." +echo "" +echo "Package Manager Options:" +echo " 1) mamba (faster, recommended if available)" +echo " 2) conda (standard, always available)" +echo "" +read -p "Enter choice [1-2] (default: 2): " PKG_MGR_CHOICE +PKG_MGR_CHOICE=${PKG_MGR_CHOICE:-2} + +if [ "${PKG_MGR_CHOICE}" = "1" ]; then + if command -v mamba &> /dev/null; then + PKG_MANAGER="mamba" + echo "Using mamba for environment creation" + else + echo "WARNING: mamba not found. Falling back to conda." + echo "To install mamba: conda install -n base -c conda-forge mamba" + PKG_MANAGER="conda" + fi +else + PKG_MANAGER="conda" + echo "Using conda for environment creation" +fi + +echo "" + +# Environment name +ENV_NAME="phylo" + +echo "Creating environment: ${ENV_NAME}" +echo "" + +# Create environment with all tools (using chosen package manager) +${PKG_MANAGER} create -n ${ENV_NAME} -y \ + -c conda-forge -c bioconda \ + python=3.9 \ + astral-tree \ + compleasm \ + mafft \ + trimal \ + clipkit \ + bmge \ + iqtree \ + perl \ + perl-bioperl \ + parallel \ + wget \ + ncbi-datasets-cli \ + openjdk + +echo "" +echo "Environment created successfully!" +echo "" + +# Setup Aliscore and ALICUT Perl scripts +echo "Setting up Aliscore and ALICUT Perl scripts..." +echo "" + +# Activate environment +source "$(conda info --base)/etc/profile.d/conda.sh" +conda activate ${ENV_NAME} + +# Get the directory where this skill is located +SKILL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Create scripts directory +mkdir -p scripts + +# Ask user preference for script source +echo "Aliscore/ALICUT Script Source Options:" +echo " 1) Use predownloaded scripts (from Paul Frandsen's tutorial, tested)" +echo " 2) Download latest versions from official repository" +echo "" +read -p "Enter choice [1-2] (default: 1): " SCRIPT_CHOICE +SCRIPT_CHOICE=${SCRIPT_CHOICE:-1} + +if [ "${SCRIPT_CHOICE}" = "1" ]; then + echo "Using predownloaded Aliscore/ALICUT scripts..." + + # Download predownloaded scripts from GitHub repository + GITHUB_BASE="https://raw.githubusercontent.com/brunoasm/my_claude_skills/main/phylo_from_buscos/scripts/predownloaded_aliscore_alicut" + if wget -q "${GITHUB_BASE}/Aliscore.02.2.pl" -O scripts/Aliscore.02.2.pl && \ + wget -q "${GITHUB_BASE}/ALICUT_V2.31.pl" -O scripts/ALICUT_V2.31.pl && \ + wget -q "${GITHUB_BASE}/Aliscore_module.pm" -O scripts/Aliscore_module.pm; then + chmod +x scripts/Aliscore.02.2.pl scripts/ALICUT_V2.31.pl + echo "Predownloaded scripts downloaded successfully." + else + echo "ERROR: Failed to download predownloaded scripts. Falling back to download option." + SCRIPT_CHOICE="2" + fi +fi + +if [ "${SCRIPT_CHOICE}" = "2" ]; then + echo "Downloading latest Aliscore/ALICUT scripts from GitHub..." 
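+    # Note: the URLs below assume the current layout of the PatrickKueck/AliCUT
+    # repository; adjust the subdirectory paths if upstream reorganizes them.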
+ + # Try to download from GitHub repository + if wget -q https://github.com/PatrickKueck/AliCUT/raw/master/Aliscore_v.2.0/Aliscore.02.2.pl -O scripts/Aliscore.02.2.pl && \ + wget -q https://github.com/PatrickKueck/AliCUT/raw/master/ALICUT_V2.3.1/ALICUT_V2.31.pl -O scripts/ALICUT_V2.31.pl && \ + wget -q https://github.com/PatrickKueck/AliCUT/raw/master/Aliscore_v.2.0/Aliscore_module.pm -O scripts/Aliscore_module.pm; then + chmod +x scripts/Aliscore.02.2.pl scripts/ALICUT_V2.31.pl + echo "Latest scripts downloaded successfully." + else + echo "ERROR: Failed to download scripts from GitHub." + echo "Please manually download from: https://github.com/PatrickKueck/AliCUT" + exit 1 + fi +fi + +echo "" +echo "==========================================" +echo "Setup Complete!" +echo "==========================================" +echo "" +echo "Conda environment: ${ENV_NAME}" +echo "Perl with BioPerl: installed" +echo "Aliscore script: scripts/Aliscore.02.2.pl" +echo "ALICUT script: scripts/ALICUT_V2.31.pl" +echo "Aliscore module: scripts/Aliscore_module.pm" +echo "" +echo "To activate environment:" +echo " conda activate ${ENV_NAME}" +echo "" +echo "Key tools installed:" +conda list | grep -E "compleasm|mafft|trimal|clipkit|bmge|iqtree|astral|parallel|perl|openjdk" +echo "" +``` + +**Usage:** +```bash +# Run setup script +bash setup_phylo_env.sh + +# Activate environment for all workflow steps +conda activate phylo +``` + +**This unified environment includes:** +- `compleasm` - BUSCO ortholog identification +- `mafft` - Multiple sequence alignment +- `trimal`, `clipkit`, `bmge` - Alignment trimming +- `iqtree` - Phylogenetic inference +- `astral-tree` - Species tree inference (coalescent method) +- `openjdk` - Java runtime for ASTRAL and other tools +- `perl` with BioPerl - Required for Aliscore/ALICUT +- `parallel` - GNU parallel for batch processing +- `ncbi-datasets-cli` - For NCBI genome downloads + +**Important Notes:** +- Users can choose between using mamba (faster) or conda (standard) for environment creation +- Users can choose between predownloaded Aliscore/ALICUT scripts (tested with tutorial) or latest versions from GitHub +- Predownloaded scripts are downloaded from the GitHub repository +- All subsequent workflow steps should use `conda activate phylo` instead of creating separate environments +- The unified environment simplifies workflow management and reduces disk space usage +- ASTRAL is installed as `astral-tree` and accessible via the `astral` command (no manual download needed) + +--- + +### Conda/Bioconda Installation (Recommended) + +Most tools are available via conda/bioconda and work on both Linux and macOS (including Apple Silicon with Rosetta). 
+ +#### Initial Setup + +```bash +# Add channels (one-time setup) +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge +conda config --set channel_priority strict +``` + +#### Core Tools (Available via Conda) + +```bash +# Create main phylogenomics environment +conda create -n phylogenomics -c conda-forge -c bioconda \ + compleasm \ + mafft \ + iqtree \ + trimal \ + bmge \ + clipkit \ + astral-tree \ + perl \ + perl-bioperl \ + python=3.9 \ + biopython + +conda activate phylogenomics +``` + +**Individual installations** (if you prefer separate environments): + +```bash +# Ortholog detection +conda create -n compleasm -c conda-forge -c bioconda compleasm +# Alternative: BUSCO +conda create -n busco -c conda-forge -c bioconda busco + +# Alignment +conda create -n mafft -c conda-forge -c bioconda mafft + +# Trimming tools (choose one or more) +conda create -n trimal -c bioconda trimal +conda create -n bmge -c bioconda bmge +conda create -n clipkit -c bioconda clipkit + +# Phylogenetic inference +conda create -n iqtree -c bioconda iqtree +conda create -n astral -c bioconda astral-tree + +# NCBI data download +conda create -n ncbi -c conda-forge ncbi-datasets-cli +``` + +#### Platform-Specific Notes + +**macOS (Intel and Apple Silicon):** +- Most tools work natively on Intel Macs +- Apple Silicon (M1/M2/M3) may require Rosetta 2 for some packages +- If you encounter issues, use: `CONDA_SUBDIR=osx-64 conda create -n myenv ...` + +**Linux:** +- All tools work natively +- HPC systems: Use `module load conda` or install Miniforge in your home directory + +### Manual Installation (Tools NOT in Conda) + +Some tools require manual download and setup within your conda environment. + +#### 1. Aliscore and ALICUT + +Aliscore and ALICUT are Perl scripts that need to be installed into your conda environment. + +**Installation:** + +```bash +# Activate your environment first +conda activate phylogenomics + +# Create temporary download directory +mkdir -p /tmp/phylo_downloads +cd /tmp/phylo_downloads + +# Download Aliscore scripts +wget https://raw.githubusercontent.com/PatrickKueck/AliCUT/master/Aliscore.02.2.pl +wget https://raw.githubusercontent.com/PatrickKueck/AliCUT/master/Aliscore_module.pm + +# Download ALICUT +wget https://raw.githubusercontent.com/PatrickKueck/AliCUT/master/ALICUT_V2.31.pl + +# Install into conda environment bin directory +mkdir -p $CONDA_PREFIX/bin +cp Aliscore.02.2.pl $CONDA_PREFIX/bin/ +cp Aliscore_module.pm $CONDA_PREFIX/bin/ +cp ALICUT_V2.31.pl $CONDA_PREFIX/bin/ + +# Make scripts executable +chmod +x $CONDA_PREFIX/bin/Aliscore.02.2.pl +chmod +x $CONDA_PREFIX/bin/ALICUT_V2.31.pl + +# Create convenient wrapper scripts without .pl extension +cat > $CONDA_PREFIX/bin/aliscore <<'EOF' +#!/bin/bash +perl $(dirname $0)/Aliscore.02.2.pl "$@" +EOF + +cat > $CONDA_PREFIX/bin/alicut <<'EOF' +#!/bin/bash +perl $(dirname $0)/ALICUT_V2.31.pl "$@" +EOF + +chmod +x $CONDA_PREFIX/bin/aliscore +chmod +x $CONDA_PREFIX/bin/alicut + +# Cleanup +cd ~ +rm -rf /tmp/phylo_downloads +``` + +**Verify installation:** + +```bash +# Test with wrapper commands +aliscore +alicut -h + +# Or call Perl scripts directly +perl $CONDA_PREFIX/bin/Aliscore.02.2.pl +perl $CONDA_PREFIX/bin/ALICUT_V2.31.pl -h +``` + +**Note:** The Aliscore_module.pm must be in the same directory as Aliscore.02.2.pl. Since both are installed to `$CONDA_PREFIX/bin`, this is handled automatically. + +#### 2. 
FASconCAT-G + +FASconCAT-G is a Perl script for concatenating multiple sequence alignments. + +**Installation:** + +```bash +# Activate your environment +conda activate phylogenomics + +# Create temporary download directory +mkdir -p /tmp/phylo_downloads +cd /tmp/phylo_downloads + +# Download FASconCAT-G +wget https://raw.githubusercontent.com/PatrickKueck/FASconCAT-G/master/FASconCAT-G_v1.06.1.pl + +# Install into conda environment bin directory +mkdir -p $CONDA_PREFIX/bin +cp FASconCAT-G_v1.06.1.pl $CONDA_PREFIX/bin/ + +# Make executable +chmod +x $CONDA_PREFIX/bin/FASconCAT-G_v1.06.1.pl + +# Create convenient wrapper script +cat > $CONDA_PREFIX/bin/fasconcat <<'EOF' +#!/bin/bash +perl $(dirname $0)/FASconCAT-G_v1.06.1.pl "$@" +EOF + +chmod +x $CONDA_PREFIX/bin/fasconcat + +# Cleanup +cd ~ +rm -rf /tmp/phylo_downloads +``` + +**Verify installation:** + +```bash +# Test with wrapper command +fasconcat +# Should display the interactive menu + +# Or call Perl script directly +perl $CONDA_PREFIX/bin/FASconCAT-G_v1.06.1.pl +``` + +#### 3. IQ-TREE (Alternative: Direct Binary Download) + +While IQ-TREE is available via conda, you may want the latest version directly from GitHub. + +**Installation (Linux):** + +```bash +conda activate phylogenomics + +# Create temporary download directory +mkdir -p /tmp/phylo_downloads +cd /tmp/phylo_downloads + +# Download and extract +wget https://github.com/iqtree/iqtree2/releases/download/v2.3.6/iqtree-2.3.6-Linux-intel.tar.gz +tar -xzf iqtree-2.3.6-Linux-intel.tar.gz + +# Install binaries into conda environment +mkdir -p $CONDA_PREFIX/bin +cp iqtree-2.3.6-Linux-intel/bin/iqtree2 $CONDA_PREFIX/bin/ +chmod +x $CONDA_PREFIX/bin/iqtree2 + +# Create symlink for version-agnostic usage +ln -sf $CONDA_PREFIX/bin/iqtree2 $CONDA_PREFIX/bin/iqtree + +# Cleanup +cd ~ +rm -rf /tmp/phylo_downloads +``` + +**Installation (macOS Intel):** + +```bash +conda activate phylogenomics + +mkdir -p /tmp/phylo_downloads +cd /tmp/phylo_downloads + +# Download and extract +wget https://github.com/iqtree/iqtree2/releases/download/v2.3.6/iqtree-2.3.6-macOS-intel.tar.gz +tar -xzf iqtree-2.3.6-macOS-intel.tar.gz + +# Install into conda environment +mkdir -p $CONDA_PREFIX/bin +cp iqtree-2.3.6-macOS-intel/bin/iqtree2 $CONDA_PREFIX/bin/ +chmod +x $CONDA_PREFIX/bin/iqtree2 + +# Create symlink for version-agnostic usage +ln -sf $CONDA_PREFIX/bin/iqtree2 $CONDA_PREFIX/bin/iqtree + +# Cleanup +cd ~ +rm -rf /tmp/phylo_downloads +``` + +**Installation (macOS Apple Silicon):** + +```bash +conda activate phylogenomics + +mkdir -p /tmp/phylo_downloads +cd /tmp/phylo_downloads + +# Download and extract +wget https://github.com/iqtree/iqtree2/releases/download/v2.3.6/iqtree-2.3.6-macOS-arm.tar.gz +tar -xzf iqtree-2.3.6-macOS-arm.tar.gz + +# Install into conda environment +mkdir -p $CONDA_PREFIX/bin +cp iqtree-2.3.6-macOS-arm/bin/iqtree2 $CONDA_PREFIX/bin/ +chmod +x $CONDA_PREFIX/bin/iqtree2 + +# Create symlink for version-agnostic usage +ln -sf $CONDA_PREFIX/bin/iqtree2 $CONDA_PREFIX/bin/iqtree + +# Cleanup +cd ~ +rm -rf /tmp/phylo_downloads +``` + +**Verify installation:** + +```bash +iqtree --version +# or +iqtree2 --version +``` + +#### 4. ASTRAL (Alternative: Direct JAR Download) + +While ASTRAL is available via conda, you can also install the JAR file directly. 
+ +**Installation:** + +```bash +conda activate phylogenomics + +# Create temporary download directory +mkdir -p /tmp/phylo_downloads +cd /tmp/phylo_downloads + +# Download ASTRAL +wget https://github.com/smirarab/ASTRAL/raw/master/Astral.5.7.8.zip +unzip Astral.5.7.8.zip + +# Install JAR file into conda environment +mkdir -p $CONDA_PREFIX/share/astral +cp Astral/astral.5.7.8.jar $CONDA_PREFIX/share/astral/ +cp -r Astral/lib $CONDA_PREFIX/share/astral/ 2>/dev/null || true + +# Create wrapper script +cat > $CONDA_PREFIX/bin/astral-jar <<'EOF' +#!/bin/bash +java -jar $CONDA_PREFIX/share/astral/astral.5.7.8.jar "$@" +EOF + +chmod +x $CONDA_PREFIX/bin/astral-jar + +# Cleanup +cd ~ +rm -rf /tmp/phylo_downloads +``` + +**Verify installation:** + +```bash +astral-jar -h +``` + +**Note:** This creates an `astral-jar` command to avoid conflicts with the conda `astral` package if both are installed. + +### Verification Script + +Create a script to verify all installations: + +```bash +#!/bin/bash +# verify_installations.sh + +echo "==============================================" +echo "Phylogenomics Tools Installation Verification" +echo "==============================================" +echo "" +echo "Conda environment: $CONDA_PREFIX" +echo "" + +# Function to check command +check_cmd() { + if command -v $1 &> /dev/null; then + version=$($1 --version 2>&1 | head -n 1 || echo "installed") + echo "✓ $1 is installed ($version)" + return 0 + else + echo "✗ $1 is NOT installed" + return 1 + fi +} + +# Function to check file +check_file() { + if [ -f "$1" ]; then + echo "✓ $2 is installed at $1" + return 0 + else + echo "✗ $2 is NOT installed" + return 1 + fi +} + +echo "Conda-installed tools:" +echo "----------------------" +check_cmd compleasm +check_cmd mafft +check_cmd trimal +check_cmd bmge +check_cmd clipkit +check_cmd iqtree || check_cmd iqtree2 +check_cmd astral + +echo "" +echo "Manually-installed Perl scripts:" +echo "---------------------------------" + +# Check Aliscore +if command -v aliscore &> /dev/null; then + echo "✓ Aliscore is available (wrapper: aliscore)" + check_file "$CONDA_PREFIX/bin/Aliscore.02.2.pl" " Aliscore.02.2.pl" + check_file "$CONDA_PREFIX/bin/Aliscore_module.pm" " Aliscore_module.pm" +elif [ -f "$CONDA_PREFIX/bin/Aliscore.02.2.pl" ]; then + echo "✓ Aliscore.02.2.pl is installed (no wrapper)" +else + echo "✗ Aliscore is NOT installed" +fi + +# Check ALICUT +if command -v alicut &> /dev/null; then + echo "✓ ALICUT is available (wrapper: alicut)" + check_file "$CONDA_PREFIX/bin/ALICUT_V2.31.pl" " ALICUT_V2.31.pl" +elif [ -f "$CONDA_PREFIX/bin/ALICUT_V2.31.pl" ]; then + echo "✓ ALICUT_V2.31.pl is installed (no wrapper)" +else + echo "✗ ALICUT is NOT installed" +fi + +# Check FASconCAT-G +if command -v fasconcat &> /dev/null; then + echo "✓ FASconCAT-G is available (wrapper: fasconcat)" + check_file "$CONDA_PREFIX/bin/FASconCAT-G_v1.06.1.pl" " FASconCAT-G_v1.06.1.pl" +elif [ -f "$CONDA_PREFIX/bin/FASconCAT-G_v1.06.1.pl" ]; then + echo "✓ FASconCAT-G_v1.06.1.pl is installed (no wrapper)" +else + echo "✗ FASconCAT-G is NOT installed" +fi + +echo "" +echo "Alternative installations:" +echo "--------------------------" + +# Check manually installed IQ-TREE +if [ -f "$CONDA_PREFIX/bin/iqtree2" ] && [ ! 
-L "$CONDA_PREFIX/bin/iqtree2" ]; then + echo "✓ IQ-TREE2 binary (manually installed)" + if [ -L "$CONDA_PREFIX/bin/iqtree" ]; then + echo " ✓ iqtree symlink present" + fi +fi + +# Check manually installed ASTRAL +if command -v astral-jar &> /dev/null; then + echo "✓ ASTRAL JAR (manually installed, command: astral-jar)" + check_file "$CONDA_PREFIX/share/astral/astral.5.7.8.jar" " astral.5.7.8.jar" +fi + +echo "" +echo "==============================================" +echo "Verification complete!" +echo "==============================================" +``` + +**Save and run:** + +```bash +# Save the script +cat > verify_installations.sh <<'SCRIPT' +# ... paste the script above ... +SCRIPT + +chmod +x verify_installations.sh + +# Run it +./verify_installations.sh +``` + +### Troubleshooting + +**Problem: Conda is slow** +- Solution: Use `mamba` instead: `conda install -n base mamba -c conda-forge` +- Then use `mamba` instead of `conda` for all installations + +**Problem: Perl script can't find modules** +```bash +# Install additional Perl dependencies +conda install -c bioconda perl-bioperl perl-file-copy-recursive +``` + +**Problem: Java not found for ASTRAL** +```bash +conda install -c conda-forge openjdk +``` + +**Problem: Permission denied on HPC** +```bash +# Install Miniforge in home directory +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh +bash Miniforge3-Linux-x86_64.sh -b -p $HOME/miniforge3 +source $HOME/miniforge3/etc/profile.d/conda.sh +``` + +**Problem: Apple Silicon compatibility** +```bash +# Force x86_64 architecture +CONDA_SUBDIR=osx-64 conda create -n phylogenomics ... +conda activate phylogenomics +conda config --env --set subdir osx-64 +``` + +--- + +## Docker Container Specification + +If using Docker, here's a complete Dockerfile with all tools: + +```dockerfile +FROM mambaorg/micromamba:latest + +LABEL maintainer="Bruno de Medeiros " +LABEL description="Complete environment for BUSCO-based phylogenomics" + +# Install all phylogenomics tools +RUN micromamba install -y -n base -c conda-forge -c bioconda \ + compleasm \ + busco \ + mafft \ + trimal \ + bmge \ + clipkit \ + iqtree \ + ncbi-datasets-cli \ + python=3.9 \ + biopython \ + perl \ + openjdk \ + wget \ + unzip \ + && micromamba clean --all --yes + +# Set working directory +WORKDIR /data + +# Set entrypoint +ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] +CMD ["/bin/bash"] +``` + +Build and run: +```bash +docker build -t phylogenomics:latest . +docker run -v $(pwd):/data -it phylogenomics:latest +``` + +--- + +## Substitution Model Recommendation Detailed Guide + +This section provides the detailed decision matrix and recommendation process for IQ-TREE substitution model selection. + +### Model Recommendation Matrix + +#### For Nuclear Proteins (most phylogenomics workflows) + +**Deep Phylogeny (class+ level), Many Taxa (>50)**: +1. **LG+F+G4** or **LG+F+R** - Best general model, widely used +2. **WAG+F+G4** - Alternative general model +3. **LG4X** or **LG4M** - Mixture models for heterogeneity +4. **Q.pfam+F+G4** - Database-derived, broad taxonomic sampling +5. **JTT+F+G4** - Classical alternative + +**Moderate Phylogeny (family/order level), Moderate Taxa (20-50)**: +1. **LG+F+G4** - Top choice for most analyses +2. **WAG+F+G4** - Reliable alternative +3. **JTT+F+G4** - Classical model +4. **Q.pfam+F+G4** - Database-derived option + +**Shallow Phylogeny (species/genus level), Few Taxa (<20)**: +1. **LG+F+G4** - Still recommended default +2. 
**WAG+F+G4** - Good alternative +3. **JTT+F+G4** - Classical option + +#### Taxonomically-Targeted Models + +When applicable, use these taxonomically-specific models: + +- **Birds**: Q.bird+F+G4 +- **Mammals**: Q.mammal+F+G4, mtMAM (if mtDNA) +- **Insects**: Q.insect+F+G4, mtART (if mtDNA) +- **Plants**: Q.plant+F+G4, cpREV (if chloroplast) +- **Yeasts/Fungi**: Q.yeast+F+G4 + +#### Model Notation Explained + +- **LG, WAG, JTT**: Empirical exchange rate matrix name +- **+F**: Use empirical amino acid frequencies from data (recommended) +- **+G4**: Gamma model with 4 rate categories for among-site rate variation +- **+R**: FreeRate model (alternative to Gamma, often better but slower) + +### Presenting Model Recommendations + +Format recommendations like this: + +```markdown +### Recommended Substitution Models for Your Dataset + +Based on your dataset ([NUMBER] taxa, [TAXONOMIC_SCOPE] phylogenetic breadth, [SEQUENCE_TYPE]): + +**Primary Recommendations** (use all in model testing): + +1. **LG+F+G4** + - **Why**: Most widely used modern AA model, performs well across diverse datasets + - **Citations**: Proven effective in numerous phylogenomic studies + +2. **WAG+F+G4** + - **Why**: Excellent general-purpose alternative, often comparable to LG + - **Use case**: Good backup if LG shows poor fit + +3. **JTT+F+G4** + - **Why**: Classical model still widely used, allows comparison with older studies + - **Use case**: Historical comparisons + +[Add 2-3 more specific to their data...] + +**For IQ-TREE Step 8A**, we'll use these models in the partition search: +```bash +-mset LG,WAG,JTT,Q.pfam # Model set for testing +-m TESTMERGEONLY # Test models and merge partitions +``` + +**For Step 8C (gene trees)**, we'll use: +```bash +-m MFP # Model Finder Plus (tests models from our set) +``` + +Would you like to: +1. ✓ Use these recommended models (recommended) +2. Specify a custom model set +3. Let me fetch more information about your specific taxonomic group +``` + +--- + +## Ortholog Identification Implementation + +This section provides detailed implementation scripts for Step 2 (compleasm). + +### Threading Allocation Table + +| Total Cores | First Genome | Subsequent Genomes | Concurrent Jobs | Threads/Job | +|-------------|--------------|-------------------|-----------------|-------------| +| 8 | 8 threads | 8 threads (serial)| 1 | 8 | +| 16 | 16 threads | 8 threads | 2 | 8 | +| 32 | 32 threads | 8 threads | 4 | 8 | +| 64 | 64 threads | 16 threads | 4 | 16 | +| 128 | 128 threads | 16-32 threads | 4-8 | 16 | + +### SLURM Implementation (Option A: Optimized Parallel) + +**First genome job** (`run_compleasm_first.job`): +```bash +#!/bin/bash +#SBATCH --job-name=compleasm_first +#SBATCH --cpus-per-task=TOTAL_THREADS # Replace with all available cores +#SBATCH --mem-per-cpu=6G +#SBATCH --time=24:00:00 +#SBATCH --output=logs/compleasm_first.%j.out +#SBATCH --error=logs/compleasm_first.%j.err + +source ~/.bashrc +conda activate phylo + +mkdir -p logs + +# Process first genome only (downloads lineage database) +first_genome=$(head -n 1 genome_list.txt) +genome_name=$(basename ${first_genome} .fasta) + +echo "Processing first genome: ${genome_name}" +echo "This will download the BUSCO lineage database..." + +compleasm run \ + -a ${first_genome} \ + -o ${genome_name}_compleasm \ + -l LINEAGE \ + -t ${SLURM_CPUS_PER_TASK} + +echo "First genome complete. Database downloaded." +echo "Ready to process remaining genomes in parallel." 
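+
+# Note: -l LINEAGE is a placeholder; replace it with your BUSCO lineage dataset
+# name (e.g., metazoa), the same lineage passed later to extract_orthologs.sh.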
+``` + +**Parallel genomes job** (`run_compleasm_parallel.job`): +```bash +#!/bin/bash +#SBATCH --job-name=compleasm_parallel +#SBATCH --array=2-NUM_GENOMES # Replace NUM_GENOMES with count +#SBATCH --cpus-per-task=THREADS_PER_JOB # Replace based on table above +#SBATCH --mem-per-cpu=6G +#SBATCH --time=48:00:00 +#SBATCH --output=logs/compleasm.%A_%a.out +#SBATCH --error=logs/compleasm.%A_%a.err + +source ~/.bashrc +conda activate phylo + +# Get genome for this array task +genome=$(sed -n "${SLURM_ARRAY_TASK_ID}p" genome_list.txt) +genome_name=$(basename ${genome} .fasta) + +echo "Processing genome ${SLURM_ARRAY_TASK_ID}: ${genome_name}" + +compleasm run \ + -a ${genome} \ + -o ${genome_name}_compleasm \ + -l LINEAGE \ + -t ${SLURM_CPUS_PER_TASK} + +echo "Completed: ${genome_name}" +``` + +### PBS Implementation (Option A: Optimized Parallel) + +Similar structure to SLURM, but with PBS directives. See templates in `templates/pbs/`. + +### Local Implementation (Option A: Optimized Parallel) + +**First genome script** (`run_compleasm_first.sh`): +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +TOTAL_THREADS=TOTAL_THREADS # Replace with total cores to use + +mkdir -p logs + +# Process first genome only +first_genome=$(head -n 1 genome_list.txt) +genome_name=$(basename ${first_genome} .fasta) + +echo "Processing first genome: ${genome_name}" +echo "Using ${TOTAL_THREADS} threads" +echo "This will download the BUSCO lineage database..." + +compleasm run \ + -a ${first_genome} \ + -o ${genome_name}_compleasm \ + -l LINEAGE \ + -t ${TOTAL_THREADS} 2>&1 | tee logs/compleasm_first.log + +echo "First genome complete!" +``` + +**Parallel genomes script** (`run_compleasm_parallel.sh`): +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +TOTAL_THREADS=TOTAL_THREADS # Replace with total cores available +THREADS_PER_JOB=THREADS_PER_JOB # Replace based on table above +CONCURRENT_JOBS=$((TOTAL_THREADS / THREADS_PER_JOB)) + +echo "Parallel compleasm processing" +echo "Total threads: ${TOTAL_THREADS}" +echo "Threads per genome: ${THREADS_PER_JOB}" +echo "Concurrent genomes: ${CONCURRENT_JOBS}" +echo "" + +mkdir -p logs + +# Process genomes 2-end in parallel using GNU parallel +tail -n +2 genome_list.txt | parallel -j ${CONCURRENT_JOBS} ' + genome_name=$(basename {} .fasta) + echo "Processing: ${genome_name}" + + compleasm run \ + -a {} \ + -o ${genome_name}_compleasm \ + -l LINEAGE \ + -t THREADS_PER_JOB 2>&1 | tee logs/compleasm_${genome_name}.log + + echo "Completed: ${genome_name}" +' + +echo "" +echo "All genomes processed!" +``` + +### Simple Serial Implementation (All Platforms) + +For users who prefer simplicity over optimization: + +**SLURM** (`run_compleasm_serial.job`): +```bash +#!/bin/bash +#SBATCH --job-name=compleasm_serial +#SBATCH --cpus-per-task=THREADS +#SBATCH --mem-per-cpu=6G +#SBATCH --time=72:00:00 +#SBATCH --output=logs/compleasm.%j.out + +source ~/.bashrc +conda activate phylo + +mkdir -p logs + +while read genome; do + genome_name=$(basename ${genome} .fasta) + echo "Processing ${genome_name}..." + + compleasm run \ + -a ${genome} \ + -o ${genome_name}_compleasm \ + -l LINEAGE \ + -t ${SLURM_CPUS_PER_TASK} +done < genome_list.txt + +echo "All genomes processed!" +``` + +--- + +## Alignment Implementation + +This section provides detailed implementation scripts for Step 5 (MAFFT alignment). 
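+
+All of the scripts below read locus file names from `locus_names.txt`, which they do not create. A minimal sketch to build it first (assuming the per-locus `.fas` files from `extract_orthologs.sh` are in `single_copy_orthologs/unaligned_aa/`):
+
+```bash
+# Build the locus list that the array and serial jobs iterate over
+cd single_copy_orthologs/unaligned_aa
+ls *.fas > locus_names.txt
+wc -l locus_names.txt   # use this count to replace NUM_LOCI in the array directives
+```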
+ +### SLURM Array Job + +```bash +#!/bin/bash +#SBATCH --job-name=mafft_array +#SBATCH --array=1-NUM_LOCI # Replace with actual number +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=4G +#SBATCH --time=24:00:00 +#SBATCH --output=logs/%A_%a.mafft.out +#SBATCH --error=logs/%A_%a.mafft.err + +source ~/.bashrc +conda activate phylo + +cd single_copy_orthologs/unaligned_aa +mkdir -p ../aligned_aa + +locus=$(sed -n "${SLURM_ARRAY_TASK_ID}p" locus_names.txt) +output=$(basename ${locus} .fas)_aligned.fas + +echo "Aligning: ${locus}" + +mafft-linsi ${locus} > ../aligned_aa/${output} + +echo "Completed: ${output}" +``` + +### PBS Array Job + +```bash +#!/bin/bash +#PBS -N mafft_array +#PBS -t 1-NUM_LOCI +#PBS -l nodes=1:ppn=1 +#PBS -l mem=4gb +#PBS -l walltime=24:00:00 + +cd $PBS_O_WORKDIR/single_copy_orthologs/unaligned_aa +source ~/.bashrc +conda activate phylo + +mkdir -p ../aligned_aa + +locus=$(sed -n "${PBS_ARRAYID}p" locus_names.txt) +output=$(basename ${locus} .fas)_aligned.fas + +mafft-linsi ${locus} > ../aligned_aa/${output} +``` + +### Local Sequential + +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd single_copy_orthologs/unaligned_aa +mkdir -p ../aligned_aa + +while read locus; do + output=$(basename ${locus} .fas)_aligned.fas + echo "Aligning ${locus}..." + + mafft-linsi ${locus} > ../aligned_aa/${output} +done < locus_names.txt + +echo "All alignments complete!" +``` + +### Local Parallel (GNU parallel) + +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd single_copy_orthologs/unaligned_aa +mkdir -p ../aligned_aa + +CONCURRENT_JOBS=4 # Adjust based on available cores + +cat locus_names.txt | parallel -j ${CONCURRENT_JOBS} ' + output=$(basename {} .fas)_aligned.fas + echo "Aligning: {}" + mafft-linsi {} > ../aligned_aa/${output} + echo "Completed: ${output}" +' + +echo "All alignments complete!" +``` + +--- + +## Alignment Trimming Implementation + +This section provides detailed implementation scripts for Step 6 (alignment trimming) using all supported methods. 
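+
+Only the trimAl SLURM and PBS jobs below regenerate `aligned_loci.txt`; the other variants read it without creating it. To be safe, build the list once before running anything (a minimal sketch, assuming the MAFFT step wrote `*_aligned.fas` files into `aligned_aa/`):
+
+```bash
+# Build the list of aligned loci used by every trimming variant
+cd aligned_aa
+ls *.fas > aligned_loci.txt
+wc -l aligned_loci.txt   # use this count to replace NUM_LOCI in the array directives
+```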
+ +### trimAl Implementation + +**SLURM Array Job**: +```bash +#!/bin/bash +#SBATCH --job-name=trimal_array +#SBATCH --array=1-NUM_LOCI +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=2G +#SBATCH --time=2:00:00 +#SBATCH --output=logs/%A_%a.trimal.out + +source ~/.bashrc +conda activate phylo + +cd aligned_aa +mkdir -p ../trimmed_aa +ls *.fas > aligned_loci.txt + +locus=$(sed -n "${SLURM_ARRAY_TASK_ID}p" aligned_loci.txt) +output=$(basename ${locus} _aligned.fas)_trimmed.fas + +trimal -in ${locus} -out ../trimmed_aa/${output} -automated1 +``` + +**PBS Array Job**: +```bash +#!/bin/bash +#PBS -N trimal_array +#PBS -t 1-NUM_LOCI +#PBS -l nodes=1:ppn=1 +#PBS -l mem=2gb +#PBS -l walltime=2:00:00 + +cd $PBS_O_WORKDIR/aligned_aa +source ~/.bashrc +conda activate phylo + +mkdir -p ../trimmed_aa +ls *.fas > aligned_loci.txt + +locus=$(sed -n "${PBS_ARRAYID}p" aligned_loci.txt) +output=$(basename ${locus} _aligned.fas)_trimmed.fas + +trimal -in ${locus} -out ../trimmed_aa/${output} -automated1 +``` + +**Local with GNU parallel**: +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd aligned_aa +mkdir -p ../trimmed_aa + +cat aligned_loci.txt | parallel -j 4 ' + output=$(basename {} _aligned.fas)_trimmed.fas + trimal -in {} -out ../trimmed_aa/${output} -automated1 + echo "Trimmed: ${output}" +' +``` + +### ClipKit Implementation + +**SLURM Array Job**: +```bash +#!/bin/bash +#SBATCH --job-name=clipkit_array +#SBATCH --array=1-NUM_LOCI +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=2G +#SBATCH --time=2:00:00 +#SBATCH --output=logs/%A_%a.clipkit.out + +source ~/.bashrc +conda activate phylo + +cd aligned_aa +mkdir -p ../trimmed_aa + +locus=$(sed -n "${SLURM_ARRAY_TASK_ID}p" aligned_loci.txt) +output=$(basename ${locus} _aligned.fas)_trimmed.fas + +clipkit ${locus} -o ../trimmed_aa/${output} +``` + +**Local with GNU parallel**: +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd aligned_aa +mkdir -p ../trimmed_aa + +cat aligned_loci.txt | parallel -j 4 ' + output=$(basename {} _aligned.fas)_trimmed.fas + clipkit {} -o ../trimmed_aa/${output} +' +``` + +### BMGE Implementation + +**SLURM Array Job**: +```bash +#!/bin/bash +#SBATCH --job-name=bmge_array +#SBATCH --array=1-NUM_LOCI +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=2G +#SBATCH --time=2:00:00 +#SBATCH --output=logs/%A_%a.bmge.out + +source ~/.bashrc +conda activate phylo + +cd aligned_aa +mkdir -p ../trimmed_aa + +locus=$(sed -n "${SLURM_ARRAY_TASK_ID}p" aligned_loci.txt) +output=$(basename ${locus} _aligned.fas)_trimmed.fas + +bmge -i ${locus} -t AA -o ../trimmed_aa/${output} +``` + +**Local with GNU parallel**: +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd aligned_aa +mkdir -p ../trimmed_aa + +cat aligned_loci.txt | parallel -j 4 ' + output=$(basename {} _aligned.fas)_trimmed.fas + bmge -i {} -t AA -o ../trimmed_aa/${output} +' +``` + +--- + +## Partition Model Selection Implementation + +This section provides detailed implementation scripts for Step 8A (IQ-TREE partition model selection). + +### SLURM Implementation + +```bash +#!/bin/bash +#SBATCH --job-name=partition_search +#SBATCH --cpus-per-task=18 +#SBATCH --mem-per-cpu=4G +#SBATCH --time=72:00:00 +#SBATCH --output=logs/partition_search.%j.out +#SBATCH --error=logs/partition_search.%j.err + +source ~/.bashrc +conda activate phylo + +cd trimmed_aa + +echo "Starting partition model selection..." 
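+# MODEL_SET below is a placeholder; replace it with your chosen models,
+# e.g. LG,WAG,JTT,Q.pfam from the model recommendation guide above.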
+echo "Using model set: MODEL_SET" +echo "Threads: ${SLURM_CPUS_PER_TASK}" + +iqtree \ + -s FcC_supermatrix.fas \ + -spp partition_def.txt \ + -m TESTMERGEONLY \ + -mset MODEL_SET \ + -msub nuclear \ + -rcluster 10 \ + -bb 1000 \ + -alrt 1000 \ + -nt ${SLURM_CPUS_PER_TASK} \ + -safe \ + -pre partition_search + +echo "Partition search complete!" +echo "Best scheme: partition_search.best_scheme.nex" +``` + +### PBS Implementation + +```bash +#!/bin/bash +#PBS -N partition_search +#PBS -l nodes=1:ppn=18 +#PBS -l mem=72gb +#PBS -l walltime=72:00:00 + +cd $PBS_O_WORKDIR/trimmed_aa +source ~/.bashrc +conda activate phylo + +iqtree \ + -s FcC_supermatrix.fas \ + -spp partition_def.txt \ + -m TESTMERGEONLY \ + -mset MODEL_SET \ + -msub nuclear \ + -rcluster 10 \ + -bb 1000 \ + -alrt 1000 \ + -nt 18 \ + -safe \ + -pre partition_search +``` + +### Local Implementation + +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd trimmed_aa + +THREADS=18 # Adjust based on available cores + +echo "Starting partition model selection..." +echo "Using ${THREADS} threads" + +iqtree \ + -s FcC_supermatrix.fas \ + -spp partition_def.txt \ + -m TESTMERGEONLY \ + -mset MODEL_SET \ + -msub nuclear \ + -rcluster 10 \ + -bb 1000 \ + -alrt 1000 \ + -nt ${THREADS} \ + -safe \ + -pre partition_search + +echo "Partition search complete!" +``` + +### Parameter Explanations + +- `-m TESTMERGEONLY`: Test models and merge partitions with same best model +- `-mset MODEL_SET`: Test only specified models (e.g., "LG,WAG,JTT,Q.pfam") +- `-msub nuclear`: Rate heterogeneity appropriate for nuclear genes +- `-rcluster 10`: Merge similar partitions (10% relaxed clustering) +- `-bb 1000`: 1000 ultrafast bootstrap replicates +- `-alrt 1000`: 1000 SH-aLRT replicates (additional branch support) +- `-safe`: Safe numerical mode (slower but more stable) + +--- + +## Gene Trees Implementation + +This section provides detailed implementation scripts for Step 8C (individual gene tree estimation). + +### SLURM Array Job + +```bash +#!/bin/bash +#SBATCH --job-name=gene_trees +#SBATCH --array=1-NUM_LOCI # Replace with loci count +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=4G +#SBATCH --time=2:00:00 +#SBATCH --output=logs/%A_%a.gene_tree.out +#SBATCH --error=logs/%A_%a.gene_tree.err + +source ~/.bashrc +conda activate phylo + +cd trimmed_aa +ls *_trimmed.fas > locus_list.txt + +locus=$(sed -n "${SLURM_ARRAY_TASK_ID}p" locus_list.txt) +prefix=$(basename ${locus} .fas) + +echo "Estimating gene tree: ${locus}" + +iqtree \ + -s ${locus} \ + -m MFP \ + -bb 1000 \ + -bnni \ + -czb \ + -nt 1 \ + -pre ${prefix} + +echo "Completed: ${prefix}.treefile" +``` + +### PBS Array Job + +```bash +#!/bin/bash +#PBS -N gene_trees +#PBS -t 1-NUM_LOCI +#PBS -l nodes=1:ppn=1 +#PBS -l mem=4gb +#PBS -l walltime=2:00:00 + +cd $PBS_O_WORKDIR/trimmed_aa +source ~/.bashrc +conda activate phylo + +ls *_trimmed.fas > locus_list.txt + +locus=$(sed -n "${PBS_ARRAYID}p" locus_list.txt) +prefix=$(basename ${locus} .fas) + +iqtree \ + -s ${locus} \ + -m MFP \ + -bb 1000 \ + -bnni \ + -czb \ + -nt 1 \ + -pre ${prefix} +``` + +### Local Parallel (GNU parallel) + +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd trimmed_aa +ls *_trimmed.fas > locus_list.txt + +CONCURRENT_JOBS=4 # Adjust based on available cores + +echo "Estimating gene trees in parallel..." 
+echo "Concurrent jobs: ${CONCURRENT_JOBS}" + +cat locus_list.txt | parallel -j ${CONCURRENT_JOBS} ' + prefix=$(basename {} .fas) + echo "Processing: {}" + + iqtree \ + -s {} \ + -m MFP \ + -bb 1000 \ + -bnni \ + -czb \ + -nt 1 \ + -pre ${prefix} > ${prefix}.log 2>&1 + + echo "Completed: ${prefix}.treefile" +' + +echo "All gene trees estimated!" +``` + +### Local Serial + +```bash +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd trimmed_aa + +echo "Estimating gene trees (serial processing)..." + +for locus in *_trimmed.fas; do + prefix=$(basename ${locus} .fas) + echo "Processing: ${locus}" + + iqtree \ + -s ${locus} \ + -m MFP \ + -bb 1000 \ + -bnni \ + -czb \ + -nt 1 \ + -pre ${prefix} + + echo "Completed: ${prefix}.treefile" +done + +echo "All gene trees estimated!" +``` + +### Parameter Explanations + +- `-m MFP`: Model Finder Plus (automatically select best model) +- `-bb 1000`: 1000 ultrafast bootstrap replicates +- `-bnni`: Reduce NNI iterations to avoid overestimating bootstrap support +- `-czb`: Collapse zero-length branches in final tree +- `-nt 1`: Single thread per gene (parallelize across genes, not within) + +--- + +## Methods Paragraph Template + +This section provides the complete methods paragraph template for publications. + +Use this template to generate a `METHODS_PARAGRAPH.md` file for users. Customize based on their workflow choices. + +```markdown +# Methods Paragraph for Publication + +## Phylogenomic Analysis + +[Copy and customize the text below for your manuscript] + +--- + +### Ortholog Identification and Quality Control + +We identified single-copy orthologs from [NUMBER] genome assemblies using compleasm v[VERSION] (Huang & Li, 2023) with the [LINEAGE_NAME] BUSCO lineage dataset (v[VERSION]). Genomes with completeness scores below [THRESHOLD]% were excluded from downstream analyses. From the retained high-quality genomes, we extracted [NUMBER] single-copy orthologs present in all species. + +### Multiple Sequence Alignment and Trimming + +Each orthologous gene set was aligned using MAFFT v7 (Katoh & Standley, 2013) with the L-INS-i algorithm for accurate alignment of conserved protein sequences. Aligned sequences were then trimmed to remove ambiguously aligned regions using [TRIMMING_METHOD]: + +- **Aliscore/ALICUT**: We used Aliscore v2.2 and ALICUT v2.31 (Kück et al., 2010) to identify and remove randomly similar sequence (RSS) sections. Aliscore identified RSS positions using Monte Carlo resampling with default parameters (window size = 4, treating gaps as ambiguous characters with -N option), and ALICUT removed these positions from the alignments. + +- **trimAl**: We employed trimAl v1.4 (Capella-Gutiérrez et al., 2009) with the -automated1 heuristic method to automatically optimize gap threshold selection. + +- **BMGE**: We used BMGE v1.12 (Criscuolo & Gribaldo, 2010) with entropy-based trimming for amino acid sequences (option -t AA). + +- **ClipKit**: We applied ClipKit v1.3 (Steenwyk et al., 2020) with the default smart-gap mode for phylogenetically informative position selection. + +After trimming, alignments containing fewer than [MIN_LENGTH] informative positions were excluded, resulting in [FINAL_NUMBER] high-quality gene alignments. + +### Phylogenetic Inference + +#### Concatenated Analysis + +Trimmed alignments were concatenated into a supermatrix using FASconCAT-G v1.06.1 (Kück & Longo, 2014), yielding a final alignment of [TOTAL_LENGTH] amino acid positions across [NUMBER] partitions. 
We performed partitioned maximum likelihood (ML) phylogenetic inference using IQ-TREE v2.3 (Minh et al., 2020). The best-fit partitioning scheme and substitution models were selected using ModelFinder (Kalyaanamoorthy et al., 2017) with the TESTMERGEONLY option and [MODEL_SET] model set. Partitions were merged if they shared the same evolutionary model to reduce model complexity. The final tree was inferred using the selected partition scheme, with branch support assessed using 1,000 ultrafast bootstrap replicates (Hoang et al., 2018). To improve accuracy, we used the -bnni option to reduce potential overestimation of bootstrap support. + +#### Coalescent-Based Species Tree + +To account for incomplete lineage sorting, we also inferred a species tree using the multispecies coalescent model. Individual gene trees were estimated for each of the [NUMBER] alignments using IQ-TREE v2.3 with automatic model selection and 1,000 ultrafast bootstrap replicates. To improve accuracy, we used the -bnni option to reduce potential overestimation of bootstrap support and -czb to collapse zero-length branches. The resulting gene trees were summarized into a species tree using ASTRAL-III v5.7.8 (Zhang et al., 2018), which estimates the species tree topology that agrees with the largest number of quartet trees induced by the gene trees. Branch support was quantified using local posterior probabilities. + +### Software and Reproducibility + +All analyses were conducted using conda environments (conda v[VERSION]) to ensure reproducibility. Analysis scripts and detailed workflow documentation are available at [GITHUB_URL or supplementary materials]. + +--- + +## Complete Reference List + +Capella-Gutiérrez, S., Silla-Martínez, J. M., & Gabaldón, T. (2009). trimAl: a tool for automated alignment trimming in large-scale phylogenetic analyses. *Bioinformatics*, 25(15), 1972-1973. https://doi.org/10.1093/bioinformatics/btp348 + +Criscuolo, A., & Gribaldo, S. (2010). BMGE (Block Mapping and Gathering with Entropy): a new software for selection of phylogenetic informative regions from multiple sequence alignments. *BMC Evolutionary Biology*, 10(1), 210. https://doi.org/10.1186/1471-2148-10-210 + +Hoang, D. T., Chernomor, O., von Haeseler, A., Minh, B. Q., & Vinh, L. S. (2018). UFBoot2: improving the ultrafast bootstrap approximation. *Molecular Biology and Evolution*, 35(2), 518-522. https://doi.org/10.1093/molbev/msx281 + +Huang, N., & Li, H. (2023). compleasm: a faster and more accurate reimplementation of BUSCO. *Bioinformatics*, 39(10), btad595. https://doi.org/10.1093/bioinformatics/btad595 + +Kalyaanamoorthy, S., Minh, B. Q., Wong, T. K., von Haeseler, A., & Jermiin, L. S. (2017). ModelFinder: fast model selection for accurate phylogenetic estimates. *Nature Methods*, 14(6), 587-589. https://doi.org/10.1038/nmeth.4285 + +Katoh, K., & Standley, D. M. (2013). MAFFT multiple sequence alignment software version 7: improvements in performance and usability. *Molecular Biology and Evolution*, 30(4), 772-780. https://doi.org/10.1093/molbev/mst010 + +Kück, P., & Longo, G. C. (2014). FASconCAT-G: extensive functions for multiple sequence alignment preparations concerning phylogenetic studies. *Frontiers in Zoology*, 11(1), 81. https://doi.org/10.1186/s12983-014-0081-x + +Kück, P., Meusemann, K., Dambach, J., Thormann, B., von Reumont, B. M., Wägele, J. W., & Misof, B. (2010). Parametric and non-parametric masking of randomness in sequence alignments can be improved and leads to better resolved trees. 
*Frontiers in Zoology*, 7(1), 10. https://doi.org/10.1186/1742-9994-7-10 + +Minh, B. Q., Schmidt, H. A., Chernomor, O., Schrempf, D., Woodhams, M. D., von Haeseler, A., & Lanfear, R. (2020). IQ-TREE 2: new models and efficient methods for phylogenetic inference in the genomic era. *Molecular Biology and Evolution*, 37(5), 1530-1534. https://doi.org/10.1093/molbev/msaa015 + +Steenwyk, J. L., Buida III, T. J., Li, Y., Shen, X. X., & Rokas, A. (2020). ClipKIT: a multiple sequence alignment trimming software for accurate phylogenomic inference. *PLOS Biology*, 18(12), e3001007. https://doi.org/10.1371/journal.pbio.3001007 + +Zhang, C., Rabiee, M., Sayyari, E., & Mirarab, S. (2018). ASTRAL-III: polynomial time species tree reconstruction from partially resolved gene trees. *BMC Bioinformatics*, 19(6), 153. https://doi.org/10.1186/s12859-018-2129-y + +--- + +## Instructions for Use + +1. **Replace placeholders in brackets** with your actual values: + - `[NUMBER]`, `[VERSION]`, `[LINEAGE_NAME]`, `[THRESHOLD]`, `[MIN_LENGTH]`, etc. + +2. **Remove sections for tools you didn't use**: + - Delete the trimming method descriptions you didn't use + - If you only did concatenated OR coalescent analysis, remove the other section + +3. **Adjust detail level** based on your target journal: + - Combine into shorter paragraph for journals with strict word limits + - Expand with more parameter details for bioinformatics journals + +4. **Add to your manuscript**: + - This goes in your Materials and Methods section + - Add all references to your bibliography + +5. **Update version numbers**: + - Check actual versions used: `conda list` in your phylo environment + - Include versions in your methods for reproducibility +``` + +--- + +*This reference guide complements the main BUSCO phylogenomics skill and provides detailed technical specifications for implementation.* diff --git a/skills/phylo_from_buscos/scripts/convert_fasconcat_to_partition.py b/skills/phylo_from_buscos/scripts/convert_fasconcat_to_partition.py new file mode 100755 index 0000000..458b1a5 --- /dev/null +++ b/skills/phylo_from_buscos/scripts/convert_fasconcat_to_partition.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +""" +Convert FASconCAT info file to IQ-TREE partition format + +Usage: + python convert_fasconcat_to_partition.py FcC_info.xls [output_file.txt] + +Author: Bruno de Medeiros (Field Museum) +Based on tutorials by Paul Frandsen (BYU) +""" + +import sys + + +def convert_fcc_to_partition(fcc_file, output_file="partition_def.txt"): + """ + Convert FASconCAT info file to IQ-TREE partition format + + Args: + fcc_file: Path to FcC_info.xls file from FASconCAT + output_file: Path to output partition definition file + """ + + try: + with open(fcc_file, 'r') as f: + lines = f.readlines() + except FileNotFoundError: + print(f"Error: File '{fcc_file}' not found") + sys.exit(1) + + partitions_written = 0 + + with open(output_file, 'w') as out: + # Skip first two header lines (FASconCAT INFO and column headers) + for line in lines[2:]: + line = line.strip() + if line: + parts = line.split('\t') + if len(parts) >= 3: + locus = parts[0] + start = parts[1] + end = parts[2] + out.write(f"AA, {locus} = {start}-{end}\n") + partitions_written += 1 + + print(f"Partition file created: {output_file}") + print(f"Number of partitions: {partitions_written}") + + +def main(): + if len(sys.argv) < 2: + print("Usage: python convert_fasconcat_to_partition.py FcC_info.xls [output_file.txt]") + print("\nConverts FASconCAT info file to IQ-TREE partition format") + 
sys.exit(1) + + fcc_file = sys.argv[1] + output_file = sys.argv[2] if len(sys.argv) > 2 else "partition_def.txt" + + convert_fcc_to_partition(fcc_file, output_file) + + +if __name__ == "__main__": + main() diff --git a/skills/phylo_from_buscos/scripts/download_ncbi_genomes.py b/skills/phylo_from_buscos/scripts/download_ncbi_genomes.py new file mode 100755 index 0000000..9592ba0 --- /dev/null +++ b/skills/phylo_from_buscos/scripts/download_ncbi_genomes.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +Download genomes from NCBI using BioProject or Assembly accessions + +Usage: + python download_ncbi_genomes.py --bioprojects PRJNA12345 PRJEB67890 + python download_ncbi_genomes.py --assemblies GCA_123456789.1 GCF_987654321.1 + +Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib) + +Author: Bruno de Medeiros (Field Museum) +Based on tutorials by Paul Frandsen (BYU) +""" + +import argparse +import sys +import subprocess + + +def download_using_cli(accessions, output_file="genomes.zip"): + """ + Download genomes using NCBI datasets CLI + + Args: + accessions: List of BioProject or Assembly accessions + output_file: Name of output zip file + """ + cmd = ["datasets", "download", "genome", "accession"] + accessions + ["--filename", output_file] + + print(f"Running: {' '.join(cmd)}") + print("") + + try: + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + print(result.stdout) + print(f"\nDownload complete: {output_file}") + print("Extract with: unzip " + output_file) + return True + except subprocess.CalledProcessError as e: + print(f"Error downloading genomes: {e}", file=sys.stderr) + print(e.stderr, file=sys.stderr) + return False + except FileNotFoundError: + print("Error: 'datasets' command not found", file=sys.stderr) + print("Install with: conda install -c conda-forge ncbi-datasets-cli", file=sys.stderr) + return False + + +def get_bioproject_assemblies(bioprojects): + """ + Get assembly accessions for given BioProjects using Python API + + Args: + bioprojects: List of BioProject accessions + + Returns: + List of tuples (assembly_accession, organism_name) + """ + try: + from ncbi.datasets.metadata.genome import get_assembly_metadata_by_bioproject_accessions + except ImportError: + print("Error: ncbi-datasets-pylib not installed", file=sys.stderr) + print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr) + sys.exit(1) + + assemblies = [] + + print(f"Fetching assembly information for {len(bioprojects)} BioProject(s)...") + print("") + + for assembly in get_assembly_metadata_by_bioproject_accessions(bioprojects): + acc = assembly.accession + name = assembly.organism.organism_name + assemblies.append((acc, name)) + print(f" {name}: {acc}") + + print(f"\nFound {len(assemblies)} assemblies") + + return assemblies + + +def main(): + parser = argparse.ArgumentParser( + description="Download genomes from NCBI using BioProject or Assembly accessions" + ) + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + "--bioprojects", + nargs="+", + help="BioProject accessions (e.g., PRJNA12345 PRJEB67890)" + ) + group.add_argument( + "--assemblies", + nargs="+", + help="Assembly accessions (e.g., GCA_123456789.1 GCF_987654321.1)" + ) + + parser.add_argument( + "-o", "--output", + default="genomes.zip", + help="Output zip file name (default: genomes.zip)" + ) + + parser.add_argument( + "--list-only", + action="store_true", + help="List assemblies without downloading (BioProject mode only)" + ) + + args = parser.parse_args() + + 
if args.bioprojects: + assemblies = get_bioproject_assemblies(args.bioprojects) + + if args.list_only: + print("\nAssembly accessions (use with --assemblies to download):") + for acc, name in assemblies: + print(acc) + return + + # Download assemblies + assembly_accs = [acc for acc, name in assemblies] + success = download_using_cli(assembly_accs, args.output) + + elif args.assemblies: + success = download_using_cli(args.assemblies, args.output) + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/phylo_from_buscos/scripts/extract_orthologs.sh b/skills/phylo_from_buscos/scripts/extract_orthologs.sh new file mode 100755 index 0000000..a7fe762 --- /dev/null +++ b/skills/phylo_from_buscos/scripts/extract_orthologs.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# Extract and reorganize single-copy orthologs from compleasm output +# +# Usage: bash extract_orthologs.sh LINEAGE_NAME +# Example: bash extract_orthologs.sh metazoa +# +# Author: Bruno de Medeiros (Field Museum) +# Based on tutorials by Paul Frandsen (BYU) + +if [ $# -lt 1 ]; then + echo "Usage: bash extract_orthologs.sh LINEAGE_NAME" + echo " Example: bash extract_orthologs.sh metazoa" + exit 1 +fi + +LINEAGE="$1" + +echo "Extracting single-copy orthologs for lineage: ${LINEAGE}" + +# Create directory for ortholog FASTA files +mkdir -p single_copy_orthologs + +# Copy gene_marker.fasta files and rename by species +count=0 +for dir in 01_busco_results/*_compleasm; do + if [ ! -d "${dir}" ]; then + continue + fi + + genome=$(basename "${dir}" _compleasm) + + # Auto-detect the OrthoDB version (odb10, odb11, odb12, etc.) + odb_dirs=("${dir}/${LINEAGE}_odb"*) + if [ -d "${odb_dirs[0]}" ]; then + marker_file="${odb_dirs[0]}/gene_marker.fasta" + else + echo " Warning: No OrthoDB directory found for ${genome}" >&2 + continue + fi + + if [ -f "${marker_file}" ]; then + cp "${marker_file}" "single_copy_orthologs/${genome}.fasta" + echo " Extracted: ${genome}" + count=$((count + 1)) + else + echo " Warning: Marker file not found for ${genome}" >&2 + fi +done + +if [ ${count} -eq 0 ]; then + echo "Error: No gene_marker.fasta files found. Check lineage name." >&2 + exit 1 +fi + +echo "Extracted ${count} genomes" +echo "" +echo "Now generating per-locus unaligned FASTA files..." 
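+
+# Sketch of the regrouping performed below (file and locus names are
+# hypothetical examples):
+#   single_copy_orthologs/Genus_species.fasta with header ">100at33208_xxx"
+#   becomes unaligned_aa/100at33208.fas with header ">Genus_species"
+# i.e. one FASTA per species is split into one FASTA per BUSCO locus, and each
+# sequence is renamed to the species it came from.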
+ +cd single_copy_orthologs || exit 1 +mkdir -p unaligned_aa +cd unaligned_aa || exit 1 + +# AWK script to split by ortholog ID +awk 'BEGIN{RS=">"; FS="\n"} { + if (NF > 1) { + split($1, b, "_"); + fnme = b[1] ".fas"; + n = split(FILENAME, a, "/"); + species = a[length(a)]; + gsub(".fasta", "", species); + print ">" species "\n" $2 >> fnme; + close(fnme); + } +}' ../*.fasta + +# Fix headers +if [[ "$OSTYPE" == "darwin"* ]]; then + # macOS + sed -i '' -e 's/.fasta//g' *.fas +else + # Linux + sed -i -e 's/.fasta//g' *.fas +fi + +num_loci=$(ls -1 *.fas 2>/dev/null | wc -l) +echo "Unaligned ortholog files generated: ${num_loci} loci" +echo "" +echo "Output directory: single_copy_orthologs/unaligned_aa/" diff --git a/skills/phylo_from_buscos/scripts/generate_qc_report.sh b/skills/phylo_from_buscos/scripts/generate_qc_report.sh new file mode 100755 index 0000000..0f8bb74 --- /dev/null +++ b/skills/phylo_from_buscos/scripts/generate_qc_report.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# Quality control report generator for compleasm results +# +# Usage: bash generate_qc_report.sh [output_file.csv] +# +# Author: Bruno de Medeiros (Field Museum) +# Based on tutorials by Paul Frandsen (BYU) + +OUTPUT_FILE="${1:-qc_report.csv}" + +echo "Genome,Complete_SCO,Fragmented,Duplicated,Missing,Completeness(%)" > "${OUTPUT_FILE}" + +count=0 +for dir in 01_busco_results/*_compleasm; do + if [ ! -d "${dir}" ]; then + continue + fi + + genome=$(basename "${dir}" _compleasm) + summary="${dir}/summary.txt" + + if [ -f "${summary}" ]; then + # Parse completeness statistics from compleasm format + # compleasm uses: S: (single-copy), D: (duplicated), F: (fragmented), M: (missing) + # Format: "S:80.93%, 2283" where we need the count (2283) + complete=$(grep "^S:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ') + duplicated=$(grep "^D:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ') + fragmented=$(grep "^F:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ') + missing=$(grep "^M:" "${summary}" | awk -F',' '{print $2}' | tr -d ' ') + + # Check if all values were successfully extracted + if [ -z "${complete}" ] || [ -z "${fragmented}" ] || [ -z "${missing}" ]; then + echo "Warning: Could not parse statistics for ${genome}" >&2 + continue + fi + + # Calculate completeness percentage (Complete / Total * 100) + total=$((complete + duplicated + fragmented + missing)) + if command -v bc &> /dev/null; then + completeness=$(echo "scale=2; (${complete} + ${duplicated}) / ${total} * 100" | bc) + else + # Fallback if bc not available + completeness=$(awk "BEGIN {printf \"%.2f\", (${complete} + ${duplicated}) / ${total} * 100}") + fi + + echo "${genome},${complete},${fragmented},${duplicated},${missing},${completeness}" >> "${OUTPUT_FILE}" + count=$((count + 1)) + else + echo "Warning: Summary file not found for ${genome}" >&2 + fi +done + +if [ ${count} -eq 0 ]; then + echo "Error: No compleasm output directories found (*_compleasm)" >&2 + exit 1 +fi + +echo "QC report generated: ${OUTPUT_FILE}" +echo "Genomes analyzed: ${count}" diff --git a/skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/ALICUT_V2.31.pl b/skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/ALICUT_V2.31.pl new file mode 100755 index 0000000..b0db02f --- /dev/null +++ b/skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/ALICUT_V2.31.pl @@ -0,0 +1,742 @@ +#!/usr/bin/perl +use strict ; +use File::Copy ; +use Tie::File ; +use Fcntl ; +use Term::Cap ; +use Term::ANSIColor qw(:constants); +use Getopt::Std ; + +# updated on 13th 
february , 2009 by patrick kck +# updated on 2nd april , 2009 by patrick kck +# updated on 15th june , 2009 by patrick kck +# updated on 26th july , 2009 by patrick kck +# updated on 7th september, 2011 by patrick kck (alicut v2.3) +# updated on 22.2.2017, by patrick kck (alicut v2.31) -> correction of initial warning due to line 547, changed some terminal prints, argv handling commands + +my @answer_remain_stems = ( 'no', 'yes' ) ; +my @answer_codons = ( 'no', 'yes' ) ; +my @answer_third_pos = ( 'no', 'yes' ) ; + +&argv_handling ( \@answer_remain_stems, \@answer_codons, \@answer_third_pos ) ; +&menu ( \@answer_remain_stems, \@answer_codons, \@answer_third_pos ) ; + + + +sub argv_handling{ + + my $aref_remain_stems = $_[0] ; + my $aref_codons = $_[1] ; + my $aref_third_pos = $_[2] ; + + my ( $commandline ) = join "", @ARGV ; + + $commandline =~ s/ |\s+// ; + my @commands = split "-", $commandline ; + shift @commands ; + + for my $single_command ( sort @commands ){ + + if ( $single_command =~ /^r$/i ) { @$aref_remain_stems = ( reverse @$aref_remain_stems) } + elsif ( $single_command =~ /^c$/i ) { @$aref_codons = ( reverse @$aref_codons ) } + elsif ( $single_command =~ /^3$/i ) { @$aref_third_pos = ( reverse @$aref_third_pos ) } + elsif ( $single_command =~ /^h$/i ) { &help } + elsif ( $single_command =~ /^p$/i ) { &preface } + elsif ( $single_command =~ /^s$/i ) { + &header ; + &commands( \$aref_remain_stems->[0], \$aref_codons->[0], \$aref_third_pos->[0]) ; + &start (\$aref_remain_stems->[0], \$aref_codons->[0], \$aref_third_pos->[0]) + } + else { print "\n\t!COMMAND-ERROR!: unknown command \"-", $single_command, "\"\n" } + } + + &menu ( \@$aref_remain_stems, \@$aref_codons, \@$aref_third_pos) +} + +sub header{ + + printf "\n%68s\n", "------------------------------------------------------------" ; + printf "%49s\n" , "Welcome to ALICUT V2.31 !" ; + printf "%60s\n" , "a Perlscript to cut ALISCORE identified RSS" ; + printf "%57s\n" , "written by Patrick Kueck (ZFMK, Bonn)" ; + printf "%68s\n\n", "------------------------------------------------------------" ; +} + +sub commands{ + + my $sref_rem_stems = $_[0] ; + my $sref_reo_codon = $_[1] ; + my $sref_th_posit = $_[2] ; + + print "\n\t------------------------------------------------------------" ; + print "\n\tRemain Stem Position :\t", $$sref_rem_stems ; + print "\n\tRemove Codon :\t", $$sref_reo_codon ; + print "\n\tRemove 3rd Position :\t", $$sref_th_posit ; + print "\n\t------------------------------------------------------------\n" ; +} + +sub help{ + + print + < return (via Menu) or + Type (via command line) + + + + R-Option (Remain Stems) + ------------------------------------------------------------------- + To remain all stem positions of identified rss within FASTA file(s): + + Type (via Menu) + Type (via command line) + + + + C-Option (Remove Codon) + ------------------------------------------------------------------- + To translate ALISCORE identified RSS positions of amino-acid data + into nucleotide triplet positions before exclusion of randomised + sequence sections: + + Type return return (via Menu) or + Type (via command line) + + Note: + This option is only useful if you have analysed amino-acid + data, but wish to exclude nucleotide positions from the amino-acid + data corresponding nucleotide data. + Be aware, that the name of the nucleotide data file has to be named + equal to the ALISCORE analysed amino-acid data file. The C-option + can not be applied on amino-acid sequences. 
Otherwise, ALICUT + excludes the original ALISCORE identified sequence sections. + + + + 3-Option (Remove 3rd position) + ------------------------------------------------------------------- + To remove ALISCORE identified RSS only if its sequence position is + up to amultiple of 3: + + Type <3> (via Menu) + Type (via command line) + + Note: + The 3-Option can be combined with the C-option. In this case, + positions of the ALISCORE "List" outfile(s) are translated into + codon positions from which only the 3rd positions are excluded. + The 3-Option can only be applied on nucleotide data. Otherwise, + ALICUT excludes the original ALISCORE identified sequence sections. + + + + ALICUT IN and OUT files + ------------------------------------------------------------------- + ALICUT V2.3 needs the original ALISCORE FASTA infile(s) and "List" + outfile(s) in the same folder as ALICUT V2.3. + + The "List" outfile(s) must contain the identified RSS positions + in one single line, separated by whitespace. + + e.g. 1 3 5 6 8 9 10 11 123 127 10000 10001 + + ALICUT V2.0 can handle unlimited FASTA files in one single run. + The sole condition is that the Prefix of the ALISCORE "List" + outfile(s) are identic with the associated FASTA infile(s). + ALICUT V2.3 first searches for the ALISCORE "List" outfile(s), + removes the Suffix "_List_random.txt" and searches for the + "List" associated FASTA file(s). + + e.g. COI.fas_List_random.txt (ALISCORE "List" outfile) + COI.fas (Associated FASTA infile) + + If both files are detected, ALICUT V2.3 excludes the RSS identified + positions of the "List" file(s) in the associated + FASTA file(s) and saves the changes in a new FASTA outfile, + named "ALICUT_FASTAinputname.fas". + + Under the C- and 3-Option, removed sequence positions differ from + the original "List" position numbers. Under both options, ALICUT + prints the actually removed positions in separate "ALICUT_LIST" + outfile(s). + + ALICUT V2.3 generates also an info file "ALICUT_info". This file + informs about the number and percentage of removed positions, number + of single sequences, single parameter settings, and sequence states + of each restricted FASTA file. + If structure sequences are identified by ALICUT, ALICUT generates + structure info file(s) which lists remaining stem pairs and loop + positions, as well as percentages of both structure elements. + + ------------------------------------------------------------------- + ------------------------------------------------------------------- + + +info +; + + print "\tBACK to ALICUT MAIN-Menu:\t\t type \n" ; + print "\n\t------------------------------------------------------------\n\t" ; + + chomp ( my $answer_xy = ); + + &menu ; + +} + +sub preface{ + +print +<\n" ; + print "\n\t------------------------------------------------------------\n\t" ; + + chomp ( my $answer_xy = ); + + &menu; +} + +sub menu{ + + my $aref_remain_stems = $_[0] ; + my $aref_remove_codon = $_[1] ; + my $aref_third_posit = $_[2] ; + + &header ; + + print "\n\tSTART ALICUT:\t\ttype " ; + print "\n\tQUIT ALICUT:\t\ttype " ; + print "\n\tREMAIN STEMS:\t\ttype " ; + print "\n\tREMOVE CODON:\t\ttype " ; + print "\n\tREMOVE 3rd:\t\ttype <3> " ; + print "\n\tHELP:\t\t\ttype " ; + print "\n\tPREFACE:\t\ttype
<p>
" ; + + &commands ( \$aref_remain_stems->[0], \$aref_remove_codon->[0], \$aref_third_posit->[0] ); + + my $answer_opening = &commandline ; + + until ( $answer_opening =~ /^s$|^r$|^c$|^p$|^h$|^1$|^2$|^q$|^3$/i ){ + + print "\n\t!COMMAND-ERROR!: unknown command \"$answer_opening\"!\n" ; + + $answer_opening = &commandline ; + } + + $answer_opening =~ /^s$/i and do { &start ( \$aref_remain_stems->[0], \$aref_remove_codon->[0], \$aref_third_posit->[0] ) } ; + $answer_opening =~ /^r$/i and do { @$aref_remain_stems = (reverse @$aref_remain_stems ); &menu } ; + $answer_opening =~ /^c$/i and do { @$aref_remove_codon = (reverse @$aref_remove_codon ); &menu } ; + $answer_opening =~ /^3$/i and do { @$aref_third_posit = (reverse @$aref_third_posit ); &menu } ; + $answer_opening =~ /^q$/i and do { exit } ; + $answer_opening =~ /^h$/i and do { &help } ; + $answer_opening =~ /^1$/ and do { &error1 } ; + $answer_opening =~ /^2$/ and do { &error2 } ; + $answer_opening =~ /^p$/i and do { &preface } +} + +sub start{ + + my $sref_stems_remain = $_[0] ; + my $sref_codon_remove = $_[1] ; + my $sref_third_remove = $_[2] ; + + my $j = 0 ; + + open OUTinfo, ">>ALICUT_info.xls" ; + print OUTinfo "\nUsed List File\tUsed Fasta file\tremove triplets\tremove 3rd position\tnumber taxa\tbp before\tbp after\tremaining bp [%]\tsequence type\n" ; + + + + # Read IN of all List_random.txt files within the same folder as ALICUT and handle it + READING: + foreach my $file ( <*List_*.txt> ) { + + # Set counter +1 + $j++; + + + + # Read in of the ALISCORE-list outfile + &tie_linefeeds ( \$file ) ; + ( open IN, "<$file" ) or die "n\t!FILE-ERROR!: Can not open listfile $file!\n" ; + my $line = ; chomp $line ; + + # check for correct aliscore list format + unless ( $line =~ /^(\d+ )+\d+$|^\d+$/ ) { warn "\t!FILE-WARN!: $file has no ALISCORE list format!\n" ; next READING } + + # Total number of randomized identified positions + my @cut_positions = split " ", $line ; close IN ; + + + + # "filename.fas_List_random.txt" to "filename.fas" + ( my $file_fasta = $file ) =~ s/_List_.+// ; + + # Read in of the original ALISCORE fasta infile which belongs to the listfile + &tie_linefeeds ( \$file_fasta ) ; + ( open INfas, "<$file_fasta" ) or warn "\t!FILE-WARN!: Can not find $file_fasta!\n" and next READING ; + + chomp ( my @inputfile = ) ; close INfas ; + warn "\t!FILE-WARN!: File $file_fasta is empty!\n" if 0 == @inputfile and next READING ; + + # Handle the FASTA file in the way that sequencename and sequence alternate in each line + @inputfile = fas_bearbeiten ( @inputfile ) ; + + # Generate a hash: key=>taxon, value => sequenz + my %sequence = @inputfile ; + my @values = values %sequence ; + + # Determine basepositions before und after cut. 
Output of cuttings as total number and in percent + my $number_sequences = keys %sequence ; + my $number_characters_before = length $values[0] ; + + + + + + + # Check for correct FASTA format and handling of structure sequence + my $sequence_state = 'nt' ; + SEQUENCE_CHECK: + for my $raw_taxon ( keys %sequence ){ + + # if whitespace are between ">" and the next sign within a sequence name, delete these whitespaces + $raw_taxon =~ s/^\>\s*/\>/g ; + + # if whitespaces between last sign and newline in sequence name, delete these whitespaces + $raw_taxon =~ s/\s*$//g ; + + die "\n\t!FILE-ERROR!: $raw_taxon in $file_fasta is not in FASTA format!\n" if $raw_taxon !~ /^\>/ ; + die "\n\t!FILE-ERROR!: Sequence name missing in $file_fasta!\n" if $raw_taxon =~ /^\>$/ ; + die "\n\t!FILE-ERROR!: Sequence name $raw_taxon in $file_fasta involves forbidden signs!\n" if $raw_taxon !~ /\w/ ; + die "\n\t!FILE-ERROR!: Sequences of $file_fasta have no equal length!\n" if length $sequence{$raw_taxon} != $number_characters_before ; + die "\n\t!FILE-ERROR!: Sequence missing in $file_fasta!\n" if $sequence{$raw_taxon} =~ /^\n$|^$/ ; + die "\n\t!FILE-ERROR!: Sequence length in $file_fasta is too short to cut all positions!\n" if $number_characters_before < $cut_positions[ $#cut_positions ] ; + + + + # Structure handling + if ( $sequence{$raw_taxon} =~ /.*\(.*\).*/ ){ + + $sequence{$raw_taxon} =~ s/-/./g ; + my @strc_elements = split "" , $sequence{$raw_taxon} ; + + for my $str_sign ( @strc_elements ){ + + unless ( $str_sign =~ /\(|\)|\./ ){ die "\n\t!FILE-ERROR!: Structure string of $file_fasta involves forbidden signs in $raw_taxon!\n" } + } + + my $structurestring = $sequence{$raw_taxon} ; + $structurestring =~ s/-/./g ; + $sequence{$raw_taxon} = &structure_handling ( \$structurestring, \$$sref_stems_remain, \@cut_positions, \$file_fasta ); next SEQUENCE_CHECK ; + } + + + + # Check for correct sequence states + $sequence{$raw_taxon} =~ s/(\w+)/\U$1/ig ; + my @seq_elements = split "" , $sequence{$raw_taxon} ; + + for my $seq_sign ( @seq_elements ){ + + unless ( $seq_sign =~ /A|C|G|T|U|-|N|Y|X|R|W|S|K|M|D|V|H|B|Q|E|I|L|F|P|\?/ ){ die "\n\t!FILE-ERROR!: Sequence of $file_fasta involves forbidden signs in $raw_taxon!\n" } + } + + if ( $sequence{$raw_taxon} =~ /I|E|L|Q|F|P/ ) { $sequence_state = 'aa' } + } + + + + + + + + + # Translate cut positions + my @fasta_cut; + &translate_cut_positions( \$$sref_codon_remove, \$$sref_third_remove, \@cut_positions, \$number_characters_before, \@fasta_cut, \$sequence_state, \$file_fasta ); + + + # Calculate percent of remaining positions + my $number_cut_positions = @cut_positions ; + my $number_characters_after = $number_characters_before-$number_cut_positions ; + + my $percent_left = sprintf "%.1f", ( $number_characters_after / $number_characters_before ) * 100 ; + $percent_left =~ s/\./,/g ; + + + # Assume uncut positions to $final and print out to ALICUT_$file_fasta + if ( $$sref_codon_remove =~ /yes/ && $$sref_third_remove =~ /yes/ ){ open OUT, ">ALICUT_codon_3rd_$file_fasta" } + elsif ( $$sref_codon_remove =~ /yes/ && $$sref_third_remove =~ /no/ ){ open OUT, ">ALICUT_codon_$file_fasta" } + elsif ( $$sref_codon_remove =~ /no/ && $$sref_third_remove =~ /yes/ ){ open OUT, ">ALICUT_3rd_$file_fasta" } + else { open OUT, ">ALICUT_$file_fasta" } + + for ( keys %sequence ){ + + my @bases = split "", $sequence{$_} ; + my @final = map { $bases[$_] } @fasta_cut ; + my $final = $_."\n".( join "", @final )."\n" ; + + print OUT "$final" ; + } + close OUT; + + + + # Print Out of extra 
infos to ALICUT_info + print OUTinfo "$file\t$file_fasta\t$$sref_codon_remove\t$$sref_third_remove\t$number_sequences\t$number_characters_before\t$number_characters_after\t$percent_left\t$sequence_state\n" ; + print "\tDone : $file cut to ALICUT_$file_fasta\n" + } + + close OUTinfo ; + + + # Print OUT number of right handled FASTA files in relation to total number of files + printf "\n%68s\n", "------------------------------------------------------------" ; + printf "%42s\n", "$j FASTA file(s) correctly handled!" ; + printf "%57s\n", "Further infos are printed out in Alicut_info.txt!" ; + printf "\n%63s\n", "ALICUT V2.0 Finished! Thank you and good bye!" ; + printf "%68s\n", "------------------------------------------------------------" ; + + + &set_timer ; + exit ; + + sub tie_linefeeds{ + + my $sref_filename = $_[0] ; + + ( open IN , "<$$sref_filename" ) or warn "\tError: can not open $$sref_filename!\n" and next READING ; + + (tie ( my @data, 'Tie::File', $$sref_filename )) ; + + warn "\t!FILE-WARN!: $$sref_filename is empty!\n" and next READING if 0 == @data ; + + map { s/\r\n/\n/g } @data ; + map { s/\r/\n/g } @data ; + + untie @data ; close IN ; + + } + + sub set_timer{ + + my ( $user, $system, $cuser, $csystem ) = times ; + +print <ALICUT_cut_positions_codon.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_codon.txt" ; + print OUTnewcut $string_cutnumbers ; close OUTnewcut ; + } + + else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tCodon positions not translated!" } + } + + # Translate identified RSS aminoacid positions to nucleotide triplet positions, but remove only third position + elsif ( $$sref_command_codon_remove =~ /yes/ && $$sref_command_third_remove =~ /yes/){ + + unless ( $$sref_sequence_state =~ /aa/ ){ + + my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = (); + for my $number( @fasta_old ){ + + push @$aref_cut_positions, ($number*3) + } + + my $string_cutnumbers = join " ", @$aref_cut_positions ; + open OUTnewcut, ">ALICUT_cut_positions_codon_3rd.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_codon_3rd.txt" ; + print OUTnewcut $string_cutnumbers ; close OUTnewcut ; + } + + else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tCodon positions not translated!\n\t3rd codon position not removed!" } + } + + # Remove only identified RSS if third position of original sequence + elsif ( $$sref_command_codon_remove =~ /no/ && $$sref_command_third_remove =~ /yes/){ + + unless ( $$sref_sequence_state =~ /aa/ ){ + + my @fasta_old = @$aref_cut_positions ; @$aref_cut_positions = (); + for my $number( @fasta_old ){ + + if ( $number % 3 == 0 ){ push @$aref_cut_positions, $number } + } + + my $string_cutnumbers = join " ", @$aref_cut_positions ; + open OUTnewcut, ">ALICUT_cut_positions_3rd.txt" or die "\n\t!FILE-ERROR!: Can not open File ALICUT_cut_positions_3rd.txt" ; + print OUTnewcut $string_cutnumbers ; close OUTnewcut + } + + else { warn "\n\t!FILE-WARN!: $$sref_filename include aa sequences!\n\tNot only 3rd codon position removed!" } + } + + + # Examine remaining positions + my ( %seen, @zahlenreihe ) ; + for ( 1 .. 
$$sref_number_characters ) { push @zahlenreihe, $_-1 } + + for my $value ( @$aref_cut_positions ){ $seen{$value-1}++ } + for ( @zahlenreihe ){ unless ( $seen{$_} ){ push @$aref_remaining_positions, $_ } } + } +} + +sub fas_bearbeiten{ + + my @infile = @_ ; + + grep s/(\>.*)/$1\t/, @infile ; + grep s/ //g, @infile ; + grep s/\n//g, @infile ; + grep s/\t/\n/g, @infile ; + grep s/\>/\n\>/g, @infile ; + my $string = join "", @infile ; + @infile = split "\n", $string ; + shift @infile ; + return @infile ; +} + +sub structure_handling{ + + my $sref_string = $_[0] ; + my $sref_answer_remain = $_[1] ; + my $aref_cut_positions = $_[2] ; + my $sref_filename = $_[3] ; + + my ( + + @pair_infos , + @forward , + @structurestring , + @loops , + @pairs , + %structure_of_position , + %seen_struc + + ); + + + # Stem assignment + my @structures = split "", $$sref_string ; + my $i = 0 ; + CHECKING: + for ( @structures ){ $i++ ; + + SWITCH: + $structure_of_position{$i} = $_ ; + + if ( $_ =~ /\(/ ){ push @forward, $i and next CHECKING } + if ( $_ =~ /\)/ ){ my $pair_1 = pop @forward; push @pairs, ( $pair_1, $i ); push @pair_infos, ( $pair_1.":".$i ); next CHECKING } + if ( $_ =~ /\./ ){ push @loops, $i and next CHECKING } + } + + @pair_infos = reverse @pair_infos ; + + + + + # Generate listfiles for structure_info file + my $pairlist = join "\n\t\t\t\t\t", @pair_infos ; + my $looplist = join "\n\t\t\t\t\t", @loops ; + + + # Number and proportion of stem and loop positions for structure info file + my $N_total = @structures ; + my $N_stems = @pair_infos ; + my $N_loops = $N_total - ( $N_stems * 2 ) ; + my $P_loops = ( $N_loops / $N_total ) * 100 ; + my $P_stems = 100 - $P_loops ; + + + # Open structure info outfile + open OUTstruc, ">ALICUT_Struc_info_${$sref_filename}.txt" ; + + # Print out + print OUTstruc "\nOriginal structure information identified in $$sref_filename:\n\n" ; + print OUTstruc "- Number of characters:\t\t\t$N_total\n" ; + print OUTstruc "- Number of single loop characters:\t$N_loops [$P_stems %]\n" ; + print OUTstruc "- Number of paired stem characters:\t$N_stems [$P_loops %]\n" ; + print OUTstruc "\n- Paired stem positions:\t\t$pairlist\n\n" ; + print OUTstruc "\n- Loop positions:\t\t\t$looplist\n" ; + + close OUTstruc; + + if ( $$sref_answer_remain =~ /yes/i ){ + + my @cut_positions2 = (); + + # Remain rss identified stem positions within the MSA + for ( @pairs ){ $seen_struc{$_} = 1 } + for ( @$aref_cut_positions ){ unless ( $seen_struc{$_} ){ push @cut_positions2, $_ } } + @$aref_cut_positions = @cut_positions2 ; + } + + else{ + + my %pair = @pairs; + + # Replace paired structure positions of rss identified positions by dots + for my $bp_for ( keys %pair ){ + + for my $rss ( @$aref_cut_positions ){ + + if ( $bp_for == $rss ){ $structure_of_position{$pair{$bp_for}} = "." ; last } + if ( $pair{$bp_for} == $rss ){ $structure_of_position{$bp_for} = "." 
; last } + } + } + } + + for ( my $k=1; $k<=@structures-1; $k++ ){ push @structurestring, $structure_of_position{$k} } + my $structure_string_neu = join "", @structurestring ; + return $structure_string_neu ; + +} + +sub commandline{ + + print "\n\tCOMMAND:\t " ; + + chomp ( my $sub_answer_opening = ); + + print "\n\t------------------------------------------------------------\n" ; + + return $sub_answer_opening; +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore.02.2.pl b/skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore.02.2.pl new file mode 100755 index 0000000..f102137 --- /dev/null +++ b/skills/phylo_from_buscos/scripts/predownloaded_aliscore_alicut/Aliscore.02.2.pl @@ -0,0 +1,1271 @@ +#!/usr/bin/perl + +#written by Bernhard Misof, ZFMK, Bonn +#version of 20th February 2012 + +#updated by Bernhard Misof, 4th March 2008 +#updated by Bernhard Misof, 7th March 2008 +#updated by Bernhard Misof,11th March 2008 +#updated by Bernhard Misof,26th March 2008 +#updated by Bernhard Misof, 2nd April 2008 +#updated by Bernhard Misof, 6th May 2008 +#updated by Bernhard Misof, 6th May 2008 => -e for nt sequences! -e for nt sequences disables N replacement for fuzzy ends of sequences! +#updated by Bernhard Misof, 20th February 2012 => svg output added +#updated by Bernhard Misof, 21th February 2012 => RY input files possible + + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. +# + + +use strict ; +use warnings ; +use Aliscore_module ; +use Tie::File ; +use Fcntl ; + + +#converts different line feeds in open process +use open IN => ":raw", OUT => ":raw" ; + + + +=pod + +=head1 Introduction + +Aliscore is designed to filter alignment ambiguous or randomly similar sites in multiple sequence alignments (MSA). It does not generate a generic alignment, this must be provided by the user. Aliscore reads exclusively alignments in FASTA format independently of suffices (.fas .txt .fts etc.). Aliscore reads the alignment and generates a hash of these sequences with taxon names as keys and simple sequence arrays as values. It works on these hash elements and uses these hasj elements as the basic data. Aliscore tolerates newlines in sequences but not in taxon names. Sequences must be of similar length! Aliscore can not read sequences in interleaved format, but this does not correspond to a plain fasta file anyway. Blanks in sequences are ignored, any other sign in sequences except for these covered by the universal DNA/RNA code will chock the program. Ambiguities are understood, as are indels. Kapital or small letters are equally good as input and can be used interchangeably, RNA and DNA sequences can be used in one alignment, RNA sequences are translated into DNA sequences. 
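+A minimal sketch (not part of Aliscore itself) of the hash layout described above, with taxon names as keys and per-site sequence arrays as values, assuming two aligned sequences:
+
+ my %FASTA = (
+ 'Podura_aquatica' => [ split //, 'ACGT-ACGT' ],
+ 'Sminthurus_sp' => [ split //, 'ACGTNACGT' ],
+ );
+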
+Aliscore works on Windows PCs, Macs and Linux machines, but was written on Linux. If input files are coming from Windows make sure CRLF line feeds are removed. Aliscore tries to remove them, but may not succeed in every instance.
+Taxon names must only include alphanumeric signs, underscores (_) and blanks; everything else might choke the program. Aliscore will issue an error prompt and die if any non-alphanumeric sign is encountered in taxon names. If used with the outgroup option avoid blanks in names, as this might lead to erroneous recognition of taxon names.
+Aliscore will write results into its own folder. It will produce two files: one file with the consensus profile, and one file with a list of characters with negative scores in this profile.
+
+=over
+=item
+
+ example of an input file:
+
+ >Podura aquatica 18S_1
+ aaagtctgtgacgttgtacggact
+ gcgtgtgcagctgtgacgggcgcc
+ >Sminthurus_sp
+ AUTGCTugccguuugaucgugugc
+ UUGGACUGCGUCGATCGUUGCGCG
+
+=back
+
+=head1 Usage
+
+Aliscore knows several options; it chokes if an unknown option is encountered. Make sure you write the input options correctly, for example -w 4 and not -w4 or -w_4, etc.; likewise do not (!) use -in infile, or in infile, or -i_infile; these are all wrong input formats and will cause the program to die. It will still try to open an "n infile" or "_infile", which is hopefully not present; it will also tell you this.
+
+=over
+
+=item *
+
+-N option: without invoking the -N option gaps are treated as a 5th character. With the -N option invoked gaps are treated as ambiguous characters. Leading and trailing gaps of sequences are always interpreted as ambiguous characters with and without the -N option. Interpreting gaps as ambiguous characters results in a loss of long indel sections consistently found in the majority of taxa. This means that well aligned expansion segments in rDNA sequences, which are not present in other taxa, will be lost if not commonly found in the MSA. Interpreting gaps as a 5th character interprets stretches of indels as well aligned sections.
+
+
+=item *
+
+-w # option: without invoking the -w # option, Aliscore will use the default window size of 4 for the sliding window. You may choose any other window size, smaller or larger, but it does not make sense to choose something smaller than 4. If you vote for a window size much larger than 4, Aliscore will become increasingly blind to small stretches of randomly similar sections (see paper on Aliscore performance). If you vote for a window size <4, Aliscore will start making substantial type I errors and call non-randomly similar sites randomly similar, depending on their neighbors.
+
+
+=item *
+
+-r # option: if -r is used without an argument, 4*N random pairs are compared, checking for replications (which are avoided). If -r is used with an argument, this number of randomly selected pairs is analysed and used to infer the consensus profile; if -r is used with an argument beyond the maximal number of possible non-overlapping pairs, only the maximal number of pairs is compared. If the -r option and the -t option are not used, random pairs are compared as default, with 4*N selection of pairs.
+
+
+=item *
+
+-t treefile option: -t must be used with a tree file in Newick format, rooted or unrooted. The tree file should be in the same folder as the sequence file (not mandatory). If there is more than one tree in the tree file, only the first one will be read; all other trees will be ignored.
Aliscore will read the tree and store as a hash with node levels as keys and taxa as values for each node. Aliscores uses this tree to work through the MSA from tips to bottom of tree. First, sister groups of terminal taxa are identified (node lists, level 1 as key) and compared, these taxa are then replaced by consensus sequences using the ambiguity code. Consensus sequences represent now the new set of terminal taxa with which Aliscore proceeds. This process is repeated until every possible pair of sequences within the tree is evaluated. Make shure that your tree does not contain CRFL from Windows if working on Linux! + + +=item * + +-l # option: -l can be used to restrict iterating through the tree to a specific node level, specified with the argument at the -l option. If -l 1 is used only primary sister group relationships are used to infer the consensus profile. If there are less node levels then arguments, Aliscore iterates through the tree and stops. + + +=item * + +-s option: -s option can be used to generate a strict profile from all single comparisons. This profile will be very conservative because it scores every site as negative which exhibits a negative score in one single profile already. This option does not make to much sense, do not use it on purpose! + + +=item * + +-o taxon,taxon,.. option: the -o option is used with a list of taxa separated by commatas. These taxa will be compared with all other taxa not in this list, but not with each other. It can be used to assess the range of randomness between outgroup taxa and ingroup taxa, or between every two groups of interest, if the alignment is restricted to ingroup taxa only before analysis. + +=back + +=cut + + +=pod + +=head1 Interna + +Details and comments are given in order of its appearance in code. + + +=head2 Input + +Input Arguments are collected into a 1-dimensional array and grep is used to retrieve options plus arguments; +white spaces are cleaned off, and array is created by splitting input string at -; +If you use taxon names with white spaces in -o option you might run into problems. + + for example: + our ($file)=grep /^i.+/,@INPUT;$file=~s/(^i)//; + +=cut + + +#print $USAGE if infile or switches are missing + +#declare and initialize variables + + our $transcript = '-' ; + our $sequence_length = '-' ; + our @variability = () ; + our $invariant = 0 ; + our @invar_sections = () ; + our @temp = () ; + our $n_sections = 0 ; + our @section = () ; + + our @PAIRS_org = () ; + our @PAIRS = () ; + our @PAIRScores = () ; + our $range = '-' ; + our @Temp = () ; + our @Profile = () ; + our $position = () ; + our @Character_List = () ; + our $nchar = '-' ; + our @taboo = () ; + our $ref_scoring = () ; + our $threshhold = () ; + +# process switches + +#both subs assume the presence of global input variables listed above, they are therefore dependent on these and serve here just for improving readability +#Aliscore_module::help is used to provide short describtions of options and their use + +#reads input given by @ARGV and changes default parameter if defined + +our ( $file , + $random , + $window , + $option , + $tree , + $level , + $strict , + $outgroup , + $pairs , + $indels , + $group , + $strict_in , + $ingo , + $matrix + ) = Aliscore_module::get_options () ; + + +=pod + +=head2 Reading FASTA file + +Fasta file is read and stored as a hash with taxon names as keys and references to sequence arrays as values. Sequences are stored as flat list, each position constituting an element. 
Only references to these hash elements are returned from the subroutine. The reference to the hash is used as a global variable indicated by our, only the file name is used as argument for the subroutine to open and read the file; will die if file has not bee found. Aliscore understands DNA ambiguity code, there is no need to replaces these. Aliscore does not accept any sign except letters and indels in sequences. It will die if anything else is encountered in seqquences. + + command: + our ($ref_FASTA)=Alignment_alpha::readFASTA_simple($file); + +number of taxa and taxon names are collected into an array for later comparison +Aliscore attempts to estimate the data type, either nucleotide or amino acid data. Aliscore considers sequences whith an ACTG content of > 0.8 (without counting indels and N) as nucleotide sequences, if less then 0.8 as amino acid data. It estimates data property from every sequence, if two sequences are considered of different data type, Aliscore stops. Aliscore might stop if a single nucleotide sequence contains more then 0.2 ambiguities. In almost every case, Aliscore will correctly estimate data type, if it does not, it will stop and report on the problem. If the data contains sequences of more then 0.2 ambiguities, it might be advisable to recode ambiguities as N's or remove the particular sequence. +RNA sequences will be recoded to DNA sequences. Nucleotide data can be a mix of RNA/DNA data. + +=cut + + +print +< DNA","\n\n" if $transcript > 0 ; + + }; + +$type eq 'RY' and do { + + if ($option eq "N"){map {grep s/\-/N/,@$_} values %$ref_FASTA} + + CHECK: { + + $sequence_length = @$_ and last CHECK for values %$ref_FASTA ; + + } + }; + + +#replaces indels for X in amino acid sequences + +$type eq 'aa' and do { + + #map {grep s/\-/X/,@$_} values %$ref_FASTA; + + #reads sequence lengths + + CHECK: { + + $sequence_length = @$_ and last CHECK for values %$ref_FASTA; + + } + + }; + +=pod + +=head2 Reading data type and scoring matrix + +Reads data type and generates accordingly scoring matrix. In case of nucleotide data, the scoring matrix is a simple match mismatch scoring matrix, in case of ambiguous characters the mismatch is optimistically interpreted. If indels are considered 5th charaters, they are scored in a mismatch/match pattern. A BLOSUM62 is used for the amino acid scoring with indels and X scoring 0. For aminoacid scoring, a Monte Carlo approach is used to generate a threshhold value, given the actual window size and aminoacid composition of the data. + +=cut + +#gets scoring matrix depending on sequence type + +$ref_scoring = Aliscore_module::get_scoring ( $type, $matrix ); + +#for ( keys %$ref_scoring ) { print $_,"\t",$ref_scoring->{$_},"\n"};#exit; + +#creates cutoff value for aminoacid scoring using a delete half bootstrap plus MC resampling of scoring values, depending on window size + +$type eq 'aa' and do { + +print < 0 and do { + +print <$window, scores site patterns,records invariant sections and variant sections to score + +print <w+2 (w window size). Reports these sections and places information as an argument into subroutine later. This step improves speed, because only variable sections are actually scored for random similarity. A simple iteration through all sequence arrays is used to check variability of sites. 
A @temp array is used to create the list of variable sections, results are reported to terminal + +=cut + +$type eq 'nt' and do { + +VARIABILITY: for my $i (0..$sequence_length-1){ + + my %Patterns; + + for (values %$ref_FASTA){ $Patterns{$_->[$i]}++ }; + + my $variability = keys %Patterns; + + if (1 == $variability && 0 == grep /N|\-/,keys %Patterns){ + + push @section, $i; + } + + else { + + if (($window*2-2) <= @section){ + + push @invar_sections, (join",",@section); + + splice @section,$#section-($window-2);#print "@section\n";exit; + + push @temp, @section; + + $n_sections++ + } + + @section = (); + } + + $invariant++ if 1 == $variability && 0 == grep /N|\-/,keys %Patterns; + } +#print "@temp\n";exit; +#report on variability + +print < 0 and do { + +print < $extend: $n_sections + +SECTIONS + }; +# +$n_sections == 0 and do { + +print < $extend + +SECTIONS + }; +} + + +#creates array of variable sections + + for my $position ( 0..$sequence_length-1 ){ push @variability, $position if 0 == grep/^$position$/,@temp }; + +#removes positions > $sequence_length-($window-1) + + pop @variability until ( $variability[$#variability] <= $sequence_length-($window) ); + +}; + + +$type eq 'RY' and do { + +VARIABILITY: for my $i (0..$sequence_length-1){ + + my %Patterns; + + for (values %$ref_FASTA){ $Patterns{$_->[$i]}++ }; + + my $variability = keys %Patterns; + + if (1 == $variability && 0 == grep /N|\-/,keys %Patterns){ + + push @section, $i; + } + + else { + + if (($window*2-2) <= @section){ + + push @invar_sections, (join",",@section); + + splice @section,$#section-($window-2);#print "@section\n";exit; + + push @temp, @section; + + $n_sections++ + } + + @section = (); + } + + $invariant++ if 1 == $variability && 0 == grep /N|\-/,keys %Patterns; + } +#print "@temp\n";exit; +#report on variability + +print < 0 and do { + +print < $extend: $n_sections + +SECTIONS + }; +# +$n_sections == 0 and do { + +print < $extend + +SECTIONS + }; +} + + +#creates array of variable sections + + for my $position ( 0..$sequence_length-1 ){ push @variability, $position if 0 == grep/^$position$/,@temp }; + +#removes positions > $sequence_length-($window-1); + + pop @variability until ( $variability[$#variability] <= $sequence_length-($window) ); + +}; + +#exit; + +#execute MC process + +$type eq 'nt' and do { + +print <= @TAXA ; + +#outgroup option + if ($outgroup!~/-/){ + + for my $taxon (split"\,",$outgroup){grep s/^$taxon$//,@TAXA} + + for my $taxon (split"\,",$outgroup){for (@TAXA){push @PAIRS, ($taxon.",".$_) unless 0 == length($_)}} + } + +#generates set of all possible pairs + + else { + until (1 == @TAXA){ + + my $first = shift@TAXA; + + push @PAIRS_org, ($first.",".$TAXA[$_]) for(0..$#TAXA); + } + + my $number = @PAIRS_org; + +#draws randomly from this set of all possible pairs + +#report on number of pairs and random switch + +$random < $max_PAIRS and do { + +print <= $max_PAIRS and do { + +print <= $max_PAIRS) { + + @PAIRS=@PAIRS_org + } + +#if variable random is less then max number of pairs, array of pairs is filled randomly, and without replicates from all possible pairs + + else { + until ($random == @PAIRS){ + + my @pair_new = splice@PAIRS_org,int(rand($#PAIRS_org)),1; + + push @PAIRS, $pair_new[0]; + } + } + } + + +#starts scoring of sequence pairs, uses parsimony scoring subroutine + + +=pod + +=head3 Scoring + +For each entry in the pairs list, it uses the two taxon names to look in the data hash for both sequences and uses the subroutine C>> nuc/aa_score_two >> with the scoring type, flat list of 
variable characters and both sequence references as arguments. All arguments are provided as references. The scoring profile is returned as a reference. Description of the scoring process see Alignment_alpha.pm. The list of arguments must be in order, reference to the scoring type must be first. + +=cut + + + my $counter = 1; + + + for (@PAIRS){ + + my ($taxon1,$taxon2) = split"\,"; + + printf "\tpair: %-6.6s \-\>\ttaxon1: %-10.10s\ttaxon2: %-10.10s\n" , $counter, $taxon1, $taxon2 ; + + my $score; + +#subroutine nuc/aa_score_two delivers a reference to the scoring array for the two sequences, it expects as input four arguments, first an option, window size, and two sequences as 1-dimensional arrays + + $score = Aliscore_module::nuc_score_two ($ref_scoring,\@variability,$window,$$ref_FASTA{$taxon1},$$ref_FASTA{$taxon2}) if $type eq 'nt';#print "@$score\n";exit; + $score = Aliscore_module::nuc_score_two ($ref_scoring,\@variability,$window,$$ref_FASTA{$taxon1},$$ref_FASTA{$taxon2}) if $type eq 'RY';#print "@$score\n";exit; + $score = Aliscore_module::aa_score_two ($ref_scoring,$threshhold,$window,$$ref_FASTA{$taxon1},$$ref_FASTA{$taxon2}) if $type eq 'aa';#print "@$score\n";exit; + +#counts the frequency of minus scores in score array and reports + + my $count = grep /\-\d/,@$score; + + print "\t \tpositions below base line: ",$count,"\n"; + +#transfers the score as a string into score collector array + + push @PAIRScores, (join"\,",@$score); + + $counter++ + }#end of foreach pairs + + }#end of RANDOM +} +#end of random sampling of pairs, RANDOM +#_______________________________________________________________________________________________________________________________________ + + +=pod + +=head2 Scoring using tree based selection of pairwise comparisons + +A user provided tree, rooted or unrooted, but fully dichotomous must be provided by the user. This tree is used for selection of sequence pairs. First, terminal sister taxa are compared, then these sequence pairs are replace by one consensus sequence. Consequently, the next set of terminal sequence pairs might contain consensus sequences and primary sequences. Consensus sequences make uses of the full ambiguity code to represent every difference in primary parent sequences. The scoring stops when the last sequence pair has been analysed. + +=cut + + +else { + +#reads tree and delivers a list of nodes for the progressive aliscore evaluation +#reads tree file if option on + + + TREE: { + + my ( $NJ_Tree, @nodes ); + + if ( $tree =~ /^NJ$/ ) { + + my $hamming_distance = Aliscore_module::hamming_distance ( $ref_FASTA , @taboo ) ; + + ( $NJ_Tree , @nodes ) = Aliscore_module::NJ_tree ( $hamming_distance ) ; + + grep { s/\(//g , s/\)//g , s/(\:(?i:[\w\-\.])+)//g } @nodes ; #print "@nodes\n";exit; + + + PRINTTREE: { + + open TREE ,">","${file}.tre"; + print TREE "$NJ_Tree\;" ; + close TREE ; #print $NJ_Tree,"\n";exit; + + } + + } + + else { + +=pod + +=head3 Reading Tree + +Tree must be in Newick format. Be careful, PAUP saves trees with basal polytomy as default. If these trees are used, an error message will stop the process (hopefully !). Save trees without basal polytomies in PAUP and everything will be fine. Check set options in PAUP! You can use rooted trees, either rooted in PAUP or any other software package, and everything should be fine. Take care to check taxon names in trees, because only if these names correspond exactly (!) to names in sequence files, scoring will be performed. 
Aliscore will have its own tree reconstruction routine soon, to avoid problems of incongruent taxon names and polytomies. + +=cut + + + + my($ref_tree_taxa,$ref_nodes,$ref_tree) = Aliscore_module::readTOPOLOGY ($tree) ; + + @nodes = keys %$ref_nodes; #print $_,"\n" for @nodes;exit; + + } + + + #gets ambiguitiy table as reference + + my $table = Aliscore_module::ambiguity_table (); + + + #remove doubled taxa, noch besser machen!!!!!!! + + +=pod + +=head3 Removing potentially identical taxa + +Similarly to random pair selection, Aliscore removes potentially identical sequences in tree base selection of sequence pairs. + +=cut + + + for my $taboo (@taboo){grep s/\Q$taboo\E//,@nodes}; + + for (@nodes){ + + s/^\,+// ; + s/\,+/\,/g; + s/\,+$// ; + s/.+// if !/\,/ + + } + + my $nodes = join";",@nodes; + + for ($nodes){ + + s/^\;+// ; + s/\;+/\;/g; + s/\;+$// ; + + } + + @nodes = split "\;",$nodes; #print $_,"\n" for @nodes;exit; + + #removes leading and trailing blanks of taxon names + + grep s/^ *//,@nodes; + grep s/ *$//,@nodes; + + my $nodelevel=1; + + until ( ! grep { /^((?i:[\w\-\.\* ])+\,(?i:[\w\-\.\* ])+)$/ } @nodes ){ + + + @PAIRS = grep /^((?i:[\w\-\.\* ])+\,(?i:[\w\-\.\* ])+)$/,@nodes; #print $_,"\n" for @PAIRS;#exit; + + +print<", "${file}_Profile_l${level}.txt" and last OUTFILE if $level =~ /\d+/; +open OUT, ">", "${file}_Profile_l_all.txt" and last OUTFILE if $level !~ /\d+/ && $tree=~/\w/; +open OUT, ">", "${file}_Profile_random.txt" and last OUTFILE if $random =~ /\d+/; + + } + + +=pod + +=head2 Generation of Consensus Profiles + +From the collection of single profiles a consensus profile is generated. The consensus profile consists of medians for each site derived from site scores of all single profiles. It is thus a consensus representation of the situation in single profiles. Aliscore generates a List of all characters of the consensus profile below the 0 - base line. This list is written into a list file. Additionally, Aliscore writes a profile file in which three collumns are written. First column, an enummeration of positions, second column sites with positive consensus values and third column sites with negative consensus values. +Alternative consensus techniques would be conceivable, but the median certainly reflects the dominating mode among single profiles. +Single profiles are collected into a temporary array, before a consensus profile will be generated. If the number of taxa > 200 and/or length of sequences > 8000 the process might crash because of RAM limits. This must be corrected in the near future, to avoid problems with very large data. + +=cut + + +#generates quality profile from collected scores + +print <$b} @site_pr; + + my $site_median = ($range/2)=~/\./ ? 
$site_pr[(($range+1)/2)-1] : ($site_pr[($range/2)-1]+$site_pr[$range/2])/2; + + $site_median /= $window; + + push @Profile, "$site_median\t0\n" if $site_median >= 0; + push @Profile, "0\t$site_median\n" if $site_median < 0; + push @Medianprofile, "$site_median"; + } + }; + +#prints out into profile file + + print OUT (join"",@Profile); +# undef @Temp; + close OUT; + +#print OUT $Profile; + +for my $element (@Temp){ + map {$_ /= $window} @{$element} +} + +unshift @Temp, \@Medianprofile ; + +#print "@{$Temp[0]}\n";exit; + + +print <", "${file}_List_l${level}.txt" and last LIST if $level =~ /\d+/; + open LIST, ">", "${file}_List_l_all.txt" and last LIST if $level !~ /\d+/&& $tree=~/\w/; + open LIST, ">", "${file}_List_random.txt" and last LIST if $random =~ /\d+/; + + } + + print LIST $Character_List."\n"; + close LIST; + +PRINTINGPROFILE: { + + print "\tProfile written to ${file}_Profile_l${level}.txt\n" and last PRINTINGPROFILE if $level =~ /\d+/; + print "\tProfile written to ${file}_Profile_l_all.txt\n" and last PRINTINGPROFILE if $level !~ /\d+/&& $tree=~/\w/; + print "\tProfile written to ${file}_Profile_random.txt\n" and last PRINTINGPROFILE if $random =~ /\d+/; + + } +PRINTINGLIST: { + + print "\tList written to ${file}_List_l${level}.txt\n" and last PRINTINGLIST if $level =~ /\d+/; + print "\tList written to ${file}_List_l_all.txt\n" and last PRINTINGLIST if $level !~ /\d+/&& $tree=~/\w/; + print "\tList written to ${file}_List_random.txt\n" and last PRINTINGLIST if $random =~ /\d+/; + + } + +undef (%$ref_FASTA); +undef @PAIRScores; + +my ($user,$system,$cuser,$csystem) = times; + +print < ":raw", OUT => ":raw" ; + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. 
+# + + +#generated by Bernhard Misof, 15th April 2007 + +#updated by Bernhard Misof, 4th July 2007 +#updated by Bernhard Misof, 26th August 2007 +#updated by Bernhard Misof, 10th November 2007 +#updated by Bernhard Misof, 22nd December 2007 +#updated by Bernhard Misof, 2nd January 2008 +#updated by Bernhard Misof, 10th February 2008 +#updated by Bernhard Misof, 16th February 2008 +#updated by Bernhard Misof, 4th March 2008 +#updated by Bernhard Misof, 7th March 2008 +#updated by Bernhard Misof, 11th March 2008 +#updated by Bernhard Misof, 26th March 2008 +#updated by Bernhard Misof, 2nd April 2008 +#updated by Bernhard Misof, 23nd June 2008 +#updated by Bernhard Misof, 2nd September 2008 +#updated by Bernhard Misof, 24th September 2008 => $ingo in reading FASTA files included +#updated by Bernhard Misof, 20th February 2012 => svg output added +#updated by Bernhard Misof, 21th February 2012 => RY input files possible + +#used in Aliscore.02.1.pl + + +sub get_options { + + #no warnings; + + my $file ; + my $random ; + my $window ; + my $option ; + my $tree ; + my $level ; + my $strict ; + my $outgroup ; + my $pairs ; + my $indels ; + my $group ; + my $strict_in ; + my $ingo ; + my $matrix ; + my %options ; + +our $USAGE = do { + +<=# are recorded + + with -e : strange option for aminoacid scoring + +random input order of options allowed + + +two output files are produced: + +one profile, one list of characters below 0 cutoff line + + + + +USAGE + +}; + +die $USAGE if ! @ARGV ; + +our $HELP = do { + +< 1 ; + $window = 6 if $options{w} <= 1 ; + + $option = qw(N) if $options{N} == 1 ; + $indels = 'ambiguous' if $options{N} == 1 ; + $indels = 'characters' if $options{N} == 0 ; + + $random = $options{r} if $options{r} > 1 ; + $random = qw(r) if $options{r} == 1 ; + + $pairs = '4*NTAXA' if $options{r} == 1 ; + $pairs = $options{r} if $options{r} > 1 ; + + $tree = '-' if $options{r} > 1 && + $options{t} =~ /0/ ; + $tree = $options{t} if $options{t} =~ /\D+ (?#:file name should contain a letter) /x ; + $tree = qw(NJ) if $options{t} =~ /^1/ ; + $tree = qw(NJ) if $options{t} =~ /^0$/ && + $options{r} =~ /^0$/ && + $options{t} !~ /\D+ (?#:file name should contain a letter) /x ; + + $level = qw(all) if $options{t} =~ /^1$/ || + $options{t} =~ /\D+/ ; + $level = $options{l} if $options{l} !~ /^0$ (?#:any number is ok) /x ; + $level = qw(all) if $options{t} =~ /^0$/ && + $options{r} =~ /^0$/ && + $options{l} =~ /^0$/; + + $strict = $options{s} if $options{s} == 1 ; + $strict_in = 'yes' if $options{s} == 1 ; + + $outgroup = $options{o} if $options{o} !~ /^0$/ ; + $group = 'yes' if $options{o} !~ /^0$/ ; + + $ingo = $options{e} if $options{e} == 1 ; + + $matrix = 'BLOSUM62' ; + $matrix = $options{matrix} if defined $options{matrix} ; + + die $HELP if $options{h} == 1 ; + + + } + + + $window = Aliscore_module::even_numbered_window ( $window ) ; + + die "\nrandom and tree based selection of pairs are not compatible\n\n".$USAGE if $random =~ /\w+/ && ($tree =~ /\w+/ || $level =~ /\d+/); + + + return ( $file , + $random , + $window , + $option , + $tree , + $level , + $strict , + $outgroup , + $pairs , + $indels , + $group , + $strict_in , + $ingo , + $matrix + ) + + } + + +sub even_numbered_window { + + my ( $window ) = @_ ; + + my $window2 = $window / 2 ; + + $window += 1 if $window2 =~ /\./ ; + + return $window + + } + + + +sub help { + + use Text::Wrap qw ($columns &wrap); + + $columns = 70 ; + + my ( $option ) = @_ ; + + $option eq '-w' and do { + + my @text = ('-w option ...','specifies dimension of the 
sliding window, default is w = 6. Window size below 4 do not make much sense since', + 'error rates for miscalling randomness and non-randomness will be much to high. Only window dimensions of even numbered size ', + 'will be accepted, if an uneven number is called by the user it will be changed to the next larger even number.','Larger window', + 'dimensions will make the profiling less sensitive to small alignment sections of randomness.','Changing the window dimension is ', + 'like using different magnifying glasses. see also Misof & Misof, 2008, Syst. Biol.' + ); + + + print "\nhelp called for $option .... \n\n" ; + print wrap(""," ",@text), "\n\n" ; + + exit; + + }; + + $option eq '-i' and do { + + my @text = ('-i option ...','give the full name of the input file including extension. Input file must be in same folder as ALISCORE.', + 'ALISCORE currently accepts files in FASTA format, in simple ASCII text format. Avoid formating input file with text editors', + 'like MSWord or something comparable. Sequences my contain line breaks. The first line break is interpreted', ' as taxon name separator following the FASTA file convention. Taxon names must not contain *, ambiguities ', + ' ?, indels and stop codons (*) in are possible. ALISCORE estimates data type from ', + 'frequencies of ACGT. Only if estimates are consistant among sequencies, ALISCORE continues with the process using the appropriate', + 'scoring function.','RNA sequences will be recoded to DNA sequences. Leading and trailing indels will be replaced by N\'s in ', + 'nucleotide, but not in aminoacids sequences. There is no restriction on sequence number and sequence length.','For more', + ' information on reading the input data consult manual.' + ); + + print "\nhelp called for $option .... \n\n" ; + print wrap(""," ",@text), "\n\n" ; + + exit; + + }; + + $option eq '-t' and do { + + my @text = ('-t option ...','There are two possibilities to use a guiding tree in ALISCORE: if option -t is used without a input treefile', + 'a NJ tree will be reconstructed using simple Hamming distances. The NJ tree will be saved in a file with .tre suffix. If -t is used with a treefile name, tree will be read from a file', + 'in NEWICK format. Tree file must be in same folder. Only the first tree in a multiple trees file is read. If taxon names do not ', + 'match between sequence files and tree files, ALISCORE issues an error.','ALISCORE uses the tree to work through the MSA from tips', + 'to bottom. First, sister groups of terminal taxa are identified (node lists, level 1) and compared, these taxa are then replaced', + 'by consensus sequences using the ambiguity code. Consensus sequences represent now new sets of terminal taxa.','This process is', + 'repeated until every possible pair of sequences is evaluated.' + ); + + print "\nhelp called for $option .... \n\n" ; + print wrap(""," ",@text), "\n\n" ; + + exit; + + }; + + $option eq '-r' and do { + + my @text = ('-r option ...','if -r is used without an argument 4*N random pairs are compared, checking for replications (which are avoided).', + 'If -r is used with an argument which is beyond the maximal number of possible non-overlapping pairs, only the maximal number', + 'of pairs is compared. If the -r option and the -t option are not used, a NJ tree is generated by default for nucleotide,', + 'sequences and 4*N random pairs are compared for aminoacid sequences.' + ); + + print "\nhelp called for $option .... 
+	$option eq '-o' and do {
+
+	my @text = ('-o option ...','The -o option is used with a list of taxa separated by commas. A blank between -o and the list of taxa is mandatory; ',
+		    'taxa must be separated by commas, and blanks should be avoided within taxon names and between taxa. These taxa will be compared with',
+		    'all other taxa',
+		    'not in this list. This option can be used to assess the range of randomness between outgroup taxa and ingroup taxa, or between',
+		    'any two groups of taxa. This option is currently not compatible with the tree -t option.'
+		    );
+
+	print "\nhelp called for $option .... \n\n" ;
+	print wrap(""," ",@text), "\n\n" ;
+
+	exit;
+
+	};
+
+	$option eq '-l' and do {
+
+	my @text = ('-l option ...','-l can be used to restrict iterating through the tree to a specific node level, specified with the argument of',
+		    'the -l option. If -l 1 is used, only primary sister group relationships are used to infer the consensus profile. If there are ',
+		    'fewer node levels than the argument, ALISCORE iterates through the tree and stops.','If -l #+ is used, ALISCORE records only single',
+		    'profiles starting at the specified node level. Using this option can give you information on the signal in the data depending on the',
+		    'node level in the tree.'
+		    );
+
+	print "\nhelp called for $option .... \n\n" ;
+	print wrap(""," ",@text), "\n\n" ;
+
+	exit;
+
+	};
+
+	$option eq '-s' and do {
+
+	my @text = ('-s option ...','the -s option can be used to generate a strict profile from all single comparisons. This profile will be very conservative',
+		    'because any site which exhibits a negative score in even one single profile is scored as negative in the consensus profile.',
+		    );
+
+	print "\nhelp called for $option .... \n\n" ;
+	print wrap(""," ",@text), "\n\n" ;
+
+	exit;
+
+	};
+
+	$option eq '-N' and do {
+
+	my @text = ('-N option ...','without invoking the -N option, indels are treated as a 5th character. With the -N option indels are treated as ',
+		    'ambiguous characters. Leading and trailing gaps of sequences are always interpreted as ambiguous characters, with and without ',
+		    'the -N option. Interpreting indels as ambiguous characters results in a loss of long indel sections consistently found in the ',
+		    'majority of taxa. This means that well aligned expansion segments in rDNA sequences which are not present in other taxa will ',
+		    'be lost if not commonly found in the MSA. Interpreting gaps as a 5th character treats stretches of indels as well aligned ',
+		    'sections. This option is currently ignored for aminoacid sequences.'
+		    );
+
+	print "\nhelp called for $option .... \n\n" ;
+	print wrap(""," ",@text), "\n\n" ;
+
+	exit;
+
+	};
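+
+	#e.g. "perl Aliscore.02.2.pl -i rdna.fas -N" scores indels as ambiguous
+	#characters instead of treating them as a 5th character state
+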
+	$option eq '-output' and do {
+
+	my @text = ('output ... ','Aliscore writes into two (three) files. One file will be a flat list of characters scoring negatively in the consensus ',
+		    'profile, separated by single spaces. This list can be used to define a character set, for example in PAUP. The other file will be a flat table',
+		    'of three columns: first column alignment position, second column characters scoring positively otherwise zero, and third column characters ',
+		    'scoring negatively or zero. These columns can be used to generate a profile in many software packages, like GNUplot, Gnumeric or Excel.',
+		    'If the -t option was used, a third file with the NJ guiding tree will be written. ALISCORE does not remove any characters from your alignment.'
+		    );
+
+	print "\nhelp called for $option .... \n\n" ;
+	print wrap(""," ",@text), "\n\n" ;
+
+	exit;
+
+	};
+
+	$option eq '-scoring' and do {
+
+	my @text = ('scoring ... ','For nucleotide sequences ALISCORE uses a simple match/mismatch scoring matrix to calculate the observed window ',
+		    'and resampled window scores. If ambiguities are present, ALISCORE makes an optimistic estimate with a reduced match score according',
+		    'to the degeneracy of the ambiguities. Indels are scored either as fifth characters or as ambiguities.','For aminoacid sequences a BLOSUM62 ',
+		    'matrix is used; indels are scored as strongly penalized mismatches if matching another indel and are not penalized if matching an aminoacid.',
+		    'This scoring has the effect that if indels dominate in sections, these sections will be scored negatively in consensus profiles',
+		    ', but if aminoacids dominate the section, indels will not have a negative effect.','This scoring was set to account for missing data,',
+		    'typical in concatenated EST or phylogenomic data.','The biological implications of this scoring are admittedly problematic.'
+		    );
+
+	print "\nhelp called for $option .... \n\n" ;
+	print wrap(""," ",@text), "\n\n" ;
+
+	exit;
+
+	};
+
+	$option eq '-commands' and do {
+
+	my @text = ('commands ... ','Input must contain an infile; all other options are not mandatory. The order of options in the command line does not ',
+		    'matter. You can use e.g. Aliscore.pl -i infile -t -N -l 6+ or Aliscore.pl -N -l 6+ -i infile -t . All options must be preceded by a -',
+		    ' sign, e.g. -i or -o etc. Specifications of options like the number of pairwise comparisons, tree file, or window size must directly ',
+		    'follow the option invocation, with blanks in between. If you use no options, ALISCORE will switch to its defaults: tree search for ',
+		    'nucleotide sequences and 4*NTAXA random pairs for amino acid sequences, with a window size of 6.'
+		    );
+
+	print "\nhelp called for $option .... \n\n" ;
+	print wrap(""," ",@text), "\n\n" ;
+
+	exit;
+
+	};
+
+	$option eq '-e' and do {
+
+	my @text = ('-e option ...','If run with the -e option, ALISCORE scores indels in aminoacid data in a very unorthodox way.',' It penalizes matching',
+		    'indels heavily, but does not do so if aminoacids and indels are compared. This has the effect that alignment sections in which a couple of',
+		    'sequences have missing data or long stretches of indels will not be penalized if otherwise conserved or with apparent signal.',
+		    'A biological interpretation of this option is clearly absent, but it makes sense if data with lots of missing sections, like EST data,',
+		    'are used. One possibility to interpret this scoring is: if many indels are present in a column, it is interpreted as an artifact.',
+		    'This option is internally handled by the variable $ingo in honor of its initializer.'
+		    );
+
+	print "\nhelp called for $option .... \n\n" ;
+	print wrap(""," ",@text), "\n\n" ;
+
+	exit;
+
+	};
+
+
+	}
+
+
+
+sub draw_singletons {
+
+	no warnings;
+
+	my ($type,$ref_FASTA) = @_;
+	my @TAXA = keys %$ref_FASTA;
+
+	my $compare = sub {
+
+		my ($type,$seq1,$seq2) = @_ ;
+		my @seq1 = @$seq1;
+		my @seq2 = @$seq2;
+
+		$seq1 = join"" ,@seq1;
+		$seq1 =~s/-//g ;
+		@seq1 = split"",$seq1;
+		#print "@seq1\n";
+
+		$seq2 = join"" ,@seq2;
+		$seq2 =~s/-//g ;
+		@seq2 = split"",$seq2;
+		#print "@seq2\n";
+
+		my $maxposition = @seq1 < @seq2 ? 
@seq1 : @seq2; + + + $type eq 'nt' and do { + + CHECK: for my $position (0..$maxposition){ + return -1 if (($seq1[$position] ne "N") and ($seq2[$position] ne "N") and ($seq1[$position] ne $seq2[$position])); + } + }; + + $type eq 'aa' and do { + + CHECK: for my $position (0..$maxposition){ + return -1 if (($seq1[$position] ne "X") and ($seq2[$position] ne "X") and ($seq1[$position] ne $seq2[$position])); + } + }; + }; + + + + my (@PAIRS,@TABOO); + + until (1 == @TAXA){ + my $first = shift@TAXA; + push @PAIRS, ($first.",".$TAXA[$_]) for(0..$#TAXA); + } + + for (@PAIRS){ + my ($taxon1,$taxon2) = split "\,";#print "taxon1: ",$taxon1," Taxon2: ",$taxon2,"\n"; + my $score = $compare->($type,$$ref_FASTA{$taxon1},$$ref_FASTA{$taxon2}) || 1 ; + + if (1 == $score){ + if ($type eq 'nt') { + if ((my $count1 = grep /[AGCT]/i,@{$$ref_FASTA{$taxon1}}) < (my $count2 = grep /[AGCT]/i,@{$$ref_FASTA{$taxon2}})) { push @TABOO, $taxon1 and printf "\t%-20.20s pot. sim. to %-20.20s\n" , $taxon1, $taxon2 if !grep /\b$taxon1\b/,@TABOO }; + if ((my $count1 = grep /[AGCT]/i,@{$$ref_FASTA{$taxon1}}) > (my $count2 = grep /[AGCT]/i,@{$$ref_FASTA{$taxon2}})) { push @TABOO, $taxon2 and printf "\t%-20.20s pot. sim. to %-20.20s\n" , $taxon2, $taxon1 if !grep /\b$taxon2\b/,@TABOO }; + if ((my $count1 = grep /[AGCT]/i,@{$$ref_FASTA{$taxon1}}) == (my $count2 = grep /[AGCT]/i,@{$$ref_FASTA{$taxon2}})) { push @TABOO, $taxon2 and printf "\t%-20.20s pot. sim. to %-20.20s\n" , $taxon2, $taxon1 if !grep /\b$taxon2\b/,@TABOO }; + } + + if ($type eq 'aa') { + if ((my $count1 = grep /[ARNDCQEGHILKMFPSTWYV]/i,@{$$ref_FASTA{$taxon1}}) < (my $count2 = grep /[ARNDCQEGHILKMFPSTWYV]/i,@{$$ref_FASTA{$taxon2}})) { push @TABOO, $taxon1 and printf "\t%-20.20s pot. sim. to %-20.20s\n" , $taxon1, $taxon2 if !grep /\b$taxon1\b/,@TABOO }; + if ((my $count1 = grep /[ARNDCQEGHILKMFPSTWYV]/i,@{$$ref_FASTA{$taxon1}}) > (my $count2 = grep /[ARNDCQEGHILKMFPSTWYV]/i,@{$$ref_FASTA{$taxon2}})) { push @TABOO, $taxon2 and printf "\t%-20.20s pot. sim. to %-20.20s\n" , $taxon2, $taxon1 if !grep /\b$taxon2\b/,@TABOO }; + if ((my $count1 = grep /[ARNDCQEGHILKMFPSTWYV]/i,@{$$ref_FASTA{$taxon1}}) == (my $count2 = grep /[ARNDCQEGHILKMFPSTWYV]/i,@{$$ref_FASTA{$taxon2}})) { push @TABOO, $taxon2 and printf "\t%-20.20s pot. sim. to %-20.20s\n" , $taxon2, $taxon1 if !grep /\b$taxon2\b/,@TABOO }; + } + } + } + + #filter all double entries! + + my %seen; + + @TABOO = grep { ! 
$seen{$_}++ } @TABOO; + + return @TABOO; + + } + + +#_______________________________________________________________________________________ + +sub get_scoring { + my ( $type, $matrix ) = @_; + my %SCORING; + + die "sequence type not known!\n" if $type !~ /(nt|aa|RY)/; + die "scoring matrix not implemented\nuse BLOSUM62, PAM250, PAM500 or MATCH\n" if $matrix !~ /(BLOSUM62|PAM250|PAM500|MATCH)/; + + $matrix = uc ( $matrix ) ; + + my $check = sub { + my ($arr1,$arr2) = @_; + my (%hash2,@overlap); + + $hash2{$_} = 1 for (@$arr2); + for (0..$#$arr1){push @overlap, $arr1->[$_] if (defined $hash2{$arr1->[$_]})}; + + my $hit = @overlap; + return $hit; + }; + + #creates nuleotide scoring function, includes ambiguities + + $type eq "nt" and do { + + my @nucleotides = qw ( A C G T - AC AG AT CG CT GT ACG ACT CGT AGT ACGT A- C- G- T- AC- AG- AT- CG- CT- GT- ACG- ACT- CGT- AGT- ACGT-); + my %ambiguities = ('A-'=>'A-','C-'=>'C-','G-'=>'G-','T-'=>'T-','AC'=>'M','AG'=>'R','AT'=>'W','CG'=>'S','CT'=>'Y','GT'=>'K','ACG'=>'V', + 'ACT'=>'H','CGT'=>'B','AGT'=>'D','ACGT'=>'N','AC-'=>'M-','AG-'=>'R-','AT-'=>'W-','CG-'=>'S-','CT-'=>'Y-','GT-'=>'K-', + 'ACG-'=>'V-','ACT-'=>'H-','CGT-'=>'B-','AGT-'=>'D-','ACGT-'=>'N-'); + for my $i (@nucleotides){ + my @i = split"",$i;#print $i,"\t"; + my $recode1 = defined $ambiguities{$i} ? $ambiguities{$i} : $i; + + for my $j (@nucleotides){ + my @j = split"",$j;#print $j,"\n";exit; + my $recode2 = defined $ambiguities{$j} ? $ambiguities{$j} : $j; + my $overlap = $check->(\@i,\@j); + my $max = @i > @j ? @i : @j;#print $max,"\n";exit; + + $SCORING{$recode1.$recode2} = $overlap > 0 ? 1/$max : -1 if $max < 4; + $SCORING{$recode1.$recode2} = $overlap > 0 ? 0 : -1 if $max > 3; + + } + } + + };#end of do nucleotides! + + $type eq 'RY' and do { + + my @nucleotides = qw ( - AC AG AT CG CT GT ACG ACT CGT AGT ACGT AC- AG- AT- CG- CT- GT- ACG- ACT- CGT- AGT- ACGT-); + my %ambiguities = ('AC'=>'M','AG'=>'R','AT'=>'W','CG'=>'S','CT'=>'Y','GT'=>'K','ACG'=>'V', + 'ACT'=>'H','CGT'=>'B','AGT'=>'D','ACGT'=>'N','AC-'=>'M-','AG-'=>'R-','AT-'=>'W-','CG-'=>'S-','CT-'=>'Y-','GT-'=>'K-', + 'ACG-'=>'V-','ACT-'=>'H-','CGT-'=>'B-','AGT-'=>'D-','ACGT-'=>'N-'); + for my $i (@nucleotides){ + my @i = split"",$i;#print $i,"\t"; + my $recode1 = defined $ambiguities{$i} ? $ambiguities{$i} : $i; + + for my $j (@nucleotides){ + my @j = split"",$j;#print $j,"\n";exit; + my $recode2 = defined $ambiguities{$j} ? $ambiguities{$j} : $j; + my $overlap = $check->(\@i,\@j); + my $max = @i > @j ? @i : @j;#print $max,"\n";exit; + + $SCORING{$recode1.$recode2} = $overlap > 0 ? 1/$max : -1 if $max < 4 ; + $SCORING{$recode1.$recode2} = $overlap > 0 ? 
0 : -1 if $max > 3 ; + $SCORING{$recode1.$recode2} = 1 if $overlap > 0 and $recode1=~ /Y/ and $recode2=~ /Y/; + $SCORING{$recode1.$recode2} = 1 if $overlap > 0 and $recode1=~ /R/ and $recode2=~ /R/; + + } + } + + };#end of do RY scoring + + + $type eq "aa" and do { + + # Blosum62 source: NCBI + # Matrix made by matblas from blosum62.iij + # * column uses minimum score + # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units + # Blocks Database = /data/blocks_5.0/blocks.dat + # Cluster Percentage: >= 62 + # Entropy = 0.6979, Expected = -0.5209 + + my @BLOSUM62 = ( + + [ 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-2,-1, 0,-4 ], + [-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1, 0,-1,-4 ], + [-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, 3, 0,-1,-4 ], + [-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3, 4, 1,-1,-4 ], + [ 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4 ], + [-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2, 0, 3,-1,-4 ], + [-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4 ], + [ 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-2,-1,-4 ], + [-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3, 0, 0,-1,-4 ], + [-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-3,-3,-1,-4 ], + [-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-3,-1,-4 ], + [-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2, 0, 1,-1,-4 ], + [-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-3,-1,-1,-4 ], + [-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-3,-3,-1,-4 ], + [-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-1,-2,-4 ], + [ 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0, 0, 0,-4 ], + [ 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-1,-1, 0,-4 ], + [-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-3,-2,-4 ], + [-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-3,-2,-1,-4 ], + [ 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-3,-2,-1,-4 ], + [-2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, 1,-1,-4 ], + [-1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4 ], + [ 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1,-4 ], + [-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1 ], + + ); + + + # PAM250 source: NCBI + # This matrix was produced by "pam" Version 1.0.6 [28-Jul-93] + # PAM 250 substitution matrix, scale = ln(2)/3 = 0.231049 + # Expected score = -0.844, Entropy = 0.354 bits + # Lowest score = -8, Highest score = 17 + # + + my @PAM250 = ( + + [ 2,-2, 0, 0,-2, 0, 0, 1,-1,-1,-2,-1,-1,-3, 1, 1, 1,-6,-3, 0, 0, 0, 0,-8 ], + [-2, 6, 0,-1,-4, 1,-1,-3, 2,-2,-3, 3, 0,-4, 0, 0,-1, 2,-4,-2,-1, 0,-1,-8 ], + [ 0, 0, 2, 2,-4, 1, 1, 0, 2,-2,-3, 1,-2,-3, 0, 1, 0,-4,-2,-2, 2, 1, 0,-8 ], + [ 0,-1, 2, 4,-5, 2, 3, 1, 1,-2,-4, 0,-3,-6,-1, 0, 0,-7,-4,-2, 3, 3,-1,-8 ], + [-2,-4,-4,-5,12,-5,-5,-3,-3,-2,-6,-5,-5,-4,-3, 0,-2,-8, 0,-2,-4,-5,-3,-8 ], + [ 0, 1, 1, 2,-5, 4, 2,-1, 3,-2,-2, 1,-1,-5, 0,-1,-1,-5,-4,-2, 1, 3,-1,-8 ], + [ 0,-1, 1, 3,-5, 2, 4, 0, 1,-2,-3, 0,-2,-5,-1, 0, 0,-7,-4,-2, 3, 3,-1,-8 ], + [ 1,-3, 0, 1,-3,-1, 0, 5,-2,-3,-4,-2,-3,-5, 0, 1, 0,-7,-5,-1, 0, 0,-1,-8 ], + [-1, 2, 2, 1,-3, 3, 1,-2, 6,-2,-2, 0,-2,-2, 0,-1,-1,-3, 0,-2, 1, 2,-1,-8 ], + [-1,-2,-2,-2,-2,-2,-2,-3,-2, 5, 2,-2, 2, 1,-2,-1, 0,-5,-1, 4,-2,-2,-1,-8 ], + [-2,-3,-3,-4,-6,-2,-3,-4,-2, 2, 6,-3, 4, 
2,-3,-3,-2,-2,-1, 2,-3,-3,-1,-8 ], + [-1, 3, 1, 0,-5, 1, 0,-2, 0,-2,-3, 5, 0,-5,-1, 0, 0,-3,-4,-2, 1, 0,-1,-8 ], + [-1, 0,-2,-3,-5,-1,-2,-3,-2, 2, 4, 0, 6, 0,-2,-2,-1,-4,-2, 2,-2,-2,-1,-8 ], + [-3,-4,-3,-6,-4,-5,-5,-5,-2, 1, 2,-5, 0, 9,-5,-3,-3, 0, 7,-1,-4,-5,-2,-8 ], + [ 1, 0, 0,-1,-3, 0,-1, 0, 0,-2,-3,-1,-2,-5, 6, 1, 0,-6,-5,-1,-1, 0,-1,-8 ], + [ 1, 0, 1, 0, 0,-1, 0, 1,-1,-1,-3, 0,-2,-3, 1, 2, 1,-2,-3,-1, 0, 0, 0,-8 ], + [ 1,-1, 0, 0,-2,-1, 0, 0,-1, 0,-2, 0,-1,-3, 0, 1, 3,-5,-3, 0, 0,-1, 0,-8 ], + [-6, 2,-4,-7,-8,-5,-7,-7,-3,-5,-2,-3,-4, 0,-6,-2,-5,17, 0,-6,-5,-6,-4,-8 ], + [-3,-4,-2,-4, 0,-4,-4,-5, 0,-1,-1,-4,-2, 7,-5,-3,-3, 0,10,-2,-3,-4,-2,-8 ], + [ 0,-2,-2,-2,-2,-2,-2,-1,-2, 4, 2,-2, 2,-1,-1,-1, 0,-6,-2, 4,-2,-2,-1,-8 ], + [ 0,-1, 2, 3,-4, 1, 3, 0, 1,-2,-3, 1,-2,-4,-1, 0, 0,-5,-3,-2, 3, 2,-1,-8 ], + [ 0, 0, 1, 3,-5, 3, 3, 0, 2,-2,-3, 0,-2,-5, 0, 0,-1,-6,-4,-2, 2, 3,-1,-8 ], + [ 0,-1, 0,-1,-3,-1,-1,-1,-1,-1,-1,-1,-1,-2,-1, 0, 0,-4,-2,-1,-1,-1,-1,-8 ], + [-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8,-8, 1 ], + + ); + + + # PAM500 source: NCBI + # This matrix was produced by "pam" Version 1.0.6 [28-Jul-93] + # PAM 500 substitution matrix, scale = ln(2)/7 = 0.0990210 + # Expected score = -0.401, Entropy = 0.0803 bits + # Lowest score = -9, Highest score = 34 + # + + my @PAM500 = ( + + [ 1,-1, 0, 1,-2, 0, 1, 1, 0, 0,-1, 0,-1,-3, 1, 1, 1,-6,-3, 0, 1, 0, 0,-9 ], + [-1, 5, 1, 0,-4, 2, 0,-1, 2,-2,-2, 4, 0,-4, 0, 0, 0, 4,-4,-2, 0, 1, 0,-9 ], + [ 0, 1, 1, 2,-3, 1, 1, 1, 1,-1,-2, 1,-1,-4, 0, 1, 0,-5,-3,-1, 1, 1, 0,-9 ], + [ 1, 0, 2, 3,-5, 2, 3, 1, 1,-2,-3, 1,-2,-5, 0, 1, 0,-7,-5,-1, 2, 2, 0,-9 ], + [-2,-4,-3,-5,22,-5,-5,-3,-4,-2,-6,-5,-5,-3,-2, 0,-2,-9, 2,-2,-4,-5,-2,-9 ], + [ 0, 2, 1, 2,-5, 2, 2, 0, 2,-1,-2, 1,-1,-4, 1, 0, 0,-5,-4,-1, 2, 2, 0,-9 ], + [ 1, 0, 1, 3,-5, 2, 3, 1, 1,-2,-3, 1,-1,-5, 0, 1, 0,-7,-5,-1, 2, 2, 0,-9 ], + [ 1,-1, 1, 1,-3, 0, 1, 4,-1,-2,-3, 0,-2,-5, 1, 1, 1,-8,-5,-1, 1, 1, 0,-9 ], + [ 0, 2, 1, 1,-4, 2, 1,-1, 4,-2,-2, 1,-1,-2, 0, 0, 0,-2, 0,-2, 1, 2, 0,-9 ], + [ 0,-2,-1,-2,-2,-1,-2,-2,-2, 3, 4,-2, 3, 2,-1,-1, 0,-5, 0, 3,-2,-2, 0,-9 ], + [-1,-2,-2,-3,-6,-2,-3,-3,-2, 4, 7,-2, 4, 4,-2,-2,-1,-1, 1, 3,-3,-2,-1,-9 ], + [ 0, 4, 1, 1,-5, 1, 1, 0, 1,-2,-2, 4, 0,-5, 0, 0, 0,-3,-5,-2, 1, 1, 0,-9 ], + [-1, 0,-1,-2,-5,-1,-1,-2,-1, 3, 4, 0, 4, 1,-1,-1, 0,-4,-1, 2,-1,-1, 0,-9 ], + [-3,-4,-4,-5,-3,-4,-5,-5,-2, 2, 4,-5, 1,13,-4,-3,-3, 3,13, 0,-4,-5,-2,-9 ], + [ 1, 0, 0, 0,-2, 1, 0, 1, 0,-1,-2, 0,-1,-4, 4, 1, 1,-6,-5,-1, 0, 1, 0,-9 ], + [ 1, 0, 1, 1, 0, 0, 1, 1, 0,-1,-2, 0,-1,-3, 1, 1, 1,-3,-3,-1, 1, 0, 0,-9 ], + [ 1, 0, 0, 0,-2, 0, 0, 1, 0, 0,-1, 0, 0,-3, 1, 1, 1,-6,-3, 0, 0, 0, 0,-9 ], + [-6, 4,-5,-7,-9,-5,-7,-8,-2,-5,-1,-3,-4, 3,-6,-3,-6,34, 2,-6,-6,-6,-4,-9 ], + [-3,-4,-3,-5, 2,-4,-5,-5, 0, 0, 1,-5,-1,13,-5,-3,-3, 2,15,-1,-4,-4,-2,-9 ], + [ 0,-2,-1,-1,-2,-1,-1,-1,-2, 3, 3,-2, 2, 0,-1,-1, 0,-6,-1, 3,-1,-1, 0,-9 ], + [ 1, 0, 1, 2,-4, 2, 2, 1, 1,-2,-3, 1,-1,-4, 0, 1, 0,-6,-4,-1, 2, 2, 0,-9 ], + [ 0, 1, 1, 2,-5, 2, 2, 1, 2,-2,-2, 1,-1,-5, 1, 0, 0,-6,-4,-1, 2, 2, 0,-9 ], + [ 0, 0, 0, 0,-2, 0, 0, 0, 0, 0,-1, 0, 0,-2, 0, 0, 0,-4,-2, 0, 0, 0, 0,-9 ], + [-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9, 1 ], + + ); + + my @MATCH = ( + + [ 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1, 
1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,13,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,13,-1,-1,-1,-1, 1,-1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1,-1 ], + [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 1 ], + + ); + + +=pod + +Following lines generate a hash with scoring values for a given scoring matrix and fill indel scoring with scoring of stop codons derived from the scoring matrix. + + +=cut + my @aminoacids = ( 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'B', 'Z', 'X', '*'); + + my $create = sub { + + my $aa_lead = 0 ; + + for my $line (@_) { + + for (my $a=0;$a<=$#{$line};$a++) { + + $SCORING{$aminoacids[$aa_lead].$aminoacids[$a]} = $line -> [$a]; + + } + + $aa_lead ++ + } + + } ; + + + + $matrix eq 'BLOSUM62' and do { $create -> ( @BLOSUM62 ) } ; + $matrix eq 'PAM250' and do { $create -> ( @PAM250 ) } ; + $matrix eq 'PAM500' and do { $create -> ( @PAM500 ) } ; + $matrix eq 'MATCH' and do { $create -> ( @MATCH ) } ; + + my $indel_score = $SCORING{'A'.'*'}; + + for (@aminoacids) { $SCORING{$_.'-'} = $indel_score; $SCORING{'-'.$_} = $indel_score; $SCORING{'-'.'-'} = $indel_score } + + + + }; + + return \%SCORING; + + }#end of sub get_scoring + + +#_______________________________________________________________________________________ + + +sub ambiguity_table { + + my %TABLE; + my @nucleotides = qw ( A C G T - M R W S Y K V H B D N A- C- G- T- M- R- W- S- Y- K- V- H- B- D- N-); + my %ambiguities = ('A'=>'A','C'=>'C','G'=>'G','T'=>'T','-'=>'-','A-'=>'A-','C-'=>'C-','G-'=>'G-','T-'=>'T-','AC'=>'M','AG'=>'R','AT'=>'W', + 'CG'=>'S','CT'=>'Y','GT'=>'K','ACG'=>'V','ACT'=>'H','CGT'=>'B','AGT'=>'D','ACGT'=>'N','AC-'=>'M-','AG-'=>'R-','AT-'=>'W-', + 'CG-'=>'S-','CT-'=>'Y-','GT-'=>'K-','ACG-'=>'V-','ACT-'=>'H-','CGT-'=>'B-','AGT-'=>'D-','ACGT-'=>'N-'); + + + for my $i (@nucleotides){ + my ($recode1,$recode2); + + for my $key (keys %ambiguities){ + if ($i eq $ambiguities{$key}){ $recode1 = $key} + } + + for my $j (@nucleotides){ + + for my $key (keys %ambiguities){ + if ($j eq $ambiguities{$key}){ $recode2 = $key} + } + + my @merging = sort (split"",($recode1.$recode2)); + + my $merging = join"",@merging; + $merging =~ 
s/(-*)(.+)/$2$1/; + $merging =~ tr/ACGT-/ACGT-/s; + + $TABLE{$i.$j} = $ambiguities{$merging}; + + } + } + + return \%TABLE; + }#end of sub ambiguity table + +#_______________________________________________________________________________________ + +#sub of getting threshhold fot aa scoring + +sub get_threshhold { + + my ($ref_scoring,$window,$ref_FASTA) = @_; + my @aminoacids = qw(A R N D C Q E G H I L K M F P S T W Y V); + my %freq_aa; + my $aa_total; + my @freq_aa_base; + my %freq_scores; + + for my $ref_sequence (values %$ref_FASTA) { $aa_total += (grep /[ARNDCQEGHILKMFPSTWYV]/,@$ref_sequence ) } + + + for my $aa (@aminoacids) { + my $aa_single; + for my $ref_sequence (values %$ref_FASTA) { $aa_single += (grep /$aa/,@$ref_sequence ) } + $freq_aa{$aa} = int (($aa_single/$aa_total) * 100 ) + + } + + + for my $key (keys %freq_aa) { + push @freq_aa_base, ($key) x ( $freq_aa {$key}*10 ) + + } + + + for (1..100) { + + my @boot; + push @boot , \$freq_aa_base[int(rand($#freq_aa_base + 1))] until (($#freq_aa_base + 1) == @boot); + + + for (1..100) { + my @half_boot; + push @half_boot , $boot[int(rand($#boot + 1))] until (($#boot + 1)/2 == @half_boot); + + for (1..100) { + my $score; + + for (1..$window) { + $score += $$ref_scoring { ${$half_boot[ int(rand($#half_boot + 1))]}.${$half_boot[ int(rand($#half_boot + 1))]} } + } + + $freq_scores {$score} ++ + } + } + } + + map { $_/=(100*100*100) } (values %freq_scores); + + my ($cum_score,$threshhold); + + #open OUT,">","distribution.txt"; + + CUTOFF: for my $single_score (sort {$a<=>$b} keys %freq_scores){ + + # print OUT $single_score,"\t",$freq_scores {$single_score},"\n"; + + $cum_score += $freq_scores{$single_score}; + $freq_scores{$single_score} = $cum_score; + $threshhold = $single_score and last CUTOFF if $freq_scores{$single_score} >= 0.95 + } + #print $threshhold,"\n"; + return $threshhold; + + }; + +#_______________________________________________________________________________________ + +#sub replace ambiguities in DNA, sequences must be 2-dimensional arrays + +sub rp_ambiguities { + my ($position) = @_; + #ambiguities + my @N = qw(A C G T); + my @M = qw(A C); + my @R = qw(A G); + my @W = qw(A T); + my @S = qw(C G); + my @Y = qw(C T); + my @K = qw(G T); + my @V = qw(A C G); + my @H = qw(A C T); + my @B = qw(C G T); + my @D = qw(A G T); + #replace with arrays + if (grep /N/,@$position){shift @$position;push @$position, @N} + if (grep /M/,@$position){shift @$position;push @$position, @M} + if (grep /R/,@$position){shift @$position;push @$position, @R} + if (grep /W/,@$position){shift @$position;push @$position, @W} + if (grep /S/,@$position){shift @$position;push @$position, @S} + if (grep /Y/,@$position){shift @$position;push @$position, @Y} + if (grep /K/,@$position){shift @$position;push @$position, @K} + if (grep /V/,@$position){shift @$position;push @$position, @V} + if (grep /H/,@$position){shift @$position;push @$position, @H} + if (grep /B/,@$position){shift @$position;push @$position, @B} + if (grep /D/,@$position){shift @$position;push @$position, @D} + }#end of sub rp_ambiguities + + +#_______________________________________________________________________________________ + + +sub nuc_score_two { + + my ($ref_scoring,$variability,$window,$seq1,$seq2) = @_; + + #sequences are read as references, $seq1, $seq2 are references + my @basket = ($seq1,$seq2); + + #check whether sequences are of equal length + die "sequences are not of equal length!\n" if (my $length1 = @$seq1)!= (my $length2 = @$seq2); + my $length = $length1; + + 
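+	#what follows is the core Aliscore test: for every window the observed pair
+	#score is compared with 100 Monte Carlo scores resampled from the local
+	#sequence composition; windows above the 95% quantile add +1 to each of
+	#their sites, all other windows add -1
+
+	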
#declare and initialize return array of alignment quality scores and references to input sequences
+	my @MSA_Score = (0)x $length;
+
+	#start sliding window approach for MC quality score, labeled SLIDE
+	SLIDE: foreach my $j (@$variability){
+
+		my ($MC_score,%MC_Hash,$score,$threshhold,$Org_score);
+		my $position = $j;
+
+		#evaluates window of original sequences
+		#print $seq1->[$position].$seq2->[$position],'=>',$$ref_scoring{$seq1->[$position].$seq2->[$position]},"\n";
+		$Org_score += $$ref_scoring{$seq1->[$position].$seq2->[$position]}, $position++ for (1..$window);#print $Org_score,"\n";exit;
+		undef $position;
+		#print "org_score: $Org_score, position: $j\n";
+
+
+		#checks $Org_score value, starts MC resampling if Org_score did not hit max or min value
+		#PERFECT:
+		if ($Org_score == $window){
+			map {$_+=1} @MSA_Score[$j..($j+($window-1))] and next SLIDE;#print "@MSA_Score\n";
+		}
+		#NOTHING:
+		elsif ($Org_score == (0-$window)){
+			map {$_-=1} @MSA_Score[$j..($j+($window-1))] and next SLIDE;#print "@MSA_Score\n";
+		}
+		#starts MC resampling, since Org_score did not hit max or min value
+		#TESTING:
+		else {
+			#set MC sampling pool from a double size sequence window,
+			#does not move compositional window until $j>=$window/2
+			#LEFT:
+			if ($j < ($window/2)){
+				#runs the MC resampling approach to generate new similarity scores of randomly drawn sequences
+				foreach (1..100){
+					#reset the score before summing over the window, otherwise only the last drawn site would count
+					$MC_score=0;
+					foreach (1..$window){
+						#selects randomly two positions from compositional window of both sequences
+						$MC_score+=$$ref_scoring{$basket[int(rand(2))]->[int(rand($window*2))].$basket[int(rand(2))]->[int(rand($window*2))]};
+					}
+					#print "mc-score: $MC_score\n";
+					$MC_Hash{$MC_score}++;
+				}
+			}
+			#does not move compositional window if $j>$length-($window+$window/2)
+			#RIGHT:
+			elsif ($j > ($length-($window+$window/2))){
+				#runs the MC resampling approach to generate new similarity scores of randomly drawn sequences
+				foreach (1..100){
+					$MC_score=0;
+					foreach (1..$window){
+						$MC_score+=$$ref_scoring{$basket[int(rand(2))]->[$length-(int(rand($window*2))+1)].$basket[int(rand(2))]->[$length-(int(rand($window*2))+1)]};
+					}
+					$MC_Hash{$MC_score}++;
+				}
+			}
+			#slides compositional window if ($window/2) <= $j <= $length-($window+$window/2)
+			#MIDDLE:
+			else {
+				#runs the MC resampling approach to generate new similarity scores of randomly drawn sequences
+				foreach (1..100){
+					$MC_score=0;
+					foreach (1..$window){
+						$MC_score+=$$ref_scoring{$basket[int(rand(2))]->[(int(rand($window*2)))+($j-$window/2)].$basket[int(rand(2))]->[(int(rand($window*2)))+($j-$window/2)]};
+					}
+					$MC_Hash{$MC_score}++;
+				}
+			}
+
+
+			#MC resampling finished, Hash with scores of random sampling generated
+
+			#summarizes hits of MC resampling
+			CUTOFF: foreach $MC_score (sort {$a<=>$b} keys %MC_Hash){
+				#print "key: $MC_score\t->value: $MC_Hash{$MC_score}\t";
+				$score+=$MC_Hash{$MC_score};
+				$MC_Hash{$MC_score}=$score;#print "cumul. 
score: $score\n"; + $threshhold=$MC_score and last CUTOFF if $MC_Hash{$MC_score}>=95 + } + + #sets score value for each position in MSA Aliscore array + #print "threshhold: $threshhold\n"; + map {$_+=1} @MSA_Score[$j..($j+($window-1))] if $Org_score > $threshhold; + map {$_-=1} @MSA_Score[$j..($j+($window-1))] if $Org_score <= $threshhold; + #print "@MSA_Score\n"; + + }#end of else TESTING + + + #cleaning of runing variables + ($Org_score,$MC_score,$threshhold)=(); + undef %MC_Hash; + + }#end of sliding window and for loop + + return \@MSA_Score; + + } #end of sub pars_score_two + + +#_______________________________________________________________________________________ + + + +sub aa_score_two { + + my ($ref_scoring,$threshhold,$window,$seq1,$seq2) = @_; + + #sequences are read as references, $seq1, $seq2 are references + my @basket = ($seq1,$seq2); + + #check whether sequences are of equal length + die "sequences are not of equal length!\n" if (my $length1 = @$seq1) != (my $length2 = @$seq2); + my $length = $length1; + + #declare and initialize return array of alignment quality scores and references to input sequences + my @MSA_Score = (0)x $length; + + + #start sliding window approach for MC quality score, labeled SLIDE + SLIDE: foreach my $j (0..$length-($window+1)){ + + my $Org_score ; + my $position = $j; + + #evaluates window of original sequences + + $Org_score += $$ref_scoring{$seq1->[$position].$seq2->[$position]}, $position++ for (1..$window);#print $Org_score,"\n";exit; + + #print "org_score: $Org_score, position: $j\n";#exit; + + + #PERFECT: + if ($Org_score > $threshhold) { + map {$_+=1} @MSA_Score[$j..($j+($window-1))] and next SLIDE; + } + #NOTHING: + if ($Org_score <= $threshhold) { + map {$_-=1} @MSA_Score[$j..($j+($window-1))] and next SLIDE;#print "@MSA_Score\n"; + } + + + }#end of else SLIDE + + return \@MSA_Score; + + } #end of sub pars_score_two + + +#_______________________________________________________________________________________ + + +#sub consensus sequence for two 2-dimensional sequence arrays + +sub conseq { + my ($seq1,$seq2)=@_; + my @conseq; + #check whether sequences are of equal length + die "sequences are not of equal length!\n" if (my $length1=@$seq1)!= (my $length2=@$seq2); + #compares 2-dimensional arrays + + for my $i (0..$length1-1){ + my (@position,$hit); + my $dimension=@{$seq1->[$i]}; + for my $n (0..$dimension-1){ + push @position, $seq1->[$i][$n] if 0 == ($hit=(grep /$seq1->[$i][$n]/,@{$seq2->[$i]})); + } + push @position, @{$seq2->[$i]};#print "pos: @position\n"; + push @conseq, \@position; + } + return @conseq; + } + +#_______________________________________________________________________________________ + +sub consensus_sequence { + my ($table,$seq1,$seq2)=@_; + my @conseq; + my $length1=@$seq1;#print $length1,"\n",$seq1->[0],"\n";exit; + + die "sequences are not of equal length!\n" if ($length1!= (my $length2=@$seq2)); + + push @conseq, ($$table{$seq1->[$_].$seq2->[$_]}) foreach (0..$length1-1); + return @conseq; + } + +#_______________________________________________________________________________________ + +#sub for overlap of arrays + +sub overlap { + my ($arr1,$arr2)=@_; + my (%hash2,@overlap); + + $hash2{$_}=1 for (@$arr2); + for (0..$#$arr1){push @overlap, $arr1->[$_] if (defined $hash2{$arr1->[$_]})}; + + my $hit=@overlap; + return \$hit; + } +#_______________________________________________________________________________________ + +sub overlap_explicit { + my ($arr1,$arr2)=@_; + my (%hash2,@overlap); + + $hash2{$_}=1 for 
(@$arr2); + for (0..$#$arr1){push @overlap, $arr1->[$_] if (defined $hash2{$arr1->[$_]})}; + + return \@overlap; + } + +#_______________________________________________________________________________________ + + +#sub read FASTA file, returns references to fasta hash, keys taxa, values sequences + +sub readFASTA_simple { + + + my ($taxon,@FASTA,%FASTA,%STRUCTURES,$number,$type,$aa_type); + + $aa_type = 0 ; + + my ( $file , $ingo ) = @_; + + + #makes sure that linefeeds are just naked \n and nothing else + + (open IN , $file ) || die "file $file can not be opened!\n"; close IN ; + + (tie ( my @data, 'Tie::File', $file )); + + die "file $file is empty!\n" if 0 == @data ; + map { s/\r\n/\n/g } @data ; + map { s/\r/\n/g } @data ; + + untie @data ; + + open (my $fh, "<",$file) || die "file $file not found! terminated\n"; + + READING: + + while ( <$fh> ) { + + die "convert from mac format!\n" if /\r[^\n]/; + + chomp ; + + next READING if /^\s+$/; + die "taxon name missing!\n" if /^\>$/ ; + + if ( /\>/ ) { + + $taxon = $_ ; + $taxon =~ s/^\>\s*/\>/g ; + $taxon =~ s/\s*$//g ; + die "multiple taxa of $taxon\n" if defined $FASTA{$taxon} ; + next READING ; + + } + + s/\s*//g and $FASTA{$taxon} .= $_ ; + + } + + + for my $key ( keys %FASTA ) { + + my $NAMEERROR = do { "forbidden sign or space in\n$key\n" } ; + + for ( $key ) { + + die $NAMEERROR if /\|/ ; + die $NAMEERROR if /\(/ ; + die $NAMEERROR if /\)/ ; + die $NAMEERROR if /\:/ ; + die $NAMEERROR if /\;/ ; + die $NAMEERROR if /\,/ ; + die $NAMEERROR if /--/ ; + die $NAMEERROR if /\*/ ; + die $NAMEERROR if /\s/ ; + + } + + die "file is not in FASTA format\n" if $key !~ /^\>/; + die "file is not in FASTA format\n" if $FASTA{$key} =~ /\>/ ; + die "not enough sequences!\n" if 1>= keys %FASTA ; + + my %seen = () ; + + my @TAXA = grep { ! $seen{$_}++ } keys %FASTA ; + + } + + + #cleaning of taxa and sequences removing leading and trailing blanks + + grep s/^\>// || s/^\s*// || s/\s*$// , @FASTA = %FASTA; + + %FASTA = @FASTA; + + + #identifying structure strings, cleaning, reading and modifying sequences using input options + + for $taxon ( keys %FASTA ) { + + no warnings ; + + if ( $FASTA{$taxon} =~ /\(/ && /\)/ && /\./ ) { + + s/ //g; + + die "structure not balanced\n" if grep /\(/,$FASTA{$taxon} != grep /\)/,$FASTA{$taxon}; + + print "\tstructure: $taxon \-\> ",length ( $FASTA{$taxon} ),"\n"; + + $STRUCTURES{$taxon} = $FASTA{$taxon}; + + delete $FASTA{$taxon} + } + + else { + + $number ++; + + $FASTA{$taxon} = uc( $FASTA{$taxon} ); + + $FASTA{$taxon} =~ s/ //g; #print $taxon,"\n"; + + die "\nsequence $taxon contains unknown characters!\nprocess terminated!\n" if $FASTA{$taxon} =~ /[^A-Z\?\-\*]/; + + my $abs_indelN = ($FASTA{$taxon} =~ tr/N\-// ); + + if ( length( $FASTA{$taxon} ) != $abs_indelN ) { + + my $freq_ACGT = ($FASTA{$taxon} =~ tr/ACG(T|U)// )/(length( $FASTA{$taxon} ) - $abs_indelN); + my $freq_RY = ($FASTA{$taxon} =~ tr/RY// )/(length( $FASTA{$taxon} ) - $abs_indelN); + $type = $freq_ACGT > 0.8 ? 'nt' : 'aa'; + $type = $freq_RY > 0.8 ? 
'RY' : 'aa' if $type eq 'aa' + } + + if ( length( $FASTA{$taxon} ) == $abs_indelN ) { + + $number -- + } + + my @sequence = split"", $FASTA{$taxon} ; + + $type eq "nt" and do { + + grep { s/\?/N/ } @sequence ; + grep { s/\*/N/ } @sequence ; + + $ingo =~ /-/ and do { + + for ( @sequence ){ last if /\w/; s/\-/N/ } + for ( reverse @sequence ){ last if /\w/; s/\-/N/ } + + }; + + }; + + $type eq "aa" and do { + + grep { s/\?/-/ } @sequence ; + #for ( @sequence ){ last if /\w/; s/\-/X/ } + #for ( reverse @sequence ){ last if /\w/; s/\-/X/ } + + }; + $type eq 'RY' and do { + + grep { s/\?/N/ } @sequence ; + grep { s/\*/N/ } @sequence ; + die "A,C,G,T are not allowed in RY recoded sequences!\n" if grep /[ACGT]/i,@sequence; + + $ingo =~ /-/ and do { + + for ( @sequence ){ last if /\w/; s/\-/N/ } + for ( reverse @sequence ){ last if /\w/; s/\-/N/ } + + }; + + }; + #reports on sequence length + + printf "\t%-20.20s \-\> sites: %-6s\n" , $taxon, $#sequence+1; + + $FASTA{$taxon} = \@sequence; + + $aa_type++ if $type eq "aa"; + + } + + } + + die "mixture of nucleotide and amino acid sequences!\n" if (0 < ($number - $aa_type)/$number) && (1 > ($number - $aa_type)/$number) ; + + #returns reference to FASTA hash and structure hash + + return (\%FASTA,\%STRUCTURES,$type);print "\n\n";#exit; + + }#end of sub +#_______________________________________________________________________________________ + +#_______________________________________________________________________________________ +#reads tree from file, and returns list of nodes, tree must be rooted at first taxon in tree!!!!!, needs sub list_nodes!!! +sub readTOPOLOGY { + +#reads a tree in newick format and strips business parts of the tree, like brlens and support values and semi colons;taxon names may contain blanks alphanumeric signs and -, but no other signs, blanks are not modified into underscores! trees must not be fully resolved! + my ($tree) = @_; + + #makes sure that linefeeds are just naked \n and nothing else + + tie ( my @data, 'Tie::File', $tree ) or die "file $tree can not be opened!\n" ; + + map { s/\r\n/\n/g } @data ; + map { s/\r/\n/g } @data ; + + untie @data ; + + open (my ($tr),"<",$tree) || die "tree file not found!\n"; + + my @tree = <$tr>; #print "da: ","@tree\n";exit; + + die "tree is unbalanced or not in NEWICK standard format!\n" if ($tree[0]!~/\(/) || ($tree[0]!~/\;/) || ($tree[0]!~/\)/); + + $tree = join"",@tree; + + for($tree){ + s/e\-\d+//g ; + s/(\:\-*?\d+\.*\d*)//g ; + s/\;//g ; + } + + my @taxa = $tree =~ /(?i:[\w\-\. ])+/g; #print $tree,"\n";exit; + +#generates node list + my $nodes = node_list($tree); + return \@taxa,$nodes,\$tree; + + sub node_list { + + my ($tree)=@_; + $tree=~s/ {2,}/ /g;#print $tree,"\n";#exit; + die "unbalanced tree !\n" if ($tree=~/\(/)!=($tree=~/\)/); + my (%nodes,$level); + + #First level Filter + $level++; + my @selection=$tree=~/\((?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. ])*\)(?#:two or more taxa)| + \((?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. ])*\((?#:two or more taxa)| + \)(?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. ])*\)(?#:two or more taxa) + /gx; + #die "tree not dichotomous!\n" if grep /(?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. ])+/,@selection; + + grep s/\(|\)//g,@selection;#print "@selection\n"; + + #wrights nodes and level to key and value of %nodes + $nodes{$_}=$level for (@selection); + + $tree=~s/\(((?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. ])*)\(/$1\(/g; + $tree=~s/\)((?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. 
])*)\)/\)$1/g; + + clean_tree (\$tree); + + sub clean_tree { + + my ($tree)=@_; + for ($$tree){ + s/\(((?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. ])*)\)/\,$1\,/g; + s/\,*\)\,*/\)/g; + s/\,*\(\,*/\(/g; + s/\,+/\,/g; + } + }#print $tree,"\n"; + + + #Second++ level Filter + + until ($tree!~/\(/ || $tree!~/\)/){ + $level++; + my @selection=$tree=~/\((?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. ])*\)(?#:two or more taxa)/gx; + + # die "tree not dichotomous!\n" if grep /(?i:[\w\-\. ])+\,(?i:[\w\-\. ])+(?i:[\w\-\,\. ])+/,@selection; + + grep s/\(|\)//g,@selection; + + #wrights nodes and level to key and value of %nodes + $nodes{$_}=$level for (@selection); + clean_tree (\$tree);#print $tree,"\n"; + }#end of second level Filter + + #map {print $_,"\t",$nodes{$_},"\n"} (keys %nodes);exit; + return \%nodes + }#end of sub node_list + +}#end of readTopology + + +sub hamming_distance { + +=pod + +A simple match/mismatch distance matrix is calculated from sequence data, using a FASTA file as input file. Only the upper triangle without the zero diagonal is computed. Data is stored in a hash structure with taxon names separated by * as keys for distances between these two taxa as values. It returns a reference to the full distance matrix. +N's are ignored as ambiguous characters, all other ambiguities and indels are considered real differences. If Aliscore, for example was set to indels as ambiguous characters, -N option, all indels will be ignored. + +=cut + + + my ( $ref_FASTA, @taboo ) = @_; + + my @TAXA = keys %$ref_FASTA; + my %dist_matrix = (); + + +print <= @TAXA ; + + until ( 1 == @TAXA ) { + + my $first = shift @TAXA ; + + for my $second ( @TAXA ) { + + my $distance = 0; + my $length = @{$$ref_FASTA{$first}}; + + COMPARISON: for ( 0..$#{$$ref_FASTA{$first}} ) { + + $distance ++ and next COMPARISON if ${$$ref_FASTA{$first}}[$_] ne ${$$ref_FASTA{$second}}[$_] ; + + $length -- and next COMPARISON if ( ${$$ref_FASTA{$first}}[$_] | ${$$ref_FASTA{$second}}[$_] ) eq 'N'; + + } + + $dist_matrix{$first."*".$second} = $distance / $length ; + + } + + } + + return \%dist_matrix ; + +} + +sub net_divergence { + +=pod + +This subroutine expects as input a distance matrix. Full distance matrices will not result in correct calculations of net divergence of unique taxa. It further expects that distance matrix is handed over as a hash structure in which distances are values and both taxa separated by a * are associated keys. The subroutine extracts a list of unique taxa from the keys list of the distance matrix and extracts all keys from the distance matrix in which a particular taxon is involved. From these entries a net divergence for each taxon is then calculated. +The subroutine returns a reference to a hash with keys of unique taxa and associated net divergence values. + + net_div(i) = sum d(ij) for all j from i ... N + +Remark: +Identical taxa names would lead to wrong net divergence calculations. Currently, the subroutine does not check for doubled entries of taxon names and it will also not report on these if present. + +=cut + + my ( $dist_matrix ) = @_ ; + + my ( %net_div, @TAXA, @tmp, %seen ); + + + push @tmp , (split /\*/ , $_ ) for ( keys %$dist_matrix ) ; + + @TAXA = grep { ! 
$seen{$_}++ } @tmp ; + + + for my $taxon ( @TAXA ) { + + my @set = grep /\Q$taxon\E/, keys %$dist_matrix ; + + $net_div{$taxon} += $$dist_matrix{$_} for ( @set ) + + } + + return \%net_div ; + +} + + +sub smallest_relative_distance_pair { + +=pod + +Calculates an upper triangle relative distance matrix ( not including zero diagonals and both symetric cases) by using two input matrices, first the absolute distance matrix and secondly a net divergence matriy for all unique taxa. If taxon names do not correspond, the calculation of the relative distance matrix will be wrong. If the distance matrix has double entries of taxa, 'no-unique' taxa names, it will give wrong results, because the association of net divergence taxon names and distance taxon names will not be unique. +It returns a reference to the relative full distance matrix stored in a hash structure, taxa separated by * as keys, and relative distances as associated values. + +M(ij) = d(ij)-[ net_div(i)+net_div(j)]/(N-2) + +=cut + + + my ( $dist_matrix, $net_div ) = @_ ; + + my ( $closest_dist, $closest_pair, @TAXA, @tmp, %seen ) ; + + push @tmp , (split /\*/ , $_ ) for ( keys %$dist_matrix ) ; + + @TAXA = grep { ! $seen{$_}++ } @tmp ; + + my $NTAXA = @TAXA ; + + for my $key ( keys %$dist_matrix ) { + + if ( 3 > $NTAXA ) { + + $closest_pair = $key and return $closest_pair ; + + } + + else { + + my ( $first, $second ) = split /\*/ , $key ; + + $closest_pair = $key ; + + $closest_dist = $$dist_matrix{$key} - ( $$net_div{$first} + $$net_div{$second} ) / ( $NTAXA - 2 ) ; + + last + + } + + } + + for my $key ( keys %$dist_matrix ) { + + my ( $first, $second ) = split /\*/ , $key ; + + my $min_relative_dist = $$dist_matrix{$key} - ( $$net_div{$first} + $$net_div{$second} ) / ( $NTAXA - 2 ) ; + + if ( $closest_dist > $min_relative_dist ) { + + $closest_pair = $key ; + $closest_dist = $min_relative_dist ; + + } + + } + + return $closest_pair + +} + + +sub closest_pair_net_divergence { + + my ( $dist_matrix ) = @_ ; + + my $net_div = net_divergence ( $dist_matrix ) ; + + my $closest_pair = smallest_relative_distance_pair ( $dist_matrix, $net_div ) ; + + return $closest_pair, $net_div + +} + + +sub NJ_tree { + + no warnings ; + +=pod + +This subroutine calculates a NJ tree given a set of taxa and its associated set of distances. It reads a hash list of distances with taxon names separated with * as key entries. If taxon namesare not separated by * it will fail to construct a correct NJ tree. Any list of distances can be used, given that they contain positive values. + + +=cut + + my ( $dist_matrix ) = @_ ; + + my @node_list = () ; + +=pod + +Loop through the distance matrix to create a full NJ tree. + +=cut + +print<" , "$file.profiles.svg" ; + + my $width = @{$ref_Profiles->[0]} ; + my $height = $width/10 ; + my $y_zero = $height/2 ; + + my $framewidth = $width ; + my $frameheight= $height; + my $unit_height= $height/5; + + +print $fh_matrix_out < + + + + + + + + + +FRAME1 + + +for (1..5){ + + $unit_height *= $_; + print $fh_matrix_out '',"\n"; + print $fh_matrix_out '',"\n"; + print {$fh_matrix_out} '',$_/5,'',"\n"; + print {$fh_matrix_out} '','-',$_/5,'',"\n"; + $unit_height /= $_; +} + + +for (my $x=0;$x<$width;$x+=10){ + + if ($x%100 == 0){ + print $fh_matrix_out '',"\n"; + } + else{ + print $fh_matrix_out '',"\n"; + } +} + + +my $median = shift @{$ref_Profiles} ; + + +for my $profile (@{$ref_Profiles}){ + + my $profile_line = '' ; + + for (my $x=1,my $y=0;$x<=$width;$x++,$y++){ + $profile_line .= $x.' 
'.($y_zero-${$profile}[$y]*$height/2).','; + } + + chop $profile_line ; #print $profile_line,"\n";exit; + +print $fh_matrix_out < + +PROFILE + +} + +my $median_line= '' ; + +for (my $x=1,my $y=0;$x<=$width;$x++,$y++){ + $median_line .= $x.' '.($y_zero-${$median}[$y]*$height/2).',';#print ${$median}[$y],"\t",$height/2,"\t",$y_zero,"\t",${$median}[$y]*$height/2,"\t",${$median}[$y]*$height/2+$y_zero,"\n";exit; + } + chop $median_line ; #print $median_line,"\n";exit; + +print $fh_matrix_out < + +MEDIANLINE + + +my $median_line1 = '0'.' '.$y_zero.',' ; +my $median_line2 = '0'.' '.$y_zero.',' ; + +for (my $x=1,my $y=0;$x<=$width;$x++,$y++){ + if (${$median}[$y]<= 0){ + $median_line1 .= $x.' '.($y_zero).',' ; + $median_line2 .= $x.' '.($y_zero-${$median}[$y]*$height/2).','; + } + else{ + $median_line1 .= $x.' '.($y_zero-${$median}[$y]*$height/2).',' ; + $median_line2 .= $x.' '.($y_zero).',' ; + } +} + +$median_line1 .= $width.' '.$y_zero.','.'0'.' '.$y_zero; +$median_line2 .= $width.' '.$y_zero.','.'0'.' '.$y_zero; + +print $fh_matrix_out < + + +MEDIAN + + + +print $fh_matrix_out < + + +FINISH + + + +} + + + + + + + +1; + + + + + + + + + + + diff --git a/skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py b/skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py new file mode 100755 index 0000000..7f86e14 --- /dev/null +++ b/skills/phylo_from_buscos/scripts/query_ncbi_assemblies.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Query NCBI for available genome assemblies by taxon name + +Usage: + python query_ncbi_assemblies.py --taxon "Coleoptera" + python query_ncbi_assemblies.py --taxon "Drosophila" --max-results 50 + python query_ncbi_assemblies.py --taxon "Apis" --refseq-only + +Requires: ncbi-datasets-pylib (pip install ncbi-datasets-pylib) + +Author: Bruno de Medeiros (Field Museum) +""" + +import argparse +import sys + + +def query_assemblies_by_taxon(taxon, max_results=20, refseq_only=False): + """ + Query NCBI for genome assemblies of a given taxon + + Args: + taxon: Taxon name (e.g., "Coleoptera", "Drosophila melanogaster") + max_results: Maximum number of results to return + refseq_only: If True, only return RefSeq assemblies (GCF_*) + + Returns: + List of dictionaries with assembly information + """ + try: + from ncbi.datasets import GenomeApi + from ncbi.datasets.openapi import ApiClient, ApiException + except ImportError: + print("Error: ncbi-datasets-pylib not installed", file=sys.stderr) + print("Install with: pip install ncbi-datasets-pylib", file=sys.stderr) + sys.exit(1) + + assemblies = [] + + print(f"Querying NCBI for '{taxon}' genome assemblies...") + print(f"(Limiting to {max_results} results)") + if refseq_only: + print("(RefSeq assemblies only)") + print("") + + try: + with ApiClient() as api_client: + api = GenomeApi(api_client) + + # Query genome assemblies for the taxon + genome_summary = api.genome_summary_by_taxon( + taxon=taxon, + limit=str(max_results), + filters_refseq_only=refseq_only + ) + + if not genome_summary.reports: + print(f"No assemblies found for taxon '{taxon}'") + return [] + + for report in genome_summary.reports: + assembly_info = { + 'accession': report.accession, + 'organism': report.organism.organism_name, + 'assembly_level': report.assembly_info.assembly_level, + 'assembly_name': report.assembly_info.assembly_name, + 'submission_date': report.assembly_info.release_date if hasattr(report.assembly_info, 'release_date') else 'N/A' + } + assemblies.append(assembly_info) + + except ApiException as e: + print(f"Error querying NCBI: {e}", 
file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}", file=sys.stderr) + sys.exit(1) + + return assemblies + + +def format_table(assemblies): + """ + Format assemblies as a readable table + + Args: + assemblies: List of assembly dictionaries + """ + if not assemblies: + return + + print(f"Found {len(assemblies)} assemblies:\n") + + # Print header + print(f"{'#':<4} {'Accession':<20} {'Organism':<40} {'Level':<15} {'Assembly Name':<30}") + print("-" * 110) + + # Print data rows + for i, asm in enumerate(assemblies, 1): + organism = asm['organism'][:38] + '..' if len(asm['organism']) > 40 else asm['organism'] + assembly_name = asm['assembly_name'][:28] + '..' if len(asm['assembly_name']) > 30 else asm['assembly_name'] + + print(f"{i:<4} {asm['accession']:<20} {organism:<40} {asm['assembly_level']:<15} {assembly_name:<30}") + + print("") + + +def save_accessions(assemblies, output_file): + """ + Save assembly accessions to a file + + Args: + assemblies: List of assembly dictionaries + output_file: Output file path + """ + with open(output_file, 'w') as f: + for asm in assemblies: + f.write(f"{asm['accession']}\n") + + print(f"Accessions saved to: {output_file}") + print(f"You can download these assemblies using:") + print(f" python download_ncbi_genomes.py --assemblies $(cat {output_file})") + + +def main(): + parser = argparse.ArgumentParser( + description="Query NCBI for available genome assemblies by taxon name", + epilog="Example: python query_ncbi_assemblies.py --taxon 'Coleoptera' --max-results 50" + ) + + parser.add_argument( + "--taxon", + required=True, + help="Taxon name (e.g., 'Coleoptera', 'Drosophila melanogaster')" + ) + + parser.add_argument( + "--max-results", + type=int, + default=20, + help="Maximum number of results to return (default: 20)" + ) + + parser.add_argument( + "--refseq-only", + action="store_true", + help="Only return RefSeq assemblies (GCF_* accessions)" + ) + + parser.add_argument( + "--save", + metavar="FILE", + help="Save accessions to a file for later download" + ) + + args = parser.parse_args() + + # Query NCBI + assemblies = query_assemblies_by_taxon( + taxon=args.taxon, + max_results=args.max_results, + refseq_only=args.refseq_only + ) + + # Display results + format_table(assemblies) + + # Save if requested + if args.save and assemblies: + save_accessions(assemblies, args.save) + + +if __name__ == "__main__": + main() diff --git a/skills/phylo_from_buscos/scripts/rename_genomes.py b/skills/phylo_from_buscos/scripts/rename_genomes.py new file mode 100755 index 0000000..e98cac6 --- /dev/null +++ b/skills/phylo_from_buscos/scripts/rename_genomes.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +""" +Rename genome files with clean, meaningful sample names for phylogenomics + +This script helps create a mapping between genome files (often with cryptic +accession numbers) and clean species/sample names that will appear in the +final phylogenetic tree. 
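New names are sanitized automatically: spaces become underscores and all
+characters other than letters, digits, underscores and hyphens are removed.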
+
+Usage:
+    # Interactive mode - prompts for names
+    python rename_genomes.py --interactive genome1.fasta genome2.fasta
+
+    # From mapping file (TSV: old_name<TAB>new_name)
+    python rename_genomes.py --mapping samples.tsv
+
+    # Create template mapping file
+    python rename_genomes.py --create-template *.fasta > samples.tsv
+
+Author: Bruno de Medeiros (Field Museum)
+Based on tutorials by Paul Frandsen (BYU)
+"""
+
+import argparse
+import os
+import sys
+import shutil
+from pathlib import Path
+
+
+def sanitize_name(name):
+    """
+    Sanitize a name to be phylogenomics-safe
+    - Replace spaces with underscores
+    - Remove special characters
+    - Keep only alphanumeric, underscore, hyphen
+    """
+    # Replace spaces with underscores
+    name = name.replace(' ', '_')
+    # Remove special characters except underscore and hyphen
+    name = ''.join(c for c in name if c.isalnum() or c in '_-')
+    return name
+
+
+def create_template(genome_files, output=sys.stdout):
+    """Create a template mapping file"""
+    output.write("# Sample mapping file\n")
+    output.write("# Format: original_filename<TAB>new_sample_name\n")
+    output.write("# Edit the second column with meaningful species/sample names\n")
+    output.write("# Recommended format: [ACCESSION]_[NAME] (e.g., GCA000123456_Penstemon_eatonii)\n")
+    output.write("# This keeps accession for traceability while having readable names in trees\n")
+    output.write("# Names should contain only letters, numbers, underscores, and hyphens\n")
+    output.write("#\n")
+
+    for gfile in genome_files:
+        basename = Path(gfile).stem  # Remove extension
+        output.write(f"{gfile}\t{basename}\n")
+
+
+def read_mapping(mapping_file):
+    """Read mapping from TSV file"""
+    mapping = {}
+    with open(mapping_file, 'r') as f:
+        for line in f:
+            line = line.strip()
+            # Skip comments and empty lines
+            if not line or line.startswith('#'):
+                continue
+
+            parts = line.split('\t')
+            if len(parts) != 2:
+                print(f"Warning: Skipping invalid line: {line}", file=sys.stderr)
+                continue
+
+            old_name, new_name = parts
+            new_name = sanitize_name(new_name)
+            mapping[old_name] = new_name
+
+    return mapping
+
+
+def interactive_rename(genome_files):
+    """Interactively ask for new names"""
+    mapping = {}
+
+    print("Enter new sample names for each genome file.")
+    print("Press Enter to keep the current name.")
+    print("Names will be sanitized (spaces→underscores, special chars removed)\n")
+
+    for gfile in genome_files:
+        current_name = Path(gfile).stem
+        new_name = input(f"{gfile} → [{current_name}]: ").strip()
+
+        if not new_name:
+            new_name = current_name
+
+        new_name = sanitize_name(new_name)
+        mapping[gfile] = new_name
+        print(f"  Will rename to: {new_name}.fasta\n")
+
+    return mapping
+
+
+def rename_files(mapping, dry_run=False, backup=True):
+    """Rename genome files according to mapping"""
+
+    renamed = []
+    errors = []
+
+    for old_file, new_name in mapping.items():
+        if not os.path.exists(old_file):
+            errors.append(f"File not found: {old_file}")
+            continue
+
+        # Get extension from original file
+        ext = Path(old_file).suffix
+        if not ext:
+            ext = '.fasta'
+
+        new_file = f"{new_name}{ext}"
+
+        # Check if target exists
+        if os.path.exists(new_file) and new_file != old_file:
+            errors.append(f"Target exists: {new_file}")
+            continue
+
+        # Skip if names are the same
+        if old_file == new_file:
+            print(f"Skip (no change): {old_file}")
+            continue
+
+        if dry_run:
+            print(f"[DRY RUN] Would rename: {old_file} → {new_file}")
+        else:
+            # Backup if requested
+            if backup:
+                backup_file = f"{old_file}.backup"
+                shutil.copy2(old_file, 
backup_file) + print(f"Backup created: {backup_file}") + + # Rename + shutil.move(old_file, new_file) + print(f"Renamed: {old_file} → {new_file}") + renamed.append((old_file, new_file)) + + return renamed, errors + + +def main(): + parser = argparse.ArgumentParser( + description="Rename genome files with meaningful sample names for phylogenomics", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Create template mapping file + python rename_genomes.py --create-template *.fasta > samples.tsv + # Edit samples.tsv, then apply mapping + python rename_genomes.py --mapping samples.tsv + + # Interactive renaming + python rename_genomes.py --interactive genome1.fasta genome2.fasta + + # Dry run (preview changes) + python rename_genomes.py --mapping samples.tsv --dry-run + """ + ) + + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + '--create-template', + nargs='+', + metavar='GENOME', + help='Create a template mapping file from genome files' + ) + group.add_argument( + '--mapping', + metavar='FILE', + help='TSV file with mapping (old_namenew_name)' + ) + group.add_argument( + '--interactive', + nargs='+', + metavar='GENOME', + help='Interactively rename genome files' + ) + + parser.add_argument( + '--dry-run', + action='store_true', + help='Show what would be renamed without actually renaming' + ) + + parser.add_argument( + '--no-backup', + action='store_true', + help='Do not create backup files' + ) + + args = parser.parse_args() + + # Create template + if args.create_template: + create_template(args.create_template) + return + + # Interactive mode + if args.interactive: + mapping = interactive_rename(args.interactive) + # Mapping file mode + elif args.mapping: + mapping = read_mapping(args.mapping) + else: + parser.error("No mode specified") + + if not mapping: + print("No files to rename", file=sys.stderr) + return + + # Perform renaming + renamed, errors = rename_files( + mapping, + dry_run=args.dry_run, + backup=not args.no_backup + ) + + # Summary + print("\n" + "="*60) + if args.dry_run: + print("DRY RUN - No files were actually renamed") + else: + print(f"Successfully renamed {len(renamed)} file(s)") + + if errors: + print(f"\nErrors ({len(errors)}):") + for error in errors: + print(f" - {error}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/phylo_from_buscos/scripts/run_alicut.sh b/skills/phylo_from_buscos/scripts/run_alicut.sh new file mode 100755 index 0000000..2435d9f --- /dev/null +++ b/skills/phylo_from_buscos/scripts/run_alicut.sh @@ -0,0 +1,247 @@ +#!/bin/bash + +# run_alicut.sh +# Wrapper script for running ALICUT to remove Aliscore-identified RSS positions +# Removes randomly similar sequence sections from alignments +# +# Usage: +# bash run_alicut.sh [aliscore_dir] [options] +# +# Options: +# -r Remain stem positions (for RNA secondary structures) +# -c Remove codon (translate AA positions to nucleotide triplets) +# -3 Remove only 3rd codon positions +# -s Silent mode (non-interactive, use defaults) +# +# Requirements: +# - ALICUT_V2.31.pl in PATH or same directory +# - Perl with File::Copy, Tie::File, Term::Cap modules +# - Aliscore output directory with *_List_*.txt and original .fas file + +set -euo pipefail + +# Script directory +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Check for ALICUT script +if command -v ALICUT_V2.31.pl &> /dev/null; then + ALICUT_SCRIPT="ALICUT_V2.31.pl" +elif [ -f "${SCRIPT_DIR}/ALICUT_V2.31.pl" ]; then + 
+    ALICUT_SCRIPT="${SCRIPT_DIR}/ALICUT_V2.31.pl"
+elif [ -f "./ALICUT_V2.31.pl" ]; then
+    ALICUT_SCRIPT="./ALICUT_V2.31.pl"
+else
+    echo "ERROR: ALICUT_V2.31.pl not found in PATH, script directory, or current directory"
+    echo "Please download from: https://www.zfmk.de/en/research/research-centres-and-groups/alicut"
+    exit 1
+fi
+
+# Function to display usage
+usage() {
+    cat <<EOF
+Usage: bash run_alicut.sh [aliscore_dir] [options]
+
+Options:
+    -r    Retain stem positions (for RNA secondary structures)
+    -c    Remove codons (translate AA positions to nucleotide triplets)
+    -3    Remove only 3rd codon positions
+    -s    Silent mode (non-interactive, use defaults)
+    -h    Show this help message
+EOF
+    exit 0
+}
+
+# Parse command line arguments
+ALICUT_OPTS=""
+SILENT_MODE=false
+
+if [ $# -eq 0 ]; then
+    usage
+fi
+
+ALISCORE_DIR="$1"
+shift
+
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -h|--help) usage ;;
+        -r|-c|-3) ALICUT_OPTS="${ALICUT_OPTS} $1"; shift ;;
+        -s) SILENT_MODE=true; ALICUT_OPTS="${ALICUT_OPTS} -s"; shift ;;
+        *) echo "ERROR: Unknown option: $1"; usage ;;
+    esac
+done
+
+# Validate Aliscore output directory
+if [ ! -d "${ALISCORE_DIR}" ]; then
+    echo "ERROR: Aliscore output directory not found: ${ALISCORE_DIR}"
+    exit 1
+fi
+
+cd "${ALISCORE_DIR}"
+
+# Find Aliscore List file
+LIST_FILE=$(find . -maxdepth 1 -name "*_List_*.txt" -type f 2>/dev/null | head -n 1)
+if [ -z "${LIST_FILE}" ]; then
+    echo "ERROR: No Aliscore List file found (*_List_*.txt)"
+    echo "Make sure Aliscore completed successfully"
+    exit 1
+fi
+
+echo "Found List file: ${LIST_FILE}"
+
+# Find original FASTA file
+FASTA_FILE=$(find . -maxdepth 1 \( -name "*.fas" -o -name "*.fasta" \) -type f | head -n 1 | sed 's|^\./||')
+if [ -z "${FASTA_FILE}" ]; then
+    echo "ERROR: No FASTA alignment file found (*.fas or *.fasta)"
+    echo "ALICUT requires the original alignment file in the same directory as the List file"
+    exit 1
+fi
+
+echo "Found FASTA file: ${FASTA_FILE}"
+
+# Check if List file contains RSS positions
+RSS_COUNT=$(wc -w < "${LIST_FILE}" || echo "0")
+if [ "${RSS_COUNT}" -eq 0 ]; then
+    echo "WARNING: List file is empty (no RSS positions identified)"
+    echo "Aliscore found no randomly similar sequences to remove"
+    echo "Skipping ALICUT - alignment is already clean"
+
+    # Create a symbolic link to indicate no trimming was needed
+    ln -sf "${FASTA_FILE}" "ALICUT_${FASTA_FILE}"
+    echo "Created symbolic link: ALICUT_${FASTA_FILE} -> ${FASTA_FILE}"
+
+    cd ..
+    exit 0
+fi
+
+echo "Found ${RSS_COUNT} RSS positions to remove"
+
+# Run ALICUT
+echo ""
+echo "Running ALICUT..."
+echo "Options: ${ALICUT_OPTS}"
+
+# Construct ALICUT command
+ALICUT_CMD="perl ${ALICUT_SCRIPT} ${ALICUT_OPTS}"
+
+# Capture the exit status explicitly; with set -e a plain failure would abort
+# the script before the success check below could run
+ALICUT_STATUS=0
+if [ "${SILENT_MODE}" = true ]; then
+    echo "Command: ${ALICUT_CMD}"
+    eval ${ALICUT_CMD} || ALICUT_STATUS=$?
+else
+    echo "Running ALICUT in interactive mode..."
+    echo "Press 's' and Enter to start with current options"
+    echo ""
+    perl "${ALICUT_SCRIPT}" ${ALICUT_OPTS} || ALICUT_STATUS=$?
+fi
+
+# Check if ALICUT completed successfully
+if [ ${ALICUT_STATUS} -eq 0 ]; then
+    echo ""
+    echo "ALICUT completed successfully"
+
+    # Find output file
+    OUTPUT_FILE=$(ls ALICUT_*.fas ALICUT_*.fasta 2>/dev/null | head -n 1)
+
+    if [ -n "${OUTPUT_FILE}" ]; then
+        echo ""
+        echo "Output files:"
+        ls -lh ALICUT_* 2>/dev/null
+
+        # Calculate and report trimming statistics (handle multi-line FASTA format;
+        # strip the trailing newline so wc -c counts residues only)
+        if [ -f "${OUTPUT_FILE}" ]; then
+            ORIGINAL_LENGTH=$(awk '/^>/ {if (seq) {print seq; seq=""}; next} {seq = seq $0} END {if (seq) print seq}' "${FASTA_FILE}" | head -n 1 | tr -d '\n' | wc -c)
+            TRIMMED_LENGTH=$(awk '/^>/ {if (seq) {print seq; seq=""}; next} {seq = seq $0} END {if (seq) print seq}' "${OUTPUT_FILE}" | head -n 1 | tr -d '\n' | wc -c)
+            REMOVED_LENGTH=$((ORIGINAL_LENGTH - TRIMMED_LENGTH))
+            PERCENT_REMOVED=$(awk "BEGIN {printf \"%.1f\", (${REMOVED_LENGTH}/${ORIGINAL_LENGTH})*100}")
+
+            echo ""
+            echo "Trimming statistics:"
+            echo "  Original length: ${ORIGINAL_LENGTH} bp"
+            echo "  Trimmed length:  ${TRIMMED_LENGTH} bp"
+            echo "  Removed:         ${REMOVED_LENGTH} bp (${PERCENT_REMOVED}%)"
+        fi
+
+        # Check for info file
+        if [ -f "ALICUT_info.xls" ]; then
+            echo ""
+            echo "Detailed statistics in: ALICUT_info.xls"
+        fi
+    else
+        echo "WARNING: Expected output file ALICUT_*.fas not found"
+    fi
+else
+    echo "ERROR: ALICUT failed"
+    cd ..
+    exit 1
+fi
+
+# Return to parent directory
+cd ..
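+
+# Optional sanity check (a sketch; assumes the ALICUT_ output prefix used above):
+# the trimmed alignment should retain every taxon from the original, e.g.
+#   grep -c '^>' "${ALISCORE_DIR}/${FASTA_FILE}"          # taxa before trimming
+#   grep -c '^>' "${ALISCORE_DIR}/ALICUT_${FASTA_FILE}"   # taxa after (should match)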
+
+echo ""
+echo "Done: ${ALISCORE_DIR}"
diff --git a/skills/phylo_from_buscos/scripts/run_aliscore.sh b/skills/phylo_from_buscos/scripts/run_aliscore.sh
new file mode 100755
index 0000000..2fa6d9d
--- /dev/null
+++ b/skills/phylo_from_buscos/scripts/run_aliscore.sh
@@ -0,0 +1,248 @@
+#!/bin/bash
+
+# run_aliscore.sh
+# Wrapper script for running Aliscore on aligned sequences
+# Identifies randomly similar sequence sections (RSS) in multiple sequence alignments
+#
+# Usage:
+#   bash run_aliscore.sh [alignment.fas] [options]
+#
+# Options:
+#   -w INT    Window size (default: 4)
+#   -r INT    Number of random pairs to compare (default: 4*N taxa)
+#   -N        Treat gaps as ambiguous characters (recommended for amino acids)
+#   -t TREE   Tree file in Newick format for guided comparisons
+#   -l LEVEL  Node level for tree-based comparisons
+#   -o TAXA   Comma-separated list of outgroup taxa
+#   -d DIR    Output directory (default: aliscore_output)
+#
+# Array job usage:
+#   Set SLURM_ARRAY_TASK_ID or PBS_ARRAYID environment variable
+#   Create locus_list.txt with one alignment file per line
+#
+# Requirements:
+#   - Aliscore.02.2.pl in PATH or same directory
+#   - Perl with Tie::File and Fcntl modules
+
+set -euo pipefail
+
+# Script directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Check for Aliscore script
+if command -v Aliscore.02.2.pl &> /dev/null; then
+    ALISCORE_SCRIPT="Aliscore.02.2.pl"
+elif [ -f "${SCRIPT_DIR}/Aliscore.02.2.pl" ]; then
+    ALISCORE_SCRIPT="${SCRIPT_DIR}/Aliscore.02.2.pl"
+elif [ -f "./Aliscore.02.2.pl" ]; then
+    ALISCORE_SCRIPT="./Aliscore.02.2.pl"
+else
+    echo "ERROR: Aliscore.02.2.pl not found in PATH, script directory, or current directory"
+    echo "Please download from: https://www.zfmk.de/en/research/research-centres-and-groups/aliscore"
+    exit 1
+fi
+
+# Function to display usage
+usage() {
+    cat <<EOF
+Usage: bash run_aliscore.sh [alignment.fas] [options]
+
+Options:
+    -w INT    Window size (default: 4)
+    -r INT    Number of random pairs to compare (default: 4*N taxa)
+    -N        Treat gaps as ambiguous characters (recommended for amino acids)
+    -t TREE   Tree file in Newick format for guided comparisons
+    -l LEVEL  Node level for tree-based comparisons
+    -o TAXA   Comma-separated list of outgroup taxa
+    -d DIR    Output directory (default: aliscore_output)
+    -h        Show this help message
+
+Array job usage:
+    Set SLURM_ARRAY_TASK_ID or PBS_ARRAYID and create a locus list with:
+    ls *.fas > locus_list.txt
+
+Examples:
+    # Basic run with defaults (outputs to aliscore_output/)
+    bash run_aliscore.sh alignment.fas
+
+    # Amino acid sequences with gaps as ambiguous
+    bash run_aliscore.sh protein_alignment.fas -N
+
+    # Custom output directory
+    bash run_aliscore.sh alignment.fas -d my_aliscore_results
+
+    # Custom window size and random pairs
+    bash run_aliscore.sh alignment.fas -w 6 -r 100
+
+    # Tree-guided analysis
+    bash run_aliscore.sh alignment.fas -t species.tre
+
+    # Array job on SLURM
+    ls aligned_aa/*.fas > locus_list.txt
+    sbatch --array=1-\$(wc -l < locus_list.txt) run_aliscore_array.job
+
+Output Files (in aliscore_output/aliscore_[alignment]/):
+    - [alignment]_List_random.txt   : Positions identified as RSS (for ALICUT)
+    - [alignment]_Profile_random.txt: Quality profile for each position
+    - [alignment].svg               : Visual plot of scoring profiles
+
+Citation:
+    Misof B, Misof K (2009) A Monte Carlo approach successfully identifies
+    randomness in multiple sequence alignments: a more objective means of data
+    exclusion. Syst Biol 58(1):21-34. doi: 10.1093/sysbio/syp006
+
+EOF
+    exit 0
+}
+
+# Parse command line arguments
+ALIGNMENT=""
+ALISCORE_OPTS=""
+BASE_OUTPUT_DIR="aliscore_output"
+
+if [ $# -eq 0 ]; then
+    usage
+fi
+
+# Check for array job mode
+ARRAY_MODE=false
+ARRAY_ID=""
+
+if [ -n "${SLURM_ARRAY_TASK_ID:-}" ]; then
+    ARRAY_MODE=true
+    ARRAY_ID="${SLURM_ARRAY_TASK_ID}"
+elif [ -n "${PBS_ARRAYID:-}" ]; then
+    ARRAY_MODE=true
+    ARRAY_ID="${PBS_ARRAYID}"
+fi
+
+# If in array mode, get alignment from locus list
+if [ "${ARRAY_MODE}" = true ]; then
+    if [ ! -f "locus_list.txt" ]; then
-f "locus_list.txt" ]; then + echo "ERROR: Array job mode requires locus_list.txt" + echo "Create with: ls *.fas > locus_list.txt" + exit 1 + fi + + ALIGNMENT=$(sed -n "${ARRAY_ID}p" locus_list.txt) + + if [ -z "${ALIGNMENT}" ]; then + echo "ERROR: Could not read alignment for array index ${ARRAY_ID}" + exit 1 + fi + + echo "Array job ${ARRAY_ID}: Processing ${ALIGNMENT}" + + # Remaining arguments are Aliscore options + shift $# # Clear positional parameters + set -- "$@" # Reset with remaining args +else + # First argument is alignment file + ALIGNMENT="$1" + shift +fi + +# Validate alignment file exists +if [ ! -f "${ALIGNMENT}" ]; then + echo "ERROR: Alignment file not found: ${ALIGNMENT}" + exit 1 +fi + +# Parse Aliscore options +while [ $# -gt 0 ]; do + case "$1" in + -h|--help) + usage + ;; + -d|--output-dir) + BASE_OUTPUT_DIR="$2" + shift 2 + ;; + -w) + ALISCORE_OPTS="${ALISCORE_OPTS} -w $2" + shift 2 + ;; + -r) + ALISCORE_OPTS="${ALISCORE_OPTS} -r $2" + shift 2 + ;; + -N) + ALISCORE_OPTS="${ALISCORE_OPTS} -N" + shift + ;; + -t) + if [ ! -f "$2" ]; then + echo "ERROR: Tree file not found: $2" + exit 1 + fi + ALISCORE_OPTS="${ALISCORE_OPTS} -t $2" + shift 2 + ;; + -l) + ALISCORE_OPTS="${ALISCORE_OPTS} -l $2" + shift 2 + ;; + -o) + ALISCORE_OPTS="${ALISCORE_OPTS} -o $2" + shift 2 + ;; + *) + echo "ERROR: Unknown option: $1" + usage + ;; + esac +done + +# Get alignment name without extension +ALIGNMENT_NAME=$(basename "${ALIGNMENT}" .fas) +ALIGNMENT_NAME=$(basename "${ALIGNMENT_NAME}" .fasta) + +# Create base output directory and specific directory for this alignment +mkdir -p "${BASE_OUTPUT_DIR}" +OUTPUT_DIR="${BASE_OUTPUT_DIR}/aliscore_${ALIGNMENT_NAME}" +mkdir -p "${OUTPUT_DIR}" + +# Copy alignment to output directory +cp "${ALIGNMENT}" "${OUTPUT_DIR}/" + +# Change to output directory +cd "${OUTPUT_DIR}" + +# Run Aliscore +echo "Running Aliscore on ${ALIGNMENT}..." +echo "Options: ${ALISCORE_OPTS}" +echo "Aliscore script: ${ALISCORE_SCRIPT}" + +# Construct and run Aliscore command +ALISCORE_CMD="perl -I${SCRIPT_DIR} ${ALISCORE_SCRIPT} -i $(basename ${ALIGNMENT}) ${ALISCORE_OPTS}" +echo "Command: ${ALISCORE_CMD}" + +eval ${ALISCORE_CMD} + +# Check if Aliscore completed successfully +if [ $? -eq 0 ]; then + echo "Aliscore completed successfully for ${ALIGNMENT}" + + # List output files + echo "" + echo "Output files in ${OUTPUT_DIR}:" + ls -lh *List*.txt *Profile*.txt *.svg 2>/dev/null || echo " (some expected files not generated)" + + # Report RSS positions if found + if [ -f "$(basename ${ALIGNMENT})_List_random.txt" ]; then + RSS_COUNT=$(wc -w < "$(basename ${ALIGNMENT})_List_random.txt") + echo "" + echo "Identified ${RSS_COUNT} randomly similar sequence positions" + echo "See: ${OUTPUT_DIR}/$(basename ${ALIGNMENT})_List_random.txt" + fi +else + echo "ERROR: Aliscore failed for ${ALIGNMENT}" + cd .. + exit 1 +fi + +# Return to parent directory +cd .. + +echo "Done: ${ALIGNMENT} -> ${OUTPUT_DIR}" diff --git a/skills/phylo_from_buscos/scripts/run_aliscore_alicut_batch.sh b/skills/phylo_from_buscos/scripts/run_aliscore_alicut_batch.sh new file mode 100755 index 0000000..dd0d886 --- /dev/null +++ b/skills/phylo_from_buscos/scripts/run_aliscore_alicut_batch.sh @@ -0,0 +1,270 @@ +#!/bin/bash + +# run_aliscore_alicut_batch.sh +# Batch processing script for Aliscore + ALICUT alignment trimming +# Processes all alignments in a directory through both tools sequentially +# +# Usage: +# bash run_aliscore_alicut_batch.sh [alignment_dir] [options] +# +# This script: +# 1. 
+# 2. Runs ALICUT on each Aliscore output to remove RSS
+# 3. Collects trimmed alignments in output directory
+#
+# Requirements:
+#   - run_aliscore.sh and run_alicut.sh in same directory or PATH
+#   - Aliscore.02.2.pl and ALICUT_V2.31.pl available
+
+set -euo pipefail
+
+# Script directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Function to display usage
+usage() {
+    cat <<EOF
+Usage: bash run_aliscore_alicut_batch.sh [alignment_dir] [options]
+
+Options:
+    -d DIR    Output directory for trimmed alignments (default: trimmed_alignments)
+    -N        Pass -N to Aliscore (gaps as ambiguous; recommended for amino acids)
+    -h        Show this help message
+EOF
+    exit 0
+}
+
+# Parse command line arguments
+ALISCORE_OPTS=""
+ALICUT_OPTS="-s"               # run ALICUT non-interactively in batch mode
+ALISCORE_BASE_DIR="aliscore_output"
+OUTPUT_DIR="trimmed_alignments"
+
+if [ $# -eq 0 ]; then
+    usage
+fi
+
+ALIGNMENT_DIR="$1"
+shift
+
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -h|--help) usage ;;
+        -d) OUTPUT_DIR="$2"; shift 2 ;;
+        -N) ALISCORE_OPTS="${ALISCORE_OPTS} -N"; shift ;;
+        *) echo "ERROR: Unknown option: $1"; usage ;;
+    esac
+done
+
+# Validate input directory
+if [ ! -d "${ALIGNMENT_DIR}" ]; then
+    echo "ERROR: Alignment directory not found: ${ALIGNMENT_DIR}"
+    exit 1
+fi
+
+# Locate helper scripts
+RUN_ALISCORE="${SCRIPT_DIR}/run_aliscore.sh"
+RUN_ALICUT="${SCRIPT_DIR}/run_alicut.sh"
+for helper in "${RUN_ALISCORE}" "${RUN_ALICUT}"; do
+    if [ ! -f "${helper}" ]; then
+        echo "ERROR: Helper script not found: ${helper}"
+        exit 1
+    fi
+done
+
+# Collect alignments
+mapfile -t ALIGNMENTS < <(find "${ALIGNMENT_DIR}" -maxdepth 1 \( -name "*.fas" -o -name "*.fasta" \) -type f | sort)
+if [ ${#ALIGNMENTS[@]} -eq 0 ]; then
+    echo "ERROR: No alignments (*.fas or *.fasta) found in ${ALIGNMENT_DIR}"
+    exit 1
+fi
+
+echo "Found ${#ALIGNMENTS[@]} alignments to process"
+
+mkdir -p "${OUTPUT_DIR}"
+
+# Initialize summary file
+SUMMARY_FILE="${OUTPUT_DIR}/trimming_summary.tsv"
+echo -e "locus\toriginal_length\ttrimmed_length\tremoved\tpercent_removed\trss_positions" > "${SUMMARY_FILE}"
+
+# Process each alignment
+SUCCESS_COUNT=0
+FAIL_COUNT=0
+
+for ALIGNMENT in "${ALIGNMENTS[@]}"; do
+    LOCUS=$(basename "${ALIGNMENT}" .fas)
+    LOCUS=$(basename "${LOCUS}" .fasta)
+
+    echo "=========================================="
+    echo "Processing: ${LOCUS}"
+    echo "=========================================="
+
+    # Step 1: Run Aliscore
+    echo ""
+    echo "Step 1/2: Running Aliscore..."
+
+    if bash "${RUN_ALISCORE}" "${ALIGNMENT}" -d "${ALISCORE_BASE_DIR}" ${ALISCORE_OPTS}; then
+        echo "Aliscore completed for ${LOCUS}"
+    else
+        echo "ERROR: Aliscore failed for ${LOCUS}"
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+        continue
+    fi
+
+    # Step 2: Run ALICUT
+    echo ""
+    echo "Step 2/2: Running ALICUT..."
+
+    ALISCORE_DIR="${ALISCORE_BASE_DIR}/aliscore_${LOCUS}"
+
+    if [ ! -d "${ALISCORE_DIR}" ]; then
+        echo "ERROR: Aliscore output directory not found: ${ALISCORE_DIR}"
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+        continue
+    fi
+
+    if bash "${RUN_ALICUT}" "${ALISCORE_DIR}" ${ALICUT_OPTS}; then
+        echo "ALICUT completed for ${LOCUS}"
+    else
+        echo "ERROR: ALICUT failed for ${LOCUS}"
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+        continue
+    fi
+
+    # Copy trimmed alignment to output directory
+    TRIMMED_FILE=$(find "${ALISCORE_DIR}" \( -name "ALICUT_*.fas" -o -name "ALICUT_*.fasta" \) | head -n 1)
+
+    if [ -n "${TRIMMED_FILE}" ] && [ -f "${TRIMMED_FILE}" ]; then
+        cp "${TRIMMED_FILE}" "${OUTPUT_DIR}/${LOCUS}_trimmed.fas"
+        echo "Trimmed alignment: ${OUTPUT_DIR}/${LOCUS}_trimmed.fas"
+
+        # Calculate statistics (handle multi-line FASTA format; strip spaces and
+        # the trailing newline so wc -c counts residues only)
+        ORIGINAL_LENGTH=$(awk '/^>/ {if (seq) {print seq; seq=""}; next} {seq = seq $0} END {if (seq) print seq}' "${ALIGNMENT}" | head -n 1 | tr -d ' \n' | wc -c)
+        TRIMMED_LENGTH=$(awk '/^>/ {if (seq) {print seq; seq=""}; next} {seq = seq $0} END {if (seq) print seq}' "${TRIMMED_FILE}" | head -n 1 | tr -d ' \n' | wc -c)
+        REMOVED_LENGTH=$((ORIGINAL_LENGTH - TRIMMED_LENGTH))
+        PERCENT_REMOVED=$(awk "BEGIN {printf \"%.2f\", (${REMOVED_LENGTH}/${ORIGINAL_LENGTH})*100}")
+
+        # Count RSS positions
+        LIST_FILE=$(find "${ALISCORE_DIR}" -name "*_List_*.txt" | head -n 1)
+        RSS_COUNT=$(wc -w < "${LIST_FILE}" 2>/dev/null || echo "0")
+
+        # Append to summary
+        echo -e "${LOCUS}\t${ORIGINAL_LENGTH}\t${TRIMMED_LENGTH}\t${REMOVED_LENGTH}\t${PERCENT_REMOVED}\t${RSS_COUNT}" >> "${SUMMARY_FILE}"
+
+        SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
+    else
+        echo "WARNING: Trimmed file not found for ${LOCUS}"
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+    fi
+
+    echo ""
+done
+
+# Final report
+echo "=========================================="
+echo "BATCH PROCESSING COMPLETE"
+echo "=========================================="
+echo ""
+echo "Successfully processed: ${SUCCESS_COUNT}/${#ALIGNMENTS[@]} alignments"
+echo "Failed: ${FAIL_COUNT}/${#ALIGNMENTS[@]} alignments"
+echo ""
+echo "Output directory: ${OUTPUT_DIR}"
+echo "Trimmed alignments: ${OUTPUT_DIR}/*_trimmed.fas"
+echo "Summary statistics: ${SUMMARY_FILE}"
+echo ""
+
+# Display summary statistics
+if [ ${SUCCESS_COUNT} -gt 0 ]; then
+    echo "Overall trimming statistics:"
+    awk 'NR>1 {
+        total_orig += $2;
+        total_trim += $3;
+        
total_removed += $4; + count++ + } + END { + if (count > 0) { + avg_removed = (total_removed / total_orig) * 100; + printf " Total positions before: %d\n", total_orig; + printf " Total positions after: %d\n", total_trim; + printf " Total removed: %d (%.2f%%)\n", total_removed, avg_removed; + printf " Average per locus: %.2f%% removed\n", avg_removed; + } + }' "${SUMMARY_FILE}" +fi + +echo "" +echo "Done!" diff --git a/skills/phylo_from_buscos/templates/README.md b/skills/phylo_from_buscos/templates/README.md new file mode 100644 index 0000000..5c5da6d --- /dev/null +++ b/skills/phylo_from_buscos/templates/README.md @@ -0,0 +1,125 @@ +# Phylogenomics Workflow Templates + +This directory contains template scripts for running the phylogenomics pipeline across different computing environments. + +## Directory Structure + +``` +templates/ +├── slurm/ # SLURM job scheduler templates +├── pbs/ # PBS/Torque job scheduler templates +└── local/ # Local machine templates (with GNU parallel support) +``` + +## Template Naming Convention + +Templates follow a consistent naming pattern: `NN_step_name[_variant].ext` + +- `NN`: Step number (e.g., `02` for compleasm, `08a` for partition search) +- `step_name`: Descriptive name of the pipeline step +- `_variant`: Optional variant (e.g., `_first`, `_parallel`, `_serial`) +- `.ext`: File extension (`.job` for schedulers, `.sh` for local scripts) + +## Available Templates + +### Step 2: Ortholog Identification (compleasm) + +**SLURM:** +- `02_compleasm_first.job` - Process first genome to download lineage database +- `02_compleasm_parallel.job` - Array job for remaining genomes + +**PBS:** +- `02_compleasm_first.job` - Process first genome to download lineage database +- `02_compleasm_parallel.job` - Array job for remaining genomes + +**Local:** +- `02_compleasm_first.sh` - Process first genome to download lineage database +- `02_compleasm_parallel.sh` - GNU parallel for remaining genomes + +### Step 8A: Partition Model Selection + +**SLURM:** +- `08a_partition_search.job` - IQ-TREE partition model search with TESTMERGEONLY + +**PBS:** +- `08a_partition_search.job` - IQ-TREE partition model search with TESTMERGEONLY + +**Local:** +- `08a_partition_search.sh` - IQ-TREE partition model search with TESTMERGEONLY + +### Step 8C: Individual Gene Trees + +**SLURM:** +- `08c_gene_trees_array.job` - Array job for parallel gene tree estimation + +**PBS:** +- `08c_gene_trees_array.job` - Array job for parallel gene tree estimation + +**Local:** +- `08c_gene_trees_parallel.sh` - GNU parallel for gene tree estimation +- `08c_gene_trees_serial.sh` - Serial processing (for debugging/limited resources) + +## Placeholders + +Templates contain placeholders that must be replaced with user-specific values: + +| Placeholder | Description | Example | +|-------------|-------------|---------| +| `TOTAL_THREADS` | Total CPU cores available | `64` | +| `THREADS_PER_JOB` | Threads per concurrent job | `16` | +| `NUM_GENOMES` | Number of genomes in analysis | `20` | +| `NUM_LOCI` | Number of loci/alignments | `2795` | +| `LINEAGE` | BUSCO lineage dataset | `insecta_odb10` | +| `MODEL_SET` | Comma-separated substitution models | `LG,WAG,JTT,Q.pfam` | + +## Usage + +### For Claude (LLM) + +When a user requests scripts for a specific computing environment: + +1. **Read the appropriate template** using the Read tool +2. **Replace placeholders** with user-specified values +3. **Present the customized script** to the user +4. 
**Provide setup instructions** (e.g., how many genomes, how to calculate thread allocation) + +Example: +```python +# Read template +template = Read("templates/slurm/02_compleasm_first.job") + +# Replace placeholders +script = template.replace("TOTAL_THREADS", "64") +script = script.replace("LINEAGE", "insecta_odb10") + +# Present to user +print(script) +``` + +### For Users + +Templates are not meant to be used directly. Instead: + +1. Follow the workflow in `SKILL.md` +2. Answer Claude's questions about your setup +3. Claude will fetch the appropriate template and customize it for you +4. Copy the customized script Claude provides + +## Benefits of This Structure + +1. **Reduced token usage**: Claude only reads templates when needed +2. **Easier maintenance**: Update one template file instead of multiple locations in SKILL.md +3. **Consistency**: All users get the same base template structure +4. **Clarity**: Separate files are easier to review than inline code +5. **Extensibility**: Easy to add new templates for additional tools or variants + +## Adding New Templates + +When adding new templates: + +1. **Follow naming convention**: `NN_descriptive_name[_variant].ext` +2. **Include clear comments**: Explain what the script does +3. **Use consistent placeholders**: Match existing placeholder names +4. **Test thoroughly**: Ensure placeholders are complete and correct +5. **Update this README**: Add the new template to the "Available Templates" section +6. **Update SKILL.md**: Reference the new template in the appropriate workflow step diff --git a/skills/phylo_from_buscos/templates/local/02_compleasm_first.sh b/skills/phylo_from_buscos/templates/local/02_compleasm_first.sh new file mode 100644 index 0000000..d2d4c39 --- /dev/null +++ b/skills/phylo_from_buscos/templates/local/02_compleasm_first.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# run_compleasm_first.sh +source ~/.bashrc +conda activate phylo + +# User-specified total CPU threads +TOTAL_THREADS=TOTAL_THREADS # Replace with total cores you want to use (e.g., 16, 32, 64) +echo "Processing first genome with ${TOTAL_THREADS} CPU threads to download lineage database..." + +# Create output directory +mkdir -p 01_busco_results + +# Process FIRST genome only +first_genome=$(head -n 1 genome_list.txt) +genome_name=$(basename ${first_genome} .fasta) +echo "Processing: ${genome_name}" + +compleasm run \ + -a ${first_genome} \ + -o 01_busco_results/${genome_name}_compleasm \ + -l LINEAGE \ + -t ${TOTAL_THREADS} + +echo "" +echo "First genome complete! Lineage database is now cached." 
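+
+# Optional: verify the lineage download before launching the parallel runs.
+# (Sketch - compleasm caches lineages under mb_downloads/ in the working
+# directory by default; the path differs if you set a custom library path.)
+# ls -d mb_downloads/* 2>/dev/null || echo "WARNING: no cached lineage found"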
+echo "Now run the parallel script for remaining genomes: bash run_compleasm_parallel.sh" diff --git a/skills/phylo_from_buscos/templates/local/02_compleasm_parallel.sh b/skills/phylo_from_buscos/templates/local/02_compleasm_parallel.sh new file mode 100644 index 0000000..924ab2e --- /dev/null +++ b/skills/phylo_from_buscos/templates/local/02_compleasm_parallel.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# run_compleasm_parallel.sh +source ~/.bashrc +conda activate phylo + +# Threading configuration (adjust based on your system) +TOTAL_THREADS=TOTAL_THREADS # Total cores to use (e.g., 64) +THREADS_PER_JOB=THREADS_PER_JOB # Threads per genome (e.g., 16) +CONCURRENT_JOBS=$((TOTAL_THREADS / THREADS_PER_JOB)) # Calculated automatically + +echo "Configuration:" +echo " Total threads: ${TOTAL_THREADS}" +echo " Threads per genome: ${THREADS_PER_JOB}" +echo " Concurrent genomes: ${CONCURRENT_JOBS}" +echo "" + +# Create output directory +mkdir -p 01_busco_results + +# Process remaining genomes (skip first one) in parallel +tail -n +2 genome_list.txt | parallel -j ${CONCURRENT_JOBS} ' + genome_name=$(basename {} .fasta) + echo "Processing ${genome_name} with THREADS_PER_JOB threads..." + + compleasm run \ + -a {} \ + -o 01_busco_results/${genome_name}_compleasm \ + -l LINEAGE \ + -t THREADS_PER_JOB +' + +echo "" +echo "All genomes processed!" diff --git a/skills/phylo_from_buscos/templates/local/08a_partition_search.sh b/skills/phylo_from_buscos/templates/local/08a_partition_search.sh new file mode 100644 index 0000000..ee3a3a5 --- /dev/null +++ b/skills/phylo_from_buscos/templates/local/08a_partition_search.sh @@ -0,0 +1,20 @@ +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd 06_concatenation + +iqtree \ + -s FcC_supermatrix.fas \ + -spp partition_def.txt \ + -nt 18 \ + -safe \ + -pre partition_search \ + -m TESTMERGEONLY \ + -mset MODEL_SET \ + -msub nuclear \ + -rcluster 10 \ + -bb 1000 \ + -alrt 1000 + +echo "Partition search complete! Best scheme: partition_search.best_scheme.nex" diff --git a/skills/phylo_from_buscos/templates/local/08c_gene_trees_parallel.sh b/skills/phylo_from_buscos/templates/local/08c_gene_trees_parallel.sh new file mode 100644 index 0000000..6fa9a44 --- /dev/null +++ b/skills/phylo_from_buscos/templates/local/08c_gene_trees_parallel.sh @@ -0,0 +1,17 @@ +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd trimmed_aa + +# Create list of alignments +ls *_trimmed.fas > locus_alignments.txt + +# Run IQ-TREE in parallel (adjust -j for number of concurrent jobs) +cat locus_alignments.txt | parallel -j 4 ' + prefix=$(basename {} _trimmed.fas) + iqtree -s {} -m MFP -bb 1000 -bnni -czb -pre ${prefix} -nt 1 + echo "Tree complete: ${prefix}" +' + +echo "All gene trees complete!" diff --git a/skills/phylo_from_buscos/templates/local/08c_gene_trees_serial.sh b/skills/phylo_from_buscos/templates/local/08c_gene_trees_serial.sh new file mode 100644 index 0000000..126dae2 --- /dev/null +++ b/skills/phylo_from_buscos/templates/local/08c_gene_trees_serial.sh @@ -0,0 +1,13 @@ +#!/bin/bash +source ~/.bashrc +conda activate phylo + +cd trimmed_aa + +for locus in *_trimmed.fas; do + prefix=$(basename ${locus} _trimmed.fas) + echo "Processing ${prefix}..." + iqtree -s ${locus} -m MFP -bb 1000 -bnni -czb -pre ${prefix} -nt 1 +done + +echo "All gene trees complete!" 
diff --git a/skills/phylo_from_buscos/templates/pbs/02_compleasm_first.job b/skills/phylo_from_buscos/templates/pbs/02_compleasm_first.job new file mode 100644 index 0000000..0bac18d --- /dev/null +++ b/skills/phylo_from_buscos/templates/pbs/02_compleasm_first.job @@ -0,0 +1,27 @@ +#!/bin/bash +#PBS -N compleasm_first +#PBS -l nodes=1:ppn=TOTAL_THREADS # Replace with total available CPUs (e.g., 64) +#PBS -l mem=384gb # Adjust based on ppn × 6GB +#PBS -l walltime=24:00:00 + +cd $PBS_O_WORKDIR +source ~/.bashrc +conda activate phylo + +mkdir -p logs +mkdir -p 01_busco_results + +# Process FIRST genome only (downloads lineage database) +first_genome=$(head -n 1 genome_list.txt) +genome_name=$(basename ${first_genome} .fasta) +echo "Processing first genome: ${genome_name} with $PBS_NUM_PPN threads..." +echo "This will download the BUSCO lineage database for subsequent runs." + +compleasm run \ + -a ${first_genome} \ + -o 01_busco_results/${genome_name}_compleasm \ + -l LINEAGE \ + -t $PBS_NUM_PPN + +echo "First genome complete! Lineage database is now cached." +echo "Submit the parallel job for remaining genomes: qsub run_compleasm_parallel.job" diff --git a/skills/phylo_from_buscos/templates/pbs/02_compleasm_parallel.job b/skills/phylo_from_buscos/templates/pbs/02_compleasm_parallel.job new file mode 100644 index 0000000..106c3f9 --- /dev/null +++ b/skills/phylo_from_buscos/templates/pbs/02_compleasm_parallel.job @@ -0,0 +1,24 @@ +#!/bin/bash +#PBS -N compleasm_parallel +#PBS -t 2-NUM_GENOMES # Start from genome 2 (first genome already processed) +#PBS -l nodes=1:ppn=THREADS_PER_JOB # e.g., 16 for 64-core system +#PBS -l mem=96gb # Adjust based on ppn × 6GB +#PBS -l walltime=48:00:00 + +cd $PBS_O_WORKDIR +source ~/.bashrc +conda activate phylo + +mkdir -p 01_busco_results + +# Get genome for this array task +genome=$(sed -n "${PBS_ARRAYID}p" genome_list.txt) +genome_name=$(basename ${genome} .fasta) + +echo "Processing ${genome_name} with $PBS_NUM_PPN threads..." + +compleasm run \ + -a ${genome} \ + -o 01_busco_results/${genome_name}_compleasm \ + -l LINEAGE \ + -t $PBS_NUM_PPN diff --git a/skills/phylo_from_buscos/templates/pbs/08a_partition_search.job b/skills/phylo_from_buscos/templates/pbs/08a_partition_search.job new file mode 100644 index 0000000..a9b159e --- /dev/null +++ b/skills/phylo_from_buscos/templates/pbs/08a_partition_search.job @@ -0,0 +1,22 @@ +#!/bin/bash +#PBS -N iqtree_partition +#PBS -l nodes=1:ppn=18 +#PBS -l mem=72gb +#PBS -l walltime=72:00:00 + +cd $PBS_O_WORKDIR/06_concatenation +source ~/.bashrc +conda activate phylo + +iqtree \ + -s FcC_supermatrix.fas \ + -spp partition_def.txt \ + -nt 18 \ + -safe \ + -pre partition_search \ + -m TESTMERGEONLY \ + -mset MODEL_SET \ + -msub nuclear \ + -rcluster 10 \ + -bb 1000 \ + -alrt 1000 diff --git a/skills/phylo_from_buscos/templates/pbs/08c_gene_trees_array.job b/skills/phylo_from_buscos/templates/pbs/08c_gene_trees_array.job new file mode 100644 index 0000000..bfa9cc0 --- /dev/null +++ b/skills/phylo_from_buscos/templates/pbs/08c_gene_trees_array.job @@ -0,0 +1,26 @@ +#!/bin/bash +#PBS -N iqtree_genes +#PBS -t 1-NUM_LOCI +#PBS -l nodes=1:ppn=1 +#PBS -l mem=4gb +#PBS -l walltime=2:00:00 + +cd $PBS_O_WORKDIR/trimmed_aa +source ~/.bashrc +conda activate phylo + +# Create list of alignments if not present +if [ ! 
-f locus_alignments.txt ]; then + ls *_trimmed.fas > locus_alignments.txt +fi + +locus=$(sed -n "${PBS_ARRAYID}p" locus_alignments.txt) + +iqtree \ + -s ${locus} \ + -m MFP \ + -bb 1000 \ + -bnni \ + -czb \ + -pre $(basename ${locus} _trimmed.fas) \ + -nt 1 diff --git a/skills/phylo_from_buscos/templates/slurm/02_compleasm_first.job b/skills/phylo_from_buscos/templates/slurm/02_compleasm_first.job new file mode 100644 index 0000000..7af44fc --- /dev/null +++ b/skills/phylo_from_buscos/templates/slurm/02_compleasm_first.job @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=compleasm_first +#SBATCH --cpus-per-task=TOTAL_THREADS # Replace with total available CPUs (e.g., 64) +#SBATCH --mem-per-cpu=6G +#SBATCH --time=24:00:00 +#SBATCH --output=logs/compleasm_first.%j.out +#SBATCH --error=logs/compleasm_first.%j.err + +source ~/.bashrc +conda activate phylo + +mkdir -p logs +mkdir -p 01_busco_results + +# Process FIRST genome only (downloads lineage database) +first_genome=$(head -n 1 genome_list.txt) +genome_name=$(basename ${first_genome} .fasta) +echo "Processing first genome: ${genome_name} with ${SLURM_CPUS_PER_TASK} threads..." +echo "This will download the BUSCO lineage database for subsequent runs." + +compleasm run \ + -a ${first_genome} \ + -o 01_busco_results/${genome_name}_compleasm \ + -l LINEAGE \ + -t ${SLURM_CPUS_PER_TASK} + +echo "First genome complete! Lineage database is now cached." +echo "Submit the parallel job for remaining genomes: sbatch run_compleasm_parallel.job" diff --git a/skills/phylo_from_buscos/templates/slurm/02_compleasm_parallel.job b/skills/phylo_from_buscos/templates/slurm/02_compleasm_parallel.job new file mode 100644 index 0000000..d23cc16 --- /dev/null +++ b/skills/phylo_from_buscos/templates/slurm/02_compleasm_parallel.job @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --job-name=compleasm_parallel +#SBATCH --array=2-NUM_GENOMES # Start from genome 2 (first genome already processed) +#SBATCH --cpus-per-task=THREADS_PER_JOB # e.g., 16 for 64-core system with 4 concurrent jobs +#SBATCH --mem-per-cpu=6G +#SBATCH --time=48:00:00 +#SBATCH --output=logs/compleasm.%A_%a.out +#SBATCH --error=logs/compleasm.%A_%a.err + +source ~/.bashrc +conda activate phylo + +mkdir -p 01_busco_results + +# Get genome for this array task (skipping the first one) +genome=$(sed -n "${SLURM_ARRAY_TASK_ID}p" genome_list.txt) +genome_name=$(basename ${genome} .fasta) + +echo "Processing ${genome_name} with ${SLURM_CPUS_PER_TASK} threads..." 
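+
+# Guard (sketch): fail fast if the array index runs past genome_list.txt,
+# which happens when NUM_GENOMES is set larger than the list.
+if [ -z "${genome}" ]; then
+    echo "ERROR: no genome at line ${SLURM_ARRAY_TASK_ID} of genome_list.txt"
+    exit 1
+fi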
+ +compleasm run \ + -a ${genome} \ + -o 01_busco_results/${genome_name}_compleasm \ + -l LINEAGE \ + -t ${SLURM_CPUS_PER_TASK} diff --git a/skills/phylo_from_buscos/templates/slurm/08a_partition_search.job b/skills/phylo_from_buscos/templates/slurm/08a_partition_search.job new file mode 100644 index 0000000..cedea49 --- /dev/null +++ b/skills/phylo_from_buscos/templates/slurm/08a_partition_search.job @@ -0,0 +1,27 @@ +#!/bin/bash +#SBATCH --job-name=iqtree_partition +#SBATCH --cpus-per-task=18 +#SBATCH --mem-per-cpu=4G +#SBATCH --time=72:00:00 +#SBATCH --output=logs/partition_search.out +#SBATCH --error=logs/partition_search.err + +source ~/.bashrc +conda activate phylo + +cd 06_concatenation # Use organized directory structure + +iqtree \ + -s FcC_supermatrix.fas \ + -spp partition_def.txt \ + -nt ${SLURM_CPUS_PER_TASK} \ + -safe \ + -pre partition_search \ + -m TESTMERGEONLY \ + -mset MODEL_SET \ + -msub nuclear \ + -rcluster 10 \ + -bb 1000 \ + -alrt 1000 + +# Output: partition_search.best_scheme.nex diff --git a/skills/phylo_from_buscos/templates/slurm/08c_gene_trees_array.job b/skills/phylo_from_buscos/templates/slurm/08c_gene_trees_array.job new file mode 100644 index 0000000..c8989b2 --- /dev/null +++ b/skills/phylo_from_buscos/templates/slurm/08c_gene_trees_array.job @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=iqtree_genes +#SBATCH --array=1-NUM_LOCI +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=4G +#SBATCH --time=2:00:00 +#SBATCH --output=logs/%A_%a.genetree.out + +source ~/.bashrc +conda activate phylo + +cd trimmed_aa + +# Create list of alignments if not present +if [ ! -f locus_alignments.txt ]; then + ls *_trimmed.fas > locus_alignments.txt +fi + +locus=$(sed -n "${SLURM_ARRAY_TASK_ID}p" locus_alignments.txt) + +iqtree \ + -s ${locus} \ + -m MFP \ + -bb 1000 \ + -bnni \ + -czb \ + -pre $(basename ${locus} _trimmed.fas) \ + -nt 1
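+
+# Submission sketch (assumes NUM_LOCI has been substituted and that
+# locus_alignments.txt, if it already exists, matches the array range):
+#   ls trimmed_aa/*_trimmed.fas | wc -l   # count loci to use for NUM_LOCI
+#   sbatch 08c_gene_trees_array.job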