Initial commit
This commit is contained in:
404
skills/biogeobears/scripts/biogeobears_analysis_template.Rmd
Normal file
404
skills/biogeobears/scripts/biogeobears_analysis_template.Rmd
Normal file
@@ -0,0 +1,404 @@
|
||||
---
|
||||
title: "BioGeoBEARS Biogeographic Analysis"
|
||||
author: "Generated by Claude Code"
|
||||
date: "`r Sys.Date()`"
|
||||
output:
|
||||
html_document:
|
||||
toc: true
|
||||
toc_float: true
|
||||
code_folding: show
|
||||
theme: flatly
|
||||
params:
|
||||
tree_file: "tree.nwk"
|
||||
geog_file: "geography.data"
|
||||
max_range_size: 4
|
||||
models: "DEC,DEC+J,DIVALIKE,DIVALIKE+J"
|
||||
output_dir: "results"
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
|
||||
library(BioGeoBEARS)
|
||||
library(ape)
|
||||
library(knitr)
|
||||
library(kableExtra)
|
||||
```
|
||||
|
||||
# Analysis Parameters
|
||||
|
||||
```{r parameters, echo=FALSE}
|
||||
params_df <- data.frame(
|
||||
Parameter = c("Tree file", "Geography file", "Max range size", "Models to test", "Output directory"),
|
||||
Value = c(params$tree_file, params$geog_file, params$max_range_size, params$models, params$output_dir)
|
||||
)
|
||||
|
||||
kable(params_df, caption = "Analysis Parameters") %>%
|
||||
kable_styling(bootstrap_options = c("striped", "hover"))
|
||||
```
|
||||
|
||||
# Input Data
|
||||
|
||||
## Phylogenetic Tree
|
||||
|
||||
```{r load-tree}
|
||||
trfn <- params$tree_file
|
||||
tr <- read.tree(trfn)
|
||||
|
||||
cat(paste("Number of tips:", length(tr$tip.label), "\n"))
|
||||
cat(paste("Tree is rooted:", is.rooted(tr), "\n"))
|
||||
cat(paste("Tree is ultrametric:", is.ultrametric(tr), "\n"))
|
||||
|
||||
# Plot tree
|
||||
plot(tr, cex = 0.6, main = "Input Phylogeny")
|
||||
```
|
||||
|
||||
## Geographic Distribution Data
|
||||
|
||||
```{r load-geography}
|
||||
geogfn <- params$geog_file
|
||||
tipranges <- getranges_from_LagrangePHYLIP(lgdata_fn = geogfn)
|
||||
|
||||
cat(paste("Number of species:", nrow(tipranges@df), "\n"))
|
||||
cat(paste("Number of areas:", ncol(tipranges@df), "\n"))
|
||||
cat(paste("Area names:", paste(names(tipranges@df), collapse = ", "), "\n\n"))
|
||||
|
||||
# Display geography matrix
|
||||
kable(tipranges@df, caption = "Species Distribution Matrix (1 = present, 0 = absent)") %>%
|
||||
kable_styling(bootstrap_options = c("striped", "hover"), font_size = 10) %>%
|
||||
scroll_box(height = "400px")
|
||||
```
|
||||
|
||||
## State Space Setup
|
||||
|
||||
```{r state-space}
|
||||
max_range_size <- params$max_range_size
|
||||
numareas <- ncol(tipranges@df)
|
||||
|
||||
num_states <- numstates_from_numareas(numareas = numareas,
|
||||
maxareas = max_range_size,
|
||||
include_null_range = TRUE)
|
||||
|
||||
cat(paste("Maximum range size:", max_range_size, "\n"))
|
||||
cat(paste("Number of possible states:", num_states, "\n"))
|
||||
```
|
||||
|
||||
# Model Fitting
|
||||
|
||||
```{r setup-output}
|
||||
# Create output directory
|
||||
if (!dir.exists(params$output_dir)) {
|
||||
dir.create(params$output_dir, recursive = TRUE)
|
||||
}
|
||||
|
||||
# Parse models to run
|
||||
models_to_run <- unlist(strsplit(params$models, ","))
|
||||
models_to_run <- trimws(models_to_run)
|
||||
|
||||
cat("Models to fit:\n")
|
||||
for (model in models_to_run) {
|
||||
cat(paste(" -", model, "\n"))
|
||||
}
|
||||
```
|
||||
|
||||
```{r model-fitting, results='hide'}
|
||||
# Storage for results
|
||||
results_list <- list()
|
||||
model_comparison <- data.frame(
|
||||
Model = character(),
|
||||
LnL = numeric(),
|
||||
nParams = integer(),
|
||||
AIC = numeric(),
|
||||
AICc = numeric(),
|
||||
d = numeric(),
|
||||
e = numeric(),
|
||||
j = numeric(),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
# Helper function to setup and run a model
|
||||
run_biogeobears_model <- function(model_name, BioGeoBEARS_run_object) {
|
||||
cat(paste("\n\nFitting model:", model_name, "\n"))
|
||||
|
||||
# Configure model based on name
|
||||
if (grepl("DEC", model_name)) {
|
||||
# DEC model (default settings)
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["s","type"] = "free"
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["v","type"] = "free"
|
||||
} else if (grepl("DIVALIKE", model_name)) {
|
||||
# DIVALIKE model (vicariance only, no subset sympatry)
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["s","type"] = "fixed"
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["s","init"] = 0.0
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["s","est"] = 0.0
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["v","type"] = "free"
|
||||
} else if (grepl("BAYAREALIKE", model_name)) {
|
||||
# BAYAREALIKE model (sympatry only, no vicariance)
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["s","type"] = "free"
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["v","type"] = "fixed"
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["v","init"] = 0.0
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["v","est"] = 0.0
|
||||
}
|
||||
|
||||
# Add +J parameter if specified
|
||||
if (grepl("\\+J", model_name)) {
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","type"] = "free"
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","init"] = 0.01
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","est"] = 0.01
|
||||
} else {
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","type"] = "fixed"
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","init"] = 0.0
|
||||
BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table["j","est"] = 0.0
|
||||
}
|
||||
|
||||
# Run optimization
|
||||
res <- bears_optim_run(BioGeoBEARS_run_object)
|
||||
|
||||
return(res)
|
||||
}
|
||||
|
||||
# Base run object setup
|
||||
BioGeoBEARS_run_object <- define_BioGeoBEARS_run()
|
||||
BioGeoBEARS_run_object$trfn <- trfn
|
||||
BioGeoBEARS_run_object$geogfn <- geogfn
|
||||
BioGeoBEARS_run_object$max_range_size <- max_range_size
|
||||
BioGeoBEARS_run_object$min_branchlength <- 0.000001
|
||||
BioGeoBEARS_run_object$include_null_range <- TRUE
|
||||
BioGeoBEARS_run_object$force_sparse <- FALSE
|
||||
BioGeoBEARS_run_object$speedup <- TRUE
|
||||
BioGeoBEARS_run_object$use_optimx <- TRUE
|
||||
BioGeoBEARS_run_object$calc_ancprobs <- TRUE
|
||||
BioGeoBEARS_run_object <- readfiles_BioGeoBEARS_run(BioGeoBEARS_run_object)
|
||||
BioGeoBEARS_run_object <- calc_loglike_sp(BioGeoBEARS_run_object)
|
||||
|
||||
# Fit each model
|
||||
for (model in models_to_run) {
|
||||
tryCatch({
|
||||
res <- run_biogeobears_model(model, BioGeoBEARS_run_object)
|
||||
results_list[[model]] <- res
|
||||
|
||||
# Save result
|
||||
save(res, file = file.path(params$output_dir, paste0(model, "_result.Rdata")))
|
||||
|
||||
# Extract parameters for comparison
|
||||
params_table <- res$outputs@params_table
|
||||
model_comparison <- rbind(model_comparison, data.frame(
|
||||
Model = model,
|
||||
LnL = res$outputs@loglikelihood,
|
||||
nParams = sum(params_table$type == "free"),
|
||||
AIC = res$outputs@AIC,
|
||||
AICc = res$outputs@AICc,
|
||||
d = params_table["d", "est"],
|
||||
e = params_table["e", "est"],
|
||||
j = params_table["j", "est"],
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
}, error = function(e) {
|
||||
cat(paste("Error fitting model", model, ":", e$message, "\n"))
|
||||
})
|
||||
}
|
||||
```
|
||||
|
||||
# Model Comparison
|
||||
|
||||
```{r model-comparison}
|
||||
# Calculate AIC weights
|
||||
if (nrow(model_comparison) > 0) {
|
||||
model_comparison$delta_AIC <- model_comparison$AIC - min(model_comparison$AIC)
|
||||
model_comparison$AIC_weight <- exp(-0.5 * model_comparison$delta_AIC) /
|
||||
sum(exp(-0.5 * model_comparison$delta_AIC))
|
||||
|
||||
# Sort by AIC
|
||||
model_comparison <- model_comparison[order(model_comparison$AIC), ]
|
||||
|
||||
kable(model_comparison, digits = 3,
|
||||
caption = "Model Comparison (sorted by AIC)") %>%
|
||||
kable_styling(bootstrap_options = c("striped", "hover")) %>%
|
||||
row_spec(1, bold = TRUE, background = "#d4edda") # Highlight best model
|
||||
|
||||
# Model selection summary
|
||||
best_model <- model_comparison$Model[1]
|
||||
cat(paste("\n\nBest model by AIC:", best_model, "\n"))
|
||||
cat(paste("AIC weight:", round(model_comparison$AIC_weight[1], 3), "\n"))
|
||||
}
|
||||
```
|
||||
|
||||
# Ancestral Range Reconstruction
|
||||
|
||||
## Best Model: `r if(exists('best_model')) best_model else 'TBD'`
|
||||
|
||||
```{r plot-best-model, fig.width=10, fig.height=12}
|
||||
if (exists('best_model') && best_model %in% names(results_list)) {
|
||||
res_best <- results_list[[best_model]]
|
||||
|
||||
# Create plots directory
|
||||
plots_dir <- file.path(params$output_dir, "plots")
|
||||
if (!dir.exists(plots_dir)) {
|
||||
dir.create(plots_dir, recursive = TRUE)
|
||||
}
|
||||
|
||||
# Plot with pie charts
|
||||
pdf(file.path(plots_dir, paste0(best_model, "_pie.pdf")), width = 10, height = 12)
|
||||
|
||||
analysis_titletxt <- paste("BioGeoBEARS:", best_model)
|
||||
|
||||
plot_BioGeoBEARS_results(
|
||||
results_object = res_best,
|
||||
analysis_titletxt = analysis_titletxt,
|
||||
addl_params = list("j"),
|
||||
plotwhat = "pie",
|
||||
label.offset = 0.5,
|
||||
tipcex = 0.7,
|
||||
statecex = 0.7,
|
||||
splitcex = 0.6,
|
||||
titlecex = 0.8,
|
||||
plotsplits = TRUE,
|
||||
include_null_range = TRUE,
|
||||
tr = tr,
|
||||
tipranges = tipranges
|
||||
)
|
||||
|
||||
dev.off()
|
||||
|
||||
# Also create text plot
|
||||
pdf(file.path(plots_dir, paste0(best_model, "_text.pdf")), width = 10, height = 12)
|
||||
|
||||
plot_BioGeoBEARS_results(
|
||||
results_object = res_best,
|
||||
analysis_titletxt = analysis_titletxt,
|
||||
addl_params = list("j"),
|
||||
plotwhat = "text",
|
||||
label.offset = 0.5,
|
||||
tipcex = 0.7,
|
||||
statecex = 0.7,
|
||||
splitcex = 0.6,
|
||||
titlecex = 0.8,
|
||||
plotsplits = TRUE,
|
||||
include_null_range = TRUE,
|
||||
tr = tr,
|
||||
tipranges = tipranges
|
||||
)
|
||||
|
||||
dev.off()
|
||||
|
||||
# Display in notebook (pie chart version)
|
||||
plot_BioGeoBEARS_results(
|
||||
results_object = res_best,
|
||||
analysis_titletxt = analysis_titletxt,
|
||||
addl_params = list("j"),
|
||||
plotwhat = "pie",
|
||||
label.offset = 0.5,
|
||||
tipcex = 0.7,
|
||||
statecex = 0.7,
|
||||
splitcex = 0.6,
|
||||
titlecex = 0.8,
|
||||
plotsplits = TRUE,
|
||||
include_null_range = TRUE,
|
||||
tr = tr,
|
||||
tipranges = tipranges
|
||||
)
|
||||
|
||||
cat(paste("\n\nPlots saved to:", plots_dir, "\n"))
|
||||
}
|
||||
```
|
||||
|
||||
# Parameter Estimates
|
||||
|
||||
```{r parameter-estimates, fig.width=10, fig.height=6}
|
||||
if (nrow(model_comparison) > 0) {
|
||||
# Extract base models (without +J)
|
||||
base_models <- model_comparison[!grepl("\\+J", model_comparison$Model), ]
|
||||
j_models <- model_comparison[grepl("\\+J", model_comparison$Model), ]
|
||||
|
||||
par(mfrow = c(1, 3))
|
||||
|
||||
# Plot d (dispersal) estimates
|
||||
barplot(model_comparison$d, names.arg = model_comparison$Model,
|
||||
main = "Dispersal Rate (d)", ylab = "Rate", las = 2, cex.names = 0.8,
|
||||
col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
|
||||
|
||||
# Plot e (extinction) estimates
|
||||
barplot(model_comparison$e, names.arg = model_comparison$Model,
|
||||
main = "Extinction Rate (e)", ylab = "Rate", las = 2, cex.names = 0.8,
|
||||
col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
|
||||
|
||||
# Plot j (founder-event) estimates for +J models
|
||||
j_vals <- model_comparison$j
|
||||
j_vals[j_vals == 0] <- NA
|
||||
barplot(j_vals, names.arg = model_comparison$Model,
|
||||
main = "Founder-event Rate (j)", ylab = "Rate", las = 2, cex.names = 0.8,
|
||||
col = ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue"))
|
||||
}
|
||||
```
|
||||
|
||||
# Likelihood Ratio Tests
|
||||
|
||||
```{r lrt-tests}
|
||||
# Compare models with and without +J
|
||||
if (nrow(model_comparison) > 0) {
|
||||
lrt_results <- data.frame(
|
||||
Comparison = character(),
|
||||
Model1 = character(),
|
||||
Model2 = character(),
|
||||
LRT_statistic = numeric(),
|
||||
df = integer(),
|
||||
p_value = numeric(),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
base_model_names <- c("DEC", "DIVALIKE", "BAYAREALIKE")
|
||||
|
||||
for (base in base_model_names) {
|
||||
j_model <- paste0(base, "+J")
|
||||
|
||||
if (base %in% model_comparison$Model && j_model %in% model_comparison$Model) {
|
||||
lnl_base <- model_comparison[model_comparison$Model == base, "LnL"]
|
||||
lnl_j <- model_comparison[model_comparison$Model == j_model, "LnL"]
|
||||
|
||||
lrt_stat <- 2 * (lnl_j - lnl_base)
|
||||
df <- 1 # One additional parameter (j)
|
||||
p_val <- pchisq(lrt_stat, df = df, lower.tail = FALSE)
|
||||
|
||||
lrt_results <- rbind(lrt_results, data.frame(
|
||||
Comparison = paste(base, "vs", j_model),
|
||||
Model1 = base,
|
||||
Model2 = j_model,
|
||||
LRT_statistic = lrt_stat,
|
||||
df = df,
|
||||
p_value = p_val,
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
if (nrow(lrt_results) > 0) {
|
||||
lrt_results$Significant <- ifelse(lrt_results$p_value < 0.05, "Yes*", "No")
|
||||
|
||||
kable(lrt_results, digits = 4,
|
||||
caption = "Likelihood Ratio Tests (nested model comparisons)") %>%
|
||||
kable_styling(bootstrap_options = c("striped", "hover"))
|
||||
|
||||
cat("\n* p < 0.05 indicates significant improvement with +J parameter\n")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
# Session Info
|
||||
|
||||
```{r session-info}
|
||||
sessionInfo()
|
||||
```
|
||||
|
||||
# Outputs
|
||||
|
||||
All results have been saved to: **`r params$output_dir`**
|
||||
|
||||
Files generated:
|
||||
|
||||
- `[MODEL]_result.Rdata` - R data files with complete model results
|
||||
- `plots/[MODEL]_pie.pdf` - Phylogeny with pie charts showing ancestral range probabilities
|
||||
- `plots/[MODEL]_text.pdf` - Phylogeny with text labels showing most likely ancestral ranges
|
||||
- `biogeobears_analysis_template.html` - This HTML report
|
||||
|
||||
To load a saved result in R:
|
||||
```r
|
||||
load("results/DEC+J_result.Rdata")
|
||||
```
|
||||
299
skills/biogeobears/scripts/validate_geography_file.py
Executable file
299
skills/biogeobears/scripts/validate_geography_file.py
Executable file
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validates and optionally reformats a BioGeoBEARS geography file.
|
||||
|
||||
Geography files must follow the PHYLIP-like format:
|
||||
Line 1: n_species [TAB] n_areas [TAB] (area1 area2 area3 ...)
|
||||
Lines 2+: species_name [TAB] binary_string (e.g., 011 for absent in area1, present in area2 and area3)
|
||||
|
||||
Common errors:
|
||||
- Spaces instead of tabs
|
||||
- Spaces in species names
|
||||
- Spaces within binary strings
|
||||
- Species names not matching tree tip labels
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def validate_geography_file(filepath, tree_tips=None):
|
||||
"""
|
||||
Validate geography file format.
|
||||
|
||||
Args:
|
||||
filepath: Path to geography file
|
||||
tree_tips: Optional set of tree tip labels to validate against
|
||||
|
||||
Returns:
|
||||
dict with validation results and any errors/warnings
|
||||
"""
|
||||
errors = []
|
||||
warnings = []
|
||||
info = {}
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
lines = [line.rstrip('\n\r') for line in f.readlines()]
|
||||
|
||||
if not lines:
|
||||
errors.append("File is empty")
|
||||
return {'valid': False, 'errors': errors, 'warnings': warnings, 'info': info}
|
||||
|
||||
# Parse header line
|
||||
header = lines[0]
|
||||
if '\t' not in header:
|
||||
errors.append("Line 1: Missing tab delimiter (should be: n_species [TAB] n_areas [TAB] (area_names))")
|
||||
else:
|
||||
parts = header.split('\t')
|
||||
if len(parts) < 3:
|
||||
errors.append("Line 1: Expected format 'n_species [TAB] n_areas [TAB] (area_names)'")
|
||||
else:
|
||||
try:
|
||||
n_species = int(parts[0])
|
||||
n_areas = int(parts[1])
|
||||
|
||||
# Parse area names
|
||||
area_part = parts[2].strip()
|
||||
if not (area_part.startswith('(') and area_part.endswith(')')):
|
||||
errors.append("Line 1: Area names should be in parentheses: (A B C)")
|
||||
else:
|
||||
areas = area_part[1:-1].split()
|
||||
if len(areas) != n_areas:
|
||||
errors.append(f"Line 1: Declared {n_areas} areas but found {len(areas)} area names")
|
||||
|
||||
info['n_species'] = n_species
|
||||
info['n_areas'] = n_areas
|
||||
info['areas'] = areas
|
||||
|
||||
# Validate species lines
|
||||
species_found = []
|
||||
for i, line in enumerate(lines[1:], start=2):
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
if '\t' not in line:
|
||||
errors.append(f"Line {i}: Missing tab between species name and binary code")
|
||||
continue
|
||||
|
||||
parts = line.split('\t')
|
||||
if len(parts) != 2:
|
||||
errors.append(f"Line {i}: Expected exactly one tab between species name and binary code")
|
||||
continue
|
||||
|
||||
species_name = parts[0]
|
||||
binary_code = parts[1]
|
||||
|
||||
# Check for spaces in species name
|
||||
if ' ' in species_name:
|
||||
errors.append(f"Line {i}: Species name '{species_name}' contains spaces (use underscores instead)")
|
||||
|
||||
# Check for spaces in binary code
|
||||
if ' ' in binary_code or '\t' in binary_code:
|
||||
errors.append(f"Line {i}: Binary code '{binary_code}' contains spaces or tabs (should be like '011' with no spaces)")
|
||||
|
||||
# Check binary code length
|
||||
if len(binary_code) != n_areas:
|
||||
errors.append(f"Line {i}: Binary code length ({len(binary_code)}) doesn't match number of areas ({n_areas})")
|
||||
|
||||
# Check binary code characters
|
||||
if not all(c in '01' for c in binary_code):
|
||||
errors.append(f"Line {i}: Binary code contains invalid characters (only 0 and 1 allowed)")
|
||||
|
||||
species_found.append(species_name)
|
||||
|
||||
# Check species count
|
||||
if len(species_found) != n_species:
|
||||
warnings.append(f"Header declares {n_species} species but found {len(species_found)} data lines")
|
||||
|
||||
info['species'] = species_found
|
||||
|
||||
# Check against tree tips if provided
|
||||
if tree_tips:
|
||||
species_set = set(species_found)
|
||||
tree_set = set(tree_tips)
|
||||
|
||||
missing_in_tree = species_set - tree_set
|
||||
missing_in_geog = tree_set - species_set
|
||||
|
||||
if missing_in_tree:
|
||||
errors.append(f"Species in geography file but not in tree: {', '.join(sorted(missing_in_tree))}")
|
||||
if missing_in_geog:
|
||||
errors.append(f"Species in tree but not in geography file: {', '.join(sorted(missing_in_geog))}")
|
||||
|
||||
except ValueError:
|
||||
errors.append("Line 1: First two fields must be integers (n_species and n_areas)")
|
||||
|
||||
return {
|
||||
'valid': len(errors) == 0,
|
||||
'errors': errors,
|
||||
'warnings': warnings,
|
||||
'info': info
|
||||
}
|
||||
|
||||
|
||||
def reformat_geography_file(input_path, output_path, delimiter=','):
|
||||
"""
|
||||
Attempt to reformat a geography file from common formats.
|
||||
|
||||
Args:
|
||||
input_path: Path to input file
|
||||
output_path: Path for output file
|
||||
delimiter: Delimiter used in input file (default: comma)
|
||||
"""
|
||||
with open(input_path, 'r') as f:
|
||||
lines = [line.strip() for line in f.readlines()]
|
||||
|
||||
# Detect if first line is a header
|
||||
header_line = lines[0]
|
||||
has_header = not header_line[0].isdigit()
|
||||
|
||||
if has_header:
|
||||
# Parse area names from header
|
||||
parts = header_line.split(delimiter)
|
||||
species_col = parts[0]
|
||||
area_names = [p.strip() for p in parts[1:]]
|
||||
data_lines = lines[1:]
|
||||
else:
|
||||
# No header, infer from first data line
|
||||
parts = lines[0].split(delimiter)
|
||||
n_areas = len(parts) - 1
|
||||
area_names = [chr(65 + i) for i in range(n_areas)] # A, B, C, ...
|
||||
data_lines = lines
|
||||
|
||||
# Parse species data
|
||||
species_data = []
|
||||
for line in data_lines:
|
||||
if not line:
|
||||
continue
|
||||
parts = line.split(delimiter)
|
||||
if len(parts) < 2:
|
||||
continue
|
||||
|
||||
species_name = parts[0].strip().replace(' ', '_')
|
||||
presence = ''.join(['1' if p.strip() in ['1', 'present', 'Present', 'TRUE', 'True'] else '0'
|
||||
for p in parts[1:]])
|
||||
species_data.append((species_name, presence))
|
||||
|
||||
# Write output
|
||||
with open(output_path, 'w') as f:
|
||||
# Header line
|
||||
n_species = len(species_data)
|
||||
n_areas = len(area_names)
|
||||
f.write(f"{n_species}\t{n_areas}\t({' '.join(area_names)})\n")
|
||||
|
||||
# Species lines
|
||||
for species_name, binary_code in species_data:
|
||||
f.write(f"{species_name}\t{binary_code}\n")
|
||||
|
||||
print(f"Reformatted {n_species} species across {n_areas} areas")
|
||||
print(f"Output written to: {output_path}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Validate and reformat BioGeoBEARS geography files',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Validate a geography file
|
||||
python validate_geography_file.py input.txt --validate
|
||||
|
||||
# Reformat from CSV to PHYLIP format
|
||||
python validate_geography_file.py input.csv --reformat -o output.data
|
||||
|
||||
# Reformat with tab delimiter
|
||||
python validate_geography_file.py input.txt --reformat --delimiter tab -o output.data
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument('input', help='Input geography file')
|
||||
parser.add_argument('--validate', action='store_true',
|
||||
help='Validate the file format')
|
||||
parser.add_argument('--reformat', action='store_true',
|
||||
help='Reformat file to BioGeoBEARS format')
|
||||
parser.add_argument('-o', '--output',
|
||||
help='Output file path (required for --reformat)')
|
||||
parser.add_argument('--delimiter', default=',',
|
||||
help='Delimiter in input file (default: comma). Use "tab" for tab-delimited.')
|
||||
parser.add_argument('--tree',
|
||||
help='Newick tree file to validate species names against')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.delimiter.lower() == 'tab':
|
||||
args.delimiter = '\t'
|
||||
|
||||
# Parse tree tips if provided
|
||||
tree_tips = None
|
||||
if args.tree:
|
||||
try:
|
||||
with open(args.tree, 'r') as f:
|
||||
tree_string = f.read().strip()
|
||||
# Extract tip labels using regex
|
||||
tree_tips = re.findall(r'([^(),:\s]+):', tree_string)
|
||||
if not tree_tips:
|
||||
tree_tips = re.findall(r'([^(),:\s]+)[,)]', tree_string)
|
||||
print(f"Found {len(tree_tips)} tips in tree file")
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not parse tree file: {e}")
|
||||
|
||||
if args.validate:
|
||||
result = validate_geography_file(args.input, tree_tips)
|
||||
|
||||
print(f"\nValidation Results for: {args.input}")
|
||||
print("=" * 60)
|
||||
|
||||
if result['info']:
|
||||
print(f"\nFile Info:")
|
||||
print(f" Species: {result['info'].get('n_species', 'unknown')}")
|
||||
print(f" Areas: {result['info'].get('n_areas', 'unknown')}")
|
||||
if 'areas' in result['info']:
|
||||
print(f" Area names: {', '.join(result['info']['areas'])}")
|
||||
|
||||
if result['warnings']:
|
||||
print(f"\nWarnings ({len(result['warnings'])}):")
|
||||
for warning in result['warnings']:
|
||||
print(f" ⚠️ {warning}")
|
||||
|
||||
if result['errors']:
|
||||
print(f"\nErrors ({len(result['errors'])}):")
|
||||
for error in result['errors']:
|
||||
print(f" ❌ {error}")
|
||||
else:
|
||||
print(f"\n✅ File is valid!")
|
||||
|
||||
return 0 if result['valid'] else 1
|
||||
|
||||
elif args.reformat:
|
||||
if not args.output:
|
||||
print("Error: --output required when using --reformat")
|
||||
return 1
|
||||
|
||||
try:
|
||||
reformat_geography_file(args.input, args.output, args.delimiter)
|
||||
|
||||
# Validate reformatted file
|
||||
result = validate_geography_file(args.output, tree_tips)
|
||||
if result['valid']:
|
||||
print("✅ Reformatted file is valid!")
|
||||
else:
|
||||
print("\n⚠️ Reformatted file has validation errors:")
|
||||
for error in result['errors']:
|
||||
print(f" ❌ {error}")
|
||||
return 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during reformatting: {e}")
|
||||
return 1
|
||||
|
||||
else:
|
||||
parser.print_help()
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user