Initial commit

2025-11-29 18:02:37 +08:00
commit c1d9dee646
38 changed files with 11210 additions and 0 deletions
--- a/skills/biogeobears/scripts/biogeobears_analysis_template.Rmd
+++ b/skills/biogeobears/scripts/biogeobears_analysis_template.Rmd
@@ -0,0 +1,404 @@
+---
+title: "BioGeoBEARS Biogeographic Analysis"
+author: "Generated by Claude Code"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: true
+    toc_float: true
+    code_folding: show
+    theme: flatly
+params:
+  tree_file: "tree.nwk"
+  geog_file: "geography.data"
+  max_range_size: 4
+  models: "DEC,DEC+J,DIVALIKE,DIVALIKE+J"
+  output_dir: "results"
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
+library(BioGeoBEARS)
+library(ape)
+library(knitr)
+library(kableExtra)
+```
+
+# Analysis Parameters
+
+```{r parameters, echo=FALSE}
+# Tabulate the knit parameters. c() coerces every value to character, so the
+# numeric range size can share a column with the file paths.
+param_labels <- c("Tree file", "Geography file", "Max range size", "Models to test", "Output directory")
+param_values <- c(params$tree_file, params$geog_file, params$max_range_size, params$models, params$output_dir)
+params_df <- data.frame(Parameter = param_labels, Value = param_values)
+
+# The styled table is the last top-level value of the chunk, so knitr renders it.
+params_kable <- kable(params_df, caption = "Analysis Parameters")
+kable_styling(params_kable, bootstrap_options = c("striped", "hover"))
+```
+
+# Input Data
+
+## Phylogenetic Tree
+
+```{r load-tree}
+# Read the phylogeny named by the `tree_file` parameter
+trfn <- params$tree_file
+tr <- read.tree(trfn)
+
+# Quick sanity report on the tree
+cat(sprintf("Number of tips: %s \n", length(tr$tip.label)))
+cat(sprintf("Tree is rooted: %s \n", is.rooted(tr)))
+cat(sprintf("Tree is ultrametric: %s \n", is.ultrametric(tr)))
+
+# Draw the input tree with small tip labels
+plot(tr, cex = 0.6, main = "Input Phylogeny")
+```
+
+## Geographic Distribution Data
+
+```{r load-geography}
+# Read the geography file named by the `geog_file` parameter
+geogfn <- params$geog_file
+tipranges <- getranges_from_LagrangePHYLIP(lgdata_fn = geogfn)
+
+# Rows of the range table are reported as species, columns as areas
+cat(sprintf("Number of species: %s \n", nrow(tipranges@df)))
+cat(sprintf("Number of areas: %s \n", ncol(tipranges@df)))
+cat(sprintf("Area names: %s \n\n", paste(names(tipranges@df), collapse = ", ")))
+
+# Show the presence/absence matrix in a scrollable, compact table
+geog_kable <- kable(tipranges@df, caption = "Species Distribution Matrix (1 = present, 0 = absent)")
+geog_kable <- kable_styling(geog_kable, bootstrap_options = c("striped", "hover"), font_size = 10)
+scroll_box(geog_kable, height = "400px")
+```
+
+## State Space Setup
+
+```{r state-space}
+max_range_size <- params$max_range_size
+numareas <- ncol(tipranges@df)
+
+# Size of the state space for this many areas and this range-size cap,
+# with the null range included
+num_states <- numstates_from_numareas(
+  numareas = numareas,
+  maxareas = max_range_size,
+  include_null_range = TRUE
+)
+
+cat(sprintf("Maximum range size: %s \n", max_range_size))
+cat(sprintf("Number of possible states: %s \n", num_states))
+```
+
+# Model Fitting
+
+```{r setup-output}
+# Make sure the output directory exists before anything is written to it
+output_dir_missing <- !dir.exists(params$output_dir)
+if (output_dir_missing) {
+  dir.create(params$output_dir, recursive = TRUE)
+}
+
+# Split the comma-separated model list and drop surrounding whitespace
+models_to_run <- trimws(unlist(strsplit(params$models, ",")))
+
+# One bullet line per requested model
+cat("Models to fit:\n")
+cat(sprintf("  - %s \n", models_to_run), sep = "")
+```
+
+```{r model-fitting, results='hide'}
+# Fitted result objects, keyed by model name
+results_list <- list()
+
+# Comparison table: starts with zero rows and gains one row per model that
+# fits successfully
+model_comparison <- data.frame(
+  Model = character(0),
+  LnL = numeric(0),
+  nParams = integer(0),
+  AIC = numeric(0),
+  AICc = numeric(0),
+  d = numeric(0),
+  e = numeric(0),
+  j = numeric(0),
+  stringsAsFactors = FALSE
+)
+
+# Configure the cladogenesis model named by `model_name` and fit it.
+#
+# Args:
+#   model_name: "DEC", "DIVALIKE" or "BAYAREALIKE", optionally suffixed with
+#     "+J" to free the founder-event (jump dispersal) parameter j.
+#   BioGeoBEARS_run_object: run object whose input files are already loaded.
+#     Only the local copy is modified, so one base object can be reused for
+#     every model.
+#
+# Returns: the results object produced by bears_optim_run().
+# Raises: an error for an unrecognised model name, or whatever
+#   check_BioGeoBEARS_run() / bears_optim_run() signal.
+#
+# Settings follow the standard BioGeoBEARS example script. DEC keeps the
+# package defaults (only d and e free); marking s and v as "free" would add
+# two spurious parameters. DIVALIKE removes subset sympatry; BAYAREALIKE
+# removes subset sympatry and vicariance. Both re-link the ysv/ys/y weights.
+run_biogeobears_model <- function(model_name, BioGeoBEARS_run_object) {
+  cat(paste("\n\nFitting model:", model_name, "\n"))
+
+  has_jump <- grepl("\\+J$", model_name)
+  base_model <- sub("\\+J$", "", model_name)
+  tbl <- BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table
+
+  # Pin one parameter to a constant value
+  fix_param <- function(tbl, name, value) {
+    tbl[name, "type"] <- "fixed"
+    tbl[name, c("init", "est")] <- value
+    tbl
+  }
+
+  if (base_model == "DEC") {
+    # Package defaults already describe DEC; nothing to change
+  } else if (base_model == "DIVALIKE") {
+    # No subset sympatry
+    tbl <- fix_param(tbl, "s", 0.0)
+    tbl["ysv", "type"] <- "2-j"
+    tbl["ys", "type"] <- "ysv*1/2"
+    tbl["y", "type"] <- "ysv*1/2"
+    tbl["v", "type"] <- "ysv*1/2"
+    # Classic widespread vicariance: all vicariance events equiprobable
+    tbl <- fix_param(tbl, "mx01v", 0.5)
+  } else if (base_model == "BAYAREALIKE") {
+    # No subset sympatry and no vicariance
+    tbl <- fix_param(tbl, "s", 0.0)
+    tbl <- fix_param(tbl, "v", 0.0)
+    tbl["ysv", "type"] <- "1-j"
+    tbl["ys", "type"] <- "ysv*1/1"
+    tbl["y", "type"] <- "1-j"
+    # Only exact range copying at cladogenesis
+    tbl <- fix_param(tbl, "mx01y", 0.9999)
+  } else {
+    stop("Unknown model '", model_name,
+         "'; expected DEC, DIVALIKE or BAYAREALIKE, optionally with +J",
+         call. = FALSE)
+  }
+
+  if (has_jump) {
+    tbl["j", "type"] <- "free"
+    tbl["j", c("init", "est")] <- 0.01
+    # The per-event weights sum to 2 (DIVALIKE) or 1 (BAYAREALIKE), so j must
+    # stay below that total; DEC keeps the package default maximum
+    if (base_model == "DIVALIKE") {
+      tbl["j", "max"] <- 1.99999
+    } else if (base_model == "BAYAREALIKE") {
+      tbl["j", "max"] <- 0.99999
+    }
+  } else {
+    tbl <- fix_param(tbl, "j", 0.0)
+  }
+
+  BioGeoBEARS_run_object$BioGeoBEARS_model_object@params_table <- tbl
+
+  # Validate the configured run, then optimise
+  check_BioGeoBEARS_run(BioGeoBEARS_run_object)
+  bears_optim_run(BioGeoBEARS_run_object)
+}
+
+# Base run object shared by every model; each fit works on its own copy
+BioGeoBEARS_run_object <- define_BioGeoBEARS_run()
+BioGeoBEARS_run_object$trfn <- trfn
+BioGeoBEARS_run_object$geogfn <- geogfn
+BioGeoBEARS_run_object$max_range_size <- max_range_size
+BioGeoBEARS_run_object$min_branchlength <- 0.000001
+BioGeoBEARS_run_object$include_null_range <- TRUE
+BioGeoBEARS_run_object$force_sparse <- FALSE
+BioGeoBEARS_run_object$speedup <- TRUE
+BioGeoBEARS_run_object$use_optimx <- TRUE
+BioGeoBEARS_run_object$calc_ancprobs <- TRUE
+
+# Load the tree/geography inputs referenced above into the run object
+BioGeoBEARS_run_object <- readfiles_BioGeoBEARS_run(BioGeoBEARS_run_object)
+
+# Validate the inputs. calc_loglike_sp() is the low-level likelihood routine
+# and does not accept a run object, so it must not be called (or assigned
+# back) here.
+check_BioGeoBEARS_run(BioGeoBEARS_run_object)
+
+# Fit each model; a failure in one model is reported and the loop continues
+n_tips <- length(tr$tip.label)  # sample size used for the AICc correction
+
+for (model in models_to_run) {
+  tryCatch({
+    res <- run_biogeobears_model(model, BioGeoBEARS_run_object)
+    results_list[[model]] <- res
+
+    # Save result
+    save(res, file = file.path(params$output_dir, paste0(model, "_result.Rdata")))
+
+    # res$outputs carries the fitted parameter table. The log-likelihood is
+    # read through the package accessor and the information criteria are
+    # computed here rather than read from slots.
+    params_table <- res$outputs@params_table
+    lnl <- get_LnL_from_BioGeoBEARS_results_object(res)
+    n_free <- sum(params_table$type == "free")
+    aic <- 2 * n_free - 2 * lnl
+    # AICc is undefined when the sample is not larger than n_free + 1
+    aicc <- if (n_tips - n_free - 1 > 0) {
+      aic + (2 * n_free * (n_free + 1)) / (n_tips - n_free - 1)
+    } else {
+      NA_real_
+    }
+
+    model_comparison <- rbind(model_comparison, data.frame(
+      Model = model,
+      LnL = lnl,
+      nParams = n_free,
+      AIC = aic,
+      AICc = aicc,
+      d = params_table["d", "est"],
+      e = params_table["e", "est"],
+      j = params_table["j", "est"],
+      stringsAsFactors = FALSE
+    ))
+  }, error = function(e) {
+    cat(paste("Error fitting model", model, ":", conditionMessage(e), "\n"))
+  })
+}
+```
+
+# Model Comparison
+
+```{r model-comparison}
+# Calculate AIC weights and rank the models (only if at least one model fit)
+if (nrow(model_comparison) > 0) {
+  model_comparison$delta_AIC <- model_comparison$AIC - min(model_comparison$AIC)
+  rel_lik <- exp(-0.5 * model_comparison$delta_AIC)
+  model_comparison$AIC_weight <- rel_lik / sum(rel_lik)
+
+  # Sort by AIC; the first row is the best-supported model
+  model_comparison <- model_comparison[order(model_comparison$AIC), ]
+  best_model <- model_comparison$Model[1]
+}
+
+# knitr only renders a table that is the visible value of a top-level
+# expression, so the kable must be the last thing evaluated in this `if`.
+if (nrow(model_comparison) > 0) {
+  kable(model_comparison, digits = 3, row.names = FALSE,
+        caption = "Model Comparison (sorted by AIC)") %>%
+    kable_styling(bootstrap_options = c("striped", "hover")) %>%
+    row_spec(1, bold = TRUE, background = "#d4edda")  # Highlight best model
+}
+
+# Model selection summary
+if (nrow(model_comparison) > 0) {
+  cat(paste("\n\nBest model by AIC:", best_model, "\n"))
+  cat(paste("AIC weight:", round(model_comparison$AIC_weight[1], 3), "\n"))
+}
+```
+
+# Ancestral Range Reconstruction
+
+## Best Model: `r if(exists('best_model')) best_model else 'TBD'`
+
+```{r plot-best-model, fig.width=10, fig.height=12}
+if (exists("best_model") && best_model %in% names(results_list)) {
+  res_best <- results_list[[best_model]]
+
+  # Create plots directory
+  plots_dir <- file.path(params$output_dir, "plots")
+  if (!dir.exists(plots_dir)) {
+    dir.create(plots_dir, recursive = TRUE)
+  }
+
+  analysis_titletxt <- paste("BioGeoBEARS:", best_model)
+
+  # Draw the best model's ancestral ranges as "pie" or "text". With
+  # `pdf_file` the plot goes to that PDF, and on.exit() closes the device
+  # even if plotting fails. Without it, the plot goes to the current
+  # (knitr) device.
+  draw_best_model <- function(plotwhat, pdf_file = NULL) {
+    if (!is.null(pdf_file)) {
+      pdf(pdf_file, width = 10, height = 12)
+      on.exit(dev.off(), add = TRUE)
+    }
+    plot_BioGeoBEARS_results(
+      results_object = res_best,
+      analysis_titletxt = analysis_titletxt,
+      addl_params = list("j"),
+      plotwhat = plotwhat,
+      label.offset = 0.5,
+      tipcex = 0.7,
+      statecex = 0.7,
+      splitcex = 0.6,
+      titlecex = 0.8,
+      plotsplits = TRUE,
+      include_null_range = TRUE,
+      tr = tr,
+      tipranges = tipranges
+    )
+    invisible(NULL)
+  }
+
+  # Save both variants to disk
+  draw_best_model("pie", file.path(plots_dir, paste0(best_model, "_pie.pdf")))
+  draw_best_model("text", file.path(plots_dir, paste0(best_model, "_text.pdf")))
+
+  # Display in notebook (pie chart version)
+  draw_best_model("pie")
+
+  cat(paste("\n\nPlots saved to:", plots_dir, "\n"))
+}
+```
+
+# Parameter Estimates
+
+```{r parameter-estimates, fig.width=10, fig.height=6}
+if (nrow(model_comparison) > 0) {
+  # Three panels side by side; keep the old layout so it can be restored
+  old_par <- par(mfrow = c(1, 3))
+
+  # The best model (set in the model-comparison chunk) is drawn in green
+  bar_cols <- ifelse(model_comparison$Model == best_model, "darkgreen", "lightblue")
+
+  # Plot d (dispersal) estimates
+  barplot(model_comparison$d, names.arg = model_comparison$Model,
+          main = "Dispersal Rate (d)", ylab = "Rate", las = 2, cex.names = 0.8,
+          col = bar_cols)
+
+  # Plot e (extinction) estimates
+  barplot(model_comparison$e, names.arg = model_comparison$Model,
+          main = "Extinction Rate (e)", ylab = "Rate", las = 2, cex.names = 0.8,
+          col = bar_cols)
+
+  # Plot j (founder-event) estimates for +J models only. Models without +J
+  # are blanked by name rather than by testing the estimate against 0.
+  j_vals <- model_comparison$j
+  j_vals[!grepl("\\+J", model_comparison$Model)] <- NA
+  if (all(is.na(j_vals))) {
+    # barplot() cannot build an axis from all-NA heights; show a placeholder
+    plot.new()
+    title(main = "Founder-event Rate (j)")
+    text(0.5, 0.5, "No +J models fitted")
+  } else {
+    barplot(j_vals, names.arg = model_comparison$Model,
+            main = "Founder-event Rate (j)", ylab = "Rate", las = 2, cex.names = 0.8,
+            col = bar_cols)
+  }
+
+  par(old_par)
+}
+```
+
+# Likelihood Ratio Tests
+
+```{r lrt-tests}
+# Compare each base model with its +J counterpart. The base model is nested
+# in the +J model, which has one extra free parameter (j).
+base_model_names <- c("DEC", "DIVALIKE", "BAYAREALIKE")
+
+lrt_rows <- lapply(base_model_names, function(base) {
+  j_model <- paste0(base, "+J")
+  if (!(base %in% model_comparison$Model && j_model %in% model_comparison$Model)) {
+    return(NULL)  # this pair was not (successfully) fitted
+  }
+
+  # [1] guards against a model that was listed twice
+  lnl_base <- model_comparison[model_comparison$Model == base, "LnL"][1]
+  lnl_j <- model_comparison[model_comparison$Model == j_model, "LnL"][1]
+
+  lrt_stat <- 2 * (lnl_j - lnl_base)
+  lrt_df <- 1L  # One additional parameter (j)
+
+  data.frame(
+    Comparison = paste(base, "vs", j_model),
+    Model1 = base,
+    Model2 = j_model,
+    LRT_statistic = lrt_stat,
+    df = lrt_df,
+    p_value = pchisq(lrt_stat, df = lrt_df, lower.tail = FALSE),
+    stringsAsFactors = FALSE
+  )
+})
+lrt_rows <- Filter(Negate(is.null), lrt_rows)
+lrt_results <- if (length(lrt_rows) > 0) do.call(rbind, lrt_rows) else data.frame()
+
+has_lrt <- nrow(lrt_results) > 0
+if (has_lrt) {
+  lrt_results$Significant <- ifelse(lrt_results$p_value < 0.05, "Yes*", "No")
+}
+
+# knitr only renders a table that is the visible value of a top-level
+# expression, so the kable is the sole statement of this `if`.
+if (has_lrt) {
+  kable(lrt_results, digits = 4, row.names = FALSE,
+        caption = "Likelihood Ratio Tests (nested model comparisons)") %>%
+    kable_styling(bootstrap_options = c("striped", "hover"))
+}
+
+if (has_lrt) {
+  cat("\n* p < 0.05 indicates significant improvement with +J parameter\n")
+}
+```
+
+# Session Info
+
+```{r session-info}
+# Record the R version and attached package versions used for this report
+sessionInfo()
+```
+
+# Outputs
+
+All results have been saved to: **`r params$output_dir`**
+
+Files generated:
+
+- `[MODEL]_result.Rdata` - R data files with complete model results
+- `plots/[MODEL]_pie.pdf` - Phylogeny with pie charts showing ancestral range probabilities
+- `plots/[MODEL]_text.pdf` - Phylogeny with text labels showing most likely ancestral ranges
+- `biogeobears_analysis_template.html` - This HTML report
+
+To load a saved result in R (the fitted model is restored as an object named `res`):
+```r
+load("results/DEC+J_result.Rdata")
+```
--- a/skills/biogeobears/scripts/validate_geography_file.py
+++ b/skills/biogeobears/scripts/validate_geography_file.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+"""
+Validates and optionally reformats a BioGeoBEARS geography file.
+
+Geography files must follow the PHYLIP-like format:
+Line 1: n_species [TAB] n_areas [TAB] (area1 area2 area3 ...)
+Lines 2+: species_name [TAB] binary_string (e.g., 011 for absent in area1, present in area2 and area3)
+
+Common errors:
+- Spaces instead of tabs
+- Spaces in species names
+- Spaces within binary strings
+- Species names not matching tree tip labels
+"""
+
+import sys
+import argparse
+import re
+from pathlib import Path
+
+
+def validate_geography_file(filepath, tree_tips=None):
+    """
+    Validate geography file format.
+
+    The header must be ``n_species<TAB>n_areas<TAB>(area names)``; every
+    following non-blank line must be ``species_name<TAB>binary_code``.
+    Species lines are only checked once the header has parsed completely.
+
+    Args:
+        filepath: Path to geography file
+        tree_tips: Optional iterable of tree tip labels to validate against
+            (an empty or None value skips the comparison)
+
+    Returns:
+        dict with keys 'valid' (True when no errors were found), 'errors',
+        'warnings' and 'info' (n_species, n_areas, areas, species when the
+        header could be parsed)
+    """
+    errors = []
+    warnings = []
+    info = {}
+
+    def report():
+        # Snapshot of everything collected so far
+        return {
+            'valid': len(errors) == 0,
+            'errors': errors,
+            'warnings': warnings,
+            'info': info
+        }
+
+    with open(filepath, 'r') as f:
+        lines = [line.rstrip('\n\r') for line in f]
+
+    if not lines:
+        errors.append("File is empty")
+        return report()
+
+    # --- Header line -------------------------------------------------------
+    header = lines[0]
+    if '\t' not in header:
+        errors.append("Line 1: Missing tab delimiter (should be: n_species [TAB] n_areas [TAB] (area_names))")
+        return report()
+
+    header_fields = header.split('\t')
+    if len(header_fields) < 3:
+        errors.append("Line 1: Expected format 'n_species [TAB] n_areas [TAB] (area_names)'")
+        return report()
+
+    try:
+        n_species = int(header_fields[0])
+        n_areas = int(header_fields[1])
+    except ValueError:
+        errors.append("Line 1: First two fields must be integers (n_species and n_areas)")
+        return report()
+
+    area_part = header_fields[2].strip()
+    if not (area_part.startswith('(') and area_part.endswith(')')):
+        errors.append("Line 1: Area names should be in parentheses: (A B C)")
+        return report()
+
+    areas = area_part[1:-1].split()
+    if len(areas) != n_areas:
+        errors.append(f"Line 1: Declared {n_areas} areas but found {len(areas)} area names")
+
+    info['n_species'] = n_species
+    info['n_areas'] = n_areas
+    info['areas'] = areas
+
+    # --- Species lines -----------------------------------------------------
+    species_found = []
+    seen_species = set()
+    for i, line in enumerate(lines[1:], start=2):
+        if not line.strip():
+            continue
+
+        if '\t' not in line:
+            errors.append(f"Line {i}: Missing tab between species name and binary code")
+            continue
+
+        fields = line.split('\t')
+        if len(fields) != 2:
+            errors.append(f"Line {i}: Expected exactly one tab between species name and binary code")
+            continue
+
+        species_name, binary_code = fields
+
+        # Check for spaces in species name
+        if ' ' in species_name:
+            errors.append(f"Line {i}: Species name '{species_name}' contains spaces (use underscores instead)")
+
+        # A repeated name makes the species-to-range mapping ambiguous
+        if species_name in seen_species:
+            errors.append(f"Line {i}: Duplicate species name '{species_name}'")
+        seen_species.add(species_name)
+
+        # Check for spaces in binary code (a tab cannot occur here: the line
+        # was split on tabs into exactly two fields)
+        if ' ' in binary_code:
+            errors.append(f"Line {i}: Binary code '{binary_code}' contains spaces or tabs (should be like '011' with no spaces)")
+
+        # Check binary code length
+        if len(binary_code) != n_areas:
+            errors.append(f"Line {i}: Binary code length ({len(binary_code)}) doesn't match number of areas ({n_areas})")
+
+        # Check binary code characters
+        if not all(c in '01' for c in binary_code):
+            errors.append(f"Line {i}: Binary code contains invalid characters (only 0 and 1 allowed)")
+
+        species_found.append(species_name)
+
+    # Check species count
+    if len(species_found) != n_species:
+        warnings.append(f"Header declares {n_species} species but found {len(species_found)} data lines")
+
+    info['species'] = species_found
+
+    # --- Optional cross-check against the tree -----------------------------
+    if tree_tips:
+        tree_set = set(tree_tips)
+
+        missing_in_tree = seen_species - tree_set
+        missing_in_geog = tree_set - seen_species
+
+        if missing_in_tree:
+            errors.append(f"Species in geography file but not in tree: {', '.join(sorted(missing_in_tree))}")
+        if missing_in_geog:
+            errors.append(f"Species in tree but not in geography file: {', '.join(sorted(missing_in_geog))}")
+
+    return report()
+
+
+def reformat_geography_file(input_path, output_path, delimiter=','):
+    """
+    Attempt to reformat a geography file from common formats.
+
+    The first non-blank line is treated as a column header unless it starts
+    with a digit; without a header, areas are named A, B, C, ...
+    A cell counts as "present" when it is 1, present or true (case-insensitive);
+    anything else, including an empty cell, is written as 0.
+    Spaces in species and area names are replaced by underscores.
+
+    Args:
+        input_path: Path to input file
+        output_path: Path for output file
+        delimiter: Delimiter used in input file (default: comma)
+
+    Raises:
+        ValueError: if the input file contains no non-blank lines
+    """
+    present_tokens = {'1', 'present', 'true'}
+
+    with open(input_path, 'r') as f:
+        # Strip only the line ending: stripping all whitespace would delete
+        # trailing empty cells of a tab-delimited row. Blank lines are dropped.
+        lines = [line.rstrip('\r\n') for line in f if line.strip()]
+
+    if not lines:
+        raise ValueError(f"Input file is empty: {input_path}")
+
+    # Detect if first line is a header
+    first_fields = lines[0].split(delimiter)
+    has_header = not lines[0].strip()[0].isdigit()
+
+    if has_header:
+        # Area names come from the header; a space inside a name would split
+        # it in two in the space-separated output header
+        area_names = [name.strip().replace(' ', '_') for name in first_fields[1:]]
+        data_lines = lines[1:]
+    else:
+        # No header, infer the area count from the first data line
+        area_names = [chr(65 + i) for i in range(len(first_fields) - 1)]  # A, B, C, ...
+        data_lines = lines
+
+    # Parse species data
+    species_data = []
+    for line in data_lines:
+        cells = line.split(delimiter)
+        if len(cells) < 2:
+            continue
+
+        species_name = cells[0].strip().replace(' ', '_')
+        presence = ''.join('1' if cell.strip().lower() in present_tokens else '0'
+                           for cell in cells[1:])
+        species_data.append((species_name, presence))
+
+    # Write output
+    n_species = len(species_data)
+    n_areas = len(area_names)
+    with open(output_path, 'w') as f:
+        # Header line
+        f.write(f"{n_species}\t{n_areas}\t({' '.join(area_names)})\n")
+
+        # Species lines
+        for species_name, binary_code in species_data:
+            f.write(f"{species_name}\t{binary_code}\n")
+
+    print(f"Reformatted {n_species} species across {n_areas} areas")
+    print(f"Output written to: {output_path}")
+
+
+def _extract_tip_labels(tree_string):
+    """Return the tip labels of a Newick string, in order of appearance.
+
+    A tip label directly follows '(' or ','. Text that follows ')' is an
+    internal node label or support value and is ignored. Quoted labels
+    containing whitespace or punctuation are not supported.
+    """
+    return re.findall(r'[(,]\s*([^(),:;\[\]\s]+)', tree_string)
+
+
+def main():
+    """Command-line entry point.
+
+    Returns:
+        Process exit status: 0 on success, 1 on validation errors, a
+        reformatting failure, or when no action was requested.
+    """
+    parser = argparse.ArgumentParser(
+        description='Validate and reformat BioGeoBEARS geography files',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Validate a geography file
+  python validate_geography_file.py input.txt --validate
+
+  # Reformat from CSV to PHYLIP format
+  python validate_geography_file.py input.csv --reformat -o output.data
+
+  # Reformat with tab delimiter
+  python validate_geography_file.py input.txt --reformat --delimiter tab -o output.data
+        """
+    )
+
+    parser.add_argument('input', help='Input geography file')
+    parser.add_argument('--validate', action='store_true',
+                       help='Validate the file format')
+    parser.add_argument('--reformat', action='store_true',
+                       help='Reformat file to BioGeoBEARS format')
+    parser.add_argument('-o', '--output',
+                       help='Output file path (required for --reformat)')
+    parser.add_argument('--delimiter', default=',',
+                       help='Delimiter in input file (default: comma). Use "tab" for tab-delimited.')
+    parser.add_argument('--tree',
+                       help='Newick tree file to validate species names against')
+
+    args = parser.parse_args()
+
+    if args.delimiter.lower() == 'tab':
+        args.delimiter = '\t'
+
+    # Parse tree tips if provided; a failure here is reported but not fatal
+    tree_tips = None
+    if args.tree:
+        try:
+            with open(args.tree, 'r') as f:
+                tree_string = f.read().strip()
+            tree_tips = _extract_tip_labels(tree_string)
+            print(f"Found {len(tree_tips)} tips in tree file")
+        except Exception as e:
+            print(f"Warning: Could not parse tree file: {e}")
+
+    # --validate takes precedence when both actions are requested
+    if args.validate:
+        result = validate_geography_file(args.input, tree_tips)
+
+        print(f"\nValidation Results for: {args.input}")
+        print("=" * 60)
+
+        if result['info']:
+            print("\nFile Info:")
+            print(f"  Species: {result['info'].get('n_species', 'unknown')}")
+            print(f"  Areas: {result['info'].get('n_areas', 'unknown')}")
+            if 'areas' in result['info']:
+                print(f"  Area names: {', '.join(result['info']['areas'])}")
+
+        if result['warnings']:
+            print(f"\nWarnings ({len(result['warnings'])}):")
+            for warning in result['warnings']:
+                print(f"  ⚠️  {warning}")
+
+        if result['errors']:
+            print(f"\nErrors ({len(result['errors'])}):")
+            for error in result['errors']:
+                print(f"  ❌ {error}")
+        else:
+            print("\n✅ File is valid!")
+
+        return 0 if result['valid'] else 1
+
+    elif args.reformat:
+        if not args.output:
+            print("Error: --output required when using --reformat")
+            return 1
+
+        try:
+            reformat_geography_file(args.input, args.output, args.delimiter)
+
+            # Validate reformatted file
+            result = validate_geography_file(args.output, tree_tips)
+            if result['valid']:
+                print("✅ Reformatted file is valid!")
+            else:
+                print("\n⚠️  Reformatted file has validation errors:")
+                for error in result['errors']:
+                    print(f"  ❌ {error}")
+                return 1
+
+        except Exception as e:
+            print(f"Error during reformatting: {e}")
+            return 1
+
+    else:
+        parser.print_help()
+        return 1
+
+    return 0
+
+
+# Run the CLI only when executed as a script; main()'s return value becomes
+# the process exit status.
+if __name__ == '__main__':
+    sys.exit(main())