#!/usr/bin/env python3
"""
Validates and optionally reformats a BioGeoBEARS geography file.

Geography files must follow the PHYLIP-like format:
Line 1: n_species [TAB] n_areas [TAB] (area1 area2 area3 ...)
Lines 2+: species_name [TAB] binary_string (e.g., 011 for absent in area1, present in area2 and area3)

Common errors:
- Spaces instead of tabs
- Spaces in species names
- Spaces within binary strings
- Species names not matching tree tip labels
"""

import sys
import argparse
import re
from pathlib import Path


def _parse_geography_header(header, errors):
    """
    Parse the header line of a geography file.

    Args:
        header: First line of the file, without its line terminator
        errors: List that receives a message for every problem found

    Returns:
        (n_species, n_areas, areas) tuple, or None when the header is too
        malformed for the species lines to be checked against it
    """
    fields = header.split('\t')
    if len(fields) == 1:
        errors.append("Line 1: Missing tab delimiter (should be: n_species [TAB] n_areas [TAB] (area_names))")
        return None
    if len(fields) < 3:
        errors.append("Line 1: Expected format 'n_species [TAB] n_areas [TAB] (area_names)'")
        return None

    # Only the integer conversions can raise ValueError, so only they are guarded.
    try:
        n_species = int(fields[0])
        n_areas = int(fields[1])
    except ValueError:
        errors.append("Line 1: First two fields must be integers (n_species and n_areas)")
        return None

    area_part = fields[2].strip()
    if not (area_part.startswith('(') and area_part.endswith(')')):
        errors.append("Line 1: Area names should be in parentheses: (A B C)")
        return None

    areas = area_part[1:-1].split()
    if len(areas) != n_areas:
        # Not fatal: species lines are still checked against the declared n_areas.
        errors.append(f"Line 1: Declared {n_areas} areas but found {len(areas)} area names")

    return n_species, n_areas, areas


def _check_species_line(line_no, line, n_areas, errors):
    """
    Check one non-blank species line and record any problems.

    Args:
        line_no: 1-based line number used in error messages
        line: Line content, without its line terminator
        n_areas: Number of areas declared in the header
        errors: List that receives a message for every problem found

    Returns:
        The species name, or None when the line could not be split into a
        name and a binary code
    """
    fields = line.split('\t')
    if len(fields) == 1:
        errors.append(f"Line {line_no}: Missing tab between species name and binary code")
        return None
    if len(fields) != 2:
        errors.append(f"Line {line_no}: Expected exactly one tab between species name and binary code")
        return None

    species_name, binary_code = fields

    if not species_name:
        errors.append(f"Line {line_no}: Species name is empty")

    if ' ' in species_name:
        errors.append(f"Line {line_no}: Species name '{species_name}' contains spaces (use underscores instead)")

    # A tab cannot occur here (the line was split on tabs), so only spaces are tested.
    if ' ' in binary_code:
        errors.append(f"Line {line_no}: Binary code '{binary_code}' contains spaces or tabs (should be like '011' with no spaces)")

    if len(binary_code) != n_areas:
        errors.append(f"Line {line_no}: Binary code length ({len(binary_code)}) doesn't match number of areas ({n_areas})")

    if not all(c in '01' for c in binary_code):
        errors.append(f"Line {line_no}: Binary code contains invalid characters (only 0 and 1 allowed)")

    return species_name


def validate_geography_file(filepath, tree_tips=None):
    """
    Validate geography file format.

    The header (line 1) is checked first; if it cannot be parsed, validation
    stops there because the species lines cannot be checked without it.
    Blank lines after the header are ignored.

    Args:
        filepath: Path to geography file
        tree_tips: Optional collection of tree tip labels to validate against;
            the comparison is skipped when it is None or empty

    Returns:
        dict with keys:
            'valid': True when no errors were recorded
            'errors': list of error messages
            'warnings': list of warning messages (species-count mismatch)
            'info': dict with 'n_species', 'n_areas', 'areas' and 'species'
                when the header could be parsed, otherwise empty
    """
    errors = []
    warnings = []
    info = {}

    def build_result():
        return {
            'valid': not errors,
            'errors': errors,
            'warnings': warnings,
            'info': info
        }

    with open(filepath, 'r') as f:
        lines = [line.rstrip('\n\r') for line in f]

    if not lines:
        errors.append("File is empty")
        return build_result()

    header = _parse_geography_header(lines[0], errors)
    if header is None:
        return build_result()

    n_species, n_areas, areas = header
    info['n_species'] = n_species
    info['n_areas'] = n_areas
    info['areas'] = areas

    # Validate species lines
    species_found = []
    seen = set()
    for line_no, line in enumerate(lines[1:], start=2):
        if not line.strip():
            continue

        species_name = _check_species_line(line_no, line, n_areas, errors)
        if species_name is None:
            continue

        # Each species must appear once; duplicates cannot map onto tree tips.
        if species_name in seen:
            errors.append(f"Line {line_no}: Duplicate species name '{species_name}'")
        seen.add(species_name)
        species_found.append(species_name)

    # Check species count
    if len(species_found) != n_species:
        warnings.append(f"Header declares {n_species} species but found {len(species_found)} data lines")

    info['species'] = species_found

    # Check against tree tips if provided
    if tree_tips:
        tree_set = set(tree_tips)

        missing_in_tree = seen - tree_set
        missing_in_geog = tree_set - seen

        if missing_in_tree:
            errors.append(f"Species in geography file but not in tree: {', '.join(sorted(missing_in_tree))}")
        if missing_in_geog:
            errors.append(f"Species in tree but not in geography file: {', '.join(sorted(missing_in_geog))}")

    return build_result()


def reformat_geography_file(input_path, output_path, delimiter=','):
    """
    Attempt to reformat a geography file from common formats.

    The input is a delimited table: one species per row, the species name in
    the first column and one presence/absence column per area. Blank lines
    are ignored, and rows with fewer than two fields are skipped.

    The first non-blank row is treated as data (no header) when it starts
    with a digit or when every cell after the first is a recognised
    presence/absence value; otherwise it is read as a header of area names.
    Without a header, areas are named A, B, C, ...

    Cells equal to '1', 'present' or 'true' (case-insensitive) are written as
    '1'; every other cell is written as '0'. Spaces in species names and
    whitespace in area names are replaced by underscores.

    Args:
        input_path: Path to input file
        output_path: Path for output file
        delimiter: Delimiter used in input file (default: comma)

    Raises:
        ValueError: If the input file contains no non-blank lines
    """
    present_tokens = {'1', 'present', 'true'}
    binary_tokens = present_tokens | {'0', 'absent', 'false'}

    # Strip only the line terminator so that leading/trailing empty cells
    # survive when the delimiter itself is whitespace (e.g. a tab).
    with open(input_path, 'r') as f:
        rows = [line.rstrip('\r\n').split(delimiter) for line in f if line.strip()]

    if not rows:
        raise ValueError(f"Input file '{input_path}' contains no data")

    first_row = rows[0]
    starts_with_digit = first_row[0].strip()[:1].isdigit()
    all_cells_binary = len(first_row) > 1 and all(
        cell.strip().casefold() in binary_tokens for cell in first_row[1:]
    )

    if starts_with_digit or all_cells_binary:
        # No header, infer the number of areas from the first data row
        area_names = [chr(65 + i) for i in range(len(first_row) - 1)]  # A, B, C, ...
        data_rows = rows
    else:
        # Area names are space-separated in the output, so they must not
        # contain whitespace themselves.
        area_names = ['_'.join(cell.split()) for cell in first_row[1:]]
        data_rows = rows[1:]

    # Parse species data
    species_data = []
    for cells in data_rows:
        if len(cells) < 2:
            continue

        species_name = cells[0].strip().replace(' ', '_')
        presence = ''.join(
            '1' if cell.strip().casefold() in present_tokens else '0'
            for cell in cells[1:]
        )
        species_data.append((species_name, presence))

    n_species = len(species_data)
    n_areas = len(area_names)

    # Write output
    with open(output_path, 'w') as f:
        f.write(f"{n_species}\t{n_areas}\t({' '.join(area_names)})\n")
        f.writelines(f"{species_name}\t{binary_code}\n"
                     for species_name, binary_code in species_data)

    print(f"Reformatted {n_species} species across {n_areas} areas")
    print(f"Output written to: {output_path}")


# A tip label is a label that directly follows '(' or ','. Labels that follow
# ')' are internal-node names or support values and must not be collected.
_NEWICK_TIP_RE = re.compile(r"[(,]\s*(?:'([^']+)'|([^\s(),:;\[\]']+))")


def _extract_newick_tips(newick):
    """Return the tip labels of a Newick string, in order of appearance."""
    # Bracketed comments may contain commas and colons; drop them first.
    without_comments = re.sub(r'\[[^\]]*\]', '', newick)
    return [quoted or bare for quoted, bare in _NEWICK_TIP_RE.findall(without_comments)]


def main(argv=None):
    """
    Command-line entry point.

    With --validate, the input file is validated and a report is printed.
    With --reformat (and --output), the input is rewritten in BioGeoBEARS
    format and the result is validated. If both flags are given, only
    validation runs. With neither, the help text is printed.

    Args:
        argv: Optional list of command-line arguments; defaults to
            sys.argv[1:] when None

    Returns:
        Process exit code: 0 on success, 1 on any error, invalid file, or
        when no action was requested
    """
    parser = argparse.ArgumentParser(
        description='Validate and reformat BioGeoBEARS geography files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Validate a geography file
  python validate_geography_file.py input.txt --validate

  # Reformat from CSV to PHYLIP format
  python validate_geography_file.py input.csv --reformat -o output.data

  # Reformat with tab delimiter
  python validate_geography_file.py input.txt --reformat --delimiter tab -o output.data
        """
    )

    parser.add_argument('input', help='Input geography file')
    parser.add_argument('--validate', action='store_true',
                       help='Validate the file format')
    parser.add_argument('--reformat', action='store_true',
                       help='Reformat file to BioGeoBEARS format')
    parser.add_argument('-o', '--output',
                       help='Output file path (required for --reformat)')
    parser.add_argument('--delimiter', default=',',
                       help='Delimiter in input file (default: comma). Use "tab" for tab-delimited.')
    parser.add_argument('--tree',
                       help='Newick tree file to validate species names against')

    args = parser.parse_args(argv)

    if args.delimiter.lower() == 'tab':
        args.delimiter = '\t'

    # Parse tree tips if provided; a failure here is only a warning, and the
    # tree comparison is then skipped.
    tree_tips = None
    if args.tree:
        try:
            with open(args.tree, 'r') as f:
                tree_string = f.read().strip()
        except (OSError, UnicodeError) as e:
            print(f"Warning: Could not parse tree file: {e}")
        else:
            tree_tips = _extract_newick_tips(tree_string)
            print(f"Found {len(tree_tips)} tips in tree file")

    if args.validate:
        try:
            result = validate_geography_file(args.input, tree_tips)
        except (OSError, UnicodeError) as e:
            print(f"Error: Could not read input file: {e}")
            return 1

        print(f"\nValidation Results for: {args.input}")
        print("=" * 60)

        file_info = result['info']
        if file_info:
            print("\nFile Info:")
            print(f"  Species: {file_info.get('n_species', 'unknown')}")
            print(f"  Areas: {file_info.get('n_areas', 'unknown')}")
            if 'areas' in file_info:
                print(f"  Area names: {', '.join(file_info['areas'])}")

        if result['warnings']:
            print(f"\nWarnings ({len(result['warnings'])}):")
            for warning in result['warnings']:
                print(f"  ⚠️  {warning}")

        if result['errors']:
            print(f"\nErrors ({len(result['errors'])}):")
            for error in result['errors']:
                print(f"  ❌ {error}")
        else:
            print("\n✅ File is valid!")

        return 0 if result['valid'] else 1

    if args.reformat:
        if not args.output:
            print("Error: --output required when using --reformat")
            return 1

        # Top-level boundary: report any failure instead of a traceback.
        try:
            reformat_geography_file(args.input, args.output, args.delimiter)

            # Validate reformatted file
            result = validate_geography_file(args.output, tree_tips)
        except Exception as e:
            print(f"Error during reformatting: {e}")
            return 1

        if not result['valid']:
            print("\n⚠️  Reformatted file has validation errors:")
            for error in result['errors']:
                print(f"  ❌ {error}")
            return 1

        print("✅ Reformatted file is valid!")
        return 0

    parser.print_help()
    return 1


# Script entry point: run the CLI and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())