#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "litellm",
#     "requests>=2.31.0",
#     "tenacity",
#     "markitdown>=0.1.0",
# ]
# ///
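# The block above is PEP 723 inline script metadata; running the file with
# "uv run" resolves and installs these dependencies before execution.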
"""
Consultant CLI - LiteLLM-powered LLM consultation tool
Supports async invocation, custom base URLs, and flexible model selection.

Run with: uv run consultant_cli.py [args]
This automatically installs/updates the declared dependencies
(litellm, requests, tenacity, markitdown) on first run.
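
Example:
    uv run consultant_cli.py -p "Review this code for bugs" -f src/main.py -s code-review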
"""

import argparse
import json
import sys
from pathlib import Path

# Add scripts directory to path
SCRIPTS_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPTS_DIR))
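
# With the scripts directory on sys.path, the sibling helper modules below
# (config, file_handler, litellm_client, model_selector, session_manager)
# resolve regardless of the caller's working directory.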
import config
from file_handler import (
    FileHandler,
    build_multimodal_content,
    build_prompt_with_references,
    has_images,
    validate_vision_support,
)
from litellm_client import LiteLLMClient
from model_selector import ModelSelector
from session_manager import SessionManager


def validate_context_size(
    full_prompt: str, model: str, client: LiteLLMClient, num_files: int
) -> bool:
    """
    Validate that the full prompt fits in the model's context window.
    Returns True if it fits; raises ValueError if it exceeds the limit.
    """

    # Count tokens for the complete prompt
    total_tokens = client.count_tokens(full_prompt, model)

    # Get limit
    max_tokens = client.get_max_tokens(model)

    # Reserve for response
    available_tokens = int(max_tokens * (1 - config.CONTEXT_RESERVE_RATIO))
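    # CONTEXT_RESERVE_RATIO comes from config; it is presumably the fraction of
    # the context window held back so the model has room to respond.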

    # Print summary
    print("\n📊 Token Usage:")
    print(f"- Input: {total_tokens:,} tokens ({num_files} files)")
    print(f"- Limit: {max_tokens:,} tokens")
    print(
        f"- Available: {available_tokens:,} tokens ({int((available_tokens/max_tokens)*100)}%)\n"
    )

    if total_tokens > max_tokens:
        raise ValueError(
            f"Input exceeds context limit!\n"
            f"  Input: {total_tokens:,} tokens\n"
            f"  Limit: {max_tokens:,} tokens\n"
            f"  Overage: {total_tokens - max_tokens:,} tokens\n\n"
            f"Suggestions:\n"
            f"1. Reduce number of files (currently {num_files})\n"
            f"2. Use a model with larger context\n"
            f"3. Shorten the prompt"
        )

    if total_tokens > available_tokens:
        print(f"⚠️ WARNING: Using {int((total_tokens/max_tokens)*100)}% of context")
        print("   Consider reducing input size for better response quality\n")

    return True


def handle_invocation(args: argparse.Namespace) -> int:
    """Handle main invocation command"""

    # Determine base URL: --base-url flag > OPENAI_BASE_URL env var > None
    base_url = args.base_url
    if not base_url:
        base_url = config.get_base_url()
        if base_url:
            print(f"Using base URL from OPENAI_BASE_URL: {base_url}")

    # Initialize components
    session_mgr = SessionManager()
    client = LiteLLMClient(base_url=base_url, api_key=args.api_key)

    # Process files using FileHandler
    file_handler = FileHandler()
    processed_files = []
    multimodal_content = None

    if args.files:
        processed_files, file_errors = file_handler.process_files(args.files)
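        # process_files returns the successfully processed files plus a list of
        # per-file errors, so every problem can be reported in one pass below.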

        # If any files failed, report errors and exit
        if file_errors:
            print("\nERROR: Some files could not be processed:", file=sys.stderr)
            for err in file_errors:
                print(f"  - {err.path}: {err.reason}", file=sys.stderr)
            print(
                "\nPlease fix or remove the problematic files and try again.",
                file=sys.stderr,
            )
            return 1

        # Validate vision support if images present
        if has_images(processed_files):
            validate_vision_support(args.model, has_images=True)

        # Print file processing summary
        text_count = sum(1 for f in processed_files if f.category.value == "text")
        office_count = sum(1 for f in processed_files if f.category.value == "office")
        image_count = sum(1 for f in processed_files if f.category.value == "image")

        print("\nFile Processing Summary:")
        print(f"  - Text files: {text_count}")
        print(f"  - Office documents (converted): {office_count}")
        print(f"  - Images: {image_count}")

    # Log model being used
    print(f"Using model: {args.model}")

    # Validate environment variables (only if no custom base URL)
    if not base_url:
        env_status = client.validate_environment(args.model)
        if not env_status.get("keys_in_environment", False):
            missing = env_status.get("missing_keys", [])
            error = env_status.get("error", "")

            print(
                f"\n❌ ERROR: Missing required environment variables for model '{args.model}'",
                file=sys.stderr,
            )
            print(f"\nMissing keys: {', '.join(missing)}", file=sys.stderr)

            if error:
                print(f"\nDetails: {error}", file=sys.stderr)

            print("\n💡 To fix this:", file=sys.stderr)
            print("  1. Set the required environment variable(s):", file=sys.stderr)
            for key in missing:
                print(f"     export {key}=your-api-key", file=sys.stderr)
            print(
                "  2. Or use --base-url to specify a custom LiteLLM endpoint",
                file=sys.stderr,
            )
            print(
                "  3. Or use --model to specify a different model\n", file=sys.stderr
            )

            return 1

    # Build full prompt with reference files section
    full_prompt = build_prompt_with_references(args.prompt, processed_files)

    # Build multimodal content if we have images
    if has_images(processed_files):
        multimodal_content = build_multimodal_content(full_prompt, processed_files)

    # Check context limits on the full prompt
    try:
        validate_context_size(full_prompt, args.model, client, len(processed_files))
    except ValueError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 1

    # Create and start session
    session_id = session_mgr.create_session(
        slug=args.slug,
        prompt=full_prompt,
        model=args.model,
        base_url=base_url,
        api_key=args.api_key,
        reasoning_effort=args.reasoning_effort,
        multimodal_content=multimodal_content,
    )

    print(f"Session created: {session_id}")
    print(f"Reattach via: python3 {__file__} session {args.slug}")
    print("Waiting for completion...")
    try:
        result = session_mgr.wait_for_completion(session_id)

        if result.get("status") == "completed":
            print("\n" + "=" * 80)
            print("RESPONSE:")
            print("=" * 80)
            print(result.get("output", "No output available"))
            print("=" * 80)

            # Print metadata section (model, reasoning effort, tokens, cost)
            print("\n" + "=" * 80)
            print("METADATA:")
            print("=" * 80)

            # Model info
            print(f"model: {result.get('model', args.model)}")
            print(
                f"reasoning_effort: {result.get('reasoning_effort', args.reasoning_effort)}"
            )

            # Token usage and cost
            usage = result.get("usage")
            cost_info = result.get("cost_info")
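
            # Prefer cost_info (token counts plus computed USD costs) when the
            # session recorded it; otherwise fall back to the raw usage object.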
            if cost_info:
                print(f"input_tokens: {cost_info.get('input_tokens', 0)}")
                print(f"output_tokens: {cost_info.get('output_tokens', 0)}")
                print(
                    f"total_tokens: {cost_info.get('input_tokens', 0) + cost_info.get('output_tokens', 0)}"
                )
                print(f"input_cost_usd: {cost_info.get('input_cost', 0):.6f}")
                print(f"output_cost_usd: {cost_info.get('output_cost', 0):.6f}")
                print(f"total_cost_usd: {cost_info.get('total_cost', 0):.6f}")
            elif usage:
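                # Provider responses name these fields differently: OpenAI-style
                # usage reports prompt_tokens/completion_tokens, while
                # Anthropic-style usage reports input_tokens/output_tokens.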
                input_tokens = usage.get("prompt_tokens") or usage.get(
                    "input_tokens", 0
                )
                output_tokens = usage.get("completion_tokens") or usage.get(
                    "output_tokens", 0
                )
                print(f"input_tokens: {input_tokens}")
                print(f"output_tokens: {output_tokens}")
                print(f"total_tokens: {input_tokens + output_tokens}")

            print("=" * 80)

            return 0
        else:
            print(f"\nSession ended with status: {result.get('status')}")
            if "error" in result:
                print(f"Error: {result['error']}")
            return 1

    except TimeoutError as e:
        print(f"\nERROR: {e}", file=sys.stderr)
        return 1


def handle_session_status(args: argparse.Namespace) -> int:
    """Handle session status check"""

    session_mgr = SessionManager()
    status = session_mgr.get_session_status(args.slug)

    if "error" in status and "No session found" in status["error"]:
        print(f"ERROR: {status['error']}", file=sys.stderr)
        return 1

    # Pretty print status
    print(json.dumps(status, indent=2))
    return 0


def handle_list_sessions(args: argparse.Namespace) -> int:
    """Handle list sessions command"""

    session_mgr = SessionManager()
    sessions = session_mgr.list_sessions()

    if not sessions:
        print("No sessions found.")
        return 0

    print(f"\nFound {len(sessions)} session(s):\n")
    for s in sessions:
        status_icon = {
            "running": "🔄",
            "completed": "✅",
            "error": "❌",
            "calling_llm": "📞",
        }.get(s.get("status", ""), "❓")

        print(
            f"{status_icon} {s.get('slug', 'unknown')} - {s.get('status', 'unknown')}"
        )
        print(f"   Created: {s.get('created_at', 'unknown')}")
        print(f"   Model: {s.get('model', 'unknown')}")
        if s.get("error"):
            print(f"   Error: {s['error'][:100]}...")
        print()

    return 0


def handle_list_models(args: argparse.Namespace) -> int:
    """Handle list models command"""

    # Determine base URL: --base-url flag > OPENAI_BASE_URL env var > None
    base_url = args.base_url
    if not base_url:
        base_url = config.get_base_url()
        if base_url:
            print(f"Using base URL from OPENAI_BASE_URL: {base_url}")

    LiteLLMClient(base_url=base_url)
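    # With a base URL set, ModelSelector queries the proxy's /v1/models
    # endpoint; otherwise it falls back to LiteLLM's known-model registry
    # (see the "models" subcommand description below).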
    models = ModelSelector.list_models(base_url)

    print(json.dumps(models, indent=2))
    return 0


def main() -> int:
    parser = argparse.ArgumentParser(
        description="""
Consultant CLI - LiteLLM-powered LLM consultation tool

This CLI tool lets you consult powerful LLMs for code analysis, reviews,
architectural decisions, and complex technical questions. It supports
100+ LLM providers via LiteLLM with custom base URLs.

CORE WORKFLOW:
  1. Provide a prompt describing your analysis task
  2. Attach relevant files for context
  3. The CLI sends everything to the LLM and waits for completion
  4. Results are printed with full metadata (model, tokens, cost)

OUTPUT FORMAT:
  The CLI prints structured output with clear sections:
  - RESPONSE: The LLM's analysis/response
  - METADATA: Model used, reasoning effort, token counts, costs

ENVIRONMENT VARIABLES:
  LITELLM_API_KEY      Primary API key (checked first)
  OPENAI_API_KEY       OpenAI API key (fallback)
  ANTHROPIC_API_KEY    Anthropic API key (fallback)
  OPENAI_BASE_URL      Default base URL for a custom LiteLLM proxy
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
EXAMPLES:

  Basic consultation with prompt and files:
    %(prog)s -p "Review this code for bugs" -f src/main.py -s code-review

  Multiple files:
    %(prog)s -p "Analyze architecture" -f src/api.py -f src/db.py -f src/models.py -s arch-review

  Specify model explicitly:
    %(prog)s -p "Security audit" -f auth.py -s security -m claude-3-5-sonnet-20241022

  Use custom LiteLLM proxy:
    %(prog)s -p "Code review" -f app.py -s review --base-url http://localhost:8000

  Lower reasoning effort (faster, cheaper):
    %(prog)s -p "Quick check" -f code.py -s quick --reasoning-effort low

  Check session status:
    %(prog)s session my-review

  List all sessions:
    %(prog)s list

  List available models from proxy:
    %(prog)s models --base-url http://localhost:8000

SUBCOMMANDS:
  session <slug>    Check status of a session by its slug
  list              List all sessions with their status
  models            List available models (from proxy or known models)

For more information, see the consultant plugin documentation.
""",
    )

    # Subcommands
    subparsers = parser.add_subparsers(dest="command", help="Available subcommands")
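    # The consultation flags below live on the top-level parser, so running
    # the CLI without a subcommand falls through to handle_invocation() at
    # the bottom of this function.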

    # Main invocation arguments
    parser.add_argument(
        "-p",
        "--prompt",
        metavar="TEXT",
        help="""The analysis prompt to send to the LLM. This should describe
        what you want the model to analyze or review. The prompt will
        be combined with any attached files to form the full request.
        REQUIRED for main invocation.""",
    )
    parser.add_argument(
        "-f",
        "--file",
        action="append",
        dest="files",
        metavar="PATH",
        help="""File to attach for analysis. Can be specified multiple times
        to attach multiple files. Each file's contents will be included
        in the prompt sent to the LLM. Supports any text file format.
        Example: -f src/main.py -f src/utils.py -f README.md""",
    )
    parser.add_argument(
        "-s",
        "--slug",
        metavar="NAME",
        help="""Unique identifier for this session. Used to track and retrieve
        session results. Should be descriptive (e.g., "pr-review-123",
        "security-audit", "arch-analysis"). REQUIRED for main invocation.""",
    )
    parser.add_argument(
        "-m",
        "--model",
        metavar="MODEL_ID",
        default="gpt-5-pro",
        help="""Specific LLM model to use. Default: gpt-5-pro. Examples:
        "gpt-5.1", "claude-sonnet-4-5", "gemini/gemini-2.5-flash".
        Use the "models" subcommand to see available models.""",
    )
    parser.add_argument(
        "--base-url",
        metavar="URL",
        help="""Custom base URL for a LiteLLM proxy server (e.g., "http://localhost:8000").
        When set, all API calls go through this proxy, and the proxy's /v1/models
        endpoint is queried for available models. If not set, the CLI calls
        provider APIs directly based on the model prefix.""",
    )
    parser.add_argument(
        "--api-key",
        metavar="KEY",
        help="""API key for the LLM provider. If not provided, the CLI looks
        for keys in environment variables: LITELLM_API_KEY, OPENAI_API_KEY,
        or ANTHROPIC_API_KEY (in that order).""",
    )
    parser.add_argument(
        "--reasoning-effort",
        choices=["low", "medium", "high"],
        default="high",
        metavar="LEVEL",
        help="""Reasoning effort level for the LLM. Higher effort means more
        thorough analysis but is slower and more expensive. Choices: low,
        medium, high. Default: high. Use "low" for quick checks, "high"
        for thorough reviews.""",
    )

    # Session status subcommand
    session_parser = subparsers.add_parser(
        "session",
        help="Check the status of a session",
        description="""Check the current status of a consultation session.
        Returns JSON with session metadata, status, and output if completed.""",
    )
    session_parser.add_argument(
        "slug", help="Session slug/identifier to check (the value passed to -s/--slug)"
    )

    # List sessions subcommand
    subparsers.add_parser(
        "list",
        help="List all consultation sessions",
        description="""List all consultation sessions with their status.
        Shows session slug, status, creation time, model used, and any errors.""",
    )

    # List models subcommand
    models_parser = subparsers.add_parser(
        "models",
        help="List available LLM models",
        description="""List available LLM models. If --base-url is provided, queries
        the proxy's /v1/models endpoint. Otherwise, returns known models
        from LiteLLM's model registry.""",
    )
    models_parser.add_argument(
        "--base-url",
        metavar="URL",
        help="Base URL of the LiteLLM proxy to query for available models",
    )

    args = parser.parse_args()

    # Handle commands
    if args.command == "session":
        return handle_session_status(args)

    elif args.command == "list":
        return handle_list_sessions(args)

    elif args.command == "models":
        return handle_list_models(args)

    else:
        # Main invocation
        if not args.prompt or not args.slug:
            parser.print_help()
            print("\nERROR: --prompt and --slug are required", file=sys.stderr)
            return 1

        return handle_invocation(args)


if __name__ == "__main__":
    sys.exit(main())