#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "litellm",
#     "requests>=2.31.0",
#     "tenacity",
#     "markitdown>=0.1.0",
# ]
# ///
"""
Consultant CLI - LiteLLM-powered LLM consultation tool
Supports async invocation, custom base URLs, and flexible model selection

Run with: uv run consultant_cli.py [args]
This automatically installs/updates the declared dependencies
(litellm, requests, tenacity, markitdown) on first run.
"""

import argparse
import json
import sys
from pathlib import Path

# Add scripts directory to path
SCRIPTS_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPTS_DIR))

import config
from file_handler import (
    FileHandler,
    build_multimodal_content,
    build_prompt_with_references,
    has_images,
    validate_vision_support,
)
from litellm_client import LiteLLMClient
from model_selector import ModelSelector
from session_manager import SessionManager


def validate_context_size(
    full_prompt: str, model: str, client: LiteLLMClient, num_files: int
) -> bool:
    """
    Validate that the full prompt fits in the model's context window.
    Returns True if OK; raises ValueError if the input exceeds the limit.
    """
    # Count tokens for the complete prompt
    total_tokens = client.count_tokens(full_prompt, model)

    # Get limit
    max_tokens = client.get_max_tokens(model)

    # Reserve for response
    available_tokens = int(max_tokens * (1 - config.CONTEXT_RESERVE_RATIO))

    # Print summary
    print("\nšŸ“Š Token Usage:")
    print(f"- Input: {total_tokens:,} tokens ({num_files} files)")
    print(f"- Limit: {max_tokens:,} tokens")
    print(
        f"- Available: {available_tokens:,} tokens ({int((available_tokens/max_tokens)*100)}%)\n"
    )

    if total_tokens > max_tokens:
        raise ValueError(
            f"Input exceeds context limit!\n"
            f"  Input: {total_tokens:,} tokens\n"
            f"  Limit: {max_tokens:,} tokens\n"
            f"  Overage: {total_tokens - max_tokens:,} tokens\n\n"
            f"Suggestions:\n"
            f"1. Reduce number of files (currently {num_files})\n"
            f"2. Use a model with larger context\n"
            f"3. Shorten the prompt"
        )

    if total_tokens > available_tokens:
        print(f"āš ļø  WARNING: Using {int((total_tokens/max_tokens)*100)}% of context")
        print("   Consider reducing input size for better response quality\n")

    return True
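

# Worked example of the budget check above (illustrative only; the real
# CONTEXT_RESERVE_RATIO lives in config.py, 0.2 is assumed here):
#
#   max_tokens       = 128_000
#   available_tokens = int(128_000 * (1 - 0.2))  # = 102_400
#
#   - a 100_000-token prompt passes both checks silently
#   - a 110_000-token prompt fits the hard limit but triggers the usage warning
#   - a 130_000-token prompt exceeds max_tokens and raises ValueError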


def handle_invocation(args: argparse.Namespace) -> int:
    """Handle main invocation command"""
    # Determine base URL: --base-url flag > OPENAI_BASE_URL env var > None
    base_url = args.base_url
    if not base_url:
        base_url = config.get_base_url()
        if base_url:
            print(f"Using base URL from OPENAI_BASE_URL: {base_url}")

    # Initialize components
    session_mgr = SessionManager()
    client = LiteLLMClient(base_url=base_url, api_key=args.api_key)

    # Process files using FileHandler
    file_handler = FileHandler()
    processed_files = []
    multimodal_content = None

    if args.files:
        processed_files, file_errors = file_handler.process_files(args.files)

        # If any files failed, report errors and exit
        if file_errors:
            print("\nERROR: Some files could not be processed:", file=sys.stderr)
            for err in file_errors:
                print(f"  - {err.path}: {err.reason}", file=sys.stderr)
            print(
                "\nPlease fix or remove the problematic files and try again.",
                file=sys.stderr,
            )
            return 1

        # Validate vision support if images present
        if has_images(processed_files):
            validate_vision_support(args.model, has_images=True)

        # Print file processing summary
        text_count = sum(1 for f in processed_files if f.category.value == "text")
        office_count = sum(1 for f in processed_files if f.category.value == "office")
        image_count = sum(1 for f in processed_files if f.category.value == "image")

        print("\nFile Processing Summary:")
        print(f"  - Text files: {text_count}")
        print(f"  - Office documents (converted): {office_count}")
        print(f"  - Images: {image_count}")

    # Log model being used
    print(f"Using model: {args.model}")

    # Validate environment variables (only if no custom base URL)
    if not base_url:
        env_status = client.validate_environment(args.model)
        if not env_status.get("keys_in_environment", False):
            missing = env_status.get("missing_keys", [])
            error = env_status.get("error", "")
            print(
                f"\nāŒ ERROR: Missing required environment variables for model '{args.model}'",
                file=sys.stderr,
            )
            print(f"\nMissing keys: {', '.join(missing)}", file=sys.stderr)
            if error:
                print(f"\nDetails: {error}", file=sys.stderr)
            print("\nšŸ’” To fix this:", file=sys.stderr)
            print("   1. Set the required environment variable(s):", file=sys.stderr)
            for key in missing:
                print(f"      export {key}=your-api-key", file=sys.stderr)
            print(
                "   2. Or use --base-url to specify a custom LiteLLM endpoint",
                file=sys.stderr,
            )
            print(
                "   3. Or use --model to specify a different model\n",
                file=sys.stderr,
            )
            return 1

    # Build full prompt with reference files section
    full_prompt = build_prompt_with_references(args.prompt, processed_files)

    # Build multimodal content if we have images
    if has_images(processed_files):
        multimodal_content = build_multimodal_content(full_prompt, processed_files)

    # Check context limits on the full prompt
    try:
        validate_context_size(full_prompt, args.model, client, len(processed_files))
    except ValueError as e:
        print(f"ERROR: {e}", file=sys.stderr)
        return 1

    # Create and start session
    session_id = session_mgr.create_session(
        slug=args.slug,
        prompt=full_prompt,
        model=args.model,
        base_url=base_url,
        api_key=args.api_key,
        reasoning_effort=args.reasoning_effort,
        multimodal_content=multimodal_content,
    )

    print(f"Session created: {session_id}")
    print(f"Reattach via: python3 {__file__} session {args.slug}")
    print("Waiting for completion...")

    try:
        result = session_mgr.wait_for_completion(session_id)

        if result.get("status") == "completed":
            print("\n" + "=" * 80)
            print("RESPONSE:")
            print("=" * 80)
            print(result.get("output", "No output available"))
            print("=" * 80)

            # Print metadata section (model, reasoning effort, tokens, cost)
            print("\n" + "=" * 80)
            print("METADATA:")
            print("=" * 80)

            # Model info
            print(f"model: {result.get('model', args.model)}")
            print(
                f"reasoning_effort: {result.get('reasoning_effort', args.reasoning_effort)}"
            )

            # Token usage and cost
            usage = result.get("usage")
            cost_info = result.get("cost_info")

            if cost_info:
                print(f"input_tokens: {cost_info.get('input_tokens', 0)}")
                print(f"output_tokens: {cost_info.get('output_tokens', 0)}")
                print(
                    f"total_tokens: {cost_info.get('input_tokens', 0) + cost_info.get('output_tokens', 0)}"
                )
                print(f"input_cost_usd: {cost_info.get('input_cost', 0):.6f}")
                print(f"output_cost_usd: {cost_info.get('output_cost', 0):.6f}")
                print(f"total_cost_usd: {cost_info.get('total_cost', 0):.6f}")
            elif usage:
                input_tokens = usage.get("prompt_tokens") or usage.get(
                    "input_tokens", 0
                )
                output_tokens = usage.get("completion_tokens") or usage.get(
                    "output_tokens", 0
                )
                print(f"input_tokens: {input_tokens}")
                print(f"output_tokens: {output_tokens}")
                print(f"total_tokens: {input_tokens + output_tokens}")

            print("=" * 80)
            return 0
        else:
            print(f"\nSession ended with status: {result.get('status')}")
            if "error" in result:
                print(f"Error: {result['error']}")
            return 1

    except TimeoutError as e:
        print(f"\nERROR: {e}", file=sys.stderr)
        return 1


def handle_session_status(args: argparse.Namespace) -> int:
    """Handle session status check"""
    session_mgr = SessionManager()
    status = session_mgr.get_session_status(args.slug)

    if "error" in status and "No session found" in status["error"]:
        print(f"ERROR: {status['error']}", file=sys.stderr)
        return 1

    # Pretty print status
    print(json.dumps(status, indent=2))
    return 0


def handle_list_sessions(args: argparse.Namespace) -> int:
    """Handle list sessions command"""
    session_mgr = SessionManager()
    sessions = session_mgr.list_sessions()

    if not sessions:
        print("No sessions found.")
        return 0

    print(f"\nFound {len(sessions)} session(s):\n")
    for s in sessions:
        status_icon = {
            "running": "šŸ”„",
            "completed": "āœ…",
            "error": "āŒ",
            "calling_llm": "šŸ“ž",
        }.get(s.get("status", ""), "ā“")

        print(
            f"{status_icon} {s.get('slug', 'unknown')} - {s.get('status', 'unknown')}"
        )
        print(f"   Created: {s.get('created_at', 'unknown')}")
        print(f"   Model: {s.get('model', 'unknown')}")
        if s.get("error"):
            print(f"   Error: {s['error'][:100]}...")
        print()

    return 0


def handle_list_models(args: argparse.Namespace) -> int:
    """Handle list models command"""
    # Determine base URL: --base-url flag > OPENAI_BASE_URL env var > None
    base_url = args.base_url
    if not base_url:
        base_url = config.get_base_url()
        if base_url:
            print(f"Using base URL from OPENAI_BASE_URL: {base_url}")

    LiteLLMClient(base_url=base_url)
    models = ModelSelector.list_models(base_url)
    print(json.dumps(models, indent=2))
    return 0


def main() -> int:
    parser = argparse.ArgumentParser(
        description="""
Consultant CLI - LiteLLM-powered LLM consultation tool

This CLI tool allows you to consult powerful LLM models for code analysis,
reviews, architectural decisions, and complex technical questions. It supports
100+ LLM providers via LiteLLM with custom base URLs.

CORE WORKFLOW:
  1. Provide a prompt describing your analysis task
  2. Attach relevant files for context
  3. The CLI sends everything to the LLM and waits for completion
  4. Results are printed with full metadata (model, tokens, cost)

OUTPUT FORMAT:
  The CLI prints structured output with clear sections:
  - RESPONSE: The LLM's analysis/response
  - METADATA: Model used, reasoning effort, token counts, costs

ENVIRONMENT VARIABLES:
  LITELLM_API_KEY      Primary API key (checked first)
  OPENAI_API_KEY       OpenAI API key (fallback)
  ANTHROPIC_API_KEY    Anthropic API key (fallback)
  OPENAI_BASE_URL      Default base URL for custom LiteLLM proxy
        """,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
EXAMPLES:
  Basic consultation with prompt and files:
    %(prog)s -p "Review this code for bugs" -f src/main.py -s code-review

  Multiple files:
    %(prog)s -p "Analyze architecture" -f src/api.py -f src/db.py -f src/models.py -s arch-review

  Specify model explicitly:
    %(prog)s -p "Security audit" -f auth.py -s security -m claude-3-5-sonnet-20241022

  Use custom LiteLLM proxy:
    %(prog)s -p "Code review" -f app.py -s review --base-url http://localhost:8000

  Lower reasoning effort (faster, cheaper):
    %(prog)s -p "Quick check" -f code.py -s quick --reasoning-effort low

  Check session status:
    %(prog)s session my-review

  List all sessions:
    %(prog)s list

  List available models from proxy:
    %(prog)s models --base-url http://localhost:8000

SUBCOMMANDS:
  session    Check status of a session by its slug
  list       List all sessions with their status
  models     List available models (from proxy or known models)

For more information, see the consultant plugin documentation.
        """,
    )

    # Subcommands
    subparsers = parser.add_subparsers(dest="command", help="Available subcommands")

    # Main invocation arguments
    parser.add_argument(
        "-p",
        "--prompt",
        metavar="TEXT",
        help="""The analysis prompt to send to the LLM. This should describe what you
want the model to analyze or review. The prompt will be combined with any
attached files to form the full request. REQUIRED for main invocation.""",
    )
    parser.add_argument(
        "-f",
        "--file",
        action="append",
        dest="files",
        metavar="PATH",
        help="""File to attach for analysis. Can be specified multiple times to attach
multiple files. Each file's contents will be included in the prompt sent to
the LLM. Supports any text file format.
Example: -f src/main.py -f src/utils.py -f README.md""",
    )
    parser.add_argument(
        "-s",
        "--slug",
        metavar="NAME",
        help="""Unique identifier for this session. Used to track and retrieve session
results. Should be descriptive (e.g., "pr-review-123", "security-audit",
"arch-analysis"). REQUIRED for main invocation.""",
    )
    parser.add_argument(
        "-m",
        "--model",
        metavar="MODEL_ID",
        default="gpt-5-pro",
        help="""Specific LLM model to use. Default: gpt-5-pro.
Examples: "gpt-5.1", "claude-sonnet-4-5", "gemini/gemini-2.5-flash".
Use the "models" subcommand to see available models.""",
    )
Examples: "gpt-5.1", "claude-sonnet-4-5", "gemini/gemini-2.5-flash". Use the "models" subcommand to see available models.""", ) parser.add_argument( "--base-url", metavar="URL", help="""Custom base URL for LiteLLM proxy server (e.g., "http://localhost:8000"). When set, all API calls go through this proxy. The proxy's /v1/models endpoint will be queried for available models. If not set, uses direct provider APIs based on the model prefix.""", ) parser.add_argument( "--api-key", metavar="KEY", help="""API key for the LLM provider. If not provided, the CLI will look for keys in environment variables: LITELLM_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY (in that order).""", ) parser.add_argument( "--reasoning-effort", choices=["low", "medium", "high"], default="high", metavar="LEVEL", help="""Reasoning effort level for the LLM. Higher effort = more thorough analysis but slower and more expensive. Choices: low, medium, high. Default: high. Use "low" for quick checks, "high" for thorough reviews.""", ) # Session status subcommand session_parser = subparsers.add_parser( "session", help="Check the status of a session", description="""Check the current status of a consultation session. Returns JSON with session metadata, status, and output if completed.""", ) session_parser.add_argument( "slug", help="Session slug/identifier to check (the value passed to -s/--slug)" ) # List sessions subcommand subparsers.add_parser( "list", help="List all consultation sessions", description="""List all consultation sessions with their status. Shows session slug, status, creation time, model used, and any errors.""", ) # List models subcommand models_parser = subparsers.add_parser( "models", help="List available LLM models", description="""List available LLM models. If --base-url is provided, queries the proxy's /v1/models endpoint. Otherwise, returns known models from LiteLLM's model registry.""", ) models_parser.add_argument( "--base-url", metavar="URL", help="Base URL of LiteLLM proxy to query for available models", ) args = parser.parse_args() # Handle commands if args.command == "session": return handle_session_status(args) elif args.command == "list": return handle_list_sessions(args) elif args.command == "models": return handle_list_models(args) else: # Main invocation if not args.prompt or not args.slug: parser.print_help() print("\nERROR: --prompt and --slug are required", file=sys.stderr) return 1 return handle_invocation(args) if __name__ == "__main__": sys.exit(main())