"""
|
|
Response strategies for different LLM providers.
|
|
Handles retries, background jobs, and provider-specific quirks.
|
|
Automatically detects responses API vs completions API support.
|
|
"""
|
|
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import litellm
|
|
from litellm import _should_retry, completion, responses
|
|
|
|
import config
|
|
|
|
|
|
def _is_responses_api_model(model_name: str) -> bool:
|
|
"""
|
|
Check if a model name indicates responses API support.
|
|
|
|
Uses general patterns that will work for future model versions:
|
|
- GPT-4+ (gpt-4, gpt-5, gpt-6, etc.)
|
|
- O-series reasoning models (o1, o2, o3, o4, etc.)
|
|
- Codex models
|
|
- Computer-use models
|
|
|
|
Args:
|
|
model_name: Model name without provider prefix (lowercase)
|
|
|
|
Returns:
|
|
True if model should use responses API
|
|
"""
|
|
import re
|
|
|
|
# GPT-4 and above (gpt-4, gpt-5, gpt-6, etc. but not gpt-3.5)
|
|
# Matches: gpt-4, gpt4, gpt-4-turbo, gpt-5.1, gpt-6-turbo, etc.
|
|
gpt_match = re.search(r"gpt-?(\d+)", model_name)
|
|
if gpt_match:
|
|
version = int(gpt_match.group(1))
|
|
if version >= 4:
|
|
return True
|
|
|
|
# O-series reasoning models (o1, o2, o3, o4, etc.)
|
|
# Matches: o1, o1-pro, o3-mini, o4-preview, etc.
|
|
if re.search(r"\bo\d+\b", model_name) or re.search(r"\bo\d+-", model_name):
|
|
return True
|
|
|
|
# Codex models (use responses API)
|
|
if "codex" in model_name:
|
|
return True
|
|
|
|
# Computer-use models
|
|
return "computer-use" in model_name


def get_responses_api_models() -> set[str]:
    """
    Determine which models support the native OpenAI Responses API.

    Uses litellm.models_by_provider to get the OpenAI and Azure models, then
    filters to those that support the responses API.

    Returns:
        Set of model identifiers that support the responses API natively.
    """
    responses_models: set[str] = set()

    # Get OpenAI and Azure models from litellm
    openai_models = litellm.models_by_provider.get("openai", [])
    azure_models = litellm.models_by_provider.get("azure", [])

    for model in openai_models + azure_models:
        if _is_responses_api_model(model.lower()):
            responses_models.add(model)
            responses_models.add(f"openai/{model}")
            responses_models.add(f"azure/{model}")

    return responses_models
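
# Sketch of the resulting set: for a hypothetical provider list containing
# "gpt-4o", the set would include "gpt-4o", "openai/gpt-4o", and "azure/gpt-4o",
# since both prefixes are added for every matching model regardless of which
# provider list it came from.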


def supports_responses_api(model: str) -> bool:
    """
    Check if a model supports the native OpenAI Responses API.

    Uses general patterns that work for current and future models:
    - GPT-4+ series (gpt-4, gpt-5, gpt-6, etc.)
    - O-series reasoning models (o1, o2, o3, etc.)
    - Codex models
    - Computer-use models

    Args:
        model: Model identifier (e.g., "openai/gpt-4", "gpt-5-mini")

    Returns:
        True if model supports responses API natively, False otherwise.
    """
    model_lower = model.lower()

    # Extract model name and provider
    if "/" in model_lower:
        provider, model_name = model_lower.split("/", 1)
    else:
        provider = "openai"  # Default provider for bare model names
        model_name = model_lower

    # Only OpenAI and Azure support the responses API natively
    if provider not in ("openai", "azure"):
        return False

    # Use the generalized pattern matching
    return _is_responses_api_model(model_name)
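
# How the provider prefix affects the result (example strings only):
#   supports_responses_api("openai/gpt-4.1")           -> True
#   supports_responses_api("azure/o1-preview")         -> True
#   supports_responses_api("gpt-5-mini")               -> True   (bare names default to "openai")
#   supports_responses_api("anthropic/claude-3-opus")  -> False  (provider is not OpenAI/Azure)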


class ResponseStrategy(ABC):
    """Base class for response strategies"""

    @abstractmethod
    def execute(
        self,
        model: str,
        prompt: str,
        session_dir: Path | None = None,
        multimodal_content: list[dict[str, Any]] | None = None,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """
        Execute LLM request with provider-specific strategy.
        Returns dict with 'content' and optional 'usage'.

        Args:
            model: Model identifier
            prompt: Text prompt
            session_dir: Optional session directory for state persistence
            multimodal_content: Optional multimodal content array for images
            **kwargs: Additional provider-specific arguments
        """
        raise NotImplementedError

    @abstractmethod
    def can_resume(self) -> bool:
        """Whether this strategy supports resuming after failure"""
        raise NotImplementedError

    def _calculate_backoff_delay(
        self, attempt: int, base_delay: int, max_delay: int
    ) -> float:
        """Calculate exponential backoff delay with jitter"""
        import random

        delay = min(base_delay * (2**attempt), max_delay)
        # Add 10% jitter to avoid thundering herd
        jitter = delay * 0.1 * random.random()
        return float(delay + jitter)
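
    # Worked example of the schedule above, assuming base_delay=2 and max_delay=60
    # (the real values come from config and may differ):
    #   attempt 0 -> min(2 * 2**0, 60) = 2s    (plus up to 10% jitter)
    #   attempt 1 -> min(2 * 2**1, 60) = 4s
    #   attempt 3 -> min(2 * 2**3, 60) = 16s
    #   attempt 6 -> min(2 * 2**6, 60) = 60s   (capped at max_delay)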

    def _extract_content(self, response: Any) -> str:
        """
        Extract text content from response.output structure.

        Handles different output item types:
        - ResponseOutputMessage (type='message'): has content with text
        - ResponseReasoningItem (type='reasoning'): has summary, no content
        """
        content = ""
        if hasattr(response, "output") and response.output:
            for item in response.output:
                # Check item type - only 'message' type has content
                item_type = getattr(item, "type", None)

                if item_type == "message":
                    # ResponseOutputMessage: extract text from content
                    if hasattr(item, "content") and item.content:
                        for content_item in item.content:
                            if hasattr(content_item, "text"):
                                content += content_item.text
                # Skip 'reasoning' items (ResponseReasoningItem) - they have summary, not content
        return content
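
    # Rough shape this method expects (simplified sketch based on the checks above,
    # not a full schema):
    #   response.output = [
    #       <reasoning item: type="reasoning", carries .summary, skipped here>,
    #       <message item:   type="message",   .content = [<part with .text>, ...]>,
    #   ]
    # The .text of every part inside every "message" item is concatenated.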

    def _serialize_usage(self, usage: Any) -> dict[str, Any] | None:
        """
        Safely convert usage object to a JSON-serializable dict.
        Handles Pydantic models (OpenAI), dataclasses, and plain dicts.
        """
        if usage is None:
            return None

        # Already a dict - return as-is
        if isinstance(usage, dict):
            return dict(usage)

        # Pydantic v2 model
        if hasattr(usage, "model_dump"):
            result: dict[str, Any] = usage.model_dump()
            return result

        # Pydantic v1 model
        if hasattr(usage, "dict"):
            result = usage.dict()
            return dict(result)

        # Dataclass or object with __dict__
        if hasattr(usage, "__dict__"):
            return dict(usage.__dict__)

        # Last resort - try to convert directly
        try:
            return dict(usage)
        except (TypeError, ValueError):
            # If all else fails, return None rather than crash
            return None
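
    # E.g. a Pydantic usage object shaped roughly like
    # Usage(prompt_tokens=12, completion_tokens=40, total_tokens=52) would come back
    # as {"prompt_tokens": 12, "completion_tokens": 40, "total_tokens": 52}
    # (illustrative field names; whatever model_dump() returns is passed through).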


class BackgroundJobStrategy(ResponseStrategy):
    """
    For OpenAI/Azure - uses background jobs with response_id polling.
    Supports resuming after network failures by persisting response_id.
    """

    def _convert_to_responses_api_format(
        self, multimodal_content: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """
        Convert multimodal content from Completions API format to Responses API format.

        Completions format: [{"type": "text/image_url", ...}]
        Responses format: [{"type": "input_text/input_image", ...}]
        """
        converted: list[dict[str, Any]] = []
        for item in multimodal_content:
            item_type = item.get("type", "")
            if item_type == "text":
                converted.append({"type": "input_text", "text": item.get("text", "")})
            elif item_type == "image_url":
                # Extract URL from nested object
                image_url = item.get("image_url", {})
                url = image_url.get("url", "") if isinstance(image_url, dict) else ""
                converted.append({"type": "input_image", "image_url": url})
        return converted
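
    # Example conversion (illustrative payload with a truncated data: URL):
    #   [{"type": "text", "text": "Describe this"},
    #    {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]
    # becomes
    #   [{"type": "input_text", "text": "Describe this"},
    #    {"type": "input_image", "image_url": "data:image/png;base64,..."}]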

    def execute(
        self,
        model: str,
        prompt: str,
        session_dir: Path | None = None,
        multimodal_content: list[dict[str, Any]] | None = None,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Execute with background job and polling"""

        response_id_file = session_dir / "response_id.txt" if session_dir else None

        # Check if we're resuming an existing background job
        if response_id_file and response_id_file.exists():
            response_id = response_id_file.read_text().strip()
            print(f"Resuming background job: {response_id}")
            return self._poll_for_completion(response_id)

        # Build input - convert multimodal to Responses API format if provided
        input_content: str | list[dict[str, Any]]
        if multimodal_content:
            input_content = self._convert_to_responses_api_format(multimodal_content)
        else:
            input_content = prompt

        # Start new background job
        try:
            response = responses(
                model=model,
                input=input_content,
                background=True,  # Returns immediately with response_id
                num_retries=config.MAX_RETRIES,  # Use LiteLLM's built-in retries
                **kwargs,
            )

            response_id = response.id

            # Persist response_id for resumability
            if response_id_file:
                response_id_file.write_text(response_id)
            print(f"Started background job: {response_id}")

            # Poll until complete
            return self._poll_for_completion(response_id)

        except Exception as e:
            # If background mode fails, maybe not supported - raise for fallback
            raise RuntimeError(f"Background job failed to start: {e}") from e

    def _poll_for_completion(self, response_id: str) -> dict[str, Any]:
        """Poll for completion with exponential backoff and retries"""

        start_time = time.time()
        attempt = 0

        while time.time() - start_time < config.POLL_TIMEOUT:
            try:
                # Retrieve the response by ID
                result = litellm.get_response(response_id=response_id)

                if hasattr(result, "status"):
                    if result.status == "completed":
                        content = self._extract_content(result)
                        if not content:
                            raise RuntimeError("No content in completed response")
                        return {
                            "content": content,
                            "usage": self._serialize_usage(
                                getattr(result, "usage", None)
                            ),
                            "response": result,  # Include full response for cost calculation
                        }
                    elif result.status == "failed":
                        error = getattr(result, "error", "Unknown error")
                        raise RuntimeError(f"Background job failed: {error}")
                    elif result.status in ["in_progress", "queued"]:
                        # Still processing, wait and retry
                        time.sleep(config.POLL_INTERVAL)
                        attempt += 1
                        continue
                    else:
                        # Unknown status, wait and retry
                        time.sleep(config.POLL_INTERVAL)
                        continue
                else:
                    # No status field - might be complete already
                    content = self._extract_content(result)
                    if content:
                        return {
                            "content": content,
                            "usage": self._serialize_usage(
                                getattr(result, "usage", None)
                            ),
                            "response": result,  # Include full response for cost calculation
                        }
                    # No content, wait and retry
                    time.sleep(config.POLL_INTERVAL)
                    continue

            except Exception as e:
                error_msg = str(e).lower()

                # Network errors - retry with backoff
                if any(x in error_msg for x in ["network", "timeout", "connection"]):
                    if attempt < config.MAX_RETRIES:
                        delay = self._calculate_backoff_delay(
                            attempt, config.INITIAL_RETRY_DELAY, config.MAX_RETRY_DELAY
                        )
                        print(
                            f"Network error polling job, retrying in {delay:.1f}s... (attempt {attempt + 1}/{config.MAX_RETRIES})"
                        )
                        time.sleep(delay)
                        attempt += 1
                        continue
                    else:
                        raise RuntimeError(
                            f"Network errors exceeded max retries: {e}"
                        ) from e

                # Other errors - raise immediately
                raise

        raise TimeoutError(
            f"Background job {response_id} did not complete within {config.POLL_TIMEOUT}s"
        )

    def can_resume(self) -> bool:
        return True
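
    # Resume flow sketch (the session path and model string are assumptions):
    #   strategy = BackgroundJobStrategy()
    #   strategy.execute("openai/gpt-4.1", prompt, session_dir=Path("session"))
    # If the process dies while polling, session/response_id.txt still holds the
    # job id, so re-running the same call resumes polling instead of submitting
    # a new background job.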


class SyncRetryStrategy(ResponseStrategy):
    """
    For OpenAI/Azure models using responses API - direct sync calls with retry logic.
    Cannot resume - must retry from scratch if it fails.
    """

    def _convert_to_responses_api_format(
        self, multimodal_content: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """
        Convert multimodal content from Completions API format to Responses API format.

        Completions format: [{"type": "text/image_url", ...}]
        Responses format: [{"type": "input_text/input_image", ...}]
        """
        converted: list[dict[str, Any]] = []
        for item in multimodal_content:
            item_type = item.get("type", "")
            if item_type == "text":
                converted.append({"type": "input_text", "text": item.get("text", "")})
            elif item_type == "image_url":
                # Extract URL from nested object
                image_url = item.get("image_url", {})
                url = image_url.get("url", "") if isinstance(image_url, dict) else ""
                converted.append({"type": "input_image", "image_url": url})
        return converted

    def execute(
        self,
        model: str,
        prompt: str,
        session_dir: Path | None = None,
        multimodal_content: list[dict[str, Any]] | None = None,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Execute with synchronous retries using responses API"""

        # Build input - convert multimodal to Responses API format if provided
        input_content: str | list[dict[str, Any]]
        if multimodal_content:
            input_content = self._convert_to_responses_api_format(multimodal_content)
        else:
            input_content = prompt

        for attempt in range(config.MAX_RETRIES):
            try:
                response = responses(
                    model=model,
                    input=input_content,
                    stream=False,
                    num_retries=config.MAX_RETRIES,  # Use LiteLLM's built-in retries
                    **kwargs,
                )

                content = self._extract_content(response)

                if not content:
                    raise RuntimeError("No content in response from LLM")

                return {
                    "content": content,
                    "usage": self._serialize_usage(getattr(response, "usage", None)),
                    "response": response,  # Include full response for cost calculation
                }

            except Exception as e:
                # Compute error_msg up front: the non-retryable check below needs it
                # even when retryability is decided from the HTTP status code.
                error_msg = str(e).lower()

                # Use LiteLLM's built-in retry logic for HTTP errors
                if _should_retry and hasattr(e, "status_code"):
                    retryable = _should_retry(e.status_code)
                else:
                    # Fallback to string matching for non-HTTP errors
                    retryable = any(
                        x in error_msg
                        for x in [
                            "network",
                            "timeout",
                            "connection",
                            "429",
                            "rate limit",
                            "503",
                            "overloaded",
                        ]
                    )
                non_retryable = any(
                    x in error_msg
                    for x in [
                        "auth",
                        "key",
                        "context",
                        "token limit",
                        "not found",
                        "invalid",
                    ]
                )

                if non_retryable:
                    raise

                if retryable and attempt < config.MAX_RETRIES - 1:
                    delay = self._calculate_backoff_delay(
                        attempt, config.INITIAL_RETRY_DELAY, config.MAX_RETRY_DELAY
                    )
                    print(
                        f"Retryable error, waiting {delay:.1f}s before retry {attempt + 2}/{config.MAX_RETRIES}..."
                    )
                    time.sleep(delay)
                    continue

                raise

        raise RuntimeError("Max retries exceeded")

    def can_resume(self) -> bool:
        return False
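
    # How the error handling above tends to classify failures (hedged examples,
    # not an exhaustive list):
    #   - HTTP 429 "rate limit" or 503 "overloaded" errors -> retried with backoff
    #   - "invalid api key" or "token limit" errors        -> raised immediately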


class CompletionsAPIStrategy(ResponseStrategy):
    """
    For Anthropic/Google/other providers - uses chat completions API directly.
    More efficient than bridging through responses API for non-OpenAI providers.
    """

    def execute(
        self,
        model: str,
        prompt: str,
        session_dir: Path | None = None,
        multimodal_content: list[dict[str, Any]] | None = None,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Execute with chat completions API"""

        # Remove responses-specific kwargs that don't apply to completions
        kwargs.pop("reasoning_effort", None)
        kwargs.pop("background", None)

        # Build message content - use multimodal content if provided, else plain prompt
        message_content: str | list[dict[str, Any]] = (
            multimodal_content if multimodal_content else prompt
        )

        for attempt in range(config.MAX_RETRIES):
            try:
                # Use chat completions API
                response = completion(
                    model=model,
                    messages=[{"role": "user", "content": message_content}],
                    stream=False,
                    num_retries=config.MAX_RETRIES,
                    **kwargs,
                )

                # Extract content from chat completion response
                content = self._extract_completion_content(response)

                if not content:
                    raise RuntimeError("No content in response from LLM")

                return {
                    "content": content,
                    "usage": self._serialize_usage(getattr(response, "usage", None)),
                    "response": response,
                }

            except Exception as e:
                # Compute error_msg up front: the non-retryable check below needs it
                # even when retryability is decided from the HTTP status code.
                error_msg = str(e).lower()

                # Use LiteLLM's built-in retry logic for HTTP errors
                if _should_retry and hasattr(e, "status_code"):
                    retryable = _should_retry(e.status_code)
                else:
                    retryable = any(
                        x in error_msg
                        for x in [
                            "network",
                            "timeout",
                            "connection",
                            "429",
                            "rate limit",
                            "503",
                            "overloaded",
                        ]
                    )
                non_retryable = any(
                    x in error_msg
                    for x in [
                        "auth",
                        "key",
                        "context",
                        "token limit",
                        "not found",
                        "invalid",
                    ]
                )

                if non_retryable:
                    raise

                if retryable and attempt < config.MAX_RETRIES - 1:
                    delay = self._calculate_backoff_delay(
                        attempt, config.INITIAL_RETRY_DELAY, config.MAX_RETRY_DELAY
                    )
                    print(
                        f"Retryable error, waiting {delay:.1f}s before retry {attempt + 2}/{config.MAX_RETRIES}..."
                    )
                    time.sleep(delay)
                    continue

                raise

        raise RuntimeError("Max retries exceeded")

    def _extract_completion_content(self, response: Any) -> str:
        """Extract text content from chat completions response"""
        if hasattr(response, "choices") and response.choices:
            choice = response.choices[0]
            if hasattr(choice, "message") and hasattr(choice.message, "content"):
                return choice.message.content or ""
        return ""

    def can_resume(self) -> bool:
        return False


class ResponseStrategyFactory:
    """Factory to select appropriate strategy based on model/provider and API support"""

    # Models/providers that support background jobs (OpenAI Responses API feature)
    BACKGROUND_SUPPORTED = {
        "openai/",
        "azure/",
    }

    @staticmethod
    def get_strategy(model: str) -> ResponseStrategy:
        """
        Select strategy based on model capabilities and API support.

        Decision tree:
        1. If model supports responses API AND background jobs -> BackgroundJobStrategy
        2. If model supports responses API (no background) -> SyncRetryStrategy
        3. If model doesn't support responses API -> CompletionsAPIStrategy

        Responses API support is detected via supports_responses_api().
        """
        # Check if model supports native responses API
        if supports_responses_api(model):
            # Check if it also supports background jobs
            if ResponseStrategyFactory.supports_background(model):
                return BackgroundJobStrategy()
            return SyncRetryStrategy()

        # For all other providers (Anthropic, Google, Bedrock, etc.)
        # Use completions API directly - more efficient than bridging
        return CompletionsAPIStrategy()

    @staticmethod
    def supports_background(model: str) -> bool:
        """Check if model supports background job execution (OpenAI/Azure only)"""
        model_lower = model.lower()
        return any(
            model_lower.startswith(prefix)
            for prefix in ResponseStrategyFactory.BACKGROUND_SUPPORTED
        )

    @staticmethod
    def get_api_type(model: str) -> str:
        """
        Determine which API type will be used for a given model.

        Returns:
            'responses' for models using OpenAI Responses API
            'completions' for models using Chat Completions API
        """
        if supports_responses_api(model):
            return "responses"
        return "completions"