---
name: llm-app-architecture
description: Automatically applies when building LLM applications. Ensures proper async patterns for LLM calls, streaming responses, token management, retry logic, and error handling.
category: ai-llm
---

# LLM Application Architecture Patterns

When building applications with LLM APIs (Claude, OpenAI, etc.), follow these patterns for reliable, efficient, and maintainable systems.

**Trigger Keywords**: LLM, AI application, model API, Claude, OpenAI, GPT, language model, LLM call, completion, chat completion, embeddings

**Agent Integration**: Used by `ml-system-architect`, `llm-app-engineer`, `agent-orchestrator-engineer`, `rag-architect`, `performance-and-cost-engineer-llm`

## ✅ Correct Pattern: Async LLM Calls

```python
import httpx
import anthropic
from typing import AsyncIterator
import asyncio


class LLMClient:
    """Async LLM client with proper error handling."""

    def __init__(self, api_key: str, timeout: int = 60):
        self.client = anthropic.AsyncAnthropic(
            api_key=api_key,
            timeout=httpx.Timeout(timeout, connect=5.0)
        )
        self.model = "claude-sonnet-4-20250514"

    async def complete(
        self,
        prompt: str,
        system: str | None = None,
        max_tokens: int = 1024,
        temperature: float = 1.0
    ) -> str:
        """
        Generate completion from LLM.

        Args:
            prompt: User message content
            system: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature (0-1)

        Returns:
            Generated text response

        Raises:
            LLMError: If API call fails
        """
        try:
            message = await self.client.messages.create(
                model=self.model,
                max_tokens=max_tokens,
                temperature=temperature,
                system=system if system else anthropic.NOT_GIVEN,
                messages=[{"role": "user", "content": prompt}]
            )
            return message.content[0].text

        except anthropic.APITimeoutError as e:
            raise LLMTimeoutError("LLM request timed out") from e
        except anthropic.APIConnectionError as e:
            raise LLMConnectionError("Failed to connect to LLM API") from e
        except anthropic.RateLimitError as e:
            raise LLMRateLimitError("Rate limit exceeded") from e
        except anthropic.APIStatusError as e:
            raise LLMError(f"LLM API error: {e.status_code}") from e


# Custom exceptions
class LLMError(Exception):
    """Base LLM error."""
    pass


class LLMTimeoutError(LLMError):
    """LLM request timeout."""
    pass


class LLMConnectionError(LLMError):
    """LLM connection error."""
    pass


class LLMRateLimitError(LLMError):
    """LLM rate limit exceeded."""
    pass
```

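A minimal usage sketch of the client above (how the API key is loaded is an assumption; any config mechanism works):

```python
import asyncio
import os


async def main() -> None:
    # Uses the LLMClient and exception classes defined above.
    client = LLMClient(api_key=os.environ["ANTHROPIC_API_KEY"])
    try:
        answer = await client.complete(
            "Summarize the benefits of async LLM clients in one sentence.",
            system="You are a concise technical writer.",
            max_tokens=256,
        )
        print(answer)
    except LLMRateLimitError:
        print("Rate limited - try again later")


if __name__ == "__main__":
    asyncio.run(main())
```
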
## Streaming Responses

```python
from typing import AsyncIterator


# Additional method on LLMClient (shown standalone for brevity)
async def stream_completion(
    self,
    prompt: str,
    system: str | None = None,
    max_tokens: int = 1024
) -> AsyncIterator[str]:
    """
    Stream completion from LLM token by token.

    Yields:
        Individual text chunks as they arrive

    Usage:
        async for chunk in client.stream_completion("Hello"):
            print(chunk, end="", flush=True)
    """
    try:
        async with self.client.messages.stream(
            model=self.model,
            max_tokens=max_tokens,
            system=system if system else anthropic.NOT_GIVEN,
            messages=[{"role": "user", "content": prompt}]
        ) as stream:
            async for text in stream.text_stream:
                yield text

    except anthropic.APIError as e:
        raise LLMError(f"Streaming error: {str(e)}") from e


# Use in FastAPI endpoint
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()


@app.post("/stream")
async def stream_endpoint(prompt: str) -> StreamingResponse:
    """Stream LLM response to client."""
    client = LLMClient(api_key=settings.anthropic_api_key)

    async def generate():
        async for chunk in client.stream_completion(prompt):
            yield chunk

    return StreamingResponse(
        generate(),
        media_type="text/plain"
    )
```

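A client-side sketch for consuming the endpoint above, assuming the service runs locally on port 8000 (the URL is hypothetical); it uses httpx streaming to print chunks as they arrive:

```python
import asyncio

import httpx


async def consume_stream() -> None:
    # Hypothetical local URL; adjust host/port for your deployment.
    async with httpx.AsyncClient(timeout=None) as http:
        async with http.stream(
            "POST",
            "http://localhost:8000/stream",
            params={"prompt": "Explain streaming responses in one paragraph."},
        ) as response:
            # Print each text chunk as soon as it is received.
            async for chunk in response.aiter_text():
                print(chunk, end="", flush=True)


asyncio.run(consume_stream())
```
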
## Token Counting and Management

```python
from anthropic import Anthropic
from typing import List, Dict


class TokenCounter:
    """Token counting utilities for LLM calls."""

    def __init__(self):
        self.client = Anthropic()

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in text using Claude's tokenizer.

        Args:
            text: Input text to count

        Returns:
            Number of tokens
        """
        # Note: recent anthropic SDK versions removed the client-level
        # count_tokens() helper; there, use the token counting endpoint
        # client.messages.count_tokens(model=..., messages=[...]) instead.
        return self.client.count_tokens(text)

    def count_message_tokens(
        self,
        messages: List[Dict[str, str]],
        system: str | None = None
    ) -> int:
        """
        Count tokens for a full message exchange.

        Args:
            messages: List of message dicts with role and content
            system: Optional system prompt

        Returns:
            Total token count including message formatting overhead
        """
        total = 0

        # System prompt
        if system:
            total += self.count_tokens(system)

        # Messages (include role tokens)
        for msg in messages:
            total += self.count_tokens(msg["content"])
            total += 4  # Rough overhead for role and formatting

        return total

    def estimate_cost(
        self,
        input_tokens: int,
        output_tokens: int,
        model: str = "claude-sonnet-4-20250514"
    ) -> float:
        """
        Estimate cost for LLM call.

        Args:
            input_tokens: Input token count
            output_tokens: Output token count
            model: Model name

        Returns:
            Estimated cost in USD
        """
        # Pricing as of 2025 (update as needed)
        pricing = {
            "claude-sonnet-4-20250514": {
                "input": 3.00 / 1_000_000,   # $3/MTok
                "output": 15.00 / 1_000_000  # $15/MTok
            },
            "claude-opus-4-20250514": {
                "input": 15.00 / 1_000_000,
                "output": 75.00 / 1_000_000
            },
            "claude-3-5-haiku-20241022": {
                "input": 0.80 / 1_000_000,
                "output": 4.00 / 1_000_000
            }
        }

        rates = pricing.get(model, pricing["claude-sonnet-4-20250514"])
        return (input_tokens * rates["input"]) + (output_tokens * rates["output"])
```

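A short usage sketch tying the two pieces together (the token counts come from whatever `count_tokens` returns on your SDK version, and the output budget is an assumed figure):

```python
counter = TokenCounter()

messages = [{"role": "user", "content": "Summarize our Q3 report in three bullets."}]
system = "You are a financial analyst assistant."

input_tokens = counter.count_message_tokens(messages, system=system)
output_tokens = 300  # Assumed budget for the reply

cost = counter.estimate_cost(input_tokens, output_tokens)
print(f"~{input_tokens} input tokens, estimated cost ${cost:.4f}")
```
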
## Retry Logic with Exponential Backoff

```python
import asyncio
from typing import TypeVar, Callable, Any
from functools import wraps
import random

T = TypeVar('T')


def retry_with_backoff(
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    exponential_base: float = 2.0,
    jitter: bool = True
):
    """
    Retry decorator with exponential backoff for LLM calls.

    Args:
        max_retries: Maximum number of retry attempts
        base_delay: Initial delay in seconds
        max_delay: Maximum delay in seconds
        exponential_base: Base for exponential calculation
        jitter: Add random jitter to prevent thundering herd
    """
    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(func)
        async def wrapper(*args, **kwargs) -> Any:
            last_exception = None

            for attempt in range(max_retries + 1):
                try:
                    return await func(*args, **kwargs)

                except (LLMTimeoutError, LLMConnectionError, LLMRateLimitError) as e:
                    last_exception = e

                    if attempt == max_retries:
                        raise

                    # Calculate delay with exponential backoff
                    delay = min(
                        base_delay * (exponential_base ** attempt),
                        max_delay
                    )

                    # Add jitter
                    if jitter:
                        delay *= (0.5 + random.random() * 0.5)

                    await asyncio.sleep(delay)

            raise last_exception

        return wrapper
    return decorator


class RobustLLMClient(LLMClient):
    """LLM client with automatic retries."""

    @retry_with_backoff(max_retries=3, base_delay=1.0)
    async def complete(self, prompt: str, **kwargs) -> str:
        """Complete with automatic retries."""
        return await super().complete(prompt, **kwargs)
```

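To make the schedule concrete, these are the delays the decorator computes with its defaults, before jitter scales each value by a random factor between 0.5 and 1.0:

```python
base_delay, exponential_base, max_delay = 1.0, 2.0, 60.0

# Delays slept after failed attempts 0-2; the fourth attempt re-raises instead.
for attempt in range(3):
    delay = min(base_delay * (exponential_base ** attempt), max_delay)
    print(f"after attempt {attempt}: sleep up to {delay:.0f}s")
# after attempt 0: sleep up to 1s
# after attempt 1: sleep up to 2s
# after attempt 2: sleep up to 4s
```
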
## Caching and Prompt Optimization

```python
import hashlib


class CachedLLMClient(LLMClient):
    """LLM client with response caching."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._cache: dict[str, str] = {}

    def _cache_key(self, prompt: str, system: str | None = None) -> str:
        """Generate cache key from prompt and system."""
        content = f"{system or ''}||{prompt}"
        return hashlib.sha256(content.encode()).hexdigest()

    async def complete(
        self,
        prompt: str,
        system: str | None = None,
        use_cache: bool = True,
        **kwargs
    ) -> str:
        """Complete with caching support."""
        if use_cache:
            cache_key = self._cache_key(prompt, system)
            if cache_key in self._cache:
                return self._cache[cache_key]

        response = await super().complete(prompt, system=system, **kwargs)

        if use_cache:
            self._cache[cache_key] = response

        return response

    def clear_cache(self):
        """Clear response cache."""
        self._cache.clear()


# Use Claude prompt caching for repeated system prompts
# (additional method on LLMClient, shown standalone for brevity)
async def complete_with_prompt_caching(
    self,
    prompt: str,
    system: str,
    max_tokens: int = 1024
) -> str:
    """
    Use Claude's prompt caching for repeated system prompts.

    Caches the system prompt on Anthropic's servers for about 5 minutes,
    reducing cost for repeated calls with the same system prompt.
    """
    message = await self.client.messages.create(
        model=self.model,
        max_tokens=max_tokens,
        system=[
            {
                "type": "text",
                "text": system,
                "cache_control": {"type": "ephemeral"}  # Cache this block
            }
        ],
        messages=[{"role": "user", "content": prompt}]
    )
    return message.content[0].text
```

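A quick usage sketch of the in-memory cache above; the second identical call returns from the local dict without another API request (API key loading is an assumption, as before):

```python
import asyncio
import os


async def demo_cache() -> None:
    client = CachedLLMClient(api_key=os.environ["ANTHROPIC_API_KEY"])

    first = await client.complete("Define RAG in one sentence.")
    second = await client.complete("Define RAG in one sentence.")  # Served from cache

    assert first == second


asyncio.run(demo_cache())
```
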
## Batch Processing

```python
from typing import List
import asyncio


# Additional method on LLMClient (shown standalone for brevity)
async def batch_complete(
    self,
    prompts: List[str],
    system: str | None = None,
    max_concurrent: int = 5
) -> List[str]:
    """
    Process multiple prompts concurrently with a concurrency limit.

    Args:
        prompts: List of prompts to process
        system: Optional system prompt
        max_concurrent: Maximum concurrent requests

    Returns:
        List of responses in the same order as prompts
    """
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_one(prompt: str) -> str:
        async with semaphore:
            return await self.complete(prompt, system=system)

    tasks = [process_one(p) for p in prompts]
    return await asyncio.gather(*tasks)


# Example usage
async def process_documents():
    """Process multiple documents with LLM."""
    client = LLMClient(api_key=settings.anthropic_api_key)
    documents = ["doc1 text", "doc2 text", "doc3 text"]

    # Process with at most 5 concurrent requests
    results = await client.batch_complete(
        prompts=documents,
        system="Summarize this document in 2 sentences.",
        max_concurrent=5
    )

    return results
```

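If one prompt failing should not fail the whole batch, a variant (a sketch, not part of the client above) can pass `return_exceptions=True` to `asyncio.gather` and let the caller inspect per-item results:

```python
from typing import List, Union
import asyncio


async def batch_complete_tolerant(
    client: LLMClient,
    prompts: List[str],
    max_concurrent: int = 5,
) -> List[Union[str, BaseException]]:
    """Like batch_complete, but failed prompts yield their exception instead."""
    semaphore = asyncio.Semaphore(max_concurrent)

    async def process_one(prompt: str) -> str:
        async with semaphore:
            return await client.complete(prompt)

    # return_exceptions=True preserves result order and returns raised errors
    # as values rather than cancelling the remaining tasks.
    return await asyncio.gather(
        *(process_one(p) for p in prompts),
        return_exceptions=True,
    )
```
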
## Observability and Logging

```python
import logging
import uuid
from datetime import datetime

logger = logging.getLogger(__name__)


class ObservableLLMClient(LLMClient):
    """LLM client with comprehensive logging."""

    async def complete(self, prompt: str, **kwargs) -> str:
        """Complete with observability."""
        request_id = str(uuid.uuid4())
        start_time = datetime.utcnow()

        # Log request (redact if needed)
        logger.info(
            "LLM request started",
            extra={
                "request_id": request_id,
                "model": self.model,
                "prompt_length": len(prompt),
                "max_tokens": kwargs.get("max_tokens", 1024),
                "temperature": kwargs.get("temperature", 1.0)
            }
        )

        try:
            response = await super().complete(prompt, **kwargs)

            # Count tokens
            counter = TokenCounter()
            input_tokens = counter.count_tokens(prompt)
            output_tokens = counter.count_tokens(response)
            cost = counter.estimate_cost(input_tokens, output_tokens, self.model)

            # Log success
            duration = (datetime.utcnow() - start_time).total_seconds()
            logger.info(
                "LLM request completed",
                extra={
                    "request_id": request_id,
                    "duration_seconds": duration,
                    "input_tokens": input_tokens,
                    "output_tokens": output_tokens,
                    "total_tokens": input_tokens + output_tokens,
                    "estimated_cost_usd": cost,
                    "response_length": len(response)
                }
            )

            return response

        except Exception as e:
            # Log error
            duration = (datetime.utcnow() - start_time).total_seconds()
            logger.error(
                "LLM request failed",
                extra={
                    "request_id": request_id,
                    "duration_seconds": duration,
                    "error_type": type(e).__name__,
                    "error_message": str(e)
                },
                exc_info=True
            )
            raise
```

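The `extra` fields above only reach your logs if the formatter emits them; a minimal stdlib-only sketch of a JSON formatter that includes them (the field list mirrors the names used above):

```python
import json
import logging


class JSONLogFormatter(logging.Formatter):
    """Render log records, including selected extra fields, as JSON lines."""

    EXTRA_FIELDS = (
        "request_id", "model", "prompt_length", "max_tokens", "temperature",
        "duration_seconds", "input_tokens", "output_tokens", "total_tokens",
        "estimated_cost_usd", "response_length", "error_type", "error_message",
    )

    def format(self, record: logging.LogRecord) -> str:
        payload = {"level": record.levelname, "message": record.getMessage()}
        # Fields passed via logger.info(..., extra={...}) become record attributes.
        for field in self.EXTRA_FIELDS:
            if hasattr(record, field):
                payload[field] = getattr(record, field)
        return json.dumps(payload)


handler = logging.StreamHandler()
handler.setFormatter(JSONLogFormatter())
logging.getLogger().addHandler(handler)
logging.getLogger().setLevel(logging.INFO)
```
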
## ❌ Anti-Patterns

```python
# ❌ Synchronous API calls in async code
def complete(prompt: str) -> str:  # Should be async!
    response = anthropic.Anthropic().messages.create(...)
    return response.content[0].text

# ✅ Better: Use async client
async def complete(prompt: str) -> str:
    async_client = anthropic.AsyncAnthropic()
    response = await async_client.messages.create(...)
    return response.content[0].text


# ❌ No timeout
client = anthropic.AsyncAnthropic()  # No timeout!

# ✅ Better: Set reasonable timeout
client = anthropic.AsyncAnthropic(
    timeout=httpx.Timeout(60.0, connect=5.0)
)


# ❌ No error handling
async def complete(prompt: str) -> str:
    response = await client.messages.create(...)  # Can fail!
    return response.content[0].text

# ✅ Better: Handle specific errors
async def complete(prompt: str) -> str:
    try:
        response = await client.messages.create(...)
        return response.content[0].text
    except anthropic.RateLimitError:
        # Handle rate limit
        raise LLMRateLimitError("Rate limit exceeded")
    except anthropic.APITimeoutError:
        # Handle timeout
        raise LLMTimeoutError("Request timed out")


# ❌ No token tracking
async def complete(prompt: str) -> str:
    return await client.messages.create(...)  # No idea of cost!

# ✅ Better: Track tokens and cost
async def complete(prompt: str) -> str:
    response = await client.messages.create(...)
    logger.info(
        "LLM call",
        extra={
            "input_tokens": response.usage.input_tokens,
            "output_tokens": response.usage.output_tokens,
            "cost": estimate_cost(response.usage)
        }
    )
    return response.content[0].text


# ❌ Sequential processing
async def process_many(prompts: List[str]) -> List[str]:
    results = []
    for prompt in prompts:  # Sequential!
        result = await complete(prompt)
        results.append(result)
    return results

# ✅ Better: Concurrent processing with limits
async def process_many(prompts: List[str]) -> List[str]:
    semaphore = asyncio.Semaphore(5)  # Max 5 concurrent

    async def process_one(prompt):
        async with semaphore:
            return await complete(prompt)

    return await asyncio.gather(*[process_one(p) for p in prompts])
```

## Best Practices Checklist

- ✅ Use async/await for all LLM API calls
- ✅ Set reasonable timeouts (30-60 seconds)
- ✅ Implement retry logic with exponential backoff
- ✅ Handle specific API exceptions (rate limit, timeout, connection)
- ✅ Track token usage and estimated cost
- ✅ Log all LLM calls with request IDs
- ✅ Use streaming for long responses
- ✅ Implement prompt caching for repeated system prompts
- ✅ Process multiple requests concurrently with semaphores
- ✅ Redact sensitive data in logs
- ✅ Set max_tokens to prevent runaway costs
- ✅ Use appropriate temperature for task (0 for deterministic, 1 for creative)

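Several of these defaults (timeout, max_tokens, temperature) and the `settings.anthropic_api_key` referenced in earlier examples are easiest to manage in one settings object; a minimal sketch, assuming `pydantic-settings` is available (the field and class names here are illustrative):

```python
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """App configuration loaded from environment variables (e.g. ANTHROPIC_API_KEY)."""

    anthropic_api_key: str
    llm_timeout_seconds: int = 60
    llm_max_tokens: int = 1024
    llm_temperature: float = 0.0  # Deterministic by default; raise for creative tasks


settings = Settings()

client = RobustLLMClient(
    api_key=settings.anthropic_api_key,
    timeout=settings.llm_timeout_seconds,
)
```
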
## Auto-Apply

When making LLM API calls:

1. Use async client (AsyncAnthropic, AsyncOpenAI)
2. Add timeout configuration
3. Implement retry logic for transient errors
4. Track tokens and cost
5. Log requests with structured logging
6. Use streaming for real-time responses
7. Handle rate limits gracefully

## Related Skills

- `async-await-checker` - For async/await patterns
- `structured-errors` - For error handling
- `observability-logging` - For logging and tracing
- `pydantic-models` - For request/response validation
- `fastapi-patterns` - For building LLM API endpoints