#!/usr/bin/env python3
"""
Prompt Optimization Script

Automatically test and optimize prompts using A/B testing and metrics tracking.
"""

import json
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import numpy as np


@dataclass
class TestCase:
    input: Dict[str, Any]
    expected_output: str
    metadata: Optional[Dict[str, Any]] = None


class PromptOptimizer:
    def __init__(self, llm_client, test_suite: List[TestCase]):
        self.client = llm_client
        self.test_suite = test_suite
        self.results_history = []

    def evaluate_prompt(self, prompt_template: str,
                        test_cases: Optional[List[TestCase]] = None) -> Dict[str, float]:
        """Evaluate a prompt template against test cases."""
        if test_cases is None:
            test_cases = self.test_suite

        metrics = {
            'accuracy': [],
            'latency': [],
            'token_count': [],
            'success_rate': []
        }

        for test_case in test_cases:
            start_time = time.time()

            # Render prompt with test case inputs
            prompt = prompt_template.format(**test_case.input)

            # Get LLM response
            response = self.client.complete(prompt)

            # Measure latency
            latency = time.time() - start_time

            # Calculate metrics
            metrics['latency'].append(latency)
            metrics['token_count'].append(len(prompt.split()) + len(response.split()))
            metrics['success_rate'].append(1 if response else 0)

            # Check accuracy
            accuracy = self.calculate_accuracy(response, test_case.expected_output)
            metrics['accuracy'].append(accuracy)

        # Aggregate metrics (cast numpy scalars to plain floats so results stay JSON-serializable)
        return {
            'avg_accuracy': float(np.mean(metrics['accuracy'])),
            'avg_latency': float(np.mean(metrics['latency'])),
            'p95_latency': float(np.percentile(metrics['latency'], 95)),
            'avg_tokens': float(np.mean(metrics['token_count'])),
            'success_rate': float(np.mean(metrics['success_rate']))
        }

    def calculate_accuracy(self, response: str, expected: str) -> float:
        """Calculate accuracy score between response and expected output."""
        # Simple exact match
        if response.strip().lower() == expected.strip().lower():
            return 1.0

        # Partial match using word overlap
        response_words = set(response.lower().split())
        expected_words = set(expected.lower().split())

        if not expected_words:
            return 0.0

        overlap = len(response_words & expected_words)
        return overlap / len(expected_words)

    def optimize(self, base_prompt: str, max_iterations: int = 5) -> Dict[str, Any]:
        """Iteratively optimize a prompt."""
        current_prompt = base_prompt
        best_prompt = base_prompt
        best_score = 0.0

        for iteration in range(max_iterations):
            print(f"\nIteration {iteration + 1}/{max_iterations}")

            # Evaluate current prompt
            metrics = self.evaluate_prompt(current_prompt)
            print(f"Accuracy: {metrics['avg_accuracy']:.2f}, Latency: {metrics['avg_latency']:.2f}s")

            # Track results
            self.results_history.append({
                'iteration': iteration,
                'prompt': current_prompt,
                'metrics': metrics
            })

            # Update best if improved
            if metrics['avg_accuracy'] > best_score:
                best_score = metrics['avg_accuracy']
                best_prompt = current_prompt

            # Stop if good enough
            if metrics['avg_accuracy'] > 0.95:
                print("Achieved target accuracy!")
                break

            # Generate variations for next iteration
            variations = self.generate_variations(current_prompt, metrics)

            # Test variations and pick best
            best_variation = current_prompt
            best_variation_score = metrics['avg_accuracy']

            for variation in variations:
                var_metrics = self.evaluate_prompt(variation)
                if var_metrics['avg_accuracy'] > best_variation_score:
                    best_variation_score = var_metrics['avg_accuracy']
                    best_variation = variation

            current_prompt = best_variation

        return {
            'best_prompt': best_prompt,
            'best_score': best_score,
            'history': self.results_history
        }

    def generate_variations(self, prompt: str, current_metrics: Dict) -> List[str]:
        """Generate prompt variations to test."""
        variations = []

        # Variation 1: Add explicit format instruction
        variations.append(prompt + "\n\nProvide your answer in a clear, concise format.")

        # Variation 2: Add step-by-step instruction
        variations.append("Let's solve this step by step.\n\n" + prompt)

        # Variation 3: Add verification step
        variations.append(prompt + "\n\nVerify your answer before responding.")

        # Variation 4: Make more concise
        concise = self.make_concise(prompt)
        if concise != prompt:
            variations.append(concise)

        # Variation 5: Add examples (if none present)
        if "example" not in prompt.lower():
            variations.append(self.add_examples(prompt))

        return variations[:3]  # Limit to the first three variations

    def make_concise(self, prompt: str) -> str:
        """Remove redundant words to make prompt more concise."""
        replacements = [
            ("in order to", "to"),
            ("due to the fact that", "because"),
            ("at this point in time", "now"),
            ("in the event that", "if"),
        ]
        result = prompt
        for old, new in replacements:
            result = result.replace(old, new)
        return result

    def add_examples(self, prompt: str) -> str:
        """Add example section to prompt."""
        return f"""{prompt}

Example:
Input: Sample input
Output: Sample output
"""

    def compare_prompts(self, prompt_a: str, prompt_b: str) -> Dict[str, Any]:
        """A/B test two prompts."""
        print("Testing Prompt A...")
        metrics_a = self.evaluate_prompt(prompt_a)

        print("Testing Prompt B...")
        metrics_b = self.evaluate_prompt(prompt_b)

        return {
            'prompt_a_metrics': metrics_a,
            'prompt_b_metrics': metrics_b,
            'winner': 'A' if metrics_a['avg_accuracy'] > metrics_b['avg_accuracy'] else 'B',
            'improvement': abs(metrics_a['avg_accuracy'] - metrics_b['avg_accuracy'])
        }

    def export_results(self, filename: str):
        """Export optimization results to JSON."""
        with open(filename, 'w') as f:
            json.dump(self.results_history, f, indent=2)


def main():
    # Example usage
    test_suite = [
        TestCase(
            input={'text': 'This movie was amazing!'},
            expected_output='Positive'
        ),
        TestCase(
            input={'text': 'Worst purchase ever.'},
            expected_output='Negative'
        ),
        TestCase(
            input={'text': 'It was okay, nothing special.'},
            expected_output='Neutral'
        )
    ]

    # Mock LLM client for demonstration
    class MockLLMClient:
        def complete(self, prompt):
            # Simulate an LLM response based on keywords in the prompt
            if 'amazing' in prompt.lower():
                return 'Positive'
            elif 'worst' in prompt.lower():
                return 'Negative'
            else:
                return 'Neutral'

    optimizer = PromptOptimizer(MockLLMClient(), test_suite)

    base_prompt = "Classify the sentiment of: {text}\nSentiment:"

    results = optimizer.optimize(base_prompt)

    print("\n" + "=" * 50)
    print("Optimization Complete!")
    print(f"Best Accuracy: {results['best_score']:.2f}")
    print(f"Best Prompt:\n{results['best_prompt']}")

    optimizer.export_results('optimization_results.json')


if __name__ == '__main__':
    main()
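

# A/B comparison sketch (not exercised by main() above): PromptOptimizer.compare_prompts()
# evaluates two templates against the same test suite and reports which one wins on
# average accuracy. The second template below is an illustrative alternative, not part
# of the original script; it assumes a client and test_suite like those built in main().
#
#   optimizer = PromptOptimizer(MockLLMClient(), test_suite)
#   comparison = optimizer.compare_prompts(
#       "Classify the sentiment of: {text}\nSentiment:",
#       "You are a sentiment classifier. Respond with Positive, Negative, or Neutral.\n"
#       "Text: {text}\nSentiment:",
#   )
#   print(f"Winner: {comparison['winner']} (+{comparison['improvement']:.2f} accuracy)")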