#!/usr/bin/env python3
"""
Prompt Optimization Script

Automatically test and optimize prompts using A/B testing and metrics tracking.
"""

import json
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import numpy as np

@dataclass
class TestCase:
    input: Dict[str, Any]
    expected_output: str
    metadata: Optional[Dict[str, Any]] = None

class PromptOptimizer:
    def __init__(self, llm_client, test_suite: List[TestCase]):
        # llm_client only needs to expose `complete(prompt) -> str`.
        self.client = llm_client
        self.test_suite = test_suite
        self.results_history = []

    def evaluate_prompt(self, prompt_template: str, test_cases: Optional[List[TestCase]] = None) -> Dict[str, float]:
        """Evaluate a prompt template against test cases."""
        if test_cases is None:
            test_cases = self.test_suite

        metrics = {
            'accuracy': [],
            'latency': [],
            'token_count': [],
            'success_rate': []
        }

        for test_case in test_cases:
            start_time = time.time()

            # Render prompt with test case inputs
            prompt = prompt_template.format(**test_case.input)

            # Get LLM response
            response = self.client.complete(prompt)

            # Measure latency
            latency = time.time() - start_time

            # Calculate metrics
            metrics['latency'].append(latency)
            metrics['token_count'].append(len(prompt.split()) + len(response.split()))
            metrics['success_rate'].append(1 if response else 0)

            # Check accuracy
            accuracy = self.calculate_accuracy(response, test_case.expected_output)
            metrics['accuracy'].append(accuracy)

        # Aggregate metrics (cast numpy scalars to float so results stay JSON-serializable)
        return {
            'avg_accuracy': float(np.mean(metrics['accuracy'])),
            'avg_latency': float(np.mean(metrics['latency'])),
            'p95_latency': float(np.percentile(metrics['latency'], 95)),
            'avg_tokens': float(np.mean(metrics['token_count'])),
            'success_rate': float(np.mean(metrics['success_rate']))
        }

    def calculate_accuracy(self, response: str, expected: str) -> float:
        """Calculate accuracy score between response and expected output."""
        # Simple exact match
        if response.strip().lower() == expected.strip().lower():
            return 1.0

        # Partial match using word overlap
        response_words = set(response.lower().split())
        expected_words = set(expected.lower().split())

        if not expected_words:
            return 0.0

        overlap = len(response_words & expected_words)
        return overlap / len(expected_words)

    def optimize(self, base_prompt: str, max_iterations: int = 5) -> Dict[str, Any]:
        """Iteratively optimize a prompt."""
        current_prompt = base_prompt
        best_prompt = base_prompt
        best_score = 0.0

        for iteration in range(max_iterations):
            print(f"\nIteration {iteration + 1}/{max_iterations}")

            # Evaluate current prompt
            metrics = self.evaluate_prompt(current_prompt)
            print(f"Accuracy: {metrics['avg_accuracy']:.2f}, Latency: {metrics['avg_latency']:.2f}s")

            # Track results
            self.results_history.append({
                'iteration': iteration,
                'prompt': current_prompt,
                'metrics': metrics
            })

            # Update best if improved
            if metrics['avg_accuracy'] > best_score:
                best_score = metrics['avg_accuracy']
                best_prompt = current_prompt

            # Stop if good enough
            if metrics['avg_accuracy'] > 0.95:
                print("Achieved target accuracy!")
                break

            # Generate variations for next iteration
            variations = self.generate_variations(current_prompt, metrics)

            # Test variations and pick best
            best_variation = current_prompt
            best_variation_score = metrics['avg_accuracy']

            for variation in variations:
                var_metrics = self.evaluate_prompt(variation)
                if var_metrics['avg_accuracy'] > best_variation_score:
                    best_variation_score = var_metrics['avg_accuracy']
                    best_variation = variation

            current_prompt = best_variation

        return {
            'best_prompt': best_prompt,
            'best_score': best_score,
            'history': self.results_history
        }

    def generate_variations(self, prompt: str, current_metrics: Dict) -> List[str]:
        """Generate prompt variations to test."""
        variations = []

        # Variation 1: Add explicit format instruction
        variations.append(prompt + "\n\nProvide your answer in a clear, concise format.")

        # Variation 2: Add step-by-step instruction
        variations.append("Let's solve this step by step.\n\n" + prompt)

        # Variation 3: Add verification step
        variations.append(prompt + "\n\nVerify your answer before responding.")

        # Variation 4: Make more concise
        concise = self.make_concise(prompt)
        if concise != prompt:
            variations.append(concise)

        # Variation 5: Add examples (if none present)
        if "example" not in prompt.lower():
            variations.append(self.add_examples(prompt))

        # Cap at the first 3 variations to limit evaluation cost per iteration
        return variations[:3]

    def make_concise(self, prompt: str) -> str:
        """Remove redundant words to make prompt more concise."""
        replacements = [
            ("in order to", "to"),
            ("due to the fact that", "because"),
            ("at this point in time", "now"),
            ("in the event that", "if"),
        ]

        result = prompt
        for old, new in replacements:
            result = result.replace(old, new)

        return result

    def add_examples(self, prompt: str) -> str:
        """Add example section to prompt."""
        return f"""{prompt}

Example:
Input: Sample input
Output: Sample output
"""

    def compare_prompts(self, prompt_a: str, prompt_b: str) -> Dict[str, Any]:
        """A/B test two prompts."""
        print("Testing Prompt A...")
        metrics_a = self.evaluate_prompt(prompt_a)

        print("Testing Prompt B...")
        metrics_b = self.evaluate_prompt(prompt_b)

        return {
            'prompt_a_metrics': metrics_a,
            'prompt_b_metrics': metrics_b,
            'winner': 'A' if metrics_a['avg_accuracy'] > metrics_b['avg_accuracy'] else 'B',
            'improvement': abs(metrics_a['avg_accuracy'] - metrics_b['avg_accuracy'])
        }

    def export_results(self, filename: str):
        """Export optimization results to JSON."""
        with open(filename, 'w') as f:
            json.dump(self.results_history, f, indent=2)

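# The optimizer only assumes its client exposes `complete(prompt) -> str` (the
# MockLLMClient in main() below follows the same contract). Purely as an
# illustration, a thin adapter for the OpenAI Python SDK might look like the
# sketch below; the adapter class and the default model name are assumptions
# for demonstration, not part of this script's tested code path.
class OpenAIClientAdapter:
    """Hypothetical adapter exposing an OpenAI chat model via `complete()`."""

    def __init__(self, model: str = "gpt-4o-mini"):
        # Lazy import so the rest of the script runs without the `openai` package.
        from openai import OpenAI
        self._client = OpenAI()
        self._model = model

    def complete(self, prompt: str) -> str:
        response = self._client.chat.completions.create(
            model=self._model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content or ""
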
def main():
    # Example usage
    test_suite = [
        TestCase(
            input={'text': 'This movie was amazing!'},
            expected_output='Positive'
        ),
        TestCase(
            input={'text': 'Worst purchase ever.'},
            expected_output='Negative'
        ),
        TestCase(
            input={'text': 'It was okay, nothing special.'},
            expected_output='Neutral'
        )
    ]

    # Mock LLM client for demonstration
    class MockLLMClient:
        def complete(self, prompt):
            # Simulate an LLM response with simple keyword matching
            prompt_lower = prompt.lower()
            if 'amazing' in prompt_lower:
                return 'Positive'
            elif 'worst' in prompt_lower:
                return 'Negative'
            else:
                return 'Neutral'

    optimizer = PromptOptimizer(MockLLMClient(), test_suite)

    base_prompt = "Classify the sentiment of: {text}\nSentiment:"

    results = optimizer.optimize(base_prompt)

    print("\n" + "=" * 50)
    print("Optimization Complete!")
    print(f"Best Accuracy: {results['best_score']:.2f}")
    print(f"Best Prompt:\n{results['best_prompt']}")

    optimizer.export_results('optimization_results.json')


if __name__ == '__main__':
    main()
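# Usage note: running this file directly exercises the mock client above; with
# it, the optimizer should reach the 0.95 accuracy target on the first
# iteration and write optimization_results.json to the working directory.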