gh-hermeticormus-hermetic-l…/skills/prompt-engineering-patterns/scripts/optimize-prompt.py
#!/usr/bin/env python3
"""
Prompt Optimization Script
Automatically test and optimize prompts using A/B testing and metrics tracking.
"""
import json
import time
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import numpy as np
@dataclass
class TestCase:
input: Dict[str, Any]
expected_output: str
    metadata: Optional[Dict[str, Any]] = None
class PromptOptimizer:
def __init__(self, llm_client, test_suite: List[TestCase]):
self.client = llm_client
self.test_suite = test_suite
self.results_history = []
    def evaluate_prompt(self, prompt_template: str, test_cases: Optional[List[TestCase]] = None) -> Dict[str, float]:
"""Evaluate a prompt template against test cases."""
if test_cases is None:
test_cases = self.test_suite
metrics = {
'accuracy': [],
'latency': [],
'token_count': [],
'success_rate': []
}
for test_case in test_cases:
start_time = time.time()
# Render prompt with test case inputs
prompt = prompt_template.format(**test_case.input)
# Get LLM response
response = self.client.complete(prompt)
# Measure latency
latency = time.time() - start_time
# Calculate metrics
metrics['latency'].append(latency)
metrics['token_count'].append(len(prompt.split()) + len(response.split()))
metrics['success_rate'].append(1 if response else 0)
# Check accuracy
accuracy = self.calculate_accuracy(response, test_case.expected_output)
metrics['accuracy'].append(accuracy)
        # Aggregate metrics (cast NumPy scalars to plain floats so the
        # results stay JSON-serializable for export_results)
        return {
            'avg_accuracy': float(np.mean(metrics['accuracy'])),
            'avg_latency': float(np.mean(metrics['latency'])),
            'p95_latency': float(np.percentile(metrics['latency'], 95)),
            'avg_tokens': float(np.mean(metrics['token_count'])),
            'success_rate': float(np.mean(metrics['success_rate']))
        }
def calculate_accuracy(self, response: str, expected: str) -> float:
"""Calculate accuracy score between response and expected output."""
# Simple exact match
if response.strip().lower() == expected.strip().lower():
return 1.0
# Partial match using word overlap
response_words = set(response.lower().split())
expected_words = set(expected.lower().split())
if not expected_words:
return 0.0
overlap = len(response_words & expected_words)
return overlap / len(expected_words)
def optimize(self, base_prompt: str, max_iterations: int = 5) -> Dict[str, Any]:
"""Iteratively optimize a prompt."""
current_prompt = base_prompt
best_prompt = base_prompt
best_score = 0
for iteration in range(max_iterations):
print(f"\nIteration {iteration + 1}/{max_iterations}")
# Evaluate current prompt
metrics = self.evaluate_prompt(current_prompt)
print(f"Accuracy: {metrics['avg_accuracy']:.2f}, Latency: {metrics['avg_latency']:.2f}s")
# Track results
self.results_history.append({
'iteration': iteration,
'prompt': current_prompt,
'metrics': metrics
})
# Update best if improved
if metrics['avg_accuracy'] > best_score:
best_score = metrics['avg_accuracy']
best_prompt = current_prompt
# Stop if good enough
if metrics['avg_accuracy'] > 0.95:
print("Achieved target accuracy!")
break
# Generate variations for next iteration
variations = self.generate_variations(current_prompt, metrics)
# Test variations and pick best
best_variation = current_prompt
best_variation_score = metrics['avg_accuracy']
for variation in variations:
var_metrics = self.evaluate_prompt(variation)
if var_metrics['avg_accuracy'] > best_variation_score:
best_variation_score = var_metrics['avg_accuracy']
best_variation = variation
current_prompt = best_variation
return {
'best_prompt': best_prompt,
'best_score': best_score,
'history': self.results_history
}
def generate_variations(self, prompt: str, current_metrics: Dict) -> List[str]:
"""Generate prompt variations to test."""
variations = []
# Variation 1: Add explicit format instruction
variations.append(prompt + "\n\nProvide your answer in a clear, concise format.")
# Variation 2: Add step-by-step instruction
variations.append("Let's solve this step by step.\n\n" + prompt)
# Variation 3: Add verification step
variations.append(prompt + "\n\nVerify your answer before responding.")
# Variation 4: Make more concise
concise = self.make_concise(prompt)
if concise != prompt:
variations.append(concise)
# Variation 5: Add examples (if none present)
if "example" not in prompt.lower():
variations.append(self.add_examples(prompt))
        return variations[:3]  # Limit to the first three variations to bound evaluation cost
def make_concise(self, prompt: str) -> str:
"""Remove redundant words to make prompt more concise."""
replacements = [
("in order to", "to"),
("due to the fact that", "because"),
("at this point in time", "now"),
("in the event that", "if"),
]
result = prompt
for old, new in replacements:
result = result.replace(old, new)
return result
def add_examples(self, prompt: str) -> str:
"""Add example section to prompt."""
return f"""{prompt}
Example:
Input: Sample input
Output: Sample output
"""
def compare_prompts(self, prompt_a: str, prompt_b: str) -> Dict[str, Any]:
"""A/B test two prompts."""
print("Testing Prompt A...")
metrics_a = self.evaluate_prompt(prompt_a)
print("Testing Prompt B...")
metrics_b = self.evaluate_prompt(prompt_b)
return {
'prompt_a_metrics': metrics_a,
'prompt_b_metrics': metrics_b,
'winner': 'A' if metrics_a['avg_accuracy'] > metrics_b['avg_accuracy'] else 'B',
'improvement': abs(metrics_a['avg_accuracy'] - metrics_b['avg_accuracy'])
}
def export_results(self, filename: str):
"""Export optimization results to JSON."""
with open(filename, 'w') as f:
json.dump(self.results_history, f, indent=2)
def main():
# Example usage
test_suite = [
TestCase(
input={'text': 'This movie was amazing!'},
expected_output='Positive'
),
TestCase(
input={'text': 'Worst purchase ever.'},
expected_output='Negative'
),
TestCase(
input={'text': 'It was okay, nothing special.'},
expected_output='Neutral'
)
]
# Mock LLM client for demonstration
class MockLLMClient:
def complete(self, prompt):
            # Simulate an LLM response with simple keyword matching
            text = prompt.lower()
            if 'amazing' in text:
                return 'Positive'
            elif 'worst' in text:
                return 'Negative'
            else:
                return 'Neutral'
optimizer = PromptOptimizer(MockLLMClient(), test_suite)
base_prompt = "Classify the sentiment of: {text}\nSentiment:"
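    # A/B comparison is also available, e.g. with a hypothetical alternative prompt:
    #   optimizer.compare_prompts(base_prompt,
    #       "Label the sentiment as Positive, Negative, or Neutral: {text}\nSentiment:")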
results = optimizer.optimize(base_prompt)
print("\n" + "="*50)
print("Optimization Complete!")
print(f"Best Accuracy: {results['best_score']:.2f}")
print(f"Best Prompt:\n{results['best_prompt']}")
optimizer.export_results('optimization_results.json')
if __name__ == '__main__':
main()