# Prompt Optimization Guide

## Systematic Refinement Process

### 1. Baseline Establishment

```python
def establish_baseline(prompt, test_cases):
    # llm.complete, evaluate_accuracy, count_tokens, measure_latency and
    # is_valid_response are assumed to be defined elsewhere.
    results = {
        'accuracy': 0,
        'avg_tokens': 0,
        'avg_latency': 0,
        'success_rate': 0
    }

    for test_case in test_cases:
        response = llm.complete(prompt.format(**test_case['input']))

        results['accuracy'] += evaluate_accuracy(response, test_case['expected'])
        results['avg_tokens'] += count_tokens(response)
        results['avg_latency'] += measure_latency(response)
        results['success_rate'] += is_valid_response(response)

    # Average across test cases
    n = len(test_cases)
    return {k: v / n for k, v in results.items()}
```

### 2. Iterative Refinement Workflow

```
Initial Prompt → Test → Analyze Failures → Refine → Test → Repeat
```

```python
class PromptOptimizer:
    def __init__(self, initial_prompt, test_suite):
        self.prompt = initial_prompt
        self.test_suite = test_suite
        self.history = []

    def optimize(self, max_iterations=10):
        for i in range(max_iterations):
            # Test current prompt
            results = self.evaluate_prompt(self.prompt)
            self.history.append({
                'iteration': i,
                'prompt': self.prompt,
                'results': results
            })

            # Stop if good enough
            if results['accuracy'] > 0.95:
                break

            # Analyze failures
            failures = self.analyze_failures(results)

            # Generate refinement suggestions
            refinements = self.generate_refinements(failures)

            # Apply best refinement
            self.prompt = self.select_best_refinement(refinements)

        return self.get_best_prompt()
```

### 3. A/B Testing Framework

```python
import random

import numpy as np
from scipy import stats


class PromptABTest:
    def __init__(self, variant_a, variant_b):
        self.variant_a = variant_a
        self.variant_b = variant_b

    def run_test(self, test_queries, metrics=('accuracy', 'latency')):
        results = {
            'A': {m: [] for m in metrics},
            'B': {m: [] for m in metrics}
        }

        for query in test_queries:
            # Randomly assign variant (50/50 split)
            variant = 'A' if random.random() < 0.5 else 'B'
            prompt = self.variant_a if variant == 'A' else self.variant_b

            response, metrics_data = self.execute_with_metrics(
                prompt.format(query=query['input'])
            )

            for metric in metrics:
                results[variant][metric].append(metrics_data[metric])

        return self.analyze_results(results)

    def analyze_results(self, results):
        analysis = {}
        for metric in results['A'].keys():
            a_values = results['A'][metric]
            b_values = results['B'][metric]

            # Statistical significance test (two-sample t-test)
            t_stat, p_value = stats.ttest_ind(a_values, b_values)

            # Note: 'improvement' and 'winner' assume higher is better;
            # invert the comparison for metrics like latency.
            analysis[metric] = {
                'A_mean': np.mean(a_values),
                'B_mean': np.mean(b_values),
                'improvement': (np.mean(b_values) - np.mean(a_values)) / np.mean(a_values),
                'statistically_significant': p_value < 0.05,
                'p_value': p_value,
                'winner': 'B' if np.mean(b_values) > np.mean(a_values) else 'A'
            }

        return analysis
```
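
A hypothetical usage sketch follows. The variant prompts and queries are invented, and `execute_with_metrics` is not defined by the class above, so it would have to be supplied (for example by subclassing) before `run_test` actually runs:

```python
# Hypothetical variants and queries for illustration only.
variant_a = "Summarize the following text:\n{query}"
variant_b = "Summarize the following text in 3 bullet points:\n{query}"

test = PromptABTest(variant_a, variant_b)
test_queries = [
    {'input': 'Quarterly revenue grew 12% year over year...'},
    {'input': 'The committee voted to postpone the decision...'},
    # In practice, use many more queries so the t-test has enough samples.
]

analysis = test.run_test(test_queries, metrics=('accuracy', 'latency'))
for metric, summary in analysis.items():
    print(metric, summary['winner'], summary['p_value'])
```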

## Optimization Strategies

### Token Reduction

```python
def optimize_for_tokens(prompt):
    optimizations = [
        # Remove redundant phrases
        ('in order to', 'to'),
        ('due to the fact that', 'because'),
        ('at this point in time', 'now'),

        # Consolidate instructions
        ('First, ...\nThen, ...\nFinally, ...', 'Steps: 1) ... 2) ... 3) ...'),

        # Use abbreviations (after first definition)
        ('Natural Language Processing (NLP)', 'NLP'),

        # Remove filler words
        (' actually ', ' '),
        (' basically ', ' '),
        (' really ', ' ')
    ]

    optimized = prompt
    for old, new in optimizations:
        optimized = optimized.replace(old, new)

    return optimized
```

### Latency Reduction

```python
def optimize_for_latency(prompt):
    # reduce_token_count, enable_streaming_response, add_cacheable_prefix,
    # add_stop_sequences and measure_average_latency are assumed helpers.
    strategies = {
        'shorter_prompt': reduce_token_count(prompt),
        'streaming': enable_streaming_response(prompt),
        'caching': add_cacheable_prefix(prompt),
        'early_stopping': add_stop_sequences(prompt)
    }

    # Test each strategy and keep the fastest variant
    best_strategy = None
    best_latency = float('inf')

    for name, modified_prompt in strategies.items():
        latency = measure_average_latency(modified_prompt)
        if latency < best_latency:
            best_latency = latency
            best_strategy = modified_prompt

    return best_strategy
```

### Accuracy Improvement

```python
def improve_accuracy(prompt, failure_cases):
    improvements = []

    # Add constraints for common failures
    if has_format_errors(failure_cases):
        improvements.append("Output must be valid JSON with no additional text.")

    # Add examples for edge cases
    edge_cases = identify_edge_cases(failure_cases)
    if edge_cases:
        improvements.append(f"Examples of edge cases:\n{format_examples(edge_cases)}")

    # Add verification step
    if has_logical_errors(failure_cases):
        improvements.append("Before responding, verify your answer is logically consistent.")

    # Strengthen instructions
    if has_ambiguity_errors(failure_cases):
        improvements.append(clarify_ambiguous_instructions(prompt))

    return integrate_improvements(prompt, improvements)
```

## Performance Metrics

### Core Metrics

```python
from collections import Counter, defaultdict

import numpy as np


class PromptMetrics:
    @staticmethod
    def accuracy(responses, ground_truth):
        return sum(r == gt for r, gt in zip(responses, ground_truth)) / len(responses)

    @staticmethod
    def consistency(responses):
        # Measure how often identical inputs produce identical outputs
        input_responses = defaultdict(list)

        for inp, resp in responses:
            input_responses[inp].append(resp)

        consistency_scores = []
        for inp, resps in input_responses.items():
            if len(resps) > 1:
                # Percentage of responses that match the most common response
                most_common_count = Counter(resps).most_common(1)[0][1]
                consistency_scores.append(most_common_count / len(resps))

        return np.mean(consistency_scores) if consistency_scores else 1.0

    @staticmethod
    def token_efficiency(prompt, responses):
        avg_prompt_tokens = np.mean([count_tokens(prompt.format(**r['input'])) for r in responses])
        avg_response_tokens = np.mean([count_tokens(r['output']) for r in responses])
        return avg_prompt_tokens + avg_response_tokens

    @staticmethod
    def latency_p95(latencies):
        return np.percentile(latencies, 95)
```
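
The consistency metric expects a sequence of (input, response) pairs. A small illustration with made-up responses:

```python
# Made-up pairs: two inputs, each asked twice; one repeated input disagrees.
paired = [
    ("What is 2+2?", "4"),
    ("What is 2+2?", "4"),
    ("Capital of France?", "Paris"),
    ("Capital of France?", "paris"),
]
print(PromptMetrics.consistency(paired))  # 0.75: one of the two repeated inputs disagrees
```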

### Automated Evaluation

```python
import time

import numpy as np


def evaluate_prompt_comprehensively(prompt, test_suite):
    # llm, count_tokens, evaluate_accuracy, calculate_similarity and
    # is_valid are assumed helpers.
    results = {
        'accuracy': [],
        'consistency': [],
        'latency': [],
        'tokens': [],
        'success_rate': []
    }

    # Run each test case multiple times for consistency measurement
    for test_case in test_suite:
        runs = []
        for _ in range(3):  # 3 runs per test case
            start = time.time()
            response = llm.complete(prompt.format(**test_case['input']))
            latency = time.time() - start

            runs.append(response)
            results['latency'].append(latency)
            results['tokens'].append(count_tokens(prompt) + count_tokens(response))

        # Accuracy (best of 3 runs)
        accuracies = [evaluate_accuracy(r, test_case['expected']) for r in runs]
        results['accuracy'].append(max(accuracies))

        # Consistency (how similar are the 3 runs?)
        results['consistency'].append(calculate_similarity(runs))

        # Success rate (all runs successful?)
        results['success_rate'].append(all(is_valid(r) for r in runs))

    return {
        'avg_accuracy': np.mean(results['accuracy']),
        'avg_consistency': np.mean(results['consistency']),
        'p95_latency': np.percentile(results['latency'], 95),
        'avg_tokens': np.mean(results['tokens']),
        'success_rate': np.mean(results['success_rate'])
    }
```

## Failure Analysis

### Categorizing Failures

```python
class FailureAnalyzer:
    def categorize_failures(self, test_results):
        categories = {
            'format_errors': [],
            'factual_errors': [],
            'logic_errors': [],
            'incomplete_responses': [],
            'hallucinations': [],
            'off_topic': []
        }

        for result in test_results:
            if not result['success']:
                category = self.determine_failure_type(
                    result['response'],
                    result['expected']
                )
                categories[category].append(result)

        return categories

    def generate_fixes(self, categorized_failures):
        fixes = []

        if categorized_failures['format_errors']:
            fixes.append({
                'issue': 'Format errors',
                'fix': 'Add explicit format examples and constraints',
                'priority': 'high'
            })

        if categorized_failures['hallucinations']:
            fixes.append({
                'issue': 'Hallucinations',
                'fix': 'Add grounding instruction: "Base your answer only on provided context"',
                'priority': 'critical'
            })

        if categorized_failures['incomplete_responses']:
            fixes.append({
                'issue': 'Incomplete responses',
                'fix': 'Add: "Ensure your response fully addresses all parts of the question"',
                'priority': 'medium'
            })

        return fixes
```
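
A quick sketch of `generate_fixes` on its own (the categorization below is hand-built and purely illustrative, since `determine_failure_type` is not shown above):

```python
# Hand-built categorization with one made-up format failure.
analyzer = FailureAnalyzer()
categorized = {
    'format_errors': [{'response': 'not json', 'expected': '{"a": 1}'}],
    'factual_errors': [], 'logic_errors': [],
    'incomplete_responses': [], 'hallucinations': [], 'off_topic': []
}
for fix in analyzer.generate_fixes(categorized):
    print(fix['priority'], '-', fix['fix'])
```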

## Versioning and Rollback

### Prompt Version Control

```python
from datetime import datetime


class PromptVersionControl:
    def __init__(self, storage_path):
        self.storage = storage_path
        self.versions = []

    def save_version(self, prompt, metadata):
        version = {
            'id': len(self.versions),
            'prompt': prompt,
            'timestamp': datetime.now(),
            'metrics': metadata.get('metrics', {}),
            'description': metadata.get('description', ''),
            'parent_id': metadata.get('parent_id')
        }
        self.versions.append(version)
        self.persist()  # write versions to self.storage (implementation not shown)
        return version['id']

    def rollback(self, version_id):
        if version_id < len(self.versions):
            return self.versions[version_id]['prompt']
        raise ValueError(f"Version {version_id} not found")

    def compare_versions(self, v1_id, v2_id):
        v1 = self.versions[v1_id]
        v2 = self.versions[v2_id]

        return {
            'diff': generate_diff(v1['prompt'], v2['prompt']),
            'metrics_comparison': {
                metric: {
                    'v1': v1['metrics'].get(metric),
                    'v2': v2['metrics'].get(metric),
                    'change': v2['metrics'].get(metric, 0) - v1['metrics'].get(metric, 0)
                }
                for metric in set(v1['metrics'].keys()) | set(v2['metrics'].keys())
            }
        }
```
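
A minimal usage sketch, assuming `persist()` and `generate_diff()` have been implemented; the storage path, prompts, and metric values are invented:

```python
# Illustrative only: persist() and generate_diff() must exist for this to run.
vc = PromptVersionControl(storage_path="prompts/versions.json")

v0 = vc.save_version(
    "Summarize the text:\n{query}",
    {'metrics': {'accuracy': 0.82}, 'description': 'baseline'}
)
v1 = vc.save_version(
    "Summarize the text in 3 bullet points:\n{query}",
    {'metrics': {'accuracy': 0.91}, 'description': 'added structure', 'parent_id': v0}
)

print(vc.compare_versions(v0, v1)['metrics_comparison'])
baseline_prompt = vc.rollback(v0)  # revert if the new version regresses
```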

## Best Practices

1. **Establish Baseline**: Always measure initial performance
2. **Change One Thing**: Isolate variables for clear attribution
3. **Test Thoroughly**: Use diverse, representative test cases
4. **Track Metrics**: Log all experiments and results (see the sketch after this list)
5. **Validate Significance**: Use statistical tests for A/B comparisons
6. **Document Changes**: Keep detailed notes on what changed and why
7. **Version Everything**: Enable rollback to previous versions
8. **Monitor Production**: Continuously evaluate deployed prompts
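
Practice 4 can be as simple as appending one JSON line per experiment. The fields below are one possible layout, not a fixed schema:

```python
import json
import time

def log_experiment(path, prompt_version, metrics, notes=""):
    """Append one experiment record as a JSON line (fields are illustrative)."""
    record = {
        'timestamp': time.time(),
        'prompt_version': prompt_version,
        'metrics': metrics,
        'notes': notes
    }
    with open(path, 'a') as f:
        f.write(json.dumps(record) + '\n')

# Example: log a run of a baseline prompt (values are made up)
log_experiment('experiments.jsonl', 'v0', {'accuracy': 0.82, 'p95_latency': 1.4}, 'baseline')
```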

## Common Optimization Patterns

### Pattern 1: Add Structure

```
Before: "Analyze this text"
After: "Analyze this text for:\n1. Main topic\n2. Key arguments\n3. Conclusion"
```

### Pattern 2: Add Examples

```
Before: "Extract entities"
After: "Extract entities\n\nExample:\nText: Apple released iPhone\nEntities: {company: Apple, product: iPhone}"
```

### Pattern 3: Add Constraints

```
Before: "Summarize this"
After: "Summarize in exactly 3 bullet points, 15 words each"
```

### Pattern 4: Add Verification

```
Before: "Calculate..."
After: "Calculate... Then verify your calculation is correct before responding."
```

## Tools and Utilities

- Prompt diff tools for version comparison
- Automated test runners
- Metric dashboards
- A/B testing frameworks
- Token counting utilities (see the sketch below)
- Latency profilers
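
As one example from this list, a token counter can be a thin wrapper over a tokenizer library. The sketch below assumes the `tiktoken` package and the `cl100k_base` encoding, which may not match your model:

```python
# Minimal token-counting utility; requires the tiktoken package.
# The encoding name is an assumption; use the one matching your model.
import tiktoken

def count_tokens(text, encoding_name="cl100k_base"):
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(text))

print(count_tokens("Summarize this in exactly 3 bullet points."))  # prints the token count
```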
|