# Prompt Optimization Guide

## Systematic Refinement Process

### 1. Baseline Establishment
```python
import time

def establish_baseline(prompt, test_cases):
    """Measure baseline metrics for a prompt over a fixed test set."""
    results = {
        'accuracy': 0,
        'avg_tokens': 0,
        'avg_latency': 0,
        'success_rate': 0
    }

    for test_case in test_cases:
        start = time.perf_counter()
        response = llm.complete(prompt.format(**test_case['input']))
        results['avg_latency'] += time.perf_counter() - start

        results['accuracy'] += evaluate_accuracy(response, test_case['expected'])
        results['avg_tokens'] += count_tokens(response)
        results['success_rate'] += is_valid_response(response)  # True/False counts as 1/0

    # Average across test cases
    n = len(test_cases)
    return {k: v / n for k, v in results.items()}
```
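The snippets in this guide lean on a few shared helpers (`llm.complete`, `count_tokens`, `evaluate_accuracy`, `is_valid_response`) that are never defined; they are stand-ins rather than a specific library. A minimal sketch of the non-client helpers, assuming a JSON-output task and treating `tiktoken` as an optional tokenizer:

```python
import json

def count_tokens(text: str) -> int:
    # Prefer a real tokenizer when available; fall back to a rough
    # whitespace count so the examples stay runnable without extra deps.
    try:
        import tiktoken  # optional dependency
        return len(tiktoken.get_encoding("cl100k_base").encode(text))
    except ImportError:
        return len(text.split())

def evaluate_accuracy(response: str, expected: str) -> float:
    # Simplest possible scorer: exact match after whitespace normalisation.
    # Swap in task-specific scoring (F1, an LLM judge, ...) for real use.
    return float(response.strip() == expected.strip())

def is_valid_response(response: str) -> bool:
    # Example validity check for a JSON-output task.
    try:
        json.loads(response)
        return True
    except (json.JSONDecodeError, TypeError):
        return False
```

`llm.complete(prompt)` remains whatever call into your model client returns the completion text.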
### 2. Iterative Refinement Workflow
```
Initial Prompt → Test → Analyze Failures → Refine → Test → Repeat
```
```python
class PromptOptimizer:
    """Refine a prompt until it passes the test suite or iterations run out.

    Assumes evaluate_prompt, analyze_failures, generate_refinements,
    select_best_refinement, and get_best_prompt are supplied.
    """

    def __init__(self, initial_prompt, test_suite):
        self.prompt = initial_prompt
        self.test_suite = test_suite
        self.history = []

    def optimize(self, max_iterations=10):
        for i in range(max_iterations):
            # Test current prompt
            results = self.evaluate_prompt(self.prompt)
            self.history.append({
                'iteration': i,
                'prompt': self.prompt,
                'results': results
            })

            # Stop if good enough
            if results['accuracy'] > 0.95:
                break

            # Analyze failures
            failures = self.analyze_failures(results)

            # Generate refinement suggestions
            refinements = self.generate_refinements(failures)

            # Apply best refinement
            self.prompt = self.select_best_refinement(refinements)

        return self.get_best_prompt()
```
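A hypothetical usage sketch for the optimizer; the prompt, test cases, and iteration budget below are made up, and the hook methods listed in the docstring still have to be supplied:

```python
# Test suite entries follow the same {'input': ..., 'expected': ...} shape
# used by establish_baseline above.
test_suite = [
    {'input': {'text': 'The cat sat on the mat.'}, 'expected': 'A cat sat on a mat.'},
    # ... more cases ...
]

optimizer = PromptOptimizer(
    initial_prompt="Summarize the following text in one sentence:\n{text}",
    test_suite=test_suite,
)
best_prompt = optimizer.optimize(max_iterations=5)

# The recorded history makes it easy to inspect accuracy per iteration.
for entry in optimizer.history:
    print(entry['iteration'], entry['results']['accuracy'])
```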
### 3. A/B Testing Framework
```python
import random

import numpy as np
from scipy import stats

class PromptABTest:
    def __init__(self, variant_a, variant_b):
        self.variant_a = variant_a
        self.variant_b = variant_b

    def run_test(self, test_queries, metrics=('accuracy', 'latency')):
        results = {
            'A': {m: [] for m in metrics},
            'B': {m: [] for m in metrics}
        }

        for query in test_queries:
            # Randomly assign variant (50/50 split)
            variant = 'A' if random.random() < 0.5 else 'B'
            prompt = self.variant_a if variant == 'A' else self.variant_b

            response, metrics_data = self.execute_with_metrics(
                prompt.format(query=query['input'])
            )

            for metric in metrics:
                results[variant][metric].append(metrics_data[metric])

        return self.analyze_results(results)

    def analyze_results(self, results):
        analysis = {}
        for metric in results['A'].keys():
            a_values = results['A'][metric]
            b_values = results['B'][metric]

            # Statistical significance test
            t_stat, p_value = stats.ttest_ind(a_values, b_values)

            analysis[metric] = {
                'A_mean': np.mean(a_values),
                'B_mean': np.mean(b_values),
                'improvement': (np.mean(b_values) - np.mean(a_values)) / np.mean(a_values),
                'statistically_significant': p_value < 0.05,
                'p_value': p_value,
                'winner': 'B' if np.mean(b_values) > np.mean(a_values) else 'A'
            }

        return analysis
```
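A usage sketch, assuming `execute_with_metrics` has been implemented to return the raw response plus a dict with one value per requested metric; the variant strings and queries are invented for illustration:

```python
variant_a = "Answer the question: {query}"
variant_b = "Answer the question concisely and show your reasoning: {query}"

test = PromptABTest(variant_a, variant_b)
analysis = test.run_test(
    test_queries=[{'input': 'What causes tides?'}, {'input': 'Why is the sky blue?'}],
    metrics=['accuracy', 'latency'],
)

for metric, summary in analysis.items():
    if summary['statistically_significant']:
        print(f"{metric}: variant {summary['winner']} wins (p={summary['p_value']:.3f})")
    else:
        print(f"{metric}: no significant difference yet; collect more samples")
```

With only a handful of queries the t-test will rarely reach significance; collect enough samples per variant, and consider `stats.ttest_ind(a_values, b_values, equal_var=False)` (Welch's test) when the two groups may have unequal variance.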
## Optimization Strategies

### Token Reduction
```python
def optimize_for_tokens(prompt):
    optimizations = [
        # Remove redundant phrases
        ('in order to', 'to'),
        ('due to the fact that', 'because'),
        ('at this point in time', 'now'),

        # Consolidate instructions
        ('First, ...\nThen, ...\nFinally, ...', 'Steps: 1) ... 2) ... 3) ...'),

        # Use abbreviations (after first definition)
        ('Natural Language Processing (NLP)', 'NLP'),

        # Remove filler words
        (' actually ', ' '),
        (' basically ', ' '),
        (' really ', ' ')
    ]

    optimized = prompt
    for old, new in optimizations:
        optimized = optimized.replace(old, new)

    return optimized
```
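Plain `str.replace` misses phrases that differ in capitalisation or butt up against punctuation. A slightly more robust variant of the same idea (a sketch, not part of the helper above) uses word-boundary regexes:

```python
import re

def replace_phrase(text: str, old: str, new: str) -> str:
    # Word-boundary, case-insensitive match so 'In order to' and
    # 'in order to,' are both caught.
    pattern = r'\b' + re.escape(old.strip()) + r'\b'
    out = re.sub(pattern, new.strip(), text, flags=re.IGNORECASE)
    return re.sub(r' {2,}', ' ', out)  # collapse double spaces left by deletions
```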
### Latency Reduction
```python
def optimize_for_latency(prompt):
    strategies = {
        'shorter_prompt': reduce_token_count(prompt),
        'streaming': enable_streaming_response(prompt),
        'caching': add_cacheable_prefix(prompt),
        'early_stopping': add_stop_sequences(prompt)
    }

    # Test each strategy and keep the fastest variant
    best_strategy = None
    best_latency = float('inf')

    for name, modified_prompt in strategies.items():
        latency = measure_average_latency(modified_prompt)
        if latency < best_latency:
            best_latency = latency
            best_strategy = modified_prompt

    return best_strategy
```
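`measure_average_latency` is referenced but never defined. One minimal interpretation, assuming the prompt exposes a `{query}` placeholder and reusing the `llm.complete` stand-in, is to time a few completions and average the wall-clock results:

```python
import time

def measure_average_latency(prompt, sample_inputs=None, runs=3):
    # Time several completions against representative inputs and average them.
    sample_inputs = sample_inputs or [{'query': 'example input'}]
    timings = []
    for sample in sample_inputs:
        for _ in range(runs):
            start = time.perf_counter()
            llm.complete(prompt.format(**sample))
            timings.append(time.perf_counter() - start)
    return sum(timings) / len(timings)
```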
### Accuracy Improvement
```python
def improve_accuracy(prompt, failure_cases):
    improvements = []

    # Add constraints for common failures
    if has_format_errors(failure_cases):
        improvements.append("Output must be valid JSON with no additional text.")

    # Add examples for edge cases
    edge_cases = identify_edge_cases(failure_cases)
    if edge_cases:
        improvements.append(f"Examples of edge cases:\n{format_examples(edge_cases)}")

    # Add verification step
    if has_logical_errors(failure_cases):
        improvements.append("Before responding, verify your answer is logically consistent.")

    # Strengthen instructions
    if has_ambiguity_errors(failure_cases):
        improvements.append(clarify_ambiguous_instructions(prompt))

    return integrate_improvements(prompt, improvements)
```
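`integrate_improvements` is likewise left open. One simple reading (a sketch, not the only sensible design) appends the new instructions as a clearly delimited requirements block:

```python
def integrate_improvements(prompt: str, improvements: list) -> str:
    # Append each improvement as an explicit requirement; more careful
    # integration might merge them into the relevant prompt sections instead.
    if not improvements:
        return prompt
    bullet_list = "\n".join(f"- {item}" for item in improvements)
    return f"{prompt}\n\nAdditional requirements:\n{bullet_list}"
```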
## Performance Metrics

### Core Metrics
```python
from collections import Counter, defaultdict

import numpy as np

class PromptMetrics:
    @staticmethod
    def accuracy(responses, ground_truth):
        return sum(r == gt for r, gt in zip(responses, ground_truth)) / len(responses)

    @staticmethod
    def consistency(responses):
        # Measure how often identical inputs produce identical outputs.
        # `responses` is a sequence of (input, response) pairs.
        input_responses = defaultdict(list)

        for inp, resp in responses:
            input_responses[inp].append(resp)

        consistency_scores = []
        for inp, resps in input_responses.items():
            if len(resps) > 1:
                # Percentage of responses that match the most common response
                most_common_count = Counter(resps).most_common(1)[0][1]
                consistency_scores.append(most_common_count / len(resps))

        return np.mean(consistency_scores) if consistency_scores else 1.0

    @staticmethod
    def token_efficiency(prompt, responses):
        avg_prompt_tokens = np.mean([count_tokens(prompt.format(**r['input'])) for r in responses])
        avg_response_tokens = np.mean([count_tokens(r['output']) for r in responses])
        return avg_prompt_tokens + avg_response_tokens

    @staticmethod
    def latency_p95(latencies):
        return np.percentile(latencies, 95)
```
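A small worked example of the consistency and latency metrics; the numbers and prompts are made up:

```python
latencies = [0.8, 1.1, 0.9, 2.3, 1.0]
print(PromptMetrics.latency_p95(latencies))   # 95th-percentile latency in seconds

paired = [
    ("What is 2 + 2?", "4"),
    ("What is 2 + 2?", "4"),
    ("What is 2 + 2?", "Four"),
]
print(PromptMetrics.consistency(paired))      # 2 of 3 repeats agree -> ~0.67
```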
### Automated Evaluation
```python
import time

import numpy as np

def evaluate_prompt_comprehensively(prompt, test_suite):
    results = {
        'accuracy': [],
        'consistency': [],
        'latency': [],
        'tokens': [],
        'success_rate': []
    }

    # Run each test case multiple times for consistency measurement
    for test_case in test_suite:
        runs = []
        for _ in range(3):  # 3 runs per test case
            start = time.time()
            response = llm.complete(prompt.format(**test_case['input']))
            latency = time.time() - start

            runs.append(response)
            results['latency'].append(latency)
            results['tokens'].append(count_tokens(prompt) + count_tokens(response))

        # Accuracy (best of 3 runs)
        accuracies = [evaluate_accuracy(r, test_case['expected']) for r in runs]
        results['accuracy'].append(max(accuracies))

        # Consistency (how similar are the 3 runs?)
        results['consistency'].append(calculate_similarity(runs))

        # Success rate (all runs successful?)
        results['success_rate'].append(all(is_valid_response(r) for r in runs))

    return {
        'avg_accuracy': np.mean(results['accuracy']),
        'avg_consistency': np.mean(results['consistency']),
        'p95_latency': np.percentile(results['latency'], 95),
        'avg_tokens': np.mean(results['tokens']),
        'success_rate': np.mean(results['success_rate'])
    }
```
## Failure Analysis

### Categorizing Failures
```python
class FailureAnalyzer:
    def categorize_failures(self, test_results):
        categories = {
            'format_errors': [],
            'factual_errors': [],
            'logic_errors': [],
            'incomplete_responses': [],
            'hallucinations': [],
            'off_topic': []
        }

        for result in test_results:
            if not result['success']:
                category = self.determine_failure_type(
                    result['response'],
                    result['expected']
                )
                categories[category].append(result)

        return categories

    def generate_fixes(self, categorized_failures):
        fixes = []

        if categorized_failures['format_errors']:
            fixes.append({
                'issue': 'Format errors',
                'fix': 'Add explicit format examples and constraints',
                'priority': 'high'
            })

        if categorized_failures['hallucinations']:
            fixes.append({
                'issue': 'Hallucinations',
                'fix': 'Add grounding instruction: "Base your answer only on provided context"',
                'priority': 'critical'
            })

        if categorized_failures['incomplete_responses']:
            fixes.append({
                'issue': 'Incomplete responses',
                'fix': 'Add: "Ensure your response fully addresses all parts of the question"',
                'priority': 'medium'
            })

        return fixes
```
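`determine_failure_type` is the piece you supply. A rough heuristic stand-in for JSON-output tasks is sketched below; production setups often use an LLM judge with a rubric instead.

```python
import json

class HeuristicFailureAnalyzer(FailureAnalyzer):
    # Rough stand-in for determine_failure_type; anything not caught by
    # these checks falls through to 'logic_errors'.
    def determine_failure_type(self, response, expected):
        try:
            json.loads(response)
        except (json.JSONDecodeError, TypeError):
            return 'format_errors'
        if len(response) < 0.5 * len(str(expected)):
            return 'incomplete_responses'
        return 'logic_errors'
```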
## Versioning and Rollback

### Prompt Version Control
```python
from datetime import datetime

class PromptVersionControl:
    def __init__(self, storage_path):
        self.storage = storage_path
        self.versions = []

    def save_version(self, prompt, metadata):
        version = {
            'id': len(self.versions),
            'prompt': prompt,
            'timestamp': datetime.now(),
            'metrics': metadata.get('metrics', {}),
            'description': metadata.get('description', ''),
            'parent_id': metadata.get('parent_id')
        }
        self.versions.append(version)
        self.persist()
        return version['id']

    def rollback(self, version_id):
        if version_id < len(self.versions):
            return self.versions[version_id]['prompt']
        raise ValueError(f"Version {version_id} not found")

    def compare_versions(self, v1_id, v2_id):
        v1 = self.versions[v1_id]
        v2 = self.versions[v2_id]

        return {
            'diff': generate_diff(v1['prompt'], v2['prompt']),
            'metrics_comparison': {
                metric: {
                    'v1': v1['metrics'].get(metric),
                    'v2': v2['metrics'].get(metric),
                    'change': v2['metrics'].get(metric, 0) - v1['metrics'].get(metric, 0)
                }
                for metric in set(v1['metrics'].keys()) | set(v2['metrics'].keys())
            }
        }
```
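`persist()` and `generate_diff()` are assumed rather than shown. One lightweight interpretation, using a JSON file for storage and the standard-library `difflib` for diffs:

```python
import difflib
import json

def generate_diff(old_prompt: str, new_prompt: str) -> str:
    # Unified diff of the two prompt texts, line by line.
    return "\n".join(difflib.unified_diff(
        old_prompt.splitlines(), new_prompt.splitlines(),
        fromfile="v1", tofile="v2", lineterm="",
    ))

def _persist(self):
    # default=str turns datetime objects into strings so they serialize.
    with open(self.storage, "w") as f:
        json.dump(self.versions, f, default=str, indent=2)

PromptVersionControl.persist = _persist  # attach the sketch to the class
```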
## Best Practices

1. **Establish Baseline**: Always measure initial performance
2. **Change One Thing**: Isolate variables for clear attribution
3. **Test Thoroughly**: Use diverse, representative test cases
4. **Track Metrics**: Log all experiments and results (a minimal logging sketch follows this list)
5. **Validate Significance**: Use statistical tests for A/B comparisons
6. **Document Changes**: Keep detailed notes on what changed and why
7. **Version Everything**: Enable rollback to previous versions
8. **Monitor Production**: Continuously evaluate deployed prompts
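For practice 4, even a single JSON-lines file goes a long way. A minimal logging sketch; the path and record fields are just one reasonable choice:

```python
import json
import time

def log_experiment(path, prompt_version, metrics, notes=""):
    # Append one JSON record per evaluation run so experiments can be
    # compared (or plotted) later without a database.
    record = {
        "timestamp": time.time(),
        "prompt_version": prompt_version,
        "metrics": metrics,
        "notes": notes,
    }
    with open(path, "a") as f:
        f.write(json.dumps(record) + "\n")
```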
## Common Optimization Patterns

### Pattern 1: Add Structure
```
Before: "Analyze this text"
After: "Analyze this text for:\n1. Main topic\n2. Key arguments\n3. Conclusion"
```

### Pattern 2: Add Examples
```
Before: "Extract entities"
After: "Extract entities\n\nExample:\nText: Apple released iPhone\nEntities: {company: Apple, product: iPhone}"
```

### Pattern 3: Add Constraints
```
Before: "Summarize this"
After: "Summarize in exactly 3 bullet points, 15 words each"
```

### Pattern 4: Add Verification
```
Before: "Calculate..."
After: "Calculate... Then verify your calculation is correct before responding."
```
## Tools and Utilities

- Prompt diff tools for version comparison
- Automated test runners
- Metric dashboards
- A/B testing frameworks
- Token counting utilities
- Latency profilers