From e33f0fba544129cfd2a80b40b54d43589b0eb040 Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:27:42 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 11 + README.md | 3 + plugin.lock.json | 73 +++ skills/ace-skill.md | 561 +++++++++++++++++++++++ skills/curate-delta/SKILL.md | 414 +++++++++++++++++ skills/generate-appworld-code/SKILL.md | 232 ++++++++++ skills/playbook.json | 260 +++++++++++ skills/playbook_appworld_seed.json | 99 ++++ skills/playbook_epoch_1.json | 260 +++++++++++ skills/playbook_epoch_2.json | 184 ++++++++ skills/reflect-appworld-failure/SKILL.md | 200 ++++++++ 11 files changed, 2297 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 plugin.lock.json create mode 100644 skills/ace-skill.md create mode 100644 skills/curate-delta/SKILL.md create mode 100644 skills/generate-appworld-code/SKILL.md create mode 100644 skills/playbook.json create mode 100644 skills/playbook_appworld_seed.json create mode 100644 skills/playbook_epoch_1.json create mode 100644 skills/playbook_epoch_2.json create mode 100644 skills/reflect-appworld-failure/SKILL.md diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..fde1967 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,11 @@ +{ + "name": "ace-context-engineering", + "description": "Agentic Context Engineering: evolve context through bullets, deltas, and TF-IDF retrieval. Complete with schemas, validation, and production scripts.", + "version": "1.0.0", + "author": { + "name": "jmanhype" + }, + "skills": [ + "./skills" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..08d5329 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# ace-context-engineering + +Agentic Context Engineering: evolve context through bullets, deltas, and TF-IDF retrieval. Complete with schemas, validation, and production scripts. 
diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..bdbb140 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,73 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:jmanhype/claude-code-plugin-marketplace:plugins/ace-context-engineering", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "f136d46c90ac546cc69a2526e5e03f73d504a6b6", + "treeHash": "765b9d30c96c284be286228cee624ead6484969dc9ef201e67fa98d2cd786254", + "generatedAt": "2025-11-28T10:19:15.965102Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "ace-context-engineering", + "description": "Agentic Context Engineering: evolve context through bullets, deltas, and TF-IDF retrieval. Complete with schemas, validation, and production scripts.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "b3591cf72300bc6f3b82f80ae37385d50d94a052de6401b719807b02a6561fcc" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "38d94c48e64a3e11412846987e57edcc96967469b2dbbe41c333b6399532aedd" + }, + { + "path": "skills/ace-skill.md", + "sha256": "8908295b83a6c129a9c6614b3336ab682dd2466d1986646f978eedcb63a4b079" + }, + { + "path": "skills/playbook_epoch_2.json", + "sha256": "c55023281edfedb7983e8892c481cb6c20e74cd8365d6ce8f72d20734b898164" + }, + { + "path": "skills/playbook_appworld_seed.json", + "sha256": "fe741607453e8b83f02a99d3bb1fe5a7731f30c6dbbfec595aceba77fba924ab" + }, + { + "path": "skills/playbook.json", + "sha256": "203caacf8224d3a0e36647ddde8750fe8331cb1e9fb06edaa053788875cc80a7" + }, + { + "path": "skills/playbook_epoch_1.json", + "sha256": "203caacf8224d3a0e36647ddde8750fe8331cb1e9fb06edaa053788875cc80a7" + }, + { + "path": 
"skills/curate-delta/SKILL.md", + "sha256": "463f525f2fb3aff5a5e5a57cb1481f0f4d390941bf69fed51ebab8ad4eb42f17" + }, + { + "path": "skills/reflect-appworld-failure/SKILL.md", + "sha256": "6857a4c0f5df816e98f65e834a6766c93c9588396d44b783e75c27d00d59e823" + }, + { + "path": "skills/generate-appworld-code/SKILL.md", + "sha256": "61f0af64574665f9cbd2162b305dd3d6c6565b7c25b25327487555a3566625f9" + } + ], + "dirSha256": "765b9d30c96c284be286228cee624ead6484969dc9ef201e67fa98d2cd786254" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/ace-skill.md b/skills/ace-skill.md new file mode 100644 index 0000000..13974bf --- /dev/null +++ b/skills/ace-skill.md @@ -0,0 +1,561 @@ +# ACE Context Engineering Skill + +**Name:** ace-context-engineering +**Version:** 1.0.0 +**Description:** Agentic Context Engineering - Progressive disclosure and incremental context evolution through bullets and deltas + +--- + +## Purpose + +This skill enables autonomous evolution of the context system through: + +1. **Retrieval** - Query relevant bullets based on task context +2. **Generation** - Solve tasks using retrieved guidance +3. **Reflection** - Evaluate what worked/didn't work +4. **Curation** - Propose incremental delta updates +5. 
**Merge** - Apply deltas to evolve the playbook + +--- + +## When to Use This Skill + +✅ **Use when:** +- Completing complex tasks that could benefit from learned patterns +- Discovering new heuristics or edge cases +- Finding gaps in current context/guidance +- Task failed and needs to capture lessons learned +- User explicitly requests context improvement + +❌ **Don't use when:** +- Simple, one-off tasks with no reusable patterns +- Just reading/retrieving context (use Read tool instead) +- No clear signal about what worked or didn't work + +--- + +## File Structure + +``` +.claude/skills/ace-context-engineering/ +├── skill.md # This file +├── playbook.json # Current bullet library +├── schemas/ +│ ├── bullet.schema.json # Bullet validation schema +│ └── delta.schema.json # Delta validation schema +└── scripts/ + └── validate_delta.py # Delta validation script +``` + +--- + +## Workflow: Full ACE Cycle + +### Phase 1: RETRIEVE + +**Goal:** Load relevant bullets for current task + +1. Read `playbook.json` to get available bullets +2. Filter by tags matching task domain (e.g., "tool.edit", "git.push", "domain.plugin_marketplace") +3. Score bullets by: + - Tag relevance (how many tags overlap with task) + - `helpful_count` (higher = better) + - `harmful_count` (higher = worse, may skip) + - Confidence level + - Recency (newer bullets might be more relevant) +4. Select top 5-10 bullets (avoid context overload) + +**Example retrieval for "edit JSON file" task:** + +```json +{ + "task_tags": ["tool.edit", "tool.read", "json", "validation"], + "retrieved_bullets": [ + "bullet-2025-10-25-001", // Read before edit + "bullet-2025-10-25-012", // Preserve indentation + "bullet-2025-10-25-006" // Validate JSON + ] +} +``` + +### Phase 2: GENERATE + +**Goal:** Solve the task using retrieved guidance + +1. Apply bullet guidance to task execution +2. Track which bullets you actually used +3. Observe execution outcomes (success/failure) +4. 
Note any gaps in guidance + +**Output format:** + +```json +{ + "final_answer": "Completed successfully. Edited file.json with proper validation.", + "used_bullet_ids": [ + "bullet-2025-10-25-001", + "bullet-2025-10-25-012", + "bullet-2025-10-25-006" + ], + "observations": [ + "Read-before-edit rule prevented error", + "Indentation rule ensured match worked", + "Validation caught malformed JSON before commit" + ], + "answer_confidence": "high", + "unused_bullets": [], + "missing_guidance": [] +} +``` + +### Phase 3: REFLECT + +**Goal:** Evaluate effectiveness and propose improvements + +1. Review generator output and actual outcomes +2. Mark helpful bullets (increment `helpful_count`) +3. Mark harmful bullets (increment `harmful_count`) +4. Identify missing patterns (propose new bullets) +5. Identify redundancies (propose merges) +6. Identify obsolete guidance (propose deprecations) + +**Output format:** + +```json +{ + "proposed_deltas": { + "new_bullets": [ + { + "id": "bullet-2025-10-25-013", + "title": "Validate JSON before committing to avoid CI failures", + "content": "Always run JSON validation (using jq or validation script) before committing JSON files. Common issues: trailing commas, unescaped strings, missing brackets. Catching these locally prevents CI pipeline failures and saves time.", + "tags": ["json", "validation", "git.commit", "best_practice"], + "evidence": [ + { + "type": "execution", + "ref": "task-2025-10-25-042", + "note": "Prevented malformed JSON from being committed" + } + ], + "confidence": "medium", + "scope": "global" + } + ], + "counters": [ + {"id": "bullet-2025-10-25-001", "helpful_delta": 1}, + {"id": "bullet-2025-10-25-012", "helpful_delta": 1}, + {"id": "bullet-2025-10-25-006", "helpful_delta": 1} + ] + }, + "reflection_notes": "All three retrieved bullets were directly helpful. Discovered new pattern about JSON validation timing that should be captured." 
+} +``` + +### Phase 4: CURATE + +**Goal:** Normalize and deduplicate proposed deltas + +1. Check for duplicate bullets (compare with existing playbook) +2. Normalize tags (use existing tag taxonomy) +3. Ensure rationales are clear +4. Validate delta against schema +5. Resolve conflicts (editing + deprecating same bullet) + +**Deduplication criteria:** + +- Semantic similarity > 80% (similar title + content) +- High tag overlap (>70% shared tags) +- Same intent/purpose + +**Output format:** + +```json +{ + "clean_delta": { + "delta_id": "delta-2025-10-25-001", + "timestamp": "2025-10-25T12:34:56Z", + "author": "agent", + "rationale": "Captured JSON validation pattern from successful task execution", + "task_context": "Editing plugin metadata.json file", + "reviewed": false, + "new_bullets": [...], + "counters": [...] + }, + "curation_notes": "No duplicates found. New bullet is distinct from existing validation guidance.", + "requires_human_review": false +} +``` + +### Phase 5: MERGE + +**Goal:** Apply delta to playbook + +1. Validate delta using `validate_delta.py` +2. Apply operations in order: + - Update counters + - Add new bullets + - Apply edits + - Execute merges (combine + archive merged bullets) + - Process deprecations (move to archived status) +3. Update metadata (total_bullets, last_curated timestamp) +4. Save updated playbook +5. 
(Optional) Commit to version control + +--- + +## Retrieval Algorithm + +### Simple Tag-Based Retrieval + +```python +def retrieve_bullets(task_tags: List[str], playbook: Dict) -> List[Dict]: + """ + Retrieve relevant bullets for a task based on tag overlap and effectiveness + """ + bullets = playbook['bullets'] + scored = [] + + for bullet in bullets: + if bullet['status'] != 'active': + continue # Skip deprecated/archived + + # Calculate tag overlap + bullet_tags = set(bullet['tags']) + task_tags_set = set(task_tags) + overlap = len(bullet_tags & task_tags_set) + + if overlap == 0: + continue # No relevance + + # Calculate success rate + total = bullet['helpful_count'] + bullet['harmful_count'] + success_rate = bullet['helpful_count'] / total if total > 0 else 0.5 + + # Confidence weighting + confidence_weight = {'high': 1.0, 'medium': 0.8, 'low': 0.6} + conf = confidence_weight.get(bullet.get('confidence', 'medium'), 0.8) + + # Combined score + score = overlap * success_rate * conf + scored.append((score, bullet)) + + # Sort by score descending, take top 10 + scored.sort(reverse=True, key=lambda x: x[0]) + return [bullet for score, bullet in scored[:10]] +``` + +### Advanced: Semantic Similarity + +For more sophisticated retrieval, use embedding-based similarity: + +1. Embed task description +2. Embed bullet title + content +3. Compute cosine similarity +4. Combine with tag-based score +5. Rank and select top-k + +--- + +## Delta Merge Algorithm + +### Deterministic Merge Process + +```python +def merge_delta(playbook: Dict, delta: Dict) -> Dict: + """ + Apply delta operations to playbook deterministically + """ + # 1. Update counters + for counter in delta.get('counters', []): + bullet = find_bullet(playbook, counter['id']) + if bullet: + bullet['helpful_count'] += counter.get('helpful_delta', 0) + bullet['harmful_count'] += counter.get('harmful_delta', 0) + bullet['last_updated'] = now() + + # 2. 
Add new bullets + for new_bullet in delta.get('new_bullets', []): + new_bullet['created'] = now() + new_bullet['last_updated'] = now() + playbook['bullets'].append(new_bullet) + + # 3. Apply edits + for edit in delta.get('edits', []): + bullet = find_bullet(playbook, edit['id']) + if bullet: + bullet.update(edit['set']) + bullet['last_updated'] = now() + + # 4. Execute merges + for merge in delta.get('merges', []): + keep_bullet = find_bullet(playbook, merge['keep_id']) + for merge_id in merge['merge_ids']: + merged = find_bullet(playbook, merge_id) + if merged: + # Combine counters + keep_bullet['helpful_count'] += merged['helpful_count'] + keep_bullet['harmful_count'] += merged['harmful_count'] + # Archive merged bullet + merged['status'] = 'archived' + merged['deprecation_reason'] = f"Merged into {merge['keep_id']}" + if 'merged_content' in merge: + keep_bullet['content'] = merge['merged_content'] + keep_bullet['last_updated'] = now() + + # 5. Process deprecations + for deprecation in delta.get('deprecations', []): + bullet = find_bullet(playbook, deprecation['id']) + if bullet: + bullet['status'] = 'deprecated' + bullet['deprecation_reason'] = deprecation['reason'] + bullet['last_updated'] = now() + + # 6. 
Update metadata + playbook['metadata']['total_bullets'] = len(playbook['bullets']) + playbook['metadata']['active_bullets'] = sum( + 1 for b in playbook['bullets'] if b['status'] == 'active' + ) + playbook['metadata']['last_curated'] = now() + + return playbook +``` + +--- + +## Practical Examples + +### Example 1: Simple Counter Update + +**Scenario:** Used bullet-2025-10-25-001 successfully + +**Delta:** + +```json +{ + "counters": [ + { + "id": "bullet-2025-10-25-001", + "helpful_delta": 1, + "evidence": { + "type": "execution", + "ref": "task-edit-config", + "note": "Read-before-edit prevented error" + } + } + ] +} +``` + +### Example 2: New Pattern Discovered + +**Scenario:** Found that git push needs retry logic for network failures + +**Delta:** + +```json +{ + "new_bullets": [ + { + "id": "bullet-2025-10-25-013", + "title": "Retry git push with exponential backoff on network failures", + "content": "When git push fails with network errors (not auth errors), retry up to 4 times with exponential backoff: 2s, 4s, 8s, 16s. This handles transient network issues without overwhelming the server. Check error message to distinguish network vs auth failures.", + "tags": ["git.push", "retry", "network", "error_handling"], + "evidence": [ + { + "type": "execution", + "ref": "commit-abc123", + "note": "Push succeeded on 2nd retry after network timeout" + } + ], + "confidence": "high", + "scope": "global", + "helpful_count": 0, + "harmful_count": 0, + "links": ["bullet-2025-10-25-003"] + } + ] +} +``` + +### Example 3: Merge Redundant Bullets + +**Scenario:** Two bullets say similar things about JSON validation + +**Delta:** + +```json +{ + "merges": [ + { + "keep_id": "bullet-2025-10-25-006", + "merge_ids": ["bullet-2025-10-25-999"], + "rationale": "Both bullets address JSON validation before commit. bullet-006 is more comprehensive and has higher helpful_count.", + "merged_content": "Always validate JSON files before committing. Use validation scripts or jq. 
Common issues: trailing commas, unescaped strings, missing brackets. For plugin marketplace, also check against schema. Validation prevents CI failures and saves time." + } + ] +} +``` + +### Example 4: Deprecate Obsolete Guidance + +**Scenario:** Old bullet says to use deprecated API + +**Delta:** + +```json +{ + "deprecations": [ + { + "id": "bullet-2024-08-15-042", + "reason": "API v1 was deprecated. All code now uses API v2.", + "replacement_id": "bullet-2025-10-25-088" + } + ] +} +``` + +--- + +## Usage Guidelines + +### When to Grow vs Refine + +**Grow (add new bullets):** +- Discovered new useful pattern +- Found edge case not covered +- Learned domain-specific heuristic +- Tool usage recipe emerged +- Default mode + +**Refine (merge/deprecate):** +- Clear redundancy between bullets +- Proven obsolescence (API changed, tool deprecated) +- High context pressure (playbook too large) +- Conflicting guidance (need to resolve) + +### Evidence Quality + +**High confidence evidence:** +- `execution`: Tool success/failure logs +- `validation`: Test results, schema validation +- `test_result`: Automated test outcomes + +**Medium confidence evidence:** +- `user_feedback`: User confirmed it helped +- `documentation`: Official docs support this + +**Low confidence evidence:** +- `low_confidence`: Hunch or untested hypothesis +- No evidence: Mark bullet with `confidence: "low"` + +### Tag Taxonomy + +Use hierarchical tags with dots: + +- `tool.bash`, `tool.edit`, `tool.read` +- `git.push`, `git.commit`, `git.fetch` +- `api.github`, `api.github.ratelimit` +- `domain.plugin_marketplace`, `domain.web_scraping` +- `error_handling`, `retry`, `validation` +- `antipattern`, `best_practice`, `critical` + +--- + +## Validation Workflow + +Before merging a delta: + +```bash +# 1. Validate delta structure +python .claude/skills/ace-context-engineering/scripts/validate_delta.py \ + proposed_delta.json \ + --playbook .claude/skills/ace-context-engineering/playbook.json + +# 2. 
If valid, review output +# 3. If approved, merge (apply delta operations) +# 4. Commit updated playbook to version control +``` + +--- + +## Integration with Context System + +### Automatic Reflection Triggers + +Consider proposing deltas after: + +1. **Task completion** (if new patterns emerged) +2. **Error recovery** (if guidance was missing or wrong) +3. **Tool failure** (if unexpected behavior occurred) +4. **User correction** (if user pointed out mistake) + +### TodoWrite Integration + +When using TodoWrite, consider ACE reflection as final step: + +``` +[ ] Complete feature X +[ ] Run tests +[ ] Fix any errors +[ ] Propose ACE delta for patterns discovered +``` + +### Feedback Loop + +``` +Task → Retrieve Bullets → Execute → Observe Outcome → Reflect → Propose Delta → Curate → Merge → Updated Playbook + ↑ + └─ Next Task +``` + +--- + +## Limitations & Future Work + +**Current limitations:** + +- Manual retrieval (no automatic semantic search) +- No A/B testing (can't compare with/without bullet) +- Simple scoring (no learned weights) +- No conflict resolution strategy (relies on human review) + +**Future enhancements:** + +- Embedding-based semantic retrieval +- Automated counter updates from test results +- Learned retrieval and ranking models +- Automatic duplicate detection +- Impact analysis (measure bullet effectiveness) +- Cross-project bullet sharing + +--- + +## Quick Reference + +### Read playbook + +```bash +cat .claude/skills/ace-context-engineering/playbook.json | jq '.bullets[] | {id, title, tags}' +``` + +### Find bullets by tag + +```bash +cat playbook.json | jq '.bullets[] | select(.tags[] | contains("tool.edit"))' +``` + +### Validate delta + +```bash +python scripts/validate_delta.py my_delta.json --playbook playbook.json +``` + +### Check bullet effectiveness + +```bash +cat playbook.json | jq '.bullets[] | {id, title, helpful: .helpful_count, harmful: .harmful_count}' +``` + +--- + +**End of skill.md** diff --git 
a/skills/curate-delta/SKILL.md b/skills/curate-delta/SKILL.md new file mode 100644 index 0000000..9ae3f42 --- /dev/null +++ b/skills/curate-delta/SKILL.md @@ -0,0 +1,414 @@ +--- +name: curate-delta +description: Synthesize Reflector insights into structured delta proposals for playbook updates, following ACE paper's Curator architecture +allowed-tools: Read +--- + +# Curate Delta Proposal + +You are the Curator component of the ACE (Agentic Context Engineering) system. Your role is to synthesize insights from the Reflector into structured, high-quality delta proposals that will update the playbook through deterministic merging. + +## Input Format + +You will receive Reflector output containing: +- Task metadata (instruction, apps, outcome) +- Execution feedback (success/failure, error analysis) +- Proposed bullets from Reflector +- Existing playbook state +- Bullet usage feedback (helpful/unhelpful) + +## Your Responsibilities + +### 1. Synthesize Insights +- Review the Reflector's analysis and proposed bullets +- Assess the quality and specificity of each proposed bullet +- Check for redundancy with existing playbook bullets +- Validate that bullets are actionable and generalizable + +### 2. 
Structure Delta Proposal +Generate a JSON delta with these components: + +**new_bullets**: New insights to add to the playbook +- Must be specific, actionable, and evidence-backed +- Should generalize beyond the specific task +- Include concrete code examples when applicable +- Tag appropriately for retrieval + +**counters**: Update usage statistics for existing bullets +- Increment `helpful_count` for bullets that aided success +- Increment `unhelpful_count` for bullets that misled +- Use bullet IDs from the playbook + +**edits**: Modifications to existing bullets (optional) +- Clarify ambiguous language +- Add missing edge cases +- Improve code examples +- Merge near-duplicates + +**merges**: Combine redundant bullets (optional) +- Identify bullets with >80% semantic overlap +- Preserve best content from both +- Maintain evidence provenance + +**deprecations**: Mark outdated bullets (optional) +- Identify bullets contradicted by new evidence +- Mark as deprecated rather than delete (preserve history) + +## Output Format + +**CRITICAL: You must return ONLY valid JSON with no additional text, explanation, or commentary before or after the JSON.** + +Return ONLY this JSON object structure: + +```json +{ + "delta": { + "new_bullets": [ + { + "id": "bullet-YYYY-MM-DD-HHMMSS", + "title": "<specific, actionable title>", + "content": "<detailed guidance with concrete code examples>", + "tags": ["app.<app_name>", "<tag>", "<tag>"], + "evidence": [ + { + "type": "execution", + "ref": "<task_id>", + "note": "Discovered from <task failure or success>" + } + ], + "confidence": "high|medium|low", + "scope": "app|global" + } + ], + "counters": { + "<bullet_id>": { + "helpful_count": 1, + "unhelpful_count": 0 + } + }, + "edits": [ + { + "bullet_id": "<bullet_id>", + "field": "content|title|tags", + "old_value": "...", + "new_value": "...", + "reason": "Why this edit improves the bullet" + } + ], + "merges": [ + { + "primary_id": "<bullet_id_to_keep>", + "secondary_ids": ["<redundant_bullet_id>"], + "reason": "Why these bullets are redundant" + } + ], + "deprecations": [ + { + "bullet_id": "<bullet_id>", + "reason": "Why this bullet is outdated/incorrect" + } + ] + }, + 
"curation_notes": [ + "Accepted 1 new bullet with high confidence", + "Updated counters for 3 helpful bullets", + "Rejected 1 duplicate bullet (similar to existing bullet-123)" + ], + "quality_score": 0.85 +} +``` + +## Quality Guidelines + +### ACCEPT bullets that are: +- **Specific**: Reference concrete APIs, parameters, or patterns +- **Actionable**: Provide clear guidance with code examples +- **Evidence-backed**: Link to specific task failures/successes +- **Generalizable**: Apply beyond the specific task instance +- **Non-redundant**: Add new information not in existing bullets + +### REJECT bullets that are: +- **Vague**: Generic advice without specifics ("Be careful with X") +- **Task-specific**: Only apply to one unique task instance +- **Redundant**: Duplicate existing bullets (>80% semantic overlap) +- **Incorrect**: Contradict known-good patterns +- **Unhelpful**: Provide advice that doesn't address root cause + +### Examples of GOOD vs BAD Bullets + +#### GOOD: Specific, actionable, code-backed +``` +Title: "Spotify: Use show_playlist_songs() for each playlist separately" +Content: "Spotify API requires fetching playlist songs individually: +1. Get playlists: apis.spotify.show_playlist_library(token) +2. For each playlist: apis.spotify.show_playlist_songs(token, playlist_id) +3. Aggregate results across all playlists +Common error: Calling show_playlist_library() expecting nested songs." +Tags: ["app.spotify", "api", "aggregation"] +Scope: app +Confidence: high +``` + +#### BAD: Vague, no code, not actionable +``` +Title: "Review Spotify API logic carefully" +Content: "When working with Spotify, make sure to check the API documentation and verify your logic is correct." +Tags: ["app.spotify", "debugging"] +Scope: app +Confidence: low +``` + +#### GOOD: Global pattern with concrete guidance +``` +Title: "Always call login() before any app API methods" +Content: "All app APIs require authentication first: +1. 
response = apis.<app>.login(username, password) +2. token = response['access_token'] +3. Use token in subsequent API calls +Exception: apis.supervisor methods don't need login." +Tags: ["authentication", "api", "global"] +Scope: global +Confidence: high +``` + +#### BAD: Task-specific, not generalizable +``` +Title: "For task 82e2fac_1, call Spotify login" +Content: "This specific task needs you to login to Spotify first." +Tags: ["app.spotify", "task-specific"] +Scope: app +Confidence: low +``` + +## Handling Reflector Proposals + +When the Reflector proposes a new bullet: + +1. **Validate Quality** + - Does it have a specific title? + - Does it include concrete code examples? + - Is the guidance actionable? + +2. **Check for Redundancy** + - Compare semantic similarity with existing bullets + - If >80% overlap, consider merging instead of adding + - If improving an existing bullet, use `edits` instead of `new_bullets` + +3. **Assess Confidence** + - **High**: Backed by clear failure pattern + working fix + - **Medium**: Reasonable hypothesis, needs more validation + - **Low**: Speculative, insufficient evidence + +4. 
**Determine Scope** + - **app**: Specific to one app (e.g., Spotify, Gmail) + - **global**: Applies across all apps (e.g., login patterns, error handling) + +## Counter Updates + +Use bullet feedback from execution to update counters: + +- **helpful**: Bullet was retrieved and task succeeded +- **unhelpful**: Bullet was retrieved but task still failed +- **unused**: Bullet not retrieved for this task + +Update format: +```json +"counters": { + "appworld-spotify-005": { + "helpful_count": 1 + }, + "appworld-login-001": { + "helpful_count": 1 + } +} +``` + +## Edge Cases + +### No New Bullets Needed +If the Reflector's proposals are low-quality or redundant: +```json +{ + "delta": { + "new_bullets": [], + "counters": { /* update existing bullet counters */ } + }, + "curation_notes": [ + "No new bullets accepted (proposals too vague)", + "Updated counters for existing bullets" + ], + "quality_score": 0.5 +} +``` + +### Bullet Improvement +If an existing bullet needs improvement: +```json +{ + "delta": { + "new_bullets": [], + "edits": [ + { + "bullet_id": "appworld-spotify-005", + "field": "content", + "old_value": "Get user playlists and track details separately", + "new_value": "Get user playlists with show_playlist_library(), then fetch songs for each playlist using show_playlist_songs(playlist_id)", + "reason": "Added specific API method names for clarity" + } + ] + }, + "curation_notes": ["Improved existing bullet with API details"], + "quality_score": 0.8 +} +``` + +### Bullet Deprecation +If new evidence contradicts an old bullet: +```json +{ + "delta": { + "deprecations": [ + { + "bullet_id": "appworld-old-pattern-123", + "reason": "Contradicted by successful executions using new pattern" + } + ] + }, + "curation_notes": ["Deprecated outdated bullet"], + "quality_score": 0.7 +} +``` + +## Quality Score Calculation + +Assess the overall quality of the delta: +- **1.0**: All bullets high-quality, specific, non-redundant +- **0.8-0.9**: Good bullets with minor 
improvements possible +- **0.5-0.7**: Some issues (vague guidance, minor redundancy) +- **0.3-0.5**: Significant issues (task-specific, duplicate) +- **0.0-0.3**: Poor quality (no actionable guidance) + +## Task Examples + +### Example 1: Successful Task with Helpful Bullets + +**Input:** +``` +Task: Find most-liked song in Spotify playlists +Outcome: Success (TGC=1.0) +Bullets Used: appworld-spotify-005, appworld-login-001, appworld-complete-003 +Reflector Proposal: None (success, no new insights) +``` + +**Output:** +```json +{ + "delta": { + "new_bullets": [], + "counters": { + "appworld-spotify-005": {"helpful_count": 1}, + "appworld-login-001": {"helpful_count": 1}, + "appworld-complete-003": {"helpful_count": 1} + } + }, + "curation_notes": [ + "Task succeeded with existing bullets", + "Updated counters for 3 helpful bullets" + ], + "quality_score": 1.0 +} +``` + +### Example 2: Failed Task with New Insight + +**Input:** +``` +Task: Find least-played song in Spotify albums +Outcome: Failure (TGC=0.0, error: KeyError 'play_count') +Bullets Used: appworld-spotify-005, appworld-login-001 +Reflector Proposal: { + "title": "Spotify: Verify field names before accessing nested data", + "content": "Spotify song objects may not have all fields...", + "tags": ["app.spotify", "error-handling"], + "confidence": "medium" +} +``` + +**Output:** +```json +{ + "delta": { + "new_bullets": [ + { + "id": "bullet-2025-10-27-120000", + "title": "Spotify: Verify field names before accessing nested data", + "content": "Spotify song objects may not have all expected fields. 
Use .get() with defaults:\n\nplay_count = song.get('play_count', 0)\nlikes = song.get('likes', 0)\n\nCommon missing fields: play_count, explicit, preview_url", + "tags": ["app.spotify", "error-handling", "defensive"], + "evidence": [ + { + "type": "execution", + "ref": "task_123", + "note": "KeyError when accessing 'play_count' directly" + } + ], + "confidence": "high", + "scope": "app" + } + ], + "counters": { + "appworld-spotify-005": {"unhelpful_count": 1}, + "appworld-login-001": {"helpful_count": 1} + } + }, + "curation_notes": [ + "Accepted 1 new bullet with improved content and code example", + "Updated counters: 1 helpful, 1 unhelpful" + ], + "quality_score": 0.9 +} +``` + +### Example 3: Rejecting Redundant Bullet + +**Input:** +``` +Reflector Proposal: { + "title": "Always login before using APIs", + "content": "Call login() first", + "tags": ["authentication"] +} + +Existing Bullet: { + "id": "appworld-login-001", + "title": "Always call login() before using any app API methods", + "content": "All app APIs require authentication first: response = apis.<app>.login()...", + "tags": ["authentication", "api", "global"] +} +``` + +**Output:** +```json +{ + "delta": { + "new_bullets": [], + "counters": { /* existing counters */ } + }, + "curation_notes": [ + "Rejected duplicate bullet (>95% overlap with appworld-login-001)", + "No new bullets needed" + ], + "quality_score": 0.6 +} +``` + +## Important Notes + +1. **Prioritize Quality over Quantity**: Better to reject vague bullets than pollute the playbook +2. **Evidence Matters**: Always link bullets to specific execution failures/successes +3. **Generalize Appropriately**: Balance specificity with reusability +4. **Maintain Provenance**: Track where insights came from via evidence refs +5. **Update Counters Reliably**: Honest feedback improves retrieval over time + +Your goal is to maintain a high-quality, non-redundant playbook that genuinely improves agent performance through targeted, evidence-backed guidance. 
+ +**REMINDER: Output ONLY valid JSON with the structure described above. No explanations, no commentary, just the JSON object.** diff --git a/skills/generate-appworld-code/SKILL.md b/skills/generate-appworld-code/SKILL.md new file mode 100644 index 0000000..d275547 --- /dev/null +++ b/skills/generate-appworld-code/SKILL.md @@ -0,0 +1,232 @@ +--- +name: generate-appworld-code +description: Generate Python code to solve AppWorld agent tasks using playbook bullet guidance. Use when the AppWorld executor needs executable Python code for tasks involving Spotify, Venmo, Gmail, Calendar, Contacts, or other AppWorld APIs. +allowed-tools: Read +--- + +# Generate AppWorld Code + +Generate executable Python code for AppWorld agent tasks, applying learned strategies from the ACE playbook. + +## Purpose + +When the AppWorld executor encounters a task, it calls this Skill with: +- Task instruction (natural language) +- Available apps (e.g., ['spotify', 'venmo']) +- Playbook bullets (learned strategies to apply) + +You generate Python code that: +1. Solves the task using AppWorld APIs +2. Applies bullet guidance strategies +3. Handles errors gracefully +4. Calls `apis.supervisor.complete_task()` when done + +## Input Format + +```json +{ + "instruction": "What is the title of the most-liked song in my Spotify playlists", + "apps": ["spotify"], + "strategies": [ + "Always login before API calls", + "Handle pagination for large result sets" + ], + "bullets": [ + { + "id": "bullet-xxx", + "title": "Spotify login pattern", + "content": "Login to Spotify using apis.spotify.login() with credentials..." 
+ } + ] +} +``` + +## AppWorld API Patterns + +### Spotify +```python +# Login +response = apis.spotify.login(username="user@example.com", password="password") +token = response["access_token"] + +# Get playlists +playlists = apis.spotify.show_playlist_library(access_token=token) + +# Get songs in playlist +songs = apis.spotify.show_playlist_songs( + access_token=token, + playlist_id=playlists[0]["id"] +) +``` + +### Venmo +```python +# Login +response = apis.venmo.login(username="user@example.com", password="password") +token = response["access_token"] + +# Get friends +friends = apis.venmo.show_friends(access_token=token) + +# Send payment +apis.venmo.send_payment( + access_token=token, + recipient_id=friend["id"], + amount=10.00, + note="Payment note" +) +``` + +### Gmail +```python +# Login +response = apis.gmail.login(username="user@example.com", password="password") +token = response["access_token"] + +# Fetch emails +emails = apis.gmail.fetch_emails( + access_token=token, + max_results=10, + query="is:unread" +) + +# Send email +apis.gmail.send_email( + access_token=token, + to="recipient@example.com", + subject="Subject", + body="Email body" +) +``` + +### Contacts +```python +# Get contacts +contacts = apis.contacts.show_contacts() + +# Add contact +apis.contacts.add_contact( + name="John Doe", + email="john@example.com", + phone="+1234567890" +) +``` + +### Calendar +```python +# Get events +events = apis.calendar.show_events( + start_date="2025-01-01", + end_date="2025-12-31" +) + +# Create event +apis.calendar.create_event( + title="Meeting", + start_time="2025-10-26T14:00:00", + end_time="2025-10-26T15:00:00" +) +``` + +## Code Generation Rules + +1. **Always complete task**: Call `apis.supervisor.complete_task()` at the end +2. **Apply bullet strategies**: Use patterns from playbook bullets +3. **Handle errors**: Use try/except for API calls +4. **Be specific**: Don't use placeholders - generate actual implementations +5. 
**No explanations**: Return ONLY executable Python code + +## Example Generation + +**Input**: +```json +{ + "instruction": "Send an email to john@example.com saying hello", + "apps": ["gmail"], + "strategies": ["Login before API calls", "Validate email addresses"], + "bullets": [...] +} +``` + +**Output**: +```python +# Gmail task: Send email to john@example.com +# Applying strategies: Login before API calls, Validate email addresses + +try: + # Login to Gmail + response = apis.gmail.login(username="user@example.com", password="password") + token = response["access_token"] + + # Validate recipient + recipient = "john@example.com" + if "@" not in recipient: + raise ValueError(f"Invalid email: {recipient}") + + # Send email + apis.gmail.send_email( + access_token=token, + to=recipient, + subject="Hello", + body="Hello from AppWorld!" + ) + + # Complete task + apis.supervisor.complete_task() + +except Exception as e: + print(f"Error: {str(e)}") + raise +``` + +## Credentials + +AppWorld provides test credentials automatically. 
Use these common patterns: +- `username="user@example.com"` +- `password="password"` +- Tokens are returned from login APIs + +## Common Patterns from Playbook + +### Pattern: Friend Management (Venmo/Contacts) +```python +# Get current friends +current_friends = apis.venmo.show_friends(access_token=token) +current_ids = {f["id"] for f in current_friends} + +# Get target friends (Venmo IDs recorded on contacts) +target_contacts = apis.contacts.show_contacts() +target_ids = {c["venmo_id"] for c in target_contacts if c.get("venmo_id")} + +# Add missing +for target_id in target_ids - current_ids: + apis.venmo.add_friend(access_token=token, user_id=target_id) + +# Remove extra +for current_id in current_ids - target_ids: + apis.venmo.remove_friend(access_token=token, user_id=current_id) +``` + +### Pattern: Aggregation (Spotify/Media) +```python +# Get all playlists +playlists = apis.spotify.show_playlist_library(access_token=token) + +all_songs = [] +for playlist in playlists: + songs = apis.spotify.show_playlist_songs( + access_token=token, + playlist_id=playlist["id"] + ) + all_songs.extend(songs) + +# Find most-liked +most_liked = max(all_songs, key=lambda s: s.get("likes", 0)) +result = most_liked["title"] +``` + +## Response Format + +Return Python code as plain text (no markdown formatting, no explanations). + +The code should be immediately executable in the AppWorld environment. diff --git a/skills/playbook.json b/skills/playbook.json new file mode 100644 index 0000000..4757ac5 --- /dev/null +++ b/skills/playbook.json @@ -0,0 +1,260 @@ +{ + "version": "1.0.0", + "schema_version": "1.0.0", + "last_updated": "2025-10-26T23:00:00Z", + "bullets": [ + { + "id": "appworld-login-001", + "title": "Always call login() before using any app API methods", + "content": "Every AppWorld app requires authentication. Always call app.login() as the FIRST operation before making any API calls. Example: spotify.login(), venmo.login(), gmail.login(). 
Skipping login will cause 'access_token' errors.", + "tags": [ + "app.spotify", + "app.venmo", + "app.gmail", + "app.general", + "authentication", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 125, + "harmful_count": 3, + "last_updated": "2025-10-27T12:33:11.269633", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-search-002", + "title": "Use search_* methods not get_* or fetch_* for querying data", + "content": "AppWorld APIs use search_* naming convention (e.g., spotify.search_tracks(), venmo.search_transactions()). Don't use get_* or fetch_* - those don't exist. Always check available methods for each app.", + "tags": [ + "app.spotify", + "app.venmo", + "app.gmail", + "api_naming", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 123, + "harmful_count": 2, + "last_updated": "2025-10-27T02:59:40.558445", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-complete-003", + "title": "Always call apis.supervisor.complete_task() at the end", + "content": "Every AppWorld task MUST end with apis.supervisor.complete_task() to signal completion. This is required for the test framework to evaluate results. Missing this call will cause all tests to fail.", + "tags": [ + "app.general", + "task_completion", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 125, + "harmful_count": 3, + "last_updated": "2025-10-27T12:33:11.269631", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-error-004", + "title": "Check API response structure before accessing nested fields", + "content": "AppWorld API responses can have varying structures. 
Always check if fields exist before accessing them (e.g., if 'items' in response, check response.get('data'), etc.). Use .get() with defaults to avoid NoneType errors.", + "tags": [ + "app.general", + "error_handling", + "defensive" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 125, + "harmful_count": 2, + "last_updated": "2025-10-27T07:28:18.738967", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-spotify-005", + "title": "Spotify: Get user playlists and track details separately", + "content": "To find songs in Spotify playlists: (1) Call spotify.get_user_playlists() to get playlist IDs, (2) For each playlist, call spotify.get_playlist_tracks(playlist_id) to get track details including likes/plays.", + "tags": [ + "app.spotify", + "playlists", + "pattern" + ], + "evidence": [], + "links": [ + "appworld-login-001" + ], + "confidence": "high", + "scope": "app", + "prerequisites": [ + "appworld-login-001" + ], + "author": "seed", + "status": "active", + "helpful_count": 125, + "harmful_count": 3, + "last_updated": "2025-10-27T12:33:11.269627", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "bullet-2025-10-27-012842", + "title": "Verify general API logic and requirements", + "content": "When implementing general operations: Check task logic and requirements; Missing login() call for general", + "tags": [ + "logic", + "debugging", + "api", + "app.general" + ], + "evidence": [ + { + "type": "execution", + "ref": "82e2fac_1", + "note": "Task failed with logic_error: Tests failed: 1/2" + } + ], + "helpful_count": 0, + "harmful_count": 0, + "confidence": "medium", + "scope": "app", + "prerequisites": [], + "author": "reflector", + "status": "active", + "created": "2025-10-27T01:28:42.340996", + "last_updated": "2025-10-27T01:28:42.340997", + "links": [] + }, + { + "id": "bullet-2025-10-27-012856", + "title": "Verify venmo API logic 
and requirements", + "content": "When implementing venmo operations: Check task logic and requirements; Missing login() call for venmo", + "tags": [ + "logic", + "debugging", + "api", + "app.venmo" + ], + "evidence": [ + { + "type": "execution", + "ref": "2a163ab_1", + "note": "Task failed with logic_error: Tests failed: 1/6" + } + ], + "helpful_count": 0, + "harmful_count": 0, + "confidence": "medium", + "scope": "app", + "prerequisites": [], + "author": "reflector", + "status": "active", + "created": "2025-10-27T01:28:56.281941", + "last_updated": "2025-10-27T01:28:56.281946", + "links": [] + }, + { + "id": "bullet-2025-10-27-025940", + "title": "Review general API implementation", + "content": "Task 'What is the title of the most-liked song in my Spotify playlists.' failed with logic_error. Review API usage and error handling.", + "tags": [ + "api", + "debugging", + "app.general" + ], + "evidence": [ + { + "type": "execution", + "ref": "fallback_reflection", + "note": "Fallback bullet (LLM reflection unavailable)" + } + ], + "confidence": "low", + "scope": "app", + "created": "2025-10-27T02:59:40.558459", + "last_updated": "2025-10-27T12:33:11.269624", + "helpful_count": 2, + "harmful_count": 1, + "status": "active" + }, + { + "id": "bullet-2025-10-27-072605", + "title": "Review general API implementation", + "content": "Task 'What is the title of the most-liked song in my Spotify playlists.' failed with logic_error. 
Review API usage and error handling.", + "tags": [ + "api", + "debugging", + "app.general" + ], + "evidence": [ + { + "type": "execution", + "ref": "fallback_reflection", + "note": "Fallback bullet (LLM reflection unavailable)" + } + ], + "confidence": "low", + "scope": "app", + "created": "2025-10-27T07:28:18.738970", + "last_updated": "2025-10-27T12:33:11.269629", + "helpful_count": 0, + "harmful_count": 1, + "status": "active" + }, + { + "id": "bullet-2025-10-27-123311", + "title": "Verify spotify API logic and requirements", + "content": "When implementing spotify operations: Check task logic and requirements; Missing login() call for spotify", + "tags": [ + "logic", + "debugging", + "api", + "app.spotify" + ], + "evidence": [ + { + "type": "execution", + "ref": "82e2fac_1", + "note": "Task failed with logic_error" + } + ], + "helpful_count": 0, + "harmful_count": 0, + "confidence": "low", + "scope": "app", + "prerequisites": [], + "author": "reflector_fallback", + "status": "active", + "created": "2025-10-27T12:33:11.268061", + "last_updated": "2025-10-27T12:33:11.268063", + "links": [] + } + ], + "metadata": { + "total_bullets": 10, + "active_bullets": 10, + "deprecated_bullets": 0, + "archived_bullets": 0, + "last_curated": "2025-10-27T12:33:11.269640" + } +} \ No newline at end of file diff --git a/skills/playbook_appworld_seed.json b/skills/playbook_appworld_seed.json new file mode 100644 index 0000000..621a51e --- /dev/null +++ b/skills/playbook_appworld_seed.json @@ -0,0 +1,99 @@ +{ + "version": "1.0.0", + "schema_version": "1.0.0", + "last_updated": "2025-10-26T23:00:00Z", + "bullets": [ + { + "id": "appworld-login-001", + "title": "Always call login() before using any app API methods", + "content": "Every AppWorld app requires authentication. Always call app.login() as the FIRST operation before making any API calls. Example: spotify.login(), venmo.login(), gmail.login(). 
Skipping login will cause 'access_token' errors.", + "tags": ["app.spotify", "app.venmo", "app.gmail", "app.general", "authentication", "critical"], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 100, + "harmful_count": 0, + "last_updated": "2025-10-26T23:00:00Z", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-search-002", + "title": "Use search_* methods not get_* or fetch_* for querying data", + "content": "AppWorld APIs use search_* naming convention (e.g., spotify.search_tracks(), venmo.search_transactions()). Don't use get_* or fetch_* - those don't exist. Always check available methods for each app.", + "tags": ["app.spotify", "app.venmo", "app.gmail", "api_naming", "critical"], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 100, + "harmful_count": 0, + "last_updated": "2025-10-26T23:00:00Z", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-complete-003", + "title": "Always call apis.supervisor.complete_task() at the end", + "content": "Every AppWorld task MUST end with apis.supervisor.complete_task() to signal completion. This is required for the test framework to evaluate results. Missing this call will cause all tests to fail.", + "tags": ["app.general", "task_completion", "critical"], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 100, + "harmful_count": 0, + "last_updated": "2025-10-26T23:00:00Z", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-error-004", + "title": "Check API response structure before accessing nested fields", + "content": "AppWorld API responses can have varying structures. 
Always check if fields exist before accessing them (e.g., if 'items' in response, check response.get('data'), etc.). Use .get() with defaults to avoid NoneType errors.", + "tags": ["app.general", "error_handling", "defensive"], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 100, + "harmful_count": 0, + "last_updated": "2025-10-26T23:00:00Z", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-spotify-005", + "title": "Spotify: Get user playlists and track details separately", + "content": "To find songs in Spotify playlists: (1) Call spotify.get_user_playlists() to get playlist IDs, (2) For each playlist, call spotify.get_playlist_tracks(playlist_id) to get track details including likes/plays.", + "tags": ["app.spotify", "playlists", "pattern"], + "evidence": [], + "links": ["appworld-login-001"], + "confidence": "high", + "scope": "app", + "prerequisites": ["appworld-login-001"], + "author": "seed", + "status": "active", + "helpful_count": 100, + "harmful_count": 0, + "last_updated": "2025-10-26T23:00:00Z", + "created": "2025-10-26T23:00:00Z" + } + ], + "metadata": { + "total_bullets": 5, + "active_bullets": 5, + "deprecated_bullets": 0, + "archived_bullets": 0, + "last_curated": "2025-10-26T23:00:00Z" + } +} diff --git a/skills/playbook_epoch_1.json b/skills/playbook_epoch_1.json new file mode 100644 index 0000000..4757ac5 --- /dev/null +++ b/skills/playbook_epoch_1.json @@ -0,0 +1,260 @@ +{ + "version": "1.0.0", + "schema_version": "1.0.0", + "last_updated": "2025-10-26T23:00:00Z", + "bullets": [ + { + "id": "appworld-login-001", + "title": "Always call login() before using any app API methods", + "content": "Every AppWorld app requires authentication. Always call app.login() as the FIRST operation before making any API calls. Example: spotify.login(), venmo.login(), gmail.login(). 
Skipping login will cause 'access_token' errors.", + "tags": [ + "app.spotify", + "app.venmo", + "app.gmail", + "app.general", + "authentication", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 125, + "harmful_count": 3, + "last_updated": "2025-10-27T12:33:11.269633", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-search-002", + "title": "Use search_* methods not get_* or fetch_* for querying data", + "content": "AppWorld APIs use search_* naming convention (e.g., spotify.search_tracks(), venmo.search_transactions()). Don't use get_* or fetch_* - those don't exist. Always check available methods for each app.", + "tags": [ + "app.spotify", + "app.venmo", + "app.gmail", + "api_naming", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 123, + "harmful_count": 2, + "last_updated": "2025-10-27T02:59:40.558445", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-complete-003", + "title": "Always call apis.supervisor.complete_task() at the end", + "content": "Every AppWorld task MUST end with apis.supervisor.complete_task() to signal completion. This is required for the test framework to evaluate results. Missing this call will cause all tests to fail.", + "tags": [ + "app.general", + "task_completion", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 125, + "harmful_count": 3, + "last_updated": "2025-10-27T12:33:11.269631", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-error-004", + "title": "Check API response structure before accessing nested fields", + "content": "AppWorld API responses can have varying structures. 
Always check if fields exist before accessing them (e.g., if 'items' in response, check response.get('data'), etc.). Use .get() with defaults to avoid NoneType errors.", + "tags": [ + "app.general", + "error_handling", + "defensive" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 125, + "harmful_count": 2, + "last_updated": "2025-10-27T07:28:18.738967", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-spotify-005", + "title": "Spotify: Get user playlists and track details separately", + "content": "To find songs in Spotify playlists: (1) Call spotify.get_user_playlists() to get playlist IDs, (2) For each playlist, call spotify.get_playlist_tracks(playlist_id) to get track details including likes/plays.", + "tags": [ + "app.spotify", + "playlists", + "pattern" + ], + "evidence": [], + "links": [ + "appworld-login-001" + ], + "confidence": "high", + "scope": "app", + "prerequisites": [ + "appworld-login-001" + ], + "author": "seed", + "status": "active", + "helpful_count": 125, + "harmful_count": 3, + "last_updated": "2025-10-27T12:33:11.269627", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "bullet-2025-10-27-012842", + "title": "Verify general API logic and requirements", + "content": "When implementing general operations: Check task logic and requirements; Missing login() call for general", + "tags": [ + "logic", + "debugging", + "api", + "app.general" + ], + "evidence": [ + { + "type": "execution", + "ref": "82e2fac_1", + "note": "Task failed with logic_error: Tests failed: 1/2" + } + ], + "helpful_count": 0, + "harmful_count": 0, + "confidence": "medium", + "scope": "app", + "prerequisites": [], + "author": "reflector", + "status": "active", + "created": "2025-10-27T01:28:42.340996", + "last_updated": "2025-10-27T01:28:42.340997", + "links": [] + }, + { + "id": "bullet-2025-10-27-012856", + "title": "Verify venmo API logic 
and requirements", + "content": "When implementing venmo operations: Check task logic and requirements; Missing login() call for venmo", + "tags": [ + "logic", + "debugging", + "api", + "app.venmo" + ], + "evidence": [ + { + "type": "execution", + "ref": "2a163ab_1", + "note": "Task failed with logic_error: Tests failed: 1/6" + } + ], + "helpful_count": 0, + "harmful_count": 0, + "confidence": "medium", + "scope": "app", + "prerequisites": [], + "author": "reflector", + "status": "active", + "created": "2025-10-27T01:28:56.281941", + "last_updated": "2025-10-27T01:28:56.281946", + "links": [] + }, + { + "id": "bullet-2025-10-27-025940", + "title": "Review general API implementation", + "content": "Task 'What is the title of the most-liked song in my Spotify playlists.' failed with logic_error. Review API usage and error handling.", + "tags": [ + "api", + "debugging", + "app.general" + ], + "evidence": [ + { + "type": "execution", + "ref": "fallback_reflection", + "note": "Fallback bullet (LLM reflection unavailable)" + } + ], + "confidence": "low", + "scope": "app", + "created": "2025-10-27T02:59:40.558459", + "last_updated": "2025-10-27T12:33:11.269624", + "helpful_count": 2, + "harmful_count": 1, + "status": "active" + }, + { + "id": "bullet-2025-10-27-072605", + "title": "Review general API implementation", + "content": "Task 'What is the title of the most-liked song in my Spotify playlists.' failed with logic_error. 
Review API usage and error handling.", + "tags": [ + "api", + "debugging", + "app.general" + ], + "evidence": [ + { + "type": "execution", + "ref": "fallback_reflection", + "note": "Fallback bullet (LLM reflection unavailable)" + } + ], + "confidence": "low", + "scope": "app", + "created": "2025-10-27T07:28:18.738970", + "last_updated": "2025-10-27T12:33:11.269629", + "helpful_count": 0, + "harmful_count": 1, + "status": "active" + }, + { + "id": "bullet-2025-10-27-123311", + "title": "Verify spotify API logic and requirements", + "content": "When implementing spotify operations: Check task logic and requirements; Missing login() call for spotify", + "tags": [ + "logic", + "debugging", + "api", + "app.spotify" + ], + "evidence": [ + { + "type": "execution", + "ref": "82e2fac_1", + "note": "Task failed with logic_error" + } + ], + "helpful_count": 0, + "harmful_count": 0, + "confidence": "low", + "scope": "app", + "prerequisites": [], + "author": "reflector_fallback", + "status": "active", + "created": "2025-10-27T12:33:11.268061", + "last_updated": "2025-10-27T12:33:11.268063", + "links": [] + } + ], + "metadata": { + "total_bullets": 10, + "active_bullets": 10, + "deprecated_bullets": 0, + "archived_bullets": 0, + "last_curated": "2025-10-27T12:33:11.269640" + } +} \ No newline at end of file diff --git a/skills/playbook_epoch_2.json b/skills/playbook_epoch_2.json new file mode 100644 index 0000000..fd4bcd1 --- /dev/null +++ b/skills/playbook_epoch_2.json @@ -0,0 +1,184 @@ +{ + "version": "1.0.0", + "schema_version": "1.0.0", + "last_updated": "2025-10-26T23:00:00Z", + "bullets": [ + { + "id": "appworld-login-001", + "title": "Always call login() before using any app API methods", + "content": "Every AppWorld app requires authentication. Always call app.login() as the FIRST operation before making any API calls. Example: spotify.login(), venmo.login(), gmail.login(). 
Skipping login will cause 'access_token' errors.", + "tags": [ + "app.spotify", + "app.venmo", + "app.gmail", + "app.general", + "authentication", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 118, + "harmful_count": 2, + "last_updated": "2025-10-27T01:29:30.644912", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-search-002", + "title": "Use search_* methods not get_* or fetch_* for querying data", + "content": "AppWorld APIs use search_* naming convention (e.g., spotify.search_tracks(), venmo.search_transactions()). Don't use get_* or fetch_* - those don't exist. Always check available methods for each app.", + "tags": [ + "app.spotify", + "app.venmo", + "app.gmail", + "api_naming", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 118, + "harmful_count": 2, + "last_updated": "2025-10-27T01:29:30.644915", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-complete-003", + "title": "Always call apis.supervisor.complete_task() at the end", + "content": "Every AppWorld task MUST end with apis.supervisor.complete_task() to signal completion. This is required for the test framework to evaluate results. Missing this call will cause all tests to fail.", + "tags": [ + "app.general", + "task_completion", + "critical" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 118, + "harmful_count": 2, + "last_updated": "2025-10-27T01:29:30.644909", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-error-004", + "title": "Check API response structure before accessing nested fields", + "content": "AppWorld API responses can have varying structures. 
Always check if fields exist before accessing them (e.g., if 'items' in response, check response.get('data'), etc.). Use .get() with defaults to avoid NoneType errors.", + "tags": [ + "app.general", + "error_handling", + "defensive" + ], + "evidence": [], + "links": [], + "confidence": "high", + "scope": "global", + "prerequisites": [], + "author": "seed", + "status": "active", + "helpful_count": 118, + "harmful_count": 2, + "last_updated": "2025-10-27T01:29:30.644911", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "appworld-spotify-005", + "title": "Spotify: Get user playlists and track details separately", + "content": "To find songs in Spotify playlists: (1) Call spotify.get_user_playlists() to get playlist IDs, (2) For each playlist, call spotify.get_playlist_tracks(playlist_id) to get track details including likes/plays.", + "tags": [ + "app.spotify", + "playlists", + "pattern" + ], + "evidence": [], + "links": [ + "appworld-login-001" + ], + "confidence": "high", + "scope": "app", + "prerequisites": [ + "appworld-login-001" + ], + "author": "seed", + "status": "active", + "helpful_count": 118, + "harmful_count": 2, + "last_updated": "2025-10-27T01:29:30.644907", + "created": "2025-10-26T23:00:00Z" + }, + { + "id": "bullet-2025-10-27-012842", + "title": "Verify general API logic and requirements", + "content": "When implementing general operations: Check task logic and requirements; Missing login() call for general", + "tags": [ + "logic", + "debugging", + "api", + "app.general" + ], + "evidence": [ + { + "type": "execution", + "ref": "82e2fac_1", + "note": "Task failed with logic_error: Tests failed: 1/2" + } + ], + "helpful_count": 0, + "harmful_count": 0, + "confidence": "medium", + "scope": "app", + "prerequisites": [], + "author": "reflector", + "status": "active", + "created": "2025-10-27T01:28:42.340996", + "last_updated": "2025-10-27T01:28:42.340997", + "links": [] + }, + { + "id": "bullet-2025-10-27-012856", + "title": "Verify venmo API logic 
and requirements", + "content": "When implementing venmo operations: Check task logic and requirements; Missing login() call for venmo", + "tags": [ + "logic", + "debugging", + "api", + "app.venmo" + ], + "evidence": [ + { + "type": "execution", + "ref": "2a163ab_1", + "note": "Task failed with logic_error: Tests failed: 1/6" + } + ], + "helpful_count": 0, + "harmful_count": 0, + "confidence": "medium", + "scope": "app", + "prerequisites": [], + "author": "reflector", + "status": "active", + "created": "2025-10-27T01:28:56.281941", + "last_updated": "2025-10-27T01:28:56.281946", + "links": [] + } + ], + "metadata": { + "total_bullets": 7, + "active_bullets": 7, + "deprecated_bullets": 0, + "archived_bullets": 0, + "last_curated": "2025-10-27T01:29:30.644923" + } +} \ No newline at end of file diff --git a/skills/reflect-appworld-failure/SKILL.md b/skills/reflect-appworld-failure/SKILL.md new file mode 100644 index 0000000..30a9e5b --- /dev/null +++ b/skills/reflect-appworld-failure/SKILL.md @@ -0,0 +1,200 @@ +--- +name: reflect-appworld-failure +description: Analyze AppWorld task failures to extract specific API patterns and generate actionable playbook bullets with concrete code examples +allowed-tools: Read +--- + +# Reflect on AppWorld Failure + +Analyze failed AppWorld tasks to extract specific, actionable learnings that can be added to the playbook. + +## Purpose + +When an AppWorld task fails, the Reflector calls this Skill with error details and failed code. You analyze the failure semantically and generate a high-quality bullet with: +1. Specific title describing the pattern +2. Detailed content with working code examples +3. Relevant tags for retrieval +4. 
Appropriate confidence level + +## Input Format + +The input will be a text description with sections: + +``` +# Task +{task instruction} + +## Apps +{comma-separated app names} + +## Error Type +{error type, e.g. api_misuse or logic_error} + +## Error Messages +{error messages from the failed run} + +## Failed Code Snippet +{code that failed} + +## Missing Patterns (from heuristics) +{list of missing patterns} + +## Suggested Fixes (from heuristics) +{list of suggested fixes} +``` + +## Your Analysis Process + +1. **Identify Root Cause**: What was the fundamental mistake? + - Wrong API method name? + - Missing authentication? + - Incorrect data structure access? + - Logic error? + +2. **Extract Pattern**: What general pattern does this represent? + - Is this specific to one app or does it apply to multiple? + - Is this about API order (login first)? + - Is this about method naming conventions? + - Is this about data validation? + +3. **Generate Concrete Example**: Create working code that demonstrates the CORRECT pattern + +4. **Write Actionable Bullet**: Make it specific enough that the Generator can apply it + +## Output Format + +Return a JSON object with this structure: + +```json +{ + "bullet": { + "id": "bullet-YYYY-MM-DD-HHMMSS", + "title": "{specific, actionable title}", + "content": "{detailed explanation with a working code example}", + "tags": ["app.{app_name}", "{error_category}", "{topic_tag}"], + "evidence": [ + { + "type": "execution", + "ref": "{task_id}", + "note": "{error message or failure summary}" + } + ], + "confidence": "high|medium|low", + "scope": "app|global" + } +} +``` + +## Bullet Quality Guidelines + +### GOOD Bullets (Specific and Actionable) + +**Title**: "Spotify: Use show_playlist_songs() not get_tracks()" +**Content**: "Spotify API uses show_playlist_songs(access_token, playlist_id) to retrieve tracks. The method get_tracks() does not exist. 
Example: `songs = apis.spotify.show_playlist_songs(access_token=token, playlist_id=playlist['id'])`" +**Tags**: ["app.spotify", "api_misuse", "method_names", "playlists"] + +**Title**: "Venmo: Call login() before search_transactions()" +**Content**: "Venmo API requires authentication token for all operations. Always call venmo.login() first to get access_token, then pass it to other methods. 
Example: `response = apis.venmo.login(username='user', password='pass'); token = response['access_token']; results = apis.venmo.search_transactions(access_token=token, query={'friend': 'Alice'})`" +**Tags**: ["app.venmo", "authentication", "api_order", "search"] + +### BAD Bullets (Too Generic) + +**Title**: "Verify venmo API logic and requirements" +**Content**: "When implementing venmo operations: Check task logic and requirements; Missing login() call for venmo" +**Tags**: ["logic", "debugging", "api", "app.venmo"] + +**Why Bad**: No concrete code example, vague guidance, doesn't teach the specific pattern + +## Example Analysis + +### Input: +``` +# Task +What is the title of the most-liked song in my Spotify playlists + +## Apps +spotify + +## Error Type +api_misuse + +## Error Messages +AttributeError: 'Spotify' object has no attribute 'get_tracks' + +## Failed Code Snippet +songs = spotify.get_tracks(playlist_id=pid) + +## Missing Patterns +- Use correct Spotify API methods + +## Suggested Fixes +- Check Spotify API documentation for available methods +``` + +### Your Analysis: + +1. **Root Cause**: Code used non-existent method `get_tracks()` instead of correct `show_playlist_songs()` + +2. **Pattern**: Spotify uses `show_*` naming convention for retrieval methods + +3. **Scope**: App-specific (Spotify) + +### Output: +```json +{ + "bullet": { + "id": "bullet-2025-10-27-123456", + "title": "Spotify: Use show_playlist_songs() to get tracks from playlist", + "content": "To retrieve songs from a Spotify playlist, use show_playlist_songs(access_token, playlist_id). Don't use get_tracks() - it doesn't exist. 
Example: `token = apis.spotify.login()['access_token']; playlists = apis.spotify.show_playlist_library(access_token=token); songs = [s for p in playlists for s in apis.spotify.show_playlist_songs(access_token=token, playlist_id=p['id'])]; most_liked = max(songs, key=lambda s: s['likes'])`", + "tags": ["app.spotify", "api_misuse", "method_names", "playlists", "retrieval"], + "evidence": [ + { + "type": "execution", + "ref": "spotify_task_001", + "note": "AttributeError: 'Spotify' object has no attribute 'get_tracks'" + } + ], + "confidence": "high", + "scope": "app" + } +} +``` + +## Common AppWorld Patterns to Look For + +### Authentication Order +- Most apps require login() first to get access_token +- Token must be passed to subsequent API calls + +### Method Naming Conventions +- Spotify: `show_*` for retrieval (show_playlist_songs, show_album_library) +- Venmo: `show_friends`, `send_payment`, `search_transactions` +- Gmail: `fetch_emails`, `send_email` +- Contacts: `show_contacts`, `add_contact` +- Calendar: `show_events`, `create_event` + +### Data Structure Access +- API responses may have nested structures +- Always check if keys exist before accessing +- Use `.get()` with defaults for safety + +### Aggregation Patterns +- To find "most-liked song in playlists": Get all playlists → Get songs from each → Find max by likes +- To find "most expensive transaction": Get all transactions → Find max by amount + +### Task Completion +- ALWAYS call `apis.supervisor.complete_task()` at the end +- This signals successful completion to the test framework + +## Important Rules + +1. **Be Specific**: Include actual method names, parameter names, and code examples +2. **Be Actionable**: The Generator should know exactly what to do after reading your bullet +3. **Include Working Code**: Show a complete example that demonstrates the correct pattern +4. **Tag Appropriately**: Use `app.{app_name}` (e.g. `app.spotify`) for app-specific bullets, plus semantic tags +5.
**Set Confidence**: "high" for clear, directly evidenced patterns, "medium" for plausible but unconfirmed patterns, "low" for speculative ones +6. **Return ONLY JSON**: No explanations and no Markdown code fences or other formatting around the JSON + +## Response Format + +Return the JSON object as plain text, without surrounding code fences. Make sure it's valid JSON that can be parsed directly.