From 2e92090219880fbc3e6edaab8655a7e780deb3ee Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 17:51:05 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 15 + README.md | 3 + commands/explore.md | 185 ++++++++++++ plugin.lock.json | 73 +++++ skills/gemini-imagen/SKILL.md | 193 +++++++++++++ skills/gemini-imagen/mise.toml | 3 + .../gemini-imagen/scripts/compose_images.py | 162 +++++++++++ skills/gemini-imagen/scripts/edit_image.py | 148 ++++++++++ skills/gemini-imagen/scripts/gemini_images.py | 269 ++++++++++++++++++ .../gemini-imagen/scripts/generate_image.py | 137 +++++++++ .../gemini-imagen/scripts/multi_turn_chat.py | 220 ++++++++++++++ 11 files changed, 1408 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 commands/explore.md create mode 100644 plugin.lock.json create mode 100644 skills/gemini-imagen/SKILL.md create mode 100644 skills/gemini-imagen/mise.toml create mode 100755 skills/gemini-imagen/scripts/compose_images.py create mode 100755 skills/gemini-imagen/scripts/edit_image.py create mode 100755 skills/gemini-imagen/scripts/gemini_images.py create mode 100755 skills/gemini-imagen/scripts/generate_image.py create mode 100755 skills/gemini-imagen/scripts/multi_turn_chat.py diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..490715a --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "explore-with-illustrations", + "description": "Generate and edit images using Gemini API (Nano Banana Pro). 
Specialized for creating high-quality technical illustrations, architecture diagrams, code concept visualizations, and educational content from codebases.", + "version": "1.0.0", + "author": { + "name": "Agney", + "url": "https://github.com/agneym/agneym-claude-marketplace" + }, + "skills": [ + "./skills" + ], + "commands": [ + "./commands" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..a3eacfb --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# explore-with-illustrations + +Generate and edit images using Gemini API (Nano Banana Pro). Specialized for creating high-quality technical illustrations, architecture diagrams, code concept visualizations, and educational content from codebases. diff --git a/commands/explore.md b/commands/explore.md new file mode 100644 index 0000000..fb43b1d --- /dev/null +++ b/commands/explore.md @@ -0,0 +1,185 @@ +--- +description: Analyze and visualize architecture of a codebase area +argument-hint: [directory-or-area] +allowed-tools: * +--- + +# Architecture Analysis + +Analyze the architecture of: **$ARGUMENTS** + +## Your Task + +### Setup: Create Visualization Submodule + +Before starting the analysis, create a git submodule for storing all visualization outputs: + +1. Create a directory called `visualizations/` in the user's repository root +2. Initialize it as a git submodule (if it doesn't exist already) +3. All outputs (HTML files, images, assets) will be stored in this submodule +4. The HTML file will be the main entry point for viewing the visualization + +### Workflow + +1. **Explore** the codebase area using Glob and Read to understand: + - Key components and their responsibilities + - Data flow and interactions + - Dependencies and relationships + +2. **Explain** the architecture concisely: + - Main components and their roles + - How they interact + - Key patterns or design decisions + +3. 
**Visualize** - You have complete freedom to choose the best visualization approach: + - **HTML + JavaScript**: Create interactive visualizations using any JavaScript libraries (D3.js, Chart.js, Plotly, Three.js, etc.) + - **Generated Images**: Use gemini-imagen skill scripts to generate diagrams + - **Hybrid**: Combine both - generate images and embed them in an interactive HTML page + + **CRITICAL - Image Handling Rule:** + - If you generate images (using gemini-imagen scripts), save them in the `visualizations/` submodule + - Reference these images in an HTML file (also in the submodule) + - Images should NOT be standalone - always create an HTML file that displays them + - The HTML file serves as the entry point for viewing all visualizations + +## Critical Visualization Guidelines + +### For Image Generation (using gemini-imagen) + +**NEVER be vague in image prompts. The image model cannot see the codebase.** + +1. **Specify exact positions**: "Component A at top center, Component B at bottom left" (not "components arranged logically") +2. **Label every connection**: "Arrow from A to B labeled 'POST /api/login with JWT'" (not "A calls B") +3. **Include all details**: Methods, parameters, return types, HTTP verbs, data formats +4. **Use specific colors**: "Requests in blue, responses in green, errors in red" (not "color-coded") +5. **State cardinality**: "1 to N (many)" on relationship lines (not "has many") +6. **Complete flows**: List every step sequentially with explicit labels + +### For HTML/Interactive Visualizations + +You have complete freedom to create any type of interactive visualization. Consider: + +**JavaScript Libraries** (load via CDN): +- **D3.js**: Complex data visualizations, force-directed graphs, hierarchies +- **Chart.js**: Simple charts (bar, line, pie, radar) +- **Plotly**: Interactive scientific/statistical charts +- **Three.js**: 3D visualizations +- **Mermaid.js**: Diagrams from text descriptions (flowcharts, sequence diagrams, etc.) 
+- **Cytoscape.js**: Network/graph visualizations +- **Vis.js**: Timeline, network, and graph visualizations +- Or any other library you find appropriate + +**Visualization Types:** +- Interactive architecture diagrams with clickable components +- Animated data flow visualizations +- Filterable/searchable dependency graphs +- Timeline views of execution flows +- Interactive code maps with zoom/pan +- Combined visualizations (images + interactive overlays) + +**File Structure:** +- Create `visualizations/index.html` as the main entry point +- Can use multiple HTML files if needed +- External CSS/JS files are allowed +- Reference any generated images with relative paths + +**Best Practices:** +- Use multiple files with external references when appropriate +- Include clear navigation if creating multiple pages +- Add interactivity where it enhances understanding (hover tooltips, click to expand, etc.) +- Keep it simple - this is throwaway code, don't over-engineer + +## Diagram Templates (for Image Generation) + +Choose the appropriate template based on what you discovered: + +### Architecture Diagram +``` +"Technical architecture diagram: [COMPONENT_1] at top center, [COMPONENT_2] on left middle, [COMPONENT_3] on right middle. +Arrow from [COMPONENT_1] to [COMPONENT_2] labeled '[HTTP_METHOD] [PATH] [PURPOSE]'. +Arrow from [COMPONENT_2] to [COMPONENT_3] labeled '[PROTOCOL] [DATA_TYPE]'. +[Repeat for ALL connections with explicit labels]. +Clean labeled boxes, directional arrows, white background." +``` + +### Data Flow Diagram +``` +"Data flow diagram: Step 1: [ENTITY_A] at left. Step 2: Arrow to [ENTITY_B] labeled '[METHOD] [PATH] with [DATA]'. +Step 3: Arrow back labeled '[STATUS] [RESPONSE_TYPE]'. [Continue for all steps]. +Number each step, color-code: [TYPE_1] in blue, [TYPE_2] in green. Technical style, 16:9." +``` + +### Component Relationships (UML) +``` +"UML class diagram: [CLASS_1] box at top with attributes '[ATTRS]' and methods '[METHODS]'. 
+[CLASS_2] box at bottom with '[ATTRS/METHODS]'. +[CLASS_1] to [CLASS_2]: [RELATIONSHIP] shown with [ARROW_TYPE], labeled '1 to N'. +[Repeat for all relationships]. Clean UML style." +``` + +### Code Execution Flow +``` +"Flowchart for [FUNCTION]: Start. Step 1: '[ACTION]' in blue rectangle. +Step 2: Diamond '[CONDITION]' with YES arrow to [NEXT] and NO arrow to [ALT]. +[Continue all steps]. Errors in red rounded boxes, success in green. Label all arrows." +``` + +### Database Schema +``` +"Database schema: [TABLE_1] with columns '[COL] [TYPE] [CONSTRAINTS]'. +[TABLE_2] with '[COLUMNS]'. Foreign key: [TABLE_2].[FK] → [TABLE_1].[PK] +shown with line labeled '1 to N'. [Repeat for all tables]. Show PK icons." +``` + +### API Endpoints +``` +"REST API for [SERVICE]: Endpoint 1: [METHOD] [PATH] with body {[FIELDS]} returns {[RESPONSE]} [STATUS]. +[Repeat for all endpoints]. Color-code: GET blue, POST green, PUT yellow, DELETE red. +Show full JSON examples." +``` + +## Educational Approach + +**Your goal: Create the best visualization for understanding, not just documentation.** + +Consider creative formats when appropriate: +- **Metaphors**: Database transaction as restaurant order system +- **Comics**: Function execution as sequential panels +- **Real-world scenarios**: Authentication as bouncer checking IDs +- **Analogies**: Cache as kitchen pantry with frequently-used items + +**Example creative prompt:** +> "Comic strip showing JWT auth: Panel 1: User (detective) at API Gateway (security desk). Panel 2: Gateway calls Auth Service (background check). Panel 3: Auth returns golden badge (JWT). Panel 4: User shows badge to Resource Server (VIP room). Cartoon style." 
+ +## Output + +### Creating Visualizations + +**Option 1: HTML + Interactive JavaScript** +- Create `visualizations/index.html` with your interactive visualization +- Use any JavaScript libraries via CDN +- Can create additional HTML/CSS/JS files as needed +- Reference any generated images with relative paths + +**Option 2: Generated Images (via gemini-imagen)** +- Generate diagrams using: `scripts/generate_image.py "YOUR_EXPLICIT_PROMPT" visualizations/architecture-diagram.png --size 4K --aspect 16:9` +- The script has a python and uv shebang - execute it directly instead of using python +- Save all images in the `visualizations/` submodule +- Create `visualizations/index.html` that displays the images +- If complex, create multiple focused diagrams rather than one overwhelming image +- Use descriptive filenames: `component-relationships.png`, `data-flow.png`, etc. + +**Option 3: Hybrid Approach** +- Generate images using gemini-imagen scripts (save to `visualizations/`) +- Create interactive HTML that embeds/references the images +- Add JavaScript interactivity on top (zoom, annotations, navigation, etc.) + +### Viewing the Visualization + +After creating the visualization, start a local server: + +```bash +python -m http.server --directory visualizations/ +``` + +Then open `http://localhost:8000` in a browser to view the visualization. 
diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..ec3e72a --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,73 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:agneym/agneym-claude-marketplace:plugins/explore-with-illustrations", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "1b7690938521111ee8938f3ab8f78f8f61e9b351", + "treeHash": "57bf45b2b23f3a3ff04f332342ca38658a28173c9025644ac8b5990299d1ac09", + "generatedAt": "2025-11-28T10:13:02.101831Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "explore-with-illustrations", + "description": "Generate and edit images using Gemini API (Nano Banana Pro). Specialized for creating high-quality technical illustrations, architecture diagrams, code concept visualizations, and educational content from codebases.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "a71773d123857f7c100e09d11c90c6339138ddb6ee8e4a05964bf89838ca4c0f" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "ef6e992cf87ffe1741afee9898c450d7007d44c6b03bc47ee1c386690b10285d" + }, + { + "path": "commands/explore.md", + "sha256": "7dbf290f0b822b7365d22fe83aa43c40c8c4d23e35acb672e341b339aff6b859" + }, + { + "path": "skills/gemini-imagen/mise.toml", + "sha256": "dbb25cfa908fd44614ccebf8295635f2eee1e05ed950f63fd60faefef8889c56" + }, + { + "path": "skills/gemini-imagen/SKILL.md", + "sha256": "e0f38d3d77c0b378c982f699200f849d36eaad54334f0e696d07c05d70a1d99c" + }, + { + "path": "skills/gemini-imagen/scripts/gemini_images.py", + "sha256": "0f7f45c8ad0ab942ff05a4a2ae99900dfe7088235f754489106ddc1411938722" + }, + { + "path": "skills/gemini-imagen/scripts/compose_images.py", + 
"sha256": "1dac8f1ba49d0f58a3d4dc1439c50ed1177b4c5c8335e622dc10dec78f4b42b1" + }, + { + "path": "skills/gemini-imagen/scripts/generate_image.py", + "sha256": "66cd1b59e2b3be98eff8bc03c5f924ee34d7c445cba8981214f3194b037c009f" + }, + { + "path": "skills/gemini-imagen/scripts/multi_turn_chat.py", + "sha256": "d12db12d52d6ebd35ed449ff1799acb40b7714a90e6ae118c0661152a8f2b2b6" + }, + { + "path": "skills/gemini-imagen/scripts/edit_image.py", + "sha256": "c2d031289e65c64246daf4b296e86e9b8aa9af4af280ef97aa2531a1e5a12eb4" + } + ], + "dirSha256": "57bf45b2b23f3a3ff04f332342ca38658a28173c9025644ac8b5990299d1ac09" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/gemini-imagen/SKILL.md b/skills/gemini-imagen/SKILL.md new file mode 100644 index 0000000..6c5805e --- /dev/null +++ b/skills/gemini-imagen/SKILL.md @@ -0,0 +1,193 @@ +--- +name: gemini-imagegen +description: Generate and edit images using Gemini API (Nano Banana Pro). Supports text-to-image, image editing, multi-turn refinement, Google Search grounding for factual accuracy, and composition from multiple reference images. +--- + +# Gemini Image Generation (Nano Banana Pro) + +Generate professional-quality images using Google's **Gemini 3 Pro Image** model (aka Nano Banana Pro). The environment variable `GEMINI_API_KEY` must be set. + +## Model + +**gemini-3-pro-image-preview** (Nano Banana Pro) +- Resolution: Up to 4K (1K, 2K, 4K) +- Built on Gemini 3 Pro with advanced reasoning and real-world knowledge +- Best for: Professional assets, illustrations, diagrams, text rendering, product mockups +- Features: Google Search grounding, automatic "Thinking" process for refined composition + +## Quick Start Scripts + +CRITICAL FOR AGENTS: These are executable scripts in your PATH. All scripts now default to **gemini-3-pro-image-preview**. 
+ +### Text-to-Image +```bash +scripts/generate_image.py "A technical diagram showing microservices architecture" output.png +``` + +### Edit Existing Image +```bash +scripts/edit_image.py diagram.png "Add API gateway component with arrows showing data flow" output.png +``` + +### Multi-Turn Chat (Iterative Refinement) +```bash +scripts/multi_turn_chat.py +``` + +For high-resolution technical diagrams: +```bash +scripts/generate_image.py "Your prompt" output.png --size 4K --aspect 16:9 +``` + +## Core API Pattern + +All image generation uses the `generateContent` endpoint with `responseModalities: ["TEXT", "IMAGE"]`: + +```python +import os +from google import genai + +client = genai.Client(api_key=os.environ["GEMINI_API_KEY"]) + +response = client.models.generate_content( + model="gemini-3-pro-image-preview", + contents=["Your prompt here"], +) + +for part in response.parts: + if part.text: + print(part.text) + elif part.inline_data: + image = part.as_image() + image.save("output.png") +``` + +## Image Configuration Options + +Control output with `image_config`: + +```python +from google.genai import types + +response = client.models.generate_content( + model="gemini-3-pro-image-preview", + contents=[prompt], + config=types.GenerateContentConfig( + response_modalities=['TEXT', 'IMAGE'], + image_config=types.ImageConfig( + aspect_ratio="16:9", # 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9 + image_size="4K" # 1K, 2K, 4K (Nano Banana Pro supports up to 4K) + ), + ) +) +``` + +## Editing Images + +Pass existing images with text prompts: + +```python +from PIL import Image + +img = Image.open("input.png") +response = client.models.generate_content( + model="gemini-3-pro-image-preview", + contents=["Add a sunset to this scene", img], +) +``` + +## Multi-Turn Refinement + +Use chat for iterative editing: + +```python +from google.genai import types + +chat = client.chats.create( + model="gemini-3-pro-image-preview", + 
config=types.GenerateContentConfig(response_modalities=['TEXT', 'IMAGE']) +) + +response = chat.send_message("Create a logo for 'Acme Corp'") +# Save first image... + +response = chat.send_message("Make the text bolder and add a blue gradient") +# Save refined image... +``` + +## Prompting Best Practices + +### Core Prompt Structure +Keep prompts concise and specific. Research shows prompts under 25 words achieve **30% higher accuracy**. Structure as: + +**Subject + Adjectives + Action + Location/Context + Composition + Lighting + Style** + +### Photorealistic Scenes +Include camera details: lens type, lighting, angle, mood. +> "Photorealistic close-up portrait, 85mm lens, soft golden hour light, shallow depth of field" + +### Stylized Art +Specify style explicitly: +> "Kawaii-style sticker of a happy red panda, bold outlines, cel-shading, white background" + +### Text in Images +Be explicit about font style and placement: +> "Logo with text 'Daily Grind' in clean sans-serif, black and white, coffee bean motif" + +### Product Mockups +Describe lighting setup and surface: +> "Studio-lit product photo on polished concrete, three-point softbox setup, 45-degree angle" + +### Technical Diagrams +Be explicit about positions, relationships, and labels: +> "Technical diagram: Component A at top, Component B at bottom. Arrow from A to B labeled 'HTTP GET'. Clean boxes, directional arrows, white background." 
+ +## Advanced Features + +### Google Search Grounding +Generate images based on real-time data: + +```python +response = client.models.generate_content( + model="gemini-3-pro-image-preview", + contents=["Visualize today's weather in Tokyo as an infographic"], + config=types.GenerateContentConfig( + response_modalities=['TEXT', 'IMAGE'], + tools=[{"google_search": {}}] + ) +) +``` + +### Multiple Reference Images (Up to 14) +Combine elements from multiple sources: + +```python +response = client.models.generate_content( + model="gemini-3-pro-image-preview", + contents=[ + "Create a group photo of these people in an office", + Image.open("person1.png"), + Image.open("person2.png"), + Image.open("person3.png"), + ], +) +``` + +## REST API (curl) + +```bash +curl -s -X POST \ + "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent" \ + -H "x-goog-api-key: $GEMINI_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "contents": [{"parts": [{"text": "Technical diagram showing RESTful API architecture"}]}] + }' | jq -r '.candidates[0].content.parts[] | select(.inlineData) | .inlineData.data' | base64 --decode > output.png +``` + +## Notes + +- All generated images include SynthID watermarks +- Image-only mode (`responseModalities: ["IMAGE"]`) won't work with Google Search grounding +- For editing, describe changes conversationally—the model understands semantic masking +- Be specific about positions, colors, labels, and relationships for best results diff --git a/skills/gemini-imagen/mise.toml b/skills/gemini-imagen/mise.toml new file mode 100644 index 0000000..7fc2e8b --- /dev/null +++ b/skills/gemini-imagen/mise.toml @@ -0,0 +1,3 @@ +[tools] +python = "3.11" + diff --git a/skills/gemini-imagen/scripts/compose_images.py b/skills/gemini-imagen/scripts/compose_images.py new file mode 100755 index 0000000..fdb497c --- /dev/null +++ b/skills/gemini-imagen/scripts/compose_images.py @@ -0,0 +1,162 @@ +#!/usr/bin/env -S uv 
run --script +# +# /// script +# requires-python = ">=3.12" +# dependencies = ["google-genai", "pillow"] +# /// +""" +Compose multiple images into a new image using Gemini API. + +Usage: + python compose_images.py "instruction" output.png image1.png [image2.png ...] + +Examples: + python compose_images.py "Create a group photo of these people" group.png person1.png person2.png + python compose_images.py "Put the cat from the first image on the couch from the second" result.png cat.png couch.png + python compose_images.py "Apply the art style from the first image to the scene in the second" styled.png style.png photo.png + +Note: Supports up to 14 reference images (Gemini 3 Pro only). + +Environment: + GEMINI_API_KEY - Required API key +""" + +import argparse +import os +import sys + +from PIL import Image +from google import genai +from google.genai import types + + +def compose_images( + instruction: str, + output_path: str, + image_paths: list[str], + model: str = "gemini-3-pro-image-preview", + aspect_ratio: str | None = None, + image_size: str | None = None, +) -> str | None: + """Compose multiple images based on instructions. 
+ + Args: + instruction: Text description of how to combine images + output_path: Path to save the result + image_paths: List of input image paths (up to 14) + model: Gemini model to use (pro recommended) + aspect_ratio: Output aspect ratio + image_size: Output resolution + + Returns: + Any text response from the model, or None + """ + api_key = os.environ.get("GEMINI_API_KEY") + if not api_key: + raise EnvironmentError("GEMINI_API_KEY environment variable not set") + + if len(image_paths) > 14: + raise ValueError("Maximum 14 reference images supported") + + if len(image_paths) < 1: + raise ValueError("At least one image is required") + + # Verify all images exist + for path in image_paths: + if not os.path.exists(path): + raise FileNotFoundError(f"Image not found: {path}") + + client = genai.Client(api_key=api_key) + + # Load images + images = [Image.open(path) for path in image_paths] + + # Build contents: instruction first, then images + contents = [instruction] + images + + # Build config + config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]} + + image_config_kwargs = {} + if aspect_ratio: + image_config_kwargs["aspect_ratio"] = aspect_ratio + if image_size: + image_config_kwargs["image_size"] = image_size + + if image_config_kwargs: + config_kwargs["image_config"] = types.ImageConfig(**image_config_kwargs) + + config = types.GenerateContentConfig(**config_kwargs) + + response = client.models.generate_content( + model=model, + contents=contents, + config=config, + ) + + text_response = None + image_saved = False + + for part in response.parts: + if part.text is not None: + text_response = part.text + elif part.inline_data is not None: + image = part.as_image() + image.save(output_path) + image_saved = True + + if not image_saved: + raise RuntimeError("No image was generated.") + + return text_response + + +def main(): + parser = argparse.ArgumentParser( + description="Compose multiple images using Gemini API", + 
formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument("instruction", help="Composition instruction") + parser.add_argument("output", help="Output file path") + parser.add_argument("images", nargs="+", help="Input images (up to 14)") + parser.add_argument( + "--model", "-m", + default="gemini-3-pro-image-preview", + choices=["gemini-2.5-flash-image", "gemini-3-pro-image-preview"], + help="Model to use (pro recommended for composition)" + ) + parser.add_argument( + "--aspect", "-a", + choices=["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], + help="Output aspect ratio" + ) + parser.add_argument( + "--size", "-s", + choices=["1K", "2K", "4K"], + help="Output resolution" + ) + + args = parser.parse_args() + + try: + text = compose_images( + instruction=args.instruction, + output_path=args.output, + image_paths=args.images, + model=args.model, + aspect_ratio=args.aspect, + image_size=args.size, + ) + + print(f"Composed image saved to: {args.output}") + if text: + print(f"Model response: {text}") + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/gemini-imagen/scripts/edit_image.py b/skills/gemini-imagen/scripts/edit_image.py new file mode 100755 index 0000000..284f783 --- /dev/null +++ b/skills/gemini-imagen/scripts/edit_image.py @@ -0,0 +1,148 @@ +#!/usr/bin/env -S uv run --script +# +# /// script +# requires-python = ">=3.12" +# dependencies = ["google-genai", "pillow"] +# /// +""" +Edit existing images using Gemini API (Nano Banana Pro). 
+ +Usage: + python edit_image.py input.png "edit instruction" output.png [options] + +Examples: + python edit_image.py diagram.png "Add API Gateway component between client and services" edited.png + python edit_image.py schema.png "Highlight the foreign key relationships in red" schema_edited.png + python edit_image.py flowchart.png "Add error handling branch with red arrows" flowchart_v2.png + +Environment: + GEMINI_API_KEY - Required API key +""" + +import argparse +import os +import sys + +from PIL import Image +from google import genai +from google.genai import types + + +def edit_image( + input_path: str, + instruction: str, + output_path: str, + model: str = "gemini-3-pro-image-preview", + aspect_ratio: str | None = None, + image_size: str | None = None, +) -> str | None: + """Edit an existing image based on text instructions using Nano Banana Pro. + + Args: + input_path: Path to the input image + instruction: Text description of edits to make + output_path: Path to save the edited image + model: Gemini model to use (defaults to Nano Banana Pro) + aspect_ratio: Output aspect ratio + image_size: Output resolution (up to 4K) + + Returns: + Any text response from the model, or None + """ + api_key = os.environ.get("GEMINI_API_KEY") + if not api_key: + raise EnvironmentError("GEMINI_API_KEY environment variable not set") + + if not os.path.exists(input_path): + raise FileNotFoundError(f"Input image not found: {input_path}") + + client = genai.Client(api_key=api_key) + + # Load input image + input_image = Image.open(input_path) + + # Build config + config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]} + + image_config_kwargs = {} + if aspect_ratio: + image_config_kwargs["aspect_ratio"] = aspect_ratio + if image_size: + image_config_kwargs["image_size"] = image_size + + if image_config_kwargs: + config_kwargs["image_config"] = types.ImageConfig(**image_config_kwargs) + + config = types.GenerateContentConfig(**config_kwargs) + + response = 
client.models.generate_content( + model=model, + contents=[instruction, input_image], + config=config, + ) + + text_response = None + image_saved = False + + for part in response.parts: + if part.text is not None: + text_response = part.text + elif part.inline_data is not None: + image = part.as_image() + image.save(output_path) + image_saved = True + + if not image_saved: + raise RuntimeError("No image was generated. Check your instruction and try again.") + + return text_response + + +def main(): + parser = argparse.ArgumentParser( + description="Edit images using Gemini API", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument("input", help="Input image path") + parser.add_argument("instruction", help="Edit instruction") + parser.add_argument("output", help="Output file path") + parser.add_argument( + "--model", "-m", + default="gemini-3-pro-image-preview", + help="Model to use (default: gemini-3-pro-image-preview / Nano Banana Pro)" + ) + parser.add_argument( + "--aspect", "-a", + choices=["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], + help="Output aspect ratio" + ) + parser.add_argument( + "--size", "-s", + choices=["1K", "2K", "4K"], + help="Output resolution" + ) + + args = parser.parse_args() + + try: + text = edit_image( + input_path=args.input, + instruction=args.instruction, + output_path=args.output, + model=args.model, + aspect_ratio=args.aspect, + image_size=args.size, + ) + + print(f"Edited image saved to: {args.output}") + if text: + print(f"Model response: {text}") + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/gemini-imagen/scripts/gemini_images.py b/skills/gemini-imagen/scripts/gemini_images.py new file mode 100755 index 0000000..fb88e8f --- /dev/null +++ b/skills/gemini-imagen/scripts/gemini_images.py @@ -0,0 +1,269 @@ +#!/usr/bin/env -S uv run --script +# +# /// script +# 
requires-python = ">=3.12" +# dependencies = ["google-genai", "pillow"] +# /// +""" +Gemini Image Generation Library + +A simple Python library for generating and editing images with the Gemini API. + +Usage: + from gemini_images import GeminiImageGenerator + + gen = GeminiImageGenerator() + gen.generate("A sunset over mountains", "sunset.png") + gen.edit("input.png", "Add clouds", "output.png") + +Environment: + GEMINI_API_KEY - Required API key +""" + +import os +from pathlib import Path +from typing import Literal + +from PIL import Image +from google import genai +from google.genai import types + + +AspectRatio = Literal["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"] +ImageSize = Literal["1K", "2K", "4K"] +Model = Literal["gemini-2.5-flash-image", "gemini-3-pro-image-preview"] + + +class GeminiImageGenerator: + """High-level interface for Gemini image generation.""" + + FLASH = "gemini-2.5-flash-image" + PRO = "gemini-3-pro-image-preview" + + def __init__(self, api_key: str | None = None, model: Model = FLASH): + """Initialize the generator. 
+ + Args: + api_key: Gemini API key (defaults to GEMINI_API_KEY env var) + model: Default model to use + """ + self.api_key = api_key or os.environ.get("GEMINI_API_KEY") + if not self.api_key: + raise EnvironmentError("GEMINI_API_KEY not set") + + self.client = genai.Client(api_key=self.api_key) + self.model = model + + def _build_config( + self, + aspect_ratio: AspectRatio | None = None, + image_size: ImageSize | None = None, + google_search: bool = False, + ) -> types.GenerateContentConfig: + """Build generation config.""" + kwargs = {"response_modalities": ["TEXT", "IMAGE"]} + + img_config = {} + if aspect_ratio: + img_config["aspect_ratio"] = aspect_ratio + if image_size: + img_config["image_size"] = image_size + + if img_config: + kwargs["image_config"] = types.ImageConfig(**img_config) + + if google_search: + kwargs["tools"] = [{"google_search": {}}] + + return types.GenerateContentConfig(**kwargs) + + def generate( + self, + prompt: str, + output: str | Path, + *, + model: Model | None = None, + aspect_ratio: AspectRatio | None = None, + image_size: ImageSize | None = None, + google_search: bool = False, + ) -> tuple[Path, str | None]: + """Generate an image from a text prompt. 
+ + Args: + prompt: Text description + output: Output file path + model: Override default model + aspect_ratio: Output aspect ratio + image_size: Output resolution + google_search: Enable Google Search grounding (Pro only) + + Returns: + Tuple of (output path, optional text response) + """ + output = Path(output) + config = self._build_config(aspect_ratio, image_size, google_search) + + response = self.client.models.generate_content( + model=model or self.model, + contents=[prompt], + config=config, + ) + + text = None + for part in response.parts: + if part.text: + text = part.text + elif part.inline_data: + part.as_image().save(output) + + return output, text + + def edit( + self, + input_image: str | Path | Image.Image, + instruction: str, + output: str | Path, + *, + model: Model | None = None, + aspect_ratio: AspectRatio | None = None, + image_size: ImageSize | None = None, + ) -> tuple[Path, str | None]: + """Edit an existing image. + + Args: + input_image: Input image (path or PIL Image) + instruction: Edit instruction + output: Output file path + model: Override default model + aspect_ratio: Output aspect ratio + image_size: Output resolution + + Returns: + Tuple of (output path, optional text response) + """ + output = Path(output) + + if isinstance(input_image, (str, Path)): + input_image = Image.open(input_image) + + config = self._build_config(aspect_ratio, image_size) + + response = self.client.models.generate_content( + model=model or self.model, + contents=[instruction, input_image], + config=config, + ) + + text = None + for part in response.parts: + if part.text: + text = part.text + elif part.inline_data: + part.as_image().save(output) + + return output, text + + def compose( + self, + instruction: str, + images: list[str | Path | Image.Image], + output: str | Path, + *, + model: Model | None = None, + aspect_ratio: AspectRatio | None = None, + image_size: ImageSize | None = None, + ) -> tuple[Path, str | None]: + """Compose multiple images into 
one. + + Args: + instruction: Composition instruction + images: List of input images (up to 14) + output: Output file path + model: Override default model (Pro recommended) + aspect_ratio: Output aspect ratio + image_size: Output resolution + + Returns: + Tuple of (output path, optional text response) + """ + output = Path(output) + + # Load images + loaded = [] + for img in images: + if isinstance(img, (str, Path)): + loaded.append(Image.open(img)) + else: + loaded.append(img) + + config = self._build_config(aspect_ratio, image_size) + contents = [instruction] + loaded + + response = self.client.models.generate_content( + model=model or self.PRO, # Pro recommended for composition + contents=contents, + config=config, + ) + + text = None + for part in response.parts: + if part.text: + text = part.text + elif part.inline_data: + part.as_image().save(output) + + return output, text + + def chat(self) -> "ImageChat": + """Start an interactive chat session for iterative refinement.""" + return ImageChat(self.client, self.model) + + +class ImageChat: + """Multi-turn chat session for iterative image generation.""" + + def __init__(self, client: genai.Client, model: Model): + self.client = client + self.model = model + self._chat = client.chats.create( + model=model, + config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]), + ) + self.current_image: Image.Image | None = None + + def send( + self, + message: str, + image: Image.Image | str | Path | None = None, + ) -> tuple[Image.Image | None, str | None]: + """Send a message and optionally an image. 
+ + Returns: + Tuple of (generated image or None, text response or None) + """ + contents = [message] + if image: + if isinstance(image, (str, Path)): + image = Image.open(image) + contents.append(image) + + response = self._chat.send_message(contents) + + text = None + img = None + for part in response.parts: + if part.text: + text = part.text + elif part.inline_data: + img = part.as_image() + self.current_image = img + + return img, text + + def reset(self): + """Reset the chat session.""" + self._chat = self.client.chats.create( + model=self.model, + config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]), + ) + self.current_image = None diff --git a/skills/gemini-imagen/scripts/generate_image.py b/skills/gemini-imagen/scripts/generate_image.py new file mode 100755 index 0000000..a8edda7 --- /dev/null +++ b/skills/gemini-imagen/scripts/generate_image.py @@ -0,0 +1,137 @@ +#!/usr/bin/env -S uv run --script +# +# /// script +# requires-python = ">=3.12" +# dependencies = ["google-genai", "pillow"] +# /// +""" +Generate images from text prompts using Gemini API (Nano Banana Pro). + +Usage: + python generate_image.py "prompt" output.png [--aspect RATIO] [--size SIZE] + +Examples: + python generate_image.py "Microservices architecture diagram with labeled components" diagram.png + python generate_image.py "Logo for Acme Corp, clean sans-serif text" logo.png --aspect 1:1 --size 4K + python generate_image.py "OAuth flow diagram with numbered steps" flow.png --aspect 16:9 --size 2K + +Environment: + GEMINI_API_KEY - Required API key +""" + +import argparse +import os +import sys + +from google import genai +from google.genai import types + + +def generate_image( + prompt: str, + output_path: str, + model: str = "gemini-3-pro-image-preview", + aspect_ratio: str | None = None, + image_size: str | None = None, +) -> str | None: + """Generate an image from a text prompt using Nano Banana Pro. 
+ + Args: + prompt: Text description of the image to generate + output_path: Path to save the generated image + model: Gemini model to use (defaults to Nano Banana Pro) + aspect_ratio: Aspect ratio (1:1, 16:9, 9:16, etc.) + image_size: Resolution (1K, 2K, 4K) + + Returns: + Any text response from the model, or None + """ + api_key = os.environ.get("GEMINI_API_KEY") + if not api_key: + raise EnvironmentError("GEMINI_API_KEY environment variable not set") + + client = genai.Client(api_key=api_key) + + # Build config + config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]} + + image_config_kwargs = {} + if aspect_ratio: + image_config_kwargs["aspect_ratio"] = aspect_ratio + if image_size: + image_config_kwargs["image_size"] = image_size + + if image_config_kwargs: + config_kwargs["image_config"] = types.ImageConfig(**image_config_kwargs) + + config = types.GenerateContentConfig(**config_kwargs) + + response = client.models.generate_content( + model=model, + contents=[prompt], + config=config, + ) + + text_response = None + image_saved = False + + for part in response.parts: + if part.text is not None: + text_response = part.text + elif part.inline_data is not None: + image = part.as_image() + image.save(output_path) + image_saved = True + + if not image_saved: + raise RuntimeError("No image was generated. 
Check your prompt and try again.") + + return text_response + + +def main(): + parser = argparse.ArgumentParser( + description="Generate images from text prompts using Gemini API", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument("prompt", help="Text prompt describing the image") + parser.add_argument("output", help="Output file path (e.g., output.png)") + parser.add_argument( + "--model", "-m", + default="gemini-3-pro-image-preview", + help="Model to use (default: gemini-3-pro-image-preview / Nano Banana Pro)" + ) + parser.add_argument( + "--aspect", "-a", + choices=["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], + help="Aspect ratio" + ) + parser.add_argument( + "--size", "-s", + choices=["1K", "2K", "4K"], + help="Image resolution (up to 4K with Nano Banana Pro)" + ) + + args = parser.parse_args() + + try: + text = generate_image( + prompt=args.prompt, + output_path=args.output, + model=args.model, + aspect_ratio=args.aspect, + image_size=args.size, + ) + + print(f"Image saved to: {args.output}") + if text: + print(f"Model response: {text}") + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/gemini-imagen/scripts/multi_turn_chat.py b/skills/gemini-imagen/scripts/multi_turn_chat.py new file mode 100755 index 0000000..abe3d02 --- /dev/null +++ b/skills/gemini-imagen/scripts/multi_turn_chat.py @@ -0,0 +1,220 @@ +#!/usr/bin/env -S uv run --script +# +# /// script +# requires-python = ">=3.12" +# dependencies = ["google-genai", "pillow"] +# /// +""" +Interactive multi-turn image generation and refinement using Gemini API (Nano Banana Pro). 
+ +Usage: + python multi_turn_chat.py [--output-dir DIR] + +This starts an interactive session where you can: +- Generate technical diagrams and illustrations from prompts +- Iteratively refine images through conversation +- Load existing images for editing +- Save images at any point + +Commands: + /save [filename] - Save current image + /load - Load an image into the conversation + /clear - Start fresh conversation + /quit - Exit + +Environment: + GEMINI_API_KEY - Required API key +""" + +import argparse +import os +import sys +from datetime import datetime +from pathlib import Path + +from PIL import Image +from google import genai +from google.genai import types + + +class ImageChat: + """Interactive chat session for image generation and refinement using Nano Banana Pro.""" + + def __init__( + self, + model: str = "gemini-3-pro-image-preview", + output_dir: str = ".", + ): + api_key = os.environ.get("GEMINI_API_KEY") + if not api_key: + raise EnvironmentError("GEMINI_API_KEY environment variable not set") + + self.client = genai.Client(api_key=api_key) + self.model = model + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + + self.chat = None + self.current_image = None + self.image_count = 0 + + self._init_chat() + + def _init_chat(self): + """Initialize or reset the chat session.""" + config = types.GenerateContentConfig( + response_modalities=["TEXT", "IMAGE"] + ) + self.chat = self.client.chats.create( + model=self.model, + config=config, + ) + self.current_image = None + + def send_message(self, message: str, image: Image.Image | None = None) -> tuple[str | None, Image.Image | None]: + """Send a message and optionally an image, return response text and image.""" + contents = [] + if message: + contents.append(message) + if image: + contents.append(image) + + if not contents: + return None, None + + response = self.chat.send_message(contents) + + text_response = None + image_response = None + + for part in 
response.parts:
            if part.text is not None:
                text_response = part.text
            elif part.inline_data is not None:
                image_response = part.as_image()
                self.current_image = image_response

        return text_response, image_response

    def save_image(self, filename: str | None = None) -> str | None:
        """Save the current image to a file."""
        if self.current_image is None:
            return None

        if filename is None:
            self.image_count += 1
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"image_{timestamp}_{self.image_count}.png"

        filepath = self.output_dir / filename
        self.current_image.save(filepath)
        return str(filepath)

    def load_image(self, path: str) -> Image.Image:
        """Load an image from disk."""
        img = Image.open(path)
        self.current_image = img
        return img


def main():
    parser = argparse.ArgumentParser(
        description="Interactive multi-turn image generation using Nano Banana Pro",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        "--model", "-m",
        default="gemini-3-pro-image-preview",
        help="Model to use (default: gemini-3-pro-image-preview / Nano Banana Pro)"
    )
    parser.add_argument(
        "--output-dir", "-o",
        default=".",
        help="Directory to save images"
    )

    args = parser.parse_args()

    try:
        chat = ImageChat(model=args.model, output_dir=args.output_dir)
    except Exception as e:
        print(f"Error initializing: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"Gemini Image Chat ({args.model})")
    print("Commands: /save [name], /load <path>, /clear, /quit")
    print("-" * 50)

    while True:
        try:
            user_input = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nGoodbye!")
            break

        if not user_input:
            continue

        # Handle commands
        if user_input.startswith("/"):
            parts = user_input.split(maxsplit=1)
            cmd = parts[0].lower()
            arg = parts[1] if len(parts) > 1 else None

            if cmd == "/quit":
                print("Goodbye!")
                break

            elif cmd == "/clear":
                chat._init_chat()
                print("Conversation cleared.")
                continue

            elif cmd == "/save":
                path = chat.save_image(arg)
                if path:
                    print(f"Image saved to: {path}")
                else:
                    print("No image to save.")
                continue

            elif cmd == "/load":
                if not arg:
                    print("Usage: /load <path>")
                    continue
                try:
                    chat.load_image(arg)
                    print(f"Loaded: {arg}")
                    print("You can now describe edits to make.")
                except Exception as e:
                    print(f"Error loading image: {e}")
                continue

            else:
                print(f"Unknown command: {cmd}")
                continue

        # Send message to model
        try:
            # If we have a loaded image and this is first message, include it
            image_to_send = None
            if chat.current_image and not chat.chat.history:
                image_to_send = chat.current_image

            text, image = chat.send_message(user_input, image_to_send)

            if text:
                print(f"\nGemini: {text}")

            if image:
                # Auto-save
                path = chat.save_image()
                print(f"\n[Image generated: {path}]")

        except Exception as e:
            print(f"\nError: {e}")


if __name__ == "__main__":
    main()