Initial commit

Author: Zhongwei Li
Date:   2025-11-30 09:03:52 +08:00
Commit: 0b586b3216

42 changed files with 5241 additions and 0 deletions

compose_images.py
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""
Compose multiple images into a new image using Gemini API.

Usage:
    python compose_images.py "instruction" output.png image1.png [image2.png ...]

Examples:
    python compose_images.py "Create a group photo of these people" group.png person1.png person2.png
    python compose_images.py "Put the cat from the first image on the couch from the second" result.png cat.png couch.png
    python compose_images.py "Apply the art style from the first image to the scene in the second" styled.png style.png photo.png

Note: Supports up to 14 reference images (Gemini 3 Pro only).

Environment:
    GEMINI_API_KEY - Required API key
"""

import argparse
import os
import sys

from PIL import Image
from google import genai
from google.genai import types


def compose_images(
    instruction: str,
    output_path: str,
    image_paths: list[str],
    model: str = "gemini-3-pro-image-preview",
    aspect_ratio: str | None = None,
    image_size: str | None = None,
) -> str | None:
    """Compose multiple images based on instructions.

    Args:
        instruction: Text description of how to combine images
        output_path: Path to save the result
        image_paths: List of input image paths (up to 14)
        model: Gemini model to use (pro recommended)
        aspect_ratio: Output aspect ratio
        image_size: Output resolution

    Returns:
        Any text response from the model, or None
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise EnvironmentError("GEMINI_API_KEY environment variable not set")

    if len(image_paths) > 14:
        raise ValueError("Maximum 14 reference images supported")
    if len(image_paths) < 1:
        raise ValueError("At least one image is required")

    # Verify all images exist
    for path in image_paths:
        if not os.path.exists(path):
            raise FileNotFoundError(f"Image not found: {path}")

    client = genai.Client(api_key=api_key)

    # Load images
    images = [Image.open(path) for path in image_paths]

    # Build contents: instruction first, then images
    contents = [instruction] + images

    # Build config
    config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]}
    image_config_kwargs = {}
    if aspect_ratio:
        image_config_kwargs["aspect_ratio"] = aspect_ratio
    if image_size:
        image_config_kwargs["image_size"] = image_size
    if image_config_kwargs:
        config_kwargs["image_config"] = types.ImageConfig(**image_config_kwargs)
    config = types.GenerateContentConfig(**config_kwargs)

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=config,
    )

    text_response = None
    image_saved = False
    for part in response.parts:
        if part.text is not None:
            text_response = part.text
        elif part.inline_data is not None:
            image = part.as_image()
            image.save(output_path)
            image_saved = True

    if not image_saved:
        raise RuntimeError("No image was generated.")

    return text_response


def main():
    parser = argparse.ArgumentParser(
        description="Compose multiple images using Gemini API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("instruction", help="Composition instruction")
    parser.add_argument("output", help="Output file path")
    parser.add_argument("images", nargs="+", help="Input images (up to 14)")
    parser.add_argument(
        "--model", "-m",
        default="gemini-3-pro-image-preview",
        choices=["gemini-2.5-flash-image", "gemini-3-pro-image-preview"],
        help="Model to use (pro recommended for composition)",
    )
    parser.add_argument(
        "--aspect", "-a",
        choices=["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"],
        help="Output aspect ratio",
    )
    parser.add_argument(
        "--size", "-s",
        choices=["1K", "2K", "4K"],
        help="Output resolution",
    )
    args = parser.parse_args()

    try:
        text = compose_images(
            instruction=args.instruction,
            output_path=args.output,
            image_paths=args.images,
            model=args.model,
            aspect_ratio=args.aspect,
            image_size=args.size,
        )
        print(f"Composed image saved to: {args.output}")
        if text:
            print(f"Model response: {text}")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
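
A minimal programmatic sketch of the same call, for reuse from other Python code (assumes this script is importable as compose_images, GEMINI_API_KEY is set, and the listed image files exist; prompt and paths are illustrative):

    # Hypothetical example inputs; compose_images() is the function defined above.
    from compose_images import compose_images

    note = compose_images(
        instruction="Place the product from the first image on the table from the second",
        output_path="composite.png",
        image_paths=["product.png", "table.png"],
        aspect_ratio="16:9",
        image_size="2K",
    )
    if note:
        print(note)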

edit_image.py
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""
Edit existing images using Gemini API.

Usage:
    python edit_image.py input.png "edit instruction" output.png [options]

Examples:
    python edit_image.py photo.png "Add a rainbow in the sky" edited.png
    python edit_image.py room.jpg "Change the sofa to red leather" room_edited.jpg
    python edit_image.py portrait.png "Make it look like a Van Gogh painting" artistic.png --model gemini-3-pro-image-preview

Environment:
    GEMINI_API_KEY - Required API key
"""

import argparse
import os
import sys

from PIL import Image
from google import genai
from google.genai import types


def edit_image(
    input_path: str,
    instruction: str,
    output_path: str,
    model: str = "gemini-2.5-flash-image",
    aspect_ratio: str | None = None,
    image_size: str | None = None,
) -> str | None:
    """Edit an existing image based on text instructions.

    Args:
        input_path: Path to the input image
        instruction: Text description of edits to make
        output_path: Path to save the edited image
        model: Gemini model to use
        aspect_ratio: Output aspect ratio
        image_size: Output resolution

    Returns:
        Any text response from the model, or None
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise EnvironmentError("GEMINI_API_KEY environment variable not set")

    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input image not found: {input_path}")

    client = genai.Client(api_key=api_key)

    # Load input image
    input_image = Image.open(input_path)

    # Build config
    config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]}
    image_config_kwargs = {}
    if aspect_ratio:
        image_config_kwargs["aspect_ratio"] = aspect_ratio
    if image_size:
        image_config_kwargs["image_size"] = image_size
    if image_config_kwargs:
        config_kwargs["image_config"] = types.ImageConfig(**image_config_kwargs)
    config = types.GenerateContentConfig(**config_kwargs)

    response = client.models.generate_content(
        model=model,
        contents=[instruction, input_image],
        config=config,
    )

    text_response = None
    image_saved = False
    for part in response.parts:
        if part.text is not None:
            text_response = part.text
        elif part.inline_data is not None:
            image = part.as_image()
            image.save(output_path)
            image_saved = True

    if not image_saved:
        raise RuntimeError("No image was generated. Check your instruction and try again.")

    return text_response


def main():
    parser = argparse.ArgumentParser(
        description="Edit images using Gemini API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("input", help="Input image path")
    parser.add_argument("instruction", help="Edit instruction")
    parser.add_argument("output", help="Output file path")
    parser.add_argument(
        "--model", "-m",
        default="gemini-2.5-flash-image",
        choices=["gemini-2.5-flash-image", "gemini-3-pro-image-preview"],
        help="Model to use (default: gemini-2.5-flash-image)",
    )
    parser.add_argument(
        "--aspect", "-a",
        choices=["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"],
        help="Output aspect ratio",
    )
    parser.add_argument(
        "--size", "-s",
        choices=["1K", "2K", "4K"],
        help="Output resolution",
    )
    args = parser.parse_args()

    try:
        text = edit_image(
            input_path=args.input,
            instruction=args.instruction,
            output_path=args.output,
            model=args.model,
            aspect_ratio=args.aspect,
            image_size=args.size,
        )
        print(f"Edited image saved to: {args.output}")
        if text:
            print(f"Model response: {text}")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
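
For programmatic use, a small hedged sketch (assumes this script is importable as edit_image and that photo.png exists locally; names are illustrative):

    # Hypothetical paths; edit_image() is the function defined above.
    from edit_image import edit_image

    note = edit_image(
        input_path="photo.png",
        instruction="Remove the background and replace it with plain white",
        output_path="photo_clean.png",
        model="gemini-2.5-flash-image",
    )
    print(note or "No text response")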

gemini_images.py
@@ -0,0 +1,263 @@
"""
Gemini Image Generation Library
A simple Python library for generating and editing images with the Gemini API.
Usage:
from gemini_images import GeminiImageGenerator
gen = GeminiImageGenerator()
gen.generate("A sunset over mountains", "sunset.png")
gen.edit("input.png", "Add clouds", "output.png")
Environment:
GEMINI_API_KEY - Required API key
"""
import os
from pathlib import Path
from typing import Literal
from PIL import Image
from google import genai
from google.genai import types
AspectRatio = Literal["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"]
ImageSize = Literal["1K", "2K", "4K"]
Model = Literal["gemini-2.5-flash-image", "gemini-3-pro-image-preview"]
class GeminiImageGenerator:
"""High-level interface for Gemini image generation."""
FLASH = "gemini-2.5-flash-image"
PRO = "gemini-3-pro-image-preview"
def __init__(self, api_key: str | None = None, model: Model = FLASH):
"""Initialize the generator.
Args:
api_key: Gemini API key (defaults to GEMINI_API_KEY env var)
model: Default model to use
"""
self.api_key = api_key or os.environ.get("GEMINI_API_KEY")
if not self.api_key:
raise EnvironmentError("GEMINI_API_KEY not set")
self.client = genai.Client(api_key=self.api_key)
self.model = model
def _build_config(
self,
aspect_ratio: AspectRatio | None = None,
image_size: ImageSize | None = None,
google_search: bool = False,
) -> types.GenerateContentConfig:
"""Build generation config."""
kwargs = {"response_modalities": ["TEXT", "IMAGE"]}
img_config = {}
if aspect_ratio:
img_config["aspect_ratio"] = aspect_ratio
if image_size:
img_config["image_size"] = image_size
if img_config:
kwargs["image_config"] = types.ImageConfig(**img_config)
if google_search:
kwargs["tools"] = [{"google_search": {}}]
return types.GenerateContentConfig(**kwargs)
def generate(
self,
prompt: str,
output: str | Path,
*,
model: Model | None = None,
aspect_ratio: AspectRatio | None = None,
image_size: ImageSize | None = None,
google_search: bool = False,
) -> tuple[Path, str | None]:
"""Generate an image from a text prompt.
Args:
prompt: Text description
output: Output file path
model: Override default model
aspect_ratio: Output aspect ratio
image_size: Output resolution
google_search: Enable Google Search grounding (Pro only)
Returns:
Tuple of (output path, optional text response)
"""
output = Path(output)
config = self._build_config(aspect_ratio, image_size, google_search)
response = self.client.models.generate_content(
model=model or self.model,
contents=[prompt],
config=config,
)
text = None
for part in response.parts:
if part.text:
text = part.text
elif part.inline_data:
part.as_image().save(output)
return output, text
def edit(
self,
input_image: str | Path | Image.Image,
instruction: str,
output: str | Path,
*,
model: Model | None = None,
aspect_ratio: AspectRatio | None = None,
image_size: ImageSize | None = None,
) -> tuple[Path, str | None]:
"""Edit an existing image.
Args:
input_image: Input image (path or PIL Image)
instruction: Edit instruction
output: Output file path
model: Override default model
aspect_ratio: Output aspect ratio
image_size: Output resolution
Returns:
Tuple of (output path, optional text response)
"""
output = Path(output)
if isinstance(input_image, (str, Path)):
input_image = Image.open(input_image)
config = self._build_config(aspect_ratio, image_size)
response = self.client.models.generate_content(
model=model or self.model,
contents=[instruction, input_image],
config=config,
)
text = None
for part in response.parts:
if part.text:
text = part.text
elif part.inline_data:
part.as_image().save(output)
return output, text
def compose(
self,
instruction: str,
images: list[str | Path | Image.Image],
output: str | Path,
*,
model: Model | None = None,
aspect_ratio: AspectRatio | None = None,
image_size: ImageSize | None = None,
) -> tuple[Path, str | None]:
"""Compose multiple images into one.
Args:
instruction: Composition instruction
images: List of input images (up to 14)
output: Output file path
model: Override default model (Pro recommended)
aspect_ratio: Output aspect ratio
image_size: Output resolution
Returns:
Tuple of (output path, optional text response)
"""
output = Path(output)
# Load images
loaded = []
for img in images:
if isinstance(img, (str, Path)):
loaded.append(Image.open(img))
else:
loaded.append(img)
config = self._build_config(aspect_ratio, image_size)
contents = [instruction] + loaded
response = self.client.models.generate_content(
model=model or self.PRO, # Pro recommended for composition
contents=contents,
config=config,
)
text = None
for part in response.parts:
if part.text:
text = part.text
elif part.inline_data:
part.as_image().save(output)
return output, text
def chat(self) -> "ImageChat":
"""Start an interactive chat session for iterative refinement."""
return ImageChat(self.client, self.model)
class ImageChat:
"""Multi-turn chat session for iterative image generation."""
def __init__(self, client: genai.Client, model: Model):
self.client = client
self.model = model
self._chat = client.chats.create(
model=model,
config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)
self.current_image: Image.Image | None = None
def send(
self,
message: str,
image: Image.Image | str | Path | None = None,
) -> tuple[Image.Image | None, str | None]:
"""Send a message and optionally an image.
Returns:
Tuple of (generated image or None, text response or None)
"""
contents = [message]
if image:
if isinstance(image, (str, Path)):
image = Image.open(image)
contents.append(image)
response = self._chat.send_message(contents)
text = None
img = None
for part in response.parts:
if part.text:
text = part.text
elif part.inline_data:
img = part.as_image()
self.current_image = img
return img, text
def reset(self):
"""Reset the chat session."""
self._chat = self.client.chats.create(
model=self.model,
config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)
self.current_image = None
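
A short usage sketch of the library surface defined above (prompts and filenames are made-up examples; GEMINI_API_KEY must be set):

    # Illustrative prompts and paths only.
    from gemini_images import GeminiImageGenerator

    gen = GeminiImageGenerator()  # reads GEMINI_API_KEY from the environment
    gen.generate("A watercolor harbor at dawn", "harbor.png", aspect_ratio="3:2")
    gen.edit("harbor.png", "Add a small red sailboat", "harbor_boat.png")

    # Iterative refinement via the chat helper
    chat = gen.chat()
    img, _ = chat.send("Make the sky more dramatic", image="harbor_boat.png")
    if img:
        img.save("harbor_final.png")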

generate_image.py
@@ -0,0 +1,133 @@
#!/usr/bin/env python3
"""
Generate images from text prompts using Gemini API.

Usage:
    python generate_image.py "prompt" output.png [--model MODEL] [--aspect RATIO] [--size SIZE]

Examples:
    python generate_image.py "A cat in space" cat.png
    python generate_image.py "A logo for Acme Corp" logo.png --model gemini-3-pro-image-preview --aspect 1:1
    python generate_image.py "Epic landscape" landscape.png --aspect 16:9 --size 2K

Environment:
    GEMINI_API_KEY - Required API key
"""

import argparse
import os
import sys

from google import genai
from google.genai import types


def generate_image(
    prompt: str,
    output_path: str,
    model: str = "gemini-2.5-flash-image",
    aspect_ratio: str | None = None,
    image_size: str | None = None,
) -> str | None:
    """Generate an image from a text prompt.

    Args:
        prompt: Text description of the image to generate
        output_path: Path to save the generated image
        model: Gemini model to use
        aspect_ratio: Aspect ratio (1:1, 16:9, 9:16, etc.)
        image_size: Resolution (1K, 2K, 4K - 4K only for pro model)

    Returns:
        Any text response from the model, or None
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise EnvironmentError("GEMINI_API_KEY environment variable not set")

    client = genai.Client(api_key=api_key)

    # Build config
    config_kwargs = {"response_modalities": ["TEXT", "IMAGE"]}
    image_config_kwargs = {}
    if aspect_ratio:
        image_config_kwargs["aspect_ratio"] = aspect_ratio
    if image_size:
        image_config_kwargs["image_size"] = image_size
    if image_config_kwargs:
        config_kwargs["image_config"] = types.ImageConfig(**image_config_kwargs)
    config = types.GenerateContentConfig(**config_kwargs)

    response = client.models.generate_content(
        model=model,
        contents=[prompt],
        config=config,
    )

    text_response = None
    image_saved = False
    for part in response.parts:
        if part.text is not None:
            text_response = part.text
        elif part.inline_data is not None:
            image = part.as_image()
            image.save(output_path)
            image_saved = True

    if not image_saved:
        raise RuntimeError("No image was generated. Check your prompt and try again.")

    return text_response


def main():
    parser = argparse.ArgumentParser(
        description="Generate images from text prompts using Gemini API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("prompt", help="Text prompt describing the image")
    parser.add_argument("output", help="Output file path (e.g., output.png)")
    parser.add_argument(
        "--model", "-m",
        default="gemini-2.5-flash-image",
        choices=["gemini-2.5-flash-image", "gemini-3-pro-image-preview"],
        help="Model to use (default: gemini-2.5-flash-image)",
    )
    parser.add_argument(
        "--aspect", "-a",
        choices=["1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"],
        help="Aspect ratio",
    )
    parser.add_argument(
        "--size", "-s",
        choices=["1K", "2K", "4K"],
        help="Image resolution (4K only available with pro model)",
    )
    args = parser.parse_args()

    try:
        text = generate_image(
            prompt=args.prompt,
            output_path=args.output,
            model=args.model,
            aspect_ratio=args.aspect,
            image_size=args.size,
        )
        print(f"Image saved to: {args.output}")
        if text:
            print(f"Model response: {text}")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
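
The same generation path can be driven from Python; a minimal sketch (prompt and output path are assumptions, GEMINI_API_KEY must be set):

    # Illustrative values; generate_image() is the function defined above.
    from generate_image import generate_image

    generate_image(
        prompt="Isometric pixel-art city block at night",
        output_path="city.png",
        aspect_ratio="1:1",
        image_size="2K",
    )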

multi_turn_chat.py
@@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Interactive multi-turn image generation and refinement using Gemini API.

Usage:
    python multi_turn_chat.py [--model MODEL] [--output-dir DIR]

This starts an interactive session where you can:
    - Generate images from prompts
    - Iteratively refine images through conversation
    - Load existing images for editing
    - Save images at any point

Commands:
    /save [filename] - Save current image
    /load <path>     - Load an image into the conversation
    /clear           - Start fresh conversation
    /quit            - Exit

Environment:
    GEMINI_API_KEY - Required API key
"""

import argparse
import os
import sys
from datetime import datetime
from pathlib import Path

from PIL import Image
from google import genai
from google.genai import types


class ImageChat:
    """Interactive chat session for image generation and refinement."""

    def __init__(
        self,
        model: str = "gemini-2.5-flash-image",
        output_dir: str = ".",
    ):
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise EnvironmentError("GEMINI_API_KEY environment variable not set")

        self.client = genai.Client(api_key=api_key)
        self.model = model
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.chat = None
        self.current_image = None
        self.image_count = 0
        self._init_chat()

    def _init_chat(self):
        """Initialize or reset the chat session."""
        config = types.GenerateContentConfig(
            response_modalities=["TEXT", "IMAGE"]
        )
        self.chat = self.client.chats.create(
            model=self.model,
            config=config,
        )
        self.current_image = None

    def send_message(
        self, message: str, image: Image.Image | None = None
    ) -> tuple[str | None, Image.Image | None]:
        """Send a message and optionally an image, return response text and image."""
        contents = []
        if message:
            contents.append(message)
        if image:
            contents.append(image)
        if not contents:
            return None, None

        response = self.chat.send_message(contents)

        text_response = None
        image_response = None
        for part in response.parts:
            if part.text is not None:
                text_response = part.text
            elif part.inline_data is not None:
                image_response = part.as_image()
                self.current_image = image_response
        return text_response, image_response

    def save_image(self, filename: str | None = None) -> str | None:
        """Save the current image to a file."""
        if self.current_image is None:
            return None
        if filename is None:
            self.image_count += 1
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"image_{timestamp}_{self.image_count}.png"
        filepath = self.output_dir / filename
        self.current_image.save(filepath)
        return str(filepath)

    def load_image(self, path: str) -> Image.Image:
        """Load an image from disk."""
        img = Image.open(path)
        self.current_image = img
        return img


def main():
    parser = argparse.ArgumentParser(
        description="Interactive multi-turn image generation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--model", "-m",
        default="gemini-2.5-flash-image",
        choices=["gemini-2.5-flash-image", "gemini-3-pro-image-preview"],
        help="Model to use",
    )
    parser.add_argument(
        "--output-dir", "-o",
        default=".",
        help="Directory to save images",
    )
    args = parser.parse_args()

    try:
        chat = ImageChat(model=args.model, output_dir=args.output_dir)
    except Exception as e:
        print(f"Error initializing: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"Gemini Image Chat ({args.model})")
    print("Commands: /save [name], /load <path>, /clear, /quit")
    print("-" * 50)

    while True:
        try:
            user_input = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nGoodbye!")
            break

        if not user_input:
            continue

        # Handle commands
        if user_input.startswith("/"):
            parts = user_input.split(maxsplit=1)
            cmd = parts[0].lower()
            arg = parts[1] if len(parts) > 1 else None

            if cmd == "/quit":
                print("Goodbye!")
                break
            elif cmd == "/clear":
                chat._init_chat()
                print("Conversation cleared.")
                continue
            elif cmd == "/save":
                path = chat.save_image(arg)
                if path:
                    print(f"Image saved to: {path}")
                else:
                    print("No image to save.")
                continue
            elif cmd == "/load":
                if not arg:
                    print("Usage: /load <path>")
                    continue
                try:
                    chat.load_image(arg)
                    print(f"Loaded: {arg}")
                    print("You can now describe edits to make.")
                except Exception as e:
                    print(f"Error loading image: {e}")
                continue
            else:
                print(f"Unknown command: {cmd}")
                continue

        # Send message to model
        try:
            # If we have a loaded image and this is the first message, include it
            image_to_send = None
            if chat.current_image and not chat.chat.get_history():
                image_to_send = chat.current_image

            text, image = chat.send_message(user_input, image_to_send)
            if text:
                print(f"\nGemini: {text}")
            if image:
                # Auto-save
                path = chat.save_image()
                print(f"\n[Image generated: {path}]")
        except Exception as e:
            print(f"\nError: {e}")


if __name__ == "__main__":
    main()
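
The ImageChat class above can also be scripted without the interactive loop; a hedged sketch (prompts and the output directory are illustrative):

    # Non-interactive use of the ImageChat class defined above.
    from multi_turn_chat import ImageChat

    session = ImageChat(model="gemini-2.5-flash-image", output_dir="out")
    session.send_message("A flat vector icon of a paper plane")
    session.send_message("Make the plane blue and add a motion trail")
    print(session.save_image("paper_plane.png"))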