# API Operations Reference
Guide for operating CocoIndex flows programmatically using Python APIs.
## Overview
CocoIndex flows can be operated through Python APIs, providing programmatic control over setup, updates, and queries. This is useful for embedding flows in applications, automating workflows, or building custom tools.
## Basic Setup
### Initialization
```python
from dotenv import load_dotenv
import cocoindex
# Load environment variables
load_dotenv()
# Initialize CocoIndex
cocoindex.init()
```
### Flow Definition
```python
@cocoindex.flow_def(name="MyFlow")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Flow definition
    pass
```
The decorator returns a `cocoindex.Flow` object that can be used for operations.
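For example, the decorated name can drive the operations described below directly:
```python
# `my_flow` is a cocoindex.Flow object, not a plain function.
my_flow.setup()           # Create persistent backends
stats = my_flow.update()  # Build/update target data
```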
## Flow Operations
### Setup Flow
Create persistent backends (tables, collections, etc.) for the flow.
```python
# Basic setup
my_flow.setup()
# With progress output
my_flow.setup(report_to_stdout=True)
# Async version
await my_flow.setup_async(report_to_stdout=True)
```
**When to use:**
- Before first update
- After modifying flow structure
- After dropping flow to recreate resources
### Setup All Flows
```python
# Setup all flows at once
cocoindex.setup_all_flows(report_to_stdout=True)
```
### Drop Flow
Remove all persistent backends owned by the flow.
```python
# Drop flow
my_flow.drop()
# With progress output
my_flow.drop(report_to_stdout=True)
# Async version
await my_flow.drop_async(report_to_stdout=True)
```
**Note:** After dropping, the Flow object is still valid and can be set up again.
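For example, a full rebuild cycle:
```python
my_flow.drop(report_to_stdout=True)   # Remove persistent backends
my_flow.setup(report_to_stdout=True)  # Recreate them
my_flow.update()                      # Reindex from current source data
```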
### Drop All Flows
```python
# Drop all flows
cocoindex.drop_all_flows(report_to_stdout=True)
```
### Close Flow
Remove flow from current process memory (doesn't affect persistent data).
```python
my_flow.close()
# After this, my_flow is invalid and should not be used
```
## Update Operations
### One-Time Update
Build or update target data based on current source data.
```python
# Basic update
stats = my_flow.update()
print(f"Processed {stats.total_rows} rows")
# With reexport (force reprocess even if unchanged)
stats = my_flow.update(reexport_targets=True)
# Async version
stats = await my_flow.update_async()
stats = await my_flow.update_async(reexport_targets=True)
```
**Returns:** Statistics about processed data
**Note:** Multiple calls to `update()` can run simultaneously. CocoIndex will automatically combine them efficiently.
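As a minimal sketch, overlapping updates can be issued through the async API (`asyncio.gather` is standard library; the combining behavior is CocoIndex's, as noted above):
```python
import asyncio

async def refresh_twice():
    # Two concurrent one-time updates on the same flow.
    stats_a, stats_b = await asyncio.gather(
        my_flow.update_async(),
        my_flow.update_async(),
    )
    print(stats_a, stats_b)
```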
### Live Update
Continuously monitor source changes and update targets.
```python
import cocoindex
# Create live updater
updater = cocoindex.FlowLiveUpdater(
    my_flow,
    cocoindex.FlowLiveUpdaterOptions(
        live_mode=True,          # Enable live updates
        print_stats=True,        # Print progress
        reexport_targets=False,  # If True, reexport targets on the initial update only
    )
)
# Start the updater
updater.start()
# Your application logic here
# (updater runs in background threads)
# Wait for completion
updater.wait()
# Print final stats
print(updater.update_stats())
```
#### As Context Manager
```python
with cocoindex.FlowLiveUpdater(my_flow) as updater:
    # Updater starts automatically
    # Your application logic here
    pass
# Updater aborts and waits automatically on exit

# Async version
async with cocoindex.FlowLiveUpdater(my_flow) as updater:
    # Your application logic
    pass
```
#### Monitoring Status Updates
```python
updater = cocoindex.FlowLiveUpdater(my_flow)
updater.start()
while True:
    # Block until the next status update
    updates = updater.next_status_updates()
    # Check which sources were updated
    for source in updates.updated_sources:
        print(f"Source {source} has new data")
        # Trigger downstream operations
    # Check if the updater has stopped
    if not updates.active_sources:
        print("All sources stopped")
        break

# Async version
while True:
    updates = await updater.next_status_updates_async()
    # ... same logic
```
#### Control Methods
```python
# Start updater
updater.start()
await updater.start_async()
# Abort updater
updater.abort()
# Wait for completion
updater.wait()
await updater.wait_async()
# Get current stats
stats = updater.update_stats()
```
## Evaluate Flow
Run transformations without updating targets (for testing).
```python
# Evaluate and dump results
my_flow.evaluate_and_dump(
    cocoindex.EvaluateAndDumpOptions(
        output_dir="./eval_output",
        use_cache=True,  # Use existing cache (but don't update it)
    )
)
```
**Use cases:**
- Testing flow logic
- Debugging transformations
- Inspecting intermediate data
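After a run, the dumped files can be inspected directly; a minimal sketch using only the standard library (it assumes nothing about the file format beyond the output directory):
```python
from pathlib import Path

# List whatever evaluate_and_dump wrote to the output directory.
for path in sorted(Path("./eval_output").iterdir()):
    print(path.name, path.stat().st_size, "bytes")
```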
## Query Operations
### Transform Flows
Transform flows enable reusable transformation logic for both indexing and querying.
```python
from numpy.typing import NDArray
import numpy as np
# Define transform flow
@cocoindex.transform_flow()
def text_to_embedding(
    text: cocoindex.DataSlice[str],
) -> cocoindex.DataSlice[NDArray[np.float32]]:
    """Convert text to an embedding vector."""
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )

# Use in an indexing flow
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder, data_scope):
    # ... set up source ...
    with data_scope["documents"].row() as doc:
        doc["embedding"] = text_to_embedding(doc["content"])
    # ... collect and export ...

# Use for querying (evaluate with a concrete input)
query_embedding = text_to_embedding.eval("search query text")
# query_embedding is now a numpy array
```
### Query Handlers
Attach query logic to flows for easy query execution.
```python
import functools
import os

from psycopg_pool import ConnectionPool
from pgvector.psycopg import register_vector

@functools.cache
def connection_pool():
    return ConnectionPool(os.environ["COCOINDEX_DATABASE_URL"])

# Register query handler
@my_flow.query_handler(
    result_fields=cocoindex.QueryHandlerResultFields(
        embedding=["embedding"],  # Field name(s) containing embeddings
        score="score",            # Field name for the similarity score
    )
)
def search(query: str) -> cocoindex.QueryOutput:
    """Search for documents matching the query."""
    # Get the table name for this flow's export
    table_name = cocoindex.utils.get_target_default_name(my_flow, "doc_embeddings")
    # Compute the query embedding using the transform flow
    query_vector = text_to_embedding.eval(query)
    # Execute the query
    with connection_pool().connection() as conn:
        register_vector(conn)
        with conn.cursor() as cur:
            cur.execute(
                f"""
                SELECT filename, text, embedding, embedding <=> %s AS distance
                FROM {table_name}
                ORDER BY distance
                LIMIT 10
                """,
                (query_vector,),
            )
            return cocoindex.QueryOutput(
                query_info=cocoindex.QueryInfo(
                    embedding=query_vector,
                    similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
                ),
                results=[
                    {
                        "filename": row[0],
                        "text": row[1],
                        "embedding": row[2],
                        "score": 1.0 - row[3],  # Convert distance to similarity
                    }
                    for row in cur.fetchall()
                ],
            )

# Call the query handler
results = search("machine learning algorithms")
for result in results.results:
    print(f"[{result['score']:.3f}] {result['filename']}: {result['text']}")
```
### Query with Qdrant
```python
from qdrant_client import QdrantClient
import functools

@functools.cache
def get_qdrant_client():
    return QdrantClient(url="http://localhost:6334", prefer_grpc=True)

@my_flow.query_handler(
    result_fields=cocoindex.QueryHandlerResultFields(
        embedding=["embedding"],
        score="score",
    )
)
def search_qdrant(query: str) -> cocoindex.QueryOutput:
    client = get_qdrant_client()
    # Get the query embedding
    query_embedding = text_to_embedding.eval(query)
    # Search Qdrant
    search_results = client.search(
        collection_name="my_collection",
        query_vector=("text_embedding", query_embedding),
        limit=10,
    )
    return cocoindex.QueryOutput(
        query_info=cocoindex.QueryInfo(
            embedding=query_embedding,
            similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
        ),
        results=[
            {
                "text": result.payload["text"],
                "embedding": result.vector,
                "score": result.score,
            }
            for result in search_results
        ],
    )
```
## Application Integration Patterns
### Pattern 1: Simple Application with Update
```python
from dotenv import load_dotenv
import cocoindex

# Initialize
load_dotenv()
cocoindex.init()

# Define flow
@cocoindex.flow_def(name="MyApp")
def my_app_flow(flow_builder, data_scope):
    # ... flow definition ...
    pass

def main():
    # Ensure the flow is set up and data is fresh
    stats = my_app_flow.update()
    print(f"Updated index: {stats}")
    # Run application logic
    while True:
        query = input("Search: ")
        if not query:
            break
        results = search(query)
        for result in results.results:
            print(f"  {result['score']:.3f}: {result['text']}")

if __name__ == "__main__":
    main()
```
### Pattern 2: Web Application with Live Updates
```python
from fastapi import FastAPI
import cocoindex
from dotenv import load_dotenv

load_dotenv()
cocoindex.init()

@cocoindex.flow_def(name="WebAppFlow")
def web_app_flow(flow_builder, data_scope):
    # ... flow definition ...
    pass

# Create FastAPI app
app = FastAPI()

# Global updater
updater = None

@app.on_event("startup")
async def startup():
    global updater
    # Start the live updater in the background
    updater = cocoindex.FlowLiveUpdater(
        web_app_flow,
        cocoindex.FlowLiveUpdaterOptions(live_mode=True, print_stats=True),
    )
    await updater.start_async()
    print("Live updater started")

@app.on_event("shutdown")
async def shutdown():
    global updater
    if updater:
        updater.abort()
        await updater.wait_async()
        print("Live updater stopped")

@app.get("/search")
async def search_endpoint(q: str):
    results = search(q)
    return {
        "query": q,
        "results": results.results,
    }
```
### Pattern 3: Batch Processing
```python
import cocoindex
from dotenv import load_dotenv

load_dotenv()
cocoindex.init()

@cocoindex.flow_def(name="BatchProcessor")
def batch_flow(flow_builder, data_scope):
    # ... flow definition ...
    pass

def process_batch():
    """Run as a scheduled job (cron, etc.)."""
    # Setup if needed (no-op if already set up)
    batch_flow.setup()
    # Run the update
    stats = batch_flow.update()
    # Log results
    print(f"Batch completed: {stats.total_rows} rows processed")
    return stats

if __name__ == "__main__":
    process_batch()
```
### Pattern 4: React to Updates
```python
import cocoindex

@cocoindex.flow_def(name="ReactiveFlow")
def reactive_flow(flow_builder, data_scope):
    # ... flow definition ...
    pass

async def run_with_reactions():
    """Monitor updates and trigger downstream actions."""
    async with cocoindex.FlowLiveUpdater(reactive_flow) as updater:
        while True:
            updates = await updater.next_status_updates_async()
            # React to specific source updates
            if "products" in updates.updated_sources:
                await rebuild_product_index()
            if "customers" in updates.updated_sources:
                await refresh_customer_cache()
            # Exit when the updater stops
            if not updates.active_sources:
                break

async def rebuild_product_index():
    print("Rebuilding product index...")
    # Custom logic

async def refresh_customer_cache():
    print("Refreshing customer cache...")
    # Custom logic
```
## Error Handling
### Handling Update Errors
```python
try:
    stats = my_flow.update()
except cocoindex.CocoIndexError as e:
    print(f"Update failed: {e}")
    # Handle the error (log, retry, alert, etc.)
```
### Graceful Shutdown
```python
import signal
import sys

updater = None

def signal_handler(sig, frame):
    print("Shutting down gracefully...")
    if updater:
        updater.abort()
        updater.wait()
    print("Shutdown complete")
    sys.exit(0)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

updater = cocoindex.FlowLiveUpdater(my_flow)
updater.start()
updater.wait()
```
## Best Practices
1. **Always call `cocoindex.init()`** - Initialize before using any other CocoIndex APIs
2. **Load environment variables** - Use dotenv or similar to load configuration
3. **Use context managers** - For live updaters, to ensure cleanup
4. **Cache expensive resources** - Use `@functools.cache` for database pools and clients
5. **Handle signals** - Gracefully shut down live updaters on SIGINT/SIGTERM
6. **Separate concerns** - Keep flow definitions, queries, and application logic separate
7. **Use transform flows** - Share logic between indexing and querying
8. **Monitor update stats** - Log and track processing statistics (see the sketch after this list)
9. **Test with evaluate** - Use `evaluate_and_dump` for testing before updates
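For item 8, a minimal logging sketch (standard-library `logging`; `total_rows` is the stats field shown earlier):
```python
import logging

logging.basicConfig(level=logging.INFO)

stats = my_flow.update()
logging.info("Flow update finished: %s rows processed", stats.total_rows)
```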

# CLI Operations Reference
Complete guide for operating CocoIndex flows using the CLI.
## Overview
The CocoIndex CLI (`cocoindex` command) provides tools for managing and inspecting flows. Most commands require an `APP_TARGET` argument specifying where flow definitions are located.
## Environment Setup
### Environment Variables
Create a `.env` file in the project directory:
```bash
# Database connection (required)
COCOINDEX_DATABASE_URL=postgresql://user:password@localhost/cocoindex_db
# Optional: App namespace for organizing flows
COCOINDEX_APP_NAMESPACE=dev
# Optional: Global concurrency limits
COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS=50
COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES=524288000 # 500MB
# Optional: LLM API keys (if using LLM functions)
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-...
VOYAGE_API_KEY=pa-...
```
### Loading Environment Files
```bash
# Default: loads .env from current directory
cocoindex <command> ...
# Specify custom env file
cocoindex --env-file path/to/.env <command> ...
# Specify app directory
cocoindex --app-dir /path/to/project <command> ...
```
## APP_TARGET Format
The `APP_TARGET` tells the CLI where flow definitions are located:
### Python Module
```bash
# Load from module name
cocoindex update main
# Load from package module
cocoindex update my_package.flows
```
### Python File
```bash
# Load from file path
cocoindex update main.py
# Load from nested file
cocoindex update path/to/flows.py
```
### Specific Flow
```bash
# Target specific flow in module
cocoindex update main:MyFlowName
# Target specific flow in file
cocoindex update path/to/flows.py:MyFlowName
```
## Core Commands
### setup - Initialize Flow Resources
Create all persistent backends needed by flows (database tables, collections, etc.).
```bash
# Setup all flows
cocoindex setup main.py
# Setup specific flow
cocoindex setup main.py:MyFlow
```
**What it does:**
- Creates internal storage tables in Postgres
- Creates target resources (database tables, vector collections, graph structures)
- Updates schemas if flow definition changed
- No-op if already set up and no changes needed
**When to use:**
- First time running a flow
- After modifying flow structure (new fields, new targets)
- After dropping flows to recreate resources
### update - Build/Update Target Data
Run transformations and update target data based on current source data.
```bash
# One-time update
cocoindex update main.py
# One-time update with setup
cocoindex update --setup main.py
# One-time update specific flow
cocoindex update main.py:TextEmbedding
# Force reexport even if no changes
cocoindex update --reexport main.py
```
**What it does:**
- Reads source data
- Applies transformations
- Updates target databases
- Uses incremental processing (only processes changed data)
**Options:**
- `--setup` - Run setup first if needed
- `--reexport` - Reexport all data even if unchanged (useful after data loss)
### update -L - Live Update Mode
Continuously monitor source changes and update targets.
```bash
# Live update mode
cocoindex update main.py -L
# Live update with setup
cocoindex update --setup main.py -L
# Live update with reexport on initial update
cocoindex update --reexport main.py -L
```
**What it does:**
- Performs initial one-time update
- Continuously monitors source changes
- Automatically processes updates
- Runs until aborted (Ctrl-C)
**Requires:**
- At least one source with change capture enabled:
- `refresh_interval` parameter on source
- Source-specific change capture (Postgres notifications, S3 events, etc.)
**Example with refresh interval:**
```python
data_scope["documents"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="documents"),
refresh_interval=datetime.timedelta(minutes=1) # Check every minute
)
```
### drop - Remove Flow Resources
Remove all persistent backends owned by flows.
```bash
# Drop all flows
cocoindex drop main.py
# Drop specific flow
cocoindex drop main.py:MyFlow
```
**What it does:**
- Drops internal storage tables
- Drops target resources (tables, collections, graphs)
- Cleans up all persistent data
**Warning:** This is destructive and cannot be undone!
### show - Inspect Flow Definition
Display flow structure and statistics.
```bash
# Show flow structure
cocoindex show main.py:MyFlow
# Show all flows
cocoindex show main.py
```
**What it shows:**
- Flow name and structure
- Sources configured
- Transformations defined
- Targets and their schemas
- Current statistics (if flow is set up)
### evaluate - Test Flow Without Updating
Run transformations and dump results to files without updating targets.
```bash
# Evaluate flow
cocoindex evaluate main.py:MyFlow
# Specify output directory
cocoindex evaluate main.py:MyFlow --output-dir ./eval_results
# Disable cache
cocoindex evaluate main.py:MyFlow --no-cache
```
**What it does:**
- Runs transformations
- Saves results to files (JSON, CSV, etc.)
- Does NOT update targets
- Uses existing cache by default
**When to use:**
- Testing flow logic before running full update
- Debugging transformation issues
- Inspecting intermediate data
- Validating output format
**Options:**
- `--output-dir PATH` - Directory for output files (default: `eval_{flow_name}_{timestamp}`)
- `--no-cache` - Disable reading from cache (still doesn't write to cache)
## Complete Workflow Examples
### First-Time Setup and Indexing
```bash
# 1. Setup flow resources
cocoindex setup main.py
# 2. Run initial indexing
cocoindex update main.py
# 3. Verify results
cocoindex show main.py
```
### Development Workflow
```bash
# 1. Test with evaluate (no side effects)
cocoindex evaluate main.py:MyFlow --output-dir ./test_output
# 2. If looks good, setup and update
cocoindex update --setup main.py:MyFlow
# 3. Check results
cocoindex show main.py:MyFlow
```
### Production Live Updates
```bash
# Run with live updates and auto-setup
cocoindex update --setup main.py -L
```
### Rebuild After Changes
```bash
# Drop old resources
cocoindex drop main.py
# Setup with new definition
cocoindex setup main.py
# Reindex everything
cocoindex update --reexport main.py
```
### Multiple Flows
```bash
# Setup all flows
cocoindex setup main.py
# Update specific flows
cocoindex update main.py:CodeEmbedding
cocoindex update main.py:DocumentEmbedding
# Show all flows
cocoindex show main.py
```
## Common Issues and Solutions
### Issue: "Flow not found"
**Problem:** CLI can't find the flow definition.
**Solutions:**
```bash
# Make sure APP_TARGET is correct
cocoindex show main.py # Should list flows
# Use --app-dir if not in project root
cocoindex --app-dir /path/to/project show main.py
# Check flow name is correct
cocoindex show main.py:CorrectFlowName
```
### Issue: "Database connection failed"
**Problem:** Can't connect to Postgres.
**Solutions:**
```bash
# Check .env file exists
cat .env | grep COCOINDEX_DATABASE_URL
# Test connection
psql $COCOINDEX_DATABASE_URL
# Use --env-file if .env is elsewhere
cocoindex --env-file /path/to/.env update main.py
```
### Issue: "Schema mismatch"
**Problem:** Flow definition changed but resources not updated.
**Solution:**
```bash
# Re-run setup to update schemas
cocoindex setup main.py
# Then update data
cocoindex update main.py
```
### Issue: "Live update exits immediately"
**Problem:** No change capture mechanisms enabled.
**Solution:**
Add refresh_interval or use source-specific change capture:
```python
data_scope["docs"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="docs"),
refresh_interval=datetime.timedelta(seconds=30) # Add this
)
```
## Advanced Options
### Global Options
```bash
# Show version
cocoindex --version
# Show help
cocoindex --help
cocoindex update --help
# Specify app directory
cocoindex --app-dir /custom/path update main
# Custom env file
cocoindex --env-file prod.env update main
```
### Performance Tuning
Set environment variables for concurrency:
```bash
# In .env file
COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS=100
COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES=1073741824 # 1GB
```
Or per-source in code:
```python
data_scope["docs"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="docs"),
max_inflight_rows=50,
max_inflight_bytes=500*1024*1024 # 500MB
)
```
## Best Practices
1. **Use evaluate before update** - Test flow logic without side effects
2. **Always setup before first update** - Or use `--setup` flag
3. **Use live updates in production** - Keeps targets always fresh
4. **Set app namespace** - Organize flows across environments (dev/staging/prod)
5. **Monitor with show** - Regularly check flow statistics
6. **Version control .env.example** - Document required environment variables
7. **Use specific flow targets** - For selective updates: `main.py:FlowName`
8. **Setup after definition changes** - Ensures schemas match flow definition
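For item 4, one common arrangement is a namespace per environment in separate env files (a sketch; the file names are illustrative):
```bash
# dev.env
COCOINDEX_APP_NAMESPACE=dev

# prod.env
COCOINDEX_APP_NAMESPACE=prod

# Point the CLI at the right environment
cocoindex --env-file dev.env update main.py
cocoindex --env-file prod.env update main.py
```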

# Custom Functions Reference
Complete guide for creating custom functions in CocoIndex.
## Overview
Custom functions allow creating data transformation logic that can be used within flows. There are two approaches:
1. **Standalone function** - Simple, no configuration or setup logic
2. **Function spec + executor** - Advanced, with configuration and setup logic
## Standalone Functions
Use for simple transformations that don't need configuration or setup.
### Basic Example
```python
@cocoindex.op.function(behavior_version=1)
def compute_word_count(text: str) -> int:
    """Count words in text."""
    return len(text.split())
```
**Requirements:**
- Decorate with `@cocoindex.op.function()`
- Type annotations required for all arguments and return value
- Supports basic types, structs, tables, and numpy arrays
### With Optional Parameters
```python
@cocoindex.op.function(behavior_version=1)
def extract_info(content: str, filename: str, max_length: int | None = None) -> dict:
    """
    Extract information from content.

    Args:
        content: The document content
        filename: Source filename
        max_length: Optional maximum length for truncation
    """
    info = {
        "filename": filename,
        "length": len(content),
        "word_count": len(content.split()),
    }
    if max_length and len(content) > max_length:
        info["truncated"] = True
    return info
```
### Using in Flows
```python
@cocoindex.flow_def(name="MyFlow")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )
    collector = data_scope.add_collector()
    with data_scope["documents"].row() as doc:
        # Use the standalone function
        doc["word_count"] = doc["content"].transform(compute_word_count)
        # With additional arguments
        doc["info"] = doc["content"].transform(
            extract_info,
            filename=doc["filename"],
            max_length=1000,
        )
        collector.collect(
            filename=doc["filename"],
            word_count=doc["word_count"],
            info=doc["info"],
        )
    collector.export("documents", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```
## Function Spec + Executor
Use for functions that need configuration or setup logic (e.g., loading models).
### Basic Structure
```python
# 1. Define the function spec (configuration)
class ComputeSomething(cocoindex.op.FunctionSpec):
    """
    Configuration for the ComputeSomething function.
    """
    param1: str
    param2: int = 10  # Optional with default

# 2. Define the executor (implementation)
@cocoindex.op.executor_class(behavior_version=1)
class ComputeSomethingExecutor:
    spec: ComputeSomething  # Required: link to the spec

    def prepare(self) -> None:
        """
        Optional: Setup logic run once before execution.
        Use for loading models, establishing connections, etc.
        """
        # Setup based on self.spec
        pass

    def __call__(self, input_data: str) -> dict:
        """
        Required: Execute the function for each data row.
        All arguments and the return value must have type annotations.
        """
        # Use self.spec.param1, self.spec.param2
        return {"result": f"{input_data}-{self.spec.param1}"}
```
### Example: Custom Embedding Function
```python
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy.typing import NDArray
class CustomEmbed(cocoindex.op.FunctionSpec):
    """
    Embed text using a specified SentenceTransformer model.
    """
    model_name: str
    normalize: bool = True

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class CustomEmbedExecutor:
    spec: CustomEmbed
    model: SentenceTransformer | None = None

    def prepare(self) -> None:
        """Load the model once during initialization."""
        self.model = SentenceTransformer(self.spec.model_name)

    def __call__(self, text: str) -> NDArray[np.float32]:
        """Embed the input text."""
        assert self.model is not None
        embedding = self.model.encode(text, normalize_embeddings=self.spec.normalize)
        return embedding.astype(np.float32)

# Usage in a flow
@cocoindex.flow_def(name="CustomEmbedFlow")
def custom_embed_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )
    collector = data_scope.add_collector()
    with data_scope["documents"].row() as doc:
        doc["embedding"] = doc["content"].transform(
            CustomEmbed(
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                normalize=True,
            )
        )
        collector.collect(
            text=doc["content"],
            embedding=doc["embedding"],
        )
    collector.export("embeddings", cocoindex.targets.Postgres(), primary_key_fields=["text"])
```
### Example: PDF Processing
```python
import pymupdf # PyMuPDF
class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """
    Convert PDF to markdown.
    """
    extract_images: bool = False
    page_range: tuple[int, int] | None = None  # (start, end) pages

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    spec: PdfToMarkdown

    def __call__(self, pdf_bytes: bytes) -> str:
        """Convert PDF bytes to markdown text."""
        doc = pymupdf.Document(stream=pdf_bytes, filetype="pdf")
        # Determine the page range
        start = 0
        end = doc.page_count
        if self.spec.page_range:
            start, end = self.spec.page_range
            start = max(0, start)
            end = min(doc.page_count, end)
        markdown_parts = []
        for page_num in range(start, end):
            page = doc[page_num]
            text = page.get_text()
            markdown_parts.append(f"# Page {page_num + 1}\n\n{text}")
        return "\n\n".join(markdown_parts)

# Usage
@cocoindex.flow_def(name="PdfFlow")
def pdf_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["pdfs"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="pdfs", included_patterns=["*.pdf"])
    )
    collector = data_scope.add_collector()
    with data_scope["pdfs"].row() as pdf:
        pdf["markdown"] = pdf["content"].transform(
            PdfToMarkdown(extract_images=False, page_range=(0, 10))
        )
        collector.collect(
            filename=pdf["filename"],
            markdown=pdf["markdown"],
        )
    collector.export("pdf_text", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```
## Function Parameters
Both standalone functions and executors support these parameters:
### cache (bool)
Enable caching of function results for reuse during reprocessing.
```python
@cocoindex.op.function(cache=True, behavior_version=1)
def expensive_computation(text: str) -> dict:
    # Computationally intensive operation
    return {"result": analyze(text)}
```
**When to use:**
- Functions that are computationally expensive
- LLM API calls
- Model inference
- External API calls
### behavior_version (int)
Required when `cache=True`. Increment this when function behavior changes to invalidate cache.
```python
@cocoindex.op.function(cache=True, behavior_version=2) # Incremented from 1
def improved_analysis(text: str) -> dict:
    # Updated algorithm - cached results need to be recomputed
    return {"result": new_analysis(text)}
```
### gpu (bool)
Indicates the function uses GPU resources, affecting scheduling.
```python
@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class GpuModelExecutor:
    spec: GpuModel

    def prepare(self) -> None:
        self.model = load_model_on_gpu(self.spec.model_name)

    def __call__(self, text: str) -> NDArray[np.float32]:
        return self.model.predict(text)
```
### arg_relationship
Specifies metadata about argument relationships for tools like CocoInsight.
```python
@cocoindex.op.function(
    cache=True,
    behavior_version=1,
    arg_relationship=(cocoindex.ArgRelationship.CHUNKS_BASE_TEXT, "content")
)
def custom_chunker(content: str, chunk_size: int) -> list[dict]:
    """
    Chunks are derived from the 'content' argument.
    The first element of each chunk dict must be a Range type.
    """
    # Return a list of chunks with location ranges
    return [
        {"location": cocoindex.Range(...), "text": chunk}
        for chunk in split_content(content, chunk_size)
    ]
```
**Supported relationships:**
- `ArgRelationship.CHUNKS_BASE_TEXT` - Output is chunks of input text
- `ArgRelationship.EMBEDDING_ORIGIN_TEXT` - Output is embedding of input text
- `ArgRelationship.RECTS_BASE_IMAGE` - Output is rectangles on input image
## Supported Data Types
Functions can use these types for arguments and return values:
### Basic Types
- `str` - Text
- `int` - Integer (maps to Int64)
- `float` - Float (maps to Float64)
- `bool` - Boolean
- `bytes` - Binary data
- `None` / `type(None)` - Null value
### Collection Types
- `list[T]` - List of type T
- `dict[str, T]` - Dictionary (becomes Struct)
- `cocoindex.Json` - Arbitrary JSON
### Numpy Types
- `NDArray[np.float32]` - Vector[Float32, N]
- `NDArray[np.float64]` - Vector[Float64, N]
- `NDArray[np.int32]` - Vector[Int32, N]
- `NDArray[np.int64]` - Vector[Int64, N]
### CocoIndex Types
- `cocoindex.Range` - Text range with location info
- Dataclasses - Become Struct types
### Optional Types
- `T | None` or `Optional[T]` - Optional value
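As a small sketch combining several of these (the dataclass and field names are illustrative):
```python
import dataclasses

@dataclasses.dataclass
class DocStats:  # Dataclasses become Struct types
    word_count: int
    language: str | None  # Optional value

@cocoindex.op.function(behavior_version=1)
def doc_stats(text: str) -> DocStats:
    """Basic type in, Struct out."""
    return DocStats(word_count=len(text.split()), language=None)
```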
### Table Types (Output only)
Functions can return table-like data using dataclasses:
```python
import dataclasses

@dataclasses.dataclass
class Chunk:
    location: cocoindex.Range
    text: str

@cocoindex.op.function(behavior_version=1)
def chunk_text(content: str) -> list[Chunk]:
    """Returns a list representing a table."""
    return [
        Chunk(location=..., text=chunk)
        for chunk in split_content(content)
    ]
```
## Common Patterns
### Pattern: LLM-based Extraction
```python
from openai import OpenAI
class ExtractStructuredInfo(cocoindex.op.FunctionSpec):
    """Extract structured information using an LLM."""
    model: str = "gpt-4"
    system_prompt: str = "Extract key information from the text."

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class ExtractStructuredInfoExecutor:
    spec: ExtractStructuredInfo
    client: OpenAI | None = None

    def prepare(self) -> None:
        self.client = OpenAI()  # Uses the OPENAI_API_KEY env var

    def __call__(self, text: str) -> dict:
        assert self.client is not None
        response = self.client.chat.completions.create(
            model=self.spec.model,
            messages=[
                {"role": "system", "content": self.spec.system_prompt},
                {"role": "user", "content": text},
            ],
        )
        # Parse and return structured data
        return {"extracted": response.choices[0].message.content}
```
### Pattern: External API Call
```python
import requests
class FetchEnrichmentData(cocoindex.op.FunctionSpec):
    """Fetch enrichment data from an external API."""
    api_endpoint: str
    api_key: str

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class FetchEnrichmentDataExecutor:
    spec: FetchEnrichmentData

    def __call__(self, entity_id: str) -> dict:
        response = requests.get(
            f"{self.spec.api_endpoint}/entities/{entity_id}",
            headers={"Authorization": f"Bearer {self.spec.api_key}"},
        )
        response.raise_for_status()
        return response.json()
```
### Pattern: Multi-step Processing
```python
class ProcessDocument(cocoindex.op.FunctionSpec):
    """Process a document through multiple steps."""
    min_quality_score: float = 0.7

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class ProcessDocumentExecutor:
    spec: ProcessDocument
    nlp_model = None

    def prepare(self) -> None:
        import spacy
        self.nlp_model = spacy.load("en_core_web_sm")

    def __call__(self, text: str) -> dict:
        # Step 1: Clean the text
        cleaned = self._clean_text(text)
        # Step 2: Extract entities
        doc = self.nlp_model(cleaned)
        entities = [ent.text for ent in doc.ents]
        # Step 3: Quality check
        quality_score = self._compute_quality(cleaned)
        return {
            "cleaned_text": cleaned if quality_score >= self.spec.min_quality_score else None,
            "entities": entities,
            "quality_score": quality_score,
        }

    def _clean_text(self, text: str) -> str:
        # Cleaning logic
        return text.strip()

    def _compute_quality(self, text: str) -> float:
        # Quality scoring logic
        return len(text) / 1000.0
```
## Best Practices
1. **Use caching for expensive operations** - Enable `cache=True` for LLM calls, model inference, or external APIs
2. **Type annotations required** - All arguments and return types must be annotated
3. **Increment behavior_version** - When changing cached function logic, increment version to invalidate cache
4. **Use `prepare()` for initialization** - Load models and establish connections once in `prepare()`
5. **Keep functions focused** - Each function should do one thing well
6. **Document parameters** - Use docstrings to explain function purpose and parameters
7. **Handle errors gracefully** - Consider edge cases and invalid inputs
8. **Use appropriate return types** - Match return types to target schema needs

# CocoIndex Flow Patterns
This reference provides common patterns and examples for building CocoIndex flows.
## Basic Flow Pattern
```python
import cocoindex

@cocoindex.flow_def(name="FlowName")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # 1. Import source data
    data_scope["source_data"] = flow_builder.add_source(...)
    # 2. Create collectors for output
    my_collector = data_scope.add_collector()
    # 3. Transform data
    with data_scope["source_data"].row() as item:
        item["transformed"] = item["field"].transform(...)
        my_collector.collect(...)
    # 4. Export to a target
    my_collector.export("target_name", ..., primary_key_fields=[...])
```
## Common Flow Patterns
### Pattern 1: Simple Text Embedding
Embed documents from local files into a vector database.
```python
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Import documents
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )
    doc_embeddings = data_scope.add_collector()
    with data_scope["documents"].row() as doc:
        # Split into chunks
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown",
            chunk_size=2000,
            chunk_overlap=500,
        )
        with doc["chunks"].row() as chunk:
            # Embed each chunk
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )
            doc_embeddings.collect(
                id=cocoindex.GeneratedField.UUID,
                filename=doc["filename"],
                text=chunk["text"],
                embedding=chunk["embedding"],
            )
    # Export to Postgres with a vector index
    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["id"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
            )
        ],
    )
```
### Pattern 2: Code Embedding with Language Detection
```python
@cocoindex.flow_def(name="CodeEmbedding")
def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["files"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(
            path=".",
            included_patterns=["*.py", "*.rs", "*.md"],
            excluded_patterns=["**/.*", "target", "**/node_modules"],
        )
    )
    code_embeddings = data_scope.add_collector()
    with data_scope["files"].row() as file:
        # Detect the language
        file["language"] = file["filename"].transform(
            cocoindex.functions.DetectProgrammingLanguage()
        )
        # Split using language-aware chunking
        file["chunks"] = file["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language=file["language"],
            chunk_size=1000,
            chunk_overlap=300,
        )
        with file["chunks"].row() as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )
            code_embeddings.collect(
                filename=file["filename"],
                location=chunk["location"],
                code=chunk["text"],
                embedding=chunk["embedding"],
                start=chunk["start"],
                end=chunk["end"],
            )
    code_embeddings.export(
        "code_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
            )
        ],
    )
```
### Pattern 3: LLM-based Extraction to Knowledge Graph
Extract structured information using LLMs and build a knowledge graph.
```python
import dataclasses

@dataclasses.dataclass
class ProductInfo:
    id: str
    title: str
    price: float

@dataclasses.dataclass
class Taxonomy:
    name: str

@cocoindex.flow_def(name="ProductGraph")
def product_graph_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Set up the Neo4j connection
    neo4j_conn = cocoindex.add_auth_entry(
        "Neo4jConnection",
        cocoindex.targets.Neo4jConnection(
            uri="bolt://localhost:7687",
            user="neo4j",
            password="password",
        ),
    )
    data_scope["products"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="products", included_patterns=["*.json"])
    )
    product_nodes = data_scope.add_collector()
    product_taxonomy = data_scope.add_collector()
    with data_scope["products"].row() as product:
        # Parse JSON and extract info
        data = product["content"].transform(
            cocoindex.functions.ParseJson()
        )
        # Use an LLM to extract taxonomies
        taxonomy = data["description"].transform(
            cocoindex.functions.ExtractByLlm(
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.OPENAI,
                    model="gpt-4",
                ),
                output_type=list[Taxonomy],
            )
        )
        product_nodes.collect(
            id=data["id"],
            title=data["title"],
            price=data["price"],
        )
        with taxonomy.row() as t:
            product_taxonomy.collect(
                id=cocoindex.GeneratedField.UUID,
                product_id=data["id"],
                taxonomy=t["name"],
            )
    # Export product nodes
    product_nodes.export(
        "product_node",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Nodes(label="Product"),
        ),
        primary_key_fields=["id"],
    )
    # Declare taxonomy nodes
    flow_builder.declare(
        cocoindex.targets.Neo4jDeclaration(
            connection=neo4j_conn,
            nodes_label="Taxonomy",
            primary_key_fields=["value"],
        )
    )
    # Export relationships
    product_taxonomy.export(
        "product_taxonomy",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Relationships(
                rel_type="HAS_TAXONOMY",
                source=cocoindex.targets.NodeFromFields(
                    label="Product",
                    fields=[cocoindex.targets.TargetFieldMapping(source="product_id", target="id")],
                ),
                target=cocoindex.targets.NodeFromFields(
                    label="Taxonomy",
                    fields=[cocoindex.targets.TargetFieldMapping(source="taxonomy", target="value")],
                ),
            ),
        ),
        primary_key_fields=["id"],
    )
```
### Pattern 4: Live Updates with Refresh Interval
```python
import datetime

@cocoindex.flow_def(name="LiveDataFlow")
def live_data_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Add a source with a refresh interval
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="live_documents"),
        refresh_interval=datetime.timedelta(minutes=1),  # Refresh every minute
    )
    # ... rest of the flow definition
```
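To actually run this in live mode, use the CLI's `-L` flag or a `FlowLiveUpdater`, both covered in the operations references. For example:
```bash
cocoindex update --setup main.py -L
```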
### Pattern 5: Custom Transform Function
```python
@cocoindex.op.function(behavior_version=1)
def extract_metadata(content: str, filename: str) -> dict:
    """Extract metadata from document content."""
    return {
        "word_count": len(content.split()),
        "char_count": len(content),
        "source": filename,
    }

@cocoindex.flow_def(name="CustomFunctionFlow")
def custom_function_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )
    collector = data_scope.add_collector()
    with data_scope["documents"].row() as doc:
        # Use the custom function
        doc["metadata"] = doc["content"].transform(
            extract_metadata,
            filename=doc["filename"],
        )
        collector.collect(
            filename=doc["filename"],
            word_count=doc["metadata"]["word_count"],
            char_count=doc["metadata"]["char_count"],
        )
    collector.export("metadata", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```
### Pattern 6: Transform Flow for Reusable Logic
Transform flows allow extracting reusable transformation logic that can be shared between indexing and querying.
```python
@cocoindex.transform_flow()
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
    """Shared embedding logic for both indexing and querying."""
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )

@cocoindex.flow_def(name="MainFlow")
def main_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )
    collector = data_scope.add_collector()
    with data_scope["documents"].row() as doc:
        # Use the transform flow
        doc["embedding"] = text_to_embedding(doc["content"])
        collector.collect(text=doc["content"], embedding=doc["embedding"])
    collector.export("docs", cocoindex.targets.Postgres(), primary_key_fields=["text"])

# Later, use the same transform flow for querying
def search(query: str):
    query_embedding = text_to_embedding.eval(query)  # Evaluate with a concrete input
    # ... perform search with query_embedding
```
### Pattern 7: Concurrency Control
```python
@cocoindex.flow_def(name="ConcurrencyControlFlow")
def concurrency_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
# Limit concurrent processing at source level
data_scope["documents"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="large_documents"),
max_inflight_rows=10, # Max 10 documents at once
max_inflight_bytes=100 * 1024 * 1024 # Max 100MB in memory
)
collector = data_scope.add_collector()
with data_scope["documents"].row() as doc:
doc["chunks"] = doc["content"].transform(
cocoindex.functions.SplitRecursively(),
chunk_size=2000
)
# Limit concurrent processing at row iteration level
with doc["chunks"].row(max_inflight_rows=100) as chunk:
chunk["embedding"] = chunk["text"].transform(
cocoindex.functions.SentenceTransformerEmbed(
model="sentence-transformers/all-MiniLM-L6-v2"
)
)
collector.collect(text=chunk["text"], embedding=chunk["embedding"])
collector.export("chunks", cocoindex.targets.Postgres(), primary_key_fields=["text"])
```
## Data Source Patterns
### Local Files
```python
cocoindex.sources.LocalFile(
    path="documents",
    included_patterns=["*.md", "*.txt"],
    excluded_patterns=["**/.*", "node_modules"],
)
```
### Amazon S3
```python
cocoindex.sources.AmazonS3(
    bucket="my-bucket",
    prefix="documents/",
    included_patterns=["*.pdf"],
    aws_access_key_id=cocoindex.add_transient_auth_entry("..."),
    aws_secret_access_key=cocoindex.add_transient_auth_entry("..."),
)
```
### Postgres Source
```python
cocoindex.sources.Postgres(
    connection=cocoindex.add_auth_entry(
        "postgres_conn",
        cocoindex.sources.PostgresConnection(
            host="localhost",
            database="mydb",
            user="user",
            password="password",
        ),
    ),
    query="SELECT id, content FROM documents",
)
```
## Target Patterns
### Postgres
```python
collector.export(
    "target_name",
    cocoindex.targets.Postgres(),
    primary_key_fields=["id"],
    vector_indexes=[
        cocoindex.VectorIndexDef(
            field_name="embedding",
            metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
        )
    ],
)
```
### Qdrant
```python
collector.export(
    "target_name",
    cocoindex.targets.Qdrant(collection_name="my_collection"),
    primary_key_fields=["id"],
)
```
### LanceDB
```python
collector.export(
    "target_name",
    cocoindex.targets.LanceDB(
        uri="lancedb_data",
        table_name="my_table",
    ),
    primary_key_fields=["id"],
)
```
### Neo4j (Knowledge Graph)
```python
# Node export
collector.export(
    "nodes",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Nodes(label="Entity"),
    ),
    primary_key_fields=["id"],
)

# Relationship export
collector.export(
    "relationships",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Relationships(
            rel_type="RELATES_TO",
            source=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="source_id", target="id")],
            ),
            target=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="target_id", target="id")],
            ),
        ),
    ),
    primary_key_fields=["id"],
)
```