`skills/cocoindex/references/api_operations.md`

# API Operations Reference

Guide for operating CocoIndex flows programmatically using Python APIs.

## Overview

CocoIndex flows can be operated through Python APIs, providing programmatic control over setup, updates, and queries. This is useful for embedding flows in applications, automating workflows, or building custom tools.

## Basic Setup

### Initialization

```python
from dotenv import load_dotenv
import cocoindex

# Load environment variables
load_dotenv()

# Initialize CocoIndex
cocoindex.init()
```

### Flow Definition

```python
@cocoindex.flow_def(name="MyFlow")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Flow definition
    pass
```

The decorator returns a `cocoindex.Flow` object that can be used for operations.

## Flow Operations

### Setup Flow

Create persistent backends (tables, collections, etc.) for the flow.

```python
# Basic setup
my_flow.setup()

# With progress output
my_flow.setup(report_to_stdout=True)

# Async version
await my_flow.setup_async(report_to_stdout=True)
```

**When to use:**
- Before the first update
- After modifying flow structure
- After dropping a flow, to recreate its resources

### Setup All Flows

```python
# Setup all flows at once
cocoindex.setup_all_flows(report_to_stdout=True)
```

### Drop Flow

Remove all persistent backends owned by the flow.

```python
# Drop flow
my_flow.drop()

# With progress output
my_flow.drop(report_to_stdout=True)

# Async version
await my_flow.drop_async(report_to_stdout=True)
```

**Note:** After dropping, the Flow object is still valid and can be set up again.

### Drop All Flows

```python
# Drop all flows
cocoindex.drop_all_flows(report_to_stdout=True)
```

### Close Flow

Remove the flow from the current process's memory (doesn't affect persistent data).

```python
my_flow.close()
# After this, my_flow is invalid and should not be used
```

## Update Operations

### One-Time Update

Build or update target data based on current source data.

```python
# Basic update
stats = my_flow.update()
print(f"Processed {stats.total_rows} rows")

# With reexport (force reprocessing even if nothing changed)
stats = my_flow.update(reexport_targets=True)

# Async version
stats = await my_flow.update_async()
stats = await my_flow.update_async(reexport_targets=True)
```

**Returns:** Statistics about the processed data.

**Note:** Multiple calls to `update()` can run simultaneously; CocoIndex automatically combines them efficiently.

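As a minimal sketch of that behavior (using the async variants shown above), two overlapping updates can simply be awaited together:

```python
import asyncio

async def refresh_twice():
    # Overlapping updates are safe; CocoIndex coalesces the underlying work.
    await asyncio.gather(my_flow.update_async(), my_flow.update_async())
```
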
### Live Update

Continuously monitor source changes and update targets.

```python
import cocoindex

# Create live updater
updater = cocoindex.FlowLiveUpdater(
    my_flow,
    cocoindex.FlowLiveUpdaterOptions(
        live_mode=True,          # Enable live updates
        print_stats=True,        # Print progress
        reexport_targets=False   # If True, reexport on the initial update
    )
)

# Start the updater
updater.start()

# Your application logic here
# (the updater runs in background threads)

# Wait for completion
updater.wait()

# Print final stats
print(updater.update_stats())
```

#### As Context Manager

```python
with cocoindex.FlowLiveUpdater(my_flow) as updater:
    # Updater starts automatically
    # Your application logic here
    pass
# Updater aborts and waits automatically

# Async version
async with cocoindex.FlowLiveUpdater(my_flow) as updater:
    # Your application logic
    pass
```

#### Monitoring Status Updates

```python
updater = cocoindex.FlowLiveUpdater(my_flow)
updater.start()

while True:
    # Block until the next status update
    updates = updater.next_status_updates()

    # Check which sources were updated
    for source in updates.updated_sources:
        print(f"Source {source} has new data")
        # Trigger downstream operations

    # Check whether the updater has stopped
    if not updates.active_sources:
        print("All sources stopped")
        break

# Async version
while True:
    updates = await updater.next_status_updates_async()
    # ... same logic
```

#### Control Methods

```python
# Start updater
updater.start()
await updater.start_async()

# Abort updater
updater.abort()

# Wait for completion
updater.wait()
await updater.wait_async()

# Get current stats
stats = updater.update_stats()
```

## Evaluate Flow

Run transformations without updating targets (useful for testing).

```python
# Evaluate and dump results
my_flow.evaluate_and_dump(
    cocoindex.EvaluateAndDumpOptions(
        output_dir="./eval_output",
        use_cache=True  # Read from the existing cache (but don't update it)
    )
)
```

**Use cases:**
- Testing flow logic
- Debugging transformations
- Inspecting intermediate data

## Query Operations

### Transform Flows

Transform flows enable reusable transformation logic for both indexing and querying.

```python
from numpy.typing import NDArray
import numpy as np

# Define transform flow
@cocoindex.transform_flow()
def text_to_embedding(
    text: cocoindex.DataSlice[str]
) -> cocoindex.DataSlice[NDArray[np.float32]]:
    """Convert text to an embedding vector."""
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )

# Use in an indexing flow
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder, data_scope):
    # ... setup source ...
    with data_scope["documents"].row() as doc:
        doc["embedding"] = text_to_embedding(doc["content"])
    # ... collect and export ...

# Use for querying (evaluate with an input)
query_embedding = text_to_embedding.eval("search query text")
# query_embedding is now a numpy array
```

### Query Handlers

Attach query logic to flows for easy query execution.

```python
import functools
import os

from psycopg_pool import ConnectionPool
from pgvector.psycopg import register_vector

@functools.cache
def connection_pool():
    return ConnectionPool(os.environ["COCOINDEX_DATABASE_URL"])

# Register query handler
@my_flow.query_handler(
    result_fields=cocoindex.QueryHandlerResultFields(
        embedding=["embedding"],  # Field name(s) containing embeddings
        score="score"             # Field name for similarity score
    )
)
def search(query: str) -> cocoindex.QueryOutput:
    """Search for documents matching the query."""

    # Get the table name for this flow's export
    table_name = cocoindex.utils.get_target_default_name(my_flow, "doc_embeddings")

    # Compute the query embedding using the transform flow
    query_vector = text_to_embedding.eval(query)

    # Execute the query
    with connection_pool().connection() as conn:
        register_vector(conn)
        with conn.cursor() as cur:
            cur.execute(
                f"""
                SELECT filename, text, embedding, embedding <=> %s AS distance
                FROM {table_name}
                ORDER BY distance
                LIMIT 10
                """,
                (query_vector,)
            )

            return cocoindex.QueryOutput(
                query_info=cocoindex.QueryInfo(
                    embedding=query_vector,
                    similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
                ),
                results=[
                    {
                        "filename": row[0],
                        "text": row[1],
                        "embedding": row[2],
                        "score": 1.0 - row[3]  # Convert distance to similarity
                    }
                    for row in cur.fetchall()
                ]
            )

# Call the query handler
results = search("machine learning algorithms")
for result in results.results:
    print(f"[{result['score']:.3f}] {result['filename']}: {result['text']}")
```

### Query with Qdrant

```python
from qdrant_client import QdrantClient
import functools

@functools.cache
def get_qdrant_client():
    return QdrantClient(url="http://localhost:6334", prefer_grpc=True)

@my_flow.query_handler(
    result_fields=cocoindex.QueryHandlerResultFields(
        embedding=["embedding"],
        score="score"
    )
)
def search_qdrant(query: str) -> cocoindex.QueryOutput:
    client = get_qdrant_client()

    # Get the query embedding
    query_embedding = text_to_embedding.eval(query)

    # Search Qdrant
    search_results = client.search(
        collection_name="my_collection",
        query_vector=("text_embedding", query_embedding),
        limit=10
    )

    return cocoindex.QueryOutput(
        query_info=cocoindex.QueryInfo(
            embedding=query_embedding,
            similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
        ),
        results=[
            {
                "text": result.payload["text"],
                "embedding": result.vector,
                "score": result.score
            }
            for result in search_results
        ]
    )
```

## Application Integration Patterns

### Pattern 1: Simple Application with Update

```python
from dotenv import load_dotenv
import cocoindex

# Initialize
load_dotenv()
cocoindex.init()

# Define flow
@cocoindex.flow_def(name="MyApp")
def my_app_flow(flow_builder, data_scope):
    # ... flow definition ...
    pass

def main():
    # Ensure the flow is set up and the data is fresh
    stats = my_app_flow.update()
    print(f"Updated index: {stats}")

    # Run application logic
    while True:
        query = input("Search: ")
        if not query:
            break
        results = search(query)
        for result in results.results:
            print(f"  {result['score']:.3f}: {result['text']}")

if __name__ == "__main__":
    main()
```

### Pattern 2: Web Application with Live Updates

```python
from fastapi import FastAPI
import cocoindex
from dotenv import load_dotenv

load_dotenv()
cocoindex.init()

@cocoindex.flow_def(name="WebAppFlow")
def web_app_flow(flow_builder, data_scope):
    # ... flow definition ...
    pass

# Create FastAPI app
app = FastAPI()

# Global updater
updater = None

@app.on_event("startup")
async def startup():
    global updater
    # Start the live updater in the background
    updater = cocoindex.FlowLiveUpdater(
        web_app_flow,
        cocoindex.FlowLiveUpdaterOptions(live_mode=True, print_stats=True)
    )
    await updater.start_async()
    print("Live updater started")

@app.on_event("shutdown")
async def shutdown():
    global updater
    if updater:
        updater.abort()
        await updater.wait_async()
        print("Live updater stopped")

@app.get("/search")
async def search_endpoint(q: str):
    results = search(q)
    return {
        "query": q,
        "results": results.results
    }
```

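Note that `@app.on_event` is deprecated in recent FastAPI releases. As a hedged sketch of the same startup/shutdown wiring (same CocoIndex calls as above) using FastAPI's lifespan context manager:

```python
from contextlib import asynccontextmanager

from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Start the live updater before serving requests...
    updater = cocoindex.FlowLiveUpdater(
        web_app_flow,
        cocoindex.FlowLiveUpdaterOptions(live_mode=True, print_stats=True)
    )
    await updater.start_async()
    yield
    # ...and stop it on shutdown.
    updater.abort()
    await updater.wait_async()

app = FastAPI(lifespan=lifespan)
```
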
### Pattern 3: Batch Processing

```python
import cocoindex
from dotenv import load_dotenv

load_dotenv()
cocoindex.init()

@cocoindex.flow_def(name="BatchProcessor")
def batch_flow(flow_builder, data_scope):
    # ... flow definition ...
    pass

def process_batch():
    """Run as a scheduled job (cron, etc.)."""
    # Setup if needed (no-op if already set up)
    batch_flow.setup()

    # Run update
    stats = batch_flow.update()

    # Log results
    print(f"Batch completed: {stats.total_rows} rows processed")

    return stats

if __name__ == "__main__":
    process_batch()
```

### Pattern 4: React to Updates

```python
import cocoindex

@cocoindex.flow_def(name="ReactiveFlow")
def reactive_flow(flow_builder, data_scope):
    # ... flow definition ...
    pass

async def run_with_reactions():
    """Monitor updates and trigger downstream actions."""
    async with cocoindex.FlowLiveUpdater(reactive_flow) as updater:
        while True:
            updates = await updater.next_status_updates_async()

            # React to specific source updates
            if "products" in updates.updated_sources:
                await rebuild_product_index()

            if "customers" in updates.updated_sources:
                await refresh_customer_cache()

            # Exit when the updater stops
            if not updates.active_sources:
                break

async def rebuild_product_index():
    print("Rebuilding product index...")
    # Custom logic

async def refresh_customer_cache():
    print("Refreshing customer cache...")
    # Custom logic
```

## Error Handling

### Handling Update Errors

```python
try:
    stats = my_flow.update()
except cocoindex.CocoIndexError as e:
    print(f"Update failed: {e}")
    # Handle the error (log, retry, alert, etc.)
```

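Where "retry" is the chosen strategy, a minimal sketch with exponential backoff (the exact exception type to catch depends on your CocoIndex version):

```python
import time

def update_with_retry(max_attempts: int = 3):
    for attempt in range(max_attempts):
        try:
            return my_flow.update()
        except Exception as e:  # Narrow this to the specific error type in real code
            print(f"Update failed (attempt {attempt + 1}/{max_attempts}): {e}")
            if attempt + 1 == max_attempts:
                raise
            time.sleep(2 ** attempt)  # Backoff: 1s, 2s, 4s, ...
```
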
### Graceful Shutdown

```python
import signal

updater = None

def signal_handler(sig, frame):
    print("Shutting down gracefully...")
    if updater:
        updater.abort()
        updater.wait()
    print("Shutdown complete")
    exit(0)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

updater = cocoindex.FlowLiveUpdater(my_flow)
updater.start()
updater.wait()
```

## Best Practices

1. **Always call cocoindex.init()** - Initialize before using any CocoIndex APIs
2. **Load environment variables** - Use dotenv or similar to load configuration
3. **Use context managers** - For live updaters, to ensure cleanup
4. **Cache expensive resources** - Use `@functools.cache` for database pools and clients
5. **Handle signals** - Gracefully shut down live updaters on SIGINT/SIGTERM
6. **Separate concerns** - Keep flow definitions, queries, and application logic separate
7. **Use transform flows** - Share logic between indexing and querying
8. **Monitor update stats** - Log and track processing statistics
9. **Test with evaluate** - Use `evaluate_and_dump` for testing before updates

`skills/cocoindex/references/cli_operations.md`

# CLI Operations Reference

Complete guide for operating CocoIndex flows using the CLI.

## Overview

The CocoIndex CLI (the `cocoindex` command) provides tools for managing and inspecting flows. Most commands require an `APP_TARGET` argument specifying where flow definitions are located.

## Environment Setup

### Environment Variables

Create a `.env` file in the project directory:

```bash
# Database connection (required)
COCOINDEX_DATABASE_URL=postgresql://user:password@localhost/cocoindex_db

# Optional: App namespace for organizing flows
COCOINDEX_APP_NAMESPACE=dev

# Optional: Global concurrency limits
COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS=50
COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES=524288000  # 500MB

# Optional: LLM API keys (if using LLM functions)
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-...
VOYAGE_API_KEY=pa-...
```

### Loading Environment Files

```bash
# Default: loads .env from the current directory
cocoindex <command> ...

# Specify a custom env file
cocoindex --env-file path/to/.env <command> ...

# Specify the app directory
cocoindex --app-dir /path/to/project <command> ...
```

## APP_TARGET Format

The `APP_TARGET` tells the CLI where flow definitions are located:

### Python Module
```bash
# Load from module name
cocoindex update main

# Load from package module
cocoindex update my_package.flows
```

### Python File
```bash
# Load from file path
cocoindex update main.py

# Load from nested file
cocoindex update path/to/flows.py
```

### Specific Flow
```bash
# Target a specific flow in a module
cocoindex update main:MyFlowName

# Target a specific flow in a file
cocoindex update path/to/flows.py:MyFlowName
```

## Core Commands

### setup - Initialize Flow Resources

Create all persistent backends needed by flows (database tables, collections, etc.).

```bash
# Setup all flows
cocoindex setup main.py

# Setup a specific flow
cocoindex setup main.py:MyFlow
```

**What it does:**
- Creates internal storage tables in Postgres
- Creates target resources (database tables, vector collections, graph structures)
- Updates schemas if the flow definition changed
- No-op if already set up and no changes are needed

**When to use:**
- First time running a flow
- After modifying flow structure (new fields, new targets)
- After dropping flows, to recreate resources

### update - Build/Update Target Data

Run transformations and update target data based on current source data.

```bash
# One-time update
cocoindex update main.py

# One-time update with setup
cocoindex update --setup main.py

# One-time update of a specific flow
cocoindex update main.py:TextEmbedding

# Force reexport even if nothing changed
cocoindex update --reexport main.py
```

**What it does:**
- Reads source data
- Applies transformations
- Updates target databases
- Uses incremental processing (only processes changed data)

**Options:**
- `--setup` - Run setup first if needed
- `--reexport` - Reexport all data even if unchanged (useful after data loss)

### update -L - Live Update Mode

Continuously monitor source changes and update targets.

```bash
# Live update mode
cocoindex update main.py -L

# Live update with setup
cocoindex update --setup main.py -L

# Live update with reexport on the initial update
cocoindex update --reexport main.py -L
```

**What it does:**
- Performs an initial one-time update
- Continuously monitors source changes
- Automatically processes updates
- Runs until aborted (Ctrl-C)

**Requires:**
- At least one source with change capture enabled:
  - a `refresh_interval` parameter on the source, or
  - source-specific change capture (Postgres notifications, S3 events, etc.)

**Example with refresh interval:**
```python
data_scope["documents"] = flow_builder.add_source(
    cocoindex.sources.LocalFile(path="documents"),
    refresh_interval=datetime.timedelta(minutes=1)  # Check every minute
)
```

### drop - Remove Flow Resources

Remove all persistent backends owned by flows.

```bash
# Drop all flows
cocoindex drop main.py

# Drop a specific flow
cocoindex drop main.py:MyFlow
```

**What it does:**
- Drops internal storage tables
- Drops target resources (tables, collections, graphs)
- Cleans up all persistent data

**Warning:** This is destructive and cannot be undone!

### show - Inspect Flow Definition

Display flow structure and statistics.

```bash
# Show flow structure
cocoindex show main.py:MyFlow

# Show all flows
cocoindex show main.py
```

**What it shows:**
- Flow name and structure
- Sources configured
- Transformations defined
- Targets and their schemas
- Current statistics (if the flow is set up)

### evaluate - Test Flow Without Updating

Run transformations and dump results to files without updating targets.

```bash
# Evaluate a flow
cocoindex evaluate main.py:MyFlow

# Specify an output directory
cocoindex evaluate main.py:MyFlow --output-dir ./eval_results

# Disable the cache
cocoindex evaluate main.py:MyFlow --no-cache
```

**What it does:**
- Runs transformations
- Saves results to files (JSON, CSV, etc.)
- Does NOT update targets
- Uses the existing cache by default

**When to use:**
- Testing flow logic before running a full update
- Debugging transformation issues
- Inspecting intermediate data
- Validating output format

**Options:**
- `--output-dir PATH` - Directory for output files (default: `eval_{flow_name}_{timestamp}`)
- `--no-cache` - Disable reading from the cache (evaluate never writes to the cache either way)

## Complete Workflow Examples

### First-Time Setup and Indexing

```bash
# 1. Setup flow resources
cocoindex setup main.py

# 2. Run initial indexing
cocoindex update main.py

# 3. Verify results
cocoindex show main.py
```

### Development Workflow

```bash
# 1. Test with evaluate (no side effects)
cocoindex evaluate main.py:MyFlow --output-dir ./test_output

# 2. If it looks good, set up and update
cocoindex update --setup main.py:MyFlow

# 3. Check results
cocoindex show main.py:MyFlow
```

### Production Live Updates

```bash
# Run with live updates and auto-setup
cocoindex update --setup main.py -L
```

### Rebuild After Changes

```bash
# Drop old resources
cocoindex drop main.py

# Setup with the new definition
cocoindex setup main.py

# Reindex everything
cocoindex update --reexport main.py
```

### Multiple Flows

```bash
# Setup all flows
cocoindex setup main.py

# Update specific flows
cocoindex update main.py:CodeEmbedding
cocoindex update main.py:DocumentEmbedding

# Show all flows
cocoindex show main.py
```

## Common Issues and Solutions

### Issue: "Flow not found"

**Problem:** The CLI can't find the flow definition.

**Solutions:**
```bash
# Make sure APP_TARGET is correct
cocoindex show main.py  # Should list flows

# Use --app-dir if not in the project root
cocoindex --app-dir /path/to/project show main.py

# Check the flow name is correct
cocoindex show main.py:CorrectFlowName
```

### Issue: "Database connection failed"

**Problem:** Can't connect to Postgres.

**Solutions:**
```bash
# Check the .env file exists
cat .env | grep COCOINDEX_DATABASE_URL

# Test the connection
psql $COCOINDEX_DATABASE_URL

# Use --env-file if .env is elsewhere
cocoindex --env-file /path/to/.env update main.py
```

### Issue: "Schema mismatch"

**Problem:** The flow definition changed but resources were not updated.

**Solution:**
```bash
# Re-run setup to update schemas
cocoindex setup main.py

# Then update the data
cocoindex update main.py
```

### Issue: "Live update exits immediately"

**Problem:** No change capture mechanism is enabled.

**Solution:**
Add a `refresh_interval` or use source-specific change capture:
```python
data_scope["docs"] = flow_builder.add_source(
    cocoindex.sources.LocalFile(path="docs"),
    refresh_interval=datetime.timedelta(seconds=30)  # Add this
)
```

## Advanced Options

### Global Options

```bash
# Show version
cocoindex --version

# Show help
cocoindex --help
cocoindex update --help

# Specify the app directory
cocoindex --app-dir /custom/path update main

# Custom env file
cocoindex --env-file prod.env update main
```

### Performance Tuning

Set environment variables for concurrency:

```bash
# In the .env file
COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS=100
COCOINDEX_SOURCE_MAX_INFLIGHT_BYTES=1073741824  # 1GB
```

Or per source in code:
```python
data_scope["docs"] = flow_builder.add_source(
    cocoindex.sources.LocalFile(path="docs"),
    max_inflight_rows=50,
    max_inflight_bytes=500 * 1024 * 1024  # 500MB
)
```

## Best Practices

1. **Use evaluate before update** - Test flow logic without side effects
2. **Always set up before the first update** - Or use the `--setup` flag
3. **Use live updates in production** - Keeps targets always fresh
4. **Set an app namespace** - Organize flows across environments (dev/staging/prod)
5. **Monitor with show** - Regularly check flow statistics
6. **Version control .env.example** - Document required environment variables
7. **Use specific flow targets** - For selective updates: `main.py:FlowName`
8. **Run setup after definition changes** - Ensures schemas match the flow definition

`skills/cocoindex/references/custom_functions.md`

# Custom Functions Reference

Complete guide for creating custom functions in CocoIndex.

## Overview

Custom functions let you create data transformation logic that can be used within flows. There are two approaches:

1. **Standalone function** - Simple, no configuration or setup logic
2. **Function spec + executor** - Advanced, with configuration and setup logic

## Standalone Functions

Use for simple transformations that don't need configuration or setup.

### Basic Example

```python
@cocoindex.op.function(behavior_version=1)
def compute_word_count(text: str) -> int:
    """Count words in text."""
    return len(text.split())
```

**Requirements:**
- Decorate with `@cocoindex.op.function()`
- Type annotations are required for all arguments and the return value
- Supports basic types, structs, tables, and numpy arrays

### With Optional Parameters

```python
@cocoindex.op.function(behavior_version=1)
def extract_info(content: str, filename: str, max_length: int | None = None) -> dict:
    """
    Extract information from content.

    Args:
        content: The document content
        filename: Source filename
        max_length: Optional maximum length for truncation
    """
    info = {
        "filename": filename,
        "length": len(content),
        "word_count": len(content.split())
    }

    if max_length and len(content) > max_length:
        info["truncated"] = True

    return info
```

### Using in Flows

```python
@cocoindex.flow_def(name="MyFlow")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Use a standalone function
        doc["word_count"] = doc["content"].transform(compute_word_count)

        # With additional arguments
        doc["info"] = doc["content"].transform(
            extract_info,
            filename=doc["filename"],
            max_length=1000
        )

        collector.collect(
            filename=doc["filename"],
            word_count=doc["word_count"],
            info=doc["info"]
        )

    collector.export("documents", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```

## Function Spec + Executor

Use for functions that need configuration or setup logic (e.g., loading models).

### Basic Structure

```python
# 1. Define the function spec (configuration)
class ComputeSomething(cocoindex.op.FunctionSpec):
    """
    Configuration for the ComputeSomething function.
    """
    param1: str
    param2: int = 10  # Optional with default

# 2. Define the executor (implementation)
@cocoindex.op.executor_class(behavior_version=1)
class ComputeSomethingExecutor:
    spec: ComputeSomething  # Required: link to spec

    def prepare(self) -> None:
        """
        Optional: setup logic run once before execution.
        Use for loading models, establishing connections, etc.
        """
        # Setup based on self.spec
        pass

    def __call__(self, input_data: str) -> dict:
        """
        Required: execute the function for each data row.

        Arguments and the return value must have type annotations.
        """
        # Use self.spec.param1, self.spec.param2
        return {"result": f"{input_data}-{self.spec.param1}"}
```

### Example: Custom Embedding Function

```python
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy.typing import NDArray

class CustomEmbed(cocoindex.op.FunctionSpec):
    """
    Embed text using a specified SentenceTransformer model.
    """
    model_name: str
    normalize: bool = True

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class CustomEmbedExecutor:
    spec: CustomEmbed
    model: SentenceTransformer | None = None

    def prepare(self) -> None:
        """Load the model once during initialization."""
        self.model = SentenceTransformer(self.spec.model_name)

    def __call__(self, text: str) -> NDArray[np.float32]:
        """Embed the input text."""
        assert self.model is not None
        embedding = self.model.encode(text, normalize_embeddings=self.spec.normalize)
        return embedding.astype(np.float32)

# Usage in a flow
@cocoindex.flow_def(name="CustomEmbedFlow")
def custom_embed_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        doc["embedding"] = doc["content"].transform(
            CustomEmbed(
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                normalize=True
            )
        )

        collector.collect(
            text=doc["content"],
            embedding=doc["embedding"]
        )

    collector.export("embeddings", cocoindex.targets.Postgres(), primary_key_fields=["text"])
```

### Example: PDF Processing

```python
import pymupdf  # PyMuPDF

class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """
    Convert PDF to markdown.
    """
    extract_images: bool = False
    page_range: tuple[int, int] | None = None  # (start, end) pages

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    spec: PdfToMarkdown

    def __call__(self, pdf_bytes: bytes) -> str:
        """Convert PDF bytes to markdown text."""
        doc = pymupdf.Document(stream=pdf_bytes, filetype="pdf")

        # Determine the page range
        start = 0
        end = doc.page_count
        if self.spec.page_range:
            start, end = self.spec.page_range
            start = max(0, start)
            end = min(doc.page_count, end)

        markdown_parts = []
        for page_num in range(start, end):
            page = doc[page_num]
            text = page.get_text()
            markdown_parts.append(f"# Page {page_num + 1}\n\n{text}")

        return "\n\n".join(markdown_parts)

# Usage
@cocoindex.flow_def(name="PdfFlow")
def pdf_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["pdfs"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="pdfs", included_patterns=["*.pdf"])
    )

    collector = data_scope.add_collector()

    with data_scope["pdfs"].row() as pdf:
        pdf["markdown"] = pdf["content"].transform(
            PdfToMarkdown(extract_images=False, page_range=(0, 10))
        )

        collector.collect(
            filename=pdf["filename"],
            markdown=pdf["markdown"]
        )

    collector.export("pdf_text", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```

## Function Parameters

Both standalone functions and executors support these parameters:

### cache (bool)

Enable caching of function results for reuse during reprocessing.

```python
@cocoindex.op.function(cache=True, behavior_version=1)
def expensive_computation(text: str) -> dict:
    # Computationally intensive operation
    return {"result": analyze(text)}
```

**When to use:**
- Functions that are computationally expensive
- LLM API calls
- Model inference
- External API calls

### behavior_version (int)

Required when `cache=True`. Increment it when the function's behavior changes, to invalidate the cache.

```python
@cocoindex.op.function(cache=True, behavior_version=2)  # Incremented from 1
def improved_analysis(text: str) -> dict:
    # Updated algorithm - cached data needs reprocessing
    return {"result": new_analysis(text)}
```

### gpu (bool)

Indicates that the function uses GPU resources, which affects scheduling.

```python
@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class GpuModelExecutor:
    spec: GpuModel

    def prepare(self) -> None:
        self.model = load_model_on_gpu(self.spec.model_name)

    def __call__(self, text: str) -> NDArray[np.float32]:
        return self.model.predict(text)
```

### arg_relationship

Specifies metadata about argument relationships for tools like CocoInsight.

```python
@cocoindex.op.function(
    cache=True,
    behavior_version=1,
    arg_relationship=(cocoindex.ArgRelationship.CHUNKS_BASE_TEXT, "content")
)
def custom_chunker(content: str, chunk_size: int) -> list[dict]:
    """
    Chunks are derived from the 'content' argument.
    The first element of each chunk dict must be a Range type.
    """
    # Return a list of chunks with location ranges
    return [
        {"location": cocoindex.Range(...), "text": chunk}
        for chunk in split_content(content, chunk_size)
    ]
```

**Supported relationships:**
- `ArgRelationship.CHUNKS_BASE_TEXT` - Output is chunks of the input text
- `ArgRelationship.EMBEDDING_ORIGIN_TEXT` - Output is an embedding of the input text
- `ArgRelationship.RECTS_BASE_IMAGE` - Output is rectangles on the input image

## Supported Data Types

Functions can use these types for arguments and return values:

### Basic Types
- `str` - Text
- `int` - Integer (maps to Int64)
- `float` - Float (maps to Float64)
- `bool` - Boolean
- `bytes` - Binary data
- `None` / `type(None)` - Null value

### Collection Types
- `list[T]` - List of type T
- `dict[str, T]` - Dictionary (becomes a Struct)
- `cocoindex.Json` - Arbitrary JSON

### Numpy Types
- `NDArray[np.float32]` - Vector[Float32, N]
- `NDArray[np.float64]` - Vector[Float64, N]
- `NDArray[np.int32]` - Vector[Int32, N]
- `NDArray[np.int64]` - Vector[Int64, N]

### CocoIndex Types
- `cocoindex.Range` - Text range with location info
- Dataclasses - Become Struct types

### Optional Types
- `T | None` or `Optional[T]` - Optional value

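As a minimal sketch combining several of the types above in one signature (the names here are illustrative, not part of the CocoIndex API):

```python
import dataclasses

import numpy as np
from numpy.typing import NDArray

@dataclasses.dataclass
class DocSummary:
    word_count: int       # Basic type
    tags: list[str]       # Collection type
    language: str | None  # Optional type

@cocoindex.op.function(behavior_version=1)
def summarize(text: str, embedding: NDArray[np.float32] | None = None) -> DocSummary:
    """Dataclass return values become Struct types."""
    return DocSummary(word_count=len(text.split()), tags=[], language=None)
```
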
### Table Types (Output only)
Functions can return table-like data using dataclasses:

```python
@dataclasses.dataclass
class Chunk:
    location: cocoindex.Range
    text: str

@cocoindex.op.function(behavior_version=1)
def chunk_text(content: str) -> list[Chunk]:
    """Returns a list representing a table."""
    return [
        Chunk(location=..., text=chunk)
        for chunk in split_content(content)
    ]
```

## Common Patterns

### Pattern: LLM-based Extraction

```python
from openai import OpenAI

class ExtractStructuredInfo(cocoindex.op.FunctionSpec):
    """Extract structured information using an LLM."""
    model: str = "gpt-4"
    system_prompt: str = "Extract key information from the text."

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class ExtractStructuredInfoExecutor:
    spec: ExtractStructuredInfo
    client: OpenAI | None = None

    def prepare(self) -> None:
        self.client = OpenAI()  # Uses the OPENAI_API_KEY env var

    def __call__(self, text: str) -> dict:
        assert self.client is not None
        response = self.client.chat.completions.create(
            model=self.spec.model,
            messages=[
                {"role": "system", "content": self.spec.system_prompt},
                {"role": "user", "content": text}
            ]
        )
        # Parse and return structured data
        return {"extracted": response.choices[0].message.content}
```

### Pattern: External API Call

```python
import requests

class FetchEnrichmentData(cocoindex.op.FunctionSpec):
    """Fetch enrichment data from an external API."""
    api_endpoint: str
    api_key: str

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class FetchEnrichmentDataExecutor:
    spec: FetchEnrichmentData

    def __call__(self, entity_id: str) -> dict:
        response = requests.get(
            f"{self.spec.api_endpoint}/entities/{entity_id}",
            headers={"Authorization": f"Bearer {self.spec.api_key}"}
        )
        response.raise_for_status()
        return response.json()
```

### Pattern: Multi-step Processing

```python
class ProcessDocument(cocoindex.op.FunctionSpec):
    """Process a document through multiple steps."""
    min_quality_score: float = 0.7

@cocoindex.op.executor_class(cache=True, behavior_version=1)
class ProcessDocumentExecutor:
    spec: ProcessDocument
    nlp_model = None

    def prepare(self) -> None:
        import spacy
        self.nlp_model = spacy.load("en_core_web_sm")

    def __call__(self, text: str) -> dict:
        # Step 1: Clean text
        cleaned = self._clean_text(text)

        # Step 2: Extract entities
        doc = self.nlp_model(cleaned)
        entities = [ent.text for ent in doc.ents]

        # Step 3: Quality check
        quality_score = self._compute_quality(cleaned)

        return {
            "cleaned_text": cleaned if quality_score >= self.spec.min_quality_score else None,
            "entities": entities,
            "quality_score": quality_score
        }

    def _clean_text(self, text: str) -> str:
        # Cleaning logic
        return text.strip()

    def _compute_quality(self, text: str) -> float:
        # Quality scoring logic
        return len(text) / 1000.0
```

## Best Practices

1. **Use caching for expensive operations** - Enable `cache=True` for LLM calls, model inference, or external APIs
2. **Type annotations are required** - All arguments and return types must be annotated
3. **Increment behavior_version** - When changing cached function logic, increment the version to invalidate the cache
4. **Use prepare() for initialization** - Load models and establish connections once in `prepare()`
5. **Keep functions focused** - Each function should do one thing well
6. **Document parameters** - Use docstrings to explain function purpose and parameters
7. **Handle errors gracefully** - Consider edge cases and invalid inputs
8. **Use appropriate return types** - Match return types to target schema needs

`skills/cocoindex/references/flow_patterns.md`

# CocoIndex Flow Patterns

This reference provides common patterns and examples for building CocoIndex flows.

## Basic Flow Pattern

```python
import cocoindex

@cocoindex.flow_def(name="FlowName")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # 1. Import source data
    data_scope["source_data"] = flow_builder.add_source(...)

    # 2. Create collectors for output
    my_collector = data_scope.add_collector()

    # 3. Transform data
    with data_scope["source_data"].row() as item:
        item["transformed"] = item["field"].transform(...)
        my_collector.collect(...)

    # 4. Export to target
    my_collector.export("target_name", ..., primary_key_fields=[...])
```

## Common Flow Patterns

### Pattern 1: Simple Text Embedding

Embed documents from local files into a vector database.

```python
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Import documents
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Split into chunks
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown",
            chunk_size=2000,
            chunk_overlap=500
        )

        with doc["chunks"].row() as chunk:
            # Embed each chunk
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )

            doc_embeddings.collect(
                id=cocoindex.GeneratedField.UUID,
                filename=doc["filename"],
                text=chunk["text"],
                embedding=chunk["embedding"]
            )

    # Export to Postgres with a vector index
    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["id"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
            )
        ]
    )
```

### Pattern 2: Code Embedding with Language Detection

```python
@cocoindex.flow_def(name="CodeEmbedding")
def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["files"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(
            path=".",
            included_patterns=["*.py", "*.rs", "*.md"],
            excluded_patterns=["**/.*", "target", "**/node_modules"]
        )
    )

    code_embeddings = data_scope.add_collector()

    with data_scope["files"].row() as file:
        # Detect language
        file["language"] = file["filename"].transform(
            cocoindex.functions.DetectProgrammingLanguage()
        )

        # Split using language-aware chunking
        file["chunks"] = file["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language=file["language"],
            chunk_size=1000,
            chunk_overlap=300
        )

        with file["chunks"].row() as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )

            code_embeddings.collect(
                filename=file["filename"],
                location=chunk["location"],
                code=chunk["text"],
                embedding=chunk["embedding"],
                start=chunk["start"],
                end=chunk["end"]
            )

    code_embeddings.export(
        "code_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
            )
        ]
    )
```

### Pattern 3: LLM-based Extraction to Knowledge Graph

Extract structured information using LLMs and build a knowledge graph.

```python
import dataclasses

@dataclasses.dataclass
class ProductInfo:
    id: str
    title: str
    price: float

@dataclasses.dataclass
class Taxonomy:
    name: str

@cocoindex.flow_def(name="ProductGraph")
def product_graph_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Set up the Neo4j connection
    neo4j_conn = cocoindex.add_auth_entry(
        "Neo4jConnection",
        cocoindex.targets.Neo4jConnection(
            uri="bolt://localhost:7687",
            user="neo4j",
            password="password"
        )
    )

    data_scope["products"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="products", included_patterns=["*.json"])
    )

    product_nodes = data_scope.add_collector()
    product_taxonomy = data_scope.add_collector()

    with data_scope["products"].row() as product:
        # Parse JSON and extract info
        data = product["content"].transform(
            cocoindex.functions.ParseJson()
        )

        # Use an LLM to extract taxonomies
        taxonomy = data["description"].transform(
            cocoindex.functions.ExtractByLlm(
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.OPENAI,
                    model="gpt-4"
                ),
                output_type=list[Taxonomy]
            )
        )

        product_nodes.collect(
            id=data["id"],
            title=data["title"],
            price=data["price"]
        )

        with taxonomy.row() as t:
            product_taxonomy.collect(
                id=cocoindex.GeneratedField.UUID,
                product_id=data["id"],
                taxonomy=t["name"]
            )

    # Export product nodes
    product_nodes.export(
        "product_node",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Nodes(label="Product")
        ),
        primary_key_fields=["id"]
    )

    # Declare taxonomy nodes
    flow_builder.declare(
        cocoindex.targets.Neo4jDeclaration(
            connection=neo4j_conn,
            nodes_label="Taxonomy",
            primary_key_fields=["value"]
        )
    )

    # Export relationships
    product_taxonomy.export(
        "product_taxonomy",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Relationships(
                rel_type="HAS_TAXONOMY",
                source=cocoindex.targets.NodeFromFields(
                    label="Product",
                    fields=[cocoindex.targets.TargetFieldMapping(source="product_id", target="id")]
                ),
                target=cocoindex.targets.NodeFromFields(
                    label="Taxonomy",
                    fields=[cocoindex.targets.TargetFieldMapping(source="taxonomy", target="value")]
                )
            )
        ),
        primary_key_fields=["id"]
    )
```

### Pattern 4: Live Updates with Refresh Interval

```python
import datetime

@cocoindex.flow_def(name="LiveDataFlow")
def live_data_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Add a source with a refresh interval
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="live_documents"),
        refresh_interval=datetime.timedelta(minutes=1)  # Refresh every minute
    )

    # ... rest of flow definition
```

### Pattern 5: Custom Transform Function

```python
@cocoindex.op.function(behavior_version=1)
def extract_metadata(content: str, filename: str) -> dict:
    """Extract metadata from document content."""
    return {
        "word_count": len(content.split()),
        "char_count": len(content),
        "source": filename
    }

@cocoindex.flow_def(name="CustomFunctionFlow")
def custom_function_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Use the custom function
        doc["metadata"] = doc["content"].transform(
            extract_metadata,
            filename=doc["filename"]
        )

        collector.collect(
            filename=doc["filename"],
            word_count=doc["metadata"]["word_count"],
            char_count=doc["metadata"]["char_count"]
        )

    collector.export("metadata", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```

### Pattern 6: Transform Flow for Reusable Logic

Transform flows allow extracting reusable transformation logic that can be shared between indexing and querying.

```python
@cocoindex.transform_flow()
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
    """Shared embedding logic for both indexing and querying."""
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )

@cocoindex.flow_def(name="MainFlow")
def main_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Use the transform flow
        doc["embedding"] = text_to_embedding(doc["content"])
        collector.collect(text=doc["content"], embedding=doc["embedding"])

    collector.export("docs", cocoindex.targets.Postgres(), primary_key_fields=["text"])

# Later, use the same transform flow for querying
def search(query: str):
    query_embedding = text_to_embedding.eval(query)  # Evaluate with a concrete input
    # ... perform search with query_embedding
```

### Pattern 7: Concurrency Control

```python
@cocoindex.flow_def(name="ConcurrencyControlFlow")
def concurrency_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Limit concurrent processing at the source level
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="large_documents"),
        max_inflight_rows=10,                 # Max 10 documents at once
        max_inflight_bytes=100 * 1024 * 1024  # Max 100MB in memory
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            chunk_size=2000
        )

        # Limit concurrent processing at the row-iteration level
        with doc["chunks"].row(max_inflight_rows=100) as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )
            collector.collect(text=chunk["text"], embedding=chunk["embedding"])

    collector.export("chunks", cocoindex.targets.Postgres(), primary_key_fields=["text"])
```

## Data Source Patterns

### Local Files

```python
cocoindex.sources.LocalFile(
    path="documents",
    included_patterns=["*.md", "*.txt"],
    excluded_patterns=["**/.*", "node_modules"]
)
```

### Amazon S3

```python
cocoindex.sources.AmazonS3(
    bucket="my-bucket",
    prefix="documents/",
    included_patterns=["*.pdf"],
    aws_access_key_id=cocoindex.add_transient_auth_entry("..."),
    aws_secret_access_key=cocoindex.add_transient_auth_entry("...")
)
```

### Postgres Source

```python
cocoindex.sources.Postgres(
    connection=cocoindex.add_auth_entry(
        "postgres_conn",
        cocoindex.sources.PostgresConnection(
            host="localhost",
            database="mydb",
            user="user",
            password="password"
        )
    ),
    query="SELECT id, content FROM documents"
)
```

## Target Patterns

### Postgres

```python
collector.export(
    "target_name",
    cocoindex.targets.Postgres(),
    primary_key_fields=["id"],
    vector_indexes=[
        cocoindex.VectorIndexDef(
            field_name="embedding",
            metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
        )
    ]
)
```

### Qdrant

```python
collector.export(
    "target_name",
    cocoindex.targets.Qdrant(collection_name="my_collection"),
    primary_key_fields=["id"]
)
```

### LanceDB

```python
collector.export(
    "target_name",
    cocoindex.targets.LanceDB(
        uri="lancedb_data",
        table_name="my_table"
    ),
    primary_key_fields=["id"]
)
```

### Neo4j (Knowledge Graph)

```python
# Node export
collector.export(
    "nodes",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Nodes(label="Entity")
    ),
    primary_key_fields=["id"]
)

# Relationship export
collector.export(
    "relationships",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Relationships(
            rel_type="RELATES_TO",
            source=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="source_id", target="id")]
            ),
            target=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="target_id", target="id")]
            )
        )
    ),
    primary_key_fields=["id"]
)
```