# CocoIndex Flow Patterns

This reference provides common patterns and examples for building CocoIndex flows.

## Basic Flow Pattern

```python
import cocoindex

@cocoindex.flow_def(name="FlowName")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # 1. Import source data
    data_scope["source_data"] = flow_builder.add_source(...)

    # 2. Create collectors for output
    my_collector = data_scope.add_collector()

    # 3. Transform data
    with data_scope["source_data"].row() as item:
        item["transformed"] = item["field"].transform(...)
        my_collector.collect(...)

    # 4. Export to target
    my_collector.export("target_name", ..., primary_key_fields=[...])
```
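
Defining a flow does not run it: the flow's targets must be set up once, then updated. A minimal driver sketch, assuming the standard `cocoindex.init()` / `Flow.setup()` / `Flow.update()` entry points (verify against your CocoIndex version; the `cocoindex setup` and `cocoindex update` CLI commands serve the same purpose):

```python
# Hypothetical driver for the flow above; entry-point names are assumptions.
def main() -> None:
    cocoindex.init()                      # load settings (e.g. database URL) from the environment
    my_flow.setup(report_to_stdout=True)  # create/migrate backing tables for the targets
    stats = my_flow.update()              # incrementally process source data
    print(stats)

if __name__ == "__main__":
    main()
```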

## Common Flow Patterns

### Pattern 1: Simple Text Embedding

Embed documents from local files into a vector database.

```python
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Import documents
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Split into chunks
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown",
            chunk_size=2000,
            chunk_overlap=500
        )

        with doc["chunks"].row() as chunk:
            # Embed each chunk
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )

            doc_embeddings.collect(
                id=cocoindex.GeneratedField.UUID,
                filename=doc["filename"],
                text=chunk["text"],
                embedding=chunk["embedding"]
            )

    # Export to Postgres with vector index
    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["id"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
            )
        ]
    )
```
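
At query time, embed the query with the same model and search the exported table. A minimal sketch using `sentence-transformers` plus `psycopg` with pgvector; the table name below assumes CocoIndex's default `<flow name>__<target name>` naming, so adjust it and the connection string to your deployment:

```python
import psycopg
from pgvector.psycopg import register_vector
from sentence_transformers import SentenceTransformer

# Must match the model used in the flow so vectors share the same space.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def search(query: str, limit: int = 5) -> list[tuple[str, str]]:
    query_vec = model.encode(query)
    with psycopg.connect("postgresql://localhost/cocoindex") as conn:  # hypothetical DSN
        register_vector(conn)
        return conn.execute(
            # "<=>" is pgvector's cosine-distance operator.
            "SELECT filename, text FROM textembedding__doc_embeddings"
            " ORDER BY embedding <=> %s LIMIT %s",
            (query_vec, limit),
        ).fetchall()
```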

### Pattern 2: Code Embedding with Language Detection

```python
@cocoindex.flow_def(name="CodeEmbedding")
def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["files"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(
            path=".",
            included_patterns=["*.py", "*.rs", "*.md"],
            excluded_patterns=["**/.*", "target", "**/node_modules"]
        )
    )

    code_embeddings = data_scope.add_collector()

    with data_scope["files"].row() as file:
        # Detect language
        file["language"] = file["filename"].transform(
            cocoindex.functions.DetectProgrammingLanguage()
        )

        # Split using language-aware chunking
        file["chunks"] = file["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language=file["language"],
            chunk_size=1000,
            chunk_overlap=300
        )

        with file["chunks"].row() as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )

            code_embeddings.collect(
                filename=file["filename"],
                location=chunk["location"],
                code=chunk["text"],
                embedding=chunk["embedding"],
                start=chunk["start"],
                end=chunk["end"]
            )

    code_embeddings.export(
        "code_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
            )
        ]
    )
```

### Pattern 3: LLM-based Extraction to Knowledge Graph

Extract structured information using LLMs and build a knowledge graph.

```python
import dataclasses

@dataclasses.dataclass
class ProductInfo:
    id: str
    title: str
    price: float

@dataclasses.dataclass
class Taxonomy:
    name: str

@cocoindex.flow_def(name="ProductGraph")
def product_graph_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Set up the Neo4j connection
    neo4j_conn = cocoindex.add_auth_entry(
        "Neo4jConnection",
        cocoindex.targets.Neo4jConnection(
            uri="bolt://localhost:7687",
            user="neo4j",
            password="password"
        )
    )

    data_scope["products"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="products", included_patterns=["*.json"])
    )

    product_nodes = data_scope.add_collector()
    product_taxonomy = data_scope.add_collector()

    with data_scope["products"].row() as product:
        # Parse JSON and extract info
        data = product["content"].transform(
            cocoindex.functions.ParseJson()
        )

        # Use an LLM to extract taxonomies
        taxonomy = data["description"].transform(
            cocoindex.functions.ExtractByLlm(
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.OPENAI,
                    model="gpt-4"
                ),
                output_type=list[Taxonomy]
            )
        )

        product_nodes.collect(
            id=data["id"],
            title=data["title"],
            price=data["price"]
        )

        with taxonomy.row() as t:
            product_taxonomy.collect(
                id=cocoindex.GeneratedField.UUID,
                product_id=data["id"],
                taxonomy=t["name"]
            )

    # Export product nodes
    product_nodes.export(
        "product_node",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Nodes(label="Product")
        ),
        primary_key_fields=["id"]
    )

    # Declare taxonomy nodes, which are only referenced via relationships below
    flow_builder.declare(
        cocoindex.targets.Neo4jDeclaration(
            connection=neo4j_conn,
            nodes_label="Taxonomy",
            primary_key_fields=["value"]
        )
    )

    # Export relationships
    product_taxonomy.export(
        "product_taxonomy",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Relationships(
                rel_type="HAS_TAXONOMY",
                source=cocoindex.targets.NodeFromFields(
                    label="Product",
                    fields=[cocoindex.targets.TargetFieldMapping(source="product_id", target="id")]
                ),
                target=cocoindex.targets.NodeFromFields(
                    label="Taxonomy",
                    fields=[cocoindex.targets.TargetFieldMapping(source="taxonomy", target="value")]
                )
            )
        ),
        primary_key_fields=["id"]
    )
```
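
Once exported, the graph can be queried with plain Cypher. A small sketch using the official `neo4j` Python driver (connection details mirror the flow above; the product id is a hypothetical example):

```python
from neo4j import GraphDatabase

# Find products that share a taxonomy with a given product.
with GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) as driver:
    records, _, _ = driver.execute_query(
        """
        MATCH (p:Product {id: $product_id})-[:HAS_TAXONOMY]->(t:Taxonomy)
              <-[:HAS_TAXONOMY]-(other:Product)
        RETURN DISTINCT other.title AS title, t.value AS taxonomy
        """,
        product_id="product-123",  # hypothetical id
    )
    for record in records:
        print(record["title"], record["taxonomy"])
```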

### Pattern 4: Live Updates with Refresh Interval

```python
import datetime

@cocoindex.flow_def(name="LiveDataFlow")
def live_data_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Add source with refresh interval
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="live_documents"),
        refresh_interval=datetime.timedelta(minutes=1)  # Refresh every minute
    )

    # ... rest of flow definition
```
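
The refresh interval only takes effect while the flow is kept live. A minimal driver sketch, assuming the `cocoindex.FlowLiveUpdater` API (verify the exact names against your installed version):

```python
# Keep targets continuously in sync; names below are assumptions.
updater = cocoindex.FlowLiveUpdater(live_data_flow)
updater.start()     # watch sources and honor refresh_interval
try:
    updater.wait()  # block while changes stream through the flow
finally:
    updater.abort()
```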

### Pattern 5: Custom Transform Function

```python
import dataclasses

# Returning a dataclass gives the output a typed struct, so individual
# fields can be accessed downstream (a bare dict carries no field schema).
@dataclasses.dataclass
class DocMetadata:
    word_count: int
    char_count: int
    source: str

@cocoindex.op.function(behavior_version=1)
def extract_metadata(content: str, filename: str) -> DocMetadata:
    """Extract metadata from document content."""
    return DocMetadata(
        word_count=len(content.split()),
        char_count=len(content),
        source=filename
    )

@cocoindex.flow_def(name="CustomFunctionFlow")
def custom_function_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Use the custom function
        doc["metadata"] = doc["content"].transform(
            extract_metadata,
            filename=doc["filename"]
        )

        collector.collect(
            filename=doc["filename"],
            word_count=doc["metadata"]["word_count"],
            char_count=doc["metadata"]["char_count"]
        )

    collector.export("metadata", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```
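
When a custom function is expensive (for example, it calls a model or an external API), CocoIndex can cache its results and reuse them across runs; `behavior_version` exists so that bumping it invalidates stale cache entries when the logic changes. A short sketch (the `cache` flag follows usage seen in CocoIndex examples; verify against your version):

```python
@cocoindex.op.function(cache=True, behavior_version=2)
def summarize(content: str) -> str:
    # Stand-in for an expensive computation; cached results are reused
    # until behavior_version is bumped.
    return content[:200]
```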

### Pattern 6: Transform Flow for Reusable Logic

Transform flows extract transformation logic into a reusable unit shared between indexing and querying, which guarantees that query-time embeddings match the index-time embeddings.

```python
@cocoindex.transform_flow()
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
    """Shared embedding logic for both indexing and querying."""
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )

@cocoindex.flow_def(name="MainFlow")
def main_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Use the transform flow
        doc["embedding"] = text_to_embedding(doc["content"])
        collector.collect(text=doc["content"], embedding=doc["embedding"])

    collector.export("docs", cocoindex.targets.Postgres(), primary_key_fields=["text"])

# Later, reuse the same transform flow for querying
def search(query: str):
    query_embedding = text_to_embedding.eval(query)  # evaluate with a plain Python value
    # ... perform search with query_embedding
```

### Pattern 7: Concurrency Control

```python
@cocoindex.flow_def(name="ConcurrencyControlFlow")
def concurrency_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Limit concurrent processing at source level
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="large_documents"),
        max_inflight_rows=10,  # Max 10 documents at once
        max_inflight_bytes=100 * 1024 * 1024  # Max 100MB in memory
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            chunk_size=2000
        )

        # Limit concurrent processing at row iteration level
        with doc["chunks"].row(max_inflight_rows=100) as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )
            collector.collect(text=chunk["text"], embedding=chunk["embedding"])

    collector.export("chunks", cocoindex.targets.Postgres(), primary_key_fields=["text"])
```

## Data Source Patterns

### Local Files

```python
cocoindex.sources.LocalFile(
    path="documents",
    included_patterns=["*.md", "*.txt"],
    excluded_patterns=["**/.*", "node_modules"]
)
```

### Amazon S3

```python
cocoindex.sources.AmazonS3(
    bucket="my-bucket",
    prefix="documents/",
    included_patterns=["*.pdf"],
    aws_access_key_id=cocoindex.add_transient_auth_entry("..."),
    aws_secret_access_key=cocoindex.add_transient_auth_entry("...")
)
```
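
Rather than hard-coding credentials, pull them from the environment and wrap them in the same transient auth entries (this sketch reuses only the parameters shown above):

```python
import os

# Credentials come from the environment instead of source code.
source = cocoindex.sources.AmazonS3(
    bucket="my-bucket",
    prefix="documents/",
    included_patterns=["*.pdf"],
    aws_access_key_id=cocoindex.add_transient_auth_entry(os.environ["AWS_ACCESS_KEY_ID"]),
    aws_secret_access_key=cocoindex.add_transient_auth_entry(os.environ["AWS_SECRET_ACCESS_KEY"]),
)
```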

### Postgres Source

```python
cocoindex.sources.Postgres(
    connection=cocoindex.add_auth_entry(
        "postgres_conn",
        cocoindex.sources.PostgresConnection(
            host="localhost",
            database="mydb",
            user="user",
            password="password"
        )
    ),
    query="SELECT id, content FROM documents"
)
```

## Target Patterns

### Postgres

```python
collector.export(
    "target_name",
    cocoindex.targets.Postgres(),
    primary_key_fields=["id"],
    vector_indexes=[
        cocoindex.VectorIndexDef(
            field_name="embedding",
            metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
        )
    ]
)
```

### Qdrant

```python
collector.export(
    "target_name",
    cocoindex.targets.Qdrant(collection_name="my_collection"),
    primary_key_fields=["id"]
)
```

### LanceDB

```python
collector.export(
    "target_name",
    cocoindex.targets.LanceDB(
        uri="lancedb_data",
        table_name="my_table"
    ),
    primary_key_fields=["id"]
)
```

### Neo4j (Knowledge Graph)

```python
# Node export
collector.export(
    "nodes",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Nodes(label="Entity")
    ),
    primary_key_fields=["id"]
)

# Relationship export
collector.export(
    "relationships",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Relationships(
            rel_type="RELATES_TO",
            source=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="source_id", target="id")]
            ),
            target=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="target_id", target="id")]
            )
        )
    ),
    primary_key_fields=["id"]
)
```