# CocoIndex Flow Patterns

This reference provides common patterns and examples for building CocoIndex flows.

## Basic Flow Pattern

```python
import cocoindex

@cocoindex.flow_def(name="FlowName")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # 1. Import source data
    data_scope["source_data"] = flow_builder.add_source(...)

    # 2. Create collectors for output
    my_collector = data_scope.add_collector()

    # 3. Transform data
    with data_scope["source_data"].row() as item:
        item["transformed"] = item["field"].transform(...)
        my_collector.collect(...)

    # 4. Export to target
    my_collector.export("target_name", ..., primary_key_fields=[...])
```
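
Once a flow is defined, its targets have to be set up and an update pass run before any data lands. A minimal sketch of driving this from Python, assuming the `cocoindex.init()` / `setup()` / `update()` entry points and a `COCOINDEX_DATABASE_URL` environment variable pointing at CocoIndex's internal state store (verify against your installed version; the `cocoindex update` CLI is an alternative):

```python
import cocoindex

# Assumed entry points -- check the docs for your CocoIndex version.
cocoindex.init()          # load settings (e.g. COCOINDEX_DATABASE_URL) from the environment
my_flow.setup()           # create/migrate backing tables for the flow's targets
stats = my_flow.update()  # run one incremental indexing pass
print(stats)
```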

## Common Flow Patterns

### Pattern 1: Simple Text Embedding

Embed documents from local files into a vector database.

```python
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Import documents
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Split into chunks
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown",
            chunk_size=2000,
            chunk_overlap=500
        )

        with doc["chunks"].row() as chunk:
            # Embed each chunk
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )

            doc_embeddings.collect(
                id=cocoindex.GeneratedField.UUID,
                filename=doc["filename"],
                text=chunk["text"],
                embedding=chunk["embedding"]
            )

    # Export to Postgres with vector index
    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["id"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
            )
        ]
    )
```

### Pattern 2: Code Embedding with Language Detection

Chunk source files with language-aware splitting, then embed each chunk.

```python
@cocoindex.flow_def(name="CodeEmbedding")
def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["files"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(
            path=".",
            included_patterns=["*.py", "*.rs", "*.md"],
            excluded_patterns=["**/.*", "target", "**/node_modules"]
        )
    )

    code_embeddings = data_scope.add_collector()

    with data_scope["files"].row() as file:
        # Detect language
        file["language"] = file["filename"].transform(
            cocoindex.functions.DetectProgrammingLanguage()
        )

        # Split using language-aware chunking
        file["chunks"] = file["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language=file["language"],
            chunk_size=1000,
            chunk_overlap=300
        )

        with file["chunks"].row() as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )

            code_embeddings.collect(
                filename=file["filename"],
                location=chunk["location"],
                code=chunk["text"],
                embedding=chunk["embedding"],
                start=chunk["start"],
                end=chunk["end"]
            )

    code_embeddings.export(
        "code_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
            )
        ]
    )
```

### Pattern 3: LLM-based Extraction to Knowledge Graph

Extract structured information using LLMs and build a knowledge graph.

```python
import dataclasses

@dataclasses.dataclass
class ProductInfo:
    id: str
    title: str
    price: float

@dataclasses.dataclass
class Taxonomy:
    name: str

@cocoindex.flow_def(name="ProductGraph")
def product_graph_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Setup Neo4j connection
    neo4j_conn = cocoindex.add_auth_entry(
        "Neo4jConnection",
        cocoindex.targets.Neo4jConnection(
            uri="bolt://localhost:7687",
            user="neo4j",
            password="password"
        )
    )

    data_scope["products"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="products", included_patterns=["*.json"])
    )

    product_nodes = data_scope.add_collector()
    product_taxonomy = data_scope.add_collector()

    with data_scope["products"].row() as product:
        # Parse JSON and extract info
        data = product["content"].transform(
            cocoindex.functions.ParseJson()
        )

        # Use LLM to extract taxonomies
        taxonomy = data["description"].transform(
            cocoindex.functions.ExtractByLlm(
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.OPENAI,
                    model="gpt-4"
                ),
                output_type=list[Taxonomy]
            )
        )

        product_nodes.collect(
            id=data["id"],
            title=data["title"],
            price=data["price"]
        )

        with taxonomy.row() as t:
            product_taxonomy.collect(
                id=cocoindex.GeneratedField.UUID,
                product_id=data["id"],
                taxonomy=t["name"]
            )

    # Export product nodes
    product_nodes.export(
        "product_node",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Nodes(label="Product")
        ),
        primary_key_fields=["id"]
    )

    # Declare taxonomy nodes
    flow_builder.declare(
        cocoindex.targets.Neo4jDeclaration(
            connection=neo4j_conn,
            nodes_label="Taxonomy",
            primary_key_fields=["value"]
        )
    )

    # Export relationships
    product_taxonomy.export(
        "product_taxonomy",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Relationships(
                rel_type="HAS_TAXONOMY",
                source=cocoindex.targets.NodeFromFields(
                    label="Product",
                    fields=[cocoindex.targets.TargetFieldMapping(source="product_id", target="id")]
                ),
                target=cocoindex.targets.NodeFromFields(
                    label="Taxonomy",
                    fields=[cocoindex.targets.TargetFieldMapping(source="taxonomy", target="value")]
                )
            )
        ),
        primary_key_fields=["id"]
    )
```
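
To sanity-check the exported graph, query Neo4j directly. A minimal sketch using the official `neo4j` Python driver (not part of CocoIndex; the connection details simply repeat the auth entry above):

```python
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

# Follow the (:Product)-[:HAS_TAXONOMY]->(:Taxonomy) shape exported above.
records, _, _ = driver.execute_query(
    """
    MATCH (p:Product)-[:HAS_TAXONOMY]->(t:Taxonomy)
    RETURN p.title AS title, collect(t.value) AS taxonomies
    LIMIT 10
    """
)
for record in records:
    print(record["title"], record["taxonomies"])
driver.close()
```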

### Pattern 4: Live Updates with Refresh Interval

Poll a source on a schedule so downstream targets stay current.

```python
import datetime

@cocoindex.flow_def(name="LiveDataFlow")
def live_data_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Add source with refresh interval
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="live_documents"),
        refresh_interval=datetime.timedelta(minutes=1)  # Refresh every minute
    )

    # ... rest of flow definition
```
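
A one-shot update only picks up whatever is present when it runs; to keep polling on the refresh interval, run the flow under a live updater. A sketch assuming the `cocoindex.FlowLiveUpdater` API (verify against your installed version):

```python
import cocoindex

# Assumed API -- FlowLiveUpdater keeps the flow's targets in sync,
# re-reading the source every `refresh_interval` until stopped.
with cocoindex.FlowLiveUpdater(live_data_flow) as updater:
    updater.wait()  # block while updates run; interrupt to stop
```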

### Pattern 5: Custom Transform Function

Wrap plain Python in `@cocoindex.op.function()` to use it as a transform.

```python
@cocoindex.op.function(behavior_version=1)
def extract_metadata(content: str, filename: str) -> dict:
    """Extract metadata from document content."""
    return {
        "word_count": len(content.split()),
        "char_count": len(content),
        "source": filename
    }

@cocoindex.flow_def(name="CustomFunctionFlow")
def custom_function_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Use custom function
        doc["metadata"] = doc["content"].transform(
            extract_metadata,
            filename=doc["filename"]
        )

        collector.collect(
            filename=doc["filename"],
            word_count=doc["metadata"]["word_count"],
            char_count=doc["metadata"]["char_count"]
        )

    collector.export("metadata", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```

### Pattern 6: Transform Flow for Reusable Logic

A transform flow packages transformation logic into a reusable unit, so the same code runs at indexing time and at query time.

```python
@cocoindex.transform_flow()
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
    """Shared embedding logic for both indexing and querying."""
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )

@cocoindex.flow_def(name="MainFlow")
def main_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Use transform flow
        doc["embedding"] = text_to_embedding(doc["content"])
        collector.collect(text=doc["content"], embedding=doc["embedding"])

    collector.export("docs", cocoindex.targets.Postgres(), primary_key_fields=["text"])

# Later, use the same transform flow for querying
def search(query: str):
    query_embedding = text_to_embedding.eval(query)  # Evaluate with a plain input
    # ... perform search with query_embedding
```
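
One way to complete the `search` stub, assuming the export above landed in Postgres with pgvector and that the table is named `mainflow__docs` (CocoIndex derives target table names from the flow and target names; the table name and connection string here are placeholders, so check your database). The sketch uses `psycopg` with the pgvector adapter:

```python
import numpy as np
import psycopg
from pgvector.psycopg import register_vector

def search(query: str, top_k: int = 5):
    # Reuse the indexing-time transform flow so query and document
    # vectors come from the same model.
    query_embedding = text_to_embedding.eval(query)

    with psycopg.connect("postgresql://localhost/cocoindex") as conn:
        register_vector(conn)  # teach psycopg the `vector` type
        # `<=>` is pgvector's cosine-distance operator; add a vector
        # index (as in Pattern 1) for large tables.
        return conn.execute(
            "SELECT text, embedding <=> %s AS distance "
            "FROM mainflow__docs ORDER BY distance LIMIT %s",
            (np.array(query_embedding), top_k),
        ).fetchall()
```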

### Pattern 7: Concurrency Control

Cap how many rows are processed in parallel to bound memory use and API load.

```python
@cocoindex.flow_def(name="ConcurrencyControlFlow")
def concurrency_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Limit concurrent processing at source level
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="large_documents"),
        max_inflight_rows=10,  # Max 10 documents at once
        max_inflight_bytes=100 * 1024 * 1024  # Max 100MB in memory
    )

    collector = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            chunk_size=2000
        )

        # Limit concurrent processing at row iteration level
        with doc["chunks"].row(max_inflight_rows=100) as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )
            collector.collect(text=chunk["text"], embedding=chunk["embedding"])

    collector.export("chunks", cocoindex.targets.Postgres(), primary_key_fields=["text"])
```

## Data Source Patterns

### Local Files

```python
cocoindex.sources.LocalFile(
    path="documents",
    included_patterns=["*.md", "*.txt"],
    excluded_patterns=["**/.*", "node_modules"]
)
```

### Amazon S3

```python
cocoindex.sources.AmazonS3(
    bucket="my-bucket",
    prefix="documents/",
    included_patterns=["*.pdf"],
    aws_access_key_id=cocoindex.add_transient_auth_entry("..."),
    aws_secret_access_key=cocoindex.add_transient_auth_entry("...")
)
```

### Postgres Source

```python
cocoindex.sources.Postgres(
    connection=cocoindex.add_auth_entry(
        "postgres_conn",
        cocoindex.sources.PostgresConnection(
            host="localhost",
            database="mydb",
            user="user",
            password="password"
        )
    ),
    query="SELECT id, content FROM documents"
)
```

## Target Patterns

### Postgres

```python
collector.export(
    "target_name",
    cocoindex.targets.Postgres(),
    primary_key_fields=["id"],
    vector_indexes=[
        cocoindex.VectorIndexDef(
            field_name="embedding",
            metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
        )
    ]
)
```

### Qdrant

```python
collector.export(
    "target_name",
    cocoindex.targets.Qdrant(collection_name="my_collection"),
    primary_key_fields=["id"]
)
```
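
On the query side, Qdrant is searched with the official `qdrant-client` package (not part of CocoIndex). A minimal sketch, assuming the flow stored its vector under the field name `embedding` as a named vector; adjust the name, or pass the raw list, to match how your collection was created:

```python
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

# Must be the same model used at indexing time.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
query_embedding = model.encode("how do I configure flows?").tolist()

client = QdrantClient(url="http://localhost:6333")
hits = client.search(
    collection_name="my_collection",
    query_vector=("embedding", query_embedding),  # assumed named vector
    limit=5,
)
for hit in hits:
    print(hit.score, hit.payload)
```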

### LanceDB

```python
collector.export(
    "target_name",
    cocoindex.targets.LanceDB(
        uri="lancedb_data",
        table_name="my_table"
    ),
    primary_key_fields=["id"]
)
```

### Neo4j (Knowledge Graph)

```python
# Node export
collector.export(
    "nodes",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Nodes(label="Entity")
    ),
    primary_key_fields=["id"]
)

# Relationship export
collector.export(
    "relationships",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Relationships(
            rel_type="RELATES_TO",
            source=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="source_id", target="id")]
            ),
            target=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="target_id", target="id")]
            )
        )
    ),
    primary_key_fields=["id"]
)
```