# CocoIndex Flow Patterns
This reference provides common patterns and examples for building CocoIndex flows.
## Basic Flow Pattern
```python
import cocoindex

@cocoindex.flow_def(name="FlowName")
def my_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # 1. Import source data
    data_scope["source_data"] = flow_builder.add_source(...)

    # 2. Create collectors for output
    my_collector = data_scope.add_collector()

    # 3. Transform data
    with data_scope["source_data"].row() as item:
        item["transformed"] = item["field"].transform(...)
        my_collector.collect(...)

    # 4. Export to target
    my_collector.export("target_name", ..., primary_key_fields=[...])
```
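Defining a flow does not run it: targets must be set up once, then the flow updated. Below is a minimal driver sketch, assuming the programmatic API (`cocoindex.init()`, `Flow.setup()`, `Flow.update()`); the `cocoindex` CLI offers equivalent `setup` and `update` commands.
```python
# Minimal driver sketch -- assumes cocoindex.init(), Flow.setup(),
# and Flow.update() from the Python API.
import cocoindex

def main() -> None:
    cocoindex.init()           # load settings, e.g. COCOINDEX_DATABASE_URL
    my_flow.setup()            # create backing tables/indexes for all targets
    stats = my_flow.update()   # one-shot incremental index update
    print(stats)

if __name__ == "__main__":
    main()
```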
## Common Flow Patterns
### Pattern 1: Simple Text Embedding
Embed documents from local files into a vector database.
```python
@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Import documents
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )
    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Split into chunks
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown",
            chunk_size=2000,
            chunk_overlap=500
        )
        with doc["chunks"].row() as chunk:
            # Embed each chunk
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )
            doc_embeddings.collect(
                id=cocoindex.GeneratedField.UUID,
                filename=doc["filename"],
                text=chunk["text"],
                embedding=chunk["embedding"]
            )

    # Export to Postgres with vector index
    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["id"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
            )
        ]
    )
```
### Pattern 2: Code Embedding with Language Detection
```python
@cocoindex.flow_def(name="CodeEmbedding")
def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["files"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(
            path=".",
            included_patterns=["*.py", "*.rs", "*.md"],
            excluded_patterns=["**/.*", "target", "**/node_modules"]
        )
    )
    code_embeddings = data_scope.add_collector()

    with data_scope["files"].row() as file:
        # Detect language
        file["language"] = file["filename"].transform(
            cocoindex.functions.DetectProgrammingLanguage()
        )
        # Split using language-aware chunking
        file["chunks"] = file["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language=file["language"],
            chunk_size=1000,
            chunk_overlap=300
        )
        with file["chunks"].row() as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )
            code_embeddings.collect(
                filename=file["filename"],
                location=chunk["location"],
                code=chunk["text"],
                embedding=chunk["embedding"],
                start=chunk["start"],
                end=chunk["end"]
            )

    code_embeddings.export(
        "code_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
            )
        ]
    )
```
### Pattern 3: LLM-based Extraction to Knowledge Graph
Extract structured information using LLMs and build a knowledge graph.
```python
import dataclasses

@dataclasses.dataclass
class ProductInfo:
    id: str
    title: str
    price: float

@dataclasses.dataclass
class Taxonomy:
    name: str

@cocoindex.flow_def(name="ProductGraph")
def product_graph_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Set up the Neo4j connection
    neo4j_conn = cocoindex.add_auth_entry(
        "Neo4jConnection",
        cocoindex.targets.Neo4jConnection(
            uri="bolt://localhost:7687",
            user="neo4j",
            password="password"
        )
    )
    data_scope["products"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="products", included_patterns=["*.json"])
    )
    product_nodes = data_scope.add_collector()
    product_taxonomy = data_scope.add_collector()

    with data_scope["products"].row() as product:
        # Parse JSON and extract info
        data = product["content"].transform(
            cocoindex.functions.ParseJson()
        )
        # Use an LLM to extract taxonomies
        taxonomy = data["description"].transform(
            cocoindex.functions.ExtractByLlm(
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.OPENAI,
                    model="gpt-4"
                ),
                output_type=list[Taxonomy]
            )
        )
        product_nodes.collect(
            id=data["id"],
            title=data["title"],
            price=data["price"]
        )
        with taxonomy.row() as t:
            product_taxonomy.collect(
                id=cocoindex.GeneratedField.UUID,
                product_id=data["id"],
                taxonomy=t["name"]
            )

    # Export product nodes
    product_nodes.export(
        "product_node",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Nodes(label="Product")
        ),
        primary_key_fields=["id"]
    )
    # Declare taxonomy nodes
    flow_builder.declare(
        cocoindex.targets.Neo4jDeclaration(
            connection=neo4j_conn,
            nodes_label="Taxonomy",
            primary_key_fields=["value"]
        )
    )
    # Export relationships
    product_taxonomy.export(
        "product_taxonomy",
        cocoindex.targets.Neo4j(
            connection=neo4j_conn,
            mapping=cocoindex.targets.Relationships(
                rel_type="HAS_TAXONOMY",
                source=cocoindex.targets.NodeFromFields(
                    label="Product",
                    fields=[cocoindex.targets.TargetFieldMapping(source="product_id", target="id")]
                ),
                target=cocoindex.targets.NodeFromFields(
                    label="Taxonomy",
                    fields=[cocoindex.targets.TargetFieldMapping(source="taxonomy", target="value")]
                )
            )
        ),
        primary_key_fields=["id"]
    )
```
### Pattern 4: Live Updates with Refresh Interval
```python
import datetime

@cocoindex.flow_def(name="LiveDataFlow")
def live_data_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Add source with refresh interval
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="live_documents"),
        refresh_interval=datetime.timedelta(minutes=1)  # Refresh every minute
    )
    # ... rest of flow definition
```
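To actually keep targets in sync while the source refreshes, run the flow in live-update mode. A minimal sketch, assuming `cocoindex.FlowLiveUpdater` from the Python API:
```python
# Live-update sketch -- assumes cocoindex.FlowLiveUpdater.
with cocoindex.FlowLiveUpdater(live_data_flow) as updater:
    updater.wait()  # block while source changes are continuously applied
```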
### Pattern 5: Custom Transform Function
```python
import dataclasses

@dataclasses.dataclass
class DocumentMetadata:
    word_count: int
    char_count: int
    source: str

@cocoindex.op.function(behavior_version=1)
def extract_metadata(content: str, filename: str) -> DocumentMetadata:
    """Extract metadata from document content."""
    return DocumentMetadata(
        word_count=len(content.split()),
        char_count=len(content),
        source=filename
    )

@cocoindex.flow_def(name="CustomFunctionFlow")
def custom_function_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )
    collector = data_scope.add_collector()
    with data_scope["documents"].row() as doc:
        # Use the custom function; returning a dataclass gives the result
        # a struct type whose fields can be accessed downstream
        doc["metadata"] = doc["content"].transform(
            extract_metadata,
            filename=doc["filename"]
        )
        collector.collect(
            filename=doc["filename"],
            word_count=doc["metadata"]["word_count"],
            char_count=doc["metadata"]["char_count"]
        )
    collector.export("metadata", cocoindex.targets.Postgres(), primary_key_fields=["filename"])
```
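For expensive custom functions, results can be cached across runs. A sketch of the decorator options, assuming the `cache` flag on `@cocoindex.op.function`; bump `behavior_version` whenever the function's logic changes so stale cache entries are discarded:
```python
# Caching sketch -- assumes the `cache` option on @cocoindex.op.function.
@cocoindex.op.function(cache=True, behavior_version=1)
def expensive_extract(content: str) -> str:
    """Results are reused across runs until behavior_version is bumped."""
    ...
```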
### Pattern 6: Transform Flow for Reusable Logic
A transform flow packages reusable transformation logic so the same code can be shared between indexing and querying.
```python
@cocoindex.transform_flow()
def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
    """Shared embedding logic for both indexing and querying."""
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )

@cocoindex.flow_def(name="MainFlow")
def main_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="documents")
    )
    collector = data_scope.add_collector()
    with data_scope["documents"].row() as doc:
        # Use transform flow
        doc["embedding"] = text_to_embedding(doc["content"])
        collector.collect(text=doc["content"], embedding=doc["embedding"])
    collector.export("docs", cocoindex.targets.Postgres(), primary_key_fields=["text"])

# Later, use the same transform flow for querying
def search(query: str):
    query_embedding = text_to_embedding.eval(query)  # Evaluate with input
    # ... perform search with query_embedding
```
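The partial `search` function above can be completed against the Postgres target. A minimal sketch, assuming psycopg with the pgvector adapter and a hypothetical table name `mainflow__docs` (the actual name is derived from the flow and target names):
```python
# Query-side sketch -- assumes psycopg + pgvector, and a hypothetical
# table name "mainflow__docs" for the exported target.
import os

import numpy as np
import psycopg
from pgvector.psycopg import register_vector

def search(query: str, top_k: int = 5):
    # Reuse the exact same embedding logic as indexing
    query_vec = np.array(text_to_embedding.eval(query), dtype=np.float32)
    with psycopg.connect(os.environ["COCOINDEX_DATABASE_URL"]) as conn:
        register_vector(conn)
        return conn.execute(
            "SELECT text, embedding <=> %s AS distance "
            "FROM mainflow__docs ORDER BY distance LIMIT %s",
            (query_vec, top_k),
        ).fetchall()
```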
### Pattern 7: Concurrency Control
```python
@cocoindex.flow_def(name="ConcurrencyControlFlow")
def concurrency_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    # Limit concurrent processing at source level
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="large_documents"),
        max_inflight_rows=10,  # Max 10 documents at once
        max_inflight_bytes=100 * 1024 * 1024  # Max 100MB in memory
    )
    collector = data_scope.add_collector()
    with data_scope["documents"].row() as doc:
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            chunk_size=2000
        )
        # Limit concurrent processing at row iteration level
        with doc["chunks"].row(max_inflight_rows=100) as chunk:
            chunk["embedding"] = chunk["text"].transform(
                cocoindex.functions.SentenceTransformerEmbed(
                    model="sentence-transformers/all-MiniLM-L6-v2"
                )
            )
            collector.collect(text=chunk["text"], embedding=chunk["embedding"])
    collector.export("chunks", cocoindex.targets.Postgres(), primary_key_fields=["text"])
```
## Data Source Patterns
### Local Files
```python
cocoindex.sources.LocalFile(
    path="documents",
    included_patterns=["*.md", "*.txt"],
    excluded_patterns=["**/.*", "node_modules"]
)
```
### Amazon S3
```python
cocoindex.sources.AmazonS3(
    bucket="my-bucket",
    prefix="documents/",
    included_patterns=["*.pdf"],
    aws_access_key_id=cocoindex.add_transient_auth_entry("..."),
    aws_secret_access_key=cocoindex.add_transient_auth_entry("...")
)
```
### Postgres Source
```python
cocoindex.sources.Postgres(
    connection=cocoindex.add_auth_entry(
        "postgres_conn",
        cocoindex.sources.PostgresConnection(
            host="localhost",
            database="mydb",
            user="user",
            password="password"
        )
    ),
    query="SELECT id, content FROM documents"
)
```
## Target Patterns
### Postgres
```python
collector.export(
    "target_name",
    cocoindex.targets.Postgres(),
    primary_key_fields=["id"],
    vector_indexes=[
        cocoindex.VectorIndexDef(
            field_name="embedding",
            metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY
        )
    ]
)
```
### Qdrant
```python
collector.export(
    "target_name",
    cocoindex.targets.Qdrant(collection_name="my_collection"),
    primary_key_fields=["id"]
)
```
### LanceDB
```python
collector.export(
    "target_name",
    cocoindex.targets.LanceDB(
        uri="lancedb_data",
        table_name="my_table"
    ),
    primary_key_fields=["id"]
)
```
### Neo4j (Knowledge Graph)
```python
# Node export
collector.export(
    "nodes",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Nodes(label="Entity")
    ),
    primary_key_fields=["id"]
)

# Relationship export
collector.export(
    "relationships",
    cocoindex.targets.Neo4j(
        connection=neo4j_conn,
        mapping=cocoindex.targets.Relationships(
            rel_type="RELATES_TO",
            source=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="source_id", target="id")]
            ),
            target=cocoindex.targets.NodeFromFields(
                label="Entity",
                fields=[cocoindex.targets.TargetFieldMapping(source="target_id", target="id")]
            )
        )
    ),
    primary_key_fields=["id"]
)
```