---
description: Create and manage Apache Iceberg tables with ACID transactions and schema evolution
---

# Apache Iceberg Tables

Help the user work with Apache Iceberg tables for data lakes, with ACID transactions, time travel, and schema evolution.

## Steps

1. **Add required dependencies**:

```toml
[dependencies]
iceberg = "0.3"
iceberg-catalog-rest = "0.3"
arrow = "52"
parquet = "52"
object_store = "0.9"
tokio = { version = "1", features = ["full"] }

# Used by the examples below
anyhow = "1"
chrono = "0.4"
futures = "0.3"
```

2. **Set up Iceberg catalog**:

```rust
use iceberg_catalog_rest::{RestCatalog, RestCatalogConfig};

// REST catalog (works with services like Polaris, Nessie, etc.)
fn create_catalog() -> RestCatalog {
    let config = RestCatalogConfig::builder()
        .uri("http://localhost:8181".to_string()) // Catalog endpoint
        .warehouse("warehouse".to_string())       // Warehouse name
        .build();

    RestCatalog::new(config)
}

// For AWS Glue, use the iceberg-catalog-glue crate:
// use iceberg_catalog_glue::GlueCatalog;

// For an in-memory catalog (testing), use the iceberg-catalog-memory crate:
// use iceberg_catalog_memory::MemoryCatalog;
```
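
Tables live inside a namespace, so create one before step 3 if it does not already exist. A minimal sketch, assuming the `db` namespace used below (the exact `create_namespace`/`namespace_exists` signatures may vary by iceberg-rust version):

```rust
use std::collections::HashMap;

use iceberg::{Catalog, NamespaceIdent, Result};

async fn ensure_namespace(catalog: &impl Catalog) -> Result<()> {
    let namespace = NamespaceIdent::new("db".to_string());
    if !catalog.namespace_exists(&namespace).await? {
        // The properties map can carry location, comments, etc.
        catalog.create_namespace(&namespace, HashMap::new()).await?;
    }
    Ok(())
}
```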

3. **Create an Iceberg table**:

```rust
use iceberg::spec::{
    NestedField, NullOrder, PrimitiveType, Schema, SortDirection, SortField,
    SortOrder, Transform, Type, UnboundPartitionSpec,
};
use iceberg::{Catalog, NamespaceIdent, Result, TableCreation};

async fn create_table(catalog: &impl Catalog) -> Result<()> {
    // Define schema (field IDs must be unique and stable)
    let schema = Schema::builder()
        .with_fields(vec![
            NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long)).into(),
            NestedField::required(2, "timestamp", Type::Primitive(PrimitiveType::Timestamp)).into(),
            NestedField::required(3, "user_id", Type::Primitive(PrimitiveType::String)).into(),
            NestedField::optional(4, "event_type", Type::Primitive(PrimitiveType::String)).into(),
            NestedField::optional(5, "properties", Type::Primitive(PrimitiveType::String)).into(),
        ])
        .build()?;

    // Partition by year and month of the timestamp field (source id 2)
    let partition_spec = UnboundPartitionSpec::builder()
        .add_partition_field(2, "year", Transform::Year)?
        .add_partition_field(2, "month", Transform::Month)?
        .build();

    // Define sort order (for data clustering within files)
    let sort_order = SortOrder::builder()
        .with_sort_field(
            SortField::builder()
                .source_id(2) // timestamp field
                .transform(Transform::Identity)
                .direction(SortDirection::Ascending)
                .null_order(NullOrder::First)
                .build(),
        )
        .build_unbound()?;

    // Create table
    let table_creation = TableCreation::builder()
        .name("events".to_string())
        .schema(schema)
        .partition_spec(partition_spec)
        .sort_order(sort_order)
        .build();

    // create_table takes the namespace; the table name comes from TableCreation
    let namespace = NamespaceIdent::new("db".to_string());
    catalog.create_table(&namespace, table_creation).await?;

    println!("Table created: db.events");
    Ok(())
}
```

4. **Load an existing table**:

```rust
use iceberg::table::Table;
use iceberg::{Catalog, NamespaceIdent, Result, TableIdent};

async fn load_table(catalog: &impl Catalog) -> Result<Table> {
    let namespace = NamespaceIdent::new("db".to_string());
    let table_ident = TableIdent::new(namespace, "events".to_string());

    let table = catalog.load_table(&table_ident).await?;

    // Inspect table metadata
    println!("Schema: {:?}", table.metadata().current_schema());
    println!("Location: {}", table.metadata().location());
    println!("Snapshots: {}", table.metadata().snapshots().count());

    Ok(table)
}
```

5. **Write data to Iceberg table**:

```rust
// NOTE: the high-level writer shown here is a sketch of the conceptual flow
// (write data files, then commit them as one snapshot). iceberg-rust's write
// path is still evolving; check the crate docs for the current builder API
// (e.g., ParquetWriterBuilder + DataFileWriterBuilder).
use arrow::record_batch::RecordBatch;
use iceberg::table::Table;
use iceberg::Result;

async fn write_data(table: &Table, batches: Vec<RecordBatch>) -> Result<()> {
    // Create a writer that partitions rows by the table's default spec
    let mut writer = table
        .writer()
        .partition_by(table.metadata().default_partition_spec())
        .build()
        .await?;

    // Write batches
    for batch in batches {
        writer.write(&batch).await?;
    }

    // Close the writer to collect the data files it produced
    let data_files = writer.close().await?;

    // Commit all files in a single snapshot (ACID append)
    let mut append = table.new_append();
    for file in data_files {
        append.add_data_file(file)?;
    }
    append.commit().await?;

    println!("Data written and committed");
    Ok(())
}
```

6. **Read data with time travel**:

```rust
use arrow::record_batch::RecordBatch;
use futures::TryStreamExt;
use iceberg::table::Table;
use iceberg::Result;

async fn read_latest(table: &Table) -> Result<Vec<RecordBatch>> {
    // Read the latest snapshot; to_arrow() yields a stream of record batches
    let scan = table.scan().build()?;
    let batches = scan.to_arrow().await?.try_collect().await?;
    Ok(batches)
}

async fn read_snapshot(table: &Table, snapshot_id: i64) -> Result<Vec<RecordBatch>> {
    // Time travel to a specific snapshot
    let scan = table.scan().snapshot_id(snapshot_id).build()?;
    let batches = scan.to_arrow().await?.try_collect().await?;
    Ok(batches)
}

async fn read_as_of_timestamp(table: &Table, timestamp_ms: i64) -> Result<Vec<RecordBatch>> {
    // Time travel to a point in time: resolve the last snapshot at or before
    // the timestamp from table history, then scan that snapshot
    let snapshot_id = table
        .metadata()
        .snapshots()
        .filter(|s| s.timestamp_ms() <= timestamp_ms)
        .max_by_key(|s| s.timestamp_ms())
        .map(|s| s.snapshot_id())
        .expect("no snapshot at or before the given timestamp");

    read_snapshot(table, snapshot_id).await
}
```

7. **Perform schema evolution**:

```rust
// NOTE: a sketch of Iceberg's UpdateSchema actions. Schema evolution is
// metadata-only (no data rewrite); the exact transaction API exposed by
// iceberg-rust may differ from the method names below.
use iceberg::spec::{PrimitiveType, Type};
use iceberg::table::Table;
use iceberg::Result;

async fn evolve_schema(table: &mut Table) -> Result<()> {
    // Add new (optional) column
    table
        .update_schema()
        .add_column("new_field", Type::Primitive(PrimitiveType::String), true)?
        .commit()
        .await?;
    println!("Added column: new_field");

    // Rename column
    table
        .update_schema()
        .rename_column("old_name", "new_name")?
        .commit()
        .await?;
    println!("Renamed column: old_name -> new_name");

    // Delete column (metadata only; data files are untouched)
    table
        .update_schema()
        .delete_column("unused_field")?
        .commit()
        .await?;
    println!("Deleted column: unused_field");

    // Widen column type (only safe promotions, e.g. int -> long, float -> double)
    table
        .update_schema()
        .update_column("numeric_field", Type::Primitive(PrimitiveType::Double))?
        .commit()
        .await?;

    // Reorder columns
    table
        .update_schema()
        .move_first("important_field")?
        .move_after("field_a", "field_b")?
        .commit()
        .await?;

    Ok(())
}
```

8. **Query history and snapshots**:

```rust
use iceberg::table::Table;

fn inspect_history(table: &Table) {
    let metadata = table.metadata();

    // List all snapshots
    println!("Snapshots:");
    for snapshot in metadata.snapshots() {
        println!(
            "  ID: {}, Timestamp: {}, Summary: {:?}",
            snapshot.snapshot_id(),
            snapshot.timestamp_ms(),
            snapshot.summary()
        );
    }

    // Get current snapshot
    if let Some(current) = metadata.current_snapshot() {
        println!("Current snapshot: {}", current.snapshot_id());
        println!("Manifest list: {}", current.manifest_list());
    }

    // Get schema history
    println!("\nSchema versions:");
    for schema in metadata.schemas() {
        println!("  Schema ID {}: {} fields", schema.schema_id(), schema.fields().len());
    }
}
```

## Advanced Features

**Partition evolution**:

```rust
// NOTE: a sketch of Iceberg's partition-spec evolution; the exact
// iceberg-rust transaction API may differ from the method names below.
use iceberg::spec::Transform;
use iceberg::table::Table;
use iceberg::Result;

async fn evolve_partitioning(table: &mut Table) -> Result<()> {
    // Change partition strategy without rewriting existing data;
    // old files keep the old spec, new files use the new one
    let mut update = table.update_partition_spec();

    // Add day partitioning on the timestamp column
    update.add_field("timestamp", "day", Transform::Day)?;

    // Remove old month partitioning
    update.remove_field("month")?;

    update.commit().await?;

    println!("Partition spec evolved");
    Ok(())
}
```

**Hidden partitioning**:

```rust
// Iceberg supports hidden partitioning: partition on values derived from a
// regular column, so users never reference partition columns in queries.
use iceberg::spec::{NestedField, PrimitiveType, Schema, Transform, Type, UnboundPartitionSpec};
use iceberg::{Catalog, Result};

async fn create_table_with_hidden_partitioning(catalog: &impl Catalog) -> Result<()> {
    let schema = Schema::builder()
        .with_fields(vec![
            NestedField::required(1, "timestamp", Type::Primitive(PrimitiveType::Timestamp)).into(),
            NestedField::required(2, "data", Type::Primitive(PrimitiveType::String)).into(),
        ])
        .build()?;

    // Partition by year(timestamp) and month(timestamp); timestamp itself
    // stays a regular column, not a partition column
    let partition_spec = UnboundPartitionSpec::builder()
        .add_partition_field(1, "year", Transform::Year)?
        .add_partition_field(1, "month", Transform::Month)?
        .build();

    // Create the table with this schema and spec as in step 3. Queries like
    //   SELECT * FROM table WHERE timestamp >= '2024-01-01'
    // then automatically benefit from partition pruning.
    let _ = (schema, partition_spec, catalog); // placeholder; see step 3

    Ok(())
}
```

**Incremental reads**:

```rust
use arrow::record_batch::RecordBatch;
use futures::TryStreamExt;
use iceberg::table::Table;
use iceberg::Result;

async fn incremental_read(
    table: &Table,
    from_snapshot_id: i64,
    to_snapshot_id: Option<i64>,
) -> Result<Vec<RecordBatch>> {
    // Read only the data appended between two snapshots
    // (incremental scans require a recent iceberg-rust version)
    let to_id = to_snapshot_id.unwrap_or_else(|| {
        table.metadata().current_snapshot().unwrap().snapshot_id()
    });

    let scan = table
        .scan()
        .from_snapshot_id(from_snapshot_id)
        .snapshot_id(to_id)
        .build()?;

    let batches = scan.to_arrow().await?.try_collect().await?;
    Ok(batches)
}
```

**Filtering and projection**:

```rust
use arrow::record_batch::RecordBatch;
use futures::TryStreamExt;
use iceberg::expr::Reference;
use iceberg::spec::Datum;
use iceberg::table::Table;
use iceberg::Result;

async fn filtered_scan(table: &Table) -> Result<Vec<RecordBatch>> {
    // Build a predicate: timestamp > 2024-01-01T00:00:00 AND event_type = 'click'
    let predicate = Reference::new("timestamp")
        .greater_than(Datum::timestamp_micros(1_704_067_200_000_000))
        .and(Reference::new("event_type").equal_to(Datum::string("click")));

    // Scan with predicate pushdown and column projection
    let scan = table
        .scan()
        .with_filter(predicate)
        .select(["user_id", "timestamp", "event_type"])
        .build()?;

    let batches = scan.to_arrow().await?.try_collect().await?;
    Ok(batches)
}
```

**Compaction (optimize files)**:

```rust
// NOTE: iceberg-rust does not yet ship a built-in compaction action; this is
// a sketch of the manual approach (read small files, rewrite them as larger
// ones, then atomically swap old files for new in one commit).
use futures::TryStreamExt;
use iceberg::table::Table;
use iceberg::Result;

async fn compact_table(table: &Table) -> Result<()> {
    // Read current data (small files)
    let scan = table.scan().build()?;
    let batches: Vec<_> = scan.to_arrow().await?.try_collect().await?;

    // Rewrite as larger, optimized files
    let mut writer = table
        .writer()
        .partition_by(table.metadata().default_partition_spec())
        .build()
        .await?;

    for batch in batches {
        writer.write(&batch).await?;
    }

    let new_files = writer.close().await?;

    // Atomic replace: delete old files and add new ones in a single snapshot
    let mut rewrite = table.new_rewrite();
    rewrite
        .delete_files(/* old files */)
        .add_files(new_files)
        .commit()
        .await?;

    Ok(())
}
```

## Integration with DataFusion

```rust
// Requires the iceberg-datafusion crate alongside datafusion itself.
use std::sync::Arc;

use datafusion::prelude::*;
use iceberg_datafusion::IcebergTableProvider;

async fn query_with_datafusion(table: iceberg::table::Table) -> anyhow::Result<()> {
    // Create DataFusion context
    let ctx = SessionContext::new();

    // Register the Iceberg table as a DataFusion table provider
    // (constructor signature may vary by iceberg-datafusion version)
    let provider = IcebergTableProvider::try_new_from_table(table).await?;
    ctx.register_table("events", Arc::new(provider))?;

    // Query with SQL
    let df = ctx
        .sql(
            "SELECT event_type, COUNT(*) AS count
             FROM events
             WHERE timestamp >= '2024-01-01'
             GROUP BY event_type",
        )
        .await?;

    df.show().await?;

    Ok(())
}
```

## Common Patterns

**Creating a data pipeline** (the `read_parquet_files` and `transform_batches` helpers are placeholders; a sketch of the reader follows below):

```rust
use std::sync::Arc;

use object_store::ObjectStore;

async fn data_pipeline(
    source_store: Arc<dyn ObjectStore>,
    table: &iceberg::table::Table,
) -> anyhow::Result<()> {
    // 1. Read from source (e.g., Parquet files in object storage)
    let batches = read_parquet_files(source_store).await?;

    // 2. Transform data (placeholder for your business logic)
    let transformed = transform_batches(batches)?;

    // 3. Write to the Iceberg table (see step 5)
    write_data(table, transformed).await?;

    println!("Pipeline complete");
    Ok(())
}
```
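
A minimal sketch of the `read_parquet_files` helper, assuming the source data lives under a `raw/` prefix (hypothetical layout) and using the parquet crate's async object-store reader:

```rust
use std::sync::Arc;

use arrow::record_batch::RecordBatch;
use futures::TryStreamExt;
use object_store::{path::Path, ObjectStore};
use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};

async fn read_parquet_files(store: Arc<dyn ObjectStore>) -> anyhow::Result<Vec<RecordBatch>> {
    let mut batches = Vec::new();

    // List every object under the assumed "raw/" prefix
    let mut listing = store.list(Some(&Path::from("raw")));
    while let Some(meta) = listing.try_next().await? {
        // Stream each Parquet file into Arrow record batches
        let reader = ParquetObjectReader::new(store.clone(), meta);
        let stream = ParquetRecordBatchStreamBuilder::new(reader).await?.build()?;
        batches.extend(stream.try_collect::<Vec<_>>().await?);
    }

    Ok(batches)
}
```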

**Implementing time-based retention**:

```rust
// NOTE: a sketch; snapshot expiration is still landing in iceberg-rust, so
// the exact expire-snapshots API may differ from the method names below.
use iceberg::table::Table;
use iceberg::Result;

async fn expire_old_snapshots(table: &mut Table, days: i64) -> Result<()> {
    // Snapshots older than the cutoff become unreachable for time travel
    let cutoff_ms = chrono::Utc::now().timestamp_millis() - days * 24 * 60 * 60 * 1000;

    let mut expire = table.expire_snapshots();
    expire
        .expire_older_than(cutoff_ms)
        .retain_last(10) // Always keep at least 10 snapshots
        .commit()
        .await?;

    println!("Expired snapshots older than {} days", days);
    Ok(())
}
```

**Atomic updates**:

```rust
// NOTE: a sketch of multi-operation transactions; in iceberg-rust the entry
// point is iceberg::transaction::Transaction, and the commit goes through the
// catalog. Available actions depend on the crate version.
use iceberg::transaction::Transaction;
use iceberg::{Catalog, Result};

async fn atomic_update(table: &iceberg::table::Table, catalog: &impl Catalog) -> Result<()> {
    // All or nothing: either the entire commit succeeds or none of it applies
    let transaction = Transaction::new(table);

    // Queue multiple operations in one transaction, e.g.:
    // transaction.append(/* new data */)
    // transaction.update_schema(/* schema change */)
    // transaction.update_properties(/* property change */)

    // Atomic commit (optimistic concurrency: conflicts are retried or rejected)
    transaction.commit(catalog).await?;

    Ok(())
}
```

## Best Practices

- **Use hidden partitioning** for cleaner queries and easier partition evolution
- **Define sort order** to cluster related data together
- **Expire old snapshots** regularly to avoid metadata bloat (see the maintenance sketch below)
- **Use schema evolution** instead of creating new tables
- **Leverage time travel** for debugging and auditing
- **Compact small files** periodically for better read performance
- **Use partition evolution** to adapt to changing data patterns
- **Enable statistics** for query optimization
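
A periodic maintenance job can combine several of these practices. A minimal sketch that reuses the `compact_table` and `expire_old_snapshots` helpers defined earlier:

```rust
async fn maintain_table(table: &mut iceberg::table::Table) -> iceberg::Result<()> {
    // Rewrite small files into larger ones for faster scans
    compact_table(table).await?;

    // Drop snapshots older than 30 days (the helper keeps at least 10)
    expire_old_snapshots(table, 30).await?;

    Ok(())
}
```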

## Benefits Over Raw Parquet

1. **ACID Transactions**: Atomic commits prevent partial updates
2. **Time Travel**: Query historical table states
3. **Schema Evolution**: Add/rename/reorder columns safely
4. **Partition Evolution**: Change partitioning without rewriting
5. **Hidden Partitioning**: Cleaner queries, automatic partition pruning
6. **Concurrency**: Multiple writers with optimistic concurrency
7. **Metadata Management**: Efficient metadata operations
8. **Data Lineage**: Track changes over time

## Troubleshooting

**Metadata file not found**:
- Verify catalog configuration
- Check object store permissions
- Ensure the table was created successfully

**Schema mismatch on write**:
- Verify the writer schema matches the table schema (see the check below)
- Use schema evolution to add new fields
- Check for required vs. optional fields
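
A quick way to debug a mismatch is to convert the table's Iceberg schema to Arrow and diff it against the writer's schema. A sketch, assuming `iceberg::arrow::schema_to_arrow_schema` (present in recent iceberg-rust versions):

```rust
use arrow::datatypes::Schema as ArrowSchema;
use iceberg::arrow::schema_to_arrow_schema;
use iceberg::table::Table;

fn check_write_schema(table: &Table, writer_schema: &ArrowSchema) -> iceberg::Result<()> {
    // Convert the table's current Iceberg schema to its Arrow equivalent
    let expected = schema_to_arrow_schema(table.metadata().current_schema())?;

    if &expected != writer_schema {
        eprintln!("Schema mismatch!\n  table:  {expected:?}\n  writer: {writer_schema:?}");
    }
    Ok(())
}
```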

**Slow queries**:
- Use predicate pushdown with filters
- Enable column projection
- Compact small files
- Verify partition pruning is working (see the sketch below)
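
To check whether pruning is effective, compare how many files a filtered scan plans against an unfiltered one. A sketch, assuming `TableScan::plan_files` (iceberg-rust's file-planning API):

```rust
use futures::TryStreamExt;
use iceberg::expr::Predicate;
use iceberg::table::Table;

async fn count_planned_files(table: &Table, filter: Predicate) -> iceberg::Result<usize> {
    // plan_files yields the file scan tasks the query would read;
    // a good partition filter should plan far fewer files than a full scan
    let tasks: Vec<_> = table
        .scan()
        .with_filter(filter)
        .build()?
        .plan_files()
        .await?
        .try_collect()
        .await?;

    Ok(tasks.len())
}
```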

**Snapshot expiration issues**:
- Ensure `retain_last` is set appropriately
- Don't expire too aggressively if time travel is needed
- Clean up orphaned files separately

## Resources

- [Apache Iceberg Specification](https://iceberg.apache.org/spec/)
- [iceberg-rust Documentation](https://docs.rs/iceberg/)
- [Iceberg Table Format](https://iceberg.apache.org/docs/latest/)