Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:25:45 +08:00
commit 919e6673e7
13 changed files with 4381 additions and 0 deletions

---
description: Read Parquet files efficiently with predicate pushdown and column projection
---
# Read Parquet Files
Help the user read Parquet files from object storage with optimal performance using predicate pushdown, column projection, and row group filtering.
## Steps
1. **Add required dependencies**:
```toml
[dependencies]
parquet = "52"
arrow = "52"
# object_store must match the version the parquet crate's object_store
# integration is built against, or the Arc<dyn ObjectStore> types will not line up
object_store = "0.9"
tokio = { version = "1", features = ["full"] }
futures = "0.3"
anyhow = "1"     # convenient Result alias used by the example functions
thiserror = "1"  # custom error enum used in the error-handling step
```
2. **Create a basic Parquet reader** from object_store:
```rust
use std::sync::Arc;
use anyhow::Result; // or substitute your own error type
use arrow::record_batch::RecordBatch;
use futures::stream::StreamExt;
use object_store::{path::Path, ObjectStore};
use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};

async fn read_parquet(
    store: Arc<dyn ObjectStore>,
    path: &str,
) -> Result<Vec<RecordBatch>> {
    let path = Path::from(path);
    // Get file metadata (the object size is needed to plan range reads)
    let meta = store.head(&path).await?;
    // Create an async reader backed by the object store
    let reader = ParquetObjectReader::new(store, meta);
    // Build the record batch stream
    let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
    let mut stream = builder.build()?;
    // Collect batches
    let mut batches = Vec::new();
    while let Some(batch) = stream.next().await {
        batches.push(batch?);
    }
    Ok(batches)
}
```
3. **Add column projection** to read only needed columns:
```rust
use parquet::arrow::ProjectionMask;
let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
// Inspect the Arrow schema to see which columns are available
println!("Available columns: {:?}", builder.schema().fields());
// Project specific root columns by index; ProjectionMask is built from the
// Parquet schema, so pass parquet_schema() rather than the Arrow schema
let projection = ProjectionMask::roots(builder.parquet_schema(), vec![0, 2, 5]);
let builder = builder.with_projection(projection);
// Or project by column name (helper function)
fn project_columns(
    builder: ParquetRecordBatchStreamBuilder<ParquetObjectReader>,
    column_names: &[&str],
) -> ParquetRecordBatchStreamBuilder<ParquetObjectReader> {
    let indices: Vec<usize> = column_names
        .iter()
        .filter_map(|name| builder.schema().column_with_name(name).map(|(idx, _)| idx))
        .collect();
    let projection = ProjectionMask::roots(builder.parquet_schema(), indices);
    builder.with_projection(projection)
}
let builder = project_columns(builder, &["user_id", "timestamp", "event_type"]);
```
4. **Add row group filtering** using statistics:
```rust
let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
let metadata = builder.metadata();
// Decide which row groups to read based on their column statistics
let row_groups_to_read: Vec<usize> = metadata
    .row_groups()
    .iter()
    .enumerate()
    .filter_map(|(idx, rg)| {
        // Example: prune on the min/max statistics of the first column
        let col_metadata = rg.column(0);
        if let Some(stats) = col_metadata.statistics() {
            // Keep the row group only if its statistics might satisfy the
            // predicate; one possible helper is sketched after this block
            if stats_match_predicate(stats) {
                return Some(idx);
            }
        }
        // Note: skipping row groups that lack statistics drops their data;
        // return Some(idx) here instead if that is not acceptable
        None
    })
    .collect();
let builder = builder.with_row_groups(row_groups_to_read);
```
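To make the pseudo-code concrete, here is one possible shape for the `stats_match_predicate` helper. This is a sketch that assumes the filtered column is `Int64` and that the query keeps row groups whose value range overlaps an illustrative `[LOW, HIGH]` window; adjust the `Statistics` variant and the bounds to your data:
```rust
use parquet::file::statistics::Statistics;

// Illustrative query bounds; replace with your predicate's actual range
const LOW: i64 = 0;
const HIGH: i64 = 1_000;

fn stats_match_predicate(stats: &Statistics) -> bool {
    match stats {
        // Keep the row group if its [min, max] range overlaps [LOW, HIGH]
        Statistics::Int64(s) if s.has_min_max_set() => *s.max() >= LOW && *s.min() <= HIGH,
        // Any other type, or no usable statistics: read the row group to stay correct
        _ => true,
    }
}
```
Pruning like this only skips I/O at row group granularity; it never changes results as long as ambiguous row groups are kept.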
5. **Implement streaming processing** for large files:
```rust
async fn process_large_parquet(
store: Arc<dyn ObjectStore>,
path: &str,
) -> Result<()> {
let path = Path::from(path);
let meta = store.head(&path).await?;
let reader = ParquetObjectReader::new(store, meta);
let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
// Limit batch size to control memory usage
let builder = builder.with_batch_size(8192);
let mut stream = builder.build()?;
// Process batches incrementally
while let Some(batch) = stream.next().await {
let batch = batch?;
// Process this batch
println!("Processing batch with {} rows", batch.num_rows());
process_batch(&batch)?;
// Batch is dropped here, freeing memory
}
Ok(())
}
fn process_batch(batch: &RecordBatch) -> Result<()> {
// Your processing logic
Ok(())
}
```
6. **Add comprehensive error handling**:
```rust
use thiserror::Error;
#[derive(Error, Debug)]
enum ParquetReadError {
#[error("Object store error: {0}")]
ObjectStore(#[from] object_store::Error),
#[error("Parquet error: {0}")]
Parquet(#[from] parquet::errors::ParquetError),
#[error("Arrow error: {0}")]
Arrow(#[from] arrow::error::ArrowError),
#[error("File not found: {0}")]
FileNotFound(String),
}
async fn read_with_error_handling(
store: Arc<dyn ObjectStore>,
path: &str,
) -> Result<Vec<RecordBatch>, ParquetReadError> {
    let path = Path::from(path);
    // HEAD the object once: map "not found" to a dedicated error variant
    // and propagate every other object store failure
    let meta = match store.head(&path).await {
        Ok(meta) => meta,
        Err(object_store::Error::NotFound { .. }) => {
            return Err(ParquetReadError::FileNotFound(path.to_string()))
        }
        Err(e) => return Err(e.into()),
    };
let reader = ParquetObjectReader::new(store, meta);
let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
let mut stream = builder.build()?;
let mut batches = Vec::new();
while let Some(batch) = stream.next().await {
batches.push(batch?);
}
Ok(batches)
}
```
## Performance Optimization
**Reading with all optimizations**:
```rust
async fn optimized_read(
store: Arc<dyn ObjectStore>,
path: &str,
columns: &[&str],
) -> Result<Vec<RecordBatch>> {
let path = Path::from(path);
let meta = store.head(&path).await?;
let reader = ParquetObjectReader::new(store, meta);
let mut builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
// 1. Column projection
    let schema = builder.schema();
    let indices: Vec<usize> = columns
        .iter()
        .filter_map(|name| schema.column_with_name(name).map(|(idx, _)| idx))
        .collect();
    // ProjectionMask is defined against the Parquet schema, not the Arrow one
    let projection = ProjectionMask::roots(builder.parquet_schema(), indices);
    builder = builder.with_projection(projection);
// 2. Batch size tuning
builder = builder.with_batch_size(8192);
// 3. Row group filtering (if applicable)
// builder = builder.with_row_groups(filtered_row_groups);
let mut stream = builder.build()?;
let mut batches = Vec::new();
while let Some(batch) = stream.next().await {
batches.push(batch?);
}
Ok(batches)
}
```
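The description at the top also promises predicate pushdown. Beyond pruning whole row groups, the parquet crate can evaluate a row-level predicate while decoding via `RowFilter`. A minimal sketch that would slot into `optimized_read` above in place of the commented-out step 3, assuming an `Int64` column (here called `event_count`) at Parquet leaf index 2 (the column, its index, and the threshold are all illustrative):
```rust
use arrow::array::Int64Array;
use arrow::compute::kernels::cmp::gt;
use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter};
use parquet::arrow::ProjectionMask;

// The predicate only sees the columns selected by its own mask
let predicate_mask = ProjectionMask::leaves(builder.parquet_schema(), vec![2]);
let predicate = ArrowPredicateFn::new(predicate_mask, |batch| {
    // Keep rows where event_count > 100; column 0 of this batch is the
    // single column selected by predicate_mask
    gt(batch.column(0), &Int64Array::new_scalar(100))
});
builder = builder.with_row_filter(RowFilter::new(vec![Box::new(predicate)]));
```
Rows rejected by the predicate are skipped when the remaining projected columns are decoded, which is what makes the pushdown pay off on selective filters.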
## Reading Metadata Only
```rust
async fn read_metadata(
store: Arc<dyn ObjectStore>,
path: &str,
) -> Result<()> {
let path = Path::from(path);
let meta = store.head(&path).await?;
let reader = ParquetObjectReader::new(store, meta);
let builder = ParquetRecordBatchStreamBuilder::new(reader).await?;
let metadata = builder.metadata();
println!("Schema: {:?}", builder.schema());
println!("Number of row groups: {}", metadata.num_row_groups());
println!("Total rows: {}", metadata.file_metadata().num_rows());
for (idx, rg) in metadata.row_groups().iter().enumerate() {
println!("Row Group {}: {} rows", idx, rg.num_rows());
for (col_idx, col) in rg.columns().iter().enumerate() {
            if let Some(stats) = col.statistics() {
                // min_bytes()/max_bytes() panic when no min/max was written,
                // so guard with has_min_max_set() first
                if stats.has_min_max_set() {
                    println!(
                        "  Column {}: min={:?}, max={:?}, null_count={}",
                        col_idx,
                        stats.min_bytes(),
                        stats.max_bytes(),
                        stats.null_count()
                    );
                }
            }
}
}
Ok(())
}
```
## Common Patterns
**Reading multiple files in parallel**:
```rust
use futures::stream::{self, StreamExt};
async fn read_multiple_files(
store: Arc<dyn ObjectStore>,
paths: Vec<String>,
) -> Result<Vec<RecordBatch>> {
let results = stream::iter(paths)
.map(|path| {
let store = store.clone();
async move {
read_parquet(store, &path).await
}
})
.buffer_unordered(10) // Process 10 files concurrently
.collect::<Vec<_>>()
.await;
// Flatten results
let mut all_batches = Vec::new();
for result in results {
all_batches.extend(result?);
}
Ok(all_batches)
}
```
**Reading partitioned data**:
```rust
async fn read_partition(
store: Arc<dyn ObjectStore>,
base_path: &str,
year: i32,
month: u32,
) -> Result<Vec<RecordBatch>> {
let partition_path = format!("{}/year={}/month={:02}/", base_path, year, month);
// List all files in partition
    let prefix = Path::from(partition_path);
    let files: Vec<_> = store.list(Some(&prefix))
.filter_map(|meta| async move {
meta.ok().and_then(|m| {
if m.location.as_ref().ends_with(".parquet") {
Some(m.location.to_string())
} else {
None
}
})
})
.collect()
.await;
// Read all files
read_multiple_files(store, files).await
}
```
## Best Practices
- **Use column projection** to read only the columns you need (often a 10x+ speedup on wide tables)
- **Stream large files** instead of collecting all batches into memory
- **Check metadata first** to understand file structure before reading
- **Use batch_size** to control memory usage (8192-65536 rows per batch)
- **Filter row groups** using statistics when possible
- **Read multiple files in parallel** for partitioned datasets
- **Handle schema evolution** by checking the file schema before processing (see the sketch below)
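For the schema-evolution point above, a small pre-flight check can fail fast before any batches are decoded. This is a sketch; `check_expected_columns` is a hypothetical helper, not part of any crate:
```rust
use arrow::datatypes::Schema;

// Return an error listing any expected column the file does not have;
// call this with builder.schema() before building the stream
fn check_expected_columns(schema: &Schema, expected: &[&str]) -> Result<(), String> {
    let missing: Vec<&str> = expected
        .iter()
        .copied()
        .filter(|name| schema.column_with_name(name).is_none())
        .collect();
    if missing.is_empty() {
        Ok(())
    } else {
        Err(format!("file is missing expected columns: {missing:?}"))
    }
}
```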
## Troubleshooting
**Out of memory errors**:
- Reduce batch size: `.with_batch_size(4096)`
- Stream instead of collecting: process batches one at a time
- Use column projection to read fewer columns
**Slow reads**:
- Enable column projection if reading wide tables
- Check if row group filtering is possible
- Increase parallelism when reading multiple files
- Verify network connectivity to object store
**Schema mismatch**:
- Read metadata first to inspect actual schema
- Handle optional columns that may not exist in older files (one pattern is sketched below)
- Use schema evolution strategies from DataFusion
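For optional columns, one pattern is to look the column up by name per batch and substitute a default when it is absent. A sketch, where the `country` column and the "unknown" fallback are illustrative:
```rust
use arrow::array::{Array, StringArray};
use arrow::record_batch::RecordBatch;

// Read an optional UTF-8 column, falling back to a default for files
// written before the column existed
fn country_of(batch: &RecordBatch, row: usize) -> String {
    match batch.schema().index_of("country") {
        Ok(idx) => {
            let col = batch
                .column(idx)
                .as_any()
                .downcast_ref::<StringArray>()
                .expect("country should be a Utf8 column");
            if col.is_null(row) {
                "unknown".to_string()
            } else {
                col.value(row).to_string()
            }
        }
        // Older files without the column
        Err(_) => "unknown".to_string(),
    }
}
```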