# Polars Best Practices and Performance Guide

Comprehensive guide to writing efficient Polars code and avoiding common pitfalls.

## Performance Optimization

### 1. Use Lazy Evaluation

**Always prefer lazy mode for large datasets:**

```python
# Bad: Eager mode loads everything immediately
df = pl.read_csv("large_file.csv")
result = df.filter(pl.col("age") > 25).select("name", "age")

# Good: Lazy mode optimizes before execution
lf = pl.scan_csv("large_file.csv")
result = lf.filter(pl.col("age") > 25).select("name", "age").collect()
```

**Benefits of lazy evaluation** (see the sketch below):
- Predicate pushdown (filter at source)
- Projection pushdown (read only needed columns)
- Query optimization
- Parallel execution planning
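
Pushdown is easy to verify from the query plan. A minimal sketch, reusing the `large_file.csv` example above; the exact plan text varies by Polars version:

```python
import polars as pl

# Build a lazy query; nothing is read yet
lf = pl.scan_csv("large_file.csv")
query = lf.filter(pl.col("age") > 25).select("name", "age")

# The optimized plan should show the age filter and the two-column
# projection pushed into the CSV scan rather than applied afterwards
print(query.explain())
```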

### 2. Filter and Select Early

Push filters and column selection as early as possible in the pipeline:

```python
# Bad: Process all data, then filter and select
result = (
    lf.group_by("category")
    .agg(pl.col("value").mean())
    .join(other, on="category")
    .filter(pl.col("value") > 100)
    .select("category", "value")
)

# Good: Filter and select early
result = (
    lf.select("category", "value")  # Only needed columns
    .filter(pl.col("value") > 100)  # Filter early
    .group_by("category")
    .agg(pl.col("value").mean())
    .join(other.select("category", "other_col"), on="category")
)
```

### 3. Avoid Python Functions

Stay within the expression API to maintain parallelization:

```python
# Bad: Python function disables parallelization
df = df.with_columns(
    result=pl.col("value").map_elements(lambda x: x * 2, return_dtype=pl.Float64)
)

# Good: Use native expressions (parallelized)
df = df.with_columns(result=pl.col("value") * 2)
```

**When you must use custom functions:**
```python
# If truly needed, be explicit
df = df.with_columns(
    result=pl.col("value").map_elements(
        custom_function,
        return_dtype=pl.Float64,
        skip_nulls=True  # Keep nulls as-is instead of passing them to the function
    )
)
```

### 4. Use Streaming for Very Large Data

Enable streaming for datasets larger than RAM:

```python
# Streaming mode processes data in chunks
lf = pl.scan_parquet("very_large.parquet")
result = lf.filter(pl.col("value") > 100).collect(streaming=True)

# Or use sink for direct streaming writes
lf.filter(pl.col("value") > 100).sink_parquet("output.parquet")
```

### 5. Optimize Data Types

Choose appropriate data types to reduce memory and improve performance:

```python
# Bad: Default types may be wasteful
df = pl.read_csv("data.csv")

# Good: Specify optimal types
df = pl.read_csv(
    "data.csv",
    dtypes={
        "id": pl.UInt32,  # Instead of Int64 if values fit
        "category": pl.Categorical,  # For low-cardinality strings
        "date": pl.Date,  # Instead of String
        "small_int": pl.Int16,  # Instead of Int64
    }
)
```

**Type optimization guidelines** (see the sketch below):
- Use smallest integer type that fits your data
- Use `Categorical` for strings with low cardinality (<50% unique)
- Use `Date` instead of `Datetime` when time isn't needed
- Use `Boolean` instead of integers for binary flags
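
The savings are easy to measure with `estimated_size`. A minimal sketch with illustrative data; `Expr.shrink_dtype()` can also pick the smallest safe integer type automatically:

```python
import polars as pl

df = pl.DataFrame({
    "id": list(range(1_000_000)),            # inferred as Int64
    "category": ["a", "b", "c", "d"] * 250_000,  # inferred as String
})

compact = df.with_columns(
    pl.col("id").cast(pl.UInt32),             # values fit comfortably in 32 bits
    pl.col("category").cast(pl.Categorical),  # low-cardinality string
)

print(f"before: {df.estimated_size('mb'):.1f} MB")
print(f"after:  {compact.estimated_size('mb'):.1f} MB")
```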

### 6. Parallel Operations

Structure code to maximize parallelization:

```python
# Bad: applying transformations one after another forces sequential execution
df = (
    df.pipe(operation1)
    .pipe(operation2)
    .pipe(operation3)
)

# Good: independent expressions in a single context can run in parallel
# (operation1_expr() etc. are placeholders for expression-returning helpers)
df = df.with_columns(
    result1=operation1_expr(),
    result2=operation2_expr(),
    result3=operation3_expr()
)
```

### 7. Rechunk After Concatenation

```python
# Concatenation can fragment data across many chunks
combined = pl.concat([df1, df2, df3])

# Rechunk for better performance in subsequent operations
combined = pl.concat([df1, df2, df3], rechunk=True)
```
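
A quick way to see the effect is to compare chunk counts. A minimal sketch with tiny illustrative frames; `n_chunks` is the standard DataFrame method:

```python
import polars as pl

df1 = pl.DataFrame({"x": [1, 2]})
df2 = pl.DataFrame({"x": [3, 4]})
df3 = pl.DataFrame({"x": [5, 6]})

fragmented = pl.concat([df1, df2, df3], rechunk=False)
contiguous = pl.concat([df1, df2, df3], rechunk=True)

print(fragmented.n_chunks())  # typically one chunk per input frame
print(contiguous.n_chunks())  # 1: data copied into a single contiguous chunk
```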

## Expression Patterns

### Conditional Logic

**Simple conditions:**
```python
df.with_columns(
    # Plain strings in then/otherwise are parsed as column names,
    # so wrap literal values in pl.lit()
    status=pl.when(pl.col("age") >= 18)
    .then(pl.lit("adult"))
    .otherwise(pl.lit("minor"))
)
```

**Multiple conditions:**
```python
df.with_columns(
    grade=pl.when(pl.col("score") >= 90)
    .then(pl.lit("A"))
    .when(pl.col("score") >= 80)
    .then(pl.lit("B"))
    .when(pl.col("score") >= 70)
    .then(pl.lit("C"))
    .when(pl.col("score") >= 60)
    .then(pl.lit("D"))
    .otherwise(pl.lit("F"))
)
```

**Complex conditions:**
```python
df.with_columns(
    category=pl.when(
        (pl.col("revenue") > 1000000) & (pl.col("customers") > 100)
    )
    .then(pl.lit("enterprise"))
    .when(
        (pl.col("revenue") > 100000) | (pl.col("customers") > 50)
    )
    .then(pl.lit("business"))
    .otherwise(pl.lit("starter"))
)
```

### Null Handling

**Check for nulls:**
```python
df.filter(pl.col("value").is_null())
df.filter(pl.col("value").is_not_null())
```

**Fill nulls:**
```python
# Constant value
df.with_columns(pl.col("value").fill_null(0))

# Forward fill
df.with_columns(pl.col("value").fill_null(strategy="forward"))

# Backward fill
df.with_columns(pl.col("value").fill_null(strategy="backward"))

# Mean
df.with_columns(pl.col("value").fill_null(strategy="mean"))

# Per-group fill
df.with_columns(
    pl.col("value").fill_null(pl.col("value").mean()).over("group")
)
```

**Coalesce (first non-null):**
```python
df.with_columns(
    combined=pl.coalesce(["col1", "col2", "col3"])
)
```

### Column Selection Patterns

**By name:**
```python
df.select("col1", "col2", "col3")
```

**By pattern:**
```python
# Regex patterns must start with ^ and end with $ to be treated as regex
df.select(pl.col("^sales_.*$"))

# Starts with
df.select(pl.col("^sales.*$"))

# Ends with
df.select(pl.col("^.*_total$"))

# Contains
df.select(pl.col("^.*revenue.*$"))
```
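
The `polars.selectors` module offers a more readable alternative to regex patterns. A minimal sketch using the standard `cs.starts_with`, `cs.ends_with`, and `cs.contains` helpers (column names are illustrative):

```python
import polars as pl
import polars.selectors as cs

df = pl.DataFrame({
    "sales_q1": [1.0], "sales_q2": [2.0],
    "cost_total": [3.0], "net_revenue_eur": [4.0],
})

df.select(cs.starts_with("sales"))   # sales_q1, sales_q2
df.select(cs.ends_with("_total"))    # cost_total
df.select(cs.contains("revenue"))    # net_revenue_eur
```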

**By type:**
```python
# All numeric columns
df.select(pl.col(pl.NUMERIC_DTYPES))

# All string columns
df.select(pl.col(pl.Utf8))

# Multiple types (pass the dtypes directly)
df.select(pl.col(pl.Int64, pl.Float64, pl.Boolean))
```

**Exclude columns:**
```python
df.select(pl.all().exclude("id", "timestamp"))
```

**Transform multiple columns:**
```python
# Apply the same operation to multiple columns
df.select(
    pl.col("^sales_.*$") * 1.1  # 10% increase to all sales columns
)
```

### Aggregation Patterns

**Multiple aggregations:**
```python
df.group_by("category").agg(
    pl.col("value").sum().alias("total"),
    pl.col("value").mean().alias("average"),
    pl.col("value").std().alias("std_dev"),
    pl.col("id").count().alias("count"),
    pl.col("id").n_unique().alias("unique_count"),
    pl.col("value").min().alias("minimum"),
    pl.col("value").max().alias("maximum"),
    pl.col("value").quantile(0.5).alias("median"),
    pl.col("value").quantile(0.95).alias("p95")
)
```

**Conditional aggregations:**
```python
df.group_by("category").agg(
    # Count high values
    (pl.col("value") > 100).sum().alias("high_count"),

    # Average of filtered values
    pl.col("value").filter(pl.col("active")).mean().alias("active_avg"),

    # Conditional sum
    pl.when(pl.col("status") == "completed")
    .then(pl.col("amount"))
    .otherwise(0)
    .sum()
    .alias("completed_total")
)
```

**Grouped transformations:**
```python
df.with_columns(
    # Group statistics
    group_mean=pl.col("value").mean().over("category"),
    group_std=pl.col("value").std().over("category"),

    # Rank within groups
    rank=pl.col("value").rank().over("category"),

    # Percentage of group total
    pct_of_group=(pl.col("value") / pl.col("value").sum().over("category")) * 100
)
```

## Common Pitfalls and Anti-Patterns

### Pitfall 1: Row Iteration

```python
# Bad: Never iterate rows
for row in df.iter_rows():
    # Process row by row in Python
    result = row[0] * 2

# Good: Use vectorized operations
df = df.with_columns(result=pl.col("value") * 2)
```

### Pitfall 2: Modifying in Place

```python
# Bad: Polars DataFrames are immutable; column assignment raises a TypeError
df["new_col"] = df["old_col"] * 2

# Good: Functional style
df = df.with_columns(new_col=pl.col("old_col") * 2)
```

### Pitfall 3: Not Using Expressions

```python
# Bad: The string is read as a column named "value * 2",
# so this raises ColumnNotFoundError
df.select("value * 2")

# Good: Expression-based
df.select(pl.col("value") * 2)
```

### Pitfall 4: Inefficient Joins

```python
# Bad: Join large tables without filtering
result = large_df1.join(large_df2, on="id")

# Good: Filter before joining
result = (
    large_df1.filter(pl.col("active"))
    .join(
        large_df2.filter(pl.col("status") == "valid"),
        on="id"
    )
)
```

### Pitfall 5: Not Specifying Types

```python
# Bad: Let Polars infer everything
df = pl.read_csv("data.csv")

# Good: Specify types for correctness and performance
df = pl.read_csv(
    "data.csv",
    dtypes={"id": pl.Int64, "date": pl.Date, "category": pl.Categorical}
)
```

### Pitfall 6: Creating Many Small DataFrames

```python
# Bad: Many operations creating intermediate DataFrames
df1 = df.filter(pl.col("age") > 25)
df2 = df1.select("name", "age")
df3 = df2.sort("age")
result = df3.head(10)

# Good: Chain operations
result = (
    df.filter(pl.col("age") > 25)
    .select("name", "age")
    .sort("age")
    .head(10)
)

# Better: Use lazy mode
result = (
    df.lazy()
    .filter(pl.col("age") > 25)
    .select("name", "age")
    .sort("age")
    .head(10)
    .collect()
)
```

## Memory Management

### Monitor Memory Usage

```python
# Check DataFrame size
print(f"Estimated size: {df.estimated_size('mb'):.2f} MB")

# Profile memory during operations
lf = pl.scan_csv("large.csv")
print(lf.explain())  # See query plan
```

### Reduce Memory Footprint

```python
# 1. Use lazy mode
lf = pl.scan_parquet("data.parquet")

# 2. Stream results
result = lf.collect(streaming=True)

# 3. Select only needed columns
lf = lf.select("col1", "col2")

# 4. Optimize data types
df = df.with_columns(
    pl.col("int_col").cast(pl.Int32),  # Downcast if possible
    pl.col("category").cast(pl.Categorical)  # For low cardinality
)

# 5. Drop columns not needed
df = df.drop("large_text_col", "unused_col")
```

## Testing and Debugging

### Inspect Query Plans

```python
lf = pl.scan_csv("data.csv")
query = lf.filter(pl.col("age") > 25).select("name", "age")

# View the optimized query plan (the default)
print(query.explain())

# View the naive plan, before optimizations are applied
print(query.explain(optimized=False))
```

### Sample Data for Development

```python
# Use n_rows for testing
df = pl.read_csv("large.csv", n_rows=1000)

# Or sample after reading
df_sample = df.sample(n=1000, seed=42)
```

### Validate Schemas

```python
# Check schema
print(df.schema)

# Ensure schema matches expectation
expected_schema = {
    "id": pl.Int64,
    "name": pl.Utf8,
    "date": pl.Date
}

assert df.schema == expected_schema
```

### Profile Performance

```python
import time

# Time operations
start = time.time()
result = lf.collect()
print(f"Execution time: {time.time() - start:.2f}s")

# Compare eager vs lazy
start = time.time()
df_eager = pl.read_csv("data.csv").filter(pl.col("age") > 25)
eager_time = time.time() - start

start = time.time()
df_lazy = pl.scan_csv("data.csv").filter(pl.col("age") > 25).collect()
lazy_time = time.time() - start

print(f"Eager: {eager_time:.2f}s, Lazy: {lazy_time:.2f}s")
```

## File Format Best Practices

### Choose the Right Format

**Parquet:**
- Best for: Large datasets, archival, data lakes
- Pros: Excellent compression, columnar, fast reads
- Cons: Not human-readable

**CSV:**
- Best for: Small datasets, human inspection, legacy systems
- Pros: Universal, human-readable
- Cons: Slow, large file size, no type preservation

**Arrow IPC:**
- Best for: Inter-process communication, temporary storage
- Pros: Fastest, zero-copy, preserves all types
- Cons: Less compression than Parquet
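
All three formats share a one-line write/read API, which makes it cheap to benchmark them on your own data. A minimal sketch (file names are illustrative):

```python
import polars as pl

df = pl.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})

# Columnar and compressed: good default for large or archival data
df.write_parquet("data.parquet", compression="zstd")

# Human-readable interchange, but loses dtype information
df.write_csv("data.csv")

# Arrow IPC (Feather v2): fast round-trip, preserves all types
df.write_ipc("data.arrow")

# Each format has a matching reader or lazy scanner
lf = pl.scan_parquet("data.parquet")
df_csv = pl.read_csv("data.csv")
df_ipc = pl.read_ipc("data.arrow")
```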

### File Reading Best Practices

```python
# 1. Use lazy reading
lf = pl.scan_parquet("data.parquet")  # Not read_parquet

# 2. Read multiple files efficiently
lf = pl.scan_parquet("data/*.parquet")  # Parallel reading

# 3. Specify schema when known
lf = pl.scan_csv(
    "data.csv",
    dtypes={"id": pl.Int64, "date": pl.Date}
)

# 4. Use predicate pushdown (compare against a date literal, not a string)
result = lf.filter(pl.col("date") >= pl.date(2023, 1, 1)).collect()
```

### File Writing Best Practices

```python
# 1. Use Parquet for large data
df.write_parquet("output.parquet", compression="zstd")

# 2. Partition large datasets
df.write_parquet("output", partition_by=["year", "month"])

# 3. Use streaming for very large writes
lf.sink_parquet("output.parquet")  # Streaming write

# 4. Optimize compression
df.write_parquet(
    "output.parquet",
    compression="snappy",  # Fast compression
    statistics=True  # Enable predicate pushdown on read
)
```

## Code Organization

### Reusable Expressions

```python
# Define reusable expressions (literal labels wrapped in pl.lit)
age_group = (
    pl.when(pl.col("age") < 18)
    .then(pl.lit("minor"))
    .when(pl.col("age") < 65)
    .then(pl.lit("adult"))
    .otherwise(pl.lit("senior"))
)

revenue_per_customer = pl.col("revenue") / pl.col("customer_count")

# Use in multiple contexts
df = df.with_columns(
    age_group=age_group,
    rpc=revenue_per_customer
)

# Reuse in filtering
df = df.filter(revenue_per_customer > 100)
```

### Pipeline Functions

```python
def clean_data(lf: pl.LazyFrame) -> pl.LazyFrame:
    """Clean and standardize data."""
    return lf.with_columns(
        pl.col("name").str.to_uppercase(),
        pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"),
        pl.col("amount").fill_null(0)
    )

def add_features(lf: pl.LazyFrame) -> pl.LazyFrame:
    """Add computed features."""
    return lf.with_columns(
        month=pl.col("date").dt.month(),
        year=pl.col("date").dt.year(),
        amount_log=pl.col("amount").log()
    )

# Compose pipeline
result = (
    pl.scan_csv("data.csv")
    .pipe(clean_data)
    .pipe(add_features)
    .filter(pl.col("year") == 2023)
    .collect()
)
```

## Documentation

Always document complex expressions and transformations:

```python
# Good: Document intent
df = df.with_columns(
    # Calculate customer lifetime value as sum of purchases
    # divided by months since first purchase
    clv=(
        pl.col("total_purchases") /
        ((pl.col("last_purchase_date") - pl.col("first_purchase_date"))
        .dt.total_days() / 30)
    )
)
```

## Version Compatibility

```python
# Check Polars version
import polars as pl
print(pl.__version__)

# Feature availability varies by version
# Document version requirements for production code
```
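
One lightweight way to enforce a documented requirement is an import-time check. A minimal sketch with an illustrative minimum version, assuming a plain MAJOR.MINOR.PATCH version string:

```python
import polars as pl

# Illustrative minimum; pin whatever version your code was tested against
MIN_POLARS = (0, 20, 0)

installed = tuple(int(part) for part in pl.__version__.split(".")[:3])
if installed < MIN_POLARS:
    raise RuntimeError(
        f"polars>={'.'.join(map(str, MIN_POLARS))} required, found {pl.__version__}"
    )
```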