Polars Best Practices and Performance Guide
Comprehensive guide to writing efficient Polars code and avoiding common pitfalls.
Performance Optimization
1. Use Lazy Evaluation
Always prefer lazy mode for large datasets:
# Bad: Eager mode loads everything immediately
df = pl.read_csv("large_file.csv")
result = df.filter(pl.col("age") > 25).select("name", "age")
# Good: Lazy mode optimizes before execution
lf = pl.scan_csv("large_file.csv")
result = lf.filter(pl.col("age") > 25).select("name", "age").collect()
Benefits of lazy evaluation:
- Predicate pushdown (filter at source)
- Projection pushdown (read only needed columns)
- Query optimization
- Parallel execution planning
2. Filter and Select Early
Push filters and column selection as early as possible in the pipeline:
# Bad: Process all data, then filter and select
result = (
lf.group_by("category")
.agg(pl.col("value").mean())
.join(other, on="category")
.filter(pl.col("value") > 100)
.select("category", "value")
)
# Good: Filter and select early
result = (
lf.select("category", "value") # Only needed columns
.filter(pl.col("value") > 100) # Filter early
.group_by("category")
.agg(pl.col("value").mean())
.join(other.select("category", "other_col"), on="category")
)
3. Avoid Python Functions
Stay within the expression API to maintain parallelization:
# Bad: Python function disables parallelization
df = df.with_columns(
result=pl.col("value").map_elements(lambda x: x * 2, return_dtype=pl.Float64)
)
# Good: Use native expressions (parallelized)
df = df.with_columns(result=pl.col("value") * 2)
When you must use custom functions:
# If truly needed, be explicit
df = df.with_columns(
result=pl.col("value").map_elements(
custom_function,
return_dtype=pl.Float64,
skip_nulls=True # Optimize null handling
)
)
4. Use Streaming for Very Large Data
Enable streaming for datasets larger than RAM:
# Streaming mode processes data in chunks
lf = pl.scan_parquet("very_large.parquet")
result = lf.filter(pl.col("value") > 100).collect(streaming=True)
# Or use sink for direct streaming writes
lf.filter(pl.col("value") > 100).sink_parquet("output.parquet")
5. Optimize Data Types
Choose appropriate data types to reduce memory and improve performance:
# Bad: Default types may be wasteful
df = pl.read_csv("data.csv")
# Good: Specify optimal types
df = pl.read_csv(
"data.csv",
dtypes={
"id": pl.UInt32, # Instead of Int64 if values fit
"category": pl.Categorical, # For low-cardinality strings
"date": pl.Date, # Instead of String
"small_int": pl.Int16, # Instead of Int64
}
)
Type optimization guidelines:
- Use smallest integer type that fits your data
- Use Categorical for strings with low cardinality (<50% unique)
- Use Date instead of Datetime when the time component isn't needed
- Use Boolean instead of integers for binary flags
6. Parallel Operations
Structure code to maximize parallelization:
# Bad: Sequential pipe operations disable parallelization
df = (
df.pipe(operation1)
.pipe(operation2)
.pipe(operation3)
)
# Good: Combined operations enable parallelization
df = df.with_columns(
result1=operation1_expr(),
result2=operation2_expr(),
result3=operation3_expr()
)
7. Rechunk After Concatenation
# Concatenation can fragment data
combined = pl.concat([df1, df2, df3])
# Rechunk for better performance in subsequent operations
combined = pl.concat([df1, df2, df3], rechunk=True)
Expression Patterns
Conditional Logic
Simple conditions:
df.with_columns(
status=pl.when(pl.col("age") >= 18)
.then(pl.lit("adult"))  # use pl.lit() — a bare string in then/otherwise is parsed as a column name
.otherwise(pl.lit("minor"))
)
Multiple conditions:
df.with_columns(
grade=pl.when(pl.col("score") >= 90)
.then(pl.lit("A"))
.when(pl.col("score") >= 80)
.then(pl.lit("B"))
.when(pl.col("score") >= 70)
.then(pl.lit("C"))
.when(pl.col("score") >= 60)
.then(pl.lit("D"))
.otherwise(pl.lit("F"))
)
Complex conditions:
df.with_columns(
category=pl.when(
(pl.col("revenue") > 1000000) & (pl.col("customers") > 100)
)
.then(pl.lit("enterprise"))
.when(
(pl.col("revenue") > 100000) | (pl.col("customers") > 50)
)
.then(pl.lit("business"))
.otherwise(pl.lit("starter"))
)
Null Handling
Check for nulls:
df.filter(pl.col("value").is_null())
df.filter(pl.col("value").is_not_null())
Fill nulls:
# Constant value
df.with_columns(pl.col("value").fill_null(0))
# Forward fill
df.with_columns(pl.col("value").fill_null(strategy="forward"))
# Backward fill
df.with_columns(pl.col("value").fill_null(strategy="backward"))
# Mean
df.with_columns(pl.col("value").fill_null(strategy="mean"))
# Per-group fill
df.with_columns(
pl.col("value").fill_null(pl.col("value").mean()).over("group")
)
Coalesce (first non-null):
df.with_columns(
combined=pl.coalesce(["col1", "col2", "col3"])
)
Column Selection Patterns
By name:
df.select("col1", "col2", "col3")
By pattern:
# Regex
df.select(pl.col("^sales_.*$"))
# Starts with (the pattern must begin with ^ AND end with $ to be treated as a regex)
df.select(pl.col("^sales.*$"))
# Ends with
df.select(pl.col("^.*_total$"))
# Contains
df.select(pl.col("^.*revenue.*$"))
By type:
# All numeric columns
df.select(pl.col(pl.NUMERIC_DTYPES))
# All string columns
df.select(pl.col(pl.Utf8))
# Multiple types
df.select(pl.col(pl.NUMERIC_DTYPES, pl.Boolean))
Exclude columns:
df.select(pl.all().exclude("id", "timestamp"))
Transform multiple columns:
# Apply same operation to multiple columns
df.select(
pl.col("^sales_.*$") * 1.1 # 10% increase to all sales columns
)
Aggregation Patterns
Multiple aggregations:
df.group_by("category").agg(
pl.col("value").sum().alias("total"),
pl.col("value").mean().alias("average"),
pl.col("value").std().alias("std_dev"),
pl.col("id").count().alias("count"),
pl.col("id").n_unique().alias("unique_count"),
pl.col("value").min().alias("minimum"),
pl.col("value").max().alias("maximum"),
pl.col("value").quantile(0.5).alias("median"),
pl.col("value").quantile(0.95).alias("p95")
)
Conditional aggregations:
df.group_by("category").agg(
# Count high values
(pl.col("value") > 100).sum().alias("high_count"),
# Average of filtered values
pl.col("value").filter(pl.col("active")).mean().alias("active_avg"),
# Conditional sum
pl.when(pl.col("status") == "completed")
.then(pl.col("amount"))
.otherwise(0)
.sum()
.alias("completed_total")
)
Grouped transformations:
df.with_columns(
# Group statistics
group_mean=pl.col("value").mean().over("category"),
group_std=pl.col("value").std().over("category"),
# Rank within groups
rank=pl.col("value").rank().over("category"),
# Percentage of group total
pct_of_group=(pl.col("value") / pl.col("value").sum().over("category")) * 100
)
Common Pitfalls and Anti-Patterns
Pitfall 1: Row Iteration
# Bad: Never iterate rows
for row in df.iter_rows():
# Process row
result = row[0] * 2
# Good: Use vectorized operations
df = df.with_columns(result=pl.col("value") * 2)
Pitfall 2: Modifying in Place
# Bad: Polars DataFrames are immutable — item assignment is not supported
df["new_col"] = df["old_col"] * 2  # Raises a TypeError
# Good: Functional style
df = df.with_columns(new_col=pl.col("old_col") * 2)
Pitfall 3: Not Using Expressions
# Bad: String-based operations
df.select("value * 2") # Won't work
# Good: Expression-based
df.select(pl.col("value") * 2)
Pitfall 4: Inefficient Joins
# Bad: Join large tables without filtering
result = large_df1.join(large_df2, on="id")
# Good: Filter before joining
result = (
large_df1.filter(pl.col("active"))
.join(
large_df2.filter(pl.col("status") == "valid"),
on="id"
)
)
Pitfall 5: Not Specifying Types
# Bad: Let Polars infer everything
df = pl.read_csv("data.csv")
# Good: Specify types for correctness and performance
df = pl.read_csv(
"data.csv",
dtypes={"id": pl.Int64, "date": pl.Date, "category": pl.Categorical}
)
Pitfall 6: Creating Many Small DataFrames
# Bad: Many operations creating intermediate DataFrames
df1 = df.filter(pl.col("age") > 25)
df2 = df1.select("name", "age")
df3 = df2.sort("age")
result = df3.head(10)
# Good: Chain operations
result = (
df.filter(pl.col("age") > 25)
.select("name", "age")
.sort("age")
.head(10)
)
# Better: Use lazy mode
result = (
df.lazy()
.filter(pl.col("age") > 25)
.select("name", "age")
.sort("age")
.head(10)
.collect()
)
Memory Management
Monitor Memory Usage
# Check DataFrame size
print(f"Estimated size: {df.estimated_size('mb'):.2f} MB")
# Profile memory during operations
lf = pl.scan_csv("large.csv")
print(lf.explain()) # See query plan
Reduce Memory Footprint
# 1. Use lazy mode
lf = pl.scan_parquet("data.parquet")
# 2. Stream results
result = lf.collect(streaming=True)
# 3. Select only needed columns
lf = lf.select("col1", "col2")
# 4. Optimize data types
df = df.with_columns(
pl.col("int_col").cast(pl.Int32), # Downcast if possible
pl.col("category").cast(pl.Categorical) # For low cardinality
)
# 5. Drop columns not needed
df = df.drop("large_text_col", "unused_col")
Testing and Debugging
Inspect Query Plans
lf = pl.scan_csv("data.csv")
query = lf.filter(pl.col("age") > 25).select("name", "age")
# View the optimized query plan
print(query.explain())
# View the unoptimized (naive) query plan for comparison
print(query.explain(optimized=False))
Sample Data for Development
# Use n_rows for testing
df = pl.read_csv("large.csv", n_rows=1000)
# Or sample after reading
df_sample = df.sample(n=1000, seed=42)
Validate Schemas
# Check schema
print(df.schema)
# Ensure schema matches expectation
expected_schema = {
"id": pl.Int64,
"name": pl.Utf8,
"date": pl.Date
}
assert df.schema == expected_schema
Profile Performance
import time
# Time operations
start = time.time()
result = lf.collect()
print(f"Execution time: {time.time() - start:.2f}s")
# Compare eager vs lazy
start = time.time()
df_eager = pl.read_csv("data.csv").filter(pl.col("age") > 25)
eager_time = time.time() - start
start = time.time()
df_lazy = pl.scan_csv("data.csv").filter(pl.col("age") > 25).collect()
lazy_time = time.time() - start
print(f"Eager: {eager_time:.2f}s, Lazy: {lazy_time:.2f}s")
File Format Best Practices
Choose the Right Format
Parquet:
- Best for: Large datasets, archival, data lakes
- Pros: Excellent compression, columnar, fast reads
- Cons: Not human-readable
CSV:
- Best for: Small datasets, human inspection, legacy systems
- Pros: Universal, human-readable
- Cons: Slow, large file size, no type preservation
Arrow IPC:
- Best for: Inter-process communication, temporary storage
- Pros: Fastest, zero-copy, preserves all types
- Cons: Less compression than Parquet
File Reading Best Practices
# 1. Use lazy reading
lf = pl.scan_parquet("data.parquet") # Not read_parquet
# 2. Read multiple files efficiently
lf = pl.scan_parquet("data/*.parquet") # Parallel reading
# 3. Specify schema when known
lf = pl.scan_csv(
"data.csv",
dtypes={"id": pl.Int64, "date": pl.Date}
)
# 4. Use predicate pushdown
result = lf.filter(pl.col("date") >= "2023-01-01").collect()
File Writing Best Practices
# 1. Use Parquet for large data
df.write_parquet("output.parquet", compression="zstd")
# 2. Partition large datasets
df.write_parquet("output", partition_by=["year", "month"])
# 3. Use streaming for very large writes
lf.sink_parquet("output.parquet") # Streaming write
# 4. Optimize compression
df.write_parquet(
"output.parquet",
compression="snappy", # Fast compression
statistics=True # Enable predicate pushdown on read
)
Code Organization
Reusable Expressions
# Define reusable expressions
age_group = (
pl.when(pl.col("age") < 18)
.then(pl.lit("minor"))  # pl.lit() required — a bare string would be read as a column name
.when(pl.col("age") < 65)
.then(pl.lit("adult"))
.otherwise(pl.lit("senior"))
)
revenue_per_customer = pl.col("revenue") / pl.col("customer_count")
# Use in multiple contexts
df = df.with_columns(
age_group=age_group,
rpc=revenue_per_customer
)
# Reuse in filtering
df = df.filter(revenue_per_customer > 100)
Pipeline Functions
def clean_data(lf: pl.LazyFrame) -> pl.LazyFrame:
"""Clean and standardize data."""
return lf.with_columns(
pl.col("name").str.to_uppercase(),
pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"),
pl.col("amount").fill_null(0)
)
def add_features(lf: pl.LazyFrame) -> pl.LazyFrame:
"""Add computed features."""
return lf.with_columns(
month=pl.col("date").dt.month(),
year=pl.col("date").dt.year(),
amount_log=pl.col("amount").log()
)
# Compose pipeline
result = (
pl.scan_csv("data.csv")
.pipe(clean_data)
.pipe(add_features)
.filter(pl.col("year") == 2023)
.collect()
)
Documentation
Always document complex expressions and transformations:
# Good: Document intent
df = df.with_columns(
# Calculate customer lifetime value as sum of purchases
# divided by months since first purchase
clv=(
pl.col("total_purchases") /
((pl.col("last_purchase_date") - pl.col("first_purchase_date"))
.dt.total_days() / 30)
)
)
Version Compatibility
# Check Polars version
import polars as pl
print(pl.__version__)
# Feature availability varies by version
# Document version requirements for production code