# Polars Data I/O Guide

Comprehensive guide to reading and writing data in various formats with Polars.

## CSV Files

### Reading CSV

**Eager mode (loads into memory):**
```python
import polars as pl

# Basic read
df = pl.read_csv("data.csv")

# With options
df = pl.read_csv(
    "data.csv",
    separator=",",
    has_header=True,
    columns=["col1", "col2"],        # Select specific columns
    n_rows=1000,                     # Read only first 1000 rows
    skip_rows=10,                    # Skip first 10 rows
    dtypes={"col1": pl.Int64, "col2": pl.Utf8},  # Specify types
    null_values=["NA", "null", ""],  # Define null values
    encoding="utf-8",
    ignore_errors=False
)
```
**Lazy mode (scans without loading - recommended for large files):**
```python
# Scan CSV (builds query plan)
lf = pl.scan_csv("data.csv")

# Apply operations
result = lf.filter(pl.col("age") > 25).select("name", "age")

# Execute and load
df = result.collect()
```
### Writing CSV

```python
# Basic write
df.write_csv("output.csv")

# With options
df.write_csv(
    "output.csv",
    separator=",",
    include_header=True,
    null_value="",  # How to represent nulls
    quote_char='"',
    line_terminator="\n"
)
```
### Multiple CSV Files

**Read multiple files:**
```python
# Read all CSVs in directory
lf = pl.scan_csv("data/*.csv")

# Read specific files
lf = pl.scan_csv(["file1.csv", "file2.csv", "file3.csv"])
```
## Parquet Files

Parquet is the recommended format for performance and compression.

### Reading Parquet

**Eager:**
```python
df = pl.read_parquet("data.parquet")

# With options
df = pl.read_parquet(
    "data.parquet",
    columns=["col1", "col2"],  # Select specific columns
    n_rows=1000,               # Read first N rows
    parallel="auto"            # Control parallelization
)
```

**Lazy (recommended):**
```python
lf = pl.scan_parquet("data.parquet")

# Automatic predicate and projection pushdown
result = lf.filter(pl.col("age") > 25).select("name", "age").collect()
```
### Writing Parquet

```python
# Basic write
df.write_parquet("output.parquet")

# With compression
df.write_parquet(
    "output.parquet",
    compression="snappy",  # Options: "snappy", "gzip", "brotli", "lz4", "zstd"
    statistics=True,       # Write statistics (enables predicate pushdown)
    use_pyarrow=False      # Use Rust writer (faster)
)
```
### Partitioned Parquet (Hive-style)

**Write partitioned:**
```python
# Write with partitioning
df.write_parquet(
    "output_dir",
    partition_by=["year", "month"]  # Creates directory structure
)
# Creates e.g. output_dir/year=2023/month=01/<part>.parquet
```

**Read partitioned:**
```python
lf = pl.scan_parquet("output_dir/**/*.parquet", hive_partitioning=True)

# Hive partitioning columns are automatically added
result = lf.filter(pl.col("year") == 2023).collect()
```
## JSON Files

### Reading JSON

**NDJSON (newline-delimited JSON) - recommended:**
```python
df = pl.read_ndjson("data.ndjson")

# Lazy
lf = pl.scan_ndjson("data.ndjson")
```

**Standard JSON:**
```python
df = pl.read_json("data.json")

# From a JSON string (wrap it in a file-like object or pass bytes)
import io
df = pl.read_json(io.StringIO('{"col1": [1, 2], "col2": ["a", "b"]}'))
```
### Writing JSON

```python
# Write NDJSON
df.write_ndjson("output.ndjson")

# Write standard JSON
df.write_json("output.json")

# Pretty-printed, column-oriented output (Polars < 1.0 only;
# these parameters were removed in Polars 1.0)
df.write_json("output.json", pretty=True, row_oriented=False)
```
## Excel Files

### Reading Excel

```python
# Read first sheet
df = pl.read_excel("data.xlsx")

# Specific sheet by name
df = pl.read_excel("data.xlsx", sheet_name="Sheet1")
# Or by index (1-based; sheet_id=0 returns all sheets as a dict)
df = pl.read_excel("data.xlsx", sheet_id=1)

# With options
df = pl.read_excel(
    "data.xlsx",
    sheet_name="Sheet1",
    columns=["A", "B", "C"],  # Select specific columns by name or index
    has_header=True,
    read_options={"n_rows": 100, "skip_rows": 5}  # Passed to the engine (calamine/fastexcel)
)
```
### Writing Excel

```python
# Write to Excel
df.write_excel("output.xlsx")

# Multiple sheets (write_excel accepts an xlsxwriter Workbook)
import xlsxwriter

with xlsxwriter.Workbook("output.xlsx") as workbook:
    df1.write_excel(workbook, worksheet="Sheet1")
    df2.write_excel(workbook, worksheet="Sheet2")
```
## Database Connectivity

### Read from Database

```python
import polars as pl

# Read using an existing connection object (e.g. a SQLAlchemy engine)
from sqlalchemy import create_engine

engine = create_engine("postgresql://user:pass@localhost/db")
df = pl.read_database("SELECT * FROM users", connection=engine)

# Using a connection URI (connectorx) for better performance
df = pl.read_database_uri(
    "SELECT * FROM users WHERE age > 25",
    uri="postgresql://user:pass@localhost/db"
)
```
### Write to Database

```python
# Using SQLAlchemy
from sqlalchemy import create_engine

engine = create_engine("postgresql://user:pass@localhost/db")
df.write_database("table_name", connection=engine)

# With options
df.write_database(
    "table_name",
    connection=engine,
    if_table_exists="replace",  # or "append", "fail"
)
```
### Common Database Connectors

**PostgreSQL:**
```python
uri = "postgresql://username:password@localhost:5432/database"
df = pl.read_database_uri("SELECT * FROM table", uri=uri)
```

**MySQL:**
```python
uri = "mysql://username:password@localhost:3306/database"
df = pl.read_database_uri("SELECT * FROM table", uri=uri)
```

**SQLite:**
```python
uri = "sqlite:///path/to/database.db"
df = pl.read_database_uri("SELECT * FROM table", uri=uri)
```
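To sanity-check a connector end to end, a minimal sketch along these lines round-trips a small frame through a local SQLite file (the `example.db` path and `users` table are illustrative; it assumes SQLAlchemy is installed, and the default SQLAlchemy write path also uses pandas under the hood):

```python
import sqlite3

import polars as pl
from sqlalchemy import create_engine

df = pl.DataFrame({"name": ["Alice", "Bob"], "age": [25, 30]})

# Write through a SQLAlchemy engine (illustrative local database file)
engine = create_engine("sqlite:///example.db")
df.write_database("users", connection=engine, if_table_exists="replace")

# Read back through a plain DB-API connection
conn = sqlite3.connect("example.db")
out = pl.read_database("SELECT name, age FROM users WHERE age > 25", connection=conn)
conn.close()
```

The same pattern applies to the PostgreSQL and MySQL URIs above; only the connection string changes.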
## Cloud Storage

### AWS S3

```python
# Read from S3
df = pl.read_parquet("s3://bucket/path/to/file.parquet")
lf = pl.scan_parquet("s3://bucket/path/*.parquet")

# Write to S3
df.write_parquet("s3://bucket/path/output.parquet")

# With credentials
import os
os.environ["AWS_ACCESS_KEY_ID"] = "your_key"
os.environ["AWS_SECRET_ACCESS_KEY"] = "your_secret"
os.environ["AWS_REGION"] = "us-west-2"

df = pl.read_parquet("s3://bucket/file.parquet")
```
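Credentials can also be passed per call rather than through environment variables. A minimal sketch using the `storage_options` argument (bucket path and credential values are placeholders; the key names follow the underlying object-store configuration):

```python
import polars as pl

# Placeholder credentials and bucket path
storage_options = {
    "aws_access_key_id": "your_key",
    "aws_secret_access_key": "your_secret",
    "aws_region": "us-west-2",
}

lf = pl.scan_parquet("s3://bucket/path/*.parquet", storage_options=storage_options)
df = lf.collect()
```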
### Azure Blob Storage

```python
import os

# Read from Azure
df = pl.read_parquet("az://container/path/file.parquet")

# Write to Azure
df.write_parquet("az://container/path/output.parquet")

# With credentials
os.environ["AZURE_STORAGE_ACCOUNT_NAME"] = "account"
os.environ["AZURE_STORAGE_ACCOUNT_KEY"] = "key"
```

### Google Cloud Storage (GCS)

```python
import os

# Read from GCS
df = pl.read_parquet("gs://bucket/path/file.parquet")

# Write to GCS
df.write_parquet("gs://bucket/path/output.parquet")

# With credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/credentials.json"
```
## Google BigQuery

```python
# Read from BigQuery via connectorx
df = pl.read_database_uri(
    "SELECT * FROM project.dataset.table",
    uri="bigquery://project"
)

# Or using Google Cloud SDK
from google.cloud import bigquery
client = bigquery.Client()

query = "SELECT * FROM project.dataset.table WHERE date > '2023-01-01'"
df = pl.from_pandas(client.query(query).to_dataframe())
```
## Apache Arrow

### IPC/Feather Format

**Read:**
```python
df = pl.read_ipc("data.arrow")
lf = pl.scan_ipc("data.arrow")
```

**Write:**
```python
df.write_ipc("output.arrow")

# Compressed
df.write_ipc("output.arrow", compression="zstd")
```
### Arrow Streaming

```python
# Write the Arrow streaming format
df.write_ipc_stream("output.arrows", compression="zstd")

# Read the streaming format
df = pl.read_ipc_stream("output.arrows")
```
### From/To Arrow

```python
import pyarrow as pa

# From Arrow Table
arrow_table = pa.table({"col": [1, 2, 3]})
df = pl.from_arrow(arrow_table)

# To Arrow Table
arrow_table = df.to_arrow()
```
## In-Memory Formats

### Python Dictionaries

```python
# From dict
df = pl.DataFrame({
    "col1": [1, 2, 3],
    "col2": ["a", "b", "c"]
})

# To dict
data_dict = df.to_dict()                 # Column-oriented
data_dict = df.to_dict(as_series=False)  # Lists instead of Series
```
### NumPy Arrays

```python
import numpy as np

# From NumPy
arr = np.array([[1, 2], [3, 4], [5, 6]])
df = pl.DataFrame(arr, schema=["col1", "col2"])

# To NumPy
arr = df.to_numpy()
```
### Pandas DataFrames

```python
import pandas as pd
import pyarrow as pa

# From Pandas
pd_df = pd.DataFrame({"col": [1, 2, 3]})
pl_df = pl.from_pandas(pd_df)

# To Pandas
pd_df = pl_df.to_pandas()

# Largely zero-copy conversions via Arrow
pl_df = pl.from_arrow(pa.Table.from_pandas(pd_df))
pd_df = pl_df.to_pandas(use_pyarrow_extension_array=True)
```
### Lists of Rows

```python
# From list of dicts
data = [
    {"name": "Alice", "age": 25},
    {"name": "Bob", "age": 30}
]
df = pl.DataFrame(data)

# To list of dicts
rows = df.to_dicts()

# From list of tuples (orient="row" treats each tuple as a row)
data = [("Alice", 25), ("Bob", 30)]
df = pl.DataFrame(data, schema=["name", "age"], orient="row")
```
## Streaming Large Files

For datasets larger than memory, use lazy mode with streaming:

```python
# Streaming mode
lf = pl.scan_csv("very_large.csv")
result = lf.filter(pl.col("value") > 100).collect(streaming=True)

# Streaming with multiple files
lf = pl.scan_parquet("data/*.parquet")
result = lf.group_by("category").agg(pl.col("value").sum()).collect(streaming=True)
```
## Best Practices

### Format Selection

**Use Parquet when:**
- Need compression (up to 10x smaller than CSV)
- Want fast reads/writes
- Need to preserve data types
- Working with large datasets
- Need predicate pushdown

**Use CSV when:**
- Need human-readable format
- Interfacing with legacy systems
- Data is small
- Need universal compatibility

**Use JSON when:**
- Working with nested/hierarchical data
- Need web API compatibility
- Data has flexible schema

**Use Arrow IPC when:**
- Need zero-copy data sharing
- Fastest serialization required
- Working between Arrow-compatible systems
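To see the trade-offs listed above on your own data, a rough sketch like the one below writes the same frame in each format and compares file sizes (file names are placeholders):

```python
import os

import polars as pl

df = pl.DataFrame({
    "id": list(range(100_000)),
    "value": [i * 0.5 for i in range(100_000)],
})

# Write the same data in each format (placeholder file names)
df.write_csv("sample.csv")
df.write_parquet("sample.parquet", compression="zstd")
df.write_ndjson("sample.ndjson")
df.write_ipc("sample.arrow", compression="zstd")

for path in ["sample.csv", "sample.parquet", "sample.ndjson", "sample.arrow"]:
    print(path, os.path.getsize(path), "bytes")
```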
### Reading Large Files

```python
# 1. Always use lazy mode
lf = pl.scan_csv("large.csv")  # NOT read_csv

# 2. Filter and select early (pushdown optimization)
result = (
    lf
    .select("col1", "col2", "col3")         # Only needed columns
    .filter(pl.col("date") > "2023-01-01")  # Filter early
    .collect()
)

# 3. Use streaming for very large data
result = lf.filter(...).select(...).collect(streaming=True)

# 4. Read only needed rows during development
df = pl.read_csv("large.csv", n_rows=10000)  # Sample for testing
```
### Writing Large Files

```python
# 1. Use Parquet with compression
df.write_parquet("output.parquet", compression="zstd")

# 2. Use partitioning for very large datasets
df.write_parquet("output", partition_by=["year", "month"])

# 3. Write streaming
lf = pl.scan_csv("input.csv")
lf.sink_parquet("output.parquet")  # Streaming write
```
### Performance Tips

```python
# 1. Specify dtypes when reading CSV
df = pl.read_csv(
    "data.csv",
    dtypes={"id": pl.Int64, "name": pl.Utf8}  # Avoids inference
)

# 2. Use appropriate compression
df.write_parquet("output.parquet", compression="snappy")  # Fast
df.write_parquet("output.parquet", compression="zstd")    # Better compression

# 3. Parallel reading (CSV parsing is multithreaded by default; n_threads tunes it)
df = pl.read_csv("data.csv", n_threads=4)

# 4. Read multiple files in parallel
lf = pl.scan_parquet("data/*.parquet")  # Automatic parallel read
```
## Error Handling

```python
try:
    df = pl.read_csv("data.csv")
except pl.exceptions.ComputeError as e:
    print(f"Error reading CSV: {e}")

# Ignore errors during parsing
df = pl.read_csv("messy.csv", ignore_errors=True)

# Handle missing files
from pathlib import Path
if Path("data.csv").exists():
    df = pl.read_csv("data.csv")
else:
    print("File not found")
```
## Schema Management

```python
# Infer schema from sample
schema = pl.read_csv("data.csv", n_rows=1000).schema

# Use inferred schema for full read
df = pl.read_csv("data.csv", dtypes=schema)

# Define schema explicitly
schema = {
    "id": pl.Int64,
    "name": pl.Utf8,
    "date": pl.Date,
    "value": pl.Float64
}
df = pl.read_csv("data.csv", dtypes=schema)
```