From ec0d1b59055742e602d8e104af1bff9627cdab7e Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sat, 29 Nov 2025 18:15:04 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 15 + README.md | 3 + plugin.lock.json | 104 ++ skills/pixi/LICENSE | 28 + skills/pixi/SKILL.md | 1286 +++++++++++++++++ skills/python-style-guide/LICENSE | 395 +++++ skills/python-style-guide/SKILL.md | 482 ++++++ .../references/advanced_types.md | 259 ++++ .../references/antipatterns.md | 361 +++++ .../references/docstring_examples.md | 384 +++++ skills/r-development/SKILL.md | 214 +++ .../references/object-systems.md | 310 ++++ .../references/package-development.md | 393 +++++ .../r-development/references/performance.md | 311 ++++ .../references/rlang-patterns.md | 247 ++++ skills/shell-scripting/README.md | 1 + skills/shell-scripting/SKILL.md | 140 ++ skills/shell-scripting/references/patterns.md | 505 +++++++ skills/shell-scripting/references/template.sh | 258 ++++ 19 files changed, 5696 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 plugin.lock.json create mode 100644 skills/pixi/LICENSE create mode 100644 skills/pixi/SKILL.md create mode 100644 skills/python-style-guide/LICENSE create mode 100644 skills/python-style-guide/SKILL.md create mode 100644 skills/python-style-guide/references/advanced_types.md create mode 100644 skills/python-style-guide/references/antipatterns.md create mode 100644 skills/python-style-guide/references/docstring_examples.md create mode 100644 skills/r-development/SKILL.md create mode 100644 skills/r-development/references/object-systems.md create mode 100644 skills/r-development/references/package-development.md create mode 100644 skills/r-development/references/performance.md create mode 100644 skills/r-development/references/rlang-patterns.md create mode 100644 skills/shell-scripting/README.md create mode 100644 skills/shell-scripting/SKILL.md create mode 100644 
skills/shell-scripting/references/patterns.md create mode 100755 skills/shell-scripting/references/template.sh diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..8977c6b --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "programming-skills", + "description": "Collection of skills used for various programming tasks", + "version": "0.0.0-2025.11.28", + "author": { + "name": "Falko Noé", + "email": "falkonoe@gmail.com" + }, + "skills": [ + "./skills/shell-scripting", + "./skills/r-development", + "./skills/python-style-guide", + "./skills/pixi" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d38e57e --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# programming-skills + +Collection of skills used for various programming tasks diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..6440742 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,104 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:CodingKaiser/claude-kaiser-skills:programming-skills", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "eb44df3f7d48b76ef0143ee8f93d0d637a510756", + "treeHash": "a891b505518265eba1bfb02b421971f74b39e284194ab1b43dfdd4f965ee91a3", + "generatedAt": "2025-11-28T10:10:03.663248Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "programming-skills", + "description": "Collection of skills used for various programming tasks" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "e557525503f5cff708690de58a73d8405e2919a16a7d0dd909449017ea950c4f" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": 
"358f3defc2ff66da95972f2bfbead49442899a9c6e2281b4408e3ac46b5b00e8" + }, + { + "path": "skills/pixi/LICENSE", + "sha256": "936fd8bfe2a7b0bb13f939135ceb73ece2ec0fdea04fcb571d7c3d8be885a29c" + }, + { + "path": "skills/pixi/SKILL.md", + "sha256": "bfaa74c06182d089be380de6c61449b8cda50d14fc075cc92c3421253e975caf" + }, + { + "path": "skills/shell-scripting/README.md", + "sha256": "4d4d2e417c3738fd5ff0b0eac0024f7f4d2ffb735f8e4ef23eac103d3cb676e8" + }, + { + "path": "skills/shell-scripting/SKILL.md", + "sha256": "440600e15f5e4868366a27043feb712d0a3deaf52a7752d9ca8e9296e0829f2a" + }, + { + "path": "skills/shell-scripting/references/template.sh", + "sha256": "e365addd9cd11482d5ba23b2ff242cab442b1dd1326f2a35a9952f85d296cedf" + }, + { + "path": "skills/shell-scripting/references/patterns.md", + "sha256": "f801cdf109642ea1c09e6d411e16c6d6ce27ebc8be095cca6240021bd03694b8" + }, + { + "path": "skills/python-style-guide/LICENSE", + "sha256": "7e7170e3cebf88a9f60c7b8421418323c09304da1af4d5e90f4da1dc1c8a2661" + }, + { + "path": "skills/python-style-guide/SKILL.md", + "sha256": "f12bade5ad7ad86b2170d1e0e8384d124805307f678088574c04142f37bdcbf8" + }, + { + "path": "skills/python-style-guide/references/docstring_examples.md", + "sha256": "5f1b41345546235f1f13525689a91cc0696d15d62d2aa478d2585f6d6e24c2d2" + }, + { + "path": "skills/python-style-guide/references/advanced_types.md", + "sha256": "5dffa3bec8a81804c04fbadc035cd1ae4490cdac588c5e2b521e2c832e94e0ac" + }, + { + "path": "skills/python-style-guide/references/antipatterns.md", + "sha256": "89c834f62b88bd8f31548b666f944ecfdb58ae8af32a0513f7cac6a9888a5388" + }, + { + "path": "skills/r-development/SKILL.md", + "sha256": "c0afa20c816cb54aab14cb7256f5bbf435b795976b327e614c6be0bc6d119333" + }, + { + "path": "skills/r-development/references/rlang-patterns.md", + "sha256": "e1f94f60d6312de8b0cdcf16e0f0d3a5096f246f8358bf7594e733ba886c757b" + }, + { + "path": "skills/r-development/references/object-systems.md", + "sha256": 
"d00e48f01108a21806db1ef0f6d5606e5c183f8a0d3b7be8fba90f5bbf34e69b" + }, + { + "path": "skills/r-development/references/performance.md", + "sha256": "9d7558bd6b62d65e353da06394c317db7f08d639e0567281562a3c4d1a4fe4d2" + }, + { + "path": "skills/r-development/references/package-development.md", + "sha256": "c4ab3453e58ab5623f98aeeb3a572c05102afe4dd0ba5d7a0ba73712f57c326f" + } + ], + "dirSha256": "a891b505518265eba1bfb02b421971f74b39e284194ab1b43dfdd4f965ee91a3" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/pixi/LICENSE b/skills/pixi/LICENSE new file mode 100644 index 0000000..f6fb483 --- /dev/null +++ b/skills/pixi/LICENSE @@ -0,0 +1,28 @@ +BSD 3-Clause License + +Copyright (c) 2025, UW Scientific Software Engineering Center + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/skills/pixi/SKILL.md b/skills/pixi/SKILL.md new file mode 100644 index 0000000..dacfbfc --- /dev/null +++ b/skills/pixi/SKILL.md @@ -0,0 +1,1286 @@ +--- +name: pixi-package-manager +description: Fast, reproducible scientific Python environments with pixi - conda and PyPI unified +--- + +# Pixi Package Manager for Scientific Python + +Master **pixi**, the modern package manager that unifies conda and PyPI ecosystems for fast, reproducible scientific Python development. Learn how to manage complex scientific dependencies, create isolated environments, and build reproducible workflows using `pyproject.toml` integration. 
+ +**Official Documentation**: https://pixi.sh +**GitHub**: https://github.com/prefix-dev/pixi + +## Quick Reference Card + +### Setup +```bash +# Installation must be performed separately +# On the server, load via lmod if not already in path +module load Dev/pixi + +# Initialize new project with pyproject.toml +pixi init --format pyproject + +# Initialize existing Python project +pixi init --format pyproject --import-environment +``` + +### Essential Commands +```bash +# Add dependencies +pixi add numpy scipy pandas # conda packages +pixi add --pypi pytest-cov # PyPI-only packages +pixi add --feature dev pytest ruff # dev environment + +# Install all dependencies +pixi install + +# Run commands in environment +pixi run python script.py +pixi run pytest + +# Shell with environment activated +pixi shell + +# Add tasks +pixi task add test "pytest tests/" +pixi task add docs "sphinx-build docs/ docs/_build" + +# Run tasks +pixi run test +pixi run docs + +# Update dependencies +pixi update numpy # update specific +pixi update # update all + +# List packages +pixi list +pixi tree numpy # show dependency tree +``` + +### Quick Decision Tree: Pixi vs UV vs Both + +``` +Need compiled scientific libraries (NumPy, SciPy, GDAL)? +├─ YES → Use pixi (conda-forge has pre-built binaries) +└─ NO → Consider uv for pure Python projects + +Need multi-language support (Python + R, Julia, C++)? +├─ YES → Use pixi (supports conda ecosystem) +└─ NO → uv sufficient for Python-only + +Need multiple environments (dev, test, prod, GPU, CPU)? +├─ YES → Use pixi features for environment management +└─ NO → Single environment projects work with either + +Need reproducible environments across platforms? +├─ CRITICAL → Use pixi (lockfiles include all platforms) +└─ LESS CRITICAL → uv also provides lockfiles + +Want to use both conda-forge AND PyPI packages? +├─ YES → Use pixi (seamless integration) +└─ ONLY PYPI → uv is simpler and faster + +Legacy conda environment files (environment.yml)? 
+├─ YES → pixi can import and modernize +└─ NO → Start fresh with pixi or uv +``` + +## When to Use This Skill + +- **Setting up scientific Python projects** with complex compiled dependencies (NumPy, SciPy, Pandas, scikit-learn, GDAL, netCDF4) +- **Building reproducible research environments** that work identically across different machines and platforms +- **Managing multi-language projects** that combine Python with R, Julia, C++, or Fortran +- **Creating multiple environment configurations** for different hardware (GPU/CPU), testing scenarios, or deployment targets +- **Replacing conda/mamba workflows** with faster, more reliable dependency resolution +- **Developing packages that depend on both conda-forge and PyPI** packages +- **Migrating from environment.yml or requirements.txt** to modern, reproducible workflows +- **Running automated scientific workflows** with task runners and CI/CD integration +- **Working with geospatial, climate, or astronomy packages** that require complex C/Fortran dependencies + +## Core Concepts + +### 1. Unified Package Management (conda + PyPI) + +Pixi resolves dependencies from **both conda-forge and PyPI** in a single unified graph, ensuring compatibility: + +```toml +[project] +name = "my-science-project" +dependencies = [ + "numpy>=1.24", # from conda-forge (optimized builds) + "pandas>=2.0", # from conda-forge +] + +[tool.pixi.pypi-dependencies] +my-custom-pkg = ">=1.0" # PyPI-only package +``` + +**Why this matters for scientific Python:** +- Get optimized NumPy/SciPy builds from conda-forge (MKL, OpenBLAS) +- Use PyPI packages not available in conda +- Single lockfile ensures all dependencies are compatible + +### 2. 
Multi-Platform Lockfiles + +Pixi generates `pixi.lock` with dependency specifications for **all platforms** (Linux, macOS, Windows, different architectures): + +```toml +# pixi.lock includes: +# - linux-64 +# - osx-64, osx-arm64 +# - win-64 +``` + +**Benefits:** +- Commit lockfile to git → everyone gets identical environments +- Works on collaborator's different OS without changes +- CI/CD uses exact same versions as local development + +### 3. Feature-Based Environments + +Create multiple environments using **features** without duplicating dependencies: + +```toml +[tool.pixi.feature.test.dependencies] +pytest = ">=7.0" +pytest-cov = ">=4.0" + +[tool.pixi.feature.gpu.dependencies] +pytorch-cuda = "11.8.*" + +[tool.pixi.environments] +test = ["test"] +gpu = ["gpu"] +gpu-test = ["gpu", "test"] # combines features +``` + +### 4. Task Automation + +Define reusable commands as tasks: + +```toml +[tool.pixi.tasks] +test = "pytest tests/ -v" +format = "ruff format src/ tests/" +lint = "ruff check src/ tests/" +docs = "sphinx-build docs/ docs/_build" +analyse = { cmd = "python scripts/analyze.py", depends-on = ["test"] } +``` + +### 5. Fast Dependency Resolution + +Pixi uses **rattler** (Rust-based conda resolver) for 10-100x faster resolution than conda: + +- Parallel package downloads +- Efficient caching +- Smart dependency solver + +### 6. 
pyproject.toml Integration + +Pixi reads standard Python project metadata from `pyproject.toml`, enabling: +- Single source of truth for project configuration +- Compatibility with pip, uv, and other tools +- Standard Python packaging workflows + +## Quick Start + +### Minimal Example: Data Analysis Project + +```bash +# Create new project +mkdir climate-analysis && cd climate-analysis +pixi init --format pyproject + +# Add scientific stack +pixi add python=3.11 numpy pandas matplotlib xarray + +# Add development tools +pixi add --feature dev pytest ipython ruff + +# Create analysis script +cat > analyze.py << 'EOF' +import pandas as pd +import matplotlib.pyplot as plt + +# Your analysis code +data = pd.read_csv("data.csv") +data.plot() +plt.savefig("output.png") +EOF + +# Run in pixi environment +pixi run python analyze.py + +# Or activate shell +pixi shell +python analyze.py +``` + +### Example: Machine Learning Project with GPU Support + +```bash +# Initialize project +pixi init ml-project --format pyproject +cd ml-project + +# Add base dependencies +pixi add python=3.11 numpy pandas scikit-learn matplotlib jupyter + +# Add CPU PyTorch +pixi add --platform linux-64 --platform osx-arm64 pytorch torchvision cpuonly -c pytorch + +# Create GPU feature +pixi add --feature gpu pytorch-cuda=11.8 -c pytorch -c nvidia + +# Add development tools +pixi add --feature dev pytest black mypy + +# Configure environments in pyproject.toml +cat >> pyproject.toml << 'EOF' + +[tool.pixi.environments] +default = { solve-group = "default" } +gpu = { features = ["gpu"], solve-group = "default" } +dev = { features = ["dev"], solve-group = "default" } +EOF + +# Install and run +pixi install +pixi run python train.py # uses default (CPU) +pixi run --environment gpu python train.py # uses GPU +``` + +## Patterns + +### Pattern 1: Converting Existing Projects to Pixi + +**Scenario**: You have an existing project with `requirements.txt` or `environment.yml` + +**Solution**: + +```bash +# 
From requirements.txt +cd existing-project +pixi init --format pyproject + +# Import from requirements.txt +while IFS= read -r package; do + # Skip comments and empty lines + [[ "$package" =~ ^#.*$ ]] || [[ -z "$package" ]] && continue + + # Try conda first, fallback to PyPI + pixi add "$package" 2>/dev/null || pixi add --pypi "$package" +done < requirements.txt + +# From environment.yml +pixi init --format pyproject --import-environment environment.yml + +# Verify installation +pixi install +pixi run python -c "import numpy, pandas, scipy; print('Success!')" +``` + +**Best Practice**: Review generated `pyproject.toml` and organize dependencies: +- Core runtime dependencies → `[project.dependencies]` +- PyPI-only packages → `[tool.pixi.pypi-dependencies]` +- Development tools → `[tool.pixi.feature.dev.dependencies]` + +### Pattern 2: Multi-Environment Scientific Workflow + +**Scenario**: Different environments for development, testing, production, and GPU computing + +**Implementation**: + +```toml +[project] +name = "research-pipeline" +version = "0.1.0" +dependencies = [ + "python>=3.11", + "numpy>=1.24", + "pandas>=2.0", + "xarray>=2023.1", +] + +# Development tools +[tool.pixi.feature.dev.dependencies] +ipython = ">=8.0" +jupyter = ">=1.0" +ruff = ">=0.1" + +[tool.pixi.feature.dev.pypi-dependencies] +jupyterlab-vim = ">=0.16" + +# Testing tools +[tool.pixi.feature.test.dependencies] +pytest = ">=7.4" +pytest-cov = ">=4.1" +pytest-xdist = ">=3.3" +hypothesis = ">=6.82" + +# GPU dependencies +[tool.pixi.feature.gpu.dependencies] +pytorch-cuda = "11.8.*" +cudatoolkit = "11.8.*" + +[tool.pixi.feature.gpu.pypi-dependencies] +nvidia-ml-py = ">=12.0" + +# Production optimizations +[tool.pixi.feature.prod.dependencies] +python = "3.11.*" # pin exact version + +# Define environments combining features +[tool.pixi.environments] +default = { solve-group = "default" } +dev = { features = ["dev"], solve-group = "default" } +test = { features = ["test"], solve-group = 
"default" } +gpu = { features = ["gpu"], solve-group = "gpu" } +gpu-dev = { features = ["gpu", "dev"], solve-group = "gpu" } +prod = { features = ["prod"], solve-group = "prod" } + +# Tasks for each environment +[tool.pixi.tasks] +dev-notebook = { cmd = "jupyter lab", env = { JUPYTER_CONFIG_DIR = ".jupyter" } } +test = "pytest tests/ -v --cov=src" +test-parallel = "pytest tests/ -n auto" +train-cpu = "python train.py --device cpu" +train-gpu = "python train.py --device cuda" +benchmark = "python benchmark.py" +``` + +**Usage**: + +```bash +# Development +pixi run --environment dev dev-notebook + +# Testing +pixi run --environment test test + +# GPU training +pixi run --environment gpu train-gpu + +# Production +pixi run --environment prod benchmark +``` + +### Pattern 3: Scientific Library Development + +**Scenario**: Developing a scientific Python package with proper packaging, testing, and documentation + +**Structure**: + +```toml +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "mylib" +version = "0.1.0" +description = "Scientific computing library" +dependencies = [ + "numpy>=1.24", + "scipy>=1.11", +] + +[project.optional-dependencies] +viz = ["matplotlib>=3.7", "seaborn>=0.12"] + +# Development dependencies +[tool.pixi.feature.dev.dependencies] +ipython = "*" +ruff = "*" +mypy = "*" + +# Testing dependencies +[tool.pixi.feature.test.dependencies] +pytest = ">=7.4" +pytest-cov = ">=4.1" +pytest-benchmark = ">=4.0" +hypothesis = ">=6.82" + +# Documentation dependencies +[tool.pixi.feature.docs.dependencies] +sphinx = ">=7.0" +sphinx-rtd-theme = ">=1.3" +numpydoc = ">=1.5" +sphinx-gallery = ">=0.14" + +[tool.pixi.feature.docs.pypi-dependencies] +myst-parser = ">=2.0" + +# Build dependencies +[tool.pixi.feature.build.dependencies] +build = "*" +twine = "*" + +[tool.pixi.environments] +default = { features = [], solve-group = "default" } +dev = { features = ["dev", "test", "docs"], solve-group = "default" } +test = 
{ features = ["test"], solve-group = "default" } +docs = { features = ["docs"], solve-group = "default" } + +# Tasks for development workflow +[tool.pixi.tasks] +# Development +install-dev = "pip install -e ." +format = "ruff format src/ tests/" +lint = "ruff check src/ tests/" +typecheck = "mypy src/" + +# Testing +test = "pytest tests/ -v" +test-cov = "pytest tests/ --cov=src --cov-report=html --cov-report=term" +test-fast = "pytest tests/ -x -v" +benchmark = "pytest tests/benchmarks/ --benchmark-only" + +# Documentation +docs-build = "sphinx-build docs/ docs/_build/html" +docs-serve = { cmd = "python -m http.server 8000 -d docs/_build/html", depends-on = ["docs-build"] } +docs-clean = "rm -rf docs/_build docs/generated" + +# Build and release +build = "python -m build" +publish-test = { cmd = "twine upload --repository testpypi dist/*", depends-on = ["build"] } +publish = { cmd = "twine upload dist/*", depends-on = ["build"] } + +# Combined workflows +ci = { depends-on = ["format", "lint", "typecheck", "test-cov"] } +pre-commit = { depends-on = ["format", "lint", "test-fast"] } +``` + +**Workflow**: + +```bash +# Initial setup +pixi install --environment dev +pixi run install-dev + +# Development cycle +pixi run format # format code +pixi run lint # check style +pixi run typecheck # type checking +pixi run test # run tests + +# Or run all checks +pixi run ci + +# Build documentation +pixi run docs-build +pixi run docs-serve # view at http://localhost:8000 + +# Release workflow +pixi run build +pixi run publish-test # test on TestPyPI +pixi run publish # publish to PyPI +``` + +### Pattern 4: Conda + PyPI Dependency Strategy + +**Scenario**: Optimize dependency sources for performance and availability + +**Strategy**: + +```toml +[project] +dependencies = [ + # Core scientific stack: prefer conda-forge (optimized builds) + "numpy>=1.24", # MKL or OpenBLAS optimized + "scipy>=1.11", # optimized BLAS/LAPACK + "pandas>=2.0", # optimized pandas + "matplotlib>=3.7", # 
compiled components + "scikit-learn>=1.3", # optimized algorithms + + # Geospatial/climate: conda-forge essential (C/Fortran deps) + "xarray>=2023.1", + "netcdf4>=1.6", + "h5py>=3.9", + "rasterio>=1.3", # GDAL dependency + + # Data processing: conda-forge preferred + "dask>=2023.1", + "numba>=0.57", # LLVM dependency +] + +[tool.pixi.pypi-dependencies] +# Pure Python packages or PyPI-only packages +my-custom-tool = ">=1.0" +experimental-lib = { git = "https://github.com/user/repo.git" } +internal-pkg = { path = "../internal-pkg", editable = true } +``` + +**Decision Rules**: + +1. **Use conda-forge (pixi add) for**: + - NumPy, SciPy, Pandas (optimized builds) + - Packages with C/C++/Fortran extensions (GDAL, netCDF4, h5py) + - Packages with complex system dependencies (Qt, OpenCV) + - R, Julia, or other language packages + +2. **Use PyPI (pixi add --pypi) for**: + - Pure Python packages not in conda-forge + - Bleeding-edge versions before conda-forge packaging + - Internal/private packages + - Editable local packages during development + +### Pattern 5: Reproducible Research Environment + +**Scenario**: Ensure research is reproducible across time and machines + +**Implementation**: + +```toml +[project] +name = "nature-paper-2024" +version = "1.0.0" +description = "Analysis for Nature Paper 2024" +requires-python = ">=3.11,<3.12" # pin Python version range + +dependencies = [ + "python=3.11.6", # exact Python version + "numpy=1.26.2", # exact versions for reproducibility + "pandas=2.1.4", + "scipy=1.11.4", + "matplotlib=3.8.2", + "scikit-learn=1.3.2", +] + +[tool.pixi.pypi-dependencies] +# Pin with exact hashes for ultimate reproducibility +seaborn = "==0.13.0" + +# Analysis environments +[tool.pixi.feature.analysis.dependencies] +jupyter = "1.0.0" +jupyterlab = "4.0.9" + +[tool.pixi.feature.analysis.pypi-dependencies] +jupyterlab-vim = "0.16.0" + +# Environments +[tool.pixi.environments] +default = { solve-group = "default" } +analysis = { features = ["analysis"], 
solve-group = "default" } + +# Reproducible tasks +[tool.pixi.tasks] +# Data processing pipeline +download-data = "python scripts/01_download.py" +preprocess = { cmd = "python scripts/02_preprocess.py", depends-on = ["download-data"] } +analyze = { cmd = "python scripts/03_analyze.py", depends-on = ["preprocess"] } +visualize = { cmd = "python scripts/04_visualize.py", depends-on = ["analyze"] } +full-pipeline = { depends-on = ["download-data", "preprocess", "analyze", "visualize"] } + +# Notebook execution +run-notebooks = "jupyter nbconvert --execute --to notebook --inplace notebooks/*.ipynb" +``` + +**Best Practices**: + +```bash +# Generate lockfile +pixi install + +# Commit lockfile to repository +git add pixi.lock pyproject.toml +git commit -m "Lock environment for reproducibility" + +# Anyone can recreate exact environment +git clone https://github.com/user/nature-paper-2024.git +cd nature-paper-2024 +pixi install # installs exact versions from pixi.lock + +# Run complete pipeline +pixi run full-pipeline + +# Archive for long-term preservation +pixi list --export environment.yml # backup as conda format +``` + +### Pattern 6: Cross-Platform Development + +**Scenario**: Team members on Linux, macOS (Intel/ARM), and Windows + +**Configuration**: + +```toml +[project] +name = "cross-platform-science" +dependencies = [ + "python>=3.11", + "numpy>=1.24", + "pandas>=2.0", +] + +# Platform-specific dependencies +[tool.pixi.target.linux-64.dependencies] +# Linux-specific optimized builds +mkl = "*" + +[tool.pixi.target.osx-arm64.dependencies] +# Apple Silicon optimizations +accelerate = "*" + +[tool.pixi.target.win-64.dependencies] +# Windows-specific packages +pywin32 = "*" + +# Tasks with platform-specific behavior +[tool.pixi.tasks] +test = "pytest tests/" + +[tool.pixi.target.linux-64.tasks] +test-gpu = "pytest tests/ --gpu" + +[tool.pixi.target.win-64.tasks] +test = "pytest tests/ --timeout=30" # slower on Windows CI +``` + +**Platform Selectors**: + +```toml 
+# Supported platforms +[tool.pixi.platforms] +linux-64 = true +linux-aarch64 = true +osx-64 = true +osx-arm64 = true +win-64 = true +``` + +### Pattern 7: Task Dependencies and Workflows + +**Scenario**: Complex scientific workflows with data dependencies + +**Implementation**: + +```toml +[tool.pixi.tasks] +# Data acquisition +download-raw = "python scripts/download.py --source=api" +validate-raw = { cmd = "python scripts/validate.py data/raw/", depends-on = ["download-raw"] } + +# Data processing pipeline +clean-data = { cmd = "python scripts/clean.py", depends-on = ["validate-raw"] } +transform = { cmd = "python scripts/transform.py", depends-on = ["clean-data"] } +feature-engineering = { cmd = "python scripts/features.py", depends-on = ["transform"] } + +# Analysis +train-model = { cmd = "python scripts/train.py", depends-on = ["feature-engineering"] } +evaluate = { cmd = "python scripts/evaluate.py", depends-on = ["train-model"] } +visualize = { cmd = "python scripts/visualize.py", depends-on = ["evaluate"] } + +# Testing at each stage +test-cleaning = "pytest tests/test_clean.py" +test-transform = "pytest tests/test_transform.py" +test-features = "pytest tests/test_features.py" +test-model = "pytest tests/test_model.py" + +# Combined workflows +all-tests = { depends-on = ["test-cleaning", "test-transform", "test-features", "test-model"] } +full-pipeline = { depends-on = ["download-raw", "validate-raw", "clean-data", "transform", "feature-engineering", "train-model", "evaluate", "visualize"] } +pipeline-with-tests = { depends-on = ["all-tests", "full-pipeline"] } + +# Parallel execution where possible +[tool.pixi.task.download-supplementary] +cmd = "python scripts/download_supplement.py" + +[tool.pixi.task.process-all] +depends-on = ["download-raw", "download-supplementary"] # run in parallel +``` + +**Running Workflows**: + +```bash +# Run entire pipeline +pixi run full-pipeline + +# Run with testing +pixi run pipeline-with-tests + +# Check what will run 
+pixi task list --summary + +# Visualize task dependencies +pixi task info full-pipeline +``` + +### Pattern 8: Integration with UV for Pure Python Development + +**Scenario**: Use pixi for complex dependencies, uv for fast pure Python workflows + +**Hybrid Approach**: + +```toml +[project] +name = "hybrid-project" +dependencies = [ + # Heavy scientific deps via pixi/conda + "python>=3.11", + "numpy>=1.24", + "scipy>=1.11", + "gdal>=3.7", # complex C++ dependency + "netcdf4>=1.6", # Fortran dependency +] + +[tool.pixi.pypi-dependencies] +# Pure Python packages +requests = ">=2.31" +pydantic = ">=2.0" +typer = ">=0.9" + +[tool.pixi.feature.dev.dependencies] +ruff = "*" +mypy = "*" + +[tool.pixi.feature.dev.pypi-dependencies] +pytest = ">=7.4" + +[tool.pixi.tasks] +# Use uv for fast pure Python operations +install-dev = "uv pip install -e ." +sync-deps = "uv pip sync requirements.txt" +add-py-dep = "uv pip install" +``` + +**Workflow**: + +```bash +# Pixi manages environment with conda packages +pixi install + +# Activate pixi environment +pixi shell + +# Inside pixi shell, use uv for fast pure Python operations +uv pip install requests httpx pydantic # fast pure Python installs +uv pip freeze > requirements-py.txt + +# Or define as tasks +pixi run install-dev +``` + +**When to use this pattern**: +- Project needs conda for compiled deps (GDAL, netCDF, HDF5) +- But also rapid iteration on pure Python dependencies +- Want uv's speed for locking/installing pure Python packages +- Need conda's solver for complex scientific dependency graphs + +### Pattern 9: CI/CD Integration + +**Scenario**: Reproducible testing in GitHub Actions, GitLab CI, etc. 
+ +**GitHub Actions Example**: + +```yaml +# .github/workflows/test.yml +name: Tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Pixi + uses: prefix-dev/setup-pixi@v0.4.1 + with: + pixi-version: latest + cache: true + + - name: Install dependencies + run: pixi install --environment test + + - name: Run tests + run: pixi run test + + - name: Upload coverage + uses: codecov/codecov-action@v3 + with: + file: ./coverage.xml + + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: prefix-dev/setup-pixi@v0.4.1 + - run: pixi run format --check + - run: pixi run lint + + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: prefix-dev/setup-pixi@v0.4.1 + - run: pixi run --environment docs docs-build + - uses: actions/upload-artifact@v3 + with: + name: documentation + path: docs/_build/html +``` + +**GitLab CI Example**: + +```yaml +# .gitlab-ci.yml +image: ubuntu:latest + +before_script: + - curl -fsSL https://pixi.sh/install.sh | bash + - export PATH=$HOME/.pixi/bin:$PATH + +stages: + - test + - build + +test: + stage: test + script: + - pixi run test + cache: + key: ${CI_COMMIT_REF_SLUG} + paths: + - .pixi/ + +lint: + stage: test + script: + - pixi run lint + - pixi run typecheck + +docs: + stage: build + script: + - pixi run --environment docs docs-build + artifacts: + paths: + - docs/_build/html +``` + +### Pattern 10: Local Development with Remote Computing + +**Scenario**: Develop locally, run heavy computation on remote GPU cluster + +**Local Configuration** (`pyproject.toml`): + +```toml +[project] +dependencies = [ + "numpy>=1.24", + "pandas>=2.0", +] + +[tool.pixi.feature.dev.dependencies] +jupyter = "*" +matplotlib = "*" + +[tool.pixi.feature.remote.dependencies] +# Heavy GPU dependencies only for remote +pytorch-cuda = "11.8.*" +tensorboard = "*" + 
+[tool.pixi.environments] +default = { features = ["dev"], solve-group = "default" } +remote = { features = ["remote"], solve-group = "remote" } + +[tool.pixi.tasks] +notebook = "jupyter lab" +sync-remote = "rsync -av --exclude='.pixi' . user@remote:~/project/" +remote-train = { cmd = "ssh user@remote 'cd ~/project && pixi run train'", depends-on = ["sync-remote"] } +``` + +**Workflow**: + +```bash +# Local development (no GPU deps) +pixi install +pixi run notebook + +# Push to remote and train +pixi run remote-train + +# Or manually +pixi run sync-remote +ssh user@remote +cd ~/project +pixi install --environment remote # installs GPU deps on remote +pixi run --environment remote train +``` + +## Best Practices Checklist + +### Project Setup +- [ ] Use `pixi init --format pyproject` for new projects +- [ ] Set explicit Python version constraint (`python>=3.11,<3.13`) +- [ ] Organize dependencies by source (conda vs PyPI) +- [ ] Create separate features for dev, test, docs environments +- [ ] Define useful tasks for common workflows +- [ ] Set up `.gitignore` to exclude `.pixi/` directory + +### Dependency Management +- [ ] Prefer conda-forge for compiled scientific packages (NumPy, SciPy, GDAL) +- [ ] Use PyPI only for pure Python or conda-unavailable packages +- [ ] Pin exact versions for reproducible research +- [ ] Use version ranges for libraries (allow updates) +- [ ] Specify solve groups for independent environment solving +- [ ] Use `pixi update` regularly to get security patches + +### Reproducibility +- [ ] Commit `pixi.lock` to version control +- [ ] Include all platforms in lockfile for cross-platform teams +- [ ] Document environment recreation steps in README +- [ ] Use exact version pins for published research +- [ ] Test environment from scratch periodically +- [ ] Archive environments for long-term preservation + +### Performance +- [ ] Use pixi's parallel downloads (automatic) +- [ ] Leverage caching in CI/CD (`prefix-dev/setup-pixi` action) +- [ ] 
Keep environments minimal (only necessary dependencies) +- [ ] Use solve groups to isolate independent environments +- [ ] Clean old packages with `pixi clean cache` + +### Development Workflow +- [ ] Define tasks for common operations (test, lint, format) +- [ ] Use task dependencies for complex workflows +- [ ] Create environment-specific tasks when needed +- [ ] Use `pixi shell` for interactive development +- [ ] Use `pixi run` for automated scripts and CI +- [ ] Test in clean environment before releasing + +### Team Collaboration +- [ ] Document pixi installation in README +- [ ] Provide quick start commands for new contributors +- [ ] Use consistent naming for features and environments +- [ ] Set up pre-commit hooks with pixi tasks +- [ ] Integrate with CI/CD for automated testing +- [ ] Keep pyproject.toml clean and well-commented + +### Security +- [ ] Audit dependencies regularly (`pixi list`) +- [ ] Use trusted channels (conda-forge, PyPI) +- [ ] Review `pixi.lock` changes in PRs +- [ ] Keep pixi updated to latest version +- [ ] Use virtual environments (pixi automatic) +- [ ] Scan for vulnerabilities in dependencies + +## Resources + +### Official Documentation +- **Pixi Website**: https://pixi.sh +- **Documentation**: https://pixi.sh/latest/ +- **GitHub Repository**: https://github.com/prefix-dev/pixi +- **Configuration Reference**: https://pixi.sh/latest/reference/project_configuration/ + +### Community & Support +- **Discord**: https://discord.gg/kKV8ZxyzY4 +- **GitHub Discussions**: https://github.com/prefix-dev/pixi/discussions +- **Issue Tracker**: https://github.com/prefix-dev/pixi/issues + +### Related Technologies +- **Conda-forge**: https://conda-forge.org/ +- **Rattler**: https://github.com/mamba-org/rattler (underlying solver) +- **PyPI**: https://pypi.org/ +- **UV Package Manager**: https://github.com/astral-sh/uv + +### Complementary Skills +- **scientific-python-packaging**: Modern Python packaging patterns +- **scientific-python-testing**: 
Testing strategies with pytest +**uv-package-manager**: Fast pure-Python package management + +### Learning Resources +- **Pixi Examples**: https://github.com/prefix-dev/pixi/tree/main/examples +- **Migration Guides**: https://pixi.sh/latest/switching_from/conda/ +- **Best Practices**: https://pixi.sh/latest/features/ + +### Scientific Python Ecosystem +- **NumPy**: https://numpy.org/ +- **SciPy**: https://scipy.org/ +- **Pandas**: https://pandas.pydata.org/ +- **Scikit-learn**: https://scikit-learn.org/ +- **PyData**: https://pydata.org/ + +## Common Issues and Solutions + +### Issue: Package Not Found in Conda-forge + +**Problem**: Running `pixi add my-package` fails with "package not found" + +**Solution**: +```bash +# Search conda-forge +pixi search my-package + +# If not in conda-forge, use PyPI +pixi add --pypi my-package + +# Check if package has a different name in conda +# Example: users often try `sklearn`, but the package is named `scikit-learn` +pixi add scikit-learn # correct name on conda-forge (and PyPI) +``` + +### Issue: Conflicting Dependencies + +**Problem**: Dependency solver fails with "conflict" error + +**Solution**: +```bash +# Check dependency tree +pixi tree numpy + +# Use solve groups to isolate conflicts +[tool.pixi.environments] +env1 = { features = ["feat1"], solve-group = "group1" } +env2 = { features = ["feat2"], solve-group = "group2" } # separate solver + +# Relax version constraints +# Instead of: numpy==1.26.0 +# Use: numpy>=1.24,<2.0 + +# Force specific channel priority +pixi add numpy -c conda-forge --force-reinstall +``` + +### Issue: Slow Environment Creation + +**Problem**: `pixi install` takes very long + +**Solution**: +```bash +# Use solve groups to avoid re-solving everything +[tool.pixi.environments] +default = { solve-group = "default" } +test = { features = ["test"], solve-group = "default" } # reuses default solve + +# Clean cache if corrupted +pixi clean cache + +# Check for large dependency trees +pixi tree --depth 2 + +# Update pixi to latest version +pixi 
self-update +``` + +### Issue: Platform-Specific Failures + +**Problem**: Works on Linux but fails on macOS/Windows + +**Solution**: +```toml +# Use platform-specific dependencies +[tool.pixi.target.osx-arm64.dependencies] +# macOS ARM specific packages +tensorflow-macos = "*" + +[tool.pixi.target.linux-64.dependencies] +# Linux-specific +tensorflow = "*" + +# Exclude unsupported platforms +[tool.pixi.platforms] +linux-64 = true +osx-arm64 = true +# win-64 intentionally excluded if unsupported +``` + +### Issue: PyPI Package Installation Fails + +**Problem**: `pixi add --pypi package` fails with build errors + +**Solution**: +```bash +# Install build dependencies from conda first +pixi add python-build setuptools wheel + +# Then retry PyPI package +pixi add --pypi package + +# For packages needing system libraries +pixi add libgdal # system library +pixi add --pypi gdal # Python bindings + +# Check if conda-forge version exists +pixi search gdal # might have compiled version +``` + +### Issue: Environment Activation in Scripts + +**Problem**: Need to run scripts outside of `pixi run` + +**Solution**: +```bash +# Use pixi shell for interactive sessions +pixi shell +python script.py + +# For automation, always use pixi run +pixi run python script.py + +# In bash scripts +#!/usr/bin/env bash +eval "$(pixi shell-hook)" +python script.py + +# In task definitions +[tool.pixi.tasks] +run-script = "python script.py" # automatically in environment +``` + +### Issue: Lockfile Merge Conflicts + +**Problem**: Git merge conflicts in `pixi.lock` + +**Solution**: +```bash +# Accept one version +git checkout --theirs pixi.lock # or --ours + +# Regenerate lockfile +pixi install + +# Commit regenerated lockfile +git add pixi.lock +git commit -m "Regenerate lockfile after merge" + +# Prevention: coordinate updates with team +# One person updates dependencies at a time +``` + +### Issue: Missing System Dependencies + +**Problem**: Package fails at runtime with "library not found" + 
+**Solution**: +```bash +# Check what's actually in environment +pixi list + +# Add system libraries explicitly +pixi add libgdal proj geos # for geospatial +pixi add hdf5 netcdf4 # for climate data +pixi add mkl # for optimized linear algebra + +# Use conda for everything when possible +# Don't mix system packages with conda packages +``` + +### Issue: Cannot Find Executable in Environment + +**Problem**: `pixi run mycommand` fails with "command not found" + +**Solution**: +```bash +# List all installed packages +pixi list + +# Check if package provides executable +pixi add --help # documentation + +# Ensure package is in active environment +[tool.pixi.feature.dev.dependencies] +mypackage = "*" + +[tool.pixi.environments] +default = { features = ["dev"] } # must include feature + +# Or run in specific environment +pixi run --environment dev mycommand +``` + +### Issue: Want to Use Both Pixi and Conda + +**Problem**: Existing conda environment, want to migrate gradually + +**Solution**: +```bash +# Export existing conda environment +conda env export > environment.yml + +# Import to pixi project +pixi init --format pyproject --import-environment environment.yml + +# Or manually alongside +conda activate myenv # activate conda env +pixi shell # activate pixi env (nested) + +# Long term: migrate fully to pixi +# Pixi replaces conda/mamba entirely +``` + +### Issue: Editable Install of Local Package + +**Problem**: Want to develop local package in pixi environment + +**Solution**: +```toml +[tool.pixi.pypi-dependencies] +mypackage = { path = ".", editable = true } + +# Or for relative paths +sibling-package = { path = "../sibling", editable = true } +``` + +```bash +# Install in development mode +pixi install + +# Changes to source immediately reflected +pixi run python -c "import mypackage; print(mypackage.__file__)" +``` + +### Issue: Need Different Python Versions + +**Problem**: Test across Python 3.10, 3.11, 3.12 + +**Solution**: +```toml 
+[tool.pixi.feature.py310.dependencies] +python = "3.10.*" + +[tool.pixi.feature.py311.dependencies] +python = "3.11.*" + +[tool.pixi.feature.py312.dependencies] +python = "3.12.*" + +[tool.pixi.environments] +py310 = { features = ["py310"], solve-group = "py310" } +py311 = { features = ["py311"], solve-group = "py311" } +py312 = { features = ["py312"], solve-group = "py312" } +``` + +```bash +# Test all versions +pixi run --environment py310 pytest +pixi run --environment py311 pytest +pixi run --environment py312 pytest +``` + +## Summary + +Pixi revolutionizes scientific Python development by unifying conda and PyPI ecosystems with blazing-fast dependency resolution, reproducible multi-platform lockfiles, and seamless environment management. By leveraging `pyproject.toml` integration, pixi provides a modern, standards-compliant approach to managing complex scientific dependencies while maintaining compatibility with the broader Python ecosystem. + +**Key advantages for scientific computing:** + +1. **Optimized Scientific Packages**: Access conda-forge's pre-built binaries for NumPy, SciPy, and other compiled packages with MKL/OpenBLAS optimizations +2. **Complex Dependencies Made Simple**: Handle challenging packages like GDAL, netCDF4, and HDF5 that require C/Fortran/C++ system libraries +3. **True Reproducibility**: Multi-platform lockfiles ensure identical environments across Linux, macOS, and Windows +4. **Flexible Environment Management**: Feature-based environments for dev/test/prod, GPU/CPU, or any custom configuration +5. **Fast and Reliable**: 10-100x faster than conda with Rust-based parallel dependency resolution +6. **Task Automation**: Built-in task runner for scientific workflows, testing, and documentation +7. 
**Best of Both Worlds**: Seamlessly mix conda-forge optimized packages with PyPI's vast ecosystem + +Whether you're conducting reproducible research, developing scientific software, or managing complex data analysis pipelines, pixi provides the robust foundation for modern scientific Python development. By replacing conda/mamba with pixi, you gain speed, reliability, and modern workflows while maintaining full access to the scientific Python ecosystem. + +**Ready to get started?** Install pixi, initialize your project with `pixi init --format pyproject`, and experience the future of scientific Python package management. diff --git a/skills/python-style-guide/LICENSE b/skills/python-style-guide/LICENSE new file mode 100644 index 0000000..2f244ac --- /dev/null +++ b/skills/python-style-guide/LICENSE @@ -0,0 +1,395 @@ +Attribution 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. 
+ + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. 
More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution 4.0 International Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution 4.0 International Public License ("Public License"). To the +extent this Public License may be interpreted as a contract, You are +granted the Licensed Rights in consideration of Your acceptance of +these terms and conditions, and the Licensor grants You such rights in +consideration of benefits the Licensor receives from making the +Licensed Material available under these terms and conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. 
For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + j. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + k. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. 
+ + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. 
Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. 
indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. 
UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. 
upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. 
+ + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. diff --git a/skills/python-style-guide/SKILL.md b/skills/python-style-guide/SKILL.md new file mode 100644 index 0000000..4543356 --- /dev/null +++ b/skills/python-style-guide/SKILL.md @@ -0,0 +1,482 @@ +--- +name: python-style-guide +description: Comprehensive Python programming guidelines based on Google's Python Style Guide. Use when Claude needs to write Python code, review Python code for style issues, refactor Python code, or provide Python programming guidance. Covers language rules (imports, exceptions, type annotations), style rules (naming conventions, formatting, docstrings), and best practices for clean, maintainable Python code. 
+license: Complete terms in LICENSE +--- + +# Python Style Guide + +Comprehensive guidelines for writing clean, maintainable Python code based on [Google's Python Style Guide](https://google.github.io/styleguide/pyguide.html). + +## Core Philosophy + +**BE CONSISTENT.** Match the style of the code around you. Use these guidelines as defaults, but always prioritize consistency with existing code. + +## Language Rules + +### Imports + +Use `import` statements for packages and modules only, not for individual classes or functions. + +**Yes:** +```python +from doctor.who import jodie +import sound_effects.utils +``` + +**No:** +```python +from sound_effects.utils import EffectsRegistry # Don't import classes directly +``` + +#### Import Formatting + +- Group imports: standard library, third-party, application-specific +- Alphabetize within each group +- Use absolute imports (not relative imports) +- One import per line (except for multiple items from `typing` or `collections.abc`) + +```python +# Standard library +import os +import sys + +# Third-party +import numpy as np +import tensorflow as tf + +# Application-specific +from myproject.backend import api_utils +``` + +### Exceptions + +Use exceptions appropriately. Do not suppress errors with bare `except:` clauses. + +**Yes:** +```python +try: + result = risky_operation() +except ValueError as e: + logging.error(f"Invalid value: {e}") + raise +``` + +**No:** +```python +try: + result = risky_operation() +except: # Too broad, hides bugs + pass +``` + +### Type Annotations + +Annotate all function signatures. Type annotations improve code readability and catch errors early. + +**General rules:** +- Annotate all public APIs +- Use built-in types (`list`, `dict`, `set`) instead of `typing.List`, etc.
(Python 3.9+) +- Import typing symbols directly: `from typing import Any, Union` +- Use `None` instead of `type(None)` or `NoneType` + +```python +def fetch_data(url: str, timeout: int = 30) -> dict[str, Any]: + """Fetch data from URL.""" + ... + +def process_items(items: list[str]) -> None: + """Process a list of items.""" + ... +``` + +### Default Argument Values + +Never use mutable objects as default values in function definitions. + +**Yes:** +```python +def foo(a: int, b: list[int] | None = None) -> None: + if b is None: + b = [] +``` + +**No:** +```python +def foo(a: int, b: list[int] = []) -> None: # Mutable default - WRONG! + b.append(a) +``` + +### True/False Evaluations + +Use implicit false where possible. Empty sequences, `None`, and `0` are false in boolean contexts. + +**Yes:** +```python +if not users: # Preferred +if not some_dict: +if value: +``` + +**No:** +```python +if len(users) == 0: # Verbose +if users == []: +if value == True: # Never compare to True/False explicitly +``` + +### Comprehensions & Generators + +Use comprehensions and generators for simple cases. Keep them readable. + +**Yes:** +```python +result = [x for x in data if x > 0] +squares = (x**2 for x in range(10)) +``` + +**No:** +```python +# Too complex +result = [ + x.strip().lower() for x in data + if x and len(x) > 5 and not x.startswith('#') + for y in x.split(',') if y +] # Use a regular loop instead +``` + +### Lambda Functions + +Use lambdas for one-liners only. For anything complex, define a proper function. + +**Yes:** +```python +sorted(data, key=lambda x: x.timestamp) +``` + +**Acceptable but prefer named function:** +```python +def get_timestamp(item): + return item.timestamp + +sorted(data, key=get_timestamp) +``` + +## Style Rules + +### Line Length + +Maximum line length: 80 characters. Exceptions allowed for imports, URLs, and long strings that can't be broken. + +### Indentation + +Use 4 spaces per indentation level. Never use tabs. 
+ +For hanging indents, align wrapped elements vertically or use 4-space hanging indent: + +```python +# Aligned with opening delimiter +foo = long_function_name(var_one, var_two, + var_three, var_four) + +# Hanging indent (4 spaces) +foo = long_function_name( + var_one, var_two, var_three, + var_four) +``` + +### Blank Lines + +- Two blank lines between top-level definitions +- One blank line between method definitions +- Use blank lines sparingly within functions to show logical sections + +### Naming Conventions + +| Type | Convention | Examples | +|------|-----------|----------| +| Packages/Modules | `lower_with_under` | `my_module.py` | +| Classes | `CapWords` | `MyClass` | +| Functions/Methods | `lower_with_under()` | `my_function()` | +| Constants | `CAPS_WITH_UNDER` | `MAX_SIZE` | +| Variables | `lower_with_under` | `my_var` | +| Private | `_leading_underscore` | `_private_var` | + +**Avoid:** +- Single character names except for counters/iterators (`i`, `j`, `k`) +- Dashes in any name +- `__double_leading_and_trailing_underscore__` (reserved for Python) + +### Comments and Docstrings + +#### Docstring Format + +Use Google-style docstrings for all public modules, functions, classes, and methods. + +**Function docstring:** +```python +def fetch_smalltable_rows( + table_handle: smalltable.Table, + keys: Sequence[bytes | str], + require_all_keys: bool = False, +) -> Mapping[bytes, tuple[str, ...]]: + """Fetches rows from a Smalltable. + + Retrieves rows pertaining to the given keys from the Table instance + represented by table_handle. String keys will be UTF-8 encoded. + + Args: + table_handle: An open smalltable.Table instance. + keys: A sequence of strings representing the key of each table + row to fetch. String keys will be UTF-8 encoded. + require_all_keys: If True, raise ValueError if any key is missing. + + Returns: + A dict mapping keys to the corresponding table row data + fetched. Each row is represented as a tuple of strings. 
+ + Raises: + IOError: An error occurred accessing the smalltable. + ValueError: A key is missing and require_all_keys is True. + """ + ... +``` + +**Class docstring:** +```python +class SampleClass: + """Summary of class here. + + Longer class information... + Longer class information... + + Attributes: + likes_spam: A boolean indicating if we like SPAM or not. + eggs: An integer count of the eggs we have laid. + """ + + def __init__(self, likes_spam: bool = False): + """Initializes the instance based on spam preference. + + Args: + likes_spam: Defines if instance exhibits this preference. + """ + self.likes_spam = likes_spam + self.eggs = 0 +``` + +#### Block and Inline Comments + +- Use complete sentences with proper capitalization +- Block comments indent to the same level as the code +- Inline comments should be separated by at least 2 spaces +- Use inline comments sparingly + +```python +# Block comment explaining the following code. +# Can span multiple lines. +x = x + 1 # Inline comment (use sparingly) +``` + +### Strings + +Use f-strings for formatting (Python 3.6+). + +**Yes:** +```python +x = f"name: {name}; score: {score}" +``` + +**Acceptable:** +```python +x = "name: %s; score: %d" % (name, score) +x = "name: {}; score: {}".format(name, score) +``` + +**No:** +```python +x = "name: " + name + "; score: " + str(score) # Avoid + for formatting +``` + +#### Logging + +Use `%` formatting for logging, not f-strings (allows lazy evaluation): + +```python +logging.info("Request from %s resulted in %d", ip_address, status_code) +``` + +### Files and Resources + +Always use context managers (`with` statements) for file operations: + +```python +with open("file.txt") as f: + data = f.read() +``` + +### Statements + +Generally avoid multiple statements on one line. + +**Yes:** +```python +if foo: + bar() +``` + +**No:** +```python +if foo: bar() # Avoid +``` + +### Main + +For executable scripts, use: + +```python +def main(): + ... 
+ +if __name__ == "__main__": + main() +``` + +### Function Length + +Keep functions focused and reasonably sized. If a function exceeds about 40 lines, consider splitting it unless it remains very readable. + +## Type Annotation Details + +### Forward Declarations + +Use string quotes for forward references: + +```python +class MyClass: + def method(self) -> "MyClass": + return self +``` + +### Type Aliases + +Create aliases for complex types: + +```python +from typing import TypeAlias + +ConnectionOptions: TypeAlias = dict[str, str] +Address: TypeAlias = tuple[str, int] +Server: TypeAlias = tuple[Address, ConnectionOptions] +``` + +### TypeVars + +Use descriptive names for TypeVars: + +```python +from typing import TypeVar + +_T = TypeVar("_T") # Good: private, unconstrained +AddableType = TypeVar("AddableType", int, float, str) # Good: descriptive +``` + +### Generics + +Always specify type parameters for generic types: + +**Yes:** +```python +def get_names(employee_ids: list[int]) -> dict[int, str]: + ... +``` + +**No:** +```python +def get_names(employee_ids: list) -> dict: # Missing type parameters + ... +``` + +### Imports for Typing + +Import typing symbols directly: + +```python +from collections.abc import Mapping, Sequence +from typing import Any, Union + +# Use built-in types for containers (Python 3.9+) +def foo(items: list[str]) -> dict[str, int]: + ... 
+``` + +## Common Patterns + +### Properties + +Use properties for simple attribute access: + +```python +class Square: + def __init__(self, side: float): + self._side = side + + @property + def area(self) -> float: + return self._side ** 2 +``` + +### Conditional Expressions + +Use ternary operators for simple conditions: + +```python +x = "yes" if condition else "no" +``` + +### Context Managers + +Create custom context managers when appropriate: + +```python +from contextlib import contextmanager + +@contextmanager +def managed_resource(*args, **kwargs): + resource = acquire_resource(*args, **kwargs) + try: + yield resource + finally: + release_resource(resource) +``` + +## Linting + +Run `pylint` on all Python code. Suppress warnings only when necessary with clear explanations: + +```python +dict = 'something' # pylint: disable=redefined-builtin +``` + +## Summary + +When writing Python code: + +1. Use type annotations for all functions +2. Follow naming conventions consistently +3. Write clear docstrings for all public APIs +4. Keep functions focused and reasonably sized +5. Use comprehensions for simple cases +6. Prefer implicit false in boolean contexts +7. Use f-strings for formatting +8. Always use context managers for resources +9. Run pylint and fix issues +10. 
**BE CONSISTENT** with existing code + +## Additional Resources + +For detailed reference on specific topics, see: + +- **references/advanced_types.md** - Advanced type annotation patterns including Protocol, TypedDict, Literal, ParamSpec, and more +- **references/antipatterns.md** - Common Python mistakes and their fixes +- **references/docstring_examples.md** - Comprehensive docstring examples for all Python constructs diff --git a/skills/python-style-guide/references/advanced_types.md b/skills/python-style-guide/references/advanced_types.md new file mode 100644 index 0000000..0ba7a69 --- /dev/null +++ b/skills/python-style-guide/references/advanced_types.md @@ -0,0 +1,259 @@ +# Advanced Type Annotations Reference + +This document provides detailed guidance on advanced type annotation patterns in Python. + +## Union Types + +Use `|` (union operator) for Python 3.10+ or `Union` for earlier versions: + +```python +# Python 3.10+ +def process(value: int | str) -> None: + ... + +# Python 3.9 and earlier +from typing import Union +def process(value: Union[int, str]) -> None: + ... +``` + +## Optional Types + +`Optional[X]` is shorthand for `X | None`: + +```python +from typing import Optional + +# These are equivalent: +def foo(x: Optional[int]) -> None: ... +def foo(x: int | None) -> None: ... # Preferred in Python 3.10+ +``` + +## Callable Types + +For function types, use `Callable`: + +```python +from collections.abc import Callable + +def apply_func(func: Callable[[int, int], int], x: int, y: int) -> int: + return func(x, y) + +# Callable[[arg1_type, arg2_type], return_type] +``` + +For functions with variable arguments: + +```python +# Use ... for variable arguments +def accepts_any_callable(func: Callable[..., int]) -> None: + ... 
+``` + +## Sequence, Mapping, and Iterable + +Use abstract types from `collections.abc` when you don't need specific container features: + +```python +from collections.abc import Sequence, Mapping, Iterable + +def process_items(items: Sequence[str]) -> None: + """Works with lists, tuples, or any sequence.""" + ... + +def process_mapping(data: Mapping[str, int]) -> None: + """Works with dicts or any mapping.""" + ... + +def sum_numbers(nums: Iterable[int]) -> int: + """Works with any iterable.""" + return sum(nums) +``` + +## Protocol and Structural Subtyping + +Define structural types using `Protocol`: + +```python +from typing import Protocol + +class Drawable(Protocol): + def draw(self) -> None: + ... + +def render(obj: Drawable) -> None: + obj.draw() # Any object with a draw() method works +``` + +## TypedDict for Structured Dictionaries + +Use `TypedDict` for dictionaries with known keys: + +```python +from typing import TypedDict + +class Employee(TypedDict): + name: str + id: int + department: str + +def process_employee(emp: Employee) -> None: + print(emp["name"]) # Type checker knows this key exists +``` + +Optional fields: + +```python +from typing import TypedDict, NotRequired + +class Employee(TypedDict): + name: str + id: int + department: NotRequired[str] # Optional field +``` + +## Literal Types + +Use `Literal` for specific values: + +```python +from typing import Literal + +def set_mode(mode: Literal["read", "write", "append"]) -> None: + ... 
+ +# Type checker ensures only these values are passed +set_mode("read") # OK +set_mode("delete") # Error +``` + +## Generic Classes + +Create generic classes with `Generic`: + +```python +from typing import Generic, TypeVar + +T = TypeVar("T") + +class Stack(Generic[T]): + def __init__(self) -> None: + self._items: list[T] = [] + + def push(self, item: T) -> None: + self._items.append(item) + + def pop(self) -> T: + return self._items.pop() + +# Usage +int_stack: Stack[int] = Stack() +int_stack.push(42) +``` + +## ParamSpec for Higher-Order Functions + +Use `ParamSpec` to preserve function signatures: + +```python +from typing import ParamSpec, TypeVar, Callable + +P = ParamSpec("P") +R = TypeVar("R") + +def log_calls(func: Callable[P, R]) -> Callable[P, R]: + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + print(f"Calling {func.__name__}") + return func(*args, **kwargs) + return wrapper + +@log_calls +def greet(name: str, excited: bool = False) -> str: + return f"Hello, {name}{'!' if excited else '.'}" + +# Type checker preserves the signature of greet +``` + +## TypeGuard for Type Narrowing + +Use `TypeGuard` for custom type checking functions: + +```python +from typing import TypeGuard + +def is_str_list(val: list[object]) -> TypeGuard[list[str]]: + return all(isinstance(x, str) for x in val) + +def process(items: list[object]) -> None: + if is_str_list(items): + # Type checker knows items is list[str] here + print(", ".join(items)) +``` + +## Annotating *args and **kwargs + +```python +def foo(*args: int, **kwargs: str) -> None: + # args is tuple[int, ...] + # kwargs is dict[str, str] + ... +``` + +## Overload for Multiple Signatures + +Use `@overload` for functions with different return types based on arguments: + +```python +from typing import overload + +@overload +def process(x: int) -> int: ... + +@overload +def process(x: str) -> str: ... 
class Builder:
    def add_item(self, item: str) -> Self:
        self.items.append(item)
        return self  # Return type is automatically the class type

    def build(self) -> dict[str, list[str]]:
        return {"items": self.items}
**Fix:**
```python
try:
    risky_operation()
except Exception as e:  # Or specific exception types
    logger.error("Operation failed: %s", e)
    handle_error()
```
Not Using enumerate() + +**Anti-pattern:** +```python +i = 0 +for item in items: + print(f"{i}: {item}") + i += 1 +``` + +**Fix:** +```python +for i, item in enumerate(items): + print(f"{i}: {item}") +``` + +## 10. Creating Empty Lists/Dicts Unnecessarily + +**Anti-pattern:** +```python +items = [] +items.append(1) +items.append(2) +items.append(3) +``` + +**Fix:** +```python +items = [1, 2, 3] +``` + +## 11. Not Using dict.get() with Defaults + +**Anti-pattern:** +```python +if key in my_dict: + value = my_dict[key] +else: + value = default +``` + +**Fix:** +```python +value = my_dict.get(key, default) +``` + +## 12. Using range(len()) Instead of enumerate() + +**Anti-pattern:** +```python +for i in range(len(items)): + item = items[i] + print(f"{i}: {item}") +``` + +**Fix:** +```python +for i, item in enumerate(items): + print(f"{i}: {item}") +``` + +## 13. Not Using Collections Module + +**Anti-pattern:** +```python +word_counts = {} +for word in words: + if word in word_counts: + word_counts[word] += 1 + else: + word_counts[word] = 1 +``` + +**Fix:** +```python +from collections import Counter +word_counts = Counter(words) +``` + +## 14. Not Using defaultdict + +**Anti-pattern:** +```python +groups = {} +for item in items: + key = get_key(item) + if key not in groups: + groups[key] = [] + groups[key].append(item) +``` + +**Fix:** +```python +from collections import defaultdict +groups = defaultdict(list) +for item in items: + key = get_key(item) + groups[key].append(item) +``` + +## 15. Overly Complex Comprehensions + +**Anti-pattern:** +```python +result = [ + transform(x) + for x in items + if condition1(x) + if condition2(x) + if condition3(x) + for y in x.sub_items + if condition4(y) +] # WRONG - too complex +``` + +**Fix:** +```python +result = [] +for x in items: + if condition1(x) and condition2(x) and condition3(x): + for y in x.sub_items: + if condition4(y): + result.append(transform(x)) +``` + +## 16. 
Not Using Path Objects + +**Anti-pattern:** +```python +import os +path = os.path.join(dir_name, "file.txt") +if os.path.exists(path): + with open(path) as f: + ... +``` + +**Fix:** +```python +from pathlib import Path +path = Path(dir_name) / "file.txt" +if path.exists(): + with path.open() as f: + ... +``` + +## 17. String Formatting with + or % + +**Anti-pattern:** +```python +message = "Hello, " + name + "! You have " + str(count) + " messages." +message = "Hello, %s! You have %d messages." % (name, count) +``` + +**Fix:** +```python +message = f"Hello, {name}! You have {count} messages." +``` + +## 18. Not Using dataclasses + +**Anti-pattern:** +```python +class Point: + def __init__(self, x, y): + self.x = x + self.y = y + + def __repr__(self): + return f"Point(x={self.x}, y={self.y})" + + def __eq__(self, other): + return self.x == other.x and self.y == other.y +``` + +**Fix:** +```python +from dataclasses import dataclass + +@dataclass +class Point: + x: float + y: float +``` + +## 19. Lambda Abuse + +**Anti-pattern:** +```python +process = lambda x: x.strip().lower().replace(" ", "_")[:20] # WRONG +``` + +**Fix:** +```python +def process(x: str) -> str: + """Clean and truncate string.""" + return x.strip().lower().replace(" ", "_")[:20] +``` + +## 20. Not Using Sets for Membership Testing + +**Anti-pattern:** +```python +valid_codes = ["A1", "A2", "A3", ...] # Long list +if code in valid_codes: # O(n) lookup + ... +``` + +**Fix:** +```python +valid_codes = {"A1", "A2", "A3", ...} # Set +if code in valid_codes: # O(1) lookup + ... +``` + +## Summary + +Key principles to avoid anti-patterns: + +1. Use built-in functions and standard library when possible +2. Leverage context managers for resource management +3. Use appropriate data structures (sets for membership, Counter for counting) +4. Keep code readable and idiomatic +5. Use modern Python features (f-strings, dataclasses, Path) +6. Avoid premature optimization +7. 
Write explicit, clear code over clever code diff --git a/skills/python-style-guide/references/docstring_examples.md b/skills/python-style-guide/references/docstring_examples.md new file mode 100644 index 0000000..a216b54 --- /dev/null +++ b/skills/python-style-guide/references/docstring_examples.md @@ -0,0 +1,384 @@ +# Docstring Examples + +Complete examples of Google-style docstrings for various Python constructs. + +## Module Docstring + +```python +"""This is an example module docstring. + +This module provides utilities for processing user data. It includes functions +for validation, transformation, and persistence of user information. + +Typical usage example: + + user = create_user("John Doe", "john@example.com") + validate_user(user) + save_user(user) +""" +``` + +## Function Docstrings + +### Simple Function + +```python +def greet(name: str) -> str: + """Returns a greeting message. + + Args: + name: The name of the person to greet. + + Returns: + A greeting string. + """ + return f"Hello, {name}!" +``` + +### Function with Multiple Arguments + +```python +def calculate_total( + price: float, + quantity: int, + discount: float = 0.0, + tax_rate: float = 0.0 +) -> float: + """Calculates the total cost including discount and tax. + + Args: + price: The unit price of the item. + quantity: The number of items. + discount: The discount as a decimal (e.g., 0.1 for 10% off). + Defaults to 0.0. + tax_rate: The tax rate as a decimal (e.g., 0.08 for 8% tax). + Defaults to 0.0. + + Returns: + The total cost after applying discount and tax. + + Raises: + ValueError: If price or quantity is negative. + """ + if price < 0 or quantity < 0: + raise ValueError("Price and quantity must be non-negative") + + subtotal = price * quantity * (1 - discount) + return subtotal * (1 + tax_rate) +``` + +### Function with Complex Return Type + +```python +def parse_config( + config_path: str +) -> tuple[dict[str, str], list[str]]: + """Parses a configuration file. 
+ + Args: + config_path: Path to the configuration file. + + Returns: + A tuple containing: + - A dictionary of configuration key-value pairs. + - A list of warning messages encountered during parsing. + + Raises: + FileNotFoundError: If the config file doesn't exist. + ValueError: If the config file is malformed. + """ + ... +``` + +### Function with Side Effects + +```python +def update_database( + user_id: int, + data: dict[str, Any] +) -> None: + """Updates user data in the database. + + Note: + This function modifies the database directly. Ensure proper + transaction handling in the calling code. + + Args: + user_id: The ID of the user to update. + data: Dictionary containing fields to update. + + Raises: + DatabaseError: If the database operation fails. + ValueError: If user_id is invalid or data is empty. + """ + ... +``` + +## Class Docstrings + +### Simple Class + +```python +class User: + """Represents a user in the system. + + Attributes: + username: The user's unique username. + email: The user's email address. + created_at: Timestamp when the user was created. + """ + + def __init__(self, username: str, email: str): + """Initializes a new User. + + Args: + username: The desired username. + email: The user's email address. + """ + self.username = username + self.email = email + self.created_at = datetime.now() +``` + +### Complex Class with Properties + +```python +class Rectangle: + """Represents a rectangle with width and height. + + This class provides methods for calculating area and perimeter, + and properties for accessing dimensions. + + Attributes: + width: The width of the rectangle. + height: The height of the rectangle. + + Example: + >>> rect = Rectangle(10, 5) + >>> rect.area + 50 + >>> rect.perimeter + 30 + """ + + def __init__(self, width: float, height: float): + """Initializes a Rectangle. + + Args: + width: The width of the rectangle. Must be positive. + height: The height of the rectangle. Must be positive. 
+ + Raises: + ValueError: If width or height is not positive. + """ + if width <= 0 or height <= 0: + raise ValueError("Width and height must be positive") + self._width = width + self._height = height + + @property + def width(self) -> float: + """Gets the width of the rectangle.""" + return self._width + + @width.setter + def width(self, value: float) -> None: + """Sets the width of the rectangle. + + Args: + value: The new width. Must be positive. + + Raises: + ValueError: If value is not positive. + """ + if value <= 0: + raise ValueError("Width must be positive") + self._width = value + + @property + def area(self) -> float: + """Calculates and returns the area of the rectangle.""" + return self._width * self._height + + @property + def perimeter(self) -> float: + """Calculates and returns the perimeter of the rectangle.""" + return 2 * (self._width + self._height) +``` + +## Generator Functions + +```python +def fibonacci(n: int) -> Iterator[int]: + """Generates the first n Fibonacci numbers. + + Args: + n: The number of Fibonacci numbers to generate. + + Yields: + The next Fibonacci number in the sequence. + + Raises: + ValueError: If n is negative. + + Example: + >>> list(fibonacci(5)) + [0, 1, 1, 2, 3] + """ + if n < 0: + raise ValueError("n must be non-negative") + + a, b = 0, 1 + for _ in range(n): + yield a + a, b = b, a + b +``` + +## Exception Classes + +```python +class InvalidUserError(Exception): + """Raised when user data is invalid. + + This exception is raised during user validation when the provided + data doesn't meet the required criteria. + + Attributes: + username: The invalid username that caused the error. + message: Explanation of the validation failure. + """ + + def __init__(self, username: str, message: str): + """Initializes the exception. + + Args: + username: The username that failed validation. + message: Description of why validation failed. 
+ """ + self.username = username + self.message = message + super().__init__(f"{username}: {message}") +``` + +## Context Manager + +```python +class DatabaseConnection: + """Context manager for database connections. + + Automatically handles connection setup and teardown. + + Example: + >>> with DatabaseConnection("localhost", 5432) as conn: + ... conn.execute("SELECT * FROM users") + """ + + def __init__(self, host: str, port: int): + """Initializes the database connection parameters. + + Args: + host: The database host address. + port: The database port number. + """ + self.host = host + self.port = port + self._connection = None + + def __enter__(self) -> "DatabaseConnection": + """Establishes the database connection. + + Returns: + The DatabaseConnection instance. + + Raises: + ConnectionError: If connection cannot be established. + """ + self._connection = create_connection(self.host, self.port) + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + """Closes the database connection. + + Args: + exc_type: The exception type, if an exception occurred. + exc_val: The exception value, if an exception occurred. + exc_tb: The exception traceback, if an exception occurred. + + Returns: + False to propagate exceptions, True to suppress them. + """ + if self._connection: + self._connection.close() + return False +``` + +## Async Functions + +```python +async def fetch_data(url: str, timeout: float = 30.0) -> dict[str, Any]: + """Asynchronously fetches data from a URL. + + Args: + url: The URL to fetch data from. + timeout: Maximum time to wait for response in seconds. + Defaults to 30.0. + + Returns: + A dictionary containing the fetched data. + + Raises: + aiohttp.ClientError: If the request fails. + asyncio.TimeoutError: If the request times out. 
+ + Example: + >>> data = await fetch_data("https://api.example.com/data") + """ + async with aiohttp.ClientSession() as session: + async with session.get(url, timeout=timeout) as response: + return await response.json() +``` + +## Test Functions + +```python +def test_user_creation(): + """Tests that User objects are created correctly. + + This test verifies: + - Username is set correctly + - Email is set correctly + - created_at is set to current time + """ + user = User("john_doe", "john@example.com") + assert user.username == "john_doe" + assert user.email == "john@example.com" + assert isinstance(user.created_at, datetime) +``` + +## Docstring Sections + +Common sections in Google-style docstrings: + +- **Args:** Function/method parameters +- **Returns:** Return value description +- **Yields:** For generator functions +- **Raises:** Exceptions that may be raised +- **Attributes:** For classes, describes instance attributes +- **Example:** Usage examples +- **Note:** Important notes or warnings +- **Warning:** Critical warnings +- **Todo:** Planned improvements +- **See Also:** Related functions or classes + +## Style Guidelines + +1. Use triple double quotes (`"""`) for all docstrings +2. First line is a brief summary (one sentence, no period needed if one line) +3. Leave a blank line before sections (Args, Returns, etc.) +4. Capitalize section headers +5. Use imperative mood ("Returns" not "Return") +6. Be specific and concise +7. Include type information in Args and Returns when not obvious from annotations +8. Always document exceptions that can be raised +9. Include examples for complex functions +10. 
Keep line length under 80 characters where possible diff --git a/skills/r-development/SKILL.md b/skills/r-development/SKILL.md new file mode 100644 index 0000000..853c299 --- /dev/null +++ b/skills/r-development/SKILL.md @@ -0,0 +1,214 @@ +--- +name: r-development +description: Modern R development practices emphasizing tidyverse patterns (dplyr 1.1 and later, native pipe, join_by, .by grouping), rlang metaprogramming, performance optimization, and package development. Use when Claude needs to write R code, create R packages, optimize R performance, or provide R programming guidance. +--- + +# R Development + +This skill provides comprehensive guidance for modern R development, emphasizing current best practices with tidyverse, performance optimization, and professional package development. + +## Core Principles + +1. **Use modern tidyverse patterns** - Prioritize dplyr 1.1+ features, native pipe, and current APIs +2. **Profile before optimizing** - Use profvis and bench to identify real bottlenecks +3. **Write readable code first** - Optimize only when necessary and after profiling +4. 
**Follow tidyverse style guide** - Consistent naming, spacing, and structure + +## Modern Tidyverse Essentials + +### Native Pipe (`|>` not `%>%`) + +Always use native pipe `|>` instead of magrittr `%>%` (R 4.1+): + +```r +# Modern +data |> + filter(year >= 2020) |> + summarise(mean_value = mean(value)) + +# Avoid legacy pipe +data %>% filter(year >= 2020) +``` + +### Join Syntax (dplyr 1.1+) + +Use `join_by()` for all joins: + +```r +# Modern join syntax with equality +transactions |> + inner_join(companies, by = join_by(company == id)) + +# Inequality joins +transactions |> + inner_join(companies, join_by(company == id, year >= since)) + +# Rolling joins (closest match) +transactions |> + inner_join(companies, join_by(company == id, closest(year >= since))) +``` + +Control match behavior: + +```r +# Expect 1:1 matches +inner_join(x, y, by = join_by(id), multiple = "error") + +# Ensure all rows match +inner_join(x, y, by = join_by(id), unmatched = "error") +``` + +### Per-Operation Grouping with `.by` + +Use `.by` instead of `group_by() |> ... |> ungroup()`: + +```r +# Modern approach (always returns ungrouped) +data |> + summarise(mean_value = mean(value), .by = category) + +# Multiple grouping variables +data |> + summarise(total = sum(revenue), .by = c(company, year)) +``` + +### Column Operations + +Use modern column selection and transformation functions: + +```r +# pick() for column selection in data-masking contexts +data |> + summarise( + n_x_cols = ncol(pick(starts_with("x"))), + n_y_cols = ncol(pick(starts_with("y"))) + ) + +# across() for applying functions to multiple columns +data |> + summarise(across(where(is.numeric), mean, .names = "mean_{.col}"), .by = group) + +# reframe() for multi-row results per group +data |> + reframe(quantiles = quantile(x, c(0.25, 0.5, 0.75)), .by = group) +``` + +## rlang Metaprogramming + +For comprehensive rlang patterns, see [references/rlang-patterns.md](references/rlang-patterns.md). 
+ +### Quick Reference + +- **`{{}}`** - Forward function arguments to data-masking functions +- **`!!`** - Inject single expressions or values +- **`!!!`** - Inject multiple arguments from a list +- **`.data[[]]`** - Access columns by name (character vectors) +- **`pick()`** - Select columns inside data-masking functions + +Example function with embracing: + +```r +my_summary <- function(data, group_var, summary_var) { + data |> + summarise(mean_val = mean({{ summary_var }}), .by = {{ group_var }}) +} +``` + +## Performance Optimization + +For detailed performance guidance, see [references/performance.md](references/performance.md). + +### Key Strategies + +1. **Profile first**: Use `profvis::profvis()` and `bench::mark()` +2. **Vectorize operations**: Avoid loops when vectorized alternatives exist +3. **Use dtplyr**: For large data operations (lazy evaluation with data.table backend) +4. **Parallel processing**: Use `furrr::future_map()` for parallelizable work +5. **Memory efficiency**: Pre-allocate, use appropriate data types + +Quick example: + +```r +# Profile code +profvis::profvis({ + result <- data |> + complex_operation() |> + another_operation() +}) + +# Benchmark alternatives +bench::mark( + approach_1 = method1(data), + approach_2 = method2(data), + check = FALSE +) +``` + +## Package Development + +For complete package development guidance, see [references/package-development.md](references/package-development.md). 
+ +### Quick Guidelines + +**API Design:** +- Use `.by` parameter for per-operation grouping +- Use `{{}}` for column arguments +- Return tibbles consistently +- Validate user-facing function inputs thoroughly + +**Dependencies:** +- Add dependencies for significant functionality gains +- Core tidyverse packages usually worth including: dplyr, purrr, stringr, tidyr +- Minimize dependencies for widely-used packages + +**Testing:** +- Unit tests for individual functions +- Integration tests for workflows +- Test edge cases and error conditions + +**Documentation:** +- Document all exported functions +- Provide usage examples +- Explain non-obvious parameter interactions + +## Common Migration Patterns + +### Base R → Tidyverse + +```r +# Data manipulation +subset(data, condition) → filter(data, condition) +data[order(data$x), ] → arrange(data, x) +aggregate(x ~ y, data, mean) → summarise(data, mean(x), .by = y) + +# Functional programming +sapply(x, f) → map(x, f) # type-stable +lapply(x, f) → map(x, f) + +# Strings +grepl("pattern", text) → str_detect(text, "pattern") +gsub("old", "new", text) → str_replace_all(text, "old", "new") +``` + +### Old → New Tidyverse + +```r +# Pipes +%>% → |> + +# Grouping +group_by() |> ... 
|> ungroup() → summarise(..., .by = x) + +# Joins +by = c("a" = "b") → by = join_by(a == b) + +# Reshaping +gather()/spread() → pivot_longer()/pivot_wider() +``` + +## Additional Resources + +- **rlang patterns**: See [references/rlang-patterns.md](references/rlang-patterns.md) for comprehensive data-masking and metaprogramming guidance +- **Performance optimization**: See [references/performance.md](references/performance.md) for profiling, benchmarking, and optimization strategies +- **Package development**: See [references/package-development.md](references/package-development.md) for complete package creation guidance +- **Object systems**: See [references/object-systems.md](references/object-systems.md) for S3, S4, S7, R6, and vctrs guidance diff --git a/skills/r-development/references/object-systems.md b/skills/r-development/references/object-systems.md new file mode 100644 index 0000000..c9a2e87 --- /dev/null +++ b/skills/r-development/references/object-systems.md @@ -0,0 +1,310 @@ +# Object-Oriented Programming in R + +## S7: Modern OOP for New Projects + +S7 combines S3 simplicity with S4 structure: +- Formal class definitions with automatic validation +- Compatible with existing S3 code +- Better error messages and discoverability + +```r +# S7 class definition +Range <- new_class("Range", + properties = list( + start = class_double, + end = class_double + ), + validator = function(self) { + if (self@end < self@start) { + "@end must be >= @start" + } + } +) + +# Usage - constructor and property access +x <- Range(start = 1, end = 10) +x@start # 1 +x@end <- 20 # automatic validation + +# Methods +inside <- new_generic("inside", "x") +method(inside, Range) <- function(x, y) { + y >= x@start & y <= x@end +} +``` + +## OOP System Decision Matrix + +### Decision Tree: What Are You Building? + +#### 1. 
Vector-like Objects + +**Use vctrs when:** +- ✓ Need data frame integration (columns/rows) +- ✓ Want type-stable vector operations +- ✓ Building factor-like, date-like, or numeric-like classes +- ✓ Need consistent coercion/casting behavior +- ✓ Working with existing tidyverse infrastructure + +**Examples:** custom date classes, units, categorical data + +```r +# Vector-like behavior in data frames +percent <- function(x = double()) new_vctr(vec_cast(x, double()), class = "percentage") +data.frame(x = 1:3, pct = percent(c(0.1, 0.2, 0.3))) # works seamlessly + +# Type-stable operations +vec_c(percent(0.1), percent(0.2)) # predictable behavior +vec_cast(0.5, percent()) # explicit, safe casting +``` + +#### 2. General Objects (Complex Data Structures) + +**Use S7 when:** +- ✓ NEW projects that need formal classes +- ✓ Want property validation and safe property access (@) +- ✓ Need multiple dispatch (beyond S3's double dispatch) +- ✓ Converting from S3 and want better structure +- ✓ Building class hierarchies with inheritance +- ✓ Want better error messages and discoverability + +```r +# Complex validation needs +Range <- new_class("Range", +  properties = list(start = class_double, end = class_double), +  validator = function(self) { +    if (self@end < self@start) "@end must be >= @start" +  } +) + +# Multiple dispatch needs +method(generic, list(ClassA, ClassB)) <- function(x, y) ... + +# Class hierarchies with clear inheritance +Child <- new_class("Child", parent = Parent) +``` + +**Use S3 when:** +- ✓ Simple classes with minimal structure needs +- ✓ Maximum compatibility and minimal dependencies +- ✓ Quick prototyping or internal classes +- ✓ Contributing to existing S3-based ecosystems +- ✓ Performance is absolutely critical (minimal overhead) + +```r +# Simple classes without complex needs +new_simple <- function(x) structure(x, class = "simple") +print.simple <- function(x, ...) 
cat("Simple:", x) +``` + +**Use S4 when:** +- ✓ Working in Bioconductor ecosystem +- ✓ Need complex multiple inheritance (S7 doesn't support this) +- ✓ Existing S4 codebase that works well + +**Use R6 when:** +- ✓ Need reference semantics (mutable objects) +- ✓ Building stateful objects +- ✓ Coming from OOP languages like Python/Java +- ✓ Need encapsulation and private methods + +## Detailed S7 vs S3 Comparison + +| Feature | S3 | S7 | When S7 wins | +|---------|----|----|---------------| +| **Class definition** | Informal (convention) | Formal (`new_class()`) | Need guaranteed structure | +| **Property access** | `$` or `attr()` (unsafe) | `@` (safe, validated) | Property validation matters | +| **Validation** | Manual, inconsistent | Built-in validators | Data integrity important | +| **Method discovery** | Hard to find methods | Clear method printing | Developer experience matters | +| **Multiple dispatch** | Limited (base generics) | Full multiple dispatch | Complex method dispatch needed | +| **Inheritance** | Informal, `NextMethod()` | Explicit `super()` | Predictable inheritance needed | +| **Migration cost** | - | Low (1-2 hours) | Want better structure | +| **Performance** | Fastest | ~Same as S3 | Performance difference negligible | +| **Compatibility** | Full S3 | Full S3 + S7 | Need both old and new patterns | + +## vctrs for Vector Classes + +### Basic Vector Class + +```r +# Constructor (low-level) +new_percent <- function(x = double()) { + vec_assert(x, double()) + new_vctr(x, class = "pkg_percent") +} + +# Helper (user-facing) +percent <- function(x = double()) { + x <- vec_cast(x, double()) + new_percent(x) +} + +# Format method +format.pkg_percent <- function(x, ...) { + paste0(vec_data(x) * 100, "%") +} +``` + +### Coercion Methods + +```r +# Self-coercion +vec_ptype2.pkg_percent.pkg_percent <- function(x, y, ...) { + new_percent() +} + +# With double +vec_ptype2.pkg_percent.double <- function(x, y, ...) 
double() +vec_ptype2.double.pkg_percent <- function(x, y, ...) double() + +# Casting +vec_cast.pkg_percent.double <- function(x, to, ...) { + new_percent(x) +} +vec_cast.double.pkg_percent <- function(x, to, ...) { + vec_data(x) +} +``` + +## S3 Basics + +### Creating S3 Classes + +```r +# Constructor +new_myclass <- function(x, y) { + structure( + list(x = x, y = y), + class = "myclass" + ) +} + +# Methods +print.myclass <- function(x, ...) { + cat("myclass object\n") + cat("x:", x$x, "\n") + cat("y:", x$y, "\n") +} + +summary.myclass <- function(object, ...) { + list(x = object$x, y = object$y) +} +``` + +### Generic Functions + +```r +# Create generic +my_generic <- function(x, ...) { + UseMethod("my_generic") +} + +# Default method +my_generic.default <- function(x, ...) { + stop("No method for class ", class(x)) +} + +# Specific method +my_generic.myclass <- function(x, ...) { + # Implementation +} +``` + +## R6 Classes + +### Basic R6 Class + +```r +library(R6) + +MyClass <- R6Class("MyClass", + public = list( + x = NULL, + y = NULL, + + initialize = function(x, y) { + self$x <- x + self$y <- y + }, + + add = function() { + self$x + self$y + } + ), + + private = list( + internal_value = NULL + ) +) + +# Usage +obj <- MyClass$new(1, 2) +obj$add() # 3 +``` + +## Migration Strategy + +### S3 → S7 + +Usually 1-2 hours work, keeps full compatibility: + +```r +# S3 version +new_range <- function(start, end) { + structure( + list(start = start, end = end), + class = "range" + ) +} + +# S7 version +Range <- new_class("Range", + properties = list( + start = class_double, + end = class_double + ) +) +``` + +### S4 → S7 + +More complex, evaluate if S4 features are actually needed. + +### Base R → vctrs + +For vector-like classes, significant benefits in type stability and data frame integration. + +### Combining Approaches + +S7 classes can use vctrs principles internally for vector-like properties. 
+ +## When to Use Each System + +### Use S7 for: +- New projects needing formal OOP +- Class validation and type safety +- Multiple dispatch +- Better developer experience + +### Use vctrs for: +- Vector-like classes +- Data frame columns +- Type-stable operations +- Tidyverse integration + +### Use S3 for: +- Simple classes +- Maximum compatibility +- Existing S3 ecosystems +- Quick prototypes + +### Use S4 for: +- Bioconductor packages +- Complex multiple inheritance +- Existing S4 codebases + +### Use R6 for: +- Mutable state +- Reference semantics +- Encapsulation needs +- Coming from OOP languages diff --git a/skills/r-development/references/package-development.md b/skills/r-development/references/package-development.md new file mode 100644 index 0000000..5b2359f --- /dev/null +++ b/skills/r-development/references/package-development.md @@ -0,0 +1,393 @@ +# Package Development + +## Dependency Strategy + +### When to Add Dependencies vs Base R + +```r +# Add dependency when: +✓ Significant functionality gain +✓ Maintenance burden reduction +✓ User experience improvement +✓ Complex implementation (regex, dates, web) + +# Use base R when: +✓ Simple utility functions +✓ Package will be widely used (minimize deps) +✓ Dependency is large for small benefit +✓ Base R solution is straightforward + +# Example decisions: +str_detect(x, "pattern") # Worth stringr dependency +length(x) > 0 # Don't need purrr for this +parse_dates(x) # Worth lubridate dependency +x + 1 # Don't need dplyr for this +``` + +### Tidyverse Dependency Guidelines + +```r +# Core tidyverse (usually worth it): +dplyr # Complex data manipulation +purrr # Functional programming, parallel +stringr # String manipulation +tidyr # Data reshaping + +# Specialized tidyverse (evaluate carefully): +lubridate # If heavy date manipulation +forcats # If many categorical operations +readr # If specific file reading needs +ggplot2 # If package creates visualizations + +# Heavy dependencies (use sparingly): 
+tidyverse # Meta-package, very heavy +shiny # Only for interactive apps +``` + +## API Design Patterns + +### Function Design Strategy + +```r +# Modern tidyverse API patterns + +# 1. Use .by for per-operation grouping +my_summarise <- function(.data, ..., .by = NULL) { + # Support modern grouped operations +} + +# 2. Use {{ }} for user-provided columns +my_select <- function(.data, cols) { + .data |> select({{ cols }}) +} + +# 3. Use ... for flexible arguments +my_mutate <- function(.data, ..., .by = NULL) { + .data |> mutate(..., .by = {{ .by }}) +} + +# 4. Return consistent types (tibbles, not data.frames) +my_function <- function(.data) { + result |> tibble::as_tibble() +} +``` + +### Input Validation Strategy + +```r +# Validation level by function type: + +# User-facing functions - comprehensive validation +user_function <- function(x, threshold = 0.5) { + # Check all inputs thoroughly + if (!is.numeric(x)) stop("x must be numeric") + if (!is.numeric(threshold) || length(threshold) != 1) { + stop("threshold must be a single number") + } + # ... function body +} + +# Internal functions - minimal validation +.internal_function <- function(x, threshold) { + # Assume inputs are valid (document assumptions) + # Only check critical invariants + # ... function body +} + +# Package functions with vctrs - type-stable validation +safe_function <- function(x, y) { + x <- vec_cast(x, double()) + y <- vec_cast(y, double()) + # Automatic type checking and coercion +} +``` + +## Error Handling Patterns + +```r +# Good error messages - specific and actionable +if (length(x) == 0) { + cli::cli_abort( + "Input {.arg x} cannot be empty.", + "i" = "Provide a non-empty vector." 
+ ) +} + +# Include function name in errors +validate_input <- function(x, call = caller_env()) { + if (!is.numeric(x)) { + cli::cli_abort("Input must be numeric", call = call) + } +} + +# Use consistent error styling +# cli package for user-friendly messages +# rlang for developer tools +``` + +## When to Create Internal vs Exported Functions + +### Export Function When: + +```r +✓ Users will call it directly +✓ Other packages might want to extend it +✓ Part of the core package functionality +✓ Stable API that won't change often + +# Example: main data processing functions +export_these <- function(.data, ...) { + # Comprehensive input validation + # Full documentation required + # Stable API contract +} +``` + +### Keep Function Internal When: + +```r +✓ Implementation detail that may change +✓ Only used within package +✓ Complex implementation helpers +✓ Would clutter user-facing API + +# Example: helper functions +.internal_helper <- function(x, y) { + # Minimal documentation + # Can change without breaking users + # Assume inputs are pre-validated +} +``` + +## Testing and Documentation Strategy + +### Testing Levels + +```r +# Unit tests - individual functions +test_that("function handles edge cases", { + expect_equal(my_func(c()), expected_empty_result) + expect_error(my_func(NULL), class = "my_error_class") +}) + +# Integration tests - workflow combinations +test_that("pipeline works end-to-end", { + result <- data |> + step1() |> + step2() |> + step3() + expect_s3_class(result, "expected_class") +}) + +# Property-based tests for package functions +test_that("function properties hold", { + # Test invariants across many inputs +}) +``` + +### Testing rlang Functions + +```r +# Test data-masking behavior +test_that("function supports data masking", { + result <- my_function(mtcars, cyl) + expect_equal(names(result), "mean_cyl") + + # Test with expressions + result2 <- my_function(mtcars, cyl * 2) + expect_true("mean_cyl * 2" %in% names(result2)) +}) + +# Test 
injection behavior +test_that("function supports injection", { +  var <- "cyl" +  result <- my_function(mtcars, !!sym(var)) +  expect_true(nrow(result) > 0) +}) +``` + +### Documentation Priorities + +```r +# Must document: +✓ All exported functions +✓ Complex algorithms or formulas +✓ Non-obvious parameter interactions +✓ Examples of typical usage + +# Can skip documentation: +✗ Simple internal helpers +✗ Obvious parameter meanings +✗ Functions that just call other functions +``` + +### Documentation Tags for rlang + +```r +#' @param var <[`data-masked`][dplyr::dplyr_data_masking]> Column to summarize +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Additional grouping variables +#' @param cols <[`tidy-select`][dplyr::dplyr_tidy_select]> Columns to select +``` + +## Package Structure + +### DESCRIPTION File + +```r +Package: mypackage +Title: What the Package Does (One Line, Title Case) +Version: 0.1.0 +Authors@R: person("First", "Last", email = "email@example.com", role = c("aut", "cre")) +Description: What the package does (one paragraph). +License: MIT + file LICENSE +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.2.3 +Imports: + dplyr (>= 1.1.0), + rlang (>= 1.1.0), + cli +Suggests: + testthat (>= 3.0.0) +Config/testthat/edition: 3 +``` + +### NAMESPACE Management + +Use roxygen2 for NAMESPACE management: + +```r +# Import specific functions +#' @importFrom rlang := enquo enquos +#' @importFrom dplyr mutate filter + +# Or import entire packages (use sparingly) +#' @import dplyr +``` + +### rlang Import Strategy + +```r +# In DESCRIPTION: +Imports: rlang + +# In NAMESPACE, import specific functions: +importFrom(rlang, enquo, enquos, expr, ":=") +# Note: `!!` and `!!!` are injection operators interpreted by rlang's +# quoting functions, not exported functions, so they cannot be imported. + +# Or import key functions: +#' @importFrom rlang := enquo enquos +``` + +## Naming Conventions + +```r +# Good naming: snake_case for variables/functions +calculate_mean_score <- function(data, score_col) { +  # Function body +} + +# Prefix non-standard arguments with . 
+my_function <- function(.data, ...) { + # Reduces argument conflicts +} + +# Internal functions start with . +.internal_helper <- function(x, y) { + # Not exported +} +``` + +## Style Guide Essentials + +### Object Names + +- Use snake_case for all names +- Variable names = nouns, function names = verbs +- Avoid dots except for S3 methods + +```r +# Good +day_one +calculate_mean +user_data + +# Avoid +DayOne +calculate.mean +userData +``` + +### Spacing and Layout + +```r +# Good spacing +x[, 1] +mean(x, na.rm = TRUE) +if (condition) { + action() +} + +# Pipe formatting +data |> + filter(year >= 2020) |> + group_by(category) |> + summarise( + mean_value = mean(value), + count = n() + ) +``` + +## Package Development Workflow + +1. **Setup**: Use `usethis::create_package()` +2. **Add functions**: Place in `R/` directory +3. **Document**: Use roxygen2 comments +4. **Test**: Write tests in `tests/testthat/` +5. **Check**: Run `devtools::check()` +6. **Build**: Use `devtools::build()` +7. **Install**: Use `devtools::install()` + +### Key usethis Functions + +```r +# Initial setup +usethis::create_package("mypackage") +usethis::use_git() +usethis::use_mit_license() + +# Add dependencies +usethis::use_package("dplyr") +usethis::use_package("testthat", "Suggests") + +# Add infrastructure +usethis::use_readme_md() +usethis::use_news_md() +usethis::use_testthat() + +# Add files +usethis::use_r("my_function") +usethis::use_test("my_function") +usethis::use_vignette("introduction") +``` + +## Common Pitfalls + +### What to Avoid + +```r +# Don't use library() in packages +# Use Imports in DESCRIPTION instead + +# Don't use source() +# Use proper function dependencies + +# Don't use attach() +# Always use explicit :: notation + +# Don't modify global options without restoring +old <- options(stringsAsFactors = FALSE) +on.exit(options(old), add = TRUE) + +# Don't use setwd() +# Use here::here() or relative paths +``` diff --git a/skills/r-development/references/performance.md 
b/skills/r-development/references/performance.md new file mode 100644 index 0000000..15ebf60 --- /dev/null +++ b/skills/r-development/references/performance.md @@ -0,0 +1,311 @@ +# Performance Optimization + +## Performance Tool Selection Guide + +### Profiling Tools Decision Matrix + +| Tool | Use When | Don't Use When | What It Shows | +|------|----------|----------------|---------------| +| **`profvis`** | Complex code, unknown bottlenecks | Simple functions, known issues | Time per line, call stack | +| **`bench::mark()`** | Comparing alternatives | Single approach | Relative performance, memory | +| **`system.time()`** | Quick checks | Detailed analysis | Total runtime only | +| **`Rprof()`** | Base R only environments | When profvis available | Raw profiling data | + +### Step-by-Step Performance Workflow + +```r +# 1. Profile first - find the actual bottlenecks +library(profvis) +profvis({ + # Your slow code here +}) + +# 2. Focus on the slowest parts (80/20 rule) +# Don't optimize until you know where time is spent + +# 3. Benchmark alternatives for hot spots +library(bench) +bench::mark( + current = current_approach(data), + vectorized = vectorized_approach(data), + parallel = map(data, in_parallel(func)) +) + +# 4. 
Consider tool trade-offs based on bottleneck type +``` + +## When Each Tool Helps vs Hurts + +### Parallel Processing (`in_parallel()`) + +```r +# Helps when: +✓ CPU-intensive computations +✓ Embarrassingly parallel problems +✓ Large datasets with independent operations +✓ I/O bound operations (file reading, API calls) + +# Hurts when: +✗ Simple, fast operations (overhead > benefit) +✗ Memory-intensive operations (may cause thrashing) +✗ Operations requiring shared state +✗ Small datasets + +# Example decision point: +expensive_func <- function(x) Sys.sleep(0.1) # 100ms per call +fast_func <- function(x) x^2 # microseconds per call + +# Good for parallel +map(1:100, in_parallel(expensive_func)) # ~10s -> ~2.5s on 4 cores + +# Bad for parallel (overhead > benefit) +map(1:100, in_parallel(fast_func)) # 100μs -> 50ms (500x slower!) +``` + +### vctrs Backend Tools + +```r +# Use vctrs when: +✓ Type safety matters more than raw speed +✓ Building reusable package functions +✓ Complex coercion/combination logic +✓ Consistent behavior across edge cases + +# Avoid vctrs when: +✗ One-off scripts where speed matters most +✗ Simple operations where base R is sufficient +✗ Memory is extremely constrained + +# Decision point: +simple_combine <- function(x, y) c(x, y) # Fast, simple +robust_combine <- function(x, y) vec_c(x, y) # Safer, slight overhead + +# Use simple for hot loops, robust for package APIs +``` + +### Data Backend Selection + +```r +# Use data.table when: +✓ Very large datasets (>1GB) +✓ Complex grouping operations +✓ Reference semantics desired +✓ Maximum performance critical + +# Use dplyr when: +✓ Readability and maintainability priority +✓ Complex joins and window functions +✓ Team familiarity with tidyverse +✓ Moderate sized data (<100MB) + +# Use dtplyr (dplyr with data.table backend) when: +✓ Want dplyr syntax with data.table performance +✓ Large data but team prefers tidyverse +✓ Lazy evaluation desired + +# Use base R when: +✓ No dependencies allowed +✓ 
Simple operations +✓ Teaching/learning contexts +``` + +## Profiling Best Practices + +```r +# 1. Profile realistic data sizes +profvis({ + # Use actual data size, not toy examples + real_data |> your_analysis() +}) + +# 2. Profile multiple runs for stability +bench::mark( + your_function(data), + min_iterations = 10, # Multiple runs + max_iterations = 100 +) + +# 3. Check memory usage too +bench::mark( + approach1 = method1(data), + approach2 = method2(data), + check = FALSE, # If outputs differ slightly + filter_gc = FALSE # Include GC time +) + +# 4. Profile with realistic usage patterns +# Not just isolated function calls +``` + +## Performance Anti-Patterns to Avoid + +```r +# Don't optimize without measuring +# ✗ "This looks slow" -> immediately rewrite +# ✓ Profile first, optimize bottlenecks + +# Don't over-engineer for performance +# ✗ Complex optimizations for 1% gains +# ✓ Focus on algorithmic improvements + +# Don't assume - measure +# ✗ "for loops are always slow in R" +# ✓ Benchmark your specific use case + +# Don't ignore readability costs +# ✗ Unreadable code for minor speedups +# ✓ Readable code with targeted optimizations + +# Don't grow objects in loops +# ✗ result <- c(); for(i in 1:n) result <- c(result, x[i]) +# ✓ result <- vector("list", n); for(i in 1:n) result[[i]] <- x[i] +``` + +## Modern purrr Patterns for Performance + +Use modern purrr 1.0+ patterns: + +```r +# Modern data frame row binding (purrr 1.0+) +models <- data_splits |> + map(\(split) train_model(split)) |> + list_rbind() # Replaces map_dfr() + +# Column binding +summaries <- data_list |> + map(\(df) get_summary_stats(df)) |> + list_cbind() # Replaces map_dfc() + +# Side effects with walk() +plots <- walk2(data_list, plot_names, \(df, name) { + p <- ggplot(df, aes(x, y)) + geom_point() + ggsave(name, p) +}) + +# Parallel processing (purrr 1.1.0+) +library(mirai) +daemons(4) +results <- large_datasets |> + map(in_parallel(expensive_computation)) +daemons(0) +``` + +## 
Vectorization + +```r +# Good - vectorized operations +result <- x + y + +# Good - Type-stable purrr functions +map_dbl(data, mean) # always returns double +map_chr(data, class) # always returns character + +# Avoid - Type-unstable base functions +sapply(data, mean) # might return list or vector + +# Avoid - explicit loops for simple operations +result <- numeric(length(x)) +for(i in seq_along(x)) { + result[i] <- x[i] + y[i] +} +``` + +## Using dtplyr for Large Data + +For large datasets, use dtplyr to get data.table performance with dplyr syntax: + +```r +library(dtplyr) + +# Convert to lazy data.table +large_data_dt <- lazy_dt(large_data) + +# Use dplyr syntax as normal +result <- large_data_dt |> + filter(year >= 2020) |> + group_by(category) |> + summarise( + total = sum(value), + avg = mean(value) + ) |> + as_tibble() # Convert back to tibble + +# See generated data.table code +result |> show_query() +``` + +## Memory Optimization + +```r +# Pre-allocate vectors +result <- vector("numeric", n) + +# Use appropriate data types +# integer instead of double when possible +x <- 1:1000 # integer +y <- seq(1, 1000, by = 1) # double + +# Remove large objects when done +rm(large_object) +gc() # Force garbage collection if needed + +# Use data.table for large data +library(data.table) +dt <- as.data.table(large_df) +dt[, new_col := old_col * 2] # Modifies in place +``` + +## String Manipulation Performance + +Use stringr over base R for consistency and performance: + +```r +# Good - stringr (consistent, pipe-friendly) +text |> + str_to_lower() |> + str_trim() |> + str_replace_all("pattern", "replacement") |> + str_extract("\\d+") + +# Common patterns +str_detect(text, "pattern") # vs grepl("pattern", text) +str_extract(text, "pattern") # vs complex regmatches() +str_replace_all(text, "a", "b") # vs gsub("a", "b", text) +str_split(text, ",") # vs strsplit(text, ",") +str_length(text) # vs nchar(text) +str_sub(text, 1, 5) # vs substr(text, 1, 5) +``` + +## When to Use 
vctrs + +### Core Benefits +- **Type stability** - Predictable output types regardless of input values +- **Size stability** - Predictable output sizes from input sizes +- **Consistent coercion rules** - Single set of rules applied everywhere +- **Robust class design** - Proper S3 vector infrastructure + +### Use vctrs when: + +```r +# Type-Stable Functions in Packages +my_function <- function(x, y) { + # Always returns double, regardless of input values + vec_cast(result, double()) +} + +# Consistent Coercion/Casting +vec_cast(x, double()) # Clear intent, predictable behavior +vec_ptype_common(x, y, z) # Finds richest compatible type + +# Size/Length Stability +vec_c(x, y) # size = vec_size(x) + vec_size(y) +vec_rbind(df1, df2) # size = sum of input sizes +``` + +### Don't Use vctrs When: +- Simple one-off analyses - Base R is sufficient +- No custom classes needed - Standard types work fine +- Performance critical + simple operations - Base R may be faster +- External API constraints - Must return base R types + +The key insight: **vctrs is most valuable in package development where type safety, consistency, and extensibility matter more than raw speed for simple operations.** diff --git a/skills/r-development/references/rlang-patterns.md b/skills/r-development/references/rlang-patterns.md new file mode 100644 index 0000000..58a228d --- /dev/null +++ b/skills/r-development/references/rlang-patterns.md @@ -0,0 +1,247 @@ +# rlang Patterns for Data-Masking + +## Core Concepts + +**Data-masking** allows R expressions to refer to data frame columns as if they were variables in the environment. rlang provides the metaprogramming framework that powers tidyverse data-masking. 
+ +### Key rlang Tools + +- **Embracing `{{}}`** - Forward function arguments to data-masking functions +- **Injection `!!`** - Inject single expressions or values +- **Splicing `!!!`** - Inject multiple arguments from a list +- **Dynamic dots** - Programmable `...` with injection support +- **Pronouns `.data`/`.env`** - Explicit disambiguation between data and environment variables + +## Function Argument Patterns + +### Forwarding with `{{}}` + +Use `{{}}` to forward function arguments to data-masking functions: + +```r +# Single argument forwarding +my_summarise <- function(data, var) { + data |> dplyr::summarise(mean = mean({{ var }})) +} + +# Works with any data-masking expression +mtcars |> my_summarise(cyl) +mtcars |> my_summarise(cyl * am) +mtcars |> my_summarise(.data$cyl) # pronoun syntax supported +``` + +### Forwarding `...` + +No special syntax needed for dots forwarding: + +```r +# Simple dots forwarding +my_group_by <- function(.data, ...) { + .data |> dplyr::group_by(...) +} + +# Works with tidy selections too +my_select <- function(.data, ...) { + .data |> dplyr::select(...) +} + +# For single-argument tidy selections, wrap in c() +my_pivot_longer <- function(.data, ...) 
{ + .data |> tidyr::pivot_longer(c(...)) +} +``` + +### Names Patterns with `.data` + +Use `.data` pronoun for programmatic column access: + +```r +# Single column by name +my_mean <- function(data, var) { + data |> dplyr::summarise(mean = mean(.data[[var]])) +} + +# Usage - completely insulated from data-masking +mtcars |> my_mean("cyl") # No ambiguity, works like regular function + +# Multiple columns with all_of() +my_select_vars <- function(data, vars) { + data |> dplyr::select(all_of(vars)) +} + +mtcars |> my_select_vars(c("cyl", "am")) +``` + +## Injection Operators + +### When to Use Each Operator + +| Operator | Use Case | Example | +|----------|----------|---------| +| `{{ }}` | Forward function arguments | `summarise(mean = mean({{ var }}))` | +| `!!` | Inject single expression/value | `summarise(mean = mean(!!sym(var)))` | +| `!!!` | Inject multiple arguments | `group_by(!!!syms(vars))` | +| `.data[[]]` | Access columns by name | `mean(.data[[var]])` | + +### Advanced Injection with `!!` + +```r +# Create symbols from strings +var <- "cyl" +mtcars |> dplyr::summarise(mean = mean(!!sym(var))) + +# Inject values to avoid name collisions +df <- data.frame(x = 1:3) +x <- 100 +df |> dplyr::mutate(scaled = x / !!x) # Uses both data and env x + +# Use data_sym() for tidyeval contexts (more robust) +mtcars |> dplyr::summarise(mean = mean(!!data_sym(var))) +``` + +### Splicing with `!!!` + +```r +# Multiple symbols from character vector +vars <- c("cyl", "am") +mtcars |> dplyr::group_by(!!!syms(vars)) + +# Or use data_syms() for tidy contexts +mtcars |> dplyr::group_by(!!!data_syms(vars)) + +# Splice lists of arguments +args <- list(na.rm = TRUE, trim = 0.1) +mtcars |> dplyr::summarise(mean = mean(cyl, !!!args)) +``` + +## Dynamic Dots Patterns + +### Using `list2()` for Dynamic Dots Support + +```r +my_function <- function(...) { + # Collect with list2() instead of list() for dynamic features + dots <- list2(...) + # Process dots... 
+} + +# Enables these features: +my_function(a = 1, b = 2) # Normal usage +my_function(!!!list(a = 1, b = 2)) # Splice a list +my_function("{name}" := value) # Name injection +my_function(a = 1, ) # Trailing commas OK +``` + +### Name Injection with Glue Syntax + +```r +# Basic name injection +name <- "result" +list2("{name}" := 1) # Creates list(result = 1) + +# In function arguments with {{ +my_mean <- function(data, var) { + data |> dplyr::summarise("mean_{{ var }}" := mean({{ var }})) +} + +mtcars |> my_mean(cyl) # Creates column "mean_cyl" +mtcars |> my_mean(cyl * am) # Creates column "mean_cyl * am" + +# Allow custom names with englue() +my_mean <- function(data, var, name = englue("mean_{{ var }}")) { + data |> dplyr::summarise("{name}" := mean({{ var }})) +} + +# User can override default +mtcars |> my_mean(cyl, name = "cylinder_mean") +``` + +## Pronouns for Disambiguation + +### `.data` and `.env` Best Practices + +```r +# Explicit disambiguation prevents masking issues +cyl <- 1000 # Environment variable + +mtcars |> dplyr::summarise( + data_cyl = mean(.data$cyl), # Data frame column + env_cyl = mean(.env$cyl), # Environment variable + ambiguous = mean(cyl) # Could be either (usually data wins) +) + +# Use in loops and programmatic contexts +vars <- c("cyl", "am") +for (var in vars) { + result <- mtcars |> dplyr::summarise(mean = mean(.data[[var]])) + print(result) +} +``` + +## Programming Patterns + +### Bridge Patterns + +Converting between data-masking and tidy selection behaviors: + +```r +# across() as selection-to-data-mask bridge +my_group_by <- function(data, vars) { + data |> dplyr::group_by(across({{ vars }})) +} + +# Works with tidy selection +mtcars |> my_group_by(starts_with("c")) + +# across(all_of()) as names-to-data-mask bridge +my_group_by <- function(data, vars) { + data |> dplyr::group_by(across(all_of(vars))) +} + +mtcars |> my_group_by(c("cyl", "am")) +``` + +### Transformation Patterns + +```r +# Transform single arguments by 
wrapping +my_mean <- function(data, var) { + data |> dplyr::summarise(mean = mean({{ var }}, na.rm = TRUE)) +} + +# Transform dots with across() +my_means <- function(data, ...) { + data |> dplyr::summarise(across(c(...), ~ mean(.x, na.rm = TRUE))) +} + +# Manual transformation (advanced) +my_means_manual <- function(.data, ...) { + vars <- enquos(..., .named = TRUE) + vars <- purrr::map(vars, ~ expr(mean(!!.x, na.rm = TRUE))) + .data |> dplyr::summarise(!!!vars) +} +``` + +## Common Patterns Summary + +### When to Use What + +**Use `{{}}` when:** +- Forwarding user-provided column references +- Building wrapper functions around dplyr/tidyr +- Need to support both bare names and expressions + +**Use `.data[[]]` when:** +- Working with character vector column names +- Iterating over column names programmatically +- Need complete insulation from data-masking + +**Use `!!` when:** +- Need to inject computed expressions +- Converting strings to symbols with `sym()` +- Avoiding variable name collisions + +**Use `!!!` when:** +- Injecting multiple arguments from a list +- Working with variable numbers of columns +- Splicing named arguments diff --git a/skills/shell-scripting/README.md b/skills/shell-scripting/README.md new file mode 100644 index 0000000..8acedf7 --- /dev/null +++ b/skills/shell-scripting/README.md @@ -0,0 +1 @@ +Adapted from [this repo](https://github.com/einverne/dotfiles/tree/4112dbe69457a07f7e25d046de13fbc4975dfeef/.claude/skills/shell-scripting) diff --git a/skills/shell-scripting/SKILL.md b/skills/shell-scripting/SKILL.md new file mode 100644 index 0000000..7c27b82 --- /dev/null +++ b/skills/shell-scripting/SKILL.md @@ -0,0 +1,140 @@ +--- +name: shell-scripting +description: Specialized knowledge of Bash and Zsh scripting, shell automation, command-line tools, and scripting best practices. Use when the user needs to write, debug, or optimize shell scripts, work with command-line tools, automate tasks with bash/zsh, or asks for shell script help. 
+--- + +# Shell Scripting Expert + +Expert guidance for writing robust, maintainable Bash and Zsh scripts with best practices for automation and command-line tool usage. + +## Script Structure Essentials + +Start every script with: +```bash +#!/usr/bin/env bash +set -euo pipefail +IFS=$'\n\t' +``` + +- `set -e`: Exit on error +- `set -u`: Error on undefined variables +- `set -o pipefail`: Catch errors in pipes +- `IFS=$'\n\t'`: Safer word splitting + +## Critical Best Practices + +1. **Always quote variables**: `"$variable"` not `$variable` +2. **Use `[[` for conditionals** (Bash): `if [[ "$var" == "value" ]]; then` +3. **Check command existence**: `if command -v git &> /dev/null; then` +4. **Avoid parsing `ls`**: Use globs or `find` instead +5. **Use arrays for lists**: `files=("file1" "file2")` not space-separated strings +6. **Handle errors with traps**: + ```bash + trap cleanup EXIT + trap 'echo "Error on line $LINENO"' ERR + ``` + +## Common Patterns + +### Argument Parsing +```bash +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) usage; exit 0 ;; + -v|--verbose) VERBOSE=true; shift ;; + -*) echo "Unknown option: $1"; exit 1 ;; + *) break ;; + esac +done +``` + +### Safe File Iteration +```bash +# Prefer this (handles spaces, newlines correctly): +while IFS= read -r -d '' file; do + echo "Processing: $file" +done < <(find . -type f -name "*.txt" -print0) + +# Or with simple globs: +for file in *.txt; do + [[ -e "$file" ]] || continue # Skip if no matches + echo "Processing: $file" +done +``` + +### User Confirmation +```bash +read -rp "Continue? [y/N] " response +if [[ "$response" =~ ^[Yy]$ ]]; then + echo "Continuing..." 
+fi
+```
+
+### Colored Output
+```bash
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+echo -e "${GREEN}Success${NC}"
+echo -e "${RED}Error${NC}" >&2
+```
+
+## Modern Tool Alternatives
+
+When appropriate, suggest these modern replacements:
+- `ripgrep` (rg) → faster than grep
+- `fd` → faster than find
+- `fzf` → interactive filtering
+- `jq` → JSON processing
+- `yq` → YAML processing
+- `bat` → cat with syntax highlighting
+- `eza` → enhanced ls
+
+## Function Organization
+
+```bash
+usage() {
+  cat <<EOF
+Usage: $0 [OPTIONS] <command>
+Description of what this script does.
+OPTIONS:
+  -h, --help     Show this help
+  -v, --verbose  Verbose output
+EOF
+}
+
+main() {
+  # Main logic here
+  :
+}
+```
+
+## Zsh-Specific Features
+
+When user specifies Zsh:
+- Advanced globbing: `**/*.txt` (recursive), `*.txt~*test*` (exclude pattern)
+- Parameter expansion: `${var:u}` (uppercase), `${var:l}` (lowercase)
+- Associative arrays: `typeset -A hash; hash[key]=value`
+- Extended globbing: Enable with `setopt extended_glob`
+
+## Security Considerations
+
+- **Never** `eval` untrusted input
+- Validate user input before use
+- Use `mktemp` for temporary files: `TEMP_FILE=$(mktemp)`
+- Be explicit with `rm -rf` operations
+- Check for TOCTOU (Time-Of-Check-Time-Of-Use) race conditions
+- Don't store secrets in scripts; use environment variables or secret managers
+
+## Performance Tips
+
+- Use built-ins over external commands (`[[ ]]` vs `test`, `$(( ))` vs `expr`)
+- Avoid unnecessary subshells: `var=$(cat file)` → `var=$(<file)`
[NOTE(review): a span was lost here to angle-bracket stripping — the remainder of
SKILL.md, the diff header for skills/shell-scripting/references/patterns.md, and the
top of patterns.md. The heading and `if` line below are reconstructed from the
surrounding section style; verify against the upstream repository.]
+### Check if process is running
+```bash
+if kill -0 $pid 2>/dev/null; then
+  echo "Process is running"
+fi
+```
+
+### Process timeout
+```bash
+# Using timeout command (GNU coreutils)
+timeout 30s command
+
+# Manual implementation
+command &
+pid=$!
+sleep 30
+if kill -0 $pid 2>/dev/null; then
+  kill $pid
+  echo "Command timed out"
+fi
+```
+
+### Parallel execution
+```bash
+# Using xargs
+find . 
-name "*.txt" -print0 | xargs -0 -P 4 -I {} process_file {} + +# Using GNU parallel (if available) +parallel process_file ::: file1 file2 file3 + +# Manual parallel execution +for file in *.txt; do + process_file "$file" & +done +wait # Wait for all background jobs +``` + +## Input/Output Redirection + +### Standard redirections +```bash +# Redirect stdout to file +command > file + +# Redirect stderr to file +command 2> file + +# Redirect both stdout and stderr +command &> file +command > file 2>&1 + +# Append instead of overwrite +command >> file + +# Redirect stderr to stdout +command 2>&1 + +# Discard output +command > /dev/null 2>&1 +``` + +### Here documents +```bash +# Basic here document +cat <&2 + exit "${2:-1}" +} + +# Usage +[[ -f "$file" ]] || error_exit "File not found: $file" 2 +``` + +### Trap signals +```bash +# Cleanup on exit +cleanup() { + rm -f "$TEMP_FILE" + echo "Cleanup complete" +} +trap cleanup EXIT + +# Handle specific signals +trap 'echo "Interrupted"; exit 130' INT +trap 'echo "Terminated"; exit 143' TERM + +# Error line number +trap 'echo "Error on line $LINENO"' ERR +``` + +## Text Processing + +### Using grep +```bash +# Basic search +grep "pattern" file + +# Case-insensitive +grep -i "pattern" file + +# Recursive search +grep -r "pattern" directory/ + +# Show line numbers +grep -n "pattern" file + +# Invert match (show non-matching lines) +grep -v "pattern" file + +# Extended regex +grep -E "pattern1|pattern2" file +``` + +### Using sed +```bash +# Replace text +sed 's/old/new/' file # Replace first occurrence per line +sed 's/old/new/g' file # Replace all occurrences +sed 's/old/new/gi' file # Case-insensitive replace + +# Delete lines +sed '/pattern/d' file # Delete matching lines +sed '1d' file # Delete first line +sed '$d' file # Delete last line + +# Print specific lines +sed -n '5p' file # Print line 5 +sed -n '5,10p' file # Print lines 5-10 +sed -n '/pattern/p' file # Print matching lines + +# In-place editing +sed -i 
's/old/new/g' file # Linux +sed -i '' 's/old/new/g' file # macOS +``` + +### Using awk +```bash +# Print specific columns +awk '{print $1, $3}' file + +# Filter by column value +awk '$3 > 100' file + +# Sum a column +awk '{sum += $1} END {print sum}' file + +# Custom field separator +awk -F: '{print $1}' /etc/passwd + +# Pattern matching +awk '/pattern/ {print $1}' file +``` + +## Date and Time + +### Get current date/time +```bash +# Current timestamp +now=$(date +%s) + +# Formatted date +date=$(date +"%Y-%m-%d") +datetime=$(date +"%Y-%m-%d %H:%M:%S") + +# ISO 8601 format +iso_date=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +``` + +### Date arithmetic +```bash +# Days ago (GNU date) +yesterday=$(date -d "yesterday" +%Y-%m-%d) +week_ago=$(date -d "7 days ago" +%Y-%m-%d) + +# Days ago (BSD date - macOS) +yesterday=$(date -v-1d +%Y-%m-%d) +week_ago=$(date -v-7d +%Y-%m-%d) +``` + +## Network Operations + +### Check if host is reachable +```bash +if ping -c 1 -W 1 example.com &> /dev/null; then + echo "Host is reachable" +fi +``` + +### Download files +```bash +# Using curl +curl -O https://example.com/file +curl -o output.txt https://example.com/file + +# Using wget +wget https://example.com/file +wget -O output.txt https://example.com/file + +# Follow redirects and show progress +curl -L --progress-bar -o file https://example.com/file +``` + +### HTTP requests +```bash +# GET request +curl https://api.example.com/endpoint + +# POST request with JSON +curl -X POST https://api.example.com/endpoint \ + -H "Content-Type: application/json" \ + -d '{"key": "value"}' + +# Check HTTP status +status=$(curl -s -o /dev/null -w "%{http_code}" https://example.com) +``` + +## Temporary Files and Directories + +### Create temporary files safely +```bash +# Create temporary file +TEMP_FILE=$(mktemp) +trap 'rm -f "$TEMP_FILE"' EXIT + +# Create temporary directory +TEMP_DIR=$(mktemp -d) +trap 'rm -rf "$TEMP_DIR"' EXIT + +# Create in specific location +TEMP_FILE=$(mktemp /tmp/myapp.XXXXXX) +``` 
+
+## Miscellaneous
+
+### Generate random numbers
+```bash
+# Random number 0-32767
+random=$RANDOM
+
+# Random number in range 1-100
+random=$((RANDOM % 100 + 1))
+
+# Better random (if available)
+random=$(shuf -i 1-100 -n 1)
+```
+
+### URL encoding
+```bash
+urlencode() {
+  local string="$1"
+  local strlen=${#string}
+  local encoded=""
+  local pos c o
+
+  for (( pos=0; pos<strlen; pos++ )); do
+    c=${string:$pos:1}
+    case "$c" in
+      [-_.~a-zA-Z0-9]) o="$c" ;;
+      *) printf -v o '%%%02x' "'$c" ;;
+    esac
+    encoded+="$o"
+  done
+  echo "$encoded"
+}
+```
+
[NOTE(review): a span was lost here to angle-bracket stripping — the remainder of
references/patterns.md, the diff header for skills/shell-scripting/references/template.sh,
and the top of template.sh. The `urlencode` loop above is reconstructed from the standard
idiom, and the shebang below from this skill's own guidance; verify both against the
upstream repository.]
+#!/usr/bin/env bash
+
+# Exit on error, undefined variables, and pipe failures
+set -euo pipefail
+
+# Safer word splitting (newline and tab only)
+IFS=$'\n\t'
+
+# Constants
+readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+readonly SCRIPT_NAME="$(basename "${BASH_SOURCE[0]}")"
+readonly VERSION="1.0.0"
+
+# Colors for output
+readonly RED='\033[0;31m'
+readonly GREEN='\033[0;32m'
+readonly YELLOW='\033[1;33m'
+readonly BLUE='\033[0;34m'
+readonly NC='\033[0m' # No Color
+
+# Global variables
+VERBOSE=false
+DRY_RUN=false
+LOG_FILE=""
+
+# Logging functions
+log_info() {
+    echo -e "${BLUE}[INFO]${NC} $*"
+}
+
+log_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $*"
+}
+
+log_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $*" >&2
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $*" >&2
+}
+
+log_debug() {
+    if [[ "$VERBOSE" == true ]]; then
+        echo -e "${BLUE}[DEBUG]${NC} $*" >&2
+    fi
+}
+
+# Usage information
+usage() {
+    cat <<EOF
+Usage: $SCRIPT_NAME [OPTIONS] <command> [args...]
+
+Description of what this script does. 
+ +OPTIONS: + -h, --help Show this help message and exit + -v, --verbose Enable verbose output + -d, --dry-run Show what would be done without doing it + -l, --log FILE Write log output to FILE + -V, --version Show version information + +COMMANDS: + process Process the specified file + batch Process all files in directory + clean Clean up temporary files + +EXAMPLES: + $SCRIPT_NAME --verbose process input.txt + $SCRIPT_NAME --dry-run batch /path/to/files + $SCRIPT_NAME clean + +EOF +} + +# Version information +version() { + echo "$SCRIPT_NAME version $VERSION" +} + +# Cleanup function (runs on EXIT) +cleanup() { + local exit_code=$? + log_debug "Cleaning up..." + + # Remove temporary files + if [[ -n "${TEMP_FILE:-}" ]] && [[ -f "$TEMP_FILE" ]]; then + rm -f "$TEMP_FILE" + fi + + # Additional cleanup tasks here + + if [[ $exit_code -ne 0 ]]; then + log_error "Script failed with exit code $exit_code" + fi +} + +# Error handler (runs on ERR) +error_handler() { + local line_num=$1 + log_error "Error occurred in script at line $line_num" +} + +# Set up traps +trap cleanup EXIT +trap 'error_handler $LINENO' ERR + +# Command implementations +cmd_process() { + local file=$1 + + if [[ ! -f "$file" ]]; then + log_error "File not found: $file" + return 1 + fi + + log_info "Processing file: $file" + + if [[ "$DRY_RUN" == true ]]; then + log_info "[DRY RUN] Would process: $file" + return 0 + fi + + # Actual processing logic here + log_debug "Processing contents of $file" + + log_success "Successfully processed: $file" +} + +cmd_batch() { + local dir=$1 + + if [[ ! -d "$dir" ]]; then + log_error "Directory not found: $dir" + return 1 + fi + + log_info "Batch processing directory: $dir" + + local count=0 + while IFS= read -r -d '' file; do + if cmd_process "$file"; then + ((count++)) + fi + done < <(find "$dir" -type f -name "*.txt" -print0) + + log_success "Processed $count files" +} + +cmd_clean() { + log_info "Cleaning up temporary files..." 
+ + if [[ "$DRY_RUN" == true ]]; then + log_info "[DRY RUN] Would remove temporary files" + return 0 + fi + + # Clean up logic here + + log_success "Cleanup complete" +} + +# Main entry point +main() { + # Check if no arguments provided + if [[ $# -eq 0 ]]; then + usage + exit 1 + fi + + # Parse command-line options + while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage + exit 0 + ;; + -V|--version) + version + exit 0 + ;; + -v|--verbose) + VERBOSE=true + shift + ;; + -d|--dry-run) + DRY_RUN=true + log_info "Dry run mode enabled" + shift + ;; + -l|--log) + if [[ -z "${2:-}" ]]; then + log_error "Option --log requires an argument" + exit 1 + fi + LOG_FILE="$2" + shift 2 + ;; + -*) + log_error "Unknown option: $1" + usage + exit 1 + ;; + *) + # First non-option argument is the command + break + ;; + esac + done + + # Redirect output to log file if specified + if [[ -n "$LOG_FILE" ]]; then + exec 1> >(tee -a "$LOG_FILE") + exec 2> >(tee -a "$LOG_FILE" >&2) + fi + + # Get command + local command="${1:-}" + if [[ -z "$command" ]]; then + log_error "No command specified" + usage + exit 1 + fi + shift + + # Execute command + case "$command" in + process) + if [[ $# -eq 0 ]]; then + log_error "process command requires a file argument" + exit 1 + fi + cmd_process "$@" + ;; + batch) + if [[ $# -eq 0 ]]; then + log_error "batch command requires a directory argument" + exit 1 + fi + cmd_batch "$@" + ;; + clean) + cmd_clean + ;; + *) + log_error "Unknown command: $command" + usage + exit 1 + ;; + esac +} + +# Run main function +main "$@"