Initial commit

Zhongwei Li
2025-11-30 08:53:48 +08:00
commit df7d982d80
5 changed files with 566 additions and 0 deletions

.claude-plugin/plugin.json Normal file

@@ -0,0 +1,11 @@
{
  "name": "data-science-skills",
  "description": "Specialized skills for data analysis, machine learning, and scientific computing with Python",
  "version": "1.0.0",
  "author": {
    "name": "Claude Skills Marketplace"
  },
  "commands": [
    "./commands"
  ]
}

README.md Normal file

@@ -0,0 +1,3 @@
# data-science-skills
Specialized skills for data analysis, machine learning, and scientific computing with Python

commands/data-analyst.md Normal file

@@ -0,0 +1,274 @@
# Data Analyst
**Description**: Perform systematic data analysis using Python, pandas, and visualization libraries
## Core Workflow
1. **Load & Inspect Data**
2. **Clean & Transform Data**
3. **Explore & Visualize**
4. **Analyze & Model**
5. **Report Findings**
## Data Loading & Inspection
```python
import pandas as pd
import numpy as np
# Load data
df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_sql(query, connection)
# df = pd.read_json('data.json')
# Initial inspection
print(df.shape) # (rows, columns)
df.info()                  # Data types, null counts (prints directly)
print(df.describe()) # Statistical summary
print(df.head()) # First 5 rows
print(df.columns.tolist()) # Column names
print(df.dtypes) # Data types
# Check for missing data
print(df.isnull().sum())
print(df.duplicated().sum())
```
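If the column types are already known, they can be set at load time instead of being fixed afterwards. A minimal sketch, assuming hypothetical column names (`date`, `category`, `price`, `quantity`):
```python
# Set dtypes and parse dates while reading (column names are placeholders)
df = pd.read_csv(
    'data.csv',
    parse_dates=['date'],               # convert to datetime on load
    dtype={'category': 'category',      # smaller memory footprint
           'price': 'float64'},
    usecols=['date', 'category', 'price', 'quantity'],
)
print(df.dtypes)
```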
## Data Cleaning
```python
# Handle missing values
df = df.dropna() # Drop rows with any null
df = df.dropna(subset=['important_col']) # Drop if specific column null
df['col'] = df['col'].fillna(0) # Fill with value
df['col'] = df['col'].fillna(df['col'].mean()) # Fill with mean
# Remove duplicates
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['id'], keep='first')
# Fix data types
df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['category'] = df['category'].astype('category')
# Handle outliers
from scipy import stats
df = df[(np.abs(stats.zscore(df['value'])) < 3)] # Remove outliers
# Rename columns
df = df.rename(columns={'old_name': 'new_name'})
df.columns = df.columns.str.lower().str.replace(' ', '_')
```
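The z-score filter above assumes roughly normal data. An IQR-based filter is a common alternative for skewed distributions; a minimal sketch, reusing the placeholder `value` column:
```python
# IQR-based outlier filter (alternative to the z-score rule)
q1, q3 = df['value'].quantile([0.25, 0.75])
iqr = q3 - q1
df = df[df['value'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
```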
## Data Transformation
```python
# Create new columns
df['total'] = df['quantity'] * df['price']
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
# Apply functions
df['normalized'] = df['value'] / df['value'].max()
df['category_encoded'] = df['category'].map({'A': 1, 'B': 2, 'C': 3})
# Groupby aggregations
summary = df.groupby('category').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
}).round(2)
# Pivot tables
pivot = df.pivot_table(
    values='sales',
    index='category',
    columns='region',
    aggfunc='sum',
    fill_value=0
)
# Merge datasets
merged = pd.merge(df1, df2, on='id', how='left')
```
## Exploratory Data Analysis
```python
# Distribution analysis
print(df['value'].value_counts())
print(df['value'].value_counts(normalize=True)) # Percentages
# Correlation
correlation = df[['col1', 'col2', 'col3']].corr()
# Cross-tabulation
pd.crosstab(df['category'], df['region'], normalize='index')
```
## Visualization
```python
import matplotlib.pyplot as plt
import seaborn as sns
# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Histogram
plt.hist(df['value'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of Values')
plt.show()
# Boxplot
sns.boxplot(data=df, x='category', y='value')
plt.title('Value Distribution by Category')
plt.show()
# Time series
df.set_index('date')['sales'].plot()
plt.title('Sales Over Time')
plt.show()
# Heatmap
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()
# Scatter plot
sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
plt.title('X vs Y by Category')
plt.show()
```
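For step 5 (Report Findings), figures usually need to be written to disk rather than only shown interactively. A minimal sketch reusing the boxplot above (the output filename is arbitrary):
```python
# Save a figure for a report instead of only displaying it
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='category', y='value', ax=ax)
ax.set_title('Value Distribution by Category')
fig.tight_layout()
fig.savefig('value_by_category.png', dpi=150)
plt.close(fig)
```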
## Statistical Analysis
```python
from scipy import stats
# T-test
group1 = df[df['category'] == 'A']['value']
group2 = df[df['category'] == 'B']['value']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
# Chi-square test
contingency_table = pd.crosstab(df['category'], df['outcome'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.4f}, P-value: {p_value:.4f}")
# Linear regression
from sklearn.linear_model import LinearRegression
X = df[['feature1', 'feature2']]
y = df['target']
model = LinearRegression()
model.fit(X, y)
print(f"R² Score: {model.score(X, y):.4f}")
print(f"Coefficients: {model.coef_}")
```
## Best Practices
### 1. **Reproducibility**
```python
# Set random seed
np.random.seed(42)
# Save processed data
df.to_csv('cleaned_data.csv', index=False)
# Document transformations
# Use Jupyter notebooks with markdown cells
```
### 2. **Data Quality Checks**
```python
def validate_data(df):
    """Validate data quality"""
    checks = {
        'null_values': df.isnull().sum().sum() == 0,
        'duplicates': df.duplicated().sum() == 0,
        'date_range_valid': df['date'].min() > pd.Timestamp('2000-01-01'),
        'no_negative_prices': (df['price'] >= 0).all()
    }
    return all(checks.values()), checks

is_valid, results = validate_data(df)
print(f"Data valid: {is_valid}")
print(results)
```
### 3. **Performance**
```python
# Use vectorized operations
df['result'] = df['a'] * df['b'] # ✅ Fast
# Avoid loops
for i in range(len(df)):  # ❌ Slow
    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']
# Use appropriate data types
df['category'] = df['category'].astype('category') # Saves memory
# Chunk large files
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk)
```
## Analysis Workflow Template
```python
# 1. LOAD DATA
df = pd.read_csv('data.csv')
# 2. INITIAL INSPECTION
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Nulls: \n{df.isnull().sum()}")
# 3. CLEAN DATA
df = df.drop_duplicates()
df = df.dropna(subset=['critical_column'])
df['date'] = pd.to_datetime(df['date'])
# 4. TRANSFORM
df['month'] = df['date'].dt.month
df['total'] = df['quantity'] * df['price']
# 5. EXPLORE
print(df['category'].value_counts())
print(df[['sales', 'profit']].describe())
# 6. VISUALIZE
df.groupby('month')['sales'].sum().plot(kind='bar')
plt.title('Monthly Sales')
plt.show()
# 7. ANALYZE
monthly_avg = df.groupby('month')['sales'].mean()
print(f"Average monthly sales: ${monthly_avg.mean():.2f}")
# 8. EXPORT RESULTS
results = df.groupby('category').agg({
    'sales': 'sum',
    'profit': 'sum'
})
results.to_csv('results.csv')
```
## When to Use This Skill
- Analyzing CSV/Excel datasets
- Creating data visualizations
- Performing statistical analysis
- Cleaning and transforming data
- Generating data reports
- Exploratory data analysis
---
**Remember**: Good data analysis is methodical - inspect, clean, explore, analyze, visualize, report!

commands/ml-engineer.md Normal file

@@ -0,0 +1,229 @@
# ML Engineer
**Description**: Build, train, and deploy machine learning models following MLOps best practices
## ML Project Workflow
1. **Define Problem** - Classification, regression, clustering?
2. **Prepare Data** - Clean, split, scale
3. **Choose Model** - Select appropriate algorithm
4. **Train Model** - Fit to training data
5. **Evaluate** - Test performance
6. **Tune** - Optimize hyperparameters
7. **Deploy** - Put model in production (a minimal serving sketch follows Save & Load Models below)
## Data Preparation
```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Load data
X = df[['feature1', 'feature2', 'feature3']]
y = df['target']
# Split data (80/20 train/test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Use same scaler!
# Encode categorical labels (fit on train, reuse the same encoder for test)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
```
## Classification Models
```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)
# Evaluate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
```
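The predicted probabilities computed above can also be scored directly. A minimal sketch, assuming a binary target; for multiclass, pass the full probability matrix with `multi_class='ovr'`:
```python
from sklearn.metrics import roc_auc_score

# ROC AUC from predicted probabilities (binary case: column 1 = positive class)
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba[:, 1]):.4f}")
```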
## Regression Models
```python
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Train model
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
# Predict
y_pred = model.predict(X_test_scaled)
# Evaluate
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")
```
## Model Selection & Comparison
```python
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
}
for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
```
## Hyperparameter Tuning
```python
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# Grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
# Use best model
best_model = grid_search.best_estimator_
```
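Grid-search scores come from cross-validation on the training data, so the tuned model still needs a final check on the held-out test set, as in the sketch below.
```python
# Final evaluation of the tuned model on the held-out test set
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
```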
## Feature Importance
```python
import pandas as pd
import matplotlib.pyplot as plt
# For tree-based models
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values('importance', ascending=False)
print(feature_importance_df)
# Plot
plt.barh(feature_importance_df['feature'], feature_importance_df['importance'])
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.show()
```
## Save & Load Models
```python
import joblib
# Save model
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
# Load model
loaded_model = joblib.load('model.pkl')
loaded_scaler = joblib.load('scaler.pkl')
# Make predictions on new data (value1, value2, value3 are placeholder feature values)
new_data = [[value1, value2, value3]]
new_data_scaled = loaded_scaler.transform(new_data)
prediction = loaded_model.predict(new_data_scaled)
```
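For step 7 of the workflow (Deploy), the saved artifacts can be wrapped in a small inference service. A minimal sketch using FastAPI, which is only one assumed option among many (Flask, batch scoring jobs, managed endpoints); the feature names are placeholders matching the example above:
```python
# serve.py - minimal inference endpoint (FastAPI is an assumed choice, not prescribed here)
import joblib
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
model = joblib.load('model.pkl')    # artifacts saved above
scaler = joblib.load('scaler.pkl')

class Features(BaseModel):
    feature1: float
    feature2: float
    feature3: float

@app.post('/predict')
def predict(features: Features):
    row = [[features.feature1, features.feature2, features.feature3]]
    prediction = model.predict(scaler.transform(row))
    return {'prediction': prediction.tolist()[0]}

# Run locally with: uvicorn serve:app --reload
```
In production, model and scaler versions would typically be tracked and validated alongside the code, in line with the MLOps practices this skill targets.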
## Best Practices
### 1. **Always Split Data Before Scaling**
```python
# ✅ Correct order
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler.fit(X_train)  # Fit only on training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# ❌ Wrong: Data leakage
scaler.fit(X) # Don't fit on all data!
```
### 2. **Use Cross-Validation**
```python
from sklearn.model_selection import cross_validate
cv_results = cross_validate(
    model, X, y,
    cv=5,
    scoring=['accuracy', 'f1_weighted'],
    return_train_score=True
)
```
### 3. **Handle Imbalanced Data**
```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
# Compute class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))
# Pass the computed weights to the model
# (class_weight='balanced' computes the same weights automatically)
model = RandomForestClassifier(class_weight=class_weight_dict)
```
## When to Use This Skill
- Building predictive models
- Training classification/regression models
- Evaluating model performance
- Tuning hyperparameters
- Deploying ML models
---
**Remember**: Good ML is methodical - clean data, proper splits, cross-validation, and thorough evaluation!

plugin.lock.json Normal file

@@ -0,0 +1,49 @@
{
"$schema": "internal://schemas/plugin.lock.v1.json",
"pluginId": "gh:samuelgarrett/claude-code-plugin-test:data-science-skills",
"normalized": {
"repo": null,
"ref": "refs/tags/v20251128.0",
"commit": "f6847456ddc62cac0eb1e1b8dddaa54039b033c7",
"treeHash": "6dfce7936856643508dfa8adb47db92b785f55207198d193f4644b1fb66e2f49",
"generatedAt": "2025-11-28T10:28:07.842177Z",
"toolVersion": "publish_plugins.py@0.2.0"
},
"origin": {
"remote": "git@github.com:zhongweili/42plugin-data.git",
"branch": "master",
"commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
"repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
},
"manifest": {
"name": "data-science-skills",
"description": "Specialized skills for data analysis, machine learning, and scientific computing with Python",
"version": "1.0.0"
},
"content": {
"files": [
{
"path": "README.md",
"sha256": "447a86922c3649863baae5713109d4fc1238099d40db4097055fb71dd82d0c38"
},
{
"path": ".claude-plugin/plugin.json",
"sha256": "0d93ceea2a14453493c862437164a0e0c864859488f362f2c974eba0c2a080f2"
},
{
"path": "commands/data-analyst.md",
"sha256": "4bcafbe49524548efdde7d8b76e1ba7721e173355d633371d8d504d3058cf802"
},
{
"path": "commands/ml-engineer.md",
"sha256": "38a495d7c619a6c4e09ddc754f1ea3ebb1682ccd361f4326315d334c10a5d123"
}
],
"dirSha256": "6dfce7936856643508dfa8adb47db92b785f55207198d193f4644b1fb66e2f49"
},
"security": {
"scannedAt": null,
"scannerVersion": null,
"flags": []
}
}