Initial commit
11
.claude-plugin/plugin.json
Normal file
@@ -0,0 +1,11 @@
{
  "name": "data-science-skills",
  "description": "Specialized skills for data analysis, machine learning, and scientific computing with Python",
  "version": "1.0.0",
  "author": {
    "name": "Claude Skills Marketplace"
  },
  "commands": [
    "./commands"
  ]
}
3
README.md
Normal file
@@ -0,0 +1,3 @@
# data-science-skills

Specialized skills for data analysis, machine learning, and scientific computing with Python
274
commands/data-analyst.md
Normal file
@@ -0,0 +1,274 @@
# Data Analyst

**Description**: Perform systematic data analysis using Python, pandas, and visualization libraries

## Core Workflow

1. **Load & Inspect Data**
2. **Clean & Transform Data**
3. **Explore & Visualize**
4. **Analyze & Model**
5. **Report Findings**

## Data Loading & Inspection

```python
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_sql(query, connection)
# df = pd.read_json('data.json')

# Initial inspection
print(df.shape)             # (rows, columns)
print(df.info())            # Data types, null counts
print(df.describe())        # Statistical summary
print(df.head())            # First 5 rows
print(df.columns.tolist())  # Column names
print(df.dtypes)            # Data types

# Check for missing data
print(df.isnull().sum())
print(df.duplicated().sum())
```

## Data Cleaning

```python
# Handle missing values
df = df.dropna()                                # Drop rows with any null
df = df.dropna(subset=['important_col'])        # Drop if specific column null
df['col'] = df['col'].fillna(0)                 # Fill with value
df['col'] = df['col'].fillna(df['col'].mean())  # Fill with mean

# Remove duplicates
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['id'], keep='first')

# Fix data types
df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['category'] = df['category'].astype('category')

# Handle outliers
from scipy import stats
df = df[(np.abs(stats.zscore(df['value'])) < 3)]  # Remove outliers

# Rename columns
df = df.rename(columns={'old_name': 'new_name'})
df.columns = df.columns.str.lower().str.replace(' ', '_')
```

## Data Transformation

```python
# Create new columns
df['total'] = df['quantity'] * df['price']
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

# Apply functions
df['normalized'] = df['value'] / df['value'].max()
df['category_encoded'] = df['category'].map({'A': 1, 'B': 2, 'C': 3})

# Groupby aggregations
summary = df.groupby('category').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
}).round(2)

# Pivot tables
pivot = df.pivot_table(
    values='sales',
    index='category',
    columns='region',
    aggfunc='sum',
    fill_value=0
)

# Merge datasets
merged = pd.merge(df1, df2, on='id', how='left')
```

## Exploratory Data Analysis

```python
# Distribution analysis
print(df['value'].value_counts())
print(df['value'].value_counts(normalize=True))  # Percentages

# Correlation
correlation = df[['col1', 'col2', 'col3']].corr()

# Cross-tabulation
pd.crosstab(df['category'], df['region'], normalize='index')
```

## Visualization

```python
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Histogram
plt.hist(df['value'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of Values')
plt.show()

# Boxplot
sns.boxplot(data=df, x='category', y='value')
plt.title('Value Distribution by Category')
plt.show()

# Time series
df.set_index('date')['sales'].plot()
plt.title('Sales Over Time')
plt.show()

# Heatmap
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Scatter plot
sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
plt.title('X vs Y by Category')
plt.show()
```

## Statistical Analysis

```python
from scipy import stats

# T-test
group1 = df[df['category'] == 'A']['value']
group2 = df[df['category'] == 'B']['value']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

# Chi-square test
contingency_table = pd.crosstab(df['category'], df['outcome'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.4f}, P-value: {p_value:.4f}")

# Linear regression
from sklearn.linear_model import LinearRegression
X = df[['feature1', 'feature2']]
y = df['target']
model = LinearRegression()
model.fit(X, y)
print(f"R² Score: {model.score(X, y):.4f}")
print(f"Coefficients: {model.coef_}")
```

## Best Practices

### 1. **Reproducibility**
```python
# Set random seed
np.random.seed(42)

# Save processed data
df.to_csv('cleaned_data.csv', index=False)

# Document transformations
# Use Jupyter notebooks with markdown cells
```

### 2. **Data Quality Checks**
```python
def validate_data(df):
    """Validate data quality"""
    checks = {
        'null_values': df.isnull().sum().sum() == 0,
        'duplicates': df.duplicated().sum() == 0,
        'date_range_valid': df['date'].min() > pd.Timestamp('2000-01-01'),
        'no_negative_prices': (df['price'] >= 0).all()
    }
    return all(checks.values()), checks

is_valid, results = validate_data(df)
print(f"Data valid: {is_valid}")
print(results)
```

### 3. **Performance**
```python
# Use vectorized operations
df['result'] = df['a'] * df['b']  # ✅ Fast

# Avoid loops
for i in range(len(df)):  # ❌ Slow
    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']

# Use appropriate data types
df['category'] = df['category'].astype('category')  # Saves memory

# Chunk large files
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk)
```

## Analysis Workflow Template

```python
# 1. LOAD DATA
df = pd.read_csv('data.csv')

# 2. INITIAL INSPECTION
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Nulls: \n{df.isnull().sum()}")

# 3. CLEAN DATA
df = df.drop_duplicates()
df = df.dropna(subset=['critical_column'])
df['date'] = pd.to_datetime(df['date'])

# 4. TRANSFORM
df['month'] = df['date'].dt.month
df['total'] = df['quantity'] * df['price']

# 5. EXPLORE
print(df['category'].value_counts())
print(df[['sales', 'profit']].describe())

# 6. VISUALIZE
df.groupby('month')['sales'].sum().plot(kind='bar')
plt.title('Monthly Sales')
plt.show()

# 7. ANALYZE
monthly_avg = df.groupby('month')['sales'].mean()
print(f"Average monthly sales: ${monthly_avg.mean():.2f}")

# 8. EXPORT RESULTS
results = df.groupby('category').agg({
    'sales': 'sum',
    'profit': 'sum'
})
results.to_csv('results.csv')
```

## When to Use This Skill

- Analyzing CSV/Excel datasets
- Creating data visualizations
- Performing statistical analysis
- Cleaning and transforming data
- Generating data reports
- Exploratory data analysis

---

**Remember**: Good data analysis is methodical - inspect, clean, explore, analyze, visualize, report!
229
commands/ml-engineer.md
Normal file
@@ -0,0 +1,229 @@
# ML Engineer

**Description**: Build, train, and deploy machine learning models following MLOps best practices

## ML Project Workflow

1. **Define Problem** - Classification, regression, clustering?
2. **Prepare Data** - Clean, split, scale
3. **Choose Model** - Select appropriate algorithm
4. **Train Model** - Fit to training data
5. **Evaluate** - Test performance
6. **Tune** - Optimize hyperparameters
7. **Deploy** - Put model in production

## Data Preparation

```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load data
X = df[['feature1', 'feature2', 'feature3']]
y = df['target']

# Split data (80/20 train/test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use same scaler!

# Encode categorical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
```

## Classification Models

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

# Evaluate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
```

## Regression Models

```python
import numpy as np  # needed for np.sqrt below
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Train model
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")
```

## Model Selection & Comparison

```python
from sklearn.ensemble import GradientBoostingClassifier  # not imported in the blocks above
from sklearn.model_selection import cross_val_score

models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
}

for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
```

## Hyperparameter Tuning

```python
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

# Use best model
best_model = grid_search.best_estimator_
```

## Feature Importance

```python
import pandas as pd              # for the importance table
import matplotlib.pyplot as plt  # for the plot

# For tree-based models
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values('importance', ascending=False)

print(feature_importance_df)

# Plot
plt.barh(feature_importance_df['feature'], feature_importance_df['importance'])
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.show()
```

## Save & Load Models

```python
import joblib

# Save model
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Load model
loaded_model = joblib.load('model.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Make predictions (value1, value2, value3 are placeholder feature values)
new_data = [[value1, value2, value3]]
new_data_scaled = loaded_scaler.transform(new_data)
prediction = loaded_model.predict(new_data_scaled)
```
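
Step 7 of the workflow (deployment) is not shown above. Below is a minimal serving sketch built on the joblib artifacts from the previous block; the file paths, feature list, and `predict_one` helper are illustrative assumptions rather than part of the original plugin.

```python
# Sketch (assumed setup): expose the saved model behind one entry point that a
# script, scheduled job, or web handler can call.
import joblib
import pandas as pd

# Load artifacts once at startup, not per request
model = joblib.load('model.pkl')    # paths assumed from the block above
scaler = joblib.load('scaler.pkl')
FEATURES = ['feature1', 'feature2', 'feature3']  # must match training column order

def predict_one(record: dict):
    """Score a single record given as {feature_name: value}."""
    X_new = pd.DataFrame([record], columns=FEATURES)
    X_new_scaled = scaler.transform(X_new)
    return model.predict(X_new_scaled)[0]

# Example: predict_one({'feature1': 1.2, 'feature2': 0.4, 'feature3': 7.0})
```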

## Best Practices

### 1. **Always Split Data Before Scaling**
```python
# ✅ Correct order
X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = StandardScaler()
scaler.fit(X_train)  # Fit only on training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ❌ Wrong: Data leakage
scaler.fit(X)  # Don't fit on all data!
```
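
One way to make this ordering hard to get wrong, which the snippets above do not use, is to bundle the scaler and the model in a scikit-learn `Pipeline` so preprocessing is only ever fit on training data; a minimal sketch:

```python
# Sketch: a Pipeline keeps scaling inside the estimator, so cross-validation
# and grid search cannot leak test-fold statistics into the scaler.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# The scaler is re-fit on each training fold; validation folds are only transformed
scores = cross_val_score(pipe, X, y, cv=5)
print(f"CV accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
```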

### 2. **Use Cross-Validation**
```python
from sklearn.model_selection import cross_validate

cv_results = cross_validate(
    model, X, y,
    cv=5,
    scoring=['accuracy', 'f1_weighted'],
    return_train_score=True
)
```

### 3. **Handle Imbalanced Data**
```python
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

# Use in model (class_weight='balanced' applies the same weighting automatically)
model = RandomForestClassifier(class_weight='balanced')
```

## When to Use This Skill

- Building predictive models
- Training classification/regression models
- Evaluating model performance
- Tuning hyperparameters
- Deploying ML models

---

**Remember**: Good ML is methodical - clean data, proper splits, cross-validation, and thorough evaluation!
49
plugin.lock.json
Normal file
@@ -0,0 +1,49 @@
{
  "$schema": "internal://schemas/plugin.lock.v1.json",
  "pluginId": "gh:samuelgarrett/claude-code-plugin-test:data-science-skills",
  "normalized": {
    "repo": null,
    "ref": "refs/tags/v20251128.0",
    "commit": "f6847456ddc62cac0eb1e1b8dddaa54039b033c7",
    "treeHash": "6dfce7936856643508dfa8adb47db92b785f55207198d193f4644b1fb66e2f49",
    "generatedAt": "2025-11-28T10:28:07.842177Z",
    "toolVersion": "publish_plugins.py@0.2.0"
  },
  "origin": {
    "remote": "git@github.com:zhongweili/42plugin-data.git",
    "branch": "master",
    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
  },
  "manifest": {
    "name": "data-science-skills",
    "description": "Specialized skills for data analysis, machine learning, and scientific computing with Python",
    "version": "1.0.0"
  },
  "content": {
    "files": [
      {
        "path": "README.md",
        "sha256": "447a86922c3649863baae5713109d4fc1238099d40db4097055fb71dd82d0c38"
      },
      {
        "path": ".claude-plugin/plugin.json",
        "sha256": "0d93ceea2a14453493c862437164a0e0c864859488f362f2c974eba0c2a080f2"
      },
      {
        "path": "commands/data-analyst.md",
        "sha256": "4bcafbe49524548efdde7d8b76e1ba7721e173355d633371d8d504d3058cf802"
      },
      {
        "path": "commands/ml-engineer.md",
        "sha256": "38a495d7c619a6c4e09ddc754f1ea3ebb1682ccd361f4326315d334c10a5d123"
      }
    ],
    "dirSha256": "6dfce7936856643508dfa8adb47db92b785f55207198d193f4644b1fb66e2f49"
  },
  "security": {
    "scannedAt": null,
    "scannerVersion": null,
    "flags": []
  }
}