Initial commit

Zhongwei Li
2025-11-30 08:53:48 +08:00
commit df7d982d80
5 changed files with 566 additions and 0 deletions

.claude-plugin/plugin.json Normal file

@@ -0,0 +1,11 @@
{
  "name": "data-science-skills",
  "description": "Specialized skills for data analysis, machine learning, and scientific computing with Python",
  "version": "1.0.0",
  "author": {
    "name": "Claude Skills Marketplace"
  },
  "commands": [
    "./commands"
  ]
}

README.md Normal file

@@ -0,0 +1,3 @@
# data-science-skills
Specialized skills for data analysis, machine learning, and scientific computing with Python

commands/data-analyst.md Normal file

@@ -0,0 +1,274 @@
# Data Analyst
**Description**: Perform systematic data analysis using Python, pandas, and visualization libraries
## Core Workflow
1. **Load & Inspect Data**
2. **Clean & Transform Data**
3. **Explore & Visualize**
4. **Analyze & Model**
5. **Report Findings**
## Data Loading & Inspection
```python
import pandas as pd
import numpy as np
# Load data
df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_sql(query, connection)
# df = pd.read_json('data.json')
# Initial inspection
print(df.shape) # (rows, columns)
df.info()                  # Data types, null counts (prints directly)
print(df.describe()) # Statistical summary
print(df.head()) # First 5 rows
print(df.columns.tolist()) # Column names
print(df.dtypes) # Data types
# Check for missing data
print(df.isnull().sum())
print(df.duplicated().sum())
```
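If the column types are already known, they can be set at load time instead of being fixed afterwards. A minimal sketch, assuming hypothetical column names (`date`, `category`, `price`, `quantity`):
```python
# Set dtypes and parse dates while reading (column names are placeholders)
df = pd.read_csv(
    'data.csv',
    parse_dates=['date'],               # convert to datetime on load
    dtype={'category': 'category',      # smaller memory footprint
           'price': 'float64'},
    usecols=['date', 'category', 'price', 'quantity'],
)
print(df.dtypes)
```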
## Data Cleaning
```python
# Handle missing values
df = df.dropna() # Drop rows with any null
df = df.dropna(subset=['important_col']) # Drop if specific column null
df['col'] = df['col'].fillna(0) # Fill with value
df['col'] = df['col'].fillna(df['col'].mean()) # Fill with mean
# Remove duplicates
df = df.drop_duplicates()
df = df.drop_duplicates(subset=['id'], keep='first')
# Fix data types
df['date'] = pd.to_datetime(df['date'])
df['price'] = df['price'].astype(float)
df['category'] = df['category'].astype('category')
# Handle outliers
from scipy import stats
df = df[(np.abs(stats.zscore(df['value'])) < 3)] # Remove outliers
# Rename columns
df = df.rename(columns={'old_name': 'new_name'})
df.columns = df.columns.str.lower().str.replace(' ', '_')
```
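The z-score filter above assumes roughly normal data. An IQR-based filter is a common alternative for skewed distributions; a minimal sketch, reusing the placeholder `value` column:
```python
# IQR-based outlier filter (alternative to the z-score rule)
q1, q3 = df['value'].quantile([0.25, 0.75])
iqr = q3 - q1
df = df[df['value'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
```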
## Data Transformation
```python
# Create new columns
df['total'] = df['quantity'] * df['price']
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
# Apply functions
df['normalized'] = df['value'] / df['value'].max()
df['category_encoded'] = df['category'].map({'A': 1, 'B': 2, 'C': 3})
# Groupby aggregations
summary = df.groupby('category').agg({
    'sales': ['sum', 'mean', 'count'],
    'profit': 'sum'
}).round(2)
# Pivot tables
pivot = df.pivot_table(
    values='sales',
    index='category',
    columns='region',
    aggfunc='sum',
    fill_value=0
)
# Merge datasets
merged = pd.merge(df1, df2, on='id', how='left')
```
## Exploratory Data Analysis
```python
# Distribution analysis
print(df['value'].value_counts())
print(df['value'].value_counts(normalize=True)) # Percentages
# Correlation
correlation = df[['col1', 'col2', 'col3']].corr()
# Cross-tabulation
pd.crosstab(df['category'], df['region'], normalize='index')
```
## Visualization
```python
import matplotlib.pyplot as plt
import seaborn as sns
# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
# Histogram
plt.hist(df['value'], bins=30, edgecolor='black')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of Values')
plt.show()
# Boxplot
sns.boxplot(data=df, x='category', y='value')
plt.title('Value Distribution by Category')
plt.show()
# Time series
df.set_index('date')['sales'].plot()
plt.title('Sales Over Time')
plt.show()
# Heatmap
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()
# Scatter plot
sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
plt.title('X vs Y by Category')
plt.show()
```
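For step 5 (Report Findings), figures usually need to be written to disk rather than only shown interactively. A minimal sketch reusing the boxplot above (the output filename is arbitrary):
```python
# Save a figure for a report instead of only displaying it
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='category', y='value', ax=ax)
ax.set_title('Value Distribution by Category')
fig.tight_layout()
fig.savefig('value_by_category.png', dpi=150)
plt.close(fig)
```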
## Statistical Analysis
```python
from scipy import stats
# T-test
group1 = df[df['category'] == 'A']['value']
group2 = df[df['category'] == 'B']['value']
t_stat, p_value = stats.ttest_ind(group1, group2)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
# Chi-square test
contingency_table = pd.crosstab(df['category'], df['outcome'])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-square: {chi2:.4f}, P-value: {p_value:.4f}")
# Linear regression
from sklearn.linear_model import LinearRegression
X = df[['feature1', 'feature2']]
y = df['target']
model = LinearRegression()
model.fit(X, y)
print(f"R² Score: {model.score(X, y):.4f}")
print(f"Coefficients: {model.coef_}")
```
## Best Practices
### 1. **Reproducibility**
```python
# Set random seed
np.random.seed(42)
# Save processed data
df.to_csv('cleaned_data.csv', index=False)
# Document transformations
# Use Jupyter notebooks with markdown cells
```
### 2. **Data Quality Checks**
```python
def validate_data(df):
    """Validate data quality"""
    checks = {
        'null_values': df.isnull().sum().sum() == 0,
        'duplicates': df.duplicated().sum() == 0,
        'date_range_valid': df['date'].min() > pd.Timestamp('2000-01-01'),
        'no_negative_prices': (df['price'] >= 0).all()
    }
    return all(checks.values()), checks

is_valid, results = validate_data(df)
print(f"Data valid: {is_valid}")
print(results)
```
### 3. **Performance**
```python
# Use vectorized operations
df['result'] = df['a'] * df['b'] # ✅ Fast
# Avoid loops
for i in range(len(df)):  # ❌ Slow
    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']
# Use appropriate data types
df['category'] = df['category'].astype('category') # Saves memory
# Chunk large files
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk)
```
## Analysis Workflow Template
```python
# 1. LOAD DATA
df = pd.read_csv('data.csv')
# 2. INITIAL INSPECTION
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Nulls: \n{df.isnull().sum()}")
# 3. CLEAN DATA
df = df.drop_duplicates()
df = df.dropna(subset=['critical_column'])
df['date'] = pd.to_datetime(df['date'])
# 4. TRANSFORM
df['month'] = df['date'].dt.month
df['total'] = df['quantity'] * df['price']
# 5. EXPLORE
print(df['category'].value_counts())
print(df[['sales', 'profit']].describe())
# 6. VISUALIZE
df.groupby('month')['sales'].sum().plot(kind='bar')
plt.title('Monthly Sales')
plt.show()
# 7. ANALYZE
monthly_avg = df.groupby('month')['sales'].mean()
print(f"Average monthly sales: ${monthly_avg.mean():.2f}")
# 8. EXPORT RESULTS
results = df.groupby('category').agg({
    'sales': 'sum',
    'profit': 'sum'
})
results.to_csv('results.csv')
```
## When to Use This Skill
- Analyzing CSV/Excel datasets
- Creating data visualizations
- Performing statistical analysis
- Cleaning and transforming data
- Generating data reports
- Exploratory data analysis
---
**Remember**: Good data analysis is methodical - inspect, clean, explore, analyze, visualize, report!

commands/ml-engineer.md Normal file

@@ -0,0 +1,229 @@
# ML Engineer
**Description**: Build, train, and deploy machine learning models following MLOps best practices
## ML Project Workflow
1. **Define Problem** - Classification, regression, clustering?
2. **Prepare Data** - Clean, split, scale
3. **Choose Model** - Select appropriate algorithm
4. **Train Model** - Fit to training data
5. **Evaluate** - Test performance
6. **Tune** - Optimize hyperparameters
7. **Deploy** - Put model in production (a minimal serving sketch follows Save & Load Models below)
## Data Preparation
```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Load data
X = df[['feature1', 'feature2', 'feature3']]
y = df['target']
# Split data (80/20 train/test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Use same scaler!
# Encode categorical labels (fit on train, reuse the same encoder for test)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
```
## Classification Models
```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)
# Evaluate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
```
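The predicted probabilities computed above can also be scored directly. A minimal sketch, assuming a binary target; for multiclass, pass the full probability matrix with `multi_class='ovr'`:
```python
from sklearn.metrics import roc_auc_score

# ROC AUC from predicted probabilities (binary case: column 1 = positive class)
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba[:, 1]):.4f}")
```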
## Regression Models
```python
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Train model
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
# Predict
y_pred = model.predict(X_test_scaled)
# Evaluate
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")
```
## Model Selection & Comparison
```python
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
}
for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
```
## Hyperparameter Tuning
```python
from sklearn.model_selection import GridSearchCV
# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# Grid search
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1
)
grid_search.fit(X_train_scaled, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
# Use best model
best_model = grid_search.best_estimator_
```
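Grid-search scores come from cross-validation on the training data, so the tuned model still needs a final check on the held-out test set, as in the sketch below.
```python
# Final evaluation of the tuned model on the held-out test set
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
```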
## Feature Importance
```python
import pandas as pd
import matplotlib.pyplot as plt
# For tree-based models
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values('importance', ascending=False)
print(feature_importance_df)
# Plot
plt.barh(feature_importance_df['feature'], feature_importance_df['importance'])
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.show()
```
## Save & Load Models
```python
import joblib
# Save model
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
# Load model
loaded_model = joblib.load('model.pkl')
loaded_scaler = joblib.load('scaler.pkl')
# Make predictions on new data (value1, value2, value3 are placeholder feature values)
new_data = [[value1, value2, value3]]
new_data_scaled = loaded_scaler.transform(new_data)
prediction = loaded_model.predict(new_data_scaled)
```
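For step 7 of the workflow (Deploy), the saved artifacts can be wrapped in a small inference service. A minimal sketch using FastAPI, which is only one assumed option among many (Flask, batch scoring jobs, managed endpoints); the feature names are placeholders matching the example above:
```python
# serve.py - minimal inference endpoint (FastAPI is an assumed choice, not prescribed here)
import joblib
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()
model = joblib.load('model.pkl')    # artifacts saved above
scaler = joblib.load('scaler.pkl')

class Features(BaseModel):
    feature1: float
    feature2: float
    feature3: float

@app.post('/predict')
def predict(features: Features):
    row = [[features.feature1, features.feature2, features.feature3]]
    prediction = model.predict(scaler.transform(row))
    return {'prediction': prediction.tolist()[0]}

# Run locally with: uvicorn serve:app --reload
```
In production, model and scaler versions would typically be tracked and validated alongside the code, in line with the MLOps practices this skill targets.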
## Best Practices
### 1. **Always Split Data Before Scaling**
```python
# ✅ Correct order
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler.fit(X_train)  # Fit only on training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# ❌ Wrong: Data leakage
scaler.fit(X) # Don't fit on all data!
```
### 2. **Use Cross-Validation**
```python
from sklearn.model_selection import cross_validate
cv_results = cross_validate(
    model, X, y,
    cv=5,
    scoring=['accuracy', 'f1_weighted'],
    return_train_score=True
)
```
### 3. **Handle Imbalanced Data**
```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
# Compute class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))
# Pass the computed weights to the model
# (class_weight='balanced' computes the same weights automatically)
model = RandomForestClassifier(class_weight=class_weight_dict)
```
## When to Use This Skill
- Building predictive models
- Training classification/regression models
- Evaluating model performance
- Tuning hyperparameters
- Deploying ML models
---
**Remember**: Good ML is methodical - clean data, proper splits, cross-validation, and thorough evaluation!

plugin.lock.json Normal file

@@ -0,0 +1,49 @@
{
"$schema": "internal://schemas/plugin.lock.v1.json",
"pluginId": "gh:samuelgarrett/claude-code-plugin-test:data-science-skills",
"normalized": {
"repo": null,
"ref": "refs/tags/v20251128.0",
"commit": "f6847456ddc62cac0eb1e1b8dddaa54039b033c7",
"treeHash": "6dfce7936856643508dfa8adb47db92b785f55207198d193f4644b1fb66e2f49",
"generatedAt": "2025-11-28T10:28:07.842177Z",
"toolVersion": "publish_plugins.py@0.2.0"
},
"origin": {
"remote": "git@github.com:zhongweili/42plugin-data.git",
"branch": "master",
"commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
"repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
},
"manifest": {
"name": "data-science-skills",
"description": "Specialized skills for data analysis, machine learning, and scientific computing with Python",
"version": "1.0.0"
},
"content": {
"files": [
{
"path": "README.md",
"sha256": "447a86922c3649863baae5713109d4fc1238099d40db4097055fb71dd82d0c38"
},
{
"path": ".claude-plugin/plugin.json",
"sha256": "0d93ceea2a14453493c862437164a0e0c864859488f362f2c974eba0c2a080f2"
},
{
"path": "commands/data-analyst.md",
"sha256": "4bcafbe49524548efdde7d8b76e1ba7721e173355d633371d8d504d3058cf802"
},
{
"path": "commands/ml-engineer.md",
"sha256": "38a495d7c619a6c4e09ddc754f1ea3ebb1682ccd361f4326315d334c10a5d123"
}
],
"dirSha256": "6dfce7936856643508dfa8adb47db92b785f55207198d193f4644b1fb66e2f49"
},
"security": {
"scannedAt": null,
"scannerVersion": null,
"flags": []
}
}