commit df7d982d8059f899a505c330268c08645fea744b
Author: Zhongwei Li
Date:   Sun Nov 30 08:53:48 2025 +0800

    Initial commit

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
new file mode 100644
index 0000000..6b68147
--- /dev/null
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,11 @@
+{
+  "name": "data-science-skills",
+  "description": "Specialized skills for data analysis, machine learning, and scientific computing with Python",
+  "version": "1.0.0",
+  "author": {
+    "name": "Claude Skills Marketplace"
+  },
+  "commands": [
+    "./commands"
+  ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d300176
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# data-science-skills
+
+Specialized skills for data analysis, machine learning, and scientific computing with Python
diff --git a/commands/data-analyst.md b/commands/data-analyst.md
new file mode 100644
index 0000000..65ae641
--- /dev/null
+++ b/commands/data-analyst.md
@@ -0,0 +1,274 @@
+# Data Analyst
+
+**Description**: Perform systematic data analysis using Python, pandas, and visualization libraries
+
+## Core Workflow
+
+1. **Load & Inspect Data**
+2. **Clean & Transform Data**
+3. **Explore & Visualize**
+4. **Analyze & Model**
+5. **Report Findings**
+
+## Data Loading & Inspection
+
+```python
+import pandas as pd
+import numpy as np
+
+# Load data
+df = pd.read_csv('data.csv')
+# df = pd.read_excel('data.xlsx')
+# df = pd.read_sql(query, connection)
+# df = pd.read_json('data.json')
+
+# Initial inspection
+print(df.shape)             # (rows, columns)
+print(df.info())            # Data types, null counts
+print(df.describe())        # Statistical summary
+print(df.head())            # First 5 rows
+print(df.columns.tolist())  # Column names
+print(df.dtypes)            # Data types
+
+# Check for missing data
+print(df.isnull().sum())
+print(df.duplicated().sum())
+```
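+
+The inspection calls above can be wrapped in one small helper that gives a consolidated per-column overview; a minimal sketch (the `inspect_overview` name and the chosen summary columns are illustrative, not a standard pandas API):
+
+```python
+import pandas as pd
+
+def inspect_overview(df: pd.DataFrame) -> pd.DataFrame:
+    """One row per column: dtype, percentage of nulls, and number of unique values."""
+    return pd.DataFrame({
+        'dtype': df.dtypes.astype(str),
+        'null_pct': (df.isnull().mean() * 100).round(2),
+        'n_unique': df.nunique(),
+    })
+
+# print(inspect_overview(df)) right after loading gives a quick data-quality snapshot
+```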
+
+## Data Cleaning
+
+```python
+# Handle missing values
+df = df.dropna()                                # Drop rows with any null
+df = df.dropna(subset=['important_col'])        # Drop if specific column null
+df['col'] = df['col'].fillna(0)                 # Fill with value
+df['col'] = df['col'].fillna(df['col'].mean())  # Fill with mean
+
+# Remove duplicates
+df = df.drop_duplicates()
+df = df.drop_duplicates(subset=['id'], keep='first')
+
+# Fix data types
+df['date'] = pd.to_datetime(df['date'])
+df['price'] = df['price'].astype(float)
+df['category'] = df['category'].astype('category')
+
+# Handle outliers
+from scipy import stats
+df = df[(np.abs(stats.zscore(df['value'])) < 3)]  # Remove outliers
+
+# Rename columns
+df = df.rename(columns={'old_name': 'new_name'})
+df.columns = df.columns.str.lower().str.replace(' ', '_')
+```
+
+## Data Transformation
+
+```python
+# Create new columns
+df['total'] = df['quantity'] * df['price']
+df['month'] = df['date'].dt.month
+df['year'] = df['date'].dt.year
+
+# Apply functions
+df['normalized'] = df['value'] / df['value'].max()
+df['category_encoded'] = df['category'].map({'A': 1, 'B': 2, 'C': 3})
+
+# Groupby aggregations
+summary = df.groupby('category').agg({
+    'sales': ['sum', 'mean', 'count'],
+    'profit': 'sum'
+}).round(2)
+
+# Pivot tables
+pivot = df.pivot_table(
+    values='sales',
+    index='category',
+    columns='region',
+    aggfunc='sum',
+    fill_value=0
+)
+
+# Merge datasets
+merged = pd.merge(df1, df2, on='id', how='left')
+```
+
+## Exploratory Data Analysis
+
+```python
+# Distribution analysis
+print(df['value'].value_counts())
+print(df['value'].value_counts(normalize=True))  # Percentages
+
+# Correlation
+correlation = df[['col1', 'col2', 'col3']].corr()
+
+# Cross-tabulation
+pd.crosstab(df['category'], df['region'], normalize='index')
+```
+
+## Visualization
+
+```python
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Set style
+sns.set_style('whitegrid')
+plt.rcParams['figure.figsize'] = (12, 6)
+
+# Histogram
+plt.hist(df['value'], bins=30, edgecolor='black')
+plt.xlabel('Value')
+plt.ylabel('Frequency')
+plt.title('Distribution of Values')
+plt.show()
+
+# Boxplot
+sns.boxplot(data=df, x='category', y='value')
+plt.title('Value Distribution by Category')
+plt.show()
+
+# Time series
+df.set_index('date')['sales'].plot()
+plt.title('Sales Over Time')
+plt.show()
+
+# Heatmap
+sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
+plt.title('Correlation Matrix')
+plt.show()
+
+# Scatter plot
+sns.scatterplot(data=df, x='x_value', y='y_value', hue='category')
+plt.title('X vs Y by Category')
+plt.show()
+```
+
+## Statistical Analysis
+
+```python
+from scipy import stats
+
+# T-test
+group1 = df[df['category'] == 'A']['value']
+group2 = df[df['category'] == 'B']['value']
+t_stat, p_value = stats.ttest_ind(group1, group2)
+print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")
+
+# Chi-square test
+contingency_table = pd.crosstab(df['category'], df['outcome'])
+chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
+print(f"Chi-square: {chi2:.4f}, P-value: {p_value:.4f}")
+
+# Linear regression
+from sklearn.linear_model import LinearRegression
+X = df[['feature1', 'feature2']]
+y = df['target']
+model = LinearRegression()
+model.fit(X, y)
+print(f"R² Score: {model.score(X, y):.4f}")
+print(f"Coefficients: {model.coef_}")
+```
+
+## Best Practices
+
+### 1. **Reproducibility**
+```python
+# Set random seed
+np.random.seed(42)
+
+# Save processed data
+df.to_csv('cleaned_data.csv', index=False)
+
+# Document transformations
+# Use Jupyter notebooks with markdown cells
+```
+
+### 2. **Data Quality Checks**
+```python
+def validate_data(df):
+    """Validate data quality"""
+    checks = {
+        'null_values': df.isnull().sum().sum() == 0,
+        'duplicates': df.duplicated().sum() == 0,
+        'date_range_valid': df['date'].min() > pd.Timestamp('2000-01-01'),
+        'no_negative_prices': (df['price'] >= 0).all()
+    }
+    return all(checks.values()), checks
+
+is_valid, results = validate_data(df)
+print(f"Data valid: {is_valid}")
+print(results)
+```
+
+### 3. **Performance**
+```python
+# Use vectorized operations
+df['result'] = df['a'] * df['b']  # ✅ Fast
+
+# Avoid loops
+for i in range(len(df)):  # ❌ Slow
+    df.loc[i, 'result'] = df.loc[i, 'a'] * df.loc[i, 'b']
+
+# Use appropriate data types
+df['category'] = df['category'].astype('category')  # Saves memory
+
+# Chunk large files
+for chunk in pd.read_csv('large_file.csv', chunksize=10000):
+    process(chunk)
+```
+
+## Analysis Workflow Template
+
+```python
+# 1. LOAD DATA
+df = pd.read_csv('data.csv')
+
+# 2. INITIAL INSPECTION
+print(f"Shape: {df.shape}")
+print(f"Columns: {df.columns.tolist()}")
+print(f"Nulls: \n{df.isnull().sum()}")
+
+# 3. CLEAN DATA
+df = df.drop_duplicates()
+df = df.dropna(subset=['critical_column'])
+df['date'] = pd.to_datetime(df['date'])
+
+# 4. TRANSFORM
+df['month'] = df['date'].dt.month
+df['total'] = df['quantity'] * df['price']
+
+# 5. EXPLORE
+print(df['category'].value_counts())
+print(df[['sales', 'profit']].describe())
+
+# 6. VISUALIZE
+df.groupby('month')['sales'].sum().plot(kind='bar')
+plt.title('Monthly Sales')
+plt.show()
+
+# 7. ANALYZE
+monthly_avg = df.groupby('month')['sales'].mean()
+print(f"Average monthly sales: ${monthly_avg.mean():.2f}")
+
+# 8. EXPORT RESULTS
+results = df.groupby('category').agg({
+    'sales': 'sum',
+    'profit': 'sum'
+})
+results.to_csv('results.csv')
+```
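+
+For the reporting step, the exported aggregates can also be turned into a short plain-text summary saved next to `results.csv`; a minimal sketch continuing the template above (the `summary.txt` file name and the wording are illustrative):
+
+```python
+# 9. REPORT FINDINGS
+top_category = results['sales'].idxmax()
+with open('summary.txt', 'w') as f:
+    f.write(f"Rows analyzed: {len(df)}\n")
+    f.write(f"Total sales: {results['sales'].sum():.2f}\n")
+    f.write(f"Top category by sales: {top_category}\n")
+```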
+
+## When to Use This Skill
+
+- Analyzing CSV/Excel datasets
+- Creating data visualizations
+- Performing statistical analysis
+- Cleaning and transforming data
+- Generating data reports
+- Exploratory data analysis
+
+---
+
+**Remember**: Good data analysis is methodical - inspect, clean, explore, analyze, visualize, report!
diff --git a/commands/ml-engineer.md b/commands/ml-engineer.md
new file mode 100644
index 0000000..045e905
--- /dev/null
+++ b/commands/ml-engineer.md
@@ -0,0 +1,229 @@
+# ML Engineer
+
+**Description**: Build, train, and deploy machine learning models following MLOps best practices
+
+## ML Project Workflow
+
+1. **Define Problem** - Classification, regression, clustering?
+2. **Prepare Data** - Clean, split, scale
+3. **Choose Model** - Select appropriate algorithm
+4. **Train Model** - Fit to training data
+5. **Evaluate** - Test performance
+6. **Tune** - Optimize hyperparameters
+7. **Deploy** - Put model in production
+
+## Data Preparation
+
+```python
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+
+# Load data
+X = df[['feature1', 'feature2', 'feature3']]
+y = df['target']
+
+# Split data (80/20 train/test)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y
+)
+
+# Scale features
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)  # Use same scaler!
+
+# Encode categorical labels
+label_encoder = LabelEncoder()
+y_train_encoded = label_encoder.fit_transform(y_train)
+```
+
+## Classification Models
+
+```python
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report, confusion_matrix
+
+# Train model
+model = RandomForestClassifier(n_estimators=100, random_state=42)
+model.fit(X_train_scaled, y_train)
+
+# Make predictions
+y_pred = model.predict(X_test_scaled)
+y_pred_proba = model.predict_proba(X_test_scaled)
+
+# Evaluate
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+
+print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
+print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
+print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
+print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
+
+print("\nClassification Report:")
+print(classification_report(y_test, y_pred))
+
+print("\nConfusion Matrix:")
+print(confusion_matrix(y_test, y_pred))
+```
+
+## Regression Models
+
+```python
+import numpy as np
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
+
+# Train model
+model = GradientBoostingRegressor(n_estimators=100, random_state=42)
+model.fit(X_train_scaled, y_train)
+
+# Predict
+y_pred = model.predict(X_test_scaled)
+
+# Evaluate
+mse = mean_squared_error(y_test, y_pred)
+rmse = np.sqrt(mse)
+mae = mean_absolute_error(y_test, y_pred)
+r2 = r2_score(y_test, y_pred)
+
+print(f"RMSE: {rmse:.4f}")
+print(f"MAE: {mae:.4f}")
+print(f"R² Score: {r2:.4f}")
+```
+
+## Model Selection & Comparison
+
+```python
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.model_selection import cross_val_score
+
+models = {
+    'Logistic Regression': LogisticRegression(),
+    'Random Forest': RandomForestClassifier(),
+    'Gradient Boosting': GradientBoostingClassifier(),
+}
+
+for name, model in models.items():
+    scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
+    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
+```
+
+## Hyperparameter Tuning
+
+```python
+from sklearn.model_selection import GridSearchCV
+
+# Define parameter grid
+param_grid = {
+    'n_estimators': [50, 100, 200],
+    'max_depth': [None, 10, 20],
+    'min_samples_split': [2, 5, 10]
+}
+
+# Grid search
+grid_search = GridSearchCV(
+    RandomForestClassifier(random_state=42),
+    param_grid,
+    cv=5,
+    scoring='f1_weighted',
+    n_jobs=-1
+)
+
+grid_search.fit(X_train_scaled, y_train)
+
+print(f"Best parameters: {grid_search.best_params_}")
+print(f"Best score: {grid_search.best_score_:.4f}")
+
+# Use best model
+best_model = grid_search.best_estimator_
+```
+
+## Feature Importance
+
+```python
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# For tree-based models
+importances = model.feature_importances_
+feature_importance_df = pd.DataFrame({
+    'feature': X.columns,
+    'importance': importances
+}).sort_values('importance', ascending=False)
+
+print(feature_importance_df)
+
+# Plot
+plt.barh(feature_importance_df['feature'], feature_importance_df['importance'])
+plt.xlabel('Importance')
+plt.title('Feature Importance')
+plt.show()
+```
+
+## Save & Load Models
+
+```python
+import joblib
+
+# Save model
+joblib.dump(model, 'model.pkl')
+joblib.dump(scaler, 'scaler.pkl')
+
+# Load model
+loaded_model = joblib.load('model.pkl')
+loaded_scaler = joblib.load('scaler.pkl')
+
+# Make predictions (replace value1..value3 with real feature values)
+new_data = [[value1, value2, value3]]
+new_data_scaled = loaded_scaler.transform(new_data)
+prediction = loaded_model.predict(new_data_scaled)
+```
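+
+To avoid tracking a separate scaler and model, the preprocessing and the estimator can be bundled into a single scikit-learn pipeline and saved as one artifact; a minimal sketch (the `pipeline.pkl` file name is illustrative):
+
+```python
+import joblib
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+
+# Scaling and the model travel together, so training and inference stay consistent
+pipeline = make_pipeline(
+    StandardScaler(),
+    RandomForestClassifier(n_estimators=100, random_state=42)
+)
+pipeline.fit(X_train, y_train)  # raw (unscaled) features go in
+print(pipeline.score(X_test, y_test))
+
+# One artifact instead of model.pkl + scaler.pkl
+joblib.dump(pipeline, 'pipeline.pkl')
+```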
+
+## Best Practices
+
+### 1. **Always Split Data Before Scaling**
+```python
+# ✅ Correct order
+X_train, X_test, y_train, y_test = train_test_split(X, y)
+scaler.fit(X_train)  # Fit only on training
+X_train_scaled = scaler.transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+# ❌ Wrong: Data leakage
+scaler.fit(X)  # Don't fit on all data!
+```
+
+### 2. **Use Cross-Validation**
+```python
+from sklearn.model_selection import cross_validate
+
+cv_results = cross_validate(
+    model, X, y,
+    cv=5,
+    scoring=['accuracy', 'f1_weighted'],
+    return_train_score=True
+)
+```
+
+### 3. **Handle Imbalanced Data**
+```python
+import numpy as np
+from sklearn.utils.class_weight import compute_class_weight
+
+# Compute class weights (inverse to class frequency)
+class_weights = compute_class_weight(
+    'balanced',
+    classes=np.unique(y_train),
+    y=y_train
+)
+weight_dict = dict(zip(np.unique(y_train), class_weights))
+
+# Use in model (class_weight='balanced' applies the same weights automatically)
+model = RandomForestClassifier(class_weight=weight_dict)
+```
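+
+## Deploying a Saved Model
+
+The deployment step from the workflow above typically means loading the saved artifacts once and exposing a small prediction function to the serving layer; a minimal sketch (the `predict_one` name and feature list are illustrative, and `model.pkl`/`scaler.pkl` are the files saved earlier):
+
+```python
+import joblib
+import pandas as pd
+
+# Load artifacts once at startup, not on every request
+model = joblib.load('model.pkl')
+scaler = joblib.load('scaler.pkl')
+FEATURES = ['feature1', 'feature2', 'feature3']
+
+def predict_one(record: dict):
+    """Score a single record given as {feature_name: value}; returns the predicted label."""
+    row = pd.DataFrame([record], columns=FEATURES)
+    return model.predict(scaler.transform(row))[0]
+
+# predict_one({'feature1': 1.2, 'feature2': 0.4, 'feature3': 7.0})
+```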
+
+## When to Use This Skill
+
+- Building predictive models
+- Training classification/regression models
+- Evaluating model performance
+- Tuning hyperparameters
+- Deploying ML models
+
+---
+
+**Remember**: Good ML is methodical - clean data, proper splits, cross-validation, and thorough evaluation!
diff --git a/plugin.lock.json b/plugin.lock.json
new file mode 100644
index 0000000..63171f7
--- /dev/null
+++ b/plugin.lock.json
@@ -0,0 +1,49 @@
+{
+  "$schema": "internal://schemas/plugin.lock.v1.json",
+  "pluginId": "gh:samuelgarrett/claude-code-plugin-test:data-science-skills",
+  "normalized": {
+    "repo": null,
+    "ref": "refs/tags/v20251128.0",
+    "commit": "f6847456ddc62cac0eb1e1b8dddaa54039b033c7",
+    "treeHash": "6dfce7936856643508dfa8adb47db92b785f55207198d193f4644b1fb66e2f49",
+    "generatedAt": "2025-11-28T10:28:07.842177Z",
+    "toolVersion": "publish_plugins.py@0.2.0"
+  },
+  "origin": {
+    "remote": "git@github.com:zhongweili/42plugin-data.git",
+    "branch": "master",
+    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
+    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
+  },
+  "manifest": {
+    "name": "data-science-skills",
+    "description": "Specialized skills for data analysis, machine learning, and scientific computing with Python",
+    "version": "1.0.0"
+  },
+  "content": {
+    "files": [
+      {
+        "path": "README.md",
+        "sha256": "447a86922c3649863baae5713109d4fc1238099d40db4097055fb71dd82d0c38"
+      },
+      {
+        "path": ".claude-plugin/plugin.json",
+        "sha256": "0d93ceea2a14453493c862437164a0e0c864859488f362f2c974eba0c2a080f2"
+      },
+      {
+        "path": "commands/data-analyst.md",
+        "sha256": "4bcafbe49524548efdde7d8b76e1ba7721e173355d633371d8d504d3058cf802"
+      },
+      {
+        "path": "commands/ml-engineer.md",
+        "sha256": "38a495d7c619a6c4e09ddc754f1ea3ebb1682ccd361f4326315d334c10a5d123"
+      }
+    ],
+    "dirSha256": "6dfce7936856643508dfa8adb47db92b785f55207198d193f4644b1fb66e2f49"
+  },
+  "security": {
+    "scannedAt": null,
+    "scannerVersion": null,
+    "flags": []
+  }
+}
\ No newline at end of file