commit 9174377a09347090e85ff606ca7967373c56dfd4 Author: Zhongwei Li Date: Sat Nov 29 18:51:40 2025 +0800 Initial commit diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..7b11ee7 --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,15 @@ +{ + "name": "model-evaluation-suite", + "description": "Comprehensive model evaluation with multiple metrics", + "version": "1.0.0", + "author": { + "name": "Claude Code Plugins", + "email": "[email protected]" + }, + "skills": [ + "./skills" + ], + "commands": [ + "./commands" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..0cde9d8 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# model-evaluation-suite + +Comprehensive model evaluation with multiple metrics diff --git a/commands/eval-model.md b/commands/eval-model.md new file mode 100644 index 0000000..7e9e4fa --- /dev/null +++ b/commands/eval-model.md @@ -0,0 +1,15 @@ +--- +description: Execute AI/ML task with intelligent automation +--- + +# AI/ML Task Executor + +You are an AI/ML specialist. When this command is invoked: + +1. Analyze the current context and requirements +2. Generate appropriate code for the ML task +3. Include data validation and error handling +4. Provide performance metrics and insights +5. Save artifacts and generate documentation + +Support modern ML frameworks and best practices. diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..9284fe0 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,65 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:jeremylongshore/claude-code-plugins-plus:plugins/ai-ml/model-evaluation-suite", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "b9f507f94903142e330be3d625a0e9c6a6a26aca", + "treeHash": "09d58c7674d9a864e0dd2319f4827618cdb40111ff5a0730ebbe30e17ad67935", + "generatedAt": "2025-11-28T10:18:35.137876Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "model-evaluation-suite", + "description": "Comprehensive model evaluation with multiple metrics", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "85e425c62ac2bcd0eed67c58fd7c7824732fe8d408f44a6ad16b3cfb8633bcf8" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "f26b867c0c9ee313df48d0d58b045b2c4cb4baf440e6831bedb644ab0f826abd" + }, + { + "path": "commands/eval-model.md", + "sha256": "043efb83e2f02fc6d0869c8a3a7388d6e49f6c809292b93dd6a97a1b142e5647" + }, + { + "path": "skills/model-evaluation-suite/SKILL.md", + "sha256": "6c40f9a443496c7e83f0ee00aafb159fb137e3b0007c2fd3e642050f61b12fad" + }, + { + "path": "skills/model-evaluation-suite/references/README.md", + "sha256": "c93bef278e79b528ed49b4c7de04867600b23eb111c0c1bf8aab87940a621ebb" + }, + { + "path": "skills/model-evaluation-suite/scripts/README.md", + "sha256": "f3aca6b5395d192bf3ead4514aa5aeb5148cad7092ef6d7ccb88f9801a7bc0ee" + }, + { + "path": "skills/model-evaluation-suite/assets/visualization_script.py", + "sha256": "e416c0d736295ac95074885c2151ac9b6ee1314df44fd56e479701e4c4de442b" + }, + { + "path": "skills/model-evaluation-suite/assets/README.md", + "sha256": "f72fa6ee871a203b534e9df91ec8660909915c788440ea72940ad18fdd4d8c32" + } + ], 
+ "dirSha256": "09d58c7674d9a864e0dd2319f4827618cdb40111ff5a0730ebbe30e17ad67935" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/skills/model-evaluation-suite/SKILL.md b/skills/model-evaluation-suite/SKILL.md new file mode 100644 index 0000000..8937501 --- /dev/null +++ b/skills/model-evaluation-suite/SKILL.md @@ -0,0 +1,55 @@ +--- +name: evaluating-machine-learning-models +description: | + This skill allows Claude to evaluate machine learning models using a comprehensive suite of metrics. It should be used when the user requests model performance analysis, validation, or testing. Claude can use this skill to assess model accuracy, precision, recall, F1-score, and other relevant metrics. Trigger this skill when the user mentions "evaluate model", "model performance", "testing metrics", "validation results", or requests a comprehensive "model evaluation". +allowed-tools: Read, Write, Edit, Grep, Glob, Bash +version: 1.0.0 +--- + +## Overview + +This skill empowers Claude to perform thorough evaluations of machine learning models, providing detailed performance insights. It leverages the `model-evaluation-suite` plugin to generate a range of metrics, enabling informed decisions about model selection and optimization. + +## How It Works + +1. **Analyzing Context**: Claude analyzes the user's request to identify the model to be evaluated and any specific metrics of interest. +2. **Executing Evaluation**: Claude uses the `/eval-model` command to initiate the model evaluation process within the `model-evaluation-suite` plugin. +3. **Presenting Results**: Claude presents the generated metrics and insights to the user, highlighting key performance indicators and potential areas for improvement. + +## When to Use This Skill + +This skill activates when you need to: +- Assess the performance of a machine learning model. +- Compare the performance of multiple models. +- Identify areas where a model can be improved. +- Validate a model's performance before deployment. + +## Examples + +### Example 1: Evaluating Model Accuracy + +User request: "Evaluate the accuracy of my image classification model." + +The skill will: +1. Invoke the `/eval-model` command. +2. Analyze the model's performance on a held-out dataset. +3. Report the accuracy score and other relevant metrics. + +### Example 2: Comparing Model Performance + +User request: "Compare the F1-score of model A and model B." + +The skill will: +1. Invoke the `/eval-model` command for both models. +2. Extract the F1-score from the evaluation results. +3. Present a comparison of the F1-scores for model A and model B. + +## Best Practices + +- **Specify Metrics**: Clearly define the specific metrics of interest for the evaluation. +- **Data Validation**: Ensure the data used for evaluation is representative of the real-world data the model will encounter. +- **Interpret Results**: Provide context and interpretation of the evaluation results to facilitate informed decision-making. + +## Integration + +This skill integrates seamlessly with the `model-evaluation-suite` plugin, providing a comprehensive solution for model evaluation within the Claude Code environment. It can be combined with other skills to build automated machine learning workflows. 
\ No newline at end of file diff --git a/skills/model-evaluation-suite/assets/README.md b/skills/model-evaluation-suite/assets/README.md new file mode 100644 index 0000000..f2423cc --- /dev/null +++ b/skills/model-evaluation-suite/assets/README.md @@ -0,0 +1,7 @@ +# Assets + +Bundled resources for model-evaluation-suite skill + +- [ ] evaluation_template.md: Template for generating evaluation reports with placeholders for metrics and visualizations. +- [ ] example_dataset.csv: Example dataset for testing the evaluation process. +- [ ] visualization_script.py: Script to generate visualizations of model performance metrics. diff --git a/skills/model-evaluation-suite/assets/visualization_script.py b/skills/model-evaluation-suite/assets/visualization_script.py new file mode 100644 index 0000000..d564a97 --- /dev/null +++ b/skills/model-evaluation-suite/assets/visualization_script.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +""" +visualization_script.py + +This script generates visualizations of model performance metrics. +It supports various plot types and data formats. + +Example Usage: + To generate a scatter plot of predicted vs. actual values: + python visualization_script.py --plot_type scatter --actual_values actual.csv --predicted_values predicted.csv --output scatter_plot.png + + To generate a histogram of errors: + python visualization_script.py --plot_type histogram --errors errors.csv --output error_histogram.png +""" + +import argparse +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import os + + +def generate_scatter_plot(actual_values_path, predicted_values_path, output_path): + """ + Generates a scatter plot of actual vs. predicted values. + + Args: + actual_values_path (str): Path to the CSV file containing actual values. + predicted_values_path (str): Path to the CSV file containing predicted values. + output_path (str): Path to save the generated plot. + """ + try: + actual_values = pd.read_csv(actual_values_path).values.flatten() + predicted_values = pd.read_csv(predicted_values_path).values.flatten() + + plt.figure(figsize=(10, 8)) + sns.scatterplot(x=actual_values, y=predicted_values) + plt.xlabel("Actual Values") + plt.ylabel("Predicted Values") + plt.title("Actual vs. Predicted Values") + plt.savefig(output_path) + plt.close() + + print(f"Scatter plot saved to {output_path}") + + except FileNotFoundError as e: + print(f"Error: File not found: {e}") + except Exception as e: + print(f"Error generating scatter plot: {e}") + + +def generate_histogram(errors_path, output_path): + """ + Generates a histogram of errors. + + Args: + errors_path (str): Path to the CSV file containing errors. + output_path (str): Path to save the generated plot. + """ + try: + errors = pd.read_csv(errors_path).values.flatten() + + plt.figure(figsize=(10, 8)) + sns.histplot(errors, kde=True) # Add kernel density estimate + plt.xlabel("Error") + plt.ylabel("Frequency") + plt.title("Distribution of Errors") + plt.savefig(output_path) + plt.close() + + print(f"Histogram saved to {output_path}") + + except FileNotFoundError as e: + print(f"Error: File not found: {e}") + except Exception as e: + print(f"Error generating histogram: {e}") + + +def generate_residual_plot(actual_values_path, predicted_values_path, output_path): + """ + Generates a residual plot. + + Args: + actual_values_path (str): Path to the CSV file containing actual values. + predicted_values_path (str): Path to the CSV file containing predicted values. 
+ output_path (str): Path to save the generated plot. + """ + try: + actual_values = pd.read_csv(actual_values_path).values.flatten() + predicted_values = pd.read_csv(predicted_values_path).values.flatten() + + residuals = actual_values - predicted_values + + plt.figure(figsize=(10, 8)) + sns.scatterplot(x=predicted_values, y=residuals) + plt.xlabel("Predicted Values") + plt.ylabel("Residuals") + plt.title("Residual Plot") + plt.axhline(y=0, color='r', linestyle='--') # Add a horizontal line at y=0 + plt.savefig(output_path) + plt.close() + + print(f"Residual plot saved to {output_path}") + + except FileNotFoundError as e: + print(f"Error: File not found: {e}") + except Exception as e: + print(f"Error generating residual plot: {e}") + + +def main(): + """ + Main function to parse arguments and generate visualizations. + """ + parser = argparse.ArgumentParser( + description="Generate visualizations of model performance metrics." + ) + parser.add_argument( + "--plot_type", + type=str, + required=True, + choices=["scatter", "histogram", "residual"], + help="Type of plot to generate (scatter, histogram, residual).", + ) + parser.add_argument( + "--actual_values", + type=str, + help="Path to the CSV file containing actual values (required for scatter and residual plots).", + ) + parser.add_argument( + "--predicted_values", + type=str, + help="Path to the CSV file containing predicted values (required for scatter and residual plots).", + ) + parser.add_argument( + "--errors", + type=str, + help="Path to the CSV file containing errors (required for histogram).", + ) + parser.add_argument( + "--output", type=str, required=True, help="Path to save the generated plot." + ) + + args = parser.parse_args() + + if args.plot_type == "scatter": + if not args.actual_values or not args.predicted_values: + print( + "Error: --actual_values and --predicted_values are required for scatter plots." + ) + return + generate_scatter_plot(args.actual_values, args.predicted_values, args.output) + elif args.plot_type == "histogram": + if not args.errors: + print("Error: --errors is required for histograms.") + return + generate_histogram(args.errors, args.output) + elif args.plot_type == "residual": + if not args.actual_values or not args.predicted_values: + print( + "Error: --actual_values and --predicted_values are required for residual plots." + ) + return + generate_residual_plot(args.actual_values, args.predicted_values, args.output) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/skills/model-evaluation-suite/references/README.md b/skills/model-evaluation-suite/references/README.md new file mode 100644 index 0000000..7ec647e --- /dev/null +++ b/skills/model-evaluation-suite/references/README.md @@ -0,0 +1,7 @@ +# References + +Bundled resources for model-evaluation-suite skill + +- [ ] metrics_definitions.md: Detailed definitions and explanations of all supported evaluation metrics. +- [ ] dataset_schemas.md: Schemas for supported datasets, including required fields and data types. +- [ ] model_api_documentation.md: Documentation for the model API, including input/output formats and authentication details. 
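Note: the reference documents listed above are still unchecked placeholders. As a rough illustration of the kind of material `metrics_definitions.md` might eventually formalize, the sketch below expresses the metrics named in `SKILL.md` (accuracy, precision, recall, F1-score) directly in terms of confusion-matrix counts. The function name and signature are hypothetical and not part of the plugin.

```python
# Illustrative only: defines the core classification metrics from
# confusion-matrix counts (tp, fp, fn, tn). These names are assumptions,
# not part of the plugin's actual code.

def classification_metrics(tp: int, fp: int, fn: int, tn: int) -> dict:
    """Compute accuracy, precision, recall, and F1 from raw counts."""
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


if __name__ == "__main__":
    # Example: 90 true positives, 10 false positives, 5 false negatives, 95 true negatives.
    print(classification_metrics(tp=90, fp=10, fn=5, tn=95))
```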
diff --git a/skills/model-evaluation-suite/scripts/README.md b/skills/model-evaluation-suite/scripts/README.md new file mode 100644 index 0000000..61e39bf --- /dev/null +++ b/skills/model-evaluation-suite/scripts/README.md @@ -0,0 +1,7 @@ +# Scripts + +Bundled resources for model-evaluation-suite skill + +- [ ] evaluate_model.py: Script to execute model evaluation using specified metrics and datasets. +- [ ] data_loader.py: Script to load the model and datasets for evaluation. +- [ ] metrics_calculator.py: Script to calculate evaluation metrics based on model predictions and ground truth.
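None of the three scripts listed above exist yet (the checkboxes are unticked). A minimal sketch of what `metrics_calculator.py` could look like, assuming scikit-learn is available and predictions arrive as a CSV with `y_true` and `y_pred` columns — the file name, column names, and CLI flags here are all assumptions, not the plugin's actual interface:

```python
#!/usr/bin/env python3
# Hypothetical sketch of metrics_calculator.py: loads a predictions CSV,
# computes accuracy/precision/recall/F1, and writes a JSON report.
# Column names (y_true, y_pred) and flags are assumptions.

import argparse
import json

import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def calculate_metrics(csv_path: str) -> dict:
    """Load ground truth and predictions from a CSV and compute core metrics."""
    df = pd.read_csv(csv_path)
    y_true, y_pred = df["y_true"], df["y_pred"]
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


def main():
    parser = argparse.ArgumentParser(
        description="Compute evaluation metrics from a predictions CSV."
    )
    parser.add_argument("--predictions", required=True, help="CSV with y_true and y_pred columns.")
    parser.add_argument("--output", default="metrics.json", help="Path for the JSON metrics report.")
    args = parser.parse_args()

    metrics = calculate_metrics(args.predictions)
    with open(args.output, "w") as f:
        json.dump(metrics, f, indent=2)
    print(json.dumps(metrics, indent=2))


if __name__ == "__main__":
    main()
```

Invocation would mirror the CLI style of the bundled `visualization_script.py`, e.g. `python metrics_calculator.py --predictions predictions.csv --output metrics.json`, though the actual layout of the planned `example_dataset.csv` is not specified in this commit.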