Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:51:12 +08:00
commit cf82920fd2
11 changed files with 340 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
# Assets
Bundled resources for dataset-splitter skill
- example_dataset.csv: A small example dataset for demonstration purposes.
- split_data_config.yaml: Example configuration file for specifying dataset splitting parameters.
- dataset_schema.json: Example JSON schema for dataset validation.

View File

@@ -0,0 +1,97 @@
{
  "_comment": "This JSON schema defines the expected structure for a dataset to be split.",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Dataset Schema",
  "description": "Schema for validating dataset structure before splitting.",
  "type": "object",
  "properties": {
    "dataset_name": {
      "type": "string",
      "description": "Name of the dataset",
      "examples": ["iris_dataset"]
    },
    "file_path": {
      "type": "string",
      "description": "Path to the dataset file (e.g., CSV, JSON)",
      "examples": ["data/iris.csv"]
    },
    "file_type": {
      "type": "string",
      "description": "Type of the dataset file",
      "enum": ["csv", "json", "parquet"],
      "examples": ["csv"]
    },
    "separator": {
      "type": "string",
      "description": "Separator character for CSV files (e.g., ',', ';')",
      "default": ",",
      "examples": [","]
    },
    "target_column": {
      "type": "string",
      "description": "Name of the target/label column",
      "examples": ["species"]
    },
    "features": {
      "type": "array",
      "description": "List of feature column names",
      "items": {
        "type": "string"
      },
      "examples": [["sepal_length", "sepal_width", "petal_length", "petal_width"]]
    },
    "split_ratios": {
      "type": "object",
      "description": "Ratios for splitting the dataset into training, validation, and testing sets. The three ratios should sum to 1.0 (not enforceable in draft-07).",
      "properties": {
        "train": {
          "type": "number",
          "description": "Ratio for the training set (e.g., 0.7 for 70%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.7]
        },
        "validation": {
          "type": "number",
          "description": "Ratio for the validation set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.15]
        },
        "test": {
          "type": "number",
          "description": "Ratio for the testing set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.15]
        }
      },
      "required": ["train", "validation", "test"]
    },
    "random_state": {
      "type": "integer",
      "description": "Random seed for reproducibility",
      "default": 42,
      "examples": [42]
    },
    "stratify": {
      "type": "boolean",
      "description": "Whether to stratify the split based on the target column",
      "default": true,
      "examples": [true]
    },
    "output_directory": {
      "type": "string",
      "description": "Directory to save the split datasets",
      "default": "split_data",
      "examples": ["split_data"]
    },
    "file_name_prefix": {
      "type": "string",
      "description": "Prefix for the output file names",
      "default": "split",
      "examples": ["split"]
    }
  },
  "required": ["dataset_name", "file_path", "file_type", "target_column", "features", "split_ratios"]
}

View File

@@ -0,0 +1,23 @@
# example_dataset.csv
# This is a small example dataset to demonstrate the dataset-splitter plugin.
# It contains 10 data rows (plus a header row) and 3 columns.
# The first row is the header row, defining the column names.
# You can replace this with your own dataset.
# To use this dataset with the plugin, save it as a .csv file and place it in the same directory as your plugin execution.
# Column descriptions:
# - feature1: [DESCRIPTION OF FEATURE 1]
# - feature2: [DESCRIPTION OF FEATURE 2]
# - target: [DESCRIPTION OF TARGET VARIABLE]
feature1,feature2,target
1.0,2.0,0
3.0,4.0,1
5.0,6.0,0
7.0,8.0,1
9.0,10.0,0
11.0,12.0,1
13.0,14.0,0
15.0,16.0,1
17.0,18.0,0
19.0,20.0,1

View File

@@ -0,0 +1,42 @@
---
# Configuration file for dataset splitting

# Input dataset parameters
input_dataset:
  path: "REPLACE_ME/path/to/your/dataset.csv"  # Path to the input dataset file
  format: "csv"  # Dataset format (e.g., csv, parquet, json)
  header: true  # Whether the dataset has a header row (true/false)
  separator: ","  # Separator used in the dataset (e.g., comma, tab, semicolon)

# Splitting ratios for training, validation, and testing sets
# NOTE: the three ratios must sum to 1.0
split_ratios:
  train: 0.7  # Fraction of data for training (0.0 - 1.0)
  validation: 0.15  # Fraction of data for validation (0.0 - 1.0)
  test: 0.15  # Fraction of data for testing (0.0 - 1.0)

# Random seed for reproducibility
random_seed: 42  # Integer value for the random seed

# Output directory for split datasets
output_directory: "REPLACE_ME/path/to/output/directory"  # Directory where the split datasets will be saved

# Naming convention for output files
output_names:
  train: "train.csv"  # Name of the training dataset file
  validation: "validation.csv"  # Name of the validation dataset file
  test: "test.csv"  # Name of the test dataset file

# Optional features (e.g., stratification)
features:
  stratify: false  # Whether to stratify the split based on a specific column (true/false)
  stratify_column: "YOUR_VALUE_HERE"  # Column to use for stratification (if stratify is true)
  shuffle: true  # Whether to shuffle the data before splitting (true/false)

# Error handling configuration
error_handling:
  on_error: "warn"  # Action to take on error ("warn", "ignore", "raise") - default is warn
  log_errors: true  # Whether to log error messages to a file

# Logging configuration
logging:
  level: "INFO"  # Logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
  file: "split_data.log"  # Log file path