Initial commit

This commit is contained in:
Zhongwei Li
2025-11-29 18:51:12 +08:00
commit cf82920fd2
11 changed files with 340 additions and 0 deletions

View File

@@ -0,0 +1,7 @@
# Assets
Bundled resources for dataset-splitter skill
- example_dataset.csv: A small example dataset for demonstration purposes.
- split_data_config.yaml: Example configuration file for specifying dataset splitting parameters.
- dataset_schema.json: Example JSON schema for dataset validation.

View File

@@ -0,0 +1,97 @@
{
  "_comment": "This JSON schema defines the expected structure for a dataset to be split.",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Dataset Schema",
  "description": "Schema for validating dataset structure before splitting.",
  "type": "object",
  "properties": {
    "dataset_name": {
      "type": "string",
      "description": "Name of the dataset",
      "examples": ["iris_dataset"]
    },
    "file_path": {
      "type": "string",
      "description": "Path to the dataset file (e.g., CSV, JSON)",
      "examples": ["data/iris.csv"]
    },
    "file_type": {
      "type": "string",
      "description": "Type of the dataset file",
      "enum": ["csv", "json", "parquet"],
      "examples": ["csv"]
    },
    "separator": {
      "type": "string",
      "description": "Separator character for CSV files (e.g., ',', ';')",
      "default": ",",
      "examples": [","]
    },
    "target_column": {
      "type": "string",
      "description": "Name of the target/label column",
      "examples": ["species"]
    },
    "features": {
      "type": "array",
      "description": "List of feature column names",
      "items": {
        "type": "string"
      },
      "examples": [["sepal_length", "sepal_width", "petal_length", "petal_width"]]
    },
    "split_ratios": {
      "type": "object",
      "description": "Ratios for splitting the dataset into training, validation, and testing sets. The three ratios should sum to 1.0 (not enforceable in draft-07).",
      "properties": {
        "train": {
          "type": "number",
          "description": "Ratio for the training set (e.g., 0.7 for 70%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.7]
        },
        "validation": {
          "type": "number",
          "description": "Ratio for the validation set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.15]
        },
        "test": {
          "type": "number",
          "description": "Ratio for the testing set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.15]
        }
      },
      "required": ["train", "validation", "test"]
    },
    "random_state": {
      "type": "integer",
      "description": "Random seed for reproducibility",
      "default": 42,
      "examples": [42]
    },
    "stratify": {
      "type": "boolean",
      "description": "Whether to stratify the split based on the target column",
      "default": true,
      "examples": [true]
    },
    "output_directory": {
      "type": "string",
      "description": "Directory to save the split datasets",
      "default": "split_data",
      "examples": ["split_data"]
    },
    "file_name_prefix": {
      "type": "string",
      "description": "Prefix for the output file names",
      "default": "split",
      "examples": ["split"]
    }
  },
  "required": ["dataset_name", "file_path", "file_type", "target_column", "features", "split_ratios"]
}

View File

@@ -0,0 +1,23 @@
# example_dataset.csv
# This is a small example dataset to demonstrate the dataset-splitter plugin.
# It contains 10 data rows (plus a header row) and 3 columns.
# The first row is the header row, defining the column names.
# You can replace this with your own dataset.
# To use this dataset with the plugin, save it as a .csv file and place it in the same directory as your plugin execution.
# Column descriptions:
# - feature1: [DESCRIPTION OF FEATURE 1]
# - feature2: [DESCRIPTION OF FEATURE 2]
# - target: [DESCRIPTION OF TARGET VARIABLE]
feature1,feature2,target
1.0,2.0,0
3.0,4.0,1
5.0,6.0,0
7.0,8.0,1
9.0,10.0,0
11.0,12.0,1
13.0,14.0,0
15.0,16.0,1
17.0,18.0,0
19.0,20.0,1

View File

@@ -0,0 +1,42 @@
---
# Configuration file for dataset splitting

# Input dataset parameters
input_dataset:
  path: "REPLACE_ME/path/to/your/dataset.csv"  # Path to the input dataset file
  format: "csv"  # Dataset format (e.g., csv, parquet, json)
  header: true  # Whether the dataset has a header row (true/false)
  separator: ","  # Separator used in the dataset (e.g., comma, tab, semicolon)

# Splitting ratios for training, validation, and testing sets
# NOTE: the three ratios must sum to 1.0
split_ratios:
  train: 0.7  # Fraction of data for training (0.0 - 1.0)
  validation: 0.15  # Fraction of data for validation (0.0 - 1.0)
  test: 0.15  # Fraction of data for testing (0.0 - 1.0)

# Random seed for reproducibility
random_seed: 42  # Integer value for the random seed

# Output directory for split datasets
output_directory: "REPLACE_ME/path/to/output/directory"  # Directory where the split datasets will be saved

# Naming convention for output files
output_names:
  train: "train.csv"  # Name of the training dataset file
  validation: "validation.csv"  # Name of the validation dataset file
  test: "test.csv"  # Name of the test dataset file

# Optional features (e.g., stratification)
features:
  stratify: false  # Whether to stratify the split based on a specific column (true/false)
  stratify_column: "YOUR_VALUE_HERE"  # Column to use for stratification (if stratify is true)
  shuffle: true  # Whether to shuffle the data before splitting (true/false)

# Error handling configuration
error_handling:
  on_error: "warn"  # Action to take on error ("warn", "ignore", "raise") - default is warn
  log_errors: true  # Whether to log error messages to a file

# Logging configuration
logging:
  level: "INFO"  # Logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
  file: "split_data.log"  # Log file path