Initial commit
This commit is contained in:
7
skills/dataset-splitter/assets/README.md
Normal file
7
skills/dataset-splitter/assets/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# Assets
|
||||
|
||||
Bundled resources for dataset-splitter skill
|
||||
|
||||
- example_dataset.csv: A small example dataset for demonstration purposes.
|
||||
- split_data_config.yaml: Example configuration file for specifying dataset splitting parameters.
|
||||
- dataset_schema.json: Example JSON schema for dataset validation.
|
||||
97
skills/dataset-splitter/assets/dataset_schema.json
Normal file
97
skills/dataset-splitter/assets/dataset_schema.json
Normal file
@@ -0,0 +1,97 @@
|
||||
{
|
||||
"_comment": "This JSON schema defines the expected structure for a dataset to be split.",
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"title": "Dataset Schema",
|
||||
"description": "Schema for validating dataset structure before splitting.",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"dataset_name": {
|
||||
"type": "string",
|
||||
"description": "Name of the dataset",
|
||||
"example": "iris_dataset"
|
||||
},
|
||||
"file_path": {
|
||||
"type": "string",
|
||||
"description": "Path to the dataset file (e.g., CSV, JSON)",
|
||||
"example": "data/iris.csv"
|
||||
},
|
||||
"file_type": {
|
||||
"type": "string",
|
||||
"description": "Type of the dataset file",
|
||||
"enum": ["csv", "json", "parquet"],
|
||||
"example": "csv"
|
||||
},
|
||||
"separator": {
|
||||
"type": "string",
|
||||
"description": "Separator character for CSV files (e.g., ',', ';')",
|
||||
"default": ",",
|
||||
"example": ","
|
||||
},
|
||||
"target_column": {
|
||||
"type": "string",
|
||||
"description": "Name of the target/label column",
|
||||
"example": "species"
|
||||
},
|
||||
"features": {
|
||||
"type": "array",
|
||||
"description": "List of feature column names",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"example": ["sepal_length", "sepal_width", "petal_length", "petal_width"]
|
||||
},
|
||||
"split_ratios": {
|
||||
"type": "object",
|
||||
"description": "Ratios for splitting the dataset into training, validation, and testing sets.",
|
||||
"properties": {
|
||||
"train": {
|
||||
"type": "number",
|
||||
"description": "Ratio for the training set (e.g., 0.7 for 70%)",
|
||||
"minimum": 0,
|
||||
"maximum": 1,
|
||||
"example": 0.7
|
||||
},
|
||||
"validation": {
|
||||
"type": "number",
|
||||
"description": "Ratio for the validation set (e.g., 0.15 for 15%)",
|
||||
"minimum": 0,
|
||||
"maximum": 1,
|
||||
"example": 0.15
|
||||
},
|
||||
"test": {
|
||||
"type": "number",
|
||||
"description": "Ratio for the testing set (e.g., 0.15 for 15%)",
|
||||
"minimum": 0,
|
||||
"maximum": 1,
|
||||
"example": 0.15
|
||||
}
|
||||
},
|
||||
"required": ["train", "validation", "test"]
|
||||
},
|
||||
"random_state": {
|
||||
"type": "integer",
|
||||
"description": "Random seed for reproducibility",
|
||||
"default": 42,
|
||||
"example": 42
|
||||
},
|
||||
"stratify": {
|
||||
"type": "boolean",
|
||||
"description": "Whether to stratify the split based on the target column",
|
||||
"default": true,
|
||||
"example": true
|
||||
},
|
||||
"output_directory": {
|
||||
"type": "string",
|
||||
"description": "Directory to save the split datasets",
|
||||
"default": "split_data",
|
||||
"example": "split_data"
|
||||
},
|
||||
"file_name_prefix": {
|
||||
"type": "string",
|
||||
"description": "Prefix for the output file names",
|
||||
"default": "split",
|
||||
"example": "split"
|
||||
}
|
||||
},
|
||||
"required": ["dataset_name", "file_path", "file_type", "target_column", "features", "split_ratios"]
|
||||
}
|
||||
23
skills/dataset-splitter/assets/example_dataset.csv
Normal file
23
skills/dataset-splitter/assets/example_dataset.csv
Normal file
@@ -0,0 +1,23 @@
|
||||
# example_dataset.csv
|
||||
# This is a small example dataset to demonstrate the dataset-splitter skill.
|
||||
# It contains 10 data rows and 3 columns (feature1, feature2, target).
|
||||
# The first row is the header row, defining the column names.
|
||||
# You can replace this with your own dataset.
|
||||
# To use this dataset with the skill, save it as a .csv file and reference its path in the split configuration.
|
||||
|
||||
# Column descriptions:
|
||||
# - feature1: [DESCRIPTION OF FEATURE 1]
|
||||
# - feature2: [DESCRIPTION OF FEATURE 2]
|
||||
# - target: [DESCRIPTION OF TARGET VARIABLE]
|
||||
|
||||
feature1,feature2,target
|
||||
1.0,2.0,0
|
||||
3.0,4.0,1
|
||||
5.0,6.0,0
|
||||
7.0,8.0,1
|
||||
9.0,10.0,0
|
||||
11.0,12.0,1
|
||||
13.0,14.0,0
|
||||
15.0,16.0,1
|
||||
17.0,18.0,0
|
||||
19.0,20.0,1
|
||||
|
42
skills/dataset-splitter/assets/split_data_config.yaml
Normal file
42
skills/dataset-splitter/assets/split_data_config.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
# Configuration file for dataset splitting
|
||||
|
||||
# Input dataset parameters
|
||||
input_dataset:
|
||||
path: "REPLACE_ME/path/to/your/dataset.csv" # Path to the input dataset file
|
||||
format: "csv" # Dataset format (e.g., csv, parquet, json)
|
||||
  header: True # Whether the dataset has a header row (true/false)
|
||||
separator: "," # Separator used in the dataset (e.g., comma, tab, semicolon)
|
||||
|
||||
# Splitting ratios for training, validation, and testing sets
|
||||
split_ratios:
|
||||
  train: 0.7 # Fraction of data for training (0.0 - 1.0)
|
||||
  validation: 0.15 # Fraction of data for validation (0.0 - 1.0)
|
||||
  test: 0.15 # Fraction of data for testing (0.0 - 1.0)
|
||||
|
||||
# Random seed for reproducibility
|
||||
random_seed: 42 # Integer value for the random seed
|
||||
|
||||
# Output directory for split datasets
|
||||
output_directory: "REPLACE_ME/path/to/output/directory" # Directory where the split datasets will be saved
|
||||
|
||||
# Naming convention for output files
|
||||
output_names:
|
||||
train: "train.csv" # Name of the training dataset file
|
||||
validation: "validation.csv" # Name of the validation dataset file
|
||||
test: "test.csv" # Name of the test dataset file
|
||||
|
||||
# Optional features (e.g., stratification)
|
||||
features:
|
||||
  stratify: false # Whether to stratify the split based on a specific column (true/false)
|
||||
stratify_column: "YOUR_VALUE_HERE" # Name of the column to use for stratification (if stratify is True)
|
||||
  shuffle: true # Whether to shuffle the data before splitting (true/false)
|
||||
|
||||
# Error handling configuration
|
||||
error_handling:
|
||||
on_error: "warn" # Action to take on error ("warn", "ignore", "raise") - default is warn
|
||||
  log_errors: true # Whether to log error messages to a file
|
||||
|
||||
# Logging configuration
|
||||
logging:
|
||||
level: "INFO" # Logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
||||
file: "split_data.log" # Log file path
|
||||
Reference in New Issue
Block a user