{
  "$comment": "This JSON schema defines the expected structure for a dataset to be split.",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Dataset Schema",
  "description": "Schema for validating dataset structure before splitting.",
  "type": "object",
  "properties": {
    "dataset_name": {
      "type": "string",
      "description": "Name of the dataset",
      "examples": ["iris_dataset"]
    },
    "file_path": {
      "type": "string",
      "description": "Path to the dataset file (e.g., CSV, JSON)",
      "examples": ["data/iris.csv"]
    },
    "file_type": {
      "type": "string",
      "description": "Type of the dataset file",
      "enum": ["csv", "json", "parquet"],
      "examples": ["csv"]
    },
    "separator": {
      "type": "string",
      "description": "Separator character for CSV files (e.g., ',', ';')",
      "default": ",",
      "examples": [","]
    },
    "target_column": {
      "type": "string",
      "description": "Name of the target/label column",
      "examples": ["species"]
    },
    "features": {
      "type": "array",
      "description": "List of feature column names",
      "items": {
        "type": "string"
      },
      "examples": [
        ["sepal_length", "sepal_width", "petal_length", "petal_width"]
      ]
    },
    "split_ratios": {
      "type": "object",
      "description": "Ratios for splitting the dataset into training, validation, and testing sets.",
      "properties": {
        "train": {
          "type": "number",
          "description": "Ratio for the training set (e.g., 0.7 for 70%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.7]
        },
        "validation": {
          "type": "number",
          "description": "Ratio for the validation set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.15]
        },
        "test": {
          "type": "number",
          "description": "Ratio for the testing set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "examples": [0.15]
        }
      },
      "required": ["train", "validation", "test"]
    },
    "random_state": {
      "type": "integer",
      "description": "Random seed for reproducibility",
      "default": 42,
      "examples": [42]
    },
    "stratify": {
      "type": "boolean",
      "description": "Whether to stratify the split based on the target column",
      "default": true,
      "examples": [true]
    },
    "output_directory": {
      "type": "string",
      "description": "Directory to save the split datasets",
      "default": "split_data",
      "examples": ["split_data"]
    },
    "file_name_prefix": {
      "type": "string",
      "description": "Prefix for the output file names",
      "default": "split",
      "examples": ["split"]
    }
  },
  "required": [
    "dataset_name",
    "file_path",
    "file_type",
    "target_column",
    "features",
    "split_ratios"
  ]
}