{
  "_comment": "This JSON schema defines the expected structure for a dataset to be split.",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Dataset Schema",
  "description": "Schema for validating dataset structure before splitting.",
  "type": "object",
  "properties": {
    "dataset_name": {
      "type": "string",
      "description": "Name of the dataset",
      "example": "iris_dataset"
    },
    "file_path": {
      "type": "string",
      "description": "Path to the dataset file (e.g., CSV, JSON)",
      "example": "data/iris.csv"
    },
    "file_type": {
      "type": "string",
      "description": "Type of the dataset file",
      "enum": ["csv", "json", "parquet"],
      "example": "csv"
    },
    "separator": {
      "type": "string",
      "description": "Separator character for CSV files (e.g., ',', ';')",
      "default": ",",
      "example": ","
    },
    "target_column": {
      "type": "string",
      "description": "Name of the target/label column",
      "example": "species"
    },
    "features": {
      "type": "array",
      "description": "List of feature column names",
      "items": {
        "type": "string"
      },
      "example": ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    },
    "split_ratios": {
      "type": "object",
      "description": "Ratios for splitting the dataset into training, validation, and testing sets.",
      "properties": {
        "train": {
          "type": "number",
          "description": "Ratio for the training set (e.g., 0.7 for 70%)",
          "minimum": 0,
          "maximum": 1,
          "example": 0.7
        },
        "validation": {
          "type": "number",
          "description": "Ratio for the validation set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "example": 0.15
        },
        "test": {
          "type": "number",
          "description": "Ratio for the testing set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "example": 0.15
        }
      },
      "required": ["train", "validation", "test"]
    },
    "random_state": {
      "type": "integer",
      "description": "Random seed for reproducibility",
      "default": 42,
      "example": 42
    },
    "stratify": {
      "type": "boolean",
      "description": "Whether to stratify the split based on the target column",
      "default": true,
      "example": true
    },
    "output_directory": {
      "type": "string",
      "description": "Directory to save the split datasets",
      "default": "split_data",
      "example": "split_data"
    },
    "file_name_prefix": {
      "type": "string",
      "description": "Prefix for the output file names",
      "default": "split",
      "example": "split"
    }
  },
  "required": ["dataset_name", "file_path", "file_type", "target_column", "features", "split_ratios"]
}