Files
gh-jeremylongshore-claude-c…/skills/dataset-splitter/assets/dataset_schema.json
2025-11-29 18:51:12 +08:00

97 lines
2.8 KiB
JSON

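{
"_comment": "This JSON schema defines the expected structure of a dataset-splitting configuration: the source file, its columns, the split ratios, and the output options.",
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Dataset Schema",
"description": "Schema for validating a dataset-splitting configuration (source file, target and feature columns, split ratios, output options) before the dataset is split.",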
"type": "object",
"properties": {
"dataset_name": {
"type": "string",
"description": "Name of the dataset",
"example": "iris_dataset"
},
"file_path": {
"type": "string",
"description": "Path to the dataset file (e.g., CSV, JSON, Parquet)",
"example": "data/iris.csv"
},
"file_type": {
"type": "string",
"description": "Type of the dataset file",
"enum": ["csv", "json", "parquet"],
"example": "csv"
},
"separator": {
"type": "string",
"description": "Separator character for CSV files (e.g., ',', ';')",
"default": ",",
"example": ","
},
"target_column": {
"type": "string",
"description": "Name of the target/label column",
"example": "species"
},
"features": {
"type": "array",
"description": "List of feature column names",
"items": {
"type": "string"
},
"example": ["sepal_length", "sepal_width", "petal_length", "petal_width"]
},
"split_ratios": {
"type": "object",
"description": "Ratios for splitting the dataset into training, validation, and testing sets. Each ratio is a number between 0 and 1; this schema does not check that the three ratios sum to 1.",
"properties": {
"train": {
"type": "number",
"description": "Ratio for the training set (e.g., 0.7 for 70%)",
"minimum": 0,
"maximum": 1,
"example": 0.7
},
"validation": {
"type": "number",
"description": "Ratio for the validation set (e.g., 0.15 for 15%)",
"minimum": 0,
"maximum": 1,
"example": 0.15
},
"test": {
"type": "number",
"description": "Ratio for the testing set (e.g., 0.15 for 15%)",
"minimum": 0,
"maximum": 1,
"example": 0.15
}
},
"required": ["train", "validation", "test"]
},
"random_state": {
"type": "integer",
"description": "Random seed for reproducibility",
"default": 42,
"example": 42
},
"stratify": {
"type": "boolean",
"description": "Whether to stratify the split based on the target column",
"default": true,
"example": true
},
"output_directory": {
"type": "string",
"description": "Directory to save the split datasets",
"default": "split_data",
"example": "split_data"
},
"file_name_prefix": {
"type": "string",
"description": "Prefix for the output file names",
"default": "split",
"example": "split"
}
},
"required": ["dataset_name", "file_path", "file_type", "target_column", "features", "split_ratios"]
}