gh-jeremylongshore-claude-c…/skills/dataset-splitter/assets/dataset_schema.json

{
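  "_comment": "This JSON schema defines the expected structure of a dataset-splitting configuration: the source file, its target and feature columns, the split ratios, and the output options.",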
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Dataset Schema",
  "description": "Schema for validating a dataset-splitting configuration (source file, columns, split ratios, and output settings) before splitting.",
  "type": "object",
  "properties": {
    "dataset_name": {
      "type": "string",
      "description": "Name of the dataset",
      "example": "iris_dataset"
    },
    "file_path": {
      "type": "string",
      "description": "Path to the dataset file (CSV, JSON, or Parquet)",
      "example": "data/iris.csv"
    },
    "file_type": {
      "type": "string",
      "description": "Type of the dataset file",
      "enum": ["csv", "json", "parquet"],
      "example": "csv"
    },
    "separator": {
      "type": "string",
      "description": "Separator character for CSV files (e.g., ',', ';')",
      "default": ",",
      "example": ","
    },
    "target_column": {
      "type": "string",
      "description": "Name of the target/label column",
      "example": "species"
    },
    "features": {
      "type": "array",
      "description": "List of feature column names",
      "items": {
        "type": "string"
      },
      "example": ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    },
    "split_ratios": {
      "type": "object",
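      "description": "Ratios for splitting the dataset into training, validation, and testing sets. Each ratio is a fraction between 0 and 1, and all three are required. This schema does not enforce that the ratios sum to 1.",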
      "properties": {
        "train": {
          "type": "number",
          "description": "Ratio for the training set (e.g., 0.7 for 70%)",
          "minimum": 0,
          "maximum": 1,
          "example": 0.7
        },
        "validation": {
          "type": "number",
          "description": "Ratio for the validation set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "example": 0.15
        },
        "test": {
          "type": "number",
          "description": "Ratio for the testing set (e.g., 0.15 for 15%)",
          "minimum": 0,
          "maximum": 1,
          "example": 0.15
        }
      },
      "required": ["train", "validation", "test"]
    },
    "random_state": {
      "type": "integer",
      "description": "Random seed for reproducibility",
      "default": 42,
      "example": 42
    },
    "stratify": {
      "type": "boolean",
      "description": "Whether to stratify the split based on the target column",
      "default": true,
      "example": true
    },
    "output_directory": {
      "type": "string",
      "description": "Directory to save the split datasets",
      "default": "split_data",
      "example": "split_data"
    },
    "file_name_prefix": {
      "type": "string",
      "description": "Prefix for the output file names",
      "default": "split",
      "example": "split"
    }
  },
  "required": ["dataset_name", "file_path", "file_type", "target_column", "features", "split_ratios"]
}