# Source listing metadata: captured 2025-11-29 18:51:12 +08:00, 42 lines, 1.7 KiB, YAML

---
# Configuration file for dataset splitting.
# Nested keys are indented 2 spaces; without indentation the file parses as a
# flat mapping with duplicate train/validation/test keys (invalid YAML 1.2).

# Input dataset parameters
input_dataset:
  path: "REPLACE_ME/path/to/your/dataset.csv"  # Path to the input dataset file
  format: "csv"  # Dataset format (e.g., csv, parquet, json)
  header: true  # Whether the dataset has a header row
  separator: ","  # Separator used in the dataset (e.g., comma, tab, semicolon)

# Splitting ratios for training, validation, and testing sets
# NOTE(review): presumably the three ratios must sum to 1.0 — confirm with consumer
split_ratios:
  train: 0.7  # Fraction of data for training (0.0 - 1.0)
  validation: 0.15  # Fraction of data for validation (0.0 - 1.0)
  test: 0.15  # Fraction of data for testing (0.0 - 1.0)

# Random seed for reproducibility
random_seed: 42  # Integer value for the random seed

# Output directory for split datasets
output_directory: "REPLACE_ME/path/to/output/directory"  # Directory where the split datasets will be saved

# Naming convention for output files
output_names:
  train: "train.csv"  # Name of the training dataset file
  validation: "validation.csv"  # Name of the validation dataset file
  test: "test.csv"  # Name of the test dataset file

# Optional features (e.g., stratification)
features:
  stratify: false  # Whether to stratify the split based on a specific column
  stratify_column: "YOUR_VALUE_HERE"  # Column to use for stratification (only relevant when stratify is true)
  shuffle: true  # Whether to shuffle the data before splitting

# Error handling configuration
error_handling:
  on_error: "warn"  # Action to take on error ("warn", "ignore", "raise") - default is warn
  log_errors: true  # Whether to log error messages to a file

# Logging configuration
logging:
  level: "INFO"  # Logging level (e.g., DEBUG, INFO, WARNING, ERROR, CRITICAL)
  file: "split_data.log"  # Log file path