{
  "_comment": "Optimization configuration template for deep learning models.",
  "optimizer_name": "Adam",
  "_comment": "Name of the optimization algorithm to use. Options: Adam, SGD, RMSprop, AdamW, etc.",
  "learning_rate": 0.001,
  "_comment": "Learning rate for the optimizer. A smaller value might be needed for complex models.",
  "weight_decay": 0.0001,
  "_comment": "L2 regularization strength. Helps prevent overfitting.",
  "beta1": 0.9,
  "_comment": "Beta1 parameter for Adam optimizer (exponential decay rate for the 1st moment estimates).",
  "beta2": 0.999,
  "_comment": "Beta2 parameter for Adam optimizer (exponential decay rate for the 2nd moment estimates).",
  "epsilon": 1e-08,
  "_comment": "Epsilon parameter for Adam optimizer (term added to the denominator to improve numerical stability).",
  "momentum": 0.0,
  "_comment": "Momentum factor for SGD optimizer. Typically a value between 0 and 1.",
  "nesterov": false,
  "_comment": "Whether to use Nesterov momentum for SGD optimizer.",
  "learning_rate_scheduler": {
    "enabled": true,
    "_comment": "Enable or disable learning rate scheduling.",
    "scheduler_type": "ReduceLROnPlateau",
    "_comment": "Type of learning rate scheduler. Options: StepLR, MultiStepLR, ExponentialLR, ReduceLROnPlateau, CosineAnnealingLR, CyclicLR, etc.",
    "factor": 0.1,
    "_comment": "Factor by which the learning rate will be reduced.",
    "patience": 10,
    "_comment": "Number of epochs with no improvement after which learning rate will be reduced.",
    "threshold": 0.0001,
    "_comment": "Threshold for measuring the new optimum, to only focus on significant changes.",
    "threshold_mode": "rel",
    "_comment": "One of rel, abs. In rel mode, dynamic_threshold = best * ( 1 + threshold ) in 'max' mode or best * ( 1 - threshold ) in min mode. In abs mode, dynamic_threshold = best + threshold in max mode or best - threshold in min mode.",
    "cooldown": 0,
    "_comment": "Number of epochs to wait before resuming normal operation after lr has been reduced.",
    "min_lr": 0,
    "_comment": "A scalar or a list of scalars. A lower bound on the learning rate of all param groups or each group respectively.",
    "verbose": true
     "_comment": "If True, prints a message to stdout for each update."
  },
  "gradient_clipping": {
    "enabled": true,
    "_comment": "Enable or disable gradient clipping.",
    "clip_value": 1.0,
    "_comment": "The clipping threshold. Gradients will be clipped to this value.",
    "clip_norm_type": 2.0,
    "_comment": "The type of the norm used for clipping. Can be 2.0 (L2 norm), inf (infinity norm), etc."
  }
}