Initial commit
This commit is contained in:
7
skills/deployment-rollback-manager/assets/README.md
Normal file
7
skills/deployment-rollback-manager/assets/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# Assets
|
||||
|
||||
Bundled resources for the deployment-rollback-manager skill.
|
||||
|
||||
- [ ] rollback_template.yml: A YAML template for defining rollback configurations.
|
||||
- [ ] example_rollback_config.yml: An example YAML configuration file for a specific rollback scenario.
|
||||
- [ ] rollback_report_template.md: A markdown template for generating rollback reports.
|
||||
@@ -0,0 +1,121 @@
|
||||
# Configuration file for Deployment Rollback Manager Plugin
|
||||
|
||||
# --- General Settings ---
|
||||
general:
|
||||
# Description of this rollback configuration. Useful for identifying different scenarios.
|
||||
description: "Rollback to version 1.2.3 due to critical bug found in 1.2.4"
|
||||
|
||||
# Environment to perform the rollback in. Should match your deployment environment.
|
||||
environment: "production" # Example: production, staging, dev
|
||||
|
||||
# Whether to automatically approve the rollback after safety checks pass.
|
||||
# Set to false for manual approval after checks.
|
||||
auto_approve: false
|
||||
|
||||
# --- Deployment Settings ---
|
||||
deployment:
|
||||
# Target application or service name.
|
||||
application_name: "YOUR_APPLICATION_NAME"
|
||||
|
||||
# Deployment platform on which to roll back. Options: kubernetes, docker, aws_ecs, manual_script
|
||||
platform: "kubernetes"
|
||||
|
||||
# Version to roll back to.
|
||||
rollback_version: "1.2.3"
|
||||
|
||||
# Current version being rolled back from. This is used for verification.
|
||||
current_version: "1.2.4"
|
||||
|
||||
# --- Kubernetes Specific Settings (only applicable if platform is kubernetes) ---
|
||||
kubernetes:
|
||||
# Namespace where the application is deployed.
|
||||
namespace: "default"
|
||||
|
||||
# Deployment name.
|
||||
deployment_name: "YOUR_DEPLOYMENT_NAME"
|
||||
|
||||
# Image tag to roll back to.
|
||||
image_tag: "1.2.3"
|
||||
|
||||
# Number of replicas to verify after the rollback.
|
||||
replica_count: 3
|
||||
|
||||
# --- Docker Specific Settings (only applicable if platform is docker) ---
|
||||
docker:
|
||||
# Docker compose file path.
|
||||
compose_file: "docker-compose.yml"
|
||||
|
||||
# Service name to roll back.
|
||||
service_name: "YOUR_SERVICE_NAME"
|
||||
|
||||
# --- AWS ECS Specific Settings (only applicable if platform is aws_ecs) ---
|
||||
aws_ecs:
|
||||
# ECS cluster name
|
||||
cluster_name: "YOUR_ECS_CLUSTER_NAME"
|
||||
|
||||
# ECS service name
|
||||
service_name: "YOUR_ECS_SERVICE_NAME"
|
||||
|
||||
# Task definition family name
|
||||
task_definition_family: "YOUR_TASK_DEFINITION_FAMILY"
|
||||
|
||||
# --- Manual Script Settings (only applicable if platform is manual_script) ---
|
||||
manual_script:
|
||||
# Path to the script to execute for the rollback.
|
||||
script_path: "/path/to/rollback_script.sh"
|
||||
|
||||
# Arguments to pass to the script.
|
||||
script_arguments: ["--rollback", "--version", "1.2.3"]
|
||||
|
||||
# --- Safety Checks ---
|
||||
safety_checks:
|
||||
# List of checks to perform before initiating the rollback.
|
||||
checks:
|
||||
- type: "health_check"
|
||||
# URL to check for application health.
|
||||
url: "https://YOUR_APPLICATION_URL/health"
|
||||
# Expected HTTP status code for a healthy application.
|
||||
expected_status_code: 200
|
||||
# Timeout in seconds for the health check.
|
||||
timeout: 10
|
||||
|
||||
- type: "database_check"
|
||||
# SQL query to execute.
|
||||
query: "SELECT COUNT(*) FROM users;"
|
||||
# Expected result from the query.
|
||||
expected_result: "REPLACE_ME"
|
||||
|
||||
- type: "metric_check"
|
||||
# Metric name to check.
|
||||
metric_name: "error_rate"
|
||||
# Maximum acceptable value for the metric.
|
||||
max_value: 0.01 # Example: 1% error rate
|
||||
# Monitoring system (e.g., prometheus, cloudwatch)
|
||||
monitoring_system: "prometheus"
|
||||
# Query to get the metric from the monitoring system.
|
||||
query: "sum(rate(http_requests_total{status=~'5..'}[5m])) / sum(rate(http_requests_total[5m]))"
|
||||
|
||||
# --- Notification Settings ---
|
||||
notifications:
|
||||
# List of notification channels to send rollback status updates to.
|
||||
channels:
|
||||
- type: "slack"
|
||||
# Slack webhook URL. Treat this as a secret; do not commit the real value to version control.
|
||||
webhook_url: "YOUR_SLACK_WEBHOOK_URL"
|
||||
# Channel to send notifications to.
|
||||
channel: "#deployment-alerts"
|
||||
|
||||
- type: "email"
|
||||
# Recipient email addresses.
|
||||
recipients: ["admin@example.com", "devops@example.com"]
|
||||
|
||||
# --- Advanced Settings ---
|
||||
advanced:
|
||||
# Timeout in seconds for the entire rollback process.
|
||||
rollback_timeout: 3600 # 1 hour
|
||||
|
||||
# Number of retries for failed operations.
|
||||
retries: 2
|
||||
|
||||
# Delay in seconds between retries.
|
||||
retry_delay: 30
|
||||
@@ -0,0 +1,113 @@
|
||||
# Rollback Report
|
||||
|
||||
This report documents the rollback process, its reasons, actions taken, and outcomes.
|
||||
|
||||
## 1. Executive Summary
|
||||
|
||||
[**Placeholder: Briefly summarize the reason for the rollback, the actions taken, and the overall outcome. Was the rollback successful? Are there any ongoing issues?**]
|
||||
|
||||
*Example: A critical bug was discovered in version 2.5.0 of the application. A rollback to version 2.4.0 was initiated. The rollback was successful, and the application is now stable. Further investigation into the bug in version 2.5.0 is underway.*
|
||||
|
||||
## 2. Rollback Trigger
|
||||
|
||||
### 2.1. Reason for Rollback
|
||||
|
||||
[**Placeholder: Describe in detail the reason for initiating the rollback. What specific issue or incident triggered the decision?**]
|
||||
|
||||
*Example: After deploying version 2.5.0, users reported intermittent server errors (HTTP 500). Monitoring dashboards showed a significant spike in error rates and a degradation in application performance.*
|
||||
|
||||
### 2.2. Severity Level
|
||||
|
||||
[**Placeholder: Indicate the severity level of the issue that triggered the rollback.**]
|
||||
|
||||
*Possible values: Critical, High, Medium, Low*
|
||||
|
||||
*Example: Critical*
|
||||
|
||||
### 2.3. Impact Assessment
|
||||
|
||||
[**Placeholder: Describe the impact of the issue on users, services, and the business.**]
|
||||
|
||||
*Example: The server errors were preventing users from completing critical transactions, impacting revenue and user satisfaction. The incident also affected the availability of a key API used by partner applications.*
|
||||
|
||||
## 3. Rollback Procedure
|
||||
|
||||
### 3.1. Rollback Plan
|
||||
|
||||
[**Placeholder: Outline the planned steps for the rollback. Include specific versions, scripts, and commands used.**]
|
||||
|
||||
*Example:*
|
||||
1. Verify the integrity of the previous version (2.4.0) backup.
|
||||
2. Stop the current version (2.5.0) deployment.
|
||||
3. Deploy the backup version (2.4.0).
|
||||
4. Run database migrations to revert to the previous schema (if applicable).
|
||||
5. Verify application functionality and performance.
|
||||
6. Monitor system logs for any errors.
|
||||
|
||||
### 3.2. Rollback Execution
|
||||
|
||||
[**Placeholder: Document the actual steps taken during the rollback process. Note any deviations from the original plan and the reasons for those deviations.**]
|
||||
|
||||
*Example: The rollback was executed as planned, with the exception of a minor issue with database migration scripts. A slight modification was made to the script to ensure compatibility with the current database state. This was documented in the database migration log.*
|
||||
|
||||
### 3.3. Rollback Time
|
||||
|
||||
[**Placeholder: Record the start and end times of the rollback process.**]
|
||||
|
||||
*Example:*
|
||||
* Start Time: 2024-01-26 10:00 UTC
|
||||
* End Time: 2024-01-26 10:45 UTC
|
||||
|
||||
### 3.4. Personnel Involved
|
||||
|
||||
[**Placeholder: List the individuals involved in the rollback process and their roles.**]
|
||||
|
||||
*Example:*
|
||||
* John Doe: Lead Engineer
|
||||
* Jane Smith: Database Administrator
|
||||
* Peter Jones: Operations Engineer
|
||||
|
||||
## 4. Post-Rollback Analysis
|
||||
|
||||
### 4.1. Verification of Rollback Success
|
||||
|
||||
[**Placeholder: Describe how the success of the rollback was verified. Include specific tests and monitoring metrics used.**]
|
||||
|
||||
*Example: After the rollback, application functionality was tested using automated integration tests and manual user acceptance testing. Monitoring dashboards showed a return to normal error rates and application performance.*
|
||||
|
||||
### 4.2. Root Cause Analysis (Initial)
|
||||
|
||||
[**Placeholder: Provide an initial assessment of the root cause of the issue that triggered the rollback. This may be a preliminary investigation and may require further analysis.**]
|
||||
|
||||
*Example: The initial investigation suggests that a recently introduced code change in the payment processing module caused the server errors. Further code review and debugging are required to pinpoint the exact cause.*
|
||||
|
||||
### 4.3. Corrective Actions
|
||||
|
||||
[**Placeholder: Outline the planned corrective actions to prevent similar issues from occurring in the future.**]
|
||||
|
||||
*Example:*
|
||||
1. Conduct a thorough code review of the payment processing module.
|
||||
2. Implement more robust unit and integration tests for critical code paths.
|
||||
3. Improve monitoring and alerting to detect issues earlier.
|
||||
4. Implement a canary deployment strategy for future releases.
|
||||
|
||||
### 4.4. Lessons Learned
|
||||
|
||||
[**Placeholder: Document any lessons learned from the rollback process. What went well? What could be improved?**]
|
||||
|
||||
*Example:*
|
||||
* The rollback process was well-documented and executed efficiently.
|
||||
* The monitoring dashboards provided valuable insights into the issue.
|
||||
* The communication between teams was effective.
|
||||
* Areas for improvement:
|
||||
  * Improve the speed of database migrations.
|
||||
  * Automate the rollback process further.
|
||||
|
||||
## 5. Appendix
|
||||
|
||||
[**Placeholder: Include any supporting documentation, such as logs, screenshots, and scripts.**]
|
||||
|
||||
*Example:*
|
||||
* Database migration log: [Link to log file]
|
||||
* Monitoring dashboard screenshot: [Link to screenshot]
|
||||
* Rollback script: [Link to script]
|
||||
@@ -0,0 +1,71 @@
|
||||
# rollback_template.yml
|
||||
# This file defines the configuration for a deployment rollback.
|
||||
|
||||
# General Rollback Information
|
||||
rollback_name: "REPLACE_ME - Descriptive Rollback Name" # e.g., "Rollback to v1.2.3 due to login issues"
|
||||
description: "Detailed description of the rollback purpose and context." # More details on why this rollback is necessary.
|
||||
|
||||
# Target Environment
|
||||
environment: "production" # e.g., "staging", "production", "qa"
|
||||
region: "us-east-1" # AWS region or equivalent
|
||||
|
||||
# Deployment Details
|
||||
application_name: "YOUR_VALUE_HERE - Application Name" # e.g., "web-app", "api-service"
|
||||
current_version: "YOUR_VALUE_HERE - Current Deployed Version" # e.g., "v1.2.4", "release-2023-10-27"
|
||||
rollback_version: "YOUR_VALUE_HERE - Version to Rollback To" # e.g., "v1.2.3", "release-2023-10-26"
|
||||
|
||||
# Deployment Strategy (Choose one)
|
||||
deployment_strategy: "blue_green" # Options: "blue_green", "rolling_update", "canary"
|
||||
|
||||
# Blue/Green Specific Configuration
|
||||
blue_green:
|
||||
active_color: "blue" # e.g., "blue", "green" - Which color is currently serving traffic?
|
||||
inactive_color: "green" # e.g., "green", "blue" - Which color will be promoted?
|
||||
switch_traffic_method: "dns_swap" # Options: "dns_swap", "load_balancer_update"
|
||||
dns_record: "api.example.com" # DNS record to update (if using dns_swap)
|
||||
load_balancer_arn: "YOUR_VALUE_HERE - ARN of the Load Balancer" # ARN of the load balancer (if using load_balancer_update)
|
||||
|
||||
# Rolling Update Specific Configuration
|
||||
rolling_update:
|
||||
batch_size: 2 # Number of instances/pods to update in each batch
|
||||
wait_time: 60 # Seconds to wait between batches for health checks
|
||||
|
||||
# Canary Deployment Specific Configuration
|
||||
canary:
|
||||
initial_percentage: 10 # Percentage of traffic to route to the canary version initially
|
||||
increment_percentage: 10 # Percentage to increase traffic to the canary version in each step
|
||||
increment_interval: 300 # Seconds between traffic increments
|
||||
max_percentage: 50 # Maximum percentage of traffic to route to the canary version
|
||||
|
||||
# Health Check Configuration
|
||||
health_check:
|
||||
type: "http" # Options: "http", "tcp", "custom"
|
||||
url: "/health" # URL path to check for HTTP health checks
|
||||
port: 8080 # Port to check for TCP health checks
|
||||
timeout: 30 # Seconds before health check times out
|
||||
success_codes: [200, 204] # HTTP status codes indicating success
|
||||
interval: 15 # Seconds between health checks
|
||||
max_failures: 3 # Number of consecutive failures before rollback is considered failed
|
||||
|
||||
# Pre-Rollback Checks
|
||||
pre_rollback_checks:
|
||||
database_backup: true # Perform a database backup before rollback
|
||||
monitoring_enabled: true # Ensure monitoring is enabled
|
||||
alerting_configured: true # Ensure alerting is configured
|
||||
|
||||
# Post-Rollback Checks
|
||||
post_rollback_checks:
|
||||
health_check_success: true # Verify health checks are passing
|
||||
performance_metrics_stable: true # Verify performance metrics are within acceptable limits
|
||||
error_rates_acceptable: true # Verify error rates are within acceptable limits
|
||||
|
||||
# Notifications
|
||||
notifications:
|
||||
on_success:
|
||||
- "slack:YOUR_VALUE_HERE - Slack Channel ID" # Send a notification to Slack on success
|
||||
on_failure:
|
||||
- "email:YOUR_VALUE_HERE - Email Address" # Send an email on failure
|
||||
|
||||
# Advanced Configuration
|
||||
dry_run: false # Set to true to simulate the rollback without actually executing it
|
||||
force: false # Set to true to bypass certain safety checks (use with caution)
|
||||
Reference in New Issue
Block a user