#!/usr/bin/env python3
"""
Script to fetch regression data for OpenShift components.
Usage:
    python3 list_regressions.py --release <release> [--components comp1 comp2 ...] [--start YYYY-MM-DD] [--end YYYY-MM-DD] [--short]
Examples:
    python3 list_regressions.py --release 4.17
    python3 list_regressions.py --release 4.21 --components Monitoring etcd
    python3 list_regressions.py --release 4.21 --short
"""
import argparse
import os
import json
import sys
import urllib.request
import urllib.error
from datetime import datetime, timezone
def calculate_hours_between(start_timestamp: str, end_timestamp: str) -> int:
"""
Calculate the number of hours between two timestamps, rounded to the nearest hour.
Args:
start_timestamp: ISO format timestamp string (e.g., "2025-09-26T00:02:51.385944Z")
end_timestamp: ISO format timestamp string (e.g., "2025-09-27T12:04:24.966914Z")
Returns:
Number of hours between the timestamps, rounded to the nearest hour
Raises:
ValueError: If timestamp parsing fails
"""
start_time = datetime.fromisoformat(start_timestamp.replace('Z', '+00:00'))
end_time = datetime.fromisoformat(end_timestamp.replace('Z', '+00:00'))
time_diff = end_time - start_time
return round(time_diff.total_seconds() / 3600)
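# Illustrative usage (not executed by this script): the helper rounds to the nearest whole hour,
# so the docstring's example span of roughly 36 hours and 1.5 minutes comes out as 36.
#   calculate_hours_between("2025-09-26T00:02:51.385944Z", "2025-09-27T12:04:24.966914Z")  # -> 36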
def fetch_regressions(release: str) -> dict:
"""
Fetch regression data from the component health API.
Args:
release: The release version (e.g., "4.17", "4.16")
Returns:
Dictionary containing the regression data
Raises:
urllib.error.URLError: If the request fails
"""
# Construct the base URL
base_url = f"https://sippy.dptools.openshift.org/api/component_readiness/regressions"
# Build query parameters
params = [f"release={release}"]
url = f"{base_url}?{'&'.join(params)}"
print(f"Fetching regressions from: {url}", file=sys.stderr)
try:
with urllib.request.urlopen(url, timeout=30) as response:
if response.status == 200:
data = json.loads(response.read().decode('utf-8'))
return data
else:
raise Exception(f"HTTP {response.status}: {response.reason}")
except urllib.error.HTTPError as e:
print(f"HTTP Error {e.code}: {e.reason}", file=sys.stderr)
raise
except urllib.error.URLError as e:
print(f"URL Error: {e.reason}", file=sys.stderr)
raise
except Exception as e:
print(f"Error: {e}", file=sys.stderr)
raise
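# This script assumes the API returns a JSON list of regression objects. A rough sketch of the
# fields the code below actually touches (other fields may exist; the exact schema is Sippy's):
#   {
#     "component": "Monitoring",
#     "opened": "2025-09-26T00:02:51.385944Z",
#     "closed": {"Time": "2025-09-27T12:04:24.966914Z", "Valid": true},
#     "last_failure": {"Time": "...", "Valid": true},
#     "triages": [{"created_at": "2025-09-26T10:00:00Z", ...}],
#     "links": {...},
#     "test_id": "..."
#   }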
def filter_by_components(data: list, components: list = None) -> list:
"""
Filter regression data by component names.
Args:
data: List of regression dictionaries
components: Optional list of component names to filter by
Returns:
Filtered list of regressions matching the specified components
"""
    # Always filter out regressions with an empty component name.
    # These are legacy records from before a code change that ensures the component is always set.
filtered = [
regression for regression in data
if regression.get('component', '') != ''
]
# If no specific components requested, return all non-empty components
if not components:
return filtered
# Convert components to lowercase for case-insensitive comparison
components_lower = [c.lower() for c in components]
# Further filter by specified components
filtered = [
regression for regression in filtered
if regression.get('component', '').lower() in components_lower
]
print(f"Filtered to {len(filtered)} regressions for components: {', '.join(components)}",
file=sys.stderr)
return filtered
def simplify_time_fields(data: list) -> list:
"""
Simplify time fields in regression data.
Converts time fields from a nested structure like:
{"Time": "2025-09-27T12:04:24.966914Z", "Valid": true}
to either:
- The timestamp string if Valid is true
- null if Valid is false
This applies to fields: 'closed', 'last_failure'
Args:
data: List of regression dictionaries
Returns:
List of regressions with simplified time fields
"""
time_fields = ['closed', 'last_failure']
for regression in data:
for field in time_fields:
if field in regression:
value = regression[field]
# Check if the field is a dict with Valid and Time fields
if isinstance(value, dict):
if value.get('Valid') is True:
# Replace with just the timestamp string
regression[field] = value.get('Time')
else:
# Replace with null if not valid
regression[field] = None
return data
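# Sketch of the transformation above, using a made-up record:
#   before: {"closed": {"Time": "2025-09-27T12:04:24.966914Z", "Valid": true},
#            "last_failure": {"Time": "0001-01-01T00:00:00Z", "Valid": false}}
#   after:  {"closed": "2025-09-27T12:04:24.966914Z", "last_failure": None}  # None -> null in JSON output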
def filter_by_date_range(regressions: list, start_date: str = None, end_date: str = None) -> list:
"""
Filter regressions by date range.
Args:
regressions: List of regression dictionaries
start_date: Start date in YYYY-MM-DD format. Filters out regressions closed before this date.
end_date: End date in YYYY-MM-DD format. Filters out regressions opened after this date.
Returns:
Filtered list of regressions
Note:
- If start_date is provided: excludes regressions that were closed before start_date
- If end_date is provided: excludes regressions that were opened after end_date
- This allows filtering to a development window (e.g., from development_start to GA)
"""
if not start_date and not end_date:
return regressions
filtered = []
for regression in regressions:
# Skip if opened after end_date
if end_date and regression.get('opened'):
opened_date = regression['opened'].split('T')[0] # Extract YYYY-MM-DD
if opened_date > end_date:
continue
# Skip if closed before start_date
if start_date and regression.get('closed'):
closed_date = regression['closed'].split('T')[0] # Extract YYYY-MM-DD
if closed_date < start_date:
continue
filtered.append(regression)
return filtered
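# Example of the window semantics, assuming --start 2025-02-01 and --end 2025-06-01:
#   opened 2025-01-15, closed 2025-01-20  -> dropped (closed before the start date)
#   opened 2025-06-10, still open         -> dropped (opened after the end date)
#   opened 2025-01-15, still open         -> kept (open regressions are never dropped by --start)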
def remove_unnecessary_fields(regressions: list) -> list:
"""
Remove unnecessary fields from regressions to reduce response size.
Removes 'links' and 'test_id' fields from each regression object.
Args:
regressions: List of regression dictionaries
Returns:
List of regression dictionaries with unnecessary fields removed
"""
for regression in regressions:
# Remove links and test_id to reduce response size
regression.pop('links', None)
regression.pop('test_id', None)
return regressions
def exclude_suspected_infra_regressions(regressions: list) -> tuple[list, int]:
"""
Filter out suspected infrastructure-related mass regressions.
This is an imprecise attempt to filter out mass regressions caused by infrastructure
issues which the TRT handles via a separate mechanism. These
mass incidents typically result in many short-lived regressions being opened and
closed on the same day.
Algorithm:
1. First pass: Count how many short-lived regressions (closed within 96 hours of opening)
were closed on each date.
2. Second pass: Filter out regressions that:
- Were closed within 96 hours of being opened, AND
- Were closed on a date where >50 short-lived regressions were closed
Args:
regressions: List of regression dictionaries
Returns:
Tuple of (filtered_regressions, count_of_filtered_regressions)
"""
# First pass: Track count of short-lived regressions closed on each date
short_lived_closures_by_date = {}
for regression in regressions:
opened = regression.get('opened')
closed = regression.get('closed')
# Skip if not closed or missing opened timestamp
if not closed or not opened:
continue
try:
# Calculate how long the regression was open
hours_open = calculate_hours_between(opened, closed)
# If closed within 96 hours, increment counter for the closed date
if hours_open <= 96:
closed_date = closed.split('T')[0] # Extract YYYY-MM-DD
short_lived_closures_by_date[closed_date] = short_lived_closures_by_date.get(closed_date, 0) + 1
except (ValueError, KeyError, TypeError):
# Skip if timestamp parsing fails
continue
# Second pass: Filter out suspected infra regressions
filtered_regressions = []
filtered_count = 0
for regression in regressions:
opened = regression.get('opened')
closed = regression.get('closed')
# Keep open regressions
if not closed or not opened:
filtered_regressions.append(regression)
continue
try:
# Calculate how long the regression was open
hours_open = calculate_hours_between(opened, closed)
closed_date = closed.split('T')[0] # Extract YYYY-MM-DD
# Filter out if:
# 1. Was closed within 96 hours, AND
# 2. More than 50 short-lived regressions were closed on that date
if hours_open <= 96 and short_lived_closures_by_date.get(closed_date, 0) > 50:
filtered_count += 1
continue
# Keep this regression
filtered_regressions.append(regression)
except (ValueError, KeyError, TypeError):
# If timestamp parsing fails, keep the regression
filtered_regressions.append(regression)
return filtered_regressions, filtered_count
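# Worked example of the heuristic, with invented numbers: suppose 120 regressions were all opened
# on 2025-03-01 and closed on 2025-03-02 (open ~24h, i.e. within 96 hours). The first pass records
# 120 short-lived closures for 2025-03-02, which exceeds the threshold of 50, so the second pass
# drops all 120 as a suspected infrastructure incident. A regression closed on that same date but
# open for two weeks is kept, as is any regression that is still open.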
def group_by_component(data: list) -> dict:
"""
Group regressions by component name and split into open/closed.
Args:
data: List of regression dictionaries
Returns:
Dictionary mapping component names to objects containing open and closed regression lists
"""
components = {}
for regression in data:
component = regression.get('component', 'Unknown')
if component not in components:
components[component] = {
"open": [],
"closed": []
}
# Split based on whether closed field is null
if regression.get('closed') is None:
components[component]["open"].append(regression)
else:
components[component]["closed"].append(regression)
# Sort component names for consistent output
return dict(sorted(components.items()))
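# Resulting shape (illustrative): component names map to open/closed buckets, e.g.
#   {"Monitoring": {"open": [<regression>, ...], "closed": [<regression>, ...]},
#    "etcd":       {"open": [], "closed": [<regression>, ...]}}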
def calculate_summary(regressions: list, filtered_suspected_infra: int = 0) -> dict:
"""
Calculate summary statistics for a list of regressions.
Args:
regressions: List of regression dictionaries
filtered_suspected_infra: Count of regressions filtered out as suspected infrastructure issues
Returns:
Dictionary containing summary statistics with nested open/closed totals, triaged counts,
and average time to triage
"""
total = 0
open_total = 0
open_triaged = 0
open_triage_times = []
open_times = []
closed_total = 0
closed_triaged = 0
closed_triage_times = []
closed_times = []
triaged_to_closed_times = []
# Get current time for calculating open duration
current_time = datetime.now(timezone.utc)
current_time_str = current_time.isoformat().replace('+00:00', 'Z')
# Single pass through all regressions
for regression in regressions:
total += 1
triages = regression.get('triages', [])
is_triaged = bool(triages)
# Calculate time to triage if regression is triaged
time_to_triage_hrs = None
if is_triaged and regression.get('opened'):
try:
# Find earliest triage timestamp
earliest_triage_time = min(
t['created_at'] for t in triages if t.get('created_at')
)
# Calculate difference in hours
time_to_triage_hrs = calculate_hours_between(
regression['opened'],
earliest_triage_time
)
except (ValueError, KeyError, TypeError):
# Skip if timestamp parsing fails
pass
        # Triages are commonly reused as new regressions appear, which makes time to triage tricky
        # to calculate: if a first round of regressions was triaged and more regressions were added
        # to the same triage 24 hours later, the database does not record when the later ones were
        # actually triaged. Treating them as immediately triaged would skew the results, so such
        # cases are excluded from the timing stats; they still count as triaged.
if regression.get('closed') is None:
# Open regression
open_total += 1
if is_triaged:
open_triaged += 1
if time_to_triage_hrs is not None and time_to_triage_hrs > 0:
open_triage_times.append(time_to_triage_hrs)
# Calculate how long regression has been open
if regression.get('opened'):
try:
time_open_hrs = calculate_hours_between(
regression['opened'],
current_time_str
)
# Only include positive time differences
if time_open_hrs > 0:
open_times.append(time_open_hrs)
except (ValueError, KeyError, TypeError):
# Skip if timestamp parsing fails
pass
else:
# Closed regression
closed_total += 1
if is_triaged:
closed_triaged += 1
if time_to_triage_hrs is not None and time_to_triage_hrs > 0:
closed_triage_times.append(time_to_triage_hrs)
# Calculate time from triage to closed
if regression.get('closed') and triages:
try:
earliest_triage_time = min(
t['created_at'] for t in triages if t.get('created_at')
)
time_triaged_to_closed_hrs = calculate_hours_between(
earliest_triage_time,
regression['closed']
)
# Only include positive time differences:
if time_triaged_to_closed_hrs > 0:
triaged_to_closed_times.append(time_triaged_to_closed_hrs)
except (ValueError, KeyError, TypeError):
# Skip if timestamp parsing fails
pass
# Calculate time to close
if regression.get('opened') and regression.get('closed'):
try:
time_to_close_hrs = calculate_hours_between(
regression['opened'],
regression['closed']
)
# Only include positive time differences
if time_to_close_hrs > 0:
closed_times.append(time_to_close_hrs)
except (ValueError, KeyError, TypeError):
# Skip if timestamp parsing fails
pass
# Calculate averages and maximums
open_avg_triage_time = round(sum(open_triage_times) / len(open_triage_times)) if open_triage_times else None
open_max_triage_time = max(open_triage_times) if open_triage_times else None
open_avg_time = round(sum(open_times) / len(open_times)) if open_times else None
open_max_time = max(open_times) if open_times else None
closed_avg_triage_time = round(sum(closed_triage_times) / len(closed_triage_times)) if closed_triage_times else None
closed_max_triage_time = max(closed_triage_times) if closed_triage_times else None
closed_avg_time = round(sum(closed_times) / len(closed_times)) if closed_times else None
closed_max_time = max(closed_times) if closed_times else None
triaged_to_closed_avg_time = round(sum(triaged_to_closed_times) / len(triaged_to_closed_times)) if triaged_to_closed_times else None
triaged_to_closed_max_time = max(triaged_to_closed_times) if triaged_to_closed_times else None
# Calculate triage percentages
total_triaged = open_triaged + closed_triaged
triage_percentage = round((total_triaged / total * 100), 1) if total > 0 else 0
open_triage_percentage = round((open_triaged / open_total * 100), 1) if open_total > 0 else 0
closed_triage_percentage = round((closed_triaged / closed_total * 100), 1) if closed_total > 0 else 0
# Calculate overall time to triage (combining open and closed)
all_triage_times = open_triage_times + closed_triage_times
overall_avg_triage_time = round(sum(all_triage_times) / len(all_triage_times)) if all_triage_times else None
overall_max_triage_time = max(all_triage_times) if all_triage_times else None
# Time to close is only for closed regressions (already calculated in closed_avg_time/closed_max_time)
return {
"total": total,
"triaged": total_triaged,
"triage_percentage": triage_percentage,
"filtered_suspected_infra_regressions": filtered_suspected_infra,
"time_to_triage_hrs_avg": overall_avg_triage_time,
"time_to_triage_hrs_max": overall_max_triage_time,
"time_to_close_hrs_avg": closed_avg_time,
"time_to_close_hrs_max": closed_max_time,
"open": {
"total": open_total,
"triaged": open_triaged,
"triage_percentage": open_triage_percentage,
"time_to_triage_hrs_avg": open_avg_triage_time,
"time_to_triage_hrs_max": open_max_triage_time,
"open_hrs_avg": open_avg_time,
"open_hrs_max": open_max_time
},
"closed": {
"total": closed_total,
"triaged": closed_triaged,
"triage_percentage": closed_triage_percentage,
"time_to_triage_hrs_avg": closed_avg_triage_time,
"time_to_triage_hrs_max": closed_max_triage_time,
"time_to_close_hrs_avg": closed_avg_time,
"time_to_close_hrs_max": closed_max_time,
"time_triaged_closed_hrs_avg": triaged_to_closed_avg_time,
"time_triaged_closed_hrs_max": triaged_to_closed_max_time
}
}
def add_component_summaries(components: dict) -> dict:
"""
Add summary statistics to each component object.
Args:
components: Dictionary mapping component names to objects containing open and closed regression lists
Returns:
Dictionary with summaries added to each component
"""
for component, component_data in components.items():
# Combine open and closed to get all regressions for this component
all_regressions = component_data["open"] + component_data["closed"]
component_data["summary"] = calculate_summary(all_regressions)
return components
def format_output(data: dict) -> str:
"""
Format the regression data for output.
Args:
data: Dictionary containing regression data with keys:
- 'summary': Overall statistics (total, open, closed)
- 'components': Dictionary mapping component names to objects with:
- 'summary': Per-component statistics
- 'open': List of open regression objects
- 'closed': List of closed regression objects
Returns:
Formatted JSON string output
"""
return json.dumps(data, indent=2)
def main():
parser = argparse.ArgumentParser(
description='Fetch regression data for OpenShift components',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# List all regressions for release 4.17
%(prog)s --release 4.17
# Filter by specific components
%(prog)s --release 4.21 --components Monitoring "kube-apiserver"
# Filter by multiple components
%(prog)s --release 4.21 --components Monitoring etcd "kube-apiserver"
# Short output mode (summaries only, no regression data)
%(prog)s --release 4.17 --short
"""
)
parser.add_argument(
'--release',
type=str,
required=True,
help='Release version (e.g., "4.17", "4.16")'
)
parser.add_argument(
'--components',
type=str,
nargs='+',
default=None,
help='Filter by component names (space-separated list, case-insensitive)'
)
parser.add_argument(
'--start',
type=str,
default=None,
help='Start date for filtering (YYYY-MM-DD format, e.g., "2022-03-10"). Filters out regressions closed before this date.'
)
parser.add_argument(
'--end',
type=str,
default=None,
help='End date for filtering (YYYY-MM-DD format, e.g., "2022-08-10"). Filters out regressions opened after this date.'
)
parser.add_argument(
'--short',
action='store_true',
help='Short output mode: exclude regression data, only include summaries'
)
args = parser.parse_args()
try:
# Fetch regressions
regressions = fetch_regressions(args.release)
# Filter by components (always called to remove empty component names)
if isinstance(regressions, list):
regressions = filter_by_components(regressions, args.components)
# Simplify time field structures (closed, last_failure)
if isinstance(regressions, list):
regressions = simplify_time_fields(regressions)
# Filter by date range (to focus on development window)
if isinstance(regressions, list):
regressions = filter_by_date_range(regressions, args.start, args.end)
# Remove unnecessary fields to reduce response size
if isinstance(regressions, list):
regressions = remove_unnecessary_fields(regressions)
# Filter out suspected infrastructure regressions
filtered_infra_count = 0
if isinstance(regressions, list):
regressions, filtered_infra_count = exclude_suspected_infra_regressions(regressions)
print(f"Filtered out {filtered_infra_count} suspected infrastructure regressions",
file=sys.stderr)
# Group regressions by component
if isinstance(regressions, list):
components = group_by_component(regressions)
else:
components = {}
# Add summaries to each component
if isinstance(components, dict):
components = add_component_summaries(components)
# Calculate overall summary statistics from all regressions
all_regressions = []
for comp_data in components.values():
all_regressions.extend(comp_data["open"])
all_regressions.extend(comp_data["closed"])
overall_summary = calculate_summary(all_regressions, filtered_infra_count)
# Construct output with summary and components
# If --short flag is specified, remove regression data from components
if args.short:
# Create a copy of components with only summaries
components_short = {}
for component_name, component_data in components.items():
components_short[component_name] = {
"summary": component_data["summary"]
}
output_data = {
"summary": overall_summary,
"components": components_short
}
else:
output_data = {
"summary": overall_summary,
"components": components
}
# Format and print output
output = format_output(output_data)
print(output)
return 0
except Exception as e:
print(f"Failed to fetch regressions: {e}", file=sys.stderr)
return 1
if __name__ == '__main__':
sys.exit(main())