Files
gh-kylesnowschwartz-simplec…/hooks/handlers/transcript_parser.rb
2025-11-30 08:36:27 +08:00

236 lines
8.6 KiB
Ruby
Executable File

#!/usr/bin/env ruby
# frozen_string_literal: true

require 'json'

# TranscriptParser - Defensive JSON parsing utilities for Claude Code transcripts
#
# PURPOSE: Provide safe, robust parsing of Claude Code transcript JSON data
# HANDLES: Variable content formats (String vs Array), malformed data, type validation
#
# KEY FEATURES:
# - Defensive type checking before processing
# - Graceful degradation for unexpected formats
# - Comprehensive error handling with meaningful messages
# - Ruby best practices for type validation
# - Extensible for future transcript format changes
module TranscriptParser
  # Custom errors for specific parsing scenarios
  class TranscriptParseError < StandardError; end
  class MalformedTranscriptError < TranscriptParseError; end
  class UnsupportedContentFormatError < TranscriptParseError; end
  class MissingRequiredFieldError < TranscriptParseError; end

  # Roles returned verbatim by extract_role; anything else becomes 'unknown'.
  KNOWN_ROLES = %w[user assistant system].freeze
  # Keys probed, in order of preference, by extract_content_from_hash.
  CONTENT_KEYS = %w[text content message body data value].freeze
  # Maximum number of characters of an unparseable line kept in :raw_line.
  RAW_LINE_PREVIEW_LIMIT = 100
  private_constant :KNOWN_ROLES, :CONTENT_KEYS, :RAW_LINE_PREVIEW_LIMIT

  # Main entry point for parsing transcript JSON data.
  #
  # In non-strict mode this never raises: any failure (including `data` not
  # being a Hash at all) produces a fallback entry carrying :parse_error.
  #
  # @param data [Hash] The parsed JSON data from transcript
  # @param strict [Boolean] Whether to raise errors on malformed data (default: false)
  # @return [Hash] Normalized entry with :request_id, :timestamp, :message and,
  #   when present in the input, :tool_name, :tool_input, :tool_response
  # @raise [TranscriptParseError] In strict mode, wrapping the underlying error message
  def self.parse_transcript_entry(data, strict: false)
    validate_transcript_structure!(data)

    result = {
      request_id: extract_request_id(data),
      timestamp: extract_timestamp(data),
      message: parse_message(data['message'], strict: strict)
    }

    # Optional tool fields are copied through untouched when the key exists.
    result[:tool_name] = data['toolName'] if data.key?('toolName')
    result[:tool_input] = data['toolInput'] if data.key?('toolInput')
    result[:tool_response] = data['toolResponse'] if data.key?('toolResponse')
    result
  rescue StandardError => e
    raise TranscriptParseError, "Failed to parse transcript entry: #{e.message}" if strict

    # `data` not being a Hash is one of the failures rescued here, so it must
    # not be indexed blindly: nil['requestId'] / [1]['requestId'] would raise
    # again and defeat the purpose of the fallback.
    source = data.is_a?(Hash) ? data : {}
    {
      request_id: source['requestId'] || 'unknown',
      timestamp: source['timestamp'] || Time.now.to_f,
      message: { role: 'unknown', content: '', type: 'fallback' },
      parse_error: e.message
    }
  end

  # Extract and normalize message content with defensive type handling.
  #
  # @param message [Hash] The message object from transcript data
  # @param strict [Boolean] Whether to raise errors on unsupported formats
  # @return [Hash] Normalized message with :role, :content, :content_type, :raw_content
  # @raise [MalformedTranscriptError, MissingRequiredFieldError] If the message is invalid
  def self.parse_message(message, strict: false)
    validate_message_structure!(message)

    content = extract_content(message['content'], strict: strict)
    {
      role: extract_role(message),
      content: content[:text],
      content_type: content[:type],
      raw_content: message['content'] # Preserve original for debugging
    }
  end

  # Safely extract text content from variable format content field.
  #
  # @param content_data [String, Array, Hash, nil, Object] The content field from message
  # @param strict [Boolean] Whether to raise errors on unsupported formats
  # @return [Hash] Hash with :text and :type keys
  # @raise [UnsupportedContentFormatError] In strict mode, for any other content class
  def self.extract_content(content_data, strict: false)
    case content_data
    when String
      { text: content_data, type: 'string' }
    when Array
      { text: extract_content_from_array(content_data), type: 'array' }
    when Hash
      { text: extract_content_from_hash(content_data), type: 'hash' }
    when nil
      { text: '', type: 'nil' }
    else
      if strict
        raise UnsupportedContentFormatError,
              "Unsupported content format: #{content_data.class} - #{content_data.inspect}"
      end

      { text: content_data.to_s, type: 'converted' }
    end
  end

  # Extract text from array-based content (Claude's structured message format).
  #
  # For Hash blocks the lookup order is: 'text', :text, the nested 'content'
  # value, then the first non-empty String value of the block. Blocks whose
  # type is 'tool_use' are skipped.
  #
  # @param content_array [Array] Array of content blocks
  # @return [String] Concatenated text content ('' for non-Array input)
  def self.extract_content_from_array(content_array)
    return '' unless content_array.is_a?(Array)

    content_array.filter_map do |block|
      case block
      when Hash
        # Skip tool_use entries as they don't contain displayable text
        next if block['type'] == 'tool_use' || block[:type] == 'tool_use'

        block['text'] || block[:text] ||
          extract_nested_block_content(block['content']) ||
          block.values.find { |v| v.is_a?(String) && v.length.positive? }
      when String
        block
      else
        block.to_s if block.respond_to?(:to_s)
      end
    end.join('')
  end

  # Extract text from hash-based content.
  #
  # @param content_hash [Hash] Hash containing content data
  # @return [String] First String found under a known key, else the Hash rendered via to_s
  def self.extract_content_from_hash(content_hash)
    return '' unless content_hash.is_a?(Hash)

    # Try common content keys in order of preference
    key = CONTENT_KEYS.find { |candidate| content_hash[candidate].is_a?(String) }
    return content_hash[key] if key

    # Fallback: convert whole hash to string for debugging
    content_hash.to_s
  end

  # Validate basic transcript entry structure.
  #
  # @param data [Hash] The transcript entry data
  # @raise [MalformedTranscriptError] If data is not a Hash
  # @raise [MissingRequiredFieldError] If the 'message' key is missing
  def self.validate_transcript_structure!(data)
    raise MalformedTranscriptError, 'Data must be a Hash' unless data.is_a?(Hash)
    return if data.key?('message')

    raise MissingRequiredFieldError, "Missing required 'message' field"
  end

  # Validate message structure.
  #
  # @param message [Hash] The message object
  # @raise [MalformedTranscriptError] If message is not a Hash
  # @raise [MissingRequiredFieldError] If 'role' or 'content' is missing
  def self.validate_message_structure!(message)
    raise MalformedTranscriptError, 'Message must be a Hash' unless message.is_a?(Hash)
    raise MissingRequiredFieldError, "Missing required 'role' field in message" unless message.key?('role')
    return if message.key?('content')

    raise MissingRequiredFieldError, "Missing required 'content' field in message"
  end

  # Extract request ID with fallback.
  #
  # @param data [Hash] The transcript entry data
  # @return [String] 'requestId', else 'request_id', else a generated "req_<time>" value
  def self.extract_request_id(data)
    data['requestId'] || data['request_id'] || "req_#{Time.now.to_f}"
  end

  # Extract timestamp with fallback.
  #
  # @param data [Hash] The transcript entry data
  # @return [Numeric] 'timestamp' or 'created_at' when Numeric, otherwise the current time
  def self.extract_timestamp(data)
    timestamp = data['timestamp'] || data['created_at']
    timestamp.is_a?(Numeric) ? timestamp : Time.now.to_f
  end

  # Extract role with validation.
  #
  # @param message [Hash] The message object
  # @return [String] The role (user, assistant, system) or 'unknown'
  def self.extract_role(message)
    role = message['role'] || message[:role]
    KNOWN_ROLES.include?(role) ? role : 'unknown'
  end

  # Batch process multiple transcript entries with error isolation.
  #
  # nil and blank lines are skipped. In non-strict mode a line that fails to
  # parse as JSON (or fails unexpectedly) becomes an error entry with
  # :line_number, :parse_error, :raw_line and :message instead of aborting.
  #
  # @param lines [Array<String>] Array of JSON strings from transcript file
  # @param strict [Boolean] Whether to raise errors on malformed entries
  # @return [Array<Hash>] Array of parsed entries, with errors isolated
  # @raise [TranscriptParseError] In strict mode, on the first failing line
  def self.parse_transcript_lines(lines, strict: false)
    results = []
    lines.each_with_index do |line, index|
      next if line.nil?

      stripped = line.strip
      next if stripped.empty?

      line_number = index + 1 # 1-based, to match what an editor would show
      begin
        results << parse_transcript_entry(JSON.parse(stripped), strict: strict)
      rescue JSON::ParserError => e
        raise TranscriptParseError, "JSON parse error on line #{line_number}: #{e.message}" if strict

        results << build_error_entry(line_number, stripped, "JSON parse error: #{e.message}", 'json_error')
      rescue StandardError => e
        raise TranscriptParseError, "Parse error on line #{line_number}: #{e.message}" if strict

        results << build_error_entry(line_number, stripped, "Unexpected error: #{e.message}", 'parse_error')
      end
    end
    results
  end

  # Helper method to safely read and parse transcript file.
  #
  # @param file_path [String] Path to transcript file
  # @param strict [Boolean] Whether to raise errors on malformed data
  # @return [Array<Hash>] Array of parsed transcript entries
  # @raise [TranscriptParseError] If the file does not exist
  def self.parse_transcript_file(file_path, strict: false)
    raise TranscriptParseError, "Transcript file not found: #{file_path}" unless File.exist?(file_path)

    lines = File.readlines(file_path, chomp: true)
    parse_transcript_lines(lines, strict: strict)
  end

  # Pull displayable text out of a block's nested 'content' value without
  # assuming its type (it may be a Hash, a String, or an Array of blocks).
  #
  # @param nested [Object] The value stored under a block's 'content' key
  # @return [Object, nil] Extracted text, or nil when nothing usable was found
  def self.extract_nested_block_content(nested)
    case nested
    when Hash
      nested['text']
    when String
      nested.empty? ? nil : nested
    when Array
      text = extract_content_from_array(nested)
      text.empty? ? nil : text
    end
  end

  # Build the placeholder entry recorded for a line that could not be parsed.
  #
  # @param line_number [Integer] 1-based line number
  # @param stripped_line [String] The offending line, already stripped
  # @param error_message [String] Text stored under :parse_error
  # @param error_type [String] Value stored under message[:type]
  # @return [Hash] Error entry
  def self.build_error_entry(line_number, stripped_line, error_message, error_type)
    # Only append the ellipsis when characters were really dropped.
    preview =
      if stripped_line.length > RAW_LINE_PREVIEW_LIMIT
        "#{stripped_line[0, RAW_LINE_PREVIEW_LIMIT]}..."
      else
        stripped_line
      end

    {
      line_number: line_number,
      parse_error: error_message,
      raw_line: preview,
      message: { role: 'error', content: '', type: error_type }
    }
  end

  private_class_method :extract_nested_block_content, :build_error_entry
end