236 lines
8.6 KiB
Ruby
Executable File
236 lines
8.6 KiB
Ruby
Executable File
#!/usr/bin/env ruby
|
|
# frozen_string_literal: true
|
|
|
|
# TranscriptParser - Defensive JSON parsing utilities for Claude Code transcripts
|
|
#
|
|
# PURPOSE: Provide safe, robust parsing of Claude Code transcript JSON data
|
|
# HANDLES: Variable content formats (String vs Array), malformed data, type validation
|
|
#
|
|
# KEY FEATURES:
|
|
# - Defensive type checking before processing
|
|
# - Graceful degradation for unexpected formats
|
|
# - Comprehensive error handling with meaningful messages
|
|
# - Ruby best practices for type validation
|
|
# - Extensible for future transcript format changes
|
|
|
|
module TranscriptParser
|
|
# Custom errors for specific parsing scenarios
|
|
class TranscriptParseError < StandardError; end
|
|
class MalformedTranscriptError < TranscriptParseError; end
|
|
class UnsupportedContentFormatError < TranscriptParseError; end
|
|
class MissingRequiredFieldError < TranscriptParseError; end
|
|
|
|
# Main entry point for parsing transcript JSON data
|
|
# @param data [Hash] The parsed JSON data from transcript
|
|
# @param strict [Boolean] Whether to raise errors on malformed data (default: false)
|
|
# @return [Hash] Normalized data structure with validated content
|
|
def self.parse_transcript_entry(data, strict: false)
|
|
validate_transcript_structure!(data)
|
|
|
|
result = {
|
|
request_id: extract_request_id(data),
|
|
timestamp: extract_timestamp(data),
|
|
message: parse_message(data['message'], strict: strict)
|
|
}
|
|
|
|
# Add optional fields if present
|
|
result[:tool_name] = data['toolName'] if data.key?('toolName')
|
|
result[:tool_input] = data['toolInput'] if data.key?('toolInput')
|
|
result[:tool_response] = data['toolResponse'] if data.key?('toolResponse')
|
|
|
|
result
|
|
rescue StandardError => e
|
|
raise TranscriptParseError, "Failed to parse transcript entry: #{e.message}" if strict
|
|
|
|
# Return safe fallback structure
|
|
{
|
|
request_id: data['requestId'] || 'unknown',
|
|
timestamp: data['timestamp'] || Time.now.to_f,
|
|
message: { role: 'unknown', content: '', type: 'fallback' },
|
|
parse_error: e.message
|
|
}
|
|
end
|
|
|
|
# Extract and normalize message content with defensive type handling
|
|
# @param message [Hash] The message object from transcript data
|
|
# @param strict [Boolean] Whether to raise errors on unsupported formats
|
|
# @return [Hash] Normalized message with extracted content
|
|
def self.parse_message(message, strict: false)
|
|
validate_message_structure!(message)
|
|
|
|
role = extract_role(message)
|
|
content = extract_content(message['content'], strict: strict)
|
|
|
|
{
|
|
role: role,
|
|
content: content[:text],
|
|
content_type: content[:type],
|
|
raw_content: message['content'] # Preserve original for debugging
|
|
}
|
|
end
|
|
|
|
# Safely extract text content from variable format content field
|
|
# @param content_data [String, Array, Object] The content field from message
|
|
# @param strict [Boolean] Whether to raise errors on unsupported formats
|
|
# @return [Hash] Hash with :text and :type keys
|
|
def self.extract_content(content_data, strict: false)
|
|
case content_data
|
|
when String
|
|
{ text: content_data, type: 'string' }
|
|
when Array
|
|
{ text: extract_content_from_array(content_data), type: 'array' }
|
|
when Hash
|
|
{ text: extract_content_from_hash(content_data), type: 'hash' }
|
|
when nil
|
|
{ text: '', type: 'nil' }
|
|
else
|
|
if strict
|
|
raise UnsupportedContentFormatError,
|
|
"Unsupported content format: #{content_data.class} - #{content_data.inspect}"
|
|
else
|
|
{ text: content_data.to_s, type: 'converted' }
|
|
end
|
|
end
|
|
end
|
|
|
|
# Extract text from array-based content (Claude's structured message format)
|
|
# @param content_array [Array] Array of content blocks
|
|
# @return [String] Concatenated text content
|
|
def self.extract_content_from_array(content_array)
|
|
return '' unless content_array.is_a?(Array)
|
|
|
|
content_array.filter_map do |block|
|
|
case block
|
|
when Hash
|
|
# Skip tool_use entries as they don't contain displayable text
|
|
next if block['type'] == 'tool_use' || block[:type] == 'tool_use'
|
|
|
|
# Handle structured content blocks (e.g., {type: 'text', text: 'content'})
|
|
block['text'] || block[:text] || block.dig('content', 'text') ||
|
|
block.values.find { |v| v.is_a?(String) && v.length.positive? }
|
|
when String
|
|
block
|
|
else
|
|
block.to_s if block.respond_to?(:to_s)
|
|
end
|
|
end.join('')
|
|
end
|
|
|
|
# Extract text from hash-based content
|
|
# @param content_hash [Hash] Hash containing content data
|
|
# @return [String] Extracted text content
|
|
def self.extract_content_from_hash(content_hash)
|
|
return '' unless content_hash.is_a?(Hash)
|
|
|
|
# Try common content keys in order of preference
|
|
%w[text content message body data value].each do |key|
|
|
return content_hash[key] if content_hash.key?(key) && content_hash[key].is_a?(String)
|
|
end
|
|
|
|
# Fallback: convert whole hash to string for debugging
|
|
content_hash.to_s
|
|
end
|
|
|
|
# Validate basic transcript entry structure
|
|
# @param data [Hash] The transcript entry data
|
|
# @raise [MalformedTranscriptError] If required fields are missing
|
|
def self.validate_transcript_structure!(data)
|
|
raise MalformedTranscriptError, 'Data must be a Hash' unless data.is_a?(Hash)
|
|
|
|
return if data.key?('message')
|
|
|
|
raise MissingRequiredFieldError, "Missing required 'message' field"
|
|
end
|
|
|
|
# Validate message structure
|
|
# @param message [Hash] The message object
|
|
# @raise [MalformedTranscriptError] If message structure is invalid
|
|
def self.validate_message_structure!(message)
|
|
raise MalformedTranscriptError, 'Message must be a Hash' unless message.is_a?(Hash)
|
|
|
|
raise MissingRequiredFieldError, "Missing required 'role' field in message" unless message.key?('role')
|
|
|
|
return if message.key?('content')
|
|
|
|
raise MissingRequiredFieldError, "Missing required 'content' field in message"
|
|
end
|
|
|
|
# Extract request ID with fallback
|
|
# @param data [Hash] The transcript entry data
|
|
# @return [String] Request ID or generated fallback
|
|
def self.extract_request_id(data)
|
|
data['requestId'] || data['request_id'] || "req_#{Time.now.to_f}"
|
|
end
|
|
|
|
# Extract timestamp with fallback
|
|
# @param data [Hash] The transcript entry data
|
|
# @return [Float] Timestamp or current time
|
|
def self.extract_timestamp(data)
|
|
timestamp = data['timestamp'] || data['created_at'] || Time.now.to_f
|
|
timestamp.is_a?(Numeric) ? timestamp : Time.now.to_f
|
|
end
|
|
|
|
# Extract role with validation
|
|
# @param message [Hash] The message object
|
|
# @return [String] The role (user, assistant, system)
|
|
def self.extract_role(message)
|
|
role = message['role'] || message[:role]
|
|
return role if %w[user assistant system].include?(role)
|
|
|
|
# Fallback role detection
|
|
'unknown'
|
|
end
|
|
|
|
# Batch process multiple transcript entries with error isolation
|
|
# @param lines [Array<String>] Array of JSON strings from transcript file
|
|
# @param strict [Boolean] Whether to raise errors on malformed entries
|
|
# @return [Array<Hash>] Array of parsed entries, with errors isolated
|
|
def self.parse_transcript_lines(lines, strict: false)
|
|
results = []
|
|
|
|
lines.each_with_index do |line, index|
|
|
next if line.nil? || line.strip.empty?
|
|
|
|
begin
|
|
data = JSON.parse(line.strip)
|
|
results << parse_transcript_entry(data, strict: strict)
|
|
rescue JSON::ParserError => e
|
|
error_entry = {
|
|
line_number: index + 1,
|
|
parse_error: "JSON parse error: #{e.message}",
|
|
raw_line: line.strip[0..100] + (line.length > 100 ? '...' : ''),
|
|
message: { role: 'error', content: '', type: 'json_error' }
|
|
}
|
|
|
|
raise TranscriptParseError, "JSON parse error on line #{index + 1}: #{e.message}" if strict
|
|
|
|
results << error_entry
|
|
rescue StandardError => e
|
|
error_entry = {
|
|
line_number: index + 1,
|
|
parse_error: "Unexpected error: #{e.message}",
|
|
raw_line: line.strip[0..100] + (line.length > 100 ? '...' : ''),
|
|
message: { role: 'error', content: '', type: 'parse_error' }
|
|
}
|
|
|
|
raise TranscriptParseError, "Parse error on line #{index + 1}: #{e.message}" if strict
|
|
|
|
results << error_entry
|
|
end
|
|
end
|
|
|
|
results
|
|
end
|
|
|
|
# Helper method to safely read and parse transcript file
|
|
# @param file_path [String] Path to transcript file
|
|
# @param strict [Boolean] Whether to raise errors on malformed data
|
|
# @return [Array<Hash>] Array of parsed transcript entries
|
|
def self.parse_transcript_file(file_path, strict: false)
|
|
raise TranscriptParseError, "Transcript file not found: #{file_path}" unless File.exist?(file_path)
|
|
|
|
lines = File.readlines(file_path, chomp: true)
|
|
parse_transcript_lines(lines, strict: strict)
|
|
end
|
|
end
|