#!/usr/bin/env python3
"""
ScholarEval Score Calculator

Calculate aggregate evaluation scores from dimension-level ratings.
Supports weighted averaging, threshold analysis, and score visualization.

Usage:
    python calculate_scores.py --scores <dimension_scores.json> --output <report.txt>
    python calculate_scores.py --scores <dimension_scores.json> --weights <weights.json>
    python calculate_scores.py --interactive

Author: ScholarEval Framework
License: MIT
"""
import argparse
import json
import sys
from pathlib import Path
from typing import Dict, Optional
# Default dimension weights (total = 100%)
DEFAULT_WEIGHTS = {
    "problem_formulation": 0.15,
    "literature_review": 0.15,
    "methodology": 0.20,
    "data_collection": 0.10,
    "analysis": 0.15,
    "results": 0.10,
    "writing": 0.10,
    "citations": 0.05,
}
# Quality level definitions: (low, high) score band -> (level, description)
QUALITY_LEVELS = {
    (4.5, 5.0): ("Exceptional", "Ready for top-tier publication"),
    (4.0, 4.4): ("Strong", "Publication-ready with minor revisions"),
    (3.5, 3.9): ("Good", "Major revisions required, promising work"),
    (3.0, 3.4): ("Acceptable", "Significant revisions needed"),
    (2.0, 2.9): ("Weak", "Fundamental issues, major rework required"),
    (0.0, 1.9): ("Poor", "Not suitable without complete revision"),
}
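
# For example, an overall score of 4.2 falls in the (4.0, 4.4) band and is
# reported as "Strong"; get_quality_level() below also maps scores that land
# between bands (e.g. 4.45) to the band beneath them.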


def load_scores(filepath: Path) -> Dict[str, float]:
    """Load dimension scores from JSON file."""
    try:
        with open(filepath, 'r') as f:
            scores = json.load(f)
        # Validate scores
        for dim, score in scores.items():
            if not 1 <= score <= 5:
                raise ValueError(f"Score for {dim} must be between 1 and 5, got {score}")
        return scores
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
        sys.exit(1)
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in {filepath}")
        sys.exit(1)
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)


def load_weights(filepath: Optional[Path] = None) -> Dict[str, float]:
    """Load dimension weights from JSON file or return defaults."""
    if filepath is None:
        return DEFAULT_WEIGHTS
    try:
        with open(filepath, 'r') as f:
            weights = json.load(f)
        # Validate weights sum to 1.0
        total = sum(weights.values())
        if not 0.99 <= total <= 1.01:  # Allow small floating point errors
            raise ValueError(f"Weights must sum to 1.0, got {total}")
        return weights
    except FileNotFoundError:
        print(f"Error: File not found: {filepath}")
        sys.exit(1)
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON in {filepath}")
        sys.exit(1)
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)


def calculate_weighted_average(scores: Dict[str, float], weights: Dict[str, float]) -> float:
    """Calculate weighted average score."""
    total_score = 0.0
    total_weight = 0.0
    for dimension, score in scores.items():
        # Handle dimension name variations (e.g., "problem_formulation" vs "problem-formulation")
        dim_key = dimension.replace('-', '_').lower()
        weight = weights.get(dim_key, 0.0)
        total_score += score * weight
        total_weight += weight
    # Dividing by the weight actually covered renormalizes the result, so a
    # partial set of scored dimensions still yields an average on the 1-5 scale
    if total_weight > 0:
        return total_score / total_weight
    return 0.0
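
# Worked example of the renormalization (illustrative values): with only
# methodology=4.0 (weight 0.20) and analysis=3.0 (weight 0.15) scored,
# total_score = 4.0*0.20 + 3.0*0.15 = 1.25 and total_weight = 0.35, so the
# weighted average is 1.25 / 0.35 ≈ 3.57, still on the 1-5 scale.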


def get_quality_level(score: float) -> tuple:
    """Get quality level description for a given score."""
    # Scan bands from highest to lowest so scores that fall between bands
    # (e.g. 4.45, which no (low, high) pair contains) map to the band below
    for (low, _high), (level, description) in sorted(QUALITY_LEVELS.items(), reverse=True):
        if score >= low:
            return level, description
    return "Unknown", "Score out of expected range"


def generate_bar_chart(scores: Dict[str, float], max_width: int = 50) -> str:
    """Generate ASCII bar chart of dimension scores."""
    lines = []
    max_name_len = max(len(name) for name in scores.keys())
    for dimension, score in sorted(scores.items(), key=lambda x: x[1], reverse=True):
        bar_length = int((score / 5.0) * max_width)
        bar = '█' * bar_length
        padding = ' ' * (max_name_len - len(dimension))
        lines.append(f" {dimension}{padding} {bar} {score:.2f}")
    return '\n'.join(lines)
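
# Illustrative output for {"methodology": 4.0, "writing": 3.5} with
# max_width=20 (each █ covers 1/20 of the 0-5 scale):
#
#    methodology ████████████████ 4.00
#    writing     ██████████████ 3.50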


def identify_strengths_weaknesses(scores: Dict[str, float]) -> tuple:
    """Identify top strengths and areas for improvement."""
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    strengths = [dim for dim, score in sorted_scores[:3] if score >= 4.0]
    weaknesses = [dim for dim, score in sorted_scores[-3:] if score < 3.5]
    return strengths, weaknesses
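
# For example, {"methodology": 4.5, "writing": 4.2, "citations": 2.8} yields
# strengths ["methodology", "writing"] and weaknesses ["citations"]; the
# 4.0 / 3.5 cutoffs guarantee the two lists never overlap.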


def generate_report(scores: Dict[str, float], weights: Dict[str, float],
                    output_file: Optional[Path] = None) -> str:
    """Generate comprehensive evaluation report."""
    overall_score = calculate_weighted_average(scores, weights)
    quality_level, quality_desc = get_quality_level(overall_score)
    strengths, weaknesses = identify_strengths_weaknesses(scores)
    report_lines = [
        "=" * 70,
        "SCHOLAREVAL SCORE REPORT",
        "=" * 70,
        "",
        f"Overall Score: {overall_score:.2f} / 5.00",
        f"Quality Level: {quality_level}",
        f"Assessment: {quality_desc}",
        "",
        "=" * 70,
        "DIMENSION SCORES",
        "=" * 70,
        "",
        generate_bar_chart(scores),
        "",
        "=" * 70,
        "DETAILED BREAKDOWN",
        "=" * 70,
        "",
    ]
    # Add detailed scores with weights
    for dimension, score in sorted(scores.items()):
        dim_key = dimension.replace('-', '_').lower()
        weight = weights.get(dim_key, 0.0)
        weighted_contribution = score * weight
        percentage = weight * 100
        report_lines.append(
            f" {dimension:25s} {score:.2f}/5.00 "
            f"(weight: {percentage:4.1f}%, contribution: {weighted_contribution:.3f})"
        )
    report_lines.extend([
        "",
        "=" * 70,
        "ASSESSMENT SUMMARY",
        "=" * 70,
        "",
    ])
    if strengths:
        report_lines.append("Top Strengths:")
        for dim in strengths:
            report_lines.append(f"  - {dim}: {scores[dim]:.2f}/5.00")
        report_lines.append("")
    if weaknesses:
        report_lines.append("Areas for Improvement:")
        for dim in weaknesses:
            report_lines.append(f"  - {dim}: {scores[dim]:.2f}/5.00")
        report_lines.append("")
    # Add recommendations based on score
    report_lines.extend([
        "=" * 70,
        "RECOMMENDATIONS",
        "=" * 70,
        "",
    ])
    if overall_score >= 4.5:
        report_lines.append("Excellent work! Ready for submission to top-tier venues.")
    elif overall_score >= 4.0:
        report_lines.append("Strong work. Address minor issues identified in weaknesses.")
    elif overall_score >= 3.5:
        report_lines.append("Good foundation. Focus on major revisions in weak dimensions.")
    elif overall_score >= 3.0:
        report_lines.append("Significant revisions needed. Prioritize weakest dimensions.")
    elif overall_score >= 2.0:
        report_lines.append("Major rework required. Consider restructuring approach.")
    else:
        report_lines.append("Fundamental revision needed across multiple dimensions.")
    report_lines.append("")
    report_lines.append("=" * 70)
    report = '\n'.join(report_lines)
    # Write to file if specified
    if output_file:
        try:
            with open(output_file, 'w') as f:
                f.write(report)
            print(f"\nReport saved to: {output_file}")
        except IOError as e:
            print(f"Error writing to {output_file}: {e}")
    return report
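
# Programmatic use (a minimal sketch, assuming this file is importable as a
# module named calculate_scores):
#
#   from calculate_scores import generate_report, DEFAULT_WEIGHTS
#   print(generate_report({"methodology": 4.0, "writing": 3.5}, DEFAULT_WEIGHTS))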


def interactive_mode():
    """Run interactive score entry mode."""
    print("ScholarEval Interactive Score Calculator")
    print("=" * 50)
    print("\nEnter scores for each dimension (1-5):")
    print("(Press Enter to skip a dimension)\n")
    scores = {}
    dimensions = [
        "problem_formulation",
        "literature_review",
        "methodology",
        "data_collection",
        "analysis",
        "results",
        "writing",
        "citations",
    ]
    for dim in dimensions:
        dim_display = dim.replace('_', ' ').title()
        while True:
            user_input = input(f"{dim_display}: ").strip()
            if not user_input:
                break
            try:
                score = float(user_input)
                if 1 <= score <= 5:
                    scores[dim] = score
                    break
                else:
                    print("  Score must be between 1 and 5")
            except ValueError:
                print("  Invalid input. Please enter a number between 1 and 5")
    if not scores:
        print("\nNo scores entered. Exiting.")
        return
    print("\n" + "=" * 50)
    print("SCORES ENTERED:")
    for dim, score in scores.items():
        print(f"  {dim.replace('_', ' ').title()}: {score}")
    print("\nCalculating overall assessment...\n")
    report = generate_report(scores, DEFAULT_WEIGHTS)
    print(report)
    # Ask if user wants to save
    save = input("\nSave report to file? (y/n): ").strip().lower()
    if save == 'y':
        filename = input("Enter filename [scholareval_report.txt]: ").strip()
        if not filename:
            filename = "scholareval_report.txt"
        generate_report(scores, DEFAULT_WEIGHTS, Path(filename))
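
# Illustrative session (user input follows each prompt; pressing Enter on
# Literature Review skips that dimension):
#
#   Problem Formulation: 4
#   Literature Review:
#   Methodology: 3.5
#   ...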


def main():
    parser = argparse.ArgumentParser(
        description="Calculate aggregate ScholarEval scores from dimension ratings",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Calculate from JSON file
  python calculate_scores.py --scores my_scores.json

  # Calculate with custom weights
  python calculate_scores.py --scores my_scores.json --weights custom_weights.json

  # Save report to file
  python calculate_scores.py --scores my_scores.json --output report.txt

  # Interactive mode
  python calculate_scores.py --interactive

Score JSON Format:
  {
    "problem_formulation": 4.5,
    "literature_review": 4.0,
    "methodology": 3.5,
    "data_collection": 4.0,
    "analysis": 3.5,
    "results": 4.0,
    "writing": 4.5,
    "citations": 4.0
  }

Weights JSON Format:
  {
    "problem_formulation": 0.15,
    "literature_review": 0.15,
    "methodology": 0.20,
    "data_collection": 0.10,
    "analysis": 0.15,
    "results": 0.10,
    "writing": 0.10,
    "citations": 0.05
  }
"""
    )
    parser.add_argument('--scores', type=Path, help='Path to JSON file with dimension scores')
    parser.add_argument('--weights', type=Path, help='Path to JSON file with dimension weights (optional)')
    parser.add_argument('--output', type=Path, help='Path to output report file (optional)')
    parser.add_argument('--interactive', '-i', action='store_true', help='Run in interactive mode')
    args = parser.parse_args()
    # Interactive mode
    if args.interactive:
        interactive_mode()
        return

    # File mode
    if not args.scores:
        parser.print_help()
        print("\nError: --scores is required (or use --interactive)")
        sys.exit(1)

    scores = load_scores(args.scores)
    weights = load_weights(args.weights)
    report = generate_report(scores, weights, args.output)
    # Print to stdout if no output file specified
    if not args.output:
        print(report)


if __name__ == '__main__':
    main()