Initial commit

2025-11-30 08:30:18 +08:00
commit 74bee324ab
335 changed files with 147377 additions and 0 deletions
--- a/skills/scientific-slides/scripts/pdf_to_images.py
+++ b/skills/scientific-slides/scripts/pdf_to_images.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python3
+"""
+PDF to Images Converter for Presentations
+
+Converts presentation PDFs to images for visual inspection and review.
+Supports multiple output formats and resolutions.
+"""
+
+import sys
+import os
+import argparse
+import subprocess
+from pathlib import Path
+from typing import Optional, List
+
+# Try to import pdf2image
+try:
+    from pdf2image import convert_from_path
+    HAS_PDF2IMAGE = True
+except ImportError:
+    HAS_PDF2IMAGE = False
+
+
+class PDFToImagesConverter:
+    """Render the pages of a presentation PDF as JPEG or PNG files.
+
+    Images are written as ``<output_prefix>-<number>.<extension>``. Backends
+    are tried in this order: the pdf2image library, the ``pdftoppm`` command,
+    then ImageMagick's ``convert`` command.
+    """
+
+    def __init__(
+        self,
+        pdf_path: str,
+        output_prefix: str,
+        dpi: int = 150,
+        format: str = 'jpg',
+        first_page: Optional[int] = None,
+        last_page: Optional[int] = None
+    ):
+        """Store the conversion settings.
+
+        Args:
+            pdf_path: Path of the PDF to convert.
+            output_prefix: Prefix (optionally with a directory) for outputs.
+            dpi: Rendering resolution.
+            format: 'jpg', 'jpeg' or 'png' (case-insensitive).
+            first_page, last_page: 1-indexed page bounds; None for no limit.
+
+        Raises:
+            ValueError: If ``format`` is not a supported format.
+        """
+        self.pdf_path = Path(pdf_path)
+        self.output_prefix = output_prefix
+        self.dpi = dpi
+        self.format = format.lower()
+        self.first_page = first_page
+        self.last_page = last_page
+
+        if self.format not in ('jpg', 'jpeg', 'png'):
+            raise ValueError(f"Unsupported format: {format}. Use jpg or png.")
+
+    def convert(self) -> List[Path]:
+        """Convert the PDF using the first available backend.
+
+        Returns:
+            Paths of the generated image files.
+
+        Raises:
+            FileNotFoundError: If the PDF does not exist.
+            RuntimeError: If no backend is available or a command fails.
+        """
+        if not self.pdf_path.exists():
+            raise FileNotFoundError(f"PDF not found: {self.pdf_path}")
+
+        print(f"Converting: {self.pdf_path.name}")
+        print(f"Output prefix: {self.output_prefix}")
+        print(f"DPI: {self.dpi}")
+        print(f"Format: {self.format}")
+
+        # Try methods in order of preference
+        if HAS_PDF2IMAGE:
+            return self._convert_with_pdf2image()
+        if self._has_pdftoppm():
+            return self._convert_with_pdftoppm()
+        if self._has_imagemagick():
+            return self._convert_with_imagemagick()
+        raise RuntimeError(
+            "No conversion tool found. Install one of:\n"
+            "  - pdf2image: pip install pdf2image\n"
+            "  - poppler-utils (pdftoppm): apt/brew install poppler-utils\n"
+            "  - ImageMagick: apt/brew install imagemagick"
+        )
+
+    def _pil_format(self) -> str:
+        """Return the format name to pass to Pillow's ``Image.save``.
+
+        Pillow registers its JPEG writer under the name JPEG only, so an
+        upper-cased 'jpg' would make ``save`` fail with ``KeyError: 'JPG'``.
+        """
+        return 'JPEG' if self.format in ('jpg', 'jpeg') else 'PNG'
+
+    def _collect_outputs(self, extension: str) -> List[Path]:
+        """Return the sorted ``<prefix>-*.<extension>`` files, printing each.
+
+        The lookup is a glob in the prefix's directory, so files left there by
+        an earlier run with the same prefix are included as well.
+        """
+        prefix = Path(self.output_prefix)
+        output_files = sorted(prefix.parent.glob(f"{prefix.name}-*.{extension}"))
+        for output_file in output_files:
+            print(f"  Created: {output_file.name}")
+        return output_files
+
+    def _convert_with_pdf2image(self) -> List[Path]:
+        """Convert using the pdf2image library; files are numbered from 001."""
+        print("Using pdf2image library...")
+
+        images = convert_from_path(
+            self.pdf_path,
+            dpi=self.dpi,
+            fmt=self.format,
+            first_page=self.first_page,
+            last_page=self.last_page
+        )
+
+        Path(self.output_prefix).parent.mkdir(parents=True, exist_ok=True)
+        pil_format = self._pil_format()
+
+        output_files = []
+        for i, image in enumerate(images, start=1):
+            output_path = Path(f"{self.output_prefix}-{i:03d}.{self.format}")
+            image.save(output_path, pil_format)
+            output_files.append(output_path)
+            print(f"  Created: {output_path.name}")
+        return output_files
+
+    def _convert_with_pdftoppm(self) -> List[Path]:
+        """Convert using the pdftoppm command-line tool.
+
+        Raises:
+            RuntimeError: If pdftoppm exits with a non-zero status.
+        """
+        print("Using pdftoppm...")
+
+        is_jpeg = self.format in ('jpg', 'jpeg')
+        cmd = ['pdftoppm', '-r', str(self.dpi), '-jpeg' if is_jpeg else '-png']
+
+        # Add page range if specified
+        if self.first_page:
+            cmd.extend(['-f', str(self.first_page)])
+        if self.last_page:
+            cmd.extend(['-l', str(self.last_page)])
+
+        cmd.extend([str(self.pdf_path), self.output_prefix])
+
+        try:
+            subprocess.run(cmd, capture_output=True, text=True, check=True)
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"pdftoppm failed: {e.stderr}") from e
+
+        # pdftoppm names JPEG output "*.jpg" even when 'jpeg' was requested.
+        return self._collect_outputs('jpg' if is_jpeg else 'png')
+
+    def _convert_with_imagemagick(self) -> List[Path]:
+        """Convert using ImageMagick's convert command.
+
+        Raises:
+            RuntimeError: If convert exits with a non-zero status.
+        """
+        print("Using ImageMagick...")
+
+        # Pages are 1-indexed here; the bracket selector is built 0-based.
+        if self.first_page and self.last_page:
+            selector = f"[{self.first_page-1}-{self.last_page-1}]"
+        elif self.first_page:
+            selector = f"[{self.first_page-1}-]"
+        elif self.last_page:
+            selector = f"[0-{self.last_page-1}]"
+        else:
+            selector = ""
+
+        cmd = [
+            'convert',
+            '-density', str(self.dpi),
+            str(self.pdf_path) + selector,
+            f"{self.output_prefix}-%03d.{self.format}",
+        ]
+
+        try:
+            subprocess.run(cmd, capture_output=True, text=True, check=True)
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError(f"ImageMagick failed: {e.stderr}") from e
+
+        return self._collect_outputs(self.format)
+
+    @staticmethod
+    def _runs_ok(cmd: List[str]) -> bool:
+        """Return True if ``cmd`` can be started and exits with status 0."""
+        try:
+            subprocess.run(cmd, capture_output=True, check=True)
+        except (subprocess.CalledProcessError, FileNotFoundError):
+            return False
+        return True
+
+    def _has_pdftoppm(self) -> bool:
+        """Check if pdftoppm is available."""
+        return self._runs_ok(['pdftoppm', '-v'])
+
+    def _has_imagemagick(self) -> bool:
+        """Check if ImageMagick is available."""
+        return self._runs_ok(['convert', '-version'])
+
+
+def main():
+    """Command-line entry point: parse options, convert, print a summary.
+
+    Exits with status 0 on success, or prints the error to stderr and exits
+    with status 1 when the conversion raises an exception.
+    """
+    cli = argparse.ArgumentParser(
+        description='Convert presentation PDFs to images',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  %(prog)s presentation.pdf slides
+    → Creates slides-001.jpg, slides-002.jpg, ...
+  
+  %(prog)s presentation.pdf output/slide --dpi 300 --format png
+    → Creates output/slide-001.png, slide-002.png, ... at high resolution
+  
+  %(prog)s presentation.pdf review/s --first 5 --last 10
+    → Converts only slides 5-10
+
+Output:
+  Images are named: PREFIX-001.FORMAT, PREFIX-002.FORMAT, etc.
+  
+Resolution:
+  - 150 DPI: Good for screen review (default)
+  - 200 DPI: Higher quality for detailed inspection
+  - 300 DPI: Print quality (larger files)
+
+Requirements:
+  Install one of these tools:
+  - pdf2image: pip install pdf2image (recommended)
+  - poppler-utils: apt/brew install poppler-utils
+  - ImageMagick: apt/brew install imagemagick
+        """
+    )
+
+    # Positional arguments
+    cli.add_argument('pdf_path', help='Path to PDF presentation')
+    cli.add_argument(
+        'output_prefix',
+        help='Output filename prefix (e.g., "slides" or "output/slide")'
+    )
+
+    # Rendering options
+    cli.add_argument(
+        '--dpi', '-r',
+        type=int,
+        default=150,
+        help='Resolution in DPI (default: 150)'
+    )
+    cli.add_argument(
+        '--format', '-f',
+        choices=['jpg', 'jpeg', 'png'],
+        default='jpg',
+        help='Output format (default: jpg)'
+    )
+
+    # Optional page range; both bounds are 1-indexed
+    cli.add_argument(
+        '--first', type=int, help='First page to convert (1-indexed)'
+    )
+    cli.add_argument(
+        '--last', type=int, help='Last page to convert (1-indexed)'
+    )
+
+    opts = cli.parse_args()
+
+    # Make sure the directory part of the prefix exists before converting.
+    target_dir = Path(opts.output_prefix).parent
+    if target_dir != Path('.'):
+        target_dir.mkdir(parents=True, exist_ok=True)
+
+    # Convert and summarize; any exception is reported on stderr below.
+    try:
+        images = PDFToImagesConverter(
+            pdf_path=opts.pdf_path,
+            output_prefix=opts.output_prefix,
+            dpi=opts.dpi,
+            format=opts.format,
+            first_page=opts.first,
+            last_page=opts.last
+        ).convert()
+
+        banner = "=" * 60
+        print()
+        print(banner)
+        print(f"✅ Success! Created {len(images)} image(s)")
+        print(banner)
+
+        if images:
+            print(f"\nFirst image: {images[0]}")
+            print(f"Last image: {images[-1]}")
+
+            # Report the combined size of all generated files.
+            total_bytes = sum(image.stat().st_size for image in images)
+            print(f"Total size: {total_bytes / (1024 * 1024):.2f} MB")
+
+            print("\nNext steps:")
+            for step in (
+                "  1. Review images for layout issues",
+                "  2. Check for text overflow or element overlap",
+                "  3. Verify readability from distance",
+                "  4. Document issues with slide numbers",
+            ):
+                print(step)
+    except Exception as e:
+        print(f"\n❌ Error: {str(e)}", file=sys.stderr)
+        sys.exit(1)
+
+    # Reached only on success: the error path above has already exited.
+    sys.exit(0)
+
+
+# Run the CLI only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
+
--- a/skills/scientific-slides/scripts/validate_presentation.py
+++ b/skills/scientific-slides/scripts/validate_presentation.py
@@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+"""
+Presentation Validation Script
+
+Validates scientific presentations for common issues:
+- Slide count vs. duration
+- LaTeX compilation
+- File size checks
+- Basic format validation
+"""
+
+import sys
+import os
+import argparse
+import subprocess
+from pathlib import Path
+from typing import Dict, List, Tuple, Optional
+
+# Try to import PyPDF2 for PDF analysis
+try:
+    import PyPDF2
+    HAS_PYPDF2 = True
+except ImportError:
+    HAS_PYPDF2 = False
+
+# Try to import python-pptx for PowerPoint analysis
+try:
+    from pptx import Presentation
+    HAS_PPTX = True
+except ImportError:
+    HAS_PPTX = False
+
+
+class PresentationValidator:
+    """Validate a PDF, PowerPoint or LaTeX presentation for common issues.
+
+    Findings are collected in three lists: ``issues`` (any entry makes the
+    result invalid), ``warnings`` and ``info``.
+    """
+
+    # Recommended slide counts by duration (min, recommended, max)
+    SLIDE_GUIDELINES = {
+        5: (5, 6, 8),
+        10: (8, 11, 14),
+        15: (13, 16, 20),
+        20: (18, 22, 26),
+        30: (22, 27, 33),
+        45: (32, 40, 50),
+        60: (40, 52, 65),
+    }
+
+    def __init__(self, filepath: str, duration: Optional[int] = None):
+        """Store the file path and the optional talk duration in minutes."""
+        self.filepath = Path(filepath)
+        self.duration = duration
+        self.file_type = self.filepath.suffix.lower()
+        self.issues = []
+        self.warnings = []
+        self.info = []
+
+    def validate(self) -> Dict:
+        """Run all validations and return the dict built by _format_results."""
+        print(f"Validating: {self.filepath.name}")
+        print(f"File type: {self.file_type}")
+        print("=" * 60)
+
+        # Nothing else can be checked on a missing file
+        if not self.filepath.exists():
+            self.issues.append(f"File not found: {self.filepath}")
+            return self._format_results()
+
+        self._check_file_size()
+
+        # Type-specific validation
+        if self.file_type == '.pdf':
+            self._validate_pdf()
+        elif self.file_type in ('.pptx', '.ppt'):
+            self._validate_pptx()
+        elif self.file_type == '.tex':
+            self._validate_latex()
+        else:
+            self.warnings.append(f"Unknown file type: {self.file_type}")
+
+        return self._format_results()
+
+    def _check_file_size(self):
+        """Record the file size; warn above 50 MB, add an issue above 100 MB."""
+        size_mb = self.filepath.stat().st_size / (1024 * 1024)
+        self.info.append(f"File size: {size_mb:.2f} MB")
+
+        if size_mb > 100:
+            self.issues.append(
+                f"File is very large ({size_mb:.1f} MB). Consider compressing images."
+            )
+        elif size_mb > 50:
+            self.warnings.append(
+                f"File is large ({size_mb:.1f} MB). May be slow to email or upload."
+            )
+
+    def _validate_pdf(self):
+        """Check page count, page size and aspect ratio (requires PyPDF2)."""
+        if not HAS_PYPDF2:
+            self.warnings.append(
+                "PyPDF2 not installed. Install with: pip install PyPDF2"
+            )
+            return
+
+        try:
+            with open(self.filepath, 'rb') as f:
+                reader = PyPDF2.PdfReader(f)
+                num_pages = len(reader.pages)
+                self.info.append(f"Number of slides: {num_pages}")
+
+                # An empty PDF has no first page to measure; report that
+                # directly instead of surfacing an index error below.
+                if num_pages == 0:
+                    self.issues.append("PDF contains no pages.")
+                    return
+
+                # Check slide count against duration
+                if self.duration:
+                    self._check_slide_count(num_pages)
+
+                # Get page size (taken from the first page only)
+                media_box = reader.pages[0].mediabox
+                width = float(media_box.width)
+                height = float(media_box.height)
+
+                # Convert points to inches (72 points = 1 inch)
+                width_in = width / 72
+                height_in = height / 72
+                aspect = width / height
+                self.info.append(
+                    f"Slide dimensions: {width_in:.1f}\" × {height_in:.1f}\" "
+                    f"(aspect ratio: {aspect:.2f})"
+                )
+
+                # Check common aspect ratios
+                if abs(aspect - 16/9) < 0.01:
+                    self.info.append("Aspect ratio: 16:9 (widescreen)")
+                elif abs(aspect - 4/3) < 0.01:
+                    self.info.append("Aspect ratio: 4:3 (standard)")
+                else:
+                    self.warnings.append(
+                        f"Unusual aspect ratio: {aspect:.2f}. "
+                        "Confirm this matches venue requirements."
+                    )
+        except Exception as e:
+            self.issues.append(f"Error reading PDF: {str(e)}")
+
+    def _validate_pptx(self):
+        """Check slide count, slide size and text (requires python-pptx)."""
+        if not HAS_PPTX:
+            self.warnings.append(
+                "python-pptx not installed. Install with: pip install python-pptx"
+            )
+            return
+
+        try:
+            prs = Presentation(self.filepath)
+            num_slides = len(prs.slides)
+            self.info.append(f"Number of slides: {num_slides}")
+
+            # Check slide count against duration
+            if self.duration:
+                self._check_slide_count(num_slides)
+
+            # Get slide dimensions
+            width_inches = prs.slide_width / 914400  # EMU to inches
+            height_inches = prs.slide_height / 914400
+            aspect = prs.slide_width / prs.slide_height
+            self.info.append(
+                f"Slide dimensions: {width_inches:.1f}\" × {height_inches:.1f}\" "
+                f"(aspect ratio: {aspect:.2f})"
+            )
+
+            # Check fonts and text
+            self._check_pptx_content(prs)
+        except Exception as e:
+            self.issues.append(f"Error reading PowerPoint: {str(e)}")
+
+    def _check_pptx_content(self, prs):
+        """Warn about slides with text below 18pt or more than six bullets."""
+        small_text_slides = set()
+        many_bullets_slides = set()
+
+        for idx, slide in enumerate(prs.slides, start=1):
+            for shape in slide.shapes:
+                if not shape.has_text_frame:
+                    continue
+                paragraphs = shape.text_frame.paragraphs
+
+                # Runs with no explicit font size are skipped
+                if any(
+                    run.font.size and run.font.size.pt < 18
+                    for paragraph in paragraphs
+                    for run in paragraph.runs
+                ):
+                    small_text_slides.add(idx)
+
+                # Only level-0 paragraphs are counted as bullets
+                if sum(1 for p in paragraphs if p.level == 0) > 6:
+                    many_bullets_slides.add(idx)
+
+        # Report issues
+        for label, slides in (
+            ("Small text (<18pt) found on slides: ", small_text_slides),
+            ("Many bullets (>6) on slides: ", many_bullets_slides),
+        ):
+            if slides:
+                self.warnings.append(label + self._slide_list(slides))
+
+    @staticmethod
+    def _slide_list(slides) -> str:
+        """Format the five lowest slide numbers, adding ' ...' if more exist."""
+        ordered = sorted(slides)
+        return f"{ordered[:5]}" + (" ..." if len(ordered) > 5 else "")
+
+    def _validate_latex(self):
+        """Compile a LaTeX source and validate the PDF it produces, if any."""
+        self.info.append("LaTeX source file detected")
+
+        failure = self._compile_latex()
+        if failure is not None:
+            self.issues.append(failure)
+            return
+        self.info.append("LaTeX compilation: SUCCESS")
+
+        # If PDF was generated, validate it and merge its findings
+        pdf_path = self.filepath.with_suffix('.pdf')
+        if pdf_path.exists():
+            pdf_validator = PresentationValidator(str(pdf_path), self.duration)
+            pdf_results = pdf_validator.validate()
+            self.info.extend(pdf_results['info'])
+            self.warnings.extend(pdf_results['warnings'])
+            self.issues.extend(pdf_results['issues'])
+
+    def _compile_latex(self) -> Optional[str]:
+        """Run pdflatex once; return None on success, else the failure text."""
+        try:
+            result = subprocess.run(
+                ['pdflatex', '-interaction=nonstopmode', self.filepath.name],
+                cwd=self.filepath.parent,
+                capture_output=True,
+                timeout=60
+            )
+        except FileNotFoundError:
+            return "pdflatex not found. Install a TeX distribution to compile."
+        except subprocess.TimeoutExpired:
+            return "LaTeX compilation timed out after 60 seconds."
+        if result.returncode != 0:
+            return "LaTeX compilation failed. Check .log file for errors."
+        return None
+
+    def _try_compile_latex(self) -> bool:
+        """Try to compile the LaTeX file; return True on success."""
+        return self._compile_latex() is None
+
+    def _check_slide_count(self, num_slides: int):
+        """Compare ``num_slides`` with the guideline for the talk duration."""
+        guideline_key = self.duration
+        if guideline_key not in self.SLIDE_GUIDELINES:
+            # No exact entry: use the nearest duration (ties go to the shorter)
+            guideline_key = min(
+                sorted(self.SLIDE_GUIDELINES), key=lambda d: abs(d - self.duration)
+            )
+            self.info.append(
+                f"Using guidelines for {guideline_key}-minute talk "
+                f"(closest to {self.duration} minutes)"
+            )
+        min_slides, rec_slides, max_slides = self.SLIDE_GUIDELINES[guideline_key]
+
+        self.info.append(
+            f"Recommended slides for {self.duration}-minute talk: "
+            f"{min_slides}-{max_slides} (optimal: ~{rec_slides})"
+        )
+
+        if num_slides < min_slides:
+            self.warnings.append(
+                f"Fewer slides ({num_slides}) than recommended ({min_slides}-{max_slides}). "
+                "May have too much time or too little content."
+            )
+        elif num_slides > max_slides:
+            self.warnings.append(
+                f"More slides ({num_slides}) than recommended ({min_slides}-{max_slides}). "
+                "Likely to run over time."
+            )
+        else:
+            self.info.append(f"Slide count ({num_slides}) is within recommended range.")
+
+    def _format_results(self) -> Dict:
+        """Return the findings as a dict; 'valid' is True when no issues exist."""
+        return {
+            'filepath': str(self.filepath),
+            'file_type': self.file_type,
+            'info': self.info,
+            'warnings': self.warnings,
+            'issues': self.issues,
+            'valid': len(self.issues) == 0
+        }
+
+
+def print_results(results: Dict):
+    """Print the validation findings and the overall verdict to stdout.
+
+    Empty sections are omitted; the order is info, warnings, issues.
+    """
+    rule = "=" * 60
+    print()
+    print(rule)
+    print("VALIDATION RESULTS")
+    print(rule)
+
+    # (results key, section heading) in display order
+    sections = (
+        ('info', "\n📋 Information:"),
+        ('warnings', "\n⚠️  Warnings:"),
+        ('issues', "\n❌ Issues:"),
+    )
+    for key, heading in sections:
+        entries = results[key]
+        if not entries:
+            continue
+        print(heading)
+        for entry in entries:
+            print(f"  • {entry}")
+
+    # Overall status
+    print("\n" + rule)
+    if not results['valid']:
+        print("❌ Validation FAILED")
+        print(f"   ({len(results['issues'])} issue(s) found)")
+    else:
+        print("✅ Validation PASSED")
+        if results['warnings']:
+            print(f"   ({len(results['warnings'])} warning(s) found)")
+    print(rule)
+
+
+def main():
+    """Command-line entry point for validating a presentation file.
+
+    Exits with status 0 when no issues were found and 1 otherwise.
+    """
+    parser = argparse.ArgumentParser(
+        description='Validate scientific presentations',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  %(prog)s presentation.pdf --duration 15
+  %(prog)s slides.pptx --duration 45
+  %(prog)s beamer_talk.tex --duration 20
+
+Supported file types:
+  - PDF (.pdf)
+  - PowerPoint (.pptx, .ppt)
+  - LaTeX Beamer (.tex)
+
+Validation checks:
+  - Slide count vs. duration
+  - File size
+  - Slide dimensions
+  - Font sizes (PowerPoint)
+  - LaTeX compilation (Beamer)
+        """
+    )
+
+    parser.add_argument(
+        'filepath',
+        help='Path to presentation file (PDF, PPTX, or TEX)'
+    )
+    parser.add_argument(
+        '--duration', '-d',
+        type=int,
+        help='Presentation duration in minutes'
+    )
+    parser.add_argument(
+        '--quiet', '-q',
+        action='store_true',
+        help='Only show issues and warnings'
+    )
+
+    args = parser.parse_args()
+
+    # Validate
+    validator = PresentationValidator(args.filepath, args.duration)
+    results = validator.validate()
+
+    if not args.quiet:
+        print_results(results)
+    elif results['warnings'] or results['issues']:
+        # --quiet promises issues and warnings only, so drop the info section
+        print_results({**results, 'info': []})
+    else:
+        print("✅ No issues found")
+
+    # Exit with appropriate code
+    sys.exit(0 if results['valid'] else 1)
+
+
+if __name__ == '__main__':
+    main()
+