Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:18 +08:00
commit 74bee324ab
335 changed files with 147377 additions and 0 deletions

View File

@@ -0,0 +1,331 @@
#!/usr/bin/env python3
"""
PDF to Images Converter for Presentations
Converts presentation PDFs to images for visual inspection and review.
Supports multiple output formats and resolutions.
"""
import sys
import os
import argparse
import subprocess
from pathlib import Path
from typing import Optional, List
# Try to import pdf2image
try:
from pdf2image import convert_from_path
HAS_PDF2IMAGE = True
except ImportError:
HAS_PDF2IMAGE = False
class PDFToImagesConverter:
    """Convert a PDF presentation into one image file per page.

    Three back ends are tried in order of preference: the ``pdf2image``
    library, the ``pdftoppm`` command-line tool, and ImageMagick's
    ``convert`` command.
    """

    def __init__(
        self,
        pdf_path: str,
        output_prefix: str,
        dpi: int = 150,
        format: str = 'jpg',
        first_page: Optional[int] = None,
        last_page: Optional[int] = None
    ):
        """Store the conversion settings and validate the output format.

        Args:
            pdf_path: Path to the PDF to convert.
            output_prefix: Filename prefix for the generated images; may
                include a directory part (e.g. ``"output/slide"``).
            dpi: Rendering resolution.
            format: ``jpg``, ``jpeg`` or ``png`` (case-insensitive).
            first_page: First page to convert (1-indexed), or None.
            last_page: Last page to convert (1-indexed), or None.

        Raises:
            ValueError: If ``format`` is not one of the supported formats.
        """
        self.pdf_path = Path(pdf_path)
        self.output_prefix = output_prefix
        self.dpi = dpi
        self.format = format.lower()
        self.first_page = first_page
        self.last_page = last_page
        # Validate format
        if self.format not in ['jpg', 'jpeg', 'png']:
            raise ValueError(f"Unsupported format: {format}. Use jpg or png.")

    def convert(self) -> List[Path]:
        """Convert the PDF with the first available back end.

        Returns:
            Paths of the generated image files.

        Raises:
            FileNotFoundError: If the PDF does not exist.
            RuntimeError: If no conversion tool is available or the
                external tool reports a failure.
        """
        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {self.pdf_path}")
        print(f"Converting: {self.pdf_path.name}")
        print(f"Output prefix: {self.output_prefix}")
        print(f"DPI: {self.dpi}")
        print(f"Format: {self.format}")
        # Try methods in order of preference
        if HAS_PDF2IMAGE:
            return self._convert_with_pdf2image()
        elif self._has_pdftoppm():
            return self._convert_with_pdftoppm()
        elif self._has_imagemagick():
            return self._convert_with_imagemagick()
        else:
            raise RuntimeError(
                "No conversion tool found. Install one of:\n"
                " - pdf2image: pip install pdf2image\n"
                " - poppler-utils (pdftoppm): apt/brew install poppler-utils\n"
                " - ImageMagick: apt/brew install imagemagick"
            )

    def _is_jpeg(self) -> bool:
        """Return True when the requested format is a JPEG spelling."""
        return self.format in ('jpg', 'jpeg')

    def _pil_format(self) -> str:
        """Return the Pillow writer name for the requested format.

        Pillow registers the JPEG writer as ``JPEG``; ``JPG`` is not a
        known format name, so the file extension cannot simply be
        upper-cased.
        """
        return 'JPEG' if self._is_jpeg() else 'PNG'

    def _pdftoppm_extension(self) -> str:
        """Return the extension pdftoppm gives its output files.

        pdftoppm writes ``.jpg`` for ``-jpeg`` regardless of whether the
        caller asked for ``jpg`` or ``jpeg``.
        """
        return 'jpg' if self._is_jpeg() else 'png'

    def _ensure_output_dir(self) -> Path:
        """Create the directory part of the output prefix and return it."""
        output_dir = Path(self.output_prefix).parent
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir

    def _run_command(self, cmd: List[str], tool_name: str) -> None:
        """Run an external tool, turning a non-zero exit into RuntimeError."""
        try:
            subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as e:
            # Chain the cause so the original exit status stays inspectable.
            raise RuntimeError(f"{tool_name} failed: {e.stderr}") from e

    def _collect_outputs(self, extension: str) -> List[Path]:
        """Return (and report) files named ``PREFIX-*.extension``, sorted."""
        output_dir = Path(self.output_prefix).parent
        pattern = f"{Path(self.output_prefix).name}-*.{extension}"
        output_files = sorted(output_dir.glob(pattern))
        for f in output_files:
            print(f" Created: {f.name}")
        return output_files

    def _convert_with_pdf2image(self) -> List[Path]:
        """Convert using pdf2image library."""
        print("Using pdf2image library...")
        images = convert_from_path(
            self.pdf_path,
            dpi=self.dpi,
            fmt=self.format,
            first_page=self.first_page,
            last_page=self.last_page
        )
        self._ensure_output_dir()
        pil_format = self._pil_format()
        output_files = []
        for i, image in enumerate(images, start=1):
            output_path = Path(f"{self.output_prefix}-{i:03d}.{self.format}")
            image.save(output_path, pil_format)
            output_files.append(output_path)
            print(f" Created: {output_path.name}")
        return output_files

    def _convert_with_pdftoppm(self) -> List[Path]:
        """Convert using pdftoppm command-line tool."""
        print("Using pdftoppm...")
        # Build command
        cmd = ['pdftoppm', '-r', str(self.dpi)]
        # Add format flag
        cmd.append('-jpeg' if self._is_jpeg() else '-png')
        # Add page range if specified
        if self.first_page:
            cmd.extend(['-f', str(self.first_page)])
        if self.last_page:
            cmd.extend(['-l', str(self.last_page)])
        # Add input and output
        cmd.extend([str(self.pdf_path), self.output_prefix])
        self._ensure_output_dir()
        self._run_command(cmd, "pdftoppm")
        # Look for the extension pdftoppm really used, not the requested one.
        return self._collect_outputs(self._pdftoppm_extension())

    def _convert_with_imagemagick(self) -> List[Path]:
        """Convert using ImageMagick convert command."""
        print("Using ImageMagick...")
        # ImageMagick selects pages with a zero-based [a-b] suffix on the
        # input filename.
        source = str(self.pdf_path)
        if self.first_page and self.last_page:
            source += f"[{self.first_page-1}-{self.last_page-1}]"
        elif self.first_page:
            # NOTE(review): open-ended range syntax; confirm it is accepted
            # by the installed ImageMagick version.
            source += f"[{self.first_page-1}-]"
        elif self.last_page:
            source += f"[0-{self.last_page-1}]"
        # Output path; "-scene 1" makes %03d start at 001 like the other
        # back ends and the documented PREFIX-001 naming.
        output_path = f"{self.output_prefix}-%03d.{self.format}"
        cmd = [
            'convert',
            '-density', str(self.dpi),
            source,
            '-scene', '1',
            output_path
        ]
        self._ensure_output_dir()
        self._run_command(cmd, "ImageMagick")
        return self._collect_outputs(self.format)

    @staticmethod
    def _tool_available(probe_cmd: List[str]) -> bool:
        """Return True if the probe command runs and exits with status 0."""
        try:
            subprocess.run(probe_cmd, capture_output=True, check=True)
            return True
        except (subprocess.CalledProcessError, FileNotFoundError):
            return False

    def _has_pdftoppm(self) -> bool:
        """Check if pdftoppm is available."""
        return self._tool_available(['pdftoppm', '-v'])

    def _has_imagemagick(self) -> bool:
        """Check if ImageMagick is available."""
        return self._tool_available(['convert', '-version'])
def main():
    """Command-line entry point: parse arguments, convert, print a summary.

    Exits with status 0 on success and 1 when the conversion raises.
    """
    cli = argparse.ArgumentParser(
        description='Convert presentation PDFs to images',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s presentation.pdf slides
→ Creates slides-001.jpg, slides-002.jpg, ...
%(prog)s presentation.pdf output/slide --dpi 300 --format png
→ Creates output/slide-001.png, slide-002.png, ... at high resolution
%(prog)s presentation.pdf review/s --first 5 --last 10
→ Converts only slides 5-10
Output:
Images are named: PREFIX-001.FORMAT, PREFIX-002.FORMAT, etc.
Resolution:
- 150 DPI: Good for screen review (default)
- 200 DPI: Higher quality for detailed inspection
- 300 DPI: Print quality (larger files)
Requirements:
Install one of these tools:
- pdf2image: pip install pdf2image (recommended)
- poppler-utils: apt/brew install poppler-utils
- ImageMagick: apt/brew install imagemagick
"""
    )
    cli.add_argument('pdf_path', help='Path to PDF presentation')
    cli.add_argument(
        'output_prefix',
        help='Output filename prefix (e.g., "slides" or "output/slide")'
    )
    cli.add_argument(
        '--dpi', '-r', type=int, default=150,
        help='Resolution in DPI (default: 150)'
    )
    cli.add_argument(
        '--format', '-f', choices=['jpg', 'jpeg', 'png'], default='jpg',
        help='Output format (default: jpg)'
    )
    cli.add_argument(
        '--first', type=int, help='First page to convert (1-indexed)'
    )
    cli.add_argument(
        '--last', type=int, help='Last page to convert (1-indexed)'
    )
    options = cli.parse_args()

    # A bare prefix lives in the current directory, which needs no creation.
    target_dir = Path(options.output_prefix).parent
    if target_dir != Path('.'):
        target_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 60
    try:
        created = PDFToImagesConverter(
            pdf_path=options.pdf_path,
            output_prefix=options.output_prefix,
            dpi=options.dpi,
            format=options.format,
            first_page=options.first,
            last_page=options.last
        ).convert()
        print()
        print(banner)
        print(f"✅ Success! Created {len(created)} image(s)")
        print(banner)
        if created:
            print(f"\nFirst image: {created[0]}")
            print(f"Last image: {created[-1]}")
            # Sum the on-disk sizes and report them in megabytes.
            byte_total = 0
            for image_file in created:
                byte_total += image_file.stat().st_size
            megabytes = byte_total / (1024 * 1024)
            print(f"Total size: {megabytes:.2f} MB")
        print("\nNext steps:")
        for step in (
            " 1. Review images for layout issues",
            " 2. Check for text overflow or element overlap",
            " 3. Verify readability from distance",
            " 4. Document issues with slide numbers",
        ):
            print(step)
    except Exception as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
    # Only reached on success; the failure path has already exited.
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,403 @@
#!/usr/bin/env python3
"""
Presentation Validation Script
Validates scientific presentations for common issues:
- Slide count vs. duration
- LaTeX compilation
- File size checks
- Basic format validation
"""
import sys
import os
import argparse
import subprocess
from pathlib import Path
from typing import Dict, List, Tuple, Optional
# Try to import PyPDF2 for PDF analysis
try:
import PyPDF2
HAS_PYPDF2 = True
except ImportError:
HAS_PYPDF2 = False
# Try to import python-pptx for PowerPoint analysis
try:
from pptx import Presentation
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
class PresentationValidator:
    """Validate a presentation file (PDF, PowerPoint or LaTeX source).

    Findings are collected in three lists: ``info`` (neutral facts),
    ``warnings`` (worth a look, do not fail validation) and ``issues``
    (fail validation).
    """

    # Recommended slide counts by duration (min, recommended, max)
    SLIDE_GUIDELINES = {
        5: (5, 6, 8),
        10: (8, 11, 14),
        15: (13, 16, 20),
        20: (18, 22, 26),
        30: (22, 27, 33),
        45: (32, 40, 50),
        60: (40, 52, 65),
    }

    def __init__(self, filepath: str, duration: Optional[int] = None):
        """Remember the file to check and the planned talk length.

        Args:
            filepath: Path to the presentation file.
            duration: Talk duration in minutes; when falsy the slide-count
                check is skipped.
        """
        self.filepath = Path(filepath)
        self.duration = duration
        self.file_type = self.filepath.suffix.lower()
        self.issues: List[str] = []
        self.warnings: List[str] = []
        self.info: List[str] = []
        # Set by _try_compile_latex when the pdflatex executable is absent.
        self._pdflatex_missing = False

    def validate(self) -> Dict:
        """Run all validations and return the result dictionary.

        Returns:
            Dict with keys ``filepath``, ``file_type``, ``info``,
            ``warnings``, ``issues`` and ``valid``.
        """
        print(f"Validating: {self.filepath.name}")
        print(f"File type: {self.file_type}")
        print("=" * 60)
        # Check file exists
        if not self.filepath.exists():
            self.issues.append(f"File not found: {self.filepath}")
            return self._format_results()
        # File size check
        self._check_file_size()
        # Type-specific validation
        if self.file_type == '.pdf':
            self._validate_pdf()
        elif self.file_type in ['.pptx', '.ppt']:
            self._validate_pptx()
        elif self.file_type in ['.tex']:
            self._validate_latex()
        else:
            self.warnings.append(f"Unknown file type: {self.file_type}")
        return self._format_results()

    def _check_file_size(self):
        """Record the file size; flag files above 50 MB / 100 MB."""
        size_mb = self.filepath.stat().st_size / (1024 * 1024)
        self.info.append(f"File size: {size_mb:.2f} MB")
        if size_mb > 100:
            self.issues.append(
                f"File is very large ({size_mb:.1f} MB). "
                "Consider compressing images."
            )
        elif size_mb > 50:
            self.warnings.append(
                f"File is large ({size_mb:.1f} MB). "
                "May be slow to email or upload."
            )

    def _report_dimensions(self, width_in: float, height_in: float):
        """Record slide dimensions (in inches) and classify the aspect ratio.

        A non-positive width or height is reported as a warning instead of
        dividing by zero.
        """
        if width_in <= 0 or height_in <= 0:
            self.warnings.append(
                "Could not determine slide dimensions "
                "(width or height is zero)."
            )
            return
        aspect = width_in / height_in
        self.info.append(
            f"Slide dimensions: {width_in:.1f}\" × {height_in:.1f}\" "
            f"(aspect ratio: {aspect:.2f})"
        )
        # Check common aspect ratios
        if abs(aspect - 16/9) < 0.01:
            self.info.append("Aspect ratio: 16:9 (widescreen)")
        elif abs(aspect - 4/3) < 0.01:
            self.info.append("Aspect ratio: 4:3 (standard)")
        else:
            self.warnings.append(
                f"Unusual aspect ratio: {aspect:.2f}. "
                "Confirm this matches venue requirements."
            )

    def _validate_pdf(self):
        """Validate PDF presentation (page count and page dimensions)."""
        if not HAS_PYPDF2:
            self.warnings.append(
                "PyPDF2 not installed. Install with: pip install PyPDF2"
            )
            return
        try:
            with open(self.filepath, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                num_pages = len(reader.pages)
                self.info.append(f"Number of slides: {num_pages}")
                if num_pages == 0:
                    # Nothing to measure; say so instead of an IndexError.
                    self.issues.append("PDF contains no pages.")
                    return
                # Check slide count against duration
                if self.duration:
                    self._check_slide_count(num_pages)
                # Get page size of the first page
                media_box = reader.pages[0].mediabox
                # Convert points to inches (72 points = 1 inch)
                width_in = float(media_box.width) / 72
                height_in = float(media_box.height) / 72
            self._report_dimensions(width_in, height_in)
        except Exception as e:
            # Any parsing failure is reported as a validation issue.
            self.issues.append(f"Error reading PDF: {str(e)}")

    def _validate_pptx(self):
        """Validate PowerPoint presentation."""
        if not HAS_PPTX:
            self.warnings.append(
                "python-pptx not installed. Install with: pip install python-pptx"
            )
            return
        try:
            prs = Presentation(str(self.filepath))
            num_slides = len(prs.slides)
            self.info.append(f"Number of slides: {num_slides}")
            # Check slide count against duration
            if self.duration:
                self._check_slide_count(num_slides)
            # Get slide dimensions; skipped when the file declares none.
            if prs.slide_width and prs.slide_height:
                self._report_dimensions(
                    prs.slide_width / 914400,  # EMU to inches
                    prs.slide_height / 914400
                )
            # Check fonts and text
            self._check_pptx_content(prs)
        except Exception as e:
            self.issues.append(f"Error reading PowerPoint: {str(e)}")

    def _check_pptx_content(self, prs):
        """Warn about slides with small fonts or more than six bullets.

        Only runs with an explicit font size are considered for the font
        check; empty paragraphs are not counted as bullets.
        """
        small_text_slides = set()
        many_bullets_slides = set()
        for idx, slide in enumerate(prs.slides, start=1):
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue
                paragraphs = shape.text_frame.paragraphs
                # Check for small fonts
                if any(
                    run.font.size and run.font.size.pt < 18
                    for paragraph in paragraphs
                    for run in paragraph.runs
                ):
                    small_text_slides.add(idx)
                # Check for too many bullets
                bullet_count = sum(
                    1 for p in paragraphs
                    if p.level == 0 and p.text.strip()
                )
                if bullet_count > 6:
                    many_bullets_slides.add(idx)
        # Report issues (at most five slide numbers per message)
        if small_text_slides:
            unique_slides = sorted(small_text_slides)
            self.warnings.append(
                f"Small text (<18pt) found on slides: {unique_slides[:5]}"
                + (" ..." if len(unique_slides) > 5 else "")
            )
        if many_bullets_slides:
            unique_slides = sorted(many_bullets_slides)
            self.warnings.append(
                f"Many bullets (>6) on slides: {unique_slides[:5]}"
                + (" ..." if len(unique_slides) > 5 else "")
            )

    def _validate_latex(self):
        """Validate LaTeX Beamer presentation by compiling it.

        When compilation succeeds and a PDF with the same stem exists, that
        PDF is validated too and its findings are merged into this
        validator's lists.
        """
        self.info.append("LaTeX source file detected")
        # Try to compile
        if not self._try_compile_latex():
            if self._pdflatex_missing:
                # A missing optional tool is a warning, like the missing
                # PyPDF2 / python-pptx cases; there is no .log to consult.
                self.warnings.append(
                    "pdflatex not installed. Skipping LaTeX compilation check."
                )
            else:
                self.issues.append(
                    "LaTeX compilation failed. Check .log file for errors."
                )
            return
        self.info.append("LaTeX compilation: SUCCESS")
        # If PDF was generated, validate it
        pdf_path = self.filepath.with_suffix('.pdf')
        if pdf_path.exists():
            pdf_validator = PresentationValidator(str(pdf_path), self.duration)
            pdf_results = pdf_validator.validate()
            # Merge results
            self.info.extend(pdf_results['info'])
            self.warnings.extend(pdf_results['warnings'])
            self.issues.extend(pdf_results['issues'])

    def _try_compile_latex(self) -> bool:
        """Run pdflatex once on the file; return True on exit status 0.

        Returns False on a non-zero exit, on a 60-second timeout, or when
        pdflatex cannot be started (also recorded in ``_pdflatex_missing``).
        """
        try:
            # Try pdflatex
            result = subprocess.run(
                ['pdflatex', '-interaction=nonstopmode', self.filepath.name],
                cwd=self.filepath.parent,
                capture_output=True,
                timeout=60
            )
        except FileNotFoundError:
            self._pdflatex_missing = True
            return False
        except subprocess.TimeoutExpired:
            return False
        return result.returncode == 0

    def _check_slide_count(self, num_slides: int):
        """Compare the slide count with the guideline for the duration.

        Durations without an exact entry in SLIDE_GUIDELINES use the
        nearest listed duration.
        """
        if self.duration not in self.SLIDE_GUIDELINES:
            # Find nearest duration
            durations = sorted(self.SLIDE_GUIDELINES.keys())
            nearest = min(durations, key=lambda x: abs(x - self.duration))
            min_slides, rec_slides, max_slides = self.SLIDE_GUIDELINES[nearest]
            self.info.append(
                f"Using guidelines for {nearest}-minute talk "
                f"(closest to {self.duration} minutes)"
            )
        else:
            min_slides, rec_slides, max_slides = self.SLIDE_GUIDELINES[self.duration]
        self.info.append(
            f"Recommended slides for {self.duration}-minute talk: "
            f"{min_slides}-{max_slides} (optimal: ~{rec_slides})"
        )
        if num_slides < min_slides:
            self.warnings.append(
                f"Fewer slides ({num_slides}) than recommended ({min_slides}-{max_slides}). "
                "May have too much time or too little content."
            )
        elif num_slides > max_slides:
            self.warnings.append(
                f"More slides ({num_slides}) than recommended ({min_slides}-{max_slides}). "
                "Likely to run over time."
            )
        else:
            self.info.append(
                f"Slide count ({num_slides}) is within recommended range."
            )

    def _format_results(self) -> Dict:
        """Bundle the collected findings; ``valid`` is True with no issues."""
        return {
            'filepath': str(self.filepath),
            'file_type': self.file_type,
            'info': self.info,
            'warnings': self.warnings,
            'issues': self.issues,
            'valid': len(self.issues) == 0
        }
def print_results(results: Dict):
    """Write a validation result dictionary to stdout as a short report.

    Each non-empty section (info, warnings, issues) is printed under its
    heading, followed by the overall PASSED/FAILED verdict taken from
    ``results['valid']``.
    """
    rule = "=" * 60
    print()
    print(rule)
    print("VALIDATION RESULTS")
    print(rule)

    # Sections in display order: result key -> heading.
    sections = (
        ('info', "\n📋 Information:"),
        ('warnings', "\n⚠️ Warnings:"),
        ('issues', "\n❌ Issues:"),
    )
    for key, heading in sections:
        entries = results[key]
        if not entries:
            continue
        print(heading)
        for entry in entries:
            print(f"{entry}")

    # Overall status
    print("\n" + rule)
    if not results['valid']:
        print("❌ Validation FAILED")
        print(f" ({len(results['issues'])} issue(s) found)")
    else:
        print("✅ Validation PASSED")
        if results['warnings']:
            print(f" ({len(results['warnings'])} warning(s) found)")
    print(rule)
def main():
    """Command-line entry point for the presentation validator.

    Parses the arguments, runs the validation, prints the report and exits
    with status 0 when the presentation is valid, 1 otherwise. With
    ``--quiet`` only the warnings and issues sections are printed.
    """
    parser = argparse.ArgumentParser(
        description='Validate scientific presentations',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s presentation.pdf --duration 15
%(prog)s slides.pptx --duration 45
%(prog)s beamer_talk.tex --duration 20
Supported file types:
- PDF (.pdf)
- PowerPoint (.pptx, .ppt)
- LaTeX Beamer (.tex)
Validation checks:
- Slide count vs. duration
- File size
- Slide dimensions
- Font sizes (PowerPoint)
- LaTeX compilation (Beamer)
"""
    )
    parser.add_argument(
        'filepath',
        help='Path to presentation file (PDF, PPTX, or TEX)'
    )
    parser.add_argument(
        '--duration', '-d',
        type=int,
        help='Presentation duration in minutes'
    )
    parser.add_argument(
        '--quiet', '-q',
        action='store_true',
        help='Only show issues and warnings'
    )
    args = parser.parse_args()
    # Validate
    validator = PresentationValidator(args.filepath, args.duration)
    results = validator.validate()
    # Print results
    if args.quiet:
        if results['warnings'] or results['issues']:
            # --quiet promises only warnings and issues, so print a copy of
            # the results with the informational section emptied.
            print_results({**results, 'info': []})
        else:
            print("✅ No issues found")
    else:
        print_results(results)
    # Exit with appropriate code
    sys.exit(0 if results['valid'] else 1)


if __name__ == '__main__':
    main()