#!/usr/bin/env python3 """ Validate Format Script Check if document meets venue-specific formatting requirements. Usage: python validate_format.py --file my_paper.pdf --venue "Nature" --check-all python validate_format.py --file my_paper.pdf --venue "NeurIPS" --check page-count,margins python validate_format.py --file my_paper.pdf --venue "PLOS ONE" --report validation_report.txt """ import argparse import subprocess from pathlib import Path import re # Venue requirements database VENUE_REQUIREMENTS = { "nature": { "page_limit": 5, # Approximate for ~3000 words "margins": {"top": 2.5, "bottom": 2.5, "left": 2.5, "right": 2.5}, # cm "font_size": 12, # pt "font_family": "Times", "line_spacing": "double" }, "neurips": { "page_limit": 8, # Excluding refs "margins": {"top": 2.54, "bottom": 2.54, "left": 2.54, "right": 2.54}, # cm (1 inch) "font_size": 10, "font_family": "Times", "format": "two-column" }, "plos_one": { "page_limit": None, # No limit "margins": {"top": 2.54, "bottom": 2.54, "left": 2.54, "right": 2.54}, "font_size": 10, "font_family": "Arial", "line_spacing": "double" }, "nsf": { "page_limit": 15, # Project description "margins": {"top": 2.54, "bottom": 2.54, "left": 2.54, "right": 2.54}, # 1 inch required "font_size": 11, # Minimum "font_family": "Times Roman", "line_spacing": "single or double" }, "nih": { "page_limit": 12, # Research strategy "margins": {"top": 1.27, "bottom": 1.27, "left": 1.27, "right": 1.27}, # 0.5 inch minimum "font_size": 11, # Arial 11pt minimum "font_family": "Arial", "line_spacing": "any" } } def get_pdf_info(pdf_path): """Extract information from PDF using pdfinfo.""" try: result = subprocess.run( ['pdfinfo', str(pdf_path)], capture_output=True, text=True, check=True ) info = {} for line in result.stdout.split('\n'): if ':' in line: key, value = line.split(':', 1) info[key.strip()] = value.strip() return info except FileNotFoundError: print("⚠️ pdfinfo not found. Install poppler-utils for full PDF analysis.") print(" macOS: brew install poppler") print(" Linux: sudo apt-get install poppler-utils") return None except subprocess.CalledProcessError as e: print(f"Error running pdfinfo: {e}") return None def check_page_count(pdf_path, venue_reqs): """Check if page count is within limit.""" pdf_info = get_pdf_info(pdf_path) if not pdf_info: return {"status": "skip", "message": "Could not determine page count"} pages = int(pdf_info.get('Pages', 0)) limit = venue_reqs.get('page_limit') if limit is None: return {"status": "pass", "message": f"No page limit. Document has {pages} pages."} if pages <= limit: return {"status": "pass", "message": f"✓ Page count OK: {pages}/{limit} pages"} else: return {"status": "fail", "message": f"✗ Page count exceeded: {pages}/{limit} pages"} def check_margins(pdf_path, venue_reqs): """Check if margins meet requirements.""" # Note: This is a simplified check. Full margin analysis requires more sophisticated tools. req_margins = venue_reqs.get('margins', {}) if not req_margins: return {"status": "skip", "message": "No margin requirements specified"} # This is a placeholder - accurate margin checking requires parsing PDF content return { "status": "info", "message": f"ℹ️ Required margins: {req_margins} cm (manual verification recommended)" } def check_fonts(pdf_path, venue_reqs): """Check fonts in PDF.""" try: result = subprocess.run( ['pdffonts', str(pdf_path)], capture_output=True, text=True, check=True ) fonts_found = [] for line in result.stdout.split('\n')[2:]: # Skip header if line.strip(): parts = line.split() if parts: fonts_found.append(parts[0]) req_font = venue_reqs.get('font_family', '') req_size = venue_reqs.get('font_size') message = f"ℹ️ Fonts found: {', '.join(set(fonts_found))}\n" message += f" Required: {req_font}" if req_size: message += f" {req_size}pt minimum" return {"status": "info", "message": message} except FileNotFoundError: return {"status": "skip", "message": "pdffonts not available"} except subprocess.CalledProcessError: return {"status": "skip", "message": "Could not extract font information"} def validate_document(pdf_path, venue, checks): """Validate document against venue requirements.""" venue_key = venue.lower().replace(" ", "_") if venue_key not in VENUE_REQUIREMENTS: print(f"❌ Unknown venue: {venue}") print(f"Available venues: {', '.join(VENUE_REQUIREMENTS.keys())}") return venue_reqs = VENUE_REQUIREMENTS[venue_key] print(f"\n{'='*60}") print(f"VALIDATING: {pdf_path.name}") print(f"VENUE: {venue}") print(f"{'='*60}\n") results = {} # Run requested checks if 'page-count' in checks or 'all' in checks: results['page-count'] = check_page_count(pdf_path, venue_reqs) if 'margins' in checks or 'all' in checks: results['margins'] = check_margins(pdf_path, venue_reqs) if 'fonts' in checks or 'all' in checks: results['fonts'] = check_fonts(pdf_path, venue_reqs) # Print results for check_name, result in results.items(): print(f"{check_name.upper()}:") print(f" {result['message']}\n") # Summary failures = sum(1 for r in results.values() if r['status'] == 'fail') passes = sum(1 for r in results.values() if r['status'] == 'pass') print(f"{'='*60}") if failures == 0: print(f"✓ VALIDATION PASSED ({passes} checks)") else: print(f"✗ VALIDATION FAILED ({failures} issues)") print(f"{'='*60}\n") return results def generate_report(pdf_path, venue, results, report_path): """Generate validation report.""" with open(report_path, 'w') as f: f.write(f"Validation Report\n") f.write(f"{'='*60}\n\n") f.write(f"File: {pdf_path}\n") f.write(f"Venue: {venue}\n") f.write(f"Date: {Path.ctime(pdf_path)}\n\n") for check_name, result in results.items(): f.write(f"{check_name.upper()}:\n") f.write(f" Status: {result['status']}\n") f.write(f" {result['message']}\n\n") failures = sum(1 for r in results.values() if r['status'] == 'fail') f.write(f"\nSummary: {'PASSED' if failures == 0 else 'FAILED'}\n") print(f"Report saved to: {report_path}") def main(): parser = argparse.ArgumentParser( description="Validate document formatting for venue requirements", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: %(prog)s --file my_paper.pdf --venue "Nature" --check-all %(prog)s --file my_paper.pdf --venue "NeurIPS" --check page-count,fonts %(prog)s --file proposal.pdf --venue "NSF" --report validation.txt """ ) parser.add_argument('--file', type=str, required=True, help='PDF file to validate') parser.add_argument('--venue', type=str, required=True, help='Target venue') parser.add_argument('--check', type=str, default='all', help='Checks to perform: page-count, margins, fonts, all (comma-separated)') parser.add_argument('--check-all', action='store_true', help='Perform all checks') parser.add_argument('--report', type=str, help='Save report to file') args = parser.parse_args() # Check file exists pdf_path = Path(args.file) if not pdf_path.exists(): print(f"Error: File not found: {pdf_path}") return # Parse checks if args.check_all: checks = ['all'] else: checks = [c.strip() for c in args.check.split(',')] # Validate results = validate_document(pdf_path, args.venue, checks) # Generate report if requested if args.report and results: generate_report(pdf_path, args.venue, results, Path(args.report)) if __name__ == "__main__": main()