#!/usr/bin/env python3
"""
Literature Database Search Script
Searches multiple literature databases and aggregates results.

The code in this file post-processes search results that were already saved
as a JSON list of records (dicts): it can deduplicate, filter by year, rank,
summarize, and format them as JSON, Markdown, or BibTeX.
"""

import json
import sys
from typing import Dict, List, Optional
from datetime import datetime


# Maps a ranking criterion accepted by rank_results() to the record field
# that holds the value to sort on.
_RANK_FIELDS = {
    'citations': 'citations',
    'year': 'year',
    'relevance': 'relevance_score',
}


def _as_text(value) -> str:
    """Return *value* as a stripped string; None becomes the empty string."""
    if value is None:
        return ''
    return str(value).strip()


def _sort_number(value) -> float:
    """Coerce *value* to a float usable as a sort key.

    Missing, None, non-numeric, or NaN values map to 0.0 so that records
    with mixed or incomplete data can still be compared with each other.
    """
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # NaN compares unequal to itself and would make the sort order undefined.
    return 0.0 if number != number else number


def _join_authors(authors, separator: str):
    """Join a list/tuple of author names with *separator*.

    Any other value (typically an already formatted string) is returned
    unchanged so existing string-valued records render exactly as before.
    """
    if isinstance(authors, (list, tuple)):
        return separator.join(str(author) for author in authors)
    return authors


def _format_markdown(results: List[Dict]) -> str:
    """Render *results* as a Markdown report with one section per record."""
    parts = [
        "# Literature Search Results\n\n",
        f"**Search Date**: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n",
        f"**Total Results**: {len(results)}\n\n",
    ]

    for number, result in enumerate(results, 1):
        authors = _join_authors(result.get('authors', 'Unknown'), ', ')
        parts.append(f"## {number}. {result.get('title', 'Untitled')}\n\n")
        parts.append(f"**Authors**: {authors}\n\n")
        parts.append(f"**Year**: {result.get('year', 'N/A')}\n\n")
        parts.append(f"**Source**: {result.get('source', 'Unknown')}\n\n")

        # Optional fields are emitted only when present and truthy.
        if result.get('abstract'):
            parts.append(f"**Abstract**: {result['abstract']}\n\n")
        if result.get('doi'):
            parts.append(
                f"**DOI**: [{result['doi']}](https://doi.org/{result['doi']})\n\n"
            )
        if result.get('url'):
            parts.append(f"**URL**: {result['url']}\n\n")
        if result.get('citations'):
            parts.append(f"**Citations**: {result['citations']}\n\n")

        parts.append("---\n\n")

    return "".join(parts)


def _format_bibtex(results: List[Dict]) -> str:
    """Render *results* as BibTeX entries with unique citation keys."""
    parts = []
    used_keys = set()

    for result in results:
        entry_type = result.get('type', 'article')
        base_key = f"{result.get('first_author', 'unknown')}{result.get('year', '0000')}"

        # Two records by the same first author in the same year would
        # otherwise share a key; the first keeps the plain key and later
        # ones get a numeric suffix (_2, _3, ...).
        cite_key = base_key
        suffix = 1
        while cite_key in used_keys:
            suffix += 1
            cite_key = f"{base_key}_{suffix}"
        used_keys.add(cite_key)

        # BibTeX separates multiple authors with the word "and".
        authors = _join_authors(result.get('authors', ''), ' and ')

        parts.append(f"@{entry_type}{{{cite_key},\n")
        parts.append(f" title = {{{result.get('title', '')}}},\n")
        parts.append(f" author = {{{authors}}},\n")
        parts.append(f" year = {{{result.get('year', '')}}},\n")

        for field in ('journal', 'volume', 'pages', 'doi'):
            if result.get(field):
                parts.append(f" {field} = {{{result[field]}}},\n")

        parts.append("}\n\n")

    return "".join(parts)


def format_search_results(results: List[Dict], output_format: str = 'json') -> str:
    """
    Format search results for output.

    Args:
        results: List of search results
        output_format: Format (json, markdown, or bibtex)

    Returns:
        Formatted string

    Raises:
        ValueError: If output_format is not one of the supported formats.
    """
    if output_format == 'json':
        return json.dumps(results, indent=2)
    if output_format == 'markdown':
        return _format_markdown(results)
    if output_format == 'bibtex':
        return _format_bibtex(results)
    raise ValueError(f"Unknown format: {output_format}")


def deduplicate_results(results: List[Dict]) -> List[Dict]:
    """
    Remove duplicate results based on DOI or title.

    Comparison is case-insensitive and ignores surrounding whitespace.
    A record with a DOI is a duplicate only if that DOI was seen before;
    a record without a DOI is a duplicate if its non-empty title was seen
    before. The first occurrence is the one that is kept.

    Args:
        results: List of search results

    Returns:
        Deduplicated list
    """
    seen_dois = set()
    seen_titles = set()
    unique_results = []

    for result in results:
        # JSON null (None) or non-string values must not crash the
        # normalization, so go through _as_text() first.
        doi = _as_text(result.get('doi')).lower()
        title = _as_text(result.get('title')).lower()

        if doi:
            # Check DOI first (more reliable)
            if doi in seen_dois:
                continue
            seen_dois.add(doi)
        elif title in seen_titles:
            # Check title as fallback
            continue

        if title:
            seen_titles.add(title)
        unique_results.append(result)

    return unique_results


def rank_results(results: List[Dict], criteria: str = 'citations') -> List[Dict]:
    """
    Rank results by specified criteria.

    Values are compared numerically; missing, None, or non-numeric values
    count as 0, so records with incomplete data sort last instead of
    raising a TypeError on mixed str/int comparisons.

    Args:
        results: List of search results
        criteria: Ranking criteria (citations, year, relevance)

    Returns:
        Ranked list (highest value first). For an unrecognized criteria the
        input list is returned unchanged.
    """
    field = _RANK_FIELDS.get(criteria)
    if field is None:
        return results
    return sorted(
        results,
        key=lambda record: _sort_number(record.get(field)),
        reverse=True,
    )


def filter_by_year(results: List[Dict], start_year: Optional[int] = None,
                   end_year: Optional[int] = None) -> List[Dict]:
    """
    Filter results by publication year range.

    A record without a 'year' key is treated as year 0. A record whose
    year cannot be converted to an integer is always kept.

    Args:
        results: List of search results
        start_year: Minimum year (inclusive); None means no lower bound
        end_year: Maximum year (inclusive); None means no upper bound

    Returns:
        Filtered list
    """
    filtered = []

    for result in results:
        try:
            year = int(result.get('year', 0))
        except (ValueError, TypeError):
            # Include if year parsing fails
            filtered.append(result)
            continue

        if start_year is not None and year < start_year:
            continue
        if end_year is not None and year > end_year:
            continue
        filtered.append(result)

    return filtered


def generate_search_summary(results: List[Dict]) -> Dict:
    """
    Generate summary statistics for search results.

    Args:
        results: List of search results

    Returns:
        Summary dictionary with the keys 'total_results', 'sources'
        (count per source), 'year_distribution' (count per year value),
        'avg_citations' and 'total_citations'. The citation statistics
        only cover records with a truthy, integer-convertible 'citations'
        value.
    """
    summary = {
        'total_results': len(results),
        'sources': {},
        'year_distribution': {},
        'avg_citations': 0,
        'total_citations': 0
    }

    citations = []

    for result in results:
        # Count by source
        source = result.get('source', 'Unknown')
        summary['sources'][source] = summary['sources'].get(source, 0) + 1

        # Count by year
        year = result.get('year', 'Unknown')
        summary['year_distribution'][year] = summary['year_distribution'].get(year, 0) + 1

        # Collect citations; values that cannot be parsed are skipped
        if result.get('citations'):
            try:
                citations.append(int(result['citations']))
            except (ValueError, TypeError):
                pass

    if citations:
        total = sum(citations)
        summary['avg_citations'] = total / len(citations)
        summary['total_citations'] = total

    return summary


def _parse_year_option(option: str, value: str) -> int:
    """Convert a year option value to int, exiting with an error if invalid."""
    try:
        return int(value)
    except ValueError:
        print(f"Error: {option} expects an integer year, got {value!r}",
              file=sys.stderr)
        sys.exit(1)


def main():
    """Command-line interface for search result processing.

    Reads a JSON list of results from the file given as the first argument,
    applies the requested processing steps, and writes the formatted output
    to a file or to stdout. Exits with status 1 on usage or input errors.
    """
    if len(sys.argv) < 2:
        print("Usage: python search_databases.py <results_file> [options]")
        print("\nOptions:")
        print(" --format FORMAT Output format (json, markdown, bibtex)")
        print(" --output FILE Output file (default: stdout)")
        print(" --rank CRITERIA Rank by (citations, year, relevance)")
        print(" --year-start YEAR Filter by start year")
        print(" --year-end YEAR Filter by end year")
        print(" --deduplicate Remove duplicates")
        print(" --summary Show summary statistics")
        sys.exit(1)

    # Load results
    results_file = sys.argv[1]
    try:
        with open(results_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading results: {e}", file=sys.stderr)
        sys.exit(1)

    if not isinstance(results, list):
        print("Error loading results: expected a JSON list of results",
              file=sys.stderr)
        sys.exit(1)

    # Parse options
    output_format = 'markdown'
    output_file = None
    rank_criteria = None
    year_start = None
    year_end = None
    do_dedup = False
    show_summary = False

    value_options = ('--format', '--output', '--rank', '--year-start', '--year-end')
    args = sys.argv[2:]
    index = 0
    while index < len(args):
        arg = args[index]

        if arg == '--deduplicate':
            do_dedup = True
            index += 1
        elif arg == '--summary':
            show_summary = True
            index += 1
        elif arg in value_options:
            if index + 1 >= len(args):
                # The option is still ignored, but no longer silently.
                print(f"Warning: {arg} requires a value; ignored",
                      file=sys.stderr)
                index += 1
                continue
            value = args[index + 1]
            index += 2
            if arg == '--format':
                output_format = value
            elif arg == '--output':
                output_file = value
            elif arg == '--rank':
                rank_criteria = value
            elif arg == '--year-start':
                year_start = _parse_year_option(arg, value)
            else:
                year_end = _parse_year_option(arg, value)
        else:
            print(f"Warning: ignoring unrecognized argument: {arg}",
                  file=sys.stderr)
            index += 1

    # When the formatted results go to stdout, status messages go to stderr
    # so that stdout stays valid JSON/BibTeX/Markdown for piping.
    status = sys.stderr if output_file is None else sys.stdout

    # Process results
    if do_dedup:
        results = deduplicate_results(results)
        print(f"After deduplication: {len(results)} results", file=status)

    if year_start is not None or year_end is not None:
        results = filter_by_year(results, year_start, year_end)
        print(f"After year filter: {len(results)} results", file=status)

    if rank_criteria:
        results = rank_results(results, rank_criteria)
        print(f"Ranked by: {rank_criteria}", file=status)

    # Show summary
    if show_summary:
        summary = generate_search_summary(results)
        print("\n" + "="*60, file=status)
        print("SEARCH SUMMARY", file=status)
        print("="*60, file=status)
        print(json.dumps(summary, indent=2), file=status)
        print(file=status)

    # Format output
    try:
        output = format_search_results(results, output_format)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Write output
    if output_file:
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(output)
        except OSError as e:
            print(f"Error writing output: {e}", file=sys.stderr)
            sys.exit(1)
        print(f"✓ Results saved to: {output_file}")
    else:
        print(output)


if __name__ == "__main__":
    main()