#!/usr/bin/env python3
"""
Google Scholar Search Tool

Search Google Scholar and export results.

Note: This script requires the 'scholarly' library.
Install with: pip install scholarly
"""

import argparse
import json
import random
import re
import sys
import time
from itertools import islice
from typing import Dict, List, Optional

try:
    from scholarly import scholarly, ProxyGenerator
    SCHOLARLY_AVAILABLE = True
except ImportError:
    SCHOLARLY_AVAILABLE = False
    print('Warning: scholarly library not installed. Install with: pip install scholarly', file=sys.stderr)


class GoogleScholarSearcher:
    """Search Google Scholar using scholarly library."""

    def __init__(self, use_proxy: bool = False):
        """
        Initialize searcher.

        Args:
            use_proxy: Use free proxy (helps avoid rate limiting)

        Raises:
            ImportError: If the scholarly library is not installed.
        """
        if not SCHOLARLY_AVAILABLE:
            raise ImportError('scholarly library required. Install with: pip install scholarly')

        # Setup proxy if requested. Proxy setup is best-effort: a failure is
        # reported on stderr and the searcher continues without a proxy.
        if use_proxy:
            try:
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                print('Using free proxy', file=sys.stderr)
            except Exception as e:
                print(f'Warning: Could not setup proxy: {e}', file=sys.stderr)

    @staticmethod
    def _extract_metadata(result: Dict) -> Dict:
        """Flatten one scholarly search result into the metadata dict used by this tool."""
        bib = result.get('bib') or {}

        # The author field is normally a list of names, but tolerate a single
        # ' and '-separated string so it is not joined character by character.
        raw_authors = bib.get('author') or []
        if isinstance(raw_authors, str):
            raw_authors = raw_authors.split(' and ')
        cleaned = (str(name).strip() for name in raw_authors)
        authors = ', '.join(name for name in cleaned if name)

        return {
            'title': bib.get('title', ''),
            'authors': authors,
            'year': bib.get('pub_year', ''),
            'venue': bib.get('venue', ''),
            'abstract': bib.get('abstract', ''),
            'citations': result.get('num_citations', 0),
            'url': result.get('pub_url', ''),
            'eprint_url': result.get('eprint_url', ''),
        }

    @staticmethod
    def _within_year_range(year, year_start: Optional[int], year_end: Optional[int]) -> bool:
        """Return True if a result with publication year *year* passes the year filter.

        A missing/empty year is treated as year 0, so it is rejected by a start
        bound but accepted when only an end bound is given. A year that cannot
        be parsed as an integer is always accepted.
        """
        if not (year_start or year_end):
            return True
        try:
            pub_year = int(year) if year else 0
        except (TypeError, ValueError):
            return True
        if year_start and pub_year < year_start:
            return False
        if year_end and pub_year > year_end:
            return False
        return True

    def search(self, query: str, max_results: int = 50,
               year_start: Optional[int] = None,
               year_end: Optional[int] = None,
               sort_by: str = 'relevance') -> List[Dict]:
        """
        Search Google Scholar.

        Args:
            query: Search query
            max_results: Maximum number of results to examine; results removed
                by the year filter still count toward this limit
            year_start: Start year filter
            year_end: End year filter
            sort_by: Sort order ('relevance' or 'citations')

        Returns:
            List of result dictionaries. If an error occurs mid-search, the
            error is printed to stderr and the results collected so far are
            returned.
        """
        if not SCHOLARLY_AVAILABLE:
            print('Error: scholarly library not installed', file=sys.stderr)
            return []

        print(f'Searching Google Scholar: {query}', file=sys.stderr)
        print(f'Max results: {max_results}', file=sys.stderr)

        results = []
        limit = max(max_results, 0)

        try:
            # Perform search
            search_query = scholarly.search_pubs(query)

            # islice stops after `limit` items without pulling one extra
            # result from the iterator just to discover the limit was reached.
            for count, result in enumerate(islice(search_query, limit), start=1):
                print(f'Retrieved {count}/{max_results}', file=sys.stderr)

                metadata = self._extract_metadata(result)
                if self._within_year_range(metadata['year'], year_start, year_end):
                    results.append(metadata)

                # Rate limiting to avoid blocking. Applied to every examined
                # result (filtered-out ones were fetched too), but not after
                # the final one, where no further fetch follows.
                if count < limit:
                    time.sleep(random.uniform(2, 5))

        except Exception as e:
            # Best-effort: keep whatever was collected before the failure.
            print(f'Error during search: {e}', file=sys.stderr)

        # Sort if requested
        if sort_by == 'citations' and results:
            results.sort(key=lambda item: item.get('citations') or 0, reverse=True)

        return results

    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert metadata to BibTeX format.

        Args:
            metadata: Result dictionary with optional 'authors' (comma-separated
                names), 'title', 'year', 'venue', 'url' and 'citations' keys.

        Returns:
            A single BibTeX entry. The citation key is
            ``<first author's last name><year><first title word of 4+ letters>``,
            with 'Unknown', 'XXXX' and 'paper' as the respective fallbacks.
        """
        # Generate citation key
        authors = metadata.get('authors') or ''
        first_author = authors.split(',')[0].strip()
        last_name = first_author.split()[-1] if first_author else ''
        # Drop characters that are unsafe in a BibTeX key (quotes, braces, ...).
        last_name = re.sub(r'[^\w-]', '', last_name) or 'Unknown'

        # `or` rather than a .get() default: the key is usually present but empty.
        year = metadata.get('year') or 'XXXX'

        # Get keyword from title
        title = metadata.get('title') or ''
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
        keyword = words[0].lower() if words else 'paper'

        citation_key = f'{last_name}{year}{keyword}'

        # Determine entry type (guess based on venue)
        venue = (metadata.get('venue') or '').lower()
        if 'proceedings' in venue or 'conference' in venue:
            entry_type = 'inproceedings'
            venue_field = 'booktitle'
        else:
            entry_type = 'article'
            venue_field = 'journal'

        # Build BibTeX
        lines = [f'@{entry_type}{{{citation_key},']

        # Convert authors format
        if authors:
            bibtex_authors = authors.replace(',', ' and')
            lines.append(f' author = {{{bibtex_authors}}},')

        if metadata.get('title'):
            lines.append(f' title = {{{metadata["title"]}}},')

        if metadata.get('venue'):
            lines.append(f' {venue_field} = {{{metadata["venue"]}}},')

        if metadata.get('year'):
            lines.append(f' year = {{{metadata["year"]}}},')

        if metadata.get('url'):
            lines.append(f' url = {{{metadata["url"]}}},')

        if metadata.get('citations'):
            lines.append(f' note = {{Cited by: {metadata["citations"]}}},')

        # Remove trailing comma
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)


def main():
    """Command-line interface.

    Parses arguments, runs the search, and writes the results as JSON or
    BibTeX to the output file or stdout. Exits with status 1 when scholarly
    is not installed or when the search returns no results.
    """
    parser = argparse.ArgumentParser(
        description='Search Google Scholar (requires scholarly library)',
        epilog='Example: python search_google_scholar.py "machine learning" --limit 50'
    )

    parser.add_argument(
        'query',
        help='Search query'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=50,
        help='Maximum number of results (default: 50)'
    )

    parser.add_argument(
        '--year-start',
        type=int,
        help='Start year for filtering'
    )

    parser.add_argument(
        '--year-end',
        type=int,
        help='End year for filtering'
    )

    parser.add_argument(
        '--sort-by',
        choices=['relevance', 'citations'],
        default='relevance',
        help='Sort order (default: relevance)'
    )

    parser.add_argument(
        '--use-proxy',
        action='store_true',
        help='Use free proxy to avoid rate limiting'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )

    args = parser.parse_args()

    if not SCHOLARLY_AVAILABLE:
        print('\nError: scholarly library not installed', file=sys.stderr)
        print('Install with: pip install scholarly', file=sys.stderr)
        print('\nAlternatively, use PubMed search for biomedical literature:', file=sys.stderr)
        print(' python search_pubmed.py "your query"', file=sys.stderr)
        sys.exit(1)

    # Search
    searcher = GoogleScholarSearcher(use_proxy=args.use_proxy)
    results = searcher.search(
        args.query,
        max_results=args.limit,
        year_start=args.year_start,
        year_end=args.year_end,
        sort_by=args.sort_by
    )

    if not results:
        print('No results found', file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == 'json':
        output = json.dumps({
            'query': args.query,
            'count': len(results),
            'results': results
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(r) for r in results]
        output = '\n\n'.join(bibtex_entries) + '\n'

    # Write output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(results)} results to {args.output}', file=sys.stderr)
    else:
        print(output)

    print(f'\nRetrieved {len(results)} results', file=sys.stderr)


if __name__ == '__main__':
    main()