Initial commit

Zhongwei Li
2025-11-30 08:30:18 +08:00
commit 74bee324ab
335 changed files with 147377 additions and 0 deletions


@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Google Scholar Search Tool
Search Google Scholar and export results.
Note: This script requires the 'scholarly' library.
Install with: pip install scholarly
"""
import sys
import argparse
import json
import time
import random
from typing import List, Dict, Optional
try:
    from scholarly import scholarly, ProxyGenerator
    SCHOLARLY_AVAILABLE = True
except ImportError:
    SCHOLARLY_AVAILABLE = False
    print('Warning: scholarly library not installed. Install with: pip install scholarly', file=sys.stderr)

class GoogleScholarSearcher:
    """Search Google Scholar using the scholarly library."""

    def __init__(self, use_proxy: bool = False):
        """
        Initialize the searcher.

        Args:
            use_proxy: Use a free proxy (helps avoid rate limiting)
        """
        if not SCHOLARLY_AVAILABLE:
            raise ImportError('scholarly library required. Install with: pip install scholarly')
        # Set up a proxy if requested
        if use_proxy:
            try:
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                print('Using free proxy', file=sys.stderr)
            except Exception as e:
                print(f'Warning: Could not set up proxy: {e}', file=sys.stderr)

    def search(self, query: str, max_results: int = 50,
               year_start: Optional[int] = None, year_end: Optional[int] = None,
               sort_by: str = 'relevance') -> List[Dict]:
        """
        Search Google Scholar.

        Args:
            query: Search query
            max_results: Maximum number of results
            year_start: Start year filter
            year_end: End year filter
            sort_by: Sort order ('relevance' or 'citations')

        Returns:
            List of result dictionaries
        """
        if not SCHOLARLY_AVAILABLE:
            print('Error: scholarly library not installed', file=sys.stderr)
            return []
        print(f'Searching Google Scholar: {query}', file=sys.stderr)
        print(f'Max results: {max_results}', file=sys.stderr)
        results = []
        try:
            # Perform search
            search_query = scholarly.search_pubs(query)
            for i, result in enumerate(search_query):
                if i >= max_results:
                    break
                print(f'Retrieved {i+1}/{max_results}', file=sys.stderr)
                # Extract metadata
                metadata = {
                    'title': result.get('bib', {}).get('title', ''),
                    'authors': ', '.join(result.get('bib', {}).get('author', [])),
                    'year': result.get('bib', {}).get('pub_year', ''),
                    'venue': result.get('bib', {}).get('venue', ''),
                    'abstract': result.get('bib', {}).get('abstract', ''),
                    'citations': result.get('num_citations', 0),
                    'url': result.get('pub_url', ''),
                    'eprint_url': result.get('eprint_url', ''),
                }
                # Filter by year
                if year_start or year_end:
                    try:
                        pub_year = int(metadata['year']) if metadata['year'] else 0
                        if year_start and pub_year < year_start:
                            continue
                        if year_end and pub_year > year_end:
                            continue
                    except ValueError:
                        pass
                results.append(metadata)
                # Rate limiting to avoid blocking
                time.sleep(random.uniform(2, 5))
        except Exception as e:
            print(f'Error during search: {e}', file=sys.stderr)
        # Sort if requested
        if sort_by == 'citations' and results:
            results.sort(key=lambda x: x.get('citations', 0), reverse=True)
        return results

    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert metadata to BibTeX format."""
        # Generate citation key
        if metadata.get('authors'):
            first_author = metadata['authors'].split(',')[0].strip()
            last_name = first_author.split()[-1] if first_author else 'Unknown'
        else:
            last_name = 'Unknown'
        year = metadata.get('year', 'XXXX')
        # Get keyword from title
        import re
        title = metadata.get('title', '')
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
        keyword = words[0].lower() if words else 'paper'
        citation_key = f'{last_name}{year}{keyword}'
        # Determine entry type (guess based on venue)
        venue = metadata.get('venue', '').lower()
        if 'proceedings' in venue or 'conference' in venue:
            entry_type = 'inproceedings'
            venue_field = 'booktitle'
        else:
            entry_type = 'article'
            venue_field = 'journal'
        # Build BibTeX
        lines = [f'@{entry_type}{{{citation_key},']
        # Convert authors format
        if metadata.get('authors'):
            authors = metadata['authors'].replace(',', ' and')
            lines.append(f'  author = {{{authors}}},')
        if metadata.get('title'):
            lines.append(f'  title = {{{metadata["title"]}}},')
        if metadata.get('venue'):
            lines.append(f'  {venue_field} = {{{metadata["venue"]}}},')
        if metadata.get('year'):
            lines.append(f'  year = {{{metadata["year"]}}},')
        if metadata.get('url'):
            lines.append(f'  url = {{{metadata["url"]}}},')
        if metadata.get('citations'):
            lines.append(f'  note = {{Cited by: {metadata["citations"]}}},')
        # Remove trailing comma
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]
        lines.append('}')
        return '\n'.join(lines)
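
# Example: for a hypothetical record such as
#   {'title': 'Graph Neural Networks for Molecules', 'authors': 'Jane Doe, John Smith',
#    'year': '2021', 'venue': 'Proceedings of the Example Conference', 'citations': 42}
# metadata_to_bibtex() picks the inproceedings entry type (the venue contains
# 'proceedings') and would return roughly:
#   @inproceedings{Doe2021graph,
#     author = {Jane Doe and John Smith},
#     title = {Graph Neural Networks for Molecules},
#     booktitle = {Proceedings of the Example Conference},
#     year = {2021},
#     note = {Cited by: 42}
#   }
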
def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Search Google Scholar (requires scholarly library)',
        epilog='Example: python search_google_scholar.py "machine learning" --limit 50'
    )
    parser.add_argument(
        'query',
        help='Search query'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=50,
        help='Maximum number of results (default: 50)'
    )
    parser.add_argument(
        '--year-start',
        type=int,
        help='Start year for filtering'
    )
    parser.add_argument(
        '--year-end',
        type=int,
        help='End year for filtering'
    )
    parser.add_argument(
        '--sort-by',
        choices=['relevance', 'citations'],
        default='relevance',
        help='Sort order (default: relevance)'
    )
    parser.add_argument(
        '--use-proxy',
        action='store_true',
        help='Use free proxy to avoid rate limiting'
    )
    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )
    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )
    args = parser.parse_args()
    if not SCHOLARLY_AVAILABLE:
        print('\nError: scholarly library not installed', file=sys.stderr)
        print('Install with: pip install scholarly', file=sys.stderr)
        print('\nAlternatively, use PubMed search for biomedical literature:', file=sys.stderr)
        print('  python search_pubmed.py "your query"', file=sys.stderr)
        sys.exit(1)
    # Search
    searcher = GoogleScholarSearcher(use_proxy=args.use_proxy)
    results = searcher.search(
        args.query,
        max_results=args.limit,
        year_start=args.year_start,
        year_end=args.year_end,
        sort_by=args.sort_by
    )
    if not results:
        print('No results found', file=sys.stderr)
        sys.exit(1)
    # Format output
    if args.format == 'json':
        output = json.dumps({
            'query': args.query,
            'count': len(results),
            'results': results
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(r) for r in results]
        output = '\n\n'.join(bibtex_entries) + '\n'
    # Write output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(results)} results to {args.output}', file=sys.stderr)
    else:
        print(output)
    print(f'\nRetrieved {len(results)} results', file=sys.stderr)


if __name__ == '__main__':
    main()
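
The same search-and-export flow can also be driven from Python rather than from the command line. The sketch below assumes the file above is saved as search_google_scholar.py (the name suggested by the argparse epilog); a real run makes live requests to Google Scholar, requires the scholarly package, and is subject to the same delays and rate limits as the CLI.

# Minimal programmatic sketch; the module name is an assumption taken from the
# argparse epilog, and network access to Google Scholar is required.
from search_google_scholar import GoogleScholarSearcher

searcher = GoogleScholarSearcher(use_proxy=True)  # free proxy helps avoid rate limiting
papers = searcher.search('protein folding', max_results=5,
                         year_start=2020, sort_by='citations')
for paper in papers:
    print(searcher.metadata_to_bibtex(paper))
    print()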