Initial commit

Zhongwei Li
2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions

generate_pdf.py

@@ -0,0 +1,176 @@
#!/usr/bin/env python3
"""
PDF Generation Script for Literature Reviews
Converts markdown files to professionally formatted PDFs with proper styling.
"""
import subprocess
import sys
import os
from pathlib import Path


def generate_pdf(
    markdown_file: str,
    output_pdf: str = None,
    citation_style: str = "apa",
    template: str = None,
    toc: bool = True,
    number_sections: bool = True
) -> bool:
    """
    Generate a PDF from a markdown file using pandoc.

    Args:
        markdown_file: Path to the markdown file
        output_pdf: Path for output PDF (defaults to same name as markdown)
        citation_style: Citation style (apa, nature, chicago, etc.)
        template: Path to custom LaTeX template
        toc: Include table of contents
        number_sections: Number the sections

    Returns:
        True if successful, False otherwise
    """
    # Verify markdown file exists
    if not os.path.exists(markdown_file):
        print(f"Error: Markdown file not found: {markdown_file}")
        return False

    # Set default output path
    if output_pdf is None:
        output_pdf = Path(markdown_file).with_suffix('.pdf')

    # Check if pandoc is installed
    try:
        subprocess.run(['pandoc', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: pandoc is not installed.")
        print("Install with: brew install pandoc (macOS) or apt-get install pandoc (Linux)")
        return False

    # Build pandoc command
    cmd = [
        'pandoc',
        markdown_file,
        '-o', str(output_pdf),
        '--pdf-engine=xelatex',  # Better Unicode support
        '-V', 'geometry:margin=1in',
        '-V', 'fontsize=11pt',
        '-V', 'colorlinks=true',
        '-V', 'linkcolor=blue',
        '-V', 'urlcolor=blue',
        '-V', 'citecolor=blue',
    ]

    # Add table of contents
    if toc:
        cmd.extend(['--toc', '--toc-depth=3'])

    # Add section numbering
    if number_sections:
        cmd.append('--number-sections')

    # Add citation processing if bibliography exists
    bib_file = Path(markdown_file).with_suffix('.bib')
    if bib_file.exists():
        cmd.extend([
            '--citeproc',
            '--bibliography', str(bib_file),
            '--csl', f'{citation_style}.csl' if not citation_style.endswith('.csl') else citation_style
        ])

    # Add custom template if provided
    if template and os.path.exists(template):
        cmd.extend(['--template', template])

    # Execute pandoc
    try:
        print(f"Generating PDF: {output_pdf}")
        print(f"Command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        print(f"✓ PDF generated successfully: {output_pdf}")
        return True
    except subprocess.CalledProcessError as e:
        print("Error generating PDF:")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False


def check_dependencies():
    """Check if required dependencies are installed."""
    dependencies = {
        'pandoc': 'pandoc --version',
        'xelatex': 'xelatex --version'
    }
    missing = []
    for name, cmd in dependencies.items():
        try:
            subprocess.run(cmd.split(), capture_output=True, check=True)
            print(f"{name} is installed")
        except (subprocess.CalledProcessError, FileNotFoundError):
            print(f"{name} is NOT installed")
            missing.append(name)

    if missing:
        print("\n" + "="*60)
        print("Missing dependencies:")
        for dep in missing:
            if dep == 'pandoc':
                print(" - pandoc: brew install pandoc (macOS) or apt-get install pandoc (Linux)")
            elif dep == 'xelatex':
                print(" - xelatex: brew install --cask mactex (macOS) or apt-get install texlive-xetex (Linux)")
        return False
    return True


def main():
    """Command-line interface."""
    if len(sys.argv) < 2:
        print("Usage: python generate_pdf.py <markdown_file> [output_pdf] [--citation-style STYLE]")
        print("\nOptions:")
        print(" --citation-style STYLE Citation style (default: apa)")
        print(" --no-toc Disable table of contents")
        print(" --no-numbers Disable section numbering")
        print(" --check-deps Check if dependencies are installed")
        sys.exit(1)

    # Check dependencies mode
    if '--check-deps' in sys.argv:
        check_dependencies()
        sys.exit(0)

    # Parse arguments
    markdown_file = sys.argv[1]
    output_pdf = sys.argv[2] if len(sys.argv) > 2 and not sys.argv[2].startswith('--') else None
    citation_style = 'apa'
    toc = True
    number_sections = True

    # Parse optional flags
    if '--citation-style' in sys.argv:
        idx = sys.argv.index('--citation-style')
        if idx + 1 < len(sys.argv):
            citation_style = sys.argv[idx + 1]
    if '--no-toc' in sys.argv:
        toc = False
    if '--no-numbers' in sys.argv:
        number_sections = False

    # Generate PDF
    success = generate_pdf(
        markdown_file,
        output_pdf,
        citation_style=citation_style,
        toc=toc,
        number_sections=number_sections
    )
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
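
A minimal sketch of driving this module programmatically instead of through the CLI. It assumes generate_pdf.py is importable from the working directory; the input and output file names are hypothetical placeholders.

# Illustrative only, not part of the committed script.
from generate_pdf import generate_pdf, check_dependencies

if check_dependencies():
    ok = generate_pdf(
        "review.md",              # hypothetical input file
        "review.pdf",             # hypothetical output path
        citation_style="apa",
        toc=True,
        number_sections=True,
    )
    print("done" if ok else "pandoc failed")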

search_databases.py

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Literature Database Search Script
Searches multiple literature databases and aggregates results.
"""
import json
import sys
from typing import Dict, List
from datetime import datetime


def format_search_results(results: List[Dict], output_format: str = 'json') -> str:
    """
    Format search results for output.

    Args:
        results: List of search results
        output_format: Format (json, markdown, or bibtex)

    Returns:
        Formatted string
    """
    if output_format == 'json':
        return json.dumps(results, indent=2)
    elif output_format == 'markdown':
        md = f"# Literature Search Results\n\n"
        md += f"**Search Date**: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n"
        md += f"**Total Results**: {len(results)}\n\n"
        for i, result in enumerate(results, 1):
            md += f"## {i}. {result.get('title', 'Untitled')}\n\n"
            md += f"**Authors**: {result.get('authors', 'Unknown')}\n\n"
            md += f"**Year**: {result.get('year', 'N/A')}\n\n"
            md += f"**Source**: {result.get('source', 'Unknown')}\n\n"
            if result.get('abstract'):
                md += f"**Abstract**: {result['abstract']}\n\n"
            if result.get('doi'):
                md += f"**DOI**: [{result['doi']}](https://doi.org/{result['doi']})\n\n"
            if result.get('url'):
                md += f"**URL**: {result['url']}\n\n"
            if result.get('citations'):
                md += f"**Citations**: {result['citations']}\n\n"
            md += "---\n\n"
        return md
    elif output_format == 'bibtex':
        bibtex = ""
        for i, result in enumerate(results, 1):
            entry_type = result.get('type', 'article')
            cite_key = f"{result.get('first_author', 'unknown')}{result.get('year', '0000')}"
            bibtex += f"@{entry_type}{{{cite_key},\n"
            bibtex += f" title = {{{result.get('title', '')}}},\n"
            bibtex += f" author = {{{result.get('authors', '')}}},\n"
            bibtex += f" year = {{{result.get('year', '')}}},\n"
            if result.get('journal'):
                bibtex += f" journal = {{{result['journal']}}},\n"
            if result.get('volume'):
                bibtex += f" volume = {{{result['volume']}}},\n"
            if result.get('pages'):
                bibtex += f" pages = {{{result['pages']}}},\n"
            if result.get('doi'):
                bibtex += f" doi = {{{result['doi']}}},\n"
            bibtex += "}\n\n"
        return bibtex
    else:
        raise ValueError(f"Unknown format: {output_format}")


def deduplicate_results(results: List[Dict]) -> List[Dict]:
    """
    Remove duplicate results based on DOI or title.

    Args:
        results: List of search results

    Returns:
        Deduplicated list
    """
    seen_dois = set()
    seen_titles = set()
    unique_results = []
    for result in results:
        doi = result.get('doi', '').lower().strip()
        title = result.get('title', '').lower().strip()
        # Check DOI first (more reliable)
        if doi and doi in seen_dois:
            continue
        # Check title as fallback
        if not doi and title in seen_titles:
            continue
        # Add to results
        if doi:
            seen_dois.add(doi)
        if title:
            seen_titles.add(title)
        unique_results.append(result)
    return unique_results


def rank_results(results: List[Dict], criteria: str = 'citations') -> List[Dict]:
    """
    Rank results by specified criteria.

    Args:
        results: List of search results
        criteria: Ranking criteria (citations, year, relevance)

    Returns:
        Ranked list
    """
    if criteria == 'citations':
        return sorted(results, key=lambda x: x.get('citations', 0), reverse=True)
    elif criteria == 'year':
        return sorted(results, key=lambda x: x.get('year', '0'), reverse=True)
    elif criteria == 'relevance':
        return sorted(results, key=lambda x: x.get('relevance_score', 0), reverse=True)
    else:
        return results


def filter_by_year(results: List[Dict], start_year: int = None, end_year: int = None) -> List[Dict]:
    """
    Filter results by publication year range.

    Args:
        results: List of search results
        start_year: Minimum year (inclusive)
        end_year: Maximum year (inclusive)

    Returns:
        Filtered list
    """
    filtered = []
    for result in results:
        try:
            year = int(result.get('year', 0))
            if start_year and year < start_year:
                continue
            if end_year and year > end_year:
                continue
            filtered.append(result)
        except (ValueError, TypeError):
            # Include if year parsing fails
            filtered.append(result)
    return filtered


def generate_search_summary(results: List[Dict]) -> Dict:
    """
    Generate summary statistics for search results.

    Args:
        results: List of search results

    Returns:
        Summary dictionary
    """
    summary = {
        'total_results': len(results),
        'sources': {},
        'year_distribution': {},
        'avg_citations': 0,
        'total_citations': 0
    }
    citations = []
    for result in results:
        # Count by source
        source = result.get('source', 'Unknown')
        summary['sources'][source] = summary['sources'].get(source, 0) + 1
        # Count by year
        year = result.get('year', 'Unknown')
        summary['year_distribution'][year] = summary['year_distribution'].get(year, 0) + 1
        # Collect citations
        if result.get('citations'):
            try:
                citations.append(int(result['citations']))
            except (ValueError, TypeError):
                pass
    if citations:
        summary['avg_citations'] = sum(citations) / len(citations)
        summary['total_citations'] = sum(citations)
    return summary


def main():
    """Command-line interface for search result processing."""
    if len(sys.argv) < 2:
        print("Usage: python search_databases.py <results.json> [options]")
        print("\nOptions:")
        print(" --format FORMAT Output format (json, markdown, bibtex)")
        print(" --output FILE Output file (default: stdout)")
        print(" --rank CRITERIA Rank by (citations, year, relevance)")
        print(" --year-start YEAR Filter by start year")
        print(" --year-end YEAR Filter by end year")
        print(" --deduplicate Remove duplicates")
        print(" --summary Show summary statistics")
        sys.exit(1)

    # Load results
    results_file = sys.argv[1]
    try:
        with open(results_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except Exception as e:
        print(f"Error loading results: {e}")
        sys.exit(1)

    # Parse options
    output_format = 'markdown'
    output_file = None
    rank_criteria = None
    year_start = None
    year_end = None
    do_dedup = False
    show_summary = False

    i = 2
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == '--format' and i + 1 < len(sys.argv):
            output_format = sys.argv[i + 1]
            i += 2
        elif arg == '--output' and i + 1 < len(sys.argv):
            output_file = sys.argv[i + 1]
            i += 2
        elif arg == '--rank' and i + 1 < len(sys.argv):
            rank_criteria = sys.argv[i + 1]
            i += 2
        elif arg == '--year-start' and i + 1 < len(sys.argv):
            year_start = int(sys.argv[i + 1])
            i += 2
        elif arg == '--year-end' and i + 1 < len(sys.argv):
            year_end = int(sys.argv[i + 1])
            i += 2
        elif arg == '--deduplicate':
            do_dedup = True
            i += 1
        elif arg == '--summary':
            show_summary = True
            i += 1
        else:
            i += 1

    # Process results
    if do_dedup:
        results = deduplicate_results(results)
        print(f"After deduplication: {len(results)} results")
    if year_start or year_end:
        results = filter_by_year(results, year_start, year_end)
        print(f"After year filter: {len(results)} results")
    if rank_criteria:
        results = rank_results(results, rank_criteria)
        print(f"Ranked by: {rank_criteria}")

    # Show summary
    if show_summary:
        summary = generate_search_summary(results)
        print("\n" + "="*60)
        print("SEARCH SUMMARY")
        print("="*60)
        print(json.dumps(summary, indent=2))
        print()

    # Format output
    output = format_search_results(results, output_format)

    # Write output
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f"✓ Results saved to: {output_file}")
    else:
        print(output)


if __name__ == "__main__":
    main()
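
A small sketch of the processing pipeline these helpers form, using two invented records in place of a results.json file; it assumes the script is importable as search_databases.

# Illustrative only; the records below are made-up sample data.
from search_databases import deduplicate_results, rank_results, format_search_results

sample = [
    {"title": "An Example Survey", "authors": "Doe, J.", "first_author": "doe",
     "year": "2021", "doi": "10.1000/example.1", "source": "Crossref", "citations": 120},
    {"title": "An Example Survey", "authors": "Doe, J.", "first_author": "doe",
     "year": "2021", "doi": "10.1000/example.1", "source": "OpenAlex", "citations": 118},
]

unique = deduplicate_results(sample)        # second record shares the DOI and is dropped
ranked = rank_results(unique, "citations")  # highest citation count first
print(format_search_results(ranked, "bibtex"))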

verify_citations.py

@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
Citation Verification Script
Verifies DOIs, URLs, and citation metadata for accuracy.
"""
import re
import requests
import json
from typing import Dict, List, Tuple
from urllib.parse import urlparse
import time


class CitationVerifier:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'CitationVerifier/1.0 (Literature Review Tool)'
        })

    def extract_dois(self, text: str) -> List[str]:
        """Extract all DOIs from text."""
        doi_pattern = r'10\.\d{4,}/[^\s\]\)"]+'
        return re.findall(doi_pattern, text)

    def verify_doi(self, doi: str) -> Tuple[bool, Dict]:
        """
        Verify a DOI and retrieve metadata.
        Returns (is_valid, metadata)
        """
        try:
            url = f"https://doi.org/api/handles/{doi}"
            response = self.session.get(url, timeout=10)
            if response.status_code == 200:
                # DOI exists, now get metadata from CrossRef
                metadata = self._get_crossref_metadata(doi)
                return True, metadata
            else:
                return False, {}
        except Exception as e:
            return False, {"error": str(e)}

    def _get_crossref_metadata(self, doi: str) -> Dict:
        """Get metadata from CrossRef API."""
        try:
            url = f"https://api.crossref.org/works/{doi}"
            response = self.session.get(url, timeout=10)
            if response.status_code == 200:
                data = response.json()
                message = data.get('message', {})
                # Extract key metadata
                metadata = {
                    'title': message.get('title', [''])[0],
                    'authors': self._format_authors(message.get('author', [])),
                    'year': self._extract_year(message),
                    'journal': message.get('container-title', [''])[0],
                    'volume': message.get('volume', ''),
                    'pages': message.get('page', ''),
                    'doi': doi
                }
                return metadata
            return {}
        except Exception as e:
            return {"error": str(e)}

    def _format_authors(self, authors: List[Dict]) -> str:
        """Format author list."""
        if not authors:
            return ""
        formatted = []
        for author in authors[:3]:  # First 3 authors
            given = author.get('given', '')
            family = author.get('family', '')
            if family:
                formatted.append(f"{family}, {given[0]}." if given else family)
        if len(authors) > 3:
            formatted.append("et al.")
        return ", ".join(formatted)

    def _extract_year(self, message: Dict) -> str:
        """Extract publication year."""
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])
        if date_parts and date_parts[0]:
            return str(date_parts[0][0])
        return ""

    def verify_url(self, url: str) -> Tuple[bool, int]:
        """
        Verify a URL is accessible.
        Returns (is_accessible, status_code)
        """
        try:
            response = self.session.head(url, timeout=10, allow_redirects=True)
            is_accessible = response.status_code < 400
            return is_accessible, response.status_code
        except Exception as e:
            return False, 0

    def verify_citations_in_file(self, filepath: str) -> Dict:
        """
        Verify all citations in a markdown file.
        Returns a report of verification results.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        dois = self.extract_dois(content)
        report = {
            'total_dois': len(dois),
            'verified': [],
            'failed': [],
            'metadata': {}
        }
        for doi in dois:
            print(f"Verifying DOI: {doi}")
            is_valid, metadata = self.verify_doi(doi)
            if is_valid:
                report['verified'].append(doi)
                report['metadata'][doi] = metadata
            else:
                report['failed'].append(doi)
            time.sleep(0.5)  # Rate limiting
        return report

    def format_citation_apa(self, metadata: Dict) -> str:
        """Format citation in APA style."""
        authors = metadata.get('authors', '')
        year = metadata.get('year', 'n.d.')
        title = metadata.get('title', '')
        journal = metadata.get('journal', '')
        volume = metadata.get('volume', '')
        pages = metadata.get('pages', '')
        doi = metadata.get('doi', '')
        citation = f"{authors} ({year}). {title}. "
        if journal:
            citation += f"*{journal}*"
        if volume:
            citation += f", *{volume}*"
        if pages:
            citation += f", {pages}"
        if doi:
            citation += f". https://doi.org/{doi}"
        return citation

    def format_citation_nature(self, metadata: Dict) -> str:
        """Format citation in Nature style."""
        authors = metadata.get('authors', '')
        title = metadata.get('title', '')
        journal = metadata.get('journal', '')
        volume = metadata.get('volume', '')
        pages = metadata.get('pages', '')
        year = metadata.get('year', '')
        citation = f"{authors} {title}. "
        if journal:
            citation += f"*{journal}* "
        if volume:
            citation += f"**{volume}**, "
        if pages:
            citation += f"{pages} "
        if year:
            citation += f"({year})"
        return citation


def main():
    """Example usage."""
    import sys
    if len(sys.argv) < 2:
        print("Usage: python verify_citations.py <markdown_file>")
        sys.exit(1)

    filepath = sys.argv[1]
    verifier = CitationVerifier()
    print(f"Verifying citations in: {filepath}")
    report = verifier.verify_citations_in_file(filepath)

    print("\n" + "="*60)
    print("CITATION VERIFICATION REPORT")
    print("="*60)
    print(f"\nTotal DOIs found: {report['total_dois']}")
    print(f"Verified: {len(report['verified'])}")
    print(f"Failed: {len(report['failed'])}")

    if report['failed']:
        print("\nFailed DOIs:")
        for doi in report['failed']:
            print(f" - {doi}")

    if report['metadata']:
        print("\n\nVerified Citations (APA format):")
        for doi, metadata in report['metadata'].items():
            citation = verifier.format_citation_apa(metadata)
            print(f"\n{citation}")

    # Save detailed report
    output_file = filepath.replace('.md', '_citation_report.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"\n\nDetailed report saved to: {output_file}")


if __name__ == "__main__":
    main()
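
A short sketch of the offline pieces of CitationVerifier (no network calls); it assumes the script is importable as verify_citations, and the DOI and metadata dict below are invented sample input.

# Illustrative only; no requests are sent.
from verify_citations import CitationVerifier

verifier = CitationVerifier()

# DOI extraction works on any text blob.
text = "See https://doi.org/10.1000/example.123 for details."
print(verifier.extract_dois(text))   # ['10.1000/example.123']

# APA formatting from a dict shaped like _get_crossref_metadata() output.
sample = {
    "authors": "Doe, J., Smith, A.",
    "year": "2021",
    "title": "An Example Article",
    "journal": "Journal of Examples",
    "volume": "12",
    "pages": "34-56",
    "doi": "10.1000/example.123",
}
print(verifier.format_citation_apa(sample))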