Initial commit

Zhongwei Li
2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions

generate_pdf.py

@@ -0,0 +1,176 @@
#!/usr/bin/env python3
"""
PDF Generation Script for Literature Reviews
Converts markdown files to professionally formatted PDFs with proper styling.
"""
import subprocess
import sys
import os
from pathlib import Path


def generate_pdf(
    markdown_file: str,
    output_pdf: str = None,
    citation_style: str = "apa",
    template: str = None,
    toc: bool = True,
    number_sections: bool = True
) -> bool:
    """
    Generate a PDF from a markdown file using pandoc.

    Args:
        markdown_file: Path to the markdown file
        output_pdf: Path for output PDF (defaults to same name as markdown)
        citation_style: Citation style (apa, nature, chicago, etc.)
        template: Path to custom LaTeX template
        toc: Include table of contents
        number_sections: Number the sections

    Returns:
        True if successful, False otherwise
    """
    # Verify markdown file exists
    if not os.path.exists(markdown_file):
        print(f"Error: Markdown file not found: {markdown_file}")
        return False

    # Set default output path
    if output_pdf is None:
        output_pdf = Path(markdown_file).with_suffix('.pdf')

    # Check if pandoc is installed
    try:
        subprocess.run(['pandoc', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: pandoc is not installed.")
        print("Install with: brew install pandoc (macOS) or apt-get install pandoc (Linux)")
        return False

    # Build pandoc command
    cmd = [
        'pandoc',
        markdown_file,
        '-o', str(output_pdf),
        '--pdf-engine=xelatex',  # Better Unicode support
        '-V', 'geometry:margin=1in',
        '-V', 'fontsize=11pt',
        '-V', 'colorlinks=true',
        '-V', 'linkcolor=blue',
        '-V', 'urlcolor=blue',
        '-V', 'citecolor=blue',
    ]

    # Add table of contents
    if toc:
        cmd.extend(['--toc', '--toc-depth=3'])

    # Add section numbering
    if number_sections:
        cmd.append('--number-sections')

    # Add citation processing if bibliography exists
    bib_file = Path(markdown_file).with_suffix('.bib')
    if bib_file.exists():
        cmd.extend([
            '--citeproc',
            '--bibliography', str(bib_file),
            '--csl', f'{citation_style}.csl' if not citation_style.endswith('.csl') else citation_style
        ])

    # Add custom template if provided
    if template and os.path.exists(template):
        cmd.extend(['--template', template])

    # Execute pandoc
    try:
        print(f"Generating PDF: {output_pdf}")
        print(f"Command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        print(f"✓ PDF generated successfully: {output_pdf}")
        return True
    except subprocess.CalledProcessError as e:
        print("Error generating PDF:")
        print(f"STDOUT: {e.stdout}")
        print(f"STDERR: {e.stderr}")
        return False


def check_dependencies():
    """Check if required dependencies are installed."""
    dependencies = {
        'pandoc': 'pandoc --version',
        'xelatex': 'xelatex --version'
    }
    missing = []
    for name, cmd in dependencies.items():
        try:
            subprocess.run(cmd.split(), capture_output=True, check=True)
            print(f"{name} is installed")
        except (subprocess.CalledProcessError, FileNotFoundError):
            print(f"{name} is NOT installed")
            missing.append(name)

    if missing:
        print("\n" + "="*60)
        print("Missing dependencies:")
        for dep in missing:
            if dep == 'pandoc':
                print(" - pandoc: brew install pandoc (macOS) or apt-get install pandoc (Linux)")
            elif dep == 'xelatex':
                print(" - xelatex: brew install --cask mactex (macOS) or apt-get install texlive-xetex (Linux)")
        return False
    return True


def main():
    """Command-line interface."""
    if len(sys.argv) < 2:
        print("Usage: python generate_pdf.py <markdown_file> [output_pdf] [--citation-style STYLE]")
        print("\nOptions:")
        print(" --citation-style STYLE Citation style (default: apa)")
        print(" --no-toc Disable table of contents")
        print(" --no-numbers Disable section numbering")
        print(" --check-deps Check if dependencies are installed")
        sys.exit(1)

    # Check dependencies mode
    if '--check-deps' in sys.argv:
        check_dependencies()
        sys.exit(0)

    # Parse arguments
    markdown_file = sys.argv[1]
    output_pdf = sys.argv[2] if len(sys.argv) > 2 and not sys.argv[2].startswith('--') else None
    citation_style = 'apa'
    toc = True
    number_sections = True

    # Parse optional flags
    if '--citation-style' in sys.argv:
        idx = sys.argv.index('--citation-style')
        if idx + 1 < len(sys.argv):
            citation_style = sys.argv[idx + 1]
    if '--no-toc' in sys.argv:
        toc = False
    if '--no-numbers' in sys.argv:
        number_sections = False

    # Generate PDF
    success = generate_pdf(
        markdown_file,
        output_pdf,
        citation_style=citation_style,
        toc=toc,
        number_sections=number_sections
    )
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
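
A minimal sketch of driving this module programmatically instead of through the CLI. It assumes generate_pdf.py is importable from the working directory; the input and output file names are hypothetical placeholders.

# Illustrative only, not part of the committed script.
from generate_pdf import generate_pdf, check_dependencies

if check_dependencies():
    ok = generate_pdf(
        "review.md",              # hypothetical input file
        "review.pdf",             # hypothetical output path
        citation_style="apa",
        toc=True,
        number_sections=True,
    )
    print("done" if ok else "pandoc failed")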

search_databases.py

@@ -0,0 +1,303 @@
#!/usr/bin/env python3
"""
Literature Database Search Script
Searches multiple literature databases and aggregates results.
"""
import json
import sys
from typing import Dict, List
from datetime import datetime


def format_search_results(results: List[Dict], output_format: str = 'json') -> str:
    """
    Format search results for output.

    Args:
        results: List of search results
        output_format: Format (json, markdown, or bibtex)

    Returns:
        Formatted string
    """
    if output_format == 'json':
        return json.dumps(results, indent=2)
    elif output_format == 'markdown':
        md = f"# Literature Search Results\n\n"
        md += f"**Search Date**: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n"
        md += f"**Total Results**: {len(results)}\n\n"
        for i, result in enumerate(results, 1):
            md += f"## {i}. {result.get('title', 'Untitled')}\n\n"
            md += f"**Authors**: {result.get('authors', 'Unknown')}\n\n"
            md += f"**Year**: {result.get('year', 'N/A')}\n\n"
            md += f"**Source**: {result.get('source', 'Unknown')}\n\n"
            if result.get('abstract'):
                md += f"**Abstract**: {result['abstract']}\n\n"
            if result.get('doi'):
                md += f"**DOI**: [{result['doi']}](https://doi.org/{result['doi']})\n\n"
            if result.get('url'):
                md += f"**URL**: {result['url']}\n\n"
            if result.get('citations'):
                md += f"**Citations**: {result['citations']}\n\n"
            md += "---\n\n"
        return md
    elif output_format == 'bibtex':
        bibtex = ""
        for i, result in enumerate(results, 1):
            entry_type = result.get('type', 'article')
            cite_key = f"{result.get('first_author', 'unknown')}{result.get('year', '0000')}"
            bibtex += f"@{entry_type}{{{cite_key},\n"
            bibtex += f" title = {{{result.get('title', '')}}},\n"
            bibtex += f" author = {{{result.get('authors', '')}}},\n"
            bibtex += f" year = {{{result.get('year', '')}}},\n"
            if result.get('journal'):
                bibtex += f" journal = {{{result['journal']}}},\n"
            if result.get('volume'):
                bibtex += f" volume = {{{result['volume']}}},\n"
            if result.get('pages'):
                bibtex += f" pages = {{{result['pages']}}},\n"
            if result.get('doi'):
                bibtex += f" doi = {{{result['doi']}}},\n"
            bibtex += "}\n\n"
        return bibtex
    else:
        raise ValueError(f"Unknown format: {output_format}")


def deduplicate_results(results: List[Dict]) -> List[Dict]:
    """
    Remove duplicate results based on DOI or title.

    Args:
        results: List of search results

    Returns:
        Deduplicated list
    """
    seen_dois = set()
    seen_titles = set()
    unique_results = []
    for result in results:
        doi = result.get('doi', '').lower().strip()
        title = result.get('title', '').lower().strip()
        # Check DOI first (more reliable)
        if doi and doi in seen_dois:
            continue
        # Check title as fallback
        if not doi and title in seen_titles:
            continue
        # Add to results
        if doi:
            seen_dois.add(doi)
        if title:
            seen_titles.add(title)
        unique_results.append(result)
    return unique_results


def rank_results(results: List[Dict], criteria: str = 'citations') -> List[Dict]:
    """
    Rank results by specified criteria.

    Args:
        results: List of search results
        criteria: Ranking criteria (citations, year, relevance)

    Returns:
        Ranked list
    """
    if criteria == 'citations':
        return sorted(results, key=lambda x: x.get('citations', 0), reverse=True)
    elif criteria == 'year':
        return sorted(results, key=lambda x: x.get('year', '0'), reverse=True)
    elif criteria == 'relevance':
        return sorted(results, key=lambda x: x.get('relevance_score', 0), reverse=True)
    else:
        return results


def filter_by_year(results: List[Dict], start_year: int = None, end_year: int = None) -> List[Dict]:
    """
    Filter results by publication year range.

    Args:
        results: List of search results
        start_year: Minimum year (inclusive)
        end_year: Maximum year (inclusive)

    Returns:
        Filtered list
    """
    filtered = []
    for result in results:
        try:
            year = int(result.get('year', 0))
            if start_year and year < start_year:
                continue
            if end_year and year > end_year:
                continue
            filtered.append(result)
        except (ValueError, TypeError):
            # Include if year parsing fails
            filtered.append(result)
    return filtered


def generate_search_summary(results: List[Dict]) -> Dict:
    """
    Generate summary statistics for search results.

    Args:
        results: List of search results

    Returns:
        Summary dictionary
    """
    summary = {
        'total_results': len(results),
        'sources': {},
        'year_distribution': {},
        'avg_citations': 0,
        'total_citations': 0
    }
    citations = []
    for result in results:
        # Count by source
        source = result.get('source', 'Unknown')
        summary['sources'][source] = summary['sources'].get(source, 0) + 1
        # Count by year
        year = result.get('year', 'Unknown')
        summary['year_distribution'][year] = summary['year_distribution'].get(year, 0) + 1
        # Collect citations
        if result.get('citations'):
            try:
                citations.append(int(result['citations']))
            except (ValueError, TypeError):
                pass
    if citations:
        summary['avg_citations'] = sum(citations) / len(citations)
        summary['total_citations'] = sum(citations)
    return summary


def main():
    """Command-line interface for search result processing."""
    if len(sys.argv) < 2:
        print("Usage: python search_databases.py <results.json> [options]")
        print("\nOptions:")
        print(" --format FORMAT Output format (json, markdown, bibtex)")
        print(" --output FILE Output file (default: stdout)")
        print(" --rank CRITERIA Rank by (citations, year, relevance)")
        print(" --year-start YEAR Filter by start year")
        print(" --year-end YEAR Filter by end year")
        print(" --deduplicate Remove duplicates")
        print(" --summary Show summary statistics")
        sys.exit(1)

    # Load results
    results_file = sys.argv[1]
    try:
        with open(results_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except Exception as e:
        print(f"Error loading results: {e}")
        sys.exit(1)

    # Parse options
    output_format = 'markdown'
    output_file = None
    rank_criteria = None
    year_start = None
    year_end = None
    do_dedup = False
    show_summary = False

    i = 2
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == '--format' and i + 1 < len(sys.argv):
            output_format = sys.argv[i + 1]
            i += 2
        elif arg == '--output' and i + 1 < len(sys.argv):
            output_file = sys.argv[i + 1]
            i += 2
        elif arg == '--rank' and i + 1 < len(sys.argv):
            rank_criteria = sys.argv[i + 1]
            i += 2
        elif arg == '--year-start' and i + 1 < len(sys.argv):
            year_start = int(sys.argv[i + 1])
            i += 2
        elif arg == '--year-end' and i + 1 < len(sys.argv):
            year_end = int(sys.argv[i + 1])
            i += 2
        elif arg == '--deduplicate':
            do_dedup = True
            i += 1
        elif arg == '--summary':
            show_summary = True
            i += 1
        else:
            i += 1

    # Process results
    if do_dedup:
        results = deduplicate_results(results)
        print(f"After deduplication: {len(results)} results")
    if year_start or year_end:
        results = filter_by_year(results, year_start, year_end)
        print(f"After year filter: {len(results)} results")
    if rank_criteria:
        results = rank_results(results, rank_criteria)
        print(f"Ranked by: {rank_criteria}")

    # Show summary
    if show_summary:
        summary = generate_search_summary(results)
        print("\n" + "="*60)
        print("SEARCH SUMMARY")
        print("="*60)
        print(json.dumps(summary, indent=2))
        print()

    # Format output
    output = format_search_results(results, output_format)

    # Write output
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f"✓ Results saved to: {output_file}")
    else:
        print(output)


if __name__ == "__main__":
    main()
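
A small sketch of the processing pipeline these helpers form, using two invented records in place of a results.json file; it assumes the script is importable as search_databases.

# Illustrative only; the records below are made-up sample data.
from search_databases import deduplicate_results, rank_results, format_search_results

sample = [
    {"title": "An Example Survey", "authors": "Doe, J.", "first_author": "doe",
     "year": "2021", "doi": "10.1000/example.1", "source": "Crossref", "citations": 120},
    {"title": "An Example Survey", "authors": "Doe, J.", "first_author": "doe",
     "year": "2021", "doi": "10.1000/example.1", "source": "OpenAlex", "citations": 118},
]

unique = deduplicate_results(sample)        # second record shares the DOI and is dropped
ranked = rank_results(unique, "citations")  # highest citation count first
print(format_search_results(ranked, "bibtex"))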

verify_citations.py

@@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""
Citation Verification Script
Verifies DOIs, URLs, and citation metadata for accuracy.
"""
import re
import requests
import json
from typing import Dict, List, Tuple
from urllib.parse import urlparse
import time


class CitationVerifier:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'CitationVerifier/1.0 (Literature Review Tool)'
        })

    def extract_dois(self, text: str) -> List[str]:
        """Extract all DOIs from text."""
        doi_pattern = r'10\.\d{4,}/[^\s\]\)"]+'
        return re.findall(doi_pattern, text)

    def verify_doi(self, doi: str) -> Tuple[bool, Dict]:
        """
        Verify a DOI and retrieve metadata.
        Returns (is_valid, metadata)
        """
        try:
            url = f"https://doi.org/api/handles/{doi}"
            response = self.session.get(url, timeout=10)
            if response.status_code == 200:
                # DOI exists, now get metadata from CrossRef
                metadata = self._get_crossref_metadata(doi)
                return True, metadata
            else:
                return False, {}
        except Exception as e:
            return False, {"error": str(e)}

    def _get_crossref_metadata(self, doi: str) -> Dict:
        """Get metadata from CrossRef API."""
        try:
            url = f"https://api.crossref.org/works/{doi}"
            response = self.session.get(url, timeout=10)
            if response.status_code == 200:
                data = response.json()
                message = data.get('message', {})
                # Extract key metadata
                metadata = {
                    'title': message.get('title', [''])[0],
                    'authors': self._format_authors(message.get('author', [])),
                    'year': self._extract_year(message),
                    'journal': message.get('container-title', [''])[0],
                    'volume': message.get('volume', ''),
                    'pages': message.get('page', ''),
                    'doi': doi
                }
                return metadata
            return {}
        except Exception as e:
            return {"error": str(e)}

    def _format_authors(self, authors: List[Dict]) -> str:
        """Format author list."""
        if not authors:
            return ""
        formatted = []
        for author in authors[:3]:  # First 3 authors
            given = author.get('given', '')
            family = author.get('family', '')
            if family:
                formatted.append(f"{family}, {given[0]}." if given else family)
        if len(authors) > 3:
            formatted.append("et al.")
        return ", ".join(formatted)

    def _extract_year(self, message: Dict) -> str:
        """Extract publication year."""
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])
        if date_parts and date_parts[0]:
            return str(date_parts[0][0])
        return ""

    def verify_url(self, url: str) -> Tuple[bool, int]:
        """
        Verify a URL is accessible.
        Returns (is_accessible, status_code)
        """
        try:
            response = self.session.head(url, timeout=10, allow_redirects=True)
            is_accessible = response.status_code < 400
            return is_accessible, response.status_code
        except Exception as e:
            return False, 0

    def verify_citations_in_file(self, filepath: str) -> Dict:
        """
        Verify all citations in a markdown file.
        Returns a report of verification results.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
        dois = self.extract_dois(content)
        report = {
            'total_dois': len(dois),
            'verified': [],
            'failed': [],
            'metadata': {}
        }
        for doi in dois:
            print(f"Verifying DOI: {doi}")
            is_valid, metadata = self.verify_doi(doi)
            if is_valid:
                report['verified'].append(doi)
                report['metadata'][doi] = metadata
            else:
                report['failed'].append(doi)
            time.sleep(0.5)  # Rate limiting
        return report

    def format_citation_apa(self, metadata: Dict) -> str:
        """Format citation in APA style."""
        authors = metadata.get('authors', '')
        year = metadata.get('year', 'n.d.')
        title = metadata.get('title', '')
        journal = metadata.get('journal', '')
        volume = metadata.get('volume', '')
        pages = metadata.get('pages', '')
        doi = metadata.get('doi', '')
        citation = f"{authors} ({year}). {title}. "
        if journal:
            citation += f"*{journal}*"
        if volume:
            citation += f", *{volume}*"
        if pages:
            citation += f", {pages}"
        if doi:
            citation += f". https://doi.org/{doi}"
        return citation

    def format_citation_nature(self, metadata: Dict) -> str:
        """Format citation in Nature style."""
        authors = metadata.get('authors', '')
        title = metadata.get('title', '')
        journal = metadata.get('journal', '')
        volume = metadata.get('volume', '')
        pages = metadata.get('pages', '')
        year = metadata.get('year', '')
        citation = f"{authors} {title}. "
        if journal:
            citation += f"*{journal}* "
        if volume:
            citation += f"**{volume}**, "
        if pages:
            citation += f"{pages} "
        if year:
            citation += f"({year})"
        return citation


def main():
    """Example usage."""
    import sys
    if len(sys.argv) < 2:
        print("Usage: python verify_citations.py <markdown_file>")
        sys.exit(1)

    filepath = sys.argv[1]
    verifier = CitationVerifier()
    print(f"Verifying citations in: {filepath}")
    report = verifier.verify_citations_in_file(filepath)

    print("\n" + "="*60)
    print("CITATION VERIFICATION REPORT")
    print("="*60)
    print(f"\nTotal DOIs found: {report['total_dois']}")
    print(f"Verified: {len(report['verified'])}")
    print(f"Failed: {len(report['failed'])}")

    if report['failed']:
        print("\nFailed DOIs:")
        for doi in report['failed']:
            print(f" - {doi}")

    if report['metadata']:
        print("\n\nVerified Citations (APA format):")
        for doi, metadata in report['metadata'].items():
            citation = verifier.format_citation_apa(metadata)
            print(f"\n{citation}")

    # Save detailed report
    output_file = filepath.replace('.md', '_citation_report.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"\n\nDetailed report saved to: {output_file}")


if __name__ == "__main__":
    main()
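
A short sketch of the offline pieces of CitationVerifier (no network calls); it assumes the script is importable as verify_citations, and the DOI and metadata dict below are invented sample input.

# Illustrative only; no requests are sent.
from verify_citations import CitationVerifier

verifier = CitationVerifier()

# DOI extraction works on any text blob.
text = "See https://doi.org/10.1000/example.123 for details."
print(verifier.extract_dois(text))   # ['10.1000/example.123']

# APA formatting from a dict shaped like _get_crossref_metadata() output.
sample = {
    "authors": "Doe, J., Smith, A.",
    "year": "2021",
    "title": "An Example Article",
    "journal": "Journal of Examples",
    "volume": "12",
    "pages": "34-56",
    "doi": "10.1000/example.123",
}
print(verifier.format_citation_apa(sample))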