223 lines
6.9 KiB
Python
223 lines
6.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Citation Verification Script
|
|
Verifies DOIs, URLs, and citation metadata for accuracy.
|
|
"""
|
|
|
|
import re
|
|
import requests
|
|
import json
|
|
from typing import Dict, List, Tuple
|
|
from urllib.parse import urlparse
|
|
import time
|
|
|
|
class CitationVerifier:
|
|
def __init__(self):
|
|
self.session = requests.Session()
|
|
self.session.headers.update({
|
|
'User-Agent': 'CitationVerifier/1.0 (Literature Review Tool)'
|
|
})
|
|
|
|
def extract_dois(self, text: str) -> List[str]:
|
|
"""Extract all DOIs from text."""
|
|
doi_pattern = r'10\.\d{4,}/[^\s\]\)"]+'
|
|
return re.findall(doi_pattern, text)
|
|
|
|
def verify_doi(self, doi: str) -> Tuple[bool, Dict]:
|
|
"""
|
|
Verify a DOI and retrieve metadata.
|
|
Returns (is_valid, metadata)
|
|
"""
|
|
try:
|
|
url = f"https://doi.org/api/handles/{doi}"
|
|
response = self.session.get(url, timeout=10)
|
|
|
|
if response.status_code == 200:
|
|
# DOI exists, now get metadata from CrossRef
|
|
metadata = self._get_crossref_metadata(doi)
|
|
return True, metadata
|
|
else:
|
|
return False, {}
|
|
except Exception as e:
|
|
return False, {"error": str(e)}
|
|
|
|
def _get_crossref_metadata(self, doi: str) -> Dict:
|
|
"""Get metadata from CrossRef API."""
|
|
try:
|
|
url = f"https://api.crossref.org/works/{doi}"
|
|
response = self.session.get(url, timeout=10)
|
|
|
|
if response.status_code == 200:
|
|
data = response.json()
|
|
message = data.get('message', {})
|
|
|
|
# Extract key metadata
|
|
metadata = {
|
|
'title': message.get('title', [''])[0],
|
|
'authors': self._format_authors(message.get('author', [])),
|
|
'year': self._extract_year(message),
|
|
'journal': message.get('container-title', [''])[0],
|
|
'volume': message.get('volume', ''),
|
|
'pages': message.get('page', ''),
|
|
'doi': doi
|
|
}
|
|
return metadata
|
|
return {}
|
|
except Exception as e:
|
|
return {"error": str(e)}
|
|
|
|
def _format_authors(self, authors: List[Dict]) -> str:
|
|
"""Format author list."""
|
|
if not authors:
|
|
return ""
|
|
|
|
formatted = []
|
|
for author in authors[:3]: # First 3 authors
|
|
given = author.get('given', '')
|
|
family = author.get('family', '')
|
|
if family:
|
|
formatted.append(f"{family}, {given[0]}." if given else family)
|
|
|
|
if len(authors) > 3:
|
|
formatted.append("et al.")
|
|
|
|
return ", ".join(formatted)
|
|
|
|
def _extract_year(self, message: Dict) -> str:
|
|
"""Extract publication year."""
|
|
date_parts = message.get('published-print', {}).get('date-parts', [[]])
|
|
if not date_parts or not date_parts[0]:
|
|
date_parts = message.get('published-online', {}).get('date-parts', [[]])
|
|
|
|
if date_parts and date_parts[0]:
|
|
return str(date_parts[0][0])
|
|
return ""
|
|
|
|
def verify_url(self, url: str) -> Tuple[bool, int]:
|
|
"""
|
|
Verify a URL is accessible.
|
|
Returns (is_accessible, status_code)
|
|
"""
|
|
try:
|
|
response = self.session.head(url, timeout=10, allow_redirects=True)
|
|
is_accessible = response.status_code < 400
|
|
return is_accessible, response.status_code
|
|
except Exception as e:
|
|
return False, 0
|
|
|
|
def verify_citations_in_file(self, filepath: str) -> Dict:
|
|
"""
|
|
Verify all citations in a markdown file.
|
|
Returns a report of verification results.
|
|
"""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
|
|
dois = self.extract_dois(content)
|
|
|
|
report = {
|
|
'total_dois': len(dois),
|
|
'verified': [],
|
|
'failed': [],
|
|
'metadata': {}
|
|
}
|
|
|
|
for doi in dois:
|
|
print(f"Verifying DOI: {doi}")
|
|
is_valid, metadata = self.verify_doi(doi)
|
|
|
|
if is_valid:
|
|
report['verified'].append(doi)
|
|
report['metadata'][doi] = metadata
|
|
else:
|
|
report['failed'].append(doi)
|
|
|
|
time.sleep(0.5) # Rate limiting
|
|
|
|
return report
|
|
|
|
def format_citation_apa(self, metadata: Dict) -> str:
|
|
"""Format citation in APA style."""
|
|
authors = metadata.get('authors', '')
|
|
year = metadata.get('year', 'n.d.')
|
|
title = metadata.get('title', '')
|
|
journal = metadata.get('journal', '')
|
|
volume = metadata.get('volume', '')
|
|
pages = metadata.get('pages', '')
|
|
doi = metadata.get('doi', '')
|
|
|
|
citation = f"{authors} ({year}). {title}. "
|
|
if journal:
|
|
citation += f"*{journal}*"
|
|
if volume:
|
|
citation += f", *{volume}*"
|
|
if pages:
|
|
citation += f", {pages}"
|
|
if doi:
|
|
citation += f". https://doi.org/{doi}"
|
|
|
|
return citation
|
|
|
|
def format_citation_nature(self, metadata: Dict) -> str:
|
|
"""Format citation in Nature style."""
|
|
authors = metadata.get('authors', '')
|
|
title = metadata.get('title', '')
|
|
journal = metadata.get('journal', '')
|
|
volume = metadata.get('volume', '')
|
|
pages = metadata.get('pages', '')
|
|
year = metadata.get('year', '')
|
|
|
|
citation = f"{authors} {title}. "
|
|
if journal:
|
|
citation += f"*{journal}* "
|
|
if volume:
|
|
citation += f"**{volume}**, "
|
|
if pages:
|
|
citation += f"{pages} "
|
|
if year:
|
|
citation += f"({year})"
|
|
|
|
return citation
|
|
|
|
def main():
|
|
"""Example usage."""
|
|
import sys
|
|
|
|
if len(sys.argv) < 2:
|
|
print("Usage: python verify_citations.py <markdown_file>")
|
|
sys.exit(1)
|
|
|
|
filepath = sys.argv[1]
|
|
verifier = CitationVerifier()
|
|
|
|
print(f"Verifying citations in: {filepath}")
|
|
report = verifier.verify_citations_in_file(filepath)
|
|
|
|
print("\n" + "="*60)
|
|
print("CITATION VERIFICATION REPORT")
|
|
print("="*60)
|
|
print(f"\nTotal DOIs found: {report['total_dois']}")
|
|
print(f"Verified: {len(report['verified'])}")
|
|
print(f"Failed: {len(report['failed'])}")
|
|
|
|
if report['failed']:
|
|
print("\nFailed DOIs:")
|
|
for doi in report['failed']:
|
|
print(f" - {doi}")
|
|
|
|
if report['metadata']:
|
|
print("\n\nVerified Citations (APA format):")
|
|
for doi, metadata in report['metadata'].items():
|
|
citation = verifier.format_citation_apa(metadata)
|
|
print(f"\n{citation}")
|
|
|
|
# Save detailed report
|
|
output_file = filepath.replace('.md', '_citation_report.json')
|
|
with open(output_file, 'w', encoding='utf-8') as f:
|
|
json.dump(report, f, indent=2)
|
|
|
|
print(f"\n\nDetailed report saved to: {output_file}")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|