#!/usr/bin/env python3
"""
Literature Database Search Script
Searches multiple literature databases and aggregates results.
"""
import json
import sys
from datetime import datetime
from typing import Dict, List, Optional
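
# NOTE: Each input record is expected to be a plain dict. The field names below
# are inferred from how this script reads them (all are optional; missing keys
# fall back to defaults such as 'Untitled', 'Unknown', or 'N/A'):
#   title, authors, first_author, year, source, abstract, doi, url,
#   citations, relevance_score, type, journal, volume, pages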


def format_search_results(results: List[Dict], output_format: str = 'json') -> str:
    """
    Format search results for output.

    Args:
        results: List of search results
        output_format: Format (json, markdown, or bibtex)

    Returns:
        Formatted string
    """
    if output_format == 'json':
        return json.dumps(results, indent=2)
    elif output_format == 'markdown':
        md = "# Literature Search Results\n\n"
        md += f"**Search Date**: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n"
        md += f"**Total Results**: {len(results)}\n\n"
        for i, result in enumerate(results, 1):
            md += f"## {i}. {result.get('title', 'Untitled')}\n\n"
            md += f"**Authors**: {result.get('authors', 'Unknown')}\n\n"
            md += f"**Year**: {result.get('year', 'N/A')}\n\n"
            md += f"**Source**: {result.get('source', 'Unknown')}\n\n"
            if result.get('abstract'):
                md += f"**Abstract**: {result['abstract']}\n\n"
            if result.get('doi'):
                md += f"**DOI**: [{result['doi']}](https://doi.org/{result['doi']})\n\n"
            if result.get('url'):
                md += f"**URL**: {result['url']}\n\n"
            if result.get('citations'):
                md += f"**Citations**: {result['citations']}\n\n"
            md += "---\n\n"
        return md
    elif output_format == 'bibtex':
        bibtex = ""
        for result in results:
            entry_type = result.get('type', 'article')
            cite_key = f"{result.get('first_author', 'unknown')}{result.get('year', '0000')}"
            bibtex += f"@{entry_type}{{{cite_key},\n"
            bibtex += f"  title = {{{result.get('title', '')}}},\n"
            bibtex += f"  author = {{{result.get('authors', '')}}},\n"
            bibtex += f"  year = {{{result.get('year', '')}}},\n"
            if result.get('journal'):
                bibtex += f"  journal = {{{result['journal']}}},\n"
            if result.get('volume'):
                bibtex += f"  volume = {{{result['volume']}}},\n"
            if result.get('pages'):
                bibtex += f"  pages = {{{result['pages']}}},\n"
            if result.get('doi'):
                bibtex += f"  doi = {{{result['doi']}}},\n"
            bibtex += "}\n\n"
        return bibtex
    else:
        raise ValueError(f"Unknown format: {output_format}")


def deduplicate_results(results: List[Dict]) -> List[Dict]:
    """
    Remove duplicate results based on DOI or title.

    Args:
        results: List of search results

    Returns:
        Deduplicated list
    """
    seen_dois = set()
    seen_titles = set()
    unique_results = []
    for result in results:
        doi = result.get('doi', '').lower().strip()
        title = result.get('title', '').lower().strip()
        # Check DOI first (more reliable)
        if doi and doi in seen_dois:
            continue
        # Fall back to the title when no DOI is available
        if not doi and title in seen_titles:
            continue
        # Record identifiers and keep the result
        if doi:
            seen_dois.add(doi)
        if title:
            seen_titles.add(title)
        unique_results.append(result)
    return unique_results


def rank_results(results: List[Dict], criteria: str = 'citations') -> List[Dict]:
    """
    Rank results by specified criteria.

    Args:
        results: List of search results
        criteria: Ranking criteria (citations, year, relevance)

    Returns:
        Ranked list
    """
    def _as_int(value) -> int:
        # Years and citation counts may arrive as ints, numeric strings, or be missing.
        try:
            return int(value)
        except (ValueError, TypeError):
            return 0

    if criteria == 'citations':
        return sorted(results, key=lambda x: _as_int(x.get('citations', 0)), reverse=True)
    elif criteria == 'year':
        return sorted(results, key=lambda x: _as_int(x.get('year', 0)), reverse=True)
    elif criteria == 'relevance':
        return sorted(results, key=lambda x: x.get('relevance_score', 0), reverse=True)
    else:
        return results


def filter_by_year(results: List[Dict], start_year: Optional[int] = None,
                   end_year: Optional[int] = None) -> List[Dict]:
    """
    Filter results by publication year range.

    Args:
        results: List of search results
        start_year: Minimum year (inclusive)
        end_year: Maximum year (inclusive)

    Returns:
        Filtered list
    """
    filtered = []
    for result in results:
        try:
            year = int(result.get('year', 0))
            if start_year is not None and year < start_year:
                continue
            if end_year is not None and year > end_year:
                continue
            filtered.append(result)
        except (ValueError, TypeError):
            # Include the result if its year cannot be parsed
            filtered.append(result)
    return filtered


def generate_search_summary(results: List[Dict]) -> Dict:
    """
    Generate summary statistics for search results.

    Args:
        results: List of search results

    Returns:
        Summary dictionary
    """
    summary = {
        'total_results': len(results),
        'sources': {},
        'year_distribution': {},
        'avg_citations': 0,
        'total_citations': 0
    }
    citations = []
    for result in results:
        # Count by source
        source = result.get('source', 'Unknown')
        summary['sources'][source] = summary['sources'].get(source, 0) + 1
        # Count by year
        year = result.get('year', 'Unknown')
        summary['year_distribution'][year] = summary['year_distribution'].get(year, 0) + 1
        # Collect citation counts
        if result.get('citations'):
            try:
                citations.append(int(result['citations']))
            except (ValueError, TypeError):
                pass
    if citations:
        summary['avg_citations'] = sum(citations) / len(citations)
        summary['total_citations'] = sum(citations)
    return summary


def main():
    """Command-line interface for search result processing."""
    if len(sys.argv) < 2:
        print("Usage: python search_databases.py <results.json> [options]")
        print("\nOptions:")
        print("  --format FORMAT    Output format (json, markdown, bibtex)")
        print("  --output FILE      Output file (default: stdout)")
        print("  --rank CRITERIA    Rank by (citations, year, relevance)")
        print("  --year-start YEAR  Filter by start year")
        print("  --year-end YEAR    Filter by end year")
        print("  --deduplicate      Remove duplicates")
        print("  --summary          Show summary statistics")
        sys.exit(1)

    # Load results
    results_file = sys.argv[1]
    try:
        with open(results_file, 'r', encoding='utf-8') as f:
            results = json.load(f)
    except Exception as e:
        print(f"Error loading results: {e}", file=sys.stderr)
        sys.exit(1)

    # Parse options
    output_format = 'markdown'
    output_file = None
    rank_criteria = None
    year_start = None
    year_end = None
    do_dedup = False
    show_summary = False

    i = 2
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == '--format' and i + 1 < len(sys.argv):
            output_format = sys.argv[i + 1]
            i += 2
        elif arg == '--output' and i + 1 < len(sys.argv):
            output_file = sys.argv[i + 1]
            i += 2
        elif arg == '--rank' and i + 1 < len(sys.argv):
            rank_criteria = sys.argv[i + 1]
            i += 2
        elif arg == '--year-start' and i + 1 < len(sys.argv):
            year_start = int(sys.argv[i + 1])
            i += 2
        elif arg == '--year-end' and i + 1 < len(sys.argv):
            year_end = int(sys.argv[i + 1])
            i += 2
        elif arg == '--deduplicate':
            do_dedup = True
            i += 1
        elif arg == '--summary':
            show_summary = True
            i += 1
        else:
            i += 1

    # Process results; status messages go to stderr so the formatted
    # output stays clean when it is printed to stdout.
    if do_dedup:
        results = deduplicate_results(results)
        print(f"After deduplication: {len(results)} results", file=sys.stderr)
    if year_start or year_end:
        results = filter_by_year(results, year_start, year_end)
        print(f"After year filter: {len(results)} results", file=sys.stderr)
    if rank_criteria:
        results = rank_results(results, rank_criteria)
        print(f"Ranked by: {rank_criteria}", file=sys.stderr)

    # Show summary
    if show_summary:
        summary = generate_search_summary(results)
        print("\n" + "=" * 60, file=sys.stderr)
        print("SEARCH SUMMARY", file=sys.stderr)
        print("=" * 60, file=sys.stderr)
        print(json.dumps(summary, indent=2), file=sys.stderr)
        print(file=sys.stderr)

    # Format output
    output = format_search_results(results, output_format)

    # Write output
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f"✓ Results saved to: {output_file}")
    else:
        print(output)
if __name__ == "__main__":
main()
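
# Example invocation (file names are illustrative; flags are those handled in main()):
#   python search_databases.py results.json --deduplicate --rank citations \
#       --year-start 2018 --format bibtex --output refs.bib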