Initial commit

skills/citation-management/SKILL.md (new file, 1081 lines; diff suppressed because it is too large)

skills/citation-management/assets/bibtex_template.bib (new file, 264 lines)
@@ -0,0 +1,264 @@
% BibTeX Template File
% Examples of properly formatted entries for all common types

% =============================================================================
% JOURNAL ARTICLES
% =============================================================================

@article{Jumper2021,
  author = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'\i}dek, Augustin and Potapenko, Anna and others},
  title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
  journal = {Nature},
  year = {2021},
  volume = {596},
  number = {7873},
  pages = {583--589},
  doi = {10.1038/s41586-021-03819-2}
}

@article{Watson1953,
  author = {Watson, James D. and Crick, Francis H. C.},
  title = {Molecular Structure of Nucleic Acids: A Structure for Deoxyribose Nucleic Acid},
  journal = {Nature},
  year = {1953},
  volume = {171},
  number = {4356},
  pages = {737--738},
  doi = {10.1038/171737a0}
}

@article{Doudna2014,
  author = {Doudna, Jennifer A. and Charpentier, Emmanuelle},
  title = {The New Frontier of Genome Engineering with {CRISPR-Cas9}},
  journal = {Science},
  year = {2014},
  volume = {346},
  number = {6213},
  pages = {1258096},
  doi = {10.1126/science.1258096}
}

% =============================================================================
% BOOKS
% =============================================================================

@book{Kumar2021,
  author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
  title = {Robbins and Cotran Pathologic Basis of Disease},
  publisher = {Elsevier},
  year = {2021},
  edition = {10},
  address = {Philadelphia, PA},
  isbn = {978-0-323-53113-9}
}

@book{Alberts2014,
  author = {Alberts, Bruce and Johnson, Alexander and Lewis, Julian and Morgan, David and Raff, Martin and Roberts, Keith and Walter, Peter},
  title = {Molecular Biology of the Cell},
  publisher = {Garland Science},
  year = {2014},
  edition = {6},
  address = {New York, NY},
  isbn = {978-0-815-34432-2}
}

% Book with editor instead of author
@book{Sambrook2001,
  editor = {Sambrook, Joseph and Russell, David W.},
  title = {Molecular Cloning: A Laboratory Manual},
  publisher = {Cold Spring Harbor Laboratory Press},
  year = {2001},
  edition = {3},
  address = {Cold Spring Harbor, NY},
  isbn = {978-0-879-69576-7}
}

% =============================================================================
% CONFERENCE PAPERS (PROCEEDINGS)
% =============================================================================

@inproceedings{Vaswani2017,
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title = {Attention is All You Need},
  booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
  year = {2017},
  pages = {5998--6008},
  address = {Long Beach, CA},
  url = {https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html}
}

@inproceedings{He2016,
  author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year = {2016},
  pages = {770--778},
  address = {Las Vegas, NV},
  doi = {10.1109/CVPR.2016.90}
}

% =============================================================================
% BOOK CHAPTERS
% =============================================================================

@incollection{Brown2020,
  author = {Brown, Patrick O. and Botstein, David},
  title = {Exploring the New World of the Genome with {DNA} Microarrays},
  booktitle = {DNA Microarrays: A Molecular Cloning Manual},
  editor = {Eisen, Michael B. and Brown, Patrick O.},
  publisher = {Cold Spring Harbor Laboratory Press},
  year = {2020},
  pages = {1--45},
  address = {Cold Spring Harbor, NY}
}

% =============================================================================
% PHD THESES / DISSERTATIONS
% =============================================================================

@phdthesis{Johnson2023,
  author = {Johnson, Mary L.},
  title = {Novel Approaches to Cancer Immunotherapy Using {CRISPR} Technology},
  school = {Stanford University},
  year = {2023},
  type = {{PhD} dissertation},
  address = {Stanford, CA}
}

% =============================================================================
% MASTER'S THESES
% =============================================================================

@mastersthesis{Smith2022,
  author = {Smith, Robert J.},
  title = {Machine Learning Methods for Protein Structure Prediction},
  school = {Massachusetts Institute of Technology},
  year = {2022},
  type = {{Master's} thesis},
  address = {Cambridge, MA}
}

% =============================================================================
% TECHNICAL REPORTS
% =============================================================================

@techreport{WHO2020,
  author = {{World Health Organization}},
  title = {Clinical Management of {COVID-19}: Interim Guidance},
  institution = {World Health Organization},
  year = {2020},
  type = {Technical Report},
  number = {WHO/2019-nCoV/clinical/2020.5},
  address = {Geneva, Switzerland}
}

% =============================================================================
% PREPRINTS
% =============================================================================

% bioRxiv preprint
@misc{Zhang2024preprint,
  author = {Zhang, Yi and Chen, Li and Wang, Hui and Liu, Xin},
  title = {Novel Therapeutic Targets in {Alzheimer}'s Disease},
  year = {2024},
  howpublished = {bioRxiv},
  doi = {10.1101/2024.01.15.575432},
  note = {Preprint}
}

% arXiv preprint
@misc{Brown2024arxiv,
  author = {Brown, Alice and Green, Bob},
  title = {Advances in Quantum Computing},
  year = {2024},
  howpublished = {arXiv},
  note = {arXiv:2401.12345}
}

% =============================================================================
% DATASETS
% =============================================================================

@misc{AlphaFoldDB2021,
  author = {{DeepMind} and {EMBL-EBI}},
  title = {{AlphaFold} Protein Structure Database},
  year = {2021},
  howpublished = {Database},
  url = {https://alphafold.ebi.ac.uk/},
  doi = {10.1093/nar/gkab1061},
  note = {Version 4}
}

% =============================================================================
% SOFTWARE / CODE
% =============================================================================

@misc{McKinney2010pandas,
  author = {McKinney, Wes},
  title = {pandas: A Foundational {Python} Library for Data Analysis and Statistics},
  year = {2010},
  howpublished = {Software},
  url = {https://pandas.pydata.org/},
  note = {Python Data Analysis Library}
}

% =============================================================================
% WEBSITES / ONLINE RESOURCES
% =============================================================================

@misc{NCBI2024,
  author = {{National Center for Biotechnology Information}},
  title = {{PubMed}: Database of Biomedical Literature},
  year = {2024},
  howpublished = {Website},
  url = {https://pubmed.ncbi.nlm.nih.gov/},
  note = {Accessed: 2024-01-15}
}

% =============================================================================
% SPECIAL CASES
% =============================================================================

% Article with organization as author
@article{NatureEditorial2023,
  author = {{Nature Editorial Board}},
  title = {The Future of {AI} in Scientific Research},
  journal = {Nature},
  year = {2023},
  volume = {615},
  pages = {1--2},
  doi = {10.1038/d41586-023-00001-1}
}

% Article with no volume number (some journals)
@article{OpenAccess2024,
  author = {Williams, Sarah and Thomas, Michael},
  title = {Open Access Publishing in the 21st Century},
  journal = {Journal of Scholarly Communication},
  year = {2024},
  pages = {e123456},
  doi = {10.1234/jsc.2024.123456}
}

% Conference paper with DOI
@inproceedings{Garcia2023,
  author = {Garc{\'i}a-Mart{\'i}nez, Jos{\'e} and M{\"u}ller, Hans},
  title = {International Collaboration in Science},
  booktitle = {Proceedings of the International Conference on Academic Publishing},
  year = {2023},
  pages = {45--52},
  doi = {10.1109/ICAP.2023.123456}
}

% Article with PMID but no DOI (older papers)
@article{OldPaper1995,
  author = {Anderson, Philip W.},
  title = {Through the Glass Lightly},
  journal = {Science},
  year = {1995},
  volume = {267},
  number = {5204},
  pages = {1615--1616},
  note = {PMID: 17808148}
}

skills/citation-management/assets/citation_checklist.md (new file, 386 lines)
@@ -0,0 +1,386 @@
# Citation Quality Checklist

Use this checklist to ensure your citations are accurate, complete, and properly formatted before final submission.

## Pre-Submission Checklist

### ✓ Metadata Accuracy

- [ ] All author names are correct and properly formatted
- [ ] Article titles match the actual publication
- [ ] Journal/conference names are complete (not abbreviated unless required)
- [ ] Publication years are accurate
- [ ] Volume and issue numbers are correct
- [ ] Page ranges are accurate

### ✓ Required Fields

- [ ] All @article entries have: author, title, journal, year
- [ ] All @book entries have: author/editor, title, publisher, year
- [ ] All @inproceedings entries have: author, title, booktitle, year
- [ ] Modern papers (2000+) include DOI when available
- [ ] All entries have unique citation keys

### ✓ DOI Verification

- [ ] All DOIs are properly formatted (10.XXXX/...)
- [ ] DOIs resolve correctly to the article
- [ ] No DOI prefix in the BibTeX field (no "doi:" or "https://doi.org/")
- [ ] Metadata from CrossRef matches your BibTeX entry
- [ ] Run: `python scripts/validate_citations.py references.bib --check-dois` (a minimal resolution check is sketched below)

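For a quick manual spot-check, a DOI can also be resolved directly against doi.org. A minimal sketch using only the standard library (assuming network access; this is illustrative, not the `validate_citations.py` implementation):

```python
# Minimal DOI resolution spot-check (sketch; assumes network access).
# Not the validate_citations.py implementation -- the same idea in miniature.
import urllib.error
import urllib.request

def doi_resolves(doi: str) -> bool:
    """Return True if https://doi.org/<doi> resolves without an HTTP error."""
    req = urllib.request.Request(
        f"https://doi.org/{doi}",
        method="HEAD",
        headers={"User-Agent": "citation-checklist-spot-check"},
    )
    try:
        # urlopen follows the doi.org redirect automatically; some publisher
        # sites reject HEAD, so treat only hard errors as failures.
        with urllib.request.urlopen(req, timeout=10) as resp:
            return resp.status < 400
    except urllib.error.HTTPError:
        return False

print(doi_resolves("10.1038/s41586-021-03819-2"))  # expected: True
```
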
### ✓ Formatting Consistency

- [ ] Page ranges use double hyphen (--) not single (-)
- [ ] No "pp." prefix in pages field
- [ ] Author names use "and" separator (not semicolon or ampersand)
- [ ] Capitalization protected in titles ({AlphaFold}, {CRISPR}, etc.)
- [ ] Month names use standard abbreviations if included
- [ ] Citation keys follow consistent format (simple auto-fixes are sketched below)

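Most of these checks are mechanical, which is what `format_bibtex.py` automates. As a hedged illustration of the kind of fix involved (not the script's actual code), the two most common repairs reduce to small regex substitutions:

```python
# Sketch of two mechanical fixes (illustrative; format_bibtex.py is the real tool).
import re

def fix_pages(value: str) -> str:
    """'pp. 123-145' -> '123--145': drop 'pp.' and widen single hyphens."""
    value = re.sub(r"^\s*pp\.\s*", "", value)
    return re.sub(r"(?<=\d)-(?=\d)", "--", value)  # idempotent on '123--145'

def fix_doi(value: str) -> str:
    """Strip 'doi:' and 'https://doi.org/' prefixes, leaving the bare DOI."""
    return re.sub(r"^(?:doi:|https?://(?:dx\.)?doi\.org/)", "", value.strip())

assert fix_pages("pp. 123-145") == "123--145"
assert fix_doi("https://doi.org/10.1038/nature12345") == "10.1038/nature12345"
```
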
### ✓ Duplicate Detection

- [ ] No duplicate DOIs in bibliography
- [ ] No duplicate citation keys
- [ ] No near-duplicate titles
- [ ] Preprints updated to published versions when available
- [ ] Run: `python scripts/validate_citations.py references.bib`

### ✓ Special Characters

- [ ] Accented characters properly formatted (e.g., {\"u} for ü)
- [ ] Mathematical symbols use LaTeX commands
- [ ] Chemical formulas properly formatted
- [ ] No unescaped special characters (%, &, $, #, etc.)

### ✓ BibTeX Syntax

- [ ] All entries have balanced braces {}
- [ ] Fields separated by commas
- [ ] No comma after last field in each entry
- [ ] Valid entry types (@article, @book, etc.)
- [ ] Run: `python scripts/validate_citations.py references.bib`

### ✓ File Organization

- [ ] Bibliography sorted in logical order (by year, author, or key)
- [ ] Consistent formatting across all entries
- [ ] Run: `python scripts/format_bibtex.py references.bib --sort year`

## Automated Validation

### Step 1: Format and Clean

```bash
python scripts/format_bibtex.py references.bib \
    --deduplicate \
    --sort year \
    --descending \
    --output clean_references.bib
```

**What this does**:
- Removes duplicates
- Standardizes formatting
- Fixes common issues (page ranges, DOI format, etc.)
- Sorts by year (newest first)

### Step 2: Validate

```bash
python scripts/validate_citations.py clean_references.bib \
    --check-dois \
    --report validation_report.json \
    --verbose
```

**What this does**:
- Checks required fields
- Verifies DOIs resolve
- Detects duplicates
- Validates syntax
- Generates detailed report

### Step 3: Review Report

```bash
cat validation_report.json
```

**Address any**:
- **Errors**: Must fix (missing fields, broken DOIs, syntax errors)
- **Warnings**: Should fix (missing recommended fields, formatting issues)
- **Duplicates**: Remove or consolidate

### Step 4: Final Check

```bash
python scripts/validate_citations.py clean_references.bib --verbose
```

**Goal**: Zero errors, minimal warnings

## Manual Review Checklist

### Critical Citations (Top 10-20 Most Important)

For your most important citations, manually verify:

- [ ] Visit DOI link and confirm it's the correct article
- [ ] Check author names against the actual publication
- [ ] Verify year matches publication date
- [ ] Confirm journal/conference name is correct
- [ ] Check that volume/pages match

### Common Issues to Watch For

**Missing Information**:
- [ ] No DOI for papers published after 2000
- [ ] Missing volume or page numbers for journal articles
- [ ] Missing publisher for books
- [ ] Missing conference location for proceedings

**Formatting Errors**:
- [ ] Single hyphen in page ranges (123-145 → 123--145)
- [ ] Ampersands in author lists (Smith & Jones → Smith and Jones)
- [ ] Unprotected acronyms in titles (DNA → {DNA})
- [ ] DOI includes URL prefix (https://doi.org/10.xxx → 10.xxx)

**Metadata Mismatches**:
- [ ] Author names differ from publication
- [ ] Year is online-first instead of print publication
- [ ] Journal name abbreviated when it should be full
- [ ] Volume/issue numbers swapped

**Duplicates**:
- [ ] Same paper cited with different citation keys
- [ ] Preprint and published version both cited
- [ ] Conference paper and journal version both cited

## Field-Specific Checks

### Biomedical Sciences

- [ ] PubMed Central ID (PMCID) included when available
- [ ] MeSH terms appropriate (if using)
- [ ] Clinical trial registration number included (if applicable)
- [ ] All references to treatments/drugs accurately cited

### Computer Science

- [ ] arXiv ID included for preprints
- [ ] Conference proceedings properly cited (not just "NeurIPS")
- [ ] Software/dataset citations include version numbers
- [ ] GitHub links stable and permanent

### General Sciences

- [ ] Data availability statements properly cited
- [ ] Retracted papers identified and removed
- [ ] Preprints checked for published versions
- [ ] Supplementary materials referenced if critical

## Final Pre-Submission Steps

### 1 Week Before Submission

- [ ] Run full validation with DOI checking
- [ ] Fix all errors and critical warnings
- [ ] Manually verify top 10-20 most important citations
- [ ] Check for any retracted papers

### 3 Days Before Submission

- [ ] Re-run validation after any manual edits
- [ ] Ensure all in-text citations have corresponding bibliography entries
- [ ] Ensure all bibliography entries are cited in text
- [ ] Check citation style matches journal requirements

### 1 Day Before Submission

- [ ] Final validation check
- [ ] LaTeX compilation successful with no warnings
- [ ] PDF renders all citations correctly
- [ ] Bibliography appears in correct format
- [ ] No placeholder citations (Smith et al. XXXX)

### Submission Day

- [ ] One final validation run
- [ ] No last-minute edits without re-validation
- [ ] Bibliography file included in submission package
- [ ] All cross-references (figures, tables, citations) resolve correctly

## Quality Metrics

### Excellent Bibliography

- ✓ 100% of entries have DOIs (for modern papers)
- ✓ Zero validation errors
- ✓ Zero missing required fields
- ✓ Zero broken DOIs
- ✓ Zero duplicates
- ✓ Consistent formatting throughout
- ✓ All citations manually spot-checked

### Acceptable Bibliography

- ✓ 90%+ of modern entries have DOIs
- ✓ Zero high-severity errors
- ✓ Minor warnings only (e.g., missing recommended fields)
- ✓ Key citations manually verified
- ✓ Compilation succeeds without errors

### Needs Improvement

- ✗ Missing DOIs for recent papers
- ✗ High-severity validation errors
- ✗ Broken or incorrect DOIs
- ✗ Duplicate entries
- ✗ Inconsistent formatting
- ✗ Compilation warnings or errors

## Emergency Fixes

If you discover issues at the last minute:

### Broken DOI

```bash
# Find correct DOI
# Option 1: Search CrossRef
# https://www.crossref.org/

# Option 2: Search on publisher website
# Option 3: Google Scholar

# Re-extract metadata
python scripts/extract_metadata.py --doi CORRECT_DOI
```

### Missing Information

```bash
# Extract from DOI
python scripts/extract_metadata.py --doi 10.xxxx/yyyy

# Or from PMID (biomedical)
python scripts/extract_metadata.py --pmid 12345678

# Or from arXiv
python scripts/extract_metadata.py --arxiv 2103.12345
```

### Duplicate Entries

```bash
# Auto-remove duplicates
python scripts/format_bibtex.py references.bib \
    --deduplicate \
    --output fixed_references.bib
```

### Formatting Errors

```bash
# Auto-fix common issues
python scripts/format_bibtex.py references.bib \
    --output fixed_references.bib

# Then validate
python scripts/validate_citations.py fixed_references.bib
```

## Long-Term Best Practices

### During Research

- [ ] Add citations to bibliography file as you find them
- [ ] Extract metadata immediately using DOI
- [ ] Validate after every 10-20 additions
- [ ] Keep bibliography file under version control

### During Writing

- [ ] Cite as you write
- [ ] Use consistent citation keys
- [ ] Don't delay adding references
- [ ] Validate weekly

### Before Submission

- [ ] Allow 2-3 days for citation cleanup
- [ ] Don't wait until the last day
- [ ] Automate what you can
- [ ] Manually verify critical citations

## Tool Quick Reference

### Extract Metadata

```bash
# From DOI
python scripts/doi_to_bibtex.py 10.1038/nature12345

# From multiple sources
python scripts/extract_metadata.py \
    --doi 10.1038/nature12345 \
    --pmid 12345678 \
    --arxiv 2103.12345 \
    --output references.bib
```

### Validate

```bash
# Basic validation
python scripts/validate_citations.py references.bib

# With DOI checking (slow but thorough)
python scripts/validate_citations.py references.bib --check-dois

# Generate report
python scripts/validate_citations.py references.bib \
    --report validation.json \
    --verbose
```

### Format and Clean

```bash
# Format and fix issues
python scripts/format_bibtex.py references.bib

# Remove duplicates and sort
python scripts/format_bibtex.py references.bib \
    --deduplicate \
    --sort year \
    --descending \
    --output clean_refs.bib
```

## Summary

**Minimum Requirements**:
1. Run `format_bibtex.py --deduplicate`
2. Run `validate_citations.py`
3. Fix all errors
4. Compile successfully

**Recommended**:
1. Format, deduplicate, and sort
2. Validate with `--check-dois`
3. Fix all errors and warnings
4. Manually verify top citations
5. Re-validate after fixes

**Best Practice**:
1. Validate throughout research process
2. Use automated tools consistently
3. Keep bibliography clean and organized
4. Document any special cases
5. Final validation 1-3 days before submission

**Remember**: Citation errors reflect poorly on your scholarship. Taking time to ensure accuracy is worthwhile!

skills/citation-management/references/bibtex_formatting.md (new file, 908 lines)
@@ -0,0 +1,908 @@
# BibTeX Formatting Guide

Comprehensive guide to BibTeX entry types, required fields, formatting conventions, and best practices.

## Overview

BibTeX is the standard bibliography format for LaTeX documents. Proper formatting ensures:
- Correct citation rendering
- Consistent formatting
- Compatibility with citation styles
- No compilation errors

This guide covers all common entry types and formatting rules.

## Entry Types

### @article - Journal Articles

**Most common entry type** for peer-reviewed journal articles.

**Required fields**:
- `author`: Author names
- `title`: Article title
- `journal`: Journal name
- `year`: Publication year

**Optional fields**:
- `volume`: Volume number
- `number`: Issue number
- `pages`: Page range
- `month`: Publication month
- `doi`: Digital Object Identifier
- `url`: URL
- `note`: Additional notes

**Template**:
```bibtex
@article{CitationKey2024,
  author = {Last1, First1 and Last2, First2},
  title = {Article Title Here},
  journal = {Journal Name},
  year = {2024},
  volume = {10},
  number = {3},
  pages = {123--145},
  doi = {10.1234/journal.2024.123456},
  month = jan
}
```

**Example**:
```bibtex
@article{Jumper2021,
  author = {Jumper, John and Evans, Richard and Pritzel, Alexander and others},
  title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
  journal = {Nature},
  year = {2021},
  volume = {596},
  number = {7873},
  pages = {583--589},
  doi = {10.1038/s41586-021-03819-2}
}
```

### @book - Books

**For entire books**.

**Required fields**:
- `author` OR `editor`: Author(s) or editor(s)
- `title`: Book title
- `publisher`: Publisher name
- `year`: Publication year

**Optional fields**:
- `volume`: Volume number (if multi-volume)
- `series`: Series name
- `address`: Publisher location
- `edition`: Edition number
- `isbn`: ISBN
- `url`: URL

**Template**:
```bibtex
@book{CitationKey2024,
  author = {Last, First},
  title = {Book Title},
  publisher = {Publisher Name},
  year = {2024},
  edition = {3},
  address = {City, Country},
  isbn = {978-0-123-45678-9}
}
```

**Example**:
```bibtex
@book{Kumar2021,
  author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
  title = {Robbins and Cotran Pathologic Basis of Disease},
  publisher = {Elsevier},
  year = {2021},
  edition = {10},
  address = {Philadelphia, PA},
  isbn = {978-0-323-53113-9}
}
```

### @inproceedings - Conference Papers

**For papers in conference proceedings**.

**Required fields**:
- `author`: Author names
- `title`: Paper title
- `booktitle`: Conference/proceedings name
- `year`: Year

**Optional fields**:
- `editor`: Proceedings editor(s)
- `volume`: Volume number
- `series`: Series name
- `pages`: Page range
- `address`: Conference location
- `month`: Conference month
- `organization`: Organizing body
- `publisher`: Publisher
- `doi`: DOI

**Template**:
```bibtex
@inproceedings{CitationKey2024,
  author = {Last, First},
  title = {Paper Title},
  booktitle = {Proceedings of Conference Name},
  year = {2024},
  pages = {123--145},
  address = {City, Country},
  month = jun
}
```

**Example**:
```bibtex
@inproceedings{Vaswani2017,
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and others},
  title = {Attention is All You Need},
  booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
  year = {2017},
  pages = {5998--6008},
  address = {Long Beach, CA}
}
```

**Note**: `@conference` is an alias for `@inproceedings`.

### @incollection - Book Chapters

**For chapters in edited books**.

**Required fields**:
- `author`: Chapter author(s)
- `title`: Chapter title
- `booktitle`: Book title
- `publisher`: Publisher name
- `year`: Publication year

**Optional fields**:
- `editor`: Book editor(s)
- `volume`: Volume number
- `series`: Series name
- `type`: Type of section (e.g., "chapter")
- `chapter`: Chapter number
- `pages`: Page range
- `address`: Publisher location
- `edition`: Edition
- `month`: Month

**Template**:
```bibtex
@incollection{CitationKey2024,
  author = {Last, First},
  title = {Chapter Title},
  booktitle = {Book Title},
  editor = {Editor, Last and Editor2, Last},
  publisher = {Publisher Name},
  year = {2024},
  pages = {123--145},
  chapter = {5}
}
```

**Example**:
```bibtex
@incollection{Brown2020,
  author = {Brown, Patrick O. and Botstein, David},
  title = {Exploring the New World of the Genome with {DNA} Microarrays},
  booktitle = {DNA Microarrays: A Molecular Cloning Manual},
  editor = {Eisen, Michael B. and Brown, Patrick O.},
  publisher = {Cold Spring Harbor Laboratory Press},
  year = {2020},
  pages = {1--45},
  address = {Cold Spring Harbor, NY}
}
```

### @phdthesis - Doctoral Dissertations

**For PhD dissertations and theses**.

**Required fields**:
- `author`: Author name
- `title`: Thesis title
- `school`: Institution
- `year`: Year

**Optional fields**:
- `type`: Type (e.g., "PhD dissertation", "PhD thesis")
- `address`: Institution location
- `month`: Month
- `url`: URL
- `note`: Additional notes

**Template**:
```bibtex
@phdthesis{CitationKey2024,
  author = {Last, First},
  title = {Dissertation Title},
  school = {University Name},
  year = {2024},
  type = {{PhD} dissertation},
  address = {City, State}
}
```

**Example**:
```bibtex
@phdthesis{Johnson2023,
  author = {Johnson, Mary L.},
  title = {Novel Approaches to Cancer Immunotherapy Using {CRISPR} Technology},
  school = {Stanford University},
  year = {2023},
  type = {{PhD} dissertation},
  address = {Stanford, CA}
}
```

**Note**: `@mastersthesis` is similar but for Master's theses.

### @mastersthesis - Master's Theses

**For Master's theses**.

**Required fields**:
- `author`: Author name
- `title`: Thesis title
- `school`: Institution
- `year`: Year

**Template**:
```bibtex
@mastersthesis{CitationKey2024,
  author = {Last, First},
  title = {Thesis Title},
  school = {University Name},
  year = {2024}
}
```

### @misc - Miscellaneous

**For items that don't fit other categories** (preprints, datasets, software, websites, etc.).

**Required fields**:
- `author` (if known)
- `title`
- `year`

**Optional fields**:
- `howpublished`: Repository, website, format
- `url`: URL
- `doi`: DOI
- `note`: Additional information
- `month`: Month

**Template for preprints**:
```bibtex
@misc{CitationKey2024,
  author = {Last, First},
  title = {Preprint Title},
  year = {2024},
  howpublished = {bioRxiv},
  doi = {10.1101/2024.01.01.123456},
  note = {Preprint}
}
```

**Template for datasets**:
```bibtex
@misc{DatasetName2024,
  author = {Last, First},
  title = {Dataset Title},
  year = {2024},
  howpublished = {Zenodo},
  doi = {10.5281/zenodo.123456},
  note = {Version 1.2}
}
```

**Template for software**:
```bibtex
@misc{SoftwareName2024,
  author = {Last, First},
  title = {Software Name},
  year = {2024},
  howpublished = {GitHub},
  url = {https://github.com/user/repo},
  note = {Version 2.0}
}
```

### @techreport - Technical Reports

**For technical reports**.

**Required fields**:
- `author`: Author name(s)
- `title`: Report title
- `institution`: Institution
- `year`: Year

**Optional fields**:
- `type`: Type of report
- `number`: Report number
- `address`: Institution location
- `month`: Month

**Template**:
```bibtex
@techreport{CitationKey2024,
  author = {Last, First},
  title = {Report Title},
  institution = {Institution Name},
  year = {2024},
  type = {Technical Report},
  number = {TR-2024-01}
}
```

### @unpublished - Unpublished Work

**For unpublished works** (not preprints - use @misc for those).

**Required fields**:
- `author`: Author name(s)
- `title`: Work title
- `note`: Description

**Optional fields**:
- `month`: Month
- `year`: Year

**Template**:
```bibtex
@unpublished{CitationKey2024,
  author = {Last, First},
  title = {Work Title},
  note = {Unpublished manuscript},
  year = {2024}
}
```

### @online/@electronic - Online Resources

**For web pages and online-only content**.

**Note**: Not standard BibTeX, but supported by many bibliography packages (biblatex).

**Required fields**:
- `author` OR `organization`
- `title`
- `url`
- `year`

**Template**:
```bibtex
@online{CitationKey2024,
  author = {{Organization Name}},
  title = {Page Title},
  url = {https://example.com/page},
  year = {2024},
  note = {Accessed: 2024-01-15}
}
```

## Formatting Rules

### Citation Keys

**Convention**: `FirstAuthorYEARkeyword`

**Examples**:
```bibtex
Smith2024protein
Doe2023machine
JohnsonWilliams2024cancer   % Multiple authors, no space
NatureEditorial2024         % No author, use publication
WHO2024guidelines           % Organization author
```

**Rules**:
- Alphanumeric plus: `-`, `_`, `.`, `:`
- No spaces
- Case-sensitive
- Unique within file
- Descriptive

**Avoid** (a key-generation sketch follows this list):
- Special characters: `@`, `#`, `&`, `%`, `$`
- Spaces: use CamelCase or underscores
- Starting with numbers: `2024Smith` (some systems disallow)

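A hedged sketch of generating such keys automatically (illustrative only; the stopword list and function name are assumptions, and the `FirstAuthorYEARkeyword` convention above remains the contract):

```python
# Sketch: derive a citation key following the FirstAuthorYEARkeyword convention.
import re

STOPWORDS = {"a", "an", "the", "of", "for", "with", "and", "in", "on"}

def make_key(author_field: str, year: str, title: str) -> str:
    """E.g. ('Smith, John and Doe, Jane', '2024', 'A Study of Proteins') -> 'Smith2024study'."""
    surname = author_field.split(" and ")[0].split(",")[0]
    surname = re.sub(r"[^A-Za-z]", "", surname)  # keys must stay alphanumeric
    words = re.findall(r"[A-Za-z]+", title.lower())
    keyword = next((w for w in words if w not in STOPWORDS), "untitled")
    return f"{surname}{year}{keyword}"

print(make_key("Smith, John and Doe, Jane", "2024", "A Study of Proteins"))
# -> Smith2024study
```
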
### Author Names

**Recommended format**: `Last, First Middle`

**Single author**:
```bibtex
author = {Smith, John}
author = {Smith, John A.}
author = {Smith, John Andrew}
```

**Multiple authors** - separate with `and`:
```bibtex
author = {Smith, John and Doe, Jane}
author = {Smith, John A. and Doe, Jane M. and Johnson, Mary L.}
```

**Many authors** (10+):
```bibtex
author = {Smith, John and Doe, Jane and Johnson, Mary and others}
```

**Special cases**:
```bibtex
% Suffix (Jr., III, etc.)
author = {King, Jr., Martin Luther}

% Organization as author
author = {{World Health Organization}}
% Note: Double braces keep the name as a single entity

% Multiple surnames
author = {Garc{\'i}a-Mart{\'i}nez, Jos{\'e}}

% Particles (van, von, de, etc.)
author = {van der Waals, Johannes}
author = {de Broglie, Louis}
```

**Wrong formats** (don't use):
```bibtex
author = {Smith, J.; Doe, J.}    % Semicolons (wrong)
author = {Smith, J., Doe, J.}    % Commas (wrong)
author = {Smith, J. & Doe, J.}   % Ampersand (wrong)
author = {Smith J}               % No comma
```

### Title Capitalization

**Protect capitalization** with braces:

```bibtex
% Proper nouns, acronyms, formulas
title = {{AlphaFold}: Protein Structure Prediction}
title = {Machine Learning for {DNA} Sequencing}
title = {The {Ising} Model in Statistical Physics}
title = {{CRISPR-Cas9} Gene Editing Technology}
```

**Reason**: Citation styles may change title capitalization; braces preserve the original casing.

**Examples**:
```bibtex
% Good
title = {Advances in {COVID-19} Treatment}
title = {Using {Python} for Data Analysis}
title = {The {AlphaFold} Protein Structure Database}

% Without braces, these get lowercased by sentence-case styles
title = {Advances in COVID-19 Treatment}    % covid-19
title = {Using Python for Data Analysis}    % python
```

**Whole title protection** (rarely needed):
```bibtex
title = {{This Entire Title Keeps Its Capitalization}}
```

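Protecting acronyms can be partially automated. Below is a hedged heuristic sketch (an assumption, not any tool's actual behavior): it braces runs of two or more capitals and deliberately leaves mixed-case names like `AlphaFold` for manual review.

```python
# Sketch: protect acronyms in titles by bracing runs of 2+ capitals.
# A heuristic only -- it misses mixed-case names and skips already-braced text.
import re

def protect_acronyms(title: str) -> str:
    def brace(match: re.Match) -> str:
        return "{" + match.group(0) + "}"
    # Lookbehind/lookahead keep already-braced acronyms and word interiors untouched.
    return re.sub(r"(?<![{A-Za-z])[A-Z][A-Z0-9-]*[A-Z0-9](?![}A-Za-z])", brace, title)

print(protect_acronyms("Machine Learning for DNA Sequencing"))
# -> Machine Learning for {DNA} Sequencing
```
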
### Page Ranges

**Use en-dash** (double hyphen `--`):

```bibtex
pages = {123--145}     % Correct
pages = {1234--1256}   % Correct
pages = {e0123456}     % Article ID (PLOS, etc.)
pages = {123}          % Single page
```

**Wrong**:
```bibtex
pages = {123-145}      % Single hyphen (don't use)
pages = {pp. 123-145}  % "pp." not needed
pages = {123–145}      % Unicode en-dash (may cause issues)
```

### Month Names

**Use three-letter abbreviations** (unquoted):

```bibtex
month = jan
month = feb
month = mar
month = apr
month = may
month = jun
month = jul
month = aug
month = sep
month = oct
month = nov
month = dec
```

**Or numeric**:
```bibtex
month = {1}    % January
month = {12}   % December
```

**Or full name in braces**:
```bibtex
month = {January}
```

**Standard abbreviations work without quotes** because they are predefined BibTeX string macros.

### Journal Names

**Full name** (not abbreviated):

```bibtex
journal = {Nature}
journal = {Science}
journal = {Cell}
journal = {Proceedings of the National Academy of Sciences}
journal = {Journal of the American Chemical Society}
```

The **bibliography style** will handle abbreviation if needed.

**Avoid manual abbreviation**:
```bibtex
% Don't do this in the BibTeX file
journal = {Proc. Natl. Acad. Sci. U.S.A.}

% Do this instead
journal = {Proceedings of the National Academy of Sciences}
```

**Exception**: If the style requires abbreviations, use the standard abbreviated form:
```bibtex
journal = {Proc. Natl. Acad. Sci. U.S.A.}   % If required by style
```

### DOI Formatting

**Bare DOI** (preferred):

```bibtex
doi = {10.1038/s41586-021-03819-2}
```

**Not**:
```bibtex
doi = {https://doi.org/10.1038/s41586-021-03819-2}   % Don't include URL
doi = {doi:10.1038/s41586-021-03819-2}               % Don't include prefix
```

The bibliography style will format the DOI as a link automatically.

**Note**: No trailing period after the DOI value!

### URL Formatting

```bibtex
url = {https://www.example.com/article}
```

**Use**:
- When a DOI is not available
- For web pages
- For supplementary materials

**Don't duplicate**:
```bibtex
% Don't include both when the url is just the DOI link
doi = {10.1038/nature12345}
url = {https://doi.org/10.1038/nature12345}   % Redundant!
```

### Special Characters

**Accents and diacritics**:
```bibtex
author = {M{\"u}ller, Hans}         % ü
author = {Garc{\'i}a, Jos{\'e}}     % í, é
author = {Erd{\H{o}}s, Paul}        % ő
author = {Schr{\"o}dinger, Erwin}   % ö
```

**Or use UTF-8** (with proper LaTeX setup):
```bibtex
author = {Müller, Hans}
author = {García, José}
```

**Mathematical symbols**:
```bibtex
title = {The $\alpha$-helix Structure}
title = {$\beta$-sheet Prediction}
```

**Chemical formulas**:
```bibtex
title = {H$_2$O Molecular Dynamics}
% Or with the mhchem package:
title = {\ce{H2O} Molecular Dynamics}
```

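If a file must stay ASCII-safe, converting accented characters to their LaTeX escapes is mechanical. A minimal sketch with a hand-rolled mapping (deliberately tiny; a dedicated library such as pylatexenc covers far more, and is the better choice in practice):

```python
# Sketch: convert common accented characters to LaTeX escapes.
# Extend the mapping as needed, or use a dedicated library instead.
ACCENTS = {
    "ü": '{\\"u}', "ö": '{\\"o}', "ä": '{\\"a}',
    "é": "{\\'e}", "í": "{\\'i}", "á": "{\\'a}",
    "ő": "{\\H{o}}", "å": "{\\aa}", "ñ": "{\\~n}",
}

def to_latex(text: str) -> str:
    return "".join(ACCENTS.get(ch, ch) for ch in text)

print(to_latex("Müller, José"))   # -> M{\"u}ller, Jos{\'e}
```
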
### Field Order

**Recommended order** (for readability):

```bibtex
@article{Key,
  author = {},
  title = {},
  journal = {},
  year = {},
  volume = {},
  number = {},
  pages = {},
  doi = {},
  url = {},
  note = {}
}
```

**Rules**:
- Most important fields first
- Consistent across entries
- Use a formatter to standardize

## Best Practices

### 1. Consistent Formatting

Use the same format throughout:
- Author name format
- Title capitalization
- Journal names
- Citation key style

### 2. Required Fields

Always include:
- All required fields for the entry type
- DOI for modern papers (2000+)
- Volume and pages for articles
- Publisher for books

### 3. Protect Capitalization

Use braces for:
- Proper nouns: `{AlphaFold}`
- Acronyms: `{DNA}`, `{CRISPR}`
- Formulas: `{H2O}`
- Names: `{Python}`, `{R}`

### 4. Complete Author Lists

Include all authors when possible:
- All authors if <10
- Use "and others" for 10+
- Don't abbreviate to "et al." manually

### 5. Use Standard Entry Types

Choose the correct entry type:
- Journal article → `@article`
- Book → `@book`
- Conference paper → `@inproceedings`
- Preprint → `@misc`

### 6. Validate Syntax

Check for (a minimal brace check is sketched below):
- Balanced braces
- Commas after fields
- Unique citation keys
- Valid entry types

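As a hedged illustration of the simplest of these checks (not the validator's actual code), balanced braces can be verified in a single pass over the file:

```python
# Sketch: report the first line where braces go unbalanced in a .bib file.
# Ignores the rare escaped \{ and \} forms for brevity.
def check_braces(path: str) -> None:
    depth = 0
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            for ch in line:
                if ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1
                if depth < 0:
                    print(f"{path}:{lineno}: unmatched closing brace")
                    return
    if depth != 0:
        print(f"{path}: {depth} unclosed brace(s) at end of file")

check_braces("references.bib")
```
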
### 7. Use Formatters

Use automated tools:
```bash
python scripts/format_bibtex.py references.bib
```

Benefits:
- Consistent formatting
- Catch syntax errors
- Standardize field order
- Fix common issues

## Common Mistakes

### 1. Wrong Author Separator

**Wrong**:
```bibtex
author = {Smith, J.; Doe, J.}    % Semicolon
author = {Smith, J., Doe, J.}    % Comma
author = {Smith, J. & Doe, J.}   % Ampersand
```

**Correct**:
```bibtex
author = {Smith, John and Doe, Jane}
```

### 2. Missing Commas

**Wrong**:
```bibtex
@article{Smith2024,
  author = {Smith, John}   % Missing comma!
  title = {Title}
}
```

**Correct**:
```bibtex
@article{Smith2024,
  author = {Smith, John},  % Comma after each field
  title = {Title}
}
```

### 3. Unprotected Capitalization

**Wrong**:
```bibtex
title = {Machine Learning with Python}
% "Python" will become "python" in title case
```

**Correct**:
```bibtex
title = {Machine Learning with {Python}}
```

### 4. Single Hyphen in Pages

**Wrong**:
```bibtex
pages = {123-145}    % Single hyphen
```

**Correct**:
```bibtex
pages = {123--145}   % Double hyphen (en-dash)
```

### 5. Redundant "pp." in Pages

**Wrong**:
```bibtex
pages = {pp. 123--145}
```

**Correct**:
```bibtex
pages = {123--145}
```

### 6. DOI with URL Prefix

**Wrong**:
```bibtex
doi = {https://doi.org/10.1038/nature12345}
doi = {doi:10.1038/nature12345}
```

**Correct**:
```bibtex
doi = {10.1038/nature12345}
```

## Example Complete Bibliography

```bibtex
% Journal article
@article{Jumper2021,
  author = {Jumper, John and Evans, Richard and Pritzel, Alexander and others},
  title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
  journal = {Nature},
  year = {2021},
  volume = {596},
  number = {7873},
  pages = {583--589},
  doi = {10.1038/s41586-021-03819-2}
}

% Book
@book{Kumar2021,
  author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
  title = {Robbins and Cotran Pathologic Basis of Disease},
  publisher = {Elsevier},
  year = {2021},
  edition = {10},
  address = {Philadelphia, PA},
  isbn = {978-0-323-53113-9}
}

% Conference paper
@inproceedings{Vaswani2017,
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and others},
  title = {Attention is All You Need},
  booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
  year = {2017},
  pages = {5998--6008}
}

% Book chapter
@incollection{Brown2020,
  author = {Brown, Patrick O. and Botstein, David},
  title = {Exploring the New World of the Genome with {DNA} Microarrays},
  booktitle = {DNA Microarrays: A Molecular Cloning Manual},
  editor = {Eisen, Michael B. and Brown, Patrick O.},
  publisher = {Cold Spring Harbor Laboratory Press},
  year = {2020},
  pages = {1--45}
}

% PhD thesis
@phdthesis{Johnson2023,
  author = {Johnson, Mary L.},
  title = {Novel Approaches to Cancer Immunotherapy},
  school = {Stanford University},
  year = {2023},
  type = {{PhD} dissertation}
}

% Preprint
@misc{Zhang2024,
  author = {Zhang, Yi and Chen, Li and Wang, Hui},
  title = {Novel Therapeutic Targets in {Alzheimer}'s Disease},
  year = {2024},
  howpublished = {bioRxiv},
  doi = {10.1101/2024.01.001},
  note = {Preprint}
}

% Dataset
@misc{AlphaFoldDB2021,
  author = {{DeepMind} and {EMBL-EBI}},
  title = {{AlphaFold} Protein Structure Database},
  year = {2021},
  howpublished = {Database},
  url = {https://alphafold.ebi.ac.uk/},
  doi = {10.1093/nar/gkab1061}
}
```

## Summary

BibTeX formatting essentials:

- ✓ **Choose correct entry type** (@article, @book, etc.)
- ✓ **Include all required fields**
- ✓ **Use `and` for multiple authors**
- ✓ **Protect capitalization** with braces
- ✓ **Use `--` for page ranges**
- ✓ **Include DOI** for modern papers
- ✓ **Validate syntax** before compilation

Use formatting tools to ensure consistency:
```bash
python scripts/format_bibtex.py references.bib
```

Properly formatted BibTeX ensures correct, consistent citations across all bibliography styles!

skills/citation-management/references/citation_validation.md (new file, 794 lines)
@@ -0,0 +1,794 @@
# Citation Validation Guide

Comprehensive guide to validating citation accuracy, completeness, and formatting in BibTeX files.

## Overview

Citation validation ensures:
- All citations are accurate and complete
- DOIs resolve correctly
- Required fields are present
- No duplicate entries
- Proper formatting and syntax
- Links are accessible

Validation should be performed:
- After extracting metadata
- Before manuscript submission
- After manual edits to BibTeX files
- Periodically for maintained bibliographies

## Validation Categories

### 1. DOI Verification

**Purpose**: Ensure DOIs are valid and resolve correctly.

#### What to Check

**DOI format**:
```
Valid:   10.1038/s41586-021-03819-2
Valid:   10.1126/science.aam9317
Invalid: 10.1038/invalid
Invalid: doi:10.1038/... (should omit "doi:" prefix in BibTeX)
```

**DOI resolution**:
- The DOI should resolve via https://doi.org/
- Should redirect to the actual article
- Should not return a 404 or other error

**Metadata consistency**:
- CrossRef metadata should match the BibTeX entry
- Author names should align
- Title should match
- Year should match

#### How to Validate

**Manual check**:
1. Copy the DOI from the BibTeX entry
2. Visit it via the resolver, e.g. https://doi.org/10.1038/nature12345
3. Verify it redirects to the correct article
4. Check that the metadata matches

**Automated check** (recommended):
```bash
python scripts/validate_citations.py references.bib --check-dois
```

**Process** (a minimal sketch of steps 2-4 follows below):
1. Extract all DOIs from the BibTeX file
2. Query the doi.org resolver for each
3. Query the CrossRef API for metadata
4. Compare the metadata with the BibTeX entry
5. Report discrepancies

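A hedged sketch of steps 2-4 against the public CrossRef REST API (`https://api.crossref.org/works/<doi>` is a real endpoint; the comparison logic here is illustrative, not the script's actual implementation):

```python
# Sketch: fetch CrossRef metadata for a DOI and compare it to a BibTeX entry.
import json
import urllib.request

def crossref_metadata(doi: str) -> dict:
    url = f"https://api.crossref.org/works/{doi}"
    with urllib.request.urlopen(url, timeout=10) as resp:
        return json.load(resp)["message"]

def compare(entry: dict, meta: dict) -> list:
    """Return human-readable discrepancies between a BibTeX entry dict and CrossRef."""
    problems = []
    cr_title = (meta.get("title") or [""])[0].lower()
    bib_title = entry["title"].lower().strip("{}")
    if bib_title not in cr_title and cr_title not in bib_title:
        problems.append(f"title mismatch: CrossRef says {cr_title!r}")
    # CrossRef dates come as nested 'date-parts': [[year, month, day]]
    cr_year = str(meta.get("issued", {}).get("date-parts", [[None]])[0][0])
    if entry["year"] != cr_year:
        problems.append(f"year mismatch: CrossRef says {cr_year}")
    return problems

entry = {"title": "Highly Accurate Protein Structure Prediction with AlphaFold",
         "year": "2021"}
print(compare(entry, crossref_metadata("10.1038/s41586-021-03819-2")) or "OK")
```
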
#### Common Issues

**Broken DOIs**:
- Typos in the DOI
- Publisher changed the DOI (rare)
- Article retracted
- Solution: find the correct DOI on the publisher's site

**Mismatched metadata**:
- The BibTeX entry has old or incorrect information
- Solution: re-extract the metadata from CrossRef

**Missing DOIs**:
- Older articles may not have DOIs
- Acceptable for pre-2000 publications
- Add a URL or PMID instead

### 2. Required Fields

**Purpose**: Ensure all necessary information is present.

#### Required by Entry Type

**@article**:
```bibtex
author    % REQUIRED
title     % REQUIRED
journal   % REQUIRED
year      % REQUIRED
volume    % Highly recommended
pages     % Highly recommended
doi       % Highly recommended for modern papers
```

**@book**:
```bibtex
author OR editor   % REQUIRED (at least one)
title              % REQUIRED
publisher          % REQUIRED
year               % REQUIRED
isbn               % Recommended
```

**@inproceedings**:
```bibtex
author      % REQUIRED
title       % REQUIRED
booktitle   % REQUIRED (conference/proceedings name)
year        % REQUIRED
pages       % Recommended
```

**@incollection** (book chapter):
```bibtex
author      % REQUIRED
title       % REQUIRED (chapter title)
booktitle   % REQUIRED (book title)
publisher   % REQUIRED
year        % REQUIRED
editor      % Recommended
pages       % Recommended
```

**@phdthesis**:
```bibtex
author   % REQUIRED
title    % REQUIRED
school   % REQUIRED
year     % REQUIRED
```

**@misc** (preprints, datasets, etc.):
```bibtex
author         % REQUIRED
title          % REQUIRED
year           % REQUIRED
howpublished   % Recommended (bioRxiv, Zenodo, etc.)
doi OR url     % At least one required
```

#### Validation Script

```bash
python scripts/validate_citations.py references.bib --check-required-fields
```

**Output**:
```
Error: Entry 'Smith2024' missing required field 'journal'
Error: Entry 'Doe2023' missing required field 'year'
Warning: Entry 'Jones2022' missing recommended field 'volume'
```

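A hedged sketch of the underlying check using the `bibtexparser` package (a real library; the field table below mirrors the lists above, while the exact tables and message format of `validate_citations.py` are its own):

```python
# Sketch: required-field check per entry type, using bibtexparser (v1 API).
import bibtexparser

REQUIRED = {
    "article": ["author", "title", "journal", "year"],
    "book": ["title", "publisher", "year"],          # plus author OR editor
    "inproceedings": ["author", "title", "booktitle", "year"],
    "phdthesis": ["author", "title", "school", "year"],
}

with open("references.bib", encoding="utf-8") as f:
    db = bibtexparser.load(f)

for entry in db.entries:
    etype, key = entry["ENTRYTYPE"].lower(), entry["ID"]
    for field in REQUIRED.get(etype, []):
        if field not in entry:
            print(f"Error: Entry '{key}' missing required field '{field}'")
    if etype == "book" and "author" not in entry and "editor" not in entry:
        print(f"Error: Entry '{key}' needs an author or editor")
```
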
### 3. Author Name Formatting

**Purpose**: Ensure consistent, correct author name formatting.

#### Proper Format

**Recommended BibTeX format**:
```bibtex
author = {Last1, First1 and Last2, First2 and Last3, First3}
```

**Examples**:
```bibtex
% Correct
author = {Smith, John}
author = {Smith, John A.}
author = {Smith, John Andrew}
author = {Smith, John and Doe, Jane}
author = {Smith, John and Doe, Jane and Johnson, Mary}

% For many authors
author = {Smith, John and Doe, Jane and others}

% Incorrect
author = {John Smith}           % First Last format (not recommended)
author = {Smith, J.; Doe, J.}   % Semicolon separator (wrong)
author = {Smith J, Doe J}       % Missing commas
```

#### Special Cases

**Suffixes (Jr., III, etc.)**:
```bibtex
author = {King, Jr., Martin Luther}
```

**Multiple surnames (hyphenated)**:
```bibtex
author = {Smith-Jones, Mary}
```

**Van, von, de, etc.**:
```bibtex
author = {van der Waals, Johannes}
author = {de Broglie, Louis}
```

**Organizations as authors**:
```bibtex
author = {{World Health Organization}}
% Double braces treat the organization as a single author
```

#### Validation Checks

**Automated validation**:
```bash
python scripts/validate_citations.py references.bib --check-authors
```

**Checks for**:
- Proper separator (`and`, not `&` or `;`)
- Comma placement
- Empty author fields
- Malformed names

### 4. Data Consistency

**Purpose**: Ensure all fields contain valid, reasonable values.

#### Year Validation

**Valid years**:
```bibtex
year = {2024}   % Current/recent
year = {1953}   % Watson & Crick DNA structure (historical)
year = {1665}   % Hooke's Micrographia (very old)
```

**Invalid years**:
```bibtex
year = {24}     % Two digits (ambiguous)
year = {202}    % Typo
year = {2025}   % Future (unless accepted/in press)
year = {0}      % Obviously wrong
```

**Check** (sketched below):
- Four digits
- Reasonable range (1600 to current year + 1)
- Not all zeros

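These three rules fit in a few lines. A hedged sketch (the 1600 cutoff mirrors the range above; the allowance for next year covers in-press papers):

```python
# Sketch: year sanity check -- four digits, plausible range, not in the far future.
import datetime
import re

def year_ok(year: str) -> bool:
    if not re.fullmatch(r"\d{4}", year):
        return False                         # catches '24', '202', '0'
    current = datetime.date.today().year
    return 1600 <= int(year) <= current + 1  # +1 allows in-press papers

for y in ["2024", "1953", "24", "202", "0000"]:
    print(y, year_ok(y))
```
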
#### Volume/Number Validation
|
||||
|
||||
```bibtex
|
||||
volume = {123} % Numeric
|
||||
volume = {12} % Valid
|
||||
number = {3} % Valid
|
||||
number = {S1} % Supplement issue (valid)
|
||||
```
|
||||
|
||||
**Invalid**:
|
||||
```bibtex
|
||||
volume = {Vol. 123} % Should be just number
|
||||
number = {Issue 3} % Should be just number
|
||||
```
|
||||
|
||||
#### Page Range Validation
|
||||
|
||||
**Correct format**:
|
||||
```bibtex
|
||||
pages = {123--145} % En-dash (two hyphens)
|
||||
pages = {e0123456} % PLOS-style article ID
|
||||
pages = {123} % Single page
|
||||
```
|
||||
|
||||
**Incorrect format**:
|
||||
```bibtex
|
||||
pages = {123-145} % Single hyphen (use --)
|
||||
pages = {pp. 123-145} % Remove "pp."
|
||||
pages = {123–145} % Unicode en-dash (may cause issues)
|
||||
```
|
||||
|
||||
#### URL Validation
|
||||
|
||||
**Check**:
|
||||
- URLs are accessible (return 200 status)
|
||||
- HTTPS when available
|
||||
- No obvious typos
|
||||
- Permanent links (not temporary)
|
||||
|
||||
**Valid**:
|
||||
```bibtex
|
||||
url = {https://www.nature.com/articles/nature12345}
|
||||
url = {https://arxiv.org/abs/2103.14030}
|
||||
```
|
||||
|
||||
**Questionable**:
|
||||
```bibtex
|
||||
url = {http://...} % HTTP instead of HTTPS
|
||||
url = {file:///...} % Local file path
|
||||
url = {bit.ly/...} % URL shortener (not permanent)
|
||||
```
|
||||

### 5. Duplicate Detection

**Purpose**: Find and remove duplicate entries.

#### Types of Duplicates

**Exact duplicates** (same DOI):
```bibtex
@article{Smith2024a,
  doi = {10.1038/nature12345},
  ...
}

@article{Smith2024b,
  doi = {10.1038/nature12345},  % Same DOI!
  ...
}
```

**Near duplicates** (similar title/authors):
```bibtex
@article{Smith2024,
  title = {Machine Learning for Drug Discovery},
  ...
}

@article{Smith2024method,
  title = {Machine learning for drug discovery},  % Same, different case
  ...
}
```

**Preprint + Published**:
```bibtex
@misc{Smith2023arxiv,
  title = {AlphaFold Results},
  howpublished = {arXiv},
  ...
}

@article{Smith2024,
  title = {AlphaFold Results},  % Same paper, now published
  journal = {Nature},
  ...
}
% Keep the published version only
```

#### Detection Methods

**By DOI** (most reliable):
- Same DOI = exact duplicate
- Keep one, remove the other

**By title similarity** (see the sketch after this list):
- Normalize: lowercase, remove punctuation
- Calculate similarity (e.g., Levenshtein distance)
- Flag if >90% similar

**By author-year-title**:
- Same first author + year + similar title
- Likely duplicate
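
A minimal sketch of the title-similarity check using only the Python standard library (difflib's ratio rather than true Levenshtein distance; the 0.9 threshold is the heuristic mentioned above):

```python
import difflib
import string

def normalize_title(title: str) -> str:
    """Lowercase and strip punctuation so formatting differences don't matter."""
    cleaned = title.lower().translate(str.maketrans("", "", string.punctuation))
    return " ".join(cleaned.split())

def titles_look_duplicate(a: str, b: str, threshold: float = 0.9) -> bool:
    """Flag two titles as likely duplicates when similarity exceeds the threshold."""
    ratio = difflib.SequenceMatcher(None, normalize_title(a), normalize_title(b)).ratio()
    return ratio >= threshold

# Example: same title, different case
print(titles_look_duplicate("Machine Learning for Drug Discovery",
                            "Machine learning for drug discovery"))  # True
```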

**Automated detection**:
```bash
python scripts/validate_citations.py references.bib --check-duplicates
```

**Output**:
```
Warning: Possible duplicate entries:
  - Smith2024a (DOI: 10.1038/nature12345)
  - Smith2024b (DOI: 10.1038/nature12345)
Recommendation: Keep one entry, remove the other.
```

### 6. Format and Syntax

**Purpose**: Ensure valid BibTeX syntax.

#### Common Syntax Errors

**Missing commas**:
```bibtex
@article{Smith2024,
  author = {Smith, John}  % Missing comma!
  title = {Title}
}
% Should be:
author = {Smith, John},  % Comma after each field
```

**Unbalanced braces**:
```bibtex
title = {Title with {Protected} Text  % Missing closing brace
% Should be:
title = {Title with {Protected} Text}
```

**Missing closing brace for entry**:
```bibtex
@article{Smith2024,
  author = {Smith, John},
  title = {Title}
% Missing closing brace!
% Should end with:
}
```

**Invalid characters in keys**:
```bibtex
@article{Smith&Doe2024,  % & not allowed in key
  ...
}
% Use:
@article{SmithDoe2024,
  ...
}
```

#### BibTeX Syntax Rules

**Entry structure**:
```bibtex
@TYPE{citationkey,
  field1 = {value1},
  field2 = {value2},
  ...
  fieldN = {valueN}
}
```

**Citation keys**:
- Alphanumeric and some punctuation (-, _, ., :)
- No spaces
- Case-sensitive
- Unique within file

**Field values**:
- Enclosed in {braces} or "quotes"
- Braces preferred for complex text
- Numbers can be unquoted: `year = 2024`

**Special characters**:
- `{` and `}` for grouping
- `\` for LaTeX commands
- Protect capitalization: `{AlphaFold}`
- Accents: `{\"u}`, `{\'e}`, `{\aa}`

#### Validation

```bash
python scripts/validate_citations.py references.bib --check-syntax
```

**Checks** (two of these are sketched below):
- Valid BibTeX structure
- Balanced braces
- Proper commas
- Valid entry types
- Unique citation keys
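
The brace-balance and unique-key checks can be sketched directly. Again, this is an illustration, not the actual `validate_citations.py` logic:

```python
import re

def braces_balanced(text: str) -> bool:
    """Return True if { and } are balanced and the depth never goes negative."""
    depth = 0
    for ch in text:
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth < 0:
                return False
    return depth == 0

def duplicate_keys(bibtex: str) -> list:
    """Find citation keys that appear more than once in a .bib file's text."""
    keys = re.findall(r"@\w+\{([^,\s]+),", bibtex)
    return sorted({k for k in keys if keys.count(k) > 1})
```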

## Validation Workflow

### Step 1: Basic Validation

Run comprehensive validation:

```bash
python scripts/validate_citations.py references.bib
```

**Checks all**:
- DOI resolution
- Required fields
- Author formatting
- Data consistency
- Duplicates
- Syntax

### Step 2: Review Report

Examine the validation report (a parsing sketch follows the example):

```json
{
  "total_entries": 150,
  "valid_entries": 140,
  "errors": [
    {
      "entry": "Smith2024",
      "error": "missing_required_field",
      "field": "journal",
      "severity": "high"
    },
    {
      "entry": "Doe2023",
      "error": "invalid_doi",
      "doi": "10.1038/broken",
      "severity": "high"
    }
  ],
  "warnings": [
    {
      "entry": "Jones2022",
      "warning": "missing_recommended_field",
      "field": "volume",
      "severity": "medium"
    }
  ],
  "duplicates": [
    {
      "entries": ["Smith2024a", "Smith2024b"],
      "reason": "same_doi",
      "doi": "10.1038/nature12345"
    }
  ]
}
```
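
Assuming the report schema above (the field names come from the example, not from a documented spec), the high-priority items can be pulled out like this:

```python
import json

with open("validation_report.json") as fh:  # hypothetical report filename
    report = json.load(fh)

print(f"{report['valid_entries']}/{report['total_entries']} entries valid")

# Surface high-severity problems first, since they block a clean bibliography
for issue in report["errors"]:
    if issue["severity"] == "high":
        print(f"FIX {issue['entry']}: {issue['error']}")

for dup in report["duplicates"]:
    print(f"DUPLICATE ({dup['reason']}): {', '.join(dup['entries'])}")
```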

### Step 3: Fix Issues

**High-priority** (errors):
1. Add missing required fields
2. Fix broken DOIs
3. Remove duplicates
4. Correct syntax errors

**Medium-priority** (warnings):
1. Add recommended fields
2. Improve author formatting
3. Fix page ranges

**Low-priority**:
1. Standardize formatting
2. Add URLs for accessibility

### Step 4: Auto-Fix

Use auto-fix for safe corrections:

```bash
python scripts/validate_citations.py references.bib \
    --auto-fix \
    --output fixed_references.bib
```

**Auto-fix can** (see the page-range sketch after these lists):
- Fix page range format (- to --)
- Remove "pp." from pages
- Standardize author separators
- Fix common syntax errors
- Normalize field order

**Auto-fix cannot**:
- Add missing information
- Find correct DOIs
- Determine which duplicate to keep
- Fix semantic errors

### Step 5: Manual Review

Review the auto-fixed file:

```bash
# Check what changed
diff references.bib fixed_references.bib

# Review specific entries that had errors
grep -A 10 "Smith2024" fixed_references.bib
```

### Step 6: Re-Validate

Validate after fixes:

```bash
python scripts/validate_citations.py fixed_references.bib --verbose
```

Should show:
```
✓ All DOIs valid
✓ All required fields present
✓ No duplicates found
✓ Syntax valid
✓ 150/150 entries valid
```

## Validation Checklist

Use this checklist before final submission:

### DOI Validation
- [ ] All DOIs resolve correctly
- [ ] Metadata matches between BibTeX and CrossRef
- [ ] No broken or invalid DOIs

### Completeness
- [ ] All entries have required fields
- [ ] Modern papers (2000+) have DOIs
- [ ] Authors properly formatted
- [ ] Journals/conferences properly named

### Consistency
- [ ] Years are 4-digit numbers
- [ ] Page ranges use -- not -
- [ ] Volume/number are numeric
- [ ] URLs are accessible

### Duplicates
- [ ] No entries with same DOI
- [ ] No near-duplicate titles
- [ ] Preprints updated to published versions

### Formatting
- [ ] Valid BibTeX syntax
- [ ] Balanced braces
- [ ] Proper commas
- [ ] Unique citation keys

### Final Checks
- [ ] Bibliography compiles without errors
- [ ] All citations in text appear in bibliography
- [ ] All bibliography entries cited in text
- [ ] Citation style matches journal requirements

## Best Practices

### 1. Validate Early and Often

```bash
# After extraction
python scripts/extract_metadata.py --doi ... --output refs.bib
python scripts/validate_citations.py refs.bib

# After manual edits
python scripts/validate_citations.py refs.bib

# Before submission
python scripts/validate_citations.py refs.bib --strict
```

### 2. Use Automated Tools

Don't validate manually - use scripts:
- Faster
- More comprehensive
- Catches errors humans miss
- Generates reports

### 3. Keep a Backup

```bash
# Before auto-fix
cp references.bib references_backup.bib

# Run auto-fix
python scripts/validate_citations.py references.bib \
    --auto-fix \
    --output references_fixed.bib

# Review changes
diff references.bib references_fixed.bib

# If satisfied, replace
mv references_fixed.bib references.bib
```

### 4. Fix High-Priority First

**Priority order**:
1. Syntax errors (prevent compilation)
2. Missing required fields (incomplete citations)
3. Broken DOIs (broken links)
4. Duplicates (confusion, wasted space)
5. Missing recommended fields
6. Formatting inconsistencies

### 5. Document Exceptions

For entries that can't be fixed:

```bibtex
@article{Old1950,
  author = {Smith, John},
  title = {Title},
  journal = {Obscure Journal},
  year = {1950},
  volume = {12},
  pages = {34--56},
  note = {DOI not available for publications before 2000}
}
```

### 6. Validate Against Journal Requirements

Different journals have different requirements:
- Citation style (numbered, author-year)
- Abbreviations (journal names)
- Maximum reference count
- Format (BibTeX, EndNote, manual)

Check the journal's author guidelines!

## Common Validation Issues

### Issue 1: Metadata Mismatch

**Problem**: BibTeX says 2023, CrossRef says 2024.

**Cause**:
- Online-first vs print publication
- Correction/update
- Extraction error

**Solution**:
1. Check the actual article
2. Use the more accurate date
3. Update the BibTeX entry
4. Re-validate

### Issue 2: Special Characters

**Problem**: LaTeX compilation fails on special characters.

**Cause**:
- Accented characters (é, ü, ñ)
- Chemical formulas (H₂O)
- Math symbols (α, β, ±)

**Solution**:
```bibtex
% Use LaTeX commands
author = {M{\"u}ller, Hans}              % Müller
title = {Study of H\textsubscript{2}O}   % H₂O
% Or use UTF-8 with proper LaTeX packages
```

### Issue 3: Incomplete Extraction

**Problem**: Extracted metadata is missing fields.

**Cause**:
- Source doesn't provide all metadata
- Extraction error
- Incomplete record

**Solution**:
1. Check the original article
2. Manually add missing fields
3. Use an alternative source (PubMed vs CrossRef)

### Issue 4: Undetected Duplicates

**Problem**: The same paper appears twice but is not detected.

**Cause**:
- Different DOIs (should be rare)
- Different titles (abbreviated, typo)
- Different citation keys

**Solution**:
- Search manually for author + year
- Check for similar titles
- Remove the extra entry manually

## Summary

Validation ensures citation quality:

✓ **Accuracy**: DOIs resolve, metadata correct
✓ **Completeness**: All required fields present
✓ **Consistency**: Proper formatting throughout
✓ **No duplicates**: Each paper cited once
✓ **Valid syntax**: BibTeX compiles without errors

**Always validate** before final submission!

Use automated tools:
```bash
python scripts/validate_citations.py references.bib
```

Follow the workflow:
1. Extract metadata
2. Validate
3. Fix errors
4. Re-validate
5. Submit

725
skills/citation-management/references/google_scholar_search.md
Normal file
@@ -0,0 +1,725 @@

# Google Scholar Search Guide

Comprehensive guide to searching Google Scholar for academic papers, including advanced search operators, filtering strategies, and metadata extraction.

## Overview

Google Scholar provides the most comprehensive coverage of academic literature across all disciplines:
- **Coverage**: 100+ million scholarly documents
- **Scope**: All academic disciplines
- **Content types**: Journal articles, books, theses, conference papers, preprints, patents, court opinions
- **Citation tracking**: "Cited by" links for forward citation tracking
- **Accessibility**: Free to use, no account required

## Basic Search

### Simple Keyword Search

Search for papers containing specific terms anywhere in the document (title, abstract, full text):

```
CRISPR gene editing
machine learning protein folding
climate change impact agriculture
quantum computing algorithms
```

**Tips**:
- Use specific technical terms
- Include key acronyms and abbreviations
- Start broad, then refine
- Check spelling of technical terms

### Exact Phrase Search

Use quotation marks to search for exact phrases:

```
"deep learning"
"CRISPR-Cas9"
"systematic review"
"randomized controlled trial"
```

**When to use**:
- Technical terms that must appear together
- Proper names
- Specific methodologies
- Exact titles

## Advanced Search Operators

### Author Search

Find papers by specific authors:

```
author:LeCun
author:"Geoffrey Hinton"
author:Church synthetic biology
```

**Variations**:
- Single last name: `author:Smith`
- Full name in quotes: `author:"Jane Smith"`
- Author + topic: `author:Doudna CRISPR`

**Tips**:
- Authors may publish under different name variations
- Try with and without middle initials
- Consider name changes (marriage, etc.)
- Use quotation marks for full names

### Title Search

Search only in article titles:

```
intitle:transformer
intitle:"attention mechanism"
intitle:review climate change
```

**Use cases**:
- Finding papers specifically about a topic
- More precise than full-text search
- Reduces irrelevant results
- Good for finding reviews or methods

### Source (Journal) Search

Search within specific journals or conferences:

```
source:Nature
source:"Nature Communications"
source:NeurIPS
source:"Journal of Machine Learning Research"
```

**Applications**:
- Track publications in top-tier venues
- Find papers in specialized journals
- Identify conference-specific work
- Verify publication venue

### Exclusion Operator

Exclude terms from results:

```
machine learning -survey
CRISPR -patent
climate change -news
deep learning -tutorial -review
```

**Common exclusions**:
- `-survey`: Exclude survey papers
- `-review`: Exclude review articles
- `-patent`: Exclude patents
- `-book`: Exclude books
- `-news`: Exclude news articles
- `-tutorial`: Exclude tutorials

### OR Operator

Search for papers containing any of multiple terms:

```
"machine learning" OR "deep learning"
CRISPR OR "gene editing"
"climate change" OR "global warming"
```

**Best practices**:
- OR must be uppercase
- Combine synonyms
- Include acronyms and spelled-out versions
- Use with exact phrases

### Wildcard Search

Use an asterisk (*) as a wildcard for unknown words:

```
"machine * learning"
"CRISPR * editing"
"* neural network"
```

**Note**: Google Scholar's wildcard support is limited compared to other databases.

## Advanced Filtering

### Year Range

Filter by publication year:

**Using the interface**:
- Click "Since [year]" in the left sidebar
- Select a custom range

**Using search operators**:
```
# Not directly in the search query
# Use the interface or URL parameters
```

**In a script**:
```bash
python scripts/search_google_scholar.py "quantum computing" \
    --year-start 2020 \
    --year-end 2024
```

### Sorting Options

**By relevance** (default):
- Google's algorithm determines relevance
- Considers citations, author reputation, publication venue
- Generally good for most searches

**By date**:
- Most recent papers first
- Good for fast-moving fields
- May miss highly cited older papers
- Click "Sort by date" in the interface

**By citation count** (via script):
```bash
python scripts/search_google_scholar.py "transformers" \
    --sort-by citations \
    --limit 50
```

### Language Filtering

**In the interface**:
- Settings → Languages
- Select preferred languages

**Default**: English and papers with English abstracts

## Search Strategies

### Finding Seminal Papers

Identify highly influential papers in a field:

1. **Search by topic** with broad terms
2. **Sort by citations** (most cited first)
3. **Look for review articles** for comprehensive overviews
4. **Check publication dates** for foundational vs recent work

**Example**:
```
"generative adversarial networks"
# Sort by citations
# Top results: original GAN paper (Goodfellow et al., 2014), key variants
```

### Finding Recent Work

Stay current with the latest research:

1. **Search by topic**
2. **Filter to recent years** (last 1-2 years)
3. **Sort by date** for newest first
4. **Set up alerts** for ongoing tracking

**Example**:
```bash
python scripts/search_google_scholar.py "AlphaFold protein structure" \
    --year-start 2023 \
    --year-end 2024 \
    --limit 50
```

### Finding Review Articles

Get comprehensive overviews of a field:

```
intitle:review "machine learning"
"systematic review" CRISPR
intitle:survey "natural language processing"
```

**Indicators**:
- "review", "survey", "perspective" in the title
- Often highly cited
- Published in review journals (Nature Reviews, Trends, etc.)
- Comprehensive reference lists

### Citation Chain Search

**Forward citations** (papers citing a key paper):
1. Find a seminal paper
2. Click "Cited by X"
3. See all papers that cite it
4. Identify how the field has developed

**Backward citations** (references in a key paper):
1. Find a recent review or important paper
2. Check its reference list
3. Identify foundational work
4. Trace the development of ideas

**Example workflow**:
```
# Find the original transformer paper
"Attention is all you need" author:Vaswani

# Check "Cited by 120,000+"
# See evolution: BERT, GPT, T5, etc.

# Check the references in the original paper
# Find RNN, LSTM, attention mechanism origins
```

### Comprehensive Literature Search

For thorough coverage (e.g., systematic reviews):

1. **Generate a synonym list**:
   - Main terms + alternatives
   - Acronyms + spelled out
   - US vs UK spelling

2. **Use OR operators**:
   ```
   ("machine learning" OR "deep learning" OR "neural networks")
   ```

3. **Combine multiple concepts**:
   ```
   ("machine learning" OR "deep learning") ("drug discovery" OR "drug development")
   ```

4. **Search without date filters** initially:
   - Get the total landscape
   - Filter later if too many results

5. **Export results** for systematic analysis:
   ```bash
   python scripts/search_google_scholar.py \
       '"machine learning" OR "deep learning" drug discovery' \
       --limit 500 \
       --output comprehensive_search.json
   ```

## Extracting Citation Information

### From the Google Scholar Results Page

Each result shows:
- **Title**: Paper title (linked to full text if available)
- **Authors**: Author list (often truncated)
- **Source**: Journal/conference, year, publisher
- **Cited by**: Number of citations + link to citing papers
- **Related articles**: Link to similar papers
- **All versions**: Different versions of the same paper

### Export Options

**Manual export**:
1. Click "Cite" under the paper
2. Select BibTeX format
3. Copy the citation

**Limitations**:
- One paper at a time
- Manual process
- Time-consuming for many papers

**Automated export** (using the script):
```bash
# Search and export to BibTeX
python scripts/search_google_scholar.py "quantum computing" \
    --limit 50 \
    --format bibtex \
    --output quantum_papers.bib
```

### Metadata Available

From Google Scholar you can typically extract:
- Title
- Authors (may be incomplete)
- Year
- Source (journal/conference)
- Citation count
- Link to full text (when available)
- Link to PDF (when available)

**Note**: Metadata quality varies:
- Some fields may be missing
- Author names may be incomplete
- Verify with a DOI lookup for accuracy

## Rate Limiting and Access

### Rate Limits

Google Scholar has rate limiting to prevent automated scraping:

**Symptoms of rate limiting**:
- CAPTCHA challenges
- Temporary IP blocks
- 429 "Too Many Requests" errors

**Best practices**:
1. **Add delays between requests**: 2-5 seconds minimum
2. **Limit query volume**: Don't run hundreds of queries rapidly
3. **Use the scholarly library**: Handles rate limiting automatically (see the sketch below)
4. **Rotate User-Agents**: Appear as different browsers
5. **Consider proxies**: For large-scale searches (use ethically)

**In our scripts**:
```python
import random
import time

# Automatic rate limiting built in
time.sleep(random.uniform(3, 7))  # Random delay of 3-7 seconds between requests
```
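
For illustration, a minimal use of the third-party `scholarly` library might look like the sketch below. The result fields shown (`bib`, `title`, `pub_year`) reflect recent versions of the library and should be checked against its current documentation:

```python
from scholarly import scholarly  # third-party: pip install scholarly

# scholarly paces its own requests, which helps avoid CAPTCHAs
results = scholarly.search_pubs("quantum computing error correction")
for _ in range(5):
    pub = next(results)
    bib = pub.get("bib", {})          # bibliographic fields live under "bib"
    print(bib.get("pub_year"), bib.get("title"))
```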

### Ethical Considerations

**DO**:
- Respect rate limits
- Use reasonable delays
- Cache results (don't re-query)
- Use official APIs when available
- Attribute data properly

**DON'T**:
- Scrape aggressively
- Use multiple IPs to bypass limits
- Violate terms of service
- Burden servers unnecessarily
- Use data commercially without permission

### Institutional Access

**Benefits of institutional access**:
- Access to full-text PDFs through library subscriptions
- Better download capabilities
- Integration with library systems
- Link resolver to full text

**Setup**:
- Google Scholar → Settings → Library links
- Add your institution
- Links appear in search results

## Tips and Best Practices

### Search Optimization

1. **Start simple, then refine**:
   ```
   # Too specific initially
   intitle:"deep learning" intitle:review source:Nature 2023..2024

   # Better approach
   deep learning review
   # Review results
   # Add intitle:, source:, year filters as needed
   ```

2. **Use multiple search strategies**:
   - Keyword search
   - Author search for known experts
   - Citation chaining from key papers
   - Source search in top journals

3. **Check spelling and variations**:
   - Color vs colour
   - Optimization vs optimisation
   - Tumor vs tumour
   - Try common misspellings if you get few results

4. **Combine operators strategically**:
   ```
   # Good combination
   author:Church intitle:"synthetic biology" 2015..2024

   # Finds papers by a specific author on a topic in recent years
   ```

### Result Evaluation

1. **Check citation counts**:
   - High citations indicate influence
   - Recent papers may have low citations but be important
   - Citation counts vary by field

2. **Verify publication venue**:
   - Peer-reviewed journals vs preprints
   - Conference proceedings
   - Book chapters
   - Technical reports

3. **Check for full-text access**:
   - [PDF] link on the right side
   - "All X versions" may include an open access version
   - Check institutional access
   - Try the author's website or ResearchGate

4. **Look for review articles**:
   - Comprehensive overviews
   - Good starting point for new topics
   - Extensive reference lists

### Managing Results

1. **Use citation manager integration**:
   - Export to BibTeX
   - Import to Zotero, Mendeley, EndNote
   - Maintain an organized library

2. **Set up alerts** for ongoing research:
   - Google Scholar → Alerts
   - Get emails for new papers matching a query
   - Track specific authors or topics

3. **Create collections**:
   - Save papers to Google Scholar Library
   - Organize by project or topic
   - Add labels and notes

4. **Export systematically**:
   ```bash
   # Save search results for later analysis
   python scripts/search_google_scholar.py "your topic" \
       --output topic_papers.json

   # Can re-process later without re-searching
   python scripts/extract_metadata.py \
       --input topic_papers.json \
       --output topic_refs.bib
   ```

## Advanced Techniques

### Boolean Logic Combinations

Combine multiple operators for precise searches:

```
# Highly cited reviews on a specific topic by known authors
intitle:review "machine learning" ("drug discovery" OR "drug development")
author:Horvath OR author:Bengio 2020..2024

# Method papers excluding reviews
intitle:method "protein folding" -review -survey

# Papers in top journals only
("Nature" OR "Science" OR "Cell") CRISPR 2022..2024
```

### Finding Open Access Papers

```
# Search with generic terms
machine learning

# Filter by "All versions", which often includes preprints
# Look for green [PDF] links (often open access)
# Check arXiv, bioRxiv versions
```

**In a script**:
```bash
python scripts/search_google_scholar.py "topic" \
    --open-access-only \
    --output open_access_papers.json
```

### Tracking Research Impact

**For a specific paper**:
1. Find the paper
2. Click "Cited by X"
3. Analyze the citing papers:
   - How is it being used?
   - What fields cite it?
   - Recent vs older citations?

**For an author**:
1. Search `author:LastName`
2. Check h-index and i10-index
3. View the citation history graph
4. Identify the most influential papers

**For a topic**:
1. Search the topic
2. Sort by citations
3. Identify seminal papers (highly cited, older)
4. Check recent highly-cited papers (emerging important work)

### Finding Preprints and Early Work

```
# arXiv papers
source:arxiv "deep learning"

# bioRxiv papers
source:biorxiv CRISPR

# All preprint servers
("arxiv" OR "biorxiv" OR "medrxiv") your topic
```

**Note**: Preprints are not peer-reviewed. Always check whether a published version exists.

## Common Issues and Solutions

### Too Many Results

**Problem**: The search returns an overwhelming 100,000+ results.

**Solutions**:
1. Add more specific terms
2. Use `intitle:` to search only titles
3. Filter by recent years
4. Add exclusions (e.g., `-review`)
5. Search within specific journals

### Too Few Results

**Problem**: The search returns suspiciously few results (0-10).

**Solutions**:
1. Remove restrictive operators
2. Try synonyms and related terms
3. Check spelling
4. Broaden the year range
5. Use OR for alternative terms

### Irrelevant Results

**Problem**: Results don't match your intent.

**Solutions**:
1. Use exact phrases with quotes
2. Add more specific context terms
3. Use `intitle:` for title-only search
4. Exclude common irrelevant terms
5. Combine multiple specific terms

### CAPTCHA or Rate Limiting

**Problem**: Google Scholar shows a CAPTCHA or blocks access.

**Solutions**:
1. Wait several minutes before continuing
2. Reduce query frequency
3. Use longer delays in scripts (5-10 seconds)
4. Switch to a different IP/network
5. Consider using institutional access

### Missing Metadata

**Problem**: Author names, year, or venue are missing from results.

**Solutions**:
1. Click through to see full details
2. Check "All versions" for better metadata
3. Look up by DOI if available
4. Extract metadata from CrossRef/PubMed instead
5. Verify manually from the paper PDF

### Duplicate Results

**Problem**: The same paper appears multiple times.

**Solutions**:
1. Click "All X versions" to see the consolidated view
2. Choose the version with the best metadata
3. Use deduplication in post-processing:
   ```bash
   python scripts/format_bibtex.py results.bib \
       --deduplicate \
       --output clean_results.bib
   ```

## Integration with Scripts

### search_google_scholar.py Usage

**Basic search**:
```bash
python scripts/search_google_scholar.py "machine learning drug discovery"
```

**With a year filter**:
```bash
python scripts/search_google_scholar.py "CRISPR" \
    --year-start 2020 \
    --year-end 2024 \
    --limit 100
```

**Sort by citations**:
```bash
python scripts/search_google_scholar.py "transformers" \
    --sort-by citations \
    --limit 50
```

**Export to BibTeX**:
```bash
python scripts/search_google_scholar.py "quantum computing" \
    --format bibtex \
    --output quantum.bib
```

**Export to JSON for later processing**:
```bash
python scripts/search_google_scholar.py "topic" \
    --format json \
    --output results.json

# Later: extract full metadata
python scripts/extract_metadata.py \
    --input results.json \
    --output references.bib
```

### Batch Searching

For multiple topics:

```bash
# Create a file with search queries (queries.txt),
# one query per line

# Search each query
while read query; do
    python scripts/search_google_scholar.py "$query" \
        --limit 50 \
        --output "${query// /_}.json"
    sleep 10  # Delay between queries
done < queries.txt
```

## Summary

Google Scholar is the most comprehensive academic search engine, providing:

✓ **Broad coverage**: All disciplines, 100M+ documents
✓ **Free access**: No account or subscription required
✓ **Citation tracking**: "Cited by" for impact analysis
✓ **Multiple formats**: Articles, books, theses, patents
✓ **Full-text search**: Not just abstracts

Key strategies:
- Use advanced operators for precision
- Combine author, title, and source searches
- Track citations for impact
- Export systematically to a citation manager
- Respect rate limits and access policies
- Verify metadata with CrossRef/PubMed

For biomedical research, complement Google Scholar with PubMed for MeSH terms and curated metadata.

870
skills/citation-management/references/metadata_extraction.md
Normal file
@@ -0,0 +1,870 @@

# Metadata Extraction Guide

Comprehensive guide to extracting accurate citation metadata from DOIs, PMIDs, arXiv IDs, and URLs using various APIs and services.

## Overview

Accurate metadata is essential for proper citations. This guide covers:
- Identifying paper identifiers (DOI, PMID, arXiv ID)
- Querying metadata APIs (CrossRef, PubMed, arXiv, DataCite)
- Required BibTeX fields by entry type
- Handling edge cases and special situations
- Validating extracted metadata

## Paper Identifiers

### DOI (Digital Object Identifier)

**Format**: `10.XXXX/suffix`

**Examples**:
```
10.1038/s41586-021-03819-2    # Nature article
10.1126/science.aam9317       # Science article
10.1016/j.cell.2023.01.001    # Cell article
10.1371/journal.pone.0123456  # PLOS ONE article
```

**Properties**:
- Permanent identifier
- Most reliable for metadata
- Resolves to the current location
- Publisher-assigned

**Where to find**:
- First page of the article
- Article webpage
- CrossRef, Google Scholar, PubMed
- Usually prominent on the publisher's site

### PMID (PubMed ID)

**Format**: 8-digit number (typically)

**Examples**:
```
34265844
28445112
35476778
```

**Properties**:
- Specific to the PubMed database
- Biomedical literature only
- Assigned by NCBI
- Permanent identifier

**Where to find**:
- PubMed search results
- Article page on PubMed
- Often in the article PDF footer
- PMC (PubMed Central) pages

### PMCID (PubMed Central ID)

**Format**: PMC followed by numbers

**Examples**:
```
PMC8287551
PMC7456789
```

**Properties**:
- Free full-text articles in PMC
- Subset of PubMed articles
- Open access or author manuscripts

### arXiv ID

**Format**: YYMM.NNNNN or archive/YYMMNNN

**Examples**:
```
2103.14030             # New format (since 2007)
2401.12345             # 2024 submission
arXiv:hep-th/9901001   # Old format
```

**Properties**:
- Preprints (not peer-reviewed)
- Physics, math, CS, q-bio, etc.
- Version tracking (v1, v2, etc.)
- Free, open access

**Where to find**:
- arXiv.org
- Often cited before publication
- Paper PDF header

### Other Identifiers

**ISBN** (Books):
```
978-0-12-345678-9
0-123-45678-9
```

**arXiv category**:
```
cs.LG      # Computer Science - Machine Learning
q-bio.QM   # Quantitative Biology - Quantitative Methods
math.ST    # Mathematics - Statistics
```

## Metadata APIs

### CrossRef API

**Primary source for DOIs**: the most comprehensive metadata for journal articles.

**Base URL**: `https://api.crossref.org/works/`

**No API key required**, but the polite pool is recommended:
- Add an email address to the User-Agent
- Gets better service
- Fewer rate-limit problems

#### Basic DOI Lookup

**Request**:
```
GET https://api.crossref.org/works/10.1038/s41586-021-03819-2
```

**Response** (simplified; a request sketch follows):
```json
{
  "message": {
    "DOI": "10.1038/s41586-021-03819-2",
    "title": ["Article title here"],
    "author": [
      {"given": "John", "family": "Smith"},
      {"given": "Jane", "family": "Doe"}
    ],
    "container-title": ["Nature"],
    "volume": "595",
    "issue": "7865",
    "page": "123-128",
    "published-print": {"date-parts": [[2021, 7, 1]]},
    "publisher": "Springer Nature",
    "type": "journal-article",
    "ISSN": ["0028-0836"]
  }
}
```
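
A minimal lookup against this endpoint using the `requests` library and the polite-pool convention (the contact email is a placeholder):

```python
import requests

def crossref_metadata(doi: str) -> dict:
    """Fetch CrossRef metadata for a DOI; raises on HTTP errors."""
    url = f"https://api.crossref.org/works/{doi}"
    headers = {"User-Agent": "citation-tool/0.1 (mailto:you@example.org)"}  # polite pool
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()["message"]

meta = crossref_metadata("10.1038/s41586-021-03819-2")
print(meta["title"][0], "|", meta.get("container-title", [""])[0])
```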

#### Fields Available

**Always present**:
- `DOI`: Digital Object Identifier
- `title`: Article title (array)
- `type`: Content type (journal-article, book-chapter, etc.)

**Usually present**:
- `author`: Array of author objects
- `container-title`: Journal/book title
- `published-print` or `published-online`: Publication date
- `volume`, `issue`, `page`: Publication details
- `publisher`: Publisher name

**Sometimes present**:
- `abstract`: Article abstract
- `subject`: Subject categories
- `ISSN`: Journal ISSN
- `ISBN`: Book ISBN
- `reference`: Reference list
- `is-referenced-by-count`: Citation count

#### Content Types

CrossRef `type` field values (a mapping sketch follows this list):
- `journal-article`: Journal articles
- `book-chapter`: Book chapters
- `book`: Books
- `proceedings-article`: Conference papers
- `posted-content`: Preprints
- `dataset`: Research datasets
- `report`: Technical reports
- `dissertation`: Theses/dissertations
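
When converting to BibTeX, these types have natural entry-type counterparts. A plausible mapping (the choices for preprints, datasets, and reports are conventions, not rules):

```python
# Hypothetical mapping used when generating BibTeX from CrossRef records
CROSSREF_TO_BIBTEX = {
    "journal-article":     "article",
    "book-chapter":        "incollection",
    "book":                "book",
    "proceedings-article": "inproceedings",
    "posted-content":      "misc",        # preprints
    "dataset":             "misc",
    "report":              "techreport",
    "dissertation":        "phdthesis",
}

entry_type = CROSSREF_TO_BIBTEX.get("journal-article", "misc")  # -> "article"
```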

### PubMed E-utilities API

**Specialized for biomedical literature**: curated metadata with MeSH terms.

**Base URL**: `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/`

**API key recommended** (free):
- Higher rate limits
- Better performance

#### PMID to Metadata

**Step 1: EFetch for the full record**

```
GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?
    db=pubmed&
    id=34265844&
    retmode=xml&
    api_key=YOUR_KEY
```

**Response**: XML with comprehensive metadata

**Step 2: Parse the XML** (a parsing sketch follows the example)

Key fields:
```xml
<PubmedArticle>
  <MedlineCitation>
    <PMID>34265844</PMID>
    <Article>
      <ArticleTitle>Title here</ArticleTitle>
      <AuthorList>
        <Author><LastName>Smith</LastName><ForeName>John</ForeName></Author>
      </AuthorList>
      <Journal>
        <Title>Nature</Title>
        <JournalIssue>
          <Volume>595</Volume>
          <Issue>7865</Issue>
          <PubDate><Year>2021</Year></PubDate>
        </JournalIssue>
      </Journal>
      <Pagination><MedlinePgn>123-128</MedlinePgn></Pagination>
      <Abstract><AbstractText>Abstract text here</AbstractText></Abstract>
    </Article>
  </MedlineCitation>
  <PubmedData>
    <ArticleIdList>
      <ArticleId IdType="doi">10.1038/s41586-021-03819-2</ArticleId>
      <ArticleId IdType="pmc">PMC8287551</ArticleId>
    </ArticleIdList>
  </PubmedData>
</PubmedArticle>
```
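
A sketch of fetching and parsing this record with the standard library (the element paths follow the structure shown above; an API key would be appended as an extra parameter):

```python
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

params = urllib.parse.urlencode({"db": "pubmed", "id": "34265844", "retmode": "xml"})
url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{params}"

with urllib.request.urlopen(url) as response:
    root = ET.parse(response).getroot()

article = root.find(".//Article")
title = article.findtext("ArticleTitle")
authors = [
    f"{a.findtext('LastName')}, {a.findtext('ForeName')}"
    for a in article.findall(".//Author")
]
doi = root.findtext(".//ArticleId[@IdType='doi']")
print(title, "|", " and ".join(authors), "|", doi)
```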

#### Unique PubMed Fields

**MeSH Terms**: Controlled vocabulary
```xml
<MeshHeadingList>
  <MeshHeading>
    <DescriptorName UI="D003920">Diabetes Mellitus</DescriptorName>
  </MeshHeading>
</MeshHeadingList>
```

**Publication Types**:
```xml
<PublicationTypeList>
  <PublicationType UI="D016428">Journal Article</PublicationType>
  <PublicationType UI="D016449">Randomized Controlled Trial</PublicationType>
</PublicationTypeList>
```

**Grant Information**:
```xml
<GrantList>
  <Grant>
    <GrantID>R01-123456</GrantID>
    <Agency>NIAID NIH HHS</Agency>
    <Country>United States</Country>
  </Grant>
</GrantList>
```

### arXiv API

**Preprints in physics, math, CS, q-bio**: free, open access.

**Base URL**: `http://export.arxiv.org/api/query`

**No API key required**

#### arXiv ID to Metadata

**Request**:
```
GET http://export.arxiv.org/api/query?id_list=2103.14030
```

**Response**: Atom XML (a parsing sketch follows)

```xml
<entry>
  <id>http://arxiv.org/abs/2103.14030v2</id>
  <title>Highly accurate protein structure prediction with AlphaFold</title>
  <author><name>John Jumper</name></author>
  <author><name>Richard Evans</name></author>
  <published>2021-03-26T17:47:17Z</published>
  <updated>2021-07-01T16:51:46Z</updated>
  <summary>Abstract text here...</summary>
  <arxiv:doi>10.1038/s41586-021-03819-2</arxiv:doi>
  <category term="q-bio.BM" scheme="http://arxiv.org/schemas/atom"/>
  <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
</entry>
```
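
A sketch of querying this endpoint and pulling the fields above with the standard library (the namespace URIs are the standard Atom and arXiv ones):

```python
import urllib.request
import xml.etree.ElementTree as ET

NS = {"atom": "http://www.w3.org/2005/Atom",
      "arxiv": "http://arxiv.org/schemas/atom"}

url = "http://export.arxiv.org/api/query?id_list=2103.14030"
with urllib.request.urlopen(url) as response:
    feed = ET.parse(response).getroot()

entry = feed.find("atom:entry", NS)
title = " ".join(entry.findtext("atom:title", default="", namespaces=NS).split())
authors = [a.findtext("atom:name", namespaces=NS) for a in entry.findall("atom:author", NS)]
doi = entry.findtext("arxiv:doi", namespaces=NS)  # present only if published

print(title)
print(authors)
print("Published DOI:", doi)
```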

#### Key Fields

- `id`: arXiv URL
- `title`: Preprint title
- `author`: Author list
- `published`: First version date
- `updated`: Latest version date
- `summary`: Abstract
- `arxiv:doi`: DOI if published
- `arxiv:journal_ref`: Journal reference if published
- `category`: arXiv categories

#### Version Tracking

arXiv tracks versions:
- `v1`: Initial submission
- `v2`, `v3`, etc.: Revisions

**Always check** whether the preprint has been published in a journal (use the DOI if available).

### DataCite API

**Research datasets, software, and other outputs**: assigns DOIs to non-traditional scholarly works.

**Base URL**: `https://api.datacite.org/dois/`

**Similar to CrossRef** but for datasets, software, code, etc.

**Request**:
```
GET https://api.datacite.org/dois/10.5281/zenodo.1234567
```

**Response**: JSON with metadata for the dataset or software

## Required BibTeX Fields

### @article (Journal Articles)

**Required**:
- `author`: Author names
- `title`: Article title
- `journal`: Journal name
- `year`: Publication year

**Optional but recommended**:
- `volume`: Volume number
- `number`: Issue number
- `pages`: Page range (e.g., 123--145)
- `doi`: Digital Object Identifier
- `url`: URL if no DOI
- `month`: Publication month

**Example**:
```bibtex
@article{Smith2024,
  author  = {Smith, John and Doe, Jane},
  title   = {Novel Approach to Protein Folding},
  journal = {Nature},
  year    = {2024},
  volume  = {625},
  number  = {8001},
  pages   = {123--145},
  doi     = {10.1038/nature12345}
}
```

### @book (Books)

**Required**:
- `author` or `editor`: Author(s) or editor(s)
- `title`: Book title
- `publisher`: Publisher name
- `year`: Publication year

**Optional but recommended**:
- `edition`: Edition number (if not first)
- `address`: Publisher location
- `isbn`: ISBN
- `url`: URL
- `series`: Series name

**Example**:
```bibtex
@book{Kumar2021,
  author    = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
  title     = {Robbins and Cotran Pathologic Basis of Disease},
  publisher = {Elsevier},
  year      = {2021},
  edition   = {10},
  isbn      = {978-0-323-53113-9}
}
```

### @inproceedings (Conference Papers)

**Required**:
- `author`: Author names
- `title`: Paper title
- `booktitle`: Conference/proceedings name
- `year`: Year

**Optional but recommended**:
- `pages`: Page range
- `organization`: Organizing body
- `publisher`: Publisher
- `address`: Conference location
- `month`: Conference month
- `doi`: DOI if available

**Example**:
```bibtex
@inproceedings{Vaswani2017,
  author    = {Vaswani, Ashish and Shazeer, Noam and others},
  title     = {Attention is All You Need},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2017},
  pages     = {5998--6008},
  volume    = {30}
}
```

### @incollection (Book Chapters)

**Required**:
- `author`: Chapter author(s)
- `title`: Chapter title
- `booktitle`: Book title
- `publisher`: Publisher name
- `year`: Publication year

**Optional but recommended**:
- `editor`: Book editor(s)
- `pages`: Chapter page range
- `chapter`: Chapter number
- `edition`: Edition
- `address`: Publisher location

**Example**:
```bibtex
@incollection{Brown2020,
  author    = {Brown, Peter O. and Botstein, David},
  title     = {Exploring the New World of the Genome with {DNA} Microarrays},
  booktitle = {DNA Microarrays: A Molecular Cloning Manual},
  editor    = {Eisen, Michael B. and Brown, Patrick O.},
  publisher = {Cold Spring Harbor Laboratory Press},
  year      = {2020},
  pages     = {1--45}
}
```

### @phdthesis (Dissertations)

**Required**:
- `author`: Author name
- `title`: Thesis title
- `school`: Institution
- `year`: Year

**Optional**:
- `type`: Type (e.g., "PhD dissertation")
- `address`: Institution location
- `month`: Month
- `url`: URL

**Example**:
```bibtex
@phdthesis{Johnson2023,
  author = {Johnson, Mary L.},
  title  = {Novel Approaches to Cancer Immunotherapy},
  school = {Stanford University},
  year   = {2023},
  type   = {{PhD} dissertation}
}
```

### @misc (Preprints, Software, Datasets)

**Required**:
- `author`: Author(s)
- `title`: Title
- `year`: Year

**For preprints, add**:
- `howpublished`: Repository (e.g., "bioRxiv")
- `doi`: Preprint DOI
- `note`: Preprint ID

**Example (preprint)**:
```bibtex
@misc{Zhang2024,
  author       = {Zhang, Yi and Chen, Li and Wang, Hui},
  title        = {Novel Therapeutic Targets in Alzheimer's Disease},
  year         = {2024},
  howpublished = {bioRxiv},
  doi          = {10.1101/2024.01.001},
  note         = {Preprint}
}
```

**Example (software)**:
```bibtex
@misc{AlphaFold2021,
  author       = {DeepMind},
  title        = {{AlphaFold} Protein Structure Database},
  year         = {2021},
  howpublished = {Software},
  url          = {https://alphafold.ebi.ac.uk/},
  doi          = {10.5281/zenodo.5123456}
}
```

## Extraction Workflows

### From DOI

**Best practice**: the most reliable source:

```bash
# Single DOI
python scripts/extract_metadata.py --doi 10.1038/s41586-021-03819-2

# Multiple DOIs
python scripts/extract_metadata.py \
    --doi 10.1038/nature12345 \
    --doi 10.1126/science.abc1234 \
    --output refs.bib
```

**Process**:
1. Query the CrossRef API with the DOI
2. Parse the JSON response
3. Extract the required fields
4. Determine the entry type (@article, @book, etc.)
5. Format as BibTeX
6. Validate completeness

### From PMID

**For biomedical literature**:

```bash
# Single PMID
python scripts/extract_metadata.py --pmid 34265844

# Multiple PMIDs
python scripts/extract_metadata.py \
    --pmid 34265844 \
    --pmid 28445112 \
    --output refs.bib
```

**Process**:
1. Query PubMed EFetch with the PMID
2. Parse the XML response
3. Extract metadata, including MeSH terms
4. Check for a DOI in the response
5. If a DOI exists, optionally query CrossRef for additional metadata
6. Format as BibTeX

### From arXiv ID

**For preprints**:

```bash
python scripts/extract_metadata.py --arxiv 2103.14030
```

**Process**:
1. Query the arXiv API with the ID
2. Parse the Atom XML response
3. Check for a published version (DOI in the response)
4. If published: use the DOI and CrossRef
5. If not published: use the preprint metadata
6. Format as @misc with a preprint note

**Important**: Always check whether the preprint has been published!

### From URL

**When you only have a URL**:

```bash
python scripts/extract_metadata.py \
    --url "https://www.nature.com/articles/s41586-021-03819-2"
```

**Process**:
1. Parse the URL to extract an identifier
2. Identify its type (DOI, PMID, arXiv)
3. Extract the identifier from the URL
4. Query the appropriate API
5. Format as BibTeX

**URL patterns** (a classification sketch follows):
```
# DOI URLs
https://doi.org/10.1038/nature12345
https://dx.doi.org/10.1126/science.abc123
https://www.nature.com/articles/s41586-021-03819-2

# PubMed URLs
https://pubmed.ncbi.nlm.nih.gov/34265844/
https://www.ncbi.nlm.nih.gov/pubmed/34265844

# arXiv URLs
https://arxiv.org/abs/2103.14030
https://arxiv.org/pdf/2103.14030.pdf
```
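
A sketch of the classification step for the patterns above (the regexes cover only the URL shapes listed; real-world URLs are messier):

```python
import re

def classify_identifier(url: str):
    """Return (kind, identifier) for the URL shapes shown above, else (None, None)."""
    doi = re.search(r"(?:dx\.)?doi\.org/(10\.\d{4,}/\S+)", url)
    if doi:
        return "doi", doi.group(1)
    pmid = re.search(r"(?:pubmed\.ncbi\.nlm\.nih\.gov|ncbi\.nlm\.nih\.gov/pubmed)/(\d+)", url)
    if pmid:
        return "pmid", pmid.group(1)
    arxiv = re.search(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5})", url)
    if arxiv:
        return "arxiv", arxiv.group(1)
    return None, None

print(classify_identifier("https://arxiv.org/abs/2103.14030"))  # ('arxiv', '2103.14030')
```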

### Batch Processing

**From a file with mixed identifiers**:

```bash
# Create a file with one identifier per line
# identifiers.txt:
# 10.1038/nature12345
# 34265844
# 2103.14030
# https://doi.org/10.1126/science.abc123

python scripts/extract_metadata.py \
    --input identifiers.txt \
    --output references.bib
```

**Process**:
- The script auto-detects each identifier type
- Queries the appropriate API
- Combines everything into a single BibTeX file
- Handles errors gracefully

## Special Cases and Edge Cases

### Preprints Later Published

**Issue**: A preprint is cited, but a journal version is now available.

**Solution**:
1. Check the arXiv metadata for a DOI field
2. If a DOI is present, use the published version
3. Update the citation to the journal article
4. Note the preprint version in comments if needed

**Example**:
```bibtex
% Originally: arXiv:2103.14030
% Published as:
@article{Jumper2021,
  author  = {Jumper, John and Evans, Richard and others},
  title   = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
  journal = {Nature},
  year    = {2021},
  volume  = {596},
  pages   = {583--589},
  doi     = {10.1038/s41586-021-03819-2}
}
```

### Multiple Authors (et al.)

**Issue**: Many authors (10+).

**BibTeX practice**:
- Include all authors if fewer than 10
- Use "and others" for 10+
- Or list them all (journal requirements vary)

**Example**:
```bibtex
@article{LargeCollaboration2024,
  author = {First, Author and Second, Author and Third, Author and others},
  ...
}
```

### Author Name Variations

**Issue**: Authors publish under different name formats.

**Standardization**:
```
# Common variations
John Smith
John A. Smith
John Andrew Smith
J. A. Smith
Smith, J.
Smith, J. A.

# BibTeX format (recommended)
author = {Smith, John A.}
```

**Extraction preference** (a normalization sketch follows this list):
1. Use the full name if available
2. Include the middle initial if available
3. Format: Last, First Middle
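
A sketch of normalizing a "First [Middle] Last" string into the "Last, First Middle" form (a naive heuristic only; it ignores particles like "van" and suffixes like "Jr."):

```python
def to_bibtex_name(name: str) -> str:
    """Convert 'John A. Smith' to 'Smith, John A.' (naive heuristic)."""
    if "," in name:                # already in 'Last, First' form
        return name.strip()
    parts = name.split()
    if len(parts) < 2:             # single token, nothing to reorder
        return name.strip()
    return f"{parts[-1]}, {' '.join(parts[:-1])}"

print(to_bibtex_name("John A. Smith"))  # Smith, John A.
print(to_bibtex_name("Smith, J. A."))   # Smith, J. A.
```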

### No DOI Available

**Issue**: Older papers or books without DOIs.

**Solutions**:
1. Use the PMID if available (biomedical)
2. Use the ISBN for books
3. Use a URL to a stable source
4. Include full publication details

**Example**:
```bibtex
@article{OldPaper1995,
  author  = {Author, Name},
  title   = {Title Here},
  journal = {Journal Name},
  year    = {1995},
  volume  = {123},
  pages   = {45--67},
  url     = {https://stable-url-here},
  note    = {PMID: 12345678}
}
```

### Conference Papers vs Journal Articles

**Issue**: Same work published in both.

**Best practice**:
- Cite the journal version if both are available
- The journal version is archival
- Use the conference version for timeliness

**If citing the conference version**:
```bibtex
@inproceedings{Smith2024conf,
  author    = {Smith, John},
  title     = {Title},
  booktitle = {Proceedings of NeurIPS 2024},
  year      = {2024}
}
```

**If citing the journal version**:
```bibtex
@article{Smith2024journal,
  author  = {Smith, John},
  title   = {Title},
  journal = {Journal of Machine Learning Research},
  year    = {2024}
}
```

### Book Chapters vs Edited Collections

**Extract correctly**:
- Chapter: Use `@incollection`
- Whole book: Use `@book`
- Book editor: List in the `editor` field
- Chapter author: List in the `author` field

### Datasets and Software

**Use @misc** with appropriate fields:

```bibtex
@misc{DatasetName2024,
  author       = {Author, Name},
  title        = {Dataset Title},
  year         = {2024},
  howpublished = {Zenodo},
  doi          = {10.5281/zenodo.123456},
  note         = {Version 1.2}
}
```
|
||||
## Validation After Extraction
|
||||
|
||||
Always validate extracted metadata:
|
||||
|
||||
```bash
|
||||
python scripts/validate_citations.py extracted_refs.bib
|
||||
```
|
||||
|
||||
**Check**:
|
||||
- All required fields present
|
||||
- DOI resolves correctly
|
||||
- Author names formatted consistently
|
||||
- Year is reasonable (4 digits)
|
||||
- Journal/publisher names correct
|
||||
- Page ranges use -- not -
|
||||
- Special characters handled properly
|
||||
|
||||
## Best Practices

### 1. Prefer DOI When Available

DOIs provide:
- A permanent identifier
- The best metadata source
- Publisher-verified information
- A resolvable link

### 2. Verify Automatically Extracted Metadata

Spot-check:
- Author names match the publication
- Title matches (including capitalization)
- Year is correct
- Journal name is complete

### 3. Handle Special Characters

**LaTeX special characters**:
- Protect capitalization: `{AlphaFold}`
- Handle accents: `M{\"u}ller` or use Unicode
- Chemical formulas: `H$_2$O` or `\ce{H2O}`
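A minimal sketch of the brace-protection step (the acronym list here is an illustrative assumption; `extract_metadata.py` below keeps its own list):

```python
import re

# Illustrative acronym list; extend to match your field's vocabulary.
PROTECTED = ['DNA', 'RNA', 'CRISPR', 'AlphaFold']

def protect_title(title: str) -> str:
    """Wrap known acronyms in braces so BibTeX styles keep their case."""
    for word in PROTECTED:
        title = re.sub(rf'\b{word}\b', f'{{{word}}}', title)
    return title

print(protect_title('Protein structure prediction with AlphaFold'))
# Protein structure prediction with {AlphaFold}
```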
### 4. Use Consistent Citation Keys

**Convention**: `FirstAuthorYEARkeyword`
```
Smith2024protein
Doe2023machine
Johnson2024cancer
```
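One way to generate such keys programmatically; this is a sketch of the convention, not the exact logic used by the repo's scripts:

```python
import re

def citation_key(first_author_last: str, year: int, title: str) -> str:
    """Build a FirstAuthorYEARkeyword key from basic metadata."""
    # Use the first reasonably long title word as the keyword.
    words = re.findall(r'[A-Za-z]{4,}', title)
    keyword = words[0].lower() if words else 'paper'
    return f'{first_author_last}{year}{keyword}'

print(citation_key('Smith', 2024, 'Protein folding at scale'))
# Smith2024protein
```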
### 5. Include DOI for Modern Papers

Nearly all papers published after ~2000 should have a DOI:
```bibtex
doi = {10.1038/nature12345}
```

### 6. Document Source

For non-standard sources, add a note:
```bibtex
note = {Preprint, not peer-reviewed}
note = {Technical report}
note = {Dataset accompanying [citation]}
```

## Summary

Metadata extraction workflow:

1. **Identify**: Determine the identifier type (DOI, PMID, arXiv, URL)
2. **Query**: Use the appropriate API (CrossRef, PubMed, arXiv)
3. **Extract**: Parse the response for required fields
4. **Format**: Create a properly formatted BibTeX entry
5. **Validate**: Check completeness and accuracy
6. **Verify**: Spot-check critical citations

**Use scripts** to automate:
- `extract_metadata.py`: Universal extractor
- `doi_to_bibtex.py`: Quick DOI conversion
- `validate_citations.py`: Verify accuracy

**Always validate** extracted metadata before final submission!
839
skills/citation-management/references/pubmed_search.md
Normal file
@@ -0,0 +1,839 @@
# PubMed Search Guide

A comprehensive guide to searching PubMed for biomedical and life sciences literature, including MeSH terms, field tags, advanced search strategies, and E-utilities API usage.

## Overview

PubMed is the premier database for biomedical literature:
- **Coverage**: 35+ million citations
- **Scope**: Biomedical and life sciences
- **Sources**: MEDLINE, life science journals, online books
- **Authority**: Maintained by the National Library of Medicine (NLM) / NCBI
- **Access**: Free, no account required
- **Updates**: Daily with new citations
- **Curation**: High-quality metadata, MeSH indexing

## Basic Search

### Simple Keyword Search

PubMed automatically maps terms to MeSH and searches multiple fields:

```
diabetes
CRISPR gene editing
Alzheimer's disease treatment
cancer immunotherapy
```

**Automatic Features**:
- Automatic MeSH mapping
- Plural/singular variants
- Abbreviation expansion
- Spell checking

### Exact Phrase Search

Use quotation marks for exact phrases:

```
"CRISPR-Cas9"
"systematic review"
"randomized controlled trial"
"machine learning"
```
## MeSH (Medical Subject Headings)

### What is MeSH?

MeSH is a controlled vocabulary thesaurus for indexing biomedical literature:
- **Hierarchical structure**: Organized in tree structures
- **Consistent indexing**: The same concept is always tagged the same way
- **Comprehensive**: Covers diseases, drugs, anatomy, techniques, etc.
- **Professional curation**: NLM indexers assign MeSH terms

### Finding MeSH Terms

**MeSH Browser**: https://meshb.nlm.nih.gov/search

**Example**:
```
Search: "heart attack"
MeSH term: "Myocardial Infarction"
```

**In PubMed**:
1. Search with a keyword
2. Check "MeSH Terms" in the left sidebar
3. Select relevant MeSH terms
4. Add them to the search

### Using MeSH in Searches

**Basic MeSH search**:
```
"Diabetes Mellitus"[MeSH]
"CRISPR-Cas Systems"[MeSH]
"Alzheimer Disease"[MeSH]
"Neoplasms"[MeSH]
```

**MeSH with subheadings**:
```
"Diabetes Mellitus/drug therapy"[MeSH]
"Neoplasms/genetics"[MeSH]
"Heart Failure/prevention and control"[MeSH]
```

**Common subheadings**:
- `/drug therapy`: Drug treatment
- `/diagnosis`: Diagnostic aspects
- `/genetics`: Genetic aspects
- `/epidemiology`: Occurrence and distribution
- `/prevention and control`: Prevention methods
- `/etiology`: Causes
- `/surgery`: Surgical treatment
- `/metabolism`: Metabolic aspects

### MeSH Explosion

By default, MeSH searches include narrower terms (explosion):

```
"Neoplasms"[MeSH]
# Includes: Breast Neoplasms, Lung Neoplasms, etc.
```

**Disable explosion** (exact term only):
```
"Neoplasms"[MeSH:NoExp]
```

### MeSH Major Topic

Search only where the MeSH term is a major focus:

```
"Diabetes Mellitus"[MeSH Major Topic]
# Only papers where diabetes is the main topic
```
## Field Tags

Field tags specify which part of the record to search.

### Common Field Tags

**Title and Abstract**:
```
cancer[Title]               # In title only
treatment[Title/Abstract]   # In title or abstract
"machine learning"[Title/Abstract]
```

**Author**:
```
"Smith J"[Author]
"Doudna JA"[Author]
"Collins FS"[Author]
```

**Author - Full Name**:
```
"Smith, John"[Full Author Name]
```

**Journal**:
```
"Nature"[Journal]
"Science"[Journal]
"New England Journal of Medicine"[Journal]
"Nat Commun"[Journal]   # Abbreviated form
```

**Publication Date**:
```
2023[Publication Date]
2020:2024[Publication Date]   # Date range
2023/01/01:2023/12/31[Publication Date]
```

**Date Created**:
```
2023[Date - Create]   # When added to PubMed
```

**Publication Type**:
```
"Review"[Publication Type]
"Clinical Trial"[Publication Type]
"Meta-Analysis"[Publication Type]
"Randomized Controlled Trial"[Publication Type]
```

**Language**:
```
English[Language]
French[Language]
```

**DOI**:
```
10.1038/nature12345[DOI]
```

**PMID (PubMed ID)**:
```
12345678[PMID]
```

**Article ID**:
```
PMC1234567[PMC]   # PubMed Central ID
```

### Less Common But Useful Tags

```
humans[MeSH Terms]                      # Only human studies
animals[MeSH Terms]                     # Only animal studies
"United States"[Place of Publication]
nih[Grant Number]                       # NIH-funded research
"Female"[Sex]                           # Female subjects
"Aged, 80 and over"[Age]                # Elderly subjects
```
## Boolean Operators

Combine search terms with Boolean logic.

### AND

Both terms must be present (default behavior):

```
diabetes AND treatment
"CRISPR-Cas9" AND "gene editing"
cancer AND immunotherapy AND "clinical trial"[Publication Type]
```

### OR

Either term must be present:

```
"heart attack" OR "myocardial infarction"
diabetes OR "diabetes mellitus"
CRISPR OR Cas9 OR "gene editing"
```

**Use case**: Synonyms and related terms

### NOT

Exclude terms:

```
cancer NOT review
diabetes NOT animal
"machine learning" NOT "deep learning"
```

**Caution**: May exclude relevant papers that mention both terms.

### Combining Operators

Use parentheses for complex logic:

```
(diabetes OR "diabetes mellitus") AND (treatment OR therapy)

("CRISPR" OR "gene editing") AND ("therapeutic" OR "therapy")
AND 2020:2024[Publication Date]

(cancer OR neoplasm) AND (immunotherapy OR "immune checkpoint inhibitor")
AND ("clinical trial"[Publication Type] OR "randomized controlled trial"[Publication Type])
```
## Advanced Search Builder

**Access**: https://pubmed.ncbi.nlm.nih.gov/advanced/

**Features**:
- Visual query builder
- Add multiple query boxes
- Select field tags from dropdowns
- Combine with AND/OR/NOT
- Preview results
- Shows final query string
- Save queries

**Workflow**:
1. Add search terms in separate boxes
2. Select field tags
3. Choose Boolean operators
4. Preview results
5. Refine as needed
6. Copy final query string
7. Use in scripts or save

**Example built query**:
```
#1: "Diabetes Mellitus, Type 2"[MeSH]
#2: "Metformin"[MeSH]
#3: "Clinical Trial"[Publication Type]
#4: 2020:2024[Publication Date]
#5: #1 AND #2 AND #3 AND #4
```
## Filters and Limits

### Article Types

```
"Review"[Publication Type]
"Systematic Review"[Publication Type]
"Meta-Analysis"[Publication Type]
"Clinical Trial"[Publication Type]
"Randomized Controlled Trial"[Publication Type]
"Case Reports"[Publication Type]
"Comparative Study"[Publication Type]
```

### Species

```
humans[MeSH Terms]
mice[MeSH Terms]
rats[MeSH Terms]
```

### Sex

```
"Female"[MeSH Terms]
"Male"[MeSH Terms]
```

### Age Groups

```
"Infant"[MeSH Terms]
"Child"[MeSH Terms]
"Adolescent"[MeSH Terms]
"Adult"[MeSH Terms]
"Aged"[MeSH Terms]
"Aged, 80 and over"[MeSH Terms]
```

### Text Availability

```
free full text[Filter]   # Free full-text available
```

### Journal Categories

```
"Journal Article"[Publication Type]
```
## E-utilities API

NCBI provides programmatic access via E-utilities (Entrez Programming Utilities).

### Overview

**Base URL**: `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/`

**Main Tools**:
- **ESearch**: Search and retrieve PMIDs
- **EFetch**: Retrieve full records
- **ESummary**: Retrieve document summaries
- **ELink**: Find related articles
- **EInfo**: Database statistics

**No API key is required**, but one is recommended for:
- Higher rate limits (10/sec vs 3/sec)
- Better performance
- Identifying your project to NCBI

**Get an API key**: https://www.ncbi.nlm.nih.gov/account/
### ESearch - Search PubMed

Retrieve PMIDs for a query.

**Endpoint**: `/esearch.fcgi`

**Parameters**:
- `db`: Database (pubmed)
- `term`: Search query
- `retmax`: Maximum results (default 20, max 10000)
- `retstart`: Starting position (for pagination)
- `sort`: Sort order (relevance, pub_date, author)
- `api_key`: Your API key (optional but recommended)

**Example URL**:
```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?
  db=pubmed&
  term=diabetes+AND+treatment&
  retmax=100&
  retmode=json&
  api_key=YOUR_API_KEY
```

**Response**:
```json
{
  "esearchresult": {
    "count": "250000",
    "retmax": "100",
    "idlist": ["12345678", "12345679", ...]
  }
}
```
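As a rough illustration of the same call from Python (the query and limit below are arbitrary examples, not fixed values):

```python
import os
import requests

BASE = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'

def esearch(term: str, retmax: int = 100) -> list:
    """Return PMIDs matching a PubMed query via ESearch."""
    params = {'db': 'pubmed', 'term': term, 'retmax': retmax, 'retmode': 'json'}
    api_key = os.getenv('NCBI_API_KEY')
    if api_key:
        params['api_key'] = api_key  # raises the rate limit to 10 requests/sec
    resp = requests.get(f'{BASE}/esearch.fcgi', params=params, timeout=15)
    resp.raise_for_status()
    return resp.json()['esearchresult']['idlist']

pmids = esearch('"Diabetes Mellitus"[MeSH] AND metformin', retmax=20)
print(len(pmids), pmids[:5])
```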
### EFetch - Retrieve Records

Get full metadata for PMIDs.

**Endpoint**: `/efetch.fcgi`

**Parameters**:
- `db`: Database (pubmed)
- `id`: Comma-separated PMIDs
- `retmode`: Format (xml, json, text)
- `rettype`: Type (abstract, medline, full)
- `api_key`: Your API key

**Example URL**:
```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?
  db=pubmed&
  id=12345678,12345679&
  retmode=xml&
  api_key=YOUR_API_KEY
```

**Response**: XML with complete metadata including:
- Title
- Authors (with affiliations)
- Abstract
- Journal
- Publication date
- DOI
- PMID, PMCID
- MeSH terms
- Keywords
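A minimal sketch of fetching that XML and pulling out titles; the element paths follow the standard PubMed XML layout (the PMID is the placeholder from the examples above):

```python
import requests
import xml.etree.ElementTree as ET

def fetch_titles(pmids: list) -> dict:
    """Map each PMID to its article title via EFetch XML."""
    resp = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params={'db': 'pubmed', 'id': ','.join(pmids), 'retmode': 'xml'},
        timeout=15)
    resp.raise_for_status()
    root = ET.fromstring(resp.content)
    return {article.findtext('.//PMID', ''): article.findtext('.//ArticleTitle', '')
            for article in root.findall('.//PubmedArticle')}

print(fetch_titles(['12345678']))  # placeholder PMID
```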
### ESummary - Get Summaries

A lighter-weight alternative to EFetch.

**Example**:
```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?
  db=pubmed&
  id=12345678&
  retmode=json&
  api_key=YOUR_API_KEY
```

**Returns**: Key metadata without the full abstract and details.
### ELink - Find Related Articles

Find related articles or links to other databases.

**Example**:
```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?
  dbfrom=pubmed&
  db=pubmed&
  id=12345678&
  linkname=pubmed_pubmed_citedin
```

**Link types**:
- `pubmed_pubmed`: Related articles
- `pubmed_pubmed_citedin`: Papers citing this article
- `pubmed_pmc`: PMC full-text versions
- `pubmed_protein`: Related protein records
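The JSON form of that response nests the IDs under `linksets`; a small helper can flatten it (illustrative sketch, using the guide's placeholder PMID):

```python
import requests

def citing_pmids(pmid: str) -> list:
    """Return PMIDs of papers that cite the given article."""
    resp = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi',
        params={'dbfrom': 'pubmed', 'db': 'pubmed', 'id': pmid,
                'linkname': 'pubmed_pubmed_citedin', 'retmode': 'json'},
        timeout=15)
    resp.raise_for_status()
    for linkset in resp.json().get('linksets', []):
        for linksetdb in linkset.get('linksetdbs', []):
            if linksetdb.get('linkname') == 'pubmed_pubmed_citedin':
                return linksetdb.get('links', [])
    return []

print(len(citing_pmids('12345678')))  # placeholder PMID
```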
### Rate Limiting

**Without an API key**:
- 3 requests per second
- Requests beyond the limit are blocked

**With an API key**:
- 10 requests per second
- Better for programmatic access

**Best practice**:
```python
import time

time.sleep(0.34)  # ~3 requests/second
# or
time.sleep(0.11)  # ~10 requests/second with an API key
```
### API Key Usage

**Get an API key**:
1. Create an NCBI account: https://www.ncbi.nlm.nih.gov/account/
2. Settings → API Key Management
3. Create a new API key
4. Copy the key

**Use in requests**:
```
&api_key=YOUR_API_KEY_HERE
```

**Store securely**:
```bash
# In an environment variable
export NCBI_API_KEY="your_key_here"
```

```python
# In a Python script
import os

api_key = os.getenv('NCBI_API_KEY')
```
## Search Strategies

### Comprehensive Systematic Search

For systematic reviews and meta-analyses:

```
# 1. Identify key concepts
Concept 1: Diabetes
Concept 2: Treatment
Concept 3: Outcomes

# 2. Find MeSH terms and synonyms
Concept 1: "Diabetes Mellitus"[MeSH] OR diabetes OR diabetic
Concept 2: "Drug Therapy"[MeSH] OR treatment OR therapy OR medication
Concept 3: "Treatment Outcome"[MeSH] OR outcome OR efficacy OR effectiveness

# 3. Combine with AND
("Diabetes Mellitus"[MeSH] OR diabetes OR diabetic)
AND ("Drug Therapy"[MeSH] OR treatment OR therapy OR medication)
AND ("Treatment Outcome"[MeSH] OR outcome OR efficacy OR effectiveness)

# 4. Add filters
AND 2015:2024[Publication Date]
AND ("Clinical Trial"[Publication Type] OR "Randomized Controlled Trial"[Publication Type])
AND English[Language]
AND humans[MeSH Terms]
```
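When such block queries are assembled in code rather than by hand, a small helper keeps the OR groups readable (illustrative sketch, not one of the repo's scripts):

```python
def or_block(*terms: str) -> str:
    """Join synonyms into one parenthesized OR block."""
    return '(' + ' OR '.join(terms) + ')'

concepts = [
    or_block('"Diabetes Mellitus"[MeSH]', 'diabetes', 'diabetic'),
    or_block('"Drug Therapy"[MeSH]', 'treatment', 'therapy', 'medication'),
    or_block('"Treatment Outcome"[MeSH]', 'outcome', 'efficacy'),
]
filters = ['2015:2024[Publication Date]', 'humans[MeSH Terms]']
print(' AND '.join(concepts + filters))
```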
### Finding Clinical Trials

```
# Specific disease + clinical trials
"Alzheimer Disease"[MeSH]
AND ("Clinical Trial"[Publication Type]
     OR "Randomized Controlled Trial"[Publication Type])
AND 2020:2024[Publication Date]

# Specific drug trials
"Metformin"[MeSH]
AND "Diabetes Mellitus, Type 2"[MeSH]
AND "Randomized Controlled Trial"[Publication Type]
```

### Finding Reviews

```
# Systematic reviews on topic
"CRISPR-Cas Systems"[MeSH]
AND ("Systematic Review"[Publication Type] OR "Meta-Analysis"[Publication Type])

# Reviews in high-impact journals
cancer immunotherapy
AND "Review"[Publication Type]
AND ("Nature"[Journal] OR "Science"[Journal] OR "Cell"[Journal])
```

### Finding Recent Papers

```
# Papers from last year
"machine learning"[Title/Abstract]
AND "drug discovery"[Title/Abstract]
AND 2024[Publication Date]

# Recent papers in specific journal
"CRISPR"[Title/Abstract]
AND "Nature"[Journal]
AND 2023:2024[Publication Date]
```

### Author Tracking

```
# Specific author's recent work
"Doudna JA"[Author] AND 2020:2024[Publication Date]

# Author + topic
"Church GM"[Author] AND "synthetic biology"[Title/Abstract]
```

### High-Quality Evidence

```
# Meta-analyses and systematic reviews
(diabetes OR "diabetes mellitus")
AND (treatment OR therapy)
AND ("Meta-Analysis"[Publication Type] OR "Systematic Review"[Publication Type])

# RCTs only
cancer immunotherapy
AND "Randomized Controlled Trial"[Publication Type]
AND 2020:2024[Publication Date]
```
## Script Integration

### search_pubmed.py Usage

**Basic search**:
```bash
python scripts/search_pubmed.py "diabetes treatment"
```

**With MeSH terms**:
```bash
python scripts/search_pubmed.py \
    --query '"Diabetes Mellitus"[MeSH] AND "Drug Therapy"[MeSH]'
```

**Date range filter**:
```bash
python scripts/search_pubmed.py "CRISPR" \
    --date-start 2020-01-01 \
    --date-end 2024-12-31 \
    --limit 200
```

**Publication type filter**:
```bash
python scripts/search_pubmed.py "cancer immunotherapy" \
    --publication-types "Clinical Trial,Randomized Controlled Trial" \
    --limit 100
```

**Export to BibTeX**:
```bash
python scripts/search_pubmed.py "Alzheimer's disease" \
    --limit 100 \
    --format bibtex \
    --output alzheimers.bib
```

**Complex query from file**:
```bash
# Save complex query in query.txt
cat > query.txt << 'EOF'
("Diabetes Mellitus, Type 2"[MeSH] OR "diabetes"[Title/Abstract])
AND ("Metformin"[MeSH] OR "metformin"[Title/Abstract])
AND "Randomized Controlled Trial"[Publication Type]
AND 2015:2024[Publication Date]
AND English[Language]
EOF

# Run search
python scripts/search_pubmed.py --query-file query.txt --limit 500
```

### Batch Searches

```bash
# Search multiple topics
TOPICS=("diabetes treatment" "cancer immunotherapy" "CRISPR gene editing")

for topic in "${TOPICS[@]}"; do
    python scripts/search_pubmed.py "$topic" \
        --limit 100 \
        --output "${topic// /_}.json"
    sleep 1
done
```

### Extract Metadata

```bash
# Search returns PMIDs
python scripts/search_pubmed.py "topic" --output results.json

# Extract full metadata
python scripts/extract_metadata.py \
    --input results.json \
    --output references.bib
```
## Tips and Best Practices

### Search Construction

1. **Start with MeSH terms**:
   - Use the MeSH Browser to find correct terms
   - More precise than keyword search
   - Captures all papers on topic regardless of terminology

2. **Include text word variants**:
   ```
   # Better coverage
   ("Diabetes Mellitus"[MeSH] OR diabetes OR diabetic)
   ```

3. **Use field tags appropriately**:
   - `[MeSH]` for standardized concepts
   - `[Title/Abstract]` for specific terms
   - `[Author]` for known authors
   - `[Journal]` for specific venues

4. **Build incrementally**:
   ```
   # Step 1: Basic search
   diabetes

   # Step 2: Add specificity
   "Diabetes Mellitus, Type 2"[MeSH]

   # Step 3: Add treatment
   "Diabetes Mellitus, Type 2"[MeSH] AND "Metformin"[MeSH]

   # Step 4: Add study type
   "Diabetes Mellitus, Type 2"[MeSH] AND "Metformin"[MeSH]
   AND "Clinical Trial"[Publication Type]

   # Step 5: Add date range
   ... AND 2020:2024[Publication Date]
   ```
### Optimizing Results

1. **Too many results**: Add filters
   - Restrict publication type
   - Narrow date range
   - Add more specific MeSH terms
   - Use Major Topic: `[MeSH Major Topic]`

2. **Too few results**: Broaden search
   - Remove restrictive filters
   - Use OR for synonyms
   - Expand date range
   - Use MeSH explosion (default)

3. **Irrelevant results**: Refine terms
   - Use more specific MeSH terms
   - Add exclusions with NOT
   - Use Title field instead of all fields
   - Add MeSH subheadings

### Quality Control

1. **Document search strategy**:
   - Save exact query string
   - Record search date
   - Note number of results
   - Save filters used

2. **Export systematically**:
   - Use consistent file naming
   - Export to JSON for flexibility
   - Convert to BibTeX as needed
   - Keep original search results

3. **Validate retrieved citations**:
   ```bash
   python scripts/validate_citations.py pubmed_results.bib
   ```

### Staying Current

1. **Set up search alerts**:
   - PubMed → Save search
   - Receive email updates
   - Daily, weekly, or monthly

2. **Track specific journals**:
   ```
   "Nature"[Journal] AND CRISPR[Title]
   ```

3. **Follow key authors**:
   ```
   "Church GM"[Author]
   ```
## Common Issues and Solutions

### Issue: MeSH Term Not Found

**Solution**:
- Check spelling
- Use the MeSH Browser
- Try related terms
- Use text word search as a fallback

### Issue: Zero Results

**Solution**:
- Remove filters
- Check query syntax
- Use OR for a broader search
- Try synonyms

### Issue: Poor Quality Results

**Solution**:
- Add publication type filters
- Restrict to recent years
- Use MeSH Major Topic
- Filter by journal quality

### Issue: Duplicates from Different Sources

**Solution**:
```bash
python scripts/format_bibtex.py results.bib \
    --deduplicate \
    --output clean.bib
```

### Issue: API Rate Limiting

**Solution**:
- Get an API key (increases the limit to 10/sec)
- Add delays in scripts
- Process in batches
- Use off-peak hours
## Summary

PubMed provides authoritative biomedical literature search:

✓ **Curated content**: MeSH indexing, quality control
✓ **Precise search**: Field tags, MeSH terms, filters
✓ **Programmatic access**: E-utilities API
✓ **Free access**: No subscription required
✓ **Comprehensive**: 35M+ citations, daily updates

Key strategies:
- Use MeSH terms for precise searching
- Combine with text words for comprehensive coverage
- Apply appropriate field tags
- Filter by publication type and date
- Use the E-utilities API for automation
- Document the search strategy for reproducibility

For broader coverage across disciplines, complement with Google Scholar.
204
skills/citation-management/scripts/doi_to_bibtex.py
Normal file
@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
DOI to BibTeX Converter
Quick utility to convert DOIs to BibTeX format using CrossRef API.
"""

import sys
import requests
import argparse
import time
import json
from typing import Optional, List


class DOIConverter:
    """Convert DOIs to BibTeX entries using CrossRef API."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'DOIConverter/1.0 (Citation Management Tool; mailto:support@example.com)'
        })

    def doi_to_bibtex(self, doi: str) -> Optional[str]:
        """
        Convert a single DOI to BibTeX format.

        Args:
            doi: Digital Object Identifier

        Returns:
            BibTeX string or None if conversion fails
        """
        # Clean DOI (remove URL prefix if present)
        doi = doi.strip()
        if doi.startswith('https://doi.org/'):
            doi = doi.replace('https://doi.org/', '')
        elif doi.startswith('http://doi.org/'):
            doi = doi.replace('http://doi.org/', '')
        elif doi.startswith('doi:'):
            doi = doi.replace('doi:', '')

        # Request BibTeX from CrossRef content negotiation
        url = f'https://doi.org/{doi}'
        headers = {
            'Accept': 'application/x-bibtex',
            'User-Agent': 'DOIConverter/1.0 (Citation Management Tool)'
        }

        try:
            response = self.session.get(url, headers=headers, timeout=15)

            if response.status_code == 200:
                bibtex = response.text.strip()
                # CrossRef sometimes returns entries with @data type, convert to @misc
                if bibtex.startswith('@data{'):
                    bibtex = bibtex.replace('@data{', '@misc{', 1)
                return bibtex
            elif response.status_code == 404:
                print(f'Error: DOI not found: {doi}', file=sys.stderr)
                return None
            else:
                print(f'Error: Failed to retrieve BibTeX for {doi} (status {response.status_code})', file=sys.stderr)
                return None

        except requests.exceptions.Timeout:
            print(f'Error: Request timeout for DOI: {doi}', file=sys.stderr)
            return None
        except requests.exceptions.RequestException as e:
            print(f'Error: Request failed for {doi}: {e}', file=sys.stderr)
            return None

    def convert_multiple(self, dois: List[str], delay: float = 0.5) -> List[str]:
        """
        Convert multiple DOIs to BibTeX.

        Args:
            dois: List of DOIs
            delay: Delay between requests (seconds) for rate limiting

        Returns:
            List of BibTeX entries (excludes failed conversions)
        """
        bibtex_entries = []

        for i, doi in enumerate(dois):
            print(f'Converting DOI {i+1}/{len(dois)}: {doi}', file=sys.stderr)
            bibtex = self.doi_to_bibtex(doi)

            if bibtex:
                bibtex_entries.append(bibtex)

            # Rate limiting
            if i < len(dois) - 1:  # Don't delay after last request
                time.sleep(delay)

        return bibtex_entries


def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Convert DOIs to BibTeX format using CrossRef API',
        epilog='Example: python doi_to_bibtex.py 10.1038/s41586-021-03819-2'
    )

    parser.add_argument(
        'dois',
        nargs='*',
        help='DOI(s) to convert (can provide multiple)'
    )

    parser.add_argument(
        '-i', '--input',
        help='Input file with DOIs (one per line)'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file for BibTeX (default: stdout)'
    )

    parser.add_argument(
        '--delay',
        type=float,
        default=0.5,
        help='Delay between requests in seconds (default: 0.5)'
    )

    parser.add_argument(
        '--format',
        choices=['bibtex', 'json'],
        default='bibtex',
        help='Output format (default: bibtex)'
    )

    args = parser.parse_args()

    # Collect DOIs from command line and/or file
    dois = []

    if args.dois:
        dois.extend(args.dois)

    if args.input:
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                file_dois = [line.strip() for line in f if line.strip()]
                dois.extend(file_dois)
        except FileNotFoundError:
            print(f'Error: Input file not found: {args.input}', file=sys.stderr)
            sys.exit(1)
        except Exception as e:
            print(f'Error reading input file: {e}', file=sys.stderr)
            sys.exit(1)

    if not dois:
        parser.print_help()
        sys.exit(1)

    # Convert DOIs
    converter = DOIConverter()

    if len(dois) == 1:
        bibtex = converter.doi_to_bibtex(dois[0])
        if bibtex:
            bibtex_entries = [bibtex]
        else:
            sys.exit(1)
    else:
        bibtex_entries = converter.convert_multiple(dois, delay=args.delay)

    if not bibtex_entries:
        print('Error: No successful conversions', file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == 'bibtex':
        output = '\n\n'.join(bibtex_entries) + '\n'
    else:  # json
        output = json.dumps({
            'count': len(bibtex_entries),
            'entries': bibtex_entries
        }, indent=2)

    # Write output
    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(output)
            print(f'Successfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
        except Exception as e:
            print(f'Error writing output file: {e}', file=sys.stderr)
            sys.exit(1)
    else:
        print(output)

    # Summary
    if len(dois) > 1:
        success_rate = len(bibtex_entries) / len(dois) * 100
        print(f'\nConverted {len(bibtex_entries)}/{len(dois)} DOIs ({success_rate:.1f}%)', file=sys.stderr)


if __name__ == '__main__':
    main()
569
skills/citation-management/scripts/extract_metadata.py
Executable file
@@ -0,0 +1,569 @@
#!/usr/bin/env python3
"""
Metadata Extraction Tool
Extract citation metadata from DOI, PMID, arXiv ID, or URL using various APIs.
"""

import sys
import os
import requests
import argparse
import time
import re
import json
import xml.etree.ElementTree as ET
from typing import Optional, Dict, List, Tuple
from urllib.parse import urlparse


class MetadataExtractor:
    """Extract metadata from various sources and generate BibTeX."""

    def __init__(self, email: Optional[str] = None):
        """
        Initialize extractor.

        Args:
            email: Email for Entrez API (recommended for PubMed)
        """
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'MetadataExtractor/1.0 (Citation Management Tool)'
        })
        self.email = email or os.getenv('NCBI_EMAIL', '')

    def identify_type(self, identifier: str) -> Tuple[str, str]:
        """
        Identify the type of identifier.

        Args:
            identifier: DOI, PMID, arXiv ID, or URL

        Returns:
            Tuple of (type, cleaned_identifier)
        """
        identifier = identifier.strip()

        # Check if URL
        if identifier.startswith('http://') or identifier.startswith('https://'):
            return self._parse_url(identifier)

        # Check for DOI
        if identifier.startswith('10.'):
            return ('doi', identifier)

        # Check for arXiv ID
        if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', identifier):
            return ('arxiv', identifier)
        if identifier.startswith('arXiv:'):
            return ('arxiv', identifier.replace('arXiv:', ''))

        # Check for PMID (8-digit number typically)
        if identifier.isdigit() and len(identifier) >= 7:
            return ('pmid', identifier)

        # Check for PMCID
        if identifier.upper().startswith('PMC') and identifier[3:].isdigit():
            return ('pmcid', identifier.upper())

        return ('unknown', identifier)

    def _parse_url(self, url: str) -> Tuple[str, str]:
        """Parse URL to extract identifier type and value."""
        parsed = urlparse(url)

        # DOI URLs
        if 'doi.org' in parsed.netloc:
            doi = parsed.path.lstrip('/')
            return ('doi', doi)

        # PubMed URLs
        if 'pubmed.ncbi.nlm.nih.gov' in parsed.netloc or 'ncbi.nlm.nih.gov/pubmed' in url:
            pmid = re.search(r'/(\d+)', parsed.path)
            if pmid:
                return ('pmid', pmid.group(1))

        # arXiv URLs
        if 'arxiv.org' in parsed.netloc:
            arxiv_id = re.search(r'/abs/(\d{4}\.\d{4,5})', parsed.path)
            if arxiv_id:
                return ('arxiv', arxiv_id.group(1))

        # Nature, Science, Cell, etc. - try to extract DOI from URL
        doi_match = re.search(r'10\.\d{4,}/[^\s/]+', url)
        if doi_match:
            return ('doi', doi_match.group())

        return ('url', url)

    def extract_from_doi(self, doi: str) -> Optional[Dict]:
        """
        Extract metadata from DOI using CrossRef API.

        Args:
            doi: Digital Object Identifier

        Returns:
            Metadata dictionary or None
        """
        url = f'https://api.crossref.org/works/{doi}'

        try:
            response = self.session.get(url, timeout=15)

            if response.status_code == 200:
                data = response.json()
                message = data.get('message', {})

                metadata = {
                    'type': 'doi',
                    'entry_type': self._crossref_type_to_bibtex(message.get('type')),
                    'doi': doi,
                    'title': message.get('title', [''])[0],
                    'authors': self._format_authors_crossref(message.get('author', [])),
                    'year': self._extract_year_crossref(message),
                    'journal': message.get('container-title', [''])[0] if message.get('container-title') else '',
                    'volume': str(message.get('volume', '')) if message.get('volume') else '',
                    'issue': str(message.get('issue', '')) if message.get('issue') else '',
                    'pages': message.get('page', ''),
                    'publisher': message.get('publisher', ''),
                    'url': f'https://doi.org/{doi}'
                }

                return metadata
            else:
                print(f'Error: CrossRef API returned status {response.status_code} for DOI: {doi}', file=sys.stderr)
                return None

        except Exception as e:
            print(f'Error extracting metadata from DOI {doi}: {e}', file=sys.stderr)
            return None

    def extract_from_pmid(self, pmid: str) -> Optional[Dict]:
        """
        Extract metadata from PMID using PubMed E-utilities.

        Args:
            pmid: PubMed ID

        Returns:
            Metadata dictionary or None
        """
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        params = {
            'db': 'pubmed',
            'id': pmid,
            'retmode': 'xml',
            'rettype': 'abstract'
        }

        if self.email:
            params['email'] = self.email

        api_key = os.getenv('NCBI_API_KEY')
        if api_key:
            params['api_key'] = api_key

        try:
            response = self.session.get(url, params=params, timeout=15)

            if response.status_code == 200:
                root = ET.fromstring(response.content)
                article = root.find('.//PubmedArticle')

                if article is None:
                    print(f'Error: No article found for PMID: {pmid}', file=sys.stderr)
                    return None

                # Extract metadata from XML
                medline_citation = article.find('.//MedlineCitation')
                article_elem = medline_citation.find('.//Article')
                journal = article_elem.find('.//Journal')

                # Get DOI if available
                doi = None
                article_ids = article.findall('.//ArticleId')
                for article_id in article_ids:
                    if article_id.get('IdType') == 'doi':
                        doi = article_id.text
                        break

                metadata = {
                    'type': 'pmid',
                    'entry_type': 'article',
                    'pmid': pmid,
                    'title': article_elem.findtext('.//ArticleTitle', ''),
                    'authors': self._format_authors_pubmed(article_elem.findall('.//Author')),
                    'year': self._extract_year_pubmed(article_elem),
                    'journal': journal.findtext('.//Title', ''),
                    'volume': journal.findtext('.//JournalIssue/Volume', ''),
                    'issue': journal.findtext('.//JournalIssue/Issue', ''),
                    'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
                    'doi': doi
                }

                return metadata
            else:
                print(f'Error: PubMed API returned status {response.status_code} for PMID: {pmid}', file=sys.stderr)
                return None

        except Exception as e:
            print(f'Error extracting metadata from PMID {pmid}: {e}', file=sys.stderr)
            return None

    def extract_from_arxiv(self, arxiv_id: str) -> Optional[Dict]:
        """
        Extract metadata from arXiv ID using arXiv API.

        Args:
            arxiv_id: arXiv identifier

        Returns:
            Metadata dictionary or None
        """
        url = 'http://export.arxiv.org/api/query'
        params = {
            'id_list': arxiv_id,
            'max_results': 1
        }

        try:
            response = self.session.get(url, params=params, timeout=15)

            if response.status_code == 200:
                # Parse Atom XML
                root = ET.fromstring(response.content)
                ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}

                entry = root.find('atom:entry', ns)
                if entry is None:
                    print(f'Error: No entry found for arXiv ID: {arxiv_id}', file=sys.stderr)
                    return None

                # Extract DOI if published
                doi_elem = entry.find('arxiv:doi', ns)
                doi = doi_elem.text if doi_elem is not None else None

                # Extract journal reference if published
                journal_ref_elem = entry.find('arxiv:journal_ref', ns)
                journal_ref = journal_ref_elem.text if journal_ref_elem is not None else None

                # Get publication date
                published = entry.findtext('atom:published', '', ns)
                year = published[:4] if published else ''

                # Get authors
                authors = []
                for author in entry.findall('atom:author', ns):
                    name = author.findtext('atom:name', '', ns)
                    if name:
                        authors.append(name)

                metadata = {
                    'type': 'arxiv',
                    'entry_type': 'misc' if not doi else 'article',
                    'arxiv_id': arxiv_id,
                    'title': entry.findtext('atom:title', '', ns).strip().replace('\n', ' '),
                    'authors': ' and '.join(authors),
                    'year': year,
                    'doi': doi,
                    'journal_ref': journal_ref,
                    'abstract': entry.findtext('atom:summary', '', ns).strip().replace('\n', ' '),
                    'url': f'https://arxiv.org/abs/{arxiv_id}'
                }

                return metadata
            else:
                print(f'Error: arXiv API returned status {response.status_code} for ID: {arxiv_id}', file=sys.stderr)
                return None

        except Exception as e:
            print(f'Error extracting metadata from arXiv {arxiv_id}: {e}', file=sys.stderr)
            return None

    def metadata_to_bibtex(self, metadata: Dict, citation_key: Optional[str] = None) -> str:
        """
        Convert metadata dictionary to BibTeX format.

        Args:
            metadata: Metadata dictionary
            citation_key: Optional custom citation key

        Returns:
            BibTeX string
        """
        if not citation_key:
            citation_key = self._generate_citation_key(metadata)

        entry_type = metadata.get('entry_type', 'misc')

        # Build BibTeX entry
        lines = [f'@{entry_type}{{{citation_key},']

        # Add fields
        if metadata.get('authors'):
            lines.append(f'  author = {{{metadata["authors"]}}},')

        if metadata.get('title'):
            # Protect capitalization
            title = self._protect_title(metadata['title'])
            lines.append(f'  title = {{{title}}},')

        if entry_type == 'article' and metadata.get('journal'):
            lines.append(f'  journal = {{{metadata["journal"]}}},')
        elif entry_type == 'misc' and metadata.get('type') == 'arxiv':
            lines.append('  howpublished = {arXiv},')

        if metadata.get('year'):
            lines.append(f'  year = {{{metadata["year"]}}},')

        if metadata.get('volume'):
            lines.append(f'  volume = {{{metadata["volume"]}}},')

        if metadata.get('issue'):
            lines.append(f'  number = {{{metadata["issue"]}}},')

        if metadata.get('pages'):
            pages = metadata['pages']
            # Convert single hyphen to BibTeX's -- only when not already done
            if re.search(r'\d-\d', pages) and '--' not in pages:
                pages = pages.replace('-', '--')
            lines.append(f'  pages = {{{pages}}},')

        if metadata.get('doi'):
            lines.append(f'  doi = {{{metadata["doi"]}}},')
        elif metadata.get('url'):
            lines.append(f'  url = {{{metadata["url"]}}},')

        if metadata.get('pmid'):
            lines.append(f'  note = {{PMID: {metadata["pmid"]}}},')

        if metadata.get('type') == 'arxiv' and not metadata.get('doi'):
            lines.append('  note = {Preprint},')

        # Remove trailing comma from last field
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)

    def _crossref_type_to_bibtex(self, crossref_type: str) -> str:
        """Map CrossRef type to BibTeX entry type."""
        type_map = {
            'journal-article': 'article',
            'book': 'book',
            'book-chapter': 'incollection',
            'proceedings-article': 'inproceedings',
            'posted-content': 'misc',
            'dataset': 'misc',
            'report': 'techreport'
        }
        return type_map.get(crossref_type, 'misc')

    def _format_authors_crossref(self, authors: List[Dict]) -> str:
        """Format author list from CrossRef data."""
        if not authors:
            return ''

        formatted = []
        for author in authors:
            given = author.get('given', '')
            family = author.get('family', '')
            if family:
                if given:
                    formatted.append(f'{family}, {given}')
                else:
                    formatted.append(family)

        return ' and '.join(formatted)

    def _format_authors_pubmed(self, authors: List) -> str:
        """Format author list from PubMed XML."""
        formatted = []
        for author in authors:
            last_name = author.findtext('.//LastName', '')
            fore_name = author.findtext('.//ForeName', '')
            if last_name:
                if fore_name:
                    formatted.append(f'{last_name}, {fore_name}')
                else:
                    formatted.append(last_name)

        return ' and '.join(formatted)

    def _extract_year_crossref(self, message: Dict) -> str:
        """Extract year from CrossRef message."""
        # Try published-print first, then published-online
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])

        if date_parts and date_parts[0]:
            return str(date_parts[0][0])
        return ''

    def _extract_year_pubmed(self, article: ET.Element) -> str:
        """Extract year from PubMed XML."""
        year = article.findtext('.//Journal/JournalIssue/PubDate/Year', '')
        if not year:
            medline_date = article.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
            if medline_date:
                year_match = re.search(r'\d{4}', medline_date)
                if year_match:
                    year = year_match.group()
        return year

    def _generate_citation_key(self, metadata: Dict) -> str:
        """Generate a citation key from metadata."""
        # Get first author last name
        authors = metadata.get('authors', '')
        if authors:
            first_author = authors.split(' and ')[0]
            if ',' in first_author:
                last_name = first_author.split(',')[0].strip()
            else:
                last_name = first_author.split()[-1] if first_author else 'Unknown'
        else:
            last_name = 'Unknown'

        # Get year
        year = metadata.get('year', '').strip()
        if not year:
            year = 'XXXX'

        # Clean last name (remove special characters)
        last_name = re.sub(r'[^a-zA-Z]', '', last_name)

        # Get keyword from title
        title = metadata.get('title', '')
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
        keyword = words[0].lower() if words else 'paper'

        return f'{last_name}{year}{keyword}'

    def _protect_title(self, title: str) -> str:
        """Protect capitalization in title for BibTeX."""
        # Protect common acronyms and proper nouns
        protected_words = [
            'DNA', 'RNA', 'CRISPR', 'COVID', 'HIV', 'AIDS', 'AlphaFold',
            'Python', 'AI', 'ML', 'GPU', 'CPU', 'USA', 'UK', 'EU'
        ]

        for word in protected_words:
            title = re.sub(rf'\b{word}\b', f'{{{word}}}', title, flags=re.IGNORECASE)

        return title

    def extract(self, identifier: str) -> Optional[str]:
        """
        Extract metadata and return BibTeX.

        Args:
            identifier: DOI, PMID, arXiv ID, or URL

        Returns:
            BibTeX string or None
        """
        id_type, clean_id = self.identify_type(identifier)

        print(f'Identified as {id_type}: {clean_id}', file=sys.stderr)

        metadata = None

        if id_type == 'doi':
            metadata = self.extract_from_doi(clean_id)
        elif id_type == 'pmid':
            metadata = self.extract_from_pmid(clean_id)
        elif id_type == 'arxiv':
            metadata = self.extract_from_arxiv(clean_id)
        else:
            print(f'Error: Unknown identifier type: {identifier}', file=sys.stderr)
            return None

        if metadata:
            return self.metadata_to_bibtex(metadata)
        else:
            return None


def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Extract citation metadata from DOI, PMID, arXiv ID, or URL',
        epilog='Example: python extract_metadata.py --doi 10.1038/s41586-021-03819-2'
    )

    parser.add_argument('--doi', help='Digital Object Identifier')
    parser.add_argument('--pmid', help='PubMed ID')
    parser.add_argument('--arxiv', help='arXiv ID')
    parser.add_argument('--url', help='URL to article')
    parser.add_argument('-i', '--input', help='Input file with identifiers (one per line)')
    parser.add_argument('-o', '--output', help='Output file for BibTeX (default: stdout)')
    parser.add_argument('--format', choices=['bibtex', 'json'], default='bibtex', help='Output format')
    parser.add_argument('--email', help='Email for NCBI E-utilities (recommended)')

    args = parser.parse_args()

    # Collect identifiers
    identifiers = []
    if args.doi:
        identifiers.append(args.doi)
    if args.pmid:
        identifiers.append(args.pmid)
    if args.arxiv:
        identifiers.append(args.arxiv)
    if args.url:
        identifiers.append(args.url)

    if args.input:
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                file_ids = [line.strip() for line in f if line.strip()]
                identifiers.extend(file_ids)
        except Exception as e:
            print(f'Error reading input file: {e}', file=sys.stderr)
            sys.exit(1)

    if not identifiers:
        parser.print_help()
        sys.exit(1)

    # Extract metadata
    extractor = MetadataExtractor(email=args.email)
    bibtex_entries = []

    for i, identifier in enumerate(identifiers):
        print(f'\nProcessing {i+1}/{len(identifiers)}...', file=sys.stderr)
        bibtex = extractor.extract(identifier)
        if bibtex:
            bibtex_entries.append(bibtex)

        # Rate limiting
        if i < len(identifiers) - 1:
            time.sleep(0.5)

    if not bibtex_entries:
        print('Error: No successful extractions', file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == 'bibtex':
        output = '\n\n'.join(bibtex_entries) + '\n'
    else:  # json
        output = json.dumps({
            'count': len(bibtex_entries),
            'entries': bibtex_entries
        }, indent=2)

    # Write output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'\nSuccessfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
    else:
        print(output)

    print(f'\nExtracted {len(bibtex_entries)}/{len(identifiers)} entries', file=sys.stderr)


if __name__ == '__main__':
    main()
349
skills/citation-management/scripts/format_bibtex.py
Executable file
@@ -0,0 +1,349 @@
#!/usr/bin/env python3
|
||||
"""
|
||||
BibTeX Formatter and Cleaner
|
||||
Format, clean, sort, and deduplicate BibTeX files.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import argparse
|
||||
from typing import List, Dict, Tuple
|
||||
from collections import OrderedDict
|
||||
|
||||
class BibTeXFormatter:
|
||||
"""Format and clean BibTeX entries."""
|
||||
|
||||
def __init__(self):
|
||||
# Standard field order for readability
|
||||
self.field_order = [
|
||||
'author', 'editor', 'title', 'booktitle', 'journal',
|
||||
'year', 'month', 'volume', 'number', 'pages',
|
||||
'publisher', 'address', 'edition', 'series',
|
||||
'school', 'institution', 'organization',
|
||||
'howpublished', 'doi', 'url', 'isbn', 'issn',
|
||||
'note', 'abstract', 'keywords'
|
||||
]
|
||||
|
||||
def parse_bibtex_file(self, filepath: str) -> List[Dict]:
|
||||
"""
|
||||
Parse BibTeX file and extract entries.
|
||||
|
||||
Args:
|
||||
filepath: Path to BibTeX file
|
||||
|
||||
Returns:
|
||||
List of entry dictionaries
|
||||
"""
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
except Exception as e:
|
||||
print(f'Error reading file: {e}', file=sys.stderr)
|
||||
return []
|
||||
|
||||
entries = []
|
||||
|
||||
# Match BibTeX entries
|
||||
pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
|
||||
matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)
|
||||
|
||||
for match in matches:
|
||||
entry_type = match.group(1).lower()
|
||||
citation_key = match.group(2).strip()
|
||||
fields_text = match.group(3)
|
||||
|
||||
# Parse fields
|
||||
fields = OrderedDict()
|
||||
field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
|
||||
field_matches = re.finditer(field_pattern, fields_text)
|
||||
|
||||
for field_match in field_matches:
|
||||
if field_match.group(1):
|
||||
field_name = field_match.group(1).lower()
|
||||
field_value = field_match.group(2)
|
||||
else:
|
||||
field_name = field_match.group(3).lower()
|
||||
field_value = field_match.group(4)
|
||||
|
||||
fields[field_name] = field_value.strip()
|
||||
|
||||
entries.append({
|
||||
'type': entry_type,
|
||||
'key': citation_key,
|
||||
'fields': fields
|
||||
})
|
||||
|
||||
return entries
|
||||
|
||||
def format_entry(self, entry: Dict) -> str:
|
||||
"""
|
||||
Format a single BibTeX entry.
|
||||
|
||||
Args:
|
||||
entry: Entry dictionary
|
||||
|
||||
Returns:
|
||||
Formatted BibTeX string
|
||||
"""
|
||||
lines = [f'@{entry["type"]}{{{entry["key"]},']
|
||||
|
||||
# Order fields according to standard order
|
||||
ordered_fields = OrderedDict()
|
||||
|
||||
# Add fields in standard order
|
||||
for field_name in self.field_order:
|
||||
if field_name in entry['fields']:
|
||||
ordered_fields[field_name] = entry['fields'][field_name]
|
||||
|
||||
# Add any remaining fields
|
||||
for field_name, field_value in entry['fields'].items():
|
||||
if field_name not in ordered_fields:
|
||||
ordered_fields[field_name] = field_value
|
||||
|
||||
# Format each field
|
||||
max_field_len = max(len(f) for f in ordered_fields.keys()) if ordered_fields else 0
|
||||
|
||||
for field_name, field_value in ordered_fields.items():
|
||||
# Pad field name for alignment
|
||||
padded_field = field_name.ljust(max_field_len)
|
||||
lines.append(f' {padded_field} = {{{field_value}}},')
|
||||
|
||||
# Remove trailing comma from last field
|
||||
if lines[-1].endswith(','):
|
||||
lines[-1] = lines[-1][:-1]
|
||||
|
||||
lines.append('}')
|
||||
|
||||
return '\n'.join(lines)
|
||||
|
||||
def fix_common_issues(self, entry: Dict) -> Dict:
|
||||
"""
|
||||
Fix common formatting issues in entry.
|
||||
|
||||
Args:
|
||||
entry: Entry dictionary
|
||||
|
||||
Returns:
|
||||
Fixed entry dictionary
|
||||
"""
|
||||
fixed = entry.copy()
|
||||
fields = fixed['fields'].copy()
|
||||
|
||||
# Fix page ranges (single hyphen to double hyphen)
|
||||
if 'pages' in fields:
|
||||
pages = fields['pages']
|
||||
# Replace single hyphen with double hyphen if it's a range
|
||||
if re.search(r'\d-\d', pages) and '--' not in pages:
|
||||
pages = re.sub(r'(\d)-(\d)', r'\1--\2', pages)
|
||||
fields['pages'] = pages
|
||||
|
||||
# Remove "pp." from pages
|
||||
if 'pages' in fields:
|
||||
pages = fields['pages']
|
||||
pages = re.sub(r'^pp\.\s*', '', pages, flags=re.IGNORECASE)
|
||||
fields['pages'] = pages
|
||||
|
||||
# Fix DOI (remove URL prefix if present)
|
||||
if 'doi' in fields:
|
||||
doi = fields['doi']
|
||||
doi = doi.replace('https://doi.org/', '')
|
||||
doi = doi.replace('http://doi.org/', '')
|
||||
doi = doi.replace('doi:', '')
|
||||
fields['doi'] = doi
|
||||
|
||||
# Fix author separators (semicolon or ampersand to 'and')
|
||||
if 'author' in fields:
|
||||
author = fields['author']
|
||||
author = author.replace(';', ' and')
|
||||
author = author.replace(' & ', ' and ')
|
||||
# Clean up multiple 'and's
|
||||
author = re.sub(r'\s+and\s+and\s+', ' and ', author)
|
||||
fields['author'] = author
|
||||
|
||||
fixed['fields'] = fields
|
||||
return fixed
|
||||
|
||||
def deduplicate_entries(self, entries: List[Dict]) -> List[Dict]:
|
||||
"""
|
||||
Remove duplicate entries based on DOI or citation key.
|
||||
|
||||
Args:
|
||||
entries: List of entry dictionaries
|
||||
|
||||
Returns:
|
||||
List of unique entries
|
||||
"""
|
||||
seen_dois = set()
|
||||
seen_keys = set()
|
||||
unique_entries = []
|
||||
|
||||
for entry in entries:
|
||||
doi = entry['fields'].get('doi', '').strip()
|
||||
key = entry['key']
|
||||
|
||||
# Check DOI first (more reliable)
|
||||
if doi:
|
||||
if doi in seen_dois:
|
||||
print(f'Duplicate DOI found: {doi} (skipping {key})', file=sys.stderr)
|
||||
continue
|
||||
seen_dois.add(doi)
|
||||
|
||||
# Check citation key
|
||||
if key in seen_keys:
|
||||
print(f'Duplicate citation key found: {key} (skipping)', file=sys.stderr)
|
||||
continue
|
||||
seen_keys.add(key)
|
||||
|
||||
unique_entries.append(entry)
|
||||
|
||||
return unique_entries

    def sort_entries(self, entries: List[Dict], sort_by: str = 'key', descending: bool = False) -> List[Dict]:
        """
        Sort entries by the specified field.

        Args:
            entries: List of entry dictionaries
            sort_by: Field to sort by ('key', 'year', 'author', 'title')
            descending: Sort in descending order

        Returns:
            Sorted list of entries
        """
        def get_sort_key(entry: Dict) -> str:
            if sort_by == 'key':
                return entry['key'].lower()
            elif sort_by == 'year':
                return entry['fields'].get('year', '9999')
            elif sort_by == 'author':
                # Sort by the last name of the first author
                first_author = entry['fields'].get('author', '').split(' and ')[0].strip()
                if not first_author:
                    return 'zzz'
                if ',' in first_author:
                    return first_author.split(',')[0].strip().lower()
                return first_author.split()[-1].lower()
            elif sort_by == 'title':
                return entry['fields'].get('title', '').lower()
            else:
                return entry['key'].lower()

        return sorted(entries, key=get_sort_key, reverse=descending)

    def format_file(self, filepath: str, output: str = None,
                    deduplicate: bool = False, sort_by: str = None,
                    descending: bool = False, fix_issues: bool = True) -> None:
        """
        Format an entire BibTeX file.

        Args:
            filepath: Input BibTeX file
            output: Output file (None for in-place)
            deduplicate: Remove duplicates
            sort_by: Field to sort by
            descending: Sort in descending order
            fix_issues: Fix common formatting issues
        """
        print(f'Parsing {filepath}...', file=sys.stderr)
        entries = self.parse_bibtex_file(filepath)

        if not entries:
            print('No entries found', file=sys.stderr)
            return

        print(f'Found {len(entries)} entries', file=sys.stderr)

        # Fix common issues
        if fix_issues:
            print('Fixing common issues...', file=sys.stderr)
            entries = [self.fix_common_issues(e) for e in entries]

        # Deduplicate
        if deduplicate:
            print('Removing duplicates...', file=sys.stderr)
            original_count = len(entries)
            entries = self.deduplicate_entries(entries)
            removed = original_count - len(entries)
            if removed > 0:
                print(f'Removed {removed} duplicate(s)', file=sys.stderr)

        # Sort
        if sort_by:
            print(f'Sorting by {sort_by}...', file=sys.stderr)
            entries = self.sort_entries(entries, sort_by, descending)

        # Format entries
        print('Formatting entries...', file=sys.stderr)
        formatted_entries = [self.format_entry(e) for e in entries]

        # Write output
        output_content = '\n\n'.join(formatted_entries) + '\n'

        output_file = output or filepath
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(output_content)
            print(f'Successfully wrote {len(entries)} entries to {output_file}', file=sys.stderr)
        except Exception as e:
            print(f'Error writing file: {e}', file=sys.stderr)
            sys.exit(1)


def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Format, clean, sort, and deduplicate BibTeX files',
        epilog='Example: python format_bibtex.py references.bib --deduplicate --sort year'
    )

    parser.add_argument(
        'file',
        help='BibTeX file to format'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: overwrite input file)'
    )

    parser.add_argument(
        '--deduplicate',
        action='store_true',
        help='Remove duplicate entries'
    )

    parser.add_argument(
        '--sort',
        choices=['key', 'year', 'author', 'title'],
        help='Sort entries by field'
    )

    parser.add_argument(
        '--descending',
        action='store_true',
        help='Sort in descending order'
    )

    parser.add_argument(
        '--no-fix',
        action='store_true',
        help='Do not fix common issues'
    )

    args = parser.parse_args()

    # Format file
    formatter = BibTeXFormatter()
    formatter.format_file(
        args.file,
        output=args.output,
        deduplicate=args.deduplicate,
        sort_by=args.sort,
        descending=args.descending,
        fix_issues=not args.no_fix
    )


if __name__ == '__main__':
    main()
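# Example invocations (illustrative only, using the flags defined above):
#   python format_bibtex.py references.bib --deduplicate --sort year --descending
#   python format_bibtex.py references.bib -o formatted.bib --no-fix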

282
skills/citation-management/scripts/search_google_scholar.py
Executable file
@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Google Scholar Search Tool
Search Google Scholar and export results.

Note: This script requires the 'scholarly' library.
Install with: pip install scholarly
"""

import sys
import argparse
import json
import re
import time
import random
from typing import List, Dict, Optional

try:
    from scholarly import scholarly, ProxyGenerator
    SCHOLARLY_AVAILABLE = True
except ImportError:
    SCHOLARLY_AVAILABLE = False
    print('Warning: scholarly library not installed. Install with: pip install scholarly', file=sys.stderr)


class GoogleScholarSearcher:
    """Search Google Scholar using the scholarly library."""

    def __init__(self, use_proxy: bool = False):
        """
        Initialize searcher.

        Args:
            use_proxy: Use a free proxy (helps avoid rate limiting)
        """
        if not SCHOLARLY_AVAILABLE:
            raise ImportError('scholarly library required. Install with: pip install scholarly')

        # Set up a proxy if requested
        if use_proxy:
            try:
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                print('Using free proxy', file=sys.stderr)
            except Exception as e:
                print(f'Warning: Could not set up proxy: {e}', file=sys.stderr)

    def search(self, query: str, max_results: int = 50,
               year_start: Optional[int] = None, year_end: Optional[int] = None,
               sort_by: str = 'relevance') -> List[Dict]:
        """
        Search Google Scholar.

        Args:
            query: Search query
            max_results: Maximum number of results
            year_start: Start year filter
            year_end: End year filter
            sort_by: Sort order ('relevance' or 'citations')

        Returns:
            List of result dictionaries
        """
        if not SCHOLARLY_AVAILABLE:
            print('Error: scholarly library not installed', file=sys.stderr)
            return []

        print(f'Searching Google Scholar: {query}', file=sys.stderr)
        print(f'Max results: {max_results}', file=sys.stderr)

        results = []

        try:
            # Perform search
            search_query = scholarly.search_pubs(query)

            for i, result in enumerate(search_query):
                if i >= max_results:
                    break

                print(f'Retrieved {i+1}/{max_results}', file=sys.stderr)

                # Extract metadata
                metadata = {
                    'title': result.get('bib', {}).get('title', ''),
                    'authors': ', '.join(result.get('bib', {}).get('author', [])),
                    'year': result.get('bib', {}).get('pub_year', ''),
                    'venue': result.get('bib', {}).get('venue', ''),
                    'abstract': result.get('bib', {}).get('abstract', ''),
                    'citations': result.get('num_citations', 0),
                    'url': result.get('pub_url', ''),
                    'eprint_url': result.get('eprint_url', ''),
                }

                # Filter by year
                if year_start or year_end:
                    try:
                        pub_year = int(metadata['year']) if metadata['year'] else 0
                        if year_start and pub_year < year_start:
                            continue
                        if year_end and pub_year > year_end:
                            continue
                    except ValueError:
                        pass

                results.append(metadata)

                # Rate limiting to avoid being blocked
                time.sleep(random.uniform(2, 5))

        except Exception as e:
            print(f'Error during search: {e}', file=sys.stderr)

        # Sort if requested
        if sort_by == 'citations' and results:
            results.sort(key=lambda x: x.get('citations', 0), reverse=True)

        return results
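    # Shape of each returned result (illustrative values, not real data):
    #   {'title': '...', 'authors': 'J Doe, J Smith', 'year': '2021',
    #    'venue': '...', 'abstract': '...', 'citations': 42,
    #    'url': 'https://...', 'eprint_url': 'https://...'}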

    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert metadata to BibTeX format."""
        # Generate citation key
        if metadata.get('authors'):
            first_author = metadata['authors'].split(',')[0].strip()
            last_name = first_author.split()[-1] if first_author else 'Unknown'
        else:
            last_name = 'Unknown'

        year = metadata.get('year', 'XXXX')

        # Take the first substantial word of the title as a keyword
        title = metadata.get('title', '')
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
        keyword = words[0].lower() if words else 'paper'

        citation_key = f'{last_name}{year}{keyword}'

        # Determine entry type (guess based on venue)
        venue = metadata.get('venue', '').lower()
        if 'proceedings' in venue or 'conference' in venue:
            entry_type = 'inproceedings'
            venue_field = 'booktitle'
        else:
            entry_type = 'article'
            venue_field = 'journal'

        # Build BibTeX
        lines = [f'@{entry_type}{{{citation_key},']

        # Convert the comma-separated author string to BibTeX 'and' format
        if metadata.get('authors'):
            authors = metadata['authors'].replace(',', ' and')
            lines.append(f'  author = {{{authors}}},')

        if metadata.get('title'):
            lines.append(f'  title = {{{metadata["title"]}}},')

        if metadata.get('venue'):
            lines.append(f'  {venue_field} = {{{metadata["venue"]}}},')

        if metadata.get('year'):
            lines.append(f'  year = {{{metadata["year"]}}},')

        if metadata.get('url'):
            lines.append(f'  url = {{{metadata["url"]}}},')

        if metadata.get('citations'):
            lines.append(f'  note = {{Cited by: {metadata["citations"]}}},')

        # Remove trailing comma
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)
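    # Illustrative key and entry-type guess (hypothetical paper): an article
    # titled "Machine Learning for X" by 'J Smith' in 2021 would get the key
    # 'Smith2021machine'; a venue containing 'proceedings' or 'conference'
    # switches the entry to @inproceedings with a booktitle field.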


def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Search Google Scholar (requires scholarly library)',
        epilog='Example: python search_google_scholar.py "machine learning" --limit 50'
    )

    parser.add_argument(
        'query',
        help='Search query'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=50,
        help='Maximum number of results (default: 50)'
    )

    parser.add_argument(
        '--year-start',
        type=int,
        help='Start year for filtering'
    )

    parser.add_argument(
        '--year-end',
        type=int,
        help='End year for filtering'
    )

    parser.add_argument(
        '--sort-by',
        choices=['relevance', 'citations'],
        default='relevance',
        help='Sort order (default: relevance)'
    )

    parser.add_argument(
        '--use-proxy',
        action='store_true',
        help='Use free proxy to avoid rate limiting'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )

    args = parser.parse_args()

    if not SCHOLARLY_AVAILABLE:
        print('\nError: scholarly library not installed', file=sys.stderr)
        print('Install with: pip install scholarly', file=sys.stderr)
        print('\nAlternatively, use PubMed search for biomedical literature:', file=sys.stderr)
        print('  python search_pubmed.py "your query"', file=sys.stderr)
        sys.exit(1)

    # Search
    searcher = GoogleScholarSearcher(use_proxy=args.use_proxy)
    results = searcher.search(
        args.query,
        max_results=args.limit,
        year_start=args.year_start,
        year_end=args.year_end,
        sort_by=args.sort_by
    )

    if not results:
        print('No results found', file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == 'json':
        output = json.dumps({
            'query': args.query,
            'count': len(results),
            'results': results
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(r) for r in results]
        output = '\n\n'.join(bibtex_entries) + '\n'

    # Write output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(results)} results to {args.output}', file=sys.stderr)
    else:
        print(output)

    print(f'\nRetrieved {len(results)} results', file=sys.stderr)


if __name__ == '__main__':
    main()
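# Example invocations (illustrative only, using the flags defined above):
#   python search_google_scholar.py "machine learning" --limit 50 --use-proxy
#   python search_google_scholar.py "protein folding" --year-start 2018 --format bibtex -o refs.bib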

398
skills/citation-management/scripts/search_pubmed.py
Executable file
@@ -0,0 +1,398 @@
#!/usr/bin/env python3
"""
PubMed Search Tool
Search PubMed using the E-utilities API and export results.
"""

import sys
import os
import re
import requests
import argparse
import json
import time
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
from datetime import datetime


class PubMedSearcher:
    """Search PubMed using the NCBI E-utilities API."""

    def __init__(self, api_key: Optional[str] = None, email: Optional[str] = None):
        """
        Initialize searcher.

        Args:
            api_key: NCBI API key (optional but recommended)
            email: Email for Entrez (optional but recommended)
        """
        self.api_key = api_key or os.getenv('NCBI_API_KEY', '')
        self.email = email or os.getenv('NCBI_EMAIL', '')
        self.base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        self.session = requests.Session()

        # Rate limiting: 10 requests/sec with an API key, 3/sec without
        self.delay = 0.11 if self.api_key else 0.34

    def search(self, query: str, max_results: int = 100,
               date_start: Optional[str] = None, date_end: Optional[str] = None,
               publication_types: Optional[List[str]] = None) -> List[str]:
        """
        Search PubMed and return PMIDs.

        Args:
            query: Search query
            max_results: Maximum number of results
            date_start: Start date (YYYY/MM/DD or YYYY)
            date_end: End date (YYYY/MM/DD or YYYY)
            publication_types: List of publication types to filter

        Returns:
            List of PMIDs
        """
        # Build query with filters
        full_query = query

        # Add date range
        if date_start or date_end:
            start = date_start or '1900'
            end = date_end or datetime.now().strftime('%Y')
            full_query += f' AND {start}:{end}[Publication Date]'

        # Add publication types
        if publication_types:
            pub_type_query = ' OR '.join([f'"{pt}"[Publication Type]' for pt in publication_types])
            full_query += f' AND ({pub_type_query})'

        print(f'Searching PubMed: {full_query}', file=sys.stderr)

        # ESearch to get PMIDs
        esearch_url = self.base_url + 'esearch.fcgi'
        params = {
            'db': 'pubmed',
            'term': full_query,
            'retmax': max_results,
            'retmode': 'json'
        }

        if self.email:
            params['email'] = self.email
        if self.api_key:
            params['api_key'] = self.api_key

        try:
            response = self.session.get(esearch_url, params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            pmids = data['esearchresult']['idlist']
            count = int(data['esearchresult']['count'])

            print(f'Found {count} results, retrieving {len(pmids)}', file=sys.stderr)

            return pmids

        except Exception as e:
            print(f'Error searching PubMed: {e}', file=sys.stderr)
            return []
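    # Illustrative composed query (hypothetical filter values): with
    # date_start='2015', date_end='2024' and publication_types=['Review'],
    # the term sent to ESearch becomes
    #   'CRISPR AND 2015:2024[Publication Date] AND ("Review"[Publication Type])'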

    def fetch_metadata(self, pmids: List[str]) -> List[Dict]:
        """
        Fetch metadata for PMIDs.

        Args:
            pmids: List of PubMed IDs

        Returns:
            List of metadata dictionaries
        """
        if not pmids:
            return []

        metadata_list = []

        # Fetch in batches of 200
        batch_size = 200
        for i in range(0, len(pmids), batch_size):
            batch = pmids[i:i+batch_size]
            print(f'Fetching metadata for PMIDs {i+1}-{min(i+batch_size, len(pmids))}...', file=sys.stderr)

            efetch_url = self.base_url + 'efetch.fcgi'
            params = {
                'db': 'pubmed',
                'id': ','.join(batch),
                'retmode': 'xml',
                'rettype': 'abstract'
            }

            if self.email:
                params['email'] = self.email
            if self.api_key:
                params['api_key'] = self.api_key

            try:
                response = self.session.get(efetch_url, params=params, timeout=60)
                response.raise_for_status()

                # Parse XML
                root = ET.fromstring(response.content)
                articles = root.findall('.//PubmedArticle')

                for article in articles:
                    metadata = self._extract_metadata_from_xml(article)
                    if metadata:
                        metadata_list.append(metadata)

                # Rate limiting
                time.sleep(self.delay)

            except Exception as e:
                print(f'Error fetching metadata for batch: {e}', file=sys.stderr)
                continue

        return metadata_list

    def _extract_metadata_from_xml(self, article: ET.Element) -> Optional[Dict]:
        """Extract metadata from a PubmedArticle XML element."""
        try:
            medline_citation = article.find('.//MedlineCitation')
            article_elem = medline_citation.find('.//Article')
            journal = article_elem.find('.//Journal')

            # Get PMID
            pmid = medline_citation.findtext('.//PMID', '')

            # Get DOI
            doi = None
            article_ids = article.findall('.//ArticleId')
            for article_id in article_ids:
                if article_id.get('IdType') == 'doi':
                    doi = article_id.text
                    break

            # Get authors
            authors = []
            author_list = article_elem.find('.//AuthorList')
            if author_list is not None:
                for author in author_list.findall('.//Author'):
                    last_name = author.findtext('.//LastName', '')
                    fore_name = author.findtext('.//ForeName', '')
                    if last_name:
                        if fore_name:
                            authors.append(f'{last_name}, {fore_name}')
                        else:
                            authors.append(last_name)

            # Get year (fall back to MedlineDate when no explicit Year element)
            year = article_elem.findtext('.//Journal/JournalIssue/PubDate/Year', '')
            if not year:
                medline_date = article_elem.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
                if medline_date:
                    year_match = re.search(r'\d{4}', medline_date)
                    if year_match:
                        year = year_match.group()

            metadata = {
                'pmid': pmid,
                'doi': doi,
                'title': article_elem.findtext('.//ArticleTitle', ''),
                'authors': ' and '.join(authors),
                'journal': journal.findtext('.//Title', ''),
                'year': year,
                'volume': journal.findtext('.//JournalIssue/Volume', ''),
                'issue': journal.findtext('.//JournalIssue/Issue', ''),
                'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
                'abstract': article_elem.findtext('.//Abstract/AbstractText', '')
            }

            return metadata

        except Exception as e:
            print(f'Error extracting metadata: {e}', file=sys.stderr)
            return None

    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert metadata to BibTeX format."""
        # Generate citation key
        if metadata.get('authors'):
            first_author = metadata['authors'].split(' and ')[0]
            if ',' in first_author:
                last_name = first_author.split(',')[0].strip()
            else:
                last_name = first_author.split()[0]
        else:
            last_name = 'Unknown'

        year = metadata.get('year', 'XXXX')
        citation_key = f'{last_name}{year}pmid{metadata.get("pmid", "")}'

        # Build BibTeX entry
        lines = [f'@article{{{citation_key},']

        if metadata.get('authors'):
            lines.append(f'  author = {{{metadata["authors"]}}},')

        if metadata.get('title'):
            lines.append(f'  title = {{{metadata["title"]}}},')

        if metadata.get('journal'):
            lines.append(f'  journal = {{{metadata["journal"]}}},')

        if metadata.get('year'):
            lines.append(f'  year = {{{metadata["year"]}}},')

        if metadata.get('volume'):
            lines.append(f'  volume = {{{metadata["volume"]}}},')

        if metadata.get('issue'):
            lines.append(f'  number = {{{metadata["issue"]}}},')

        if metadata.get('pages'):
            pages = metadata['pages'].replace('-', '--')
            lines.append(f'  pages = {{{pages}}},')

        if metadata.get('doi'):
            lines.append(f'  doi = {{{metadata["doi"]}}},')

        if metadata.get('pmid'):
            lines.append(f'  note = {{PMID: {metadata["pmid"]}}},')

        # Remove trailing comma
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)
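    # Illustrative output (hypothetical record): a 2021 article by
    # 'Doe, Jane' with PMID 12345678 would be keyed 'Doe2021pmid12345678'
    # and rendered as an @article entry with a 'note = {PMID: 12345678}'
    # field; page ranges like '100-110' are normalized to '100--110'.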


def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Search PubMed using the E-utilities API',
        epilog='Example: python search_pubmed.py "CRISPR gene editing" --limit 100'
    )

    parser.add_argument(
        'query',
        nargs='?',
        help='Search query (PubMed syntax)'
    )

    parser.add_argument(
        '--query',
        dest='query_arg',
        help='Search query (alternative to positional argument)'
    )

    parser.add_argument(
        '--query-file',
        help='File containing search query'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=100,
        help='Maximum number of results (default: 100)'
    )

    parser.add_argument(
        '--date-start',
        help='Start date (YYYY/MM/DD or YYYY)'
    )

    parser.add_argument(
        '--date-end',
        help='End date (YYYY/MM/DD or YYYY)'
    )

    parser.add_argument(
        '--publication-types',
        help='Comma-separated publication types (e.g., "Review,Clinical Trial")'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )

    parser.add_argument(
        '--api-key',
        help='NCBI API key (or set NCBI_API_KEY env var)'
    )

    parser.add_argument(
        '--email',
        help='Email for Entrez (or set NCBI_EMAIL env var)'
    )

    args = parser.parse_args()

    # Get query from positional argument, --query, or --query-file
    query = args.query or args.query_arg

    if args.query_file:
        try:
            with open(args.query_file, 'r', encoding='utf-8') as f:
                query = f.read().strip()
        except Exception as e:
            print(f'Error reading query file: {e}', file=sys.stderr)
            sys.exit(1)

    if not query:
        parser.print_help()
        sys.exit(1)

    # Parse publication types
    pub_types = None
    if args.publication_types:
        pub_types = [pt.strip() for pt in args.publication_types.split(',')]

    # Search PubMed
    searcher = PubMedSearcher(api_key=args.api_key, email=args.email)
    pmids = searcher.search(
        query,
        max_results=args.limit,
        date_start=args.date_start,
        date_end=args.date_end,
        publication_types=pub_types
    )

    if not pmids:
        print('No results found', file=sys.stderr)
        sys.exit(1)

    # Fetch metadata
    metadata_list = searcher.fetch_metadata(pmids)

    # Format output
    if args.format == 'json':
        output = json.dumps({
            'query': query,
            'count': len(metadata_list),
            'results': metadata_list
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(m) for m in metadata_list]
        output = '\n\n'.join(bibtex_entries) + '\n'

    # Write output
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(metadata_list)} results to {args.output}', file=sys.stderr)
    else:
        print(output)


if __name__ == '__main__':
    main()
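# Example invocations (illustrative only, using the flags defined above):
#   python search_pubmed.py "CRISPR gene editing" --limit 100
#   python search_pubmed.py "cancer immunotherapy" --date-start 2015 \
#       --publication-types "Review,Clinical Trial" --format bibtex -o refs.bib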

497
skills/citation-management/scripts/validate_citations.py
Executable file
@@ -0,0 +1,497 @@
#!/usr/bin/env python3
"""
Citation Validation Tool
Validate BibTeX files for accuracy, completeness, and format compliance.
"""

import sys
import re
import requests
import argparse
import json
from typing import Dict, List, Tuple, Optional
from collections import defaultdict


class CitationValidator:
    """Validate BibTeX entries for errors and inconsistencies."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'CitationValidator/1.0 (Citation Management Tool)'
        })

        # Required fields by entry type
        self.required_fields = {
            'article': ['author', 'title', 'journal', 'year'],
            'book': ['author', 'title', 'publisher', 'year'],  # author OR editor (handled below)
            'inproceedings': ['author', 'title', 'booktitle', 'year'],
            'incollection': ['author', 'title', 'booktitle', 'publisher', 'year'],
            'phdthesis': ['author', 'title', 'school', 'year'],
            'mastersthesis': ['author', 'title', 'school', 'year'],
            'techreport': ['author', 'title', 'institution', 'year'],
            'misc': ['title', 'year']
        }

        # Recommended fields
        self.recommended_fields = {
            'article': ['volume', 'pages', 'doi'],
            'book': ['isbn'],
            'inproceedings': ['pages'],
        }

    def parse_bibtex_file(self, filepath: str) -> List[Dict]:
        """
        Parse a BibTeX file and extract entries.

        Args:
            filepath: Path to BibTeX file

        Returns:
            List of entry dictionaries
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f'Error reading file: {e}', file=sys.stderr)
            return []

        entries = []

        # Match BibTeX entries
        pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
        matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)

        for match in matches:
            entry_type = match.group(1).lower()
            citation_key = match.group(2).strip()
            fields_text = match.group(3)

            # Parse fields (brace-delimited or quote-delimited values)
            fields = {}
            field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
            field_matches = re.finditer(field_pattern, fields_text)

            for field_match in field_matches:
                if field_match.group(1):
                    field_name = field_match.group(1).lower()
                    field_value = field_match.group(2)
                else:
                    field_name = field_match.group(3).lower()
                    field_value = field_match.group(4)

                fields[field_name] = field_value.strip()

            entries.append({
                'type': entry_type,
                'key': citation_key,
                'fields': fields,
                'raw': match.group(0)
            })

        return entries

    def validate_entry(self, entry: Dict) -> Tuple[List[Dict], List[Dict]]:
        """
        Validate a single BibTeX entry.

        Args:
            entry: Entry dictionary

        Returns:
            Tuple of (errors, warnings)
        """
        errors = []
        warnings = []

        entry_type = entry['type']
        key = entry['key']
        fields = entry['fields']

        # Check required fields
        if entry_type in self.required_fields:
            for req_field in self.required_fields[entry_type]:
                if req_field not in fields or not fields[req_field]:
                    # Special case: a book may have an editor instead of an author
                    if entry_type == 'book' and req_field == 'author':
                        if 'editor' not in fields or not fields['editor']:
                            errors.append({
                                'type': 'missing_required_field',
                                'field': 'author or editor',
                                'severity': 'high',
                                'message': f'Entry {key}: Missing required field "author" or "editor"'
                            })
                    else:
                        errors.append({
                            'type': 'missing_required_field',
                            'field': req_field,
                            'severity': 'high',
                            'message': f'Entry {key}: Missing required field "{req_field}"'
                        })

        # Check recommended fields
        if entry_type in self.recommended_fields:
            for rec_field in self.recommended_fields[entry_type]:
                if rec_field not in fields or not fields[rec_field]:
                    warnings.append({
                        'type': 'missing_recommended_field',
                        'field': rec_field,
                        'severity': 'medium',
                        'message': f'Entry {key}: Missing recommended field "{rec_field}"'
                    })

        # Validate year
        if 'year' in fields:
            year = fields['year']
            if not re.match(r'^\d{4}$', year):
                errors.append({
                    'type': 'invalid_year',
                    'field': 'year',
                    'value': year,
                    'severity': 'high',
                    'message': f'Entry {key}: Invalid year format "{year}" (should be 4 digits)'
                })
            elif int(year) < 1600 or int(year) > 2030:
                warnings.append({
                    'type': 'suspicious_year',
                    'field': 'year',
                    'value': year,
                    'severity': 'medium',
                    'message': f'Entry {key}: Suspicious year "{year}" (outside reasonable range)'
                })

        # Validate DOI format
        if 'doi' in fields:
            doi = fields['doi']
            if not re.match(r'^10\.\d{4,}/[^\s]+$', doi):
                warnings.append({
                    'type': 'invalid_doi_format',
                    'field': 'doi',
                    'value': doi,
                    'severity': 'medium',
                    'message': f'Entry {key}: Invalid DOI format "{doi}"'
                })

        # Check for single hyphen in pages (should be --)
        if 'pages' in fields:
            pages = fields['pages']
            if re.search(r'\d-\d', pages) and '--' not in pages:
                warnings.append({
                    'type': 'page_range_format',
                    'field': 'pages',
                    'value': pages,
                    'severity': 'low',
                    'message': f'Entry {key}: Page range uses single hyphen, should use -- (en-dash)'
                })

        # Check author format
        if 'author' in fields:
            author = fields['author']
            if ';' in author or '&' in author:
                errors.append({
                    'type': 'invalid_author_format',
                    'field': 'author',
                    'severity': 'high',
                    'message': f'Entry {key}: Authors should be separated by " and ", not ";" or "&"'
                })

        return errors, warnings
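    # Shape of a reported problem (illustrative values): both errors and
    # warnings are dictionaries such as
    #   {'type': 'missing_required_field', 'field': 'journal',
    #    'severity': 'high',
    #    'message': 'Entry Doe2021: Missing required field "journal"'}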

    def verify_doi(self, doi: str) -> Tuple[bool, Optional[Dict]]:
        """
        Verify that a DOI resolves correctly and fetch its metadata.

        Args:
            doi: Digital Object Identifier

        Returns:
            Tuple of (is_valid, metadata)
        """
        try:
            url = f'https://doi.org/{doi}'
            response = self.session.head(url, timeout=10, allow_redirects=True)

            if response.status_code < 400:
                # DOI resolves; now get metadata from CrossRef
                crossref_url = f'https://api.crossref.org/works/{doi}'
                metadata_response = self.session.get(crossref_url, timeout=10)

                if metadata_response.status_code == 200:
                    data = metadata_response.json()
                    message = data.get('message', {})

                    # Extract key metadata
                    metadata = {
                        'title': message.get('title', [''])[0],
                        'year': self._extract_year_crossref(message),
                        'authors': self._format_authors_crossref(message.get('author', [])),
                    }
                    return True, metadata
                else:
                    return True, None  # DOI resolves but no CrossRef metadata
            else:
                return False, None

        except Exception:
            return False, None

    def detect_duplicates(self, entries: List[Dict]) -> List[Dict]:
        """
        Detect duplicate entries.

        Args:
            entries: List of entry dictionaries

        Returns:
            List of duplicate groups
        """
        duplicates = []

        # Check for duplicate DOIs
        doi_map = defaultdict(list)
        for entry in entries:
            doi = entry['fields'].get('doi', '').strip()
            if doi:
                doi_map[doi].append(entry['key'])

        for doi, keys in doi_map.items():
            if len(keys) > 1:
                duplicates.append({
                    'type': 'duplicate_doi',
                    'doi': doi,
                    'entries': keys,
                    'severity': 'high',
                    'message': f'Duplicate DOI {doi} found in entries: {", ".join(keys)}'
                })

        # Check for duplicate citation keys
        key_counts = defaultdict(int)
        for entry in entries:
            key_counts[entry['key']] += 1

        for key, count in key_counts.items():
            if count > 1:
                duplicates.append({
                    'type': 'duplicate_key',
                    'key': key,
                    'count': count,
                    'severity': 'high',
                    'message': f'Citation key "{key}" appears {count} times'
                })

        # Check for identical normalized titles (possible duplicates)
        titles = {}
        for entry in entries:
            title = entry['fields'].get('title', '').lower()
            title = re.sub(r'[^\w\s]', '', title)  # Remove punctuation
            title = ' '.join(title.split())  # Normalize whitespace

            if title:
                if title in titles:
                    duplicates.append({
                        'type': 'similar_title',
                        'entries': [titles[title], entry['key']],
                        'severity': 'medium',
                        'message': f'Possible duplicate: "{titles[title]}" and "{entry["key"]}" have identical titles'
                    })
                else:
                    titles[title] = entry['key']

        return duplicates

    def validate_file(self, filepath: str, check_dois: bool = False) -> Dict:
        """
        Validate an entire BibTeX file.

        Args:
            filepath: Path to BibTeX file
            check_dois: Whether to verify DOIs (slow)

        Returns:
            Validation report dictionary
        """
        print(f'Parsing {filepath}...', file=sys.stderr)
        entries = self.parse_bibtex_file(filepath)

        if not entries:
            return {
                'filepath': filepath,
                'total_entries': 0,
                'valid_entries': 0,
                'errors': [],
                'warnings': [],
                'duplicates': []
            }

        print(f'Found {len(entries)} entries', file=sys.stderr)

        all_errors = []
        all_warnings = []

        # Validate each entry
        for i, entry in enumerate(entries):
            print(f'Validating entry {i+1}/{len(entries)}: {entry["key"]}', file=sys.stderr)
            errors, warnings = self.validate_entry(entry)

            for error in errors:
                error['entry'] = entry['key']
                all_errors.append(error)

            for warning in warnings:
                warning['entry'] = entry['key']
                all_warnings.append(warning)

        # Check for duplicates
        print('Checking for duplicates...', file=sys.stderr)
        duplicates = self.detect_duplicates(entries)

        # Verify DOIs if requested
        doi_errors = []
        if check_dois:
            print('Verifying DOIs...', file=sys.stderr)
            for i, entry in enumerate(entries):
                doi = entry['fields'].get('doi', '')
                if doi:
                    print(f'Verifying DOI {i+1}: {doi}', file=sys.stderr)
                    is_valid, metadata = self.verify_doi(doi)

                    if not is_valid:
                        doi_errors.append({
                            'type': 'invalid_doi',
                            'entry': entry['key'],
                            'doi': doi,
                            'severity': 'high',
                            'message': f'Entry {entry["key"]}: DOI does not resolve: {doi}'
                        })

        all_errors.extend(doi_errors)

        # Count entries (not individual errors) that have high-severity problems
        entries_with_errors = {e['entry'] for e in all_errors if e['severity'] == 'high'}

        return {
            'filepath': filepath,
            'total_entries': len(entries),
            'valid_entries': len(entries) - len(entries_with_errors),
            'errors': all_errors,
            'warnings': all_warnings,
            'duplicates': duplicates
        }

    def _extract_year_crossref(self, message: Dict) -> str:
        """Extract year from a CrossRef message."""
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])

        if date_parts and date_parts[0]:
            return str(date_parts[0][0])
        return ''

    def _format_authors_crossref(self, authors: List[Dict]) -> str:
        """Format an author list from CrossRef."""
        if not authors:
            return ''

        formatted = []
        for author in authors[:3]:  # First 3 authors
            given = author.get('given', '')
            family = author.get('family', '')
            if family:
                formatted.append(f'{family}, {given}' if given else family)

        if len(authors) > 3:
            formatted.append('et al.')

        return ', '.join(formatted)


def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(
        description='Validate BibTeX files for errors and inconsistencies',
        epilog='Example: python validate_citations.py references.bib'
    )

    parser.add_argument(
        'file',
        help='BibTeX file to validate'
    )

    parser.add_argument(
        '--check-dois',
        action='store_true',
        help='Verify DOIs resolve correctly (slow)'
    )

    parser.add_argument(
        '--auto-fix',
        action='store_true',
        help='Attempt to auto-fix common issues (not implemented yet)'
    )

    parser.add_argument(
        '--report',
        help='Output file for JSON validation report'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed output'
    )

    args = parser.parse_args()

    # Validate file
    validator = CitationValidator()
    report = validator.validate_file(args.file, check_dois=args.check_dois)

    # Print summary
    print('\n' + '='*60)
    print('CITATION VALIDATION REPORT')
    print('='*60)
    print(f'\nFile: {args.file}')
    print(f'Total entries: {report["total_entries"]}')
    print(f'Valid entries: {report["valid_entries"]}')
    print(f'Errors: {len(report["errors"])}')
    print(f'Warnings: {len(report["warnings"])}')
    print(f'Duplicates: {len(report["duplicates"])}')

    # Print errors
    if report['errors']:
        print('\n' + '-'*60)
        print('ERRORS (must fix):')
        print('-'*60)
        for error in report['errors']:
            print(f'\n{error["message"]}')
            if args.verbose:
                print(f'  Type: {error["type"]}')
                print(f'  Severity: {error["severity"]}')

    # Print warnings
    if report['warnings'] and args.verbose:
        print('\n' + '-'*60)
        print('WARNINGS (should fix):')
        print('-'*60)
        for warning in report['warnings']:
            print(f'\n{warning["message"]}')

    # Print duplicates
    if report['duplicates']:
        print('\n' + '-'*60)
        print('DUPLICATES:')
        print('-'*60)
        for dup in report['duplicates']:
            print(f'\n{dup["message"]}')

    # Save report
    if args.report:
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        print(f'\nDetailed report saved to: {args.report}')

    # Exit with a non-zero code if there are errors
    if report['errors']:
        sys.exit(1)


if __name__ == '__main__':
    main()
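# Example invocations (illustrative only, using the flags defined above):
#   python validate_citations.py references.bib --verbose
#   python validate_citations.py references.bib --check-dois --report report.json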