Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:18 +08:00
commit 74bee324ab
335 changed files with 147377 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,264 @@
% BibTeX Template File
% Examples of properly formatted entries for all common types
% =============================================================================
% JOURNAL ARTICLES
% =============================================================================
@article{Jumper2021,
  author  = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'\i}dek, Augustin and Potapenko, Anna and others},
  title   = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
  journal = {Nature},
  year    = {2021},
  volume  = {596},
  number  = {7873},
  pages   = {583--589},
  doi     = {10.1038/s41586-021-03819-2}
}
@article{Watson1953,
  author  = {Watson, James D. and Crick, Francis H. C.},
  title   = {Molecular Structure of Nucleic Acids: A Structure for Deoxyribose Nucleic Acid},
  journal = {Nature},
  year    = {1953},
  volume  = {171},
  number  = {4356},
  pages   = {737--738},
  doi     = {10.1038/171737a0}
}
@article{Doudna2014,
  author  = {Doudna, Jennifer A. and Charpentier, Emmanuelle},
  title   = {The New Frontier of Genome Engineering with {CRISPR-Cas9}},
  journal = {Science},
  year    = {2014},
  volume  = {346},
  number  = {6213},
  pages   = {1258096},
  doi     = {10.1126/science.1258096}
}
% =============================================================================
% BOOKS
% =============================================================================
@book{Kumar2021,
  author    = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
  title     = {Robbins and Cotran Pathologic Basis of Disease},
  publisher = {Elsevier},
  year      = {2021},
  edition   = {10},
  address   = {Philadelphia, PA},
  isbn      = {978-0-323-53113-9}
}
@book{Alberts2014,
  author    = {Alberts, Bruce and Johnson, Alexander and Lewis, Julian and Morgan, David and Raff, Martin and Roberts, Keith and Walter, Peter},
  title     = {Molecular Biology of the Cell},
  publisher = {Garland Science},
  year      = {2014},
  edition   = {6},
  address   = {New York, NY},
  isbn      = {978-0-815-34432-2}
}
% Book with editor instead of author
@book{Sambrook2001,
  editor    = {Sambrook, Joseph and Russell, David W.},
  title     = {Molecular Cloning: A Laboratory Manual},
  publisher = {Cold Spring Harbor Laboratory Press},
  year      = {2001},
  edition   = {3},
  address   = {Cold Spring Harbor, NY},
  isbn      = {978-0-879-69576-7}
}
% =============================================================================
% CONFERENCE PAPERS (PROCEEDINGS)
% =============================================================================
@inproceedings{Vaswani2017,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is All You Need},
  booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
  year      = {2017},
  pages     = {5998--6008},
  address   = {Long Beach, CA},
  url       = {https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html}
}
@inproceedings{He2016,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep Residual Learning for Image Recognition},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2016},
  pages     = {770--778},
  address   = {Las Vegas, NV},
  doi       = {10.1109/CVPR.2016.90}
}
% =============================================================================
% BOOK CHAPTERS
% =============================================================================
% Chapter author corrected from "Peter O." to "Patrick O." to match the
% editor listing in this same entry (the microarray author is Patrick O. Brown).
@incollection{Brown2020,
  author    = {Brown, Patrick O. and Botstein, David},
  title     = {Exploring the New World of the Genome with {DNA} Microarrays},
  booktitle = {DNA Microarrays: A Molecular Cloning Manual},
  editor    = {Eisen, Michael B. and Brown, Patrick O.},
  publisher = {Cold Spring Harbor Laboratory Press},
  year      = {2020},
  pages     = {1--45},
  address   = {Cold Spring Harbor, NY}
}
% =============================================================================
% PHD THESES / DISSERTATIONS
% =============================================================================
@phdthesis{Johnson2023,
  author  = {Johnson, Mary L.},
  title   = {Novel Approaches to Cancer Immunotherapy Using {CRISPR} Technology},
  school  = {Stanford University},
  year    = {2023},
  type    = {{PhD} dissertation},
  address = {Stanford, CA}
}
% =============================================================================
% MASTER'S THESES
% =============================================================================
@mastersthesis{Smith2022,
  author  = {Smith, Robert J.},
  title   = {Machine Learning Methods for Protein Structure Prediction},
  school  = {Massachusetts Institute of Technology},
  year    = {2022},
  type    = {{Master's} thesis},
  address = {Cambridge, MA}
}
% =============================================================================
% TECHNICAL REPORTS
% =============================================================================
% Corporate author is double-braced so BibTeX treats it as one indivisible name.
@techreport{WHO2020,
  author      = {{World Health Organization}},
  title       = {Clinical Management of {COVID-19}: Interim Guidance},
  institution = {World Health Organization},
  year        = {2020},
  type        = {Technical Report},
  number      = {WHO/2019-nCoV/clinical/2020.5},
  address     = {Geneva, Switzerland}
}
% =============================================================================
% PREPRINTS
% =============================================================================
% bioRxiv preprint
@misc{Zhang2024preprint,
  author       = {Zhang, Yi and Chen, Li and Wang, Hui and Liu, Xin},
  title        = {Novel Therapeutic Targets in {Alzheimer}'s Disease},
  year         = {2024},
  howpublished = {bioRxiv},
  doi          = {10.1101/2024.01.15.575432},
  note         = {Preprint}
}
% arXiv preprint
@misc{Brown2024arxiv,
  author       = {Brown, Alice and Green, Bob},
  title        = {Advances in Quantum Computing},
  year         = {2024},
  howpublished = {arXiv},
  note         = {arXiv:2401.12345}
}
% =============================================================================
% DATASETS
% =============================================================================
% Dataset entry; each corporate author is individually double-braced.
@misc{AlphaFoldDB2021,
  author       = {{DeepMind} and {EMBL-EBI}},
  title        = {{AlphaFold} Protein Structure Database},
  year         = {2021},
  howpublished = {Database},
  url          = {https://alphafold.ebi.ac.uk/},
  doi          = {10.1093/nar/gkab1061},
  note         = {Version 4}
}
% =============================================================================
% SOFTWARE / CODE
% =============================================================================
% "pandas" is braced so sentence-casing bibliography styles do not
% capitalize the deliberately lowercase project name at the title start.
@misc{McKinney2010pandas,
  author       = {McKinney, Wes},
  title        = {{pandas}: A Foundational {Python} Library for Data Analysis and Statistics},
  year         = {2010},
  howpublished = {Software},
  url          = {https://pandas.pydata.org/},
  note         = {Python Data Analysis Library}
}
% =============================================================================
% WEBSITES / ONLINE RESOURCES
% =============================================================================
% Website entry: access date recorded in the note field.
@misc{NCBI2024,
  author       = {{National Center for Biotechnology Information}},
  title        = {{PubMed}: Database of Biomedical Literature},
  year         = {2024},
  howpublished = {Website},
  url          = {https://pubmed.ncbi.nlm.nih.gov/},
  note         = {Accessed: 2024-01-15}
}
% =============================================================================
% SPECIAL CASES
% =============================================================================
% Article with organization as author
@article{NatureEditorial2023,
  author  = {{Nature Editorial Board}},
  title   = {The Future of {AI} in Scientific Research},
  journal = {Nature},
  year    = {2023},
  volume  = {615},
  pages   = {1--2},
  doi     = {10.1038/d41586-023-00001-1}
}
% Article with no volume number (some journals)
@article{OpenAccess2024,
  author  = {Williams, Sarah and Thomas, Michael},
  title   = {Open Access Publishing in the 21st Century},
  journal = {Journal of Scholarly Communication},
  year    = {2024},
  pages   = {e123456},
  doi     = {10.1234/jsc.2024.123456}
}
% Conference paper with DOI
@inproceedings{Garcia2023,
  author    = {Garc{\'i}a-Mart{\'i}nez, Jos{\'e} and M{\"u}ller, Hans},
  title     = {International Collaboration in Science},
  booktitle = {Proceedings of the International Conference on Academic Publishing},
  year      = {2023},
  pages     = {45--52},
  doi       = {10.1109/ICAP.2023.123456}
}
% Article with PMID but no DOI (older papers)
@article{OldPaper1995,
  author  = {Anderson, Philip W.},
  title   = {Through the Glass Lightly},
  journal = {Science},
  year    = {1995},
  volume  = {267},
  number  = {5204},
  pages   = {1615--1616},
  note    = {PMID: 17808148}
}

View File

@@ -0,0 +1,386 @@
# Citation Quality Checklist
Use this checklist to ensure your citations are accurate, complete, and properly formatted before final submission.
## Pre-Submission Checklist
### ✓ Metadata Accuracy
- [ ] All author names are correct and properly formatted
- [ ] Article titles match the actual publication
- [ ] Journal/conference names are complete (not abbreviated unless required)
- [ ] Publication years are accurate
- [ ] Volume and issue numbers are correct
- [ ] Page ranges are accurate
### ✓ Required Fields
- [ ] All @article entries have: author, title, journal, year
- [ ] All @book entries have: author/editor, title, publisher, year
- [ ] All @inproceedings entries have: author, title, booktitle, year
- [ ] Modern papers (2000+) include DOI when available
- [ ] All entries have unique citation keys
### ✓ DOI Verification
- [ ] All DOIs are properly formatted (10.XXXX/...)
- [ ] DOIs resolve correctly to the article
- [ ] No DOI prefix in the BibTeX field (no "doi:" or "https://doi.org/")
- [ ] Metadata from CrossRef matches your BibTeX entry
- [ ] Run: `python scripts/validate_citations.py references.bib --check-dois`
### ✓ Formatting Consistency
- [ ] Page ranges use double hyphen (--) not single (-)
- [ ] No "pp." prefix in pages field
- [ ] Author names use "and" separator (not semicolon or ampersand)
- [ ] Capitalization protected in titles ({AlphaFold}, {CRISPR}, etc.)
- [ ] Month names use standard abbreviations if included
- [ ] Citation keys follow consistent format
### ✓ Duplicate Detection
- [ ] No duplicate DOIs in bibliography
- [ ] No duplicate citation keys
- [ ] No near-duplicate titles
- [ ] Preprints updated to published versions when available
- [ ] Run: `python scripts/validate_citations.py references.bib`
### ✓ Special Characters
- [ ] Accented characters properly formatted (e.g., {\"u} for ü)
- [ ] Mathematical symbols use LaTeX commands
- [ ] Chemical formulas properly formatted
- [ ] No unescaped special characters (%, &, $, #, etc.)
### ✓ BibTeX Syntax
- [ ] All entries have balanced braces {}
- [ ] Fields separated by commas
- [ ] No comma after last field in each entry
- [ ] Valid entry types (@article, @book, etc.)
- [ ] Run: `python scripts/validate_citations.py references.bib`
### ✓ File Organization
- [ ] Bibliography sorted in logical order (by year, author, or key)
- [ ] Consistent formatting throughout
- [ ] No formatting inconsistencies between entries
- [ ] Run: `python scripts/format_bibtex.py references.bib --sort year`
## Automated Validation
### Step 1: Format and Clean
```bash
python scripts/format_bibtex.py references.bib \
--deduplicate \
--sort year \
--descending \
--output clean_references.bib
```
**What this does**:
- Removes duplicates
- Standardizes formatting
- Fixes common issues (page ranges, DOI format, etc.)
- Sorts by year (newest first)
### Step 2: Validate
```bash
python scripts/validate_citations.py clean_references.bib \
--check-dois \
--report validation_report.json \
--verbose
```
**What this does**:
- Checks required fields
- Verifies DOIs resolve
- Detects duplicates
- Validates syntax
- Generates detailed report
### Step 3: Review Report
```bash
cat validation_report.json
```
**Address any**:
- **Errors**: Must fix (missing fields, broken DOIs, syntax errors)
- **Warnings**: Should fix (missing recommended fields, formatting issues)
- **Duplicates**: Remove or consolidate
### Step 4: Final Check
```bash
python scripts/validate_citations.py clean_references.bib --verbose
```
**Goal**: Zero errors, minimal warnings
## Manual Review Checklist
### Critical Citations (Top 10-20 Most Important)
For your most important citations, manually verify:
- [ ] Visit DOI link and confirm it's the correct article
- [ ] Check author names against the actual publication
- [ ] Verify year matches publication date
- [ ] Confirm journal/conference name is correct
- [ ] Check that volume/pages match
### Common Issues to Watch For
**Missing Information**:
- [ ] No DOI for papers published after 2000
- [ ] Missing volume or page numbers for journal articles
- [ ] Missing publisher for books
- [ ] Missing conference location for proceedings
**Formatting Errors**:
- [ ] Single hyphen in page ranges (123-145 → 123--145)
- [ ] Ampersands in author lists (Smith & Jones → Smith and Jones)
- [ ] Unprotected acronyms in titles (DNA → {DNA})
- [ ] DOI includes URL prefix (https://doi.org/10.xxx → 10.xxx)
**Metadata Mismatches**:
- [ ] Author names differ from publication
- [ ] Year is online-first instead of print publication
- [ ] Journal name abbreviated when it should be full
- [ ] Volume/issue numbers swapped
**Duplicates**:
- [ ] Same paper cited with different citation keys
- [ ] Preprint and published version both cited
- [ ] Conference paper and journal version both cited
## Field-Specific Checks
### Biomedical Sciences
- [ ] PubMed Central ID (PMCID) included when available
- [ ] MeSH terms appropriate (if using)
- [ ] Clinical trial registration number included (if applicable)
- [ ] All references to treatments/drugs accurately cited
### Computer Science
- [ ] arXiv ID included for preprints
- [ ] Conference proceedings properly cited (not just "NeurIPS")
- [ ] Software/dataset citations include version numbers
- [ ] GitHub links stable and permanent
### General Sciences
- [ ] Data availability statements properly cited
- [ ] Retracted papers identified and removed
- [ ] Preprints checked for published versions
- [ ] Supplementary materials referenced if critical
## Final Pre-Submission Steps
### 1 Week Before Submission
- [ ] Run full validation with DOI checking
- [ ] Fix all errors and critical warnings
- [ ] Manually verify top 10-20 most important citations
- [ ] Check for any retracted papers
### 3 Days Before Submission
- [ ] Re-run validation after any manual edits
- [ ] Ensure all in-text citations have corresponding bibliography entries
- [ ] Ensure all bibliography entries are cited in text
- [ ] Check citation style matches journal requirements
### 1 Day Before Submission
- [ ] Final validation check
- [ ] LaTeX compilation successful with no warnings
- [ ] PDF renders all citations correctly
- [ ] Bibliography appears in correct format
- [ ] No placeholder citations (Smith et al. XXXX)
### Submission Day
- [ ] One final validation run
- [ ] No last-minute edits without re-validation
- [ ] Bibliography file included in submission package
- [ ] Figures/tables referenced in text match bibliography
## Quality Metrics
### Excellent Bibliography
- ✓ 100% of entries have DOIs (for modern papers)
- ✓ Zero validation errors
- ✓ Zero missing required fields
- ✓ Zero broken DOIs
- ✓ Zero duplicates
- ✓ Consistent formatting throughout
- ✓ All citations manually spot-checked
### Acceptable Bibliography
- ✓ 90%+ of modern entries have DOIs
- ✓ Zero high-severity errors
- ✓ Minor warnings only (e.g., missing recommended fields)
- ✓ Key citations manually verified
- ✓ Compilation succeeds without errors
### Needs Improvement
- ✗ Missing DOIs for recent papers
- ✗ High-severity validation errors
- ✗ Broken or incorrect DOIs
- ✗ Duplicate entries
- ✗ Inconsistent formatting
- ✗ Compilation warnings or errors
## Emergency Fixes
If you discover issues at the last minute:
### Broken DOI
```bash
# Find correct DOI
# Option 1: Search CrossRef
# https://www.crossref.org/
# Option 2: Search on publisher website
# Option 3: Google Scholar
# Re-extract metadata
python scripts/extract_metadata.py --doi CORRECT_DOI
```
### Missing Information
```bash
# Extract from DOI
python scripts/extract_metadata.py --doi 10.xxxx/yyyy
# Or from PMID (biomedical)
python scripts/extract_metadata.py --pmid 12345678
# Or from arXiv
python scripts/extract_metadata.py --arxiv 2103.12345
```
### Duplicate Entries
```bash
# Auto-remove duplicates
python scripts/format_bibtex.py references.bib \
--deduplicate \
--output fixed_references.bib
```
### Formatting Errors
```bash
# Auto-fix common issues
python scripts/format_bibtex.py references.bib \
--output fixed_references.bib
# Then validate
python scripts/validate_citations.py fixed_references.bib
```
## Long-Term Best Practices
### During Research
- [ ] Add citations to bibliography file as you find them
- [ ] Extract metadata immediately using DOI
- [ ] Validate after every 10-20 additions
- [ ] Keep bibliography file under version control
### During Writing
- [ ] Cite as you write
- [ ] Use consistent citation keys
- [ ] Don't delay adding references
- [ ] Validate weekly
### Before Submission
- [ ] Allow 2-3 days for citation cleanup
- [ ] Don't wait until the last day
- [ ] Automate what you can
- [ ] Manually verify critical citations
## Tool Quick Reference
### Extract Metadata
```bash
# From DOI
python scripts/doi_to_bibtex.py 10.1038/nature12345
# From multiple sources
python scripts/extract_metadata.py \
--doi 10.1038/nature12345 \
--pmid 12345678 \
--arxiv 2103.12345 \
--output references.bib
```
### Validate
```bash
# Basic validation
python scripts/validate_citations.py references.bib
# With DOI checking (slow but thorough)
python scripts/validate_citations.py references.bib --check-dois
# Generate report
python scripts/validate_citations.py references.bib \
--report validation.json \
--verbose
```
### Format and Clean
```bash
# Format and fix issues
python scripts/format_bibtex.py references.bib
# Remove duplicates and sort
python scripts/format_bibtex.py references.bib \
--deduplicate \
--sort year \
--descending \
--output clean_refs.bib
```
## Summary
**Minimum Requirements**:
1. Run `format_bibtex.py --deduplicate`
2. Run `validate_citations.py`
3. Fix all errors
4. Compile successfully
**Recommended**:
1. Format, deduplicate, and sort
2. Validate with `--check-dois`
3. Fix all errors and warnings
4. Manually verify top citations
5. Re-validate after fixes
**Best Practice**:
1. Validate throughout research process
2. Use automated tools consistently
3. Keep bibliography clean and organized
4. Document any special cases
5. Final validation 1-3 days before submission
**Remember**: Citation errors reflect poorly on your scholarship. Taking time to ensure accuracy is worthwhile!

View File

@@ -0,0 +1,908 @@
# BibTeX Formatting Guide
Comprehensive guide to BibTeX entry types, required fields, formatting conventions, and best practices.
## Overview
BibTeX is the standard bibliography format for LaTeX documents. Proper formatting ensures:
- Correct citation rendering
- Consistent formatting
- Compatibility with citation styles
- No compilation errors
This guide covers all common entry types and formatting rules.
## Entry Types
### @article - Journal Articles
**Most common entry type** for peer-reviewed journal articles.
**Required fields**:
- `author`: Author names
- `title`: Article title
- `journal`: Journal name
- `year`: Publication year
**Optional fields**:
- `volume`: Volume number
- `number`: Issue number
- `pages`: Page range
- `month`: Publication month
- `doi`: Digital Object Identifier
- `url`: URL
- `note`: Additional notes
**Template**:
```bibtex
@article{CitationKey2024,
author = {Last1, First1 and Last2, First2},
title = {Article Title Here},
journal = {Journal Name},
year = {2024},
volume = {10},
number = {3},
pages = {123--145},
doi = {10.1234/journal.2024.123456},
month = jan
}
```
**Example**:
```bibtex
@article{Jumper2021,
author = {Jumper, John and Evans, Richard and Pritzel, Alexander and others},
title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
journal = {Nature},
year = {2021},
volume = {596},
number = {7873},
pages = {583--589},
doi = {10.1038/s41586-021-03819-2}
}
```
### @book - Books
**For entire books**.
**Required fields**:
- `author` OR `editor`: Author(s) or editor(s)
- `title`: Book title
- `publisher`: Publisher name
- `year`: Publication year
**Optional fields**:
- `volume`: Volume number (if multi-volume)
- `series`: Series name
- `address`: Publisher location
- `edition`: Edition number
- `isbn`: ISBN
- `url`: URL
**Template**:
```bibtex
@book{CitationKey2024,
author = {Last, First},
title = {Book Title},
publisher = {Publisher Name},
year = {2024},
edition = {3},
address = {City, Country},
isbn = {978-0-123-45678-9}
}
```
**Example**:
```bibtex
@book{Kumar2021,
author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
title = {Robbins and Cotran Pathologic Basis of Disease},
publisher = {Elsevier},
year = {2021},
edition = {10},
address = {Philadelphia, PA},
isbn = {978-0-323-53113-9}
}
```
### @inproceedings - Conference Papers
**For papers in conference proceedings**.
**Required fields**:
- `author`: Author names
- `title`: Paper title
- `booktitle`: Conference/proceedings name
- `year`: Year
**Optional fields**:
- `editor`: Proceedings editor(s)
- `volume`: Volume number
- `series`: Series name
- `pages`: Page range
- `address`: Conference location
- `month`: Conference month
- `organization`: Organizing body
- `publisher`: Publisher
- `doi`: DOI
**Template**:
```bibtex
@inproceedings{CitationKey2024,
author = {Last, First},
title = {Paper Title},
booktitle = {Proceedings of Conference Name},
year = {2024},
pages = {123--145},
address = {City, Country},
month = jun
}
```
**Example**:
```bibtex
@inproceedings{Vaswani2017,
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and others},
title = {Attention is All You Need},
booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
year = {2017},
pages = {5998--6008},
address = {Long Beach, CA}
}
```
**Note**: `@conference` is an alias for `@inproceedings`.
### @incollection - Book Chapters
**For chapters in edited books**.
**Required fields**:
- `author`: Chapter author(s)
- `title`: Chapter title
- `booktitle`: Book title
- `publisher`: Publisher name
- `year`: Publication year
**Optional fields**:
- `editor`: Book editor(s)
- `volume`: Volume number
- `series`: Series name
- `type`: Type of section (e.g., "chapter")
- `chapter`: Chapter number
- `pages`: Page range
- `address`: Publisher location
- `edition`: Edition
- `month`: Month
**Template**:
```bibtex
@incollection{CitationKey2024,
author = {Last, First},
title = {Chapter Title},
booktitle = {Book Title},
editor = {Editor, Last and Editor2, Last},
publisher = {Publisher Name},
year = {2024},
pages = {123--145},
chapter = {5}
}
```
**Example**:
```bibtex
@incollection{Brown2020,
author = {Brown, Peter O. and Botstein, David},
title = {Exploring the New World of the Genome with {DNA} Microarrays},
booktitle = {DNA Microarrays: A Molecular Cloning Manual},
editor = {Eisen, Michael B. and Brown, Patrick O.},
publisher = {Cold Spring Harbor Laboratory Press},
year = {2020},
pages = {1--45},
address = {Cold Spring Harbor, NY}
}
```
### @phdthesis - Doctoral Dissertations
**For PhD dissertations and theses**.
**Required fields**:
- `author`: Author name
- `title`: Thesis title
- `school`: Institution
- `year`: Year
**Optional fields**:
- `type`: Type (e.g., "PhD dissertation", "PhD thesis")
- `address`: Institution location
- `month`: Month
- `url`: URL
- `note`: Additional notes
**Template**:
```bibtex
@phdthesis{CitationKey2024,
author = {Last, First},
title = {Dissertation Title},
school = {University Name},
year = {2024},
type = {{PhD} dissertation},
address = {City, State}
}
```
**Example**:
```bibtex
@phdthesis{Johnson2023,
author = {Johnson, Mary L.},
title = {Novel Approaches to Cancer Immunotherapy Using {CRISPR} Technology},
school = {Stanford University},
year = {2023},
type = {{PhD} dissertation},
address = {Stanford, CA}
}
```
**Note**: `@mastersthesis` is similar but for Master's theses.
### @mastersthesis - Master's Theses
**For Master's theses**.
**Required fields**:
- `author`: Author name
- `title`: Thesis title
- `school`: Institution
- `year`: Year
**Template**:
```bibtex
@mastersthesis{CitationKey2024,
author = {Last, First},
title = {Thesis Title},
school = {University Name},
year = {2024}
}
```
### @misc - Miscellaneous
**For items that don't fit other categories** (preprints, datasets, software, websites, etc.).
**Required fields**:
- `author` (if known)
- `title`
- `year`
**Optional fields**:
- `howpublished`: Repository, website, format
- `url`: URL
- `doi`: DOI
- `note`: Additional information
- `month`: Month
**Template for preprints**:
```bibtex
@misc{CitationKey2024,
author = {Last, First},
title = {Preprint Title},
year = {2024},
howpublished = {bioRxiv},
doi = {10.1101/2024.01.01.123456},
note = {Preprint}
}
```
**Template for datasets**:
```bibtex
@misc{DatasetName2024,
author = {Last, First},
title = {Dataset Title},
year = {2024},
howpublished = {Zenodo},
doi = {10.5281/zenodo.123456},
note = {Version 1.2}
}
```
**Template for software**:
```bibtex
@misc{SoftwareName2024,
author = {Last, First},
title = {Software Name},
year = {2024},
howpublished = {GitHub},
url = {https://github.com/user/repo},
note = {Version 2.0}
}
```
### @techreport - Technical Reports
**For technical reports**.
**Required fields**:
- `author`: Author name(s)
- `title`: Report title
- `institution`: Institution
- `year`: Year
**Optional fields**:
- `type`: Type of report
- `number`: Report number
- `address`: Institution location
- `month`: Month
**Template**:
```bibtex
@techreport{CitationKey2024,
author = {Last, First},
title = {Report Title},
institution = {Institution Name},
year = {2024},
type = {Technical Report},
number = {TR-2024-01}
}
```
### @unpublished - Unpublished Work
**For unpublished works** (not preprints - use @misc for those).
**Required fields**:
- `author`: Author name(s)
- `title`: Work title
- `note`: Description
**Optional fields**:
- `month`: Month
- `year`: Year
**Template**:
```bibtex
@unpublished{CitationKey2024,
author = {Last, First},
title = {Work Title},
note = {Unpublished manuscript},
year = {2024}
}
```
### @online/@electronic - Online Resources
**For web pages and online-only content**.
**Note**: Not standard BibTeX, but supported by many bibliography packages (biblatex).
**Required fields**:
- `author` OR `organization`
- `title`
- `url`
- `year`
**Template**:
```bibtex
@online{CitationKey2024,
author = {{Organization Name}},
title = {Page Title},
url = {https://example.com/page},
year = {2024},
note = {Accessed: 2024-01-15}
}
```
## Formatting Rules
### Citation Keys
**Convention**: `FirstAuthorYEARkeyword`
**Examples**:
```bibtex
Smith2024protein
Doe2023machine
JohnsonWilliams2024cancer % Multiple authors, no space
NatureEditorial2024 % No author, use publication
WHO2024guidelines % Organization author
```
**Rules**:
- Alphanumeric plus: `-`, `_`, `.`, `:`
- No spaces
- Case-sensitive
- Unique within file
- Descriptive
**Avoid**:
- Special characters: `@`, `#`, `&`, `%`, `$`
- Spaces: use CamelCase or underscores
- Starting with numbers: `2024Smith` (some systems disallow)
### Author Names
**Recommended format**: `Last, First Middle`
**Single author**:
```bibtex
author = {Smith, John}
author = {Smith, John A.}
author = {Smith, John Andrew}
```
**Multiple authors** - separate with `and`:
```bibtex
author = {Smith, John and Doe, Jane}
author = {Smith, John A. and Doe, Jane M. and Johnson, Mary L.}
```
**Many authors** (10+):
```bibtex
author = {Smith, John and Doe, Jane and Johnson, Mary and others}
```
**Special cases**:
```bibtex
% Suffix (Jr., III, etc.)
author = {King, Jr., Martin Luther}
% Organization as author
author = {{World Health Organization}}
% Note: Double braces keep as single entity
% Multiple surnames
author = {Garc{\'i}a-Mart{\'i}nez, Jos{\'e}}
% Particles (van, von, de, etc.)
author = {van der Waals, Johannes}
author = {de Broglie, Louis}
```
**Wrong formats** (don't use):
```bibtex
author = {Smith, J.; Doe, J.} % Semicolons (wrong)
author = {Smith, J., Doe, J.} % Commas (wrong)
author = {Smith, J. & Doe, J.} % Ampersand (wrong)
author = {Smith J} % No comma
```
### Title Capitalization
**Protect capitalization** with braces:
```bibtex
% Proper nouns, acronyms, formulas
title = {{AlphaFold}: Protein Structure Prediction}
title = {Machine Learning for {DNA} Sequencing}
title = {The {Ising} Model in Statistical Physics}
title = {{CRISPR-Cas9} Gene Editing Technology}
```
**Reason**: Citation styles may change capitalization. Braces protect.
**Examples**:
```bibtex
% Good
title = {Advances in {COVID-19} Treatment}
title = {Using {Python} for Data Analysis}
title = {The {AlphaFold} Protein Structure Database}
% Will be lowercased by styles that apply sentence case
title = {Advances in COVID-19 Treatment} % covid-19
title = {Using Python for Data Analysis} % python
```
**Whole title protection** (rarely needed):
```bibtex
title = {{This Entire Title Keeps Its Capitalization}}
```
### Page Ranges
**Use en-dash** (double hyphen `--`):
```bibtex
pages = {123--145} % Correct
pages = {1234--1256} % Correct
pages = {e0123456} % Article ID (PLOS, etc.)
pages = {123} % Single page
```
**Wrong**:
```bibtex
pages = {123-145} % Single hyphen (don't use)
pages = {pp. 123-145} % "pp." not needed
pages = {123–145} % Unicode en-dash (may cause issues)
```
### Month Names
**Use three-letter abbreviations** (unquoted):
```bibtex
month = jan
month = feb
month = mar
month = apr
month = may
month = jun
month = jul
month = aug
month = sep
month = oct
month = nov
month = dec
```
**Or numeric**:
```bibtex
month = {1} % January
month = {12} % December
```
**Or full name in braces**:
```bibtex
month = {January}
```
**Standard abbreviations work without quotes** because they're defined in BibTeX.
### Journal Names
**Full name** (not abbreviated):
```bibtex
journal = {Nature}
journal = {Science}
journal = {Cell}
journal = {Proceedings of the National Academy of Sciences}
journal = {Journal of the American Chemical Society}
```
**Bibliography style** will handle abbreviation if needed.
**Avoid manual abbreviation**:
```bibtex
% Don't do this in BibTeX file
journal = {Proc. Natl. Acad. Sci. U.S.A.}
% Do this instead
journal = {Proceedings of the National Academy of Sciences}
```
**Exception**: If style requires abbreviations, use full abbreviated form:
```bibtex
journal = {Proc. Natl. Acad. Sci. U.S.A.} % If required by style
```
### DOI Formatting
**Bare DOI** (preferred — no resolver URL or prefix):
```bibtex
doi = {10.1038/s41586-021-03819-2}
```
**Not**:
```bibtex
doi = {https://doi.org/10.1038/s41586-021-03819-2} % Don't include URL
doi = {doi:10.1038/s41586-021-03819-2} % Don't include prefix
```
**LaTeX** will format as URL automatically.
**Note**: No period after DOI field!
### URL Formatting
```bibtex
url = {https://www.example.com/article}
```
**Use**:
- When DOI not available
- For web pages
- For supplementary materials
**Don't duplicate**:
```bibtex
% Don't include both if DOI URL is same as url
doi = {10.1038/nature12345}
url = {https://doi.org/10.1038/nature12345} % Redundant!
```
### Special Characters
**Accents and diacritics**:
```bibtex
author = {M{\"u}ller, Hans} % ü
author = {Garc{\'i}a, Jos{\'e}} % í, é
author = {Erd{\H{o}}s, Paul} % ő
author = {Schr{\"o}dinger, Erwin} % ö
```
**Or use UTF-8** (with proper LaTeX setup):
```bibtex
author = {Müller, Hans}
author = {García, José}
```
**Mathematical symbols**:
```bibtex
title = {The $\alpha$-helix Structure}
title = {$\beta$-sheet Prediction}
```
**Chemical formulas**:
```bibtex
title = {H$_2$O Molecular Dynamics}
% Or with chemformula package:
title = {\ce{H2O} Molecular Dynamics}
```
### Field Order
**Recommended order** (for readability):
```bibtex
@article{Key,
author = {},
title = {},
journal = {},
year = {},
volume = {},
number = {},
pages = {},
doi = {},
url = {},
note = {}
}
```
**Rules**:
- Most important fields first
- Consistent across entries
- Use formatter to standardize
## Best Practices
### 1. Consistent Formatting
Use same format throughout:
- Author name format
- Title capitalization
- Journal names
- Citation key style
### 2. Required Fields
Always include:
- All required fields for entry type
- DOI for modern papers (2000+)
- Volume and pages for articles
- Publisher for books
### 3. Protect Capitalization
Use braces for:
- Proper nouns: `{AlphaFold}`
- Acronyms: `{DNA}`, `{CRISPR}`
- Formulas: `{H2O}`
- Names: `{Python}`, `{R}`
### 4. Complete Author Lists
Include all authors when possible:
- All authors if <10
- Use "and others" for 10+
- Don't abbreviate to "et al." manually
### 5. Use Standard Entry Types
Choose correct entry type:
- Journal article → `@article`
- Book → `@book`
- Conference paper → `@inproceedings`
- Preprint → `@misc`
### 6. Validate Syntax
Check for:
- Balanced braces
- Commas after fields
- Unique citation keys
- Valid entry types
### 7. Use Formatters
Use automated tools:
```bash
python scripts/format_bibtex.py references.bib
```
Benefits:
- Consistent formatting
- Catch syntax errors
- Standardize field order
- Fix common issues
## Common Mistakes
### 1. Wrong Author Separator
**Wrong**:
```bibtex
author = {Smith, J.; Doe, J.} % Semicolon
author = {Smith, J., Doe, J.} % Comma
author = {Smith, J. & Doe, J.} % Ampersand
```
**Correct**:
```bibtex
author = {Smith, John and Doe, Jane}
```
### 2. Missing Commas
**Wrong**:
```bibtex
@article{Smith2024,
author = {Smith, John} % Missing comma!
title = {Title}
}
```
**Correct**:
```bibtex
@article{Smith2024,
author = {Smith, John}, % Comma after each field
title = {Title}
}
```
### 3. Unprotected Capitalization
**Wrong**:
```bibtex
title = {Machine Learning with Python}
% "Python" will become "python" in title case
```
**Correct**:
```bibtex
title = {Machine Learning with {Python}}
```
### 4. Single Hyphen in Pages
**Wrong**:
```bibtex
pages = {123-145} % Single hyphen
```
**Correct**:
```bibtex
pages = {123--145} % Double hyphen (en-dash)
```
### 5. Redundant "pp." in Pages
**Wrong**:
```bibtex
pages = {pp. 123--145}
```
**Correct**:
```bibtex
pages = {123--145}
```
### 6. DOI with URL Prefix
**Wrong**:
```bibtex
doi = {https://doi.org/10.1038/nature12345}
doi = {doi:10.1038/nature12345}
```
**Correct**:
```bibtex
doi = {10.1038/nature12345}
```
## Example Complete Bibliography
```bibtex
% Journal article
@article{Jumper2021,
author = {Jumper, John and Evans, Richard and Pritzel, Alexander and others},
title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
journal = {Nature},
year = {2021},
volume = {596},
number = {7873},
pages = {583--589},
doi = {10.1038/s41586-021-03819-2}
}
% Book
@book{Kumar2021,
author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
title = {Robbins and Cotran Pathologic Basis of Disease},
publisher = {Elsevier},
year = {2021},
edition = {10},
address = {Philadelphia, PA},
isbn = {978-0-323-53113-9}
}
% Conference paper
@inproceedings{Vaswani2017,
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and others},
title = {Attention is All You Need},
booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
year = {2017},
pages = {5998--6008}
}
% Book chapter
@incollection{Brown2020,
  author = {Brown, Patrick O. and Botstein, David},
title = {Exploring the New World of the Genome with {DNA} Microarrays},
booktitle = {DNA Microarrays: A Molecular Cloning Manual},
editor = {Eisen, Michael B. and Brown, Patrick O.},
publisher = {Cold Spring Harbor Laboratory Press},
year = {2020},
pages = {1--45}
}
% PhD thesis
@phdthesis{Johnson2023,
author = {Johnson, Mary L.},
title = {Novel Approaches to Cancer Immunotherapy},
school = {Stanford University},
year = {2023},
type = {{PhD} dissertation}
}
% Preprint
@misc{Zhang2024,
author = {Zhang, Yi and Chen, Li and Wang, Hui},
title = {Novel Therapeutic Targets in {Alzheimer}'s Disease},
year = {2024},
howpublished = {bioRxiv},
doi = {10.1101/2024.01.001},
note = {Preprint}
}
% Dataset
@misc{AlphaFoldDB2021,
author = {{DeepMind} and {EMBL-EBI}},
title = {{AlphaFold} Protein Structure Database},
year = {2021},
howpublished = {Database},
url = {https://alphafold.ebi.ac.uk/},
doi = {10.1093/nar/gkab1061}
}
```
## Summary
BibTeX formatting essentials:
- **Choose correct entry type** (@article, @book, etc.)
- **Include all required fields**
- **Use `and` for multiple authors**
- **Protect capitalization** with braces
- **Use `--` for page ranges**
- **Include DOI** for modern papers
- **Validate syntax** before compilation
Use formatting tools to ensure consistency:
```bash
python scripts/format_bibtex.py references.bib
```
Properly formatted BibTeX ensures correct, consistent citations across all bibliography styles!

---
# Citation Validation Guide
Comprehensive guide to validating citation accuracy, completeness, and formatting in BibTeX files.
## Overview
Citation validation ensures:
- All citations are accurate and complete
- DOIs resolve correctly
- Required fields are present
- No duplicate entries
- Proper formatting and syntax
- Links are accessible
Validation should be performed:
- After extracting metadata
- Before manuscript submission
- After manual edits to BibTeX files
- Periodically for maintained bibliographies
## Validation Categories
### 1. DOI Verification
**Purpose**: Ensure DOIs are valid and resolve correctly.
#### What to Check
**DOI format**:
```
Valid: 10.1038/s41586-021-03819-2
Valid: 10.1126/science.aam9317
Invalid: 10.1038/invalid
Invalid: doi:10.1038/... (should omit "doi:" prefix in BibTeX)
```
**DOI resolution**:
- DOI should resolve via https://doi.org/
- Should redirect to actual article
- Should not return 404 or error
**Metadata consistency**:
- CrossRef metadata should match BibTeX
- Author names should align
- Title should match
- Year should match
#### How to Validate
**Manual check**:
1. Copy DOI from BibTeX
2. Visit https://doi.org/10.1038/nature12345
3. Verify it redirects to correct article
4. Check metadata matches
**Automated check** (recommended):
```bash
python scripts/validate_citations.py references.bib --check-dois
```
**Process**:
1. Extract all DOIs from BibTeX file
2. Query doi.org resolver for each
3. Query CrossRef API for metadata
4. Compare metadata with BibTeX entry
5. Report discrepancies
#### Common Issues
**Broken DOIs**:
- Typos in DOI
- Publisher changed DOI (rare)
- Article retracted
- Solution: Find correct DOI from publisher site
**Mismatched metadata**:
- BibTeX has old/incorrect information
- Solution: Re-extract metadata from CrossRef
**Missing DOIs**:
- Older articles may not have DOIs
- Acceptable for pre-2000 publications
- Add URL or PMID instead
### 2. Required Fields
**Purpose**: Ensure all necessary information is present.
#### Required by Entry Type
**@article**:
```bibtex
author % REQUIRED
title % REQUIRED
journal % REQUIRED
year % REQUIRED
volume % Highly recommended
pages % Highly recommended
doi % Highly recommended for modern papers
```
**@book**:
```bibtex
author OR editor % REQUIRED (at least one)
title % REQUIRED
publisher % REQUIRED
year % REQUIRED
isbn % Recommended
```
**@inproceedings**:
```bibtex
author % REQUIRED
title % REQUIRED
booktitle % REQUIRED (conference/proceedings name)
year % REQUIRED
pages % Recommended
```
**@incollection** (book chapter):
```bibtex
author % REQUIRED
title % REQUIRED (chapter title)
booktitle % REQUIRED (book title)
publisher % REQUIRED
year % REQUIRED
editor % Recommended
pages % Recommended
```
**@phdthesis**:
```bibtex
author % REQUIRED
title % REQUIRED
school % REQUIRED
year % REQUIRED
```
**@misc** (preprints, datasets, etc.):
```bibtex
author % REQUIRED
title % REQUIRED
year % REQUIRED
howpublished % Recommended (bioRxiv, Zenodo, etc.)
doi OR url % At least one required
```
#### Validation Script
```bash
python scripts/validate_citations.py references.bib --check-required-fields
```
**Output**:
```
Error: Entry 'Smith2024' missing required field 'journal'
Error: Entry 'Doe2023' missing required field 'year'
Warning: Entry 'Jones2022' missing recommended field 'volume'
```
### 3. Author Name Formatting
**Purpose**: Ensure consistent, correct author name formatting.
#### Proper Format
**Recommended BibTeX format**:
```bibtex
author = {Last1, First1 and Last2, First2 and Last3, First3}
```
**Examples**:
```bibtex
% Correct
author = {Smith, John}
author = {Smith, John A.}
author = {Smith, John Andrew}
author = {Smith, John and Doe, Jane}
author = {Smith, John and Doe, Jane and Johnson, Mary}
% For many authors
author = {Smith, John and Doe, Jane and others}
% Incorrect
author = {John Smith} % First Last format (not recommended)
author = {Smith, J.; Doe, J.} % Semicolon separator (wrong)
author = {Smith J, Doe J} % Missing commas
```
#### Special Cases
**Suffixes (Jr., III, etc.)**:
```bibtex
author = {King, Jr., Martin Luther}
```
**Multiple surnames (hyphenated)**:
```bibtex
author = {Smith-Jones, Mary}
```
**Van, von, de, etc.**:
```bibtex
author = {van der Waals, Johannes}
author = {de Broglie, Louis}
```
**Organizations as authors**:
```bibtex
author = {{World Health Organization}}
% Double braces treat as single author
```
#### Validation Checks
**Automated validation**:
```bash
python scripts/validate_citations.py references.bib --check-authors
```
**Checks for**:
- Proper separator (and, not &, ; , etc.)
- Comma placement
- Empty author fields
- Malformed names
### 4. Data Consistency
**Purpose**: Ensure all fields contain valid, reasonable values.
#### Year Validation
**Valid years**:
```bibtex
year = {2024} % Current/recent
year = {1953} % Watson & Crick DNA structure (historical)
year = {1665} % Hooke's Micrographia (very old)
```
**Invalid years**:
```bibtex
year = {24} % Two digits (ambiguous)
year = {202} % Typo
year = {2025} % Future (unless accepted/in press)
year = {0} % Obviously wrong
```
**Check**:
- Four digits
- Reasonable range (1600-current+1)
- Not all zeros
#### Volume/Number Validation
```bibtex
volume = {123} % Numeric
volume = {12} % Valid
number = {3} % Valid
number = {S1} % Supplement issue (valid)
```
**Invalid**:
```bibtex
volume = {Vol. 123} % Should be just number
number = {Issue 3} % Should be just number
```
#### Page Range Validation
**Correct format**:
```bibtex
pages = {123--145} % En-dash (two hyphens)
pages = {e0123456} % PLOS-style article ID
pages = {123} % Single page
```
**Incorrect format**:
```bibtex
pages = {123-145} % Single hyphen (use --)
pages = {pp. 123-145} % Remove "pp."
pages = {123–145}     % Unicode en-dash character (may cause issues)
```
#### URL Validation
**Check**:
- URLs are accessible (return 200 status)
- HTTPS when available
- No obvious typos
- Permanent links (not temporary)
**Valid**:
```bibtex
url = {https://www.nature.com/articles/nature12345}
url = {https://arxiv.org/abs/2103.14030}
```
**Questionable**:
```bibtex
url = {http://...} % HTTP instead of HTTPS
url = {file:///...} % Local file path
url = {bit.ly/...} % URL shortener (not permanent)
```
### 5. Duplicate Detection
**Purpose**: Find and remove duplicate entries.
#### Types of Duplicates
**Exact duplicates** (same DOI):
```bibtex
@article{Smith2024a,
doi = {10.1038/nature12345},
...
}
@article{Smith2024b,
doi = {10.1038/nature12345}, % Same DOI!
...
}
```
**Near duplicates** (similar title/authors):
```bibtex
@article{Smith2024,
title = {Machine Learning for Drug Discovery},
...
}
@article{Smith2024method,
title = {Machine learning for drug discovery}, % Same, different case
...
}
```
**Preprint + Published**:
```bibtex
@misc{Smith2023arxiv,
title = {AlphaFold Results},
howpublished = {arXiv},
...
}
@article{Smith2024,
title = {AlphaFold Results}, % Same paper, now published
journal = {Nature},
...
}
% Keep published version only
```
#### Detection Methods
**By DOI** (most reliable):
- Same DOI = exact duplicate
- Keep one, remove other
**By title similarity**:
- Normalize: lowercase, remove punctuation
- Calculate similarity (e.g., Levenshtein distance)
- Flag if >90% similar
**By author-year-title**:
- Same first author + year + similar title
- Likely duplicate
**Automated detection**:
```bash
python scripts/validate_citations.py references.bib --check-duplicates
```
**Output**:
```
Warning: Possible duplicate entries:
- Smith2024a (DOI: 10.1038/nature12345)
- Smith2024b (DOI: 10.1038/nature12345)
Recommendation: Keep one entry, remove the other.
```
### 6. Format and Syntax
**Purpose**: Ensure valid BibTeX syntax.
#### Common Syntax Errors
**Missing commas**:
```bibtex
@article{Smith2024,
author = {Smith, John} % Missing comma!
title = {Title}
}
% Should be:
author = {Smith, John}, % Comma after each field
```
**Unbalanced braces**:
```bibtex
title = {Title with {Protected} Text % Missing closing brace
% Should be:
title = {Title with {Protected} Text}
```
**Missing closing brace for entry**:
```bibtex
@article{Smith2024,
author = {Smith, John},
title = {Title}
% Missing closing brace!
% Should end with:
}
```
**Invalid characters in keys**:
```bibtex
@article{Smith&Doe2024, % & not allowed in key
...
}
% Use:
@article{SmithDoe2024,
...
}
```
#### BibTeX Syntax Rules
**Entry structure**:
```bibtex
@TYPE{citationkey,
field1 = {value1},
field2 = {value2},
...
fieldN = {valueN}
}
```
**Citation keys**:
- Alphanumeric and some punctuation (-, _, ., :)
- No spaces
- Case-sensitive
- Unique within file
**Field values**:
- Enclosed in {braces} or "quotes"
- Braces preferred for complex text
- Numbers can be unquoted: `year = 2024`
**Special characters**:
- `{` and `}` for grouping
- `\` for LaTeX commands
- Protect capitalization: `{AlphaFold}`
- Accents: `{\"u}`, `{\'e}`, `{\aa}`
#### Validation
```bash
python scripts/validate_citations.py references.bib --check-syntax
```
**Checks**:
- Valid BibTeX structure
- Balanced braces
- Proper commas
- Valid entry types
- Unique citation keys
## Validation Workflow
### Step 1: Basic Validation
Run comprehensive validation:
```bash
python scripts/validate_citations.py references.bib
```
**Checks all**:
- DOI resolution
- Required fields
- Author formatting
- Data consistency
- Duplicates
- Syntax
### Step 2: Review Report
Examine validation report:
```json
{
"total_entries": 150,
"valid_entries": 140,
"errors": [
{
"entry": "Smith2024",
"error": "missing_required_field",
"field": "journal",
"severity": "high"
},
{
"entry": "Doe2023",
"error": "invalid_doi",
"doi": "10.1038/broken",
"severity": "high"
}
],
"warnings": [
{
"entry": "Jones2022",
"warning": "missing_recommended_field",
"field": "volume",
"severity": "medium"
}
],
"duplicates": [
{
"entries": ["Smith2024a", "Smith2024b"],
"reason": "same_doi",
"doi": "10.1038/nature12345"
}
]
}
```
### Step 3: Fix Issues
**High-priority** (errors):
1. Add missing required fields
2. Fix broken DOIs
3. Remove duplicates
4. Correct syntax errors
**Medium-priority** (warnings):
1. Add recommended fields
2. Improve author formatting
3. Fix page ranges
**Low-priority**:
1. Standardize formatting
2. Add URLs for accessibility
### Step 4: Auto-Fix
Use auto-fix for safe corrections:
```bash
python scripts/validate_citations.py references.bib \
--auto-fix \
--output fixed_references.bib
```
**Auto-fix can**:
- Fix page range format (- to --)
- Remove "pp." from pages
- Standardize author separators
- Fix common syntax errors
- Normalize field order
**Auto-fix cannot**:
- Add missing information
- Find correct DOIs
- Determine which duplicate to keep
- Fix semantic errors
### Step 5: Manual Review
Review auto-fixed file:
```bash
# Check what changed
diff references.bib fixed_references.bib
# Review specific entries that had errors
grep -A 10 "Smith2024" fixed_references.bib
```
### Step 6: Re-Validate
Validate after fixes:
```bash
python scripts/validate_citations.py fixed_references.bib --verbose
```
Should show:
```
✓ All DOIs valid
✓ All required fields present
✓ No duplicates found
✓ Syntax valid
✓ 150/150 entries valid
```
## Validation Checklist
Use this checklist before final submission:
### DOI Validation
- [ ] All DOIs resolve correctly
- [ ] Metadata matches between BibTeX and CrossRef
- [ ] No broken or invalid DOIs
### Completeness
- [ ] All entries have required fields
- [ ] Modern papers (2000+) have DOIs
- [ ] Authors properly formatted
- [ ] Journals/conferences properly named
### Consistency
- [ ] Years are 4-digit numbers
- [ ] Page ranges use -- not -
- [ ] Volume/number are numeric
- [ ] URLs are accessible
### Duplicates
- [ ] No entries with same DOI
- [ ] No near-duplicate titles
- [ ] Preprints updated to published versions
### Formatting
- [ ] Valid BibTeX syntax
- [ ] Balanced braces
- [ ] Proper commas
- [ ] Unique citation keys
### Final Checks
- [ ] Bibliography compiles without errors
- [ ] All citations in text appear in bibliography
- [ ] All bibliography entries cited in text
- [ ] Citation style matches journal requirements
## Best Practices
### 1. Validate Early and Often
```bash
# After extraction
python scripts/extract_metadata.py --doi ... --output refs.bib
python scripts/validate_citations.py refs.bib
# After manual edits
python scripts/validate_citations.py refs.bib
# Before submission
python scripts/validate_citations.py refs.bib --strict
```
### 2. Use Automated Tools
Don't validate manually - use scripts:
- Faster
- More comprehensive
- Catches errors humans miss
- Generates reports
### 3. Keep Backup
```bash
# Before auto-fix
cp references.bib references_backup.bib
# Run auto-fix
python scripts/validate_citations.py references.bib \
--auto-fix \
--output references_fixed.bib
# Review changes
diff references.bib references_fixed.bib
# If satisfied, replace
mv references_fixed.bib references.bib
```
### 4. Fix High-Priority First
**Priority order**:
1. Syntax errors (prevent compilation)
2. Missing required fields (incomplete citations)
3. Broken DOIs (broken links)
4. Duplicates (confusion, wasted space)
5. Missing recommended fields
6. Formatting inconsistencies
### 5. Document Exceptions
For entries that can't be fixed:
```bibtex
@article{Old1950,
author = {Smith, John},
title = {Title},
journal = {Obscure Journal},
year = {1950},
volume = {12},
pages = {34--56},
note = {DOI not available for publications before 2000}
}
```
### 6. Validate Against Journal Requirements
Different journals have different requirements:
- Citation style (numbered, author-year)
- Abbreviations (journal names)
- Maximum reference count
- Format (BibTeX, EndNote, manual)
Check journal author guidelines!
## Common Validation Issues
### Issue 1: Metadata Mismatch
**Problem**: BibTeX says 2023, CrossRef says 2024.
**Cause**:
- Online-first vs print publication
- Correction/update
- Extraction error
**Solution**:
1. Check actual article
2. Use more recent/accurate date
3. Update BibTeX entry
4. Re-validate
### Issue 2: Special Characters
**Problem**: LaTeX compilation fails on special characters.
**Cause**:
- Accented characters (é, ü, ñ)
- Chemical formulas (H₂O)
- Math symbols (α, β, ±)
**Solution**:
```bibtex
% Use LaTeX commands
author = {M{\"u}ller, Hans} % Müller
title = {Study of H\textsubscript{2}O} % H₂O
% Or use UTF-8 with proper LaTeX packages
```
### Issue 3: Incomplete Extraction
**Problem**: Extracted metadata missing fields.
**Cause**:
- Source doesn't provide all metadata
- Extraction error
- Incomplete record
**Solution**:
1. Check original article
2. Manually add missing fields
3. Use alternative source (PubMed vs CrossRef)
### Issue 4: Cannot Find Duplicate
**Problem**: Same paper appears twice, not detected.
**Cause**:
- Different DOIs (should be rare)
- Different titles (abbreviated, typo)
- Different citation keys
**Solution**:
- Manual search for author + year
- Check for similar titles
- Remove manually
## Summary
Validation ensures citation quality:
- **Accuracy**: DOIs resolve, metadata correct
- **Completeness**: All required fields present
- **Consistency**: Proper formatting throughout
- **No duplicates**: Each paper cited once
- **Valid syntax**: BibTeX compiles without errors
**Always validate** before final submission!
Use automated tools:
```bash
python scripts/validate_citations.py references.bib
```
Follow workflow:
1. Extract metadata
2. Validate
3. Fix errors
4. Re-validate
5. Submit

---
# Google Scholar Search Guide
Comprehensive guide to searching Google Scholar for academic papers, including advanced search operators, filtering strategies, and metadata extraction.
## Overview
Google Scholar provides the most comprehensive coverage of academic literature across all disciplines:
- **Coverage**: 100+ million scholarly documents
- **Scope**: All academic disciplines
- **Content types**: Journal articles, books, theses, conference papers, preprints, patents, court opinions
- **Citation tracking**: "Cited by" links for forward citation tracking
- **Accessibility**: Free to use, no account required
## Basic Search
### Simple Keyword Search
Search for papers containing specific terms anywhere in the document (title, abstract, full text):
```
CRISPR gene editing
machine learning protein folding
climate change impact agriculture
quantum computing algorithms
```
**Tips**:
- Use specific technical terms
- Include key acronyms and abbreviations
- Start broad, then refine
- Check spelling of technical terms
### Exact Phrase Search
Use quotation marks to search for exact phrases:
```
"deep learning"
"CRISPR-Cas9"
"systematic review"
"randomized controlled trial"
```
**When to use**:
- Technical terms that must appear together
- Proper names
- Specific methodologies
- Exact titles
## Advanced Search Operators
### Author Search
Find papers by specific authors:
```
author:LeCun
author:"Geoffrey Hinton"
author:Church synthetic biology
```
**Variations**:
- Single last name: `author:Smith`
- Full name in quotes: `author:"Jane Smith"`
- Author + topic: `author:Doudna CRISPR`
**Tips**:
- Authors may publish under different name variations
- Try with and without middle initials
- Consider name changes (marriage, etc.)
- Use quotation marks for full names
### Title Search
Search only in article titles:
```
intitle:transformer
intitle:"attention mechanism"
intitle:review climate change
```
**Use cases**:
- Finding papers specifically about a topic
- More precise than full-text search
- Reduces irrelevant results
- Good for finding reviews or methods
### Source (Journal) Search
Search within specific journals or conferences:
```
source:Nature
source:"Nature Communications"
source:NeurIPS
source:"Journal of Machine Learning Research"
```
**Applications**:
- Track publications in top-tier venues
- Find papers in specialized journals
- Identify conference-specific work
- Verify publication venue
### Exclusion Operator
Exclude terms from results:
```
machine learning -survey
CRISPR -patent
climate change -news
deep learning -tutorial -review
```
**Common exclusions**:
- `-survey`: Exclude survey papers
- `-review`: Exclude review articles
- `-patent`: Exclude patents
- `-book`: Exclude books
- `-news`: Exclude news articles
- `-tutorial`: Exclude tutorials
### OR Operator
Search for papers containing any of multiple terms:
```
"machine learning" OR "deep learning"
CRISPR OR "gene editing"
"climate change" OR "global warming"
```
**Best practices**:
- OR must be uppercase
- Combine synonyms
- Include acronyms and spelled-out versions
- Use with exact phrases
### Wildcard Search
Use asterisk (*) as wildcard for unknown words:
```
"machine * learning"
"CRISPR * editing"
"* neural network"
```
**Note**: Limited wildcard support in Google Scholar compared to other databases.
## Advanced Filtering
### Year Range
Filter by publication year:
**Using interface**:
- Click "Since [year]" on left sidebar
- Select custom range
**Using search operators**:
```
# Not directly in search query
# Use interface or URL parameters
```
**In script**:
```bash
python scripts/search_google_scholar.py "quantum computing" \
--year-start 2020 \
--year-end 2024
```
### Sorting Options
**By relevance** (default):
- Google's algorithm determines relevance
- Considers citations, author reputation, publication venue
- Generally good for most searches
**By date**:
- Most recent papers first
- Good for fast-moving fields
- May miss highly cited older papers
- Click "Sort by date" in interface
**By citation count** (via script):
```bash
python scripts/search_google_scholar.py "transformers" \
--sort-by citations \
--limit 50
```
### Language Filtering
**In interface**:
- Settings → Languages
- Select preferred languages
**Default**: English and papers with English abstracts
## Search Strategies
### Finding Seminal Papers
Identify highly influential papers in a field:
1. **Search by topic** with broad terms
2. **Sort by citations** (most cited first)
3. **Look for review articles** for comprehensive overviews
4. **Check publication dates** for foundational vs recent work
**Example**:
```
"generative adversarial networks"
# Sort by citations
# Top results: original GAN paper (Goodfellow et al., 2014), key variants
```
### Finding Recent Work
Stay current with latest research:
1. **Search by topic**
2. **Filter to recent years** (last 1-2 years)
3. **Sort by date** for newest first
4. **Set up alerts** for ongoing tracking
**Example**:
```bash
python scripts/search_google_scholar.py "AlphaFold protein structure" \
--year-start 2023 \
--year-end 2024 \
--limit 50
```
### Finding Review Articles
Get comprehensive overviews of a field:
```
intitle:review "machine learning"
"systematic review" CRISPR
intitle:survey "natural language processing"
```
**Indicators**:
- "review", "survey", "perspective" in title
- Often highly cited
- Published in review journals (Nature Reviews, Trends, etc.)
- Comprehensive reference lists
### Citation Chain Search
**Forward citations** (papers citing a key paper):
1. Find seminal paper
2. Click "Cited by X"
3. See all papers that cite it
4. Identify how field has developed
**Backward citations** (references in a key paper):
1. Find recent review or important paper
2. Check its reference list
3. Identify foundational work
4. Trace development of ideas
**Example workflow**:
```
# Find original transformer paper
"Attention is all you need" author:Vaswani
# Check "Cited by 120,000+"
# See evolution: BERT, GPT, T5, etc.
# Check references in original paper
# Find RNN, LSTM, attention mechanism origins
```
### Comprehensive Literature Search
For thorough coverage (e.g., systematic reviews):
1. **Generate synonym list**:
- Main terms + alternatives
- Acronyms + spelled out
- US vs UK spelling
2. **Use OR operators**:
```
("machine learning" OR "deep learning" OR "neural networks")
```
3. **Combine multiple concepts**:
```
("machine learning" OR "deep learning") ("drug discovery" OR "drug development")
```
4. **Search without date filters** initially:
- Get total landscape
- Filter later if too many results
5. **Export results** for systematic analysis:
```bash
python scripts/search_google_scholar.py \
'"machine learning" OR "deep learning" drug discovery' \
--limit 500 \
--output comprehensive_search.json
```
## Extracting Citation Information
### From Google Scholar Results Page
Each result shows:
- **Title**: Paper title (linked to full text if available)
- **Authors**: Author list (often truncated)
- **Source**: Journal/conference, year, publisher
- **Cited by**: Number of citations + link to citing papers
- **Related articles**: Link to similar papers
- **All versions**: Different versions of the same paper
### Export Options
**Manual export**:
1. Click "Cite" under paper
2. Select BibTeX format
3. Copy citation
**Limitations**:
- One paper at a time
- Manual process
- Time-consuming for many papers
**Automated export** (using script):
```bash
# Search and export to BibTeX
python scripts/search_google_scholar.py "quantum computing" \
--limit 50 \
--format bibtex \
--output quantum_papers.bib
```
### Metadata Available
From Google Scholar you can typically extract:
- Title
- Authors (may be incomplete)
- Year
- Source (journal/conference)
- Citation count
- Link to full text (when available)
- Link to PDF (when available)
**Note**: Metadata quality varies:
- Some fields may be missing
- Author names may be incomplete
- Need to verify with DOI lookup for accuracy
## Rate Limiting and Access
### Rate Limits
Google Scholar has rate limiting to prevent automated scraping:
**Symptoms of rate limiting**:
- CAPTCHA challenges
- Temporary IP blocks
- 429 "Too Many Requests" errors
**Best practices**:
1. **Add delays between requests**: 2-5 seconds minimum
2. **Limit query volume**: Don't search hundreds of queries rapidly
3. **Use scholarly library**: Handles rate limiting automatically
4. **Rotate User-Agents**: Appear as different browsers
5. **Consider proxies**: For large-scale searches (use ethically)
**In our scripts**:
```python
# Automatic rate limiting built in
time.sleep(random.uniform(3, 7)) # Random delay 3-7 seconds
```
### Ethical Considerations
**DO**:
- Respect rate limits
- Use reasonable delays
- Cache results (don't re-query)
- Use official APIs when available
- Attribute data properly
**DON'T**:
- Scrape aggressively
- Use multiple IPs to bypass limits
- Violate terms of service
- Burden servers unnecessarily
- Use data commercially without permission
### Institutional Access
**Benefits of institutional access**:
- Access to full-text PDFs through library subscriptions
- Better download capabilities
- Integration with library systems
- Link resolver to full text
**Setup**:
- Google Scholar → Settings → Library links
- Add your institution
- Links appear in search results
## Tips and Best Practices
### Search Optimization
1. **Start simple, then refine**:
```
# Too specific initially
intitle:"deep learning" intitle:review source:Nature 2023..2024
# Better approach
deep learning review
# Review results
# Add intitle:, source:, year filters as needed
```
2. **Use multiple search strategies**:
- Keyword search
- Author search for known experts
- Citation chaining from key papers
- Source search in top journals
3. **Check spelling and variations**:
- Color vs colour
- Optimization vs optimisation
- Tumor vs tumour
- Try common misspellings if few results
4. **Combine operators strategically**:
```
# Good combination
author:Church intitle:"synthetic biology" 2015..2024
# Find reviews by specific author on topic in recent years
```
### Result Evaluation
1. **Check citation counts**:
- High citations indicate influence
- Recent papers may have low citations but be important
- Citation counts vary by field
2. **Verify publication venue**:
- Peer-reviewed journals vs preprints
- Conference proceedings
- Book chapters
- Technical reports
3. **Check for full text access**:
- [PDF] link on right side
- "All X versions" may have open access version
- Check institutional access
- Try author's website or ResearchGate
4. **Look for review articles**:
- Comprehensive overviews
- Good starting point for new topics
- Extensive reference lists
### Managing Results
1. **Use citation manager integration**:
- Export to BibTeX
- Import to Zotero, Mendeley, EndNote
- Maintain organized library
2. **Set up alerts** for ongoing research:
- Google Scholar → Alerts
- Get emails for new papers matching query
- Track specific authors or topics
3. **Create collections**:
- Save papers to Google Scholar Library
- Organize by project or topic
- Add labels and notes
4. **Export systematically**:
```bash
# Save search results for later analysis
python scripts/search_google_scholar.py "your topic" \
--output topic_papers.json
# Can re-process later without re-searching
python scripts/extract_metadata.py \
--input topic_papers.json \
--output topic_refs.bib
```
## Advanced Techniques
### Boolean Logic Combinations
Combine multiple operators for precise searches:
```
# Highly cited reviews on specific topic by known authors
intitle:review "machine learning" ("drug discovery" OR "drug development")
author:Horvath OR author:Bengio 2020..2024
# Method papers excluding reviews
intitle:method "protein folding" -review -survey
# Papers in top journals only
("Nature" OR "Science" OR "Cell") CRISPR 2022..2024
```
### Finding Open Access Papers
```
# Search with generic terms
machine learning
# Filter by "All versions" which often includes preprints
# Look for green [PDF] links (often open access)
# Check arXiv, bioRxiv versions
```
**In script**:
```bash
python scripts/search_google_scholar.py "topic" \
--open-access-only \
--output open_access_papers.json
```
### Tracking Research Impact
**For a specific paper**:
1. Find the paper
2. Click "Cited by X"
3. Analyze citing papers:
- How is it being used?
- What fields cite it?
- Recent vs older citations?
**For an author**:
1. Search `author:LastName`
2. Check h-index and i10-index
3. View citation history graph
4. Identify most influential papers
**For a topic**:
1. Search topic
2. Sort by citations
3. Identify seminal papers (highly cited, older)
4. Check recent highly-cited papers (emerging important work)
### Finding Preprints and Early Work
```
# arXiv papers
source:arxiv "deep learning"
# bioRxiv papers
source:biorxiv CRISPR
# All preprint servers
("arxiv" OR "biorxiv" OR "medrxiv") your topic
```
**Note**: Preprints are not peer-reviewed. Always check if published version exists.
## Common Issues and Solutions
### Too Many Results
**Problem**: Search returns 100,000+ results, overwhelming.
**Solutions**:
1. Add more specific terms
2. Use `intitle:` to search only titles
3. Filter by recent years
4. Add exclusions (e.g., `-review`)
5. Search within specific journals
### Too Few Results
**Problem**: Search returns 0-10 results, suspiciously few.
**Solutions**:
1. Remove restrictive operators
2. Try synonyms and related terms
3. Check spelling
4. Broaden year range
5. Use OR for alternative terms
### Irrelevant Results
**Problem**: Results don't match intent.
**Solutions**:
1. Use exact phrases with quotes
2. Add more specific context terms
3. Use `intitle:` for title-only search
4. Exclude common irrelevant terms
5. Combine multiple specific terms
### CAPTCHA or Rate Limiting
**Problem**: Google Scholar shows CAPTCHA or blocks access.
**Solutions**:
1. Wait several minutes before continuing
2. Reduce query frequency
3. Use longer delays in scripts (5-10 seconds)
4. Switch to different IP/network
5. Consider using institutional access
### Missing Metadata
**Problem**: Author names, year, or venue missing from results.
**Solutions**:
1. Click through to see full details
2. Check "All versions" for better metadata
3. Look up by DOI if available
4. Extract metadata from CrossRef/PubMed instead
5. Manually verify from paper PDF
### Duplicate Results
**Problem**: Same paper appears multiple times.
**Solutions**:
1. Click "All X versions" to see consolidated view
2. Choose version with best metadata
3. Use deduplication in post-processing:
```bash
python scripts/format_bibtex.py results.bib \
--deduplicate \
--output clean_results.bib
```
## Integration with Scripts
### search_google_scholar.py Usage
**Basic search**:
```bash
python scripts/search_google_scholar.py "machine learning drug discovery"
```
**With year filter**:
```bash
python scripts/search_google_scholar.py "CRISPR" \
--year-start 2020 \
--year-end 2024 \
--limit 100
```
**Sort by citations**:
```bash
python scripts/search_google_scholar.py "transformers" \
--sort-by citations \
--limit 50
```
**Export to BibTeX**:
```bash
python scripts/search_google_scholar.py "quantum computing" \
--format bibtex \
--output quantum.bib
```
**Export to JSON for later processing**:
```bash
python scripts/search_google_scholar.py "topic" \
--format json \
--output results.json
# Later: extract full metadata
python scripts/extract_metadata.py \
--input results.json \
--output references.bib
```
### Batch Searching
For multiple topics:
```bash
# Create file with search queries (queries.txt)
# One query per line
# Search each query
while read query; do
python scripts/search_google_scholar.py "$query" \
--limit 50 \
--output "${query// /_}.json"
sleep 10 # Delay between queries
done < queries.txt
```
## Summary
Google Scholar is the most comprehensive academic search engine, providing:
- **Broad coverage**: All disciplines, 100M+ documents
- **Free access**: No account or subscription required
- **Citation tracking**: "Cited by" for impact analysis
- **Multiple formats**: Articles, books, theses, patents
- **Full-text search**: Not just abstracts
Key strategies:
- Use advanced operators for precision
- Combine author, title, source searches
- Track citations for impact
- Export systematically to citation manager
- Respect rate limits and access policies
- Verify metadata with CrossRef/PubMed
For biomedical research, complement with PubMed for MeSH terms and curated metadata.

View File

@@ -0,0 +1,870 @@
# Metadata Extraction Guide
Comprehensive guide to extracting accurate citation metadata from DOIs, PMIDs, arXiv IDs, and URLs using various APIs and services.
## Overview
Accurate metadata is essential for proper citations. This guide covers:
- Identifying paper identifiers (DOI, PMID, arXiv ID)
- Querying metadata APIs (CrossRef, PubMed, arXiv, DataCite)
- Required BibTeX fields by entry type
- Handling edge cases and special situations
- Validating extracted metadata
## Paper Identifiers
### DOI (Digital Object Identifier)
**Format**: `10.XXXX/suffix`
**Examples**:
```
10.1038/s41586-021-03819-2 # Nature article
10.1126/science.aam9317 # Science article
10.1016/j.cell.2023.01.001 # Cell article
10.1371/journal.pone.0123456 # PLOS ONE article
```
**Properties**:
- Permanent identifier
- Most reliable for metadata
- Resolves to current location
- Publisher-assigned
**Where to find**:
- First page of article
- Article webpage
- CrossRef, Google Scholar, PubMed
- Usually prominent on publisher site
### PMID (PubMed ID)
**Format**: 8-digit number (typically)
**Examples**:
```
34265844
28445112
35476778
```
**Properties**:
- Specific to PubMed database
- Biomedical literature only
- Assigned by NCBI
- Permanent identifier
**Where to find**:
- PubMed search results
- Article page on PubMed
- Often in article PDF footer
- PMC (PubMed Central) pages
### PMCID (PubMed Central ID)
**Format**: PMC followed by numbers
**Examples**:
```
PMC8287551
PMC7456789
```
**Properties**:
- Free full-text articles in PMC
- Subset of PubMed articles
- Open access or author manuscripts
### arXiv ID
**Format**: YYMM.NNNNN or archive/YYMMNNN
**Examples**:
```
2103.14030 # New format (since 2007)
2401.12345 # 2024 submission
arXiv:hep-th/9901001 # Old format
```
**Properties**:
- Preprints (not peer-reviewed)
- Physics, math, CS, q-bio, etc.
- Version tracking (v1, v2, etc.)
- Free, open access
**Where to find**:
- arXiv.org
- Often cited before publication
- Paper PDF header
### Other Identifiers
**ISBN** (Books):
```
978-0-12-345678-9
0-123-45678-9
```
**arXiv category**:
```
cs.LG # Computer Science - Machine Learning
q-bio.QM # Quantitative Biology - Quantitative Methods
math.ST # Mathematics - Statistics
```
## Metadata APIs
### CrossRef API
**Primary source for DOIs** - Most comprehensive metadata for journal articles.
**Base URL**: `https://api.crossref.org/works/`
**No API key required**, but polite pool recommended:
- Add email to User-Agent
- Gets better service
- No rate limits
#### Basic DOI Lookup
**Request**:
```
GET https://api.crossref.org/works/10.1038/s41586-021-03819-2
```
**Response** (simplified):
```json
{
"message": {
"DOI": "10.1038/s41586-021-03819-2",
"title": ["Article title here"],
"author": [
{"given": "John", "family": "Smith"},
{"given": "Jane", "family": "Doe"}
],
"container-title": ["Nature"],
"volume": "595",
"issue": "7865",
"page": "123-128",
"published-print": {"date-parts": [[2021, 7, 1]]},
"publisher": "Springer Nature",
"type": "journal-article",
"ISSN": ["0028-0836"]
}
}
```
#### Fields Available
**Always present**:
- `DOI`: Digital Object Identifier
- `title`: Article title (array)
- `type`: Content type (journal-article, book-chapter, etc.)
**Usually present**:
- `author`: Array of author objects
- `container-title`: Journal/book title
- `published-print` or `published-online`: Publication date
- `volume`, `issue`, `page`: Publication details
- `publisher`: Publisher name
**Sometimes present**:
- `abstract`: Article abstract
- `subject`: Subject categories
- `ISSN`: Journal ISSN
- `ISBN`: Book ISBN
- `reference`: Reference list
- `is-referenced-by-count`: Citation count
#### Content Types
CrossRef `type` field values:
- `journal-article`: Journal articles
- `book-chapter`: Book chapters
- `book`: Books
- `proceedings-article`: Conference papers
- `posted-content`: Preprints
- `dataset`: Research datasets
- `report`: Technical reports
- `dissertation`: Theses/dissertations
### PubMed E-utilities API
**Specialized for biomedical literature** - Curated metadata with MeSH terms.
**Base URL**: `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/`
**API key recommended** (free):
- Higher rate limits
- Better performance
#### PMID to Metadata
**Step 1: EFetch for full record**
```
GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?
db=pubmed&
id=34265844&
retmode=xml&
api_key=YOUR_KEY
```
**Response**: XML with comprehensive metadata
**Step 2: Parse XML**
Key fields:
```xml
<PubmedArticle>
<MedlineCitation>
<PMID>34265844</PMID>
<Article>
<ArticleTitle>Title here</ArticleTitle>
<AuthorList>
<Author><LastName>Smith</LastName><ForeName>John</ForeName></Author>
</AuthorList>
<Journal>
<Title>Nature</Title>
<JournalIssue>
<Volume>595</Volume>
<Issue>7865</Issue>
<PubDate><Year>2021</Year></PubDate>
</JournalIssue>
</Journal>
<Pagination><MedlinePgn>123-128</MedlinePgn></Pagination>
<Abstract><AbstractText>Abstract text here</AbstractText></Abstract>
</Article>
</MedlineCitation>
<PubmedData>
<ArticleIdList>
<ArticleId IdType="doi">10.1038/s41586-021-03819-2</ArticleId>
<ArticleId IdType="pmc">PMC8287551</ArticleId>
</ArticleIdList>
</PubmedData>
</PubmedArticle>
```
#### Unique PubMed Fields
**MeSH Terms**: Controlled vocabulary
```xml
<MeshHeadingList>
<MeshHeading>
<DescriptorName UI="D003920">Diabetes Mellitus</DescriptorName>
</MeshHeading>
</MeshHeadingList>
```
**Publication Types**:
```xml
<PublicationTypeList>
<PublicationType UI="D016428">Journal Article</PublicationType>
<PublicationType UI="D016449">Randomized Controlled Trial</PublicationType>
</PublicationTypeList>
```
**Grant Information**:
```xml
<GrantList>
<Grant>
<GrantID>R01-123456</GrantID>
<Agency>NIAID NIH HHS</Agency>
<Country>United States</Country>
</Grant>
</GrantList>
```
### arXiv API
**Preprints in physics, math, CS, q-bio** - Free, open access.
**Base URL**: `https://export.arxiv.org/api/query`
**No API key required**
#### arXiv ID to Metadata
**Request**:
```
GET https://export.arxiv.org/api/query?id_list=2103.14030
```
**Response**: Atom XML
```xml
<entry>
<id>http://arxiv.org/abs/2103.14030v2</id>
<title>Highly accurate protein structure prediction with AlphaFold</title>
<author><name>John Jumper</name></author>
<author><name>Richard Evans</name></author>
<published>2021-03-26T17:47:17Z</published>
<updated>2021-07-01T16:51:46Z</updated>
<summary>Abstract text here...</summary>
<arxiv:doi>10.1038/s41586-021-03819-2</arxiv:doi>
<category term="q-bio.BM" scheme="http://arxiv.org/schemas/atom"/>
<category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
</entry>
```
#### Key Fields
- `id`: arXiv URL
- `title`: Preprint title
- `author`: Author list
- `published`: First version date
- `updated`: Latest version date
- `summary`: Abstract
- `arxiv:doi`: DOI if published
- `arxiv:journal_ref`: Journal reference if published
- `category`: arXiv categories
#### Version Tracking
arXiv tracks versions:
- `v1`: Initial submission
- `v2`, `v3`, etc.: Revisions
**Always check** if preprint has been published in journal (use DOI if available).
### DataCite API
**Research datasets, software, other outputs** - Assigns DOIs to non-traditional scholarly works.
**Base URL**: `https://api.datacite.org/dois/`
**Similar to CrossRef** but for datasets, software, code, etc.
**Request**:
```
GET https://api.datacite.org/dois/10.5281/zenodo.1234567
```
**Response**: JSON with metadata for dataset/software
## Required BibTeX Fields
### @article (Journal Articles)
**Required**:
- `author`: Author names
- `title`: Article title
- `journal`: Journal name
- `year`: Publication year
**Optional but recommended**:
- `volume`: Volume number
- `number`: Issue number
- `pages`: Page range (e.g., 123--145)
- `doi`: Digital Object Identifier
- `url`: URL if no DOI
- `month`: Publication month
**Example**:
```bibtex
@article{Smith2024,
author = {Smith, John and Doe, Jane},
title = {Novel Approach to Protein Folding},
journal = {Nature},
year = {2024},
volume = {625},
number = {8001},
pages = {123--145},
doi = {10.1038/nature12345}
}
```
### @book (Books)
**Required**:
- `author` or `editor`: Author(s) or editor(s)
- `title`: Book title
- `publisher`: Publisher name
- `year`: Publication year
**Optional but recommended**:
- `edition`: Edition number (if not first)
- `address`: Publisher location
- `isbn`: ISBN
- `url`: URL
- `series`: Series name
**Example**:
```bibtex
@book{Kumar2021,
author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
title = {Robbins and Cotran Pathologic Basis of Disease},
publisher = {Elsevier},
year = {2021},
edition = {10},
isbn = {978-0-323-53113-9}
}
```
### @inproceedings (Conference Papers)
**Required**:
- `author`: Author names
- `title`: Paper title
- `booktitle`: Conference/proceedings name
- `year`: Year
**Optional but recommended**:
- `pages`: Page range
- `organization`: Organizing body
- `publisher`: Publisher
- `address`: Conference location
- `month`: Conference month
- `doi`: DOI if available
**Example**:
```bibtex
@inproceedings{Vaswani2017,
author = {Vaswani, Ashish and Shazeer, Noam and others},
title = {Attention is All You Need},
booktitle = {Advances in Neural Information Processing Systems},
year = {2017},
pages = {5998--6008},
volume = {30}
}
```
### @incollection (Book Chapters)
**Required**:
- `author`: Chapter author(s)
- `title`: Chapter title
- `booktitle`: Book title
- `publisher`: Publisher name
- `year`: Publication year
**Optional but recommended**:
- `editor`: Book editor(s)
- `pages`: Chapter page range
- `chapter`: Chapter number
- `edition`: Edition
- `address`: Publisher location
**Example**:
```bibtex
@incollection{Brown2020,
author = {Brown, Peter O. and Botstein, David},
title = {Exploring the New World of the Genome with {DNA} Microarrays},
booktitle = {DNA Microarrays: A Molecular Cloning Manual},
editor = {Eisen, Michael B. and Brown, Patrick O.},
publisher = {Cold Spring Harbor Laboratory Press},
year = {2020},
pages = {1--45}
}
```
### @phdthesis (Dissertations)
**Required**:
- `author`: Author name
- `title`: Thesis title
- `school`: Institution
- `year`: Year
**Optional**:
- `type`: Type (e.g., "PhD dissertation")
- `address`: Institution location
- `month`: Month
- `url`: URL
**Example**:
```bibtex
@phdthesis{Johnson2023,
author = {Johnson, Mary L.},
title = {Novel Approaches to Cancer Immunotherapy},
school = {Stanford University},
year = {2023},
type = {{PhD} dissertation}
}
```
### @misc (Preprints, Software, Datasets)
**Required**:
- `author`: Author(s)
- `title`: Title
- `year`: Year
**For preprints, add**:
- `howpublished`: Repository (e.g., "bioRxiv")
- `doi`: Preprint DOI
- `note`: Preprint ID
**Example (preprint)**:
```bibtex
@misc{Zhang2024,
author = {Zhang, Yi and Chen, Li and Wang, Hui},
title = {Novel Therapeutic Targets in Alzheimer's Disease},
year = {2024},
howpublished = {bioRxiv},
doi = {10.1101/2024.01.001},
note = {Preprint}
}
```
**Example (software)**:
```bibtex
@misc{AlphaFold2021,
author = {DeepMind},
title = {{AlphaFold} Protein Structure Database},
year = {2021},
howpublished = {Software},
url = {https://alphafold.ebi.ac.uk/},
doi = {10.5281/zenodo.5123456}
}
```
## Extraction Workflows
### From DOI
**Best practice** - Most reliable source:
```bash
# Single DOI
python scripts/extract_metadata.py --doi 10.1038/s41586-021-03819-2
# Multiple DOIs
python scripts/extract_metadata.py \
--doi 10.1038/nature12345 \
--doi 10.1126/science.abc1234 \
--output refs.bib
```
**Process**:
1. Query CrossRef API with DOI
2. Parse JSON response
3. Extract required fields
4. Determine entry type (@article, @book, etc.)
5. Format as BibTeX
6. Validate completeness
### From PMID
**For biomedical literature**:
```bash
# Single PMID
python scripts/extract_metadata.py --pmid 34265844
# Multiple PMIDs
python scripts/extract_metadata.py \
--pmid 34265844 \
--pmid 28445112 \
--output refs.bib
```
**Process**:
1. Query PubMed EFetch with PMID
2. Parse XML response
3. Extract metadata including MeSH terms
4. Check for DOI in response
5. If DOI exists, optionally query CrossRef for additional metadata
6. Format as BibTeX
### From arXiv ID
**For preprints**:
```bash
python scripts/extract_metadata.py --arxiv 2103.14030
```
**Process**:
1. Query arXiv API with ID
2. Parse Atom XML response
3. Check for published version (DOI in response)
4. If published: Use DOI and CrossRef
5. If not published: Use preprint metadata
6. Format as @misc with preprint note
**Important**: Always check if preprint has been published!
### From URL
**When you only have URL**:
```bash
python scripts/extract_metadata.py \
--url "https://www.nature.com/articles/s41586-021-03819-2"
```
**Process**:
1. Parse the URL structure
2. Identify the identifier type (DOI, PMID, arXiv)
3. Extract the identifier from the URL
4. Query the appropriate API
5. Format as BibTeX
**URL patterns**:
```
# DOI URLs
https://doi.org/10.1038/nature12345
https://dx.doi.org/10.1126/science.abc123
https://www.nature.com/articles/s41586-021-03819-2
# PubMed URLs
https://pubmed.ncbi.nlm.nih.gov/34265844/
https://www.ncbi.nlm.nih.gov/pubmed/34265844
# arXiv URLs
https://arxiv.org/abs/2103.14030
https://arxiv.org/pdf/2103.14030.pdf
```
### Batch Processing
**From file with mixed identifiers**:
```bash
# Create file with one identifier per line
# identifiers.txt:
# 10.1038/nature12345
# 34265844
# 2103.14030
# https://doi.org/10.1126/science.abc123
python scripts/extract_metadata.py \
--input identifiers.txt \
--output references.bib
```
**Process**:
- Script auto-detects identifier type
- Queries appropriate API
- Combines all into single BibTeX file
- Handles errors gracefully
## Special Cases and Edge Cases
### Preprints Later Published
**Issue**: Preprint cited, but journal version now available.
**Solution**:
1. Check arXiv metadata for DOI field
2. If DOI present, use published version
3. Update citation to journal article
4. Note preprint version in comments if needed
**Example**:
```bibtex
% Originally: arXiv:2103.14030
% Published as:
@article{Jumper2021,
author = {Jumper, John and Evans, Richard and others},
title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
journal = {Nature},
year = {2021},
volume = {596},
pages = {583--589},
doi = {10.1038/s41586-021-03819-2}
}
```
### Multiple Authors (et al.)
**Issue**: Many authors (10+).
**BibTeX practice**:
- Include all authors if <10
- Use "and others" for 10+
- Or list all (journals vary)
**Example**:
```bibtex
@article{LargeCollaboration2024,
author = {First, Author and Second, Author and Third, Author and others},
...
}
```
### Author Name Variations
**Issue**: Authors publish under different name formats.
**Standardization**:
```
# Common variations
John Smith
John A. Smith
John Andrew Smith
J. A. Smith
Smith, J.
Smith, J. A.
# BibTeX format (recommended)
author = {Smith, John A.}
```
**Extraction preference**:
1. Use full name if available
2. Include middle initial if available
3. Format: Last, First Middle
### No DOI Available
**Issue**: Older papers or books without DOIs.
**Solutions**:
1. Use PMID if available (biomedical)
2. Use ISBN for books
3. Use URL to stable source
4. Include full publication details
**Example**:
```bibtex
@article{OldPaper1995,
author = {Author, Name},
title = {Title Here},
journal = {Journal Name},
year = {1995},
volume = {123},
pages = {45--67},
url = {https://stable-url-here},
note = {PMID: 12345678}
}
```
### Conference Papers vs Journal Articles
**Issue**: Same work published in both.
**Best practice**:
- Cite journal version if both available
- Journal version is archival
- Conference version for timeliness
**If citing conference**:
```bibtex
@inproceedings{Smith2024conf,
author = {Smith, John},
title = {Title},
booktitle = {Proceedings of NeurIPS 2024},
year = {2024}
}
```
**If citing journal**:
```bibtex
@article{Smith2024journal,
author = {Smith, John},
title = {Title},
journal = {Journal of Machine Learning Research},
year = {2024}
}
```
### Book Chapters vs Edited Collections
**Extract correctly**:
- Chapter: Use `@incollection`
- Whole book: Use `@book`
- Book editor: List in `editor` field
- Chapter author: List in `author` field
### Datasets and Software
**Use @misc** with appropriate fields:
```bibtex
@misc{DatasetName2024,
author = {Author, Name},
title = {Dataset Title},
year = {2024},
howpublished = {Zenodo},
doi = {10.5281/zenodo.123456},
note = {Version 1.2}
}
```
## Validation After Extraction
Always validate extracted metadata:
```bash
python scripts/validate_citations.py extracted_refs.bib
```
**Check**:
- All required fields present
- DOI resolves correctly
- Author names formatted consistently
- Year is reasonable (4 digits)
- Journal/publisher names correct
- Page ranges use -- not -
- Special characters handled properly
## Best Practices
### 1. Prefer DOI When Available
DOIs provide:
- Permanent identifier
- Best metadata source
- Publisher-verified information
- Resolvable link
### 2. Verify Automatically Extracted Metadata
Spot-check:
- Author names match publication
- Title matches (including capitalization)
- Year is correct
- Journal name is complete
### 3. Handle Special Characters
**LaTeX special characters**:
- Protect capitalization: `{AlphaFold}`
- Handle accents: `M{\"u}ller` or use Unicode
- Chemical formulas: `H$_2$O` or `\ce{H2O}`
### 4. Use Consistent Citation Keys
**Convention**: `FirstAuthorYEARkeyword`
```
Smith2024protein
Doe2023machine
Johnson2024cancer
```
### 5. Include DOI for Modern Papers
All papers published after ~2000 should have DOI:
```bibtex
doi = {10.1038/nature12345}
```
### 6. Document Source
For non-standard sources, add note:
```bibtex
note = {Preprint, not peer-reviewed}
note = {Technical report}
note = {Dataset accompanying [citation]}
```
## Summary
Metadata extraction workflow:
1. **Identify**: Determine identifier type (DOI, PMID, arXiv, URL)
2. **Query**: Use appropriate API (CrossRef, PubMed, arXiv)
3. **Extract**: Parse response for required fields
4. **Format**: Create properly formatted BibTeX entry
5. **Validate**: Check completeness and accuracy
6. **Verify**: Spot-check critical citations
**Use scripts** to automate:
- `extract_metadata.py`: Universal extractor
- `doi_to_bibtex.py`: Quick DOI conversion
- `validate_citations.py`: Verify accuracy
**Always validate** extracted metadata before final submission!

View File

@@ -0,0 +1,839 @@
# PubMed Search Guide
Comprehensive guide to searching PubMed for biomedical and life sciences literature, including MeSH terms, field tags, advanced search strategies, and E-utilities API usage.
## Overview
PubMed is the premier database for biomedical literature:
- **Coverage**: 35+ million citations
- **Scope**: Biomedical and life sciences
- **Sources**: MEDLINE, life science journals, online books
- **Authority**: Maintained by National Library of Medicine (NLM) / NCBI
- **Access**: Free, no account required
- **Updates**: Daily with new citations
- **Curation**: High-quality metadata, MeSH indexing
## Basic Search
### Simple Keyword Search
PubMed automatically maps terms to MeSH and searches multiple fields:
```
diabetes
CRISPR gene editing
Alzheimer's disease treatment
cancer immunotherapy
```
**Automatic Features**:
- Automatic MeSH mapping
- Plural/singular variants
- Abbreviation expansion
- Spell checking
### Exact Phrase Search
Use quotation marks for exact phrases:
```
"CRISPR-Cas9"
"systematic review"
"randomized controlled trial"
"machine learning"
```
## MeSH (Medical Subject Headings)
### What is MeSH?
MeSH is a controlled vocabulary thesaurus for indexing biomedical literature:
- **Hierarchical structure**: Organized in tree structures
- **Consistent indexing**: Same concept always tagged the same way
- **Comprehensive**: Covers diseases, drugs, anatomy, techniques, etc.
- **Professional curation**: NLM indexers assign MeSH terms
### Finding MeSH Terms
**MeSH Browser**: https://meshb.nlm.nih.gov/search
**Example**:
```
Search: "heart attack"
MeSH term: "Myocardial Infarction"
```
**In PubMed**:
1. Search with keyword
2. Check "MeSH Terms" in left sidebar
3. Select relevant MeSH terms
4. Add to search
### Using MeSH in Searches
**Basic MeSH search**:
```
"Diabetes Mellitus"[MeSH]
"CRISPR-Cas Systems"[MeSH]
"Alzheimer Disease"[MeSH]
"Neoplasms"[MeSH]
```
**MeSH with subheadings**:
```
"Diabetes Mellitus/drug therapy"[MeSH]
"Neoplasms/genetics"[MeSH]
"Heart Failure/prevention and control"[MeSH]
```
**Common subheadings**:
- `/drug therapy`: Drug treatment
- `/diagnosis`: Diagnostic aspects
- `/genetics`: Genetic aspects
- `/epidemiology`: Occurrence and distribution
- `/prevention and control`: Prevention methods
- `/etiology`: Causes
- `/surgery`: Surgical treatment
- `/metabolism`: Metabolic aspects
### MeSH Explosion
By default, MeSH searches include narrower terms (explosion):
```
"Neoplasms"[MeSH]
# Includes: Breast Neoplasms, Lung Neoplasms, etc.
```
**Disable explosion** (exact term only):
```
"Neoplasms"[MeSH:NoExp]
```
### MeSH Major Topic
Search only where MeSH term is a major focus:
```
"Diabetes Mellitus"[MeSH Major Topic]
# Only papers where diabetes is main topic
```
## Field Tags
Field tags specify which part of the record to search.
### Common Field Tags
**Title and Abstract**:
```
cancer[Title] # In title only
treatment[Title/Abstract] # In title or abstract
"machine learning"[Title/Abstract]
```
**Author**:
```
"Smith J"[Author]
"Doudna JA"[Author]
"Collins FS"[Author]
```
**Author - Full Name**:
```
"Smith, John"[Full Author Name]
```
**Journal**:
```
"Nature"[Journal]
"Science"[Journal]
"New England Journal of Medicine"[Journal]
"Nat Commun"[Journal] # Abbreviated form
```
**Publication Date**:
```
2023[Publication Date]
2020:2024[Publication Date] # Date range
2023/01/01:2023/12/31[Publication Date]
```
**Date Created**:
```
2023[Date - Create] # When added to PubMed
```
**Publication Type**:
```
"Review"[Publication Type]
"Clinical Trial"[Publication Type]
"Meta-Analysis"[Publication Type]
"Randomized Controlled Trial"[Publication Type]
```
**Language**:
```
English[Language]
French[Language]
```
**DOI**:
```
10.1038/nature12345[DOI]
```
**PMID (PubMed ID)**:
```
12345678[PMID]
```
**Article ID**:
```
PMC1234567[PMC] # PubMed Central ID
```
### Less Common But Useful Tags
```
humans[MeSH Terms] # Only human studies
animals[MeSH Terms] # Only animal studies
"United States"[Place of Publication]
nih[Grant Number] # NIH-funded research
"Female"[Sex] # Female subjects
"Aged, 80 and over"[Age] # Elderly subjects
```
## Boolean Operators
Combine search terms with Boolean logic.
### AND
Both terms must be present (default behavior):
```
diabetes AND treatment
"CRISPR-Cas9" AND "gene editing"
cancer AND immunotherapy AND "clinical trial"[Publication Type]
```
### OR
Either term must be present:
```
"heart attack" OR "myocardial infarction"
diabetes OR "diabetes mellitus"
CRISPR OR Cas9 OR "gene editing"
```
**Use case**: Synonyms and related terms
### NOT
Exclude terms:
```
cancer NOT review
diabetes NOT animal
"machine learning" NOT "deep learning"
```
**Caution**: May exclude relevant papers that mention both terms.
### Combining Operators
Use parentheses for complex logic:
```
(diabetes OR "diabetes mellitus") AND (treatment OR therapy)
("CRISPR" OR "gene editing") AND ("therapeutic" OR "therapy")
AND 2020:2024[Publication Date]
(cancer OR neoplasm) AND (immunotherapy OR "immune checkpoint inhibitor")
AND ("clinical trial"[Publication Type] OR "randomized controlled trial"[Publication Type])
```
## Advanced Search Builder
**Access**: https://pubmed.ncbi.nlm.nih.gov/advanced/
**Features**:
- Visual query builder
- Add multiple query boxes
- Select field tags from dropdowns
- Combine with AND/OR/NOT
- Preview results
- Shows final query string
- Save queries
**Workflow**:
1. Add search terms in separate boxes
2. Select field tags
3. Choose Boolean operators
4. Preview results
5. Refine as needed
6. Copy final query string
7. Use in scripts or save
**Example built query**:
```
#1: "Diabetes Mellitus, Type 2"[MeSH]
#2: "Metformin"[MeSH]
#3: "Clinical Trial"[Publication Type]
#4: 2020:2024[Publication Date]
#5: #1 AND #2 AND #3 AND #4
```
## Filters and Limits
### Article Types
```
"Review"[Publication Type]
"Systematic Review"[Publication Type]
"Meta-Analysis"[Publication Type]
"Clinical Trial"[Publication Type]
"Randomized Controlled Trial"[Publication Type]
"Case Reports"[Publication Type]
"Comparative Study"[Publication Type]
```
### Species
```
humans[MeSH Terms]
mice[MeSH Terms]
rats[MeSH Terms]
```
### Sex
```
"Female"[MeSH Terms]
"Male"[MeSH Terms]
```
### Age Groups
```
"Infant"[MeSH Terms]
"Child"[MeSH Terms]
"Adolescent"[MeSH Terms]
"Adult"[MeSH Terms]
"Aged"[MeSH Terms]
"Aged, 80 and over"[MeSH Terms]
```
### Text Availability
```
free full text[Filter] # Free full-text available
```
### Journal Categories
```
"Journal Article"[Publication Type]
```
## E-utilities API
NCBI provides programmatic access via E-utilities (Entrez Programming Utilities).
### Overview
**Base URL**: `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/`
**Main Tools**:
- **ESearch**: Search and retrieve PMIDs
- **EFetch**: Retrieve full records
- **ESummary**: Retrieve document summaries
- **ELink**: Find related articles
- **EInfo**: Database statistics
**No API key required**, but recommended for:
- Higher rate limits (10/sec vs 3/sec)
- Better performance
- Identify your project
**Get API key**: https://www.ncbi.nlm.nih.gov/account/
### ESearch - Search PubMed
Retrieve PMIDs for a query.
**Endpoint**: `/esearch.fcgi`
**Parameters**:
- `db`: Database (pubmed)
- `term`: Search query
- `retmax`: Maximum results (default 20, max 10000)
- `retstart`: Starting position (for pagination)
- `sort`: Sort order (relevance, pub_date, author)
- `api_key`: Your API key (optional but recommended)
**Example URL**:
```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?
db=pubmed&
term=diabetes+AND+treatment&
retmax=100&
retmode=json&
api_key=YOUR_API_KEY
```
**Response**:
```json
{
"esearchresult": {
"count": "250000",
"retmax": "100",
"idlist": ["12345678", "12345679", ...]
}
}
```
### EFetch - Retrieve Records
Get full metadata for PMIDs.
**Endpoint**: `/efetch.fcgi`
**Parameters**:
- `db`: Database (pubmed)
- `id`: Comma-separated PMIDs
- `retmode`: Format (xml, json, text)
- `rettype`: Type (abstract, medline, full)
- `api_key`: Your API key
**Example URL**:
```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?
db=pubmed&
id=12345678,12345679&
retmode=xml&
api_key=YOUR_API_KEY
```
**Response**: XML with complete metadata including:
- Title
- Authors (with affiliations)
- Abstract
- Journal
- Publication date
- DOI
- PMID, PMCID
- MeSH terms
- Keywords
### ESummary - Get Summaries
Lighter-weight alternative to EFetch.
**Example**:
```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?
db=pubmed&
id=12345678&
retmode=json&
api_key=YOUR_API_KEY
```
**Returns**: Key metadata without full abstract and details.
### ELink - Find Related Articles
Find related articles or links to other databases.
**Example**:
```
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?
dbfrom=pubmed&
db=pubmed&
id=12345678&
linkname=pubmed_pubmed_citedin
```
**Link types**:
- `pubmed_pubmed`: Related articles
- `pubmed_pubmed_citedin`: Papers citing this article
- `pubmed_pmc`: PMC full-text versions
- `pubmed_protein`: Related protein records
### Rate Limiting
**Without API key**:
- 3 requests per second
- Block if exceeded
**With API key**:
- 10 requests per second
- Better for programmatic access
**Best practice**:
```python
import time
time.sleep(0.34) # ~3 requests/second
# or
time.sleep(0.11) # ~10 requests/second with API key
```
### API Key Usage
**Get API key**:
1. Create NCBI account: https://www.ncbi.nlm.nih.gov/account/
2. Settings → API Key Management
3. Create new API key
4. Copy key
**Use in requests**:
```
&api_key=YOUR_API_KEY_HERE
```
**Store securely**:
```bash
# In environment variable
export NCBI_API_KEY="your_key_here"
# In script
import os
api_key = os.getenv('NCBI_API_KEY')
```
## Search Strategies
### Comprehensive Systematic Search
For systematic reviews and meta-analyses:
```
# 1. Identify key concepts
Concept 1: Diabetes
Concept 2: Treatment
Concept 3: Outcomes
# 2. Find MeSH terms and synonyms
Concept 1: "Diabetes Mellitus"[MeSH] OR diabetes OR diabetic
Concept 2: "Drug Therapy"[MeSH] OR treatment OR therapy OR medication
Concept 3: "Treatment Outcome"[MeSH] OR outcome OR efficacy OR effectiveness
# 3. Combine with AND
("Diabetes Mellitus"[MeSH] OR diabetes OR diabetic)
AND ("Drug Therapy"[MeSH] OR treatment OR therapy OR medication)
AND ("Treatment Outcome"[MeSH] OR outcome OR efficacy OR effectiveness)
# 4. Add filters
AND 2015:2024[Publication Date]
AND ("Clinical Trial"[Publication Type] OR "Randomized Controlled Trial"[Publication Type])
AND English[Language]
AND humans[MeSH Terms]
```
### Finding Clinical Trials
```
# Specific disease + clinical trials
"Alzheimer Disease"[MeSH]
AND ("Clinical Trial"[Publication Type]
OR "Randomized Controlled Trial"[Publication Type])
AND 2020:2024[Publication Date]
# Specific drug trials
"Metformin"[MeSH]
AND "Diabetes Mellitus, Type 2"[MeSH]
AND "Randomized Controlled Trial"[Publication Type]
```
### Finding Reviews
```
# Systematic reviews on topic
"CRISPR-Cas Systems"[MeSH]
AND ("Systematic Review"[Publication Type] OR "Meta-Analysis"[Publication Type])
# Reviews in high-impact journals
cancer immunotherapy
AND "Review"[Publication Type]
AND ("Nature"[Journal] OR "Science"[Journal] OR "Cell"[Journal])
```
### Finding Recent Papers
```
# Papers from last year
"machine learning"[Title/Abstract]
AND "drug discovery"[Title/Abstract]
AND 2024[Publication Date]
# Recent papers in specific journal
"CRISPR"[Title/Abstract]
AND "Nature"[Journal]
AND 2023:2024[Publication Date]
```
### Author Tracking
```
# Specific author's recent work
"Doudna JA"[Author] AND 2020:2024[Publication Date]
# Author + topic
"Church GM"[Author] AND "synthetic biology"[Title/Abstract]
```
### High-Quality Evidence
```
# Meta-analyses and systematic reviews
(diabetes OR "diabetes mellitus")
AND (treatment OR therapy)
AND ("Meta-Analysis"[Publication Type] OR "Systematic Review"[Publication Type])
# RCTs only
cancer immunotherapy
AND "Randomized Controlled Trial"[Publication Type]
AND 2020:2024[Publication Date]
```
## Script Integration
### search_pubmed.py Usage
**Basic search**:
```bash
python scripts/search_pubmed.py "diabetes treatment"
```
**With MeSH terms**:
```bash
python scripts/search_pubmed.py \
--query '"Diabetes Mellitus"[MeSH] AND "Drug Therapy"[MeSH]'
```
**Date range filter**:
```bash
python scripts/search_pubmed.py "CRISPR" \
--date-start 2020-01-01 \
--date-end 2024-12-31 \
--limit 200
```
**Publication type filter**:
```bash
python scripts/search_pubmed.py "cancer immunotherapy" \
--publication-types "Clinical Trial,Randomized Controlled Trial" \
--limit 100
```
**Export to BibTeX**:
```bash
python scripts/search_pubmed.py "Alzheimer's disease" \
--limit 100 \
--format bibtex \
--output alzheimers.bib
```
**Complex query from file**:
```bash
# Save complex query in query.txt
cat > query.txt << 'EOF'
("Diabetes Mellitus, Type 2"[MeSH] OR "diabetes"[Title/Abstract])
AND ("Metformin"[MeSH] OR "metformin"[Title/Abstract])
AND "Randomized Controlled Trial"[Publication Type]
AND 2015:2024[Publication Date]
AND English[Language]
EOF
# Run search
python scripts/search_pubmed.py --query-file query.txt --limit 500
```
### Batch Searches
```bash
# Search multiple topics
TOPICS=("diabetes treatment" "cancer immunotherapy" "CRISPR gene editing")
for topic in "${TOPICS[@]}"; do
python scripts/search_pubmed.py "$topic" \
--limit 100 \
--output "${topic// /_}.json"
sleep 1
done
```
### Extract Metadata
```bash
# Search returns PMIDs
python scripts/search_pubmed.py "topic" --output results.json
# Extract full metadata
python scripts/extract_metadata.py \
--input results.json \
--output references.bib
```
## Tips and Best Practices
### Search Construction
1. **Start with MeSH terms**:
- Use MeSH Browser to find correct terms
- More precise than keyword search
- Captures all papers on topic regardless of terminology
2. **Include text word variants**:
```
# Better coverage
("Diabetes Mellitus"[MeSH] OR diabetes OR diabetic)
```
3. **Use field tags appropriately**:
- `[MeSH]` for standardized concepts
- `[Title/Abstract]` for specific terms
- `[Author]` for known authors
- `[Journal]` for specific venues
4. **Build incrementally**:
```
# Step 1: Basic search
diabetes
# Step 2: Add specificity
"Diabetes Mellitus, Type 2"[MeSH]
# Step 3: Add treatment
"Diabetes Mellitus, Type 2"[MeSH] AND "Metformin"[MeSH]
# Step 4: Add study type
"Diabetes Mellitus, Type 2"[MeSH] AND "Metformin"[MeSH]
AND "Clinical Trial"[Publication Type]
# Step 5: Add date range
... AND 2020:2024[Publication Date]
```
### Optimizing Results
1. **Too many results**: Add filters
- Restrict publication type
- Narrow date range
- Add more specific MeSH terms
- Use Major Topic: `[MeSH Major Topic]`
2. **Too few results**: Broaden search
- Remove restrictive filters
- Use OR for synonyms
- Expand date range
- Use MeSH explosion (default)
3. **Irrelevant results**: Refine terms
- Use more specific MeSH terms
- Add exclusions with NOT
- Use Title field instead of all fields
- Add MeSH subheadings
### Quality Control
1. **Document search strategy**:
- Save exact query string
- Record search date
- Note number of results
- Save filters used
2. **Export systematically**:
- Use consistent file naming
- Export to JSON for flexibility
- Convert to BibTeX as needed
- Keep original search results
3. **Validate retrieved citations**:
```bash
python scripts/validate_citations.py pubmed_results.bib
```
### Staying Current
1. **Set up search alerts**:
- PubMed → Save search
- Receive email updates
- Daily, weekly, or monthly
2. **Track specific journals**:
```
"Nature"[Journal] AND CRISPR[Title]
```
3. **Follow key authors**:
```
"Church GM"[Author]
```
## Common Issues and Solutions
### Issue: MeSH Term Not Found
**Solution**:
- Check spelling
- Use MeSH Browser
- Try related terms
- Use text word search as fallback
### Issue: Zero Results
**Solution**:
- Remove filters
- Check query syntax
- Use OR for broader search
- Try synonyms
### Issue: Poor Quality Results
**Solution**:
- Add publication type filters
- Restrict to recent years
- Use MeSH Major Topic
- Filter by journal quality
### Issue: Duplicates from Different Sources
**Solution**:
```bash
python scripts/format_bibtex.py results.bib \
--deduplicate \
--output clean.bib
```
### Issue: API Rate Limiting
**Solution**:
- Get API key (increases limit to 10/sec)
- Add delays in scripts
- Process in batches
- Use off-peak hours
## Summary
PubMed provides authoritative biomedical literature search:
- **Curated content**: MeSH indexing, quality control
- **Precise search**: Field tags, MeSH terms, filters
- **Programmatic access**: E-utilities API
- **Free access**: No subscription required
- **Comprehensive**: 35M+ citations, daily updates
Key strategies:
- Use MeSH terms for precise searching
- Combine with text words for comprehensive coverage
- Apply appropriate field tags
- Filter by publication type and date
- Use E-utilities API for automation
- Document search strategy for reproducibility
For broader coverage across disciplines, complement with Google Scholar.

View File

@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
DOI to BibTeX Converter
Quick utility to convert DOIs to BibTeX format using CrossRef API.
"""
import sys
import requests
import argparse
import time
import json
from typing import Optional, List
class DOIConverter:
    """Convert DOIs to BibTeX entries via DOI.org content negotiation.

    Each DOI is resolved through https://doi.org with an
    ``Accept: application/x-bibtex`` header, which returns a ready-made
    BibTeX entry for the referenced work (served by CrossRef/DataCite).
    """

    # Resolver/scheme prefixes commonly pasted in front of a bare DOI.
    _DOI_PREFIXES = (
        'https://doi.org/',
        'http://doi.org/',
        'https://dx.doi.org/',
        'http://dx.doi.org/',
        'doi:',
    )

    def __init__(self):
        # A shared session reuses HTTP connections across multiple requests.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'DOIConverter/1.0 (Citation Management Tool; mailto:support@example.com)'
        })

    @classmethod
    def _clean_doi(cls, doi: str) -> str:
        """Return the bare DOI: strip whitespace and any resolver prefix.

        Handles doi.org and dx.doi.org URLs (http or https) as well as a
        leading 'doi:' label, case-insensitively. The DOI itself is
        returned with its original casing.
        """
        doi = doi.strip()
        lowered = doi.lower()
        for prefix in cls._DOI_PREFIXES:
            if lowered.startswith(prefix):
                doi = doi[len(prefix):]
                break
        return doi

    def doi_to_bibtex(self, doi: str) -> Optional[str]:
        """
        Convert a single DOI to BibTeX format.

        Args:
            doi: Digital Object Identifier (bare, 'doi:'-prefixed, or as a
                doi.org / dx.doi.org URL)

        Returns:
            BibTeX string, or None if the DOI cannot be resolved (errors
            are reported on stderr)
        """
        doi = self._clean_doi(doi)
        # Request BibTeX from DOI.org content negotiation
        url = f'https://doi.org/{doi}'
        headers = {
            'Accept': 'application/x-bibtex',
            'User-Agent': 'DOIConverter/1.0 (Citation Management Tool)'
        }
        try:
            response = self.session.get(url, headers=headers, timeout=15)
            if response.status_code == 200:
                bibtex = response.text.strip()
                # CrossRef sometimes returns entries with @data type, convert to @misc
                if bibtex.startswith('@data{'):
                    bibtex = bibtex.replace('@data{', '@misc{', 1)
                return bibtex
            elif response.status_code == 404:
                print(f'Error: DOI not found: {doi}', file=sys.stderr)
                return None
            else:
                print(f'Error: Failed to retrieve BibTeX for {doi} (status {response.status_code})', file=sys.stderr)
                return None
        except requests.exceptions.Timeout:
            print(f'Error: Request timeout for DOI: {doi}', file=sys.stderr)
            return None
        except requests.exceptions.RequestException as e:
            print(f'Error: Request failed for {doi}: {e}', file=sys.stderr)
            return None

    def convert_multiple(self, dois: List[str], delay: float = 0.5) -> List[str]:
        """
        Convert multiple DOIs to BibTeX.

        Args:
            dois: List of DOIs
            delay: Delay between requests (seconds) for rate limiting

        Returns:
            List of BibTeX entries (failed conversions are skipped)
        """
        bibtex_entries = []
        for i, doi in enumerate(dois):
            print(f'Converting DOI {i+1}/{len(dois)}: {doi}', file=sys.stderr)
            bibtex = self.doi_to_bibtex(doi)
            if bibtex:
                bibtex_entries.append(bibtex)
            # Rate limiting
            if i < len(dois) - 1:  # Don't delay after last request
                time.sleep(delay)
        return bibtex_entries
def main():
    """Command-line interface.

    Collects DOIs from positional arguments and/or an input file (one DOI
    per line), converts each to BibTeX via DOI.org content negotiation,
    and writes the result to stdout or a file. Exits with status 1 when
    no identifiers are supplied, the input file cannot be read, or no
    conversion succeeds.
    """
    parser = argparse.ArgumentParser(
        description='Convert DOIs to BibTeX format using CrossRef API',
        epilog='Example: python doi_to_bibtex.py 10.1038/s41586-021-03819-2'
    )
    parser.add_argument(
        'dois',
        nargs='*',
        help='DOI(s) to convert (can provide multiple)'
    )
    parser.add_argument(
        '-i', '--input',
        help='Input file with DOIs (one per line)'
    )
    parser.add_argument(
        '-o', '--output',
        help='Output file for BibTeX (default: stdout)'
    )
    parser.add_argument(
        '--delay',
        type=float,
        default=0.5,
        help='Delay between requests in seconds (default: 0.5)'
    )
    parser.add_argument(
        '--format',
        choices=['bibtex', 'json'],
        default='bibtex',
        help='Output format (default: bibtex)'
    )
    args = parser.parse_args()
    # Collect DOIs from command line and/or file (both sources may be combined)
    dois = []
    if args.dois:
        dois.extend(args.dois)
    if args.input:
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                # Skip blank lines; each non-blank line is one DOI
                file_dois = [line.strip() for line in f if line.strip()]
            dois.extend(file_dois)
        except FileNotFoundError:
            print(f'Error: Input file not found: {args.input}', file=sys.stderr)
            sys.exit(1)
        except Exception as e:
            print(f'Error reading input file: {e}', file=sys.stderr)
            sys.exit(1)
    if not dois:
        parser.print_help()
        sys.exit(1)
    # Convert DOIs: single DOI bypasses the rate-limit delay entirely
    converter = DOIConverter()
    if len(dois) == 1:
        bibtex = converter.doi_to_bibtex(dois[0])
        if bibtex:
            bibtex_entries = [bibtex]
        else:
            sys.exit(1)
    else:
        bibtex_entries = converter.convert_multiple(dois, delay=args.delay)
        if not bibtex_entries:
            print('Error: No successful conversions', file=sys.stderr)
            sys.exit(1)
    # Format output: plain concatenated BibTeX, or a JSON wrapper with a count
    if args.format == 'bibtex':
        output = '\n\n'.join(bibtex_entries) + '\n'
    else:  # json
        output = json.dumps({
            'count': len(bibtex_entries),
            'entries': bibtex_entries
        }, indent=2)
    # Write output (file when -o given, stdout otherwise)
    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(output)
            print(f'Successfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
        except Exception as e:
            print(f'Error writing output file: {e}', file=sys.stderr)
            sys.exit(1)
    else:
        print(output)
    # Summary on stderr (only meaningful for batch conversions)
    if len(dois) > 1:
        success_rate = len(bibtex_entries) / len(dois) * 100
        print(f'\nConverted {len(bibtex_entries)}/{len(dois)} DOIs ({success_rate:.1f}%)', file=sys.stderr)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,569 @@
#!/usr/bin/env python3
"""
Metadata Extraction Tool
Extract citation metadata from DOI, PMID, arXiv ID, or URL using various APIs.
"""
import sys
import os
import requests
import argparse
import time
import re
import json
import xml.etree.ElementTree as ET
from typing import Optional, Dict, List, Tuple
from urllib.parse import urlparse
class MetadataExtractor:
    """Extract metadata from various sources and generate BibTeX.

    Supported identifiers: DOI (via CrossRef), PMID (via PubMed
    E-utilities), and new-style arXiv IDs (via the arXiv Atom API).
    ``extract()`` is the main entry point: it classifies the identifier,
    fetches metadata, and renders a BibTeX entry.
    """
    def __init__(self, email: Optional[str] = None):
        """
        Initialize extractor.
        Args:
            email: Email for Entrez API (recommended for PubMed)
        """
        # One shared HTTP session so connections are reused across calls.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'MetadataExtractor/1.0 (Citation Management Tool)'
        })
        # Fall back to the NCBI_EMAIL environment variable when none given.
        self.email = email or os.getenv('NCBI_EMAIL', '')
    def identify_type(self, identifier: str) -> Tuple[str, str]:
        """
        Identify the type of identifier.
        Args:
            identifier: DOI, PMID, arXiv ID, or URL
        Returns:
            Tuple of (type, cleaned_identifier) where type is one of
            'doi', 'pmid', 'arxiv', 'pmcid', 'url', or 'unknown'
        """
        identifier = identifier.strip()
        # Check if URL
        if identifier.startswith('http://') or identifier.startswith('https://'):
            return self._parse_url(identifier)
        # Check for DOI
        if identifier.startswith('10.'):
            return ('doi', identifier)
        # Check for arXiv ID
        # Matches new-style IDs like 2301.01234 or 2301.01234v2; old-style
        # IDs such as hep-th/9901001 are NOT matched and end up 'unknown'.
        if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', identifier):
            return ('arxiv', identifier)
        if identifier.startswith('arXiv:'):
            return ('arxiv', identifier.replace('arXiv:', ''))
        # Check for PMID (8-digit number typically)
        if identifier.isdigit() and len(identifier) >= 7:
            return ('pmid', identifier)
        # Check for PMCID
        if identifier.upper().startswith('PMC') and identifier[3:].isdigit():
            return ('pmcid', identifier.upper())
        return ('unknown', identifier)
    def _parse_url(self, url: str) -> Tuple[str, str]:
        """Parse URL to extract identifier type and value.

        Recognizes doi.org, PubMed, and arXiv URLs; for any other URL it
        falls back to scraping a DOI-shaped substring from the URL text,
        and finally returns ('url', url) unchanged.
        """
        parsed = urlparse(url)
        # DOI URLs
        if 'doi.org' in parsed.netloc:
            doi = parsed.path.lstrip('/')
            return ('doi', doi)
        # PubMed URLs
        if 'pubmed.ncbi.nlm.nih.gov' in parsed.netloc or 'ncbi.nlm.nih.gov/pubmed' in url:
            pmid = re.search(r'/(\d+)', parsed.path)
            if pmid:
                return ('pmid', pmid.group(1))
        # arXiv URLs
        if 'arxiv.org' in parsed.netloc:
            arxiv_id = re.search(r'/abs/(\d{4}\.\d{4,5})', parsed.path)
            if arxiv_id:
                return ('arxiv', arxiv_id.group(1))
        # Nature, Science, Cell, etc. - try to extract DOI from URL
        doi_match = re.search(r'10\.\d{4,}/[^\s/]+', url)
        if doi_match:
            return ('doi', doi_match.group())
        return ('url', url)
    def extract_from_doi(self, doi: str) -> Optional[Dict]:
        """
        Extract metadata from DOI using CrossRef API.
        Args:
            doi: Digital Object Identifier
        Returns:
            Metadata dictionary or None (errors reported on stderr)
        """
        url = f'https://api.crossref.org/works/{doi}'
        try:
            response = self.session.get(url, timeout=15)
            if response.status_code == 200:
                data = response.json()
                message = data.get('message', {})
                # CrossRef wraps title/container-title in single-element lists.
                # NOTE(review): a present-but-empty 'title' list would raise
                # IndexError here and be swallowed by the broad except below.
                metadata = {
                    'type': 'doi',
                    'entry_type': self._crossref_type_to_bibtex(message.get('type')),
                    'doi': doi,
                    'title': message.get('title', [''])[0],
                    'authors': self._format_authors_crossref(message.get('author', [])),
                    'year': self._extract_year_crossref(message),
                    'journal': message.get('container-title', [''])[0] if message.get('container-title') else '',
                    'volume': str(message.get('volume', '')) if message.get('volume') else '',
                    'issue': str(message.get('issue', '')) if message.get('issue') else '',
                    'pages': message.get('page', ''),
                    'publisher': message.get('publisher', ''),
                    'url': f'https://doi.org/{doi}'
                }
                return metadata
            else:
                print(f'Error: CrossRef API returned status {response.status_code} for DOI: {doi}', file=sys.stderr)
                return None
        except Exception as e:
            print(f'Error extracting metadata from DOI {doi}: {e}', file=sys.stderr)
            return None
    def extract_from_pmid(self, pmid: str) -> Optional[Dict]:
        """
        Extract metadata from PMID using PubMed E-utilities.
        Args:
            pmid: PubMed ID
        Returns:
            Metadata dictionary or None (errors reported on stderr)
        """
        url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        params = {
            'db': 'pubmed',
            'id': pmid,
            'retmode': 'xml',
            'rettype': 'abstract'
        }
        # Email and API key are optional but raise NCBI's rate limits.
        if self.email:
            params['email'] = self.email
        api_key = os.getenv('NCBI_API_KEY')
        if api_key:
            params['api_key'] = api_key
        try:
            response = self.session.get(url, params=params, timeout=15)
            if response.status_code == 200:
                root = ET.fromstring(response.content)
                article = root.find('.//PubmedArticle')
                if article is None:
                    print(f'Error: No article found for PMID: {pmid}', file=sys.stderr)
                    return None
                # Extract metadata from XML
                medline_citation = article.find('.//MedlineCitation')
                article_elem = medline_citation.find('.//Article')
                journal = article_elem.find('.//Journal')
                # Get DOI if available (listed among the ArticleId elements)
                doi = None
                article_ids = article.findall('.//ArticleId')
                for article_id in article_ids:
                    if article_id.get('IdType') == 'doi':
                        doi = article_id.text
                        break
                metadata = {
                    'type': 'pmid',
                    'entry_type': 'article',
                    'pmid': pmid,
                    'title': article_elem.findtext('.//ArticleTitle', ''),
                    'authors': self._format_authors_pubmed(article_elem.findall('.//Author')),
                    'year': self._extract_year_pubmed(article_elem),
                    'journal': journal.findtext('.//Title', ''),
                    'volume': journal.findtext('.//JournalIssue/Volume', ''),
                    'issue': journal.findtext('.//JournalIssue/Issue', ''),
                    'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
                    'doi': doi
                }
                return metadata
            else:
                print(f'Error: PubMed API returned status {response.status_code} for PMID: {pmid}', file=sys.stderr)
                return None
        except Exception as e:
            print(f'Error extracting metadata from PMID {pmid}: {e}', file=sys.stderr)
            return None
    def extract_from_arxiv(self, arxiv_id: str) -> Optional[Dict]:
        """
        Extract metadata from arXiv ID using arXiv API.
        Args:
            arxiv_id: arXiv identifier
        Returns:
            Metadata dictionary or None (errors reported on stderr)
        """
        url = 'http://export.arxiv.org/api/query'
        params = {
            'id_list': arxiv_id,
            'max_results': 1
        }
        try:
            response = self.session.get(url, params=params, timeout=15)
            if response.status_code == 200:
                # Parse Atom XML
                root = ET.fromstring(response.content)
                ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}
                entry = root.find('atom:entry', ns)
                if entry is None:
                    print(f'Error: No entry found for arXiv ID: {arxiv_id}', file=sys.stderr)
                    return None
                # Extract DOI if published
                doi_elem = entry.find('arxiv:doi', ns)
                doi = doi_elem.text if doi_elem is not None else None
                # Extract journal reference if published
                journal_ref_elem = entry.find('arxiv:journal_ref', ns)
                journal_ref = journal_ref_elem.text if journal_ref_elem is not None else None
                # Get publication date (Atom timestamp; first 4 chars = year)
                published = entry.findtext('atom:published', '', ns)
                year = published[:4] if published else ''
                # Get authors
                authors = []
                for author in entry.findall('atom:author', ns):
                    name = author.findtext('atom:name', '', ns)
                    if name:
                        authors.append(name)
                # A DOI implies the preprint was published -> @article;
                # otherwise it is rendered as @misc.
                metadata = {
                    'type': 'arxiv',
                    'entry_type': 'misc' if not doi else 'article',
                    'arxiv_id': arxiv_id,
                    'title': entry.findtext('atom:title', '', ns).strip().replace('\n', ' '),
                    'authors': ' and '.join(authors),
                    'year': year,
                    'doi': doi,
                    'journal_ref': journal_ref,
                    'abstract': entry.findtext('atom:summary', '', ns).strip().replace('\n', ' '),
                    'url': f'https://arxiv.org/abs/{arxiv_id}'
                }
                return metadata
            else:
                print(f'Error: arXiv API returned status {response.status_code} for ID: {arxiv_id}', file=sys.stderr)
                return None
        except Exception as e:
            print(f'Error extracting metadata from arXiv {arxiv_id}: {e}', file=sys.stderr)
            return None
    def metadata_to_bibtex(self, metadata: Dict, citation_key: Optional[str] = None) -> str:
        """
        Convert metadata dictionary to BibTeX format.
        Args:
            metadata: Metadata dictionary
            citation_key: Optional custom citation key (auto-generated
                from author/year/title when omitted)
        Returns:
            BibTeX string
        """
        if not citation_key:
            citation_key = self._generate_citation_key(metadata)
        entry_type = metadata.get('entry_type', 'misc')
        # Build BibTeX entry
        lines = [f'@{entry_type}{{{citation_key},']
        # Add fields
        if metadata.get('authors'):
            lines.append(f' author = {{{metadata["authors"]}}},')
        if metadata.get('title'):
            # Protect capitalization
            title = self._protect_title(metadata['title'])
            lines.append(f' title = {{{title}}},')
        if entry_type == 'article' and metadata.get('journal'):
            lines.append(f' journal = {{{metadata["journal"]}}},')
        elif entry_type == 'misc' and metadata.get('type') == 'arxiv':
            lines.append(f' howpublished = {{arXiv}},')
        if metadata.get('year'):
            lines.append(f' year = {{{metadata["year"]}}},')
        if metadata.get('volume'):
            lines.append(f' volume = {{{metadata["volume"]}}},')
        if metadata.get('issue'):
            lines.append(f' number = {{{metadata["issue"]}}},')
        if metadata.get('pages'):
            # NOTE(review): replaces every '-', so pages already containing
            # '--' would become '----' — confirm inputs are single-hyphen.
            pages = metadata['pages'].replace('-', '--')
            lines.append(f' pages = {{{pages}}},')
        if metadata.get('doi'):
            lines.append(f' doi = {{{metadata["doi"]}}},')
        elif metadata.get('url'):
            lines.append(f' url = {{{metadata["url"]}}},')
        if metadata.get('pmid'):
            lines.append(f' note = {{PMID: {metadata["pmid"]}}},')
        if metadata.get('type') == 'arxiv' and not metadata.get('doi'):
            lines.append(f' note = {{Preprint}},')
        # Remove trailing comma from last field
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]
        lines.append('}')
        return '\n'.join(lines)
    def _crossref_type_to_bibtex(self, crossref_type: str) -> str:
        """Map CrossRef type to BibTeX entry type (unknown types -> misc)."""
        type_map = {
            'journal-article': 'article',
            'book': 'book',
            'book-chapter': 'incollection',
            'proceedings-article': 'inproceedings',
            'posted-content': 'misc',
            'dataset': 'misc',
            'report': 'techreport'
        }
        return type_map.get(crossref_type, 'misc')
    def _format_authors_crossref(self, authors: List[Dict]) -> str:
        """Format author list from CrossRef data as 'Family, Given and ...'."""
        if not authors:
            return ''
        formatted = []
        for author in authors:
            given = author.get('given', '')
            family = author.get('family', '')
            # Authors without a family name are dropped entirely.
            if family:
                if given:
                    formatted.append(f'{family}, {given}')
                else:
                    formatted.append(family)
        return ' and '.join(formatted)
    def _format_authors_pubmed(self, authors: List) -> str:
        """Format author list from PubMed XML as 'Last, Fore and ...'."""
        formatted = []
        for author in authors:
            last_name = author.findtext('.//LastName', '')
            fore_name = author.findtext('.//ForeName', '')
            # Collective/group authors without LastName are dropped.
            if last_name:
                if fore_name:
                    formatted.append(f'{last_name}, {fore_name}')
                else:
                    formatted.append(last_name)
        return ' and '.join(formatted)
    def _extract_year_crossref(self, message: Dict) -> str:
        """Extract year from CrossRef message ('' when no date present)."""
        # Try published-print first, then published-online
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])
        if date_parts and date_parts[0]:
            return str(date_parts[0][0])
        return ''
    def _extract_year_pubmed(self, article: ET.Element) -> str:
        """Extract year from PubMed XML, falling back to MedlineDate text."""
        year = article.findtext('.//Journal/JournalIssue/PubDate/Year', '')
        if not year:
            # MedlineDate is free text like '2020 Jan-Feb'; take first 4 digits.
            medline_date = article.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
            if medline_date:
                year_match = re.search(r'\d{4}', medline_date)
                if year_match:
                    year = year_match.group()
        return year
    def _generate_citation_key(self, metadata: Dict) -> str:
        """Generate a citation key: <FirstAuthorLastName><year><titleword>."""
        # Get first author last name
        authors = metadata.get('authors', '')
        if authors:
            first_author = authors.split(' and ')[0]
            if ',' in first_author:
                last_name = first_author.split(',')[0].strip()
            else:
                last_name = first_author.split()[-1] if first_author else 'Unknown'
        else:
            last_name = 'Unknown'
        # Get year
        year = metadata.get('year', '').strip()
        if not year:
            year = 'XXXX'
        # Clean last name (remove special characters)
        last_name = re.sub(r'[^a-zA-Z]', '', last_name)
        # Get keyword from title (first word with >= 4 letters)
        title = metadata.get('title', '')
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
        keyword = words[0].lower() if words else 'paper'
        return f'{last_name}{year}{keyword}'
    def _protect_title(self, title: str) -> str:
        """Protect capitalization in title for BibTeX by bracing known words."""
        # Protect common acronyms and proper nouns
        protected_words = [
            'DNA', 'RNA', 'CRISPR', 'COVID', 'HIV', 'AIDS', 'AlphaFold',
            'Python', 'AI', 'ML', 'GPU', 'CPU', 'USA', 'UK', 'EU'
        ]
        # NOTE(review): IGNORECASE means a lowercase match (e.g. 'usa') is
        # replaced by the canonical uppercase form '{USA}', altering the
        # original title text — confirm this is intended.
        for word in protected_words:
            title = re.sub(rf'\b{word}\b', f'{{{word}}}', title, flags=re.IGNORECASE)
        return title
    def extract(self, identifier: str) -> Optional[str]:
        """
        Extract metadata and return BibTeX.
        Args:
            identifier: DOI, PMID, arXiv ID, or URL
        Returns:
            BibTeX string or None
        """
        id_type, clean_id = self.identify_type(identifier)
        print(f'Identified as {id_type}: {clean_id}', file=sys.stderr)
        metadata = None
        if id_type == 'doi':
            metadata = self.extract_from_doi(clean_id)
        elif id_type == 'pmid':
            metadata = self.extract_from_pmid(clean_id)
        elif id_type == 'arxiv':
            metadata = self.extract_from_arxiv(clean_id)
        else:
            # NOTE(review): identify_type may also return 'pmcid' or 'url';
            # both fall through to this error branch and are never fetched.
            print(f'Error: Unknown identifier type: {identifier}', file=sys.stderr)
            return None
        if metadata:
            return self.metadata_to_bibtex(metadata)
        else:
            return None
def main():
    """Command-line interface.

    Collects identifiers from the --doi/--pmid/--arxiv/--url options
    and/or an input file (one identifier per line), extracts metadata for
    each, and writes BibTeX (or a JSON wrapper) to stdout or a file.
    Exits with status 1 when no identifiers are supplied, the input file
    cannot be read, or no extraction succeeds.
    """
    parser = argparse.ArgumentParser(
        description='Extract citation metadata from DOI, PMID, arXiv ID, or URL',
        epilog='Example: python extract_metadata.py --doi 10.1038/s41586-021-03819-2'
    )
    parser.add_argument('--doi', help='Digital Object Identifier')
    parser.add_argument('--pmid', help='PubMed ID')
    parser.add_argument('--arxiv', help='arXiv ID')
    parser.add_argument('--url', help='URL to article')
    parser.add_argument('-i', '--input', help='Input file with identifiers (one per line)')
    parser.add_argument('-o', '--output', help='Output file for BibTeX (default: stdout)')
    parser.add_argument('--format', choices=['bibtex', 'json'], default='bibtex', help='Output format')
    parser.add_argument('--email', help='Email for NCBI E-utilities (recommended)')
    args = parser.parse_args()
    # Collect identifiers (options and file contents may be combined)
    identifiers = []
    if args.doi:
        identifiers.append(args.doi)
    if args.pmid:
        identifiers.append(args.pmid)
    if args.arxiv:
        identifiers.append(args.arxiv)
    if args.url:
        identifiers.append(args.url)
    if args.input:
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                # Skip blank lines; each non-blank line is one identifier
                file_ids = [line.strip() for line in f if line.strip()]
            identifiers.extend(file_ids)
        except Exception as e:
            print(f'Error reading input file: {e}', file=sys.stderr)
            sys.exit(1)
    if not identifiers:
        parser.print_help()
        sys.exit(1)
    # Extract metadata for each identifier; failures are skipped
    extractor = MetadataExtractor(email=args.email)
    bibtex_entries = []
    for i, identifier in enumerate(identifiers):
        print(f'\nProcessing {i+1}/{len(identifiers)}...', file=sys.stderr)
        bibtex = extractor.extract(identifier)
        if bibtex:
            bibtex_entries.append(bibtex)
        # Rate limiting
        if i < len(identifiers) - 1:
            time.sleep(0.5)
    if not bibtex_entries:
        print('Error: No successful extractions', file=sys.stderr)
        sys.exit(1)
    # Format output: concatenated BibTeX, or a JSON wrapper with a count
    if args.format == 'bibtex':
        output = '\n\n'.join(bibtex_entries) + '\n'
    else:  # json
        output = json.dumps({
            'count': len(bibtex_entries),
            'entries': bibtex_entries
        }, indent=2)
    # Write output (file when -o given, stdout otherwise)
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'\nSuccessfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
    else:
        print(output)
    print(f'\nExtracted {len(bibtex_entries)}/{len(identifiers)} entries', file=sys.stderr)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,349 @@
#!/usr/bin/env python3
"""
BibTeX Formatter and Cleaner
Format, clean, sort, and deduplicate BibTeX files.
"""
import sys
import re
import argparse
from typing import List, Dict, Tuple
from collections import OrderedDict
class BibTeXFormatter:
"""Format and clean BibTeX entries."""
def __init__(self):
# Standard field order for readability
self.field_order = [
'author', 'editor', 'title', 'booktitle', 'journal',
'year', 'month', 'volume', 'number', 'pages',
'publisher', 'address', 'edition', 'series',
'school', 'institution', 'organization',
'howpublished', 'doi', 'url', 'isbn', 'issn',
'note', 'abstract', 'keywords'
]
def parse_bibtex_file(self, filepath: str) -> List[Dict]:
"""
Parse BibTeX file and extract entries.
Args:
filepath: Path to BibTeX file
Returns:
List of entry dictionaries
"""
try:
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
except Exception as e:
print(f'Error reading file: {e}', file=sys.stderr)
return []
entries = []
# Match BibTeX entries
pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)
for match in matches:
entry_type = match.group(1).lower()
citation_key = match.group(2).strip()
fields_text = match.group(3)
# Parse fields
fields = OrderedDict()
field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
field_matches = re.finditer(field_pattern, fields_text)
for field_match in field_matches:
if field_match.group(1):
field_name = field_match.group(1).lower()
field_value = field_match.group(2)
else:
field_name = field_match.group(3).lower()
field_value = field_match.group(4)
fields[field_name] = field_value.strip()
entries.append({
'type': entry_type,
'key': citation_key,
'fields': fields
})
return entries
def format_entry(self, entry: Dict) -> str:
"""
Format a single BibTeX entry.
Args:
entry: Entry dictionary
Returns:
Formatted BibTeX string
"""
lines = [f'@{entry["type"]}{{{entry["key"]},']
# Order fields according to standard order
ordered_fields = OrderedDict()
# Add fields in standard order
for field_name in self.field_order:
if field_name in entry['fields']:
ordered_fields[field_name] = entry['fields'][field_name]
# Add any remaining fields
for field_name, field_value in entry['fields'].items():
if field_name not in ordered_fields:
ordered_fields[field_name] = field_value
# Format each field
max_field_len = max(len(f) for f in ordered_fields.keys()) if ordered_fields else 0
for field_name, field_value in ordered_fields.items():
# Pad field name for alignment
padded_field = field_name.ljust(max_field_len)
lines.append(f' {padded_field} = {{{field_value}}},')
# Remove trailing comma from last field
if lines[-1].endswith(','):
lines[-1] = lines[-1][:-1]
lines.append('}')
return '\n'.join(lines)
def fix_common_issues(self, entry: Dict) -> Dict:
"""
Fix common formatting issues in entry.
Args:
entry: Entry dictionary
Returns:
Fixed entry dictionary
"""
fixed = entry.copy()
fields = fixed['fields'].copy()
# Fix page ranges (single hyphen to double hyphen)
if 'pages' in fields:
pages = fields['pages']
# Replace single hyphen with double hyphen if it's a range
if re.search(r'\d-\d', pages) and '--' not in pages:
pages = re.sub(r'(\d)-(\d)', r'\1--\2', pages)
fields['pages'] = pages
# Remove "pp." from pages
if 'pages' in fields:
pages = fields['pages']
pages = re.sub(r'^pp\.\s*', '', pages, flags=re.IGNORECASE)
fields['pages'] = pages
# Fix DOI (remove URL prefix if present)
if 'doi' in fields:
doi = fields['doi']
doi = doi.replace('https://doi.org/', '')
doi = doi.replace('http://doi.org/', '')
doi = doi.replace('doi:', '')
fields['doi'] = doi
# Fix author separators (semicolon or ampersand to 'and')
if 'author' in fields:
author = fields['author']
author = author.replace(';', ' and')
author = author.replace(' & ', ' and ')
# Clean up multiple 'and's
author = re.sub(r'\s+and\s+and\s+', ' and ', author)
fields['author'] = author
fixed['fields'] = fields
return fixed
def deduplicate_entries(self, entries: List[Dict]) -> List[Dict]:
"""
Remove duplicate entries based on DOI or citation key.
Args:
entries: List of entry dictionaries
Returns:
List of unique entries
"""
seen_dois = set()
seen_keys = set()
unique_entries = []
for entry in entries:
doi = entry['fields'].get('doi', '').strip()
key = entry['key']
# Check DOI first (more reliable)
if doi:
if doi in seen_dois:
print(f'Duplicate DOI found: {doi} (skipping {key})', file=sys.stderr)
continue
seen_dois.add(doi)
# Check citation key
if key in seen_keys:
print(f'Duplicate citation key found: {key} (skipping)', file=sys.stderr)
continue
seen_keys.add(key)
unique_entries.append(entry)
return unique_entries
def sort_entries(self, entries: List[Dict], sort_by: str = 'key', descending: bool = False) -> List[Dict]:
    """
    Sort entries by the specified field (stable sort).

    Args:
        entries: List of entry dictionaries.
        sort_by: Field to sort by ('key', 'year', 'author', 'title');
            any other value falls back to sorting by citation key.
        descending: Sort in descending order.

    Returns:
        New sorted list of entries.
    """
    def get_sort_key(entry: Dict) -> str:
        if sort_by == 'year':
            # Entries without a year sort last.
            return entry['fields'].get('year', '9999')
        if sort_by == 'author':
            author = entry['fields'].get('author', '')
            if not author:
                return 'zzz'
            # Restrict to the first author before extracting a surname
            # (previously 'A and B' leaked the later names into the key).
            first_author = author.split(' and ')[0].strip()
            if ',' in first_author:
                # 'Last, First' form: surname precedes the comma.
                return first_author.split(',')[0].strip().lower()
            # 'First Last' form: surname is the final token (the old code
            # sorted on the FIRST token, i.e. the given name).
            parts = first_author.split()
            return parts[-1].lower() if parts else 'zzz'
        if sort_by == 'title':
            return entry['fields'].get('title', '').lower()
        # 'key' and any unrecognized value: sort by citation key.
        return entry['key'].lower()

    return sorted(entries, key=get_sort_key, reverse=descending)
def format_file(self, filepath: str, output: str = None,
                deduplicate: bool = False, sort_by: str = None,
                descending: bool = False, fix_issues: bool = True) -> None:
    """
    Format an entire BibTeX file: parse, optionally fix/deduplicate/sort,
    then write the re-formatted entries back out.

    All progress messages go to stderr so stdout stays clean.

    Args:
        filepath: Input BibTeX file.
        output: Output file (None overwrites the input file in place).
        deduplicate: Remove duplicate entries.
        sort_by: Field to sort by ('key', 'year', 'author', 'title').
        descending: Sort in descending order.
        fix_issues: Fix common formatting issues.

    Raises:
        SystemExit: With code 1 if the output file cannot be written.
    """
    print(f'Parsing {filepath}...', file=sys.stderr)
    entries = self.parse_bibtex_file(filepath)
    # Nothing to do for an empty/unparseable file; not treated as an error.
    if not entries:
        print('No entries found', file=sys.stderr)
        return
    print(f'Found {len(entries)} entries', file=sys.stderr)
    # Fix common issues (page dashes, DOI prefixes, author separators).
    if fix_issues:
        print('Fixing common issues...', file=sys.stderr)
        entries = [self.fix_common_issues(e) for e in entries]
    # Deduplicate (by DOI, then citation key; first occurrence wins).
    if deduplicate:
        print('Removing duplicates...', file=sys.stderr)
        original_count = len(entries)
        entries = self.deduplicate_entries(entries)
        removed = original_count - len(entries)
        if removed > 0:
            print(f'Removed {removed} duplicate(s)', file=sys.stderr)
    # Sort (stable) by the requested field.
    if sort_by:
        print(f'Sorting by {sort_by}...', file=sys.stderr)
        entries = self.sort_entries(entries, sort_by, descending)
    # Format entries back into BibTeX text.
    print('Formatting entries...', file=sys.stderr)
    formatted_entries = [self.format_entry(e) for e in entries]
    # Write output: entries separated by a blank line, trailing newline.
    output_content = '\n\n'.join(formatted_entries) + '\n'
    output_file = output or filepath
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(output_content)
        print(f'Successfully wrote {len(entries)} entries to {output_file}', file=sys.stderr)
    except Exception as e:
        print(f'Error writing file: {e}', file=sys.stderr)
        sys.exit(1)
def main():
    """Command-line interface for the BibTeX formatter.

    Parses arguments and delegates all work to BibTeXFormatter.format_file.
    By default the input file is overwritten in place and common issues
    are fixed; use --no-fix / -o to change that.
    """
    parser = argparse.ArgumentParser(
        description='Format, clean, sort, and deduplicate BibTeX files',
        epilog='Example: python format_bibtex.py references.bib --deduplicate --sort year'
    )
    parser.add_argument(
        'file',
        help='BibTeX file to format'
    )
    parser.add_argument(
        '-o', '--output',
        help='Output file (default: overwrite input file)'
    )
    parser.add_argument(
        '--deduplicate',
        action='store_true',
        help='Remove duplicate entries'
    )
    parser.add_argument(
        '--sort',
        choices=['key', 'year', 'author', 'title'],
        help='Sort entries by field'
    )
    parser.add_argument(
        '--descending',
        action='store_true',
        help='Sort in descending order'
    )
    parser.add_argument(
        '--no-fix',
        action='store_true',
        help='Do not fix common issues'
    )
    args = parser.parse_args()
    # Format file (fixing is opt-out, hence the negation of --no-fix).
    formatter = BibTeXFormatter()
    formatter.format_file(
        args.file,
        output=args.output,
        deduplicate=args.deduplicate,
        sort_by=args.sort,
        descending=args.descending,
        fix_issues=not args.no_fix
    )


# Run the CLI only when executed as a script.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Google Scholar Search Tool
Search Google Scholar and export results.
Note: This script requires the 'scholarly' library.
Install with: pip install scholarly
"""
import sys
import argparse
import json
import time
import random
from typing import List, Dict, Optional
try:
from scholarly import scholarly, ProxyGenerator
SCHOLARLY_AVAILABLE = True
except ImportError:
SCHOLARLY_AVAILABLE = False
print('Warning: scholarly library not installed. Install with: pip install scholarly', file=sys.stderr)
class GoogleScholarSearcher:
    """Search Google Scholar using the third-party 'scholarly' library.

    All methods print progress/diagnostics to stderr. Network access is
    required; heavy use may be rate-limited by Google Scholar, hence the
    optional free-proxy support and per-result sleep in search().
    """

    def __init__(self, use_proxy: bool = False):
        """
        Initialize searcher.

        Args:
            use_proxy: Use free proxy (helps avoid rate limiting)

        Raises:
            ImportError: If the 'scholarly' library is not installed.
        """
        if not SCHOLARLY_AVAILABLE:
            raise ImportError('scholarly library required. Install with: pip install scholarly')
        # Setup proxy if requested. Proxy failure is non-fatal: searching
        # still works, just with a higher risk of being rate-limited.
        if use_proxy:
            try:
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                print('Using free proxy', file=sys.stderr)
            except Exception as e:
                print(f'Warning: Could not setup proxy: {e}', file=sys.stderr)

    def search(self, query: str, max_results: int = 50,
               year_start: Optional[int] = None, year_end: Optional[int] = None,
               sort_by: str = 'relevance') -> List[Dict]:
        """
        Search Google Scholar.

        Note: year filtering is applied AFTER retrieval, so fewer than
        max_results entries may be returned even when more matches exist.

        Args:
            query: Search query
            max_results: Maximum number of results to retrieve
            year_start: Start year filter (inclusive)
            year_end: End year filter (inclusive)
            sort_by: Sort order ('relevance' or 'citations')

        Returns:
            List of result dictionaries (title, authors, year, venue,
            abstract, citations, url, eprint_url)
        """
        if not SCHOLARLY_AVAILABLE:
            print('Error: scholarly library not installed', file=sys.stderr)
            return []
        print(f'Searching Google Scholar: {query}', file=sys.stderr)
        print(f'Max results: {max_results}', file=sys.stderr)
        results = []
        try:
            # Perform search; scholarly returns a lazy iterator of pubs.
            search_query = scholarly.search_pubs(query)
            for i, result in enumerate(search_query):
                if i >= max_results:
                    break
                print(f'Retrieved {i+1}/{max_results}', file=sys.stderr)
                # Extract metadata; missing keys default to empty values.
                metadata = {
                    'title': result.get('bib', {}).get('title', ''),
                    'authors': ', '.join(result.get('bib', {}).get('author', [])),
                    'year': result.get('bib', {}).get('pub_year', ''),
                    'venue': result.get('bib', {}).get('venue', ''),
                    'abstract': result.get('bib', {}).get('abstract', ''),
                    'citations': result.get('num_citations', 0),
                    'url': result.get('pub_url', ''),
                    'eprint_url': result.get('eprint_url', ''),
                }
                # Filter by year; a non-numeric year is kept as-is
                # (ValueError deliberately skips the filter, not the entry).
                if year_start or year_end:
                    try:
                        pub_year = int(metadata['year']) if metadata['year'] else 0
                        if year_start and pub_year < year_start:
                            continue
                        if year_end and pub_year > year_end:
                            continue
                    except ValueError:
                        pass
                results.append(metadata)
                # Rate limiting to avoid blocking by Google Scholar.
                time.sleep(random.uniform(2, 5))
        except Exception as e:
            # Partial results gathered before the failure are still returned.
            print(f'Error during search: {e}', file=sys.stderr)
        # Sort if requested (most-cited first).
        if sort_by == 'citations' and results:
            results.sort(key=lambda x: x.get('citations', 0), reverse=True)
        return results

    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert a result dictionary from search() to a BibTeX entry string.

        Citation key format: <LastName><year><first long title word>.
        Entry type is guessed from the venue name ('proceedings'/'conference'
        -> @inproceedings, otherwise @article).
        """
        # Generate citation key from the first author's last name.
        # NOTE(review): assumes 'authors' is comma-separated "First Last"
        # names (as produced by search()); the last token is used as surname.
        if metadata.get('authors'):
            first_author = metadata['authors'].split(',')[0].strip()
            last_name = first_author.split()[-1] if first_author else 'Unknown'
        else:
            last_name = 'Unknown'
        year = metadata.get('year', 'XXXX')
        # Get keyword from title: first word of 4+ letters, lowercased.
        import re
        title = metadata.get('title', '')
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
        keyword = words[0].lower() if words else 'paper'
        citation_key = f'{last_name}{year}{keyword}'
        # Determine entry type (guess based on venue name).
        venue = metadata.get('venue', '').lower()
        if 'proceedings' in venue or 'conference' in venue:
            entry_type = 'inproceedings'
            venue_field = 'booktitle'
        else:
            entry_type = 'article'
            venue_field = 'journal'
        # Build BibTeX lines; fields are emitted only when present.
        lines = [f'@{entry_type}{{{citation_key},']
        # Convert authors from comma-separated to BibTeX ' and ' separators.
        if metadata.get('authors'):
            authors = metadata['authors'].replace(',', ' and')
            lines.append(f' author = {{{authors}}},')
        if metadata.get('title'):
            lines.append(f' title = {{{metadata["title"]}}},')
        if metadata.get('venue'):
            lines.append(f' {venue_field} = {{{metadata["venue"]}}},')
        if metadata.get('year'):
            lines.append(f' year = {{{metadata["year"]}}},')
        if metadata.get('url'):
            lines.append(f' url = {{{metadata["url"]}}},')
        if metadata.get('citations'):
            lines.append(f' note = {{Cited by: {metadata["citations"]}}},')
        # Remove trailing comma from the last field before closing brace.
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]
        lines.append('}')
        return '\n'.join(lines)
def main():
    """Command-line interface for the Google Scholar search tool.

    Searches Scholar via GoogleScholarSearcher and writes results as JSON
    or BibTeX to stdout or a file. Exits with code 1 when the scholarly
    library is missing or the search yields no results.
    """
    parser = argparse.ArgumentParser(
        description='Search Google Scholar (requires scholarly library)',
        epilog='Example: python search_google_scholar.py "machine learning" --limit 50'
    )
    parser.add_argument(
        'query',
        help='Search query'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=50,
        help='Maximum number of results (default: 50)'
    )
    parser.add_argument(
        '--year-start',
        type=int,
        help='Start year for filtering'
    )
    parser.add_argument(
        '--year-end',
        type=int,
        help='End year for filtering'
    )
    parser.add_argument(
        '--sort-by',
        choices=['relevance', 'citations'],
        default='relevance',
        help='Sort order (default: relevance)'
    )
    parser.add_argument(
        '--use-proxy',
        action='store_true',
        help='Use free proxy to avoid rate limiting'
    )
    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )
    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )
    args = parser.parse_args()
    # Bail out early with install instructions if scholarly is missing.
    if not SCHOLARLY_AVAILABLE:
        print('\nError: scholarly library not installed', file=sys.stderr)
        print('Install with: pip install scholarly', file=sys.stderr)
        print('\nAlternatively, use PubMed search for biomedical literature:', file=sys.stderr)
        print(' python search_pubmed.py "your query"', file=sys.stderr)
        sys.exit(1)
    # Search
    searcher = GoogleScholarSearcher(use_proxy=args.use_proxy)
    results = searcher.search(
        args.query,
        max_results=args.limit,
        year_start=args.year_start,
        year_end=args.year_end,
        sort_by=args.sort_by
    )
    if not results:
        print('No results found', file=sys.stderr)
        sys.exit(1)
    # Format output as a JSON document or blank-line-separated BibTeX.
    if args.format == 'json':
        output = json.dumps({
            'query': args.query,
            'count': len(results),
            'results': results
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(r) for r in results]
        output = '\n\n'.join(bibtex_entries) + '\n'
    # Write output to the requested file, or stdout by default.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(results)} results to {args.output}', file=sys.stderr)
    else:
        print(output)
        print(f'\nRetrieved {len(results)} results', file=sys.stderr)


# Run the CLI only when executed as a script.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,398 @@
#!/usr/bin/env python3
"""
PubMed Search Tool
Search PubMed using E-utilities API and export results.
"""
import sys
import os
import requests
import argparse
import json
import time
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
from datetime import datetime
class PubMedSearcher:
    """Search PubMed using the NCBI E-utilities API (ESearch + EFetch).

    Requests are rate-limited per NCBI policy: ~10/sec with an API key,
    ~3/sec without. Progress and errors are printed to stderr.
    """

    def __init__(self, api_key: Optional[str] = None, email: Optional[str] = None):
        """
        Initialize searcher.

        Args:
            api_key: NCBI API key (optional but recommended; falls back to
                the NCBI_API_KEY environment variable)
            email: Email for Entrez (optional but recommended; falls back
                to the NCBI_EMAIL environment variable)
        """
        self.api_key = api_key or os.getenv('NCBI_API_KEY', '')
        self.email = email or os.getenv('NCBI_EMAIL', '')
        self.base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        self.session = requests.Session()
        # Rate limiting: NCBI allows 10 req/sec with a key, 3 without.
        self.delay = 0.11 if self.api_key else 0.34  # 10/sec with key, 3/sec without

    def search(self, query: str, max_results: int = 100,
               date_start: Optional[str] = None, date_end: Optional[str] = None,
               publication_types: Optional[List[str]] = None) -> List[str]:
        """
        Search PubMed via ESearch and return matching PMIDs.

        Args:
            query: Search query (PubMed syntax)
            max_results: Maximum number of results
            date_start: Start date (YYYY/MM/DD or YYYY)
            date_end: End date (YYYY/MM/DD or YYYY)
            publication_types: List of publication types to filter

        Returns:
            List of PMIDs (empty on any request/parse failure).
        """
        # Build query with filters appended in PubMed field-tag syntax.
        full_query = query
        # Add date range; open ends default to 1900 / current year.
        if date_start or date_end:
            start = date_start or '1900'
            end = date_end or datetime.now().strftime('%Y')
            full_query += f' AND {start}:{end}[Publication Date]'
        # Add publication types, OR-ed together within one AND group.
        if publication_types:
            pub_type_query = ' OR '.join([f'"{pt}"[Publication Type]' for pt in publication_types])
            full_query += f' AND ({pub_type_query})'
        print(f'Searching PubMed: {full_query}', file=sys.stderr)
        # ESearch to get PMIDs (JSON response mode).
        esearch_url = self.base_url + 'esearch.fcgi'
        params = {
            'db': 'pubmed',
            'term': full_query,
            'retmax': max_results,
            'retmode': 'json'
        }
        if self.email:
            params['email'] = self.email
        if self.api_key:
            params['api_key'] = self.api_key
        try:
            response = self.session.get(esearch_url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            pmids = data['esearchresult']['idlist']
            count = int(data['esearchresult']['count'])
            print(f'Found {count} results, retrieving {len(pmids)}', file=sys.stderr)
            return pmids
        except Exception as e:
            print(f'Error searching PubMed: {e}', file=sys.stderr)
            return []

    def fetch_metadata(self, pmids: List[str]) -> List[Dict]:
        """
        Fetch article metadata for PMIDs via EFetch (XML mode).

        Failed batches are skipped, so the result may contain fewer
        records than PMIDs requested.

        Args:
            pmids: List of PubMed IDs

        Returns:
            List of metadata dictionaries (see _extract_metadata_from_xml).
        """
        if not pmids:
            return []
        metadata_list = []
        # Fetch in batches of 200 to keep request URLs/responses manageable.
        batch_size = 200
        for i in range(0, len(pmids), batch_size):
            batch = pmids[i:i+batch_size]
            print(f'Fetching metadata for PMIDs {i+1}-{min(i+batch_size, len(pmids))}...', file=sys.stderr)
            efetch_url = self.base_url + 'efetch.fcgi'
            params = {
                'db': 'pubmed',
                'id': ','.join(batch),
                'retmode': 'xml',
                'rettype': 'abstract'
            }
            if self.email:
                params['email'] = self.email
            if self.api_key:
                params['api_key'] = self.api_key
            try:
                response = self.session.get(efetch_url, params=params, timeout=60)
                response.raise_for_status()
                # Parse XML and extract one record per PubmedArticle.
                root = ET.fromstring(response.content)
                articles = root.findall('.//PubmedArticle')
                for article in articles:
                    metadata = self._extract_metadata_from_xml(article)
                    if metadata:
                        metadata_list.append(metadata)
                # Rate limiting between batches (NCBI usage policy).
                time.sleep(self.delay)
            except Exception as e:
                print(f'Error fetching metadata for batch: {e}', file=sys.stderr)
                continue
        return metadata_list

    def _extract_metadata_from_xml(self, article: ET.Element) -> Optional[Dict]:
        """Extract metadata from a PubmedArticle XML element.

        Returns None (with a stderr message) if any required sub-element
        is missing and causes an exception during extraction.
        """
        try:
            medline_citation = article.find('.//MedlineCitation')
            article_elem = medline_citation.find('.//Article')
            journal = article_elem.find('.//Journal')
            # Get PMID
            pmid = medline_citation.findtext('.//PMID', '')
            # Get DOI from the ArticleId list (IdType="doi").
            doi = None
            article_ids = article.findall('.//ArticleId')
            for article_id in article_ids:
                if article_id.get('IdType') == 'doi':
                    doi = article_id.text
                    break
            # Get authors as 'LastName, ForeName' strings (ForeName optional).
            authors = []
            author_list = article_elem.find('.//AuthorList')
            if author_list is not None:
                for author in author_list.findall('.//Author'):
                    last_name = author.findtext('.//LastName', '')
                    fore_name = author.findtext('.//ForeName', '')
                    if last_name:
                        if fore_name:
                            authors.append(f'{last_name}, {fore_name}')
                        else:
                            authors.append(last_name)
            # Get year; fall back to the first 4-digit run in MedlineDate
            # (used by PubMed for irregular publication dates).
            year = article_elem.findtext('.//Journal/JournalIssue/PubDate/Year', '')
            if not year:
                medline_date = article_elem.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
                if medline_date:
                    import re
                    year_match = re.search(r'\d{4}', medline_date)
                    if year_match:
                        year = year_match.group()
            metadata = {
                'pmid': pmid,
                'doi': doi,
                'title': article_elem.findtext('.//ArticleTitle', ''),
                'authors': ' and '.join(authors),
                'journal': journal.findtext('.//Title', ''),
                'year': year,
                'volume': journal.findtext('.//JournalIssue/Volume', ''),
                'issue': journal.findtext('.//JournalIssue/Issue', ''),
                'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
                'abstract': article_elem.findtext('.//Abstract/AbstractText', '')
            }
            return metadata
        except Exception as e:
            print(f'Error extracting metadata: {e}', file=sys.stderr)
            return None

    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert a metadata dictionary to a BibTeX @article entry string.

        Citation key format: <LastName><year>pmid<PMID>. Only non-empty
        fields are emitted; the PMID is recorded in a 'note' field.
        """
        # Generate citation key from the first author's last name; the
        # 'authors' field uses ' and ' separators (see _extract_metadata_from_xml).
        if metadata.get('authors'):
            first_author = metadata['authors'].split(' and ')[0]
            if ',' in first_author:
                last_name = first_author.split(',')[0].strip()
            else:
                last_name = first_author.split()[0]
        else:
            last_name = 'Unknown'
        year = metadata.get('year', 'XXXX')
        citation_key = f'{last_name}{year}pmid{metadata.get("pmid", "")}'
        # Build BibTeX entry line by line.
        lines = [f'@article{{{citation_key},']
        if metadata.get('authors'):
            lines.append(f' author = {{{metadata["authors"]}}},')
        if metadata.get('title'):
            lines.append(f' title = {{{metadata["title"]}}},')
        if metadata.get('journal'):
            lines.append(f' journal = {{{metadata["journal"]}}},')
        if metadata.get('year'):
            lines.append(f' year = {{{metadata["year"]}}},')
        if metadata.get('volume'):
            lines.append(f' volume = {{{metadata["volume"]}}},')
        if metadata.get('issue'):
            lines.append(f' number = {{{metadata["issue"]}}},')
        if metadata.get('pages'):
            # PubMed page ranges use a single hyphen; BibTeX wants '--'.
            pages = metadata['pages'].replace('-', '--')
            lines.append(f' pages = {{{pages}}},')
        if metadata.get('doi'):
            lines.append(f' doi = {{{metadata["doi"]}}},')
        if metadata.get('pmid'):
            lines.append(f' note = {{PMID: {metadata["pmid"]}}},')
        # Remove trailing comma from the last field before closing brace.
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]
        lines.append('}')
        return '\n'.join(lines)
def main():
    """Command-line interface for the PubMed search tool.

    The query may come from the positional argument, --query, or
    --query-file (the file's content takes precedence when given).
    Results are written as JSON or BibTeX to stdout or a file; exits
    with code 1 on missing query, read failure, or empty result set.
    """
    parser = argparse.ArgumentParser(
        description='Search PubMed using E-utilities API',
        epilog='Example: python search_pubmed.py "CRISPR gene editing" --limit 100'
    )
    parser.add_argument(
        'query',
        nargs='?',
        help='Search query (PubMed syntax)'
    )
    parser.add_argument(
        '--query',
        dest='query_arg',
        help='Search query (alternative to positional argument)'
    )
    parser.add_argument(
        '--query-file',
        help='File containing search query'
    )
    parser.add_argument(
        '--limit',
        type=int,
        default=100,
        help='Maximum number of results (default: 100)'
    )
    parser.add_argument(
        '--date-start',
        help='Start date (YYYY/MM/DD or YYYY)'
    )
    parser.add_argument(
        '--date-end',
        help='End date (YYYY/MM/DD or YYYY)'
    )
    parser.add_argument(
        '--publication-types',
        help='Comma-separated publication types (e.g., "Review,Clinical Trial")'
    )
    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )
    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )
    parser.add_argument(
        '--api-key',
        help='NCBI API key (or set NCBI_API_KEY env var)'
    )
    parser.add_argument(
        '--email',
        help='Email for Entrez (or set NCBI_EMAIL env var)'
    )
    args = parser.parse_args()
    # Get query: positional wins over --query; --query-file overrides both.
    query = args.query or args.query_arg
    if args.query_file:
        try:
            with open(args.query_file, 'r', encoding='utf-8') as f:
                query = f.read().strip()
        except Exception as e:
            print(f'Error reading query file: {e}', file=sys.stderr)
            sys.exit(1)
    if not query:
        parser.print_help()
        sys.exit(1)
    # Parse publication types from the comma-separated option value.
    pub_types = None
    if args.publication_types:
        pub_types = [pt.strip() for pt in args.publication_types.split(',')]
    # Search PubMed for matching PMIDs.
    searcher = PubMedSearcher(api_key=args.api_key, email=args.email)
    pmids = searcher.search(
        query,
        max_results=args.limit,
        date_start=args.date_start,
        date_end=args.date_end,
        publication_types=pub_types
    )
    if not pmids:
        print('No results found', file=sys.stderr)
        sys.exit(1)
    # Fetch full metadata records for the PMIDs.
    metadata_list = searcher.fetch_metadata(pmids)
    # Format output as a JSON document or blank-line-separated BibTeX.
    if args.format == 'json':
        output = json.dumps({
            'query': query,
            'count': len(metadata_list),
            'results': metadata_list
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(m) for m in metadata_list]
        output = '\n\n'.join(bibtex_entries) + '\n'
    # Write output to the requested file, or stdout by default.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(metadata_list)} results to {args.output}', file=sys.stderr)
    else:
        print(output)


# Run the CLI only when executed as a script.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,497 @@
#!/usr/bin/env python3
"""
Citation Validation Tool
Validate BibTeX files for accuracy, completeness, and format compliance.
"""
import sys
import re
import requests
import argparse
import json
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
class CitationValidator:
    """Validate BibTeX entries for errors and inconsistencies.

    Checks per-entry required/recommended fields and field formats,
    detects duplicates across entries, and can optionally verify that
    DOIs resolve (network access via doi.org and the CrossRef API).
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'CitationValidator/1.0 (Citation Management Tool)'
        })
        # Required fields by entry type (missing one is a 'high' error).
        self.required_fields = {
            'article': ['author', 'title', 'journal', 'year'],
            'book': ['title', 'publisher', 'year'],  # author OR editor
            'inproceedings': ['author', 'title', 'booktitle', 'year'],
            'incollection': ['author', 'title', 'booktitle', 'publisher', 'year'],
            'phdthesis': ['author', 'title', 'school', 'year'],
            'mastersthesis': ['author', 'title', 'school', 'year'],
            'techreport': ['author', 'title', 'institution', 'year'],
            'misc': ['title', 'year']
        }
        # Recommended fields (missing one is only a 'medium' warning).
        self.recommended_fields = {
            'article': ['volume', 'pages', 'doi'],
            'book': ['isbn'],
            'inproceedings': ['pages'],
        }

    def parse_bibtex_file(self, filepath: str) -> List[Dict]:
        """
        Parse BibTeX file and extract entries.

        NOTE(review): regex-based parsing — it does not handle nested
        braces inside field values or entries whose closing brace is not
        at the start of a line; such entries may be missed or truncated.

        Args:
            filepath: Path to BibTeX file

        Returns:
            List of entry dictionaries with 'type', 'key', 'fields', 'raw'
            (empty list if the file cannot be read).
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f'Error reading file: {e}', file=sys.stderr)
            return []
        entries = []
        # Match BibTeX entries: @type{key, ...fields...} with the closing
        # brace on its own line.
        pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
        matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)
        for match in matches:
            entry_type = match.group(1).lower()
            citation_key = match.group(2).strip()
            fields_text = match.group(3)
            # Parse fields: either name = {value} or name = "value".
            fields = {}
            field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
            field_matches = re.finditer(field_pattern, fields_text)
            for field_match in field_matches:
                # Groups 1/2 hold brace-delimited fields, 3/4 quoted ones.
                if field_match.group(1):
                    field_name = field_match.group(1).lower()
                    field_value = field_match.group(2)
                else:
                    field_name = field_match.group(3).lower()
                    field_value = field_match.group(4)
                fields[field_name] = field_value.strip()
            entries.append({
                'type': entry_type,
                'key': citation_key,
                'fields': fields,
                'raw': match.group(0)
            })
        return entries

    def validate_entry(self, entry: Dict) -> Tuple[List[Dict], List[Dict]]:
        """
        Validate a single BibTeX entry.

        Checks: required fields per type (with the book author-or-editor
        special case), recommended fields, year format/range, DOI format,
        page-range dashes, and author separator style.

        Args:
            entry: Entry dictionary

        Returns:
            Tuple of (errors, warnings); each item is a dict with 'type',
            'severity', 'message' and check-specific keys.
        """
        errors = []
        warnings = []
        entry_type = entry['type']
        key = entry['key']
        fields = entry['fields']
        # Check required fields for the entry type.
        if entry_type in self.required_fields:
            for req_field in self.required_fields[entry_type]:
                if req_field not in fields or not fields[req_field]:
                    # Special case: book can have author OR editor.
                    if entry_type == 'book' and req_field == 'author':
                        if 'editor' not in fields or not fields['editor']:
                            errors.append({
                                'type': 'missing_required_field',
                                'field': 'author or editor',
                                'severity': 'high',
                                'message': f'Entry {key}: Missing required field "author" or "editor"'
                            })
                    else:
                        errors.append({
                            'type': 'missing_required_field',
                            'field': req_field,
                            'severity': 'high',
                            'message': f'Entry {key}: Missing required field "{req_field}"'
                        })
        # Check recommended fields (warnings only).
        if entry_type in self.recommended_fields:
            for rec_field in self.recommended_fields[entry_type]:
                if rec_field not in fields or not fields[rec_field]:
                    warnings.append({
                        'type': 'missing_recommended_field',
                        'field': rec_field,
                        'severity': 'medium',
                        'message': f'Entry {key}: Missing recommended field "{rec_field}"'
                    })
        # Validate year: must be exactly 4 digits; 1600-2030 is 'reasonable'.
        if 'year' in fields:
            year = fields['year']
            if not re.match(r'^\d{4}$', year):
                errors.append({
                    'type': 'invalid_year',
                    'field': 'year',
                    'value': year,
                    'severity': 'high',
                    'message': f'Entry {key}: Invalid year format "{year}" (should be 4 digits)'
                })
            elif int(year) < 1600 or int(year) > 2030:
                warnings.append({
                    'type': 'suspicious_year',
                    'field': 'year',
                    'value': year,
                    'severity': 'medium',
                    'message': f'Entry {key}: Suspicious year "{year}" (outside reasonable range)'
                })
        # Validate DOI format: bare DOI starting with '10.<digits>/'.
        if 'doi' in fields:
            doi = fields['doi']
            if not re.match(r'^10\.\d{4,}/[^\s]+$', doi):
                warnings.append({
                    'type': 'invalid_doi_format',
                    'field': 'doi',
                    'value': doi,
                    'severity': 'medium',
                    'message': f'Entry {key}: Invalid DOI format "{doi}"'
                })
        # Check for single hyphen in pages (BibTeX ranges should use --).
        if 'pages' in fields:
            pages = fields['pages']
            if re.search(r'\d-\d', pages) and '--' not in pages:
                warnings.append({
                    'type': 'page_range_format',
                    'field': 'pages',
                    'value': pages,
                    'severity': 'low',
                    'message': f'Entry {key}: Page range uses single hyphen, should use -- (en-dash)'
                })
        # Check author format: BibTeX separates names with ' and '.
        if 'author' in fields:
            author = fields['author']
            if ';' in author or '&' in author:
                errors.append({
                    'type': 'invalid_author_format',
                    'field': 'author',
                    'severity': 'high',
                    'message': f'Entry {key}: Authors should be separated by " and ", not ";" or "&"'
                })
        return errors, warnings

    def verify_doi(self, doi: str) -> Tuple[bool, Optional[Dict]]:
        """
        Verify DOI resolves correctly and get metadata from CrossRef.

        Any network failure is treated as a non-resolving DOI.

        Args:
            doi: Digital Object Identifier (bare form, no resolver prefix)

        Returns:
            Tuple of (is_valid, metadata); metadata is None when the DOI
            resolves but CrossRef has no record, or on failure.
        """
        try:
            url = f'https://doi.org/{doi}'
            # HEAD request with redirects: a <400 status means it resolves.
            response = self.session.head(url, timeout=10, allow_redirects=True)
            if response.status_code < 400:
                # DOI resolves, now get metadata from CrossRef.
                crossref_url = f'https://api.crossref.org/works/{doi}'
                metadata_response = self.session.get(crossref_url, timeout=10)
                if metadata_response.status_code == 200:
                    data = metadata_response.json()
                    message = data.get('message', {})
                    # Extract key metadata for comparison purposes.
                    metadata = {
                        'title': message.get('title', [''])[0],
                        'year': self._extract_year_crossref(message),
                        'authors': self._format_authors_crossref(message.get('author', [])),
                    }
                    return True, metadata
                else:
                    return True, None  # DOI resolves but no CrossRef metadata
            else:
                return False, None
        except Exception:
            return False, None

    def detect_duplicates(self, entries: List[Dict]) -> List[Dict]:
        """
        Detect duplicate entries by DOI, citation key, and normalized title.

        Args:
            entries: List of entry dictionaries

        Returns:
            List of duplicate-report dictionaries ('type', 'severity',
            'message' plus identifying keys).
        """
        duplicates = []
        # Check for duplicate DOIs (same DOI under different keys).
        doi_map = defaultdict(list)
        for entry in entries:
            doi = entry['fields'].get('doi', '').strip()
            if doi:
                doi_map[doi].append(entry['key'])
        for doi, keys in doi_map.items():
            if len(keys) > 1:
                duplicates.append({
                    'type': 'duplicate_doi',
                    'doi': doi,
                    'entries': keys,
                    'severity': 'high',
                    'message': f'Duplicate DOI {doi} found in entries: {", ".join(keys)}'
                })
        # Check for duplicate citation keys (same key used multiple times).
        key_counts = defaultdict(int)
        for entry in entries:
            key_counts[entry['key']] += 1
        for key, count in key_counts.items():
            if count > 1:
                duplicates.append({
                    'type': 'duplicate_key',
                    'key': key,
                    'count': count,
                    'severity': 'high',
                    'message': f'Citation key "{key}" appears {count} times'
                })
        # Check for similar titles (possible duplicates): titles are
        # lowercased with punctuation removed and whitespace normalized
        # before an exact comparison.
        titles = {}
        for entry in entries:
            title = entry['fields'].get('title', '').lower()
            title = re.sub(r'[^\w\s]', '', title)  # Remove punctuation
            title = ' '.join(title.split())  # Normalize whitespace
            if title:
                if title in titles:
                    duplicates.append({
                        'type': 'similar_title',
                        'entries': [titles[title], entry['key']],
                        'severity': 'medium',
                        'message': f'Possible duplicate: "{titles[title]}" and "{entry["key"]}" have identical titles'
                    })
                else:
                    titles[title] = entry['key']
        return duplicates

    def validate_file(self, filepath: str, check_dois: bool = False) -> Dict:
        """
        Validate entire BibTeX file.

        Args:
            filepath: Path to BibTeX file
            check_dois: Whether to verify DOIs over the network (slow)

        Returns:
            Validation report dictionary with totals plus 'errors',
            'warnings', and 'duplicates' lists.
        """
        print(f'Parsing {filepath}...', file=sys.stderr)
        entries = self.parse_bibtex_file(filepath)
        if not entries:
            # Empty report shape for unreadable or entry-less files.
            return {
                'total_entries': 0,
                'errors': [],
                'warnings': [],
                'duplicates': []
            }
        print(f'Found {len(entries)} entries', file=sys.stderr)
        all_errors = []
        all_warnings = []
        # Validate each entry; tag each finding with its entry key.
        for i, entry in enumerate(entries):
            print(f'Validating entry {i+1}/{len(entries)}: {entry["key"]}', file=sys.stderr)
            errors, warnings = self.validate_entry(entry)
            for error in errors:
                error['entry'] = entry['key']
                all_errors.append(error)
            for warning in warnings:
                warning['entry'] = entry['key']
                all_warnings.append(warning)
        # Check for duplicates across the whole file.
        print('Checking for duplicates...', file=sys.stderr)
        duplicates = self.detect_duplicates(entries)
        # Verify DOIs if requested (one network round-trip per DOI).
        doi_errors = []
        if check_dois:
            print('Verifying DOIs...', file=sys.stderr)
            for i, entry in enumerate(entries):
                doi = entry['fields'].get('doi', '')
                if doi:
                    print(f'Verifying DOI {i+1}: {doi}', file=sys.stderr)
                    is_valid, metadata = self.verify_doi(doi)
                    if not is_valid:
                        doi_errors.append({
                            'type': 'invalid_doi',
                            'entry': entry['key'],
                            'doi': doi,
                            'severity': 'high',
                            'message': f'Entry {entry["key"]}: DOI does not resolve: {doi}'
                        })
        all_errors.extend(doi_errors)
        # NOTE(review): 'valid_entries' counts high-severity ERRORS, not
        # distinct entries, so one entry with several errors is counted
        # multiple times — confirm whether that is intended.
        return {
            'filepath': filepath,
            'total_entries': len(entries),
            'valid_entries': len(entries) - len([e for e in all_errors if e['severity'] == 'high']),
            'errors': all_errors,
            'warnings': all_warnings,
            'duplicates': duplicates
        }

    def _extract_year_crossref(self, message: Dict) -> str:
        """Extract the publication year from a CrossRef 'message' dict.

        Prefers 'published-print', falling back to 'published-online';
        returns '' when neither has date-parts.
        """
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])
        if date_parts and date_parts[0]:
            return str(date_parts[0][0])
        return ''

    def _format_authors_crossref(self, authors: List[Dict]) -> str:
        """Format a CrossRef author list as 'Family, Given' strings.

        Only the first 3 authors are listed; longer lists end in 'et al.'.
        """
        if not authors:
            return ''
        formatted = []
        for author in authors[:3]:  # First 3 authors
            given = author.get('given', '')
            family = author.get('family', '')
            if family:
                formatted.append(f'{family}, {given}' if given else family)
        if len(authors) > 3:
            formatted.append('et al.')
        return ', '.join(formatted)
def main():
    """Command-line interface for the citation validator.

    Validates a BibTeX file, prints a human-readable report to stdout,
    optionally saves the full report as JSON, and exits with code 1 when
    any errors were found (so it can gate CI pipelines).
    """
    parser = argparse.ArgumentParser(
        description='Validate BibTeX files for errors and inconsistencies',
        epilog='Example: python validate_citations.py references.bib'
    )
    parser.add_argument(
        'file',
        help='BibTeX file to validate'
    )
    parser.add_argument(
        '--check-dois',
        action='store_true',
        help='Verify DOIs resolve correctly (slow)'
    )
    parser.add_argument(
        '--auto-fix',
        action='store_true',
        help='Attempt to auto-fix common issues (not implemented yet)'
    )
    parser.add_argument(
        '--report',
        help='Output file for JSON validation report'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed output'
    )
    args = parser.parse_args()
    # Validate file
    validator = CitationValidator()
    report = validator.validate_file(args.file, check_dois=args.check_dois)
    # Print summary header and counts.
    print('\n' + '='*60)
    print('CITATION VALIDATION REPORT')
    print('='*60)
    print(f'\nFile: {args.file}')
    print(f'Total entries: {report["total_entries"]}')
    print(f'Valid entries: {report["valid_entries"]}')
    print(f'Errors: {len(report["errors"])}')
    print(f'Warnings: {len(report["warnings"])}')
    print(f'Duplicates: {len(report["duplicates"])}')
    # Print errors (always shown; extra detail with --verbose).
    if report['errors']:
        print('\n' + '-'*60)
        print('ERRORS (must fix):')
        print('-'*60)
        for error in report['errors']:
            print(f'\n{error["message"]}')
            if args.verbose:
                print(f' Type: {error["type"]}')
                print(f' Severity: {error["severity"]}')
    # Print warnings (only with --verbose).
    if report['warnings'] and args.verbose:
        print('\n' + '-'*60)
        print('WARNINGS (should fix):')
        print('-'*60)
        for warning in report['warnings']:
            print(f'\n{warning["message"]}')
    # Print duplicates (always shown when present).
    if report['duplicates']:
        print('\n' + '-'*60)
        print('DUPLICATES:')
        print('-'*60)
        for dup in report['duplicates']:
            print(f'\n{dup["message"]}')
    # Save full machine-readable report when requested.
    if args.report:
        with open(args.report, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        print(f'\nDetailed report saved to: {args.report}')
    # Exit with error code if there are errors (useful for CI gating).
    if report['errors']:
        sys.exit(1)


# Run the CLI only when executed as a script.
if __name__ == '__main__':
    main()