11 KiB
Database Access with Bio.Entrez
Overview
Bio.Entrez provides programmatic access to NCBI's Entrez databases, including PubMed, GenBank, Gene, Protein, Nucleotide, and many others. It handles all the complexity of API calls, rate limiting, and data parsing.
Setup and Configuration
Email Address (Required)
NCBI requires an email address to track usage and contact users if issues arise:
from Bio import Entrez
# Always set your email
Entrez.email = "your.email@example.com"
API Key (Recommended)
Using an API key increases rate limits from 3 to 10 requests per second:
# Get API key from: https://www.ncbi.nlm.nih.gov/account/settings/
Entrez.api_key = "your_api_key_here"
Rate Limiting
Biopython automatically respects NCBI rate limits:
- Without API key: 3 requests per second
- With API key: 10 requests per second
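The module enforces these limits automatically within a single Python process, so you don't need to add delays between requests there. If you run several processes in parallel, throttle them yourself: each process tracks its own request timing, so together they can still exceed NCBI's limits.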
Core Entrez Functions
EInfo - Database Information
Get information about available databases and their statistics:
# List all databases
handle = Entrez.einfo()
result = Entrez.read(handle)
print(result["DbList"])
# Get information about a specific database
handle = Entrez.einfo(db="pubmed")
result = Entrez.read(handle)
print(result["DbInfo"]["Description"])
print(result["DbInfo"]["Count"]) # Number of records
ESearch - Search Databases
Search for records and retrieve their IDs:
# Search PubMed
handle = Entrez.esearch(db="pubmed", term="biopython")
result = Entrez.read(handle)
handle.close()
id_list = result["IdList"]
count = result["Count"]
print(f"Found {count} results")
print(f"Retrieved IDs: {id_list}")
Advanced ESearch Parameters
# Search with additional parameters
handle = Entrez.esearch(
db="pubmed",
term="biopython[Title]",
retmax=100, # Return up to 100 IDs
sort="relevance", # Sort by relevance
reldate=365, # Only results from last year
datetype="pdat" # Use publication date
)
result = Entrez.read(handle)
handle.close()
ESummary - Get Record Summaries
Retrieve summary information for a list of IDs:
# Get summaries for multiple records
handle = Entrez.esummary(db="pubmed", id="19304878,18606172")
results = Entrez.read(handle)
handle.close()
for record in results:
print(f"Title: {record['Title']}")
print(f"Authors: {record['AuthorList']}")
print(f"Journal: {record['Source']}")
print()
EFetch - Retrieve Full Records
Fetch complete records in various formats:
# Fetch a GenBank record
handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
record_text = handle.read()
handle.close()
# Parse with SeqIO
from Bio import SeqIO
handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
handle.close()
print(record.description)
EFetch Return Types
Different databases support different return types:
Nucleotide/Protein:
- rettype="fasta" - FASTA format
- rettype="gb" or "genbank" - GenBank format
- rettype="gp" - GenPept format (proteins)
PubMed:
- rettype="medline" - MEDLINE format
- rettype="abstract" - Abstract text
Common modes:
- retmode="text" - Plain text
- retmode="xml" - XML format
ELink - Find Related Records
Find links between records in different databases:
# Find protein records linked to a nucleotide record
handle = Entrez.elink(dbfrom="nucleotide", db="protein", id="EU490707")
result = Entrez.read(handle)
handle.close()
# Extract linked IDs
for linkset in result[0]["LinkSetDb"]:
if linkset["LinkName"] == "nucleotide_protein":
protein_ids = [link["Id"] for link in linkset["Link"]]
print(f"Linked protein IDs: {protein_ids}")
EPost - Upload ID Lists
Upload large lists of IDs to the server for later use:
# Post IDs to server
id_list = ["19304878", "18606172", "16403221"]
handle = Entrez.epost(db="pubmed", id=",".join(id_list))
result = Entrez.read(handle)
handle.close()
# Get query_key and WebEnv for later use
query_key = result["QueryKey"]
webenv = result["WebEnv"]
# Use in subsequent queries
handle = Entrez.efetch(
db="pubmed",
query_key=query_key,
WebEnv=webenv,
rettype="medline",
retmode="text"
)
EGQuery - Global Query
Search across all Entrez databases at once:
handle = Entrez.egquery(term="biopython")
result = Entrez.read(handle)
handle.close()
for row in result["eGQueryResult"]:
print(f"{row['DbName']}: {row['Count']} results")
ESpell - Spelling Suggestions
Get spelling suggestions for search terms:
handle = Entrez.espell(db="pubmed", term="biopythn")
result = Entrez.read(handle)
handle.close()
print(f"Original: {result['Query']}")
print(f"Suggestion: {result['CorrectedQuery']}")
Working with Different Databases
PubMed
# Search for articles
handle = Entrez.esearch(db="pubmed", term="cancer genomics", retmax=10)
result = Entrez.read(handle)
handle.close()
# Fetch abstracts
handle = Entrez.efetch(
db="pubmed",
id=result["IdList"],
rettype="medline",
retmode="text"
)
records = handle.read()
handle.close()
print(records)
GenBank / Nucleotide
# Search for sequences
handle = Entrez.esearch(db="nucleotide", term="Cypripedioideae[Orgn] AND matK[Gene]")
result = Entrez.read(handle)
handle.close()
# Fetch sequences
if result["IdList"]:
handle = Entrez.efetch(
db="nucleotide",
id=result["IdList"][:5],
rettype="fasta",
retmode="text"
)
sequences = handle.read()
handle.close()
Protein
# Search for protein sequences
handle = Entrez.esearch(db="protein", term="human insulin")
result = Entrez.read(handle)
handle.close()
# Fetch protein records
from Bio import SeqIO
handle = Entrez.efetch(
db="protein",
id=result["IdList"][:5],
rettype="gp",
retmode="text"
)
records = SeqIO.parse(handle, "genbank")
for record in records:
print(f"{record.id}: {record.description}")
handle.close()
Gene
# Search for gene records
handle = Entrez.esearch(db="gene", term="BRCA1[Gene] AND human[Organism]")
result = Entrez.read(handle)
handle.close()
# Get gene information
handle = Entrez.efetch(db="gene", id=result["IdList"][0], retmode="xml")
record = Entrez.read(handle)
handle.close()
Taxonomy
# Search for organism
handle = Entrez.esearch(db="taxonomy", term="Homo sapiens")
result = Entrez.read(handle)
handle.close()
# Fetch taxonomic information
handle = Entrez.efetch(db="taxonomy", id=result["IdList"][0], retmode="xml")
records = Entrez.read(handle)
handle.close()
for record in records:
print(f"TaxID: {record['TaxId']}")
print(f"Scientific Name: {record['ScientificName']}")
print(f"Lineage: {record['Lineage']}")
Parsing Entrez Results
Reading XML Results
# Most results can be parsed with Entrez.read()
handle = Entrez.efetch(db="pubmed", id="19304878", retmode="xml")
records = Entrez.read(handle)
handle.close()
# Access parsed data
article = records['PubmedArticle'][0]['MedlineCitation']['Article']
print(article['ArticleTitle'])
Handling Large Result Sets
# Batch processing for large searches
search_term = "cancer[Title]"
handle = Entrez.esearch(db="pubmed", term=search_term, retmax=0)
result = Entrez.read(handle)
handle.close()
total_count = int(result["Count"])
batch_size = 500
for start in range(0, total_count, batch_size):
# Fetch batch
handle = Entrez.esearch(
db="pubmed",
term=search_term,
retstart=start,
retmax=batch_size
)
result = Entrez.read(handle)
handle.close()
# Process IDs
id_list = result["IdList"]
print(f"Processing IDs {start} to {start + len(id_list)}")
Advanced Patterns
Search History with WebEnv
# Perform search and store on server
handle = Entrez.esearch(
db="pubmed",
term="biopython",
usehistory="y"
)
result = Entrez.read(handle)
handle.close()
webenv = result["WebEnv"]
query_key = result["QueryKey"]
count = int(result["Count"])
# Fetch results in batches using history
batch_size = 100
for start in range(0, count, batch_size):
handle = Entrez.efetch(
db="pubmed",
retstart=start,
retmax=batch_size,
rettype="medline",
retmode="text",
webenv=webenv,
query_key=query_key
)
data = handle.read()
handle.close()
# Process data
Combining Searches
# Use boolean operators
complex_search = "(cancer[Title]) AND (genomics[Title]) AND 2020:2025[PDAT]"
handle = Entrez.esearch(db="pubmed", term=complex_search, retmax=100)
result = Entrez.read(handle)
handle.close()
Best Practices
- Always set Entrez.email - Required by NCBI
- Use API key for higher rate limits (10 req/s vs 3 req/s)
- Close handles after reading to free resources
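- Batch large requests - Use retstart and retmax for pagination (note: for PubMed, ESearch can only retrieve the first 10,000 records matching a query; narrow the query, e.g. by date range, to reach more)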
- Use WebEnv for large downloads - Store results on server
- Cache locally - Download once and save to avoid repeated requests
- Handle errors gracefully - Network issues and API limits can occur
- Respect NCBI guidelines - Don't overwhelm the service
- Use appropriate rettype - Choose format that matches your needs
- Parse XML carefully - Structure varies by database and record type
Error Handling
from urllib.error import HTTPError
from Bio import Entrez
Entrez.email = "your.email@example.com"
try:
handle = Entrez.efetch(db="nucleotide", id="invalid_id", rettype="gb")
record = handle.read()
handle.close()
except HTTPError as e:
print(f"HTTP Error: {e.code} - {e.reason}")
except Exception as e:
print(f"Error: {e}")
Common Use Cases
Download GenBank Records
from Bio import Entrez, SeqIO
Entrez.email = "your.email@example.com"
# List of accession numbers
accessions = ["EU490707", "EU490708", "EU490709"]
for acc in accessions:
handle = Entrez.efetch(db="nucleotide", id=acc, rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
handle.close()
# Save to file
SeqIO.write(record, f"{acc}.gb", "genbank")
Search and Download Papers
# Search PubMed
handle = Entrez.esearch(db="pubmed", term="machine learning bioinformatics", retmax=20)
result = Entrez.read(handle)
handle.close()
# Get details
handle = Entrez.efetch(db="pubmed", id=result["IdList"], retmode="xml")
papers = Entrez.read(handle)
handle.close()
# Extract information
for paper in papers['PubmedArticle']:
article = paper['MedlineCitation']['Article']
print(f"Title: {article['ArticleTitle']}")
print(f"Journal: {article['Journal']['Title']}")
print()
Find Related Sequences
# Start with one sequence
handle = Entrez.efetch(db="nucleotide", id="EU490707", rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
handle.close()
# Find similar sequences
handle = Entrez.elink(dbfrom="nucleotide", db="nucleotide", id="EU490707")
result = Entrez.read(handle)
handle.close()
# Get related IDs
related_ids = []
for linkset in result[0]["LinkSetDb"]:
for link in linkset["Link"]:
related_ids.append(link["Id"])