Initial commit
This commit is contained in:
309
skills/bioservices/scripts/pathway_analysis.py
Executable file
309
skills/bioservices/scripts/pathway_analysis.py
Executable file
@@ -0,0 +1,309 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
KEGG Pathway Network Analysis
|
||||
|
||||
This script analyzes all pathways for an organism and extracts:
|
||||
- Pathway sizes (number of genes)
|
||||
- Protein-protein interactions
|
||||
- Interaction type distributions
|
||||
- Network data in various formats (CSV, SIF)
|
||||
|
||||
Usage:
|
||||
python pathway_analysis.py ORGANISM OUTPUT_DIR [--limit N]
|
||||
|
||||
Examples:
|
||||
python pathway_analysis.py hsa ./human_pathways
|
||||
python pathway_analysis.py mmu ./mouse_pathways --limit 50
|
||||
|
||||
Organism codes:
|
||||
hsa = Homo sapiens (human)
|
||||
mmu = Mus musculus (mouse)
|
||||
dme = Drosophila melanogaster
|
||||
sce = Saccharomyces cerevisiae (yeast)
|
||||
eco = Escherichia coli
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import csv
|
||||
from collections import Counter
|
||||
from bioservices import KEGG
|
||||
|
||||
|
||||
def get_all_pathways(kegg, organism):
|
||||
"""Get all pathway IDs for organism."""
|
||||
print(f"\nRetrieving pathways for {organism}...")
|
||||
|
||||
kegg.organism = organism
|
||||
pathway_ids = kegg.pathwayIds
|
||||
|
||||
print(f"✓ Found {len(pathway_ids)} pathways")
|
||||
|
||||
return pathway_ids
|
||||
|
||||
|
||||
def analyze_pathway(kegg, pathway_id):
|
||||
"""Analyze single pathway for size and interactions."""
|
||||
try:
|
||||
# Parse KGML pathway
|
||||
kgml = kegg.parse_kgml_pathway(pathway_id)
|
||||
|
||||
entries = kgml.get('entries', [])
|
||||
relations = kgml.get('relations', [])
|
||||
|
||||
# Count relation types
|
||||
relation_types = Counter()
|
||||
for rel in relations:
|
||||
rel_type = rel.get('name', 'unknown')
|
||||
relation_types[rel_type] += 1
|
||||
|
||||
# Get pathway name
|
||||
try:
|
||||
entry = kegg.get(pathway_id)
|
||||
pathway_name = "Unknown"
|
||||
for line in entry.split("\n"):
|
||||
if line.startswith("NAME"):
|
||||
pathway_name = line.replace("NAME", "").strip()
|
||||
break
|
||||
except:
|
||||
pathway_name = "Unknown"
|
||||
|
||||
result = {
|
||||
'pathway_id': pathway_id,
|
||||
'pathway_name': pathway_name,
|
||||
'num_entries': len(entries),
|
||||
'num_relations': len(relations),
|
||||
'relation_types': dict(relation_types),
|
||||
'entries': entries,
|
||||
'relations': relations
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✗ Error analyzing {pathway_id}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def analyze_all_pathways(kegg, pathway_ids, limit=None):
|
||||
"""Analyze all pathways."""
|
||||
if limit:
|
||||
pathway_ids = pathway_ids[:limit]
|
||||
print(f"\n⚠ Limiting analysis to first {limit} pathways")
|
||||
|
||||
print(f"\nAnalyzing {len(pathway_ids)} pathways...")
|
||||
|
||||
results = []
|
||||
for i, pathway_id in enumerate(pathway_ids, 1):
|
||||
print(f" [{i}/{len(pathway_ids)}] {pathway_id}", end="\r")
|
||||
|
||||
result = analyze_pathway(kegg, pathway_id)
|
||||
if result:
|
||||
results.append(result)
|
||||
|
||||
print(f"\n✓ Successfully analyzed {len(results)}/{len(pathway_ids)} pathways")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def save_pathway_summary(results, output_file):
|
||||
"""Save pathway summary to CSV."""
|
||||
print(f"\nSaving pathway summary to {output_file}...")
|
||||
|
||||
with open(output_file, 'w', newline='') as f:
|
||||
writer = csv.writer(f)
|
||||
|
||||
# Header
|
||||
writer.writerow([
|
||||
'Pathway_ID',
|
||||
'Pathway_Name',
|
||||
'Num_Genes',
|
||||
'Num_Interactions',
|
||||
'Activation',
|
||||
'Inhibition',
|
||||
'Phosphorylation',
|
||||
'Binding',
|
||||
'Other'
|
||||
])
|
||||
|
||||
# Data
|
||||
for result in results:
|
||||
rel_types = result['relation_types']
|
||||
|
||||
writer.writerow([
|
||||
result['pathway_id'],
|
||||
result['pathway_name'],
|
||||
result['num_entries'],
|
||||
result['num_relations'],
|
||||
rel_types.get('activation', 0),
|
||||
rel_types.get('inhibition', 0),
|
||||
rel_types.get('phosphorylation', 0),
|
||||
rel_types.get('binding/association', 0),
|
||||
sum(v for k, v in rel_types.items()
|
||||
if k not in ['activation', 'inhibition', 'phosphorylation', 'binding/association'])
|
||||
])
|
||||
|
||||
print(f"✓ Summary saved")
|
||||
|
||||
|
||||
def save_interactions_sif(results, output_file):
|
||||
"""Save all interactions in SIF format."""
|
||||
print(f"\nSaving interactions to {output_file}...")
|
||||
|
||||
with open(output_file, 'w') as f:
|
||||
for result in results:
|
||||
pathway_id = result['pathway_id']
|
||||
|
||||
for rel in result['relations']:
|
||||
entry1 = rel.get('entry1', '')
|
||||
entry2 = rel.get('entry2', '')
|
||||
interaction_type = rel.get('name', 'interaction')
|
||||
|
||||
# Write SIF format: source\tinteraction\ttarget
|
||||
f.write(f"{entry1}\t{interaction_type}\t{entry2}\n")
|
||||
|
||||
print(f"✓ Interactions saved")
|
||||
|
||||
|
||||
def save_detailed_pathway_info(results, output_dir):
|
||||
"""Save detailed information for each pathway."""
|
||||
print(f"\nSaving detailed pathway files to {output_dir}/pathways/...")
|
||||
|
||||
pathway_dir = os.path.join(output_dir, "pathways")
|
||||
os.makedirs(pathway_dir, exist_ok=True)
|
||||
|
||||
for result in results:
|
||||
pathway_id = result['pathway_id'].replace(":", "_")
|
||||
filename = os.path.join(pathway_dir, f"{pathway_id}_interactions.csv")
|
||||
|
||||
with open(filename, 'w', newline='') as f:
|
||||
writer = csv.writer(f)
|
||||
writer.writerow(['Source', 'Target', 'Interaction_Type', 'Link_Type'])
|
||||
|
||||
for rel in result['relations']:
|
||||
writer.writerow([
|
||||
rel.get('entry1', ''),
|
||||
rel.get('entry2', ''),
|
||||
rel.get('name', 'unknown'),
|
||||
rel.get('link', 'unknown')
|
||||
])
|
||||
|
||||
print(f"✓ Detailed files saved for {len(results)} pathways")
|
||||
|
||||
|
||||
def print_statistics(results):
|
||||
"""Print analysis statistics."""
|
||||
print(f"\n{'='*70}")
|
||||
print("PATHWAY ANALYSIS STATISTICS")
|
||||
print(f"{'='*70}")
|
||||
|
||||
# Total stats
|
||||
total_pathways = len(results)
|
||||
total_interactions = sum(r['num_relations'] for r in results)
|
||||
total_genes = sum(r['num_entries'] for r in results)
|
||||
|
||||
print(f"\nOverall:")
|
||||
print(f" Total pathways: {total_pathways}")
|
||||
print(f" Total genes/proteins: {total_genes}")
|
||||
print(f" Total interactions: {total_interactions}")
|
||||
|
||||
# Largest pathways
|
||||
print(f"\nLargest pathways (by gene count):")
|
||||
sorted_by_size = sorted(results, key=lambda x: x['num_entries'], reverse=True)
|
||||
for i, result in enumerate(sorted_by_size[:10], 1):
|
||||
print(f" {i}. {result['pathway_id']}: {result['num_entries']} genes")
|
||||
print(f" {result['pathway_name']}")
|
||||
|
||||
# Most connected pathways
|
||||
print(f"\nMost connected pathways (by interactions):")
|
||||
sorted_by_connections = sorted(results, key=lambda x: x['num_relations'], reverse=True)
|
||||
for i, result in enumerate(sorted_by_connections[:10], 1):
|
||||
print(f" {i}. {result['pathway_id']}: {result['num_relations']} interactions")
|
||||
print(f" {result['pathway_name']}")
|
||||
|
||||
# Interaction type distribution
|
||||
print(f"\nInteraction type distribution:")
|
||||
all_types = Counter()
|
||||
for result in results:
|
||||
for rel_type, count in result['relation_types'].items():
|
||||
all_types[rel_type] += count
|
||||
|
||||
for rel_type, count in all_types.most_common():
|
||||
percentage = (count / total_interactions) * 100 if total_interactions > 0 else 0
|
||||
print(f" {rel_type}: {count} ({percentage:.1f}%)")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main analysis workflow."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Analyze KEGG pathways for an organism",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
python pathway_analysis.py hsa ./human_pathways
|
||||
python pathway_analysis.py mmu ./mouse_pathways --limit 50
|
||||
|
||||
Organism codes:
|
||||
hsa = Homo sapiens (human)
|
||||
mmu = Mus musculus (mouse)
|
||||
dme = Drosophila melanogaster
|
||||
sce = Saccharomyces cerevisiae (yeast)
|
||||
eco = Escherichia coli
|
||||
"""
|
||||
)
|
||||
parser.add_argument("organism", help="KEGG organism code (e.g., hsa, mmu)")
|
||||
parser.add_argument("output_dir", help="Output directory for results")
|
||||
parser.add_argument("--limit", type=int, default=None,
|
||||
help="Limit analysis to first N pathways")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print("=" * 70)
|
||||
print("BIOSERVICES: KEGG Pathway Network Analysis")
|
||||
print("=" * 70)
|
||||
|
||||
# Create output directory
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
# Initialize KEGG
|
||||
kegg = KEGG()
|
||||
|
||||
# Get all pathways
|
||||
pathway_ids = get_all_pathways(kegg, args.organism)
|
||||
|
||||
if not pathway_ids:
|
||||
print(f"\n✗ No pathways found for {args.organism}")
|
||||
sys.exit(1)
|
||||
|
||||
# Analyze pathways
|
||||
results = analyze_all_pathways(kegg, pathway_ids, args.limit)
|
||||
|
||||
if not results:
|
||||
print("\n✗ No pathways successfully analyzed")
|
||||
sys.exit(1)
|
||||
|
||||
# Print statistics
|
||||
print_statistics(results)
|
||||
|
||||
# Save results
|
||||
summary_file = os.path.join(args.output_dir, "pathway_summary.csv")
|
||||
save_pathway_summary(results, summary_file)
|
||||
|
||||
sif_file = os.path.join(args.output_dir, "all_interactions.sif")
|
||||
save_interactions_sif(results, sif_file)
|
||||
|
||||
save_detailed_pathway_info(results, args.output_dir)
|
||||
|
||||
# Final summary
|
||||
print(f"\n{'='*70}")
|
||||
print("OUTPUT FILES")
|
||||
print(f"{'='*70}")
|
||||
print(f" Summary: {summary_file}")
|
||||
print(f" Interactions: {sif_file}")
|
||||
print(f" Detailed: {args.output_dir}/pathways/")
|
||||
print(f"{'='*70}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user