Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/tooluniverse/scripts/example_workflow.py
+++ b/skills/tooluniverse/scripts/example_workflow.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+Example workflow demonstrating tool composition in ToolUniverse.
+
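+This script shows a complete drug discovery workflow:
+1. Find disease-associated targets
+2. Retrieve protein sequences
+3. Get structure predictions
+4. Identify binding sites
+5. Screen compound libraries
+6. Calculate drug-likeness properties
+
+It also includes a genomics workflow: dataset download, differential
+expression analysis, and pathway enrichment.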
+"""
+
+from tooluniverse import ToolUniverse
+
+
+def drug_discovery_workflow(disease_efo_id: str, max_targets: int = 3):
+    """
+    Execute a drug discovery workflow for a given disease.
+
+    Args:
+        disease_efo_id: EFO ID for the disease (e.g., "EFO_0000537" for hypertension)
+        max_targets: Maximum number of targets to process
+    """
+    tu = ToolUniverse()
+    tu.load_tools()
+
+    print("=" * 70)
+    print("DRUG DISCOVERY WORKFLOW")
+    print("=" * 70)
+
+    # Step 1: Find disease-associated targets
+    print(f"\nStep 1: Finding targets for disease {disease_efo_id}...")
+    targets = tu.run({
+        "name": "OpenTargets_get_associated_targets_by_disease_efoId",
+        "arguments": {"efoId": disease_efo_id}
+    })
+    print(f"✓ Found {len(targets)} disease-associated targets")
+
+    # Process top targets
+    top_targets = targets[:max_targets]
+    print(f"  Processing top {len(top_targets)} targets:")
+    for idx, target in enumerate(top_targets, 1):
+        print(f"    {idx}. {target.get('target_name', 'Unknown')} ({target.get('uniprot_id', 'N/A')})")
+
+    # Step 2: Get protein sequences
+    print(f"\nStep 2: Retrieving protein sequences...")
+    sequences = []
+    for target in top_targets:
+        try:
+            seq = tu.run({
+                "name": "UniProt_get_sequence",
+                "arguments": {"uniprot_id": target['uniprot_id']}
+            })
+            sequences.append({
+                "target": target,
+                "sequence": seq
+            })
+            print(f"  ✓ Retrieved sequence for {target.get('target_name', 'Unknown')}")
+        except Exception as e:
+            print(f"  ✗ Failed to get sequence: {e}")
+
+    # Step 3: Predict protein structures
+    print(f"\nStep 3: Predicting protein structures...")
+    structures = []
+    for seq_data in sequences:
+        try:
+            structure = tu.run({
+                "name": "AlphaFold_get_structure",
+                "arguments": {"uniprot_id": seq_data['target']['uniprot_id']}
+            })
+            structures.append({
+                "target": seq_data['target'],
+                "structure": structure
+            })
+            print(f"  ✓ Predicted structure for {seq_data['target'].get('target_name', 'Unknown')}")
+        except Exception as e:
+            print(f"  ✗ Failed to predict structure: {e}")
+
+    # Step 4: Find binding sites
+    print(f"\nStep 4: Identifying binding sites...")
+    binding_sites = []
+    for struct_data in structures:
+        try:
+            sites = tu.run({
+                "name": "Fpocket_find_binding_sites",
+                "arguments": {"structure": struct_data['structure']}
+            })
+            binding_sites.append({
+                "target": struct_data['target'],
+                "sites": sites
+            })
+            print(f"  ✓ Found {len(sites)} binding sites for {struct_data['target'].get('target_name', 'Unknown')}")
+        except Exception as e:
+            print(f"  ✗ Failed to find binding sites: {e}")
+
+    # Step 5: Virtual screening (simplified)
+    print(f"\nStep 5: Screening compound libraries...")
+    all_hits = []
+    for site_data in binding_sites:
+        for site in site_data['sites'][:1]:  # Top site only
+            try:
+                compounds = tu.run({
+                    "name": "ZINC_virtual_screening",
+                    "arguments": {
+                        "binding_site": site,
+                        "library": "lead-like",
+                        "top_n": 10
+                    }
+                })
+                all_hits.extend(compounds)
+                print(f"  ✓ Found {len(compounds)} hit compounds for {site_data['target'].get('target_name', 'Unknown')}")
+            except Exception as e:
+                print(f"  ✗ Screening failed: {e}")
+
+    # Step 6: Calculate drug-likeness
+    print(f"\nStep 6: Evaluating drug-likeness...")
+    drug_candidates = []
+    for compound in all_hits:
+        try:
+            properties = tu.run({
+                "name": "RDKit_calculate_drug_properties",
+                "arguments": {"smiles": compound['smiles']}
+            })
+
+            if properties.get('lipinski_pass', False):
+                drug_candidates.append({
+                    "compound": compound,
+                    "properties": properties
+                })
+        except Exception as e:
+            print(f"  ✗ Property calculation failed: {e}")
+
+    print(f"\n  ✓ Identified {len(drug_candidates)} drug candidates passing Lipinski's Rule of Five")
+
+    # Summary
+    print("\n" + "=" * 70)
+    print("WORKFLOW SUMMARY")
+    print("=" * 70)
+    print(f"Disease targets processed: {len(top_targets)}")
+    print(f"Protein structures predicted: {len(structures)}")
+    print(f"Binding sites identified: {sum(len(s['sites']) for s in binding_sites)}")
+    print(f"Compounds screened: {len(all_hits)}")
+    print(f"Drug candidates identified: {len(drug_candidates)}")
+    print("=" * 70)
+
+    return drug_candidates
+
+
+def genomics_workflow(geo_id: str):
+    """
+    Execute a genomics analysis workflow.
+
+    Args:
+        geo_id: GEO dataset ID (e.g., "GSE12345")
+    """
+    tu = ToolUniverse()
+    tu.load_tools()
+
+    print("=" * 70)
+    print("GENOMICS ANALYSIS WORKFLOW")
+    print("=" * 70)
+
+    # Step 1: Download gene expression data
+    print(f"\nStep 1: Downloading dataset {geo_id}...")
+    try:
+        expression_data = tu.run({
+            "name": "GEO_download_dataset",
+            "arguments": {"geo_id": geo_id}
+        })
+        print(f"  ✓ Downloaded expression data")
+    except Exception as e:
+        print(f"  ✗ Failed: {e}")
+        return
+
+    # Step 2: Differential expression analysis
+    print(f"\nStep 2: Performing differential expression analysis...")
+    try:
+        de_genes = tu.run({
+            "name": "DESeq2_differential_expression",
+            "arguments": {
+                "data": expression_data,
+                "condition1": "control",
+                "condition2": "treated"
+            }
+        })
+        print(f"  ✓ Found {len(de_genes.get('significant_genes', []))} differentially expressed genes")
+    except Exception as e:
+        print(f"  ✗ Failed: {e}")
+        return
+
+    # Step 3: Pathway enrichment
+    print(f"\nStep 3: Running pathway enrichment analysis...")
+    try:
+        pathways = tu.run({
+            "name": "KEGG_pathway_enrichment",
+            "arguments": {
+                "gene_list": de_genes['significant_genes'],
+                "organism": "hsa"
+            }
+        })
+        print(f"  ✓ Found {len(pathways)} enriched pathways")
+        if pathways:
+            print(f"    Top pathway: {pathways[0].get('pathway_name', 'Unknown')}")
+    except Exception as e:
+        print(f"  ✗ Failed: {e}")
+
+    print("\n" + "=" * 70)
+
+
+if __name__ == "__main__":
+    # Example 1: Drug discovery workflow for hypertension
+    print("EXAMPLE 1: Drug Discovery for Hypertension")
+    candidates = drug_discovery_workflow("EFO_0000537", max_targets=2)
+
+    print("\n\n")
+
+    # Example 2: Genomics workflow
+    print("EXAMPLE 2: Genomics Analysis")
+    genomics_workflow("GSE12345")