Initial commit

2025-11-30 08:30:14 +08:00
commit 1dd5bee3b4
335 changed files with 147360 additions and 0 deletions
--- a/skills/research-lookup/research_lookup.py
+++ b/skills/research-lookup/research_lookup.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+"""
+Research Information Lookup Tool
+Uses Perplexity's Sonar Pro or Sonar Reasoning Pro models through OpenRouter.
+Automatically selects the appropriate model based on query complexity.
+"""
+
+import os
+import json
+import requests
+import time
+from datetime import datetime
+from typing import Dict, List, Optional, Any
+from urllib.parse import quote
+
+
+class ResearchLookup:
+    """Research information lookup using Perplexity Sonar models via OpenRouter.
+
+    Each query is sent to one of two models: ``perplexity/sonar-pro`` for
+    straightforward lookups or ``perplexity/sonar-reasoning-pro`` for
+    analytical questions. The model is chosen by a keyword/structure
+    heuristic unless ``force_model`` was passed to the constructor.
+    """
+
+    # Words and phrases that mark a query as analytical. One hit is worth
+    # 3 points, which on its own reaches the reasoning threshold.
+    REASONING_KEYWORDS = [
+        'compare', 'contrast', 'analyze', 'analysis', 'synthesis', 'meta-analysis',
+        'systematic review', 'evaluate', 'critique', 'trade-off', 'tradeoff',
+        'relationship', 'versus', 'vs', 'vs.', 'compared to',
+        'mechanism', 'why', 'how does', 'how do', 'explain', 'theoretical framework',
+        'implications', 'debate', 'controversy', 'conflicting', 'paradox',
+        'reconcile', 'integrate', 'multifaceted', 'complex interaction',
+        'causal relationship', 'underlying mechanism', 'interpret', 'reasoning',
+        'pros and cons', 'advantages and disadvantages', 'critical analysis',
+        'differences between', 'similarities', 'trade offs'
+    ]
+
+    def __init__(self, force_model: Optional[str] = None):
+        """
+        Initialize the research lookup tool.
+
+        Args:
+            force_model: Optional model override ('pro' or 'reasoning').
+                        If None, automatically selects based on query complexity.
+                        Any other non-empty value is treated like 'pro'.
+
+        Raises:
+            ValueError: If the OPENROUTER_API_KEY environment variable is
+                unset or empty.
+        """
+        self.api_key = os.getenv("OPENROUTER_API_KEY")
+        if not self.api_key:
+            raise ValueError("OPENROUTER_API_KEY environment variable not set")
+
+        self.base_url = "https://openrouter.ai/api/v1"
+        self.model_pro = "perplexity/sonar-pro"  # Fast, efficient lookup
+        self.model_reasoning = "perplexity/sonar-reasoning-pro"  # Deep analysis
+        self.force_model = force_model
+        self.headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+            "HTTP-Referer": "https://scientific-writer.local",  # Replace with your domain
+            "X-Title": "Scientific Writer Research Tool"
+        }
+
+    def _assess_query_complexity(self, query: str) -> str:
+        """
+        Assess query complexity to determine which model to use.
+
+        The score adds 3 for every distinct reasoning keyword found, 2 for
+        every '?', 1.5 for every distinct clause connective found, and 1 if
+        the query is longer than 150 characters. A score of 3 or more selects
+        the reasoning model.
+
+        Keywords must begin at a word boundary, so 'vs' no longer fires inside
+        words such as "TVs". Trailing characters are still allowed so that
+        inflected forms ("compares", "mechanisms") keep matching.
+
+        Args:
+            query: The raw user query.
+
+        Returns:
+            'reasoning' for complex analytical queries, 'pro' for straightforward lookups
+        """
+        import re  # imported locally, as in _extract_citations
+
+        query_lower = query.lower()
+
+        # Count distinct reasoning keywords that start at a word boundary
+        reasoning_count = sum(
+            1
+            for keyword in self.REASONING_KEYWORDS
+            if re.search(r'(?<!\w)' + re.escape(keyword), query_lower)
+        )
+
+        # Count questions (multiple questions suggest complexity)
+        question_count = query.count('?')
+
+        # Check for multiple clauses (complexity indicators); the surrounding
+        # spaces already restrict these to whole words
+        clause_indicators = [' and ', ' or ', ' but ', ' however ', ' whereas ', ' although ']
+        clause_count = sum(1 for indicator in clause_indicators if indicator in query_lower)
+
+        # Complexity score
+        complexity_score = (
+            reasoning_count * 3 +      # Reasoning keywords heavily weighted
+            question_count * 2 +        # Multiple questions indicate complexity
+            clause_count * 1.5 +        # Multiple clauses suggest nuance
+            (1 if len(query) > 150 else 0)  # Long queries often more complex
+        )
+
+        # Threshold of 3 lets a single reasoning keyword select the reasoning model
+        return 'reasoning' if complexity_score >= 3 else 'pro'
+
+    def _select_model(self, query: str) -> str:
+        """Select the appropriate model based on query complexity or force override.
+
+        Args:
+            query: The raw user query; only inspected when no override is set.
+
+        Returns:
+            The OpenRouter model identifier to send the query to.
+        """
+        if self.force_model:
+            # Only the exact value 'reasoning' selects the reasoning model
+            return self.model_reasoning if self.force_model == 'reasoning' else self.model_pro
+
+        complexity_level = self._assess_query_complexity(query)
+        return self.model_reasoning if complexity_level == 'reasoning' else self.model_pro
+
+    def _make_request(self, messages: List[Dict[str, str]], model: str, **kwargs) -> Dict[str, Any]:
+        """Make a request to the OpenRouter API.
+
+        Args:
+            messages: Chat messages to send.
+            model: OpenRouter model identifier.
+            **kwargs: Extra payload fields. They are merged last, so they can
+                override the defaults set here (e.g. ``max_tokens``).
+
+        Returns:
+            The decoded JSON body of the response.
+
+        Raises:
+            Exception: If the request fails with any
+                ``requests.exceptions.RequestException``, including the HTTP
+                error raised by ``raise_for_status``. The original error is
+                attached as ``__cause__``.
+        """
+        data = {
+            "model": model,
+            "messages": messages,
+            "max_tokens": 4000,
+            "temperature": 0.1,  # Low temperature for factual research
+            "provider": {
+                "order": ["Perplexity"],
+                "allow_fallbacks": False
+            },
+            **kwargs
+        }
+
+        try:
+            response = requests.post(
+                f"{self.base_url}/chat/completions",
+                headers=self.headers,
+                json=data,
+                timeout=90  # Increased timeout for reasoning model
+            )
+            response.raise_for_status()
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            # Chain the cause so the underlying network/HTTP error stays visible
+            raise Exception(f"API request failed: {str(e)}") from e
+
+    def _format_research_prompt(self, query: str) -> str:
+        """Format the query for optimal research results.
+
+        Args:
+            query: The raw user query, embedded verbatim in the prompt.
+
+        Returns:
+            The user-message text sent to the model.
+        """
+        return f"""You are an expert research assistant. Please provide comprehensive, accurate research information for the following query: "{query}"
+
+IMPORTANT INSTRUCTIONS:
+1. Focus on ACADEMIC and SCIENTIFIC sources (peer-reviewed papers, reputable journals, institutional research)
+2. Include RECENT information (prioritize 2020-2024 publications)
+3. Provide COMPLETE citations with authors, title, journal/conference, year, and DOI when available
+4. Structure your response with clear sections and proper attribution
+5. Be comprehensive but concise - aim for 800-1200 words
+6. Include key findings, methodologies, and implications when relevant
+7. Note any controversies, limitations, or conflicting evidence
+
+RESPONSE FORMAT:
+- Start with a brief summary (2-3 sentences)
+- Present key findings and studies in organized sections
+- End with future directions or research gaps if applicable
+- Include 5-8 high-quality citations at the end
+
+Remember: This is for academic research purposes. Prioritize accuracy, completeness, and proper attribution."""
+
+    def lookup(self, query: str) -> Dict[str, Any]:
+        """Perform a research lookup for the given query.
+
+        This method does not raise for API or format problems; failures are
+        reported in the returned dict.
+
+        Args:
+            query: The research question to look up.
+
+        Returns:
+            On success, a dict with ``success`` True plus ``query``,
+            ``response``, ``citations``, ``timestamp``, ``model``,
+            ``model_type`` and ``usage``. On failure, a dict with ``success``
+            False plus ``query``, ``error``, ``timestamp``, ``model`` and
+            ``model_type``.
+        """
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        # Select the appropriate model based on query complexity
+        selected_model = self._select_model(query)
+        model_type = "reasoning" if "reasoning" in selected_model else "standard"
+
+        print(f"[Research] Using {selected_model} (detected complexity: {model_type})")
+
+        # Format the research prompt
+        research_prompt = self._format_research_prompt(query)
+
+        # Prepare messages for the API with system message for academic mode
+        messages = [
+            {
+                "role": "system",
+                "content": "You are an academic research assistant. Focus exclusively on scholarly sources: peer-reviewed journals, academic papers, research institutions, and reputable scientific publications. Prioritize recent academic literature (2020-2024) and provide complete citations with DOIs. Use academic/scholarly search mode."
+            },
+            {"role": "user", "content": research_prompt}
+        ]
+
+        try:
+            # Make the API request with selected model
+            response = self._make_request(messages, model=selected_model)
+
+            # Validate the response shape step by step
+            if "choices" not in response or len(response["choices"]) == 0:
+                raise Exception("No response choices received from API")
+
+            choice = response["choices"][0]
+            if "message" not in choice or "content" not in choice["message"]:
+                raise Exception("Invalid response format from API")
+
+            content = choice["message"]["content"]
+            # A null/non-text content would otherwise fail inside the regex
+            # extraction with an unhelpful type error
+            if not isinstance(content, str):
+                raise Exception("Invalid response format from API")
+
+            # Extract citations if present (basic regex extraction)
+            citations = self._extract_citations(content)
+
+            return {
+                "success": True,
+                "query": query,
+                "response": content,
+                "citations": citations,
+                "timestamp": timestamp,
+                "model": selected_model,
+                "model_type": model_type,
+                "usage": response.get("usage", {})
+            }
+
+        except Exception as e:
+            # Deliberately broad: callers receive every failure as data
+            return {
+                "success": False,
+                "query": query,
+                "error": str(e),
+                "timestamp": timestamp,
+                "model": selected_model,
+                "model_type": model_type
+            }
+
+    def _extract_citations(self, text: str) -> List[Dict[str, str]]:
+        """Extract potential citations from the response text.
+
+        This is a simple regex-based extractor; matches are not deduplicated.
+
+        Args:
+            text: The model's response text.
+
+        Returns:
+            First one ``{"authors", "year", "type": "extracted"}`` entry per
+            "Author (YYYY)" / "Author et al. (YYYY)" match, then one
+            ``{"doi", "type": "doi"}`` entry per "doi:" match.
+        """
+        # Look for common citation patterns
+        import re
+
+        citations = []
+
+        # Pattern for author et al. year
+        author_pattern = r'([A-Z][a-z]+(?:\s+[A-Z]\.)*(?:\s+et\s+al\.)?)\s*\((\d{4})\)'
+        for author, year in re.findall(author_pattern, text):
+            citations.append({
+                "authors": author,
+                "year": year,
+                "type": "extracted"
+            })
+
+        # Look for DOI patterns
+        doi_pattern = r'doi:\s*([^\s\)\]]+)'
+        for raw_doi in re.findall(doi_pattern, text, re.IGNORECASE):
+            # The pattern runs to the next whitespace/bracket, so it picks up
+            # sentence punctuation that follows the DOI; drop it
+            doi = raw_doi.strip().rstrip('.,;:')
+            if doi:
+                citations.append({
+                    "doi": doi,
+                    "type": "doi"
+                })
+
+        return citations
+
+    def batch_lookup(self, queries: List[str], delay: float = 1.0) -> List[Dict[str, Any]]:
+        """Perform multiple research lookups with optional delay between requests.
+
+        Args:
+            queries: Queries to run, in order.
+            delay: Seconds to sleep before every query except the first.
+                Values <= 0 disable the pause.
+
+        Returns:
+            One ``lookup`` result dict per query, in input order. Failed
+            lookups appear as entries with ``success`` False.
+        """
+        results = []
+
+        for i, query in enumerate(queries):
+            if i > 0 and delay > 0:
+                time.sleep(delay)  # Rate limiting
+
+            result = self.lookup(query)
+            results.append(result)
+
+            # Print progress
+            print(f"[Research] Completed query {i+1}/{len(queries)}: {query[:50]}...")
+
+        return results
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Get information about available models from OpenRouter.
+
+        Returns:
+            The decoded JSON body of the ``/models`` endpoint, or
+            ``{"error": <message>}`` if the request or decoding fails.
+        """
+        try:
+            response = requests.get(
+                f"{self.base_url}/models",
+                headers=self.headers,
+                timeout=30
+            )
+            response.raise_for_status()
+            return response.json()
+        except Exception as e:
+            # Best-effort helper: report the failure as data instead of raising
+            return {"error": str(e)}
+
+
+def main():
+    """Command-line interface for testing the research lookup tool.
+
+    Returns:
+        Process exit status. 1 when the API key is missing, no query was
+        supplied, the model listing failed, or an unexpected error occurred;
+        0 otherwise. Individual lookups that report an error are printed but
+        do not change the exit status.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Research Information Lookup Tool")
+    parser.add_argument("query", nargs="?", help="Research query to look up")
+    parser.add_argument("--model-info", action="store_true", help="Show available models")
+    parser.add_argument("--batch", nargs="+", help="Run multiple queries")
+    parser.add_argument("--force-model", choices=['pro', 'reasoning'],
+                       help="Force use of specific model (pro=fast lookup, reasoning=deep analysis)")
+
+    args = parser.parse_args()
+
+    # Check for API key up front so the user gets setup instructions
+    if not os.getenv("OPENROUTER_API_KEY"):
+        print("Error: OPENROUTER_API_KEY environment variable not set")
+        print("Please set it in your .env file or export it:")
+        print("  export OPENROUTER_API_KEY='your_openrouter_api_key'")
+        return 1
+
+    try:
+        research = ResearchLookup(force_model=args.force_model)
+
+        if args.model_info:
+            print("Available models from OpenRouter:")
+            models = research.get_model_info()
+            # get_model_info reports failures as {"error": ...}; surface them
+            # instead of printing an empty list and exiting with status 0
+            if "data" not in models:
+                print(f"Error: {models.get('error', 'unexpected response from models endpoint')}")
+                return 1
+            for model in models["data"]:
+                model_id = model.get("id", "")
+                if "perplexity" in model_id.lower():
+                    print(f"  - {model_id}: {model.get('name', 'N/A')}")
+            return 0
+
+        if not args.query and not args.batch:
+            parser.print_help()
+            return 1
+
+        # --batch takes precedence over the positional query
+        if args.batch:
+            print(f"Running batch research for {len(args.batch)} queries...")
+            results = research.batch_lookup(args.batch)
+        else:
+            print(f"Researching: {args.query}")
+            results = [research.lookup(args.query)]
+
+        # Display results
+        for i, result in enumerate(results):
+            if result["success"]:
+                print(f"\n{'='*80}")
+                print(f"Query {i+1}: {result['query']}")
+                print(f"Timestamp: {result['timestamp']}")
+                print(f"Model: {result['model']} ({result.get('model_type', 'unknown')})")
+                print(f"{'='*80}")
+                print(result["response"])
+
+                if result["citations"]:
+                    print(f"\nExtracted Citations ({len(result['citations'])}):")
+                    for j, citation in enumerate(result["citations"]):
+                        print(f"  {j+1}. {citation}")
+
+                if result["usage"]:
+                    print(f"\nUsage: {result['usage']}")
+            else:
+                print(f"\nError in query {i+1}: {result['error']}")
+
+        return 0
+
+    except Exception as e:
+        # Top-level boundary: report and turn into a non-zero exit status
+        print(f"Error: {str(e)}")
+        return 1
+
+
+if __name__ == "__main__":
+    # Use SystemExit directly: the bare exit() helper is injected by the site
+    # module for interactive use and is missing under `python -S`.
+    raise SystemExit(main())