Initial commit
skills/building-mcp/scripts/connections.py (new file, 151 lines)
@@ -0,0 +1,151 @@
"""Lightweight connection handling for MCP servers."""

from abc import ABC, abstractmethod
from contextlib import AsyncExitStack
from typing import Any

from mcp import ClientSession, StdioServerParameters
from mcp.client.sse import sse_client
from mcp.client.stdio import stdio_client
from mcp.client.streamable_http import streamablehttp_client


class MCPConnection(ABC):
    """Base class for MCP server connections."""

    def __init__(self):
        self.session = None
        self._stack = None

    @abstractmethod
    def _create_context(self):
        """Create the connection context based on connection type."""

    async def __aenter__(self):
        """Initialize MCP server connection."""
        self._stack = AsyncExitStack()
        await self._stack.__aenter__()

        try:
            ctx = self._create_context()
            result = await self._stack.enter_async_context(ctx)

            # stdio/sse contexts yield (read, write); streamable HTTP yields
            # (read, write, get_session_id), so accept both shapes.
            if len(result) == 2:
                read, write = result
            elif len(result) == 3:
                read, write, _ = result
            else:
                raise ValueError(f"Unexpected context result: {result}")

            session_ctx = ClientSession(read, write)
            self.session = await self._stack.enter_async_context(session_ctx)
            await self.session.initialize()
            return self
        except BaseException:
            await self._stack.__aexit__(None, None, None)
            raise

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up MCP server connection resources."""
        if self._stack:
            await self._stack.__aexit__(exc_type, exc_val, exc_tb)
        self.session = None
        self._stack = None

    async def list_tools(self) -> list[dict[str, Any]]:
        """Retrieve available tools from the MCP server."""
        response = await self.session.list_tools()
        return [
            {
                "name": tool.name,
                "description": tool.description,
                "input_schema": tool.inputSchema,
            }
            for tool in response.tools
        ]

    async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
        """Call a tool on the MCP server with provided arguments."""
        result = await self.session.call_tool(tool_name, arguments=arguments)
        return result.content


class MCPConnectionStdio(MCPConnection):
    """MCP connection using standard input/output."""

    def __init__(self, command: str, args: list[str] | None = None, env: dict[str, str] | None = None):
        super().__init__()
        self.command = command
        self.args = args or []
        self.env = env

    def _create_context(self):
        return stdio_client(
            StdioServerParameters(command=self.command, args=self.args, env=self.env)
        )


class MCPConnectionSSE(MCPConnection):
    """MCP connection using Server-Sent Events."""

    def __init__(self, url: str, headers: dict[str, str] | None = None):
        super().__init__()
        self.url = url
        self.headers = headers or {}

    def _create_context(self):
        return sse_client(url=self.url, headers=self.headers)


class MCPConnectionHTTP(MCPConnection):
    """MCP connection using Streamable HTTP."""

    def __init__(self, url: str, headers: dict[str, str] | None = None):
        super().__init__()
        self.url = url
        self.headers = headers or {}

    def _create_context(self):
        return streamablehttp_client(url=self.url, headers=self.headers)


def create_connection(
    transport: str,
    command: str | None = None,
    args: list[str] | None = None,
    env: dict[str, str] | None = None,
    url: str | None = None,
    headers: dict[str, str] | None = None,
) -> MCPConnection:
    """Factory function to create the appropriate MCP connection.

    Args:
        transport: Connection type ("stdio", "sse", or "http")
        command: Command to run (stdio only)
        args: Command arguments (stdio only)
        env: Environment variables (stdio only)
        url: Server URL (sse and http only)
        headers: HTTP headers (sse and http only)

    Returns:
        MCPConnection instance
    """
    transport = transport.lower()

    if transport == "stdio":
        if not command:
            raise ValueError("Command is required for stdio transport")
        return MCPConnectionStdio(command=command, args=args, env=env)

    elif transport == "sse":
        if not url:
            raise ValueError("URL is required for sse transport")
        return MCPConnectionSSE(url=url, headers=headers)

    elif transport in ["http", "streamable_http", "streamable-http"]:
        if not url:
            raise ValueError("URL is required for http transport")
        return MCPConnectionHTTP(url=url, headers=headers)

    else:
        raise ValueError(f"Unsupported transport type: {transport}. Use 'stdio', 'sse', or 'http'")
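For reference, a minimal usage sketch for the connection classes above (not part of the commit; "server.py" is a hypothetical local stdio MCP server that exposes at least one tool):

# usage_sketch.py - hedged example; "server.py" is a hypothetical stdio MCP server
import asyncio

from connections import create_connection


async def main():
    # The factory picks the concrete class from the transport name; the async
    # context manager opens the transport, starts the session, and initializes it.
    connection = create_connection(transport="stdio", command="python", args=["server.py"])
    async with connection:
        tools = await connection.list_tools()
        print([tool["name"] for tool in tools])


asyncio.run(main())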
skills/building-mcp/scripts/evaluation.py (new file, 373 lines)
@@ -0,0 +1,373 @@
"""MCP Server Evaluation Harness

This script evaluates MCP servers by running test questions against them using Claude.
"""

import argparse
import asyncio
import json
import re
import sys
import time
import traceback
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any

from anthropic import Anthropic

from connections import create_connection

EVALUATION_PROMPT = """You are an AI assistant with access to tools.

When given a task, you MUST:
1. Use the available tools to complete the task
2. Provide a summary of each step in your approach, wrapped in <summary> tags
3. Provide feedback on the tools provided, wrapped in <feedback> tags
4. Provide your final response, wrapped in <response> tags

Summary Requirements:
- In your <summary> tags, you must explain:
  - The steps you took to complete the task
  - Which tools you used, in what order, and why
  - The inputs you provided to each tool
  - The outputs you received from each tool
  - A summary of how you arrived at the response

Feedback Requirements:
- In your <feedback> tags, provide constructive feedback on the tools:
  - Comment on tool names: Are they clear and descriptive?
  - Comment on input parameters: Are they well-documented? Are required vs optional parameters clear?
  - Comment on descriptions: Do they accurately describe what the tool does?
  - Comment on any errors encountered during tool usage: Did the tool fail to execute? Did the tool return too many tokens?
  - Identify specific areas for improvement and explain WHY they would help
  - Be specific and actionable in your suggestions

Response Requirements:
- Your response should be concise and directly address what was asked
- Always wrap your final response in <response> tags
- If you cannot solve the task, return <response>NOT_FOUND</response>
- For numeric responses, provide just the number
- For IDs, provide just the ID
- For names or text, provide the exact text requested
- Your response should go last"""


def parse_evaluation_file(file_path: Path) -> list[dict[str, Any]]:
    """Parse XML evaluation file with qa_pair elements."""
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        evaluations = []

        for qa_pair in root.findall(".//qa_pair"):
            question_elem = qa_pair.find("question")
            answer_elem = qa_pair.find("answer")

            if question_elem is not None and answer_elem is not None:
                evaluations.append({
                    "question": (question_elem.text or "").strip(),
                    "answer": (answer_elem.text or "").strip(),
                })

        return evaluations
    except Exception as e:
        print(f"Error parsing evaluation file {file_path}: {e}")
        return []


def extract_xml_content(text: str | None, tag: str) -> str | None:
    """Extract content from XML tags."""
    # Guard: the agent loop can return None if the final response has no text block.
    if not text:
        return None
    pattern = rf"<{tag}>(.*?)</{tag}>"
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[-1].strip() if matches else None


async def agent_loop(
    client: Anthropic,
    model: str,
    question: str,
    tools: list[dict[str, Any]],
    connection: Any,
) -> tuple[str | None, dict[str, Any]]:
    """Run the agent loop with MCP tools."""
    messages = [{"role": "user", "content": question}]

    response = await asyncio.to_thread(
        client.messages.create,
        model=model,
        max_tokens=4096,
        system=EVALUATION_PROMPT,
        messages=messages,
        tools=tools,
    )

    messages.append({"role": "assistant", "content": response.content})

    tool_metrics = {}

    while response.stop_reason == "tool_use":
        tool_use = next(block for block in response.content if block.type == "tool_use")
        tool_name = tool_use.name
        tool_input = tool_use.input

        tool_start_ts = time.time()
        try:
            tool_result = await connection.call_tool(tool_name, tool_input)
            tool_response = json.dumps(tool_result) if isinstance(tool_result, (dict, list)) else str(tool_result)
        except Exception as e:
            tool_response = f"Error executing tool {tool_name}: {str(e)}\n"
            tool_response += traceback.format_exc()
        tool_duration = time.time() - tool_start_ts

        if tool_name not in tool_metrics:
            tool_metrics[tool_name] = {"count": 0, "durations": []}
        tool_metrics[tool_name]["count"] += 1
        tool_metrics[tool_name]["durations"].append(tool_duration)

        messages.append({
            "role": "user",
            "content": [{
                "type": "tool_result",
                "tool_use_id": tool_use.id,
                "content": tool_response,
            }]
        })

        response = await asyncio.to_thread(
            client.messages.create,
            model=model,
            max_tokens=4096,
            system=EVALUATION_PROMPT,
            messages=messages,
            tools=tools,
        )
        messages.append({"role": "assistant", "content": response.content})

    response_text = next(
        (block.text for block in response.content if hasattr(block, "text")),
        None,
    )
    return response_text, tool_metrics


async def evaluate_single_task(
    client: Anthropic,
    model: str,
    qa_pair: dict[str, Any],
    tools: list[dict[str, Any]],
    connection: Any,
    task_index: int,
) -> dict[str, Any]:
    """Evaluate a single QA pair with the given tools."""
    start_time = time.time()

    print(f"Task {task_index + 1}: Running task with question: {qa_pair['question']}")
    response, tool_metrics = await agent_loop(client, model, qa_pair["question"], tools, connection)

    response_value = extract_xml_content(response, "response")
    summary = extract_xml_content(response, "summary")
    feedback = extract_xml_content(response, "feedback")

    duration_seconds = time.time() - start_time

    return {
        "question": qa_pair["question"],
        "expected": qa_pair["answer"],
        "actual": response_value,
        "score": int(response_value == qa_pair["answer"]) if response_value else 0,
        "total_duration": duration_seconds,
        "tool_calls": tool_metrics,
        "num_tool_calls": sum(len(metrics["durations"]) for metrics in tool_metrics.values()),
        "summary": summary,
        "feedback": feedback,
    }


REPORT_HEADER = """
# Evaluation Report

## Summary

- **Accuracy**: {correct}/{total} ({accuracy:.1f}%)
- **Average Task Duration**: {average_duration_s:.2f}s
- **Average Tool Calls per Task**: {average_tool_calls:.2f}
- **Total Tool Calls**: {total_tool_calls}

---
"""

TASK_TEMPLATE = """
### Task {task_num}

**Question**: {question}
**Ground Truth Answer**: `{expected_answer}`
**Actual Answer**: `{actual_answer}`
**Correct**: {correct_indicator}
**Duration**: {total_duration:.2f}s
**Tool Calls**: {tool_calls}

**Summary**
{summary}

**Feedback**
{feedback}

---
"""


async def run_evaluation(
    eval_path: Path,
    connection: Any,
    model: str = "claude-3-7-sonnet-20250219",
) -> str:
    """Run evaluation with MCP server tools."""
    print("🚀 Starting Evaluation")

    client = Anthropic()

    tools = await connection.list_tools()
    print(f"📋 Loaded {len(tools)} tools from MCP server")

    qa_pairs = parse_evaluation_file(eval_path)
    print(f"📋 Loaded {len(qa_pairs)} evaluation tasks")

    results = []
    for i, qa_pair in enumerate(qa_pairs):
        print(f"Processing task {i + 1}/{len(qa_pairs)}")
        result = await evaluate_single_task(client, model, qa_pair, tools, connection, i)
        results.append(result)

    correct = sum(r["score"] for r in results)
    accuracy = (correct / len(results)) * 100 if results else 0
    average_duration_s = sum(r["total_duration"] for r in results) / len(results) if results else 0
    average_tool_calls = sum(r["num_tool_calls"] for r in results) / len(results) if results else 0
    total_tool_calls = sum(r["num_tool_calls"] for r in results)

    report = REPORT_HEADER.format(
        correct=correct,
        total=len(results),
        accuracy=accuracy,
        average_duration_s=average_duration_s,
        average_tool_calls=average_tool_calls,
        total_tool_calls=total_tool_calls,
    )

    report += "".join([
        TASK_TEMPLATE.format(
            task_num=i + 1,
            question=qa_pair["question"],
            expected_answer=qa_pair["answer"],
            actual_answer=result["actual"] or "N/A",
            correct_indicator="✅" if result["score"] else "❌",
            total_duration=result["total_duration"],
            tool_calls=json.dumps(result["tool_calls"], indent=2),
            summary=result["summary"] or "N/A",
            feedback=result["feedback"] or "N/A",
        )
        for i, (qa_pair, result) in enumerate(zip(qa_pairs, results))
    ])

    return report


def parse_headers(header_list: list[str]) -> dict[str, str]:
    """Parse header strings in format 'Key: Value' into a dictionary."""
    headers = {}
    if not header_list:
        return headers

    for header in header_list:
        if ":" in header:
            key, value = header.split(":", 1)
            headers[key.strip()] = value.strip()
        else:
            print(f"Warning: Ignoring malformed header: {header}")
    return headers


def parse_env_vars(env_list: list[str]) -> dict[str, str]:
    """Parse environment variable strings in format 'KEY=VALUE' into a dictionary."""
    env = {}
    if not env_list:
        return env

    for env_var in env_list:
        if "=" in env_var:
            key, value = env_var.split("=", 1)
            env[key.strip()] = value.strip()
        else:
            print(f"Warning: Ignoring malformed environment variable: {env_var}")
    return env


async def main():
    parser = argparse.ArgumentParser(
        description="Evaluate MCP servers using test questions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Evaluate a local stdio MCP server
  python evaluation.py -t stdio -c python -a my_server.py eval.xml

  # Evaluate an SSE MCP server
  python evaluation.py -t sse -u https://example.com/mcp -H "Authorization: Bearer token" eval.xml

  # Evaluate an HTTP MCP server with custom model
  python evaluation.py -t http -u https://example.com/mcp -m claude-3-5-sonnet-20241022 eval.xml
""",
    )

    parser.add_argument("eval_file", type=Path, help="Path to evaluation XML file")
    parser.add_argument("-t", "--transport", choices=["stdio", "sse", "http"], default="stdio", help="Transport type (default: stdio)")
    parser.add_argument("-m", "--model", default="claude-3-7-sonnet-20250219", help="Claude model to use (default: claude-3-7-sonnet-20250219)")

    stdio_group = parser.add_argument_group("stdio options")
    stdio_group.add_argument("-c", "--command", help="Command to run MCP server (stdio only)")
    stdio_group.add_argument("-a", "--args", nargs="+", help="Arguments for the command (stdio only)")
    stdio_group.add_argument("-e", "--env", nargs="+", help="Environment variables in KEY=VALUE format (stdio only)")

    remote_group = parser.add_argument_group("sse/http options")
    remote_group.add_argument("-u", "--url", help="MCP server URL (sse/http only)")
    remote_group.add_argument("-H", "--header", nargs="+", dest="headers", help="HTTP headers in 'Key: Value' format (sse/http only)")

    parser.add_argument("-o", "--output", type=Path, help="Output file for evaluation report (default: stdout)")

    args = parser.parse_args()

    if not args.eval_file.exists():
        print(f"Error: Evaluation file not found: {args.eval_file}")
        sys.exit(1)

    headers = parse_headers(args.headers) if args.headers else None
    env_vars = parse_env_vars(args.env) if args.env else None

    try:
        connection = create_connection(
            transport=args.transport,
            command=args.command,
            args=args.args,
            env=env_vars,
            url=args.url,
            headers=headers,
        )
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)

    print(f"🔗 Connecting to MCP server via {args.transport}...")

    async with connection:
        print("✅ Connected successfully")
        report = await run_evaluation(args.eval_file, connection, args.model)

        if args.output:
            args.output.write_text(report)
            print(f"\n✅ Report saved to {args.output}")
        else:
            print("\n" + report)


if __name__ == "__main__":
    asyncio.run(main())
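Besides the CLI shown in the epilog, run_evaluation can be driven programmatically. A minimal sketch (not part of the commit; "eval.xml" and "server.py" are hypothetical, and ANTHROPIC_API_KEY is assumed to be set in the environment):

# programmatic_eval.py - hedged sketch combining connections.py and evaluation.py
import asyncio
from pathlib import Path

from connections import create_connection
from evaluation import run_evaluation


async def main():
    connection = create_connection(transport="stdio", command="python", args=["server.py"])
    async with connection:
        # run_evaluation lists the server's tools, runs every qa_pair through
        # the agent loop, and returns a markdown report string.
        report = await run_evaluation(Path("eval.xml"), connection)
    print(report)


asyncio.run(main())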
skills/building-mcp/scripts/example_evaluation.xml (new file, 22 lines)
@@ -0,0 +1,22 @@
<evaluation>
  <qa_pair>
    <question>Calculate the compound interest on $10,000 invested at 5% annual interest rate, compounded monthly for 3 years. What is the final amount in dollars (rounded to 2 decimal places)?</question>
    <answer>11614.72</answer>
  </qa_pair>
  <qa_pair>
    <question>A projectile is launched at a 45-degree angle with an initial velocity of 50 m/s. Calculate the total distance (in meters) it has traveled from the launch point after 2 seconds, assuming g=9.8 m/s². Round to 2 decimal places.</question>
    <answer>87.25</answer>
  </qa_pair>
  <qa_pair>
    <question>A sphere has a volume of 500 cubic meters. Calculate its surface area in square meters. Round to 2 decimal places.</question>
    <answer>304.65</answer>
  </qa_pair>
  <qa_pair>
    <question>Calculate the population standard deviation of this dataset: [12, 15, 18, 22, 25, 30, 35]. Round to 2 decimal places.</question>
    <answer>7.61</answer>
  </qa_pair>
  <qa_pair>
    <question>Calculate the pH of a solution with a hydrogen ion concentration of 3.5 × 10^-5 M. Round to 2 decimal places.</question>
    <answer>4.46</answer>
  </qa_pair>
</evaluation>
skills/building-mcp/scripts/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
anthropic>=0.39.0
mcp>=1.1.0