#!/bin/bash
# Engineer Expertise Extractor
# Research and document an engineer's coding style, patterns, and best practices
set -e
# Colors
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
MAGENTA='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'
echo -e "${BLUE}╔══════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ Engineer Expertise Extractor ║${NC}"
echo -e "${BLUE}╚══════════════════════════════════════════════════╝${NC}"
echo ""
echo -e "${YELLOW}Extract coding style, patterns, and best practices from GitHub${NC}"
echo ""
# Check for gh CLI
if ! command -v gh &> /dev/null; then
    echo -e "${RED}Error: GitHub CLI (gh) is not installed${NC}"
    echo "Install from: https://cli.github.com/"
    exit 1
fi
# Check authentication
if ! gh auth status &> /dev/null; then
    echo -e "${RED}Error: Not authenticated with GitHub CLI${NC}"
    echo "Run: gh auth login"
    exit 1
fi
# Helper function
prompt_input() {
    local prompt_text="$1"
    local var_name="$2"
    local required="$3"
    while true; do
        echo -e "${CYAN}${prompt_text}${NC}"
        read -r input
        if [ -n "$input" ]; then
            # printf -v assigns to the named variable without the quoting pitfalls of eval
            printf -v "$var_name" '%s' "$input"
            break
        elif [ "$required" != "true" ]; then
            printf -v "$var_name" '%s' ""
            break
        else
            echo -e "${RED}This field is required.${NC}"
        fi
    done
}
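# Usage sketch (illustrative values): prompt_input "GitHub username:" ENGINEER_USERNAME true
# prints the prompt, reads one line into $ENGINEER_USERNAME, and re-prompts while the
# answer is empty whenever the third argument is "true"; with "false", an empty answer
# leaves the variable empty so a default can be applied later.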
# Step 1: Engineer Info
echo -e "${MAGENTA}━━━ Step 1: Engineer Information ━━━${NC}"
echo ""
prompt_input "GitHub username to research:" ENGINEER_USERNAME true
# Verify user exists
echo -e "${BLUE}Verifying GitHub user...${NC}"
if ! gh api "users/$ENGINEER_USERNAME" > /dev/null 2>&1; then
    echo -e "${RED}Error: GitHub user '$ENGINEER_USERNAME' not found${NC}"
    exit 1
fi
# Get user info (gh's built-in --jq is more robust than grepping the raw JSON)
USER_NAME=$(gh api "users/$ENGINEER_USERNAME" --jq '.name // ""')
USER_BIO=$(gh api "users/$ENGINEER_USERNAME" --jq '.bio // ""')
echo -e "${GREEN}✓ Found: $USER_NAME${NC}"
[ -n "$USER_BIO" ] && echo -e " Bio: $USER_BIO"
echo ""
# Step 2: Scope
echo -e "${MAGENTA}━━━ Step 2: Research Scope ━━━${NC}"
echo ""
prompt_input "Organization to focus on (or 'all'):" ORG_FILTER false
ORG_FILTER=${ORG_FILTER:-all}
echo ""
echo "How many PRs to analyze?"
echo "1) Recent 20 (quick scan)"
echo "2) Recent 50 (good coverage)"
echo "3) Recent 100 (comprehensive)"
echo "4) Custom"
echo ""
prompt_input "Select (1-4):" PR_COUNT_NUM true
case $PR_COUNT_NUM in
    1) PR_LIMIT=20 ;;
    2) PR_LIMIT=50 ;;
    3) PR_LIMIT=100 ;;
    4) prompt_input "Enter custom number:" PR_LIMIT true ;;
    *) PR_LIMIT=50 ;;
esac
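# Guard sketch (not strictly required): fall back to 50 if the custom value is not a
# positive integer, so the later "--limit $PR_LIMIT" and numeric comparisons stay valid.
case "$PR_LIMIT" in
    ''|*[!0-9]*)
        echo -e "${YELLOW}'$PR_LIMIT' is not a number; defaulting to 50${NC}"
        PR_LIMIT=50
        ;;
esac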
# Step 3: Focus Areas
echo ""
echo -e "${MAGENTA}━━━ Step 3: Focus Areas ━━━${NC}"
echo ""
echo "What to extract? (y/n for each)"
echo ""
prompt_input "Extract coding style? (y/n):" EXTRACT_STYLE false
EXTRACT_STYLE=${EXTRACT_STYLE:-y}
prompt_input "Extract common patterns? (y/n):" EXTRACT_PATTERNS false
EXTRACT_PATTERNS=${EXTRACT_PATTERNS:-y}
prompt_input "Extract best practices? (y/n):" EXTRACT_PRACTICES false
EXTRACT_PRACTICES=${EXTRACT_PRACTICES:-y}
prompt_input "Extract code review style? (y/n):" EXTRACT_REVIEWS false
EXTRACT_REVIEWS=${EXTRACT_REVIEWS:-y}
prompt_input "Extract architectural decisions? (y/n):" EXTRACT_ARCH false
EXTRACT_ARCH=${EXTRACT_ARCH:-y}
# Create output directory
OUTPUT_DIR="engineer_profiles/${ENGINEER_USERNAME}"
mkdir -p "$OUTPUT_DIR"/{coding_style,patterns,best_practices,architecture,code_review,examples/notable_prs,raw_data}
echo ""
echo -e "${BLUE}━━━ Starting Extraction ━━━${NC}"
echo ""
# Step 4: Fetch Pull Requests
echo -e "${YELLOW}Fetching pull requests...${NC}"
# Build search query
SEARCH_QUERY="is:pr author:$ENGINEER_USERNAME"
if [ "$ORG_FILTER" != "all" ]; then
SEARCH_QUERY="$SEARCH_QUERY org:$ORG_FILTER"
fi
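# For example (hypothetical values), the query ends up as either
#   is:pr author:octocat            (all repositories)
#   is:pr author:octocat org:acme   (scoped to one organization)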
# Fetch PRs: keep the raw JSON, plus a tab-separated index (repo, number, title)
# that is easy to iterate over without a JSON parser
echo "Query: $SEARCH_QUERY (limit: $PR_LIMIT)"
gh search prs "$SEARCH_QUERY" --limit "$PR_LIMIT" --json number,title,repository,createdAt,state,url > "$OUTPUT_DIR/raw_data/prs.json"
gh search prs "$SEARCH_QUERY" --limit "$PR_LIMIT" --json number,title,repository \
    --jq '.[] | [.repository.nameWithOwner, (.number|tostring), .title] | @tsv' > "$OUTPUT_DIR/raw_data/pr_index.tsv"
PR_COUNT=$(grep -c . "$OUTPUT_DIR/raw_data/pr_index.tsv" || true)
echo -e "${GREEN}✓ Found $PR_COUNT pull requests${NC}"
echo ""
if [ "$PR_COUNT" -eq 0 ]; then
    echo -e "${RED}No PRs found. Exiting.${NC}"
    exit 1
fi
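# Optional sketch: surface how much REST quota is left before the per-PR loop below;
# the /rate_limit endpoint is standard and does not count against the quota itself.
REMAINING=$(gh api rate_limit --jq '.resources.core.remaining' 2>/dev/null || echo "unknown")
echo "GitHub API requests remaining: $REMAINING"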
# Step 5: Analyze PRs
echo -e "${YELLOW}Analyzing pull requests...${NC}"
echo "This may take a while..."
echo ""
# Create PR analysis file
PR_ANALYSIS_FILE="$OUTPUT_DIR/raw_data/pr_analysis.md"
echo "# Pull Request Analysis: $ENGINEER_USERNAME" > "$PR_ANALYSIS_FILE"
echo "" >> "$PR_ANALYSIS_FILE"
echo "Total PRs analyzed: $PR_COUNT" >> "$PR_ANALYSIS_FILE"
echo "Generated: $(date)" >> "$PR_ANALYSIS_FILE"
echo "" >> "$PR_ANALYSIS_FILE"
# Analyze the top N PRs in detail (kept small to avoid rate limiting)
DETAILED_ANALYSIS_LIMIT=20
if [ "$PR_COUNT" -lt "$DETAILED_ANALYSIS_LIMIT" ]; then
    DETAILED_ANALYSIS_LIMIT=$PR_COUNT
fi
echo "Performing detailed analysis on $DETAILED_ANALYSIS_LIMIT PRs..."
# Iterate over the repo/number/title index built at fetch time
head -n "$DETAILED_ANALYSIS_LIMIT" "$OUTPUT_DIR/raw_data/pr_index.tsv" | while IFS=$'\t' read -r REPO PR_NUMBER TITLE; do
    if [ -n "$PR_NUMBER" ] && [ -n "$REPO" ]; then
        echo "  Analyzing PR #$PR_NUMBER in $REPO..."
        PR_JSON="$OUTPUT_DIR/raw_data/pr_${PR_NUMBER}.json"
        # Fetch PR details
        gh pr view "$PR_NUMBER" --repo "$REPO" --json title,body,files,comments > "$PR_JSON" 2>/dev/null || continue
        # Extract to analysis file
        echo "## PR #$PR_NUMBER: $TITLE" >> "$PR_ANALYSIS_FILE"
        echo "**Repository:** $REPO" >> "$PR_ANALYSIS_FILE"
        echo "**URL:** https://github.com/$REPO/pull/$PR_NUMBER" >> "$PR_ANALYSIS_FILE"
        echo "" >> "$PR_ANALYSIS_FILE"
        # Count changed files (works on the compact JSON gh emits)
        FILES_CHANGED=$(grep -o '"path":' "$PR_JSON" | wc -l | tr -d ' ')
        echo "**Files Changed:** $FILES_CHANGED" >> "$PR_ANALYSIS_FILE"
        # List the most common file extensions as a rough language signal
        grep -o '"path":"[^"]*"' "$PR_JSON" | grep -o '\.[A-Za-z0-9]*"' | tr -d '"' \
            | sort | uniq -c | sort -rn | head -5 \
            | awk '{print "- " $2 " (" $1 ")"}' >> "$PR_ANALYSIS_FILE"
        echo "" >> "$PR_ANALYSIS_FILE"
        sleep 1 # Rate limiting
    fi
done
echo -e "${GREEN}✓ Detailed analysis complete${NC}"
echo ""
# Step 6: Extract Code Review Comments
if [ "$EXTRACT_REVIEWS" = "y" ]; then
echo -e "${YELLOW}Extracting code review patterns...${NC}"
REVIEW_FILE="$OUTPUT_DIR/code_review/review_patterns.md"
echo "# Code Review Patterns: $ENGINEER_USERNAME" > "$REVIEW_FILE"
echo "" >> "$REVIEW_FILE"
echo "Extracted from $PR_COUNT pull requests" >> "$REVIEW_FILE"
echo "" >> "$REVIEW_FILE"
echo "## Common Review Comments" >> "$REVIEW_FILE"
echo "" >> "$REVIEW_FILE"
echo "[To be populated with actual review comments from PRs where they are reviewer]" >> "$REVIEW_FILE"
echo "" >> "$REVIEW_FILE"
echo "## Review Focus Areas" >> "$REVIEW_FILE"
echo "- Code quality" >> "$REVIEW_FILE"
echo "- Testing coverage" >> "$REVIEW_FILE"
echo "- Performance considerations" >> "$REVIEW_FILE"
echo "- Security practices" >> "$REVIEW_FILE"
echo "" >> "$REVIEW_FILE"
echo -e "${GREEN}✓ Review patterns documented${NC}"
fi
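# Optional sketch (assumes the standard "reviewed-by:" search qualifier): also save the
# most recent PRs the engineer reviewed, so the review_patterns.md placeholder can be
# filled in later from their actual review comments.
if [ "$EXTRACT_REVIEWS" = "y" ]; then
    gh search prs "is:pr reviewed-by:$ENGINEER_USERNAME" --limit 20 \
        --json number,title,repository,url > "$OUTPUT_DIR/raw_data/reviewed_prs.json" 2>/dev/null || true
fi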
# Step 7: Create Profile README
echo ""
echo -e "${YELLOW}Creating profile documentation...${NC}"
README_FILE="$OUTPUT_DIR/README.md"
cat > "$README_FILE" << EOF
# Engineer Profile: $ENGINEER_USERNAME
**Name:** ${USER_NAME:-$ENGINEER_USERNAME}
${USER_BIO:+**Bio:** $USER_BIO}
**GitHub:** https://github.com/$ENGINEER_USERNAME
**Profile Created:** $(date +%Y-%m-%d)
**PRs Analyzed:** $PR_COUNT
---
## Overview
This profile contains extracted coding expertise from analyzing $ENGINEER_USERNAME's GitHub contributions.
## Contents
### 📝 Coding Style
Location: \`coding_style/\`
Documents coding conventions, naming patterns, and formatting preferences.
**Key files:**
- \`naming_conventions.md\` - Variable, function, class naming
- \`code_structure.md\` - File organization and structure
- \`formatting_preferences.md\` - Code formatting style
### 🔧 Patterns
Location: \`patterns/\`
Common solutions, design patterns, and recurring approaches.
**Key files:**
- \`common_solutions.md\` - Frequently used solutions
- \`design_patterns.md\` - Applied design patterns
- \`examples/\` - Code examples
### ✅ Best Practices
Location: \`best_practices/\`
Quality standards, testing approaches, and guidelines.
**Key files:**
- \`code_quality.md\` - Quality standards
- \`testing_approach.md\` - Testing strategy
- \`performance.md\` - Performance practices
- \`security.md\` - Security considerations
### 🏗️ Architecture
Location: \`architecture/\`
Design decisions, technology choices, and system design.
**Key files:**
- \`design_decisions.md\` - Architectural choices
- \`tech_choices.md\` - Technology selections
- \`trade_offs.md\` - Decision trade-offs
### 👀 Code Review
Location: \`code_review/\`
Code review style, common feedback, and review approach.
**Key files:**
- \`feedback_style.md\` - How they provide feedback
- \`common_suggestions.md\` - Recurring suggestions
- \`review_checklist.md\` - What they look for
### 📚 Examples
Location: \`examples/\`
Real code examples from notable pull requests.
**Contents:**
- \`notable_prs/\` - Significant PRs and their patterns
---
## Using This Profile
### For Learning
1. Start with \`coding_style/\` to understand conventions
2. Review \`patterns/\` for common solutions
3. Study \`best_practices/\` for quality standards
4. Read \`examples/\` for real-world applications
### For AI Agents
Provide this profile as context when asking agents to:
- Write code matching this engineer's style
- Review code using their standards
- Apply their patterns and practices
- Make architectural decisions in their approach
**Example prompt:**
\`\`\`
Using the engineer profile at engineer_profiles/$ENGINEER_USERNAME/,
write a user authentication service following their coding style,
patterns, and best practices.
\`\`\`
### For Team Alignment
- Use as reference for team coding standards
- Share patterns for consistency
- Adopt best practices across team
- Train new engineers with real examples
---
## Expertise Areas
Based on analyzed PRs:
EOF
# Extract languages from the per-PR detail files (the search results carry no file paths)
echo "### Languages" >> "$README_FILE"
echo "" >> "$README_FILE"
cat "$OUTPUT_DIR"/raw_data/pr_*.json 2>/dev/null \
    | grep -o '"path":"[^"]*"' | grep -o '\.[A-Za-z0-9]*"' | tr -d '"' \
    | sort | uniq -c | sort -rn | head -10 | while read -r count ext; do
    echo "- $ext files ($count occurrences)" >> "$README_FILE"
done
echo "" >> "$README_FILE"
echo "### Repositories Contributed To" >> "$README_FILE"
echo "" >> "$README_FILE"
cut -f1 "$OUTPUT_DIR/raw_data/pr_index.tsv" | sort -u | head -10 | while read -r repo; do
    echo "- $repo" >> "$README_FILE"
done
cat >> "$README_FILE" << EOF
---
## Data Sources
- **Pull Requests:** $PR_COUNT PRs analyzed
- **Time Range:** Most recent contributions
- **Scope:** ${ORG_FILTER:-All repositories}
- **Analysis Date:** $(date +%Y-%m-%d)
---
## Maintenance
### Updating This Profile
\`\`\`bash
./scripts/update_profile.sh $ENGINEER_USERNAME
\`\`\`
### Adding Custom Documentation
Feel free to enhance this profile with:
- Additional examples
- Team-specific practices
- Personal notes and observations
- Meeting discussions and decisions
---
## Notes
This profile is automatically generated from GitHub contributions.
It captures public coding patterns and should be used as a learning
resource and reference, not as a rigid rulebook.
**Privacy:** Only public GitHub contributions are analyzed.
**Accuracy:** Patterns are inferred from code; always verify with engineer.
**Currency:** Update regularly as coding practices evolve.
EOF
echo -e "${GREEN}✓ Profile README created${NC}"
# Step 8: Create Template Files
echo ""
echo -e "${YELLOW}Creating template structure...${NC}"
# Coding Style Templates
cat > "$OUTPUT_DIR/coding_style/naming_conventions.md" << 'EOF'
# Naming Conventions
Based on analysis of pull requests.
## Variables
### General Rules
- Descriptive names preferred
- Avoid abbreviations
- Use camelCase (or language convention)
### Examples
```
// From PR analysis
const userAuthentication = ... // not ua
const isActive = ... // boolean prefix
const totalAmount = ... // clear purpose
```
## Functions
### General Rules
- Verb-first naming
- Single responsibility
- Descriptive of action
### Examples
```
// From PR analysis
getUserById(id)
validateInput(data)
calculateTotal(items)
```
## Classes
### General Rules
- PascalCase
- Noun-based
- Clear purpose
### Examples
```
// From PR analysis
UserService
PaymentProcessor
AuthenticationManager
```
---
**Note:** Fill in with specific patterns found in engineer's code.
EOF
cat > "$OUTPUT_DIR/patterns/common_solutions.md" << 'EOF'
# Common Solutions
Recurring patterns and solutions found in PRs.
## Pattern 1: [To be extracted]
**Problem:** [What problem does this solve]
**Solution:**
```
[Code example from PRs]
```
**Why:** [Reasoning from PR discussions]
**When to use:** [Applicable scenarios]
---
## Pattern 2: [To be extracted]
[Continue documenting patterns found in analysis]
---
**Note:** Populate with actual patterns from PR analysis.
EOF
cat > "$OUTPUT_DIR/best_practices/testing_approach.md" << 'EOF'
# Testing Approach
Testing strategies and patterns extracted from PRs.
## Test Structure
[Document testing patterns from analyzed PRs]
## Coverage Standards
[Extract coverage expectations from PRs]
## Test Types
### Unit Tests
[Patterns found]
### Integration Tests
[Patterns found]
### E2E Tests
[Patterns found]
---
**Note:** Fill in based on test files in analyzed PRs.
EOF
cat > "$OUTPUT_DIR/architecture/design_decisions.md" << 'EOF'
# Design Decisions
Architectural decisions extracted from PRs and discussions.
## Decision Template
### Decision: [Name]
**Context:** [What problem or need]
**Decision:** [What was decided]
**Reasoning:** [Why this approach]
**Alternatives Considered:** [Other options]
**Trade-offs:** [Pros and cons]
**Outcome:** [Results]
---
## Decisions Extracted from PRs
[To be populated with decisions found in PR descriptions and discussions]
---
**Note:** Extract from PR descriptions, discussions, and code comments.
EOF
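# Optional sketch: drop a short placeholder into examples/notable_prs/ (wording is
# illustrative) so the otherwise-empty directory explains what belongs there.
cat > "$OUTPUT_DIR/examples/notable_prs/README.md" << 'EOF'
# Notable PRs

Copy links to significant pull requests here, along with short notes on the
patterns they demonstrate and excerpts of the relevant code.
EOF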
echo -e "${GREEN}✓ Template files created${NC}"
# Step 9: Generate Summary
echo ""
echo -e "${BLUE}╔════════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ Extraction Complete! ║${NC}"
echo -e "${BLUE}╚════════════════════════════════════════════════╝${NC}"
echo ""
echo -e "${GREEN}Profile created at: ${BLUE}$OUTPUT_DIR${NC}"
echo ""
echo -e "${YELLOW}━━━ Summary ━━━${NC}"
echo "Engineer: $ENGINEER_USERNAME"
echo "PRs Analyzed: $PR_COUNT"
echo "Detailed Analysis: $DETAILED_ANALYSIS_LIMIT PRs"
echo ""
echo -e "${YELLOW}━━━ Created Structure ━━━${NC}"
echo "├── README.md (profile overview)"
echo "├── coding_style/ (conventions and preferences)"
echo "├── patterns/ (common solutions)"
echo "├── best_practices/ (quality standards)"
echo "├── architecture/ (design decisions)"
echo "├── code_review/ (feedback patterns)"
echo "├── examples/ (code samples)"
echo "└── raw_data/ (source PR data)"
echo ""
echo -e "${CYAN}━━━ Next Steps ━━━${NC}"
echo ""
echo "1. Review the generated profile:"
echo -e " ${BLUE}cat $OUTPUT_DIR/README.md${NC}"
echo ""
echo "2. Review PR analysis:"
echo -e " ${BLUE}cat $OUTPUT_DIR/raw_data/pr_analysis.md${NC}"
echo ""
echo "3. Enhance documentation with specific findings:"
echo " - Add code examples to patterns/"
echo " - Document specific conventions in coding_style/"
echo " - Extract best practices from notable PRs"
echo " - Add architectural decisions from PR discussions"
echo ""
echo "4. Use with AI agents:"
echo -e " ${BLUE}\"Using engineer_profiles/$ENGINEER_USERNAME/, write code matching their style\"${NC}"
echo ""
echo -e "${YELLOW}💡 Tip: Manually review PRs and add specific examples to strengthen profile${NC}"
echo ""