gh-krishagel-geoffrey/skills/obsidian-manager/scripts/search-readwise.py

#!/usr/bin/env python3
# /// script
# dependencies = ["python-frontmatter", "pyyaml"]
# ///
"""
Search Readwise highlights by topic, author, or category.

Usage:
    uv run search-readwise.py "query" [--author "name"] [--category articles|books|podcasts|tweets]

Arguments:
    query      - Search term or phrase
    --author   - Filter by author name
    --category - Filter by category (articles, books, podcasts, tweets)

Examples:
    uv run search-readwise.py "second brain"
    uv run search-readwise.py "AI" --author "Andrej Karpathy"
    uv run search-readwise.py "habits" --category books
"""

import sys
import os
import json
import re
from pathlib import Path
import frontmatter

# Root of the Obsidian vault, resolved under the current user's home directory.
VAULT_PATH = Path.home() / "Library/Mobile Documents/iCloud~md~obsidian/Documents/Personal_Notes"
# "Readwise" folder inside the vault; search_readwise() looks in its
# Articles / Books / Podcasts / Tweets subfolders.
READWISE_PATH = VAULT_PATH / "Readwise"


def _metadata_text(value: object, default: str = "") -> str:
    """Render a front-matter value as plain text, or *default* when it is empty."""
    if value is None:
        return default
    if isinstance(value, (list, tuple)):
        text = ", ".join(str(item) for item in value)
    else:
        text = str(value)
    return text or default


def _matching_highlights(content: str, pattern: "re.Pattern[str]") -> list[str]:
    """Return bullet/quote lines of *content* that match *pattern* (max 200 chars each)."""
    stripped_lines = (line.strip() for line in content.split("\n"))
    return [
        line[:200]
        for line in stripped_lines
        if line.startswith(("-", ">")) and pattern.search(line)
    ]


def _context_excerpt(content: str, match: "re.Match[str]") -> str:
    """Return whitespace-collapsed text from 50 chars before to 150 chars after *match*."""
    start = max(0, match.start() - 50)
    end = min(len(content), match.end() + 150)
    return re.sub(r"\s+", " ", content[start:end].strip())


def _search_file(
    md_file: Path,
    category_name: str,
    pattern: "re.Pattern[str]",
    author_pattern: "re.Pattern[str] | None",
) -> "dict | None":
    """Build the result entry for one note, or return None when it does not qualify."""
    post = frontmatter.load(md_file)
    content = post.content
    metadata = post.metadata

    # Front-matter values are not guaranteed to be strings (missing key, empty
    # value, list of authors), so normalise before running the regex.
    author_text = _metadata_text(metadata.get("Author"))
    if author_pattern and not author_pattern.search(author_text):
        return None

    matches = list(pattern.finditer(content))
    if not matches:
        return None

    # Prefer highlight lines (starting with "-" or ">"); otherwise fall back
    # to a short excerpt around the first match.
    highlights = _matching_highlights(content, pattern)
    if not highlights:
        highlights = [_context_excerpt(content, matches[0])]

    return {
        "file": str(md_file.relative_to(VAULT_PATH)),
        # Coerced to text so the entry is always JSON-serialisable.
        "title": _metadata_text(metadata.get("Full Title"), md_file.stem),
        "author": author_text or "Unknown",
        "category": category_name,
        "matches": len(matches),
        "highlights": highlights[:3],  # Top 3 matching highlights
    }


def search_readwise(query: str, author: str = "", category: str = "", limit: int = 15) -> list[dict]:
    """Search Readwise notes for *query* and return the best-matching sources.

    Args:
        query: Literal text to look for (case-insensitive, not a regex).
        author: Optional case-insensitive substring that the note's "Author"
            front-matter value must contain.
        category: Optional category ("articles", "books", "podcasts",
            "tweets"); any other value is used verbatim as a subfolder name
            under READWISE_PATH. Empty searches all four known folders.
        limit: Maximum number of results to return.

    Returns:
        Up to *limit* dicts with the keys "file", "title", "author",
        "category", "matches" and "highlights", ordered by descending match
        count. Every matching note is scanned before truncating, so the
        returned entries really are the ones with the most matches.

    Notes that cannot be read or parsed are skipped with a warning on stderr.
    """
    if limit <= 0:
        return []

    # Determine search paths based on category
    if category:
        category_map = {
            "articles": "Articles",
            "books": "Books",
            "podcasts": "Podcasts",
            "tweets": "Tweets",
        }
        search_paths = [READWISE_PATH / category_map.get(category.lower(), category)]
    else:
        search_paths = [
            READWISE_PATH / "Articles",
            READWISE_PATH / "Books",
            READWISE_PATH / "Podcasts",
            READWISE_PATH / "Tweets",
        ]

    pattern = re.compile(re.escape(query), re.IGNORECASE)
    author_pattern = re.compile(re.escape(author), re.IGNORECASE) if author else None

    results = []
    for search_path in search_paths:
        if not search_path.exists():
            continue

        for md_file in search_path.rglob("*.md"):
            try:
                entry = _search_file(md_file, search_path.name, pattern, author_pattern)
            except Exception as exc:
                # Best-effort scan: one unreadable or malformed note must not
                # abort the search, but it should not vanish silently either.
                print(f"Warning: skipping {md_file}: {exc}", file=sys.stderr)
                continue
            if entry is not None:
                results.append(entry)

    # Rank only after every candidate has been collected; stopping the scan
    # at `limit` hits would return the first files found, not the best ones.
    results.sort(key=lambda item: item["matches"], reverse=True)
    return results[:limit]


def main():
    """Command-line entry point: parse argv, run the search, print the report.

    The first positional argument is the query. "--author VALUE" and
    "--category VALUE" are optional; unrecognised arguments, and a flag with
    no value after it, are ignored. Prints a readable listing followed by a
    single JSON line with the same results.
    """
    argv = sys.argv
    argc = len(argv)
    if argc < 2:
        print("Usage: uv run search-readwise.py \"query\" [--author \"name\"] [--category type]")
        sys.exit(1)

    query = argv[1]

    # Walk the remaining arguments; a known flag consumes the value after it.
    options = {"--author": "", "--category": ""}
    pos = 2
    while pos < argc:
        token = argv[pos]
        if token in options and pos + 1 < argc:
            options[token] = argv[pos + 1]
            pos += 2
        else:
            pos += 1
    author = options["--author"]
    category = options["--category"]

    # Describe the active filters in the progress line.
    filters = [
        text
        for text, enabled in (
            (f"author: {author}", author),
            (f"category: {category}", category),
        )
        if enabled
    ]
    filter_str = f" ({', '.join(filters)})" if filters else ""

    print(f"Searching Readwise for '{query}'{filter_str}...")
    results = search_readwise(query, author, category)

    if not results:
        print(f"\nNo Readwise highlights found for '{query}'")
        return

    print(f"\nFound {len(results)} sources:\n")

    for number, entry in enumerate(results, 1):
        print(f"{number}. **{entry['title']}**")
        print(f"   Author: {entry['author']} | Category: {entry['category']} | {entry['matches']} matches")
        print(f"   File: {entry['file']}")
        snippets = entry['highlights']
        if snippets:
            print("   Highlights:")
            for snippet in snippets:
                # Show at most 150 characters, marking truncation with an ellipsis.
                suffix = '...' if len(snippet) > 150 else ''
                print(f"     - {snippet[:150]}{suffix}")
        print()

    # Output JSON for programmatic use
    payload = json.dumps({'results': results, 'total': len(results)})
    print(f"\n{payload}")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()