Initial commit

2025-11-30 08:58:28 +08:00
commit e60768ac8e
10 changed files with 1020 additions and 0 deletions
--- a/skills/analyze-goodreads-export/scripts/goodreads_lib.py
+++ b/skills/analyze-goodreads-export/scripts/goodreads_lib.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+"""Library for parsing and querying Goodreads CSV exports."""
+
+import csv
+import os
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import List, Dict, Optional, Callable
+import re
+
+
+class GoodreadsBook:
+    """Represents a book from Goodreads CSV export."""
+
+    def __init__(self, row: Dict[str, str]):
+        self.book_id = row.get('Book Id', '')
+        self.title = row.get('Title', '')
+        self.author = row.get('Author', '')
+        self.author_lf = row.get('Author l-f', '')
+        self.additional_authors = row.get('Additional Authors', '')
+        self.isbn = self._clean_isbn(row.get('ISBN', ''))
+        self.isbn13 = self._clean_isbn(row.get('ISBN13', ''))
+        self.my_rating = self._parse_int(row.get('My Rating', ''))
+        self.average_rating = self._parse_float(row.get('Average Rating', ''))
+        self.publisher = row.get('Publisher', '')
+        self.binding = row.get('Binding', '')
+        self.num_pages = self._parse_int(row.get('Number of Pages', ''))
+        self.year_published = self._parse_int(row.get('Year Published', ''))
+        self.original_publication_year = self._parse_int(row.get('Original Publication Year', ''))
+        self.date_read = self._parse_date(row.get('Date Read', ''))
+        self.date_added = self._parse_date(row.get('Date Added', ''))
+        self.bookshelves = row.get('Bookshelves', '')
+        self.bookshelves_with_positions = row.get('Bookshelves with positions', '')
+        self.exclusive_shelf = row.get('Exclusive Shelf', '')
+        self.my_review = row.get('My Review', '')
+        self.spoiler = row.get('Spoiler', '')
+        self.private_notes = row.get('Private Notes', '')
+        self.read_count = self._parse_int(row.get('Read Count', ''))
+        self.owned_copies = self._parse_int(row.get('Owned Copies', ''))
+
+        # Parse series information from title
+        self.series, self.series_index = self._parse_series()
+
+    def _clean_isbn(self, isbn: str) -> str:
+        """Remove Excel formatting from ISBN."""
+        if isbn.startswith('="') and isbn.endswith('"'):
+            return isbn[2:-1]
+        return isbn
+
+    def _parse_int(self, value: str) -> Optional[int]:
+        """Parse integer value, return None if empty or invalid."""
+        if not value or value == '':
+            return None
+        try:
+            return int(value)
+        except ValueError:
+            return None
+
+    def _parse_float(self, value: str) -> Optional[float]:
+        """Parse float value, return None if empty or invalid."""
+        if not value or value == '':
+            return None
+        try:
+            return float(value)
+        except ValueError:
+            return None
+
+    def _parse_date(self, value: str) -> Optional[datetime]:
+        """Parse date in YYYY/MM/DD format."""
+        if not value or value == '':
+            return None
+        try:
+            return datetime.strptime(value, '%Y/%m/%d')
+        except ValueError:
+            return None
+
+    def _parse_series(self) -> tuple[Optional[str], Optional[float]]:
+        """Extract series name and number from title.
+
+        Examples:
+        - "An Absolutely Remarkable Thing (The Carls, #1)" -> ("The Carls", 1.0)
+        - "The Three-Body Problem (Remembrance of Earth's Past, #1)" -> ("Remembrance of Earth's Past", 1.0)
+        """
+        # Match pattern: (Series Name, #Number)
+        match = re.search(r'\(([^,]+),\s*#([\d.]+)\)$', self.title)
+        if match:
+            series_name = match.group(1).strip()
+            try:
+                series_index = float(match.group(2))
+                return series_name, series_index
+            except ValueError:
+                return series_name, None
+        return None, None
+
+    @property
+    def is_read(self) -> bool:
+        """Check if book has been read."""
+        return self.date_read is not None
+
+    @property
+    def is_tbr(self) -> bool:
+        """Check if book is in to-be-read list."""
+        return 'to-read' in self.exclusive_shelf
+
+    @property
+    def is_currently_reading(self) -> bool:
+        """Check if currently reading."""
+        return 'currently-reading' in self.exclusive_shelf
+
+    def has_shelf(self, shelf_name: str) -> bool:
+        """Check if book is on a specific shelf."""
+        return shelf_name in self.bookshelves or shelf_name in self.exclusive_shelf
+
+    def __repr__(self):
+        return f"<GoodreadsBook: {self.title} by {self.author}>"
+
+
+class GoodreadsLibrary:
+    """Main class for querying Goodreads library from CSV."""
+
+    def __init__(self, csv_path: Optional[str] = None):
+        """Initialize library from CSV file.
+
+        Args:
+            csv_path: Path to goodreads_library_export.csv
+                     Defaults to ~/Drive/Claude/books/goodreads_library_export.csv
+        """
+        if csv_path is None:
+            csv_path = os.path.expanduser('~/Drive/Claude/books/goodreads_library_export.csv')
+
+        self.csv_path = csv_path
+        self.books: List[GoodreadsBook] = []
+        self._load_books()
+
+    def _load_books(self):
+        """Load books from CSV file."""
+        with open(self.csv_path, 'r', encoding='utf-8') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                self.books.append(GoodreadsBook(row))
+
+    def query(self, filter_func: Callable[[GoodreadsBook], bool]) -> List[GoodreadsBook]:
+        """Query books with a custom filter function."""
+        return [book for book in self.books if filter_func(book)]
+
+    def get_read_books(self, limit: Optional[int] = None,
+                       sort_by_date: bool = True) -> List[GoodreadsBook]:
+        """Get all read books, optionally sorted by date read."""
+        books = [book for book in self.books if book.is_read]
+        if sort_by_date:
+            books.sort(key=lambda b: b.date_read or datetime.min, reverse=True)
+        if limit:
+            books = books[:limit]
+        return books
+
+    def get_tbr_books(self) -> List[GoodreadsBook]:
+        """Get all to-be-read books."""
+        return [book for book in self.books if book.is_tbr]
+
+    def get_books_by_shelf(self, shelf_name: str) -> List[GoodreadsBook]:
+        """Get all books on a specific shelf."""
+        return [book for book in self.books if book.has_shelf(shelf_name)]
+
+    def get_books_read_in_period(self, days: int) -> List[GoodreadsBook]:
+        """Get books read in the last N days."""
+        cutoff = datetime.now() - timedelta(days=days)
+        return [book for book in self.books
+                if book.date_read and book.date_read >= cutoff]
+
+    def get_books_read_in_year(self, year: int) -> List[GoodreadsBook]:
+        """Get books read in a specific year."""
+        return [book for book in self.books
+                if book.date_read and book.date_read.year == year]
+
+    def get_books_added_in_period(self, days: int) -> List[GoodreadsBook]:
+        """Get books added to library in the last N days."""
+        cutoff = datetime.now() - timedelta(days=days)
+        return [book for book in self.books
+                if book.date_added and book.date_added >= cutoff]
+
+    def get_series_books(self, series_name: str) -> List[GoodreadsBook]:
+        """Get all books in a series, sorted by series index."""
+        books = [book for book in self.books if book.series == series_name]
+        books.sort(key=lambda b: b.series_index or 0)
+        return books
+
+    def get_all_series(self) -> Dict[str, List[GoodreadsBook]]:
+        """Get all series with their books."""
+        series_dict = {}
+        for book in self.books:
+            if book.series:
+                if book.series not in series_dict:
+                    series_dict[book.series] = []
+                series_dict[book.series].append(book)
+
+        # Sort books within each series
+        for series in series_dict:
+            series_dict[series].sort(key=lambda b: b.series_index or 0)
+
+        return series_dict
+
+    def get_incomplete_series(self) -> Dict[str, Dict]:
+        """Get series where at least one book is read but not all."""
+        all_series = self.get_all_series()
+        incomplete = {}
+
+        for series_name, books in all_series.items():
+            read_count = sum(1 for b in books if b.is_read)
+            total_count = len(books)
+
+            if read_count > 0 and read_count < total_count:
+                # Find next unread book
+                next_unread = None
+                for book in books:
+                    if not book.is_read:
+                        next_unread = book
+                        break
+
+                incomplete[series_name] = {
+                    'books': books,
+                    'read_count': read_count,
+                    'total_count': total_count,
+                    'next_book': next_unread
+                }
+
+        return incomplete
+
+    def get_author_stats(self) -> List[tuple[str, int]]:
+        """Get author statistics (author, book count) sorted by count."""
+        author_counts = {}
+        for book in self.books:
+            if book.is_read:
+                author_counts[book.author] = author_counts.get(book.author, 0) + 1
+
+        return sorted(author_counts.items(), key=lambda x: x[1], reverse=True)
+
+    def get_rating_distribution(self) -> Dict[int, int]:
+        """Get distribution of user ratings."""
+        dist = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
+        for book in self.books:
+            if book.is_read and book.my_rating:
+                dist[book.my_rating] = dist.get(book.my_rating, 0) + 1
+        return dist