Initial commit
This commit is contained in:
243
skills/analyze-goodreads-export/scripts/goodreads_lib.py
Normal file
243
skills/analyze-goodreads-export/scripts/goodreads_lib.py
Normal file
@@ -0,0 +1,243 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Library for parsing and querying Goodreads CSV exports."""
|
||||
|
||||
import csv
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Callable
|
||||
import re
|
||||
|
||||
|
||||
class GoodreadsBook:
    """Represents a book from Goodreads CSV export.

    Every export column is exposed as an attribute.  Text columns are kept
    as strings ('' when the cell is missing); numeric and date columns are
    parsed and become None when empty or unparseable.  ``series`` and
    ``series_index`` are derived from a trailing "(Series, #N)" suffix in
    the title.
    """

    # Trailing "(Series Name, #Number)" suffix of a title.  Parentheses are
    # excluded from the series name so that an earlier parenthetical, as in
    # "Title (Annotated) (Series, #2)", is not swallowed into the name.
    _SERIES_PATTERN = re.compile(r'\(([^(),]+),\s*#([\d.]+)\)\s*$')

    def __init__(self, row: Dict[str, str]):
        """Build a book from one CSV row.

        Args:
            row: Mapping of Goodreads column name to raw cell text.  Missing
                columns and None cells (csv.DictReader fills short rows with
                None) are treated as empty strings.
        """
        def cell(column: str) -> str:
            # Normalise both "column absent" and "cell is None" to ''.
            return row.get(column) or ''

        self.book_id = cell('Book Id')
        self.title = cell('Title')
        self.author = cell('Author')
        self.author_lf = cell('Author l-f')
        self.additional_authors = cell('Additional Authors')
        self.isbn = self._clean_isbn(cell('ISBN'))
        self.isbn13 = self._clean_isbn(cell('ISBN13'))
        self.my_rating = self._parse_int(cell('My Rating'))
        self.average_rating = self._parse_float(cell('Average Rating'))
        self.publisher = cell('Publisher')
        self.binding = cell('Binding')
        self.num_pages = self._parse_int(cell('Number of Pages'))
        self.year_published = self._parse_int(cell('Year Published'))
        self.original_publication_year = self._parse_int(cell('Original Publication Year'))
        self.date_read = self._parse_date(cell('Date Read'))
        self.date_added = self._parse_date(cell('Date Added'))
        self.bookshelves = cell('Bookshelves')
        self.bookshelves_with_positions = cell('Bookshelves with positions')
        self.exclusive_shelf = cell('Exclusive Shelf')
        self.my_review = cell('My Review')
        self.spoiler = cell('Spoiler')
        self.private_notes = cell('Private Notes')
        self.read_count = self._parse_int(cell('Read Count'))
        self.owned_copies = self._parse_int(cell('Owned Copies'))

        # Parse series information from title
        self.series, self.series_index = self._parse_series()

    def _clean_isbn(self, isbn: str) -> str:
        """Remove Excel formatting from ISBN (``="0123456789"`` -> ``0123456789``)."""
        if isbn.startswith('="') and isbn.endswith('"'):
            return isbn[2:-1]
        return isbn

    def _parse_int(self, value: str) -> Optional[int]:
        """Parse integer value, return None if empty or invalid."""
        if not value:
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def _parse_float(self, value: str) -> Optional[float]:
        """Parse float value, return None if empty or invalid."""
        if not value:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def _parse_date(self, value: str) -> Optional[datetime]:
        """Parse date in YYYY/MM/DD format, return None if empty or invalid."""
        if not value:
            return None
        try:
            return datetime.strptime(value.strip(), '%Y/%m/%d')
        except (AttributeError, TypeError, ValueError):
            return None

    def _parse_series(self) -> tuple[Optional[str], Optional[float]]:
        """Extract series name and number from title.

        Examples:
        - "An Absolutely Remarkable Thing (The Carls, #1)" -> ("The Carls", 1.0)
        - "The Three-Body Problem (Remembrance of Earth's Past, #1)" -> ("Remembrance of Earth's Past", 1.0)

        Returns:
            (series name, series index).  Both are None when the title has
            no series suffix; the index alone is None when the number part
            is not a valid float (e.g. "1.2.3").
        """
        match = self._SERIES_PATTERN.search(self.title)
        if match is None:
            return None, None

        series_name = match.group(1).strip()
        try:
            return series_name, float(match.group(2))
        except ValueError:
            return series_name, None

    def _shelf_names(self) -> List[str]:
        """Return the individual shelf names from the Bookshelves column.

        Goodreads writes that column as a comma-separated list.
        """
        return [name.strip() for name in self.bookshelves.split(',') if name.strip()]

    @property
    def is_read(self) -> bool:
        """Check if book has been read.

        A book counts as read when it has a Date Read or sits on the
        exclusive 'read' shelf; exports may leave Date Read blank for books
        that were shelved as read without a finish date.
        """
        return self.date_read is not None or self.exclusive_shelf.strip() == 'read'

    @property
    def is_tbr(self) -> bool:
        """Check if book is in to-be-read list."""
        return 'to-read' in self.exclusive_shelf

    @property
    def is_currently_reading(self) -> bool:
        """Check if currently reading."""
        return 'currently-reading' in self.exclusive_shelf

    def has_shelf(self, shelf_name: str) -> bool:
        """Check if book is on a specific shelf.

        The name must equal a whole shelf name (surrounding whitespace is
        ignored), so 'fiction' does not match 'non-fiction'.  An empty name
        never matches.
        """
        wanted = shelf_name.strip()
        if not wanted:
            return False
        return wanted == self.exclusive_shelf.strip() or wanted in self._shelf_names()

    def __repr__(self):
        return f"<GoodreadsBook: {self.title} by {self.author}>"
|
||||
|
||||
|
||||
class GoodreadsLibrary:
    """Main class for querying Goodreads library from CSV.

    The whole export is read once at construction time into ``self.books``;
    every query method filters or groups that in-memory list.
    """

    def __init__(self, csv_path: Optional[str] = None):
        """Initialize library from CSV file.

        Args:
            csv_path: Path to goodreads_library_export.csv
                Defaults to ~/Drive/Claude/books/goodreads_library_export.csv

        Raises:
            OSError: If the CSV file cannot be opened.
        """
        if csv_path is None:
            csv_path = os.path.expanduser('~/Drive/Claude/books/goodreads_library_export.csv')

        self.csv_path = csv_path
        self.books: List["GoodreadsBook"] = []
        self._load_books()

    def _load_books(self):
        """Load books from CSV file, replacing any previously loaded books."""
        # 'utf-8-sig' drops a leading BOM that would otherwise become part of
        # the first header name; newline='' lets the csv module handle line
        # endings inside quoted fields (e.g. multi-line reviews) itself.
        with open(self.csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            self.books = [GoodreadsBook(row) for row in csv.DictReader(f)]

    def query(self, filter_func: Callable[["GoodreadsBook"], bool]) -> List["GoodreadsBook"]:
        """Query books with a custom filter function."""
        return [book for book in self.books if filter_func(book)]

    def get_read_books(self, limit: Optional[int] = None,
                       sort_by_date: bool = True) -> List["GoodreadsBook"]:
        """Get all read books, optionally sorted by date read.

        Args:
            limit: Maximum number of books to return; None means no limit.
                Zero or a negative value yields an empty list.
            sort_by_date: Sort newest first; books without a read date
                come last.
        """
        books = [book for book in self.books if book.is_read]
        if sort_by_date:
            books.sort(key=lambda b: b.date_read or datetime.min, reverse=True)
        # Compare against None so that limit=0 means "no books", not "all".
        if limit is not None:
            books = books[:max(limit, 0)]
        return books

    def get_tbr_books(self) -> List["GoodreadsBook"]:
        """Get all to-be-read books."""
        return [book for book in self.books if book.is_tbr]

    def get_books_by_shelf(self, shelf_name: str) -> List["GoodreadsBook"]:
        """Get all books on a specific shelf."""
        return [book for book in self.books if book.has_shelf(shelf_name)]

    def get_books_read_in_period(self, days: int) -> List["GoodreadsBook"]:
        """Get books read in the last N days."""
        cutoff = datetime.now() - timedelta(days=days)
        return [book for book in self.books
                if book.date_read and book.date_read >= cutoff]

    def get_books_read_in_year(self, year: int) -> List["GoodreadsBook"]:
        """Get books read in a specific year."""
        return [book for book in self.books
                if book.date_read and book.date_read.year == year]

    def get_books_added_in_period(self, days: int) -> List["GoodreadsBook"]:
        """Get books added to library in the last N days."""
        cutoff = datetime.now() - timedelta(days=days)
        return [book for book in self.books
                if book.date_added and book.date_added >= cutoff]

    def get_series_books(self, series_name: str) -> List["GoodreadsBook"]:
        """Get all books in a series, sorted by series index.

        Books without a series index sort as index 0.
        """
        books = [book for book in self.books if book.series == series_name]
        books.sort(key=lambda b: b.series_index or 0)
        return books

    def get_all_series(self) -> Dict[str, List["GoodreadsBook"]]:
        """Get all series with their books, each list sorted by series index."""
        series_dict: Dict[str, List["GoodreadsBook"]] = {}
        for book in self.books:
            if book.series:
                series_dict.setdefault(book.series, []).append(book)

        # Sort books within each series
        for books in series_dict.values():
            books.sort(key=lambda b: b.series_index or 0)

        return series_dict

    def get_incomplete_series(self) -> Dict[str, Dict]:
        """Get series where at least one book is read but not all.

        Returns:
            Mapping of series name to a dict with keys 'books' (sorted by
            series index), 'read_count', 'total_count' and 'next_book' (the
            lowest-indexed unread book).
        """
        incomplete = {}

        for series_name, books in self.get_all_series().items():
            read_count = sum(1 for b in books if b.is_read)
            total_count = len(books)

            if 0 < read_count < total_count:
                incomplete[series_name] = {
                    'books': books,
                    'read_count': read_count,
                    'total_count': total_count,
                    # First unread book in series order.
                    'next_book': next((b for b in books if not b.is_read), None),
                }

        return incomplete

    def get_author_stats(self) -> List[tuple[str, int]]:
        """Get author statistics (author, read-book count) sorted by count, highest first."""
        author_counts: Dict[str, int] = {}
        for book in self.books:
            if book.is_read:
                author_counts[book.author] = author_counts.get(book.author, 0) + 1

        return sorted(author_counts.items(), key=lambda x: x[1], reverse=True)

    def get_rating_distribution(self) -> Dict[int, int]:
        """Get distribution of user ratings over read books.

        Books with no rating (None or 0) are skipped; ratings 1-5 are always
        present in the result, even with a count of zero.
        """
        dist = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        for book in self.books:
            if book.is_read and book.my_rating:
                dist[book.my_rating] = dist.get(book.my_rating, 0) + 1
        return dist
|
||||
Reference in New Issue
Block a user