244 lines
9.3 KiB
Python
244 lines
9.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Library for parsing and querying Goodreads CSV exports."""
|
|
|
|
import csv
|
|
import os
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional, Callable
|
|
import re
|
|
|
|
|
|
class GoodreadsBook:
|
|
"""Represents a book from Goodreads CSV export."""
|
|
|
|
def __init__(self, row: Dict[str, str]):
|
|
self.book_id = row.get('Book Id', '')
|
|
self.title = row.get('Title', '')
|
|
self.author = row.get('Author', '')
|
|
self.author_lf = row.get('Author l-f', '')
|
|
self.additional_authors = row.get('Additional Authors', '')
|
|
self.isbn = self._clean_isbn(row.get('ISBN', ''))
|
|
self.isbn13 = self._clean_isbn(row.get('ISBN13', ''))
|
|
self.my_rating = self._parse_int(row.get('My Rating', ''))
|
|
self.average_rating = self._parse_float(row.get('Average Rating', ''))
|
|
self.publisher = row.get('Publisher', '')
|
|
self.binding = row.get('Binding', '')
|
|
self.num_pages = self._parse_int(row.get('Number of Pages', ''))
|
|
self.year_published = self._parse_int(row.get('Year Published', ''))
|
|
self.original_publication_year = self._parse_int(row.get('Original Publication Year', ''))
|
|
self.date_read = self._parse_date(row.get('Date Read', ''))
|
|
self.date_added = self._parse_date(row.get('Date Added', ''))
|
|
self.bookshelves = row.get('Bookshelves', '')
|
|
self.bookshelves_with_positions = row.get('Bookshelves with positions', '')
|
|
self.exclusive_shelf = row.get('Exclusive Shelf', '')
|
|
self.my_review = row.get('My Review', '')
|
|
self.spoiler = row.get('Spoiler', '')
|
|
self.private_notes = row.get('Private Notes', '')
|
|
self.read_count = self._parse_int(row.get('Read Count', ''))
|
|
self.owned_copies = self._parse_int(row.get('Owned Copies', ''))
|
|
|
|
# Parse series information from title
|
|
self.series, self.series_index = self._parse_series()
|
|
|
|
def _clean_isbn(self, isbn: str) -> str:
|
|
"""Remove Excel formatting from ISBN."""
|
|
if isbn.startswith('="') and isbn.endswith('"'):
|
|
return isbn[2:-1]
|
|
return isbn
|
|
|
|
def _parse_int(self, value: str) -> Optional[int]:
|
|
"""Parse integer value, return None if empty or invalid."""
|
|
if not value or value == '':
|
|
return None
|
|
try:
|
|
return int(value)
|
|
except ValueError:
|
|
return None
|
|
|
|
def _parse_float(self, value: str) -> Optional[float]:
|
|
"""Parse float value, return None if empty or invalid."""
|
|
if not value or value == '':
|
|
return None
|
|
try:
|
|
return float(value)
|
|
except ValueError:
|
|
return None
|
|
|
|
def _parse_date(self, value: str) -> Optional[datetime]:
|
|
"""Parse date in YYYY/MM/DD format."""
|
|
if not value or value == '':
|
|
return None
|
|
try:
|
|
return datetime.strptime(value, '%Y/%m/%d')
|
|
except ValueError:
|
|
return None
|
|
|
|
def _parse_series(self) -> tuple[Optional[str], Optional[float]]:
|
|
"""Extract series name and number from title.
|
|
|
|
Examples:
|
|
- "An Absolutely Remarkable Thing (The Carls, #1)" -> ("The Carls", 1.0)
|
|
- "The Three-Body Problem (Remembrance of Earth's Past, #1)" -> ("Remembrance of Earth's Past", 1.0)
|
|
"""
|
|
# Match pattern: (Series Name, #Number)
|
|
match = re.search(r'\(([^,]+),\s*#([\d.]+)\)$', self.title)
|
|
if match:
|
|
series_name = match.group(1).strip()
|
|
try:
|
|
series_index = float(match.group(2))
|
|
return series_name, series_index
|
|
except ValueError:
|
|
return series_name, None
|
|
return None, None
|
|
|
|
@property
|
|
def is_read(self) -> bool:
|
|
"""Check if book has been read."""
|
|
return self.date_read is not None
|
|
|
|
@property
|
|
def is_tbr(self) -> bool:
|
|
"""Check if book is in to-be-read list."""
|
|
return 'to-read' in self.exclusive_shelf
|
|
|
|
@property
|
|
def is_currently_reading(self) -> bool:
|
|
"""Check if currently reading."""
|
|
return 'currently-reading' in self.exclusive_shelf
|
|
|
|
def has_shelf(self, shelf_name: str) -> bool:
|
|
"""Check if book is on a specific shelf."""
|
|
return shelf_name in self.bookshelves or shelf_name in self.exclusive_shelf
|
|
|
|
def __repr__(self):
|
|
return f"<GoodreadsBook: {self.title} by {self.author}>"
|
|
|
|
|
|
class GoodreadsLibrary:
|
|
"""Main class for querying Goodreads library from CSV."""
|
|
|
|
def __init__(self, csv_path: Optional[str] = None):
|
|
"""Initialize library from CSV file.
|
|
|
|
Args:
|
|
csv_path: Path to goodreads_library_export.csv
|
|
Defaults to ~/Drive/Claude/books/goodreads_library_export.csv
|
|
"""
|
|
if csv_path is None:
|
|
csv_path = os.path.expanduser('~/Drive/Claude/books/goodreads_library_export.csv')
|
|
|
|
self.csv_path = csv_path
|
|
self.books: List[GoodreadsBook] = []
|
|
self._load_books()
|
|
|
|
def _load_books(self):
|
|
"""Load books from CSV file."""
|
|
with open(self.csv_path, 'r', encoding='utf-8') as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
self.books.append(GoodreadsBook(row))
|
|
|
|
def query(self, filter_func: Callable[[GoodreadsBook], bool]) -> List[GoodreadsBook]:
|
|
"""Query books with a custom filter function."""
|
|
return [book for book in self.books if filter_func(book)]
|
|
|
|
def get_read_books(self, limit: Optional[int] = None,
|
|
sort_by_date: bool = True) -> List[GoodreadsBook]:
|
|
"""Get all read books, optionally sorted by date read."""
|
|
books = [book for book in self.books if book.is_read]
|
|
if sort_by_date:
|
|
books.sort(key=lambda b: b.date_read or datetime.min, reverse=True)
|
|
if limit:
|
|
books = books[:limit]
|
|
return books
|
|
|
|
def get_tbr_books(self) -> List[GoodreadsBook]:
|
|
"""Get all to-be-read books."""
|
|
return [book for book in self.books if book.is_tbr]
|
|
|
|
def get_books_by_shelf(self, shelf_name: str) -> List[GoodreadsBook]:
|
|
"""Get all books on a specific shelf."""
|
|
return [book for book in self.books if book.has_shelf(shelf_name)]
|
|
|
|
def get_books_read_in_period(self, days: int) -> List[GoodreadsBook]:
|
|
"""Get books read in the last N days."""
|
|
cutoff = datetime.now() - timedelta(days=days)
|
|
return [book for book in self.books
|
|
if book.date_read and book.date_read >= cutoff]
|
|
|
|
def get_books_read_in_year(self, year: int) -> List[GoodreadsBook]:
|
|
"""Get books read in a specific year."""
|
|
return [book for book in self.books
|
|
if book.date_read and book.date_read.year == year]
|
|
|
|
def get_books_added_in_period(self, days: int) -> List[GoodreadsBook]:
|
|
"""Get books added to library in the last N days."""
|
|
cutoff = datetime.now() - timedelta(days=days)
|
|
return [book for book in self.books
|
|
if book.date_added and book.date_added >= cutoff]
|
|
|
|
def get_series_books(self, series_name: str) -> List[GoodreadsBook]:
|
|
"""Get all books in a series, sorted by series index."""
|
|
books = [book for book in self.books if book.series == series_name]
|
|
books.sort(key=lambda b: b.series_index or 0)
|
|
return books
|
|
|
|
def get_all_series(self) -> Dict[str, List[GoodreadsBook]]:
|
|
"""Get all series with their books."""
|
|
series_dict = {}
|
|
for book in self.books:
|
|
if book.series:
|
|
if book.series not in series_dict:
|
|
series_dict[book.series] = []
|
|
series_dict[book.series].append(book)
|
|
|
|
# Sort books within each series
|
|
for series in series_dict:
|
|
series_dict[series].sort(key=lambda b: b.series_index or 0)
|
|
|
|
return series_dict
|
|
|
|
def get_incomplete_series(self) -> Dict[str, Dict]:
|
|
"""Get series where at least one book is read but not all."""
|
|
all_series = self.get_all_series()
|
|
incomplete = {}
|
|
|
|
for series_name, books in all_series.items():
|
|
read_count = sum(1 for b in books if b.is_read)
|
|
total_count = len(books)
|
|
|
|
if read_count > 0 and read_count < total_count:
|
|
# Find next unread book
|
|
next_unread = None
|
|
for book in books:
|
|
if not book.is_read:
|
|
next_unread = book
|
|
break
|
|
|
|
incomplete[series_name] = {
|
|
'books': books,
|
|
'read_count': read_count,
|
|
'total_count': total_count,
|
|
'next_book': next_unread
|
|
}
|
|
|
|
return incomplete
|
|
|
|
def get_author_stats(self) -> List[tuple[str, int]]:
|
|
"""Get author statistics (author, book count) sorted by count."""
|
|
author_counts = {}
|
|
for book in self.books:
|
|
if book.is_read:
|
|
author_counts[book.author] = author_counts.get(book.author, 0) + 1
|
|
|
|
return sorted(author_counts.items(), key=lambda x: x[1], reverse=True)
|
|
|
|
def get_rating_distribution(self) -> Dict[int, int]:
|
|
"""Get distribution of user ratings."""
|
|
dist = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
|
|
for book in self.books:
|
|
if book.is_read and book.my_rating:
|
|
dist[book.my_rating] = dist.get(book.my_rating, 0) + 1
|
|
return dist
|