Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:58:28 +08:00
commit e60768ac8e
10 changed files with 1020 additions and 0 deletions

View File

@@ -0,0 +1,243 @@
#!/usr/bin/env python3
"""Library for parsing and querying Goodreads CSV exports."""
import csv
import os
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Dict, Optional, Callable
import re
class GoodreadsBook:
    """Represents one book row from a Goodreads CSV export.

    Text columns are stored as strings (empty when the column is missing),
    numeric and date columns are stored as ``int`` / ``float`` / ``datetime``
    or ``None`` when empty or unparseable.  ``series`` and ``series_index``
    are derived from a trailing ``(Series Name, #N)`` suffix in the title.
    """

    # Trailing "(Series Name, #N)" suffix.  The name may not contain
    # parentheses, so an earlier parenthesised part of the title can never be
    # swallowed into the series name; it may contain commas (the last comma
    # before '#' is the separator).
    _SERIES_PATTERN = re.compile(r'\(([^()]+),\s*#([\d.]+)\)\s*$')

    def __init__(self, row: Dict[str, str]):
        """Build the book from a single ``csv.DictReader`` row.

        Args:
            row: Mapping of Goodreads column header to raw cell text.  Missing
                keys and ``None`` values are both treated as empty cells.
        """
        def text(column: str) -> str:
            # csv.DictReader fills cells missing from a short row with None;
            # normalise to '' so the string helpers never see a non-string.
            return row.get(column) or ''

        self.book_id = text('Book Id')
        self.title = text('Title')
        self.author = text('Author')
        self.author_lf = text('Author l-f')
        self.additional_authors = text('Additional Authors')
        self.isbn = self._clean_isbn(text('ISBN'))
        self.isbn13 = self._clean_isbn(text('ISBN13'))
        self.my_rating = self._parse_int(text('My Rating'))
        self.average_rating = self._parse_float(text('Average Rating'))
        self.publisher = text('Publisher')
        self.binding = text('Binding')
        self.num_pages = self._parse_int(text('Number of Pages'))
        self.year_published = self._parse_int(text('Year Published'))
        self.original_publication_year = self._parse_int(text('Original Publication Year'))
        self.date_read = self._parse_date(text('Date Read'))
        self.date_added = self._parse_date(text('Date Added'))
        self.bookshelves = text('Bookshelves')
        self.bookshelves_with_positions = text('Bookshelves with positions')
        self.exclusive_shelf = text('Exclusive Shelf')
        self.my_review = text('My Review')
        self.spoiler = text('Spoiler')
        self.private_notes = text('Private Notes')
        self.read_count = self._parse_int(text('Read Count'))
        self.owned_copies = self._parse_int(text('Owned Copies'))
        # Parse series information from title
        self.series, self.series_index = self._parse_series()

    def _clean_isbn(self, isbn: str) -> str:
        """Strip the ``="..."`` wrapper from an ISBN cell.

        Returns the inner text for wrapped values, the value unchanged
        otherwise, and ``''`` for an empty or ``None`` cell.
        """
        if not isbn:
            return ''
        if isbn.startswith('="') and isbn.endswith('"'):
            return isbn[2:-1]
        return isbn

    def _parse_int(self, value: str) -> Optional[int]:
        """Parse an integer cell; return ``None`` if empty or invalid."""
        if not value:
            return None
        try:
            return int(value)
        except ValueError:
            return None

    def _parse_float(self, value: str) -> Optional[float]:
        """Parse a float cell; return ``None`` if empty or invalid."""
        if not value:
            return None
        try:
            return float(value)
        except ValueError:
            return None

    def _parse_date(self, value: str) -> Optional[datetime]:
        """Parse a ``YYYY/MM/DD`` cell; return ``None`` if empty or invalid.

        Surrounding whitespace is ignored.
        """
        if not value:
            return None
        try:
            return datetime.strptime(value.strip(), '%Y/%m/%d')
        except ValueError:
            return None

    def _parse_series(self) -> tuple[Optional[str], Optional[float]]:
        """Extract series name and number from the end of the title.

        Examples:
        - "An Absolutely Remarkable Thing (The Carls, #1)" -> ("The Carls", 1.0)
        - "The Three-Body Problem (Remembrance of Earth's Past, #1)" -> ("Remembrance of Earth's Past", 1.0)
        - "Foo (Bar) Baz (Quux, #2)" -> ("Quux", 2.0)

        Returns:
            ``(name, index)``; ``(name, None)`` when the number is not a valid
            float (e.g. "1.2.3"); ``(None, None)`` when the title has no
            series suffix.
        """
        match = self._SERIES_PATTERN.search(self.title or '')
        if match is None:
            return None, None
        series_name = match.group(1).strip()
        try:
            return series_name, float(match.group(2))
        except ValueError:
            return series_name, None

    @property
    def is_read(self) -> bool:
        """Whether the book has been read.

        True when the exclusive shelf is exactly ``read`` or a read date is
        recorded; a row can sit on the ``read`` shelf with an empty
        ``Date Read`` cell, so the date alone is not a reliable signal.
        """
        return self.exclusive_shelf.strip() == 'read' or self.date_read is not None

    @property
    def is_tbr(self) -> bool:
        """Whether the book's exclusive shelf is exactly ``to-read``."""
        return self.exclusive_shelf.strip() == 'to-read'

    @property
    def is_currently_reading(self) -> bool:
        """Whether the book's exclusive shelf is exactly ``currently-reading``."""
        return self.exclusive_shelf.strip() == 'currently-reading'

    def has_shelf(self, shelf_name: str) -> bool:
        """Check whether the book is on the shelf named *shelf_name*.

        The name is compared (case-sensitively, ignoring surrounding
        whitespace) against each comma-separated entry of ``bookshelves`` and
        against ``exclusive_shelf``.  Whole names are compared, so "fiction"
        does not match a book shelved only as "non-fiction".  An empty name
        never matches.
        """
        wanted = (shelf_name or '').strip()
        if not wanted:
            return False
        if wanted == self.exclusive_shelf.strip():
            return True
        return any(wanted == shelf.strip() for shelf in self.bookshelves.split(','))

    def __repr__(self):
        return f"<GoodreadsBook: {self.title} by {self.author}>"
class GoodreadsLibrary:
    """Main class for querying a Goodreads library loaded from a CSV export.

    The whole file is parsed once at construction time into ``self.books``;
    every query method filters that in-memory list.
    """

    def __init__(self, csv_path: Optional[str] = None):
        """Initialize library from CSV file.

        Args:
            csv_path: Path to goodreads_library_export.csv
                Defaults to ~/Drive/Claude/books/goodreads_library_export.csv

        Raises:
            OSError: If the CSV file cannot be opened.
        """
        if csv_path is None:
            csv_path = os.path.expanduser('~/Drive/Claude/books/goodreads_library_export.csv')
        self.csv_path = csv_path
        self.books: List["GoodreadsBook"] = []
        self._load_books()

    def _load_books(self):
        """(Re)load ``self.books`` from ``self.csv_path``.

        The list is rebuilt rather than extended, so calling this again does
        not duplicate entries.
        """
        # newline='' lets the csv module handle line breaks inside quoted
        # fields itself; utf-8-sig transparently drops a leading BOM, which
        # would otherwise become part of the first column header.
        with open(self.csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            self.books = [GoodreadsBook(row) for row in csv.DictReader(f)]

    @staticmethod
    def _series_order(book) -> float:
        """Sort key for series ordering; books without an index sort as 0."""
        return book.series_index or 0

    def query(self, filter_func: Callable[["GoodreadsBook"], bool]) -> List["GoodreadsBook"]:
        """Return the books for which *filter_func* returns a truthy value."""
        return [book for book in self.books if filter_func(book)]

    def get_read_books(self, limit: Optional[int] = None,
                       sort_by_date: bool = True) -> List["GoodreadsBook"]:
        """Get all read books, optionally sorted by date read.

        Args:
            limit: Maximum number of books to return; ``None`` means no
                limit and ``0`` returns an empty list.  Expected to be
                non-negative.
            sort_by_date: When true, order newest read date first; books
                without a read date sort last.

        Returns:
            A new list of the matching books.
        """
        books = [book for book in self.books if book.is_read]
        if sort_by_date:
            books.sort(key=lambda b: b.date_read or datetime.min, reverse=True)
        # Compare against None explicitly: a truthiness test would treat
        # limit=0 as "no limit" and return everything.
        if limit is not None:
            books = books[:limit]
        return books

    def get_tbr_books(self) -> List["GoodreadsBook"]:
        """Get all to-be-read books."""
        return [book for book in self.books if book.is_tbr]

    def get_books_by_shelf(self, shelf_name: str) -> List["GoodreadsBook"]:
        """Get all books for which ``has_shelf(shelf_name)`` is true."""
        return [book for book in self.books if book.has_shelf(shelf_name)]

    def get_books_read_in_period(self, days: int) -> List["GoodreadsBook"]:
        """Get books whose read date falls within the last *days* days.

        Books without a read date are never included.
        """
        cutoff = datetime.now() - timedelta(days=days)
        return [book for book in self.books
                if book.date_read and book.date_read >= cutoff]

    def get_books_read_in_year(self, year: int) -> List["GoodreadsBook"]:
        """Get books whose read date is in calendar year *year*."""
        return [book for book in self.books
                if book.date_read and book.date_read.year == year]

    def get_books_added_in_period(self, days: int) -> List["GoodreadsBook"]:
        """Get books added to the library within the last *days* days.

        Books without an added date are never included.
        """
        cutoff = datetime.now() - timedelta(days=days)
        return [book for book in self.books
                if book.date_added and book.date_added >= cutoff]

    def get_series_books(self, series_name: str) -> List["GoodreadsBook"]:
        """Get all books whose series equals *series_name*, sorted by series index."""
        return sorted((book for book in self.books if book.series == series_name),
                      key=self._series_order)

    def get_all_series(self) -> Dict[str, List["GoodreadsBook"]]:
        """Group books by series name.

        Returns:
            Mapping of series name to its books sorted by series index.
            Books without a series are omitted.
        """
        series_dict: Dict[str, List["GoodreadsBook"]] = {}
        for book in self.books:
            if book.series:
                series_dict.setdefault(book.series, []).append(book)
        for books in series_dict.values():
            books.sort(key=self._series_order)
        return series_dict

    def get_incomplete_series(self) -> Dict[str, Dict]:
        """Get series where at least one book is read but not all.

        Returns:
            Mapping of series name to a dict with keys ``books`` (all books
            of the series in index order), ``read_count``, ``total_count``
            and ``next_book`` (the first unread book in index order).
        """
        incomplete = {}
        for series_name, books in self.get_all_series().items():
            read_count = sum(1 for b in books if b.is_read)
            total_count = len(books)
            if 0 < read_count < total_count:
                incomplete[series_name] = {
                    'books': books,
                    'read_count': read_count,
                    'total_count': total_count,
                    'next_book': next((b for b in books if not b.is_read), None),
                }
        return incomplete

    def get_author_stats(self) -> List[tuple[str, int]]:
        """Get ``(author, read book count)`` pairs, highest count first.

        Only read books are counted; authors with equal counts keep the
        order in which they first appear in the library.
        """
        author_counts: Dict[str, int] = {}
        for book in self.books:
            if book.is_read:
                author_counts[book.author] = author_counts.get(book.author, 0) + 1
        return sorted(author_counts.items(), key=lambda x: x[1], reverse=True)

    def get_rating_distribution(self) -> Dict[int, int]:
        """Get the distribution of user ratings over read books.

        Returns:
            Mapping of rating to count, always containing keys 1 through 5.
            Books with no rating or a rating of 0 are skipped.
        """
        dist = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
        for book in self.books:
            if book.is_read and book.my_rating:
                dist[book.my_rating] = dist.get(book.my_rating, 0) + 1
        return dist