From 275926dffe73a25b907a9c5d9ea8263e61b22a03 Mon Sep 17 00:00:00 2001 From: John Doty Date: Fri, 19 Jul 2024 07:30:14 -0700 Subject: [PATCH 1/2] Some database refactor --- cry/cli.py | 8 +- cry/database.py | 161 +++++++++++++++++++---------------------- tests/test_database.py | 4 +- 3 files changed, 79 insertions(+), 94 deletions(-) diff --git a/cry/cli.py b/cry/cli.py index 63601ba..86f0d69 100644 --- a/cry/cli.py +++ b/cry/cli.py @@ -96,7 +96,7 @@ def subscribe(url, literal): result = d # Check to see if this URL is already in the database. - existing = db.load_feed(result.meta.url) # TODO: Replace with 'load_meta'? + existing = db.load_meta(result.meta.url) if existing is not None: click.echo(f"This feed already exists (as {result.meta.url})") return 1 @@ -129,7 +129,7 @@ def import_opml(opml_file): click.echo(f"{url} does not seem to be a feed, skipping...") continue - existing = db.load_feed(meta.url) # TODO: Replace with 'load_meta'? + existing = db.load_meta(meta.url) if existing is not None: LOG.info(f"{url} already exists (as {meta.url})") continue @@ -151,11 +151,11 @@ def refresh(url): db = database.Database.local() if url: - f = db.load_feed(url) # TODO: Replace with 'load_meta'? + f = db.load_meta(url) if f is None: click.echo(f"Not subscribed to {url}") return 1 - feeds = [f.meta] + feeds = [f] else: feeds = db.load_all_meta() diff --git a/cry/database.py b/cry/database.py index 1834245..9f5b7bd 100644 --- a/cry/database.py +++ b/cry/database.py @@ -115,23 +115,11 @@ class Database: def get_property(self, prop: str, default=None) -> typing.Any: with self.db: - cursor = self.db.execute( - "SELECT value FROM properties WHERE name=?", (prop,) - ) - result = cursor.fetchone() - if result is None: - return default - return result[0] + return self._get_property(prop, default) def set_property(self, prop: str, value): with self.db: - self.db.execute( - """ - INSERT INTO properties (name, value) VALUES (?, ?) - ON CONFLICT DO UPDATE SET value=excluded.value - """, - (prop, value), - ) + return self._set_property(prop, value) def ensure_database_schema(self): with self.db: @@ -143,15 +131,15 @@ class Database: ) """ ) - version = int(self.get_property("version", 0)) + version = int(self._get_property("version", 0)) for script in SCHEMA_STATEMENTS[version:]: for statement in script.split(";"): try: self.db.execute(statement) except Exception as e: raise Exception(f"Error executing:\n{statement}") from e - self.set_property("version", len(SCHEMA_STATEMENTS)) - self.set_property("origin", self.origin) + self._set_property("version", len(SCHEMA_STATEMENTS)) + self._set_property("origin", self.origin) def load_all_meta(self) -> list[feed.FeedMeta]: with self.db: @@ -258,7 +246,7 @@ class Database: return feeds - def load_feed(self, url: str) -> feed.Feed | None: + def load_meta(self, url: str) -> feed.FeedMeta | None: with self.db: cursor = self.db.execute( """ @@ -267,9 +255,7 @@ class Database: retry_after_ts, status, etag, - modified, - title, - link + modified FROM feeds WHERE url=? """, @@ -280,8 +266,8 @@ class Database: if row is None: return None - last_fetched_ts, retry_after_ts, status, etag, modified, title, link = row - meta = feed.FeedMeta( + last_fetched_ts, retry_after_ts, status, etag, modified = row + return feed.FeedMeta( url=url, last_fetched_ts=last_fetched_ts, retry_after_ts=retry_after_ts, @@ -290,27 +276,6 @@ class Database: modified=modified, ) - cursor = self.db.execute( - """ - SELECT - id, - inserted_at, - title, - link - FROM entries - WHERE feed_url=? - """, - [url], - ) - - rows = cursor.fetchall() - entries = [ - feed.Entry(id=id, inserted_at=inserted_at, title=title, link=link) - for id, inserted_at, title, link in rows - ] - - return feed.Feed(meta=meta, title=title, link=link, entries=entries) - def update_meta(self, f: feed.FeedMeta): with self.db: self.db.execute( @@ -373,50 +338,8 @@ class Database: ], ) - cursor = self.db.execute( - "SELECT COUNT (*) FROM entries WHERE feed_url=?", [f.meta.url] - ) - start_count = cursor.fetchone()[0] - - self.db.executemany( - """ - INSERT INTO entries ( - id, - inserted_at, - feed_url, - title, - link - ) VALUES (?, ?, ?, ?, ?) - ON CONFLICT DO UPDATE - SET - -- NOTE: This is also part of the feed merge algorithm, BUT - -- we implement it here because feeds tend to be rolling - -- windows over some external content and we don't want - -- to read and write the entire feed just to update the - -- few new items. But we can't just do ON CONFLICT DO - -- NOTHING because we *might* be storing a feed where we - -- resolved conflicts with another instance. So we want - -- to handle all the cases. (In theory we could make two - -- different INSERTs to handle the two cases but that is - -- more complexity than it is worth.) - inserted_at=MIN(inserted_at, excluded.inserted_at), - title=CASE - WHEN inserted_at < excluded.inserted_at THEN title - ELSE excluded.title - END, - link=CASE - WHEN inserted_at < excluded.inserted_at THEN link - ELSE excluded.link - END - """, - [(e.id, e.inserted_at, f.meta.url, e.title, e.link) for e in f.entries], - ) - - cursor = self.db.execute( - "SELECT COUNT (*) FROM entries WHERE feed_url=?", [f.meta.url] - ) - end_count = cursor.fetchone()[0] - return end_count - start_count + change_count = self._insert_entries(f.meta.url, f.entries) + return change_count def set_feed_status(self, url: str, status: int) -> int: with self.db: @@ -466,3 +389,65 @@ class Database: """, [feed.FEED_STATUS_UNSUBSCRIBED, int(time.time()), old_url], ) + + def _get_property(self, prop: str, default=None) -> typing.Any: + cursor = self.db.execute("SELECT value FROM properties WHERE name=?", (prop,)) + result = cursor.fetchone() + if result is None: + return default + return result[0] + + def _set_property(self, prop: str, value): + self.db.execute( + """ + INSERT INTO properties (name, value) VALUES (?, ?) + ON CONFLICT DO UPDATE SET value=excluded.value + """, + (prop, value), + ) + + def _insert_entries(self, feed_url: str, entries: list[feed.Entry]) -> int: + cursor = self.db.execute( + "SELECT COUNT (*) FROM entries WHERE feed_url=?", [feed_url] + ) + start_count = cursor.fetchone()[0] + + self.db.executemany( + """ + INSERT INTO entries ( + id, + inserted_at, + feed_url, + title, + link + ) VALUES (?, ?, ?, ?, ?) + ON CONFLICT DO UPDATE + SET + -- NOTE: This is also part of the feed merge algorithm, BUT + -- we implement it here because feeds tend to be rolling + -- windows over some external content and we don't want + -- to read and write the entire feed just to update the + -- few new items. But we can't just do ON CONFLICT DO + -- NOTHING because we *might* be storing a feed where we + -- resolved conflicts with another instance. So we want + -- to handle all the cases. (In theory we could make two + -- different INSERTs to handle the two cases but that is + -- more complexity than it is worth.) + inserted_at=MIN(inserted_at, excluded.inserted_at), + title=CASE + WHEN inserted_at < excluded.inserted_at THEN title + ELSE excluded.title + END, + link=CASE + WHEN inserted_at < excluded.inserted_at THEN link + ELSE excluded.link + END + """, + [(e.id, e.inserted_at, feed_url, e.title, e.link) for e in entries], + ) + + cursor = self.db.execute( + "SELECT COUNT (*) FROM entries WHERE feed_url=?", [feed_url] + ) + end_count = cursor.fetchone()[0] + return end_count - start_count diff --git a/tests/test_database.py b/tests/test_database.py index a6b1d1e..daa3b7b 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -106,8 +106,8 @@ def test_database_store_feed(): db.ensure_database_schema() db.store_feed(FEED) - loaded = db.load_feed(FEED.meta.url) - assert loaded == FEED + loaded_meta = db.load_meta(FEED.meta.url) + assert loaded_meta == FEED.meta def test_database_store_feed_dups(): From 463abcb92377ebce57c092ee79f96164f0dbc5c7 Mon Sep 17 00:00:00 2001 From: John Doty Date: Fri, 19 Jul 2024 08:11:10 -0700 Subject: [PATCH 2/2] Stuff --- cry/cli.py | 19 +++++-- cry/database.py | 118 +++++++++++++++++++++++++---------------- tests/test_database.py | 8 +-- 3 files changed, 90 insertions(+), 55 deletions(-) diff --git a/cry/cli.py b/cry/cli.py index 86f0d69..beaed84 100644 --- a/cry/cli.py +++ b/cry/cli.py @@ -239,11 +239,13 @@ def unsubscribe(url): `list` command.) """ db = database.Database.local() - count = db.set_feed_status(url, feed.FEED_STATUS_UNSUBSCRIBED) - if count == 0: + meta = db.load_meta(url) + if meta is None: click.echo(f"Not subscribed to feed {url}") return 1 + db.update_feed_status(meta, feed.FEED_STATUS_UNSUBSCRIBED) + @cli.command("serve") def serve(): @@ -262,6 +264,11 @@ def serve(): Subscribed Feeds +

Feeds

""" @@ -272,17 +279,19 @@ def serve(): ago = f" ({f.entries[0].time_ago()})" else: ago = "" + buffer.write(f"
") buffer.write(f'

{feed_title}{ago}

') - buffer.write(f"
") if len(f.entries) > 0: + buffer.write(f"
    ") for entry in f.entries: title = html.escape(entry.title) buffer.write( - f'{title} ({entry.time_ago()}) ' + f'
  • {title} ({entry.time_ago()})
  • ' ) + buffer.write(f"
") else: buffer.write("No entries...") - buffer.write(f"
") + buffer.write(f"
") # feed buffer.flush() text = buffer.getvalue() response = text.encode("utf-8") diff --git a/cry/database.py b/cry/database.py index 9f5b7bd..a73e971 100644 --- a/cry/database.py +++ b/cry/database.py @@ -304,55 +304,12 @@ class Database: Returns the number of new entries inserted. """ with self.db: - self.db.execute( - """ - INSERT INTO feeds ( - url, - last_fetched_ts, - retry_after_ts, - status, - etag, - modified, - title, - link - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) - ON CONFLICT DO UPDATE - SET - last_fetched_ts=excluded.last_fetched_ts, - retry_after_ts=excluded.retry_after_ts, - status=excluded.status, - etag=excluded.etag, - modified=excluded.modified, - title=excluded.title, - link=excluded.link - """, - [ - f.meta.url, - f.meta.last_fetched_ts, - f.meta.retry_after_ts, - f.meta.status, - f.meta.etag, - f.meta.modified, - f.title, - f.link, - ], - ) + self._insert_feed(f.meta, f.title, f.link) + return self._insert_entries(f.meta.url, f.entries) - change_count = self._insert_entries(f.meta.url, f.entries) - return change_count - - def set_feed_status(self, url: str, status: int) -> int: + def update_feed_status(self, meta: feed.FeedMeta, status: int) -> int: with self.db: - cursor = self.db.execute( - """ - UPDATE feeds - SET status = ?, - last_fetched_ts = ? - WHERE url = ? - """, - [status, int(time.time()), url], - ) - return cursor.rowcount + return self._update_feed_status(meta, status) def redirect_feed(self, old_url: str, new_url: str): with self.db: @@ -380,6 +337,7 @@ class Database: # bother. # Mark the old feed unsubscribed. + # TODO: Rebuild with helpers self.db.execute( """ UPDATE feeds @@ -406,6 +364,59 @@ class Database: (prop, value), ) + def _insert_feed(self, meta: feed.FeedMeta, title: str, link: str): + """Insert into the feeds table, handling collisions with UPSERT.""" + self.db.execute( + """ + INSERT INTO feeds ( + url, + last_fetched_ts, + retry_after_ts, + status, + etag, + modified, + title, + link + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT DO UPDATE + SET + last_fetched_ts=MAX(last_fetched_ts, excluded.last_fetched_ts), + retry_after_ts=MAX(retry_after_ts, excluded.retry_after_ts), + -- For all other fields, take the value that was computed by the + -- most recent fetch. + status=CASE + WHEN last_fetched_ts > excluded.last_fetched_ts THEN status + ELSE excluded.status + END, + etag=CASE + WHEN last_fetched_ts > excluded.last_fetched_ts THEN etag + ELSE excluded.etag + END, + modified=CASE + WHEN last_fetched_ts > excluded.last_fetched_ts THEN modified + ELSE excluded.modified + END, + title=CASE + WHEN last_fetched_ts > excluded.last_fetched_ts THEN title + ELSE excluded.title + END, + link=CASE + WHEN last_fetched_ts > excluded.last_fetched_ts THEN link + ELSE excluded.link + END + """, + [ + meta.url, + meta.last_fetched_ts, + meta.retry_after_ts, + meta.status, + meta.etag, + meta.modified, + title, + link, + ], + ) + def _insert_entries(self, feed_url: str, entries: list[feed.Entry]) -> int: cursor = self.db.execute( "SELECT COUNT (*) FROM entries WHERE feed_url=?", [feed_url] @@ -451,3 +462,16 @@ class Database: ) end_count = cursor.fetchone()[0] return end_count - start_count + + def _update_feed_status(self, meta: feed.FeedMeta, status: int) -> int: + new_ts = max(int(time.time()), meta.last_fetched_ts + 1) + cursor = self.db.execute( + """ + UPDATE feeds + SET status = ?, + last_fetched_ts = ? + WHERE url = ? + """, + [status, new_ts, meta.url], + ) + return cursor.rowcount diff --git a/tests/test_database.py b/tests/test_database.py index daa3b7b..511b760 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -252,16 +252,18 @@ def test_database_store_update_meta(): assert db.load_all_meta()[0] == new_meta -def test_database_set_feed_status(): +def test_database_update_feed_status(): db = database.Database(":memory:", random_slug()) db.ensure_database_schema() db.store_feed(FEED) assert db.load_all_meta()[0].status != feed.FEED_STATUS_UNSUBSCRIBED - db.set_feed_status(FEED.meta.url, feed.FEED_STATUS_UNSUBSCRIBED) + db.update_feed_status( + FEED.meta, + feed.FEED_STATUS_UNSUBSCRIBED, + ) - # TODO: Ensure that the updated time is touched too. assert db.load_all_meta()[0].status == feed.FEED_STATUS_UNSUBSCRIBED