More work on post times

This commit is contained in:
John Doty 2025-01-02 08:15:05 -08:00
parent e053e01e3a
commit c4a25c1683
3 changed files with 66 additions and 28 deletions

View file

@ -549,6 +549,11 @@ class Database:
) )
entries_results = entries_cursor.fetchmany() entries_results = entries_cursor.fetchmany()
def get_max_post_time(self) -> int:
    """Get the latest any entry was recorded.

    Returns:
        The maximum of COALESCE(posted_at, inserted_at) across ALL
        entries (milliseconds since epoch, per the values stored by the
        feed module), or 0 when the entries table is empty — see
        `_get_max_post_time`, which this wraps with `feed_url=None` to
        query across every feed rather than a single one.
    """
    # `with self.db` runs the read inside a transaction context on the
    # sqlite3 connection; it commits/rolls back on exit.
    # NOTE(review): assumes self.db is a sqlite3.Connection — confirm
    # against the class initializer (not visible in this diff).
    with self.db:
        return self._get_max_post_time(feed_url=None)
def _get_property(self, prop: str, default=None) -> typing.Any: def _get_property(self, prop: str, default=None) -> typing.Any:
cursor = self.db.execute("SELECT value FROM properties WHERE name=?", (prop,)) cursor = self.db.execute("SELECT value FROM properties WHERE name=?", (prop,))
result = cursor.fetchone() result = cursor.fetchone()
@ -618,11 +623,17 @@ class Database:
], ],
) )
def _get_max_post_time(self, feed_url: str) -> int: def _get_max_post_time(self, feed_url: str | None) -> int:
cursor = self.db.execute( if feed_url:
"SELECT MAX(COALESCE(posted_at, inserted_at)) FROM entries WHERE feed_url=?", cursor = self.db.execute(
[feed_url], "SELECT MAX(COALESCE(posted_at, inserted_at)) FROM entries WHERE feed_url=?",
) [feed_url],
)
else:
cursor = self.db.execute(
"SELECT MAX(COALESCE(posted_at, inserted_at)) FROM entries"
)
result = cursor.fetchone() result = cursor.fetchone()
if result is None or result[0] is None: if result is None or result[0] is None:
return 0 return 0

View file

@ -77,15 +77,24 @@ class Entry:
link: str | None link: str | None
@classmethod @classmethod
def from_parsed(cls, entry: feedparser.FeedParserDict, insert_time: int) -> "Entry": def from_parsed(
cls,
entry: feedparser.FeedParserDict,
insert_time: int,
last_post_time: int,
) -> "Entry":
"""Convert an entry from feedparser into an Entry by extracting the """Convert an entry from feedparser into an Entry by extracting the
things we care about, fudging things and substituting things as things we care about, fudging things and substituting things as
necessary. necessary.
The one thing we need from the outside is the "insert time", which We also need to correct the post's timestamps, possibly, and to do
is *almost* `int(time.time())` but needs a little bit of fudging in that we have two times from the outside: insert_time and last_post_time.
order to ensure that we can keep the items in order when we get a lot insert_time is a monotonic clock that mostly measures "now", and
of them all at once. last_post_time is mostly the time of the latest post in the feed.
(The reason for "mostly" above is that they're actually ticked
forward a little in order to make sure that entries without
timestamps maintain their feed order.)
""" """
title = entry.get("title") title = entry.get("title")
if not title: if not title:
@ -114,10 +123,11 @@ class Entry:
if published is not None: if published is not None:
assert isinstance(published, tuple) assert isinstance(published, tuple)
# NOTE: Take insert_time if it's smaller; publish time errors generate # NOTE: Take last_post_time if it's bigger, so that we don't get
# posts from the future. # new entries from the past, and insert_time if it's smaller;
# publish time errors generate posts from the future.
pub_time = int(calendar.timegm(published) * 1000) pub_time = int(calendar.timegm(published) * 1000)
posted_at = min(pub_time, insert_time) posted_at = min(max(pub_time, last_post_time), insert_time)
else: else:
posted_at = insert_time posted_at = insert_time
@ -166,7 +176,12 @@ class Feed:
entries: list[Entry] entries: list[Entry]
@classmethod @classmethod
def from_parsed(cls, d: feedparser.FeedParserDict, meta: FeedMeta) -> "Feed": def from_parsed(
cls,
d: feedparser.FeedParserDict,
meta: FeedMeta,
last_post_time: int,
) -> "Feed":
title = None title = None
link = None link = None
@ -225,16 +240,16 @@ class Feed:
# `retry_after_ts` field, etc.) it's not a very likely thing to # `retry_after_ts` field, etc.) it's not a very likely thing to
# happen. # happen.
# #
# The *other* big source of time instability is that "new" items might # The *other* big source of time instability is that "new" items
# seem to have been published with a time that is "before" the last # might seem to have been published with a time that is "before" the
# item we previously saw. (i.e., on the first refresh we see an item # last item we previously saw. (i.e., on the first refresh we see an
# from October 3rd, then on the next refresh we see an item from October # item from October 3rd, then on the next refresh we see an item from
# 1st.) We don't know anything about historical refreshes here in feed # October 1st.) That value comes in as "last_post", and if we see a
# land, so that gets corrected in the database. (See store_feed.) # time less than that it is bumped up to that time + 1.
# #
insert_time = int(time.time()) * 1000 insert_time = int(time.time()) * 1000
entries = [ entries = [
Entry.from_parsed(e, insert_time + i) Entry.from_parsed(e, insert_time + i, last_post_time + i)
for i, e in enumerate(reversed(d.entries)) for i, e in enumerate(reversed(d.entries))
] ]
entries.reverse() entries.reverse()
@ -420,7 +435,10 @@ class Guardian:
GUARDIAN = Guardian() GUARDIAN = Guardian()
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta]: async def fetch_feed(
meta: FeedMeta,
last_post_time: int = 0,
) -> typing.Tuple[None | Feed | str, FeedMeta]:
"""Fetch a feed from the internet. `meta` is a FeedMeta that has all the """Fetch a feed from the internet. `meta` is a FeedMeta that has all the
details about what happened the last time we went to do a fetch, caching details about what happened the last time we went to do a fetch, caching
information and whatnot. information and whatnot.
@ -567,7 +585,7 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta
# Does this seem to be a feed? Or not? # Does this seem to be a feed? Or not?
if could_be_feed_data(response.text): if could_be_feed_data(response.text):
parsed = feedparser.parse(response.content, response_headers=response.headers) parsed = feedparser.parse(response.content, response_headers=response.headers)
return (Feed.from_parsed(parsed, meta), meta) return (Feed.from_parsed(parsed, meta, last_post_time), meta)
# No this is not a feed, just return the content out for further # No this is not a feed, just return the content out for further
# processing. # processing.

View file

@ -253,11 +253,15 @@ def background_task(
def refresh_feeds(sink: EventChannel): def refresh_feeds(sink: EventChannel):
"""Refresh all the subscribed feeds.""" """Refresh all the subscribed feeds."""
async def _refresh_meta(db: database.Database, meta: feed.FeedMeta): async def _refresh_meta(
db: database.Database,
meta: feed.FeedMeta,
last_post_time: int,
):
sink.log(f"[{meta.url}] Fetching...") sink.log(f"[{meta.url}] Fetching...")
d = None d = None
try: try:
d, meta = await feed.fetch_feed(meta) d, meta = await feed.fetch_feed(meta, last_post_time)
if d is None: if d is None:
sink.log(f"[{meta.url}] No updates") sink.log(f"[{meta.url}] No updates")
db.update_meta(meta) db.update_meta(meta)
@ -274,10 +278,14 @@ def refresh_feeds(sink: EventChannel):
except Exception as e: except Exception as e:
sink.log(f"[{meta.url}] Error refressing feed: {e}") sink.log(f"[{meta.url}] Error refressing feed: {e}")
async def _refresh_all(db: database.Database, metas: list[feed.FeedMeta]): async def _refresh_all(
db: database.Database,
metas: list[feed.FeedMeta],
last_post_time: int,
):
async with asyncio.TaskGroup() as group: async with asyncio.TaskGroup() as group:
for meta in metas: for meta in metas:
group.create_task(_refresh_meta(db, meta)) group.create_task(_refresh_meta(db, meta, last_post_time))
db = database.Database.local() db = database.Database.local()
sink.status("Synchronizing state...") sink.status("Synchronizing state...")
@ -285,9 +293,10 @@ def refresh_feeds(sink: EventChannel):
sink.status("Loading subscriptions...") sink.status("Loading subscriptions...")
metas = db.load_all_meta() metas = db.load_all_meta()
last_post_time = db.get_max_post_time()
sink.status("Refreshing subscriptions...") sink.status("Refreshing subscriptions...")
asyncio.run(_refresh_all(db, metas)) asyncio.run(_refresh_all(db, metas, last_post_time))
sink.status("Done") sink.status("Done")
sink.redirect("/") sink.redirect("/")