More work on post times
This commit is contained in:
parent
e053e01e3a
commit
c4a25c1683
3 changed files with 66 additions and 28 deletions
|
|
@ -549,6 +549,11 @@ class Database:
|
||||||
)
|
)
|
||||||
entries_results = entries_cursor.fetchmany()
|
entries_results = entries_cursor.fetchmany()
|
||||||
|
|
||||||
|
def get_max_post_time(self) -> int:
|
||||||
|
"""Get the latest any entry was recorded."""
|
||||||
|
with self.db:
|
||||||
|
return self._get_max_post_time(feed_url=None)
|
||||||
|
|
||||||
def _get_property(self, prop: str, default=None) -> typing.Any:
|
def _get_property(self, prop: str, default=None) -> typing.Any:
|
||||||
cursor = self.db.execute("SELECT value FROM properties WHERE name=?", (prop,))
|
cursor = self.db.execute("SELECT value FROM properties WHERE name=?", (prop,))
|
||||||
result = cursor.fetchone()
|
result = cursor.fetchone()
|
||||||
|
|
@ -618,11 +623,17 @@ class Database:
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_max_post_time(self, feed_url: str) -> int:
|
def _get_max_post_time(self, feed_url: str | None) -> int:
|
||||||
cursor = self.db.execute(
|
if feed_url:
|
||||||
"SELECT MAX(COALESCE(posted_at, inserted_at)) FROM entries WHERE feed_url=?",
|
cursor = self.db.execute(
|
||||||
[feed_url],
|
"SELECT MAX(COALESCE(posted_at, inserted_at)) FROM entries WHERE feed_url=?",
|
||||||
)
|
[feed_url],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
cursor = self.db.execute(
|
||||||
|
"SELECT MAX(COALESCE(posted_at, inserted_at)) FROM entries"
|
||||||
|
)
|
||||||
|
|
||||||
result = cursor.fetchone()
|
result = cursor.fetchone()
|
||||||
if result is None or result[0] is None:
|
if result is None or result[0] is None:
|
||||||
return 0
|
return 0
|
||||||
|
|
|
||||||
54
cry/feed.py
54
cry/feed.py
|
|
@ -77,15 +77,24 @@ class Entry:
|
||||||
link: str | None
|
link: str | None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_parsed(cls, entry: feedparser.FeedParserDict, insert_time: int) -> "Entry":
|
def from_parsed(
|
||||||
|
cls,
|
||||||
|
entry: feedparser.FeedParserDict,
|
||||||
|
insert_time: int,
|
||||||
|
last_post_time: int,
|
||||||
|
) -> "Entry":
|
||||||
"""Convert an entry from feedparser into an Entry by extracting the
|
"""Convert an entry from feedparser into an Entry by extracting the
|
||||||
things we care about, fudging things and substituting things as
|
things we care about, fudging things and substituting things as
|
||||||
necessary.
|
necessary.
|
||||||
|
|
||||||
The one thing we need from the outside is the "insert time", which
|
We also need to correct the post's timestamps, possibly, and to do
|
||||||
is *almost* `int(time.time())` but needs a little bit of fudging in
|
that we have two times from the outside: insert_time and last_post_time.
|
||||||
order to ensure that we can keep the items in order when we get a lot
|
insert_time is a monotonic clock that mostly measures "now", and
|
||||||
of them all at once.
|
last_post_time is mostly the time of the latest post in the feed.
|
||||||
|
|
||||||
|
(The reason for "mostly" above is that they're actually ticked
|
||||||
|
forward a little in order to make sure that entries without
|
||||||
|
timestamps maintain their feed order.)
|
||||||
"""
|
"""
|
||||||
title = entry.get("title")
|
title = entry.get("title")
|
||||||
if not title:
|
if not title:
|
||||||
|
|
@ -114,10 +123,11 @@ class Entry:
|
||||||
if published is not None:
|
if published is not None:
|
||||||
assert isinstance(published, tuple)
|
assert isinstance(published, tuple)
|
||||||
|
|
||||||
# NOTE: Take insert_time if it's smaller; publish time errors generate
|
# NOTE: Take last_post_time if it's bigger, so that we don't get
|
||||||
# posts from the future.
|
# new entries from the past, and insert_time if it's smaller;
|
||||||
|
# publish time errors generate posts from the future.
|
||||||
pub_time = int(calendar.timegm(published) * 1000)
|
pub_time = int(calendar.timegm(published) * 1000)
|
||||||
posted_at = min(pub_time, insert_time)
|
posted_at = min(max(pub_time, last_post_time), insert_time)
|
||||||
else:
|
else:
|
||||||
posted_at = insert_time
|
posted_at = insert_time
|
||||||
|
|
||||||
|
|
@ -166,7 +176,12 @@ class Feed:
|
||||||
entries: list[Entry]
|
entries: list[Entry]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_parsed(cls, d: feedparser.FeedParserDict, meta: FeedMeta) -> "Feed":
|
def from_parsed(
|
||||||
|
cls,
|
||||||
|
d: feedparser.FeedParserDict,
|
||||||
|
meta: FeedMeta,
|
||||||
|
last_post_time: int,
|
||||||
|
) -> "Feed":
|
||||||
title = None
|
title = None
|
||||||
link = None
|
link = None
|
||||||
|
|
||||||
|
|
@ -225,16 +240,16 @@ class Feed:
|
||||||
# `retry_after_ts` field, etc.) it's not a very likely thing to
|
# `retry_after_ts` field, etc.) it's not a very likely thing to
|
||||||
# happen.
|
# happen.
|
||||||
#
|
#
|
||||||
# The *other* big source of time instability is that "new" items might
|
# The *other* big source of time instability is that "new" items
|
||||||
# seem to have been published with a time that is "before" the last
|
# might seem to have been published with a time that is "before" the
|
||||||
# item we previously saw. (i.e., on the first refresh we see an item
|
# last item we previously saw. (i.e., on the first refresh we see an
|
||||||
# from October 3rd, then on the next refresh we see an item from October
|
# item from October 3rd, then on the next refresh we see an item from
|
||||||
# 1st.) We don't know anything about historical refreshes here in feed
|
# October 1st.) That value comes in as `last_post_time`, and if we see a
|
||||||
# land, so that gets corrected in the database. (See store_feed.)
|
# time less than that it is bumped up to that time plus the entry's index.
|
||||||
#
|
#
|
||||||
insert_time = int(time.time()) * 1000
|
insert_time = int(time.time()) * 1000
|
||||||
entries = [
|
entries = [
|
||||||
Entry.from_parsed(e, insert_time + i)
|
Entry.from_parsed(e, insert_time + i, last_post_time + i)
|
||||||
for i, e in enumerate(reversed(d.entries))
|
for i, e in enumerate(reversed(d.entries))
|
||||||
]
|
]
|
||||||
entries.reverse()
|
entries.reverse()
|
||||||
|
|
@ -420,7 +435,10 @@ class Guardian:
|
||||||
GUARDIAN = Guardian()
|
GUARDIAN = Guardian()
|
||||||
|
|
||||||
|
|
||||||
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta]:
|
async def fetch_feed(
|
||||||
|
meta: FeedMeta,
|
||||||
|
last_post_time: int = 0,
|
||||||
|
) -> typing.Tuple[None | Feed | str, FeedMeta]:
|
||||||
"""Fetch a feed from the internet. `meta` is a FeedMeta that has all the
|
"""Fetch a feed from the internet. `meta` is a FeedMeta that has all the
|
||||||
details about what happened the last time we went to do a fetch, caching
|
details about what happened the last time we went to do a fetch, caching
|
||||||
information and whatnot.
|
information and whatnot.
|
||||||
|
|
@ -567,7 +585,7 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta
|
||||||
# Does this seem to be a feed? Or not?
|
# Does this seem to be a feed? Or not?
|
||||||
if could_be_feed_data(response.text):
|
if could_be_feed_data(response.text):
|
||||||
parsed = feedparser.parse(response.content, response_headers=response.headers)
|
parsed = feedparser.parse(response.content, response_headers=response.headers)
|
||||||
return (Feed.from_parsed(parsed, meta), meta)
|
return (Feed.from_parsed(parsed, meta, last_post_time), meta)
|
||||||
|
|
||||||
# No this is not a feed, just return the content out for further
|
# No this is not a feed, just return the content out for further
|
||||||
# processing.
|
# processing.
|
||||||
|
|
|
||||||
19
cry/web.py
19
cry/web.py
|
|
@ -253,11 +253,15 @@ def background_task(
|
||||||
def refresh_feeds(sink: EventChannel):
|
def refresh_feeds(sink: EventChannel):
|
||||||
"""Refresh all the subscribed feeds."""
|
"""Refresh all the subscribed feeds."""
|
||||||
|
|
||||||
async def _refresh_meta(db: database.Database, meta: feed.FeedMeta):
|
async def _refresh_meta(
|
||||||
|
db: database.Database,
|
||||||
|
meta: feed.FeedMeta,
|
||||||
|
last_post_time: int,
|
||||||
|
):
|
||||||
sink.log(f"[{meta.url}] Fetching...")
|
sink.log(f"[{meta.url}] Fetching...")
|
||||||
d = None
|
d = None
|
||||||
try:
|
try:
|
||||||
d, meta = await feed.fetch_feed(meta)
|
d, meta = await feed.fetch_feed(meta, last_post_time)
|
||||||
if d is None:
|
if d is None:
|
||||||
sink.log(f"[{meta.url}] No updates")
|
sink.log(f"[{meta.url}] No updates")
|
||||||
db.update_meta(meta)
|
db.update_meta(meta)
|
||||||
|
|
@ -274,10 +278,14 @@ def refresh_feeds(sink: EventChannel):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
sink.log(f"[{meta.url}] Error refressing feed: {e}")
|
sink.log(f"[{meta.url}] Error refressing feed: {e}")
|
||||||
|
|
||||||
async def _refresh_all(db: database.Database, metas: list[feed.FeedMeta]):
|
async def _refresh_all(
|
||||||
|
db: database.Database,
|
||||||
|
metas: list[feed.FeedMeta],
|
||||||
|
last_post_time: int,
|
||||||
|
):
|
||||||
async with asyncio.TaskGroup() as group:
|
async with asyncio.TaskGroup() as group:
|
||||||
for meta in metas:
|
for meta in metas:
|
||||||
group.create_task(_refresh_meta(db, meta))
|
group.create_task(_refresh_meta(db, meta, last_post_time))
|
||||||
|
|
||||||
db = database.Database.local()
|
db = database.Database.local()
|
||||||
sink.status("Synchronizing state...")
|
sink.status("Synchronizing state...")
|
||||||
|
|
@ -285,9 +293,10 @@ def refresh_feeds(sink: EventChannel):
|
||||||
|
|
||||||
sink.status("Loading subscriptions...")
|
sink.status("Loading subscriptions...")
|
||||||
metas = db.load_all_meta()
|
metas = db.load_all_meta()
|
||||||
|
last_post_time = db.get_max_post_time()
|
||||||
|
|
||||||
sink.status("Refreshing subscriptions...")
|
sink.status("Refreshing subscriptions...")
|
||||||
asyncio.run(_refresh_all(db, metas))
|
asyncio.run(_refresh_all(db, metas, last_post_time))
|
||||||
|
|
||||||
sink.status("Done")
|
sink.status("Done")
|
||||||
sink.redirect("/")
|
sink.redirect("/")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue