Changed my mind about feed times
I hope I haven't broken things, we'll see after a while I guess.
This commit is contained in:
parent
95f8d6d3ff
commit
08fe7c1cf7
3 changed files with 153 additions and 58 deletions
27
cry/cli.py
27
cry/cli.py
|
|
@ -259,6 +259,33 @@ def unsubscribe(url):
|
|||
db.update_feed_status(meta, feed.FEED_STATUS_UNSUBSCRIBED)
|
||||
|
||||
|
||||
@cli.command(name="fetch")
|
||||
@click.argument("url")
|
||||
def fetch(url):
|
||||
"""Just fetch a feed and display the entries.
|
||||
|
||||
Nothing local is updated.
|
||||
"""
|
||||
meta = feed.FeedMeta.from_url(url)
|
||||
|
||||
click.echo(f"Fetching {url}...")
|
||||
d, _ = asyncio.run(feed.fetch_feed(meta))
|
||||
|
||||
if d is None:
|
||||
click.echo("No changes. (?)")
|
||||
elif isinstance(d, str):
|
||||
click.echo(f"WARNING: {url} returned a non-feed result!")
|
||||
click.echo(d)
|
||||
else:
|
||||
click.echo(f"{d.title}")
|
||||
if len(d.entries) > 0:
|
||||
for entry in d.entries:
|
||||
click.echo(f" {entry.title} ({entry.time_ago()})")
|
||||
else:
|
||||
click.echo(f" <No Entries>")
|
||||
click.echo()
|
||||
|
||||
|
||||
@cli.command("serve")
|
||||
def serve():
|
||||
web.serve()
|
||||
|
|
|
|||
108
cry/database.py
108
cry/database.py
|
|
@ -91,16 +91,8 @@ SCHEMA_STATEMENTS = [
|
|||
UPDATE properties SET value=value + 1 WHERE name='clock';
|
||||
END;
|
||||
|
||||
CREATE TRIGGER update_clock_on_entries_update
|
||||
AFTER UPDATE ON entries
|
||||
WHEN (NEW.id IS NOT OLD.id)
|
||||
OR (NEW.inserted_at IS NOT OLD.inserted_at)
|
||||
OR (NEW.feed_url IS NOT OLD.feed_url)
|
||||
OR (NEW.title IS NOT OLD.title)
|
||||
OR (NEW.link IS NOT OLD.link)
|
||||
BEGIN
|
||||
UPDATE properties SET value=value + 1 WHERE name='clock';
|
||||
END;
|
||||
-- Superceded by later definition, no need to re-run this.
|
||||
-- CREATE TRIGGER update_clock_on_entries_update
|
||||
""",
|
||||
"""
|
||||
CREATE TABLE sync_status (
|
||||
|
|
@ -108,6 +100,22 @@ SCHEMA_STATEMENTS = [
|
|||
clock INT NOT NULL
|
||||
);
|
||||
""",
|
||||
"""
|
||||
ALTER TABLE entries ADD COLUMN posted_at INTEGER;
|
||||
|
||||
DROP TRIGGER IF EXISTS update_clock_on_entries_update;
|
||||
CREATE TRIGGER update_clock_on_entries_update
|
||||
AFTER UPDATE ON entries
|
||||
WHEN (NEW.id IS NOT OLD.id)
|
||||
OR (NEW.inserted_at IS NOT OLD.inserted_at)
|
||||
OR (NEW.posted_at IS NOT OLD.posted_at)
|
||||
OR (NEW.feed_url IS NOT OLD.feed_url)
|
||||
OR (NEW.title IS NOT OLD.title)
|
||||
OR (NEW.link IS NOT OLD.link)
|
||||
BEGIN
|
||||
UPDATE properties SET value=value + 1 WHERE name='clock';
|
||||
END;
|
||||
""",
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -308,6 +316,7 @@ class Database:
|
|||
SELECT
|
||||
id,
|
||||
inserted_at,
|
||||
COALESCE(posted_at, inserted_at) AS posted_at,
|
||||
title,
|
||||
link
|
||||
FROM entries
|
||||
|
|
@ -322,8 +331,14 @@ class Database:
|
|||
rows = []
|
||||
|
||||
entries = [
|
||||
feed.Entry(id=id, inserted_at=inserted_at, title=title, link=link)
|
||||
for id, inserted_at, title, link in rows
|
||||
feed.Entry(
|
||||
id=id,
|
||||
inserted_at=inserted_at,
|
||||
posted_at=posted_at,
|
||||
title=title,
|
||||
link=link,
|
||||
)
|
||||
for id, inserted_at, posted_at, title, link in rows
|
||||
]
|
||||
f = feed.Feed(meta=meta, title=title, link=link, entries=entries)
|
||||
feeds.append(f)
|
||||
|
|
@ -362,8 +377,42 @@ class Database:
|
|||
Returns the number of new entries inserted.
|
||||
"""
|
||||
with self.db:
|
||||
# Correct the entries to make sure that we do not hide "new"
|
||||
# entries behind old entries. (This can happen because the
|
||||
# times in feeds are historically untrustworthy.) e.g., what
|
||||
# should we do if we previously saw an entry from "October 3rd"
|
||||
# but suddenly see a *new* entry from "October 1st"? That can't
|
||||
# be right! So we bring every item's posted time to at last the
|
||||
# maximum posted time we previously saw.
|
||||
#
|
||||
# This correction is incorrect in the case of feed sync, so feed
|
||||
# sync can't go through here.
|
||||
#
|
||||
# NOTE: This this might seem to bring entries already in the
|
||||
# database forward to a newer time. BUT! When we insert we
|
||||
# take the *older* time on conflict, so the change we do
|
||||
# here is undone on insert. Given that I don't want to do
|
||||
# "new entry" detection here in memory, that seems to be
|
||||
# OK. The other fix is to actually do "new entry" detection
|
||||
# in python, and stop relying on insert conflicts. But
|
||||
# alas, the insert conflict mechanism still must still
|
||||
# exist in order to do the local-state synchronization, so
|
||||
# we really don't save anything with that.
|
||||
#
|
||||
max_post_time = self._get_max_post_time(f.meta.url) + 1
|
||||
fixed_entries = [
|
||||
feed.Entry(
|
||||
id=e.id,
|
||||
inserted_at=e.inserted_at,
|
||||
posted_at=max(e.posted_at, max_post_time),
|
||||
title=e.title,
|
||||
link=e.link,
|
||||
)
|
||||
for e in f.entries
|
||||
]
|
||||
|
||||
self._insert_feed(f.meta, f.title, f.link)
|
||||
return self._insert_entries(f.meta.url, f.entries)
|
||||
return self._insert_entries(f.meta.url, fixed_entries)
|
||||
|
||||
def update_feed_status(self, meta: feed.FeedMeta, status: int) -> int:
|
||||
with self.db:
|
||||
|
|
@ -471,6 +520,7 @@ class Database:
|
|||
SELECT
|
||||
id,
|
||||
inserted_at,
|
||||
COALESCE(posted_at, inserted_at),
|
||||
title,
|
||||
link
|
||||
FROM entries
|
||||
|
|
@ -480,16 +530,21 @@ class Database:
|
|||
)
|
||||
entries_results = entries_cursor.fetchmany()
|
||||
while len(entries_results) > 0:
|
||||
# NOTE: It is critical that this here does not go
|
||||
# through the logic in "store_feed" as it can cause
|
||||
# wildly incorrect times to get propagated. (Although
|
||||
# maybe corrected on the next sync?)
|
||||
self._insert_entries(
|
||||
url,
|
||||
[
|
||||
feed.Entry(
|
||||
id=id,
|
||||
inserted_at=int(inserted_at),
|
||||
posted_at=int(posted_at),
|
||||
title=title,
|
||||
link=link,
|
||||
)
|
||||
for id, inserted_at, title, link in entries_results
|
||||
for id, inserted_at, posted_at, title, link in entries_results
|
||||
],
|
||||
)
|
||||
entries_results = entries_cursor.fetchmany()
|
||||
|
|
@ -563,6 +618,16 @@ class Database:
|
|||
],
|
||||
)
|
||||
|
||||
def _get_max_post_time(self, feed_url: str) -> int:
|
||||
cursor = self.db.execute(
|
||||
"SELECT MAX(COALESCE(posted_at, inserted_at)) FROM entries WHERE feed_url=?",
|
||||
[feed_url],
|
||||
)
|
||||
result = cursor.fetchone()
|
||||
if result is None or result[0] is None:
|
||||
return 0
|
||||
return int(result[0])
|
||||
|
||||
def _insert_entries(self, feed_url: str, entries: list[feed.Entry]) -> int:
|
||||
cursor = self.db.execute(
|
||||
"SELECT COUNT (*) FROM entries WHERE feed_url=?", [feed_url]
|
||||
|
|
@ -574,10 +639,11 @@ class Database:
|
|||
INSERT INTO entries (
|
||||
id,
|
||||
inserted_at,
|
||||
posted_at,
|
||||
feed_url,
|
||||
title,
|
||||
link
|
||||
) VALUES (?, ?, ?, ?, ?)
|
||||
) VALUES (?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT DO UPDATE
|
||||
SET
|
||||
-- NOTE: This is also part of the feed merge algorithm, BUT
|
||||
|
|
@ -590,7 +656,14 @@ class Database:
|
|||
-- to handle all the cases. (In theory we could make two
|
||||
-- different INSERTs to handle the two cases but that is
|
||||
-- more complexity than it is worth.)
|
||||
--
|
||||
inserted_at=MIN(inserted_at, excluded.inserted_at),
|
||||
|
||||
-- NOTE: This behavior of MIN() on collision is relied upon to
|
||||
-- correct another correction we make in store_feed. See
|
||||
-- the comment there about publish times.
|
||||
--
|
||||
posted_at=COALESCE(MIN(posted_at, excluded.posted_at), MIN(inserted_at, excluded.inserted_at)),
|
||||
title=CASE
|
||||
WHEN inserted_at < excluded.inserted_at THEN title
|
||||
ELSE excluded.title
|
||||
|
|
@ -600,7 +673,10 @@ class Database:
|
|||
ELSE excluded.link
|
||||
END
|
||||
""",
|
||||
[(e.id, e.inserted_at, feed_url, e.title, e.link) for e in entries],
|
||||
[
|
||||
(e.id, e.inserted_at, e.posted_at, feed_url, e.title, e.link)
|
||||
for e in entries
|
||||
],
|
||||
)
|
||||
|
||||
cursor = self.db.execute(
|
||||
|
|
|
|||
76
cry/feed.py
76
cry/feed.py
|
|
@ -69,7 +69,8 @@ class FeedMeta:
|
|||
@dataclasses.dataclass(frozen=True)
|
||||
class Entry:
|
||||
id: str
|
||||
inserted_at: int
|
||||
inserted_at: int # Unix time, but ms, not sec
|
||||
posted_at: int # Unix time, but ms, not sec
|
||||
title: str
|
||||
link: str | None
|
||||
|
||||
|
|
@ -105,15 +106,32 @@ class Entry:
|
|||
if not id:
|
||||
id = the_worst_element_hash(entry)
|
||||
|
||||
published = entry.get("published_parsed")
|
||||
if published is None:
|
||||
published = entry.get("updated_parsed")
|
||||
if published is not None:
|
||||
assert isinstance(published, tuple)
|
||||
# NOTE: Take insert_time if it's smaller; publish time errors generate
|
||||
# posts from the future.
|
||||
posted_at = min(int(time.mktime(published) * 1000), insert_time)
|
||||
else:
|
||||
posted_at = int(insert_time)
|
||||
|
||||
assert isinstance(id, str)
|
||||
assert link is None or isinstance(link, str)
|
||||
|
||||
title = clean_text(str(title))
|
||||
return Entry(id=id, inserted_at=insert_time, title=title, link=link)
|
||||
return Entry(
|
||||
id=id,
|
||||
inserted_at=insert_time,
|
||||
posted_at=posted_at,
|
||||
title=title,
|
||||
link=link,
|
||||
)
|
||||
|
||||
def time_ago(self) -> str:
|
||||
inserted = int(self.inserted_at / 1000)
|
||||
seconds = int(time.time()) - inserted
|
||||
posted = int(self.posted_at / 1000)
|
||||
seconds = int(time.time()) - posted
|
||||
if seconds <= 90:
|
||||
return f"{seconds}s"
|
||||
minutes = int(seconds / 60)
|
||||
|
|
@ -179,10 +197,10 @@ class Feed:
|
|||
# c) Even if you can parse the timestamp, many feed implementations
|
||||
# just PUT THE WRONG TIME IN THERE.
|
||||
#
|
||||
# The only coherent thing to do is to ignore the dates in the feeds
|
||||
# and just rely on our own sense of time. This comes with its own
|
||||
# problems, of course: our clock can be highly unreliable. But in
|
||||
# general it's good enough to work with, and feeds don't update so
|
||||
# So we have to account for the fact that the publish time might be
|
||||
# wildly unreliable, and back it up with our own clock. This comes with
|
||||
# its own problems, of course: our clock can be highly unreliable. But
|
||||
# in general it's good enough to work with, and feeds don't update so
|
||||
# frequently that we need to worry about most of these problems if we
|
||||
# use unix timestamps as our basis.
|
||||
#
|
||||
|
|
@ -200,6 +218,13 @@ class Feed:
|
|||
# `retry_after_ts` field, etc.) it's not a very likely thing to
|
||||
# happen.
|
||||
#
|
||||
# The *other* big source of time instability is that "new" items might
|
||||
# seem to have been published with a time that is "before" the last
|
||||
# item we previously saw. (i.e., on the first refresh we see an item
|
||||
# from October 3rd, then on the next refresh we see an item from October
|
||||
# 1st.) We don't know anything about historical refreshes here in feed
|
||||
# land, so that gets corrected in the database. (See store_feed.)
|
||||
#
|
||||
insert_time = int(time.time()) * 1000
|
||||
entries = [
|
||||
Entry.from_parsed(e, insert_time + i)
|
||||
|
|
@ -550,43 +575,10 @@ async def fetch_many(
|
|||
return [t.result() for t in tasks]
|
||||
|
||||
|
||||
def merge_feeds(a: Feed, a_origin: str, b: Feed, b_origin: str) -> Feed:
|
||||
"""Merge two known feeds. There are two conflict resolution policies:
|
||||
|
||||
1. The newer fetch of feed metadata wins.
|
||||
2. The older fetch of a feed item wins.
|
||||
|
||||
This means that the merge order between feeds *should* be consistent,
|
||||
unless somehow the feeds updated at the exact same time. In that case,
|
||||
the feed with the lexographically smallest slug wins.
|
||||
"""
|
||||
results = {e.id: e for e in a.entries}
|
||||
for entry in b.entries:
|
||||
existing = results.get(entry.id)
|
||||
if existing is None or existing.inserted_at > entry.inserted_at:
|
||||
results[entry.id] = entry
|
||||
|
||||
entries = sorted(results.values(), key=lambda e: e.inserted_at, reverse=True)
|
||||
source_feed = a
|
||||
if a.meta.last_fetched_ts > b.meta.last_fetched_ts:
|
||||
source_feed = a
|
||||
elif a.meta.last_fetched_ts == b.meta.last_fetched_ts:
|
||||
source_feed = a if a_origin < b_origin else b
|
||||
else:
|
||||
source_feed = b
|
||||
|
||||
return Feed(
|
||||
meta=source_feed.meta,
|
||||
title=source_feed.title,
|
||||
link=source_feed.link,
|
||||
entries=entries,
|
||||
)
|
||||
|
||||
|
||||
def sort_key(f: Feed) -> int:
|
||||
"""A sort key for sorting feeds by recency."""
|
||||
if len(f.entries) > 0:
|
||||
return max(e.inserted_at for e in f.entries)
|
||||
return max(e.posted_at for e in f.entries)
|
||||
return -1
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue