Subscribe to feeds
This commit is contained in:
parent
6be6afdbc3
commit
ff53b42b6f
3 changed files with 391 additions and 142 deletions
41
cry/cli.py
41
cry/cli.py
|
|
@ -1,5 +1,10 @@
|
||||||
|
# https://simonwillison.net/2023/Sep/30/cli-tools-python/
|
||||||
|
import asyncio
|
||||||
import click
|
import click
|
||||||
|
|
||||||
|
from . import feed
|
||||||
|
from . import database
|
||||||
|
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
@click.version_option()
|
@click.version_option()
|
||||||
|
|
@ -7,15 +12,27 @@ def cli():
|
||||||
"Command line feed reader"
|
"Command line feed reader"
|
||||||
|
|
||||||
|
|
||||||
@cli.command(name="command")
|
@cli.command(name="subscribe")
|
||||||
@click.argument(
|
@click.argument("url")
|
||||||
"example"
|
def subscribe(url):
|
||||||
)
|
"Subscribe to a feed at the specified URL."
|
||||||
@click.option(
|
|
||||||
"-o",
|
db = database.Database.local()
|
||||||
"--option",
|
|
||||||
help="An example option",
|
click.echo(f"Fetching {url} ...")
|
||||||
)
|
meta = feed.FeedMeta.from_url(url, db.origin)
|
||||||
def first_command(example, option):
|
d, meta = asyncio.run(feed.fetch_feed(meta))
|
||||||
"Command description goes here"
|
if d is None:
|
||||||
click.echo("Here is some output")
|
click.echo(f"Unable to fetch {url}")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
# Check to see if this URL is already in the database.
|
||||||
|
existing = db.load_feed(meta.url)
|
||||||
|
if existing is not None:
|
||||||
|
click.echo(f"This feed already exists (as {meta.url})")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
f = feed.Feed.from_parsed(d, meta)
|
||||||
|
db.store_feed(f)
|
||||||
|
|
||||||
|
click.echo(f"Subscribed to {meta.url}")
|
||||||
|
|
|
||||||
263
cry/database.py
263
cry/database.py
|
|
@ -1,25 +1,13 @@
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import random
|
||||||
|
import socket
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
import string
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
|
import platformdirs
|
||||||
|
|
||||||
def get_property(db: sqlite3.Connection, prop: str, default=None) -> typing.Any:
|
from . import feed
|
||||||
cursor = db.execute("SELECT value FROM properties WHERE name=?", (prop,))
|
|
||||||
result = cursor.fetchone()
|
|
||||||
if result is None:
|
|
||||||
return default
|
|
||||||
return result[0]
|
|
||||||
|
|
||||||
|
|
||||||
def set_property(db: sqlite3.Connection, prop: str, value):
|
|
||||||
db.execute(
|
|
||||||
"""
|
|
||||||
INSERT INTO properties (name, value) VALUES (?, ?)
|
|
||||||
ON CONFLICT DO UPDATE SET value=excluded.value
|
|
||||||
""",
|
|
||||||
(prop, value),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
SCHEMA_STATEMENTS = [
|
SCHEMA_STATEMENTS = [
|
||||||
"""
|
"""
|
||||||
|
|
@ -35,12 +23,13 @@ SCHEMA_STATEMENTS = [
|
||||||
);
|
);
|
||||||
|
|
||||||
CREATE TABLE entries(
|
CREATE TABLE entries(
|
||||||
id VARCHAR NOT NULL PRIMARY KEY,
|
id VARCHAR NOT NULL,
|
||||||
inserted_at INTEGER NOT NULL,
|
inserted_at INTEGER NOT NULL,
|
||||||
feed_url VARCHAR,
|
feed_url VARCHAR NOT NULL,
|
||||||
title VARCHAR,
|
title VARCHAR,
|
||||||
link VARCHAR,
|
link VARCHAR,
|
||||||
FOREIGN KEY feed_url REFERENCES feeds(url)
|
PRIMARY KEY (id, feed_url),
|
||||||
|
FOREIGN KEY (feed_url) REFERENCES feeds(url)
|
||||||
ON UPDATE CASCADE
|
ON UPDATE CASCADE
|
||||||
ON DELETE CASCADE
|
ON DELETE CASCADE
|
||||||
);
|
);
|
||||||
|
|
@ -48,38 +37,214 @@ SCHEMA_STATEMENTS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def ensure_database_schema(db: sqlite3.Connection):
|
def origin_path() -> pathlib.Path:
|
||||||
with db:
|
return platformdirs.user_data_path("cry", "cry") / "origin"
|
||||||
db.execute(
|
|
||||||
"""
|
|
||||||
CREATE TABLE IF NOT EXISTS properties (
|
def local_origin(path: pathlib.Path | None = None) -> str:
|
||||||
name VARCHAR NOT NULL PRIMARY KEY,
|
if path is None:
|
||||||
value VARCHAR NOT NULL
|
path = origin_path()
|
||||||
)
|
|
||||||
"""
|
if path.exists():
|
||||||
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
|
return f.read().strip()
|
||||||
|
|
||||||
|
host = socket.gethostname()
|
||||||
|
slug = "".join(
|
||||||
|
random.choices(
|
||||||
|
string.ascii_uppercase + string.ascii_lowercase + string.digits, k=8
|
||||||
)
|
)
|
||||||
version = int(get_property(db, "version", 0))
|
)
|
||||||
for script in SCHEMA_STATEMENTS[version:]:
|
origin = f"{host}-{slug}"
|
||||||
for statement in script.split(";"):
|
|
||||||
db.execute(statement)
|
|
||||||
set_property(db, "version", len(SCHEMA_STATEMENTS))
|
|
||||||
|
|
||||||
|
|
||||||
def database_path() -> pathlib.Path:
|
|
||||||
# TODO: Determine the name/slug from local state if necessary
|
|
||||||
return pathlib.Path.home() / "Dropbox" / "cry" / "testing-slug.db"
|
|
||||||
|
|
||||||
|
|
||||||
def connect_database(path: pathlib.Path) -> sqlite3.Connection:
|
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
connection = sqlite3.Connection(str(path), autocommit=False)
|
with open(path, "w", encoding="utf-8") as f:
|
||||||
connection.execute("PRAGMA foreign_keys = ON")
|
f.write(origin)
|
||||||
return connection
|
|
||||||
|
return origin
|
||||||
|
|
||||||
|
|
||||||
def setup_database() -> sqlite3.Connection:
|
def database_path(origin: str) -> pathlib.Path:
|
||||||
db_path = database_path()
|
# TODO: Determine the name/slug from local state if necessary
|
||||||
db = connect_database(db_path)
|
return pathlib.Path.home() / "Dropbox" / "cry" / f"{origin}.db"
|
||||||
ensure_database_schema(db)
|
|
||||||
|
|
||||||
return db
|
|
||||||
|
class Database:
|
||||||
|
db: sqlite3.Connection
|
||||||
|
origin: str
|
||||||
|
|
||||||
|
def __init__(self, path: pathlib.Path | str, origin: str):
|
||||||
|
if not isinstance(path, str):
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
db = sqlite3.Connection(str(path), autocommit=False)
|
||||||
|
db.execute("PRAGMA foreign_keys = ON")
|
||||||
|
self.db = db
|
||||||
|
self.origin = origin
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def local(cls, origin: str | None = None) -> "Database":
|
||||||
|
if origin is None:
|
||||||
|
origin = local_origin()
|
||||||
|
|
||||||
|
db = Database(database_path(origin), origin)
|
||||||
|
db.ensure_database_schema()
|
||||||
|
return db
|
||||||
|
|
||||||
|
def get_property(self, prop: str, default=None) -> typing.Any:
|
||||||
|
cursor = self.db.execute("SELECT value FROM properties WHERE name=?", (prop,))
|
||||||
|
result = cursor.fetchone()
|
||||||
|
if result is None:
|
||||||
|
return default
|
||||||
|
return result[0]
|
||||||
|
|
||||||
|
def set_property(self, prop: str, value):
|
||||||
|
self.db.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO properties (name, value) VALUES (?, ?)
|
||||||
|
ON CONFLICT DO UPDATE SET value=excluded.value
|
||||||
|
""",
|
||||||
|
(prop, value),
|
||||||
|
)
|
||||||
|
|
||||||
|
def ensure_database_schema(self):
|
||||||
|
with self.db:
|
||||||
|
self.db.execute(
|
||||||
|
"""
|
||||||
|
CREATE TABLE IF NOT EXISTS properties (
|
||||||
|
name VARCHAR NOT NULL PRIMARY KEY,
|
||||||
|
value VARCHAR NOT NULL
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
version = int(self.get_property("version", 0))
|
||||||
|
for script in SCHEMA_STATEMENTS[version:]:
|
||||||
|
for statement in script.split(";"):
|
||||||
|
try:
|
||||||
|
self.db.execute(statement)
|
||||||
|
except Exception as e:
|
||||||
|
raise Exception(f"Error executing:\n{statement}") from e
|
||||||
|
self.set_property("version", len(SCHEMA_STATEMENTS))
|
||||||
|
self.set_property("origin", self.origin)
|
||||||
|
|
||||||
|
def load_feed(self, url: str) -> feed.Feed | None:
|
||||||
|
cursor = self.db.execute(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
last_fetched_ts,
|
||||||
|
retry_after_ts,
|
||||||
|
status,
|
||||||
|
etag,
|
||||||
|
modified,
|
||||||
|
title,
|
||||||
|
link
|
||||||
|
FROM feeds
|
||||||
|
WHERE url=?
|
||||||
|
""",
|
||||||
|
[url],
|
||||||
|
)
|
||||||
|
|
||||||
|
row = cursor.fetchone()
|
||||||
|
if row is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
last_fetched_ts, retry_after_ts, status, etag, modified, title, link = row
|
||||||
|
meta = feed.FeedMeta(
|
||||||
|
url=url,
|
||||||
|
last_fetched_ts=last_fetched_ts,
|
||||||
|
retry_after_ts=retry_after_ts,
|
||||||
|
status=status,
|
||||||
|
etag=etag,
|
||||||
|
modified=modified,
|
||||||
|
origin=self.origin,
|
||||||
|
)
|
||||||
|
|
||||||
|
cursor = self.db.execute(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
id,
|
||||||
|
inserted_at,
|
||||||
|
title,
|
||||||
|
link
|
||||||
|
FROM entries
|
||||||
|
WHERE feed_url=?
|
||||||
|
""",
|
||||||
|
[url],
|
||||||
|
)
|
||||||
|
|
||||||
|
rows = cursor.fetchall()
|
||||||
|
entries = [
|
||||||
|
feed.Entry(id=id, inserted_at=inserted_at, title=title, link=link)
|
||||||
|
for id, inserted_at, title, link in rows
|
||||||
|
]
|
||||||
|
|
||||||
|
return feed.Feed(meta=meta, title=title, link=link, entries=entries)
|
||||||
|
|
||||||
|
def store_feed(self, f: feed.Feed):
|
||||||
|
with self.db:
|
||||||
|
self.db.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO feeds (
|
||||||
|
url,
|
||||||
|
last_fetched_ts,
|
||||||
|
retry_after_ts,
|
||||||
|
status,
|
||||||
|
etag,
|
||||||
|
modified,
|
||||||
|
title,
|
||||||
|
link
|
||||||
|
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
ON CONFLICT DO UPDATE
|
||||||
|
SET
|
||||||
|
last_fetched_ts=excluded.last_fetched_ts,
|
||||||
|
retry_after_ts=excluded.retry_after_ts,
|
||||||
|
status=excluded.status,
|
||||||
|
etag=excluded.etag,
|
||||||
|
modified=excluded.modified,
|
||||||
|
title=excluded.title,
|
||||||
|
link=excluded.link
|
||||||
|
""",
|
||||||
|
[
|
||||||
|
f.meta.url,
|
||||||
|
f.meta.last_fetched_ts,
|
||||||
|
f.meta.retry_after_ts,
|
||||||
|
f.meta.status,
|
||||||
|
f.meta.etag,
|
||||||
|
f.meta.modified,
|
||||||
|
f.title,
|
||||||
|
f.link,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
self.db.executemany(
|
||||||
|
"""
|
||||||
|
INSERT INTO entries (
|
||||||
|
id,
|
||||||
|
inserted_at,
|
||||||
|
feed_url,
|
||||||
|
title,
|
||||||
|
link
|
||||||
|
) VALUES (?, ?, ?, ?, ?)
|
||||||
|
ON CONFLICT DO UPDATE
|
||||||
|
SET
|
||||||
|
-- NOTE: This is also part of the feed merge algorithm, BUT
|
||||||
|
-- we implement it here because feeds tend to be rolling
|
||||||
|
-- windows over some external content and we don't want
|
||||||
|
-- to read and write the entire feed just to update the
|
||||||
|
-- few new items. But we can't just do ON CONFLICT DO
|
||||||
|
-- NOTHING because we *might* be storing a feed where we
|
||||||
|
-- resolved conflicts with another instance. So we want
|
||||||
|
-- to handle all the cases. (In theory we could make two
|
||||||
|
-- different INSERTs to handle the two cases but that is
|
||||||
|
-- more complexity than it is worth.)
|
||||||
|
inserted_at=MIN(inserted_at, excluded.inserted_at),
|
||||||
|
title=CASE
|
||||||
|
WHEN inserted_at < excluded.inserted_at THEN title
|
||||||
|
ELSE excluded.title
|
||||||
|
END,
|
||||||
|
link=CASE
|
||||||
|
WHEN inserted_at < excluded.inserted_at THEN link
|
||||||
|
ELSE excluded.link
|
||||||
|
END
|
||||||
|
""",
|
||||||
|
[(e.id, e.inserted_at, f.meta.url, e.title, e.link) for e in f.entries],
|
||||||
|
)
|
||||||
|
|
|
||||||
229
cry/feed.py
229
cry/feed.py
|
|
@ -5,7 +5,6 @@ import functools
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
import typing
|
import typing
|
||||||
import pathlib
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import html.parser
|
import html.parser
|
||||||
import io
|
import io
|
||||||
|
|
@ -15,8 +14,6 @@ import feedparser
|
||||||
import requests
|
import requests
|
||||||
import requests.structures
|
import requests.structures
|
||||||
|
|
||||||
import database
|
|
||||||
import opml
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -37,9 +34,10 @@ class FeedMeta:
|
||||||
status: int
|
status: int
|
||||||
etag: str | None
|
etag: str | None
|
||||||
modified: str | None
|
modified: str | None
|
||||||
|
origin: str
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_url(cls, url: str) -> "FeedMeta":
|
def from_url(cls, url: str, origin: str) -> "FeedMeta":
|
||||||
return FeedMeta(
|
return FeedMeta(
|
||||||
url=url,
|
url=url,
|
||||||
last_fetched_ts=0,
|
last_fetched_ts=0,
|
||||||
|
|
@ -47,24 +45,10 @@ class FeedMeta:
|
||||||
status=FEED_STATUS_ALIVE,
|
status=FEED_STATUS_ALIVE,
|
||||||
etag=None,
|
etag=None,
|
||||||
modified=None,
|
modified=None,
|
||||||
|
origin=origin,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class Entry:
|
|
||||||
id: str
|
|
||||||
title: str
|
|
||||||
link: str | None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
|
||||||
class Feed:
|
|
||||||
meta: FeedMeta
|
|
||||||
title: str
|
|
||||||
link: str
|
|
||||||
entries: list[Entry]
|
|
||||||
|
|
||||||
|
|
||||||
def the_worst_element_hash(value) -> str:
|
def the_worst_element_hash(value) -> str:
|
||||||
"""Compute a content hash for the given feed element, to use as an ID.
|
"""Compute a content hash for the given feed element, to use as an ID.
|
||||||
|
|
||||||
|
|
@ -142,39 +126,6 @@ def clean_text(text: str) -> str:
|
||||||
return MULTI_SPACES.sub(" ", writer.getvalue())
|
return MULTI_SPACES.sub(" ", writer.getvalue())
|
||||||
|
|
||||||
|
|
||||||
def entry_from_feed(entry: feedparser.FeedParserDict) -> Entry:
|
|
||||||
"""Convert an entry from feedparser into an Entry by extracting the
|
|
||||||
things we care about, fudging things and substituting things as
|
|
||||||
necessary.
|
|
||||||
"""
|
|
||||||
title = entry.get("title")
|
|
||||||
if not title:
|
|
||||||
title = entry.get("description")
|
|
||||||
|
|
||||||
id = entry.get("id")
|
|
||||||
|
|
||||||
link = entry.get("link")
|
|
||||||
if id and not link:
|
|
||||||
linkid = str(id).lower()
|
|
||||||
if linkid.startswith("http:") or linkid.startswith("https:"):
|
|
||||||
link = linkid
|
|
||||||
|
|
||||||
if link and not id:
|
|
||||||
id = link
|
|
||||||
if title and not id:
|
|
||||||
id = title
|
|
||||||
if not id:
|
|
||||||
id = entry.get("published")
|
|
||||||
if not id:
|
|
||||||
id = the_worst_element_hash(entry)
|
|
||||||
|
|
||||||
assert isinstance(id, str)
|
|
||||||
assert link is None or isinstance(link, str)
|
|
||||||
|
|
||||||
title = clean_text(str(title))
|
|
||||||
return Entry(id=id, title=title, link=link)
|
|
||||||
|
|
||||||
|
|
||||||
async def fetch_feed(
|
async def fetch_feed(
|
||||||
feed: FeedMeta,
|
feed: FeedMeta,
|
||||||
) -> typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]:
|
) -> typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]:
|
||||||
|
|
@ -288,40 +239,156 @@ async def fetch_feed(
|
||||||
return (parsed, feed)
|
return (parsed, feed)
|
||||||
|
|
||||||
|
|
||||||
async def main() -> None:
|
@dataclasses.dataclass(frozen=True)
|
||||||
database.setup_database()
|
class Entry:
|
||||||
|
id: str
|
||||||
|
inserted_at: int
|
||||||
|
title: str
|
||||||
|
link: str | None
|
||||||
|
|
||||||
feeds = [
|
@classmethod
|
||||||
FeedMeta.from_url(url)
|
def from_parsed(cls, entry: feedparser.FeedParserDict, insert_time: int) -> "Entry":
|
||||||
for url in opml.load_opml(pathlib.Path.home() / "Downloads" / "fraidycat.opml")
|
"""Convert an entry from feedparser into an Entry by extracting the
|
||||||
]
|
things we care about, fudging things and substituting things as
|
||||||
async with asyncio.TaskGroup() as group:
|
necessary.
|
||||||
tasks = [group.create_task(fetch_feed(f)) for f in feeds]
|
|
||||||
results = [t.result() for t in tasks]
|
|
||||||
|
|
||||||
for d, meta in results:
|
The one thing we need from the outside is the "insert time", which
|
||||||
if d is not None:
|
is *almost* `int(time.time())` but needs a little bit of fudging in
|
||||||
title = None
|
order to ensure that we can keep the items in order when we get a lot
|
||||||
page_url = None
|
of them all at once.
|
||||||
|
"""
|
||||||
|
title = entry.get("title")
|
||||||
|
if not title:
|
||||||
|
title = entry.get("description")
|
||||||
|
|
||||||
if d.feed is not None:
|
id = entry.get("id")
|
||||||
title = d.feed.get("title")
|
|
||||||
page_url = d.feed.get("link")
|
|
||||||
|
|
||||||
if title is None or title == "":
|
link = entry.get("link")
|
||||||
title = meta.url
|
if id and not link:
|
||||||
if page_url is None:
|
linkid = str(id).lower()
|
||||||
page_url = meta.url
|
if linkid.startswith("http:") or linkid.startswith("https:"):
|
||||||
|
link = linkid
|
||||||
|
|
||||||
print(f"[{title}]({page_url})")
|
if link and not id:
|
||||||
print(f"{meta}")
|
id = link
|
||||||
|
if title and not id:
|
||||||
|
id = title
|
||||||
|
if not id:
|
||||||
|
id = entry.get("published")
|
||||||
|
if not id:
|
||||||
|
id = the_worst_element_hash(entry)
|
||||||
|
|
||||||
entries = [entry_from_feed(e) for e in d.entries]
|
assert isinstance(id, str)
|
||||||
for entry in entries:
|
assert link is None or isinstance(link, str)
|
||||||
print(f" - {entry.title} ({entry.id})")
|
|
||||||
print(f" {entry.link}")
|
title = clean_text(str(title))
|
||||||
print()
|
return Entry(id=id, inserted_at=insert_time, title=title, link=link)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
@dataclasses.dataclass(frozen=True)
|
||||||
asyncio.run(main())
|
class Feed:
|
||||||
|
meta: FeedMeta
|
||||||
|
title: str
|
||||||
|
link: str
|
||||||
|
entries: list[Entry]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_parsed(cls, d: feedparser.FeedParserDict, meta: FeedMeta) -> "Feed":
|
||||||
|
title = None
|
||||||
|
link = None
|
||||||
|
|
||||||
|
if d.feed is not None:
|
||||||
|
title = d.feed.get("title")
|
||||||
|
link = d.feed.get("link")
|
||||||
|
|
||||||
|
if title is None or title == "":
|
||||||
|
title = meta.url
|
||||||
|
if link is None:
|
||||||
|
link = meta.url
|
||||||
|
|
||||||
|
# =====================================================================
|
||||||
|
# FEED AND ENTRY ORDERING!
|
||||||
|
# =====================================================================
|
||||||
|
# In many ways this is the most critical part of a feed reader: in
|
||||||
|
# what order do we show the items in the feed?
|
||||||
|
#
|
||||||
|
# RSS is pretty unspecified in general, but also in what the meaning
|
||||||
|
# of the order of the entries in the feed actually is. (I can't
|
||||||
|
# remember if this is something that Atom specifies but it doesn't
|
||||||
|
# matter because RSS is still really popular, even in the ungodly
|
||||||
|
# late year of 2024.)
|
||||||
|
#
|
||||||
|
# *We* want to show posts in reverse chronological order, of course,
|
||||||
|
# but we still have problems. You *cannot* trust the dates and times
|
||||||
|
# in the entries. Sure, sure, Atom does a great job of specifying at
|
||||||
|
# least three different timestamps in the feed, and they are supposed
|
||||||
|
# to have time zones and whatnot. But:
|
||||||
|
#
|
||||||
|
# a) Any kind of timestamp is optional in RSS, and
|
||||||
|
# b) Even if the timestamp is present, it can come in a variety of
|
||||||
|
# formats (which theoretically `feedparser` handles), but
|
||||||
|
# c) Even if you can parse the timestamp, many feed implementations
|
||||||
|
# just PUT THE WRONG TIME IN THERE.
|
||||||
|
#
|
||||||
|
# The only coherent thing to do is to ignore the dates in the feeds
|
||||||
|
# and just rely on our own sense of time. This comes with its own
|
||||||
|
# problems, of course: our clock can be highly unreliable. But in
|
||||||
|
# general it's good enough to work with, and feeds don't update so
|
||||||
|
# frequently that we need to worry about most of these problems if we
|
||||||
|
# use unix timestamps as our basis.
|
||||||
|
#
|
||||||
|
# If we just use our own timestamps, then what do we do with feed
|
||||||
|
# updates where multiple items are inserted at once? We want to
|
||||||
|
# preserve that ordering too! Our hack is to multiply the unix
|
||||||
|
# timestamp by 1000, and then use the lower three digits as a sequence
|
||||||
|
# number. (Maybe it looks like everything was posted a millisecond
|
||||||
|
# apart?) There's a *chance* of conflict if:
|
||||||
|
#
|
||||||
|
# a) a feed has more than 1000 items in it, and
|
||||||
|
# b) we update the feed again less than a second later
|
||||||
|
#
|
||||||
|
# But given the other rate limiting features in this RSS system (The
|
||||||
|
# `retry_after_ts` field, etc.) it's not a very likely thing to
|
||||||
|
# happen.
|
||||||
|
#
|
||||||
|
insert_time = int(time.time()) * 1000
|
||||||
|
entries = [
|
||||||
|
Entry.from_parsed(e, insert_time + i)
|
||||||
|
for i, e in enumerate(reversed(d.entries))
|
||||||
|
]
|
||||||
|
entries.reverse()
|
||||||
|
|
||||||
|
return Feed(meta=meta, title=title, link=link, entries=entries)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_feeds(a: Feed, b: Feed) -> Feed:
|
||||||
|
"""Merge two known feeds. There are two conflict resolution policies:
|
||||||
|
|
||||||
|
1. The newer fetch of feed metadata wins.
|
||||||
|
2. The older fetch of a feed item wins.
|
||||||
|
|
||||||
|
This means that the merge order between feeds *should* be consistent,
|
||||||
|
unless somehow the feeds updated at the exact same time. In that case,
|
||||||
|
the feed with the lexicographically smallest slug wins.
|
||||||
|
"""
|
||||||
|
results = {e.id: e for e in a.entries}
|
||||||
|
for entry in b.entries:
|
||||||
|
existing = results.get(entry.id)
|
||||||
|
if existing is None or existing.inserted_at > entry.inserted_at:
|
||||||
|
results[entry.id] = entry
|
||||||
|
|
||||||
|
entries = sorted(results.values(), key=lambda e: e.inserted_at, reverse=True)
|
||||||
|
source_feed = a
|
||||||
|
if a.meta.last_fetched_ts > b.meta.last_fetched_ts:
|
||||||
|
source_feed = a
|
||||||
|
elif a.meta.last_fetched_ts == b.meta.last_fetched_ts:
|
||||||
|
source_feed = a if a.meta.origin < b.meta.origin else b
|
||||||
|
else:
|
||||||
|
source_feed = b
|
||||||
|
|
||||||
|
return Feed(
|
||||||
|
meta=source_feed.meta,
|
||||||
|
title=source_feed.title,
|
||||||
|
link=source_feed.link,
|
||||||
|
entries=entries,
|
||||||
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue