diff --git a/cry/cli.py b/cry/cli.py index b3e9415..f04e4d0 100644 --- a/cry/cli.py +++ b/cry/cli.py @@ -39,6 +39,7 @@ def cli(verbose): @click.argument("url") def search(url): "Search an URL for feeds." + # TODO: Rewrite to use our new one feeds = feedfinder.find_feeds(url) for feed in feeds: click.echo(feed) @@ -47,28 +48,61 @@ def search(url): @cli.command(name="subscribe") @click.argument("url") -def subscribe(url): +@click.option("--literal/--no-literal", "-l/-L", default=False) +def subscribe(url, literal): "Subscribe to a feed at the specified URL." db = database.Database.local() - click.echo(f"Fetching {url} ...") - meta = feed.FeedMeta.from_url(url, db.origin) - d, meta = asyncio.run(feed.fetch_feed(meta)) - if d is None: - click.echo(f"Unable to fetch {url}") - return 1 + if not literal: + click.echo(f"Searching for feeds for {url} ...") + feeds = asyncio.run(feed.feed_search(url, db.origin)) + if len(feeds) == 0: + click.echo(f"Unable to find a suitable feed for {url}") + return 1 + + if len(feeds) > 1: + # If we found more than one feed then we will try to see what the + # individual feeds are. + click.echo(f"Found {len(feeds)} feeds:") + + max_title = max(len(f.title) for f in feeds) + max_url = max(len(f.meta.url) for f in feeds) + + feeds.sort(key=lambda f: f.title) + + for f in feeds: + click.echo(f"{f.title:{max_title}} {f.meta.url:{max_url}}") + + click.echo( + "\nRun `subscribe` again with the URL of the feed you want to subscribe to." + ) + return 1 + + result = feeds[0] + click.echo(f"Identified {result.meta.url} as a feed for {url}") + else: + click.echo(f"Fetching {url} ...") + meta = feed.FeedMeta.from_url(url, db.origin) + d, meta = asyncio.run(feed.fetch_feed(meta)) + if d is None: + click.echo(f"Unable to fetch {url}") + return 1 + + if isinstance(d, str): + click.echo(f"{url} does not seem to be a feed") + return 1 + + result = d # Check to see if this URL is already in the database. 
- existing = db.load_feed(meta.url) + existing = db.load_feed(result.meta.url) if existing is not None: - click.echo(f"This feed already exists (as {meta.url})") + click.echo(f"This feed already exists (as {result.meta.url})") return 1 - f = feed.Feed.from_parsed(d, meta) - db.store_feed(f) - - click.echo(f"Subscribed to {meta.url}") + db.store_feed(result) + click.echo(f"Subscribed to {result.meta.url}") @cli.command(name="import") @@ -91,13 +125,16 @@ def import_opml(opml_file): LOG.warn(f"Unable to fetch {url}, skipping...") continue + if isinstance(d, str): + click.echo(f"{url} does not seem to be a feed, skipping...") + continue + existing = db.load_feed(meta.url) if existing is not None: LOG.info(f"{url} already exists (as {meta.url})") continue - f = feed.Feed.from_parsed(d, meta) - db.store_feed(f) + db.store_feed(d) subscribed = subscribed + 1 click.echo(f"Subscribed to {subscribed} new feeds") @@ -130,10 +167,11 @@ def refresh(url): if d is None: # Nothing new. db.update_meta(meta) + elif isinstance(d, str): + click.echo(f"WARNING: {meta.url} returned a non-feed result!") else: # New items, possibly! 
- f = feed.Feed.from_parsed(d, meta) - new_count = new_count + db.store_feed(f) + new_count = new_count + db.store_feed(d) click.echo(f"Fetched {new_count} new entries.") diff --git a/cry/feed.py b/cry/feed.py index d778275..f736aae 100644 --- a/cry/feed.py +++ b/cry/feed.py @@ -2,13 +2,15 @@ import asyncio import dataclasses import functools -import logging -import time -import typing import hashlib import html.parser import io +import logging import re +import time +import typing +import urllib.parse + import feedparser import requests @@ -18,6 +20,8 @@ import requests.structures LOG = logging.getLogger(__name__) +USER_AGENT = "cry-reader v0.0" + FEED_STATUS_ALIVE = 0 FEED_STATUS_DEAD = 1 FEED_STATUS_UNSUBSCRIBED = 2 @@ -48,211 +52,19 @@ class FeedMeta: origin=origin, ) + def should_fetch(self, now) -> bool: + if self.status != FEED_STATUS_ALIVE: + LOG.info(f"{self.url} is dead or unsubscribed") + return False -def the_worst_element_hash(value) -> str: - """Compute a content hash for the given feed element, to use as an ID. + if now < self.retry_after_ts: + retry_str = time.strftime( + "%Y-%m-%d %H:%M:%S %z", time.localtime(self.retry_after_ts) + ) + LOG.info(f"{self.url} will not be pulled until {retry_str}") + return False - The hash must be as stable as we can make it, but obviously there are things - we cannot control. If we've gotten here then the feed author has already - failed us and there's little we can do. 
This is already *known to be wrong.* - """ - - def process(value, hash): - if isinstance(value, feedparser.FeedParserDict): - hash.update(b"dict") - keys = sorted(value.keys()) - for key in keys: - hash.update(b"key::") - hash.update(key.encode("utf-8")) - hash.update(b"value::") - process(value[key], hash) - hash.update(b"tcid") - elif isinstance(value, str): - hash.update(b"str") - hash.update(value.encode("utf-8")) - hash.update(b"rts") - elif isinstance(value, list): - hash.update(b"list") - for item in value: - process(item, hash) - hash.update(b"tsil") - elif isinstance(value, tuple): - hash.update(b"tuple") - for item in value: - process(item, hash) - hash.update(b"elput") - - hash = hashlib.sha256(usedforsecurity=False) - process(value, hash) - return hash.hexdigest() - - -BLANK_TAGS = {"p", "br", "li", "div", "img"} -MULTI_SPACES = re.compile(r"\s+") - - -def clean_text(text: str) -> str: - """Sometimes text is HTML and otherwise ugly. This reduces it to - something pretty to display. Strips tags, puts blank space in between - elements that should generate blank space, and then collapses blank - spaces down to one. 
- """ - - class Cleaner(html.parser.HTMLParser): - def __init__(self, writer): - super().__init__() - self.writer = writer - - def handle_data(self, data: str) -> None: - self.writer.write(data) - - def handle_startendtag( - self, tag: str, attrs: list[tuple[str, str | None]] - ) -> None: - del attrs - if tag.lower() in BLANK_TAGS: - self.writer.write(" ") - - def handle_starttag( - self, tag: str, attrs: list[tuple[str, str | None]] - ) -> None: - del attrs - if tag.lower() in BLANK_TAGS: - self.writer.write(" ") - - writer = io.StringIO() - cleaner = Cleaner(writer) - cleaner.feed(text) - return MULTI_SPACES.sub(" ", writer.getvalue()) - - -async def fetch_feed( - feed: FeedMeta, -) -> typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]: - """Potentially fetch the feed described by `feed`, returning a parsed feed - (if possible and necessary) and an updated FeedMeta. - - This function can fail to return a parsed feed under a number of - circumstances. Among them: - - - It's too soon to be checking this feed again. - - The feed has been failing for a while and we've called it's dead. - - The server told us it was dead. - - We checked the server and it told us our cache was good. - - We tried to contact the server, but a networking error happened. - - Regardless, the new FeedMeta has the latest state of the feed. - """ - if feed.status != FEED_STATUS_ALIVE: - LOG.info(f"{feed.url} is dead or unsubscribed") - return (None, feed) - - if time.time() < feed.retry_after_ts: - retry_str = time.strftime( - "%Y-%m-%d %H:%M:%S %z", time.localtime(feed.retry_after_ts) - ) - LOG.info(f"{feed.url} will not be pulled until {retry_str}") - return (None, feed) - - # We waffle back and forth about using feedreader's HTTP support vs - # calling requests ourselves. We have decided to use requests manually at - # this time because it make it much much easier to figure out whether or - # not a request has succeeded. 
(The straw was handling timeouts and - # understanding whether `bozo_exception` was a transport failure or not.) - - headers = {"user-agent": "cry-reader v0.0"} - if feed.etag: - headers["if-none-match"] = feed.etag - if feed.modified: - headers["if-modified-since"] = feed.modified - - LOG.info(f"{feed.url} fetching...") - try: - loop = asyncio.get_running_loop() - response = await loop.run_in_executor( - None, - functools.partial(http.get, feed.url, headers=headers), - ) - LOG.info(f"{feed.url} fetched with status: {response.status_code}") - failed = response.status_code >= 400 - except Exception as e: - LOG.error(f"{feed.url} error fetching: {e}") - failed = True - response = None - - # Now, there are a number of things to consider in the response that - # we need to consider in updating our permanent record. - - if response is not None and response.status_code == 410: - # Permanently gone, really stop asking. - LOG.error(f"{feed.url} permanently gone") - return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD)) - - if failed and time.time() > feed.last_fetched_ts + (7 * 24 * 60 * 60): - # If we've been failing to fetch the feed for more than a week then - # consider us dead, we must be doing something wrong. - LOG.error(f"{feed.url} failed for too long, giving up") - return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD)) - - if response and response.is_permanent_redirect: - # Permanent redirect, update the stored URL, but mark this as a - # successful fetch. - # - # TODO: Is this actually the right URL to store? We need the last - # permanently redirected URL, not just whatever the last thing - # is... e.g. imagine a permanent followed by a temporary - # redirect, then what? - LOG.info(f"{feed.url} permanently redirected to {response.url}") - assert response.url is not None - feed = dataclasses.replace(feed, url=response.url) - - # NOTE: We might still be in a failure state here. 
But success or fail, - # the server might have told us when to next retry, so make a note - # of it. - retry_delta = None - if response is not None: - try: - retry_delta = int(response.headers.get("retry-after", "nope")) - except Exception: - pass - if retry_delta is None: - if failed: - retry_delta = 1 * 60 # Retry again in a minute - else: - retry_delta = 60 * 60 # 1 hour default - - feed = dataclasses.replace(feed, retry_after_ts=int(time.time()) + retry_delta) - - # We've done everything we can on a failure, bail if we've got an error. - if failed: - LOG.info(f"{feed.url} failed at the network level") - return (None, feed) - - assert response is not None - - # Record our successful fetch now, to reset the failure timer above. - feed = dataclasses.replace(feed, last_fetched_ts=int(time.time())) - - # We can *still* be successful but like, no changes. - if response.status_code != 200: - LOG.info(f"{feed.url} had no changes") - return (None, feed) - - feed = dataclasses.replace( - feed, - etag=response.headers.get("etag"), - modified=response.headers.get("last-modified"), - ) - parsed = feedparser.parse(response.content, response_headers=response.headers) - return (parsed, feed) - - -async def fetch_many( - metas: list[FeedMeta], -) -> list[typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]]: - async with asyncio.TaskGroup() as group: - tasks = [group.create_task(fetch_feed(m)) for m in metas] - return [t.result() for t in tasks] + return True @dataclasses.dataclass(frozen=True) @@ -398,6 +210,212 @@ class Feed: return Feed(meta=meta, title=title, link=link, entries=entries) +def the_worst_element_hash(value) -> str: + """Compute a content hash for the given feed element, to use as an ID. + + The hash must be as stable as we can make it, but obviously there are things + we cannot control. If we've gotten here then the feed author has already + failed us and there's little we can do. 
This is already *known to be wrong.* + """ + + def process(value, hash): + if isinstance(value, feedparser.FeedParserDict): + hash.update(b"dict") + keys = sorted(value.keys()) + for key in keys: + hash.update(b"key::") + hash.update(key.encode("utf-8")) + hash.update(b"value::") + process(value[key], hash) + hash.update(b"tcid") + elif isinstance(value, str): + hash.update(b"str") + hash.update(value.encode("utf-8")) + hash.update(b"rts") + elif isinstance(value, list): + hash.update(b"list") + for item in value: + process(item, hash) + hash.update(b"tsil") + elif isinstance(value, tuple): + hash.update(b"tuple") + for item in value: + process(item, hash) + hash.update(b"elput") + + hash = hashlib.sha256(usedforsecurity=False) + process(value, hash) + return hash.hexdigest() + + +BLANK_TAGS = {"p", "br", "li", "div", "img"} +MULTI_SPACES = re.compile(r"\s+") + + +def clean_text(text: str) -> str: + """Sometimes text is HTML and otherwise ugly. This reduces it to + something pretty to display. Strips tags, puts blank space in between + elements that should generate blank space, and then collapses blank + spaces down to one. 
+ """ + + class Cleaner(html.parser.HTMLParser): + def __init__(self, writer): + super().__init__() + self.writer = writer + + def handle_data(self, data: str) -> None: + self.writer.write(data) + + def handle_startendtag( + self, tag: str, attrs: list[tuple[str, str | None]] + ) -> None: + del attrs + if tag.lower() in BLANK_TAGS: + self.writer.write(" ") + + def handle_starttag( + self, tag: str, attrs: list[tuple[str, str | None]] + ) -> None: + del attrs + if tag.lower() in BLANK_TAGS: + self.writer.write(" ") + + writer = io.StringIO() + cleaner = Cleaner(writer) + cleaner.feed(text) + return MULTI_SPACES.sub(" ", writer.getvalue()) + + +def could_be_feed_data(data: str) -> bool: + """See if the data might be a feed.""" + data = data.lower() + if data.count(" 0 + + +async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]: + if not meta.should_fetch(time.time()): + return (None, meta) + + headers = {"user-agent": USER_AGENT} + if meta.etag: + headers["if-none-match"] = meta.etag + if meta.modified: + headers["if-modified-since"] = meta.modified + + # We waffle back and forth about using feedreader's HTTP support vs + # calling requests ourselves. We have decided to use requests manually at + # this time because it make it much much easier to figure out whether or + # not a request has succeeded. (The straw was handling timeouts and + # understanding whether `bozo_exception` was a transport failure or not.) + # + # TODO: Check robots.txt! + + try: + loop = asyncio.get_running_loop() + response = await loop.run_in_executor( + None, + functools.partial(http.get, meta.url, headers=headers), + ) + LOG.info(f"{meta.url} fetched with status: {response.status_code}") + failed = response.status_code >= 400 + except Exception as e: + LOG.error(f"{meta.url} error fetching: {e}") + failed = True + response = None + + # Now, there are a number of things to consider in the response that + # we need to consider in updating our permanent record. 
+ + if response is not None and response.status_code == 410: + # Permanently gone, really stop asking. + LOG.error(f"{meta.url} permanently gone") + return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD)) + + if failed and time.time() > meta.last_fetched_ts + (7 * 24 * 60 * 60): + # If we've been failing to fetch the feed for more than a week then + # consider us dead, we must be doing something wrong. + LOG.error(f"{meta.url} failed for too long, giving up") + return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD)) + + if response and response.is_permanent_redirect: + # Permanent redirect, update the stored URL, but mark this as a + # successful fetch. + # + # TODO: Is this actually the right URL to store? We need the last + # permanently redirected URL, not just whatever the last thing + # is... e.g. imagine a permanent followed by a temporary + # redirect, then what? + LOG.info(f"{meta.url} permanently redirected to {response.url}") + assert response.url is not None + meta = dataclasses.replace(meta, url=response.url) + + # TODO: Handle that bogus non-HTTP redirect that feedfinder uses. + + # NOTE: We might still be in a failure state here. But success or fail, + # the server might have told us when to next retry, so make a note + # of it. The server might also have given us updated caching + # information (even on failure!) and so let's also make a note of that. 
+ retry_delta = None + etag = meta.etag + modified = meta.modified + if response is not None: + etag = response.headers.get("etag", meta.etag) + modified = response.headers.get("last-modified", meta.modified) + + try: + retry_delta = int(response.headers.get("retry-after", "nope")) + except Exception: + pass + + if retry_delta is None: + if failed: + retry_delta = 1 * 60 # Retry again in a minute + else: + retry_delta = 60 * 60 # 1 hour default + + meta = dataclasses.replace( + meta, + retry_after_ts=int(time.time()) + retry_delta, + etag=etag, + modified=modified, + ) + + # We've done everything we can on a failure, bail if we've got an error. + if failed: + LOG.info(f"{meta.url} failed at the network level") + return (None, meta) + + assert response is not None + + # Record our successful fetch now, to reset the failure timer above. + meta = dataclasses.replace(meta, last_fetched_ts=int(time.time())) + + # We can *still* be successful but like, no changes. + if response.status_code != 200: + LOG.info(f"{meta.url} had no changes") + return (None, meta) + + # Does this seem to be a feed? Or not? + if could_be_feed_data(response.text): + parsed = feedparser.parse(response.content, response_headers=response.headers) + return (Feed.from_parsed(parsed, meta), meta) + + # No this is not a feed, just return the content out for further + # processing. + return (response.text, meta) + + +async def fetch_many( + metas: list[FeedMeta], +) -> list[typing.Tuple[Feed | str | None, FeedMeta]]: + async with asyncio.TaskGroup() as group: + tasks = [group.create_task(fetch_feed(m)) for m in metas] + return [t.result() for t in tasks] + + def merge_feeds(a: Feed, b: Feed) -> Feed: """Merge two known feeds. 
There are two conflict resolution policies: @@ -436,3 +454,180 @@ def sort_key(f: Feed) -> int: if len(f.entries) > 0: return max(e.inserted_at for e in f.entries) return -1 + + +class FeedSearchParser(html.parser.HTMLParser): + """An HTML parser that tries to find links to feeds.""" + + FEED_TYPES = ( + "application/rss+xml", + "text/xml", + "application/atom+xml", + "application/x.atom+xml", + "application/x-atom+xml", + ) + + link_links: list[str] + a_links: list[str] + + def __init__(self, baseuri): + super().__init__() + self.baseuri = baseuri + self.link_links = [] + self.a_links = [] + + def handle_starttag(self, tag, attrs): + attrs = {k: v for k, v in attrs} + if tag == "base": + self.do_base(attrs) + elif tag == "link": + self.do_link(attrs) + elif tag == "a": + self.do_a(attrs) + + def do_base(self, attrs): + base = attrs.get("href") + if base is not None: + self.baseuri = base + + def do_link(self, attrs): + rel = attrs.get("rel") + if rel is None: + return + + if "alternate" not in rel.split(): + return + + if attrs.get("type", "").lower() not in self.FEED_TYPES: + return + + href = attrs.get("href") + if href is None: + return + + self.link_links.append(urllib.parse.urljoin(self.baseuri, href)) + + def do_a(self, attrs): + href = attrs.get("href") + if href is None: + return + + self.a_links.append(urllib.parse.urljoin(self.baseuri, href)) + + +def massage_url(uri: str) -> str: + uri = uri.strip() + if uri.startswith("feed://"): + uri = "http://" + uri.split("feed://", 1).pop() + for x in ["http", "https"]: + if uri.startswith("%s://" % x): + return uri + return "http://%s" % uri + + +def classify_links(links, baseuri) -> typing.Tuple[list[str], list[str]]: + """Split the links into two sets: local (which start with baseuri) and + remote (which don't). 
+ """ + baseuri = baseuri.lower() + + local, remote = [], [] + for link in links: + if link.lower().startswith(baseuri): + local.append(link) + else: + remote.append(link) + + return local, remote + + +def is_feed_link(link: str) -> bool: + """Return True if the link seems to be a feed link, or False otherwise.""" + link = link.lower() + return ( + link.endswith(".rss") + or link.endswith(".rdf") + or link.endswith(".xml") + or link.endswith(".atom") + ) + + +def is_XML_related_link(link: str) -> bool: + link = link.lower() + return "rss" in link or "rdf" in link or "xml" in link or "atom" in link + + +async def check_feed(url: str, origin: str) -> Feed | None: + """Check to see if the given URL is a feed. If it is, return the feed, + otherwise return None. + """ + meta = FeedMeta.from_url(url, origin) + result, meta = await fetch_feed(meta) + if isinstance(result, Feed): + return result + + return None + + +async def check_links(links: typing.Iterable[str], origin: str) -> list[Feed]: + """Fetch all the links and return the ones that appear to have feeds in + them. If none of them are fetchable or none of them have feeds then this + will return nothing. + """ + async with asyncio.TaskGroup() as group: + tasks = [group.create_task(check_feed(link, origin)) for link in links] + + outfeeds: list[Feed] = [] + for task in tasks: + result = task.result() + if result is not None: + outfeeds.append(result) + + return outfeeds + + +async def feed_search(uri: str, origin: str) -> list[Feed]: + meta = FeedMeta.from_url(massage_url(uri), origin) + result, meta = await fetch_feed(meta) + if result is None: + return [] + if isinstance(result, Feed): + return [result] + + # OK it was not a feed, let's try all our searching games. 
+    parser = FeedSearchParser(meta.url)
+    parser.feed(result)
+
+    LOG.debug("Checking links...")
+    outfeeds = await check_links(parser.link_links, origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+
+    LOG.debug("No links, checking A tags...")
+    local_links, remote_links = classify_links(parser.a_links, meta.url)
+    outfeeds = await check_links(filter(is_feed_link, local_links), origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+    outfeeds = await check_links(filter(is_XML_related_link, local_links), origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+    outfeeds = await check_links(filter(is_feed_link, remote_links), origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+    outfeeds = await check_links(filter(is_XML_related_link, remote_links), origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+
+    LOG.debug("no A tags, guessing")
+    suffixes = [  # filenames used by popular software:
+        "atom.xml",  # blogger, TypePad
+        "index.atom",  # MT, apparently
+        "index.rdf",  # MT
+        "rss.xml",  # Dave Winer/Manila
+        "index.xml",  # MT
+        "index.rss",  # Slash
+    ]
+    outfeeds = await check_links(
+        [urllib.parse.urljoin(meta.url, x) for x in suffixes], origin
+    )
+    return outfeeds
diff --git a/cry/feedfinder.py b/cry/feedfinder.py
index aa21a3f..bc2f14e 100644
--- a/cry/feedfinder.py
+++ b/cry/feedfinder.py
@@ -2,8 +2,10 @@
 
 Based on http://www.aaronsw.com/2002/feedfinder/
 
-Kinda rewritten by John Doty for the Python3 and the cry aggregator, but the
-basic frame remains.
+Rewritten by John Doty for the Python3 and the cry aggregator, but the basic
+frame remains. The big thing *this* does is also return the FeedMeta when it
+has found feeds, instead of just URLs. This is more useful for the rest of
+processing.
 """
 
 import logging
@@ -17,6 +19,7 @@
 import urllib.robotparser
 
 import requests
 
+from . 
import feed LOG = logging.getLogger(__name__) @@ -125,7 +128,7 @@ class HtmlBasedParser(html.parser.HTMLParser): self.a_links.append(urllib.parse.urljoin(self.baseuri, href)) -def makeFullURI(uri): +def makeFullURI(uri: str) -> str: uri = uri.strip() if uri.startswith("feed://"): uri = "http://" + uri.split("feed://", 1).pop() @@ -204,8 +207,6 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]: """Find feeds for the given URI. How it works: - 0. - 1. If the URI points to a feed, it is simply returned; otherwise the page is downloaded and the real fun begins. @@ -293,58 +294,3 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]: ) return list(set(outfeeds)) - - -##### test harness ###### - - -def test(): - uri = "http://diveintomark.org/tests/client/autodiscovery/html4-001.html" - failed = [] - count = 0 - while 1: - data = _gatekeeper.get(uri) - if data.find("Atom autodiscovery test") == -1: - break - sys.stdout.write(".") - sys.stdout.flush() - count += 1 - links = getLinks(data, uri) - if not links: - print(f"\n*** FAILED *** {uri} could not find link") - failed.append(uri) - elif len(links) > 1: - print(f"\n*** FAILED *** {uri} found too many links") - failed.append(uri) - else: - atomdata = requests.get(links[0]).text - if atomdata.find('