Subscribe now searches

Rewrite feed finder again to not multi-fetch when not needed
2024-07-15 17:23:24 +09:00 · 2024-07-15 17:23:24 +09:00 · cd20db0c4c
commit cd20db0c4c
parent 786ced5c24
3 changed files with 462 additions and 283 deletions
--- a/cry/cli.py
+++ b/cry/cli.py
@ -39,6 +39,7 @@ def cli(verbose):
@click.argument("url")
 def search(url):
    "Search an URL for feeds."
    # TODO: Rewrite to use our new one
    feeds = feedfinder.find_feeds(url)
    for feed in feeds:
        click.echo(feed)
@ -47,11 +48,40 @@ def search(url):
@cli.command(name="subscribe")
@click.argument("url")
-def subscribe(url):
+@click.option("--literal/--no-literal", "-l/-L", default=False)
 def subscribe(url, literal):
    "Subscribe to a feed at the specified URL."
    db = database.Database.local()
    if not literal:
        click.echo(f"Searching for feeds for {url} ...")
        feeds = asyncio.run(feed.feed_search(url, db.origin))
        if len(feeds) == 0:
            click.echo(f"Unable to find a suitable feed for {url}")
            return 1
        if len(feeds) > 1:
            # If we found more than one feed then we will try to see what the
            # individual feeds are.
            click.echo(f"Found {len(feeds)} feeds:")
            max_title = max(len(f.title) for f in feeds)
            max_url = max(len(f.meta.url) for f in feeds)
            feeds.sort(key=lambda f: f.title)
            for f in feeds:
                click.echo(f"{f.title:{max_title}}  {f.meta.url:{max_url}}")
            click.echo(
                "\nRun `subscribe` again with the URL of the feed you want to subscribe to."
            )
            return 1
        result = feeds[0]
        click.echo(f"Identified {result.meta.url} as a feed for {url}")
    else:
        click.echo(f"Fetching {url} ...")
        meta = feed.FeedMeta.from_url(url, db.origin)
        d, meta = asyncio.run(feed.fetch_feed(meta))
@ -59,16 +89,20 @@ def subscribe(url):
            click.echo(f"Unable to fetch {url}")
            return 1
-    # Check to see if this URL is already in the database.
+        if isinstance(d, str):
-    existing = db.load_feed(meta.url)
+            click.echo(f"{url} does not seem to be a feed")
    if existing is not None:
        click.echo(f"This feed already exists (as {meta.url})")
            return 1
-    f = feed.Feed.from_parsed(d, meta)
+        result = d
    db.store_feed(f)
-    click.echo(f"Subscribed to {meta.url}")
+    # Check to see if this URL is already in the database.
    existing = db.load_feed(result.meta.url)
    if existing is not None:
        click.echo(f"This feed already exists (as {result.meta.url})")
        return 1
    db.store_feed(result)
    click.echo(f"Subscribed to {result.meta.url}")
@cli.command(name="import")
@ -91,13 +125,16 @@ def import_opml(opml_file):
            LOG.warn(f"Unable to fetch {url}, skipping...")
            continue
        if isinstance(d, str):
            click.echo(f"{url} does not seem to be a feed, skipping...")
            continue
        existing = db.load_feed(meta.url)
        if existing is not None:
            LOG.info(f"{url} already exists (as {meta.url})")
            continue
-        f = feed.Feed.from_parsed(d, meta)
+        db.store_feed(d)
        db.store_feed(f)
        subscribed = subscribed + 1
    click.echo(f"Subscribed to {subscribed} new feeds")
@ -130,10 +167,11 @@ def refresh(url):
        if d is None:
            # Nothing new.
            db.update_meta(meta)
        elif isinstance(d, str):
            click.echo(f"WARNING: {meta.url} returned a non-feed result!")
        else:
            # New items, possibly!
-            f = feed.Feed.from_parsed(d, meta)
+            new_count = new_count + db.store_feed(d)
            new_count = new_count + db.store_feed(f)
    click.echo(f"Fetched {new_count} new entries.")
--- a/cry/feed.py
+++ b/cry/feed.py
@ -2,13 +2,15 @@
 import asyncio
 import dataclasses
 import functools
 import logging
 import time
 import typing
 import hashlib
 import html.parser
 import io
 import logging
 import re
 import time
 import typing
 import urllib.parse
 import feedparser
 import requests
@ -18,6 +20,8 @@ import requests.structures
 LOG = logging.getLogger(__name__)
 USER_AGENT = "cry-reader v0.0"
 FEED_STATUS_ALIVE = 0
 FEED_STATUS_DEAD = 1
 FEED_STATUS_UNSUBSCRIBED = 2
@ -48,211 +52,19 @@ class FeedMeta:
            origin=origin,
        )
    def should_fetch(self, now) -> bool:
        if self.status != FEED_STATUS_ALIVE:
            LOG.info(f"{self.url} is dead or unsubscribed")
            return False
-def the_worst_element_hash(value) -> str:
+        if now < self.retry_after_ts:
    """Compute a content hash for the given feed element, to use as an ID.
    The hash must be as stable as we can make it, but obviously there are things
    we cannot control. If we've gotten here then the feed author has already
    failed us and there's little we can do. This is already *known to be wrong.*
    """
    def process(value, hash):
        if isinstance(value, feedparser.FeedParserDict):
            hash.update(b"dict")
            keys = sorted(value.keys())
            for key in keys:
                hash.update(b"key::")
                hash.update(key.encode("utf-8"))
                hash.update(b"value::")
                process(value[key], hash)
            hash.update(b"tcid")
        elif isinstance(value, str):
            hash.update(b"str")
            hash.update(value.encode("utf-8"))
            hash.update(b"rts")
        elif isinstance(value, list):
            hash.update(b"list")
            for item in value:
                process(item, hash)
            hash.update(b"tsil")
        elif isinstance(value, tuple):
            hash.update(b"tuple")
            for item in value:
                process(item, hash)
            hash.update(b"elput")
    hash = hashlib.sha256(usedforsecurity=False)
    process(value, hash)
    return hash.hexdigest()
 BLANK_TAGS = {"p", "br", "li", "div", "img"}
 MULTI_SPACES = re.compile(r"\s+")
 def clean_text(text: str) -> str:
    """Sometimes text is HTML and otherwise ugly. This reduces it to
    something pretty to display. Strips tags, puts blank space in between
    elements that should generate blank space, and then collapses blank
    spaces down to one.
    """
    class Cleaner(html.parser.HTMLParser):
        def __init__(self, writer):
            super().__init__()
            self.writer = writer
        def handle_data(self, data: str) -> None:
            self.writer.write(data)
        def handle_startendtag(
            self, tag: str, attrs: list[tuple[str, str | None]]
        ) -> None:
            del attrs
            if tag.lower() in BLANK_TAGS:
                self.writer.write(" ")
        def handle_starttag(
            self, tag: str, attrs: list[tuple[str, str | None]]
        ) -> None:
            del attrs
            if tag.lower() in BLANK_TAGS:
                self.writer.write(" ")
    writer = io.StringIO()
    cleaner = Cleaner(writer)
    cleaner.feed(text)
    return MULTI_SPACES.sub(" ", writer.getvalue())
 async def fetch_feed(
    feed: FeedMeta,
 ) -> typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]:
    """Potentially fetch the feed described by `feed`, returning a parsed feed
    (if possible and necessary) and an updated FeedMeta.
    This function can fail to return a parsed feed under a number of
    circumstances. Among them:
    - It's too soon to be checking this feed again.
    - The feed has been failing for a while and we've called it's dead.
    - The server told us it was dead.
    - We checked the server and it told us our cache was good.
    - We tried to contact the server, but a networking error happened.
    Regardless, the new FeedMeta has the latest state of the feed.
    """
    if feed.status != FEED_STATUS_ALIVE:
        LOG.info(f"{feed.url} is dead or unsubscribed")
        return (None, feed)
    if time.time() < feed.retry_after_ts:
            retry_str = time.strftime(
-            "%Y-%m-%d %H:%M:%S %z", time.localtime(feed.retry_after_ts)
+                "%Y-%m-%d %H:%M:%S %z", time.localtime(self.retry_after_ts)
            )
-        LOG.info(f"{feed.url} will not be pulled until {retry_str}")
+            LOG.info(f"{self.url} will not be pulled until {retry_str}")
-        return (None, feed)
+            return False
-    # We waffle back and forth about using feedreader's HTTP support vs
+        return True
    # calling requests ourselves. We have decided to use requests manually at
    # this time because it make it much much easier to figure out whether or
    # not a request has succeeded. (The straw was handling timeouts and
    # understanding whether `bozo_exception` was a transport failure or not.)
    headers = {"user-agent": "cry-reader v0.0"}
    if feed.etag:
        headers["if-none-match"] = feed.etag
    if feed.modified:
        headers["if-modified-since"] = feed.modified
    LOG.info(f"{feed.url} fetching...")
    try:
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(http.get, feed.url, headers=headers),
        )
        LOG.info(f"{feed.url} fetched with status: {response.status_code}")
        failed = response.status_code >= 400
    except Exception as e:
        LOG.error(f"{feed.url} error fetching: {e}")
        failed = True
        response = None
    # Now, there are a number of things to consider in the response that
    # we need to consider in updating our permanent record.
    if response is not None and response.status_code == 410:
        # Permanently gone, really stop asking.
        LOG.error(f"{feed.url} permanently gone")
        return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD))
    if failed and time.time() > feed.last_fetched_ts + (7 * 24 * 60 * 60):
        # If we've been failing to fetch the feed for more than a week then
        # consider us dead, we must be doing something wrong.
        LOG.error(f"{feed.url} failed for too long, giving up")
        return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD))
    if response and response.is_permanent_redirect:
        # Permanent redirect, update the stored URL, but mark this as a
        # successful fetch.
        #
        # TODO: Is this actually the right URL to store? We need the last
        #       permanently redirected URL, not just whatever the last thing
        #       is... e.g. imagine a permanent followed by a temporary
        #       redirect, then what?
        LOG.info(f"{feed.url} permanently redirected to {response.url}")
        assert response.url is not None
        feed = dataclasses.replace(feed, url=response.url)
    # NOTE: We might still be in a failure state here. But success or fail,
    #       the server might have told us when to next retry, so make a note
    #       of it.
    retry_delta = None
    if response is not None:
        try:
            retry_delta = int(response.headers.get("retry-after", "nope"))
        except Exception:
            pass
    if retry_delta is None:
        if failed:
            retry_delta = 1 * 60  # Retry again in a minute
        else:
            retry_delta = 60 * 60  # 1 hour default
    feed = dataclasses.replace(feed, retry_after_ts=int(time.time()) + retry_delta)
    # We've done everything we can on a failure, bail if we've got an error.
    if failed:
        LOG.info(f"{feed.url} failed at the network level")
        return (None, feed)
    assert response is not None
    # Record our successful fetch now, to reset the failure timer above.
    feed = dataclasses.replace(feed, last_fetched_ts=int(time.time()))
    # We can *still* be successful but like, no changes.
    if response.status_code != 200:
        LOG.info(f"{feed.url} had no changes")
        return (None, feed)
    feed = dataclasses.replace(
        feed,
        etag=response.headers.get("etag"),
        modified=response.headers.get("last-modified"),
    )
    parsed = feedparser.parse(response.content, response_headers=response.headers)
    return (parsed, feed)
 async def fetch_many(
    metas: list[FeedMeta],
 ) -> list[typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]]:
    async with asyncio.TaskGroup() as group:
        tasks = [group.create_task(fetch_feed(m)) for m in metas]
    return [t.result() for t in tasks]
@dataclasses.dataclass(frozen=True)
@ -398,6 +210,212 @@ class Feed:
        return Feed(meta=meta, title=title, link=link, entries=entries)
 def the_worst_element_hash(value) -> str:
    """Compute a content hash for the given feed element, to use as an ID.

    The hash must be as stable as we can make it, but obviously there are things
    we cannot control. If we've gotten here then the feed author has already
    failed us and there's little we can do. This is already *known to be wrong.*

    Only `feedparser.FeedParserDict`, `str`, `list` and `tuple` values feed the
    hash; a value of any other type contributes nothing. The result is the
    SHA-256 hex digest of everything that was fed in.
    """
    def process(value, hash):
        # Every supported value is bracketed by a type tag and that tag spelled
        # backwards (b"dict" ... b"tcid", b"str" ... b"rts", and so on).
        if isinstance(value, feedparser.FeedParserDict):
            hash.update(b"dict")
            # Walk the keys in sorted order so the digest does not depend on
            # the order in which the dict happens to yield them.
            keys = sorted(value.keys())
            for key in keys:
                hash.update(b"key::")
                hash.update(key.encode("utf-8"))
                hash.update(b"value::")
                process(value[key], hash)
            hash.update(b"tcid")
        elif isinstance(value, str):
            hash.update(b"str")
            hash.update(value.encode("utf-8"))
            hash.update(b"rts")
        elif isinstance(value, list):
            hash.update(b"list")
            for item in value:
                process(item, hash)
            hash.update(b"tsil")
        elif isinstance(value, tuple):
            hash.update(b"tuple")
            for item in value:
                process(item, hash)
            hash.update(b"elput")
        # No else branch: anything else is silently left out of the hash.
    # The digest is an identifier, not a security measure.
    hash = hashlib.sha256(usedforsecurity=False)
    process(value, hash)
    return hash.hexdigest()
 BLANK_TAGS = {"p", "br", "li", "div", "img"}
 MULTI_SPACES = re.compile(r"\s+")
 def clean_text(text: str) -> str:
    """Sometimes text is HTML and otherwise ugly. This reduces it to
    something pretty to display. Strips tags, puts blank space in between
    elements that should generate blank space, and then collapses blank
    spaces down to one.
    """
    class Cleaner(html.parser.HTMLParser):
        def __init__(self, writer):
            super().__init__()
            self.writer = writer
        def handle_data(self, data: str) -> None:
            self.writer.write(data)
        def handle_startendtag(
            self, tag: str, attrs: list[tuple[str, str | None]]
        ) -> None:
            del attrs
            if tag.lower() in BLANK_TAGS:
                self.writer.write(" ")
        def handle_starttag(
            self, tag: str, attrs: list[tuple[str, str | None]]
        ) -> None:
            del attrs
            if tag.lower() in BLANK_TAGS:
                self.writer.write(" ")
    writer = io.StringIO()
    cleaner = Cleaner(writer)
    cleaner.feed(text)
    return MULTI_SPACES.sub(" ", writer.getvalue())
 def could_be_feed_data(data: str) -> bool:
    """Guess whether `data` is feed markup.

    Anything that mentions `<html` is rejected outright; otherwise the data
    qualifies if it contains an `<rss`, `<rdf` or `<feed` opening. The check is
    case-insensitive.
    """
    lowered = data.lower()
    if "<html" in lowered:
        return False
    return any(marker in lowered for marker in ("<rss", "<rdf", "<feed"))
 async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
    """Fetch the feed described by `meta`, if it is due for a fetch.

    Returns a `(result, meta)` pair. `result` is one of:

    - `None` when there is nothing to hand back: `meta.should_fetch` declined,
      the request raised or came back with a status >= 400, the server
      answered 410, or the status was anything other than 200.
    - a `Feed` when the response body looks like feed data according to
      `could_be_feed_data`.
    - the response text (`str`) when the body does not look like a feed, so
      the caller can process it further.

    The returned `FeedMeta` reflects what was learned from the request (status,
    URL after a permanent redirect, retry time, etag/last-modified, last
    successful fetch time). When `should_fetch` declines, `meta` is returned
    unchanged.
    """
    if not meta.should_fetch(time.time()):
        return (None, meta)
    # Send the cache validators we have so the server can answer without a body.
    headers = {"user-agent": USER_AGENT}
    if meta.etag:
        headers["if-none-match"] = meta.etag
    if meta.modified:
        headers["if-modified-since"] = meta.modified
    # We waffle back and forth about using feedreader's HTTP support vs
    # calling requests ourselves. We have decided to use requests manually at
    # this time because it make it much much easier to figure out whether or
    # not a request has succeeded. (The straw was handling timeouts and
    # understanding whether `bozo_exception` was a transport failure or not.)
    #
    # TODO: Check robots.txt!
    try:
        # The GET is handed to the loop's default executor (executor=None)
        # rather than being called directly from the coroutine.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(http.get, meta.url, headers=headers),
        )
        LOG.info(f"{meta.url} fetched with status: {response.status_code}")
        failed = response.status_code >= 400
    except Exception as e:
        # Any exception from the request is treated as a failed fetch.
        LOG.error(f"{meta.url} error fetching: {e}")
        failed = True
        response = None
    # Now, there are a number of things to consider in the response that
    # we need to consider in updating our permanent record.
    if response is not None and response.status_code == 410:
        # Permanently gone, really stop asking.
        LOG.error(f"{meta.url} permanently gone")
        return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))
    if failed and time.time() > meta.last_fetched_ts + (7 * 24 * 60 * 60):
        # If we've been failing to fetch the feed for more than a week then
        # consider us dead, we must be doing something wrong.
        LOG.error(f"{meta.url} failed for too long, giving up")
        return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))
    if response and response.is_permanent_redirect:
        # Permanent redirect, update the stored URL, but mark this as a
        # successful fetch.
        #
        # TODO: Is this actually the right URL to store? We need the last
        #       permanently redirected URL, not just whatever the last thing
        #       is... e.g. imagine a permanent followed by a temporary
        #       redirect, then what?
        LOG.info(f"{meta.url} permanently redirected to {response.url}")
        assert response.url is not None
        meta = dataclasses.replace(meta, url=response.url)
    # TODO: Handle that bogus non-HTTP redirect that feedfinder uses.
    # NOTE: We might still be in a failure state here. But success or fail,
    #       the server might have told us when to next retry, so make a note
    #       of it. The server might also have given us updated caching
    #       information (even on failure!) and so let's also make a note of that.
    retry_delta = None
    etag = meta.etag
    modified = meta.modified
    if response is not None:
        # Keep the previous validators when the response does not carry new ones.
        etag = response.headers.get("etag", meta.etag)
        modified = response.headers.get("last-modified", meta.modified)
        try:
            # A missing header yields "nope", which fails int() and leaves
            # retry_delta as None; so does any value that is not an integer.
            retry_delta = int(response.headers.get("retry-after", "nope"))
        except Exception:
            pass
    if retry_delta is None:
        if failed:
            retry_delta = 1 * 60  # Retry again in a minute
        else:
            retry_delta = 60 * 60  # 1 hour default
    meta = dataclasses.replace(
        meta,
        retry_after_ts=int(time.time()) + retry_delta,
        etag=etag,
        modified=modified,
    )
    # We've done everything we can on a failure, bail if we've got an error.
    if failed:
        LOG.info(f"{meta.url} failed at the network level")
        return (None, meta)
    assert response is not None
    # Record our successful fetch now, to reset the failure timer above.
    meta = dataclasses.replace(meta, last_fetched_ts=int(time.time()))
    # We can *still* be successful but like, no changes.
    if response.status_code != 200:
        LOG.info(f"{meta.url} had no changes")
        return (None, meta)
    # Does this seem to be a feed? Or not?
    if could_be_feed_data(response.text):
        parsed = feedparser.parse(response.content, response_headers=response.headers)
        return (Feed.from_parsed(parsed, meta), meta)
    # No this is not a feed, just return the content out for further
    # processing.
    return (response.text, meta)
 async def fetch_many(
    metas: list[FeedMeta],
 ) -> list[typing.Tuple[Feed | str | None, FeedMeta]]:
    """Run `fetch_feed` for every entry of `metas` inside one task group.

    The results come back in the same order as `metas`; each is the
    `(result, meta)` pair produced by `fetch_feed`.
    """
    pending = []
    async with asyncio.TaskGroup() as group:
        for meta in metas:
            pending.append(group.create_task(fetch_feed(meta)))
    # Leaving the `async with` block waits for every task to finish.
    return [task.result() for task in pending]
 def merge_feeds(a: Feed, b: Feed) -> Feed:
    """Merge two known feeds. There are two conflict resolution policies:
@ -436,3 +454,180 @@ def sort_key(f: Feed) -> int:
    if len(f.entries) > 0:
        return max(e.inserted_at for e in f.entries)
    return -1
 class FeedSearchParser(html.parser.HTMLParser):
    """An HTML parser that tries to find links to feeds.

    After `feed()`-ing a page, two lists are available:

    - `link_links`: absolute URLs of `<link rel="alternate">` elements whose
      `type` is one of `FEED_TYPES`.
    - `a_links`: absolute URLs of every `<a href=...>` on the page.

    Relative URLs are resolved against `baseuri`, which a `<base href=...>`
    element replaces for everything that follows it.
    """

    FEED_TYPES = (
        "application/rss+xml",
        "text/xml",
        "application/atom+xml",
        "application/x.atom+xml",
        "application/x-atom+xml",
    )

    link_links: list[str]
    a_links: list[str]

    def __init__(self, baseuri):
        super().__init__()
        self.baseuri = baseuri
        self.link_links = []
        self.a_links = []

    def handle_starttag(self, tag, attrs):
        # The parser reports valueless attributes (e.g. `<link type>`) with a
        # value of None, so every handler below must tolerate None values.
        attrs = dict(attrs)
        if tag == "base":
            self.do_base(attrs)
        elif tag == "link":
            self.do_link(attrs)
        elif tag == "a":
            self.do_a(attrs)

    def do_base(self, attrs):
        """Adopt the `href` of a `<base>` element as the new base URI."""
        base = attrs.get("href")
        if base is not None:
            self.baseuri = base

    def do_link(self, attrs):
        """Record a `<link>` if it advertises an alternate feed."""
        rel = attrs.get("rel")
        if rel is None:
            return
        # Link types are compared case-insensitively (`rel="Alternate"`).
        if "alternate" not in rel.lower().split():
            return
        # `type` may be present without a value; treat that like a missing
        # type instead of crashing on `None.lower()`.
        link_type = attrs.get("type") or ""
        if link_type.lower() not in self.FEED_TYPES:
            return
        href = attrs.get("href")
        if href is None:
            return
        self.link_links.append(urllib.parse.urljoin(self.baseuri, href))

    def do_a(self, attrs):
        """Record the target of an `<a>` element that has an `href`."""
        href = attrs.get("href")
        if href is None:
            return
        self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
 def massage_url(uri: str) -> str:
    """Turn user-supplied text into an http(s) URL.

    Surrounding whitespace is dropped, a leading `feed://` becomes `http://`,
    and anything that does not already start with `http://` or `https://` gets
    `http://` put in front of it.
    """
    cleaned = uri.strip()
    if cleaned.startswith("feed://"):
        cleaned = "http://" + cleaned.removeprefix("feed://")
    if cleaned.startswith(("http://", "https://")):
        return cleaned
    return "http://" + cleaned
 def classify_links(links, baseuri) -> typing.Tuple[list[str], list[str]]:
    """Partition `links` into `(local, remote)`.

    A link is local when it starts with `baseuri`, compared without regard to
    case; every other link is remote. Each list keeps the input order and the
    links' original spelling.
    """
    prefix = baseuri.lower()
    local: list[str] = []
    remote: list[str] = []
    for candidate in links:
        bucket = local if candidate.lower().startswith(prefix) else remote
        bucket.append(candidate)
    return local, remote
 def is_feed_link(link: str) -> bool:
    """Tell whether `link` ends in a typical feed extension.

    The extensions checked are `.rss`, `.rdf`, `.xml` and `.atom`, ignoring
    case.
    """
    return link.lower().endswith((".rss", ".rdf", ".xml", ".atom"))
 def is_XML_related_link(link: str) -> bool:
    """Tell whether `link` mentions rss, rdf, xml or atom anywhere (any case)."""
    lowered = link.lower()
    return any(token in lowered for token in ("rss", "rdf", "xml", "atom"))
 async def check_feed(url: str, origin: str) -> Feed | None:
    """Fetch `url` and hand back the `Feed` it contains, if any.

    Anything `fetch_feed` returns that is not a `Feed` (no result, or plain
    text) is reported as None.
    """
    fetched, _ = await fetch_feed(FeedMeta.from_url(url, origin))
    return fetched if isinstance(fetched, Feed) else None
 async def check_links(links: typing.Iterable[str], origin: str) -> list[Feed]:
    """Probe every link with `check_feed` and keep the ones that are feeds.

    All links are checked inside one task group. The result preserves the
    order of `links` and is empty when no link yields a feed.
    """
    async with asyncio.TaskGroup() as group:
        pending = [group.create_task(check_feed(link, origin)) for link in links]
    found = (task.result() for task in pending)
    return [candidate for candidate in found if candidate is not None]
 async def feed_search(uri: str, origin: str) -> list[Feed]:
    """Find the feeds reachable from `uri`.

    The URI is normalized with `massage_url` and fetched. If nothing comes
    back the result is empty; if it is itself a feed, that feed is the only
    result. Otherwise the page is searched in stages, stopping at the first
    stage that yields at least one feed:

    1. `<link>` elements collected by `FeedSearchParser`.
    2. `<a>` targets: local links that look like feed files, local links that
       mention a feed format, then the same two passes over remote links.
    3. Well-known feed file names joined onto the page URL.
    """
    meta = FeedMeta.from_url(massage_url(uri), origin)
    result, meta = await fetch_feed(meta)
    if result is None:
        return []
    if isinstance(result, Feed):
        return [result]

    # OK it was not a feed, let's try all our searching games.
    parser = FeedSearchParser(meta.url)
    parser.feed(result)

    LOG.debug("Checking links...")
    found = await check_links(parser.link_links, origin)
    if found:
        return found

    LOG.debug("No links, checking A tags...")
    local_links, remote_links = classify_links(parser.a_links, meta.url)
    # Local links are exhausted before remote ones; within each group the
    # strict extension test runs before the loose substring test.
    for group in (local_links, remote_links):
        for looks_promising in (is_feed_link, is_XML_related_link):
            found = await check_links(filter(looks_promising, group), origin)
            if found:
                return found

    LOG.debug("no A tags, guessing")
    guesses = [  # filenames used by popular software:
        "atom.xml",  # blogger, TypePad
        "index.atom",  # MT, apparently
        "index.rdf",  # MT
        "rss.xml",  # Dave Winer/Manila
        "index.xml",  # MT
        "index.rss",  # Slash
    ]
    return await check_links(
        [urllib.parse.urljoin(meta.url, name) for name in guesses], origin
    )
--- a/cry/feedfinder.py
+++ b/cry/feedfinder.py
@ -2,8 +2,10 @@
 Based on http://www.aaronsw.com/2002/feedfinder/
-Kinda rewritten by John Doty for the Python3 and the cry aggregator, but the
+Rewritten by John Doty for Python 3 and the cry aggregator, but the basic
-basic frame remains.
+frame remains. The big thing *this* does is also return the FeedMeta when it
 has found feeds, instead of just URLs. This is more useful for the rest of
 processing.
 """
 import logging
@ -17,6 +19,7 @@ import urllib.robotparser
 import requests
 from . import feed
 LOG = logging.getLogger(__name__)
@ -125,7 +128,7 @@ class HtmlBasedParser(html.parser.HTMLParser):
        self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
-def makeFullURI(uri):
+def makeFullURI(uri: str) -> str:
    uri = uri.strip()
    if uri.startswith("feed://"):
        uri = "http://" + uri.split("feed://", 1).pop()
@ -204,8 +207,6 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
    """Find feeds for the given URI.
    How it works:
    0.
    1. If the URI points to a feed, it is simply returned; otherwise
       the page is downloaded and the real fun begins.
@ -293,58 +294,3 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
        )
    return list(set(outfeeds))
 ##### test harness ######
 def test():
    """Walk a chain of autodiscovery test pages and report failures.

    Starting from a fixed diveintomark.org URL, each page is fetched and
    checked; the walk follows each page's `<link rel="next" href="...">` and
    stops when a page lacks the "Atom autodiscovery test" marker or has no
    next link. A summary line is printed at the end.
    """
    uri = "http://diveintomark.org/tests/client/autodiscovery/html4-001.html"
    failed = []
    count = 0
    while 1:
        data = _gatekeeper.get(uri)
        # A page without the marker text is not part of the test suite.
        if data.find("Atom autodiscovery test") == -1:
            break
        sys.stdout.write(".")
        sys.stdout.flush()
        count += 1
        # Each test page is expected to yield exactly one link.
        links = getLinks(data, uri)
        if not links:
            print(f"\n*** FAILED *** {uri} could not find link")
            failed.append(uri)
        elif len(links) > 1:
            print(f"\n*** FAILED *** {uri} found too many links")
            failed.append(uri)
        else:
            atomdata = requests.get(links[0]).text
            if atomdata.find('<link rel="alternate"') == -1:
                print(f"\n*** FAILED *** {uri} retrieved something that is not a feed")
                failed.append(uri)
            else:
                # The value of the last href="..." in the fetched document
                # must point back at the page that linked to it.
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print(f"\n*** FAILED *** {uri} retrieved wrong feed")
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1:
            break
        # Move on to the page named by the last rel="next" link.
        uri = urllib.parse.urljoin(
            uri, data.split('<link rel="next" href="').pop().split('"')[0]
        )
    print()
    print(f"{count} tests executed, {len(failed)} failed")
 if __name__ == "__main__":
    # Script entry point. Usage: [--debug] [URI | "test"]
    args = sys.argv[1:]
    if args and args[0] == "--debug":
        # NOTE(review): sets a module-level `_debug`; its readers are not
        # visible in this excerpt.
        _debug = 1
        args.pop(0)
    # The first remaining argument is the URI; fall back to a default site.
    if args:
        uri = args[0]
    else:
        uri = "http://diveintomark.org/"
    if uri == "test":
        # The literal argument "test" runs the harness instead of a lookup.
        test()
    else:
        # NOTE(review): `getFeeds` is not defined in the visible part of this
        # file (the finder shown here is `find_feeds`) -- confirm it exists.
        print("\n".join(getFeeds(uri)))