Subscribe now searches

Rewrite feed finder again to not multi-fetch when not needed
2024-07-15 17:23:24 +09:00 · 2024-07-15 17:23:24 +09:00 · cd20db0c4c
commit cd20db0c4c
parent 786ced5c24
3 changed files with 462 additions and 283 deletions
--- a/cry/cli.py
+++ b/cry/cli.py
@ -39,6 +39,7 @@ def cli(verbose):
@click.argument("url")
 def search(url):
    "Search an URL for feeds."
+    # TODO: Rewrite to use the new feed.feed_search
    feeds = feedfinder.find_feeds(url)
    for feed in feeds:
        click.echo(feed)
@ -47,28 +48,61 @@ def search(url):

@cli.command(name="subscribe")
@click.argument("url")
-def subscribe(url):
+@click.option("--literal/--no-literal", "-l/-L", default=False)
+def subscribe(url, literal):
    "Subscribe to a feed at the specified URL."

    db = database.Database.local()

-    click.echo(f"Fetching {url} ...")
-    meta = feed.FeedMeta.from_url(url, db.origin)
-    d, meta = asyncio.run(feed.fetch_feed(meta))
-    if d is None:
-        click.echo(f"Unable to fetch {url}")
-        return 1
+    if not literal:
+        click.echo(f"Searching for feeds for {url} ...")
+        feeds = asyncio.run(feed.feed_search(url, db.origin))
+        if len(feeds) == 0:
+            click.echo(f"Unable to find a suitable feed for {url}")
+            return 1
+
+        if len(feeds) > 1:
+            # If we found more than one feed then list them all, so that
+            # the user can pick one and re-run with that feed's URL.
+            click.echo(f"Found {len(feeds)} feeds:")
+
+            max_title = max(len(f.title) for f in feeds)
+            max_url = max(len(f.meta.url) for f in feeds)
+
+            feeds.sort(key=lambda f: f.title)
+
+            for f in feeds:
+                click.echo(f"{f.title:{max_title}}  {f.meta.url:{max_url}}")
+
+            click.echo(
+                "\nRun `subscribe` again with the URL of the feed you want to subscribe to."
+            )
+            return 1
+
+        result = feeds[0]
+        click.echo(f"Identified {result.meta.url} as a feed for {url}")
+    else:
+        click.echo(f"Fetching {url} ...")
+        meta = feed.FeedMeta.from_url(url, db.origin)
+        d, meta = asyncio.run(feed.fetch_feed(meta))
+        if d is None:
+            click.echo(f"Unable to fetch {url}")
+            return 1
+
+        if isinstance(d, str):
+            click.echo(f"{url} does not seem to be a feed")
+            return 1
+
+        result = d

    # Check to see if this URL is already in the database.
-    existing = db.load_feed(meta.url)
+    existing = db.load_feed(result.meta.url)
    if existing is not None:
-        click.echo(f"This feed already exists (as {meta.url})")
+        click.echo(f"This feed already exists (as {result.meta.url})")
        return 1

-    f = feed.Feed.from_parsed(d, meta)
-    db.store_feed(f)
-
-    click.echo(f"Subscribed to {meta.url}")
+    db.store_feed(result)
+    click.echo(f"Subscribed to {result.meta.url}")


@cli.command(name="import")
@ -91,13 +125,16 @@ def import_opml(opml_file):
            LOG.warn(f"Unable to fetch {url}, skipping...")
            continue

+        if isinstance(d, str):
+            click.echo(f"{url} does not seem to be a feed, skipping...")
+            continue
+
        existing = db.load_feed(meta.url)
        if existing is not None:
            LOG.info(f"{url} already exists (as {meta.url})")
            continue

-        f = feed.Feed.from_parsed(d, meta)
-        db.store_feed(f)
+        db.store_feed(d)
        subscribed = subscribed + 1

    click.echo(f"Subscribed to {subscribed} new feeds")
@ -130,10 +167,11 @@ def refresh(url):
        if d is None:
            # Nothing new.
            db.update_meta(meta)
+        elif isinstance(d, str):
+            click.echo(f"WARNING: {meta.url} returned a non-feed result!")
        else:
            # New items, possibly!
-            f = feed.Feed.from_parsed(d, meta)
-            new_count = new_count + db.store_feed(f)
+            new_count = new_count + db.store_feed(d)

    click.echo(f"Fetched {new_count} new entries.")

--- a/cry/feed.py
+++ b/cry/feed.py
@ -2,13 +2,15 @@
 import asyncio
 import dataclasses
 import functools
-import logging
-import time
-import typing
 import hashlib
 import html.parser
 import io
+import logging
 import re
+import time
+import typing
+import urllib.parse
+

 import feedparser
 import requests
@ -18,6 +20,8 @@ import requests.structures
 LOG = logging.getLogger(__name__)


+USER_AGENT = "cry-reader v0.0"
+
 FEED_STATUS_ALIVE = 0
 FEED_STATUS_DEAD = 1
 FEED_STATUS_UNSUBSCRIBED = 2
@ -48,211 +52,19 @@ class FeedMeta:
            origin=origin,
        )

+    def should_fetch(self, now) -> bool:
+        if self.status != FEED_STATUS_ALIVE:
+            LOG.info(f"{self.url} is dead or unsubscribed")
+            return False

-def the_worst_element_hash(value) -> str:
-    """Compute a content hash for the given feed element, to use as an ID.
+        if now < self.retry_after_ts:
+            retry_str = time.strftime(
+                "%Y-%m-%d %H:%M:%S %z", time.localtime(self.retry_after_ts)
+            )
+            LOG.info(f"{self.url} will not be pulled until {retry_str}")
+            return False

-    The hash must be as stable as we can make it, but obviously there are things
-    we cannot control. If we've gotten here then the feed author has already
-    failed us and there's little we can do. This is already *known to be wrong.*
-    """
-
-    def process(value, hash):
-        if isinstance(value, feedparser.FeedParserDict):
-            hash.update(b"dict")
-            keys = sorted(value.keys())
-            for key in keys:
-                hash.update(b"key::")
-                hash.update(key.encode("utf-8"))
-                hash.update(b"value::")
-                process(value[key], hash)
-            hash.update(b"tcid")
-        elif isinstance(value, str):
-            hash.update(b"str")
-            hash.update(value.encode("utf-8"))
-            hash.update(b"rts")
-        elif isinstance(value, list):
-            hash.update(b"list")
-            for item in value:
-                process(item, hash)
-            hash.update(b"tsil")
-        elif isinstance(value, tuple):
-            hash.update(b"tuple")
-            for item in value:
-                process(item, hash)
-            hash.update(b"elput")
-
-    hash = hashlib.sha256(usedforsecurity=False)
-    process(value, hash)
-    return hash.hexdigest()
-
-
-BLANK_TAGS = {"p", "br", "li", "div", "img"}
-MULTI_SPACES = re.compile(r"\s+")
-
-
-def clean_text(text: str) -> str:
-    """Sometimes text is HTML and otherwise ugly. This reduces it to
-    something pretty to display. Strips tags, puts blank space in between
-    elements that should generate blank space, and then collapses blank
-    spaces down to one.
-    """
-
-    class Cleaner(html.parser.HTMLParser):
-        def __init__(self, writer):
-            super().__init__()
-            self.writer = writer
-
-        def handle_data(self, data: str) -> None:
-            self.writer.write(data)
-
-        def handle_startendtag(
-            self, tag: str, attrs: list[tuple[str, str | None]]
-        ) -> None:
-            del attrs
-            if tag.lower() in BLANK_TAGS:
-                self.writer.write(" ")
-
-        def handle_starttag(
-            self, tag: str, attrs: list[tuple[str, str | None]]
-        ) -> None:
-            del attrs
-            if tag.lower() in BLANK_TAGS:
-                self.writer.write(" ")
-
-    writer = io.StringIO()
-    cleaner = Cleaner(writer)
-    cleaner.feed(text)
-    return MULTI_SPACES.sub(" ", writer.getvalue())
-
-
-async def fetch_feed(
-    feed: FeedMeta,
-) -> typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]:
-    """Potentially fetch the feed described by `feed`, returning a parsed feed
-    (if possible and necessary) and an updated FeedMeta.
-
-    This function can fail to return a parsed feed under a number of
-    circumstances. Among them:
-
-    - It's too soon to be checking this feed again.
-    - The feed has been failing for a while and we've called it's dead.
-    - The server told us it was dead.
-    - We checked the server and it told us our cache was good.
-    - We tried to contact the server, but a networking error happened.
-
-    Regardless, the new FeedMeta has the latest state of the feed.
-    """
-    if feed.status != FEED_STATUS_ALIVE:
-        LOG.info(f"{feed.url} is dead or unsubscribed")
-        return (None, feed)
-
-    if time.time() < feed.retry_after_ts:
-        retry_str = time.strftime(
-            "%Y-%m-%d %H:%M:%S %z", time.localtime(feed.retry_after_ts)
-        )
-        LOG.info(f"{feed.url} will not be pulled until {retry_str}")
-        return (None, feed)
-
-    # We waffle back and forth about using feedreader's HTTP support vs
-    # calling requests ourselves. We have decided to use requests manually at
-    # this time because it make it much much easier to figure out whether or
-    # not a request has succeeded. (The straw was handling timeouts and
-    # understanding whether `bozo_exception` was a transport failure or not.)
-
-    headers = {"user-agent": "cry-reader v0.0"}
-    if feed.etag:
-        headers["if-none-match"] = feed.etag
-    if feed.modified:
-        headers["if-modified-since"] = feed.modified
-
-    LOG.info(f"{feed.url} fetching...")
-    try:
-        loop = asyncio.get_running_loop()
-        response = await loop.run_in_executor(
-            None,
-            functools.partial(http.get, feed.url, headers=headers),
-        )
-        LOG.info(f"{feed.url} fetched with status: {response.status_code}")
-        failed = response.status_code >= 400
-    except Exception as e:
-        LOG.error(f"{feed.url} error fetching: {e}")
-        failed = True
-        response = None
-
-    # Now, there are a number of things to consider in the response that
-    # we need to consider in updating our permanent record.
-
-    if response is not None and response.status_code == 410:
-        # Permanently gone, really stop asking.
-        LOG.error(f"{feed.url} permanently gone")
-        return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD))
-
-    if failed and time.time() > feed.last_fetched_ts + (7 * 24 * 60 * 60):
-        # If we've been failing to fetch the feed for more than a week then
-        # consider us dead, we must be doing something wrong.
-        LOG.error(f"{feed.url} failed for too long, giving up")
-        return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD))
-
-    if response and response.is_permanent_redirect:
-        # Permanent redirect, update the stored URL, but mark this as a
-        # successful fetch.
-        #
-        # TODO: Is this actually the right URL to store? We need the last
-        #       permanently redirected URL, not just whatever the last thing
-        #       is... e.g. imagine a permanent followed by a temporary
-        #       redirect, then what?
-        LOG.info(f"{feed.url} permanently redirected to {response.url}")
-        assert response.url is not None
-        feed = dataclasses.replace(feed, url=response.url)
-
-    # NOTE: We might still be in a failure state here. But success or fail,
-    #       the server might have told us when to next retry, so make a note
-    #       of it.
-    retry_delta = None
-    if response is not None:
-        try:
-            retry_delta = int(response.headers.get("retry-after", "nope"))
-        except Exception:
-            pass
-    if retry_delta is None:
-        if failed:
-            retry_delta = 1 * 60  # Retry again in a minute
-        else:
-            retry_delta = 60 * 60  # 1 hour default
-
-    feed = dataclasses.replace(feed, retry_after_ts=int(time.time()) + retry_delta)
-
-    # We've done everything we can on a failure, bail if we've got an error.
-    if failed:
-        LOG.info(f"{feed.url} failed at the network level")
-        return (None, feed)
-
-    assert response is not None
-
-    # Record our successful fetch now, to reset the failure timer above.
-    feed = dataclasses.replace(feed, last_fetched_ts=int(time.time()))
-
-    # We can *still* be successful but like, no changes.
-    if response.status_code != 200:
-        LOG.info(f"{feed.url} had no changes")
-        return (None, feed)
-
-    feed = dataclasses.replace(
-        feed,
-        etag=response.headers.get("etag"),
-        modified=response.headers.get("last-modified"),
-    )
-    parsed = feedparser.parse(response.content, response_headers=response.headers)
-    return (parsed, feed)
-
-
-async def fetch_many(
-    metas: list[FeedMeta],
-) -> list[typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]]:
-    async with asyncio.TaskGroup() as group:
-        tasks = [group.create_task(fetch_feed(m)) for m in metas]
-    return [t.result() for t in tasks]
+        return True


@dataclasses.dataclass(frozen=True)
@ -398,6 +210,212 @@ class Feed:
        return Feed(meta=meta, title=title, link=link, entries=entries)


+def the_worst_element_hash(value) -> str:
+    """Compute a content hash for the given feed element, to use as an ID.
+
+    The hash must be as stable as we can make it, but obviously there are things
+    we cannot control. If we've gotten here then the feed author has already
+    failed us and there's little we can do. This is already *known to be wrong.*
+    """
+
+    def process(value, hash):
+        if isinstance(value, feedparser.FeedParserDict):
+            hash.update(b"dict")
+            keys = sorted(value.keys())
+            for key in keys:
+                hash.update(b"key::")
+                hash.update(key.encode("utf-8"))
+                hash.update(b"value::")
+                process(value[key], hash)
+            hash.update(b"tcid")
+        elif isinstance(value, str):
+            hash.update(b"str")
+            hash.update(value.encode("utf-8"))
+            hash.update(b"rts")
+        elif isinstance(value, list):
+            hash.update(b"list")
+            for item in value:
+                process(item, hash)
+            hash.update(b"tsil")
+        elif isinstance(value, tuple):
+            hash.update(b"tuple")
+            for item in value:
+                process(item, hash)
+            hash.update(b"elput")
+
+    hash = hashlib.sha256(usedforsecurity=False)
+    process(value, hash)
+    return hash.hexdigest()
+
+
+BLANK_TAGS = {"p", "br", "li", "div", "img"}
+MULTI_SPACES = re.compile(r"\s+")
+
+
+def clean_text(text: str) -> str:
+    """Sometimes text is HTML and otherwise ugly. This reduces it to
+    something pretty to display. Strips tags, puts blank space in between
+    elements that should generate blank space, and then collapses blank
+    spaces down to one.
+    """
+
+    class Cleaner(html.parser.HTMLParser):
+        def __init__(self, writer):
+            super().__init__()
+            self.writer = writer
+
+        def handle_data(self, data: str) -> None:
+            self.writer.write(data)
+
+        def handle_startendtag(
+            self, tag: str, attrs: list[tuple[str, str | None]]
+        ) -> None:
+            del attrs
+            if tag.lower() in BLANK_TAGS:
+                self.writer.write(" ")
+
+        def handle_starttag(
+            self, tag: str, attrs: list[tuple[str, str | None]]
+        ) -> None:
+            del attrs
+            if tag.lower() in BLANK_TAGS:
+                self.writer.write(" ")
+
+    writer = io.StringIO()
+    cleaner = Cleaner(writer)
+    cleaner.feed(text)
+    return MULTI_SPACES.sub(" ", writer.getvalue())
+
+
+def could_be_feed_data(data: str) -> bool:
+    """See if the data might be a feed."""
+    data = data.lower()
+    if data.count("<html"):
+        return False
+    return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
+
+
+async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
+    if not meta.should_fetch(time.time()):
+        return (None, meta)
+
+    headers = {"user-agent": USER_AGENT}
+    if meta.etag:
+        headers["if-none-match"] = meta.etag
+    if meta.modified:
+        headers["if-modified-since"] = meta.modified
+
+    # We waffle back and forth about using feedparser's HTTP support vs
+    # calling requests ourselves. We have decided to use requests manually at
+    # this time because it makes it much, much easier to figure out whether or
+    # not a request has succeeded. (The last straw was handling timeouts and
+    # understanding whether `bozo_exception` was a transport failure or not.)
+    #
+    # TODO: Check robots.txt!
+
+    try:
+        loop = asyncio.get_running_loop()
+        response = await loop.run_in_executor(
+            None,
+            functools.partial(http.get, meta.url, headers=headers),
+        )
+        LOG.info(f"{meta.url} fetched with status: {response.status_code}")
+        failed = response.status_code >= 400
+    except Exception as e:
+        LOG.error(f"{meta.url} error fetching: {e}")
+        failed = True
+        response = None
+
+    # Now, there are a number of things in the response that we need to
+    # consider when updating our permanent record.
+
+    if response is not None and response.status_code == 410:
+        # Permanently gone, really stop asking.
+        LOG.error(f"{meta.url} permanently gone")
+        return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))
+
+    if failed and time.time() > meta.last_fetched_ts + (7 * 24 * 60 * 60):
+        # If we've been failing to fetch the feed for more than a week then
+        # consider us dead, we must be doing something wrong.
+        LOG.error(f"{meta.url} failed for too long, giving up")
+        return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))
+
+    if response and response.is_permanent_redirect:
+        # Permanent redirect, update the stored URL, but mark this as a
+        # successful fetch.
+        #
+        # TODO: Is this actually the right URL to store? We need the last
+        #       permanently redirected URL, not just whatever the last thing
+        #       is... e.g. imagine a permanent followed by a temporary
+        #       redirect, then what?
+        LOG.info(f"{meta.url} permanently redirected to {response.url}")
+        assert response.url is not None
+        meta = dataclasses.replace(meta, url=response.url)
+
+    # TODO: Handle that bogus non-HTTP redirect that feedfinder uses.
+
+    # NOTE: We might still be in a failure state here. But success or fail,
+    #       the server might have told us when to next retry, so make a note
+    #       of it. The server might also have given us updated caching
+    #       information (even on failure!) and so let's also make a note of that.
+    retry_delta = None
+    etag = meta.etag
+    modified = meta.modified
+    if response is not None:
+        etag = response.headers.get("etag", meta.etag)
+        modified = response.headers.get("last-modified", meta.modified)
+
+        try:
+            retry_delta = int(response.headers.get("retry-after", "nope"))
+        except Exception:
+            pass
+
+    if retry_delta is None:
+        if failed:
+            retry_delta = 1 * 60  # Retry again in a minute
+        else:
+            retry_delta = 60 * 60  # 1 hour default
+
+    meta = dataclasses.replace(
+        meta,
+        retry_after_ts=int(time.time()) + retry_delta,
+        etag=etag,
+        modified=modified,
+    )
+
+    # We've done everything we can on a failure, bail if we've got an error.
+    if failed:
+        LOG.info(f"{meta.url} failed at the network level")
+        return (None, meta)
+
+    assert response is not None
+
+    # Record our successful fetch now, to reset the failure timer above.
+    meta = dataclasses.replace(meta, last_fetched_ts=int(time.time()))
+
+    # We can *still* be successful but like, no changes.
+    if response.status_code != 200:
+        LOG.info(f"{meta.url} had no changes")
+        return (None, meta)
+
+    # Does this seem to be a feed? Or not?
+    if could_be_feed_data(response.text):
+        parsed = feedparser.parse(response.content, response_headers=response.headers)
+        return (Feed.from_parsed(parsed, meta), meta)
+
+    # No, this is not a feed; just return the raw content for further
+    # processing.
+    return (response.text, meta)
+
+
+async def fetch_many(
+    metas: list[FeedMeta],
+) -> list[typing.Tuple[Feed | str | None, FeedMeta]]:
+    async with asyncio.TaskGroup() as group:
+        tasks = [group.create_task(fetch_feed(m)) for m in metas]
+    return [t.result() for t in tasks]
+
+
 def merge_feeds(a: Feed, b: Feed) -> Feed:
    """Merge two known feeds. There are two conflict resolution policies:

@ -436,3 +454,180 @@ def sort_key(f: Feed) -> int:
    if len(f.entries) > 0:
        return max(e.inserted_at for e in f.entries)
    return -1
+
+
+class FeedSearchParser(html.parser.HTMLParser):
+    """An HTML parser that tries to find links to feeds."""
+
+    FEED_TYPES = (
+        "application/rss+xml",
+        "text/xml",
+        "application/atom+xml",
+        "application/x.atom+xml",
+        "application/x-atom+xml",
+    )
+
+    link_links: list[str]
+    a_links: list[str]
+
+    def __init__(self, baseuri):
+        super().__init__()
+        self.baseuri = baseuri
+        self.link_links = []
+        self.a_links = []
+
+    def handle_starttag(self, tag, attrs):
+        attrs = {k: v for k, v in attrs}
+        if tag == "base":
+            self.do_base(attrs)
+        elif tag == "link":
+            self.do_link(attrs)
+        elif tag == "a":
+            self.do_a(attrs)
+
+    def do_base(self, attrs):
+        base = attrs.get("href")
+        if base is not None:
+            self.baseuri = base
+
+    def do_link(self, attrs):
+        rel = attrs.get("rel")
+        if rel is None:
+            return
+
+        if "alternate" not in rel.split():
+            return
+
+        if attrs.get("type", "").lower() not in self.FEED_TYPES:
+            return
+
+        href = attrs.get("href")
+        if href is None:
+            return
+
+        self.link_links.append(urllib.parse.urljoin(self.baseuri, href))
+
+    def do_a(self, attrs):
+        href = attrs.get("href")
+        if href is None:
+            return
+
+        self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
+
+
+def massage_url(uri: str) -> str:
+    uri = uri.strip()
+    if uri.startswith("feed://"):
+        uri = "http://" + uri.split("feed://", 1).pop()
+    for x in ["http", "https"]:
+        if uri.startswith("%s://" % x):
+            return uri
+    return "http://%s" % uri
+
+
+def classify_links(links, baseuri) -> typing.Tuple[list[str], list[str]]:
+    """Split the links into two sets: local (which start with baseuri) and
+    remote (which don't).
+    """
+    baseuri = baseuri.lower()
+
+    local, remote = [], []
+    for link in links:
+        if link.lower().startswith(baseuri):
+            local.append(link)
+        else:
+            remote.append(link)
+
+    return local, remote
+
+
+def is_feed_link(link: str) -> bool:
+    """Return True if the link seems to be a feed link, or False otherwise."""
+    link = link.lower()
+    return (
+        link.endswith(".rss")
+        or link.endswith(".rdf")
+        or link.endswith(".xml")
+        or link.endswith(".atom")
+    )
+
+
+def is_XML_related_link(link: str) -> bool:
+    link = link.lower()
+    return "rss" in link or "rdf" in link or "xml" in link or "atom" in link
+
+
+async def check_feed(url: str, origin: str) -> Feed | None:
+    """Check to see if the given URL is a feed. If it is, return the feed,
+    otherwise return None.
+    """
+    meta = FeedMeta.from_url(url, origin)
+    result, meta = await fetch_feed(meta)
+    if isinstance(result, Feed):
+        return result
+
+    return None
+
+
+async def check_links(links: typing.Iterable[str], origin: str) -> list[Feed]:
+    """Fetch all the links and return the ones that appear to have feeds in
+    them. If none of them are fetchable or none of them have feeds then this
+    will return an empty list.
+    """
+    async with asyncio.TaskGroup() as group:
+        tasks = [group.create_task(check_feed(link, origin)) for link in links]
+
+    outfeeds: list[Feed] = []
+    for task in tasks:
+        result = task.result()
+        if result is not None:
+            outfeeds.append(result)
+
+    return outfeeds
+
+
+async def feed_search(uri: str, origin: str) -> list[Feed]:
+    meta = FeedMeta.from_url(massage_url(uri), origin)
+    result, meta = await fetch_feed(meta)
+    if result is None:
+        return []
+    if isinstance(result, Feed):
+        return [result]
+
+    # OK it was not a feed, let's try all our searching games.
+    parser = FeedSearchParser(meta.url)
+    parser.feed(result)
+
+    LOG.debug("Checking links...")
+    outfeeds = await check_links(parser.link_links, origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+
+    LOG.debug("No links, checking A tags...")
+    local_links, remote_links = classify_links(parser.a_links, meta.url)
+    outfeeds = await check_links(filter(is_feed_link, local_links), origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+    outfeeds = await check_links(filter(is_XML_related_link, local_links), origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+    outfeeds = await check_links(filter(is_feed_link, remote_links), origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+    outfeeds = await check_links(filter(is_XML_related_link, remote_links), origin)
+    if len(outfeeds) > 0:
+        return outfeeds
+
+    LOG.debug("no A tags, guessing")
+    suffixes = [  # filenames used by popular software:
+        "atom.xml",  # blogger, TypePad
+        "index.atom",  # MT, apparently
+        "index.rdf",  # MT
+        "rss.xml",  # Dave Winer/Manila
+        "index.xml",  # MT
+        "index.rss",  # Slash
+    ]
+    outfeeds = await check_links(
+        [urllib.parse.urljoin(meta.url, x) for x in suffixes], origin
+    )
+    return outfeeds
--- a/cry/feedfinder.py
+++ b/cry/feedfinder.py
@ -2,8 +2,10 @@

 Based on http://www.aaronsw.com/2002/feedfinder/

-Kinda rewritten by John Doty for the Python3 and the cry aggregator, but the
-basic frame remains.
+Rewritten by John Doty for Python 3 and the cry aggregator, but the basic
+frame remains. The big thing *this* does is also return the FeedMeta when it
+has found feeds, instead of just URLs. This is more useful for the rest of
+processing.
 """

 import logging
@ -17,6 +19,7 @@ import urllib.robotparser

 import requests

+from . import feed

 LOG = logging.getLogger(__name__)

@ -125,7 +128,7 @@ class HtmlBasedParser(html.parser.HTMLParser):
        self.a_links.append(urllib.parse.urljoin(self.baseuri, href))


-def makeFullURI(uri):
+def makeFullURI(uri: str) -> str:
    uri = uri.strip()
    if uri.startswith("feed://"):
        uri = "http://" + uri.split("feed://", 1).pop()
@ -204,8 +207,6 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
    """Find feeds for the given URI.

    How it works:
-    0.
-
    1. If the URI points to a feed, it is simply returned; otherwise
       the page is downloaded and the real fun begins.

@ -293,58 +294,3 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
        )

    return list(set(outfeeds))
-
-
-##### test harness ######
-
-
-def test():
-    uri = "http://diveintomark.org/tests/client/autodiscovery/html4-001.html"
-    failed = []
-    count = 0
-    while 1:
-        data = _gatekeeper.get(uri)
-        if data.find("Atom autodiscovery test") == -1:
-            break
-        sys.stdout.write(".")
-        sys.stdout.flush()
-        count += 1
-        links = getLinks(data, uri)
-        if not links:
-            print(f"\n*** FAILED *** {uri} could not find link")
-            failed.append(uri)
-        elif len(links) > 1:
-            print(f"\n*** FAILED *** {uri} found too many links")
-            failed.append(uri)
-        else:
-            atomdata = requests.get(links[0]).text
-            if atomdata.find('<link rel="alternate"') == -1:
-                print(f"\n*** FAILED *** {uri} retrieved something that is not a feed")
-                failed.append(uri)
-            else:
-                backlink = atomdata.split('href="').pop().split('"')[0]
-                if backlink != uri:
-                    print(f"\n*** FAILED *** {uri} retrieved wrong feed")
-                    failed.append(uri)
-        if data.find('<link rel="next" href="') == -1:
-            break
-        uri = urllib.parse.urljoin(
-            uri, data.split('<link rel="next" href="').pop().split('"')[0]
-        )
-    print()
-    print(f"{count} tests executed, {len(failed)} failed")
-
-
-if __name__ == "__main__":
-    args = sys.argv[1:]
-    if args and args[0] == "--debug":
-        _debug = 1
-        args.pop(0)
-    if args:
-        uri = args[0]
-    else:
-        uri = "http://diveintomark.org/"
-    if uri == "test":
-        test()
-    else:
-        print("\n".join(getFeeds(uri)))