cry/cry/feedfinder.py
John Doty cd20db0c4c Subscribe now searches
Rewrite feed finder again to not multi-fetch when not needed
2024-07-15 17:23:24 +09:00

296 lines
8.4 KiB
Python

"""feedfinder: Find the Web feed for a Web page
Based on http://www.aaronsw.com/2002/feedfinder/
Rewritten by John Doty for Python 3 and the cry aggregator, but the basic
frame remains. The big thing *this* does is also return the FeedMeta when it
has found feeds, instead of just URLs. This is more useful for the rest of
processing.
"""
import logging
import re
import sys
import typing
import urllib
import urllib.parse
import urllib.request
import urllib.robotparser
import requests
from . import feed
LOG = logging.getLogger(__name__)
class URLGatekeeper:
    """Track and enforce robots.txt rules across multiple servers.

    Each domain's robots.txt is fetched at most once and cached in
    `rpcache`.  The instance also owns the shared requests.Session used
    for all HTTP traffic, with the crawler's user-agent installed.
    """

    def __init__(self):
        # Cache of RobotFileParser objects, keyed by domain.
        self.rpcache = {}
        self.agent = "cry/0.9"  # plain string; it has no interpolation
        self.session = requests.Session()
        self.session.headers["user-agent"] = self.agent
        LOG.debug(f"User agent: {self.agent}")

    def _getrp(self, url):
        """Return the (cached) RobotFileParser for `url`'s domain."""
        protocol, domain = urllib.parse.urlparse(url)[:2]
        if domain in self.rpcache:
            return self.rpcache[domain]
        baseurl = "%s://%s" % (protocol, domain)
        robotsurl = urllib.parse.urljoin(baseurl, "robots.txt")
        rp = urllib.robotparser.RobotFileParser(robotsurl)
        try:
            response = self.session.get(robotsurl)
            rp.parse(response.text.splitlines())
        except Exception:
            # Best-effort: an unreachable/unparsable robots.txt is treated
            # as empty (RobotFileParser then allows everything), but log it
            # rather than silently swallowing the error.
            LOG.debug(f"could not fetch robots.txt from {robotsurl}", exc_info=True)
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        """Return True if robots.txt for `url`'s domain permits fetching it."""
        rp = self._getrp(url)
        allow = rp.can_fetch(self.agent, url)
        LOG.debug(f"gatekeeper of {url} says {allow}")
        return allow

    def get(self, url, check=True):
        """Fetch `url` and return its text, or "" on error or robots denial.

        Pass check=False to bypass the robots.txt check.
        """
        if check and not self.can_fetch(url):
            return ""
        try:
            return self.session.get(url, timeout=10).text
        except Exception:
            LOG.debug(f"failed to fetch {url}", exc_info=True)
            return ""
_gatekeeper = URLGatekeeper()
import html.parser


class HtmlBasedParser(html.parser.HTMLParser):
    """Collect feed-autodiscovery <link> URLs and all <a> URLs from a page.

    After feeding HTML, `link_links` holds the hrefs of
    <link rel="alternate" type="...feed type..."> tags and `a_links` holds
    every <a href=...>, both resolved against the (possibly <base>-updated)
    base URI.
    """

    # MIME types that mark a <link rel="alternate"> as a feed.
    FEED_TYPES = (
        "application/rss+xml",
        "text/xml",
        "application/atom+xml",
        "application/x.atom+xml",
        "application/x-atom+xml",
    )

    link_links: list[str]
    a_links: list[str]

    def __init__(self, baseuri: str):
        super().__init__()
        self.baseuri = baseuri
        self.link_links = []
        self.a_links = []

    def handle_starttag(self, tag, attrs):
        # html.parser yields attrs as (name, value) pairs; the value is
        # None for valueless attributes (e.g. <link type>).
        attrs = dict(attrs)
        if tag == "base":
            self.do_base(attrs)
        elif tag == "link":
            self.do_link(attrs)
        elif tag == "a":
            self.do_a(attrs)

    def do_base(self, attrs):
        """<base href=...> changes the URI later links resolve against."""
        base = attrs.get("href")
        if base is not None:
            self.baseuri = base

    def do_link(self, attrs):
        """Record <link rel="alternate"> tags whose type is a known feed type."""
        rel = attrs.get("rel")
        if rel is None:
            return
        if "alternate" not in rel.split():
            return
        # `or ""` guards a valueless type attribute: the parser reports it
        # as None, so the previous .get("type", "") still yielded None and
        # crashed on .lower() with AttributeError.
        if (attrs.get("type") or "").lower() not in self.FEED_TYPES:
            return
        href = attrs.get("href")
        if href is None:
            return
        self.link_links.append(urllib.parse.urljoin(self.baseuri, href))

    def do_a(self, attrs):
        """Record every <a href=...>, resolved against the base URI."""
        href = attrs.get("href")
        if href is None:
            return
        self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
def makeFullURI(uri: str) -> str:
    """Normalize `uri` to an absolute http(s) URI.

    A feed:// scheme is rewritten to http://, and a bare host/path with no
    recognized scheme gets an http:// prefix.
    """
    uri = uri.strip()
    if uri.startswith("feed://"):
        uri = "http://" + uri[len("feed://"):]
    if uri.startswith(("http://", "https://")):
        return uri
    return "http://%s" % uri
def classify_links(links, baseuri) -> typing.Tuple[list[str], list[str]]:
    """Partition `links` into (local, remote).

    A link is "local" when it starts with `baseuri` (case-insensitively);
    everything else is "remote".  Original ordering is preserved within
    each bucket.
    """
    prefix = baseuri.lower()
    buckets: dict[bool, list[str]] = {True: [], False: []}
    for link in links:
        buckets[link.lower().startswith(prefix)].append(link)
    return buckets[True], buckets[False]
def is_feed_link(link: str) -> bool:
    """Return True if the link seems to be a feed link, or False otherwise."""
    # endswith accepts a tuple of suffixes; check them all in one call.
    return link.lower().endswith((".rss", ".rdf", ".xml", ".atom"))
def is_XML_related_link(link: str) -> bool:
    """Return True if the link merely mentions a feed/XML-ish token anywhere."""
    lowered = link.lower()
    return any(token in lowered for token in ("rss", "rdf", "xml", "atom"))
r_brokenRedirect = re.compile("<newLocation[^>]*>(.*?)</newLocation>", re.S)
def try_broken_redirect(data) -> str | None:
"""See if the content is a 'broken redirect'.
This is in the code taken from aaronsw and I don't know what, if anything,
ever generated this.
"""
if "<newLocation" in data:
newuris = r_brokenRedirect.findall(data)
if newuris:
return newuris[0].strip()
def could_be_feed_data(data: str) -> bool:
    """Heuristic: True if `data` looks like feed XML rather than an HTML page."""
    lowered = data.lower()
    # Anything containing an <html tag is a web page, never a feed.
    if "<html" in lowered:
        return False
    return any(tag in lowered for tag in ("<rss", "<rdf", "<feed"))
def is_feed(uri: str) -> bool:
    """Fetch `uri` (robots.txt permitting) and check if it looks like a feed."""
    LOG.debug(f"seeing if {uri} is a feed")
    scheme = urllib.parse.urlparse(uri).scheme
    if scheme not in ("http", "https"):
        return False
    return could_be_feed_data(_gatekeeper.get(uri))
def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
    """Find feeds for the given URI.

    How it works:
    1. If the URI points to a feed, it is simply returned; otherwise
       the page is downloaded and the real fun begins.
    2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
    3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml",
       or ".atom"
    4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or
       "atom"
    5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml",
       or ".atom"
    6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or
       "atom"
    7. Try some guesses about common places for feeds. (index.xml, atom.xml,
       etc.)

    (At every step, feeds are minimally verified to make sure they are really
    feeds.)

    If `all` is True then return all possible feeds, kinda sorta ordered in
    terms of goodness. Otherwise, we stop as soon as one of the above steps
    finds a likely feed.

    NOTE: `all` shadows the builtin, but it is part of the public keyword
    interface, so the name is kept.  `_recurs` tracks already-visited URIs
    to break broken-redirect loops.
    """
    if _recurs is None:
        _recurs = [uri]
    fulluri = makeFullURI(uri)
    try:
        data = _gatekeeper.get(fulluri, check=False)
    except Exception:
        # Gatekeeper already returns "" on fetch errors; this is a last-ditch
        # guard so the finder never propagates an exception to callers.
        return []
    # is this already a feed?
    if could_be_feed_data(data):
        return [fulluri]
    newuri = try_broken_redirect(data)
    if newuri and newuri not in _recurs:
        _recurs.append(newuri)
        # BUG FIX: this previously called the undefined name `feeds`, which
        # raised NameError whenever a broken redirect was actually followed.
        return find_feeds(newuri, all=all, _recurs=_recurs)
    # nope, it's a page, try LINK tags first
    parser = HtmlBasedParser(fulluri)
    parser.feed(data)
    outfeeds = [link for link in parser.link_links if is_feed(link)]
    LOG.info(f"found {len(outfeeds)} through LINK tags")
    if all or len(outfeeds) == 0:
        # no LINK tags, look for regular <A> links that point to feeds
        if not all:
            LOG.info("no LINK tags, looking at A tags")
        local_links, remote_links = classify_links(parser.a_links, fulluri)
        # look for obvious feed links on the same server
        outfeeds.extend(filter(is_feed, filter(is_feed_link, local_links)))
        if all or len(outfeeds) == 0:
            # look harder for feed links on the same server
            outfeeds.extend(filter(is_feed, filter(is_XML_related_link, local_links)))
        if all or len(outfeeds) == 0:
            # look for obvious feed links on another server
            outfeeds.extend(filter(is_feed, filter(is_feed_link, remote_links)))
        if all or len(outfeeds) == 0:
            # look harder for feed links on another server
            outfeeds.extend(filter(is_feed, filter(is_XML_related_link, remote_links)))
    if all or len(outfeeds) == 0:
        LOG.debug("no A tags, guessing")
        suffixes = [  # filenames used by popular software:
            "atom.xml",  # blogger, TypePad
            "index.atom",  # MT, apparently
            "index.rdf",  # MT
            "rss.xml",  # Dave Winer/Manila
            "index.xml",  # MT
            "index.rss",  # Slash
        ]
        outfeeds.extend(
            filter(is_feed, [urllib.parse.urljoin(fulluri, x) for x in suffixes])
        )
    # Deduplicate while preserving discovery order: list(set(...)) discarded
    # the "kinda sorta ordered in terms of goodness" promise above.
    return list(dict.fromkeys(outfeeds))