Check robots.txt for guidance and permission when fetching
It's only polite.
This commit is contained in:
parent
eab6cf609d
commit
bf41f70209
1 changed files with 84 additions and 12 deletions
96
cry/feed.py
96
cry/feed.py
|
|
@ -10,6 +10,7 @@ import re
|
||||||
import time
|
import time
|
||||||
import typing
|
import typing
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
import urllib.robotparser
|
||||||
|
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
|
|
@ -294,6 +295,70 @@ def could_be_feed_data(data: str) -> bool:
|
||||||
return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
|
return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
|
||||||
|
|
||||||
|
|
||||||
|
class Guardian:
    """Fetches, caches, and consults per-site robots.txt policies.

    One instance is shared by all fetch tasks. For each site we store either
    a parsed ``RobotFileParser`` (ready to consult) or an ``asyncio.Lock``
    held by the task currently fetching that site's robots.txt, so concurrent
    callers wait for the in-flight fetch instead of duplicating it.
    """

    # Keyed by the absolute robots.txt URL for a site. Value is a Lock while
    # a fetch is in flight, then replaced by the parsed RobotFileParser.
    permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]

    def __init__(self):
        self.permissions = {}

    async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
        """Return a robots.txt parser for the site serving ``url``.

        Fetches and caches ``/robots.txt`` on first use; subsequent calls for
        the same site return the cached parser. Raises ``requests``' HTTP
        error (via ``raise_for_status``) on server errors, leaving the lock
        in place so a later caller retries the fetch.
        """
        url = urllib.parse.urljoin(url, "/robots.txt")
        parser = self.permissions.get(url)
        if parser is None:
            # First request for this site: install a lock so concurrent
            # callers block below instead of fetching redundantly.
            parser = asyncio.Lock()
            self.permissions[url] = parser

        if isinstance(parser, urllib.robotparser.RobotFileParser):
            return parser

        assert isinstance(parser, asyncio.Lock)
        async with parser:
            # Re-check under the lock: another task may have completed the
            # fetch while we were waiting.
            parser = self.permissions.get(url)
            if isinstance(parser, urllib.robotparser.RobotFileParser):
                return parser

            headers = {"user-agent": USER_AGENT}
            event_loop = asyncio.get_running_loop()
            # requests is synchronous; run it in the default executor so the
            # event loop is not blocked during the HTTP round trip.
            response = await event_loop.run_in_executor(
                None,
                functools.partial(
                    requests.get,
                    url,
                    headers=headers,
                ),
            )

            parser = urllib.robotparser.RobotFileParser(url)
            if response.status_code in (401, 403):
                # robots.txt itself is access-restricted: treat the whole
                # site as off limits.
                parser.disallow_all = True
            elif 400 <= response.status_code < 500:
                # robots.txt absent (or other client error): by convention
                # everything is allowed.
                parser.allow_all = True
            elif response.status_code >= 300:
                # 5xx raises here so the exception propagates and the lock
                # stays cached for a retry by the next caller.
                # NOTE(review): a 3xx reaching this branch (redirects are
                # normally followed by requests) would NOT raise, and the
                # unparsed parser stored below disallows everything —
                # confirm that is the intended behavior.
                response.raise_for_status()
            else:
                # Accessing .text decodes the body; keep that work off the
                # event loop as well.
                text = await event_loop.run_in_executor(None, lambda: response.text)
                parser.parse(text.splitlines())

            self.permissions[url] = parser
            return parser

    async def can_fetch(self, url: str) -> bool:
        """Return True when robots.txt permits USER_AGENT to fetch ``url``."""
        parser = await self.get_robots_parser(url)
        return parser.can_fetch(USER_AGENT, url)

    async def crawl_delay(self, url: str) -> int | None:
        """Return the site's Crawl-delay for USER_AGENT in whole seconds.

        Returns None when no delay is specified or the value is unusable.

        Bug fix: the original only converted ``str`` results, but
        ``RobotFileParser.crawl_delay`` returns the delay as an ``int``
        (parsed with ``int()``), so numeric delays fell through and were
        silently returned as None.
        """
        parser = await self.get_robots_parser(url)
        result = parser.crawl_delay(USER_AGENT)
        if result is None:
            return None
        try:
            # Normalize int/str (and any other numeric form) to int.
            return int(result)
        except (ValueError, TypeError):
            return None
||||||
|
# Module-level singleton: all fetch tasks share one robots.txt cache.
GUARDIAN = Guardian()
|
||||||
|
|
||||||
|
|
||||||
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
|
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
|
||||||
if not meta.should_fetch(time.time()):
|
if not meta.should_fetch(time.time()):
|
||||||
return (None, meta)
|
return (None, meta)
|
||||||
|
|
@ -309,19 +374,22 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
|
||||||
# this time because it makes it much much easier to figure out whether or
|
# this time because it makes it much much easier to figure out whether or
|
||||||
# not a request has succeeded. (The straw was handling timeouts and
|
# not a request has succeeded. (The straw was handling timeouts and
|
||||||
# understanding whether `bozo_exception` was a transport failure or not.)
|
# understanding whether `bozo_exception` was a transport failure or not.)
|
||||||
#
|
|
||||||
# TODO: Check robots.txt!
|
|
||||||
|
|
||||||
try:
|
if await GUARDIAN.can_fetch(meta.url):
|
||||||
loop = asyncio.get_running_loop()
|
try:
|
||||||
response = await loop.run_in_executor(
|
loop = asyncio.get_running_loop()
|
||||||
None,
|
response = await loop.run_in_executor(
|
||||||
functools.partial(http.get, meta.url, headers=headers),
|
None,
|
||||||
)
|
functools.partial(http.get, meta.url, headers=headers),
|
||||||
LOG.info(f"{meta.url} fetched with status: {response.status_code}")
|
)
|
||||||
failed = response.status_code >= 400
|
LOG.info(f"{meta.url} fetched with status: {response.status_code}")
|
||||||
except Exception as e:
|
failed = response.status_code >= 400
|
||||||
LOG.error(f"{meta.url} error fetching: {e}")
|
except Exception as e:
|
||||||
|
LOG.error(f"{meta.url} error fetching: {e}")
|
||||||
|
failed = True
|
||||||
|
response = None
|
||||||
|
else:
|
||||||
|
LOG.error(f"{meta.url} Guardian says we cannot fetch")
|
||||||
failed = True
|
failed = True
|
||||||
response = None
|
response = None
|
||||||
|
|
||||||
|
|
@ -381,6 +449,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
if retry_delta is None:
|
||||||
|
# See if robots.txt has any guidance for us.
|
||||||
|
retry_delta = await GUARDIAN.crawl_delay(meta.url)
|
||||||
|
|
||||||
if retry_delta is None:
|
if retry_delta is None:
|
||||||
if failed:
|
if failed:
|
||||||
retry_delta = 1 * 60 # Retry again in a minute
|
retry_delta = 1 * 60 # Retry again in a minute
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue