From bf41f70209ab7fbd8fb74ab786c8dc36f0d38b00 Mon Sep 17 00:00:00 2001
From: John Doty <john@d0ty.me>
Date: Sun, 28 Jul 2024 10:49:07 -0700
Subject: [PATCH] Check robots.txt for guidance and permission when fetching

It's only polite.
---
 cry/feed.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 84 insertions(+), 12 deletions(-)

diff --git a/cry/feed.py b/cry/feed.py
index 6e7b4fa..913ee7b 100644
--- a/cry/feed.py
+++ b/cry/feed.py
@@ -10,6 +10,7 @@ import re
 import time
 import typing
 import urllib.parse
+import urllib.robotparser
 
 
 import feedparser
@@ -294,6 +295,70 @@ def could_be_feed_data(data: str) -> bool:
     return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
 
 
+class Guardian:
+    """Caches each site's parsed robots.txt and answers fetch-policy questions."""
+
+    # Maps a robots.txt URL to its parser, or to the lock held while it loads.
+    permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
+
+    def __init__(self):
+        self.permissions = {}
+
+    async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
+        """Return the cached robots.txt parser for `url`'s site, loading it if needed."""
+        url = urllib.parse.urljoin(url, "/robots.txt")
+        parser = self.permissions.get(url)
+        if parser is None:
+            parser = asyncio.Lock()
+            self.permissions[url] = parser
+        if isinstance(parser, urllib.robotparser.RobotFileParser):
+            return parser
+
+        assert isinstance(parser, asyncio.Lock)
+        async with parser:
+            # Another task may have finished loading while we waited on the lock.
+            parser = self.permissions.get(url)
+            if isinstance(parser, urllib.robotparser.RobotFileParser):
+                return parser
+
+            headers = {"user-agent": USER_AGENT}
+            event_loop = asyncio.get_running_loop()
+            fetch = functools.partial(requests.get, url, headers=headers)
+            response = await event_loop.run_in_executor(None, fetch)
+
+            parser = urllib.robotparser.RobotFileParser(url)
+            if response.status_code in (401, 403):
+                parser.disallow_all = True
+            elif 400 <= response.status_code < 500:
+                parser.allow_all = True
+            elif response.status_code >= 300:
+                response.raise_for_status()
+            else:
+                text = await event_loop.run_in_executor(None, lambda: response.text)
+                parser.parse(text.splitlines())
+
+            self.permissions[url] = parser
+            return parser
+
+    async def can_fetch(self, url: str) -> bool:
+        """Return whether robots.txt lets our user agent fetch `url`."""
+        parser = await self.get_robots_parser(url)
+        return parser.can_fetch(USER_AGENT, url)
+
+    async def crawl_delay(self, url: str) -> int | None:
+        """Return the robots.txt Crawl-delay for our user agent, or None if unset."""
+        parser = await self.get_robots_parser(url)
+        result = parser.crawl_delay(USER_AGENT)
+        # CPython returns an int here (not a str); accept either, reject junk.
+        try:
+            return None if result is None else int(result)
+        except (TypeError, ValueError):
+            return None
+
+
+GUARDIAN = Guardian()  # Module-wide instance; fetch_feed consults it before fetching.
+
+
 async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
     if not meta.should_fetch(time.time()):
         return (None, meta)
@@ -309,19 +374,22 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
     # this time because it make it much much easier to figure out whether or
     # not a request has succeeded. (The straw was handling timeouts and
     # understanding whether `bozo_exception` was a transport failure or not.)
-    #
-    # TODO: Check robots.txt!
 
-    try:
-        loop = asyncio.get_running_loop()
-        response = await loop.run_in_executor(
-            None,
-            functools.partial(http.get, meta.url, headers=headers),
-        )
-        LOG.info(f"{meta.url} fetched with status: {response.status_code}")
-        failed = response.status_code >= 400
-    except Exception as e:
-        LOG.error(f"{meta.url} error fetching: {e}")
+    if await GUARDIAN.can_fetch(meta.url):
+        try:
+            loop = asyncio.get_running_loop()
+            response = await loop.run_in_executor(
+                None,
+                functools.partial(http.get, meta.url, headers=headers),
+            )
+            LOG.info(f"{meta.url} fetched with status: {response.status_code}")
+            failed = response.status_code >= 400
+        except Exception as e:
+            LOG.error(f"{meta.url} error fetching: {e}")
+            failed = True
+            response = None
+    else:
+        LOG.error(f"{meta.url} Guardian says we cannot fetch")
         failed = True
         response = None
 
@@ -381,6 +449,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
         except Exception:
             pass
 
+    if retry_delta is None:
+        # See if robots.txt has any guidance for us.
+        retry_delta = await GUARDIAN.crawl_delay(meta.url)
+
     if retry_delta is None:
         if failed:
             retry_delta = 1 * 60  # Retry again in a minute