From e83e5c96026311b2b6f1e5173d05793ee49edafe Mon Sep 17 00:00:00 2001 From: John Doty Date: Sun, 28 Jul 2024 11:01:41 -0700 Subject: [PATCH] Comments and whatnot --- cry/feed.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cry/feed.py b/cry/feed.py index 913ee7b..135fbf4 100644 --- a/cry/feed.py +++ b/cry/feed.py @@ -296,12 +296,15 @@ def could_be_feed_data(data: str) -> bool: class Guardian: + """A keeper of robots.txt files.""" + permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock] def __init__(self): self.permissions = {} async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser: + """Fetch the robots parser for the given URL. Only do it once per site.""" url = urllib.parse.urljoin(url, "/robots.txt") parser = self.permissions.get(url) if parser is None: @@ -317,6 +320,7 @@ class Guardian: if isinstance(parser, urllib.robotparser.RobotFileParser): return parser + LOG.debug(f"{url} Fetching robots.txt...") headers = {"user-agent": USER_AGENT} event_loop = asyncio.get_running_loop() response = await event_loop.run_in_executor( @@ -330,9 +334,15 @@ class Guardian: parser = urllib.robotparser.RobotFileParser(url) if response.status_code in (401, 403): - parser.disallow_all = True + LOG.debug( + f"{url} Server says {response.status_code}, assuming we can't fetch anything" + ) + parser.disallow_all = True # type: ignore elif response.status_code >= 400 and response.status_code < 500: - parser.allow_all = True + LOG.debug( + f"{url} Server says {response.status_code}, assume we have free rein" + ) + parser.allow_all = True # type: ignore elif response.status_code >= 300: response.raise_for_status() else: @@ -343,10 +353,12 @@ class Guardian: return parser async def can_fetch(self, url: str) -> bool: + """Returns true if we are allowed to fetch the given URL.""" parser = await self.get_robots_parser(url) return parser.can_fetch(USER_AGENT, url) async def crawl_delay(self, url: str) -> 
int | None: + """Returns the number of seconds we should wait before fetching again.""" parser = await self.get_robots_parser(url) result = parser.crawl_delay(USER_AGENT) if isinstance(result, str):