From e83e5c96026311b2b6f1e5173d05793ee49edafe Mon Sep 17 00:00:00 2001 From: John Doty Date: Sun, 28 Jul 2024 11:01:41 -0700 Subject: [PATCH] Comments and whatnot --- cry/feed.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cry/feed.py b/cry/feed.py index 913ee7b..135fbf4 100644 --- a/cry/feed.py +++ b/cry/feed.py @@ -296,12 +296,15 @@ def could_be_feed_data(data: str) -> bool: class Guardian: + """A keeper of robots.txt files.""" + permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock] def __init__(self): self.permissions = {} async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser: + """Fetch the robots parser for the given URL. Only do it once per site.""" url = urllib.parse.urljoin(url, "/robots.txt") parser = self.permissions.get(url) if parser is None: @@ -317,6 +320,7 @@ class Guardian: if isinstance(parser, urllib.robotparser.RobotFileParser): return parser + LOG.debug(f"{url} Fetching robots.txt...") headers = {"user-agent": USER_AGENT} event_loop = asyncio.get_running_loop() response = await event_loop.run_in_executor( @@ -330,9 +334,15 @@ class Guardian: parser = urllib.robotparser.RobotFileParser(url) if response.status_code in (401, 403): - parser.disallow_all = True + LOG.debug( + f"{url} Server says {response.status_code}, assuming we can't fetch anything" + ) + parser.disallow_all = True # type: ignore elif response.status_code >= 400 and response.status_code < 500: - parser.allow_all = True + LOG.debug( + f"{url} Server says {response.status_code}, assume we have free rein" + ) + parser.allow_all = True # type: ignore elif response.status_code >= 300: response.raise_for_status() else: @@ -343,10 +353,12 @@ class Guardian: return parser async def can_fetch(self, url: str) -> bool: + """Returns true if we are allowed to fetch the given URL.""" parser = await self.get_robots_parser(url) return parser.can_fetch(USER_AGENT, url) async def crawl_delay(self, url: str) -> 
int | None: + """Returns the number of seconds we should wait before fetching again.""" parser = await self.get_robots_parser(url) result = parser.crawl_delay(USER_AGENT) if isinstance(result, str):