From bf41f70209ab7fbd8fb74ab786c8dc36f0d38b00 Mon Sep 17 00:00:00 2001
From: John Doty
Date: Sun, 28 Jul 2024 10:49:07 -0700
Subject: [PATCH] Check robots.txt for guidance and permission when fetching

It's only polite.
---
 cry/feed.py | 118 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 106 insertions(+), 12 deletions(-)

diff --git a/cry/feed.py b/cry/feed.py
index 6e7b4fa..913ee7b 100644
--- a/cry/feed.py
+++ b/cry/feed.py
@@ -10,6 +10,7 @@ import re
 import time
 import typing
 import urllib.parse
+import urllib.robotparser
 
 import feedparser
 
@@ -294,6 +295,92 @@ def could_be_feed_data(data: str) -> bool:
     return (data.count(" 0
 
 
+class Guardian:
+    """Cache of per-site robots.txt rules used to vet feed fetches.
+
+    `permissions` maps a robots.txt URL to its parsed rules or, until those
+    rules have been fetched, to the lock that serializes the fetch.
+    """
+
+    permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
+
+    def __init__(self):
+        self.permissions = {}
+
+    async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
+        """Return the robots.txt parser for the site that hosts `url`.
+
+        The first caller for a site downloads and parses its robots.txt;
+        callers arriving meanwhile wait on the same lock and reuse the result.
+        Errors from the download, or from `raise_for_status()`, propagate and
+        leave the lock in place so that a later call tries again.
+        """
+        url = urllib.parse.urljoin(url, "/robots.txt")
+        parser = self.permissions.get(url)
+        if parser is None:
+            parser = asyncio.Lock()
+            self.permissions[url] = parser
+
+        if isinstance(parser, urllib.robotparser.RobotFileParser):
+            return parser
+
+        assert isinstance(parser, asyncio.Lock)
+        async with parser:
+            # Someone else may have finished the fetch while we waited.
+            parser = self.permissions.get(url)
+            if isinstance(parser, urllib.robotparser.RobotFileParser):
+                return parser
+
+            headers = {"user-agent": USER_AGENT}
+            event_loop = asyncio.get_running_loop()
+            response = await event_loop.run_in_executor(
+                None,
+                functools.partial(
+                    requests.get,
+                    url,
+                    headers=headers,
+                ),
+            )
+
+            parser = urllib.robotparser.RobotFileParser(url)
+            if response.status_code in (401, 403):
+                # We are not allowed to read the rules: stay away entirely.
+                parser.disallow_all = True
+            elif response.status_code >= 400 and response.status_code < 500:
+                # No usable robots.txt: nothing is restricted.
+                parser.allow_all = True
+            elif response.status_code >= 300:
+                response.raise_for_status()
+            else:
+                text = await event_loop.run_in_executor(None, lambda: response.text)
+                parser.parse(text.splitlines())
+
+            self.permissions[url] = parser
+            return parser
+
+    async def can_fetch(self, url: str) -> bool:
+        """Return whether robots.txt lets USER_AGENT fetch `url`."""
+        parser = await self.get_robots_parser(url)
+        return parser.can_fetch(USER_AGENT, url)
+
+    async def crawl_delay(self, url: str) -> int | None:
+        """Return the robots.txt Crawl-delay for USER_AGENT, or None."""
+        parser = await self.get_robots_parser(url)
+        result = parser.crawl_delay(USER_AGENT)
+        if result is None:
+            return None
+
+        # RobotFileParser converts the delay to an int while parsing, so an
+        # isinstance(str) test never matches; convert whatever we were given.
+        try:
+            return int(result)
+        except (TypeError, ValueError):
+            return None
+
+
+GUARDIAN = Guardian()
+
+
 async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
     if not meta.should_fetch(time.time()):
         return (None, meta)
@@ -309,19 +396,22 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
     # this time because it make it much much easier to figure out whether or
     # not a request has succeeded. (The straw was handling timeouts and
     # understanding whether `bozo_exception` was a transport failure or not.)
-    #
-    # TODO: Check robots.txt!
 
-    try:
-        loop = asyncio.get_running_loop()
-        response = await loop.run_in_executor(
-            None,
-            functools.partial(http.get, meta.url, headers=headers),
-        )
-        LOG.info(f"{meta.url} fetched with status: {response.status_code}")
-        failed = response.status_code >= 400
-    except Exception as e:
-        LOG.error(f"{meta.url} error fetching: {e}")
+    if await GUARDIAN.can_fetch(meta.url):
+        try:
+            loop = asyncio.get_running_loop()
+            response = await loop.run_in_executor(
+                None,
+                functools.partial(http.get, meta.url, headers=headers),
+            )
+            LOG.info(f"{meta.url} fetched with status: {response.status_code}")
+            failed = response.status_code >= 400
+        except Exception as e:
+            LOG.error(f"{meta.url} error fetching: {e}")
+            failed = True
+            response = None
+    else:
+        LOG.error(f"{meta.url} Guardian says we cannot fetch")
         failed = True
         response = None
 
@@ -381,6 +471,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
         except Exception:
             pass
 
+    if retry_delta is None:
+        # See if robots.txt has any guidance for us.
+        retry_delta = await GUARDIAN.crawl_delay(meta.url)
+
     if retry_delta is None:
         if failed:
             retry_delta = 1 * 60  # Retry again in a minute