Check robots.txt for guidance and permission when fetching

It's only polite.
2024-07-28 10:49:07 -07:00 · 2024-07-28 10:49:07 -07:00 · bf41f70209
commit bf41f70209
parent eab6cf609d
1 changed files with 84 additions and 12 deletions
--- a/cry/feed.py
+++ b/cry/feed.py
@@ -10,6 +10,7 @@ import re
 import time
 import typing
 import urllib.parse
+import urllib.robotparser


 import feedparser
@@ -294,6 +295,70 @@ def could_be_feed_data(data: str) -> bool:
    return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0


+class Guardian:
+    """Caches one robots.txt parser per site and answers policy questions."""
+
+    # robots.txt URL -> Lock while its first fetch is pending, then the parser.
+    permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
+
+    def __init__(self):
+        self.permissions = {}
+
+    async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
+        """Return the parser for `url`'s site, downloading robots.txt once.
+
+        401/403 deny all, other 4xx allow all, 5xx raises and caches no parser.
+        """
+        url = urllib.parse.urljoin(url, "/robots.txt")
+        parser = self.permissions.setdefault(url, asyncio.Lock())
+        if isinstance(parser, urllib.robotparser.RobotFileParser):
+            return parser
+
+        assert isinstance(parser, asyncio.Lock)
+        async with parser:
+            # Whoever held the lock before us may already have stored a parser.
+            parser = self.permissions.get(url)
+            if isinstance(parser, urllib.robotparser.RobotFileParser):
+                return parser
+
+            headers = {"user-agent": USER_AGENT}
+            fetch = functools.partial(requests.get, url, headers=headers)
+            event_loop = asyncio.get_running_loop()
+            response = await event_loop.run_in_executor(None, fetch)
+
+            parser = urllib.robotparser.RobotFileParser(url)
+            if response.status_code in (401, 403):
+                parser.disallow_all = True
+            elif response.status_code >= 400 and response.status_code < 500:
+                parser.allow_all = True
+            elif response.status_code >= 300:
+                response.raise_for_status()
+            else:
+                text = await event_loop.run_in_executor(None, lambda: response.text)
+                parser.parse(text.splitlines())
+
+            self.permissions[url] = parser
+            return parser
+
+    async def can_fetch(self, url: str) -> bool:
+        """Return whether robots.txt lets USER_AGENT fetch `url`."""
+        parser = await self.get_robots_parser(url)
+        return parser.can_fetch(USER_AGENT, url)
+
+    async def crawl_delay(self, url: str) -> int | None:
+        """Return the site's Crawl-delay for USER_AGENT, or None if unset."""
+        parser = await self.get_robots_parser(url)
+        # CPython's parser returns an int here, so accept int as well as str.
+        result = parser.crawl_delay(USER_AGENT)
+        try:
+            return None if result is None else int(result)
+        except (TypeError, ValueError):
+            return None
+
+
+GUARDIAN = Guardian()
+
+
 async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
    if not meta.should_fetch(time.time()):
        return (None, meta)
@@ -309,9 +374,8 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
    # this time because it make it much much easier to figure out whether or
    # not a request has succeeded. (The straw was handling timeouts and
    # understanding whether `bozo_exception` was a transport failure or not.)
-    #
-    # TODO: Check robots.txt!

+    if await GUARDIAN.can_fetch(meta.url):
        try:
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
@@ -324,6 +388,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
            LOG.error(f"{meta.url} error fetching: {e}")
            failed = True
            response = None
+    else:
+        LOG.error(f"{meta.url} Guardian says we cannot fetch")
+        failed = True
+        response = None

    # Now, there are a number of things to consider in the response that
    # we need to consider in updating our permanent record.
@@ -381,6 +449,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
        except Exception:
            pass

+    if retry_delta is None:
+        # See if robots.txt has any guidance for us.
+        retry_delta = await GUARDIAN.crawl_delay(meta.url)
+
    if retry_delta is None:
        if failed:
            retry_delta = 1 * 60  # Retry again in a minute