From 65e4b3f1f74bcf89f414f8e153076f0115e25ee3 Mon Sep 17 00:00:00 2001
From: John Doty
Date: Mon, 29 Jul 2024 10:01:11 -0700
Subject: [PATCH] Stop paying attention to the robots.txt for feeds

---
 cry/feed.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/cry/feed.py b/cry/feed.py
index 135fbf4..2746315 100644
--- a/cry/feed.py
+++ b/cry/feed.py
@@ -354,8 +354,20 @@ class Guardian:
 
     async def can_fetch(self, url: str) -> bool:
         """Returns true if we are allowed to fetch the given URL."""
-        parser = await self.get_robots_parser(url)
-        return parser.can_fetch(USER_AGENT, url)
+        # Look, opinions differ on whether feed readers are supposed to be
+        # considered robots. I added robots.txt support for feeds based on
+        # the example of the feed finder python code but on reflection it
+        # does not do what I want it to do and the world seems to suggest
+        # that RSS readers should ignore it. (i.e., jwz blocks robots from
+        # accessing the RSS feed, so.)
+        #
+        # I'm leaving this code here so that I can resurrect it later if
+        # necessary.
+        #
+        # parser = await self.get_robots_parser(url)
+        # return parser.can_fetch(USER_AGENT, url)
+        del url
+        return True
 
     async def crawl_delay(self, url: str) -> int | None:
         """Returns the number of seconds we should wait before fetching again."""