Stop paying attention to the robots.txt for feeds

This commit is contained in:
John Doty 2024-07-29 10:01:11 -07:00
parent e83e5c9602
commit 65e4b3f1f7

View file

@ -354,8 +354,20 @@ class Guardian:
async def can_fetch(self, url: str) -> bool:
    """Returns true if we are allowed to fetch the given URL."""
-   parser = await self.get_robots_parser(url)
-   return parser.can_fetch(USER_AGENT, url)
+   # Look, opinions differ on whether feed readers are supposed to be
+   # considered robots. I added robots.txt support for feeds based on
# the example of the feed finder python code but on reflection it
# does not do what I want it to do and the world seems to suggest
# that RSS readers should ignore it. (i.e., jwz blocks robots from
# accessing the RSS feed, so.)
#
# I'm leaving this code here so that I can resurrect it later if
# necessary.
#
# parser = await self.get_robots_parser(url)
# return parser.can_fetch(USER_AGENT, url)
del url
return True
async def crawl_delay(self, url: str) -> int | None:
    """Returns the number of seconds we should wait before fetching again."""