Stop paying attention to the robots.txt for feeds
This commit is contained in:
parent
e83e5c9602
commit
65e4b3f1f7
1 changed files with 14 additions and 2 deletions
16
cry/feed.py
16
cry/feed.py
|
|
@ -354,8 +354,20 @@ class Guardian:
|
|||
|
||||
async def can_fetch(self, url: str) -> bool:
    """Return True unconditionally: feed fetching ignores robots.txt.

    Opinions differ on whether feed readers should be considered
    robots. robots.txt support was originally added here following the
    example of the feed-finder Python code, but on reflection it does
    not do what we want, and the prevailing view is that RSS readers
    should ignore it (jwz, for instance, blocks robots from accessing
    his RSS feed). The original check is preserved below, commented
    out, so it can be resurrected later if necessary:

        parser = await self.get_robots_parser(url)
        return parser.can_fetch(USER_AGENT, url)
    """
    del url  # deliberately unused — every feed URL is permitted
    return True
|
||||
|
||||
async def crawl_delay(self, url: str) -> int | None:
|
||||
"""Returns the number of seconds we should wait before fetching again."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue