Stop paying attention to the robots.txt for feeds
This commit is contained in:
parent
e83e5c9602
commit
65e4b3f1f7
1 changed files with 14 additions and 2 deletions
16
cry/feed.py
16
cry/feed.py
@@ -354,8 +354,20 @@ class Guardian:

     async def can_fetch(self, url: str) -> bool:
         """Returns true if we are allowed to fetch the given URL."""
-        parser = await self.get_robots_parser(url)
-        return parser.can_fetch(USER_AGENT, url)
+        # Look, opinions differ on whether feed readers are supposed to be
+        # considered robots. I added robots.txt support for feeds based on
+        # the example of the feed finder python code but on reflection it
+        # does not do what I want it to do and the world seems to suggest
+        # that RSS readers should ignore it. (i.e., jwz blocks robots from
+        # accessing the RSS feed, so.)
+        #
+        # I'm leaving this code here so that I can resurrect it later if
+        # necessary.
+        #
+        # parser = await self.get_robots_parser(url)
+        # return parser.can_fetch(USER_AGENT, url)
+        del url
+        return True

     async def crawl_delay(self, url: str) -> int | None:
         """Returns the number of seconds we should wait before fetching again."""
Loading…
Add table
Add a link
Reference in a new issue