Comments and whatnot
This commit is contained in:
parent
bf41f70209
commit
e83e5c9602
1 changed file with 14 additions and 2 deletions
16
cry/feed.py
16
cry/feed.py
|
|
@ -296,12 +296,15 @@ def could_be_feed_data(data: str) -> bool:
|
|||
|
||||
|
||||
class Guardian:
    """A keeper of robots.txt files."""

    # Maps a site's robots.txt URL to either a parsed RobotFileParser
    # (once fetched) or an asyncio.Lock (presumably held while a fetch
    # for that site is in flight, so each site is fetched only once —
    # the lock-handling code is partly out of view; confirm in
    # get_robots_parser).
    permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
def __init__(self):
|
||||
self.permissions = {}
|
||||
|
||||
async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
|
||||
"""Fetch the robots parser for the given URL. Only do it once per site."""
|
||||
url = urllib.parse.urljoin(url, "/robots.txt")
|
||||
parser = self.permissions.get(url)
|
||||
if parser is None:
|
||||
|
|
@ -317,6 +320,7 @@ class Guardian:
|
|||
if isinstance(parser, urllib.robotparser.RobotFileParser):
|
||||
return parser
|
||||
|
||||
LOG.debug(f"{url} Fetching robots.txt...")
|
||||
headers = {"user-agent": USER_AGENT}
|
||||
event_loop = asyncio.get_running_loop()
|
||||
response = await event_loop.run_in_executor(
|
||||
|
|
@ -330,9 +334,15 @@ class Guardian:
|
|||
|
||||
parser = urllib.robotparser.RobotFileParser(url)
|
||||
if response.status_code in (401, 403):
|
||||
parser.disallow_all = True
|
||||
LOG.debug(
|
||||
f"{url} Server says {response.status_code}, asusming we can't fetch anything"
|
||||
)
|
||||
parser.disallow_all = True # type: ignore
|
||||
elif response.status_code >= 400 and response.status_code < 500:
|
||||
parser.allow_all = True
|
||||
LOG.debug(
|
||||
f"{url} Server says {response.status_code}, assume we have free reign"
|
||||
)
|
||||
parser.allow_all = True # type: ignore
|
||||
elif response.status_code >= 300:
|
||||
response.raise_for_status()
|
||||
else:
|
||||
|
|
@ -343,10 +353,12 @@ class Guardian:
|
|||
return parser
|
||||
|
||||
async def can_fetch(self, url: str) -> bool:
|
||||
"""Returns true if we are allowed to fetch the given URL."""
|
||||
parser = await self.get_robots_parser(url)
|
||||
return parser.can_fetch(USER_AGENT, url)
|
||||
|
||||
async def crawl_delay(self, url: str) -> int | None:
|
||||
"""Returns the number of seconds we should wait before fetching again."""
|
||||
parser = await self.get_robots_parser(url)
|
||||
result = parser.crawl_delay(USER_AGENT)
|
||||
if isinstance(result, str):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue