Comments and whatnot

This commit is contained in:
John Doty 2024-07-28 11:01:41 -07:00
parent bf41f70209
commit e83e5c9602

View file

@ -296,12 +296,15 @@ def could_be_feed_data(data: str) -> bool:
class Guardian:
"""A keeper of robots.txt files."""
permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
def __init__(self):
self.permissions = {}
async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
"""Fetch the robots parser for the given URL. Only do it once per site."""
url = urllib.parse.urljoin(url, "/robots.txt")
parser = self.permissions.get(url)
if parser is None:
@ -317,6 +320,7 @@ class Guardian:
if isinstance(parser, urllib.robotparser.RobotFileParser):
return parser
LOG.debug(f"{url} Fetching robots.txt...")
headers = {"user-agent": USER_AGENT}
event_loop = asyncio.get_running_loop()
response = await event_loop.run_in_executor(
@ -330,9 +334,15 @@ class Guardian:
parser = urllib.robotparser.RobotFileParser(url)
if response.status_code in (401, 403):
parser.disallow_all = True
LOG.debug(
f"{url} Server says {response.status_code}, asusming we can't fetch anything"
)
parser.disallow_all = True # type: ignore
elif response.status_code >= 400 and response.status_code < 500:
parser.allow_all = True
LOG.debug(
f"{url} Server says {response.status_code}, assume we have free reign"
)
parser.allow_all = True # type: ignore
elif response.status_code >= 300:
response.raise_for_status()
else:
@ -343,10 +353,12 @@ class Guardian:
return parser
async def can_fetch(self, url: str) -> bool:
    """Report whether the site's robots.txt lets us fetch `url`.

    The parser for the URL is obtained via `get_robots_parser` and is
    then queried on behalf of `USER_AGENT`.
    """
    robots = await self.get_robots_parser(url)
    allowed = robots.can_fetch(USER_AGENT, url)
    return allowed
async def crawl_delay(self, url: str) -> int | None:
"""Returns the number of seconds we should wait before fetching again."""
parser = await self.get_robots_parser(url)
result = parser.crawl_delay(USER_AGENT)
if isinstance(result, str):