Comments and whatnot
This commit is contained in:
parent
bf41f70209
commit
e83e5c9602
1 changed file with 14 additions and 2 deletions
16
cry/feed.py
16
cry/feed.py
|
|
@ -296,12 +296,15 @@ def could_be_feed_data(data: str) -> bool:
|
||||||
|
|
||||||
|
|
||||||
class Guardian:
|
class Guardian:
|
||||||
|
"""A keeper of robots.txt files."""
|
||||||
|
|
||||||
permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
|
permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
|
||||||
|
|
||||||
def __init__(self):
    """Create a Guardian with an empty per-site robots.txt cache."""
    # Maps a site's robots.txt URL to either a parsed RobotFileParser
    # or an asyncio.Lock while a fetch is in flight (see class annotation).
    self.permissions = dict()
|
||||||
|
|
||||||
async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
|
async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
|
||||||
|
"""Fetch the robots parser for the given URL. Only do it once per site."""
|
||||||
url = urllib.parse.urljoin(url, "/robots.txt")
|
url = urllib.parse.urljoin(url, "/robots.txt")
|
||||||
parser = self.permissions.get(url)
|
parser = self.permissions.get(url)
|
||||||
if parser is None:
|
if parser is None:
|
||||||
|
|
@ -317,6 +320,7 @@ class Guardian:
|
||||||
if isinstance(parser, urllib.robotparser.RobotFileParser):
|
if isinstance(parser, urllib.robotparser.RobotFileParser):
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
LOG.debug(f"{url} Fetching robots.txt...")
|
||||||
headers = {"user-agent": USER_AGENT}
|
headers = {"user-agent": USER_AGENT}
|
||||||
event_loop = asyncio.get_running_loop()
|
event_loop = asyncio.get_running_loop()
|
||||||
response = await event_loop.run_in_executor(
|
response = await event_loop.run_in_executor(
|
||||||
|
|
@ -330,9 +334,15 @@ class Guardian:
|
||||||
|
|
||||||
parser = urllib.robotparser.RobotFileParser(url)
|
parser = urllib.robotparser.RobotFileParser(url)
|
||||||
if response.status_code in (401, 403):
|
if response.status_code in (401, 403):
|
||||||
parser.disallow_all = True
|
LOG.debug(
|
||||||
|
f"{url} Server says {response.status_code}, asusming we can't fetch anything"
|
||||||
|
)
|
||||||
|
parser.disallow_all = True # type: ignore
|
||||||
elif response.status_code >= 400 and response.status_code < 500:
|
elif response.status_code >= 400 and response.status_code < 500:
|
||||||
parser.allow_all = True
|
LOG.debug(
|
||||||
|
f"{url} Server says {response.status_code}, assume we have free reign"
|
||||||
|
)
|
||||||
|
parser.allow_all = True # type: ignore
|
||||||
elif response.status_code >= 300:
|
elif response.status_code >= 300:
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
else:
|
else:
|
||||||
|
|
@ -343,10 +353,12 @@ class Guardian:
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
async def can_fetch(self, url: str) -> bool:
    """Returns true if we are allowed to fetch the given URL."""
    # Resolve (and cache) the site's robots.txt, then ask it about this URL.
    robots = await self.get_robots_parser(url)
    return robots.can_fetch(USER_AGENT, url)
|
||||||
|
|
||||||
async def crawl_delay(self, url: str) -> int | None:
|
async def crawl_delay(self, url: str) -> int | None:
|
||||||
|
"""Returns the number of seconds we should wait before fetching again."""
|
||||||
parser = await self.get_robots_parser(url)
|
parser = await self.get_robots_parser(url)
|
||||||
result = parser.crawl_delay(USER_AGENT)
|
result = parser.crawl_delay(USER_AGENT)
|
||||||
if isinstance(result, str):
|
if isinstance(result, str):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue