Check robots.txt for guidance and permission when fetching

It's only polite.
This commit is contained in:
John Doty 2024-07-28 10:49:07 -07:00
parent eab6cf609d
commit bf41f70209

View file

@ -10,6 +10,7 @@ import re
import time
import typing
import urllib.parse
import urllib.robotparser
import feedparser
@ -294,6 +295,70 @@ def could_be_feed_data(data: str) -> bool:
return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
class Guardian:
permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
def __init__(self):
self.permissions = {}
async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
url = urllib.parse.urljoin(url, "/robots.txt")
parser = self.permissions.get(url)
if parser is None:
parser = asyncio.Lock()
self.permissions[url] = parser
if isinstance(parser, urllib.robotparser.RobotFileParser):
return parser
assert isinstance(parser, asyncio.Lock)
async with parser:
parser = self.permissions.get(url)
if isinstance(parser, urllib.robotparser.RobotFileParser):
return parser
headers = {"user-agent": USER_AGENT}
event_loop = asyncio.get_running_loop()
response = await event_loop.run_in_executor(
None,
functools.partial(
requests.get,
url,
headers=headers,
),
)
parser = urllib.robotparser.RobotFileParser(url)
if response.status_code in (401, 403):
parser.disallow_all = True
elif response.status_code >= 400 and response.status_code < 500:
parser.allow_all = True
elif response.status_code >= 300:
response.raise_for_status()
else:
text = await event_loop.run_in_executor(None, lambda: response.text)
parser.parse(text.splitlines())
self.permissions[url] = parser
return parser
async def can_fetch(self, url: str) -> bool:
parser = await self.get_robots_parser(url)
return parser.can_fetch(USER_AGENT, url)
async def crawl_delay(self, url: str) -> int | None:
parser = await self.get_robots_parser(url)
result = parser.crawl_delay(USER_AGENT)
if isinstance(result, str):
try:
return int(result)
except ValueError:
return None
# Shared module-level Guardian instance; fetch_feed consults it for
# robots.txt permission and crawl-delay guidance.
GUARDIAN = Guardian()
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
if not meta.should_fetch(time.time()):
return (None, meta)
@ -309,9 +374,8 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
# this time because it makes it much, much easier to figure out whether or
# not a request has succeeded. (The straw was handling timeouts and
# understanding whether `bozo_exception` was a transport failure or not.)
#
# TODO: Check robots.txt!
if await GUARDIAN.can_fetch(meta.url):
try:
loop = asyncio.get_running_loop()
response = await loop.run_in_executor(
@ -324,6 +388,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
LOG.error(f"{meta.url} error fetching: {e}")
failed = True
response = None
else:
LOG.error(f"{meta.url} Guardian says we cannot fetch")
failed = True
response = None
# Now, there are a number of things in the response that we need to
# consider when updating our permanent record.
@ -381,6 +449,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
except Exception:
pass
if retry_delta is None:
# See if robots.txt has any guidance for us.
retry_delta = await GUARDIAN.crawl_delay(meta.url)
if retry_delta is None:
if failed:
retry_delta = 1 * 60 # Retry again in a minute