Check robots.txt for guidance and permission when fetching
It's only polite.
This commit is contained in:
parent
eab6cf609d
commit
bf41f70209
1 changed file with 84 additions and 12 deletions
96
cry/feed.py
96
cry/feed.py
|
|
@ -10,6 +10,7 @@ import re
|
|||
import time
|
||||
import typing
|
||||
import urllib.parse
|
||||
import urllib.robotparser
|
||||
|
||||
|
||||
import feedparser
|
||||
|
|
@ -294,6 +295,70 @@ def could_be_feed_data(data: str) -> bool:
|
|||
return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
|
||||
|
||||
|
||||
class Guardian:
|
||||
permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
|
||||
|
||||
def __init__(self):
|
||||
self.permissions = {}
|
||||
|
||||
async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
|
||||
url = urllib.parse.urljoin(url, "/robots.txt")
|
||||
parser = self.permissions.get(url)
|
||||
if parser is None:
|
||||
parser = asyncio.Lock()
|
||||
self.permissions[url] = parser
|
||||
|
||||
if isinstance(parser, urllib.robotparser.RobotFileParser):
|
||||
return parser
|
||||
|
||||
assert isinstance(parser, asyncio.Lock)
|
||||
async with parser:
|
||||
parser = self.permissions.get(url)
|
||||
if isinstance(parser, urllib.robotparser.RobotFileParser):
|
||||
return parser
|
||||
|
||||
headers = {"user-agent": USER_AGENT}
|
||||
event_loop = asyncio.get_running_loop()
|
||||
response = await event_loop.run_in_executor(
|
||||
None,
|
||||
functools.partial(
|
||||
requests.get,
|
||||
url,
|
||||
headers=headers,
|
||||
),
|
||||
)
|
||||
|
||||
parser = urllib.robotparser.RobotFileParser(url)
|
||||
if response.status_code in (401, 403):
|
||||
parser.disallow_all = True
|
||||
elif response.status_code >= 400 and response.status_code < 500:
|
||||
parser.allow_all = True
|
||||
elif response.status_code >= 300:
|
||||
response.raise_for_status()
|
||||
else:
|
||||
text = await event_loop.run_in_executor(None, lambda: response.text)
|
||||
parser.parse(text.splitlines())
|
||||
|
||||
self.permissions[url] = parser
|
||||
return parser
|
||||
|
||||
async def can_fetch(self, url: str) -> bool:
|
||||
parser = await self.get_robots_parser(url)
|
||||
return parser.can_fetch(USER_AGENT, url)
|
||||
|
||||
async def crawl_delay(self, url: str) -> int | None:
|
||||
parser = await self.get_robots_parser(url)
|
||||
result = parser.crawl_delay(USER_AGENT)
|
||||
if isinstance(result, str):
|
||||
try:
|
||||
return int(result)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
# Process-wide singleton so every feed fetch shares one robots.txt cache.
GUARDIAN = Guardian()
|
||||
|
||||
|
||||
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
|
||||
if not meta.should_fetch(time.time()):
|
||||
return (None, meta)
|
||||
|
|
@ -309,19 +374,22 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
|
|||
# this time because it make it much much easier to figure out whether or
|
||||
# not a request has succeeded. (The straw was handling timeouts and
|
||||
# understanding whether `bozo_exception` was a transport failure or not.)
|
||||
#
|
||||
# TODO: Check robots.txt!
|
||||
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
response = await loop.run_in_executor(
|
||||
None,
|
||||
functools.partial(http.get, meta.url, headers=headers),
|
||||
)
|
||||
LOG.info(f"{meta.url} fetched with status: {response.status_code}")
|
||||
failed = response.status_code >= 400
|
||||
except Exception as e:
|
||||
LOG.error(f"{meta.url} error fetching: {e}")
|
||||
if await GUARDIAN.can_fetch(meta.url):
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
response = await loop.run_in_executor(
|
||||
None,
|
||||
functools.partial(http.get, meta.url, headers=headers),
|
||||
)
|
||||
LOG.info(f"{meta.url} fetched with status: {response.status_code}")
|
||||
failed = response.status_code >= 400
|
||||
except Exception as e:
|
||||
LOG.error(f"{meta.url} error fetching: {e}")
|
||||
failed = True
|
||||
response = None
|
||||
else:
|
||||
LOG.error(f"{meta.url} Guardian says we cannot fetch")
|
||||
failed = True
|
||||
response = None
|
||||
|
||||
|
|
@ -381,6 +449,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
if retry_delta is None:
|
||||
# See if robots.txt has any guidance for us.
|
||||
retry_delta = await GUARDIAN.crawl_delay(meta.url)
|
||||
|
||||
if retry_delta is None:
|
||||
if failed:
|
||||
retry_delta = 1 * 60 # Retry again in a minute
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue