Check robots.txt for guidance and permission when fetching
It's only polite.
This commit is contained in:
parent
eab6cf609d
commit
bf41f70209
1 changed files with 84 additions and 12 deletions
96
cry/feed.py
96
cry/feed.py
|
|
@ -10,6 +10,7 @@ import re
|
||||||
import time
|
import time
|
||||||
import typing
|
import typing
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
import urllib.robotparser
|
||||||
|
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
|
|
@ -294,6 +295,70 @@ def could_be_feed_data(data: str) -> bool:
|
||||||
return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
|
return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
|
||||||
|
|
||||||
|
|
||||||
|
class Guardian:
    """Fetches, caches, and consults per-site robots.txt policies.

    One instance is shared by all fetch tasks. For each site we store either
    a parsed ``RobotFileParser`` (ready to consult) or an ``asyncio.Lock``
    held by the task currently fetching that site's robots.txt, so concurrent
    callers wait for the in-flight fetch instead of duplicating it.
    """

    # Keyed by the absolute robots.txt URL for a site. Value is a Lock while
    # a fetch is in flight, then replaced by the parsed RobotFileParser.
    permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]

    def __init__(self):
        self.permissions = {}

    async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
        """Return a robots.txt parser for the site serving ``url``.

        Fetches and caches ``/robots.txt`` on first use; subsequent calls for
        the same site return the cached parser. Raises ``requests``' HTTP
        error (via ``raise_for_status``) on server errors, leaving the lock
        in place so a later caller retries the fetch.
        """
        url = urllib.parse.urljoin(url, "/robots.txt")
        parser = self.permissions.get(url)
        if parser is None:
            # First request for this site: install a lock so concurrent
            # callers block below instead of fetching redundantly.
            parser = asyncio.Lock()
            self.permissions[url] = parser

        if isinstance(parser, urllib.robotparser.RobotFileParser):
            return parser

        assert isinstance(parser, asyncio.Lock)
        async with parser:
            # Re-check under the lock: another task may have completed the
            # fetch while we were waiting.
            parser = self.permissions.get(url)
            if isinstance(parser, urllib.robotparser.RobotFileParser):
                return parser

            headers = {"user-agent": USER_AGENT}
            event_loop = asyncio.get_running_loop()
            # requests is synchronous; run it in the default executor so the
            # event loop is not blocked during the HTTP round trip.
            response = await event_loop.run_in_executor(
                None,
                functools.partial(
                    requests.get,
                    url,
                    headers=headers,
                ),
            )

            parser = urllib.robotparser.RobotFileParser(url)
            if response.status_code in (401, 403):
                # robots.txt itself is access-restricted: treat the whole
                # site as off limits.
                parser.disallow_all = True
            elif 400 <= response.status_code < 500:
                # robots.txt absent (or other client error): by convention
                # everything is allowed.
                parser.allow_all = True
            elif response.status_code >= 300:
                # 5xx raises here so the exception propagates and the lock
                # stays cached for a retry by the next caller.
                # NOTE(review): a 3xx reaching this branch (redirects are
                # normally followed by requests) would NOT raise, and the
                # unparsed parser stored below disallows everything —
                # confirm that is the intended behavior.
                response.raise_for_status()
            else:
                # Accessing .text decodes the body; keep that work off the
                # event loop as well.
                text = await event_loop.run_in_executor(None, lambda: response.text)
                parser.parse(text.splitlines())

            self.permissions[url] = parser
            return parser

    async def can_fetch(self, url: str) -> bool:
        """Return True when robots.txt permits USER_AGENT to fetch ``url``."""
        parser = await self.get_robots_parser(url)
        return parser.can_fetch(USER_AGENT, url)

    async def crawl_delay(self, url: str) -> int | None:
        """Return the site's Crawl-delay for USER_AGENT in whole seconds.

        Returns None when no delay is specified or the value is unusable.

        Bug fix: the original only converted ``str`` results, but
        ``RobotFileParser.crawl_delay`` returns the delay as an ``int``
        (parsed with ``int()``), so numeric delays fell through and were
        silently returned as None.
        """
        parser = await self.get_robots_parser(url)
        result = parser.crawl_delay(USER_AGENT)
        if result is None:
            return None
        try:
            # Normalize int/str (and any other numeric form) to int.
            return int(result)
        except (ValueError, TypeError):
            return None
||||||
|
# Module-level singleton: all fetch tasks share one robots.txt cache.
GUARDIAN = Guardian()
|
||||||
|
|
||||||
|
|
||||||
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
|
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
|
||||||
if not meta.should_fetch(time.time()):
|
if not meta.should_fetch(time.time()):
|
||||||
return (None, meta)
|
return (None, meta)
|
||||||
|
|
@ -309,19 +374,22 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
|
||||||
# this time because it makes it much much easier to figure out whether or
|
# this time because it makes it much much easier to figure out whether or
|
||||||
# not a request has succeeded. (The straw was handling timeouts and
|
# not a request has succeeded. (The straw was handling timeouts and
|
||||||
# understanding whether `bozo_exception` was a transport failure or not.)
|
# understanding whether `bozo_exception` was a transport failure or not.)
|
||||||
#
|
|
||||||
# TODO: Check robots.txt!
|
|
||||||
|
|
||||||
try:
|
if await GUARDIAN.can_fetch(meta.url):
|
||||||
loop = asyncio.get_running_loop()
|
try:
|
||||||
response = await loop.run_in_executor(
|
loop = asyncio.get_running_loop()
|
||||||
None,
|
response = await loop.run_in_executor(
|
||||||
functools.partial(http.get, meta.url, headers=headers),
|
None,
|
||||||
)
|
functools.partial(http.get, meta.url, headers=headers),
|
||||||
LOG.info(f"{meta.url} fetched with status: {response.status_code}")
|
)
|
||||||
failed = response.status_code >= 400
|
LOG.info(f"{meta.url} fetched with status: {response.status_code}")
|
||||||
except Exception as e:
|
failed = response.status_code >= 400
|
||||||
LOG.error(f"{meta.url} error fetching: {e}")
|
except Exception as e:
|
||||||
|
LOG.error(f"{meta.url} error fetching: {e}")
|
||||||
|
failed = True
|
||||||
|
response = None
|
||||||
|
else:
|
||||||
|
LOG.error(f"{meta.url} Guardian says we cannot fetch")
|
||||||
failed = True
|
failed = True
|
||||||
response = None
|
response = None
|
||||||
|
|
||||||
|
|
@ -381,6 +449,10 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
if retry_delta is None:
|
||||||
|
# See if robots.txt has any guidance for us.
|
||||||
|
retry_delta = await GUARDIAN.crawl_delay(meta.url)
|
||||||
|
|
||||||
if retry_delta is None:
|
if retry_delta is None:
|
||||||
if failed:
|
if failed:
|
||||||
retry_delta = 1 * 60 # Retry again in a minute
|
retry_delta = 1 * 60 # Retry again in a minute
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue