Ignore errors fetching robots

Man, everybody has trouble with this; don't let a bad server bring you
down here.
This commit is contained in:
John Doty 2024-08-18 17:12:12 -07:00
parent 74f7146937
commit 03d420e412

View file

@ -371,19 +371,45 @@ class Guardian:
async def crawl_delay(self, url: str) -> int | None: async def crawl_delay(self, url: str) -> int | None:
"""Returns the number of seconds we should wait before fetching again.""" """Returns the number of seconds we should wait before fetching again."""
parser = await self.get_robots_parser(url) try:
result = parser.crawl_delay(USER_AGENT) parser = await self.get_robots_parser(url)
if isinstance(result, str): result = parser.crawl_delay(USER_AGENT)
try: if isinstance(result, str):
return int(result) try:
except ValueError: return int(result)
return None except ValueError:
return None
except Exception as e:
LOG.error(f"Error fetching crawl delay for {url}: {e}")
return None
GUARDIAN = Guardian() GUARDIAN = Guardian()
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]: async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta]:
"""Fetch a feed from the internet. `meta` is a FeedMeta that has all the
details about what happened the last time we went to do a fetch, caching
information and whatnot.
The return value is a little funky. It returns a 2-tuple, where the first
element is one of:
- None, if we could not fetch anything
- A Feed, if we fetched something and it seemed to be a feed
- A string, if we fetched something but it was not a feed
The second element is a FeedMeta that describes the URL. It might be the
same as the FeedMeta that was provided, but it might not be:
- The etag might have been updated if the server sent us an etag
- The modified value might have been updated if the server sent us a
new value
- The URL might have been updated if we followed a permanent redirect
Just to be safe, callers should use the new FeedMeta in place of the
argument for everything after calling this function.
"""
if not meta.should_fetch(time.time()): if not meta.should_fetch(time.time()):
return (None, meta) return (None, meta)