From 03d420e412f11e50127177d69fd2fcde3f1a7059 Mon Sep 17 00:00:00 2001 From: John Doty Date: Sun, 18 Aug 2024 17:12:12 -0700 Subject: [PATCH] Ignore errors fetching robots Man, everybody has trouble with this; don't let a bad server bring you down here. --- cry/feed.py | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/cry/feed.py b/cry/feed.py index 2746315..0bbceea 100644 --- a/cry/feed.py +++ b/cry/feed.py @@ -371,19 +371,45 @@ class Guardian: async def crawl_delay(self, url: str) -> int | None: """Returns the number of seconds we should wait before fetching again.""" - parser = await self.get_robots_parser(url) - result = parser.crawl_delay(USER_AGENT) - if isinstance(result, str): - try: - return int(result) - except ValueError: - return None + try: + parser = await self.get_robots_parser(url) + result = parser.crawl_delay(USER_AGENT) + if isinstance(result, str): + try: + return int(result) + except ValueError: + return None + except Exception as e: + LOG.error(f"Error fetching crawl delay for {url}: {e}") + return None GUARDIAN = Guardian() -async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]: +async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta]: + """Fetch a feed from the internet. `meta` is a FeedMeta that has all the + details about what happened the last time we went to do a fetch, caching + information and whatnot. + + The return value is a little funky. It returns a 2-tuple, where the first + element is one of: + + - None, if we could not fetch anything + - A Feed, if we fetched something and it seemed to be a feed + - A string, if we fetched something but it was not a feed + + The second element is a FeedMeta that describes the URL. 
It might be the + same as the FeedMeta that was provided, but it might not be: + + - The etag might have been updated if the server sent us an etag + - The modified value might have been updated if the server sent us a + new value + - The URL might have been updated if we followed a permanent redirect + + Just to be safe, callers should use the new FeedMeta in place of the + argument for everything after calling this function. + """ if not meta.should_fetch(time.time()): return (None, meta)