Ignore errors fetching robots
Man, everybody has trouble with this; don't let a bad server bring you down here.
This commit is contained in:
parent
74f7146937
commit
03d420e412
1 changed file with 34 additions and 8 deletions
42
cry/feed.py
42
cry/feed.py
|
|
@ -371,19 +371,45 @@ class Guardian:
|
|||
|
||||
async def crawl_delay(self, url: str) -> int | None:
|
||||
"""Returns the number of seconds we should wait before fetching again."""
|
||||
parser = await self.get_robots_parser(url)
|
||||
result = parser.crawl_delay(USER_AGENT)
|
||||
if isinstance(result, str):
|
||||
try:
|
||||
return int(result)
|
||||
except ValueError:
|
||||
return None
|
||||
try:
|
||||
parser = await self.get_robots_parser(url)
|
||||
result = parser.crawl_delay(USER_AGENT)
|
||||
if isinstance(result, str):
|
||||
try:
|
||||
return int(result)
|
||||
except ValueError:
|
||||
return None
|
||||
except Exception as e:
|
||||
LOG.error(f"Error fetching crawl delay for {url}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
# A single Guardian instance bound to a constant-style name.
# NOTE(review): presumably shared by the fetch code that follows — confirm.
GUARDIAN = Guardian()
|
||||
|
||||
|
||||
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
|
||||
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta]:
|
||||
"""Fetch a feed from the internet. `meta` is a FeedMeta that has all the
|
||||
details about what happened the last time we went to do a fetch, caching
|
||||
information and whatnot.
|
||||
|
||||
The return value is a little funky. It returns a 2-tuple, where the first
|
||||
element is one of:
|
||||
|
||||
- None, if we could not fetch anything
|
||||
- A Feed, if we fetched something and it seemed to be a feed
|
||||
- A string, if we fetched something but it was not a feed
|
||||
|
||||
The second element is a FeedMeta that describes the URL. It might be the
|
||||
same as the FeedMeta that was provided, but it might not be:
|
||||
|
||||
- The etag might have been updated if the server sent us an etag
|
||||
- The modified value might have been updated if the server sent us a
|
||||
new value
|
||||
- The URL might have been updated if we followed a permanent redirect
|
||||
|
||||
Just to be safe, callers should use the new FeedMeta in place of the
|
||||
argument for everything after calling this function.
|
||||
"""
|
||||
if not meta.should_fetch(time.time()):
|
||||
return (None, meta)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue