Ignore errors fetching robots
Man, everybody has trouble with this; don't let a bad server bring you down here.
This commit is contained in:
parent
74f7146937
commit
03d420e412
1 changed file with 34 additions and 8 deletions
42
cry/feed.py
42
cry/feed.py
|
|
@@ -371,19 +371,45 @@ class Guardian:
|
||||||
|
|
||||||
async def crawl_delay(self, url: str) -> int | None:
|
async def crawl_delay(self, url: str) -> int | None:
|
||||||
"""Returns the number of seconds we should wait before fetching again."""
|
"""Returns the number of seconds we should wait before fetching again."""
|
||||||
parser = await self.get_robots_parser(url)
|
try:
|
||||||
result = parser.crawl_delay(USER_AGENT)
|
parser = await self.get_robots_parser(url)
|
||||||
if isinstance(result, str):
|
result = parser.crawl_delay(USER_AGENT)
|
||||||
try:
|
if isinstance(result, str):
|
||||||
return int(result)
|
try:
|
||||||
except ValueError:
|
return int(result)
|
||||||
return None
|
except ValueError:
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
LOG.error(f"Error fetching crawl delay for {url}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
GUARDIAN = Guardian()
|
GUARDIAN = Guardian()
|
||||||
|
|
||||||
|
|
||||||
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
|
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta]:
|
||||||
|
"""Fetch a feed from the internet. `meta` is a FeedMeta that has all the
|
||||||
|
details about what happened the last time we went to do a fetch, caching
|
||||||
|
information and whatnot.
|
||||||
|
|
||||||
|
The return value is a little funky. It returns a 2-tuple, where the first
|
||||||
|
element is one of:
|
||||||
|
|
||||||
|
- None, if we could not fetch anything
|
||||||
|
- A Feed, if we fetched something and it seemed to be a feed
|
||||||
|
- A string, if we fetched something but it was not a feed
|
||||||
|
|
||||||
|
The second element is a FeedMeta that describes the URL. It might be the
|
||||||
|
same as the FeedMeta that was provided, but it might not be:
|
||||||
|
|
||||||
|
- The etag might have been updated if the server sent us an etag
|
||||||
|
- The modified value might have been updated if the server sent us a
|
||||||
|
new value
|
||||||
|
- The URL might have been updated if we followed a permanent redirect
|
||||||
|
|
||||||
|
Just to be safe, callers should use the new FeedMeta in place of the
|
||||||
|
argument for everything after calling this function.
|
||||||
|
"""
|
||||||
if not meta.should_fetch(time.time()):
|
if not meta.should_fetch(time.time()):
|
||||||
return (None, meta)
|
return (None, meta)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue