Subscribe now searches
Rewrite feed finder again to not multi-fetch when not needed
This commit is contained in:
parent
786ced5c24
commit
cd20db0c4c
3 changed files with 462 additions and 283 deletions
62
cry/cli.py
62
cry/cli.py
|
|
@ -39,6 +39,7 @@ def cli(verbose):
|
||||||
@click.argument("url")
|
@click.argument("url")
|
||||||
def search(url):
|
def search(url):
|
||||||
"Search an URL for feeds."
|
"Search an URL for feeds."
|
||||||
|
# TODO: Rewrite to use our new one
|
||||||
feeds = feedfinder.find_feeds(url)
|
feeds = feedfinder.find_feeds(url)
|
||||||
for feed in feeds:
|
for feed in feeds:
|
||||||
click.echo(feed)
|
click.echo(feed)
|
||||||
|
|
@ -47,11 +48,40 @@ def search(url):
|
||||||
|
|
||||||
@cli.command(name="subscribe")
|
@cli.command(name="subscribe")
|
||||||
@click.argument("url")
|
@click.argument("url")
|
||||||
def subscribe(url):
|
@click.option("--literal/--no-literal", "-l/-L", default=False)
|
||||||
|
def subscribe(url, literal):
|
||||||
"Subscribe to a feed at the specified URL."
|
"Subscribe to a feed at the specified URL."
|
||||||
|
|
||||||
db = database.Database.local()
|
db = database.Database.local()
|
||||||
|
|
||||||
|
if not literal:
|
||||||
|
click.echo(f"Searching for feeds for {url} ...")
|
||||||
|
feeds = asyncio.run(feed.feed_search(url, db.origin))
|
||||||
|
if len(feeds) == 0:
|
||||||
|
click.echo(f"Unable to find a suitable feed for {url}")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if len(feeds) > 1:
|
||||||
|
# If we found more than one feed then we will try to see what the
|
||||||
|
# individual feeds are.
|
||||||
|
click.echo(f"Found {len(feeds)} feeds:")
|
||||||
|
|
||||||
|
max_title = max(len(f.title) for f in feeds)
|
||||||
|
max_url = max(len(f.meta.url) for f in feeds)
|
||||||
|
|
||||||
|
feeds.sort(key=lambda f: f.title)
|
||||||
|
|
||||||
|
for f in feeds:
|
||||||
|
click.echo(f"{f.title:{max_title}} {f.meta.url:{max_url}}")
|
||||||
|
|
||||||
|
click.echo(
|
||||||
|
"\nRun `subscribe` again with the URL of the feed you want to subscribe to."
|
||||||
|
)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
result = feeds[0]
|
||||||
|
click.echo(f"Identified {result.meta.url} as a feed for {url}")
|
||||||
|
else:
|
||||||
click.echo(f"Fetching {url} ...")
|
click.echo(f"Fetching {url} ...")
|
||||||
meta = feed.FeedMeta.from_url(url, db.origin)
|
meta = feed.FeedMeta.from_url(url, db.origin)
|
||||||
d, meta = asyncio.run(feed.fetch_feed(meta))
|
d, meta = asyncio.run(feed.fetch_feed(meta))
|
||||||
|
|
@ -59,16 +89,20 @@ def subscribe(url):
|
||||||
click.echo(f"Unable to fetch {url}")
|
click.echo(f"Unable to fetch {url}")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
# Check to see if this URL is already in the database.
|
if isinstance(d, str):
|
||||||
existing = db.load_feed(meta.url)
|
click.echo(f"{url} does not seem to be a feed")
|
||||||
if existing is not None:
|
|
||||||
click.echo(f"This feed already exists (as {meta.url})")
|
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
f = feed.Feed.from_parsed(d, meta)
|
result = d
|
||||||
db.store_feed(f)
|
|
||||||
|
|
||||||
click.echo(f"Subscribed to {meta.url}")
|
# Check to see if this URL is already in the database.
|
||||||
|
existing = db.load_feed(result.meta.url)
|
||||||
|
if existing is not None:
|
||||||
|
click.echo(f"This feed already exists (as {result.meta.url})")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
db.store_feed(result)
|
||||||
|
click.echo(f"Subscribed to {result.meta.url}")
|
||||||
|
|
||||||
|
|
||||||
@cli.command(name="import")
|
@cli.command(name="import")
|
||||||
|
|
@ -91,13 +125,16 @@ def import_opml(opml_file):
|
||||||
LOG.warn(f"Unable to fetch {url}, skipping...")
|
LOG.warn(f"Unable to fetch {url}, skipping...")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if isinstance(d, str):
|
||||||
|
click.echo(f"{url} does not seem to be a feed, skipping...")
|
||||||
|
continue
|
||||||
|
|
||||||
existing = db.load_feed(meta.url)
|
existing = db.load_feed(meta.url)
|
||||||
if existing is not None:
|
if existing is not None:
|
||||||
LOG.info(f"{url} already exists (as {meta.url})")
|
LOG.info(f"{url} already exists (as {meta.url})")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
f = feed.Feed.from_parsed(d, meta)
|
db.store_feed(d)
|
||||||
db.store_feed(f)
|
|
||||||
subscribed = subscribed + 1
|
subscribed = subscribed + 1
|
||||||
|
|
||||||
click.echo(f"Subscribed to {subscribed} new feeds")
|
click.echo(f"Subscribed to {subscribed} new feeds")
|
||||||
|
|
@ -130,10 +167,11 @@ def refresh(url):
|
||||||
if d is None:
|
if d is None:
|
||||||
# Nothing new.
|
# Nothing new.
|
||||||
db.update_meta(meta)
|
db.update_meta(meta)
|
||||||
|
elif isinstance(d, str):
|
||||||
|
click.echo(f"WARNING: {meta.url} returned a non-feed result!")
|
||||||
else:
|
else:
|
||||||
# New items, possibly!
|
# New items, possibly!
|
||||||
f = feed.Feed.from_parsed(d, meta)
|
new_count = new_count + db.store_feed(d)
|
||||||
new_count = new_count + db.store_feed(f)
|
|
||||||
|
|
||||||
click.echo(f"Fetched {new_count} new entries.")
|
click.echo(f"Fetched {new_count} new entries.")
|
||||||
|
|
||||||
|
|
|
||||||
603
cry/feed.py
603
cry/feed.py
|
|
@ -2,13 +2,15 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import functools
|
import functools
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
import typing
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import html.parser
|
import html.parser
|
||||||
import io
|
import io
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
|
import typing
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
import requests
|
import requests
|
||||||
|
|
@ -18,6 +20,8 @@ import requests.structures
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
USER_AGENT = "cry-reader v0.0"
|
||||||
|
|
||||||
FEED_STATUS_ALIVE = 0
|
FEED_STATUS_ALIVE = 0
|
||||||
FEED_STATUS_DEAD = 1
|
FEED_STATUS_DEAD = 1
|
||||||
FEED_STATUS_UNSUBSCRIBED = 2
|
FEED_STATUS_UNSUBSCRIBED = 2
|
||||||
|
|
@ -48,211 +52,19 @@ class FeedMeta:
|
||||||
origin=origin,
|
origin=origin,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def should_fetch(self, now) -> bool:
|
||||||
|
if self.status != FEED_STATUS_ALIVE:
|
||||||
|
LOG.info(f"{self.url} is dead or unsubscribed")
|
||||||
|
return False
|
||||||
|
|
||||||
def the_worst_element_hash(value) -> str:
|
if now < self.retry_after_ts:
|
||||||
"""Compute a content hash for the given feed element, to use as an ID.
|
|
||||||
|
|
||||||
The hash must be as stable as we can make it, but obviously there are things
|
|
||||||
we cannot control. If we've gotten here then the feed author has already
|
|
||||||
failed us and there's little we can do. This is already *known to be wrong.*
|
|
||||||
"""
|
|
||||||
|
|
||||||
def process(value, hash):
|
|
||||||
if isinstance(value, feedparser.FeedParserDict):
|
|
||||||
hash.update(b"dict")
|
|
||||||
keys = sorted(value.keys())
|
|
||||||
for key in keys:
|
|
||||||
hash.update(b"key::")
|
|
||||||
hash.update(key.encode("utf-8"))
|
|
||||||
hash.update(b"value::")
|
|
||||||
process(value[key], hash)
|
|
||||||
hash.update(b"tcid")
|
|
||||||
elif isinstance(value, str):
|
|
||||||
hash.update(b"str")
|
|
||||||
hash.update(value.encode("utf-8"))
|
|
||||||
hash.update(b"rts")
|
|
||||||
elif isinstance(value, list):
|
|
||||||
hash.update(b"list")
|
|
||||||
for item in value:
|
|
||||||
process(item, hash)
|
|
||||||
hash.update(b"tsil")
|
|
||||||
elif isinstance(value, tuple):
|
|
||||||
hash.update(b"tuple")
|
|
||||||
for item in value:
|
|
||||||
process(item, hash)
|
|
||||||
hash.update(b"elput")
|
|
||||||
|
|
||||||
hash = hashlib.sha256(usedforsecurity=False)
|
|
||||||
process(value, hash)
|
|
||||||
return hash.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
BLANK_TAGS = {"p", "br", "li", "div", "img"}
|
|
||||||
MULTI_SPACES = re.compile(r"\s+")
|
|
||||||
|
|
||||||
|
|
||||||
def clean_text(text: str) -> str:
|
|
||||||
"""Sometimes text is HTML and otherwise ugly. This reduces it to
|
|
||||||
something pretty to display. Strips tags, puts blank space in between
|
|
||||||
elements that should generate blank space, and then collapses blank
|
|
||||||
spaces down to one.
|
|
||||||
"""
|
|
||||||
|
|
||||||
class Cleaner(html.parser.HTMLParser):
|
|
||||||
def __init__(self, writer):
|
|
||||||
super().__init__()
|
|
||||||
self.writer = writer
|
|
||||||
|
|
||||||
def handle_data(self, data: str) -> None:
|
|
||||||
self.writer.write(data)
|
|
||||||
|
|
||||||
def handle_startendtag(
|
|
||||||
self, tag: str, attrs: list[tuple[str, str | None]]
|
|
||||||
) -> None:
|
|
||||||
del attrs
|
|
||||||
if tag.lower() in BLANK_TAGS:
|
|
||||||
self.writer.write(" ")
|
|
||||||
|
|
||||||
def handle_starttag(
|
|
||||||
self, tag: str, attrs: list[tuple[str, str | None]]
|
|
||||||
) -> None:
|
|
||||||
del attrs
|
|
||||||
if tag.lower() in BLANK_TAGS:
|
|
||||||
self.writer.write(" ")
|
|
||||||
|
|
||||||
writer = io.StringIO()
|
|
||||||
cleaner = Cleaner(writer)
|
|
||||||
cleaner.feed(text)
|
|
||||||
return MULTI_SPACES.sub(" ", writer.getvalue())
|
|
||||||
|
|
||||||
|
|
||||||
async def fetch_feed(
|
|
||||||
feed: FeedMeta,
|
|
||||||
) -> typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]:
|
|
||||||
"""Potentially fetch the feed described by `feed`, returning a parsed feed
|
|
||||||
(if possible and necessary) and an updated FeedMeta.
|
|
||||||
|
|
||||||
This function can fail to return a parsed feed under a number of
|
|
||||||
circumstances. Among them:
|
|
||||||
|
|
||||||
- It's too soon to be checking this feed again.
|
|
||||||
- The feed has been failing for a while and we've called it's dead.
|
|
||||||
- The server told us it was dead.
|
|
||||||
- We checked the server and it told us our cache was good.
|
|
||||||
- We tried to contact the server, but a networking error happened.
|
|
||||||
|
|
||||||
Regardless, the new FeedMeta has the latest state of the feed.
|
|
||||||
"""
|
|
||||||
if feed.status != FEED_STATUS_ALIVE:
|
|
||||||
LOG.info(f"{feed.url} is dead or unsubscribed")
|
|
||||||
return (None, feed)
|
|
||||||
|
|
||||||
if time.time() < feed.retry_after_ts:
|
|
||||||
retry_str = time.strftime(
|
retry_str = time.strftime(
|
||||||
"%Y-%m-%d %H:%M:%S %z", time.localtime(feed.retry_after_ts)
|
"%Y-%m-%d %H:%M:%S %z", time.localtime(self.retry_after_ts)
|
||||||
)
|
)
|
||||||
LOG.info(f"{feed.url} will not be pulled until {retry_str}")
|
LOG.info(f"{self.url} will not be pulled until {retry_str}")
|
||||||
return (None, feed)
|
return False
|
||||||
|
|
||||||
# We waffle back and forth about using feedreader's HTTP support vs
|
return True
|
||||||
# calling requests ourselves. We have decided to use requests manually at
|
|
||||||
# this time because it make it much much easier to figure out whether or
|
|
||||||
# not a request has succeeded. (The straw was handling timeouts and
|
|
||||||
# understanding whether `bozo_exception` was a transport failure or not.)
|
|
||||||
|
|
||||||
headers = {"user-agent": "cry-reader v0.0"}
|
|
||||||
if feed.etag:
|
|
||||||
headers["if-none-match"] = feed.etag
|
|
||||||
if feed.modified:
|
|
||||||
headers["if-modified-since"] = feed.modified
|
|
||||||
|
|
||||||
LOG.info(f"{feed.url} fetching...")
|
|
||||||
try:
|
|
||||||
loop = asyncio.get_running_loop()
|
|
||||||
response = await loop.run_in_executor(
|
|
||||||
None,
|
|
||||||
functools.partial(http.get, feed.url, headers=headers),
|
|
||||||
)
|
|
||||||
LOG.info(f"{feed.url} fetched with status: {response.status_code}")
|
|
||||||
failed = response.status_code >= 400
|
|
||||||
except Exception as e:
|
|
||||||
LOG.error(f"{feed.url} error fetching: {e}")
|
|
||||||
failed = True
|
|
||||||
response = None
|
|
||||||
|
|
||||||
# Now, there are a number of things to consider in the response that
|
|
||||||
# we need to consider in updating our permanent record.
|
|
||||||
|
|
||||||
if response is not None and response.status_code == 410:
|
|
||||||
# Permanently gone, really stop asking.
|
|
||||||
LOG.error(f"{feed.url} permanently gone")
|
|
||||||
return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD))
|
|
||||||
|
|
||||||
if failed and time.time() > feed.last_fetched_ts + (7 * 24 * 60 * 60):
|
|
||||||
# If we've been failing to fetch the feed for more than a week then
|
|
||||||
# consider us dead, we must be doing something wrong.
|
|
||||||
LOG.error(f"{feed.url} failed for too long, giving up")
|
|
||||||
return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD))
|
|
||||||
|
|
||||||
if response and response.is_permanent_redirect:
|
|
||||||
# Permanent redirect, update the stored URL, but mark this as a
|
|
||||||
# successful fetch.
|
|
||||||
#
|
|
||||||
# TODO: Is this actually the right URL to store? We need the last
|
|
||||||
# permanently redirected URL, not just whatever the last thing
|
|
||||||
# is... e.g. imagine a permanent followed by a temporary
|
|
||||||
# redirect, then what?
|
|
||||||
LOG.info(f"{feed.url} permanently redirected to {response.url}")
|
|
||||||
assert response.url is not None
|
|
||||||
feed = dataclasses.replace(feed, url=response.url)
|
|
||||||
|
|
||||||
# NOTE: We might still be in a failure state here. But success or fail,
|
|
||||||
# the server might have told us when to next retry, so make a note
|
|
||||||
# of it.
|
|
||||||
retry_delta = None
|
|
||||||
if response is not None:
|
|
||||||
try:
|
|
||||||
retry_delta = int(response.headers.get("retry-after", "nope"))
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
if retry_delta is None:
|
|
||||||
if failed:
|
|
||||||
retry_delta = 1 * 60 # Retry again in a minute
|
|
||||||
else:
|
|
||||||
retry_delta = 60 * 60 # 1 hour default
|
|
||||||
|
|
||||||
feed = dataclasses.replace(feed, retry_after_ts=int(time.time()) + retry_delta)
|
|
||||||
|
|
||||||
# We've done everything we can on a failure, bail if we've got an error.
|
|
||||||
if failed:
|
|
||||||
LOG.info(f"{feed.url} failed at the network level")
|
|
||||||
return (None, feed)
|
|
||||||
|
|
||||||
assert response is not None
|
|
||||||
|
|
||||||
# Record our successful fetch now, to reset the failure timer above.
|
|
||||||
feed = dataclasses.replace(feed, last_fetched_ts=int(time.time()))
|
|
||||||
|
|
||||||
# We can *still* be successful but like, no changes.
|
|
||||||
if response.status_code != 200:
|
|
||||||
LOG.info(f"{feed.url} had no changes")
|
|
||||||
return (None, feed)
|
|
||||||
|
|
||||||
feed = dataclasses.replace(
|
|
||||||
feed,
|
|
||||||
etag=response.headers.get("etag"),
|
|
||||||
modified=response.headers.get("last-modified"),
|
|
||||||
)
|
|
||||||
parsed = feedparser.parse(response.content, response_headers=response.headers)
|
|
||||||
return (parsed, feed)
|
|
||||||
|
|
||||||
|
|
||||||
async def fetch_many(
|
|
||||||
metas: list[FeedMeta],
|
|
||||||
) -> list[typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]]:
|
|
||||||
async with asyncio.TaskGroup() as group:
|
|
||||||
tasks = [group.create_task(fetch_feed(m)) for m in metas]
|
|
||||||
return [t.result() for t in tasks]
|
|
||||||
|
|
||||||
|
|
||||||
@dataclasses.dataclass(frozen=True)
|
@dataclasses.dataclass(frozen=True)
|
||||||
|
|
@ -398,6 +210,212 @@ class Feed:
|
||||||
return Feed(meta=meta, title=title, link=link, entries=entries)
|
return Feed(meta=meta, title=title, link=link, entries=entries)
|
||||||
|
|
||||||
|
|
||||||
|
def the_worst_element_hash(value) -> str:
    """Hash the content of a feed element so the digest can serve as its ID.

    This is a last resort: the hash is only as stable as the element's
    content, so any edit by the feed author produces a new ID. It is *known
    to be wrong*, but by the time we get here the feed has given us nothing
    better to work with.

    Only `feedparser.FeedParserDict`, `str`, `list` and `tuple` values
    contribute to the digest; values of any other type are skipped.
    """
    digest = hashlib.sha256(usedforsecurity=False)

    def absorb(node) -> None:
        # Every container is bracketed by an opening marker and its mirror
        # image so that differently nested structures hash differently.
        if isinstance(node, feedparser.FeedParserDict):
            digest.update(b"dict")
            # Sorted keys keep the digest independent of dict ordering.
            for name in sorted(node.keys()):
                digest.update(b"key::")
                digest.update(name.encode("utf-8"))
                digest.update(b"value::")
                absorb(node[name])
            digest.update(b"tcid")
            return

        if isinstance(node, str):
            digest.update(b"str")
            digest.update(node.encode("utf-8"))
            digest.update(b"rts")
            return

        if isinstance(node, (list, tuple)):
            if isinstance(node, list):
                opener, closer = b"list", b"tsil"
            else:
                opener, closer = b"tuple", b"elput"
            digest.update(opener)
            for member in node:
                absorb(member)
            digest.update(closer)

    absorb(value)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
BLANK_TAGS = {"p", "br", "li", "div", "img"}
|
||||||
|
MULTI_SPACES = re.compile(r"\s+")
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text: str) -> str:
|
||||||
|
"""Sometimes text is HTML and otherwise ugly. This reduces it to
|
||||||
|
something pretty to display. Strips tags, puts blank space in between
|
||||||
|
elements that should generate blank space, and then collapses blank
|
||||||
|
spaces down to one.
|
||||||
|
"""
|
||||||
|
|
||||||
|
class Cleaner(html.parser.HTMLParser):
|
||||||
|
def __init__(self, writer):
|
||||||
|
super().__init__()
|
||||||
|
self.writer = writer
|
||||||
|
|
||||||
|
def handle_data(self, data: str) -> None:
|
||||||
|
self.writer.write(data)
|
||||||
|
|
||||||
|
def handle_startendtag(
|
||||||
|
self, tag: str, attrs: list[tuple[str, str | None]]
|
||||||
|
) -> None:
|
||||||
|
del attrs
|
||||||
|
if tag.lower() in BLANK_TAGS:
|
||||||
|
self.writer.write(" ")
|
||||||
|
|
||||||
|
def handle_starttag(
|
||||||
|
self, tag: str, attrs: list[tuple[str, str | None]]
|
||||||
|
) -> None:
|
||||||
|
del attrs
|
||||||
|
if tag.lower() in BLANK_TAGS:
|
||||||
|
self.writer.write(" ")
|
||||||
|
|
||||||
|
writer = io.StringIO()
|
||||||
|
cleaner = Cleaner(writer)
|
||||||
|
cleaner.feed(text)
|
||||||
|
return MULTI_SPACES.sub(" ", writer.getvalue())
|
||||||
|
|
||||||
|
|
||||||
|
def could_be_feed_data(data: str) -> bool:
    """Guess, case-insensitively, whether `data` is a feed document.

    Anything containing an `<html` tag is rejected outright; otherwise the
    text must contain the start of an `<rss`, `<rdf` or `<feed` tag.
    """
    lowered = data.lower()
    if "<html" in lowered:
        return False
    return any(marker in lowered for marker in ("<rss", "<rdf", "<feed"))
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
    """Potentially fetch the URL described by `meta`.

    Returns a pair `(result, meta)`. The returned FeedMeta always carries the
    latest known state (retry time, cache validators, status, URL) and
    `result` is one of:

    - `None`: nothing usable was fetched. `meta.should_fetch` declined, the
      server answered 410, the request failed (network error or status >=
      400), or the response was successful but not a 200 (no changes).
    - `Feed`: the body looked like a feed and was parsed.
    - `str`: the body did not look like a feed; the raw text is handed back
      so the caller can process it further (e.g. search it for feed links).
    """
    if not meta.should_fetch(time.time()):
        return (None, meta)

    headers = {"user-agent": USER_AGENT}
    if meta.etag:
        headers["if-none-match"] = meta.etag
    if meta.modified:
        headers["if-modified-since"] = meta.modified

    # We waffle back and forth about using feedparser's HTTP support vs
    # calling requests ourselves. We have decided to use requests manually at
    # this time because it makes it much much easier to figure out whether or
    # not a request has succeeded. (The straw was handling timeouts and
    # understanding whether `bozo_exception` was a transport failure or not.)
    #
    # TODO: Check robots.txt!

    try:
        # `http.get` is blocking, so run it on the default executor.
        # NOTE(review): `http` is defined outside this block — presumably a
        # requests session; confirm.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            functools.partial(http.get, meta.url, headers=headers),
        )
        LOG.info(f"{meta.url} fetched with status: {response.status_code}")
        failed = response.status_code >= 400
    except Exception as e:
        # Deliberately broad: any transport-level problem counts as a failed
        # fetch rather than a crash.
        LOG.error(f"{meta.url} error fetching: {e}")
        failed = True
        response = None

    # Now, there are a number of things to consider in the response that
    # we need to consider in updating our permanent record.

    if response is not None and response.status_code == 410:
        # Permanently gone, really stop asking.
        LOG.error(f"{meta.url} permanently gone")
        return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))

    if failed and time.time() > meta.last_fetched_ts + (7 * 24 * 60 * 60):
        # If we've been failing to fetch the feed for more than a week then
        # consider us dead, we must be doing something wrong.
        LOG.error(f"{meta.url} failed for too long, giving up")
        return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))

    if response and response.is_permanent_redirect:
        # Permanent redirect, update the stored URL, but mark this as a
        # successful fetch.
        #
        # TODO: Is this actually the right URL to store? We need the last
        #       permanently redirected URL, not just whatever the last thing
        #       is... e.g. imagine a permanent followed by a temporary
        #       redirect, then what?
        LOG.info(f"{meta.url} permanently redirected to {response.url}")
        assert response.url is not None
        meta = dataclasses.replace(meta, url=response.url)

    # TODO: Handle that bogus non-HTTP redirect that feedfinder uses.

    # NOTE: We might still be in a failure state here. But success or fail,
    #       the server might have told us when to next retry, so make a note
    #       of it. The server might also have given us updated caching
    #       information (even on failure!) and so let's also make a note of that.
    retry_delta = None
    etag = meta.etag
    modified = meta.modified
    if response is not None:
        etag = response.headers.get("etag", meta.etag)
        modified = response.headers.get("last-modified", meta.modified)

        try:
            retry_delta = int(response.headers.get("retry-after", "nope"))
        except (TypeError, ValueError):
            # Header missing (the "nope" placeholder) or not a plain number
            # of seconds; fall through to the defaults below.
            retry_delta = None
        if retry_delta is not None and retry_delta < 0:
            # A negative delay is nonsense and would schedule the retry in
            # the past; treat it like a missing header.
            retry_delta = None

    if retry_delta is None:
        if failed:
            retry_delta = 1 * 60  # Retry again in a minute
        else:
            retry_delta = 60 * 60  # 1 hour default

    meta = dataclasses.replace(
        meta,
        retry_after_ts=int(time.time()) + retry_delta,
        etag=etag,
        modified=modified,
    )

    # We've done everything we can on a failure, bail if we've got an error.
    if failed:
        LOG.info(f"{meta.url} failed at the network level")
        return (None, meta)

    assert response is not None

    # Record our successful fetch now, to reset the failure timer above.
    meta = dataclasses.replace(meta, last_fetched_ts=int(time.time()))

    # We can *still* be successful but like, no changes.
    if response.status_code != 200:
        LOG.info(f"{meta.url} had no changes")
        return (None, meta)

    # Does this seem to be a feed? Or not?
    if could_be_feed_data(response.text):
        parsed = feedparser.parse(response.content, response_headers=response.headers)
        return (Feed.from_parsed(parsed, meta), meta)

    # No this is not a feed, just return the content out for further
    # processing.
    return (response.text, meta)
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_many(
    metas: list[FeedMeta],
) -> list[typing.Tuple[Feed | str | None, FeedMeta]]:
    """Run `fetch_feed` for every meta concurrently.

    Returns the `fetch_feed` results in the same order as `metas`.
    """
    pending = []
    async with asyncio.TaskGroup() as group:
        for meta in metas:
            pending.append(group.create_task(fetch_feed(meta)))

    # Leaving the TaskGroup block waits for every task, so all results are
    # available here.
    return [job.result() for job in pending]
|
||||||
|
|
||||||
|
|
||||||
def merge_feeds(a: Feed, b: Feed) -> Feed:
|
def merge_feeds(a: Feed, b: Feed) -> Feed:
|
||||||
"""Merge two known feeds. There are two conflict resolution policies:
|
"""Merge two known feeds. There are two conflict resolution policies:
|
||||||
|
|
||||||
|
|
@ -436,3 +454,180 @@ def sort_key(f: Feed) -> int:
|
||||||
if len(f.entries) > 0:
|
if len(f.entries) > 0:
|
||||||
return max(e.inserted_at for e in f.entries)
|
return max(e.inserted_at for e in f.entries)
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
|
class FeedSearchParser(html.parser.HTMLParser):
    """An HTML parser that tries to find links to feeds.

    After `feed()`-ing a page, two lists of absolute URLs are available:

    - `link_links`: hrefs of `<link rel="alternate">` tags whose `type` is
      one of `FEED_TYPES` (feed autodiscovery).
    - `a_links`: hrefs of every `<a>` tag, for heuristic searching.

    Relative hrefs are resolved against `baseuri`, which a `<base href>` tag
    in the document overrides for the links that follow it.
    """

    # MIME types accepted on a <link rel="alternate"> as announcing a feed.
    FEED_TYPES = (
        "application/rss+xml",
        "text/xml",
        "application/atom+xml",
        "application/x.atom+xml",
        "application/x-atom+xml",
    )

    link_links: list[str]
    a_links: list[str]

    def __init__(self, baseuri):
        super().__init__()
        self.baseuri = baseuri
        self.link_links = []
        self.a_links = []

    def handle_starttag(self, tag, attrs):
        # HTMLParser hands over (name, value) pairs; value is None for an
        # attribute written without one (e.g. `<link type>`).
        attrs = dict(attrs)
        if tag == "base":
            self.do_base(attrs)
        elif tag == "link":
            self.do_link(attrs)
        elif tag == "a":
            self.do_a(attrs)

    def do_base(self, attrs):
        """Adopt the href of a `<base>` tag as the new base URI."""
        base = attrs.get("href")
        if base is not None:
            self.baseuri = base

    def do_link(self, attrs):
        """Record a `<link>` tag if it advertises an alternate feed."""
        rel = attrs.get("rel")
        if rel is None:
            return

        # rel is a space-separated, case-insensitive token list.
        if "alternate" not in rel.lower().split():
            return

        # `or ""` also covers a valueless `type` attribute, which is present
        # in attrs with the value None and would otherwise crash `.lower()`.
        link_type = attrs.get("type") or ""
        if link_type.lower() not in self.FEED_TYPES:
            return

        href = attrs.get("href")
        if href is None:
            return

        self.link_links.append(urllib.parse.urljoin(self.baseuri, href))

    def do_a(self, attrs):
        """Record the absolute target of an `<a>` tag that has an href."""
        href = attrs.get("href")
        if href is None:
            return

        self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
|
||||||
|
|
||||||
|
|
||||||
|
def massage_url(uri: str) -> str:
    """Turn user-supplied text into an http(s) URL that can be fetched.

    Surrounding whitespace is stripped, a `feed://` prefix is rewritten to
    `http://`, and text without an `http://` or `https://` scheme gets
    `http://` prepended. Schemes are matched case-insensitively, so
    `HTTP://host` is returned as-is rather than being prefixed again.
    """
    uri = uri.strip()
    if uri.lower().startswith("feed://"):
        uri = "http://" + uri[len("feed://") :]
    if uri.lower().startswith(("http://", "https://")):
        return uri
    return f"http://{uri}"
|
||||||
|
|
||||||
|
|
||||||
|
def classify_links(links, baseuri) -> typing.Tuple[list[str], list[str]]:
    """Partition `links` into `(local, remote)`.

    A link is local when it starts with `baseuri`, compared
    case-insensitively; every other link is remote. The original order is
    kept within each list.
    """
    prefix = baseuri.lower()

    same_site: list[str] = []
    elsewhere: list[str] = []
    for candidate in links:
        bucket = same_site if candidate.lower().startswith(prefix) else elsewhere
        bucket.append(candidate)

    return same_site, elsewhere
|
||||||
|
|
||||||
|
|
||||||
|
def is_feed_link(link: str) -> bool:
    """Tell whether `link` ends in a feed-like extension.

    The check is case-insensitive and accepts `.rss`, `.rdf`, `.xml` and
    `.atom`.
    """
    return link.lower().endswith((".rss", ".rdf", ".xml", ".atom"))
|
||||||
|
|
||||||
|
|
||||||
|
def is_XML_related_link(link: str) -> bool:
    """Tell whether `link` mentions rss, rdf, xml or atom anywhere.

    The check is case-insensitive and looser than `is_feed_link`, which
    only looks at the extension.
    """
    lowered = link.lower()
    return any(hint in lowered for hint in ("rss", "rdf", "xml", "atom"))
|
||||||
|
|
||||||
|
|
||||||
|
async def check_feed(url: str, origin: str) -> Feed | None:
    """Fetch `url` and return the parsed Feed if it turns out to be one.

    Every other outcome of `fetch_feed` (nothing fetched, or content that
    is not a feed) yields None.
    """
    fetched, _ = await fetch_feed(FeedMeta.from_url(url, origin))
    return fetched if isinstance(fetched, Feed) else None
|
||||||
|
|
||||||
|
|
||||||
|
async def check_links(links: typing.Iterable[str], origin: str) -> list[Feed]:
    """Check every link concurrently and keep the ones that are feeds.

    The feeds come back in the same order as their links. The list is empty
    when no link could be fetched or none of them held a feed.
    """
    async with asyncio.TaskGroup() as group:
        pending = [group.create_task(check_feed(link, origin)) for link in links]

    # All tasks have finished once the TaskGroup block is left.
    return [found for job in pending if (found := job.result()) is not None]
|
||||||
|
|
||||||
|
|
||||||
|
async def feed_search(uri: str, origin: str) -> list[Feed]:
    """Find the feed or feeds reachable from `uri`.

    If `uri` is itself a feed, that feed is the only result. Otherwise the
    page is searched in stages, and the first stage that yields any feed
    wins:

    1. `<link rel="alternate">` autodiscovery tags.
    2. `<a>` links on the same site with a feed-like extension, then those
       that merely mention rss/rdf/xml/atom.
    3. The same two checks for `<a>` links to other sites.
    4. Well-known feed file names relative to the page URL.

    Each candidate URL is fetched at most once per search. Duplicate hrefs
    would otherwise come back as duplicate feeds, and a link that already
    failed one stage would be fetched again by the next.

    Returns an empty list when the page cannot be fetched or no feed is
    found.
    """
    meta = FeedMeta.from_url(massage_url(uri), origin)
    result, meta = await fetch_feed(meta)
    if result is None:
        return []
    if isinstance(result, Feed):
        return [result]

    # OK it was not a feed, let's try all our searching games.
    parser = FeedSearchParser(meta.url)
    parser.feed(result)

    # URLs already checked during this search. The page itself is known not
    # to be a feed, so a self-link is never worth another fetch.
    attempted: set[str] = {meta.url}

    async def check_new(candidates: typing.Iterable[str]) -> list[Feed]:
        # dict.fromkeys drops duplicates while keeping first-seen order.
        fresh = [c for c in dict.fromkeys(candidates) if c not in attempted]
        attempted.update(fresh)
        return await check_links(fresh, origin)

    LOG.debug("Checking links...")
    outfeeds = await check_new(parser.link_links)
    if outfeeds:
        return outfeeds

    LOG.debug("No links, checking A tags...")
    local_links, remote_links = classify_links(parser.a_links, meta.url)
    # Same-site links before off-site ones; strict extension match before
    # the looser substring match.
    for pool in (local_links, remote_links):
        for looks_promising in (is_feed_link, is_XML_related_link):
            outfeeds = await check_new(filter(looks_promising, pool))
            if outfeeds:
                return outfeeds

    LOG.debug("no A tags, guessing")
    suffixes = [  # filenames used by popular software:
        "atom.xml",  # blogger, TypePad
        "index.atom",  # MT, apparently
        "index.rdf",  # MT
        "rss.xml",  # Dave Winer/Manila
        "index.xml",  # MT
        "index.rss",  # Slash
    ]
    return await check_new(urllib.parse.urljoin(meta.url, x) for x in suffixes)
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,10 @@
|
||||||
|
|
||||||
Based on http://www.aaronsw.com/2002/feedfinder/
|
Based on http://www.aaronsw.com/2002/feedfinder/
|
||||||
|
|
||||||
Kinda rewritten by John Doty for the Python3 and the cry aggregator, but the
|
Rewritten by John Doty for Python 3 and the cry aggregator, but the basic
|
||||||
basic frame remains.
|
frame remains. The big thing *this* does is also return the FeedMeta when it
|
||||||
|
has found feeds, instead of just URLs. This is more useful for the rest of
|
||||||
|
processing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -17,6 +19,7 @@ import urllib.robotparser
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from . import feed
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -125,7 +128,7 @@ class HtmlBasedParser(html.parser.HTMLParser):
|
||||||
self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
|
self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
|
||||||
|
|
||||||
|
|
||||||
def makeFullURI(uri):
|
def makeFullURI(uri: str) -> str:
|
||||||
uri = uri.strip()
|
uri = uri.strip()
|
||||||
if uri.startswith("feed://"):
|
if uri.startswith("feed://"):
|
||||||
uri = "http://" + uri.split("feed://", 1).pop()
|
uri = "http://" + uri.split("feed://", 1).pop()
|
||||||
|
|
@ -204,8 +207,6 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
|
||||||
"""Find feeds for the given URI.
|
"""Find feeds for the given URI.
|
||||||
|
|
||||||
How it works:
|
How it works:
|
||||||
0.
|
|
||||||
|
|
||||||
1. If the URI points to a feed, it is simply returned; otherwise
|
1. If the URI points to a feed, it is simply returned; otherwise
|
||||||
the page is downloaded and the real fun begins.
|
the page is downloaded and the real fun begins.
|
||||||
|
|
||||||
|
|
@ -293,58 +294,3 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
|
||||||
)
|
)
|
||||||
|
|
||||||
return list(set(outfeeds))
|
return list(set(outfeeds))
|
||||||
|
|
||||||
|
|
||||||
##### test harness ######
|
|
||||||
|
|
||||||
|
|
||||||
def test():
|
|
||||||
uri = "http://diveintomark.org/tests/client/autodiscovery/html4-001.html"
|
|
||||||
failed = []
|
|
||||||
count = 0
|
|
||||||
while 1:
|
|
||||||
data = _gatekeeper.get(uri)
|
|
||||||
if data.find("Atom autodiscovery test") == -1:
|
|
||||||
break
|
|
||||||
sys.stdout.write(".")
|
|
||||||
sys.stdout.flush()
|
|
||||||
count += 1
|
|
||||||
links = getLinks(data, uri)
|
|
||||||
if not links:
|
|
||||||
print(f"\n*** FAILED *** {uri} could not find link")
|
|
||||||
failed.append(uri)
|
|
||||||
elif len(links) > 1:
|
|
||||||
print(f"\n*** FAILED *** {uri} found too many links")
|
|
||||||
failed.append(uri)
|
|
||||||
else:
|
|
||||||
atomdata = requests.get(links[0]).text
|
|
||||||
if atomdata.find('<link rel="alternate"') == -1:
|
|
||||||
print(f"\n*** FAILED *** {uri} retrieved something that is not a feed")
|
|
||||||
failed.append(uri)
|
|
||||||
else:
|
|
||||||
backlink = atomdata.split('href="').pop().split('"')[0]
|
|
||||||
if backlink != uri:
|
|
||||||
print(f"\n*** FAILED *** {uri} retrieved wrong feed")
|
|
||||||
failed.append(uri)
|
|
||||||
if data.find('<link rel="next" href="') == -1:
|
|
||||||
break
|
|
||||||
uri = urllib.parse.urljoin(
|
|
||||||
uri, data.split('<link rel="next" href="').pop().split('"')[0]
|
|
||||||
)
|
|
||||||
print()
|
|
||||||
print(f"{count} tests executed, {len(failed)} failed")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
args = sys.argv[1:]
|
|
||||||
if args and args[0] == "--debug":
|
|
||||||
_debug = 1
|
|
||||||
args.pop(0)
|
|
||||||
if args:
|
|
||||||
uri = args[0]
|
|
||||||
else:
|
|
||||||
uri = "http://diveintomark.org/"
|
|
||||||
if uri == "test":
|
|
||||||
test()
|
|
||||||
else:
|
|
||||||
print("\n".join(getFeeds(uri)))
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue