Subscribe now searches
Rewrite feed finder again to not multi-fetch when not needed
This commit is contained in:
parent
786ced5c24
commit
cd20db0c4c
3 changed files with 462 additions and 283 deletions
72
cry/cli.py
72
cry/cli.py
|
|
@ -39,6 +39,7 @@ def cli(verbose):
|
|||
@click.argument("url")
|
||||
def search(url):
|
||||
"Search an URL for feeds."
|
||||
# TODO: Rewrite to use our new one
|
||||
feeds = feedfinder.find_feeds(url)
|
||||
for feed in feeds:
|
||||
click.echo(feed)
|
||||
|
|
@ -47,28 +48,61 @@ def search(url):
|
|||
|
||||
@cli.command(name="subscribe")
|
||||
@click.argument("url")
|
||||
def subscribe(url):
|
||||
@click.option("--literal/--no-literal", "-l/-L", default=False)
|
||||
def subscribe(url, literal):
|
||||
"Subscribe to a feed at the specified URL."
|
||||
|
||||
db = database.Database.local()
|
||||
|
||||
click.echo(f"Fetching {url} ...")
|
||||
meta = feed.FeedMeta.from_url(url, db.origin)
|
||||
d, meta = asyncio.run(feed.fetch_feed(meta))
|
||||
if d is None:
|
||||
click.echo(f"Unable to fetch {url}")
|
||||
return 1
|
||||
if not literal:
|
||||
click.echo(f"Searching for feeds for {url} ...")
|
||||
feeds = asyncio.run(feed.feed_search(url, db.origin))
|
||||
if len(feeds) == 0:
|
||||
click.echo(f"Unable to find a suitable feed for {url}")
|
||||
return 1
|
||||
|
||||
if len(feeds) > 1:
|
||||
# If we found more than one feed then we will try to see what the
|
||||
# individual feeds are.
|
||||
click.echo(f"Found {len(feeds)} feeds:")
|
||||
|
||||
max_title = max(len(f.title) for f in feeds)
|
||||
max_url = max(len(f.meta.url) for f in feeds)
|
||||
|
||||
feeds.sort(key=lambda f: f.title)
|
||||
|
||||
for f in feeds:
|
||||
click.echo(f"{f.title:{max_title}} {f.meta.url:{max_url}}")
|
||||
|
||||
click.echo(
|
||||
"\nRun `subscribe` again with the URL of the feed you want to subscribe to."
|
||||
)
|
||||
return 1
|
||||
|
||||
result = feeds[0]
|
||||
click.echo(f"Identified {result.meta.url} as a feed for {url}")
|
||||
else:
|
||||
click.echo(f"Fetching {url} ...")
|
||||
meta = feed.FeedMeta.from_url(url, db.origin)
|
||||
d, meta = asyncio.run(feed.fetch_feed(meta))
|
||||
if d is None:
|
||||
click.echo(f"Unable to fetch {url}")
|
||||
return 1
|
||||
|
||||
if isinstance(d, str):
|
||||
click.echo(f"{url} does not seem to be a feed")
|
||||
return 1
|
||||
|
||||
result = d
|
||||
|
||||
# Check to see if this URL is already in the database.
|
||||
existing = db.load_feed(meta.url)
|
||||
existing = db.load_feed(result.meta.url)
|
||||
if existing is not None:
|
||||
click.echo(f"This feed already exists (as {meta.url})")
|
||||
click.echo(f"This feed already exists (as {result.meta.url})")
|
||||
return 1
|
||||
|
||||
f = feed.Feed.from_parsed(d, meta)
|
||||
db.store_feed(f)
|
||||
|
||||
click.echo(f"Subscribed to {meta.url}")
|
||||
db.store_feed(result)
|
||||
click.echo(f"Subscribed to {result.meta.url}")
|
||||
|
||||
|
||||
@cli.command(name="import")
|
||||
|
|
@ -91,13 +125,16 @@ def import_opml(opml_file):
|
|||
LOG.warn(f"Unable to fetch {url}, skipping...")
|
||||
continue
|
||||
|
||||
if isinstance(d, str):
|
||||
click.echo(f"{url} does not seem to be a feed, skipping...")
|
||||
continue
|
||||
|
||||
existing = db.load_feed(meta.url)
|
||||
if existing is not None:
|
||||
LOG.info(f"{url} already exists (as {meta.url})")
|
||||
continue
|
||||
|
||||
f = feed.Feed.from_parsed(d, meta)
|
||||
db.store_feed(f)
|
||||
db.store_feed(d)
|
||||
subscribed = subscribed + 1
|
||||
|
||||
click.echo(f"Subscribed to {subscribed} new feeds")
|
||||
|
|
@ -130,10 +167,11 @@ def refresh(url):
|
|||
if d is None:
|
||||
# Nothing new.
|
||||
db.update_meta(meta)
|
||||
elif isinstance(d, str):
|
||||
click.echo(f"WARNING: {meta.url} returned a non-feed result!")
|
||||
else:
|
||||
# New items, possibly!
|
||||
f = feed.Feed.from_parsed(d, meta)
|
||||
new_count = new_count + db.store_feed(f)
|
||||
new_count = new_count + db.store_feed(d)
|
||||
|
||||
click.echo(f"Fetched {new_count} new entries.")
|
||||
|
||||
|
|
|
|||
607
cry/feed.py
607
cry/feed.py
|
|
@ -2,13 +2,15 @@
|
|||
import asyncio
|
||||
import dataclasses
|
||||
import functools
|
||||
import logging
|
||||
import time
|
||||
import typing
|
||||
import hashlib
|
||||
import html.parser
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import typing
|
||||
import urllib.parse
|
||||
|
||||
|
||||
import feedparser
|
||||
import requests
|
||||
|
|
@ -18,6 +20,8 @@ import requests.structures
|
|||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
USER_AGENT = "cry-reader v0.0"
|
||||
|
||||
FEED_STATUS_ALIVE = 0
|
||||
FEED_STATUS_DEAD = 1
|
||||
FEED_STATUS_UNSUBSCRIBED = 2
|
||||
|
|
@ -48,211 +52,19 @@ class FeedMeta:
|
|||
origin=origin,
|
||||
)
|
||||
|
||||
def should_fetch(self, now) -> bool:
|
||||
if self.status != FEED_STATUS_ALIVE:
|
||||
LOG.info(f"{self.url} is dead or unsubscribed")
|
||||
return False
|
||||
|
||||
def the_worst_element_hash(value) -> str:
|
||||
"""Compute a content hash for the given feed element, to use as an ID.
|
||||
if now < self.retry_after_ts:
|
||||
retry_str = time.strftime(
|
||||
"%Y-%m-%d %H:%M:%S %z", time.localtime(self.retry_after_ts)
|
||||
)
|
||||
LOG.info(f"{self.url} will not be pulled until {retry_str}")
|
||||
return False
|
||||
|
||||
The hash must be as stable as we can make it, but obviously there are things
|
||||
we cannot control. If we've gotten here then the feed author has already
|
||||
failed us and there's little we can do. This is already *known to be wrong.*
|
||||
"""
|
||||
|
||||
def process(value, hash):
|
||||
if isinstance(value, feedparser.FeedParserDict):
|
||||
hash.update(b"dict")
|
||||
keys = sorted(value.keys())
|
||||
for key in keys:
|
||||
hash.update(b"key::")
|
||||
hash.update(key.encode("utf-8"))
|
||||
hash.update(b"value::")
|
||||
process(value[key], hash)
|
||||
hash.update(b"tcid")
|
||||
elif isinstance(value, str):
|
||||
hash.update(b"str")
|
||||
hash.update(value.encode("utf-8"))
|
||||
hash.update(b"rts")
|
||||
elif isinstance(value, list):
|
||||
hash.update(b"list")
|
||||
for item in value:
|
||||
process(item, hash)
|
||||
hash.update(b"tsil")
|
||||
elif isinstance(value, tuple):
|
||||
hash.update(b"tuple")
|
||||
for item in value:
|
||||
process(item, hash)
|
||||
hash.update(b"elput")
|
||||
|
||||
hash = hashlib.sha256(usedforsecurity=False)
|
||||
process(value, hash)
|
||||
return hash.hexdigest()
|
||||
|
||||
|
||||
BLANK_TAGS = {"p", "br", "li", "div", "img"}
|
||||
MULTI_SPACES = re.compile(r"\s+")
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
"""Sometimes text is HTML and otherwise ugly. This reduces it to
|
||||
something pretty to display. Strips tags, puts blank space in between
|
||||
elements that should generate blank space, and then collapses blank
|
||||
spaces down to one.
|
||||
"""
|
||||
|
||||
class Cleaner(html.parser.HTMLParser):
|
||||
def __init__(self, writer):
|
||||
super().__init__()
|
||||
self.writer = writer
|
||||
|
||||
def handle_data(self, data: str) -> None:
|
||||
self.writer.write(data)
|
||||
|
||||
def handle_startendtag(
|
||||
self, tag: str, attrs: list[tuple[str, str | None]]
|
||||
) -> None:
|
||||
del attrs
|
||||
if tag.lower() in BLANK_TAGS:
|
||||
self.writer.write(" ")
|
||||
|
||||
def handle_starttag(
|
||||
self, tag: str, attrs: list[tuple[str, str | None]]
|
||||
) -> None:
|
||||
del attrs
|
||||
if tag.lower() in BLANK_TAGS:
|
||||
self.writer.write(" ")
|
||||
|
||||
writer = io.StringIO()
|
||||
cleaner = Cleaner(writer)
|
||||
cleaner.feed(text)
|
||||
return MULTI_SPACES.sub(" ", writer.getvalue())
|
||||
|
||||
|
||||
async def fetch_feed(
|
||||
feed: FeedMeta,
|
||||
) -> typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]:
|
||||
"""Potentially fetch the feed described by `feed`, returning a parsed feed
|
||||
(if possible and necessary) and an updated FeedMeta.
|
||||
|
||||
This function can fail to return a parsed feed under a number of
|
||||
circumstances. Among them:
|
||||
|
||||
- It's too soon to be checking this feed again.
|
||||
- The feed has been failing for a while and we've declared it dead.
|
||||
- The server told us it was dead.
|
||||
- We checked the server and it told us our cache was good.
|
||||
- We tried to contact the server, but a networking error happened.
|
||||
|
||||
Regardless, the new FeedMeta has the latest state of the feed.
|
||||
"""
|
||||
if feed.status != FEED_STATUS_ALIVE:
|
||||
LOG.info(f"{feed.url} is dead or unsubscribed")
|
||||
return (None, feed)
|
||||
|
||||
if time.time() < feed.retry_after_ts:
|
||||
retry_str = time.strftime(
|
||||
"%Y-%m-%d %H:%M:%S %z", time.localtime(feed.retry_after_ts)
|
||||
)
|
||||
LOG.info(f"{feed.url} will not be pulled until {retry_str}")
|
||||
return (None, feed)
|
||||
|
||||
# We waffle back and forth about using feedparser's HTTP support vs
|
||||
# calling requests ourselves. We have decided to use requests manually at
|
||||
# this time because it makes it much easier to figure out whether or
|
||||
# not a request has succeeded. (The straw was handling timeouts and
|
||||
# understanding whether `bozo_exception` was a transport failure or not.)
|
||||
|
||||
headers = {"user-agent": "cry-reader v0.0"}
|
||||
if feed.etag:
|
||||
headers["if-none-match"] = feed.etag
|
||||
if feed.modified:
|
||||
headers["if-modified-since"] = feed.modified
|
||||
|
||||
LOG.info(f"{feed.url} fetching...")
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
response = await loop.run_in_executor(
|
||||
None,
|
||||
functools.partial(http.get, feed.url, headers=headers),
|
||||
)
|
||||
LOG.info(f"{feed.url} fetched with status: {response.status_code}")
|
||||
failed = response.status_code >= 400
|
||||
except Exception as e:
|
||||
LOG.error(f"{feed.url} error fetching: {e}")
|
||||
failed = True
|
||||
response = None
|
||||
|
||||
# Now, there are a number of things to consider in the response that
|
||||
# affect how we update our permanent record.
|
||||
|
||||
if response is not None and response.status_code == 410:
|
||||
# Permanently gone, really stop asking.
|
||||
LOG.error(f"{feed.url} permanently gone")
|
||||
return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD))
|
||||
|
||||
if failed and time.time() > feed.last_fetched_ts + (7 * 24 * 60 * 60):
|
||||
# If we've been failing to fetch the feed for more than a week then
|
||||
# consider us dead, we must be doing something wrong.
|
||||
LOG.error(f"{feed.url} failed for too long, giving up")
|
||||
return (None, dataclasses.replace(feed, status=FEED_STATUS_DEAD))
|
||||
|
||||
if response and response.is_permanent_redirect:
|
||||
# Permanent redirect, update the stored URL, but mark this as a
|
||||
# successful fetch.
|
||||
#
|
||||
# TODO: Is this actually the right URL to store? We need the last
|
||||
# permanently redirected URL, not just whatever the last thing
|
||||
# is... e.g. imagine a permanent followed by a temporary
|
||||
# redirect, then what?
|
||||
LOG.info(f"{feed.url} permanently redirected to {response.url}")
|
||||
assert response.url is not None
|
||||
feed = dataclasses.replace(feed, url=response.url)
|
||||
|
||||
# NOTE: We might still be in a failure state here. But success or fail,
|
||||
# the server might have told us when to next retry, so make a note
|
||||
# of it.
|
||||
retry_delta = None
|
||||
if response is not None:
|
||||
try:
|
||||
retry_delta = int(response.headers.get("retry-after", "nope"))
|
||||
except Exception:
|
||||
pass
|
||||
if retry_delta is None:
|
||||
if failed:
|
||||
retry_delta = 1 * 60 # Retry again in a minute
|
||||
else:
|
||||
retry_delta = 60 * 60 # 1 hour default
|
||||
|
||||
feed = dataclasses.replace(feed, retry_after_ts=int(time.time()) + retry_delta)
|
||||
|
||||
# We've done everything we can on a failure, bail if we've got an error.
|
||||
if failed:
|
||||
LOG.info(f"{feed.url} failed at the network level")
|
||||
return (None, feed)
|
||||
|
||||
assert response is not None
|
||||
|
||||
# Record our successful fetch now, to reset the failure timer above.
|
||||
feed = dataclasses.replace(feed, last_fetched_ts=int(time.time()))
|
||||
|
||||
# We can *still* be successful but like, no changes.
|
||||
if response.status_code != 200:
|
||||
LOG.info(f"{feed.url} had no changes")
|
||||
return (None, feed)
|
||||
|
||||
feed = dataclasses.replace(
|
||||
feed,
|
||||
etag=response.headers.get("etag"),
|
||||
modified=response.headers.get("last-modified"),
|
||||
)
|
||||
parsed = feedparser.parse(response.content, response_headers=response.headers)
|
||||
return (parsed, feed)
|
||||
|
||||
|
||||
async def fetch_many(
|
||||
metas: list[FeedMeta],
|
||||
) -> list[typing.Tuple[feedparser.FeedParserDict | None, FeedMeta]]:
|
||||
async with asyncio.TaskGroup() as group:
|
||||
tasks = [group.create_task(fetch_feed(m)) for m in metas]
|
||||
return [t.result() for t in tasks]
|
||||
return True
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
|
|
@ -398,6 +210,212 @@ class Feed:
|
|||
return Feed(meta=meta, title=title, link=link, entries=entries)
|
||||
|
||||
|
||||
def the_worst_element_hash(value) -> str:
|
||||
"""Compute a content hash for the given feed element, to use as an ID.
|
||||
|
||||
The hash must be as stable as we can make it, but obviously there are things
|
||||
we cannot control. If we've gotten here then the feed author has already
|
||||
failed us and there's little we can do. This is already *known to be wrong.*
|
||||
"""
|
||||
|
||||
def process(value, hash):
|
||||
if isinstance(value, feedparser.FeedParserDict):
|
||||
hash.update(b"dict")
|
||||
keys = sorted(value.keys())
|
||||
for key in keys:
|
||||
hash.update(b"key::")
|
||||
hash.update(key.encode("utf-8"))
|
||||
hash.update(b"value::")
|
||||
process(value[key], hash)
|
||||
hash.update(b"tcid")
|
||||
elif isinstance(value, str):
|
||||
hash.update(b"str")
|
||||
hash.update(value.encode("utf-8"))
|
||||
hash.update(b"rts")
|
||||
elif isinstance(value, list):
|
||||
hash.update(b"list")
|
||||
for item in value:
|
||||
process(item, hash)
|
||||
hash.update(b"tsil")
|
||||
elif isinstance(value, tuple):
|
||||
hash.update(b"tuple")
|
||||
for item in value:
|
||||
process(item, hash)
|
||||
hash.update(b"elput")
|
||||
|
||||
hash = hashlib.sha256(usedforsecurity=False)
|
||||
process(value, hash)
|
||||
return hash.hexdigest()
|
||||
|
||||
|
||||
BLANK_TAGS = {"p", "br", "li", "div", "img"}
|
||||
MULTI_SPACES = re.compile(r"\s+")
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
|
||||
"""Sometimes text is HTML and otherwise ugly. This reduces it to
|
||||
something pretty to display. Strips tags, puts blank space in between
|
||||
elements that should generate blank space, and then collapses blank
|
||||
spaces down to one.
|
||||
"""
|
||||
|
||||
class Cleaner(html.parser.HTMLParser):
|
||||
def __init__(self, writer):
|
||||
super().__init__()
|
||||
self.writer = writer
|
||||
|
||||
def handle_data(self, data: str) -> None:
|
||||
self.writer.write(data)
|
||||
|
||||
def handle_startendtag(
|
||||
self, tag: str, attrs: list[tuple[str, str | None]]
|
||||
) -> None:
|
||||
del attrs
|
||||
if tag.lower() in BLANK_TAGS:
|
||||
self.writer.write(" ")
|
||||
|
||||
def handle_starttag(
|
||||
self, tag: str, attrs: list[tuple[str, str | None]]
|
||||
) -> None:
|
||||
del attrs
|
||||
if tag.lower() in BLANK_TAGS:
|
||||
self.writer.write(" ")
|
||||
|
||||
writer = io.StringIO()
|
||||
cleaner = Cleaner(writer)
|
||||
cleaner.feed(text)
|
||||
return MULTI_SPACES.sub(" ", writer.getvalue())
|
||||
|
||||
|
||||
def could_be_feed_data(data: str) -> bool:
|
||||
"""See if the data might be a feed."""
|
||||
data = data.lower()
|
||||
if data.count("<html"):
|
||||
return False
|
||||
return (data.count("<rss") + data.count("<rdf") + data.count("<feed")) > 0
|
||||
|
||||
|
||||
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta]:
|
||||
if not meta.should_fetch(time.time()):
|
||||
return (None, meta)
|
||||
|
||||
headers = {"user-agent": USER_AGENT}
|
||||
if meta.etag:
|
||||
headers["if-none-match"] = meta.etag
|
||||
if meta.modified:
|
||||
headers["if-modified-since"] = meta.modified
|
||||
|
||||
# We waffle back and forth about using feedparser's HTTP support vs
|
||||
# calling requests ourselves. We have decided to use requests manually at
|
||||
# this time because it makes it much easier to figure out whether or
|
||||
# not a request has succeeded. (The straw was handling timeouts and
|
||||
# understanding whether `bozo_exception` was a transport failure or not.)
|
||||
#
|
||||
# TODO: Check robots.txt!
|
||||
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
response = await loop.run_in_executor(
|
||||
None,
|
||||
functools.partial(http.get, meta.url, headers=headers),
|
||||
)
|
||||
LOG.info(f"{meta.url} fetched with status: {response.status_code}")
|
||||
failed = response.status_code >= 400
|
||||
except Exception as e:
|
||||
LOG.error(f"{meta.url} error fetching: {e}")
|
||||
failed = True
|
||||
response = None
|
||||
|
||||
# Now, there are a number of things to consider in the response that
|
||||
# affect how we update our permanent record.
|
||||
|
||||
if response is not None and response.status_code == 410:
|
||||
# Permanently gone, really stop asking.
|
||||
LOG.error(f"{meta.url} permanently gone")
|
||||
return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))
|
||||
|
||||
if failed and time.time() > meta.last_fetched_ts + (7 * 24 * 60 * 60):
|
||||
# If we've been failing to fetch the feed for more than a week then
|
||||
# consider us dead, we must be doing something wrong.
|
||||
LOG.error(f"{meta.url} failed for too long, giving up")
|
||||
return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))
|
||||
|
||||
if response and response.is_permanent_redirect:
|
||||
# Permanent redirect, update the stored URL, but mark this as a
|
||||
# successful fetch.
|
||||
#
|
||||
# TODO: Is this actually the right URL to store? We need the last
|
||||
# permanently redirected URL, not just whatever the last thing
|
||||
# is... e.g. imagine a permanent followed by a temporary
|
||||
# redirect, then what?
|
||||
LOG.info(f"{meta.url} permanently redirected to {response.url}")
|
||||
assert response.url is not None
|
||||
meta = dataclasses.replace(meta, url=response.url)
|
||||
|
||||
# TODO: Handle that bogus non-HTTP redirect that feedfinder uses.
|
||||
|
||||
# NOTE: We might still be in a failure state here. But success or fail,
|
||||
# the server might have told us when to next retry, so make a note
|
||||
# of it. The server might also have given us updated caching
|
||||
# information (even on failure!) and so let's also make a note of that.
|
||||
retry_delta = None
|
||||
etag = meta.etag
|
||||
modified = meta.modified
|
||||
if response is not None:
|
||||
etag = response.headers.get("etag", meta.etag)
|
||||
modified = response.headers.get("last-modified", meta.modified)
|
||||
|
||||
try:
|
||||
retry_delta = int(response.headers.get("retry-after", "nope"))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if retry_delta is None:
|
||||
if failed:
|
||||
retry_delta = 1 * 60 # Retry again in a minute
|
||||
else:
|
||||
retry_delta = 60 * 60 # 1 hour default
|
||||
|
||||
meta = dataclasses.replace(
|
||||
meta,
|
||||
retry_after_ts=int(time.time()) + retry_delta,
|
||||
etag=etag,
|
||||
modified=modified,
|
||||
)
|
||||
|
||||
# We've done everything we can on a failure, bail if we've got an error.
|
||||
if failed:
|
||||
LOG.info(f"{meta.url} failed at the network level")
|
||||
return (None, meta)
|
||||
|
||||
assert response is not None
|
||||
|
||||
# Record our successful fetch now, to reset the failure timer above.
|
||||
meta = dataclasses.replace(meta, last_fetched_ts=int(time.time()))
|
||||
|
||||
# We can *still* be successful but like, no changes.
|
||||
if response.status_code != 200:
|
||||
LOG.info(f"{meta.url} had no changes")
|
||||
return (None, meta)
|
||||
|
||||
# Does this seem to be a feed? Or not?
|
||||
if could_be_feed_data(response.text):
|
||||
parsed = feedparser.parse(response.content, response_headers=response.headers)
|
||||
return (Feed.from_parsed(parsed, meta), meta)
|
||||
|
||||
# No, this is not a feed; just return the raw content for further
|
||||
# processing.
|
||||
return (response.text, meta)
|
||||
|
||||
|
||||
async def fetch_many(
|
||||
metas: list[FeedMeta],
|
||||
) -> list[typing.Tuple[Feed | str | None, FeedMeta]]:
|
||||
async with asyncio.TaskGroup() as group:
|
||||
tasks = [group.create_task(fetch_feed(m)) for m in metas]
|
||||
return [t.result() for t in tasks]
|
||||
|
||||
|
||||
def merge_feeds(a: Feed, b: Feed) -> Feed:
|
||||
"""Merge two known feeds. There are two conflict resolution policies:
|
||||
|
||||
|
|
@ -436,3 +454,180 @@ def sort_key(f: Feed) -> int:
|
|||
if len(f.entries) > 0:
|
||||
return max(e.inserted_at for e in f.entries)
|
||||
return -1
|
||||
|
||||
|
||||
class FeedSearchParser(html.parser.HTMLParser):
|
||||
"""An HTML parser that tries to find links to feeds."""
|
||||
|
||||
FEED_TYPES = (
|
||||
"application/rss+xml",
|
||||
"text/xml",
|
||||
"application/atom+xml",
|
||||
"application/x.atom+xml",
|
||||
"application/x-atom+xml",
|
||||
)
|
||||
|
||||
link_links: list[str]
|
||||
a_links: list[str]
|
||||
|
||||
def __init__(self, baseuri):
|
||||
super().__init__()
|
||||
self.baseuri = baseuri
|
||||
self.link_links = []
|
||||
self.a_links = []
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
attrs = {k: v for k, v in attrs}
|
||||
if tag == "base":
|
||||
self.do_base(attrs)
|
||||
elif tag == "link":
|
||||
self.do_link(attrs)
|
||||
elif tag == "a":
|
||||
self.do_a(attrs)
|
||||
|
||||
def do_base(self, attrs):
|
||||
base = attrs.get("href")
|
||||
if base is not None:
|
||||
self.baseuri = base
|
||||
|
||||
def do_link(self, attrs):
|
||||
rel = attrs.get("rel")
|
||||
if rel is None:
|
||||
return
|
||||
|
||||
if "alternate" not in rel.split():
|
||||
return
|
||||
|
||||
if attrs.get("type", "").lower() not in self.FEED_TYPES:
|
||||
return
|
||||
|
||||
href = attrs.get("href")
|
||||
if href is None:
|
||||
return
|
||||
|
||||
self.link_links.append(urllib.parse.urljoin(self.baseuri, href))
|
||||
|
||||
def do_a(self, attrs):
|
||||
href = attrs.get("href")
|
||||
if href is None:
|
||||
return
|
||||
|
||||
self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
|
||||
|
||||
|
||||
def massage_url(uri: str) -> str:
|
||||
uri = uri.strip()
|
||||
if uri.startswith("feed://"):
|
||||
uri = "http://" + uri.split("feed://", 1).pop()
|
||||
for x in ["http", "https"]:
|
||||
if uri.startswith("%s://" % x):
|
||||
return uri
|
||||
return "http://%s" % uri
|
||||
|
||||
|
||||
def classify_links(links, baseuri) -> typing.Tuple[list[str], list[str]]:
|
||||
"""Split the links into two sets: local (which start with baseuri) and
|
||||
remote (which don't).
|
||||
"""
|
||||
baseuri = baseuri.lower()
|
||||
|
||||
local, remote = [], []
|
||||
for link in links:
|
||||
if link.lower().startswith(baseuri):
|
||||
local.append(link)
|
||||
else:
|
||||
remote.append(link)
|
||||
|
||||
return local, remote
|
||||
|
||||
|
||||
def is_feed_link(link: str) -> bool:
|
||||
"""Return True if the link seems to be a feed link, or False otherwise."""
|
||||
link = link.lower()
|
||||
return (
|
||||
link.endswith(".rss")
|
||||
or link.endswith(".rdf")
|
||||
or link.endswith(".xml")
|
||||
or link.endswith(".atom")
|
||||
)
|
||||
|
||||
|
||||
def is_XML_related_link(link: str) -> bool:
|
||||
link = link.lower()
|
||||
return "rss" in link or "rdf" in link or "xml" in link or "atom" in link
|
||||
|
||||
|
||||
async def check_feed(url: str, origin: str) -> Feed | None:
|
||||
"""Check to see if the given URL is a feed. If it is, return the feed,
|
||||
otherwise return None.
|
||||
"""
|
||||
meta = FeedMeta.from_url(url, origin)
|
||||
result, meta = await fetch_feed(meta)
|
||||
if isinstance(result, Feed):
|
||||
return result
|
||||
|
||||
return None
|
||||
|
||||
|
||||
async def check_links(links: typing.Iterable[str], origin: str) -> list[Feed]:
|
||||
"""Fetch all the links and return the ones that appear to have feeds in
|
||||
them. If none of them are fetchable or none of them have feeds then this
|
||||
will return an empty list.
|
||||
"""
|
||||
async with asyncio.TaskGroup() as group:
|
||||
tasks = [group.create_task(check_feed(link, origin)) for link in links]
|
||||
|
||||
outfeeds: list[Feed] = []
|
||||
for task in tasks:
|
||||
result = task.result()
|
||||
if result is not None:
|
||||
outfeeds.append(result)
|
||||
|
||||
return outfeeds
|
||||
|
||||
|
||||
async def feed_search(uri: str, origin: str) -> list[Feed]:
|
||||
meta = FeedMeta.from_url(massage_url(uri), origin)
|
||||
result, meta = await fetch_feed(meta)
|
||||
if result is None:
|
||||
return []
|
||||
if isinstance(result, Feed):
|
||||
return [result]
|
||||
|
||||
# OK it was not a feed, let's try all our searching games.
|
||||
parser = FeedSearchParser(meta.url)
|
||||
parser.feed(result)
|
||||
|
||||
LOG.debug("Checking links...")
|
||||
outfeeds = await check_links(parser.link_links, origin)
|
||||
if len(outfeeds) > 0:
|
||||
return outfeeds
|
||||
|
||||
LOG.debug("No links, checking A tags...")
|
||||
local_links, remote_links = classify_links(parser.a_links, meta.url)
|
||||
outfeeds = await check_links(filter(is_feed_link, local_links), origin)
|
||||
if len(outfeeds) > 0:
|
||||
return outfeeds
|
||||
outfeeds = await check_links(filter(is_XML_related_link, local_links), origin)
|
||||
if len(outfeeds) > 0:
|
||||
return outfeeds
|
||||
outfeeds = await check_links(filter(is_feed_link, remote_links), origin)
|
||||
if len(outfeeds) > 0:
|
||||
return outfeeds
|
||||
outfeeds = await check_links(filter(is_XML_related_link, remote_links), origin)
|
||||
if len(outfeeds) > 0:
|
||||
return outfeeds
|
||||
|
||||
LOG.debug("no A tags, guessing")
|
||||
suffixes = [ # filenames used by popular software:
|
||||
"atom.xml", # blogger, TypePad
|
||||
"index.atom", # MT, apparently
|
||||
"index.rdf", # MT
|
||||
"rss.xml", # Dave Winer/Manila
|
||||
"index.xml", # MT
|
||||
"index.rss", # Slash
|
||||
]
|
||||
outfeeds = await check_links(
|
||||
[urllib.parse.urljoin(meta.url, x) for x in suffixes], origin
|
||||
)
|
||||
return outfeeds
|
||||
|
|
|
|||
|
|
@ -2,8 +2,10 @@
|
|||
|
||||
Based on http://www.aaronsw.com/2002/feedfinder/
|
||||
|
||||
Kinda rewritten by John Doty for the Python3 and the cry aggregator, but the
|
||||
basic frame remains.
|
||||
Rewritten by John Doty for Python 3 and the cry aggregator, but the basic
|
||||
frame remains. The big thing *this* does is also return the FeedMeta when it
|
||||
has found feeds, instead of just URLs. This is more useful for the rest of
|
||||
processing.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
|
@ -17,6 +19,7 @@ import urllib.robotparser
|
|||
|
||||
import requests
|
||||
|
||||
from . import feed
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -125,7 +128,7 @@ class HtmlBasedParser(html.parser.HTMLParser):
|
|||
self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
|
||||
|
||||
|
||||
def makeFullURI(uri):
|
||||
def makeFullURI(uri: str) -> str:
|
||||
uri = uri.strip()
|
||||
if uri.startswith("feed://"):
|
||||
uri = "http://" + uri.split("feed://", 1).pop()
|
||||
|
|
@ -204,8 +207,6 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
|
|||
"""Find feeds for the given URI.
|
||||
|
||||
How it works:
|
||||
0.
|
||||
|
||||
1. If the URI points to a feed, it is simply returned; otherwise
|
||||
the page is downloaded and the real fun begins.
|
||||
|
||||
|
|
@ -293,58 +294,3 @@ def find_feeds(uri: str, all: bool = False, _recurs=None) -> list[str]:
|
|||
)
|
||||
|
||||
return list(set(outfeeds))
|
||||
|
||||
|
||||
##### test harness ######
|
||||
|
||||
|
||||
def test():
|
||||
uri = "http://diveintomark.org/tests/client/autodiscovery/html4-001.html"
|
||||
failed = []
|
||||
count = 0
|
||||
while 1:
|
||||
data = _gatekeeper.get(uri)
|
||||
if data.find("Atom autodiscovery test") == -1:
|
||||
break
|
||||
sys.stdout.write(".")
|
||||
sys.stdout.flush()
|
||||
count += 1
|
||||
links = getLinks(data, uri)
|
||||
if not links:
|
||||
print(f"\n*** FAILED *** {uri} could not find link")
|
||||
failed.append(uri)
|
||||
elif len(links) > 1:
|
||||
print(f"\n*** FAILED *** {uri} found too many links")
|
||||
failed.append(uri)
|
||||
else:
|
||||
atomdata = requests.get(links[0]).text
|
||||
if atomdata.find('<link rel="alternate"') == -1:
|
||||
print(f"\n*** FAILED *** {uri} retrieved something that is not a feed")
|
||||
failed.append(uri)
|
||||
else:
|
||||
backlink = atomdata.split('href="').pop().split('"')[0]
|
||||
if backlink != uri:
|
||||
print(f"\n*** FAILED *** {uri} retrieved wrong feed")
|
||||
failed.append(uri)
|
||||
if data.find('<link rel="next" href="') == -1:
|
||||
break
|
||||
uri = urllib.parse.urljoin(
|
||||
uri, data.split('<link rel="next" href="').pop().split('"')[0]
|
||||
)
|
||||
print()
|
||||
print(f"{count} tests executed, {len(failed)} failed")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = sys.argv[1:]
|
||||
if args and args[0] == "--debug":
|
||||
_debug = 1
|
||||
args.pop(0)
|
||||
if args:
|
||||
uri = args[0]
|
||||
else:
|
||||
uri = "http://diveintomark.org/"
|
||||
if uri == "test":
|
||||
test()
|
||||
else:
|
||||
print("\n".join(getFeeds(uri)))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue