# (file listing metadata: 786 lines, 26 KiB, Python)
# I guess this is it.
|
|
import asyncio
|
|
import calendar
|
|
import dataclasses
|
|
import datetime
|
|
import functools
|
|
import hashlib
|
|
import html.parser
|
|
import io
|
|
import logging
|
|
import re
|
|
import time
|
|
import typing
|
|
import urllib.parse
|
|
import urllib.robotparser
|
|
|
|
|
|
import feedparser
|
|
import requests
|
|
import requests.structures
|
|
|
|
|
|
LOG = logging.getLogger(__name__)


# User-agent string sent with every HTTP request this module makes.
USER_AGENT = "cry-reader v0.0"

# Lifecycle states stored in FeedMeta.status. Only ALIVE feeds are ever
# fetched (see FeedMeta.should_fetch); DEAD is set on HTTP 410 or after a
# week of failures (see fetch_feed). UNSUBSCRIBED is presumably set by the
# caller/UI -- nothing in this module sets it.
FEED_STATUS_ALIVE = 0
FEED_STATUS_DEAD = 1
FEED_STATUS_UNSUBSCRIBED = 2

# TODO: Consider configuration here.
# Shared session so connection pooling is reused across feed fetches.
http = requests.Session()
|
|
|
|
|
|
@dataclasses.dataclass(frozen=True)
class FeedMeta:
    """Persistent per-feed bookkeeping: where it lives, when we last talked
    to it, when we may talk to it again, and the HTTP caching tokens the
    server handed us.
    """

    url: str
    last_fetched_ts: int  # unix seconds of the last *successful* fetch
    retry_after_ts: int  # unix seconds before which we must not fetch
    status: int  # one of the FEED_STATUS_* constants
    etag: str | None  # HTTP ETag from the last response, if any
    modified: str | None  # HTTP Last-Modified from the last response, if any

    @classmethod
    def from_url(cls, url: str) -> "FeedMeta":
        """Build a brand-new record for a feed we have never fetched."""
        return cls(
            url=url,
            last_fetched_ts=0,
            retry_after_ts=0,
            status=FEED_STATUS_ALIVE,
            etag=None,
            modified=None,
        )

    def should_fetch(self, now) -> bool:
        """Return True when fetching this feed at time `now` is allowed."""
        if self.status != FEED_STATUS_ALIVE:
            LOG.info(f"{self.url} is dead or unsubscribed")
            return False

        if now >= self.retry_after_ts:
            return True

        retry_str = time.strftime(
            "%Y-%m-%d %H:%M:%S %z", time.localtime(self.retry_after_ts)
        )
        LOG.info(f"{self.url} will not be pulled until {retry_str}")
        return False
|
|
|
|
|
|
@dataclasses.dataclass(frozen=True)
class Entry:
    """One normalized item from a feed, ready for storage and display."""

    id: str
    inserted_at: int  # Unix time, but ms, not sec
    posted_at: int  # Unix time, but ms, not sec
    title: str
    link: str | None

    @classmethod
    def from_parsed(cls, entry: feedparser.FeedParserDict, insert_time: int) -> "Entry":
        """Convert an entry from feedparser into an Entry by extracting the
        things we care about, fudging things and substituting things as
        necessary.

        The one thing we need from the outside is the "insert time", which
        is *almost* `int(time.time())` but needs a little bit of fudging in
        order to ensure that we can keep the items in order when we get a lot
        of them all at once.
        """
        # Prefer a real title; some feeds only populate the description.
        title = entry.get("title")
        if not title:
            title = entry.get("description")

        # NOTE(review): `id` shadows the builtin throughout this method.
        id = entry.get("id")

        link = entry.get("link")
        if id and not link:
            # Some feeds stash the permalink in the id/guid; use it as the
            # link if it looks like a URL.
            linkid = str(id).lower()
            if linkid.startswith("http:") or linkid.startswith("https:"):
                link = linkid

        # Fall back through progressively worse sources of identity.
        if link and not id:
            id = link
        if title and not id:
            id = title
        if not id:
            id = entry.get("published")
        if not id:
            id = the_worst_element_hash(entry)

        published = entry.get("published_parsed")
        if published is None:
            published = entry.get("updated_parsed")
        if published is not None:
            # feedparser hands back a time.struct_time, a tuple subclass.
            assert isinstance(published, tuple)

            # NOTE: Take insert_time if it's smaller; publish time errors generate
            # posts from the future.
            # timegm interprets the parsed struct as UTC; scale sec -> ms.
            pub_time = int(calendar.timegm(published) * 1000)
            posted_at = min(pub_time, insert_time)
        else:
            posted_at = insert_time

        assert isinstance(id, str)
        assert link is None or isinstance(link, str)

        # Titles may contain HTML; reduce to plain single-spaced text.
        title = clean_text(str(title))
        return Entry(
            id=id,
            inserted_at=insert_time,
            posted_at=posted_at,
            title=title,
            link=link,
        )

    def posted_time_iso(self) -> str:
        # NOTE(review): naive local-time ISO string (no tz offset) -- confirm
        # local time is what display code expects.
        return datetime.datetime.fromtimestamp(self.posted_at / 1000).isoformat()

    def time_ago(self) -> str:
        """Render the post's age compactly, e.g. "45s", "12m", "3h", "2w"."""
        posted = int(self.posted_at / 1000)  # ms -> seconds
        seconds = int(time.time()) - posted
        # The 90-unit cutoffs (presumably) keep small overshoots in the finer
        # unit, e.g. "75m" instead of "1h".
        if seconds <= 90:
            return f"{seconds}s"
        minutes = int(seconds / 60)
        if minutes <= 90:
            return f"{minutes}m"
        hours = int(minutes / 60)
        if hours < 24:
            return f"{hours}h"
        days = int(hours / 24)
        if days <= 7:
            return f"{days}d"
        weeks = int(days / 7)
        if weeks < 52:
            return f"{weeks}w"

        years = int(weeks / 52)
        return f"{years}y"
|
|
|
|
|
|
@dataclasses.dataclass(frozen=True)
class Feed:
    """A fetched feed: its bookkeeping record plus the parsed entries."""

    meta: FeedMeta
    title: str
    link: str
    entries: list[Entry]

    @classmethod
    def from_parsed(cls, d: feedparser.FeedParserDict, meta: FeedMeta) -> "Feed":
        """Build a Feed from a feedparser result, substituting the feed URL
        for any missing title/link and assigning stable entry timestamps.
        """
        title = None
        link = None

        if d.feed is not None:
            assert not isinstance(d.feed, list)
            title = d.feed.get("title")
            link = d.feed.get("link")

        # Fall back to the URL when the feed doesn't name itself.
        if title is None or title == "":
            title = meta.url
        if link is None:
            link = meta.url

        # =====================================================================
        # FEED AND ENTRY ORDERING!
        # =====================================================================
        # In many ways this is the most critical part of a feed reader: in
        # what order do we show the items in the feed?
        #
        # RSS is pretty unspecified in general, but also in what the meaning
        # of the order of the entries in the feed actually is. (I can't
        # remember if this is something that Atom specifies but it doesn't
        # matter because RSS is still really popular, even in the ungodly
        # late year of 2024.)
        #
        # *We* want to show posts in reverse chronological order, of course,
        # but we still have problems. You *cannot* trust the dates and times
        # in the entries. Sure, sure, Atom does a great job of specifying at
        # least three different timestamps in the feed, and they are supposed
        # to have time zones and whatnot. But:
        #
        # a) Any kind of timestamp is optional in RSS, and
        # b) Even if the timestamp is present, it can come in a variety of
        #    formats (which theoretically `feedparser` handles), but
        # c) Even if you can parse the timestamp, many feed implementations
        #    just PUT THE WRONG TIME IN THERE.
        #
        # So we have to account for the fact that the publish time might be
        # wildly unreliable, and back it up with our own clock. This comes with
        # its own problems, of course: our clock can be highly unreliable. But
        # in general it's good enough to work with, and feeds don't update so
        # frequently that we need to worry about most of these problems if we
        # use unix timestamps as our basis.
        #
        # If we just use our own timestamps, then what do we do with feed
        # updates where multiple items are inserted at once? We want to
        # preserve that ordering too! Our hack is to multiply the unix
        # timestamp by 1000, and then use the lower three digits as a sequence
        # number. (Maybe it looks like everything was posted a millisecond
        # apart?) There's a *chance* of conflict if:
        #
        # a) a feed has more than 1000 items in it, and
        # b) we update the feed again less than a second later
        #
        # But given the other rate limiting features in this RSS system (The
        # `retry_after_ts` field, etc.) it's not a very likely thing to
        # happen.
        #
        # The *other* big source of time instability is that "new" items might
        # seem to have been published with a time that is "before" the last
        # item we previously saw. (i.e., on the first refresh we see an item
        # from October 3rd, then on the next refresh we see an item from October
        # 1st.) We don't know anything about historical refreshes here in feed
        # land, so that gets corrected in the database. (See store_feed.)
        #
        insert_time = int(time.time()) * 1000
        # Feeds list newest-first; enumerate over the reversed (oldest-first)
        # list so the sequence number grows with recency, then flip back to
        # the feed's newest-first order.
        entries = [
            Entry.from_parsed(e, insert_time + i)
            for i, e in enumerate(reversed(d.entries))
        ]
        entries.reverse()

        return Feed(meta=meta, title=title, link=link, entries=entries)
|
|
|
|
|
|
def the_worst_element_hash(value) -> str:
    """Compute a content hash for the given feed element, to use as an ID.

    The hash must be as stable as we can make it, but obviously there are things
    we cannot control. If we've gotten here then the feed author has already
    failed us and there's little we can do. This is already *known to be wrong.*
    """

    def _mix(item, digest) -> None:
        # Each branch brackets its content in distinct sentinel bytes so
        # differently-shaped values can't collide with each other.
        if isinstance(item, feedparser.FeedParserDict):
            digest.update(b"dict")
            for key in sorted(item.keys()):
                digest.update(b"key::")
                digest.update(key.encode("utf-8"))
                digest.update(b"value::")
                _mix(item[key], digest)
            digest.update(b"tcid")
        elif isinstance(item, str):
            digest.update(b"str")
            digest.update(item.encode("utf-8"))
            digest.update(b"rts")
        elif isinstance(item, list):
            digest.update(b"list")
            for element in item:
                _mix(element, digest)
            digest.update(b"tsil")
        elif isinstance(item, tuple):
            digest.update(b"tuple")
            for element in item:
                _mix(element, digest)
            digest.update(b"elput")
        # Any other type (ints, None, ...) contributes nothing, as before.

    digest = hashlib.sha256(usedforsecurity=False)
    _mix(value, digest)
    return digest.hexdigest()
|
|
|
|
|
|
BLANK_TAGS = {"p", "br", "li", "div", "img"}
|
|
MULTI_SPACES = re.compile(r"\s+")
|
|
|
|
|
|
def clean_text(text: str) -> str:
|
|
"""Sometimes text is HTML and otherwise ugly. This reduces it to
|
|
something pretty to display. Strips tags, puts blank space in between
|
|
elements that should generate blank space, and then collapses blank
|
|
spaces down to one.
|
|
"""
|
|
|
|
class Cleaner(html.parser.HTMLParser):
|
|
def __init__(self, writer):
|
|
super().__init__()
|
|
self.writer = writer
|
|
|
|
def handle_data(self, data: str) -> None:
|
|
self.writer.write(data)
|
|
|
|
def handle_startendtag(
|
|
self, tag: str, attrs: list[tuple[str, str | None]]
|
|
) -> None:
|
|
del attrs
|
|
if tag.lower() in BLANK_TAGS:
|
|
self.writer.write(" ")
|
|
|
|
def handle_starttag(
|
|
self, tag: str, attrs: list[tuple[str, str | None]]
|
|
) -> None:
|
|
del attrs
|
|
if tag.lower() in BLANK_TAGS:
|
|
self.writer.write(" ")
|
|
|
|
writer = io.StringIO()
|
|
cleaner = Cleaner(writer)
|
|
cleaner.feed(text)
|
|
cleaner.close()
|
|
return MULTI_SPACES.sub(" ", writer.getvalue())
|
|
|
|
|
|
def could_be_feed_data(data: str) -> bool:
    """See if the data might be a feed."""
    lowered = data.lower()
    # An HTML document is never a feed, no matter what else it mentions.
    if "<html" in lowered:
        return False
    return any(marker in lowered for marker in ("<rss", "<rdf", "<feed"))
|
|
|
|
|
|
class Guardian:
|
|
"""A keeper of robots.txt files."""
|
|
|
|
permissions: dict[str, urllib.robotparser.RobotFileParser | asyncio.Lock]
|
|
|
|
def __init__(self):
|
|
self.permissions = {}
|
|
|
|
async def get_robots_parser(self, url: str) -> urllib.robotparser.RobotFileParser:
|
|
"""Fetch the robots parser for the given URL. Only do it once per site."""
|
|
url = urllib.parse.urljoin(url, "/robots.txt")
|
|
parser = self.permissions.get(url)
|
|
if parser is None:
|
|
parser = asyncio.Lock()
|
|
self.permissions[url] = parser
|
|
|
|
if isinstance(parser, urllib.robotparser.RobotFileParser):
|
|
return parser
|
|
|
|
assert isinstance(parser, asyncio.Lock)
|
|
async with parser:
|
|
parser = self.permissions.get(url)
|
|
if isinstance(parser, urllib.robotparser.RobotFileParser):
|
|
return parser
|
|
|
|
LOG.debug(f"{url} Fetching robots.txt...")
|
|
headers = {"user-agent": USER_AGENT}
|
|
event_loop = asyncio.get_running_loop()
|
|
response = await event_loop.run_in_executor(
|
|
None,
|
|
functools.partial(
|
|
requests.get,
|
|
url,
|
|
headers=headers,
|
|
),
|
|
)
|
|
|
|
parser = urllib.robotparser.RobotFileParser(url)
|
|
if response.status_code in (401, 403):
|
|
LOG.debug(
|
|
f"{url} Server says {response.status_code}, asusming we can't fetch anything"
|
|
)
|
|
parser.disallow_all = True # type: ignore
|
|
elif response.status_code >= 400 and response.status_code < 500:
|
|
LOG.debug(
|
|
f"{url} Server says {response.status_code}, assume we have free reign"
|
|
)
|
|
parser.allow_all = True # type: ignore
|
|
elif response.status_code >= 300:
|
|
response.raise_for_status()
|
|
else:
|
|
text = await event_loop.run_in_executor(None, lambda: response.text)
|
|
parser.parse(text.splitlines())
|
|
|
|
self.permissions[url] = parser
|
|
return parser
|
|
|
|
async def can_fetch(self, url: str) -> bool:
|
|
"""Returns true if we are allowed to fetch the given URL."""
|
|
# Look, opinions differ on whether feed readers are supposed to be
|
|
# considered robots. I added robots.txt support for feeds based on
|
|
# the example of the feed finder python code but on reflection it
|
|
# does not do what I want it to do and the world seems to suggest
|
|
# that RSS readers should ignore it. (i.e., jwz blocks robots from
|
|
# accessing the RSS feed, so.)
|
|
#
|
|
# I'm leaving this code here so that I can resurrect it later if
|
|
# necessary.
|
|
#
|
|
# parser = await self.get_robots_parser(url)
|
|
# return parser.can_fetch(USER_AGENT, url)
|
|
del url
|
|
return True
|
|
|
|
async def crawl_delay(self, url: str) -> int | None:
|
|
"""Returns the number of seconds we should wait before fetching again."""
|
|
try:
|
|
parser = await self.get_robots_parser(url)
|
|
result = parser.crawl_delay(USER_AGENT)
|
|
if isinstance(result, str):
|
|
try:
|
|
return int(result)
|
|
except ValueError:
|
|
return None
|
|
except Exception as e:
|
|
LOG.error(f"Error fetching crawl delay for {url}: {e}")
|
|
return None
|
|
|
|
|
|
GUARDIAN = Guardian()
|
|
|
|
|
|
async def fetch_feed(meta: FeedMeta) -> typing.Tuple[None | Feed | str, FeedMeta]:
    """Fetch a feed from the internet. `meta` is a FeedMeta that has all the
    details about what happened the last time we went to do a fetch, caching
    information and whatnot.

    The return value is a little funky. It returns a 2-tuple, where the first
    element is one of:

    - None, if we could not fetch anything
    - A Feed, if we fetched something and it seemed to be a feed
    - A string, if we fetched something but it was not a feed

    The second element is a FeedMeta that describes the URL. It might be the
    same as the FeedMeta that was provided, but it might not be:

    - The etag might have been updated if the server sent us an etag
    - The modified value might have been updated if the server sent us a
      new value
    - The URL might have been updated if we followed a permanent redirect

    Just to be safe, callers should use the new FeedMeta in place of the
    argument for everything after calling this function.
    """
    if not meta.should_fetch(time.time()):
        return (None, meta)

    # Conditional-request headers: lets the server answer 304 when nothing
    # has changed since last time.
    headers = {"user-agent": USER_AGENT}
    if meta.etag:
        headers["if-none-match"] = meta.etag
    if meta.modified:
        headers["if-modified-since"] = meta.modified

    # We waffle back and forth about using feedreader's HTTP support vs
    # calling requests ourselves. We have decided to use requests manually at
    # this time because it make it much much easier to figure out whether or
    # not a request has succeeded. (The straw was handling timeouts and
    # understanding whether `bozo_exception` was a transport failure or not.)

    if await GUARDIAN.can_fetch(meta.url):
        try:
            # requests blocks; run it on the default executor so the event
            # loop stays responsive.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                functools.partial(http.get, meta.url, headers=headers),
            )
            LOG.info(f"{meta.url} fetched with status: {response.status_code}")
            failed = response.status_code >= 400
        except Exception as e:
            LOG.error(f"{meta.url} error fetching: {e}")
            failed = True
            response = None
    else:
        LOG.error(f"{meta.url} Guardian says we cannot fetch")
        failed = True
        response = None

    # Now, there are a number of things to consider in the response that
    # we need to consider in updating our permanent record.

    if response is not None and response.status_code == 410:
        # Permanently gone, really stop asking.
        LOG.error(f"{meta.url} permanently gone")
        return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))

    if failed and time.time() > meta.last_fetched_ts + (7 * 24 * 60 * 60):
        # If we've been failing to fetch the feed for more than a week then
        # consider us dead, we must be doing something wrong.
        # NOTE(review): a meta fresh from FeedMeta.from_url has
        # last_fetched_ts == 0, so its very *first* failure lands here and
        # marks the feed dead immediately -- confirm that's intended.
        LOG.error(f"{meta.url} failed for too long, giving up")
        return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD))

    if response:
        # NOTE(review): `if response:` uses requests.Response truthiness,
        # which is False for any status >= 400 -- so present-but-failed
        # responses skip this redirect bookkeeping. Confirm that
        # `response is not None` wasn't intended here.
        #
        # Check for permanent redirects and handle them properly. Note that
        # requests is kinda dumb when it comes to permanent redirects: we
        # have to slog through the history itself when it comes to the
        # redirects, and we have to note the URL of the request *after* the
        # permanent redirect in order to get the right one.
        #
        new_url = None

        history = list(response.history)
        history.append(response)
        history.reverse()

        # Walk newest-to-oldest; `last_url` trails one step behind, so when
        # we hit the permanent redirect it holds the post-redirect URL.
        last_url = response.url
        for h in history:
            if h.is_permanent_redirect:
                new_url = last_url
                break
            last_url = h.url

        if new_url is not None:
            LOG.info(f"{meta.url} permanently redirected to {new_url}")
            meta = dataclasses.replace(meta, url=new_url)

    # TODO: Handle that bogus non-HTTP redirect that feedfinder uses.

    # NOTE: We might still be in a failure state here. But success or fail,
    #       the server might have told us when to next retry, so make a note
    #       of it. The server might also have given us updated caching
    #       information (even on failure!) and so let's also make a note of that.
    retry_delta = None
    etag = meta.etag
    modified = meta.modified
    if response is not None:
        etag = response.headers.get("etag", meta.etag)
        modified = response.headers.get("last-modified", meta.modified)

        try:
            # "nope" makes int() raise when the header is absent, leaving
            # retry_delta as None. NOTE: an HTTP-date Retry-After value also
            # fails this int() parse and is ignored.
            retry_delta = int(response.headers.get("retry-after", "nope"))
        except Exception:
            pass

    if retry_delta is None:
        # See if robots.txt has any guidance for us.
        retry_delta = await GUARDIAN.crawl_delay(meta.url)

    if retry_delta is None:
        if failed:
            retry_delta = 1 * 60  # Retry again in a minute
        else:
            retry_delta = 60 * 60  # 1 hour default

    meta = dataclasses.replace(
        meta,
        retry_after_ts=int(time.time()) + retry_delta,
        etag=etag,
        modified=modified,
    )

    # We've done everything we can on a failure, bail if we've got an error.
    if failed:
        LOG.info(f"{meta.url} failed at the network level")
        return (None, meta)

    assert response is not None

    # Record our successful fetch now, to reset the failure timer above.
    meta = dataclasses.replace(meta, last_fetched_ts=int(time.time()))

    # We can *still* be successful but like, no changes.
    if response.status_code != 200:
        LOG.info(f"{meta.url} had no changes")
        return (None, meta)

    # Does this seem to be a feed? Or not?
    if could_be_feed_data(response.text):
        parsed = feedparser.parse(response.content, response_headers=response.headers)
        return (Feed.from_parsed(parsed, meta), meta)

    # No this is not a feed, just return the content out for further
    # processing.
    return (response.text, meta)
|
|
|
|
|
|
async def fetch_many(
    metas: list[FeedMeta],
) -> list[typing.Tuple[Feed | str | None, FeedMeta]]:
    """Fetch every feed in `metas` concurrently; results keep input order."""
    async with asyncio.TaskGroup() as group:
        pending = [group.create_task(fetch_feed(meta)) for meta in metas]
    return [task.result() for task in pending]
|
|
|
|
|
|
def sort_key(f: Feed) -> int:
    """A sort key for sorting feeds by recency."""
    if not f.entries:
        return -1
    return max(entry.posted_at for entry in f.entries)
|
|
|
|
|
|
def sort_key_inserted(f: Feed) -> int:
    """A sort key for sorting feeds by most recent *insertion* time.

    (Unlike sort_key, which keys on the posted time, this uses the time we
    first saw each entry.) Feeds with no entries sort last.
    """
    if not f.entries:
        return -1
    return max(e.inserted_at for e in f.entries)
|
|
|
|
|
|
class FeedSearchParser(html.parser.HTMLParser):
    """An HTML parser that tries to find links to feeds."""

    # MIME types that advertise a feed via <link rel="alternate" type=...>.
    FEED_TYPES = (
        "application/rss+xml",
        "text/xml",
        "application/atom+xml",
        "application/x.atom+xml",
        "application/x-atom+xml",
    )

    link_links: list[str]  # feeds advertised in <link rel="alternate"> tags
    a_links: list[str]  # every <a href> on the page, resolved absolute

    def __init__(self, baseuri):
        super().__init__()
        self.baseuri = baseuri
        self.link_links = []
        self.a_links = []

    def handle_starttag(self, tag, attrs):
        # html.parser hands attrs as (name, value) pairs, with value None
        # for valueless attributes.
        attrs = dict(attrs)
        if tag == "base":
            self.do_base(attrs)
        elif tag == "link":
            self.do_link(attrs)
        elif tag == "a":
            self.do_a(attrs)

    def do_base(self, attrs):
        """Honor <base href=...> for resolving later relative URLs."""
        base = attrs.get("href")
        if base is not None:
            self.baseuri = base

    def do_link(self, attrs):
        """Record <link rel="alternate"> targets with a feed MIME type."""
        rel = attrs.get("rel")
        if rel is None:
            return

        if "alternate" not in rel.split():
            return

        # `or ""` guards against a valueless `type` attribute, which
        # parses as None and would crash the .lower() call.
        if (attrs.get("type") or "").lower() not in self.FEED_TYPES:
            return

        href = attrs.get("href")
        if href is None:
            return

        self.link_links.append(urllib.parse.urljoin(self.baseuri, href))

    def do_a(self, attrs):
        """Record every anchor target for later feed sniffing."""
        href = attrs.get("href")
        if href is None:
            return

        self.a_links.append(urllib.parse.urljoin(self.baseuri, href))
|
|
|
|
|
|
def massage_url(uri: str) -> str:
    """Normalize a user-supplied URL: trim whitespace, translate feed://
    to http://, and default to http:// when no scheme is given.
    """
    uri = uri.strip()
    if uri.startswith("feed://"):
        uri = "http://" + uri[len("feed://"):]
    if uri.startswith(("http://", "https://")):
        return uri
    return "http://%s" % uri
|
|
|
|
|
|
def classify_links(links, baseuri) -> typing.Tuple[list[str], list[str]]:
    """Split the links into two sets: local (which start with baseuri) and
    remote (which don't).

    Each returned list is de-duplicated. Uses insertion-ordered dicts
    rather than sets so the output order is deterministic (first-seen
    order) instead of varying with string hashing between runs.
    """
    baseuri = baseuri.lower()

    local: dict[str, None] = {}
    remote: dict[str, None] = {}
    for link in links:
        # Case-insensitive prefix match, but keep the link's original case.
        if link.lower().startswith(baseuri):
            local[link] = None
        else:
            remote[link] = None

    return list(local), list(remote)
|
|
|
|
|
|
def is_feed_link(link: str) -> bool:
    """Return True if the link seems to be a feed link, or False otherwise."""
    # endswith accepts a tuple of suffixes, so one call covers them all.
    return link.lower().endswith((".rss", ".rdf", ".xml", ".atom"))
|
|
|
|
|
|
def is_XML_related_link(link: str) -> bool:
    """Return True if the link merely *mentions* anything feed-flavored."""
    lowered = link.lower()
    return any(
        token in lowered for token in ("rss", "rdf", "xml", "atom", "feed")
    )
|
|
|
|
|
|
async def check_feed(url: str) -> Feed | None:
    """Check to see if the given URL is a feed. If it is, return the feed,
    otherwise return None.
    """
    LOG.debug(f"Checking {url}: checking...")
    result, _ = await fetch_feed(FeedMeta.from_url(url))
    if not isinstance(result, Feed):
        LOG.debug(f"Checking {url}: is not a feed")
        return None

    LOG.debug(f"Checking {url}: is a feed")
    return result
|
|
|
|
|
|
async def check_links(links: typing.Iterable[str]) -> list[Feed]:
    """Fetch all the links and return the ones that appear to have feeds in
    them. If none of them are fetchable or none of them have feeds then this
    will return nothing.
    """
    async with asyncio.TaskGroup() as group:
        tasks = [group.create_task(check_feed(link)) for link in links]

    # The TaskGroup has awaited everything by here; keep only the hits.
    return [task.result() for task in tasks if task.result() is not None]
|
|
|
|
|
|
async def feed_search(uri: str) -> list[Feed]:
    """Find feeds at (or near) `uri`.

    Tries the URL itself first; if it serves HTML instead of a feed, falls
    back through progressively wilder guesses: advertised <link> tags,
    feed-looking anchors on the page, and finally well-known feed filenames.
    Returns an empty list when nothing pans out.
    """
    meta = FeedMeta.from_url(massage_url(uri))
    result, meta = await fetch_feed(meta)
    if result is None:
        # Couldn't fetch anything at all -- nowhere to search.
        return []
    if isinstance(result, Feed):
        # The URL itself was a feed; done.
        return [result]

    # OK it was not a feed, let's try all our searching games.
    # `result` is the page content; scrape it for candidate links.
    parser = FeedSearchParser(meta.url)
    parser.feed(result)

    LOG.debug("Checking links...")
    outfeeds = await check_links(parser.link_links)
    if len(outfeeds) > 0:
        return outfeeds

    LOG.debug("No links, checking A tags...")
    local_links, remote_links = classify_links(parser.a_links, meta.url)
    for link in local_links:
        LOG.debug(f"  LOCAL {link}")
    for link in remote_links:
        LOG.debug(f"  REMOTE {link}")

    # Same-site links with feed-ish extensions first, then same-site links
    # that merely mention feed-ish words, then the same two passes over
    # off-site links.
    outfeeds = await check_links(filter(is_feed_link, local_links))
    if len(outfeeds) > 0:
        return outfeeds
    outfeeds = await check_links(filter(is_XML_related_link, local_links))
    if len(outfeeds) > 0:
        return outfeeds
    outfeeds = await check_links(filter(is_feed_link, remote_links))
    if len(outfeeds) > 0:
        return outfeeds
    outfeeds = await check_links(filter(is_XML_related_link, remote_links))
    if len(outfeeds) > 0:
        return outfeeds

    LOG.debug("no A tags, guessing")
    suffixes = [  # filenames used by popular software:
        "atom.xml",  # blogger, TypePad
        "index.atom",  # MT, apparently
        "index.rdf",  # MT
        "rss.xml",  # Dave Winer/Manila
        "index.xml",  # MT
        "index.rss",  # Slash
        "feed",  # catandgirl.com and sometimes others.
    ]
    outfeeds = await check_links([urllib.parse.urljoin(meta.url, x) for x in suffixes])
    return outfeeds
|