Compare commits
No commits in common. "52c12785c82204fb8c0758e8168449f9019091c7" and "cce0ad9f8fadbd43310bb949a9d075130f277dae" have entirely different histories.
52c12785c8
...
cce0ad9f8f
5 changed files with 32 additions and 90 deletions
|
|
@ -56,7 +56,7 @@ def subscribe(url, literal):
|
||||||
|
|
||||||
if not literal:
|
if not literal:
|
||||||
click.echo(f"Searching for feeds for {url} ...")
|
click.echo(f"Searching for feeds for {url} ...")
|
||||||
feeds = asyncio.run(feed.feed_search(url))
|
feeds = asyncio.run(feed.feed_search(url, db.origin))
|
||||||
if len(feeds) == 0:
|
if len(feeds) == 0:
|
||||||
click.echo(f"Unable to find a suitable feed for {url}")
|
click.echo(f"Unable to find a suitable feed for {url}")
|
||||||
return 1
|
return 1
|
||||||
|
|
@ -83,7 +83,7 @@ def subscribe(url, literal):
|
||||||
click.echo(f"Identified {result.meta.url} as a feed for {url}")
|
click.echo(f"Identified {result.meta.url} as a feed for {url}")
|
||||||
else:
|
else:
|
||||||
click.echo(f"Fetching {url} ...")
|
click.echo(f"Fetching {url} ...")
|
||||||
meta = feed.FeedMeta.from_url(url)
|
meta = feed.FeedMeta.from_url(url, db.origin)
|
||||||
d, meta = asyncio.run(feed.fetch_feed(meta))
|
d, meta = asyncio.run(feed.fetch_feed(meta))
|
||||||
if d is None:
|
if d is None:
|
||||||
click.echo(f"Unable to fetch {url}")
|
click.echo(f"Unable to fetch {url}")
|
||||||
|
|
@ -112,7 +112,7 @@ def import_opml(opml_file):
|
||||||
|
|
||||||
db = database.Database.local()
|
db = database.Database.local()
|
||||||
urls = opml.parse_opml(opml_file.read())
|
urls = opml.parse_opml(opml_file.read())
|
||||||
metas = [feed.FeedMeta.from_url(url) for url in urls]
|
metas = [feed.FeedMeta.from_url(url, db.origin) for url in urls]
|
||||||
|
|
||||||
click.echo(f"Fetching {len(urls)} feeds ...")
|
click.echo(f"Fetching {len(urls)} feeds ...")
|
||||||
results = asyncio.run(feed.fetch_many(metas))
|
results = asyncio.run(feed.fetch_many(metas))
|
||||||
|
|
|
||||||
|
|
@ -76,6 +76,7 @@ def local_origin(path: pathlib.Path | None = None) -> str:
|
||||||
|
|
||||||
|
|
||||||
def database_path(origin: str) -> pathlib.Path:
|
def database_path(origin: str) -> pathlib.Path:
|
||||||
|
# TODO: Determine the name/slug from local state if necessary
|
||||||
return pathlib.Path.home() / "Dropbox" / "cry" / f"{origin}.db"
|
return pathlib.Path.home() / "Dropbox" / "cry" / f"{origin}.db"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -167,6 +168,7 @@ class Database:
|
||||||
status=int(status),
|
status=int(status),
|
||||||
etag=etag,
|
etag=etag,
|
||||||
modified=modified,
|
modified=modified,
|
||||||
|
origin=self.origin,
|
||||||
)
|
)
|
||||||
for url, last_fetched_ts, retry_after_ts, status, etag, modified in rows
|
for url, last_fetched_ts, retry_after_ts, status, etag, modified in rows
|
||||||
]
|
]
|
||||||
|
|
@ -216,6 +218,7 @@ class Database:
|
||||||
status=status,
|
status=status,
|
||||||
etag=etag,
|
etag=etag,
|
||||||
modified=modified,
|
modified=modified,
|
||||||
|
origin=self.origin,
|
||||||
)
|
)
|
||||||
almost_feeds.append((meta, title, link))
|
almost_feeds.append((meta, title, link))
|
||||||
|
|
||||||
|
|
@ -279,6 +282,7 @@ class Database:
|
||||||
status=status,
|
status=status,
|
||||||
etag=etag,
|
etag=etag,
|
||||||
modified=modified,
|
modified=modified,
|
||||||
|
origin=self.origin,
|
||||||
)
|
)
|
||||||
|
|
||||||
cursor = self.db.execute(
|
cursor = self.db.execute(
|
||||||
|
|
|
||||||
35
cry/feed.py
35
cry/feed.py
|
|
@ -38,9 +38,10 @@ class FeedMeta:
|
||||||
status: int
|
status: int
|
||||||
etag: str | None
|
etag: str | None
|
||||||
modified: str | None
|
modified: str | None
|
||||||
|
origin: str
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_url(cls, url: str) -> "FeedMeta":
|
def from_url(cls, url: str, origin: str) -> "FeedMeta":
|
||||||
return FeedMeta(
|
return FeedMeta(
|
||||||
url=url,
|
url=url,
|
||||||
last_fetched_ts=0,
|
last_fetched_ts=0,
|
||||||
|
|
@ -48,6 +49,7 @@ class FeedMeta:
|
||||||
status=FEED_STATUS_ALIVE,
|
status=FEED_STATUS_ALIVE,
|
||||||
etag=None,
|
etag=None,
|
||||||
modified=None,
|
modified=None,
|
||||||
|
origin=origin,
|
||||||
)
|
)
|
||||||
|
|
||||||
def should_fetch(self, now) -> bool:
|
def should_fetch(self, now) -> bool:
|
||||||
|
|
@ -145,7 +147,6 @@ class Feed:
|
||||||
link = None
|
link = None
|
||||||
|
|
||||||
if d.feed is not None:
|
if d.feed is not None:
|
||||||
assert not isinstance(d.feed, list)
|
|
||||||
title = d.feed.get("title")
|
title = d.feed.get("title")
|
||||||
link = d.feed.get("link")
|
link = d.feed.get("link")
|
||||||
|
|
||||||
|
|
@ -427,7 +428,7 @@ async def fetch_many(
|
||||||
return [t.result() for t in tasks]
|
return [t.result() for t in tasks]
|
||||||
|
|
||||||
|
|
||||||
def merge_feeds(a: Feed, a_origin: str, b: Feed, b_origin: str) -> Feed:
|
def merge_feeds(a: Feed, b: Feed) -> Feed:
|
||||||
"""Merge two known feeds. There are two conflict resolution policies:
|
"""Merge two known feeds. There are two conflict resolution policies:
|
||||||
|
|
||||||
1. The newer fetch of feed metadata wins.
|
1. The newer fetch of feed metadata wins.
|
||||||
|
|
@ -448,7 +449,7 @@ def merge_feeds(a: Feed, a_origin: str, b: Feed, b_origin: str) -> Feed:
|
||||||
if a.meta.last_fetched_ts > b.meta.last_fetched_ts:
|
if a.meta.last_fetched_ts > b.meta.last_fetched_ts:
|
||||||
source_feed = a
|
source_feed = a
|
||||||
elif a.meta.last_fetched_ts == b.meta.last_fetched_ts:
|
elif a.meta.last_fetched_ts == b.meta.last_fetched_ts:
|
||||||
source_feed = a if a_origin < b_origin else b
|
source_feed = a if a.meta.origin < b.meta.origin else b
|
||||||
else:
|
else:
|
||||||
source_feed = b
|
source_feed = b
|
||||||
|
|
||||||
|
|
@ -568,11 +569,11 @@ def is_XML_related_link(link: str) -> bool:
|
||||||
return "rss" in link or "rdf" in link or "xml" in link or "atom" in link
|
return "rss" in link or "rdf" in link or "xml" in link or "atom" in link
|
||||||
|
|
||||||
|
|
||||||
async def check_feed(url: str) -> Feed | None:
|
async def check_feed(url: str, origin: str) -> Feed | None:
|
||||||
"""Check to see if the given URL is a feed. If it is, return the feed,
|
"""Check to see if the given URL is a feed. If it is, return the feed,
|
||||||
otherwise return None.
|
otherwise return None.
|
||||||
"""
|
"""
|
||||||
meta = FeedMeta.from_url(url)
|
meta = FeedMeta.from_url(url, origin)
|
||||||
result, meta = await fetch_feed(meta)
|
result, meta = await fetch_feed(meta)
|
||||||
if isinstance(result, Feed):
|
if isinstance(result, Feed):
|
||||||
return result
|
return result
|
||||||
|
|
@ -580,13 +581,13 @@ async def check_feed(url: str) -> Feed | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
async def check_links(links: typing.Iterable[str]) -> list[Feed]:
|
async def check_links(links: typing.Iterable[str], origin: str) -> list[Feed]:
|
||||||
"""Fetch all the links and return the ones that appear to have feeds in
|
"""Fetch all the links and return the ones that appear to have feeds in
|
||||||
them. If none of them are fetchable or none of them have feeds then this
|
them. If none of them are fetchable or none of them have feeds then this
|
||||||
will return nothing.
|
will return nothing.
|
||||||
"""
|
"""
|
||||||
async with asyncio.TaskGroup() as group:
|
async with asyncio.TaskGroup() as group:
|
||||||
tasks = [group.create_task(check_feed(link)) for link in links]
|
tasks = [group.create_task(check_feed(link, origin)) for link in links]
|
||||||
|
|
||||||
outfeeds: list[Feed] = []
|
outfeeds: list[Feed] = []
|
||||||
for task in tasks:
|
for task in tasks:
|
||||||
|
|
@ -597,8 +598,8 @@ async def check_links(links: typing.Iterable[str]) -> list[Feed]:
|
||||||
return outfeeds
|
return outfeeds
|
||||||
|
|
||||||
|
|
||||||
async def feed_search(uri: str) -> list[Feed]:
|
async def feed_search(uri: str, origin: str) -> list[Feed]:
|
||||||
meta = FeedMeta.from_url(massage_url(uri))
|
meta = FeedMeta.from_url(massage_url(uri), origin)
|
||||||
result, meta = await fetch_feed(meta)
|
result, meta = await fetch_feed(meta)
|
||||||
if result is None:
|
if result is None:
|
||||||
return []
|
return []
|
||||||
|
|
@ -610,22 +611,22 @@ async def feed_search(uri: str) -> list[Feed]:
|
||||||
parser.feed(result)
|
parser.feed(result)
|
||||||
|
|
||||||
LOG.debug("Checking links...")
|
LOG.debug("Checking links...")
|
||||||
outfeeds = await check_links(parser.link_links)
|
outfeeds = await check_links(parser.link_links, origin)
|
||||||
if len(outfeeds) > 0:
|
if len(outfeeds) > 0:
|
||||||
return outfeeds
|
return outfeeds
|
||||||
|
|
||||||
LOG.debug("No links, checking A tags...")
|
LOG.debug("No links, checking A tags...")
|
||||||
local_links, remote_links = classify_links(parser.a_links, meta.url)
|
local_links, remote_links = classify_links(parser.a_links, meta.url)
|
||||||
outfeeds = await check_links(filter(is_feed_link, local_links))
|
outfeeds = await check_links(filter(is_feed_link, local_links), origin)
|
||||||
if len(outfeeds) > 0:
|
if len(outfeeds) > 0:
|
||||||
return outfeeds
|
return outfeeds
|
||||||
outfeeds = await check_links(filter(is_XML_related_link, local_links))
|
outfeeds = await check_links(filter(is_XML_related_link, local_links), origin)
|
||||||
if len(outfeeds) > 0:
|
if len(outfeeds) > 0:
|
||||||
return outfeeds
|
return outfeeds
|
||||||
outfeeds = await check_links(filter(is_feed_link, remote_links))
|
outfeeds = await check_links(filter(is_feed_link, remote_links), origin)
|
||||||
if len(outfeeds) > 0:
|
if len(outfeeds) > 0:
|
||||||
return outfeeds
|
return outfeeds
|
||||||
outfeeds = await check_links(filter(is_XML_related_link, remote_links))
|
outfeeds = await check_links(filter(is_XML_related_link, remote_links), origin)
|
||||||
if len(outfeeds) > 0:
|
if len(outfeeds) > 0:
|
||||||
return outfeeds
|
return outfeeds
|
||||||
|
|
||||||
|
|
@ -638,5 +639,7 @@ async def feed_search(uri: str) -> list[Feed]:
|
||||||
"index.xml", # MT
|
"index.xml", # MT
|
||||||
"index.rss", # Slash
|
"index.rss", # Slash
|
||||||
]
|
]
|
||||||
outfeeds = await check_links([urllib.parse.urljoin(meta.url, x) for x in suffixes])
|
outfeeds = await check_links(
|
||||||
|
[urllib.parse.urljoin(meta.url, x) for x in suffixes], origin
|
||||||
|
)
|
||||||
return outfeeds
|
return outfeeds
|
||||||
|
|
|
||||||
|
|
@ -1,66 +0,0 @@
|
||||||
import pathlib
|
|
||||||
import random
|
|
||||||
import string
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
from cry import database
|
|
||||||
|
|
||||||
|
|
||||||
def random_slug() -> str:
|
|
||||||
return "".join(
|
|
||||||
random.choices(
|
|
||||||
string.ascii_uppercase + string.ascii_lowercase + string.digits, k=8
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_database_origin_path():
|
|
||||||
op = database.origin_path()
|
|
||||||
assert op is not None
|
|
||||||
|
|
||||||
|
|
||||||
def test_database_local_origin():
|
|
||||||
with tempfile.TemporaryDirectory() as op:
|
|
||||||
origin_file = pathlib.Path(op) / "origin"
|
|
||||||
assert not origin_file.exists()
|
|
||||||
|
|
||||||
origin = database.local_origin(origin_file)
|
|
||||||
|
|
||||||
assert origin_file.exists()
|
|
||||||
assert len(origin) > 0
|
|
||||||
|
|
||||||
|
|
||||||
def test_database_local_origin_repeatable():
|
|
||||||
with tempfile.TemporaryDirectory() as op:
|
|
||||||
origin_file = pathlib.Path(op) / "origin"
|
|
||||||
|
|
||||||
a = database.local_origin(origin_file)
|
|
||||||
b = database.local_origin(origin_file)
|
|
||||||
|
|
||||||
assert len(a) > 0
|
|
||||||
assert a == b
|
|
||||||
|
|
||||||
|
|
||||||
def test_database_origin_in_path():
|
|
||||||
slug = random_slug()
|
|
||||||
p = database.database_path(slug)
|
|
||||||
assert slug in str(p)
|
|
||||||
|
|
||||||
|
|
||||||
def test_database_schema():
|
|
||||||
db = database.Database(":memory:", random_slug())
|
|
||||||
db.ensure_database_schema()
|
|
||||||
|
|
||||||
c = db.db.execute("SELECT value FROM properties WHERE name = 'version'")
|
|
||||||
row = c.fetchone()
|
|
||||||
assert int(row[0]) == len(database.SCHEMA_STATEMENTS)
|
|
||||||
|
|
||||||
|
|
||||||
def test_database_prop_get_set():
|
|
||||||
db = database.Database(":memory:", random_slug())
|
|
||||||
db.ensure_database_schema()
|
|
||||||
|
|
||||||
assert db.get_property("foo") is None
|
|
||||||
val = random_slug()
|
|
||||||
db.set_property("foo", val)
|
|
||||||
assert db.get_property("foo") == val
|
|
||||||
|
|
@ -4,6 +4,7 @@ import http.server
|
||||||
import threading
|
import threading
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
from cry import feed
|
from cry import feed
|
||||||
|
|
||||||
|
|
@ -117,7 +118,7 @@ def test_basic_successful_fetch():
|
||||||
with TestWebServer() as server:
|
with TestWebServer() as server:
|
||||||
server.handle("/", TEST_FEED, content_type="text/xml")
|
server.handle("/", TEST_FEED, content_type="text/xml")
|
||||||
|
|
||||||
meta = feed.FeedMeta.from_url(server.make_url("/"))
|
meta = feed.FeedMeta.from_url(server.make_url("/"), "asdf")
|
||||||
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
||||||
|
|
||||||
assert new_meta.url == meta.url
|
assert new_meta.url == meta.url
|
||||||
|
|
@ -131,7 +132,7 @@ def test_fetch_after_temp_redirect():
|
||||||
server.handle("/old", code=307, headers=[("location", "/temp")])
|
server.handle("/old", code=307, headers=[("location", "/temp")])
|
||||||
server.handle("/temp", TEST_FEED, content_type="text/xml")
|
server.handle("/temp", TEST_FEED, content_type="text/xml")
|
||||||
|
|
||||||
meta = feed.FeedMeta.from_url(server.make_url("/old"))
|
meta = feed.FeedMeta.from_url(server.make_url("/old"), "asdf")
|
||||||
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
||||||
assert new_meta.url == meta.url
|
assert new_meta.url == meta.url
|
||||||
assert isinstance(result, feed.Feed)
|
assert isinstance(result, feed.Feed)
|
||||||
|
|
@ -142,7 +143,7 @@ def test_fetch_after_permanent_redirect():
|
||||||
server.handle("/old", code=308, headers=[("location", "/perm")])
|
server.handle("/old", code=308, headers=[("location", "/perm")])
|
||||||
server.handle("/perm", TEST_FEED, content_type="text/xml")
|
server.handle("/perm", TEST_FEED, content_type="text/xml")
|
||||||
|
|
||||||
meta = feed.FeedMeta.from_url(server.make_url("/old"))
|
meta = feed.FeedMeta.from_url(server.make_url("/old"), "asdf")
|
||||||
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
||||||
assert new_meta.url == server.make_url("/perm")
|
assert new_meta.url == server.make_url("/perm")
|
||||||
assert isinstance(result, feed.Feed)
|
assert isinstance(result, feed.Feed)
|
||||||
|
|
@ -154,7 +155,7 @@ def test_fetch_after_permanent_to_temporary_redirect():
|
||||||
server.handle("/perm", code=307, headers=[("location", "/temp")])
|
server.handle("/perm", code=307, headers=[("location", "/temp")])
|
||||||
server.handle("/temp", TEST_FEED, content_type="text/xml")
|
server.handle("/temp", TEST_FEED, content_type="text/xml")
|
||||||
|
|
||||||
meta = feed.FeedMeta.from_url(server.make_url("/old"))
|
meta = feed.FeedMeta.from_url(server.make_url("/old"), "asdf")
|
||||||
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
||||||
|
|
||||||
# NOTE: we should record the PERMANENT redirect, not the temporary one.
|
# NOTE: we should record the PERMANENT redirect, not the temporary one.
|
||||||
|
|
@ -168,7 +169,7 @@ def test_fetch_after_permanent_to_permanent_redirect():
|
||||||
server.handle("/one", code=308, headers=[("location", "/two")])
|
server.handle("/one", code=308, headers=[("location", "/two")])
|
||||||
server.handle("/two", TEST_FEED, content_type="text/xml")
|
server.handle("/two", TEST_FEED, content_type="text/xml")
|
||||||
|
|
||||||
meta = feed.FeedMeta.from_url(server.make_url("/old"))
|
meta = feed.FeedMeta.from_url(server.make_url("/old"), "asdf")
|
||||||
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
result, new_meta = asyncio.run(feed.fetch_feed(meta))
|
||||||
|
|
||||||
# NOTE: we should record the latest redirect.
|
# NOTE: we should record the latest redirect.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue