diff --git a/cry/feed.py b/cry/feed.py index f736aae..2af594d 100644 --- a/cry/feed.py +++ b/cry/feed.py @@ -340,17 +340,29 @@ async def fetch_feed(meta: FeedMeta) -> typing.Tuple[str | Feed | None, FeedMeta LOG.error(f"{meta.url} failed for too long, giving up") return (None, dataclasses.replace(meta, status=FEED_STATUS_DEAD)) - if response and response.is_permanent_redirect: - # Permanent redirect, update the stored URL, but mark this as a - # successful fetch. + if response: + # Check for permanent redirects and handle them properly. Note that + # requests is kinda dumb when it comes to permanent redirects: we + # have to slog through the history itself when it comes to the + # redirects, and we have to note the URL of the request *after* the + # permanent redirect in order to get the right one. # - # TODO: Is this actually the right URL to store? We need the last - # permanently redirected URL, not just whatever the last thing - # is... e.g. imagine a permanent followed by a temporary - # redirect, then what? - LOG.info(f"{meta.url} permanently redirected to {response.url}") - assert response.url is not None - meta = dataclasses.replace(meta, url=response.url) + new_url = None + + history = list(response.history) + history.append(response) + history.reverse() + + last_url = response.url + for h in history: + if h.is_permanent_redirect: + new_url = last_url + break + last_url = h.url + + if new_url is not None: + LOG.info(f"{meta.url} permanently redirected to {new_url}") + meta = dataclasses.replace(meta, url=new_url) # TODO: Handle that bogus non-HTTP redirect that feedfinder uses. diff --git a/tests/test_feed.py b/tests/test_feed.py new file mode 100644 index 0000000..235eaf1 --- /dev/null +++ b/tests/test_feed.py @@ -0,0 +1,177 @@ +import asyncio +import dataclasses +import http.server +import threading +import typing + +import requests + +from cry import feed + + +@dataclasses.dataclass +class MockResponse: + code: int = 200 + headers: list[typing.Tuple[str, str]] | None = None + content: bytes = b"" + + def write_to(self, handler: http.server.BaseHTTPRequestHandler): + handler.send_response(self.code) + if self.headers is not None: + for name, value in self.headers: + handler.send_header(name, value) + handler.send_header("content-length", str(len(self.content))) + handler.end_headers() + handler.wfile.write(self.content) + + +class TestWebServer: + __test__ = False + server_port: int + http_server: http.server.ThreadingHTTPServer + server_thread: threading.Thread + + handlers: dict[str, list[MockResponse]] + + def __init__(self): + server = self + + class TestWebHandler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + server._handle_GET(self) + + self.http_server = http.server.ThreadingHTTPServer(("", 0), TestWebHandler) + self.server_thread = threading.Thread(target=self.http_server.serve_forever) + self.server_thread.daemon = True + self.handlers = {} + + self.server_port = self.http_server.server_port + + def make_url(self, path: str) -> str: + return f"http://localhost:{self.server_port}{path}" + + def handle( + self, + path: str, + content: bytes = b"", + code: int = 200, + content_type: str | None = None, + headers: list[typing.Tuple[str, str]] | None = None, + ): + if headers is None: + headers = [] + else: + headers = list(headers) + + if content_type is not None: + headers.append(("content-type", content_type)) + + self.respond(path, MockResponse(code, headers, content)) + + def respond(self, path: str, response: MockResponse): + response_list = self.handlers.get(path) + if response_list is None: + response_list = [] + self.handlers[path] = response_list + + response_list.append(response) + + def _handle_GET(self, handler: http.server.BaseHTTPRequestHandler): + responses = self.handlers.get(handler.path, []) + + if len(responses) > 0: + response = responses[0] + if len(responses) > 1: + responses.pop(0) + else: + response = MockResponse(code=404) + + response.write_to(handler) + + def __enter__(self): + self.server_thread.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + del exc_type + del exc_value + del traceback + self.http_server.shutdown() + + +TEST_FEED = b""" + + + + Smallest + + An Item + http://example.com + tag:d0ty.me,2024-01-01:smallest:one + + + +""" + + +def test_basic_successful_fetch(): + with TestWebServer() as server: + server.handle("/", TEST_FEED, content_type="text/xml") + + meta = feed.FeedMeta.from_url(server.make_url("/"), "asdf") + result, new_meta = asyncio.run(feed.fetch_feed(meta)) + + assert new_meta.url == meta.url + assert isinstance(result, feed.Feed) + assert len(result.entries) == 1 + assert result.title == "Smallest" + + +def test_fetch_after_temp_redirect(): + with TestWebServer() as server: + server.handle("/old", code=307, headers=[("location", "/temp")]) + server.handle("/temp", TEST_FEED, content_type="text/xml") + + meta = feed.FeedMeta.from_url(server.make_url("/old"), "asdf") + result, new_meta = asyncio.run(feed.fetch_feed(meta)) + assert new_meta.url == meta.url + assert isinstance(result, feed.Feed) + + +def test_fetch_after_permanent_redirect(): + with TestWebServer() as server: + server.handle("/old", code=308, headers=[("location", "/perm")]) + server.handle("/perm", TEST_FEED, content_type="text/xml") + + meta = feed.FeedMeta.from_url(server.make_url("/old"), "asdf") + result, new_meta = asyncio.run(feed.fetch_feed(meta)) + assert new_meta.url == server.make_url("/perm") + assert isinstance(result, feed.Feed) + + +def test_fetch_after_permanent_to_temporary_redirect(): + with TestWebServer() as server: + server.handle("/old", code=308, headers=[("location", "/perm")]) + server.handle("/perm", code=307, headers=[("location", "/temp")]) + server.handle("/temp", TEST_FEED, content_type="text/xml") + + meta = feed.FeedMeta.from_url(server.make_url("/old"), "asdf") + result, new_meta = asyncio.run(feed.fetch_feed(meta)) + + # NOTE: we should record the PERMANENT redirect, not the temporary one. + assert new_meta.url == server.make_url("/perm") + assert isinstance(result, feed.Feed) + + +def test_fetch_after_permanent_to_permanent_redirect(): + with TestWebServer() as server: + server.handle("/old", code=308, headers=[("location", "/one")]) + server.handle("/one", code=308, headers=[("location", "/two")]) + server.handle("/two", TEST_FEED, content_type="text/xml") + + meta = feed.FeedMeta.from_url(server.make_url("/old"), "asdf") + result, new_meta = asyncio.run(feed.fetch_feed(meta)) + + # NOTE: we should record the latest redirect. + assert new_meta.url == server.make_url("/two") + assert isinstance(result, feed.Feed)