# cry/cry/database.py
import logging
import pathlib
import random
import socket
import sqlite3
import string
import time
import typing
import platformdirs
from . import feed
LOG = logging.getLogger(__name__)
SCHEMA_STATEMENTS = [
"""
CREATE TABLE feeds (
url VARCHAR NOT NULL PRIMARY KEY,
last_fetched_ts INTEGER NOT NULL,
retry_after_ts INTEGER NOT NULL,
status INTEGER NOT NULL,
etag VARCHAR,
modified VARCHAR,
title VARCHAR,
link VARCHAR
);
CREATE TABLE entries(
id VARCHAR NOT NULL,
inserted_at INTEGER NOT NULL,
feed_url VARCHAR NOT NULL,
title VARCHAR,
link VARCHAR,
PRIMARY KEY (id, feed_url),
FOREIGN KEY (feed_url) REFERENCES feeds(url)
ON UPDATE CASCADE
ON DELETE CASCADE
);
""",
# I went and changed the status enum to make ALIVE == 0 when I added the
# "unsubscribed" status. I should probably make these strings huh.
"""
UPDATE feeds
SET status=CASE
WHEN status = 0 THEN 1
WHEN status = 1 THEN 0
ELSE status
END
""",
# The "clock" is a number that increments as we make changes. We use this
# to do reconciliation, and track which versions of other databases we
# have reconciled already.
"""
INSERT INTO properties (name, value) VALUES ('clock', 1);
CREATE TRIGGER update_clock_on_feed_insert
AFTER INSERT ON feeds
BEGIN
UPDATE properties SET value=value + 1 WHERE name='clock';
END;
CREATE TRIGGER update_clock_on_feed_delete
AFTER DELETE ON feeds
BEGIN
UPDATE properties SET value=value + 1 WHERE name='clock';
END;
CREATE TRIGGER update_clock_on_feed_update
AFTER UPDATE ON feeds
WHEN (NEW.last_fetched_ts IS NOT OLD.last_fetched_ts)
OR (NEW.retry_after_ts IS NOT OLD.retry_after_ts)
OR (NEW.status IS NOT OLD.status)
OR (NEW.etag IS NOT OLD.etag)
OR (NEW.modified IS NOT OLD.modified)
OR (NEW.title IS NOT OLD.title)
OR (NEW.link IS NOT OLD.link)
BEGIN
UPDATE properties SET value=value + 1 WHERE name='clock';
END;
CREATE TRIGGER update_clock_on_entries_insert
AFTER INSERT ON entries
BEGIN
UPDATE properties SET value=value + 1 WHERE name='clock';
END;
CREATE TRIGGER update_clock_on_entries_delete
AFTER DELETE ON entries
BEGIN
UPDATE properties SET value=value + 1 WHERE name='clock';
END;
CREATE TRIGGER update_clock_on_entries_update
AFTER UPDATE ON entries
WHEN (NEW.id IS NOT OLD.id)
OR (NEW.inserted_at IS NOT OLD.inserted_at)
OR (NEW.feed_url IS NOT OLD.feed_url)
OR (NEW.title IS NOT OLD.title)
OR (NEW.link IS NOT OLD.link)
BEGIN
UPDATE properties SET value=value + 1 WHERE name='clock';
END;
""",
"""
CREATE TABLE sync_status (
origin VARCHAR NOT NULL PRIMARY KEY,
clock INT NOT NULL
);
""",
]
def origin_path() -> pathlib.Path:
    """Path of the file that caches this machine's origin identifier."""
    data_dir = platformdirs.user_data_path("cry", "cry")
    return data_dir / "origin"
def local_origin(path: pathlib.Path | None = None) -> str:
if path is None:
path = origin_path()
if path.exists():
with open(path, "r", encoding="utf-8") as f:
return f.read().strip()
host = socket.gethostname()
slug = "".join(
random.choices(
string.ascii_uppercase + string.ascii_lowercase + string.digits, k=8
)
)
origin = f"{host}-{slug}"
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", encoding="utf-8") as f:
f.write(origin)
return origin
def databases_directory() -> pathlib.Path:
    """Shared (Dropbox-synced) directory holding every instance's database."""
    return pathlib.Path.home().joinpath("Dropbox", "cry")
def database_path(origin: str) -> pathlib.Path:
    """Database file for *origin* inside the shared databases directory."""
    filename = f"{origin}.db"
    return databases_directory() / filename
# TODO: Refactor into:
# -top level: transactions
# -bottom level: queries
# to enable reuse
class Database:
    """One cry sqlite database: feed/entry storage plus sync bookkeeping.

    Public methods open a transaction (``with self.db``) and delegate to
    ``_``-prefixed helpers, which assume a transaction is already open.
    """

    # Open connection; autocommit is disabled in __init__ so ``with self.db``
    # delimits explicit transactions.
    db: sqlite3.Connection
    # Identifier of the instance that owns this database file.
    origin: str

    def __init__(self, path: pathlib.Path | str, origin: str, readonly: bool = False):
        """Open the sqlite file at *path*, creating parent dirs for Paths.

        Path arguments are converted to ``file:`` URIs so that *readonly*
        can be requested with ``?mode=ro``.
        NOTE(review): when *path* is a plain str and readonly=True, ``uri``
        stays False and sqlite treats "?mode=ro" as part of the literal
        file name -- confirm no caller combines those.
        """
        uri = False
        if not isinstance(path, str):
            path.parent.mkdir(parents=True, exist_ok=True)
            path = f"file:{str(path)}"
            uri = True
        if readonly:
            path = f"{path}?mode=ro"
        # Enable autocommit as a separate step so that I can enable foreign
        # keys cleanly. (Can't enable foreign keys in a transaction.)
        db = sqlite3.connect(str(path), uri=uri)
        db.execute("PRAGMA foreign_keys = ON")
        db.autocommit = False  # manual-transaction mode (Python 3.12+ API)
        # Verify the pragma actually took effect before trusting cascades.
        cursor = db.execute("PRAGMA foreign_keys")
        rows = cursor.fetchall()
        assert str(rows[0][0]) == "1", f"Foreign keys not enabled! {rows[0][0]}"
        self.db = db
        self.origin = origin

    @classmethod
    def local(cls, origin: str | None = None) -> "Database":
        """Open (creating and migrating if needed) this machine's own DB."""
        if origin is None:
            origin = local_origin()
        db = Database(database_path(origin), origin)
        db.ensure_database_schema()
        db.set_property("origin", origin)
        return db

    @classmethod
    def from_file(cls, path: pathlib.Path) -> "Database":
        """Open another instance's database read-only.

        The origin is taken from the stored "origin" property.

        Raises:
            Exception: if the file has no "origin" property.
        """
        db = Database(path, "", readonly=True)
        origin = db.get_property("origin")
        if origin is None:
            raise Exception("No origin!")
        db.origin = str(origin)
        return db

    def get_property(self, prop: str, default=None) -> typing.Any:
        """Read one value from the properties table (own transaction)."""
        with self.db:
            return self._get_property(prop, default)

    def set_property(self, prop: str, value) -> None:
        """Upsert one value into the properties table (own transaction)."""
        with self.db:
            return self._set_property(prop, value)

    def get_clock(self) -> int:
        """Current change counter; bumped by triggers on every mutation."""
        return int(self.get_property("clock", 0))

    def ensure_database_schema(self) -> None:
        """Create the properties table and apply any unapplied migrations.

        The "version" property records how many SCHEMA_STATEMENTS scripts
        have already run, so only the tail of the list is executed.
        """
        with self.db:
            self.db.execute(
                """
                CREATE TABLE IF NOT EXISTS properties (
                    name VARCHAR NOT NULL PRIMARY KEY,
                    value VARCHAR NOT NULL
                )
                """
            )
            version = int(self._get_property("version", 0))
            for script in SCHEMA_STATEMENTS[version:]:
                try:
                    self.db.executescript(script)
                except Exception as e:
                    # Include the failing script; sqlite errors alone don't
                    # say which migration broke.
                    raise Exception(f"Error executing:\n{script}") from e
            self._set_property("version", len(SCHEMA_STATEMENTS))
            self._set_property("origin", self.origin)

    def load_all_meta(self) -> list[feed.FeedMeta]:
        """Load fetch metadata for every feed (including unsubscribed)."""
        with self.db:
            cursor = self.db.execute(
                """
                SELECT
                    url,
                    last_fetched_ts,
                    retry_after_ts,
                    status,
                    etag,
                    modified
                FROM feeds
                """
            )
            rows = cursor.fetchall()
            return [
                feed.FeedMeta(
                    url=url,
                    last_fetched_ts=int(last_fetched_ts),
                    retry_after_ts=int(retry_after_ts),
                    status=int(status),
                    etag=etag,
                    modified=modified,
                )
                for url, last_fetched_ts, retry_after_ts, status, etag, modified in rows
            ]

    def load_all(self, feed_limit: int = 20, pattern: str = "") -> list[feed.Feed]:
        """Load subscribed feeds (title or link matching *pattern*) with up
        to *feed_limit* newest entries each (0 means no entries).
        """
        with self.db:
            # Escape LIKE wildcards so *pattern* is matched literally;
            # '\' is declared as the escape char in the query below.
            pattern = (
                pattern.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
            )
            sql_pattern = f"%{pattern}%"
            cursor = self.db.execute(
                """
                SELECT
                    url,
                    last_fetched_ts,
                    retry_after_ts,
                    status,
                    etag,
                    modified,
                    title,
                    link
                FROM feeds
                WHERE (title LIKE :sql_pattern ESCAPE '\\'
                    OR link LIKE :sql_pattern ESCAPE '\\')
                AND status != 2 -- UNSUBSCRIBED
                """,
                {"sql_pattern": sql_pattern},
            )
            rows = cursor.fetchall()
            # Materialize the feed rows first, then fetch entries per feed
            # on the same (reused) connection.
            almost_feeds = []
            for row in rows:
                (
                    url,
                    last_fetched_ts,
                    retry_after_ts,
                    status,
                    etag,
                    modified,
                    title,
                    link,
                ) = row
                meta = feed.FeedMeta(
                    url=url,
                    last_fetched_ts=int(last_fetched_ts),
                    retry_after_ts=int(retry_after_ts),
                    status=int(status),
                    etag=etag,
                    modified=modified,
                )
                almost_feeds.append((meta, title, link))
            feeds = []
            for meta, title, link in almost_feeds:
                if feed_limit > 0:
                    cursor = self.db.execute(
                        """
                        SELECT
                            id,
                            inserted_at,
                            title,
                            link
                        FROM entries
                        WHERE feed_url=?
                        ORDER BY inserted_at DESC
                        LIMIT ?
                        """,
                        [meta.url, feed_limit],
                    )
                    rows = cursor.fetchall()
                else:
                    rows = []
                entries = [
                    feed.Entry(id=id, inserted_at=inserted_at, title=title, link=link)
                    for id, inserted_at, title, link in rows
                ]
                f = feed.Feed(meta=meta, title=title, link=link, entries=entries)
                feeds.append(f)
            return feeds

    def load_meta(self, url: str) -> feed.FeedMeta | None:
        """Load fetch metadata for one feed, or None if unknown."""
        with self.db:
            return self._load_meta(url)

    def update_meta(self, f: feed.FeedMeta) -> None:
        """Overwrite the stored fetch metadata for f.url (no-op if absent)."""
        with self.db:
            self.db.execute(
                """
                UPDATE feeds SET
                    last_fetched_ts=?,
                    retry_after_ts=?,
                    status=?,
                    etag=?,
                    modified=?
                WHERE url=?
                """,
                [
                    f.last_fetched_ts,
                    f.retry_after_ts,
                    f.status,
                    f.etag,
                    f.modified,
                    f.url,
                ],
            )

    def store_feed(self, f: feed.Feed) -> int:
        """Store the given feed in the database.

        Returns the number of new entries inserted.
        """
        with self.db:
            self._insert_feed(f.meta, f.title, f.link)
            return self._insert_entries(f.meta.url, f.entries)

    def update_feed_status(self, meta: feed.FeedMeta, status: int) -> int:
        """Set the feed's status; returns the number of rows updated."""
        with self.db:
            return self._update_feed_status(meta, status)

    def redirect_feed(self, old_url: str, new_url: str) -> None:
        """Handle a permanent redirect from *old_url* to *new_url*.

        If *new_url* is unknown, the feed row is simply renamed (entries
        follow via ON UPDATE CASCADE). Otherwise entries are moved where
        possible and the old feed is marked unsubscribed.
        """
        with self.db:
            cursor = self.db.execute(
                "SELECT COUNT(1) FROM feeds WHERE url=?", [new_url]
            )
            row = cursor.fetchone()
            if row[0] == 0:
                self.db.execute(
                    "UPDATE feeds SET url = ? WHERE url = ?", [new_url, old_url]
                )
            else:
                # First update all the entries that you can with the old url.
                # OR IGNORE skips entries whose (id, new_url) already exists.
                self.db.execute(
                    """
                    UPDATE OR IGNORE entries
                    SET feed_url = ?
                    WHERE feed_url = ?
                    """,
                    [new_url, old_url],
                )
                # TODO: It is expensive and not worth it to try to load and
                #       re-insert all the old stuff so I'm not going to
                #       bother.
                # Mark the old feed unsubscribed.
                # TODO: Rebuild with helpers
                self.db.execute(
                    """
                    UPDATE feeds
                    SET status = ?,
                        last_fetched_ts = ?
                    WHERE url = ?
                    """,
                    [feed.FEED_STATUS_UNSUBSCRIBED, int(time.time()), old_url],
                )

    def get_sync_clock(self, origin: str) -> int | None:
        """Clock value we last reconciled for *origin*, or None if never."""
        with self.db:
            cursor = self.db.execute(
                "SELECT clock FROM sync_status WHERE origin = ?",
                [origin],
            )
            row = cursor.fetchone()
            if row is None:
                return None
            return int(row[0])

    def set_sync_clock(self, origin: str, clock: int) -> None:
        """Record that we have reconciled *origin* up to *clock*."""
        with self.db:
            self.db.execute(
                """
                INSERT INTO sync_status (origin, clock)
                VALUES (?, ?)
                ON CONFLICT DO UPDATE SET clock=excluded.clock
                """,
                [origin, clock],
            )

    def sync_from(self, other: "Database") -> None:
        """Merge every feed and its entries from *other* into this database.

        Conflict resolution lives in the UPSERTs of _insert_feed (latest
        fetch wins) and _insert_entries (earliest insert wins).
        """
        with self.db:
            with other.db:
                feed_cursor = other.db.execute(
                    """
                    SELECT
                        url,
                        last_fetched_ts,
                        retry_after_ts,
                        status,
                        etag,
                        modified,
                        title,
                        link
                    FROM feeds
                    """
                )
                for row in feed_cursor:
                    (
                        url,
                        last_fetched_ts,
                        retry_after_ts,
                        status,
                        etag,
                        modified,
                        title,
                        link,
                    ) = row
                    meta = feed.FeedMeta(
                        url=url,
                        last_fetched_ts=int(last_fetched_ts),
                        retry_after_ts=int(retry_after_ts),
                        status=int(status),
                        etag=etag,
                        modified=modified,
                    )
                    self._insert_feed(meta, title, link)
                    entries_cursor = other.db.execute(
                        """
                        SELECT
                            id,
                            inserted_at,
                            title,
                            link
                        FROM entries
                        WHERE feed_url=?
                        """,
                        [url],
                    )
                    # NOTE(review): fetchmany() with no size reads
                    # cursor.arraysize rows (default 1) per batch -- works,
                    # but consider passing an explicit batch size.
                    entries_results = entries_cursor.fetchmany()
                    while len(entries_results) > 0:
                        self._insert_entries(
                            url,
                            [
                                feed.Entry(
                                    id=id,
                                    inserted_at=int(inserted_at),
                                    title=title,
                                    link=link,
                                )
                                for id, inserted_at, title, link in entries_results
                            ],
                        )
                        entries_results = entries_cursor.fetchmany()

    def _get_property(self, prop: str, default=None) -> typing.Any:
        """Read one property inside the caller's transaction."""
        cursor = self.db.execute("SELECT value FROM properties WHERE name=?", (prop,))
        result = cursor.fetchone()
        if result is None:
            return default
        return result[0]

    def _set_property(self, prop: str, value) -> None:
        """Upsert one property inside the caller's transaction."""
        self.db.execute(
            """
            INSERT INTO properties (name, value) VALUES (?, ?)
            ON CONFLICT DO UPDATE SET value=excluded.value
            """,
            (prop, value),
        )

    def _insert_feed(self, meta: feed.FeedMeta, title: str, link: str) -> None:
        """Insert into the feeds table, handling collisions with UPSERT.

        On conflict the timestamps take the max of both rows, and every
        other column takes the value from whichever row was fetched most
        recently (last-write-wins by last_fetched_ts).
        """
        self.db.execute(
            """
            INSERT INTO feeds (
                url,
                last_fetched_ts,
                retry_after_ts,
                status,
                etag,
                modified,
                title,
                link
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT DO UPDATE
            SET
                last_fetched_ts=MAX(last_fetched_ts, excluded.last_fetched_ts),
                retry_after_ts=MAX(retry_after_ts, excluded.retry_after_ts),
                -- For all other fields, take the value that was computed by the
                -- most recent fetch.
                status=CASE
                    WHEN last_fetched_ts > excluded.last_fetched_ts THEN status
                    ELSE excluded.status
                END,
                etag=CASE
                    WHEN last_fetched_ts > excluded.last_fetched_ts THEN etag
                    ELSE excluded.etag
                END,
                modified=CASE
                    WHEN last_fetched_ts > excluded.last_fetched_ts THEN modified
                    ELSE excluded.modified
                END,
                title=CASE
                    WHEN last_fetched_ts > excluded.last_fetched_ts THEN title
                    ELSE excluded.title
                END,
                link=CASE
                    WHEN last_fetched_ts > excluded.last_fetched_ts THEN link
                    ELSE excluded.link
                END
            """,
            [
                meta.url,
                meta.last_fetched_ts,
                meta.retry_after_ts,
                meta.status,
                meta.etag,
                meta.modified,
                title,
                link,
            ],
        )

    def _insert_entries(self, feed_url: str, entries: list[feed.Entry]) -> int:
        """Upsert *entries* for *feed_url*; returns how many rows are new
        (measured as the before/after count delta).
        """
        cursor = self.db.execute(
            "SELECT COUNT (*) FROM entries WHERE feed_url=?", [feed_url]
        )
        start_count = cursor.fetchone()[0]
        self.db.executemany(
            """
            INSERT INTO entries (
                id,
                inserted_at,
                feed_url,
                title,
                link
            ) VALUES (?, ?, ?, ?, ?)
            ON CONFLICT DO UPDATE
            SET
                -- NOTE: This is also part of the feed merge algorithm, BUT
                -- we implement it here because feeds tend to be rolling
                -- windows over some external content and we don't want
                -- to read and write the entire feed just to update the
                -- few new items. But we can't just do ON CONFLICT DO
                -- NOTHING because we *might* be storing a feed where we
                -- resolved conflicts with another instance. So we want
                -- to handle all the cases. (In theory we could make two
                -- different INSERTs to handle the two cases but that is
                -- more complexity than it is worth.)
                inserted_at=MIN(inserted_at, excluded.inserted_at),
                title=CASE
                    WHEN inserted_at < excluded.inserted_at THEN title
                    ELSE excluded.title
                END,
                link=CASE
                    WHEN inserted_at < excluded.inserted_at THEN link
                    ELSE excluded.link
                END
            """,
            [(e.id, e.inserted_at, feed_url, e.title, e.link) for e in entries],
        )
        cursor = self.db.execute(
            "SELECT COUNT (*) FROM entries WHERE feed_url=?", [feed_url]
        )
        end_count = cursor.fetchone()[0]
        return end_count - start_count

    def _update_feed_status(self, meta: feed.FeedMeta, status: int) -> int:
        """Set *status*, bumping last_fetched_ts past its previous value so
        the last-write-wins merge in _insert_feed treats this as newest.
        """
        new_ts = max(int(time.time()), meta.last_fetched_ts + 1)
        cursor = self.db.execute(
            """
            UPDATE feeds
            SET status = ?,
                last_fetched_ts = ?
            WHERE url = ?
            """,
            [status, new_ts, meta.url],
        )
        return cursor.rowcount

    def _load_meta(self, url: str) -> feed.FeedMeta | None:
        """Load metadata for *url* inside the caller's transaction."""
        cursor = self.db.execute(
            """
            SELECT
                last_fetched_ts,
                retry_after_ts,
                status,
                etag,
                modified
            FROM feeds
            WHERE url=?
            """,
            [url],
        )
        row = cursor.fetchone()
        if row is None:
            return None
        last_fetched_ts, retry_after_ts, status, etag, modified = row
        return feed.FeedMeta(
            url=url,
            last_fetched_ts=int(last_fetched_ts),
            retry_after_ts=int(retry_after_ts),
            status=int(status),
            etag=etag,
            modified=modified,
        )
def sync(local_db: Database):
    """Reconcile *local_db* against every other database in the shared dir.

    A peer is skipped when it is our own database, its schema version
    differs from ours, or its change clock has not moved since we last
    reconciled it. Errors on one peer are logged and do not abort the pass.
    """
    local_version = local_db.get_property("version", 0)
    for p in databases_directory().glob("*.db"):
        if not p.is_file():
            continue
        try:
            other_db = Database.from_file(p)
            if local_db.origin == other_db.origin:
                continue
            # Ensure the schema version is compatible so that we don't run
            # into trouble trying to query the other database.
            other_version = other_db.get_property("version", 0)
            if other_version != local_version:
                # Logger.warn is deprecated; use warning() with lazy % args.
                LOG.warning(
                    "%s: Not reconciling version %s against %s",
                    other_db.origin,
                    other_version,
                    local_version,
                )
                continue
            # Check to see if we've already reconciled this other database.
            other_clock = other_db.get_clock()
            reconciled_clock = local_db.get_sync_clock(other_db.origin)
            if other_clock == reconciled_clock:
                continue
            local_db.sync_from(other_db)
            local_db.set_sync_clock(other_db.origin, other_clock)
        except Exception:
            # Deliberately best-effort: one corrupt/locked peer file must not
            # stop the whole sync. exception() keeps the traceback that the
            # old f-string LOG.error message dropped.
            LOG.exception("Error loading %s", p)