From fd0da1a59e73f0f16bcba715d219bcbcdace6451 Mon Sep 17 00:00:00 2001 From: James Price Date: Mon, 15 Jun 2026 19:26:12 -0400 Subject: [PATCH] Persist URL dedup cache to SQLite (bounded, ~KB); keep video blobs in memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch storage backend from in-memory to SQLite (silences the library warning). recent_urls (the edit-redelivery dedup map) now survives restarts: it's tiny and TTL-bounded, stored via the library KV store, with wall-clock timestamps and prune-on-load. last_video is deliberately NOT persisted — each value is a base64 video up to ~133 MB, so persisting it would bloat the DB; it stays in memory. DB lives at bot-state.db (gitignored). Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 4 ++++ bot.py | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index ee491a6..f46cca6 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,10 @@ signal-cli-data.bak.* # Local service file (contains your phone number) — use signal-bot.service.example signal-bot.service +# Runtime state (SQLite dedup cache) — created next to bot.py at runtime +bot-state.db +bot-state.db-* + # Python venv/ __pycache__/ diff --git a/bot.py b/bot.py index 8fafbf6..b00e783 100644 --- a/bot.py +++ b/bot.py @@ -30,6 +30,7 @@ MAX_CLIP_DURATION = 600 # ceiling for a user-supplied /clip override MAX_CONCURRENT_JOBS = 2 YTDLP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "venv", "bin", "yt-dlp") COOKIES = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cookies.txt") +STATE_DB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "bot-state.db") ADMIN_NUMBERS = {n.strip() for n in os.environ.get("BOT_ADMINS", "").split(",") if n.strip()} logging.basicConfig( @@ -41,12 +42,16 @@ log = logging.getLogger("signal-bot") # (group_id, sender) -> {"b64": ..., "time": monotonic}. Keyed per sender so a # stranger's later video doesn't silently become the target of your /speed or # /rev; _get_video falls back to the group's latest video when you have none. +# Intentionally in-memory ONLY: each value is a base64 video up to ~133 MB, so +# persisting these to SQLite would bloat the DB. Only the dedup map below persists. last_video = {} VIDEO_TTL = 3600 # 1 hour -# (group_id, url) -> monotonic time the URL was successfully handled. +# (group_id, url) -> wall-clock epoch seconds the URL was successfully handled. # When a user edits a message, signal-cli redelivers it as MessageType.EDIT_MESSAGE # with the same text — without this guard the bot re-downloads and re-posts the video. +# Persisted to SQLite so a restart doesn't forget recent dedup; tiny and TTL-bounded. +# Wall-clock (not monotonic) so the stored timestamps stay meaningful across restarts. recent_urls = {} RECENT_URL_TTL = 600 # 10 min @@ -57,6 +62,11 @@ _inflight = set() # Lazily created so it binds to the bot's running event loop, not import time. _job_semaphore = None +# signalbot SQLite key/value store (set in main()); persists recent_urls only. +_storage = None +_RECENT_URLS_KEY = "recent_urls" +_RECENT_SEP = "\x1f" # joins (group, url) into one JSON-safe storage key + def _get_job_semaphore() -> asyncio.Semaphore: global _job_semaphore @@ -92,11 +102,43 @@ def _get_video(group_id, sender): def _sweep_recent_urls() -> None: - now = time.monotonic() + now = time.time() for key in [k for k, t in recent_urls.items() if now - t > RECENT_URL_TTL]: del recent_urls[key] +def _load_recent_urls() -> None: + """Restore the (pruned) dedup map from SQLite on startup.""" + if _storage is None: + return + try: + if not _storage.exists(_RECENT_URLS_KEY): + return + data = _storage.read(_RECENT_URLS_KEY) or {} + except Exception as e: # noqa: BLE001 + log.warning("Could not load dedup state: %s", e) + return + now = time.time() + for joined, ts in data.items(): + if not isinstance(ts, (int, float)) or now - ts > RECENT_URL_TTL: + continue + group_id, _, url = joined.partition(_RECENT_SEP) + recent_urls[(group_id, url)] = ts + log.info("Restored %d dedup entries from %s", len(recent_urls), STATE_DB) + + +def _persist_recent_urls() -> None: + if _storage is None: + return + try: + _storage.save( + _RECENT_URLS_KEY, + {f"{g}{_RECENT_SEP}{u}": ts for (g, u), ts in recent_urls.items()}, + ) + except Exception as e: # noqa: BLE001 + log.warning("Could not persist dedup state: %s", e) + + def _url_recently_handled(group_id, url) -> bool: _sweep_recent_urls() return (group_id, url) in recent_urls @@ -108,7 +150,8 @@ def _url_busy(group_id, url) -> bool: def _mark_url_handled(group_id, url) -> None: _sweep_recent_urls() - recent_urls[(group_id, url)] = time.monotonic() + recent_urls[(group_id, url)] = time.time() + _persist_recent_urls() async def _safe_reply(c: Context, text: str) -> None: @@ -796,6 +839,8 @@ class CookiesCommand(Command): def main(): + global _storage + phone_number = os.environ.get("SIGNAL_PHONE_NUMBER") signal_service = os.environ.get("SIGNAL_SERVICE", "127.0.0.1:8080") @@ -810,7 +855,12 @@ def main(): # Don't let the library download+base64 every attachment of every message # on the producer loop. VideoTracker fetches only video attachments lazily. "download_attachments": False, + # Tiny SQLite KV (a few KB): persists only the URL dedup map across + # restarts. The big last-video blobs stay in memory by design. + "storage": {"type": "sqlite", "sqlite_db": STATE_DB}, }) + _storage = bot.storage + _load_recent_urls() bot.register(VideoTracker(), contacts=False, groups=True) bot.register(VideoCommand(), contacts=False, groups=True) bot.register(ReverseCommand(), contacts=False, groups=True)