Persist URL dedup cache to SQLite (bounded, ~KB); keep video blobs in memory

Switch storage backend from in-memory to SQLite (silences the library warning).
recent_urls (the edit-redelivery dedup map) now survives restarts: it's tiny and
TTL-bounded, stored via the library KV store, with wall-clock timestamps and
prune-on-load. last_video is deliberately NOT persisted — each value is a base64
video up to ~133 MB, so persisting it would bloat the DB; it stays in memory.
DB lives at bot-state.db (gitignored).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-15 19:26:12 -04:00
parent c7eb101ff0
commit fd0da1a59e
2 changed files with 57 additions and 3 deletions
Vendored
+4
View File
@@ -8,6 +8,10 @@ signal-cli-data.bak.*
# Local service file (contains your phone number) — use signal-bot.service.example
signal-bot.service
# Runtime state (SQLite dedup cache) — created next to bot.py at runtime
bot-state.db
bot-state.db-*
# Python
venv/
__pycache__/
+53 -3
View File
@@ -30,6 +30,7 @@ MAX_CLIP_DURATION = 600 # ceiling for a user-supplied /clip override
MAX_CONCURRENT_JOBS = 2
YTDLP = os.path.join(os.path.dirname(os.path.abspath(__file__)), "venv", "bin", "yt-dlp")
COOKIES = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cookies.txt")
STATE_DB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "bot-state.db")
ADMIN_NUMBERS = {n.strip() for n in os.environ.get("BOT_ADMINS", "").split(",") if n.strip()}
logging.basicConfig(
@@ -41,12 +42,16 @@ log = logging.getLogger("signal-bot")
# (group_id, sender) -> {"b64": ..., "time": monotonic}. Keyed per sender so a
# stranger's later video doesn't silently become the target of your /speed or
# /rev; _get_video falls back to the group's latest video when you have none.
# Intentionally in-memory ONLY: each value is a base64 video up to ~133 MB, so
# persisting these to SQLite would bloat the DB. Only the dedup map below persists.
last_video = {}
VIDEO_TTL = 3600 # 1 hour
# (group_id, url) -> monotonic time the URL was successfully handled.
# (group_id, url) -> wall-clock epoch seconds the URL was successfully handled.
# When a user edits a message, signal-cli redelivers it as MessageType.EDIT_MESSAGE
# with the same text — without this guard the bot re-downloads and re-posts the video.
# Persisted to SQLite so a restart doesn't forget recent dedup; tiny and TTL-bounded.
# Wall-clock (not monotonic) so the stored timestamps stay meaningful across restarts.
recent_urls = {}
RECENT_URL_TTL = 600 # 10 min
@@ -57,6 +62,11 @@ _inflight = set()
# Lazily created so it binds to the bot's running event loop, not import time.
_job_semaphore = None
# signalbot SQLite key/value store (set in main()); persists recent_urls only.
# Stays None until main() assigns it; the load/persist helpers return early
# while it is None.
_storage = None
# Single key under which the whole serialized dedup map is saved and read back.
_RECENT_URLS_KEY = "recent_urls"
# "\x1f" is the ASCII unit-separator control character.
# NOTE(review): assumes it never occurs inside a group id — confirm.
_RECENT_SEP = "\x1f" # joins (group, url) into one JSON-safe storage key
def _get_job_semaphore() -> asyncio.Semaphore:
global _job_semaphore
@@ -92,11 +102,43 @@ def _get_video(group_id, sender):
def _sweep_recent_urls() -> None:
    """Drop dedup entries older than RECENT_URL_TTL seconds.

    Values in recent_urls are wall-clock epoch seconds, so the age is computed
    from time.time(); a monotonic reading is on an unrelated clock and must not
    be mixed in.
    """
    now = time.time()
    # Collect the expired keys first: deleting while iterating the dict itself
    # would raise RuntimeError.
    expired = [
        key
        for key, handled_at in recent_urls.items()
        if now - handled_at > RECENT_URL_TTL
    ]
    for key in expired:
        del recent_urls[key]
def _load_recent_urls() -> None:
    """Restore the (pruned) dedup map from SQLite on startup.

    Reads the mapping saved under _RECENT_URLS_KEY, discards entries that are
    malformed or older than RECENT_URL_TTL, and copies the rest into
    recent_urls keyed by (group_id, url). Does nothing when no storage backend
    has been configured. Storage failures and unusable payloads are logged and
    ignored so a bad state file never prevents the bot from starting.
    """
    if _storage is None:
        return
    try:
        if not _storage.exists(_RECENT_URLS_KEY):
            return
        data = _storage.read(_RECENT_URLS_KEY) or {}
    except Exception as e:  # noqa: BLE001
        log.warning("Could not load dedup state: %s", e)
        return
    # Anything other than a mapping cannot be iterated with .items(); treat it
    # as corrupt state rather than crashing at startup.
    if not isinstance(data, dict):
        log.warning(
            "Could not load dedup state: unexpected payload type %s",
            type(data).__name__,
        )
        return
    now = time.time()
    for joined, ts in data.items():
        if not isinstance(joined, str):
            continue
        if not isinstance(ts, (int, float)) or now - ts > RECENT_URL_TTL:
            continue
        group_id, sep, url = joined.partition(_RECENT_SEP)
        # No separator means the key was not written by _persist_recent_urls;
        # restoring it would create a bogus (key, "") entry.
        if not sep:
            continue
        recent_urls[(group_id, url)] = ts
    log.info("Restored %d dedup entries from %s", len(recent_urls), STATE_DB)
def _persist_recent_urls() -> None:
    """Write the current dedup map to the key/value store, best-effort.

    Each (group_id, url) tuple key is flattened into a single string joined by
    _RECENT_SEP so the mapping can be stored. Does nothing when no storage
    backend is configured; any failure is logged as a warning and swallowed.
    """
    if _storage is None:
        return
    try:
        # Built inside the try so a serialisation problem is also best-effort.
        payload = {}
        for (group_id, url), handled_at in recent_urls.items():
            payload[f"{group_id}{_RECENT_SEP}{url}"] = handled_at
        _storage.save(_RECENT_URLS_KEY, payload)
    except Exception as e:  # noqa: BLE001
        log.warning("Could not persist dedup state: %s", e)
def _url_recently_handled(group_id, url) -> bool:
    """Return True if url still has an unexpired dedup entry for group_id.

    Expired entries are swept first, so membership alone means "recent".
    """
    _sweep_recent_urls()
    lookup_key = (group_id, url)
    return lookup_key in recent_urls
@@ -108,7 +150,8 @@ def _url_busy(group_id, url) -> bool:
def _mark_url_handled(group_id, url) -> None:
    """Record that url was just handled in group_id and persist the dedup map.

    Expired entries are swept before the new one is added, so the map that gets
    persisted contains only live entries.
    """
    _sweep_recent_urls()
    # Wall-clock epoch seconds (not monotonic) so the stored timestamp is still
    # comparable after a restart.
    recent_urls[(group_id, url)] = time.time()
    _persist_recent_urls()
async def _safe_reply(c: Context, text: str) -> None:
@@ -796,6 +839,8 @@ class CookiesCommand(Command):
def main():
global _storage
phone_number = os.environ.get("SIGNAL_PHONE_NUMBER")
signal_service = os.environ.get("SIGNAL_SERVICE", "127.0.0.1:8080")
@@ -810,7 +855,12 @@ def main():
# Don't let the library download+base64 every attachment of every message
# on the producer loop. VideoTracker fetches only video attachments lazily.
"download_attachments": False,
# Tiny SQLite KV (a few KB): persists only the URL dedup map across
# restarts. The big last-video blobs stay in memory by design.
"storage": {"type": "sqlite", "sqlite_db": STATE_DB},
})
_storage = bot.storage
_load_recent_urls()
bot.register(VideoTracker(), contacts=False, groups=True)
bot.register(VideoCommand(), contacts=False, groups=True)
bot.register(ReverseCommand(), contacts=False, groups=True)