Try to solve th problem

2026-03-25 20:16:40 +00:00 · 2026-03-24 17:39:43 -07:00
parent c36a019f5d
commit d17ec6dc1f
10 changed files with 446 additions and 73 deletions
--- a/server/routers/newt/pingAccumulator.ts
+++ b/server/routers/newt/pingAccumulator.ts
@@ -0,0 +1,382 @@
+import { db } from "@server/db";
+import { sites, clients, olms } from "@server/db";
+import { eq, inArray } from "drizzle-orm";
+import logger from "@server/logger";
+
+/**
+ * Ping Accumulator
+ *
+ * Instead of writing to the database on every single newt/olm ping (which
+ * causes pool exhaustion under load, especially with cross-region latency),
+ * we accumulate pings in memory and flush them to the database periodically
+ * in a single batch.
+ *
+ * This is the same pattern used for bandwidth flushing in
+ * receiveBandwidth.ts and handleReceiveBandwidthMessage.ts.
+ *
+ * Supports two kinds of pings:
+ *   - **Site pings** (from newts): update `sites.online` and `sites.lastPing`
+ *   - **Client pings** (from OLMs): update `clients.online`, `clients.lastPing`,
+ *     `clients.archived`, and optionally reset `olms.archived`
+ */
+
+const FLUSH_INTERVAL_MS = 10_000; // Flush every 10 seconds
+const MAX_RETRIES = 2;
+const BASE_DELAY_MS = 50;
+
+// ── Site (newt) pings ──────────────────────────────────────────────────
+// Map of siteId -> latest ping timestamp (unix seconds)
+const pendingSitePings: Map<number, number> = new Map();
+
+// ── Client (OLM) pings ────────────────────────────────────────────────
+// Map of clientId -> latest ping timestamp (unix seconds)
+const pendingClientPings: Map<number, number> = new Map();
+// Set of olmIds whose `archived` flag should be reset to false
+const pendingOlmArchiveResets: Set<string> = new Set();
+
+let flushTimer: NodeJS.Timeout | null = null;
+
+// ── Public API ─────────────────────────────────────────────────────────
+
+/**
+ * Record a ping for a newt site. This does NOT write to the database
+ * immediately. Instead it stores the latest ping timestamp in memory,
+ * to be flushed periodically by the background timer.
+ */
+export function recordSitePing(siteId: number): void {
+    const now = Math.floor(Date.now() / 1000);
+    pendingSitePings.set(siteId, now);
+}
+
+/** @deprecated Use `recordSitePing` instead. Alias kept for existing call-sites. */
+export const recordPing = recordSitePing;
+
+/**
+ * Record a ping for an OLM client. Batches the `clients` table update
+ * (`online`, `lastPing`, `archived`) and, when `olmArchived` is true,
+ * also queues an `olms` table update to clear the archived flag.
+ */
+export function recordClientPing(
+    clientId: number,
+    olmId: string,
+    olmArchived: boolean
+): void {
+    const now = Math.floor(Date.now() / 1000);
+    pendingClientPings.set(clientId, now);
+    if (olmArchived) {
+        pendingOlmArchiveResets.add(olmId);
+    }
+}
+
+// ── Flush Logic ────────────────────────────────────────────────────────
+
+/**
+ * Flush all accumulated site pings to the database.
+ */
+async function flushSitePingsToDb(): Promise<void> {
+    if (pendingSitePings.size === 0) {
+        return;
+    }
+
+    // Snapshot and clear so new pings arriving during the flush go into a
+    // fresh map for the next cycle.
+    const pingsToFlush = new Map(pendingSitePings);
+    pendingSitePings.clear();
+
+    // Sort by siteId for consistent lock ordering (prevents deadlocks)
+    const sortedEntries = Array.from(pingsToFlush.entries()).sort(
+        ([a], [b]) => a - b
+    );
+
+    const BATCH_SIZE = 50;
+    for (let i = 0; i < sortedEntries.length; i += BATCH_SIZE) {
+        const batch = sortedEntries.slice(i, i + BATCH_SIZE);
+
+        try {
+            await withRetry(async () => {
+                // Group by timestamp for efficient bulk updates
+                const byTimestamp = new Map<number, number[]>();
+                for (const [siteId, timestamp] of batch) {
+                    const group = byTimestamp.get(timestamp) || [];
+                    group.push(siteId);
+                    byTimestamp.set(timestamp, group);
+                }
+
+                if (byTimestamp.size === 1) {
+                    const [timestamp, siteIds] = Array.from(
+                        byTimestamp.entries()
+                    )[0];
+                    await db
+                        .update(sites)
+                        .set({
+                            online: true,
+                            lastPing: timestamp
+                        })
+                        .where(inArray(sites.siteId, siteIds));
+                } else {
+                    await db.transaction(async (tx) => {
+                        for (const [timestamp, siteIds] of byTimestamp) {
+                            await tx
+                                .update(sites)
+                                .set({
+                                    online: true,
+                                    lastPing: timestamp
+                                })
+                                .where(inArray(sites.siteId, siteIds));
+                        }
+                    });
+                }
+            }, "flushSitePingsToDb");
+        } catch (error) {
+            logger.error(
+                `Failed to flush site ping batch (${batch.length} sites), re-queuing for next cycle`,
+                { error }
+            );
+            for (const [siteId, timestamp] of batch) {
+                const existing = pendingSitePings.get(siteId);
+                if (!existing || existing < timestamp) {
+                    pendingSitePings.set(siteId, timestamp);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Flush all accumulated client (OLM) pings to the database.
+ */
+async function flushClientPingsToDb(): Promise<void> {
+    if (pendingClientPings.size === 0 && pendingOlmArchiveResets.size === 0) {
+        return;
+    }
+
+    // Snapshot and clear
+    const pingsToFlush = new Map(pendingClientPings);
+    pendingClientPings.clear();
+
+    const olmResetsToFlush = new Set(pendingOlmArchiveResets);
+    pendingOlmArchiveResets.clear();
+
+    // ── Flush client pings ─────────────────────────────────────────────
+    if (pingsToFlush.size > 0) {
+        const sortedEntries = Array.from(pingsToFlush.entries()).sort(
+            ([a], [b]) => a - b
+        );
+
+        const BATCH_SIZE = 50;
+        for (let i = 0; i < sortedEntries.length; i += BATCH_SIZE) {
+            const batch = sortedEntries.slice(i, i + BATCH_SIZE);
+
+            try {
+                await withRetry(async () => {
+                    const byTimestamp = new Map<number, number[]>();
+                    for (const [clientId, timestamp] of batch) {
+                        const group = byTimestamp.get(timestamp) || [];
+                        group.push(clientId);
+                        byTimestamp.set(timestamp, group);
+                    }
+
+                    if (byTimestamp.size === 1) {
+                        const [timestamp, clientIds] = Array.from(
+                            byTimestamp.entries()
+                        )[0];
+                        await db
+                            .update(clients)
+                            .set({
+                                lastPing: timestamp,
+                                online: true,
+                                archived: false
+                            })
+                            .where(inArray(clients.clientId, clientIds));
+                    } else {
+                        await db.transaction(async (tx) => {
+                            for (const [timestamp, clientIds] of byTimestamp) {
+                                await tx
+                                    .update(clients)
+                                    .set({
+                                        lastPing: timestamp,
+                                        online: true,
+                                        archived: false
+                                    })
+                                    .where(
+                                        inArray(clients.clientId, clientIds)
+                                    );
+                            }
+                        });
+                    }
+                }, "flushClientPingsToDb");
+            } catch (error) {
+                logger.error(
+                    `Failed to flush client ping batch (${batch.length} clients), re-queuing for next cycle`,
+                    { error }
+                );
+                for (const [clientId, timestamp] of batch) {
+                    const existing = pendingClientPings.get(clientId);
+                    if (!existing || existing < timestamp) {
+                        pendingClientPings.set(clientId, timestamp);
+                    }
+                }
+            }
+        }
+    }
+
+    // ── Flush OLM archive resets ───────────────────────────────────────
+    if (olmResetsToFlush.size > 0) {
+        const olmIds = Array.from(olmResetsToFlush).sort();
+
+        const BATCH_SIZE = 50;
+        for (let i = 0; i < olmIds.length; i += BATCH_SIZE) {
+            const batch = olmIds.slice(i, i + BATCH_SIZE);
+
+            try {
+                await withRetry(async () => {
+                    await db
+                        .update(olms)
+                        .set({ archived: false })
+                        .where(inArray(olms.olmId, batch));
+                }, "flushOlmArchiveResets");
+            } catch (error) {
+                logger.error(
+                    `Failed to flush OLM archive reset batch (${batch.length} olms), re-queuing for next cycle`,
+                    { error }
+                );
+                for (const olmId of batch) {
+                    pendingOlmArchiveResets.add(olmId);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Flush everything — called by the interval timer and during shutdown.
+ */
+export async function flushPingsToDb(): Promise<void> {
+    await flushSitePingsToDb();
+    await flushClientPingsToDb();
+}
+
+// ── Retry / Error Helpers ──────────────────────────────────────────────
+
+/**
+ * Simple retry wrapper with exponential backoff for transient errors
+ * (connection timeouts, unexpected disconnects).
+ */
+async function withRetry<T>(
+    operation: () => Promise<T>,
+    context: string
+): Promise<T> {
+    let attempt = 0;
+    while (true) {
+        try {
+            return await operation();
+        } catch (error: any) {
+            if (isTransientError(error) && attempt < MAX_RETRIES) {
+                attempt++;
+                const baseDelay = Math.pow(2, attempt - 1) * BASE_DELAY_MS;
+                const jitter = Math.random() * baseDelay;
+                const delay = baseDelay + jitter;
+                logger.warn(
+                    `Transient DB error in ${context}, retrying attempt ${attempt}/${MAX_RETRIES} after ${delay.toFixed(0)}ms`
+                );
+                await new Promise((resolve) => setTimeout(resolve, delay));
+                continue;
+            }
+            throw error;
+        }
+    }
+}
+
+/**
+ * Detect transient connection errors that are safe to retry.
+ */
+function isTransientError(error: any): boolean {
+    if (!error) return false;
+
+    const message = (error.message || "").toLowerCase();
+    const causeMessage = (error.cause?.message || "").toLowerCase();
+    const code = error.code || "";
+
+    // Connection timeout / terminated
+    if (
+        message.includes("connection timeout") ||
+        message.includes("connection terminated") ||
+        message.includes("timeout exceeded when trying to connect") ||
+        causeMessage.includes("connection terminated unexpectedly") ||
+        causeMessage.includes("connection timeout")
+    ) {
+        return true;
+    }
+
+    // PostgreSQL deadlock
+    if (code === "40P01" || message.includes("deadlock")) {
+        return true;
+    }
+
+    // ECONNRESET, ECONNREFUSED, EPIPE
+    if (
+        code === "ECONNRESET" ||
+        code === "ECONNREFUSED" ||
+        code === "EPIPE" ||
+        code === "ETIMEDOUT"
+    ) {
+        return true;
+    }
+
+    return false;
+}
+
+// ── Lifecycle ──────────────────────────────────────────────────────────
+
+/**
+ * Start the background flush timer. Call this once at server startup.
+ */
+export function startPingAccumulator(): void {
+    if (flushTimer) {
+        return; // Already running
+    }
+
+    flushTimer = setInterval(async () => {
+        try {
+            await flushPingsToDb();
+        } catch (error) {
+            logger.error("Unhandled error in ping accumulator flush", {
+                error
+            });
+        }
+    }, FLUSH_INTERVAL_MS);
+
+    // Don't prevent the process from exiting
+    flushTimer.unref();
+
+    logger.info(
+        `Ping accumulator started (flush interval: ${FLUSH_INTERVAL_MS}ms)`
+    );
+}
+
+/**
+ * Stop the background flush timer and perform a final flush.
+ * Call this during graceful shutdown.
+ */
+export async function stopPingAccumulator(): Promise<void> {
+    if (flushTimer) {
+        clearInterval(flushTimer);
+        flushTimer = null;
+    }
+
+    // Final flush to persist any remaining pings
+    try {
+        await flushPingsToDb();
+    } catch (error) {
+        logger.error("Error during final ping accumulator flush", { error });
+    }
+
+    logger.info("Ping accumulator stopped");
+}
+
+/**
+ * Get the number of pending (unflushed) pings. Useful for monitoring.
+ */
+export function getPendingPingCount(): number {
+    return pendingSitePings.size + pendingClientPings.size;
+}