Merge branch 'dev' into multi-role

2026-03-30 22:46:40 +00:00 · 2026-03-26 21:46:29 -07:00
parent 13eadeaa8f 395cab795c
commit 63372b174f
31 changed files with 1159 additions and 229 deletions
--- a/server/routers/client/targets.ts
+++ b/server/routers/client/targets.ts
@@ -1,15 +1,54 @@
 import { sendToClient } from "#dynamic/routers/ws";
-import { db, olms, Transaction } from "@server/db";
+import { db, newts, olms } from "@server/db";
+import {
+    Alias,
+    convertSubnetProxyTargetsV2ToV1,
+    SubnetProxyTarget,
+    SubnetProxyTargetV2
+} from "@server/lib/ip";
 import { canCompress } from "@server/lib/clientVersionChecks";
-import { Alias, SubnetProxyTarget } from "@server/lib/ip";
 import logger from "@server/logger";
 import { eq } from "drizzle-orm";
+import semver from "semver";
+
+// Semver range of newt releases that understand the v2 target format.
+const NEWT_V2_TARGETS_VERSION = ">=1.10.3";
+
+/**
+ * Look up the newt and downgrade v2 targets to v1 when its reported version
+ * does not satisfy NEWT_V2_TARGETS_VERSION. A newt with no recorded version
+ * gets the targets back unchanged.
+ * The conversion casts `targets` to v2 without checking their actual shape.
+ * @throws Error when no newt row exists for `newtId`.
+ */
+export async function convertTargetsIfNessicary(
+    newtId: string,
+    targets: SubnetProxyTarget[] | SubnetProxyTargetV2[]
+) {
+    const [newt] = await db.select().from(newts).where(eq(newts.newtId, newtId));
+    if (!newt) {
+        throw new Error(`No newt found for id: ${newtId}`);
+    }
+
+    const supportsV2 =
+        !newt.version || semver.satisfies(newt.version, NEWT_V2_TARGETS_VERSION);
+    if (supportsV2) {
+        return targets;
+    }
+
+    logger.debug(
+        `convertTargetsIfNessicary Newt version ${newt.version} does not support targets v2 falling back`
+    );
+    return convertSubnetProxyTargetsV2ToV1(targets as SubnetProxyTargetV2[]);
+}

 export async function addTargets(
    newtId: string,
-    targets: SubnetProxyTarget[],
+    targets: SubnetProxyTarget[] | SubnetProxyTargetV2[],
    version?: string | null
 ) {
+    targets = await convertTargetsIfNessicary(newtId, targets);
+
    await sendToClient(
        newtId,
        {
@@ -22,9 +61,11 @@ export async function addTargets(

 export async function removeTargets(
    newtId: string,
-    targets: SubnetProxyTarget[],
+    targets: SubnetProxyTarget[] | SubnetProxyTargetV2[],
    version?: string | null
 ) {
+    targets = await convertTargetsIfNessicary(newtId, targets);
+
    await sendToClient(
        newtId,
        {
@@ -38,11 +79,39 @@ export async function removeTargets(
 export async function updateTargets(
    newtId: string,
    targets: {
-        oldTargets: SubnetProxyTarget[];
-        newTargets: SubnetProxyTarget[];
+        oldTargets: SubnetProxyTarget[] | SubnetProxyTargetV2[];
+        newTargets: SubnetProxyTarget[] | SubnetProxyTargetV2[];
    },
    version?: string | null
 ) {
+    // Look up the newt so its reported version can gate the target format.
+    const [newt] = await db
+        .select()
+        .from(newts)
+        .where(eq(newts.newtId, newtId));
+    if (!newt) {
+        logger.error(`updateTargets: No newt found for id: ${newtId}`);
+        return;
+    }
+
+    // Newts outside NEWT_V2_TARGETS_VERSION get both target lists converted to v1.
+    if (
+        newt.version &&
+        !semver.satisfies(newt.version, NEWT_V2_TARGETS_VERSION)
+    ) {
+        logger.debug(
+            `updateTargets Newt version ${newt.version} does not support targets v2 falling back`
+        );
+        targets = {
+            oldTargets: convertSubnetProxyTargetsV2ToV1(
+                targets.oldTargets as SubnetProxyTargetV2[]
+            ),
+            newTargets: convertSubnetProxyTargetsV2ToV1(
+                targets.newTargets as SubnetProxyTargetV2[]
+            )
+        };
+    }
+
    await sendToClient(
        newtId,
        {
--- a/server/routers/gerbil/receiveBandwidth.ts
+++ b/server/routers/gerbil/receiveBandwidth.ts
@@ -1,6 +1,5 @@
 import { Request, Response, NextFunction } from "express";
-import { eq, sql } from "drizzle-orm";
-import { sites } from "@server/db";
+import { sql } from "drizzle-orm";
 import { db } from "@server/db";
 import logger from "@server/logger";
 import createHttpError from "http-errors";
@@ -31,7 +30,10 @@ const MAX_RETRIES = 3;
 const BASE_DELAY_MS = 50;

 // How often to flush accumulated bandwidth data to the database
-const FLUSH_INTERVAL_MS = 30_000; // 30 seconds
+const FLUSH_INTERVAL_MS = 300_000; // 300 seconds
+
+// Maximum number of sites to include in a single batch UPDATE statement
+const BATCH_CHUNK_SIZE = 250;

 // In-memory accumulator: publicKey -> AccumulatorEntry
 let accumulator = new Map<string, AccumulatorEntry>();
@@ -75,13 +77,33 @@ async function withDeadlockRetry<T>(
    }
 }

+/**
+ * Run a raw, row-returning SQL statement against whichever Drizzle driver is
+ * active. When `db.execute` exists its result is used (either the array
+ * itself or its `rows` property, defaulting to an empty list); otherwise the
+ * statement is passed to `db.all`.
+ */
+async function dbQueryRows<T extends Record<string, unknown>>(
+    query: Parameters<(typeof sql)["join"]>[0][number]
+): Promise<T[]> {
+    const driver = db as any;
+    if (typeof driver.execute !== "function") {
+        // SQLite driver path: no `execute`, so use `all` and take its result as the rows.
+        return (await driver.all(query)) as T[];
+    }
+    // PostgreSQL driver path: `execute` resolves to an array or to an object with `rows`.
+    const outcome = await driver.execute(query);
+    return (Array.isArray(outcome) ? outcome : (outcome.rows ?? [])) as T[];
+}
+
 /**
 * Flush all accumulated site bandwidth data to the database.
 *
 * Swaps out the accumulator before writing so that any bandwidth messages
 * received during the flush are captured in the new accumulator rather than
- * being lost or causing contention. Entries that fail to write are re-queued
- * back into the accumulator so they will be retried on the next flush.
+ * being lost or causing contention. Sites are updated in chunks via a single
+ * batch UPDATE per chunk. Failed chunks are discarded — exact per-flush
+ * accuracy is not critical and re-queuing is not worth the added complexity.
 *
 * This function is exported so that the application's graceful-shutdown
 * cleanup handler can call it before the process exits.
@@ -108,76 +130,76 @@ export async function flushSiteBandwidthToDb(): Promise<void> {
        `Flushing accumulated bandwidth data for ${sortedEntries.length} site(s) to the database`
    );

-    // Aggregate billing usage by org, collected during the DB update loop.
+    // Build a lookup so post-processing can reach each entry by publicKey.
+    const snapshotMap = new Map(sortedEntries);
+
+    // Aggregate billing usage by org across all chunks.
    const orgUsageMap = new Map<string, number>();

-    for (const [publicKey, { bytesIn, bytesOut, exitNodeId, calcUsage }] of sortedEntries) {
+    // Process in chunks so individual queries stay at a reasonable size.
+    for (let i = 0; i < sortedEntries.length; i += BATCH_CHUNK_SIZE) {
+        const chunk = sortedEntries.slice(i, i + BATCH_CHUNK_SIZE);
+        const chunkEnd = i + chunk.length - 1;
+
+        // Build a parameterised VALUES list: (pubKey, bytesIn, bytesOut), ...
+        // Both PostgreSQL and SQLite (≥ 3.33.0, which better-sqlite3 bundles)
+        // support UPDATE … FROM (VALUES …), letting us update the whole chunk
+        // in a single query instead of N individual round-trips.
+        const valuesList = chunk.map(([publicKey, { bytesIn, bytesOut }]) =>
+            sql`(${publicKey}, ${bytesIn}, ${bytesOut})`
+        );
+        const valuesClause = sql.join(valuesList, sql`, `);
+
+        let rows: { orgId: string; pubKey: string }[] = [];
+
        try {
-            const updatedSite = await withDeadlockRetry(async () => {
-                const [result] = await db
-                    .update(sites)
-                    .set({
-                        megabytesOut: sql`COALESCE(${sites.megabytesOut}, 0) + ${bytesIn}`,
-                        megabytesIn: sql`COALESCE(${sites.megabytesIn}, 0) + ${bytesOut}`,
-                        lastBandwidthUpdate: currentTime,
-                    })
-                    .where(eq(sites.pubKey, publicKey))
-                    .returning({
-                        orgId: sites.orgId,
-                        siteId: sites.siteId
-                    });
-                return result;
-            }, `flush bandwidth for site ${publicKey}`);
-
-            if (updatedSite) {
-                if (exitNodeId) {
-                    const notAllowed = await checkExitNodeOrg(
-                        exitNodeId,
-                        updatedSite.orgId
-                    );
-                    if (notAllowed) {
-                        logger.warn(
-                            `Exit node ${exitNodeId} is not allowed for org ${updatedSite.orgId}`
-                        );
-                        // Skip usage tracking for this site but continue
-                        // processing the rest.
-                        continue;
-                    }
-                }
-
-                if (calcUsage) {
-                    const totalBandwidth = bytesIn + bytesOut;
-                    const current = orgUsageMap.get(updatedSite.orgId) ?? 0;
-                    orgUsageMap.set(updatedSite.orgId, current + totalBandwidth);
-                }
-            }
+            rows = await withDeadlockRetry(async () => {
+                return dbQueryRows<{ orgId: string; pubKey: string }>(sql`
+                    UPDATE sites
+                    SET
+                        "bytesOut"            = COALESCE("bytesOut", 0) + v.bytes_in,
+                        "bytesIn"             = COALESCE("bytesIn", 0)  + v.bytes_out,
+                        "lastBandwidthUpdate" = ${currentTime}
+                    FROM (VALUES ${valuesClause}) AS v(pub_key, bytes_in, bytes_out)
+                    WHERE sites."pubKey" = v.pub_key
+                    RETURNING sites."orgId" AS "orgId", sites."pubKey" AS "pubKey"
+                `);
+            }, `flush bandwidth chunk [${i}–${chunkEnd}]`);
        } catch (error) {
            logger.error(
-                `Failed to flush bandwidth for site ${publicKey}:`,
+                `Failed to flush bandwidth chunk [${i}–${chunkEnd}], discarding ${chunk.length} site(s):`,
                error
            );
+            // Discard the chunk — exact per-flush accuracy is not critical.
+            continue;
+        }

-            // Re-queue the failed entry so it is retried on the next flush
-            // rather than silently dropped.
-            const existing = accumulator.get(publicKey);
-            if (existing) {
-                existing.bytesIn += bytesIn;
-                existing.bytesOut += bytesOut;
-            } else {
-                accumulator.set(publicKey, {
-                    bytesIn,
-                    bytesOut,
-                    exitNodeId,
-                    calcUsage
-                });
+        // Collect billing usage from the returned rows.
+        for (const { orgId, pubKey } of rows) {
+            const entry = snapshotMap.get(pubKey);
+            if (!entry) continue;
+
+            const { bytesIn, bytesOut, exitNodeId, calcUsage } = entry;
+
+            if (exitNodeId) {
+                const notAllowed = await checkExitNodeOrg(exitNodeId, orgId);
+                if (notAllowed) {
+                    logger.warn(
+                        `Exit node ${exitNodeId} is not allowed for org ${orgId}`
+                    );
+                    continue;
+                }
+            }
+
+            if (calcUsage) {
+                const current = orgUsageMap.get(orgId) ?? 0;
+                orgUsageMap.set(orgId, current + bytesIn + bytesOut);
            }
        }
    }

-    // Process billing usage updates outside the site-update loop to keep
-    // lock scope small and concerns separated.
+    // Process billing usage updates after all chunks are written.
    if (orgUsageMap.size > 0) {
-        // Sort org IDs for consistent lock ordering.
        const sortedOrgIds = [...orgUsageMap.keys()].sort();

        for (const orgId of sortedOrgIds) {
--- a/server/routers/newt/buildConfiguration.ts
+++ b/server/routers/newt/buildConfiguration.ts
@@ -16,8 +16,8 @@ import { eq, and } from "drizzle-orm";
 import config from "@server/lib/config";
 import {
    formatEndpoint,
-    generateSubnetProxyTargets,
-    SubnetProxyTarget
+    generateSubnetProxyTargetV2,
+    SubnetProxyTargetV2
 } from "@server/lib/ip";

 export async function buildClientConfigurationForNewtClient(
@@ -143,7 +143,7 @@ export async function buildClientConfigurationForNewtClient(
        .from(siteResources)
        .where(eq(siteResources.siteId, siteId));

-    const targetsToSend: SubnetProxyTarget[] = [];
+    const targetsToSend: SubnetProxyTargetV2[] = [];

    for (const resource of allSiteResources) {
        // Get clients associated with this specific resource
@@ -168,12 +168,14 @@ export async function buildClientConfigurationForNewtClient(
                )
            );

-        const resourceTargets = generateSubnetProxyTargets(
+        const resourceTarget = generateSubnetProxyTargetV2(
            resource,
            resourceClients
        );

-        targetsToSend.push(...resourceTargets);
+        if (resourceTarget) {
+            targetsToSend.push(resourceTarget);
+        }
    }

    return {
--- a/server/routers/newt/getNewtToken.ts
+++ b/server/routers/newt/getNewtToken.ts
@@ -1,6 +1,8 @@
 import { generateSessionToken } from "@server/auth/sessions/app";
-import { db } from "@server/db";
+import { db, newtSessions } from "@server/db";
 import { newts } from "@server/db";
+import { getOrCreateCachedToken } from "#dynamic/lib/tokenCache";
+import { EXPIRES } from "@server/auth/sessions/newt";
 import HttpCode from "@server/types/HttpCode";
 import response from "@server/lib/response";
 import { eq } from "drizzle-orm";
@@ -92,8 +94,19 @@ export async function getNewtToken(
            );
        }

-        const resToken = generateSessionToken();
-        await createNewtSession(resToken, existingNewt.newtId);
+        // Return a cached token if one exists to prevent thundering herd on
+        // simultaneous restarts; falls back to creating a fresh session when
+        // Redis is unavailable or the cache has expired.
+        const resToken = await getOrCreateCachedToken(
+            `newt:token_cache:${existingNewt.newtId}`,
+            config.getRawConfig().server.secret!,
+            Math.floor(EXPIRES / 1000),
+            async () => {
+                const token = generateSessionToken();
+                await createNewtSession(token, existingNewt.newtId);
+                return token;
+            }
+        );

        return response<{ token: string; serverVersion: string }>(res, {
            data: {
--- a/server/routers/newt/handleGetConfigMessage.ts
+++ b/server/routers/newt/handleGetConfigMessage.ts
@@ -6,6 +6,7 @@ import { db, ExitNode, exitNodes, Newt, sites } from "@server/db";
 import { eq } from "drizzle-orm";
 import { sendToExitNode } from "#dynamic/lib/exitNodes";
 import { buildClientConfigurationForNewtClient } from "./buildConfiguration";
+import { convertTargetsIfNessicary } from "../client/targets";
 import { canCompress } from "@server/lib/clientVersionChecks";

 const inputSchema = z.object({
@@ -127,13 +128,15 @@ export const handleGetConfigMessage: MessageHandler = async (context) => {
        exitNode
    );

+    const targetsToSend = await convertTargetsIfNessicary(newt.newtId, targets);
+
    return {
        message: {
            type: "newt/wg/receive-config",
            data: {
                ipAddress: site.address,
                peers,
-                targets
+                targets: targetsToSend
            }
        },
        options: {
--- a/server/routers/newt/handleNewtPingMessage.ts
+++ b/server/routers/newt/handleNewtPingMessage.ts
@@ -5,6 +5,7 @@ import { Newt } from "@server/db";
 import { eq, lt, isNull, and, or } from "drizzle-orm";
 import logger from "@server/logger";
 import { sendNewtSyncMessage } from "./sync";
+import { recordPing } from "./pingAccumulator";

 // Track if the offline checker interval is running
 let offlineCheckerInterval: NodeJS.Timeout | null = null;
@@ -114,18 +115,12 @@ export const handleNewtPingMessage: MessageHandler = async (context) => {
        return;
    }

-    try {
-        // Mark the site as online and record the ping timestamp.
-        await db
-            .update(sites)
-            .set({
-                online: true,
-                lastPing: Math.floor(Date.now() / 1000)
-            })
-            .where(eq(sites.siteId, newt.siteId));
-    } catch (error) {
-        logger.error("Error updating online state on newt ping", { error });
-    }
+    // Record the ping in memory; the ping accumulator flushes pending
+    // pings to the database periodically (every ~10s) using batched
+    // UPDATEs instead of one query per ping. This prevents
+    // connection pool exhaustion under load, especially with
+    // cross-region latency to the database.
+    recordPing(newt.siteId);

    // Check config version and sync if stale.
    const configVersion = await getClientConfigVersion(newt.newtId);
--- a/server/routers/newt/pingAccumulator.ts
+++ b/server/routers/newt/pingAccumulator.ts
@@ -0,0 +1,382 @@
+import { db } from "@server/db";
+import { sites, clients, olms } from "@server/db";
+import { eq, inArray } from "drizzle-orm";
+import logger from "@server/logger";
+
+/**
+ * Ping Accumulator
+ *
+ * Instead of writing to the database on every single newt/olm ping (which
+ * causes pool exhaustion under load, especially with cross-region latency),
+ * we accumulate pings in memory and flush them to the database periodically
+ * in a single batch.
+ *
+ * This is the same pattern used for bandwidth flushing in
+ * receiveBandwidth.ts and handleReceiveBandwidthMessage.ts.
+ *
+ * Supports two kinds of pings:
+ *   - **Site pings** (from newts): update `sites.online` and `sites.lastPing`
+ *   - **Client pings** (from OLMs): update `clients.online`, `clients.lastPing`,
+ *     `clients.archived`, and optionally reset `olms.archived`
+ */
+
+const FLUSH_INTERVAL_MS = 10_000; // Interval of the background flush timer: 10 seconds
+const MAX_RETRIES = 2; // Retries withRetry allows after the first failed attempt
+const BASE_DELAY_MS = 50; // Base of withRetry's exponential backoff, before jitter
+
+// ── Site (newt) pings ──────────────────────────────────────────────────
+// Map of siteId -> latest ping timestamp (unix seconds); a newer ping overwrites the entry
+const pendingSitePings: Map<number, number> = new Map();
+
+// ── Client (OLM) pings ────────────────────────────────────────────────
+// Map of clientId -> latest ping timestamp (unix seconds); a newer ping overwrites the entry
+const pendingClientPings: Map<number, number> = new Map();
+// Set of olmIds whose `archived` flag should be reset to false on the next flush
+const pendingOlmArchiveResets: Set<string> = new Set();
+
+let flushTimer: NodeJS.Timeout | null = null; // Interval handle set by startPingAccumulator; null while stopped
+
+// ── Public API ─────────────────────────────────────────────────────────
+
+/**
+ * Note that a newt site pinged just now. Nothing is written to the database
+ * here: the current unix-seconds timestamp replaces any earlier pending entry
+ * for the site, and the periodic flush persists it later.
+ */
+export function recordSitePing(siteId: number): void {
+    const pingedAtSeconds = Math.floor(Date.now() / 1000);
+    pendingSitePings.set(siteId, pingedAtSeconds);
+}
+
+/** @deprecated Prefer `recordSitePing`; this alias exists only for older call-sites. */
+export const recordPing = recordSitePing;
+
+/**
+ * Note that an OLM client pinged just now. The current unix-seconds timestamp
+ * is stored as the client's pending ping, and an OLM reported as archived is
+ * queued so the flush can clear its `archived` flag.
+ */
+export function recordClientPing(
+    clientId: number,
+    olmId: string,
+    olmArchived: boolean
+): void {
+    // Queue the un-archive first; the two collections are independent.
+    if (olmArchived) {
+        pendingOlmArchiveResets.add(olmId);
+    }
+    pendingClientPings.set(clientId, Math.floor(Date.now() / 1000));
+}
+
+// ── Flush Logic ────────────────────────────────────────────────────────
+
+/**
+ * Flush accumulated site pings to `sites` (`online: true`, `lastPing`) in batches of 50; a batch that still fails after retries is re-queued.
+ */
+async function flushSitePingsToDb(): Promise<void> {
+    if (pendingSitePings.size === 0) {
+        return;
+    }
+
+    // Snapshot and clear so new pings arriving during the flush go into a
+    // fresh map for the next cycle.
+    const pingsToFlush = new Map(pendingSitePings);
+    pendingSitePings.clear();
+
+    // Sort by siteId so rows are always touched in the same order (intended to avoid deadlocks)
+    const sortedEntries = Array.from(pingsToFlush.entries()).sort(
+        ([a], [b]) => a - b
+    );
+
+    const BATCH_SIZE = 50; // site ids handled per retry-wrapped batch
+    for (let i = 0; i < sortedEntries.length; i += BATCH_SIZE) {
+        const batch = sortedEntries.slice(i, i + BATCH_SIZE);
+
+        try {
+            await withRetry(async () => {
+                // Group site ids by timestamp so each distinct timestamp needs only one UPDATE
+                const byTimestamp = new Map<number, number[]>();
+                for (const [siteId, timestamp] of batch) {
+                    const group = byTimestamp.get(timestamp) || [];
+                    group.push(siteId);
+                    byTimestamp.set(timestamp, group);
+                }
+
+                if (byTimestamp.size === 1) { // one timestamp: a single UPDATE, no transaction
+                    const [timestamp, siteIds] = Array.from(
+                        byTimestamp.entries()
+                    )[0];
+                    await db
+                        .update(sites)
+                        .set({
+                            online: true,
+                            lastPing: timestamp
+                        })
+                        .where(inArray(sites.siteId, siteIds));
+                } else { // several timestamps: one UPDATE each, inside one transaction
+                    await db.transaction(async (tx) => {
+                        for (const [timestamp, siteIds] of byTimestamp) {
+                            await tx
+                                .update(sites)
+                                .set({
+                                    online: true,
+                                    lastPing: timestamp
+                                })
+                                .where(inArray(sites.siteId, siteIds));
+                        }
+                    });
+                }
+            }, "flushSitePingsToDb");
+        } catch (error) {
+            logger.error(
+                `Failed to flush site ping batch (${batch.length} sites), re-queuing for next cycle`,
+                { error }
+            );
+            for (const [siteId, timestamp] of batch) {
+                const existing = pendingSitePings.get(siteId);
+                if (!existing || existing < timestamp) { // keep a newer ping recorded during the flush
+                    pendingSitePings.set(siteId, timestamp);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Flush accumulated client (OLM) pings to `clients` and pending archive resets to `olms`, each in batches of 50 with re-queue on failure.
+ */
+async function flushClientPingsToDb(): Promise<void> {
+    if (pendingClientPings.size === 0 && pendingOlmArchiveResets.size === 0) {
+        return;
+    }
+
+    // Snapshot and clear both collections so work recorded during the flush waits for the next cycle
+    const pingsToFlush = new Map(pendingClientPings);
+    pendingClientPings.clear();
+
+    const olmResetsToFlush = new Set(pendingOlmArchiveResets);
+    pendingOlmArchiveResets.clear();
+
+    // ── Flush client pings ─────────────────────────────────────────────
+    if (pingsToFlush.size > 0) {
+        const sortedEntries = Array.from(pingsToFlush.entries()).sort(
+            ([a], [b]) => a - b
+        );
+
+        const BATCH_SIZE = 50; // client ids handled per retry-wrapped batch
+        for (let i = 0; i < sortedEntries.length; i += BATCH_SIZE) {
+            const batch = sortedEntries.slice(i, i + BATCH_SIZE);
+
+            try {
+                await withRetry(async () => {
+                    const byTimestamp = new Map<number, number[]>(); // timestamp -> client ids sharing it
+                    for (const [clientId, timestamp] of batch) {
+                        const group = byTimestamp.get(timestamp) || [];
+                        group.push(clientId);
+                        byTimestamp.set(timestamp, group);
+                    }
+
+                    if (byTimestamp.size === 1) { // one timestamp: a single UPDATE, no transaction
+                        const [timestamp, clientIds] = Array.from(
+                            byTimestamp.entries()
+                        )[0];
+                        await db
+                            .update(clients)
+                            .set({
+                                lastPing: timestamp,
+                                online: true,
+                                archived: false
+                            })
+                            .where(inArray(clients.clientId, clientIds));
+                    } else { // several timestamps: one UPDATE each, inside one transaction
+                        await db.transaction(async (tx) => {
+                            for (const [timestamp, clientIds] of byTimestamp) {
+                                await tx
+                                    .update(clients)
+                                    .set({
+                                        lastPing: timestamp,
+                                        online: true,
+                                        archived: false
+                                    })
+                                    .where(
+                                        inArray(clients.clientId, clientIds)
+                                    );
+                            }
+                        });
+                    }
+                }, "flushClientPingsToDb");
+            } catch (error) {
+                logger.error(
+                    `Failed to flush client ping batch (${batch.length} clients), re-queuing for next cycle`,
+                    { error }
+                );
+                for (const [clientId, timestamp] of batch) {
+                    const existing = pendingClientPings.get(clientId);
+                    if (!existing || existing < timestamp) { // keep a newer ping recorded during the flush
+                        pendingClientPings.set(clientId, timestamp);
+                    }
+                }
+            }
+        }
+    }
+
+    // ── Flush OLM archive resets ───────────────────────────────────────
+    if (olmResetsToFlush.size > 0) {
+        const olmIds = Array.from(olmResetsToFlush).sort();
+
+        const BATCH_SIZE = 50; // olm ids per UPDATE
+        for (let i = 0; i < olmIds.length; i += BATCH_SIZE) {
+            const batch = olmIds.slice(i, i + BATCH_SIZE);
+
+            try {
+                await withRetry(async () => {
+                    await db
+                        .update(olms)
+                        .set({ archived: false })
+                        .where(inArray(olms.olmId, batch));
+                }, "flushOlmArchiveResets");
+            } catch (error) {
+                logger.error(
+                    `Failed to flush OLM archive reset batch (${batch.length} olms), re-queuing for next cycle`,
+                    { error }
+                );
+                for (const olmId of batch) {
+                    pendingOlmArchiveResets.add(olmId);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Persist every pending ping: site pings first, then client pings and OLM
+ * archive resets. Invoked by the interval timer and by the shutdown path.
+ */
+export async function flushPingsToDb(): Promise<void> {
+    for (const flush of [flushSitePingsToDb, flushClientPingsToDb]) await flush();
+}
+
+// ── Retry / Error Helpers ──────────────────────────────────────────────
+
+/**
+ * Run `operation`, retrying up to MAX_RETRIES times when it fails with an
+ * error that `isTransientError` accepts. Waits grow exponentially from
+ * BASE_DELAY_MS with random jitter; any other failure is rethrown as-is.
+ */
+async function withRetry<T>(
+    operation: () => Promise<T>,
+    context: string
+): Promise<T> {
+    // `retriesUsed` counts the retries already performed before this attempt.
+    for (let retriesUsed = 0; ; retriesUsed++) {
+        try {
+            return await operation();
+        } catch (error: any) {
+            const retryable = isTransientError(error);
+            if (!retryable || retriesUsed >= MAX_RETRIES) {
+                throw error;
+            }
+            const attempt = retriesUsed + 1;
+            const baseDelay = Math.pow(2, retriesUsed) * BASE_DELAY_MS;
+            const delay = baseDelay + Math.random() * baseDelay;
+            logger.warn(
+                `Transient DB error in ${context}, retrying attempt ${attempt}/${MAX_RETRIES} after ${delay.toFixed(0)}ms`
+            );
+            await new Promise((resolve) => setTimeout(resolve, delay));
+        }
+    }
+}
+
+/**
+ * Decide whether a database error looks like a temporary connection problem
+ * (or a deadlock) that is worth retrying.
+ */
+function isTransientError(error: any): boolean {
+    if (!error) return false;
+
+    const message = (error.message || "").toLowerCase();
+    const causeMessage = (error.cause?.message || "").toLowerCase();
+    const code = error.code || "";
+
+    // Message fragments that indicate a timed-out or dropped connection.
+    const messageFragments = [
+        "connection timeout",
+        "connection terminated",
+        "timeout exceeded when trying to connect",
+        // PostgreSQL deadlock, when only the message is available
+        "deadlock"
+    ];
+    const causeFragments = [
+        "connection terminated unexpectedly",
+        "connection timeout"
+    ];
+    // "40P01" is the PostgreSQL deadlock code; the rest are socket-level errors.
+    const retryableCodes = [
+        "40P01",
+        "ECONNRESET",
+        "ECONNREFUSED",
+        "EPIPE",
+        "ETIMEDOUT"
+    ];
+
+    return (
+        messageFragments.some((fragment) => message.includes(fragment)) ||
+        causeFragments.some((fragment) => causeMessage.includes(fragment)) ||
+        retryableCodes.includes(code)
+    );
+}
+
+// ── Lifecycle ──────────────────────────────────────────────────────────
+
+/**
+ * Begin flushing pending pings every FLUSH_INTERVAL_MS. Safe to call more
+ * than once: a second call while the timer exists does nothing.
+ */
+export function startPingAccumulator(): void {
+    if (flushTimer) return;
+
+    // One timer tick: flush, and log instead of letting a rejection escape.
+    const runFlush = (): void => {
+        flushPingsToDb().catch((error) => {
+            logger.error("Unhandled error in ping accumulator flush", {
+                error
+            });
+        });
+    };
+
+    const timer = setInterval(runFlush, FLUSH_INTERVAL_MS);
+    // unref() keeps this timer from holding the process open on its own.
+    timer.unref();
+    flushTimer = timer;
+
+    logger.info(
+        `Ping accumulator started (flush interval: ${FLUSH_INTERVAL_MS}ms)`
+    );
+}
+
+/**
+ * Cancel the periodic flush (if it is running) and then flush once more so
+ * pings still held in memory are written. A failing final flush is logged,
+ * not thrown. Intended for graceful shutdown.
+ */
+export async function stopPingAccumulator(): Promise<void> {
+    const activeTimer = flushTimer;
+    flushTimer = null;
+    if (activeTimer) {
+        clearInterval(activeTimer);
+    }
+
+    // Last flush; a failure is logged rather than propagated to the caller.
+    await flushPingsToDb().catch((error) => {
+        logger.error("Error during final ping accumulator flush", { error });
+    });
+
+    logger.info("Ping accumulator stopped");
+}
+
+/** Count the site and client pings still waiting to be flushed (for monitoring). */
+export function getPendingPingCount(): number {
+    const queues = [pendingSitePings, pendingClientPings];
+    // OLM archive resets are queued separately and are not counted here.
+    return queues.reduce((total, queue) => total + queue.size, 0);
+}
--- a/server/routers/olm/getOlmToken.ts
+++ b/server/routers/olm/getOlmToken.ts
@@ -8,7 +8,7 @@ import {
    ExitNode,
    exitNodes,
    sites,
-    clientSitesAssociationsCache
+    clientSitesAssociationsCache,
 } from "@server/db";
 import { olms } from "@server/db";
 import HttpCode from "@server/types/HttpCode";
@@ -20,8 +20,10 @@ import { z } from "zod";
 import { fromError } from "zod-validation-error";
 import {
    createOlmSession,
-    validateOlmSessionToken
+    validateOlmSessionToken,
+    EXPIRES
 } from "@server/auth/sessions/olm";
+import { getOrCreateCachedToken } from "#dynamic/lib/tokenCache";
 import { verifyPassword } from "@server/auth/password";
 import logger from "@server/logger";
 import config from "@server/lib/config";
@@ -132,8 +134,19 @@ export async function getOlmToken(

        logger.debug("Creating new olm session token");

-        const resToken = generateSessionToken();
-        await createOlmSession(resToken, existingOlm.olmId);
+        // Return a cached token if one exists to prevent thundering herd on
+        // simultaneous restarts; falls back to creating a fresh session when
+        // Redis is unavailable or the cache has expired.
+        const resToken = await getOrCreateCachedToken(
+            `olm:token_cache:${existingOlm.olmId}`,
+            config.getRawConfig().server.secret!,
+            Math.floor(EXPIRES / 1000),
+            async () => {
+                const token = generateSessionToken();
+                await createOlmSession(token, existingOlm.olmId);
+                return token;
+            }
+        );

        let clientIdToUse;
        if (orgId) {
--- a/server/routers/olm/handleOlmPingMessage.ts
+++ b/server/routers/olm/handleOlmPingMessage.ts
@@ -3,6 +3,7 @@ import { db } from "@server/db";
 import { MessageHandler } from "@server/routers/ws";
 import { clients, olms, Olm } from "@server/db";
 import { eq, lt, isNull, and, or } from "drizzle-orm";
+import { recordClientPing } from "@server/routers/newt/pingAccumulator";
 import logger from "@server/logger";
 import { validateSessionToken } from "@server/auth/sessions/app";
 import { checkOrgAccessPolicy } from "#dynamic/lib/checkOrgAccessPolicy";
@@ -201,22 +202,12 @@ export const handleOlmPingMessage: MessageHandler = async (context) => {
            await sendOlmSyncMessage(olm, client);
        }

-        // Update the client's last ping timestamp
-        await db
-            .update(clients)
-            .set({
-                lastPing: Math.floor(Date.now() / 1000),
-                online: true,
-                archived: false
-            })
-            .where(eq(clients.clientId, olm.clientId));
-
-        if (olm.archived) {
-            await db
-                .update(olms)
-                .set({ archived: false })
-                .where(eq(olms.olmId, olm.olmId));
-        }
+        // Record the ping in memory; it will be flushed to the database
+        // periodically by the ping accumulator (every ~10s) in a single
+        // batched UPDATE instead of one query per ping. This prevents
+        // connection pool exhaustion under load, especially with
+        // cross-region latency to the database.
+        recordClientPing(olm.clientId, olm.olmId, !!olm.archived);
    } catch (error) {
        logger.error("Error handling ping message", { error });
    }
--- a/server/routers/siteResource/createSiteResource.ts
+++ b/server/routers/siteResource/createSiteResource.ts
@@ -88,7 +88,7 @@ const createSiteResourceSchema = z
        },
        {
            message:
-                "Destination must be a valid IP address or valid domain AND alias is required"
+                "Destination must be a valid IPV4 address or valid domain AND alias is required"
        }
    )
    .refine(
--- a/server/routers/siteResource/updateSiteResource.ts
+++ b/server/routers/siteResource/updateSiteResource.ts
@@ -24,7 +24,7 @@ import { updatePeerData, updateTargets } from "@server/routers/client/targets";
 import {
    generateAliasConfig,
    generateRemoteSubnets,
-    generateSubnetProxyTargets,
+    generateSubnetProxyTargetV2,
    isIpInCidr,
    portRangeStringSchema
 } from "@server/lib/ip";
@@ -608,18 +608,18 @@ export async function handleMessagingForUpdatedSiteResource(

        // Only update targets on newt if destination changed
        if (destinationChanged || portRangesChanged) {
-            const oldTargets = generateSubnetProxyTargets(
+            const oldTarget = generateSubnetProxyTargetV2(
                existingSiteResource,
                mergedAllClients
            );
-            const newTargets = generateSubnetProxyTargets(
+            const newTarget = generateSubnetProxyTargetV2(
                updatedSiteResource,
                mergedAllClients
            );

            await updateTargets(newt.newtId, {
-                oldTargets: oldTargets,
-                newTargets: newTargets
+                oldTargets: oldTarget ? [oldTarget] : [],
+                newTargets: newTarget ? [newTarget] : []
            }, newt.version);
        }

--- a/server/routers/ws/messageHandlers.ts
+++ b/server/routers/ws/messageHandlers.ts
@@ -11,6 +11,7 @@ import {
    startNewtOfflineChecker,
    handleNewtDisconnectingMessage
 } from "../newt";
+import { startPingAccumulator } from "../newt/pingAccumulator";
 import {
    handleOlmRegisterMessage,
    handleOlmRelayMessage,
@@ -46,6 +47,10 @@ export const messageHandlers: Record<string, MessageHandler> = {
    "ws/round-trip/complete": handleRoundTripMessage
 };

+// Start the ping accumulator for all builds — it batches site and client online/lastPing
+// updates into periodic bulk writes, preventing connection pool exhaustion.
+startPingAccumulator();
+
 if (build != "saas") {
    startOlmOfflineChecker(); // this is to handle the offline check for olms
    startNewtOfflineChecker(); // this is to handle the offline check for newts
--- a/server/routers/ws/ws.ts
+++ b/server/routers/ws/ws.ts
@@ -6,6 +6,7 @@ import { Socket } from "net";
 import { Newt, newts, NewtSession, olms, Olm, OlmSession, sites } from "@server/db";
 import { eq } from "drizzle-orm";
 import { db } from "@server/db";
+import { recordPing } from "@server/routers/newt/pingAccumulator";
 import { validateNewtSessionToken } from "@server/auth/sessions/newt";
 import { validateOlmSessionToken } from "@server/auth/sessions/olm";
 import { messageHandlers } from "./messageHandlers";
@@ -386,22 +387,14 @@ const setupConnection = async (
    // the same as modern newt clients.
    if (clientType === "newt") {
        const newtClient = client as Newt;
-        ws.on("ping", async () => {
+        ws.on("ping", () => {
            if (!newtClient.siteId) return;
-            try {
-                await db
-                    .update(sites)
-                    .set({
-                        online: true,
-                        lastPing: Math.floor(Date.now() / 1000)
-                    })
-                    .where(eq(sites.siteId, newtClient.siteId));
-            } catch (error) {
-                logger.error(
-                    "Error updating newt site online state on WS ping",
-                    { error }
-                );
-            }
+            // Record the ping in the accumulator instead of writing to the
+            // database on every WS ping frame. The accumulator flushes all
+            // pending pings in a single batched UPDATE every ~10s, which
+            // prevents connection pool exhaustion under load (especially
+            // with cross-region latency to the database).
+            recordPing(newtClient.siteId);
        });
    }