Merge pull request #2710 from fosrl/thundering-herd

thundering herd
Fix import problems
2026-03-25 20:16:40 +00:00 · 2026-03-24 20:29:01 -07:00 · 2026-03-24 20:27:34 -07:00 · 2026-03-24 18:12:51 -07:00 · 2026-03-24 17:55:14 -07:00 · 2026-03-24 17:39:43 -07:00
24 changed files with 743 additions and 135 deletions
--- a/server/cleanup.ts
+++ b/server/cleanup.ts
@@ -1,8 +1,10 @@
 import { flushBandwidthToDb } from "@server/routers/newt/handleReceiveBandwidthMessage";
 import { flushSiteBandwidthToDb } from "@server/routers/gerbil/receiveBandwidth";
+import { stopPingAccumulator } from "@server/routers/newt/pingAccumulator";
 import { cleanup as wsCleanup } from "#dynamic/routers/ws";

 async function cleanup() {
+    await stopPingAccumulator();
    await flushBandwidthToDb();
    await flushSiteBandwidthToDb();
    await wsCleanup();
--- a/server/db/pg/driver.ts
+++ b/server/db/pg/driver.ts
@@ -1,7 +1,7 @@
 import { drizzle as DrizzlePostgres } from "drizzle-orm/node-postgres";
-import { Pool } from "pg";
 import { readConfigFile } from "@server/lib/readConfigFile";
 import { withReplicas } from "drizzle-orm/pg-core";
+import { createPool } from "./poolConfig";

 function createDb() {
    const config = readConfigFile();
@@ -39,12 +39,17 @@ function createDb() {

    // Create connection pools instead of individual connections
    const poolConfig = config.postgres.pool;
-    const primaryPool = new Pool({
+    const maxConnections = poolConfig?.max_connections || 20;
+    const idleTimeoutMs = poolConfig?.idle_timeout_ms || 30000;
+    const connectionTimeoutMs = poolConfig?.connection_timeout_ms || 5000;
+
+    const primaryPool = createPool(
        connectionString,
-        max: poolConfig?.max_connections || 20,
-        idleTimeoutMillis: poolConfig?.idle_timeout_ms || 30000,
-        connectionTimeoutMillis: poolConfig?.connection_timeout_ms || 5000
-    });
+        maxConnections,
+        idleTimeoutMs,
+        connectionTimeoutMs,
+        "primary"
+    );

    const replicas = [];

@@ -55,14 +60,16 @@ function createDb() {
            })
        );
    } else {
+        const maxReplicaConnections =
+            poolConfig?.max_replica_connections || 20;
        for (const conn of replicaConnections) {
-            const replicaPool = new Pool({
-                connectionString: conn.connection_string,
-                max: poolConfig?.max_replica_connections || 20,
-                idleTimeoutMillis: poolConfig?.idle_timeout_ms || 30000,
-                connectionTimeoutMillis:
-                    poolConfig?.connection_timeout_ms || 5000
-            });
+            const replicaPool = createPool(
+                conn.connection_string,
+                maxReplicaConnections,
+                idleTimeoutMs,
+                connectionTimeoutMs,
+                "replica"
+            );
            replicas.push(
                DrizzlePostgres(replicaPool, {
                    logger: process.env.QUERY_LOGGING == "true"
@@ -84,4 +91,4 @@ export default db;
 export const primaryDb = db.$primary;
 export type Transaction = Parameters<
    Parameters<(typeof db)["transaction"]>[0]
->[0];
+>[0];
--- a/server/db/pg/logsDriver.ts
+++ b/server/db/pg/logsDriver.ts
@@ -1,9 +1,9 @@
 import { drizzle as DrizzlePostgres } from "drizzle-orm/node-postgres";
-import { Pool } from "pg";
 import { readConfigFile } from "@server/lib/readConfigFile";
 import { withReplicas } from "drizzle-orm/pg-core";
 import { build } from "@server/build";
 import { db as mainDb, primaryDb as mainPrimaryDb } from "./driver";
+import { createPool } from "./poolConfig";

 function createLogsDb() {
    // Only use separate logs database in SaaS builds
@@ -42,12 +42,17 @@ function createLogsDb() {

    // Create separate connection pool for logs database
    const poolConfig = logsConfig?.pool || config.postgres?.pool;
-    const primaryPool = new Pool({
+    const maxConnections = poolConfig?.max_connections || 20;
+    const idleTimeoutMs = poolConfig?.idle_timeout_ms || 30000;
+    const connectionTimeoutMs = poolConfig?.connection_timeout_ms || 5000;
+
+    const primaryPool = createPool(
        connectionString,
-        max: poolConfig?.max_connections || 20,
-        idleTimeoutMillis: poolConfig?.idle_timeout_ms || 30000,
-        connectionTimeoutMillis: poolConfig?.connection_timeout_ms || 5000
-    });
+        maxConnections,
+        idleTimeoutMs,
+        connectionTimeoutMs,
+        "logs-primary"
+    );

    const replicas = [];

@@ -58,14 +63,16 @@ function createLogsDb() {
            })
        );
    } else {
+        const maxReplicaConnections =
+            poolConfig?.max_replica_connections || 20;
        for (const conn of replicaConnections) {
-            const replicaPool = new Pool({
-                connectionString: conn.connection_string,
-                max: poolConfig?.max_replica_connections || 20,
-                idleTimeoutMillis: poolConfig?.idle_timeout_ms || 30000,
-                connectionTimeoutMillis:
-                    poolConfig?.connection_timeout_ms || 5000
-            });
+            const replicaPool = createPool(
+                conn.connection_string,
+                maxReplicaConnections,
+                idleTimeoutMs,
+                connectionTimeoutMs,
+                "logs-replica"
+            );
            replicas.push(
                DrizzlePostgres(replicaPool, {
                    logger: process.env.QUERY_LOGGING == "true"
@@ -84,4 +91,4 @@ function createLogsDb() {

 export const logsDb = createLogsDb();
 export default logsDb;
-export const primaryLogsDb = logsDb.$primary;
+export const primaryLogsDb = logsDb.$primary;
--- a/server/db/pg/poolConfig.ts
+++ b/server/db/pg/poolConfig.ts
@@ -0,0 +1,63 @@
+import { Pool, PoolConfig } from "pg";
+import logger from "@server/logger";
+
+export function createPoolConfig(
+    connectionString: string,
+    maxConnections: number,
+    idleTimeoutMs: number,
+    connectionTimeoutMs: number
+): PoolConfig {
+    return {
+        connectionString,
+        max: maxConnections,
+        idleTimeoutMillis: idleTimeoutMs,
+        connectionTimeoutMillis: connectionTimeoutMs,
+        // TCP keepalive to prevent silent connection drops by NAT gateways,
+        // load balancers, and other intermediate network devices (e.g. AWS
+        // NAT Gateway drops idle TCP connections after ~350s)
+        keepAlive: true,
+        keepAliveInitialDelayMillis: 10000, // send first keepalive after 10s of idle
+        // Keep the pg default: idle pooled clients stay referenced, so the
+        // Node.js event loop is not allowed to exit while they remain open
+        allowExitOnIdle: false
+    };
+}
+
+export function attachPoolErrorHandlers(pool: Pool, label: string): void {
+    pool.on("error", (err) => {
+        // This catches errors on idle clients in the pool. Without this
+        // handler an unexpected disconnect would crash the process.
+        logger.error(
+            `Unexpected error on idle ${label} database client: ${err.message}`
+        );
+    });
+
+    pool.on("connect", (client) => {
+        // Set a statement timeout on every new connection so a single slow
+        // query can't block the pool forever
+        client.query("SET statement_timeout = '30s'").catch((err: Error) => {
+            logger.warn(
+                `Failed to set statement_timeout on ${label} client: ${err.message}`
+            );
+        });
+    });
+}
+
+export function createPool(
+    connectionString: string,
+    maxConnections: number,
+    idleTimeoutMs: number,
+    connectionTimeoutMs: number,
+    label: string
+): Pool {
+    const pool = new Pool(
+        createPoolConfig(
+            connectionString,
+            maxConnections,
+            idleTimeoutMs,
+            connectionTimeoutMs
+        )
+    );
+    attachPoolErrorHandlers(pool, label);
+    return pool;
+}
--- a/server/lib/tokenCache.ts
+++ b/server/lib/tokenCache.ts
@@ -0,0 +1,22 @@
+/**
+ * No-op variant of the token cache (the Redis-backed implementation lives in
+ * server/private/lib/tokenCache.ts). Nothing is cached here: every call
+ * invokes `createSession` to mint a fresh token and returns it unchanged.
+ *
+ * `cacheKey`, `secret` and `ttlSeconds` are accepted only so the signature
+ * matches the Redis-backed implementation; this function never reads them.
+ *
+ * @param cacheKey   Unused by this implementation
+ * @param secret     Unused by this implementation
+ * @param ttlSeconds Unused by this implementation
+ * @param createSession Factory that mints a new session and returns its raw token
+ */
+export async function getOrCreateCachedToken(
+    cacheKey: string,
+    secret: string,
+    ttlSeconds: number,
+    createSession: () => Promise<string>
+): Promise<string> {
+    const token = await createSession();
+    return token;
+}
--- a/server/private/cleanup.ts
+++ b/server/private/cleanup.ts
@@ -15,8 +15,10 @@ import { rateLimitService } from "#private/lib/rateLimit";
 import { cleanup as wsCleanup } from "#private/routers/ws";
 import { flushBandwidthToDb } from "@server/routers/newt/handleReceiveBandwidthMessage";
 import { flushSiteBandwidthToDb } from "@server/routers/gerbil/receiveBandwidth";
+import { stopPingAccumulator } from "@server/routers/newt/pingAccumulator";

 async function cleanup() {
+    await stopPingAccumulator();
    await flushBandwidthToDb();
    await flushSiteBandwidthToDb();
    await rateLimitService.cleanup();
--- a/server/private/lib/cache.ts
+++ b/server/private/lib/cache.ts
@@ -1,3 +1,16 @@
+/*
+ * This file is part of a proprietary work.
+ *
+ * Copyright (c) 2025 Fossorial, Inc.
+ * All rights reserved.
+ *
+ * This file is licensed under the Fossorial Commercial License.
+ * You may not use this file except in compliance with the License.
+ * Unauthorized use, copying, modification, or distribution is strictly prohibited.
+ *
+ * This file is not licensed under the AGPLv3.
+ */
+
 import NodeCache from "node-cache";
 import logger from "@server/logger";
 import { redisManager } from "@server/private/lib/redis";
@@ -24,23 +37,31 @@ setInterval(() => {
 */
 class AdaptiveCache {
    private useRedis(): boolean {
-        return redisManager.isRedisEnabled() && redisManager.getHealthStatus().isHealthy;
+        return (
+            redisManager.isRedisEnabled() &&
+            redisManager.getHealthStatus().isHealthy
+        );
    }

    /**
     * Set a value in the cache
     * @param key - Cache key
     * @param value - Value to cache (will be JSON stringified for Redis)
-     * @param ttl - Time to live in seconds (0 = no expiration)
+     * @param ttl - Time to live in seconds (0 = no expiration; omit = 3600s for Redis)
     * @returns boolean indicating success
     */
    async set(key: string, value: any, ttl?: number): Promise<boolean> {
        const effectiveTtl = ttl === 0 ? undefined : ttl;
+        const redisTtl = ttl === 0 ? undefined : (ttl ?? 3600);

        if (this.useRedis()) {
            try {
                const serialized = JSON.stringify(value);
-                const success = await redisManager.set(key, serialized, effectiveTtl);
+                const success = await redisManager.set(
+                    key,
+                    serialized,
+                    redisTtl
+                );

                if (success) {
                    logger.debug(`Set key in Redis: ${key}`);
@@ -48,7 +69,9 @@ class AdaptiveCache {
                }

                // Redis failed, fall through to local cache
-                logger.debug(`Redis set failed for key ${key}, falling back to local cache`);
+                logger.debug(
+                    `Redis set failed for key ${key}, falling back to local cache`
+                );
            } catch (error) {
                logger.error(`Redis set error for key ${key}:`, error);
                // Fall through to local cache
@@ -120,9 +143,14 @@ class AdaptiveCache {
                }

                // Some Redis deletes failed, fall through to local cache
-                logger.debug(`Some Redis deletes failed, falling back to local cache`);
+                logger.debug(
+                    `Some Redis deletes failed, falling back to local cache`
+                );
            } catch (error) {
-                logger.error(`Redis del error for keys ${keys.join(", ")}:`, error);
+                logger.error(
+                    `Redis del error for keys ${keys.join(", ")}:`,
+                    error
+                );
                // Fall through to local cache
                deletedCount = 0;
            }
@@ -195,7 +223,9 @@ class AdaptiveCache {
     */
    async flushAll(): Promise<void> {
        if (this.useRedis()) {
-            logger.warn("Adaptive cache flushAll called - Redis flush not implemented, only local cache will be flushed");
+            logger.warn(
+                "Adaptive cache flushAll called - Redis flush not implemented, only local cache will be flushed"
+            );
        }

        localCache.flushAll();
@@ -239,7 +269,9 @@ class AdaptiveCache {
    getTtl(key: string): number {
        // Note: This only works for local cache, Redis TTL is not supported
        if (this.useRedis()) {
-            logger.warn(`getTtl called for key ${key} but Redis TTL lookup is not implemented`);
+            logger.warn(
+                `getTtl called for key ${key} but Redis TTL lookup is not implemented`
+            );
        }

        const ttl = localCache.getTtl(key);
@@ -255,7 +287,9 @@ class AdaptiveCache {
     */
    keys(): string[] {
        if (this.useRedis()) {
-            logger.warn("keys() called but Redis keys are not included, only local cache keys returned");
+            logger.warn(
+                "keys() called but Redis keys are not included, only local cache keys returned"
+            );
        }
        return localCache.keys();
    }
--- a/server/private/lib/tokenCache.ts
+++ b/server/private/lib/tokenCache.ts
@@ -0,0 +1,77 @@
+/*
+ * This file is part of a proprietary work.
+ *
+ * Copyright (c) 2025 Fossorial, Inc.
+ * All rights reserved.
+ *
+ * This file is licensed under the Fossorial Commercial License.
+ * You may not use this file except in compliance with the License.
+ * Unauthorized use, copying, modification, or distribution is strictly prohibited.
+ *
+ * This file is not licensed under the AGPLv3.
+ */
+
+import redisManager from "#private/lib/redis";
+import { encrypt, decrypt } from "@server/lib/crypto";
+import logger from "@server/logger";
+
+/**
+ * Returns a cached plaintext token from Redis if one exists and decrypts
+ * cleanly, otherwise calls `createSession` to mint a fresh token, stores the
+ * encrypted value in Redis with the given TTL, and returns it.
+ *
+ * Failures at the Redis layer are non-fatal – the function always falls
+ * through to session creation so the caller is never blocked by a Redis outage.
+ *
+ * @param cacheKey   Unique Redis key, e.g. `"newt:token_cache:abc123"`
+ * @param secret     Server secret used for AES encryption/decryption
+ * @param ttlSeconds Cache TTL in seconds (should match session expiry)
+ * @param createSession Factory that mints a new session and returns its raw token
+ */
+export async function getOrCreateCachedToken(
+    cacheKey: string,
+    secret: string,
+    ttlSeconds: number,
+    createSession: () => Promise<string>
+): Promise<string> {
+    if (redisManager.isRedisEnabled()) {
+        try {
+            const cached = await redisManager.get(cacheKey);
+            if (cached) {
+                const token = decrypt(cached, secret);
+                if (token) {
+                    logger.debug(`Token cache hit for key: ${cacheKey}`);
+                    return token;
+                }
+                // Decryption produced an empty string – treat as a miss
+                logger.warn(
+                    `Token cache decryption returned empty string for key: ${cacheKey}, treating as miss`
+                );
+            }
+        } catch (e) {
+            logger.warn(
+                `Token cache read/decrypt failed for key ${cacheKey}, falling through to session creation:`,
+                e
+            );
+        }
+    }
+
+    const token = await createSession();
+
+    if (redisManager.isRedisEnabled()) {
+        try {
+            const encrypted = encrypt(token, secret);
+            await redisManager.set(cacheKey, encrypted, ttlSeconds);
+            logger.debug(
+                `Token cached in Redis for key: ${cacheKey} (TTL ${ttlSeconds}s)`
+            );
+        } catch (e) {
+            logger.warn(
+                `Token cache write failed for key ${cacheKey} (session was still created):`,
+                e
+            );
+        }
+    }
+
+    return token;
+}
--- a/server/private/routers/hybrid.ts
+++ b/server/private/routers/hybrid.ts
@@ -15,6 +15,7 @@ import { verifySessionRemoteExitNodeMiddleware } from "#private/middlewares/veri
 import { Router } from "express";
 import {
    db,
+    logsDb,
    exitNodes,
    Resource,
    ResourcePassword,
@@ -1885,7 +1886,7 @@ hybridRouter.post(
            const batchSize = 100;
            for (let i = 0; i < logEntries.length; i += batchSize) {
                const batch = logEntries.slice(i, i + batchSize);
-                await db.insert(requestAuditLog).values(batch);
+                await logsDb.insert(requestAuditLog).values(batch);
            }

            return response(res, {
--- a/server/private/routers/remoteExitNode/getRemoteExitNodeToken.ts
+++ b/server/private/routers/remoteExitNode/getRemoteExitNodeToken.ts
@@ -23,8 +23,10 @@ import { z } from "zod";
 import { fromError } from "zod-validation-error";
 import {
    createRemoteExitNodeSession,
-    validateRemoteExitNodeSessionToken
+    validateRemoteExitNodeSessionToken,
+    EXPIRES
 } from "#private/auth/sessions/remoteExitNode";
+import { getOrCreateCachedToken } from "@server/private/lib/tokenCache";
 import { verifyPassword } from "@server/auth/password";
 import logger from "@server/logger";
 import config from "@server/lib/config";
@@ -103,14 +105,23 @@ export async function getRemoteExitNodeToken(
            );
        }

-        const resToken = generateSessionToken();
-        await createRemoteExitNodeSession(
-            resToken,
-            existingRemoteExitNode.remoteExitNodeId
+        // Return a cached token if one exists to prevent thundering herd on
+        // simultaneous restarts; falls back to creating a fresh session when
+        // Redis is unavailable or the cache has expired.
+        const resToken = await getOrCreateCachedToken(
+            `remote_exit_node:token_cache:${existingRemoteExitNode.remoteExitNodeId}`,
+            config.getRawConfig().server.secret!,
+            Math.floor(EXPIRES / 1000),
+            async () => {
+                const token = generateSessionToken();
+                await createRemoteExitNodeSession(
+                    token,
+                    existingRemoteExitNode.remoteExitNodeId
+                );
+                return token;
+            }
        );

-        // logger.debug(`Created RemoteExitNode token response: ${JSON.stringify(resToken)}`);
-
        return response<{ token: string }>(res, {
            data: {
                token: resToken
--- a/server/private/routers/ws/ws.ts
+++ b/server/private/routers/ws/ws.ts
@@ -19,17 +19,14 @@ import { Socket } from "net";
 import {
    Newt,
    newts,
-    NewtSession,
-    olms,
    Olm,
-    OlmSession,
+    olms,
    RemoteExitNode,
-    RemoteExitNodeSession,
    remoteExitNodes,
-    sites
 } from "@server/db";
 import { eq } from "drizzle-orm";
 import { db } from "@server/db";
+import { recordPing } from "@server/routers/newt/pingAccumulator";
 import { validateNewtSessionToken } from "@server/auth/sessions/newt";
 import { validateOlmSessionToken } from "@server/auth/sessions/olm";
 import logger from "@server/logger";
@@ -197,11 +194,7 @@ const connectedClients: Map<string, AuthenticatedWebSocket[]> = new Map();
 // Config version tracking map (local to this node, resets on server restart)
 const clientConfigVersions: Map<string, number> = new Map();

-// Tracks the last Unix timestamp (seconds) at which a ping was flushed to the
-// DB for a given siteId. Resets on server restart which is fine – the first
-// ping after startup will always write, re-establishing the online state.
-const lastPingDbWrite: Map<number, number> = new Map();
-const PING_DB_WRITE_INTERVAL = 45; // seconds
+

 // Recovery tracking
 let isRedisRecoveryInProgress = false;
@@ -853,32 +846,16 @@ const setupConnection = async (
        );
    });

-    // Handle WebSocket protocol-level pings from older newt clients that do
-    // not send application-level "newt/ping" messages. Update the site's
-    // online state and lastPing timestamp so the offline checker treats them
-    // the same as modern newt clients.
    if (clientType === "newt") {
        const newtClient = client as Newt;
-        ws.on("ping", async () => {
+        ws.on("ping", () => {
            if (!newtClient.siteId) return;
-            const now = Math.floor(Date.now() / 1000);
-            const lastWrite = lastPingDbWrite.get(newtClient.siteId) ?? 0;
-            if (now - lastWrite < PING_DB_WRITE_INTERVAL) return;
-            lastPingDbWrite.set(newtClient.siteId, now);
-            try {
-                await db
-                    .update(sites)
-                    .set({
-                        online: true,
-                        lastPing: now
-                    })
-                    .where(eq(sites.siteId, newtClient.siteId));
-            } catch (error) {
-                logger.error(
-                    "Error updating newt site online state on WS ping",
-                    { error }
-                );
-            }
+            // Record the ping in the accumulator instead of writing to the
+            // database on every WS ping frame. The accumulator flushes all
+            // pending pings in a single batched UPDATE every ~10s, which
+            // prevents connection pool exhaustion under load (especially
+            // with cross-region latency to the database).
+            recordPing(newtClient.siteId);
        });
    }

--- a/server/routers/client/listClients.ts
+++ b/server/routers/client/listClients.ts
@@ -70,7 +70,7 @@ async function getLatestOlmVersion(): Promise<string | null> {
        tags = tags.filter((version) => !version.name.includes("rc"));
        const latestVersion = tags[0].name;

-        olmVersionCache.set("latestOlmVersion", latestVersion);
+        olmVersionCache.set("latestOlmVersion", latestVersion, 3600);

        return latestVersion;
    } catch (error: any) {
--- a/server/routers/client/listUserDevices.ts
+++ b/server/routers/client/listUserDevices.ts
@@ -71,7 +71,7 @@ async function getLatestOlmVersion(): Promise<string | null> {
        tags = tags.filter((version) => !version.name.includes("rc"));
        const latestVersion = tags[0].name;

-        olmVersionCache.set("latestOlmVersion", latestVersion);
+        olmVersionCache.set("latestOlmVersion", latestVersion, 3600);

        return latestVersion;
    } catch (error: any) {
--- a/server/routers/newt/getNewtToken.ts
+++ b/server/routers/newt/getNewtToken.ts
@@ -1,6 +1,8 @@
 import { generateSessionToken } from "@server/auth/sessions/app";
-import { db } from "@server/db";
+import { db, newtSessions } from "@server/db";
 import { newts } from "@server/db";
+import { getOrCreateCachedToken } from "#dynamic/lib/tokenCache";
+import { EXPIRES } from "@server/auth/sessions/newt";
 import HttpCode from "@server/types/HttpCode";
 import response from "@server/lib/response";
 import { eq } from "drizzle-orm";
@@ -92,8 +94,19 @@ export async function getNewtToken(
            );
        }

-        const resToken = generateSessionToken();
-        await createNewtSession(resToken, existingNewt.newtId);
+        // Return a cached token if one exists to prevent thundering herd on
+        // simultaneous restarts; falls back to creating a fresh session when
+        // Redis is unavailable or the cache has expired.
+        const resToken = await getOrCreateCachedToken(
+            `newt:token_cache:${existingNewt.newtId}`,
+            config.getRawConfig().server.secret!,
+            Math.floor(EXPIRES / 1000),
+            async () => {
+                const token = generateSessionToken();
+                await createNewtSession(token, existingNewt.newtId);
+                return token;
+            }
+        );

        return response<{ token: string; serverVersion: string }>(res, {
            data: {
--- a/server/routers/newt/handleNewtPingMessage.ts
+++ b/server/routers/newt/handleNewtPingMessage.ts
@@ -5,6 +5,7 @@ import { Newt } from "@server/db";
 import { eq, lt, isNull, and, or } from "drizzle-orm";
 import logger from "@server/logger";
 import { sendNewtSyncMessage } from "./sync";
+import { recordPing } from "./pingAccumulator";

 // Track if the offline checker interval is running
 let offlineCheckerInterval: NodeJS.Timeout | null = null;
@@ -114,18 +115,12 @@ export const handleNewtPingMessage: MessageHandler = async (context) => {
        return;
    }

-    try {
-        // Mark the site as online and record the ping timestamp.
-        await db
-            .update(sites)
-            .set({
-                online: true,
-                lastPing: Math.floor(Date.now() / 1000)
-            })
-            .where(eq(sites.siteId, newt.siteId));
-    } catch (error) {
-        logger.error("Error updating online state on newt ping", { error });
-    }
+    // Record the ping in memory; it will be flushed to the database
+    // periodically by the ping accumulator (every ~10s) in a single
+    // batched UPDATE instead of one query per ping. This prevents
+    // connection pool exhaustion under load, especially with
+    // cross-region latency to the database.
+    recordPing(newt.siteId);

    // Check config version and sync if stale.
    const configVersion = await getClientConfigVersion(newt.newtId);
--- a/server/routers/newt/pingAccumulator.ts
+++ b/server/routers/newt/pingAccumulator.ts
@@ -0,0 +1,382 @@
+import { db } from "@server/db";
+import { sites, clients, olms } from "@server/db";
+import { eq, inArray } from "drizzle-orm";
+import logger from "@server/logger";
+
+/**
+ * Ping Accumulator
+ *
+ * Instead of writing to the database on every single newt/olm ping (which
+ * causes pool exhaustion under load, especially with cross-region latency),
+ * we accumulate pings in memory and flush them to the database periodically
+ * in a single batch.
+ *
+ * This is the same pattern used for bandwidth flushing in
+ * receiveBandwidth.ts and handleReceiveBandwidthMessage.ts.
+ *
+ * Supports two kinds of pings:
+ *   - **Site pings** (from newts): update `sites.online` and `sites.lastPing`
+ *   - **Client pings** (from OLMs): update `clients.online`, `clients.lastPing`,
+ *     `clients.archived`, and optionally reset `olms.archived`
+ */
+
+// How often the background timer flushes accumulated pings (10 seconds).
+const FLUSH_INTERVAL_MS = 10_000;
+// Number of retries withRetry makes after the first failed attempt.
+const MAX_RETRIES = 2;
+// Backoff before the first retry, in ms; doubled for each further retry.
+const BASE_DELAY_MS = 50;
+
+// ── Site (newt) pings ──────────────────────────────────────────────────
+// siteId -> most recent ping time (unix seconds) not yet written to the DB
+const pendingSitePings = new Map<number, number>();
+
+// ── Client (OLM) pings ────────────────────────────────────────────────
+// clientId -> most recent ping time (unix seconds) not yet written to the DB
+const pendingClientPings = new Map<number, number>();
+// olmIds whose `archived` flag must be set back to false on the next flush
+const pendingOlmArchiveResets = new Set<string>();
+
+// Handle of the running flush interval; null while the accumulator is stopped.
+let flushTimer: NodeJS.Timeout | null = null;
+
+// ── Public API ─────────────────────────────────────────────────────────
+
+/**
+ * Note that a newt site has just pinged.
+ *
+ * Nothing is written to the database here. The current time (unix seconds)
+ * is stored in the in-memory pending map, replacing any earlier entry for
+ * the same site, and is persisted later by the flush routine.
+ *
+ * @param siteId - ID of the site that pinged.
+ */
+export function recordSitePing(siteId: number): void {
+    pendingSitePings.set(siteId, Math.floor(Date.now() / 1000));
+}
+
+/** @deprecated Use `recordSitePing` instead. Alias kept for existing call-sites. */
+export const recordPing = recordSitePing;
+
+/**
+ * Note that an OLM client has just pinged.
+ *
+ * The current time (unix seconds) is stored for the client so the next flush
+ * can update its `clients` row (`online`, `lastPing`, `archived`). If the OLM
+ * is currently archived, its ID is also queued so the flush clears
+ * `olms.archived`.
+ *
+ * @param clientId - ID of the client the OLM belongs to.
+ * @param olmId - ID of the OLM that sent the ping.
+ * @param olmArchived - Whether the OLM is currently flagged as archived.
+ */
+export function recordClientPing(
+    clientId: number,
+    olmId: string,
+    olmArchived: boolean
+): void {
+    pendingClientPings.set(clientId, Math.floor(Date.now() / 1000));
+
+    if (!olmArchived) {
+        return;
+    }
+    pendingOlmArchiveResets.add(olmId);
+}
+
+// ── Flush Logic ────────────────────────────────────────────────────────
+
+/**
+ * Flush all accumulated site pings to the database.
+ *
+ * The pending map is snapshotted and cleared up front, then written in
+ * batches of 50 sites. Each batch goes through `withRetry`; if it still
+ * fails, its entries are put back into the pending map (never overwriting a
+ * newer ping recorded in the meantime) so the next cycle retries them.
+ * Failures are logged, not thrown.
+ */
+async function flushSitePingsToDb(): Promise<void> {
+    if (pendingSitePings.size === 0) {
+        return;
+    }
+
+    // Snapshot and clear so new pings arriving during the flush go into a
+    // fresh map for the next cycle.
+    const pingsToFlush = new Map(pendingSitePings);
+    pendingSitePings.clear();
+
+    // Sort by siteId so batches are deterministic.
+    // NOTE(review): within a batch the rows are regrouped by timestamp, so
+    // the UPDATE statements inside a transaction are not issued in strict
+    // siteId order. Deadlocks (40P01) are treated as transient and retried.
+    const sortedEntries = Array.from(pingsToFlush.entries()).sort(
+        ([a], [b]) => a - b
+    );
+
+    const BATCH_SIZE = 50;
+    for (let i = 0; i < sortedEntries.length; i += BATCH_SIZE) {
+        const batch = sortedEntries.slice(i, i + BATCH_SIZE);
+
+        // Group by timestamp for efficient bulk updates. Built once per
+        // batch rather than on every retry attempt.
+        const byTimestamp = new Map<number, number[]>();
+        for (const [siteId, timestamp] of batch) {
+            const group = byTimestamp.get(timestamp);
+            if (group) {
+                group.push(siteId);
+            } else {
+                byTimestamp.set(timestamp, [siteId]);
+            }
+        }
+        const groups = Array.from(byTimestamp.entries());
+
+        try {
+            await withRetry(async () => {
+                if (groups.length === 1) {
+                    // One timestamp for the whole batch: a single UPDATE,
+                    // no transaction needed.
+                    const [timestamp, siteIds] = groups[0];
+                    await db
+                        .update(sites)
+                        .set({
+                            online: true,
+                            lastPing: timestamp
+                        })
+                        .where(inArray(sites.siteId, siteIds));
+                } else {
+                    // Several timestamps: one UPDATE per timestamp, applied
+                    // atomically for the batch.
+                    await db.transaction(async (tx) => {
+                        for (const [timestamp, siteIds] of groups) {
+                            await tx
+                                .update(sites)
+                                .set({
+                                    online: true,
+                                    lastPing: timestamp
+                                })
+                                .where(inArray(sites.siteId, siteIds));
+                        }
+                    });
+                }
+            }, "flushSitePingsToDb");
+        } catch (error) {
+            logger.error(
+                `Failed to flush site ping batch (${batch.length} sites), re-queuing for next cycle`,
+                { error }
+            );
+            // Re-queue, but keep any newer ping recorded during the flush.
+            for (const [siteId, timestamp] of batch) {
+                const existing = pendingSitePings.get(siteId);
+                if (existing === undefined || existing < timestamp) {
+                    pendingSitePings.set(siteId, timestamp);
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Flush all accumulated client (OLM) pings to the database.
+ *
+ * Two kinds of pending work are snapshotted and cleared up front:
+ *   1. client pings: set `clients.lastPing`, `online = true` and
+ *      `archived = false`;
+ *   2. OLM archive resets: set `olms.archived = false`.
+ *
+ * Both are written in batches of 50 through `withRetry`. A batch that still
+ * fails is logged and put back into its pending collection for the next
+ * cycle; nothing is thrown to the caller.
+ */
+async function flushClientPingsToDb(): Promise<void> {
+    if (pendingClientPings.size === 0 && pendingOlmArchiveResets.size === 0) {
+        return;
+    }
+
+    // Snapshot and clear so pings recorded during the flush land in the
+    // next cycle.
+    const pingsToFlush = new Map(pendingClientPings);
+    pendingClientPings.clear();
+
+    const olmResetsToFlush = new Set(pendingOlmArchiveResets);
+    pendingOlmArchiveResets.clear();
+
+    const BATCH_SIZE = 50;
+
+    // ── Flush client pings ─────────────────────────────────────────────
+    // Sorted by clientId so batches are deterministic.
+    // NOTE(review): rows are regrouped by timestamp inside a batch, so the
+    // statements are not issued in strict clientId order. Deadlocks are
+    // treated as transient and retried.
+    const sortedEntries = Array.from(pingsToFlush.entries()).sort(
+        ([a], [b]) => a - b
+    );
+
+    for (let i = 0; i < sortedEntries.length; i += BATCH_SIZE) {
+        const batch = sortedEntries.slice(i, i + BATCH_SIZE);
+
+        // Group by timestamp once per batch rather than on every retry.
+        const byTimestamp = new Map<number, number[]>();
+        for (const [clientId, timestamp] of batch) {
+            const group = byTimestamp.get(timestamp);
+            if (group) {
+                group.push(clientId);
+            } else {
+                byTimestamp.set(timestamp, [clientId]);
+            }
+        }
+        const groups = Array.from(byTimestamp.entries());
+
+        try {
+            await withRetry(async () => {
+                if (groups.length === 1) {
+                    // One timestamp for the whole batch: a single UPDATE.
+                    const [timestamp, clientIds] = groups[0];
+                    await db
+                        .update(clients)
+                        .set({
+                            lastPing: timestamp,
+                            online: true,
+                            archived: false
+                        })
+                        .where(inArray(clients.clientId, clientIds));
+                } else {
+                    // Several timestamps: one UPDATE per timestamp, applied
+                    // atomically for the batch.
+                    await db.transaction(async (tx) => {
+                        for (const [timestamp, clientIds] of groups) {
+                            await tx
+                                .update(clients)
+                                .set({
+                                    lastPing: timestamp,
+                                    online: true,
+                                    archived: false
+                                })
+                                .where(inArray(clients.clientId, clientIds));
+                        }
+                    });
+                }
+            }, "flushClientPingsToDb");
+        } catch (error) {
+            logger.error(
+                `Failed to flush client ping batch (${batch.length} clients), re-queuing for next cycle`,
+                { error }
+            );
+            // Re-queue, but keep any newer ping recorded during the flush.
+            for (const [clientId, timestamp] of batch) {
+                const existing = pendingClientPings.get(clientId);
+                if (existing === undefined || existing < timestamp) {
+                    pendingClientPings.set(clientId, timestamp);
+                }
+            }
+        }
+    }
+
+    // ── Flush OLM archive resets ───────────────────────────────────────
+    const olmIds = Array.from(olmResetsToFlush).sort();
+
+    for (let i = 0; i < olmIds.length; i += BATCH_SIZE) {
+        const batch = olmIds.slice(i, i + BATCH_SIZE);
+
+        try {
+            await withRetry(async () => {
+                await db
+                    .update(olms)
+                    .set({ archived: false })
+                    .where(inArray(olms.olmId, batch));
+            }, "flushOlmArchiveResets");
+        } catch (error) {
+            logger.error(
+                `Failed to flush OLM archive reset batch (${batch.length} olms), re-queuing for next cycle`,
+                { error }
+            );
+            for (const olmId of batch) {
+                pendingOlmArchiveResets.add(olmId);
+            }
+        }
+    }
+}
+
+/**
+ * Persist every pending ping: site pings first, then client pings and OLM
+ * archive resets. Invoked by the interval timer and once more on shutdown.
+ */
+export async function flushPingsToDb(): Promise<void> {
+    // Stages run one after another; each finishes before the next starts.
+    for (const flushStage of [flushSitePingsToDb, flushClientPingsToDb]) {
+        await flushStage();
+    }
+}
+
+// ── Retry / Error Helpers ──────────────────────────────────────────────
+
+/**
+ * Run `operation`, retrying it when it fails with a transient error.
+ *
+ * At most `MAX_RETRIES` retries follow the first attempt. The wait before
+ * retry k (1-based) is `BASE_DELAY_MS * 2^(k-1)` plus random jitter of up to
+ * the same amount. Non-transient errors, and the error from the last allowed
+ * attempt, are rethrown unchanged.
+ *
+ * @param operation - The async work to run.
+ * @param context - Label included in the retry warning log.
+ * @returns Whatever `operation` resolves to.
+ */
+async function withRetry<T>(
+    operation: () => Promise<T>,
+    context: string
+): Promise<T> {
+    for (let failures = 0; ; failures++) {
+        try {
+            return await operation();
+        } catch (error) {
+            // Give up once retries are exhausted or the error is permanent.
+            if (failures >= MAX_RETRIES || !isTransientError(error)) {
+                throw error;
+            }
+
+            const retryNumber = failures + 1;
+            const backoff = BASE_DELAY_MS * 2 ** failures;
+            const delay = backoff + Math.random() * backoff;
+            logger.warn(
+                `Transient DB error in ${context}, retrying attempt ${retryNumber}/${MAX_RETRIES} after ${delay.toFixed(0)}ms`
+            );
+            await new Promise((resolve) => setTimeout(resolve, delay));
+        }
+    }
+}
+
+/**
+ * Decide whether a database error is a transient one that is safe to retry.
+ *
+ * An error counts as transient when any of the following holds:
+ *   - its message (case-insensitive) mentions a connection timeout or
+ *     termination, or its `cause` message does;
+ *   - it is a PostgreSQL deadlock (code `40P01` or "deadlock" in the message);
+ *   - its code is one of ECONNRESET, ECONNREFUSED, EPIPE or ETIMEDOUT.
+ *
+ * @param error - The caught value; falsy values are never transient.
+ */
+function isTransientError(error: any): boolean {
+    if (!error) return false;
+
+    const message = (error.message || "").toLowerCase();
+    const causeMessage = (error.cause?.message || "").toLowerCase();
+    const code = error.code || "";
+
+    // Connection timeout / terminated. The top-level message and the cause
+    // message are matched against different marker lists.
+    const messageMarkers = [
+        "connection timeout",
+        "connection terminated",
+        "timeout exceeded when trying to connect"
+    ];
+    const causeMarkers = [
+        "connection terminated unexpectedly",
+        "connection timeout"
+    ];
+    const connectionLost =
+        messageMarkers.some((marker) => message.includes(marker)) ||
+        causeMarkers.some((marker) => causeMessage.includes(marker));
+
+    // PostgreSQL deadlock
+    const deadlocked = code === "40P01" || message.includes("deadlock");
+
+    // Socket-level failures
+    const socketFailure = [
+        "ECONNRESET",
+        "ECONNREFUSED",
+        "EPIPE",
+        "ETIMEDOUT"
+    ].includes(code);
+
+    return connectionLost || deadlocked || socketFailure;
+}
+
+// ── Lifecycle ──────────────────────────────────────────────────────────
+
+/**
+ * Start the periodic background flush. Intended to be called once at server
+ * startup; calling it again while the timer is running does nothing.
+ */
+export function startPingAccumulator(): void {
+    if (flushTimer !== null) {
+        return; // Already running
+    }
+
+    // One timer tick: flush, and log (rather than propagate) any failure.
+    const runScheduledFlush = (): void => {
+        void flushPingsToDb().catch((error) => {
+            logger.error("Unhandled error in ping accumulator flush", {
+                error
+            });
+        });
+    };
+
+    flushTimer = setInterval(runScheduledFlush, FLUSH_INTERVAL_MS);
+
+    // unref() so this timer alone does not keep the process alive.
+    flushTimer.unref();
+
+    logger.info(
+        `Ping accumulator started (flush interval: ${FLUSH_INTERVAL_MS}ms)`
+    );
+}
+
+/**
+ * Stop the periodic flush and write out whatever is still pending.
+ * Intended for graceful shutdown. A failure of the final flush is logged,
+ * not thrown.
+ */
+export async function stopPingAccumulator(): Promise<void> {
+    if (flushTimer !== null) {
+        clearInterval(flushTimer);
+        flushTimer = null;
+    }
+
+    // Final flush to persist any remaining pings
+    await flushPingsToDb().catch((error) => {
+        logger.error("Error during final ping accumulator flush", { error });
+    });
+
+    logger.info("Ping accumulator stopped");
+}
+
+/**
+ * Report how many site and client pings are waiting to be flushed.
+ * Queued OLM archive resets are not included in the count.
+ *
+ * @returns Pending site pings plus pending client pings.
+ */
+export function getPendingPingCount(): number {
+    const sitePings = pendingSitePings.size;
+    const clientPings = pendingClientPings.size;
+    return sitePings + clientPings;
+}
--- a/server/routers/olm/getOlmToken.ts
+++ b/server/routers/olm/getOlmToken.ts
@@ -8,7 +8,7 @@ import {
    ExitNode,
    exitNodes,
    sites,
-    clientSitesAssociationsCache
+    clientSitesAssociationsCache,
 } from "@server/db";
 import { olms } from "@server/db";
 import HttpCode from "@server/types/HttpCode";
@@ -20,8 +20,10 @@ import { z } from "zod";
 import { fromError } from "zod-validation-error";
 import {
    createOlmSession,
-    validateOlmSessionToken
+    validateOlmSessionToken,
+    EXPIRES
 } from "@server/auth/sessions/olm";
+import { getOrCreateCachedToken } from "#dynamic/lib/tokenCache";
 import { verifyPassword } from "@server/auth/password";
 import logger from "@server/logger";
 import config from "@server/lib/config";
@@ -132,8 +134,19 @@ export async function getOlmToken(

        logger.debug("Creating new olm session token");

-        const resToken = generateSessionToken();
-        await createOlmSession(resToken, existingOlm.olmId);
+        // Return a cached token if one exists to prevent thundering herd on
+        // simultaneous restarts; falls back to creating a fresh session when
+        // Redis is unavailable or the cache has expired.
+        const resToken = await getOrCreateCachedToken(
+            `olm:token_cache:${existingOlm.olmId}`,
+            config.getRawConfig().server.secret!,
+            Math.floor(EXPIRES / 1000),
+            async () => {
+                const token = generateSessionToken();
+                await createOlmSession(token, existingOlm.olmId);
+                return token;
+            }
+        );

        let clientIdToUse;
        if (orgId) {
--- a/server/routers/olm/handleOlmPingMessage.ts
+++ b/server/routers/olm/handleOlmPingMessage.ts
@@ -3,6 +3,7 @@ import { db } from "@server/db";
 import { MessageHandler } from "@server/routers/ws";
 import { clients, olms, Olm } from "@server/db";
 import { eq, lt, isNull, and, or } from "drizzle-orm";
+import { recordClientPing } from "@server/routers/newt/pingAccumulator";
 import logger from "@server/logger";
 import { validateSessionToken } from "@server/auth/sessions/app";
 import { checkOrgAccessPolicy } from "#dynamic/lib/checkOrgAccessPolicy";
@@ -201,22 +202,12 @@ export const handleOlmPingMessage: MessageHandler = async (context) => {
            await sendOlmSyncMessage(olm, client);
        }

-        // Update the client's last ping timestamp
-        await db
-            .update(clients)
-            .set({
-                lastPing: Math.floor(Date.now() / 1000),
-                online: true,
-                archived: false
-            })
-            .where(eq(clients.clientId, olm.clientId));
-
-        if (olm.archived) {
-            await db
-                .update(olms)
-                .set({ archived: false })
-                .where(eq(olms.olmId, olm.olmId));
-        }
+        // Record the ping in memory; it will be flushed to the database
+        // periodically by the ping accumulator (every ~10s) in a single
+        // batched UPDATE instead of one query per ping. This prevents
+        // connection pool exhaustion under load, especially with
+        // cross-region latency to the database.
+        recordClientPing(olm.clientId, olm.olmId, !!olm.archived);
    } catch (error) {
        logger.error("Error handling ping message", { error });
    }
--- a/server/routers/site/listSites.ts
+++ b/server/routers/site/listSites.ts
@@ -55,7 +55,7 @@ async function getLatestNewtVersion(): Promise<string | null> {
        tags = tags.filter((version) => !version.name.includes("rc"));
        const latestVersion = tags[0].name;

-        await cache.set("latestNewtVersion", latestVersion);
+        await cache.set("latestNewtVersion", latestVersion, 3600);

        return latestVersion;
    } catch (error: any) {
@@ -180,7 +180,7 @@ registry.registerPath({
    method: "get",
    path: "/org/{orgId}/sites",
    description: "List all sites in an organization",
-    tags: [OpenAPITags.Site],
+    tags: [OpenAPITags.Org, OpenAPITags.Site],
    request: {
        params: listSitesParamsSchema,
        query: listSitesSchema
--- a/server/routers/user/inviteUser.ts
+++ b/server/routers/user/inviteUser.ts
@@ -201,7 +201,7 @@ export async function inviteUser(
                );
            }

-            await cache.set(email, attempts + 1);
+            await cache.set("regenerateInvite:" + email, attempts + 1, 3600);

            const inviteId = existingInvite[0].inviteId; // Retrieve the original inviteId
            const token = generateRandomString(
--- a/server/routers/ws/messageHandlers.ts
+++ b/server/routers/ws/messageHandlers.ts
@@ -11,6 +11,7 @@ import {
    startNewtOfflineChecker,
    handleNewtDisconnectingMessage
 } from "../newt";
+import { startPingAccumulator } from "../newt/pingAccumulator";
 import {
    handleOlmRegisterMessage,
    handleOlmRelayMessage,
@@ -46,6 +47,10 @@ export const messageHandlers: Record<string, MessageHandler> = {
    "ws/round-trip/complete": handleRoundTripMessage
 };

+// Start the ping accumulator for all builds — it batches per-site online/lastPing
+// updates into periodic bulk writes, preventing connection pool exhaustion.
+startPingAccumulator();
+
 if (build != "saas") {
    startOlmOfflineChecker(); // this is to handle the offline check for olms
    startNewtOfflineChecker(); // this is to handle the offline check for newts
--- a/server/routers/ws/ws.ts
+++ b/server/routers/ws/ws.ts
@@ -6,6 +6,7 @@ import { Socket } from "net";
 import { Newt, newts, NewtSession, olms, Olm, OlmSession, sites } from "@server/db";
 import { eq } from "drizzle-orm";
 import { db } from "@server/db";
+import { recordPing } from "@server/routers/newt/pingAccumulator";
 import { validateNewtSessionToken } from "@server/auth/sessions/newt";
 import { validateOlmSessionToken } from "@server/auth/sessions/olm";
 import { messageHandlers } from "./messageHandlers";
@@ -386,22 +387,14 @@ const setupConnection = async (
    // the same as modern newt clients.
    if (clientType === "newt") {
        const newtClient = client as Newt;
-        ws.on("ping", async () => {
+        ws.on("ping", () => {
            if (!newtClient.siteId) return;
-            try {
-                await db
-                    .update(sites)
-                    .set({
-                        online: true,
-                        lastPing: Math.floor(Date.now() / 1000)
-                    })
-                    .where(eq(sites.siteId, newtClient.siteId));
-            } catch (error) {
-                logger.error(
-                    "Error updating newt site online state on WS ping",
-                    { error }
-                );
-            }
+            // Record the ping in the accumulator instead of writing to the
+            // database on every WS ping frame. The accumulator flushes all
+            // pending pings in a single batched UPDATE every ~10s, which
+            // prevents connection pool exhaustion under load (especially
+            // with cross-region latency to the database).
+            recordPing(newtClient.siteId);
        });
    }

--- a/src/app/[orgId]/settings/(private)/idp/create/page.tsx
+++ b/src/app/[orgId]/settings/(private)/idp/create/page.tsx
@@ -275,6 +275,8 @@ export default function Page() {
        }
    }

+    const disabled = !isPaidUser(tierMatrix.orgOidc);
+
    return (
        <>
            <div className="flex justify-between">
@@ -292,6 +294,9 @@ export default function Page() {
                </Button>
            </div>

+            <PaidFeaturesAlert tiers={tierMatrix.orgOidc} />
+
+            <fieldset disabled={disabled} className={disabled ? "opacity-50 pointer-events-none" : ""}>
            <SettingsContainer>
                <SettingsSection>
                    <SettingsSectionHeader>
@@ -812,9 +817,10 @@ export default function Page() {
                </Button>
                <Button
                    type="submit"
-                    disabled={createLoading || !isPaidUser(tierMatrix.orgOidc)}
+                    disabled={createLoading || disabled}
                    loading={createLoading}
                    onClick={() => {
+                        if (disabled) return;
                        // log any issues with the form
                        console.log(form.formState.errors);
                        form.handleSubmit(onSubmit)();
@@ -823,6 +829,7 @@ export default function Page() {
                    {t("idpSubmit")}
                </Button>
            </div>
+            </fieldset>
        </>
    );
 }
--- a/src/components/OrgSelector.tsx
+++ b/src/components/OrgSelector.tsx
@@ -29,6 +29,7 @@ import { usePathname, useRouter } from "next/navigation";
 import { useMemo, useState } from "react";
 import { useUserContext } from "@app/hooks/useUserContext";
 import { useTranslations } from "next-intl";
+import { build } from "@server/build";

 interface OrgSelectorProps {
    orgId?: string;
@@ -50,6 +51,11 @@ export function OrgSelector({

    const selectedOrg = orgs?.find((org) => org.orgId === orgId);

+    let canCreateOrg = !env.flags.disableUserCreateOrg || user.serverAdmin;
+    if (build === "saas" && user.type !== "internal") {
+        canCreateOrg = false;
+    }
+
    const sortedOrgs = useMemo(() => {
        if (!orgs?.length) return orgs ?? [];
        return [...orgs].sort((a, b) => {
@@ -161,7 +167,7 @@ export function OrgSelector({
                        </CommandGroup>
                    </CommandList>
                </Command>
-                {(!env.flags.disableUserCreateOrg || user.serverAdmin) && (
+                {canCreateOrg && (
                    <div className="p-2 border-t border-border">
                        <Button
                            variant="ghost"
Author	SHA1	Message	Date
Owen Schwartz	62c63ddcaa	Merge pull request #2710 from fosrl/thundering-herd thundering herd	2026-03-24 20:29:01 -07:00
Owen	dfd604c781	Fix import problems	2026-03-24 20:27:34 -07:00
Owen	c96c5e8ae8	Cache token for thundering hurd	2026-03-24 18:12:51 -07:00
Owen	6f71e9f0f2	Clean up	2026-03-24 17:55:14 -07:00
Owen	d17ec6dc1f	Try to solve th problem	2026-03-24 17:39:43 -07:00
Owen Schwartz	c36a019f5d	Merge pull request #2709 from fosrl/pool-update Update pool and disable idp	2026-03-24 16:48:28 -07:00
Owen	cf2dfdea5b	Add better pooling controls	2026-03-24 16:38:50 -07:00
Owen	985e1bb9ab	Disable everything if not paid	2026-03-24 16:38:46 -07:00
Owen Schwartz	85335bfecc	Merge pull request #2685 from fosrl/dev 1.16.2-s.16	2026-03-21 10:47:18 -07:00
Owen	7c2b4f422a	Merge branch 'main' into dev	2026-03-21 10:45:13 -07:00
Owen	ad2a0ae127	Use the log database in hybrid as well	2026-03-21 10:42:31 -07:00
miloschwartz	6c2c620c99	set cache ttl and default ttl	2026-03-20 17:52:07 -07:00
miloschwartz	f643abf19a	dont show create org for oidc users	2026-03-20 16:04:00 -07:00