seperate out the offline checker logic

This commit is contained in:
Owen
2026-04-15 16:40:04 -07:00
parent ad15b7c3c6
commit 1d4b2b1da1
9 changed files with 362 additions and 335 deletions

View File

@@ -1,184 +1,11 @@
import { db, newts, sites, targetHealthCheck, targets } from "@server/db";
import {
hasActiveConnections,
getClientConfigVersion
} from "#dynamic/routers/ws";
import { db, sites } from "@server/db";
import { getClientConfigVersion } from "#dynamic/routers/ws";
import { MessageHandler } from "@server/routers/ws";
import { Newt } from "@server/db";
import { eq, lt, isNull, and, or, ne, not } from "drizzle-orm";
import { eq } from "drizzle-orm";
import logger from "@server/logger";
import { sendNewtSyncMessage } from "./sync";
import { recordPing } from "./pingAccumulator";
import { fireSiteOfflineAlert } from "#dynamic/lib/alerts";
// Track if the offline checker interval is running
let offlineCheckerInterval: NodeJS.Timeout | null = null;
const OFFLINE_CHECK_INTERVAL = 30 * 1000; // Check every 30 seconds
const OFFLINE_THRESHOLD_MS = 2 * 60 * 1000; // 2 minutes
const OFFLINE_THRESHOLD_BANDWIDTH_MS = 8 * 60 * 1000; // 8 minutes
/**
* Starts the background interval that checks for newt sites that haven't
* pinged recently and marks them as offline. For backward compatibility,
* a site is only marked offline when there is no active WebSocket connection
* either — so older newt versions that don't send pings but remain connected
* continue to be treated as online.
*/
export const startNewtOfflineChecker = (): void => {
if (offlineCheckerInterval) {
return; // Already running
}
offlineCheckerInterval = setInterval(async () => {
try {
const twoMinutesAgo = Math.floor(
(Date.now() - OFFLINE_THRESHOLD_MS) / 1000
);
// Find all online newt-type sites that haven't pinged recently
// (or have never pinged at all). Join newts to obtain the newtId
// needed for the WebSocket connection check.
const staleSites = await db
.select({
siteId: sites.siteId,
orgId: sites.orgId,
name: sites.name,
newtId: newts.newtId,
lastPing: sites.lastPing
})
.from(sites)
.innerJoin(newts, eq(newts.siteId, sites.siteId))
.where(
and(
eq(sites.online, true),
eq(sites.type, "newt"),
or(
lt(sites.lastPing, twoMinutesAgo),
isNull(sites.lastPing)
)
)
);
for (const staleSite of staleSites) {
// Backward-compatibility check: if the newt still has an
// active WebSocket connection (older clients that don't send
// pings), keep the site online.
const isConnected = await hasActiveConnections(
staleSite.newtId
);
if (isConnected) {
logger.debug(
`Newt ${staleSite.newtId} has not pinged recently but is still connected via WebSocket — keeping site ${staleSite.siteId} online`
);
continue;
}
logger.info(
`Marking site ${staleSite.siteId} offline: newt ${staleSite.newtId} has no recent ping and no active WebSocket connection`
);
await db
.update(sites)
.set({ online: false })
.where(eq(sites.siteId, staleSite.siteId));
const healthChecksOnSite = await db
.select()
.from(targetHealthCheck)
.innerJoin(
targets,
eq(targets.targetId, targetHealthCheck.targetId)
)
.innerJoin(sites, eq(sites.siteId, targets.siteId))
.where(eq(sites.siteId, staleSite.siteId));
for (const healthCheck of healthChecksOnSite) {
logger.info(
`Marking health check ${healthCheck.targetHealthCheck.targetHealthCheckId} offline due to site ${staleSite.siteId} being marked offline`
);
await db
.update(targetHealthCheck)
.set({ hcHealth: "unknown" })
.where(
eq(
targetHealthCheck.targetHealthCheckId,
healthCheck.targetHealthCheck
.targetHealthCheckId
)
);
}
await fireSiteOfflineAlert(staleSite.orgId, staleSite.siteId, staleSite.name);
}
// this part only effects self hosted. Its not efficient but we dont expect people to have very many wireguard sites
// select all of the wireguard sites to evaluate if they need to be offline due to the last bandwidth update
const allWireguardSites = await db
.select({
siteId: sites.siteId,
online: sites.online,
lastBandwidthUpdate: sites.lastBandwidthUpdate
})
.from(sites)
.where(
and(
eq(sites.type, "wireguard"),
not(isNull(sites.lastBandwidthUpdate))
)
);
const wireguardOfflineThreshold = Math.floor(
(Date.now() - OFFLINE_THRESHOLD_BANDWIDTH_MS) / 1000
);
// loop over each one. If its offline and there is a new update then mark it online. If its online and there is no update then mark it offline
for (const site of allWireguardSites) {
const lastBandwidthUpdate =
new Date(site.lastBandwidthUpdate!).getTime() / 1000;
if (
lastBandwidthUpdate < wireguardOfflineThreshold &&
site.online
) {
logger.info(
`Marking wireguard site ${site.siteId} offline: no bandwidth update in over ${OFFLINE_THRESHOLD_BANDWIDTH_MS / 60000} minutes`
);
await db
.update(sites)
.set({ online: false })
.where(eq(sites.siteId, site.siteId));
} else if (
lastBandwidthUpdate >= wireguardOfflineThreshold &&
!site.online
) {
logger.info(
`Marking wireguard site ${site.siteId} online: recent bandwidth update`
);
await db
.update(sites)
.set({ online: true })
.where(eq(sites.siteId, site.siteId));
}
}
} catch (error) {
logger.error("Error in newt offline checker interval", { error });
}
}, OFFLINE_CHECK_INTERVAL);
logger.debug("Started newt offline checker interval");
};
/**
* Stops the background interval that checks for offline newt sites.
*/
export const stopNewtOfflineChecker = (): void => {
if (offlineCheckerInterval) {
clearInterval(offlineCheckerInterval);
offlineCheckerInterval = null;
logger.info("Stopped newt offline checker interval");
}
};
/**
* Handles ping messages from newt clients.

View File

@@ -11,3 +11,4 @@ export * from "./handleNewtDisconnectingMessage";
export * from "./handleConnectionLogMessage";
export * from "./handleRequestLogMessage";
export * from "./registerNewt";
export * from "./offlineChecker";

View File

@@ -0,0 +1,176 @@
import { db, newts, sites, targetHealthCheck, targets } from "@server/db";
import {
hasActiveConnections,
} from "#dynamic/routers/ws";
import { eq, lt, isNull, and, or, ne, not } from "drizzle-orm";
import logger from "@server/logger";
import { fireSiteOfflineAlert } from "#dynamic/lib/alerts";
// Track if the offline checker interval is running
let offlineCheckerInterval: NodeJS.Timeout | null = null;
const OFFLINE_CHECK_INTERVAL = 30 * 1000; // Check every 30 seconds
const OFFLINE_THRESHOLD_MS = 2 * 60 * 1000; // 2 minutes
const OFFLINE_THRESHOLD_BANDWIDTH_MS = 8 * 60 * 1000; // 8 minutes
/**
* Starts the background interval that checks for newt sites that haven't
* pinged recently and marks them as offline. For backward compatibility,
* a site is only marked offline when there is no active WebSocket connection
* either — so older newt versions that don't send pings but remain connected
* continue to be treated as online.
*/
export const startNewtOfflineChecker = (): void => {
if (offlineCheckerInterval) {
return; // Already running
}
offlineCheckerInterval = setInterval(async () => {
try {
const twoMinutesAgo = Math.floor(
(Date.now() - OFFLINE_THRESHOLD_MS) / 1000
);
// Find all online newt-type sites that haven't pinged recently
// (or have never pinged at all). Join newts to obtain the newtId
// needed for the WebSocket connection check.
const staleSites = await db
.select({
siteId: sites.siteId,
orgId: sites.orgId,
name: sites.name,
newtId: newts.newtId,
lastPing: sites.lastPing
})
.from(sites)
.innerJoin(newts, eq(newts.siteId, sites.siteId))
.where(
and(
eq(sites.online, true),
eq(sites.type, "newt"),
or(
lt(sites.lastPing, twoMinutesAgo),
isNull(sites.lastPing)
)
)
);
for (const staleSite of staleSites) {
// Backward-compatibility check: if the newt still has an
// active WebSocket connection (older clients that don't send
// pings), keep the site online.
const isConnected = await hasActiveConnections(
staleSite.newtId
);
if (isConnected) {
logger.debug(
`Newt ${staleSite.newtId} has not pinged recently but is still connected via WebSocket — keeping site ${staleSite.siteId} online`
);
continue;
}
logger.info(
`Marking site ${staleSite.siteId} offline: newt ${staleSite.newtId} has no recent ping and no active WebSocket connection`
);
await db
.update(sites)
.set({ online: false })
.where(eq(sites.siteId, staleSite.siteId));
const healthChecksOnSite = await db
.select()
.from(targetHealthCheck)
.innerJoin(
targets,
eq(targets.targetId, targetHealthCheck.targetId)
)
.innerJoin(sites, eq(sites.siteId, targets.siteId))
.where(eq(sites.siteId, staleSite.siteId));
for (const healthCheck of healthChecksOnSite) {
logger.info(
`Marking health check ${healthCheck.targetHealthCheck.targetHealthCheckId} offline due to site ${staleSite.siteId} being marked offline`
);
await db
.update(targetHealthCheck)
.set({ hcHealth: "unknown" })
.where(
eq(
targetHealthCheck.targetHealthCheckId,
healthCheck.targetHealthCheck
.targetHealthCheckId
)
);
}
await fireSiteOfflineAlert(staleSite.orgId, staleSite.siteId, staleSite.name);
}
// this part only effects self hosted. Its not efficient but we dont expect people to have very many wireguard sites
// select all of the wireguard sites to evaluate if they need to be offline due to the last bandwidth update
const allWireguardSites = await db
.select({
siteId: sites.siteId,
online: sites.online,
lastBandwidthUpdate: sites.lastBandwidthUpdate
})
.from(sites)
.where(
and(
eq(sites.type, "wireguard"),
not(isNull(sites.lastBandwidthUpdate))
)
);
const wireguardOfflineThreshold = Math.floor(
(Date.now() - OFFLINE_THRESHOLD_BANDWIDTH_MS) / 1000
);
// loop over each one. If its offline and there is a new update then mark it online. If its online and there is no update then mark it offline
for (const site of allWireguardSites) {
const lastBandwidthUpdate =
new Date(site.lastBandwidthUpdate!).getTime() / 1000;
if (
lastBandwidthUpdate < wireguardOfflineThreshold &&
site.online
) {
logger.info(
`Marking wireguard site ${site.siteId} offline: no bandwidth update in over ${OFFLINE_THRESHOLD_BANDWIDTH_MS / 60000} minutes`
);
await db
.update(sites)
.set({ online: false })
.where(eq(sites.siteId, site.siteId));
} else if (
lastBandwidthUpdate >= wireguardOfflineThreshold &&
!site.online
) {
logger.info(
`Marking wireguard site ${site.siteId} online: recent bandwidth update`
);
await db
.update(sites)
.set({ online: true })
.where(eq(sites.siteId, site.siteId));
}
}
} catch (error) {
logger.error("Error in newt offline checker interval", { error });
}
}, OFFLINE_CHECK_INTERVAL);
logger.debug("Started newt offline checker interval");
};
/**
* Stops the background interval that checks for offline newt sites.
*/
export const stopNewtOfflineChecker = (): void => {
if (offlineCheckerInterval) {
clearInterval(offlineCheckerInterval);
offlineCheckerInterval = null;
logger.info("Stopped newt offline checker interval");
}
};

View File

@@ -1,104 +1,17 @@
import { disconnectClient, getClientConfigVersion } from "#dynamic/routers/ws";
import { getClientConfigVersion } from "#dynamic/routers/ws";
import { db } from "@server/db";
import { MessageHandler } from "@server/routers/ws";
import { clients, olms, Olm } from "@server/db";
import { eq, lt, isNull, and, or } from "drizzle-orm";
import { clients, Olm } from "@server/db";
import { eq } from "drizzle-orm";
import { recordClientPing } from "@server/routers/newt/pingAccumulator";
import logger from "@server/logger";
import { validateSessionToken } from "@server/auth/sessions/app";
import { checkOrgAccessPolicy } from "#dynamic/lib/checkOrgAccessPolicy";
import { sendTerminateClient } from "../client/terminate";
import { encodeHexLowerCase } from "@oslojs/encoding";
import { sha256 } from "@oslojs/crypto/sha2";
import { sendOlmSyncMessage } from "./sync";
import { OlmErrorCodes } from "./error";
import { handleFingerprintInsertion } from "./fingerprintingUtils";
// Track if the offline checker interval is running
let offlineCheckerInterval: NodeJS.Timeout | null = null;
const OFFLINE_CHECK_INTERVAL = 30 * 1000; // Check every 30 seconds
const OFFLINE_THRESHOLD_MS = 2 * 60 * 1000; // 2 minutes
/**
* Starts the background interval that checks for clients that haven't pinged recently
* and marks them as offline
*/
export const startOlmOfflineChecker = (): void => {
if (offlineCheckerInterval) {
return; // Already running
}
offlineCheckerInterval = setInterval(async () => {
try {
const twoMinutesAgo = Math.floor(
(Date.now() - OFFLINE_THRESHOLD_MS) / 1000
);
// TODO: WE NEED TO MAKE SURE THIS WORKS WITH DISTRIBUTED NODES ALL DOING THE SAME THING
// Find clients that haven't pinged in the last 2 minutes and mark them as offline
const offlineClients = await db
.update(clients)
.set({ online: false })
.where(
and(
eq(clients.online, true),
or(
lt(clients.lastPing, twoMinutesAgo),
isNull(clients.lastPing)
)
)
)
.returning();
for (const offlineClient of offlineClients) {
logger.info(
`Kicking offline olm client ${offlineClient.clientId} due to inactivity`
);
if (!offlineClient.olmId) {
logger.warn(
`Offline client ${offlineClient.clientId} has no olmId, cannot disconnect`
);
continue;
}
// Send a disconnect message to the client if connected
try {
await sendTerminateClient(
offlineClient.clientId,
OlmErrorCodes.TERMINATED_INACTIVITY,
offlineClient.olmId
); // terminate first
// wait a moment to ensure the message is sent
await new Promise((resolve) => setTimeout(resolve, 1000));
await disconnectClient(offlineClient.olmId);
} catch (error) {
logger.error(
`Error sending disconnect to offline olm ${offlineClient.clientId}`,
{ error }
);
}
}
} catch (error) {
logger.error("Error in offline checker interval", { error });
}
}, OFFLINE_CHECK_INTERVAL);
logger.debug("Started offline checker interval");
};
/**
* Stops the background interval that checks for offline clients
*/
export const stopOlmOfflineChecker = (): void => {
if (offlineCheckerInterval) {
clearInterval(offlineCheckerInterval);
offlineCheckerInterval = null;
logger.info("Stopped offline checker interval");
}
};
/**
* Handles ping messages from clients and responds with pong
*/

View File

@@ -12,3 +12,4 @@ export * from "./handleOlmUnRelayMessage";
export * from "./recoverOlmWithFingerprint";
export * from "./handleOlmDisconnectingMessage";
export * from "./handleOlmServerInitAddPeerHandshake";
export * from "./offlineChecker";

View File

@@ -0,0 +1,92 @@
import { disconnectClient, getClientConfigVersion } from "#dynamic/routers/ws";
import { db } from "@server/db";
import { clients } from "@server/db";
import { eq, lt, isNull, and, or } from "drizzle-orm";
import logger from "@server/logger";
import { sendTerminateClient } from "../client/terminate";
import { OlmErrorCodes } from "./error";
// Track if the offline checker interval is running
let offlineCheckerInterval: NodeJS.Timeout | null = null;
const OFFLINE_CHECK_INTERVAL = 30 * 1000; // Check every 30 seconds
const OFFLINE_THRESHOLD_MS = 2 * 60 * 1000; // 2 minutes
/**
* Starts the background interval that checks for clients that haven't pinged recently
* and marks them as offline
*/
export const startOlmOfflineChecker = (): void => {
if (offlineCheckerInterval) {
return; // Already running
}
offlineCheckerInterval = setInterval(async () => {
try {
const twoMinutesAgo = Math.floor(
(Date.now() - OFFLINE_THRESHOLD_MS) / 1000
);
// TODO: WE NEED TO MAKE SURE THIS WORKS WITH DISTRIBUTED NODES ALL DOING THE SAME THING
// Find clients that haven't pinged in the last 2 minutes and mark them as offline
const offlineClients = await db
.update(clients)
.set({ online: false })
.where(
and(
eq(clients.online, true),
or(
lt(clients.lastPing, twoMinutesAgo),
isNull(clients.lastPing)
)
)
)
.returning();
for (const offlineClient of offlineClients) {
logger.info(
`Kicking offline olm client ${offlineClient.clientId} due to inactivity`
);
if (!offlineClient.olmId) {
logger.warn(
`Offline client ${offlineClient.clientId} has no olmId, cannot disconnect`
);
continue;
}
// Send a disconnect message to the client if connected
try {
await sendTerminateClient(
offlineClient.clientId,
OlmErrorCodes.TERMINATED_INACTIVITY,
offlineClient.olmId
); // terminate first
// wait a moment to ensure the message is sent
await new Promise((resolve) => setTimeout(resolve, 1000));
await disconnectClient(offlineClient.olmId);
} catch (error) {
logger.error(
`Error sending disconnect to offline olm ${offlineClient.clientId}`,
{ error }
);
}
}
} catch (error) {
logger.error("Error in offline checker interval", { error });
}
}, OFFLINE_CHECK_INTERVAL);
logger.debug("Started offline checker interval");
};
/**
* Stops the background interval that checks for offline clients
*/
export const stopOlmOfflineChecker = (): void => {
if (offlineCheckerInterval) {
clearInterval(offlineCheckerInterval);
offlineCheckerInterval = null;
logger.info("Stopped offline checker interval");
}
};