Merge branch 'dev' into msg-opt

This commit is contained in:
Owen
2026-03-23 16:00:50 -07:00
203 changed files with 4955 additions and 4932 deletions

View File

@@ -15,6 +15,7 @@ import { initPeerAddHandshake, updatePeer } from "../olm/peers";
import { eq, and } from "drizzle-orm";
import config from "@server/lib/config";
import {
formatEndpoint,
generateSubnetProxyTargetV2,
SubnetProxyTargetV2
} from "@server/lib/ip";
@@ -83,40 +84,42 @@ export async function buildClientConfigurationForNewtClient(
// )
// );
// update the peer info on the olm
// if the peer has not been added yet this will be a no-op
await updatePeer(client.clients.clientId, {
siteId: site.siteId,
endpoint: site.endpoint!,
relayEndpoint: `${exitNode.endpoint}:${config.getRawConfig().gerbil.clients_start_port}`,
publicKey: site.publicKey!,
serverIP: site.address,
serverPort: site.listenPort
// remoteSubnets: generateRemoteSubnets(
// allSiteResources.map(
// ({ siteResources }) => siteResources
// )
// ),
// aliases: generateAliasConfig(
// allSiteResources.map(
// ({ siteResources }) => siteResources
// )
// )
});
if (!client.clientSitesAssociationsCache.isJitMode) { // if we are adding sites through jit then dont add the site to the olm
// update the peer info on the olm
// if the peer has not been added yet this will be a no-op
await updatePeer(client.clients.clientId, {
siteId: site.siteId,
endpoint: site.endpoint!,
relayEndpoint: `${exitNode.endpoint}:${config.getRawConfig().gerbil.clients_start_port}`,
publicKey: site.publicKey!,
serverIP: site.address,
serverPort: site.listenPort
// remoteSubnets: generateRemoteSubnets(
// allSiteResources.map(
// ({ siteResources }) => siteResources
// )
// ),
// aliases: generateAliasConfig(
// allSiteResources.map(
// ({ siteResources }) => siteResources
// )
// )
});
// also trigger the peer add handshake in case the peer was not already added to the olm and we need to hole punch
// if it has already been added this will be a no-op
await initPeerAddHandshake(
// this will kick off the add peer process for the client
client.clients.clientId,
{
siteId,
exitNode: {
publicKey: exitNode.publicKey,
endpoint: exitNode.endpoint
// also trigger the peer add handshake in case the peer was not already added to the olm and we need to hole punch
// if it has already been added this will be a no-op
await initPeerAddHandshake(
// this will kick off the add peer process for the client
client.clients.clientId,
{
siteId,
exitNode: {
publicKey: exitNode.publicKey,
endpoint: exitNode.endpoint
}
}
}
);
);
}
return {
publicKey: client.clients.pubKey!,
@@ -204,7 +207,8 @@ export async function buildTargetConfigurationForNewtClient(siteId: number) {
hcTimeout: targetHealthCheck.hcTimeout,
hcHeaders: targetHealthCheck.hcHeaders,
hcMethod: targetHealthCheck.hcMethod,
hcTlsServerName: targetHealthCheck.hcTlsServerName
hcTlsServerName: targetHealthCheck.hcTlsServerName,
hcStatus: targetHealthCheck.hcStatus
})
.from(targets)
.innerJoin(resources, eq(targets.resourceId, resources.resourceId))
@@ -221,8 +225,8 @@ export async function buildTargetConfigurationForNewtClient(siteId: number) {
return acc;
}
// Format target into string
const formattedTarget = `${target.internalPort}:${target.ip}:${target.port}`;
// Format target into string (handles IPv6 bracketing)
const formattedTarget = `${target.internalPort}:${formatEndpoint(target.ip, target.port)}`;
// Add to the appropriate protocol array
if (target.protocol === "tcp") {
@@ -245,9 +249,9 @@ export async function buildTargetConfigurationForNewtClient(siteId: number) {
!target.hcInterval ||
!target.hcMethod
) {
logger.debug(
`Skipping target ${target.targetId} due to missing health check fields`
);
// logger.debug(
// `Skipping adding target health check ${target.targetId} due to missing health check fields`
// );
return null; // Skip targets with missing health check fields
}
@@ -277,7 +281,8 @@ export async function buildTargetConfigurationForNewtClient(siteId: number) {
hcTimeout: target.hcTimeout, // in seconds
hcHeaders: hcHeadersSend,
hcMethod: target.hcMethod,
hcTlsServerName: target.hcTlsServerName
hcTlsServerName: target.hcTlsServerName,
hcStatus: target.hcStatus
};
});

View File

@@ -7,6 +7,7 @@ import { eq } from "drizzle-orm";
import { sendToExitNode } from "#dynamic/lib/exitNodes";
import { buildClientConfigurationForNewtClient } from "./buildConfiguration";
import { convertTargetsIfNessicary } from "../client/targets";
import { canCompress } from "@server/lib/clientVersionChecks";
const inputSchema = z.object({
publicKey: z.string(),
@@ -105,11 +106,11 @@ export const handleGetConfigMessage: MessageHandler = async (context) => {
const payload = {
oldDestination: {
destinationIP: existingSite.subnet?.split("/")[0],
destinationPort: existingSite.listenPort
destinationPort: existingSite.listenPort || 1 // this satisfies gerbil for now but should be reevaluated
},
newDestination: {
destinationIP: site.subnet?.split("/")[0],
destinationPort: site.listenPort
destinationPort: site.listenPort || 1 // this satisfies gerbil for now but should be reevaluated
}
};
@@ -138,6 +139,9 @@ export const handleGetConfigMessage: MessageHandler = async (context) => {
targets: targetsToSend
}
},
options: {
compress: canCompress(newt.version, "newt")
},
broadcast: false,
excludeSender: false
};

View File

@@ -0,0 +1,36 @@
import { MessageHandler } from "@server/routers/ws";
import { db, Newt, sites } from "@server/db";
import { eq } from "drizzle-orm";
import logger from "@server/logger";
/**
* Handles disconnecting messages from sites to show disconnected in the ui
*/
export const handleNewtDisconnectingMessage: MessageHandler = async (
context
) => {
const { message, client: c, sendToClient } = context;
const newt = c as Newt;
if (!newt) {
logger.warn("Newt not found");
return;
}
if (!newt.siteId) {
logger.warn("Newt has no client ID!");
return;
}
try {
// Update the client's last ping timestamp
await db
.update(sites)
.set({
online: false
})
.where(eq(sites.siteId, newt.siteId));
} catch (error) {
logger.error("Error handling disconnecting message", { error });
}
};

View File

@@ -1,105 +1,107 @@
import { db, sites } from "@server/db";
import { disconnectClient, getClientConfigVersion } from "#dynamic/routers/ws";
import { db, newts, sites } from "@server/db";
import { hasActiveConnections, getClientConfigVersion } from "#dynamic/routers/ws";
import { MessageHandler } from "@server/routers/ws";
import { clients, Newt } from "@server/db";
import { Newt } from "@server/db";
import { eq, lt, isNull, and, or } from "drizzle-orm";
import logger from "@server/logger";
import { validateSessionToken } from "@server/auth/sessions/app";
import { checkOrgAccessPolicy } from "#dynamic/lib/checkOrgAccessPolicy";
import { sendTerminateClient } from "../client/terminate";
import { encodeHexLowerCase } from "@oslojs/encoding";
import { sha256 } from "@oslojs/crypto/sha2";
import { sendNewtSyncMessage } from "./sync";
// Track if the offline checker interval is running
// let offlineCheckerInterval: NodeJS.Timeout | null = null;
// const OFFLINE_CHECK_INTERVAL = 30 * 1000; // Check every 30 seconds
// const OFFLINE_THRESHOLD_MS = 2 * 60 * 1000; // 2 minutes
let offlineCheckerInterval: NodeJS.Timeout | null = null;
const OFFLINE_CHECK_INTERVAL = 30 * 1000; // Check every 30 seconds
const OFFLINE_THRESHOLD_MS = 2 * 60 * 1000; // 2 minutes
/**
* Starts the background interval that checks for clients that haven't pinged recently
* and marks them as offline
* Starts the background interval that checks for newt sites that haven't
* pinged recently and marks them as offline. For backward compatibility,
* a site is only marked offline when there is no active WebSocket connection
* either — so older newt versions that don't send pings but remain connected
* continue to be treated as online.
*/
// export const startNewtOfflineChecker = (): void => {
// if (offlineCheckerInterval) {
// return; // Already running
// }
export const startNewtOfflineChecker = (): void => {
if (offlineCheckerInterval) {
return; // Already running
}
// offlineCheckerInterval = setInterval(async () => {
// try {
// const twoMinutesAgo = Math.floor(
// (Date.now() - OFFLINE_THRESHOLD_MS) / 1000
// );
offlineCheckerInterval = setInterval(async () => {
try {
const twoMinutesAgo = Math.floor(
(Date.now() - OFFLINE_THRESHOLD_MS) / 1000
);
// // TODO: WE NEED TO MAKE SURE THIS WORKS WITH DISTRIBUTED NODES ALL DOING THE SAME THING
// Find all online newt-type sites that haven't pinged recently
// (or have never pinged at all). Join newts to obtain the newtId
// needed for the WebSocket connection check.
const staleSites = await db
.select({
siteId: sites.siteId,
newtId: newts.newtId,
lastPing: sites.lastPing
})
.from(sites)
.innerJoin(newts, eq(newts.siteId, sites.siteId))
.where(
and(
eq(sites.online, true),
eq(sites.type, "newt"),
or(
lt(sites.lastPing, twoMinutesAgo),
isNull(sites.lastPing)
)
)
);
// // Find clients that haven't pinged in the last 2 minutes and mark them as offline
// const offlineClients = await db
// .update(clients)
// .set({ online: false })
// .where(
// and(
// eq(clients.online, true),
// or(
// lt(clients.lastPing, twoMinutesAgo),
// isNull(clients.lastPing)
// )
// )
// )
// .returning();
for (const staleSite of staleSites) {
// Backward-compatibility check: if the newt still has an
// active WebSocket connection (older clients that don't send
// pings), keep the site online.
const isConnected = await hasActiveConnections(staleSite.newtId);
if (isConnected) {
logger.debug(
`Newt ${staleSite.newtId} has not pinged recently but is still connected via WebSocket — keeping site ${staleSite.siteId} online`
);
continue;
}
// for (const offlineClient of offlineClients) {
// logger.info(
// `Kicking offline newt client ${offlineClient.clientId} due to inactivity`
// );
logger.info(
`Marking site ${staleSite.siteId} offline: newt ${staleSite.newtId} has no recent ping and no active WebSocket connection`
);
// if (!offlineClient.newtId) {
// logger.warn(
// `Offline client ${offlineClient.clientId} has no newtId, cannot disconnect`
// );
// continue;
// }
await db
.update(sites)
.set({ online: false })
.where(eq(sites.siteId, staleSite.siteId));
}
} catch (error) {
logger.error("Error in newt offline checker interval", { error });
}
}, OFFLINE_CHECK_INTERVAL);
// // Send a disconnect message to the client if connected
// try {
// await sendTerminateClient(
// offlineClient.clientId,
// offlineClient.newtId
// ); // terminate first
// // wait a moment to ensure the message is sent
// await new Promise((resolve) => setTimeout(resolve, 1000));
// await disconnectClient(offlineClient.newtId);
// } catch (error) {
// logger.error(
// `Error sending disconnect to offline newt ${offlineClient.clientId}`,
// { error }
// );
// }
// }
// } catch (error) {
// logger.error("Error in offline checker interval", { error });
// }
// }, OFFLINE_CHECK_INTERVAL);
// logger.debug("Started offline checker interval");
// };
logger.debug("Started newt offline checker interval");
};
/**
* Stops the background interval that checks for offline clients
* Stops the background interval that checks for offline newt sites.
*/
// export const stopNewtOfflineChecker = (): void => {
// if (offlineCheckerInterval) {
// clearInterval(offlineCheckerInterval);
// offlineCheckerInterval = null;
// logger.info("Stopped offline checker interval");
// }
// };
export const stopNewtOfflineChecker = (): void => {
if (offlineCheckerInterval) {
clearInterval(offlineCheckerInterval);
offlineCheckerInterval = null;
logger.info("Stopped newt offline checker interval");
}
};
/**
* Handles ping messages from clients and responds with pong
* Handles ping messages from newt clients.
*
* On each ping:
* - Marks the associated site as online.
* - Records the current timestamp as the newt's last-ping time.
* - Triggers a config sync if the newt is running an outdated config version.
* - Responds with a pong message.
*/
export const handleNewtPingMessage: MessageHandler = async (context) => {
const { message, client: c, sendToClient } = context;
const { message, client: c } = context;
const newt = c as Newt;
if (!newt) {
@@ -112,15 +114,31 @@ export const handleNewtPingMessage: MessageHandler = async (context) => {
return;
}
// get the version
try {
// Mark the site as online and record the ping timestamp.
await db
.update(sites)
.set({
online: true,
lastPing: Math.floor(Date.now() / 1000)
})
.where(eq(sites.siteId, newt.siteId));
} catch (error) {
logger.error("Error updating online state on newt ping", { error });
}
// Check config version and sync if stale.
const configVersion = await getClientConfigVersion(newt.newtId);
if (message.configVersion && configVersion != null && configVersion != message.configVersion) {
if (
message.configVersion != null &&
configVersion != null &&
configVersion !== message.configVersion
) {
logger.warn(
`Newt ping with outdated config version: ${message.configVersion} (current: ${configVersion})`
);
// get the site
const [site] = await db
.select()
.from(sites)
@@ -137,19 +155,6 @@ export const handleNewtPingMessage: MessageHandler = async (context) => {
await sendNewtSyncMessage(newt, site);
}
// try {
// // Update the client's last ping timestamp
// await db
// .update(clients)
// .set({
// lastPing: Math.floor(Date.now() / 1000),
// online: true
// })
// .where(eq(clients.clientId, newt.clientId));
// } catch (error) {
// logger.error("Error handling ping message", { error });
// }
return {
message: {
type: "pong",

View File

@@ -5,9 +5,7 @@ import { eq } from "drizzle-orm";
import { addPeer, deletePeer } from "../gerbil/peers";
import logger from "@server/logger";
import config from "@server/lib/config";
import {
findNextAvailableCidr,
} from "@server/lib/ip";
import { findNextAvailableCidr } from "@server/lib/ip";
import {
selectBestExitNode,
verifyExitNodeOrgAccess
@@ -15,6 +13,7 @@ import {
import { fetchContainers } from "./dockerSocket";
import { lockManager } from "#dynamic/lib/lock";
import { buildTargetConfigurationForNewtClient } from "./buildConfiguration";
import { canCompress } from "@server/lib/clientVersionChecks";
export type ExitNodePingResult = {
exitNodeId: number;
@@ -215,6 +214,9 @@ export const handleNewtRegisterMessage: MessageHandler = async (context) => {
healthCheckTargets: validHealthCheckTargets
}
},
options: {
compress: canCompress(newt.version, "newt")
},
broadcast: false, // Send to all clients
excludeSender: false // Include sender in broadcast
};

View File

@@ -10,10 +10,21 @@ interface PeerBandwidth {
bytesOut: number;
}
interface BandwidthAccumulator {
bytesIn: number;
bytesOut: number;
}
// Retry configuration for deadlock handling
const MAX_RETRIES = 3;
const BASE_DELAY_MS = 50;
// How often to flush accumulated bandwidth data to the database
const FLUSH_INTERVAL_MS = 120_000; // 120 seconds
// In-memory accumulator: publicKey -> { bytesIn, bytesOut }
let accumulator = new Map<string, BandwidthAccumulator>();
/**
* Check if an error is a deadlock error
*/
@@ -53,6 +64,90 @@ async function withDeadlockRetry<T>(
}
}
/**
* Flush all accumulated bandwidth data to the database.
*
* Swaps out the accumulator before writing so that any bandwidth messages
* received during the flush are captured in the new accumulator rather than
* being lost or causing contention. Entries that fail to write are re-queued
* back into the accumulator so they will be retried on the next flush.
*
* This function is exported so that the application's graceful-shutdown
* cleanup handler can call it before the process exits.
*/
export async function flushBandwidthToDb(): Promise<void> {
if (accumulator.size === 0) {
return;
}
// Atomically swap out the accumulator so new data keeps flowing in
// while we write the snapshot to the database.
const snapshot = accumulator;
accumulator = new Map<string, BandwidthAccumulator>();
const currentTime = new Date().toISOString();
// Sort by publicKey for consistent lock ordering across concurrent
// writers — this is the same deadlock-prevention strategy used in the
// original per-message implementation.
const sortedEntries = [...snapshot.entries()].sort(([a], [b]) =>
a.localeCompare(b)
);
logger.debug(
`Flushing accumulated bandwidth data for ${sortedEntries.length} client(s) to the database`
);
for (const [publicKey, { bytesIn, bytesOut }] of sortedEntries) {
try {
await withDeadlockRetry(async () => {
// Use atomic SQL increment to avoid the SELECT-then-UPDATE
// anti-pattern and the races it would introduce.
await db
.update(clients)
.set({
// Note: bytesIn from peer goes to megabytesOut (data
// sent to client) and bytesOut from peer goes to
// megabytesIn (data received from client).
megabytesOut: sql`COALESCE(${clients.megabytesOut}, 0) + ${bytesIn}`,
megabytesIn: sql`COALESCE(${clients.megabytesIn}, 0) + ${bytesOut}`,
lastBandwidthUpdate: currentTime
})
.where(eq(clients.pubKey, publicKey));
}, `flush bandwidth for client ${publicKey}`);
} catch (error) {
logger.error(
`Failed to flush bandwidth for client ${publicKey}:`,
error
);
// Re-queue the failed entry so it is retried on the next flush
// rather than silently dropped.
const existing = accumulator.get(publicKey);
if (existing) {
existing.bytesIn += bytesIn;
existing.bytesOut += bytesOut;
} else {
accumulator.set(publicKey, { bytesIn, bytesOut });
}
}
}
}
const flushTimer = setInterval(async () => {
try {
await flushBandwidthToDb();
} catch (error) {
logger.error("Unexpected error during periodic bandwidth flush:", error);
}
}, FLUSH_INTERVAL_MS);
// Calling unref() means this timer will not keep the Node.js event loop alive
// on its own — the process can still exit normally when there is no other work
// left. The graceful-shutdown path (see server/cleanup.ts) will call
// flushBandwidthToDb() explicitly before process.exit(), so no data is lost.
flushTimer.unref();
export const handleReceiveBandwidthMessage: MessageHandler = async (
context
) => {
@@ -69,40 +164,21 @@ export const handleReceiveBandwidthMessage: MessageHandler = async (
throw new Error("Invalid bandwidth data");
}
// Sort bandwidth data by publicKey to ensure consistent lock ordering across all instances
// This is critical for preventing deadlocks when multiple instances update the same clients
const sortedBandwidthData = [...bandwidthData].sort((a, b) =>
a.publicKey.localeCompare(b.publicKey)
);
// Accumulate the incoming data in memory; the periodic timer (and the
// shutdown hook) will take care of writing it to the database.
for (const { publicKey, bytesIn, bytesOut } of bandwidthData) {
// Skip peers that haven't transferred any data — writing zeros to the
// database would be a no-op anyway.
if (bytesIn <= 0 && bytesOut <= 0) {
continue;
}
const currentTime = new Date().toISOString();
// Update each client individually with retry logic
// This reduces transaction scope and allows retries per-client
for (const peer of sortedBandwidthData) {
const { publicKey, bytesIn, bytesOut } = peer;
try {
await withDeadlockRetry(async () => {
// Use atomic SQL increment to avoid SELECT then UPDATE pattern
// This eliminates the need to read the current value first
await db
.update(clients)
.set({
// Note: bytesIn from peer goes to megabytesOut (data sent to client)
// and bytesOut from peer goes to megabytesIn (data received from client)
megabytesOut: sql`COALESCE(${clients.megabytesOut}, 0) + ${bytesIn}`,
megabytesIn: sql`COALESCE(${clients.megabytesIn}, 0) + ${bytesOut}`,
lastBandwidthUpdate: currentTime
})
.where(eq(clients.pubKey, publicKey));
}, `update client bandwidth ${publicKey}`);
} catch (error) {
logger.error(
`Failed to update bandwidth for client ${publicKey}:`,
error
);
// Continue with other clients even if one fails
const existing = accumulator.get(publicKey);
if (existing) {
existing.bytesIn += bytesIn;
existing.bytesOut += bytesOut;
} else {
accumulator.set(publicKey, { bytesIn, bytesOut });
}
}
};

View File

@@ -7,3 +7,4 @@ export * from "./handleSocketMessages";
export * from "./handleNewtPingRequestMessage";
export * from "./handleApplyBlueprintMessage";
export * from "./handleNewtPingMessage";
export * from "./handleNewtDisconnectingMessage";

View File

@@ -6,6 +6,7 @@ import {
buildClientConfigurationForNewtClient,
buildTargetConfigurationForNewtClient
} from "./buildConfiguration";
import { canCompress } from "@server/lib/clientVersionChecks";
export async function sendNewtSyncMessage(newt: Newt, site: Site) {
const { tcpTargets, udpTargets, validHealthCheckTargets } =
@@ -24,18 +25,24 @@ export async function sendNewtSyncMessage(newt: Newt, site: Site) {
exitNode
);
await sendToClient(newt.newtId, {
type: "newt/sync",
data: {
proxyTargets: {
udp: udpTargets,
tcp: tcpTargets
},
healthCheckTargets: validHealthCheckTargets,
peers: peers,
clientTargets: targets
await sendToClient(
newt.newtId,
{
type: "newt/sync",
data: {
proxyTargets: {
udp: udpTargets,
tcp: tcpTargets
},
healthCheckTargets: validHealthCheckTargets,
peers: peers,
clientTargets: targets
}
},
{
compress: canCompress(newt.version, "newt")
}
}).catch((error) => {
).catch((error) => {
logger.warn(`Error sending newt sync message:`, error);
});
}

View File

@@ -2,13 +2,14 @@ import { Target, TargetHealthCheck, db, targetHealthCheck } from "@server/db";
import { sendToClient } from "#dynamic/routers/ws";
import logger from "@server/logger";
import { eq, inArray } from "drizzle-orm";
import { canCompress } from "@server/lib/clientVersionChecks";
export async function addTargets(
newtId: string,
targets: Target[],
healthCheckData: TargetHealthCheck[],
protocol: string,
port: number | null = null
version?: string | null
) {
//create a list of udp and tcp targets
const payloadTargets = targets.map((target) => {
@@ -22,7 +23,7 @@ export async function addTargets(
data: {
targets: payloadTargets
}
}, { incrementConfigVersion: true });
}, { incrementConfigVersion: true, compress: canCompress(version, "newt") });
// Create a map for quick lookup
const healthCheckMap = new Map<number, TargetHealthCheck>();
@@ -103,14 +104,14 @@ export async function addTargets(
data: {
targets: validHealthCheckTargets
}
}, { incrementConfigVersion: true });
}, { incrementConfigVersion: true, compress: canCompress(version, "newt") });
}
export async function removeTargets(
newtId: string,
targets: Target[],
protocol: string,
port: number | null = null
version?: string | null
) {
//create a list of udp and tcp targets
const payloadTargets = targets.map((target) => {
@@ -135,5 +136,5 @@ export async function removeTargets(
data: {
ids: healthCheckTargets
}
}, { incrementConfigVersion: true });
}, { incrementConfigVersion: true, compress: canCompress(version, "newt") });
}