mirror of
https://github.com/fosrl/pangolin.git
synced 2026-03-16 15:46:38 +00:00
Attempt to improve handling bandwidth tracking
This commit is contained in:
@@ -1,6 +1,10 @@
|
|||||||
|
import { flushBandwidthToDb } from "@server/routers/newt/handleReceiveBandwidthMessage";
|
||||||
|
import { flushSiteBandwidthToDb } from "@server/routers/gerbil/receiveBandwidth";
|
||||||
import { cleanup as wsCleanup } from "#dynamic/routers/ws";
|
import { cleanup as wsCleanup } from "#dynamic/routers/ws";
|
||||||
|
|
||||||
async function cleanup() {
|
async function cleanup() {
|
||||||
|
await flushBandwidthToDb();
|
||||||
|
await flushSiteBandwidthToDb();
|
||||||
await wsCleanup();
|
await wsCleanup();
|
||||||
|
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
@@ -10,4 +14,4 @@ export async function initCleanup() {
|
|||||||
// Handle process termination
|
// Handle process termination
|
||||||
process.on("SIGTERM", () => cleanup());
|
process.on("SIGTERM", () => cleanup());
|
||||||
process.on("SIGINT", () => cleanup());
|
process.on("SIGINT", () => cleanup());
|
||||||
}
|
}
|
||||||
@@ -13,8 +13,12 @@
|
|||||||
|
|
||||||
import { rateLimitService } from "#private/lib/rateLimit";
|
import { rateLimitService } from "#private/lib/rateLimit";
|
||||||
import { cleanup as wsCleanup } from "#private/routers/ws";
|
import { cleanup as wsCleanup } from "#private/routers/ws";
|
||||||
|
import { flushBandwidthToDb } from "@server/routers/newt/handleReceiveBandwidthMessage";
|
||||||
|
import { flushSiteBandwidthToDb } from "@server/routers/gerbil/receiveBandwidth";
|
||||||
|
|
||||||
async function cleanup() {
|
async function cleanup() {
|
||||||
|
await flushBandwidthToDb();
|
||||||
|
await flushSiteBandwidthToDb();
|
||||||
await rateLimitService.cleanup();
|
await rateLimitService.cleanup();
|
||||||
await wsCleanup();
|
await wsCleanup();
|
||||||
|
|
||||||
@@ -25,4 +29,4 @@ export async function initCleanup() {
|
|||||||
// Handle process termination
|
// Handle process termination
|
||||||
process.on("SIGTERM", () => cleanup());
|
process.on("SIGTERM", () => cleanup());
|
||||||
process.on("SIGINT", () => cleanup());
|
process.on("SIGINT", () => cleanup());
|
||||||
}
|
}
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
import { Request, Response, NextFunction } from "express";
|
import { Request, Response, NextFunction } from "express";
|
||||||
import { eq, and, lt, inArray, sql } from "drizzle-orm";
|
import { eq, sql } from "drizzle-orm";
|
||||||
import { sites } from "@server/db";
|
import { sites } from "@server/db";
|
||||||
import { db } from "@server/db";
|
import { db } from "@server/db";
|
||||||
import logger from "@server/logger";
|
import logger from "@server/logger";
|
||||||
@@ -11,19 +11,31 @@ import { FeatureId } from "@server/lib/billing/features";
|
|||||||
import { checkExitNodeOrg } from "#dynamic/lib/exitNodes";
|
import { checkExitNodeOrg } from "#dynamic/lib/exitNodes";
|
||||||
import { build } from "@server/build";
|
import { build } from "@server/build";
|
||||||
|
|
||||||
// Track sites that are already offline to avoid unnecessary queries
|
|
||||||
const offlineSites = new Set<string>();
|
|
||||||
|
|
||||||
// Retry configuration for deadlock handling
|
|
||||||
const MAX_RETRIES = 3;
|
|
||||||
const BASE_DELAY_MS = 50;
|
|
||||||
|
|
||||||
interface PeerBandwidth {
|
interface PeerBandwidth {
|
||||||
publicKey: string;
|
publicKey: string;
|
||||||
bytesIn: number;
|
bytesIn: number;
|
||||||
bytesOut: number;
|
bytesOut: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface AccumulatorEntry {
|
||||||
|
bytesIn: number;
|
||||||
|
bytesOut: number;
|
||||||
|
/** Present when the update came through a remote exit node. */
|
||||||
|
exitNodeId?: number;
|
||||||
|
/** Whether to record egress usage for billing purposes. */
|
||||||
|
calcUsage: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Retry configuration for deadlock handling
|
||||||
|
const MAX_RETRIES = 3;
|
||||||
|
const BASE_DELAY_MS = 50;
|
||||||
|
|
||||||
|
// How often to flush accumulated bandwidth data to the database
|
||||||
|
const FLUSH_INTERVAL_MS = 30_000; // 30 seconds
|
||||||
|
|
||||||
|
// In-memory accumulator: publicKey -> AccumulatorEntry
|
||||||
|
let accumulator = new Map<string, AccumulatorEntry>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if an error is a deadlock error
|
* Check if an error is a deadlock error
|
||||||
*/
|
*/
|
||||||
@@ -63,6 +75,220 @@ async function withDeadlockRetry<T>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Flush all accumulated site bandwidth data to the database.
|
||||||
|
*
|
||||||
|
* Swaps out the accumulator before writing so that any bandwidth messages
|
||||||
|
* received during the flush are captured in the new accumulator rather than
|
||||||
|
* being lost or causing contention. Entries that fail to write are re-queued
|
||||||
|
* back into the accumulator so they will be retried on the next flush.
|
||||||
|
*
|
||||||
|
* This function is exported so that the application's graceful-shutdown
|
||||||
|
* cleanup handler can call it before the process exits.
|
||||||
|
*/
|
||||||
|
export async function flushSiteBandwidthToDb(): Promise<void> {
|
||||||
|
if (accumulator.size === 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Atomically swap out the accumulator so new data keeps flowing in
|
||||||
|
// while we write the snapshot to the database.
|
||||||
|
const snapshot = accumulator;
|
||||||
|
accumulator = new Map<string, AccumulatorEntry>();
|
||||||
|
|
||||||
|
const currentTime = new Date().toISOString();
|
||||||
|
|
||||||
|
// Sort by publicKey for consistent lock ordering across concurrent
|
||||||
|
// writers — deadlock-prevention strategy.
|
||||||
|
const sortedEntries = [...snapshot.entries()].sort(([a], [b]) =>
|
||||||
|
a.localeCompare(b)
|
||||||
|
);
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
`Flushing accumulated bandwidth data for ${sortedEntries.length} site(s) to the database`
|
||||||
|
);
|
||||||
|
|
||||||
|
// Aggregate billing usage by org, collected during the DB update loop.
|
||||||
|
const orgUsageMap = new Map<string, number>();
|
||||||
|
|
||||||
|
for (const [publicKey, { bytesIn, bytesOut, exitNodeId, calcUsage }] of sortedEntries) {
|
||||||
|
try {
|
||||||
|
const updatedSite = await withDeadlockRetry(async () => {
|
||||||
|
const [result] = await db
|
||||||
|
.update(sites)
|
||||||
|
.set({
|
||||||
|
megabytesOut: sql`COALESCE(${sites.megabytesOut}, 0) + ${bytesIn}`,
|
||||||
|
megabytesIn: sql`COALESCE(${sites.megabytesIn}, 0) + ${bytesOut}`,
|
||||||
|
lastBandwidthUpdate: currentTime
|
||||||
|
})
|
||||||
|
.where(eq(sites.pubKey, publicKey))
|
||||||
|
.returning({
|
||||||
|
orgId: sites.orgId,
|
||||||
|
siteId: sites.siteId
|
||||||
|
});
|
||||||
|
return result;
|
||||||
|
}, `flush bandwidth for site ${publicKey}`);
|
||||||
|
|
||||||
|
if (updatedSite) {
|
||||||
|
if (exitNodeId) {
|
||||||
|
const notAllowed = await checkExitNodeOrg(
|
||||||
|
exitNodeId,
|
||||||
|
updatedSite.orgId
|
||||||
|
);
|
||||||
|
if (notAllowed) {
|
||||||
|
logger.warn(
|
||||||
|
`Exit node ${exitNodeId} is not allowed for org ${updatedSite.orgId}`
|
||||||
|
);
|
||||||
|
// Skip usage tracking for this site but continue
|
||||||
|
// processing the rest.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (calcUsage) {
|
||||||
|
const totalBandwidth = bytesIn + bytesOut;
|
||||||
|
const current = orgUsageMap.get(updatedSite.orgId) ?? 0;
|
||||||
|
orgUsageMap.set(updatedSite.orgId, current + totalBandwidth);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(
|
||||||
|
`Failed to flush bandwidth for site ${publicKey}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
|
||||||
|
// Re-queue the failed entry so it is retried on the next flush
|
||||||
|
// rather than silently dropped.
|
||||||
|
const existing = accumulator.get(publicKey);
|
||||||
|
if (existing) {
|
||||||
|
existing.bytesIn += bytesIn;
|
||||||
|
existing.bytesOut += bytesOut;
|
||||||
|
} else {
|
||||||
|
accumulator.set(publicKey, {
|
||||||
|
bytesIn,
|
||||||
|
bytesOut,
|
||||||
|
exitNodeId,
|
||||||
|
calcUsage
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process billing usage updates outside the site-update loop to keep
|
||||||
|
// lock scope small and concerns separated.
|
||||||
|
if (orgUsageMap.size > 0) {
|
||||||
|
// Sort org IDs for consistent lock ordering.
|
||||||
|
const sortedOrgIds = [...orgUsageMap.keys()].sort();
|
||||||
|
|
||||||
|
for (const orgId of sortedOrgIds) {
|
||||||
|
try {
|
||||||
|
const totalBandwidth = orgUsageMap.get(orgId)!;
|
||||||
|
const bandwidthUsage = await usageService.add(
|
||||||
|
orgId,
|
||||||
|
FeatureId.EGRESS_DATA_MB,
|
||||||
|
totalBandwidth
|
||||||
|
);
|
||||||
|
if (bandwidthUsage) {
|
||||||
|
// Fire-and-forget — don't block the flush on limit checking.
|
||||||
|
usageService
|
||||||
|
.checkLimitSet(
|
||||||
|
orgId,
|
||||||
|
FeatureId.EGRESS_DATA_MB,
|
||||||
|
bandwidthUsage
|
||||||
|
)
|
||||||
|
.catch((error: any) => {
|
||||||
|
logger.error(
|
||||||
|
`Error checking bandwidth limits for org ${orgId}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(
|
||||||
|
`Error processing usage for org ${orgId}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
// Continue with other orgs.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Periodic flush timer
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
const flushTimer = setInterval(async () => {
|
||||||
|
try {
|
||||||
|
await flushSiteBandwidthToDb();
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(
|
||||||
|
"Unexpected error during periodic site bandwidth flush:",
|
||||||
|
error
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}, FLUSH_INTERVAL_MS);
|
||||||
|
|
||||||
|
// Allow the process to exit normally even while the timer is pending.
|
||||||
|
// The graceful-shutdown path (see server/cleanup.ts) will call
|
||||||
|
// flushSiteBandwidthToDb() explicitly before process.exit(), so no data
|
||||||
|
// is lost.
|
||||||
|
flushTimer.unref();
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Public API
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Accumulate bandwidth data reported by a gerbil or remote exit node.
|
||||||
|
*
|
||||||
|
* Only peers that actually transferred data (bytesIn > 0) are added to the
|
||||||
|
* accumulator; peers with no activity are silently ignored, which means the
|
||||||
|
* flush will only write rows that have genuinely changed.
|
||||||
|
*
|
||||||
|
* The function is intentionally synchronous in its fast path so that the
|
||||||
|
* HTTP handler can respond immediately without waiting for any I/O.
|
||||||
|
*/
|
||||||
|
export async function updateSiteBandwidth(
|
||||||
|
bandwidthData: PeerBandwidth[],
|
||||||
|
calcUsageAndLimits: boolean,
|
||||||
|
exitNodeId?: number
|
||||||
|
): Promise<void> {
|
||||||
|
for (const { publicKey, bytesIn, bytesOut } of bandwidthData) {
|
||||||
|
// Skip peers that haven't transferred any data — writing zeros to the
|
||||||
|
// database would be a no-op anyway.
|
||||||
|
if (bytesIn <= 0 && bytesOut <= 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const existing = accumulator.get(publicKey);
|
||||||
|
if (existing) {
|
||||||
|
existing.bytesIn += bytesIn;
|
||||||
|
existing.bytesOut += bytesOut;
|
||||||
|
// Retain the most-recent exitNodeId for this peer.
|
||||||
|
if (exitNodeId !== undefined) {
|
||||||
|
existing.exitNodeId = exitNodeId;
|
||||||
|
}
|
||||||
|
// Once calcUsage has been requested for a peer, keep it set for
|
||||||
|
// the lifetime of this flush window.
|
||||||
|
if (calcUsageAndLimits) {
|
||||||
|
existing.calcUsage = true;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
accumulator.set(publicKey, {
|
||||||
|
bytesIn,
|
||||||
|
bytesOut,
|
||||||
|
exitNodeId,
|
||||||
|
calcUsage: calcUsageAndLimits
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// HTTP handler
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
export const receiveBandwidth = async (
|
export const receiveBandwidth = async (
|
||||||
req: Request,
|
req: Request,
|
||||||
res: Response,
|
res: Response,
|
||||||
@@ -75,7 +301,9 @@ export const receiveBandwidth = async (
|
|||||||
throw new Error("Invalid bandwidth data");
|
throw new Error("Invalid bandwidth data");
|
||||||
}
|
}
|
||||||
|
|
||||||
await updateSiteBandwidth(bandwidthData, build == "saas"); // we are checking the usage on saas only
|
// Accumulate in memory; the periodic timer (and the shutdown hook)
|
||||||
|
// will write to the database.
|
||||||
|
await updateSiteBandwidth(bandwidthData, build == "saas");
|
||||||
|
|
||||||
return response(res, {
|
return response(res, {
|
||||||
data: {},
|
data: {},
|
||||||
@@ -93,202 +321,4 @@ export const receiveBandwidth = async (
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
export async function updateSiteBandwidth(
|
|
||||||
bandwidthData: PeerBandwidth[],
|
|
||||||
calcUsageAndLimits: boolean,
|
|
||||||
exitNodeId?: number
|
|
||||||
) {
|
|
||||||
const currentTime = new Date();
|
|
||||||
const oneMinuteAgo = new Date(currentTime.getTime() - 60000); // 1 minute ago
|
|
||||||
|
|
||||||
// Sort bandwidth data by publicKey to ensure consistent lock ordering across all instances
|
|
||||||
// This is critical for preventing deadlocks when multiple instances update the same sites
|
|
||||||
const sortedBandwidthData = [...bandwidthData].sort((a, b) =>
|
|
||||||
a.publicKey.localeCompare(b.publicKey)
|
|
||||||
);
|
|
||||||
|
|
||||||
// First, handle sites that are actively reporting bandwidth
|
|
||||||
const activePeers = sortedBandwidthData.filter((peer) => peer.bytesIn > 0);
|
|
||||||
|
|
||||||
// Aggregate usage data by organization (collected outside transaction)
|
|
||||||
const orgUsageMap = new Map<string, number>();
|
|
||||||
|
|
||||||
if (activePeers.length > 0) {
|
|
||||||
// Remove any active peers from offline tracking since they're sending data
|
|
||||||
activePeers.forEach((peer) => offlineSites.delete(peer.publicKey));
|
|
||||||
|
|
||||||
// Update each active site individually with retry logic
|
|
||||||
// This reduces transaction scope and allows retries per-site
|
|
||||||
for (const peer of activePeers) {
|
|
||||||
try {
|
|
||||||
const updatedSite = await withDeadlockRetry(async () => {
|
|
||||||
const [result] = await db
|
|
||||||
.update(sites)
|
|
||||||
.set({
|
|
||||||
megabytesOut: sql`${sites.megabytesOut} + ${peer.bytesIn}`,
|
|
||||||
megabytesIn: sql`${sites.megabytesIn} + ${peer.bytesOut}`,
|
|
||||||
lastBandwidthUpdate: currentTime.toISOString(),
|
|
||||||
online: true
|
|
||||||
})
|
|
||||||
.where(eq(sites.pubKey, peer.publicKey))
|
|
||||||
.returning({
|
|
||||||
online: sites.online,
|
|
||||||
orgId: sites.orgId,
|
|
||||||
siteId: sites.siteId,
|
|
||||||
lastBandwidthUpdate: sites.lastBandwidthUpdate
|
|
||||||
});
|
|
||||||
return result;
|
|
||||||
}, `update active site ${peer.publicKey}`);
|
|
||||||
|
|
||||||
if (updatedSite) {
|
|
||||||
if (exitNodeId) {
|
|
||||||
const notAllowed = await checkExitNodeOrg(
|
|
||||||
exitNodeId,
|
|
||||||
updatedSite.orgId
|
|
||||||
);
|
|
||||||
if (notAllowed) {
|
|
||||||
logger.warn(
|
|
||||||
`Exit node ${exitNodeId} is not allowed for org ${updatedSite.orgId}`
|
|
||||||
);
|
|
||||||
// Skip this site but continue processing others
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Aggregate bandwidth usage for the org
|
|
||||||
const totalBandwidth = peer.bytesIn + peer.bytesOut;
|
|
||||||
const currentOrgUsage =
|
|
||||||
orgUsageMap.get(updatedSite.orgId) || 0;
|
|
||||||
orgUsageMap.set(
|
|
||||||
updatedSite.orgId,
|
|
||||||
currentOrgUsage + totalBandwidth
|
|
||||||
);
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
logger.error(
|
|
||||||
`Failed to update bandwidth for site ${peer.publicKey}:`,
|
|
||||||
error
|
|
||||||
);
|
|
||||||
// Continue with other sites
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process usage updates outside of site update transactions
|
|
||||||
// This separates the concerns and reduces lock contention
|
|
||||||
if (calcUsageAndLimits && orgUsageMap.size > 0) {
|
|
||||||
// Sort org IDs to ensure consistent lock ordering
|
|
||||||
const allOrgIds = [...new Set([...orgUsageMap.keys()])].sort();
|
|
||||||
|
|
||||||
for (const orgId of allOrgIds) {
|
|
||||||
try {
|
|
||||||
// Process bandwidth usage for this org
|
|
||||||
const totalBandwidth = orgUsageMap.get(orgId);
|
|
||||||
if (totalBandwidth) {
|
|
||||||
const bandwidthUsage = await usageService.add(
|
|
||||||
orgId,
|
|
||||||
FeatureId.EGRESS_DATA_MB,
|
|
||||||
totalBandwidth
|
|
||||||
);
|
|
||||||
if (bandwidthUsage) {
|
|
||||||
// Fire and forget - don't block on limit checking
|
|
||||||
usageService
|
|
||||||
.checkLimitSet(
|
|
||||||
orgId,
|
|
||||||
FeatureId.EGRESS_DATA_MB,
|
|
||||||
bandwidthUsage
|
|
||||||
)
|
|
||||||
.catch((error: any) => {
|
|
||||||
logger.error(
|
|
||||||
`Error checking bandwidth limits for org ${orgId}:`,
|
|
||||||
error
|
|
||||||
);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
logger.error(`Error processing usage for org ${orgId}:`, error);
|
|
||||||
// Continue with other orgs
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle sites that reported zero bandwidth but need online status updated
|
|
||||||
const zeroBandwidthPeers = sortedBandwidthData.filter(
|
|
||||||
(peer) => peer.bytesIn === 0 && !offlineSites.has(peer.publicKey)
|
|
||||||
);
|
|
||||||
|
|
||||||
if (zeroBandwidthPeers.length > 0) {
|
|
||||||
// Fetch all zero bandwidth sites in one query
|
|
||||||
const zeroBandwidthSites = await db
|
|
||||||
.select()
|
|
||||||
.from(sites)
|
|
||||||
.where(
|
|
||||||
inArray(
|
|
||||||
sites.pubKey,
|
|
||||||
zeroBandwidthPeers.map((p) => p.publicKey)
|
|
||||||
)
|
|
||||||
);
|
|
||||||
|
|
||||||
// Sort by siteId to ensure consistent lock ordering
|
|
||||||
const sortedZeroBandwidthSites = zeroBandwidthSites.sort(
|
|
||||||
(a, b) => a.siteId - b.siteId
|
|
||||||
);
|
|
||||||
|
|
||||||
for (const site of sortedZeroBandwidthSites) {
|
|
||||||
let newOnlineStatus = site.online;
|
|
||||||
|
|
||||||
// Check if site should go offline based on last bandwidth update WITH DATA
|
|
||||||
if (site.lastBandwidthUpdate) {
|
|
||||||
const lastUpdateWithData = new Date(site.lastBandwidthUpdate);
|
|
||||||
if (lastUpdateWithData < oneMinuteAgo) {
|
|
||||||
newOnlineStatus = false;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// No previous data update recorded, set to offline
|
|
||||||
newOnlineStatus = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Only update online status if it changed
|
|
||||||
if (site.online !== newOnlineStatus) {
|
|
||||||
try {
|
|
||||||
const updatedSite = await withDeadlockRetry(async () => {
|
|
||||||
const [result] = await db
|
|
||||||
.update(sites)
|
|
||||||
.set({
|
|
||||||
online: newOnlineStatus
|
|
||||||
})
|
|
||||||
.where(eq(sites.siteId, site.siteId))
|
|
||||||
.returning();
|
|
||||||
return result;
|
|
||||||
}, `update offline status for site ${site.siteId}`);
|
|
||||||
|
|
||||||
if (updatedSite && exitNodeId) {
|
|
||||||
const notAllowed = await checkExitNodeOrg(
|
|
||||||
exitNodeId,
|
|
||||||
updatedSite.orgId
|
|
||||||
);
|
|
||||||
if (notAllowed) {
|
|
||||||
logger.warn(
|
|
||||||
`Exit node ${exitNodeId} is not allowed for org ${updatedSite.orgId}`
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If site went offline, add it to our tracking set
|
|
||||||
if (!newOnlineStatus && site.pubKey) {
|
|
||||||
offlineSites.add(site.pubKey);
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
logger.error(
|
|
||||||
`Failed to update offline status for site ${site.siteId}:`,
|
|
||||||
error
|
|
||||||
);
|
|
||||||
// Continue with other sites
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -10,10 +10,21 @@ interface PeerBandwidth {
|
|||||||
bytesOut: number;
|
bytesOut: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface BandwidthAccumulator {
|
||||||
|
bytesIn: number;
|
||||||
|
bytesOut: number;
|
||||||
|
}
|
||||||
|
|
||||||
// Retry configuration for deadlock handling
|
// Retry configuration for deadlock handling
|
||||||
const MAX_RETRIES = 3;
|
const MAX_RETRIES = 3;
|
||||||
const BASE_DELAY_MS = 50;
|
const BASE_DELAY_MS = 50;
|
||||||
|
|
||||||
|
// How often to flush accumulated bandwidth data to the database
|
||||||
|
const FLUSH_INTERVAL_MS = 120_000; // 120 seconds
|
||||||
|
|
||||||
|
// In-memory accumulator: publicKey -> { bytesIn, bytesOut }
|
||||||
|
let accumulator = new Map<string, BandwidthAccumulator>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if an error is a deadlock error
|
* Check if an error is a deadlock error
|
||||||
*/
|
*/
|
||||||
@@ -53,6 +64,90 @@ async function withDeadlockRetry<T>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Flush all accumulated bandwidth data to the database.
|
||||||
|
*
|
||||||
|
* Swaps out the accumulator before writing so that any bandwidth messages
|
||||||
|
* received during the flush are captured in the new accumulator rather than
|
||||||
|
* being lost or causing contention. Entries that fail to write are re-queued
|
||||||
|
* back into the accumulator so they will be retried on the next flush.
|
||||||
|
*
|
||||||
|
* This function is exported so that the application's graceful-shutdown
|
||||||
|
* cleanup handler can call it before the process exits.
|
||||||
|
*/
|
||||||
|
export async function flushBandwidthToDb(): Promise<void> {
|
||||||
|
if (accumulator.size === 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Atomically swap out the accumulator so new data keeps flowing in
|
||||||
|
// while we write the snapshot to the database.
|
||||||
|
const snapshot = accumulator;
|
||||||
|
accumulator = new Map<string, BandwidthAccumulator>();
|
||||||
|
|
||||||
|
const currentTime = new Date().toISOString();
|
||||||
|
|
||||||
|
// Sort by publicKey for consistent lock ordering across concurrent
|
||||||
|
// writers — this is the same deadlock-prevention strategy used in the
|
||||||
|
// original per-message implementation.
|
||||||
|
const sortedEntries = [...snapshot.entries()].sort(([a], [b]) =>
|
||||||
|
a.localeCompare(b)
|
||||||
|
);
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
`Flushing accumulated bandwidth data for ${sortedEntries.length} client(s) to the database`
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const [publicKey, { bytesIn, bytesOut }] of sortedEntries) {
|
||||||
|
try {
|
||||||
|
await withDeadlockRetry(async () => {
|
||||||
|
// Use atomic SQL increment to avoid the SELECT-then-UPDATE
|
||||||
|
// anti-pattern and the races it would introduce.
|
||||||
|
await db
|
||||||
|
.update(clients)
|
||||||
|
.set({
|
||||||
|
// Note: bytesIn from peer goes to megabytesOut (data
|
||||||
|
// sent to client) and bytesOut from peer goes to
|
||||||
|
// megabytesIn (data received from client).
|
||||||
|
megabytesOut: sql`COALESCE(${clients.megabytesOut}, 0) + ${bytesIn}`,
|
||||||
|
megabytesIn: sql`COALESCE(${clients.megabytesIn}, 0) + ${bytesOut}`,
|
||||||
|
lastBandwidthUpdate: currentTime
|
||||||
|
})
|
||||||
|
.where(eq(clients.pubKey, publicKey));
|
||||||
|
}, `flush bandwidth for client ${publicKey}`);
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(
|
||||||
|
`Failed to flush bandwidth for client ${publicKey}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
|
||||||
|
// Re-queue the failed entry so it is retried on the next flush
|
||||||
|
// rather than silently dropped.
|
||||||
|
const existing = accumulator.get(publicKey);
|
||||||
|
if (existing) {
|
||||||
|
existing.bytesIn += bytesIn;
|
||||||
|
existing.bytesOut += bytesOut;
|
||||||
|
} else {
|
||||||
|
accumulator.set(publicKey, { bytesIn, bytesOut });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const flushTimer = setInterval(async () => {
|
||||||
|
try {
|
||||||
|
await flushBandwidthToDb();
|
||||||
|
} catch (error) {
|
||||||
|
logger.error("Unexpected error during periodic bandwidth flush:", error);
|
||||||
|
}
|
||||||
|
}, FLUSH_INTERVAL_MS);
|
||||||
|
|
||||||
|
// Calling unref() means this timer will not keep the Node.js event loop alive
|
||||||
|
// on its own — the process can still exit normally when there is no other work
|
||||||
|
// left. The graceful-shutdown path (see server/cleanup.ts) will call
|
||||||
|
// flushBandwidthToDb() explicitly before process.exit(), so no data is lost.
|
||||||
|
flushTimer.unref();
|
||||||
|
|
||||||
export const handleReceiveBandwidthMessage: MessageHandler = async (
|
export const handleReceiveBandwidthMessage: MessageHandler = async (
|
||||||
context
|
context
|
||||||
) => {
|
) => {
|
||||||
@@ -69,40 +164,21 @@ export const handleReceiveBandwidthMessage: MessageHandler = async (
|
|||||||
throw new Error("Invalid bandwidth data");
|
throw new Error("Invalid bandwidth data");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort bandwidth data by publicKey to ensure consistent lock ordering across all instances
|
// Accumulate the incoming data in memory; the periodic timer (and the
|
||||||
// This is critical for preventing deadlocks when multiple instances update the same clients
|
// shutdown hook) will take care of writing it to the database.
|
||||||
const sortedBandwidthData = [...bandwidthData].sort((a, b) =>
|
for (const { publicKey, bytesIn, bytesOut } of bandwidthData) {
|
||||||
a.publicKey.localeCompare(b.publicKey)
|
// Skip peers that haven't transferred any data — writing zeros to the
|
||||||
);
|
// database would be a no-op anyway.
|
||||||
|
if (bytesIn <= 0 && bytesOut <= 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
const currentTime = new Date().toISOString();
|
const existing = accumulator.get(publicKey);
|
||||||
|
if (existing) {
|
||||||
// Update each client individually with retry logic
|
existing.bytesIn += bytesIn;
|
||||||
// This reduces transaction scope and allows retries per-client
|
existing.bytesOut += bytesOut;
|
||||||
for (const peer of sortedBandwidthData) {
|
} else {
|
||||||
const { publicKey, bytesIn, bytesOut } = peer;
|
accumulator.set(publicKey, { bytesIn, bytesOut });
|
||||||
|
|
||||||
try {
|
|
||||||
await withDeadlockRetry(async () => {
|
|
||||||
// Use atomic SQL increment to avoid SELECT then UPDATE pattern
|
|
||||||
// This eliminates the need to read the current value first
|
|
||||||
await db
|
|
||||||
.update(clients)
|
|
||||||
.set({
|
|
||||||
// Note: bytesIn from peer goes to megabytesOut (data sent to client)
|
|
||||||
// and bytesOut from peer goes to megabytesIn (data received from client)
|
|
||||||
megabytesOut: sql`COALESCE(${clients.megabytesOut}, 0) + ${bytesIn}`,
|
|
||||||
megabytesIn: sql`COALESCE(${clients.megabytesIn}, 0) + ${bytesOut}`,
|
|
||||||
lastBandwidthUpdate: currentTime
|
|
||||||
})
|
|
||||||
.where(eq(clients.pubKey, publicKey));
|
|
||||||
}, `update client bandwidth ${publicKey}`);
|
|
||||||
} catch (error) {
|
|
||||||
logger.error(
|
|
||||||
`Failed to update bandwidth for client ${publicKey}:`,
|
|
||||||
error
|
|
||||||
);
|
|
||||||
// Continue with other clients even if one fails
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user