mirror of
https://github.com/fosrl/pangolin.git
synced 2026-03-06 10:46:38 +00:00
Try to fix deadlocks again
Fixes FOU-284
This commit is contained in:
@@ -1,13 +0,0 @@
|
|||||||
/*
|
|
||||||
* This file is part of a proprietary work.
|
|
||||||
*
|
|
||||||
* Copyright (c) 2025 Fossorial, Inc.
|
|
||||||
* All rights reserved.
|
|
||||||
*
|
|
||||||
* This file is licensed under the Fossorial Commercial License.
|
|
||||||
* You may not use this file except in compliance with the License.
|
|
||||||
* Unauthorized use, copying, modification, or distribution is strictly prohibited.
|
|
||||||
*
|
|
||||||
* This file is not licensed under the AGPLv3.
|
|
||||||
*/
|
|
||||||
|
|
||||||
@@ -14,12 +14,55 @@ import { build } from "@server/build";
|
|||||||
// Track sites that are already offline to avoid unnecessary queries
|
// Track sites that are already offline to avoid unnecessary queries
|
||||||
const offlineSites = new Set<string>();
|
const offlineSites = new Set<string>();
|
||||||
|
|
||||||
|
// Retry configuration for deadlock handling
|
||||||
|
// Upper bound on how many times withDeadlockRetry re-runs an operation before rethrowing the error
const MAX_RETRIES = 3;
|
||||||
|
// Base backoff in milliseconds; withDeadlockRetry doubles it on every attempt and adds random jitter
const BASE_DELAY_MS = 50;
|
||||||
|
|
||||||
interface PeerBandwidth {
|
interface PeerBandwidth {
|
||||||
publicKey: string;
|
publicKey: string;
|
||||||
bytesIn: number;
|
bytesIn: number;
|
||||||
bytesOut: number;
|
bytesOut: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Check if an error is a deadlock error.
 *
 * An error is treated as a deadlock when any of the following holds:
 *  - its own `code` is "40P01" (PostgreSQL SQLSTATE `deadlock_detected`),
 *  - its direct `cause` carries that same `code` (wrapped driver errors),
 *  - its `message` is a string containing the substring "deadlock".
 *
 * The function never throws and always returns a strict boolean, so it is
 * safe to call from inside a `catch` block with whatever value was thrown
 * (including `null`, `undefined`, primitives, or objects whose `message`
 * is not a string).
 *
 * @param error - The thrown value to inspect; may be of any type.
 * @returns `true` if the value looks like a deadlock error, otherwise `false`.
 */
function isDeadlockError(error: unknown): boolean {
    // Thrown values are untyped; view the value through the only fields we read.
    const candidate = error as
        | {
              code?: unknown;
              cause?: { code?: unknown } | null;
              message?: unknown;
          }
        | null
        | undefined;

    if (candidate?.code === "40P01" || candidate?.cause?.code === "40P01") {
        return true;
    }

    // Only search real strings: calling `.includes` on a non-string message
    // would either throw or perform an unrelated array-membership test.
    const message = candidate?.message;
    return typeof message === "string" && message.includes("deadlock");
}
|
||||||
|
|
||||||
|
/**
 * Execute a function with retry logic for deadlock handling.
 *
 * Runs `operation` and resolves with its result. When it rejects with an
 * error that `isDeadlockError` recognises, and fewer than `MAX_RETRIES`
 * retries have been used, a warning is logged and the operation is run again
 * after an exponential backoff: `BASE_DELAY_MS * 2^(attempt - 1)` plus a
 * random jitter of up to that same amount. Any other error, or a deadlock
 * once the retry budget is spent, is rethrown unchanged.
 *
 * @param operation - Async work to run; invoked once per attempt.
 * @param context - Short label included in the retry warning log.
 * @returns The value `operation` resolves with.
 * @throws Whatever `operation` throws when it is not retried.
 */
async function withDeadlockRetry<T>(
    operation: () => Promise<T>,
    context: string
): Promise<T> {
    let retriesUsed = 0;

    for (;;) {
        try {
            return await operation();
        } catch (err: unknown) {
            const shouldRetry =
                isDeadlockError(err) && retriesUsed < MAX_RETRIES;
            if (!shouldRetry) {
                throw err;
            }

            retriesUsed += 1;

            // Exponential backoff with jitter in the range [backoff, 2 * backoff).
            const backoffMs = 2 ** (retriesUsed - 1) * BASE_DELAY_MS;
            const waitMs = backoffMs + Math.random() * backoffMs;

            logger.warn(
                `Deadlock detected in ${context}, retrying attempt ${retriesUsed}/${MAX_RETRIES} after ${waitMs.toFixed(0)}ms`
            );

            await new Promise((resolve) => setTimeout(resolve, waitMs));
        }
    }
}
|
||||||
|
|
||||||
export const receiveBandwidth = async (
|
export const receiveBandwidth = async (
|
||||||
req: Request,
|
req: Request,
|
||||||
res: Response,
|
res: Response,
|
||||||
@@ -60,201 +103,208 @@ export async function updateSiteBandwidth(
|
|||||||
const currentTime = new Date();
|
const currentTime = new Date();
|
||||||
const oneMinuteAgo = new Date(currentTime.getTime() - 60000); // 1 minute ago
|
const oneMinuteAgo = new Date(currentTime.getTime() - 60000); // 1 minute ago
|
||||||
|
|
||||||
// logger.debug(`Received data: ${JSON.stringify(bandwidthData)}`);
|
// Sort bandwidth data by publicKey to ensure consistent lock ordering across all instances
|
||||||
|
// This is critical for preventing deadlocks when multiple instances update the same sites
|
||||||
|
const sortedBandwidthData = [...bandwidthData].sort((a, b) =>
|
||||||
|
a.publicKey.localeCompare(b.publicKey)
|
||||||
|
);
|
||||||
|
|
||||||
await db.transaction(async (trx) => {
|
// First, handle sites that are actively reporting bandwidth
|
||||||
// First, handle sites that are actively reporting bandwidth
|
const activePeers = sortedBandwidthData.filter((peer) => peer.bytesIn > 0);
|
||||||
const activePeers = bandwidthData.filter((peer) => peer.bytesIn > 0); // Bytesout will have data as it tries to send keep alive messages
|
|
||||||
|
|
||||||
if (activePeers.length > 0) {
|
// Aggregate usage data by organization (collected outside transaction)
|
||||||
// Remove any active peers from offline tracking since they're sending data
|
const orgUsageMap = new Map<string, number>();
|
||||||
activePeers.forEach((peer) => offlineSites.delete(peer.publicKey));
|
const orgUptimeMap = new Map<string, number>();
|
||||||
|
|
||||||
// Aggregate usage data by organization
|
if (activePeers.length > 0) {
|
||||||
const orgUsageMap = new Map<string, number>();
|
// Remove any active peers from offline tracking since they're sending data
|
||||||
const orgUptimeMap = new Map<string, number>();
|
activePeers.forEach((peer) => offlineSites.delete(peer.publicKey));
|
||||||
|
|
||||||
// Update all active sites with bandwidth data and get the site data in one operation
|
// Update each active site individually with retry logic
|
||||||
const updatedSites = [];
|
// This reduces transaction scope and allows retries per-site
|
||||||
for (const peer of activePeers) {
|
for (const peer of activePeers) {
|
||||||
const [updatedSite] = await trx
|
try {
|
||||||
.update(sites)
|
const updatedSite = await withDeadlockRetry(async () => {
|
||||||
.set({
|
const [result] = await db
|
||||||
megabytesOut: sql`${sites.megabytesOut} + ${peer.bytesIn}`,
|
.update(sites)
|
||||||
megabytesIn: sql`${sites.megabytesIn} + ${peer.bytesOut}`,
|
.set({
|
||||||
lastBandwidthUpdate: currentTime.toISOString(),
|
megabytesOut: sql`${sites.megabytesOut} + ${peer.bytesIn}`,
|
||||||
online: true
|
megabytesIn: sql`${sites.megabytesIn} + ${peer.bytesOut}`,
|
||||||
})
|
lastBandwidthUpdate: currentTime.toISOString(),
|
||||||
.where(eq(sites.pubKey, peer.publicKey))
|
online: true
|
||||||
.returning({
|
})
|
||||||
online: sites.online,
|
.where(eq(sites.pubKey, peer.publicKey))
|
||||||
orgId: sites.orgId,
|
.returning({
|
||||||
siteId: sites.siteId,
|
online: sites.online,
|
||||||
lastBandwidthUpdate: sites.lastBandwidthUpdate
|
orgId: sites.orgId,
|
||||||
});
|
siteId: sites.siteId,
|
||||||
|
lastBandwidthUpdate: sites.lastBandwidthUpdate
|
||||||
|
});
|
||||||
|
return result;
|
||||||
|
}, `update active site ${peer.publicKey}`);
|
||||||
|
|
||||||
if (updatedSite) {
|
if (updatedSite) {
|
||||||
if (exitNodeId) {
|
if (exitNodeId) {
|
||||||
if (
|
const notAllowed = await checkExitNodeOrg(
|
||||||
await checkExitNodeOrg(
|
exitNodeId,
|
||||||
exitNodeId,
|
updatedSite.orgId
|
||||||
updatedSite.orgId,
|
);
|
||||||
trx
|
if (notAllowed) {
|
||||||
)
|
|
||||||
) {
|
|
||||||
// not allowed
|
|
||||||
logger.warn(
|
logger.warn(
|
||||||
`Exit node ${exitNodeId} is not allowed for org ${updatedSite.orgId}`
|
`Exit node ${exitNodeId} is not allowed for org ${updatedSite.orgId}`
|
||||||
);
|
);
|
||||||
// THIS SHOULD TRIGGER THE TRANSACTION TO FAIL?
|
// Skip this site but continue processing others
|
||||||
throw new Error("Exit node not allowed");
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
updatedSites.push({ ...updatedSite, peer });
|
// Aggregate bandwidth usage for the org
|
||||||
}
|
const totalBandwidth = peer.bytesIn + peer.bytesOut;
|
||||||
}
|
const currentOrgUsage = orgUsageMap.get(updatedSite.orgId) || 0;
|
||||||
|
orgUsageMap.set(updatedSite.orgId, currentOrgUsage + totalBandwidth);
|
||||||
// Calculate org usage aggregations using the updated site data
|
|
||||||
for (const { peer, ...site } of updatedSites) {
|
// Add 10 seconds of uptime for each active site
|
||||||
// Aggregate bandwidth usage for the org
|
const currentOrgUptime = orgUptimeMap.get(updatedSite.orgId) || 0;
|
||||||
const totalBandwidth = peer.bytesIn + peer.bytesOut;
|
orgUptimeMap.set(updatedSite.orgId, currentOrgUptime + 10 / 60);
|
||||||
const currentOrgUsage = orgUsageMap.get(site.orgId) || 0;
|
|
||||||
orgUsageMap.set(site.orgId, currentOrgUsage + totalBandwidth);
|
|
||||||
|
|
||||||
// Add 10 seconds of uptime for each active site
|
|
||||||
const currentOrgUptime = orgUptimeMap.get(site.orgId) || 0;
|
|
||||||
orgUptimeMap.set(site.orgId, currentOrgUptime + 10 / 60); // Store in minutes and jut add 10 seconds
|
|
||||||
}
|
|
||||||
|
|
||||||
if (calcUsageAndLimits) {
|
|
||||||
// REMOTE EXIT NODES DO NOT COUNT TOWARDS USAGE
|
|
||||||
// Process all usage updates sequentially by organization to reduce deadlock risk
|
|
||||||
const allOrgIds = new Set([...orgUsageMap.keys(), ...orgUptimeMap.keys()]);
|
|
||||||
|
|
||||||
for (const orgId of allOrgIds) {
|
|
||||||
try {
|
|
||||||
// Process bandwidth usage for this org
|
|
||||||
const totalBandwidth = orgUsageMap.get(orgId);
|
|
||||||
if (totalBandwidth) {
|
|
||||||
const bandwidthUsage = await usageService.add(
|
|
||||||
orgId,
|
|
||||||
FeatureId.EGRESS_DATA_MB,
|
|
||||||
totalBandwidth,
|
|
||||||
trx
|
|
||||||
);
|
|
||||||
if (bandwidthUsage) {
|
|
||||||
usageService
|
|
||||||
.checkLimitSet(
|
|
||||||
orgId,
|
|
||||||
true,
|
|
||||||
FeatureId.EGRESS_DATA_MB,
|
|
||||||
bandwidthUsage,
|
|
||||||
trx
|
|
||||||
)
|
|
||||||
.catch((error: any) => {
|
|
||||||
logger.error(
|
|
||||||
`Error checking bandwidth limits for org ${orgId}:`,
|
|
||||||
error
|
|
||||||
);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process uptime usage for this org
|
|
||||||
const totalUptime = orgUptimeMap.get(orgId);
|
|
||||||
if (totalUptime) {
|
|
||||||
const uptimeUsage = await usageService.add(
|
|
||||||
orgId,
|
|
||||||
FeatureId.SITE_UPTIME,
|
|
||||||
totalUptime,
|
|
||||||
trx
|
|
||||||
);
|
|
||||||
if (uptimeUsage) {
|
|
||||||
usageService
|
|
||||||
.checkLimitSet(
|
|
||||||
orgId,
|
|
||||||
true,
|
|
||||||
FeatureId.SITE_UPTIME,
|
|
||||||
uptimeUsage,
|
|
||||||
trx
|
|
||||||
)
|
|
||||||
.catch((error: any) => {
|
|
||||||
logger.error(
|
|
||||||
`Error checking uptime limits for org ${orgId}:`,
|
|
||||||
error
|
|
||||||
);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
logger.error(
|
|
||||||
`Error processing usage for org ${orgId}:`,
|
|
||||||
error
|
|
||||||
);
|
|
||||||
// Don't break the loop, continue with other orgs
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(
|
||||||
|
`Failed to update bandwidth for site ${peer.publicKey}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
// Continue with other sites
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Handle sites that reported zero bandwidth but need online status updated
|
// Process usage updates outside of site update transactions
|
||||||
const zeroBandwidthPeers = bandwidthData.filter(
|
// This separates the concerns and reduces lock contention
|
||||||
(peer) => peer.bytesIn === 0 && !offlineSites.has(peer.publicKey) // Bytesout will have data as it tries to send keep alive messages
|
if (calcUsageAndLimits && (orgUsageMap.size > 0 || orgUptimeMap.size > 0)) {
|
||||||
);
|
// Sort org IDs to ensure consistent lock ordering
|
||||||
|
const allOrgIds = [...new Set([...orgUsageMap.keys(), ...orgUptimeMap.keys()])].sort();
|
||||||
|
|
||||||
if (zeroBandwidthPeers.length > 0) {
|
for (const orgId of allOrgIds) {
|
||||||
const zeroBandwidthSites = await trx
|
try {
|
||||||
.select()
|
// Process bandwidth usage for this org
|
||||||
.from(sites)
|
const totalBandwidth = orgUsageMap.get(orgId);
|
||||||
.where(
|
if (totalBandwidth) {
|
||||||
inArray(
|
const bandwidthUsage = await usageService.add(
|
||||||
sites.pubKey,
|
orgId,
|
||||||
zeroBandwidthPeers.map((p) => p.publicKey)
|
FeatureId.EGRESS_DATA_MB,
|
||||||
)
|
totalBandwidth
|
||||||
);
|
|
||||||
|
|
||||||
for (const site of zeroBandwidthSites) {
|
|
||||||
let newOnlineStatus = site.online;
|
|
||||||
|
|
||||||
// Check if site should go offline based on last bandwidth update WITH DATA
|
|
||||||
if (site.lastBandwidthUpdate) {
|
|
||||||
const lastUpdateWithData = new Date(
|
|
||||||
site.lastBandwidthUpdate
|
|
||||||
);
|
);
|
||||||
if (lastUpdateWithData < oneMinuteAgo) {
|
if (bandwidthUsage) {
|
||||||
newOnlineStatus = false;
|
// Fire and forget - don't block on limit checking
|
||||||
|
usageService
|
||||||
|
.checkLimitSet(
|
||||||
|
orgId,
|
||||||
|
true,
|
||||||
|
FeatureId.EGRESS_DATA_MB,
|
||||||
|
bandwidthUsage
|
||||||
|
)
|
||||||
|
.catch((error: any) => {
|
||||||
|
logger.error(
|
||||||
|
`Error checking bandwidth limits for org ${orgId}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// No previous data update recorded, set to offline
|
|
||||||
newOnlineStatus = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Always update lastBandwidthUpdate to show this instance is receiving reports
|
// Process uptime usage for this org
|
||||||
// Only update online status if it changed
|
const totalUptime = orgUptimeMap.get(orgId);
|
||||||
if (site.online !== newOnlineStatus) {
|
if (totalUptime) {
|
||||||
const [updatedSite] = await trx
|
const uptimeUsage = await usageService.add(
|
||||||
.update(sites)
|
orgId,
|
||||||
.set({
|
FeatureId.SITE_UPTIME,
|
||||||
online: newOnlineStatus
|
totalUptime
|
||||||
})
|
);
|
||||||
.where(eq(sites.siteId, site.siteId))
|
if (uptimeUsage) {
|
||||||
.returning();
|
// Fire and forget - don't block on limit checking
|
||||||
|
usageService
|
||||||
|
.checkLimitSet(
|
||||||
|
orgId,
|
||||||
|
true,
|
||||||
|
FeatureId.SITE_UPTIME,
|
||||||
|
uptimeUsage
|
||||||
|
)
|
||||||
|
.catch((error: any) => {
|
||||||
|
logger.error(
|
||||||
|
`Error checking uptime limits for org ${orgId}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(
|
||||||
|
`Error processing usage for org ${orgId}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
// Continue with other orgs
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle sites that reported zero bandwidth but need online status updated
|
||||||
|
const zeroBandwidthPeers = sortedBandwidthData.filter(
|
||||||
|
(peer) => peer.bytesIn === 0 && !offlineSites.has(peer.publicKey)
|
||||||
|
);
|
||||||
|
|
||||||
|
if (zeroBandwidthPeers.length > 0) {
|
||||||
|
// Fetch all zero bandwidth sites in one query
|
||||||
|
const zeroBandwidthSites = await db
|
||||||
|
.select()
|
||||||
|
.from(sites)
|
||||||
|
.where(
|
||||||
|
inArray(
|
||||||
|
sites.pubKey,
|
||||||
|
zeroBandwidthPeers.map((p) => p.publicKey)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
// Sort by siteId to ensure consistent lock ordering
|
||||||
|
const sortedZeroBandwidthSites = zeroBandwidthSites.sort(
|
||||||
|
(a, b) => a.siteId - b.siteId
|
||||||
|
);
|
||||||
|
|
||||||
|
for (const site of sortedZeroBandwidthSites) {
|
||||||
|
let newOnlineStatus = site.online;
|
||||||
|
|
||||||
|
// Check if site should go offline based on last bandwidth update WITH DATA
|
||||||
|
if (site.lastBandwidthUpdate) {
|
||||||
|
const lastUpdateWithData = new Date(site.lastBandwidthUpdate);
|
||||||
|
if (lastUpdateWithData < oneMinuteAgo) {
|
||||||
|
newOnlineStatus = false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No previous data update recorded, set to offline
|
||||||
|
newOnlineStatus = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only update online status if it changed
|
||||||
|
if (site.online !== newOnlineStatus) {
|
||||||
|
try {
|
||||||
|
const updatedSite = await withDeadlockRetry(async () => {
|
||||||
|
const [result] = await db
|
||||||
|
.update(sites)
|
||||||
|
.set({
|
||||||
|
online: newOnlineStatus
|
||||||
|
})
|
||||||
|
.where(eq(sites.siteId, site.siteId))
|
||||||
|
.returning();
|
||||||
|
return result;
|
||||||
|
}, `update offline status for site ${site.siteId}`);
|
||||||
|
|
||||||
if (updatedSite && exitNodeId) {
|
if (updatedSite && exitNodeId) {
|
||||||
if (
|
const notAllowed = await checkExitNodeOrg(
|
||||||
await checkExitNodeOrg(
|
exitNodeId,
|
||||||
exitNodeId,
|
updatedSite.orgId
|
||||||
updatedSite.orgId,
|
);
|
||||||
trx
|
if (notAllowed) {
|
||||||
)
|
|
||||||
) {
|
|
||||||
// not allowed
|
|
||||||
logger.warn(
|
logger.warn(
|
||||||
`Exit node ${exitNodeId} is not allowed for org ${updatedSite.orgId}`
|
`Exit node ${exitNodeId} is not allowed for org ${updatedSite.orgId}`
|
||||||
);
|
);
|
||||||
// THIS SHOULD TRIGGER THE TRANSACTION TO FAIL?
|
|
||||||
throw new Error("Exit node not allowed");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -262,8 +312,14 @@ export async function updateSiteBandwidth(
|
|||||||
if (!newOnlineStatus && site.pubKey) {
|
if (!newOnlineStatus && site.pubKey) {
|
||||||
offlineSites.add(site.pubKey);
|
offlineSites.add(site.pubKey);
|
||||||
}
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logger.error(
|
||||||
|
`Failed to update offline status for site ${site.siteId}:`,
|
||||||
|
error
|
||||||
|
);
|
||||||
|
// Continue with other sites
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user