[management] Cache peer snapshot + consolidate auth reads on Sync hot path

Trim the fast-path Sync handler by removing two DB round trips on cache hit:

1. Consolidate GetUserIDByPeerKey + GetAccountIDByPeerPubKey into a single
   GetPeerAuthInfoByPubKey store call. Both looked up the same peer row by
   pubkey and returned one column each; the new method SELECTs both columns
   in one query. AccountManager exposes it as GetPeerAuthInfo.

2. Extend peerSyncEntry with AccountID, PeerID, PeerKey, Ephemeral and a
   HasUser flag so the cache carries everything the fast path needs. On
   cache hit with a matching metaHash:

    - The Sync handler skips GetPeerAuthInfo entirely (entry.AccountID and
      entry.HasUser drive the loginFilter gate).
    - commitFastPath skips GetPeerByPeerPubKey by using the cached peer
      snapshot for OnPeerConnectedWithPeer.

Old cache entries from pre-step-2 shape still decode (missing fields zero
out) but IsComplete() returns false, so they fall through to the slow path
and get rewritten with the full shape on first pass. No migration needed.

Expected impact on a 16.8 s pathological Sync observed in production:
~6 s saved from eliminating one auth-read round trip, the pre-fast-path
GetPeerAuthInfo on cache hit, and GetPeerByPeerPubKey in commitFastPath.
Cache miss / cold start remain on the slow path unchanged.

Account-serial, ExtraSettings and peer-group caching — the remaining
synchronous DB reads — are deliberately left for a follow-up so the
invalidation design can be proven incrementally.
This commit is contained in:
mlsmaycon
2026-04-24 11:41:59 +02:00
parent 5993264d34
commit 7e9d3485d8
9 changed files with 251 additions and 60 deletions

View File

@@ -261,19 +261,39 @@ func (s *Server) Sync(req *proto.EncryptedMessage, srv proto.ManagementService_S
realIP := getRealIP(ctx)
sRealIP := realIP.String()
peerMeta := extractPeerMeta(ctx, syncReq.GetMeta())
getUserIDStart := time.Now()
userID, err := s.accountManager.GetUserIDByPeerKey(ctx, peerKey.String())
if err != nil {
s.syncSem.Add(-1)
if errStatus, ok := internalStatus.FromError(err); ok && errStatus.Type() == internalStatus.NotFound {
return status.Errorf(codes.PermissionDenied, "peer is not registered")
}
return mapError(ctx, err)
}
log.WithContext(ctx).Debugf("fast path: GetUserIDByPeerKey took %s", time.Since(getUserIDStart))
metahashed := metaHash(peerMeta, sRealIP)
// Fast path authorisation short-circuit: if the peer-sync cache has a
// complete entry whose metaHash still matches the incoming request, we can
// skip GetPeerAuthInfo entirely. The entry carries AccountID and HasUser
// so we have everything the loginFilter gate and the rest of the handler
// need. On any mismatch we fall back to the DB read below.
var (
userID string
accountID string
)
cachedEntry, cachedEntryHit := s.lookupPeerAuthFromCache(peerKey.String(), metahashed, peerMeta.GoOS)
if cachedEntryHit {
accountID = cachedEntry.AccountID
if cachedEntry.HasUser {
userID = "cached"
}
log.WithContext(ctx).Debugf("fast path: GetPeerAuthInfo skipped (cache hit)")
} else {
authInfoStart := time.Now()
uid, aid, err := s.accountManager.GetPeerAuthInfo(ctx, peerKey.String())
if err != nil {
s.syncSem.Add(-1)
if errStatus, ok := internalStatus.FromError(err); ok && errStatus.Type() == internalStatus.NotFound {
return status.Errorf(codes.PermissionDenied, "peer is not registered")
}
return mapError(ctx, err)
}
userID = uid
accountID = aid
log.WithContext(ctx).Debugf("fast path: GetPeerAuthInfo took %s", time.Since(authInfoStart))
}
if userID == "" && !s.loginFilter.allowLogin(peerKey.String(), metahashed) {
if s.appMetrics != nil {
s.appMetrics.GRPCMetrics().CountSyncRequestBlocked()
@@ -294,21 +314,6 @@ func (s *Server) Sync(req *proto.EncryptedMessage, srv proto.ManagementService_S
// nolint:staticcheck
ctx = context.WithValue(ctx, nbContext.PeerIDKey, peerKey.String())
getAccountIDStart := time.Now()
accountID, err := s.accountManager.GetAccountIDForPeerKey(ctx, peerKey.String())
if err != nil {
// nolint:staticcheck
ctx = context.WithValue(ctx, nbContext.AccountIDKey, "UNKNOWN")
log.WithContext(ctx).Tracef("peer %s is not registered", peerKey.String())
if errStatus, ok := internalStatus.FromError(err); ok && errStatus.Type() == internalStatus.NotFound {
s.syncSem.Add(-1)
return status.Errorf(codes.PermissionDenied, "peer is not registered")
}
s.syncSem.Add(-1)
return err
}
log.WithContext(ctx).Debugf("fast path: GetAccountIDForPeerKey took %s", time.Since(getAccountIDStart))
// nolint:staticcheck
ctx = context.WithValue(ctx, nbContext.AccountIDKey, accountID)
@@ -348,7 +353,7 @@ func (s *Server) Sync(req *proto.EncryptedMessage, srv proto.ManagementService_S
s.cancelPeerRoutinesWithoutLock(ctx, accountID, peer, syncStart)
return err
}
s.recordPeerSyncEntry(peerKey.String(), netMap, metahash)
s.recordPeerSyncEntry(peerKey.String(), netMap, metahash, peer)
updates, err := s.networkMapController.OnPeerConnected(ctx, accountID, peer.ID)
if err != nil {
@@ -522,7 +527,7 @@ func (s *Server) sendUpdate(ctx context.Context, accountID string, peerKey wgtyp
return status.Errorf(codes.Internal, "failed sending update message")
}
if update.MessageType == network_map.MessageTypeNetworkMap {
s.recordPeerSyncEntryFromUpdate(peerKey.String(), update, peerMetaHash)
s.recordPeerSyncEntryFromUpdate(peerKey.String(), update, peerMetaHash, peer)
}
log.WithContext(ctx).Debugf("sent an update to peer %s", peerKey.String())
return nil