apply snapshot mappings concurrently while maintaining management backpressure

This commit is contained in:
pascal
2026-05-18 15:44:32 +02:00
parent 705f87fc20
commit ddc4c20a31
9 changed files with 2246 additions and 98 deletions

View File

@@ -32,9 +32,11 @@ import (
"go.opentelemetry.io/otel/sdk/metric"
"golang.org/x/exp/maps"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/keepalive"
grpcstatus "google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/netbirdio/netbird/proxy/internal/accesslog"
@@ -938,6 +940,9 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
Clock: backoff.SystemClock,
}
// syncSupported tracks whether management supports SyncMappings.
// Starts true; set to false on first Unimplemented error.
syncSupported := true
initialSyncDone := false
operation := func() error {
@@ -949,36 +954,25 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
s.healthChecker.SetManagementConnected(false)
}
supportsCrowdSec := s.crowdsecRegistry.Available()
mappingClient, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
ProxyId: s.ID,
Version: s.Version,
StartedAt: timestamppb.New(s.startTime),
Address: s.ProxyURL,
Capabilities: &proto.ProxyCapabilities{
SupportsCustomPorts: &s.SupportsCustomPorts,
RequireSubdomain: &s.RequireSubdomain,
SupportsCrowdsec: &supportsCrowdSec,
},
})
if err != nil {
return fmt.Errorf("create mapping stream: %w", err)
var streamErr error
if syncSupported {
streamErr = s.trySyncMappings(ctx, client, &initialSyncDone)
if isSyncUnimplemented(streamErr) {
syncSupported = false
s.Logger.Info("management does not support SyncMappings, falling back to GetMappingUpdate")
streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone)
}
} else {
streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone)
}
if s.healthChecker != nil {
s.healthChecker.SetManagementConnected(true)
}
s.Logger.Debug("management mapping stream established")
// Stream established — reset backoff so the next failure retries quickly.
bo.Reset()
streamErr := s.handleMappingStream(ctx, mappingClient, &initialSyncDone)
if s.healthChecker != nil {
s.healthChecker.SetManagementConnected(false)
}
// Stream established — reset backoff so the next failure retries quickly.
bo.Reset()
if streamErr == nil {
return fmt.Errorf("stream closed by server")
}
@@ -995,6 +989,125 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
}
}
func (s *Server) proxyCapabilities() *proto.ProxyCapabilities {
supportsCrowdSec := s.crowdsecRegistry.Available()
return &proto.ProxyCapabilities{
SupportsCustomPorts: &s.SupportsCustomPorts,
RequireSubdomain: &s.RequireSubdomain,
SupportsCrowdsec: &supportsCrowdSec,
}
}
func (s *Server) tryGetMappingUpdate(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool) error {
mappingClient, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
ProxyId: s.ID,
Version: s.Version,
StartedAt: timestamppb.New(s.startTime),
Address: s.ProxyURL,
Capabilities: s.proxyCapabilities(),
})
if err != nil {
return fmt.Errorf("create mapping stream: %w", err)
}
if s.healthChecker != nil {
s.healthChecker.SetManagementConnected(true)
}
s.Logger.Debug("management mapping stream established (GetMappingUpdate)")
return s.handleMappingStream(ctx, mappingClient, initialSyncDone)
}
func (s *Server) trySyncMappings(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool) error {
stream, err := client.SyncMappings(ctx)
if err != nil {
return fmt.Errorf("create sync stream: %w", err)
}
// Send init message.
if err := stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Init{
Init: &proto.SyncMappingsInit{
ProxyId: s.ID,
Version: s.Version,
StartedAt: timestamppb.New(s.startTime),
Address: s.ProxyURL,
Capabilities: s.proxyCapabilities(),
},
},
}); err != nil {
return fmt.Errorf("send sync init: %w", err)
}
if s.healthChecker != nil {
s.healthChecker.SetManagementConnected(true)
}
s.Logger.Debug("management mapping stream established (SyncMappings)")
return s.handleSyncMappingsStream(ctx, stream, initialSyncDone)
}
func isSyncUnimplemented(err error) bool {
if err == nil {
return false
}
st, ok := grpcstatus.FromError(err)
return ok && st.Code() == codes.Unimplemented
}
func (s *Server) handleSyncMappingsStream(ctx context.Context, stream proto.ProxyService_SyncMappingsClient, initialSyncDone *bool) error {
select {
case <-s.routerReady:
case <-ctx.Done():
return ctx.Err()
}
var snapshotIDs map[types.ServiceID]struct{}
if !*initialSyncDone {
snapshotIDs = make(map[types.ServiceID]struct{})
}
for {
select {
case <-ctx.Done():
return ctx.Err()
default:
msg, err := stream.Recv()
switch {
case errors.Is(err, io.EOF):
return nil
case err != nil:
return fmt.Errorf("receive msg: %w", err)
}
s.Logger.Debug("Received mapping update, starting processing")
s.processMappings(ctx, msg.GetMapping())
s.Logger.Debug("Processing mapping update completed")
// Send ack so management knows we're ready for the next batch.
if err := stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Ack{Ack: &proto.SyncMappingsAck{}},
}); err != nil {
return fmt.Errorf("send ack: %w", err)
}
if !*initialSyncDone {
for _, m := range msg.GetMapping() {
snapshotIDs[types.ServiceID(m.GetId())] = struct{}{}
}
if msg.GetInitialSyncComplete() {
s.reconcileSnapshot(ctx, snapshotIDs)
snapshotIDs = nil
if s.healthChecker != nil {
s.healthChecker.SetInitialSyncComplete()
}
*initialSyncDone = true
s.Logger.Info("Initial mapping sync complete")
}
}
}
}
}
func (s *Server) handleMappingStream(ctx context.Context, mappingClient proto.ProxyService_GetMappingUpdateClient, initialSyncDone *bool) error {
select {
case <-s.routerReady:
@@ -1067,6 +1180,8 @@ func (s *Server) reconcileSnapshot(ctx context.Context, snapshotIDs map[types.Se
}
func (s *Server) processMappings(ctx context.Context, mappings []*proto.ProxyMapping) {
s.ensurePeers(ctx, mappings)
for _, mapping := range mappings {
s.Logger.WithFields(log.Fields{
"type": mapping.GetType(),
@@ -1100,6 +1215,60 @@ func (s *Server) processMappings(ctx context.Context, mappings []*proto.ProxyMap
}
}
// ensurePeers pre-creates NetBird peers for all unique accounts referenced by
// CREATED mappings. Peers for different accounts are created concurrently,
// which avoids serializing N×100ms gRPC round-trips during large initial syncs.
func (s *Server) ensurePeers(ctx context.Context, mappings []*proto.ProxyMapping) {
// Collect one representative mapping per account that needs a new peer.
type peerReq struct {
accountID types.AccountID
svcKey roundtrip.ServiceKey
authToken string
svcID types.ServiceID
}
seen := make(map[types.AccountID]struct{})
var reqs []peerReq
for _, m := range mappings {
if m.GetType() != proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED {
continue
}
accountID := types.AccountID(m.GetAccountId())
if _, ok := seen[accountID]; ok {
continue
}
seen[accountID] = struct{}{}
if s.netbird.HasClient(accountID) {
continue
}
reqs = append(reqs, peerReq{
accountID: accountID,
svcKey: s.serviceKeyForMapping(m),
authToken: m.GetAuthToken(),
svcID: types.ServiceID(m.GetId()),
})
}
if len(reqs) <= 1 {
return
}
var wg sync.WaitGroup
wg.Add(len(reqs))
for _, r := range reqs {
go func() {
defer wg.Done()
if err := s.netbird.AddPeer(ctx, r.accountID, r.svcKey, r.authToken, r.svcID); err != nil {
s.Logger.WithFields(log.Fields{
"account_id": r.accountID,
"service_id": r.svcID,
"error": err,
}).Warn("failed to pre-create peer for account")
}
}()
}
wg.Wait()
}
// addMapping registers a service mapping and starts the appropriate relay or routes.
func (s *Server) addMapping(ctx context.Context, mapping *proto.ProxyMapping) error {
accountID := types.AccountID(mapping.GetAccountId())