mirror of
https://github.com/netbirdio/netbird.git
synced 2026-05-19 23:29:56 +00:00
apply snapshot mappings concurrently while maintaining management backpressure
This commit is contained in:
217
proxy/server.go
217
proxy/server.go
@@ -32,9 +32,11 @@ import (
|
||||
"go.opentelemetry.io/otel/sdk/metric"
|
||||
"golang.org/x/exp/maps"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/credentials"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
"google.golang.org/grpc/keepalive"
|
||||
grpcstatus "google.golang.org/grpc/status"
|
||||
"google.golang.org/protobuf/types/known/timestamppb"
|
||||
|
||||
"github.com/netbirdio/netbird/proxy/internal/accesslog"
|
||||
@@ -938,6 +940,9 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
|
||||
Clock: backoff.SystemClock,
|
||||
}
|
||||
|
||||
// syncSupported tracks whether management supports SyncMappings.
|
||||
// Starts true; set to false on first Unimplemented error.
|
||||
syncSupported := true
|
||||
initialSyncDone := false
|
||||
|
||||
operation := func() error {
|
||||
@@ -949,36 +954,25 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
|
||||
s.healthChecker.SetManagementConnected(false)
|
||||
}
|
||||
|
||||
supportsCrowdSec := s.crowdsecRegistry.Available()
|
||||
mappingClient, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
|
||||
ProxyId: s.ID,
|
||||
Version: s.Version,
|
||||
StartedAt: timestamppb.New(s.startTime),
|
||||
Address: s.ProxyURL,
|
||||
Capabilities: &proto.ProxyCapabilities{
|
||||
SupportsCustomPorts: &s.SupportsCustomPorts,
|
||||
RequireSubdomain: &s.RequireSubdomain,
|
||||
SupportsCrowdsec: &supportsCrowdSec,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("create mapping stream: %w", err)
|
||||
var streamErr error
|
||||
if syncSupported {
|
||||
streamErr = s.trySyncMappings(ctx, client, &initialSyncDone)
|
||||
if isSyncUnimplemented(streamErr) {
|
||||
syncSupported = false
|
||||
s.Logger.Info("management does not support SyncMappings, falling back to GetMappingUpdate")
|
||||
streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone)
|
||||
}
|
||||
} else {
|
||||
streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone)
|
||||
}
|
||||
|
||||
if s.healthChecker != nil {
|
||||
s.healthChecker.SetManagementConnected(true)
|
||||
}
|
||||
s.Logger.Debug("management mapping stream established")
|
||||
|
||||
// Stream established — reset backoff so the next failure retries quickly.
|
||||
bo.Reset()
|
||||
|
||||
streamErr := s.handleMappingStream(ctx, mappingClient, &initialSyncDone)
|
||||
|
||||
if s.healthChecker != nil {
|
||||
s.healthChecker.SetManagementConnected(false)
|
||||
}
|
||||
|
||||
// Stream established — reset backoff so the next failure retries quickly.
|
||||
bo.Reset()
|
||||
|
||||
if streamErr == nil {
|
||||
return fmt.Errorf("stream closed by server")
|
||||
}
|
||||
@@ -995,6 +989,125 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) proxyCapabilities() *proto.ProxyCapabilities {
|
||||
supportsCrowdSec := s.crowdsecRegistry.Available()
|
||||
return &proto.ProxyCapabilities{
|
||||
SupportsCustomPorts: &s.SupportsCustomPorts,
|
||||
RequireSubdomain: &s.RequireSubdomain,
|
||||
SupportsCrowdsec: &supportsCrowdSec,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) tryGetMappingUpdate(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool) error {
|
||||
mappingClient, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
|
||||
ProxyId: s.ID,
|
||||
Version: s.Version,
|
||||
StartedAt: timestamppb.New(s.startTime),
|
||||
Address: s.ProxyURL,
|
||||
Capabilities: s.proxyCapabilities(),
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("create mapping stream: %w", err)
|
||||
}
|
||||
|
||||
if s.healthChecker != nil {
|
||||
s.healthChecker.SetManagementConnected(true)
|
||||
}
|
||||
s.Logger.Debug("management mapping stream established (GetMappingUpdate)")
|
||||
|
||||
return s.handleMappingStream(ctx, mappingClient, initialSyncDone)
|
||||
}
|
||||
|
||||
func (s *Server) trySyncMappings(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool) error {
|
||||
stream, err := client.SyncMappings(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create sync stream: %w", err)
|
||||
}
|
||||
|
||||
// Send init message.
|
||||
if err := stream.Send(&proto.SyncMappingsRequest{
|
||||
Msg: &proto.SyncMappingsRequest_Init{
|
||||
Init: &proto.SyncMappingsInit{
|
||||
ProxyId: s.ID,
|
||||
Version: s.Version,
|
||||
StartedAt: timestamppb.New(s.startTime),
|
||||
Address: s.ProxyURL,
|
||||
Capabilities: s.proxyCapabilities(),
|
||||
},
|
||||
},
|
||||
}); err != nil {
|
||||
return fmt.Errorf("send sync init: %w", err)
|
||||
}
|
||||
|
||||
if s.healthChecker != nil {
|
||||
s.healthChecker.SetManagementConnected(true)
|
||||
}
|
||||
s.Logger.Debug("management mapping stream established (SyncMappings)")
|
||||
|
||||
return s.handleSyncMappingsStream(ctx, stream, initialSyncDone)
|
||||
}
|
||||
|
||||
func isSyncUnimplemented(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
st, ok := grpcstatus.FromError(err)
|
||||
return ok && st.Code() == codes.Unimplemented
|
||||
}
|
||||
|
||||
func (s *Server) handleSyncMappingsStream(ctx context.Context, stream proto.ProxyService_SyncMappingsClient, initialSyncDone *bool) error {
|
||||
select {
|
||||
case <-s.routerReady:
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
var snapshotIDs map[types.ServiceID]struct{}
|
||||
if !*initialSyncDone {
|
||||
snapshotIDs = make(map[types.ServiceID]struct{})
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
msg, err := stream.Recv()
|
||||
switch {
|
||||
case errors.Is(err, io.EOF):
|
||||
return nil
|
||||
case err != nil:
|
||||
return fmt.Errorf("receive msg: %w", err)
|
||||
}
|
||||
s.Logger.Debug("Received mapping update, starting processing")
|
||||
s.processMappings(ctx, msg.GetMapping())
|
||||
s.Logger.Debug("Processing mapping update completed")
|
||||
|
||||
// Send ack so management knows we're ready for the next batch.
|
||||
if err := stream.Send(&proto.SyncMappingsRequest{
|
||||
Msg: &proto.SyncMappingsRequest_Ack{Ack: &proto.SyncMappingsAck{}},
|
||||
}); err != nil {
|
||||
return fmt.Errorf("send ack: %w", err)
|
||||
}
|
||||
|
||||
if !*initialSyncDone {
|
||||
for _, m := range msg.GetMapping() {
|
||||
snapshotIDs[types.ServiceID(m.GetId())] = struct{}{}
|
||||
}
|
||||
if msg.GetInitialSyncComplete() {
|
||||
s.reconcileSnapshot(ctx, snapshotIDs)
|
||||
snapshotIDs = nil
|
||||
if s.healthChecker != nil {
|
||||
s.healthChecker.SetInitialSyncComplete()
|
||||
}
|
||||
*initialSyncDone = true
|
||||
s.Logger.Info("Initial mapping sync complete")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Server) handleMappingStream(ctx context.Context, mappingClient proto.ProxyService_GetMappingUpdateClient, initialSyncDone *bool) error {
|
||||
select {
|
||||
case <-s.routerReady:
|
||||
@@ -1067,6 +1180,8 @@ func (s *Server) reconcileSnapshot(ctx context.Context, snapshotIDs map[types.Se
|
||||
}
|
||||
|
||||
func (s *Server) processMappings(ctx context.Context, mappings []*proto.ProxyMapping) {
|
||||
s.ensurePeers(ctx, mappings)
|
||||
|
||||
for _, mapping := range mappings {
|
||||
s.Logger.WithFields(log.Fields{
|
||||
"type": mapping.GetType(),
|
||||
@@ -1100,6 +1215,60 @@ func (s *Server) processMappings(ctx context.Context, mappings []*proto.ProxyMap
|
||||
}
|
||||
}
|
||||
|
||||
// ensurePeers pre-creates NetBird peers for all unique accounts referenced by
|
||||
// CREATED mappings. Peers for different accounts are created concurrently,
|
||||
// which avoids serializing N×100ms gRPC round-trips during large initial syncs.
|
||||
func (s *Server) ensurePeers(ctx context.Context, mappings []*proto.ProxyMapping) {
|
||||
// Collect one representative mapping per account that needs a new peer.
|
||||
type peerReq struct {
|
||||
accountID types.AccountID
|
||||
svcKey roundtrip.ServiceKey
|
||||
authToken string
|
||||
svcID types.ServiceID
|
||||
}
|
||||
seen := make(map[types.AccountID]struct{})
|
||||
var reqs []peerReq
|
||||
for _, m := range mappings {
|
||||
if m.GetType() != proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED {
|
||||
continue
|
||||
}
|
||||
accountID := types.AccountID(m.GetAccountId())
|
||||
if _, ok := seen[accountID]; ok {
|
||||
continue
|
||||
}
|
||||
seen[accountID] = struct{}{}
|
||||
if s.netbird.HasClient(accountID) {
|
||||
continue
|
||||
}
|
||||
reqs = append(reqs, peerReq{
|
||||
accountID: accountID,
|
||||
svcKey: s.serviceKeyForMapping(m),
|
||||
authToken: m.GetAuthToken(),
|
||||
svcID: types.ServiceID(m.GetId()),
|
||||
})
|
||||
}
|
||||
|
||||
if len(reqs) <= 1 {
|
||||
return
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(len(reqs))
|
||||
for _, r := range reqs {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if err := s.netbird.AddPeer(ctx, r.accountID, r.svcKey, r.authToken, r.svcID); err != nil {
|
||||
s.Logger.WithFields(log.Fields{
|
||||
"account_id": r.accountID,
|
||||
"service_id": r.svcID,
|
||||
"error": err,
|
||||
}).Warn("failed to pre-create peer for account")
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// addMapping registers a service mapping and starts the appropriate relay or routes.
|
||||
func (s *Server) addMapping(ctx context.Context, mapping *proto.ProxyMapping) error {
|
||||
accountID := types.AccountID(mapping.GetAccountId())
|
||||
|
||||
Reference in New Issue
Block a user