apply snapshot mappings concurrently while maintaining management backpressure

This commit is contained in:
pascal
2026-05-18 15:44:32 +02:00
parent 705f87fc20
commit ddc4c20a31
9 changed files with 2246 additions and 98 deletions

View File

@@ -76,6 +76,11 @@ type clientEntry struct {
services map[ServiceKey]serviceInfo
createdAt time.Time
started bool
// ready is closed once the client has been fully initialized.
// Callers that find a pending entry wait on this channel before
// accessing the client. A nil initErr means success.
ready chan struct{}
initErr error
// Per-backend in-flight limiting keyed by target host:port.
// TODO: clean up stale entries when backend targets change.
inflightMu sync.Mutex
@@ -157,6 +162,9 @@ type skipTLSVerifyContextKey struct{}
// AddPeer registers a service for an account. If the account doesn't have a client yet,
// one is created by authenticating with the management server using the provided token.
// Multiple services can share the same client.
//
// Client creation (WG keygen, gRPC, embed.New) runs without holding clientsMux
// so that concurrent AddPeer calls for different accounts execute in parallel.
func (n *NetBird) AddPeer(ctx context.Context, accountID types.AccountID, key ServiceKey, authToken string, serviceID types.ServiceID) error {
si := serviceInfo{serviceID: serviceID}
@@ -164,10 +172,23 @@ func (n *NetBird) AddPeer(ctx context.Context, accountID types.AccountID, key Se
entry, exists := n.clients[accountID]
if exists {
ready := entry.ready
entry.services[key] = si
started := entry.started
n.clientsMux.Unlock()
// If the entry is still being initialized by another goroutine, wait.
if ready != nil {
select {
case <-ready:
case <-ctx.Done():
return ctx.Err()
}
if entry.initErr != nil {
return fmt.Errorf("peer initialization failed: %w", entry.initErr)
}
}
n.logger.WithFields(log.Fields{
"account_id": accountID,
"service_key": key,
@@ -184,15 +205,39 @@ func (n *NetBird) AddPeer(ctx context.Context, accountID types.AccountID, key Se
return nil
}
entry, err := n.createClientEntry(ctx, accountID, key, authToken, si)
// Insert a placeholder so other goroutines calling AddPeer for the same
// account will wait on the ready channel instead of starting a second
// client creation.
entry = &clientEntry{
services: map[ServiceKey]serviceInfo{key: si},
ready: make(chan struct{}),
}
n.clients[accountID] = entry
n.clientsMux.Unlock()
created, err := n.createClientEntry(ctx, accountID, key, authToken, si)
if err != nil {
entry.initErr = err
close(entry.ready)
n.clientsMux.Lock()
delete(n.clients, accountID)
n.clientsMux.Unlock()
return err
}
n.clients[accountID] = entry
// Transfer any services that were registered by concurrent AddPeer calls
// while we were creating the client.
n.clientsMux.Lock()
for k, v := range entry.services {
created.services[k] = v
}
created.ready = nil
n.clients[accountID] = created
n.clientsMux.Unlock()
close(entry.ready)
n.logger.WithFields(log.Fields{
"account_id": accountID,
"service_key": key,
@@ -200,13 +245,13 @@ func (n *NetBird) AddPeer(ctx context.Context, accountID types.AccountID, key Se
// Attempt to start the client in the background; if this fails we will
// retry on the first request via RoundTrip.
go n.runClientStartup(ctx, accountID, entry.client)
go n.runClientStartup(ctx, accountID, created.client)
return nil
}
// createClientEntry generates a WireGuard keypair, authenticates with management,
// and creates an embedded NetBird client. Must be called with clientsMux held.
// and creates an embedded NetBird client.
func (n *NetBird) createClientEntry(ctx context.Context, accountID types.AccountID, key ServiceKey, authToken string, si serviceInfo) (*clientEntry, error) {
serviceID := si.serviceID
n.logger.WithFields(log.Fields{

View File

@@ -0,0 +1,300 @@
package proxy
import (
"context"
"fmt"
"net"
"testing"
"time"
log "github.com/sirupsen/logrus"
"google.golang.org/grpc"
"github.com/netbirdio/netbird/proxy/internal/auth"
"github.com/netbirdio/netbird/proxy/internal/conntrack"
"github.com/netbirdio/netbird/proxy/internal/crowdsec"
proxymetrics "github.com/netbirdio/netbird/proxy/internal/metrics"
"github.com/netbirdio/netbird/proxy/internal/proxy"
"github.com/netbirdio/netbird/proxy/internal/roundtrip"
nbtcp "github.com/netbirdio/netbird/proxy/internal/tcp"
"github.com/netbirdio/netbird/proxy/internal/types"
udprelay "github.com/netbirdio/netbird/proxy/internal/udp"
"github.com/netbirdio/netbird/shared/management/proto"
"go.opentelemetry.io/otel/metric/noop"
)
// latencyMockClient simulates realistic gRPC latency for management calls.
type latencyMockClient struct {
proto.ProxyServiceClient
createPeerDelay time.Duration
statusUpdateDelay time.Duration
}
func (m *latencyMockClient) SendStatusUpdate(ctx context.Context, _ *proto.SendStatusUpdateRequest, _ ...grpc.CallOption) (*proto.SendStatusUpdateResponse, error) {
if m.statusUpdateDelay > 0 {
select {
case <-time.After(m.statusUpdateDelay):
case <-ctx.Done():
return nil, ctx.Err()
}
}
return &proto.SendStatusUpdateResponse{}, nil
}
func (m *latencyMockClient) CreateProxyPeer(ctx context.Context, _ *proto.CreateProxyPeerRequest, _ ...grpc.CallOption) (*proto.CreateProxyPeerResponse, error) {
if m.createPeerDelay > 0 {
select {
case <-time.After(m.createPeerDelay):
case <-ctx.Done():
return nil, ctx.Err()
}
}
return &proto.CreateProxyPeerResponse{Success: true}, nil
}
type discardWriter struct{}
func (discardWriter) Write(p []byte) (int, error) { return len(p), nil }
func benchServerWithLatency(b *testing.B, createPeerDelay, statusDelay time.Duration) *Server {
b.Helper()
logger := log.New()
logger.SetLevel(log.FatalLevel)
logger.SetOutput(&discardWriter{})
meter, err := proxymetrics.New(context.Background(), noop.Meter{})
if err != nil {
b.Fatal(err)
}
mgmtClient := &latencyMockClient{
createPeerDelay: createPeerDelay,
statusUpdateDelay: statusDelay,
}
nb := roundtrip.NewNetBird("bench-proxy", "bench.test",
roundtrip.ClientConfig{MgmtAddr: "http://bench.test:9999"},
logger, nil, mgmtClient)
mainRouter := nbtcp.NewRouter(logger, func(accountID types.AccountID) (types.DialContextFunc, error) {
return (&net.Dialer{}).DialContext, nil
}, &net.TCPAddr{IP: net.IPv4(127, 0, 0, 1), Port: 443})
return &Server{
Logger: logger,
mgmtClient: mgmtClient,
netbird: nb,
proxy: proxy.NewReverseProxy(nil, "auto", nil, logger),
auth: auth.NewMiddleware(logger, nil, nil),
mainRouter: mainRouter,
mainPort: 443,
meter: meter,
hijackTracker: conntrack.HijackTracker{},
crowdsecRegistry: crowdsec.NewRegistry("", "", log.NewEntry(logger)),
crowdsecServices: make(map[types.ServiceID]bool),
lastMappings: make(map[types.ServiceID]*proto.ProxyMapping),
portRouters: make(map[uint16]*portRouter),
svcPorts: make(map[types.ServiceID][]uint16),
udpRelays: make(map[types.ServiceID]*udprelay.Relay),
}
}
// generateHTTPMappings creates N HTTP-mode mappings with the given update type.
// All belong to a single account to share the embedded client.
func generateHTTPMappings(n int, updateType proto.ProxyMappingUpdateType) []*proto.ProxyMapping {
mappings := make([]*proto.ProxyMapping, n)
for i := range n {
mappings[i] = &proto.ProxyMapping{
Type: updateType,
Id: fmt.Sprintf("svc-%d", i),
AccountId: "account-1",
Domain: fmt.Sprintf("svc-%d.bench.example.com", i),
Mode: "http",
Path: []*proto.PathMapping{
{
Path: "/",
Target: fmt.Sprintf("http://10.0.%d.%d:8080", (i/256)%256, i%256),
},
},
Auth: &proto.Authentication{},
}
}
return mappings
}
// generateMultiAccountHTTPMappings creates N HTTP-mode CREATED mappings spread
// across the given number of accounts. This stresses the AddPeer new-account
// path which calls CreateProxyPeer + embed.New per unique account.
func generateMultiAccountHTTPMappings(n, accounts int) []*proto.ProxyMapping {
mappings := make([]*proto.ProxyMapping, n)
for i := range n {
mappings[i] = &proto.ProxyMapping{
Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED,
Id: fmt.Sprintf("svc-%d", i),
AccountId: fmt.Sprintf("account-%d", i%accounts),
Domain: fmt.Sprintf("svc-%d.bench.example.com", i),
Mode: "http",
Path: []*proto.PathMapping{
{
Path: "/",
Target: fmt.Sprintf("http://10.0.%d.%d:8080", (i/256)%256, i%256),
},
},
Auth: &proto.Authentication{},
}
}
return mappings
}
// generateMixedMappings creates mappings with a realistic distribution:
// 70% HTTP create, 15% modify existing, 10% TLS on main port, 5% remove.
// All use a single account to avoid embed.New dialing.
func generateMixedMappings(n int) []*proto.ProxyMapping {
mappings := make([]*proto.ProxyMapping, n)
for i := range n {
var m *proto.ProxyMapping
switch {
case i%20 < 14: // 70% HTTP create
m = &proto.ProxyMapping{
Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED,
Id: fmt.Sprintf("svc-http-%d", i),
AccountId: "account-1",
Domain: fmt.Sprintf("svc-%d.bench.example.com", i),
Mode: "http",
Path: []*proto.PathMapping{
{Path: "/", Target: fmt.Sprintf("http://10.0.%d.%d:8080", (i/256)%256, i%256)},
{Path: "/api", Target: fmt.Sprintf("http://10.0.%d.%d:8081", (i/256)%256, i%256)},
},
Auth: &proto.Authentication{},
}
case i%20 < 17: // 15% modify
m = &proto.ProxyMapping{
Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_MODIFIED,
Id: fmt.Sprintf("svc-http-%d", i%100),
AccountId: "account-1",
Domain: fmt.Sprintf("svc-%d.bench.example.com", i%100),
Mode: "http",
Path: []*proto.PathMapping{
{Path: "/", Target: fmt.Sprintf("http://10.1.%d.%d:8080", (i/256)%256, i%256)},
},
Auth: &proto.Authentication{},
}
case i%20 < 19: // 10% TLS passthrough on main port
m = &proto.ProxyMapping{
Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED,
Id: fmt.Sprintf("svc-tls-%d", i),
AccountId: "account-1",
Domain: fmt.Sprintf("tls-%d.bench.example.com", i),
Mode: "tls",
ListenPort: 443,
Path: []*proto.PathMapping{
{Path: "/", Target: fmt.Sprintf("10.2.%d.%d:443", (i/256)%256, i%256)},
},
}
default: // 5% remove
m = &proto.ProxyMapping{
Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_REMOVED,
Id: fmt.Sprintf("svc-http-%d", i%50),
AccountId: "account-1",
Domain: fmt.Sprintf("svc-%d.bench.example.com", i%50),
Mode: "http",
}
}
mappings[i] = m
}
return mappings
}
const (
createPeerLatency = 100 * time.Millisecond
statusUpdateLatency = 50 * time.Millisecond
)
// BenchmarkProcessMappings_HTTPCreate_SingleAccount benchmarks the initial sync
// scenario: N HTTP mappings all on a single account. Only the first mapping
// triggers CreateProxyPeer (100ms gRPC). The rest just register with the
// existing client. This is the "best case" production path.
func BenchmarkProcessMappings_HTTPCreate_SingleAccount(b *testing.B) {
for _, n := range []int{100, 1000, 5000} {
b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
mappings := generateHTTPMappings(n, proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED)
for range b.N {
s := benchServerWithLatency(b, createPeerLatency, statusUpdateLatency)
s.processMappings(b.Context(), mappings)
}
})
}
}
// BenchmarkProcessMappings_HTTPCreate_MultiAccount benchmarks the worst-case
// initial sync: every mapping belongs to a different account, so each one
// triggers a full CreateProxyPeer gRPC round-trip (100ms) + embed.New.
// With 500 accounts this serializes to ~50s of blocking I/O.
func BenchmarkProcessMappings_HTTPCreate_MultiAccount(b *testing.B) {
for _, tc := range []struct {
mappings int
accounts int
}{
{100, 10},
{100, 50},
{1000, 50},
{1000, 200},
{3000, 500},
} {
b.Run(fmt.Sprintf("mappings=%d/accounts=%d", tc.mappings, tc.accounts), func(b *testing.B) {
mappings := generateMultiAccountHTTPMappings(tc.mappings, tc.accounts)
for range b.N {
s := benchServerWithLatency(b, createPeerLatency, statusUpdateLatency)
s.processMappings(b.Context(), mappings)
}
})
}
}
// BenchmarkProcessMappings_Mixed benchmarks a realistic mixed workload
// of creates, modifies, TLS, and removes with production-like latency.
// TLS mappings call SendStatusUpdate (50ms each), serialized.
func BenchmarkProcessMappings_Mixed(b *testing.B) {
for _, n := range []int{100, 1000, 5000} {
b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
mappings := generateMixedMappings(n)
for range b.N {
s := benchServerWithLatency(b, createPeerLatency, statusUpdateLatency)
creates := generateHTTPMappings(100, proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED)
s.processMappings(b.Context(), creates)
s.processMappings(b.Context(), mappings)
}
})
}
}
// BenchmarkProcessMappings_ModifyOnly benchmarks bulk modification of
// already-registered mappings (no new peers needed, no gRPC).
func BenchmarkProcessMappings_ModifyOnly(b *testing.B) {
for _, n := range []int{100, 1000, 5000} {
b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
creates := generateHTTPMappings(n, proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED)
modifies := generateHTTPMappings(n, proto.ProxyMappingUpdateType_UPDATE_TYPE_MODIFIED)
for range b.N {
s := benchServerWithLatency(b, createPeerLatency, statusUpdateLatency)
s.processMappings(b.Context(), creates)
s.processMappings(b.Context(), modifies)
}
})
}
}
// BenchmarkProcessMappings_NoLatency measures pure CPU/allocation overhead
// with zero I/O latency for profiling purposes.
func BenchmarkProcessMappings_NoLatency(b *testing.B) {
for _, n := range []int{1000, 5000} {
b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
mappings := generateHTTPMappings(n, proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED)
for range b.N {
s := benchServerWithLatency(b, 0, 0)
s.processMappings(b.Context(), mappings)
}
})
}
}

View File

@@ -32,9 +32,11 @@ import (
"go.opentelemetry.io/otel/sdk/metric"
"golang.org/x/exp/maps"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/keepalive"
grpcstatus "google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/netbirdio/netbird/proxy/internal/accesslog"
@@ -938,6 +940,9 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
Clock: backoff.SystemClock,
}
// syncSupported tracks whether management supports SyncMappings.
// Starts true; set to false on first Unimplemented error.
syncSupported := true
initialSyncDone := false
operation := func() error {
@@ -949,36 +954,25 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
s.healthChecker.SetManagementConnected(false)
}
supportsCrowdSec := s.crowdsecRegistry.Available()
mappingClient, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
ProxyId: s.ID,
Version: s.Version,
StartedAt: timestamppb.New(s.startTime),
Address: s.ProxyURL,
Capabilities: &proto.ProxyCapabilities{
SupportsCustomPorts: &s.SupportsCustomPorts,
RequireSubdomain: &s.RequireSubdomain,
SupportsCrowdsec: &supportsCrowdSec,
},
})
if err != nil {
return fmt.Errorf("create mapping stream: %w", err)
var streamErr error
if syncSupported {
streamErr = s.trySyncMappings(ctx, client, &initialSyncDone)
if isSyncUnimplemented(streamErr) {
syncSupported = false
s.Logger.Info("management does not support SyncMappings, falling back to GetMappingUpdate")
streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone)
}
} else {
streamErr = s.tryGetMappingUpdate(ctx, client, &initialSyncDone)
}
if s.healthChecker != nil {
s.healthChecker.SetManagementConnected(true)
}
s.Logger.Debug("management mapping stream established")
// Stream established — reset backoff so the next failure retries quickly.
bo.Reset()
streamErr := s.handleMappingStream(ctx, mappingClient, &initialSyncDone)
if s.healthChecker != nil {
s.healthChecker.SetManagementConnected(false)
}
// Stream established — reset backoff so the next failure retries quickly.
bo.Reset()
if streamErr == nil {
return fmt.Errorf("stream closed by server")
}
@@ -995,6 +989,125 @@ func (s *Server) newManagementMappingWorker(ctx context.Context, client proto.Pr
}
}
func (s *Server) proxyCapabilities() *proto.ProxyCapabilities {
supportsCrowdSec := s.crowdsecRegistry.Available()
return &proto.ProxyCapabilities{
SupportsCustomPorts: &s.SupportsCustomPorts,
RequireSubdomain: &s.RequireSubdomain,
SupportsCrowdsec: &supportsCrowdSec,
}
}
func (s *Server) tryGetMappingUpdate(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool) error {
mappingClient, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
ProxyId: s.ID,
Version: s.Version,
StartedAt: timestamppb.New(s.startTime),
Address: s.ProxyURL,
Capabilities: s.proxyCapabilities(),
})
if err != nil {
return fmt.Errorf("create mapping stream: %w", err)
}
if s.healthChecker != nil {
s.healthChecker.SetManagementConnected(true)
}
s.Logger.Debug("management mapping stream established (GetMappingUpdate)")
return s.handleMappingStream(ctx, mappingClient, initialSyncDone)
}
func (s *Server) trySyncMappings(ctx context.Context, client proto.ProxyServiceClient, initialSyncDone *bool) error {
stream, err := client.SyncMappings(ctx)
if err != nil {
return fmt.Errorf("create sync stream: %w", err)
}
// Send init message.
if err := stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Init{
Init: &proto.SyncMappingsInit{
ProxyId: s.ID,
Version: s.Version,
StartedAt: timestamppb.New(s.startTime),
Address: s.ProxyURL,
Capabilities: s.proxyCapabilities(),
},
},
}); err != nil {
return fmt.Errorf("send sync init: %w", err)
}
if s.healthChecker != nil {
s.healthChecker.SetManagementConnected(true)
}
s.Logger.Debug("management mapping stream established (SyncMappings)")
return s.handleSyncMappingsStream(ctx, stream, initialSyncDone)
}
func isSyncUnimplemented(err error) bool {
if err == nil {
return false
}
st, ok := grpcstatus.FromError(err)
return ok && st.Code() == codes.Unimplemented
}
func (s *Server) handleSyncMappingsStream(ctx context.Context, stream proto.ProxyService_SyncMappingsClient, initialSyncDone *bool) error {
select {
case <-s.routerReady:
case <-ctx.Done():
return ctx.Err()
}
var snapshotIDs map[types.ServiceID]struct{}
if !*initialSyncDone {
snapshotIDs = make(map[types.ServiceID]struct{})
}
for {
select {
case <-ctx.Done():
return ctx.Err()
default:
msg, err := stream.Recv()
switch {
case errors.Is(err, io.EOF):
return nil
case err != nil:
return fmt.Errorf("receive msg: %w", err)
}
s.Logger.Debug("Received mapping update, starting processing")
s.processMappings(ctx, msg.GetMapping())
s.Logger.Debug("Processing mapping update completed")
// Send ack so management knows we're ready for the next batch.
if err := stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Ack{Ack: &proto.SyncMappingsAck{}},
}); err != nil {
return fmt.Errorf("send ack: %w", err)
}
if !*initialSyncDone {
for _, m := range msg.GetMapping() {
snapshotIDs[types.ServiceID(m.GetId())] = struct{}{}
}
if msg.GetInitialSyncComplete() {
s.reconcileSnapshot(ctx, snapshotIDs)
snapshotIDs = nil
if s.healthChecker != nil {
s.healthChecker.SetInitialSyncComplete()
}
*initialSyncDone = true
s.Logger.Info("Initial mapping sync complete")
}
}
}
}
}
func (s *Server) handleMappingStream(ctx context.Context, mappingClient proto.ProxyService_GetMappingUpdateClient, initialSyncDone *bool) error {
select {
case <-s.routerReady:
@@ -1067,6 +1180,8 @@ func (s *Server) reconcileSnapshot(ctx context.Context, snapshotIDs map[types.Se
}
func (s *Server) processMappings(ctx context.Context, mappings []*proto.ProxyMapping) {
s.ensurePeers(ctx, mappings)
for _, mapping := range mappings {
s.Logger.WithFields(log.Fields{
"type": mapping.GetType(),
@@ -1100,6 +1215,60 @@ func (s *Server) processMappings(ctx context.Context, mappings []*proto.ProxyMap
}
}
// ensurePeers pre-creates NetBird peers for all unique accounts referenced by
// CREATED mappings. Peers for different accounts are created concurrently,
// which avoids serializing N×100ms gRPC round-trips during large initial syncs.
func (s *Server) ensurePeers(ctx context.Context, mappings []*proto.ProxyMapping) {
// Collect one representative mapping per account that needs a new peer.
type peerReq struct {
accountID types.AccountID
svcKey roundtrip.ServiceKey
authToken string
svcID types.ServiceID
}
seen := make(map[types.AccountID]struct{})
var reqs []peerReq
for _, m := range mappings {
if m.GetType() != proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED {
continue
}
accountID := types.AccountID(m.GetAccountId())
if _, ok := seen[accountID]; ok {
continue
}
seen[accountID] = struct{}{}
if s.netbird.HasClient(accountID) {
continue
}
reqs = append(reqs, peerReq{
accountID: accountID,
svcKey: s.serviceKeyForMapping(m),
authToken: m.GetAuthToken(),
svcID: types.ServiceID(m.GetId()),
})
}
if len(reqs) <= 1 {
return
}
var wg sync.WaitGroup
wg.Add(len(reqs))
for _, r := range reqs {
go func() {
defer wg.Done()
if err := s.netbird.AddPeer(ctx, r.accountID, r.svcKey, r.authToken, r.svcID); err != nil {
s.Logger.WithFields(log.Fields{
"account_id": r.accountID,
"service_id": r.svcID,
"error": err,
}).Warn("failed to pre-create peer for account")
}
}()
}
wg.Wait()
}
// addMapping registers a service mapping and starts the appropriate relay or routes.
func (s *Server) addMapping(ctx context.Context, mapping *proto.ProxyMapping) error {
accountID := types.AccountID(mapping.GetAccountId())

510
proxy/sync_mappings_test.go Normal file
View File

@@ -0,0 +1,510 @@
package proxy
import (
"context"
"errors"
"fmt"
"net"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/credentials/insecure"
grpcstatus "google.golang.org/grpc/status"
"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/service"
"github.com/netbirdio/netbird/shared/management/proto"
)
func TestIntegration_SyncMappings_HappyPath(t *testing.T) {
setup := setupIntegrationTest(t)
defer setup.cleanup()
conn, err := grpc.NewClient(setup.grpcAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
require.NoError(t, err)
defer conn.Close()
client := proto.NewProxyServiceClient(conn)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
stream, err := client.SyncMappings(ctx)
require.NoError(t, err)
// Send init.
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Init{
Init: &proto.SyncMappingsInit{
ProxyId: "sync-proxy-1",
Version: "test-v1",
Address: "test.proxy.io",
},
},
})
require.NoError(t, err)
mappingsByID := make(map[string]*proto.ProxyMapping)
for {
msg, err := stream.Recv()
require.NoError(t, err)
for _, m := range msg.GetMapping() {
mappingsByID[m.GetId()] = m
}
// Ack every batch.
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Ack{Ack: &proto.SyncMappingsAck{}},
})
require.NoError(t, err)
if msg.GetInitialSyncComplete() {
break
}
}
assert.Len(t, mappingsByID, 2, "Should receive 2 mappings")
rp1 := mappingsByID["rp-1"]
require.NotNil(t, rp1)
assert.Equal(t, "app1.test.proxy.io", rp1.GetDomain())
assert.Equal(t, "test-account-1", rp1.GetAccountId())
assert.Equal(t, proto.ProxyMappingUpdateType_UPDATE_TYPE_CREATED, rp1.GetType())
assert.NotEmpty(t, rp1.GetAuthToken(), "Should have auth token")
rp2 := mappingsByID["rp-2"]
require.NotNil(t, rp2)
assert.Equal(t, "app2.test.proxy.io", rp2.GetDomain())
}
func TestIntegration_SyncMappings_BackPressure(t *testing.T) {
setup := setupIntegrationTest(t)
defer setup.cleanup()
// Add more services so we get multiple batches.
addServicesToStore(t, setup, 20, "test.proxy.io")
conn, err := grpc.NewClient(setup.grpcAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
require.NoError(t, err)
defer conn.Close()
client := proto.NewProxyServiceClient(conn)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
stream, err := client.SyncMappings(ctx)
require.NoError(t, err)
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Init{
Init: &proto.SyncMappingsInit{
ProxyId: "sync-proxy-backpressure",
Version: "test-v1",
Address: "test.proxy.io",
},
},
})
require.NoError(t, err)
// Record the ordering of events to verify back-pressure.
var mu sync.Mutex
var events []string
var totalMappings int
for {
msg, err := stream.Recv()
require.NoError(t, err)
mu.Lock()
events = append(events, "recv")
totalMappings += len(msg.GetMapping())
mu.Unlock()
// Simulate processing delay.
time.Sleep(50 * time.Millisecond)
mu.Lock()
events = append(events, "ack")
mu.Unlock()
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Ack{Ack: &proto.SyncMappingsAck{}},
})
require.NoError(t, err)
if msg.GetInitialSyncComplete() {
break
}
}
// 2 original + 20 added = 22 services total.
assert.Equal(t, 22, totalMappings, "should receive all 22 mappings")
// Events should alternate recv/ack — no two recvs in a row
// (management waits for ack before sending next).
mu.Lock()
defer mu.Unlock()
for i := 0; i < len(events)-1; i += 2 {
assert.Equal(t, "recv", events[i], "event %d should be recv", i)
if i+1 < len(events) {
assert.Equal(t, "ack", events[i+1], "event %d should be ack", i+1)
}
}
}
func TestIntegration_SyncMappings_IncrementalUpdate(t *testing.T) {
setup := setupIntegrationTest(t)
defer setup.cleanup()
conn, err := grpc.NewClient(setup.grpcAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
require.NoError(t, err)
defer conn.Close()
client := proto.NewProxyServiceClient(conn)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
stream, err := client.SyncMappings(ctx)
require.NoError(t, err)
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Init{
Init: &proto.SyncMappingsInit{
ProxyId: "sync-proxy-incremental",
Version: "test-v1",
Address: "test.proxy.io",
},
},
})
require.NoError(t, err)
// Drain initial snapshot.
for {
msg, err := stream.Recv()
require.NoError(t, err)
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Ack{Ack: &proto.SyncMappingsAck{}},
})
require.NoError(t, err)
if msg.GetInitialSyncComplete() {
break
}
}
// Now send an incremental update via the management server.
setup.proxyService.SendServiceUpdate(&proto.GetMappingUpdateResponse{
Mapping: []*proto.ProxyMapping{{
Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_REMOVED,
Id: "rp-1",
AccountId: "test-account-1",
Domain: "app1.test.proxy.io",
}},
})
// Receive the incremental update on the sync stream.
msg, err := stream.Recv()
require.NoError(t, err)
require.NotEmpty(t, msg.GetMapping())
assert.Equal(t, "rp-1", msg.GetMapping()[0].GetId())
assert.Equal(t, proto.ProxyMappingUpdateType_UPDATE_TYPE_REMOVED, msg.GetMapping()[0].GetType())
}
func TestIntegration_SyncMappings_MixedProxyVersions(t *testing.T) {
setup := setupIntegrationTest(t)
defer setup.cleanup()
conn, err := grpc.NewClient(setup.grpcAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
require.NoError(t, err)
defer conn.Close()
client := proto.NewProxyServiceClient(conn)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// Old proxy uses GetMappingUpdate.
legacyStream, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
ProxyId: "legacy-proxy",
Version: "old-v1",
Address: "test.proxy.io",
})
require.NoError(t, err)
var legacyMappings []*proto.ProxyMapping
for {
msg, err := legacyStream.Recv()
require.NoError(t, err)
legacyMappings = append(legacyMappings, msg.GetMapping()...)
if msg.GetInitialSyncComplete() {
break
}
}
// New proxy uses SyncMappings.
syncStream, err := client.SyncMappings(ctx)
require.NoError(t, err)
err = syncStream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Init{
Init: &proto.SyncMappingsInit{
ProxyId: "new-proxy",
Version: "new-v2",
Address: "test.proxy.io",
},
},
})
require.NoError(t, err)
var syncMappings []*proto.ProxyMapping
for {
msg, err := syncStream.Recv()
require.NoError(t, err)
syncMappings = append(syncMappings, msg.GetMapping()...)
err = syncStream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Ack{Ack: &proto.SyncMappingsAck{}},
})
require.NoError(t, err)
if msg.GetInitialSyncComplete() {
break
}
}
// Both should receive the same set of mappings.
assert.Equal(t, len(legacyMappings), len(syncMappings),
"legacy and sync proxies should receive the same number of mappings")
legacyIDs := make(map[string]bool)
for _, m := range legacyMappings {
legacyIDs[m.GetId()] = true
}
for _, m := range syncMappings {
assert.True(t, legacyIDs[m.GetId()],
"mapping %s should be present in both streams", m.GetId())
}
// Both proxies should be connected.
proxies := setup.proxyService.GetConnectedProxies()
assert.Contains(t, proxies, "legacy-proxy")
assert.Contains(t, proxies, "new-proxy")
// Both should receive incremental updates.
setup.proxyService.SendServiceUpdate(&proto.GetMappingUpdateResponse{
Mapping: []*proto.ProxyMapping{{
Type: proto.ProxyMappingUpdateType_UPDATE_TYPE_REMOVED,
Id: "rp-1",
AccountId: "test-account-1",
Domain: "app1.test.proxy.io",
}},
})
// Legacy proxy receives via GetMappingUpdateResponse.
legacyMsg, err := legacyStream.Recv()
require.NoError(t, err)
assert.Equal(t, "rp-1", legacyMsg.GetMapping()[0].GetId())
// Sync proxy receives via SyncMappingsResponse.
syncMsg, err := syncStream.Recv()
require.NoError(t, err)
assert.Equal(t, "rp-1", syncMsg.GetMapping()[0].GetId())
}
func TestIntegration_SyncMappings_Reconnect(t *testing.T) {
setup := setupIntegrationTest(t)
defer setup.cleanup()
conn, err := grpc.NewClient(setup.grpcAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
require.NoError(t, err)
defer conn.Close()
client := proto.NewProxyServiceClient(conn)
proxyID := "sync-proxy-reconnect"
receiveMappings := func() []*proto.ProxyMapping {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
stream, err := client.SyncMappings(ctx)
require.NoError(t, err)
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Init{
Init: &proto.SyncMappingsInit{
ProxyId: proxyID,
Version: "test-v1",
Address: "test.proxy.io",
},
},
})
require.NoError(t, err)
var mappings []*proto.ProxyMapping
for {
msg, err := stream.Recv()
require.NoError(t, err)
mappings = append(mappings, msg.GetMapping()...)
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Ack{Ack: &proto.SyncMappingsAck{}},
})
require.NoError(t, err)
if msg.GetInitialSyncComplete() {
break
}
}
return mappings
}
first := receiveMappings()
time.Sleep(100 * time.Millisecond)
second := receiveMappings()
assert.Equal(t, len(first), len(second),
"should receive same mappings on reconnect")
firstIDs := make(map[string]bool)
for _, m := range first {
firstIDs[m.GetId()] = true
}
for _, m := range second {
assert.True(t, firstIDs[m.GetId()],
"mapping %s should be present in both connections", m.GetId())
}
}
// --- Fallback tests: old management returns Unimplemented ---
// unimplementedProxyServer embeds UnimplementedProxyServiceServer so
// SyncMappings returns codes.Unimplemented while GetMappingUpdate works.
type unimplementedSyncServer struct {
proto.UnimplementedProxyServiceServer
getMappingCalls atomic.Int32
}
func (s *unimplementedSyncServer) GetMappingUpdate(_ *proto.GetMappingUpdateRequest, stream proto.ProxyService_GetMappingUpdateServer) error {
s.getMappingCalls.Add(1)
return stream.Send(&proto.GetMappingUpdateResponse{
Mapping: []*proto.ProxyMapping{{Id: "svc-1", AccountId: "acct-1", Domain: "example.com"}},
InitialSyncComplete: true,
})
}
func TestIntegration_FallbackToGetMappingUpdate(t *testing.T) {
// Start a gRPC server that does NOT implement SyncMappings.
lis, err := net.Listen("tcp", "127.0.0.1:0")
require.NoError(t, err)
srv := &unimplementedSyncServer{}
grpcServer := grpc.NewServer()
proto.RegisterProxyServiceServer(grpcServer, srv)
go func() { _ = grpcServer.Serve(lis) }()
defer grpcServer.GracefulStop()
conn, err := grpc.NewClient(lis.Addr().String(), grpc.WithTransportCredentials(insecure.NewCredentials()))
require.NoError(t, err)
defer conn.Close()
client := proto.NewProxyServiceClient(conn)
// Try SyncMappings — should get Unimplemented.
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
stream, err := client.SyncMappings(ctx)
require.NoError(t, err)
err = stream.Send(&proto.SyncMappingsRequest{
Msg: &proto.SyncMappingsRequest_Init{
Init: &proto.SyncMappingsInit{
ProxyId: "test-proxy",
Address: "test.example.com",
},
},
})
require.NoError(t, err)
_, err = stream.Recv()
require.Error(t, err)
st, ok := grpcstatus.FromError(err)
require.True(t, ok)
assert.Equal(t, codes.Unimplemented, st.Code(),
"unimplemented SyncMappings should return Unimplemented code")
// isSyncUnimplemented should detect this.
assert.True(t, isSyncUnimplemented(err))
// The actual fallback: GetMappingUpdate should work.
legacyStream, err := client.GetMappingUpdate(ctx, &proto.GetMappingUpdateRequest{
ProxyId: "test-proxy",
Address: "test.example.com",
})
require.NoError(t, err)
msg, err := legacyStream.Recv()
require.NoError(t, err)
assert.True(t, msg.GetInitialSyncComplete())
assert.Len(t, msg.GetMapping(), 1)
assert.Equal(t, int32(1), srv.getMappingCalls.Load())
}
func TestIsSyncUnimplemented(t *testing.T) {
tests := []struct {
name string
err error
want bool
}{
{"nil error", nil, false},
{"non-grpc error", errors.New("random"), false},
{"grpc internal", grpcstatus.Error(codes.Internal, "fail"), false},
{"grpc unavailable", grpcstatus.Error(codes.Unavailable, "fail"), false},
{"grpc unimplemented", grpcstatus.Error(codes.Unimplemented, "method not found"), true},
{
"wrapped unimplemented",
fmt.Errorf("create sync stream: %w", grpcstatus.Error(codes.Unimplemented, "nope")),
// grpc/status.FromError unwraps in recent versions of grpc-go.
true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.want, isSyncUnimplemented(tt.err))
})
}
}
// addServicesToStore adds n extra services to the test store for the given cluster.
func addServicesToStore(t *testing.T, setup *integrationTestSetup, n int, cluster string) {
t.Helper()
ctx := context.Background()
for i := 0; i < n; i++ {
svc := &service.Service{
ID: fmt.Sprintf("extra-svc-%d", i),
AccountID: "test-account-1",
Name: fmt.Sprintf("Extra Service %d", i),
Domain: fmt.Sprintf("extra-%d.test.proxy.io", i),
ProxyCluster: cluster,
Enabled: true,
Targets: []*service.Target{{
Path: strPtr("/"),
Host: fmt.Sprintf("10.0.1.%d", i%256),
Port: 8080,
Protocol: "http",
TargetId: fmt.Sprintf("peer-extra-%d", i),
TargetType: "peer",
Enabled: true,
}},
}
require.NoError(t, setup.store.CreateService(ctx, svc))
}
}