feat(private-service): expose NetBird-only services over tunnel peers

Adds a new "private" service mode for the reverse proxy: services
reachable exclusively over the embedded WireGuard tunnel, gated by
per-peer group membership instead of operator auth schemes.

Wire contract
- ProxyMapping.private (field 13): the proxy MUST call
  ValidateTunnelPeer and fail closed; operator schemes are bypassed.
- ProxyCapabilities.private (field 4) + supports_private_service (field 5):
  capability gate. Management never streams private mappings to
  proxies that don't claim the capability; the broadcast path applies
  the same filter via filterMappingsForProxy.
- ValidateTunnelPeer RPC: resolves an inbound tunnel IP to a peer,
  checks the peer's groups against service.AccessGroups, and mints
  a session JWT on success. checkPeerGroupAccess fails closed when
  a private service has empty AccessGroups.
- ValidateSession/ValidateTunnelPeer responses now carry
  peer_group_ids + peer_group_names so the proxy can authorise
  policy-aware middlewares without an extra management round-trip.
- ProxyInboundListener + SendStatusUpdate.inbound_listener: per-account
  inbound listener state surfaced to dashboards.
- PathTargetOptions.direct_upstream (field 11): bypass the embedded NetBird
  client and dial the target via the proxy host's network stack for
  upstreams reachable without WireGuard.

Data model
- Service.Private (bool) + Service.AccessGroups ([]string, JSON-
  serialised). Validate() rejects bearer auth on private services.
  Copy() deep-copies AccessGroups. pgx getServices loads the columns.
- DomainConfig.Private threaded into the proxy auth middleware.
  Request handler routes private services through forwardWithTunnelPeer
  and returns 403 on validation failure.
- Account-level SynthesizePrivateServiceZones (synthetic DNS) and
  injectPrivateServicePolicies (synthetic ACL) gate on
  len(svc.AccessGroups) > 0.

Proxy
- /netbird proxy --private (embedded mode) flag; Config.Private in
  proxy/lifecycle.go.
- Per-account inbound listener (proxy/inbound.go) binding HTTP/HTTPS
  on the embedded NetBird client's WireGuard tunnel netstack.
- proxy/internal/auth/tunnel_cache: ValidateTunnelPeer response cache
  with single-flight de-duplication and per-account eviction.
- Local peerstore short-circuit: when the inbound IP isn't in the
  account roster, deny fast without an RPC.
- proxy/server.go reports SupportsPrivateService=true and redacts the
  full ProxyMapping JSON from info logs (auth_token + header-auth
  hashed values now only at debug level).

Identity forwarding
- ValidateSessionJWT returns user_id, email, method, groups,
  group_names. sessionkey.Claims carries Email + Groups + GroupNames
  so the proxy can stamp identity onto upstream requests without an
  extra management round-trip on every cookie-bearing request.
- CapturedData carries userEmail / userGroups / userGroupNames; the
  proxy stamps X-NetBird-User and X-NetBird-Groups on r.Out from the
  authenticated identity (strips client-supplied values first to
  prevent spoofing).
- AccessLog.UserGroups: access-log enrichment captures the user's
  group memberships at write time so the dashboard can render group
  context without reverse-resolving stale memberships.

OpenAPI/dashboard surface
- ReverseProxyService gains private + access_groups; ReverseProxyCluster
  gains private + supports_private. ReverseProxyTarget target_type
  enum gains "cluster". ServiceTargetOptions gains direct_upstream.
  ProxyAccessLog gains user_groups.
This commit is contained in:
mlsmaycon
2026-05-20 21:39:22 +02:00
parent 37052fd5bc
commit 167ee08e14
72 changed files with 6584 additions and 2586 deletions

134
proxy/lifecycle_test.go Normal file
View File

@@ -0,0 +1,134 @@
package proxy
import (
"context"
"errors"
"io"
"testing"
"time"
log "github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// quietLifecycleLogger builds a fresh logrus logger whose output is discarded
// and whose level is raised to PanicLevel, so lifecycle tests do not clutter
// the test output.
func quietLifecycleLogger() *log.Logger {
	logger := log.New()
	logger.SetLevel(log.PanicLevel)
	logger.SetOutput(io.Discard)
	return logger
}
func TestNewIsPureConstructor(t *testing.T) {
cfg := Config{
ListenAddr: ":0",
ID: "test-id",
Logger: quietLifecycleLogger(),
Version: "test",
ManagementAddress: "https://example.invalid",
HealthAddr: "",
ForwardedProto: "auto",
}
srv := New(cfg)
require.NotNil(t, srv, "New must return a non-nil Server")
assert.Equal(t, ":0", srv.ListenAddr, "ListenAddr should round-trip")
assert.Equal(t, "test-id", srv.ID, "ID should round-trip")
assert.Equal(t, "test", srv.Version, "Version should round-trip")
assert.Equal(t, "https://example.invalid", srv.ManagementAddress, "ManagementAddress should round-trip")
assert.Equal(t, "auto", srv.ForwardedProto, "ForwardedProto should round-trip")
// Pure constructor: no goroutines, no listener bind, no management dial.
assert.False(t, srv.started, "Server must be marked unstarted before Start")
assert.Nil(t, srv.mgmtClient, "mgmt client must not be created in New")
assert.Nil(t, srv.netbird, "netbird client must not be created in New")
assert.Nil(t, srv.https, "https server must not be created in New")
assert.Nil(t, srv.healthServer, "health server must not be created in New")
assert.Nil(t, srv.runCancel, "runCancel must be nil before Start")
assert.Nil(t, srv.runErrCh, "runErrCh must be nil before Start")
}
// TestStopBeforeStartIsNoOp checks that Stop returns no error on a Server that
// was never started, both on the first call and on a repeated call.
func TestStopBeforeStartIsNoOp(t *testing.T) {
	server := New(Config{Logger: quietLifecycleLogger()})

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	assert.NoError(t, server.Stop(ctx), "Stop on an unstarted server must succeed without error")
	assert.NoError(t, server.Stop(ctx), "Stop must remain idempotent across repeated calls")
}
// TestStartFailsWithoutManagement checks that Start returns an error when the
// management address is malformed, that the started flag is set regardless,
// and that a second Start is rejected with an "already started" error.
func TestStartFailsWithoutManagement(t *testing.T) {
	server := New(Config{
		Logger:            quietLifecycleLogger(),
		ListenAddr:        "127.0.0.1:0",
		ManagementAddress: "://broken-url",
	})

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	// First attempt: expected to fail, yet still flip the started flag.
	firstErr := server.Start(ctx)
	require.Error(t, firstErr, "Start must surface management dial failures")
	assert.True(t, server.started, "started flag is set before any dial attempt so a second Start fails fast")

	// Second attempt: must be refused because the server counts as started.
	secondErr := server.Start(ctx)
	require.Error(t, secondErr, "second Start must reject")
	assert.Contains(t, secondErr.Error(), "already started", "error must explain why the call was rejected")
}
// TestStopIsIdempotent checks that, once a background error has been recorded
// via recordRunErr, every call to Stop on a started Server reports that same
// error.
func TestStopIsIdempotent(t *testing.T) {
	server := &Server{
		Logger:    quietLifecycleLogger(),
		started:   true,
		runErrCh:  make(chan struct{}),
		runCancel: func() {},
	}
	server.recordRunErr(errors.New("synthetic"))

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	// Two consecutive Stop calls; both must carry the recorded error.
	calls := []struct {
		errMsg      string
		containsMsg string
	}{
		{
			errMsg:      "Stop must surface the recorded background error",
			containsMsg: "error must round-trip recordRunErr's value",
		},
		{
			errMsg:      "second Stop must still report the same error",
			containsMsg: "idempotent Stop must return the cached error",
		},
	}
	for _, call := range calls {
		stopErr := server.Stop(ctx)
		require.Error(t, stopErr, call.errMsg)
		assert.Contains(t, stopErr.Error(), "synthetic", call.containsMsg)
	}
}
// TestRecordRunErrPreservesFirstFailure checks that recordRunErr keeps the
// first error it is given when called twice, and that runErrCh is readable
// afterwards.
func TestRecordRunErrPreservesFirstFailure(t *testing.T) {
	server := &Server{
		Logger:   quietLifecycleLogger(),
		runErrCh: make(chan struct{}),
	}

	for _, msg := range []string{"first", "second"} {
		server.recordRunErr(errors.New(msg))
	}

	require.Error(t, server.runErr, "first failure must be retained")
	assert.Contains(t, server.runErr.Error(), "first", "second call must not overwrite the cached error")

	// Non-blocking probe: nothing is ever sent on this unbuffered channel,
	// so a successful receive means it has been closed.
	signalled := false
	select {
	case <-server.runErrCh:
		signalled = true
	default:
	}
	if !signalled {
		t.Fatal("recordRunErr must close runErrCh so waitAndStop unblocks")
	}
}
// TestStopSkipsShutdownWhenNeverStarted checks that Stop returns no error on
// an unstarted Server even when it is handed an already-cancelled context.
func TestStopSkipsShutdownWhenNeverStarted(t *testing.T) {
	server := New(Config{Logger: quietLifecycleLogger()})

	// Cancel up front so Stop receives a context that is already done.
	cancelledCtx, cancel := context.WithCancel(context.Background())
	cancel()

	assert.NoError(t, server.Stop(cancelledCtx), "Stop on an unstarted server should not block on the cancelled ctx")
}