relay: deploy templates expose UDP, mgmt threads transport hints

Two follow-ups to the WebTransport/ALPN-mux landing:

Deployment templates publish UDP alongside TCP for the relay so the
single ALPN-multiplexed socket can serve raw QUIC and WebTransport
clients on the same port as the existing WebSocket transport.

- docker-compose.yml.tmpl: adds the matching `/udp` mapping; the relay
  was already binding both stacks, the host port just wasn't published.
- docker-compose.yml.tmpl.traefik: WebTransport is the awkward case —
  Traefik can't proxy WT sessions, so the relay container now publishes
  UDP/443 directly and obtains its own Let's Encrypt cert (separate
  volume), while the TCP /relay route stays behind Traefik unchanged so
  WS-only clients keep working.

Management server learned to advertise per-relay transport hints:

- Config gains an optional `Endpoints []{URL, Transports}` block on the
  Relay section, mirrored to clients as RelayConfig.endpoints.
- `Addresses` is still emitted as RelayConfig.urls so older agents keep
  working unchanged.
- A single BuildRelayConfigProto helper is the only place that builds
  the proto, called from both toNetbirdConfig and the token push paths.

The GeoDNS case is operator-asserted, not probed: a single URL fans out
to several physical relays, and the Transports list must already be the
intersection of what every backend supports. Documented on the config
struct — if any backend behind a hostname can't speak h3, the operator
drops "wt" from that hostname's list and no client tries it there.
This commit is contained in:
Claude
2026-05-17 11:21:38 +00:00
parent 078c323ef3
commit e3c23c263b
7 changed files with 143 additions and 21 deletions

View File

@@ -47,6 +47,10 @@ VOLUME_PREFIX="netbird-"
MGMT_VOLUMESUFFIX="mgmt"
SIGNAL_VOLUMESUFFIX="signal"
LETSENCRYPT_VOLUMESUFFIX="letsencrypt"
# Dedicated Let's Encrypt store for the relay. Required only by the Traefik
# deployment, where the relay runs its own ACME client to terminate TLS on
# UDP/443 for WebTransport + raw QUIC (Traefik can't proxy WebTransport).
RELAY_LE_VOLUMESUFFIX="relay-letsencrypt"
NETBIRD_AUTH_DEVICE_AUTH_PROVIDER="none"
NETBIRD_AUTH_DEVICE_AUTH_AUDIENCE=${NETBIRD_AUTH_DEVICE_AUTH_AUDIENCE:-$NETBIRD_AUTH_AUDIENCE}
@@ -111,6 +115,7 @@ export VOLUME_PREFIX
export MGMT_VOLUMESUFFIX
export SIGNAL_VOLUMESUFFIX
export LETSENCRYPT_VOLUMESUFFIX
export RELAY_LE_VOLUMESUFFIX
export NETBIRD_DISABLE_ANONYMOUS_METRICS
export NETBIRD_MGMT_SINGLE_ACCOUNT_MODE_DOMAIN
export NETBIRD_MGMT_DNS_DOMAIN

View File

@@ -112,6 +112,7 @@ mkdir -p $artifacts_path
MGMT_VOLUMENAME="${VOLUME_PREFIX}${MGMT_VOLUMESUFFIX}"
SIGNAL_VOLUMENAME="${VOLUME_PREFIX}${SIGNAL_VOLUMESUFFIX}"
LETSENCRYPT_VOLUMENAME="${VOLUME_PREFIX}${LETSENCRYPT_VOLUMESUFFIX}"
RELAY_LE_VOLUMENAME="${VOLUME_PREFIX}${RELAY_LE_VOLUMESUFFIX}"
# if volume with wiretrustee- prefix already exists, use it, else create new with netbird-
OLD_PREFIX='wiretrustee-'
if docker volume ls | grep -q "${OLD_PREFIX}${MGMT_VOLUMESUFFIX}"; then
@@ -127,6 +128,7 @@ fi
export MGMT_VOLUMENAME
export SIGNAL_VOLUMENAME
export LETSENCRYPT_VOLUMENAME
export RELAY_LE_VOLUMENAME
#backwards compatibility after migrating to generic OIDC with Auth0
if [[ -z "${NETBIRD_AUTH_OIDC_CONFIGURATION_ENDPOINT}" ]]; then

View File

@@ -58,6 +58,17 @@ services:
]
# Relay
#
# The relay listens on the same address for three transports, multiplexed by
# ALPN on a single TLS endpoint:
# - TCP: WebSocket (rels:// path /relay) — universal, works through any HTTP proxy
# - UDP: raw QUIC (nb-quic ALPN) — used by native clients
# - UDP: HTTP/3 + WebTransport (h3 ALPN, path /relay) — used by browser/WASM clients
# Both TCP and UDP must be published on the same port. Operators who want to
# disable a transport for clients should NOT remove the port mapping — the
# listener still binds — instead drop the transport from each entry's
# `transports:` list in management.json so the management server stops
# advertising it.
relay:
<<: *default
image: netbirdio/relay:$NETBIRD_RELAY_TAG
@@ -69,6 +80,7 @@ services:
- NB_AUTH_SECRET=$NETBIRD_RELAY_AUTH_SECRET
ports:
- $NETBIRD_RELAY_PORT:$NETBIRD_RELAY_PORT
- $NETBIRD_RELAY_PORT:$NETBIRD_RELAY_PORT/udp
# Management
management:

View File

@@ -54,19 +54,48 @@ services:
- traefik.http.services.netbird-signal.loadbalancer.server.scheme=h2c
# Relay
#
# Traefik fronts the TCP/WebSocket side of the relay on port 443 via the HTTP
# router below — this gives us WS over TLS that traverses any HTTP proxy.
#
# WebTransport (h3) and raw QUIC require direct UDP termination on the relay
# itself: Traefik does not proxy WebTransport sessions, and tunnelling the
# h3 stream end-to-end through a reverse proxy defeats the point. The relay
# therefore publishes UDP/443 on the host directly and terminates TLS for
# both ALPNs ("nb-quic" and "h3") on a single socket via ALPN multiplexing.
#
# The relay obtains its own Let's Encrypt certificate (NB_LETSENCRYPT_*
# below) since Traefik's certificate store is not shared with the container.
# The cert is bound to NETBIRD_RELAY_DOMAIN — point this at the same FQDN
# clients use to dial the relay.
#
# If a deployment can't open UDP/443 to the host (firewall, k8s without
# hostPort, etc.), leave it unmapped: neither raw QUIC nor WebTransport is
# reachable in that case, so native and browser clients both fall back to
# WebSocket over TCP through Traefik. Drop "quic"/"wt" from each relay
# entry's `transports` list in the management config in that case so clients
# don't waste a handshake.
relay:
<<: *default
image: netbirdio/relay:$NETBIRD_RELAY_TAG
environment:
- NB_LOG_LEVEL=info
- NB_LISTEN_ADDRESS=:33080
- NB_LISTEN_ADDRESS=:443
- NB_EXPOSED_ADDRESS=$NETBIRD_RELAY_ENDPOINT
- NB_LETSENCRYPT_DOMAINS=$NETBIRD_RELAY_DOMAIN
- NB_LETSENCRYPT_EMAIL=$NETBIRD_LETSENCRYPT_EMAIL
- NB_LETSENCRYPT_DATA_DIR=/var/lib/netbird-relay
# todo: change to a secure secret
- NB_AUTH_SECRET=$NETBIRD_RELAY_AUTH_SECRET
volumes:
- $RELAY_LE_VOLUMENAME:/var/lib/netbird-relay
ports:
# Direct UDP exposure for QUIC + WebTransport (bypasses Traefik).
- 443:443/udp
labels:
# The TCP WS path stays behind Traefik so the existing /relay route keeps
# working for clients that can't open UDP/443.
- traefik.enable=true
- traefik.http.routers.netbird-relay.rule=Host(`$NETBIRD_DOMAIN`) && PathPrefix(`/relay`)
- traefik.http.services.netbird-relay.loadbalancer.server.port=33080
- traefik.http.services.netbird-relay.loadbalancer.server.port=443
# Management
management:
@@ -117,3 +146,4 @@ volumes:
$MGMT_VOLUMENAME:
$SIGNAL_VOLUMENAME:
$LETSENCRYPT_VOLUMENAME:
$RELAY_LE_VOLUMENAME:

View File

@@ -87,12 +87,33 @@ type TURNConfig struct {
}
// Relay is the relay section of the management-server configuration.
//
// Addresses is the legacy flat list and is forwarded to clients as
// RelayConfig.urls for back-compat with older agents.
//
// Endpoints, when populated, additionally announces the transports each
// relay URL supports. Under GeoDNS, where one URL resolves to several
// physical relays in different regions, Transports must be the
// intersection of the transports supported by every backend behind that
// hostname — clients pick a transport per URL and the management server
// does not probe individual backends. If a single backend in the pool
// does not support h3/WebTransport, drop "wt" from Transports so no
// client tries it against that hostname.
type Relay struct {
// Addresses is the flat list of relay URLs understood by every agent.
Addresses []string
// Endpoints optionally pairs relay URLs with operator-asserted transport
// hints; an empty list means no hints are advertised.
Endpoints []RelayEndpoint
// NOTE(review): presumably the lifetime of issued relay credentials —
// confirm against the token manager that consumes it.
CredentialsTTL util.Duration
// NOTE(review): presumably the shared secret used to sign relay tokens —
// confirm against the token manager that consumes it.
Secret string
}
// RelayEndpoint pairs a relay URL with the transports it advertises.
// Transports values: "ws", "quic", "wt". Empty Transports means "unknown,
// let the client try whatever it supports".
type RelayEndpoint struct {
// URL is the relay address clients dial. Under GeoDNS one URL may resolve
// to several physical relays.
URL string
// Transports lists the transports the operator asserts are available at
// URL: "ws" (WebSocket over TCP), "quic" (raw QUIC over UDP) and "wt"
// (HTTP/3 + WebTransport over UDP). The management server does not probe
// or verify this list.
Transports []string
}
// HttpServerConfig is a config of the HTTP Management service server
type HttpServerConfig struct {
LetsEncryptDomain string

View File

@@ -62,16 +62,10 @@ func toNetbirdConfig(config *nbconfig.Config, turnCredentials *Token, relayToken
}
}
var relayCfg *proto.RelayConfig
if config.Relay != nil && len(config.Relay.Addresses) > 0 {
relayCfg = &proto.RelayConfig{
Urls: config.Relay.Addresses,
}
if relayToken != nil {
relayCfg.TokenPayload = relayToken.Payload
relayCfg.TokenSignature = relayToken.Signature
}
relayCfg := BuildRelayConfigProto(config.Relay)
if relayCfg != nil && relayToken != nil {
relayCfg.TokenPayload = relayToken.Payload
relayCfg.TokenSignature = relayToken.Signature
}
var signalCfg *proto.HostConfig
@@ -92,6 +86,60 @@ func toNetbirdConfig(config *nbconfig.Config, turnCredentials *Token, relayToken
return nbConfig
}
// BuildRelayConfigProto translates the management-server relay config into
// the wire-level proto.RelayConfig.
//
// Both forms are emitted side by side: the legacy `urls` slice always carries
// every relay address (so old agents that don't understand `endpoints` still
// receive a working URL list), and `endpoints` adds per-relay transport hints
// when the operator has configured them. When endpoints are configured, their
// URLs come first in `urls`, followed by any Addresses not already listed as
// an endpoint.
//
// Under GeoDNS a single URL fans out to several physical relays. The hint
// declared here must already be the intersection of the transports supported
// by every backend; the management server takes the operator's word for it
// rather than probing each region. If even one backend behind a hostname
// can't speak h3/WebTransport, the operator must omit "wt" from that
// hostname's Transports.
//
// The result is nil when cfg is nil or when it yields no relay URL at all,
// including the case where every configured endpoint has an empty URL and no
// Addresses are set. Callers treat nil as "no relay configured".
//
// The returned slices are copies; mutating them does not affect cfg.
func BuildRelayConfigProto(cfg *nbconfig.Relay) *proto.RelayConfig {
	if cfg == nil {
		return nil
	}
	if len(cfg.Addresses) == 0 && len(cfg.Endpoints) == 0 {
		return nil
	}

	out := &proto.RelayConfig{}

	// Legacy-only configuration: forward the flat address list untouched.
	if len(cfg.Endpoints) == 0 {
		out.Urls = append([]string(nil), cfg.Addresses...)
		return out
	}

	seen := make(map[string]struct{}, len(cfg.Endpoints))
	out.Endpoints = make([]*proto.RelayEndpoint, 0, len(cfg.Endpoints))
	out.Urls = make([]string, 0, len(cfg.Endpoints)+len(cfg.Addresses))
	for _, ep := range cfg.Endpoints {
		// An endpoint without a URL cannot be dialed; skip it rather than
		// advertising an empty entry to clients.
		if ep.URL == "" {
			continue
		}
		out.Endpoints = append(out.Endpoints, &proto.RelayEndpoint{
			Url:        ep.URL,
			Transports: append([]string(nil), ep.Transports...),
		})
		out.Urls = append(out.Urls, ep.URL)
		seen[ep.URL] = struct{}{}
	}

	// Append any plain Addresses that weren't already covered by an
	// Endpoint, so legacy clients still see them via `urls`.
	for _, addr := range cfg.Addresses {
		if _, ok := seen[addr]; ok {
			continue
		}
		out.Urls = append(out.Urls, addr)
	}

	// Every endpoint was skipped and there were no Addresses: report
	// "no relay configured" instead of pushing a relay config without URLs.
	if len(out.Urls) == 0 {
		return nil
	}
	return out
}
func toPeerConfig(peer *nbpeer.Peer, network *types.Network, dnsName string, settings *types.Settings, httpConfig *nbconfig.HttpServerConfig, deviceFlowConfig *nbconfig.DeviceAuthorizationFlow, enableSSH bool) *proto.PeerConfig {
netmask, _ := network.Net.Mask.Size()
fqdn := peer.FQDN(dnsName)

View File

@@ -231,10 +231,11 @@ func (m *TimeBasedAuthSecretsManager) pushNewTURNAndRelayTokens(ctx context.Cont
if m.relayCfg != nil {
token, err := m.GenerateRelayToken()
if err == nil {
update.NetbirdConfig.Relay = &proto.RelayConfig{
Urls: m.relayCfg.Addresses,
TokenPayload: token.Payload,
TokenSignature: token.Signature,
relayProto := BuildRelayConfigProto(m.relayCfg)
if relayProto != nil {
relayProto.TokenPayload = token.Payload
relayProto.TokenSignature = token.Signature
update.NetbirdConfig.Relay = relayProto
}
}
}
@@ -255,13 +256,16 @@ func (m *TimeBasedAuthSecretsManager) pushNewRelayTokens(ctx context.Context, ac
return
}
relayProto := BuildRelayConfigProto(m.relayCfg)
if relayProto == nil {
// no addresses or endpoints configured; nothing to push
return
}
relayProto.TokenPayload = string(relayToken.Payload)
relayProto.TokenSignature = base64.StdEncoding.EncodeToString(relayToken.Signature)
update := &proto.SyncResponse{
NetbirdConfig: &proto.NetbirdConfig{
Relay: &proto.RelayConfig{
Urls: m.relayCfg.Addresses,
TokenPayload: string(relayToken.Payload),
TokenSignature: base64.StdEncoding.EncodeToString(relayToken.Signature),
},
Relay: relayProto,
// omit Turns to avoid updates there
},
}