diff --git a/infrastructure_files/base.setup.env b/infrastructure_files/base.setup.env index e59939191..822dbbb19 100644 --- a/infrastructure_files/base.setup.env +++ b/infrastructure_files/base.setup.env @@ -47,6 +47,10 @@ VOLUME_PREFIX="netbird-" MGMT_VOLUMESUFFIX="mgmt" SIGNAL_VOLUMESUFFIX="signal" LETSENCRYPT_VOLUMESUFFIX="letsencrypt" +# Dedicated Let's Encrypt store for the relay. Required only by the Traefik +# deployment, where the relay runs its own ACME client to terminate TLS on +# UDP/443 for WebTransport + raw QUIC (Traefik can't proxy WebTransport). +RELAY_LE_VOLUMESUFFIX="relay-letsencrypt" NETBIRD_AUTH_DEVICE_AUTH_PROVIDER="none" NETBIRD_AUTH_DEVICE_AUTH_AUDIENCE=${NETBIRD_AUTH_DEVICE_AUTH_AUDIENCE:-$NETBIRD_AUTH_AUDIENCE} @@ -111,6 +115,7 @@ export VOLUME_PREFIX export MGMT_VOLUMESUFFIX export SIGNAL_VOLUMESUFFIX export LETSENCRYPT_VOLUMESUFFIX +export RELAY_LE_VOLUMESUFFIX export NETBIRD_DISABLE_ANONYMOUS_METRICS export NETBIRD_MGMT_SINGLE_ACCOUNT_MODE_DOMAIN export NETBIRD_MGMT_DNS_DOMAIN diff --git a/infrastructure_files/configure.sh b/infrastructure_files/configure.sh index 92252d0b3..c3d35f084 100755 --- a/infrastructure_files/configure.sh +++ b/infrastructure_files/configure.sh @@ -112,6 +112,7 @@ mkdir -p $artifacts_path MGMT_VOLUMENAME="${VOLUME_PREFIX}${MGMT_VOLUMESUFFIX}" SIGNAL_VOLUMENAME="${VOLUME_PREFIX}${SIGNAL_VOLUMESUFFIX}" LETSENCRYPT_VOLUMENAME="${VOLUME_PREFIX}${LETSENCRYPT_VOLUMESUFFIX}" +RELAY_LE_VOLUMENAME="${VOLUME_PREFIX}${RELAY_LE_VOLUMESUFFIX}" # if volume with wiretrustee- prefix already exists, use it, else create new with netbird- OLD_PREFIX='wiretrustee-' if docker volume ls | grep -q "${OLD_PREFIX}${MGMT_VOLUMESUFFIX}"; then @@ -127,6 +128,7 @@ fi export MGMT_VOLUMENAME export SIGNAL_VOLUMENAME export LETSENCRYPT_VOLUMENAME +export RELAY_LE_VOLUMENAME #backwards compatibility after migrating to generic OIDC with Auth0 if [[ -z "${NETBIRD_AUTH_OIDC_CONFIGURATION_ENDPOINT}" ]]; then diff --git 
a/infrastructure_files/docker-compose.yml.tmpl b/infrastructure_files/docker-compose.yml.tmpl index 1c9c63f78..5fdb95035 100644 --- a/infrastructure_files/docker-compose.yml.tmpl +++ b/infrastructure_files/docker-compose.yml.tmpl @@ -58,6 +58,17 @@ services: ] # Relay + # + # The relay listens on the same address for three transports, multiplexed by + # ALPN on a single TLS endpoint: + # - TCP: WebSocket (rels:// path /relay) — universal, works through any HTTP proxy + # - UDP: raw QUIC (nb-quic ALPN) — used by native clients + # - UDP: HTTP/3 + WebTransport (h3 ALPN, path /relay) — used by browser/WASM clients + # Both TCP and UDP must be published on the same port. Operators who want to + # disable a transport for clients should NOT remove the port mapping — the + # listener still binds — instead drop the transport from each entry's + # `transports:` list in management.json so the management server stops + # advertising it. relay: <<: *default image: netbirdio/relay:$NETBIRD_RELAY_TAG @@ -69,6 +80,7 @@ services: - NB_AUTH_SECRET=$NETBIRD_RELAY_AUTH_SECRET ports: - $NETBIRD_RELAY_PORT:$NETBIRD_RELAY_PORT + - $NETBIRD_RELAY_PORT:$NETBIRD_RELAY_PORT/udp # Management management: diff --git a/infrastructure_files/docker-compose.yml.tmpl.traefik b/infrastructure_files/docker-compose.yml.tmpl.traefik index 0010974c5..bd73bf8b5 100644 --- a/infrastructure_files/docker-compose.yml.tmpl.traefik +++ b/infrastructure_files/docker-compose.yml.tmpl.traefik @@ -54,19 +54,48 @@ services: - traefik.http.services.netbird-signal.loadbalancer.server.scheme=h2c # Relay + # + # Traefik fronts the TCP/WebSocket side of the relay on port 443 via the HTTP + # router below — this gives us WS over TLS that traverses any HTTP proxy. + # + # WebTransport (h3) and raw QUIC require direct UDP termination on the relay + # itself: Traefik does not proxy WebTransport sessions, and tunnelling the + # h3 stream end-to-end through a reverse proxy defeats the point. 
The relay + # therefore publishes UDP/443 on the host directly and terminates TLS for + # both ALPNs ("nb-quic" and "h3") on a single socket via ALPN multiplexing. + # + # The relay obtains its own Let's Encrypt certificate (NB_LETSENCRYPT_* + # below) since Traefik's certificate store is not shared with the container. + # The cert is bound to NETBIRD_RELAY_DOMAIN — point this at the same FQDN + # clients use to dial the relay. + # + # If a deployment can't open UDP/443 to the host (firewall, k8s without + # hostPort, etc.), leave it unmapped: native clients (raw QUIC) and + # browser clients (WebTransport) both fall back to WS. Drop "quic"/"wt" from the + # management Relays config in that case so clients don't waste a handshake. relay: <<: *default image: netbirdio/relay:$NETBIRD_RELAY_TAG environment: - NB_LOG_LEVEL=info - - NB_LISTEN_ADDRESS=:33080 + - NB_LISTEN_ADDRESS=:443 - NB_EXPOSED_ADDRESS=$NETBIRD_RELAY_ENDPOINT + - NB_LETSENCRYPT_DOMAINS=$NETBIRD_RELAY_DOMAIN + - NB_LETSENCRYPT_EMAIL=$NETBIRD_LETSENCRYPT_EMAIL + - NB_LETSENCRYPT_DATA_DIR=/var/lib/netbird-relay # todo: change to a secure secret - NB_AUTH_SECRET=$NETBIRD_RELAY_AUTH_SECRET + volumes: + - $RELAY_LE_VOLUMENAME:/var/lib/netbird-relay + ports: + # Direct UDP exposure for QUIC + WebTransport (bypasses Traefik). + - 443:443/udp labels: + # The TCP WS path stays behind Traefik so the existing /relay route keeps + # working for clients that can't open UDP/443. 
- traefik.enable=true - traefik.http.routers.netbird-relay.rule=Host(`$NETBIRD_DOMAIN`) && PathPrefix(`/relay`) - - traefik.http.services.netbird-relay.loadbalancer.server.port=33080 + - traefik.http.services.netbird-relay.loadbalancer.server.port=443 # Management management: @@ -117,3 +146,4 @@ volumes: $MGMT_VOLUMENAME: $SIGNAL_VOLUMENAME: $LETSENCRYPT_VOLUMENAME: + $RELAY_LE_VOLUMENAME: diff --git a/management/internals/server/config/config.go b/management/internals/server/config/config.go index fb9c842b7..f29d7f682 100644 --- a/management/internals/server/config/config.go +++ b/management/internals/server/config/config.go @@ -87,12 +87,33 @@ type TURNConfig struct { } // Relay configuration type +// +// Addresses is the legacy flat list and is forwarded to clients as +// RelayConfig.urls for back-compat with older agents. +// +// Endpoints, when populated, additionally announces the transports each +// relay URL supports. Under GeoDNS, where one URL resolves to several +// physical relays in different regions, Transports must be the +// intersection of the transports supported by every backend behind that +// hostname — clients pick a transport per URL and the management server +// does not probe individual backends. If a single backend in the pool +// does not support h3/WebTransport, drop "wt" from Transports so no +// client tries it against that hostname. type Relay struct { Addresses []string + Endpoints []RelayEndpoint CredentialsTTL util.Duration Secret string } +// RelayEndpoint pairs a relay URL with the transports it advertises. +// Transports values: "ws", "quic", "wt". Empty Transports means "unknown, +// let the client try whatever it supports". 
+type RelayEndpoint struct { + URL string + Transports []string +} + // HttpServerConfig is a config of the HTTP Management service server type HttpServerConfig struct { LetsEncryptDomain string diff --git a/management/internals/shared/grpc/conversion.go b/management/internals/shared/grpc/conversion.go index 12402b420..d38061e1f 100644 --- a/management/internals/shared/grpc/conversion.go +++ b/management/internals/shared/grpc/conversion.go @@ -62,16 +62,10 @@ func toNetbirdConfig(config *nbconfig.Config, turnCredentials *Token, relayToken } } - var relayCfg *proto.RelayConfig - if config.Relay != nil && len(config.Relay.Addresses) > 0 { - relayCfg = &proto.RelayConfig{ - Urls: config.Relay.Addresses, - } - - if relayToken != nil { - relayCfg.TokenPayload = relayToken.Payload - relayCfg.TokenSignature = relayToken.Signature - } + relayCfg := BuildRelayConfigProto(config.Relay) + if relayCfg != nil && relayToken != nil { + relayCfg.TokenPayload = relayToken.Payload + relayCfg.TokenSignature = relayToken.Signature } var signalCfg *proto.HostConfig @@ -92,6 +86,60 @@ func toNetbirdConfig(config *nbconfig.Config, turnCredentials *Token, relayToken return nbConfig } +// BuildRelayConfigProto translates the management-server relay config into +// the wire-level proto.RelayConfig. +// +// Both forms are emitted side by side: the legacy `urls` slice always carries +// every relay address (so old agents that don't understand `endpoints` still +// receive a working URL list), and `endpoints` adds per-relay transport hints +// when the operator has configured them. +// +// Under GeoDNS a single URL fans out to several physical relays. The hint +// declared here must already be the intersection of the transports supported +// by every backend; the management server takes the operator's word for it +// rather than probing each region. If even one backend behind a hostname +// can't speak h3/WebTransport, the operator must omit "wt" from that +// hostname's Transports. 
+func BuildRelayConfigProto(cfg *nbconfig.Relay) *proto.RelayConfig { + if cfg == nil { + return nil + } + if len(cfg.Addresses) == 0 && len(cfg.Endpoints) == 0 { + return nil + } + + out := &proto.RelayConfig{} + + if len(cfg.Endpoints) > 0 { + seen := make(map[string]struct{}, len(cfg.Endpoints)) + out.Endpoints = make([]*proto.RelayEndpoint, 0, len(cfg.Endpoints)) + out.Urls = make([]string, 0, len(cfg.Endpoints)+len(cfg.Addresses)) + for _, ep := range cfg.Endpoints { + if ep.URL == "" { + continue + } + out.Endpoints = append(out.Endpoints, &proto.RelayEndpoint{ + Url: ep.URL, + Transports: append([]string(nil), ep.Transports...), + }) + out.Urls = append(out.Urls, ep.URL) + seen[ep.URL] = struct{}{} + } + // Append any plain Addresses that weren't already covered by an + // Endpoint, so legacy clients still see them via `urls`. + for _, addr := range cfg.Addresses { + if _, ok := seen[addr]; ok { + continue + } + out.Urls = append(out.Urls, addr) + } + } else { + out.Urls = append([]string(nil), cfg.Addresses...) 
+ } + + return out +} + func toPeerConfig(peer *nbpeer.Peer, network *types.Network, dnsName string, settings *types.Settings, httpConfig *nbconfig.HttpServerConfig, deviceFlowConfig *nbconfig.DeviceAuthorizationFlow, enableSSH bool) *proto.PeerConfig { netmask, _ := network.Net.Mask.Size() fqdn := peer.FQDN(dnsName) diff --git a/management/internals/shared/grpc/token_mgr.go b/management/internals/shared/grpc/token_mgr.go index 65e58ad41..0439e16c3 100644 --- a/management/internals/shared/grpc/token_mgr.go +++ b/management/internals/shared/grpc/token_mgr.go @@ -231,10 +231,11 @@ func (m *TimeBasedAuthSecretsManager) pushNewTURNAndRelayTokens(ctx context.Cont if m.relayCfg != nil { token, err := m.GenerateRelayToken() if err == nil { - update.NetbirdConfig.Relay = &proto.RelayConfig{ - Urls: m.relayCfg.Addresses, - TokenPayload: token.Payload, - TokenSignature: token.Signature, + relayProto := BuildRelayConfigProto(m.relayCfg) + if relayProto != nil { + relayProto.TokenPayload = token.Payload + relayProto.TokenSignature = token.Signature + update.NetbirdConfig.Relay = relayProto } } } @@ -255,13 +256,16 @@ func (m *TimeBasedAuthSecretsManager) pushNewRelayTokens(ctx context.Context, ac return } + relayProto := BuildRelayConfigProto(m.relayCfg) + if relayProto == nil { + // no addresses or endpoints configured; nothing to push + return + } + relayProto.TokenPayload = string(relayToken.Payload) + relayProto.TokenSignature = base64.StdEncoding.EncodeToString(relayToken.Signature) update := &proto.SyncResponse{ NetbirdConfig: &proto.NetbirdConfig{ - Relay: &proto.RelayConfig{ - Urls: m.relayCfg.Addresses, - TokenPayload: string(relayToken.Payload), - TokenSignature: base64.StdEncoding.EncodeToString(relayToken.Signature), - }, + Relay: relayProto, // omit Turns to avoid updates there }, }