mirror of
https://github.com/netbirdio/netbird.git
synced 2026-04-17 07:46:38 +00:00
Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4eed459f27 | ||
|
|
13539543af | ||
|
|
7483fec048 | ||
|
|
5259e5df51 | ||
|
|
ebd78e0122 | ||
|
|
cf86b9a528 | ||
|
|
ee588e1536 | ||
|
|
2a8aacc5c9 | ||
|
|
15709bc666 | ||
|
|
789b4113fe | ||
|
|
d2cdc0efec | ||
|
|
ee343d5d77 | ||
|
|
099c493b18 | ||
|
|
c1d1229ae0 | ||
|
|
94a36cb53e | ||
|
|
c7ba931466 | ||
|
|
413d95b740 |
@@ -6,6 +6,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/coreos/go-iptables/iptables"
|
||||
"github.com/google/nftables"
|
||||
@@ -35,20 +36,34 @@ const SKIP_NFTABLES_ENV = "NB_SKIP_NFTABLES_CHECK"
|
||||
type FWType int
|
||||
|
||||
func NewFirewall(iface IFaceMapper, stateManager *statemanager.Manager, flowLogger nftypes.FlowLogger, disableServerRoutes bool, mtu uint16) (firewall.Manager, error) {
|
||||
// on the linux system we try to user nftables or iptables
|
||||
// in any case, because we need to allow netbird interface traffic
|
||||
// so we use AllowNetbird traffic from these firewall managers
|
||||
// for the userspace packet filtering firewall
|
||||
// We run in userspace mode and force userspace firewall was requested. We don't attempt native firewall.
|
||||
if iface.IsUserspaceBind() && forceUserspaceFirewall() {
|
||||
log.Info("forcing userspace firewall")
|
||||
return createUserspaceFirewall(iface, nil, disableServerRoutes, flowLogger, mtu)
|
||||
}
|
||||
|
||||
// Use native firewall for either kernel or userspace, the interface appears identical to netfilter
|
||||
fm, err := createNativeFirewall(iface, stateManager, disableServerRoutes, mtu)
|
||||
|
||||
// Kernel cannot fall back to anything else, need to return error
|
||||
if !iface.IsUserspaceBind() {
|
||||
return fm, err
|
||||
}
|
||||
|
||||
// Fall back to the userspace packet filter if native is unavailable
|
||||
if err != nil {
|
||||
log.Warnf("failed to create native firewall: %v. Proceeding with userspace", err)
|
||||
return createUserspaceFirewall(iface, nil, disableServerRoutes, flowLogger, mtu)
|
||||
}
|
||||
return createUserspaceFirewall(iface, fm, disableServerRoutes, flowLogger, mtu)
|
||||
|
||||
// Native firewall handles packet filtering, but the userspace WireGuard bind
|
||||
// needs a device filter for DNS interception hooks. Install a minimal
|
||||
// hooks-only filter that passes all traffic through to the kernel firewall.
|
||||
if err := iface.SetFilter(&uspfilter.HooksFilter{}); err != nil {
|
||||
log.Warnf("failed to set hooks filter, DNS via memory hooks will not work: %v", err)
|
||||
}
|
||||
|
||||
return fm, nil
|
||||
}
|
||||
|
||||
func createNativeFirewall(iface IFaceMapper, stateManager *statemanager.Manager, routes bool, mtu uint16) (firewall.Manager, error) {
|
||||
@@ -160,3 +175,17 @@ func isIptablesClientAvailable(client *iptables.IPTables) bool {
|
||||
_, err := client.ListChains("filter")
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func forceUserspaceFirewall() bool {
|
||||
val := os.Getenv(EnvForceUserspaceFirewall)
|
||||
if val == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
force, err := strconv.ParseBool(val)
|
||||
if err != nil {
|
||||
log.Warnf("failed to parse %s: %v", EnvForceUserspaceFirewall, err)
|
||||
return false
|
||||
}
|
||||
return force
|
||||
}
|
||||
|
||||
@@ -7,6 +7,12 @@ import (
|
||||
"github.com/netbirdio/netbird/client/iface/wgaddr"
|
||||
)
|
||||
|
||||
// EnvForceUserspaceFirewall forces the use of the userspace packet filter even when
|
||||
// native iptables/nftables is available. This only applies when the WireGuard interface
|
||||
// runs in userspace mode. When set, peer ACLs are handled by USPFilter instead of
|
||||
// kernel netfilter rules.
|
||||
const EnvForceUserspaceFirewall = "NB_FORCE_USERSPACE_FIREWALL"
|
||||
|
||||
// IFaceMapper defines subset methods of interface required for manager
|
||||
type IFaceMapper interface {
|
||||
Name() string
|
||||
|
||||
@@ -33,7 +33,6 @@ type Manager struct {
|
||||
type iFaceMapper interface {
|
||||
Name() string
|
||||
Address() wgaddr.Address
|
||||
IsUserspaceBind() bool
|
||||
}
|
||||
|
||||
// Create iptables firewall manager
|
||||
@@ -64,10 +63,9 @@ func Create(wgIface iFaceMapper, mtu uint16) (*Manager, error) {
|
||||
func (m *Manager) Init(stateManager *statemanager.Manager) error {
|
||||
state := &ShutdownState{
|
||||
InterfaceState: &InterfaceState{
|
||||
NameStr: m.wgIface.Name(),
|
||||
WGAddress: m.wgIface.Address(),
|
||||
UserspaceBind: m.wgIface.IsUserspaceBind(),
|
||||
MTU: m.router.mtu,
|
||||
NameStr: m.wgIface.Name(),
|
||||
WGAddress: m.wgIface.Address(),
|
||||
MTU: m.router.mtu,
|
||||
},
|
||||
}
|
||||
stateManager.RegisterState(state)
|
||||
@@ -203,12 +201,10 @@ func (m *Manager) Close(stateManager *statemanager.Manager) error {
|
||||
return nberrors.FormatErrorOrNil(merr)
|
||||
}
|
||||
|
||||
// AllowNetbird allows netbird interface traffic
|
||||
// AllowNetbird allows netbird interface traffic.
|
||||
// This is called when USPFilter wraps the native firewall, adding blanket accept
|
||||
// rules so that packet filtering is handled in userspace instead of by netfilter.
|
||||
func (m *Manager) AllowNetbird() error {
|
||||
if !m.wgIface.IsUserspaceBind() {
|
||||
return nil
|
||||
}
|
||||
|
||||
_, err := m.AddPeerFiltering(
|
||||
nil,
|
||||
net.IP{0, 0, 0, 0},
|
||||
|
||||
@@ -47,8 +47,6 @@ func (i *iFaceMock) Address() wgaddr.Address {
|
||||
panic("AddressFunc is not set")
|
||||
}
|
||||
|
||||
func (i *iFaceMock) IsUserspaceBind() bool { return false }
|
||||
|
||||
func TestIptablesManager(t *testing.T) {
|
||||
ipv4Client, err := iptables.NewWithProtocol(iptables.ProtocolIPv4)
|
||||
require.NoError(t, err)
|
||||
|
||||
@@ -9,10 +9,9 @@ import (
|
||||
)
|
||||
|
||||
type InterfaceState struct {
|
||||
NameStr string `json:"name"`
|
||||
WGAddress wgaddr.Address `json:"wg_address"`
|
||||
UserspaceBind bool `json:"userspace_bind"`
|
||||
MTU uint16 `json:"mtu"`
|
||||
NameStr string `json:"name"`
|
||||
WGAddress wgaddr.Address `json:"wg_address"`
|
||||
MTU uint16 `json:"mtu"`
|
||||
}
|
||||
|
||||
func (i *InterfaceState) Name() string {
|
||||
@@ -23,10 +22,6 @@ func (i *InterfaceState) Address() wgaddr.Address {
|
||||
return i.WGAddress
|
||||
}
|
||||
|
||||
func (i *InterfaceState) IsUserspaceBind() bool {
|
||||
return i.UserspaceBind
|
||||
}
|
||||
|
||||
type ShutdownState struct {
|
||||
sync.Mutex
|
||||
|
||||
|
||||
@@ -40,7 +40,6 @@ func getTableName() string {
|
||||
type iFaceMapper interface {
|
||||
Name() string
|
||||
Address() wgaddr.Address
|
||||
IsUserspaceBind() bool
|
||||
}
|
||||
|
||||
// Manager of iptables firewall
|
||||
@@ -106,10 +105,9 @@ func (m *Manager) Init(stateManager *statemanager.Manager) error {
|
||||
// cleanup using Close() without needing to store specific rules.
|
||||
if err := stateManager.UpdateState(&ShutdownState{
|
||||
InterfaceState: &InterfaceState{
|
||||
NameStr: m.wgIface.Name(),
|
||||
WGAddress: m.wgIface.Address(),
|
||||
UserspaceBind: m.wgIface.IsUserspaceBind(),
|
||||
MTU: m.router.mtu,
|
||||
NameStr: m.wgIface.Name(),
|
||||
WGAddress: m.wgIface.Address(),
|
||||
MTU: m.router.mtu,
|
||||
},
|
||||
}); err != nil {
|
||||
log.Errorf("failed to update state: %v", err)
|
||||
@@ -205,12 +203,10 @@ func (m *Manager) RemoveNatRule(pair firewall.RouterPair) error {
|
||||
return m.router.RemoveNatRule(pair)
|
||||
}
|
||||
|
||||
// AllowNetbird allows netbird interface traffic
|
||||
// AllowNetbird allows netbird interface traffic.
|
||||
// This is called when USPFilter wraps the native firewall, adding blanket accept
|
||||
// rules so that packet filtering is handled in userspace instead of by netfilter.
|
||||
func (m *Manager) AllowNetbird() error {
|
||||
if !m.wgIface.IsUserspaceBind() {
|
||||
return nil
|
||||
}
|
||||
|
||||
m.mutex.Lock()
|
||||
defer m.mutex.Unlock()
|
||||
|
||||
|
||||
@@ -52,8 +52,6 @@ func (i *iFaceMock) Address() wgaddr.Address {
|
||||
panic("AddressFunc is not set")
|
||||
}
|
||||
|
||||
func (i *iFaceMock) IsUserspaceBind() bool { return false }
|
||||
|
||||
func TestNftablesManager(t *testing.T) {
|
||||
|
||||
// just check on the local interface
|
||||
|
||||
@@ -8,10 +8,9 @@ import (
|
||||
)
|
||||
|
||||
type InterfaceState struct {
|
||||
NameStr string `json:"name"`
|
||||
WGAddress wgaddr.Address `json:"wg_address"`
|
||||
UserspaceBind bool `json:"userspace_bind"`
|
||||
MTU uint16 `json:"mtu"`
|
||||
NameStr string `json:"name"`
|
||||
WGAddress wgaddr.Address `json:"wg_address"`
|
||||
MTU uint16 `json:"mtu"`
|
||||
}
|
||||
|
||||
func (i *InterfaceState) Name() string {
|
||||
@@ -22,10 +21,6 @@ func (i *InterfaceState) Address() wgaddr.Address {
|
||||
return i.WGAddress
|
||||
}
|
||||
|
||||
func (i *InterfaceState) IsUserspaceBind() bool {
|
||||
return i.UserspaceBind
|
||||
}
|
||||
|
||||
type ShutdownState struct {
|
||||
InterfaceState *InterfaceState `json:"interface_state,omitempty"`
|
||||
}
|
||||
|
||||
37
client/firewall/uspfilter/common/hooks.go
Normal file
37
client/firewall/uspfilter/common/hooks.go
Normal file
@@ -0,0 +1,37 @@
|
||||
package common
|
||||
|
||||
import (
|
||||
"net/netip"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// PacketHook stores a registered hook for a specific IP:port.
|
||||
type PacketHook struct {
|
||||
IP netip.Addr
|
||||
Port uint16
|
||||
Fn func([]byte) bool
|
||||
}
|
||||
|
||||
// HookMatches checks if a packet's destination matches the hook and invokes it.
|
||||
func HookMatches(h *PacketHook, dstIP netip.Addr, dport uint16, packetData []byte) bool {
|
||||
if h == nil {
|
||||
return false
|
||||
}
|
||||
if h.IP == dstIP && h.Port == dport {
|
||||
return h.Fn(packetData)
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// SetHook atomically stores a hook, handling nil removal.
|
||||
func SetHook(ptr *atomic.Pointer[PacketHook], ip netip.Addr, dPort uint16, hook func([]byte) bool) {
|
||||
if hook == nil {
|
||||
ptr.Store(nil)
|
||||
return
|
||||
}
|
||||
ptr.Store(&PacketHook{
|
||||
IP: ip,
|
||||
Port: dPort,
|
||||
Fn: hook,
|
||||
})
|
||||
}
|
||||
@@ -142,15 +142,8 @@ type Manager struct {
|
||||
mssClampEnabled bool
|
||||
|
||||
// Only one hook per protocol is supported. Outbound direction only.
|
||||
udpHookOut atomic.Pointer[packetHook]
|
||||
tcpHookOut atomic.Pointer[packetHook]
|
||||
}
|
||||
|
||||
// packetHook stores a registered hook for a specific IP:port.
|
||||
type packetHook struct {
|
||||
ip netip.Addr
|
||||
port uint16
|
||||
fn func([]byte) bool
|
||||
udpHookOut atomic.Pointer[common.PacketHook]
|
||||
tcpHookOut atomic.Pointer[common.PacketHook]
|
||||
}
|
||||
|
||||
// decoder for packages
|
||||
@@ -912,21 +905,11 @@ func (m *Manager) trackInbound(d *decoder, srcIP, dstIP netip.Addr, ruleID []byt
|
||||
}
|
||||
|
||||
func (m *Manager) udpHooksDrop(dport uint16, dstIP netip.Addr, packetData []byte) bool {
|
||||
return hookMatches(m.udpHookOut.Load(), dstIP, dport, packetData)
|
||||
return common.HookMatches(m.udpHookOut.Load(), dstIP, dport, packetData)
|
||||
}
|
||||
|
||||
func (m *Manager) tcpHooksDrop(dport uint16, dstIP netip.Addr, packetData []byte) bool {
|
||||
return hookMatches(m.tcpHookOut.Load(), dstIP, dport, packetData)
|
||||
}
|
||||
|
||||
func hookMatches(h *packetHook, dstIP netip.Addr, dport uint16, packetData []byte) bool {
|
||||
if h == nil {
|
||||
return false
|
||||
}
|
||||
if h.ip == dstIP && h.port == dport {
|
||||
return h.fn(packetData)
|
||||
}
|
||||
return false
|
||||
return common.HookMatches(m.tcpHookOut.Load(), dstIP, dport, packetData)
|
||||
}
|
||||
|
||||
// filterInbound implements filtering logic for incoming packets.
|
||||
@@ -1337,28 +1320,12 @@ func (m *Manager) ruleMatches(rule *RouteRule, srcAddr, dstAddr netip.Addr, prot
|
||||
|
||||
// SetUDPPacketHook sets the outbound UDP packet hook. Pass nil hook to remove.
|
||||
func (m *Manager) SetUDPPacketHook(ip netip.Addr, dPort uint16, hook func(packet []byte) bool) {
|
||||
if hook == nil {
|
||||
m.udpHookOut.Store(nil)
|
||||
return
|
||||
}
|
||||
m.udpHookOut.Store(&packetHook{
|
||||
ip: ip,
|
||||
port: dPort,
|
||||
fn: hook,
|
||||
})
|
||||
common.SetHook(&m.udpHookOut, ip, dPort, hook)
|
||||
}
|
||||
|
||||
// SetTCPPacketHook sets the outbound TCP packet hook. Pass nil hook to remove.
|
||||
func (m *Manager) SetTCPPacketHook(ip netip.Addr, dPort uint16, hook func(packet []byte) bool) {
|
||||
if hook == nil {
|
||||
m.tcpHookOut.Store(nil)
|
||||
return
|
||||
}
|
||||
m.tcpHookOut.Store(&packetHook{
|
||||
ip: ip,
|
||||
port: dPort,
|
||||
fn: hook,
|
||||
})
|
||||
common.SetHook(&m.tcpHookOut, ip, dPort, hook)
|
||||
}
|
||||
|
||||
// SetLogLevel sets the log level for the firewall manager
|
||||
|
||||
@@ -202,9 +202,9 @@ func TestSetUDPPacketHook(t *testing.T) {
|
||||
|
||||
h := manager.udpHookOut.Load()
|
||||
require.NotNil(t, h)
|
||||
assert.Equal(t, netip.MustParseAddr("10.168.0.1"), h.ip)
|
||||
assert.Equal(t, uint16(8000), h.port)
|
||||
assert.True(t, h.fn(nil))
|
||||
assert.Equal(t, netip.MustParseAddr("10.168.0.1"), h.IP)
|
||||
assert.Equal(t, uint16(8000), h.Port)
|
||||
assert.True(t, h.Fn(nil))
|
||||
assert.True(t, called)
|
||||
|
||||
manager.SetUDPPacketHook(netip.MustParseAddr("10.168.0.1"), 8000, nil)
|
||||
@@ -226,9 +226,9 @@ func TestSetTCPPacketHook(t *testing.T) {
|
||||
|
||||
h := manager.tcpHookOut.Load()
|
||||
require.NotNil(t, h)
|
||||
assert.Equal(t, netip.MustParseAddr("10.168.0.1"), h.ip)
|
||||
assert.Equal(t, uint16(53), h.port)
|
||||
assert.True(t, h.fn(nil))
|
||||
assert.Equal(t, netip.MustParseAddr("10.168.0.1"), h.IP)
|
||||
assert.Equal(t, uint16(53), h.Port)
|
||||
assert.True(t, h.Fn(nil))
|
||||
assert.True(t, called)
|
||||
|
||||
manager.SetTCPPacketHook(netip.MustParseAddr("10.168.0.1"), 53, nil)
|
||||
|
||||
90
client/firewall/uspfilter/hooks_filter.go
Normal file
90
client/firewall/uspfilter/hooks_filter.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package uspfilter
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"net/netip"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/netbirdio/netbird/client/firewall/uspfilter/common"
|
||||
"github.com/netbirdio/netbird/client/iface/device"
|
||||
)
|
||||
|
||||
const (
|
||||
ipv4HeaderMinLen = 20
|
||||
ipv4ProtoOffset = 9
|
||||
ipv4FlagsOffset = 6
|
||||
ipv4DstOffset = 16
|
||||
ipProtoUDP = 17
|
||||
ipProtoTCP = 6
|
||||
ipv4FragOffMask = 0x1fff
|
||||
// dstPortOffset is the offset of the destination port within a UDP or TCP header.
|
||||
dstPortOffset = 2
|
||||
)
|
||||
|
||||
// HooksFilter is a minimal packet filter that only handles outbound DNS hooks.
|
||||
// It is installed on the WireGuard interface when the userspace bind is active
|
||||
// but a full firewall filter (Manager) is not needed because a native kernel
|
||||
// firewall (nftables/iptables) handles packet filtering.
|
||||
type HooksFilter struct {
|
||||
udpHook atomic.Pointer[common.PacketHook]
|
||||
tcpHook atomic.Pointer[common.PacketHook]
|
||||
}
|
||||
|
||||
var _ device.PacketFilter = (*HooksFilter)(nil)
|
||||
|
||||
// FilterOutbound checks outbound packets for DNS hook matches.
|
||||
// Only IPv4 packets matching the registered hook IP:port are intercepted.
|
||||
// IPv6 and non-IP packets pass through unconditionally.
|
||||
func (f *HooksFilter) FilterOutbound(packetData []byte, _ int) bool {
|
||||
if len(packetData) < ipv4HeaderMinLen {
|
||||
return false
|
||||
}
|
||||
|
||||
// Only process IPv4 packets, let everything else pass through.
|
||||
if packetData[0]>>4 != 4 {
|
||||
return false
|
||||
}
|
||||
|
||||
ihl := int(packetData[0]&0x0f) * 4
|
||||
if ihl < ipv4HeaderMinLen || len(packetData) < ihl+4 {
|
||||
return false
|
||||
}
|
||||
|
||||
// Skip non-first fragments: they don't carry L4 headers.
|
||||
flagsAndOffset := binary.BigEndian.Uint16(packetData[ipv4FlagsOffset : ipv4FlagsOffset+2])
|
||||
if flagsAndOffset&ipv4FragOffMask != 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
dstIP, ok := netip.AddrFromSlice(packetData[ipv4DstOffset : ipv4DstOffset+4])
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
proto := packetData[ipv4ProtoOffset]
|
||||
dstPort := binary.BigEndian.Uint16(packetData[ihl+dstPortOffset : ihl+dstPortOffset+2])
|
||||
|
||||
switch proto {
|
||||
case ipProtoUDP:
|
||||
return common.HookMatches(f.udpHook.Load(), dstIP, dstPort, packetData)
|
||||
case ipProtoTCP:
|
||||
return common.HookMatches(f.tcpHook.Load(), dstIP, dstPort, packetData)
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// FilterInbound allows all inbound packets (native firewall handles filtering).
|
||||
func (f *HooksFilter) FilterInbound([]byte, int) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// SetUDPPacketHook registers the UDP packet hook.
|
||||
func (f *HooksFilter) SetUDPPacketHook(ip netip.Addr, dPort uint16, hook func([]byte) bool) {
|
||||
common.SetHook(&f.udpHook, ip, dPort, hook)
|
||||
}
|
||||
|
||||
// SetTCPPacketHook registers the TCP packet hook.
|
||||
func (f *HooksFilter) SetTCPPacketHook(ip netip.Addr, dPort uint16, hook func([]byte) bool) {
|
||||
common.SetHook(&f.tcpHook, ip, dPort, hook)
|
||||
}
|
||||
@@ -19,6 +19,9 @@ import (
|
||||
var flowLogger = netflow.NewManager(nil, []byte{}, nil).GetLogger()
|
||||
|
||||
func TestDefaultManager(t *testing.T) {
|
||||
t.Setenv("NB_WG_KERNEL_DISABLED", "true")
|
||||
t.Setenv(firewall.EnvForceUserspaceFirewall, "true")
|
||||
|
||||
networkMap := &mgmProto.NetworkMap{
|
||||
FirewallRules: []*mgmProto.FirewallRule{
|
||||
{
|
||||
@@ -135,6 +138,7 @@ func TestDefaultManager(t *testing.T) {
|
||||
func TestDefaultManagerStateless(t *testing.T) {
|
||||
// stateless currently only in userspace, so we have to disable kernel
|
||||
t.Setenv("NB_WG_KERNEL_DISABLED", "true")
|
||||
t.Setenv(firewall.EnvForceUserspaceFirewall, "true")
|
||||
t.Setenv("NB_DISABLE_CONNTRACK", "true")
|
||||
|
||||
networkMap := &mgmProto.NetworkMap{
|
||||
@@ -194,6 +198,7 @@ func TestDefaultManagerStateless(t *testing.T) {
|
||||
// This tests the full ACL manager -> uspfilter integration.
|
||||
func TestDenyRulesNotAccumulatedOnRepeatedApply(t *testing.T) {
|
||||
t.Setenv("NB_WG_KERNEL_DISABLED", "true")
|
||||
t.Setenv(firewall.EnvForceUserspaceFirewall, "true")
|
||||
|
||||
networkMap := &mgmProto.NetworkMap{
|
||||
FirewallRules: []*mgmProto.FirewallRule{
|
||||
@@ -258,6 +263,7 @@ func TestDenyRulesNotAccumulatedOnRepeatedApply(t *testing.T) {
|
||||
// up when they're removed from the network map in a subsequent update.
|
||||
func TestDenyRulesCleanedUpOnRemoval(t *testing.T) {
|
||||
t.Setenv("NB_WG_KERNEL_DISABLED", "true")
|
||||
t.Setenv(firewall.EnvForceUserspaceFirewall, "true")
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
defer ctrl.Finish()
|
||||
@@ -339,6 +345,7 @@ func TestDenyRulesCleanedUpOnRemoval(t *testing.T) {
|
||||
// one added without leaking.
|
||||
func TestRuleUpdateChangingAction(t *testing.T) {
|
||||
t.Setenv("NB_WG_KERNEL_DISABLED", "true")
|
||||
t.Setenv(firewall.EnvForceUserspaceFirewall, "true")
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
defer ctrl.Finish()
|
||||
|
||||
@@ -25,6 +25,7 @@ import (
|
||||
"google.golang.org/protobuf/encoding/protojson"
|
||||
|
||||
"github.com/netbirdio/netbird/client/anonymize"
|
||||
"github.com/netbirdio/netbird/client/configs"
|
||||
"github.com/netbirdio/netbird/client/internal/peer"
|
||||
"github.com/netbirdio/netbird/client/internal/profilemanager"
|
||||
"github.com/netbirdio/netbird/client/internal/updater/installer"
|
||||
@@ -52,6 +53,7 @@ resolved_domains.txt: Anonymized resolved domain IP addresses from the status re
|
||||
config.txt: Anonymized configuration information of the NetBird client.
|
||||
network_map.json: Anonymized sync response containing peer configurations, routes, DNS settings, and firewall rules.
|
||||
state.json: Anonymized client state dump containing netbird states for the active profile.
|
||||
service_params.json: Sanitized service install parameters (service.json). Sensitive environment variable values are masked. Only present when service.json exists.
|
||||
metrics.txt: Buffered client metrics in InfluxDB line protocol format. Only present when metrics collection is enabled. Peer identifiers are anonymized.
|
||||
mutex.prof: Mutex profiling information.
|
||||
goroutine.prof: Goroutine profiling information.
|
||||
@@ -359,6 +361,10 @@ func (g *BundleGenerator) createArchive() error {
|
||||
log.Errorf("failed to add corrupted state files to debug bundle: %v", err)
|
||||
}
|
||||
|
||||
if err := g.addServiceParams(); err != nil {
|
||||
log.Errorf("failed to add service params to debug bundle: %v", err)
|
||||
}
|
||||
|
||||
if err := g.addMetrics(); err != nil {
|
||||
log.Errorf("failed to add metrics to debug bundle: %v", err)
|
||||
}
|
||||
@@ -488,6 +494,90 @@ func (g *BundleGenerator) addConfig() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
const (
|
||||
serviceParamsFile = "service.json"
|
||||
serviceParamsBundle = "service_params.json"
|
||||
maskedValue = "***"
|
||||
envVarPrefix = "NB_"
|
||||
jsonKeyManagementURL = "management_url"
|
||||
jsonKeyServiceEnv = "service_env_vars"
|
||||
)
|
||||
|
||||
var sensitiveEnvSubstrings = []string{"key", "token", "secret", "password", "credential"}
|
||||
|
||||
// addServiceParams reads the service.json file and adds a sanitized version to the bundle.
|
||||
// Non-NB_ env vars and vars with sensitive names are masked. Other NB_ values are anonymized.
|
||||
func (g *BundleGenerator) addServiceParams() error {
|
||||
path := filepath.Join(configs.StateDir, serviceParamsFile)
|
||||
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("read service params: %w", err)
|
||||
}
|
||||
|
||||
var params map[string]any
|
||||
if err := json.Unmarshal(data, ¶ms); err != nil {
|
||||
return fmt.Errorf("parse service params: %w", err)
|
||||
}
|
||||
|
||||
if g.anonymize {
|
||||
if mgmtURL, ok := params[jsonKeyManagementURL].(string); ok && mgmtURL != "" {
|
||||
params[jsonKeyManagementURL] = g.anonymizer.AnonymizeURI(mgmtURL)
|
||||
}
|
||||
}
|
||||
|
||||
g.sanitizeServiceEnvVars(params)
|
||||
|
||||
sanitizedData, err := json.MarshalIndent(params, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal sanitized service params: %w", err)
|
||||
}
|
||||
|
||||
if err := g.addFileToZip(bytes.NewReader(sanitizedData), serviceParamsBundle); err != nil {
|
||||
return fmt.Errorf("add service params to zip: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// sanitizeServiceEnvVars masks or anonymizes env var values in service params.
|
||||
// Non-NB_ vars and vars with sensitive names (key, token, etc.) are fully masked.
|
||||
// Other NB_ var values are passed through the anonymizer when anonymization is enabled.
|
||||
func (g *BundleGenerator) sanitizeServiceEnvVars(params map[string]any) {
|
||||
envVars, ok := params[jsonKeyServiceEnv].(map[string]any)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
sanitized := make(map[string]any, len(envVars))
|
||||
for k, v := range envVars {
|
||||
val, _ := v.(string)
|
||||
switch {
|
||||
case !strings.HasPrefix(k, envVarPrefix) || isSensitiveEnvVar(k):
|
||||
sanitized[k] = maskedValue
|
||||
case g.anonymize:
|
||||
sanitized[k] = g.anonymizer.AnonymizeString(val)
|
||||
default:
|
||||
sanitized[k] = val
|
||||
}
|
||||
}
|
||||
params[jsonKeyServiceEnv] = sanitized
|
||||
}
|
||||
|
||||
// isSensitiveEnvVar returns true for env var names that may contain secrets.
|
||||
func isSensitiveEnvVar(key string) bool {
|
||||
lower := strings.ToLower(key)
|
||||
for _, s := range sensitiveEnvSubstrings {
|
||||
if strings.Contains(lower, s) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (g *BundleGenerator) addCommonConfigFields(configContent *strings.Builder) {
|
||||
configContent.WriteString("NetBird Client Configuration:\n\n")
|
||||
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
package debug
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
@@ -10,6 +14,7 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/netbirdio/netbird/client/anonymize"
|
||||
"github.com/netbirdio/netbird/client/configs"
|
||||
mgmProto "github.com/netbirdio/netbird/shared/management/proto"
|
||||
)
|
||||
|
||||
@@ -420,6 +425,226 @@ func TestAnonymizeNetworkMap(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsSensitiveEnvVar(t *testing.T) {
|
||||
tests := []struct {
|
||||
key string
|
||||
sensitive bool
|
||||
}{
|
||||
{"NB_SETUP_KEY", true},
|
||||
{"NB_API_TOKEN", true},
|
||||
{"NB_CLIENT_SECRET", true},
|
||||
{"NB_PASSWORD", true},
|
||||
{"NB_CREDENTIAL", true},
|
||||
{"NB_LOG_LEVEL", false},
|
||||
{"NB_MANAGEMENT_URL", false},
|
||||
{"NB_HOSTNAME", false},
|
||||
{"HOME", false},
|
||||
{"PATH", false},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.key, func(t *testing.T) {
|
||||
assert.Equal(t, tt.sensitive, isSensitiveEnvVar(tt.key))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSanitizeServiceEnvVars(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
anonymize bool
|
||||
input map[string]any
|
||||
check func(t *testing.T, params map[string]any)
|
||||
}{
|
||||
{
|
||||
name: "no env vars key",
|
||||
anonymize: false,
|
||||
input: map[string]any{"management_url": "https://mgmt.example.com"},
|
||||
check: func(t *testing.T, params map[string]any) {
|
||||
t.Helper()
|
||||
assert.Equal(t, "https://mgmt.example.com", params["management_url"], "non-env fields should be untouched")
|
||||
_, ok := params[jsonKeyServiceEnv]
|
||||
assert.False(t, ok, "service_env_vars should not be added")
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "non-NB vars are masked",
|
||||
anonymize: false,
|
||||
input: map[string]any{
|
||||
jsonKeyServiceEnv: map[string]any{
|
||||
"HOME": "/root",
|
||||
"PATH": "/usr/bin",
|
||||
"NB_LOG_LEVEL": "debug",
|
||||
},
|
||||
},
|
||||
check: func(t *testing.T, params map[string]any) {
|
||||
t.Helper()
|
||||
env := params[jsonKeyServiceEnv].(map[string]any)
|
||||
assert.Equal(t, maskedValue, env["HOME"], "non-NB_ var should be masked")
|
||||
assert.Equal(t, maskedValue, env["PATH"], "non-NB_ var should be masked")
|
||||
assert.Equal(t, "debug", env["NB_LOG_LEVEL"], "safe NB_ var should pass through")
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "sensitive NB vars are masked",
|
||||
anonymize: false,
|
||||
input: map[string]any{
|
||||
jsonKeyServiceEnv: map[string]any{
|
||||
"NB_SETUP_KEY": "abc123",
|
||||
"NB_API_TOKEN": "tok_xyz",
|
||||
"NB_LOG_LEVEL": "info",
|
||||
},
|
||||
},
|
||||
check: func(t *testing.T, params map[string]any) {
|
||||
t.Helper()
|
||||
env := params[jsonKeyServiceEnv].(map[string]any)
|
||||
assert.Equal(t, maskedValue, env["NB_SETUP_KEY"], "sensitive NB_ var should be masked")
|
||||
assert.Equal(t, maskedValue, env["NB_API_TOKEN"], "sensitive NB_ var should be masked")
|
||||
assert.Equal(t, "info", env["NB_LOG_LEVEL"], "safe NB_ var should pass through")
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "safe NB vars anonymized when anonymize is true",
|
||||
anonymize: true,
|
||||
input: map[string]any{
|
||||
jsonKeyServiceEnv: map[string]any{
|
||||
"NB_MANAGEMENT_URL": "https://mgmt.example.com:443",
|
||||
"NB_LOG_LEVEL": "debug",
|
||||
"NB_SETUP_KEY": "secret",
|
||||
"SOME_OTHER": "val",
|
||||
},
|
||||
},
|
||||
check: func(t *testing.T, params map[string]any) {
|
||||
t.Helper()
|
||||
env := params[jsonKeyServiceEnv].(map[string]any)
|
||||
// Safe NB_ values should be anonymized (not the original, not masked)
|
||||
mgmtVal := env["NB_MANAGEMENT_URL"].(string)
|
||||
assert.NotEqual(t, "https://mgmt.example.com:443", mgmtVal, "should be anonymized")
|
||||
assert.NotEqual(t, maskedValue, mgmtVal, "should not be masked")
|
||||
|
||||
logVal := env["NB_LOG_LEVEL"].(string)
|
||||
assert.NotEqual(t, maskedValue, logVal, "safe NB_ var should not be masked")
|
||||
|
||||
// Sensitive and non-NB_ still masked
|
||||
assert.Equal(t, maskedValue, env["NB_SETUP_KEY"])
|
||||
assert.Equal(t, maskedValue, env["SOME_OTHER"])
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
anonymizer := anonymize.NewAnonymizer(anonymize.DefaultAddresses())
|
||||
g := &BundleGenerator{
|
||||
anonymize: tt.anonymize,
|
||||
anonymizer: anonymizer,
|
||||
}
|
||||
g.sanitizeServiceEnvVars(tt.input)
|
||||
tt.check(t, tt.input)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAddServiceParams(t *testing.T) {
|
||||
t.Run("missing service.json returns nil", func(t *testing.T) {
|
||||
g := &BundleGenerator{
|
||||
anonymizer: anonymize.NewAnonymizer(anonymize.DefaultAddresses()),
|
||||
}
|
||||
|
||||
origStateDir := configs.StateDir
|
||||
configs.StateDir = t.TempDir()
|
||||
t.Cleanup(func() { configs.StateDir = origStateDir })
|
||||
|
||||
err := g.addServiceParams()
|
||||
assert.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("management_url anonymized when anonymize is true", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
origStateDir := configs.StateDir
|
||||
configs.StateDir = dir
|
||||
t.Cleanup(func() { configs.StateDir = origStateDir })
|
||||
|
||||
input := map[string]any{
|
||||
jsonKeyManagementURL: "https://api.example.com:443",
|
||||
jsonKeyServiceEnv: map[string]any{
|
||||
"NB_LOG_LEVEL": "trace",
|
||||
},
|
||||
}
|
||||
data, err := json.Marshal(input)
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, serviceParamsFile), data, 0600))
|
||||
|
||||
var buf bytes.Buffer
|
||||
zw := zip.NewWriter(&buf)
|
||||
|
||||
g := &BundleGenerator{
|
||||
anonymize: true,
|
||||
anonymizer: anonymize.NewAnonymizer(anonymize.DefaultAddresses()),
|
||||
archive: zw,
|
||||
}
|
||||
|
||||
require.NoError(t, g.addServiceParams())
|
||||
require.NoError(t, zw.Close())
|
||||
|
||||
zr, err := zip.NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
|
||||
require.NoError(t, err)
|
||||
require.Len(t, zr.File, 1)
|
||||
assert.Equal(t, serviceParamsBundle, zr.File[0].Name)
|
||||
|
||||
rc, err := zr.File[0].Open()
|
||||
require.NoError(t, err)
|
||||
defer rc.Close()
|
||||
|
||||
var result map[string]any
|
||||
require.NoError(t, json.NewDecoder(rc).Decode(&result))
|
||||
|
||||
mgmt := result[jsonKeyManagementURL].(string)
|
||||
assert.NotEqual(t, "https://api.example.com:443", mgmt, "management_url should be anonymized")
|
||||
assert.NotEmpty(t, mgmt)
|
||||
|
||||
env := result[jsonKeyServiceEnv].(map[string]any)
|
||||
assert.NotEqual(t, maskedValue, env["NB_LOG_LEVEL"], "safe NB_ var should not be masked")
|
||||
})
|
||||
|
||||
t.Run("management_url preserved when anonymize is false", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
origStateDir := configs.StateDir
|
||||
configs.StateDir = dir
|
||||
t.Cleanup(func() { configs.StateDir = origStateDir })
|
||||
|
||||
input := map[string]any{
|
||||
jsonKeyManagementURL: "https://api.example.com:443",
|
||||
}
|
||||
data, err := json.Marshal(input)
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, serviceParamsFile), data, 0600))
|
||||
|
||||
var buf bytes.Buffer
|
||||
zw := zip.NewWriter(&buf)
|
||||
|
||||
g := &BundleGenerator{
|
||||
anonymize: false,
|
||||
anonymizer: anonymize.NewAnonymizer(anonymize.DefaultAddresses()),
|
||||
archive: zw,
|
||||
}
|
||||
|
||||
require.NoError(t, g.addServiceParams())
|
||||
require.NoError(t, zw.Close())
|
||||
|
||||
zr, err := zip.NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
|
||||
require.NoError(t, err)
|
||||
|
||||
rc, err := zr.File[0].Open()
|
||||
require.NoError(t, err)
|
||||
defer rc.Close()
|
||||
|
||||
var result map[string]any
|
||||
require.NoError(t, json.NewDecoder(rc).Decode(&result))
|
||||
|
||||
assert.Equal(t, "https://api.example.com:443", result[jsonKeyManagementURL], "management_url should be preserved")
|
||||
})
|
||||
}
|
||||
|
||||
// Helper function to check if IP is in CGNAT range
|
||||
func isInCGNATRange(ip net.IP) bool {
|
||||
cgnat := net.IPNet{
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"regexp"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -15,19 +16,29 @@ import (
|
||||
|
||||
const (
|
||||
defaultMappingTTL = 2 * time.Hour
|
||||
renewalInterval = defaultMappingTTL / 2
|
||||
discoveryTimeout = 10 * time.Second
|
||||
mappingDescription = "NetBird"
|
||||
)
|
||||
|
||||
// upnpErrPermanentLeaseOnly matches UPnP error 725 in SOAP fault XML,
|
||||
// allowing for whitespace/newlines between tags from different router firmware.
|
||||
var upnpErrPermanentLeaseOnly = regexp.MustCompile(`<errorCode>\s*725\s*</errorCode>`)
|
||||
|
||||
// Mapping represents an active NAT port mapping.
|
||||
type Mapping struct {
|
||||
Protocol string
|
||||
InternalPort uint16
|
||||
ExternalPort uint16
|
||||
ExternalIP net.IP
|
||||
NATType string
|
||||
// TTL is the lease duration. Zero means a permanent lease that never expires.
|
||||
TTL time.Duration
|
||||
}
|
||||
|
||||
// TODO: persist mapping state for crash recovery cleanup of permanent leases.
|
||||
// Currently not done because State.Cleanup requires NAT gateway re-discovery,
|
||||
// which blocks startup for ~10s when no gateway is present (affects all clients).
|
||||
|
||||
type Manager struct {
|
||||
cancel context.CancelFunc
|
||||
|
||||
@@ -43,6 +54,7 @@ type Manager struct {
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
// NewManager creates a new port forwarding manager.
|
||||
func NewManager() *Manager {
|
||||
return &Manager{
|
||||
stopCtx: make(chan context.Context, 1),
|
||||
@@ -77,8 +89,7 @@ func (m *Manager) Start(ctx context.Context, wgPort uint16) {
|
||||
|
||||
gateway, mapping, err := m.setup(ctx)
|
||||
if err != nil {
|
||||
log.Errorf("failed to setup NAT port mapping: %v", err)
|
||||
|
||||
log.Infof("port forwarding setup: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -86,7 +97,7 @@ func (m *Manager) Start(ctx context.Context, wgPort uint16) {
|
||||
m.mapping = mapping
|
||||
m.mappingLock.Unlock()
|
||||
|
||||
m.renewLoop(ctx, gateway)
|
||||
m.renewLoop(ctx, gateway, mapping.TTL)
|
||||
|
||||
select {
|
||||
case cleanupCtx := <-m.stopCtx:
|
||||
@@ -145,16 +156,14 @@ func (m *Manager) setup(ctx context.Context) (nat.NAT, *Mapping, error) {
|
||||
|
||||
gateway, err := nat.DiscoverGateway(discoverCtx)
|
||||
if err != nil {
|
||||
log.Infof("NAT gateway discovery failed: %v (port forwarding disabled)", err)
|
||||
return nil, nil, err
|
||||
return nil, nil, fmt.Errorf("discover gateway: %w", err)
|
||||
}
|
||||
|
||||
log.Infof("discovered NAT gateway: %s", gateway.Type())
|
||||
|
||||
mapping, err := m.createMapping(ctx, gateway)
|
||||
if err != nil {
|
||||
log.Warnf("failed to create port mapping: %v", err)
|
||||
return nil, nil, err
|
||||
return nil, nil, fmt.Errorf("create port mapping: %w", err)
|
||||
}
|
||||
return gateway, mapping, nil
|
||||
}
|
||||
@@ -163,9 +172,18 @@ func (m *Manager) createMapping(ctx context.Context, gateway nat.NAT) (*Mapping,
|
||||
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
externalPort, err := gateway.AddPortMapping(ctx, "udp", int(m.wgPort), mappingDescription, defaultMappingTTL)
|
||||
ttl := defaultMappingTTL
|
||||
externalPort, err := gateway.AddPortMapping(ctx, "udp", int(m.wgPort), mappingDescription, ttl)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
if !isPermanentLeaseRequired(err) {
|
||||
return nil, err
|
||||
}
|
||||
log.Infof("gateway only supports permanent leases, retrying with indefinite duration")
|
||||
ttl = 0
|
||||
externalPort, err = gateway.AddPortMapping(ctx, "udp", int(m.wgPort), mappingDescription, ttl)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
externalIP, err := gateway.GetExternalAddress()
|
||||
@@ -180,6 +198,7 @@ func (m *Manager) createMapping(ctx context.Context, gateway nat.NAT) (*Mapping,
|
||||
ExternalPort: uint16(externalPort),
|
||||
ExternalIP: externalIP,
|
||||
NATType: gateway.Type(),
|
||||
TTL: ttl,
|
||||
}
|
||||
|
||||
log.Infof("created port mapping: %d -> %d via %s (external IP: %s)",
|
||||
@@ -187,8 +206,14 @@ func (m *Manager) createMapping(ctx context.Context, gateway nat.NAT) (*Mapping,
|
||||
return mapping, nil
|
||||
}
|
||||
|
||||
func (m *Manager) renewLoop(ctx context.Context, gateway nat.NAT) {
|
||||
ticker := time.NewTicker(renewalInterval)
|
||||
func (m *Manager) renewLoop(ctx context.Context, gateway nat.NAT, ttl time.Duration) {
|
||||
if ttl == 0 {
|
||||
// Permanent mappings don't expire, just wait for cancellation.
|
||||
<-ctx.Done()
|
||||
return
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(ttl / 2)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
@@ -208,7 +233,7 @@ func (m *Manager) renewMapping(ctx context.Context, gateway nat.NAT) error {
|
||||
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
externalPort, err := gateway.AddPortMapping(ctx, m.mapping.Protocol, int(m.mapping.InternalPort), mappingDescription, defaultMappingTTL)
|
||||
externalPort, err := gateway.AddPortMapping(ctx, m.mapping.Protocol, int(m.mapping.InternalPort), mappingDescription, m.mapping.TTL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("add port mapping: %w", err)
|
||||
}
|
||||
@@ -248,3 +273,8 @@ func (m *Manager) startTearDown(ctx context.Context) {
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
// isPermanentLeaseRequired checks if a UPnP error indicates the gateway only supports permanent leases (error 725).
|
||||
func isPermanentLeaseRequired(err error) bool {
|
||||
return err != nil && upnpErrPermanentLeaseOnly.MatchString(err.Error())
|
||||
}
|
||||
|
||||
@@ -3,15 +3,18 @@ package portforward
|
||||
import (
|
||||
"context"
|
||||
"net"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Mapping represents port mapping information.
|
||||
// Mapping represents an active NAT port mapping.
|
||||
type Mapping struct {
|
||||
Protocol string
|
||||
InternalPort uint16
|
||||
ExternalPort uint16
|
||||
ExternalIP net.IP
|
||||
NATType string
|
||||
// TTL is the lease duration. Zero means a permanent lease that never expires.
|
||||
TTL time.Duration
|
||||
}
|
||||
|
||||
// Manager is a stub for js/wasm builds where NAT-PMP/UPnP is not supported.
|
||||
|
||||
@@ -4,23 +4,25 @@ package portforward
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/libp2p/go-nat"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
type mockNAT struct {
|
||||
natType string
|
||||
deviceAddr net.IP
|
||||
externalAddr net.IP
|
||||
internalAddr net.IP
|
||||
mappings map[int]int
|
||||
addMappingErr error
|
||||
deleteMappingErr error
|
||||
natType string
|
||||
deviceAddr net.IP
|
||||
externalAddr net.IP
|
||||
internalAddr net.IP
|
||||
mappings map[int]int
|
||||
addMappingErr error
|
||||
deleteMappingErr error
|
||||
onlyPermanentLeases bool
|
||||
lastTimeout time.Duration
|
||||
}
|
||||
|
||||
func newMockNAT() *mockNAT {
|
||||
@@ -53,8 +55,12 @@ func (m *mockNAT) AddPortMapping(ctx context.Context, protocol string, internalP
|
||||
if m.addMappingErr != nil {
|
||||
return 0, m.addMappingErr
|
||||
}
|
||||
if m.onlyPermanentLeases && timeout != 0 {
|
||||
return 0, fmt.Errorf("SOAP fault. Code: | Explanation: | Detail: <UPnPError xmlns=\"urn:schemas-upnp-org:control-1-0\"><errorCode>725</errorCode><errorDescription>OnlyPermanentLeasesSupported</errorDescription></UPnPError>")
|
||||
}
|
||||
externalPort := internalPort
|
||||
m.mappings[internalPort] = externalPort
|
||||
m.lastTimeout = timeout
|
||||
return externalPort, nil
|
||||
}
|
||||
|
||||
@@ -80,6 +86,7 @@ func TestManager_CreateMapping(t *testing.T) {
|
||||
assert.Equal(t, uint16(51820), mapping.ExternalPort)
|
||||
assert.Equal(t, "Mock-NAT", mapping.NATType)
|
||||
assert.Equal(t, net.ParseIP("203.0.113.50").To4(), mapping.ExternalIP.To4())
|
||||
assert.Equal(t, defaultMappingTTL, mapping.TTL)
|
||||
}
|
||||
|
||||
func TestManager_GetMapping_ReturnsNilWhenNotReady(t *testing.T) {
|
||||
@@ -131,29 +138,64 @@ func TestManager_Cleanup_NilMapping(t *testing.T) {
|
||||
m.cleanup(context.Background(), gateway)
|
||||
}
|
||||
|
||||
func TestState_Cleanup(t *testing.T) {
|
||||
origDiscover := discoverGateway
|
||||
defer func() { discoverGateway = origDiscover }()
|
||||
|
||||
mockGateway := newMockNAT()
|
||||
mockGateway.mappings[51820] = 51820
|
||||
discoverGateway = func(ctx context.Context) (nat.NAT, error) {
|
||||
return mockGateway, nil
|
||||
}
|
||||
func TestManager_CreateMapping_PermanentLeaseFallback(t *testing.T) {
|
||||
m := NewManager()
|
||||
m.wgPort = 51820
|
||||
|
||||
state := &State{
|
||||
Protocol: "udp",
|
||||
InternalPort: 51820,
|
||||
}
|
||||
gateway := newMockNAT()
|
||||
gateway.onlyPermanentLeases = true
|
||||
|
||||
err := state.Cleanup()
|
||||
assert.NoError(t, err)
|
||||
mapping, err := m.createMapping(context.Background(), gateway)
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, mapping)
|
||||
|
||||
_, exists := mockGateway.mappings[51820]
|
||||
assert.False(t, exists, "mapping should be deleted after cleanup")
|
||||
assert.Equal(t, uint16(51820), mapping.InternalPort)
|
||||
assert.Equal(t, time.Duration(0), mapping.TTL, "should return zero TTL for permanent lease")
|
||||
assert.Equal(t, time.Duration(0), gateway.lastTimeout, "should have retried with zero duration")
|
||||
}
|
||||
|
||||
func TestState_Name(t *testing.T) {
|
||||
state := &State{}
|
||||
assert.Equal(t, "port_forward_state", state.Name())
|
||||
func TestIsPermanentLeaseRequired(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
err error
|
||||
expected bool
|
||||
}{
|
||||
{
|
||||
name: "nil error",
|
||||
err: nil,
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "UPnP error 725",
|
||||
err: fmt.Errorf("SOAP fault. Code: | Detail: <UPnPError><errorCode>725</errorCode><errorDescription>OnlyPermanentLeasesSupported</errorDescription></UPnPError>"),
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "wrapped error with 725",
|
||||
err: fmt.Errorf("add port mapping: %w", fmt.Errorf("Detail: <errorCode>725</errorCode>")),
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "error 725 with newlines in XML",
|
||||
err: fmt.Errorf("<errorCode>\n 725\n</errorCode>"),
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "bare 725 without XML tag",
|
||||
err: fmt.Errorf("error code 725"),
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "unrelated error",
|
||||
err: fmt.Errorf("connection refused"),
|
||||
expected: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
assert.Equal(t, tt.expected, isPermanentLeaseRequired(tt.err))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
//go:build !js
|
||||
|
||||
package portforward
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/libp2p/go-nat"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// discoverGateway is the function used for NAT gateway discovery.
|
||||
// It can be replaced in tests to avoid real network operations.
|
||||
var discoverGateway = nat.DiscoverGateway
|
||||
|
||||
// State is persisted only for crash recovery cleanup
|
||||
type State struct {
|
||||
InternalPort uint16 `json:"internal_port,omitempty"`
|
||||
Protocol string `json:"protocol,omitempty"`
|
||||
}
|
||||
|
||||
func (s *State) Name() string {
|
||||
return "port_forward_state"
|
||||
}
|
||||
|
||||
// Cleanup implements statemanager.CleanableState for crash recovery
|
||||
func (s *State) Cleanup() error {
|
||||
if s.InternalPort == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
log.Infof("cleaning up stale port mapping for port %d", s.InternalPort)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), discoveryTimeout)
|
||||
defer cancel()
|
||||
|
||||
gateway, err := discoverGateway(ctx)
|
||||
if err != nil {
|
||||
// Discovery failure is not an error - gateway may not exist
|
||||
log.Debugf("cleanup: no gateway found: %v", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := gateway.DeletePortMapping(ctx, s.Protocol, int(s.InternalPort)); err != nil {
|
||||
return fmt.Errorf("delete port mapping: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -168,6 +168,7 @@ func (m *DefaultManager) setupAndroidRoutes(config ManagerConfig) {
|
||||
NetworkType: route.IPv4Network,
|
||||
}
|
||||
cr = append(cr, fakeIPRoute)
|
||||
m.notifier.SetFakeIPRoute(fakeIPRoute)
|
||||
}
|
||||
|
||||
m.notifier.SetInitialClientRoutes(cr, routesForComparison)
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
type Notifier struct {
|
||||
initialRoutes []*route.Route
|
||||
currentRoutes []*route.Route
|
||||
fakeIPRoute *route.Route
|
||||
|
||||
listener listener.NetworkChangeListener
|
||||
listenerMux sync.Mutex
|
||||
@@ -31,13 +32,17 @@ func (n *Notifier) SetListener(listener listener.NetworkChangeListener) {
|
||||
n.listener = listener
|
||||
}
|
||||
|
||||
// SetInitialClientRoutes stores the full initial route set (including fake IP blocks)
|
||||
// and a separate comparison set (without fake IP blocks) for diff detection.
|
||||
// SetInitialClientRoutes stores the initial route sets for TUN configuration.
|
||||
func (n *Notifier) SetInitialClientRoutes(initialRoutes []*route.Route, routesForComparison []*route.Route) {
|
||||
n.initialRoutes = filterStatic(initialRoutes)
|
||||
n.currentRoutes = filterStatic(routesForComparison)
|
||||
}
|
||||
|
||||
// SetFakeIPRoute stores the fake IP route to be included in every TUN rebuild.
|
||||
func (n *Notifier) SetFakeIPRoute(r *route.Route) {
|
||||
n.fakeIPRoute = r
|
||||
}
|
||||
|
||||
func (n *Notifier) OnNewRoutes(idMap route.HAMap) {
|
||||
var newRoutes []*route.Route
|
||||
for _, routes := range idMap {
|
||||
@@ -69,7 +74,9 @@ func (n *Notifier) notify() {
|
||||
}
|
||||
|
||||
allRoutes := slices.Clone(n.currentRoutes)
|
||||
allRoutes = append(allRoutes, n.extraInitialRoutes()...)
|
||||
if n.fakeIPRoute != nil {
|
||||
allRoutes = append(allRoutes, n.fakeIPRoute)
|
||||
}
|
||||
|
||||
routeStrings := n.routesToStrings(allRoutes)
|
||||
sort.Strings(routeStrings)
|
||||
@@ -78,23 +85,6 @@ func (n *Notifier) notify() {
|
||||
}(n.listener)
|
||||
}
|
||||
|
||||
// extraInitialRoutes returns initialRoutes whose network prefix is absent
|
||||
// from currentRoutes (e.g. the fake IP block added at setup time).
|
||||
func (n *Notifier) extraInitialRoutes() []*route.Route {
|
||||
currentNets := make(map[netip.Prefix]struct{}, len(n.currentRoutes))
|
||||
for _, r := range n.currentRoutes {
|
||||
currentNets[r.Network] = struct{}{}
|
||||
}
|
||||
|
||||
var extra []*route.Route
|
||||
for _, r := range n.initialRoutes {
|
||||
if _, ok := currentNets[r.Network]; !ok {
|
||||
extra = append(extra, r)
|
||||
}
|
||||
}
|
||||
return extra
|
||||
}
|
||||
|
||||
func filterStatic(routes []*route.Route) []*route.Route {
|
||||
out := make([]*route.Route, 0, len(routes))
|
||||
for _, r := range routes {
|
||||
|
||||
@@ -34,6 +34,10 @@ func (n *Notifier) SetInitialClientRoutes([]*route.Route, []*route.Route) {
|
||||
// iOS doesn't care about initial routes
|
||||
}
|
||||
|
||||
func (n *Notifier) SetFakeIPRoute(*route.Route) {
|
||||
// Not used on iOS
|
||||
}
|
||||
|
||||
func (n *Notifier) OnNewRoutes(route.HAMap) {
|
||||
// Not used on iOS
|
||||
}
|
||||
|
||||
@@ -23,6 +23,10 @@ func (n *Notifier) SetInitialClientRoutes([]*route.Route, []*route.Route) {
|
||||
// Not used on non-mobile platforms
|
||||
}
|
||||
|
||||
func (n *Notifier) SetFakeIPRoute(*route.Route) {
|
||||
// Not used on non-mobile platforms
|
||||
}
|
||||
|
||||
func (n *Notifier) OnNewRoutes(idMap route.HAMap) {
|
||||
// Not used on non-mobile platforms
|
||||
}
|
||||
|
||||
@@ -10,10 +10,6 @@ import (
|
||||
)
|
||||
|
||||
// registerStates registers all states that need crash recovery cleanup.
|
||||
// Note: portforward.State is intentionally NOT registered here to avoid blocking startup
|
||||
// for up to 10 seconds during NAT gateway discovery when no gateway is present.
|
||||
// The gateway reference cannot be persisted across restarts, so cleanup requires re-discovery.
|
||||
// Port forward cleanup is handled by the Manager during normal operation instead.
|
||||
func registerStates(mgr *statemanager.Manager) {
|
||||
mgr.RegisterState(&dns.ShutdownState{})
|
||||
mgr.RegisterState(&systemops.ShutdownState{})
|
||||
|
||||
@@ -12,10 +12,6 @@ import (
|
||||
)
|
||||
|
||||
// registerStates registers all states that need crash recovery cleanup.
|
||||
// Note: portforward.State is intentionally NOT registered here to avoid blocking startup
|
||||
// for up to 10 seconds during NAT gateway discovery when no gateway is present.
|
||||
// The gateway reference cannot be persisted across restarts, so cleanup requires re-discovery.
|
||||
// Port forward cleanup is handled by the Manager during normal operation instead.
|
||||
func registerStates(mgr *statemanager.Manager) {
|
||||
mgr.RegisterState(&dns.ShutdownState{})
|
||||
mgr.RegisterState(&systemops.ShutdownState{})
|
||||
|
||||
@@ -43,18 +43,24 @@ func GetInfo(ctx context.Context) *Info {
|
||||
|
||||
systemHostname, _ := os.Hostname()
|
||||
|
||||
addrs, err := networkAddresses()
|
||||
if err != nil {
|
||||
log.Warnf("failed to discover network addresses: %s", err)
|
||||
}
|
||||
|
||||
return &Info{
|
||||
GoOS: runtime.GOOS,
|
||||
Kernel: osInfo[0],
|
||||
Platform: runtime.GOARCH,
|
||||
OS: osName,
|
||||
OSVersion: osVersion,
|
||||
Hostname: extractDeviceName(ctx, systemHostname),
|
||||
CPUs: runtime.NumCPU(),
|
||||
NetbirdVersion: version.NetbirdVersion(),
|
||||
UIVersion: extractUserAgent(ctx),
|
||||
KernelVersion: osInfo[1],
|
||||
Environment: env,
|
||||
GoOS: runtime.GOOS,
|
||||
Kernel: osInfo[0],
|
||||
Platform: runtime.GOARCH,
|
||||
OS: osName,
|
||||
OSVersion: osVersion,
|
||||
Hostname: extractDeviceName(ctx, systemHostname),
|
||||
CPUs: runtime.NumCPU(),
|
||||
NetbirdVersion: version.NetbirdVersion(),
|
||||
UIVersion: extractUserAgent(ctx),
|
||||
KernelVersion: osInfo[1],
|
||||
NetworkAddresses: addrs,
|
||||
Environment: env,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -179,9 +179,11 @@ type StoreConfig struct {
|
||||
|
||||
// ReverseProxyConfig contains reverse proxy settings
|
||||
type ReverseProxyConfig struct {
|
||||
TrustedHTTPProxies []string `yaml:"trustedHTTPProxies"`
|
||||
TrustedHTTPProxiesCount uint `yaml:"trustedHTTPProxiesCount"`
|
||||
TrustedPeers []string `yaml:"trustedPeers"`
|
||||
TrustedHTTPProxies []string `yaml:"trustedHTTPProxies"`
|
||||
TrustedHTTPProxiesCount uint `yaml:"trustedHTTPProxiesCount"`
|
||||
TrustedPeers []string `yaml:"trustedPeers"`
|
||||
AccessLogRetentionDays int `yaml:"accessLogRetentionDays"`
|
||||
AccessLogCleanupIntervalHours int `yaml:"accessLogCleanupIntervalHours"`
|
||||
}
|
||||
|
||||
// DefaultConfig returns a CombinedConfig with default values
|
||||
@@ -645,7 +647,9 @@ func (c *CombinedConfig) ToManagementConfig() (*nbconfig.Config, error) {
|
||||
|
||||
// Build reverse proxy config
|
||||
reverseProxy := nbconfig.ReverseProxy{
|
||||
TrustedHTTPProxiesCount: mgmt.ReverseProxy.TrustedHTTPProxiesCount,
|
||||
TrustedHTTPProxiesCount: mgmt.ReverseProxy.TrustedHTTPProxiesCount,
|
||||
AccessLogRetentionDays: mgmt.ReverseProxy.AccessLogRetentionDays,
|
||||
AccessLogCleanupIntervalHours: mgmt.ReverseProxy.AccessLogCleanupIntervalHours,
|
||||
}
|
||||
for _, p := range mgmt.ReverseProxy.TrustedHTTPProxies {
|
||||
if prefix, err := netip.ParsePrefix(p); err == nil {
|
||||
|
||||
@@ -14,7 +14,6 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/connectivity"
|
||||
"google.golang.org/grpc/credentials"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
"google.golang.org/grpc/keepalive"
|
||||
@@ -26,11 +25,22 @@ import (
|
||||
"github.com/netbirdio/netbird/util/wsproxy"
|
||||
)
|
||||
|
||||
var ErrClientClosed = errors.New("client is closed")
|
||||
|
||||
// minHealthyDuration is the minimum time a stream must survive before a failure
|
||||
// resets the backoff timer. Streams that fail faster are considered unhealthy and
|
||||
// should not reset backoff, so that MaxElapsedTime can eventually stop retries.
|
||||
const minHealthyDuration = 5 * time.Second
|
||||
|
||||
type GRPCClient struct {
|
||||
realClient proto.FlowServiceClient
|
||||
clientConn *grpc.ClientConn
|
||||
stream proto.FlowService_EventsClient
|
||||
streamMu sync.Mutex
|
||||
target string
|
||||
opts []grpc.DialOption
|
||||
closed bool // prevent creating conn in the middle of the Close
|
||||
receiving bool // prevent concurrent Receive calls
|
||||
mu sync.Mutex // protects clientConn, realClient, stream, closed, and receiving
|
||||
}
|
||||
|
||||
func NewClient(addr, payload, signature string, interval time.Duration) (*GRPCClient, error) {
|
||||
@@ -65,7 +75,8 @@ func NewClient(addr, payload, signature string, interval time.Duration) (*GRPCCl
|
||||
grpc.WithDefaultServiceConfig(`{"healthCheckConfig": {"serviceName": ""}}`),
|
||||
)
|
||||
|
||||
conn, err := grpc.NewClient(fmt.Sprintf("%s:%s", parsedURL.Hostname(), parsedURL.Port()), opts...)
|
||||
target := parsedURL.Host
|
||||
conn, err := grpc.NewClient(target, opts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating new grpc client: %w", err)
|
||||
}
|
||||
@@ -73,30 +84,73 @@ func NewClient(addr, payload, signature string, interval time.Duration) (*GRPCCl
|
||||
return &GRPCClient{
|
||||
realClient: proto.NewFlowServiceClient(conn),
|
||||
clientConn: conn,
|
||||
target: target,
|
||||
opts: opts,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *GRPCClient) Close() error {
|
||||
c.streamMu.Lock()
|
||||
defer c.streamMu.Unlock()
|
||||
|
||||
c.mu.Lock()
|
||||
c.closed = true
|
||||
c.stream = nil
|
||||
if err := c.clientConn.Close(); err != nil && !errors.Is(err, context.Canceled) {
|
||||
conn := c.clientConn
|
||||
c.clientConn = nil
|
||||
c.mu.Unlock()
|
||||
|
||||
if conn == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := conn.Close(); err != nil && !errors.Is(err, context.Canceled) {
|
||||
return fmt.Errorf("close client connection: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *GRPCClient) Send(event *proto.FlowEvent) error {
|
||||
c.mu.Lock()
|
||||
stream := c.stream
|
||||
c.mu.Unlock()
|
||||
|
||||
if stream == nil {
|
||||
return errors.New("stream not initialized")
|
||||
}
|
||||
|
||||
if err := stream.Send(event); err != nil {
|
||||
return fmt.Errorf("send flow event: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *GRPCClient) Receive(ctx context.Context, interval time.Duration, msgHandler func(msg *proto.FlowEventAck) error) error {
|
||||
c.mu.Lock()
|
||||
if c.receiving {
|
||||
c.mu.Unlock()
|
||||
return errors.New("concurrent Receive calls are not supported")
|
||||
}
|
||||
c.receiving = true
|
||||
c.mu.Unlock()
|
||||
defer func() {
|
||||
c.mu.Lock()
|
||||
c.receiving = false
|
||||
c.mu.Unlock()
|
||||
}()
|
||||
|
||||
backOff := defaultBackoff(ctx, interval)
|
||||
operation := func() error {
|
||||
if err := c.establishStreamAndReceive(ctx, msgHandler); err != nil {
|
||||
if s, ok := status.FromError(err); ok && s.Code() == codes.Canceled {
|
||||
return fmt.Errorf("receive: %w: %w", err, context.Canceled)
|
||||
}
|
||||
stream, err := c.establishStream(ctx)
|
||||
if err != nil {
|
||||
log.Errorf("failed to establish flow stream, retrying: %v", err)
|
||||
return c.handleRetryableError(err, time.Time{}, backOff)
|
||||
}
|
||||
|
||||
streamStart := time.Now()
|
||||
|
||||
if err := c.receive(stream, msgHandler); err != nil {
|
||||
log.Errorf("receive failed: %v", err)
|
||||
return fmt.Errorf("receive: %w", err)
|
||||
return c.handleRetryableError(err, streamStart, backOff)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -108,37 +162,106 @@ func (c *GRPCClient) Receive(ctx context.Context, interval time.Duration, msgHan
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *GRPCClient) establishStreamAndReceive(ctx context.Context, msgHandler func(msg *proto.FlowEventAck) error) error {
|
||||
if c.clientConn.GetState() == connectivity.Shutdown {
|
||||
return errors.New("connection to flow receiver has been shut down")
|
||||
// handleRetryableError resets the backoff timer if the stream was healthy long
|
||||
// enough and recreates the underlying ClientConn so that gRPC's internal
|
||||
// subchannel backoff does not accumulate and compete with our own retry timer.
|
||||
// A zero streamStart means the stream was never established.
|
||||
func (c *GRPCClient) handleRetryableError(err error, streamStart time.Time, backOff backoff.BackOff) error {
|
||||
if isContextDone(err) {
|
||||
return backoff.Permanent(err)
|
||||
}
|
||||
|
||||
stream, err := c.realClient.Events(ctx, grpc.WaitForReady(true))
|
||||
if err != nil {
|
||||
return fmt.Errorf("create event stream: %w", err)
|
||||
var permErr *backoff.PermanentError
|
||||
if errors.As(err, &permErr) {
|
||||
return err
|
||||
}
|
||||
|
||||
err = stream.Send(&proto.FlowEvent{IsInitiator: true})
|
||||
// Reset the backoff so the next retry starts with a short delay instead of
|
||||
// continuing the already-elapsed timer. Only do this if the stream was healthy
|
||||
// long enough; short-lived connect/drop cycles must not defeat MaxElapsedTime.
|
||||
if !streamStart.IsZero() && time.Since(streamStart) >= minHealthyDuration {
|
||||
backOff.Reset()
|
||||
}
|
||||
|
||||
if recreateErr := c.recreateConnection(); recreateErr != nil {
|
||||
log.Errorf("recreate connection: %v", recreateErr)
|
||||
return recreateErr
|
||||
}
|
||||
|
||||
log.Infof("connection recreated, retrying stream")
|
||||
return fmt.Errorf("retrying after error: %w", err)
|
||||
}
|
||||
|
||||
func (c *GRPCClient) recreateConnection() error {
|
||||
c.mu.Lock()
|
||||
if c.closed {
|
||||
c.mu.Unlock()
|
||||
return backoff.Permanent(ErrClientClosed)
|
||||
}
|
||||
|
||||
conn, err := grpc.NewClient(c.target, c.opts...)
|
||||
if err != nil {
|
||||
log.Infof("failed to send initiator message to flow receiver but will attempt to continue. Error: %s", err)
|
||||
c.mu.Unlock()
|
||||
return fmt.Errorf("create new connection: %w", err)
|
||||
}
|
||||
|
||||
old := c.clientConn
|
||||
c.clientConn = conn
|
||||
c.realClient = proto.NewFlowServiceClient(conn)
|
||||
c.stream = nil
|
||||
c.mu.Unlock()
|
||||
|
||||
_ = old.Close()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *GRPCClient) establishStream(ctx context.Context) (proto.FlowService_EventsClient, error) {
|
||||
c.mu.Lock()
|
||||
if c.closed {
|
||||
c.mu.Unlock()
|
||||
return nil, backoff.Permanent(ErrClientClosed)
|
||||
}
|
||||
cl := c.realClient
|
||||
c.mu.Unlock()
|
||||
|
||||
// open stream outside the lock — blocking operation
|
||||
stream, err := cl.Events(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create event stream: %w", err)
|
||||
}
|
||||
streamReady := false
|
||||
defer func() {
|
||||
if !streamReady {
|
||||
_ = stream.CloseSend()
|
||||
}
|
||||
}()
|
||||
|
||||
if err = stream.Send(&proto.FlowEvent{IsInitiator: true}); err != nil {
|
||||
return nil, fmt.Errorf("send initiator: %w", err)
|
||||
}
|
||||
|
||||
if err = checkHeader(stream); err != nil {
|
||||
return fmt.Errorf("check header: %w", err)
|
||||
return nil, fmt.Errorf("check header: %w", err)
|
||||
}
|
||||
|
||||
c.streamMu.Lock()
|
||||
c.mu.Lock()
|
||||
if c.closed {
|
||||
c.mu.Unlock()
|
||||
return nil, backoff.Permanent(ErrClientClosed)
|
||||
}
|
||||
c.stream = stream
|
||||
c.streamMu.Unlock()
|
||||
c.mu.Unlock()
|
||||
streamReady = true
|
||||
|
||||
return c.receive(stream, msgHandler)
|
||||
return stream, nil
|
||||
}
|
||||
|
||||
func (c *GRPCClient) receive(stream proto.FlowService_EventsClient, msgHandler func(msg *proto.FlowEventAck) error) error {
|
||||
for {
|
||||
msg, err := stream.Recv()
|
||||
if err != nil {
|
||||
return fmt.Errorf("receive from stream: %w", err)
|
||||
return err
|
||||
}
|
||||
|
||||
if msg.IsInitiator {
|
||||
@@ -169,7 +292,7 @@ func checkHeader(stream proto.FlowService_EventsClient) error {
|
||||
func defaultBackoff(ctx context.Context, interval time.Duration) backoff.BackOff {
|
||||
return backoff.WithContext(&backoff.ExponentialBackOff{
|
||||
InitialInterval: 800 * time.Millisecond,
|
||||
RandomizationFactor: 1,
|
||||
RandomizationFactor: 0.5,
|
||||
Multiplier: 1.7,
|
||||
MaxInterval: interval / 2,
|
||||
MaxElapsedTime: 3 * 30 * 24 * time.Hour, // 3 months
|
||||
@@ -178,18 +301,12 @@ func defaultBackoff(ctx context.Context, interval time.Duration) backoff.BackOff
|
||||
}, ctx)
|
||||
}
|
||||
|
||||
func (c *GRPCClient) Send(event *proto.FlowEvent) error {
|
||||
c.streamMu.Lock()
|
||||
stream := c.stream
|
||||
c.streamMu.Unlock()
|
||||
|
||||
if stream == nil {
|
||||
return errors.New("stream not initialized")
|
||||
func isContextDone(err error) bool {
|
||||
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
|
||||
return true
|
||||
}
|
||||
|
||||
if err := stream.Send(event); err != nil {
|
||||
return fmt.Errorf("send flow event: %w", err)
|
||||
if s, ok := status.FromError(err); ok {
|
||||
return s.Code() == codes.Canceled || s.Code() == codes.DeadlineExceeded
|
||||
}
|
||||
|
||||
return nil
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -2,8 +2,11 @@ package client_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"net"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -11,6 +14,8 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/codes"
|
||||
"google.golang.org/grpc/status"
|
||||
|
||||
flow "github.com/netbirdio/netbird/flow/client"
|
||||
"github.com/netbirdio/netbird/flow/proto"
|
||||
@@ -18,21 +23,89 @@ import (
|
||||
|
||||
type testServer struct {
|
||||
proto.UnimplementedFlowServiceServer
|
||||
events chan *proto.FlowEvent
|
||||
acks chan *proto.FlowEventAck
|
||||
grpcSrv *grpc.Server
|
||||
addr string
|
||||
events chan *proto.FlowEvent
|
||||
acks chan *proto.FlowEventAck
|
||||
grpcSrv *grpc.Server
|
||||
addr string
|
||||
listener *connTrackListener
|
||||
closeStream chan struct{} // signal server to close the stream
|
||||
handlerDone chan struct{} // signaled each time Events() exits
|
||||
handlerStarted chan struct{} // signaled each time Events() begins
|
||||
}
|
||||
|
||||
// connTrackListener wraps a net.Listener to track accepted connections
|
||||
// so tests can forcefully close them to simulate PROTOCOL_ERROR/RST_STREAM.
|
||||
type connTrackListener struct {
|
||||
net.Listener
|
||||
mu sync.Mutex
|
||||
conns []net.Conn
|
||||
}
|
||||
|
||||
func (l *connTrackListener) Accept() (net.Conn, error) {
|
||||
c, err := l.Listener.Accept()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
l.mu.Lock()
|
||||
l.conns = append(l.conns, c)
|
||||
l.mu.Unlock()
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// sendRSTStream writes a raw HTTP/2 RST_STREAM frame with PROTOCOL_ERROR
|
||||
// (error code 0x1) on every tracked connection. This produces the exact error:
|
||||
//
|
||||
// rpc error: code = Internal desc = stream terminated by RST_STREAM with error code: PROTOCOL_ERROR
|
||||
//
|
||||
// HTTP/2 RST_STREAM frame format (9-byte header + 4-byte payload):
|
||||
//
|
||||
// Length (3 bytes): 0x000004
|
||||
// Type (1 byte): 0x03 (RST_STREAM)
|
||||
// Flags (1 byte): 0x00
|
||||
// Stream ID (4 bytes): target stream (must have bit 31 clear)
|
||||
// Error Code (4 bytes): 0x00000001 (PROTOCOL_ERROR)
|
||||
func (l *connTrackListener) connCount() int {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
return len(l.conns)
|
||||
}
|
||||
|
||||
func (l *connTrackListener) sendRSTStream(streamID uint32) {
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
|
||||
frame := make([]byte, 13) // 9-byte header + 4-byte payload
|
||||
// Length = 4 (3 bytes, big-endian)
|
||||
frame[0], frame[1], frame[2] = 0, 0, 4
|
||||
// Type = RST_STREAM (0x03)
|
||||
frame[3] = 0x03
|
||||
// Flags = 0
|
||||
frame[4] = 0x00
|
||||
// Stream ID (4 bytes, big-endian, bit 31 reserved = 0)
|
||||
binary.BigEndian.PutUint32(frame[5:9], streamID)
|
||||
// Error Code = PROTOCOL_ERROR (0x1)
|
||||
binary.BigEndian.PutUint32(frame[9:13], 0x1)
|
||||
|
||||
for _, c := range l.conns {
|
||||
_, _ = c.Write(frame)
|
||||
}
|
||||
}
|
||||
|
||||
func newTestServer(t *testing.T) *testServer {
|
||||
listener, err := net.Listen("tcp", "127.0.0.1:0")
|
||||
rawListener, err := net.Listen("tcp", "127.0.0.1:0")
|
||||
require.NoError(t, err)
|
||||
|
||||
listener := &connTrackListener{Listener: rawListener}
|
||||
|
||||
s := &testServer{
|
||||
events: make(chan *proto.FlowEvent, 100),
|
||||
acks: make(chan *proto.FlowEventAck, 100),
|
||||
grpcSrv: grpc.NewServer(),
|
||||
addr: listener.Addr().String(),
|
||||
events: make(chan *proto.FlowEvent, 100),
|
||||
acks: make(chan *proto.FlowEventAck, 100),
|
||||
grpcSrv: grpc.NewServer(),
|
||||
addr: rawListener.Addr().String(),
|
||||
listener: listener,
|
||||
closeStream: make(chan struct{}, 1),
|
||||
handlerDone: make(chan struct{}, 10),
|
||||
handlerStarted: make(chan struct{}, 10),
|
||||
}
|
||||
|
||||
proto.RegisterFlowServiceServer(s.grpcSrv, s)
|
||||
@@ -51,11 +124,23 @@ func newTestServer(t *testing.T) *testServer {
|
||||
}
|
||||
|
||||
func (s *testServer) Events(stream proto.FlowService_EventsServer) error {
|
||||
defer func() {
|
||||
select {
|
||||
case s.handlerDone <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}()
|
||||
|
||||
err := stream.Send(&proto.FlowEventAck{IsInitiator: true})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
select {
|
||||
case s.handlerStarted <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(stream.Context())
|
||||
defer cancel()
|
||||
|
||||
@@ -91,6 +176,8 @@ func (s *testServer) Events(stream proto.FlowService_EventsServer) error {
|
||||
if err := stream.Send(ack); err != nil {
|
||||
return err
|
||||
}
|
||||
case <-s.closeStream:
|
||||
return status.Errorf(codes.Internal, "server closing stream")
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
@@ -110,16 +197,13 @@ func TestReceive(t *testing.T) {
|
||||
assert.NoError(t, err, "failed to close flow")
|
||||
})
|
||||
|
||||
receivedAcks := make(map[string]bool)
|
||||
var ackCount atomic.Int32
|
||||
receiveDone := make(chan struct{})
|
||||
|
||||
go func() {
|
||||
err := client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
|
||||
if !msg.IsInitiator && len(msg.EventId) > 0 {
|
||||
id := string(msg.EventId)
|
||||
receivedAcks[id] = true
|
||||
|
||||
if len(receivedAcks) >= 3 {
|
||||
if ackCount.Add(1) >= 3 {
|
||||
close(receiveDone)
|
||||
}
|
||||
}
|
||||
@@ -130,7 +214,11 @@ func TestReceive(t *testing.T) {
|
||||
}
|
||||
}()
|
||||
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
select {
|
||||
case <-server.handlerStarted:
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatal("timeout waiting for stream to be established")
|
||||
}
|
||||
|
||||
for i := 0; i < 3; i++ {
|
||||
eventID := uuid.New().String()
|
||||
@@ -153,7 +241,7 @@ func TestReceive(t *testing.T) {
|
||||
t.Fatal("timeout waiting for acks to be processed")
|
||||
}
|
||||
|
||||
assert.Equal(t, 3, len(receivedAcks))
|
||||
assert.Equal(t, int32(3), ackCount.Load())
|
||||
}
|
||||
|
||||
func TestReceive_ContextCancellation(t *testing.T) {
|
||||
@@ -254,3 +342,195 @@ func TestSend(t *testing.T) {
|
||||
t.Fatal("timeout waiting for ack to be received by flow")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewClient_PermanentClose(t *testing.T) {
|
||||
server := newTestServer(t)
|
||||
|
||||
client, err := flow.NewClient("http://"+server.addr, "test-payload", "test-signature", 1*time.Second)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = client.Close()
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
t.Cleanup(cancel)
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() {
|
||||
done <- client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
|
||||
return nil
|
||||
})
|
||||
}()
|
||||
|
||||
select {
|
||||
case err := <-done:
|
||||
require.ErrorIs(t, err, flow.ErrClientClosed)
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("Receive did not return after Close — stuck in retry loop")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewClient_CloseVerify(t *testing.T) {
|
||||
server := newTestServer(t)
|
||||
|
||||
client, err := flow.NewClient("http://"+server.addr, "test-payload", "test-signature", 1*time.Second)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
t.Cleanup(cancel)
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() {
|
||||
done <- client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
|
||||
return nil
|
||||
})
|
||||
}()
|
||||
|
||||
closeDone := make(chan struct{}, 1)
|
||||
go func() {
|
||||
_ = client.Close()
|
||||
closeDone <- struct{}{}
|
||||
}()
|
||||
|
||||
select {
|
||||
case err := <-done:
|
||||
require.Error(t, err)
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("Receive did not return after Close — stuck in retry loop")
|
||||
}
|
||||
|
||||
select {
|
||||
case <-closeDone:
|
||||
return
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("Close did not return — blocked in retry loop")
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestClose_WhileReceiving(t *testing.T) {
|
||||
server := newTestServer(t)
|
||||
client, err := flow.NewClient("http://"+server.addr, "test-payload", "test-signature", 1*time.Second)
|
||||
require.NoError(t, err)
|
||||
|
||||
ctx := context.Background() // no timeout — intentional
|
||||
receiveDone := make(chan struct{})
|
||||
go func() {
|
||||
_ = client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
|
||||
return nil
|
||||
})
|
||||
close(receiveDone)
|
||||
}()
|
||||
|
||||
// Wait for the server-side handler to confirm the stream is established.
|
||||
select {
|
||||
case <-server.handlerStarted:
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatal("timeout waiting for stream to be established")
|
||||
}
|
||||
|
||||
closeDone := make(chan struct{})
|
||||
go func() {
|
||||
_ = client.Close()
|
||||
close(closeDone)
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-closeDone:
|
||||
// Close returned — good
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("Close blocked forever — Receive stuck in retry loop")
|
||||
}
|
||||
|
||||
select {
|
||||
case <-receiveDone:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("Receive did not exit after Close")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReceive_ProtocolErrorStreamReconnect(t *testing.T) {
|
||||
server := newTestServer(t)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
t.Cleanup(cancel)
|
||||
|
||||
client, err := flow.NewClient("http://"+server.addr, "test-payload", "test-signature", 1*time.Second)
|
||||
require.NoError(t, err)
|
||||
t.Cleanup(func() {
|
||||
err := client.Close()
|
||||
assert.NoError(t, err, "failed to close flow")
|
||||
})
|
||||
|
||||
// Track acks received before and after server-side stream close
|
||||
var ackCount atomic.Int32
|
||||
receivedFirst := make(chan struct{})
|
||||
receivedAfterReconnect := make(chan struct{})
|
||||
|
||||
go func() {
|
||||
err := client.Receive(ctx, 1*time.Second, func(msg *proto.FlowEventAck) error {
|
||||
if msg.IsInitiator || len(msg.EventId) == 0 {
|
||||
return nil
|
||||
}
|
||||
n := ackCount.Add(1)
|
||||
if n == 1 {
|
||||
close(receivedFirst)
|
||||
}
|
||||
if n == 2 {
|
||||
close(receivedAfterReconnect)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil && !errors.Is(err, context.Canceled) {
|
||||
t.Logf("receive error: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait for stream to be established, then send first ack
|
||||
select {
|
||||
case <-server.handlerStarted:
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatal("timeout waiting for stream to be established")
|
||||
}
|
||||
server.acks <- &proto.FlowEventAck{EventId: []byte("before-close")}
|
||||
|
||||
select {
|
||||
case <-receivedFirst:
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatal("timeout waiting for first ack")
|
||||
}
|
||||
|
||||
// Snapshot connection count before injecting the fault.
|
||||
connsBefore := server.listener.connCount()
|
||||
|
||||
// Send a raw HTTP/2 RST_STREAM frame with PROTOCOL_ERROR on the TCP connection.
|
||||
// gRPC multiplexes streams on stream IDs 1, 3, 5, ... (odd, client-initiated).
|
||||
// Stream ID 1 is the client's first stream (our Events bidi stream).
|
||||
// This produces the exact error the client sees in production:
|
||||
// "stream terminated by RST_STREAM with error code: PROTOCOL_ERROR"
|
||||
server.listener.sendRSTStream(1)
|
||||
|
||||
// Wait for the old Events() handler to fully exit so it can no longer
|
||||
// drain s.acks and drop our injected ack on a broken stream.
|
||||
select {
|
||||
case <-server.handlerDone:
|
||||
case <-time.After(5 * time.Second):
|
||||
t.Fatal("old Events() handler did not exit after RST_STREAM")
|
||||
}
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
return server.listener.connCount() > connsBefore
|
||||
}, 5*time.Second, 50*time.Millisecond, "client did not open a new TCP connection after RST_STREAM")
|
||||
|
||||
server.acks <- &proto.FlowEventAck{EventId: []byte("after-close")}
|
||||
|
||||
select {
|
||||
case <-receivedAfterReconnect:
|
||||
// Client successfully reconnected and received ack after server-side stream close
|
||||
case <-time.After(5 * time.Second):
|
||||
t.Fatal("timeout waiting for ack after server-side stream close — client did not reconnect")
|
||||
}
|
||||
|
||||
assert.GreaterOrEqual(t, int(ackCount.Load()), 2, "should have received acks before and after stream close")
|
||||
assert.GreaterOrEqual(t, server.listener.connCount(), 2, "client should have created at least 2 TCP connections (original + reconnect)")
|
||||
}
|
||||
|
||||
@@ -302,7 +302,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(management_account_peer_meta_update_counter_ratio_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])",
|
||||
"expr": "rate(management_account_peer_meta_update_counter_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])",
|
||||
"instant": false,
|
||||
"legendFormat": "{{cluster}}/{{environment}}/{{job}}",
|
||||
"range": true,
|
||||
@@ -410,7 +410,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.5,sum(increase(management_account_get_peer_network_map_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.5,sum(increase(management_account_get_peer_network_map_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
@@ -426,7 +426,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.9,sum(increase(management_account_get_peer_network_map_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.9,sum(increase(management_account_get_peer_network_map_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
@@ -443,7 +443,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99,sum(increase(management_account_get_peer_network_map_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.99,sum(increase(management_account_get_peer_network_map_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
@@ -545,7 +545,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.5,sum(increase(management_account_update_account_peers_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.5,sum(increase(management_account_update_account_peers_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
@@ -561,7 +561,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.9,sum(increase(management_account_update_account_peers_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.9,sum(increase(management_account_update_account_peers_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
@@ -578,7 +578,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99,sum(increase(management_account_update_account_peers_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.99,sum(increase(management_account_update_account_peers_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
@@ -694,7 +694,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.5,sum(increase(management_grpc_updatechannel_queue_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.5,sum(increase(management_grpc_updatechannel_queue_length_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
@@ -710,7 +710,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.9,sum(increase(management_grpc_updatechannel_queue_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.9,sum(increase(management_grpc_updatechannel_queue_length_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
@@ -727,7 +727,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99,sum(increase(management_grpc_updatechannel_queue_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"expr": "histogram_quantile(0.99,sum(increase(management_grpc_updatechannel_queue_length_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le,cluster,environment,job))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
@@ -841,7 +841,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_store_persistence_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_store_persistence_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"instant": false,
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
@@ -853,7 +853,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_store_persistence_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_store_persistence_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p90",
|
||||
@@ -866,7 +866,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_store_persistence_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_store_persistence_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p99",
|
||||
@@ -963,7 +963,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_store_transaction_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_store_transaction_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"instant": false,
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
@@ -975,7 +975,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_store_transaction_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_store_transaction_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p90",
|
||||
@@ -988,7 +988,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_store_transaction_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_store_transaction_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p99",
|
||||
@@ -1085,7 +1085,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_store_global_lock_acquisition_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_store_global_lock_acquisition_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"instant": false,
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
@@ -1097,7 +1097,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_store_global_lock_acquisition_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_store_global_lock_acquisition_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p90",
|
||||
@@ -1110,7 +1110,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_store_global_lock_acquisition_duration_ms_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_store_global_lock_acquisition_duration_ms_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p99",
|
||||
@@ -1221,7 +1221,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(management_idp_authenticate_request_counter_ratio_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])",
|
||||
"expr": "rate(management_idp_authenticate_request_counter_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])",
|
||||
"instant": false,
|
||||
"legendFormat": "{{cluster}}/{{environment}}/{{job}}",
|
||||
"range": true,
|
||||
@@ -1317,7 +1317,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(management_idp_get_account_counter_ratio_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])",
|
||||
"expr": "rate(management_idp_get_account_counter_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])",
|
||||
"instant": false,
|
||||
"legendFormat": "{{cluster}}/{{environment}}/{{job}}",
|
||||
"range": true,
|
||||
@@ -1413,7 +1413,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(management_idp_update_user_meta_counter_ratio_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])",
|
||||
"expr": "rate(management_idp_update_user_meta_counter_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])",
|
||||
"instant": false,
|
||||
"legendFormat": "{{cluster}}/{{environment}}/{{job}}",
|
||||
"range": true,
|
||||
@@ -1523,7 +1523,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(management_http_request_counter_ratio_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",method=~\"GET|OPTIONS\"}[$__rate_interval])) by (job,method)",
|
||||
"expr": "sum(rate(management_http_request_counter_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",method=~\"GET|OPTIONS\"}[$__rate_interval])) by (job,method)",
|
||||
"instant": false,
|
||||
"legendFormat": "{{method}}",
|
||||
"range": true,
|
||||
@@ -1619,7 +1619,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(management_http_request_counter_ratio_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",method=~\"POST|PUT|DELETE\"}[$__rate_interval])) by (job,method)",
|
||||
"expr": "sum(rate(management_http_request_counter_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",method=~\"POST|PUT|DELETE\"}[$__rate_interval])) by (job,method)",
|
||||
"instant": false,
|
||||
"legendFormat": "{{method}}",
|
||||
"range": true,
|
||||
@@ -1715,7 +1715,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_http_request_duration_ms_total_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"read\"}[5m])) by (le))",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_http_request_duration_ms_total_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"read\"}[5m])) by (le))",
|
||||
"instant": false,
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
@@ -1727,7 +1727,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_http_request_duration_ms_total_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"read\"}[5m])) by (le))",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_http_request_duration_ms_total_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"read\"}[5m])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p90",
|
||||
@@ -1740,7 +1740,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_http_request_duration_ms_total_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"read\"}[5m])) by (le))",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_http_request_duration_ms_total_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"read\"}[5m])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p99",
|
||||
@@ -1837,7 +1837,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_http_request_duration_ms_total_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"write\"}[5m])) by (le))",
|
||||
"expr": "histogram_quantile(0.50, sum(rate(management_http_request_duration_ms_total_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"write\"}[5m])) by (le))",
|
||||
"instant": false,
|
||||
"legendFormat": "p50",
|
||||
"range": true,
|
||||
@@ -1849,7 +1849,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_http_request_duration_ms_total_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"write\"}[5m])) by (le))",
|
||||
"expr": "histogram_quantile(0.90, sum(rate(management_http_request_duration_ms_total_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"write\"}[5m])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p90",
|
||||
@@ -1862,7 +1862,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_http_request_duration_ms_total_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"write\"}[5m])) by (le))",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(management_http_request_duration_ms_total_milliseconds_bucket{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\",type=~\"write\"}[5m])) by (le))",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "p99",
|
||||
@@ -1963,7 +1963,7 @@
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(management_http_request_counter_ratio_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (job,exported_endpoint,method)",
|
||||
"expr": "sum(rate(management_http_request_counter_total{cluster=~\"$cluster\",environment=~\"$environment\",job=~\"$job\",host=~\"$host\"}[$__rate_interval])) by (job,exported_endpoint,method)",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "{{method}}-{{exported_endpoint}}",
|
||||
@@ -3222,7 +3222,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "sum by(le) (increase(management_grpc_updatechannel_queue_bucket{application=\"management\", environment=\"$environment\", host=~\"$host\"}[$__rate_interval]))",
|
||||
"expr": "sum by(le) (increase(management_grpc_updatechannel_queue_length_bucket{application=\"management\", environment=\"$environment\", host=~\"$host\"}[$__rate_interval]))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
@@ -3323,7 +3323,7 @@
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "sum by(le) (increase(management_account_update_account_peers_duration_ms_bucket{application=\"management\", environment=\"$environment\", host=~\"$host\"}[$__rate_interval]))",
|
||||
"expr": "sum by(le) (increase(management_account_update_account_peers_duration_ms_milliseconds_bucket{application=\"management\", environment=\"$environment\", host=~\"$host\"}[$__rate_interval]))",
|
||||
"format": "heatmap",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
|
||||
@@ -106,13 +106,23 @@ func (m *managerImpl) CleanupOldAccessLogs(ctx context.Context, retentionDays in
|
||||
|
||||
// StartPeriodicCleanup starts a background goroutine that periodically cleans up old access logs
|
||||
func (m *managerImpl) StartPeriodicCleanup(ctx context.Context, retentionDays, cleanupIntervalHours int) {
|
||||
if retentionDays <= 0 {
|
||||
log.WithContext(ctx).Debug("periodic access log cleanup disabled: retention days is 0 or negative")
|
||||
if retentionDays < 0 {
|
||||
log.WithContext(ctx).Debug("periodic access log cleanup disabled: retention days is negative")
|
||||
return
|
||||
}
|
||||
|
||||
if retentionDays == 0 {
|
||||
retentionDays = 7
|
||||
log.WithContext(ctx).Debugf("no retention days specified for access log cleanup, defaulting to %d days", retentionDays)
|
||||
} else {
|
||||
log.WithContext(ctx).Debugf("access log retention period set to %d days", retentionDays)
|
||||
}
|
||||
|
||||
if cleanupIntervalHours <= 0 {
|
||||
cleanupIntervalHours = 24
|
||||
log.WithContext(ctx).Debugf("no cleanup interval specified for access log cleanup, defaulting to %d hours", cleanupIntervalHours)
|
||||
} else {
|
||||
log.WithContext(ctx).Debugf("access log cleanup interval set to %d hours", cleanupIntervalHours)
|
||||
}
|
||||
|
||||
cleanupCtx, cancel := context.WithCancel(ctx)
|
||||
|
||||
@@ -121,7 +121,7 @@ func TestCleanupWithExactBoundary(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestStartPeriodicCleanup(t *testing.T) {
|
||||
t.Run("periodic cleanup disabled with zero retention", func(t *testing.T) {
|
||||
t.Run("periodic cleanup disabled with negative retention", func(t *testing.T) {
|
||||
ctrl := gomock.NewController(t)
|
||||
defer ctrl.Finish()
|
||||
|
||||
@@ -135,7 +135,7 @@ func TestStartPeriodicCleanup(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
manager.StartPeriodicCleanup(ctx, 0, 1)
|
||||
manager.StartPeriodicCleanup(ctx, -1, 1)
|
||||
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
|
||||
|
||||
@@ -30,3 +30,8 @@ func (d *Domain) EventMeta() map[string]any {
|
||||
"validated": d.Validated,
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Domain) Copy() *Domain {
|
||||
dCopy := *d
|
||||
return &dCopy
|
||||
}
|
||||
|
||||
@@ -203,7 +203,7 @@ type ReverseProxy struct {
|
||||
|
||||
// AccessLogRetentionDays specifies the number of days to retain access logs.
|
||||
// Logs older than this duration will be automatically deleted during cleanup.
|
||||
// A value of 0 or negative means logs are kept indefinitely (no cleanup).
|
||||
// A value of 0 will default to 7 days. Negative means logs are kept indefinitely (no cleanup).
|
||||
AccessLogRetentionDays int
|
||||
|
||||
// AccessLogCleanupIntervalHours specifies how often (in hours) to run the cleanup routine.
|
||||
|
||||
@@ -742,11 +742,6 @@ func (am *DefaultAccountManager) DeleteAccount(ctx context.Context, accountID, u
|
||||
return status.Errorf(status.Internal, "failed to build user infos for account %s: %v", accountID, err)
|
||||
}
|
||||
|
||||
err = am.serviceManager.DeleteAllServices(ctx, accountID, userID)
|
||||
if err != nil {
|
||||
return status.Errorf(status.Internal, "failed to delete service %s: %v", accountID, err)
|
||||
}
|
||||
|
||||
for _, otherUser := range account.Users {
|
||||
if otherUser.Id == userID {
|
||||
continue
|
||||
|
||||
@@ -15,7 +15,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/golang/mock/gomock"
|
||||
"github.com/netbirdio/netbird/shared/management/status"
|
||||
"github.com/prometheus/client_golang/prometheus/push"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/stretchr/testify/assert"
|
||||
@@ -23,6 +22,9 @@ import (
|
||||
"go.opentelemetry.io/otel/metric/noop"
|
||||
"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
|
||||
|
||||
"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/domain"
|
||||
"github.com/netbirdio/netbird/shared/management/status"
|
||||
|
||||
nbdns "github.com/netbirdio/netbird/dns"
|
||||
"github.com/netbirdio/netbird/management/internals/controllers/network_map"
|
||||
"github.com/netbirdio/netbird/management/internals/controllers/network_map/controller"
|
||||
@@ -1815,6 +1817,13 @@ func TestAccount_Copy(t *testing.T) {
|
||||
Targets: []*service.Target{},
|
||||
},
|
||||
},
|
||||
Domains: []*domain.Domain{
|
||||
{
|
||||
ID: "domain1",
|
||||
Domain: "test.com",
|
||||
AccountID: "account1",
|
||||
},
|
||||
},
|
||||
NetworkMapCache: &types.NetworkMapBuilder{},
|
||||
}
|
||||
account.InitOnce()
|
||||
|
||||
@@ -489,6 +489,102 @@ func MigrateJsonToTable[T any](ctx context.Context, db *gorm.DB, columnName stri
|
||||
return nil
|
||||
}
|
||||
|
||||
// hasForeignKey checks whether a foreign key constraint exists on the given table and column.
|
||||
func hasForeignKey(db *gorm.DB, table, column string) bool {
|
||||
var count int64
|
||||
|
||||
switch db.Name() {
|
||||
case "postgres":
|
||||
db.Raw(`
|
||||
SELECT COUNT(*) FROM information_schema.key_column_usage kcu
|
||||
JOIN information_schema.table_constraints tc
|
||||
ON tc.constraint_name = kcu.constraint_name
|
||||
AND tc.table_schema = kcu.table_schema
|
||||
WHERE tc.constraint_type = 'FOREIGN KEY'
|
||||
AND kcu.table_name = ?
|
||||
AND kcu.column_name = ?
|
||||
`, table, column).Scan(&count)
|
||||
case "mysql":
|
||||
db.Raw(`
|
||||
SELECT COUNT(*) FROM information_schema.key_column_usage
|
||||
WHERE table_schema = DATABASE()
|
||||
AND table_name = ?
|
||||
AND column_name = ?
|
||||
AND referenced_table_name IS NOT NULL
|
||||
`, table, column).Scan(&count)
|
||||
default: // sqlite
|
||||
type fkInfo struct {
|
||||
From string
|
||||
}
|
||||
var fks []fkInfo
|
||||
db.Raw(fmt.Sprintf("PRAGMA foreign_key_list(%s)", table)).Scan(&fks)
|
||||
for _, fk := range fks {
|
||||
if fk.From == column {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
return count > 0
|
||||
}
|
||||
|
||||
// CleanupOrphanedResources deletes rows from the table of model T where the foreign
|
||||
// key column (fkColumn) references a row in the table of model R that no longer exists.
|
||||
func CleanupOrphanedResources[T any, R any](ctx context.Context, db *gorm.DB, fkColumn string) error {
|
||||
var model T
|
||||
var refModel R
|
||||
|
||||
if !db.Migrator().HasTable(&model) {
|
||||
log.WithContext(ctx).Debugf("table for %T does not exist, no cleanup needed", model)
|
||||
return nil
|
||||
}
|
||||
|
||||
if !db.Migrator().HasTable(&refModel) {
|
||||
log.WithContext(ctx).Debugf("referenced table for %T does not exist, no cleanup needed", refModel)
|
||||
return nil
|
||||
}
|
||||
|
||||
stmtT := &gorm.Statement{DB: db}
|
||||
if err := stmtT.Parse(&model); err != nil {
|
||||
return fmt.Errorf("parse model %T: %w", model, err)
|
||||
}
|
||||
childTable := stmtT.Schema.Table
|
||||
|
||||
stmtR := &gorm.Statement{DB: db}
|
||||
if err := stmtR.Parse(&refModel); err != nil {
|
||||
return fmt.Errorf("parse reference model %T: %w", refModel, err)
|
||||
}
|
||||
parentTable := stmtR.Schema.Table
|
||||
|
||||
if !db.Migrator().HasColumn(&model, fkColumn) {
|
||||
log.WithContext(ctx).Debugf("column %s does not exist in table %s, no cleanup needed", fkColumn, childTable)
|
||||
return nil
|
||||
}
|
||||
|
||||
// If a foreign key constraint already exists on the column, the DB itself
|
||||
// enforces referential integrity and orphaned rows cannot exist.
|
||||
if hasForeignKey(db, childTable, fkColumn) {
|
||||
log.WithContext(ctx).Debugf("foreign key constraint for %s already exists on %s, no cleanup needed", fkColumn, childTable)
|
||||
return nil
|
||||
}
|
||||
|
||||
result := db.Exec(
|
||||
fmt.Sprintf(
|
||||
"DELETE FROM %s WHERE %s NOT IN (SELECT id FROM %s)",
|
||||
childTable, fkColumn, parentTable,
|
||||
),
|
||||
)
|
||||
if result.Error != nil {
|
||||
return fmt.Errorf("cleanup orphaned rows in %s: %w", childTable, result.Error)
|
||||
}
|
||||
|
||||
log.WithContext(ctx).Infof("Cleaned up %d orphaned rows from %s where %s had no matching row in %s",
|
||||
result.RowsAffected, childTable, fkColumn, parentTable)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func RemoveDuplicatePeerKeys(ctx context.Context, db *gorm.DB) error {
|
||||
if !db.Migrator().HasTable("peers") {
|
||||
log.WithContext(ctx).Debug("peers table does not exist, skipping duplicate key cleanup")
|
||||
|
||||
@@ -441,3 +441,197 @@ func TestRemoveDuplicatePeerKeys_NoTable(t *testing.T) {
|
||||
err := migration.RemoveDuplicatePeerKeys(context.Background(), db)
|
||||
require.NoError(t, err, "Should not fail when table does not exist")
|
||||
}
|
||||
|
||||
type testParent struct {
|
||||
ID string `gorm:"primaryKey"`
|
||||
}
|
||||
|
||||
func (testParent) TableName() string {
|
||||
return "test_parents"
|
||||
}
|
||||
|
||||
type testChild struct {
|
||||
ID string `gorm:"primaryKey"`
|
||||
ParentID string
|
||||
}
|
||||
|
||||
func (testChild) TableName() string {
|
||||
return "test_children"
|
||||
}
|
||||
|
||||
type testChildWithFK struct {
|
||||
ID string `gorm:"primaryKey"`
|
||||
ParentID string `gorm:"index"`
|
||||
Parent *testParent `gorm:"foreignKey:ParentID"`
|
||||
}
|
||||
|
||||
func (testChildWithFK) TableName() string {
|
||||
return "test_children"
|
||||
}
|
||||
|
||||
func setupOrphanTestDB(t *testing.T, models ...any) *gorm.DB {
|
||||
t.Helper()
|
||||
db := setupDatabase(t)
|
||||
for _, m := range models {
|
||||
_ = db.Migrator().DropTable(m)
|
||||
}
|
||||
err := db.AutoMigrate(models...)
|
||||
require.NoError(t, err, "Failed to auto-migrate tables")
|
||||
return db
|
||||
}
|
||||
|
||||
func TestCleanupOrphanedResources_NoChildTable(t *testing.T) {
|
||||
db := setupDatabase(t)
|
||||
_ = db.Migrator().DropTable(&testChild{})
|
||||
_ = db.Migrator().DropTable(&testParent{})
|
||||
|
||||
err := migration.CleanupOrphanedResources[testChild, testParent](context.Background(), db, "parent_id")
|
||||
require.NoError(t, err, "Should not fail when child table does not exist")
|
||||
}
|
||||
|
||||
func TestCleanupOrphanedResources_NoParentTable(t *testing.T) {
|
||||
db := setupDatabase(t)
|
||||
_ = db.Migrator().DropTable(&testParent{})
|
||||
_ = db.Migrator().DropTable(&testChild{})
|
||||
|
||||
err := db.AutoMigrate(&testChild{})
|
||||
require.NoError(t, err)
|
||||
|
||||
err = migration.CleanupOrphanedResources[testChild, testParent](context.Background(), db, "parent_id")
|
||||
require.NoError(t, err, "Should not fail when parent table does not exist")
|
||||
}
|
||||
|
||||
func TestCleanupOrphanedResources_EmptyTables(t *testing.T) {
|
||||
db := setupOrphanTestDB(t, &testParent{}, &testChild{})
|
||||
|
||||
err := migration.CleanupOrphanedResources[testChild, testParent](context.Background(), db, "parent_id")
|
||||
require.NoError(t, err, "Should not fail on empty tables")
|
||||
|
||||
var count int64
|
||||
db.Model(&testChild{}).Count(&count)
|
||||
assert.Equal(t, int64(0), count)
|
||||
}
|
||||
|
||||
func TestCleanupOrphanedResources_NoOrphans(t *testing.T) {
|
||||
db := setupOrphanTestDB(t, &testParent{}, &testChild{})
|
||||
|
||||
require.NoError(t, db.Create(&testParent{ID: "p1"}).Error)
|
||||
require.NoError(t, db.Create(&testParent{ID: "p2"}).Error)
|
||||
require.NoError(t, db.Create(&testChild{ID: "c1", ParentID: "p1"}).Error)
|
||||
require.NoError(t, db.Create(&testChild{ID: "c2", ParentID: "p2"}).Error)
|
||||
|
||||
err := migration.CleanupOrphanedResources[testChild, testParent](context.Background(), db, "parent_id")
|
||||
require.NoError(t, err)
|
||||
|
||||
var count int64
|
||||
db.Model(&testChild{}).Count(&count)
|
||||
assert.Equal(t, int64(2), count, "All children should remain when no orphans")
|
||||
}
|
||||
|
||||
func TestCleanupOrphanedResources_AllOrphans(t *testing.T) {
|
||||
db := setupOrphanTestDB(t, &testParent{}, &testChild{})
|
||||
|
||||
require.NoError(t, db.Exec("INSERT INTO test_children (id, parent_id) VALUES (?, ?)", "c1", "gone1").Error)
|
||||
require.NoError(t, db.Exec("INSERT INTO test_children (id, parent_id) VALUES (?, ?)", "c2", "gone2").Error)
|
||||
require.NoError(t, db.Exec("INSERT INTO test_children (id, parent_id) VALUES (?, ?)", "c3", "gone3").Error)
|
||||
|
||||
err := migration.CleanupOrphanedResources[testChild, testParent](context.Background(), db, "parent_id")
|
||||
require.NoError(t, err)
|
||||
|
||||
var count int64
|
||||
db.Model(&testChild{}).Count(&count)
|
||||
assert.Equal(t, int64(0), count, "All orphaned children should be deleted")
|
||||
}
|
||||
|
||||
func TestCleanupOrphanedResources_MixedValidAndOrphaned(t *testing.T) {
|
||||
db := setupOrphanTestDB(t, &testParent{}, &testChild{})
|
||||
|
||||
require.NoError(t, db.Create(&testParent{ID: "p1"}).Error)
|
||||
require.NoError(t, db.Create(&testParent{ID: "p2"}).Error)
|
||||
|
||||
require.NoError(t, db.Create(&testChild{ID: "c1", ParentID: "p1"}).Error)
|
||||
require.NoError(t, db.Create(&testChild{ID: "c2", ParentID: "p2"}).Error)
|
||||
require.NoError(t, db.Create(&testChild{ID: "c3", ParentID: "p1"}).Error)
|
||||
|
||||
require.NoError(t, db.Exec("INSERT INTO test_children (id, parent_id) VALUES (?, ?)", "c4", "gone1").Error)
|
||||
require.NoError(t, db.Exec("INSERT INTO test_children (id, parent_id) VALUES (?, ?)", "c5", "gone2").Error)
|
||||
|
||||
err := migration.CleanupOrphanedResources[testChild, testParent](context.Background(), db, "parent_id")
|
||||
require.NoError(t, err)
|
||||
|
||||
var remaining []testChild
|
||||
require.NoError(t, db.Order("id").Find(&remaining).Error)
|
||||
|
||||
assert.Len(t, remaining, 3, "Only valid children should remain")
|
||||
assert.Equal(t, "c1", remaining[0].ID)
|
||||
assert.Equal(t, "c2", remaining[1].ID)
|
||||
assert.Equal(t, "c3", remaining[2].ID)
|
||||
}
|
||||
|
||||
func TestCleanupOrphanedResources_Idempotent(t *testing.T) {
|
||||
db := setupOrphanTestDB(t, &testParent{}, &testChild{})
|
||||
|
||||
require.NoError(t, db.Create(&testParent{ID: "p1"}).Error)
|
||||
require.NoError(t, db.Create(&testChild{ID: "c1", ParentID: "p1"}).Error)
|
||||
require.NoError(t, db.Exec("INSERT INTO test_children (id, parent_id) VALUES (?, ?)", "c2", "gone").Error)
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
err := migration.CleanupOrphanedResources[testChild, testParent](ctx, db, "parent_id")
|
||||
require.NoError(t, err)
|
||||
|
||||
var count int64
|
||||
db.Model(&testChild{}).Count(&count)
|
||||
assert.Equal(t, int64(1), count)
|
||||
|
||||
err = migration.CleanupOrphanedResources[testChild, testParent](ctx, db, "parent_id")
|
||||
require.NoError(t, err)
|
||||
|
||||
db.Model(&testChild{}).Count(&count)
|
||||
assert.Equal(t, int64(1), count, "Count should remain the same after second run")
|
||||
}
|
||||
|
||||
func TestCleanupOrphanedResources_SkipsWhenForeignKeyExists(t *testing.T) {
|
||||
engine := os.Getenv("NETBIRD_STORE_ENGINE")
|
||||
if engine != "postgres" && engine != "mysql" {
|
||||
t.Skip("FK constraint early-exit test requires postgres or mysql")
|
||||
}
|
||||
|
||||
db := setupDatabase(t)
|
||||
_ = db.Migrator().DropTable(&testChildWithFK{})
|
||||
_ = db.Migrator().DropTable(&testParent{})
|
||||
|
||||
err := db.AutoMigrate(&testParent{}, &testChildWithFK{})
|
||||
require.NoError(t, err)
|
||||
|
||||
require.NoError(t, db.Create(&testParent{ID: "p1"}).Error)
|
||||
require.NoError(t, db.Create(&testParent{ID: "p2"}).Error)
|
||||
require.NoError(t, db.Create(&testChildWithFK{ID: "c1", ParentID: "p1"}).Error)
|
||||
require.NoError(t, db.Create(&testChildWithFK{ID: "c2", ParentID: "p2"}).Error)
|
||||
|
||||
switch engine {
|
||||
case "postgres":
|
||||
require.NoError(t, db.Exec("ALTER TABLE test_children DROP CONSTRAINT fk_test_children_parent").Error)
|
||||
require.NoError(t, db.Exec("DELETE FROM test_parents WHERE id = ?", "p2").Error)
|
||||
require.NoError(t, db.Exec(
|
||||
"ALTER TABLE test_children ADD CONSTRAINT fk_test_children_parent "+
|
||||
"FOREIGN KEY (parent_id) REFERENCES test_parents(id) NOT VALID",
|
||||
).Error)
|
||||
case "mysql":
|
||||
require.NoError(t, db.Exec("SET FOREIGN_KEY_CHECKS = 0").Error)
|
||||
require.NoError(t, db.Exec("ALTER TABLE test_children DROP FOREIGN KEY fk_test_children_parent").Error)
|
||||
require.NoError(t, db.Exec("DELETE FROM test_parents WHERE id = ?", "p2").Error)
|
||||
require.NoError(t, db.Exec(
|
||||
"ALTER TABLE test_children ADD CONSTRAINT fk_test_children_parent "+
|
||||
"FOREIGN KEY (parent_id) REFERENCES test_parents(id)",
|
||||
).Error)
|
||||
require.NoError(t, db.Exec("SET FOREIGN_KEY_CHECKS = 1").Error)
|
||||
}
|
||||
|
||||
err = migration.CleanupOrphanedResources[testChildWithFK, testParent](context.Background(), db, "parent_id")
|
||||
require.NoError(t, err)
|
||||
|
||||
var count int64
|
||||
db.Model(&testChildWithFK{}).Count(&count)
|
||||
assert.Equal(t, int64(2), count, "Both rows should survive — migration must skip when FK constraint exists")
|
||||
}
|
||||
|
||||
@@ -396,6 +396,11 @@ func (s *SqlStore) DeleteAccount(ctx context.Context, account *types.Account) er
|
||||
return result.Error
|
||||
}
|
||||
|
||||
result = tx.Select(clause.Associations).Delete(account.Services, "account_id = ?", account.Id)
|
||||
if result.Error != nil {
|
||||
return result.Error
|
||||
}
|
||||
|
||||
result = tx.Select(clause.Associations).Delete(account)
|
||||
if result.Error != nil {
|
||||
return result.Error
|
||||
@@ -2099,6 +2104,8 @@ func (s *SqlStore) getServices(ctx context.Context, accountID string) ([]*rpserv
|
||||
var createdAt, certIssuedAt sql.NullTime
|
||||
var status, proxyCluster, sessionPrivateKey, sessionPublicKey sql.NullString
|
||||
var mode, source, sourcePeer sql.NullString
|
||||
var terminated, portAutoAssigned sql.NullBool
|
||||
var listenPort sql.NullInt64
|
||||
err := row.Scan(
|
||||
&s.ID,
|
||||
&s.AccountID,
|
||||
@@ -2115,11 +2122,11 @@ func (s *SqlStore) getServices(ctx context.Context, accountID string) ([]*rpserv
|
||||
&sessionPrivateKey,
|
||||
&sessionPublicKey,
|
||||
&mode,
|
||||
&s.ListenPort,
|
||||
&s.PortAutoAssigned,
|
||||
&listenPort,
|
||||
&portAutoAssigned,
|
||||
&source,
|
||||
&sourcePeer,
|
||||
&s.Terminated,
|
||||
&terminated,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -2160,7 +2167,15 @@ func (s *SqlStore) getServices(ctx context.Context, accountID string) ([]*rpserv
|
||||
if sourcePeer.Valid {
|
||||
s.SourcePeer = sourcePeer.String
|
||||
}
|
||||
|
||||
if terminated.Valid {
|
||||
s.Terminated = terminated.Bool
|
||||
}
|
||||
if portAutoAssigned.Valid {
|
||||
s.PortAutoAssigned = portAutoAssigned.Bool
|
||||
}
|
||||
if listenPort.Valid {
|
||||
s.ListenPort = uint16(listenPort.Int64)
|
||||
}
|
||||
s.Targets = []*rpservice.Target{}
|
||||
return &s, nil
|
||||
})
|
||||
|
||||
@@ -22,6 +22,8 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
nbdns "github.com/netbirdio/netbird/dns"
|
||||
proxydomain "github.com/netbirdio/netbird/management/internals/modules/reverseproxy/domain"
|
||||
rpservice "github.com/netbirdio/netbird/management/internals/modules/reverseproxy/service"
|
||||
"github.com/netbirdio/netbird/management/internals/modules/zones"
|
||||
"github.com/netbirdio/netbird/management/internals/modules/zones/records"
|
||||
resourceTypes "github.com/netbirdio/netbird/management/server/networks/resources/types"
|
||||
@@ -350,6 +352,35 @@ func TestSqlite_DeleteAccount(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
account.Services = []*rpservice.Service{
|
||||
{
|
||||
ID: "service_id",
|
||||
AccountID: account.Id,
|
||||
Name: "test service",
|
||||
Domain: "svc.example.com",
|
||||
Enabled: true,
|
||||
Targets: []*rpservice.Target{
|
||||
{
|
||||
AccountID: account.Id,
|
||||
ServiceID: "service_id",
|
||||
Host: "localhost",
|
||||
Port: 8080,
|
||||
Protocol: "http",
|
||||
Enabled: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
account.Domains = []*proxydomain.Domain{
|
||||
{
|
||||
ID: "domain_id",
|
||||
Domain: "custom.example.com",
|
||||
AccountID: account.Id,
|
||||
Validated: true,
|
||||
},
|
||||
}
|
||||
|
||||
err = store.SaveAccount(context.Background(), account)
|
||||
require.NoError(t, err)
|
||||
|
||||
@@ -411,6 +442,20 @@ func TestSqlite_DeleteAccount(t *testing.T) {
|
||||
require.NoError(t, err, "expecting no error after removing DeleteAccount when searching for network resources")
|
||||
require.Len(t, resources, 0, "expecting no network resources to be found after DeleteAccount")
|
||||
}
|
||||
|
||||
domains, err := store.ListCustomDomains(context.Background(), account.Id)
|
||||
require.NoError(t, err, "expecting no error after DeleteAccount when searching for custom domains")
|
||||
require.Len(t, domains, 0, "expecting no custom domains to be found after DeleteAccount")
|
||||
|
||||
var services []*rpservice.Service
|
||||
err = store.(*SqlStore).db.Model(&rpservice.Service{}).Find(&services, "account_id = ?", account.Id).Error
|
||||
require.NoError(t, err, "expecting no error after DeleteAccount when searching for services")
|
||||
require.Len(t, services, 0, "expecting no services to be found after DeleteAccount")
|
||||
|
||||
var targets []*rpservice.Target
|
||||
err = store.(*SqlStore).db.Model(&rpservice.Target{}).Find(&targets, "account_id = ?", account.Id).Error
|
||||
require.NoError(t, err, "expecting no error after DeleteAccount when searching for service targets")
|
||||
require.Len(t, targets, 0, "expecting no service targets to be found after DeleteAccount")
|
||||
}
|
||||
|
||||
func Test_GetAccount(t *testing.T) {
|
||||
|
||||
@@ -20,6 +20,7 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
nbdns "github.com/netbirdio/netbird/dns"
|
||||
"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/domain"
|
||||
"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/service"
|
||||
resourceTypes "github.com/netbirdio/netbird/management/server/networks/resources/types"
|
||||
routerTypes "github.com/netbirdio/netbird/management/server/networks/routers/types"
|
||||
@@ -265,6 +266,7 @@ func setupBenchmarkDB(b testing.TB) (*SqlStore, func(), string) {
|
||||
&nbdns.NameServerGroup{}, &posture.Checks{}, &networkTypes.Network{},
|
||||
&routerTypes.NetworkRouter{}, &resourceTypes.NetworkResource{},
|
||||
&types.AccountOnboarding{}, &service.Service{}, &service.Target{},
|
||||
&domain.Domain{},
|
||||
}
|
||||
|
||||
for i := len(models) - 1; i >= 0; i-- {
|
||||
|
||||
@@ -448,6 +448,12 @@ func getMigrationsPreAuto(ctx context.Context) []migrationFunc {
|
||||
func(db *gorm.DB) error {
|
||||
return migration.RemoveDuplicatePeerKeys(ctx, db)
|
||||
},
|
||||
func(db *gorm.DB) error {
|
||||
return migration.CleanupOrphanedResources[rpservice.Service, types.Account](ctx, db, "account_id")
|
||||
},
|
||||
func(db *gorm.DB) error {
|
||||
return migration.CleanupOrphanedResources[domain.Domain, types.Account](ctx, db, "account_id")
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
|
||||
"github.com/netbirdio/netbird/client/ssh/auth"
|
||||
nbdns "github.com/netbirdio/netbird/dns"
|
||||
proxydomain "github.com/netbirdio/netbird/management/internals/modules/reverseproxy/domain"
|
||||
"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/service"
|
||||
"github.com/netbirdio/netbird/management/internals/modules/zones"
|
||||
"github.com/netbirdio/netbird/management/internals/modules/zones/records"
|
||||
@@ -101,6 +102,7 @@ type Account struct {
|
||||
DNSSettings DNSSettings `gorm:"embedded;embeddedPrefix:dns_settings_"`
|
||||
PostureChecks []*posture.Checks `gorm:"foreignKey:AccountID;references:id"`
|
||||
Services []*service.Service `gorm:"foreignKey:AccountID;references:id"`
|
||||
Domains []*proxydomain.Domain `gorm:"foreignKey:AccountID;references:id"`
|
||||
// Settings is a dictionary of Account settings
|
||||
Settings *Settings `gorm:"embedded;embeddedPrefix:settings_"`
|
||||
Networks []*networkTypes.Network `gorm:"foreignKey:AccountID;references:id"`
|
||||
@@ -911,6 +913,11 @@ func (a *Account) Copy() *Account {
|
||||
services = append(services, svc.Copy())
|
||||
}
|
||||
|
||||
domains := []*proxydomain.Domain{}
|
||||
for _, domain := range a.Domains {
|
||||
domains = append(domains, domain.Copy())
|
||||
}
|
||||
|
||||
return &Account{
|
||||
Id: a.Id,
|
||||
CreatedBy: a.CreatedBy,
|
||||
@@ -936,6 +943,7 @@ func (a *Account) Copy() *Account {
|
||||
Onboarding: a.Onboarding,
|
||||
NetworkMapCache: a.NetworkMapCache,
|
||||
nmapInitOnce: a.nmapInitOnce,
|
||||
Domains: domains,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
217
management/server/types/networkmap_benchmark_test.go
Normal file
217
management/server/types/networkmap_benchmark_test.go
Normal file
@@ -0,0 +1,217 @@
|
||||
package types_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
nbdns "github.com/netbirdio/netbird/dns"
|
||||
"github.com/netbirdio/netbird/management/server/types"
|
||||
)
|
||||
|
||||
type benchmarkScale struct {
|
||||
name string
|
||||
peers int
|
||||
groups int
|
||||
}
|
||||
|
||||
var defaultScales = []benchmarkScale{
|
||||
{"100peers_5groups", 100, 5},
|
||||
{"500peers_20groups", 500, 20},
|
||||
{"1000peers_50groups", 1000, 50},
|
||||
{"5000peers_100groups", 5000, 100},
|
||||
{"10000peers_200groups", 10000, 200},
|
||||
{"20000peers_200groups", 20000, 200},
|
||||
{"30000peers_300groups", 30000, 300},
|
||||
}
|
||||
|
||||
func skipCIBenchmark(b *testing.B) {
|
||||
if os.Getenv("CI") == "true" {
|
||||
b.Skip("Skipping benchmark in CI")
|
||||
}
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Single Peer Network Map Generation
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
// BenchmarkNetworkMapGeneration_Components benchmarks the components-based approach for a single peer.
|
||||
func BenchmarkNetworkMapGeneration_Components(b *testing.B) {
|
||||
skipCIBenchmark(b)
|
||||
for _, scale := range defaultScales {
|
||||
b.Run(scale.name, func(b *testing.B) {
|
||||
account, validatedPeers := scalableTestAccount(scale.peers, scale.groups)
|
||||
ctx := context.Background()
|
||||
resourcePolicies := account.GetResourcePoliciesMap()
|
||||
routers := account.GetResourceRoutersMap()
|
||||
groupIDToUserIDs := account.GetActiveGroupUsers()
|
||||
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = account.GetPeerNetworkMapFromComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, nil, groupIDToUserIDs)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// All Peers (UpdateAccountPeers hot path)
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
// BenchmarkNetworkMapGeneration_AllPeers benchmarks generating network maps for ALL peers.
|
||||
func BenchmarkNetworkMapGeneration_AllPeers(b *testing.B) {
|
||||
skipCIBenchmark(b)
|
||||
scales := []benchmarkScale{
|
||||
{"100peers_5groups", 100, 5},
|
||||
{"500peers_20groups", 500, 20},
|
||||
{"1000peers_50groups", 1000, 50},
|
||||
{"5000peers_100groups", 5000, 100},
|
||||
}
|
||||
|
||||
for _, scale := range scales {
|
||||
account, validatedPeers := scalableTestAccount(scale.peers, scale.groups)
|
||||
ctx := context.Background()
|
||||
|
||||
peerIDs := make([]string, 0, len(account.Peers))
|
||||
for peerID := range account.Peers {
|
||||
peerIDs = append(peerIDs, peerID)
|
||||
}
|
||||
|
||||
b.Run("components/"+scale.name, func(b *testing.B) {
|
||||
resourcePolicies := account.GetResourcePoliciesMap()
|
||||
routers := account.GetResourceRoutersMap()
|
||||
groupIDToUserIDs := account.GetActiveGroupUsers()
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
for _, peerID := range peerIDs {
|
||||
_ = account.GetPeerNetworkMapFromComponents(ctx, peerID, nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, nil, groupIDToUserIDs)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Sub-operations
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
// BenchmarkNetworkMapGeneration_ComponentsCreation benchmarks components extraction.
|
||||
func BenchmarkNetworkMapGeneration_ComponentsCreation(b *testing.B) {
|
||||
skipCIBenchmark(b)
|
||||
for _, scale := range defaultScales {
|
||||
b.Run(scale.name, func(b *testing.B) {
|
||||
account, validatedPeers := scalableTestAccount(scale.peers, scale.groups)
|
||||
ctx := context.Background()
|
||||
resourcePolicies := account.GetResourcePoliciesMap()
|
||||
routers := account.GetResourceRoutersMap()
|
||||
groupIDToUserIDs := account.GetActiveGroupUsers()
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = account.GetPeerNetworkMapComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, groupIDToUserIDs)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkNetworkMapGeneration_ComponentsCalculation benchmarks calculation from pre-built components.
|
||||
func BenchmarkNetworkMapGeneration_ComponentsCalculation(b *testing.B) {
|
||||
skipCIBenchmark(b)
|
||||
for _, scale := range defaultScales {
|
||||
b.Run(scale.name, func(b *testing.B) {
|
||||
account, validatedPeers := scalableTestAccount(scale.peers, scale.groups)
|
||||
ctx := context.Background()
|
||||
resourcePolicies := account.GetResourcePoliciesMap()
|
||||
routers := account.GetResourceRoutersMap()
|
||||
groupIDToUserIDs := account.GetActiveGroupUsers()
|
||||
components := account.GetPeerNetworkMapComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, groupIDToUserIDs)
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = types.CalculateNetworkMapFromComponents(ctx, components)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkNetworkMapGeneration_PrecomputeMaps benchmarks precomputed map costs.
|
||||
func BenchmarkNetworkMapGeneration_PrecomputeMaps(b *testing.B) {
|
||||
skipCIBenchmark(b)
|
||||
for _, scale := range defaultScales {
|
||||
b.Run("ResourcePoliciesMap/"+scale.name, func(b *testing.B) {
|
||||
account, _ := scalableTestAccount(scale.peers, scale.groups)
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = account.GetResourcePoliciesMap()
|
||||
}
|
||||
})
|
||||
b.Run("ResourceRoutersMap/"+scale.name, func(b *testing.B) {
|
||||
account, _ := scalableTestAccount(scale.peers, scale.groups)
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = account.GetResourceRoutersMap()
|
||||
}
|
||||
})
|
||||
b.Run("ActiveGroupUsers/"+scale.name, func(b *testing.B) {
|
||||
account, _ := scalableTestAccount(scale.peers, scale.groups)
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = account.GetActiveGroupUsers()
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
// Scaling Analysis
|
||||
// ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
// BenchmarkNetworkMapGeneration_GroupScaling tests group count impact on performance.
|
||||
func BenchmarkNetworkMapGeneration_GroupScaling(b *testing.B) {
|
||||
skipCIBenchmark(b)
|
||||
groupCounts := []int{1, 5, 20, 50, 100, 200, 500}
|
||||
for _, numGroups := range groupCounts {
|
||||
b.Run(fmt.Sprintf("components_%dgroups", numGroups), func(b *testing.B) {
|
||||
account, validatedPeers := scalableTestAccount(1000, numGroups)
|
||||
ctx := context.Background()
|
||||
resourcePolicies := account.GetResourcePoliciesMap()
|
||||
routers := account.GetResourceRoutersMap()
|
||||
groupIDToUserIDs := account.GetActiveGroupUsers()
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = account.GetPeerNetworkMapFromComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, nil, groupIDToUserIDs)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// BenchmarkNetworkMapGeneration_PeerScaling tests peer count impact on performance.
|
||||
func BenchmarkNetworkMapGeneration_PeerScaling(b *testing.B) {
|
||||
skipCIBenchmark(b)
|
||||
peerCounts := []int{50, 100, 500, 1000, 2000, 5000, 10000, 20000, 30000}
|
||||
for _, numPeers := range peerCounts {
|
||||
numGroups := numPeers / 20
|
||||
if numGroups < 1 {
|
||||
numGroups = 1
|
||||
}
|
||||
b.Run(fmt.Sprintf("components_%dpeers", numPeers), func(b *testing.B) {
|
||||
account, validatedPeers := scalableTestAccount(numPeers, numGroups)
|
||||
ctx := context.Background()
|
||||
resourcePolicies := account.GetResourcePoliciesMap()
|
||||
routers := account.GetResourceRoutersMap()
|
||||
groupIDToUserIDs := account.GetActiveGroupUsers()
|
||||
b.ReportAllocs()
|
||||
b.ResetTimer()
|
||||
for range b.N {
|
||||
_ = account.GetPeerNetworkMapFromComponents(ctx, "peer-0", nbdns.CustomZone{}, nil, validatedPeers, resourcePolicies, routers, nil, groupIDToUserIDs)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
1192
management/server/types/networkmap_components_correctness_test.go
Normal file
1192
management/server/types/networkmap_components_correctness_test.go
Normal file
File diff suppressed because it is too large
Load Diff
@@ -333,7 +333,7 @@ func (c *Client) connect(ctx context.Context) (*RelayAddr, error) {
|
||||
dialers := c.getDialers()
|
||||
|
||||
rd := dialer.NewRaceDial(c.log, dialer.DefaultConnectionTimeout, c.connectionURL, dialers...)
|
||||
conn, err := rd.Dial()
|
||||
conn, err := rd.Dial(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -40,10 +40,10 @@ func NewRaceDial(log *log.Entry, connectionTimeout time.Duration, serverURL stri
|
||||
}
|
||||
}
|
||||
|
||||
func (r *RaceDial) Dial() (net.Conn, error) {
|
||||
func (r *RaceDial) Dial(ctx context.Context) (net.Conn, error) {
|
||||
connChan := make(chan dialResult, len(r.dialerFns))
|
||||
winnerConn := make(chan net.Conn, 1)
|
||||
abortCtx, abort := context.WithCancel(context.Background())
|
||||
abortCtx, abort := context.WithCancel(ctx)
|
||||
defer abort()
|
||||
|
||||
for _, dfn := range r.dialerFns {
|
||||
|
||||
@@ -78,7 +78,7 @@ func TestRaceDialEmptyDialers(t *testing.T) {
|
||||
serverURL := "test.server.com"
|
||||
|
||||
rd := NewRaceDial(logger, DefaultConnectionTimeout, serverURL)
|
||||
conn, err := rd.Dial()
|
||||
conn, err := rd.Dial(context.Background())
|
||||
if err == nil {
|
||||
t.Errorf("Expected an error with empty dialers, got nil")
|
||||
}
|
||||
@@ -104,7 +104,7 @@ func TestRaceDialSingleSuccessfulDialer(t *testing.T) {
|
||||
}
|
||||
|
||||
rd := NewRaceDial(logger, DefaultConnectionTimeout, serverURL, mockDialer)
|
||||
conn, err := rd.Dial()
|
||||
conn, err := rd.Dial(context.Background())
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
@@ -137,7 +137,7 @@ func TestRaceDialMultipleDialersWithOneSuccess(t *testing.T) {
|
||||
}
|
||||
|
||||
rd := NewRaceDial(logger, DefaultConnectionTimeout, serverURL, mockDialer1, mockDialer2)
|
||||
conn, err := rd.Dial()
|
||||
conn, err := rd.Dial(context.Background())
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
@@ -160,7 +160,7 @@ func TestRaceDialTimeout(t *testing.T) {
|
||||
}
|
||||
|
||||
rd := NewRaceDial(logger, 3*time.Second, serverURL, mockDialer)
|
||||
conn, err := rd.Dial()
|
||||
conn, err := rd.Dial(context.Background())
|
||||
if err == nil {
|
||||
t.Errorf("Expected an error, got nil")
|
||||
}
|
||||
@@ -188,7 +188,7 @@ func TestRaceDialAllDialersFail(t *testing.T) {
|
||||
}
|
||||
|
||||
rd := NewRaceDial(logger, DefaultConnectionTimeout, serverURL, mockDialer1, mockDialer2)
|
||||
conn, err := rd.Dial()
|
||||
conn, err := rd.Dial(context.Background())
|
||||
if err == nil {
|
||||
t.Errorf("Expected an error, got nil")
|
||||
}
|
||||
@@ -230,7 +230,7 @@ func TestRaceDialFirstSuccessfulDialerWins(t *testing.T) {
|
||||
}
|
||||
|
||||
rd := NewRaceDial(logger, DefaultConnectionTimeout, serverURL, mockDialer1, mockDialer2)
|
||||
conn, err := rd.Dial()
|
||||
conn, err := rd.Dial(context.Background())
|
||||
if err != nil {
|
||||
t.Errorf("Expected no error, got %v", err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user