Migrate peer monitor into peer manager

2026-05-19 06:39:55 +00:00 · 2025-12-01 21:28:14 -05:00
parent 23e7b173c9
commit 29f0babf07
6 changed files with 154 additions and 126 deletions
--- a/peers/monitor/monitor.go
+++ b/peers/monitor/monitor.go
@@ -0,0 +1,725 @@
+package monitor
+
+import (
+	"context"
+	"fmt"
+	"net"
+	"net/netip"
+	"sync"
+	"time"
+
+	"github.com/fosrl/newt/bind"
+	"github.com/fosrl/newt/holepunch"
+	"github.com/fosrl/newt/logger"
+	"github.com/fosrl/newt/util"
+	middleDevice "github.com/fosrl/olm/device"
+	"github.com/fosrl/olm/websocket"
+	"gvisor.dev/gvisor/pkg/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+)
+
+// PeerMonitorCallback is the function type for connection status change callbacks.
+// PeerMonitor invokes it from handleConnectionStatusChange with the site ID of the
+// peer, whether the peer answered the probe, and the RTT reported for that probe.
+type PeerMonitorCallback func(siteID int, connected bool, rtt time.Duration)
+
+// HolepunchStatusCallback reports the outcome of a holepunch test for one site.
+// checkHolepunchEndpoints invokes it after every endpoint test (not only when the
+// status flips), passing the site ID, the tested endpoint, the result and its RTT.
+type HolepunchStatusCallback func(siteID int, endpoint string, connected bool, rtt time.Duration)
+
+// PeerMonitor handles monitoring the connection status to multiple WireGuard peers.
+//
+// It keeps one Client per site ID. Clients dial through a gvisor netstack whose
+// packets are exchanged with the MiddleDevice. When a SharedBind is supplied it
+// also runs a periodic holepunch test against every registered endpoint.
+type PeerMonitor struct {
+	// Per-peer probing state. monitors is accessed with mutex held; interval,
+	// timeout and maxAttempts are the settings pushed to every Client.
+	monitors    map[int]*Client
+	callback    PeerMonitorCallback
+	mutex       sync.Mutex
+	running     bool
+	interval    time.Duration
+	timeout     time.Duration
+	maxAttempts int
+	wsClient    *websocket.Client
+
+	// Netstack fields. activePorts (guarded by portsLock) records the local UDP
+	// ports opened by dial; handlePacket only accepts packets for those ports.
+	// nsCtx, nsCancel and nsWg control the packet sender goroutine.
+	middleDev   *middleDevice.MiddleDevice
+	localIP     string
+	stack       *stack.Stack
+	ep          *channel.Endpoint
+	activePorts map[uint16]bool
+	portsLock   sync.Mutex
+	nsCtx       context.Context
+	nsCancel    context.CancelFunc
+	nsWg        sync.WaitGroup
+
+	// Holepunch testing fields. holepunchTester stays nil when no SharedBind was
+	// given. holepunchStopChan is non-nil only while the holepunch loop is running.
+	sharedBind              *bind.SharedBind
+	holepunchTester         *holepunch.HolepunchTester
+	holepunchInterval       time.Duration
+	holepunchTimeout        time.Duration
+	holepunchEndpoints      map[int]string // siteID -> endpoint for holepunch testing
+	holepunchStatus         map[int]bool   // siteID -> connected status
+	holepunchStatusCallback HolepunchStatusCallback
+	holepunchStopChan       chan struct{}
+}
+
+// NewPeerMonitor builds a PeerMonitor that reports peer status changes to callback.
+//
+// wsClient is used to send relay requests when a peer goes down, middleDev and
+// localIP back the internal netstack, and sharedBind (optional, may be nil)
+// enables holepunch testing. The monitor is returned even when the netstack
+// cannot be initialised; that failure is only logged.
+func NewPeerMonitor(callback PeerMonitorCallback, wsClient *websocket.Client, middleDev *middleDevice.MiddleDevice, localIP string, sharedBind *bind.SharedBind) *PeerMonitor {
+	nsCtx, nsCancel := context.WithCancel(context.Background())
+
+	pm := &PeerMonitor{
+		callback:   callback,
+		wsClient:   wsClient,
+		middleDev:  middleDev,
+		localIP:    localIP,
+		sharedBind: sharedBind,
+		nsCtx:      nsCtx,
+		nsCancel:   nsCancel,
+
+		monitors:    make(map[int]*Client),
+		activePorts: make(map[uint16]bool),
+		interval:    1 * time.Second, // Default check interval
+		timeout:     2500 * time.Millisecond,
+		maxAttempts: 15,
+
+		holepunchEndpoints: make(map[int]string),
+		holepunchStatus:    make(map[int]bool),
+		holepunchInterval:  5 * time.Second, // Check holepunch every 5 seconds
+		holepunchTimeout:   3 * time.Second,
+	}
+
+	if err := pm.initNetstack(); err != nil {
+		logger.Error("Failed to initialize netstack for peer monitor: %v", err)
+	}
+
+	// Holepunch testing needs the shared bind; without it the tester stays nil.
+	if sharedBind != nil {
+		pm.holepunchTester = holepunch.NewHolepunchTester(sharedBind)
+	}
+
+	return pm
+}
+
+// SetInterval sets the probe interval used for peers added later and pushes
+// the same value to every Client that is already registered.
+func (pm *PeerMonitor) SetInterval(interval time.Duration) {
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	// Propagate to the existing clients, then remember it for future peers.
+	for siteID := range pm.monitors {
+		pm.monitors[siteID].SetPacketInterval(interval)
+	}
+	pm.interval = interval
+}
+
+// SetTimeout sets the response timeout used for peers added later and pushes
+// the same value to every Client that is already registered.
+func (pm *PeerMonitor) SetTimeout(timeout time.Duration) {
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	// Propagate to the existing clients, then remember it for future peers.
+	for siteID := range pm.monitors {
+		pm.monitors[siteID].SetTimeout(timeout)
+	}
+	pm.timeout = timeout
+}
+
+// SetMaxAttempts sets the number of probe attempts TestConnection may make, for
+// peers added later and for every Client that is already registered.
+func (pm *PeerMonitor) SetMaxAttempts(attempts int) {
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	// Propagate to the existing clients, then remember it for future peers.
+	for siteID := range pm.monitors {
+		pm.monitors[siteID].SetMaxAttempts(attempts)
+	}
+	pm.maxAttempts = attempts
+}
+
+// AddPeer adds a new peer to monitor.
+//
+// It is a no-op (returning nil) when siteID is already registered. Otherwise a
+// Client dialing through the netstack is created with the current interval,
+// timeout and attempt settings. If the monitor is already running the client's
+// monitor loop is started right away.
+//
+// The peer is only registered once everything succeeded: when StartMonitor
+// fails the client is closed and nothing is recorded, so a later AddPeer for
+// the same site retries instead of reporting "already monitoring".
+func (pm *PeerMonitor) AddPeer(siteID int, endpoint string) error {
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	if _, exists := pm.monitors[siteID]; exists {
+		return nil // Already monitoring
+	}
+
+	// Use our custom dialer that uses netstack
+	client, err := NewClient(endpoint, pm.dial)
+	if err != nil {
+		return err
+	}
+
+	client.SetPacketInterval(pm.interval)
+	client.SetTimeout(pm.timeout)
+	client.SetMaxAttempts(pm.maxAttempts)
+
+	if pm.running {
+		if err := client.StartMonitor(func(status ConnectionStatus) {
+			pm.handleConnectionStatusChange(siteID, status)
+		}); err != nil {
+			// Release whatever the client opened; the peer was never registered.
+			client.Close()
+			return err
+		}
+	}
+
+	pm.monitors[siteID] = client
+	pm.holepunchEndpoints[siteID] = endpoint
+	pm.holepunchStatus[siteID] = false // Initially unknown/disconnected
+
+	return nil
+}
+
+// removePeerUnlocked stops monitoring a peer and removes it from the monitor.
+// This function assumes the mutex is already held by the caller.
+//
+// Besides the Client it also drops the holepunch endpoint and status entries
+// that AddPeer created, so the holepunch loop stops testing the removed peer
+// and GetHolepunchStatus no longer reports it.
+func (pm *PeerMonitor) removePeerUnlocked(siteID int) {
+	// delete is a no-op for unknown keys, so this is safe for any siteID.
+	delete(pm.holepunchEndpoints, siteID)
+	delete(pm.holepunchStatus, siteID)
+
+	client, exists := pm.monitors[siteID]
+	if !exists {
+		return
+	}
+
+	client.StopMonitor()
+	client.Close()
+	delete(pm.monitors, siteID)
+}
+
+// RemovePeer stops monitoring a peer and removes it from the monitor.
+// It takes the monitor mutex and delegates to removePeerUnlocked; an unknown
+// siteID is ignored.
+func (pm *PeerMonitor) RemovePeer(siteID int) {
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	pm.removePeerUnlocked(siteID)
+}
+
+// Start begins monitoring all peers.
+//
+// It is a no-op when the monitor is already running. A peer whose monitor loop
+// cannot be started is logged and skipped; the remaining peers are still
+// started. Finally the holepunch monitor is started; if that is not possible
+// (for example because no holepunch tester is configured) the reason is logged
+// rather than silently dropped.
+func (pm *PeerMonitor) Start() {
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	if pm.running {
+		return // Already running
+	}
+
+	pm.running = true
+
+	// Start monitoring all peers
+	for siteID, client := range pm.monitors {
+		siteIDCopy := siteID // Create a copy for the closure
+		err := client.StartMonitor(func(status ConnectionStatus) {
+			pm.handleConnectionStatusChange(siteIDCopy, status)
+		})
+		if err != nil {
+			logger.Error("Failed to start monitoring peer %d: %v\n", siteID, err)
+			continue
+		}
+		logger.Info("Started monitoring peer %d\n", siteID)
+	}
+
+	// startHolepunchMonitor expects the mutex to be held, which it is here.
+	if err := pm.startHolepunchMonitor(); err != nil {
+		logger.Warn("Holepunch monitor not started: %v", err)
+	}
+}
+
+// handleConnectionStatusChange reacts to a status report from a peer's Client:
+// it forwards the report to the user callback (when one is set) and, for a
+// disconnected peer, asks the server to relay if a websocket client exists.
+func (pm *PeerMonitor) handleConnectionStatusChange(siteID int, status ConnectionStatus) {
+	if notify := pm.callback; notify != nil {
+		notify(siteID, status.Connected, status.RTT)
+	}
+
+	if status.Connected || pm.wsClient == nil {
+		return
+	}
+
+	// sendRelay logs its own failures, so the returned error is dropped here.
+	_ = pm.sendRelay(siteID)
+}
+
+// sendRelay sends an "olm/wg/relay" message for siteID to the server over the
+// websocket client. It returns an error when no websocket client is configured
+// or when sending fails; a send failure is also logged.
+func (pm *PeerMonitor) sendRelay(siteID int) error {
+	if pm.wsClient == nil {
+		return fmt.Errorf("websocket client is nil")
+	}
+
+	err := pm.wsClient.SendMessage("olm/wg/relay", map[string]interface{}{
+		"siteId": siteID,
+	})
+	if err != nil {
+		// The message being sent is a relay request, not a registration.
+		logger.Error("Failed to send relay message for site %d: %v", siteID, err)
+		return err
+	}
+	logger.Info("Sent relay message")
+	return nil
+}
+
+// Stop halts the holepunch monitor and, if the peer monitor is running, every
+// per-peer monitor loop. Peers stay registered and can be started again.
+func (pm *PeerMonitor) Stop() {
+	// stopHolepunchMonitor takes the mutex itself, so call it before locking.
+	pm.stopHolepunchMonitor()
+
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	if pm.running {
+		pm.running = false
+
+		for siteID := range pm.monitors {
+			pm.monitors[siteID].StopMonitor()
+		}
+	}
+}
+
+// SetHolepunchStatusCallback sets the callback for holepunch status reports.
+// The callback is stored under the monitor mutex and read by
+// checkHolepunchEndpoints for each tested endpoint; passing nil disables it.
+func (pm *PeerMonitor) SetHolepunchStatusCallback(callback HolepunchStatusCallback) {
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+	pm.holepunchStatusCallback = callback
+}
+
+// startHolepunchMonitor starts the holepunch connection monitoring.
+// Note: This function assumes the mutex is already held by the caller (called from Start()).
+//
+// It returns an error when no holepunch tester is configured, when the monitor
+// is already running, or when the tester fails to start.
+func (pm *PeerMonitor) startHolepunchMonitor() error {
+	if pm.holepunchTester == nil {
+		return fmt.Errorf("holepunch tester not initialized (sharedBind not provided)")
+	}
+
+	if pm.holepunchStopChan != nil {
+		return fmt.Errorf("holepunch monitor already running")
+	}
+
+	if err := pm.holepunchTester.Start(); err != nil {
+		return fmt.Errorf("failed to start holepunch tester: %w", err)
+	}
+
+	stopChan := make(chan struct{})
+	pm.holepunchStopChan = stopChan
+
+	// Hand the channel to the goroutine directly. The struct field is reset to
+	// nil by stopHolepunchMonitor, and a receive from a nil channel blocks
+	// forever, so the loop must never re-read the field.
+	go pm.holepunchLoop(stopChan)
+
+	logger.Info("Started holepunch connection monitor")
+	return nil
+}
+
+// stopHolepunchMonitor stops the holepunch connection monitoring.
+// It takes the monitor mutex itself, so the caller must not hold it. The stop
+// channel is detached under the lock and closed afterwards, then the tester
+// (if any) is stopped.
+func (pm *PeerMonitor) stopHolepunchMonitor() {
+	pm.mutex.Lock()
+	stopChan := pm.holepunchStopChan
+	pm.holepunchStopChan = nil
+	pm.mutex.Unlock()
+
+	if stopChan != nil {
+		close(stopChan)
+	}
+
+	if pm.holepunchTester != nil {
+		pm.holepunchTester.Stop()
+	}
+
+	logger.Info("Stopped holepunch connection monitor")
+}
+
+// runHolepunchMonitor runs the holepunch monitoring loop against the stop
+// channel that is currently registered on the monitor. It returns immediately
+// when the monitor has already been stopped (no channel registered).
+func (pm *PeerMonitor) runHolepunchMonitor() {
+	pm.mutex.Lock()
+	stopChan := pm.holepunchStopChan
+	pm.mutex.Unlock()
+
+	if stopChan == nil {
+		return
+	}
+
+	pm.holepunchLoop(stopChan)
+}
+
+// holepunchLoop checks all holepunch endpoints once immediately and then on
+// every holepunchInterval tick until stopChan is closed.
+func (pm *PeerMonitor) holepunchLoop(stopChan <-chan struct{}) {
+	ticker := time.NewTicker(pm.holepunchInterval)
+	defer ticker.Stop()
+
+	// Do initial check immediately
+	pm.checkHolepunchEndpoints()
+
+	for {
+		select {
+		case <-stopChan:
+			return
+		case <-ticker.C:
+			pm.checkHolepunchEndpoints()
+		}
+	}
+}
+
+// checkHolepunchEndpoints runs one holepunch test per registered endpoint.
+//
+// The endpoint map and timeout are snapshotted under the mutex so the tests
+// themselves run unlocked. After each test the stored status is updated, a
+// status transition (or first result) is logged, and the holepunch callback,
+// if set, is invoked with the result.
+func (pm *PeerMonitor) checkHolepunchEndpoints() {
+	pm.mutex.Lock()
+	timeout := pm.holepunchTimeout
+	snapshot := make(map[int]string, len(pm.holepunchEndpoints))
+	for id, ep := range pm.holepunchEndpoints {
+		snapshot[id] = ep
+	}
+	pm.mutex.Unlock()
+
+	for siteID, endpoint := range snapshot {
+		result := pm.holepunchTester.TestEndpoint(endpoint, timeout)
+
+		pm.mutex.Lock()
+		wasConnected, known := pm.holepunchStatus[siteID]
+		pm.holepunchStatus[siteID] = result.Success
+		notify := pm.holepunchStatusCallback
+		pm.mutex.Unlock()
+
+		// Only log when the status is new or differs from the previous one.
+		if changed := !known || wasConnected != result.Success; changed {
+			switch {
+			case result.Success:
+				logger.Info("Holepunch to site %d (%s) is CONNECTED (RTT: %v)", siteID, endpoint, result.RTT)
+			case result.Error != nil:
+				logger.Warn("Holepunch to site %d (%s) is DISCONNECTED: %v", siteID, endpoint, result.Error)
+			default:
+				logger.Warn("Holepunch to site %d (%s) is DISCONNECTED", siteID, endpoint)
+			}
+		}
+
+		// The callback runs for every test, outside the mutex.
+		if notify != nil {
+			notify(siteID, endpoint, result.Success, result.RTT)
+		}
+	}
+}
+
+// GetHolepunchStatus returns a copy of the holepunch status map (site ID ->
+// connected), so callers can read it without holding the monitor mutex.
+func (pm *PeerMonitor) GetHolepunchStatus() map[int]bool {
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	snapshot := make(map[int]bool, len(pm.holepunchStatus))
+	for id := range pm.holepunchStatus {
+		snapshot[id] = pm.holepunchStatus[id]
+	}
+	return snapshot
+}
+
+// Close stops monitoring and cleans up resources.
+//
+// Teardown order: stop the holepunch monitor, stop/close/remove every peer
+// Client, cancel the netstack context, close the channel endpoint, wait up to
+// two seconds for the packet sender goroutine, and finally destroy the stack.
+// The mutex is held from the client teardown until the end.
+func (pm *PeerMonitor) Close() {
+	// Stop holepunch monitor first (outside of mutex to avoid deadlock)
+	pm.stopHolepunchMonitor()
+
+	pm.mutex.Lock()
+	defer pm.mutex.Unlock()
+
+	logger.Debug("PeerMonitor: Starting cleanup")
+
+	// Stop and close all clients first
+	for siteID, client := range pm.monitors {
+		logger.Debug("PeerMonitor: Stopping client for site %d", siteID)
+		client.StopMonitor()
+		client.Close()
+		delete(pm.monitors, siteID)
+	}
+
+	pm.running = false
+
+	// Clean up netstack resources
+	logger.Debug("PeerMonitor: Cancelling netstack context")
+	if pm.nsCancel != nil {
+		pm.nsCancel() // Signal goroutines to stop
+	}
+
+	// Close the channel endpoint to unblock any pending reads
+	logger.Debug("PeerMonitor: Closing endpoint")
+	if pm.ep != nil {
+		pm.ep.Close()
+	}
+
+	// Wait for packet sender goroutine to finish with timeout.
+	// nsWg.Wait runs in a helper goroutine so the wait can be bounded by the
+	// select below instead of blocking Close indefinitely.
+	logger.Debug("PeerMonitor: Waiting for goroutines to finish")
+	done := make(chan struct{})
+	go func() {
+		pm.nsWg.Wait()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		logger.Debug("PeerMonitor: Goroutines finished cleanly")
+	case <-time.After(2 * time.Second):
+		logger.Warn("PeerMonitor: Timeout waiting for goroutines to finish, proceeding anyway")
+	}
+
+	// Destroy the stack last, after all goroutines are done.
+	// The field is cleared so dial reports "netstack not initialized" afterwards.
+	logger.Debug("PeerMonitor: Destroying stack")
+	if pm.stack != nil {
+		pm.stack.Destroy()
+		pm.stack = nil
+	}
+
+	logger.Debug("PeerMonitor: Cleanup complete")
+}
+
+// TestPeer tests connectivity to a specific peer.
+//
+// It returns whether the peer answered, the measured RTT, and an error when
+// siteID is not registered. The overall deadline is timeout * maxAttempts;
+// both settings are read while the mutex is held because the Set* methods
+// modify them under the same lock.
+func (pm *PeerMonitor) TestPeer(siteID int) (bool, time.Duration, error) {
+	pm.mutex.Lock()
+	client, exists := pm.monitors[siteID]
+	budget := pm.timeout * time.Duration(pm.maxAttempts)
+	pm.mutex.Unlock()
+
+	if !exists {
+		return false, 0, fmt.Errorf("peer with siteID %d not found", siteID)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), budget)
+	defer cancel()
+
+	connected, rtt := client.TestConnection(ctx)
+	return connected, rtt, nil
+}
+
+// TestAllPeers tests connectivity to all peers, one after another, and returns
+// the result per site ID.
+//
+// The peer set and the per-peer deadline (timeout * maxAttempts) are captured
+// under the mutex, because the Set* methods modify those settings under the
+// same lock; the tests themselves run unlocked.
+func (pm *PeerMonitor) TestAllPeers() map[int]struct {
+	Connected bool
+	RTT       time.Duration
+} {
+	pm.mutex.Lock()
+	peers := make(map[int]*Client, len(pm.monitors))
+	for siteID, client := range pm.monitors {
+		peers[siteID] = client
+	}
+	budget := pm.timeout * time.Duration(pm.maxAttempts)
+	pm.mutex.Unlock()
+
+	results := make(map[int]struct {
+		Connected bool
+		RTT       time.Duration
+	}, len(peers))
+	for siteID, client := range peers {
+		ctx, cancel := context.WithTimeout(context.Background(), budget)
+		connected, rtt := client.TestConnection(ctx)
+		cancel()
+
+		results[siteID] = struct {
+			Connected bool
+			RTT       time.Duration
+		}{
+			Connected: connected,
+			RTT:       rtt,
+		}
+	}
+
+	return results
+}
+
+// initNetstack initializes the gvisor netstack.
+//
+// It creates a UDP-capable stack with a single NIC (ID 1) on a channel
+// endpoint, assigns the local IPv4 address, installs a default route,
+// registers handlePacket with the MiddleDevice for that address and starts the
+// packet sender goroutine.
+//
+// An error is returned when the local IP is missing, unparsable or not IPv4
+// (As4 would panic otherwise), when no MiddleDevice was supplied, or when the
+// stack cannot be configured. On failure pm.stack and pm.ep stay unset, so
+// dial keeps reporting the netstack as uninitialized.
+func (pm *PeerMonitor) initNetstack() error {
+	if pm.localIP == "" {
+		return fmt.Errorf("local IP not provided")
+	}
+	if pm.middleDev == nil {
+		return fmt.Errorf("middle device not provided")
+	}
+
+	addr, err := netip.ParseAddr(pm.localIP)
+	if err != nil {
+		return fmt.Errorf("invalid local IP: %w", err)
+	}
+	if !addr.Is4() && !addr.Is4In6() {
+		return fmt.Errorf("local IP %s is not an IPv4 address", addr)
+	}
+
+	// Create gvisor netstack
+	stackOpts := stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+		HandleLocal:        true,
+	}
+
+	ep := channel.New(256, 1420, "") // MTU 1420 (standard WG)
+	ns := stack.New(stackOpts)
+
+	// Tear the half-built stack down again unless setup completes; the order
+	// mirrors the one used by Close.
+	initialized := false
+	defer func() {
+		if !initialized {
+			ep.Close()
+			ns.Destroy()
+		}
+	}()
+
+	// Create NIC
+	if err := ns.CreateNIC(1, ep); err != nil {
+		return fmt.Errorf("failed to create NIC: %v", err)
+	}
+
+	// Add IP address
+	ipBytes := addr.As4()
+	protoAddr := tcpip.ProtocolAddress{
+		Protocol:          ipv4.ProtocolNumber,
+		AddressWithPrefix: tcpip.AddrFrom4(ipBytes).WithPrefix(),
+	}
+
+	if err := ns.AddProtocolAddress(1, protoAddr, stack.AddressProperties{}); err != nil {
+		return fmt.Errorf("failed to add protocol address: %v", err)
+	}
+
+	// Add default route
+	ns.AddRoute(tcpip.Route{
+		Destination: header.IPv4EmptySubnet,
+		NIC:         1,
+	})
+
+	// Publish the stack only now that it is fully configured.
+	pm.ep = ep
+	pm.stack = ns
+	initialized = true
+
+	// Register filter rule on MiddleDevice
+	// We want to intercept packets destined to our local IP
+	// But ONLY if they are for ports we are listening on
+	pm.middleDev.AddRule(addr, pm.handlePacket)
+
+	// Start packet sender (Stack -> WG)
+	pm.nsWg.Add(1)
+	go pm.runPacketSender()
+
+	return nil
+}
+
+// handlePacket is called by MiddleDevice when a packet arrives for our IP
+func (pm *PeerMonitor) handlePacket(packet []byte) bool {
+	// Check if it's UDP
+	proto, ok := util.GetProtocol(packet)
+	if !ok || proto != 17 { // UDP
+		return false
+	}
+
+	// Check destination port
+	port, ok := util.GetDestPort(packet)
+	if !ok {
+		return false
+	}
+
+	// Check if we are listening on this port
+	pm.portsLock.Lock()
+	active := pm.activePorts[uint16(port)]
+	pm.portsLock.Unlock()
+
+	if !active {
+		return false
+	}
+
+	// Inject into netstack
+	version := packet[0] >> 4
+	pkb := stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Payload: buffer.MakeWithData(packet),
+	})
+
+	switch version {
+	case 4:
+		pm.ep.InjectInbound(ipv4.ProtocolNumber, pkb)
+	case 6:
+		pm.ep.InjectInbound(ipv6.ProtocolNumber, pkb)
+	default:
+		pkb.DecRef()
+		return false
+	}
+
+	pkb.DecRef()
+	return true // Handled
+}
+
+// runPacketSender reads packets from netstack and injects them into WireGuard
+func (pm *PeerMonitor) runPacketSender() {
+	defer pm.nsWg.Done()
+	logger.Debug("PeerMonitor: Packet sender goroutine started")
+
+	// Use a ticker to periodically check for packets without blocking indefinitely
+	ticker := time.NewTicker(10 * time.Millisecond)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-pm.nsCtx.Done():
+			logger.Debug("PeerMonitor: Packet sender context cancelled, draining packets")
+			// Drain any remaining packets before exiting
+			for {
+				pkt := pm.ep.Read()
+				if pkt == nil {
+					break
+				}
+				pkt.DecRef()
+			}
+			logger.Debug("PeerMonitor: Packet sender goroutine exiting")
+			return
+		case <-ticker.C:
+			// Try to read packets in batches
+			for i := 0; i < 10; i++ {
+				pkt := pm.ep.Read()
+				if pkt == nil {
+					break
+				}
+
+				// Extract packet data
+				slices := pkt.AsSlices()
+				if len(slices) > 0 {
+					var totalSize int
+					for _, slice := range slices {
+						totalSize += len(slice)
+					}
+
+					buf := make([]byte, totalSize)
+					pos := 0
+					for _, slice := range slices {
+						copy(buf[pos:], slice)
+						pos += len(slice)
+					}
+
+					// Inject into MiddleDevice (outbound to WG)
+					pm.middleDev.InjectOutbound(buf)
+				}
+
+				pkt.DecRef()
+			}
+		}
+	}
+}
+
+// dial creates a UDP connection using the netstack.
+//
+// addr is resolved as a UDP address and must resolve to IPv4, because the
+// connection is dialed with the IPv4 protocol from the monitor's local IPv4
+// address on an ephemeral port. The chosen local port is recorded in
+// activePorts so handlePacket accepts replies; the returned connection
+// unregisters the port again when closed. The network argument is not used.
+//
+// Errors are returned when the netstack is not initialized, when either
+// address is not IPv4 (instead of panicking on the array conversion), or when
+// the dial itself fails.
+func (pm *PeerMonitor) dial(network, addr string) (net.Conn, error) {
+	if pm.stack == nil {
+		return nil, fmt.Errorf("netstack not initialized")
+	}
+
+	// Parse remote address
+	raddr, err := net.ResolveUDPAddr("udp", addr)
+	if err != nil {
+		return nil, err
+	}
+	remote4 := raddr.IP.To4()
+	if remote4 == nil {
+		return nil, fmt.Errorf("remote address %q is not an IPv4 address", addr)
+	}
+
+	// Parse local IP
+	localIP, err := netip.ParseAddr(pm.localIP)
+	if err != nil {
+		return nil, err
+	}
+	if !localIP.Is4() && !localIP.Is4In6() {
+		return nil, fmt.Errorf("local IP %q is not an IPv4 address", pm.localIP)
+	}
+	ipBytes := localIP.As4()
+
+	// Create UDP connection
+	// We bind to port 0 (ephemeral)
+	laddr := &tcpip.FullAddress{
+		NIC:  1,
+		Addr: tcpip.AddrFrom4(ipBytes),
+		Port: 0,
+	}
+
+	raddrTcpip := &tcpip.FullAddress{
+		NIC:  1,
+		Addr: tcpip.AddrFrom4([4]byte(remote4)),
+		Port: uint16(raddr.Port),
+	}
+
+	conn, err := gonet.DialUDP(pm.stack, laddr, raddrTcpip, ipv4.ProtocolNumber)
+	if err != nil {
+		return nil, err
+	}
+
+	// Get local port
+	localAddr, ok := conn.LocalAddr().(*net.UDPAddr)
+	if !ok {
+		conn.Close()
+		return nil, fmt.Errorf("unexpected local address type %T", conn.LocalAddr())
+	}
+	port := uint16(localAddr.Port)
+
+	// Register port
+	pm.portsLock.Lock()
+	pm.activePorts[port] = true
+	pm.portsLock.Unlock()
+
+	// Wrap connection to cleanup port on close
+	return &trackedConn{
+		Conn: conn,
+		pm:   pm,
+		port: port,
+	}, nil
+}
+
+// removePort unregisters a local UDP port so handlePacket stops accepting
+// packets addressed to it.
+func (pm *PeerMonitor) removePort(port uint16) {
+	pm.portsLock.Lock()
+	defer pm.portsLock.Unlock()
+
+	delete(pm.activePorts, port)
+}
+
+// trackedConn wraps a connection returned by dial and remembers the local port
+// that was registered for it, so Close can unregister the port on the monitor.
+type trackedConn struct {
+	net.Conn
+	pm   *PeerMonitor
+	port uint16
+}
+
+// Close unregisters the connection's local port from the monitor and then
+// closes the wrapped connection, returning its close error (nil when there is
+// no wrapped connection).
+func (c *trackedConn) Close() error {
+	c.pm.removePort(c.port)
+
+	if c.Conn == nil {
+		return nil
+	}
+	return c.Conn.Close()
+}
--- a/peers/monitor/wgtester.go
+++ b/peers/monitor/wgtester.go
@@ -0,0 +1,267 @@
+package monitor
+
+import (
+	"context"
+	"encoding/binary"
+	"net"
+	"sync"
+	"time"
+
+	"github.com/fosrl/newt/logger"
+)
+
+const (
+	// Magic bytes to identify our packets (written big-endian at offset 0)
+	magicHeader uint32 = 0xDEADBEEF
+	// Request packet type
+	packetTypeRequest uint8 = 1
+	// Response packet type
+	packetTypeResponse uint8 = 2
+	// Packet format (13 bytes, integers are big-endian):
+	// - 4 bytes: magic header (0xDEADBEEF)
+	// - 1 byte: packet type (1 = request, 2 = response)
+	// - 8 bytes: timestamp in Unix nanoseconds (for round-trip timing)
+	packetSize = 13
+)
+
+// Client handles checking connectivity to a server.
+//
+// It lazily opens one UDP connection to serverAddr (through dialer when set,
+// otherwise net.Dial) and probes it with request packets. monitorLock guards
+// monitorRunning and shutdownCh handling in StartMonitor/StopMonitor; connLock
+// guards conn. packetInterval, timeout and maxAttempts are plain fields that
+// are written by the Set* methods without locking.
+type Client struct {
+	conn           net.Conn
+	serverAddr     string
+	monitorRunning bool
+	monitorLock    sync.Mutex
+	connLock       sync.Mutex // Protects connection operations
+	shutdownCh     chan struct{}
+	packetInterval time.Duration
+	timeout        time.Duration
+	maxAttempts    int
+	dialer         Dialer
+}
+
+// Dialer is a function that creates a connection. It has the same shape as
+// net.Dial; Client calls it with network "udp" and the server address.
+type Dialer func(network, addr string) (net.Conn, error)
+
+// ConnectionStatus represents the current connection state as reported by the
+// monitor loop: whether the last probe was answered and the RTT it measured
+// (zero when the probe failed).
+type ConnectionStatus struct {
+	Connected bool
+	RTT       time.Duration
+}
+
+// NewClient returns a connection test client for serverAddr. dialer may be
+// nil, in which case the client falls back to net.Dial when it connects. No
+// connection is opened here and the returned error is always nil.
+func NewClient(serverAddr string, dialer Dialer) (*Client, error) {
+	client := &Client{
+		serverAddr: serverAddr,
+		dialer:     dialer,
+		shutdownCh: make(chan struct{}),
+	}
+
+	// Defaults; all three can be changed through the Set* methods.
+	client.packetInterval = 2 * time.Second
+	client.timeout = 500 * time.Millisecond // Timeout for individual packets
+	client.maxAttempts = 3                  // Default max attempts
+
+	return client, nil
+}
+
+// SetPacketInterval changes how frequently packets are sent in monitor mode.
+// The value is only read when StartMonitor creates its ticker, so changing it
+// does not affect a monitor loop that is already running. The field is written
+// without taking a lock.
+func (c *Client) SetPacketInterval(interval time.Duration) {
+	c.packetInterval = interval
+}
+
+// SetTimeout changes the timeout for waiting for responses. The value is used
+// as the per-attempt read deadline in TestConnection and as the per-tick
+// context timeout in the monitor loop. The field is written without taking a
+// lock.
+func (c *Client) SetTimeout(timeout time.Duration) {
+	c.timeout = timeout
+}
+
+// SetMaxAttempts changes the maximum number of attempts for TestConnection.
+// With a value of zero or less TestConnection sends nothing and reports the
+// peer as not connected. The field is written without taking a lock.
+func (c *Client) SetMaxAttempts(attempts int) {
+	c.maxAttempts = attempts
+}
+
+// Close stops the monitor loop (if running) and closes the underlying
+// connection, if one was opened. The client reconnects lazily if used again.
+func (c *Client) Close() {
+	c.StopMonitor()
+
+	c.connLock.Lock()
+	defer c.connLock.Unlock()
+
+	if c.conn == nil {
+		return
+	}
+	c.conn.Close()
+	c.conn = nil
+}
+
+// ensureConnection makes sure we have an active UDP connection
+func (c *Client) ensureConnection() error {
+	c.connLock.Lock()
+	defer c.connLock.Unlock()
+
+	if c.conn != nil {
+		return nil
+	}
+
+	var err error
+	if c.dialer != nil {
+		c.conn, err = c.dialer("udp", c.serverAddr)
+	} else {
+		// Fallback to standard net.Dial
+		c.conn, err = net.Dial("udp", c.serverAddr)
+	}
+
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// TestConnection checks if the connection to the server is working.
+//
+// It sends up to maxAttempts request packets (magic header, request type,
+// current timestamp) and waits up to timeout for a well-formed response to
+// each. It returns true and the round-trip time computed from the timestamp
+// echoed in the response, or false and 0 when no valid response arrived, the
+// connection could not be established or was closed, or ctx ended.
+//
+// ctx bounds the whole call: the read deadline never extends past the context
+// deadline and the pause between attempts ends early when ctx is done.
+func (c *Client) TestConnection(ctx context.Context) (bool, time.Duration) {
+	if err := c.ensureConnection(); err != nil {
+		logger.Warn("Failed to ensure connection: %v", err)
+		return false, 0
+	}
+
+	// Prepare packet buffer
+	packet := make([]byte, packetSize)
+	binary.BigEndian.PutUint32(packet[0:4], magicHeader)
+	packet[4] = packetTypeRequest
+
+	// Reused for every attempt; only accepted when a full packet was read.
+	responseBuffer := make([]byte, packetSize)
+
+	// Send multiple attempts as specified
+	for attempt := 0; attempt < c.maxAttempts; attempt++ {
+		if ctx.Err() != nil {
+			return false, 0
+		}
+
+		// Add current timestamp to packet
+		timestamp := time.Now().UnixNano()
+		binary.BigEndian.PutUint64(packet[5:13], uint64(timestamp))
+
+		// Lock the connection for the entire send/receive operation
+		c.connLock.Lock()
+
+		// Check if connection is still valid after acquiring lock
+		if c.conn == nil {
+			c.connLock.Unlock()
+			return false, 0
+		}
+
+		if _, err := c.conn.Write(packet); err != nil {
+			c.connLock.Unlock()
+			logger.Info("Error sending packet: %v", err)
+			continue
+		}
+
+		// Wait at most c.timeout, but never beyond the context deadline.
+		deadline := time.Now().Add(c.timeout)
+		if ctxDeadline, ok := ctx.Deadline(); ok && ctxDeadline.Before(deadline) {
+			deadline = ctxDeadline
+		}
+		if err := c.conn.SetReadDeadline(deadline); err != nil {
+			// Without a deadline the read below could block while holding the lock.
+			c.connLock.Unlock()
+			logger.Warn("Error setting read deadline: %v", err)
+			continue
+		}
+
+		// Wait for response
+		n, err := c.conn.Read(responseBuffer)
+		c.connLock.Unlock()
+
+		if err != nil {
+			if netErr, ok := err.(net.Error); ok && netErr.Timeout() {
+				// Timeout, try next attempt after a brief pause (or stop if ctx ended)
+				select {
+				case <-ctx.Done():
+					return false, 0
+				case <-time.After(100 * time.Millisecond):
+				}
+				continue
+			}
+			logger.Error("Error reading response: %v", err)
+			continue
+		}
+
+		if n != packetSize {
+			continue // Malformed packet
+		}
+
+		// Verify response
+		magic := binary.BigEndian.Uint32(responseBuffer[0:4])
+		packetType := responseBuffer[4]
+		if magic != magicHeader || packetType != packetTypeResponse {
+			continue // Not our response
+		}
+
+		// Extract the original timestamp and calculate RTT
+		sentTimestamp := int64(binary.BigEndian.Uint64(responseBuffer[5:13]))
+		rtt := time.Duration(time.Now().UnixNano() - sentTimestamp)
+
+		return true, rtt
+	}
+
+	return false, 0
+}
+
+// TestConnectionWithTimeout tries to test connection with a timeout.
+// It wraps TestConnection in a context that expires after timeout and returns
+// the same values: true and the RTT if connected, false and 0 otherwise.
+func (c *Client) TestConnectionWithTimeout(timeout time.Duration) (bool, time.Duration) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	return c.TestConnection(ctx)
+}
+
+// MonitorCallback is the function type for connection status change callbacks.
+// The monitor loop started by StartMonitor calls it from its own goroutine,
+// for the first check and whenever the connected state changes.
+type MonitorCallback func(status ConnectionStatus)
+
+// StartMonitor begins monitoring the connection and calls the callback
+// when the connection status changes.
+//
+// It is a no-op when the monitor is already running and returns an error when
+// the connection cannot be established. Otherwise a goroutine tests the
+// connection every packetInterval (each test bounded by timeout) and invokes
+// callback for the first result and for every change of the connected state.
+// The goroutine exits when StopMonitor closes the shutdown channel.
+func (c *Client) StartMonitor(callback MonitorCallback) error {
+	c.monitorLock.Lock()
+	defer c.monitorLock.Unlock()
+
+	if c.monitorRunning {
+		logger.Info("Monitor already running")
+		return nil // Already running
+	}
+
+	if err := c.ensureConnection(); err != nil {
+		return err
+	}
+
+	// Give this run its own channel and let the goroutine keep a private
+	// reference: a later StartMonitor replaces c.shutdownCh, and a goroutine
+	// that re-read the field would then miss the close meant for it.
+	shutdownCh := make(chan struct{})
+	c.shutdownCh = shutdownCh
+	c.monitorRunning = true
+
+	go func() {
+		var lastConnected bool
+		firstRun := true
+
+		ticker := time.NewTicker(c.packetInterval)
+		defer ticker.Stop()
+
+		for {
+			select {
+			case <-shutdownCh:
+				return
+			case <-ticker.C:
+				ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
+				connected, rtt := c.TestConnection(ctx)
+				cancel()
+
+				// Callback if status changed or it's the first check
+				if connected != lastConnected || firstRun {
+					callback(ConnectionStatus{
+						Connected: connected,
+						RTT:       rtt,
+					})
+					lastConnected = connected
+					firstRun = false
+				}
+			}
+		}
+	}()
+
+	return nil
+}
+
+// StopMonitor signals the monitor goroutine to exit by closing the shutdown
+// channel. Calling it while no monitor is running does nothing.
+func (c *Client) StopMonitor() {
+	c.monitorLock.Lock()
+	defer c.monitorLock.Unlock()
+
+	if c.monitorRunning {
+		c.monitorRunning = false
+		close(c.shutdownCh)
+	}
+}