Healthcheck working

This commit is contained in:
Owen
2025-08-11 08:14:29 -07:00
parent 289cce3a22
commit 28b6865f73
2 changed files with 141 additions and 39 deletions

View File

@@ -8,18 +8,20 @@ import (
"strings"
"sync"
"time"
"github.com/fosrl/newt/logger"
)
// Status represents the health status of a target
type Status int
// Health represents the health status of a target
type Health int
const (
StatusUnknown Status = iota
StatusUnknown Health = iota
StatusHealthy
StatusUnhealthy
)
func (s Status) String() string {
func (s Health) String() string {
switch s {
case StatusHealthy:
return "healthy"
@@ -44,12 +46,13 @@ type Config struct {
Timeout int `json:"hcTimeout"` // in seconds
Headers map[string]string `json:"hcHeaders"`
Method string `json:"hcMethod"`
Status int `json:"hcStatus"` // HTTP status code
}
// Target represents a health check target with its current status
type Target struct {
Config Config `json:"config"`
Status Status `json:"status"`
Status Health `json:"status"`
LastCheck time.Time `json:"lastCheck"`
LastError string `json:"lastError,omitempty"`
CheckCount int `json:"checkCount"`
@@ -71,6 +74,7 @@ type Monitor struct {
// NewMonitor creates a new health check monitor
func NewMonitor(callback StatusChangeCallback) *Monitor {
logger.Info("Creating new health check monitor")
return &Monitor{
targets: make(map[int]*Target),
callback: callback,
@@ -108,6 +112,9 @@ func (m *Monitor) AddTarget(config Config) error {
m.mutex.Lock()
defer m.mutex.Unlock()
logger.Info("Adding health check target: ID=%d, hostname=%s, port=%d, enabled=%t",
config.ID, config.Hostname, config.Port, config.Enabled)
return m.addTargetUnsafe(config)
}
@@ -116,17 +123,20 @@ func (m *Monitor) AddTargets(configs []Config) error {
m.mutex.Lock()
defer m.mutex.Unlock()
logger.Info("Adding %d health check targets in bulk", len(configs))
for _, config := range configs {
if err := m.addTargetUnsafe(config); err != nil {
logger.Error("Failed to add target %d: %v", config.ID, err)
return fmt.Errorf("failed to add target %d: %v", config.ID, err)
}
logger.Debug("Successfully added target: ID=%d, hostname=%s", config.ID, config.Hostname)
}
// Notify callback once after all targets are added
if m.callback != nil {
go m.callback(m.getAllTargetsUnsafe())
}
// Don't notify callback immediately - let the initial health checks complete first
// The callback will be triggered when the first health check results are available
logger.Info("Successfully added all %d health check targets", len(configs))
return nil
}
@@ -152,6 +162,9 @@ func (m *Monitor) addTargetUnsafe(config Config) error {
config.Timeout = 5
}
logger.Debug("Target %d configuration: scheme=%s, method=%s, interval=%ds, timeout=%ds",
config.ID, config.Scheme, config.Method, config.Interval, config.Timeout)
// Parse headers if provided as string
if len(config.Headers) == 0 && config.Path != "" {
// This is a simplified header parsing - in real use you might want more robust parsing
@@ -160,6 +173,7 @@ func (m *Monitor) addTargetUnsafe(config Config) error {
// Remove existing target if it exists
if existing, exists := m.targets[config.ID]; exists {
logger.Info("Replacing existing target with ID %d", config.ID)
existing.cancel()
}
@@ -176,7 +190,10 @@ func (m *Monitor) addTargetUnsafe(config Config) error {
// Start monitoring if enabled
if config.Enabled {
logger.Info("Starting monitoring for target %d (%s:%d)", config.ID, config.Hostname, config.Port)
go m.monitorTarget(target)
} else {
logger.Debug("Target %d added but monitoring is disabled", config.ID)
}
return nil
@@ -189,9 +206,11 @@ func (m *Monitor) RemoveTarget(id int) error {
target, exists := m.targets[id]
if !exists {
logger.Warn("Attempted to remove non-existent target with ID %d", id)
return fmt.Errorf("target with id %d not found", id)
}
logger.Info("Removing health check target: ID=%d", id)
target.cancel()
delete(m.targets, id)
@@ -200,6 +219,7 @@ func (m *Monitor) RemoveTarget(id int) error {
go m.callback(m.GetTargets())
}
logger.Info("Successfully removed target %d", id)
return nil
}
@@ -208,25 +228,32 @@ func (m *Monitor) RemoveTargets(ids []int) error {
m.mutex.Lock()
defer m.mutex.Unlock()
logger.Info("Removing %d health check targets", len(ids))
var notFound []int
for _, id := range ids {
target, exists := m.targets[id]
if !exists {
notFound = append(notFound, id)
logger.Warn("Target with ID %d not found during bulk removal", id)
continue
}
logger.Debug("Removing target %d", id)
target.cancel()
delete(m.targets, id)
}
removedCount := len(ids) - len(notFound)
logger.Info("Successfully removed %d targets", removedCount)
// Notify callback of status change if any targets were removed
if len(notFound) != len(ids) && m.callback != nil {
go m.callback(m.GetTargets())
}
if len(notFound) > 0 {
logger.Error("Some targets not found during removal: %v", notFound)
return fmt.Errorf("targets not found: %v", notFound)
}
@@ -263,21 +290,33 @@ func (m *Monitor) getAllTargets() map[int]*Target {
// monitorTarget monitors a single target
func (m *Monitor) monitorTarget(target *Target) {
logger.Info("Starting health check monitoring for target %d (%s:%d)",
target.Config.ID, target.Config.Hostname, target.Config.Port)
// Initial check
oldStatus := target.Status
m.performHealthCheck(target)
// Notify callback after initial check if status changed or if it's the first check
if (oldStatus != target.Status || oldStatus == StatusUnknown) && m.callback != nil {
logger.Info("Target %d initial status: %s", target.Config.ID, target.Status.String())
go m.callback(m.GetTargets())
}
// Set up ticker based on current status
interval := time.Duration(target.Config.Interval) * time.Second
if target.Status == StatusUnhealthy {
interval = time.Duration(target.Config.UnhealthyInterval) * time.Second
}
logger.Debug("Target %d: initial check interval set to %v", target.Config.ID, interval)
target.ticker = time.NewTicker(interval)
defer target.ticker.Stop()
for {
select {
case <-target.ctx.Done():
logger.Info("Stopping health check monitoring for target %d", target.Config.ID)
return
case <-target.ticker.C:
oldStatus := target.Status
@@ -290,6 +329,8 @@ func (m *Monitor) monitorTarget(target *Target) {
}
if newInterval != interval {
logger.Debug("Target %d: updating check interval from %v to %v due to status change",
target.Config.ID, interval, newInterval)
target.ticker.Stop()
target.ticker = time.NewTicker(newInterval)
interval = newInterval
@@ -297,6 +338,8 @@ func (m *Monitor) monitorTarget(target *Target) {
// Notify callback if status changed
if oldStatus != target.Status && m.callback != nil {
logger.Info("Target %d status changed: %s -> %s",
target.Config.ID, oldStatus.String(), target.Status.String())
go m.callback(m.GetTargets())
}
}
@@ -321,6 +364,9 @@ func (m *Monitor) performHealthCheck(target *Target) {
url += target.Config.Path
}
logger.Debug("Target %d: performing health check %d to %s",
target.Config.ID, target.CheckCount, url)
// Create request
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(target.Config.Timeout)*time.Second)
defer cancel()
@@ -329,6 +375,7 @@ func (m *Monitor) performHealthCheck(target *Target) {
if err != nil {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("failed to create request: %v", err)
logger.Warn("Target %d: failed to create request: %v", target.Config.ID, err)
return
}
@@ -342,16 +389,40 @@ func (m *Monitor) performHealthCheck(target *Target) {
if err != nil {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("request failed: %v", err)
logger.Warn("Target %d: health check failed: %v", target.Config.ID, err)
return
}
defer resp.Body.Close()
// Check response status
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
target.Status = StatusHealthy
var expectedStatus int
if target.Config.Status > 0 {
expectedStatus = target.Config.Status
} else {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("unhealthy status code: %d", resp.StatusCode)
expectedStatus = 0 // Use range check for 200-299
}
if expectedStatus > 0 {
logger.Debug("Target %d: checking health status against expected code %d", target.Config.ID, expectedStatus)
// Check for specific status code
if resp.StatusCode == expectedStatus {
target.Status = StatusHealthy
logger.Debug("Target %d: health check passed (status: %d, expected: %d)", target.Config.ID, resp.StatusCode, expectedStatus)
} else {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("unexpected status code: %d (expected: %d)", resp.StatusCode, expectedStatus)
logger.Warn("Target %d: health check failed with status code %d (expected: %d)", target.Config.ID, resp.StatusCode, expectedStatus)
}
} else {
// Check for 2xx range
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
target.Status = StatusHealthy
logger.Debug("Target %d: health check passed (status: %d)", target.Config.ID, resp.StatusCode)
} else {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("unhealthy status code: %d", resp.StatusCode)
logger.Warn("Target %d: health check failed with status code %d", target.Config.ID, resp.StatusCode)
}
}
}
@@ -360,10 +431,16 @@ func (m *Monitor) Stop() {
m.mutex.Lock()
defer m.mutex.Unlock()
for _, target := range m.targets {
targetCount := len(m.targets)
logger.Info("Stopping health check monitor with %d targets", targetCount)
for id, target := range m.targets {
logger.Debug("Stopping monitoring for target %d", id)
target.cancel()
}
m.targets = make(map[int]*Target)
logger.Info("Health check monitor stopped")
}
// EnableTarget enables monitoring for a specific target
@@ -373,10 +450,12 @@ func (m *Monitor) EnableTarget(id int) error {
target, exists := m.targets[id]
if !exists {
logger.Warn("Attempted to enable non-existent target with ID %d", id)
return fmt.Errorf("target with id %d not found", id)
}
if !target.Config.Enabled {
logger.Info("Enabling health check monitoring for target %d", id)
target.Config.Enabled = true
target.cancel() // Stop existing monitoring
@@ -385,6 +464,8 @@ func (m *Monitor) EnableTarget(id int) error {
target.cancel = cancel
go m.monitorTarget(target)
} else {
logger.Debug("Target %d is already enabled", id)
}
return nil
@@ -397,10 +478,12 @@ func (m *Monitor) DisableTarget(id int) error {
target, exists := m.targets[id]
if !exists {
logger.Warn("Attempted to disable non-existent target with ID %d", id)
return fmt.Errorf("target with id %d not found", id)
}
if target.Config.Enabled {
logger.Info("Disabling health check monitoring for target %d", id)
target.Config.Enabled = false
target.cancel()
target.Status = StatusUnknown
@@ -409,6 +492,8 @@ func (m *Monitor) DisableTarget(id int) error {
if m.callback != nil {
go m.callback(m.GetTargets())
}
} else {
logger.Debug("Target %d is already disabled", id)
}
return nil

67
main.go
View File

@@ -101,6 +101,7 @@ var (
healthFile string
useNativeInterface bool
authorizedKeysFile string
preferEndpoint string
healthMonitor *healthcheck.Monitor
)
@@ -172,6 +173,9 @@ func main() {
if pingTimeoutStr == "" {
flag.StringVar(&pingTimeoutStr, "ping-timeout", "5s", " Timeout for each ping (default 5s)")
}
// load the prefer endpoint just as a flag
flag.StringVar(&preferEndpoint, "prefer-endpoint", "", "Prefer this endpoint for the connection (if set, will override the endpoint from the server)")
// if authorizedKeysFile == "" {
// flag.StringVar(&authorizedKeysFile, "authorized-keys-file", "~/.ssh/authorized_keys", "Path to authorized keys file (if unset, no keys will be authorized)")
// }
@@ -291,6 +295,33 @@ func main() {
setupClients(client)
}
// Initialize health check monitor with status change callback
healthMonitor = healthcheck.NewMonitor(func(targets map[int]*healthcheck.Target) {
logger.Debug("Health check status update for %d targets", len(targets))
// Send health status update to the server
healthStatuses := make(map[int]interface{})
for id, target := range targets {
healthStatuses[id] = map[string]interface{}{
"status": target.Status.String(),
"lastCheck": target.LastCheck.Format(time.RFC3339),
"checkCount": target.CheckCount,
"lastError": target.LastError,
"config": target.Config,
}
}
// print the status of the targets
logger.Debug("Health check status: %+v", healthStatuses)
err := client.SendMessage("newt/healthcheck/status", map[string]interface{}{
"targets": healthStatuses,
})
if err != nil {
logger.Error("Failed to send health check status update: %v", err)
}
})
var pingWithRetryStopChan chan struct{}
closeWgTunnel := func() {
@@ -529,9 +560,19 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
}
// If there is just one exit node, we can skip pinging it and use it directly
if len(exitNodes) == 1 {
if len(exitNodes) == 1 || preferEndpoint != "" {
logger.Debug("Only one exit node available, using it directly: %s", exitNodes[0].Endpoint)
// if the preferEndpoint is set, we will use it instead of the exit node endpoint. first you need to find the exit node with that endpoint in the list and send that one
if preferEndpoint != "" {
for _, node := range exitNodes {
if node.Endpoint == preferEndpoint {
exitNodes[0] = node
break
}
}
}
// Prepare data to send to the cloud for selection
pingResults := []ExitNodePingResult{
{
@@ -907,30 +948,6 @@ persistent_keepalive_interval=5`, fixKey(privateKey.String()), fixKey(wgData.Pub
logger.Info("SSH public key appended to authorized keys file")
})
// Initialize health check monitor with status change callback
healthMonitor = healthcheck.NewMonitor(func(targets map[int]*healthcheck.Target) {
logger.Debug("Health check status update for %d targets", len(targets))
// Send health status update to the server
healthStatuses := make(map[int]interface{})
for id, target := range targets {
healthStatuses[id] = map[string]interface{}{
"status": target.Status.String(),
"lastCheck": target.LastCheck.Format(time.RFC3339),
"checkCount": target.CheckCount,
"lastError": target.LastError,
"config": target.Config,
}
}
err := client.SendMessage("newt/healthcheck/status", map[string]interface{}{
"targets": healthStatuses,
})
if err != nil {
logger.Error("Failed to send health check status update: %v", err)
}
})
// Register handler for adding health check targets
client.RegisterHandler("newt/healthcheck/add", func(msg websocket.WSMessage) {
logger.Debug("Received health check add request: %+v", msg)