[server] Add health check HTTP endpoint for Relay server (#4297)

The health check endpoint listens on a dedicated HTTP server.
By default, it is available at 0.0.0.0:9000/health. This can be configured using the --health-listen-address flag.

The results are cached for 3 seconds to avoid excessive calls.

The health check performs the following:

Checks the number of active listeners.
Validates each listener via WebSocket and QUIC dials, including TLS certificate verification.
This commit is contained in:
Zoltan Papp
2025-08-13 10:40:04 +02:00
committed by GitHub
parent a4e8647aef
commit 3d4b502126
14 changed files with 354 additions and 18 deletions

View File

@@ -9,6 +9,7 @@ import (
"net/http"
"os"
"os/signal"
"sync"
"syscall"
"time"
@@ -17,8 +18,9 @@ import (
"github.com/spf13/cobra"
"github.com/netbirdio/netbird/encryption"
"github.com/netbirdio/netbird/shared/relay/auth"
"github.com/netbirdio/netbird/relay/healthcheck"
"github.com/netbirdio/netbird/relay/server"
"github.com/netbirdio/netbird/shared/relay/auth"
"github.com/netbirdio/netbird/signal/metrics"
"github.com/netbirdio/netbird/util"
)
@@ -34,12 +36,13 @@ type Config struct {
LetsencryptDomains []string
// in case of using Route 53 for DNS challenge the credentials should be provided in the environment variables or
// in the AWS credentials file
LetsencryptAWSRoute53 bool
TlsCertFile string
TlsKeyFile string
AuthSecret string
LogLevel string
LogFile string
LetsencryptAWSRoute53 bool
TlsCertFile string
TlsKeyFile string
AuthSecret string
LogLevel string
LogFile string
HealthcheckListenAddress string
}
func (c Config) Validate() error {
@@ -87,6 +90,7 @@ func init() {
rootCmd.PersistentFlags().StringVarP(&cobraConfig.AuthSecret, "auth-secret", "s", "", "auth secret")
rootCmd.PersistentFlags().StringVar(&cobraConfig.LogLevel, "log-level", "info", "log level")
rootCmd.PersistentFlags().StringVar(&cobraConfig.LogFile, "log-file", "console", "log file")
rootCmd.PersistentFlags().StringVarP(&cobraConfig.HealthcheckListenAddress, "health-listen-address", "H", ":9000", "listen address of healthcheck server")
setFlagsFromEnvVars(rootCmd)
}
@@ -102,6 +106,7 @@ func waitForExitSignal() {
}
func execute(cmd *cobra.Command, args []string) error {
wg := sync.WaitGroup{}
err := cobraConfig.Validate()
if err != nil {
log.Debugf("invalid config: %s", err)
@@ -120,7 +125,9 @@ func execute(cmd *cobra.Command, args []string) error {
return fmt.Errorf("setup metrics: %v", err)
}
wg.Add(1)
go func() {
defer wg.Done()
log.Infof("running metrics server: %s%s", metricsServer.Addr, metricsServer.Endpoint)
if err := metricsServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
log.Fatalf("Failed to start metrics server: %v", err)
@@ -154,12 +161,31 @@ func execute(cmd *cobra.Command, args []string) error {
return fmt.Errorf("failed to create relay server: %v", err)
}
log.Infof("server will be available on: %s", srv.InstanceURL())
wg.Add(1)
go func() {
defer wg.Done()
if err := srv.Listen(srvListenerCfg); err != nil {
log.Fatalf("failed to bind server: %s", err)
}
}()
hCfg := healthcheck.Config{
ListenAddress: cobraConfig.HealthcheckListenAddress,
ServiceChecker: srv,
}
httpHealthcheck, err := healthcheck.NewServer(hCfg)
if err != nil {
log.Debugf("failed to create healthcheck server: %v", err)
return fmt.Errorf("failed to create healthcheck server: %v", err)
}
wg.Add(1)
go func() {
defer wg.Done()
if err := httpHealthcheck.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
log.Fatalf("Failed to start healthcheck server: %v", err)
}
}()
// it will block until exit signal
waitForExitSignal()
@@ -167,6 +193,10 @@ func execute(cmd *cobra.Command, args []string) error {
defer cancel()
var shutDownErrors error
if err := httpHealthcheck.Shutdown(ctx); err != nil {
shutDownErrors = multierror.Append(shutDownErrors, fmt.Errorf("failed to close healthcheck server: %v", err))
}
if err := srv.Shutdown(ctx); err != nil {
shutDownErrors = multierror.Append(shutDownErrors, fmt.Errorf("failed to close server: %s", err))
}
@@ -175,6 +205,8 @@ func execute(cmd *cobra.Command, args []string) error {
if err := metricsServer.Shutdown(ctx); err != nil {
shutDownErrors = multierror.Append(shutDownErrors, fmt.Errorf("failed to close metrics server: %v", err))
}
wg.Wait()
return shutDownErrors
}