Add cert health info to checks

This commit is contained in:
Viktor Liu
2026-02-09 20:44:32 +08:00
parent 53c1016a8e
commit 73aa0785ba
9 changed files with 361 additions and 135 deletions

View File

@@ -8,12 +8,15 @@ import (
"encoding/binary" "encoding/binary"
"fmt" "fmt"
"net" "net"
"slices"
"sync" "sync"
"time" "time"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"golang.org/x/crypto/acme" "golang.org/x/crypto/acme"
"golang.org/x/crypto/acme/autocert" "golang.org/x/crypto/acme/autocert"
"github.com/netbirdio/netbird/shared/management/domain"
) )
// OID for the SCT list extension (1.3.6.1.4.1.11129.2.4.2) // OID for the SCT list extension (1.3.6.1.4.1.11129.2.4.2)
@@ -23,19 +26,31 @@ type certificateNotifier interface {
NotifyCertificateIssued(ctx context.Context, accountID, reverseProxyID, domain string) error NotifyCertificateIssued(ctx context.Context, accountID, reverseProxyID, domain string) error
} }
// domainState describes where a registered domain is in the certificate
// prefetch lifecycle.
type domainState int
const (
// domainPending is the zero value; AddDomain registers new domains in this state.
domainPending domainState = iota
// domainReady is set once prefetchCertificate obtains a certificate without error.
domainReady
// domainFailed is set when the GetCertificate call in prefetchCertificate returns an error.
domainFailed
)
// domainInfo holds the per-domain data tracked by Manager.
type domainInfo struct {
// accountID and reverseProxyID are handed to the certificate notifier
// once a certificate has been obtained for the domain.
accountID string
reverseProxyID string
// state is the current prefetch state of the domain.
state domainState
// err carries the error text of a failed prefetch; it is empty otherwise.
err string
}
// Manager wraps autocert.Manager with domain tracking and cross-replica // Manager wraps autocert.Manager with domain tracking and cross-replica
// coordination via a pluggable locking strategy. The locker prevents // coordination via a pluggable locking strategy. The locker prevents
// duplicate ACME requests when multiple replicas share a certificate cache. // duplicate ACME requests when multiple replicas share a certificate cache.
type Manager struct { type Manager struct {
*autocert.Manager *autocert.Manager
certDir string certDir string
locker certLocker locker certLocker
domainsMux sync.RWMutex mu sync.RWMutex
domains map[string]struct { domains map[domain.Domain]*domainInfo
accountID string
reverseProxyID string
}
certNotifier certificateNotifier certNotifier certificateNotifier
logger *log.Logger logger *log.Logger
@@ -49,12 +64,9 @@ func NewManager(certDir, acmeURL string, notifier certificateNotifier, logger *l
logger = log.StandardLogger() logger = log.StandardLogger()
} }
mgr := &Manager{ mgr := &Manager{
certDir: certDir, certDir: certDir,
locker: newCertLocker(lockMethod, certDir, logger), locker: newCertLocker(lockMethod, certDir, logger),
domains: make(map[string]struct { domains: make(map[domain.Domain]*domainInfo),
accountID string
reverseProxyID string
}),
certNotifier: notifier, certNotifier: notifier,
logger: logger, logger: logger,
} }
@@ -73,49 +85,50 @@ func (mgr *Manager) hostPolicy(_ context.Context, host string) error {
if h, _, err := net.SplitHostPort(host); err == nil { if h, _, err := net.SplitHostPort(host); err == nil {
host = h host = h
} }
mgr.domainsMux.RLock() mgr.mu.RLock()
_, exists := mgr.domains[host] _, exists := mgr.domains[domain.Domain(host)]
mgr.domainsMux.RUnlock() mgr.mu.RUnlock()
if !exists { if !exists {
return fmt.Errorf("unknown domain %q", host) return fmt.Errorf("unknown domain %q", host)
} }
return nil return nil
} }
func (mgr *Manager) AddDomain(domain, accountID, reverseProxyID string) { // AddDomain registers a domain for ACME certificate prefetching.
mgr.domainsMux.Lock() func (mgr *Manager) AddDomain(d domain.Domain, accountID, reverseProxyID string) {
mgr.domains[domain] = struct { mgr.mu.Lock()
accountID string mgr.domains[d] = &domainInfo{
reverseProxyID string
}{
accountID: accountID, accountID: accountID,
reverseProxyID: reverseProxyID, reverseProxyID: reverseProxyID,
state: domainPending,
} }
mgr.domainsMux.Unlock() mgr.mu.Unlock()
go mgr.prefetchCertificate(domain) go mgr.prefetchCertificate(d)
} }
// prefetchCertificate proactively triggers certificate generation for a domain. // prefetchCertificate proactively triggers certificate generation for a domain.
// It acquires a distributed lock to prevent multiple replicas from issuing // It acquires a distributed lock to prevent multiple replicas from issuing
// duplicate ACME requests. The second replica will block until the first // duplicate ACME requests. The second replica will block until the first
// finishes, then find the certificate in the cache. // finishes, then find the certificate in the cache.
func (mgr *Manager) prefetchCertificate(domain string) { func (mgr *Manager) prefetchCertificate(d domain.Domain) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel() defer cancel()
mgr.logger.Infof("acquiring cert lock for domain %q", domain) name := d.PunycodeString()
mgr.logger.Infof("acquiring cert lock for domain %q", name)
lockStart := time.Now() lockStart := time.Now()
unlock, err := mgr.locker.Lock(ctx, domain) unlock, err := mgr.locker.Lock(ctx, name)
if err != nil { if err != nil {
mgr.logger.Warnf("acquire cert lock for domain %q, proceeding without lock: %v", domain, err) mgr.logger.Warnf("acquire cert lock for domain %q, proceeding without lock: %v", name, err)
} else { } else {
mgr.logger.Infof("acquired cert lock for domain %q in %s", domain, time.Since(lockStart)) mgr.logger.Infof("acquired cert lock for domain %q in %s", name, time.Since(lockStart))
defer unlock() defer unlock()
} }
hello := &tls.ClientHelloInfo{ hello := &tls.ClientHelloInfo{
ServerName: domain, ServerName: name,
Conn: &dummyConn{ctx: ctx}, Conn: &dummyConn{ctx: ctx},
} }
@@ -123,35 +136,47 @@ func (mgr *Manager) prefetchCertificate(domain string) {
cert, err := mgr.GetCertificate(hello) cert, err := mgr.GetCertificate(hello)
elapsed := time.Since(start) elapsed := time.Since(start)
if err != nil { if err != nil {
mgr.logger.Warnf("prefetch certificate for domain %q: %v", domain, err) mgr.logger.Warnf("prefetch certificate for domain %q: %v", name, err)
mgr.setDomainState(d, domainFailed, err.Error())
return return
} }
mgr.setDomainState(d, domainReady, "")
now := time.Now() now := time.Now()
if cert != nil && cert.Leaf != nil { if cert != nil && cert.Leaf != nil {
leaf := cert.Leaf leaf := cert.Leaf
mgr.logger.Infof("certificate for domain %q ready in %s: serial=%s SANs=%v notAfter=%s", mgr.logger.Infof("certificate for domain %q ready in %s: serial=%s SANs=%v notAfter=%s",
domain, elapsed.Round(time.Millisecond), name, elapsed.Round(time.Millisecond),
leaf.SerialNumber.Text(16), leaf.SerialNumber.Text(16),
leaf.DNSNames, leaf.DNSNames,
leaf.NotAfter.UTC().Format(time.RFC3339), leaf.NotAfter.UTC().Format(time.RFC3339),
) )
mgr.logCertificateDetails(domain, leaf, now) mgr.logCertificateDetails(name, leaf, now)
} else { } else {
mgr.logger.Infof("certificate for domain %q ready in %s", domain, elapsed.Round(time.Millisecond)) mgr.logger.Infof("certificate for domain %q ready in %s", name, elapsed.Round(time.Millisecond))
} }
mgr.domainsMux.RLock() mgr.mu.RLock()
info, exists := mgr.domains[domain] info := mgr.domains[d]
mgr.domainsMux.RUnlock() mgr.mu.RUnlock()
if exists && mgr.certNotifier != nil { if info != nil && mgr.certNotifier != nil {
if err := mgr.certNotifier.NotifyCertificateIssued(ctx, info.accountID, info.reverseProxyID, domain); err != nil { if err := mgr.certNotifier.NotifyCertificateIssued(ctx, info.accountID, info.reverseProxyID, name); err != nil {
mgr.logger.Warnf("notify certificate ready for domain %q: %v", domain, err) mgr.logger.Warnf("notify certificate ready for domain %q: %v", name, err)
} }
} }
} }
// setDomainState records the prefetch outcome for d under the write lock.
// A domain that is not present in the tracking map is left untouched, so
// nothing is re-created for it here.
func (mgr *Manager) setDomainState(d domain.Domain, state domainState, errMsg string) {
	mgr.mu.Lock()
	defer mgr.mu.Unlock()

	entry, tracked := mgr.domains[d]
	if !tracked {
		return
	}
	entry.state, entry.err = state, errMsg
}
// logCertificateDetails logs certificate validity and SCT timestamps. // logCertificateDetails logs certificate validity and SCT timestamps.
func (mgr *Manager) logCertificateDetails(domain string, cert *x509.Certificate, now time.Time) { func (mgr *Manager) logCertificateDetails(domain string, cert *x509.Certificate, now time.Time) {
if cert.NotBefore.After(now) { if cert.NotBefore.After(now) {
@@ -245,8 +270,65 @@ func (c *dummyConn) SetDeadline(t time.Time) error { return nil }
func (c *dummyConn) SetReadDeadline(t time.Time) error { return nil } func (c *dummyConn) SetReadDeadline(t time.Time) error { return nil }
func (c *dummyConn) SetWriteDeadline(t time.Time) error { return nil } func (c *dummyConn) SetWriteDeadline(t time.Time) error { return nil }
func (mgr *Manager) RemoveDomain(domain string) { // RemoveDomain removes a domain from tracking.
mgr.domainsMux.Lock() func (mgr *Manager) RemoveDomain(d domain.Domain) {
defer mgr.domainsMux.Unlock() mgr.mu.Lock()
delete(mgr.domains, domain) defer mgr.mu.Unlock()
delete(mgr.domains, d)
}
// PendingCerts reports how many registered domains are still in the
// pending state, i.e. whose certificate prefetch has not finished yet.
func (mgr *Manager) PendingCerts() int {
	mgr.mu.RLock()
	defer mgr.mu.RUnlock()

	pending := 0
	for _, entry := range mgr.domains {
		if entry.state != domainPending {
			continue
		}
		pending++
	}
	return pending
}
// TotalDomains reports how many domains are currently registered,
// regardless of their certificate state.
func (mgr *Manager) TotalDomains() int {
	mgr.mu.RLock()
	count := len(mgr.domains)
	mgr.mu.RUnlock()
	return count
}
// PendingDomains returns the domain names currently being prefetched.
// It delegates to domainsByState with domainPending.
func (mgr *Manager) PendingDomains() []string {
return mgr.domainsByState(domainPending)
}
// ReadyDomains returns domain names that have successfully obtained certificates.
// It delegates to domainsByState with domainReady.
func (mgr *Manager) ReadyDomains() []string {
return mgr.domainsByState(domainReady)
}
// FailedDomains returns every domain whose certificate prefetch failed,
// keyed by its punycode name and mapped to the recorded error text.
// The returned map is never nil.
func (mgr *Manager) FailedDomains() map[string]string {
	mgr.mu.RLock()
	defer mgr.mu.RUnlock()

	failures := map[string]string{}
	for name, entry := range mgr.domains {
		if entry.state != domainFailed {
			continue
		}
		failures[name.PunycodeString()] = entry.err
	}
	return failures
}
func (mgr *Manager) domainsByState(state domainState) []string {
mgr.mu.RLock()
defer mgr.mu.RUnlock()
var domains []string
for d, info := range mgr.domains {
if info.state == state {
domains = append(domains, d.PunycodeString())
}
}
slices.Sort(domains)
return domains
} }

View File

@@ -3,6 +3,7 @@ package acme
import ( import (
"context" "context"
"testing" "testing"
"time"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
@@ -12,6 +13,14 @@ func TestHostPolicy(t *testing.T) {
mgr := NewManager(t.TempDir(), "https://acme.example.com/directory", nil, nil, "") mgr := NewManager(t.TempDir(), "https://acme.example.com/directory", nil, nil, "")
mgr.AddDomain("example.com", "acc1", "rp1") mgr.AddDomain("example.com", "acc1", "rp1")
// Wait for the background prefetch goroutine to finish so the temp dir
// can be cleaned up without a race.
t.Cleanup(func() {
assert.Eventually(t, func() bool {
return mgr.PendingCerts() == 0
}, 30*time.Second, 50*time.Millisecond)
})
tests := []struct { tests := []struct {
name string name string
host string host string
@@ -59,3 +68,35 @@ func TestHostPolicy(t *testing.T) {
}) })
} }
} }
// TestDomainStates registers two domains against an ACME directory URL
// that is not expected to issue certificates, then checks the counters
// and per-state listings before registration and after the prefetch
// goroutines have finished.
func TestDomainStates(t *testing.T) {
	const acmeDirectory = "https://acme.example.com/directory"
	mgr := NewManager(t.TempDir(), acmeDirectory, nil, nil, "")

	// A fresh manager tracks nothing.
	assert.Equal(t, 0, mgr.PendingCerts(), "initially zero")
	assert.Equal(t, 0, mgr.TotalDomains(), "initially zero domains")
	assert.Empty(t, mgr.PendingDomains())
	assert.Empty(t, mgr.ReadyDomains())
	assert.Empty(t, mgr.FailedDomains())

	// Each AddDomain call registers the domain as pending and starts a
	// prefetch goroutine; without a real ACME server the prefetch is
	// expected to end in the failed state.
	mgr.AddDomain("a.example.com", "acc1", "rp1")
	mgr.AddDomain("b.example.com", "acc1", "rp1")

	assert.Equal(t, 2, mgr.TotalDomains(), "two domains registered")

	// Wait until no domain is pending any more.
	noPending := func() bool {
		return mgr.PendingCerts() == 0
	}
	assert.Eventually(t, noPending, 30*time.Second, 100*time.Millisecond, "pending certs should return to zero after prefetch completes")

	assert.Empty(t, mgr.PendingDomains())
	assert.Equal(t, 2, mgr.TotalDomains(), "total domains unchanged")

	// Both domains must be reported as failed, and none as ready.
	failureByDomain := mgr.FailedDomains()
	assert.Len(t, failureByDomain, 2, "both domains should have failed")
	for _, name := range []string{"a.example.com", "b.example.com"} {
		assert.Contains(t, failureByDomain, name)
	}
	assert.Empty(t, mgr.ReadyDomains())
}

View File

@@ -56,6 +56,33 @@ func (c *Client) printHealth(data map[string]any) {
_, _ = fmt.Fprintf(c.out, "Management Connected: %s\n", boolIcon(data["management_connected"])) _, _ = fmt.Fprintf(c.out, "Management Connected: %s\n", boolIcon(data["management_connected"]))
_, _ = fmt.Fprintf(c.out, "All Clients Healthy: %s\n", boolIcon(data["all_clients_healthy"])) _, _ = fmt.Fprintf(c.out, "All Clients Healthy: %s\n", boolIcon(data["all_clients_healthy"]))
total, _ := data["certs_total"].(float64)
ready, _ := data["certs_ready"].(float64)
pending, _ := data["certs_pending"].(float64)
failed, _ := data["certs_failed"].(float64)
if total > 0 {
_, _ = fmt.Fprintf(c.out, "Certificates: %d ready, %d pending, %d failed (%d total)\n",
int(ready), int(pending), int(failed), int(total))
}
if domains, ok := data["certs_ready_domains"].([]any); ok && len(domains) > 0 {
_, _ = fmt.Fprintf(c.out, " Ready:\n")
for _, d := range domains {
_, _ = fmt.Fprintf(c.out, " %v\n", d)
}
}
if domains, ok := data["certs_pending_domains"].([]any); ok && len(domains) > 0 {
_, _ = fmt.Fprintf(c.out, " Pending:\n")
for _, d := range domains {
_, _ = fmt.Fprintf(c.out, " %v\n", d)
}
}
if domains, ok := data["certs_failed_domains"].(map[string]any); ok && len(domains) > 0 {
_, _ = fmt.Fprintf(c.out, " Failed:\n")
for d, errMsg := range domains {
_, _ = fmt.Fprintf(c.out, " %s: %v\n", d, errMsg)
}
}
clients, ok := data["clients"].(map[string]any) clients, ok := data["clients"].(map[string]any)
if !ok || len(clients) == 0 { if !ok || len(clients) == 0 {
return return
@@ -328,7 +355,7 @@ func (c *Client) fetch(ctx context.Context, path string) (map[string]any, []byte
if err != nil { if err != nil {
return nil, nil, fmt.Errorf("request failed: %w", err) return nil, nil, fmt.Errorf("request failed: %w", err)
} }
defer resp.Body.Close() defer func() { _ = resp.Body.Close() }()
body, err := io.ReadAll(resp.Body) body, err := io.ReadAll(resp.Body)
if err != nil { if err != nil {
@@ -346,4 +373,3 @@ func (c *Client) fetch(ctx context.Context, path string) (map[string]any, []byte
return data, body, nil return data, body, nil
} }

View File

@@ -2,12 +2,15 @@
package debug package debug
import ( import (
"cmp"
"context" "context"
"embed" "embed"
"encoding/json" "encoding/json"
"fmt" "fmt"
"html/template" "html/template"
"maps"
"net/http" "net/http"
"slices"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -47,6 +50,10 @@ func formatDuration(d time.Duration) string {
} }
} }
// sortedAccountIDs returns the keys of m in ascending order, giving
// callers a stable iteration order over the client map.
func sortedAccountIDs(m map[types.AccountID]roundtrip.ClientDebugInfo) []types.AccountID {
	ids := slices.Collect(maps.Keys(m))
	slices.Sort(ids)
	return ids
}
// clientProvider provides access to NetBird clients. // clientProvider provides access to NetBird clients.
type clientProvider interface { type clientProvider interface {
GetClient(accountID types.AccountID) (*nbembed.Client, bool) GetClient(accountID types.AccountID) (*nbembed.Client, bool)
@@ -60,10 +67,18 @@ type healthChecker interface {
CheckClientsConnected(ctx context.Context) (bool, map[types.AccountID]health.ClientHealth) CheckClientsConnected(ctx context.Context) (bool, map[types.AccountID]health.ClientHealth)
} }
// certStatus is the view of certificate prefetch state that the debug
// handlers read when building the index and health responses.
type certStatus interface {
// TotalDomains returns the number of registered domains.
TotalDomains() int
// PendingDomains returns the names of domains still being prefetched.
PendingDomains() []string
// ReadyDomains returns the names of domains that obtained a certificate.
ReadyDomains() []string
// FailedDomains returns failed domain names mapped to their error text.
FailedDomains() map[string]string
}
// Handler provides HTTP debug endpoints. // Handler provides HTTP debug endpoints.
type Handler struct { type Handler struct {
provider clientProvider provider clientProvider
health healthChecker health healthChecker
certStatus certStatus
logger *log.Logger logger *log.Logger
startTime time.Time startTime time.Time
templates *template.Template templates *template.Template
@@ -87,6 +102,11 @@ func NewHandler(provider clientProvider, healthChecker healthChecker, logger *lo
return h return h
} }
// SetCertStatus sets the certificate status provider for ACME prefetch observability.
// While no provider is set, the handlers report zero certificate counts and
// omit the per-state domain lists.
// NOTE(review): the field is written without synchronization — presumably
// this is called before the handler starts serving; confirm against callers.
func (h *Handler) SetCertStatus(cs certStatus) {
h.certStatus = cs
}
func (h *Handler) loadTemplates() error { func (h *Handler) loadTemplates() error {
tmpl, err := template.ParseFS(templateFS, "templates/*.html") tmpl, err := template.ParseFS(templateFS, "templates/*.html")
if err != nil { if err != nil {
@@ -160,12 +180,24 @@ func (h *Handler) handleClientRoutes(w http.ResponseWriter, r *http.Request, pat
return true return true
} }
// failedDomain pairs a domain name with the error text of its failed
// certificate prefetch. It is the element type of the failed-domain list
// passed to the index template.
type failedDomain struct {
// Domain is the domain name as reported by the certificate status provider.
Domain string
// Error is the error text recorded for that domain.
Error string
}
type indexData struct { type indexData struct {
Version string Version string
Uptime string Uptime string
ClientCount int ClientCount int
TotalDomains int TotalDomains int
Clients []clientData CertsTotal int
CertsReady int
CertsPending int
CertsFailed int
CertsPendingDomains []string
CertsReadyDomains []string
CertsFailedDomains []failedDomain
Clients []clientData
} }
type clientData struct { type clientData struct {
@@ -177,15 +209,30 @@ type clientData struct {
func (h *Handler) handleIndex(w http.ResponseWriter, _ *http.Request, wantJSON bool) { func (h *Handler) handleIndex(w http.ResponseWriter, _ *http.Request, wantJSON bool) {
clients := h.provider.ListClientsForDebug() clients := h.provider.ListClientsForDebug()
sortedIDs := sortedAccountIDs(clients)
totalDomains := 0 totalDomains := 0
for _, info := range clients { for _, info := range clients {
totalDomains += info.DomainCount totalDomains += info.DomainCount
} }
var certsTotal, certsReady, certsPending, certsFailed int
var certsPendingDomains, certsReadyDomains []string
var certsFailedDomains map[string]string
if h.certStatus != nil {
certsTotal = h.certStatus.TotalDomains()
certsPendingDomains = h.certStatus.PendingDomains()
certsReadyDomains = h.certStatus.ReadyDomains()
certsFailedDomains = h.certStatus.FailedDomains()
certsReady = len(certsReadyDomains)
certsPending = len(certsPendingDomains)
certsFailed = len(certsFailedDomains)
}
if wantJSON { if wantJSON {
clientsJSON := make([]map[string]interface{}, 0, len(clients)) clientsJSON := make([]map[string]interface{}, 0, len(clients))
for _, info := range clients { for _, id := range sortedIDs {
info := clients[id]
clientsJSON = append(clientsJSON, map[string]interface{}{ clientsJSON = append(clientsJSON, map[string]interface{}{
"account_id": info.AccountID, "account_id": info.AccountID,
"domain_count": info.DomainCount, "domain_count": info.DomainCount,
@@ -195,25 +242,55 @@ func (h *Handler) handleIndex(w http.ResponseWriter, _ *http.Request, wantJSON b
"age": time.Since(info.CreatedAt).Round(time.Second).String(), "age": time.Since(info.CreatedAt).Round(time.Second).String(),
}) })
} }
h.writeJSON(w, map[string]interface{}{ resp := map[string]interface{}{
"version": version.NetbirdVersion(), "version": version.NetbirdVersion(),
"uptime": time.Since(h.startTime).Round(time.Second).String(), "uptime": time.Since(h.startTime).Round(time.Second).String(),
"client_count": len(clients), "client_count": len(clients),
"total_domains": totalDomains, "total_domains": totalDomains,
"certs_total": certsTotal,
"certs_ready": certsReady,
"certs_pending": certsPending,
"certs_failed": certsFailed,
"clients": clientsJSON, "clients": clientsJSON,
}) }
if len(certsPendingDomains) > 0 {
resp["certs_pending_domains"] = certsPendingDomains
}
if len(certsReadyDomains) > 0 {
resp["certs_ready_domains"] = certsReadyDomains
}
if len(certsFailedDomains) > 0 {
resp["certs_failed_domains"] = certsFailedDomains
}
h.writeJSON(w, resp)
return return
} }
sortedFailed := make([]failedDomain, 0, len(certsFailedDomains))
for d, e := range certsFailedDomains {
sortedFailed = append(sortedFailed, failedDomain{Domain: d, Error: e})
}
slices.SortFunc(sortedFailed, func(a, b failedDomain) int {
return cmp.Compare(a.Domain, b.Domain)
})
data := indexData{ data := indexData{
Version: version.NetbirdVersion(), Version: version.NetbirdVersion(),
Uptime: time.Since(h.startTime).Round(time.Second).String(), Uptime: time.Since(h.startTime).Round(time.Second).String(),
ClientCount: len(clients), ClientCount: len(clients),
TotalDomains: totalDomains, TotalDomains: totalDomains,
Clients: make([]clientData, 0, len(clients)), CertsTotal: certsTotal,
CertsReady: certsReady,
CertsPending: certsPending,
CertsFailed: certsFailed,
CertsPendingDomains: certsPendingDomains,
CertsReadyDomains: certsReadyDomains,
CertsFailedDomains: sortedFailed,
Clients: make([]clientData, 0, len(clients)),
} }
for _, info := range clients { for _, id := range sortedIDs {
info := clients[id]
domains := info.Domains.SafeString() domains := info.Domains.SafeString()
if domains == "" { if domains == "" {
domains = "-" domains = "-"
@@ -240,10 +317,12 @@ type clientsData struct {
func (h *Handler) handleListClients(w http.ResponseWriter, _ *http.Request, wantJSON bool) { func (h *Handler) handleListClients(w http.ResponseWriter, _ *http.Request, wantJSON bool) {
clients := h.provider.ListClientsForDebug() clients := h.provider.ListClientsForDebug()
sortedIDs := sortedAccountIDs(clients)
if wantJSON { if wantJSON {
clientsJSON := make([]map[string]interface{}, 0, len(clients)) clientsJSON := make([]map[string]interface{}, 0, len(clients))
for _, info := range clients { for _, id := range sortedIDs {
info := clients[id]
clientsJSON = append(clientsJSON, map[string]interface{}{ clientsJSON = append(clientsJSON, map[string]interface{}{
"account_id": info.AccountID, "account_id": info.AccountID,
"domain_count": info.DomainCount, "domain_count": info.DomainCount,
@@ -266,7 +345,8 @@ func (h *Handler) handleListClients(w http.ResponseWriter, _ *http.Request, want
Clients: make([]clientData, 0, len(clients)), Clients: make([]clientData, 0, len(clients)),
} }
for _, info := range clients { for _, id := range sortedIDs {
info := clients[id]
domains := info.Domains.SafeString() domains := info.Domains.SafeString()
if domains == "" { if domains == "" {
domains = "-" domains = "-"
@@ -556,15 +636,12 @@ func (h *Handler) handleClientStop(w http.ResponseWriter, r *http.Request, accou
}) })
} }
type healthData struct {
Uptime string
Status string
ManagementReady bool
AllClientsHealthy bool
Clients map[types.AccountID]health.ClientHealth
}
func (h *Handler) handleHealth(w http.ResponseWriter, r *http.Request, wantJSON bool) { func (h *Handler) handleHealth(w http.ResponseWriter, r *http.Request, wantJSON bool) {
if !wantJSON {
http.Redirect(w, r, "/debug", http.StatusSeeOther)
return
}
uptime := time.Since(h.startTime).Round(10 * time.Millisecond).String() uptime := time.Since(h.startTime).Round(10 * time.Millisecond).String()
ready := h.health.ReadinessProbe() ready := h.health.ReadinessProbe()
@@ -575,26 +652,40 @@ func (h *Handler) handleHealth(w http.ResponseWriter, r *http.Request, wantJSON
status = "degraded" status = "degraded"
} }
if wantJSON { var certsTotal, certsReady, certsPending, certsFailed int
h.writeJSON(w, map[string]interface{}{ var certsPendingDomains, certsReadyDomains []string
"status": status, var certsFailedDomains map[string]string
"uptime": uptime, if h.certStatus != nil {
"management_connected": ready, certsTotal = h.certStatus.TotalDomains()
"all_clients_healthy": allHealthy, certsPendingDomains = h.certStatus.PendingDomains()
"clients": clientHealth, certsReadyDomains = h.certStatus.ReadyDomains()
}) certsFailedDomains = h.certStatus.FailedDomains()
return certsReady = len(certsReadyDomains)
certsPending = len(certsPendingDomains)
certsFailed = len(certsFailedDomains)
} }
data := healthData{ resp := map[string]any{
Uptime: time.Since(h.startTime).Round(time.Second).String(), "status": status,
Status: status, "uptime": uptime,
ManagementReady: ready, "management_connected": ready,
AllClientsHealthy: allHealthy, "all_clients_healthy": allHealthy,
Clients: clientHealth, "certs_total": certsTotal,
"certs_ready": certsReady,
"certs_pending": certsPending,
"certs_failed": certsFailed,
"clients": clientHealth,
} }
if len(certsPendingDomains) > 0 {
h.renderTemplate(w, "health", data) resp["certs_pending_domains"] = certsPendingDomains
}
if len(certsReadyDomains) > 0 {
resp["certs_ready_domains"] = certsReadyDomains
}
if len(certsFailedDomains) > 0 {
resp["certs_failed_domains"] = certsFailedDomains
}
h.writeJSON(w, resp)
} }
func (h *Handler) renderTemplate(w http.ResponseWriter, name string, data interface{}) { func (h *Handler) renderTemplate(w http.ResponseWriter, name string, data interface{}) {

View File

@@ -1,39 +0,0 @@
{{define "health"}}{{/* Health page: shows .Status, .Uptime, .ManagementReady and .AllClientsHealthy, followed by a per-client table that is rendered only when .Clients is non-empty. Relies on the separately defined "style" template for its CSS. */}}
<!DOCTYPE html>
<html>
<head>
<title>Health</title>
<style>{{template "style"}}</style>
</head>
<body>
<h1>{{.Status}}</h1>
<p>Uptime: {{.Uptime}}</p>
<p>Management Connected: {{.ManagementReady}}</p>
<p>All Clients Healthy: {{.AllClientsHealthy}}</p>
{{if .Clients}}
<h2>Clients</h2>
<table>
<tr>
<th>Account ID</th>
<th>Healthy</th>
<th>Management</th>
<th>Signal</th>
<th>Relays</th>
<th>Error</th>
</tr>
{{range $id, $c := .Clients}}
<tr>
<td>{{$id}}</td>
<td>{{$c.Healthy}}</td>
<td>{{$c.ManagementConnected}}</td>
<td>{{$c.SignalConnected}}</td>
<td>{{$c.RelaysConnected}}/{{$c.RelaysTotal}}</td>
<td>{{$c.Error}}</td>
</tr>
{{end}}
</table>
{{end}}
<p><a href="/debug">&larr; Back</a></p>
</body>
</html>
{{end}}

View File

@@ -8,6 +8,25 @@
<body> <body>
<h1>NetBird Proxy Debug</h1> <h1>NetBird Proxy Debug</h1>
<p class="info">Version: {{.Version}} | Uptime: {{.Uptime}}</p> <p class="info">Version: {{.Version}} | Uptime: {{.Uptime}}</p>
<h2>Certificates: {{.CertsReady}} ready, {{.CertsPending}} pending, {{.CertsFailed}} failed ({{.CertsTotal}} total)</h2>
{{if .CertsReadyDomains}}
<details>
<summary>Ready domains ({{.CertsReady}})</summary>
<ul>{{range .CertsReadyDomains}}<li>{{.}}</li>{{end}}</ul>
</details>
{{end}}
{{if .CertsPendingDomains}}
<details open>
<summary>Pending domains ({{.CertsPending}})</summary>
<ul>{{range .CertsPendingDomains}}<li>{{.}}</li>{{end}}</ul>
</details>
{{end}}
{{if .CertsFailedDomains}}
<details open>
<summary>Failed domains ({{.CertsFailed}})</summary>
<ul>{{range .CertsFailedDomains}}<li>{{.Domain}}: {{.Error}}</li>{{end}}</ul>
</details>
{{end}}
<h2>Clients ({{.ClientCount}}) | Domains ({{.TotalDomains}})</h2> <h2>Clients ({{.ClientCount}}) | Domains ({{.TotalDomains}})</h2>
{{if .Clients}} {{if .Clients}}
<table> <table>
@@ -32,7 +51,6 @@
<h2>Endpoints</h2> <h2>Endpoints</h2>
<ul> <ul>
<li><a href="/debug/clients">/debug/clients</a> - all clients detail</li> <li><a href="/debug/clients">/debug/clients</a> - all clients detail</li>
<li><a href="/debug/health">/debug/health</a> - health check</li>
</ul> </ul>
<p class="info">Add ?format=json or /json suffix for JSON output</p> <p class="info">Add ?format=json or /json suffix for JSON output</p>
</body> </body>

View File

@@ -63,7 +63,11 @@ func Unlock(f *os.File) error {
return nil return nil
} }
defer f.Close() defer func() {
if cerr := f.Close(); cerr != nil {
log.Debugf("close lock file: %v", cerr)
}
}()
if err := syscall.Flock(int(f.Fd()), syscall.LOCK_UN); err != nil { if err := syscall.Flock(int(f.Fd()), syscall.LOCK_UN); err != nil {
return fmt.Errorf("release lock: %w", err) return fmt.Errorf("release lock: %w", err)

View File

@@ -65,7 +65,7 @@ type MicroTime struct {
const microTimeFormat = "2006-01-02T15:04:05.000000Z" const microTimeFormat = "2006-01-02T15:04:05.000000Z"
// MarshalJSON implements json.Marshaler with k8s MicroTime format. // MarshalJSON implements json.Marshaler with k8s MicroTime format.
func (t MicroTime) MarshalJSON() ([]byte, error) { func (t *MicroTime) MarshalJSON() ([]byte, error) {
return json.Marshal(t.UTC().Format(microTimeFormat)) return json.Marshal(t.UTC().Format(microTimeFormat))
} }
@@ -148,7 +148,7 @@ func (c *LeaseClient) Get(ctx context.Context, name string) (*Lease, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer resp.Body.Close() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode == http.StatusNotFound { if resp.StatusCode == http.StatusNotFound {
return nil, nil return nil, nil
@@ -179,7 +179,7 @@ func (c *LeaseClient) Create(ctx context.Context, lease *Lease) (*Lease, error)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer resp.Body.Close() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode == http.StatusConflict { if resp.StatusCode == http.StatusConflict {
return nil, ErrConflict return nil, ErrConflict
@@ -208,7 +208,7 @@ func (c *LeaseClient) Update(ctx context.Context, lease *Lease) (*Lease, error)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer resp.Body.Close() defer func() { _ = resp.Body.Close() }()
if resp.StatusCode == http.StatusConflict { if resp.StatusCode == http.StatusConflict {
return nil, ErrConflict return nil, ErrConflict

View File

@@ -247,6 +247,9 @@ func (s *Server) ListenAndServe(ctx context.Context, addr string) (err error) {
if s.DebugEndpointEnabled { if s.DebugEndpointEnabled {
debugAddr := debugEndpointAddr(s.DebugEndpointAddress) debugAddr := debugEndpointAddr(s.DebugEndpointAddress)
debugHandler := debug.NewHandler(s.netbird, s.healthChecker, s.Logger) debugHandler := debug.NewHandler(s.netbird, s.healthChecker, s.Logger)
if s.acme != nil {
debugHandler.SetCertStatus(s.acme)
}
s.debug = &http.Server{ s.debug = &http.Server{
Addr: debugAddr, Addr: debugAddr,
Handler: debugHandler, Handler: debugHandler,
@@ -359,7 +362,7 @@ func (s *Server) shutdownServices() {
defer wg.Done() defer wg.Done()
ctx, cancel := context.WithTimeout(context.Background(), shutdownServiceTimeout) ctx, cancel := context.WithTimeout(context.Background(), shutdownServiceTimeout)
defer cancel() defer cancel()
if err := shutdown(ctx); err != nil && !errors.Is(err, http.ErrServerClosed) { if err := shutdown(ctx); err != nil {
s.Logger.Debugf("%s shutdown: %v", name, err) s.Logger.Debugf("%s shutdown: %v", name, err)
} }
}() }()
@@ -516,7 +519,7 @@ func (s *Server) addMapping(ctx context.Context, mapping *proto.ProxyMapping) er
return fmt.Errorf("create peer for domain %q: %w", d, err) return fmt.Errorf("create peer for domain %q: %w", d, err)
} }
if s.acme != nil { if s.acme != nil {
s.acme.AddDomain(string(d), string(accountID), reverseProxyID) s.acme.AddDomain(d, string(accountID), reverseProxyID)
} }
// Pass the mapping through to the update function to avoid duplicating the // Pass the mapping through to the update function to avoid duplicating the
@@ -562,7 +565,7 @@ func (s *Server) removeMapping(ctx context.Context, mapping *proto.ProxyMapping)
}).Error("Error removing NetBird peer connection for domain, continuing additional domain cleanup but peer connection may still exist") }).Error("Error removing NetBird peer connection for domain, continuing additional domain cleanup but peer connection may still exist")
} }
if s.acme != nil { if s.acme != nil {
s.acme.RemoveDomain(mapping.GetDomain()) s.acme.RemoveDomain(d)
} }
s.auth.RemoveDomain(mapping.GetDomain()) s.auth.RemoveDomain(mapping.GetDomain())
s.proxy.RemoveMapping(s.protoToMapping(mapping)) s.proxy.RemoveMapping(s.protoToMapping(mapping))