feat(metrics): add private-service + BYOP signals to self-hosted telemetry

Private-service adoption signals (per-account aggregation, layered on
the existing services loop):

- services_private                       — services with Private=true
- services_private_with_access_groups    — Private services with ≥1 access group
- services_private_access_groups_sum     — total access groups across Private services
- services_with_direct_upstream          — services whose targets use direct_upstream
- services_target_type_cluster           — automatic via existing target_type loop

BYOP / proxy-cluster signals (system-wide via a new store method):

- proxy_clusters         — distinct cluster_address across the proxies table
- proxy_clusters_byop    — distinct cluster_address with account_id != NULL (BYOP)
- proxy_clusters_private — distinct cluster_address where any proxy reported
                            the private capability (embedded `netbird proxy`)
- proxies                — total proxy rows
- proxies_connected      — connected proxies within the active heartbeat window

Implementation:

- store.ProxyMetrics struct + Store.GetProxyMetrics interface method
- SqlStore.GetProxyMetrics: single COUNT(DISTINCT ... CASE WHEN ...)
  round-trip, portable across sqlite/postgres/mysql. Uses the same
  proxyActiveThreshold the cluster-capability queries already honor.
- FileStore.GetProxyMetrics: returns zero-valued struct (proxy state
  isn't persisted in JSON file format).
- store_mock.go regenerated via go generate.
- metrics.DataSource gains GetProxyMetrics; Worker.generateProperties
  collects + emits the new properties.
- selfhosted_test.go fixtures extended with a private service (1 cluster
  target with direct_upstream, 2 access groups) and a host-target with
  direct_upstream to exercise both gates; mockDatasource.GetProxyMetrics
  returns canned counts so the new assertions verify end-to-end wiring.
This commit is contained in:
mlsmaycon
2026-05-21 18:11:52 +02:00
parent a423b788c0
commit 0fd5c59ce3
6 changed files with 183 additions and 9 deletions

View File

@@ -17,6 +17,7 @@ import (
rpservice "github.com/netbirdio/netbird/management/internals/modules/reverseproxy/service"
log "github.com/sirupsen/logrus"
"github.com/netbirdio/netbird/management/server/store"
"github.com/netbirdio/netbird/management/server/types"
nbversion "github.com/netbirdio/netbird/version"
)
@@ -53,6 +54,7 @@ type DataSource interface {
GetAllAccounts(ctx context.Context) []*types.Account
GetStoreEngine() types.Engine
GetCustomDomainsCounts(ctx context.Context) (total int64, validated int64, err error)
GetProxyMetrics(ctx context.Context) (store.ProxyMetrics, error)
}
// ConnManager peer connection manager that holds state for current active connections
@@ -223,6 +225,12 @@ func (w *Worker) generateProperties(ctx context.Context) properties {
servicesAuthPassword int
servicesAuthPin int
servicesAuthOIDC int
// Private-service signals — track adoption of NetBird-only mode
// (services backed by an embedded proxy peer + access groups).
servicesPrivate int
servicesPrivateWithGroups int
servicesPrivateAccessGroupsSum int
servicesWithDirectUpstream int
)
start := time.Now()
metricsProperties := make(properties)
@@ -380,9 +388,31 @@ func (w *Worker) generateProperties(ctx context.Context) properties {
if service.Auth.BearerAuth != nil && service.Auth.BearerAuth.Enabled {
servicesAuthOIDC++
}
if service.Private {
servicesPrivate++
if len(service.AccessGroups) > 0 {
servicesPrivateWithGroups++
}
servicesPrivateAccessGroupsSum += len(service.AccessGroups)
}
for _, target := range service.Targets {
if target.Options.DirectUpstream {
servicesWithDirectUpstream++
break
}
}
}
}
// Proxy / BYOP cluster signals come from the proxies table aggregated
// across all accounts in a single store query; nil on FileStore.
proxyMetrics, err := w.dataSource.GetProxyMetrics(ctx)
if err != nil {
log.WithContext(ctx).Debugf("collect proxy metrics: %v", err)
}
minActivePeerVersion, maxActivePeerVersion := getMinMaxVersion(peerActiveVersions)
metricsProperties["uptime"] = uptime
metricsProperties["accounts"] = accounts
@@ -430,6 +460,15 @@ func (w *Worker) generateProperties(ctx context.Context) properties {
metricsProperties["services_auth_password"] = servicesAuthPassword
metricsProperties["services_auth_pin"] = servicesAuthPin
metricsProperties["services_auth_oidc"] = servicesAuthOIDC
metricsProperties["services_private"] = servicesPrivate
metricsProperties["services_private_with_access_groups"] = servicesPrivateWithGroups
metricsProperties["services_private_access_groups_sum"] = servicesPrivateAccessGroupsSum
metricsProperties["services_with_direct_upstream"] = servicesWithDirectUpstream
metricsProperties["proxy_clusters"] = proxyMetrics.Clusters
metricsProperties["proxy_clusters_byop"] = proxyMetrics.ClustersBYOP
metricsProperties["proxy_clusters_private"] = proxyMetrics.ClustersPrivate
metricsProperties["proxies"] = proxyMetrics.Proxies
metricsProperties["proxies_connected"] = proxyMetrics.ProxiesConnected
metricsProperties["custom_domains"] = customDomains
metricsProperties["custom_domains_validated"] = customDomainsValidated

View File

@@ -12,6 +12,7 @@ import (
networkTypes "github.com/netbirdio/netbird/management/server/networks/types"
nbpeer "github.com/netbirdio/netbird/management/server/peer"
"github.com/netbirdio/netbird/management/server/posture"
"github.com/netbirdio/netbird/management/server/store"
"github.com/netbirdio/netbird/management/server/types"
"github.com/netbirdio/netbird/route"
)
@@ -123,7 +124,7 @@ func (mockDatasource) GetAllAccounts(_ context.Context) []*types.Account {
Enabled: true,
Targets: []*rpservice.Target{
{TargetType: "peer"},
{TargetType: "host"},
{TargetType: "host", Options: rpservice.TargetOptions{DirectUpstream: true}},
},
Auth: rpservice.AuthConfig{
PasswordAuth: &rpservice.PasswordAuthConfig{Enabled: true},
@@ -141,6 +142,16 @@ func (mockDatasource) GetAllAccounts(_ context.Context) []*types.Account {
},
Meta: rpservice.Meta{Status: string(rpservice.StatusPending)},
},
{
ID: "svc3-private",
Enabled: true,
Private: true,
AccessGroups: []string{"grp-eng", "grp-ops"},
Targets: []*rpservice.Target{
{TargetType: "cluster", Options: rpservice.TargetOptions{DirectUpstream: true}},
},
Meta: rpservice.Meta{Status: string(rpservice.StatusActive)},
},
},
},
{
@@ -254,6 +265,18 @@ func (mockDatasource) GetCustomDomainsCounts(_ context.Context) (int64, int64, e
return 3, 2, nil
}
// GetProxyMetrics returns canned proxy/cluster counts so the
// generateProperties test can assert the BYOP signals end-to-end.
func (mockDatasource) GetProxyMetrics(_ context.Context) (store.ProxyMetrics, error) {
return store.ProxyMetrics{
Clusters: 3,
ClustersBYOP: 1,
ClustersPrivate: 1,
Proxies: 4,
ProxiesConnected: 2,
}, nil
}
// TestGenerateProperties tests and validate the properties generation by using the mockDatasource for the Worker.generateProperties
func TestGenerateProperties(t *testing.T) {
ds := mockDatasource{}
@@ -393,17 +416,17 @@ func TestGenerateProperties(t *testing.T) {
t.Errorf("expected 3 embedded_idp_count, got %v", properties["embedded_idp_count"])
}
if properties["services"] != 2 {
t.Errorf("expected 2 services, got %v", properties["services"])
if properties["services"] != 3 {
t.Errorf("expected 3 services, got %v", properties["services"])
}
if properties["services_enabled"] != 1 {
t.Errorf("expected 1 services_enabled, got %v", properties["services_enabled"])
if properties["services_enabled"] != 2 {
t.Errorf("expected 2 services_enabled, got %v", properties["services_enabled"])
}
if properties["services_targets"] != 3 {
t.Errorf("expected 3 services_targets, got %v", properties["services_targets"])
if properties["services_targets"] != 4 {
t.Errorf("expected 4 services_targets, got %v", properties["services_targets"])
}
if properties["services_status_active"] != 1 {
t.Errorf("expected 1 services_status_active, got %v", properties["services_status_active"])
if properties["services_status_active"] != 2 {
t.Errorf("expected 2 services_status_active, got %v", properties["services_status_active"])
}
if properties["services_status_pending"] != 1 {
t.Errorf("expected 1 services_status_pending, got %v", properties["services_status_pending"])
@@ -420,6 +443,9 @@ func TestGenerateProperties(t *testing.T) {
if properties["services_target_type_domain"] != 1 {
t.Errorf("expected 1 services_target_type_domain, got %v", properties["services_target_type_domain"])
}
if properties["services_target_type_cluster"] != 1 {
t.Errorf("expected 1 services_target_type_cluster, got %v", properties["services_target_type_cluster"])
}
if properties["services_auth_password"] != 1 {
t.Errorf("expected 1 services_auth_password, got %v", properties["services_auth_password"])
}
@@ -429,6 +455,33 @@ func TestGenerateProperties(t *testing.T) {
if properties["services_auth_pin"] != 0 {
t.Errorf("expected 0 services_auth_pin, got %v", properties["services_auth_pin"])
}
if properties["services_private"] != 1 {
t.Errorf("expected 1 services_private, got %v", properties["services_private"])
}
if properties["services_private_with_access_groups"] != 1 {
t.Errorf("expected 1 services_private_with_access_groups, got %v", properties["services_private_with_access_groups"])
}
if properties["services_private_access_groups_sum"] != 2 {
t.Errorf("expected 2 services_private_access_groups_sum, got %v", properties["services_private_access_groups_sum"])
}
if properties["services_with_direct_upstream"] != 2 {
t.Errorf("expected 2 services_with_direct_upstream, got %v", properties["services_with_direct_upstream"])
}
if properties["proxy_clusters"] != int64(3) {
t.Errorf("expected 3 proxy_clusters, got %v", properties["proxy_clusters"])
}
if properties["proxy_clusters_byop"] != int64(1) {
t.Errorf("expected 1 proxy_clusters_byop, got %v", properties["proxy_clusters_byop"])
}
if properties["proxy_clusters_private"] != int64(1) {
t.Errorf("expected 1 proxy_clusters_private, got %v", properties["proxy_clusters_private"])
}
if properties["proxies"] != int64(4) {
t.Errorf("expected 4 proxies, got %v", properties["proxies"])
}
if properties["proxies_connected"] != int64(2) {
t.Errorf("expected 2 proxies_connected, got %v", properties["proxies_connected"])
}
if properties["custom_domains"] != int64(3) {
t.Errorf("expected 3 custom_domains, got %v", properties["custom_domains"])
}

View File

@@ -274,3 +274,9 @@ func (s *FileStore) SetFieldEncrypt(_ *crypt.FieldEncrypt) {
func (s *FileStore) GetCustomDomainsCounts(_ context.Context) (int64, int64, error) {
return 0, 0, nil
}
// GetProxyMetrics is a no-op for FileStore — proxy/cluster state isn't
// persisted in the JSON file format.
func (s *FileStore) GetProxyMetrics(_ context.Context) (ProxyMetrics, error) {
return ProxyMetrics{}, nil
}

View File

@@ -1090,6 +1090,38 @@ func (s *SqlStore) GetCustomDomainsCounts(ctx context.Context) (int64, int64, er
return total, validated, nil
}
// GetProxyMetrics aggregates per-cluster + per-proxy counts for the
// self-hosted telemetry payload. Single round-trip via conditional
// aggregations so a large proxies table doesn't fan out into multiple
// queries.
func (s *SqlStore) GetProxyMetrics(ctx context.Context) (ProxyMetrics, error) {
var m ProxyMetrics
activeCutoff := time.Now().Add(-proxyActiveThreshold)
// COUNT(DISTINCT ... CASE WHEN ...) is portable across sqlite/postgres
// (MySQL too) and keeps the round-trip to one. proxy.StatusConnected
// is the same string the cluster-capability queries use; the active
// window matches the cluster-capability semantics (only proxies
// heartbeating within ~2 * heartbeat interval count as connected).
row := s.db.WithContext(ctx).
Model(&proxy.Proxy{}).
Select(
"COUNT(DISTINCT cluster_address) AS clusters, "+
"COUNT(DISTINCT CASE WHEN account_id IS NOT NULL THEN cluster_address END) AS clusters_byop, "+
"COUNT(DISTINCT CASE WHEN private = ? THEN cluster_address END) AS clusters_private, "+
"COUNT(*) AS proxies, "+
"COUNT(CASE WHEN status = ? AND last_seen > ? THEN 1 END) AS proxies_connected",
true,
proxy.StatusConnected,
activeCutoff,
).
Row()
if err := row.Scan(&m.Clusters, &m.ClustersBYOP, &m.ClustersPrivate, &m.Proxies, &m.ProxiesConnected); err != nil {
return ProxyMetrics{}, fmt.Errorf("scan proxy metrics: %w", err)
}
return m, nil
}
func (s *SqlStore) GetAllAccounts(ctx context.Context) (all []*types.Account) {
var accounts []types.Account
result := s.db.Find(&accounts)

View File

@@ -321,9 +321,38 @@ type Store interface {
GetCustomDomainsCounts(ctx context.Context) (total int64, validated int64, err error)
// GetProxyMetrics returns aggregated proxy / cluster counts for the
// self-hosted metrics worker. Self-hosted only — file-based stores
// return a zero-valued struct.
GetProxyMetrics(ctx context.Context) (ProxyMetrics, error)
GetRoutingPeerNetworks(ctx context.Context, accountID, peerID string) ([]string, error)
}
// ProxyMetrics aggregates self-hosted proxy + cluster usage signals
// surfaced to the telemetry payload. Each field is best-effort: when a
// store cannot answer (e.g. FileStore) all fields are zero.
type ProxyMetrics struct {
// Clusters counts distinct cluster_address values across the proxies
// table — every cluster the management server has heard from, online or not.
Clusters int64
// ClustersBYOP counts distinct cluster_address values that are owned
// by an account (account_id IS NOT NULL). These are bring-your-own-proxy
// installations as opposed to NetBird-operated shared clusters.
ClustersBYOP int64
// ClustersPrivate counts distinct cluster_address values where at
// least one proxy reported the private capability (embedded
// `netbird proxy` running inside a client).
ClustersPrivate int64
// Proxies is the total number of proxy rows currently persisted.
Proxies int64
// ProxiesConnected is the subset of proxies whose status is
// "connected" AND last_seen falls within the active heartbeat window
// (~2 * heartbeat interval). Proxies the controller hasn't pruned
// yet but that are visibly stale don't count.
ProxiesConnected int64
}
const (
postgresDsnEnv = "NB_STORE_ENGINE_POSTGRES_DSN"
postgresDsnEnvLegacy = "NETBIRD_STORE_ENGINE_POSTGRES_DSN"

View File

@@ -2090,6 +2090,21 @@ func (mr *MockStoreMockRecorder) GetProxyClusters(ctx, accountID interface{}) *g
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProxyClusters", reflect.TypeOf((*MockStore)(nil).GetProxyClusters), ctx, accountID)
}
// GetProxyMetrics mocks base method.
func (m *MockStore) GetProxyMetrics(ctx context.Context) (ProxyMetrics, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "GetProxyMetrics", ctx)
ret0, _ := ret[0].(ProxyMetrics)
ret1, _ := ret[1].(error)
return ret0, ret1
}
// GetProxyMetrics indicates an expected call of GetProxyMetrics.
func (mr *MockStoreMockRecorder) GetProxyMetrics(ctx interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProxyMetrics", reflect.TypeOf((*MockStore)(nil).GetProxyMetrics), ctx)
}
// GetResourceGroups mocks base method.
func (m *MockStore) GetResourceGroups(ctx context.Context, lockStrength LockingStrength, accountID, resourceID string) ([]*types2.Group, error) {
m.ctrl.T.Helper()