From 0fd5c59ce3906d96b3f2c4b2a8e6967d82605417 Mon Sep 17 00:00:00 2001 From: mlsmaycon Date: Thu, 21 May 2026 18:11:52 +0200 Subject: [PATCH] feat(metrics): add private-service + BYOP signals to self-hosted telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Private-service adoption signals (per-account aggregation, layered on the existing services loop): - services_private — services with Private=true - services_private_with_access_groups — Private services with ≥1 access group - services_private_access_groups_sum — total access groups across Private services - services_with_direct_upstream — services whose targets use direct_upstream - services_target_type_cluster — automatic via existing target_type loop BYOP / proxy-cluster signals (system-wide via a new store method): - proxy_clusters — distinct cluster_address across the proxies table - proxy_clusters_byop — distinct cluster_address with account_id != NULL (BYOP) - proxy_clusters_private — distinct cluster_address where any proxy reported the private capability (embedded `netbird proxy`) - proxies — total proxy rows - proxies_connected — connected proxies within the active heartbeat window Implementation: - store.ProxyMetrics struct + Store.GetProxyMetrics interface method - SqlStore.GetProxyMetrics: single COUNT(DISTINCT ... CASE WHEN ...) round-trip, portable across sqlite/postgres/mysql. Uses the same proxyActiveThreshold the cluster-capability queries already honor. - FileStore.GetProxyMetrics: returns zero-valued struct (proxy state isn't persisted in JSON file format). - store_mock.go regenerated via go generate. - metrics.DataSource gains GetProxyMetrics; Worker.generateProperties collects + emits the new properties. - selfhosted_test.go fixtures extended with a private service (1 cluster target with direct_upstream, 2 access groups) and a host-target with direct_upstream to exercise both gates; mockDatasource.GetProxyMetrics returns canned counts so the new assertions verify end-to-end wiring. --- management/server/metrics/selfhosted.go | 39 +++++++++++ management/server/metrics/selfhosted_test.go | 71 +++++++++++++++++--- management/server/store/file_store.go | 6 ++ management/server/store/sql_store.go | 32 +++++++++ management/server/store/store.go | 29 ++++++++ management/server/store/store_mock.go | 15 +++++ 6 files changed, 183 insertions(+), 9 deletions(-) diff --git a/management/server/metrics/selfhosted.go b/management/server/metrics/selfhosted.go index 8732cf89f..efe50c88f 100644 --- a/management/server/metrics/selfhosted.go +++ b/management/server/metrics/selfhosted.go @@ -17,6 +17,7 @@ import ( rpservice "github.com/netbirdio/netbird/management/internals/modules/reverseproxy/service" log "github.com/sirupsen/logrus" + "github.com/netbirdio/netbird/management/server/store" "github.com/netbirdio/netbird/management/server/types" nbversion "github.com/netbirdio/netbird/version" ) @@ -53,6 +54,7 @@ type DataSource interface { GetAllAccounts(ctx context.Context) []*types.Account GetStoreEngine() types.Engine GetCustomDomainsCounts(ctx context.Context) (total int64, validated int64, err error) + GetProxyMetrics(ctx context.Context) (store.ProxyMetrics, error) } // ConnManager peer connection manager that holds state for current active connections @@ -223,6 +225,12 @@ func (w *Worker) generateProperties(ctx context.Context) properties { servicesAuthPassword int servicesAuthPin int servicesAuthOIDC int + // Private-service signals — track adoption of NetBird-only mode + // (services backed by an embedded proxy peer + access groups). + servicesPrivate int + servicesPrivateWithGroups int + servicesPrivateAccessGroupsSum int + servicesWithDirectUpstream int ) start := time.Now() metricsProperties := make(properties) @@ -380,9 +388,31 @@ func (w *Worker) generateProperties(ctx context.Context) properties { if service.Auth.BearerAuth != nil && service.Auth.BearerAuth.Enabled { servicesAuthOIDC++ } + + if service.Private { + servicesPrivate++ + if len(service.AccessGroups) > 0 { + servicesPrivateWithGroups++ + } + servicesPrivateAccessGroupsSum += len(service.AccessGroups) + } + + for _, target := range service.Targets { + if target.Options.DirectUpstream { + servicesWithDirectUpstream++ + break + } + } } } + // Proxy / BYOP cluster signals come from the proxies table aggregated + // across all accounts in a single store query; nil on FileStore. + proxyMetrics, err := w.dataSource.GetProxyMetrics(ctx) + if err != nil { + log.WithContext(ctx).Debugf("collect proxy metrics: %v", err) + } + minActivePeerVersion, maxActivePeerVersion := getMinMaxVersion(peerActiveVersions) metricsProperties["uptime"] = uptime metricsProperties["accounts"] = accounts @@ -430,6 +460,15 @@ func (w *Worker) generateProperties(ctx context.Context) properties { metricsProperties["services_auth_password"] = servicesAuthPassword metricsProperties["services_auth_pin"] = servicesAuthPin metricsProperties["services_auth_oidc"] = servicesAuthOIDC + metricsProperties["services_private"] = servicesPrivate + metricsProperties["services_private_with_access_groups"] = servicesPrivateWithGroups + metricsProperties["services_private_access_groups_sum"] = servicesPrivateAccessGroupsSum + metricsProperties["services_with_direct_upstream"] = servicesWithDirectUpstream + metricsProperties["proxy_clusters"] = proxyMetrics.Clusters + metricsProperties["proxy_clusters_byop"] = proxyMetrics.ClustersBYOP + metricsProperties["proxy_clusters_private"] = proxyMetrics.ClustersPrivate + metricsProperties["proxies"] = proxyMetrics.Proxies + metricsProperties["proxies_connected"] = proxyMetrics.ProxiesConnected metricsProperties["custom_domains"] = customDomains metricsProperties["custom_domains_validated"] = customDomainsValidated diff --git a/management/server/metrics/selfhosted_test.go b/management/server/metrics/selfhosted_test.go index 78f5c53be..ca9e10262 100644 --- a/management/server/metrics/selfhosted_test.go +++ b/management/server/metrics/selfhosted_test.go @@ -12,6 +12,7 @@ import ( networkTypes "github.com/netbirdio/netbird/management/server/networks/types" nbpeer "github.com/netbirdio/netbird/management/server/peer" "github.com/netbirdio/netbird/management/server/posture" + "github.com/netbirdio/netbird/management/server/store" "github.com/netbirdio/netbird/management/server/types" "github.com/netbirdio/netbird/route" ) @@ -123,7 +124,7 @@ func (mockDatasource) GetAllAccounts(_ context.Context) []*types.Account { Enabled: true, Targets: []*rpservice.Target{ {TargetType: "peer"}, - {TargetType: "host"}, + {TargetType: "host", Options: rpservice.TargetOptions{DirectUpstream: true}}, }, Auth: rpservice.AuthConfig{ PasswordAuth: &rpservice.PasswordAuthConfig{Enabled: true}, @@ -141,6 +142,16 @@ func (mockDatasource) GetAllAccounts(_ context.Context) []*types.Account { }, Meta: rpservice.Meta{Status: string(rpservice.StatusPending)}, }, + { + ID: "svc3-private", + Enabled: true, + Private: true, + AccessGroups: []string{"grp-eng", "grp-ops"}, + Targets: []*rpservice.Target{ + {TargetType: "cluster", Options: rpservice.TargetOptions{DirectUpstream: true}}, + }, + Meta: rpservice.Meta{Status: string(rpservice.StatusActive)}, + }, }, }, { @@ -254,6 +265,18 @@ func (mockDatasource) GetCustomDomainsCounts(_ context.Context) (int64, int64, e return 3, 2, nil } +// GetProxyMetrics returns canned proxy/cluster counts so the +// generateProperties test can assert the BYOP signals end-to-end. +func (mockDatasource) GetProxyMetrics(_ context.Context) (store.ProxyMetrics, error) { + return store.ProxyMetrics{ + Clusters: 3, + ClustersBYOP: 1, + ClustersPrivate: 1, + Proxies: 4, + ProxiesConnected: 2, + }, nil +} + // TestGenerateProperties tests and validate the properties generation by using the mockDatasource for the Worker.generateProperties func TestGenerateProperties(t *testing.T) { ds := mockDatasource{} @@ -393,17 +416,17 @@ func TestGenerateProperties(t *testing.T) { t.Errorf("expected 3 embedded_idp_count, got %v", properties["embedded_idp_count"]) } - if properties["services"] != 2 { - t.Errorf("expected 2 services, got %v", properties["services"]) + if properties["services"] != 3 { + t.Errorf("expected 3 services, got %v", properties["services"]) } - if properties["services_enabled"] != 1 { - t.Errorf("expected 1 services_enabled, got %v", properties["services_enabled"]) + if properties["services_enabled"] != 2 { + t.Errorf("expected 2 services_enabled, got %v", properties["services_enabled"]) } - if properties["services_targets"] != 3 { - t.Errorf("expected 3 services_targets, got %v", properties["services_targets"]) + if properties["services_targets"] != 4 { + t.Errorf("expected 4 services_targets, got %v", properties["services_targets"]) } - if properties["services_status_active"] != 1 { - t.Errorf("expected 1 services_status_active, got %v", properties["services_status_active"]) + if properties["services_status_active"] != 2 { + t.Errorf("expected 2 services_status_active, got %v", properties["services_status_active"]) } if properties["services_status_pending"] != 1 { t.Errorf("expected 1 services_status_pending, got %v", properties["services_status_pending"]) @@ -420,6 +443,9 @@ func TestGenerateProperties(t *testing.T) { if properties["services_target_type_domain"] != 1 { t.Errorf("expected 1 services_target_type_domain, got %v", properties["services_target_type_domain"]) } + if properties["services_target_type_cluster"] != 1 { + t.Errorf("expected 1 services_target_type_cluster, got %v", properties["services_target_type_cluster"]) + } if properties["services_auth_password"] != 1 { t.Errorf("expected 1 services_auth_password, got %v", properties["services_auth_password"]) } @@ -429,6 +455,33 @@ func TestGenerateProperties(t *testing.T) { if properties["services_auth_pin"] != 0 { t.Errorf("expected 0 services_auth_pin, got %v", properties["services_auth_pin"]) } + if properties["services_private"] != 1 { + t.Errorf("expected 1 services_private, got %v", properties["services_private"]) + } + if properties["services_private_with_access_groups"] != 1 { + t.Errorf("expected 1 services_private_with_access_groups, got %v", properties["services_private_with_access_groups"]) + } + if properties["services_private_access_groups_sum"] != 2 { + t.Errorf("expected 2 services_private_access_groups_sum, got %v", properties["services_private_access_groups_sum"]) + } + if properties["services_with_direct_upstream"] != 2 { + t.Errorf("expected 2 services_with_direct_upstream, got %v", properties["services_with_direct_upstream"]) + } + if properties["proxy_clusters"] != int64(3) { + t.Errorf("expected 3 proxy_clusters, got %v", properties["proxy_clusters"]) + } + if properties["proxy_clusters_byop"] != int64(1) { + t.Errorf("expected 1 proxy_clusters_byop, got %v", properties["proxy_clusters_byop"]) + } + if properties["proxy_clusters_private"] != int64(1) { + t.Errorf("expected 1 proxy_clusters_private, got %v", properties["proxy_clusters_private"]) + } + if properties["proxies"] != int64(4) { + t.Errorf("expected 4 proxies, got %v", properties["proxies"]) + } + if properties["proxies_connected"] != int64(2) { + t.Errorf("expected 2 proxies_connected, got %v", properties["proxies_connected"]) + } if properties["custom_domains"] != int64(3) { t.Errorf("expected 3 custom_domains, got %v", properties["custom_domains"]) } diff --git a/management/server/store/file_store.go b/management/server/store/file_store.go index 81185b020..bcf563cd0 100644 --- a/management/server/store/file_store.go +++ b/management/server/store/file_store.go @@ -274,3 +274,9 @@ func (s *FileStore) SetFieldEncrypt(_ *crypt.FieldEncrypt) { func (s *FileStore) GetCustomDomainsCounts(_ context.Context) (int64, int64, error) { return 0, 0, nil } + +// GetProxyMetrics is a no-op for FileStore — proxy/cluster state isn't +// persisted in the JSON file format. +func (s *FileStore) GetProxyMetrics(_ context.Context) (ProxyMetrics, error) { + return ProxyMetrics{}, nil +} diff --git a/management/server/store/sql_store.go b/management/server/store/sql_store.go index 244bda0b5..10fc2fa2b 100644 --- a/management/server/store/sql_store.go +++ b/management/server/store/sql_store.go @@ -1090,6 +1090,38 @@ func (s *SqlStore) GetCustomDomainsCounts(ctx context.Context) (int64, int64, er return total, validated, nil } +// GetProxyMetrics aggregates per-cluster + per-proxy counts for the +// self-hosted telemetry payload. Single round-trip via conditional +// aggregations so a large proxies table doesn't fan out into multiple +// queries. +func (s *SqlStore) GetProxyMetrics(ctx context.Context) (ProxyMetrics, error) { + var m ProxyMetrics + activeCutoff := time.Now().Add(-proxyActiveThreshold) + + // COUNT(DISTINCT ... CASE WHEN ...) is portable across sqlite/postgres + // (MySQL too) and keeps the round-trip to one. proxy.StatusConnected + // is the same string the cluster-capability queries use; the active + // window matches the cluster-capability semantics (only proxies + // heartbeating within ~2 * heartbeat interval count as connected). + row := s.db.WithContext(ctx). + Model(&proxy.Proxy{}). + Select( + "COUNT(DISTINCT cluster_address) AS clusters, "+ + "COUNT(DISTINCT CASE WHEN account_id IS NOT NULL THEN cluster_address END) AS clusters_byop, "+ + "COUNT(DISTINCT CASE WHEN private = ? THEN cluster_address END) AS clusters_private, "+ + "COUNT(*) AS proxies, "+ + "COUNT(CASE WHEN status = ? AND last_seen > ? THEN 1 END) AS proxies_connected", + true, + proxy.StatusConnected, + activeCutoff, + ). + Row() + if err := row.Scan(&m.Clusters, &m.ClustersBYOP, &m.ClustersPrivate, &m.Proxies, &m.ProxiesConnected); err != nil { + return ProxyMetrics{}, fmt.Errorf("scan proxy metrics: %w", err) + } + return m, nil +} + func (s *SqlStore) GetAllAccounts(ctx context.Context) (all []*types.Account) { var accounts []types.Account result := s.db.Find(&accounts) diff --git a/management/server/store/store.go b/management/server/store/store.go index 3bfa77395..746207f27 100644 --- a/management/server/store/store.go +++ b/management/server/store/store.go @@ -321,9 +321,38 @@ type Store interface { GetCustomDomainsCounts(ctx context.Context) (total int64, validated int64, err error) + // GetProxyMetrics returns aggregated proxy / cluster counts for the + // self-hosted metrics worker. Self-hosted only — file-based stores + // return a zero-valued struct. + GetProxyMetrics(ctx context.Context) (ProxyMetrics, error) + GetRoutingPeerNetworks(ctx context.Context, accountID, peerID string) ([]string, error) } +// ProxyMetrics aggregates self-hosted proxy + cluster usage signals +// surfaced to the telemetry payload. Each field is best-effort: when a +// store cannot answer (e.g. FileStore) all fields are zero. +type ProxyMetrics struct { + // Clusters counts distinct cluster_address values across the proxies + // table — every cluster the management server has heard from, online or not. + Clusters int64 + // ClustersBYOP counts distinct cluster_address values that are owned + // by an account (account_id IS NOT NULL). These are bring-your-own-proxy + // installations as opposed to NetBird-operated shared clusters. + ClustersBYOP int64 + // ClustersPrivate counts distinct cluster_address values where at + // least one proxy reported the private capability (embedded + // `netbird proxy` running inside a client). + ClustersPrivate int64 + // Proxies is the total number of proxy rows currently persisted. + Proxies int64 + // ProxiesConnected is the subset of proxies whose status is + // "connected" AND last_seen falls within the active heartbeat window + // (~2 * heartbeat interval). Proxies the controller hasn't pruned + // yet but that are visibly stale don't count. + ProxiesConnected int64 +} + const ( postgresDsnEnv = "NB_STORE_ENGINE_POSTGRES_DSN" postgresDsnEnvLegacy = "NETBIRD_STORE_ENGINE_POSTGRES_DSN" diff --git a/management/server/store/store_mock.go b/management/server/store/store_mock.go index 05ab3562f..dfd5af78d 100644 --- a/management/server/store/store_mock.go +++ b/management/server/store/store_mock.go @@ -2090,6 +2090,21 @@ func (mr *MockStoreMockRecorder) GetProxyClusters(ctx, accountID interface{}) *g return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProxyClusters", reflect.TypeOf((*MockStore)(nil).GetProxyClusters), ctx, accountID) } +// GetProxyMetrics mocks base method. +func (m *MockStore) GetProxyMetrics(ctx context.Context) (ProxyMetrics, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetProxyMetrics", ctx) + ret0, _ := ret[0].(ProxyMetrics) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetProxyMetrics indicates an expected call of GetProxyMetrics. +func (mr *MockStoreMockRecorder) GetProxyMetrics(ctx interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProxyMetrics", reflect.TypeOf((*MockStore)(nil).GetProxyMetrics), ctx) +} + // GetResourceGroups mocks base method. func (m *MockStore) GetResourceGroups(ctx context.Context, lockStrength LockingStrength, accountID, resourceID string) ([]*types2.Group, error) { m.ctrl.T.Helper()