mirror of
https://github.com/netbirdio/netbird.git
synced 2026-05-31 04:59:54 +00:00
feat(metrics): add private-service + BYOP signals to self-hosted telemetry
Private-service adoption signals (per-account aggregation, layered on
the existing services loop):
- services_private — services with Private=true
- services_private_with_access_groups — Private services with ≥1 access group
- services_private_access_groups_sum — total access groups across Private services
- services_with_direct_upstream — services whose targets use direct_upstream
- services_target_type_cluster — automatic via existing target_type loop
BYOP / proxy-cluster signals (system-wide via a new store method):
- proxy_clusters — distinct cluster_address across the proxies table
- proxy_clusters_byop — distinct cluster_address with account_id != NULL (BYOP)
- proxy_clusters_private — distinct cluster_address where any proxy reported
the private capability (embedded `netbird proxy`)
- proxies — total proxy rows
- proxies_connected — connected proxies within the active heartbeat window
Implementation:
- store.ProxyMetrics struct + Store.GetProxyMetrics interface method
- SqlStore.GetProxyMetrics: single COUNT(DISTINCT ... CASE WHEN ...)
round-trip, portable across sqlite/postgres/mysql. Uses the same
proxyActiveThreshold the cluster-capability queries already honor.
- FileStore.GetProxyMetrics: returns zero-valued struct (proxy state
isn't persisted in JSON file format).
- store_mock.go regenerated via go generate.
- metrics.DataSource gains GetProxyMetrics; Worker.generateProperties
collects + emits the new properties.
- selfhosted_test.go fixtures extended with a private service (1 cluster
target with direct_upstream, 2 access groups) and a host-target with
direct_upstream to exercise both gates; mockDatasource.GetProxyMetrics
returns canned counts so the new assertions verify end-to-end wiring.
This commit is contained in:
@@ -17,6 +17,7 @@ import (
|
||||
rpservice "github.com/netbirdio/netbird/management/internals/modules/reverseproxy/service"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/netbirdio/netbird/management/server/store"
|
||||
"github.com/netbirdio/netbird/management/server/types"
|
||||
nbversion "github.com/netbirdio/netbird/version"
|
||||
)
|
||||
@@ -53,6 +54,7 @@ type DataSource interface {
|
||||
GetAllAccounts(ctx context.Context) []*types.Account
|
||||
GetStoreEngine() types.Engine
|
||||
GetCustomDomainsCounts(ctx context.Context) (total int64, validated int64, err error)
|
||||
GetProxyMetrics(ctx context.Context) (store.ProxyMetrics, error)
|
||||
}
|
||||
|
||||
// ConnManager peer connection manager that holds state for current active connections
|
||||
@@ -223,6 +225,12 @@ func (w *Worker) generateProperties(ctx context.Context) properties {
|
||||
servicesAuthPassword int
|
||||
servicesAuthPin int
|
||||
servicesAuthOIDC int
|
||||
// Private-service signals — track adoption of NetBird-only mode
|
||||
// (services backed by an embedded proxy peer + access groups).
|
||||
servicesPrivate int
|
||||
servicesPrivateWithGroups int
|
||||
servicesPrivateAccessGroupsSum int
|
||||
servicesWithDirectUpstream int
|
||||
)
|
||||
start := time.Now()
|
||||
metricsProperties := make(properties)
|
||||
@@ -380,9 +388,31 @@ func (w *Worker) generateProperties(ctx context.Context) properties {
|
||||
if service.Auth.BearerAuth != nil && service.Auth.BearerAuth.Enabled {
|
||||
servicesAuthOIDC++
|
||||
}
|
||||
|
||||
if service.Private {
|
||||
servicesPrivate++
|
||||
if len(service.AccessGroups) > 0 {
|
||||
servicesPrivateWithGroups++
|
||||
}
|
||||
servicesPrivateAccessGroupsSum += len(service.AccessGroups)
|
||||
}
|
||||
|
||||
for _, target := range service.Targets {
|
||||
if target.Options.DirectUpstream {
|
||||
servicesWithDirectUpstream++
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Proxy / BYOP cluster signals come from the proxies table aggregated
|
||||
// across all accounts in a single store query; nil on FileStore.
|
||||
proxyMetrics, err := w.dataSource.GetProxyMetrics(ctx)
|
||||
if err != nil {
|
||||
log.WithContext(ctx).Debugf("collect proxy metrics: %v", err)
|
||||
}
|
||||
|
||||
minActivePeerVersion, maxActivePeerVersion := getMinMaxVersion(peerActiveVersions)
|
||||
metricsProperties["uptime"] = uptime
|
||||
metricsProperties["accounts"] = accounts
|
||||
@@ -430,6 +460,15 @@ func (w *Worker) generateProperties(ctx context.Context) properties {
|
||||
metricsProperties["services_auth_password"] = servicesAuthPassword
|
||||
metricsProperties["services_auth_pin"] = servicesAuthPin
|
||||
metricsProperties["services_auth_oidc"] = servicesAuthOIDC
|
||||
metricsProperties["services_private"] = servicesPrivate
|
||||
metricsProperties["services_private_with_access_groups"] = servicesPrivateWithGroups
|
||||
metricsProperties["services_private_access_groups_sum"] = servicesPrivateAccessGroupsSum
|
||||
metricsProperties["services_with_direct_upstream"] = servicesWithDirectUpstream
|
||||
metricsProperties["proxy_clusters"] = proxyMetrics.Clusters
|
||||
metricsProperties["proxy_clusters_byop"] = proxyMetrics.ClustersBYOP
|
||||
metricsProperties["proxy_clusters_private"] = proxyMetrics.ClustersPrivate
|
||||
metricsProperties["proxies"] = proxyMetrics.Proxies
|
||||
metricsProperties["proxies_connected"] = proxyMetrics.ProxiesConnected
|
||||
metricsProperties["custom_domains"] = customDomains
|
||||
metricsProperties["custom_domains_validated"] = customDomainsValidated
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
networkTypes "github.com/netbirdio/netbird/management/server/networks/types"
|
||||
nbpeer "github.com/netbirdio/netbird/management/server/peer"
|
||||
"github.com/netbirdio/netbird/management/server/posture"
|
||||
"github.com/netbirdio/netbird/management/server/store"
|
||||
"github.com/netbirdio/netbird/management/server/types"
|
||||
"github.com/netbirdio/netbird/route"
|
||||
)
|
||||
@@ -123,7 +124,7 @@ func (mockDatasource) GetAllAccounts(_ context.Context) []*types.Account {
|
||||
Enabled: true,
|
||||
Targets: []*rpservice.Target{
|
||||
{TargetType: "peer"},
|
||||
{TargetType: "host"},
|
||||
{TargetType: "host", Options: rpservice.TargetOptions{DirectUpstream: true}},
|
||||
},
|
||||
Auth: rpservice.AuthConfig{
|
||||
PasswordAuth: &rpservice.PasswordAuthConfig{Enabled: true},
|
||||
@@ -141,6 +142,16 @@ func (mockDatasource) GetAllAccounts(_ context.Context) []*types.Account {
|
||||
},
|
||||
Meta: rpservice.Meta{Status: string(rpservice.StatusPending)},
|
||||
},
|
||||
{
|
||||
ID: "svc3-private",
|
||||
Enabled: true,
|
||||
Private: true,
|
||||
AccessGroups: []string{"grp-eng", "grp-ops"},
|
||||
Targets: []*rpservice.Target{
|
||||
{TargetType: "cluster", Options: rpservice.TargetOptions{DirectUpstream: true}},
|
||||
},
|
||||
Meta: rpservice.Meta{Status: string(rpservice.StatusActive)},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -254,6 +265,18 @@ func (mockDatasource) GetCustomDomainsCounts(_ context.Context) (int64, int64, e
|
||||
return 3, 2, nil
|
||||
}
|
||||
|
||||
// GetProxyMetrics returns canned proxy/cluster counts so the
|
||||
// generateProperties test can assert the BYOP signals end-to-end.
|
||||
func (mockDatasource) GetProxyMetrics(_ context.Context) (store.ProxyMetrics, error) {
|
||||
return store.ProxyMetrics{
|
||||
Clusters: 3,
|
||||
ClustersBYOP: 1,
|
||||
ClustersPrivate: 1,
|
||||
Proxies: 4,
|
||||
ProxiesConnected: 2,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// TestGenerateProperties tests and validate the properties generation by using the mockDatasource for the Worker.generateProperties
|
||||
func TestGenerateProperties(t *testing.T) {
|
||||
ds := mockDatasource{}
|
||||
@@ -393,17 +416,17 @@ func TestGenerateProperties(t *testing.T) {
|
||||
t.Errorf("expected 3 embedded_idp_count, got %v", properties["embedded_idp_count"])
|
||||
}
|
||||
|
||||
if properties["services"] != 2 {
|
||||
t.Errorf("expected 2 services, got %v", properties["services"])
|
||||
if properties["services"] != 3 {
|
||||
t.Errorf("expected 3 services, got %v", properties["services"])
|
||||
}
|
||||
if properties["services_enabled"] != 1 {
|
||||
t.Errorf("expected 1 services_enabled, got %v", properties["services_enabled"])
|
||||
if properties["services_enabled"] != 2 {
|
||||
t.Errorf("expected 2 services_enabled, got %v", properties["services_enabled"])
|
||||
}
|
||||
if properties["services_targets"] != 3 {
|
||||
t.Errorf("expected 3 services_targets, got %v", properties["services_targets"])
|
||||
if properties["services_targets"] != 4 {
|
||||
t.Errorf("expected 4 services_targets, got %v", properties["services_targets"])
|
||||
}
|
||||
if properties["services_status_active"] != 1 {
|
||||
t.Errorf("expected 1 services_status_active, got %v", properties["services_status_active"])
|
||||
if properties["services_status_active"] != 2 {
|
||||
t.Errorf("expected 2 services_status_active, got %v", properties["services_status_active"])
|
||||
}
|
||||
if properties["services_status_pending"] != 1 {
|
||||
t.Errorf("expected 1 services_status_pending, got %v", properties["services_status_pending"])
|
||||
@@ -420,6 +443,9 @@ func TestGenerateProperties(t *testing.T) {
|
||||
if properties["services_target_type_domain"] != 1 {
|
||||
t.Errorf("expected 1 services_target_type_domain, got %v", properties["services_target_type_domain"])
|
||||
}
|
||||
if properties["services_target_type_cluster"] != 1 {
|
||||
t.Errorf("expected 1 services_target_type_cluster, got %v", properties["services_target_type_cluster"])
|
||||
}
|
||||
if properties["services_auth_password"] != 1 {
|
||||
t.Errorf("expected 1 services_auth_password, got %v", properties["services_auth_password"])
|
||||
}
|
||||
@@ -429,6 +455,33 @@ func TestGenerateProperties(t *testing.T) {
|
||||
if properties["services_auth_pin"] != 0 {
|
||||
t.Errorf("expected 0 services_auth_pin, got %v", properties["services_auth_pin"])
|
||||
}
|
||||
if properties["services_private"] != 1 {
|
||||
t.Errorf("expected 1 services_private, got %v", properties["services_private"])
|
||||
}
|
||||
if properties["services_private_with_access_groups"] != 1 {
|
||||
t.Errorf("expected 1 services_private_with_access_groups, got %v", properties["services_private_with_access_groups"])
|
||||
}
|
||||
if properties["services_private_access_groups_sum"] != 2 {
|
||||
t.Errorf("expected 2 services_private_access_groups_sum, got %v", properties["services_private_access_groups_sum"])
|
||||
}
|
||||
if properties["services_with_direct_upstream"] != 2 {
|
||||
t.Errorf("expected 2 services_with_direct_upstream, got %v", properties["services_with_direct_upstream"])
|
||||
}
|
||||
if properties["proxy_clusters"] != int64(3) {
|
||||
t.Errorf("expected 3 proxy_clusters, got %v", properties["proxy_clusters"])
|
||||
}
|
||||
if properties["proxy_clusters_byop"] != int64(1) {
|
||||
t.Errorf("expected 1 proxy_clusters_byop, got %v", properties["proxy_clusters_byop"])
|
||||
}
|
||||
if properties["proxy_clusters_private"] != int64(1) {
|
||||
t.Errorf("expected 1 proxy_clusters_private, got %v", properties["proxy_clusters_private"])
|
||||
}
|
||||
if properties["proxies"] != int64(4) {
|
||||
t.Errorf("expected 4 proxies, got %v", properties["proxies"])
|
||||
}
|
||||
if properties["proxies_connected"] != int64(2) {
|
||||
t.Errorf("expected 2 proxies_connected, got %v", properties["proxies_connected"])
|
||||
}
|
||||
if properties["custom_domains"] != int64(3) {
|
||||
t.Errorf("expected 3 custom_domains, got %v", properties["custom_domains"])
|
||||
}
|
||||
|
||||
@@ -274,3 +274,9 @@ func (s *FileStore) SetFieldEncrypt(_ *crypt.FieldEncrypt) {
|
||||
func (s *FileStore) GetCustomDomainsCounts(_ context.Context) (int64, int64, error) {
|
||||
return 0, 0, nil
|
||||
}
|
||||
|
||||
// GetProxyMetrics is a no-op for FileStore — proxy/cluster state isn't
|
||||
// persisted in the JSON file format.
|
||||
func (s *FileStore) GetProxyMetrics(_ context.Context) (ProxyMetrics, error) {
|
||||
return ProxyMetrics{}, nil
|
||||
}
|
||||
|
||||
@@ -1090,6 +1090,38 @@ func (s *SqlStore) GetCustomDomainsCounts(ctx context.Context) (int64, int64, er
|
||||
return total, validated, nil
|
||||
}
|
||||
|
||||
// GetProxyMetrics aggregates per-cluster + per-proxy counts for the
|
||||
// self-hosted telemetry payload. Single round-trip via conditional
|
||||
// aggregations so a large proxies table doesn't fan out into multiple
|
||||
// queries.
|
||||
func (s *SqlStore) GetProxyMetrics(ctx context.Context) (ProxyMetrics, error) {
|
||||
var m ProxyMetrics
|
||||
activeCutoff := time.Now().Add(-proxyActiveThreshold)
|
||||
|
||||
// COUNT(DISTINCT ... CASE WHEN ...) is portable across sqlite/postgres
|
||||
// (MySQL too) and keeps the round-trip to one. proxy.StatusConnected
|
||||
// is the same string the cluster-capability queries use; the active
|
||||
// window matches the cluster-capability semantics (only proxies
|
||||
// heartbeating within ~2 * heartbeat interval count as connected).
|
||||
row := s.db.WithContext(ctx).
|
||||
Model(&proxy.Proxy{}).
|
||||
Select(
|
||||
"COUNT(DISTINCT cluster_address) AS clusters, "+
|
||||
"COUNT(DISTINCT CASE WHEN account_id IS NOT NULL THEN cluster_address END) AS clusters_byop, "+
|
||||
"COUNT(DISTINCT CASE WHEN private = ? THEN cluster_address END) AS clusters_private, "+
|
||||
"COUNT(*) AS proxies, "+
|
||||
"COUNT(CASE WHEN status = ? AND last_seen > ? THEN 1 END) AS proxies_connected",
|
||||
true,
|
||||
proxy.StatusConnected,
|
||||
activeCutoff,
|
||||
).
|
||||
Row()
|
||||
if err := row.Scan(&m.Clusters, &m.ClustersBYOP, &m.ClustersPrivate, &m.Proxies, &m.ProxiesConnected); err != nil {
|
||||
return ProxyMetrics{}, fmt.Errorf("scan proxy metrics: %w", err)
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (s *SqlStore) GetAllAccounts(ctx context.Context) (all []*types.Account) {
|
||||
var accounts []types.Account
|
||||
result := s.db.Find(&accounts)
|
||||
|
||||
@@ -321,9 +321,38 @@ type Store interface {
|
||||
|
||||
GetCustomDomainsCounts(ctx context.Context) (total int64, validated int64, err error)
|
||||
|
||||
// GetProxyMetrics returns aggregated proxy / cluster counts for the
|
||||
// self-hosted metrics worker. Self-hosted only — file-based stores
|
||||
// return a zero-valued struct.
|
||||
GetProxyMetrics(ctx context.Context) (ProxyMetrics, error)
|
||||
|
||||
GetRoutingPeerNetworks(ctx context.Context, accountID, peerID string) ([]string, error)
|
||||
}
|
||||
|
||||
// ProxyMetrics aggregates self-hosted proxy + cluster usage signals
|
||||
// surfaced to the telemetry payload. Each field is best-effort: when a
|
||||
// store cannot answer (e.g. FileStore) all fields are zero.
|
||||
type ProxyMetrics struct {
|
||||
// Clusters counts distinct cluster_address values across the proxies
|
||||
// table — every cluster the management server has heard from, online or not.
|
||||
Clusters int64
|
||||
// ClustersBYOP counts distinct cluster_address values that are owned
|
||||
// by an account (account_id IS NOT NULL). These are bring-your-own-proxy
|
||||
// installations as opposed to NetBird-operated shared clusters.
|
||||
ClustersBYOP int64
|
||||
// ClustersPrivate counts distinct cluster_address values where at
|
||||
// least one proxy reported the private capability (embedded
|
||||
// `netbird proxy` running inside a client).
|
||||
ClustersPrivate int64
|
||||
// Proxies is the total number of proxy rows currently persisted.
|
||||
Proxies int64
|
||||
// ProxiesConnected is the subset of proxies whose status is
|
||||
// "connected" AND last_seen falls within the active heartbeat window
|
||||
// (~2 * heartbeat interval). Proxies the controller hasn't pruned
|
||||
// yet but that are visibly stale don't count.
|
||||
ProxiesConnected int64
|
||||
}
|
||||
|
||||
const (
|
||||
postgresDsnEnv = "NB_STORE_ENGINE_POSTGRES_DSN"
|
||||
postgresDsnEnvLegacy = "NETBIRD_STORE_ENGINE_POSTGRES_DSN"
|
||||
|
||||
@@ -2090,6 +2090,21 @@ func (mr *MockStoreMockRecorder) GetProxyClusters(ctx, accountID interface{}) *g
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProxyClusters", reflect.TypeOf((*MockStore)(nil).GetProxyClusters), ctx, accountID)
|
||||
}
|
||||
|
||||
// GetProxyMetrics mocks base method.
|
||||
func (m *MockStore) GetProxyMetrics(ctx context.Context) (ProxyMetrics, error) {
|
||||
m.ctrl.T.Helper()
|
||||
ret := m.ctrl.Call(m, "GetProxyMetrics", ctx)
|
||||
ret0, _ := ret[0].(ProxyMetrics)
|
||||
ret1, _ := ret[1].(error)
|
||||
return ret0, ret1
|
||||
}
|
||||
|
||||
// GetProxyMetrics indicates an expected call of GetProxyMetrics.
|
||||
func (mr *MockStoreMockRecorder) GetProxyMetrics(ctx interface{}) *gomock.Call {
|
||||
mr.mock.ctrl.T.Helper()
|
||||
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProxyMetrics", reflect.TypeOf((*MockStore)(nil).GetProxyMetrics), ctx)
|
||||
}
|
||||
|
||||
// GetResourceGroups mocks base method.
|
||||
func (m *MockStore) GetResourceGroups(ctx context.Context, lockStrength LockingStrength, accountID, resourceID string) ([]*types2.Group, error) {
|
||||
m.ctrl.T.Helper()
|
||||
|
||||
Reference in New Issue
Block a user