diff --git a/docs/collector.service.md b/docs/collector.service.md index f80d4215..232523c2 100644 --- a/docs/collector.service.md +++ b/docs/collector.service.md @@ -2,11 +2,9 @@ The service collector exposes metrics about Windows Services -||| --|- -Metric name prefix | `service` -Classes | [`Win32_Service`](https://msdn.microsoft.com/en-us/library/aa394418(v=vs.85).aspx) -Enabled by default? | Yes +The collector exists in 2 different versions. Version 1 uses WMI to query all services and is able to provide additional +information. Version 2 is a more efficient solution by directly connecting to the service manager, but is not able to +provide additional information like `run_as` or start configuration. ## Flags @@ -22,6 +20,19 @@ Example config win_exporter.yml for multiple services: `services-where: Name='SQ Uses API calls instead of WMI for performance optimization. **Note** the previous flag (`--collector.service.services-where`) won't have any effect on this mode. +### `--collector.service.v2` + +Version 2 of the service collector. It uses API calls for performance optimization. **Note** the previous flag (`--collector.service.services-where`) won't have any effect on this mode. +For additional performance reasons, it doesn't provide any additional information like `run_as` or start configuration. + +# collector V1 + +||| +-|- +Metric name prefix | `service` +Classes | [`Win32_Service`](https://msdn.microsoft.com/en-us/library/aa394418(v=vs.85).aspx) +Enabled by default? | Yes + ## Metrics Name | Description | Type | Labels @@ -91,6 +102,53 @@ Counts the number of Microsoft SQL Server/Agent Processes count(windows_service_state{exported_name=~"(sqlserveragent|mssqlserver)",state="running"}) ``` +# collector V2 + + +||| +-|- +Metric name prefix | `service` +Classes | none +Enabled by default? 
| No + + +## Metrics + +Name | Description | Type | Labels +-----|-------------|------|------- +`windows_service_state` | The state of the service, 1 if the current state, 0 otherwise | gauge | name, display_name, status + +### States + +A service can be in the following states: +- `stopped` +- `start pending` +- `stop pending` +- `running` +- `continue pending` +- `pause pending` +- `paused` +- `unknown` + +### Example metric + +``` +windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="continue pending"} 0 +windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="pause pending"} 0 +windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="paused"} 0 +windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="running"} 0 +windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="start pending"} 0 +windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="stop pending"} 0 +windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="stopped"} 1 +``` + +## Useful queries +Counts the number of Microsoft SQL Server/Agent Processes + +``` +count(windows_service_state{name=~"(sqlserveragent|mssqlserver)",status="running"}) +``` + ## Alerting examples **prometheus.rules** ```yaml @@ -100,7 +158,7 @@ groups: # Sends an alert when the 'sqlserveragent' service is not in the running state for 3 minutes. - alert: SQL Server Agent DOWN - expr: windows_service_state{instance="SQL",exported_name="sqlserveragent",state="running"} == 0 + expr: windows_service_state{instance="SQL",name="sqlserveragent",state="running"} == 0 for: 3m labels: severity: high @@ -110,7 +168,7 @@ groups: # Sends an alert when the 'mssqlserver' service is not in the running state for 3 minutes. 
- alert: SQL Server DOWN - expr: windows_service_state{instance="SQL",exported_name="mssqlserver",state="running"} == 0 + expr: windows_service_state{instance="SQL",name="mssqlserver",state="running"} == 0 for: 3m labels: severity: high diff --git a/pkg/collector/service/service.go b/pkg/collector/service/service.go index d123fa31..c0a42484 100644 --- a/pkg/collector/service/service.go +++ b/pkg/collector/service/service.go @@ -3,10 +3,12 @@ package service import ( + "errors" "fmt" "strconv" "strings" "syscall" + "unsafe" "github.com/alecthomas/kingpin/v2" "github.com/go-kit/log" @@ -23,16 +25,19 @@ const ( Name = "service" FlagServiceWhereClause = "collector.service.services-where" FlagServiceUseAPI = "collector.service.use-api" + FlagServiceCollectorV2 = "collector.service.v2" ) type Config struct { ServiceWhereClause string `yaml:"service_where_clause"` UseAPI bool `yaml:"use_api"` + V2 bool `yaml:"v2"` } var ConfigDefaults = Config{ ServiceWhereClause: "", UseAPI: false, + V2: false, } // A collector is a Prometheus collector for WMI Win32_Service metrics @@ -41,13 +46,13 @@ type collector struct { serviceWhereClause *string useAPI *bool + v2 *bool Information *prometheus.Desc State *prometheus.Desc StartMode *prometheus.Desc Status *prometheus.Desc - - queryWhereClause string + StateV2 *prometheus.Desc } func New(logger log.Logger, config *Config) types.Collector { @@ -73,6 +78,10 @@ func NewWithFlags(app *kingpin.Application) types.Collector { FlagServiceUseAPI, "Use API calls to collect service data instead of WMI. Flag 'collector.service.services-where' won't be effective.", ).Default(strconv.FormatBool(ConfigDefaults.UseAPI)).Bool(), + v2: app.Flag( + FlagServiceCollectorV2, + "Enable V2 service collector. 
This collector can query service states much more efficiently, but can't provide general service information.", + ).Default(strconv.FormatBool(ConfigDefaults.V2)).Bool(), } } @@ -120,25 +129,37 @@ func (c *collector) Build() error { []string{"name", "status"}, nil, ) - c.queryWhereClause = *c.serviceWhereClause + c.StateV2 = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "state"), + "The state of the service (State)", + []string{"name", "display_name", "status"}, + nil, + ) + return nil } // Collect sends the metric values for each metric // to the provided prometheus Metric channel. func (c *collector) Collect(_ *types.ScrapeContext, ch chan<- prometheus.Metric) error { - if *c.useAPI { - if err := c.collectAPI(ch); err != nil { + var err error + + switch { + case *c.useAPI: + if err = c.collectAPI(ch); err != nil { _ = level.Error(c.logger).Log("msg", "failed collecting API service metrics:", "err", err) - return err } - } else { - if err := c.collectWMI(ch); err != nil { + case *c.v2: + if err = c.collectAPIV2(ch); err != nil { + _ = level.Error(c.logger).Log("msg", "failed collecting API service metrics:", "err", err) + } + default: + if err = c.collectWMI(ch); err != nil { _ = level.Error(c.logger).Log("msg", "failed collecting WMI service metrics:", "err", err) - return err } } - return nil + + return err } // Win32_Service docs: @@ -164,7 +185,7 @@ var ( "paused", "unknown", } - apiStateValues = map[uint]string{ + apiStateValues = map[uint32]string{ windows.SERVICE_CONTINUE_PENDING: "continue pending", windows.SERVICE_PAUSE_PENDING: "pause pending", windows.SERVICE_PAUSED: "paused", @@ -205,7 +226,7 @@ var ( func (c *collector) collectWMI(ch chan<- prometheus.Metric) error { var dst []Win32_Service - q := wmi.QueryAllWhere(&dst, c.queryWhereClause, c.logger) + q := wmi.QueryAllWhere(&dst, *c.serviceWhereClause, c.logger) if err := wmi.Query(q, &dst); err != nil { return err } @@ -333,7 +354,7 @@ func (c *collector) collectAPI(ch chan<- 
prometheus.Metric) error { for _, state := range apiStateValues { isCurrentState := 0.0 - if state == apiStateValues[uint(serviceStatus.State)] { + if state == apiStateValues[uint32(serviceStatus.State)] { isCurrentState = 1.0 } ch <- prometheus.MustNewConstMetric( @@ -362,3 +383,92 @@ func (c *collector) collectAPI(ch chan<- prometheus.Metric) error { } return nil } + +func (c *collector) collectAPIV2(ch chan<- prometheus.Metric) error { + services, err := c.queryAllServiceStates() + if err != nil { + _ = level.Warn(c.logger).Log("msg", "Failed to query services", "err", err) + return err + } + + if services == nil { + _ = level.Warn(c.logger).Log("msg", "No services queried") + return nil + } + + var isCurrentState float64 + + for _, svc := range services { + for state, stateValue := range apiStateValues { + isCurrentState = 0.0 + if state == svc.ServiceStatusProcess.CurrentState { + isCurrentState = 1.0 + } + + ch <- prometheus.MustNewConstMetric( + c.StateV2, + prometheus.GaugeValue, + isCurrentState, + windows.UTF16PtrToString(svc.ServiceName), + windows.UTF16PtrToString(svc.DisplayName), + stateValue, + ) + } + } + + return nil +} + +// queryAllServiceStates returns all service states of the current Windows system. +// This is done by querying the Service Control Manager directly. +// +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. +// +// Source: https://github.com/DataDog/datadog-agent/blob/afbd8b6c87939c92610c654cb07fdfd439e4fb27/pkg/util/winutil/scmmonitor.go#L61-L96 +func (c *collector) queryAllServiceStates() ([]windows.ENUM_SERVICE_STATUS_PROCESS, error) { + // EnumServicesStatusEx requires only SC_MANAGER_ENUMERATE_SERVICE. 
+ h, err := windows.OpenSCManager(nil, nil, windows.SC_MANAGER_ENUMERATE_SERVICE) + if err != nil { + return nil, fmt.Errorf("failed to open scm: %w", err) + } + + m := &mgr.Mgr{Handle: h} + defer func() { + if err := m.Disconnect(); err != nil { + _ = level.Warn(c.logger).Log("msg", "Failed to disconnect from scm", "err", err) + } + }() + + var bytesNeeded, servicesReturned uint32 + var buf []byte + for { + var p *byte + if len(buf) > 0 { + p = &buf[0] + } + err = windows.EnumServicesStatusEx(m.Handle, windows.SC_ENUM_PROCESS_INFO, + windows.SERVICE_WIN32, windows.SERVICE_STATE_ALL, + p, uint32(len(buf)), &bytesNeeded, &servicesReturned, nil, nil) + if err == nil { + break + } + if !errors.Is(err, windows.ERROR_MORE_DATA) { + return nil, fmt.Errorf("failed to enum services: %w", err) + } + if bytesNeeded <= uint32(len(buf)) { + return nil, err + } + buf = make([]byte, bytesNeeded) + } + + if servicesReturned == 0 { + return nil, nil + } + + services := unsafe.Slice((*windows.ENUM_SERVICE_STATUS_PROCESS)(unsafe.Pointer(&buf[0])), servicesReturned) + + return services, nil +}