mirror of
https://github.com/prometheus-community/windows_exporter.git
synced 2026-02-24 13:46:36 +00:00
Service V2 collector (#1497)
This commit is contained in:
@@ -2,11 +2,9 @@
|
|||||||
|
|
||||||
The service collector exposes metrics about Windows Services
|
The service collector exposes metrics about Windows Services
|
||||||
|
|
||||||
|||
|
The collector exists in 2 different versions. Version 1 uses WMI to query all services and is able to provide additional
|
||||||
-|-
|
information. Version 2 is a more efficient solution by directly connecting to the service manager, but is not able to
|
||||||
Metric name prefix | `service`
|
provide additional information like `run_as` or start configuration.
|
||||||
Classes | [`Win32_Service`](https://msdn.microsoft.com/en-us/library/aa394418(v=vs.85).aspx)
|
|
||||||
Enabled by default? | Yes
|
|
||||||
|
|
||||||
## Flags
|
## Flags
|
||||||
|
|
||||||
@@ -22,6 +20,19 @@ Example config win_exporter.yml for multiple services: `services-where: Name='SQ
|
|||||||
|
|
||||||
Uses API calls instead of WMI for performance optimization. **Note** the previous flag (`--collector.service.services-where`) won't have any effect on this mode.
|
Uses API calls instead of WMI for performance optimization. **Note** the previous flag (`--collector.service.services-where`) won't have any effect on this mode.
|
||||||
|
|
||||||
|
### `--collector.service.v2`
|
||||||
|
|
||||||
|
Version 2 of the service collector. It uses API calls for performance optimization. **Note** the `--collector.service.services-where` flag won't have any effect on this mode.
|
||||||
|
For performance reasons, it doesn't provide additional information like `run_as` or start configuration.
|
||||||
|
|
||||||
|
# collector V1
|
||||||
|
|
||||||
|
|||
|
||||||
|
-|-
|
||||||
|
Metric name prefix | `service`
|
||||||
|
Classes | [`Win32_Service`](https://msdn.microsoft.com/en-us/library/aa394418(v=vs.85).aspx)
|
||||||
|
Enabled by default? | Yes
|
||||||
|
|
||||||
## Metrics
|
## Metrics
|
||||||
|
|
||||||
Name | Description | Type | Labels
|
Name | Description | Type | Labels
|
||||||
@@ -91,6 +102,53 @@ Counts the number of Microsoft SQL Server/Agent Processes
|
|||||||
count(windows_service_state{exported_name=~"(sqlserveragent|mssqlserver)",state="running"})
|
count(windows_service_state{exported_name=~"(sqlserveragent|mssqlserver)",state="running"})
|
||||||
```
|
```
|
||||||
|
|
||||||
|
# collector V2
|
||||||
|
|
||||||
|
|
||||||
|
|||
|
||||||
|
-|-
|
||||||
|
Metric name prefix | `service`
|
||||||
|
Classes | none
|
||||||
|
Enabled by default? | No
|
||||||
|
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
Name | Description | Type | Labels
|
||||||
|
-----|-------------|------|-------
|
||||||
|
`windows_service_state` | The state of the service, 1 if the current state, 0 otherwise | gauge | name, display_name, status
|
||||||
|
|
||||||
|
### States
|
||||||
|
|
||||||
|
A service can be in the following states:
|
||||||
|
- `stopped`
|
||||||
|
- `start pending`
|
||||||
|
- `stop pending`
|
||||||
|
- `running`
|
||||||
|
- `continue pending`
|
||||||
|
- `pause pending`
|
||||||
|
- `paused`
|
||||||
|
- `unknown`
|
||||||
|
|
||||||
|
### Example metric
|
||||||
|
|
||||||
|
```
|
||||||
|
windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="continue pending"} 0
|
||||||
|
windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="pause pending"} 0
|
||||||
|
windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="paused"} 0
|
||||||
|
windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="running"} 0
|
||||||
|
windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="start pending"} 0
|
||||||
|
windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="stop pending"} 0
|
||||||
|
windows_service_state{display_name="Declared Configuration(DC) service",name="dcsvc",status="stopped"} 1
|
||||||
|
```
|
||||||
|
|
||||||
|
## Useful queries
|
||||||
|
Counts the number of Microsoft SQL Server/Agent Processes
|
||||||
|
|
||||||
|
```
|
||||||
|
count(windows_service_state{name=~"(sqlserveragent|mssqlserver)",status="running"})
|
||||||
|
```
|
||||||
|
|
||||||
## Alerting examples
|
## Alerting examples
|
||||||
**prometheus.rules**
|
**prometheus.rules**
|
||||||
```yaml
|
```yaml
|
||||||
@@ -100,7 +158,7 @@ groups:
|
|||||||
|
|
||||||
# Sends an alert when the 'sqlserveragent' service is not in the running state for 3 minutes.
|
# Sends an alert when the 'sqlserveragent' service is not in the running state for 3 minutes.
|
||||||
- alert: SQL Server Agent DOWN
|
- alert: SQL Server Agent DOWN
|
||||||
expr: windows_service_state{instance="SQL",exported_name="sqlserveragent",state="running"} == 0
|
expr: windows_service_state{instance="SQL",name="sqlserveragent",state="running"} == 0
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: high
|
severity: high
|
||||||
@@ -110,7 +168,7 @@ groups:
|
|||||||
|
|
||||||
# Sends an alert when the 'mssqlserver' service is not in the running state for 3 minutes.
|
# Sends an alert when the 'mssqlserver' service is not in the running state for 3 minutes.
|
||||||
- alert: SQL Server DOWN
|
- alert: SQL Server DOWN
|
||||||
expr: windows_service_state{instance="SQL",exported_name="mssqlserver",state="running"} == 0
|
expr: windows_service_state{instance="SQL",name="mssqlserver",state="running"} == 0
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: high
|
severity: high
|
||||||
|
|||||||
@@ -3,10 +3,12 @@
|
|||||||
package service
|
package service
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
"unsafe"
|
||||||
|
|
||||||
"github.com/alecthomas/kingpin/v2"
|
"github.com/alecthomas/kingpin/v2"
|
||||||
"github.com/go-kit/log"
|
"github.com/go-kit/log"
|
||||||
@@ -23,16 +25,19 @@ const (
|
|||||||
Name = "service"
|
Name = "service"
|
||||||
FlagServiceWhereClause = "collector.service.services-where"
|
FlagServiceWhereClause = "collector.service.services-where"
|
||||||
FlagServiceUseAPI = "collector.service.use-api"
|
FlagServiceUseAPI = "collector.service.use-api"
|
||||||
|
FlagServiceCollectorV2 = "collector.service.v2"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
ServiceWhereClause string `yaml:"service_where_clause"`
|
ServiceWhereClause string `yaml:"service_where_clause"`
|
||||||
UseAPI bool `yaml:"use_api"`
|
UseAPI bool `yaml:"use_api"`
|
||||||
|
V2 bool `yaml:"v2"`
|
||||||
}
|
}
|
||||||
|
|
||||||
var ConfigDefaults = Config{
|
var ConfigDefaults = Config{
|
||||||
ServiceWhereClause: "",
|
ServiceWhereClause: "",
|
||||||
UseAPI: false,
|
UseAPI: false,
|
||||||
|
V2: false,
|
||||||
}
|
}
|
||||||
|
|
||||||
// A collector is a Prometheus collector for WMI Win32_Service metrics
|
// A collector is a Prometheus collector for WMI Win32_Service metrics
|
||||||
@@ -41,13 +46,13 @@ type collector struct {
|
|||||||
|
|
||||||
serviceWhereClause *string
|
serviceWhereClause *string
|
||||||
useAPI *bool
|
useAPI *bool
|
||||||
|
v2 *bool
|
||||||
|
|
||||||
Information *prometheus.Desc
|
Information *prometheus.Desc
|
||||||
State *prometheus.Desc
|
State *prometheus.Desc
|
||||||
StartMode *prometheus.Desc
|
StartMode *prometheus.Desc
|
||||||
Status *prometheus.Desc
|
Status *prometheus.Desc
|
||||||
|
StateV2 *prometheus.Desc
|
||||||
queryWhereClause string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func New(logger log.Logger, config *Config) types.Collector {
|
func New(logger log.Logger, config *Config) types.Collector {
|
||||||
@@ -73,6 +78,10 @@ func NewWithFlags(app *kingpin.Application) types.Collector {
|
|||||||
FlagServiceUseAPI,
|
FlagServiceUseAPI,
|
||||||
"Use API calls to collect service data instead of WMI. Flag 'collector.service.services-where' won't be effective.",
|
"Use API calls to collect service data instead of WMI. Flag 'collector.service.services-where' won't be effective.",
|
||||||
).Default(strconv.FormatBool(ConfigDefaults.UseAPI)).Bool(),
|
).Default(strconv.FormatBool(ConfigDefaults.UseAPI)).Bool(),
|
||||||
|
v2: app.Flag(
|
||||||
|
FlagServiceCollectorV2,
|
||||||
|
"Enable V2 service collector. This collector can services state much more efficiently, can't provide general service information.",
|
||||||
|
).Default(strconv.FormatBool(ConfigDefaults.V2)).Bool(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,25 +129,37 @@ func (c *collector) Build() error {
|
|||||||
[]string{"name", "status"},
|
[]string{"name", "status"},
|
||||||
nil,
|
nil,
|
||||||
)
|
)
|
||||||
c.queryWhereClause = *c.serviceWhereClause
|
c.StateV2 = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(types.Namespace, Name, "state"),
|
||||||
|
"The state of the service (State)",
|
||||||
|
[]string{"name", "display_name", "status"},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect sends the metric values for each metric
|
// Collect sends the metric values for each metric
|
||||||
// to the provided prometheus Metric channel.
|
// to the provided prometheus Metric channel.
|
||||||
func (c *collector) Collect(_ *types.ScrapeContext, ch chan<- prometheus.Metric) error {
|
func (c *collector) Collect(_ *types.ScrapeContext, ch chan<- prometheus.Metric) error {
|
||||||
if *c.useAPI {
|
var err error
|
||||||
if err := c.collectAPI(ch); err != nil {
|
|
||||||
|
switch {
|
||||||
|
case *c.useAPI:
|
||||||
|
if err = c.collectAPI(ch); err != nil {
|
||||||
_ = level.Error(c.logger).Log("msg", "failed collecting API service metrics:", "err", err)
|
_ = level.Error(c.logger).Log("msg", "failed collecting API service metrics:", "err", err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
} else {
|
case *c.v2:
|
||||||
if err := c.collectWMI(ch); err != nil {
|
if err = c.collectAPIV2(ch); err != nil {
|
||||||
|
_ = level.Error(c.logger).Log("msg", "failed collecting API service metrics:", "err", err)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
if err = c.collectWMI(ch); err != nil {
|
||||||
_ = level.Error(c.logger).Log("msg", "failed collecting WMI service metrics:", "err", err)
|
_ = level.Error(c.logger).Log("msg", "failed collecting WMI service metrics:", "err", err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
// Win32_Service docs:
|
// Win32_Service docs:
|
||||||
@@ -164,7 +185,7 @@ var (
|
|||||||
"paused",
|
"paused",
|
||||||
"unknown",
|
"unknown",
|
||||||
}
|
}
|
||||||
apiStateValues = map[uint]string{
|
apiStateValues = map[uint32]string{
|
||||||
windows.SERVICE_CONTINUE_PENDING: "continue pending",
|
windows.SERVICE_CONTINUE_PENDING: "continue pending",
|
||||||
windows.SERVICE_PAUSE_PENDING: "pause pending",
|
windows.SERVICE_PAUSE_PENDING: "pause pending",
|
||||||
windows.SERVICE_PAUSED: "paused",
|
windows.SERVICE_PAUSED: "paused",
|
||||||
@@ -205,7 +226,7 @@ var (
|
|||||||
|
|
||||||
func (c *collector) collectWMI(ch chan<- prometheus.Metric) error {
|
func (c *collector) collectWMI(ch chan<- prometheus.Metric) error {
|
||||||
var dst []Win32_Service
|
var dst []Win32_Service
|
||||||
q := wmi.QueryAllWhere(&dst, c.queryWhereClause, c.logger)
|
q := wmi.QueryAllWhere(&dst, *c.serviceWhereClause, c.logger)
|
||||||
if err := wmi.Query(q, &dst); err != nil {
|
if err := wmi.Query(q, &dst); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@@ -333,7 +354,7 @@ func (c *collector) collectAPI(ch chan<- prometheus.Metric) error {
|
|||||||
|
|
||||||
for _, state := range apiStateValues {
|
for _, state := range apiStateValues {
|
||||||
isCurrentState := 0.0
|
isCurrentState := 0.0
|
||||||
if state == apiStateValues[uint(serviceStatus.State)] {
|
if state == apiStateValues[uint32(serviceStatus.State)] {
|
||||||
isCurrentState = 1.0
|
isCurrentState = 1.0
|
||||||
}
|
}
|
||||||
ch <- prometheus.MustNewConstMetric(
|
ch <- prometheus.MustNewConstMetric(
|
||||||
@@ -362,3 +383,92 @@ func (c *collector) collectAPI(ch chan<- prometheus.Metric) error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *collector) collectAPIV2(ch chan<- prometheus.Metric) error {
|
||||||
|
services, err := c.queryAllServiceStates()
|
||||||
|
if err != nil {
|
||||||
|
_ = level.Warn(c.logger).Log("msg", "Failed to query services", "err", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if services == nil {
|
||||||
|
_ = level.Warn(c.logger).Log("msg", "No services queried")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var isCurrentState float64
|
||||||
|
|
||||||
|
for _, svc := range services {
|
||||||
|
for state, stateValue := range apiStateValues {
|
||||||
|
isCurrentState = 0.0
|
||||||
|
if state == svc.ServiceStatusProcess.CurrentState {
|
||||||
|
isCurrentState = 1.0
|
||||||
|
}
|
||||||
|
|
||||||
|
ch <- prometheus.MustNewConstMetric(
|
||||||
|
c.StateV2,
|
||||||
|
prometheus.GaugeValue,
|
||||||
|
isCurrentState,
|
||||||
|
windows.UTF16PtrToString(svc.ServiceName),
|
||||||
|
windows.UTF16PtrToString(svc.DisplayName),
|
||||||
|
stateValue,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// queryAllServiceStates returns all service states of the current Windows system
|
||||||
|
// This is realized by ask Service Manager directly.
|
||||||
|
//
|
||||||
|
// Unless explicitly stated otherwise all files in this repository are licensed
|
||||||
|
// under the Apache License Version 2.0.
|
||||||
|
// This product includes software developed at Datadog (https://www.datadoghq.com/).
|
||||||
|
// Copyright 2016-present Datadog, Inc.
|
||||||
|
//
|
||||||
|
// Source: https://github.com/DataDog/datadog-agent/blob/afbd8b6c87939c92610c654cb07fdfd439e4fb27/pkg/util/winutil/scmmonitor.go#L61-L96
|
||||||
|
func (c *collector) queryAllServiceStates() ([]windows.ENUM_SERVICE_STATUS_PROCESS, error) {
|
||||||
|
// EnumServiceStatusEx requires only SC_MANAGER_ENUM_SERVICE.
|
||||||
|
h, err := windows.OpenSCManager(nil, nil, windows.SC_MANAGER_ENUMERATE_SERVICE)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to open scm: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
m := &mgr.Mgr{Handle: h}
|
||||||
|
defer func() {
|
||||||
|
if err := m.Disconnect(); err != nil {
|
||||||
|
_ = level.Warn(c.logger).Log("msg", "Failed to disconnect from scm", "err", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
var bytesNeeded, servicesReturned uint32
|
||||||
|
var buf []byte
|
||||||
|
for {
|
||||||
|
var p *byte
|
||||||
|
if len(buf) > 0 {
|
||||||
|
p = &buf[0]
|
||||||
|
}
|
||||||
|
err = windows.EnumServicesStatusEx(m.Handle, windows.SC_ENUM_PROCESS_INFO,
|
||||||
|
windows.SERVICE_WIN32, windows.SERVICE_STATE_ALL,
|
||||||
|
p, uint32(len(buf)), &bytesNeeded, &servicesReturned, nil, nil)
|
||||||
|
if err == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if !errors.Is(err, windows.ERROR_MORE_DATA) {
|
||||||
|
return nil, fmt.Errorf("failed to enum services %w", err)
|
||||||
|
}
|
||||||
|
if bytesNeeded <= uint32(len(buf)) {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
buf = make([]byte, bytesNeeded)
|
||||||
|
}
|
||||||
|
|
||||||
|
if servicesReturned == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
services := unsafe.Slice((*windows.ENUM_SERVICE_STATUS_PROCESS)(unsafe.Pointer(&buf[0])), servicesReturned)
|
||||||
|
|
||||||
|
return services, nil
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user