diff --git a/README.md b/README.md index 827db57b..863b527e 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Name | Description | Enabled by default [ad](docs/collector.ad.md) | Active Directory Domain Services | [cpu](docs/collector.cpu.md) | CPU usage | ✓ [cs](docs/collector.cs.md) | "Computer System" metrics (system properties, num cpus/total memory) | ✓ +[container](docs/collector.container.md) | Container metrics | [dns](docs/collector.dns.md) | DNS Server | [hyperv](docs/collector.hyperv.md) | Hyper-V hosts | [iis](docs/collector.iis.md) | IIS sites and applications | @@ -33,6 +34,7 @@ Name | Description | Enabled by default [service](docs/collector.service.md) | Service state metrics | ✓ [system](docs/collector.system.md) | System calls | ✓ [tcp](docs/collector.tcp.md) | TCP connections | +[thermalzone](docs/collector.thermalzone.md) | Thermal information | [textfile](docs/collector.textfile.md) | Read prometheus metrics from a text file | ✓ [vmware](docs/collector.vmware.md) | Performance counters installed by the Vmware Guest agent | diff --git a/collector/mssql.go b/collector/mssql.go index 15dbf4e6..834884c6 100644 --- a/collector/mssql.go +++ b/collector/mssql.go @@ -2575,7 +2575,7 @@ func (c *MSSQLCollector) collectDatabaseReplica(ch chan<- prometheus.Metric, sql ch <- prometheus.MustNewConstMetric( c.DBReplicaTransactionDelay, prometheus.GaugeValue, - float64(v.TransactionDelay)*1000.0, + float64(v.TransactionDelay)/1000.0, sqlInstance, replicaName, ) } diff --git a/collector/thermalzone.go b/collector/thermalzone.go new file mode 100644 index 00000000..634de333 --- /dev/null +++ b/collector/thermalzone.go @@ -0,0 +1,103 @@ +package collector + +import ( + "github.com/StackExchange/wmi" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" +) + +func init() { + Factories["thermalzone"] = NewThermalZoneCollector +} + +// A ThermalZoneCollector is a Prometheus collector for WMI 
Win32_PerfRawData_Counters_ThermalZoneInformation metrics +type ThermalZoneCollector struct { + PercentPassiveLimit *prometheus.Desc + Temperature *prometheus.Desc + ThrottleReasons *prometheus.Desc +} + +// NewThermalZoneCollector returns a ThermalZoneCollector whose three descriptors (temperature_celsius, percent_passive_limit, throttle_reasons) are built under the "thermalzone" subsystem with a single "name" label; the returned error is always nil. +func NewThermalZoneCollector() (Collector, error) { + const subsystem = "thermalzone" + return &ThermalZoneCollector{ + Temperature: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "temperature_celsius"), + "(Temperature)", + []string{ + "name", + }, + nil, + ), + PercentPassiveLimit: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "percent_passive_limit"), + "(PercentPassiveLimit)", + []string{ + "name", + }, + nil, + ), + ThrottleReasons: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, subsystem, "throttle_reasons"), + "(ThrottleReasons)", + []string{ + "name", + }, + nil, + ), + }, nil +} + +// Collect sends the metric values for each metric +// to the provided prometheus Metric channel; if collect fails, the error is logged and returned to the caller. +func (c *ThermalZoneCollector) Collect(ch chan<- prometheus.Metric) error { + if desc, err := c.collect(ch); err != nil { + log.Error("failed collecting thermalzone metrics:", desc, err) + return err + } + return nil +} + +// Win32_PerfRawData_Counters_ThermalZoneInformation is the row type that collect passes to wmi.Query; class docs: +// https://wutils.com/wmi/root/cimv2/win32_perfrawdata_counters_thermalzoneinformation/ +type Win32_PerfRawData_Counters_ThermalZoneInformation struct { + Name string + + HighPrecisionTemperature uint32 + PercentPassiveLimit uint32 + ThrottleReasons uint32 +} + +func (c *ThermalZoneCollector) collect(ch chan<- prometheus.Metric) (*prometheus.Desc, error) { + var dst []Win32_PerfRawData_Counters_ThermalZoneInformation + q := queryAll(&dst) + if err := wmi.Query(q, &dst); err != nil { + return nil, err + } + + for _, info := range dst { + // HighPrecisionTemperature is in decikelvin: divide by 10 and subtract 273.15 to convert to degrees Celsius + ch <- prometheus.MustNewConstMetric( + c.Temperature, + prometheus.GaugeValue, + 
(float64(info.HighPrecisionTemperature)/10.0)-273.15, + info.Name, + ) + + ch <- prometheus.MustNewConstMetric( + c.PercentPassiveLimit, + prometheus.GaugeValue, + float64(info.PercentPassiveLimit), + info.Name, + ) + + ch <- prometheus.MustNewConstMetric( + c.ThrottleReasons, + prometheus.GaugeValue, + float64(info.ThrottleReasons), + info.Name, + ) + } + + return nil, nil +} diff --git a/docs/collector.service.md b/docs/collector.service.md index 0af8c381..15aaa3ce 100644 --- a/docs/collector.service.md +++ b/docs/collector.service.md @@ -66,10 +66,42 @@ A service can have any of the following statuses: Note that there is some overlap with service state. ### Example metric -_This collector does not yet have explained examples, we would appreciate your help adding them!_ +Lists the services that have a 'disabled' start mode. +``` +wmi_service_start_mode{exported_name=~"(mssqlserver|sqlserveragent)",start_mode="disabled"} +``` ## Useful queries -_This collector does not yet have any useful queries added, we would appreciate your help adding them!_ +Counts the number of Microsoft SQL Server/Agent Processes +``` +count(wmi_service_state{exported_name=~"(sqlserveragent|mssqlserver)",state="running"}) +``` ## Alerting examples -_This collector does not yet have alerting examples, we would appreciate your help adding them!_ +**prometheus.rules** +``` +groups: +- name: Microsoft SQL Server Alerts + rules: + + # Sends an alert when the 'sqlserveragent' service is not in the running state for 3 minutes. + - alert: SQL Server Agent DOWN + expr: wmi_service_state{instance="SQL",exported_name="sqlserveragent",state="running"} == 0 + for: 3m + labels: + severity: high + annotations: + summary: "Service {{ $labels.exported_name }} down" + description: "Service {{ $labels.exported_name }} on instance {{ $labels.instance }} has been down for more than 3 minutes." + + # Sends an alert when the 'mssqlserver' service is not in the running state for 3 minutes. 
+ - alert: SQL Server DOWN + expr: wmi_service_state{instance="SQL",exported_name="mssqlserver",state="running"} == 0 + for: 3m + labels: + severity: high + annotations: + summary: "Service {{ $labels.exported_name }} down" + description: "Service {{ $labels.exported_name }} on instance {{ $labels.instance }} has been down for more than 3 minutes." +``` +In this example, `instance` is the target label of the host. So each alert will be processed per host, which is then used in the alert description. diff --git a/docs/collector.thermalzone.md b/docs/collector.thermalzone.md new file mode 100644 index 00000000..71f36c75 --- /dev/null +++ b/docs/collector.thermalzone.md @@ -0,0 +1,32 @@ +# thermalzone collector + +The thermalzone collector exposes metrics about system temperatures. Note that temperature is given in degrees Celsius. + +||| +-|- +Metric name prefix | `thermalzone` +Classes | [`Win32_PerfRawData_Counters_ThermalZoneInformation`](https://wutils.com/wmi/root/cimv2/win32_perfrawdata_counters_thermalzoneinformation/#temperature_properties) +Enabled by default? | No + +## Flags + +None + +## Metrics + +Name | Description | Type | Labels +-----|-------------|------|------- +`wmi_thermalzone_percent_passive_limit` | % Passive Limit is the current limit this thermal zone is placing on the devices it controls. A limit of 100% indicates the devices are unconstrained. A limit of 0% indicates the devices are fully constrained. | gauge | `name` +`wmi_thermalzone_temperature_celsius` | Temperature of the thermal zone, in degrees Celsius. | gauge | `name` +`wmi_thermalzone_throttle_reasons` | Throttle Reasons indicate reasons why the thermal zone is limiting performance of the devices it controls. 0x0 – The zone is not throttled. 0x1 – The zone is throttled for thermal reasons. 0x2 – The zone is throttled to limit electrical current. 
| gauge | `name` + +[`Throttle reasons` source](https://docs.microsoft.com/en-us/windows-hardware/design/device-experiences/examples--requirements-and-diagnostics) + +### Example metric +_This collector does not yet have explained examples, we would appreciate your help adding them!_ + +## Useful queries +_This collector does not yet have any useful queries added, we would appreciate your help adding them!_ + +## Alerting examples +_This collector does not yet have alerting examples, we would appreciate your help adding them!_