Merge remote-tracking branch 'origin/master' into perflib-cpu

This commit is contained in:
Calle Pettersson
2019-08-03 15:06:30 +02:00
5 changed files with 173 additions and 4 deletions

View File

@@ -12,6 +12,7 @@ Name | Description | Enabled by default
[ad](docs/collector.ad.md) | Active Directory Domain Services |
[cpu](docs/collector.cpu.md) | CPU usage | ✓
[cs](docs/collector.cs.md) | "Computer System" metrics (system properties, num cpus/total memory) | ✓
[container](docs/collector.container.md) | Container metrics |
[dns](docs/collector.dns.md) | DNS Server |
[hyperv](docs/collector.hyperv.md) | Hyper-V hosts |
[iis](docs/collector.iis.md) | IIS sites and applications |
@@ -33,6 +34,7 @@ Name | Description | Enabled by default
[service](docs/collector.service.md) | Service state metrics | ✓
[system](docs/collector.system.md) | System calls | ✓
[tcp](docs/collector.tcp.md) | TCP connections |
[thermalzone](docs/collector.thermalzone.md) | Thermal information
[textfile](docs/collector.textfile.md) | Read prometheus metrics from a text file | ✓
[vmware](docs/collector.vmware.md) | Performance counters installed by the Vmware Guest agent |

View File

@@ -2575,7 +2575,7 @@ func (c *MSSQLCollector) collectDatabaseReplica(ch chan<- prometheus.Metric, sql
ch <- prometheus.MustNewConstMetric(
c.DBReplicaTransactionDelay,
prometheus.GaugeValue,
float64(v.TransactionDelay)*1000.0,
float64(v.TransactionDelay)/1000.0,
sqlInstance, replicaName,
)
}

103
collector/thermalzone.go Normal file
View File

@@ -0,0 +1,103 @@
package collector
import (
"github.com/StackExchange/wmi"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
)
func init() {
Factories["thermalzone"] = NewThermalZoneCollector
}
// A ThermalZoneCollector is a Prometheus collector for WMI Win32_PerfRawData_Counters_ThermalZoneInformation metrics
type ThermalZoneCollector struct {
PercentPassiveLimit *prometheus.Desc
Temperature *prometheus.Desc
ThrottleReasons *prometheus.Desc
}
// NewThermalZoneCollector ...
func NewThermalZoneCollector() (Collector, error) {
const subsystem = "thermalzone"
return &ThermalZoneCollector{
Temperature: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "temperature_celsius"),
"(Temperature)",
[]string{
"name",
},
nil,
),
PercentPassiveLimit: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "percent_passive_limit"),
"(PercentPassiveLimit)",
[]string{
"name",
},
nil,
),
ThrottleReasons: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, subsystem, "throttle_reasons"),
"(ThrottleReasons)",
[]string{
"name",
},
nil,
),
}, nil
}
// Collect sends the metric values for each metric
// to the provided prometheus Metric channel.
func (c *ThermalZoneCollector) Collect(ch chan<- prometheus.Metric) error {
if desc, err := c.collect(ch); err != nil {
log.Error("failed collecting thermalzone metrics:", desc, err)
return err
}
return nil
}
// Win32_PerfRawData_Counters_ThermalZoneInformation docs:
// https://wutils.com/wmi/root/cimv2/win32_perfrawdata_counters_thermalzoneinformation/
type Win32_PerfRawData_Counters_ThermalZoneInformation struct {
Name string
HighPrecisionTemperature uint32
PercentPassiveLimit uint32
ThrottleReasons uint32
}
func (c *ThermalZoneCollector) collect(ch chan<- prometheus.Metric) (*prometheus.Desc, error) {
var dst []Win32_PerfRawData_Counters_ThermalZoneInformation
q := queryAll(&dst)
if err := wmi.Query(q, &dst); err != nil {
return nil, err
}
for _, info := range dst {
//Divide by 10 and subtract 273.15 to convert decikelvin to celsius
ch <- prometheus.MustNewConstMetric(
c.Temperature,
prometheus.GaugeValue,
(float64(info.HighPrecisionTemperature)/10.0)-273.15,
info.Name,
)
ch <- prometheus.MustNewConstMetric(
c.PercentPassiveLimit,
prometheus.GaugeValue,
float64(info.PercentPassiveLimit),
info.Name,
)
ch <- prometheus.MustNewConstMetric(
c.ThrottleReasons,
prometheus.GaugeValue,
float64(info.ThrottleReasons),
info.Name,
)
}
return nil, nil
}

View File

@@ -66,10 +66,42 @@ A service can have any of the following statuses:
Note that there is some overlap with service state.
### Example metric
_This collector does not yet have explained examples, we would appreciate your help adding them!_
Lists the services that have a 'disabled' start mode.
```
wmi_service_start_mode{exported_name=~"(mssqlserver|sqlserveragent)",start_mode="disabled"}
```
## Useful queries
_This collector does not yet have any useful queries added, we would appreciate your help adding them!_
Counts the number of Microsoft SQL Server/Agent Processes
```
count(wmi_service_state{exported_name=~"(sqlserveragent|mssqlserver)",state="running"})
```
## Alerting examples
_This collector does not yet have alerting examples, we would appreciate your help adding them!_
**prometheus.rules**
```
groups:
- name: Microsoft SQL Server Alerts
rules:
# Sends an alert when the 'sqlserveragent' service is not in the running state for 3 minutes.
- alert: SQL Server Agent DOWN
expr: wmi_service_state{instance="SQL",exported_name="sqlserveragent",state="running"} == 0
for: 3m
labels:
severity: high
annotations:
summary: "Service {{ $labels.exported_name }} down"
description: "Service {{ $labels.exported_name }} on instance {{ $labels.instance }} has been down for more than 3 minutes."
# Sends an alert when the 'mssqlserver' service is not in the running state for 3 minutes.
- alert: SQL Server DOWN
expr: wmi_service_state{instance="SQL",exported_name="mssqlserver",state="running"} == 0
for: 3m
labels:
severity: high
annotations:
summary: "Service {{ $labels.exported_name }} down"
description: "Service {{ $labels.exported_name }} on instance {{ $labels.instance }} has been down for more than 3 minutes."
```
In this example, `instance` is the target label of the host. So each alert will be processed per host, which is then used in the alert description.

View File

@@ -0,0 +1,32 @@
# thermalzone collector
The thermalzone collector exposes metrics about system temps. Note that temperature is given in Kelvin
|||
-|-
Metric name prefix | `thermalzone`
Classes | [`Win32_PerfRawData_Counters_ThermalZoneInformation`](https://wutils.com/wmi/root/cimv2/win32_perfrawdata_counters_thermalzoneinformation/#temperature_properties)
Enabled by default? | No
## Flags
None
## Metrics
Name | Description | Type | Labels
-----|-------------|------|-------
`wmi_thermalzone_percent_passive_limit` | % Passive Limit is the current limit this thermal zone is placing on the devices it controls. A limit of 100% indicates the devices are unconstrained. A limit of 0% indicates the devices are fully constrained. | gauge | None
`wmi_thermalzone_temperature_celsius ` | Temperature of the thermal zone, in degrees Celsius. | gauge | None
`wmi_thermalzone_throttle_reasons ` | Throttle Reasons indicate reasons why the thermal zone is limiting performance of the devices it controls. 0x0 The zone is not throttled. 0x1 The zone is throttled for thermal reasons. 0x2 The zone is throttled to limit electrical current. | gauge | None
[`Throttle reasons` source](https://docs.microsoft.com/en-us/windows-hardware/design/device-experiences/examples--requirements-and-diagnostics)
### Example metric
_This collector does not yet have explained examples, we would appreciate your help adding them!_
## Useful queries
_This collector does not yet have any useful queries added, we would appreciate your help adding them!_
## Alerting examples
_This collector does not yet have alerting examples, we would appreciate your help adding them!_