From e951e516dead49609b83f638da8e945da16afbe3 Mon Sep 17 00:00:00 2001 From: Dominik Eisenberg <64131471+Dominik-esb@users.noreply.github.com> Date: Sun, 8 Feb 2026 13:46:02 +0100 Subject: [PATCH] docs: add alerting examples for CPU and CSV (#2317) Signed-off-by: EisenbergD Co-authored-by: EisenbergD --- docs/collector.cpu.md | 35 ++++++++++++++++++++++++++++++++++- docs/collector.mscluster.md | 21 +++++++++++++++++++-- docs/collector.os.md | 17 ++++++++++++++++- 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/docs/collector.cpu.md b/docs/collector.cpu.md index 9975ce18..07c7ba70 100644 --- a/docs/collector.cpu.md +++ b/docs/collector.cpu.md @@ -80,7 +80,36 @@ avg by(instance) ( ## Alerting examples -**prometheus.rules** +#### Average CPU utilization over 1 hour exceeds 80% (New CPU metric) +```yaml +# Alert on hosts with 1h avg CPU more than 80% +- alert: HighCPUUtilization + expr: | + avg_over_time( + ( + sum by (instance) ( + ( + rate(windows_cpu_processor_utility_total{}[1m]) + / + rate(windows_cpu_processor_rtc_total{}[1m]) + ) + ) / + count by (instance) ( + windows_cpu_processor_utility_total{} + ) + )[1h:] + ) > 80 + for: 1m + labels: + severity: warning + metric_name: CPUUtilization + annotations: + summary: "High CPU utilization on {{ $labels.instance }}" + description: | + CPU utilization on {{ $labels.instance }} has averaged more than 80% over the last hour (current value: {{ printf "%.2f" $value }}) +``` + +#### CPU usage exceeds 80% over a 10 minute period (Old CPU metric) ```yaml # Alert on hosts with more than 80% CPU usage over a 10 minute period - alert: CpuUsage @@ -91,6 +120,10 @@ avg by(instance) ( annotations: summary: "CPU Usage (instance {{ $labels.instance }})" description: "CPU Usage is more than 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" +``` + +#### CPU not using boost frequencies +```yaml # Alert on hosts which are not boosting their CPU frequencies - alert: NoCpuTurbo expr: | diff --git 
a/docs/collector.mscluster.md b/docs/collector.mscluster.md index bce4a268..2a254413 100644 --- a/docs/collector.mscluster.md +++ b/docs/collector.mscluster.md @@ -12,7 +12,7 @@ Enabled by default? | No ### `--collectors.mscluster.enabled` Comma-separated list of collectors to use, for example: -`--collectors.mscluster.enabled=cluster,network,node,resource,resouregroup`. +`--collectors.mscluster.enabled=cluster,network,node,resource,resourcegroup`. Matching is case-sensitive. ## Metrics @@ -183,4 +183,21 @@ count(windows_mscluster_resource_state{type="Network Name"}) ``` ## Alerting examples -_This collector does not yet have alerting examples, we would appreciate your help adding them!_ +#### Low free space on cluster shared volume +```yaml +# Alerts if volume has less than 20% free space +- alert: LowCSVFreeSpace + expr: | + ( + max by (name, cluster) (windows_mscluster_shared_volumes_free_bytes{name!="ClusterPerformanceHistory"}) + / + max by (name, cluster) (windows_mscluster_shared_volumes_total_bytes{name!="ClusterPerformanceHistory"}) + ) * 100 < 20 + for: 10m + labels: + severity: warning + annotations: + summary: "Low CSV free space on {{ $labels.name }}" + description: | + Cluster Shared Volume {{ $labels.name }} on cluster {{ $labels.cluster }} has less than 20% free space (current: {{ printf "%.2f" $value }}%) +``` diff --git a/docs/collector.os.md b/docs/collector.os.md index c172d2fa..b2641c6e 100644 --- a/docs/collector.os.md +++ b/docs/collector.os.md @@ -38,4 +38,19 @@ windows_os_install_time_timestamp_seconds 1.6725312e+09 _This collector does not yet have useful queries, we would appreciate your help adding them!_ ## Alerting examples -_This collector does not yet have alerting examples, we would appreciate your help adding them!_ + +#### Hyper-V host unreachable for more than 5 minutes +```yaml +# Alerts if Agent/Host is down for 5min +- alert: HypervHostDown + expr: up{app="hyper-v"} == 0 + for: 5m + labels: + severity: critical 
+ annotations: + summary: Hyper-V host {{ $labels.instance }} is down + description: | + Hyper-V host {{ $labels.instance }} has been unreachable for more than 5 minutes. + Job: {{ $labels.job }} +``` +