From dcf85032ca19ae8139ffbd69991c869e3e917b6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E8=8D=A3?= Date: Tue, 3 Jun 2025 01:45:18 +0800 Subject: [PATCH] gpu: add metrics collector and related types (#2052) (#2059) --- README.md | 97 +++---- docs/collector.gpu.md | 139 ++++++++++ internal/collector/gpu/gpu.go | 431 +++++++++++++++++++++++++++++ internal/collector/gpu/gpu_test.go | 33 +++ internal/collector/gpu/types.go | 55 ++++ internal/collector/gpu/utils.go | 84 ++++++ pkg/collector/collection.go | 2 + pkg/collector/config.go | 3 + pkg/collector/map.go | 2 + 9 files changed, 798 insertions(+), 48 deletions(-) create mode 100644 docs/collector.gpu.md create mode 100644 internal/collector/gpu/gpu.go create mode 100644 internal/collector/gpu/gpu_test.go create mode 100644 internal/collector/gpu/types.go create mode 100644 internal/collector/gpu/utils.go diff --git a/README.md b/README.md index 0f68cc8e..8beb2e09 100644 --- a/README.md +++ b/README.md @@ -12,54 +12,55 @@ A Prometheus exporter for Windows machines. 
## Collectors -Name | Description | Enabled by default ----------|-------------|-------------------- -[ad](docs/collector.ad.md) | Active Directory Domain Services | -[adcs](docs/collector.adcs.md) | Active Directory Certificate Services | -[adfs](docs/collector.adfs.md) | Active Directory Federation Services | -[cache](docs/collector.cache.md) | Cache metrics | -[cpu](docs/collector.cpu.md) | CPU usage | ✓ -[cpu_info](docs/collector.cpu_info.md) | CPU Information | -[cs](docs/collector.cs.md) | "Computer System" metrics (system properties, num cpus/total memory) | -[container](docs/collector.container.md) | Container metrics | -[diskdrive](docs/collector.diskdrive.md) | Diskdrive metrics | -[dfsr](docs/collector.dfsr.md) | DFSR metrics | -[dhcp](docs/collector.dhcp.md) | DHCP Server | -[dns](docs/collector.dns.md) | DNS Server | -[exchange](docs/collector.exchange.md) | Exchange metrics | -[filetime](docs/collector.filetime.md) | FileTime metrics | -[fsrmquota](docs/collector.fsrmquota.md) | Microsoft File Server Resource Manager (FSRM) Quotas collector | -[hyperv](docs/collector.hyperv.md) | Hyper-V hosts | -[iis](docs/collector.iis.md) | IIS sites and applications | -[license](docs/collector.license.md) | Windows license status | -[logical_disk](docs/collector.logical_disk.md) | Logical disks, disk I/O | ✓ -[memory](docs/collector.memory.md) | Memory usage metrics | ✓ -[mscluster](docs/collector.mscluster.md) | MSCluster metrics | -[msmq](docs/collector.msmq.md) | MSMQ queues | -[mssql](docs/collector.mssql.md) | [SQL Server Performance Objects](https://docs.microsoft.com/en-us/sql/relational-databases/performance-monitor/use-sql-server-objects#SQLServerPOs) metrics | -[netframework](docs/collector.netframework.md) | .NET Framework metrics | -[net](docs/collector.net.md) | Network interface I/O | ✓ -[os](docs/collector.os.md) | OS metrics (memory, processes, users) | ✓ -[pagefile](docs/collector.pagefile.md) | pagefile metrics | 
-[performancecounter](docs/collector.performancecounter.md) | Custom performance counter metrics | -[physical_disk](docs/collector.physical_disk.md) | physical disk metrics | ✓ -[printer](docs/collector.printer.md) | Printer metrics | -[process](docs/collector.process.md) | Per-process metrics | -[remote_fx](docs/collector.remote_fx.md) | RemoteFX protocol (RDP) metrics | -[scheduled_task](docs/collector.scheduled_task.md) | Scheduled Tasks metrics | -[service](docs/collector.service.md) | Service state metrics | ✓ -[smb](docs/collector.smb.md) | SMB Server | -[smbclient](docs/collector.smbclient.md) | SMB Client | -[smtp](docs/collector.smtp.md) | IIS SMTP Server | -[system](docs/collector.system.md) | System calls | ✓ -[tcp](docs/collector.tcp.md) | TCP connections | -[terminal_services](docs/collector.terminal_services.md) | Terminal services (RDS) -[textfile](docs/collector.textfile.md) | Read prometheus metrics from a text file | -[thermalzone](docs/collector.thermalzone.md) | Thermal information | -[time](docs/collector.time.md) | Windows Time Service | -[udp](docs/collector.udp.md) | UDP connections | -[update](docs/collector.update.md) | Windows Update Service | -[vmware](docs/collector.vmware.md) | Performance counters installed by the Vmware Guest agent | + Name | Description | Enabled by default +------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------- + [ad](docs/collector.ad.md) | Active Directory Domain Services | + [adcs](docs/collector.adcs.md) | Active Directory Certificate Services | + [adfs](docs/collector.adfs.md) | Active Directory Federation Services | + [cache](docs/collector.cache.md) | Cache metrics | + [cpu](docs/collector.cpu.md) | CPU usage | ✓ + [cpu_info](docs/collector.cpu_info.md) | CPU Information | + [cs](docs/collector.cs.md) | "Computer System" 
metrics (system properties, num cpus/total memory) | + [container](docs/collector.container.md) | Container metrics | + [diskdrive](docs/collector.diskdrive.md) | Diskdrive metrics | + [dfsr](docs/collector.dfsr.md) | DFSR metrics | + [dhcp](docs/collector.dhcp.md) | DHCP Server | + [dns](docs/collector.dns.md) | DNS Server | + [exchange](docs/collector.exchange.md) | Exchange metrics | + [filetime](docs/collector.filetime.md) | FileTime metrics | + [fsrmquota](docs/collector.fsrmquota.md) | Microsoft File Server Resource Manager (FSRM) Quotas collector | + [gpu](docs/collector.gpu.md) | GPU metrics | + [hyperv](docs/collector.hyperv.md) | Hyper-V hosts | + [iis](docs/collector.iis.md) | IIS sites and applications | + [license](docs/collector.license.md) | Windows license status | + [logical_disk](docs/collector.logical_disk.md) | Logical disks, disk I/O | ✓ + [memory](docs/collector.memory.md) | Memory usage metrics | ✓ + [mscluster](docs/collector.mscluster.md) | MSCluster metrics | + [msmq](docs/collector.msmq.md) | MSMQ queues | + [mssql](docs/collector.mssql.md) | [SQL Server Performance Objects](https://docs.microsoft.com/en-us/sql/relational-databases/performance-monitor/use-sql-server-objects#SQLServerPOs) metrics | + [netframework](docs/collector.netframework.md) | .NET Framework metrics | + [net](docs/collector.net.md) | Network interface I/O | ✓ + [os](docs/collector.os.md) | OS metrics (memory, processes, users) | ✓ + [pagefile](docs/collector.pagefile.md) | pagefile metrics | + [performancecounter](docs/collector.performancecounter.md) | Custom performance counter metrics | + [physical_disk](docs/collector.physical_disk.md) | physical disk metrics | ✓ + [printer](docs/collector.printer.md) | Printer metrics | + [process](docs/collector.process.md) | Per-process metrics | + [remote_fx](docs/collector.remote_fx.md) | RemoteFX protocol (RDP) metrics | + [scheduled_task](docs/collector.scheduled_task.md) | Scheduled Tasks metrics | + 
[service](docs/collector.service.md) | Service state metrics | ✓ + [smb](docs/collector.smb.md) | SMB Server | + [smbclient](docs/collector.smbclient.md) | SMB Client | + [smtp](docs/collector.smtp.md) | IIS SMTP Server | + [system](docs/collector.system.md) | System calls | ✓ + [tcp](docs/collector.tcp.md) | TCP connections | + [terminal_services](docs/collector.terminal_services.md) | Terminal services (RDS) | + [textfile](docs/collector.textfile.md) | Read prometheus metrics from a text file | + [thermalzone](docs/collector.thermalzone.md) | Thermal information | + [time](docs/collector.time.md) | Windows Time Service | + [udp](docs/collector.udp.md) | UDP connections | + [update](docs/collector.update.md) | Windows Update Service | + [vmware](docs/collector.vmware.md) | Performance counters installed by the Vmware Guest agent | See the linked documentation on each collector for more information on reported metrics, configuration settings and usage examples. diff --git a/docs/collector.gpu.md b/docs/collector.gpu.md new file mode 100644 index 00000000..d9e3b9cc --- /dev/null +++ b/docs/collector.gpu.md @@ -0,0 +1,139 @@ +# gpu collector + +The gpu collector exposes metrics about GPU usage and memory consumption, both at the adapter (physical GPU) and +per-process level. + +| | | +|---------------------|--------------------------------------| +| Metric name prefix | `gpu` | +| Data source | Perflib | +| Counters | GPU Engine, GPU Adapter Memory, GPU Local Adapter Memory, GPU Non Local Adapter Memory, GPU Process Memory | +| Enabled by default? 
| No | + +## Flags + +None + +## Metrics + +These metrics are available on supported versions of Windows with compatible GPUs and drivers: + +### Adapter-level Metrics + +| Name | Description | Type | Labels | +|----------------------------------------------|----------------------------------------------------------|-------|--------| +| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `phys` | +| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `phys` | +| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `phys` | +| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `phys` | +| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `phys` | + +### Per-process Metrics + +| Name | Description | Type | Labels | +|----------------------------------------------|-------------------------------------------------|---------|----------------------------------------| +| `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `phys`, `eng`, `engtype`, `process_id` | +| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `phys`,`process_id` | +| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `phys`,`process_id` | +| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `phys`,`process_id` | +| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `phys`,`process_id` | +| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `phys`,`process_id` | + +## Metric Labels + +* `phys`: Physical GPU index (e.g., "0") 
+* `eng`: GPU engine index (e.g., "0", "1", ...) +* `engtype`: GPU engine type (e.g., "3D", "Copy", "VideoDecode", etc.) +* `process_id`: Process ID + +## Example Metric + +These are basic queries to help you get started with GPU monitoring on Windows using Prometheus. + +**Show total dedicated GPU memory (in bytes) usage on GPU 0:** + +```promql +windows_gpu_adapter_memory_dedicated_bytes{phys="0"} +``` + +**Aggregate GPU utilization across all processes for a physical GPU (3D engine):** + +```promql +sum by (phys) ( + rate(windows_gpu_engine_time_seconds{phys="0", engtype="3D"}[1m]) +) * 100 +``` + +**Show GPU utilization for a specific process (3D engine):** + +```promql +sum by (phys, process_id) ( + rate(windows_gpu_engine_time_seconds{process_id="1234", engtype="3D"}[1m]) +) * 100 +``` + +**Show dedicated GPU memory per process:** + +```promql +windows_gpu_process_memory_dedicated_bytes +``` + +## Useful Queries + +**Show top 5 processes by GPU utilization (all engines):** + +```promql +topk(5, sum by (process_id) ( + rate(windows_gpu_engine_time_seconds[1m]) +) * 100) +``` + +**Show GPU memory usage per physical GPU:** + +```promql +sum by (phys) ( + windows_gpu_adapter_memory_dedicated_bytes +) +``` + +**Show GPU engine time with process owner and command line:** + +```promql +windows_gpu_engine_time_seconds * on(process_id) group_left(owner, cmdline) windows_process_info +``` + +## Alerting Examples + +**prometheus.rules** + +```yaml +# Alert on processes using more than 80% of a GPU's capacity over 10 minutes +- alert: HighGpuUtilization + expr: | + sum by (process_id) ( + rate(windows_gpu_engine_time_seconds[1m]) + ) * 100 > 80 + for: 10m + labels: + severity: warning + annotations: + summary: "High GPU Utilization (process {{ $labels.process_id }})" + description: "Process is using more than 80% of GPU resources\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" +``` + +## Notes + +* Per-process metrics allow you to identify which processes are consuming GPU 
resources. +* Adapter-level metrics provide an overview of total GPU memory usage. +* For overall GPU utilization, aggregate per-process metrics in Prometheus using queries such as `sum()`. +* The collector relies on Windows performance counters; ensure your system and drivers support these counters. + +## Enabling the Collector + +To enable the GPU collector, add `gpu` to the list of enabled collectors in your windows_exporter configuration. + +Example (command line): + +```shell +windows_exporter.exe --collectors.enabled=gpu +``` diff --git a/internal/collector/gpu/gpu.go b/internal/collector/gpu/gpu.go new file mode 100644 index 00000000..0c84cd8f --- /dev/null +++ b/internal/collector/gpu/gpu.go @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build windows + +package gpu + +import ( + "errors" + "fmt" + "log/slog" + + "github.com/alecthomas/kingpin/v2" + "github.com/prometheus-community/windows_exporter/internal/mi" + "github.com/prometheus-community/windows_exporter/internal/pdh" + "github.com/prometheus-community/windows_exporter/internal/types" + "github.com/prometheus/client_golang/prometheus" +) + +const Name = "gpu" + +type Config struct{} + +//nolint:gochecknoglobals +var ConfigDefaults = Config{} + +type Collector struct { + config Config + + // GPU Engine + gpuEnginePerfDataCollector *pdh.Collector + gpuEnginePerfDataObject []gpuEnginePerfDataCounterValues + + gpuEngineRunningTime *prometheus.Desc + + // GPU Adapter Memory + gpuAdapterMemoryPerfDataCollector *pdh.Collector + gpuAdapterMemoryPerfDataObject []gpuAdapterMemoryPerfDataCounterValues + + gpuAdapterMemoryDedicatedUsage *prometheus.Desc + gpuAdapterMemorySharedUsage *prometheus.Desc + gpuAdapterMemoryTotalCommitted *prometheus.Desc + + // GPU Local Adapter Memory + gpuLocalAdapterMemoryPerfDataCollector *pdh.Collector + gpuLocalAdapterMemoryPerfDataObject []gpuLocalAdapterMemoryPerfDataCounterValues + + gpuLocalAdapterMemoryUsage *prometheus.Desc + + // GPU Non Local Adapter Memory + gpuNonLocalAdapterMemoryPerfDataCollector *pdh.Collector + gpuNonLocalAdapterMemoryPerfDataObject []gpuNonLocalAdapterMemoryPerfDataCounterValues + + gpuNonLocalAdapterMemoryUsage *prometheus.Desc + + // GPU Process Memory + gpuProcessMemoryPerfDataCollector *pdh.Collector + gpuProcessMemoryPerfDataObject []gpuProcessMemoryPerfDataCounterValues + + gpuProcessMemoryDedicatedUsage *prometheus.Desc + gpuProcessMemoryLocalUsage *prometheus.Desc + gpuProcessMemoryNonLocalUsage *prometheus.Desc + gpuProcessMemorySharedUsage *prometheus.Desc + gpuProcessMemoryTotalCommitted *prometheus.Desc +} + +func New(config *Config) *Collector { + if config == nil { + config = &ConfigDefaults + } + + c := &Collector{ + config: *config, + } + + return c +} + +func 
NewWithFlags(_ *kingpin.Application) *Collector { + return &Collector{} +} + +func (c *Collector) GetName() string { + return Name +} + +func (c *Collector) Close() error { + c.gpuEnginePerfDataCollector.Close() + c.gpuAdapterMemoryPerfDataCollector.Close() + c.gpuLocalAdapterMemoryPerfDataCollector.Close() + c.gpuNonLocalAdapterMemoryPerfDataCollector.Close() + c.gpuProcessMemoryPerfDataCollector.Close() + + return nil +} + +func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error { + var err error + + c.gpuEngineRunningTime = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"), + "Total running time of the GPU in seconds.", + []string{"process_id", "phys", "eng", "engtype"}, + nil, + ) + + c.gpuAdapterMemoryDedicatedUsage = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_dedicated_bytes"), + "Dedicated GPU memory usage in bytes.", + []string{"phys"}, + nil, + ) + c.gpuAdapterMemorySharedUsage = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_shared_bytes"), + "Shared GPU memory usage in bytes.", + []string{"phys"}, + nil, + ) + c.gpuAdapterMemoryTotalCommitted = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_committed_bytes"), + "Total committed GPU memory in bytes.", + []string{"phys"}, + nil, + ) + + c.gpuLocalAdapterMemoryUsage = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "local_adapter_memory_bytes"), + "Local adapter memory usage in bytes.", + []string{"phys"}, + nil, + ) + + c.gpuNonLocalAdapterMemoryUsage = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "non_local_adapter_memory_bytes"), + "Non-local adapter memory usage in bytes.", + []string{"phys"}, + nil, + ) + + c.gpuProcessMemoryDedicatedUsage = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "process_memory_dedicated_bytes"), + "Dedicated process memory usage in bytes.", + 
[]string{"process_id", "phys"}, + nil, + ) + c.gpuProcessMemoryLocalUsage = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "process_memory_local_bytes"), + "Local process memory usage in bytes.", + []string{"process_id", "phys"}, + nil, + ) + c.gpuProcessMemoryNonLocalUsage = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "process_memory_non_local_bytes"), + "Non-local process memory usage in bytes.", + []string{"process_id", "phys"}, + nil, + ) + c.gpuProcessMemorySharedUsage = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "process_memory_shared_bytes"), + "Shared process memory usage in bytes.", + []string{"process_id", "phys"}, + nil, + ) + c.gpuProcessMemoryTotalCommitted = prometheus.NewDesc( + prometheus.BuildFQName(types.Namespace, Name, "process_memory_committed_bytes"), + "Total committed process memory in bytes.", + []string{"process_id", "phys"}, + nil, + ) + + errs := make([]error, 0) + + c.gpuEnginePerfDataCollector, err = pdh.NewCollector[gpuEnginePerfDataCounterValues](pdh.CounterTypeRaw, "GPU Engine", pdh.InstancesAll) + if err != nil { + errs = append(errs, fmt.Errorf("failed to create GPU Engine perf data collector: %w", err)) + } + + c.gpuAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Adapter Memory", pdh.InstancesAll) + if err != nil { + errs = append(errs, fmt.Errorf("failed to create GPU Adapter Memory perf data collector: %w", err)) + } + + c.gpuLocalAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuLocalAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Local Adapter Memory", pdh.InstancesAll) + if err != nil { + errs = append(errs, fmt.Errorf("failed to create GPU Local Adapter Memory perf data collector: %w", err)) + } + + c.gpuNonLocalAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuNonLocalAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Non Local Adapter Memory", 
pdh.InstancesAll) + if err != nil { + errs = append(errs, fmt.Errorf("failed to create GPU Non Local Adapter Memory perf data collector: %w", err)) + } + + c.gpuProcessMemoryPerfDataCollector, err = pdh.NewCollector[gpuProcessMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Process Memory", pdh.InstancesAll) + if err != nil { + errs = append(errs, fmt.Errorf("failed to create GPU Process Memory perf data collector: %w", err)) + } + + return errors.Join(errs...) +} + +func (c *Collector) Collect(ch chan<- prometheus.Metric) error { + errs := make([]error, 0) + + if err := c.collectGpuEngineMetrics(ch); err != nil { + errs = append(errs, err) + } + + if err := c.collectGpuAdapterMemoryMetrics(ch); err != nil { + errs = append(errs, err) + } + + if err := c.collectGpuLocalAdapterMemoryMetrics(ch); err != nil { + errs = append(errs, err) + } + + if err := c.collectGpuNonLocalAdapterMemoryMetrics(ch); err != nil { + errs = append(errs, err) + } + + if err := c.collectGpuProcessMemoryMetrics(ch); err != nil { + errs = append(errs, err) + } + + return errors.Join(errs...) +} + +func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error { + // Collect the GPU Engine perf data. + if err := c.gpuEnginePerfDataCollector.Collect(&c.gpuEnginePerfDataObject); err != nil { + return fmt.Errorf("failed to collect GPU Engine perf data: %w", err) + } + + runningTimeMap := make(map[PidPhysEngEngType]float64) + // Iterate over the GPU Engine perf data and aggregate the values. + for _, data := range c.gpuEnginePerfDataObject { + instance := parseGPUCounterInstanceString(data.Name) + + key := PidPhysEngEngType{ + Pid: instance.Pid, + Phys: instance.Phys, + Eng: instance.Eng, + Engtype: instance.Engtype, + } + runningTimeMap[key] += data.RunningTime / 10_000_000 // RunningTime is in 100ns units, convert to seconds. 
+ } + + for key, runningTime := range runningTimeMap { + ch <- prometheus.MustNewConstMetric( + c.gpuEngineRunningTime, + prometheus.CounterValue, + runningTime, + key.Pid, key.Phys, key.Eng, key.Engtype, + ) + } + + return nil +} + +func (c *Collector) collectGpuAdapterMemoryMetrics(ch chan<- prometheus.Metric) error { + // Collect the GPU Adapter Memory perf data. + if err := c.gpuAdapterMemoryPerfDataCollector.Collect(&c.gpuAdapterMemoryPerfDataObject); err != nil { + return fmt.Errorf("failed to collect GPU Adapter Memory perf data: %w", err) + } + + dedicatedUsageMap := make(map[PidPhysEngEngType]float64) + sharedUsageMap := make(map[PidPhysEngEngType]float64) + totalCommittedMap := make(map[PidPhysEngEngType]float64) + + for _, data := range c.gpuAdapterMemoryPerfDataObject { + instance := parseGPUCounterInstanceString(data.Name) + + key := PidPhysEngEngType{ + Pid: instance.Pid, + Phys: instance.Phys, + Eng: instance.Eng, + Engtype: instance.Engtype, + } + dedicatedUsageMap[key] += data.DedicatedUsage + sharedUsageMap[key] += data.SharedUsage + totalCommittedMap[key] += data.TotalCommitted + } + + for key, dedicatedUsage := range dedicatedUsageMap { + ch <- prometheus.MustNewConstMetric( + c.gpuAdapterMemoryDedicatedUsage, + prometheus.GaugeValue, + dedicatedUsage, + key.Phys, + ) + ch <- prometheus.MustNewConstMetric( + c.gpuAdapterMemorySharedUsage, + prometheus.GaugeValue, + sharedUsageMap[key], + key.Phys, + ) + ch <- prometheus.MustNewConstMetric( + c.gpuAdapterMemoryTotalCommitted, + prometheus.GaugeValue, + totalCommittedMap[key], + key.Phys, + ) + } + + return nil +} + +func (c *Collector) collectGpuLocalAdapterMemoryMetrics(ch chan<- prometheus.Metric) error { + // Collect the GPU Local Adapter Memory perf data. 
+ if err := c.gpuLocalAdapterMemoryPerfDataCollector.Collect(&c.gpuLocalAdapterMemoryPerfDataObject); err != nil { + return fmt.Errorf("failed to collect GPU Local Adapter Memory perf data: %w", err) + } + + localAdapterMemoryMap := make(map[string]float64) + + for _, data := range c.gpuLocalAdapterMemoryPerfDataObject { + instance := parseGPUCounterInstanceString(data.Name) + + localAdapterMemoryMap[instance.Phys] += data.LocalUsage + } + + for phys, localUsage := range localAdapterMemoryMap { + ch <- prometheus.MustNewConstMetric( + c.gpuLocalAdapterMemoryUsage, + prometheus.GaugeValue, + localUsage, + phys, + ) + } + + return nil +} + +func (c *Collector) collectGpuNonLocalAdapterMemoryMetrics(ch chan<- prometheus.Metric) error { + // Collect the GPU Non Local Adapter Memory perf data. + if err := c.gpuNonLocalAdapterMemoryPerfDataCollector.Collect(&c.gpuNonLocalAdapterMemoryPerfDataObject); err != nil { + return fmt.Errorf("failed to collect GPU Non Local Adapter Memory perf data: %w", err) + } + + nonLocalAdapterMemoryMap := make(map[string]float64) + + for _, data := range c.gpuNonLocalAdapterMemoryPerfDataObject { + instance := parseGPUCounterInstanceString(data.Name) + + nonLocalAdapterMemoryMap[instance.Phys] += data.NonLocalUsage + } + + for phys, nonLocalUsage := range nonLocalAdapterMemoryMap { + ch <- prometheus.MustNewConstMetric( + c.gpuNonLocalAdapterMemoryUsage, + prometheus.GaugeValue, + nonLocalUsage, + phys, + ) + } + + return nil +} + +func (c *Collector) collectGpuProcessMemoryMetrics(ch chan<- prometheus.Metric) error { + // Collect the GPU Process Memory perf data. 
+ if err := c.gpuProcessMemoryPerfDataCollector.Collect(&c.gpuProcessMemoryPerfDataObject); err != nil { + return fmt.Errorf("failed to collect GPU Process Memory perf data: %w", err) + } + + processDedicatedUsageMap := make(map[PidPhys]float64) + processLocalUsageMap := make(map[PidPhys]float64) + processNonLocalUsageMap := make(map[PidPhys]float64) + processSharedUsageMap := make(map[PidPhys]float64) + processTotalCommittedMap := make(map[PidPhys]float64) + + for _, data := range c.gpuProcessMemoryPerfDataObject { + instance := parseGPUCounterInstanceString(data.Name) + + key := PidPhys{ + Pid: instance.Pid, + Phys: instance.Phys, + } + processDedicatedUsageMap[key] += data.DedicatedUsage + processLocalUsageMap[key] += data.LocalUsage + processNonLocalUsageMap[key] += data.NonLocalUsage + processSharedUsageMap[key] += data.SharedUsage + processTotalCommittedMap[key] += data.TotalCommitted + } + + for key, dedicatedUsage := range processDedicatedUsageMap { + ch <- prometheus.MustNewConstMetric( + c.gpuProcessMemoryDedicatedUsage, + prometheus.GaugeValue, + dedicatedUsage, + key.Pid, key.Phys, + ) + ch <- prometheus.MustNewConstMetric( + c.gpuProcessMemoryLocalUsage, + prometheus.GaugeValue, + processLocalUsageMap[key], + key.Pid, key.Phys, + ) + ch <- prometheus.MustNewConstMetric( + c.gpuProcessMemoryNonLocalUsage, + prometheus.GaugeValue, + processNonLocalUsageMap[key], + key.Pid, key.Phys, + ) + ch <- prometheus.MustNewConstMetric( + c.gpuProcessMemorySharedUsage, + prometheus.GaugeValue, + processSharedUsageMap[key], + key.Pid, key.Phys, + ) + ch <- prometheus.MustNewConstMetric( + c.gpuProcessMemoryTotalCommitted, + prometheus.GaugeValue, + processTotalCommittedMap[key], + key.Pid, key.Phys, + ) + } + + return nil +} diff --git a/internal/collector/gpu/gpu_test.go b/internal/collector/gpu/gpu_test.go new file mode 100644 index 00000000..1897cd77 --- /dev/null +++ b/internal/collector/gpu/gpu_test.go @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: Apache-2.0 +// 
+// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build windows + +package gpu_test + +import ( + "testing" + + "github.com/prometheus-community/windows_exporter/internal/collector/gpu" + "github.com/prometheus-community/windows_exporter/internal/utils/testutils" +) + +func BenchmarkCollector(b *testing.B) { + testutils.FuncBenchmarkCollector(b, gpu.Name, gpu.NewWithFlags) +} + +func TestCollector(t *testing.T) { + testutils.TestCollector(t, gpu.New, nil) +} diff --git a/internal/collector/gpu/types.go b/internal/collector/gpu/types.go new file mode 100644 index 00000000..affd96a9 --- /dev/null +++ b/internal/collector/gpu/types.go @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build windows + +package gpu + +type gpuEnginePerfDataCounterValues struct { + Name string + + RunningTime float64 `perfdata:"Running Time"` + UtilizationPercentage float64 `perfdata:"Utilization Percentage"` +} + +type gpuAdapterMemoryPerfDataCounterValues struct { + Name string + + DedicatedUsage float64 `perfdata:"Dedicated Usage"` + SharedUsage float64 `perfdata:"Shared Usage"` + TotalCommitted float64 `perfdata:"Total Committed"` +} + +type gpuLocalAdapterMemoryPerfDataCounterValues struct { + Name string + + LocalUsage float64 `perfdata:"Local Usage"` +} + +type gpuNonLocalAdapterMemoryPerfDataCounterValues struct { + Name string + + NonLocalUsage float64 `perfdata:"Non Local Usage"` +} + +type gpuProcessMemoryPerfDataCounterValues struct { + Name string + + DedicatedUsage float64 `perfdata:"Dedicated Usage"` + LocalUsage float64 `perfdata:"Local Usage"` + NonLocalUsage float64 `perfdata:"Non Local Usage"` + SharedUsage float64 `perfdata:"Shared Usage"` + TotalCommitted float64 `perfdata:"Total Committed"` +} diff --git a/internal/collector/gpu/utils.go b/internal/collector/gpu/utils.go new file mode 100644 index 00000000..cf177e3a --- /dev/null +++ b/internal/collector/gpu/utils.go @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build windows + +package gpu + +import ( + "strings" +) + +type Instance struct { + Pid string + Luid [2]string + Phys string + Eng string + Engtype string + Part string +} + +type PidPhys struct { + Pid string + Phys string +} + +type PidPhysEngEngType struct { + Pid string + Phys string + Eng string + Engtype string +} + +func parseGPUCounterInstanceString(s string) Instance { + // Example: "pid_1234_luid_0x00000000_0x00005678_phys_0_eng_0_engtype_3D" + // Example: "luid_0x00000000_0x00005678_phys_0" + // Example: "luid_0x00000000_0x00005678_phys_0_part_0" + parts := strings.Split(s, "_") + + var instance Instance + + for i, part := range parts { + switch part { + case "pid": + if i+1 < len(parts) { + instance.Pid = parts[i+1] + } + case "luid": + if i+2 < len(parts) { + instance.Luid[0] = parts[i+1] + instance.Luid[1] = parts[i+2] + } + case "phys": + if i+1 < len(parts) { + instance.Phys = parts[i+1] + } + case "eng": + if i+1 < len(parts) { + instance.Eng = parts[i+1] + } + case "engtype": + if i+1 < len(parts) { + instance.Engtype = parts[i+1] + } + case "part": + if i+1 < len(parts) { + instance.Part = parts[i+1] + } + } + } + + return instance +} diff --git a/pkg/collector/collection.go b/pkg/collector/collection.go index 5f4f782a..f11d1deb 100644 --- a/pkg/collector/collection.go +++ b/pkg/collector/collection.go @@ -43,6 +43,7 @@ import ( "github.com/prometheus-community/windows_exporter/internal/collector/exchange" "github.com/prometheus-community/windows_exporter/internal/collector/filetime" "github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota" + "github.com/prometheus-community/windows_exporter/internal/collector/gpu" "github.com/prometheus-community/windows_exporter/internal/collector/hyperv" "github.com/prometheus-community/windows_exporter/internal/collector/iis" "github.com/prometheus-community/windows_exporter/internal/collector/license" @@ -114,6 +115,7 @@ func NewWithConfig(config Config) *Collection { 
collectors[exchange.Name] = exchange.New(&config.Exchange) collectors[filetime.Name] = filetime.New(&config.Filetime) collectors[fsrmquota.Name] = fsrmquota.New(&config.Fsrmquota) + collectors[gpu.Name] = gpu.New(&config.GPU) collectors[hyperv.Name] = hyperv.New(&config.HyperV) collectors[iis.Name] = iis.New(&config.IIS) collectors[license.Name] = license.New(&config.License) diff --git a/pkg/collector/config.go b/pkg/collector/config.go index 31d5e08e..791247f1 100644 --- a/pkg/collector/config.go +++ b/pkg/collector/config.go @@ -33,6 +33,7 @@ import ( "github.com/prometheus-community/windows_exporter/internal/collector/exchange" "github.com/prometheus-community/windows_exporter/internal/collector/filetime" "github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota" + "github.com/prometheus-community/windows_exporter/internal/collector/gpu" "github.com/prometheus-community/windows_exporter/internal/collector/hyperv" "github.com/prometheus-community/windows_exporter/internal/collector/iis" "github.com/prometheus-community/windows_exporter/internal/collector/license" @@ -84,6 +85,7 @@ type Config struct { Exchange exchange.Config `yaml:"exchange"` Filetime filetime.Config `yaml:"filetime"` Fsrmquota fsrmquota.Config `yaml:"fsrmquota"` + GPU gpu.Config `yaml:"gpu"` HyperV hyperv.Config `yaml:"hyperv"` IIS iis.Config `yaml:"iis"` License license.Config `yaml:"license"` @@ -139,6 +141,7 @@ var ConfigDefaults = Config{ Exchange: exchange.ConfigDefaults, Filetime: filetime.ConfigDefaults, Fsrmquota: fsrmquota.ConfigDefaults, + GPU: gpu.ConfigDefaults, HyperV: hyperv.ConfigDefaults, IIS: iis.ConfigDefaults, License: license.ConfigDefaults, diff --git a/pkg/collector/map.go b/pkg/collector/map.go index 7b4e692b..bf9a8236 100644 --- a/pkg/collector/map.go +++ b/pkg/collector/map.go @@ -37,6 +37,7 @@ import ( "github.com/prometheus-community/windows_exporter/internal/collector/exchange" 
"github.com/prometheus-community/windows_exporter/internal/collector/filetime" "github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota" + "github.com/prometheus-community/windows_exporter/internal/collector/gpu" "github.com/prometheus-community/windows_exporter/internal/collector/hyperv" "github.com/prometheus-community/windows_exporter/internal/collector/iis" "github.com/prometheus-community/windows_exporter/internal/collector/license" @@ -95,6 +96,7 @@ var BuildersWithFlags = map[string]BuilderWithFlags[Collector]{ exchange.Name: NewBuilderWithFlags(exchange.NewWithFlags), filetime.Name: NewBuilderWithFlags(filetime.NewWithFlags), fsrmquota.Name: NewBuilderWithFlags(fsrmquota.NewWithFlags), + gpu.Name: NewBuilderWithFlags(gpu.NewWithFlags), hyperv.Name: NewBuilderWithFlags(hyperv.NewWithFlags), iis.Name: NewBuilderWithFlags(iis.NewWithFlags), license.Name: NewBuilderWithFlags(license.NewWithFlags),