gpu: add metrics collector and related types (#2052) (#2059)

This commit is contained in:
小荣
2025-06-03 01:45:18 +08:00
committed by GitHub
parent e673f192d2
commit dcf85032ca
9 changed files with 798 additions and 48 deletions

View File

@@ -12,54 +12,55 @@ A Prometheus exporter for Windows machines.
## Collectors
Name | Description | Enabled by default
---------|-------------|--------------------
[ad](docs/collector.ad.md) | Active Directory Domain Services |
[adcs](docs/collector.adcs.md) | Active Directory Certificate Services |
[adfs](docs/collector.adfs.md) | Active Directory Federation Services |
[cache](docs/collector.cache.md) | Cache metrics |
[cpu](docs/collector.cpu.md) | CPU usage | ✓
[cpu_info](docs/collector.cpu_info.md) | CPU Information |
[cs](docs/collector.cs.md) | "Computer System" metrics (system properties, num cpus/total memory) |
[container](docs/collector.container.md) | Container metrics |
[diskdrive](docs/collector.diskdrive.md) | Diskdrive metrics |
[dfsr](docs/collector.dfsr.md) | DFSR metrics |
[dhcp](docs/collector.dhcp.md) | DHCP Server |
[dns](docs/collector.dns.md) | DNS Server |
[exchange](docs/collector.exchange.md) | Exchange metrics |
[filetime](docs/collector.filetime.md) | FileTime metrics |
[fsrmquota](docs/collector.fsrmquota.md) | Microsoft File Server Resource Manager (FSRM) Quotas collector |
[hyperv](docs/collector.hyperv.md) | Hyper-V hosts |
[iis](docs/collector.iis.md) | IIS sites and applications |
[license](docs/collector.license.md) | Windows license status |
[logical_disk](docs/collector.logical_disk.md) | Logical disks, disk I/O | ✓
[memory](docs/collector.memory.md) | Memory usage metrics | ✓
[mscluster](docs/collector.mscluster.md) | MSCluster metrics |
[msmq](docs/collector.msmq.md) | MSMQ queues |
[mssql](docs/collector.mssql.md) | [SQL Server Performance Objects](https://docs.microsoft.com/en-us/sql/relational-databases/performance-monitor/use-sql-server-objects#SQLServerPOs) metrics |
[netframework](docs/collector.netframework.md) | .NET Framework metrics |
[net](docs/collector.net.md) | Network interface I/O | ✓
[os](docs/collector.os.md) | OS metrics (memory, processes, users) | ✓
[pagefile](docs/collector.pagefile.md) | pagefile metrics |
[performancecounter](docs/collector.performancecounter.md) | Custom performance counter metrics |
[physical_disk](docs/collector.physical_disk.md) | physical disk metrics | ✓
[printer](docs/collector.printer.md) | Printer metrics |
[process](docs/collector.process.md) | Per-process metrics |
[remote_fx](docs/collector.remote_fx.md) | RemoteFX protocol (RDP) metrics |
[scheduled_task](docs/collector.scheduled_task.md) | Scheduled Tasks metrics |
[service](docs/collector.service.md) | Service state metrics | ✓
[smb](docs/collector.smb.md) | SMB Server |
[smbclient](docs/collector.smbclient.md) | SMB Client |
[smtp](docs/collector.smtp.md) | IIS SMTP Server |
[system](docs/collector.system.md) | System calls | ✓
[tcp](docs/collector.tcp.md) | TCP connections |
[terminal_services](docs/collector.terminal_services.md) | Terminal services (RDS)
[textfile](docs/collector.textfile.md) | Read prometheus metrics from a text file |
[thermalzone](docs/collector.thermalzone.md) | Thermal information |
[time](docs/collector.time.md) | Windows Time Service |
[udp](docs/collector.udp.md) | UDP connections |
[update](docs/collector.update.md) | Windows Update Service |
[vmware](docs/collector.vmware.md) | Performance counters installed by the Vmware Guest agent |
Name | Description | Enabled by default
------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------
[ad](docs/collector.ad.md) | Active Directory Domain Services |
[adcs](docs/collector.adcs.md) | Active Directory Certificate Services |
[adfs](docs/collector.adfs.md) | Active Directory Federation Services |
[cache](docs/collector.cache.md) | Cache metrics |
[cpu](docs/collector.cpu.md) | CPU usage | ✓
[cpu_info](docs/collector.cpu_info.md) | CPU Information |
[cs](docs/collector.cs.md) | "Computer System" metrics (system properties, num cpus/total memory) |
[container](docs/collector.container.md) | Container metrics |
[diskdrive](docs/collector.diskdrive.md) | Diskdrive metrics |
[dfsr](docs/collector.dfsr.md) | DFSR metrics |
[dhcp](docs/collector.dhcp.md) | DHCP Server |
[dns](docs/collector.dns.md) | DNS Server |
[exchange](docs/collector.exchange.md) | Exchange metrics |
[filetime](docs/collector.filetime.md) | FileTime metrics |
[fsrmquota](docs/collector.fsrmquota.md) | Microsoft File Server Resource Manager (FSRM) Quotas collector |
[gpu](docs/collector.gpu.md) | GPU metrics |
[hyperv](docs/collector.hyperv.md) | Hyper-V hosts |
[iis](docs/collector.iis.md) | IIS sites and applications |
[license](docs/collector.license.md) | Windows license status |
[logical_disk](docs/collector.logical_disk.md) | Logical disks, disk I/O | ✓
[memory](docs/collector.memory.md) | Memory usage metrics | ✓
[mscluster](docs/collector.mscluster.md) | MSCluster metrics |
[msmq](docs/collector.msmq.md) | MSMQ queues |
[mssql](docs/collector.mssql.md) | [SQL Server Performance Objects](https://docs.microsoft.com/en-us/sql/relational-databases/performance-monitor/use-sql-server-objects#SQLServerPOs) metrics |
[netframework](docs/collector.netframework.md) | .NET Framework metrics |
[net](docs/collector.net.md) | Network interface I/O | ✓
[os](docs/collector.os.md) | OS metrics (memory, processes, users) | ✓
[pagefile](docs/collector.pagefile.md) | pagefile metrics |
[performancecounter](docs/collector.performancecounter.md) | Custom performance counter metrics |
[physical_disk](docs/collector.physical_disk.md) | physical disk metrics | ✓
[printer](docs/collector.printer.md) | Printer metrics |
[process](docs/collector.process.md) | Per-process metrics |
[remote_fx](docs/collector.remote_fx.md) | RemoteFX protocol (RDP) metrics |
[scheduled_task](docs/collector.scheduled_task.md) | Scheduled Tasks metrics |
[service](docs/collector.service.md) | Service state metrics | ✓
[smb](docs/collector.smb.md) | SMB Server |
[smbclient](docs/collector.smbclient.md) | SMB Client |
[smtp](docs/collector.smtp.md) | IIS SMTP Server |
[system](docs/collector.system.md) | System calls | ✓
[tcp](docs/collector.tcp.md) | TCP connections |
[terminal_services](docs/collector.terminal_services.md) | Terminal services (RDS)
[textfile](docs/collector.textfile.md) | Read prometheus metrics from a text file |
[thermalzone](docs/collector.thermalzone.md) | Thermal information |
[time](docs/collector.time.md) | Windows Time Service |
[udp](docs/collector.udp.md) | UDP connections |
[update](docs/collector.update.md) | Windows Update Service |
[vmware](docs/collector.vmware.md) | Performance counters installed by the Vmware Guest agent |
See the linked documentation on each collector for more information on reported metrics, configuration settings and usage examples.

139
docs/collector.gpu.md Normal file
View File

@@ -0,0 +1,139 @@
# gpu collector
The gpu collector exposes metrics about GPU usage and memory consumption, both at the adapter (physical GPU) and
per-process level.
| | |
|---------------------|--------------------------------------|
| Metric name prefix | `gpu` |
| Data source | Perflib |
| Counters | GPU Engine, GPU Adapter, GPU Process |
| Enabled by default? | No |
## Flags
None
## Metrics
These metrics are available on supported versions of Windows with compatible GPUs and drivers:
### Adapter-level Metrics
| Name | Description | Type | Labels |
|----------------------------------------------|----------------------------------------------------------|-------|--------|
| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `phys` |
### Per-process Metrics
| Name | Description | Type | Labels |
|----------------------------------------------|-------------------------------------------------|---------|----------------------------------------|
| `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `phys`, `eng`, `engtype`, `process_id` |
| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
## Metric Labels
* `phys`: Physical GPU index (e.g., "0")
* `eng`: GPU engine index (e.g., "0", "1", ...)
* `engtype`: GPU engine type (e.g., "3D", "Copy", "VideoDecode", etc.)
* `process_id`: Process ID
## Example Metric
These are basic queries to help you get started with GPU monitoring on Windows using Prometheus.
**Show total dedicated GPU memory (in bytes) usage on GPU 0:**
```promql
windows_gpu_adapter_memory_dedicated_bytes{phys="0"}
```
**Aggregate GPU utilization across all processes for a physical GPU (3D engine):**
```promql
sum by (phys) (
rate(windows_gpu_engine_time_seconds{phys="0", engtype="3D"}[1m])
) * 100
```
**Show GPU utilization for a specific process (3D engine):**
```promql
sum by (phys, process_id) (
rate(windows_gpu_engine_time_seconds{process_id="1234", engtype="3D"}[1m])
) * 100
```
**Show dedicated GPU memory per process:**
```promql
windows_gpu_adapter_memory_dedicated_bytes
```
## Useful Queries
**Show top 5 processes by GPU utilization (all engines):**
```promql
topk(5, sum by (process_id) (
rate(windows_gpu_engine_time_seconds[1m])
) * 100)
```
**Show GPU memory usage per physical GPU:**
```promql
sum by (phys) (
windows_gpu_adapter_memory_dedicated_bytes
)
```
Show GPU engine time with process owner and command line:
```promql
windows_gpu_engine_time_seconds * on(process_id) group_left(owner, cmdline) windows_process_info
```
## Alerting Examples
**prometheus.rules**
```yaml
# Alert on processes using more than 80% of a GPU's capacity over 10 minutes
- alert: HighGpuUtilization
expr: |
sum by (process_id) (
rate(windows_gpu_engine_time_seconds[1m])
) * 100 > 80
for: 10m
labels:
severity: warning
annotations:
summary: "High GPU Utilization (process {{ $labels.process_id }})"
description: "Process is using more than 80% of GPU resources\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
```
## Notes
* Per-process metrics allow you to identify which processes are consuming GPU resources.
* Adapter-level metrics provide an overview of total GPU memory usage.
* For overall GPU utilization, aggregate per-process metrics in Prometheus using queries such as `sum()`.
* The collector relies on Windows performance counters; ensure your system and drivers support these counters.
## Enabling the Collector
To enable the GPU collector, add `gpu` to the list of enabled collectors in your windows_exporter configuration.
Example (command line):
```shell
windows_exporter.exe --collectors.enabled=gpu
```

View File

@@ -0,0 +1,431 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package gpu
import (
"errors"
"fmt"
"log/slog"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus-community/windows_exporter/internal/mi"
"github.com/prometheus-community/windows_exporter/internal/pdh"
"github.com/prometheus-community/windows_exporter/internal/types"
"github.com/prometheus/client_golang/prometheus"
)
const Name = "gpu"
type Config struct{}
//nolint:gochecknoglobals
var ConfigDefaults = Config{}
type Collector struct {
config Config
// GPU Engine
gpuEnginePerfDataCollector *pdh.Collector
gpuEnginePerfDataObject []gpuEnginePerfDataCounterValues
gpuEngineRunningTime *prometheus.Desc
// GPU Adapter Memory
gpuAdapterMemoryPerfDataCollector *pdh.Collector
gpuAdapterMemoryPerfDataObject []gpuAdapterMemoryPerfDataCounterValues
gpuAdapterMemoryDedicatedUsage *prometheus.Desc
gpuAdapterMemorySharedUsage *prometheus.Desc
gpuAdapterMemoryTotalCommitted *prometheus.Desc
// GPU Local Adapter Memory
gpuLocalAdapterMemoryPerfDataCollector *pdh.Collector
gpuLocalAdapterMemoryPerfDataObject []gpuLocalAdapterMemoryPerfDataCounterValues
gpuLocalAdapterMemoryUsage *prometheus.Desc
// GPU Non Local Adapter Memory
gpuNonLocalAdapterMemoryPerfDataCollector *pdh.Collector
gpuNonLocalAdapterMemoryPerfDataObject []gpuNonLocalAdapterMemoryPerfDataCounterValues
gpuNonLocalAdapterMemoryUsage *prometheus.Desc
// GPU Process Memory
gpuProcessMemoryPerfDataCollector *pdh.Collector
gpuProcessMemoryPerfDataObject []gpuProcessMemoryPerfDataCounterValues
gpuProcessMemoryDedicatedUsage *prometheus.Desc
gpuProcessMemoryLocalUsage *prometheus.Desc
gpuProcessMemoryNonLocalUsage *prometheus.Desc
gpuProcessMemorySharedUsage *prometheus.Desc
gpuProcessMemoryTotalCommitted *prometheus.Desc
}
func New(config *Config) *Collector {
if config == nil {
config = &ConfigDefaults
}
c := &Collector{
config: *config,
}
return c
}
func NewWithFlags(_ *kingpin.Application) *Collector {
return &Collector{}
}
func (c *Collector) GetName() string {
return Name
}
func (c *Collector) Close() error {
c.gpuEnginePerfDataCollector.Close()
c.gpuAdapterMemoryPerfDataCollector.Close()
c.gpuLocalAdapterMemoryPerfDataCollector.Close()
c.gpuNonLocalAdapterMemoryPerfDataCollector.Close()
c.gpuProcessMemoryPerfDataCollector.Close()
return nil
}
func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
var err error
c.gpuEngineRunningTime = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"),
"Total running time of the GPU in seconds.",
[]string{"process_id", "phys", "eng", "engtype"},
nil,
)
c.gpuAdapterMemoryDedicatedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_dedicated_bytes"),
"Dedicated GPU memory usage in bytes.",
[]string{"phys"},
nil,
)
c.gpuAdapterMemorySharedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_shared_bytes"),
"Shared GPU memory usage in bytes.",
[]string{"phys"},
nil,
)
c.gpuAdapterMemoryTotalCommitted = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_committed_bytes"),
"Total committed GPU memory in bytes.",
[]string{"phys"},
nil,
)
c.gpuLocalAdapterMemoryUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "local_adapter_memory_bytes"),
"Local adapter memory usage in bytes.",
[]string{"phys"},
nil,
)
c.gpuNonLocalAdapterMemoryUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "non_local_adapter_memory_bytes"),
"Non-local adapter memory usage in bytes.",
[]string{"phys"},
nil,
)
c.gpuProcessMemoryDedicatedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_dedicated_bytes"),
"Dedicated process memory usage in bytes.",
[]string{"process_id", "phys"},
nil,
)
c.gpuProcessMemoryLocalUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_local_bytes"),
"Local process memory usage in bytes.",
[]string{"process_id", "phys"},
nil,
)
c.gpuProcessMemoryNonLocalUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_non_local_bytes"),
"Non-local process memory usage in bytes.",
[]string{"process_id", "phys"},
nil,
)
c.gpuProcessMemorySharedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_shared_bytes"),
"Shared process memory usage in bytes.",
[]string{"process_id", "phys"},
nil,
)
c.gpuProcessMemoryTotalCommitted = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_committed_bytes"),
"Total committed process memory in bytes.",
[]string{"process_id", "phys"},
nil,
)
errs := make([]error, 0)
c.gpuEnginePerfDataCollector, err = pdh.NewCollector[gpuEnginePerfDataCounterValues](pdh.CounterTypeRaw, "GPU Engine", pdh.InstancesAll)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create GPU Engine perf data collector: %w", err))
}
c.gpuAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Adapter Memory", pdh.InstancesAll)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create GPU Adapter Memory perf data collector: %w", err))
}
c.gpuLocalAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuLocalAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Local Adapter Memory", pdh.InstancesAll)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create GPU Local Adapter Memory perf data collector: %w", err))
}
c.gpuNonLocalAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuNonLocalAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Non Local Adapter Memory", pdh.InstancesAll)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create GPU Non Local Adapter Memory perf data collector: %w", err))
}
c.gpuProcessMemoryPerfDataCollector, err = pdh.NewCollector[gpuProcessMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Process Memory", pdh.InstancesAll)
if err != nil {
errs = append(errs, fmt.Errorf("failed to create GPU Process Memory perf data collector: %w", err))
}
return errors.Join(errs...)
}
func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
errs := make([]error, 0)
if err := c.collectGpuEngineMetrics(ch); err != nil {
errs = append(errs, err)
}
if err := c.collectGpuAdapterMemoryMetrics(ch); err != nil {
errs = append(errs, err)
}
if err := c.collectGpuLocalAdapterMemoryMetrics(ch); err != nil {
errs = append(errs, err)
}
if err := c.collectGpuNonLocalAdapterMemoryMetrics(ch); err != nil {
errs = append(errs, err)
}
if err := c.collectGpuProcessMemoryMetrics(ch); err != nil {
errs = append(errs, err)
}
return errors.Join(errs...)
}
func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error {
// Collect the GPU Engine perf data.
if err := c.gpuEnginePerfDataCollector.Collect(&c.gpuEnginePerfDataObject); err != nil {
return fmt.Errorf("failed to collect GPU Engine perf data: %w", err)
}
runningTimeMap := make(map[PidPhysEngEngType]float64)
// Iterate over the GPU Engine perf data and aggregate the values.
for _, data := range c.gpuEnginePerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
key := PidPhysEngEngType{
Pid: instance.Pid,
Phys: instance.Phys,
Eng: instance.Eng,
Engtype: instance.Engtype,
}
runningTimeMap[key] += data.RunningTime / 10_000_000 // RunningTime is in 100ns units, convert to seconds.
}
for key, runningTime := range runningTimeMap {
ch <- prometheus.MustNewConstMetric(
c.gpuEngineRunningTime,
prometheus.CounterValue,
runningTime,
key.Pid, key.Phys, key.Eng, key.Engtype,
)
}
return nil
}
func (c *Collector) collectGpuAdapterMemoryMetrics(ch chan<- prometheus.Metric) error {
// Collect the GPU Adapter Memory perf data.
if err := c.gpuAdapterMemoryPerfDataCollector.Collect(&c.gpuAdapterMemoryPerfDataObject); err != nil {
return fmt.Errorf("failed to collect GPU Adapter Memory perf data: %w", err)
}
dedicatedUsageMap := make(map[PidPhysEngEngType]float64)
sharedUsageMap := make(map[PidPhysEngEngType]float64)
totalCommittedMap := make(map[PidPhysEngEngType]float64)
for _, data := range c.gpuAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
key := PidPhysEngEngType{
Pid: instance.Pid,
Phys: instance.Phys,
Eng: instance.Eng,
Engtype: instance.Engtype,
}
dedicatedUsageMap[key] += data.DedicatedUsage
sharedUsageMap[key] += data.SharedUsage
totalCommittedMap[key] += data.TotalCommitted
}
for key, dedicatedUsage := range dedicatedUsageMap {
ch <- prometheus.MustNewConstMetric(
c.gpuAdapterMemoryDedicatedUsage,
prometheus.GaugeValue,
dedicatedUsage,
key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuAdapterMemorySharedUsage,
prometheus.GaugeValue,
sharedUsageMap[key],
key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuAdapterMemoryTotalCommitted,
prometheus.GaugeValue,
totalCommittedMap[key],
key.Phys,
)
}
return nil
}
func (c *Collector) collectGpuLocalAdapterMemoryMetrics(ch chan<- prometheus.Metric) error {
// Collect the GPU Local Adapter Memory perf data.
if err := c.gpuLocalAdapterMemoryPerfDataCollector.Collect(&c.gpuLocalAdapterMemoryPerfDataObject); err != nil {
return fmt.Errorf("failed to collect GPU Local Adapter Memory perf data: %w", err)
}
localAdapterMemoryMap := make(map[string]float64)
for _, data := range c.gpuLocalAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
localAdapterMemoryMap[instance.Phys] += data.LocalUsage
}
for phys, localUsage := range localAdapterMemoryMap {
ch <- prometheus.MustNewConstMetric(
c.gpuLocalAdapterMemoryUsage,
prometheus.GaugeValue,
localUsage,
phys,
)
}
return nil
}
func (c *Collector) collectGpuNonLocalAdapterMemoryMetrics(ch chan<- prometheus.Metric) error {
// Collect the GPU Non Local Adapter Memory perf data.
if err := c.gpuNonLocalAdapterMemoryPerfDataCollector.Collect(&c.gpuNonLocalAdapterMemoryPerfDataObject); err != nil {
return fmt.Errorf("failed to collect GPU Non Local Adapter Memory perf data: %w", err)
}
nonLocalAdapterMemoryMap := make(map[string]float64)
for _, data := range c.gpuNonLocalAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
nonLocalAdapterMemoryMap[instance.Phys] += data.NonLocalUsage
}
for phys, nonLocalUsage := range nonLocalAdapterMemoryMap {
ch <- prometheus.MustNewConstMetric(
c.gpuNonLocalAdapterMemoryUsage,
prometheus.GaugeValue,
nonLocalUsage,
phys,
)
}
return nil
}
func (c *Collector) collectGpuProcessMemoryMetrics(ch chan<- prometheus.Metric) error {
// Collect the GPU Process Memory perf data.
if err := c.gpuProcessMemoryPerfDataCollector.Collect(&c.gpuProcessMemoryPerfDataObject); err != nil {
return fmt.Errorf("failed to collect GPU Process Memory perf data: %w", err)
}
processDedicatedUsageMap := make(map[PidPhys]float64)
processLocalUsageMap := make(map[PidPhys]float64)
processNonLocalUsageMap := make(map[PidPhys]float64)
processSharedUsageMap := make(map[PidPhys]float64)
processTotalCommittedMap := make(map[PidPhys]float64)
for _, data := range c.gpuProcessMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
key := PidPhys{
Pid: instance.Pid,
Phys: instance.Phys,
}
processDedicatedUsageMap[key] += data.DedicatedUsage
processLocalUsageMap[key] += data.LocalUsage
processNonLocalUsageMap[key] += data.NonLocalUsage
processSharedUsageMap[key] += data.SharedUsage
processTotalCommittedMap[key] += data.TotalCommitted
}
for key, dedicatedUsage := range processDedicatedUsageMap {
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryDedicatedUsage,
prometheus.GaugeValue,
dedicatedUsage,
key.Pid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryLocalUsage,
prometheus.GaugeValue,
processLocalUsageMap[key],
key.Pid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryNonLocalUsage,
prometheus.GaugeValue,
processNonLocalUsageMap[key],
key.Pid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemorySharedUsage,
prometheus.GaugeValue,
processSharedUsageMap[key],
key.Pid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryTotalCommitted,
prometheus.GaugeValue,
processTotalCommittedMap[key],
key.Pid, key.Phys,
)
}
return nil
}

View File

@@ -0,0 +1,33 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package gpu_test
import (
"testing"
"github.com/prometheus-community/windows_exporter/internal/collector/gpu"
"github.com/prometheus-community/windows_exporter/internal/utils/testutils"
)
func BenchmarkCollector(b *testing.B) {
testutils.FuncBenchmarkCollector(b, gpu.Name, gpu.NewWithFlags)
}
func TestCollector(t *testing.T) {
testutils.TestCollector(t, gpu.New, nil)
}

View File

@@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package gpu
type gpuEnginePerfDataCounterValues struct {
Name string
RunningTime float64 `perfdata:"Running Time"`
UtilizationPercentage float64 `perfdata:"Utilization Percentage"`
}
type gpuAdapterMemoryPerfDataCounterValues struct {
Name string
DedicatedUsage float64 `perfdata:"Dedicated Usage"`
SharedUsage float64 `perfdata:"Shared Usage"`
TotalCommitted float64 `perfdata:"Total Committed"`
}
type gpuLocalAdapterMemoryPerfDataCounterValues struct {
Name string
LocalUsage float64 `perfdata:"Local Usage"`
}
type gpuNonLocalAdapterMemoryPerfDataCounterValues struct {
Name string
NonLocalUsage float64 `perfdata:"Non Local Usage"`
}
type gpuProcessMemoryPerfDataCounterValues struct {
Name string
DedicatedUsage float64 `perfdata:"Dedicated Usage"`
LocalUsage float64 `perfdata:"Local Usage"`
NonLocalUsage float64 `perfdata:"Non Local Usage"`
SharedUsage float64 `perfdata:"Shared Usage"`
TotalCommitted float64 `perfdata:"Total Committed"`
}

View File

@@ -0,0 +1,84 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package gpu
import (
"strings"
)
type Instance struct {
Pid string
Luid [2]string
Phys string
Eng string
Engtype string
Part string
}
type PidPhys struct {
Pid string
Phys string
}
type PidPhysEngEngType struct {
Pid string
Phys string
Eng string
Engtype string
}
func parseGPUCounterInstanceString(s string) Instance {
// Example: "pid_1234_luid_0x00000000_0x00005678_phys_0_eng_0_engtype_3D"
// Example: "luid_0x00000000_0x00005678_phys_0"
// Example: "luid_0x00000000_0x00005678_phys_0_part_0"
parts := strings.Split(s, "_")
var instance Instance
for i, part := range parts {
switch part {
case "pid":
if i+1 < len(parts) {
instance.Pid = parts[i+1]
}
case "luid":
if i+2 < len(parts) {
instance.Luid[0] = parts[i+1]
instance.Luid[1] = parts[i+2]
}
case "phys":
if i+1 < len(parts) {
instance.Phys = parts[i+1]
}
case "eng":
if i+1 < len(parts) {
instance.Eng = parts[i+1]
}
case "engtype":
if i+1 < len(parts) {
instance.Engtype = parts[i+1]
}
case "part":
if i+1 < len(parts) {
instance.Part = parts[i+1]
}
}
}
return instance
}

View File

@@ -43,6 +43,7 @@ import (
"github.com/prometheus-community/windows_exporter/internal/collector/exchange"
"github.com/prometheus-community/windows_exporter/internal/collector/filetime"
"github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota"
"github.com/prometheus-community/windows_exporter/internal/collector/gpu"
"github.com/prometheus-community/windows_exporter/internal/collector/hyperv"
"github.com/prometheus-community/windows_exporter/internal/collector/iis"
"github.com/prometheus-community/windows_exporter/internal/collector/license"
@@ -114,6 +115,7 @@ func NewWithConfig(config Config) *Collection {
collectors[exchange.Name] = exchange.New(&config.Exchange)
collectors[filetime.Name] = filetime.New(&config.Filetime)
collectors[fsrmquota.Name] = fsrmquota.New(&config.Fsrmquota)
collectors[gpu.Name] = gpu.New(&config.GPU)
collectors[hyperv.Name] = hyperv.New(&config.HyperV)
collectors[iis.Name] = iis.New(&config.IIS)
collectors[license.Name] = license.New(&config.License)

View File

@@ -33,6 +33,7 @@ import (
"github.com/prometheus-community/windows_exporter/internal/collector/exchange"
"github.com/prometheus-community/windows_exporter/internal/collector/filetime"
"github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota"
"github.com/prometheus-community/windows_exporter/internal/collector/gpu"
"github.com/prometheus-community/windows_exporter/internal/collector/hyperv"
"github.com/prometheus-community/windows_exporter/internal/collector/iis"
"github.com/prometheus-community/windows_exporter/internal/collector/license"
@@ -84,6 +85,7 @@ type Config struct {
Exchange exchange.Config `yaml:"exchange"`
Filetime filetime.Config `yaml:"filetime"`
Fsrmquota fsrmquota.Config `yaml:"fsrmquota"`
GPU gpu.Config `yaml:"gpu"`
HyperV hyperv.Config `yaml:"hyperv"`
IIS iis.Config `yaml:"iis"`
License license.Config `yaml:"license"`
@@ -139,6 +141,7 @@ var ConfigDefaults = Config{
Exchange: exchange.ConfigDefaults,
Filetime: filetime.ConfigDefaults,
Fsrmquota: fsrmquota.ConfigDefaults,
GPU: gpu.ConfigDefaults,
HyperV: hyperv.ConfigDefaults,
IIS: iis.ConfigDefaults,
License: license.ConfigDefaults,

View File

@@ -37,6 +37,7 @@ import (
"github.com/prometheus-community/windows_exporter/internal/collector/exchange"
"github.com/prometheus-community/windows_exporter/internal/collector/filetime"
"github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota"
"github.com/prometheus-community/windows_exporter/internal/collector/gpu"
"github.com/prometheus-community/windows_exporter/internal/collector/hyperv"
"github.com/prometheus-community/windows_exporter/internal/collector/iis"
"github.com/prometheus-community/windows_exporter/internal/collector/license"
@@ -95,6 +96,7 @@ var BuildersWithFlags = map[string]BuilderWithFlags[Collector]{
exchange.Name: NewBuilderWithFlags(exchange.NewWithFlags),
filetime.Name: NewBuilderWithFlags(filetime.NewWithFlags),
fsrmquota.Name: NewBuilderWithFlags(fsrmquota.NewWithFlags),
gpu.Name: NewBuilderWithFlags(gpu.NewWithFlags),
hyperv.Name: NewBuilderWithFlags(hyperv.NewWithFlags),
iis.Name: NewBuilderWithFlags(iis.NewWithFlags),
license.Name: NewBuilderWithFlags(license.NewWithFlags),