gpu: add metrics collector and related types (#2052) (#2059)

This commit is contained in:
小荣
2025-06-03 01:45:18 +08:00
committed by GitHub
parent e673f192d2
commit dcf85032ca
9 changed files with 798 additions and 48 deletions

View File

@@ -12,54 +12,55 @@ A Prometheus exporter for Windows machines.
## Collectors ## Collectors
Name | Description | Enabled by default Name | Description | Enabled by default
---------|-------------|-------------------- ------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------
[ad](docs/collector.ad.md) | Active Directory Domain Services | [ad](docs/collector.ad.md) | Active Directory Domain Services |
[adcs](docs/collector.adcs.md) | Active Directory Certificate Services | [adcs](docs/collector.adcs.md) | Active Directory Certificate Services |
[adfs](docs/collector.adfs.md) | Active Directory Federation Services | [adfs](docs/collector.adfs.md) | Active Directory Federation Services |
[cache](docs/collector.cache.md) | Cache metrics | [cache](docs/collector.cache.md) | Cache metrics |
[cpu](docs/collector.cpu.md) | CPU usage | ✓ [cpu](docs/collector.cpu.md) | CPU usage | ✓
[cpu_info](docs/collector.cpu_info.md) | CPU Information | [cpu_info](docs/collector.cpu_info.md) | CPU Information |
[cs](docs/collector.cs.md) | "Computer System" metrics (system properties, num cpus/total memory) | [cs](docs/collector.cs.md) | "Computer System" metrics (system properties, num cpus/total memory) |
[container](docs/collector.container.md) | Container metrics | [container](docs/collector.container.md) | Container metrics |
[diskdrive](docs/collector.diskdrive.md) | Diskdrive metrics | [diskdrive](docs/collector.diskdrive.md) | Diskdrive metrics |
[dfsr](docs/collector.dfsr.md) | DFSR metrics | [dfsr](docs/collector.dfsr.md) | DFSR metrics |
[dhcp](docs/collector.dhcp.md) | DHCP Server | [dhcp](docs/collector.dhcp.md) | DHCP Server |
[dns](docs/collector.dns.md) | DNS Server | [dns](docs/collector.dns.md) | DNS Server |
[exchange](docs/collector.exchange.md) | Exchange metrics | [exchange](docs/collector.exchange.md) | Exchange metrics |
[filetime](docs/collector.filetime.md) | FileTime metrics | [filetime](docs/collector.filetime.md) | FileTime metrics |
[fsrmquota](docs/collector.fsrmquota.md) | Microsoft File Server Resource Manager (FSRM) Quotas collector | [fsrmquota](docs/collector.fsrmquota.md) | Microsoft File Server Resource Manager (FSRM) Quotas collector |
[hyperv](docs/collector.hyperv.md) | Hyper-V hosts | [gpu](docs/collector.gpu.md) | GPU metrics |
[iis](docs/collector.iis.md) | IIS sites and applications | [hyperv](docs/collector.hyperv.md) | Hyper-V hosts |
[license](docs/collector.license.md) | Windows license status | [iis](docs/collector.iis.md) | IIS sites and applications |
[logical_disk](docs/collector.logical_disk.md) | Logical disks, disk I/O | ✓ [license](docs/collector.license.md) | Windows license status |
[memory](docs/collector.memory.md) | Memory usage metrics | ✓ [logical_disk](docs/collector.logical_disk.md) | Logical disks, disk I/O | ✓
[mscluster](docs/collector.mscluster.md) | MSCluster metrics | [memory](docs/collector.memory.md) | Memory usage metrics | ✓
[msmq](docs/collector.msmq.md) | MSMQ queues | [mscluster](docs/collector.mscluster.md) | MSCluster metrics |
[mssql](docs/collector.mssql.md) | [SQL Server Performance Objects](https://docs.microsoft.com/en-us/sql/relational-databases/performance-monitor/use-sql-server-objects#SQLServerPOs) metrics | [msmq](docs/collector.msmq.md) | MSMQ queues |
[netframework](docs/collector.netframework.md) | .NET Framework metrics | [mssql](docs/collector.mssql.md) | [SQL Server Performance Objects](https://docs.microsoft.com/en-us/sql/relational-databases/performance-monitor/use-sql-server-objects#SQLServerPOs) metrics |
[net](docs/collector.net.md) | Network interface I/O | ✓ [netframework](docs/collector.netframework.md) | .NET Framework metrics |
[os](docs/collector.os.md) | OS metrics (memory, processes, users) | ✓ [net](docs/collector.net.md) | Network interface I/O | ✓
[pagefile](docs/collector.pagefile.md) | pagefile metrics | [os](docs/collector.os.md) | OS metrics (memory, processes, users) | ✓
[performancecounter](docs/collector.performancecounter.md) | Custom performance counter metrics | [pagefile](docs/collector.pagefile.md) | pagefile metrics |
[physical_disk](docs/collector.physical_disk.md) | physical disk metrics | ✓ [performancecounter](docs/collector.performancecounter.md) | Custom performance counter metrics |
[printer](docs/collector.printer.md) | Printer metrics | [physical_disk](docs/collector.physical_disk.md) | physical disk metrics | ✓
[process](docs/collector.process.md) | Per-process metrics | [printer](docs/collector.printer.md) | Printer metrics |
[remote_fx](docs/collector.remote_fx.md) | RemoteFX protocol (RDP) metrics | [process](docs/collector.process.md) | Per-process metrics |
[scheduled_task](docs/collector.scheduled_task.md) | Scheduled Tasks metrics | [remote_fx](docs/collector.remote_fx.md) | RemoteFX protocol (RDP) metrics |
[service](docs/collector.service.md) | Service state metrics | ✓ [scheduled_task](docs/collector.scheduled_task.md) | Scheduled Tasks metrics |
[smb](docs/collector.smb.md) | SMB Server | [service](docs/collector.service.md) | Service state metrics | ✓
[smbclient](docs/collector.smbclient.md) | SMB Client | [smb](docs/collector.smb.md) | SMB Server |
[smtp](docs/collector.smtp.md) | IIS SMTP Server | [smbclient](docs/collector.smbclient.md) | SMB Client |
[system](docs/collector.system.md) | System calls | ✓ [smtp](docs/collector.smtp.md) | IIS SMTP Server |
[tcp](docs/collector.tcp.md) | TCP connections | [system](docs/collector.system.md) | System calls | ✓
[terminal_services](docs/collector.terminal_services.md) | Terminal services (RDS) [tcp](docs/collector.tcp.md) | TCP connections |
[textfile](docs/collector.textfile.md) | Read prometheus metrics from a text file | [terminal_services](docs/collector.terminal_services.md) | Terminal services (RDS)
[thermalzone](docs/collector.thermalzone.md) | Thermal information | [textfile](docs/collector.textfile.md) | Read prometheus metrics from a text file |
[time](docs/collector.time.md) | Windows Time Service | [thermalzone](docs/collector.thermalzone.md) | Thermal information |
[udp](docs/collector.udp.md) | UDP connections | [time](docs/collector.time.md) | Windows Time Service |
[update](docs/collector.update.md) | Windows Update Service | [udp](docs/collector.udp.md) | UDP connections |
[vmware](docs/collector.vmware.md) | Performance counters installed by the Vmware Guest agent | [update](docs/collector.update.md) | Windows Update Service |
[vmware](docs/collector.vmware.md) | Performance counters installed by the Vmware Guest agent |
See the linked documentation on each collector for more information on reported metrics, configuration settings and usage examples. See the linked documentation on each collector for more information on reported metrics, configuration settings and usage examples.

139
docs/collector.gpu.md Normal file
View File

@@ -0,0 +1,139 @@
# gpu collector
The gpu collector exposes metrics about GPU usage and memory consumption, both at the adapter (physical GPU) and
per-process level.
| | |
|---------------------|--------------------------------------|
| Metric name prefix | `gpu` |
| Data source | Perflib |
| Counters | GPU Engine, GPU Adapter, GPU Process |
| Enabled by default? | No |
## Flags
None
## Metrics
These metrics are available on supported versions of Windows with compatible GPUs and drivers:
### Adapter-level Metrics
| Name | Description | Type | Labels |
|----------------------------------------------|----------------------------------------------------------|-------|--------|
| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `phys` |
### Per-process Metrics
| Name | Description | Type | Labels |
|----------------------------------------------|-------------------------------------------------|---------|----------------------------------------|
| `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `phys`, `eng`, `engtype`, `process_id` |
| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
## Metric Labels
* `phys`: Physical GPU index (e.g., "0")
* `eng`: GPU engine index (e.g., "0", "1", ...)
* `engtype`: GPU engine type (e.g., "3D", "Copy", "VideoDecode", etc.)
* `process_id`: Process ID
## Example Metric
These are basic queries to help you get started with GPU monitoring on Windows using Prometheus.
**Show total dedicated GPU memory (in bytes) usage on GPU 0:**
```promql
windows_gpu_adapter_memory_dedicated_bytes{phys="0"}
```
**Aggregate GPU utilization across all processes for a physical GPU (3D engine):**
```promql
sum by (phys) (
rate(windows_gpu_engine_time_seconds{phys="0", engtype="3D"}[1m])
) * 100
```
**Show GPU utilization for a specific process (3D engine):**
```promql
sum by (phys, process_id) (
rate(windows_gpu_engine_time_seconds{process_id="1234", engtype="3D"}[1m])
) * 100
```
**Show dedicated GPU memory per process:**
```promql
windows_gpu_process_memory_dedicated_bytes
```
## Useful Queries
**Show top 5 processes by GPU utilization (all engines):**
```promql
topk(5, sum by (process_id) (
rate(windows_gpu_engine_time_seconds[1m])
) * 100)
```
**Show GPU memory usage per physical GPU:**
```promql
sum by (phys) (
windows_gpu_adapter_memory_dedicated_bytes
)
```
**Show GPU engine time with process owner and command line:**
```promql
windows_gpu_engine_time_seconds * on(process_id) group_left(owner, cmdline) windows_process_info
```
## Alerting Examples
**prometheus.rules**
```yaml
# Alert on processes using more than 80% of a GPU's capacity over 10 minutes
- alert: HighGpuUtilization
expr: |
sum by (process_id) (
rate(windows_gpu_engine_time_seconds[1m])
) * 100 > 80
for: 10m
labels:
severity: warning
annotations:
summary: "High GPU Utilization (process {{ $labels.process_id }})"
description: "Process is using more than 80% of GPU resources\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
```
## Notes
* Per-process metrics allow you to identify which processes are consuming GPU resources.
* Adapter-level metrics provide an overview of total GPU memory usage.
* For overall GPU utilization, aggregate per-process metrics in Prometheus using queries such as `sum()`.
* The collector relies on Windows performance counters; ensure your system and drivers support these counters.
## Enabling the Collector
To enable the GPU collector, add `gpu` to the list of enabled collectors in your windows_exporter configuration.
Example (command line):
```shell
windows_exporter.exe --collectors.enabled=gpu
```

View File

@@ -0,0 +1,431 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package gpu
import (
"errors"
"fmt"
"log/slog"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus-community/windows_exporter/internal/mi"
"github.com/prometheus-community/windows_exporter/internal/pdh"
"github.com/prometheus-community/windows_exporter/internal/types"
"github.com/prometheus/client_golang/prometheus"
)
// Name is the collector name; it is returned by GetName and used as the
// subsystem part of every metric name built in Build.
const Name = "gpu"
// Config holds the collector settings. It currently has no fields.
type Config struct{}
// ConfigDefaults is the configuration used by New when it is given a nil config.
//nolint:gochecknoglobals
var ConfigDefaults = Config{}
// Collector exposes GPU engine and GPU memory metrics. For each GPU
// performance object it keeps three things: the PDH collector opened in Build,
// a slice that Collect passes to that PDH collector on every scrape, and the
// metric descriptors built from that object's counters.
type Collector struct {
config Config
// GPU Engine
gpuEnginePerfDataCollector *pdh.Collector
gpuEnginePerfDataObject []gpuEnginePerfDataCounterValues
gpuEngineRunningTime *prometheus.Desc
// GPU Adapter Memory
gpuAdapterMemoryPerfDataCollector *pdh.Collector
gpuAdapterMemoryPerfDataObject []gpuAdapterMemoryPerfDataCounterValues
gpuAdapterMemoryDedicatedUsage *prometheus.Desc
gpuAdapterMemorySharedUsage *prometheus.Desc
gpuAdapterMemoryTotalCommitted *prometheus.Desc
// GPU Local Adapter Memory
gpuLocalAdapterMemoryPerfDataCollector *pdh.Collector
gpuLocalAdapterMemoryPerfDataObject []gpuLocalAdapterMemoryPerfDataCounterValues
gpuLocalAdapterMemoryUsage *prometheus.Desc
// GPU Non Local Adapter Memory
gpuNonLocalAdapterMemoryPerfDataCollector *pdh.Collector
gpuNonLocalAdapterMemoryPerfDataObject []gpuNonLocalAdapterMemoryPerfDataCounterValues
gpuNonLocalAdapterMemoryUsage *prometheus.Desc
// GPU Process Memory
gpuProcessMemoryPerfDataCollector *pdh.Collector
gpuProcessMemoryPerfDataObject []gpuProcessMemoryPerfDataCounterValues
gpuProcessMemoryDedicatedUsage *prometheus.Desc
gpuProcessMemoryLocalUsage *prometheus.Desc
gpuProcessMemoryNonLocalUsage *prometheus.Desc
gpuProcessMemorySharedUsage *prometheus.Desc
gpuProcessMemoryTotalCommitted *prometheus.Desc
}
// New returns a Collector holding a copy of config. A nil config selects
// ConfigDefaults.
func New(config *Config) *Collector {
	effective := config
	if effective == nil {
		effective = &ConfigDefaults
	}

	return &Collector{config: *effective}
}
// NewWithFlags returns a zero-value Collector. The kingpin application is
// ignored: this collector registers no command-line flags.
func NewWithFlags(_ *kingpin.Application) *Collector {
	return new(Collector)
}
// GetName returns the collector name, i.e. the package-level Name constant.
func (c *Collector) GetName() string {
return Name
}
// Close closes every PDH collector that Build assigned to the Collector, in
// the order they were created. It always returns nil.
func (c *Collector) Close() error {
	perfDataCollectors := []*pdh.Collector{
		c.gpuEnginePerfDataCollector,
		c.gpuAdapterMemoryPerfDataCollector,
		c.gpuLocalAdapterMemoryPerfDataCollector,
		c.gpuNonLocalAdapterMemoryPerfDataCollector,
		c.gpuProcessMemoryPerfDataCollector,
	}

	for _, perfDataCollector := range perfDataCollectors {
		perfDataCollector.Close()
	}

	return nil
}
// Build creates all metric descriptors and opens one PDH collector per GPU
// performance object. A failure to open one PDH collector does not stop the
// others from being opened; all such failures are returned together as a
// single joined error (nil when every collector was created).
func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
	// newDesc builds a descriptor in this collector's namespace/subsystem
	// without constant labels.
	newDesc := func(metric, help string, labels ...string) *prometheus.Desc {
		return prometheus.NewDesc(
			prometheus.BuildFQName(types.Namespace, Name, metric),
			help,
			labels,
			nil,
		)
	}

	// GPU Engine
	c.gpuEngineRunningTime = newDesc("engine_time_seconds",
		"Total running time of the GPU in seconds.",
		"process_id", "phys", "eng", "engtype")

	// GPU Adapter Memory
	c.gpuAdapterMemoryDedicatedUsage = newDesc("adapter_memory_dedicated_bytes",
		"Dedicated GPU memory usage in bytes.",
		"phys")
	c.gpuAdapterMemorySharedUsage = newDesc("adapter_memory_shared_bytes",
		"Shared GPU memory usage in bytes.",
		"phys")
	c.gpuAdapterMemoryTotalCommitted = newDesc("adapter_memory_committed_bytes",
		"Total committed GPU memory in bytes.",
		"phys")

	// GPU Local / Non Local Adapter Memory
	c.gpuLocalAdapterMemoryUsage = newDesc("local_adapter_memory_bytes",
		"Local adapter memory usage in bytes.",
		"phys")
	c.gpuNonLocalAdapterMemoryUsage = newDesc("non_local_adapter_memory_bytes",
		"Non-local adapter memory usage in bytes.",
		"phys")

	// GPU Process Memory
	c.gpuProcessMemoryDedicatedUsage = newDesc("process_memory_dedicated_bytes",
		"Dedicated process memory usage in bytes.",
		"process_id", "phys")
	c.gpuProcessMemoryLocalUsage = newDesc("process_memory_local_bytes",
		"Local process memory usage in bytes.",
		"process_id", "phys")
	c.gpuProcessMemoryNonLocalUsage = newDesc("process_memory_non_local_bytes",
		"Non-local process memory usage in bytes.",
		"process_id", "phys")
	c.gpuProcessMemorySharedUsage = newDesc("process_memory_shared_bytes",
		"Shared process memory usage in bytes.",
		"process_id", "phys")
	c.gpuProcessMemoryTotalCommitted = newDesc("process_memory_committed_bytes",
		"Total committed process memory in bytes.",
		"process_id", "phys")

	var (
		buildErrs []error
		err       error
	)

	if c.gpuEnginePerfDataCollector, err = pdh.NewCollector[gpuEnginePerfDataCounterValues](pdh.CounterTypeRaw, "GPU Engine", pdh.InstancesAll); err != nil {
		buildErrs = append(buildErrs, fmt.Errorf("failed to create GPU Engine perf data collector: %w", err))
	}

	if c.gpuAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Adapter Memory", pdh.InstancesAll); err != nil {
		buildErrs = append(buildErrs, fmt.Errorf("failed to create GPU Adapter Memory perf data collector: %w", err))
	}

	if c.gpuLocalAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuLocalAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Local Adapter Memory", pdh.InstancesAll); err != nil {
		buildErrs = append(buildErrs, fmt.Errorf("failed to create GPU Local Adapter Memory perf data collector: %w", err))
	}

	if c.gpuNonLocalAdapterMemoryPerfDataCollector, err = pdh.NewCollector[gpuNonLocalAdapterMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Non Local Adapter Memory", pdh.InstancesAll); err != nil {
		buildErrs = append(buildErrs, fmt.Errorf("failed to create GPU Non Local Adapter Memory perf data collector: %w", err))
	}

	if c.gpuProcessMemoryPerfDataCollector, err = pdh.NewCollector[gpuProcessMemoryPerfDataCounterValues](pdh.CounterTypeRaw, "GPU Process Memory", pdh.InstancesAll); err != nil {
		buildErrs = append(buildErrs, fmt.Errorf("failed to create GPU Process Memory perf data collector: %w", err))
	}

	return errors.Join(buildErrs...)
}
// Collect runs every GPU sub-collector in a fixed order and sends their
// metrics to ch. A failing sub-collector does not prevent the remaining ones
// from running; all errors are returned joined (nil when none failed).
func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
	subCollectors := []func(chan<- prometheus.Metric) error{
		c.collectGpuEngineMetrics,
		c.collectGpuAdapterMemoryMetrics,
		c.collectGpuLocalAdapterMemoryMetrics,
		c.collectGpuNonLocalAdapterMemoryMetrics,
		c.collectGpuProcessMemoryMetrics,
	}

	var collectErrs []error

	for _, collect := range subCollectors {
		if err := collect(ch); err != nil {
			collectErrs = append(collectErrs, err)
		}
	}

	return errors.Join(collectErrs...)
}
// collectGpuEngineMetrics reads the "GPU Engine" perf data and emits one
// engine_time_seconds counter per (process_id, phys, eng, engtype)
// combination. Instances that parse to the same combination are summed.
func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error {
	err := c.gpuEnginePerfDataCollector.Collect(&c.gpuEnginePerfDataObject)
	if err != nil {
		return fmt.Errorf("failed to collect GPU Engine perf data: %w", err)
	}

	secondsByEngine := make(map[PidPhysEngEngType]float64)

	for _, sample := range c.gpuEnginePerfDataObject {
		parsed := parseGPUCounterInstanceString(sample.Name)
		engine := PidPhysEngEngType{
			Pid:     parsed.Pid,
			Phys:    parsed.Phys,
			Eng:     parsed.Eng,
			Engtype: parsed.Engtype,
		}

		// RunningTime is in 100ns units, convert to seconds.
		secondsByEngine[engine] += sample.RunningTime / 10_000_000
	}

	for engine, seconds := range secondsByEngine {
		ch <- prometheus.MustNewConstMetric(
			c.gpuEngineRunningTime,
			prometheus.CounterValue,
			seconds,
			engine.Pid, engine.Phys, engine.Eng, engine.Engtype,
		)
	}

	return nil
}
// collectGpuAdapterMemoryMetrics reads the "GPU Adapter Memory" perf data and
// emits the dedicated, shared and committed memory gauges, one set per
// physical GPU index.
//
// Values are aggregated by the `phys` value alone because that is the only
// label the adapter metrics carry. Aggregating by a wider key (pid, eng,
// engtype) could yield several map entries with the same `phys`, which would
// be sent as duplicate series with identical label values.
func (c *Collector) collectGpuAdapterMemoryMetrics(ch chan<- prometheus.Metric) error {
	// Collect the GPU Adapter Memory perf data.
	if err := c.gpuAdapterMemoryPerfDataCollector.Collect(&c.gpuAdapterMemoryPerfDataObject); err != nil {
		return fmt.Errorf("failed to collect GPU Adapter Memory perf data: %w", err)
	}

	// adapterMemory accumulates the three counters of one physical GPU.
	type adapterMemory struct {
		dedicated float64
		shared    float64
		committed float64
	}

	usageByPhys := make(map[string]adapterMemory)

	for _, data := range c.gpuAdapterMemoryPerfDataObject {
		phys := parseGPUCounterInstanceString(data.Name).Phys

		usage := usageByPhys[phys]
		usage.dedicated += data.DedicatedUsage
		usage.shared += data.SharedUsage
		usage.committed += data.TotalCommitted
		usageByPhys[phys] = usage
	}

	for phys, usage := range usageByPhys {
		ch <- prometheus.MustNewConstMetric(
			c.gpuAdapterMemoryDedicatedUsage,
			prometheus.GaugeValue,
			usage.dedicated,
			phys,
		)
		ch <- prometheus.MustNewConstMetric(
			c.gpuAdapterMemorySharedUsage,
			prometheus.GaugeValue,
			usage.shared,
			phys,
		)
		ch <- prometheus.MustNewConstMetric(
			c.gpuAdapterMemoryTotalCommitted,
			prometheus.GaugeValue,
			usage.committed,
			phys,
		)
	}

	return nil
}
// collectGpuLocalAdapterMemoryMetrics reads the "GPU Local Adapter Memory"
// perf data and emits one local_adapter_memory_bytes gauge per physical GPU
// index, summing all instances that share that index.
func (c *Collector) collectGpuLocalAdapterMemoryMetrics(ch chan<- prometheus.Metric) error {
	err := c.gpuLocalAdapterMemoryPerfDataCollector.Collect(&c.gpuLocalAdapterMemoryPerfDataObject)
	if err != nil {
		return fmt.Errorf("failed to collect GPU Local Adapter Memory perf data: %w", err)
	}

	usageByPhys := make(map[string]float64)

	for _, sample := range c.gpuLocalAdapterMemoryPerfDataObject {
		phys := parseGPUCounterInstanceString(sample.Name).Phys
		usageByPhys[phys] += sample.LocalUsage
	}

	for phys, usage := range usageByPhys {
		ch <- prometheus.MustNewConstMetric(c.gpuLocalAdapterMemoryUsage, prometheus.GaugeValue, usage, phys)
	}

	return nil
}
// collectGpuNonLocalAdapterMemoryMetrics reads the "GPU Non Local Adapter
// Memory" perf data and emits one non_local_adapter_memory_bytes gauge per
// physical GPU index, summing all instances that share that index.
func (c *Collector) collectGpuNonLocalAdapterMemoryMetrics(ch chan<- prometheus.Metric) error {
	err := c.gpuNonLocalAdapterMemoryPerfDataCollector.Collect(&c.gpuNonLocalAdapterMemoryPerfDataObject)
	if err != nil {
		return fmt.Errorf("failed to collect GPU Non Local Adapter Memory perf data: %w", err)
	}

	usageByPhys := make(map[string]float64)

	for _, sample := range c.gpuNonLocalAdapterMemoryPerfDataObject {
		phys := parseGPUCounterInstanceString(sample.Name).Phys
		usageByPhys[phys] += sample.NonLocalUsage
	}

	for phys, usage := range usageByPhys {
		ch <- prometheus.MustNewConstMetric(c.gpuNonLocalAdapterMemoryUsage, prometheus.GaugeValue, usage, phys)
	}

	return nil
}
// collectGpuProcessMemoryMetrics reads the "GPU Process Memory" perf data and
// emits the five per-process memory gauges for every (process_id, phys)
// pair. Instances that parse to the same pair are summed.
func (c *Collector) collectGpuProcessMemoryMetrics(ch chan<- prometheus.Metric) error {
	err := c.gpuProcessMemoryPerfDataCollector.Collect(&c.gpuProcessMemoryPerfDataObject)
	if err != nil {
		return fmt.Errorf("failed to collect GPU Process Memory perf data: %w", err)
	}

	// processMemory accumulates all counters of one (pid, phys) pair.
	type processMemory struct {
		dedicated float64
		local     float64
		nonLocal  float64
		shared    float64
		committed float64
	}

	usageByProcess := make(map[PidPhys]processMemory)

	for _, sample := range c.gpuProcessMemoryPerfDataObject {
		parsed := parseGPUCounterInstanceString(sample.Name)
		owner := PidPhys{Pid: parsed.Pid, Phys: parsed.Phys}

		usage := usageByProcess[owner]
		usage.dedicated += sample.DedicatedUsage
		usage.local += sample.LocalUsage
		usage.nonLocal += sample.NonLocalUsage
		usage.shared += sample.SharedUsage
		usage.committed += sample.TotalCommitted
		usageByProcess[owner] = usage
	}

	for owner, usage := range usageByProcess {
		// Emission order per pair: dedicated, local, non-local, shared, committed.
		gauges := []struct {
			desc  *prometheus.Desc
			value float64
		}{
			{c.gpuProcessMemoryDedicatedUsage, usage.dedicated},
			{c.gpuProcessMemoryLocalUsage, usage.local},
			{c.gpuProcessMemoryNonLocalUsage, usage.nonLocal},
			{c.gpuProcessMemorySharedUsage, usage.shared},
			{c.gpuProcessMemoryTotalCommitted, usage.committed},
		}

		for _, gauge := range gauges {
			ch <- prometheus.MustNewConstMetric(
				gauge.desc,
				prometheus.GaugeValue,
				gauge.value,
				owner.Pid, owner.Phys,
			)
		}
	}

	return nil
}

View File

@@ -0,0 +1,33 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package gpu_test
import (
"testing"
"github.com/prometheus-community/windows_exporter/internal/collector/gpu"
"github.com/prometheus-community/windows_exporter/internal/utils/testutils"
)
// BenchmarkCollector benchmarks the gpu collector, constructed through
// gpu.NewWithFlags, using the shared testutils benchmark helper.
func BenchmarkCollector(b *testing.B) {
testutils.FuncBenchmarkCollector(b, gpu.Name, gpu.NewWithFlags)
}
// TestCollector runs the shared testutils collector test against gpu.New with
// a nil config, so the collector falls back to its default configuration.
func TestCollector(t *testing.T) {
testutils.TestCollector(t, gpu.New, nil)
}

View File

@@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package gpu
// gpuEnginePerfDataCounterValues is the row type used with the "GPU Engine"
// PDH collector. Each field is bound to a counter through its perfdata tag.
type gpuEnginePerfDataCounterValues struct {
// Name is the counter instance name; the collector decodes it with
// parseGPUCounterInstanceString.
Name string
// RunningTime is treated by the collector as 100ns units and converted to seconds.
RunningTime float64 `perfdata:"Running Time"`
// NOTE(review): UtilizationPercentage is read but not turned into a metric
// by the collector code visible in this package — confirm whether it is needed.
UtilizationPercentage float64 `perfdata:"Utilization Percentage"`
}
// gpuAdapterMemoryPerfDataCounterValues is the row type used with the
// "GPU Adapter Memory" PDH collector. The values are exported as the
// adapter_memory_*_bytes gauges.
type gpuAdapterMemoryPerfDataCounterValues struct {
// Name is the counter instance name; the collector decodes it with
// parseGPUCounterInstanceString.
Name string
DedicatedUsage float64 `perfdata:"Dedicated Usage"`
SharedUsage float64 `perfdata:"Shared Usage"`
TotalCommitted float64 `perfdata:"Total Committed"`
}
// gpuLocalAdapterMemoryPerfDataCounterValues is the row type used with the
// "GPU Local Adapter Memory" PDH collector. LocalUsage is exported as the
// local_adapter_memory_bytes gauge.
type gpuLocalAdapterMemoryPerfDataCounterValues struct {
// Name is the counter instance name; the collector decodes it with
// parseGPUCounterInstanceString.
Name string
LocalUsage float64 `perfdata:"Local Usage"`
}
// gpuNonLocalAdapterMemoryPerfDataCounterValues is the row type used with the
// "GPU Non Local Adapter Memory" PDH collector. NonLocalUsage is exported as
// the non_local_adapter_memory_bytes gauge.
type gpuNonLocalAdapterMemoryPerfDataCounterValues struct {
// Name is the counter instance name; the collector decodes it with
// parseGPUCounterInstanceString.
Name string
NonLocalUsage float64 `perfdata:"Non Local Usage"`
}
// gpuProcessMemoryPerfDataCounterValues is the row type used with the
// "GPU Process Memory" PDH collector. The values are exported as the
// process_memory_*_bytes gauges.
type gpuProcessMemoryPerfDataCounterValues struct {
// Name is the counter instance name; the collector decodes it with
// parseGPUCounterInstanceString.
Name string
DedicatedUsage float64 `perfdata:"Dedicated Usage"`
LocalUsage float64 `perfdata:"Local Usage"`
NonLocalUsage float64 `perfdata:"Non Local Usage"`
SharedUsage float64 `perfdata:"Shared Usage"`
TotalCommitted float64 `perfdata:"Total Committed"`
}

View File

@@ -0,0 +1,84 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package gpu
import (
"strings"
)
// Instance is the decoded form of a GPU performance counter instance name.
// A field is left empty when its key does not occur in the name.
type Instance struct {
	Pid     string
	Luid    [2]string
	Phys    string
	Eng     string
	Engtype string
	Part    string
}

// PidPhys identifies a process on a physical GPU; it is used as a map key
// when aggregating per-process memory counters.
type PidPhys struct {
	Pid  string
	Phys string
}

// PidPhysEngEngType identifies one engine of a physical GPU as used by one
// process; it is used as a map key when aggregating engine counters.
type PidPhysEngEngType struct {
	Pid     string
	Phys    string
	Eng     string
	Engtype string
}

// isGPUInstanceKey reports whether token is one of the key words that
// introduce a value in a GPU counter instance name.
func isGPUInstanceKey(token string) bool {
	switch token {
	case "pid", "luid", "phys", "eng", "engtype", "part":
		return true
	default:
		return false
	}
}

// parseGPUCounterInstanceString decodes an underscore-separated GPU counter
// instance name into an Instance. Unknown tokens are ignored, and a key that
// is not followed by enough tokens leaves its field empty.
//
// The engine type is the only value that may span several tokens: everything
// after "engtype" up to the next key word (or the end of the name) is joined
// back with "_", so an engine type such as "Compute_0" is not cut down to
// "Compute".
func parseGPUCounterInstanceString(s string) Instance {
	// Example: "pid_1234_luid_0x00000000_0x00005678_phys_0_eng_0_engtype_3D"
	// Example: "luid_0x00000000_0x00005678_phys_0"
	// Example: "luid_0x00000000_0x00005678_phys_0_part_0"
	parts := strings.Split(s, "_")

	var instance Instance

	for i, part := range parts {
		switch part {
		case "pid":
			if i+1 < len(parts) {
				instance.Pid = parts[i+1]
			}
		case "luid":
			if i+2 < len(parts) {
				instance.Luid[0] = parts[i+1]
				instance.Luid[1] = parts[i+2]
			}
		case "phys":
			if i+1 < len(parts) {
				instance.Phys = parts[i+1]
			}
		case "eng":
			if i+1 < len(parts) {
				instance.Eng = parts[i+1]
			}
		case "engtype":
			if i+1 < len(parts) {
				// The first token always belongs to the value; extend it
				// until another key word starts or the name ends.
				end := i + 2
				for end < len(parts) && !isGPUInstanceKey(parts[end]) {
					end++
				}

				instance.Engtype = strings.Join(parts[i+1:end], "_")
			}
		case "part":
			if i+1 < len(parts) {
				instance.Part = parts[i+1]
			}
		}
	}

	return instance
}

View File

@@ -43,6 +43,7 @@ import (
"github.com/prometheus-community/windows_exporter/internal/collector/exchange" "github.com/prometheus-community/windows_exporter/internal/collector/exchange"
"github.com/prometheus-community/windows_exporter/internal/collector/filetime" "github.com/prometheus-community/windows_exporter/internal/collector/filetime"
"github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota" "github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota"
"github.com/prometheus-community/windows_exporter/internal/collector/gpu"
"github.com/prometheus-community/windows_exporter/internal/collector/hyperv" "github.com/prometheus-community/windows_exporter/internal/collector/hyperv"
"github.com/prometheus-community/windows_exporter/internal/collector/iis" "github.com/prometheus-community/windows_exporter/internal/collector/iis"
"github.com/prometheus-community/windows_exporter/internal/collector/license" "github.com/prometheus-community/windows_exporter/internal/collector/license"
@@ -114,6 +115,7 @@ func NewWithConfig(config Config) *Collection {
collectors[exchange.Name] = exchange.New(&config.Exchange) collectors[exchange.Name] = exchange.New(&config.Exchange)
collectors[filetime.Name] = filetime.New(&config.Filetime) collectors[filetime.Name] = filetime.New(&config.Filetime)
collectors[fsrmquota.Name] = fsrmquota.New(&config.Fsrmquota) collectors[fsrmquota.Name] = fsrmquota.New(&config.Fsrmquota)
collectors[gpu.Name] = gpu.New(&config.GPU)
collectors[hyperv.Name] = hyperv.New(&config.HyperV) collectors[hyperv.Name] = hyperv.New(&config.HyperV)
collectors[iis.Name] = iis.New(&config.IIS) collectors[iis.Name] = iis.New(&config.IIS)
collectors[license.Name] = license.New(&config.License) collectors[license.Name] = license.New(&config.License)

View File

@@ -33,6 +33,7 @@ import (
"github.com/prometheus-community/windows_exporter/internal/collector/exchange" "github.com/prometheus-community/windows_exporter/internal/collector/exchange"
"github.com/prometheus-community/windows_exporter/internal/collector/filetime" "github.com/prometheus-community/windows_exporter/internal/collector/filetime"
"github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota" "github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota"
"github.com/prometheus-community/windows_exporter/internal/collector/gpu"
"github.com/prometheus-community/windows_exporter/internal/collector/hyperv" "github.com/prometheus-community/windows_exporter/internal/collector/hyperv"
"github.com/prometheus-community/windows_exporter/internal/collector/iis" "github.com/prometheus-community/windows_exporter/internal/collector/iis"
"github.com/prometheus-community/windows_exporter/internal/collector/license" "github.com/prometheus-community/windows_exporter/internal/collector/license"
@@ -84,6 +85,7 @@ type Config struct {
Exchange exchange.Config `yaml:"exchange"` Exchange exchange.Config `yaml:"exchange"`
Filetime filetime.Config `yaml:"filetime"` Filetime filetime.Config `yaml:"filetime"`
Fsrmquota fsrmquota.Config `yaml:"fsrmquota"` Fsrmquota fsrmquota.Config `yaml:"fsrmquota"`
GPU gpu.Config `yaml:"gpu"`
HyperV hyperv.Config `yaml:"hyperv"` HyperV hyperv.Config `yaml:"hyperv"`
IIS iis.Config `yaml:"iis"` IIS iis.Config `yaml:"iis"`
License license.Config `yaml:"license"` License license.Config `yaml:"license"`
@@ -139,6 +141,7 @@ var ConfigDefaults = Config{
Exchange: exchange.ConfigDefaults, Exchange: exchange.ConfigDefaults,
Filetime: filetime.ConfigDefaults, Filetime: filetime.ConfigDefaults,
Fsrmquota: fsrmquota.ConfigDefaults, Fsrmquota: fsrmquota.ConfigDefaults,
GPU: gpu.ConfigDefaults,
HyperV: hyperv.ConfigDefaults, HyperV: hyperv.ConfigDefaults,
IIS: iis.ConfigDefaults, IIS: iis.ConfigDefaults,
License: license.ConfigDefaults, License: license.ConfigDefaults,

View File

@@ -37,6 +37,7 @@ import (
"github.com/prometheus-community/windows_exporter/internal/collector/exchange" "github.com/prometheus-community/windows_exporter/internal/collector/exchange"
"github.com/prometheus-community/windows_exporter/internal/collector/filetime" "github.com/prometheus-community/windows_exporter/internal/collector/filetime"
"github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota" "github.com/prometheus-community/windows_exporter/internal/collector/fsrmquota"
"github.com/prometheus-community/windows_exporter/internal/collector/gpu"
"github.com/prometheus-community/windows_exporter/internal/collector/hyperv" "github.com/prometheus-community/windows_exporter/internal/collector/hyperv"
"github.com/prometheus-community/windows_exporter/internal/collector/iis" "github.com/prometheus-community/windows_exporter/internal/collector/iis"
"github.com/prometheus-community/windows_exporter/internal/collector/license" "github.com/prometheus-community/windows_exporter/internal/collector/license"
@@ -95,6 +96,7 @@ var BuildersWithFlags = map[string]BuilderWithFlags[Collector]{
exchange.Name: NewBuilderWithFlags(exchange.NewWithFlags), exchange.Name: NewBuilderWithFlags(exchange.NewWithFlags),
filetime.Name: NewBuilderWithFlags(filetime.NewWithFlags), filetime.Name: NewBuilderWithFlags(filetime.NewWithFlags),
fsrmquota.Name: NewBuilderWithFlags(fsrmquota.NewWithFlags), fsrmquota.Name: NewBuilderWithFlags(fsrmquota.NewWithFlags),
gpu.Name: NewBuilderWithFlags(gpu.NewWithFlags),
hyperv.Name: NewBuilderWithFlags(hyperv.NewWithFlags), hyperv.Name: NewBuilderWithFlags(hyperv.NewWithFlags),
iis.Name: NewBuilderWithFlags(iis.NewWithFlags), iis.Name: NewBuilderWithFlags(iis.NewWithFlags),
license.Name: NewBuilderWithFlags(license.NewWithFlags), license.Name: NewBuilderWithFlags(license.NewWithFlags),