gpu: add device id label (#2186)

This commit is contained in:
Jan-Otto Kröpke
2025-08-28 06:36:10 +02:00
committed by GitHub
parent 71cedbc4d0
commit 0b8a257b31
11 changed files with 490 additions and 193 deletions

View File

@@ -4,6 +4,7 @@
<w>containerd</w> <w>containerd</w>
<w>endpointstats</w> <w>endpointstats</w>
<w>gochecknoglobals</w> <w>gochecknoglobals</w>
<w>lpwstr</w>
<w>luid</w> <w>luid</w>
<w>operationoptions</w> <w>operationoptions</w>
<w>setupapi</w> <w>setupapi</w>

View File

@@ -21,27 +21,27 @@ These metrics are available on supported versions of Windows with compatible GPU
### Adapter-level Metrics ### Adapter-level Metrics
| Name | Description | Type | Labels | | Name | Description | Type | Labels |
|--------------------------------------------------|------------------------------------------------------------------------------------|-------|---------------| |--------------------------------------------------|------------------------------------------------------------------------------------|-------|-----------------------------------------------------------------|
| `windows_gpu_info` | A metric with a constant '1' value labeled with gpu device information. | gauge | `luid`,`name`,`bus_number`,`phys`,`function_number` | | `windows_gpu_info` | A metric with a constant '1' value labeled with gpu device information. | gauge | `bus_number`,`device_id`,`function_number`,`luid`,`name`,`phys` |
| `windows_gpu_dedicated_system_memory_size_bytes` | The size, in bytes, of memory that is dedicated from system memory. | gauge | `luid` | | `windows_gpu_dedicated_system_memory_size_bytes` | The size, in bytes, of memory that is dedicated from system memory. | gauge | `device_id`,`luid` |
| `windows_gpu_dedicated_video_memory_size_bytes` | The size, in bytes, of memory that is dedicated from video memory. | gauge | `luid` | | `windows_gpu_dedicated_video_memory_size_bytes` | The size, in bytes, of memory that is dedicated from video memory. | gauge | `device_id`,`luid` |
| `windows_gpu_shared_system_memory_size_bytes` | The size, in bytes, of memory from system memory that can be shared by many users. | gauge | `luid` | | `windows_gpu_shared_system_memory_size_bytes` | The size, in bytes, of memory from system memory that can be shared by many users. | gauge | `device_id`,`luid` |
| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `luid`,`phys` | | `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `device_id`,`luid`,`phys` |
| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `luid`,`phys` | | `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `device_id`,`luid`,`phys` |
| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `luid`,`phys` | | `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `device_id`,`luid`,`phys` |
| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `luid`,`phys` | | `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `device_id`,`luid`,`phys`,`part` |
| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `luid`,`phys` | | `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `device_id`,`luid`,`phys`,`part` |
### Per-process Metrics ### Per-process Metrics
| Name | Description | Type | Labels | | Name | Description | Type | Labels |
|----------------------------------------------|-------------------------------------------------|---------|-----------------------------------------------| |----------------------------------------------|-------------------------------------------------|---------|-----------------------------------------------------------|
| `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `luid`,`phys`, `eng`, `engtype`, `process_id` | | `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `device_id`,`luid`,`phys`, `eng`, `engtype`, `process_id` |
| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `luid`,`phys`,`process_id` | | `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` |
| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `luid`,`phys`,`process_id` | | `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` |
| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `luid`,`phys`,`process_id` | | `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` |
| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `luid`,`phys`,`process_id` | | `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` |
| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `luid`,`phys`,`process_id` | | `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` |
## Metric Labels ## Metric Labels
@@ -57,7 +57,7 @@ These are basic queries to help you get started with GPU monitoring on Windows u
**Show GPU information for a specific physical GPU (0):** **Show GPU information for a specific physical GPU (0):**
```promql ```promql
windows_gpu_info{description="NVIDIA GeForce GTX 1070",friendly_name="",hardware_id="PCI\\VEN_10DE&DEV_1B81&SUBSYS_61733842&REV_A1",phys="0",physical_device_object_name="\\Device\\NTPNP_PCI0027"} 1 windows_gpu_info{bus_number="8",device_id="PCI\\VEN_10DE&DEV_1B81&SUBSYS_61733842&REV_A1",function_number="0",luid="0x00000000_0x00010F8A",name="NVIDIA GeForce GTX 1070",phys="0"} 1
``` ```
**Show total dedicated GPU memory (in bytes) usage on GPU 0:** **Show total dedicated GPU memory (in bytes) usage on GPU 0:**

View File

@@ -21,9 +21,9 @@ import (
"errors" "errors"
"fmt" "fmt"
"log/slog" "log/slog"
"strconv"
"github.com/alecthomas/kingpin/v2" "github.com/alecthomas/kingpin/v2"
"github.com/prometheus-community/windows_exporter/internal/headers/cfgmgr32"
"github.com/prometheus-community/windows_exporter/internal/headers/gdi32" "github.com/prometheus-community/windows_exporter/internal/headers/gdi32"
"github.com/prometheus-community/windows_exporter/internal/mi" "github.com/prometheus-community/windows_exporter/internal/mi"
"github.com/prometheus-community/windows_exporter/internal/pdh" "github.com/prometheus-community/windows_exporter/internal/pdh"
@@ -41,7 +41,7 @@ var ConfigDefaults = Config{}
type Collector struct { type Collector struct {
config Config config Config
gpuDeviceCache map[string]gdi32.GPUDevice gpuDeviceCache map[string]gpuDevice
// GPU Engine // GPU Engine
gpuEnginePerfDataCollector *pdh.Collector gpuEnginePerfDataCollector *pdh.Collector
@@ -85,6 +85,12 @@ type Collector struct {
gpuProcessMemoryTotalCommitted *prometheus.Desc gpuProcessMemoryTotalCommitted *prometheus.Desc
} }
// gpuDevice is the per-adapter entry stored in Collector.gpuDeviceCache,
// keyed by the formatted adapter LUID.
type gpuDevice struct {
// gdi32 holds the adapter data obtained through the gdi32 package.
gdi32 gdi32.GPUDevice
// cfgmgr32 is the configuration-manager device whose bus, device and
// function numbers match the adapter; it is the zero value when no match
// was found.
cfgmgr32 cfgmgr32.Device
// ID is the value emitted as the device_id label: the matched device's
// instance ID, or the adapter's own DeviceID when no match was found.
ID string
}
func New(config *Config) *Collector { func New(config *Config) *Collector {
if config == nil { if config == nil {
config = &ConfigDefaults config = &ConfigDefaults
@@ -121,97 +127,97 @@ func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
c.gpuInfo = prometheus.NewDesc( c.gpuInfo = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "info"), prometheus.BuildFQName(types.Namespace, Name, "info"),
"A metric with a constant '1' value labeled with gpu device information.", "A metric with a constant '1' value labeled with gpu device information.",
[]string{"luid", "name", "bus_number", "phys", "function_number"}, []string{"luid", "device_id", "name", "bus_number", "phys", "function_number"},
nil, nil,
) )
c.gpuSharedSystemMemorySize = prometheus.NewDesc( c.gpuSharedSystemMemorySize = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "shared_system_memory_size_bytes"), prometheus.BuildFQName(types.Namespace, Name, "shared_system_memory_size_bytes"),
"The size, in bytes, of memory from system memory that can be shared by many users.", "The size, in bytes, of memory from system memory that can be shared by many users.",
[]string{"luid"}, []string{"luid", "device_id"},
nil, nil,
) )
c.gpuDedicatedSystemMemorySize = prometheus.NewDesc( c.gpuDedicatedSystemMemorySize = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "dedicated_system_memory_size_bytes"), prometheus.BuildFQName(types.Namespace, Name, "dedicated_system_memory_size_bytes"),
"The size, in bytes, of memory that is dedicated from system memory.", "The size, in bytes, of memory that is dedicated from system memory.",
[]string{"luid"}, []string{"luid", "device_id"},
nil, nil,
) )
c.gpuDedicatedVideoMemorySize = prometheus.NewDesc( c.gpuDedicatedVideoMemorySize = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "dedicated_video_memory_size_bytes"), prometheus.BuildFQName(types.Namespace, Name, "dedicated_video_memory_size_bytes"),
"The size, in bytes, of memory that is dedicated from video memory.", "The size, in bytes, of memory that is dedicated from video memory.",
[]string{"luid"}, []string{"luid", "device_id"},
nil, nil,
) )
c.gpuEngineRunningTime = prometheus.NewDesc( c.gpuEngineRunningTime = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"), prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"),
"Total running time of the GPU in seconds.", "Total running time of the GPU in seconds.",
[]string{"process_id", "luid", "phys", "eng", "engtype"}, []string{"process_id", "luid", "device_id", "phys", "eng", "engtype"},
nil, nil,
) )
c.gpuAdapterMemoryDedicatedUsage = prometheus.NewDesc( c.gpuAdapterMemoryDedicatedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_dedicated_bytes"), prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_dedicated_bytes"),
"Dedicated GPU memory usage in bytes.", "Dedicated GPU memory usage in bytes.",
[]string{"luid", "phys"}, []string{"luid", "device_id", "phys"},
nil, nil,
) )
c.gpuAdapterMemorySharedUsage = prometheus.NewDesc( c.gpuAdapterMemorySharedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_shared_bytes"), prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_shared_bytes"),
"Shared GPU memory usage in bytes.", "Shared GPU memory usage in bytes.",
[]string{"luid", "phys"}, []string{"luid", "device_id", "phys"},
nil, nil,
) )
c.gpuAdapterMemoryTotalCommitted = prometheus.NewDesc( c.gpuAdapterMemoryTotalCommitted = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_committed_bytes"), prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_committed_bytes"),
"Total committed GPU memory in bytes.", "Total committed GPU memory in bytes.",
[]string{"luid", "phys"}, []string{"luid", "device_id", "phys"},
nil, nil,
) )
c.gpuLocalAdapterMemoryUsage = prometheus.NewDesc( c.gpuLocalAdapterMemoryUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "local_adapter_memory_bytes"), prometheus.BuildFQName(types.Namespace, Name, "local_adapter_memory_bytes"),
"Local adapter memory usage in bytes.", "Local adapter memory usage in bytes.",
[]string{"luid", "phys"}, []string{"luid", "device_id", "phys", "part"},
nil, nil,
) )
c.gpuNonLocalAdapterMemoryUsage = prometheus.NewDesc( c.gpuNonLocalAdapterMemoryUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "non_local_adapter_memory_bytes"), prometheus.BuildFQName(types.Namespace, Name, "non_local_adapter_memory_bytes"),
"Non-local adapter memory usage in bytes.", "Non-local adapter memory usage in bytes.",
[]string{"luid", "phys"}, []string{"luid", "device_id", "phys", "part"},
nil, nil,
) )
c.gpuProcessMemoryDedicatedUsage = prometheus.NewDesc( c.gpuProcessMemoryDedicatedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_dedicated_bytes"), prometheus.BuildFQName(types.Namespace, Name, "process_memory_dedicated_bytes"),
"Dedicated process memory usage in bytes.", "Dedicated process memory usage in bytes.",
[]string{"process_id", "luid", "phys"}, []string{"process_id", "luid", "device_id", "phys"},
nil, nil,
) )
c.gpuProcessMemoryLocalUsage = prometheus.NewDesc( c.gpuProcessMemoryLocalUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_local_bytes"), prometheus.BuildFQName(types.Namespace, Name, "process_memory_local_bytes"),
"Local process memory usage in bytes.", "Local process memory usage in bytes.",
[]string{"process_id", "luid", "phys"}, []string{"process_id", "luid", "device_id", "phys"},
nil, nil,
) )
c.gpuProcessMemoryNonLocalUsage = prometheus.NewDesc( c.gpuProcessMemoryNonLocalUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_non_local_bytes"), prometheus.BuildFQName(types.Namespace, Name, "process_memory_non_local_bytes"),
"Non-local process memory usage in bytes.", "Non-local process memory usage in bytes.",
[]string{"process_id", "luid", "phys"}, []string{"process_id", "luid", "device_id", "phys"},
nil, nil,
) )
c.gpuProcessMemorySharedUsage = prometheus.NewDesc( c.gpuProcessMemorySharedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_shared_bytes"), prometheus.BuildFQName(types.Namespace, Name, "process_memory_shared_bytes"),
"Shared process memory usage in bytes.", "Shared process memory usage in bytes.",
[]string{"process_id", "luid", "phys"}, []string{"process_id", "luid", "device_id", "phys"},
nil, nil,
) )
c.gpuProcessMemoryTotalCommitted = prometheus.NewDesc( c.gpuProcessMemoryTotalCommitted = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_committed_bytes"), prometheus.BuildFQName(types.Namespace, Name, "process_memory_committed_bytes"),
"Total committed process memory in bytes.", "Total committed process memory in bytes.",
[]string{"process_id", "luid", "phys"}, []string{"process_id", "luid", "device_id", "phys"},
nil, nil,
) )
@@ -253,11 +259,39 @@ func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
} }
if c.gpuDeviceCache == nil { if c.gpuDeviceCache == nil {
c.gpuDeviceCache = make(map[string]gdi32.GPUDevice) c.gpuDeviceCache = make(map[string]gpuDevice)
} }
luidKey := fmt.Sprintf("0x%08X_0x%08X", gpu.LUID.HighPart, gpu.LUID.LowPart) luidKey := fmt.Sprintf("0x%08X_0x%08X", gpu.LUID.HighPart, gpu.LUID.LowPart)
c.gpuDeviceCache[luidKey] = gpu
deviceID := gpu.DeviceID
cfgmgr32Devs, err := cfgmgr32.GetDevicesInstanceIDs(gpu.DeviceID)
if err != nil {
errs = append(errs, fmt.Errorf("failed to get device instance IDs for device ID %s: %w", gpu.DeviceID, err))
}
var cfgmgr32Dev cfgmgr32.Device
for _, dev := range cfgmgr32Devs {
if dev.BusNumber == gpu.BusNumber && dev.DeviceNumber == gpu.DeviceNumber && dev.FunctionNumber == gpu.FunctionNumber {
cfgmgr32Dev = dev
break
}
}
if cfgmgr32Dev.InstanceID == "" {
errs = append(errs, fmt.Errorf("failed to find matching device for device ID %s", gpu.DeviceID))
} else {
deviceID = cfgmgr32Dev.InstanceID
}
c.gpuDeviceCache[luidKey] = gpuDevice{
gdi32: gpu,
cfgmgr32: cfgmgr32Dev,
ID: deviceID,
}
} }
return errors.Join(errs...) return errors.Join(errs...)
@@ -298,31 +332,32 @@ func (c *Collector) collectGpuInfo(ch chan<- prometheus.Metric) {
prometheus.GaugeValue, prometheus.GaugeValue,
1.0, 1.0,
luid, luid,
gpu.AdapterString, gpu.ID,
strconv.FormatInt(int64(gpu.BusNumber), 10), gpu.gdi32.AdapterString,
strconv.FormatInt(int64(gpu.DeviceNumber), 10), gpu.gdi32.BusNumber.String(),
strconv.FormatInt(int64(gpu.FunctionNumber), 10), gpu.gdi32.DeviceNumber.String(),
gpu.gdi32.FunctionNumber.String(),
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuSharedSystemMemorySize, c.gpuSharedSystemMemorySize,
prometheus.GaugeValue, prometheus.GaugeValue,
float64(gpu.SharedSystemMemorySize), float64(gpu.gdi32.SharedSystemMemorySize),
luid, luid, gpu.ID,
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuDedicatedSystemMemorySize, c.gpuDedicatedSystemMemorySize,
prometheus.GaugeValue, prometheus.GaugeValue,
float64(gpu.DedicatedSystemMemorySize), float64(gpu.gdi32.DedicatedSystemMemorySize),
luid, luid, gpu.ID,
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuDedicatedVideoMemorySize, c.gpuDedicatedVideoMemorySize,
prometheus.GaugeValue, prometheus.GaugeValue,
float64(gpu.DedicatedVideoMemorySize), float64(gpu.gdi32.DedicatedVideoMemorySize),
luid, luid, gpu.ID,
) )
} }
} }
@@ -333,31 +368,20 @@ func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error {
return fmt.Errorf("failed to collect GPU Engine perf data: %w", err) return fmt.Errorf("failed to collect GPU Engine perf data: %w", err)
} }
runningTimeMap := make(map[PidPhysEngEngType]float64)
// Iterate over the GPU Engine perf data and aggregate the values. // Iterate over the GPU Engine perf data and aggregate the values.
for _, data := range c.gpuEnginePerfDataObject { for _, data := range c.gpuEnginePerfDataObject {
instance := parseGPUCounterInstanceString(data.Name) instance := parseGPUCounterInstanceString(data.Name)
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { device, ok := c.gpuDeviceCache[instance.Luid]
if !ok {
continue continue
} }
key := PidPhysEngEngType{
Pid: instance.Pid,
Phys: instance.Phys,
Luid: instance.Luid,
Eng: instance.Eng,
Engtype: instance.Engtype,
}
runningTimeMap[key] += data.RunningTime / 10_000_000 // RunningTime is in 100ns units, convert to seconds.
}
for key, runningTime := range runningTimeMap {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuEngineRunningTime, c.gpuEngineRunningTime,
prometheus.CounterValue, prometheus.CounterValue,
runningTime, data.RunningTime/10_000_000,
key.Pid, key.Luid, key.Phys, key.Eng, key.Engtype, instance.Pid, instance.Luid, device.ID, instance.Phys, instance.Eng, instance.Engtype,
) )
} }
@@ -370,49 +394,33 @@ func (c *Collector) collectGpuAdapterMemoryMetrics(ch chan<- prometheus.Metric)
return fmt.Errorf("failed to collect GPU Adapter Memory perf data: %w", err) return fmt.Errorf("failed to collect GPU Adapter Memory perf data: %w", err)
} }
dedicatedUsageMap := make(map[PidPhysEngEngType]float64)
sharedUsageMap := make(map[PidPhysEngEngType]float64)
totalCommittedMap := make(map[PidPhysEngEngType]float64)
for _, data := range c.gpuAdapterMemoryPerfDataObject { for _, data := range c.gpuAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name) instance := parseGPUCounterInstanceString(data.Name)
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { device, ok := c.gpuDeviceCache[instance.Luid]
if !ok {
continue continue
} }
key := PidPhysEngEngType{
Pid: instance.Pid,
Luid: instance.Luid,
Phys: instance.Phys,
Eng: instance.Eng,
Engtype: instance.Engtype,
}
dedicatedUsageMap[key] += data.DedicatedUsage
sharedUsageMap[key] += data.SharedUsage
totalCommittedMap[key] += data.TotalCommitted
}
for key, dedicatedUsage := range dedicatedUsageMap {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuAdapterMemoryDedicatedUsage, c.gpuAdapterMemoryDedicatedUsage,
prometheus.GaugeValue, prometheus.GaugeValue,
dedicatedUsage, data.DedicatedUsage,
key.Luid, key.Phys, instance.Luid, device.ID, instance.Phys,
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuAdapterMemorySharedUsage, c.gpuAdapterMemorySharedUsage,
prometheus.GaugeValue, prometheus.GaugeValue,
sharedUsageMap[key], data.SharedUsage,
key.Luid, key.Phys, instance.Luid, device.ID, instance.Phys,
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuAdapterMemoryTotalCommitted, c.gpuAdapterMemoryTotalCommitted,
prometheus.GaugeValue, prometheus.GaugeValue,
totalCommittedMap[key], data.TotalCommitted,
key.Luid, key.Phys, instance.Luid, device.ID, instance.Phys,
) )
} }
@@ -425,29 +433,19 @@ func (c *Collector) collectGpuLocalAdapterMemoryMetrics(ch chan<- prometheus.Met
return fmt.Errorf("failed to collect GPU Local Adapter Memory perf data: %w", err) return fmt.Errorf("failed to collect GPU Local Adapter Memory perf data: %w", err)
} }
localAdapterMemoryMap := make(map[PidPhysEngEngType]float64)
for _, data := range c.gpuLocalAdapterMemoryPerfDataObject { for _, data := range c.gpuLocalAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name) instance := parseGPUCounterInstanceString(data.Name)
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { device, ok := c.gpuDeviceCache[instance.Luid]
if !ok {
continue continue
} }
key := PidPhysEngEngType{
Luid: instance.Luid,
Phys: instance.Phys,
}
localAdapterMemoryMap[key] += data.LocalUsage
}
for key, localUsage := range localAdapterMemoryMap {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuLocalAdapterMemoryUsage, c.gpuLocalAdapterMemoryUsage,
prometheus.GaugeValue, prometheus.GaugeValue,
localUsage, data.LocalUsage,
key.Luid, key.Phys, instance.Luid, device.ID, instance.Phys, instance.Part,
) )
} }
@@ -460,28 +458,19 @@ func (c *Collector) collectGpuNonLocalAdapterMemoryMetrics(ch chan<- prometheus.
return fmt.Errorf("failed to collect GPU Non Local Adapter Memory perf data: %w", err) return fmt.Errorf("failed to collect GPU Non Local Adapter Memory perf data: %w", err)
} }
nonLocalAdapterMemoryMap := make(map[PidPhysEngEngType]float64)
for _, data := range c.gpuNonLocalAdapterMemoryPerfDataObject { for _, data := range c.gpuNonLocalAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name) instance := parseGPUCounterInstanceString(data.Name)
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { device, ok := c.gpuDeviceCache[instance.Luid]
if !ok {
continue continue
} }
key := PidPhysEngEngType{
Luid: instance.Luid,
Phys: instance.Phys,
}
nonLocalAdapterMemoryMap[key] += data.NonLocalUsage
}
for key, nonLocalUsage := range nonLocalAdapterMemoryMap {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuNonLocalAdapterMemoryUsage, c.gpuNonLocalAdapterMemoryUsage,
prometheus.GaugeValue, prometheus.GaugeValue,
nonLocalUsage, data.NonLocalUsage,
key.Luid, key.Phys, instance.Luid, device.ID, instance.Phys, instance.Part,
) )
} }
@@ -494,65 +483,47 @@ func (c *Collector) collectGpuProcessMemoryMetrics(ch chan<- prometheus.Metric)
return fmt.Errorf("failed to collect GPU Process Memory perf data: %w", err) return fmt.Errorf("failed to collect GPU Process Memory perf data: %w", err)
} }
processDedicatedUsageMap := make(map[PidPhys]float64)
processLocalUsageMap := make(map[PidPhys]float64)
processNonLocalUsageMap := make(map[PidPhys]float64)
processSharedUsageMap := make(map[PidPhys]float64)
processTotalCommittedMap := make(map[PidPhys]float64)
for _, data := range c.gpuProcessMemoryPerfDataObject { for _, data := range c.gpuProcessMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name) instance := parseGPUCounterInstanceString(data.Name)
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { device, ok := c.gpuDeviceCache[instance.Luid]
if !ok {
continue continue
} }
key := PidPhys{
Pid: instance.Pid,
Luid: instance.Luid,
Phys: instance.Phys,
}
processDedicatedUsageMap[key] += data.DedicatedUsage
processLocalUsageMap[key] += data.LocalUsage
processNonLocalUsageMap[key] += data.NonLocalUsage
processSharedUsageMap[key] += data.SharedUsage
processTotalCommittedMap[key] += data.TotalCommitted
}
for key, dedicatedUsage := range processDedicatedUsageMap {
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryDedicatedUsage, c.gpuProcessMemoryDedicatedUsage,
prometheus.GaugeValue, prometheus.GaugeValue,
dedicatedUsage, data.DedicatedUsage,
key.Pid, key.Luid, key.Phys, instance.Pid, instance.Luid, device.ID, instance.Phys,
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryLocalUsage, c.gpuProcessMemoryLocalUsage,
prometheus.GaugeValue, prometheus.GaugeValue,
processLocalUsageMap[key], data.LocalUsage,
key.Pid, key.Luid, key.Phys, instance.Pid, instance.Luid, device.ID, instance.Phys,
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryNonLocalUsage, c.gpuProcessMemoryNonLocalUsage,
prometheus.GaugeValue, prometheus.GaugeValue,
processNonLocalUsageMap[key], data.NonLocalUsage,
key.Pid, key.Luid, key.Phys, instance.Pid, instance.Luid, device.ID, instance.Phys,
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemorySharedUsage, c.gpuProcessMemorySharedUsage,
prometheus.GaugeValue, prometheus.GaugeValue,
processSharedUsageMap[key], data.SharedUsage,
key.Pid, key.Luid, key.Phys, instance.Pid, instance.Luid, device.ID, instance.Phys,
) )
ch <- prometheus.MustNewConstMetric( ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryTotalCommitted, c.gpuProcessMemoryTotalCommitted,
prometheus.GaugeValue, prometheus.GaugeValue,
processTotalCommittedMap[key], data.TotalCommitted,
key.Pid, key.Luid, key.Phys, instance.Pid, instance.Luid, device.ID, instance.Phys,
) )
} }

View File

@@ -25,6 +25,7 @@ import (
type Instance struct { type Instance struct {
Pid string Pid string
Luid string Luid string
DeviceID string
Phys string Phys string
Eng string Eng string
Engtype string Engtype string
@@ -34,12 +35,14 @@ type Instance struct {
type PidPhys struct { type PidPhys struct {
Pid string Pid string
Luid string Luid string
DeviceID string
Phys string Phys string
} }
type PidPhysEngEngType struct { type PidPhysEngEngType struct {
Pid string Pid string
Luid string Luid string
DeviceID string
Phys string Phys string
Eng string Eng string
Engtype string Engtype string

View File

@@ -0,0 +1,92 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cfgmgr32
import (
"fmt"
"unsafe"
"github.com/prometheus-community/windows_exporter/internal/headers/win32"
"golang.org/x/sys/windows"
)
// GetDevicesInstanceIDs lists the device instances reported by the
// configuration manager for the given filter string and resolves the bus,
// device and function number of each of them.
//
// deviceID is handed unchanged to CMGetDeviceIDListSize and CMGetDeviceIDList
// as the list filter. For every returned instance ID the device node is
// located and its bus number and address properties are read; the address is
// split into the device number (upper 16 bits) and the function number
// (lower 16 bits).
//
// An empty list yields (nil, nil). Any failure while resolving a single
// device aborts the enumeration and returns that error, annotated with the
// instance ID and the property being read.
func GetDevicesInstanceIDs(deviceID string) ([]Device, error) {
	var listSize uint32

	deviceIDLWStr := win32.NewLPWSTR(deviceID)

	if err := CMGetDeviceIDListSize(deviceIDLWStr, &listSize); err != nil {
		return nil, err
	}

	// CMGetDeviceIDList takes the address of the first buffer element, so a
	// zero-length buffer must never reach it.
	if listSize == 0 {
		return nil, nil
	}

	listBuffer := make([]uint16, listSize)

	if err := CMGetDeviceIDList(deviceIDLWStr, listBuffer); err != nil {
		return nil, err
	}

	deviceInstanceIDs := win32.ParseMultiSz(listBuffer)
	devices := make([]Device, 0, len(deviceInstanceIDs))

	for _, deviceInstanceID := range deviceInstanceIDs {
		// CMLocateDevNode takes the address of the first element; skip
		// entries that carry no characters instead of panicking.
		if len(deviceInstanceID) == 0 {
			continue
		}

		instanceID := windows.UTF16ToString(deviceInstanceID)

		// NOTE(review): devNode is filled in by CMLocateDevNode and later
		// passed by value to CMGetDevNodeProperty; it appears to carry a
		// DEVINST value rather than a real Go pointer — confirm.
		var devNode *windows.Handle

		if err := CMLocateDevNode(&devNode, deviceInstanceID); err != nil {
			return nil, fmt.Errorf("locate device node %q: %w", instanceID, err)
		}

		busNumber, err := devNodeUint32Property(devNode, DEVPKEYDeviceBusNumber)
		if err != nil {
			return nil, fmt.Errorf("read bus number of %q: %w", instanceID, err)
		}

		deviceAddress, err := devNodeUint32Property(devNode, DEVPKEYDeviceAddress)
		if err != nil {
			return nil, fmt.Errorf("read address of %q: %w", instanceID, err)
		}

		devices = append(devices, Device{
			InstanceID:     instanceID,
			BusNumber:      win32.UINT(busNumber),
			DeviceNumber:   win32.UINT(deviceAddress >> 16),
			FunctionNumber: win32.UINT(deviceAddress & 0xFFFF),
		})
	}

	return devices, nil
}

// devNodeUint32Property reads one device node property and verifies that it
// was reported as DEVPROP_TYPE_UINT32.
func devNodeUint32Property(devNode *windows.Handle, propKey *DEVPROPKEY) (uint32, error) {
	var (
		value    uint32
		propType uint32
	)

	// The length is an in/out argument, so it is initialised afresh for
	// every read instead of being shared between calls.
	propLen := uint32(unsafe.Sizeof(value))

	if err := CMGetDevNodeProperty(devNode, propKey, &propType, unsafe.Pointer(&value), &propLen); err != nil {
		return 0, err
	}

	if propType != DEVPROP_TYPE_UINT32 {
		return 0, fmt.Errorf("unexpected property type: 0x%08X", propType)
	}

	return value, nil
}

View File

@@ -0,0 +1,94 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cfgmgr32
import (
"fmt"
"unsafe"
"github.com/prometheus-community/windows_exporter/internal/headers/win32"
"golang.org/x/sys/windows"
)
//nolint:gochecknoglobals
var (
// cfgmgr32 is the lazily loaded handle to the system cfgmgr32.dll; the
// procedures below are looked up from it by their exported names.
cfgmgr32 = windows.NewLazySystemDLL("cfgmgr32.dll")
// procCMGetDeviceIDListW is called by CMGetDeviceIDList.
procCMGetDeviceIDListW = cfgmgr32.NewProc("CM_Get_Device_ID_ListW")
// procCMGetDeviceIDListSize is called by CMGetDeviceIDListSize.
procCMGetDeviceIDListSize = cfgmgr32.NewProc("CM_Get_Device_ID_List_SizeW")
// procCMGetDevNodePropertyW is called by CMGetDevNodeProperty.
procCMGetDevNodePropertyW = cfgmgr32.NewProc("CM_Get_DevNode_PropertyW")
// procCMLocateDevNodeW is called by CMLocateDevNode.
procCMLocateDevNodeW = cfgmgr32.NewProc("CM_Locate_DevNodeW")
)
// CMGetDeviceIDListSize calls CM_Get_Device_ID_List_SizeW and stores the
// result in size. filter is passed as the filter string together with the
// CM_GETIDLIST_FILTER_PRESENT and CM_GETIDLIST_FILTER_ENUMERATOR flags.
//
// A return code other than CR_SUCCESS is reported as an error that contains
// the code in hexadecimal.
func CMGetDeviceIDListSize(filter *win32.LPWSTR, size *uint32) error {
	const filterFlags = CM_GETIDLIST_FILTER_PRESENT | CM_GETIDLIST_FILTER_ENUMERATOR

	// The pointer conversion stays inside the argument list of Call.
	status, _, _ := procCMGetDeviceIDListSize.Call(
		uintptr(unsafe.Pointer(size)),
		filter.Pointer(),
		uintptr(filterFlags),
	)
	if status == CR_SUCCESS {
		return nil
	}

	return fmt.Errorf("CMGetDeviceIDListSize failed: 0x%02X", status)
}
// CMGetDeviceIDList calls CM_Get_Device_ID_ListW to fill buf with the device
// instance IDs selected by filter. The same CM_GETIDLIST_FILTER_PRESENT and
// CM_GETIDLIST_FILTER_ENUMERATOR flags as in CMGetDeviceIDListSize are used,
// and the buffer length is passed as the number of uint16 elements in buf.
//
// An empty buf is rejected with an error because the call needs the address
// of the first element. A return code other than CR_SUCCESS is reported as an
// error that contains the code in hexadecimal.
func CMGetDeviceIDList(filter *win32.LPWSTR, buf []uint16) error {
	// Taking &buf[0] of an empty slice would panic.
	if len(buf) == 0 {
		return fmt.Errorf("CMGetDeviceIDList failed: empty buffer")
	}

	ret, _, _ := procCMGetDeviceIDListW.Call(
		filter.Pointer(),
		uintptr(unsafe.Pointer(&buf[0])),
		uintptr(len(buf)),
		uintptr(CM_GETIDLIST_FILTER_PRESENT|CM_GETIDLIST_FILTER_ENUMERATOR),
	)
	if ret != CR_SUCCESS {
		return fmt.Errorf("CMGetDeviceIDList failed: 0x%02X", ret)
	}

	return nil
}
// CMLocateDevNode calls CM_Locate_DevNodeW for the UTF-16 device instance ID
// in deviceID and lets the call write its result to the location devInst
// points at. The flags argument is always 0.
//
// An empty deviceID is rejected with an error because the call needs the
// address of the first element. A return code other than CR_SUCCESS is
// reported as an error that contains the code in hexadecimal.
//
// NOTE(review): the address of deviceID[0] is passed as a string pointer, so
// the slice presumably has to be NUL-terminated — verify against
// win32.ParseMultiSz.
func CMLocateDevNode(devInst **windows.Handle, deviceID []uint16) error {
	// Taking &deviceID[0] of an empty slice would panic.
	if len(deviceID) == 0 {
		return fmt.Errorf("CMLocateDevNode failed: empty device instance ID")
	}

	ret, _, _ := procCMLocateDevNodeW.Call(
		uintptr(unsafe.Pointer(devInst)),
		uintptr(unsafe.Pointer(&deviceID[0])),
		0,
	)
	if ret != CR_SUCCESS {
		return fmt.Errorf("CMLocateDevNode failed: 0x%02X", ret)
	}

	return nil
}
// CMGetDevNodeProperty calls CM_Get_DevNode_PropertyW for the device node
// devInst and the property identified by propKey. The call receives propType,
// buf and bufLen as output locations; the flags argument is always 0.
//
// A return code other than CR_SUCCESS is reported as an error that contains
// the code in hexadecimal.
//
// NOTE(review): devInst itself (not the value it points to) is converted and
// passed as the first argument — confirm this matches how CMLocateDevNode
// fills it.
func CMGetDevNodeProperty(devInst *windows.Handle, propKey *DEVPROPKEY, propType *uint32, buf unsafe.Pointer, bufLen *uint32) error {
	// All pointer conversions stay inside the argument list of Call.
	status, _, _ := procCMGetDevNodePropertyW.Call(
		uintptr(unsafe.Pointer(devInst)),
		uintptr(unsafe.Pointer(propKey)),
		uintptr(unsafe.Pointer(propType)),
		uintptr(buf),
		uintptr(unsafe.Pointer(bufLen)),
		0,
	)
	if status == CR_SUCCESS {
		return nil
	}

	return fmt.Errorf("CMGetDevNodeProperty failed: 0x%02X", status)
}

View File

@@ -0,0 +1,70 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cfgmgr32
import (
"github.com/go-ole/go-ole"
"github.com/prometheus-community/windows_exporter/internal/headers/win32"
)
// Constants used when calling the Configuration Manager (cfgmgr32) functions.
const (
// Configuration Manager return codes
// CR_SUCCESS is the result code that indicates a successful call.
CR_SUCCESS = 0x00
// Filter flags
// CM_GETIDLIST_FILTER_ENUMERATOR and CM_GETIDLIST_FILTER_PRESENT are bit
// flags that can be OR-ed together when requesting a device ID list.
// NOTE(review): per the Windows headers, ENUMERATOR means the filter string
// names a device enumerator and PRESENT limits the list to devices that are
// currently present — confirm against the cfgmgr32 documentation.
CM_GETIDLIST_FILTER_ENUMERATOR = 0x00000001
CM_GETIDLIST_FILTER_PRESENT = 0x00000100
// DEVPROP_TYPE_UINT32 is the device property type code for a 32-bit
// unsigned integer value.
DEVPROP_TYPE_UINT32 uint32 = 0x00000007
)
// DEVPROPKEY represents a device property key (GUID + pid).
//
// FmtID holds the GUID part of the key and PID the numeric property
// identifier within that GUID.
type DEVPROPKEY struct {
FmtID ole.GUID
PID uint32
}
// Device groups a device instance ID with three numeric location fields.
//
// NOTE(review): BusNumber, DeviceNumber and FunctionNumber presumably form the
// PCI bus/device/function address of the device — confirm against the code
// that populates this struct.
type Device struct {
InstanceID string
BusNumber win32.UINT
DeviceNumber win32.UINT
FunctionNumber win32.UINT
}
//nolint:gochecknoglobals
var (
// DEVPKEYDeviceBusNumber is the property key with PID 23 in the
// {a45c254e-df1c-4efd-8020-67d146a850e0} property set.
// Source of the key values:
// https://github.com/Infinidat/infi.devicemanager/blob/8be9ead6b04ff45c63d9e3bc70d82cceafb75c47/src/infi/devicemanager/setupapi/properties.py#L138C1-L143C34
DEVPKEYDeviceBusNumber = &DEVPROPKEY{
FmtID: ole.GUID{
Data1: 0xa45c254e,
Data2: 0xdf1c,
Data3: 0x4efd,
Data4: [8]byte{0x80, 0x20, 0x67, 0xd1, 0x46, 0xa8, 0x50, 0xe0},
},
PID: 23, // property ID; the property's value type is DEVPROP_TYPE_UINT32
}
// DEVPKEYDeviceAddress is the property key with PID 30 in the same
// {a45c254e-df1c-4efd-8020-67d146a850e0} property set.
// Source of the key values:
// https://github.com/Infinidat/infi.devicemanager/blob/8be9ead6b04ff45c63d9e3bc70d82cceafb75c47/src/infi/devicemanager/setupapi/properties.py#L187-L192
DEVPKEYDeviceAddress = &DEVPROPKEY{
FmtID: ole.GUID{
Data1: 0xa45c254e,
Data2: 0xdf1c,
Data3: 0x4efd,
Data4: [8]byte{0x80, 0x20, 0x67, 0xd1, 0x46, 0xa8, 0x50, 0xe0},
},
PID: 30, // property ID; the property's value type is DEVPROP_TYPE_UINT32
}
)

View File

@@ -34,41 +34,12 @@ const (
KMTQAITYPE_ADAPTERADDRESS = 6 KMTQAITYPE_ADAPTERADDRESS = 6
// KMTQAITYPE_ADAPTERREGISTRYINFO pPrivateDriverData points to a D3DKMT_ADAPTERREGISTRYINFO structure that contains registry information about the graphics adapter. // KMTQAITYPE_ADAPTERREGISTRYINFO pPrivateDriverData points to a D3DKMT_ADAPTERREGISTRYINFO structure that contains registry information about the graphics adapter.
KMTQAITYPE_ADAPTERREGISTRYINFO = 8 KMTQAITYPE_ADAPTERREGISTRYINFO = 8
// KMTQAITYPE_PHYSICALADAPTERDEVICEIDS pPrivateDriverData points to a D3DKMT_QUERY_DEVICE_IDS structure that specifies the device ID(s) of the physical adapters. Supported starting with Windows 10 (WDDM 2.0).
KMTQAITYPE_PHYSICALADAPTERDEVICEIDS = 31
) )
var ErrNoGPUDevices = errors.New("no GPU devices found") var ErrNoGPUDevices = errors.New("no GPU devices found")
// GetGPUDeviceByLUID opens the adapter identified by adapterLUID, queries it
// through GetGPUDevice and closes the adapter handle again.
//
// If opening the adapter fails, a zero GPUDevice and the wrapped error are
// returned. Failures of the query and of the close are both collected; when
// at least one occurred, the device as returned by GetGPUDevice is handed back
// together with the joined errors. Only on full success is the LUID field set
// to adapterLUID.
func GetGPUDeviceByLUID(adapterLUID windows.LUID) (GPUDevice, error) {
	openArgs := D3DKMT_OPENADAPTERFROMLUID{
		AdapterLUID: adapterLUID,
	}

	if err := D3DKMTOpenAdapterFromLuid(&openArgs); err != nil {
		return GPUDevice{}, fmt.Errorf("D3DKMTOpenAdapterFromLuid failed: %w", err)
	}

	problems := make([]error, 0)

	device, err := GetGPUDevice(openArgs.HAdapter)
	if err != nil {
		problems = append(problems, fmt.Errorf("GetGPUDevice failed: %w", err))
	}

	// The handle is closed regardless of whether the query succeeded.
	closeArgs := D3DKMT_CLOSEADAPTER{
		HAdapter: openArgs.HAdapter,
	}
	if err := D3DKMTCloseAdapter(&closeArgs); err != nil {
		problems = append(problems, fmt.Errorf("D3DKMTCloseAdapter failed: %w", err))
	}

	if len(problems) == 0 {
		device.LUID = adapterLUID

		return device, nil
	}

	return device, fmt.Errorf("errors occurred while getting GPU device: %w", errors.Join(problems...))
}
func GetGPUDevice(hAdapter D3DKMT_HANDLE) (GPUDevice, error) { func GetGPUDevice(hAdapter D3DKMT_HANDLE) (GPUDevice, error) {
var gpuDevice GPUDevice var gpuDevice GPUDevice
@@ -118,6 +89,18 @@ func GetGPUDevice(hAdapter D3DKMT_HANDLE) (GPUDevice, error) {
gpuDevice.AdapterString = windows.UTF16ToString(info.AdapterString[:]) gpuDevice.AdapterString = windows.UTF16ToString(info.AdapterString[:])
var deviceIDs D3DKMT_QUERY_DEVICE_IDS
query.queryType = KMTQAITYPE_PHYSICALADAPTERDEVICEIDS
query.pPrivateDriverData = unsafe.Pointer(&deviceIDs)
query.privateDriverDataSize = uint32(unsafe.Sizeof(deviceIDs))
if err := D3DKMTQueryAdapterInfo(&query); err != nil && !errors.Is(err, windows.ERROR_FILE_NOT_FOUND) {
return gpuDevice, fmt.Errorf("D3DKMTQueryAdapterInfo (Device IDs) failed: %w", err)
}
gpuDevice.DeviceID = formatPNPDeviceID(deviceIDs)
return gpuDevice, nil return gpuDevice, nil
} }
@@ -151,7 +134,7 @@ func GetGPUDevices() ([]GPUDevice, error) {
// Process each adapter // Process each adapter
for i := range enumAdapters.NumAdapters { for i := range enumAdapters.NumAdapters {
adapter := pAdapters[i] adapter := pAdapters[i]
// Validate handle before using it // Validate the handle before using it.
if adapter.HAdapter == 0 { if adapter.HAdapter == 0 {
errs = append(errs, fmt.Errorf("adapter %d has null handle", i)) errs = append(errs, fmt.Errorf("adapter %d has null handle", i))
@@ -190,3 +173,13 @@ func GetGPUDevices() ([]GPUDevice, error) {
return gpuDevices, nil return gpuDevices, nil
} }
// formatPNPDeviceID renders the IDs in deviceIDs.DeviceIds as a PCI-style
// PNP device ID string of the form
//
//	PCI\VEN_vvvv&DEV_dddd&SUBSYS_ssssnnnn&REV_rr
//
// where vvvv is VendorID, dddd is DeviceID, ssss is SubSystemID, nnnn is
// SubVendorID and rr is RevisionID, all in upper-case hexadecimal. Each value
// is truncated to 16 bits (8 bits for the revision) before formatting.
func formatPNPDeviceID(deviceIDs D3DKMT_QUERY_DEVICE_IDS) string {
	const layout = "PCI\\VEN_%04X&DEV_%04X&SUBSYS_%04X%04X&REV_%02X"

	ids := deviceIDs.DeviceIds

	vendor := uint16(ids.VendorID)
	device := uint16(ids.DeviceID)
	subSystem := uint16(ids.SubSystemID)
	subVendor := uint16(ids.SubVendorID)
	revision := uint8(ids.RevisionID)

	return fmt.Sprintf(layout, vendor, device, subSystem, subVendor, revision)
}

View File

@@ -73,9 +73,22 @@ type D3DKMT_ADAPTERADDRESS struct {
FunctionNumber win32.UINT FunctionNumber win32.UINT
} }
// D3DKMT_QUERY_DEVICE_IDS is the private driver data buffer used with the
// KMTQAITYPE_PHYSICALADAPTERDEVICEIDS query of D3DKMTQueryAdapterInfo; it
// carries the device ID(s) of a physical adapter.
//
// All fields are 32-bit UINT values so the layout is a flat sequence of seven
// UINTs.
// NOTE(review): the field order is meant to mirror the Win32
// D3DKMT_QUERY_DEVICE_IDS / D3DKMT_DEVICE_IDS structures — confirm against
// the d3dkmthk.h definition before reordering anything.
type D3DKMT_QUERY_DEVICE_IDS struct {
// PhysicalAdapterIndex selects the physical adapter the query refers to.
PhysicalAdapterIndex win32.UINT
// DeviceIds holds the identifiers reported for that adapter.
DeviceIds struct {
VendorID win32.UINT
DeviceID win32.UINT
SubVendorID win32.UINT
SubSystemID win32.UINT
RevisionID win32.UINT
BusType win32.UINT
}
}
type GPUDevice struct { type GPUDevice struct {
AdapterString string AdapterString string
LUID windows.LUID LUID windows.LUID
DeviceID string
DedicatedVideoMemorySize uint64 DedicatedVideoMemorySize uint64
DedicatedSystemMemorySize uint64 DedicatedSystemMemorySize uint64
SharedSystemMemorySize uint64 SharedSystemMemorySize uint64

View File

@@ -18,6 +18,7 @@
package win32 package win32
import ( import (
"strconv"
"unsafe" "unsafe"
"golang.org/x/sys/windows" "golang.org/x/sys/windows"
@@ -32,8 +33,8 @@ type (
LPWSTR struct { LPWSTR struct {
*uint16 *uint16
} }
ULONG = uint32 // ULONG is a 32-bit unsigned int in Win32 ULONG uint32 // ULONG is a 32-bit unsigned int in Win32
UINT = uint32 // UINT is a 32-bit unsigned int in Win32 UINT uint32 // UINT is a 32-bit unsigned int in Win32
) )
// NewLPWSTR creates a new LPWSTR from a string. // NewLPWSTR creates a new LPWSTR from a string.
@@ -60,3 +61,7 @@ func (s *LPWSTR) Pointer() uintptr {
func (s *LPWSTR) String() string { func (s *LPWSTR) String() string {
return windows.UTF16PtrToString(s.uint16) return windows.UTF16PtrToString(s.uint16)
} }
// String returns the value that u points to formatted as a base-10 number.
func (u *UINT) String() string {
	value := uint64(*u)

	return strconv.FormatUint(value, 10)
}

View File

@@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package win32
// ParseMultiSz splits a UTF-16 encoded MULTI_SZ buffer (Windows style) into
// individual UTF-16 string slices.
//
// A MULTI_SZ buffer is a sequence of UTF-16 strings separated by single null
// terminators (0x0000) and terminated by an extra null (i.e., two consecutive
// nulls) to mark the end of the list.
//
// Example layout in memory (UTF-16):
//
//	"foo\0bar\0baz\0\0"
//
// Given such a []uint16, this function returns a [][]uint16 where each inner
// slice is one null-terminated string segment without the trailing null.
//
// Parsing stops at the first empty string, so a buffer that starts with a null
// yields no segments and anything after the list terminator is ignored. A
// trailing segment that is not followed by a null is not returned.
//
// The returned slices reference the original buffer (no copying). Their
// capacity is clipped to their length, so appending to a returned segment
// allocates a new array instead of overwriting the separator and the strings
// that follow it in buf.
func ParseMultiSz(buf []uint16) [][]uint16 {
	var (
		result [][]uint16
		start  int
	)

	for i, c := range buf {
		if c != 0 {
			continue
		}

		// Found a null terminator.
		if i == start {
			// Two consecutive nulls (or a leading null) → end of list.
			break
		}

		// Append the current string (excluding the null) with its capacity
		// limited to its own length.
		result = append(result, buf[start:i:i])

		// Move start to the next character after the null.
		start = i + 1
	}

	return result
}