From 0b8a257b31d35b918deb224b56b701b4242dcaf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?= Date: Thu, 28 Aug 2025 06:36:10 +0200 Subject: [PATCH] gpu: add device id label (#2186) --- .idea/dictionaries/project.xml | 1 + docs/collector.gpu.md | 40 ++--- internal/collector/gpu/gpu.go | 221 +++++++++++--------------- internal/collector/gpu/utils.go | 31 ++-- internal/headers/cfgmgr32/cfgmgr32.go | 92 +++++++++++ internal/headers/cfgmgr32/syscall.go | 94 +++++++++++ internal/headers/cfgmgr32/types.go | 70 ++++++++ internal/headers/gdi32/gdi32.go | 57 +++---- internal/headers/gdi32/types.go | 13 ++ internal/headers/win32/types.go | 9 +- internal/headers/win32/utils.go | 55 +++++++ 11 files changed, 490 insertions(+), 193 deletions(-) create mode 100644 internal/headers/cfgmgr32/cfgmgr32.go create mode 100644 internal/headers/cfgmgr32/syscall.go create mode 100644 internal/headers/cfgmgr32/types.go create mode 100644 internal/headers/win32/utils.go diff --git a/.idea/dictionaries/project.xml b/.idea/dictionaries/project.xml index ecd06d91..74d07f8a 100644 --- a/.idea/dictionaries/project.xml +++ b/.idea/dictionaries/project.xml @@ -4,6 +4,7 @@ containerd endpointstats gochecknoglobals + lpwstr luid operationoptions setupapi diff --git a/docs/collector.gpu.md b/docs/collector.gpu.md index a0f79190..3da22978 100644 --- a/docs/collector.gpu.md +++ b/docs/collector.gpu.md @@ -20,28 +20,28 @@ These metrics are available on supported versions of Windows with compatible GPU ### Adapter-level Metrics -| Name | Description | Type | Labels | -|--------------------------------------------------|------------------------------------------------------------------------------------|-------|---------------| -| `windows_gpu_info` | A metric with a constant '1' value labeled with gpu device information. 
| gauge | `luid`,`name`,`bus_number`,`phys`,`function_number` | -| `windows_gpu_dedicated_system_memory_size_bytes` | The size, in bytes, of memory that is dedicated from system memory. | gauge | `luid` | -| `windows_gpu_dedicated_video_memory_size_bytes` | The size, in bytes, of memory that is dedicated from video memory. | gauge | `luid` | -| `windows_gpu_shared_system_memory_size_bytes` | The size, in bytes, of memory from system memory that can be shared by many users. | gauge | `luid` | -| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `luid`,`phys` | -| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `luid`,`phys` | -| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `luid`,`phys` | -| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `luid`,`phys` | -| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `luid`,`phys` | +| Name | Description | Type | Labels | +|--------------------------------------------------|------------------------------------------------------------------------------------|-------|-----------------------------------------------------------------| +| `windows_gpu_info` | A metric with a constant '1' value labeled with gpu device information. | gauge | `bus_number`,`device_id`,`function_number`,`luid`,`name`,`phys` | +| `windows_gpu_dedicated_system_memory_size_bytes` | The size, in bytes, of memory that is dedicated from system memory. | gauge | `device_id`,`luid` | +| `windows_gpu_dedicated_video_memory_size_bytes` | The size, in bytes, of memory that is dedicated from video memory. | gauge | `device_id`,`luid` | +| `windows_gpu_shared_system_memory_size_bytes` | The size, in bytes, of memory from system memory that can be shared by many users. 
| gauge | `device_id`,`luid` | +| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `device_id`,`luid`,`phys` | +| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `device_id`,`luid`,`phys` | +| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `device_id`,`luid`,`phys` | +| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `device_id`,`luid`,`phys`,`part` | +| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `device_id`,`luid`,`phys`,`part` | ### Per-process Metrics -| Name | Description | Type | Labels | -|----------------------------------------------|-------------------------------------------------|---------|-----------------------------------------------| -| `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `luid`,`phys`, `eng`, `engtype`, `process_id` | -| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `luid`,`phys`,`process_id` | -| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `luid`,`phys`,`process_id` | -| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `luid`,`phys`,`process_id` | -| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `luid`,`phys`,`process_id` | -| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `luid`,`phys`,`process_id` | +| Name | Description | Type | Labels | +|----------------------------------------------|-------------------------------------------------|---------|-----------------------------------------------------------| +| 
`windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `device_id`,`luid`,`phys`, `eng`, `engtype`, `process_id` | +| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` | +| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` | +| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` | +| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` | +| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `device_id`,`luid`,`phys`,`process_id` | ## Metric Labels @@ -57,7 +57,7 @@ These are basic queries to help you get started with GPU monitoring on Windows u **Show GPU information for a specific physical GPU (0):** ```promql -windows_gpu_info{description="NVIDIA GeForce GTX 1070",friendly_name="",hardware_id="PCI\\VEN_10DE&DEV_1B81&SUBSYS_61733842&REV_A1",phys="0",physical_device_object_name="\\Device\\NTPNP_PCI0027"} 1 +windows_gpu_info{bus_number="8",device_id="PCI\\VEN_10DE&DEV_1B81&SUBSYS_61733842&REV_A1",function_number="0",luid="0x00000000_0x00010F8A",name="NVIDIA GeForce GTX 1070",phys="0"} 1 ``` **Show total dedicated GPU memory (in bytes) usage on GPU 0:** diff --git a/internal/collector/gpu/gpu.go b/internal/collector/gpu/gpu.go index 0916c9ca..df30f61e 100644 --- a/internal/collector/gpu/gpu.go +++ b/internal/collector/gpu/gpu.go @@ -21,9 +21,9 @@ import ( "errors" "fmt" "log/slog" - "strconv" "github.com/alecthomas/kingpin/v2" + "github.com/prometheus-community/windows_exporter/internal/headers/cfgmgr32" "github.com/prometheus-community/windows_exporter/internal/headers/gdi32" 
"github.com/prometheus-community/windows_exporter/internal/mi" "github.com/prometheus-community/windows_exporter/internal/pdh" @@ -41,7 +41,7 @@ var ConfigDefaults = Config{} type Collector struct { config Config - gpuDeviceCache map[string]gdi32.GPUDevice + gpuDeviceCache map[string]gpuDevice // GPU Engine gpuEnginePerfDataCollector *pdh.Collector @@ -85,6 +85,12 @@ type Collector struct { gpuProcessMemoryTotalCommitted *prometheus.Desc } +type gpuDevice struct { + gdi32 gdi32.GPUDevice + cfgmgr32 cfgmgr32.Device + ID string +} + func New(config *Config) *Collector { if config == nil { config = &ConfigDefaults @@ -121,97 +127,97 @@ func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error { c.gpuInfo = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "info"), "A metric with a constant '1' value labeled with gpu device information.", - []string{"luid", "name", "bus_number", "phys", "function_number"}, + []string{"luid", "device_id", "name", "bus_number", "phys", "function_number"}, nil, ) c.gpuSharedSystemMemorySize = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "shared_system_memory_size_bytes"), "The size, in bytes, of memory from system memory that can be shared by many users.", - []string{"luid"}, + []string{"luid", "device_id"}, nil, ) c.gpuDedicatedSystemMemorySize = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "dedicated_system_memory_size_bytes"), "The size, in bytes, of memory that is dedicated from system memory.", - []string{"luid"}, + []string{"luid", "device_id"}, nil, ) c.gpuDedicatedVideoMemorySize = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "dedicated_video_memory_size_bytes"), "The size, in bytes, of memory that is dedicated from video memory.", - []string{"luid"}, + []string{"luid", "device_id"}, nil, ) c.gpuEngineRunningTime = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"), "Total running time of the GPU in seconds.", 
- []string{"process_id", "luid", "phys", "eng", "engtype"}, + []string{"process_id", "luid", "device_id", "phys", "eng", "engtype"}, nil, ) c.gpuAdapterMemoryDedicatedUsage = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_dedicated_bytes"), "Dedicated GPU memory usage in bytes.", - []string{"luid", "phys"}, + []string{"luid", "device_id", "phys"}, nil, ) c.gpuAdapterMemorySharedUsage = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_shared_bytes"), "Shared GPU memory usage in bytes.", - []string{"luid", "phys"}, + []string{"luid", "device_id", "phys"}, nil, ) c.gpuAdapterMemoryTotalCommitted = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_committed_bytes"), "Total committed GPU memory in bytes.", - []string{"luid", "phys"}, + []string{"luid", "device_id", "phys"}, nil, ) c.gpuLocalAdapterMemoryUsage = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "local_adapter_memory_bytes"), "Local adapter memory usage in bytes.", - []string{"luid", "phys"}, + []string{"luid", "device_id", "phys", "part"}, nil, ) c.gpuNonLocalAdapterMemoryUsage = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "non_local_adapter_memory_bytes"), "Non-local adapter memory usage in bytes.", - []string{"luid", "phys"}, + []string{"luid", "device_id", "phys", "part"}, nil, ) c.gpuProcessMemoryDedicatedUsage = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "process_memory_dedicated_bytes"), "Dedicated process memory usage in bytes.", - []string{"process_id", "luid", "phys"}, + []string{"process_id", "luid", "device_id", "phys"}, nil, ) c.gpuProcessMemoryLocalUsage = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "process_memory_local_bytes"), "Local process memory usage in bytes.", - []string{"process_id", "luid", "phys"}, + []string{"process_id", "luid", "device_id", "phys"}, nil, ) c.gpuProcessMemoryNonLocalUsage = 
prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "process_memory_non_local_bytes"), "Non-local process memory usage in bytes.", - []string{"process_id", "luid", "phys"}, + []string{"process_id", "luid", "device_id", "phys"}, nil, ) c.gpuProcessMemorySharedUsage = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "process_memory_shared_bytes"), "Shared process memory usage in bytes.", - []string{"process_id", "luid", "phys"}, + []string{"process_id", "luid", "device_id", "phys"}, nil, ) c.gpuProcessMemoryTotalCommitted = prometheus.NewDesc( prometheus.BuildFQName(types.Namespace, Name, "process_memory_committed_bytes"), "Total committed process memory in bytes.", - []string{"process_id", "luid", "phys"}, + []string{"process_id", "luid", "device_id", "phys"}, nil, ) @@ -253,11 +259,39 @@ func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error { } if c.gpuDeviceCache == nil { - c.gpuDeviceCache = make(map[string]gdi32.GPUDevice) + c.gpuDeviceCache = make(map[string]gpuDevice) } luidKey := fmt.Sprintf("0x%08X_0x%08X", gpu.LUID.HighPart, gpu.LUID.LowPart) - c.gpuDeviceCache[luidKey] = gpu + + deviceID := gpu.DeviceID + + cfgmgr32Devs, err := cfgmgr32.GetDevicesInstanceIDs(gpu.DeviceID) + if err != nil { + errs = append(errs, fmt.Errorf("failed to get device instance IDs for device ID %s: %w", gpu.DeviceID, err)) + } + + var cfgmgr32Dev cfgmgr32.Device + + for _, dev := range cfgmgr32Devs { + if dev.BusNumber == gpu.BusNumber && dev.DeviceNumber == gpu.DeviceNumber && dev.FunctionNumber == gpu.FunctionNumber { + cfgmgr32Dev = dev + + break + } + } + + if cfgmgr32Dev.InstanceID == "" { + errs = append(errs, fmt.Errorf("failed to find matching device for device ID %s", gpu.DeviceID)) + } else { + deviceID = cfgmgr32Dev.InstanceID + } + + c.gpuDeviceCache[luidKey] = gpuDevice{ + gdi32: gpu, + cfgmgr32: cfgmgr32Dev, + ID: deviceID, + } } return errors.Join(errs...) 
@@ -298,31 +332,32 @@ func (c *Collector) collectGpuInfo(ch chan<- prometheus.Metric) { prometheus.GaugeValue, 1.0, luid, - gpu.AdapterString, - strconv.FormatInt(int64(gpu.BusNumber), 10), - strconv.FormatInt(int64(gpu.DeviceNumber), 10), - strconv.FormatInt(int64(gpu.FunctionNumber), 10), + gpu.ID, + gpu.gdi32.AdapterString, + gpu.gdi32.BusNumber.String(), + gpu.gdi32.DeviceNumber.String(), + gpu.gdi32.FunctionNumber.String(), ) ch <- prometheus.MustNewConstMetric( c.gpuSharedSystemMemorySize, prometheus.GaugeValue, - float64(gpu.SharedSystemMemorySize), - luid, + float64(gpu.gdi32.SharedSystemMemorySize), + luid, gpu.ID, ) ch <- prometheus.MustNewConstMetric( c.gpuDedicatedSystemMemorySize, prometheus.GaugeValue, - float64(gpu.DedicatedSystemMemorySize), - luid, + float64(gpu.gdi32.DedicatedSystemMemorySize), + luid, gpu.ID, ) ch <- prometheus.MustNewConstMetric( c.gpuDedicatedVideoMemorySize, prometheus.GaugeValue, - float64(gpu.DedicatedVideoMemorySize), - luid, + float64(gpu.gdi32.DedicatedVideoMemorySize), + luid, gpu.ID, ) } } @@ -333,31 +368,20 @@ func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error { return fmt.Errorf("failed to collect GPU Engine perf data: %w", err) } - runningTimeMap := make(map[PidPhysEngEngType]float64) // Iterate over the GPU Engine perf data and aggregate the values. for _, data := range c.gpuEnginePerfDataObject { instance := parseGPUCounterInstanceString(data.Name) - if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { + device, ok := c.gpuDeviceCache[instance.Luid] + if !ok { continue } - key := PidPhysEngEngType{ - Pid: instance.Pid, - Phys: instance.Phys, - Luid: instance.Luid, - Eng: instance.Eng, - Engtype: instance.Engtype, - } - runningTimeMap[key] += data.RunningTime / 10_000_000 // RunningTime is in 100ns units, convert to seconds. 
- } - - for key, runningTime := range runningTimeMap { ch <- prometheus.MustNewConstMetric( c.gpuEngineRunningTime, prometheus.CounterValue, - runningTime, - key.Pid, key.Luid, key.Phys, key.Eng, key.Engtype, + data.RunningTime/10_000_000, + instance.Pid, instance.Luid, device.ID, instance.Phys, instance.Eng, instance.Engtype, ) } @@ -370,49 +394,33 @@ func (c *Collector) collectGpuAdapterMemoryMetrics(ch chan<- prometheus.Metric) return fmt.Errorf("failed to collect GPU Adapter Memory perf data: %w", err) } - dedicatedUsageMap := make(map[PidPhysEngEngType]float64) - sharedUsageMap := make(map[PidPhysEngEngType]float64) - totalCommittedMap := make(map[PidPhysEngEngType]float64) - for _, data := range c.gpuAdapterMemoryPerfDataObject { instance := parseGPUCounterInstanceString(data.Name) - if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { + device, ok := c.gpuDeviceCache[instance.Luid] + if !ok { continue } - key := PidPhysEngEngType{ - Pid: instance.Pid, - Luid: instance.Luid, - Phys: instance.Phys, - Eng: instance.Eng, - Engtype: instance.Engtype, - } - dedicatedUsageMap[key] += data.DedicatedUsage - sharedUsageMap[key] += data.SharedUsage - totalCommittedMap[key] += data.TotalCommitted - } - - for key, dedicatedUsage := range dedicatedUsageMap { ch <- prometheus.MustNewConstMetric( c.gpuAdapterMemoryDedicatedUsage, prometheus.GaugeValue, - dedicatedUsage, - key.Luid, key.Phys, + data.DedicatedUsage, + instance.Luid, device.ID, instance.Phys, ) ch <- prometheus.MustNewConstMetric( c.gpuAdapterMemorySharedUsage, prometheus.GaugeValue, - sharedUsageMap[key], - key.Luid, key.Phys, + data.SharedUsage, + instance.Luid, device.ID, instance.Phys, ) ch <- prometheus.MustNewConstMetric( c.gpuAdapterMemoryTotalCommitted, prometheus.GaugeValue, - totalCommittedMap[key], - key.Luid, key.Phys, + data.TotalCommitted, + instance.Luid, device.ID, instance.Phys, ) } @@ -425,29 +433,19 @@ func (c *Collector) collectGpuLocalAdapterMemoryMetrics(ch chan<- prometheus.Met return 
fmt.Errorf("failed to collect GPU Local Adapter Memory perf data: %w", err) } - localAdapterMemoryMap := make(map[PidPhysEngEngType]float64) - for _, data := range c.gpuLocalAdapterMemoryPerfDataObject { instance := parseGPUCounterInstanceString(data.Name) - if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { + device, ok := c.gpuDeviceCache[instance.Luid] + if !ok { continue } - key := PidPhysEngEngType{ - Luid: instance.Luid, - Phys: instance.Phys, - } - - localAdapterMemoryMap[key] += data.LocalUsage - } - - for key, localUsage := range localAdapterMemoryMap { ch <- prometheus.MustNewConstMetric( c.gpuLocalAdapterMemoryUsage, prometheus.GaugeValue, - localUsage, - key.Luid, key.Phys, + data.LocalUsage, + instance.Luid, device.ID, instance.Phys, instance.Part, ) } @@ -460,28 +458,19 @@ func (c *Collector) collectGpuNonLocalAdapterMemoryMetrics(ch chan<- prometheus. return fmt.Errorf("failed to collect GPU Non Local Adapter Memory perf data: %w", err) } - nonLocalAdapterMemoryMap := make(map[PidPhysEngEngType]float64) - for _, data := range c.gpuNonLocalAdapterMemoryPerfDataObject { instance := parseGPUCounterInstanceString(data.Name) - if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { + device, ok := c.gpuDeviceCache[instance.Luid] + if !ok { continue } - key := PidPhysEngEngType{ - Luid: instance.Luid, - Phys: instance.Phys, - } - nonLocalAdapterMemoryMap[key] += data.NonLocalUsage - } - - for key, nonLocalUsage := range nonLocalAdapterMemoryMap { ch <- prometheus.MustNewConstMetric( c.gpuNonLocalAdapterMemoryUsage, prometheus.GaugeValue, - nonLocalUsage, - key.Luid, key.Phys, + data.NonLocalUsage, + instance.Luid, device.ID, instance.Phys, instance.Part, ) } @@ -494,65 +483,47 @@ func (c *Collector) collectGpuProcessMemoryMetrics(ch chan<- prometheus.Metric) return fmt.Errorf("failed to collect GPU Process Memory perf data: %w", err) } - processDedicatedUsageMap := make(map[PidPhys]float64) - processLocalUsageMap := make(map[PidPhys]float64) - 
processNonLocalUsageMap := make(map[PidPhys]float64) - processSharedUsageMap := make(map[PidPhys]float64) - processTotalCommittedMap := make(map[PidPhys]float64) - for _, data := range c.gpuProcessMemoryPerfDataObject { instance := parseGPUCounterInstanceString(data.Name) - if _, ok := c.gpuDeviceCache[instance.Luid]; !ok { + device, ok := c.gpuDeviceCache[instance.Luid] + if !ok { continue } - key := PidPhys{ - Pid: instance.Pid, - Luid: instance.Luid, - Phys: instance.Phys, - } - processDedicatedUsageMap[key] += data.DedicatedUsage - processLocalUsageMap[key] += data.LocalUsage - processNonLocalUsageMap[key] += data.NonLocalUsage - processSharedUsageMap[key] += data.SharedUsage - processTotalCommittedMap[key] += data.TotalCommitted - } - - for key, dedicatedUsage := range processDedicatedUsageMap { ch <- prometheus.MustNewConstMetric( c.gpuProcessMemoryDedicatedUsage, prometheus.GaugeValue, - dedicatedUsage, - key.Pid, key.Luid, key.Phys, + data.DedicatedUsage, + instance.Pid, instance.Luid, device.ID, instance.Phys, ) ch <- prometheus.MustNewConstMetric( c.gpuProcessMemoryLocalUsage, prometheus.GaugeValue, - processLocalUsageMap[key], - key.Pid, key.Luid, key.Phys, + data.LocalUsage, + instance.Pid, instance.Luid, device.ID, instance.Phys, ) ch <- prometheus.MustNewConstMetric( c.gpuProcessMemoryNonLocalUsage, prometheus.GaugeValue, - processNonLocalUsageMap[key], - key.Pid, key.Luid, key.Phys, + data.NonLocalUsage, + instance.Pid, instance.Luid, device.ID, instance.Phys, ) ch <- prometheus.MustNewConstMetric( c.gpuProcessMemorySharedUsage, prometheus.GaugeValue, - processSharedUsageMap[key], - key.Pid, key.Luid, key.Phys, + data.SharedUsage, + instance.Pid, instance.Luid, device.ID, instance.Phys, ) ch <- prometheus.MustNewConstMetric( c.gpuProcessMemoryTotalCommitted, prometheus.GaugeValue, - processTotalCommittedMap[key], - key.Pid, key.Luid, key.Phys, + data.TotalCommitted, + instance.Pid, instance.Luid, device.ID, instance.Phys, ) } diff --git 
a/internal/collector/gpu/utils.go b/internal/collector/gpu/utils.go index c8b57295..902c8a0a 100644 --- a/internal/collector/gpu/utils.go +++ b/internal/collector/gpu/utils.go @@ -23,26 +23,29 @@ import ( ) type Instance struct { - Pid string - Luid string - Phys string - Eng string - Engtype string - Part string + Pid string + Luid string + DeviceID string + Phys string + Eng string + Engtype string + Part string } type PidPhys struct { - Pid string - Luid string - Phys string + Pid string + Luid string + DeviceID string + Phys string } type PidPhysEngEngType struct { - Pid string - Luid string - Phys string - Eng string - Engtype string + Pid string + Luid string + DeviceID string + Phys string + Eng string + Engtype string } func parseGPUCounterInstanceString(s string) Instance { diff --git a/internal/headers/cfgmgr32/cfgmgr32.go b/internal/headers/cfgmgr32/cfgmgr32.go new file mode 100644 index 00000000..e0b2e180 --- /dev/null +++ b/internal/headers/cfgmgr32/cfgmgr32.go @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cfgmgr32 + +import ( + "fmt" + "unsafe" + + "github.com/prometheus-community/windows_exporter/internal/headers/win32" + "golang.org/x/sys/windows" +) + +func GetDevicesInstanceIDs(deviceID string) ([]Device, error) { + var ( + err error + listSize uint32 + ) + + deviceIDLWStr := win32.NewLPWSTR(deviceID) + + err = CMGetDeviceIDListSize(deviceIDLWStr, &listSize) + if err != nil { + return nil, err + } + + listBuffer := make([]uint16, listSize) + + err = CMGetDeviceIDList(deviceIDLWStr, listBuffer) + if err != nil { + return nil, err + } + + deviceInstanceIDs := win32.ParseMultiSz(listBuffer) + devices := make([]Device, 0, len(deviceInstanceIDs)) + + for _, deviceInstanceID := range deviceInstanceIDs { + var devNode *windows.Handle + + err = CMLocateDevNode(&devNode, deviceInstanceID) + if err != nil { + return nil, err + } + + var ( + busNumber uint32 + deviceAddress uint32 + propType uint32 + ) + + propLen := uint32(4) + + err = CMGetDevNodeProperty(devNode, DEVPKEYDeviceBusNumber, &propType, unsafe.Pointer(&busNumber), &propLen) + if err != nil { + return nil, err + } + + if propType != DEVPROP_TYPE_UINT32 { + return nil, fmt.Errorf("unexpected property type: 0x%08X", propType) + } + + err = CMGetDevNodeProperty(devNode, DEVPKEYDeviceAddress, &propType, unsafe.Pointer(&deviceAddress), &propLen) + if err != nil { + return nil, err + } + + if propType != DEVPROP_TYPE_UINT32 { + return nil, fmt.Errorf("unexpected property type: 0x%08X", propType) + } + + devices = append(devices, Device{ + InstanceID: windows.UTF16ToString(deviceInstanceID), + BusNumber: win32.UINT(busNumber), + DeviceNumber: win32.UINT(deviceAddress >> 16), + FunctionNumber: win32.UINT(deviceAddress & 0xFFFF), + }) + } + + return devices, nil +} diff --git a/internal/headers/cfgmgr32/syscall.go b/internal/headers/cfgmgr32/syscall.go new file mode 100644 index 00000000..837c964d --- /dev/null +++ b/internal/headers/cfgmgr32/syscall.go @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: 
Apache-2.0 +// +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cfgmgr32 + +import ( + "fmt" + "unsafe" + + "github.com/prometheus-community/windows_exporter/internal/headers/win32" + "golang.org/x/sys/windows" +) + +//nolint:gochecknoglobals +var ( + cfgmgr32 = windows.NewLazySystemDLL("cfgmgr32.dll") + + procCMGetDeviceIDListW = cfgmgr32.NewProc("CM_Get_Device_ID_ListW") + procCMGetDeviceIDListSize = cfgmgr32.NewProc("CM_Get_Device_ID_List_SizeW") + procCMGetDevNodePropertyW = cfgmgr32.NewProc("CM_Get_DevNode_PropertyW") + procCMLocateDevNodeW = cfgmgr32.NewProc("CM_Locate_DevNodeW") +) + +func CMGetDeviceIDListSize(filter *win32.LPWSTR, size *uint32) error { + ret, _, _ := procCMGetDeviceIDListSize.Call( + uintptr(unsafe.Pointer(size)), + filter.Pointer(), + uintptr(CM_GETIDLIST_FILTER_PRESENT|CM_GETIDLIST_FILTER_ENUMERATOR), + ) + + if ret != CR_SUCCESS { + return fmt.Errorf("CMGetDeviceIDListSize failed: 0x%02X", ret) + } + + return nil +} + +func CMGetDeviceIDList(filter *win32.LPWSTR, buf []uint16) error { + ret, _, _ := procCMGetDeviceIDListW.Call( + filter.Pointer(), + uintptr(unsafe.Pointer(&buf[0])), + uintptr(len(buf)), + uintptr(CM_GETIDLIST_FILTER_PRESENT|CM_GETIDLIST_FILTER_ENUMERATOR), + ) + + if ret != CR_SUCCESS { + return fmt.Errorf("CMGetDeviceIDList failed: 0x%02X", ret) + } + + return nil +} + +func CMLocateDevNode(devInst **windows.Handle, deviceID []uint16) error { + ret, _, _ := 
procCMLocateDevNodeW.Call( + uintptr(unsafe.Pointer(devInst)), + uintptr(unsafe.Pointer(&deviceID[0])), + 0, + ) + + if ret != CR_SUCCESS { + return fmt.Errorf("CMLocateDevNode failed: 0x%02X", ret) + } + + return nil +} + +func CMGetDevNodeProperty(devInst *windows.Handle, propKey *DEVPROPKEY, propType *uint32, buf unsafe.Pointer, bufLen *uint32) error { + ret, _, _ := procCMGetDevNodePropertyW.Call( + uintptr(unsafe.Pointer(devInst)), + uintptr(unsafe.Pointer(propKey)), + uintptr(unsafe.Pointer(propType)), + uintptr(buf), + uintptr(unsafe.Pointer(bufLen)), + 0, + ) + + if ret != CR_SUCCESS { + return fmt.Errorf("CMGetDevNodeProperty failed: 0x%02X", ret) + } + + return nil +} diff --git a/internal/headers/cfgmgr32/types.go b/internal/headers/cfgmgr32/types.go new file mode 100644 index 00000000..72573e98 --- /dev/null +++ b/internal/headers/cfgmgr32/types.go @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cfgmgr32 + +import ( + "github.com/go-ole/go-ole" + "github.com/prometheus-community/windows_exporter/internal/headers/win32" +) + +const ( + // Configuration Manager return codes + CR_SUCCESS = 0x00 + + // Filter flags + CM_GETIDLIST_FILTER_ENUMERATOR = 0x00000001 + CM_GETIDLIST_FILTER_PRESENT = 0x00000100 + + DEVPROP_TYPE_UINT32 uint32 = 0x00000007 +) + +// DEVPROPKEY represents a device property key (GUID + pid) +type DEVPROPKEY struct { + FmtID ole.GUID + PID uint32 +} + +type Device struct { + InstanceID string + BusNumber win32.UINT + DeviceNumber win32.UINT + FunctionNumber win32.UINT +} + +//nolint:gochecknoglobals +var ( + // https://github.com/Infinidat/infi.devicemanager/blob/8be9ead6b04ff45c63d9e3bc70d82cceafb75c47/src/infi/devicemanager/setupapi/properties.py#L138C1-L143C34 + DEVPKEYDeviceBusNumber = &DEVPROPKEY{ + FmtID: ole.GUID{ + Data1: 0xa45c254e, + Data2: 0xdf1c, + Data3: 0x4efd, + Data4: [8]byte{0x80, 0x20, 0x67, 0xd1, 0x46, 0xa8, 0x50, 0xe0}, + }, + PID: 23, // DEVPROP_TYPE_UINT32 + } + + // https://github.com/Infinidat/infi.devicemanager/blob/8be9ead6b04ff45c63d9e3bc70d82cceafb75c47/src/infi/devicemanager/setupapi/properties.py#L187-L192 + DEVPKEYDeviceAddress = &DEVPROPKEY{ + FmtID: ole.GUID{ + Data1: 0xa45c254e, + Data2: 0xdf1c, + Data3: 0x4efd, + Data4: [8]byte{0x80, 0x20, 0x67, 0xd1, 0x46, 0xa8, 0x50, 0xe0}, + }, + PID: 30, // DEVPROP_TYPE_UINT32 + } +) diff --git a/internal/headers/gdi32/gdi32.go b/internal/headers/gdi32/gdi32.go index 1816db70..c6b12a10 100644 --- a/internal/headers/gdi32/gdi32.go +++ b/internal/headers/gdi32/gdi32.go @@ -34,41 +34,12 @@ const ( KMTQAITYPE_ADAPTERADDRESS = 6 // KMTQAITYPE_ADAPTERREGISTRYINFO pPrivateDriverData points to a D3DKMT_ADAPTERREGISTRYINFO structure that contains registry information about the graphics adapter. 
KMTQAITYPE_ADAPTERREGISTRYINFO = 8 + // KMTQAITYPE_PHYSICALADAPTERDEVICEIDS pPrivateDriverData points to a D3DKMT_QUERY_DEVICE_IDS structure that specifies the device ID(s) of the physical adapters. Supported starting with Windows 10 (WDDM 2.0). + KMTQAITYPE_PHYSICALADAPTERDEVICEIDS = 31 ) var ErrNoGPUDevices = errors.New("no GPU devices found") -func GetGPUDeviceByLUID(adapterLUID windows.LUID) (GPUDevice, error) { - open := D3DKMT_OPENADAPTERFROMLUID{ - AdapterLUID: adapterLUID, - } - - if err := D3DKMTOpenAdapterFromLuid(&open); err != nil { - return GPUDevice{}, fmt.Errorf("D3DKMTOpenAdapterFromLuid failed: %w", err) - } - - errs := make([]error, 0) - - gpuDevice, err := GetGPUDevice(open.HAdapter) - if err != nil { - errs = append(errs, fmt.Errorf("GetGPUDevice failed: %w", err)) - } - - if err := D3DKMTCloseAdapter(&D3DKMT_CLOSEADAPTER{ - HAdapter: open.HAdapter, - }); err != nil { - errs = append(errs, fmt.Errorf("D3DKMTCloseAdapter failed: %w", err)) - } - - if len(errs) > 0 { - return gpuDevice, fmt.Errorf("errors occurred while getting GPU device: %w", errors.Join(errs...)) - } - - gpuDevice.LUID = adapterLUID - - return gpuDevice, nil -} - func GetGPUDevice(hAdapter D3DKMT_HANDLE) (GPUDevice, error) { var gpuDevice GPUDevice @@ -118,6 +89,18 @@ func GetGPUDevice(hAdapter D3DKMT_HANDLE) (GPUDevice, error) { gpuDevice.AdapterString = windows.UTF16ToString(info.AdapterString[:]) + var deviceIDs D3DKMT_QUERY_DEVICE_IDS + + query.queryType = KMTQAITYPE_PHYSICALADAPTERDEVICEIDS + query.pPrivateDriverData = unsafe.Pointer(&deviceIDs) + query.privateDriverDataSize = uint32(unsafe.Sizeof(deviceIDs)) + + if err := D3DKMTQueryAdapterInfo(&query); err != nil && !errors.Is(err, windows.ERROR_FILE_NOT_FOUND) { + return gpuDevice, fmt.Errorf("D3DKMTQueryAdapterInfo (Device IDs) failed: %w", err) + } + + gpuDevice.DeviceID = formatPNPDeviceID(deviceIDs) + return gpuDevice, nil } @@ -151,7 +134,7 @@ func GetGPUDevices() ([]GPUDevice, error) { // Process each adapter for 
i := range enumAdapters.NumAdapters { adapter := pAdapters[i] - // Validate handle before using it + // Validate the handle before using it. if adapter.HAdapter == 0 { errs = append(errs, fmt.Errorf("adapter %d has null handle", i)) @@ -190,3 +173,13 @@ func GetGPUDevices() ([]GPUDevice, error) { return gpuDevices, nil } + +func formatPNPDeviceID(deviceIDs D3DKMT_QUERY_DEVICE_IDS) string { + return fmt.Sprintf("PCI\\VEN_%04X&DEV_%04X&SUBSYS_%04X%04X&REV_%02X", + uint16(deviceIDs.DeviceIds.VendorID), + uint16(deviceIDs.DeviceIds.DeviceID), + uint16(deviceIDs.DeviceIds.SubSystemID), + uint16(deviceIDs.DeviceIds.SubVendorID), + uint8(deviceIDs.DeviceIds.RevisionID), + ) +} diff --git a/internal/headers/gdi32/types.go b/internal/headers/gdi32/types.go index f1c62596..ef73534d 100644 --- a/internal/headers/gdi32/types.go +++ b/internal/headers/gdi32/types.go @@ -73,9 +73,22 @@ type D3DKMT_ADAPTERADDRESS struct { FunctionNumber win32.UINT } +type D3DKMT_QUERY_DEVICE_IDS struct { + PhysicalAdapterIndex win32.UINT + DeviceIds struct { + VendorID win32.UINT + DeviceID win32.UINT + SubVendorID win32.UINT + SubSystemID win32.UINT + RevisionID win32.UINT + BusType win32.UINT + } +} + type GPUDevice struct { AdapterString string LUID windows.LUID + DeviceID string DedicatedVideoMemorySize uint64 DedicatedSystemMemorySize uint64 SharedSystemMemorySize uint64 diff --git a/internal/headers/win32/types.go b/internal/headers/win32/types.go index 08147db9..edcfe16c 100644 --- a/internal/headers/win32/types.go +++ b/internal/headers/win32/types.go @@ -18,6 +18,7 @@ package win32 import ( + "strconv" "unsafe" "golang.org/x/sys/windows" @@ -32,8 +33,8 @@ type ( LPWSTR struct { *uint16 } - ULONG = uint32 // ULONG is a 32-bit unsigned int in Win32 - UINT = uint32 // UINT is a 32-bit unsigned int in Win32 + ULONG uint32 // ULONG is a 32-bit unsigned int in Win32 + UINT uint32 // UINT is a 32-bit unsigned int in Win32 ) // NewLPWSTR creates a new LPWSTR from a string. 
@@ -60,3 +61,8 @@ func (s *LPWSTR) Pointer() uintptr {
 func (s *LPWSTR) String() string {
 	return windows.UTF16PtrToString(s.uint16)
 }
+
+// String returns the base-10 representation of the value u points to.
+func (u *UINT) String() string {
+	return strconv.FormatUint(uint64(*u), 10)
+}
diff --git a/internal/headers/win32/utils.go b/internal/headers/win32/utils.go
new file mode 100644
index 00000000..2d713148
--- /dev/null
+++ b/internal/headers/win32/utils.go
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: Apache-2.0
+//
+// Copyright The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package win32
+
+// ParseMultiSz splits a UTF-16 encoded MULTI_SZ buffer (Windows style) into
+// individual UTF-16 string slices.
+//
+// A MULTI_SZ buffer is a sequence of UTF-16 strings separated by single null
+// terminators (0x0000) and terminated by an extra null (i.e., two consecutive
+// nulls) to mark the end of the list.
+//
+// Example layout in memory (UTF-16):
+//
+//	"foo\0bar\0baz\0\0"
+//
+// Given such a []uint16, this function returns a [][]uint16 where each inner
+// slice is one string segment without the trailing null. Parsing stops at the
+// first empty string (two consecutive nulls, or a leading null); anything
+// after it is ignored.
+//
+// Buffers that end without the final terminator(s) are tolerated: a trailing
+// segment that is not null-terminated is returned as the last string instead
+// of being dropped.
+//
+// The returned slices reference the original buffer (no copying). Their
+// capacity is limited to their length, so appending to one of them cannot
+// overwrite the rest of buf.
+func ParseMultiSz(buf []uint16) [][]uint16 {
+	var (
+		result [][]uint16
+		start  int
+	)
+
+	for i, c := range buf {
+		if c != 0 {
+			continue
+		}
+
+		// Found a null terminator.
+		if i == start {
+			// Empty string (two consecutive nulls) → end of list.
+			return result
+		}
+
+		// Append the current string slice (excluding the null).
+		result = append(result, buf[start:i:i])
+		// Move start to the next character after the null.
+		start = i + 1
+	}
+
+	// No list terminator was found. Keep a trailing segment that lacks its
+	// null terminator rather than silently losing it.
+	if start < len(buf) {
+		result = append(result, buf[start:len(buf):len(buf)])
+	}
+
+	return result
+}