gpu: add info metric about devices (#2070)

This commit is contained in:
Jan-Otto Kröpke
2025-06-04 22:49:59 +02:00
committed by GitHub
parent dcf85032ca
commit 55c877f536
7 changed files with 299 additions and 15 deletions

View File

@@ -2,6 +2,7 @@
<dictionary name="project">
<words>
<w>containerd</w>
<w>setupapi</w>
<w>spdx</w>
</words>
</dictionary>

View File

@@ -20,24 +20,25 @@ These metrics are available on supported versions of Windows with compatible GPU
### Adapter-level Metrics
| Name | Description | Type | Labels |
|----------------------------------------------|----------------------------------------------------------|-------|--------|
| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `phys` |
| Name | Description | Type | Labels |
|----------------------------------------------|-------------------------------------------------------------------------|-------|--------------------------------------------------------------------------------------|
| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_info` | A metric with a constant '1' value labeled with gpu device information. | gauge | `phys`, `physical_device_object_name`, `hardware_id`, `friendly_name`, `description` |
| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `phys` |
### Per-process Metrics
| Name | Description | Type | Labels |
|----------------------------------------------|-------------------------------------------------|---------|----------------------------------------|
| `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `phys`, `eng`, `engtype`, `process_id` |
| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| Name | Description | Type | Labels |
|----------------------------------------------|-------------------------------------------------------------------------|---------|--------------------------------------------------------------------------------------|
| `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `phys`, `eng`, `engtype`, `process_id` |
| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
## Metric Labels
@@ -50,6 +51,12 @@ These metrics are available on supported versions of Windows with compatible GPU
These are basic queries to help you get started with GPU monitoring on Windows using Prometheus.
**Show GPU information for a specific physical GPU (0):**
```promql
windows_gpu_info{phys="0"}
# Example result:
# windows_gpu_info{description="NVIDIA GeForce GTX 1070",friendly_name="",hardware_id="PCI\\VEN_10DE&DEV_1B81&SUBSYS_61733842&REV_A1",phys="0",physical_device_object_name="\\Device\\NTPNP_PCI0027"} 1
```
**Show total dedicated GPU memory (in bytes) usage on GPU 0:**
```promql

View File

@@ -21,8 +21,10 @@ import (
"errors"
"fmt"
"log/slog"
"strconv"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus-community/windows_exporter/internal/headers/setupapi"
"github.com/prometheus-community/windows_exporter/internal/mi"
"github.com/prometheus-community/windows_exporter/internal/pdh"
"github.com/prometheus-community/windows_exporter/internal/types"
@@ -43,6 +45,7 @@ type Collector struct {
gpuEnginePerfDataCollector *pdh.Collector
gpuEnginePerfDataObject []gpuEnginePerfDataCounterValues
gpuInfo *prometheus.Desc
gpuEngineRunningTime *prometheus.Desc
// GPU Adapter Memory
@@ -109,6 +112,13 @@ func (c *Collector) Close() error {
func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
var err error
c.gpuInfo = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "info"),
"A metric with a constant '1' value labeled with gpu device information.",
[]string{"phys", "physical_device_object_name", "hardware_id", "friendly_name", "description"},
nil,
)
c.gpuEngineRunningTime = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"),
"Total running time of the GPU in seconds.",
@@ -213,6 +223,10 @@ func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
errs := make([]error, 0)
if err := c.collectGpuInfo(ch); err != nil {
errs = append(errs, err)
}
if err := c.collectGpuEngineMetrics(ch); err != nil {
errs = append(errs, err)
}
@@ -236,6 +250,28 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
return errors.Join(errs...)
}
// collectGpuInfo publishes one constant gauge (value 1) per device returned by
// setupapi.GetGPUDevices, described by c.gpuInfo.
//
// The first label is the position of the device in the returned slice; the
// remaining labels are the device's physical device object name, hardware ID,
// friendly name and description, in that order.
//
// NOTE(review): the positional label presumably lines up with the "phys" index
// used by the other GPU metrics — confirm against the perf counter instances.
func (c *Collector) collectGpuInfo(ch chan<- prometheus.Metric) error {
	devices, err := setupapi.GetGPUDevices()
	if err != nil {
		return fmt.Errorf("failed to get GPU devices: %w", err)
	}

	for idx := range devices {
		device := devices[idx]

		labelValues := []string{
			strconv.Itoa(idx),
			device.PhysicalDeviceObjectName,
			device.HardwareID,
			device.FriendlyName,
			device.DeviceDesc,
		}

		ch <- prometheus.MustNewConstMetric(c.gpuInfo, prometheus.GaugeValue, 1.0, labelValues...)
	}

	return nil
}
func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error {
// Collect the GPU Engine perf data.
if err := c.gpuEnginePerfDataCollector.Collect(&c.gpuEnginePerfDataObject); err != nil {

View File

@@ -0,0 +1,135 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package setupapi
import (
"sync"
"unsafe"
"golang.org/x/sys/windows"
)
// GUID_DISPLAY_ADAPTER returns a pointer to the GUID
// {4d36e968-e325-11ce-bfc1-08002be10318}, which GetGPUDevices hands to
// SetupDiGetClassDevsW as the device class to enumerate. Because it is wrapped
// in sync.OnceValue, the GUID is built on the first call only and every call
// yields the same pointer.
//
//nolint:gochecknoglobals
var GUID_DISPLAY_ADAPTER = sync.OnceValue(func() *windows.GUID {
	displayClass := windows.GUID{
		Data1: 0x4d36e968,
		Data2: 0xe325,
		Data3: 0x11ce,
		Data4: [8]byte{0xbf, 0xc1, 0x08, 0x00, 0x2b, 0xe1, 0x03, 0x18},
	}

	return &displayClass
})
// GetGPUDevices enumerates the devices of the display adapter class that are
// selected by DIGCF_PRESENT and returns their descriptive registry properties
// in SetupAPI enumeration order.
//
// A property that cannot be read does not fail the call: DeviceDesc and
// FriendlyName fall back to the empty string, HardwareID and
// PhysicalDeviceObjectName fall back to "unknown".
//
// An error is returned when the device information set cannot be created, or
// when the enumeration stops for any reason other than reaching the end of the
// list (ERROR_NO_MORE_ITEMS). The returned slice is nil when no device exists.
func GetGPUDevices() ([]GPUDevice, error) {
	hDevInfo, _, err := procSetupDiGetClassDevsW.Call(
		uintptr(unsafe.Pointer(GUID_DISPLAY_ADAPTER())),
		0,
		0,
		DIGCF_PRESENT,
	)
	if windows.Handle(hDevInfo) == windows.InvalidHandle {
		return nil, err
	}

	// Release the device information set on every return path.
	defer func() {
		_, _, _ = procSetupDiDestroyDeviceInfoList.Call(hDevInfo)
	}()

	var (
		devices    []GPUDevice
		deviceData SP_DEVINFO_DATA
	)

	// SetupAPI requires the caller to announce the structure size up front.
	deviceData.CbSize = uint32(unsafe.Sizeof(deviceData))

	for i := 0; ; i++ {
		ret, _, enumErr := procSetupDiEnumDeviceInfo.Call(hDevInfo, uintptr(i), uintptr(unsafe.Pointer(&deviceData)))
		if ret == 0 {
			// Only ERROR_NO_MORE_ITEMS marks the regular end of the list;
			// anything else is a genuine failure that must not be reported
			// as a (possibly truncated) successful enumeration.
			if enumErr != windows.ERROR_NO_MORE_ITEMS {
				return nil, enumErr
			}

			break
		}

		devices = append(devices, GPUDevice{
			DeviceDesc:               deviceRegistryProperty(hDevInfo, &deviceData, SPDRP_DEVICEDESC, ""),
			FriendlyName:             deviceRegistryProperty(hDevInfo, &deviceData, SPDRP_FRIENDLYNAME, ""),
			HardwareID:               deviceRegistryProperty(hDevInfo, &deviceData, SPDRP_HARDWAREID, "unknown"),
			PhysicalDeviceObjectName: deviceRegistryProperty(hDevInfo, &deviceData, SPDRP_PHYSICAL_DEVICE_OBJECT_NAME, "unknown"),
		})
	}

	return devices, nil
}

// deviceRegistryProperty reads one string-valued registry property of the
// device described by deviceData and returns it up to the first NUL (for
// multi-string properties this is the first entry). If the property cannot be
// read, fallback is returned instead.
//
// The first attempt uses a 256-character buffer; when SetupAPI reports
// ERROR_INSUFFICIENT_BUFFER the call is repeated once with a buffer of the
// size it asked for, so long values are no longer replaced by the fallback.
func deviceRegistryProperty(hDevInfo uintptr, deviceData *SP_DEVINFO_DATA, property uint32, fallback string) string {
	var (
		initial      [256]uint16
		requiredSize uint32 // in bytes, filled in by SetupAPI
	)

	buf := initial[:]

	ret, _, err := procSetupDiGetDeviceRegistryPropertyW.Call(
		hDevInfo,
		uintptr(unsafe.Pointer(deviceData)),
		uintptr(property),
		0,
		uintptr(unsafe.Pointer(&buf[0])),
		uintptr(len(buf)*2),
		uintptr(unsafe.Pointer(&requiredSize)),
	)
	if ret == 0 && err == windows.ERROR_INSUFFICIENT_BUFFER && requiredSize > 0 {
		// Round the byte count up to whole UTF-16 code units.
		buf = make([]uint16, (requiredSize+1)/2)

		ret, _, _ = procSetupDiGetDeviceRegistryPropertyW.Call(
			hDevInfo,
			uintptr(unsafe.Pointer(deviceData)),
			uintptr(property),
			0,
			uintptr(unsafe.Pointer(&buf[0])),
			uintptr(len(buf)*2),
			0,
		)
	}

	if ret == 0 {
		return fallback
	}

	return windows.UTF16ToString(buf)
}

View File

@@ -0,0 +1,32 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package setupapi_test
import (
"testing"
"github.com/prometheus-community/windows_exporter/internal/headers/setupapi"
"github.com/stretchr/testify/require"
)
// TestGetGPUDevices is a smoke test against the real system: enumerating the
// GPU devices must succeed and must yield a non-nil slice.
func TestGetGPUDevices(t *testing.T) {
	gpus, enumErr := setupapi.GetGPUDevices()

	require.NoError(t, enumErr, "Failed to get GPU devices")
	require.NotNil(t, gpus)
}

View File

@@ -0,0 +1,31 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package setupapi
import (
"golang.org/x/sys/windows"
)
// Handles for setupapi.dll and the SetupDi* procedures this package calls.
// They are created through the windows.NewLazySystemDLL / NewProc API and are
// invoked via their Call method in GetGPUDevices.
//
//nolint:gochecknoglobals
var (
// modSetupAPI is the DLL handle all procedures below are resolved from.
modSetupAPI = windows.NewLazySystemDLL("setupapi.dll")
// procSetupDiGetClassDevsW creates the device information set that is enumerated.
procSetupDiGetClassDevsW = modSetupAPI.NewProc("SetupDiGetClassDevsW")
// procSetupDiEnumDeviceInfo fills an SP_DEVINFO_DATA for the device at a given index.
procSetupDiEnumDeviceInfo = modSetupAPI.NewProc("SetupDiEnumDeviceInfo")
// procSetupDiGetDeviceRegistryPropertyW reads one SPDRP_* property of a device.
procSetupDiGetDeviceRegistryPropertyW = modSetupAPI.NewProc("SetupDiGetDeviceRegistryPropertyW")
// procSetupDiDestroyDeviceInfoList releases the device information set.
procSetupDiDestroyDeviceInfoList = modSetupAPI.NewProc("SetupDiDestroyDeviceInfoList")
)

View File

@@ -0,0 +1,42 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package setupapi
import "golang.org/x/sys/windows"
// Flag and property identifiers passed to the SetupDi* procedures by
// GetGPUDevices.
// NOTE(review): the names and values presumably mirror the Windows SDK
// SetupAPI.h definitions — confirm against the SDK header.
const (
// DIGCF_PRESENT is passed as the flags argument of SetupDiGetClassDevsW.
DIGCF_PRESENT = 0x00000002
// The SPDRP_* values select which registry property
// SetupDiGetDeviceRegistryPropertyW reads.
// SPDRP_DEVICEDESC is read into GPUDevice.DeviceDesc.
SPDRP_DEVICEDESC = 0x00000000
// SPDRP_FRIENDLYNAME is read into GPUDevice.FriendlyName.
SPDRP_FRIENDLYNAME = 0x0000000C
// SPDRP_HARDWAREID is read into GPUDevice.HardwareID.
SPDRP_HARDWAREID = 0x00000001
// SPDRP_PHYSICAL_DEVICE_OBJECT_NAME is read into GPUDevice.PhysicalDeviceObjectName.
SPDRP_PHYSICAL_DEVICE_OBJECT_NAME = 0x0000000E
)
// SP_DEVINFO_DATA is the per-device record that GetGPUDevices passes by
// pointer to SetupDiEnumDeviceInfo and SetupDiGetDeviceRegistryPropertyW.
// NOTE(review): the field layout presumably has to match the Windows SDK
// structure of the same name — do not reorder or resize fields without
// confirming.
type SP_DEVINFO_DATA struct {
// CbSize holds the size of this structure; GetGPUDevices sets it via
// unsafe.Sizeof before the first call.
CbSize uint32
// ClassGuid is not read by this package.
ClassGuid windows.GUID
// DevInst is not read by this package.
DevInst uint32
_ uintptr // Reserved
}
// GPUDevice carries the registry properties GetGPUDevices collects for one
// enumerated device.
type GPUDevice struct {
// DeviceDesc is the SPDRP_DEVICEDESC property; empty when it cannot be read.
DeviceDesc string
// FriendlyName is the SPDRP_FRIENDLYNAME property; empty when it cannot be read.
FriendlyName string
// HardwareID is the SPDRP_HARDWAREID property; "unknown" when it cannot be read.
HardwareID string
// PhysicalDeviceObjectName is the SPDRP_PHYSICAL_DEVICE_OBJECT_NAME property;
// "unknown" when it cannot be read.
PhysicalDeviceObjectName string
}