gpu: add info metric about devices (#2070)

This commit is contained in:
Jan-Otto Kröpke
2025-06-04 22:49:59 +02:00
committed by GitHub
parent dcf85032ca
commit 55c877f536
7 changed files with 299 additions and 15 deletions

View File

@@ -2,6 +2,7 @@
<dictionary name="project"> <dictionary name="project">
<words> <words>
<w>containerd</w> <w>containerd</w>
<w>setupapi</w>
<w>spdx</w> <w>spdx</w>
</words> </words>
</dictionary> </dictionary>

View File

@@ -20,24 +20,25 @@ These metrics are available on supported versions of Windows with compatible GPU
### Adapter-level Metrics ### Adapter-level Metrics
| Name | Description | Type | Labels | | Name | Description | Type | Labels |
|----------------------------------------------|----------------------------------------------------------|-------|--------| |----------------------------------------------|-------------------------------------------------------------------------|-------|--------------------------------------------------------------------------------------|
| `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `phys` | | `windows_gpu_adapter_memory_committed_bytes` | Total committed GPU memory in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `phys` | | `windows_gpu_adapter_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `phys` | | `windows_gpu_adapter_memory_shared_bytes` | Shared GPU memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `phys` | | `windows_gpu_info` | A metric with a constant '1' value labeled with gpu device information. | gauge | `phys`, `physical_device_object_name`, `hardware_id`, `friendly_name`, `description` |
| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `phys` | | `windows_gpu_local_adapter_memory_bytes` | Local adapter memory usage in bytes per physical GPU | gauge | `phys` |
| `windows_gpu_non_local_adapter_memory_bytes` | Non-local adapter memory usage in bytes per physical GPU | gauge | `phys` |
### Per-process Metrics ### Per-process Metrics
| Name | Description | Type | Labels | | Name | Description | Type | Labels |
|----------------------------------------------|-------------------------------------------------|---------|----------------------------------------| |----------------------------------------------|-------------------------------------------------------------------------|---------|--------------------------------------------------------------------------------------|
| `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `phys`, `eng`, `engtype`, `process_id` | | `windows_gpu_engine_time_seconds` | Total running time of the GPU engine in seconds | counter | `phys`, `eng`, `engtype`, `process_id` |
| `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `phys`,`process_id` | | `windows_gpu_process_memory_committed_bytes` | Total committed GPU memory in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `phys`,`process_id` | | `windows_gpu_process_memory_dedicated_bytes` | Dedicated GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `phys`,`process_id` | | `windows_gpu_process_memory_local_bytes` | Local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `phys`,`process_id` | | `windows_gpu_process_memory_non_local_bytes` | Non-local GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
| `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `phys`,`process_id` | | `windows_gpu_process_memory_shared_bytes` | Shared GPU memory usage in bytes per process | gauge | `phys`,`process_id` |
## Metric Labels ## Metric Labels
@@ -50,6 +51,12 @@ These metrics are available on supported versions of Windows with compatible GPU
These are basic queries to help you get started with GPU monitoring on Windows using Prometheus. These are basic queries to help you get started with GPU monitoring on Windows using Prometheus.
**Show GPU information for a specific physical GPU (0) — query `windows_gpu_info{phys="0"}`; a sample of the returned series is shown below:**
```promql
windows_gpu_info{description="NVIDIA GeForce GTX 1070",friendly_name="",hardware_id="PCI\\VEN_10DE&DEV_1B81&SUBSYS_61733842&REV_A1",phys="0",physical_device_object_name="\\Device\\NTPNP_PCI0027"} 1
```
**Show total dedicated GPU memory (in bytes) usage on GPU 0:** **Show total dedicated GPU memory (in bytes) usage on GPU 0:**
```promql ```promql

View File

@@ -21,8 +21,10 @@ import (
"errors" "errors"
"fmt" "fmt"
"log/slog" "log/slog"
"strconv"
"github.com/alecthomas/kingpin/v2" "github.com/alecthomas/kingpin/v2"
"github.com/prometheus-community/windows_exporter/internal/headers/setupapi"
"github.com/prometheus-community/windows_exporter/internal/mi" "github.com/prometheus-community/windows_exporter/internal/mi"
"github.com/prometheus-community/windows_exporter/internal/pdh" "github.com/prometheus-community/windows_exporter/internal/pdh"
"github.com/prometheus-community/windows_exporter/internal/types" "github.com/prometheus-community/windows_exporter/internal/types"
@@ -43,6 +45,7 @@ type Collector struct {
gpuEnginePerfDataCollector *pdh.Collector gpuEnginePerfDataCollector *pdh.Collector
gpuEnginePerfDataObject []gpuEnginePerfDataCounterValues gpuEnginePerfDataObject []gpuEnginePerfDataCounterValues
gpuInfo *prometheus.Desc
gpuEngineRunningTime *prometheus.Desc gpuEngineRunningTime *prometheus.Desc
// GPU Adapter Memory // GPU Adapter Memory
@@ -109,6 +112,13 @@ func (c *Collector) Close() error {
func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error { func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
var err error var err error
c.gpuInfo = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "info"),
"A metric with a constant '1' value labeled with gpu device information.",
[]string{"phys", "physical_device_object_name", "hardware_id", "friendly_name", "description"},
nil,
)
c.gpuEngineRunningTime = prometheus.NewDesc( c.gpuEngineRunningTime = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"), prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"),
"Total running time of the GPU in seconds.", "Total running time of the GPU in seconds.",
@@ -213,6 +223,10 @@ func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
func (c *Collector) Collect(ch chan<- prometheus.Metric) error { func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
errs := make([]error, 0) errs := make([]error, 0)
if err := c.collectGpuInfo(ch); err != nil {
errs = append(errs, err)
}
if err := c.collectGpuEngineMetrics(ch); err != nil { if err := c.collectGpuEngineMetrics(ch); err != nil {
errs = append(errs, err) errs = append(errs, err)
} }
@@ -236,6 +250,28 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
return errors.Join(errs...) return errors.Join(errs...)
} }
// collectGpuInfo emits one constant-valued info metric per display adapter
// returned by setupapi.GetGPUDevices. The "phys" label is the position of the
// adapter in the returned slice; the remaining labels carry the device's
// physical device object name, hardware ID, friendly name and description.
//
// It returns a wrapped error when the adapters cannot be enumerated.
func (c *Collector) collectGpuInfo(ch chan<- prometheus.Metric) error {
	devices, err := setupapi.GetGPUDevices()
	if err != nil {
		return fmt.Errorf("failed to get GPU devices: %w", err)
	}

	for idx := range devices {
		device := devices[idx]

		// Label values must follow the order declared on the gpuInfo descriptor.
		labelValues := []string{
			strconv.Itoa(idx),
			device.PhysicalDeviceObjectName,
			device.HardwareID,
			device.FriendlyName,
			device.DeviceDesc,
		}

		ch <- prometheus.MustNewConstMetric(c.gpuInfo, prometheus.GaugeValue, 1.0, labelValues...)
	}

	return nil
}
func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error { func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error {
// Collect the GPU Engine perf data. // Collect the GPU Engine perf data.
if err := c.gpuEnginePerfDataCollector.Collect(&c.gpuEnginePerfDataObject); err != nil { if err := c.gpuEnginePerfDataCollector.Collect(&c.gpuEnginePerfDataObject); err != nil {

View File

@@ -0,0 +1,135 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package setupapi
import (
"sync"
"unsafe"
"golang.org/x/sys/windows"
)
// GUID_DISPLAY_ADAPTER returns the GUID {4d36e968-e325-11ce-bfc1-08002be10318}.
// The value is built on the first call and the same pointer is handed out on
// every subsequent call.
//
//nolint:gochecknoglobals
var GUID_DISPLAY_ADAPTER = sync.OnceValue(func() *windows.GUID {
	displayAdapter := windows.GUID{
		Data1: 0x4d36e968,
		Data2: 0xe325,
		Data3: 0x11ce,
		Data4: [8]byte{0xbf, 0xc1, 0x08, 0x00, 0x2b, 0xe1, 0x03, 0x18},
	}

	return &displayAdapter
})
// GetGPUDevices enumerates the devices of the GUID_DISPLAY_ADAPTER class that
// are currently present and returns one GPUDevice per enumerated device, in
// enumeration order.
//
// A property that cannot be read does not fail the call: DeviceDesc and
// FriendlyName fall back to "", HardwareID and PhysicalDeviceObjectName fall
// back to "unknown". An error is returned only when SetupDiGetClassDevsW does
// not yield a valid device information set. When no device is enumerated the
// returned slice is nil.
func GetGPUDevices() ([]GPUDevice, error) {
	hDevInfo, _, err := procSetupDiGetClassDevsW.Call(
		uintptr(unsafe.Pointer(GUID_DISPLAY_ADAPTER())),
		0,
		0,
		DIGCF_PRESENT,
	)
	if windows.Handle(hDevInfo) == windows.InvalidHandle {
		return nil, err
	}

	// Release the device information set on every exit path. The result of the
	// destroy call is deliberately ignored: there is nothing useful to do on failure.
	defer func() {
		_, _, _ = procSetupDiDestroyDeviceInfoList.Call(hDevInfo)
	}()

	var (
		devices    []GPUDevice
		deviceData SP_DEVINFO_DATA
	)

	// The API expects the structure size to be filled in by the caller.
	deviceData.CbSize = uint32(unsafe.Sizeof(deviceData))

	for i := 0; ; i++ {
		ret, _, _ := procSetupDiEnumDeviceInfo.Call(hDevInfo, uintptr(i), uintptr(unsafe.Pointer(&deviceData)))
		if ret == 0 {
			// A zero return ends the enumeration (no more devices).
			break
		}

		devices = append(devices, GPUDevice{
			DeviceDesc:               readDeviceRegistryProperty(hDevInfo, &deviceData, SPDRP_DEVICEDESC, ""),
			FriendlyName:             readDeviceRegistryProperty(hDevInfo, &deviceData, SPDRP_FRIENDLYNAME, ""),
			HardwareID:               readDeviceRegistryProperty(hDevInfo, &deviceData, SPDRP_HARDWAREID, "unknown"),
			PhysicalDeviceObjectName: readDeviceRegistryProperty(hDevInfo, &deviceData, SPDRP_PHYSICAL_DEVICE_OBJECT_NAME, "unknown"),
		})
	}

	return devices, nil
}

// readDeviceRegistryProperty reads a single string property of the device
// described by deviceData via SetupDiGetDeviceRegistryPropertyW and returns
// fallback when the call reports failure (for example when the property is
// not set or does not fit into the 256-character buffer).
func readDeviceRegistryProperty(hDevInfo uintptr, deviceData *SP_DEVINFO_DATA, property uintptr, fallback string) string {
	var propertyBuffer [256]uint16

	ret, _, _ := procSetupDiGetDeviceRegistryPropertyW.Call(
		hDevInfo,
		uintptr(unsafe.Pointer(deviceData)),
		property,
		0,
		uintptr(unsafe.Pointer(&propertyBuffer[0])),
		uintptr(len(propertyBuffer)*2), // buffer size in bytes, two bytes per UTF-16 code unit
		0,
	)
	if ret == 0 {
		return fallback
	}

	// Only the text up to the first NUL is kept.
	return windows.UTF16ToString(propertyBuffer[:])
}

View File

@@ -0,0 +1,32 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package setupapi_test
import (
"testing"
"github.com/prometheus-community/windows_exporter/internal/headers/setupapi"
"github.com/stretchr/testify/require"
)
// TestGetGPUDevices checks that enumerating the display adapters succeeds.
//
// Only the error is asserted: on a host without any display adapter
// GetGPUDevices returns a nil slice together with a nil error, so requiring a
// non-nil result would make the test depend on the hardware of the machine
// that runs it. The enumerated devices are logged to aid debugging.
func TestGetGPUDevices(t *testing.T) {
	devices, err := setupapi.GetGPUDevices()
	require.NoError(t, err, "Failed to get GPU devices")

	for i, device := range devices {
		t.Logf("GPU %d: %+v", i, device)
	}
}

View File

@@ -0,0 +1,31 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package setupapi
import (
"golang.org/x/sys/windows"
)
// modSetupAPI is the lazily loaded setupapi.dll module from which the
// SetupDi* procedures below are resolved.
//
//nolint:gochecknoglobals
var modSetupAPI = windows.NewLazySystemDLL("setupapi.dll")

// SetupAPI procedures of setupapi.dll, each bound by its exported name.
//
//nolint:gochecknoglobals
var (
	procSetupDiGetClassDevsW              = modSetupAPI.NewProc("SetupDiGetClassDevsW")
	procSetupDiEnumDeviceInfo             = modSetupAPI.NewProc("SetupDiEnumDeviceInfo")
	procSetupDiGetDeviceRegistryPropertyW = modSetupAPI.NewProc("SetupDiGetDeviceRegistryPropertyW")
	procSetupDiDestroyDeviceInfoList      = modSetupAPI.NewProc("SetupDiDestroyDeviceInfoList")
)

View File

@@ -0,0 +1,42 @@
// SPDX-License-Identifier: Apache-2.0
//
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build windows
package setupapi
import "golang.org/x/sys/windows"
// SetupAPI flag and property identifiers. The names and values mirror the
// definitions in the Windows SDK header setupapi.h.
const (
	// DIGCF_PRESENT is the flags value passed to SetupDiGetClassDevsW.
	DIGCF_PRESENT = 0x2

	// SPDRP_* select which device property SetupDiGetDeviceRegistryPropertyW reads.
	SPDRP_DEVICEDESC                  = 0x0
	SPDRP_HARDWAREID                  = 0x1
	SPDRP_FRIENDLYNAME                = 0xC
	SPDRP_PHYSICAL_DEVICE_OBJECT_NAME = 0xE
)
// SP_DEVINFO_DATA is the Go declaration of the SetupAPI structure of the same
// name; a pointer to it is handed to the SetupDi* procedures. The field order
// and types define the in-memory layout seen by the API and must not be
// changed.
type SP_DEVINFO_DATA struct {
	// CbSize is the size of this structure in bytes; the caller sets it before
	// passing the structure to the API.
	CbSize uint32
	// ClassGuid is presumably the setup class GUID of the device, filled in by
	// the API — confirm against the Windows SDK documentation.
	ClassGuid windows.GUID
	// DevInst is presumably an opaque device instance handle, filled in by the
	// API — confirm against the Windows SDK documentation.
	DevInst uint32
	_       uintptr // Reserved
}
// GPUDevice holds the string properties that GetGPUDevices reads for one
// display adapter.
type GPUDevice struct {
	// DeviceDesc is the SPDRP_DEVICEDESC property; empty when it cannot be read.
	DeviceDesc string
	// FriendlyName is the SPDRP_FRIENDLYNAME property; empty when it cannot be read.
	FriendlyName string
	// HardwareID is the SPDRP_HARDWAREID property; "unknown" when it cannot be read.
	HardwareID string
	// PhysicalDeviceObjectName is the SPDRP_PHYSICAL_DEVICE_OBJECT_NAME
	// property; "unknown" when it cannot be read.
	PhysicalDeviceObjectName string
}