gpu: fix windows_gpu_info metric (#2130)

This commit is contained in:
Jan-Otto Kröpke
2025-07-13 01:05:59 +02:00
committed by GitHub
parent 6b8c895a68
commit 524fea08c4
12 changed files with 534 additions and 255 deletions

View File

@@ -24,7 +24,7 @@ import (
"strconv"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus-community/windows_exporter/internal/headers/setupapi"
"github.com/prometheus-community/windows_exporter/internal/headers/gdi32"
"github.com/prometheus-community/windows_exporter/internal/mi"
"github.com/prometheus-community/windows_exporter/internal/pdh"
"github.com/prometheus-community/windows_exporter/internal/types"
@@ -41,6 +41,8 @@ var ConfigDefaults = Config{}
type Collector struct {
config Config
gpuDeviceCache map[string]gdi32.GPUDevice
// GPU Engine
gpuEnginePerfDataCollector *pdh.Collector
gpuEnginePerfDataObject []gpuEnginePerfDataCounterValues
@@ -48,6 +50,10 @@ type Collector struct {
gpuInfo *prometheus.Desc
gpuEngineRunningTime *prometheus.Desc
gpuSharedSystemMemorySize *prometheus.Desc
gpuDedicatedSystemMemorySize *prometheus.Desc
gpuDedicatedVideoMemorySize *prometheus.Desc
// GPU Adapter Memory
gpuAdapterMemoryPerfDataCollector *pdh.Collector
gpuAdapterMemoryPerfDataObject []gpuAdapterMemoryPerfDataCounterValues
@@ -115,78 +121,97 @@ func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
c.gpuInfo = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "info"),
"A metric with a constant '1' value labeled with gpu device information.",
[]string{"phys", "physical_device_object_name", "hardware_id", "friendly_name", "description"},
[]string{"luid", "name", "bus_number", "phys", "function_number"},
nil,
)
c.gpuSharedSystemMemorySize = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "shared_system_memory_size_bytes"),
"The size, in bytes, of memory from system memory that can be shared by many users.",
[]string{"luid"},
nil,
)
c.gpuDedicatedSystemMemorySize = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "dedicated_system_memory_size_bytes"),
"The size, in bytes, of memory that is dedicated from system memory.",
[]string{"luid"},
nil,
)
c.gpuDedicatedVideoMemorySize = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "dedicated_video_memory_size_bytes"),
"The size, in bytes, of memory that is dedicated from video memory.",
[]string{"luid"},
nil,
)
c.gpuEngineRunningTime = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "engine_time_seconds"),
"Total running time of the GPU in seconds.",
[]string{"process_id", "phys", "eng", "engtype"},
[]string{"process_id", "luid", "phys", "eng", "engtype"},
nil,
)
c.gpuAdapterMemoryDedicatedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_dedicated_bytes"),
"Dedicated GPU memory usage in bytes.",
[]string{"phys"},
[]string{"luid", "phys"},
nil,
)
c.gpuAdapterMemorySharedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_shared_bytes"),
"Shared GPU memory usage in bytes.",
[]string{"phys"},
[]string{"luid", "phys"},
nil,
)
c.gpuAdapterMemoryTotalCommitted = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "adapter_memory_committed_bytes"),
"Total committed GPU memory in bytes.",
[]string{"phys"},
[]string{"luid", "phys"},
nil,
)
c.gpuLocalAdapterMemoryUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "local_adapter_memory_bytes"),
"Local adapter memory usage in bytes.",
[]string{"phys"},
[]string{"luid", "phys"},
nil,
)
c.gpuNonLocalAdapterMemoryUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "non_local_adapter_memory_bytes"),
"Non-local adapter memory usage in bytes.",
[]string{"phys"},
[]string{"luid", "phys"},
nil,
)
c.gpuProcessMemoryDedicatedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_dedicated_bytes"),
"Dedicated process memory usage in bytes.",
[]string{"process_id", "phys"},
[]string{"process_id", "luid", "phys"},
nil,
)
c.gpuProcessMemoryLocalUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_local_bytes"),
"Local process memory usage in bytes.",
[]string{"process_id", "phys"},
[]string{"process_id", "luid", "phys"},
nil,
)
c.gpuProcessMemoryNonLocalUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_non_local_bytes"),
"Non-local process memory usage in bytes.",
[]string{"process_id", "phys"},
[]string{"process_id", "luid", "phys"},
nil,
)
c.gpuProcessMemorySharedUsage = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_shared_bytes"),
"Shared process memory usage in bytes.",
[]string{"process_id", "phys"},
[]string{"process_id", "luid", "phys"},
nil,
)
c.gpuProcessMemoryTotalCommitted = prometheus.NewDesc(
prometheus.BuildFQName(types.Namespace, Name, "process_memory_committed_bytes"),
"Total committed process memory in bytes.",
[]string{"process_id", "phys"},
[]string{"process_id", "luid", "phys"},
nil,
)
@@ -217,15 +242,31 @@ func (c *Collector) Build(_ *slog.Logger, _ *mi.Session) error {
errs = append(errs, fmt.Errorf("failed to create GPU Process Memory perf data collector: %w", err))
}
gpus, err := gdi32.GetGPUDevices()
if err != nil {
errs = append(errs, fmt.Errorf("failed to get GPU devices: %w", err))
}
for _, gpu := range gpus {
if gpu.AdapterString == "" {
continue
}
if c.gpuDeviceCache == nil {
c.gpuDeviceCache = make(map[string]gdi32.GPUDevice)
}
luidKey := fmt.Sprintf("0x%08X_0x%08X", gpu.LUID.HighPart, gpu.LUID.LowPart)
c.gpuDeviceCache[luidKey] = gpu
}
return errors.Join(errs...)
}
func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
errs := make([]error, 0)
if err := c.collectGpuInfo(ch); err != nil {
errs = append(errs, err)
}
c.collectGpuInfo(ch)
if err := c.collectGpuEngineMetrics(ch); err != nil {
errs = append(errs, err)
@@ -250,26 +291,40 @@ func (c *Collector) Collect(ch chan<- prometheus.Metric) error {
return errors.Join(errs...)
}
func (c *Collector) collectGpuInfo(ch chan<- prometheus.Metric) error {
gpus, err := setupapi.GetGPUDevices()
if err != nil {
return fmt.Errorf("failed to get GPU devices: %w", err)
}
for i, gpu := range gpus {
func (c *Collector) collectGpuInfo(ch chan<- prometheus.Metric) {
for luid, gpu := range c.gpuDeviceCache {
ch <- prometheus.MustNewConstMetric(
c.gpuInfo,
prometheus.GaugeValue,
1.0,
strconv.Itoa(i),
gpu.PhysicalDeviceObjectName,
gpu.HardwareID,
gpu.FriendlyName,
gpu.DeviceDesc,
luid,
gpu.AdapterString,
strconv.FormatInt(int64(gpu.BusNumber), 10),
strconv.FormatInt(int64(gpu.DeviceNumber), 10),
strconv.FormatInt(int64(gpu.FunctionNumber), 10),
)
ch <- prometheus.MustNewConstMetric(
c.gpuSharedSystemMemorySize,
prometheus.GaugeValue,
float64(gpu.SharedSystemMemorySize),
luid,
)
ch <- prometheus.MustNewConstMetric(
c.gpuDedicatedSystemMemorySize,
prometheus.GaugeValue,
float64(gpu.DedicatedSystemMemorySize),
luid,
)
ch <- prometheus.MustNewConstMetric(
c.gpuDedicatedVideoMemorySize,
prometheus.GaugeValue,
float64(gpu.DedicatedVideoMemorySize),
luid,
)
}
return nil
}
func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error {
@@ -283,9 +338,14 @@ func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error {
for _, data := range c.gpuEnginePerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok {
continue
}
key := PidPhysEngEngType{
Pid: instance.Pid,
Phys: instance.Phys,
Luid: instance.Luid,
Eng: instance.Eng,
Engtype: instance.Engtype,
}
@@ -297,7 +357,7 @@ func (c *Collector) collectGpuEngineMetrics(ch chan<- prometheus.Metric) error {
c.gpuEngineRunningTime,
prometheus.CounterValue,
runningTime,
key.Pid, key.Phys, key.Eng, key.Engtype,
key.Pid, key.Luid, key.Phys, key.Eng, key.Engtype,
)
}
@@ -317,8 +377,13 @@ func (c *Collector) collectGpuAdapterMemoryMetrics(ch chan<- prometheus.Metric)
for _, data := range c.gpuAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok {
continue
}
key := PidPhysEngEngType{
Pid: instance.Pid,
Luid: instance.Luid,
Phys: instance.Phys,
Eng: instance.Eng,
Engtype: instance.Engtype,
@@ -333,21 +398,21 @@ func (c *Collector) collectGpuAdapterMemoryMetrics(ch chan<- prometheus.Metric)
c.gpuAdapterMemoryDedicatedUsage,
prometheus.GaugeValue,
dedicatedUsage,
key.Phys,
key.Luid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuAdapterMemorySharedUsage,
prometheus.GaugeValue,
sharedUsageMap[key],
key.Phys,
key.Luid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuAdapterMemoryTotalCommitted,
prometheus.GaugeValue,
totalCommittedMap[key],
key.Phys,
key.Luid, key.Phys,
)
}
@@ -360,20 +425,29 @@ func (c *Collector) collectGpuLocalAdapterMemoryMetrics(ch chan<- prometheus.Met
return fmt.Errorf("failed to collect GPU Local Adapter Memory perf data: %w", err)
}
localAdapterMemoryMap := make(map[string]float64)
localAdapterMemoryMap := make(map[PidPhysEngEngType]float64)
for _, data := range c.gpuLocalAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
localAdapterMemoryMap[instance.Phys] += data.LocalUsage
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok {
continue
}
key := PidPhysEngEngType{
Luid: instance.Luid,
Phys: instance.Phys,
}
localAdapterMemoryMap[key] += data.LocalUsage
}
for phys, localUsage := range localAdapterMemoryMap {
for key, localUsage := range localAdapterMemoryMap {
ch <- prometheus.MustNewConstMetric(
c.gpuLocalAdapterMemoryUsage,
prometheus.GaugeValue,
localUsage,
phys,
key.Luid, key.Phys,
)
}
@@ -386,20 +460,28 @@ func (c *Collector) collectGpuNonLocalAdapterMemoryMetrics(ch chan<- prometheus.
return fmt.Errorf("failed to collect GPU Non Local Adapter Memory perf data: %w", err)
}
nonLocalAdapterMemoryMap := make(map[string]float64)
nonLocalAdapterMemoryMap := make(map[PidPhysEngEngType]float64)
for _, data := range c.gpuNonLocalAdapterMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
nonLocalAdapterMemoryMap[instance.Phys] += data.NonLocalUsage
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok {
continue
}
key := PidPhysEngEngType{
Luid: instance.Luid,
Phys: instance.Phys,
}
nonLocalAdapterMemoryMap[key] += data.NonLocalUsage
}
for phys, nonLocalUsage := range nonLocalAdapterMemoryMap {
for key, nonLocalUsage := range nonLocalAdapterMemoryMap {
ch <- prometheus.MustNewConstMetric(
c.gpuNonLocalAdapterMemoryUsage,
prometheus.GaugeValue,
nonLocalUsage,
phys,
key.Luid, key.Phys,
)
}
@@ -421,8 +503,13 @@ func (c *Collector) collectGpuProcessMemoryMetrics(ch chan<- prometheus.Metric)
for _, data := range c.gpuProcessMemoryPerfDataObject {
instance := parseGPUCounterInstanceString(data.Name)
if _, ok := c.gpuDeviceCache[instance.Luid]; !ok {
continue
}
key := PidPhys{
Pid: instance.Pid,
Luid: instance.Luid,
Phys: instance.Phys,
}
processDedicatedUsageMap[key] += data.DedicatedUsage
@@ -437,35 +524,35 @@ func (c *Collector) collectGpuProcessMemoryMetrics(ch chan<- prometheus.Metric)
c.gpuProcessMemoryDedicatedUsage,
prometheus.GaugeValue,
dedicatedUsage,
key.Pid, key.Phys,
key.Pid, key.Luid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryLocalUsage,
prometheus.GaugeValue,
processLocalUsageMap[key],
key.Pid, key.Phys,
key.Pid, key.Luid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryNonLocalUsage,
prometheus.GaugeValue,
processNonLocalUsageMap[key],
key.Pid, key.Phys,
key.Pid, key.Luid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemorySharedUsage,
prometheus.GaugeValue,
processSharedUsageMap[key],
key.Pid, key.Phys,
key.Pid, key.Luid, key.Phys,
)
ch <- prometheus.MustNewConstMetric(
c.gpuProcessMemoryTotalCommitted,
prometheus.GaugeValue,
processTotalCommittedMap[key],
key.Pid, key.Phys,
key.Pid, key.Luid, key.Phys,
)
}