From e673f192d26e18f10cdbe22b7109974ebfd9867a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan-Otto=20Kr=C3=B6pke?= Date: Sat, 31 May 2025 08:46:30 +0200 Subject: [PATCH] process: add `collector.process.counter-version` CLI parameter (#2064) --- .idea/go.imports.xml | 1 + docs/collector.process.md | 5 + internal/collector/process/process.go | 94 +++--- internal/collector/process/process_v1.go | 294 ------------------ .../{process_v2.go => process_worker.go} | 55 ++-- internal/collector/process/types.go | 37 +-- internal/pdh/registry/collector.go | 7 +- 7 files changed, 93 insertions(+), 400 deletions(-) delete mode 100644 internal/collector/process/process_v1.go rename internal/collector/process/{process_v2.go => process_worker.go} (84%) diff --git a/.idea/go.imports.xml b/.idea/go.imports.xml index 8e1cbe42..d7202f03 100644 --- a/.idea/go.imports.xml +++ b/.idea/go.imports.xml @@ -4,6 +4,7 @@ diff --git a/docs/collector.process.md b/docs/collector.process.md index 30b59bd8..a7d1a203 100644 --- a/docs/collector.process.md +++ b/docs/collector.process.md @@ -37,6 +37,11 @@ Enables IIS process name queries. IIS process names are combined with their app Disabled by default, and can be enabled with `--collector.process.iis`. NOTE: Just plain parameter without `true`. +### `--collector.process.counter-version` + +Selects the performance counter object the collector reads: `1` for `Process` (V1), `2` for `Process V2`. +Defaults to `0`, which uses `Process V2` and falls back to `Process` if `Process V2` is not available. + ### Example To match all firefox processes: `--collector.process.include="firefox.*"`. 
diff --git a/internal/collector/process/process.go b/internal/collector/process/process.go index c7b8f64a..1e1dc9b0 100644 --- a/internal/collector/process/process.go +++ b/internal/collector/process/process.go @@ -18,6 +18,7 @@ package process import ( + "context" "errors" "fmt" "log/slog" @@ -31,6 +32,7 @@ import ( "github.com/prometheus-community/windows_exporter/internal/mi" "github.com/prometheus-community/windows_exporter/internal/pdh" "github.com/prometheus-community/windows_exporter/internal/pdh/registry" + pdhtypes "github.com/prometheus-community/windows_exporter/internal/pdh/types" "github.com/prometheus-community/windows_exporter/internal/types" "github.com/prometheus/client_golang/prometheus" "golang.org/x/sys/windows" @@ -42,6 +44,7 @@ type Config struct { ProcessInclude *regexp.Regexp `yaml:"include"` ProcessExclude *regexp.Regexp `yaml:"exclude"` EnableWorkerProcess bool `yaml:"iis"` + CounterVersion uint8 `yaml:"counter-version"` } //nolint:gochecknoglobals @@ -49,6 +52,7 @@ var ConfigDefaults = Config{ ProcessInclude: types.RegExpAny, ProcessExclude: types.RegExpEmpty, EnableWorkerProcess: false, + CounterVersion: 0, } type Collector struct { @@ -59,10 +63,9 @@ type Collector struct { miSession *mi.Session workerProcessMIQueryQuery mi.Query - collectorVersion int - - collectorV1 - collectorV2 + perfDataCollector pdhtypes.Collector + perfDataObject []perfDataCounterValues + workerCh chan processWorkerRequest lookupCache sync.Map @@ -130,6 +133,11 @@ func NewWithFlags(app *kingpin.Application) *Collector { "Enable IIS collectWorker process name queries. May cause the collector to leak memory.", ).Default(strconv.FormatBool(c.config.EnableWorkerProcess)).BoolVar(&c.config.EnableWorkerProcess) + app.Flag( + "collector.process.counter-version", + "Version of the process collector to use. 1 for Process V1, 2 for Process V2. 
Defaults to 0 which will use the latest version available.", + ).Default(strconv.FormatUint(uint64(c.config.CounterVersion), 10)).Uint8Var(&c.config.CounterVersion) + app.Action(func(*kingpin.ParseContext) error { var err error @@ -157,8 +165,12 @@ func (c *Collector) Close() error { c.mu.Lock() defer c.mu.Unlock() - c.closeV1() - c.closeV2() + c.perfDataCollector.Close() + + if c.workerCh != nil { + close(c.workerCh) + c.workerCh = nil + } return nil } @@ -166,42 +178,47 @@ func (c *Collector) Close() error { func (c *Collector) Build(logger *slog.Logger, miSession *mi.Session) error { c.logger = logger.With(slog.String("collector", Name)) - if miSession == nil { - return errors.New("miSession is nil") + var err error + + if c.config.EnableWorkerProcess { + if miSession == nil { + return errors.New("miSession is nil") + } + + miQuery, err := mi.NewQuery("SELECT AppPoolName, ProcessId FROM WorkerProcess") + if err != nil { + return fmt.Errorf("failed to create WMI query: %w", err) + } + + c.workerProcessMIQueryQuery = miQuery + c.miSession = miSession } - miQuery, err := mi.NewQuery("SELECT AppPoolName, ProcessId FROM WorkerProcess") - if err != nil { - return fmt.Errorf("failed to create WMI query: %w", err) - } + switch c.config.CounterVersion { + case 2: + c.perfDataCollector, err = pdh.NewCollector[perfDataCounterValues](pdh.CounterTypeRaw, "Process V2", pdh.InstancesAll) + case 1: + c.perfDataCollector, err = registry.NewCollector[perfDataCounterValues]("Process", pdh.InstancesAll) + default: + c.perfDataCollector, err = pdh.NewCollector[perfDataCounterValues](pdh.CounterTypeRaw, "Process V2", pdh.InstancesAll) + c.config.CounterVersion = 2 - c.workerProcessMIQueryQuery = miQuery - c.miSession = miSession + if errors.Is(err, pdh.NewPdhError(pdh.CstatusNoObject)) { + c.perfDataCollector, err = registry.NewCollector[perfDataCounterValues]("Process", pdh.InstancesAll) + c.config.CounterVersion = 1 + } - c.collectorVersion = 2 - c.perfDataCollectorV2, err = 
pdh.NewCollector[perfDataCounterValuesV2](pdh.CounterTypeRaw, "Process V2", pdh.InstancesAll) - - if errors.Is(err, pdh.NewPdhError(pdh.CstatusNoObject)) { - c.collectorVersion = 1 - c.perfDataCollectorV1, err = registry.NewCollector[perfDataCounterValuesV1]("Process", pdh.InstancesAll) + c.logger.LogAttrs(context.Background(), slog.LevelDebug, fmt.Sprintf("Using process collector V%d", c.config.CounterVersion)) } if err != nil { - return fmt.Errorf("failed to create Process collector: %w", err) + return fmt.Errorf("failed to create Process V%d collector: %w", c.config.CounterVersion, err) } - if c.collectorVersion == 1 { - c.workerChV1 = make(chan processWorkerRequestV1, 32) + c.workerCh = make(chan processWorkerRequest, 32) - for range 4 { - go c.collectWorkerV1() - } - } else { - c.workerChV2 = make(chan processWorkerRequestV2, 32) - - for range 4 { - go c.collectWorkerV2() - } + for range 4 { + go c.collectWorker() } c.mu = sync.RWMutex{} @@ -320,18 +337,7 @@ func (c *Collector) Build(logger *slog.Logger, miSession *mi.Session) error { } func (c *Collector) Collect(ch chan<- prometheus.Metric) error { - var workerProcesses []WorkerProcess - if c.config.EnableWorkerProcess { - if err := c.miSession.Query(&workerProcesses, mi.NamespaceRootWebAdministration, c.workerProcessMIQueryQuery); err != nil { - return fmt.Errorf("WMI query failed: %w", err) - } - } - - if c.collectorVersion == 1 { - return c.collectV1(ch, workerProcesses) - } - - return c.collectV2(ch, workerProcesses) + return c.collect(ch) } // ref: https://github.com/microsoft/hcsshim/blob/8beabacfc2d21767a07c20f8dd5f9f3932dbf305/internal/uvm/stats.go#L25 diff --git a/internal/collector/process/process_v1.go b/internal/collector/process/process_v1.go deleted file mode 100644 index 7ee4f659..00000000 --- a/internal/collector/process/process_v1.go +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// -// Copyright The Prometheus Authors -// Licensed under the Apache License, Version 
2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build windows - -package process - -import ( - "context" - "fmt" - "log/slog" - "runtime/debug" - "strconv" - "strings" - "sync" - - "github.com/prometheus-community/windows_exporter/internal/pdh/registry" - "github.com/prometheus/client_golang/prometheus" -) - -type collectorV1 struct { - perfDataCollectorV1 *registry.Collector - perfDataObjectV1 []perfDataCounterValuesV1 - workerChV1 chan processWorkerRequestV1 -} - -type processWorkerRequestV1 struct { - ch chan<- prometheus.Metric - name string - performanceCounterValues perfDataCounterValuesV1 - waitGroup *sync.WaitGroup - workerProcesses []WorkerProcess -} - -func (c *Collector) closeV1() { - c.perfDataCollectorV1.Close() - - if c.workerChV1 != nil { - close(c.workerChV1) - c.workerChV1 = nil - } -} - -func (c *Collector) collectV1(ch chan<- prometheus.Metric, workerProcesses []WorkerProcess) error { - err := c.perfDataCollectorV1.Collect(&c.perfDataObjectV1) - if err != nil { - return fmt.Errorf("failed to collect metrics: %w", err) - } - - wg := &sync.WaitGroup{} - - for _, process := range c.perfDataObjectV1 { - // Duplicate processes are suffixed #, and an index number. Remove those. 
- name, _, _ := strings.Cut(process.Name, ":") // Process V1 - - if c.config.ProcessExclude.MatchString(name) || !c.config.ProcessInclude.MatchString(name) { - continue - } - - wg.Add(1) - - c.workerChV1 <- processWorkerRequestV1{ - ch: ch, - name: name, - performanceCounterValues: process, - workerProcesses: workerProcesses, - waitGroup: wg, - } - } - - wg.Wait() - - return nil -} - -func (c *Collector) collectWorkerV1() { - defer func() { - if r := recover(); r != nil { - c.logger.Error("Worker panic", - slog.Any("panic", r), - slog.String("stack", string(debug.Stack())), - ) - - // Restart the collectWorker - go c.collectWorkerV1() - } - }() - - for req := range c.workerChV1 { - (func() { - defer req.waitGroup.Done() - - ch := req.ch - name := req.name - data := req.performanceCounterValues - - pid := uint64(data.IdProcess) - parentPID := strconv.FormatUint(uint64(data.CreatingProcessID), 10) - - if c.config.EnableWorkerProcess { - for _, wp := range req.workerProcesses { - if wp.ProcessId == pid { - name = strings.Join([]string{name, wp.AppPoolName}, "_") - - break - } - } - } - - cmdLine, processOwner, processGroupID, err := c.getProcessInformation(uint32(pid)) - if err != nil { - slog.LogAttrs(context.Background(), slog.LevelDebug, "Failed to get process information", - slog.Uint64("pid", pid), - slog.Any("err", err), - ) - } - - pidString := strconv.FormatUint(pid, 10) - - ch <- prometheus.MustNewConstMetric( - c.info, - prometheus.GaugeValue, - 1.0, - name, pidString, parentPID, strconv.Itoa(int(processGroupID)), processOwner, cmdLine, - ) - - ch <- prometheus.MustNewConstMetric( - c.startTime, - prometheus.GaugeValue, - data.ElapsedTime, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.startTimeOld, - prometheus.GaugeValue, - data.ElapsedTime, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.handleCount, - prometheus.GaugeValue, - data.HandleCount, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - 
c.cpuTimeTotal, - prometheus.CounterValue, - data.PercentPrivilegedTime, - name, pidString, "privileged", - ) - - ch <- prometheus.MustNewConstMetric( - c.cpuTimeTotal, - prometheus.CounterValue, - data.PercentUserTime, - name, pidString, "user", - ) - - ch <- prometheus.MustNewConstMetric( - c.ioBytesTotal, - prometheus.CounterValue, - data.IoOtherBytesPerSec, - name, pidString, "other", - ) - - ch <- prometheus.MustNewConstMetric( - c.ioOperationsTotal, - prometheus.CounterValue, - data.IoOtherOperationsPerSec, - name, pidString, "other", - ) - - ch <- prometheus.MustNewConstMetric( - c.ioBytesTotal, - prometheus.CounterValue, - data.IoReadBytesPerSec, - name, pidString, "read", - ) - - ch <- prometheus.MustNewConstMetric( - c.ioOperationsTotal, - prometheus.CounterValue, - data.IoReadOperationsPerSec, - name, pidString, "read", - ) - - ch <- prometheus.MustNewConstMetric( - c.ioBytesTotal, - prometheus.CounterValue, - data.IoWriteBytesPerSec, - name, pidString, "write", - ) - - ch <- prometheus.MustNewConstMetric( - c.ioOperationsTotal, - prometheus.CounterValue, - data.IoWriteOperationsPerSec, - name, pidString, "write", - ) - - ch <- prometheus.MustNewConstMetric( - c.pageFaultsTotal, - prometheus.CounterValue, - data.PageFaultsPerSec, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.pageFileBytes, - prometheus.GaugeValue, - data.PageFileBytes, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.poolBytes, - prometheus.GaugeValue, - data.PoolNonPagedBytes, - name, pidString, "nonpaged", - ) - - ch <- prometheus.MustNewConstMetric( - c.poolBytes, - prometheus.GaugeValue, - data.PoolPagedBytes, - name, pidString, "paged", - ) - - ch <- prometheus.MustNewConstMetric( - c.priorityBase, - prometheus.GaugeValue, - data.PriorityBase, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.privateBytes, - prometheus.GaugeValue, - data.PrivateBytes, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - 
c.threadCount, - prometheus.GaugeValue, - data.ThreadCount, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.virtualBytes, - prometheus.GaugeValue, - data.VirtualBytes, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.workingSetPrivate, - prometheus.GaugeValue, - data.WorkingSetPrivate, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.workingSetPeak, - prometheus.GaugeValue, - data.WorkingSetPeak, - name, pidString, - ) - - ch <- prometheus.MustNewConstMetric( - c.workingSet, - prometheus.GaugeValue, - data.WorkingSet, - name, pidString, - ) - })() - } -} diff --git a/internal/collector/process/process_v2.go b/internal/collector/process/process_worker.go similarity index 84% rename from internal/collector/process/process_v2.go rename to internal/collector/process/process_worker.go index e6364219..b98a3a7c 100644 --- a/internal/collector/process/process_v2.go +++ b/internal/collector/process/process_worker.go @@ -27,52 +27,57 @@ import ( "sync" "time" - "github.com/prometheus-community/windows_exporter/internal/pdh" + "github.com/prometheus-community/windows_exporter/internal/mi" "github.com/prometheus/client_golang/prometheus" ) -type collectorV2 struct { - perfDataCollectorV2 *pdh.Collector - perfDataObjectV2 []perfDataCounterValuesV2 - workerChV2 chan processWorkerRequestV2 -} - -type processWorkerRequestV2 struct { +type processWorkerRequest struct { ch chan<- prometheus.Metric name string - performanceCounterValues perfDataCounterValuesV2 + performanceCounterValues perfDataCounterValues waitGroup *sync.WaitGroup workerProcesses []WorkerProcess } -func (c *Collector) closeV2() { - c.perfDataCollectorV2.Close() - - if c.workerChV2 != nil { - close(c.workerChV2) - c.workerChV2 = nil - } -} - -func (c *Collector) collectV2(ch chan<- prometheus.Metric, workerProcesses []WorkerProcess) error { - err := c.perfDataCollectorV2.Collect(&c.perfDataObjectV2) +func (c *Collector) collect(ch chan<- prometheus.Metric) 
error { + err := c.perfDataCollector.Collect(&c.perfDataObject) if err != nil { return fmt.Errorf("failed to collect metrics: %w", err) } + var workerProcesses []WorkerProcess + if c.config.EnableWorkerProcess { + if err := c.miSession.Query(&workerProcesses, mi.NamespaceRootWebAdministration, c.workerProcessMIQueryQuery); err != nil { + return fmt.Errorf("WMI query failed: %w", err) + } + } + wg := &sync.WaitGroup{} - for _, process := range c.perfDataObjectV2 { + for _, process := range c.perfDataObject { // Duplicate processes are suffixed #, and an index number. Remove those. name, _, _ := strings.Cut(process.Name, ":") // Process V2 + // Duplicate processes are suffixed #, and an index number. Remove those. + name, _, _ = strings.Cut(name, "#") // Process V1 + if c.config.ProcessExclude.MatchString(name) || !c.config.ProcessInclude.MatchString(name) { continue } + if process.ProcessID == 0 && name != "Idle" { + c.logger.LogAttrs(context.Background(), slog.LevelDebug, "Skipping process with PID 0", + slog.String("name", name), + slog.String("process_name", process.Name), + slog.Any("process", fmt.Sprintf("%+v", process)), + ) + + continue + } + wg.Add(1) - c.workerChV2 <- processWorkerRequestV2{ + c.workerCh <- processWorkerRequest{ ch: ch, name: name, performanceCounterValues: process, @@ -86,7 +91,7 @@ func (c *Collector) collectV2(ch chan<- prometheus.Metric, workerProcesses []Wor return nil } -func (c *Collector) collectWorkerV2() { +func (c *Collector) collectWorker() { defer func() { if r := recover(); r != nil { c.logger.Error("Worker panic", @@ -95,11 +100,11 @@ func (c *Collector) collectWorkerV2() { ) // Restart the collectWorker - go c.collectWorkerV2() + go c.collectWorker() } }() - for req := range c.workerChV2 { + for req := range c.workerCh { (func() { defer req.waitGroup.Done() diff --git a/internal/collector/process/types.go b/internal/collector/process/types.go index c8527b8a..7ee77307 100644 --- a/internal/collector/process/types.go +++ 
b/internal/collector/process/types.go @@ -22,7 +22,7 @@ type WorkerProcess struct { ProcessId uint64 `mi:"ProcessId"` } -type perfDataCounterValuesV1 struct { +type perfDataCounterValues struct { Name string PercentProcessorTime float64 `perfdata:"% Processor Time"` @@ -52,38 +52,5 @@ type perfDataCounterValuesV1 struct { WorkingSetPrivate float64 `perfdata:"Working Set - Private"` WorkingSetPeak float64 `perfdata:"Working Set Peak"` WorkingSet float64 `perfdata:"Working Set"` - IdProcess float64 `perfdata:"ID Process"` -} - -type perfDataCounterValuesV2 struct { - Name string - - PercentProcessorTime float64 `perfdata:"% Processor Time"` - PercentPrivilegedTime float64 `perfdata:"% Privileged Time"` - PercentUserTime float64 `perfdata:"% User Time"` - CreatingProcessID float64 `perfdata:"Creating Process ID"` - ElapsedTime float64 `perfdata:"Elapsed Time"` - HandleCount float64 `perfdata:"Handle Count"` - IoDataBytesPerSec float64 `perfdata:"IO Data Bytes/sec"` - IoDataOperationsPerSec float64 `perfdata:"IO Data Operations/sec"` - IoOtherBytesPerSec float64 `perfdata:"IO Other Bytes/sec"` - IoOtherOperationsPerSec float64 `perfdata:"IO Other Operations/sec"` - IoReadBytesPerSec float64 `perfdata:"IO Read Bytes/sec"` - IoReadOperationsPerSec float64 `perfdata:"IO Read Operations/sec"` - IoWriteBytesPerSec float64 `perfdata:"IO Write Bytes/sec"` - IoWriteOperationsPerSec float64 `perfdata:"IO Write Operations/sec"` - PageFaultsPerSec float64 `perfdata:"Page Faults/sec"` - PageFileBytesPeak float64 `perfdata:"Page File Bytes Peak"` - PageFileBytes float64 `perfdata:"Page File Bytes"` - PoolNonPagedBytes float64 `perfdata:"Pool Nonpaged Bytes"` - PoolPagedBytes float64 `perfdata:"Pool Paged Bytes"` - PriorityBase float64 `perfdata:"Priority Base"` - PrivateBytes float64 `perfdata:"Private Bytes"` - ThreadCount float64 `perfdata:"Thread Count"` - VirtualBytesPeak float64 `perfdata:"Virtual Bytes Peak"` - VirtualBytes float64 `perfdata:"Virtual Bytes"` - 
WorkingSetPrivate float64 `perfdata:"Working Set - Private"` - WorkingSetPeak float64 `perfdata:"Working Set Peak"` - WorkingSet float64 `perfdata:"Working Set"` - ProcessID float64 `perfdata:"Process ID"` + ProcessID float64 `perfdata:"Process ID" perfdata_v1:"ID Process"` } diff --git a/internal/pdh/registry/collector.go b/internal/pdh/registry/collector.go index 4751c3fd..e232a8fd 100644 --- a/internal/pdh/registry/collector.go +++ b/internal/pdh/registry/collector.go @@ -63,9 +63,12 @@ func NewCollector[T any](object string, _ []string) (*Collector, error) { } for _, f := range reflect.VisibleFields(valueType) { - counterName, ok := f.Tag.Lookup("perfdata") + counterName, ok := f.Tag.Lookup("perfdata_v1") if !ok { - continue + counterName, ok = f.Tag.Lookup("perfdata") + if !ok { + continue + } } var counter Counter