[Management/Client] Trigger debug bundle runs from API/Dashboard (#4592) (#4832)

This PR adds the ability to trigger debug bundle generation remotely from the Management API/Dashboard.
This commit is contained in:
Zoltan Papp
2026-01-19 11:22:16 +01:00
committed by GitHub
parent 245481f33b
commit 58daa674ef
61 changed files with 3657 additions and 1239 deletions

View File

@@ -0,0 +1,59 @@
package job
import (
"context"
"errors"
"fmt"
"sync"
"time"
)
// todo consider the channel buffer size when we allow to run multiple jobs
const jobChannelBuffer = 1
var (
ErrJobChannelClosed = errors.New("job channel closed")
)
type Channel struct {
events chan *Event
once sync.Once
}
func NewChannel() *Channel {
jc := &Channel{
events: make(chan *Event, jobChannelBuffer),
}
return jc
}
func (jc *Channel) AddEvent(ctx context.Context, responseWait time.Duration, event *Event) error {
select {
case <-ctx.Done():
return ctx.Err()
// todo: timeout is handled in the wrong place. If the peer does not respond with the job response, the server does not clean it up from the pending jobs and cannot apply a new job
case <-time.After(responseWait):
return fmt.Errorf("failed to add the event to the channel")
case jc.events <- event:
}
return nil
}
func (jc *Channel) Close() {
jc.once.Do(func() {
close(jc.events)
})
}
func (jc *Channel) Event(ctx context.Context) (*Event, error) {
select {
case <-ctx.Done():
return nil, ctx.Err()
case job, open := <-jc.events:
if !open {
return nil, ErrJobChannelClosed
}
return job, nil
}
}

View File

@@ -0,0 +1,182 @@
package job
import (
"context"
"fmt"
"sync"
"time"
log "github.com/sirupsen/logrus"
"github.com/netbirdio/netbird/management/internals/modules/peers"
"github.com/netbirdio/netbird/management/server/store"
"github.com/netbirdio/netbird/management/server/telemetry"
"github.com/netbirdio/netbird/management/server/types"
"github.com/netbirdio/netbird/shared/management/proto"
)
type Event struct {
PeerID string
Request *proto.JobRequest
Response *proto.JobResponse
}
type Manager struct {
mu *sync.RWMutex
jobChannels map[string]*Channel // per-peer job streams
pending map[string]*Event // jobID → event
responseWait time.Duration
metrics telemetry.AppMetrics
Store store.Store
peersManager peers.Manager
}
func NewJobManager(metrics telemetry.AppMetrics, store store.Store, peersManager peers.Manager) *Manager {
return &Manager{
jobChannels: make(map[string]*Channel),
pending: make(map[string]*Event),
responseWait: 5 * time.Minute,
metrics: metrics,
mu: &sync.RWMutex{},
Store: store,
peersManager: peersManager,
}
}
// CreateJobChannel creates or replaces a channel for a peer
func (jm *Manager) CreateJobChannel(ctx context.Context, accountID, peerID string) *Channel {
// all pending jobs stored in db for this peer should be failed
if err := jm.Store.MarkAllPendingJobsAsFailed(ctx, accountID, peerID, "Pending job cleanup: marked as failed automatically due to being stuck too long"); err != nil {
log.WithContext(ctx).Error(err.Error())
}
jm.mu.Lock()
defer jm.mu.Unlock()
if ch, ok := jm.jobChannels[peerID]; ok {
ch.Close()
delete(jm.jobChannels, peerID)
}
ch := NewChannel()
jm.jobChannels[peerID] = ch
return ch
}
// SendJob sends a job to a peer and tracks it as pending
func (jm *Manager) SendJob(ctx context.Context, accountID, peerID string, req *proto.JobRequest) error {
jm.mu.RLock()
ch, ok := jm.jobChannels[peerID]
jm.mu.RUnlock()
if !ok {
return fmt.Errorf("peer %s has no channel", peerID)
}
event := &Event{
PeerID: peerID,
Request: req,
}
jm.mu.Lock()
jm.pending[string(req.ID)] = event
jm.mu.Unlock()
if err := ch.AddEvent(ctx, jm.responseWait, event); err != nil {
jm.cleanup(ctx, accountID, string(req.ID), err.Error())
return err
}
return nil
}
// HandleResponse marks a job as finished and moves it to completed
func (jm *Manager) HandleResponse(ctx context.Context, resp *proto.JobResponse, peerKey string) error {
jm.mu.Lock()
defer jm.mu.Unlock()
// todo: validate job ID and would be nice to use uuid text marshal instead of string
jobID := string(resp.ID)
// todo: in this map has jobs for all peers in any account. Consider to validate the jobID association for the peer
event, ok := jm.pending[jobID]
if !ok {
return fmt.Errorf("job %s not found", jobID)
}
var job types.Job
// todo: ApplyResponse should be static. Any member value is unusable in this way
if err := job.ApplyResponse(resp); err != nil {
return fmt.Errorf("invalid job response: %v", err)
}
peerID, err := jm.peersManager.GetPeerID(ctx, peerKey)
if err != nil {
return fmt.Errorf("failed to get peer ID: %v", err)
}
if peerID != event.PeerID {
return fmt.Errorf("peer ID mismatch: %s != %s", peerID, event.PeerID)
}
// update or create the store for job response
err = jm.Store.CompletePeerJob(ctx, &job)
if err != nil {
return fmt.Errorf("failed to complete job %s: %v", jobID, err)
}
delete(jm.pending, jobID)
return nil
}
// CloseChannel closes a peers channel and cleans up its jobs
func (jm *Manager) CloseChannel(ctx context.Context, accountID, peerID string) {
jm.mu.Lock()
defer jm.mu.Unlock()
if ch, ok := jm.jobChannels[peerID]; ok {
ch.Close()
delete(jm.jobChannels, peerID)
}
for jobID, ev := range jm.pending {
if ev.PeerID == peerID {
// if the client disconnect and there is pending job then mark it as failed
if err := jm.Store.MarkPendingJobsAsFailed(ctx, accountID, peerID, jobID, "Time out peer disconnected"); err != nil {
log.WithContext(ctx).Errorf("failed to mark pending jobs as failed: %v", err)
}
delete(jm.pending, jobID)
}
}
}
// cleanup removes a pending job safely
func (jm *Manager) cleanup(ctx context.Context, accountID, jobID string, reason string) {
jm.mu.Lock()
defer jm.mu.Unlock()
if ev, ok := jm.pending[jobID]; ok {
if err := jm.Store.MarkPendingJobsAsFailed(ctx, accountID, ev.PeerID, jobID, reason); err != nil {
log.WithContext(ctx).Errorf("failed to mark pending jobs as failed: %v", err)
}
delete(jm.pending, jobID)
}
}
func (jm *Manager) IsPeerConnected(peerID string) bool {
jm.mu.RLock()
defer jm.mu.RUnlock()
_, ok := jm.jobChannels[peerID]
return ok
}
func (jm *Manager) IsPeerHasPendingJobs(peerID string) bool {
jm.mu.RLock()
defer jm.mu.RUnlock()
for _, ev := range jm.pending {
if ev.PeerID == peerID {
return true
}
}
return false
}