// File: netbird/proxy/internal/accesslog/logger.go
package accesslog
import (
"context"
"net/netip"
"sync"
"sync/atomic"
"time"
"github.com/rs/xid"
log "github.com/sirupsen/logrus"
"google.golang.org/grpc"
"google.golang.org/protobuf/types/known/timestamppb"
"github.com/netbirdio/netbird/proxy/auth"
"github.com/netbirdio/netbird/proxy/internal/types"
"github.com/netbirdio/netbird/shared/management/proto"
)
const (
requestThreshold = 10000 // Log every 10k requests
bytesThreshold = 1024 * 1024 * 1024 // Log every 1GB
usageCleanupPeriod = 1 * time.Hour // Clean up stale counters every hour
usageInactiveWindow = 24 * time.Hour // Consider domain inactive if no traffic for 24 hours
// logSendTimeout bounds each background SendAccessLog gRPC call.
logSendTimeout = 10 * time.Second
// denyCooldown is the min interval between deny log entries per service+reason
// to prevent flooding from denied connections (e.g. UDP packets from blocked IPs).
denyCooldown = 10 * time.Second
// maxDenyBuckets caps tracked deny rate-limit entries to bound memory under DDoS.
maxDenyBuckets = 10000
// maxLogWorkers caps concurrent gRPC send goroutines.
maxLogWorkers = 4096
)
// domainUsage accumulates per-domain request and byte counters between
// threshold log emissions. Access is guarded by Logger.usageMux.
type domainUsage struct {
requestCount int64 // requests since requestStartTime; reset when requestThreshold is hit
requestStartTime time.Time // start of the current request-count window
bytesTransferred int64 // bytes since bytesStartTime; reset when bytesThreshold is hit
bytesStartTime time.Time // start of the current byte-count window
lastActivity time.Time // Track last activity for cleanup
}
// gRPCClient is the single management-service method Logger needs to ship
// access log entries (consumer-side interface).
type gRPCClient interface {
SendAccessLog(ctx context.Context, in *proto.SendAccessLogRequest, opts ...grpc.CallOption) (*proto.SendAccessLogResponse, error)
}
// denyBucketKey identifies a rate-limited deny log stream.
// Comparable struct so it can be used directly as a map key.
type denyBucketKey struct {
ServiceID types.ServiceID // service the denied connection targeted
Reason string // deny reason string (e.g. "ip_restricted")
}
// denyBucket tracks rate-limited deny log entries.
// Access is guarded by Logger.denyMu.
type denyBucket struct {
lastLogged time.Time // time of the last emitted deny entry for this key
suppressed int64 // entries swallowed since lastLogged; reported once cooldown elapses
}
// Logger sends access log entries to the management server via gRPC.
type Logger struct {
client gRPCClient // management server client used for SendAccessLog
logger *log.Logger // local diagnostics logger
trustedProxies []netip.Prefix // proxy ranges trusted for X-Forwarded-For extraction
usageMux sync.Mutex // guards domainUsage
domainUsage map[string]*domainUsage
denyMu sync.Mutex // guards denyBuckets
denyBuckets map[denyBucketKey]*denyBucket
logSem chan struct{} // semaphore bounding concurrent send goroutines (cap maxLogWorkers)
cleanupCancel context.CancelFunc // stops the cleanupStaleUsage goroutine
dropped atomic.Int64 // total entries dropped because logSem was full
}
// NewLogger creates a new access log Logger. The trustedProxies parameter
// configures which upstream proxy IP ranges are trusted for extracting
// the real client IP from X-Forwarded-For headers. A nil logger falls back
// to the global logrus logger.
func NewLogger(client gRPCClient, logger *log.Logger, trustedProxies []netip.Prefix) *Logger {
	if logger == nil {
		logger = log.StandardLogger()
	}

	ctx, cancel := context.WithCancel(context.Background())
	al := &Logger{
		client:         client,
		logger:         logger,
		trustedProxies: trustedProxies,
		domainUsage:    map[string]*domainUsage{},
		denyBuckets:    map[denyBucketKey]*denyBucket{},
		logSem:         make(chan struct{}, maxLogWorkers),
		cleanupCancel:  cancel,
	}

	// Evict stale usage/deny entries in the background until Close cancels ctx.
	go al.cleanupStaleUsage(ctx)
	return al
}
// Close stops the cleanup routine. Should be called during graceful shutdown.
func (l *Logger) Close() {
	cancel := l.cleanupCancel
	if cancel == nil {
		return
	}
	cancel()
}
// logEntry is the internal, protocol-agnostic representation of one access
// log record; log() converts it to a proto.AccessLog for transmission.
type logEntry struct {
ID string // unique log id (xid)
AccountID types.AccountID
ServiceID types.ServiceID
Host string
Path string // HTTP only; empty for L4 entries
DurationMs int64
Method string // HTTP only
ResponseCode int32 // HTTP only
SourceIP netip.Addr
AuthMechanism string // auth method, or deny reason for denied L4 connections
UserID string // cleared in log() unless AuthMechanism is OIDC
AuthSuccess bool
BytesUpload int64
BytesDownload int64
Protocol Protocol
}
// Protocol identifies the transport protocol of an access log entry.
type Protocol string
// Protocol values sent verbatim in the AccessLog proto message.
const (
ProtocolHTTP Protocol = "http"
ProtocolTCP Protocol = "tcp"
ProtocolUDP Protocol = "udp"
ProtocolTLS Protocol = "tls"
)
// L4Entry holds the data for a layer-4 (TCP/UDP) access log entry.
type L4Entry struct {
AccountID types.AccountID
ServiceID types.ServiceID
Protocol Protocol // ProtocolTCP, ProtocolUDP, or ProtocolTLS
Host string // SNI hostname or listen address
SourceIP netip.Addr
DurationMs int64
BytesUpload int64
BytesDownload int64
// DenyReason, when non-empty, indicates the connection was denied.
// Values match the HTTP auth mechanism strings: "ip_restricted",
// "country_restricted", "geo_unavailable".
DenyReason string
}
// LogL4 sends an access log entry for a layer-4 connection (TCP or UDP).
// The call is non-blocking: the gRPC send happens in a background goroutine.
// Denied connections are rate-limited per service+reason and may be dropped.
func (l *Logger) LogL4(entry L4Entry) {
	rec := logEntry{
		ID:            xid.New().String(),
		AccountID:     entry.AccountID,
		ServiceID:     entry.ServiceID,
		Protocol:      entry.Protocol,
		Host:          entry.Host,
		SourceIP:      entry.SourceIP,
		DurationMs:    entry.DurationMs,
		BytesUpload:   entry.BytesUpload,
		BytesDownload: entry.BytesDownload,
	}

	if reason := entry.DenyReason; reason != "" {
		// Suppress flood-prone deny entries within the cooldown window.
		if !l.allowDenyLog(entry.ServiceID, reason) {
			return
		}
		rec.AuthMechanism = reason
		rec.AuthSuccess = false
	}

	l.log(rec)
	l.trackUsage(entry.Host, entry.BytesUpload+entry.BytesDownload)
}
// allowDenyLog rate-limits deny log entries per service+reason combination.
// It returns true when an entry for that stream may be emitted now.
func (l *Logger) allowDenyLog(serviceID types.ServiceID, reason string) bool {
	now := time.Now()
	key := denyBucketKey{ServiceID: serviceID, Reason: reason}

	l.denyMu.Lock()
	defer l.denyMu.Unlock()

	bucket, found := l.denyBuckets[key]
	switch {
	case !found:
		// New stream: admit it unless the bucket table is at capacity.
		if len(l.denyBuckets) >= maxDenyBuckets {
			return false
		}
		l.denyBuckets[key] = &denyBucket{lastLogged: now}
		return true
	case now.Sub(bucket.lastLogged) < denyCooldown:
		// Inside the cooldown window: count the entry and suppress it.
		bucket.suppressed++
		return false
	default:
		// Cooldown elapsed: report how many entries were swallowed, then reset.
		if bucket.suppressed > 0 {
			l.logger.Debugf("access restriction: suppressed %d deny log entries for %s (%s)", bucket.suppressed, serviceID, reason)
		}
		bucket.lastLogged = now
		bucket.suppressed = 0
		return true
	}
}
// log dispatches one access log entry to the management server asynchronously.
// Concurrency is bounded by logSem (maxLogWorkers goroutines); when all
// workers are busy the entry is dropped and counted in l.dropped.
func (l *Logger) log(entry logEntry) {
// Fire off the log request in a separate routine.
// This increases the possibility of losing a log message
// (although it should still get logged in the event of an error),
// but it will reduce latency returning the request in the
// middleware.
// There is also a chance that log messages will arrive at
// the server out of order; however, the timestamp should
// allow for resolving that on the server.
now := timestamppb.Now()
// Non-blocking semaphore acquire: drop the entry rather than spawn an
// unbounded number of goroutines when the server is slow.
select {
case l.logSem <- struct{}{}:
default:
total := l.dropped.Add(1)
l.logger.Debugf("access log send dropped: worker limit reached (total dropped: %d)", total)
return
}
go func() {
// Release the worker slot when the send finishes.
defer func() { <-l.logSem }()
logCtx, cancel := context.WithTimeout(context.Background(), logSendTimeout)
defer cancel()
// Only OIDC sessions have a meaningful user identity.
if entry.AuthMechanism != auth.MethodOIDC.String() {
entry.UserID = ""
}
// A zero/invalid source address is sent as an empty string.
var sourceIP string
if entry.SourceIP.IsValid() {
sourceIP = entry.SourceIP.String()
}
if _, err := l.client.SendAccessLog(logCtx, &proto.SendAccessLogRequest{
Log: &proto.AccessLog{
LogId: entry.ID,
AccountId: string(entry.AccountID),
Timestamp: now,
ServiceId: string(entry.ServiceID),
Host: entry.Host,
Path: entry.Path,
DurationMs: entry.DurationMs,
Method: entry.Method,
ResponseCode: entry.ResponseCode,
SourceIp: sourceIP,
AuthMechanism: entry.AuthMechanism,
UserId: entry.UserID,
AuthSuccess: entry.AuthSuccess,
BytesUpload: entry.BytesUpload,
BytesDownload: entry.BytesDownload,
Protocol: string(entry.Protocol),
},
}); err != nil {
// Send failed: surface the full entry locally so it is not lost silently.
l.logger.WithFields(log.Fields{
"service_id": entry.ServiceID,
"host": entry.Host,
"path": entry.Path,
"duration": entry.DurationMs,
"method": entry.Method,
"response_code": entry.ResponseCode,
"source_ip": sourceIP,
"auth_mechanism": entry.AuthMechanism,
"user_id": entry.UserID,
"auth_success": entry.AuthSuccess,
"error": err,
}).Error("Error sending access log on gRPC connection")
}
}()
}
// trackUsage records request and byte counts per domain, logging when thresholds are hit.
// Entries with an empty domain are ignored.
func (l *Logger) trackUsage(domain string, bytesTransferred int64) {
	if domain == "" {
		return
	}

	l.usageMux.Lock()
	defer l.usageMux.Unlock()

	now := time.Now()
	u := l.domainUsage[domain]
	if u == nil {
		u = &domainUsage{
			requestStartTime: now,
			bytesStartTime:   now,
			lastActivity:     now,
		}
		l.domainUsage[domain] = u
	}
	u.lastActivity = now

	// Request-count window.
	u.requestCount++
	if u.requestCount >= requestThreshold {
		window := time.Since(u.requestStartTime)
		l.logger.WithFields(log.Fields{
			"domain":   domain,
			"requests": u.requestCount,
			"duration": window.String(),
		}).Infof("domain %s had %d requests over %s", domain, u.requestCount, window)
		u.requestCount = 0
		u.requestStartTime = now
	}

	// Byte-count window.
	u.bytesTransferred += bytesTransferred
	if u.bytesTransferred >= bytesThreshold {
		window := time.Since(u.bytesStartTime)
		gb := float64(u.bytesTransferred) / (1024 * 1024 * 1024)
		l.logger.WithFields(log.Fields{
			"domain":   domain,
			"bytes":    u.bytesTransferred,
			"bytes_gb": gb,
			"duration": window.String(),
		}).Infof("domain %s transferred %.2f GB over %s", domain, gb, window)
		u.bytesTransferred = 0
		u.bytesStartTime = now
	}
}
// cleanupStaleUsage removes usage and deny-rate-limit entries that have been
// inactive. It runs until ctx is cancelled (via Close).
func (l *Logger) cleanupStaleUsage(ctx context.Context) {
	ticker := time.NewTicker(usageCleanupPeriod)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			now := time.Now()
			l.cleanupDomainUsage(now)
			l.cleanupDenyBuckets(now)
		case <-ctx.Done():
			return
		}
	}
}
// cleanupDomainUsage deletes domain usage counters whose last activity is
// older than usageInactiveWindow relative to now.
func (l *Logger) cleanupDomainUsage(now time.Time) {
	l.usageMux.Lock()
	defer l.usageMux.Unlock()

	var evicted int
	for domain, u := range l.domainUsage {
		if now.Sub(u.lastActivity) <= usageInactiveWindow {
			continue
		}
		delete(l.domainUsage, domain)
		evicted++
	}
	if evicted > 0 {
		l.logger.Debugf("cleaned up %d stale domain usage entries", evicted)
	}
}
// cleanupDenyBuckets deletes deny rate-limit buckets that have not logged
// anything within usageInactiveWindow relative to now.
func (l *Logger) cleanupDenyBuckets(now time.Time) {
	l.denyMu.Lock()
	defer l.denyMu.Unlock()

	var evicted int
	for key, b := range l.denyBuckets {
		if now.Sub(b.lastLogged) <= usageInactiveWindow {
			continue
		}
		delete(l.denyBuckets, key)
		evicted++
	}
	if evicted > 0 {
		l.logger.Debugf("cleaned up %d stale deny rate-limit entries", evicted)
	}
}