mirror of
https://github.com/netbirdio/netbird.git
synced 2026-07-02 12:49:54 +00:00
Compare commits
24 Commits
peer-acl-m
...
fix-config
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
641ae6f3a1 | ||
|
|
96338dccdb | ||
|
|
1d8b5f6e5c | ||
|
|
7d4736de55 | ||
|
|
06839a4731 | ||
|
|
eb422a5cd3 | ||
|
|
0aa0f7c76b | ||
|
|
7c0d8cbae0 | ||
|
|
2ab99eefa6 | ||
|
|
ff04ffb534 | ||
|
|
980598ed4a | ||
|
|
92a66cdd20 | ||
|
|
3be90f06b2 | ||
|
|
4ef65294e9 | ||
|
|
5b5f11740a | ||
|
|
3de889d529 | ||
|
|
04c3d19032 | ||
|
|
3f1fb3b52d | ||
|
|
b434cda062 | ||
|
|
0b594c639a | ||
|
|
deff8af59f | ||
|
|
5711f0e38c | ||
|
|
1409a1325a | ||
|
|
4400372f37 |
68
.github/workflows/agent-network-e2e.yml
vendored
Normal file
68
.github/workflows/agent-network-e2e.yml
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
name: Agent Network E2E

on:
  # Nightly at 03:00 UTC, plus on demand from the Actions tab.
  schedule:
    - cron: "0 3 * * *"
  workflow_dispatch:

# Only one run per workflow/ref at a time; a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Least privilege for GITHUB_TOKEN: the steps below check out and build the
# code but do not push, comment, or publish anything through the token.
permissions:
  contents: read

jobs:
  e2e:
    name: Agent Network E2E
    runs-on: ubuntu-latest
    # Job-level ceiling; the go test invocation below carries its own 40m timeout.
    timeout-minutes: 45
    steps:
      - name: Checkout code
        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
        with:
          go-version-file: "go.mod"

      # Container-driver builder so the harness can build the combined/proxy/
      # client images from source with a local layer cache.
      - name: Set up Buildx
        uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0

      # Persist the Docker layer cache across runs. This caches the base, apt,
      # and go-mod-download layers; the Go compile still re-runs, as BuildKit
      # mount caches cannot be exported to the GitHub cache.
      - name: Cache Docker layers
        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          path: /tmp/.buildx-cache
          key: ${{ runner.os }}-anet-e2e-buildx-${{ hashFiles('go.sum', 'combined/Dockerfile.multistage', 'proxy/Dockerfile.multistage', 'e2e/harness/Dockerfile.client') }}
          restore-keys: |
            ${{ runner.os }}-anet-e2e-buildx-

      - name: Run agent-network e2e
        env:
          # Build the images from source (this branch's code) with the shared
          # local layer cache.
          NB_E2E_BUILDX_CACHE: /tmp/.buildx-cache
          # Provider credentials. Each provider scenario skips if its
          # token (and URL, for gateways) is unset, so partial coverage is fine.
          OPENAI_TOKEN: ${{ secrets.E2E_OPENAI_TOKEN }}
          ANTHROPIC_TOKEN: ${{ secrets.E2E_ANTHROPIC_TOKEN }}
          VERCEL_URL: ${{ secrets.E2E_VERCEL_URL }}
          VERCEL_TOKEN: ${{ secrets.E2E_VERCEL_TOKEN }}
          OPENROUTER_URL: ${{ secrets.E2E_OPENROUTER_URL }}
          OPENROUTER_TOKEN: ${{ secrets.E2E_OPENROUTER_TOKEN }}
          CLOUDFLARE_URL: ${{ secrets.E2E_CLOUDFLARE_URL }}
          CLOUDFLARE_TOKEN: ${{ secrets.E2E_CLOUDFLARE_TOKEN }}
          AWS_BEARER_TOKEN_BEDROCK: ${{ secrets.E2E_AWS_BEARER_TOKEN_BEDROCK }}
          AWS_REGION: ${{ secrets.E2E_AWS_REGION }}
          # Vertex (Anthropic-on-Vertex): SA + project required; region defaults
          # to "global", model to a pinned claude snapshot.
          GOOGLE_VERTEX_SA_BASE64: ${{ secrets.E2E_GOOGLE_VERTEX_SA_BASE64 }}
          GOOGLE_VERTEX_PROJECT: ${{ secrets.E2E_GOOGLE_VERTEX_PROJECT }}
          GOOGLE_VERTEX_REGION: ${{ secrets.E2E_GOOGLE_VERTEX_REGION }}
          GOOGLE_VERTEX_MODEL: ${{ secrets.E2E_GOOGLE_VERTEX_MODEL }}
        run: go test -tags e2e -timeout 40m -v ./e2e/...
|
||||
2
.github/workflows/golangci-lint.yml
vendored
2
.github/workflows/golangci-lint.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
- name: codespell
|
||||
uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579 # v2.2
|
||||
with:
|
||||
ignore_words_list: erro,clienta,hastable,iif,groupd,testin,groupe,cros,ans,deriver,te,userA,ede,additionals
|
||||
ignore_words_list: erro,clienta,hastable,iif,groupd,testin,groupe,cros,ans,deriver,te,userA,ede,additionals,flate,recordin,unparseable
|
||||
skip: go.mod,go.sum,**/proxy/web/**
|
||||
golangci:
|
||||
strategy:
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
<br/>
|
||||
<br/>
|
||||
<strong>
|
||||
🚀 <a href="https://careers.netbird.io">We are hiring! Join us at careers.netbird.io</a>
|
||||
🚀 <a href="https://netbird.io/careers">We are hiring! Join us at https://netbird.io/careers</a>
|
||||
</strong>
|
||||
</p>
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ var (
|
||||
EnvKeyNBForceRelay = peer.EnvKeyNBForceRelay
|
||||
|
||||
// EnvKeyNBLazyConn Exported for Android java client to configure lazy connection
|
||||
EnvKeyNBLazyConn = lazyconn.EnvEnableLazyConn
|
||||
EnvKeyNBLazyConn = lazyconn.EnvLazyConn
|
||||
|
||||
// EnvKeyNBInactivityThreshold Exported for Android java client to configure connection inactivity threshold
|
||||
EnvKeyNBInactivityThreshold = lazyconn.EnvInactivityThreshold
|
||||
|
||||
@@ -71,12 +71,14 @@ var (
|
||||
extraIFaceBlackList []string
|
||||
anonymizeFlag bool
|
||||
dnsRouteInterval time.Duration
|
||||
lazyConnEnabled bool
|
||||
mtu uint16
|
||||
profilesDisabled bool
|
||||
updateSettingsDisabled bool
|
||||
captureEnabled bool
|
||||
networksDisabled bool
|
||||
// lazyConnEnabled is the parse target for the deprecated --enable-lazy-connection
|
||||
// flag. The flag is inert; the value is no longer read (use NB_LAZY_CONN instead).
|
||||
lazyConnEnabled bool
|
||||
mtu uint16
|
||||
profilesDisabled bool
|
||||
updateSettingsDisabled bool
|
||||
captureEnabled bool
|
||||
networksDisabled bool
|
||||
|
||||
rootCmd = &cobra.Command{
|
||||
Use: "netbird",
|
||||
@@ -210,7 +212,8 @@ func init() {
|
||||
upCmd.PersistentFlags().BoolVar(&rosenpassEnabled, enableRosenpassFlag, false, "[Experimental] Enable Rosenpass feature. If enabled, the connection will be post-quantum secured via Rosenpass.")
|
||||
upCmd.PersistentFlags().BoolVar(&rosenpassPermissive, rosenpassPermissiveFlag, false, "[Experimental] Enable Rosenpass in permissive mode to allow this peer to accept WireGuard connections without requiring Rosenpass functionality from peers that do not have Rosenpass enabled.")
|
||||
upCmd.PersistentFlags().BoolVar(&autoConnectDisabled, disableAutoConnectFlag, false, "Disables auto-connect feature. If enabled, then the client won't connect automatically when the service starts.")
|
||||
upCmd.PersistentFlags().BoolVar(&lazyConnEnabled, enableLazyConnectionFlag, false, "[Experimental] Enable the lazy connection feature. If enabled, the client will establish connections on-demand. Note: this setting may be overridden by management configuration.")
|
||||
upCmd.PersistentFlags().BoolVar(&lazyConnEnabled, enableLazyConnectionFlag, false, "Deprecated: no longer used. Lazy connections are controlled by the server and the NB_LAZY_CONN environment variable.")
|
||||
_ = upCmd.PersistentFlags().MarkDeprecated(enableLazyConnectionFlag, "no longer used; lazy connections are controlled by the server and the NB_LAZY_CONN environment variable")
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -479,10 +479,6 @@ func setupSetConfigReq(customDNSAddressConverted []byte, cmd *cobra.Command, pro
|
||||
req.DisableIpv6 = &disableIPv6
|
||||
}
|
||||
|
||||
if cmd.Flag(enableLazyConnectionFlag).Changed {
|
||||
req.LazyConnectionEnabled = &lazyConnEnabled
|
||||
}
|
||||
|
||||
return &req
|
||||
}
|
||||
|
||||
@@ -600,9 +596,6 @@ func setupConfig(customDNSAddressConverted []byte, cmd *cobra.Command, configFil
|
||||
ic.DisableIPv6 = &disableIPv6
|
||||
}
|
||||
|
||||
if cmd.Flag(enableLazyConnectionFlag).Changed {
|
||||
ic.LazyConnectionEnabled = &lazyConnEnabled
|
||||
}
|
||||
return &ic, nil
|
||||
}
|
||||
|
||||
@@ -718,9 +711,6 @@ func setupLoginRequest(providedSetupKey string, customDNSAddressConverted []byte
|
||||
loginRequest.DisableIpv6 = &disableIPv6
|
||||
}
|
||||
|
||||
if cmd.Flag(enableLazyConnectionFlag).Changed {
|
||||
loginRequest.LazyConnectionEnabled = &lazyConnEnabled
|
||||
}
|
||||
return &loginRequest, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -136,6 +136,11 @@ func (p *ProxyBind) CloseConn() error {
|
||||
return p.close()
|
||||
}
|
||||
|
||||
// InjectPacket is a no-op for the userspace proxy: first-packet reinjection is kernel-only.
|
||||
func (p *ProxyBind) InjectPacket(_ []byte) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *ProxyBind) close() error {
|
||||
if p.remoteConn == nil {
|
||||
return nil
|
||||
|
||||
@@ -219,6 +219,17 @@ func (p *ProxyWrapper) RedirectAs(endpoint *net.UDPAddr) {
|
||||
p.pausedCond.L.Unlock()
|
||||
}
|
||||
|
||||
// InjectPacket writes b to the remote peer over the underlying transport.
|
||||
func (p *ProxyWrapper) InjectPacket(b []byte) error {
|
||||
if p.remoteConn == nil {
|
||||
return errors.New("proxy not started")
|
||||
}
|
||||
if _, err := p.remoteConn.Write(b); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CloseConn closes the remoteConn and automatically removes the conn instance from the map
|
||||
func (p *ProxyWrapper) CloseConn() error {
|
||||
if p.cancel == nil {
|
||||
|
||||
@@ -18,4 +18,9 @@ type Proxy interface {
|
||||
RedirectAs(endpoint *net.UDPAddr)
|
||||
CloseConn() error
|
||||
SetDisconnectListener(disconnected func())
|
||||
|
||||
// InjectPacket writes a raw packet directly to the remote peer over the underlying transport,
|
||||
// bypassing WireGuard. Used to replay the captured lazyconn handshake initiation. Only the
|
||||
// kernel-mode proxies act on it; the userspace proxy is a no-op since reinjection is kernel-only.
|
||||
InjectPacket(b []byte) error
|
||||
}
|
||||
|
||||
@@ -147,6 +147,17 @@ func (p *WGUDPProxy) RedirectAs(endpoint *net.UDPAddr) {
|
||||
p.sendPkg = p.srcFakerConn.SendPkg
|
||||
}
|
||||
|
||||
// InjectPacket writes b to the remote peer over the underlying transport.
|
||||
func (p *WGUDPProxy) InjectPacket(b []byte) error {
|
||||
if p.remoteConn == nil {
|
||||
return errors.New("proxy not started")
|
||||
}
|
||||
if _, err := p.remoteConn.Write(b); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CloseConn closes the localConn
|
||||
func (p *WGUDPProxy) CloseConn() error {
|
||||
if p.cancel == nil {
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/go-multierror"
|
||||
"github.com/mitchellh/hashstructure/v2"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
nberrors "github.com/netbirdio/netbird/client/errors"
|
||||
@@ -30,11 +31,13 @@ type Manager interface {
|
||||
|
||||
// DefaultManager uses the firewall manager to apply peer and route ACL rules.
|
||||
type DefaultManager struct {
|
||||
firewall firewall.Manager
|
||||
ipsetCounter int
|
||||
peerRulesPairs map[id.RuleID][]firewall.Rule
|
||||
routeRules map[id.RuleID]struct{}
|
||||
mutex sync.Mutex
|
||||
firewall firewall.Manager
|
||||
ipsetCounter int
|
||||
peerRulesPairs map[id.RuleID][]firewall.Rule
|
||||
routeRules map[id.RuleID]struct{}
|
||||
previousConfigHash uint64
|
||||
hasAppliedConfig bool
|
||||
mutex sync.Mutex
|
||||
}
|
||||
|
||||
func NewDefaultManager(fm firewall.Manager) *DefaultManager {
|
||||
@@ -57,6 +60,23 @@ func (d *DefaultManager) ApplyFiltering(networkMap *mgmProto.NetworkMap, dnsRout
|
||||
return
|
||||
}
|
||||
|
||||
// Skip the full rebuild + flush when the inputs that drive the firewall
|
||||
// state are byte-for-byte identical to the last successfully applied
|
||||
// update. Management re-sends the same network map far more often than it
|
||||
// actually changes (account-wide updates, peer meta churn), and rebuilding
|
||||
// every peer/route ACL and flushing the firewall on every such sync is the
|
||||
// dominant client-side cost when nothing changed. Mirrors the same guard the
|
||||
// DNS server already uses (previousConfigHash). Only the fields ApplyFiltering
|
||||
// consumes participate in the hash, so an unrelated map change cannot mask a
|
||||
// real ACL change.
|
||||
hash, err := d.firewallConfigHash(networkMap, dnsRouteFeatureFlag)
|
||||
if err != nil {
|
||||
log.Errorf("unable to hash firewall configuration, applying unconditionally: %v", err)
|
||||
} else if d.hasAppliedConfig && d.previousConfigHash == hash {
|
||||
log.Debugf("not applying the firewall configuration update as there is nothing new (hash: %d)", hash)
|
||||
return
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
defer func() {
|
||||
total := 0
|
||||
@@ -70,13 +90,49 @@ func (d *DefaultManager) ApplyFiltering(networkMap *mgmProto.NetworkMap, dnsRout
|
||||
|
||||
d.applyPeerACLs(networkMap)
|
||||
|
||||
if err := d.applyRouteACLs(networkMap.RoutesFirewallRules, dnsRouteFeatureFlag); err != nil {
|
||||
log.Errorf("Failed to apply route ACLs: %v", err)
|
||||
routeErr := d.applyRouteACLs(networkMap.RoutesFirewallRules, dnsRouteFeatureFlag)
|
||||
if routeErr != nil {
|
||||
log.Errorf("Failed to apply route ACLs: %v", routeErr)
|
||||
}
|
||||
|
||||
if err := d.firewall.Flush(); err != nil {
|
||||
log.Error("failed to flush firewall rules: ", err)
|
||||
flushErr := d.firewall.Flush()
|
||||
if flushErr != nil {
|
||||
log.Error("failed to flush firewall rules: ", flushErr)
|
||||
}
|
||||
|
||||
// Only remember the hash once the firewall actually reflects this config.
|
||||
// If applying or flushing failed, leave the previous hash untouched so the
|
||||
// next (possibly identical) update is not skipped and gets a chance to
|
||||
// reconcile the firewall state.
|
||||
if err == nil && routeErr == nil && flushErr == nil {
|
||||
d.previousConfigHash = hash
|
||||
d.hasAppliedConfig = true
|
||||
} else {
|
||||
d.hasAppliedConfig = false
|
||||
}
|
||||
}
|
||||
|
||||
// firewallConfigHash hashes exactly the inputs ApplyFiltering uses to build the
|
||||
// firewall state, so an identical hash means an identical resulting ruleset.
|
||||
func (d *DefaultManager) firewallConfigHash(networkMap *mgmProto.NetworkMap, dnsRouteFeatureFlag bool) (uint64, error) {
|
||||
return hashstructure.Hash(struct {
|
||||
PeerRules []*mgmProto.FirewallRule
|
||||
PeerRulesIsEmpty bool
|
||||
RouteRules []*mgmProto.RouteFirewallRule
|
||||
RouteRulesIsEmpty bool
|
||||
DNSRouteFeatureFlag bool
|
||||
}{
|
||||
PeerRules: networkMap.GetFirewallRules(),
|
||||
PeerRulesIsEmpty: networkMap.GetFirewallRulesIsEmpty(),
|
||||
RouteRules: networkMap.GetRoutesFirewallRules(),
|
||||
RouteRulesIsEmpty: networkMap.GetRoutesFirewallRulesIsEmpty(),
|
||||
DNSRouteFeatureFlag: dnsRouteFeatureFlag,
|
||||
}, hashstructure.FormatV2, &hashstructure.HashOptions{
|
||||
ZeroNil: true,
|
||||
IgnoreZeroValue: true,
|
||||
SlicesAsSets: true,
|
||||
UseStringer: true,
|
||||
})
|
||||
}
|
||||
|
||||
func (d *DefaultManager) applyPeerACLs(networkMap *mgmProto.NetworkMap) {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package acl
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/netip"
|
||||
"testing"
|
||||
|
||||
@@ -485,3 +486,149 @@ func TestPortInfoEmpty(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyFilteringSkipsUnchangedConfig verifies that an identical network map
|
||||
// re-applied is recognized as a no-op (hash unchanged), while a real change to
|
||||
// any firewall-relevant input forces a re-apply (hash changes). This is the
|
||||
// guard that prevents a full ruleset rebuild + flush on every redundant sync.
|
||||
func TestApplyFilteringSkipsUnchangedConfig(t *testing.T) {
|
||||
t.Setenv("NB_WG_KERNEL_DISABLED", "true")
|
||||
t.Setenv(firewall.EnvForceUserspaceFirewall, "true")
|
||||
|
||||
ctrl := gomock.NewController(t)
|
||||
defer ctrl.Finish()
|
||||
|
||||
ifaceMock := mocks.NewMockIFaceMapper(ctrl)
|
||||
ifaceMock.EXPECT().IsUserspaceBind().Return(true).AnyTimes()
|
||||
ifaceMock.EXPECT().SetFilter(gomock.Any())
|
||||
network := netip.MustParsePrefix("172.0.0.1/32")
|
||||
ifaceMock.EXPECT().Name().Return("lo").AnyTimes()
|
||||
ifaceMock.EXPECT().Address().Return(wgaddr.Address{
|
||||
IP: network.Addr(),
|
||||
Network: network,
|
||||
}).AnyTimes()
|
||||
ifaceMock.EXPECT().GetWGDevice().Return(nil).AnyTimes()
|
||||
|
||||
fw, err := firewall.NewFirewall(ifaceMock, nil, flowLogger, false, iface.DefaultMTU)
|
||||
require.NoError(t, err)
|
||||
defer func() {
|
||||
require.NoError(t, fw.Close(nil))
|
||||
}()
|
||||
|
||||
acl := NewDefaultManager(fw)
|
||||
|
||||
networkMap := &mgmProto.NetworkMap{
|
||||
FirewallRules: []*mgmProto.FirewallRule{
|
||||
{
|
||||
PeerIP: "10.93.0.1",
|
||||
Direction: mgmProto.RuleDirection_IN,
|
||||
Action: mgmProto.RuleAction_ACCEPT,
|
||||
Protocol: mgmProto.RuleProtocol_TCP,
|
||||
Port: "22",
|
||||
},
|
||||
},
|
||||
FirewallRulesIsEmpty: false,
|
||||
}
|
||||
|
||||
acl.ApplyFiltering(networkMap, false)
|
||||
require.True(t, acl.hasAppliedConfig, "config should be marked applied after first apply")
|
||||
firstHash := acl.previousConfigHash
|
||||
require.NotZero(t, firstHash)
|
||||
|
||||
// Re-applying the identical map must not change the recorded hash: the
|
||||
// expensive rebuild path was skipped.
|
||||
acl.ApplyFiltering(networkMap, false)
|
||||
assert.Equal(t, firstHash, acl.previousConfigHash,
|
||||
"identical re-apply must be a no-op (hash unchanged)")
|
||||
|
||||
// A real change must produce a different hash and re-apply.
|
||||
networkMap.FirewallRules[0].Action = mgmProto.RuleAction_DROP
|
||||
acl.ApplyFiltering(networkMap, false)
|
||||
assert.NotEqual(t, firstHash, acl.previousConfigHash,
|
||||
"changing a rule's action must force a re-apply (hash changed)")
|
||||
|
||||
// The dnsRouteFeatureFlag also participates in the hash.
|
||||
changedHash := acl.previousConfigHash
|
||||
acl.ApplyFiltering(networkMap, true)
|
||||
assert.NotEqual(t, changedHash, acl.previousConfigHash,
|
||||
"flipping dnsRouteFeatureFlag must force a re-apply (hash changed)")
|
||||
}
|
||||
|
||||
func buildNetworkMap(peerRules, routeRules int) *mgmProto.NetworkMap {
|
||||
nm := &mgmProto.NetworkMap{
|
||||
FirewallRulesIsEmpty: peerRules == 0,
|
||||
RoutesFirewallRulesIsEmpty: routeRules == 0,
|
||||
}
|
||||
for i := range peerRules {
|
||||
nm.FirewallRules = append(nm.FirewallRules, &mgmProto.FirewallRule{
|
||||
PeerIP: fmt.Sprintf("10.%d.%d.%d", i>>16&0xff, i>>8&0xff, i&0xff),
|
||||
Direction: mgmProto.RuleDirection_IN,
|
||||
Action: mgmProto.RuleAction_ACCEPT,
|
||||
Protocol: mgmProto.RuleProtocol_TCP,
|
||||
Port: fmt.Sprintf("%d", 1024+i%64511),
|
||||
})
|
||||
}
|
||||
for i := range routeRules {
|
||||
nm.RoutesFirewallRules = append(nm.RoutesFirewallRules, &mgmProto.RouteFirewallRule{
|
||||
Destination: fmt.Sprintf("192.168.%d.0/24", i%256),
|
||||
SourceRanges: []string{fmt.Sprintf("10.0.%d.0/24", i%256)},
|
||||
Action: mgmProto.RuleAction_ACCEPT,
|
||||
Protocol: mgmProto.RuleProtocol_ALL,
|
||||
})
|
||||
}
|
||||
return nm
|
||||
}
|
||||
|
||||
func BenchmarkFirewallConfigHash_Small(b *testing.B) {
|
||||
d := &DefaultManager{}
|
||||
nm := buildNetworkMap(10, 5)
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
_, _ = d.firewallConfigHash(nm, false)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkFirewallConfigHash_Medium(b *testing.B) {
|
||||
d := &DefaultManager{}
|
||||
nm := buildNetworkMap(100, 50)
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
_, _ = d.firewallConfigHash(nm, false)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkFirewallConfigHash_Large(b *testing.B) {
|
||||
d := &DefaultManager{}
|
||||
nm := buildNetworkMap(1000, 200)
|
||||
b.ResetTimer()
|
||||
for b.Loop() {
|
||||
_, _ = d.firewallConfigHash(nm, false)
|
||||
}
|
||||
}
|
||||
|
||||
// TestFirewallConfigHashDeterministic verifies the hash is stable for equal
|
||||
// inputs and order-independent for the rule slices (management does not
|
||||
// guarantee rule order).
|
||||
func TestFirewallConfigHashDeterministic(t *testing.T) {
|
||||
d := &DefaultManager{}
|
||||
|
||||
nm1 := &mgmProto.NetworkMap{
|
||||
FirewallRules: []*mgmProto.FirewallRule{
|
||||
{PeerIP: "10.0.0.1", Direction: mgmProto.RuleDirection_IN, Action: mgmProto.RuleAction_ACCEPT, Protocol: mgmProto.RuleProtocol_TCP, Port: "22"},
|
||||
{PeerIP: "10.0.0.2", Direction: mgmProto.RuleDirection_IN, Action: mgmProto.RuleAction_DROP, Protocol: mgmProto.RuleProtocol_TCP, Port: "80"},
|
||||
},
|
||||
}
|
||||
// Same rules, reversed order.
|
||||
nm2 := &mgmProto.NetworkMap{
|
||||
FirewallRules: []*mgmProto.FirewallRule{
|
||||
nm1.FirewallRules[1],
|
||||
nm1.FirewallRules[0],
|
||||
},
|
||||
}
|
||||
|
||||
h1, err := d.firewallConfigHash(nm1, false)
|
||||
require.NoError(t, err)
|
||||
h2, err := d.firewallConfigHash(nm2, false)
|
||||
require.NoError(t, err)
|
||||
assert.Equal(t, h1, h2, "hash must be order-independent for rule slices")
|
||||
}
|
||||
|
||||
@@ -322,7 +322,6 @@ func (a *Auth) setSystemInfoFlags(info *system.Info) {
|
||||
a.config.BlockLANAccess,
|
||||
a.config.BlockInbound,
|
||||
a.config.DisableIPv6,
|
||||
a.config.LazyConnectionEnabled,
|
||||
a.config.EnableSSHRoot,
|
||||
a.config.EnableSSHSFTP,
|
||||
a.config.EnableSSHLocalPortForwarding,
|
||||
|
||||
@@ -16,6 +16,16 @@ import (
|
||||
"github.com/netbirdio/netbird/route"
|
||||
)
|
||||
|
||||
// lazyForce is the locally resolved decision for lazy connections. It sits
// above the management feature flag: any value other than lazyForceNone makes
// the management flag update a no-op.
type lazyForce int

const (
	// lazyForceNone means there is no local override; management decides.
	lazyForceNone = lazyForce(iota)
	// lazyForceOn means a local override asks for lazy connections.
	lazyForceOn
	// lazyForceOff means a local override disables lazy connections.
	lazyForceOff
)
|
||||
|
||||
// ConnMgr coordinates both lazy connections (established on-demand) and permanent peer connections.
|
||||
//
|
||||
// The connection manager is responsible for:
|
||||
@@ -28,7 +38,7 @@ type ConnMgr struct {
|
||||
peerStore *peerstore.Store
|
||||
statusRecorder *peer.Status
|
||||
iface lazyconn.WGIface
|
||||
enabledLocally bool
|
||||
force lazyForce
|
||||
rosenpassEnabled bool
|
||||
|
||||
lazyConnMgr *manager.Manager
|
||||
@@ -43,28 +53,34 @@ func NewConnMgr(engineConfig *EngineConfig, statusRecorder *peer.Status, peerSto
|
||||
peerStore: peerStore,
|
||||
statusRecorder: statusRecorder,
|
||||
iface: iface,
|
||||
force: resolveLazyForce(engineConfig.LazyConnection),
|
||||
rosenpassEnabled: engineConfig.RosenpassEnabled,
|
||||
}
|
||||
if engineConfig.LazyConnectionEnabled || lazyconn.IsLazyConnEnabledByEnv() {
|
||||
e.enabledLocally = true
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// Start initializes the connection manager and starts the lazy connection manager if enabled by env var or cmd line option.
|
||||
// Start initializes the connection manager. It starts the lazy connection manager when a
|
||||
// local override forces it on; with no local override it waits for the management feature flag.
|
||||
func (e *ConnMgr) Start(ctx context.Context) {
|
||||
if e.lazyConnMgr != nil {
|
||||
log.Errorf("lazy connection manager is already started")
|
||||
return
|
||||
}
|
||||
|
||||
if !e.enabledLocally {
|
||||
log.Infof("lazy connection manager is disabled")
|
||||
switch e.force {
|
||||
case lazyForceOff:
|
||||
log.Infof("lazy connection manager is disabled by local override (%s or MDM policy)", lazyconn.EnvLazyConn)
|
||||
e.statusRecorder.UpdateLazyConnection(false)
|
||||
return
|
||||
case lazyForceNone:
|
||||
log.Infof("lazy connection manager is managed by the management feature flag")
|
||||
e.statusRecorder.UpdateLazyConnection(false)
|
||||
return
|
||||
}
|
||||
|
||||
if e.rosenpassEnabled {
|
||||
log.Warnf("rosenpass connection manager is enabled, lazy connection manager will not be started")
|
||||
e.statusRecorder.UpdateLazyConnection(false)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -76,8 +92,8 @@ func (e *ConnMgr) Start(ctx context.Context) {
|
||||
// If enabled, it initializes the lazy connection manager and starts it. There is no need to call Start() again.
|
||||
// If disabled, it closes the lazy connection manager and opens the connections to all peers.
|
||||
func (e *ConnMgr) UpdatedRemoteFeatureFlag(ctx context.Context, enabled bool) error {
|
||||
// do not disable lazy connection manager if it was enabled by env var
|
||||
if e.enabledLocally {
|
||||
// a local override (NB_LAZY_CONN or local config) takes precedence over management
|
||||
if e.force != lazyForceNone {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -89,6 +105,7 @@ func (e *ConnMgr) UpdatedRemoteFeatureFlag(ctx context.Context, enabled bool) er
|
||||
|
||||
if e.rosenpassEnabled {
|
||||
log.Infof("rosenpass connection manager is enabled, lazy connection manager will not be started")
|
||||
e.statusRecorder.UpdateLazyConnection(false)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -98,6 +115,7 @@ func (e *ConnMgr) UpdatedRemoteFeatureFlag(ctx context.Context, enabled bool) er
|
||||
return e.addPeersToLazyConnManager()
|
||||
} else {
|
||||
if e.lazyConnMgr == nil {
|
||||
e.statusRecorder.UpdateLazyConnection(false)
|
||||
return nil
|
||||
}
|
||||
log.Infof("lazy connection manager is disabled by management feature flag")
|
||||
@@ -309,6 +327,25 @@ func (e *ConnMgr) isStartedWithLazyMgr() bool {
|
||||
return e.lazyConnMgr != nil && e.lazyCtxCancel != nil
|
||||
}
|
||||
|
||||
// resolveLazyForce determines the local override. NB_LAZY_CONN takes precedence; when it
|
||||
// is unset the MDM policy override (mdmState) applies. Either wins in both directions over
|
||||
// the management feature flag; StateUnset for both defers to management.
|
||||
func resolveLazyForce(mdmState lazyconn.State) lazyForce {
|
||||
state := lazyconn.EnvState()
|
||||
if state == lazyconn.StateUnset {
|
||||
state = mdmState
|
||||
}
|
||||
|
||||
switch state {
|
||||
case lazyconn.StateOn:
|
||||
return lazyForceOn
|
||||
case lazyconn.StateOff:
|
||||
return lazyForceOff
|
||||
default:
|
||||
return lazyForceNone
|
||||
}
|
||||
}
|
||||
|
||||
func inactivityThresholdEnv() *time.Duration {
|
||||
envValue := os.Getenv(lazyconn.EnvInactivityThreshold)
|
||||
if envValue == "" {
|
||||
|
||||
40
client/internal/conn_mgr_test.go
Normal file
40
client/internal/conn_mgr_test.go
Normal file
@@ -0,0 +1,40 @@
|
||||
package internal
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/netbirdio/netbird/client/internal/lazyconn"
|
||||
)
|
||||
|
||||
func TestResolveLazyForce(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
env string
|
||||
envSet bool
|
||||
mdm lazyconn.State
|
||||
want lazyForce
|
||||
}{
|
||||
{name: "env unset, mdm unset -> defer to management", mdm: lazyconn.StateUnset, want: lazyForceNone},
|
||||
{name: "env on -> force on", env: "on", envSet: true, mdm: lazyconn.StateUnset, want: lazyForceOn},
|
||||
{name: "env off -> force off", env: "off", envSet: true, mdm: lazyconn.StateUnset, want: lazyForceOff},
|
||||
{name: "env unset, mdm on -> force on", mdm: lazyconn.StateOn, want: lazyForceOn},
|
||||
{name: "env unset, mdm off -> force off", mdm: lazyconn.StateOff, want: lazyForceOff},
|
||||
{name: "env on beats mdm off", env: "on", envSet: true, mdm: lazyconn.StateOff, want: lazyForceOn},
|
||||
{name: "env off beats mdm on", env: "off", envSet: true, mdm: lazyconn.StateOn, want: lazyForceOff},
|
||||
{name: "unrecognized env, mdm on -> mdm wins", env: "auto", envSet: true, mdm: lazyconn.StateOn, want: lazyForceOn},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
t.Setenv(lazyconn.EnvLazyConn, tt.env)
|
||||
if !tt.envSet {
|
||||
os.Unsetenv(lazyconn.EnvLazyConn)
|
||||
}
|
||||
|
||||
if got := resolveLazyForce(tt.mdm); got != tt.want {
|
||||
t.Fatalf("resolveLazyForce(%v) = %v, want %v", tt.mdm, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -27,6 +27,7 @@ import (
|
||||
"github.com/netbirdio/netbird/client/iface/device"
|
||||
"github.com/netbirdio/netbird/client/iface/netstack"
|
||||
"github.com/netbirdio/netbird/client/internal/dns"
|
||||
"github.com/netbirdio/netbird/client/internal/lazyconn"
|
||||
"github.com/netbirdio/netbird/client/internal/listener"
|
||||
"github.com/netbirdio/netbird/client/internal/metrics"
|
||||
"github.com/netbirdio/netbird/client/internal/peer"
|
||||
@@ -314,6 +315,10 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
|
||||
c.clientMetrics.RecordLoginDuration(engineCtx, time.Since(loginStarted), true)
|
||||
c.statusRecorder.MarkManagementConnected()
|
||||
|
||||
if metricsConfig := loginResp.GetNetbirdConfig().GetMetrics(); metricsConfig != nil {
|
||||
c.clientMetrics.UpdatePushFromMgm(c.ctx, metricsConfig.GetEnabled())
|
||||
}
|
||||
|
||||
localPeerState := peer.LocalPeerState{
|
||||
IP: loginResp.GetPeerConfig().GetAddress(),
|
||||
PubKey: myPrivateKey.PublicKey().String(),
|
||||
@@ -399,6 +404,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
|
||||
StateManager: stateManager,
|
||||
UpdateManager: c.updateManager,
|
||||
ClientMetrics: c.clientMetrics,
|
||||
MetricsCtx: c.ctx,
|
||||
}, mobileDependency)
|
||||
engine.SetSyncResponsePersistence(c.persistSyncResponse)
|
||||
c.engine = engine
|
||||
@@ -596,7 +602,7 @@ func createEngineConfig(key wgtypes.Key, config *profilemanager.Config, peerConf
|
||||
BlockInbound: config.BlockInbound,
|
||||
DisableIPv6: config.DisableIPv6,
|
||||
|
||||
LazyConnectionEnabled: config.LazyConnectionEnabled,
|
||||
LazyConnection: lazyconn.ParseState(config.LazyConnection),
|
||||
|
||||
MTU: selectMTU(config.MTU, peerConfig.Mtu),
|
||||
LogPath: logPath,
|
||||
@@ -670,7 +676,6 @@ func loginToManagement(ctx context.Context, client mgm.Client, pubSSHKey []byte,
|
||||
config.BlockLANAccess,
|
||||
config.BlockInbound,
|
||||
config.DisableIPv6,
|
||||
config.LazyConnectionEnabled,
|
||||
config.EnableSSHRoot,
|
||||
config.EnableSSHSFTP,
|
||||
config.EnableSSHLocalPortForwarding,
|
||||
|
||||
@@ -681,7 +681,7 @@ func (g *BundleGenerator) addCommonConfigFields(configContent *strings.Builder)
|
||||
configContent.WriteString(fmt.Sprintf("ClientCertKeyPath: %s\n", g.internalConfig.ClientCertKeyPath))
|
||||
}
|
||||
|
||||
configContent.WriteString(fmt.Sprintf("LazyConnectionEnabled: %v\n", g.internalConfig.LazyConnectionEnabled))
|
||||
configContent.WriteString(fmt.Sprintf("LazyConnection: %q\n", g.internalConfig.LazyConnection))
|
||||
configContent.WriteString(fmt.Sprintf("MTU: %d\n", g.internalConfig.MTU))
|
||||
}
|
||||
|
||||
|
||||
@@ -885,7 +885,7 @@ func TestAddConfig_AllFieldsCovered(t *testing.T) {
|
||||
DNSRouteInterval: 5 * time.Second,
|
||||
ClientCertPath: "/tmp/cert",
|
||||
ClientCertKeyPath: "/tmp/key",
|
||||
LazyConnectionEnabled: true,
|
||||
LazyConnection: "on",
|
||||
MTU: 1280,
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"errors"
|
||||
"net"
|
||||
"net/netip"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/miekg/dns"
|
||||
@@ -167,7 +168,10 @@ func getRcodeForNotFound(ctx context.Context, r resolver, domain string, origina
|
||||
case dns.TypeA:
|
||||
alternativeNetwork = "ip6"
|
||||
default:
|
||||
return dns.RcodeNameError
|
||||
// Non-address types reach LookupIP only unexpectedly; without an
|
||||
// address pair to probe we cannot prove the name is absent, so answer
|
||||
// NODATA rather than a poisoning NXDOMAIN.
|
||||
return dns.RcodeSuccess
|
||||
}
|
||||
|
||||
if _, err := r.LookupNetIP(ctx, alternativeNetwork, domain); err != nil {
|
||||
@@ -184,6 +188,230 @@ func getRcodeForNotFound(ctx context.Context, r resolver, domain string, origina
|
||||
return dns.RcodeSuccess
|
||||
}
|
||||
|
||||
// RecordResolver is the subset of the host resolver needed to forward
// non-address record queries. Every method has the same signature as the
// net.Resolver method of the same name, so net.DefaultResolver satisfies it.
type RecordResolver interface {
	// LookupMX looks up the MX records for name.
	LookupMX(ctx context.Context, name string) ([]*net.MX, error)

	// LookupTXT looks up the TXT records for name.
	LookupTXT(ctx context.Context, name string) ([]string, error)

	// LookupNS looks up the NS records for name.
	LookupNS(ctx context.Context, name string) ([]*net.NS, error)

	// LookupSRV looks up the SRV records for the given service, proto and name.
	LookupSRV(ctx context.Context, service, proto, name string) (string, []*net.SRV, error)

	// LookupCNAME looks up the canonical name for host.
	LookupCNAME(ctx context.Context, host string) (string, error)

	// LookupAddr performs a reverse lookup for addr.
	LookupAddr(ctx context.Context, addr string) ([]string, error)
}
|
||||
|
||||
// LookupRecords resolves a non-address DNS record type through the host
|
||||
// resolver and returns the resource records and the DNS rcode. Types the host
|
||||
// resolver cannot answer (anything not covered by the net.Resolver Lookup*
|
||||
// methods) yield NODATA so that a routed name is never poisoned with NXDOMAIN
|
||||
// for an unsupported type.
|
||||
func LookupRecords(ctx context.Context, r RecordResolver, name string, qtype uint16, ttl uint32) ([]dns.RR, int) {
|
||||
fqdn := dns.Fqdn(name)
|
||||
|
||||
switch qtype {
|
||||
case dns.TypeMX:
|
||||
return lookupMX(ctx, r, name, fqdn, ttl)
|
||||
case dns.TypeTXT:
|
||||
return lookupTXT(ctx, r, name, fqdn, ttl)
|
||||
case dns.TypeNS:
|
||||
return lookupNS(ctx, r, name, fqdn, ttl)
|
||||
case dns.TypeSRV:
|
||||
return lookupSRV(ctx, r, name, fqdn, ttl)
|
||||
case dns.TypeCNAME:
|
||||
return lookupCNAME(ctx, r, name, fqdn, ttl)
|
||||
case dns.TypePTR:
|
||||
return lookupPTR(ctx, r, name, fqdn, ttl)
|
||||
default:
|
||||
return nil, dns.RcodeSuccess
|
||||
}
|
||||
}
|
||||
|
||||
func recordHeader(fqdn string, rrtype uint16, ttl uint32) dns.RR_Header {
|
||||
return dns.RR_Header{Name: fqdn, Rrtype: rrtype, Class: dns.ClassINET, Ttl: ttl}
|
||||
}
|
||||
|
||||
func lookupMX(ctx context.Context, r RecordResolver, name, fqdn string, ttl uint32) ([]dns.RR, int) {
|
||||
recs, err := r.LookupMX(ctx, name)
|
||||
if err != nil {
|
||||
return nil, rcodeForRecordError(err)
|
||||
}
|
||||
rrs := make([]dns.RR, 0, len(recs))
|
||||
for _, mx := range recs {
|
||||
rrs = append(rrs, &dns.MX{
|
||||
Hdr: recordHeader(fqdn, dns.TypeMX, ttl),
|
||||
Preference: mx.Pref,
|
||||
Mx: dns.Fqdn(mx.Host),
|
||||
})
|
||||
}
|
||||
return rrs, dns.RcodeSuccess
|
||||
}
|
||||
|
||||
func lookupTXT(ctx context.Context, r RecordResolver, name, fqdn string, ttl uint32) ([]dns.RR, int) {
|
||||
recs, err := r.LookupTXT(ctx, name)
|
||||
if err != nil {
|
||||
return nil, rcodeForRecordError(err)
|
||||
}
|
||||
rrs := make([]dns.RR, 0, len(recs))
|
||||
for _, txt := range recs {
|
||||
rrs = append(rrs, &dns.TXT{
|
||||
Hdr: recordHeader(fqdn, dns.TypeTXT, ttl),
|
||||
Txt: chunkTXT(txt),
|
||||
})
|
||||
}
|
||||
return rrs, dns.RcodeSuccess
|
||||
}
|
||||
|
||||
func lookupNS(ctx context.Context, r RecordResolver, name, fqdn string, ttl uint32) ([]dns.RR, int) {
|
||||
recs, err := r.LookupNS(ctx, name)
|
||||
if err != nil {
|
||||
return nil, rcodeForRecordError(err)
|
||||
}
|
||||
rrs := make([]dns.RR, 0, len(recs))
|
||||
for _, ns := range recs {
|
||||
rrs = append(rrs, &dns.NS{
|
||||
Hdr: recordHeader(fqdn, dns.TypeNS, ttl),
|
||||
Ns: dns.Fqdn(ns.Host),
|
||||
})
|
||||
}
|
||||
return rrs, dns.RcodeSuccess
|
||||
}
|
||||
|
||||
func lookupSRV(ctx context.Context, r RecordResolver, name, fqdn string, ttl uint32) ([]dns.RR, int) {
|
||||
_, recs, err := r.LookupSRV(ctx, "", "", name)
|
||||
if err != nil {
|
||||
return nil, rcodeForRecordError(err)
|
||||
}
|
||||
rrs := make([]dns.RR, 0, len(recs))
|
||||
for _, srv := range recs {
|
||||
rrs = append(rrs, &dns.SRV{
|
||||
Hdr: recordHeader(fqdn, dns.TypeSRV, ttl),
|
||||
Priority: srv.Priority,
|
||||
Weight: srv.Weight,
|
||||
Port: srv.Port,
|
||||
Target: dns.Fqdn(srv.Target),
|
||||
})
|
||||
}
|
||||
return rrs, dns.RcodeSuccess
|
||||
}
|
||||
|
||||
func lookupCNAME(ctx context.Context, r RecordResolver, name, fqdn string, ttl uint32) ([]dns.RR, int) {
|
||||
cname, err := r.LookupCNAME(ctx, name)
|
||||
if err != nil {
|
||||
return nil, rcodeForRecordError(err)
|
||||
}
|
||||
// LookupCNAME returns the queried name itself when the name resolves but
|
||||
// has no CNAME record; that is a NODATA result, not a CNAME.
|
||||
if strings.EqualFold(dns.Fqdn(cname), fqdn) {
|
||||
return nil, dns.RcodeSuccess
|
||||
}
|
||||
return []dns.RR{&dns.CNAME{
|
||||
Hdr: recordHeader(fqdn, dns.TypeCNAME, ttl),
|
||||
Target: dns.Fqdn(cname),
|
||||
}}, dns.RcodeSuccess
|
||||
}
|
||||
|
||||
func lookupPTR(ctx context.Context, r RecordResolver, name, fqdn string, ttl uint32) ([]dns.RR, int) {
|
||||
addr, ok := ptrQueryAddr(name)
|
||||
if !ok {
|
||||
return nil, dns.RcodeSuccess
|
||||
}
|
||||
names, err := r.LookupAddr(ctx, addr)
|
||||
if err != nil {
|
||||
return nil, rcodeForRecordError(err)
|
||||
}
|
||||
rrs := make([]dns.RR, 0, len(names))
|
||||
for _, n := range names {
|
||||
rrs = append(rrs, &dns.PTR{
|
||||
Hdr: recordHeader(fqdn, dns.TypePTR, ttl),
|
||||
Ptr: dns.Fqdn(n),
|
||||
})
|
||||
}
|
||||
return rrs, dns.RcodeSuccess
|
||||
}
|
||||
|
||||
// ptrQueryAddr converts a reverse-DNS query name (in-addr.arpa or ip6.arpa)
|
||||
// into the address string expected by net.Resolver.LookupAddr. It reports false
|
||||
// when the name is not a well-formed reverse name.
|
||||
func ptrQueryAddr(qname string) (string, bool) {
|
||||
name := strings.TrimSuffix(strings.ToLower(dns.Fqdn(qname)), ".")
|
||||
|
||||
switch {
|
||||
case strings.HasSuffix(name, ".in-addr.arpa"):
|
||||
return parseInAddrArpa(strings.TrimSuffix(name, ".in-addr.arpa"))
|
||||
case strings.HasSuffix(name, ".ip6.arpa"):
|
||||
return parseIP6Arpa(strings.TrimSuffix(name, ".ip6.arpa"))
|
||||
default:
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
|
||||
// parseInAddrArpa converts the labels preceding ".in-addr.arpa" (octets in
// reverse order) into a dotted IPv4 address string. It reports false unless
// there are exactly four labels that together parse as an IPv4 address.
func parseInAddrArpa(labelPart string) (string, bool) {
	octets := strings.Split(labelPart, ".")
	if len(octets) == 4 {
		// Reverse names list the least significant octet first.
		slices.Reverse(octets)
		if ip, err := netip.ParseAddr(strings.Join(octets, ".")); err == nil && ip.Is4() {
			return ip.String(), true
		}
	}
	return "", false
}
|
||||
|
||||
// parseIP6Arpa turns the nibble portion of an ip6.arpa name (32 labels, least
// significant nibble first) into an IPv6 address string. It reports false when
// the input is not a well-formed reverse name: the label count must be exactly
// 32 and every label must be a single hexadecimal digit.
func parseIP6Arpa(nibblePart string) (string, bool) {
	nibbles := strings.Split(nibblePart, ".")
	if len(nibbles) != 32 {
		return "", false
	}

	slices.Reverse(nibbles)

	var sb strings.Builder
	sb.Grow(39) // 32 hex digits plus 7 group separators
	for i, n := range nibbles {
		// Each label must be exactly one hex digit. Without this check an
		// empty label next to a multi-digit one (e.g. "" and "00") would
		// still concatenate into a valid-looking 16-bit group and a malformed
		// name would be accepted.
		if len(n) != 1 {
			return "", false
		}
		c := n[0]
		isHex := (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
		if !isHex {
			return "", false
		}

		if i > 0 && i%4 == 0 {
			sb.WriteByte(':')
		}
		sb.WriteByte(c)
	}

	addr, err := netip.ParseAddr(sb.String())
	if err != nil || !addr.Is6() {
		return "", false
	}
	return addr.String(), true
}
|
||||
|
||||
// rcodeForRecordError maps a non-address lookup error to a DNS rcode. A
|
||||
// not-found result becomes NODATA rather than NXDOMAIN: net.DNSError.IsNotFound
|
||||
// does not distinguish a missing name from a name that exists only with records
|
||||
// of other types, so the name cannot be proven absent and must not be poisoned.
|
||||
func rcodeForRecordError(err error) int {
|
||||
var dnsErr *net.DNSError
|
||||
if errors.As(err, &dnsErr) && dnsErr.IsNotFound {
|
||||
return dns.RcodeSuccess
|
||||
}
|
||||
return dns.RcodeServerFailure
|
||||
}
|
||||
|
||||
// chunkTXT breaks a TXT value into character-strings of at most 255 bytes each
// so the record can be packed. All returned pieces belong to a single TXT
// resource record. A value that already fits, including the empty string, is
// returned as a single piece.
func chunkTXT(s string) []string {
	const limit = 255

	if len(s) <= limit {
		return []string{s}
	}

	var pieces []string
	for start := 0; start < len(s); start += limit {
		end := start + limit
		if end > len(s) {
			end = len(s)
		}
		pieces = append(pieces, s[start:end])
	}
	return pieces
}
|
||||
|
||||
// FormatAnswers formats DNS resource records for logging.
|
||||
func FormatAnswers(answers []dns.RR) string {
|
||||
if len(answers) == 0 {
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"errors"
|
||||
"net"
|
||||
"net/netip"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/miekg/dns"
|
||||
@@ -121,6 +122,164 @@ func TestLookupIP_DNSErrorNotIsNotFound(t *testing.T) {
|
||||
assert.Equal(t, dns.RcodeServerFailure, result.Rcode, "upstream failure should map to SERVFAIL")
|
||||
}
|
||||
|
||||
func TestPtrQueryAddr(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
qname string
|
||||
want string
|
||||
wantOK bool
|
||||
}{
|
||||
{name: "ipv4", qname: "4.3.2.1.in-addr.arpa.", want: "1.2.3.4", wantOK: true},
|
||||
{name: "ipv4 no trailing dot", qname: "1.0.0.127.in-addr.arpa", want: "127.0.0.1", wantOK: true},
|
||||
{
|
||||
name: "ipv6",
|
||||
qname: "1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.8.b.d.0.1.0.0.2.ip6.arpa.",
|
||||
want: "2001:db8::1",
|
||||
wantOK: true,
|
||||
},
|
||||
{name: "ipv4 wrong label count", qname: "2.1.in-addr.arpa.", wantOK: false},
|
||||
{name: "ipv6 wrong nibble count", qname: "1.0.ip6.arpa.", wantOK: false},
|
||||
{name: "not a reverse name", qname: "example.com.", wantOK: false},
|
||||
{name: "ipv4 bad octet", qname: "4.3.2.999.in-addr.arpa.", wantOK: false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got, ok := ptrQueryAddr(tt.qname)
|
||||
assert.Equal(t, tt.wantOK, ok, "parse success mismatch")
|
||||
if tt.wantOK {
|
||||
assert.Equal(t, tt.want, got, "parsed address mismatch")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// mockRecordResolver is a canned RecordResolver for tests: each lookup returns
// the matching preset field together with the shared err value, ignoring its
// arguments.
type mockRecordResolver struct {
	mx    []*net.MX  // result of LookupMX
	txt   []string   // result of LookupTXT
	ns    []*net.NS  // result of LookupNS
	srv   []*net.SRV // records returned by LookupSRV
	cname string     // result of LookupCNAME
	ptr   []string   // result of LookupAddr
	err   error      // error returned by every lookup
}

// LookupMX returns the preset MX records and error.
func (m *mockRecordResolver) LookupMX(_ context.Context, _ string) ([]*net.MX, error) {
	return m.mx, m.err
}

// LookupTXT returns the preset TXT strings and error.
func (m *mockRecordResolver) LookupTXT(_ context.Context, _ string) ([]string, error) {
	return m.txt, m.err
}

// LookupNS returns the preset NS records and error.
func (m *mockRecordResolver) LookupNS(_ context.Context, _ string) ([]*net.NS, error) {
	return m.ns, m.err
}

// LookupSRV returns an empty canonical name with the preset SRV records and error.
func (m *mockRecordResolver) LookupSRV(_ context.Context, _, _, _ string) (string, []*net.SRV, error) {
	return "", m.srv, m.err
}

// LookupCNAME returns the preset canonical name and error.
func (m *mockRecordResolver) LookupCNAME(_ context.Context, _ string) (string, error) {
	return m.cname, m.err
}

// LookupAddr returns the preset PTR names and error.
func (m *mockRecordResolver) LookupAddr(_ context.Context, _ string) ([]string, error) {
	return m.ptr, m.err
}
|
||||
|
||||
func TestLookupRecords(t *testing.T) {
|
||||
notFound := &net.DNSError{IsNotFound: true, Name: "example.com."}
|
||||
|
||||
t.Run("MX success", func(t *testing.T) {
|
||||
r := &mockRecordResolver{mx: []*net.MX{{Host: "mail.example.com.", Pref: 10}}}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypeMX, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
require.Len(t, rrs, 1)
|
||||
assert.Equal(t, "mail.example.com.", rrs[0].(*dns.MX).Mx)
|
||||
})
|
||||
|
||||
t.Run("TXT short string is one character-string", func(t *testing.T) {
|
||||
r := &mockRecordResolver{txt: []string{"v=spf1 -all"}}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypeTXT, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
require.Len(t, rrs, 1)
|
||||
assert.Equal(t, []string{"v=spf1 -all"}, rrs[0].(*dns.TXT).Txt)
|
||||
})
|
||||
|
||||
t.Run("TXT chunks long strings", func(t *testing.T) {
|
||||
long := strings.Repeat("a", 300)
|
||||
r := &mockRecordResolver{txt: []string{long}}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypeTXT, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
require.Len(t, rrs, 1)
|
||||
txt := rrs[0].(*dns.TXT).Txt
|
||||
require.Len(t, txt, 2, "300-byte string should split into two character-strings")
|
||||
assert.Equal(t, 255, len(txt[0]))
|
||||
assert.Equal(t, 45, len(txt[1]))
|
||||
})
|
||||
|
||||
t.Run("NS success", func(t *testing.T) {
|
||||
r := &mockRecordResolver{ns: []*net.NS{{Host: "ns1.example.com."}}}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypeNS, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
require.Len(t, rrs, 1)
|
||||
assert.Equal(t, "ns1.example.com.", rrs[0].(*dns.NS).Ns)
|
||||
})
|
||||
|
||||
t.Run("SRV success", func(t *testing.T) {
|
||||
r := &mockRecordResolver{srv: []*net.SRV{{Target: "sip.example.com.", Port: 5060}}}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "_sip._tcp.example.com.", dns.TypeSRV, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
require.Len(t, rrs, 1)
|
||||
assert.Equal(t, uint16(5060), rrs[0].(*dns.SRV).Port)
|
||||
})
|
||||
|
||||
t.Run("CNAME success", func(t *testing.T) {
|
||||
r := &mockRecordResolver{cname: "target.example.com."}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "www.example.com.", dns.TypeCNAME, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
require.Len(t, rrs, 1)
|
||||
assert.Equal(t, "target.example.com.", rrs[0].(*dns.CNAME).Target)
|
||||
})
|
||||
|
||||
t.Run("CNAME equal to name is NODATA", func(t *testing.T) {
|
||||
r := &mockRecordResolver{cname: "example.com."}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypeCNAME, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
assert.Empty(t, rrs, "self-referential CNAME is NODATA")
|
||||
})
|
||||
|
||||
t.Run("PTR success", func(t *testing.T) {
|
||||
r := &mockRecordResolver{ptr: []string{"host.example.com."}}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "4.3.2.1.in-addr.arpa.", dns.TypePTR, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
require.Len(t, rrs, 1)
|
||||
assert.Equal(t, "host.example.com.", rrs[0].(*dns.PTR).Ptr)
|
||||
})
|
||||
|
||||
t.Run("PTR malformed name is NODATA", func(t *testing.T) {
|
||||
r := &mockRecordResolver{}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypePTR, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
assert.Empty(t, rrs)
|
||||
})
|
||||
|
||||
t.Run("not found is NODATA never NXDOMAIN", func(t *testing.T) {
|
||||
r := &mockRecordResolver{err: notFound}
|
||||
_, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypeMX, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode, "missing record must not poison the name")
|
||||
})
|
||||
|
||||
t.Run("server failure maps to SERVFAIL", func(t *testing.T) {
|
||||
r := &mockRecordResolver{err: &net.DNSError{Err: "server misbehaving", IsTemporary: true}}
|
||||
_, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypeMX, 300)
|
||||
assert.Equal(t, dns.RcodeServerFailure, rcode)
|
||||
})
|
||||
|
||||
t.Run("unsupported type is NODATA", func(t *testing.T) {
|
||||
r := &mockRecordResolver{}
|
||||
rrs, rcode := LookupRecords(context.Background(), r, "example.com.", dns.TypeCAA, 300)
|
||||
assert.Equal(t, dns.RcodeSuccess, rcode)
|
||||
assert.Empty(t, rrs)
|
||||
})
|
||||
}
|
||||
|
||||
func TestStripOPT(t *testing.T) {
|
||||
rm := &dns.Msg{
|
||||
Extra: []dns.RR{
|
||||
|
||||
@@ -37,6 +37,12 @@ const (
|
||||
|
||||
// resolver is the host-resolver surface the forwarder depends on: address
// lookups plus the record lookups used for non-address query types.
type resolver interface {
	// Address lookups (A/AAAA).
	LookupNetIP(ctx context.Context, network, host string) ([]netip.Addr, error)

	// Record lookups for non-address query types.
	LookupCNAME(ctx context.Context, host string) (string, error)
	LookupMX(ctx context.Context, name string) ([]*net.MX, error)
	LookupNS(ctx context.Context, name string) ([]*net.NS, error)
	LookupSRV(ctx context.Context, service, proto, name string) (string, []*net.SRV, error)
	LookupTXT(ctx context.Context, name string) ([]string, error)

	// Reverse lookups (PTR).
	LookupAddr(ctx context.Context, addr string) ([]string, error)
}
|
||||
|
||||
type firewaller interface {
|
||||
@@ -210,12 +216,6 @@ func (f *DNSForwarder) handleDNSQuery(logger *log.Entry, w dns.ResponseWriter, q
|
||||
qname, dns.TypeToString[question.Qtype], dns.ClassToString[question.Qclass])
|
||||
|
||||
resp := query.SetReply(query)
|
||||
network := resutil.NetworkForQtype(question.Qtype)
|
||||
if network == "" {
|
||||
resp.Rcode = dns.RcodeNotImplemented
|
||||
f.writeResponse(logger, w, resp, qname, startTime)
|
||||
return
|
||||
}
|
||||
|
||||
mostSpecificResId, matchingEntries := f.getMatchingEntries(strings.TrimSuffix(qname, "."))
|
||||
if mostSpecificResId == "" {
|
||||
@@ -227,9 +227,46 @@ func (f *DNSForwarder) handleDNSQuery(logger *log.Entry, w dns.ResponseWriter, q
|
||||
ctx, cancel := context.WithTimeout(context.Background(), upstreamTimeout)
|
||||
defer cancel()
|
||||
|
||||
reqHasEdns := query.IsEdns0() != nil
|
||||
|
||||
switch question.Qtype {
|
||||
case dns.TypeA, dns.TypeAAAA:
|
||||
f.handleAddressQuery(ctx, logger, w, resp, mostSpecificResId, matchingEntries, reqHasEdns, startTime)
|
||||
case dns.TypeMX, dns.TypeTXT, dns.TypeNS, dns.TypeSRV, dns.TypeCNAME, dns.TypePTR:
|
||||
f.handleRecordQuery(ctx, logger, w, resp, startTime)
|
||||
default:
|
||||
// The domain is routed here, so any other type is answered NODATA
|
||||
// (NOERROR, empty answer) rather than falling back to a resolver that
|
||||
// would poison the name with NXDOMAIN. The Extended DNS Error lets a
|
||||
// client tell this capability-driven NODATA apart from an
|
||||
// authoritative one. The OPT pseudo-record must not appear unless the
|
||||
// query advertised EDNS0.
|
||||
if reqHasEdns {
|
||||
attachEDE(resp, dns.ExtendedErrorCodeNotSupported, "netbird forwarder: unsupported query type")
|
||||
}
|
||||
f.writeResponse(logger, w, resp, qname, startTime)
|
||||
}
|
||||
}
|
||||
|
||||
// handleAddressQuery resolves A/AAAA queries, programs the firewall sets and
|
||||
// resolved-IP state, and caches the answer for resilience on upstream failure.
|
||||
func (f *DNSForwarder) handleAddressQuery(
|
||||
ctx context.Context,
|
||||
logger *log.Entry,
|
||||
w dns.ResponseWriter,
|
||||
resp *dns.Msg,
|
||||
mostSpecificResId route.ResID,
|
||||
matchingEntries []*ForwarderEntry,
|
||||
reqHasEdns bool,
|
||||
startTime time.Time,
|
||||
) {
|
||||
question := resp.Question[0]
|
||||
qname := strings.ToLower(question.Name)
|
||||
|
||||
network := resutil.NetworkForQtype(question.Qtype)
|
||||
result := resutil.LookupIP(ctx, f.resolver, network, qname, question.Qtype)
|
||||
if result.Err != nil {
|
||||
f.handleDNSError(ctx, logger, w, question, resp, qname, result, query.IsEdns0() != nil, startTime)
|
||||
f.handleDNSError(ctx, logger, w, question, resp, qname, result, reqHasEdns, startTime)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -240,6 +277,25 @@ func (f *DNSForwarder) handleDNSQuery(logger *log.Entry, w dns.ResponseWriter, q
|
||||
f.writeResponse(logger, w, resp, qname, startTime)
|
||||
}
|
||||
|
||||
// handleRecordQuery resolves non-address record types (MX, TXT, NS, SRV,
|
||||
// CNAME, PTR) through the host resolver. Missing records are answered NODATA so
|
||||
// the routed name is never poisoned with NXDOMAIN.
|
||||
func (f *DNSForwarder) handleRecordQuery(
|
||||
ctx context.Context,
|
||||
logger *log.Entry,
|
||||
w dns.ResponseWriter,
|
||||
resp *dns.Msg,
|
||||
startTime time.Time,
|
||||
) {
|
||||
question := resp.Question[0]
|
||||
qname := strings.ToLower(question.Name)
|
||||
|
||||
records, rcode := resutil.LookupRecords(ctx, f.resolver, qname, question.Qtype, f.ttl)
|
||||
resp.Rcode = rcode
|
||||
resp.Answer = append(resp.Answer, records...)
|
||||
f.writeResponse(logger, w, resp, qname, startTime)
|
||||
}
|
||||
|
||||
func (f *DNSForwarder) writeResponse(logger *log.Entry, w dns.ResponseWriter, resp *dns.Msg, qname string, startTime time.Time) {
|
||||
if err := w.WriteMsg(resp); err != nil {
|
||||
logger.Errorf("failed to write DNS response: %v", err)
|
||||
|
||||
@@ -133,6 +133,41 @@ func (m *MockResolver) LookupNetIP(ctx context.Context, network, host string) ([
|
||||
return args.Get(0).([]netip.Addr), args.Error(1)
|
||||
}
|
||||
|
||||
func (m *MockResolver) LookupMX(ctx context.Context, name string) ([]*net.MX, error) {
|
||||
args := m.Called(ctx, name)
|
||||
recs, _ := args.Get(0).([]*net.MX)
|
||||
return recs, args.Error(1)
|
||||
}
|
||||
|
||||
func (m *MockResolver) LookupTXT(ctx context.Context, name string) ([]string, error) {
|
||||
args := m.Called(ctx, name)
|
||||
recs, _ := args.Get(0).([]string)
|
||||
return recs, args.Error(1)
|
||||
}
|
||||
|
||||
func (m *MockResolver) LookupNS(ctx context.Context, name string) ([]*net.NS, error) {
|
||||
args := m.Called(ctx, name)
|
||||
recs, _ := args.Get(0).([]*net.NS)
|
||||
return recs, args.Error(1)
|
||||
}
|
||||
|
||||
func (m *MockResolver) LookupSRV(ctx context.Context, service, proto, name string) (string, []*net.SRV, error) {
|
||||
args := m.Called(ctx, service, proto, name)
|
||||
recs, _ := args.Get(1).([]*net.SRV)
|
||||
return args.String(0), recs, args.Error(2)
|
||||
}
|
||||
|
||||
func (m *MockResolver) LookupCNAME(ctx context.Context, host string) (string, error) {
|
||||
args := m.Called(ctx, host)
|
||||
return args.String(0), args.Error(1)
|
||||
}
|
||||
|
||||
func (m *MockResolver) LookupAddr(ctx context.Context, addr string) ([]string, error) {
|
||||
args := m.Called(ctx, addr)
|
||||
recs, _ := args.Get(0).([]string)
|
||||
return recs, args.Error(1)
|
||||
}
|
||||
|
||||
func TestDNSForwarder_SubdomainAccessLogic(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -545,12 +580,15 @@ func TestDNSForwarder_MultipleIPsInSingleUpdate(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestDNSForwarder_ResponseCodes(t *testing.T) {
|
||||
// A type with no net.Resolver Lookup method (CAA) must answer NODATA
|
||||
// (NOERROR, empty) rather than NXDOMAIN/NOTIMP to avoid poisoning the name.
|
||||
tests := []struct {
|
||||
name string
|
||||
queryType uint16
|
||||
queryDomain string
|
||||
configured string
|
||||
expectedCode int
|
||||
expectEDE bool
|
||||
description string
|
||||
}{
|
||||
{
|
||||
@@ -562,28 +600,13 @@ func TestDNSForwarder_ResponseCodes(t *testing.T) {
|
||||
description: "RFC compliant REFUSED for unauthorized queries",
|
||||
},
|
||||
{
|
||||
name: "unsupported query type returns NOTIMP",
|
||||
queryType: dns.TypeMX,
|
||||
name: "unsupported query type returns NODATA",
|
||||
queryType: dns.TypeCAA,
|
||||
queryDomain: "example.com",
|
||||
configured: "example.com",
|
||||
expectedCode: dns.RcodeNotImplemented,
|
||||
description: "RFC compliant NOTIMP for unsupported types",
|
||||
},
|
||||
{
|
||||
name: "CNAME query returns NOTIMP",
|
||||
queryType: dns.TypeCNAME,
|
||||
queryDomain: "example.com",
|
||||
configured: "example.com",
|
||||
expectedCode: dns.RcodeNotImplemented,
|
||||
description: "CNAME queries not supported",
|
||||
},
|
||||
{
|
||||
name: "TXT query returns NOTIMP",
|
||||
queryType: dns.TypeTXT,
|
||||
queryDomain: "example.com",
|
||||
configured: "example.com",
|
||||
expectedCode: dns.RcodeNotImplemented,
|
||||
description: "TXT queries not supported",
|
||||
expectedCode: dns.RcodeSuccess,
|
||||
expectEDE: true,
|
||||
description: "Unsupported types answer NODATA, not NXDOMAIN/NOTIMP",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -599,6 +622,7 @@ func TestDNSForwarder_ResponseCodes(t *testing.T) {
|
||||
|
||||
query := &dns.Msg{}
|
||||
query.SetQuestion(dns.Fqdn(tt.queryDomain), tt.queryType)
|
||||
query.SetEdns0(dns.DefaultMsgSize, false)
|
||||
|
||||
// Capture the written response
|
||||
var writtenResp *dns.Msg
|
||||
@@ -614,10 +638,213 @@ func TestDNSForwarder_ResponseCodes(t *testing.T) {
|
||||
// Check the response written to the writer
|
||||
require.NotNil(t, writtenResp, "Expected response to be written")
|
||||
assert.Equal(t, tt.expectedCode, writtenResp.Rcode, tt.description)
|
||||
assert.Empty(t, writtenResp.Answer, "Non-address response should carry no answers")
|
||||
|
||||
if tt.expectEDE {
|
||||
require.NotNil(t, writtenResp.IsEdns0(), "EDNS0 client should get an OPT in the reply")
|
||||
assert.True(t, hasEDE(writtenResp, dns.ExtendedErrorCodeNotSupported),
|
||||
"unsupported type NODATA should carry EDE Not Supported")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func hasEDE(m *dns.Msg, code uint16) bool {
|
||||
opt := m.IsEdns0()
|
||||
if opt == nil {
|
||||
return false
|
||||
}
|
||||
for _, o := range opt.Option {
|
||||
if ede, ok := o.(*dns.EDNS0_EDE); ok && ede.InfoCode == code {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func TestDNSForwarder_RecordQueries(t *testing.T) {
|
||||
notFound := &net.DNSError{IsNotFound: true, Name: "example.com"}
|
||||
|
||||
t.Run("MX records are forwarded", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "example.com")
|
||||
|
||||
mockResolver.On("LookupMX", mock.Anything, "example.com.").
|
||||
Return([]*net.MX{{Host: "mail.example.com.", Pref: 10}}, nil).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "example.com", dns.TypeMX)
|
||||
require.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
require.Len(t, resp.Answer, 1)
|
||||
mx, ok := resp.Answer[0].(*dns.MX)
|
||||
require.True(t, ok, "answer should be an MX record")
|
||||
assert.Equal(t, uint16(10), mx.Preference)
|
||||
assert.Equal(t, "mail.example.com.", mx.Mx)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("missing MX is NODATA not NXDOMAIN", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "example.com")
|
||||
|
||||
// A not-found cannot prove the name is absent (it may exist with only
|
||||
// other record types), so it must answer NODATA, never NXDOMAIN.
|
||||
mockResolver.On("LookupMX", mock.Anything, "example.com.").
|
||||
Return(nil, notFound).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "example.com", dns.TypeMX)
|
||||
assert.Equal(t, dns.RcodeSuccess, resp.Rcode, "missing record must be NODATA")
|
||||
assert.Empty(t, resp.Answer)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("NS records are forwarded", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "example.com")
|
||||
|
||||
mockResolver.On("LookupNS", mock.Anything, "example.com.").
|
||||
Return([]*net.NS{{Host: "ns1.example.com."}}, nil).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "example.com", dns.TypeNS)
|
||||
require.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
require.Len(t, resp.Answer, 1)
|
||||
ns, ok := resp.Answer[0].(*dns.NS)
|
||||
require.True(t, ok, "answer should be an NS record")
|
||||
assert.Equal(t, "ns1.example.com.", ns.Ns)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("missing NS is NODATA", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "example.com")
|
||||
|
||||
mockResolver.On("LookupNS", mock.Anything, "example.com.").
|
||||
Return(nil, notFound).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "example.com", dns.TypeNS)
|
||||
assert.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
assert.Empty(t, resp.Answer)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("SRV records are forwarded", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "_sip._tcp.example.com")
|
||||
|
||||
mockResolver.On("LookupSRV", mock.Anything, "", "", "_sip._tcp.example.com.").
|
||||
Return("", []*net.SRV{{Target: "sip.example.com.", Port: 5060, Priority: 10, Weight: 5}}, nil).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "_sip._tcp.example.com", dns.TypeSRV)
|
||||
require.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
require.Len(t, resp.Answer, 1)
|
||||
srv, ok := resp.Answer[0].(*dns.SRV)
|
||||
require.True(t, ok, "answer should be an SRV record")
|
||||
assert.Equal(t, "sip.example.com.", srv.Target)
|
||||
assert.Equal(t, uint16(5060), srv.Port)
|
||||
assert.Equal(t, uint16(10), srv.Priority)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("missing SRV is NODATA", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "_sip._tcp.example.com")
|
||||
|
||||
mockResolver.On("LookupSRV", mock.Anything, "", "", "_sip._tcp.example.com.").
|
||||
Return("", nil, notFound).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "_sip._tcp.example.com", dns.TypeSRV)
|
||||
assert.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
assert.Empty(t, resp.Answer)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("TXT records are forwarded", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "example.com")
|
||||
|
||||
mockResolver.On("LookupTXT", mock.Anything, "example.com.").
|
||||
Return([]string{"v=spf1 -all"}, nil).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "example.com", dns.TypeTXT)
|
||||
require.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
require.Len(t, resp.Answer, 1)
|
||||
txt, ok := resp.Answer[0].(*dns.TXT)
|
||||
require.True(t, ok, "answer should be a TXT record")
|
||||
assert.Equal(t, []string{"v=spf1 -all"}, txt.Txt)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("CNAME record is forwarded", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "www.example.com")
|
||||
|
||||
mockResolver.On("LookupCNAME", mock.Anything, "www.example.com.").
|
||||
Return("target.example.com.", nil).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "www.example.com", dns.TypeCNAME)
|
||||
require.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
require.Len(t, resp.Answer, 1)
|
||||
cname, ok := resp.Answer[0].(*dns.CNAME)
|
||||
require.True(t, ok, "answer should be a CNAME record")
|
||||
assert.Equal(t, "target.example.com.", cname.Target)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("CNAME equal to the name is NODATA", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "example.com")
|
||||
|
||||
// No CNAME exists: LookupCNAME echoes the queried name back.
|
||||
mockResolver.On("LookupCNAME", mock.Anything, "example.com.").
|
||||
Return("example.com.", nil).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "example.com", dns.TypeCNAME)
|
||||
assert.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
assert.Empty(t, resp.Answer, "self-referential CNAME means no CNAME record")
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
|
||||
t.Run("PTR record is forwarded", func(t *testing.T) {
|
||||
mockResolver := &MockResolver{}
|
||||
forwarder := newRecordTestForwarder(t, mockResolver, "*.in-addr.arpa")
|
||||
|
||||
// The reverse name is parsed back to the address LookupAddr expects.
|
||||
mockResolver.On("LookupAddr", mock.Anything, "1.2.3.4").
|
||||
Return([]string{"host.example.com."}, nil).Once()
|
||||
|
||||
resp := runRecordQuery(t, forwarder, "4.3.2.1.in-addr.arpa", dns.TypePTR)
|
||||
require.Equal(t, dns.RcodeSuccess, resp.Rcode)
|
||||
require.Len(t, resp.Answer, 1)
|
||||
ptr, ok := resp.Answer[0].(*dns.PTR)
|
||||
require.True(t, ok, "answer should be a PTR record")
|
||||
assert.Equal(t, "host.example.com.", ptr.Ptr)
|
||||
mockResolver.AssertExpectations(t)
|
||||
})
|
||||
}
|
||||
|
||||
func newRecordTestForwarder(t *testing.T, r resolver, configured string) *DNSForwarder {
|
||||
t.Helper()
|
||||
forwarder := NewDNSForwarder(netip.MustParseAddrPort("127.0.0.1:0"), 300, nil, &peer.Status{}, nil)
|
||||
forwarder.resolver = r
|
||||
|
||||
d, err := domain.FromString(configured)
|
||||
require.NoError(t, err)
|
||||
forwarder.UpdateDomains([]*ForwarderEntry{{Domain: d, ResID: "test-res"}})
|
||||
return forwarder
|
||||
}
|
||||
|
||||
func runRecordQuery(t *testing.T, forwarder *DNSForwarder, qname string, qtype uint16) *dns.Msg {
|
||||
t.Helper()
|
||||
query := &dns.Msg{}
|
||||
query.SetQuestion(dns.Fqdn(qname), qtype)
|
||||
|
||||
mockWriter := &test.MockResponseWriter{}
|
||||
forwarder.handleDNSQuery(log.NewEntry(log.StandardLogger()), mockWriter, query, time.Now())
|
||||
|
||||
resp := mockWriter.GetLastResponse()
|
||||
require.NotNil(t, resp, "expected response to be written")
|
||||
return resp
|
||||
}
|
||||
|
||||
func TestDNSForwarder_UpstreamFailureEDE(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
|
||||
@@ -40,6 +40,7 @@ import (
|
||||
"github.com/netbirdio/netbird/client/internal/dnsfwd"
|
||||
"github.com/netbirdio/netbird/client/internal/expose"
|
||||
"github.com/netbirdio/netbird/client/internal/ingressgw"
|
||||
"github.com/netbirdio/netbird/client/internal/lazyconn"
|
||||
"github.com/netbirdio/netbird/client/internal/metrics"
|
||||
"github.com/netbirdio/netbird/client/internal/netflow"
|
||||
nftypes "github.com/netbirdio/netbird/client/internal/netflow/types"
|
||||
@@ -82,6 +83,12 @@ const (
|
||||
PeerConnectionTimeoutMax = 45000 // ms
|
||||
PeerConnectionTimeoutMin = 30000 // ms
|
||||
disableAutoUpdate = "disabled"
|
||||
|
||||
// systemInfoTimeout bounds how long the sync loop waits for system info / posture
|
||||
// check gathering. The gathering runs uncancellable system calls (process scan,
|
||||
// exec, os.Stat); without this bound a single stuck call freezes handleSync, and
|
||||
// thus syncMsgMux, for as long as the call hangs (observed multi-minute freezes).
|
||||
systemInfoTimeout = 15 * time.Second
|
||||
)
|
||||
|
||||
var ErrResetConnection = fmt.Errorf("reset connection")
|
||||
@@ -141,7 +148,9 @@ type EngineConfig struct {
|
||||
BlockInbound bool
|
||||
DisableIPv6 bool
|
||||
|
||||
LazyConnectionEnabled bool
|
||||
// LazyConnection is the MDM-sourced lazy-connection override; StateUnset defers to
|
||||
// the env var and management feature flag.
|
||||
LazyConnection lazyconn.State
|
||||
|
||||
MTU uint16
|
||||
|
||||
@@ -166,6 +175,7 @@ type EngineServices struct {
|
||||
StateManager *statemanager.Manager
|
||||
UpdateManager *updater.Manager
|
||||
ClientMetrics *metrics.ClientMetrics
|
||||
MetricsCtx context.Context
|
||||
}
|
||||
|
||||
// Engine is a mechanism responsible for reacting on Signal and Management stream events and managing connections to the remote peers.
|
||||
@@ -258,6 +268,7 @@ type Engine struct {
|
||||
|
||||
// clientMetrics collects and pushes metrics
|
||||
clientMetrics *metrics.ClientMetrics
|
||||
metricsCtx context.Context
|
||||
|
||||
jobExecutor *jobexec.Executor
|
||||
jobExecutorWG sync.WaitGroup
|
||||
@@ -310,6 +321,7 @@ func NewEngine(
|
||||
probeStunTurn: relay.NewStunTurnProbe(relay.DefaultCacheTTL),
|
||||
jobExecutor: jobexec.NewExecutor(),
|
||||
clientMetrics: services.ClientMetrics,
|
||||
metricsCtx: services.MetricsCtx,
|
||||
updateManager: services.UpdateManager,
|
||||
syncStoreDir: config.StateDir,
|
||||
}
|
||||
@@ -895,6 +907,16 @@ func (e *Engine) handleAutoUpdateVersion(autoUpdateSettings *mgmProto.AutoUpdate
|
||||
e.updateManager.SetVersion(autoUpdateSettings.Version, autoUpdateSettings.AlwaysUpdate)
|
||||
}
|
||||
|
||||
// phase times a sync sub-phase: it returns a function that records the elapsed
|
||||
// duration when called. Starting the timer at the call site keeps inter-phase
|
||||
// glue code out of the measurement.
|
||||
func (e *Engine) phase(name string) func() {
|
||||
start := time.Now()
|
||||
return func() {
|
||||
e.clientMetrics.RecordSyncPhase(e.ctx, name, time.Since(start))
|
||||
}
|
||||
}
|
||||
|
||||
func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
|
||||
started := time.Now()
|
||||
defer func() {
|
||||
@@ -914,7 +936,10 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
|
||||
e.handleAutoUpdateVersion(update.NetworkMap.PeerConfig.AutoUpdate)
|
||||
}
|
||||
|
||||
if err := e.updateNetbirdConfig(update.GetNetbirdConfig()); err != nil {
|
||||
done := e.phase("netbird_config")
|
||||
err := e.updateNetbirdConfig(update.GetNetbirdConfig())
|
||||
done()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -928,11 +953,16 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := e.updateChecksIfNew(update.Checks); err != nil {
|
||||
done = e.phase("checks")
|
||||
err = e.updateChecksIfNew(update.Checks)
|
||||
done()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
done = e.phase("persist")
|
||||
e.persistSyncResponse(update)
|
||||
done()
|
||||
|
||||
// only apply new changes and ignore old ones
|
||||
if err := e.updateNetworkMap(nm); err != nil {
|
||||
@@ -973,6 +1003,8 @@ func (e *Engine) updateNetbirdConfig(wCfg *mgmProto.NetbirdConfig) error {
|
||||
return fmt.Errorf("handle the flow configuration: %w", err)
|
||||
}
|
||||
|
||||
e.handleMetricsUpdate(wCfg.GetMetrics())
|
||||
|
||||
if err := e.PopulateNetbirdConfig(wCfg, nil); err != nil {
|
||||
log.Warnf("Failed to update DNS server config: %v", err)
|
||||
}
|
||||
@@ -1042,6 +1074,14 @@ func (e *Engine) handleFlowUpdate(config *mgmProto.FlowConfig) error {
|
||||
return e.flowManager.Update(flowConfig)
|
||||
}
|
||||
|
||||
func (e *Engine) handleMetricsUpdate(config *mgmProto.MetricsConfig) {
|
||||
if config == nil {
|
||||
return
|
||||
}
|
||||
log.Infof("received metrics configuration from management: enabled=%v", config.GetEnabled())
|
||||
e.clientMetrics.UpdatePushFromMgm(e.metricsCtx, config.GetEnabled())
|
||||
}
|
||||
|
||||
func toFlowLoggerConfig(config *mgmProto.FlowConfig) (*nftypes.FlowConfig, error) {
|
||||
if config.GetInterval() == nil {
|
||||
return nil, errors.New("flow interval is nil")
|
||||
@@ -1066,11 +1106,22 @@ func (e *Engine) updateChecksIfNew(checks []*mgmProto.Checks) error {
|
||||
}
|
||||
e.checks = checks
|
||||
|
||||
info, err := system.GetInfoWithChecks(e.ctx, checks, e.overlayAddresses()...)
|
||||
if err != nil {
|
||||
log.Warnf("failed to get system info with checks: %v", err)
|
||||
info = system.GetInfo(e.ctx)
|
||||
info, ok := system.GetInfoWithChecksTimeout(e.ctx, systemInfoTimeout, checks, e.overlayAddresses()...)
|
||||
if !ok {
|
||||
// Gathering timed out; skip the meta sync this cycle rather than blocking the
|
||||
// sync loop (and syncMsgMux) on a stuck system call. A later sync will retry.
|
||||
return nil
|
||||
}
|
||||
e.applyInfoFlags(info)
|
||||
|
||||
if err := e.mgmClient.SyncMeta(info); err != nil {
|
||||
return fmt.Errorf("could not sync meta: error %s", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// applyInfoFlags sets the engine's config-derived feature flags on the gathered system info.
|
||||
func (e *Engine) applyInfoFlags(info *system.Info) {
|
||||
info.SetFlags(
|
||||
e.config.RosenpassEnabled,
|
||||
e.config.RosenpassPermissive,
|
||||
@@ -1082,19 +1133,12 @@ func (e *Engine) updateChecksIfNew(checks []*mgmProto.Checks) error {
|
||||
e.config.BlockLANAccess,
|
||||
e.config.BlockInbound,
|
||||
e.config.DisableIPv6,
|
||||
e.config.LazyConnectionEnabled,
|
||||
e.config.EnableSSHRoot,
|
||||
e.config.EnableSSHSFTP,
|
||||
e.config.EnableSSHLocalPortForwarding,
|
||||
e.config.EnableSSHRemotePortForwarding,
|
||||
e.config.DisableSSHAuth,
|
||||
)
|
||||
|
||||
if err := e.mgmClient.SyncMeta(info); err != nil {
|
||||
log.Errorf("could not sync meta: error %s", err)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// overlayAddresses returns our own WireGuard overlay address (v4 and v6) so it
|
||||
@@ -1254,31 +1298,15 @@ func (e *Engine) receiveManagementEvents() {
|
||||
e.shutdownWg.Add(1)
|
||||
go func() {
|
||||
defer e.shutdownWg.Done()
|
||||
info, err := system.GetInfoWithChecks(e.ctx, e.checks, e.overlayAddresses()...)
|
||||
if err != nil {
|
||||
log.Warnf("failed to get system info with checks: %v", err)
|
||||
info, ok := system.GetInfoWithChecksTimeout(e.ctx, systemInfoTimeout, e.checks, e.overlayAddresses()...)
|
||||
if !ok {
|
||||
// Gathering timed out; connect the stream with base info so management
|
||||
// connectivity still comes up rather than blocking here.
|
||||
info = system.GetInfo(e.ctx)
|
||||
}
|
||||
info.SetFlags(
|
||||
e.config.RosenpassEnabled,
|
||||
e.config.RosenpassPermissive,
|
||||
&e.config.ServerSSHAllowed,
|
||||
e.config.DisableClientRoutes,
|
||||
e.config.DisableServerRoutes,
|
||||
e.config.DisableDNS,
|
||||
e.config.DisableFirewall,
|
||||
e.config.BlockLANAccess,
|
||||
e.config.BlockInbound,
|
||||
e.config.DisableIPv6,
|
||||
e.config.LazyConnectionEnabled,
|
||||
e.config.EnableSSHRoot,
|
||||
e.config.EnableSSHSFTP,
|
||||
e.config.EnableSSHLocalPortForwarding,
|
||||
e.config.EnableSSHRemotePortForwarding,
|
||||
e.config.DisableSSHAuth,
|
||||
)
|
||||
e.applyInfoFlags(info)
|
||||
|
||||
err = e.mgmClient.Sync(e.ctx, info, e.handleSync)
|
||||
err := e.mgmClient.Sync(e.ctx, info, e.handleSync)
|
||||
if err != nil {
|
||||
// happens if management is unavailable for a long time.
|
||||
// We want to cancel the operation of the whole client
|
||||
@@ -1371,13 +1399,16 @@ func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
|
||||
|
||||
dnsConfig := toDNSConfig(protoDNSConfig, e.wgInterface.Address())
|
||||
|
||||
done := e.phase("dns_server")
|
||||
if err := e.dnsServer.UpdateDNSServer(serial, dnsConfig); err != nil {
|
||||
log.Errorf("failed to update dns server, err: %v", err)
|
||||
}
|
||||
done()
|
||||
|
||||
e.routeManager.SetDNSForwarderPort(dnsConfig.ForwarderPort)
|
||||
|
||||
// apply routes first, route related actions might depend on routing being enabled
|
||||
done = e.phase("routes_classify")
|
||||
routes := toRoutes(networkMap.GetRoutes())
|
||||
serverRoutes, clientRoutes := e.routeManager.ClassifyRoutes(routes)
|
||||
|
||||
@@ -1386,29 +1417,60 @@ func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
|
||||
e.connMgr.UpdateRouteHAMap(clientRoutes)
|
||||
log.Debugf("updated lazy connection manager with %d HA groups", len(clientRoutes))
|
||||
}
|
||||
done()
|
||||
|
||||
done = e.phase("routes_apply")
|
||||
dnsRouteFeatureFlag := toDNSFeatureFlag(networkMap)
|
||||
if err := e.routeManager.UpdateRoutes(serial, serverRoutes, clientRoutes, dnsRouteFeatureFlag); err != nil {
|
||||
log.Errorf("failed to update routes: %v", err)
|
||||
}
|
||||
done()
|
||||
|
||||
done = e.phase("filtering")
|
||||
if e.acl != nil {
|
||||
e.acl.ApplyFiltering(networkMap, dnsRouteFeatureFlag)
|
||||
}
|
||||
done()
|
||||
|
||||
done = e.phase("dns_forwarder")
|
||||
fwdEntries := toRouteDomains(e.config.WgPrivateKey.PublicKey().String(), routes)
|
||||
e.updateDNSForwarder(dnsRouteFeatureFlag, fwdEntries)
|
||||
done()
|
||||
|
||||
// Ingress forward rules
|
||||
done = e.phase("forward_rules")
|
||||
forwardingRules, err := e.updateForwardRules(networkMap.GetForwardingRules())
|
||||
if err != nil {
|
||||
log.Errorf("failed to update forward rules, err: %v", err)
|
||||
}
|
||||
done()
|
||||
|
||||
log.Debugf("got peers update from Management Service, total peers to connect to = %d", len(networkMap.GetRemotePeers()))
|
||||
|
||||
done = e.phase("offline_peers")
|
||||
e.updateOfflinePeers(networkMap.GetOfflinePeers())
|
||||
done()
|
||||
|
||||
remotePeers, err := e.reconcilePeers(networkMap)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// must set the exclude list after the peers are added. Without it the manager can not figure out the peers parameters from the store
|
||||
done = e.phase("lazy_exclude")
|
||||
excludedLazyPeers := e.toExcludedLazyPeers(forwardingRules, remotePeers)
|
||||
e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
|
||||
done()
|
||||
|
||||
e.networkSerial = serial
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// reconcilePeers applies the remote peer list from the network map (removing,
|
||||
// modifying and adding peers, then updating SSH config) and returns the remote
|
||||
// peers with our own peer filtered out, for use by later sync steps.
|
||||
func (e *Engine) reconcilePeers(networkMap *mgmProto.NetworkMap) ([]*mgmProto.RemotePeerConfig, error) {
|
||||
// Filter out own peer from the remote peers list
|
||||
localPubKey := e.config.WgPrivateKey.PublicKey().String()
|
||||
remotePeers := make([]*mgmProto.RemotePeerConfig, 0, len(networkMap.GetRemotePeers()))
|
||||
@@ -1423,42 +1485,43 @@ func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
|
||||
err := e.removeAllPeers()
|
||||
e.statusRecorder.FinishPeerListModifications()
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
err := e.removePeers(remotePeers)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = e.modifyPeers(remotePeers)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = e.addNewPeers(remotePeers)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
e.statusRecorder.FinishPeerListModifications()
|
||||
|
||||
e.updatePeerSSHHostKeys(remotePeers)
|
||||
|
||||
if err := e.updateSSHClientConfig(remotePeers); err != nil {
|
||||
log.Warnf("failed to update SSH client config: %v", err)
|
||||
}
|
||||
|
||||
e.updateSSHServerAuth(networkMap.GetSshAuth())
|
||||
return remotePeers, nil
|
||||
}
|
||||
|
||||
// must set the exclude list after the peers are added. Without it the manager can not figure out the peers parameters from the store
|
||||
excludedLazyPeers := e.toExcludedLazyPeers(forwardingRules, remotePeers)
|
||||
e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
|
||||
done := e.phase("removed_peers")
|
||||
err := e.removePeers(remotePeers)
|
||||
done()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
e.networkSerial = serial
|
||||
done = e.phase("modified_peers")
|
||||
err = e.modifyPeers(remotePeers)
|
||||
done()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return nil
|
||||
done = e.phase("added_peers")
|
||||
err = e.addNewPeers(remotePeers)
|
||||
done()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
e.statusRecorder.FinishPeerListModifications()
|
||||
|
||||
e.updatePeerSSHHostKeys(remotePeers)
|
||||
|
||||
if err := e.updateSSHClientConfig(remotePeers); err != nil {
|
||||
log.Warnf("failed to update SSH client config: %v", err)
|
||||
}
|
||||
|
||||
e.updateSSHServerAuth(networkMap.GetSshAuth())
|
||||
|
||||
return remotePeers, nil
|
||||
}
|
||||
|
||||
func toDNSFeatureFlag(networkMap *mgmProto.NetworkMap) bool {
|
||||
@@ -1938,7 +2001,6 @@ func (e *Engine) readInitialSettings() ([]*route.Route, *nbdns.Config, bool, err
|
||||
e.config.BlockLANAccess,
|
||||
e.config.BlockInbound,
|
||||
e.config.DisableIPv6,
|
||||
e.config.LazyConnectionEnabled,
|
||||
e.config.EnableSSHRoot,
|
||||
e.config.EnableSSHSFTP,
|
||||
e.config.EnableSSHLocalPortForwarding,
|
||||
|
||||
@@ -178,6 +178,10 @@ func (m *MockWGIface) LastActivities() map[string]monotime.Time {
|
||||
return nil
|
||||
}
|
||||
|
||||
// MTU returns a fixed value of 1280 for this mock interface.
func (m *MockWGIface) MTU() uint16 {
|
||||
return 1280
|
||||
}
|
||||
|
||||
func (m *MockWGIface) SetPresharedKey(peerKey string, psk wgtypes.Key, updateOnly bool) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -44,4 +44,5 @@ type wgIfaceBase interface {
|
||||
FullStats() (*configurer.Stats, error)
|
||||
LastActivities() map[string]monotime.Time
|
||||
SetPresharedKey(peerKey string, psk wgtypes.Key, updateOnly bool) error
|
||||
MTU() uint16
|
||||
}
|
||||
|
||||
@@ -124,6 +124,11 @@ func (d *BindListener) ReadPackets() {
|
||||
d.done.Done()
|
||||
}
|
||||
|
||||
// CapturedPacket is unused in userspace bind mode: first-packet reinjection is kernel-only.
|
||||
func (d *BindListener) CapturedPacket() []byte {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Close stops the listener and cleans up resources.
|
||||
func (d *BindListener) Close() {
|
||||
d.peerCfg.Log.Infof("closing activity listener (LazyConn)")
|
||||
|
||||
@@ -45,10 +45,6 @@ type MockWGIfaceBind struct {
|
||||
endpointMgr *mockEndpointManager
|
||||
}
|
||||
|
||||
func (m *MockWGIfaceBind) RemovePeer(string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MockWGIfaceBind) UpdatePeer(string, []netip.Prefix, time.Duration, *net.UDPAddr, *wgtypes.Key) error {
|
||||
return nil
|
||||
}
|
||||
@@ -68,6 +64,10 @@ func (m *MockWGIfaceBind) GetBind() device.EndpointManager {
|
||||
return m.endpointMgr
|
||||
}
|
||||
|
||||
// MTU returns a fixed value of 1280 for this mock interface.
func (m *MockWGIfaceBind) MTU() uint16 {
|
||||
return 1280
|
||||
}
|
||||
|
||||
func TestBindListener_Creation(t *testing.T) {
|
||||
mockEndpointMgr := newMockEndpointManager()
|
||||
mockIface := &MockWGIfaceBind{endpointMgr: mockEndpointMgr}
|
||||
@@ -207,8 +207,9 @@ func TestManager_BindMode(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
|
||||
select {
|
||||
case peerConnID := <-mgr.OnActivityChan:
|
||||
assert.Equal(t, cfg.PeerConnID, peerConnID, "Received peer connection ID should match")
|
||||
case ev := <-mgr.OnActivityChan:
|
||||
assert.Equal(t, cfg.PeerConnID, ev.PeerConnID, "Received peer connection ID should match")
|
||||
assert.Nil(t, ev.FirstPacket, "Bind mode does not capture packets: reinjection is kernel-only")
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("timeout waiting for activity notification")
|
||||
}
|
||||
@@ -266,8 +267,8 @@ func TestManager_BindMode_MultiplePeers(t *testing.T) {
|
||||
receivedPeers := make(map[peerid.ConnID]bool)
|
||||
for i := 0; i < 2; i++ {
|
||||
select {
|
||||
case peerConnID := <-mgr.OnActivityChan:
|
||||
receivedPeers[peerConnID] = true
|
||||
case ev := <-mgr.OnActivityChan:
|
||||
receivedPeers[ev.PeerConnID] = true
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("timeout waiting for activity notifications")
|
||||
}
|
||||
|
||||
@@ -3,11 +3,13 @@ package activity
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"slices"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/netbirdio/netbird/client/iface/bufsize"
|
||||
"github.com/netbirdio/netbird/client/internal/lazyconn"
|
||||
)
|
||||
|
||||
@@ -20,6 +22,8 @@ type UDPListener struct {
|
||||
done sync.Mutex
|
||||
|
||||
isClosed atomic.Bool
|
||||
|
||||
capturedPacket []byte
|
||||
}
|
||||
|
||||
// NewUDPListener creates a listener that detects activity via UDP socket reads.
|
||||
@@ -46,9 +50,13 @@ func NewUDPListener(wgIface WgInterface, cfg lazyconn.PeerConfig) (*UDPListener,
|
||||
}
|
||||
|
||||
// ReadPackets blocks reading from the UDP socket until activity is detected or the listener is closed.
|
||||
// The first packet that triggers activity is captured so it can be reinjected through the real
|
||||
// transport once it is established. Without this, kernel WireGuard's handshake initiation would be
|
||||
// dropped and WG would only retry after REKEY_TIMEOUT.
|
||||
func (d *UDPListener) ReadPackets() {
|
||||
for {
|
||||
n, remoteAddr, err := d.conn.ReadFromUDP(make([]byte, 1))
|
||||
buf := make([]byte, int(d.wgIface.MTU())+bufsize.WGBufferOverhead)
|
||||
n, remoteAddr, err := d.conn.ReadFromUDP(buf)
|
||||
if err != nil {
|
||||
if d.isClosed.Load() {
|
||||
d.peerCfg.Log.Infof("exit from activity listener")
|
||||
@@ -62,20 +70,24 @@ func (d *UDPListener) ReadPackets() {
|
||||
d.peerCfg.Log.Warnf("received %d bytes from %s, too short", n, remoteAddr)
|
||||
continue
|
||||
}
|
||||
d.peerCfg.Log.Infof("activity detected")
|
||||
d.capturedPacket = slices.Clone(buf[:n])
|
||||
d.peerCfg.Log.Infof("activity detected, captured %d bytes for reinjection", n)
|
||||
break
|
||||
}
|
||||
|
||||
d.peerCfg.Log.Debugf("removing lazy endpoint: %s", d.endpoint.String())
|
||||
if err := d.wgIface.RemovePeer(d.peerCfg.PublicKey); err != nil {
|
||||
d.peerCfg.Log.Errorf("failed to remove endpoint: %s", err)
|
||||
}
|
||||
|
||||
// Ignore close error as it may return "use of closed network connection" if already closed.
|
||||
// Leave the peer in place. ConfigureWGEndpoint will UpdatePeer with the real endpoint;
|
||||
// removing the peer here wipes kernel WG's staged queue and drops the user packet that
|
||||
// triggered activation.
|
||||
_ = d.conn.Close()
|
||||
d.done.Unlock()
|
||||
}
|
||||
|
||||
// CapturedPacket returns the first packet that triggered activity, or nil if none was captured.
|
||||
// Safe to call after ReadPackets returns.
|
||||
func (d *UDPListener) CapturedPacket() []byte {
|
||||
return d.capturedPacket
|
||||
}
|
||||
|
||||
// Close stops the listener and cleans up resources.
|
||||
func (d *UDPListener) Close() {
|
||||
d.peerCfg.Log.Infof("closing activity listener: %s", d.conn.LocalAddr().String())
|
||||
|
||||
@@ -19,17 +19,25 @@ import (
|
||||
type listener interface {
|
||||
ReadPackets()
|
||||
Close()
|
||||
CapturedPacket() []byte
|
||||
}
|
||||
|
||||
// Event reports activity on a managed peer. FirstPacket is the bytes that triggered activation,
|
||||
// captured for reinjection through the real transport.
|
||||
type Event struct {
|
||||
PeerConnID peerid.ConnID
|
||||
FirstPacket []byte
|
||||
}
|
||||
|
||||
type WgInterface interface {
|
||||
RemovePeer(peerKey string) error
|
||||
UpdatePeer(peerKey string, allowedIps []netip.Prefix, keepAlive time.Duration, endpoint *net.UDPAddr, preSharedKey *wgtypes.Key) error
|
||||
IsUserspaceBind() bool
|
||||
Address() wgaddr.Address
|
||||
MTU() uint16
|
||||
}
|
||||
|
||||
type Manager struct {
|
||||
OnActivityChan chan peerid.ConnID
|
||||
OnActivityChan chan Event
|
||||
|
||||
wgIface WgInterface
|
||||
|
||||
@@ -41,7 +49,7 @@ type Manager struct {
|
||||
|
||||
func NewManager(wgIface WgInterface) *Manager {
|
||||
m := &Manager{
|
||||
OnActivityChan: make(chan peerid.ConnID, 1),
|
||||
OnActivityChan: make(chan Event, 1),
|
||||
wgIface: wgIface,
|
||||
peers: make(map[peerid.ConnID]listener),
|
||||
done: make(chan struct{}),
|
||||
@@ -116,12 +124,12 @@ func (m *Manager) waitForTraffic(l listener, peerConnID peerid.ConnID) {
|
||||
delete(m.peers, peerConnID)
|
||||
m.mu.Unlock()
|
||||
|
||||
m.notify(peerConnID)
|
||||
m.notify(Event{PeerConnID: peerConnID, FirstPacket: l.CapturedPacket()})
|
||||
}
|
||||
|
||||
func (m *Manager) notify(peerConnID peerid.ConnID) {
|
||||
func (m *Manager) notify(ev Event) {
|
||||
select {
|
||||
case <-m.done:
|
||||
case m.OnActivityChan <- peerConnID:
|
||||
case m.OnActivityChan <- ev:
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package activity
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"net"
|
||||
"net/netip"
|
||||
"testing"
|
||||
@@ -25,10 +26,6 @@ func (m *MocPeer) ConnID() peerid.ConnID {
|
||||
type MocWGIface struct {
|
||||
}
|
||||
|
||||
func (m MocWGIface) RemovePeer(string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m MocWGIface) UpdatePeer(string, []netip.Prefix, time.Duration, *net.UDPAddr, *wgtypes.Key) error {
|
||||
return nil
|
||||
}
|
||||
@@ -44,6 +41,10 @@ func (m MocWGIface) Address() wgaddr.Address {
|
||||
}
|
||||
}
|
||||
|
||||
// MTU returns a fixed value of 1280 for this mock interface.
func (m MocWGIface) MTU() uint16 {
|
||||
return 1280
|
||||
}
|
||||
|
||||
// GetPeerListener is a test helper to access listeners
|
||||
func (m *Manager) GetPeerListener(peerConnID peerid.ConnID) (listener, bool) {
|
||||
m.mu.Lock()
|
||||
@@ -86,11 +87,15 @@ func TestManager_MonitorPeerActivity(t *testing.T) {
|
||||
}
|
||||
|
||||
select {
|
||||
case peerConnID := <-mgr.OnActivityChan:
|
||||
if peerConnID != peerCfg1.PeerConnID {
|
||||
t.Fatalf("unexpected peerConnID: %v", peerConnID)
|
||||
case ev := <-mgr.OnActivityChan:
|
||||
if ev.PeerConnID != peerCfg1.PeerConnID {
|
||||
t.Fatalf("unexpected peerConnID: %v", ev.PeerConnID)
|
||||
}
|
||||
if !bytes.Equal(ev.FirstPacket, []byte{0x01, 0x02, 0x03, 0x04, 0x05}) {
|
||||
t.Fatalf("unexpected first packet: %v", ev.FirstPacket)
|
||||
}
|
||||
case <-time.After(1 * time.Second):
|
||||
t.Fatal("timed out waiting for activity")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,24 +3,57 @@ package lazyconn
|
||||
import (
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
const (
|
||||
EnvEnableLazyConn = "NB_ENABLE_EXPERIMENTAL_LAZY_CONN"
|
||||
EnvLazyConn = "NB_LAZY_CONN"
|
||||
EnvInactivityThreshold = "NB_LAZY_CONN_INACTIVITY_THRESHOLD"
|
||||
)
|
||||
|
||||
func IsLazyConnEnabledByEnv() bool {
|
||||
val := os.Getenv(EnvEnableLazyConn)
|
||||
if val == "" {
|
||||
return false
|
||||
}
|
||||
enabled, err := strconv.ParseBool(val)
|
||||
if err != nil {
|
||||
log.Warnf("failed to parse %s: %v", EnvEnableLazyConn, err)
|
||||
return false
|
||||
}
|
||||
return enabled
|
||||
// State is the tri-state local override for lazy connections read from the environment.
|
||||
type State int
|
||||
|
||||
const (
|
||||
// StateUnset means no local override; defer to the management feature flag.
|
||||
StateUnset State = iota
|
||||
// StateOn forces lazy connections on, overriding management.
|
||||
StateOn
|
||||
// StateOff forces lazy connections off, overriding management.
|
||||
StateOff
|
||||
)
|
||||
|
||||
// EnvState reads NB_LAZY_CONN and returns the local override state.
|
||||
func EnvState() State {
|
||||
return ParseState(os.Getenv(EnvLazyConn))
|
||||
}
|
||||
|
||||
// ParseState interprets a lazy-connection override value (from the environment or an MDM
|
||||
// policy). It accepts the on/off aliases plus any value strconv.ParseBool understands
|
||||
// (true/false/1/0). An empty or unrecognized value returns StateUnset so that the
|
||||
// management feature flag remains in control.
|
||||
func ParseState(raw string) State {
|
||||
if raw == "" {
|
||||
return StateUnset
|
||||
}
|
||||
|
||||
normalized := strings.ToLower(strings.TrimSpace(raw))
|
||||
switch normalized {
|
||||
case "on":
|
||||
return StateOn
|
||||
case "off":
|
||||
return StateOff
|
||||
}
|
||||
|
||||
enabled, err := strconv.ParseBool(normalized)
|
||||
if err != nil {
|
||||
log.Warnf("failed to parse lazy connection value %q (from %s env or MDM policy): %v", raw, EnvLazyConn, err)
|
||||
return StateUnset
|
||||
}
|
||||
if enabled {
|
||||
return StateOn
|
||||
}
|
||||
return StateOff
|
||||
}
|
||||
|
||||
45
client/internal/lazyconn/env_test.go
Normal file
45
client/internal/lazyconn/env_test.go
Normal file
@@ -0,0 +1,45 @@
|
||||
package lazyconn
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestEnvState(t *testing.T) {
|
||||
tests := []struct {
|
||||
value string
|
||||
set bool
|
||||
want State
|
||||
}{
|
||||
{set: false, want: StateUnset},
|
||||
{value: "", set: true, want: StateUnset},
|
||||
{value: "on", set: true, want: StateOn},
|
||||
{value: "ON", set: true, want: StateOn},
|
||||
{value: "true", set: true, want: StateOn},
|
||||
{value: "1", set: true, want: StateOn},
|
||||
{value: " on ", set: true, want: StateOn},
|
||||
{value: "off", set: true, want: StateOff},
|
||||
{value: "OFF", set: true, want: StateOff},
|
||||
{value: "false", set: true, want: StateOff},
|
||||
{value: "0", set: true, want: StateOff},
|
||||
{value: "auto", set: true, want: StateUnset},
|
||||
{value: "garbage", set: true, want: StateUnset},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
name := tt.value
|
||||
if !tt.set {
|
||||
name = "unset"
|
||||
}
|
||||
t.Run(name, func(t *testing.T) {
|
||||
t.Setenv(EnvLazyConn, tt.value)
|
||||
if !tt.set {
|
||||
os.Unsetenv(EnvLazyConn)
|
||||
}
|
||||
|
||||
if got := EnvState(); got != tt.want {
|
||||
t.Fatalf("EnvState() = %v, want %v", got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -130,8 +130,8 @@ func (m *Manager) Start(ctx context.Context) {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case peerConnID := <-m.activityManager.OnActivityChan:
|
||||
m.onPeerActivity(peerConnID)
|
||||
case ev := <-m.activityManager.OnActivityChan:
|
||||
m.onPeerActivity(ev)
|
||||
case peerIDs := <-m.inactivityManager.InactivePeersChan():
|
||||
m.onPeerInactivityTimedOut(peerIDs)
|
||||
}
|
||||
@@ -513,13 +513,13 @@ func (m *Manager) checkHaGroupActivity(haGroup route.HAUniqueID, peerID string,
|
||||
return false
|
||||
}
|
||||
|
||||
func (m *Manager) onPeerActivity(peerConnID peerid.ConnID) {
|
||||
func (m *Manager) onPeerActivity(ev activity.Event) {
|
||||
m.managedPeersMu.Lock()
|
||||
defer m.managedPeersMu.Unlock()
|
||||
|
||||
mp, ok := m.managedPeersByConnID[peerConnID]
|
||||
mp, ok := m.managedPeersByConnID[ev.PeerConnID]
|
||||
if !ok {
|
||||
log.Errorf("peer not found by conn id: %v", peerConnID)
|
||||
log.Errorf("peer not found by conn id: %v", ev.PeerConnID)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -536,7 +536,7 @@ func (m *Manager) onPeerActivity(peerConnID peerid.ConnID) {
|
||||
|
||||
m.activateHAGroupPeers(mp.peerCfg)
|
||||
|
||||
m.peerStore.PeerConnOpen(m.engineCtx, mp.peerCfg.PublicKey)
|
||||
m.peerStore.PeerConnOpenWithFirstPacket(m.engineCtx, mp.peerCfg.PublicKey, ev.FirstPacket)
|
||||
}
|
||||
|
||||
func (m *Manager) onPeerInactivityTimedOut(peerIDs map[string]struct{}) {
|
||||
|
||||
@@ -17,4 +17,5 @@ type WGIface interface {
|
||||
IsUserspaceBind() bool
|
||||
Address() wgaddr.Address
|
||||
LastActivities() map[string]monotime.Time
|
||||
MTU() uint16
|
||||
}
|
||||
|
||||
@@ -60,6 +60,13 @@ func getMetricsInterval() time.Duration {
|
||||
return interval
|
||||
}
|
||||
|
||||
// isMetricsPushEnvSet returns true if NB_METRICS_PUSH_ENABLED is explicitly set (to any value).
|
||||
// When set, the env var takes full precedence over management server configuration.
|
||||
func isMetricsPushEnvSet() bool {
|
||||
_, set := os.LookupEnv(EnvMetricsPushEnabled)
|
||||
return set
|
||||
}
|
||||
|
||||
func isForceSending() bool {
|
||||
force, _ := strconv.ParseBool(os.Getenv(EnvMetricsForceSending))
|
||||
return force
|
||||
|
||||
@@ -120,6 +120,30 @@ func (m *influxDBMetrics) RecordSyncDuration(_ context.Context, agentInfo AgentI
|
||||
m.trimLocked()
|
||||
}
|
||||
|
||||
func (m *influxDBMetrics) RecordSyncPhase(_ context.Context, agentInfo AgentInfo, phase string, duration time.Duration) {
|
||||
tags := fmt.Sprintf("deployment_type=%s,version=%s,os=%s,arch=%s,peer_id=%s,phase=%s",
|
||||
agentInfo.DeploymentType.String(),
|
||||
agentInfo.Version,
|
||||
agentInfo.OS,
|
||||
agentInfo.Arch,
|
||||
agentInfo.peerID,
|
||||
phase,
|
||||
)
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
m.samples = append(m.samples, influxSample{
|
||||
measurement: "netbird_sync_phase",
|
||||
tags: tags,
|
||||
fields: map[string]float64{
|
||||
"duration_seconds": duration.Seconds(),
|
||||
},
|
||||
timestamp: time.Now(),
|
||||
})
|
||||
m.trimLocked()
|
||||
}
|
||||
|
||||
func (m *influxDBMetrics) RecordLoginDuration(_ context.Context, agentInfo AgentInfo, duration time.Duration, success bool) {
|
||||
result := "success"
|
||||
if !success {
|
||||
|
||||
@@ -78,6 +78,25 @@ Tags:
|
||||
- `os`: Operating system (linux, darwin, windows, android, ios, etc.)
|
||||
- `arch`: CPU architecture (amd64, arm64, etc.)
|
||||
|
||||
### Sync Phase Timing
|
||||
|
||||
Measurement: `netbird_sync_phase`
|
||||
|
||||
Breaks down where time goes inside a single sync, so the total `netbird_sync` duration can be attributed to the sub-step that dominates.
|
||||
|
||||
| Field | Description |
|
||||
|-------|-------------|
|
||||
| `duration_seconds` | Time spent in one sub-phase of sync processing |
|
||||
|
||||
Tags:
|
||||
- `phase`: the sub-phase — `netbird_config`, `checks`, `persist`, `dns_server`, `routes_classify`, `routes_apply`, `filtering`, `dns_forwarder`, `forward_rules`, `offline_peers`, `removed_peers`, `modified_peers`, `added_peers`, `lazy_exclude`
|
||||
- `deployment_type`: "cloud" | "selfhosted" | "unknown"
|
||||
- `version`: NetBird version string
|
||||
- `os`: Operating system (linux, darwin, windows, android, ios, etc.)
|
||||
- `arch`: CPU architecture (amd64, arm64, etc.)
|
||||
|
||||
**Note:** this is wall-time per phase — it includes both CPU work and time spent waiting on locks. A slow phase points to *where* the time goes, not *why*; pair it with lock-wait metrics to tell contention apart from real work.
|
||||
|
||||
### Login Duration
|
||||
|
||||
Measurement: `netbird_login`
|
||||
@@ -191,4 +210,52 @@ docker compose exec influxdb influx query \
|
||||
|
||||
# Check ingest server health
|
||||
curl http://localhost:8087/health
|
||||
```
|
||||
```
|
||||
|
||||
## Analyzing a Debug Bundle
|
||||
|
||||
Metrics collection is always on, so every debug bundle ships a `metrics.txt` in InfluxDB line protocol — a timestamped time series of all recorded events (sync durations, sync phases, connection stages, login). You can replay it into the local stack and graph it, without a running client.
|
||||
|
||||
The bundle's `metrics.txt` is a rolling window (capped at 5 days / ~20k samples, see [Buffer Limits](#buffer-limits)). For a connection incident the relevant window is short (connection setup is seconds), so a bundle captured during the issue is enough.
|
||||
|
||||
### 1. Start the stack
|
||||
|
||||
```bash
|
||||
# From this directory (client/internal/metrics/infra)
|
||||
INFLUXDB_ADMIN_TOKEN=admin123 INFLUXDB_ADMIN_PASSWORD=admin123 GRAFANA_ADMIN_PASSWORD=admin123 \
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
(`admin123` is a throwaway local credential — fine for offline analysis.)
|
||||
|
||||
### 2. Clear any previous data
|
||||
|
||||
Delete any existing data so the dashboards show only this bundle:
|
||||
|
||||
```bash
|
||||
docker exec influxdb influx delete --org netbird --bucket metrics --token admin123 \
|
||||
--start 1970-01-01T00:00:00Z --stop 2100-01-01T00:00:00Z
|
||||
```
|
||||
|
||||
### 3. Import the bundle's metrics.txt
|
||||
|
||||
InfluxDB is not exposed on the host, so import inside the container:
|
||||
|
||||
```bash
|
||||
docker cp /path/to/bundle/metrics.txt influxdb:/tmp/m.txt
|
||||
docker exec influxdb influx write --org netbird --bucket metrics --precision ns \
|
||||
--token admin123 --file /tmp/m.txt
|
||||
```
|
||||
|
||||
Re-importing the same file is idempotent: a point with the same measurement, tag set, and timestamp overwrites the existing one instead of creating a duplicate.
|
||||
|
||||
### 4. View the dashboards
|
||||
|
||||
Grafana on http://localhost:3001 (login `admin` / `admin123`), datasource pre-provisioned:
|
||||
|
||||
- **Where sync time goes:** http://localhost:3001/d/netbird-sync-phases/netbird-sync-phases-where-time-goes
|
||||
- **General client metrics:** http://localhost:3001/d/netbird-influxdb-metrics
|
||||
|
||||
**Set the time range** to cover the bundle's timestamps (e.g. "Last 7 days" or an absolute range matching when the bundle was taken) — with the default short range the panels look empty.
|
||||
|
||||
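Imported bundles can only be told apart by the tags they already carry, such as `version`. To compare several bundles side by side, add a distinguishing tag at import time (e.g. `sed 's/^netbird_\([a-z_]*\),/netbird_\1,bundle=mycase,/' metrics.txt`), write the result to a new file, and import that file instead.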
|
||||
@@ -0,0 +1,259 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"refresh": "",
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"netbird",
|
||||
"sync"
|
||||
],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"definition": "import \"influxdata/influxdb/schema\"\nschema.tagValues(bucket: \"metrics\", tag: \"version\")",
|
||||
"includeAll": true,
|
||||
"label": "version",
|
||||
"multi": true,
|
||||
"name": "version",
|
||||
"query": "import \"influxdata/influxdb/schema\"\nschema.tagValues(bucket: \"metrics\", tag: \"version\")",
|
||||
"refresh": 2,
|
||||
"type": "query",
|
||||
"allValue": ".*"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-2d",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "NetBird Sync Phases (where time goes)",
|
||||
"uid": "netbird-sync-phases",
|
||||
"version": 1,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Time per phase over time (stacked, ms)",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 10,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"drawStyle": "bars",
|
||||
"stacking": {
|
||||
"mode": "normal",
|
||||
"group": "A"
|
||||
},
|
||||
"fillOpacity": 80,
|
||||
"lineWidth": 0
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"max",
|
||||
"mean"
|
||||
]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi",
|
||||
"sort": "desc"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n |> filter(fn: (r) => r.version =~ /${version:regex}/)\n |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n |> keep(columns: [\"_time\", \"_value\", \"phase\"])\n |> group(columns: [\"phase\"])"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "p95 per phase (ms)",
|
||||
"type": "bargauge",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 11,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"color": {
|
||||
"mode": "continuous-GrYlRd"
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showUnfilled": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n |> filter(fn: (r) => r.version =~ /${version:regex}/)\n |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n |> group(columns: [\"phase\"])\n |> quantile(q: 0.95)\n |> group()\n |> sort(columns: [\"_value\"], desc: true)"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Per-phase stats (ms): mean / p95 / max",
|
||||
"type": "table",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 11,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "max",
|
||||
"desc": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"transformations": [
|
||||
{
|
||||
"id": "merge",
|
||||
"options": {}
|
||||
}
|
||||
],
|
||||
"targets": [
|
||||
{
|
||||
"refId": "mean",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n |> filter(fn: (r) => r.version =~ /${version:regex}/)\n |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n |> group(columns: [\"phase\"])\n |> mean()\n |> group()\n |> keep(columns: [\"phase\", \"_value\"])\n |> rename(columns: {_value: \"mean\"})"
|
||||
},
|
||||
{
|
||||
"refId": "p95",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n |> filter(fn: (r) => r.version =~ /${version:regex}/)\n |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n |> group(columns: [\"phase\"])\n |> quantile(q: 0.95)\n |> group()\n |> keep(columns: [\"phase\", \"_value\"])\n |> rename(columns: {_value: \"p95\"})"
|
||||
},
|
||||
{
|
||||
"refId": "max",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n |> filter(fn: (r) => r.version =~ /${version:regex}/)\n |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n |> group(columns: [\"phase\"])\n |> max()\n |> group()\n |> keep(columns: [\"phase\", \"_value\"])\n |> rename(columns: {_value: \"max\"})"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Total sync duration (netbird_sync, ms) \u2014 reference",
|
||||
"type": "timeseries",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "ms",
|
||||
"custom": {
|
||||
"drawStyle": "points",
|
||||
"pointSize": 5
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"max",
|
||||
"mean"
|
||||
]
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "single"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "influxdb"
|
||||
},
|
||||
"query": "from(bucket: \"metrics\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n |> filter(fn: (r) => r.version =~ /${version:regex}/)\n |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n |> keep(columns: [\"_time\", \"_value\", \"version\"])\n |> group(columns: [\"version\"])"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -19,7 +19,7 @@ const (
|
||||
defaultListenAddr = ":8087"
|
||||
defaultInfluxDBURL = "http://influxdb:8086/api/v2/write?org=netbird&bucket=metrics&precision=ns"
|
||||
maxBodySize = 50 * 1024 * 1024 // 50 MB max request body
|
||||
maxDurationSeconds = 300.0 // reject any duration field > 5 minutes
|
||||
maxDurationSeconds = 86400.0 // reject any duration field > 24 hours
|
||||
peerIDLength = 16 // truncated SHA-256: 8 bytes = 16 hex chars
|
||||
maxTagValueLength = 64 // reject tag values longer than this
|
||||
)
|
||||
@@ -59,6 +59,19 @@ var allowedMeasurements = map[string]measurementSpec{
|
||||
"peer_id": true,
|
||||
},
|
||||
},
|
||||
"netbird_sync_phase": {
|
||||
allowedFields: map[string]bool{
|
||||
"duration_seconds": true,
|
||||
},
|
||||
allowedTags: map[string]bool{
|
||||
"deployment_type": true,
|
||||
"version": true,
|
||||
"os": true,
|
||||
"arch": true,
|
||||
"peer_id": true,
|
||||
"phase": true,
|
||||
},
|
||||
},
|
||||
"netbird_login": {
|
||||
allowedFields: map[string]bool{
|
||||
"duration_seconds": true,
|
||||
|
||||
@@ -53,14 +53,14 @@ func TestValidateLine_NegativeValue(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestValidateLine_DurationTooLarge(t *testing.T) {
|
||||
line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=999 1234567890`
|
||||
line := `netbird_sync,deployment_type=cloud,version=1.0.0,os=linux,arch=amd64,peer_id=abc duration_seconds=100000 1234567890`
|
||||
err := validateLine(line)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "too large")
|
||||
}
|
||||
|
||||
func TestValidateLine_TotalSecondsTooLarge(t *testing.T) {
|
||||
line := `netbird_peer_connection,deployment_type=cloud,connection_type=ice,attempt_type=initial,version=1.0.0,os=linux,arch=amd64,peer_id=abc,connection_pair_id=pair total_seconds=500 1234567890`
|
||||
line := `netbird_peer_connection,deployment_type=cloud,connection_type=ice,attempt_type=initial,version=1.0.0,os=linux,arch=amd64,peer_id=abc,connection_pair_id=pair total_seconds=100000 1234567890`
|
||||
err := validateLine(line)
|
||||
require.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "too large")
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
@@ -56,6 +57,9 @@ type metricsImplementation interface {
|
||||
// RecordSyncDuration records how long it took to process a sync message
|
||||
RecordSyncDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration)
|
||||
|
||||
// RecordSyncPhase records how long a single sub-phase of sync processing took
|
||||
RecordSyncPhase(ctx context.Context, agentInfo AgentInfo, phase string, duration time.Duration)
|
||||
|
||||
// RecordLoginDuration records how long the login to management took
|
||||
RecordLoginDuration(ctx context.Context, agentInfo AgentInfo, duration time.Duration, success bool)
|
||||
|
||||
@@ -72,7 +76,7 @@ type ClientMetrics struct {
|
||||
agentInfo AgentInfo
|
||||
mu sync.RWMutex
|
||||
|
||||
push *Push
|
||||
push atomic.Pointer[Push]
|
||||
pushMu sync.Mutex
|
||||
wg sync.WaitGroup
|
||||
pushCancel context.CancelFunc
|
||||
@@ -127,6 +131,18 @@ func (c *ClientMetrics) RecordSyncDuration(ctx context.Context, duration time.Du
|
||||
c.impl.RecordSyncDuration(ctx, agentInfo, duration)
|
||||
}
|
||||
|
||||
// RecordSyncPhase records the duration of a single sub-phase of sync processing
|
||||
func (c *ClientMetrics) RecordSyncPhase(ctx context.Context, phase string, duration time.Duration) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.mu.RLock()
|
||||
agentInfo := c.agentInfo
|
||||
c.mu.RUnlock()
|
||||
|
||||
c.impl.RecordSyncPhase(ctx, agentInfo, phase, duration)
|
||||
}
|
||||
|
||||
// RecordLoginDuration records how long the login to management server took
|
||||
func (c *ClientMetrics) RecordLoginDuration(ctx context.Context, duration time.Duration, success bool) {
|
||||
if c == nil {
|
||||
@@ -152,10 +168,7 @@ func (c *ClientMetrics) UpdateAgentInfo(agentInfo AgentInfo, publicKey string) {
|
||||
c.agentInfo = agentInfo
|
||||
c.mu.Unlock()
|
||||
|
||||
c.pushMu.Lock()
|
||||
push := c.push
|
||||
c.pushMu.Unlock()
|
||||
if push != nil {
|
||||
if push := c.push.Load(); push != nil {
|
||||
push.SetPeerID(agentInfo.peerID)
|
||||
}
|
||||
}
|
||||
@@ -169,7 +182,7 @@ func (c *ClientMetrics) Export(w io.Writer) error {
|
||||
return c.impl.Export(w)
|
||||
}
|
||||
|
||||
// StartPush starts periodic pushing of metrics with the given configuration
|
||||
// StartPush starts periodic pushing of metrics with the given configuration.
|
||||
// Precedence: PushConfig.ServerAddress > remote config server_url
|
||||
func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) {
|
||||
if c == nil {
|
||||
@@ -179,11 +192,58 @@ func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) {
|
||||
c.pushMu.Lock()
|
||||
defer c.pushMu.Unlock()
|
||||
|
||||
if c.push != nil {
|
||||
if c.push.Load() != nil {
|
||||
log.Warnf("metrics push already running")
|
||||
return
|
||||
}
|
||||
|
||||
c.startPushLocked(ctx, config)
|
||||
}
|
||||
|
||||
// StopPush stops the periodic metrics push.
|
||||
func (c *ClientMetrics) StopPush() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.pushMu.Lock()
|
||||
defer c.pushMu.Unlock()
|
||||
|
||||
c.stopPushLocked()
|
||||
}
|
||||
|
||||
// UpdatePushFromMgm updates metrics push based on management server configuration.
|
||||
// If NB_METRICS_PUSH_ENABLED is explicitly set (true or false), management config is ignored.
|
||||
// When unset, management controls whether push is enabled.
|
||||
func (c *ClientMetrics) UpdatePushFromMgm(ctx context.Context, enabled bool) {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if isMetricsPushEnvSet() {
|
||||
log.Debugf("ignoring management config, env var is explicitly set: %s", EnvMetricsPushEnabled)
|
||||
return
|
||||
}
|
||||
|
||||
c.pushMu.Lock()
|
||||
defer c.pushMu.Unlock()
|
||||
|
||||
if enabled {
|
||||
if c.push.Load() != nil {
|
||||
return
|
||||
}
|
||||
log.Infof("enabled metrics push by management")
|
||||
c.startPushLocked(ctx, PushConfigFromEnv())
|
||||
} else {
|
||||
if c.push.Load() == nil {
|
||||
return
|
||||
}
|
||||
log.Infof("disabled metrics push by management")
|
||||
c.stopPushLocked()
|
||||
}
|
||||
}
|
||||
|
||||
// startPushLocked starts push. Caller must hold pushMu.
|
||||
func (c *ClientMetrics) startPushLocked(ctx context.Context, config PushConfig) {
|
||||
c.mu.RLock()
|
||||
agentVersion := c.agentInfo.Version
|
||||
peerID := c.agentInfo.peerID
|
||||
@@ -199,26 +259,23 @@ func (c *ClientMetrics) StartPush(ctx context.Context, config PushConfig) {
|
||||
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
c.pushCancel = cancel
|
||||
c.push.Store(push)
|
||||
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
push.Start(ctx)
|
||||
c.push.CompareAndSwap(push, nil)
|
||||
}()
|
||||
c.push = push
|
||||
}
|
||||
|
||||
func (c *ClientMetrics) StopPush() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
c.pushMu.Lock()
|
||||
defer c.pushMu.Unlock()
|
||||
if c.push == nil {
|
||||
// stopPushLocked stops push. Caller must hold pushMu.
|
||||
func (c *ClientMetrics) stopPushLocked() {
|
||||
if c.push.Load() == nil {
|
||||
return
|
||||
}
|
||||
|
||||
c.pushCancel()
|
||||
c.wg.Wait()
|
||||
c.push = nil
|
||||
c.push.Store(nil)
|
||||
}
|
||||
|
||||
@@ -70,6 +70,9 @@ func (m *mockMetrics) RecordConnectionStages(_ context.Context, _ AgentInfo, _ s
|
||||
func (m *mockMetrics) RecordSyncDuration(_ context.Context, _ AgentInfo, _ time.Duration) {
|
||||
}
|
||||
|
||||
func (m *mockMetrics) RecordSyncPhase(_ context.Context, _ AgentInfo, _ string, _ time.Duration) {
|
||||
}
|
||||
|
||||
func (m *mockMetrics) RecordLoginDuration(_ context.Context, _ AgentInfo, _ time.Duration, _ bool) {
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"net"
|
||||
"net/netip"
|
||||
"runtime"
|
||||
"slices"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -136,6 +137,39 @@ type Conn struct {
|
||||
// Connection stage timestamps for metrics
|
||||
metricsRecorder MetricsRecorder
|
||||
metricsStages *MetricsStages
|
||||
|
||||
// pendingFirstPacket is the lazyconn-captured handshake init, replayed once the real
|
||||
// transport is up.
|
||||
pendingFirstPacket []byte
|
||||
}
|
||||
|
||||
// injectPendingFirstPacket replays the captured handshake through the proxy if present, else
|
||||
// directly through the ICE conn. The packet is cleared only after a successful write, so a failed
|
||||
// or transport-less attempt leaves it available for a later reinjection. Caller must hold conn.mu.
|
||||
func (conn *Conn) injectPendingFirstPacket(proxy wgproxy.Proxy, directConn net.Conn) {
|
||||
pkt := conn.pendingFirstPacket
|
||||
if len(pkt) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
switch {
|
||||
case proxy != nil:
|
||||
if err := proxy.InjectPacket(pkt); err != nil {
|
||||
conn.Log.Debugf("failed to reinject captured first packet via proxy: %v", err)
|
||||
return
|
||||
}
|
||||
case directConn != nil:
|
||||
if _, err := directConn.Write(pkt); err != nil {
|
||||
conn.Log.Debugf("failed to reinject captured first packet via direct conn: %v", err)
|
||||
return
|
||||
}
|
||||
default:
|
||||
conn.Log.Debugf("no transport available to reinject captured first packet")
|
||||
return
|
||||
}
|
||||
|
||||
conn.pendingFirstPacket = nil
|
||||
conn.Log.Debugf("reinjected captured first packet (%d bytes)", len(pkt))
|
||||
}
|
||||
|
||||
// NewConn creates a new not opened Conn to the remote peer.
|
||||
@@ -172,6 +206,16 @@ func NewConn(config ConnConfig, services ServiceDependencies) (*Conn, error) {
|
||||
// It will try to establish a connection using ICE and in parallel with relay. The higher priority connection type will
|
||||
// be used.
|
||||
func (conn *Conn) Open(engineCtx context.Context) error {
|
||||
return conn.open(engineCtx, nil)
|
||||
}
|
||||
|
||||
// OpenWithFirstPacket opens the connection like Open and stashes firstPacket to be replayed once
|
||||
// the real transport is established. The packet is retained only on a successful open.
|
||||
func (conn *Conn) OpenWithFirstPacket(engineCtx context.Context, firstPacket []byte) error {
|
||||
return conn.open(engineCtx, firstPacket)
|
||||
}
|
||||
|
||||
func (conn *Conn) open(engineCtx context.Context, firstPacket []byte) error {
|
||||
conn.mu.Lock()
|
||||
defer conn.mu.Unlock()
|
||||
|
||||
@@ -227,6 +271,9 @@ func (conn *Conn) Open(engineCtx context.Context) error {
|
||||
defer conn.wg.Done()
|
||||
conn.guard.Start(conn.ctx, conn.onGuardEvent)
|
||||
}()
|
||||
if len(firstPacket) > 0 {
|
||||
conn.pendingFirstPacket = slices.Clone(firstPacket)
|
||||
}
|
||||
conn.opened = true
|
||||
return nil
|
||||
}
|
||||
@@ -423,6 +470,8 @@ func (conn *Conn) onICEConnectionIsReady(priority conntype.ConnPriority, iceConn
|
||||
conn.wgProxyRelay.RedirectAs(ep)
|
||||
}
|
||||
|
||||
conn.injectPendingFirstPacket(wgProxy, iceConnInfo.RemoteConn)
|
||||
|
||||
conn.currentConnPriority = priority
|
||||
conn.statusICE.SetConnected()
|
||||
conn.updateIceState(iceConnInfo, updateTime)
|
||||
@@ -546,6 +595,8 @@ func (conn *Conn) onRelayConnectionIsReady(rci RelayConnInfo) {
|
||||
|
||||
wgConfigWorkaround()
|
||||
|
||||
conn.injectPendingFirstPacket(wgProxy, nil)
|
||||
|
||||
conn.rosenpassRemoteKey = rci.rosenpassPubKey
|
||||
conn.currentConnPriority = conntype.Relay
|
||||
conn.statusRelay.SetConnected()
|
||||
@@ -752,15 +803,17 @@ func (conn *Conn) isConnectedOnAllWay() (status guard.ConnStatus) {
|
||||
}
|
||||
|
||||
func (conn *Conn) enableWgWatcherIfNeeded(enabledTime time.Time) {
|
||||
if !conn.wgWatcher.IsEnabled() {
|
||||
wgWatcherCtx, wgWatcherCancel := context.WithCancel(conn.ctx)
|
||||
conn.wgWatcherCancel = wgWatcherCancel
|
||||
conn.wgWatcherWg.Add(1)
|
||||
go func() {
|
||||
defer conn.wgWatcherWg.Done()
|
||||
conn.wgWatcher.EnableWgWatcher(wgWatcherCtx, enabledTime, conn.onWGDisconnected, conn.onWGHandshakeSuccess)
|
||||
}()
|
||||
if !conn.wgWatcher.PrepareInitialHandshake() {
|
||||
return
|
||||
}
|
||||
|
||||
wgWatcherCtx, wgWatcherCancel := context.WithCancel(conn.ctx)
|
||||
conn.wgWatcherCancel = wgWatcherCancel
|
||||
conn.wgWatcherWg.Add(1)
|
||||
go func() {
|
||||
defer conn.wgWatcherWg.Done()
|
||||
conn.wgWatcher.EnableWgWatcher(wgWatcherCtx, enabledTime, conn.onWGDisconnected, conn.onWGHandshakeSuccess)
|
||||
}()
|
||||
}
|
||||
|
||||
func (conn *Conn) disableWgWatcherIfNeeded() {
|
||||
|
||||
@@ -31,7 +31,9 @@ type WGWatcher struct {
|
||||
stateDump *stateDump
|
||||
|
||||
enabled bool
|
||||
muEnabled sync.RWMutex
|
||||
muEnabled sync.Mutex
|
||||
// initialHandshake is not thread-safe; never call PrepareInitialHandshake and EnableWgWatcher concurrently.
|
||||
initialHandshake time.Time
|
||||
|
||||
resetCh chan struct{}
|
||||
}
|
||||
@@ -46,38 +48,38 @@ func NewWGWatcher(log *log.Entry, wgIfaceStater WGInterfaceStater, peerKey strin
|
||||
}
|
||||
}
|
||||
|
||||
// EnableWgWatcher starts the WireGuard watcher. If it is already enabled, it will return immediately and do nothing.
|
||||
// The watcher runs until ctx is cancelled. Caller is responsible for context lifecycle management.
|
||||
func (w *WGWatcher) EnableWgWatcher(ctx context.Context, enabledTime time.Time, onDisconnectedFn func(), onHandshakeSuccessFn func(when time.Time)) {
|
||||
// PrepareInitialHandshake reserves the watcher and reads the peer's current WireGuard
|
||||
// handshake time. It must be called before the peer is (re)configured on the WireGuard
|
||||
// interface, so the captured baseline reflects the state prior to this connection attempt
|
||||
// instead of racing with that configuration. Returns ok=false if the watcher is already
|
||||
// running, in which case EnableWgWatcher must not be called.
|
||||
func (w *WGWatcher) PrepareInitialHandshake() (ok bool) {
|
||||
w.muEnabled.Lock()
|
||||
if w.enabled {
|
||||
w.muEnabled.Unlock()
|
||||
return
|
||||
return false
|
||||
}
|
||||
|
||||
w.log.Debugf("enable WireGuard watcher")
|
||||
w.enabled = true
|
||||
w.muEnabled.Unlock()
|
||||
|
||||
initialHandshake, err := w.wgState()
|
||||
if err != nil {
|
||||
w.log.Warnf("failed to read initial wg stats: %v", err)
|
||||
}
|
||||
handshake, _ := w.wgState()
|
||||
w.initialHandshake = handshake
|
||||
return true
|
||||
}
|
||||
|
||||
w.periodicHandshakeCheck(ctx, onDisconnectedFn, onHandshakeSuccessFn, enabledTime, initialHandshake)
|
||||
// EnableWgWatcher runs the WireGuard watcher loop using the handshake baseline captured by
|
||||
// PrepareInitialHandshake. The watcher runs until ctx is cancelled. Caller is responsible
|
||||
// for context lifecycle management.
|
||||
func (w *WGWatcher) EnableWgWatcher(ctx context.Context, enabledTime time.Time, onDisconnectedFn func(), onHandshakeSuccessFn func(when time.Time)) {
|
||||
w.periodicHandshakeCheck(ctx, onDisconnectedFn, onHandshakeSuccessFn, enabledTime, w.initialHandshake)
|
||||
|
||||
w.muEnabled.Lock()
|
||||
w.enabled = false
|
||||
w.muEnabled.Unlock()
|
||||
}
|
||||
|
||||
// IsEnabled returns true if the WireGuard watcher is currently enabled
|
||||
func (w *WGWatcher) IsEnabled() bool {
|
||||
w.muEnabled.RLock()
|
||||
defer w.muEnabled.RUnlock()
|
||||
return w.enabled
|
||||
}
|
||||
|
||||
// Reset signals the watcher that the WireGuard peer has been reset and a new
|
||||
// handshake is expected. This restarts the handshake timeout from scratch.
|
||||
func (w *WGWatcher) Reset() {
|
||||
@@ -101,13 +103,16 @@ func (w *WGWatcher) periodicHandshakeCheck(ctx context.Context, onDisconnectedFn
|
||||
case <-timer.C:
|
||||
handshake, ok := w.handshakeCheck(lastHandshake)
|
||||
if !ok {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
onDisconnectedFn()
|
||||
return
|
||||
}
|
||||
if lastHandshake.IsZero() {
|
||||
elapsed := calcElapsed(enabledTime, *handshake)
|
||||
w.log.Infof("first wg handshake detected within: %.2fsec, (%s)", elapsed, handshake)
|
||||
if onHandshakeSuccessFn != nil {
|
||||
if onHandshakeSuccessFn != nil && ctx.Err() == nil {
|
||||
onHandshakeSuccessFn(*handshake)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"time"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/netbirdio/netbird/client/iface/configurer"
|
||||
)
|
||||
@@ -34,6 +35,9 @@ func TestWGWatcher_EnableWgWatcher(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
ok := watcher.PrepareInitialHandshake()
|
||||
require.True(t, ok, "watcher should not be enabled yet")
|
||||
|
||||
onDisconnected := make(chan struct{}, 1)
|
||||
go watcher.EnableWgWatcher(ctx, time.Now(), func() {
|
||||
mlog.Infof("onDisconnectedFn")
|
||||
@@ -62,6 +66,9 @@ func TestWGWatcher_ReEnable(t *testing.T) {
|
||||
watcher := NewWGWatcher(mlog, mocWgIface, "", newStateDump("peer", mlog, &Status{}))
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
ok := watcher.PrepareInitialHandshake()
|
||||
require.True(t, ok, "watcher should not be enabled yet")
|
||||
|
||||
wg := &sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
@@ -76,6 +83,9 @@ func TestWGWatcher_ReEnable(t *testing.T) {
|
||||
ctx, cancel = context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
ok = watcher.PrepareInitialHandshake()
|
||||
require.True(t, ok, "watcher should be re-enabled after the previous run stopped")
|
||||
|
||||
onDisconnected := make(chan struct{}, 1)
|
||||
go watcher.EnableWgWatcher(ctx, time.Now(), func() {
|
||||
onDisconnected <- struct{}{}
|
||||
|
||||
@@ -88,11 +88,24 @@ func (s *Store) PeerConnOpen(ctx context.Context, pubKey string) {
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
// this can be blocked because of the connect open limiter semaphore
|
||||
if err := p.Open(ctx); err != nil {
|
||||
p.Log.Errorf("failed to open peer connection: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// PeerConnOpenWithFirstPacket opens the peer connection and stashes a first packet to be
|
||||
// reinjected once the real transport is established.
|
||||
func (s *Store) PeerConnOpenWithFirstPacket(ctx context.Context, pubKey string, firstPacket []byte) {
|
||||
s.peerConnsMu.RLock()
|
||||
defer s.peerConnsMu.RUnlock()
|
||||
|
||||
p, ok := s.peerConns[pubKey]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
if err := p.OpenWithFirstPacket(ctx, firstPacket); err != nil {
|
||||
p.Log.Errorf("failed to open peer connection: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Store) PeerConnIdle(pubKey string) {
|
||||
|
||||
@@ -101,8 +101,6 @@ type ConfigInput struct {
|
||||
|
||||
DNSLabels domain.List
|
||||
|
||||
LazyConnectionEnabled *bool
|
||||
|
||||
MTU *uint16
|
||||
}
|
||||
|
||||
@@ -180,7 +178,9 @@ type Config struct {
|
||||
|
||||
ClientCertKeyPair *tls.Certificate `json:"-"`
|
||||
|
||||
LazyConnectionEnabled bool
|
||||
// LazyConnection is the MDM-managed lazy-connection override ("on"/"off"/"").
|
||||
// Runtime-only: re-derived from MDM policy on each load, never persisted.
|
||||
LazyConnection string `json:"-"`
|
||||
|
||||
MTU uint16
|
||||
|
||||
@@ -386,7 +386,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.NetworkMonitor != nil && input.NetworkMonitor != config.NetworkMonitor {
|
||||
if input.NetworkMonitor != nil && (config.NetworkMonitor == nil || *input.NetworkMonitor != *config.NetworkMonitor) {
|
||||
log.Infof("switching Network Monitor to %t", *input.NetworkMonitor)
|
||||
config.NetworkMonitor = input.NetworkMonitor
|
||||
updated = true
|
||||
@@ -454,7 +454,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.EnableSSHRoot != nil && input.EnableSSHRoot != config.EnableSSHRoot {
|
||||
if input.EnableSSHRoot != nil && (config.EnableSSHRoot == nil || *input.EnableSSHRoot != *config.EnableSSHRoot) {
|
||||
if *input.EnableSSHRoot {
|
||||
log.Infof("enabling SSH root login")
|
||||
} else {
|
||||
@@ -464,7 +464,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.EnableSSHSFTP != nil && input.EnableSSHSFTP != config.EnableSSHSFTP {
|
||||
if input.EnableSSHSFTP != nil && (config.EnableSSHSFTP == nil || *input.EnableSSHSFTP != *config.EnableSSHSFTP) {
|
||||
if *input.EnableSSHSFTP {
|
||||
log.Infof("enabling SSH SFTP subsystem")
|
||||
} else {
|
||||
@@ -474,7 +474,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.EnableSSHLocalPortForwarding != nil && input.EnableSSHLocalPortForwarding != config.EnableSSHLocalPortForwarding {
|
||||
if input.EnableSSHLocalPortForwarding != nil && (config.EnableSSHLocalPortForwarding == nil || *input.EnableSSHLocalPortForwarding != *config.EnableSSHLocalPortForwarding) {
|
||||
if *input.EnableSSHLocalPortForwarding {
|
||||
log.Infof("enabling SSH local port forwarding")
|
||||
} else {
|
||||
@@ -484,7 +484,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.EnableSSHRemotePortForwarding != nil && input.EnableSSHRemotePortForwarding != config.EnableSSHRemotePortForwarding {
|
||||
if input.EnableSSHRemotePortForwarding != nil && (config.EnableSSHRemotePortForwarding == nil || *input.EnableSSHRemotePortForwarding != *config.EnableSSHRemotePortForwarding) {
|
||||
if *input.EnableSSHRemotePortForwarding {
|
||||
log.Infof("enabling SSH remote port forwarding")
|
||||
} else {
|
||||
@@ -494,7 +494,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.DisableSSHAuth != nil && input.DisableSSHAuth != config.DisableSSHAuth {
|
||||
if input.DisableSSHAuth != nil && (config.DisableSSHAuth == nil || *input.DisableSSHAuth != *config.DisableSSHAuth) {
|
||||
if *input.DisableSSHAuth {
|
||||
log.Infof("disabling SSH authentication")
|
||||
} else {
|
||||
@@ -504,7 +504,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.SSHJWTCacheTTL != nil && input.SSHJWTCacheTTL != config.SSHJWTCacheTTL {
|
||||
if input.SSHJWTCacheTTL != nil && (config.SSHJWTCacheTTL == nil || *input.SSHJWTCacheTTL != *config.SSHJWTCacheTTL) {
|
||||
log.Infof("updating SSH JWT cache TTL to %d seconds", *input.SSHJWTCacheTTL)
|
||||
config.SSHJWTCacheTTL = input.SSHJWTCacheTTL
|
||||
updated = true
|
||||
@@ -587,7 +587,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.DisableNotifications != nil && input.DisableNotifications != config.DisableNotifications {
|
||||
if input.DisableNotifications != nil && (config.DisableNotifications == nil || *input.DisableNotifications != *config.DisableNotifications) {
|
||||
if *input.DisableNotifications {
|
||||
log.Infof("disabling notifications")
|
||||
} else {
|
||||
@@ -632,12 +632,6 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.LazyConnectionEnabled != nil && *input.LazyConnectionEnabled != config.LazyConnectionEnabled {
|
||||
log.Infof("switching lazy connection to %t", *input.LazyConnectionEnabled)
|
||||
config.LazyConnectionEnabled = *input.LazyConnectionEnabled
|
||||
updated = true
|
||||
}
|
||||
|
||||
if input.MTU != nil && *input.MTU != config.MTU {
|
||||
log.Infof("updating MTU to %d (old value %d)", *input.MTU, config.MTU)
|
||||
config.MTU = *input.MTU
|
||||
@@ -728,6 +722,15 @@ func (config *Config) applyMDMPolicy(policy *mdm.Policy) {
|
||||
log.Warnf("MDM wireguard port %d out of range [1,65535]; keeping previous value", v)
|
||||
}
|
||||
}
|
||||
|
||||
if v, ok := policy.GetBool(mdm.KeyLazyConnection); ok {
|
||||
state := "off"
|
||||
if v {
|
||||
state = "on"
|
||||
}
|
||||
config.LazyConnection = state
|
||||
logApplied(mdm.KeyLazyConnection, state)
|
||||
}
|
||||
}
|
||||
|
||||
// parseURL parses and validates the URL for the named service. The URL
|
||||
|
||||
@@ -130,6 +130,37 @@ func TestApply_MDMBoolKeysOverrideOnDiskValue(t *testing.T) {
|
||||
assert.True(t, cfg.Policy().HasKey(mdm.KeyRosenpassEnabled))
|
||||
}
|
||||
|
||||
func TestApply_MDMLazyConnection(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
raw any
|
||||
want string
|
||||
}{
|
||||
{"native true", true, "on"},
|
||||
{"native false", false, "off"},
|
||||
{"string on", "on", "on"},
|
||||
{"string off", "off", "off"},
|
||||
{"string yes", "yes", "on"},
|
||||
{"string no", "no", "off"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
withMDMPolicy(t, mdm.NewPolicy(map[string]any{
|
||||
mdm.KeyLazyConnection: c.raw,
|
||||
}))
|
||||
|
||||
cfg, err := UpdateOrCreateConfig(ConfigInput{
|
||||
ConfigPath: filepath.Join(t.TempDir(), "config.json"),
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, cfg)
|
||||
|
||||
assert.Equal(t, c.want, cfg.LazyConnection)
|
||||
assert.True(t, cfg.Policy().HasKey(mdm.KeyLazyConnection))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestApply_MDMPreSharedKeyRedactionSentinelRejected(t *testing.T) {
|
||||
const maskSentinel = "**********"
|
||||
|
||||
|
||||
@@ -226,12 +226,11 @@ func (d *DnsInterceptor) ServeDNS(w dns.ResponseWriter, r *dns.Msg) {
|
||||
return
|
||||
}
|
||||
|
||||
// pass if non A/AAAA query
|
||||
if r.Question[0].Qtype != dns.TypeA && r.Question[0].Qtype != dns.TypeAAAA {
|
||||
d.continueToNextHandler(w, r, logger, "non A/AAAA query")
|
||||
return
|
||||
}
|
||||
|
||||
// All query types for an intercepted domain are forwarded to the peer's
|
||||
// DNS forwarder, which owns the name. Falling through to the system
|
||||
// resolver would let it answer NXDOMAIN for a name it isn't authoritative
|
||||
// for, poisoning the whole name (including the A/AAAA records the route
|
||||
// does serve). The forwarder answers NODATA for types it cannot resolve.
|
||||
d.mu.RLock()
|
||||
peerKey := d.currentPeerKey
|
||||
d.mu.RUnlock()
|
||||
@@ -293,19 +292,6 @@ func (d *DnsInterceptor) writeDNSError(w dns.ResponseWriter, r *dns.Msg, logger
|
||||
}
|
||||
}
|
||||
|
||||
// continueToNextHandler signals the handler chain to try the next handler
|
||||
func (d *DnsInterceptor) continueToNextHandler(w dns.ResponseWriter, r *dns.Msg, logger *log.Entry, reason string) {
|
||||
logger.Tracef("continuing to next handler for domain=%s reason=%s", r.Question[0].Name, reason)
|
||||
|
||||
resp := new(dns.Msg)
|
||||
resp.SetRcode(r, dns.RcodeNameError)
|
||||
// Set Zero bit to signal handler chain to continue
|
||||
resp.MsgHdr.Zero = true
|
||||
if err := w.WriteMsg(resp); err != nil {
|
||||
logger.Errorf("failed writing DNS continue response: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (d *DnsInterceptor) getUpstreamIP(peerKey string) (netip.Addr, error) {
|
||||
peerAllowedIP, exists := d.peerStore.AllowedIP(peerKey)
|
||||
if !exists {
|
||||
|
||||
@@ -38,7 +38,7 @@ func GetEnvKeyNBForceRelay() string {
|
||||
|
||||
// GetEnvKeyNBLazyConn Exports the environment variable for the iOS client
|
||||
func GetEnvKeyNBLazyConn() string {
|
||||
return lazyconn.EnvEnableLazyConn
|
||||
return lazyconn.EnvLazyConn
|
||||
}
|
||||
|
||||
// GetEnvKeyNBInactivityThreshold Exports the environment variable for the iOS client
|
||||
|
||||
@@ -27,6 +27,7 @@ var allKeys = []string{
|
||||
KeyWireguardPort,
|
||||
KeySplitTunnelMode,
|
||||
KeySplitTunnelApps,
|
||||
KeyLazyConnection,
|
||||
}
|
||||
|
||||
// canonicalKey maps the lowercase form of a managed-config value name to
|
||||
|
||||
@@ -11,6 +11,7 @@ package mdm
|
||||
import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
@@ -41,6 +42,11 @@ const (
|
||||
// construction — only one mode can be set at a time.
|
||||
KeySplitTunnelMode = "splitTunnelMode"
|
||||
KeySplitTunnelApps = "splitTunnelApps"
|
||||
|
||||
// KeyLazyConnection forces the lazy-connection feature on or off, overriding
|
||||
// the management feature flag. Read as a bool (native bool, or on/off,
|
||||
// true/false, 1/0, yes/no); absent = defer to management.
|
||||
KeyLazyConnection = "lazyConnection"
|
||||
)
|
||||
|
||||
// Split-tunnel mode literals (KeySplitTunnelMode values).
|
||||
@@ -62,12 +68,13 @@ var boolStringLiterals = map[string]bool{
|
||||
"true": true,
|
||||
"1": true,
|
||||
"yes": true,
|
||||
"on": true,
|
||||
"false": false,
|
||||
"0": false,
|
||||
"no": false,
|
||||
"off": false,
|
||||
}
|
||||
|
||||
|
||||
// Policy holds MDM-managed settings read from the platform source. A nil or
|
||||
// empty Policy means no enforcement is active.
|
||||
type Policy struct {
|
||||
@@ -150,7 +157,8 @@ func (p *Policy) GetString(key string) (string, bool) {
|
||||
}
|
||||
|
||||
// GetBool returns the managed value for key coerced to bool, and whether the
|
||||
// key was set. Accepts native bool and string literals "true"/"false"/"1"/"0".
|
||||
// key was set. Accepts native bool and string literals (true/false, 1/0,
|
||||
// yes/no, on/off), case-insensitively and trimmed of surrounding whitespace.
|
||||
func (p *Policy) GetBool(key string) (bool, bool) {
|
||||
if p == nil {
|
||||
return false, false
|
||||
@@ -163,7 +171,7 @@ func (p *Policy) GetBool(key string) (bool, bool) {
|
||||
case bool:
|
||||
return t, true
|
||||
case string:
|
||||
b, known := boolStringLiterals[t]
|
||||
b, known := boolStringLiterals[strings.ToLower(strings.TrimSpace(t))]
|
||||
return b, known
|
||||
case int:
|
||||
return t != 0, true
|
||||
|
||||
@@ -31,8 +31,8 @@ func TestPolicy_Empty(t *testing.T) {
|
||||
|
||||
func TestPolicy_HasKey(t *testing.T) {
|
||||
p := NewPolicy(map[string]any{
|
||||
KeyManagementURL: "https://corp.example.com",
|
||||
KeyDisableProfiles: true,
|
||||
KeyManagementURL: "https://corp.example.com",
|
||||
KeyDisableProfiles: true,
|
||||
})
|
||||
assert.False(t, p.IsEmpty())
|
||||
assert.True(t, p.HasKey(KeyManagementURL))
|
||||
@@ -53,8 +53,8 @@ func TestPolicy_ManagedKeysSorted(t *testing.T) {
|
||||
func TestPolicy_GetString(t *testing.T) {
|
||||
p := NewPolicy(map[string]any{
|
||||
KeyManagementURL: "https://corp.example.com",
|
||||
KeyDisableProfiles: true, // wrong type for GetString
|
||||
KeyPreSharedKey: "", // empty rejected
|
||||
KeyDisableProfiles: true, // wrong type for GetString
|
||||
KeyPreSharedKey: "", // empty rejected
|
||||
})
|
||||
v, ok := p.GetString(KeyManagementURL)
|
||||
assert.True(t, ok)
|
||||
@@ -85,6 +85,11 @@ func TestPolicy_GetBool(t *testing.T) {
|
||||
{"string 0", "0", false, true},
|
||||
{"string yes", "yes", true, true},
|
||||
{"string no", "no", false, true},
|
||||
{"string on", "on", true, true},
|
||||
{"string off", "off", false, true},
|
||||
{"mixed case On", "On", true, true},
|
||||
{"upper TRUE", "TRUE", true, true},
|
||||
{"padded yes", " yes ", true, true},
|
||||
{"int nonzero", 1, true, true},
|
||||
{"int zero", 0, false, true},
|
||||
{"int64 nonzero", int64(2), true, true},
|
||||
|
||||
@@ -152,7 +152,6 @@ func (s *Server) restartEngineForMDMLocked() error {
|
||||
s.config = config
|
||||
s.statusRecorder.UpdateManagementAddress(config.ManagementURL.String())
|
||||
s.statusRecorder.UpdateRosenpass(config.RosenpassEnabled, config.RosenpassPermissive)
|
||||
s.statusRecorder.UpdateLazyConnection(config.LazyConnectionEnabled)
|
||||
|
||||
ctx, cancel := context.WithCancel(s.rootCtx)
|
||||
s.actCancel = cancel
|
||||
@@ -305,7 +304,6 @@ func setConfigRequestHasConfigOverrides(msg *proto.SetConfigRequest) bool {
|
||||
msg.DisableFirewall != nil ||
|
||||
msg.BlockLanAccess != nil ||
|
||||
msg.DisableNotifications != nil ||
|
||||
msg.LazyConnectionEnabled != nil ||
|
||||
msg.BlockInbound != nil ||
|
||||
msg.DisableIpv6 != nil ||
|
||||
msg.EnableSSHRoot != nil ||
|
||||
@@ -348,7 +346,6 @@ func loginRequestHasConfigOverrides(msg *proto.LoginRequest) bool {
|
||||
msg.BlockLanAccess != nil ||
|
||||
msg.DisableNotifications != nil ||
|
||||
len(msg.DnsLabels) > 0 || msg.CleanDNSLabels ||
|
||||
msg.LazyConnectionEnabled != nil ||
|
||||
msg.BlockInbound != nil
|
||||
}
|
||||
|
||||
|
||||
@@ -214,7 +214,6 @@ func (s *Server) Start() error {
|
||||
|
||||
s.statusRecorder.UpdateManagementAddress(config.ManagementURL.String())
|
||||
s.statusRecorder.UpdateRosenpass(config.RosenpassEnabled, config.RosenpassPermissive)
|
||||
s.statusRecorder.UpdateLazyConnection(config.LazyConnectionEnabled)
|
||||
|
||||
if s.sessionWatcher == nil {
|
||||
s.sessionWatcher = internal.NewSessionWatcher(s.rootCtx, s.statusRecorder)
|
||||
@@ -463,7 +462,6 @@ func (s *Server) setConfigInputFromRequest(msg *proto.SetConfigRequest) (profile
|
||||
config.DisableFirewall = msg.DisableFirewall
|
||||
config.BlockLANAccess = msg.BlockLanAccess
|
||||
config.DisableNotifications = msg.DisableNotifications
|
||||
config.LazyConnectionEnabled = msg.LazyConnectionEnabled
|
||||
config.BlockInbound = msg.BlockInbound
|
||||
config.DisableIPv6 = msg.DisableIpv6
|
||||
config.EnableSSHRoot = msg.EnableSSHRoot
|
||||
@@ -1647,7 +1645,6 @@ func (s *Server) GetConfig(ctx context.Context, req *proto.GetConfigRequest) (*p
|
||||
ServerSSHAllowed: *cfg.ServerSSHAllowed,
|
||||
RosenpassEnabled: cfg.RosenpassEnabled,
|
||||
RosenpassPermissive: cfg.RosenpassPermissive,
|
||||
LazyConnectionEnabled: cfg.LazyConnectionEnabled,
|
||||
BlockInbound: cfg.BlockInbound,
|
||||
DisableNotifications: disableNotifications,
|
||||
NetworkMonitor: networkMonitor,
|
||||
|
||||
@@ -69,43 +69,41 @@ func TestSetConfig_AllFieldsSaved(t *testing.T) {
|
||||
disableFirewall := true
|
||||
blockLANAccess := true
|
||||
disableNotifications := true
|
||||
lazyConnectionEnabled := true
|
||||
blockInbound := true
|
||||
disableIPv6 := true
|
||||
mtu := int64(1280)
|
||||
sshJWTCacheTTL := int32(300)
|
||||
|
||||
req := &proto.SetConfigRequest{
|
||||
ProfileName: profName,
|
||||
Username: currUser.Username,
|
||||
ManagementUrl: "https://new-api.netbird.io:443",
|
||||
AdminURL: "https://new-admin.netbird.io",
|
||||
RosenpassEnabled: &rosenpassEnabled,
|
||||
RosenpassPermissive: &rosenpassPermissive,
|
||||
ServerSSHAllowed: &serverSSHAllowed,
|
||||
InterfaceName: &interfaceName,
|
||||
WireguardPort: &wireguardPort,
|
||||
OptionalPreSharedKey: &preSharedKey,
|
||||
DisableAutoConnect: &disableAutoConnect,
|
||||
NetworkMonitor: &networkMonitor,
|
||||
DisableClientRoutes: &disableClientRoutes,
|
||||
DisableServerRoutes: &disableServerRoutes,
|
||||
DisableDns: &disableDNS,
|
||||
DisableFirewall: &disableFirewall,
|
||||
BlockLanAccess: &blockLANAccess,
|
||||
DisableNotifications: &disableNotifications,
|
||||
LazyConnectionEnabled: &lazyConnectionEnabled,
|
||||
BlockInbound: &blockInbound,
|
||||
DisableIpv6: &disableIPv6,
|
||||
NatExternalIPs: []string{"1.2.3.4", "5.6.7.8"},
|
||||
CleanNATExternalIPs: false,
|
||||
CustomDNSAddress: []byte("1.1.1.1:53"),
|
||||
ExtraIFaceBlacklist: []string{"eth1", "eth2"},
|
||||
DnsLabels: []string{"label1", "label2"},
|
||||
CleanDNSLabels: false,
|
||||
DnsRouteInterval: durationpb.New(2 * time.Minute),
|
||||
Mtu: &mtu,
|
||||
SshJWTCacheTTL: &sshJWTCacheTTL,
|
||||
ProfileName: profName,
|
||||
Username: currUser.Username,
|
||||
ManagementUrl: "https://new-api.netbird.io:443",
|
||||
AdminURL: "https://new-admin.netbird.io",
|
||||
RosenpassEnabled: &rosenpassEnabled,
|
||||
RosenpassPermissive: &rosenpassPermissive,
|
||||
ServerSSHAllowed: &serverSSHAllowed,
|
||||
InterfaceName: &interfaceName,
|
||||
WireguardPort: &wireguardPort,
|
||||
OptionalPreSharedKey: &preSharedKey,
|
||||
DisableAutoConnect: &disableAutoConnect,
|
||||
NetworkMonitor: &networkMonitor,
|
||||
DisableClientRoutes: &disableClientRoutes,
|
||||
DisableServerRoutes: &disableServerRoutes,
|
||||
DisableDns: &disableDNS,
|
||||
DisableFirewall: &disableFirewall,
|
||||
BlockLanAccess: &blockLANAccess,
|
||||
DisableNotifications: &disableNotifications,
|
||||
BlockInbound: &blockInbound,
|
||||
DisableIpv6: &disableIPv6,
|
||||
NatExternalIPs: []string{"1.2.3.4", "5.6.7.8"},
|
||||
CleanNATExternalIPs: false,
|
||||
CustomDNSAddress: []byte("1.1.1.1:53"),
|
||||
ExtraIFaceBlacklist: []string{"eth1", "eth2"},
|
||||
DnsLabels: []string{"label1", "label2"},
|
||||
CleanDNSLabels: false,
|
||||
DnsRouteInterval: durationpb.New(2 * time.Minute),
|
||||
Mtu: &mtu,
|
||||
SshJWTCacheTTL: &sshJWTCacheTTL,
|
||||
}
|
||||
|
||||
_, err = s.SetConfig(ctx, req)
|
||||
@@ -140,7 +138,6 @@ func TestSetConfig_AllFieldsSaved(t *testing.T) {
|
||||
require.Equal(t, blockLANAccess, cfg.BlockLANAccess)
|
||||
require.NotNil(t, cfg.DisableNotifications)
|
||||
require.Equal(t, disableNotifications, *cfg.DisableNotifications)
|
||||
require.Equal(t, lazyConnectionEnabled, cfg.LazyConnectionEnabled)
|
||||
require.Equal(t, blockInbound, cfg.BlockInbound)
|
||||
require.Equal(t, disableIPv6, cfg.DisableIPv6)
|
||||
require.Equal(t, []string{"1.2.3.4", "5.6.7.8"}, cfg.NATExternalIPs)
|
||||
@@ -164,13 +161,14 @@ func verifyAllFieldsCovered(t *testing.T, req *proto.SetConfigRequest) {
|
||||
t.Helper()
|
||||
|
||||
metadataFields := map[string]bool{
|
||||
"state": true, // protobuf internal
|
||||
"sizeCache": true, // protobuf internal
|
||||
"unknownFields": true, // protobuf internal
|
||||
"Username": true, // metadata
|
||||
"ProfileName": true, // metadata
|
||||
"CleanNATExternalIPs": true, // control flag for clearing
|
||||
"CleanDNSLabels": true, // control flag for clearing
|
||||
"state": true, // protobuf internal
|
||||
"sizeCache": true, // protobuf internal
|
||||
"unknownFields": true, // protobuf internal
|
||||
"Username": true, // metadata
|
||||
"ProfileName": true, // metadata
|
||||
"CleanNATExternalIPs": true, // control flag for clearing
|
||||
"CleanDNSLabels": true, // control flag for clearing
|
||||
"LazyConnectionEnabled": true, // deprecated: proto field retained for compat, no longer applied
|
||||
}
|
||||
|
||||
expectedFields := map[string]bool{
|
||||
@@ -190,7 +188,6 @@ func verifyAllFieldsCovered(t *testing.T, req *proto.SetConfigRequest) {
|
||||
"DisableFirewall": true,
|
||||
"BlockLanAccess": true,
|
||||
"DisableNotifications": true,
|
||||
"LazyConnectionEnabled": true,
|
||||
"BlockInbound": true,
|
||||
"DisableIpv6": true,
|
||||
"NatExternalIPs": true,
|
||||
@@ -252,7 +249,6 @@ func TestCLIFlags_MappedToSetConfig(t *testing.T) {
|
||||
"block-lan-access": "BlockLanAccess",
|
||||
"block-inbound": "BlockInbound",
|
||||
"disable-ipv6": "DisableIpv6",
|
||||
"enable-lazy-connection": "LazyConnectionEnabled",
|
||||
"external-ip-map": "NatExternalIPs",
|
||||
"dns-resolver-address": "CustomDNSAddress",
|
||||
"extra-iface-blacklist": "ExtraIFaceBlacklist",
|
||||
@@ -269,7 +265,8 @@ func TestCLIFlags_MappedToSetConfig(t *testing.T) {
|
||||
|
||||
// SetConfigRequest fields that don't have CLI flags (settable only via UI or other means).
|
||||
fieldsWithoutCLIFlags := map[string]bool{
|
||||
"DisableNotifications": true, // Only settable via UI
|
||||
"DisableNotifications": true, // Only settable via UI
|
||||
"LazyConnectionEnabled": true, // deprecated: no longer settable (managed by server + NB_LAZY_CONN)
|
||||
}
|
||||
|
||||
// Get all SetConfigRequest fields to verify our map is complete.
|
||||
|
||||
@@ -2,9 +2,11 @@ package system
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/netip"
|
||||
"slices"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"google.golang.org/grpc/metadata"
|
||||
@@ -72,8 +74,6 @@ type Info struct {
|
||||
BlockInbound bool
|
||||
DisableIPv6 bool
|
||||
|
||||
LazyConnectionEnabled bool
|
||||
|
||||
EnableSSHRoot bool
|
||||
EnableSSHSFTP bool
|
||||
EnableSSHLocalPortForwarding bool
|
||||
@@ -85,7 +85,7 @@ func (i *Info) SetFlags(
|
||||
rosenpassEnabled, rosenpassPermissive bool,
|
||||
serverSSHAllowed *bool,
|
||||
disableClientRoutes, disableServerRoutes,
|
||||
disableDNS, disableFirewall, blockLANAccess, blockInbound, disableIPv6, lazyConnectionEnabled bool,
|
||||
disableDNS, disableFirewall, blockLANAccess, blockInbound, disableIPv6 bool,
|
||||
enableSSHRoot, enableSSHSFTP, enableSSHLocalPortForwarding, enableSSHRemotePortForwarding *bool,
|
||||
disableSSHAuth *bool,
|
||||
) {
|
||||
@@ -103,8 +103,6 @@ func (i *Info) SetFlags(
|
||||
i.BlockInbound = blockInbound
|
||||
i.DisableIPv6 = disableIPv6
|
||||
|
||||
i.LazyConnectionEnabled = lazyConnectionEnabled
|
||||
|
||||
if enableSSHRoot != nil {
|
||||
i.EnableSSHRoot = *enableSSHRoot
|
||||
}
|
||||
@@ -174,7 +172,7 @@ func GetInfoWithChecks(ctx context.Context, checks []*proto.Checks, excludeIPs .
|
||||
processCheckPaths = append(processCheckPaths, check.GetFiles()...)
|
||||
}
|
||||
|
||||
files, err := checkFileAndProcess(processCheckPaths)
|
||||
files, err := checkFileAndProcess(ctx, processCheckPaths)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -187,3 +185,43 @@ func GetInfoWithChecks(ctx context.Context, checks []*proto.Checks, excludeIPs .
|
||||
log.Debugf("all system information gathered successfully")
|
||||
return info, nil
|
||||
}
|
||||
|
||||
// GetInfoWithChecksTimeout is GetInfoWithChecks bounded by timeout. Posture-check gathering
|
||||
// runs uncancellable system calls (process enumeration, os.Stat), so calling it inline can
|
||||
// block the caller for as long as such a call hangs. It runs in a goroutine instead: if it
|
||||
// does not return within timeout the caller gets (nil, false) and should proceed with
|
||||
// degraded behavior rather than block. On a gathering error it falls back to base GetInfo.
|
||||
//
|
||||
// The buffered channel lets the abandoned goroutine finish and exit once its blocking call
|
||||
// returns, so it does not leak beyond the duration of that call.
|
||||
func GetInfoWithChecksTimeout(ctx context.Context, timeout time.Duration, checks []*proto.Checks, excludeIPs ...netip.Addr) (*Info, bool) {
|
||||
ctx, cancel := context.WithTimeout(ctx, timeout)
|
||||
defer cancel()
|
||||
|
||||
infoCh := make(chan *Info, 1)
|
||||
go func() {
|
||||
info, err := GetInfoWithChecks(ctx, checks, excludeIPs...)
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
log.Warnf("failed to get system info with checks: %v", err)
|
||||
info = GetInfo(ctx)
|
||||
info.removeAddresses(excludeIPs...)
|
||||
}
|
||||
infoCh <- info
|
||||
}()
|
||||
|
||||
select {
|
||||
case info := <-infoCh:
|
||||
return info, true
|
||||
case <-ctx.Done():
|
||||
if errors.Is(ctx.Err(), context.DeadlineExceeded) {
|
||||
log.Warnf("gathering system info with checks timed out after %s", timeout)
|
||||
} else {
|
||||
// Parent context canceled (e.g. shutdown), not a timeout.
|
||||
log.Warnf("gathering system info with checks canceled: %v", ctx.Err())
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,7 +50,7 @@ func GetInfo(ctx context.Context) *Info {
|
||||
}
|
||||
|
||||
// checkFileAndProcess checks if the file path exists and if a process is running at that path.
|
||||
func checkFileAndProcess(paths []string) ([]File, error) {
|
||||
func checkFileAndProcess(_ context.Context, _ []string) ([]File, error) {
|
||||
return []File{}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ func GetInfo(ctx context.Context) *Info {
|
||||
sysName := string(bytes.Split(utsname.Sysname[:], []byte{0})[0])
|
||||
machine := string(bytes.Split(utsname.Machine[:], []byte{0})[0])
|
||||
release := string(bytes.Split(utsname.Release[:], []byte{0})[0])
|
||||
swVersion, err := exec.Command("sw_vers", "-productVersion").Output()
|
||||
swVersion, err := exec.CommandContext(ctx, "sw_vers", "-productVersion").Output()
|
||||
if err != nil {
|
||||
log.Warnf("got an error while retrieving macOS version with sw_vers, error: %s. Using darwin version instead.\n", err)
|
||||
swVersion = []byte(release)
|
||||
|
||||
@@ -105,7 +105,7 @@ func isDuplicated(addresses []NetworkAddress, addr NetworkAddress) bool {
|
||||
}
|
||||
|
||||
// checkFileAndProcess checks if the file path exists and if a process is running at that path.
|
||||
func checkFileAndProcess(paths []string) ([]File, error) {
|
||||
func checkFileAndProcess(_ context.Context, _ []string) ([]File, error) {
|
||||
return []File{}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@ func collectLocationInfo(info *Info) {
|
||||
}
|
||||
}
|
||||
|
||||
func checkFileAndProcess(_ []string) ([]File, error) {
|
||||
func checkFileAndProcess(_ context.Context, _ []string) ([]File, error) {
|
||||
return []File{}, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"net/netip"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"google.golang.org/grpc/metadata"
|
||||
@@ -35,6 +36,20 @@ func Test_CustomHostname(t *testing.T) {
|
||||
assert.Equal(t, want, got.Hostname)
|
||||
}
|
||||
|
||||
func TestGetInfoWithChecksTimeout_Success(t *testing.T) {
|
||||
info, ok := GetInfoWithChecksTimeout(context.Background(), 30*time.Second, nil)
|
||||
assert.True(t, ok, "expected gathering to complete within the timeout")
|
||||
assert.NotNil(t, info)
|
||||
}
|
||||
|
||||
func TestGetInfoWithChecksTimeout_Timeout(t *testing.T) {
|
||||
// A 1ns budget expires before the (real) system-info gathering can finish, so the
|
||||
// caller must get (nil, false) instead of blocking on the in-flight goroutine.
|
||||
info, ok := GetInfoWithChecksTimeout(context.Background(), time.Nanosecond, nil)
|
||||
assert.False(t, ok, "expected timeout to be reported")
|
||||
assert.Nil(t, info)
|
||||
}
|
||||
|
||||
func Test_NetAddresses(t *testing.T) {
|
||||
addr, err := networkAddresses()
|
||||
if err != nil {
|
||||
|
||||
@@ -3,24 +3,30 @@
|
||||
package system
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"slices"
|
||||
|
||||
"github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
// getRunningProcesses returns a list of running process paths.
|
||||
func getRunningProcesses() ([]string, error) {
|
||||
processIDs, err := process.Pids()
|
||||
// getRunningProcesses returns a list of running process paths. The context bounds the work:
|
||||
// the per-PID loop bails as soon as ctx is done, and the gopsutil calls honor it where they
|
||||
// can, so a stuck enumeration cannot run unbounded.
|
||||
func getRunningProcesses(ctx context.Context) ([]string, error) {
|
||||
processIDs, err := process.PidsWithContext(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
processMap := make(map[string]bool)
|
||||
for _, pID := range processIDs {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
p := &process.Process{Pid: pID}
|
||||
|
||||
path, _ := p.Exe()
|
||||
path, _ := p.ExeWithContext(ctx)
|
||||
if path != "" {
|
||||
processMap[path] = false
|
||||
}
|
||||
@@ -35,18 +41,21 @@ func getRunningProcesses() ([]string, error) {
|
||||
}
|
||||
|
||||
// checkFileAndProcess checks if the file path exists and if a process is running at that path.
|
||||
func checkFileAndProcess(paths []string) ([]File, error) {
|
||||
func checkFileAndProcess(ctx context.Context, paths []string) ([]File, error) {
|
||||
files := make([]File, len(paths))
|
||||
if len(paths) == 0 {
|
||||
return files, nil
|
||||
}
|
||||
|
||||
runningProcesses, err := getRunningProcesses()
|
||||
runningProcesses, err := getRunningProcesses(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i, path := range paths {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
file := File{Path: path}
|
||||
|
||||
_, err := os.Stat(path)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package system
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/shirou/gopsutil/v3/process"
|
||||
@@ -9,7 +10,7 @@ import (
|
||||
func Benchmark_getRunningProcesses(b *testing.B) {
|
||||
b.Run("getRunningProcesses new", func(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
ps, err := getRunningProcesses()
|
||||
ps, err := getRunningProcesses(context.Background())
|
||||
if err != nil {
|
||||
b.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
@@ -29,12 +30,38 @@ func Benchmark_getRunningProcesses(b *testing.B) {
|
||||
}
|
||||
}
|
||||
})
|
||||
s, _ := getRunningProcesses()
|
||||
s, _ := getRunningProcesses(context.Background())
|
||||
b.Logf("getRunningProcesses returned %d processes", len(s))
|
||||
s, _ = getRunningProcessesOld()
|
||||
b.Logf("getRunningProcessesOld returned %d processes", len(s))
|
||||
}
|
||||
|
||||
func TestCheckFileAndProcess_ContextCanceled(t *testing.T) {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
// With a canceled context and non-empty paths the gathering must bail with an error
|
||||
// instead of running the (potentially blocking) process scan / stat loop.
|
||||
if _, err := checkFileAndProcess(ctx, []string{"/does/not/exist"}); err == nil {
|
||||
t.Fatal("expected error on canceled context, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckFileAndProcess_EmptyPaths(t *testing.T) {
|
||||
// No check paths means no work to do: it must return immediately with no error,
|
||||
// even on a canceled context (nothing to scan or stat).
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
|
||||
files, err := checkFileAndProcess(ctx, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error for empty paths: %v", err)
|
||||
}
|
||||
if len(files) != 0 {
|
||||
t.Fatalf("expected no files, got %d", len(files))
|
||||
}
|
||||
}
|
||||
|
||||
func getRunningProcessesOld() ([]string, error) {
|
||||
processes, err := process.Processes()
|
||||
if err != nil {
|
||||
|
||||
@@ -266,7 +266,6 @@ type serviceClient struct {
|
||||
mAllowSSH *systray.MenuItem
|
||||
mAutoConnect *systray.MenuItem
|
||||
mEnableRosenpass *systray.MenuItem
|
||||
mLazyConnEnabled *systray.MenuItem
|
||||
mBlockInbound *systray.MenuItem
|
||||
mNotifications *systray.MenuItem
|
||||
mAdvancedSettings *systray.MenuItem
|
||||
@@ -336,11 +335,11 @@ type serviceClient struct {
|
||||
// mNetworks + mExitNode submenu items. Combines features.DisableNetworks
|
||||
// AND s.connected — both must be true for the menus to be active.
|
||||
// Zero value (false) matches the Disable() call at AddMenuItem time.
|
||||
networksMenuEnabled bool
|
||||
showNetworks bool
|
||||
wNetworks fyne.Window
|
||||
wProfiles fyne.Window
|
||||
wQuickActions fyne.Window
|
||||
networksMenuEnabled bool
|
||||
showNetworks bool
|
||||
wNetworks fyne.Window
|
||||
wProfiles fyne.Window
|
||||
wQuickActions fyne.Window
|
||||
|
||||
eventManager *event.Manager
|
||||
|
||||
@@ -1094,7 +1093,6 @@ func (s *serviceClient) onTrayReady() {
|
||||
s.mAllowSSH = s.mSettings.AddSubMenuItemCheckbox("Allow SSH", allowSSHMenuDescr, false)
|
||||
s.mAutoConnect = s.mSettings.AddSubMenuItemCheckbox("Connect on Startup", autoConnectMenuDescr, false)
|
||||
s.mEnableRosenpass = s.mSettings.AddSubMenuItemCheckbox("Enable Quantum-Resistance", quantumResistanceMenuDescr, false)
|
||||
s.mLazyConnEnabled = s.mSettings.AddSubMenuItemCheckbox("Enable Lazy Connections", lazyConnMenuDescr, false)
|
||||
s.mBlockInbound = s.mSettings.AddSubMenuItemCheckbox("Block Inbound Connections", blockInboundMenuDescr, false)
|
||||
s.mNotifications = s.mSettings.AddSubMenuItemCheckbox("Notifications", notificationsMenuDescr, false)
|
||||
s.mSettings.AddSeparator()
|
||||
@@ -1578,7 +1576,6 @@ func protoConfigToConfig(cfg *proto.GetConfigResponse) *profilemanager.Config {
|
||||
config.RosenpassEnabled = cfg.RosenpassEnabled
|
||||
config.RosenpassPermissive = cfg.RosenpassPermissive
|
||||
config.DisableNotifications = &cfg.DisableNotifications
|
||||
config.LazyConnectionEnabled = cfg.LazyConnectionEnabled
|
||||
config.BlockInbound = cfg.BlockInbound
|
||||
config.NetworkMonitor = &cfg.NetworkMonitor
|
||||
config.DisableDNS = cfg.DisableDns
|
||||
@@ -1682,12 +1679,6 @@ func (s *serviceClient) loadSettings() {
|
||||
s.mEnableRosenpass.Uncheck()
|
||||
}
|
||||
|
||||
if cfg.LazyConnectionEnabled {
|
||||
s.mLazyConnEnabled.Check()
|
||||
} else {
|
||||
s.mLazyConnEnabled.Uncheck()
|
||||
}
|
||||
|
||||
if cfg.BlockInbound {
|
||||
s.mBlockInbound.Check()
|
||||
} else {
|
||||
@@ -1833,7 +1824,6 @@ func (s *serviceClient) updateConfig() error {
|
||||
disableAutoStart := !s.mAutoConnect.Checked()
|
||||
sshAllowed := s.mAllowSSH.Checked()
|
||||
rosenpassEnabled := s.mEnableRosenpass.Checked()
|
||||
lazyConnectionEnabled := s.mLazyConnEnabled.Checked()
|
||||
blockInbound := s.mBlockInbound.Checked()
|
||||
notificationsDisabled := !s.mNotifications.Checked()
|
||||
|
||||
@@ -1856,14 +1846,13 @@ func (s *serviceClient) updateConfig() error {
|
||||
}
|
||||
|
||||
req := proto.SetConfigRequest{
|
||||
ProfileName: activeProf.ID.String(),
|
||||
Username: currUser.Username,
|
||||
DisableAutoConnect: &disableAutoStart,
|
||||
ServerSSHAllowed: &sshAllowed,
|
||||
RosenpassEnabled: &rosenpassEnabled,
|
||||
LazyConnectionEnabled: &lazyConnectionEnabled,
|
||||
BlockInbound: &blockInbound,
|
||||
DisableNotifications: &notificationsDisabled,
|
||||
ProfileName: activeProf.ID.String(),
|
||||
Username: currUser.Username,
|
||||
DisableAutoConnect: &disableAutoStart,
|
||||
ServerSSHAllowed: &sshAllowed,
|
||||
RosenpassEnabled: &rosenpassEnabled,
|
||||
BlockInbound: &blockInbound,
|
||||
DisableNotifications: &notificationsDisabled,
|
||||
}
|
||||
|
||||
if _, err := conn.SetConfig(s.ctx, &req); err != nil {
|
||||
|
||||
@@ -4,7 +4,6 @@ const (
|
||||
allowSSHMenuDescr = "Allow SSH connections"
|
||||
autoConnectMenuDescr = "Connect automatically when the service starts"
|
||||
quantumResistanceMenuDescr = "Enable post-quantum security via Rosenpass"
|
||||
lazyConnMenuDescr = "[Experimental] Enable lazy connections"
|
||||
blockInboundMenuDescr = "Block inbound connections to the local machine and routed networks"
|
||||
notificationsMenuDescr = "Enable notifications"
|
||||
advancedSettingsMenuDescr = "Advanced settings of the application"
|
||||
|
||||
@@ -43,8 +43,6 @@ func (h *eventHandler) listen(ctx context.Context) {
|
||||
h.handleAutoConnectClick()
|
||||
case <-h.client.mEnableRosenpass.ClickedCh:
|
||||
h.handleRosenpassClick()
|
||||
case <-h.client.mLazyConnEnabled.ClickedCh:
|
||||
h.handleLazyConnectionClick()
|
||||
case <-h.client.mBlockInbound.ClickedCh:
|
||||
h.handleBlockInboundClick()
|
||||
case <-h.client.mAdvancedSettings.ClickedCh:
|
||||
@@ -152,15 +150,6 @@ func (h *eventHandler) handleRosenpassClick() {
|
||||
}
|
||||
}
|
||||
|
||||
func (h *eventHandler) handleLazyConnectionClick() {
|
||||
h.toggleCheckbox(h.client.mLazyConnEnabled)
|
||||
if err := h.updateConfigWithErr(); err != nil {
|
||||
h.toggleCheckbox(h.client.mLazyConnEnabled) // revert checkbox state on error
|
||||
log.Errorf("failed to update config: %v", err)
|
||||
h.client.notifier.Send("Error", "Failed to update lazy connection settings")
|
||||
}
|
||||
}
|
||||
|
||||
func (h *eventHandler) handleBlockInboundClick() {
|
||||
h.toggleCheckbox(h.client.mBlockInbound)
|
||||
if err := h.updateConfigWithErr(); err != nil {
|
||||
|
||||
@@ -5,12 +5,16 @@ WORKDIR /app
|
||||
RUN apt-get update && apt-get install -y gcc libc6-dev git && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY go.mod go.sum ./
|
||||
RUN go mod download
|
||||
RUN --mount=type=cache,target=/go/pkg/mod go mod download
|
||||
|
||||
COPY . .
|
||||
|
||||
# Build with version info from git (matching goreleaser ldflags)
|
||||
RUN CGO_ENABLED=1 GOOS=linux go build \
|
||||
# Build with version info from git (matching goreleaser ldflags).
|
||||
# BuildKit cache mounts persist the module + build caches across image builds,
|
||||
# so a source change recompiles incrementally instead of from scratch.
|
||||
RUN --mount=type=cache,target=/go/pkg/mod \
|
||||
--mount=type=cache,target=/root/.cache/go-build \
|
||||
CGO_ENABLED=1 GOOS=linux go build \
|
||||
-ldflags="-s -w \
|
||||
-X github.com/netbirdio/netbird/version.version=$(git describe --tags --always --dirty 2>/dev/null || echo 'dev') \
|
||||
-X main.commit=$(git rev-parse --short HEAD 2>/dev/null || echo 'unknown') \
|
||||
|
||||
109
docs/agent-networks/00-overview.md
Normal file
109
docs/agent-networks/00-overview.md
Normal file
@@ -0,0 +1,109 @@
|
||||
# Agent Networks — overview
|
||||
|
||||
Single entry point. Feature scope, the module map, and the cross-cutting
|
||||
topics worth keeping in mind, with links into every per-module guide.
|
||||
|
||||
## TL;DR
|
||||
|
||||
Agent Networks introduces an **LLM-aware reverse-proxy middleware system**
|
||||
plus **account-level controls** (budget rules, log collection toggles,
|
||||
PII redaction). The management server synthesises a per-peer middleware
|
||||
chain that the proxy executes on every LLM request; the chain enforces
|
||||
quotas, injects identity, redacts PII, parses tokens/cost, and emits
|
||||
access-log entries. The dashboard exposes the surface as a single **AI
|
||||
Observability** page with four tabs.
|
||||
|
||||
- **Backend** lives in this repo, primarily under
|
||||
`management/server/agentnetwork`, `proxy/internal/middleware`, and
|
||||
`proxy/internal/llm`, with wire contracts in `shared/management`.
|
||||
- **Dashboard** lives in the dashboard repo under
|
||||
`src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`.
|
||||
|
||||
## Reading order
|
||||
|
||||
| # | Doc | Why |
|
||||
|---|-----|-----|
|
||||
| 1 | [01-end-to-end-flows.md](01-end-to-end-flows.md) | Get the three big diagrams in your head first. |
|
||||
| 2 | [modules/10-shared-api.md](modules/10-shared-api.md) | Wire contracts — every other module either produces or consumes these. |
|
||||
| 3 | [modules/21-management-agentnetwork.md](modules/21-management-agentnetwork.md) | The largest module; everything the proxy executes originates here. |
|
||||
| 4 | [modules/30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md) | The generic plugin system on the proxy side. |
|
||||
| 5 | [modules/31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md) | The 8 LLM middlewares that ride on the framework. |
|
||||
| 6 | Everything else in any order. | |
|
||||
|
||||
## Module map
|
||||
|
||||
10 modules. Each is described in detail in its own file under
|
||||
[`modules/`](modules/).
|
||||
|
||||
| # | Module | Risk | BC impact |
|
||||
|---|--------|------|-----------|
|
||||
| 10 | [shared/api](modules/10-shared-api.md) — proto + OpenAPI | Low | Additive only |
|
||||
| 20 | [management/store](modules/20-management-store.md) — SQL persistence | Medium | Auto-migrate (additive) |
|
||||
| 21 | [management/agentnetwork](modules/21-management-agentnetwork.md) — domain layer + synthesizer | **High** | Additive |
|
||||
| 22 | [management/handlers + wiring](modules/22-management-handlers-wiring.md) — HTTP API + gRPC delivery | Medium | Additive |
|
||||
| 30 | [proxy/middleware-framework](modules/30-proxy-middleware-framework.md) — generic plugin system | High | Additive |
|
||||
| 31 | [proxy/middleware-builtin](modules/31-proxy-middleware-builtin.md) — 8 LLM middlewares | High | Additive |
|
||||
| 32 | [proxy/llm-parsers](modules/32-proxy-llm-parsers.md) — SDK adapters + pricing | Medium | Additive |
|
||||
| 33 | [proxy/runtime](modules/33-proxy-runtime.md) — translate + serve + access-log | High | Additive (touches hot path) |
|
||||
| 40 | [dashboard](modules/40-dashboard.md) — UI for everything above | Medium | Sidebar reshape |
|
||||
| 50 | [path-routed-providers](modules/50-path-routed-providers.md) — Vertex AI + Bedrock | Medium | Additive (new catalog entries) |
|
||||
|
||||
The largest and highest-risk module is `management/agentnetwork`: it is
|
||||
the single writer of the middleware chain the proxy executes.
|
||||
|
||||
## Cross-cutting topics
|
||||
|
||||
These are the items most likely to bite production. Each is fully
|
||||
documented in the linked module guide.
|
||||
|
||||
1. **Capture-pointer semantics** (`*bool` for `capture_prompt` and
|
||||
`capture_completion`): nil = legacy emit, false = suppress, true =
|
||||
emit. nil-vs-false must be handled at every JSON hop. See
|
||||
[21-management-agentnetwork.md](modules/21-management-agentnetwork.md)
|
||||
and [31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md).
|
||||
2. **`ProxyMapping.Private` preservation** on per-proxy live updates.
|
||||
Failure mode: `auth` skips `ValidateTunnelPeer` →
|
||||
`CapturedData.UserGroups` empty → `llm_router` denies. See
|
||||
[33-proxy-runtime.md](modules/33-proxy-runtime.md).
|
||||
3. **respInput carrying `UserEmail`/`UserGroups`/`UserGroupNames` onto
|
||||
the response leg** in `reverseproxy.go`. Load-bearing wire that lets
|
||||
`llm_limit_record` ship non-empty `group_ids` on `RecordLLMUsage`. See
|
||||
[33-proxy-runtime.md](modules/33-proxy-runtime.md).
|
||||
4. **Min-wins all-must-pass budget rule semantics**. Every matching
|
||||
rule's remaining quota must be > 0 for the request to proceed; one
|
||||
exhausted rule blocks the whole call. Documented in
|
||||
[21-management-agentnetwork.md](modules/21-management-agentnetwork.md)
|
||||
and the `llm_limit_check` middleware in
|
||||
[31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md).
|
||||
5. **body-tap memory bounds**: per-direction 1 MiB cap, shared 256 MiB
|
||||
budget, `LimitReader(r.Body, limit+1)` for truncation detection with
|
||||
`replayReadCloser` fallback so upstream still sees the full body.
|
||||
`cloneInputFor` deep-copies the body up to 16 times per chain — a
|
||||
perf hot-spot. See
|
||||
[30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md).
|
||||
6. **UpstreamRewrite.AuthHeader bypasses the header denylist**
|
||||
deliberately. The runtime consumer only unpacks it via the
|
||||
trusted upstream-build path. See
|
||||
[30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md).
|
||||
7. **`disable_access_log` default-false semantics**: the synth target
|
||||
sets it true, all other targets leave it false. See
|
||||
[10-shared-api.md](modules/10-shared-api.md).
|
||||
8. **String-typed `decision` / `deny_code`** on
|
||||
`CheckLLMPolicyLimitsResponse` — would benefit from enum pinning
|
||||
before external consumers integrate. See
|
||||
[10-shared-api.md](modules/10-shared-api.md).
|
||||
|
||||
## Explicit non-goals
|
||||
|
||||
- **Reaper / GC pass over stale synth services** — designed but cut from
|
||||
scope.
|
||||
- **URL-sync for tab state on AI Observability** — read path is wired
|
||||
(`?tab=`) but write path isn't. Future work.
|
||||
- **CI golden-file regen-and-diff for `types.gen.go` /
|
||||
`proxy_service.pb.go`** — would catch codegen drift; not yet in place.
|
||||
|
||||
## Where to read the code
|
||||
|
||||
Per-module file scopes are listed in each module guide. Behaviour is
|
||||
covered by Go tests co-located with each package (and an end-to-end
|
||||
chain integration test under `proxy/internal/proxy`).
|
||||
217
docs/agent-networks/01-end-to-end-flows.md
Normal file
217
docs/agent-networks/01-end-to-end-flows.md
Normal file
@@ -0,0 +1,217 @@
|
||||
# End-to-end flows
|
||||
|
||||
Three cross-module mermaid diagrams. Each per-module guide repeats the
|
||||
slice that's relevant to its own scope — these are the canonical
|
||||
top-down views.
|
||||
|
||||
- [Flow A — Config → runtime (synth + deliver)](#flow-a--config--runtime-synth--deliver)
|
||||
- [Flow B — Request lifecycle through the LLM chain](#flow-b--request-lifecycle-through-the-llm-chain)
|
||||
- [Flow C — Budget rule feedback loop](#flow-c--budget-rule-feedback-loop)
|
||||
|
||||
---
|
||||
|
||||
## Flow A — Config → runtime (synth + deliver)
|
||||
|
||||
How an operator's change to a Provider, Policy, Guardrail, Budget Rule,
|
||||
or Settings record ends up as live middleware on a peer's proxy.
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
actor Op as Operator
|
||||
participant UI as Dashboard
|
||||
participant HTTP as management/handlers
|
||||
participant Mgr as agentnetwork.Manager
|
||||
participant Store as management/store (SQL)
|
||||
participant Ctl as network_map.Controller
|
||||
participant Synth as agentnetwork.SynthesizeServices
|
||||
participant Grpc as management gRPC
|
||||
participant Proxy as netbird-proxy
|
||||
participant Xlate as middleware_translate
|
||||
participant Chain as middleware.Chain
|
||||
|
||||
Op->>UI: edit provider/policy/budget/settings
|
||||
UI->>HTTP: REST PUT/POST /api/agent-network/*
|
||||
HTTP->>Mgr: SaveProvider / SavePolicy / SaveBudgetRule / SaveSettings
|
||||
Mgr->>Store: persist (gorm)
|
||||
Mgr-->>Ctl: account change event (Network-Map dirty)
|
||||
loop per connected peer
|
||||
Ctl->>Synth: SynthesizeServices(ctx, store, accountID)
|
||||
Synth->>Store: load providers, policies, guardrails, budget rules, settings
|
||||
Synth-->>Synth: build per-peer Service list
|
||||
Note over Synth: each Service has a middleware<br/>chain with capture_prompt /<br/>capture_completion / redact_pii<br/>baked from account settings
|
||||
Synth-->>Ctl: []rpservice.Service
|
||||
Ctl->>Grpc: NetworkMap push (services + middleware configs)
|
||||
end
|
||||
Grpc-->>Proxy: NetworkMap stream
|
||||
Proxy->>Xlate: translate proto MiddlewareConfig → runtime Spec
|
||||
Xlate->>Chain: register / replace per-service chain
|
||||
Note over Chain: chain replacement is live<br/>(no proxy restart, in-flight<br/>requests unaffected)
|
||||
```
|
||||
|
||||
**Notes on the diagram**
|
||||
|
||||
- The `network_map.Controller` synthesises on every push, not on a
|
||||
timer. A single config change costs O(connected peers × policies ×
|
||||
providers) per push. See [`modules/22-management-handlers-wiring.md`](modules/22-management-handlers-wiring.md).
|
||||
- `SynthesizeServices` is the single source of truth for the wire
|
||||
format the proxy executes. Anything the proxy does that the
|
||||
synthesiser didn't request is a bug. See
|
||||
[`modules/21-management-agentnetwork.md`](modules/21-management-agentnetwork.md).
|
||||
- The translate step (step 12) is the only place that knows the
|
||||
middleware-ID strings on the proxy side. It must reject unknown IDs;
|
||||
silently dropping middlewares would create a security gap (e.g.
|
||||
missing `llm_limit_check` ⇒ unbounded spend). See
|
||||
[`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
|
||||
|
||||
---
|
||||
|
||||
## Flow B — Request lifecycle through the LLM chain
|
||||
|
||||
What happens when an agent on the client peer sends a chat-completion /
|
||||
messages request through the synthesised reverse-proxy.
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
actor Agent as Agent (local)
|
||||
participant Px as netbird-proxy
|
||||
participant Auth as auth middleware
|
||||
participant Map as service-mapping
|
||||
participant Req as llm_request_parser
|
||||
participant Rt as llm_router
|
||||
participant Chk as llm_limit_check
|
||||
participant Inj as llm_identity_inject
|
||||
participant Grd as llm_guardrail
|
||||
participant Up as upstream LLM
|
||||
participant Resp as llm_response_parser
|
||||
participant Cost as cost_meter
|
||||
participant Rec as llm_limit_record
|
||||
participant Log as access-log
|
||||
participant MgmtGrpc as management gRPC
|
||||
|
||||
Agent->>Px: POST /v1/chat/completions (OpenAI / Anthropic)
|
||||
Px->>Auth: identify peer (user, groups)
|
||||
Auth->>Map: resolve service from Host + path
|
||||
Map-->>Req: dispatch chain in slot order
|
||||
|
||||
Req->>Req: parse body → provider, model, prompt, token estimate
|
||||
Note over Req: capture_prompt gates raw_prompt<br/>capture (nil = legacy emit,<br/>false = drop, true = emit)
|
||||
Req->>Rt: pass metadata
|
||||
Rt->>Chk: route to upstream candidate
|
||||
|
||||
Chk->>MgmtGrpc: CheckLLMPolicyLimits(provider, model, est_tokens, groups, user)
|
||||
MgmtGrpc-->>Chk: decision = allow / deny + deny_code
|
||||
alt decision == deny
|
||||
Chk-->>Log: emit access-log with deny_code<br/>(if EnableLogCollection)
|
||||
Chk-->>Agent: 429 (or 403 per deny_code)
|
||||
else decision == allow
|
||||
Chk->>Inj: continue
|
||||
Inj->>Inj: inject NetBird identity headers per provider config
|
||||
Inj->>Grd: continue
|
||||
Grd->>Grd: enforce model allowlist
|
||||
Grd->>Up: forward (over WireGuard)
|
||||
Up-->>Resp: response (JSON or SSE stream)
|
||||
Resp->>Resp: parse usage tokens, completion
|
||||
Note over Resp: capture_completion gates raw<br/>completion capture
|
||||
Resp->>Cost: tokens
|
||||
Cost->>Cost: lookup pricing.yaml + compute cost
|
||||
Cost->>Rec: tokens + cost
|
||||
Rec->>MgmtGrpc: RecordLLMUsage(provider, model, prompt_t, completion_t, cost, groups, user)
|
||||
Rec-->>Log: emit access-log entry<br/>(if EnableLogCollection)
|
||||
Log-->>Agent: 200 + body (streamed if SSE)
|
||||
end
|
||||
```
|
||||
|
||||
**Notes on the diagram**
|
||||
|
||||
- The chain runs in synth-defined order. Re-ordering middlewares
|
||||
changes invariants — `llm_limit_check` must run before the upstream forward so
|
||||
a denied request never hits upstream, and `llm_limit_record` must
|
||||
pair with `llm_limit_check` so a successful check is always recorded
|
||||
(or the rate-limit semantics break). See
|
||||
[`modules/31-proxy-middleware-builtin.md`](modules/31-proxy-middleware-builtin.md).
|
||||
- `llm_guardrail` is also where PII redaction happens
|
||||
(`redact_pii = settings.RedactPii`). Phones, emails, credit cards,
|
||||
PII names — see `redact.go` for the full set. See
|
||||
[`modules/31-proxy-middleware-builtin.md`](modules/31-proxy-middleware-builtin.md).
|
||||
- SSE streaming requires special handling on the response side; the
|
||||
parser must handle partial chunks without buffering the whole
|
||||
stream. See [`modules/32-proxy-llm-parsers.md`](modules/32-proxy-llm-parsers.md).
|
||||
- Access-log emission is gated on `settings.EnableLogCollection`. With
|
||||
it OFF, neither the deny nor the allow leg writes an entry — the
|
||||
chain still runs (budget rules are still enforced) but no audit trail
|
||||
is kept. See
|
||||
[`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
|
||||
|
||||
---
|
||||
|
||||
## Flow C — Budget rule feedback loop
|
||||
|
||||
How an account's budget rules tighten ceilings on every request and how
|
||||
consumption flows back into the dashboard.
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph Operator
|
||||
DashBud[Dashboard Budget Settings tab]
|
||||
end
|
||||
subgraph Mgmt[Management]
|
||||
Save[POST/PUT /api/agent-network/budget-rules]
|
||||
Store[(SQL store)]
|
||||
Synth[SynthesizeServices]
|
||||
Check[CheckLLMPolicyLimits RPC]
|
||||
Rec[RecordLLMUsage RPC]
|
||||
Cons[/api/agent-network/consumption]
|
||||
end
|
||||
subgraph Proxy[Proxy]
|
||||
Chk[llm_limit_check]
|
||||
RecMw[llm_limit_record]
|
||||
end
|
||||
subgraph DashView[Dashboard Budget Dashboard tab]
|
||||
Panel[AgentConsumptionPanel]
|
||||
end
|
||||
|
||||
DashBud -->|create / update rules| Save
|
||||
Save --> Store
|
||||
Store --> Synth
|
||||
Synth -->|push synth-services to peer| Proxy
|
||||
|
||||
Chk -->|per request| Check
|
||||
Check -->|aggregate matching rules<br/>min-wins all-must-pass| Store
|
||||
Check -->|allow / deny| Chk
|
||||
|
||||
RecMw -->|post-response| Rec
|
||||
Rec -->|tokens + cost + groups + user| Store
|
||||
|
||||
Store -->|read counters| Cons
|
||||
Cons --> Panel
|
||||
```
|
||||
|
||||
**Notes on the diagram**
|
||||
|
||||
- **min-wins all-must-pass** is the core semantic. A budget rule binds
|
||||
to (group set, user set) with a (window, ceiling). At check time,
|
||||
every rule that matches the caller is evaluated; if ANY rule has
|
||||
zero remaining quota the request is denied. This is the most
|
||||
surprising semantic for operators — see the invariants section of
|
||||
[`modules/21-management-agentnetwork.md`](modules/21-management-agentnetwork.md).
|
||||
- The proxy never makes its own budget decisions. It always asks
|
||||
management via `CheckLLMPolicyLimits` and reports back via
|
||||
`RecordLLMUsage`. This keeps account-wide accounting in one place
|
||||
and avoids per-proxy drift.
|
||||
- `RecordLLMUsage` must carry `group_ids` and `user_id` so the
|
||||
decrement hits the right rule(s). The wire that carries those
|
||||
fields onto the response leg is `respInput` in `reverseproxy.go`. See
|
||||
[`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
|
||||
- The dashboard's Budget Dashboard tab polls
|
||||
`/api/agent-network/consumption` — not gRPC, not WebSocket. Poll
|
||||
interval lives in `AgentConsumptionPanel.tsx`. See
|
||||
[`modules/40-dashboard.md`](modules/40-dashboard.md).
|
||||
|
||||
---
|
||||
|
||||
## Cross-references
|
||||
|
||||
- Per-module guides: [`modules/`](modules/)
|
||||
- Overview + module map: [`00-overview.md`](00-overview.md)
|
||||
66
docs/agent-networks/README.md
Normal file
66
docs/agent-networks/README.md
Normal file
@@ -0,0 +1,66 @@
|
||||
# Agent Networks — architecture documentation
|
||||
|
||||
A self-contained set of documents describing the agent-networks feature:
|
||||
an LLM-aware reverse-proxy middleware system plus account-level controls
|
||||
(budget rules, log collection toggles, PII redaction). The management
|
||||
server synthesises a per-peer middleware chain that the proxy executes on
|
||||
every LLM request.
|
||||
|
||||
## What to read first
|
||||
|
||||
1. **[00-overview.md](00-overview.md)** — the single entry point. Feature
|
||||
scope, the module map, and the cross-cutting topics worth keeping in
|
||||
mind, with links to every per-module guide.
|
||||
2. **[01-end-to-end-flows.md](01-end-to-end-flows.md)** — three
|
||||
high-level mermaid diagrams: config-to-runtime synth/delivery,
|
||||
per-request lifecycle through the LLM chain, and the budget-rule
|
||||
feedback loop.
|
||||
3. **Per-module guides** under `modules/` — one file per package. Each
|
||||
describes the module boundary, the file-level layout, its own flow
|
||||
diagrams, the public contracts, the invariants it relies on, and the
|
||||
areas worth the closest attention.
|
||||
|
||||
## Directory layout
|
||||
|
||||
```
|
||||
docs/agent-networks/
|
||||
├── README.md # you are here
|
||||
├── 00-overview.md # feature summary + module map
|
||||
├── 01-end-to-end-flows.md # cross-module mermaid diagrams
|
||||
└── modules/
|
||||
├── 10-shared-api.md # proto + OpenAPI wire contracts
|
||||
├── 20-management-store.md # SQL persistence layer
|
||||
├── 21-management-agentnetwork.md # domain layer + synthesizer (largest)
|
||||
├── 22-management-handlers-wiring.md # HTTP API + gRPC delivery
|
||||
├── 30-proxy-middleware-framework.md # generic plugin system
|
||||
├── 31-proxy-middleware-builtin.md # 8 LLM-aware middlewares
|
||||
├── 32-proxy-llm-parsers.md # OpenAI/Anthropic/Bedrock SDKs + pricing
|
||||
├── 33-proxy-runtime.md # translate + serve + access-log
|
||||
├── 40-dashboard.md # UI for everything above (lives in the dashboard repo)
|
||||
└── 50-path-routed-providers.md # Vertex AI + Bedrock (path-routed, keyfile:: creds, /bedrock prefix)
|
||||
```
|
||||
|
||||
The `40-dashboard.md` module documents code that lives in the **dashboard
|
||||
repo**, not in this repo. The guide is co-located here so backend readers
|
||||
see the full picture in one place.
|
||||
|
||||
## How the per-module guides are structured
|
||||
|
||||
Every `modules/*.md` follows the same template so the docs are easy to
|
||||
scan:
|
||||
|
||||
- **Module boundary** — what this package owns; where it sits in the stack.
|
||||
- **Files** — path / role.
|
||||
- **Architecture & flow** — one or more mermaid diagrams.
|
||||
- **Public contracts** — function signatures, gRPC messages, JSON shapes.
|
||||
- **Invariants** — semantic guarantees the module relies on or enforces.
|
||||
- **Things to scrutinize** — split by correctness / security /
|
||||
concurrency / backward-compat / performance / observability.
|
||||
- **Test coverage** — the test files that lock down behaviour in this
|
||||
module.
|
||||
- **Known limitations / non-goals** — what is intentionally out of scope.
|
||||
- **Cross-references** — upstream/downstream module links + the
|
||||
end-to-end flow + the overview.
|
||||
|
||||
See [00-overview.md](00-overview.md) for the module map and the
|
||||
cross-cutting topics.
|
||||
105
docs/agent-networks/modules/10-shared-api.md
Normal file
105
docs/agent-networks/modules/10-shared-api.md
Normal file
@@ -0,0 +1,105 @@
|
||||
# shared/api — wire contracts (proto + OpenAPI)
|
||||
|
||||
> **Risk level:** Medium — wire-format surface that every other module pins against; backward-compat hinges on field-number discipline more than on logic correctness.
|
||||
> **Backward-compat impact:** Additive only (new proto fields use unallocated numbers, new RPCs default to `Unimplemented`, new OpenAPI schemas/paths are append-only; no existing field/RPC/schema removed or renumbered).
|
||||
|
||||
## Module boundary
|
||||
This module owns the cross-process contract surface between management, proxy, and dashboard. Two artefacts: `shared/management/proto/proxy_service.proto` (management↔proxy gRPC) and `shared/management/http/api/openapi.yml` (dashboard/CLI↔management REST). Both have generated companions checked in (`proxy_service.pb.go`, `proxy_service_grpc.pb.go`, `types.gen.go`) which must travel in lockstep with their sources. `shared/management/status/error.go` is in scope only for the four new typed `NotFound` constructors that the new HTTP handlers return.
|
||||
|
||||
Everything downstream — `management/agentnetwork`, `management/server/http/handlers/*`, `proxy/internal/*`, the dashboard SDK — consumes these types verbatim. The concern here is wire stability and codegen reproducibility, not behaviour: behaviour is covered in the management and proxy module guides.
|
||||
|
||||
`management.proto` and `signalexchange.proto` are unchanged. `status/error.go` only receives four additive constructors (lines 208-227); no existing error types are reshaped.
|
||||
|
||||
## Files
|
||||
| Path | Role |
|
||||
| ---- | ---- |
|
||||
| `shared/management/proto/proxy_service.proto` | Source of truth: 2 new RPCs, 1 new message group (`MiddlewareConfig` + slot enum), additive fields on `PathTargetOptions`, `AccessLog`, `RecordLLMUsageRequest` |
|
||||
| `shared/management/proto/proxy_service.pb.go` | Generated (protoc-gen-go) |
|
||||
| `shared/management/proto/proxy_service_grpc.pb.go` | Generated; adds `CheckLLMPolicyLimits` + `RecordLLMUsage` client/server stubs and `UnimplementedProxyServiceServer` defaults |
|
||||
| `shared/management/http/api/openapi.yml` | 15 new `AgentNetwork*` schemas, 9 new path groups under `/api/agent-network/*` |
|
||||
| `shared/management/http/api/types.gen.go` | Generated (oapi-codegen; see codegen note below) |
|
||||
| `shared/management/status/error.go` | Four `NotFound` constructors for the new resource kinds (lines 208-227) |
|
||||
|
||||
## Architecture & flow
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Dash as Dashboard / CLI
|
||||
participant Mgmt as management (HTTP+gRPC)
|
||||
participant Px as proxy
|
||||
|
||||
Note over Dash,Mgmt: REST (OpenAPI / types.gen.go)
|
||||
Dash->>Mgmt: PUT /api/agent-network/providers (AgentNetworkProviderRequest)
|
||||
Dash->>Mgmt: PUT /api/agent-network/settings (AgentNetworkSettingsRequest)
|
||||
Dash->>Mgmt: GET /api/agent-network/consumption -> [AgentNetworkConsumption]
|
||||
|
||||
Note over Mgmt,Px: gRPC ProxyService (proxy_service.proto)
|
||||
Mgmt-->>Px: SyncMappingsResponse{ ProxyMapping.path[*].options.middlewares,<br/>agent_network, disable_access_log, capture_* }
|
||||
Px->>Mgmt: CheckLLMPolicyLimits(account, user, groups, provider, model)
|
||||
Mgmt-->>Px: decision=allow|deny + selected_policy_id + attribution_group_id + window_seconds
|
||||
Px->>Mgmt: RecordLLMUsage(account, user, group_id, group_ids, window_seconds, tokens, cost)
|
||||
Px->>Mgmt: SendAccessLog(AccessLog{ agent_network=true })
|
||||
```
|
||||
|
||||
The proto changes split into three independent slices: (1) **mapping enrichment** — `PathTargetOptions` grows fields 8-13 so management can ship middleware configs, capture limits, and the agent-network / log-suppression flags down to the proxy without a second RPC; (2) **two new request/response RPCs** (`CheckLLMPolicyLimits`, `RecordLLMUsage`) for per-LLM-request budget arbitration; (3) **observability tag** — `AccessLog.agent_network` so management can route logs to the right surface.
|
||||
|
||||
The OpenAPI side is a thin CRUD surface — every resource (`Provider`, `Policy`, `Guardrail`, `BudgetRule`, `Settings`) follows the same `GET-list / POST / GET / PUT / DELETE` pattern, plus a read-only `/consumption` listing and a catalog endpoint. The `*Request` variants drop server-controlled fields (id, timestamps). `AgentNetworkBudgetRule` deliberately reuses `AgentNetworkPolicyLimits` to keep wire-shape parity with policies.
|
||||
|
||||
## Public contracts added
|
||||
- gRPC RPCs (`proxy_service.proto:52-57`): `CheckLLMPolicyLimits(CheckLLMPolicyLimitsRequest) → CheckLLMPolicyLimitsResponse`, `RecordLLMUsage(RecordLLMUsageRequest) → RecordLLMUsageResponse`. Both unary; default `UnimplementedProxyServiceServer` returns `codes.Unimplemented` (`proxy_service_grpc.pb.go:283-289`).
|
||||
- New messages (`proxy_service.proto:145-175,448-502`): `MiddlewareConfig`, `MiddlewareSlot` enum, `CheckLLMPolicyLimitsRequest`/`Response`, `RecordLLMUsageRequest`/`Response`.
|
||||
- New `PathTargetOptions` fields 8-13 (`proxy_service.proto:124-140`): `capture_max_request_bytes`, `capture_max_response_bytes`, `capture_content_types`, `middlewares`, `agent_network`, `disable_access_log`. All default-false / zero; pre-existing fields 1-7 byte-for-byte unchanged.
|
||||
- `AccessLog.agent_network = 18` (`proxy_service.proto:258-261`).
|
||||
- `RecordLLMUsageRequest.group_ids = 8` (`proxy_service.proto:496-498`) — so the record path can fan out to every applicable budget rule's window without a re-lookup.
|
||||
- 15 new OpenAPI component schemas (`openapi.yml:5072-5829`): `AgentNetworkProvider[Request|Model]`, `AgentNetworkCatalog{Model,Provider,IdentityInjection,HeaderPairInjection,JSONMetadataInjection,ExtraHeader}`, `AgentNetworkPolicy[Request|TokenLimit|BudgetLimit|Limits]`, `AgentNetworkGuardrail[Checks|Request]`, `AgentNetworkConsumption`, `AgentNetworkSettings[Request]`, `AgentNetworkBudgetRule[Request]`.
|
||||
- 9 new path groups (`openapi.yml:12797-13460`): `/api/agent-network/{consumption,settings,budget-rules,budget-rules/{ruleId},catalog/providers,providers,providers/{providerId},policies,policies/{policyId},guardrails,guardrails/{guardrailId}}`.
|
||||
- Four typed NotFound errors (`shared/management/status/error.go:208-227`).
|
||||
|
||||
## Invariants
|
||||
- **Field-number monotonicity.** Every new proto field uses a previously-unallocated number in its message: `PathTargetOptions` 8-13 (was 1-7), `AccessLog` 18 (was 1-17), `RecordLLMUsageRequest` 8. `SendStatusUpdateRequest.inbound_listener = 50` (pre-existing) reserves 50+ for observability extensions, so 8 on `RecordLLMUsageRequest` doesn't conflict.
|
||||
- **Old proxies stay compatible.** Old management never sends `disable_access_log`/`middlewares`/`agent_network` (zero value → existing behaviour); old proxies that don't decode these fields just drop them silently (proto3 unknown-field semantics) — log emission stays on. No pre-existing field number changed: the proto change is insertions only.
|
||||
- **Old management stays compatible.** The two new RPCs are registered on the same `management.ProxyService` descriptor; a new proxy calling them on a management server that doesn't implement them gets `codes.Unimplemented` — the same code the unimplemented embed returns (`proxy_service_grpc.pb.go:283-289`) — which is the same fallback pattern `SyncMappings` already documents (`proxy_service.proto:20-21`).
|
||||
- **OpenAPI shapes are append-only.** New schemas are placed at the end of `components.schemas` (line 5072+); new paths at the end of `paths` (line 12797+). No existing schema's `required` list, enum, or property type was changed.
|
||||
- **`*Request` vs response asymmetry.** Read shapes (`AgentNetworkProvider`, `AgentNetworkPolicy`, `AgentNetworkGuardrail`, `AgentNetworkSettings`, `AgentNetworkBudgetRule`) require `created_at`/`updated_at`; the matching `*Request` shapes do not — server fills them. `AgentNetworkProviderRequest.api_key` is write-only (`openapi.yml:5158-5161` "never returned in responses"); reviewers should confirm the response schema (5072-5138) actually omits `api_key`.
|
||||
|
||||
## Things to scrutinize
|
||||
### Correctness
|
||||
- `RecordLLMUsageRequest` carries both `group_id` (singular, the attribution group — field 3) and `group_ids` (plural, full membership — field 8). `b22d5a181` adds field 8 to drive account-budget fan-out; double-check that consumers can't accidentally key counters on the wrong one. Field comments at `proxy_service.proto:489-491` and `496-498` distinguish them but it's the kind of subtle thing a follow-up commit might collapse.
|
||||
- `PathTargetOptions.disable_access_log` is the only field whose default-false meaning **changes semantics** on the proxy side: false → log (status quo), true → suppress. Synthesizer sets `DisableAccessLog = !settings.EnableLogCollection`, so a missing/default settings row yields `EnableLogCollection=false → DisableAccessLog=true → suppressed`. Worth confirming downstream (`agentnetwork.synthesizer`) that operator-defined private services never inherit this flag — the proto field default protects them, but only if synth code is explicit.
|
||||
- `CheckLLMPolicyLimitsResponse.decision` is a free-form `string` (`proxy_service.proto:471`) rather than an enum. Only documented values are "allow" / "deny". An enum would prevent typo drift; consider before this RPC ships to external consumers.
|
||||
- `deny_code` (`proxy_service.proto:478-481`) is documented as "a stable label" but is also a free string. Pin the allowed set somewhere observable to the proxy.
|
||||
|
||||
### Security
|
||||
- `AgentNetworkProvider.api_key` MUST be write-only. Schema split (request has it at line 5158; response omits it) looks correct, but a regression here leaks the upstream provider credential to every dashboard reader. Check that the handler explicitly zeros it on the response path.
|
||||
- `extra_values` / `identity_header_*` headers on `AgentNetworkProvider` get stamped onto upstream requests. Description at `openapi.yml:5099` says "values not declared by the catalog are ignored at synth time" — a contract this module documents but the synthesizer must enforce. Confirm the synth module honours it.
|
||||
- Cluster + subdomain on `AgentNetworkSettings` are documented immutable (`openapi.yml:5686-5694`) and the `AgentNetworkSettingsRequest` (lines 5733-5752) doesn't accept them. Verify the `PUT /api/agent-network/settings` handler can't be tricked by extra JSON keys (the schema does not declare `additionalProperties: false`, and OpenAPI defaults to permissive, so the oapi-codegen output won't reject unknown keys by itself).
|
||||
|
||||
### Backward compatibility
|
||||
- The proto change is field-number additive: every previously numbered field keeps the same name + type, and the change is insertions only (no deletions in `proxy_service.proto`), so this holds at the source-text level.
|
||||
- `proxy_service_grpc.pb.go` adds two RPC handlers and registers them in `ProxyService_ServiceDesc.Methods` (lines 543-552). The existing entries are unchanged and order-preserving — gRPC method dispatch is name-keyed, so order doesn't matter, but reviewing the diff (no method renamed/dropped) is still worth a glance.
|
||||
- OpenAPI 3.0 doesn't have a built-in deprecation flow for paths; if any client tooling iterates `paths.*`, the additive routes shouldn't break it, but generated SDKs (especially the dashboard's) need a regen to gain access to `AgentNetwork*`.
|
||||
|
||||
### Codegen pinning
|
||||
- `generate.sh` (`shared/management/http/api/generate.sh:14`) installs `oapi-codegen@latest` rather than a pinned version. **This is a reproducibility gap** — re-running the script later may produce a different `types.gen.go`. Either pin the version in `generate.sh` (e.g. `@v2.7.0`) or document the pin in a `tools.go`.
|
||||
- proto codegen has the protoc / protoc-gen-go version stamped in the generated file header (`proxy_service.pb.go:3-4`).
|
||||
- Regenerate locally and confirm zero diff against the committed `types.gen.go` / `proxy_service.pb.go`.
|
||||
|
||||
## Test coverage
|
||||
| Test file | Locks down |
|
||||
| --------- | ---------- |
|
||||
| None in this scope | The proto and OpenAPI sources are tested transitively by the handler tests (`shared/management/http/handlers/agentnetwork/...`) and by the synthesizer/manager tests (`management/server/agentnetwork/...`). No round-trip serialisation test exists in the `proto/` or `api/` packages themselves. |
|
||||
| `shared/management/proto/*_test.go` | (absent) |
|
||||
| `shared/management/http/api/*_test.go` | (absent) |
|
||||
|
||||
Acceptable for codegen artefacts, but a single golden-file test that re-runs `oapi-codegen` and `protoc` in CI and diffs against the checked-in files would close the reproducibility gap noted above.
|
||||
|
||||
## Known limitations / explicit non-goals
|
||||
- **No deprecation surface.** Old fields/RPCs are kept silently; there is no `[deprecated = true]` annotation on anything. Acceptable here because nothing is being removed.
|
||||
- **No proto-side validation.** Numeric ranges (e.g. `window_seconds >= 60`, `cost_usd >= 0`, capture-byte clamps) are enforced in the OpenAPI schema via `minimum:` and inside Go code by the proxy/management, but `proto3` itself can't express them; downstream is expected to validate every message.
|
||||
- **`MiddlewareConfig.config_json` is `bytes`** (`proxy_service.proto:163`) — opaque to the proto layer. Schema validity is the middleware factory's problem. This is a deliberate tradeoff (per the comment at 161-162) but worth flagging: a corrupted/malicious config_json can only fail at proxy apply time, not at the wire-decode step.
|
||||
- **No catalog endpoint schema for the catalog itself** — the catalog data is served by `GET /api/agent-network/catalog/providers`, which returns `[AgentNetworkCatalogProvider]` (`openapi.yml:13024`), but the catalog source-of-truth lives in `management/server/agentnetwork/catalog`, not here.
|
||||
- The reaper / GC design was cut from scope; no reaper-related types appear here.
|
||||
|
||||
## Cross-references
|
||||
- Downstream: [management/store](20-management-store.md), [management/agentnetwork](21-management-agentnetwork.md), [management/handlers + wiring](22-management-handlers-wiring.md), [proxy/runtime](33-proxy-runtime.md)
|
||||
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
|
||||
- Top-level: [../00-overview.md](../00-overview.md)
|
||||
112
docs/agent-networks/modules/20-management-store.md
Normal file
112
docs/agent-networks/modules/20-management-store.md
Normal file
@@ -0,0 +1,112 @@
|
||||
# management/store — persistence for agent-network entities
|
||||
|
||||
> **Risk level:** Medium — six brand-new tables behind AutoMigrate, one upsert-counter table that runs on the request hot path, and one column carrying an encrypted secret.
|
||||
> **Backward-compat impact:** Additive (six new tables created by AutoMigrate; the `Store` interface gains 23 methods, but no existing column/index is touched).
|
||||
|
||||
## Module boundary
|
||||
|
||||
This module is the persistence layer for the Agent Network feature. Everything the management server stores about LLM proxying — providers, policies, guardrails, the per-account settings row, a usage-counter table written on every proxied LLM request, and the account-budget rules — flows through the methods added to `store.Store`. The module owns six tables, six entity types from `management/server/agentnetwork/types`, and a single hot-path upsert (`IncrementAgentNetworkConsumption`) consumed by the proxy fleet.
|
||||
|
||||
Out of scope here: the catalog of provider definitions (compiled-in, no DB), the synthesizer/manager built on top of these CRUDs (covered in [21-management-agentnetwork.md](21-management-agentnetwork.md)), and the HTTP handlers that translate API requests into Save/Delete calls.
|
||||
|
||||
## Files
|
||||
|
||||
| Path | Role |
|
||||
| ---- | ---- |
|
||||
| `management/server/store/sql_store_agentnetwork.go` | gorm implementations of all 23 store methods |
|
||||
| `management/server/store/sql_store_agentnetwork_budgetrule_test.go` | round-trip + account-scoping coverage against a real sqlite store |
|
||||
| `management/server/store/sql_store.go` | one import, six entities appended to the `AutoMigrate` slice (sql_store.go:40, sql_store.go:141-142) |
|
||||
| `management/server/store/store.go` | 23 methods added to the `Store` interface (store.go:328-354) |
|
||||
| `management/server/store/store_mock_agentnetwork.go` | mockgen output for the new interface surface |
|
||||
|
||||
## Tables added / migrations
|
||||
|
||||
All six tables are created by `db.AutoMigrate` invoked from `NewSqlStore` at sql_store.go:133-143. There is no hand-rolled SQL migration script — the schema is whatever GORM derives from the struct tags.
|
||||
|
||||
- `agent_network_providers` — `Provider.TableName()` at provider.go:76. PK `id`, index on `account_id`, named index `idx_agent_network_provider` on `provider_id`. Carries an at-rest-encrypted `api_key` and ed25519 `session_private_key` (provider.go:35,56). `extra_values` and `models` are JSON blobs (`serializer:json`).
|
||||
- `agent_network_policies` — `Policy.TableName()` at policy.go:70. PK `id`, index on `account_id`. JSON columns: `source_groups`, `destination_provider_ids`, `guardrail_ids`, `limits`.
|
||||
- `agent_network_guardrails` — `Guardrail.TableName()` at guardrail.go:41. PK `id`, index on `account_id`. JSON `checks`.
|
||||
- `agent_network_settings` — `Settings.TableName()` at settings.go:33. PK `account_id` (one row per account), named index `idx_agent_network_settings_cluster_subdomain` on `subdomain` only — the index name implies a composite, but only one column is tagged.
|
||||
- `agent_network_consumption` — `Consumption.TableName()` at consumption.go:46. Composite PK across `(account_id, dim_kind, dim_id, window_seconds, window_start_utc)` — the same tuple the upsert keys on.
|
||||
- `agent_network_budget_rules` — `AccountBudgetRule.TableName()` at budgetrule.go:35. PK `id`, index on `account_id`. JSON `target_groups`, `target_users`, `limits`.
|
||||
|
||||
## CRUD surface added
|
||||
|
||||
Provider, Policy, Guardrail, BudgetRule follow the same pattern: `Get<Kind>ByID`, `GetAccount<Kind>` (list), `Save<Kind>` (upsert), `Delete<Kind>`, with account-scoping enforced by the existing `accountAndIDQueryCondition` / `accountIDCondition` constants (sql_store.go:59-62). Provider additionally exposes `GetAllAgentNetworkProviders` (cross-account, used by the synthesizer). Settings exposes `Get`/`GetByCluster`/`Save` (no delete — one row per account, created on first save). Consumption exposes the upsert `Increment`, a point `Get`, and a cross-window `List`.
|
||||
|
||||
## Architecture & flow
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
handlers["HTTP handlers<br/>(management/server/agentnetwork)"] -->|Save/Delete| iface["Store interface<br/>store.go:328-354"]
|
||||
manager["agentnetwork.Manager"] -->|Get*| iface
|
||||
synth["synthesizer<br/>(global)"] -->|GetAllAgentNetworkProviders| iface
|
||||
proxy["proxy fleet<br/>(hot path)"] -->|IncrementAgentNetworkConsumption| iface
|
||||
iface --> sql["SqlStore methods<br/>sql_store_agentnetwork.go"]
|
||||
iface -.gomock.-> mock["MockStore<br/>store_mock_agentnetwork.go"]
|
||||
sql --> gorm["gorm.DB"]
|
||||
gorm --> tables[("6 tables<br/>agent_network_*")]
|
||||
sql --> enc["crypt.FieldEncrypt<br/>(provider only)"]
|
||||
```
|
||||
|
||||
Reads decrypt provider secrets in-place; writes do `provider.Copy().EncryptSensitiveData(...)` before `db.Save` so the caller's in-memory object keeps the plaintext `api_key` (sql_store_agentnetwork.go:88-102). Every list/get takes a `LockingStrength` and applies `clause.Locking{Strength: ...}` when non-`None` — matching the rest of the store. The upsert path uses `clause.OnConflict` with `gorm.Expr` server-side increments so concurrent proxy nodes converge without read-modify-write races (sql_store_agentnetwork.go:321-335).
|
||||
|
||||
## Invariants enforced at the store layer
|
||||
|
||||
- **Account scoping.** Every entity-by-ID method keys on `account_id = ? and id = ?`; no cross-tenant leak path through the API is reachable as long as callers always pass the auth'd `accountID` (sql_store_agentnetwork.go:70,141,201,429).
|
||||
- **NotFound mapping.** `gorm.ErrRecordNotFound` is translated to typed `status.NewAgentNetwork*NotFoundError`; `Delete*` returns NotFound when `RowsAffected == 0` (sql_store_agentnetwork.go:111-113,171-173,231-233,461-463).
|
||||
- **Provider secret encryption at rest.** `SaveAgentNetworkProvider` always encrypts before persist; `Get*` always decrypts after read. The plaintext `api_key` never reaches the DB through this layer (sql_store_agentnetwork.go:31,54,80,90).
|
||||
- **Consumption monotonicity.** The upsert only ever issues `col = col + ?` for the three counter columns — no decrement path exists (sql_store_agentnetwork.go:330-332).
|
||||
- **Window alignment is the caller's responsibility.** The store stamps `WindowStartUTC` as-passed; alignment to epoch happens in `types.WindowStart` at consumption.go:51-58.
|
||||
- **Settings has no Delete.** Intentional — one row per account, created on first save; the row sticks around for the account lifetime.
|
||||
|
||||
## Things to scrutinize
|
||||
|
||||
### Correctness
|
||||
- `SaveAgentNetworkProvider` saves the copy (sql_store_agentnetwork.go:95). The caller's in-memory pointer therefore keeps plaintext `api_key` and any `CreatedAt`/`UpdatedAt` gorm autofills land on the copy, not the original. Callers that need synced timestamps must re-fetch.
|
||||
- `IncrementAgentNetworkConsumption`'s `Create` provides initial counter values (`TokensInput: tokensIn`, etc.) in the row, and on conflict the assignments add the same deltas to the existing values. The insert-vs-update arithmetic is consistent. Cross-check that no engine in use (sqlite, postgres, mysql) silently rejects the `OnConflict` clause — GORM emits engine-specific SQL but `ON DUPLICATE KEY UPDATE` (mysql) vs `ON CONFLICT (...)` (sqlite/postgres) need their unique constraint to match the composite PK on `agent_network_consumption`; it does, by construction.
|
||||
- `IncrementAgentNetworkConsumption` writes `updated_at: time.Now().UTC()` literally inside the assignments map (sql_store_agentnetwork.go:333) — fine, but it's a Go-side timestamp captured at call time, not a DB-side `now()`. Acceptable for an audit field.
|
||||
- `GetAgentNetworkConsumption` returns a zero-valued non-nil row on `ErrRecordNotFound` (sql_store_agentnetwork.go:364-371). Document or rename — a typed sentinel error would be more orthodox; as written, callers must know that a missing row comes back as a zero-valued row rather than an error, so an error check cannot detect "not found".
|
||||
|
||||
### Concurrency / transactions
|
||||
- Hot-path `IncrementAgentNetworkConsumption` runs outside any explicit transaction; concurrency safety relies entirely on the DB serialising the `ON CONFLICT` upsert against the composite PK. This is correct for postgres and mysql; for sqlite it serialises behind the single writer.
|
||||
- `SaveAgentNetworkSettings` is a blind upsert with no version/etag — concurrent writes from two operators resolve as last-write-wins on the collection-toggle flags (settings.go:23-25). Acceptable for admin-curated state but worth flagging.
|
||||
- `Save*Provider` uses `db.Save` on a struct with a PK already set — GORM emits UPDATE or INSERT based on row existence. No upsert clause is attached, so a race between two creates with the same generated `xid` (vanishingly unlikely) would surface as a PK violation.
|
||||
|
||||
### Migration safety
|
||||
- All six tables ride `AutoMigrate` (sql_store.go:141-142). AutoMigrate is additive: new columns get added, but it never drops columns nor narrows types. Three `bool` columns on `agent_network_settings` (`EnableLogCollection`, `EnablePromptCollection`, `RedactPii`) default to false at the GORM/DDL layer for existing rows; the test at sql_store_agentnetwork_budgetrule_test.go:83-112 locks that down on a fresh sqlite. Verify postgres/mysql produce the same default.
|
||||
- The named index `idx_agent_network_settings_cluster_subdomain` on settings.go:15 is declared on only `subdomain`. Either the cluster column also needs `gorm:"index:idx_agent_network_settings_cluster_subdomain"` to make it composite, or the name is misleading.
|
||||
- The named index `idx_agent_network_provider` on `Provider.ProviderID` (provider.go:30) is *not* unique and not scoped to account — two providers in the same account with the same `provider_id` are permitted at the DB layer; uniqueness, if any, must live above the store.
|
||||
|
||||
### Backward compatibility
|
||||
- Net additive. No removed methods, no renamed columns, no schema change to existing tables. Existing deployments running a prior binary continue to work; the first boot of the new binary creates the six tables.
|
||||
- The `Store` interface grows by 23 methods (store.go:330-354); any non-mock external implementer of `store.Store` will fail to compile. The repo only has `SqlStore` + `MockStore`, both updated.
|
||||
|
||||
### Performance (indexes, N+1)
|
||||
- All by-account list queries hit the `idx_account_id` per-table index. No N+1: list methods return the full slice in one query.
|
||||
- `GetAgentNetworkSettingsByCluster` (sql_store_agentnetwork.go:263-277) does a table scan on `cluster` — no index. Tolerable for the bootstrap label generator (one-shot at provisioning) but worth noting if the call moves onto a hot path.
|
||||
- `ListAgentNetworkConsumption` returns every row ever recorded for the account (sql_store_agentnetwork.go:382-400) — unbounded growth, no `LIMIT`, no time filter. With one row per (dim, window) per request burst, this table grows fastest of the six; a retention job + a paginated list method are obvious follow-ups.
|
||||
|
||||
## Test coverage
|
||||
|
||||
| Test file | Locks down |
|
||||
| --------- | ---------- |
|
||||
| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkBudgetRule_RealStore_RoundTrip` | full save → reload of `AccountBudgetRule` including the JSON-serialised `PolicyLimits`, target slices, double-delete returns NotFound (lines 18-59) |
|
||||
| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkBudgetRule_RealStore_ScopedByAccount` | cross-account isolation for budget rules (lines 63-78) |
|
||||
| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkSettings_RealStore_CollectionTogglesRoundTrip` | collection toggles default off, survive save/reload at the set values (lines 83-112) |
|
||||
|
||||
Gap: there is no store-level test for providers (encryption round-trip), policies, guardrails, or `IncrementAgentNetworkConsumption` (concurrent upsert, window-key uniqueness). The consumption upsert is the most performance-sensitive method in this module, and it has no real-sqlite test.
|
||||
|
||||
## Known limitations / explicit non-goals
|
||||
|
||||
- No retention / GC for `agent_network_consumption`.
|
||||
- No `Delete` for `Settings` (one row per account, cleared with the account).
|
||||
- No DB-engine-specific tuning — the same struct tags drive sqlite, mysql, postgres.
|
||||
- Provider `extra_values` and `models` are JSON blobs; querying inside them is not supported by design.
|
||||
- `GetAgentNetworkConsumption` "not-found = zero row" contract is convenient but unconventional.
|
||||
|
||||
## Cross-references
|
||||
|
||||
- Upstream: [shared/api](10-shared-api.md), [management/agentnetwork](21-management-agentnetwork.md)
|
||||
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
|
||||
- Top-level: [../00-overview.md](../00-overview.md)
|
||||
225
docs/agent-networks/modules/21-management-agentnetwork.md
Normal file
225
docs/agent-networks/modules/21-management-agentnetwork.md
Normal file
@@ -0,0 +1,225 @@
|
||||
# management/agentnetwork — domain layer + synth pipeline
|
||||
|
||||
> **Risk level:** High — central business logic + budget enforcement + the source of every middleware-chain change the proxy executes.
|
||||
> **Backward-compat impact:** Additive within the agent-network surface; one **behavioural difference for opted-out accounts** in parser capture (the capture flag is stamped explicitly false instead of being absent — see capture-pointer semantics below). Non-agent-network proxy services are untouched (the synth chain only ships on `agent-net-svc-*` targets).
|
||||
|
||||
## Module boundary
|
||||
|
||||
`management/server/agentnetwork` owns every agent-network entity (providers, policies, guardrails, account budget rules, per-account settings, consumption rows) and **translates them into the in-memory `*rpservice.Service` that the reverse-proxy controller turns into `proto.ProxyMapping`s and pushes to clusters**. It is the *only* writer of the agent-network middleware chain.
|
||||
|
||||
Inside the package: `manager.go` is the CRUD + permissions-gated facade; `synthesizer.go` walks settings + providers + policies + guardrails and emits the per-account service plus every middleware's JSON config; `policyselect.go` runs per-request attribution (min-wins account ceiling, then "drain bigger pool first"); `reconcile.go` diffs successive synth outputs and emits precise Create/Update/Delete proxy-mapping updates plus a peer-map refresh. `labelgen/` mints DNS-safe subdomain labels; `catalog/` is the static provider catalogue; `types/` carries gorm entity structs. The `_realstack_test.go` files in the parent `management/server/` directory exercise the manager + network-map controller end-to-end with no mocks.
|
||||
|
||||
## Files
|
||||
|
||||
| Path | Role |
|
||||
| ---- | ---- |
|
||||
| `agentnetwork/manager.go` | Manager interface + CRUD + permission gates + bootstrap-settings + reconcile trigger |
|
||||
| `agentnetwork/synthesizer.go` | Settings/policy → wire-format synthesis; sole writer of the proxy middleware chain |
|
||||
| `agentnetwork/policyselect.go` | Per-request policy attribution + account-budget ceiling (min-wins) |
|
||||
| `agentnetwork/reconcile.go` | Per-account synth diff vs in-memory cache → Create/Update/Delete |
|
||||
| `agentnetwork/catalog/catalog.go` | Static provider catalogue (auth headers, identity-injection shapes) |
|
||||
| `agentnetwork/labelgen/{labelgen,words}.go` | DNS-safe subdomain picker + curated wordlist |
|
||||
| `agentnetwork/types/provider.go` | Provider entity + APIKey + Models + ExtraValues + SessionKeys |
|
||||
| `agentnetwork/types/policy.go` | Policy entity + `PolicyLimits` (token + budget) |
|
||||
| `agentnetwork/types/guardrail.go` | Guardrail entity (`ModelAllowlist`, `PromptCapture`) |
|
||||
| `agentnetwork/types/budgetrule.go` | `AccountBudgetRule` (reuses `PolicyLimits`) |
|
||||
| `agentnetwork/types/settings.go` | Per-account `Settings` (Cluster, Subdomain, 3 toggles) |
|
||||
| `agentnetwork/types/consumption.go` | `Consumption` row + `WindowStart` aligner |
|
||||
| `agentnetwork/{synthesizer,policyselect,reconcile,wire_shape}_*test.go` | See test coverage table |
|
||||
| `agentnetwork/types/consumption_test.go` | `WindowStart` alignment proofs |
|
||||
| `agentnetwork/labelgen/labelgen_test.go` | Deterministic picks + exhaustion + fallback |
|
||||
| `management/server/agentnetwork_realstack_test.go` | No-mock provider CRUD → network-map fan-out |
|
||||
| `management/server/agentnetwork_budgetrule_realstack_test.go` | No-mock budget-rule CRUD + settings preserve-immutable |
|
||||
|
||||
## Architecture & flow
|
||||
|
||||
### Synthesis (settings/policy → wire format)
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Mutation: provider/policy/guardrail/settings] --> B[managerImpl.reconcile accountID]
|
||||
B --> C{proxyController nil?}
|
||||
C -- yes --> D[accountManager.UpdateAccountPeers only]
|
||||
C -- no --> E[SynthesizeServices]
|
||||
E --> F[loadSettings — NotFound returns ok=false, no synth]
|
||||
F --> G[filterEnabledProviders sorted by CreatedAt]
|
||||
G --> H[filterEnabledPolicies]
|
||||
H --> I[backfillProviderSessionKeys if missing]
|
||||
I --> J[indexProviderGroups: providerID -> sorted source groups]
|
||||
J --> K[buildRouterConfigJSON drops orphan providers]
|
||||
J --> L[buildIdentityInjectConfigJSON per catalog entry]
|
||||
H --> M[mergeGuardrails: union allowlist, OR redact]
|
||||
M --> N[applyAccountCollectionControls account toggle = SOLE capture control]
|
||||
N --> O[marshalGuardrailConfig]
|
||||
K --> P[buildMiddlewareChain 8 middleware entries]
|
||||
L --> P
|
||||
O --> P
|
||||
P --> Q[buildAccountService: AccessGroups=union source groups, noop.invalid target]
|
||||
Q --> R[reconcile.diffMappings vs cache]
|
||||
R --> S[SendServiceUpdateToCluster CREATE/MODIFY/REMOVE]
|
||||
R --> T[accountManager.UpdateAccountPeers — fans synth ACLs into network map]
|
||||
```
|
||||
|
||||
### Budget rule resolution (min-wins, group+user bound)
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[SelectPolicyForRequest in] --> B[checkAccountBudget — runs FIRST, independent of policies]
|
||||
B --> C[GetAccountAgentNetworkBudgetRules]
|
||||
C --> D{for each enabled rule}
|
||||
D --> E{budgetRuleApplies?}
|
||||
E -- no --> D
|
||||
E -- yes --> F[attrGroup = lowestIntersect TargetGroups, in.GroupIDs]
|
||||
F --> G{Token cap enabled?}
|
||||
G -- yes --> H[evalTokenCap user dim + group dim]
|
||||
H --> I{exhausted?}
|
||||
I -- yes --> J[DENY: llm_account.token_cap_exceeded - STOP]
|
||||
I -- no --> K{Budget cap enabled?}
|
||||
G -- no --> K
|
||||
K -- yes --> L[evalBudgetCap user dim + group dim]
|
||||
L --> M{exhausted?}
|
||||
M -- yes --> N[DENY: llm_account.budget_cap_exceeded - STOP]
|
||||
M -- no --> D
|
||||
K -- no --> D
|
||||
D --> O[All rules passed -> fall through to per-policy selection]
|
||||
```
|
||||
|
||||
Key invariant: **rules are checked sequentially and ANY exhausted rule denies (all-must-pass / min-wins).** Untargeted rules (`len(TargetGroups)==0 && len(TargetUsers)==0`) apply to every caller (`policyselect.go:393`).
|
||||
|
||||
### Policy selection (per-peer, per-request)
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Account-budget gate passed] --> B[GetAccountAgentNetworkPolicies]
|
||||
B --> C[filterApplicablePolicies enabled + provider match + group intersect]
|
||||
C --> D{candidates empty?}
|
||||
D -- yes --> E[Allow, empty SelectedPolicyID]
|
||||
D -- no --> F[scoreCandidates -> scoreOne per policy]
|
||||
F --> G[scoreOne: attrGroup + window]
|
||||
G --> H{any cap exhausted?}
|
||||
H -- yes --> I[Drop policy; record last deny code]
|
||||
H -- no --> K[Keep as live candidate]
|
||||
F --> L{live candidates exist?}
|
||||
L -- no --> M[Deny with last exhaustion code]
|
||||
L -- yes --> N[Sort: uncapped wins -> larger group token -> group budget -> user token -> user budget -> oldest CreatedAt]
|
||||
N --> O[winner = scored 0]
|
||||
O --> P[Allow + SelectedPolicyID + AttributionGroupID + WindowSeconds]
|
||||
```
|
||||
|
||||
End-to-end: a mutation calls `managerImpl.reconcile(ctx, accountID)` (`manager.go:205,239,...`). Reconcile defers an `accountManager.UpdateAccountPeers` so the network-map controller re-runs and `injectAllProxyPolicies` picks up the new access groups; with a `proxyController` wired, it re-synthesizes the service, diffs against `reconcileCache[accountID]` (guarded by `reconcileMu`), and emits proto mappings to the cluster derived from the mapping's domain (`reconcile.go:120`). Synthesis is stateless and idempotent. Sole persistent side effect: `backfillProviderSessionKeys` (`synthesizer.go:249`) mints ed25519 keys on legacy provider rows and writes them back.
|
||||
|
||||
At request time the path is independent: the proxy calls `SelectPolicyForRequest` (`policyselect.go:56`); account-budget ceiling first, then per-policy scoring. Token + budget caps share `evalTokenCap` / `evalBudgetCap` — same primitive for account rules and policy limits, `label` differentiates the deny reason. After a served request, `RecordAccountBudgetUsage` (`policyselect.go:415`) fans deltas to every applicable rule's distinct `(dim_kind, dim_id, window)` tuple, deduplicating to prevent double-count when two rules share target+window.
|
||||
|
||||
## Public contracts
|
||||
|
||||
- **Manager interface** (`manager.go:48-80`): CRUD for `Providers/Policies/Guardrails/BudgetRules`; `GetSettings/UpdateSettings` (cluster + subdomain immutable, only the three toggles mutate); `ListConsumption/RecordConsumption(account, kind, dimID, windowSec, in, out, USD)`; `RecordAccountBudgetUsage(account, user, groups, in, out, USD)`; `SelectPolicyForRequest(ctx, PolicySelectionInput) → *PolicySelectionResult{Allow, SelectedPolicyID, AttributionGroupID, WindowSeconds, DenyCode, DenyReason}`.
|
||||
- **`PolicySelectionInput`** (`manager.go:85-90`): `{AccountID, UserID, GroupIDs, ProviderID}` — populated by the proxy from CapturedData + `llm_router` resolution.
|
||||
- **Synthesized middleware chain** (`synthesizer.go:576-657`), order load-bearing — response slot runs reverse-of-slice:
|
||||
|
||||
| Slot | Idx | ID | ConfigJSON shape | CanMutate |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| on_request | 0 | `llm_request_parser` | `{"capture_prompt": <bool>, "redact_pii"?: true}` | – |
|
||||
| on_request | 1 | `llm_router` | `{"providers":[{id, models[], upstream_*, auth_header_*, allowed_group_ids[]}]}` | **true** |
|
||||
| on_request | 2 | `llm_limit_check` | `{}` | – |
|
||||
| on_request | 3 | `llm_identity_inject` | `{"providers":[{provider_id, header_pair?, json_metadata?, extra_headers?}]}` | **true** |
|
||||
| on_request | 4 | `llm_guardrail` | `{"model_allowlist"?, "prompt_capture":{enabled,redact_pii}}` | – |
|
||||
| on_response | 5 | `llm_limit_record` | `{}` (runs LAST at runtime) | – |
|
||||
| on_response | 6 | `cost_meter` | `{}` | – |
|
||||
| on_response | 7 | `llm_response_parser` | `{"capture_completion": <bool>, "redact_pii"?: true}` | – |
|
||||
- **Synthesized service shape** (`synthesizer.go:739`): `Mode=HTTP`, `Private=true`, `Domain=<subdomain>.<cluster>`, `AccessGroups=unionSourceGroups(enabledPolicies)`, one `TargetTypeCluster` target with `Host=noop.invalid:443` (router rewrites per request), `Options.{DirectUpstream,AgentNetwork}=true`, `DisableAccessLog=!settings.EnableLogCollection`, `CaptureMax{Req,Resp}Bytes=1<<20`, `CaptureContentTypes=["application/json","text/event-stream"]`.
|
||||
|
||||
## Invariants
|
||||
|
||||
- **Min-wins / all-must-pass for account budget rules** (`checkAccountBudget`, `policyselect.go:353`): every applicable enabled rule is checked; first exhausted cap denies. Untargeted rules bind every caller.
|
||||
- **Account toggle is the SOLE control for capture enablement.** `applyAccountCollectionControls` (`synthesizer.go:701`) sets `merged.PromptCapture.Enabled = settings.EnablePromptCollection` *unconditionally*.
|
||||
- **Capture-pointer semantics on parser configs** — see "Things to scrutinize" below.
|
||||
- **`EnableLogCollection` ↔ `DisableAccessLog` is the only access-log toggle** (`synthesizer.go:770`). Default off ⇒ access log suppressed.
|
||||
- **`RedactPii` flows verbatim to BOTH parsers** (`synthesizer.go:584-585`) and is OR'd into the merged guardrail (`synthesizer.go:706`).
|
||||
- **Cluster and Subdomain are immutable on Settings.** `UpdateSettings` reloads existing row and overlays only the three toggles (`manager.go:558-561`).
|
||||
- **Orphan providers (no enabled policy authorises them) NEVER reach the router** (`synthesizer.go:351-357`); skipped from `identity_inject` for symmetry.
|
||||
- **Provider creation refuses empty `api_key`** (`manager.go:175`); **deletion refuses while any policy still references it** (`manager.go:265-273`).
|
||||
- **Session keypair stability across provider edits** (`manager.go:226-228`) — server-managed, copied through every `UpdateProvider`, never API-surfaced.
|
||||
|
||||
## Things to scrutinize
|
||||
|
||||
### Correctness
|
||||
|
||||
- **Capture-pointer semantics — `*bool` vs `bool`.** Three states, owned by separate sides:
|
||||
- **Wire JSON this module emits:** `buildParserConfigJSON` (`synthesizer.go:678-693`) *always* stamps the capture field. Agent-network targets ship `"capture_prompt": false` or `"capture_prompt": true` — never absent. Same for `"capture_completion"`. The happy-path test pins `{"capture_prompt":false}` (`synthesizer_test.go:174`).
|
||||
- **Proxy-side parser config (consumer):** parsers decode into `*bool`. Matrix:
|
||||
- `nil` (field absent) → **legacy default = emit**. Preserved for non-agent-network callers and pre-existing tests (the backward-compat hook).
|
||||
- `false` (field present, value false) → **suppress emission entirely**. The behaviour for opted-out agent-network accounts. Without this, `enable_log_collection=true` + `enable_prompt_collection=false` would leak raw user input AND raw model output to the access log.
|
||||
- `true` → emit normally.
|
||||
- **Why the synth always stamps a value:** an agent-network mapping that omits the field would hit the legacy "always emit" default and re-introduce the leak. The `json.Marshal` error fallback at `synthesizer.go:687` degrades to `{}` — the code comment claims it is unreachable, but if it ever fires, the empty config decodes as `nil` and re-introduces the same leak. Consider failing closed instead: return a literal with the capture field set to false (e.g. `{"capture_prompt":false}` for the request parser).
|
||||
- **`scoreCandidates` non-cumulative deny code.** Only the *last* exhausted policy's deny code survives (`policyselect.go:188-190`). Iteration order is store's natural order. Auth signal is `len(scored)==0`, so this is informational only — verify no UI depends on "first exhausted policy" semantics.
|
||||
- **`effectiveWindowSeconds` token-wins tiebreak.** When both the token cap and the budget cap are enabled with different windows, the token cap's window wins (`policyselect.go:482`). Verify that `RecordLLMUsage` increments against the winning window only.
|
||||
- **`RecordAccountBudgetUsage` dedup.** Two rules with the same `(kind, dim_id, window)` would double-count without the `tuples` map (`policyselect.go:434-449`). Key includes all three dimensions — correct.
|
||||
- **Fail-closed on bad provider:** unknown catalog id (`synthesizer.go:794-796`) or empty API key (`synthesizer.go:801-803`) drops the **entire** account's synth, not just the bad provider. Confirm matches operator UX.
|
||||
|
||||
### Security
|
||||
|
||||
- **Redact OR-merge:** merged `RedactPii` = account OR guardrail (`synthesizer.go:706`). **Parser-side flag is `settings.RedactPii` only, NOT the OR** — a guardrail-only opt-in does not propagate to parsers. Correct because the account toggle gates capture, but worth noting on the proxy side.
|
||||
- **Group resolution must not leak across accounts.** Every store call carries `accountID` (`policyselect.go:73, 286, 298, 322, 334, 354`); `lowestIntersect` uses caller's claimed groups only (`policyselect.go:494`). Risk surface is upstream (handler populates `in.GroupIDs`).
|
||||
- **`UpdateSettings` preserves immutable Cluster + Subdomain** (`manager.go:558`). A client can't rebind the cluster.
|
||||
- **Provider session keypair backfill writes through `SaveAgentNetworkProvider`** (`synthesizer.go:256`) from a read-shaped call. Idempotent → worst case is a wasted write under concurrent reconcile + snapshot.
|
||||
|
||||
### Concurrency
|
||||
|
||||
- **`reconcileMu`** guards `reconcileCache`. Lock window is narrow — compute diff inside, send outside (`reconcile.go:56-68`).
|
||||
- **`labelRngMu`** guards `labelRng` because `math/rand.Source` is unsafe for concurrent use (`manager.go:638-640`).
|
||||
- **Real-store tests** use `store.NewTestStoreFromSQL` with `t.TempDir()` per test — no shared state, no `t.Parallel()`.
|
||||
- **`RecordAccountBudgetUsage` dedup `tuples` map is per-call;** concurrent calls fan out fully — correct (each request's tokens are booked once per distinct `(kind, dim_id, window)` tuple among its applicable rules).
|
||||
- **Deferred `UpdateAccountPeers` runs inline after the proxy push** (`reconcile.go:28-35`); a slow call stretches CRUD response time.
|
||||
|
||||
### Backward compatibility
|
||||
|
||||
- **Capture-pointer semantics (restated):** non-agent-network callers see no field → legacy nil-default emit, identical to pre-PR. Agent-network targets always carry an explicit `capture_*` value.
|
||||
- **`TestSynthesizeServices_HappyPath` was updated:** request-parser config moved from `{}` to `{"capture_prompt":false}` (`synthesizer_test.go:174`). External snapshot tests against synth output need updating.
|
||||
- **`MergedGuardrails` retains zeroed `TokenLimits`/`Budget`/`Retention`** even though `Policy.Limits` carries the real values now; `llm_limit_check` is the authoritative enforcement. Comment at `synthesizer.go:940-948` calls this out.
|
||||
|
||||
### Performance
|
||||
|
||||
- **`SynthesizeServices` runs on every controller tick / mutation reconcile.** Cost: 4 store reads + optional per-provider keypair backfill. Sort + index + merge are O(N log N) / O(P × G); dominant cost is JSON marshalling. No nested loops escape these dimensions.
|
||||
- **`reconcile.diffMappings` is O(N + M)** with N=M=1 per account today — effectively constant.
|
||||
- **`SynthesizeServicesForCluster`** (`synthesizer.go:71`) walks every account on a cluster; per-account failures are **swallowed** (`synthesizer.go:91-93`) so a single misconfigured account doesn't drop the cluster. Runs per proxy reconnect.
|
||||
|
||||
### Observability
|
||||
|
||||
- **Activity codes:** `AgentNetwork{Provider,Policy,Guardrail,BudgetRule}{Created,Updated,Deleted}`; `AgentNetworkSettingsUpdated` with `log_collection/prompt_collection/redact_pii` payload (`manager.go:567-571`). **No activity code for `SelectPolicyForRequest` denies** — surfaced via proxy access log only (likely intentional given volume).
|
||||
- **Deny codes** namespaced: `llm_policy.{token,budget}_cap_exceeded`, `llm_account.{token,budget}_cap_exceeded` (`policyselect.go:18-26`).
|
||||
- **Reconcile failures are logged at warn and swallowed** (`reconcile.go:42-44`). Persistent synth failures (e.g. unknown catalog id) silently keep the proxy out of sync — consider a manager-level synth-health surface if this becomes a support burden.
|
||||
|
||||
## Test coverage
|
||||
|
||||
| Test file | Locks down |
|
||||
| --------- | ---------- |
|
||||
| `synthesizer_test.go` | Mock-store: `HappyPath` (8-mw chain ordering, `{"capture_prompt":false}` baseline); `No{Settings,Providers}`; `Disabled{Provider,Policy}_NoService`; `RouterConfigOrdering`; `PolicyCheckConfig_UnionsSourceGroups`; `OrphanProvider_HasEmptyAllowedGroups`; identity-inject for LiteLLM / Bifrost (overrides + partial disable) / Cloudflare / Portkey / Vercel / OpenRouter / generic non-customizable; `GuardrailMerge_AllowlistUnion_LimitsRestrictive`; `BackfillsMissingSessionKeys`; `HTTPUpstream_KeepsExplicitPort`; `UpstreamURLPath_FlowsToRouter`; `UnknownProviderID_FailsClosed`; `EmptyAPIKey_FailsClosed`. |
|
||||
| `synthesizer_realstore_test.go` | Real-sqlite: `SurvivesStatusToggle` reproduces the disable/re-enable 403 regression; `Reconcile_RealStore_PushesPrivateAfterStatusToggle` extends through reconcile push. |
|
||||
| `synthesizer_guardrail_realstore_test.go` | `PromptCaptureAccountIsSoleControl`; `PromptCaptureFlowsWhenAccountOptsIn`; `AccountRedactWithoutGuardrailRedact`; `NoGuardrail_CaptureOff`. |
|
||||
| `synthesizer_log_collection_realstore_test.go` | `LogCollection{Off_SuppressesAccessLog,On_PermitsAccessLog}` — verifies `DisableAccessLog` propagation through `ToProtoMapping`. |
|
||||
| `synthesizer_parser_redact_realstore_test.go` | **Capture-pointer regression suite:** `ParserConfigsCarryRedactPii`; `ParserConfigsSuppressCaptureWhenLogCollectionOnly` (log=on/prompt=off ⇒ both capture flags false); `ParserConfigsOmitRedactPiiWhenOff`. |
|
||||
| `policyselect_test.go` | Mock-store: `NoApplicablePolicies`; `AllowWithLowestGroupAttribution`; `LargerPoolWinsAcrossUsageLevels`; `StaysOnLargerPoolAfterPartialDrain`; `FallsThroughToSmallerPoolWhenLargerExhausted`; `TiebreakBy{LargerGroupPool,CreatedAt}`; `DeniesWhenAllExhausted`; `UncappedPolicyAlwaysWinsAgainstCapped`; `DisabledPolicyIgnored`; `StoreErrorPropagates`; `RejectsEmptyAccount`; `SharesGroupCounterAcrossPolicies`; `AntiFallThroughOnLowestGroup`; `BudgetOnlyExhaustionDenies`; `BudgetTighterThanTokenWins`. |
|
||||
| `policyselect_realstore_test.go` | Real-sqlite regression guard: `NoApplicablePolicies`; `AllowAndLowestGroupAttribution`; `LargerPoolWins_FallsThroughWhenExhausted`; `BudgetCapDenies`; `GroupCounterSharedAcrossPolicies`; `DisabledPolicyIgnored`. |
|
||||
| `policyselect_account_realstore_test.go` | Account budget rules: `AccountCeilingBindsEvenWithUncappedPolicy` (min-wins); `AccountGroupCeiling`; `AccountTargetUsersBindsOnlyThatUser`; `AccountRuleRecordsToOwnWindow`. |
|
||||
| `reconcile_test.go` | `FirstSynth_EmitsCreate`; `NoChange_EmitsNothingExtra` (an unchanged mapping is still re-pushed as Modified — verify this is desired); `PolicyRemoved_EmitsDelete`; `NilProxyController_NoOp`; `EmptyAccountID_NoOp`; `ClusterFromMapping`. |
|
||||
| `wire_shape_test.go` | `TestSynthesizedService_WireShape` — proto-shape lockdown via `ToProtoMapping`. Catches "service not matching" (mapping reaches proxy but no SNI/HTTP route). Asserts ID, Domain, Mode, AuthToken, `Private`, `Auth.Oidc=false`, one path `/` + `https://noop.invalid/`, 8 middlewares with correct slot enums, router config `auth_header_value="Bearer sk-test-key"`. |
|
||||
| `labelgen/labelgen_test.go` | `PickUnique_{DeterministicWithSeededRng,AvoidsTakenWordsWhenMostAreReserved,FallsBackWhenAllReserved}`; `UniqueWords_DropsDuplicates`. |
|
||||
| `types/consumption_test.go` | `WindowStart_{AlignedToUnixEpoch,WithinWindowConverges,AcrossWindowsDiverges,DifferentWindowsHaveDifferentBuckets,SubMinuteAndMinuteAlignment,ZeroWindowReturnsInputUTC}`. Bucket alignment so multi-node reads converge. |
|
||||
| `agentnetwork_realstack_test.go` | `ProviderCRUD_FansOutToProxyAndClientPeers` — no-mock end-to-end through real account manager + network-map + agentnetwork: provider create propagates the updated map to both proxy peer and client peer with the synth DNS surface. |
|
||||
| `agentnetwork_budgetrule_realstack_test.go` | `BudgetRuleCRUD_RealManager`; `UpdateSettings_PreservesImmutableAndTogglesCollection`. |
|
||||
|
||||
## Known limitations / explicit non-goals
|
||||
|
||||
- **`MergedGuardrails.TokenLimits/Budget/Retention` emit at zero** (`synthesizer.go:940-948`); real enforcement is `Policy.Limits` via `llm_limit_check`. Future cleanup implied.
|
||||
- **Session keys picked from first enabled provider by created_at** (`pickServiceSessionKeys`, `synthesizer.go:270`). Existing session cookies survive provider edits only while the first-by-CreatedAt provider stays in place. Document for operators.
|
||||
- **Reconcile failures silently swallowed** (`reconcile.go:42-44`). Persistent failures keep the proxy out of sync until the next reconcile.
|
||||
- **`scoreCandidates` exposes only the LAST exhaustion's deny code** when multiple policies are exhausted.
|
||||
- **`bootstrapSettingsIfNeeded` failure is non-fatal to provider create** (`manager.go:200`): provider lands, synth is no-op until the next provider create retries the bootstrap.
|
||||
- **Budget rules do not trigger a reconcile** (`manager.go:476-477`). Request-time evaluation only; new rules take effect on the next request without a proxy push.
|
||||
|
||||
## Cross-references
|
||||
|
||||
- **Upstream:** [shared/api](10-shared-api.md), [management/store](20-management-store.md), reverseproxy `service`/`proxy`/`sessionkey` packages, `management/server/permissions` + `activity`.
|
||||
- **Downstream:** [management/handlers (HTTP wiring)](22-management-handlers-wiring.md), [proxy/middleware-builtin](31-proxy-middleware-builtin.md), network-map controller (`injectAllProxyPolicies` fan-out).
|
||||
- **End-to-end flow:** [../01-end-to-end-flows.md](../01-end-to-end-flows.md) — "Provider create → reconcile → proxy push → peer map refresh" and "request → policy select → record" diagrams.
|
||||
- **Top-level:** [../00-overview.md](../00-overview.md)
|
||||
203
docs/agent-networks/modules/22-management-handlers-wiring.md
Normal file
203
docs/agent-networks/modules/22-management-handlers-wiring.md
Normal file
@@ -0,0 +1,203 @@
|
||||
# management/handlers + wiring — HTTP API + gRPC delivery
|
||||
|
||||
> **Risk level:** Medium — the surface is mostly additive, but two changes are load-bearing: `injectAllProxyPolicies` runs on every per-peer compute, and `shallowCloneMapping` must round-trip `Private` (a dropped field silently breaks every MODIFIED update).
|
||||
> **Backward-compat impact:** Additive on the wire (new routes, new RPCs, new proto fields, new gorm column on `AccessLogEntry`). One management-internal break: `nbhttp.NewAPIHandler` gains a trailing `agentNetworkManager` parameter; `nil` is tolerated and silently skips route registration.
|
||||
|
||||
## Module boundary
|
||||
|
||||
This module is the seam between the public Agent Network HTTP API and the proxy fleet that serves agent traffic. North side: a `/api/agent-network/*` surface (providers, policies, guardrails, budget rules, settings, consumption) on the existing gorilla router, delegating to `agentnetwork.Manager`. Handlers are thin — they translate `api.*` ↔ `types.*`, validate shape, forward. RBAC and event emission stay inside the manager (`manager.go:680-682`).
|
||||
|
||||
South side: `ProxyServiceServer` (`proxy.go`) learns to (a) ship synth services to a proxy on initial snapshot, (b) resolve agent-network domains in `getServiceByDomain` for OIDC/session/tunnel-peer flows, (c) gate LLM requests via `CheckLLMPolicyLimits` + `RecordLLMUsage`, (d) preserve `Private` through `shallowCloneMapping` so per-proxy live updates don't silently flip services public. The network_map controller prepends synth services to `account.Services` on every per-peer compute; `accesslogentry.go` gains an indexed `AgentNetwork` column so the dashboard can filter cheaply.
|
||||
|
||||
## Files
|
||||
|
||||
| Path | Role |
|
||||
| ---- | ---- |
|
||||
| `handlers/agentnetwork/providers_handler.go` | Catalog + provider CRUD + central `AddEndpoints` |
|
||||
| `handlers/agentnetwork/policies_handler.go` | Policy CRUD + shared `validatePolicy*` |
|
||||
| `handlers/agentnetwork/guardrails_handler.go` | Guardrail CRUD |
|
||||
| `handlers/agentnetwork/budget_handler.go` | Account-level budget rule CRUD |
|
||||
| `handlers/agentnetwork/settings_handler.go` | GET (200+`null` if unbootstrapped) + PUT toggles |
|
||||
| `handlers/agentnetwork/consumption_handler.go` | Read-only consumption rows |
|
||||
| `handlers/agentnetwork/handlers_test.go` | Real-store fixture; wire round-trip + validation |
|
||||
| `handlers/agentnetwork/budget_handler_test.go` | Budget-rule + settings toggles |
|
||||
| `server/http/handler.go` | New `agentNetworkManager` arg; conditional `AddEndpoints` |
|
||||
| `server/permissions/modules/module.go` | New `AgentNetwork` module key |
|
||||
| `internals/server/boot.go` | Wires synthesiser adapter + limits service into proxy server |
|
||||
| `internals/server/modules.go` | `AgentNetworkManager()` lazy-create node |
|
||||
| `internals/controllers/network_map/controller/controller.go` | `injectAllProxyPolicies` replaces 4 `InjectProxyPolicies` calls |
|
||||
| `internals/controllers/network_map/controller/repository.go` | `SynthesizeAgentNetworkServices` repo method |
|
||||
| `internals/modules/reverseproxy/service/service.go` | `MiddlewareConfig`, capture limits, `AgentNetwork`, `DisableAccessLog` + proto |
|
||||
| `internals/modules/reverseproxy/accesslogs/accesslogentry.go` | Indexed `AgentNetwork bool` from proto |
|
||||
| `internals/shared/grpc/proxy.go` | Synth wiring, 2 RPCs, domain fallback, `Private` in clone |
|
||||
| `internals/shared/grpc/proxy_clone_test.go` | Locks every `ProxyMapping` field minus `AuthToken` |
|
||||
| `server/activity/codes.go` | 13 new activity codes (125-137) |
|
||||
|
||||
## HTTP routes added
|
||||
|
||||
All routes inherit the platform's auth middleware. Perms enforced inside `agentnetwork.Manager.requirePermission` (`manager.go:680-682`) on `modules.AgentNetwork`. Permission column shows the `op` passed to `requirePermission` — read = `Read`, etc.
|
||||
|
||||
| Method | Path | Perm | Handler |
|
||||
| ------ | ---- | ---- | ------- |
|
||||
| GET | `/agent-network/catalog/providers` | authn only | `providers_handler.go:43` |
|
||||
| GET | `/agent-network/providers` | read | `providers_handler.go:57` |
|
||||
| POST | `/agent-network/providers` | create | `providers_handler.go:97` |
|
||||
| GET | `/agent-network/providers/{providerId}` | read | `providers_handler.go:77` |
|
||||
| PUT | `/agent-network/providers/{providerId}` | update | `providers_handler.go:132` |
|
||||
| DELETE | `/agent-network/providers/{providerId}` | delete | `providers_handler.go:172` |
|
||||
| GET | `/agent-network/policies` | read | `policies_handler.go:32` |
|
||||
| POST | `/agent-network/policies` | create | `policies_handler.go:72` |
|
||||
| GET | `/agent-network/policies/{policyId}` | read | `policies_handler.go:52` |
|
||||
| PUT | `/agent-network/policies/{policyId}` | update | `policies_handler.go:102` |
|
||||
| DELETE | `/agent-network/policies/{policyId}` | delete | `policies_handler.go:142` |
|
||||
| GET | `/agent-network/guardrails` | read | `guardrails_handler.go:25` |
|
||||
| POST | `/agent-network/guardrails` | create | `guardrails_handler.go:65` |
|
||||
| GET | `/agent-network/guardrails/{guardrailId}` | read | `guardrails_handler.go:45` |
|
||||
| PUT | `/agent-network/guardrails/{guardrailId}` | update | `guardrails_handler.go:95` |
|
||||
| DELETE | `/agent-network/guardrails/{guardrailId}` | delete | `guardrails_handler.go:135` |
|
||||
| GET | `/agent-network/budget-rules` | read | `budget_handler.go:24` |
|
||||
| POST | `/agent-network/budget-rules` | create | `budget_handler.go:64` |
|
||||
| GET | `/agent-network/budget-rules/{ruleId}` | read | `budget_handler.go:44` |
|
||||
| PUT | `/agent-network/budget-rules/{ruleId}` | update | `budget_handler.go:95` |
|
||||
| DELETE | `/agent-network/budget-rules/{ruleId}` | delete | `budget_handler.go:135` |
|
||||
| GET | `/agent-network/settings` | read | `settings_handler.go:53` (200+`null` if no row) |
|
||||
| PUT | `/agent-network/settings` | update | `settings_handler.go:27` |
|
||||
| GET | `/agent-network/consumption` | read | `consumption_handler.go:21` |
|
||||
|
||||
## gRPC RPCs added (or modified)
|
||||
|
||||
| RPC | Direction | Trigger |
|
||||
| --- | --------- | ------- |
|
||||
| `CheckLLMPolicyLimits` | proxy→mgmt unary | Pre-flight gate; returns allow/deny, selected policy, attribution group, window, deny code+reason (`proxy.go:259-301`). `Unimplemented` when limits service is nil. |
|
||||
| `RecordLLMUsage` | proxy→mgmt unary | Post-flight write of tokens+cost against policy-window dimensions + every applicable account budget rule (`proxy.go:303-349`). `window_seconds==0` ⇒ no policy cap, only account fan-out runs. |
|
||||
| `GetMappingUpdate`/`SendServiceUpdate` (stream) | mgmt→proxy | Snapshot (`proxy.go:752-780`) now appends `SynthesizeServicesForCluster`. Live updates use `SendServiceUpdateToCluster` + `shallowCloneMapping`. |
|
||||
|
||||
## Architecture & flow
|
||||
|
||||
### HTTP request lifecycle
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant DB as Dashboard
|
||||
participant R as gorilla.Router (/api)
|
||||
participant H as handler (agentnetwork)
|
||||
participant M as agentnetwork.Manager
|
||||
participant S as store.Store
|
||||
participant AM as accountManager (StoreEvent)
|
||||
|
||||
DB->>R: POST /api/agent-network/providers
|
||||
R->>H: createProvider (auth mw sets UserAuth)
|
||||
H->>H: GetUserAuthFromContext + validate(req)
|
||||
H->>M: CreateProvider(userID, provider, bootstrapCluster)
|
||||
M->>M: requirePermission(AgentNetwork, Create)
|
||||
M->>S: SaveAgentNetworkProvider
|
||||
M->>AM: StoreEvent(AgentNetworkProviderCreated)
|
||||
M-->>H: created provider
|
||||
H-->>DB: 200 + api.AgentNetworkProvider JSON
|
||||
```
|
||||
|
||||
### Synth-service delivery via gRPC
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant P as Proxy
|
||||
participant G as ProxyServiceServer
|
||||
participant SM as service.Manager (persisted)
|
||||
participant SA as synthesizerAdapter
|
||||
participant AN as SynthesizeServicesForCluster
|
||||
participant ST as store.Store
|
||||
|
||||
Note over P,G: Initial snapshot
|
||||
P->>G: GetMappingUpdate (stream open)
|
||||
G->>SM: GetServicesForCluster(conn.address)
|
||||
SM-->>G: persisted []*Service
|
||||
G->>SA: SynthesizeServicesForCluster(conn.address)
|
||||
SA->>AN: SynthesizeServicesForCluster(store, clusterAddr)
|
||||
AN->>ST: walk every account; read providers/policies/settings
|
||||
AN-->>SA: in-memory []*Service
|
||||
SA-->>G: []*Service
|
||||
G->>P: response (persisted + synth)
|
||||
|
||||
Note over G,P: Per-request live update
|
||||
G->>G: SendServiceUpdateToCluster(update, clusterAddr)
|
||||
G->>G: shallowCloneMapping(update) (Private MUST survive)
|
||||
G->>P: response with single mapping
|
||||
```
|
||||
|
||||
End-to-end: an HTTP write persists rows and emits an activity event; the manager then triggers `proxyController.SendServiceUpdate` so proxies re-render. **On the mapping stream, the snapshot path is the only one that calls into the synthesiser** — on stream open it pulls persisted services, then appends synth services for the cluster. Synth services are never persisted. Outside the stream there are two further synth callers: for OIDC/session/tunnel-peer flows, `getServiceByDomain` falls back to `SynthesizeServicesForCluster(clusterFromDomain(domain))` when the persisted lookup misses (`proxy.go:1763-1793`), and the network_map controller, orthogonally, prepends the same synth services to `account.Services` on every per-peer compute before `InjectProxyPolicies`.
|
||||
|
||||
## Permissions model added
|
||||
|
||||
- `permissions/modules/module.go:22` adds `AgentNetwork Module = "agent_network"`, registered in `All` (`module.go:42`). Standard `operations.{Read,Create,Update,Delete}` matrix.
|
||||
- Handlers don't call `permissionsManager` directly — they extract `UserAuth` and delegate to `agentnetwork.Manager`, which gates every read and mutation through `requirePermission` (`manager.go:168, 308, 549`, etc.). Confirm your role-set provider has `agent_network` rows for owner/admin/user/billing-admin before merging.
|
||||
- `getCatalogProviders` (`providers_handler.go:43`) intentionally skips RBAC — catalog is global static data.
|
||||
|
||||
## Activity codes added
|
||||
|
||||
`activity/codes.go:244-274` adds Activities 125-137 + string/code mappings (`codes.go:428-444`), following `<domain>.<resource>.<action>` (e.g., `agent_network.provider.create`). Audit-log exporters / SIEM forwarders need to know the new codes.
|
||||
|
||||
## Invariants
|
||||
|
||||
- **Synth services are never persisted.** Snapshot appends after `serviceManager.GetServicesForCluster` (`proxy.go:761-770`); network_map prepends before `InjectProxyPolicies` (`controller.go:117-126`).
|
||||
- **`shallowCloneMapping` must round-trip every `ProxyMapping` field except `AuthToken`** — `proxy_clone_test.go:50-58` enforces via `gproto.Equal`. The bug it guards: a missing `Private` made every MODIFIED arrive `private=false`, the proxy skipped `ValidateTunnelPeer`, `UserGroups` stayed empty, `llm_router` denied `no_authorised_provider`; a restart "fixed" it because the snapshot uses the original mapping.
|
||||
- **Limit-window floor is 60s** (`policies_handler.go:189-220`); enabled cap with both per-group and per-user at zero is rejected. Budget rules reuse the same validator (`budget_handler.go:170`).
|
||||
- **Manager is optional at boot.** `NewAPIHandler` registers routes only when non-nil (`handler.go:129`); `ProxyServiceServer` returns `Unimplemented` from both RPCs when limits service is unwired (`proxy.go:262-265, 306-309`).
|
||||
- **Settings GET on an unbootstrapped account returns 200 + `null`** (`settings_handler.go:65-72`) — not 404.
|
||||
|
||||
## Things to scrutinize
|
||||
|
||||
### Correctness
|
||||
- **`injectAllProxyPolicies` runs on every per-peer compute**: `controller.go:163, 309, 415, 681`. `sendUpdateAccountPeers` is the target of the buffered fan-out — synth runs once per debounced account-update tick **and** once per direct `UpdateAccountPeer`. Cost is O(providers + policies × users-per-group) per account under `LockingStrengthNone`. No per-account synth cache — verify it fits the buffer interval for your largest tenant.
|
||||
- **`clusterFromDomain` strips at the first `.`** (`proxy.go:1784-1792`). A zero-dot domain returns `""` and the synth call walks every account. Confirm no path reaches this with a malformed/internal domain.
|
||||
- **Account-budget `RecordConsumption` fans out even when `window_seconds == 0`** (`proxy.go:341-348`) — intentional. Verify the proxy never sends `RecordLLMUsage` for a request that wasn't actually allowed.
|
||||
|
||||
### Security
|
||||
- Every handler extracts `UserAuth` via `nbcontext.GetUserAuthFromContext` before any work. Routes live behind the standard `/api` mux; bypass list is not extended.
|
||||
- `CheckLLMPolicyLimits` / `RecordLLMUsage` ride the existing **proxy → mgmt** gRPC connection auth. No additional token check inside the RPCs — they trust the connection. Confirm the proxy-side token-verification interceptor in this package gates both.
|
||||
- `RecordLLMUsage` only validates `account_id != ""` (`proxy.go:317-319`). A compromised proxy can attribute cost to any account in its cluster — was already true for prior RPCs but is louder now that data drives denials.
|
||||
|
||||
### Concurrency
|
||||
- `SetAgentNetworkSynthesizer` / `SetAgentNetworkLimitsService` write under `s.mu.Lock`; read paths copy the interface under read lock (`proxy.go:236-247, 260-263, 304-307`). Same pattern as existing `serviceManager`/`proxyController` setters.
|
||||
- Manager writes use `LockingStrengthUpdate`; synth reads use `LockingStrengthNone` — read-after-write via the proxy snapshot can observe a stale view by up to one fan-out tick.
|
||||
- Network_map controller is single-threaded per account; cross-account is parallel.
|
||||
|
||||
### Backward compatibility
|
||||
- `proxy_clone_test.go` is the regression net; any new `ProxyMapping` field must be cloned or explicitly nulled in the test.
|
||||
- `AccessLogEntry` adds indexed `AgentNetwork bool` — implicit AutoMigrate; deploy story must handle table-rewrite cost on high-volume access-log tables.
|
||||
- `TargetOptions` gains seven `omitempty` JSON fields (`service.go:69-94`); on-wire shape stays compatible. `targetOptionsToProto` tests all fields when deciding nil (`service.go:551-556`).
|
||||
- `NewAPIHandler` signature changes — every caller must pass `agentNetworkManager`; `nil` is supported.
|
||||
|
||||
### Observability
|
||||
- 13 new activity codes via `accountManager.StoreEvent` in the manager — confirm dashboard's audit-log UI maps them.
|
||||
- `AccessLogEntry.AgentNetwork` is indexed for the dashboard's agent-network log filter.
|
||||
- New RPCs log at error level on store/selector failures (`proxy.go:284, 327, 332, 348`). Snapshot synth failures degrade to warnings — stream is not aborted (`proxy.go:765`).
|
||||
|
||||
## Test coverage
|
||||
|
||||
| Test | Locks down |
|
||||
| ---- | ---------- |
|
||||
| `handlers_test.go::TestPolicyHandler_WindowSecondsRoundTrip` | GET carries `window_seconds`; legacy `window_hours`/`window_days` absent. |
|
||||
| `handlers_test.go::TestPolicyHandler_RejectsSubMinuteWindow` | POST `<60s` returns 4xx. |
|
||||
| `handlers_test.go::TestConsumptionHandler_EmptyAccountReturnsArray` | `/consumption` returns `[]` — never null. |
|
||||
| `handlers_test.go::TestConsumptionHandler_PopulatedAccountListsRows` | RecordConsumption×2 surfaces both with correct tokens/cost/window. |
|
||||
| `budget_handler_test.go::TestBudgetRuleHandler_RoundTrip` | Targets + PolicyLimits shape round-trip. |
|
||||
| `budget_handler_test.go::TestBudgetRuleHandler_ListReturnsArray` | Empty-list shape. |
|
||||
| `budget_handler_test.go::TestBudgetRuleHandler_{RejectsMissingName,RejectsSubMinuteWindow}` | Validation rejections are 4xx. |
|
||||
| `budget_handler_test.go::TestSettingsHandler_GetExposesCollectionToggles` | The collection toggles (`log_collection`, `prompt_collection`, `redact_pii`) + computed `Endpoint`. |
|
||||
| `proxy_clone_test.go::TestShallowCloneMapping_PreservesAllFieldsExceptAuthToken` | Future-proofs clone; every field round-trips, `AuthToken` dropped. |
|
||||
|
||||
Handler tests use a real sqlite store + real manager + always-allow permissions mock (`handlers_test.go:53-75`). Create/update/delete success paths flow through `accountManager.StoreEvent` which the fixture doesn't wire — covered by manager-level no-mock tests outside this module.
|
||||
|
||||
## Known limitations / explicit non-goals
|
||||
|
||||
- No pagination on any list endpoint; no bulk endpoints.
|
||||
- Synth result is not cached — every snapshot and every per-peer compute repeats the store walk.
|
||||
- `getSettings` returning `200 + null` is a deliberate dashboard concession.
|
||||
- No rate-limiting beyond the global `/api` rate limiter.
|
||||
|
||||
## Cross-references
|
||||
|
||||
- Upstream: [shared/api](10-shared-api.md), [management/agentnetwork](21-management-agentnetwork.md), [management/store](20-management-store.md)
|
||||
- Downstream: [proxy/runtime](33-proxy-runtime.md)
|
||||
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
|
||||
- Top-level: [../00-overview.md](../00-overview.md)
|
||||
215
docs/agent-networks/modules/30-proxy-middleware-framework.md
Normal file
215
docs/agent-networks/modules/30-proxy-middleware-framework.md
Normal file
@@ -0,0 +1,215 @@
|
||||
# proxy/middleware-framework — generic plugin system
|
||||
|
||||
> **Risk level:** **High** — every proxied request transits this chain. Budget exhaustion, panic recovery, or chain-close bugs hit the hot path for all targets, not just agent-network ones.
|
||||
> **Backward-compat impact:** Additive at the proxy. The `middleware` and `bodytap` packages are new (`proxy/internal/middleware/middleware.go:1`, `proxy/internal/middleware/bodytap/request.go:13`); existing proxy targets keep working until a chain is bound to them via `Manager.Rebuild`.
|
||||
|
||||
This module is the **framework only** — no LLM/agent-network domain knowledge is required, since every example built into it is generic.
|
||||
|
||||
## Module boundary
|
||||
|
||||
This module is the **framework only**: slots, chains, registry, dispatcher, accumulator, body-tap, output filters. No middleware *implementation* lives here — those land in `proxy/internal/middleware/builtin/*` (covered in module 31). The package contract is:
|
||||
|
||||
1. The proxy hands a `Manager` to its config-apply path. The synth pushes per-path `PathTargetBinding` lists (`proxy/internal/middleware/manager.go:26`) into `Manager.Rebuild`, which resolves each spec via the `Registry`/`Resolver` (`proxy/internal/middleware/registry.go:81-121`) and produces an immutable `Chain` keyed by `serviceID|pathID` (`proxy/internal/middleware/manager.go:410-412`).
|
||||
2. The reverse-proxy handler captures the request body via `bodytap.CaptureRequest`, calls `Chain.RunRequest`, applies returned mutations (already filtered by `chain.applyMutations`), forwards to the upstream behind a `bodytap.CapturingResponseWriter`, then calls `Chain.RunResponse` and `Chain.RunTerminal`.
|
||||
3. Middlewares are inert plugins that receive a deep-cloned `Input` and return an `Output` whose decision/mutations are clamped by the dispatcher's `filterOutput` (`proxy/internal/middleware/dispatcher.go:149-172`).
|
||||
|
||||
Everything that crosses the framework boundary in either direction is value-typed and deep-copied — middlewares cannot mutate the live request directly, and the framework cannot inadvertently leak middleware-owned slices into the request hot path.
|
||||
|
||||
## Files
|
||||
|
||||
| Path | Role |
|
||||
| ---- | ---- |
|
||||
| `proxy/internal/middleware/middleware.go` | `Middleware` + `Factory` interfaces. |
|
||||
| `proxy/internal/middleware/types.go` | `Slot`, `FailMode`, `Decision`, all limit constants, `Input`/`Output`/`Mutations`/`UpstreamRewrite`/`AuthHeader` value types. |
|
||||
| `proxy/internal/middleware/spec.go` | Apply-time `Spec` (validated wire shape + runtime-injected fields) and `Clone`. |
|
||||
| `proxy/internal/middleware/registry.go` | `Registry` (factory map, RWMutex) and `Resolver` (Spec → bound `Middleware`). |
|
||||
| `proxy/internal/middleware/manager.go` | `Manager`, `chainTable` reverse index, `Rebuild`/`Invalidate*`, async chain close. |
|
||||
| `proxy/internal/middleware/chain.go` | `Chain.RunRequest`/`RunResponse`/`RunTerminal`, mutation gating, `cloneInputFor`. |
|
||||
| `proxy/internal/middleware/chain_test.go` | Metadata threading, LIFO response order, rewrite gating, UserGroups propagation, terminal accumulation. |
|
||||
| `proxy/internal/middleware/dispatcher.go` | Timeout/panic recovery, fail-mode, error classification, `filterOutput`. |
|
||||
| `proxy/internal/middleware/decision.go` | `RenderDenyResponse`, deny-code regex, status clamp. |
|
||||
| `proxy/internal/middleware/headerpolicy.go` | Compile-in header denylist + `FilterHeaderMutations`. |
|
||||
| `proxy/internal/middleware/bodypolicy.go` | `ValidateBodyReplace` / `ApplyBodyReplace` smuggling guards. |
|
||||
| `proxy/internal/middleware/keys.go` | Metadata key namespace constants. |
|
||||
| `proxy/internal/middleware/metadata.go` | `Accumulator` — allowlist, per-mw/per-request byte caps, redaction. |
|
||||
| `proxy/internal/middleware/metrics.go` | OTel instrument bundle (`proxy.middleware.*`). |
|
||||
| `proxy/internal/middleware/redaction.go` | `Scan` — PEM/JWT/AWS/bearer/Luhn-validated CC patterns. |
|
||||
| `proxy/internal/middleware/bodytap/request.go` | Capture + replay reader, `Budget` semaphore, bypass reason codes. |
|
||||
| `proxy/internal/middleware/bodytap/response.go` | `CapturingResponseWriter` (tee with `PassthroughWriter` for Flusher/Hijacker preservation). |
|
||||
|
||||
## Slot model
|
||||
|
||||
Three slots, declared per-middleware exactly once (`proxy/internal/middleware/types.go:27-41`):
|
||||
|
||||
- **`SlotOnRequest`** (`Slot=1`) — runs **before** the upstream call, in registration order. May `DecisionDeny`, may emit `Mutations` (header add/remove, body replace, `UpstreamRewrite`) when both `Spec.CanMutate` and `Middleware.MutationsSupported()` are true. May emit metadata. Each middleware in the slot sees metadata that earlier ones in the same slot just emitted (`proxy/internal/middleware/chain.go:144-178`) — this is how the framework gives middlewares an intra-slot side channel without a global bag.
|
||||
- **`SlotOnResponse`** (`Slot=2`) — runs **after** the upstream returns, in **reverse** registration order. Cannot deny (clamped in `dispatcher.filterOutput`, `proxy/internal/middleware/dispatcher.go:153-157`). May still mutate response headers in principle, but the current chain only forwards `RewriteUpstream` from on_request, so on_response mutations are observe-only in practice. Threads the same per-slot metadata view as on_request.
|
||||
- **`SlotTerminal`** (`Slot=3`) — runs **after** every on_response middleware has emitted, in registration order. Sees the full accumulated bag plus prior terminal emissions (`chain.go:221-245`). Cannot deny, cannot mutate (`dispatcher.go:168-170`). Designed for sinks (access log, metrics push, audit emitter).
|
||||
|
||||
Splitting a feature across slots (e.g. "parse on the way out, ship on terminal") is the explicit architectural choice — `types.go:7-15` and `types.go:22-25` make it clear no middleware participates in more than one slot.
|
||||
|
||||
## Architecture & flow
|
||||
|
||||
### Chain dispatch
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant H as proxy HTTP handler
|
||||
participant BT as bodytap.CaptureRequest
|
||||
participant CH as Chain
|
||||
participant DI as Dispatcher
|
||||
participant MW as Middleware (per slot)
|
||||
participant US as Upstream
|
||||
participant CW as CapturingResponseWriter
|
||||
|
||||
H->>BT: CaptureRequest(r, cfg, budget)
|
||||
BT-->>H: body[], truncated, release()
|
||||
H->>CH: RunRequest(ctx, r, Input, Accumulator)
|
||||
loop on_request, registration order
|
||||
CH->>CH: cloneInputFor(in, OnRequest)
|
||||
CH->>DI: Invoke(ctx, spec, mw, call)
|
||||
DI->>MW: mw.Invoke(callCtx, in)
|
||||
MW-->>DI: Output{decision, metadata, mutations?}
|
||||
DI->>DI: filterOutput (clamp deny, gate mutations)
|
||||
DI-->>CH: filtered Output
|
||||
CH->>CH: Accumulator.Emit (allowlist + caps + redact)
|
||||
alt DecisionDeny
|
||||
CH-->>H: denied, merged, rewrite
|
||||
else allow
|
||||
CH->>CH: applyMutations(r, m) and capture rewrite
|
||||
end
|
||||
end
|
||||
CH-->>H: nil, merged, rewrite
|
||||
H->>US: ProxyRequest (with rewrite/mutations applied)
|
||||
US-->>CW: bytes (streamed, tee'd into cap-bounded buf)
|
||||
CW-->>H: passthrough complete
|
||||
H->>CH: RunResponse(ctx, Input{RespBody:CW.Body(),...}, acc)
|
||||
loop on_response, REVERSE order (LIFO)
|
||||
CH->>DI: Invoke (same wrappers)
|
||||
end
|
||||
H->>CH: RunTerminal(ctx, Input{Metadata:full bag}, acc)
|
||||
H->>BT: release() + CW.Release()
|
||||
```
|
||||
|
||||
### Body-tap mechanics (request + response)
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph req[Request capture — bodytap.CaptureRequest]
|
||||
R0[r.Body] --> R1{cfg.MaxRequestBytes > 0?\nUpgrade absent?\nContent-Type allowed?\nCL <= cap?}
|
||||
R1 -- no --> R2[bypass = reason\nbody = nil\nr.Body untouched]
|
||||
R1 -- yes --> R3["Budget.Acquire(cap)"]
|
||||
R3 -- denied --> R4[bypass=BypassBudget]
|
||||
R3 -- ok --> R5["io.LimitReader(r.Body, cap+1)\nio.ReadAll"]
|
||||
R5 --> R6{len > cap?}
|
||||
R6 -- truncated --> R7["viewable = buf[:cap]\nr.Body = replayReadCloser{buf, tail}"]
|
||||
R6 -- whole --> R8["r.Body = NopCloser(bytes.Reader(buf))\nclose original"]
|
||||
R7 --> R9[(release captured\nbudget on req end)]
|
||||
R8 --> R9
|
||||
end
|
||||
|
||||
subgraph resp[Response capture — CapturingResponseWriter]
|
||||
W0[client] -.-> CW["Write(p)"]
|
||||
CW --> P1["PassthroughWriter.Write(p)\n— bytes leave to client first"]
|
||||
P1 --> P2{!stopped?}
|
||||
P2 -- yes --> P3{"remaining = cap - buf.Len()"}
|
||||
P3 --> P4["buf.Write(p[:take])\nset truncated if take<n"]
|
||||
P2 -- no --> P5["silent drop into the tee\n(client write already done)"]
|
||||
end
|
||||
```
|
||||
|
||||
The body-tap is the highest-leak-risk surface in this module; three details matter:
|
||||
|
||||
1. **Request capture is "read-and-replay", not "read-and-forward".** Whenever capture is not bypassed, `CaptureRequest` swaps `r.Body` for either a `bytes.Reader` (whole body fit) or a `replayReadCloser` that replays the captured prefix then drains the remaining stream from the original body (`bodytap/request.go:178-201`). This means the **upstream still sees the full body even when the tap truncates**. The original `r.Body` is **not** closed by `CaptureRequest` in the truncated branch: it becomes the `replayReadCloser`'s tail, and `replayReadCloser.Close()` closes only that tail (`bodytap/request.go:199-201`). Because the tail is the original reader, a single close at request end is correct, but reviewers should confirm the upstream proxy always reads to EOF (otherwise the tail is leaked).
|
||||
2. **Response capture is a write-through tee.** `CapturingResponseWriter.Write` forwards to the underlying writer **first** (`bodytap/response.go:116-117`), then tees into `buf` under its own mutex. Client never blocks on the tee. `Flusher`/`Hijacker` are preserved via the embedded `responsewriter.PassthroughWriter`. SSE/chunked streams flow through untouched; middlewares only see the bounded prefix.
|
||||
3. **Budget is a single shared semaphore.** `Manager` constructs one `bodytap.Budget` at startup (`manager.go:138-144`, default `256 MiB` from `bodytap/request.go:39`). Every capture pre-acquires its full `MaxRequestBytes` / `MaxResponseBytes` from the budget regardless of actual body size; that prevents a flood of small captures from collectively exceeding the cap, but it also means a misconfigured `MaxRequestBytes = 1 MiB` with 256 concurrent requests already exhausts the default budget. Reviewers should sanity-check the operator-facing defaults that ship with synth-service.
|
||||
|
||||
The framework explicitly aborts capture (and increments `proxy.middleware.capture_bypass_total`) before reading the first byte when `Upgrade`/`Connection: upgrade` is set (`bodytap/request.go:120-125`), when the content-type isn't in the allowlist (`bodytap/request.go:126-128`), or when the advertised `Content-Length` already exceeds the cap (`bodytap/request.go:131-133`). This is the right place to make sure WebSocket upgrades and large file uploads never reach the buffer.
|
||||
|
||||
## Public contracts
|
||||
|
||||
- **`Middleware` interface** (`middleware.go:14-36`): `ID()`, `Version()`, `Slot()`, `AcceptedContentTypes()`, `MetadataKeys()`, `MutationsSupported()`, `Invoke(ctx, *Input) (*Output, error)`, `Close()`. `MetadataKeys()` is the **closed set** the middleware is allowed to emit — the accumulator drops anything outside it (`metadata.go:71-75`). `Close` must be idempotent (called even when `Invoke` was never reached).
|
||||
- **`Factory` interface** (`middleware.go:44-47`): `ID()`, `New(rawConfig []byte) (Middleware, error)`. `RawConfig` is opaque JSON bytes on the wire (`spec.go:6-12`); each factory owns its own typed config.
|
||||
- **`Decision` type** (`types.go:59-69`): `Allow=0`, `Deny=1`, `Passthrough=2`. Default-zero is permissive — important because every middleware that omits `Decision` gets `Allow`. Dispatcher clamps `Deny` to `Passthrough` outside `SlotOnRequest` (`dispatcher.go:153-157`).
|
||||
- **`Mutations`** (`types.go:196-201`): `HeadersAdd`/`HeadersRemove` (filtered through `headerpolicy.go`), `BodyReplace` (gated through `bodypolicy.go`), and `RewriteUpstream`. `RewriteUpstream` is **last-write-wins** within the on_request slot (`chain.go:170-172`, locked down by `TestChain_RunRequest_LatestRewriteWins`).
|
||||
- **Metadata propagation keys** (`keys.go`): all keys live in a single file and follow `^[a-z][a-z0-9_-]*(\.[a-z0-9_-]*)+$` (`metadata.go:8`). Framework-injected error tagging uses `mw.<id>.error_kind` (`keys.go:81`) so operators can distinguish framework-emitted entries from middleware-emitted ones.
|
||||
|
||||
## Invariants
|
||||
|
||||
- **Per-request context isolation.** `cloneInputFor` deep-copies every mutable field (`Headers`, `RespHeaders`, `Metadata`, `Body`, `RespBody`, `UserGroups`, `UserGroupNames`) before each invocation (`chain.go:286-308`). A misbehaving middleware that mutates `in.Headers` only corrupts its own copy.
|
||||
- **Body-tap bounded by capture limit.** Request side uses `io.LimitReader(r.Body, limit+1)` (`bodytap/request.go:152`) — the `+1` is how the code detects truncation (`bodytap/request.go:160`); the surfaced buffer is sliced back down to `limit`. Response side stops teeing once `buf.Len() >= cap` (`bodytap/response.go:121-133`). Neither side can grow the buffer past the configured cap.
|
||||
- **Headers/body redaction order.** Accumulator runs `Scan(value)` **before** counting cost (`metadata.go:81-82`), so the byte budgets are computed against post-redaction sizes. `Scan` order is PEM → JWT → AWS key → bearer → Luhn-validated CC (`redaction.go:25-51`) — the comment block in `redaction.go:8-13` is explicit that this is best-effort, not DLP.
|
||||
- **No middleware can starve the chain.** Every invocation runs inside `context.WithTimeout(ctx, clampTimeout(spec.Timeout))` in a separate goroutine (`dispatcher.go:51-94`), with the deadline race-`select`ed against the result channel. A blocked middleware hits the timeout path, has its fail-mode applied, and is counted via `IncError(kind=timeout)`. Timeouts are clamped to `[10ms, 5s]` (`types.go:80-86`, `dispatcher.go:174-185`).
|
||||
- **Panic recovery.** `recover()` captures the panic, logs only the type + a 4 KiB stack prefix (no panic value — avoids leaking secrets the middleware was processing), and produces a `panicError` that flows through fail-mode (`dispatcher.go:64-76`).
|
||||
- **Chain immutability + atomic swap.** `chainTable` is cloned on every `Rebuild`/`Invalidate*` and swapped via `atomic.Pointer` (`manager.go:44-69`, `manager.go:221-300`). Readers (`ChainFor`) are lock-free; writers serialise on `writeMu`. The retired chain is `Close`-d in a background goroutine bounded by `chainCloseTimeout = 2 * MaxTimeout` (`manager.go:21-22`, `manager.go:326-346`), so in-flight invocations finish on the old chain after the swap.
|
||||
|
||||
## Things to scrutinize
|
||||
|
||||
### Correctness
|
||||
|
||||
- **Chain ordering deterministic from synth output?** `Manager.buildChain` iterates `b.Specs` in slice order and appends to `bound` (`manager.go:366-391`); `NewChain` then partitions by slot but **preserves slice order within each slot** (`chain.go:50-60`). So order on the wire = order observed at runtime. Synth must therefore emit specs in the intended execution order — there is no per-spec `Priority` field. Worth flagging.
|
||||
- **Decision short-circuit semantics.** `RunRequest` returns immediately on `DecisionDeny` (`chain.go:164-167`) **with the metadata accumulated so far** plus the `denied.Metadata`. Callers that ignore `merged` on deny will lose framework-injected `mw.<id>.error_kind` entries. The proxy runtime is the only caller; confirm it always feeds `merged` into the access log on the deny path as well.
|
||||
- **`UpstreamRewrite` `AuthHeader` bypass** (`types.go:218-235`). The `AuthHeader`/`StripHeaders` fields *intentionally* bypass the header denylist on the basis that the proxy itself rewrites auth. The denylist still blocks middleware-emitted `HeadersAdd: Authorization=...`. This is a delicate carve-out — review the runtime consumer to confirm only the trusted upstream-build path unpacks `AuthHeader`, never the generic `applyMutations` loop.
|
||||
- **`replayReadCloser.Close` only closes the tail** (`bodytap/request.go:199-201`). The replay buffer doesn't own a resource, so this is correct, but it conflates "replay finished" with "underlying body closed". If a caller `Close()`s without reading to EOF, the original body is closed but the captured prefix is lost; harmless for the proxy path (upstream always reads to EOF) but worth a doc-comment.
|
||||
|
||||
### Security
|
||||
|
||||
- **Body-tap memory bounds.** Discussed above — bounded by `MaxBodyCapBytes = 1 MiB` per direction (`types.go:77`) and the shared `Budget` (default 256 MiB). The concerning case is the **deep-copy in `cloneInputFor`** (`chain.go:300-306`): every middleware invocation gets its **own copy** of `Body` and `RespBody`. A chain of N middlewares with a 1 MiB body allocates N MiB of transient bytes per request. With `MaxMiddlewaresPerChain = 16` (`types.go:103`) that's up to 16 MiB extra per in-flight request. Worth pricing into the budget model.
|
||||
- **Header redaction completeness.** `denyHeaders` (`headerpolicy.go:5-17`) covers the auth/forwarding family and framing (`Content-Length`, `Transfer-Encoding`, `Trailer`). `denyHeaderPrefixes` covers `X-Authenticated-*`, `X-Forwarded-*`, `X-Remote-*`, `X-NetBird-*`. Notably absent: `Range`, `If-Match`/`If-None-Match` (mutation could cause cache poisoning), `Origin`/`Referer`. Not necessarily wrong, but worth a deliberate decision.
|
||||
- **Metadata key collisions across middlewares.** The accumulator has no cross-middleware uniqueness check; two middlewares with the same key in their allowlist can both emit it, and both copies land in `merged` (`metadata.go:51-99`). Downstream consumers must tolerate duplicates. Worth documenting.
|
||||
- **Deny rendering.** `RenderDenyResponse` only allows codes matching `^[a-z][a-z0-9._-]{0,63}$` (`decision.go:9`), redacts/truncates message + detail values, caps `Details` at 8 entries (`decision.go:42-50`), and clamps status to the range `[400,499]` excluding `401` (`decision.go:65-73`). The deny body type is fixed; middlewares cannot inject arbitrary JSON.
|
||||
|
||||
### Concurrency
|
||||
|
||||
- **Per-request state vs shared state in factories.** Each `Factory.New` is called once per chain build; the returned `Middleware` instance is **shared across all requests** for that chain. `Invoke` must be reentrant. The framework does not enforce this — a buggy middleware that holds per-call state on the struct will silently race. Suggest a `// Invoke must be safe for concurrent use` doc on the interface.
|
||||
- **`chainTable` clone-on-write** is correct, but `addChain`/`removeChain` mutate the *cloned* table before the swap (`manager.go:71-108`), and they're called under `writeMu`. Readers only ever see the post-swap pointer. Good.
|
||||
- **`Chain.inflight` WaitGroup**. `Run*` does `Add(1)`/`Done()` (`chain.go:142-143`, `chain.go:194-195`, `chain.go:225-226`); `Close` waits on it bounded by ctx (`chain.go:75-85`). One concern: a *new* `RunRequest` can `Add(1)` *after* `Close` started waiting if the caller still holds a stale chain pointer. Under the `sync.WaitGroup` contract this is safe while the counter is still > 0, but an `Add` that starts from a zero counter must happen before `Wait`; if the two race, the late request is not waited for and the runtime may panic on the misuse. `Close` is documented one-shot, so single-`Wait` is fine, but callers must drop the chain reference before calling `Close`. Worth a code comment near `Close`.
|
||||
- **Goroutine leaks.** `Dispatcher.Invoke` spawns one goroutine per call and *always* writes to a buffered (cap=1) channel (`dispatcher.go:62-76`), so even if the timeout fires the goroutine completes its send and exits. No leak.
|
||||
- **`closeChainsAsync`** detaches retired chains into a goroutine (`manager.go:326-346`). If `Manager` is never GC'd this is fine, but there's no shutdown hook to wait on outstanding closes. Reviewers should confirm the proxy shutdown path explicitly drains in-flight requests before tearing down `Manager`, or accept that the last chain-close round may be cut short on exit.
|
||||
|
||||
### Performance
|
||||
|
||||
- **Allocations per request.** `cloneInputFor` allocates new slices for `Headers`, `RespHeaders`, `Metadata`, `Body`, `RespBody`, `UserGroups`, `UserGroupNames` — once per middleware per request. For a typical 5-middleware chain on a 1 KiB body that's ~10 small slice allocs plus one `Body` copy each. Not a hot-path crisis, but `sync.Pool` for the per-call `Input` would be a natural follow-up.
|
||||
- **Accumulator allocates a fresh `allowSet` per `Emit` call** (`metadata.go:55-58`). One per middleware invocation; since each middleware runs in exactly one slot and `MaxMiddlewaresPerChain = 16`, that is up to 16 per request. Cheap, but worth noting.
|
||||
- **Regex cost.** `Scan` runs five regex passes on every accepted metadata value (`redaction.go:25-51`). Bounded by `MaxMetadataValueBytes = 4 KiB` so worst case is small.
|
||||
|
||||
### Observability
|
||||
|
||||
- **Per-middleware metrics.** `proxy.middleware.requests_total{middleware,target_id,outcome}` (`metrics.go:34-41`), `duration_ms`, `invocations_total`, `errors_total{kind}`, `metadata_rejected_total{reason}`, `header_mutation_blocked_total{header}`, `capture_bypass_total{reason}`. Comprehensive surface; operators can alert on `errors_total{kind=panic}` and `errors_total{kind=timeout}` separately. **Latency histogram is in milliseconds with default OTel buckets** — the default buckets cover the 10ms–5s timeout range adequately, but a custom bucket set centred on 1–500ms would resolve the agent-network response-parser tail better.
|
||||
- **Decision logs.** Panic logs (`dispatcher.go:69`) include `request_id`, type, and stack but not the panic value (safe). `Chain.Close` logs middleware-close errors at debug (`chain.go:91`). `applyMutations` logs body-replace rejections at warn (`chain.go:278`). No log on the deny path itself — by design, since the access-log terminal middleware is expected to record outcomes.
|
||||
|
||||
## Test coverage
|
||||
|
||||
| Test file | Locks down |
|
||||
| --------- | ---------- |
|
||||
| `proxy/internal/middleware/chain_test.go:77` | `RunRequest` threads metadata across on_request middlewares (regression for the "later mw can't see earlier mw's emissions" bug). |
|
||||
| `chain_test.go:110` | `RunResponse` reverse-order threading. |
|
||||
| `chain_test.go:142` | `cost_meter`-shaped scenario: response_parser registered after cost_meter still emits *before* cost_meter sees the bag (guards the `cost.skipped=missing_tokens` regression). |
|
||||
| `chain_test.go:178` | `UpstreamRewrite` last-write-wins. |
|
||||
| `chain_test.go:206` | No middleware emits → nil rewrite. |
|
||||
| `chain_test.go:224` | Rewrite filtered when `CanMutate=false`. |
|
||||
| `chain_test.go:245` | `Input.UserGroups` propagates verbatim through `cloneInputFor`. |
|
||||
| `chain_test.go:304` | Terminal middlewares see the full accumulated bag + prior terminal emissions. |
|
||||
|
||||
**Gaps** worth raising with the author:
|
||||
- No direct test for `Dispatcher.Invoke` timeout / panic / fail-mode behaviour at the framework level (covered indirectly by built-in tests, but a unit test pinning `errors_total{kind=...}` labels would be cheap insurance).
|
||||
- No test for `bodytap.CaptureRequest` truncated replay (the upstream-sees-full-body invariant is exactly the kind of thing a regression would silently break).
|
||||
- No test for `Budget` exhaustion behaviour under concurrency.
|
||||
- No test for `Manager.InvalidateMiddleware` + `LiveServiceCheck` race (the auth-revocation race the comment at `manager.go:33-38` calls out is the load-bearing reason for `LiveServiceCheck`).
|
||||
|
||||
## Known limitations / explicit non-goals
|
||||
|
||||
- **No middleware-to-middleware RPC.** Side-channel is metadata only.
|
||||
- **No streaming body inspection.** Middlewares see a bounded prefix; SSE / chunked parsing happens against that prefix in the response middleware.
|
||||
- **No per-spec priority.** Order is registration order in the spec slice.
|
||||
- **No retry / circuit-breaker** on middleware errors. Fail-mode is binary (open/closed) and per-spec.
|
||||
- **Header/body mutations cannot rewrite the request URL path or query** — only the `RewriteUpstream` mutation can change scheme/host (+ optional path replacement, see `types.go:218-235`).
|
||||
- **Redaction is best-effort.** Explicitly documented in `redaction.go:8-13`. Not a DLP solution.
|
||||
|
||||
## Cross-references
|
||||
|
||||
- Upstream wire shape: [../modules/10-shared-api.md](10-shared-api.md) (Spec/RawConfig encoding from management).
|
||||
- Built-in middlewares using this framework: [../modules/31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md).
|
||||
- Runtime wiring (where `Manager`, `Chain`, and `bodytap` are consumed by the HTTP handler): [../modules/33-proxy-runtime.md](33-proxy-runtime.md).
|
||||
- End-to-end request flow including capture + chain dispatch: [../01-end-to-end-flows.md](../01-end-to-end-flows.md).
|
||||
- Top-level architecture: [../00-overview.md](../00-overview.md).
|
||||
365
docs/agent-networks/modules/31-proxy-middleware-builtin.md
Normal file
365
docs/agent-networks/modules/31-proxy-middleware-builtin.md
Normal file
@@ -0,0 +1,365 @@
|
||||
# proxy/middleware-builtin — the LLM chain
|
||||
|
||||
The registry-mounted middleware set the proxy executes on every agent-network
|
||||
LLM request. The two highest-blast-radius areas are the **capture-pointer
|
||||
semantics** and the **limit_check ⇒ limit_record** record-once invariant.
|
||||
|
||||
Sibling module: [32-proxy-llm-parsers.md](./32-proxy-llm-parsers.md) — the SDK
|
||||
adapters + pricing catalog this chain delegates to.
|
||||
|
||||
---
|
||||
|
||||
## Module boundary
|
||||
|
||||
This module is the registry-mounted middleware set the proxy executes on
|
||||
every agent-network LLM request. Each sub-package registers itself via
|
||||
`init()`
|
||||
([builtin.go:32–34](../../../proxy/internal/middleware/builtin/builtin.go));
|
||||
the proxy server blank-imports (`import _`) the set
|
||||
([all_test.go:11–19](../../../proxy/internal/middleware/builtin/all_test.go))
|
||||
so the registry is populated at boot. The chain is wired by the management
|
||||
synthesiser and executed by the framework
|
||||
(`proxy/internal/middleware/{chain,dispatcher,accumulator}.go` — all out
|
||||
of scope). Everything here reads from / writes to one envelope: the
|
||||
`middleware.KV` metadata bag plus `middleware.Mutations` for header/body
|
||||
rewrites.
|
||||
|
||||
## The 8 middlewares
|
||||
|
||||
| Name | Slot | Inputs (metadata read) | Outputs (metadata written) | Side effects |
|
||||
|---|---|---|---|---|
|
||||
| `llm_request_parser` | OnRequest | `Input.{URL,Body,BodyTruncated}` | `llm.{provider,model,stream,request_prompt_raw,capture_truncated}` | none |
|
||||
| `llm_router` | OnRequest | `llm.model`, `Input.{URL,UserGroups}` | `llm.{resolved_provider_id,authorising_groups}`, `llm_policy.{decision,reason}` | upstream rewrite + auth strip/inject |
|
||||
| `llm_limit_check` | OnRequest | `llm.{resolved_provider_id,model}`, `Input.{AccountID,UserID,UserGroups}` | `llm.{selected_policy_id,attribution_group_id,attribution_window_seconds}`, `llm_policy.{decision,reason}` | gRPC `CheckLLMPolicyLimits` |
|
||||
| `llm_identity_inject` | OnRequest | `llm.{resolved_provider_id,authorising_groups}`, `Input.{UserEmail,UserID,UserGroups,UserGroupNames}` | none | header strip/inject + optional body rewrite |
|
||||
| `llm_guardrail` | OnRequest | `llm.{model,request_prompt_raw}` | `llm_policy.{decision,reason}`, `llm.request_prompt` | none (model allowlist deny) |
|
||||
| `llm_response_parser` | OnResponse | `llm.provider`, `Input.{RespHeaders,RespBody,Status}` | `llm.{input,output,total,cached_input,cache_creation}_tokens`, `llm.response_completion` | none |
|
||||
| `cost_meter` | OnResponse | `llm.{provider,model}`, token buckets | `cost.usd_total` or `cost.skipped` | pricing lookup |
|
||||
| `llm_limit_record` | OnResponse | `llm.{attribution_group_id,attribution_window_seconds,input_tokens,output_tokens}`, `cost.usd_total` | none | gRPC `RecordLLMUsage` |
|
||||
|
||||
[all_test.go:26–40](../../../proxy/internal/middleware/builtin/all_test.go)
|
||||
locks the ID set; adding or removing one is a conscious extension.
|
||||
|
||||
## Files
|
||||
|
||||
| File | LOC | Notes |
|
||||
|---|---:|---|
|
||||
| `builtin.go` | 86 | Registry + `FactoryContext` (ctx, data dir, meter, logger, mgmt client) |
|
||||
| `all_test.go` | 41 | Locks the 8-ID registry surface |
|
||||
| `agentnetwork_chain_integration_test.go` | 319 | Live sqlite + real gRPC bufconn; gate→recorder wire path |
|
||||
| `llm_request_parser/*` | 162 / 66 / 356 | Provider detection, body parse, prompt extraction with capture-pointer gating |
|
||||
| `llm_router/*` | 385 / 84 / 586 | Three-pass route selection (model → groups → path-prefix) |
|
||||
| `llm_limit_check/*` | 196 / 38 / 182 | Pre-flight `CheckLLMPolicyLimits` (2s, fail-open) |
|
||||
| `llm_identity_inject/*` | 440 / 108 / 666 | HeaderPair (LiteLLM) + JSONMetadata (Portkey) + ExtraHeaders |
|
||||
| `llm_guardrail/*` | 176 / 82 / 75 / 219 / 217 | Model allowlist + optional prompt capture with PII redaction |
|
||||
| `llm_response_parser/*` | 258 / 222 / 43 / 433 / 169 / 111 | Buffered + SSE accumulation; AWS event-stream accumulator (`streaming_bedrock.go`) for Bedrock; capture-pointer gates completion emit |
|
||||
| `cost_meter/*` | 181 / 84 / 439 | Token → USD via `proxy/internal/llm/pricing` |
|
||||
| `llm_limit_record/*` | 144 / 35 / 191 | Post-flight `RecordLLMUsage` (5s, debug-on-error) |
|
||||
|
||||
## Per-middleware
|
||||
|
||||
### llm_request_parser
|
||||
|
||||
Detects the LLM provider via `llm.DetectParser` (URL sniff) or by name via
|
||||
`llm.ParserByName` when synthesiser stamps `provider_id`
|
||||
([middleware.go:96–99](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
|
||||
**Path-routed providers short-circuit first:** `parseVertexPath` and
|
||||
`parseBedrockPath` ([middleware.go:85–94](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go))
|
||||
pull the model + vendor out of the URL before parser selection runs — Vertex
|
||||
from `/v1/projects/.../publishers/{pub}/models/{model}:{action}` (publisher →
|
||||
vendor via `vertexPublisherVendor`), Bedrock from `/model/{id}/{action}` with
|
||||
`normalizeBedrockModel` stripping the region prefix + version suffix. See
|
||||
[50-path-routed-providers.md](./50-path-routed-providers.md) for the full path
|
||||
grammar. For body-routed providers it decodes the body into `RequestFacts`
|
||||
(model + stream) and extracts the prompt. On
|
||||
`capture_prompt=true` (or absent — see capture-pointer semantics below) the
|
||||
prompt is run through `llm_guardrail.RedactPII` when `redact_pii=true` and
|
||||
truncated rune-safely to 3500 bytes
|
||||
([middleware.go:109–122](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
|
||||
**Key invariant:** redaction is parser-side, not guardrail-side — access-log
|
||||
reads `llm.request_prompt_raw` directly.
|
||||
|
||||
### llm_router
|
||||
|
||||
Three-pass route selection in `matchRoute`
|
||||
([middleware.go:241–300](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
|
||||
filter by `Models` claim → vendor-pin (a vendor-tagged request never crosses to
|
||||
another vendor's route) → filter by `AllowedGroupIDs` intersection → model
|
||||
precedence over path → tie-break by longest `UpstreamPath` prefix match.
|
||||
Model-miss returns `llm_policy.model_not_routable`; known-but-unauthorised
|
||||
returns `llm_policy.no_authorised_provider`. **Key invariant:** auth-header
|
||||
strip+inject rides on `UpstreamRewrite.{StripHeaders,AuthHeader}`
|
||||
([middleware.go:606–646](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
|
||||
— NOT `HeadersAdd/HeadersRemove` — because the framework's mutation gate
|
||||
blocks `Authorization` on the generic header path.
|
||||
|
||||
**Path-routed providers route before the model table.** `Invoke` checks
|
||||
`isVertexPath` / `isBedrockPath`
|
||||
([middleware.go:138–216](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
|
||||
ahead of the model lookup, so a path-carried model can't be claimed by a
|
||||
same-vendor body-routed provider. `matchPathRoute` enforces the route's `Models`
|
||||
allowlist (empty = catch-all) even though the model came from the URL.
|
||||
Three path-only behaviours:
|
||||
- **Vertex unmeterable publisher** — when `llm_request_parser` emits no
|
||||
`llm.provider` (e.g. Gemini/`google`), the router denies with
|
||||
`llm_policy.unmeterable_publisher` (403) rather than forward it uncounted.
|
||||
- **GCP token minting** — when the route carries `GCPServiceAccountKeyB64`
|
||||
(set from a `keyfile::` api_key), `gcpBearer` mints + caches a short-lived
|
||||
OAuth2 token per request instead of injecting a static value; a bad key or
|
||||
unreachable token endpoint denies with `llm_policy.upstream_auth_failed`
|
||||
(502). Bedrock uses its static bearer token directly (no minting).
|
||||
- **`/bedrock` prefix** — an optional `/bedrock` gateway-namespace prefix is
|
||||
accepted and stripped via `RewriteUpstream.StripPathPrefix` so the native
|
||||
`/model/...` path reaches the upstream.
|
||||
|
||||
Full treatment in [50-path-routed-providers.md](./50-path-routed-providers.md).
|
||||
|
||||
### llm_limit_check
|
||||
|
||||
Pre-flight gate. Reads `llm.resolved_provider_id`, calls
|
||||
`CheckLLMPolicyLimits` with a 2s context timeout
|
||||
([middleware.go:24, 97–106](../../../proxy/internal/middleware/builtin/llm_limit_check/middleware.go)),
|
||||
on allow stamps `llm.selected_policy_id`, `llm.attribution_group_id`,
|
||||
`llm.attribution_window_seconds`. **Key invariant:** fail-open. Nil
|
||||
`MgmtClient`, empty provider id, or RPC error returns `allowNoAttribution()`
|
||||
— a management outage doesn't take down every LLM request. Operators audit via
|
||||
the access-log; a future flag may switch this to fail-closed.
|
||||
|
||||
### llm_identity_inject
|
||||
|
||||
Dispatches per-rule between LiteLLM-shaped `HeaderPair`
|
||||
([middleware.go:169](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go))
|
||||
and Portkey-shaped `JSONMetadata`
|
||||
([middleware.go:292](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go)).
|
||||
Identity is the peer's email (or `UserID` fallback); tags are the
|
||||
**authorising-groups intersection** emitted by `llm_router`, not the full
|
||||
`UserGroups` — a peer in 5 groups authorised under 1 only tags as that 1.
|
||||
**Anti-spoof:** every `HeadersAdd` is preceded by a `HeadersRemove` of the
|
||||
same name; the framework runs `Remove` before `Add` so client-supplied
|
||||
identity never reaches the upstream. Body-level inject (`tags_in_body`,
|
||||
`end_user_id_in_body`) is skipped on empty / truncated / non-JSON bodies so
|
||||
header attribution stays intact.
|
||||
|
||||
### llm_guardrail
|
||||
|
||||
Model allowlist deny + optional prompt-capture-with-redaction. Allowlist
|
||||
match is case-insensitive via `normaliseModel`; empty allowlist disables the
|
||||
check. Prompt capture reads `llm.request_prompt_raw` and emits
|
||||
`llm.request_prompt` only when `prompt_capture.enabled`
|
||||
([middleware.go:149–165](../../../proxy/internal/middleware/builtin/llm_guardrail/middleware.go)).
|
||||
**Key invariant:** `RedactPII` is the exported function the parsers call —
|
||||
single PII contract across all three keys (`llm.request_prompt_raw`, `llm.request_prompt`, `llm.response_completion`).
|
||||
|
||||
### llm_response_parser
|
||||
|
||||
Buffered and SSE paths share one `Invoke`
|
||||
([middleware.go:102–127](../../../proxy/internal/middleware/builtin/llm_response_parser/middleware.go)):
|
||||
content-type sniffing dispatches to `invokeBuffered` (JSON, status<400) or
|
||||
`invokeStreaming` (text/event-stream, partial bodies tolerated). Streaming
|
||||
delegates to `accumulateStream`
|
||||
([streaming.go:21–30](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go))
|
||||
using `llm.NewScanner`. A third path, `accumulateBedrockStream`
|
||||
([streaming_bedrock.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming_bedrock.go)),
|
||||
decodes the AWS binary event-stream (`application/vnd.amazon.eventstream`)
|
||||
returned by Bedrock's `-stream` actions — InvokeModel `chunk` frames wrap a
|
||||
base64 Anthropic event, Converse frames carry text + a trailing usage block.
|
||||
Cached / cache-creation buckets emit only when non-zero, preserving the existing
|
||||
token schema.
|
||||
|
||||
### cost_meter
|
||||
|
||||
Reads `llm.provider` + `llm.model` + token buckets, looks up per-1k rate via
|
||||
`pricing.Loader`, emits `cost.usd_total` or a closed-set `cost.skipped`
|
||||
reason (`missing_provider/model/tokens`, `unparseable_tokens`, `zero_tokens`,
|
||||
`unknown_model`). Loader's hot-reload goroutine is bound to proxy-lifetime
|
||||
context via `startReloader`. **Key invariant:** provider-shape switch lives
|
||||
in `pricing.Table.Cost` (sibling doc) — `cost_meter` stays provider-agnostic.
|
||||
|
||||
### llm_limit_record
|
||||
|
||||
Post-flight write. Always returns `DecisionAllow`; response has already been
|
||||
served so RPC errors mustn't surface (logged at `Debugf`). Skip-on-no-signal
|
||||
at line 81 (zero tokens + zero cost). **Key invariant:** the
|
||||
skip-on-missing-attribution guard at line 98 is a safety net independent of
|
||||
the framework's deny short-circuit — if the gate denied and the framework
|
||||
still runs the recorder, the recorder skips on absent
|
||||
`UserID`+`groupID`+`UserGroups` and no phantom counter materialises.
|
||||
|
||||
## Full-chain diagram (canonical order)
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[HTTP request] --> B[llm_request_parser<br/>OnRequest]
|
||||
B -->|llm.provider, llm.model,<br/>llm.stream, llm.request_prompt_raw| C[llm_router<br/>OnRequest]
|
||||
C -->|llm.resolved_provider_id,<br/>llm.authorising_groups,<br/>upstream rewrite + auth| D[llm_limit_check<br/>OnRequest]
|
||||
D -->|deny path| Z1[403 llm_policy.*]
|
||||
D -->|allow + llm.selected_policy_id,<br/>llm.attribution_group_id,<br/>llm.attribution_window_seconds| E[llm_identity_inject<br/>OnRequest]
|
||||
E -->|header strip+inject<br/>+ optional body rewrite| F[llm_guardrail<br/>OnRequest]
|
||||
F -->|deny: model_blocked| Z2[403 llm_policy.model_blocked]
|
||||
F -->|allow + llm.request_prompt| G[upstream LLM call]
|
||||
G --> H[llm_response_parser<br/>OnResponse]
|
||||
H -->|llm.{input,output,total,cached_input,cache_creation}_tokens,<br/>llm.response_completion| I[cost_meter<br/>OnResponse]
|
||||
I -->|cost.usd_total or cost.skipped| J[llm_limit_record<br/>OnResponse]
|
||||
J --> K[response to client]
|
||||
```
|
||||
|
||||
## limit_check ⇒ limit_record record-once invariant
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant LC as llm_limit_check
|
||||
participant M as management gRPC
|
||||
participant U as upstream LLM
|
||||
participant LR as llm_limit_record
|
||||
participant DB as sqlite consumption table
|
||||
|
||||
LC->>M: CheckLLMPolicyLimits (2s)
|
||||
alt allow
|
||||
M-->>LC: selected_policy_id, attribution_group_id, window_s
|
||||
LC->>U: stamps attribution metadata
|
||||
U-->>LR: response + tokens (via llm_response_parser + cost_meter)
|
||||
LR->>M: RecordLLMUsage (5s, debug-on-error)
|
||||
M->>DB: increment (user, group, window) row
|
||||
else deny
|
||||
M-->>LC: llm_policy.token_cap_exceeded
|
||||
Note over LR: framework short-circuits; even if invoked,<br/>recorder skips on absent UserID+groupID+UserGroups
|
||||
else mgmt nil / rpc error
|
||||
LC-->>LC: allowNoAttribution() — fail open
|
||||
Note over LR: no window_s ⇒ recorder books only account-level<br/>budget rules (which run independently)
|
||||
end
|
||||
```
|
||||
|
||||
The integration test
|
||||
[agentnetwork_chain_integration_test.go](../../../proxy/internal/middleware/builtin/agentnetwork_chain_integration_test.go)
|
||||
exercises all three branches against a real sqlite store + bufconn gRPC —
|
||||
no mocks. Tests: `TestChain_AllowPath_StampsAttributionAndRecordsCounter`
|
||||
(line 130), `TestChain_DenyPath_GateRejectsAndNoConsumptionWritten` (line
|
||||
207), `TestChain_CapExhaustTransition` (line 265).
|
||||
|
||||
## Public contracts (per-middleware JSON config)
|
||||
|
||||
| Middleware | Config shape |
|
||||
|---|---|
|
||||
| `llm_request_parser` | `{provider_id?, redact_pii?, capture_prompt?: *bool}` ([factory.go:19–37](../../../proxy/internal/middleware/builtin/llm_request_parser/factory.go)) |
|
||||
| `llm_router` | `{providers: [{id, models, upstream_scheme, upstream_host, upstream_path?, auth_header_name, auth_header_value, allowed_group_ids}]}` |
|
||||
| `llm_limit_check` | `{}` — pulls `MgmtClient` from `FactoryContext` |
|
||||
| `llm_identity_inject` | `{providers: [{provider_id, header_pair?\|json_metadata?, extra_headers?}]}` |
|
||||
| `llm_guardrail` | `{model_allowlist: []string, prompt_capture: {enabled, redact_pii}}` |
|
||||
| `llm_response_parser` | `{redact_pii?, capture_completion?: *bool}` |
|
||||
| `cost_meter` | `{pricing_path?}` (basename inside data-dir; defaults `pricing.yaml`) |
|
||||
| `llm_limit_record` | `{}` — same pattern as `llm_limit_check` |
|
||||
|
||||
All factories accept empty / null / `{}` / whitespace as zero-value config;
|
||||
only structurally invalid JSON is rejected so misconfig surfaces at chain
|
||||
build time.
|
||||
|
||||
## Invariants
|
||||
|
||||
1. **limit_check ↔ limit_record paired.** They MUST appear together. Gate
|
||||
stamps attribution metadata on the request leg; recorder reads it on the
|
||||
response leg. If a chain contains only the recorder, the
|
||||
skip-on-missing-attribution guard at
|
||||
[llm_limit_record/middleware.go:81–87, 98–103](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go)
|
||||
keeps counters consistent but no enforcement runs. Only-gate means
|
||||
counters never tick and headroom appears infinite.
|
||||
|
||||
2. **`capture_prompt` / `capture_completion` pointer semantics.** Both are
|
||||
`*bool`. `nil` = "preserve legacy emit" (back-compat default for
|
||||
non-agent-network callers and pre-toggle tests). `false` = suppress the
|
||||
key entirely (access-log row carries zero prompt / completion content).
|
||||
`true` = emit. The synthesiser sets the pointer explicitly to the
|
||||
account's `EnablePromptCollection` toggle. The handling lives
|
||||
in [llm_request_parser/factory.go:55–61](../../../proxy/internal/middleware/builtin/llm_request_parser/factory.go)
|
||||
and the symmetric [llm_response_parser/middleware.go:62–68](../../../proxy/internal/middleware/builtin/llm_response_parser/middleware.go);
|
||||
a missing pointer must not be treated as `false` (that would suppress
|
||||
capture for legacy non-agent-network callers).
|
||||
`redact_pii` is an orthogonal `bool` controlling **form** of emitted
|
||||
content, not whether it's emitted.
|
||||
|
||||
3. **`redact_pii` is parser-side.** Both parsers import
|
||||
`llm_guardrail.RedactPII` and run it BEFORE stamping the metadata bag.
|
||||
Load-bearing because the access-log sink reads `llm.request_prompt_raw`
|
||||
and `llm.response_completion` directly — by the time `llm_guardrail`
|
||||
runs its own pass on `llm.request_prompt`, the raw key has already been
|
||||
stamped. Tests: `TestInvoke_RedactPii_RedactsBeforeEmittingRawPrompt`,
|
||||
`TestInvoke_RedactPii_RedactsCompletionBeforeEmit`.
|
||||
|
||||
4. **Metadata allowlist enforcement.** Every middleware declares
|
||||
`MetadataKeys()`. The framework accumulator drops any KV outside that
|
||||
allowlist. When adding a new key, also extend the docstring in
|
||||
`middleware/keys.go`.
|
||||
|
||||
5. **Closed deny-code set.** All deny paths emit one of:
|
||||
`llm_policy.model_not_routable`, `llm_policy.no_authorised_provider`,
|
||||
`llm_policy.model_blocked`, `llm_policy.token_cap_exceeded`,
|
||||
`llm_policy.unmeterable_publisher` (path-routed Vertex publisher with no
|
||||
parser → 403), `llm_policy.upstream_auth_failed` (GCP token mint failure →
|
||||
502), or the management-supplied code on `llm_limit_check`. These surface
|
||||
verbatim; arbitrary middleware text never reaches the wire.
|
||||
|
||||
## Things to scrutinise
|
||||
|
||||
**Correctness.** `llm_router` model match treats an empty `Models` slice as
|
||||
"claim every model"
|
||||
([middleware.go:238–248](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
|
||||
for gateway-style providers — confirm no real provider record ships with an
|
||||
empty `Models` by accident. Path-prefix tie-break falls back to declaration
|
||||
order when no candidate prefix-matches, so the synthesiser must emit a
|
||||
deterministic order. `llm_limit_record` discards `strconv.ParseInt` errors
|
||||
([middleware.go:78–80](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go))
|
||||
— relies on `llm_response_parser` always emitting parseable values; spot-check
|
||||
the streaming partial path on truncated bodies.
|
||||
|
||||
**Security.** Auth headers must NEVER appear on `Mutations.HeadersAdd/Remove`
|
||||
for the router — a direct headers path would bypass the framework gate. The
|
||||
capture-pointer handling is the kind of place a bug ships PII to logs
|
||||
silently; every synthesiser config path must set the pointer explicitly.
|
||||
`llm_identity_inject` body inject silently skips on a
|
||||
non-object `metadata` field
|
||||
([middleware.go:262–270](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go))
|
||||
— header path still attributes, but body-level tag-budget enforcement
|
||||
doesn't run for that request.
|
||||
|
||||
**Concurrency.** `cost_meter` shares a `pricing.Loader` via
|
||||
`atomic.Pointer[Table]`; readers always see a consistent table. Every
|
||||
middleware is a stateless value receiver. Integration test uses real bufconn
|
||||
gRPC — race detector is the meaningful bar.
|
||||
|
||||
**Perf.** Hot path is `lookupKV` linear scan over <10 KVs; `cost_meter.Cost`
|
||||
is O(1); SSE accumulation is single-pass. No map allocation per call.
|
||||
|
||||
**Observability.** Every deny stamps `llm_policy.decision=deny` and a
|
||||
matching `llm_policy.reason` — access-log can pivot on either.
|
||||
`llm_limit_record` only logs at `Debugf` on RPC failure
|
||||
([middleware.go:125–130](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go));
|
||||
operators need an alternate signal (metric on `RecordLLMUsage` failures) for
|
||||
counter accuracy.
|
||||
|
||||
## Test coverage
|
||||
|
||||
| File | Tests | Notes |
|
||||
|---|---:|---|
|
||||
| `all_test.go` | 1 | Registry surface lock |
|
||||
| `agentnetwork_chain_integration_test.go` | 3 | Allow/deny/cap-exhaust vs live sqlite + bufconn gRPC |
|
||||
| `llm_request_parser/middleware_test.go` | 18 | `provider_id` bypass, redaction, capture-pointer, rune-safe truncation |
|
||||
| `llm_router/middleware_test.go` | 19 | Three-pass match, deny codes, path-prefix tie-break, header strip+inject |
|
||||
| `llm_limit_check/middleware_test.go` | 6 | Allow/deny, fail-open on nil mgmt / RPC error, attribution stamping |
|
||||
| `llm_identity_inject/middleware_test.go` | 28 | HeaderPair, JSONMetadata, ExtraHeaders, body inject, anti-spoof |
|
||||
| `llm_guardrail/middleware_test.go` | 15 | Allowlist case-insensitivity, prompt capture toggle, deny shape |
|
||||
| `llm_guardrail/redact_test.go` | 15 | Email, SSN, phone (E.164 + NA), bearer, IPv4; fixture-driven |
|
||||
| `llm_response_parser/middleware_test.go` | 18 | Buffered OAI+Anthro, capture-pointer, redact, truncation |
|
||||
| `llm_response_parser/streaming_test.go` | 7 | OAI usage frame, Anthro message_delta, truncated body best-effort |
|
||||
| `cost_meter/middleware_test.go` | 17 | Each skip reason, provider-shape, pricing loader integration |
|
||||
| `llm_limit_record/middleware_test.go` | 7 | Skip-on-no-signal, skip-on-missing-attribution, RPC failure swallowed |
|
||||
|
||||
## Cross-references
|
||||
|
||||
- Sibling: [32-proxy-llm-parsers.md](./32-proxy-llm-parsers.md) — SDK adapters
|
||||
+ SSE framer + pricing loader.
|
||||
- Path-routed providers (Vertex AI + Bedrock), `keyfile::` credential, GCP
|
||||
token minting, `/bedrock` prefix:
|
||||
[50-path-routed-providers.md](./50-path-routed-providers.md).
|
||||
- Upstream config: `management/server/agentnetwork/synthesizer` (out of scope).
|
||||
- Framework: `proxy/internal/middleware/{chain,dispatcher,accumulator,registry}.go`.
|
||||
- Metadata key registry: `proxy/internal/middleware/keys.go`.
|
||||
- gRPC surface: `proto.ProxyServiceClient.{CheckLLMPolicyLimits,RecordLLMUsage}`.
|
||||
392
docs/agent-networks/modules/32-proxy-llm-parsers.md
Normal file
392
docs/agent-networks/modules/32-proxy-llm-parsers.md
Normal file
@@ -0,0 +1,392 @@
|
||||
# proxy/llm-parsers — SDK adapters + pricing + SSE
|
||||
|
||||
This module is the runtime-agnostic LLM library. It covers the OpenAI Responses API (`/v1/responses`)
|
||||
and the older Chat Completions API (`/v1/chat/completions`), the Anthropic
|
||||
Messages API (`/v1/messages`), the SSE wire format (`event:` / `data:` lines,
|
||||
`\n\n` framing, CRLF tolerance), and per-provider token accounting (OpenAI's
|
||||
cached-prompt **subset** vs Anthropic's cache_read **additive** model). The
|
||||
pricing table's per-provider cost formula is the highest-leverage place a
|
||||
small bug would silently mis-bill operators.
|
||||
|
||||
Sibling module: [31-proxy-middleware-builtin.md](./31-proxy-middleware-builtin.md)
|
||||
— the 8 middlewares that consume this package's parsers + pricing loader.
|
||||
|
||||
---
|
||||
|
||||
## Module boundary
|
||||
|
||||
`proxy/internal/llm` is the runtime-agnostic LLM library shared by every
|
||||
middleware that needs to understand provider-specific shapes. Zero
|
||||
proxy-framework dependencies. Package layout:
|
||||
|
||||
- `parser.go` — `Parser` interface, `Provider` enum, public factories
|
||||
(`Parsers`, `DetectParser`, `ParserByName`).
|
||||
- `openai.go` / `anthropic.go` / `bedrock.go` — per-provider `Parser` impls.
|
||||
- `sse.go` — SSE scanner (`Scanner`, `Event`, `NewScanner`).
|
||||
- `errors.go` — sentinels callers branch on with `errors.Is`.
|
||||
- `pricing/` — embedded-default + hot-reload override table with
|
||||
symlink-safe Unix loader (build-tagged stub elsewhere).
|
||||
- `fixtures/` — captured request/response/stream bodies the tests replay.
|
||||
|
||||
The package carries zero proxy-framework dependencies so the same parsers can
|
||||
be reused later by a WASM adapter
|
||||
([parser.go:1–6](../../../proxy/internal/llm/parser.go)).
|
||||
|
||||
## Files
|
||||
|
||||
| File | LOC | Notes |
|
||||
|---|---:|---|
|
||||
| `parser.go` | 104 | Interface + factories + `Provider{Unknown,OpenAI,Anthropic,Bedrock}` enum |
|
||||
| `openai.go` | 347 | Chat Completions + Completions + Responses API; cached_tokens subset |
|
||||
| `openai_test.go` | 222 | 11 tests; fixture replay + cached/Responses-API matrix |
|
||||
| `anthropic.go` | 172 | Messages + legacy `/v1/complete`; cache_read + cache_creation additive |
|
||||
| `anthropic_test.go` | 154 | 7 tests including streaming-extraction-skipped contract |
|
||||
| `bedrock.go` | 190 | AWS Bedrock InvokeModel (snake_case) + Converse (camelCase) response shapes; model lives in URL path |
|
||||
| `bedrock_test.go` | — | InvokeModel + Converse usage shapes; AWS event-stream content-type → `ErrStreamingUnsupported` on buffered `ParseResponse` |
|
||||
| `sse.go` | 117 | `bufio`-backed scanner; CRLF normalised; trailing-event handling |
|
||||
| `sse_test.go` | 175 | 12 tests; fixture replay + multiline + size limits |
|
||||
| `parser_test.go` | 53 | `Parsers()`, `DetectParser`, provider enum values |
|
||||
| `errors.go` | 31 | 6 sentinels: `Err{Unknown,Unsupported}Provider/Model`, `Err{NotLLM,Malformed}Response`, `ErrStreamingUnsupported`, `ErrMalformedRequest` |
|
||||
| `pricing/pricing.go` | 421 | `Loader`, `Table`, `Entry`; embedded defaults + atomic swap + mtime reload |
|
||||
| `pricing/pricing_unix.go` | 69 | `O_NOFOLLOW` + fstat-from-FD + 1 MiB cap |
|
||||
| `pricing/pricing_other.go` | 21 | Stub returning "not supported on this platform" |
|
||||
| `pricing/pricing_test.go` | 432 | 21 tests — symlink rejection, reload race, path traversal, oversize |
|
||||
| `pricing/defaults_pricing.yaml` | 85 | go:embed source of truth |
|
||||
| `fixtures/*` | 21–59 | OAI chat/responses/stream + Anthro messages/stream + pricing starter |
|
||||
|
||||
## Request body → parser dispatch
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[HTTP request<br/>URL + JSON body] --> B{ParserByName?<br/>provider_id config set}
|
||||
B -- yes --> P[matched Parser]
|
||||
B -- no --> C[DetectParser]
|
||||
C --> D{loop Parsers<br/>OpenAIParser, AnthropicParser, BedrockParser}
|
||||
D -- DetectFromURL match --> P
|
||||
D -- no match --> X[ok=false<br/>middleware skips]
|
||||
P --> E[ParseRequest body]
|
||||
E -->|err: ErrMalformedRequest| Y[middleware emits provider only]
|
||||
E --> F[RequestFacts<br/>model + stream]
|
||||
P --> G[ExtractPrompt body]
|
||||
G --> H[joinMessages<br/>extractContentParts<br/>decodeStringOrJoin]
|
||||
H --> I[prompt text<br/>or empty]
|
||||
F --> J[stamps llm.model + llm.stream]
|
||||
I --> K[stamps llm.request_prompt_raw<br/>subject to capture_prompt gate]
|
||||
```
|
||||
|
||||
OpenAI's URL hints
|
||||
([openai.go:27–33](../../../proxy/internal/llm/openai.go)) include
|
||||
both `/v1/chat/completions` and the bare `/chat/completions` — the latter
|
||||
covers Cloudflare AI Gateway, which rewrites the canonical version segment.
|
||||
Anthropic's hints are `/v1/messages` and `/v1/complete`
|
||||
([anthropic.go:14–17](../../../proxy/internal/llm/anthropic.go)).
|
||||
Both implementations use case-insensitive substring matching so a proxy prefix
|
||||
strip / rewrite doesn't defeat detection.
|
||||
|
||||
`ParserByName` ([parser.go:93–103](../../../proxy/internal/llm/parser.go))
|
||||
is the **agent-network bypass**: the synthesiser knows which parser to use
|
||||
because it built the synth service from the catalog, so it stamps
|
||||
`provider_id` on the parser config and the middleware skips URL sniffing
|
||||
entirely. This is what makes the same parser set work whether the request
|
||||
flows to OpenAI direct, to LiteLLM, to Portkey, or to any gateway with a
|
||||
non-canonical URL shape.
|
||||
|
||||
**Path-routed providers (Vertex AI, Bedrock) bypass both `ParserByName` and
|
||||
`DetectParser`.** The model and the parser surface live in the URL path, so the
|
||||
request middleware extracts them directly (`parseVertexPath` /
|
||||
`parseBedrockPath`) before the parser-selection step. For Vertex the publisher
|
||||
segment picks the parser (`anthropic` → Anthropic parser; `google`/Gemini →
|
||||
none, request denied as unmeterable). For Bedrock the dedicated `BedrockParser`
|
||||
handles the response. Full treatment in
|
||||
[50-path-routed-providers.md](./50-path-routed-providers.md).
|
||||
|
||||
## Streaming response → SSE chunker → response parser → completion + token count
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant U as upstream LLM
|
||||
participant LR as llm_response_parser<br/>(OnResponse)
|
||||
participant S as llm.NewScanner<br/>(SSE framer)
|
||||
participant P as Parser-specific accumulator<br/>(accumulateOpenAIStream<br/>or accumulateAnthropicStream)
|
||||
|
||||
U-->>LR: text/event-stream<br/>(buffered prefix in RespBody)
|
||||
LR->>S: NewScanner(bytes.NewReader(body))
|
||||
loop until EOF or [DONE]
|
||||
S-->>LR: Event{Type, Data}
|
||||
LR->>P: dispatch per event.Type<br/>(OpenAI: data-only<br/>Anthropic: named events)
|
||||
P-->>P: accumulate completion text<br/>track usage from final frame
|
||||
end
|
||||
P-->>LR: llm.Usage + completion string
|
||||
LR->>LR: appendUsage stamps<br/>llm.{input,output,total,cached_input,cache_creation}_tokens
|
||||
LR->>LR: truncateCompletion(3500 bytes, rune-safe)
|
||||
LR->>LR: redactPII if redact_pii && captureCompletion
|
||||
```
|
||||
|
||||
`Scanner.Next`
|
||||
([sse.go:44–87](../../../proxy/internal/llm/sse.go)) returns one
|
||||
event per `\n\n` boundary; multiple `data:` lines join with `\n`; comment lines
|
||||
(starting with `:`) are skipped per the SSE spec; a trailing event without a
|
||||
closing blank line is still returned before `io.EOF` so a server that closes
|
||||
the connection cleanly doesn't lose the last frame
|
||||
([sse.go:55–58](../../../proxy/internal/llm/sse.go)). CRLF is
|
||||
normalised in `trimEOL` so fixtures captured from live servers replay
|
||||
unchanged.
|
||||
|
||||
## Per-provider
|
||||
|
||||
### OpenAI
|
||||
|
||||
[openai.go:54–67](../../../proxy/internal/llm/openai.go) defines
|
||||
`openAIRequest` with three prompt fields: `messages` (Chat Completions),
|
||||
`prompt` (legacy), `input` (Responses API). The decoder uses
|
||||
`json.RawMessage` so each shape is parsed lazily.
|
||||
|
||||
`ParseResponse`
|
||||
([openai.go:117–146](../../../proxy/internal/llm/openai.go))
|
||||
accepts both naming conventions: Chat Completions returns
|
||||
`prompt_tokens`/`completion_tokens`, Responses API returns
|
||||
`input_tokens`/`output_tokens`. `pickInt64` prefers Responses-API names and
|
||||
falls back — same parser handles both endpoints without per-route config.
|
||||
`openAICachedTokens` mirrors the fallback for
|
||||
`input_tokens_details.cached_tokens` vs `prompt_tokens_details.cached_tokens`.
|
||||
|
||||
**Key invariant:** `CachedInputTokens` for OpenAI is a SUBSET of
|
||||
`InputTokens`. The cost meter clamps to guard against malformed upstream
|
||||
responses where `cached > total`.
|
||||
|
||||
### Anthropic
|
||||
|
||||
[anthropic.go:37–49](../../../proxy/internal/llm/anthropic.go)
|
||||
defines `anthropicRequest` covering Messages API (`system` + `messages[]`)
|
||||
and legacy `/v1/complete` (`prompt` string). `ExtractPrompt` emits
|
||||
`system: <text>` first when present, then per-message `role: content`.
|
||||
|
||||
`ParseResponse`
|
||||
([anthropic.go:82–104](../../../proxy/internal/llm/anthropic.go))
|
||||
fills three independent token buckets: `InputTokens`, `CacheReadInputTokens`,
|
||||
`CacheCreationInputTokens`. The latter two are **additive** to `InputTokens` (not a subset of it).
|
||||
`TotalTokens` sums all four buckets (the three input buckets plus output tokens) so downstream dashboards render one "tokens"
|
||||
number without double-counting.
|
||||
|
||||
`ExtractCompletion` walks `content[]` `{type, text}` parts and concatenates
|
||||
non-empty text with newlines, falling back to legacy `completion`.
|
||||
|
||||
### Bedrock
|
||||
|
||||
[bedrock.go](../../../proxy/internal/llm/bedrock.go) implements the
|
||||
`Parser` interface for the AWS Bedrock runtime. Bedrock is **path-routed**: the
|
||||
model lives in the URL (`/model/{id}/{action}`), so the request middleware
|
||||
extracts it (see [50-path-routed-providers.md](./50-path-routed-providers.md))
|
||||
and `ParseRequest` is a deliberate no-op. The parser's real work is on the
|
||||
response leg, covering both Bedrock body shapes:
|
||||
|
||||
- **InvokeModel** — vendor-native. Anthropic-on-Bedrock returns snake_case usage
|
||||
(`input_tokens`, `output_tokens`, `cache_read_input_tokens`,
|
||||
`cache_creation_input_tokens`) with the same additive cache buckets as
|
||||
first-party Anthropic.
|
||||
- **Converse** — unified camelCase (`inputTokens`, `outputTokens`,
|
||||
`totalTokens`). `firstNonZero` folds the two naming conventions into one
|
||||
`Usage`; when Converse omits `totalTokens` the parser sums the buckets.
|
||||
|
||||
`ProviderName()` returns `"bedrock"`, which has its own `defaults_pricing.yaml` block,
|
||||
keyed by the **normalised** model id (region prefix + version suffix stripped by
|
||||
the request parser). `ParseResponse` returns `ErrStreamingUnsupported` for an
|
||||
AWS binary event-stream content-type (`application/vnd.amazon.eventstream`,
|
||||
`isAWSEventStream`) so the caller routes to the streaming accumulator instead.
|
||||
|
||||
### SSE framing
|
||||
|
||||
`Scanner` is `bufio`-backed, 64 KiB read buffer, 1 MiB max line so a
|
||||
malicious upstream can't blow process memory
|
||||
([sse.go:33–38, 97–100](../../../proxy/internal/llm/sse.go)).
|
||||
`splitField` strips one space after the `:` per the SSE spec. Documented
|
||||
`not safe for concurrent use`; every consumer creates a fresh scanner per
|
||||
response body. Streaming accumulators live in the middleware package
|
||||
([llm_response_parser/streaming.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go))
|
||||
but use `llm.NewScanner` so the framing contract stays here.
|
||||
|
||||
### Pricing catalog
|
||||
|
||||
`Table.Cost`
|
||||
([pricing.go:129–174](../../../proxy/internal/llm/pricing/pricing.go))
|
||||
is the cost formula — the most security-relevant math in this module:
|
||||
|
||||
| Provider | Formula |
|
||||
|---|---|
|
||||
| `openai` | `(inTokens − clamped) × InputPer1K + clamped × CachedInputPer1K + outTokens × OutputPer1K` where `clamped = min(cachedInput, inTokens)` |
|
||||
| `anthropic`, `bedrock` | `inTokens × InputPer1K + cachedInput × CacheReadPer1K + cacheCreation × CacheCreationPer1K + outTokens × OutputPer1K` |
|
||||
| default | `inTokens × InputPer1K + outTokens × OutputPer1K` |
|
||||
|
||||
`bedrock` shares the Anthropic additive-cache formula
|
||||
([pricing.go:172–174](../../../proxy/internal/llm/pricing/pricing.go)):
|
||||
Anthropic-on-Bedrock reports the same additive cache buckets, while non-Anthropic
|
||||
Bedrock models (Nova, Llama) simply report zero in those buckets so cost reduces
|
||||
to `input + output`.
|
||||
|
||||
Each per-bucket rate falls back to `InputPer1K` when zero — operators opt in
|
||||
to discounts by setting the field.
|
||||
|
||||
`Loader`
|
||||
([pricing.go:212–268](../../../proxy/internal/llm/pricing/pricing.go))
|
||||
overlays an optional `pricing.yaml` from data-dir on top of the go:embed
|
||||
defaults. Atomic pointer swap means readers never observe a partial update.
|
||||
The mtime-poll reloader (30s default cadence) keeps the previous table on
|
||||
parse failure so cost annotation never goes blank during a botched edit.
|
||||
|
||||
`defaults_pricing.yaml` is the source of truth for built-in pricing.
|
||||
Operator overrides only carry the entries they want to change.
|
||||
|
||||
## Public contracts
|
||||
|
||||
**`Parser` interface**
|
||||
([parser.go:50–66](../../../proxy/internal/llm/parser.go)):
|
||||
|
||||
```go
|
||||
type Parser interface {
|
||||
Provider() Provider
|
||||
ProviderName() string
|
||||
DetectFromURL(path string) bool
|
||||
ParseRequest(body []byte) (RequestFacts, error)
|
||||
ParseResponse(status int, contentType string, body []byte) (Usage, error)
|
||||
ExtractPrompt(body []byte) string
|
||||
ExtractCompletion(status int, contentType string, body []byte) string
|
||||
}
|
||||
```
|
||||
|
||||
Adding a provider means implementing this interface and appending to the
|
||||
slice returned by `Parsers()` ([parser.go:78–84](../../../proxy/internal/llm/parser.go)).
|
||||
Order matters: `DetectFromURL` ties resolve by registration order.
|
||||
`Parsers()` today returns `{OpenAIParser, AnthropicParser, BedrockParser}`.
|
||||
|
||||
**`Provider` enum**
|
||||
([parser.go:8–18](../../../proxy/internal/llm/parser.go)):
|
||||
`ProviderUnknown = 0`, `ProviderOpenAI = 1`, `ProviderAnthropic = 2`,
|
||||
`ProviderBedrock = 3`. Numeric values are not persisted anywhere today, but treat
|
||||
them as wire-stable — new providers must take fresh numbers.
|
||||
|
||||
**`Pricing` lookup**
|
||||
([pricing.go:129](../../../proxy/internal/llm/pricing/pricing.go)):
|
||||
|
||||
```go
|
||||
func (t *Table) Cost(provider, model string, inTokens, outTokens, cachedInput, cacheCreation int64) (float64, bool)
|
||||
```
|
||||
|
||||
Nil-safe: `t.Cost` on a nil receiver returns `(0, false)`
|
||||
([pricing.go:130–132](../../../proxy/internal/llm/pricing/pricing.go)).
|
||||
`ok=false` means provider or model is absent from the loaded table; the caller
|
||||
emits `cost.skipped=unknown_model`.
|
||||
|
||||
## Invariants
|
||||
|
||||
1. **Cross-platform pricing build.** `pricing_unix.go` carries the only
|
||||
functional `loadPricing` (uses `syscall.O_NOFOLLOW` and `f.Stat()` on an
|
||||
open descriptor — both Unix-only). `pricing_other.go` is a build-tag
|
||||
fallback that returns `"not supported on this platform"`
|
||||
([pricing_other.go:14–16](../../../proxy/internal/llm/pricing/pricing_other.go)).
|
||||
The proxy is Linux-only in production today; a Windows port needs an
|
||||
equivalent path-as-handle implementation. Reviewers building on Windows
|
||||
should expect this surface to return an error at startup if an override
|
||||
file is configured.
|
||||
|
||||
2. **SSE scanner handles partial chunks.** A buffered prefix that doesn't end
|
||||
in `\n\n` still yields its accumulated event before `io.EOF`
|
||||
([sse.go:55–58](../../../proxy/internal/llm/sse.go)). Tests:
|
||||
`TestSSEScanner_OpenAIFixture`, `TestSSEScanner_AnthropicFixture`,
|
||||
`TestSSEScanner_MultilineData`, `TestSSEScanner_CRLF`. The streaming
|
||||
accumulators ride on this: `accumulateAnthropicStream` and
|
||||
`accumulateOpenAIStream` `break` on any scanner error to return partial
|
||||
usage rather than aborting
|
||||
([streaming.go:68–73, 144–150](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go)).
|
||||
|
||||
3. **`defaults_pricing.yaml` is the source of truth.** Compiled into the
|
||||
binary via `//go:embed`
|
||||
([pricing.go:29–30](../../../proxy/internal/llm/pricing/pricing.go)).
|
||||
`DefaultTable()` parses once and panics on parse failure
|
||||
([pricing.go:42–49](../../../proxy/internal/llm/pricing/pricing.go))
|
||||
— by design: a broken embedded YAML must not ship to production.
|
||||
|
||||
4. **Loader path validation.** `resolveMiddlewareDataPath`
|
||||
([pricing.go:370–394](../../../proxy/internal/llm/pricing/pricing.go))
|
||||
rejects absolute paths, traversal segments, and basenames that fail
|
||||
`basenameRegex = ^[a-zA-Z0-9._-]+$`. The resolved path must remain
|
||||
inside `baseDir` even after `filepath.Clean`. Tests:
|
||||
`TestNewLoader_PathValidation`, `TestNewLoader_PathValidation_Extended`,
|
||||
`TestNewLoader_SymlinkOutsideBaseDirRejected`, `TestNewLoader_SymlinkRejected`.
|
||||
|
||||
5. **Unix loader symlink safety.** `O_NOFOLLOW` on open, `f.Stat()` on the
|
||||
open descriptor (never re-stat by path), `info.Mode().IsRegular()` check,
|
||||
`io.LimitReader(f, maxPricingBytes+1)` with a final size assertion
|
||||
([pricing_unix.go:25–57](../../../proxy/internal/llm/pricing/pricing_unix.go)).
|
||||
A mid-read symlink swap is detected because the fstat is on the original
|
||||
fd. Test: `TestNewLoader_RejectsOversizedFile_FixesM4`.
|
||||
|
||||
6. **`yaml.NewDecoder(...).KnownFields(true)`**
|
||||
([pricing.go:397–398](../../../proxy/internal/llm/pricing/pricing.go))
|
||||
rejects YAML files that carry fields not in the schema. A typo in an
|
||||
operator override file fails loud instead of silently zeroing rates.
|
||||
|
||||
## Things to scrutinise
|
||||
|
||||
**Correctness.** Verify that the OpenAI cached-prompt clamp at
|
||||
[pricing.go:147–149](../../../proxy/internal/llm/pricing/pricing.go)
|
||||
short-circuits before subtraction. `Anthropic.TotalTokens` sums all four
|
||||
buckets (in + out + cache_read + cache_creation) — downstream dashboards
|
||||
need to know this differs from `input + output`.
|
||||
`OpenAIParser.ExtractPrompt` falls through `messages → input → prompt`; a
|
||||
request sending all three reports only `messages` (uncommon but worth
|
||||
noting).
|
||||
|
||||
**Security.** `Scanner.maxLine = 1 MiB`; a 2 MiB single-line `data:` event
|
||||
errors from `Scanner.Next` and both accumulators stop with partial usage.
|
||||
The 1 MiB pricing-file cap is orders of magnitude larger than any realistic file. Confirm
|
||||
new schema additions are mirrored in both `pricingFile` and `Entry`;
|
||||
otherwise `KnownFields(true)` rejects operator overrides that use a field
|
||||
missing from the decoded schema.
|
||||
|
||||
**Concurrency.** `Loader.table` is `atomic.Pointer[Table]`; readers never
|
||||
block or see a torn table. `Loader.Reload` is one goroutine, cancelled via
|
||||
context (`TestLoader_ReloadBackgroundLoopCancellation`). `DefaultTable()`
|
||||
uses `sync.Once`. Per-call `Scanner` instances mean no shared state across
|
||||
concurrent response-parser calls.
|
||||
|
||||
**Perf.** `Table.Cost` is two map lookups + multiplications, O(1).
|
||||
`Scanner.Next` is one `ReadString('\n')` per line. The pricing reload poll runs every 30s by default.
|
||||
|
||||
**Observability.** Reload failures are counted via `metric.Int64Counter` keyed by
|
||||
`plugin`; warning log rate-limited at 5 min so a broken file doesn't flood.
|
||||
Parser errors return sentinels — middleware uses `errors.Is` to map to the
|
||||
right `cost.skipped` reason.
|
||||
|
||||
## Test coverage
|
||||
|
||||
| File | Tests | Coverage highlights |
|
||||
|---|---:|---|
|
||||
| `parser_test.go` | 3 | `Parsers()` shape lock, `DetectParser` URL matrix, provider enum stability |
|
||||
| `openai_test.go` | 11 | Chat Completions + Responses API + legacy `prompt`; cached-tokens subset for both naming conventions; fixture replays |
|
||||
| `anthropic_test.go` | 7 | Messages + legacy `/v1/complete`; streaming REJECTED on `ParseResponse` (must use scanner); fixture replays |
|
||||
| `sse_test.go` | 12 | Fixture replay both providers; multiline `data:`; CRLF; comment skip; trailing-event-without-blank-line; oversize rejection |
|
||||
| `pricing/pricing_test.go` | 21 | Provider-shape switch; cached-rate fallback; cached-clamp; symlink rejection (target outside basedir + symlink to file); path validation matrix; oversize rejection; reload-keeps-previous-on-parse-error; mtime change detection; goroutine cancellation |
|
||||
|
||||
**Fixtures** ([proxy/internal/llm/fixtures/](../../../proxy/internal/llm/fixtures/)):
|
||||
`openai_chat_completion.json` (chat.completions with usage),
|
||||
`openai_responses.json` (Responses API shape),
|
||||
`openai_stream.txt` (3 deltas + usage + `[DONE]`),
|
||||
`anthropic_messages.json` (Messages API non-streaming),
|
||||
`anthropic_stream.txt` (full 7-event sequence: message_start →
|
||||
content_block_{start,delta×2,stop} → message_delta (usage) → message_stop),
|
||||
`pricing.yaml` (realistic-pricing starter for operator overrides).
|
||||
|
||||
## Cross-references
|
||||
|
||||
- Sibling: [31-proxy-middleware-builtin.md](./31-proxy-middleware-builtin.md)
|
||||
— the chain that calls `llm.Parsers()`, `llm.ParserByName`,
|
||||
`llm.NewScanner`, `pricing.NewLoader`.
|
||||
- Path-routed providers (Vertex AI + Bedrock), credential syntax, and the
|
||||
Bedrock AWS event-stream accumulator:
|
||||
[50-path-routed-providers.md](./50-path-routed-providers.md).
|
||||
- Direct callers: `llm_request_parser/middleware.go:82–94`,
|
||||
`llm_response_parser/middleware.go:113–123`,
|
||||
`llm_response_parser/streaming.go:65, 142`, `cost_meter/factory.go:49–57`.
|
||||
- Related elsewhere: the agent-network synthesiser stamping `provider_id`
|
||||
is covered in the management-side module guide; proxy server boot +
|
||||
`FactoryContext` construction is covered in the proxy-framework guide.
|
||||
194
docs/agent-networks/modules/33-proxy-runtime.md
Normal file
194
docs/agent-networks/modules/33-proxy-runtime.md
Normal file
@@ -0,0 +1,194 @@
|
||||
# proxy/runtime — translate + serve + log
|
||||
|
||||
> **Risk level:** High — every config push from management is translated here, and the chain runs on every HTTP request to a synth target.
|
||||
> **Backward-compat impact:** Additive at the wire (`PathTargetOptions.middlewares`, `agent_network`, `disable_access_log`, capture caps) and on the proxy `Server` struct (`MiddlewareDataDir`, `MiddlewareCaptureBudgetBytes`). Non-agent-network targets stay on the no-middleware fast path.
|
||||
|
||||
## Module boundary
|
||||
|
||||
Turns the synth-service wire format from `ProxyService.SyncMappings`/`GetMappingUpdate` into in-process middleware chains and runs them on top of the existing `httputil.ReverseProxy`. Four concerns: (a) **translate** — `proto.MiddlewareConfig` → validated `middleware.Spec` (proxy/middleware_translate.go) + self-register the eight built-ins (proxy/middleware_register.go); (b) **boot + rebuild** — construct the `middleware.Manager`, share the OTel meter, install the live-service check, rebuild per-path chains on every `addMapping`/`modifyMapping` (proxy/server.go); (c) **serve** — resolve chain at request time, capture bodies under a global budget, invoke `RunRequest`/`RunResponse`/`RunTerminal`, render deny responses, apply `UpstreamRewrite` (proxy/internal/proxy/reverseproxy.go); (d) **log + tag** — emit access-log entries with the new `agent_network` flag, gate emission on `EnableLogCollection` via `DisableAccessLog` (proxy/internal/accesslog).
|
||||
|
||||
**Inert for non-agent-network targets**: nil or empty chain → existing fast path (reverseproxy.go:127-139); `SuppressAccessLog` defaults false so the access-log middleware emits unchanged.
|
||||
|
||||
## Files
|
||||
|
||||
| Path | Role |
|
||||
| ---- | ---- |
|
||||
| proxy/middleware_translate.go | proto→Spec translation; slot/failmode/timeout mapping; caps |
|
||||
| proxy/middleware_translate_test.go | translator unit tests |
|
||||
| proxy/middleware_register.go | blank-imports the eight builtins for `init()` registration |
|
||||
| proxy/server.go | `initMiddlewareManager`, `rebuildMiddlewareChains`, `isLiveService`, `buildMiddlewareBindings`, new Server fields, `protoToMapping` stamps AgentNetwork/DisableAccessLog/CaptureConfig/Middlewares |
|
||||
| proxy/internal/proxy/reverseproxy.go | `WithMiddlewareManager`, chain dispatch, body capture, `applyUpstreamRewrite`/`Headers`, `buildRequestInput`, response-leg respInput identity fields |
|
||||
| proxy/internal/proxy/reverseproxy_test.go | `TestBuildRequestInput_PropagatesIdentityAndGroups` |
|
||||
| proxy/internal/proxy/context.go | `agentNetwork`, `suppressAccessLog`, `userGroupNames` on `CapturedData` |
|
||||
| proxy/internal/proxy/servicemapping.go | new `PathTarget` fields |
|
||||
| proxy/internal/proxy/agent_network_chain_realstack_test.go | end-to-end self-contained chain test |
|
||||
| proxy/internal/accesslog/logger.go | `logEntry.AgentNetwork` → `proto.AccessLog` |
|
||||
| proxy/internal/accesslog/middleware.go | reads `GetAgentNetwork()`; gates `l.log` on `!GetSuppressAccessLog()` |
|
||||
| proxy/internal/accesslog/middleware_test.go | suppress/default/preserves-usage assertions |
|
||||
| proxy/internal/auth/middleware_test.go | tunnel-peer group propagation contract |
|
||||
| proxy/internal/metrics/metrics.go | `Meter()` getter for the middleware manager |
|
||||
|
||||
## Architecture & flow
|
||||
|
||||
### Synth-service ingestion → translate → register → serve
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Management SyncMappings/GetMappingUpdate] --> B["processMappings\nserver.go:1492"]
|
||||
B --> C{Mapping type}
|
||||
C -->|CREATED| D["addMapping → setupHTTPMapping → updateMapping"]
|
||||
C -->|MODIFIED| E["modifyMapping → cleanupMappingRoutes → setupHTTPMapping → updateMapping"]
|
||||
C -->|REMOVED| F["removeMapping → cleanupMappingRoutes → invalidateMiddlewareChains"]
|
||||
D --> G["protoToMapping\nserver.go:2181"]
|
||||
E --> G
|
||||
G --> H["translateMiddlewareConfigs\nmiddleware_translate.go:55"]
|
||||
G --> I["translateMiddlewareCaptureConfig\nmiddleware_translate.go:18"]
|
||||
H --> J["[]middleware.Spec on PathTarget"]
|
||||
I --> K["*bodytap.Config on PathTarget"]
|
||||
J --> L["proxy.AddMapping\nservicemapping.go:118"]
|
||||
K --> L
|
||||
L --> M["rebuildMiddlewareChains\nserver.go:2017 → Manager.Rebuild"]
|
||||
F --> N["Manager.Invalidate(serviceID)"]
|
||||
```
|
||||
|
||||
### Per-request lifecycle through the chain + accesslog
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant C as Client
|
||||
participant M as accesslog.Middleware
|
||||
participant A as auth.Middleware (Protect)
|
||||
participant RP as ReverseProxy.ServeHTTP
|
||||
participant CH as middleware.Chain
|
||||
participant U as Upstream
|
||||
C->>M: HTTP request
|
||||
M->>M: NewCapturedData(requestID), WithCapturedData(ctx)
|
||||
M->>A: next.ServeHTTP
|
||||
A->>A: Private → ValidateTunnelPeer → stamp UserID/Email/Groups/GroupNames/AuthMethod
|
||||
A->>RP: next.ServeHTTP
|
||||
RP->>RP: findTargetForRequest → targetResult
|
||||
RP->>RP: stamp ServiceID/AccountID/AgentNetwork/SuppressAccessLog on CapturedData
|
||||
RP->>RP: resolveChain via Manager.ChainFor
|
||||
alt chain == nil or Empty
|
||||
RP->>U: httputil.ReverseProxy.ServeHTTP (fast path)
|
||||
else chain non-empty
|
||||
RP->>RP: bodytap.CaptureRequest (global budget)
|
||||
RP->>CH: RunRequest
|
||||
CH-->>RP: denyOutput? requestMeta + upstreamRewrite
|
||||
alt deny
|
||||
RP->>C: RenderDenyResponse
|
||||
else allow
|
||||
RP->>RP: capturingWriter + applyUpstreamRewrite/Headers
|
||||
RP->>U: httputil.ReverseProxy.ServeHTTP(respWriter)
|
||||
U-->>RP: response
|
||||
RP->>CH: RunResponse (respInput carries UserGroups)
|
||||
RP->>CH: RunTerminal (merged request+response metadata)
|
||||
end
|
||||
end
|
||||
RP-->>M: handler returns
|
||||
M->>M: build logEntry incl. AgentNetwork
|
||||
alt SuppressAccessLog == true
|
||||
M->>M: skip l.log; still trackUsage
|
||||
else default
|
||||
M->>M: l.log → goroutine SendAccessLog
|
||||
end
|
||||
```
|
||||
|
||||
### EnableLogCollection suppression path
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
S["agentnetwork.Settings.EnableLogCollection"] --> B["synthesizer: target.DisableAccessLog = !EnableLogCollection"]
|
||||
B --> P["proto PathTargetOptions.disable_access_log (field 13)"]
|
||||
P --> T["protoToMapping reads GetDisableAccessLog()\nserver.go:2211"]
|
||||
T --> M["PathTarget.DisableAccessLog\nservicemapping.go:47"]
|
||||
M --> R["ServeHTTP: cd.SetSuppressAccessLog\nreverseproxy.go:106"]
|
||||
R --> G["accesslog middleware: if !GetSuppressAccessLog l.log\nmiddleware.go:95"]
|
||||
R --> U["trackUsage unconditional — bandwidth telemetry preserved"]
|
||||
```
|
||||
|
||||
**Ingestion** lands as a `ProxyMapping` batch on `handleSyncMappingsStream`/`handleMappingStream`. `processMappings` dispatches to `addMapping`/`modifyMapping`/`removeMapping`; HTTP goes `setupHTTPMapping → updateMapping → protoToMapping`. `protoToMapping` (server.go:2181) is the single translation surface that materialises `[]middleware.Spec`, `*bodytap.Config`, `AgentNetwork`, `DisableAccessLog` onto each `PathTarget`; `updateMapping` finishes with `s.proxy.AddMapping(m)` (atomic swap under `mappingsMux`) and `s.rebuildMiddlewareChains(svcID, m)`.
|
||||
|
||||
At **request time** the access-log middleware stamps `CapturedData`; the auth chain runs (Private services lift `peer_group_ids` from `ValidateTunnelPeer` — auth/middleware_test.go:322). `ReverseProxy.ServeHTTP` resolves the chain; nil or empty → original `httputil.ReverseProxy`, no body capture. When a chain matches, body is captured under the global budget, `RunRequest` produces an `UpstreamRewrite` (`llm_router` selects a provider, rewrites scheme/host/path, injects `Authorization`), and `RunResponse`+`RunTerminal` run after the upstream returns. The terminal slot sees the merged metadata bag — that's how `llm_limit_record` ships the consumption sample. The **access-log** addition: `logEntry.AgentNetwork` from `GetAgentNetwork()` onto `proto.AccessLog.AgentNetwork`; the gate at middleware.go:95 honors `EnableLogCollection`, skipping `l.log` but keeping `trackUsage` so bandwidth telemetry survives.
|
||||
|
||||
## Public contracts touched
|
||||
|
||||
- `proxy.Server.MiddlewareDataDir` (string) — base dir for file-backed middleware config (server.go:238-241).
|
||||
- `proxy.Server.MiddlewareCaptureBudgetBytes` (int64) — process-wide capture cap; defaults to 256 MiB (server.go:248-250).
|
||||
- `proxy/internal/proxy.WithMiddlewareManager(*middleware.Manager) Option` — new option on `NewReverseProxy`; nil keeps the fast path (reverseproxy.go:48-56).
|
||||
- `proxy/internal/proxy.PathTarget` adds `Middlewares`, `CaptureConfig`, `AgentNetwork`, `DisableAccessLog` (servicemapping.go:27-51), all zero-default.
|
||||
- `proxy/internal/proxy.CapturedData` adds `agentNetwork`, `suppressAccessLog`, `userGroupNames` behind `sync.RWMutex`; slices deep-copied (context.go:47-66, 183-258).
|
||||
- `accesslog.logEntry.AgentNetwork` + `proto.AccessLog.AgentNetwork` (logger.go:131, 268).
|
||||
- `metrics.Metrics.Meter()` exposes the OTel meter for the middleware manager (metrics.go:53-58).
|
||||
|
||||
## Invariants
|
||||
|
||||
- **Synth-service updates are live (no proxy restart).** Every `MODIFIED` flows through `modifyMapping → cleanupMappingRoutes` (invalidates chains) `→ setupHTTPMapping → updateMapping → rebuildMiddlewareChains`. **ProxyMapping.Private preservation:** the relevant logic lives in `management/internals/shared/grpc/proxy.go:shallowCloneMapping`, not this module, but it surfaces here — if a `MODIFIED` synth service arrives with `private=false`, auth skips `ValidateTunnelPeer`, `CapturedData.UserGroups` stays empty, and `llm_router` denies with `llm_policy.no_authorised_provider` until a management restart re-pushes the snapshot. This module assumes `mapping.GetPrivate()` is correct on every batch.
|
||||
- **`EnableLogCollection=false` suppresses access-log writes but middleware still runs.** Gate is one `if !cd.GetSuppressAccessLog()` immediately around `l.log(entry)` (middleware.go:95); `trackUsage` runs below the gate. Locked by `TestMiddleware_SuppressAccessLog_PreservesUsageTracking` (middleware_test.go:139).
|
||||
- **`agent_network` flag on access-log entries is set when the chain processed the request.** Source `target.AgentNetwork`, stamped at reverseproxy.go:105, read at accesslog/middleware.go:86.
|
||||
- **auth → builtin group propagation.** `Protect` writes `UserGroups`/`UserGroupNames`; `buildRequestInput` (reverseproxy.go:333) copies them into `middleware.Input`. The response-leg `respInput` (reverseproxy.go:196-223) also carries `UserEmail`/`UserGroups`/`UserGroupNames` — `llm_limit_record` needs `UserGroups` to ship `group_ids` so management's group-targeted budget rules match (comment at reverseproxy.go:211-215).
|
||||
- **Empty chains stay on the fast path.** `ServeHTTP` skips body capture and the run sequence when `chain == nil || chain.Empty()` (reverseproxy.go:127).
|
||||
- **Self-registration is the only way a builtin reaches the registry.** `middleware_register.go` blank-imports each builtin; `init()` adds the factory to `mwbuiltin.DefaultRegistry()`. Missing it → the translator drops the entry with a warning (middleware_translate.go:97).
|
||||
|
||||
## Things to scrutinize
|
||||
|
||||
### Correctness
|
||||
- **Translate edge cases** — drops on nil cfg, empty ID, unknown ID, UNSPECIFIED slot; each logs one warn; volume bounded by `MaxMiddlewaresPerChain`.
|
||||
- **Re-translate without dropping in-flight requests** — `Manager.Rebuild` is the only call from `rebuildMiddlewareChains`. Reverse proxy reads `ChainFor` once per request (reverseproxy.go:327) and runs the captured `*Chain` for the whole request. Verify in module 30 that `Rebuild` swaps atomically.
|
||||
- **ProxyMapping.Private preservation** — enforced management-side in `shallowCloneMapping`. Proxy-side regression catches: `TestProtect_PrivateService_TunnelPeerGroupsPropagate` + the integration test.
|
||||
- **Body-capture cleanup** — `defer releaseBudget()` (reverseproxy.go:145) and `defer capturingWriter.Release()` (reverseproxy.go:180) must run on every return; confirm no future `return` lands between acquisition and defer.
|
||||
- **`applyUpstreamRewrite` clones the URL** — `cloned := *orig` value-copies `*url.URL`; safe because overwritten fields are strings, not slices/maps (reverseproxy.go:285-292).
|
||||
|
||||
### Security
|
||||
- **Translate validates every config** — registry membership rejects unknown IDs; UNSPECIFIED slot drops; ID-less drops; raw config copied (not aliased) at middleware_translate.go:109.
|
||||
- **`AuthHeader`/`StripHeaders` only reachable via `UpstreamRewrite`** — regular mutation surface goes through the framework denylist (`Authorization`/`Cookie` blocked); only the router middleware can replace `Authorization` (reverseproxy.go:296-304). Confirm in module 30 nothing outside the proxy-trusted path populates `UpstreamRewrite.AuthHeader`.
|
||||
- **`stampNetBirdIdentity` strips client-sent values first** (reverseproxy.go:742-743) — anti-spoof for `X-NetBird-User`/`X-NetBird-Groups`; control chars filtered; comma-bearing labels dropped (reverseproxy_test.go:1217/:1243/:1193).
|
||||
- **Auth → group propagation** — `auth/middleware_test.go:322` and `:366` cover the contract. If auth ever stops calling `ValidateTunnelPeer` for Private services, every agent-network request silently denies.
|
||||
|
||||
### Concurrency
|
||||
- **Chain replacement under in-flight requests** — `findTargetForRequest` takes `mappingsMux.RLock`; `AddMapping` writes. `resolveChain` calls `ChainFor` once; even if `Rebuild` swaps mid-request, in-flight requests keep running on the captured pointer.
|
||||
- **`CapturedData` mutation across slots** — accessors take `sync.RWMutex`; slices deep-copied on both Set and Get. Verify no caller mutates the returned slice expecting it to land back.
|
||||
- **`Manager.Invalidate` race** — `removeMapping` invalidates after `cleanupMappingRoutes`; mapping read happens before chain resolution, so requests before invalidate run captured chains; later ones fail `findTargetForRequest`.
|
||||
- **`Logger.log` goroutine** — `logSem` caps at `maxLogWorkers = 4096`; overflow → `dropped.Add(1)` + debug log. Middleware test uses a buffered channel and 150ms negative-assertion window — review whether 150ms holds on slow CI.
|
||||
|
||||
### Backward compatibility
|
||||
- **Non-agent-network services unaffected** — `protoToMapping` reads new fields only when `opts != nil`; defaults leave `Middlewares`/`CaptureConfig` nil → chain resolves nil → fast path. Existing `reverseproxy_test.go` (non-chain) still passes.
|
||||
- **`disable_access_log` is proto field 13, default false** — every existing target unset; gate is no-op. Locked by `TestMiddleware_SuppressAccessLog_DefaultEmitsLog` (middleware_test.go:104).
|
||||
- **`Server` additions optional** — 256 MiB default when `MiddlewareCaptureBudgetBytes ≤ 0` (server.go:1997-2000).
|
||||
|
||||
### Performance
|
||||
- **Translate cost per push** — O(n) with per-entry registry lookup and `config_json` copy; negligible vs. the upstream gRPC unmarshal.
|
||||
- **Empty-chain hot path** — one `ChainFor` map lookup + one `chain.Empty()` check; no allocation delta vs. pre-PR.
|
||||
- **Body capture buffer churn** — `bodytap.CaptureRequest` allocates `MaxRequestBytes` per chain-hitting request; `releaseBudget` ties allocation to the 256 MiB proxy-wide budget. Confirm in module 30 the budget is a hard cap.
|
||||
|
||||
### Observability
|
||||
- **Metrics** — `Metrics.Meter()` shared with `middleware.NewMetrics` (server.go:1990-1993) so middleware instruments land in the same prometheus exporter. No new metrics defined here.
|
||||
- **Access-log accuracy** — every entry carries `AgentNetwork`; terminal-slot metadata merged into `CapturedData.Metadata` (reverseproxy.go:238-241).
|
||||
- **Deny logs at `Infof`** (reverseproxy.go:170) — review whether `Info` is too noisy at high deny rates; consider Debug or rate-limit.
|
||||
|
||||
## Test coverage
|
||||
|
||||
| Test file | Locks down |
|
||||
| --------- | ---------- |
|
||||
| proxy/middleware_translate_test.go | Empty/nil → nil; field preservation; unknown ID skip; nil registry permissive; timeout clamping; fail-mode + slot incl. UNSPECIFIED-drop; empty-ID drop; truncation above + at `MaxMiddlewaresPerChain` |
|
||||
| proxy/internal/proxy/reverseproxy_test.go | Rewrite host/headers/cookies/query; trusted proxy; path forwarding; classifyProxyError; X-NetBird-User/Groups anti-spoof + CSV-join + control-char/comma rejection + fallback-to-ID; `TestBuildRequestInput_PropagatesIdentityAndGroups` (UserGroups/Email/GroupNames/AgentNetwork reach `middleware.Input`) |
|
||||
| proxy/internal/proxy/agent_network_chain_realstack_test.go | **The end-to-end integration test.** Drives a real agent-network request through `ReverseProxy.ServeHTTP` with the chain the synthesizer produces, against an in-process management gRPC (bufconn) backed by a real sqlite store + real `agentnetwork.Manager`, plus an `httptest` upstream — no external infrastructure or real LLM. Guarantees: (1) response-leg `respInput` carries `UserGroups` so `llm_limit_record` ships non-empty `group_ids` and the admin-group consumption row increments; (2) `RedactPii=true` redacts both prompt and completion on captured metadata; (3) the full chain runs against a real management stack. **Lines 189-211 inline the proto→Spec mapping** instead of calling the proxy's private `translateMiddlewareConfig` — keep that inline mirror in sync with `proxy/middleware_translate.go` or the test silently diverges from production. |
|
||||
| proxy/internal/accesslog/middleware_test.go | `SuppressAccessLog=true` skips `SendAccessLog` (150ms negative wait); default emits one send (2s positive); usage tracking runs under suppression |
|
||||
| proxy/internal/auth/middleware_test.go | `TestProtect_PrivateService_TunnelPeerGroupsPropagate` proves `peer_group_ids` reach `CapturedData.UserGroups`; `TestProtect_PrivateService_TunnelPeerDenied` proves rejected peers 403 without reaching the handler |
|
||||
|
||||
The integration test runs in a few seconds with no external infrastructure — exercising the real synthesizer, `Manager.Rebuild`, `ServeHTTP` dispatch, and `llm_limit_record` writing a real consumption row through the real `agentnetwork.Manager` over real gRPC.
|
||||
|
||||
## Known limitations / explicit non-goals
|
||||
|
||||
- **Translator does not validate `RawConfig` JSON** — factory's job at `New([]byte)`. Confirm in module 30 that a per-binding factory failure doesn't poison the rest of the chain.
|
||||
- **No throttle on management push rate** — every `MODIFIED` triggers `Manager.Rebuild`. Any mitigation has to happen upstream, on the management side.
|
||||
- **Streaming responses (SSE)** — body capture is streaming-aware, but response-leg middleware runs only after the response completes; long SSE streams delay `llm_limit_record` until close.
|
||||
- **OIDC-only path doesn't carry tunnel-peer groups** — agent-network synth services rely on the Private tunnel-peer path; JWT groups claim is the only carrier for non-Private OIDC.
|
||||
- **`agent_network` flag on L4 entries** not added; HTTP-only.
|
||||
- **`mw.capture.bypass_reason` metadata key** documented at reverseproxy.go:151,184; namespace this in module 30/31 to avoid collisions.
|
||||
|
||||
## Cross-references
|
||||
- Upstream: [shared/api](10-shared-api.md), [proxy/middleware-framework](30-proxy-middleware-framework.md), [proxy/middleware-builtin](31-proxy-middleware-builtin.md), [proxy/llm-parsers](32-proxy-llm-parsers.md)
|
||||
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
|
||||
- Top-level: [../00-overview.md](../00-overview.md)
|
||||
228
docs/agent-networks/modules/40-dashboard.md
Normal file
228
docs/agent-networks/modules/40-dashboard.md
Normal file
@@ -0,0 +1,228 @@
|
||||
# dashboard — UI for agent-networks
|
||||
|
||||
This module documents code that lives in the **dashboard repo** (under
|
||||
`src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`), not
|
||||
in this repo. It is co-located here so backend readers see the full picture.
|
||||
|
||||
> **Risk level:** Medium. The new surface is isolated under `src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`, but it also reshapes the sidebar, splits `/peers`, renames `reverse-proxy/clusters` → `self-hosted-proxies`, and overlays the Control Center graph. Regressions here would be cross-cutting.
|
||||
> **Backward-compat impact:** Additive on the API side. Breaking on URL/navigation: `/peers` redirects to `/peers/devices` (src/app/(dashboard)/peers/page.tsx:7-15), `/reverse-proxy/clusters` was renamed to `/reverse-proxy/self-hosted-proxies`, the sidebar lost Access Control / Networks / Reverse Proxy / DNS / standalone Guardrails / Consumption / Activity (Navigation.tsx:165-171 — routes still resolve via URL), and the standalone `/agent-network/{access-log,consumption,global-controls}` routes are gone in favor of `/agent-network/observability`.
|
||||
|
||||
## Module boundary
|
||||
|
||||
The dashboard is the only place an operator interacts with agent-networks: provider catalog, configured providers, policies, guardrails, account-level budget rules, account settings (collection / redaction toggles), per-request access log, and consumption rollups all render, paginate, and edit here. Data flows in via SWR (`useFetchApi`) keyed by REST URL. One big context provider (`src/modules/agent-network/AIProvidersProvider.tsx`) aggregates five resources (providers, policies, guardrails, budget rules, settings) plus the proxy access-log stream filtered to `agent_network=true`, and exposes `add* / update* / toggle* / delete*` mutators that call through `useApiCall` and re-`mutate()` SWR. Pages mount the provider once at the top and compose presentational tables and modals beneath. The control-center page additionally fetches `/agent-network/{providers,policies}` directly (control-center/page.tsx:123-130) to overlay graph nodes.
|
||||
|
||||
## What the UI delivers
|
||||
|
||||
- **AI Observability** page with four tabs: Access Logs, Budget Dashboard,
|
||||
Budget Settings, Log Settings (replaces the standalone access-log,
|
||||
consumption, and global-controls routes).
|
||||
- **Providers** page: provider catalog + connect/edit wizard with per-vendor
|
||||
copy (LiteLLM, Portkey, Bifrost, Cloudflare, Vercel, OpenRouter, custom).
|
||||
- **Policies** page: group → provider authorization with per-policy Limits
|
||||
(minute-granular windows) + guardrail attach.
|
||||
- **Guardrails** page: reusable model-allowlist + prompt-capture sets.
|
||||
- **Account controls**: Log Collection / Prompt Collection / Redact PII toggles.
|
||||
- **Budget rules**: account-level rules reusing the policy Limits UI.
|
||||
- **Control Center overlay**: provider + agent-policy nodes on the graph.
|
||||
- **Navigation + peers reshaping**: peers split into Devices / Agents,
|
||||
`reverse-proxy/clusters` renamed to `self-hosted-proxies`, sidebar
|
||||
repackaged for agent-network focus.
|
||||
|
||||
## Surface added
|
||||
|
||||
### New pages
|
||||
|
||||
| Route | Purpose | Backing module(s) |
|
||||
| ----- | ------- | ----------------- |
|
||||
| `/agent-network` | Redirect to `/agent-network/providers` | page.tsx:7-15 |
|
||||
| `/agent-network/providers` | List + connect providers; header surfaces per-account base URL | providers/page.tsx + AgentProvidersTable + AIProviderModal |
|
||||
| `/agent-network/policies` | Group → Provider authorization with per-policy Limits + Guardrail attach | policies/page.tsx + AgentPoliciesTable + AgentPolicyModal |
|
||||
| `/agent-network/guardrails` | Reusable guardrail sets (model allowlist + prompt capture) | guardrails/page.tsx + AgentGuardrailsTable + AgentGuardrailModal |
|
||||
| `/agent-network/observability` | Tabs: Access Logs / Budget Dashboard / Budget Settings / Log Settings | observability/page.tsx |
|
||||
| `/peers/devices`, `/peers/agents` | Split of `/peers`, shared via `PeersListView` keyed by `kind` | peers/{devices,agents}/page.tsx |
|
||||
| `/reverse-proxy/self-hosted-proxies` | Renamed from `clusters` | self-hosted-proxies/page.tsx |
|
||||
|
||||
Removed in favor of `/agent-network/observability`: `/agent-network/access-log`, `/agent-network/consumption`, `/agent-network/global-controls`.
|
||||
|
||||
### New modules under src/modules/agent-network
|
||||
|
||||
| File | Role |
|
||||
| ---- | ---- |
|
||||
| AIProvidersProvider.tsx (~1158 LOC) | Aggregates every agent-network resource via SWR; normalises snake↔camel; exposes mutators; holds wizard-open state |
|
||||
| AIProviderModal.tsx (~1268 LOC) | Connect / edit provider wizard with per-vendor copy (Bifrost, Portkey, LiteLLM, Cloudflare, Vercel, OpenRouter, custom) |
|
||||
| AIProviderLogo + useProviderCatalog | Catalog-driven brand swatch + SWR hook over `/agent-network/catalog/providers` |
|
||||
| AgentPoliciesTable + AgentPolicyModal + AgentPolicyGuardrailsTab + AgentPolicyLimitsTab | Policies; modal has 3 tabs (Rule, Limits, Guardrails) |
|
||||
| AgentGuardrailsTable + AgentGuardrailModal + AgentGuardrailBrowseModal + AgentGuardrailChecksCell | Guardrails CRUD + attach-from-policy |
|
||||
| AgentBudgetRulesTable + AgentBudgetRuleModal | Account-level budget rules; modal reuses AgentPolicyLimitsTab verbatim |
|
||||
| AgentAccountControlsCard | Three account-wide toggles (Log Collection / Prompt Collection / Redact PII) |
|
||||
| AgentAccessLogTable + AgentAccessLogExpandedRow | Access log on `/events/proxy?agent_network=true` |
|
||||
| AgentConsumptionPanel + AgentConsumptionTable | Token + cost panel: charts + counter table |
|
||||
| table/AgentProvidersTable + AgentProviderActionCell | Providers table + per-row actions |
|
||||
| data/mockData.ts | Domain types and a few residual `MOCK_*` constants (see "Known limitations / explicit non-goals") |
|
||||
|
||||
### Touched non-agent-network areas
|
||||
|
||||
- **control-center**: agent-network overlay (provider + agent-policy nodes); removed the All Networks dropdown; hid the Networks tab in FlowSelector (FlowSelector.tsx:9-14 — enum value kept so `?tab=networks` still type-checks); wrapped `ControlCenterView` in `AIProvidersProvider` (page.tsx:73-83); `agentPolicyNode` clicks routed to a separate state slot (page.tsx:1871-1874). New node renderers: nodes/ProviderNode.tsx, nodes/AgentPolicyNode.tsx (registered at utils/nodes.ts:21-22).
|
||||
- **peers**: Split into Devices and Agents sub-routes; shared via `PeersListView` keyed by `kind` (PeersListView.tsx:24-95). New compact-toolbar `UserFilterSelector` (users/UserFilterSelector.tsx).
|
||||
- **reverse-proxy**: Folder rename `clusters/` → `self-hosted-proxies/`; deleted `ClustersFeaturesCell.tsx`, `ClusterTypeIndicator.tsx`; new ReverseProxyClusterTargetSelector for cluster target type; Private toggle on target modal; body-capture knobs removed; new ReverseProxyEventExpandedRow.
|
||||
- **events**: `ReverseProxyEventsUserCell` rewritten with user + peer fallback (ReverseProxyEventsUserCell.tsx:14-21), shared with the access-log table.
|
||||
- **navigation**: Full repackaging in Navigation.tsx — Agent Network items flattened (no collapsible parent), distinct icons per item; Access Control, Networks, Reverse Proxy, DNS, standalone Guardrails, Consumption, Activity removed (still URL-reachable, per lines 165-171).
|
||||
|
||||
## Architecture & flow
|
||||
|
||||
### Page → Provider → Table/Modal hierarchy
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
Nav[Navigation.tsx]
|
||||
Nav --> ProvidersPage[/agent-network/providers/]
|
||||
Nav --> PoliciesPage[/agent-network/policies/]
|
||||
Nav --> GuardrailsPage[/agent-network/guardrails/]
|
||||
Nav --> ObsPage[/agent-network/observability/]
|
||||
|
||||
ProvidersPage --> AIPP1[AIProvidersProvider]
|
||||
PoliciesPage --> AIPP2[AIProvidersProvider]
|
||||
GuardrailsPage --> AIPP3[AIProvidersProvider]
|
||||
ObsPage --> AIPP4[AIProvidersProvider]
|
||||
ObsPage -.wraps.-> GroupsProvider
|
||||
ObsPage -.wraps.-> PeersProvider
|
||||
|
||||
AIPP1 --> ProvTable[AgentProvidersTable]
|
||||
ProvTable --> ProvModal[AIProviderModal]
|
||||
AIPP2 --> PolTable[AgentPoliciesTable]
|
||||
PolTable --> PolModal[AgentPolicyModal]
|
||||
PolModal --> PolGuardTab[AgentPolicyGuardrailsTab]
|
||||
PolModal --> PolLimitsTab[AgentPolicyLimitsTab]
|
||||
PolGuardTab --> GuardBrowse[AgentGuardrailBrowseModal]
|
||||
PolGuardTab --> GuardModal[AgentGuardrailModal]
|
||||
AIPP3 --> GuardTable[AgentGuardrailsTable]
|
||||
GuardTable --> GuardModal
|
||||
AIPP4 --> Tabs[Tabs]
|
||||
Tabs --> AccessLog[AgentAccessLogTable]
|
||||
Tabs --> Consumption[AgentConsumptionPanel]
|
||||
Tabs --> BudgetRules[AgentBudgetRulesTable]
|
||||
Tabs --> AccountCtl[AgentAccountControlsCard]
|
||||
BudgetRules --> BudgetModal[AgentBudgetRuleModal]
|
||||
BudgetModal -.reuses.-> PolLimitsTab
|
||||
```
|
||||
|
||||
### AI Observability tab page
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
Page[AIObservabilityPage] --> RA[RestrictedAccess<br/>permission.services.read]
|
||||
RA --> GP[GroupsProvider]
|
||||
GP --> PP[PeersProvider]
|
||||
PP --> AIP[AIProvidersProvider]
|
||||
AIP --> Tabs[Tabs / TabsList]
|
||||
Tabs --> T1[Access Logs<br/>AgentAccessLogTable]
|
||||
Tabs --> T2[Budget Dashboard<br/>AgentConsumptionPanel]
|
||||
Tabs --> T3[Budget Settings<br/>AgentBudgetRulesTable]
|
||||
Tabs --> T4[Log Settings<br/>AgentAccountControlsCard]
|
||||
T1 -.GET.-> EP[/events/proxy?agent_network=true/]
|
||||
T2 -.GET poll 5s.-> CONS[/agent-network/consumption/]
|
||||
T3 -.GET/PUT.-> BR[/agent-network/budget-rules/]
|
||||
T4 -.GET/PUT.-> ST[/agent-network/settings/]
|
||||
```
|
||||
|
||||
### Data fetch path
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
Page[Page component] --> Prov[AIProvidersProvider]
|
||||
Prov -->|useFetchApi| SWR[(SWR cache<br/>key = URL)]
|
||||
SWR -.GET.-> P[/agent-network/providers/]
|
||||
SWR -.GET.-> POL[/agent-network/policies/]
|
||||
SWR -.GET.-> G[/agent-network/guardrails/]
|
||||
SWR -.GET.-> BR[/agent-network/budget-rules/]
|
||||
SWR -.GET ignoreError.-> ST[/agent-network/settings/]
|
||||
SWR -.GET.-> CAT[/agent-network/catalog/providers/]
|
||||
SWR -.GET pageSize=100.-> EVT[/events/proxy agent_network=true/]
|
||||
Prov --> Mut[useApiCall.post/put/del]
|
||||
Mut -.on success.-> MutateSWR[SWR mutate keys]
|
||||
Prov --> Children[Tables / Modals via useAIProviders]
|
||||
```
|
||||
|
||||
Every list view reaches management through SWR over `/api/agent-network/*`. The provider context maps snake-case payloads to camelCase domain types (`fromAPI`, `policyFromAPI`, `guardrailFromAPI`, `budgetRuleFromAPI`, `settingsFromAPI`, `accessLogFromAPI` — AIProvidersProvider.tsx:138-562) and back via matching `*ToRequest` adaptors. The access log piggy-backs on `/events/proxy` with `agent_network=true&page_size=100` (lines 707-709) and decodes LLM-specific fields from per-event `metadata`. Group IDs on events are resolved to current names through the surrounding GroupsProvider catalog (lines 515-521, 717-731) — no extra round trip. Mutators run `*ToRequest`, await `useApiCall.post/put/del`, call SWR `mutate()`, then `notify`. Errors are caught and surfaced via `notify` — no exceptions escape into render. The Connect Provider modal's open state lives in the provider itself (`isWizardOpen` at lines 732-735) so the providers-page empty-state CTA and the table's + button share one modal. Control-center re-fetches `/agent-network/{providers,policies}` directly on top of `AIProvidersProvider` — SWR de-dupes but the code path is harder to reason about.
|
||||
|
||||
## Public contracts consumed
|
||||
|
||||
- `GET/POST /api/agent-network/providers`, `PUT/DELETE /:id`
|
||||
- `GET/POST /api/agent-network/policies`, `PUT/DELETE /:id`
|
||||
- `GET/POST /api/agent-network/guardrails`, `PUT/DELETE /:id`
|
||||
- `GET/POST /api/agent-network/budget-rules`, `PUT/DELETE /:id`
|
||||
- `GET/PUT /api/agent-network/settings` (ignoreError-tolerant; 404 = not yet bootstrapped — auto-bootstrap on first provider create via `bootstrap_cluster` field — AIProvidersProvider.tsx:737-760)
|
||||
- `GET /api/agent-network/catalog/providers` (read-only declarative; backend owns vendor list, IDs, brand colors, models, extra_headers, identity_injection — useProviderCatalog.ts:6-95)
|
||||
- `GET /api/agent-network/consumption` (polled every 5s on Budget Dashboard — ConsumptionPanel.tsx:53,65-71)
|
||||
- `GET /api/events/proxy?agent_network=true&page_size=100` (shared with Proxy Events)
|
||||
- `permission?.services?.read` gates every agent-network route via RestrictedAccess.
|
||||
|
||||
`AIProviderId` is a closed union in dashboard types (data/mockData.ts:8-21) but the converter tolerates anything the backend ships — unknown ids fall through to `"custom"` (AIProvidersProvider.tsx:497-506). Catalog values are pure read-through: anything declared in `extra_headers` renders in the modal automatically, copy keyed by header name (`EXTRA_HEADER_UI` in AIProviderModal.tsx:61-89), labeled-fallback for unknown ones.
|
||||
|
||||
## Invariants
|
||||
|
||||
- Provider context wrap order on user-attribution pages: `GroupsProvider > PeersProvider > AIProvidersProvider` (observability/page.tsx:87-89). Reverse it and access-log group resolution silently drops names.
|
||||
- Every agent-network route checks `permission?.services?.read` via `RestrictedAccess` (observability/page.tsx:85, providers/page.tsx:184, policies/page.tsx:53, guardrails/page.tsx:55).
|
||||
- Modal `key={open ? 1 : 0}` pattern is used to force unmount/remount on close so internal `useState` resets between edits (AgentBudgetRuleModal.tsx:60, AgentPolicyModal.tsx:66). Removing this would leak prior-row state into a new-row session.
|
||||
- `mockData.ts` is the canonical home for ALL agent-network domain types; `MOCK_*` constants must never reach a production code path. One leak remains (below).
|
||||
|
||||
## Things to scrutinize
|
||||
|
||||
### Correctness
|
||||
|
||||
- **Tab-state URL hand-off is one-way.** observability/page.tsx:53-58 reads `?tab=` on mount (despite the file comment at line 28 saying URL hand-off is future work) but `setTab` does NOT push the selection back to the URL, so a reload preserves the chosen tab only if it arrived via the link. Inconsistent with control-center (page.tsx:1817-1831).
|
||||
- **Provider overlay runs only in `applySingleGroupView` / `applyPeerView`** (control-center/page.tsx:557, 1159-1166). User view does NOT show providers — if agent-network is a primary lens, that's a gap.
|
||||
- **Two useEffects race to invalidate the control-center layout.** page.tsx:1655-1657 drops `layoutInitialized` when `agentPolicies` / `agentProviders` arrive; the main effect (1786-1799) also lists them as deps. Functional but fragile — watch for flash-of-empty-graph.
|
||||
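- **`updateProvider` / `updatePolicy` / `updateBudgetRule` use `??` on `enabled`** (AIProvidersProvider.tsx:784, 859, 1018). Toggle paths are safe, and an explicit `enabled: false` is honoured because `??` only falls back on `null` / `undefined`; any caller that omits `enabled` (or passes `undefined`) expecting the resource to end up disabled gets `existing.enabled` instead. Audit modal callers.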
|
||||
- **Form validation in modals is minimal.** Window-seconds picker — mockData.ts:209-215 documents "minimum 60 — one minute" but there is no matching UI guard in PolicyLimitsTab; the backend validator is the enforcement point.
|
||||
|
||||
### Security
|
||||
|
||||
- **No client-side enforcement claims** — every cap, allowlist, and toggle is display + edit; proxy is the source of truth for deny decisions (AccessLogTable.tsx:177-191 renders backend-emitted `denyReason` as-is).
|
||||
- **Prompt display is gated by what the backend stamps.** When `enable_prompt_collection` is OFF the proxy must not put prompt/completion into event metadata; the dashboard renders whatever it gets verbatim (AccessLogTable.tsx:532-534, AccessLogExpandedRow.tsx:42-57). There is no UI filter on top of the backend collection switches.
|
||||
- Account Controls disables `Redact PII` when `Prompt Collection` is off (AgentAccountControlsCard.tsx:122) and clears it on the off-transition (line 100), but relies on the backend to enforce the same gate at write time — confirm the PUT handler rejects `redact_pii=true && enable_prompt_collection=false`.
|
||||
- **Bifrost identity-header overrides**: empty-string vs nil semantics documented in AIProvidersProvider.tsx:772-781 ("omitted = preserve, empty = explicit clear"). Mishandling could leak group attribution to a header the operator thought disabled. Focused read of Bifrost code path in AIProviderModal.tsx recommended.
|
||||
|
||||
### Accessibility
|
||||
|
||||
- Observability TabsList (observability/page.tsx:96-113) uses the shared Tabs component — should inherit Radix roving-tabindex. All four TabsTriggers carry only icon + text, no `aria-label`; fine because text is visible.
|
||||
- Modal focus traps are inherited from the shared Modal; agent-network modals don't override them. Quick keyboard pass recommended.
|
||||
- `EndpointBadge` Copy button (providers/page.tsx:66-76) has an `aria-label`, good.
|
||||
|
||||
### Performance
|
||||
|
||||
- `AgentConsumptionPanel` polls `/agent-network/consumption` every 5s (ConsumptionPanel.tsx:53,70). Tab switches unmount the panel, so the poll stops — verify in network panel.
|
||||
- `AgentAccessLogTable` is hard-capped at 100 rows via `page_size=100` (AIProvidersProvider.tsx:707-709). Server-side pagination is future work; high-traffic tenants miss everything past row 100 — known limitation.
|
||||
- Observability page mounts providers ONCE at page level (observability/page.tsx:87-89); tab switches keep SWR cache hot. Moving the provider mount inside `TabsContent` would re-fetch the access log on every switch.
|
||||
|
||||
### Visual consistency
|
||||
|
||||
- The observability tab style mirrors peers/page.tsx. Outer Tabs `pt-4 pb-0 mb-0`, TabsList `px-8` (observability/page.tsx:94-96) — confirm chrome height matches so the page doesn't visually jump.
|
||||
- Sidebar: `Boxes` for Providers, `AccessControlIcon` for Policies, `TelescopeIcon` for AI Observability (Navigation.tsx:113,120,133). Reusing `AccessControlIcon` makes Policies look identical to the (now hidden) Access Control item — if Access Control ever comes back, they collide.
|
||||
- `AgentNetworkIcon` is used in breadcrumbs on every agent-network page but NOT in the sidebar (per-page icons instead). Deliberate departure — record so it doesn't get reverted.
|
||||
|
||||
## Test coverage
|
||||
|
||||
- **Cypress**: One file (`cypress/e2e/test.cy.ts`) covering only the install-page copy-to-clipboard flow. NOTHING covers agent-network UI.
|
||||
- **Component / unit tests**: `src/utils/version.test.ts` is the only `.test.*` file in the repo. The agent-network modules ship without component tests.
|
||||
- Data-cy hooks exist on key controls: `save-account-controls` (AgentAccountControlsCard.tsx:71), `enable-log-collection`, `enable-prompt-collection`, `redact-pii`, plus existing `data-cy={policy.name}` / `data-cy={provider.name}` on ActiveInactiveRow. Sufficient hooks for Cypress flows; none written yet.
|
||||
- **Tooling gap (pre-existing):** `npm run lint` (`next lint`) is broken in Next 16 — the `lint` subcommand was removed from the Next CLI in 16.x, so the dashboard effectively has no working lint gate. The fix is either to add a flat-config `eslint .` script or to wire ESLint via an explicit `eslint-config-next` invocation.
|
||||
|
||||
## Known limitations / explicit non-goals
|
||||
|
||||
- **`data/mockData.ts` still contains `MOCK_GROUPS`, `MOCK_PROVIDERS`, `MOCK_PEERS`.** Only `MOCK_GROUPS` is referenced from production — AgentPoliciesTable.tsx:45,76 uses it as a name-lookup fallback when a policy references a group ID the real GroupsProvider doesn't know about. `MOCK_PROVIDERS` / `MOCK_PEERS` are unreferenced; safe to delete. The file is `/* eslint-disable */` so dead-code warnings don't flag them.
|
||||
- **Tab-state URL hand-off on observability page is one-way** (read-only).
|
||||
- **Access log hard-capped at 100 rows**; no server-side pagination.
|
||||
- **No optimistic updates.** All mutations are round-trip; failures roll back via SWR revalidation.
|
||||
- **`FlowView.NETWORKS` retained but hidden** from FlowSelector (FlowSelector.tsx:9-14). Old `?tab=networks` links still route to the hidden view because `applyNetworksView` still runs.
|
||||
- **Redirects are not query-preserving** — `router.replace("/peers/devices")` (peers/page.tsx:13) strips any incoming filter params.
|
||||
- **Control-center cross-fetches** `/agent-network/{providers,policies}` directly on top of `AIProvidersProvider`. Could be collapsed.
|
||||
- **Sidebar permanently hides Access Control, Networks, Reverse Proxy, standalone Guardrails, DNS, Activity, Consumption.** Routes still resolve via URL (Navigation.tsx:165-171); intentional.
|
||||
|
||||
## Cross-references
|
||||
|
||||
- Upstream API contracts: [shared/api](10-shared-api.md)
|
||||
- Backend persistence: [management/store](20-management-store.md)
|
||||
- Backend handler wiring: [management/handlers + wiring](22-management-handlers-wiring.md)
|
||||
- End-to-end flow narrative: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
|
||||
- Top-level overview: [../00-overview.md](../00-overview.md)
|
||||
251
docs/agent-networks/modules/50-path-routed-providers.md
Normal file
251
docs/agent-networks/modules/50-path-routed-providers.md
Normal file
@@ -0,0 +1,251 @@
|
||||
# path-routed providers — Vertex AI + Bedrock
|
||||
|
||||
This guide pulls the **path-routed** provider story together in one place
|
||||
because it crosses the catalog, the synthesiser, the request parser, and the
|
||||
router. The relevant building blocks are the `llm_router` /
|
||||
`llm_request_parser` middlewares
|
||||
([31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)), the
|
||||
per-provider parser surface ([32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)),
|
||||
and the synthesiser's catalog → `ProviderRoute` mapping
|
||||
([21-management-agentnetwork.md](21-management-agentnetwork.md)).
|
||||
|
||||
Sibling modules: [31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)
|
||||
(router + request parser) and [32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)
|
||||
(Bedrock parser + pricing).
|
||||
|
||||
---
|
||||
|
||||
## What "path-routed" means
|
||||
|
||||
Most catalog providers carry the model in the request **body** (`{"model": …}`),
|
||||
so `llm_router` selects an upstream by matching the model name against each
|
||||
provider's `Models` claim. Two providers instead carry the model in the **URL
|
||||
path**, so they are routed by path before the model/vendor table is consulted:
|
||||
|
||||
| Catalog id | Style flag | Request path shape |
|
||||
|---|---|---|
|
||||
| `vertex_ai_api` | `IsVertexPathStyle` → `ProviderRoute.Vertex` | `/v1/projects/{project}/locations/{region}/publishers/{publisher}/models/{model}:{action}` |
|
||||
| `bedrock_api` | `IsBedrockPathStyle` → `ProviderRoute.Bedrock` | `/model/{modelId}/{action}` (optionally behind `/bedrock`) |
|
||||
|
||||
The catalog declares the style with
|
||||
[`catalog.IsVertexPathStyle` / `catalog.IsBedrockPathStyle`](../../../management/server/agentnetwork/catalog/catalog.go)
|
||||
and the synthesiser copies the result onto the router route as the `Vertex` /
|
||||
`Bedrock` booleans
|
||||
([synthesizer.go:450-451](../../../management/server/agentnetwork/synthesizer.go)).
|
||||
On the request leg `llm_router.Invoke` dispatches `isVertexPath` / `isBedrockPath`
|
||||
**before** the model lookup
|
||||
([llm_router/middleware.go:138-216](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
|
||||
so a model the parser extracted from the path can't be claimed by a same-vendor
|
||||
*body-routed* provider (e.g. `claude-*` on `api.anthropic.com`).
|
||||
|
||||
## Google Vertex AI (`vertex_ai_api`)
|
||||
|
||||
### Catalog entry
|
||||
|
||||
`KindProvider`, parser surface left unset on the catalog entry — the request
|
||||
parser picks the parser from the URL **publisher** segment, not from
|
||||
`ParserID`. Upstream host is `<region>-aiplatform.googleapis.com`
|
||||
(`aiplatform.googleapis.com` for the `global` location). The catalog
|
||||
lists the Claude-on-Vertex lineup (`claude-opus-4-*`, `claude-sonnet-4-*`,
|
||||
`claude-haiku-4-5`, `claude-fable-5`) at the same per-token rates as the
|
||||
first-party Anthropic entry
|
||||
([catalog.go:333-363](../../../management/server/agentnetwork/catalog/catalog.go)).
|
||||
|
||||
### Credential — service-account OAuth (`keyfile::`)
|
||||
|
||||
Vertex does **not** accept a static API key. The operator sets the provider
|
||||
`api_key` to:
|
||||
|
||||
```
|
||||
keyfile::<base64 of the GCP service-account JSON key>
|
||||
```
|
||||
|
||||
The synthesiser recognises the `keyfile::` prefix in `providerAuthHeader`
|
||||
([synthesizer.go:897-903](../../../management/server/agentnetwork/synthesizer.go)),
|
||||
emits **no** static auth value, and carries the base64 key material on the
|
||||
route as `GCPServiceAccountKeyB64`
|
||||
([factory.go:56-61](../../../proxy/internal/middleware/builtin/llm_router/factory.go)).
|
||||
At request time the router mints a short-lived OAuth2 access token from the key
|
||||
(cloud-platform scope) and injects `Authorization: Bearer <access-token>` —
|
||||
never the key itself
|
||||
([llm_router/middleware.go:621-692](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
|
||||
|
||||
- One auto-refreshing `oauth2.TokenSource` is cached per key (keyed by a
|
||||
SHA-256 of the base64 material), so token minting happens once and refreshes
|
||||
amortise across requests.
|
||||
- Mint / refresh is bounded by a 10s timeout HTTP client (`gcpTokenTimeout`) so
|
||||
a slow Google token endpoint can't hang the request.
|
||||
- A malformed key or an unreachable token endpoint fails the request with
|
||||
`llm_policy.upstream_auth_failed` at HTTP **502** (an upstream problem, not a
|
||||
policy denial) — see `denyUpstreamAuth`.
|
||||
|
||||
### Metering — Anthropic-on-Vertex only
|
||||
|
||||
The request parser extracts `{publisher, model, action}` from the path
|
||||
(`parseVertexPath`, [llm_request_parser/middleware.go:237-263](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)),
|
||||
strips the `@version` suffix from the model, and maps the publisher to a parser
|
||||
surface via `vertexPublisherVendor`:
|
||||
|
||||
- `anthropic` → `llm.provider="anthropic"` → metered through the Anthropic
|
||||
parser, priced under the **`anthropic`** block in `defaults_pricing.yaml`
|
||||
(the parser emits the standard Anthropic provider label, so Vertex Claude
|
||||
reuses first-party Anthropic prices).
|
||||
- `openai` → `llm.provider="openai"` (reserved; not in the catalog lineup
|
||||
today).
|
||||
- anything else (notably `google` / Gemini) → empty vendor → **no parser**.
|
||||
|
||||
**Gemini is intentionally denied as unmeterable.** When the parser emits no
|
||||
`llm.provider` for a Vertex publisher, `llm_router` returns
|
||||
`llm_policy.unmeterable_publisher` (403) rather than forwarding the request
|
||||
uncounted — serving it would bypass token / budget metering
|
||||
([llm_router/middleware.go:144-162, 712-728](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)).
|
||||
A Gemini parser would lift this restriction; until then the `google` publisher
|
||||
is omitted from the catalog.
|
||||
|
||||
> Caveat: cross-region inference profiles in `eu` / `apac` carry a ~10% price
|
||||
> premium that the base per-token rates do **not** model — cost annotations for
|
||||
> those regions read low. Operators who need exact regional billing override
|
||||
> the affected entries in `pricing.yaml`.
|
||||
|
||||
## AWS Bedrock (`bedrock_api`)
|
||||
|
||||
### Catalog entry
|
||||
|
||||
`KindProvider`, upstream host `bedrock-runtime.<region>.amazonaws.com`. Metered
|
||||
models are the Anthropic-on-Bedrock lineup (`anthropic.claude-*`) plus Amazon
|
||||
Nova and Llama 3.3 entries
|
||||
([catalog.go:300-332](../../../management/server/agentnetwork/catalog/catalog.go)).
|
||||
Anthropic-on-Bedrock reuses the first-party Claude prices (with additive cache
|
||||
buckets); Nova / Llama report no cache, so cost is `input + output`.
|
||||
|
||||
### Credential — static bearer token
|
||||
|
||||
Bedrock uses the **AWS Bedrock API key** as a static bearer. The operator sets
|
||||
the provider `api_key` directly (no `keyfile::` prefix); the catalog template
|
||||
is `Authorization: Bearer ${API_KEY}`
|
||||
([catalog.go:306-307](../../../management/server/agentnetwork/catalog/catalog.go)).
|
||||
No token minting — the synthesiser substitutes the key into the template and
|
||||
the router injects the resulting `Authorization` header after stripping inbound
|
||||
vendor auth (including client-supplied AWS SigV4 material: `X-Amz-Date`,
|
||||
`X-Amz-Security-Token`, `X-Amz-Content-Sha256`, see `strippedAuthHeaders`).
|
||||
|
||||
### Model id form — cross-region inference profiles
|
||||
|
||||
Bedrock model ids in the request path must be the cross-region
|
||||
**inference-profile** form, e.g.
|
||||
`eu.anthropic.claude-sonnet-4-5-20250929-v1:0`. The bare
|
||||
`anthropic.claude-…` id is rejected by AWS. `normalizeBedrockModel`
|
||||
([llm_request_parser/middleware.go:398-414](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go))
|
||||
strips the region prefix (`us.` / `eu.` / `apac.` / `global.`), an optional ARN
|
||||
wrapper, and the `-YYYYMMDD-vN[:N]` version/throughput suffix so the normalised
|
||||
id (`anthropic.claude-sonnet-4-5`) matches the catalog/pricing key.
|
||||
|
||||
### Supported endpoints + actions
|
||||
|
||||
`/model/{modelId}/{action}` where action ∈ `invoke`,
|
||||
`invoke-with-response-stream`, `converse`, `converse-stream`
|
||||
([llm_request_parser/middleware.go:363-390](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
|
||||
`invoke` / `converse` are non-streaming; the `-stream` actions set the streaming
|
||||
flag.
|
||||
|
||||
- **InvokeModel** body uses the vendor-native shape — for Anthropic that means
|
||||
`"anthropic_version":"bedrock-2023-05-31"` and snake_case usage with additive
|
||||
cache buckets.
|
||||
- **Converse** uses the unified camelCase shape with a precomputed `totalTokens`.
|
||||
- The `BedrockParser` reads both shapes on the response leg
|
||||
([bedrock.go](../../../proxy/internal/llm/bedrock.go)); the request parser
|
||||
doesn't need to distinguish them (`ParseRequest` is a no-op — model + stream
|
||||
come from the path).
|
||||
|
||||
### Streaming — AWS binary event-stream
|
||||
|
||||
The `-stream` actions return `application/vnd.amazon.eventstream` (the AWS
|
||||
binary event-stream framing), and streaming **is metered**.
|
||||
`accumulateBedrockStream`
|
||||
([llm_response_parser/streaming_bedrock.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming_bedrock.go))
|
||||
decodes the frames with `aws-sdk-go-v2/aws/protocol/eventstream`:
|
||||
|
||||
- InvokeModel `chunk` frames wrap a base64 `{"bytes":…}` payload carrying a
|
||||
vendor-native (Anthropic) stream event — folded through the shared Anthropic
|
||||
stream accumulator.
|
||||
- Converse `contentBlockDelta` frames carry text; the trailing `metadata` frame
|
||||
carries the final usage block.
|
||||
- A truncated stream (cut at the body-tap capture cap) decodes best-effort:
|
||||
frames up to the cut are applied and partial usage is returned.
|
||||
|
||||
### Optional `/bedrock` gateway-namespace prefix
|
||||
|
||||
Clients may place an optional `/bedrock` prefix before the native path
|
||||
(`/bedrock/model/{modelId}/{action}`) to disambiguate Bedrock from other
|
||||
providers that also use `/model/...`. Both the request parser
|
||||
(`trimBedrockNamespace`) and the router (`splitBedrockNamespace`) accept it.
|
||||
When the prefix is present, the router sets
|
||||
`RewriteUpstream.StripPathPrefix = "/bedrock"` so the **native** path
|
||||
(`/model/...`) is what reaches `bedrock-runtime.<region>.amazonaws.com`
|
||||
([llm_router/middleware.go:168-184, 320-348](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)).
|
||||
|
||||
## Model allowlist on path-routed providers
|
||||
|
||||
Because the model lives in the URL rather than the body, a path-routed provider
|
||||
credential could otherwise be used for any model the upstream supports. The
|
||||
router still enforces the route's `Models` allowlist via `matchPathRoute`
|
||||
([llm_router/middleware.go:370-416](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
|
||||
|
||||
1. Filter to routes of the matching style (`Vertex` / `Bedrock`).
|
||||
2. Filter to routes whose `AllowedGroupIDs` authorise the caller's groups
|
||||
(else `no_authorised_provider`).
|
||||
3. Filter to routes that **claim the requested model**. As with body-routed
|
||||
providers, an **empty `Models` list = catch-all** (serve any model);
|
||||
a non-empty list serves only the listed models (else `model_not_routable`).
|
||||
4. Multiple survivors are disambiguated by the longest `UpstreamPath` prefix match.
|
||||
|
||||
So an operator who lists explicit models on a Vertex/Bedrock provider gets a
|
||||
hard allowlist; an operator who leaves `Models` empty accepts every model the
|
||||
upstream serves (still subject to the unmeterable-publisher gate on Vertex).
|
||||
|
||||
Model-less OpenAI endpoints (`GET /v1/models`) are **never** routed to a
|
||||
Vertex/Bedrock provider — `matchModelless` skips path-routed routes
|
||||
([llm_router/middleware.go:427-462](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
|
||||
so a model-listing call can't be rewritten onto an upstream that would 404 it.
|
||||
|
||||
## Catalog ↔ pricing cross-check
|
||||
|
||||
Catalog prices and context windows are cross-checked against LiteLLM's
|
||||
`model_prices_and_context_window.json`. The proxy's embedded
|
||||
`defaults_pricing.yaml` covers **every metered first-party model** the catalog
|
||||
enumerates — guarded by
|
||||
`TestDefaultTable_FirstPartyModelCoverage`
|
||||
([pricing/defaults_coverage_test.go](../../../proxy/internal/llm/pricing/defaults_coverage_test.go)),
|
||||
which fails if a catalog model has no embedded price. Bedrock entries are keyed
|
||||
by the **normalised** id the request parser emits (region prefix + version
|
||||
suffix stripped). Vertex Claude carries no Bedrock-style prefix, so it prices
|
||||
straight off the `anthropic` block.
|
||||
|
||||
## Things to scrutinise
|
||||
|
||||
**Security.** The Vertex service-account key is never forwarded — only a minted
|
||||
short-lived bearer. Confirm the key material stays out of access logs (it lives
|
||||
on `ProviderRoute.GCPServiceAccountKeyB64`, not in any emitted metadata key).
|
||||
The unmeterable-publisher deny is the only thing standing between an
|
||||
operator-misconfigured Vertex provider and unmetered Gemini traffic; verify
|
||||
`vertexPublisherVendor` stays conservative (deny by default for unknown
|
||||
publishers).
|
||||
|
||||
**Correctness.** `normalizeBedrockModel` is the join between the wire id and the
|
||||
pricing key — a model that normalises to something not in `defaults_pricing.yaml`
|
||||
meters at `cost.skipped=unknown_model` rather than failing the request. The
|
||||
`/bedrock` prefix strip must run on both the parser side (so the model is
|
||||
extracted) and the router side (so the upstream path is native); a regression in
|
||||
either silently breaks the other.
|
||||
|
||||
**Metering caveats.** eu/apac cross-region Bedrock + Vertex profiles carry a
|
||||
~10% premium not modelled by base pricing — flagged in both the catalog comment
|
||||
and `defaults_pricing.yaml`. Operators needing exact regional billing override
|
||||
the relevant entries.
|
||||
|
||||
## Cross-references
|
||||
|
||||
- Router + request-parser detail: [31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)
|
||||
- Bedrock parser + pricing + SSE / event-stream: [32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)
|
||||
- Catalog → route synthesis + `keyfile::` handling: [21-management-agentnetwork.md](21-management-agentnetwork.md)
|
||||
- Overview: [../00-overview.md](../00-overview.md)
|
||||
30
e2e/agentnetwork/bootstrap_test.go
Normal file
30
e2e/agentnetwork/bootstrap_test.go
Normal file
@@ -0,0 +1,30 @@
|
||||
//go:build e2e
|
||||
|
||||
package agentnetwork
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestCombinedBootstrap proves Pillar 1: the shared combined server came up and
|
||||
// the /api/setup-minted PAT authenticates a real management API call through
|
||||
// the typed REST client (the bootstrap itself ran in TestMain).
|
||||
func TestCombinedBootstrap(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
require.NotEmpty(t, srv.PAT, "TestMain must have minted an admin PAT")
|
||||
|
||||
users, err := srv.API().Users.List(ctx)
|
||||
require.NoError(t, err, "authenticated Users.List must round-trip")
|
||||
require.NotEmpty(t, users, "the bootstrapped account must have at least one user")
|
||||
|
||||
var emails []string
|
||||
for _, u := range users {
|
||||
emails = append(emails, u.Email)
|
||||
}
|
||||
assert.Contains(t, emails, "admin@netbird.test", "the bootstrapped owner should appear in the users list")
|
||||
}
|
||||
281
e2e/agentnetwork/chat_test.go
Normal file
281
e2e/agentnetwork/chat_test.go
Normal file
@@ -0,0 +1,281 @@
|
||||
//go:build e2e
|
||||
|
||||
package agentnetwork
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/netbirdio/netbird/e2e/harness"
|
||||
"github.com/netbirdio/netbird/shared/management/http/api"
|
||||
)
|
||||
|
||||
// providerCase describes one provider in the live matrix. Every case is driven
// through the same scenario; whether a case exists at all depends on which
// provider env vars are set, so the suite covers exactly the credentials that
// are present (source ~/.llm-keys locally, or set the Actions secrets in CI).
type providerCase struct {
	name      string
	catalogID string
	upstream  string
	apiKey    string
	// model is the body model (chat/messages) or the path model@version (vertex).
	model string
	// kind is harness.WireChat, harness.WireMessages, or harness.WireVertex.
	kind string
	// project is the GCP project for the rawPredict path (vertex only).
	project string
	// region is the GCP region for the rawPredict path (vertex only).
	region string
}
|
||||
|
||||
// availableProviders builds the matrix from the provider env vars that are set.
|
||||
func availableProviders() []providerCase {
|
||||
var ps []providerCase
|
||||
if k := os.Getenv("OPENAI_TOKEN"); k != "" {
|
||||
ps = append(ps, providerCase{name: "openai", catalogID: "openai_api", upstream: "https://api.openai.com", apiKey: k, model: "gpt-4o-mini", kind: harness.WireChat})
|
||||
}
|
||||
if k := os.Getenv("ANTHROPIC_TOKEN"); k != "" {
|
||||
ps = append(ps, providerCase{name: "anthropic", catalogID: "anthropic_api", upstream: "https://api.anthropic.com", apiKey: k, model: "claude-haiku-4-5", kind: harness.WireMessages})
|
||||
}
|
||||
if k, u := os.Getenv("VERCEL_TOKEN"), os.Getenv("VERCEL_URL"); k != "" && u != "" {
|
||||
ps = append(ps, providerCase{name: "vercel", catalogID: "vercel_ai_gateway", upstream: u, apiKey: k, model: "openai/gpt-4o-mini", kind: harness.WireChat})
|
||||
}
|
||||
if k, u := os.Getenv("OPENROUTER_TOKEN"), os.Getenv("OPENROUTER_URL"); k != "" && u != "" {
|
||||
// Distinct model string from Vercel so each provider routes unambiguously
|
||||
// while all are enabled together.
|
||||
ps = append(ps, providerCase{name: "openrouter", catalogID: "openrouter", upstream: u, apiKey: k, model: "openai/gpt-4o", kind: harness.WireChat})
|
||||
}
|
||||
if k, u := os.Getenv("CLOUDFLARE_TOKEN"), os.Getenv("CLOUDFLARE_URL"); k != "" && u != "" {
|
||||
// Cloudflare AI Gateway routes by a provider segment in the URL path;
|
||||
// append the openai provider unless the gateway URL already carries one.
|
||||
if !strings.Contains(u, "/openai") {
|
||||
u = strings.TrimRight(u, "/") + "/openai"
|
||||
}
|
||||
// Raw model (distinct string from OpenAI's gpt-4o-mini).
|
||||
ps = append(ps, providerCase{name: "cloudflare", catalogID: "cloudflare_ai_gateway", upstream: u, apiKey: k, model: "gpt-4o", kind: harness.WireChat})
|
||||
}
|
||||
// Vertex (vertex_ai_api): Anthropic-on-Vertex, path-routed, SA-OAuth
|
||||
// (api_key = keyfile::<SA>). The model travels in the rawPredict path rather
|
||||
// than the body, so the provider is created without a models array. Region
|
||||
// defaults to "global" (host aiplatform.googleapis.com); a real region uses
|
||||
// <region>-aiplatform.googleapis.com.
|
||||
if sa := os.Getenv("GOOGLE_VERTEX_SA_BASE64"); sa != "" {
|
||||
project := os.Getenv("GOOGLE_VERTEX_PROJECT")
|
||||
if project != "" {
|
||||
region := os.Getenv("GOOGLE_VERTEX_REGION")
|
||||
if region == "" {
|
||||
region = "global"
|
||||
}
|
||||
host := "aiplatform.googleapis.com"
|
||||
if region != "global" {
|
||||
host = region + "-aiplatform.googleapis.com"
|
||||
}
|
||||
model := os.Getenv("GOOGLE_VERTEX_MODEL")
|
||||
if model == "" {
|
||||
model = "claude-sonnet-4-5@20250929"
|
||||
}
|
||||
ps = append(ps, providerCase{
|
||||
name: "vertex", catalogID: "vertex_ai_api", upstream: "https://" + host,
|
||||
apiKey: "keyfile::" + sa, model: model, kind: harness.WireVertex,
|
||||
project: project, region: region,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Bedrock: path-routed, bearer auth. Model is a cross-region inference
|
||||
// profile id (distinct string from the first-party Anthropic case).
|
||||
if k := os.Getenv("AWS_BEARER_TOKEN_BEDROCK"); k != "" {
|
||||
region := os.Getenv("AWS_REGION")
|
||||
if region == "" {
|
||||
region = "us-east-1"
|
||||
}
|
||||
ps = append(ps, providerCase{name: "bedrock", catalogID: "bedrock_api", upstream: "https://bedrock-runtime." + region + ".amazonaws.com", apiKey: k, model: "us.anthropic.claude-haiku-4-5", kind: harness.WireMessages})
|
||||
}
|
||||
return ps
|
||||
}
|
||||
|
||||
// providerRequest builds a create request for a matrix provider: enabled, with
|
||||
// a uniquely-priced model for body-routed providers and none for the
|
||||
// path-routed Vertex (whose model lives in the request path).
|
||||
func providerRequest(pc providerCase) api.AgentNetworkProviderRequest {
|
||||
req := api.AgentNetworkProviderRequest{
|
||||
Name: pc.name,
|
||||
ProviderId: pc.catalogID,
|
||||
UpstreamUrl: pc.upstream,
|
||||
ApiKey: &pc.apiKey,
|
||||
Enabled: ptr(true),
|
||||
}
|
||||
if pc.kind != harness.WireVertex {
|
||||
req.Models = &[]api.AgentNetworkProviderModel{
|
||||
{Id: pc.model, InputPer1k: 0.001, OutputPer1k: 0.002},
|
||||
}
|
||||
}
|
||||
return req
|
||||
}
|
||||
|
||||
// TestProvidersMatrix is Pillar 3: it provisions every available provider (all
|
||||
// enabled, each with a unique model so routing stays unambiguous), runs proxy +
|
||||
// client once, and drives the same live chat-completion scenario through each
|
||||
// provider over the WireGuard tunnel. Each provider must return 200 and produce
|
||||
// an ingested access-log row.
|
||||
func TestProvidersMatrix(t *testing.T) {
|
||||
matrix := availableProviders()
|
||||
if len(matrix) == 0 {
|
||||
t.Skip("no provider keys set; source ~/.llm-keys to run the provider matrix")
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
// Group + setup key the client joins into; the policy authorizes it.
|
||||
grp, err := srv.API().Groups.Create(ctx, api.PostApiGroupsJSONRequestBody{Name: "e2e-agents"})
|
||||
require.NoError(t, err, "create agents group")
|
||||
t.Cleanup(func() { _ = srv.API().Groups.Delete(context.Background(), grp.Id) })
|
||||
|
||||
ephemeral := false
|
||||
sk, err := srv.API().SetupKeys.Create(ctx, api.PostApiSetupKeysJSONRequestBody{
|
||||
Name: "e2e-client",
|
||||
Type: "reusable",
|
||||
ExpiresIn: 86400,
|
||||
UsageLimit: 0,
|
||||
AutoGroups: []string{grp.Id},
|
||||
Ephemeral: &ephemeral,
|
||||
})
|
||||
require.NoError(t, err, "mint setup key")
|
||||
require.NotEmpty(t, sk.Key, "setup key plaintext")
|
||||
|
||||
// Create every provider, all enabled, each with a unique model string so the
|
||||
// proxy's connect-time snapshot carries them all and model→provider routing
|
||||
// is unambiguous (provider toggles after connect don't reconcile to the
|
||||
// proxy, so we enable everything up front). The first create bootstraps the
|
||||
// cluster.
|
||||
ids := make([]string, 0, len(matrix))
|
||||
for i, pc := range matrix {
|
||||
req := providerRequest(pc)
|
||||
if i == 0 {
|
||||
req.BootstrapCluster = ptr(harness.AgentNetworkCluster)
|
||||
}
|
||||
prov, perr := srv.CreateProvider(ctx, req)
|
||||
require.NoError(t, perr, "create provider %s", pc.name)
|
||||
ids = append(ids, prov.Id)
|
||||
id := prov.Id
|
||||
t.Cleanup(func() { _ = srv.DeleteProvider(context.Background(), id) })
|
||||
}
|
||||
|
||||
enabled := true
|
||||
pol, err := srv.CreatePolicy(ctx, api.AgentNetworkPolicyRequest{
|
||||
Name: "e2e-allow",
|
||||
Enabled: &enabled,
|
||||
SourceGroups: []string{grp.Id},
|
||||
DestinationProviderIds: ids,
|
||||
// Token limit at the 60s window floor with caps far above the few hundred
|
||||
// tokens this suite drives, so it never blocks traffic but switches on
|
||||
// usage metering, which is what makes consumption rows get recorded.
|
||||
Limits: &api.AgentNetworkPolicyLimits{
|
||||
TokenLimit: api.AgentNetworkPolicyTokenLimit{
|
||||
Enabled: true,
|
||||
GroupCap: 10_000_000,
|
||||
UserCap: 10_000_000,
|
||||
WindowSeconds: 60,
|
||||
},
|
||||
},
|
||||
})
|
||||
require.NoError(t, err, "create policy")
|
||||
t.Cleanup(func() { _ = srv.DeletePolicy(context.Background(), pol.Id) })
|
||||
|
||||
settings, err := srv.GetSettings(ctx)
|
||||
require.NoError(t, err, "read settings for endpoint")
|
||||
require.NotEmpty(t, settings.Endpoint, "agent-network endpoint must be assigned")
|
||||
|
||||
// Proxy (global CLI token) + client, brought up once.
|
||||
proxyToken, err := srv.CreateProxyTokenCLI(ctx, "e2e-proxy")
|
||||
require.NoError(t, err, "mint proxy token via CLI")
|
||||
px, err := harness.StartProxy(ctx, srv, proxyToken)
|
||||
require.NoError(t, err, "start proxy")
|
||||
t.Cleanup(func() { _ = px.Terminate(context.Background()) })
|
||||
|
||||
cl, err := harness.StartClient(ctx, srv, sk.Key)
|
||||
require.NoError(t, err, "start client")
|
||||
t.Cleanup(func() { _ = cl.Terminate(context.Background()) })
|
||||
|
||||
require.NoError(t, cl.WaitConnected(ctx, 90*time.Second), "client must connect to management")
|
||||
if err := cl.WaitProxyPeer(ctx, 180*time.Second); err != nil {
|
||||
t.Fatalf("client did not see the proxy peer: %v\n=== proxy logs ===\n%s", err, px.Logs(context.Background()))
|
||||
}
|
||||
proxyIP, err := cl.ResolveProxyIP(ctx, settings.Endpoint)
|
||||
require.NoError(t, err, "resolve agent-network endpoint to proxy IP")
|
||||
|
||||
for _, pc := range matrix {
|
||||
pc := pc
|
||||
t.Run(pc.name, func(t *testing.T) {
|
||||
before, _ := srv.ListAccessLogs(ctx)
|
||||
|
||||
// Unique per provider so we can find this provider's row by its
|
||||
// session id and confirm the marker propagated end-to-end.
|
||||
sessionID := "e2e-session-" + pc.name
|
||||
|
||||
// Retry briefly to absorb tunnel/DNS jitter on the first call.
|
||||
var code int
|
||||
var body string
|
||||
deadline := time.Now().Add(90 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
var c int
|
||||
var b string
|
||||
var cerr error
|
||||
if pc.kind == harness.WireVertex {
|
||||
c, b, cerr = cl.Vertex(ctx, settings.Endpoint, proxyIP, pc.project, pc.region, pc.model, "Reply with exactly: pong", sessionID)
|
||||
} else {
|
||||
c, b, cerr = cl.Chat(ctx, settings.Endpoint, proxyIP, pc.kind, pc.model, "Reply with exactly: pong", sessionID)
|
||||
}
|
||||
if cerr == nil {
|
||||
code, body = c, b
|
||||
if code == 200 {
|
||||
break
|
||||
}
|
||||
}
|
||||
time.Sleep(5 * time.Second)
|
||||
}
|
||||
require.Equal(t, 200, code, "chat through %s (%s %s) should return 200; body: %s", pc.name, pc.kind, pc.model, body)
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
logs, lerr := srv.ListAccessLogs(ctx)
|
||||
return lerr == nil && logs.TotalRecords > before.TotalRecords
|
||||
}, 30*time.Second, 2*time.Second, "an access-log row should be ingested for %s", pc.name)
|
||||
|
||||
// The session id sent as x-session-id must round-trip into the
|
||||
// access-log row for this provider.
|
||||
require.Eventually(t, func() bool {
|
||||
logs, lerr := srv.ListAccessLogs(ctx)
|
||||
if lerr != nil {
|
||||
return false
|
||||
}
|
||||
for _, r := range logs.Data {
|
||||
if r.SessionId != nil && *r.SessionId == sessionID {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}, 30*time.Second, 2*time.Second, "session id %q must be recorded in an access-log row for %s", sessionID, pc.name)
|
||||
})
|
||||
}
|
||||
|
||||
// Metering: the policy's uncapped token limit switches on usage recording,
|
||||
// so the live traffic just driven must surface as consumption rows with
|
||||
// positive token counts. Consumption is account-scoped (keyed by source
|
||||
// group / user and time window, not per provider), and ingest is async, so
|
||||
// poll for any row that has booked tokens.
|
||||
require.Eventually(t, func() bool {
|
||||
rows, lerr := srv.ListConsumption(ctx)
|
||||
if lerr != nil {
|
||||
return false
|
||||
}
|
||||
for _, r := range rows {
|
||||
if r.TokensInput > 0 && r.TokensOutput > 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}, 60*time.Second, 3*time.Second, "consumption must be recorded with positive token counts after live traffic")
|
||||
}
|
||||
46
e2e/agentnetwork/main_test.go
Normal file
46
e2e/agentnetwork/main_test.go
Normal file
@@ -0,0 +1,46 @@
|
||||
//go:build e2e
|
||||
|
||||
// Package agentnetwork holds the container-based agent-network e2e suite. A
|
||||
// single combined server is built and bootstrapped once per package run
|
||||
// (TestMain) and shared across tests via srv; each test creates and cleans up
|
||||
// its own resources so order doesn't matter.
|
||||
package agentnetwork
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/netbirdio/netbird/e2e/harness"
|
||||
)
|
||||
|
||||
// srv is the shared combined server for the package, ready (PAT-authenticated)
|
||||
// by the time any Test runs.
|
||||
var srv *harness.Combined
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
os.Exit(run(m))
|
||||
}
|
||||
|
||||
func run(m *testing.M) int {
|
||||
// Generous timeout to cover a cold image build on first run.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
var err error
|
||||
srv, err = harness.StartCombined(ctx)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "e2e: start combined server: %v\n", err)
|
||||
return 1
|
||||
}
|
||||
defer func() { _ = srv.Terminate(context.Background()) }()
|
||||
|
||||
if _, err := srv.Bootstrap(ctx); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "e2e: bootstrap admin PAT: %v\n", err)
|
||||
return 1
|
||||
}
|
||||
|
||||
return m.Run()
|
||||
}
|
||||
221
e2e/agentnetwork/management_test.go
Normal file
221
e2e/agentnetwork/management_test.go
Normal file
@@ -0,0 +1,221 @@
|
||||
//go:build e2e
|
||||
|
||||
package agentnetwork
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/netbirdio/netbird/e2e/harness"
|
||||
"github.com/netbirdio/netbird/shared/management/client/rest"
|
||||
"github.com/netbirdio/netbird/shared/management/http/api"
|
||||
)
|
||||
|
||||
// ptr returns a pointer to a fresh copy of v. It exists so optional (pointer)
// request fields can be filled from literals and constants in one expression.
func ptr[T any](v T) *T {
	out := new(T)
	*out = v
	return out
}
|
||||
|
||||
// newProvider creates an OpenAI-catalog provider with a dummy key (these tests
|
||||
// never call the upstream) and registers cleanup.
|
||||
func newProvider(t *testing.T, ctx context.Context, name string) api.AgentNetworkProvider {
|
||||
t.Helper()
|
||||
prov, err := srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
|
||||
Name: name,
|
||||
ProviderId: "openai_api",
|
||||
UpstreamUrl: "https://api.openai.com",
|
||||
ApiKey: ptr("sk-dummy-e2e-key"),
|
||||
BootstrapCluster: ptr("eu.proxy.netbird.test"),
|
||||
})
|
||||
require.NoError(t, err, "create provider %q", name)
|
||||
t.Cleanup(func() { _ = srv.DeleteProvider(context.Background(), prov.Id) })
|
||||
return prov
|
||||
}
|
||||
|
||||
// requireClientError asserts err is a REST APIError with a 4xx status.
|
||||
func requireClientError(t *testing.T, err error) {
|
||||
t.Helper()
|
||||
var apiErr *rest.APIError
|
||||
require.ErrorAs(t, err, &apiErr, "expected a REST APIError")
|
||||
assert.GreaterOrEqual(t, apiErr.StatusCode, 400, "expected a 4xx status")
|
||||
assert.Less(t, apiErr.StatusCode, 500, "expected a 4xx status")
|
||||
}
|
||||
|
||||
// TestProviderLifecycle covers create → get → list → delete → 404 for every
|
||||
// available real provider catalog (and a synthetic OpenAI provider when no
|
||||
// provider keys are set), so each catalog's create and field round-trip is
|
||||
// exercised. Create is offline — no upstream call — so this stays fast and
|
||||
// burns no provider quota.
|
||||
func TestProviderLifecycle(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
cases := availableProviders()
|
||||
if len(cases) == 0 {
|
||||
cases = []providerCase{{
|
||||
name: "openai", catalogID: "openai_api", upstream: "https://api.openai.com",
|
||||
apiKey: "sk-dummy-e2e-key", model: "gpt-4o-mini", kind: harness.WireChat,
|
||||
}}
|
||||
}
|
||||
|
||||
for i, pc := range cases {
|
||||
i, pc := i, pc
|
||||
t.Run(pc.name, func(t *testing.T) {
|
||||
req := providerRequest(pc)
|
||||
req.Name = "lc-" + pc.name
|
||||
// Bootstrap the cluster on the first create in case the matrix has
|
||||
// not run (e.g. no provider keys → settings not yet bootstrapped).
|
||||
if i == 0 {
|
||||
req.BootstrapCluster = ptr(harness.AgentNetworkCluster)
|
||||
}
|
||||
|
||||
prov, err := srv.CreateProvider(ctx, req)
|
||||
require.NoError(t, err, "create %s provider", pc.name)
|
||||
t.Cleanup(func() { _ = srv.DeleteProvider(context.Background(), prov.Id) })
|
||||
|
||||
assert.NotEmpty(t, prov.Id, "created provider must have an id")
|
||||
assert.Equal(t, pc.catalogID, prov.ProviderId, "catalog id must round-trip")
|
||||
assert.Equal(t, req.Name, prov.Name, "name must round-trip")
|
||||
assert.Equal(t, pc.upstream, prov.UpstreamUrl, "upstream must round-trip")
|
||||
|
||||
got, err := srv.GetProvider(ctx, prov.Id)
|
||||
require.NoError(t, err, "get provider")
|
||||
assert.Equal(t, prov.Id, got.Id)
|
||||
|
||||
list, err := srv.ListProviders(ctx)
|
||||
require.NoError(t, err, "list providers")
|
||||
var ids []string
|
||||
for _, p := range list {
|
||||
ids = append(ids, p.Id)
|
||||
}
|
||||
assert.Contains(t, ids, prov.Id, "created provider must appear in the list")
|
||||
|
||||
require.NoError(t, srv.DeleteProvider(ctx, prov.Id), "delete provider")
|
||||
_, err = srv.GetProvider(ctx, prov.Id)
|
||||
requireClientError(t, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestProviderValidation exercises the create-time validation rules. These are
|
||||
// uniform across catalogs (no per-provider required-field rules exist: a
|
||||
// catalog-specific malformed value such as a Vertex key without the keyfile::
|
||||
// prefix is accepted at create and only fails at the proxy), so the cases here
|
||||
// are catalog-agnostic: missing API key, unknown catalog id, an invalid upstream
|
||||
// URL, and a blank name.
|
||||
func TestProviderValidation(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
_, err := srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
|
||||
Name: "No Key",
|
||||
ProviderId: "openai_api",
|
||||
UpstreamUrl: "https://api.openai.com",
|
||||
})
|
||||
requireClientError(t, err)
|
||||
|
||||
_, err = srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
|
||||
Name: "Unknown Catalog",
|
||||
ProviderId: "totally_unknown_provider",
|
||||
UpstreamUrl: "https://example.com",
|
||||
ApiKey: ptr("sk-dummy"),
|
||||
})
|
||||
requireClientError(t, err)
|
||||
|
||||
_, err = srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
|
||||
Name: "Bad Upstream",
|
||||
ProviderId: "openai_api",
|
||||
UpstreamUrl: "not-a-url",
|
||||
ApiKey: ptr("sk-dummy"),
|
||||
})
|
||||
requireClientError(t, err)
|
||||
|
||||
_, err = srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
|
||||
Name: " ",
|
||||
ProviderId: "openai_api",
|
||||
UpstreamUrl: "https://api.openai.com",
|
||||
ApiKey: ptr("sk-dummy"),
|
||||
})
|
||||
requireClientError(t, err)
|
||||
}
|
||||
|
||||
// TestSettingsRoundTrip flips the collection toggles and confirms cluster /
|
||||
// subdomain stay immutable, then restores the original state.
|
||||
func TestSettingsRoundTrip(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
// Settings are bootstrapped on first provider create.
|
||||
newProvider(t, ctx, "Settings Bootstrap")
|
||||
|
||||
before, err := srv.GetSettings(ctx)
|
||||
require.NoError(t, err, "get settings")
|
||||
require.NotEmpty(t, before.Cluster, "settings must carry an assigned cluster")
|
||||
|
||||
flipped, err := srv.UpdateSettings(ctx, api.AgentNetworkSettingsRequest{
|
||||
EnableLogCollection: !before.EnableLogCollection,
|
||||
EnablePromptCollection: !before.EnablePromptCollection,
|
||||
RedactPii: !before.RedactPii,
|
||||
})
|
||||
require.NoError(t, err, "update settings")
|
||||
assert.Equal(t, !before.EnableLogCollection, flipped.EnableLogCollection, "log collection toggle must flip")
|
||||
assert.Equal(t, !before.EnablePromptCollection, flipped.EnablePromptCollection, "prompt collection toggle must flip")
|
||||
assert.Equal(t, before.Cluster, flipped.Cluster, "cluster must be immutable across updates")
|
||||
assert.Equal(t, before.Subdomain, flipped.Subdomain, "subdomain must be immutable across updates")
|
||||
|
||||
// Restore the original toggles.
|
||||
_, err = srv.UpdateSettings(ctx, api.AgentNetworkSettingsRequest{
|
||||
EnableLogCollection: before.EnableLogCollection,
|
||||
EnablePromptCollection: before.EnablePromptCollection,
|
||||
RedactPii: before.RedactPii,
|
||||
})
|
||||
require.NoError(t, err, "restore settings")
|
||||
}
|
||||
|
||||
// TestPolicyWindowFloor rejects an enabled limit below the 60s window floor and
|
||||
// accepts one at the floor.
|
||||
func TestPolicyWindowFloor(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
grp, err := srv.API().Groups.Create(ctx, api.PostApiGroupsJSONRequestBody{Name: "e2e-policy-grp"})
|
||||
require.NoError(t, err, "create source group")
|
||||
t.Cleanup(func() { _ = srv.API().Groups.Delete(context.Background(), grp.Id) })
|
||||
|
||||
prov := newProvider(t, ctx, "Policy Provider")
|
||||
|
||||
limits := func(window int64) *api.AgentNetworkPolicyLimits {
|
||||
return &api.AgentNetworkPolicyLimits{
|
||||
TokenLimit: api.AgentNetworkPolicyTokenLimit{
|
||||
Enabled: true,
|
||||
GroupCap: 1000,
|
||||
UserCap: 1000,
|
||||
WindowSeconds: window,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
_, err = srv.CreatePolicy(ctx, api.AgentNetworkPolicyRequest{
|
||||
Name: "e2e-below-floor",
|
||||
SourceGroups: []string{grp.Id},
|
||||
DestinationProviderIds: []string{prov.Id},
|
||||
Limits: limits(30),
|
||||
})
|
||||
requireClientError(t, err)
|
||||
|
||||
pol, err := srv.CreatePolicy(ctx, api.AgentNetworkPolicyRequest{
|
||||
Name: "e2e-at-floor",
|
||||
SourceGroups: []string{grp.Id},
|
||||
DestinationProviderIds: []string{prov.Id},
|
||||
Limits: limits(60),
|
||||
})
|
||||
require.NoError(t, err, "policy at the 60s floor must be accepted")
|
||||
assert.NotEmpty(t, pol.Id, "created policy must have an id")
|
||||
t.Cleanup(func() { _ = srv.DeletePolicy(context.Background(), pol.Id) })
|
||||
}
|
||||
|
||||
// TestConsumptionList confirms the read endpoint always returns an array, never
|
||||
// a 404/500.
|
||||
func TestConsumptionList(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
|
||||
rows, err := srv.ListConsumption(ctx)
|
||||
require.NoError(t, err, "consumption list must not error")
|
||||
assert.NotNil(t, rows, "consumption must be a JSON array (possibly empty)")
|
||||
}
|
||||
140
e2e/agentnetwork/skiptls_test.go
Normal file
140
e2e/agentnetwork/skiptls_test.go
Normal file
@@ -0,0 +1,140 @@
|
||||
//go:build e2e
|
||||
|
||||
package agentnetwork
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/netbirdio/netbird/e2e/harness"
|
||||
"github.com/netbirdio/netbird/shared/management/http/api"
|
||||
)
|
||||
|
||||
// TestProviderSkipTLSVerification proves skip_tls_verification is per-provider:
|
||||
// two providers share one self-signed upstream, one skipping TLS verification
|
||||
// and one not. The skip=true provider's chat reaches the upstream and returns
|
||||
// 200; the skip=false provider's chat fails at the TLS handshake — same
|
||||
// upstream, opposite outcome. This is the behaviour a target-level flag could
|
||||
// not give, since all of an account's providers share one synthesised target.
|
||||
func TestProviderSkipTLSVerification(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
up, err := harness.StartFakeUpstream(ctx, srv)
|
||||
require.NoError(t, err, "start self-signed upstream")
|
||||
t.Cleanup(func() { _ = up.Terminate(context.Background()) })
|
||||
|
||||
grp, err := srv.API().Groups.Create(ctx, api.PostApiGroupsJSONRequestBody{Name: "e2e-skiptls"})
|
||||
require.NoError(t, err, "create group")
|
||||
t.Cleanup(func() { _ = srv.API().Groups.Delete(context.Background(), grp.Id) })
|
||||
|
||||
ephemeral := false
|
||||
sk, err := srv.API().SetupKeys.Create(ctx, api.PostApiSetupKeysJSONRequestBody{
|
||||
Name: "e2e-skiptls-client",
|
||||
Type: "reusable",
|
||||
ExpiresIn: 86400,
|
||||
UsageLimit: 0,
|
||||
AutoGroups: []string{grp.Id},
|
||||
Ephemeral: &ephemeral,
|
||||
})
|
||||
require.NoError(t, err, "mint setup key")
|
||||
require.NotEmpty(t, sk.Key, "setup key plaintext")
|
||||
|
||||
const (
|
||||
insecureModel = "insecure-model"
|
||||
secureModel = "secure-model"
|
||||
)
|
||||
|
||||
// Two providers on the SAME self-signed upstream, distinguished only by their
|
||||
// skip_tls_verification and a unique model string so the router picks each
|
||||
// unambiguously.
|
||||
newReq := func(name, model string, skip bool) api.AgentNetworkProviderRequest {
|
||||
key := "sk-dummy-e2e"
|
||||
return api.AgentNetworkProviderRequest{
|
||||
Name: name,
|
||||
ProviderId: "openai_api",
|
||||
UpstreamUrl: up.URL,
|
||||
ApiKey: &key,
|
||||
Enabled: ptr(true),
|
||||
SkipTlsVerification: ptr(skip),
|
||||
Models: &[]api.AgentNetworkProviderModel{
|
||||
{Id: model, InputPer1k: 0.001, OutputPer1k: 0.002},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// First create bootstraps the account cluster.
|
||||
insecureReq := newReq("skip-tls", insecureModel, true)
|
||||
insecureReq.BootstrapCluster = ptr(harness.AgentNetworkCluster)
|
||||
insecureProv, err := srv.CreateProvider(ctx, insecureReq)
|
||||
require.NoError(t, err, "create skip-tls provider")
|
||||
t.Cleanup(func() { _ = srv.DeleteProvider(context.Background(), insecureProv.Id) })
|
||||
require.True(t, insecureProv.SkipTlsVerification, "response must echo skip_tls_verification=true")
|
||||
|
||||
secureProv, err := srv.CreateProvider(ctx, newReq("verify-tls", secureModel, false))
|
||||
require.NoError(t, err, "create verify-tls provider")
|
||||
t.Cleanup(func() { _ = srv.DeleteProvider(context.Background(), secureProv.Id) })
|
||||
require.False(t, secureProv.SkipTlsVerification, "response must echo skip_tls_verification=false")
|
||||
|
||||
enabled := true
|
||||
pol, err := srv.CreatePolicy(ctx, api.AgentNetworkPolicyRequest{
|
||||
Name: "e2e-skiptls-allow",
|
||||
Enabled: &enabled,
|
||||
SourceGroups: []string{grp.Id},
|
||||
DestinationProviderIds: []string{insecureProv.Id, secureProv.Id},
|
||||
})
|
||||
require.NoError(t, err, "create policy")
|
||||
t.Cleanup(func() { _ = srv.DeletePolicy(context.Background(), pol.Id) })
|
||||
|
||||
settings, err := srv.GetSettings(ctx)
|
||||
require.NoError(t, err, "read settings")
|
||||
require.NotEmpty(t, settings.Endpoint, "endpoint must be assigned")
|
||||
|
||||
proxyToken, err := srv.CreateProxyTokenCLI(ctx, "e2e-skiptls-proxy")
|
||||
require.NoError(t, err, "mint proxy token")
|
||||
px, err := harness.StartProxy(ctx, srv, proxyToken)
|
||||
require.NoError(t, err, "start proxy")
|
||||
t.Cleanup(func() { _ = px.Terminate(context.Background()) })
|
||||
|
||||
cl, err := harness.StartClient(ctx, srv, sk.Key)
|
||||
require.NoError(t, err, "start client")
|
||||
t.Cleanup(func() { _ = cl.Terminate(context.Background()) })
|
||||
|
||||
require.NoError(t, cl.WaitConnected(ctx, 90*time.Second), "client must connect to management")
|
||||
if err := cl.WaitProxyPeer(ctx, 180*time.Second); err != nil {
|
||||
t.Fatalf("client did not see the proxy peer: %v\n=== proxy logs ===\n%s", err, px.Logs(context.Background()))
|
||||
}
|
||||
proxyIP, err := cl.ResolveProxyIP(ctx, settings.Endpoint)
|
||||
require.NoError(t, err, "resolve endpoint to proxy IP")
|
||||
|
||||
// Positive: skip=true reaches the self-signed upstream. Retry to absorb
|
||||
// tunnel/DNS jitter on the first call; success also proves the path works.
|
||||
var code int
|
||||
var body string
|
||||
deadline := time.Now().Add(90 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
c, b, cerr := cl.Chat(ctx, settings.Endpoint, proxyIP, harness.WireChat, insecureModel, "Reply with exactly: pong", "e2e-skiptls-insecure")
|
||||
if cerr == nil {
|
||||
code, body = c, b
|
||||
if code == 200 {
|
||||
break
|
||||
}
|
||||
}
|
||||
time.Sleep(5 * time.Second)
|
||||
}
|
||||
require.Equal(t, 200, code,
|
||||
"skip_tls_verification=true must reach the self-signed upstream; body: %s\n=== upstream logs ===\n%s\n=== proxy logs ===\n%s",
|
||||
body, up.Logs(context.Background()), px.Logs(context.Background()))
|
||||
|
||||
// Negative: skip=false must fail the TLS handshake to the SAME upstream. The
|
||||
// path is already proven working, so a non-200 here is the cert rejection.
|
||||
secureCode, secureBody, cerr := cl.Chat(ctx, settings.Endpoint, proxyIP, harness.WireChat, secureModel, "Reply with exactly: pong", "e2e-skiptls-secure")
|
||||
require.NoError(t, cerr, "the chat call itself must complete (proxy returns an error status, not a transport error)")
|
||||
require.NotEqual(t, 200, secureCode,
|
||||
"skip_tls_verification=false must NOT reach the self-signed upstream; got %d, body: %s", secureCode, secureBody)
|
||||
require.GreaterOrEqual(t, secureCode, 500,
|
||||
"a TLS verification failure should surface as a 5xx from the proxy; got %d, body: %s", secureCode, secureBody)
|
||||
}
|
||||
24
e2e/harness/Dockerfile.client
Normal file
24
e2e/harness/Dockerfile.client
Normal file
@@ -0,0 +1,24 @@
|
||||
# NetBird client image for the e2e harness, compiled from this source tree.
# client/Dockerfile only packages a goreleaser artifact, so this file reuses
# its alpine runtime and entrypoint script but builds the CGO-free binary in a
# builder stage. BuildKit cache mounts keep repeated builds incremental.

# --- build stage ---------------------------------------------------------
FROM golang:1.25-bookworm AS builder
WORKDIR /src
# Module files are copied on their own so the download layer is reused until
# go.mod / go.sum change.
COPY go.mod go.sum ./
RUN --mount=type=cache,target=/go/pkg/mod go mod download
COPY . .
RUN --mount=type=cache,target=/go/pkg/mod \
    --mount=type=cache,target=/root/.cache/go-build \
    CGO_ENABLED=0 GOOS=linux go build -o /out/netbird ./client

# --- runtime stage -------------------------------------------------------
FROM alpine:3.24
RUN apk add --no-cache bash ca-certificates ip6tables iproute2 iptables
ENV NETBIRD_BIN="/usr/local/bin/netbird" \
    NB_LOG_FILE="console,/var/log/netbird/client.log" \
    NB_DAEMON_ADDR="unix:///var/run/netbird.sock" \
    NB_ENABLE_CAPTURE="false" \
    NB_ENTRYPOINT_SERVICE_TIMEOUT="30"
COPY client/netbird-entrypoint.sh /usr/local/bin/netbird-entrypoint.sh
COPY --from=builder /out/netbird /usr/local/bin/netbird
ENTRYPOINT [ "/usr/local/bin/netbird-entrypoint.sh" ]
|
||||
130
e2e/harness/agentnetwork.go
Normal file
130
e2e/harness/agentnetwork.go
Normal file
@@ -0,0 +1,130 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
|
||||
"github.com/netbirdio/netbird/shared/management/http/api"
|
||||
)
|
||||
|
||||
// The shared REST client doesn't (yet) expose typed agent-network methods, so
|
||||
// these helpers drive the /api/agent-network/* endpoints through the client's
|
||||
// NewRequest primitive — reusing its auth, error handling (rest.APIError on
|
||||
// non-2xx), and transport — while still speaking the generated api types.
|
||||
|
||||
// anRequest issues an agent-network API call and decodes the JSON response into
|
||||
// T. A non-2xx response surfaces as a *rest.APIError from the client, which
|
||||
// tests inspect for negative-path status assertions.
|
||||
func anRequest[T any](ctx context.Context, c *Combined, method, path string, body any) (T, error) {
|
||||
var out T
|
||||
var reader io.Reader
|
||||
if body != nil {
|
||||
bs, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return out, fmt.Errorf("marshal %s %s: %w", method, path, err)
|
||||
}
|
||||
reader = bytes.NewReader(bs)
|
||||
}
|
||||
|
||||
resp, err := c.api.NewRequest(ctx, method, path, reader, nil)
|
||||
if err != nil {
|
||||
return out, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return out, fmt.Errorf("decode %s %s response: %w", method, path, err)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// anDelete issues a DELETE and discards the (empty-object) body.
|
||||
func anDelete(ctx context.Context, c *Combined, path string) error {
|
||||
resp, err := c.api.NewRequest(ctx, http.MethodDelete, path, nil, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
resp.Body.Close()
|
||||
return nil
|
||||
}
|
||||
|
||||
// CreateProvider creates an agent-network provider.
|
||||
func (c *Combined) CreateProvider(ctx context.Context, req api.AgentNetworkProviderRequest) (api.AgentNetworkProvider, error) {
|
||||
return anRequest[api.AgentNetworkProvider](ctx, c, http.MethodPost, "/api/agent-network/providers", req)
|
||||
}
|
||||
|
||||
// GetProvider fetches a provider by id.
|
||||
func (c *Combined) GetProvider(ctx context.Context, id string) (api.AgentNetworkProvider, error) {
|
||||
return anRequest[api.AgentNetworkProvider](ctx, c, http.MethodGet, "/api/agent-network/providers/"+id, nil)
|
||||
}
|
||||
|
||||
// ListProviders returns all providers for the account.
|
||||
func (c *Combined) ListProviders(ctx context.Context) ([]api.AgentNetworkProvider, error) {
|
||||
return anRequest[[]api.AgentNetworkProvider](ctx, c, http.MethodGet, "/api/agent-network/providers", nil)
|
||||
}
|
||||
|
||||
// DeleteProvider removes a provider by id.
|
||||
func (c *Combined) DeleteProvider(ctx context.Context, id string) error {
|
||||
return anDelete(ctx, c, "/api/agent-network/providers/"+id)
|
||||
}
|
||||
|
||||
// SetProviderEnabled toggles a provider's enabled flag, preserving its other
|
||||
// fields (the API key is omitted, which keeps the stored one). Used to run one
|
||||
// provider at a time so model→provider routing is unambiguous.
|
||||
func (c *Combined) SetProviderEnabled(ctx context.Context, id string, enabled bool) error {
|
||||
p, err := c.GetProvider(ctx, id)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = anRequest[api.AgentNetworkProvider](ctx, c, http.MethodPut, "/api/agent-network/providers/"+id, api.AgentNetworkProviderRequest{
|
||||
Name: p.Name,
|
||||
ProviderId: p.ProviderId,
|
||||
UpstreamUrl: p.UpstreamUrl,
|
||||
Enabled: &enabled,
|
||||
Models: &p.Models,
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
// CreatePolicy creates an agent-network policy.
|
||||
func (c *Combined) CreatePolicy(ctx context.Context, req api.AgentNetworkPolicyRequest) (api.AgentNetworkPolicy, error) {
|
||||
return anRequest[api.AgentNetworkPolicy](ctx, c, http.MethodPost, "/api/agent-network/policies", req)
|
||||
}
|
||||
|
||||
// UpdatePolicy replaces a policy by id.
|
||||
func (c *Combined) UpdatePolicy(ctx context.Context, id string, req api.AgentNetworkPolicyRequest) (api.AgentNetworkPolicy, error) {
|
||||
return anRequest[api.AgentNetworkPolicy](ctx, c, http.MethodPut, "/api/agent-network/policies/"+id, req)
|
||||
}
|
||||
|
||||
// DeletePolicy removes a policy by id.
|
||||
func (c *Combined) DeletePolicy(ctx context.Context, id string) error {
|
||||
return anDelete(ctx, c, "/api/agent-network/policies/"+id)
|
||||
}
|
||||
|
||||
// GetSettings returns the account's agent-network settings row. It exists only
|
||||
// after the first provider create bootstraps it.
|
||||
func (c *Combined) GetSettings(ctx context.Context) (api.AgentNetworkSettings, error) {
|
||||
return anRequest[api.AgentNetworkSettings](ctx, c, http.MethodGet, "/api/agent-network/settings", nil)
|
||||
}
|
||||
|
||||
// UpdateSettings applies the mutable collection toggles.
|
||||
func (c *Combined) UpdateSettings(ctx context.Context, req api.AgentNetworkSettingsRequest) (api.AgentNetworkSettings, error) {
|
||||
return anRequest[api.AgentNetworkSettings](ctx, c, http.MethodPut, "/api/agent-network/settings", req)
|
||||
}
|
||||
|
||||
// ListConsumption returns the account's consumption rows (possibly empty).
|
||||
func (c *Combined) ListConsumption(ctx context.Context) ([]api.AgentNetworkConsumption, error) {
|
||||
return anRequest[[]api.AgentNetworkConsumption](ctx, c, http.MethodGet, "/api/agent-network/consumption", nil)
|
||||
}
|
||||
|
||||
// ListAccessLogs returns the account's agent-network access-log page (the
|
||||
// flattened per-request rows the proxy ships and management ingests).
|
||||
func (c *Combined) ListAccessLogs(ctx context.Context) (api.AgentNetworkAccessLogsResponse, error) {
|
||||
return anRequest[api.AgentNetworkAccessLogsResponse](ctx, c, http.MethodGet, "/api/agent-network/access-logs", nil)
|
||||
}
|
||||
47
e2e/harness/bootstrap.go
Normal file
47
e2e/harness/bootstrap.go
Normal file
@@ -0,0 +1,47 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/netbirdio/netbird/shared/management/client/rest"
|
||||
"github.com/netbirdio/netbird/shared/management/http/api"
|
||||
)
|
||||
|
||||
// Bootstrap creates the initial admin owner through the unauthenticated
|
||||
// /api/setup endpoint and returns the plaintext admin PAT. It also wires an
|
||||
// authenticated REST client on the Combined (see API). create_pat requires the
|
||||
// server to run with NB_SETUP_PAT_ENABLED=true, which the harness sets. A
|
||||
// second call returns an error (the server reports setup already completed).
|
||||
func (c *Combined) Bootstrap(ctx context.Context) (string, error) {
|
||||
// The setup endpoint is unauthenticated; use a tokenless client.
|
||||
setupClient := rest.NewWithOptions(rest.WithManagementURL(c.BaseURL))
|
||||
|
||||
createPAT := true
|
||||
expireDays := 1
|
||||
resp, err := setupClient.Instance.Setup(ctx, api.PostApiSetupJSONRequestBody{ //nolint:gosec // static throwaway test credentials
|
||||
Email: "admin@netbird.test",
|
||||
Password: "Netbird-e2e-Passw0rd!",
|
||||
Name: "E2E Admin",
|
||||
CreatePat: &createPAT,
|
||||
PatExpireIn: &expireDays,
|
||||
})
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("instance setup: %w", err)
|
||||
}
|
||||
if resp.PersonalAccessToken == nil || *resp.PersonalAccessToken == "" {
|
||||
return "", fmt.Errorf("setup succeeded but no PAT returned (is NB_SETUP_PAT_ENABLED set?)")
|
||||
}
|
||||
|
||||
c.PAT = *resp.PersonalAccessToken
|
||||
c.api = rest.New(c.BaseURL, c.PAT)
|
||||
return c.PAT, nil
|
||||
}
|
||||
|
||||
// API returns the PAT-authenticated management REST client. It is nil until
|
||||
// Bootstrap runs.
|
||||
func (c *Combined) API() *rest.Client {
|
||||
return c.api
|
||||
}
|
||||
66
e2e/harness/cert.go
Normal file
66
e2e/harness/cert.go
Normal file
@@ -0,0 +1,66 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
import (
|
||||
"crypto/ecdsa"
|
||||
"crypto/elliptic"
|
||||
"crypto/rand"
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/pem"
|
||||
"fmt"
|
||||
"math/big"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
)
|
||||
|
||||
// writeSelfSignedCert generates a self-signed ECDSA P-256 TLS cert/key pair
// covering the given DNS names and writes them as tls.crt / tls.key in dir.
// The first DNS name doubles as the certificate's CommonName, so dnsNames must
// not be empty; an empty slice is rejected with an error instead of panicking.
// The proxy serves this for the agent-network endpoint; the client curls with
// -k, so validity chains don't matter — the proxy just needs a usable cert to
// present.
//
// Both files are written world-readable (0o644). It returns a wrapped error
// for any generation, encoding or write failure.
func writeSelfSignedCert(dir string, dnsNames []string) error {
	if len(dnsNames) == 0 {
		return fmt.Errorf("create certificate: at least one DNS name is required")
	}

	priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
	if err != nil {
		return fmt.Errorf("generate key: %w", err)
	}

	// Random serial in [0, 2^128).
	serial, err := rand.Int(rand.Reader, new(big.Int).Lsh(big.NewInt(1), 128))
	if err != nil {
		return fmt.Errorf("generate serial: %w", err)
	}

	// Read the clock once so both validity bounds derive from the same
	// instant. NotBefore is backdated an hour to tolerate clock skew between
	// the host and the containers.
	now := time.Now()
	tmpl := x509.Certificate{
		SerialNumber:          serial,
		Subject:               pkix.Name{CommonName: dnsNames[0]},
		NotBefore:             now.Add(-time.Hour),
		NotAfter:              now.Add(365 * 24 * time.Hour),
		KeyUsage:              x509.KeyUsageDigitalSignature,
		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		DNSNames:              dnsNames,
		BasicConstraintsValid: true,
	}

	// Template and parent are the same certificate, i.e. self-signed.
	der, err := x509.CreateCertificate(rand.Reader, &tmpl, &tmpl, &priv.PublicKey, priv)
	if err != nil {
		return fmt.Errorf("create certificate: %w", err)
	}

	certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der})
	if err := os.WriteFile(filepath.Join(dir, "tls.crt"), certPEM, 0o644); err != nil { //nolint:gosec // public cert, bind-mounted and read by the proxy container
		return fmt.Errorf("write cert: %w", err)
	}

	keyDER, err := x509.MarshalECPrivateKey(priv)
	if err != nil {
		return fmt.Errorf("marshal key: %w", err)
	}
	keyPEM := pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER})
	// World-readable so the (non-root) proxy container can read the bind-mounted
	// key on Linux CI runners; this is a throwaway self-signed e2e key.
	if err := os.WriteFile(filepath.Join(dir, "tls.key"), keyPEM, 0o644); err != nil { //nolint:gosec // throwaway self-signed e2e key, must be readable by the proxy container uid
		return fmt.Errorf("write key: %w", err)
	}
	return nil
}
|
||||
296
e2e/harness/client.go
Normal file
296
e2e/harness/client.go
Normal file
@@ -0,0 +1,296 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/testcontainers/testcontainers-go"
|
||||
tcexec "github.com/testcontainers/testcontainers-go/exec"
|
||||
)
|
||||
|
||||
const (
	// clientDockerfile is the Dockerfile StartClient hands to resolveImage
	// for building the client image.
	clientDockerfile = "e2e/harness/Dockerfile.client"
	// defaultClientImage is the local tag used for the image built from
	// clientDockerfile. NB_E2E_CLIENT_IMAGE overrides it: a value containing
	// a "/" is pulled as a published image, while a bare tag is built under
	// that name.
	defaultClientImage = "netbird-client:e2e"
	// clientAlias is the client's alias on the shared docker network.
	clientAlias = "client"
	// curlImage is the image post runs to issue requests from inside the
	// client's network namespace.
	curlImage = "curlimages/curl:latest"
)
|
||||
|
||||
// Client is a running NetBird client container joined to the combined server.
|
||||
type Client struct {
|
||||
container testcontainers.Container
|
||||
}
|
||||
|
||||
// StartClient builds the client image and runs it on the combined server's
|
||||
// network, joining via the given setup key. The image entrypoint brings the
|
||||
// daemon up automatically; callers wait for connectivity with WaitConnected /
|
||||
// WaitProxyPeer.
|
||||
func StartClient(ctx context.Context, c *Combined, setupKey string) (*Client, error) {
|
||||
root, err := repoRoot()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
clientImage, err := resolveImage(ctx, root, "NB_E2E_CLIENT_IMAGE", defaultClientImage, clientDockerfile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req := testcontainers.ContainerRequest{
|
||||
Image: clientImage,
|
||||
Networks: []string{c.network.Name},
|
||||
NetworkAliases: map[string][]string{c.network.Name: {clientAlias}},
|
||||
Env: map[string]string{
|
||||
"NB_MANAGEMENT_URL": combinedExposedURL,
|
||||
"NB_SETUP_KEY": setupKey,
|
||||
"NB_LOG_LEVEL": "info",
|
||||
// Match the proxy: the combined relay is WebSocket-only, so the
|
||||
// client must use WS transport to keep a stable relay link to it.
|
||||
"NB_RELAY_TRANSPORT": "ws",
|
||||
},
|
||||
HostConfigModifier: func(hc *container.HostConfig) {
|
||||
hc.CapAdd = append(hc.CapAdd, "NET_ADMIN", "SYS_ADMIN", "SYS_RESOURCE")
|
||||
},
|
||||
}
|
||||
|
||||
ctr, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
|
||||
ContainerRequest: req,
|
||||
Started: true,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start client container: %w", err)
|
||||
}
|
||||
return &Client{container: ctr}, nil
|
||||
}
|
||||
|
||||
// Restart bounces the client connection (netbird down/up) so it pulls a fresh
|
||||
// network map — the documented workaround for a freshly-joined client not yet
|
||||
// seeing a synthesized agent-network service.
|
||||
func (cl *Client) Restart(ctx context.Context) error {
|
||||
if _, _, err := cl.container.Exec(ctx, []string{"netbird", "down"}, tcexec.Multiplexed()); err != nil {
|
||||
return fmt.Errorf("netbird down: %w", err)
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
code, reader, err := cl.container.Exec(ctx, []string{"netbird", "up"}, tcexec.Multiplexed())
|
||||
if err != nil {
|
||||
return fmt.Errorf("netbird up: %w", err)
|
||||
}
|
||||
if code != 0 {
|
||||
out, _ := io.ReadAll(reader)
|
||||
return fmt.Errorf("netbird up exited %d: %s", code, string(out))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Status returns `netbird status` output from inside the client.
|
||||
func (cl *Client) Status(ctx context.Context) (string, error) {
|
||||
code, reader, err := cl.container.Exec(ctx, []string{"netbird", "status"}, tcexec.Multiplexed())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
out, _ := io.ReadAll(reader)
|
||||
if code != 0 {
|
||||
return string(out), fmt.Errorf("netbird status exited %d", code)
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// WaitConnected polls until the client reports Management: Connected.
|
||||
func (cl *Client) WaitConnected(ctx context.Context, timeout time.Duration) error {
|
||||
return cl.pollStatus(ctx, timeout, "Management: Connected")
|
||||
}
|
||||
|
||||
// WaitProxyPeer polls until the client sees at least one connected peer — the
|
||||
// proxy serving the agent-network endpoint. It requires ">=1 connected" rather
|
||||
// than an exact "1/1" because proxy peers from earlier tests linger in the
|
||||
// account as disconnected (each proxy container registers a fresh WireGuard key
|
||||
// and the peer is not removed on teardown), so the count is e.g. "1/2". Only the
|
||||
// live proxy can be connected, and the caller's subsequent chat is the real
|
||||
// end-to-end assertion.
|
||||
func (cl *Client) WaitProxyPeer(ctx context.Context, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
var last string
|
||||
for time.Now().Before(deadline) {
|
||||
out, _ := cl.Status(ctx)
|
||||
last = out
|
||||
if connectedPeers(out) >= 1 {
|
||||
return nil
|
||||
}
|
||||
time.Sleep(3 * time.Second)
|
||||
}
|
||||
return fmt.Errorf("timed out waiting for a connected proxy peer; last status:\n%s", last)
|
||||
}
|
||||
|
||||
// connectedPeers extracts X from the first "Peers count: X/Y Connected" line
// of `netbird status` output. It returns 0 when no such line exists or when
// the first one has no parseable X before the slash.
func connectedPeers(status string) int {
	const prefix = "Peers count:"
	for _, raw := range strings.Split(status, "\n") {
		trimmed := strings.TrimSpace(raw)
		if !strings.HasPrefix(trimmed, prefix) {
			continue
		}
		// Only the first matching line is considered.
		counts := strings.TrimSpace(trimmed[len(prefix):])
		connected, _, found := strings.Cut(counts, "/")
		if !found || connected == "" {
			return 0
		}
		n, err := strconv.Atoi(strings.TrimSpace(connected))
		if err != nil {
			return 0
		}
		return n
	}
	return 0
}
|
||||
|
||||
func (cl *Client) pollStatus(ctx context.Context, timeout time.Duration, want string) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
var last string
|
||||
for time.Now().Before(deadline) {
|
||||
out, _ := cl.Status(ctx)
|
||||
last = out
|
||||
if strings.Contains(out, want) {
|
||||
return nil
|
||||
}
|
||||
time.Sleep(3 * time.Second)
|
||||
}
|
||||
return fmt.Errorf("timed out waiting for %q; last status:\n%s", want, last)
|
||||
}
|
||||
|
||||
// ResolveProxyIP resolves the agent-network endpoint to the proxy peer's
|
||||
// NetBird IP from inside the client (via magic DNS).
|
||||
func (cl *Client) ResolveProxyIP(ctx context.Context, endpoint string) (string, error) {
|
||||
code, reader, err := cl.container.Exec(ctx, []string{"getent", "hosts", endpoint}, tcexec.Multiplexed())
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
out, _ := io.ReadAll(reader)
|
||||
if code != 0 {
|
||||
return "", fmt.Errorf("getent hosts %s exited %d", endpoint, code)
|
||||
}
|
||||
fields := strings.Fields(string(out))
|
||||
if len(fields) == 0 {
|
||||
return "", fmt.Errorf("no address for %s", endpoint)
|
||||
}
|
||||
return fields[0], nil
|
||||
}
|
||||
|
||||
// Wire shapes understood by the client helpers. Chat handles WireMessages
// explicitly and sends every other value as WireChat; the Vertex shape is
// issued through the separate Vertex method.
const (
	// WireChat selects the OpenAI-compatible /v1/chat/completions shape.
	WireChat = "chat"
	// WireMessages selects the Anthropic /v1/messages shape.
	WireMessages = "messages"
	// WireVertex names the Anthropic-on-Vertex rawPredict shape, where the
	// client posts the full Vertex model path and the proxy mints the SA
	// OAuth token.
	WireVertex = "vertex"
)
|
||||
|
||||
// Chat issues a chat-completion POST to the agent-network endpoint over the
|
||||
// client's tunnel, returning the HTTP status and response body. kind selects
|
||||
// the wire shape: WireChat (OpenAI) or WireMessages (Anthropic). A non-empty
|
||||
// sessionID is sent as the universal x-session-id header the proxy records.
|
||||
func (cl *Client) Chat(ctx context.Context, endpoint, proxyIP, kind, model, prompt, sessionID string) (int, string, error) {
|
||||
var path, body string
|
||||
var headers []string
|
||||
switch kind {
|
||||
case WireMessages:
|
||||
path = "/v1/messages"
|
||||
headers = []string{"anthropic-version: 2023-06-01"}
|
||||
body = fmt.Sprintf(`{"model":%q,"max_tokens":64,"messages":[{"role":"user","content":%q}]}`, model, prompt)
|
||||
default:
|
||||
path = "/v1/chat/completions"
|
||||
body = fmt.Sprintf(`{"model":%q,"messages":[{"role":"user","content":%q}]}`, model, prompt)
|
||||
}
|
||||
return cl.post(ctx, endpoint, proxyIP, path, body, withSessionID(headers, sessionID))
|
||||
}
|
||||
|
||||
// Vertex issues an Anthropic-on-Vertex rawPredict POST over the tunnel. Unlike
|
||||
// Chat, the model is carried in the request path (project/region/model), so the
|
||||
// proxy routes by path and mints the service-account OAuth token; the body uses
|
||||
// the Vertex anthropic_version rather than a model field. A non-empty sessionID
|
||||
// is sent as the universal x-session-id header the proxy records.
|
||||
func (cl *Client) Vertex(ctx context.Context, endpoint, proxyIP, project, region, model, prompt, sessionID string) (int, string, error) {
|
||||
path := fmt.Sprintf("/v1/projects/%s/locations/%s/publishers/anthropic/models/%s:rawPredict", project, region, model)
|
||||
body := fmt.Sprintf(`{"anthropic_version":"vertex-2023-10-16","max_tokens":64,"messages":[{"role":"user","content":%q}]}`, prompt)
|
||||
return cl.post(ctx, endpoint, proxyIP, path, body, withSessionID(nil, sessionID))
|
||||
}
|
||||
|
||||
// withSessionID returns headers with an "x-session-id: <sessionID>" entry
// appended. An empty sessionID leaves headers untouched.
func withSessionID(headers []string, sessionID string) []string {
	if sessionID != "" {
		headers = append(headers, "x-session-id: "+sessionID)
	}
	return headers
}
|
||||
|
||||
// post runs curl in a throwaway container sharing the client's network
|
||||
// namespace so the request traverses the WireGuard tunnel, pinning the endpoint
|
||||
// to the proxy IP. It returns the HTTP status and response body.
|
||||
func (cl *Client) post(ctx context.Context, endpoint, proxyIP, path, body string, extraHeaders []string) (int, string, error) {
|
||||
url := "https://" + endpoint + path
|
||||
args := []string{
|
||||
"run", "--rm",
|
||||
"--network", "container:" + cl.container.GetContainerID(),
|
||||
curlImage,
|
||||
"-sk", "--connect-timeout", "5", "--max-time", "90",
|
||||
"--resolve", endpoint + ":443:" + proxyIP,
|
||||
"-o", "/dev/stderr", "-w", "%{http_code}",
|
||||
"-X", "POST", url,
|
||||
"-H", "Content-Type: application/json",
|
||||
}
|
||||
for _, h := range extraHeaders {
|
||||
args = append(args, "-H", h)
|
||||
}
|
||||
args = append(args, "--data", body)
|
||||
cmd := exec.CommandContext(ctx, "docker", args...)
|
||||
// -w writes the status code to stdout; -o /dev/stderr writes the body to
|
||||
// stderr so we can capture both separately.
|
||||
var stdout, stderr strings.Builder
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
if err := cmd.Run(); err != nil {
|
||||
return 0, stderr.String(), fmt.Errorf("curl through tunnel: %w", err)
|
||||
}
|
||||
|
||||
code := 0
|
||||
_, _ = fmt.Sscanf(strings.TrimSpace(stdout.String()), "%d", &code)
|
||||
return code, stderr.String(), nil
|
||||
}
|
||||
|
||||
// Logs returns the client container logs, for diagnostics on failure.
|
||||
func (cl *Client) Logs(ctx context.Context) string {
|
||||
return containerLogs(ctx, cl.container)
|
||||
}
|
||||
|
||||
// Terminate stops the client container.
|
||||
func (cl *Client) Terminate(ctx context.Context) error {
|
||||
if cl.container == nil {
|
||||
return nil
|
||||
}
|
||||
return cl.container.Terminate(ctx)
|
||||
}
|
||||
|
||||
// containerLogs reads up to 256 KiB of a container's logs for diagnostics.
|
||||
func containerLogs(ctx context.Context, c testcontainers.Container) string {
|
||||
if c == nil {
|
||||
return ""
|
||||
}
|
||||
r, err := c.Logs(ctx)
|
||||
if err != nil {
|
||||
return fmt.Sprintf("<logs error: %v>", err)
|
||||
}
|
||||
defer r.Close()
|
||||
b, _ := io.ReadAll(io.LimitReader(r, 256<<10))
|
||||
return string(b)
|
||||
}
|
||||
243
e2e/harness/combined.go
Normal file
243
e2e/harness/combined.go
Normal file
@@ -0,0 +1,243 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/docker/go-connections/nat"
|
||||
"github.com/testcontainers/testcontainers-go"
|
||||
tcexec "github.com/testcontainers/testcontainers-go/exec"
|
||||
"github.com/testcontainers/testcontainers-go/network"
|
||||
"github.com/testcontainers/testcontainers-go/wait"
|
||||
|
||||
"github.com/netbirdio/netbird/shared/management/client/rest"
|
||||
)
|
||||
|
||||
const (
	// combinedDockerfile is the Dockerfile StartCombined hands to
	// resolveImage for building the combined server image.
	combinedDockerfile = "combined/Dockerfile.multistage"
	// defaultCombinedImage is the local tag used for the image built from
	// combinedDockerfile, so the e2e run exercises this branch's code.
	// NB_E2E_COMBINED_IMAGE overrides it: a value containing a "/" is pulled
	// as a published image, while a bare tag is built under that name.
	defaultCombinedImage = "netbird-combined:e2e"
	// combinedHTTPPort is the container port exposed for the HTTP API.
	combinedHTTPPort = "8080/tcp"

	// combinedAlias is both the combined server's network alias and the
	// deployment domain. The working manual setup uses one NETBIRD_DOMAIN for
	// the management exposed address, the proxy domain and the agent-network
	// cluster, and the harness mirrors that: peers reach
	// management/signal/relay at this name, the proxy registers it as its
	// cluster, and the agent-network endpoint is <subdomain>.<combinedAlias>.
	combinedAlias = "netbird.local"
	// combinedExposedURL is the management URL given to containers on the
	// shared network.
	combinedExposedURL = "http://" + combinedAlias + ":8080"

	// containerIssuer is the embedded IdP issuer. It is only used for
	// internal JWT validation (peers authenticate with setup keys / proxy
	// tokens, not OIDC), so the in-container localhost address is enough.
	containerIssuer = "http://localhost:8080/oauth2"
)
|
||||
|
||||
// Combined is a running combined NetBird server (management + signal + relay +
|
||||
// STUN + embedded IdP) plus the connection details tests need. It owns the
|
||||
// shared docker network that the proxy and client containers join.
|
||||
type Combined struct {
|
||||
container testcontainers.Container
|
||||
network *testcontainers.DockerNetwork
|
||||
// BaseURL is the host-reachable management API root, e.g. http://127.0.0.1:51234.
|
||||
BaseURL string
|
||||
// PAT is the admin Personal Access Token minted via Bootstrap.
|
||||
PAT string
|
||||
|
||||
api *rest.Client
|
||||
workDir string
|
||||
}
|
||||
|
||||
// StartCombined builds the combined server from its multistage Dockerfile and
|
||||
// boots it with setup-PAT enabled on a fresh shared network, returning once the
|
||||
// API is serving. The caller still owns minting the admin PAT via Bootstrap.
|
||||
func StartCombined(ctx context.Context) (*Combined, error) {
|
||||
root, err := repoRoot()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
combinedImage, err := resolveImage(ctx, root, "NB_E2E_COMBINED_IMAGE", defaultCombinedImage, combinedDockerfile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
net, err := network.New(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create shared network: %w", err)
|
||||
}
|
||||
|
||||
// Work dir under /tmp so Docker Desktop file sharing (which excludes
|
||||
// macOS's /var/folders TMPDIR) can bind-mount it.
|
||||
workDir, err := os.MkdirTemp("/tmp", "nb-e2e-combined-*")
|
||||
if err != nil {
|
||||
_ = net.Remove(ctx)
|
||||
return nil, fmt.Errorf("create work dir: %w", err)
|
||||
}
|
||||
|
||||
cfg := fmt.Sprintf(combinedConfigYAML, combinedExposedURL, containerIssuer)
|
||||
if err := os.WriteFile(filepath.Join(workDir, "config.yaml"), []byte(cfg), 0o644); err != nil { //nolint:gosec // non-secret config, bind-mounted and read by the container
|
||||
_ = net.Remove(ctx)
|
||||
return nil, fmt.Errorf("write combined config: %w", err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(workDir, "data"), 0o755); err != nil {
|
||||
_ = net.Remove(ctx)
|
||||
return nil, fmt.Errorf("create datadir: %w", err)
|
||||
}
|
||||
|
||||
req := testcontainers.ContainerRequest{
|
||||
Image: combinedImage,
|
||||
ExposedPorts: []string{combinedHTTPPort},
|
||||
Networks: []string{net.Name},
|
||||
NetworkAliases: map[string][]string{net.Name: {combinedAlias}},
|
||||
Env: map[string]string{
|
||||
"NB_SETUP_PAT_ENABLED": "true",
|
||||
// Skip the GeoLite DB download — it blocks startup and agent-network
|
||||
// ingest doesn't use geolocation.
|
||||
"NB_DISABLE_GEOLOCATION": "true",
|
||||
},
|
||||
Cmd: []string{"--config", "/nb/config.yaml"},
|
||||
HostConfigModifier: func(hc *container.HostConfig) {
|
||||
hc.Binds = append(hc.Binds, workDir+":/nb")
|
||||
},
|
||||
WaitingFor: wait.ForHTTP("/api/instance").
|
||||
WithPort(combinedHTTPPort).
|
||||
WithStatusCodeMatcher(func(status int) bool { return status == 200 }).
|
||||
WithStartupTimeout(120 * time.Second),
|
||||
}
|
||||
|
||||
c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
|
||||
ContainerRequest: req,
|
||||
Started: true,
|
||||
})
|
||||
if err != nil {
|
||||
_ = net.Remove(ctx)
|
||||
return nil, fmt.Errorf("start combined container: %w", err)
|
||||
}
|
||||
|
||||
host, err := c.Host(ctx)
|
||||
if err != nil {
|
||||
_ = c.Terminate(ctx)
|
||||
_ = net.Remove(ctx)
|
||||
return nil, fmt.Errorf("container host: %w", err)
|
||||
}
|
||||
mapped, err := c.MappedPort(ctx, nat.Port(combinedHTTPPort))
|
||||
if err != nil {
|
||||
_ = c.Terminate(ctx)
|
||||
_ = net.Remove(ctx)
|
||||
return nil, fmt.Errorf("mapped port: %w", err)
|
||||
}
|
||||
|
||||
return &Combined{
|
||||
container: c,
|
||||
network: net,
|
||||
BaseURL: fmt.Sprintf("http://%s:%s", host, mapped.Port()),
|
||||
workDir: workDir,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// resolveImage returns the image to run for a component. By default it builds
|
||||
// the image from the repo Dockerfile under localTag, so the e2e exercises the
|
||||
// branch's code. The env override changes this: a value containing a "/" is a
|
||||
// registry reference that testcontainers pulls (e.g. to test a published
|
||||
// release); a bare tag is built under that name instead.
|
||||
func resolveImage(ctx context.Context, root, envKey, localTag, dockerfile string) (string, error) {
|
||||
if v := os.Getenv(envKey); v != "" {
|
||||
if strings.Contains(v, "/") {
|
||||
return v, nil
|
||||
}
|
||||
localTag = v
|
||||
}
|
||||
if err := buildImage(ctx, root, dockerfile, localTag); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return localTag, nil
|
||||
}
|
||||
|
||||
// buildImage runs `docker buildx build` for a repo Dockerfile with root as the
// build context, using BuildKit so the Dockerfile cache mounts are honored and
// unchanged layers are reused. The image is loaded into the docker image store
// under tag so testcontainers can run it by name.
//
// When NB_E2E_BUILDX_CACHE names a directory (CI, with a container-driver
// builder from docker/setup-buildx-action), the layer cache is read from and
// written to it as a local cache so actions/cache can persist it across runs;
// the Go compile itself still re-runs, as BuildKit mount caches can't be
// exported. On failure the returned error carries the combined build output.
func buildImage(ctx context.Context, root, dockerfile, tag string) error {
	buildArgs := []string{"buildx", "build", "-f", dockerfile, "-t", tag, "--load"}
	if cacheDir := os.Getenv("NB_E2E_BUILDX_CACHE"); cacheDir != "" {
		buildArgs = append(buildArgs, "--cache-from", "type=local,src="+cacheDir)
		buildArgs = append(buildArgs, "--cache-to", "type=local,dest="+cacheDir+",mode=max")
	}
	// The build context is the repo root (the command's working directory).
	buildArgs = append(buildArgs, ".")

	build := exec.CommandContext(ctx, "docker", buildArgs...)
	build.Dir = root
	build.Env = append(os.Environ(), "DOCKER_BUILDKIT=1")

	output, err := build.CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("build image %s: %w\n%s", tag, err, string(output))
}
|
||||
|
||||
// CreateProxyTokenCLI mints a proxy access token via the server's `token
|
||||
// create` CLI inside the container — the same path the manual install uses.
|
||||
// This yields a GLOBAL (account-less) token, so the proxy serves the whole
|
||||
// cluster (SynthesizeServicesForCluster); an account-scoped REST token instead
|
||||
// drives the per-account path. Returns the plaintext token.
|
||||
func (c *Combined) CreateProxyTokenCLI(ctx context.Context, name string) (string, error) {
|
||||
code, reader, err := c.container.Exec(ctx,
|
||||
[]string{"/go/bin/netbird-server", "token", "create", "--name", name, "--config", "/nb/config.yaml"},
|
||||
tcexec.Multiplexed())
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("exec token create: %w", err)
|
||||
}
|
||||
out, _ := io.ReadAll(reader)
|
||||
if code != 0 {
|
||||
return "", fmt.Errorf("token create exited %d: %s", code, string(out))
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if strings.HasPrefix(line, "Token:") {
|
||||
tok := strings.TrimSpace(strings.TrimPrefix(line, "Token:"))
|
||||
if tok != "" {
|
||||
return tok, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("token not found in CLI output: %s", string(out))
|
||||
}
|
||||
|
||||
// Logs returns the combined server container logs, for diagnostics.
|
||||
func (c *Combined) Logs(ctx context.Context) string {
|
||||
return containerLogs(ctx, c.container)
|
||||
}
|
||||
|
||||
// Terminate stops the container, removes the shared network, and cleans the
|
||||
// work dir.
|
||||
func (c *Combined) Terminate(ctx context.Context) error {
|
||||
var err error
|
||||
if c.container != nil {
|
||||
err = c.container.Terminate(ctx)
|
||||
}
|
||||
if c.network != nil {
|
||||
_ = c.network.Remove(ctx)
|
||||
}
|
||||
if c.workDir != "" {
|
||||
_ = os.RemoveAll(c.workDir)
|
||||
}
|
||||
return err
|
||||
}
|
||||
26
e2e/harness/config.go
Normal file
26
e2e/harness/config.go
Normal file
@@ -0,0 +1,26 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
// combinedConfigYAML is a minimal combined-server config for tests: plain HTTP
// on :8080 (no TLS cert configured → the server serves HTTP and expects to sit
// behind a reverse proxy, which is exactly what we want for in-cluster tests),
// embedded IdP, local signal/relay/STUN, and a sqlite store under the mounted
// data dir.
//
// It is a fmt template with two %s verbs, in order: the exposed address peers
// use to reach this container (set per run so it matches the container's
// network alias) and the embedded IdP issuer URL.
//
// NOTE(review): the YAML nesting (every key under `server:`, with `auth` and
// `store` as nested maps) was reconstructed — confirm against the combined
// server's config schema.
const combinedConfigYAML = `server:
  listenAddress: ":8080"
  exposedAddress: "%s"
  healthcheckAddress: ":9000"
  metricsPort: 9090
  logLevel: "info"
  logFile: "console"
  authSecret: "e2e-relay-secret"
  dataDir: "/nb/data"
  disableAnonymousMetrics: true
  disableGeoliteUpdate: true
  auth:
    issuer: "%s"
  store:
    engine: "sqlite"
`
|
||||
13
e2e/harness/doc.go
Normal file
13
e2e/harness/doc.go
Normal file
@@ -0,0 +1,13 @@
|
||||
//go:build e2e
|
||||
|
||||
// Package harness provides a self-contained, OIDC-free way to stand up NetBird
|
||||
// components in containers for end-to-end tests. It is feature-agnostic: any
|
||||
// suite can ask for a live management server (with an admin PAT minted through
|
||||
// the unauthenticated /api/setup bootstrap) and, later, a proxy and client.
|
||||
//
|
||||
// The harness builds each component image from the repository's multistage
// Dockerfile via docker buildx (or, when the matching NB_E2E_*_IMAGE override
// names a registry reference, lets testcontainers pull a published image), so
// a run exercises the branch's code while unchanged layers are reused from the
// build cache. Everything is gated behind the `e2e` build tag so normal builds
// and unit tests never pull in testcontainers.
|
||||
package harness
|
||||
29
e2e/harness/paths.go
Normal file
29
e2e/harness/paths.go
Normal file
@@ -0,0 +1,29 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// repoRoot walks up from the working directory to the module root (the
// nearest directory holding a go.mod file), so the Docker build context is
// correct no matter which package the test runs from.
//
// It returns an error when the working directory cannot be determined or when
// no go.mod is found in it or any of its parents.
func repoRoot() (string, error) {
	start, err := os.Getwd()
	if err != nil {
		return "", fmt.Errorf("get working directory: %w", err)
	}

	dir := start
	for {
		// A directory that merely happens to be named go.mod does not mark
		// a module root, so require a non-directory entry.
		if info, statErr := os.Stat(filepath.Join(dir, "go.mod")); statErr == nil && !info.IsDir() {
			return dir, nil
		}
		parent := filepath.Dir(dir)
		if parent == dir {
			// Reached the filesystem root. Report where the search began,
			// not the root itself.
			return "", fmt.Errorf("go.mod not found in or above %s", start)
		}
		dir = parent
	}
}
|
||||
122
e2e/harness/proxy.go
Normal file
122
e2e/harness/proxy.go
Normal file
@@ -0,0 +1,122 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/testcontainers/testcontainers-go"
|
||||
"github.com/testcontainers/testcontainers-go/wait"
|
||||
)
|
||||
|
||||
const (
|
||||
proxyDockerfile = "proxy/Dockerfile.multistage"
|
||||
// defaultProxyImage is the local tag the reverse proxy is built under from
|
||||
// proxyDockerfile. Override with NB_E2E_PROXY_IMAGE: a value with a "/" is
|
||||
// pulled as a published image; a bare tag is built under that name.
|
||||
defaultProxyImage = "netbird-reverse-proxy:e2e"
|
||||
proxyAlias = "proxy"
|
||||
|
||||
// AgentNetworkCluster is the proxy cluster the e2e provider bootstraps and
|
||||
// the proxy serves. It must equal the management's exposed domain
|
||||
// (combinedAlias) — the working manual setup uses one NETBIRD_DOMAIN for
|
||||
// both. The agent-network endpoint is <subdomain>.<cluster>.
|
||||
AgentNetworkCluster = combinedAlias
|
||||
)
|
||||
|
||||
// Proxy is a running agent-network gateway (netbird proxy) container.
|
||||
type Proxy struct {
|
||||
container testcontainers.Container
|
||||
workDir string
|
||||
}
|
||||
|
||||
// StartProxy builds the proxy image and runs it on the combined server's
|
||||
// network, registered via the given account proxy token and serving the
|
||||
// AgentNetworkCluster over a self-signed wildcard cert. It does not wait for
|
||||
// peer connectivity — callers poll management for the proxy peer.
|
||||
func StartProxy(ctx context.Context, c *Combined, proxyToken string) (*Proxy, error) {
|
||||
root, err := repoRoot()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
proxyImage, err := resolveImage(ctx, root, "NB_E2E_PROXY_IMAGE", defaultProxyImage, proxyDockerfile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
workDir, err := os.MkdirTemp("/tmp", "nb-e2e-proxy-*")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create proxy work dir: %w", err)
|
||||
}
|
||||
// MkdirTemp creates the dir 0700; widen it so the non-root proxy container
|
||||
// can traverse the bind-mounted cert dir on Linux CI runners.
|
||||
if err := os.Chmod(workDir, 0o755); err != nil { //nolint:gosec // throwaway e2e cert dir, must be traversable by the proxy container uid
|
||||
return nil, fmt.Errorf("chmod proxy cert dir: %w", err)
|
||||
}
|
||||
if err := writeSelfSignedCert(workDir, []string{"*." + AgentNetworkCluster, AgentNetworkCluster}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req := testcontainers.ContainerRequest{
|
||||
Image: proxyImage,
|
||||
Networks: []string{c.network.Name},
|
||||
NetworkAliases: map[string][]string{c.network.Name: {proxyAlias}},
|
||||
Env: map[string]string{
|
||||
"NB_PROXY_TOKEN": proxyToken,
|
||||
"NB_PROXY_MANAGEMENT_ADDRESS": combinedExposedURL,
|
||||
"NB_PROXY_DOMAIN": AgentNetworkCluster,
|
||||
"NB_PROXY_ADDRESS": ":443",
|
||||
"NB_PROXY_CERTIFICATE_DIRECTORY": "/certs",
|
||||
"NB_PROXY_HEALTH_ADDRESS": ":8081",
|
||||
"NB_PROXY_LOG_LEVEL": "debug",
|
||||
"NB_PROXY_PRIVATE": "true",
|
||||
// Management is plain HTTP in-cluster, so allow the proxy token to
|
||||
// ride a non-TLS gRPC connection.
|
||||
"NB_PROXY_ALLOW_INSECURE": "true",
|
||||
// The combined server multiplexes the relay over WebSocket on :8080
|
||||
// (no QUIC listener). The proxy's embedded relay client defaults to
|
||||
// QUIC, which fails here and flaps the relay link, churning the
|
||||
// proxy peer so it never stably registers. Force WS transport.
|
||||
"NB_RELAY_TRANSPORT": "ws",
|
||||
// Trace the embedded client (relay / signal / handshake) so
|
||||
// peer-registration issues are visible in the proxy logs.
|
||||
"NB_PROXY_CLIENT_LOG_LEVEL": "trace",
|
||||
},
|
||||
HostConfigModifier: func(hc *container.HostConfig) {
|
||||
hc.Binds = append(hc.Binds, workDir+":/certs")
|
||||
hc.CapAdd = append(hc.CapAdd, "NET_ADMIN", "SYS_ADMIN", "SYS_RESOURCE", "NET_BIND_SERVICE")
|
||||
},
|
||||
WaitingFor: wait.ForLog("Initial mapping sync complete").WithStartupTimeout(90 * time.Second),
|
||||
}
|
||||
|
||||
ctr, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
|
||||
ContainerRequest: req,
|
||||
Started: true,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start proxy container: %w", err)
|
||||
}
|
||||
|
||||
return &Proxy{container: ctr, workDir: workDir}, nil
|
||||
}
|
||||
|
||||
// Logs returns the proxy container logs, for diagnostics on failure.
|
||||
func (p *Proxy) Logs(ctx context.Context) string {
|
||||
return containerLogs(ctx, p.container)
|
||||
}
|
||||
|
||||
// Terminate stops the proxy container and cleans its work dir.
|
||||
func (p *Proxy) Terminate(ctx context.Context) error {
|
||||
var err error
|
||||
if p.container != nil {
|
||||
err = p.container.Terminate(ctx)
|
||||
}
|
||||
if p.workDir != "" {
|
||||
_ = os.RemoveAll(p.workDir)
|
||||
}
|
||||
return err
|
||||
}
|
||||
107
e2e/harness/upstream.go
Normal file
107
e2e/harness/upstream.go
Normal file
@@ -0,0 +1,107 @@
|
||||
//go:build e2e
|
||||
|
||||
package harness
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types/container"
|
||||
"github.com/testcontainers/testcontainers-go"
|
||||
"github.com/testcontainers/testcontainers-go/wait"
|
||||
)
|
||||
|
||||
// Image, network alias and listening port of the fake HTTPS upstream.
const (
	fakeUpstreamImage = "nginx:alpine"
	fakeUpstreamAlias = "fakeupstream"
	fakeUpstreamPort  = "443/tcp"
)
|
||||
|
||||
// fakeUpstreamNginxConf is the nginx configuration for the fake upstream. It
// serves a canned OpenAI-shaped chat completion for any path over the
// self-signed certificate at /certs/tls.crt, so the proxy reaches it only when
// the provider opts into skipping TLS verification. The pid file lives under
// /tmp rather than nginx's default location.
const fakeUpstreamNginxConf = `pid /tmp/nginx.pid;
events {}
http {
  server {
    listen 443 ssl;
    ssl_certificate     /certs/tls.crt;
    ssl_certificate_key /certs/tls.key;
    location / {
      default_type application/json;
      return 200 '{"id":"chatcmpl-e2e","object":"chat.completion","choices":[{"index":0,"message":{"role":"assistant","content":"pong"},"finish_reason":"stop"}],"usage":{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2}}';
    }
  }
}
`
|
||||
|
||||
// FakeUpstream is a self-signed HTTPS server on the combined server's network,
|
||||
// used to exercise provider skip_tls_verification: a proxy that verifies the
|
||||
// certificate rejects it, one that skips verification reaches it.
|
||||
type FakeUpstream struct {
|
||||
container testcontainers.Container
|
||||
workDir string
|
||||
// URL is the upstream URL providers point at (https://<alias>).
|
||||
URL string
|
||||
}
|
||||
|
||||
// StartFakeUpstream runs the self-signed upstream on the shared network.
|
||||
func StartFakeUpstream(ctx context.Context, c *Combined) (*FakeUpstream, error) {
|
||||
workDir, err := os.MkdirTemp("/tmp", "nb-e2e-upstream-*")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create upstream work dir: %w", err)
|
||||
}
|
||||
// Widen so the (non-root worker) nginx container can traverse the bind mount.
|
||||
if err := os.Chmod(workDir, 0o755); err != nil { //nolint:gosec // throwaway e2e cert dir
|
||||
return nil, fmt.Errorf("chmod upstream dir: %w", err)
|
||||
}
|
||||
if err := writeSelfSignedCert(workDir, []string{fakeUpstreamAlias}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(workDir, "nginx.conf"), []byte(fakeUpstreamNginxConf), 0o644); err != nil { //nolint:gosec // non-secret e2e config
|
||||
return nil, fmt.Errorf("write nginx conf: %w", err)
|
||||
}
|
||||
|
||||
req := testcontainers.ContainerRequest{
|
||||
Image: fakeUpstreamImage,
|
||||
ExposedPorts: []string{fakeUpstreamPort},
|
||||
Networks: []string{c.network.Name},
|
||||
NetworkAliases: map[string][]string{c.network.Name: {fakeUpstreamAlias}},
|
||||
Cmd: []string{"nginx", "-c", "/certs/nginx.conf", "-g", "daemon off;"},
|
||||
HostConfigModifier: func(hc *container.HostConfig) {
|
||||
hc.Binds = append(hc.Binds, workDir+":/certs:ro")
|
||||
},
|
||||
WaitingFor: wait.ForListeningPort(fakeUpstreamPort).WithStartupTimeout(60 * time.Second),
|
||||
}
|
||||
|
||||
ctr, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
|
||||
ContainerRequest: req,
|
||||
Started: true,
|
||||
})
|
||||
if err != nil {
|
||||
_ = os.RemoveAll(workDir)
|
||||
return nil, fmt.Errorf("start fake upstream container: %w", err)
|
||||
}
|
||||
|
||||
return &FakeUpstream{container: ctr, workDir: workDir, URL: "https://" + fakeUpstreamAlias}, nil
|
||||
}
|
||||
|
||||
// Logs returns the upstream container logs, for diagnostics on failure.
|
||||
func (u *FakeUpstream) Logs(ctx context.Context) string {
|
||||
return containerLogs(ctx, u.container)
|
||||
}
|
||||
|
||||
// Terminate stops the upstream container and cleans its work dir.
|
||||
func (u *FakeUpstream) Terminate(ctx context.Context) error {
|
||||
var err error
|
||||
if u.container != nil {
|
||||
err = u.container.Terminate(ctx)
|
||||
}
|
||||
if u.workDir != "" {
|
||||
_ = os.RemoveAll(u.workDir)
|
||||
}
|
||||
return err
|
||||
}
|
||||
6
go.mod
6
go.mod
@@ -35,6 +35,7 @@ require (
|
||||
github.com/DeRuina/timberjack v1.4.2
|
||||
github.com/awnumar/memguard v0.23.0
|
||||
github.com/aws/aws-sdk-go-v2 v1.38.3
|
||||
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1
|
||||
github.com/aws/aws-sdk-go-v2/config v1.31.6
|
||||
github.com/aws/aws-sdk-go-v2/credentials v1.18.10
|
||||
github.com/aws/aws-sdk-go-v2/service/s3 v1.87.3
|
||||
@@ -49,6 +50,8 @@ require (
|
||||
github.com/crowdsecurity/go-cs-bouncer v0.0.21
|
||||
github.com/dexidp/dex v2.13.0+incompatible
|
||||
github.com/dexidp/dex/api/v2 v2.4.0
|
||||
github.com/docker/docker v28.0.1+incompatible
|
||||
github.com/docker/go-connections v0.6.0
|
||||
github.com/ebitengine/purego v0.8.4
|
||||
github.com/eko/gocache/lib/v4 v4.2.0
|
||||
github.com/eko/gocache/store/go_cache/v4 v4.2.2
|
||||
@@ -158,7 +161,6 @@ require (
|
||||
github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
|
||||
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
|
||||
github.com/awnumar/memcall v0.4.0 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.6 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 // indirect
|
||||
@@ -188,8 +190,6 @@ require (
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
|
||||
github.com/distribution/reference v0.6.0 // indirect
|
||||
github.com/docker/docker v28.0.1+incompatible // indirect
|
||||
github.com/docker/go-connections v0.6.0 // indirect
|
||||
github.com/docker/go-units v0.5.0 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/fredbi/uri v1.1.1 // indirect
|
||||
|
||||
@@ -9,6 +9,8 @@ set -o pipefail
|
||||
|
||||
SED_STRIP_PADDING='s/=//g'
|
||||
|
||||
NETBIRD_EULA_URL="https://netbird.io/self-hosted-EULA"
|
||||
|
||||
check_docker_compose() {
|
||||
if command -v docker-compose &> /dev/null; then
|
||||
echo "docker-compose"
|
||||
@@ -139,6 +141,43 @@ read_yes_no() {
|
||||
esac
|
||||
}
|
||||
|
||||
# Gate the install on explicit acceptance of the NetBird On-Premise EULA.
#
# Prints the notice to stderr, then either honours NB_ACCEPT_EULA=yes (for
# non-interactive installs) or prompts on the controlling terminal.
# Returns 0 on acceptance; exits the script with status 1 otherwise.
require_eula_acceptance() {
  # Write with >&2 rather than "> /dev/stderr": the latter reopens the file
  # behind stderr, which truncates a redirected log file and can fail outright
  # when that file is not reopenable by the current user.
  cat >&2 <<EOF

──────────────────────────────────────────────────────────────────────
NetBird On-Premise End User License Agreement
──────────────────────────────────────────────────────────────────────
NetBird's on-premise software is commercial software, licensed and not
sold. Your installation, deployment and use are governed by the NetBird
On-Premise End User License Agreement (the "EULA"). Please read the EULA
in full before continuing:

${NETBIRD_EULA_URL}

By typing "accept" and continuing the installation, you confirm that you
have read and agree to the EULA, that you are authorized to accept it on
behalf of your organization (the "Customer"), and that the Software is
used for business purposes only.
──────────────────────────────────────────────────────────────────────
EOF

  if [[ "${NB_ACCEPT_EULA:-}" == "yes" ]]; then
    echo "EULA accepted via NB_ACCEPT_EULA=yes." >&2
    return 0
  fi

  local ans=""
  echo -n 'Type "accept" to agree, or anything else to abort: ' >&2
  # Without a controlling terminal (CI, piped install) the read cannot
  # succeed; explain the non-interactive option instead of failing silently.
  if ! read -r ans < /dev/tty; then
    echo "" >&2
    echo "No terminal available to confirm the EULA. Re-run with NB_ACCEPT_EULA=yes to accept non-interactively." >&2
    exit 1
  fi
  if [[ "$ans" != "accept" ]]; then
    echo "" >&2
    echo "EULA not accepted. Aborting installation." >&2
    exit 1
  fi
  echo "" >&2
}
|
||||
|
||||
wait_postgres() {
|
||||
set +e
|
||||
echo -n "Waiting for postgres to become ready"
|
||||
@@ -174,6 +213,9 @@ init_environment() {
|
||||
exit 1
|
||||
fi
|
||||
|
||||
require_eula_acceptance
|
||||
NETBIRD_EULA_ACCEPTED_AT=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
|
||||
echo "NetBird Enterprise bootstrap"
|
||||
echo ""
|
||||
echo "Traffic flow:"
|
||||
@@ -260,6 +302,11 @@ render_env() {
|
||||
# Generated by getting-started-enterprise.sh
|
||||
# Holds all configuration and secrets for the stack. Mode 600.
|
||||
|
||||
# NetBird On-Premise EULA acceptance
|
||||
NETBIRD_EULA_ACCEPTED=yes
|
||||
NETBIRD_EULA_ACCEPTED_AT=${NETBIRD_EULA_ACCEPTED_AT}
|
||||
NETBIRD_EULA_URL=${NETBIRD_EULA_URL}
|
||||
|
||||
# Features (set by the script; don't edit without re-running)
|
||||
NETBIRD_TRAFFIC_FLOW_ENABLED=${NETBIRD_TRAFFIC_FLOW}
|
||||
|
||||
|
||||
@@ -25,6 +25,8 @@ set -o pipefail
|
||||
OVERRIDE_FILE="docker-compose.override.yml"
|
||||
ENTERPRISE_CONFIG_FILE="config.yaml.enterprise"
|
||||
|
||||
NETBIRD_EULA_URL="https://netbird.io/self-hosted-EULA"
|
||||
|
||||
check_docker_compose() {
|
||||
if command -v docker-compose &> /dev/null; then
|
||||
echo "docker-compose"
|
||||
@@ -115,6 +117,43 @@ read_yes_no() {
|
||||
esac
|
||||
}
|
||||
|
||||
# Gate the migration on explicit acceptance of the NetBird On-Premise EULA.
#
# Prints the notice to stderr, then either honours NB_ACCEPT_EULA=yes (for
# non-interactive runs) or prompts on the controlling terminal.
# Returns 0 on acceptance; exits the script with status 1 otherwise.
require_eula_acceptance() {
  # Write with >&2 rather than "> /dev/stderr": the latter reopens the file
  # behind stderr, which truncates a redirected log file and can fail outright
  # when that file is not reopenable by the current user.
  cat >&2 <<EOF

──────────────────────────────────────────────────────────────────────
NetBird On-Premise End User License Agreement
──────────────────────────────────────────────────────────────────────
NetBird's on-premise software is commercial software, licensed and not
sold. Your installation, deployment and use are governed by the NetBird
On-Premise End User License Agreement (the "EULA"). Please read the EULA
in full before continuing:

${NETBIRD_EULA_URL}

By typing "accept" and continuing the installation, you confirm that you
have read and agree to the EULA, that you are authorized to accept it on
behalf of your organization (the "Customer"), and that the Software is
used for business purposes only.
──────────────────────────────────────────────────────────────────────
EOF

  if [[ "${NB_ACCEPT_EULA:-}" == "yes" ]]; then
    echo "EULA accepted via NB_ACCEPT_EULA=yes." >&2
    return 0
  fi

  local ans=""
  echo -n 'Type "accept" to agree, or anything else to abort: ' >&2
  # Without a controlling terminal (CI, piped run) the read cannot succeed;
  # explain the non-interactive option instead of failing silently.
  if ! read -r ans < /dev/tty; then
    echo "" >&2
    echo "No terminal available to confirm the EULA. Re-run with NB_ACCEPT_EULA=yes to accept non-interactively." >&2
    exit 1
  fi
  if [[ "$ans" != "accept" ]]; then
    echo "" >&2
    echo "EULA not accepted. Aborting migration." >&2
    exit 1
  fi
  echo "" >&2
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detection — read the operator's existing compose to find service names and
|
||||
# paths we need to override. Bail loudly if shape isn't recognised.
|
||||
@@ -436,6 +475,9 @@ init_migration() {
|
||||
echo " Network: $COMPOSE_NETWORK"
|
||||
echo ""
|
||||
|
||||
require_eula_acceptance
|
||||
NETBIRD_EULA_ACCEPTED_AT=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
|
||||
local proceed
|
||||
proceed=$(read_yes_no "Proceed with migration?" "y")
|
||||
if [[ "$proceed" != "yes" ]]; then
|
||||
@@ -529,6 +571,10 @@ apply_changes() {
|
||||
{
|
||||
echo ""
|
||||
echo "# Added by migrate-to-enterprise.sh on $(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
echo "# NetBird On-Premise EULA accepted at install time"
|
||||
echo "NETBIRD_EULA_ACCEPTED=yes"
|
||||
echo "NETBIRD_EULA_ACCEPTED_AT=${NETBIRD_EULA_ACCEPTED_AT}"
|
||||
echo "NETBIRD_EULA_URL=${NETBIRD_EULA_URL}"
|
||||
echo "NB_LICENSE_KEY=${NB_LICENSE_KEY}"
|
||||
if [[ -n "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
|
||||
echo "NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}"
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user