use API

Adds restart for MDM
NOP
2026-06-29 03:09:56 +00:00 · 2026-06-18 19:25:55 +02:00 · 2026-06-18 19:23:41 +02:00 · 2026-06-18 17:27:18 +02:00 · 2026-06-18 17:10:17 +02:00 · 2026-06-18 16:51:06 +02:00
301 changed files with 2570 additions and 43090 deletions
--- a/.github/workflows/check-license-dependencies.yml
+++ b/.github/workflows/check-license-dependencies.yml
@@ -20,7 +20,7 @@ jobs:

    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

@@ -59,12 +59,12 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Set up Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: true
--- a/.github/workflows/git-town.yml
+++ b/.github/workflows/git-town.yml
@@ -15,7 +15,7 @@ jobs:
      pull-requests: write

    steps:
-      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - uses: git-town/action@3d8b878379abb1ee393fb49865a28b4a6c2cd3b0 # v1.2.1
--- a/.github/workflows/golang-test-darwin.yml
+++ b/.github/workflows/golang-test-darwin.yml
@@ -16,12 +16,12 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -48,7 +48,7 @@ jobs:
        run: NETBIRD_STORE_ENGINE=${{ matrix.store }} CI=true go test -coverprofile=coverage.txt -tags=devcert -exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE' -timeout 5m -p 1 $(go list ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined)

      - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
+        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
--- a/.github/workflows/golang-test-freebsd.yml
+++ b/.github/workflows/golang-test-freebsd.yml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

@@ -28,7 +28,7 @@ jobs:
        id: test
        env:
          GO_VERSION: ${{ steps.goversion.outputs.version }}
-        uses: vmactions/freebsd-vm@b84ab5559b5a1bb4b8ee2737d2506a16e1737636 # v1.4.8
+        uses: vmactions/freebsd-vm@d1e65811565151536c0c894fff74f06351ed26e6 # v1.4.5
        with:
          usesh: true
          copyback: false
--- a/.github/workflows/golang-test-linux.yml
+++ b/.github/workflows/golang-test-linux.yml
@@ -18,7 +18,7 @@ jobs:
      management: ${{ steps.filter.outputs.management }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

@@ -30,7 +30,7 @@ jobs:
              - 'management/**'

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -119,12 +119,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -162,7 +162,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
+        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -175,12 +175,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -246,12 +246,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -290,7 +290,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
+        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -306,12 +306,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -347,7 +347,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
+        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -363,12 +363,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -407,7 +407,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
+        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -424,12 +424,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -484,7 +484,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
+        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -529,12 +529,12 @@ jobs:
            prom/prometheus

      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -579,11 +579,10 @@ jobs:
          CGO_ENABLED=1 GOARCH=${{ matrix.arch }} \
          NETBIRD_STORE_ENGINE=${{ matrix.store }} \
          CI=true \
+          GIT_BRANCH=${{ github.ref_name }} \
          go test -tags devcert -run=^$ -bench=. \
          -exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE,GIT_BRANCH,GITHUB_RUN_ID' \
          -timeout 20m ./management/... ./shared/management/... $(go list ./management/... ./shared/management/... | grep -v -e /management/server/http)
-        env:
-          GIT_BRANCH: ${{ github.ref_name }}

  api_benchmark:
    name: "Management / Benchmark (API)"
@@ -624,12 +623,12 @@ jobs:
            prom/prometheus

      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -674,13 +673,12 @@ jobs:
          CGO_ENABLED=1 GOARCH=${{ matrix.arch }} \
          NETBIRD_STORE_ENGINE=${{ matrix.store }} \
          CI=true \
+          GIT_BRANCH=${{ github.ref_name }} \
          go test -tags=benchmark \
            -run=^$ \
            -bench=. \
            -exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE,GIT_BRANCH,GITHUB_RUN_ID' \
            -timeout 20m ./management/server/http/...
-        env:
-          GIT_BRANCH: ${{ github.ref_name }}

  api_integration_test:
    name: "Management / Integration"
@@ -694,12 +692,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -736,7 +734,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
+        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
--- a/.github/workflows/golang-test-windows.yml
+++ b/.github/workflows/golang-test-windows.yml
@@ -18,12 +18,12 @@ jobs:
    runs-on: windows-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        id: go
        with:
          go-version-file: "go.mod"
--- a/.github/workflows/golangci-lint.yml
+++ b/.github/workflows/golangci-lint.yml
@@ -15,13 +15,13 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - name: codespell
        uses: codespell-project/actions-codespell@8f01853be192eb0f849a5c7d721450e7a467c579 # v2.2
        with:
-          ignore_words_list: erro,clienta,hastable,iif,groupd,testin,groupe,cros,ans,deriver,te,userA,ede,additionals,flate,recordin,unparseable
+          ignore_words_list: erro,clienta,hastable,iif,groupd,testin,groupe,cros,ans,deriver,te,userA,ede,additionals
          skip: go.mod,go.sum,**/proxy/web/**
  golangci:
    strategy:
@@ -40,7 +40,7 @@ jobs:
    timeout-minutes: 15
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - name: Check for duplicate constants
@@ -48,7 +48,7 @@ jobs:
        run: |
          ! awk '/const \(/,/)/{print $0}' management/server/activity/codes.go | grep -o '= [0-9]*' | sort | uniq -d | grep .
      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
--- a/.github/workflows/install-script-test.yml
+++ b/.github/workflows/install-script-test.yml
@@ -22,7 +22,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

--- a/.github/workflows/mobile-build-validation.yml
+++ b/.github/workflows/mobile-build-validation.yml
@@ -16,11 +16,11 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
      - name: Setup Android SDK
@@ -28,7 +28,7 @@ jobs:
        with:
          cmdline-tools-version: 8512546
      - name: Setup Java
-        uses: actions/setup-java@ad2b38190b15e4d6bdf0c97fb4fca8412226d287
+        uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654
        with:
          java-version: "11"
          distribution: "adopt"
@@ -54,11 +54,11 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
      - name: install gomobile
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -27,7 +27,7 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

@@ -64,7 +64,7 @@ jobs:
        if: steps.check_diff.outputs.diff_exists == 'true'
        env:
          GO_VERSION: ${{ steps.goversion.outputs.version }}
-        uses: vmactions/freebsd-vm@b84ab5559b5a1bb4b8ee2737d2506a16e1737636 # v1.4.8
+        uses: vmactions/freebsd-vm@d1e65811565151536c0c894fff74f06351ed26e6 # v1.4.5
        with:
          usesh: true
          copyback: false
@@ -135,7 +135,7 @@ jobs:
      ghcr_images: ${{ steps.tag_and_push_images.outputs.images_markdown }}
    steps:
      - name: Checkout
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0 # It is required for GoReleaser to work properly
          persist-credentials: false
@@ -166,7 +166,7 @@ jobs:
          fi

      - name: Set up Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -186,9 +186,9 @@ jobs:
      - name: check git status
        run: git --no-pager diff --exit-code
      - name: Set up QEMU
-        uses: docker/setup-qemu-action@06116385d9baf250c9f4dcb4858b16962ea869c3 #v4.1.0
+        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a #v4.0.0
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 #v4.1.0
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd #v4.0.0
      - name: Login to Docker hub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0
@@ -221,7 +221,7 @@ jobs:
        run: goversioninfo -arm -64 -icon client/ui/assets/netbird.ico -manifest client/manifest.xml -product-name ${{ env.PRODUCT_NAME }} -copyright "${{ env.COPYRIGHT }}" -ver-major ${{ steps.semver_parser.outputs.major }} -ver-minor ${{ steps.semver_parser.outputs.minor }} -ver-patch ${{ steps.semver_parser.outputs.patch }} -ver-build 0 -file-version ${{ steps.semver_parser.outputs.fullversion }}.0 -product-version ${{ steps.semver_parser.outputs.fullversion }}.0 -o client/resources_windows_arm64.syso
      - name: Run GoReleaser
        id: goreleaser
-        uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
+        uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
        with:
          version: ${{ env.GORELEASER_VER }}
          args: release --clean ${{ env.flags }}
@@ -347,7 +347,7 @@ jobs:
      release_ui_artifact_url: ${{ steps.upload_release_ui.outputs.artifact-url }}
    steps:
      - name: Checkout
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0 # It is required for GoReleaser to work properly
          persist-credentials: false
@@ -374,7 +374,7 @@ jobs:
          fi

      - name: Set up Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -420,7 +420,7 @@ jobs:
        run: goversioninfo -arm -64 -icon client/ui/assets/netbird.ico -manifest client/ui/manifest.xml -product-name ${{ env.PRODUCT_NAME }}-"UI" -copyright "${{ env.COPYRIGHT }}" -ver-major ${{ steps.semver_parser.outputs.major }} -ver-minor ${{ steps.semver_parser.outputs.minor }} -ver-patch ${{ steps.semver_parser.outputs.patch }} -ver-build 0 -file-version ${{ steps.semver_parser.outputs.fullversion }}.0 -product-version ${{ steps.semver_parser.outputs.fullversion }}.0 -o client/ui/resources_windows_arm64.syso

      - name: Run GoReleaser
-        uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
+        uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
        with:
          version: ${{ env.GORELEASER_VER }}
          args: release --config .goreleaser_ui.yaml --clean ${{ env.flags }}
@@ -464,12 +464,12 @@ jobs:
      - if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
        run: echo "flags=--snapshot" >> $GITHUB_ENV
      - name: Checkout
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0 # It is required for GoReleaser to work properly
          persist-credentials: false
      - name: Set up Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -488,7 +488,7 @@ jobs:
        run: git --no-pager diff --exit-code
      - name: Run GoReleaser
        id: goreleaser
-        uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
+        uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
        with:
          version: ${{ env.GORELEASER_VER }}
          args: release --config .goreleaser_ui_darwin.yaml --clean ${{ env.flags }}
@@ -522,7 +522,7 @@ jobs:
      downloadPath: '${{ github.workspace }}\temp'
    steps:
      - name: Checkout
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

@@ -534,13 +534,13 @@ jobs:
        run: echo "C:\Program Files\7-Zip" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append

      - name: Download release artifacts
-        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.1
        with:
          name: release
          path: release

      - name: Download UI release artifacts
-        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.1
        with:
          name: release-ui
          path: release-ui
--- a/.github/workflows/test-infrastructure-files.yml
+++ b/.github/workflows/test-infrastructure-files.yml
@@ -68,12 +68,12 @@ jobs:
        run: sudo apt-get install -y curl

      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"

@@ -207,7 +207,7 @@ jobs:
      - name: Build management docker image
        working-directory: management
        run: |
-          docker build -t netbirdio/management:latest --build-arg TARGETPLATFORM=. .
+          docker build -t netbirdio/management:latest .

      - name: Build signal binary
        working-directory: signal
@@ -216,7 +216,7 @@ jobs:
      - name: Build signal docker image
        working-directory: signal
        run: |
-          docker build -t netbirdio/signal:latest --build-arg TARGETPLATFORM=. .
+          docker build -t netbirdio/signal:latest .

      - name: Build relay binary
        working-directory: relay
@@ -225,7 +225,7 @@ jobs:
      - name: Build relay docker image
        working-directory: relay
        run: |
-          docker build -t netbirdio/relay:latest --build-arg TARGETPLATFORM=. .
+          docker build -t netbirdio/relay:latest .

      - name: run docker compose up
        working-directory: infrastructure_files/artifacts
@@ -256,7 +256,7 @@ jobs:
        run: sudo apt-get install -y jq

      - name: Checkout code
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

--- a/.github/workflows/wasm-build-validation.yml
+++ b/.github/workflows/wasm-build-validation.yml
@@ -19,11 +19,11 @@ jobs:
      GOARCH: wasm
    steps:
      - name: Checkout repository
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
      - name: Install dependencies
@@ -44,11 +44,11 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false
      - name: Install Go
-        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
        with:
          go-version-file: "go.mod"
      - name: Build Wasm client
--- a/.goreleaser.yaml
+++ b/.goreleaser.yaml
@@ -247,7 +247,7 @@ dockers_v2:
       - netbirdio/netbird
       - ghcr.io/netbirdio/netbird
     tags:
-       - "{{ .Version }}"
+       - "v{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: client/Dockerfile
     extra_files:
@@ -295,7 +295,7 @@ dockers_v2:
       - netbirdio/relay
       - ghcr.io/netbirdio/relay
     tags:
-       - "{{ .Version }}"
+       - "v{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: relay/Dockerfile
     platforms:
@@ -317,7 +317,7 @@ dockers_v2:
       - netbirdio/signal
       - ghcr.io/netbirdio/signal
     tags:
-       - "{{ .Version }}"
+       - "v{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: signal/Dockerfile
     platforms:
@@ -339,7 +339,7 @@ dockers_v2:
       - netbirdio/management
       - ghcr.io/netbirdio/management
     tags:
-       - "{{ .Version }}"
+       - "v{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: management/Dockerfile
     platforms:
@@ -361,7 +361,7 @@ dockers_v2:
       - netbirdio/upload
       - ghcr.io/netbirdio/upload
     tags:
-       - "{{ .Version }}"
+       - "v{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: upload-server/Dockerfile
     platforms:
@@ -383,7 +383,7 @@ dockers_v2:
       - netbirdio/netbird-server
       - ghcr.io/netbirdio/netbird-server
     tags:
-       - "{{ .Version }}"
+       - "v{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: combined/Dockerfile
     platforms:
@@ -405,7 +405,7 @@ dockers_v2:
       - netbirdio/reverse-proxy
       - ghcr.io/netbirdio/reverse-proxy
     tags:
-       - "{{ .Version }}"
+       - "v{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: proxy/Dockerfile
     platforms:
@@ -462,13 +462,9 @@ checksum:
    - glob: ./infrastructure_files/getting-started-with-zitadel.sh
    - glob: ./release_files/install.sh
    - glob: ./infrastructure_files/getting-started.sh
-    - glob: ./infrastructure_files/getting-started-enterprise.sh
-    - glob: ./infrastructure_files/migrate-to-enterprise.sh

 release:
  extra_files:
    - glob: ./infrastructure_files/getting-started-with-zitadel.sh
    - glob: ./release_files/install.sh
    - glob: ./infrastructure_files/getting-started.sh
-    - glob: ./infrastructure_files/getting-started-enterprise.sh
-    - glob: ./infrastructure_files/migrate-to-enterprise.sh
--- a/README.md
+++ b/README.md
@@ -37,11 +37,6 @@
  </strong>
 </p>

-> ### 🤖 NetBird Agent Network (Beta)
-> Identity-aware access control for AI agents — keyless access to LLM APIs and private
-> resources over the encrypted NetBird tunnel. See [`agent-network/`](agent-network/) or
-> read the docs at **[netbird.ai](https://netbird.ai)**.
-
 **NetBird combines a configuration-free peer-to-peer private network and a centralized access control system in a single platform, making it easy to create secure private networks for your organization or home.**

 **Connect.** NetBird creates a WireGuard-based overlay network that automatically connects your machines over an encrypted tunnel, leaving behind the hassle of opening ports, complex firewall rules, VPN gateways, and so forth.
--- a/agent-network/README.md
+++ b/agent-network/README.md
@@ -1,39 +0,0 @@
-# NetBird Agent Network
-
-Agent Network is NetBird's access control layer for AI agents and the people who run
-them. It gives every agent a real identity, tied to your identity provider (IdP), and
-governs what it can reach — the LLM APIs and AI gateways it can call, and the internal
-resources it can access. Traffic flows only over the encrypted NetBird tunnel, scoped by
-policy, with no API keys to leak.
-
-> **Beta.** Agent Network is open source and can be self-hosted on your own
-> infrastructure.
-
-## How it works
-
-Agent Network is built on two existing NetBird capabilities:
-
- **Overlay network** — the encrypted WireGuard mesh between peers.
- **Reverse proxy** — a NetBird peer that terminates LLM requests, establishes the
-  caller's identity, evaluates policies/limits/guardrails, injects the upstream provider
-  key server-side, forwards to the API or gateway, and records usage.
-
-LLM traffic is routed through the proxy's identity-aware pipeline, while internal
-resources (databases, internal APIs, self-hosted models) are reached directly over
-peer-to-peer WireGuard tunnels, governed by the same identities and access policies.
-
-## Where the code lives
-
-There is no separate "agent-network" service — it reuses the reverse-proxy and management
-components:
-
- [`proxy/`](../proxy) — the NetBird reverse proxy that serves the agent network endpoint
-  and runs the per-request middleware pipeline.
- [`management/internals/modules/reverseproxy/`](../management/internals/modules/reverseproxy)
-  — the management-side control plane: providers, policies, guardrails, limits, routing,
-  and usage/access logs.
-
-## Documentation
-
-Full documentation, architecture, and quickstart:
-**https://docs.netbird.io/agent-network**
--- a/client/android/client.go
+++ b/client/android/client.go
@@ -151,9 +151,9 @@ func (c *Client) Run(platformFiles PlatformFiles, urlOpener URLOpener, isAndroid

 	// todo do not throw error in case of cancelled context
 	ctx = internal.CtxInitState(ctx)
-	connectClient := internal.NewConnectClient(ctx, cfg, c.recorder)
+	connectClient := internal.NewConnectClient(ctx, c.recorder)
 	c.setState(cfg, cacheDir, connectClient)
-	return connectClient.RunOnAndroid(c.tunAdapter, c.iFaceDiscover, c.networkChangeListener, slices.Clone(dns.items), dnsReadyListener, stateFile, cacheDir)
+	return connectClient.RunOnAndroid(cfg, c.tunAdapter, c.iFaceDiscover, c.networkChangeListener, slices.Clone(dns.items), dnsReadyListener, stateFile, cacheDir)
 }

 // RunWithoutLogin we apply this type of run function when the backed has been started without UI (i.e. after reboot).
@@ -186,9 +186,9 @@ func (c *Client) RunWithoutLogin(platformFiles PlatformFiles, dns *DNSList, dnsR

 	// todo do not throw error in case of cancelled context
 	ctx = internal.CtxInitState(ctx)
-	connectClient := internal.NewConnectClient(ctx, cfg, c.recorder)
+	connectClient := internal.NewConnectClient(ctx, c.recorder)
 	c.setState(cfg, cacheDir, connectClient)
-	return connectClient.RunOnAndroid(c.tunAdapter, c.iFaceDiscover, c.networkChangeListener, slices.Clone(dns.items), dnsReadyListener, stateFile, cacheDir)
+	return connectClient.RunOnAndroid(cfg, c.tunAdapter, c.iFaceDiscover, c.networkChangeListener, slices.Clone(dns.items), dnsReadyListener, stateFile, cacheDir)
 }

 // Stop the internal client and free the resources
--- a/client/cmd/debug.go
+++ b/client/cmd/debug.go
@@ -130,7 +130,7 @@ func debugConfigDump(cmd *cobra.Command, _ []string) error {

 	client := proto.NewDaemonServiceClient(conn)
 	resp, err := client.GetConfig(cmd.Context(), &proto.GetConfigRequest{
-		ProfileName: string(activeProf.ID),
+		ProfileName: activeProf.Name,
 		Username:    currUser.Username,
 	})
 	if err != nil {
--- a/client/cmd/login.go
+++ b/client/cmd/login.go
@@ -227,7 +227,7 @@ func switchProfile(ctx context.Context, handle string, username string) (profile
 		Username:    &username,
 	})
 	if err != nil {
-		return "", fmt.Errorf("switch profile failed: %w", err)
+		return "", fmt.Errorf("switch profile failed: %v", err)
 	}

 	return profilemanager.ID(resp.Id), nil
--- a/client/cmd/profile.go
+++ b/client/cmd/profile.go
@@ -138,23 +138,26 @@ func addProfileFunc(cmd *cobra.Command, args []string) error {
 		return err
 	}

-	currUser, err := user.Current()
-	if err != nil {
-		return fmt.Errorf("get current user: %w", err)
-	}
-
 	conn, err := DialClientGRPCServer(cmd.Context(), daemonAddr)
 	if err != nil {
 		return fmt.Errorf("connect to service CLI interface: %w", err)
 	}
 	defer conn.Close()

+	currUser, err := user.Current()
+	if err != nil {
+		return fmt.Errorf("get current user: %w", err)
+	}
+
 	daemonClient := proto.NewDaemonServiceClient(conn)
 	profileName := args[0]

-	id, err := addProfileOnDaemon(cmd.Context(), daemonClient, profileName, currUser.Username)
+	resp, err := daemonClient.AddProfile(cmd.Context(), &proto.AddProfileRequest{
+		ProfileName: profileName,
+		Username:    currUser.Username,
+	})
 	if err != nil {
-		return err
+		return fmt.Errorf("add profile request: %w", err)
 	}

 	dupCount, _ := countProfilesWithName(cmd.Context(), daemonClient, currUser.Username, profileName)
@@ -163,6 +166,7 @@ func addProfileFunc(cmd *cobra.Command, args []string) error {
 		cmd.Println("Use `netbird profile list --show-id` to disambiguate later.")
 	}

+	id := profilemanager.ID(resp.Id)
 	cmd.Printf("Profile added: %s  %s\n", id.ShortID(), profilemanager.StripCtrlChars(profileName))
 	return nil

@@ -326,19 +330,3 @@ func wrapAmbiguityError(err error, handle string) error {
 	}
 	return err
 }
-
-// addProfileOnDaemon issues the AddProfile RPC on an existing daemon client
-// and returns the new profile's ID. It is the single entry point for profile
-// creation, shared by `netbird profile add` and the `netbird up --profile
-// <name>` auto-create path.
-func addProfileOnDaemon(ctx context.Context, client proto.DaemonServiceClient, profileName, username string) (profilemanager.ID, error) {
-	resp, err := client.AddProfile(ctx, &proto.AddProfileRequest{
-		ProfileName: profileName,
-		Username:    username,
-	})
-	if err != nil {
-		return "", fmt.Errorf("add profile failed: %w", err)
-	}
-
-	return profilemanager.ID(resp.Id), nil
-}
--- a/client/cmd/root.go
+++ b/client/cmd/root.go
@@ -20,6 +20,7 @@ import (
 	"github.com/spf13/cobra"
 	"github.com/spf13/pflag"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/connectivity"
 	"google.golang.org/grpc/credentials/insecure"

 	daddr "github.com/netbirdio/netbird/client/internal/daemonaddr"
@@ -261,17 +262,46 @@ func FlagNameToEnvVar(cmdFlag string, prefix string) string {
 	return prefix + upper
 }

-// DialClientGRPCServer returns client connection to the daemon server.
+// DialClientGRPCServer returns client connection to the daemon server. It waits
+// (up to the timeout) for the daemon to become reachable so an `up` issued right
+// after `service start` tolerates the startup race. Instead of grpc's blocking
+// dial — whose raw "transport failed" retry warnings are silenced by the logger
+// config — we drive the wait ourselves and emit one clean line per failed attempt.
 func DialClientGRPCServer(ctx context.Context, addr string) (*grpc.ClientConn, error) {
 	ctx, cancel := context.WithTimeout(ctx, time.Second*10)
 	defer cancel()

-	return grpc.DialContext(
+	conn, err := grpc.DialContext(
 		ctx,
 		strings.TrimPrefix(addr, "tcp://"),
 		grpc.WithTransportCredentials(insecure.NewCredentials()),
-		grpc.WithBlock(),
 	)
+	if err != nil {
+		return nil, err
+	}
+
+	conn.Connect()
+	for {
+		state := conn.GetState()
+		if state == connectivity.Ready {
+			return conn, nil
+		}
+		// Log only once the connection has actually failed — not during the
+		// brief Idle/Connecting phase on a healthy daemon (avoids a spurious
+		// line + wait when the daemon is already up).
+		if state == connectivity.TransientFailure {
+			log.Infof("waiting for the netbird daemon to become available at %s...", addr)
+		}
+		// Wake on the next state change, but at least every second so a stuck
+		// TransientFailure re-logs at a steady cadence until the timeout.
+		waitCtx, waitCancel := context.WithTimeout(ctx, time.Second)
+		conn.WaitForStateChange(waitCtx, state)
+		waitCancel()
+		if ctx.Err() != nil {
+			_ = conn.Close()
+			return nil, fmt.Errorf("daemon not reachable at %s: %w", addr, ctx.Err())
+		}
+	}
 }

 // WithBackOff execute function in backoff cycle.
--- a/client/cmd/status.go
+++ b/client/cmd/status.go
@@ -11,6 +11,7 @@ import (
 	"google.golang.org/grpc/status"

 	"github.com/netbirdio/netbird/client/internal"
+	"github.com/netbirdio/netbird/client/internal/profilemanager"
 	"github.com/netbirdio/netbird/client/proto"
 	nbstatus "github.com/netbirdio/netbird/client/status"
 	"github.com/netbirdio/netbird/util"
@@ -110,10 +111,11 @@ func statusFunc(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	// Resolve the active profile's display name via the daemon, which runs
-	// as root and can read the per-user profile files. The local profile
-	// manager only knows the active profile ID, not its display name.
-	profName := getActiveProfileName(ctx)
+	pm := profilemanager.NewProfileManager()
+	var profName string
+	if activeProf, err := pm.GetActiveProfile(); err == nil {
+		profName = activeProf.Name
+	}

 	var outputInformationHolder = nbstatus.ConvertToStatusOutputOverview(resp.GetFullStatus(), nbstatus.ConvertOptions{
 		Anonymize:            anonymizeFlag,
@@ -165,25 +167,6 @@ func getStatus(ctx context.Context, fullPeerStatus bool, shouldRunProbes bool) (
 	return resp, nil
 }

-// getActiveProfileName asks the daemon for the active profile's display
-// name. The daemon runs as root and can read the per-user profile files to
-// resolve the ID to its human-readable name. Returns an empty string on any
-// error so status output degrades gracefully.
-func getActiveProfileName(ctx context.Context) string {
-	conn, err := DialClientGRPCServer(ctx, daemonAddr)
-	if err != nil {
-		return ""
-	}
-	defer conn.Close()
-
-	resp, err := proto.NewDaemonServiceClient(conn).GetActiveProfile(ctx, &proto.GetActiveProfileRequest{})
-	if err != nil {
-		return ""
-	}
-
-	return resp.GetProfileName()
-}
-
 func parseFilters() error {
 	switch strings.ToLower(statusFilter) {
 	case "", "idle", "connecting", "connected":
--- a/client/cmd/up.go
+++ b/client/cmd/up.go
@@ -128,9 +128,15 @@ func upFunc(cmd *cobra.Command, args []string) error {
 	var profileSwitched bool
 	// switch profile if provided
 	if profileName != "" {
-		if err := switchOrCreateProfile(cmd.Context(), pm, profileName, username.Username); err != nil {
+		resolvedID, err := switchProfile(cmd.Context(), profileName, username.Username)
+		if err != nil {
 			return fmt.Errorf("switch profile: %v", err)
 		}
+
+		if err := pm.SwitchProfile(resolvedID); err != nil {
+			return fmt.Errorf("switch profile: %v", err)
+		}
+
 		profileSwitched = true
 	}

@@ -145,52 +151,6 @@ func upFunc(cmd *cobra.Command, args []string) error {
 	return runInDaemonMode(ctx, cmd, pm, activeProf, profileSwitched)
 }

-// switchOrCreateProfile switches the active profile to the one identified by
-// handle, creating it first when it does not exist yet. This restores the
-// pre-0.73 behaviour where `netbird up --profile <name>` auto-creates a
-// missing profile instead of failing.
-func switchOrCreateProfile(ctx context.Context, pm *profilemanager.ProfileManager, handle, username string) error {
-	resolvedID, err := switchProfile(ctx, handle, username)
-	if err != nil {
-		st, ok := gstatus.FromError(err)
-		if !ok || st.Code() != codes.NotFound {
-			return err
-		}
-		// Don't fail immediately on a create error: a concurrent run may
-		// have created the profile between the NotFound above and this
-		// call, in which case the retried switch still succeeds. Only
-		// surface the create error if the switch also fails.
-		_, createErr := createProfile(ctx, handle, username)
-		if resolvedID, err = switchProfile(ctx, handle, username); err != nil {
-			if createErr != nil {
-				return fmt.Errorf("create profile: %w", createErr)
-			}
-			return err
-		}
-	}
-
-	if err := pm.SwitchProfile(resolvedID); err != nil {
-		return err
-	}
-	return nil
-}
-
-// createProfile dials the daemon and creates a new profile with the given
-// display name, returning its generated ID. Use addProfileOnDaemon directly
-// when a daemon client is already available to reuse the connection.
-func createProfile(ctx context.Context, profileName, username string) (profilemanager.ID, error) {
-	conn, err := DialClientGRPCServer(ctx, daemonAddr)
-	if err != nil {
-		//nolint
-		return "", fmt.Errorf("failed to connect to daemon error: %v\n"+
-			"If the daemon is not running please run: "+
-			"\nnetbird service install \nnetbird service start\n", err)
-	}
-	defer conn.Close()
-
-	return addProfileOnDaemon(ctx, proto.NewDaemonServiceClient(conn), profileName, username)
-}
-
 func runInForegroundMode(ctx context.Context, cmd *cobra.Command, activeProf *profilemanager.Profile) error {
 	// override the default profile filepath if provided
 	if configPath != "" {
@@ -241,10 +201,10 @@ func runInForegroundMode(ctx context.Context, cmd *cobra.Command, activeProf *pr
 	r := peer.NewRecorder(config.ManagementURL.String())
 	r.GetFullStatus()

-	connectClient := internal.NewConnectClient(ctx, config, r)
+	connectClient := internal.NewConnectClient(ctx, r)
 	SetupDebugHandler(ctx, config, r, connectClient, "")

-	return connectClient.Run(nil, util.FindFirstLogPath(logFiles))
+	return connectClient.Run(config, nil, util.FindFirstLogPath(logFiles))
 }

 func runInDaemonMode(ctx context.Context, cmd *cobra.Command, pm *profilemanager.ProfileManager, activeProf *profilemanager.Profile, profileSwitched bool) error {
--- a/client/embed/embed.go
+++ b/client/embed/embed.go
@@ -264,34 +264,24 @@ func (c *Client) Start(startCtx context.Context) error {
 	if err, _ := authClient.Login(ctx, c.setupKey, c.jwtToken); err != nil {
 		return fmt.Errorf("login: %w", err)
 	}
-	client := internal.NewConnectClient(ctx, c.config, c.recorder)
+	client := internal.NewConnectClient(ctx, c.recorder)
 	client.SetSyncResponsePersistence(true)

-	// either startup error (permanent backoff err) or nil err (successful engine up)
+	// The supervisor owns the run; we wait until it is established, ends with a
+	// startup error (permanent backoff err), or startCtx expires.
 	// TODO: make after-startup backoff err available
-	run := make(chan struct{})
-	clientErr := make(chan error, 1)
-	go func() {
-		if err := client.Run(run, ""); err != nil {
-			clientErr <- err
-		}
-	}()
+	client.RunAsync(c.config, nil)

-	select {
-	case <-startCtx.Done():
-		// ConnectClient.Stop now cancels its own run context and waits for the
-		// run loop to tear the engine down, so this cancel() is no longer
-		// required to break the deadlock and could be removed. It is kept as a
-		// defensive belt-and-suspenders: cancelling the parent context first
-		// guarantees the run loop is unblocked even if Stop's contract regresses.
+	if err := client.WaitEstablishedOrDone(startCtx); err != nil {
+		// Either startCtx expired while connecting, or the run ended before it
+		// established. Cancel the client context before stopping: Engine.Start
+		// blocks on the signal stream while holding the engine mutex and only
+		// unblocks on cancellation. Stopping first would deadlock on that mutex.
 		cancel()
 		if stopErr := client.Stop(); stopErr != nil {
-			return fmt.Errorf("stop error after context done. Stop error: %w. Context done: %w", stopErr, startCtx.Err())
+			return fmt.Errorf("stop error after startup failure. Stop error: %w. Startup: %w", stopErr, err)
 		}
-		return startCtx.Err()
-	case err := <-clientErr:
 		return fmt.Errorf("startup: %w", err)
-	case <-run:
 	}

 	c.connect = client
--- a/client/internal/connect.go
+++ b/client/internal/connect.go
@@ -11,7 +11,6 @@ import (
 	"runtime/debug"
 	"strings"
 	"sync"
-	"sync/atomic"
 	"time"

 	"github.com/cenkalti/backoff/v4"
@@ -19,6 +18,7 @@ import (

 	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
 	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/metadata"
 	gstatus "google.golang.org/grpc/status"

 	"github.com/netbirdio/netbird/client/iface/wgaddr"
@@ -49,17 +49,23 @@ import (
 	"github.com/netbirdio/netbird/version"
 )

-// androidRunOverride is set on Android to inject mobile dependencies
-// when using embed.Client (which calls Run() with empty MobileDependency).
-var androidRunOverride func(c *ConnectClient, runningChan chan struct{}, logPath string) error
+// androidMobileDep is set on Android to inject the MobileDependency for runs
+// started through the generic entry points (Run/RunAsync, e.g. embed.Client).
+// nil on other platforms, where the dependency is empty.
+var androidMobileDep func(config *profilemanager.Config) MobileDependency
+
+// mobileDependency returns the MobileDependency for a run started via the
+// generic entry points. On Android the androidMobileDep provider supplies
+// platform stubs (or real implementations); elsewhere it is empty.
+func (c *ConnectClient) mobileDependency(config *profilemanager.Config) MobileDependency {
+	if androidMobileDep != nil {
+		return androidMobileDep(config)
+	}
+	return MobileDependency{}
+}

 type ConnectClient struct {
 	ctx            context.Context
-	runCancel      context.CancelFunc
-	runExited      chan struct{}
-	runOnce        sync.Once
-	runStarted     atomic.Bool
-	config         *profilemanager.Config
 	statusRecorder *peer.Status

 	engine        *Engine
@@ -68,41 +74,62 @@ type ConnectClient struct {
 	updateManager *updater.Manager

 	persistSyncResponse bool
+
+	// sup serializes all start/stop requests so two lifecycle operations can
+	// never overlap. See connect_lifecycle.go.
+	sup *supervisor
 }

 func NewConnectClient(
 	ctx context.Context,
-	config *profilemanager.Config,
 	statusRecorder *peer.Status,
 ) *ConnectClient {
-	// Derive the run context here so Stop owns the cancel that unblocks the run
-	// loop. runCancel is set once at construction, so Stop can call it without
-	// racing the run loop's startup. Callers therefore need not cancel before Stop.
-	runCtx, runCancel := context.WithCancel(ctx)
-	return &ConnectClient{
-		ctx:            runCtx,
-		runCancel:      runCancel,
-		runExited:      make(chan struct{}),
-		config:         config,
+	c := &ConnectClient{
+		ctx:            ctx,
 		statusRecorder: statusRecorder,
 		engineMutex:    sync.Mutex{},
 	}
+	c.sup = newSupervisor(ctx, c.run)
+	return c
 }

 func (c *ConnectClient) SetUpdateManager(um *updater.Manager) {
 	c.updateManager = um
 }

-// Run with main logic.
-func (c *ConnectClient) Run(runningChan chan struct{}, logPath string) error {
-	if androidRunOverride != nil {
-		return androidRunOverride(c, runningChan, logPath)
-	}
-	return c.run(MobileDependency{}, runningChan, logPath)
+// Run with main logic. md carries optional gRPC metadata (e.g. the UI
+// user-agent) to forward to the management/signal services; nil when none.
+func (c *ConnectClient) Run(config *profilemanager.Config, md metadata.MD, logPath string) error {
+	return c.sup.start(config, md, c.mobileDependency(config), logPath)
+}
+
+// RunAsync starts a client run without blocking. Used by the daemon and embed,
+// which drive the lifecycle through the supervisor rather than blocking on Run;
+// they then wait for the outcome via WaitEstablishedOrDone. The run's lifecycle
+// channels are created and owned by the supervisor — callers never hold them.
+func (c *ConnectClient) RunAsync(config *profilemanager.Config, md metadata.MD) {
+	c.sup.startAsync(config, md, c.mobileDependency(config), "", nil)
+}
+
+// Restart atomically stops any in-flight run and starts a fresh one with the
+// given config. The stop+start happens as a single supervisor operation, so no
+// other lifecycle request can interleave between them — used for explicit
+// restarts (e.g. an MDM policy change) that must not expose a "stopped" window.
+func (c *ConnectClient) Restart(config *profilemanager.Config, md metadata.MD) {
+	c.sup.restartAsync(config, md, c.mobileDependency(config), "")
+}
+
+// WaitEstablishedOrDone blocks until the in-flight run becomes established (nil),
+// ends before that (the run error, or a sentinel on a clean stop), or ctx is
+// cancelled. Returns errNoRunInFlight if no run is in flight. Wraps the wait on
+// the supervisor-owned channels so callers never touch them directly.
+func (c *ConnectClient) WaitEstablishedOrDone(ctx context.Context) error {
+	return c.sup.waitEstablishedOrDone(ctx)
 }

 // RunOnAndroid with main logic on mobile system
 func (c *ConnectClient) RunOnAndroid(
+	config *profilemanager.Config,
 	tunAdapter device.TunAdapter,
 	iFaceDiscover stdnet.ExternalIFaceDiscover,
 	networkChangeListener listener.NetworkChangeListener,
@@ -121,10 +148,11 @@ func (c *ConnectClient) RunOnAndroid(
 		StateFilePath:         stateFilePath,
 		TempDir:               cacheDir,
 	}
-	return c.run(mobileDependency, nil, "")
+	return c.sup.start(config, nil, mobileDependency, "")
 }

 func (c *ConnectClient) RunOniOS(
+	config *profilemanager.Config,
 	fileDescriptor int32,
 	networkChangeListener listener.NetworkChangeListener,
 	dnsManager dns.IosDnsManager,
@@ -142,15 +170,12 @@ func (c *ConnectClient) RunOniOS(
 		StateFilePath:         stateFilePath,
 		TempDir:               cacheDir,
 	}
-	return c.run(mobileDependency, nil, logFilePath)
+	return c.sup.start(config, nil, mobileDependency, logFilePath)
 }

-func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan struct{}, logPath string) error {
-	// Mark the loop as started and signal exit on return so Stop can wait for
-	// the loop to finish (and skip the wait if the loop never ran).
-	c.runStarted.Store(true)
-	defer c.runOnce.Do(func() { close(c.runExited) })
-
+// run executes a single client run. runCtx is owned by the supervisor: cancelling
+// it tears the run down (it is the parent of the per-attempt engine context).
+func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Config, mobileDependency MobileDependency, connEstablishedChan chan struct{}, logPath string) error {
 	defer func() {
 		if r := recover(); r != nil {
 			rec := c.statusRecorder
@@ -214,18 +239,18 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 	}()

 	wrapErr := state.Wrap
-	myPrivateKey, err := wgtypes.ParseKey(c.config.PrivateKey)
+	myPrivateKey, err := wgtypes.ParseKey(config.PrivateKey)
 	if err != nil {
-		log.Errorf("failed parsing Wireguard key %s: [%s]", c.config.PrivateKey, err.Error())
+		log.Errorf("failed parsing Wireguard key %s: [%s]", config.PrivateKey, err.Error())
 		return wrapErr(err)
 	}

 	var mgmTlsEnabled bool
-	if c.config.ManagementURL.Scheme == "https" {
+	if config.ManagementURL.Scheme == "https" {
 		mgmTlsEnabled = true
 	}

-	publicSSHKey, err := ssh.GeneratePublicKey([]byte(c.config.SSHKey))
+	publicSSHKey, err := ssh.GeneratePublicKey([]byte(config.SSHKey))
 	if err != nil {
 		return err
 	}
@@ -259,13 +284,13 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 	defer c.statusRecorder.ClientStop()
 	operation := func() error {
 		// if context cancelled we not start new backoff cycle
-		if c.ctx.Err() != nil {
+		if runCtx.Err() != nil {
 			return nil
 		}

 		state.Set(StatusConnecting)

-		engineCtx, cancel := context.WithCancel(c.ctx)
+		engineCtx, cancel := context.WithCancel(runCtx)
 		defer func() {
 			_, err := state.Status()
 			c.statusRecorder.MarkManagementDisconnected(err)
@@ -273,8 +298,8 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 			cancel()
 		}()

-		log.Debugf("connecting to the Management service %s", c.config.ManagementURL.Host)
-		mgmClient, err := mgm.NewClient(engineCtx, c.config.ManagementURL.Host, myPrivateKey, mgmTlsEnabled)
+		log.Debugf("connecting to the Management service %s", config.ManagementURL.Host)
+		mgmClient, err := mgm.NewClient(engineCtx, config.ManagementURL.Host, myPrivateKey, mgmTlsEnabled)
 		if err != nil {
 			return wrapErr(gstatus.Errorf(codes.FailedPrecondition, "failed connecting to Management Service : %s", err))
 		}
@@ -291,7 +316,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 		}
 		c.clientMetrics.UpdateAgentInfo(agentInfo, myPrivateKey.PublicKey().String())

-		log.Debugf("connected to the Management service %s", c.config.ManagementURL.Host)
+		log.Debugf("connected to the Management service %s", config.ManagementURL.Host)
 		defer func() {
 			if err = mgmClient.Close(); err != nil {
 				log.Warnf("failed to close the Management service client %v", err)
@@ -300,13 +325,14 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan

 		// connect (just a connection, no stream yet) and login to Management Service to get an initial global Netbird config
 		loginStarted := time.Now()
-		loginResp, err := loginToManagement(engineCtx, mgmClient, publicSSHKey, c.config)
+		loginResp, err := loginToManagement(engineCtx, mgmClient, publicSSHKey, config)
 		if err != nil {
 			c.clientMetrics.RecordLoginDuration(engineCtx, time.Since(loginStarted), false)
 			log.Debug(err)
 			if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
 				state.Set(StatusNeedsLogin)
-				c.runCancel()
+				// No teardown needed: login fails before the engine is started
+				// (engine.Start is below), so there is nothing running to stop.
 				return backoff.Permanent(wrapErr(err)) // unrecoverable error
 			}
 			return wrapErr(err)
@@ -360,7 +386,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 		}
 		peerConfig := loginResp.GetPeerConfig()

-		engineConfig, err := createEngineConfig(myPrivateKey, c.config, peerConfig, logPath)
+		engineConfig, err := createEngineConfig(myPrivateKey, config, peerConfig, logPath)
 		if err != nil {
 			log.Error(err)
 			return wrapErr(err)
@@ -404,7 +430,7 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 		c.engine = engine
 		c.engineMutex.Unlock()

-		if err := engine.Start(loginResp.GetNetbirdConfig(), c.config.ManagementURL); err != nil {
+		if err := engine.Start(loginResp.GetNetbirdConfig(), config.ManagementURL); err != nil {
 			log.Errorf("error while starting Netbird Connection Engine: %s", err)
 			return wrapErr(err)
 		}
@@ -412,12 +438,13 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 		log.Infof("Netbird engine started, the IP is: %s", peerConfig.GetAddress())
 		state.Set(StatusConnected)

-		if runningChan != nil {
-			select {
-			case <-runningChan:
-			default:
-				close(runningChan)
-			}
+		// The supervisor owns connEstablishedChan and it is always present. Guard
+		// against a double close: operation re-runs on ErrResetConnection retries
+		// within the same run, and the channel is closed only on the first connect.
+		select {
+		case <-connEstablishedChan:
+		default:
+			close(connEstablishedChan)
 		}

 		<-engineCtx.Done()
@@ -426,8 +453,10 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 		c.engine = nil
 		c.engineMutex.Unlock()

-		log.Infof("ensuring wg interface is removed, Netbird engine context cancelled")
-
+		// Always tear the engine down once its context is cancelled. engine.Stop
+		// is nil-guarded per component, so calling it unconditionally is safe and
+		// avoids both the data race on engine.wgInterface and skipping teardown
+		// when the interface was never brought up (e.g. a mid-start failure).
 		if err := engine.Stop(); err != nil {
 			log.Errorf("Failed to stop engine: %v", err)
 		}
@@ -445,12 +474,13 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 	}

 	c.statusRecorder.ClientStart()
-	err = backoff.Retry(operation, backoff.WithContext(backOff, c.ctx))
+	err = backoff.Retry(operation, backOff)
 	if err != nil {
 		log.Debugf("exiting client retry loop due to unrecoverable error: %s", err)
 		if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
+			// Login failed permanently: the engine was never started, so there
+			// is nothing to tear down — just record that a login is needed.
 			state.Set(StatusNeedsLogin)
-			c.runCancel()
 		}
 		return err
 	}
@@ -471,6 +501,22 @@ func parseRelayInfo(loginResp *mgmProto.LoginResponse) ([]string, *hmac.Token) {
 	return relayCfg.GetUrls(), token
 }

+// ConnectionRunning reports whether a connection run is currently in flight
+// (connecting, connected, or reconnecting). Answered by the supervisor via a
+// serialized query, so it settles behind an in-flight stop. Distinct from
+// ServiceRunning, which reports whether the service itself is alive.
+func (c *ConnectClient) ConnectionRunning() bool {
+	return c.sup.isRunning()
+}
+
+// ServiceRunning reports whether the client's lifecycle supervisor is alive and
+// able to accept start/stop commands — i.e. its context has not been cancelled
+// (the daemon is not shutting down). Independent of whether a connection run is
+// up (that is ConnectionRunning).
+func (c *ConnectClient) ServiceRunning() bool {
+	return c.sup.ctx.Err() == nil
+}
+
 func (c *ConnectClient) Engine() *Engine {
 	if c == nil {
 		return nil
@@ -527,12 +573,10 @@ func (c *ConnectClient) Status() StatusType {
 	return status
 }

+// Stop serializes a stop request through the lifecycle supervisor and blocks
+// until the in-flight run is fully torn down.
 func (c *ConnectClient) Stop() error {
-	c.runCancel()
-	if c.runStarted.Load() {
-		<-c.runExited
-	}
-	return nil
+	return c.sup.stop()
 }

 // SetSyncResponsePersistence enables or disables sync response persistence.
--- a/client/internal/connect_android_default.go
+++ b/client/internal/connect_android_default.go
@@ -7,6 +7,7 @@ import (

 	"github.com/netbirdio/netbird/client/internal/dns"
 	"github.com/netbirdio/netbird/client/internal/listener"
+	"github.com/netbirdio/netbird/client/internal/profilemanager"
 	"github.com/netbirdio/netbird/client/internal/stdnet"
 )

@@ -59,19 +60,17 @@ var _ listener.NetworkChangeListener = noopNetworkChangeListener{}
 var _ dns.ReadyListener = noopDnsReadyListener{}

 func init() {
-	// Wire up the default override so embed.Client.Start() works on Android
-	// with netstack mode. Provides complete no-op stubs for all mobile
+	// Wire up the default MobileDependency provider so embed.Client.Start() works
+	// on Android with netstack mode. Provides complete no-op stubs for all mobile
 	// dependencies so the engine's existing Android code paths work unchanged.
-	// Applications that need P2P ICE or real DNS should replace this by
-	// setting androidRunOverride before calling Start().
-	androidRunOverride = func(c *ConnectClient, runningChan chan struct{}, logPath string) error {
-		return c.runOnAndroidEmbed(
+	// Applications that need P2P ICE or real DNS should replace this by setting
+	// androidMobileDep before calling Start().
+	androidMobileDep = func(config *profilemanager.Config) MobileDependency {
+		return mobileDependencyForEmbed(
 			noopIFaceDiscover{},
 			noopNetworkChangeListener{},
 			[]netip.AddrPort{},
 			noopDnsReadyListener{},
-			runningChan,
-			logPath,
 		)
 	}
 }
--- a/client/internal/connect_android_embed.go
+++ b/client/internal/connect_android_embed.go
@@ -10,23 +10,18 @@ import (
 	"github.com/netbirdio/netbird/client/internal/stdnet"
 )

-// runOnAndroidEmbed is like RunOnAndroid but accepts a runningChan
-// so embed.Client.Start() can detect when the engine is ready.
-// It provides complete MobileDependency so the engine's existing
-// Android code paths work unchanged.
-func (c *ConnectClient) runOnAndroidEmbed(
+// mobileDependencyForEmbed builds the MobileDependency used by embed.Client on
+// Android so the engine's existing Android code paths work unchanged.
+func mobileDependencyForEmbed(
 	iFaceDiscover stdnet.ExternalIFaceDiscover,
 	networkChangeListener listener.NetworkChangeListener,
 	dnsAddresses []netip.AddrPort,
 	dnsReadyListener dns.ReadyListener,
-	runningChan chan struct{},
-	logPath string,
-) error {
-	mobileDependency := MobileDependency{
+) MobileDependency {
+	return MobileDependency{
 		IFaceDiscover:         iFaceDiscover,
 		NetworkChangeListener: networkChangeListener,
 		HostDNSAddresses:      dnsAddresses,
 		DnsReadyListener:      dnsReadyListener,
 	}
-	return c.run(mobileDependency, runningChan, logPath)
 }
--- a/client/internal/connect_lifecycle.go
+++ b/client/internal/connect_lifecycle.go
@@ -0,0 +1,362 @@
+package internal
+
+import (
+	"context"
+	"errors"
+
+	"google.golang.org/grpc/metadata"
+
+	"github.com/netbirdio/netbird/client/internal/profilemanager"
+)
+
+// errAlreadyRunning is returned when a start is requested while a run is already
+// in flight.
+var errAlreadyRunning = errors.New("client is already running")
+
+// errNoRunInFlight is returned by waitEstablishedOrDone when no run is active.
+var errNoRunInFlight = errors.New("no connection run in flight")
+
+// errStoppedBeforeEstablished is returned when a run ended (cleanly) before the
+// connection was established.
+var errStoppedBeforeEstablished = errors.New("run stopped before the connection was established")
+
+// lifecycleOp is a serialized lifecycle operation processed by the supervisor.
+type lifecycleOp int
+
+const (
+	opStart lifecycleOp = iota
+	opStop
+	opRestart
+	opStatus
+	opWaitEstablished
+)
+
+// lifecycleCmd is a single lifecycle request handed to the supervisor goroutine.
+// They all flow through the same cmdCh so they are strictly ordered (FIFO) with
+// respect to each other.
+type lifecycleCmd struct {
+	op        lifecycleOp
+	config    *profilemanager.Config
+	md        metadata.MD
+	mobileDep MobileDependency
+	logPath   string
+
+	// done is the caller's notification channel (nil for fire-and-forget). Its
+	// meaning depends on op:
+	//   - opStart: receives the run's end result when the run terminates, or
+	//     errAlreadyRunning immediately if a run is already in flight.
+	//   - opStop: receives nil once the in-flight run has fully unwound.
+	//   - opWaitEstablished: receives the wait outcome (see waitEstablishedOrDone).
+	done chan error
+
+	reply   chan bool       // opStatus only: receives whether a run is in flight
+	waitCtx context.Context // opWaitEstablished only: the waiter's cancellation context
+}
+
+// runState holds the lifecycle channels of a single in-flight run, owned by the
+// loop goroutine. It never escapes the supervisor as an API; the only readers
+// are the per-wait goroutines the loop spawns for opWaitEstablished.
+//
+// connEstablishedChan is closed by the run once the connection is established.
+// The supervisor creates and owns it — callers no longer supply it; they observe
+// it through waitEstablishedOrDone. ended is closed (broadcast) when the run
+// terminates, so any number of waiters can observe it; err is the run's end
+// result, valid only after ended is closed.
+type runState struct {
+	connEstablishedChan chan struct{} // closed by the run on established
+	ended               chan struct{} // closed by finishRun when the run terminates
+	err                 error         // run end result, valid after ended is closed
+}
+
+// runEndResult is sent by the run goroutine to the supervisor when a run ends,
+// whether on its own (error / external context cancellation) or because of a Stop.
+type runEndResult struct {
+	err error
+}
+
+// runFunc executes a single client run bound to the supervisor-owned context,
+// with the config supplied by the start request.
+type runFunc func(ctx context.Context, config *profilemanager.Config, mobileDep MobileDependency, connEstablishedChan chan struct{}, logPath string) error
+
+// supervisor serializes start/stop of a single client run. Every request goes
+// through cmdCh and is handled one at a time by the loop goroutine, so two
+// lifecycle operations can never overlap and their order is preserved (FIFO).
+// The loop goroutine is the sole owner of curStart/runCancel, so that state
+// needs no locking. The loop exits when the parent context is cancelled.
+type supervisor struct {
+	ctx      context.Context
+	run      runFunc
+	cmdCh    chan lifecycleCmd
+	runEnded chan runEndResult
+
+	// owned exclusively by the loop goroutine. curStart is the in-flight start
+	// command (nil = idle); its done channel is notified when the run ends.
+	// curRun holds that run's lifecycle channels; runCancel cancels it.
+	curStart  *lifecycleCmd
+	curRun    *runState
+	runCancel context.CancelFunc
+}
+
+func newSupervisor(ctx context.Context, run runFunc) *supervisor {
+	s := &supervisor{
+		ctx:      ctx,
+		run:      run,
+		cmdCh:    make(chan lifecycleCmd, 16),
+		runEnded: make(chan runEndResult, 1),
+	}
+	go s.loop()
+	return s
+}
+
+func (s *supervisor) loop() {
+	for {
+		select {
+		case <-s.ctx.Done():
+			s.shutdown()
+			return
+		case cmd := <-s.cmdCh:
+			switch cmd.op {
+			case opStart:
+				s.handleStart(cmd)
+			case opStop:
+				s.handleStop(cmd)
+			case opRestart:
+				s.handleRestart(cmd)
+			case opStatus:
+				cmd.reply <- (s.isRunningInternal())
+			case opWaitEstablished:
+				s.handleWaitEstablished(cmd)
+			}
+		case res := <-s.runEnded:
+			// Run ended on its own, without an explicit Stop.
+			s.finishRun(res.err)
+		}
+	}
+}
+
+func (s *supervisor) handleStart(cmd lifecycleCmd) {
+	if s.isRunningInternal() {
+		notify(cmd.done, errAlreadyRunning)
+		return
+	}
+
+	runCtx, cancel := context.WithCancel(s.ctx)
+	if cmd.md != nil {
+		// Carry caller-supplied gRPC metadata (e.g. UI user-agent) into the run
+		// context so the engine's management/signal calls forward it. The cancel
+		// still drives runCtx (metadata wrapping preserves cancellation).
+		runCtx = metadata.NewOutgoingContext(runCtx, cmd.md)
+	}
+	s.runCancel = cancel
+	s.curStart = &cmd
+	s.curRun = &runState{connEstablishedChan: make(chan struct{}), ended: make(chan struct{})}
+
+	go func(ctx context.Context, cfg *profilemanager.Config, m MobileDependency, established chan struct{}, lp string) {
+		err := s.run(ctx, cfg, m, established, lp)
+		s.runEnded <- runEndResult{err: err}
+	}(runCtx, cmd.config, cmd.mobileDep, s.curRun.connEstablishedChan, cmd.logPath)
+}
+
+func (s *supervisor) handleStop(cmd lifecycleCmd) {
+	if !s.isRunningInternal() {
+		notify(cmd.done, nil)
+		return
+	}
+	s.stopCurrentRun()
+	notify(cmd.done, nil)
+}
+
+// handleRestart tears down any in-flight run and starts a fresh one in a single
+// loop turn. No other command can interleave between the stop and the start
+// (the loop is single-threaded), so the swap is atomic without relying on any
+// daemon-side lock — that is what an explicit restart (e.g. MDM config change)
+// needs to avoid a window where the client is observably stopped.
+func (s *supervisor) handleRestart(cmd lifecycleCmd) {
+	if s.isRunningInternal() {
+		s.stopCurrentRun()
+	}
+	s.handleStart(cmd)
+}
+
+// stopCurrentRun cancels the in-flight run and blocks the supervisor until it
+// has fully unwound, so the next action starts from a clean slate. The run
+// goroutine reports completion via runEnded. Caller must hold an in-flight run
+// (curStart != nil).
+func (s *supervisor) stopCurrentRun() {
+	s.runCancel()
+	res := <-s.runEnded
+	s.finishRun(res.err)
+}
+
+// finishRun resets lifecycle state after a run terminates and hands the run
+// error back to whoever asked to be notified of the start.
+func (s *supervisor) finishRun(err error) {
+	s.runCancel = nil
+	if s.isRunningInternal() {
+		// Publish the result to the broadcast channel before nil-ing curRun, so
+		// any opWaitEstablished goroutines blocked on ended observe err.
+		s.curRun.err = err
+		close(s.curRun.ended)
+		s.curRun = nil
+
+		notify(s.curStart.done, err)
+		s.curStart = nil
+	}
+}
+
+// handleWaitEstablished answers an opWaitEstablished request. The select itself
+// runs in a spawned goroutine on the run's channels so it never blocks the loop;
+// the loop only snapshots the in-flight run's channels (which it owns) here.
+func (s *supervisor) handleWaitEstablished(cmd lifecycleCmd) {
+	caller := cmd.done
+	if !s.isRunningInternal() {
+		notify(caller, errNoRunInFlight)
+		return
+	}
+	rs := s.curRun
+	established := rs.connEstablishedChan
+	ctx := cmd.waitCtx
+	go func() {
+		select {
+		case <-established:
+			notify(caller, nil)
+		case <-rs.ended:
+			if rs.err != nil {
+				notify(caller, rs.err)
+				return
+			}
+			notify(caller, errStoppedBeforeEstablished)
+		case <-ctx.Done():
+			notify(caller, ctx.Err())
+		}
+	}()
+}
+
+// shutdown tears down the in-flight run when the parent context is cancelled,
+// then fails any still-queued commands so their callers never hang.
+func (s *supervisor) shutdown() {
+	if s.runCancel != nil {
+		s.runCancel()
+		res := <-s.runEnded
+		s.finishRun(res.err)
+	}
+	for {
+		select {
+		case cmd := <-s.cmdCh:
+			notify(cmd.done, s.ctx.Err())
+		default:
+			return
+		}
+	}
+}
+
+// startAsync enqueues a start without blocking. If done is non-nil it receives
+// the run's end result (or errAlreadyRunning on rejection, or the context error
+// on shutdown).
+func (s *supervisor) startAsync(config *profilemanager.Config, md metadata.MD, mobileDep MobileDependency, logPath string, done chan error) {
+	cmd := lifecycleCmd{op: opStart, config: config, md: md, mobileDep: mobileDep, logPath: logPath, done: done}
+	select {
+	case s.cmdCh <- cmd:
+	case <-s.ctx.Done():
+		notify(done, s.ctx.Err())
+	}
+}
+
+// restartAsync enqueues an atomic stop+start without blocking. The supervisor
+// tears down any in-flight run and starts a fresh one with the supplied config
+// in a single loop turn (see handleRestart). Fire-and-forget: the new run owns
+// its lifecycle channels, observed via waitEstablishedOrDone.
+func (s *supervisor) restartAsync(config *profilemanager.Config, md metadata.MD, mobileDep MobileDependency, logPath string) {
+	cmd := lifecycleCmd{op: opRestart, config: config, md: md, mobileDep: mobileDep, logPath: logPath}
+	select {
+	case s.cmdCh <- cmd:
+	case <-s.ctx.Done():
+	}
+}
+
+// start enqueues a start and blocks until the run terminates, preserving the
+// blocking contract of the legacy Run entry points.
+func (s *supervisor) start(config *profilemanager.Config, md metadata.MD, mobileDep MobileDependency, logPath string) error {
+	done := make(chan error, 1)
+	s.startAsync(config, md, mobileDep, logPath, done)
+	select {
+	case err := <-done:
+		return err
+	case <-s.ctx.Done():
+		return s.ctx.Err()
+	}
+}
+
+// isRunning asks the loop whether a run is in flight. The query is serialized
+// with start/stop, so during a stop it waits for the teardown to settle and
+// then reports the final state — never a transient "half-stopped".
+func (s *supervisor) isRunning() bool {
+	reply := make(chan bool, 1)
+	select {
+	case s.cmdCh <- lifecycleCmd{op: opStatus, reply: reply}:
+	case <-s.ctx.Done():
+		return false
+	}
+	select {
+	case r := <-reply:
+		return r
+	case <-s.ctx.Done():
+		return false
+	}
+}
+
+func (s *supervisor) isRunningInternal() bool {
+	return s.curStart != nil
+}
+
+// waitEstablishedOrDone blocks until the in-flight run becomes established
+// (returns nil) or ends before that (returns the run error, or
+// errStoppedBeforeEstablished on a clean stop), or ctx is cancelled. Returns
+// errNoRunInFlight if no run is in flight. The wait is performed by a goroutine
+// spawned inside the loop (see handleWaitEstablished); the run's channels never
+// leave the supervisor.
+func (s *supervisor) waitEstablishedOrDone(ctx context.Context) error {
+	reply := make(chan error, 1)
+	select {
+	case s.cmdCh <- lifecycleCmd{op: opWaitEstablished, waitCtx: ctx, done: reply}:
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-s.ctx.Done():
+		return s.ctx.Err()
+	}
+	select {
+	case err := <-reply:
+		return err
+	case <-s.ctx.Done():
+		return s.ctx.Err()
+	}
+}
+
+// stop enqueues a stop and blocks until the in-flight run is fully torn down.
+func (s *supervisor) stop() error {
+	done := make(chan error, 1)
+	select {
+	case s.cmdCh <- lifecycleCmd{op: opStop, done: done}:
+	case <-s.ctx.Done():
+		return s.ctx.Err()
+	}
+	select {
+	case err := <-done:
+		return err
+	case <-s.ctx.Done():
+		return s.ctx.Err()
+	}
+}
+
+// notify sends on a caller-supplied channel without blocking. The channel is
+// expected to be buffered (cap 1); a nil channel means the caller did not ask
+// to be notified.
+func notify(ch chan error, err error) {
+	if ch == nil {
+		return
+	}
+	select {
+	case ch <- err:
+	default:
+	}
+}
--- a/client/internal/dns/mgmt/mgmt.go
+++ b/client/internal/dns/mgmt/mgmt.go
@@ -51,20 +51,13 @@ type cachedRecord struct {
 }

 // Resolver caches critical NetBird infrastructure domains.
-// records, refreshing, failedResolves, mgmtDomain and serverDomains are all
-// guarded by mutex.
+// records, refreshing, mgmtDomain and serverDomains are all guarded by mutex.
 type Resolver struct {
 	records       map[dns.Question]*cachedRecord
 	mgmtDomain    *domain.Domain
 	serverDomains *dnsconfig.ServerDomains
 	mutex         sync.RWMutex

-	// failedResolves records the last failed initial resolve per domain so a
-	// domain that never resolves isn't retried on every server-domains update
-	// until refreshBackoff elapses. Entries are cleared on success and pruned
-	// to the current server-domains set.
-	failedResolves map[domain.Domain]time.Time
-
 	chain            ChainResolver
 	chainMaxPriority int
 	refreshGroup     singleflight.Group
@@ -83,10 +76,9 @@ type Resolver struct {
 // NewResolver creates a new management domains cache resolver.
 func NewResolver() *Resolver {
 	return &Resolver{
-		records:        make(map[dns.Question]*cachedRecord),
-		refreshing:     make(map[dns.Question]*atomic.Bool),
-		failedResolves: make(map[domain.Domain]time.Time),
-		cacheTTL:       resolveCacheTTL(),
+		records:    make(map[dns.Question]*cachedRecord),
+		refreshing: make(map[dns.Question]*atomic.Bool),
+		cacheTTL:   resolveCacheTTL(),
 	}
 }

@@ -181,9 +173,7 @@ func (m *Resolver) continueToNext(w dns.ResponseWriter, r *dns.Msg) {

 // AddDomain resolves a domain and stores its A/AAAA records in the cache.
 // A family that resolves NODATA (nil err, zero records) evicts any stale
-// entry for that qtype. When one family hard-errors while the other succeeds,
-// the resolved family is still cached but AddDomain returns an error so the
-// caller retries the incomplete resolve rather than treating it as complete.
+// entry for that qtype.
 func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
 	dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))

@@ -213,10 +203,6 @@ func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
 	log.Debugf("added/updated domain=%s with %d A records and %d AAAA records",
 		d.SafeString(), len(aRecords), len(aaaaRecords))

-	if errA != nil || errAAAA != nil {
-		return fmt.Errorf("resolve %s: incomplete, a family failed: %w", d.SafeString(), errors.Join(errA, errAAAA))
-	}
-
 	return nil
 }

@@ -476,7 +462,6 @@ func (m *Resolver) RemoveDomain(d domain.Domain) error {
 	delete(m.records, qAAAA)
 	delete(m.refreshing, qA)
 	delete(m.refreshing, qAAAA)
-	delete(m.failedResolves, d)

 	log.Debugf("removed domain=%s from cache", d.SafeString())
 	return nil
@@ -520,7 +505,6 @@ func (m *Resolver) UpdateFromServerDomains(ctx context.Context, serverDomains dn
 		allDomains := m.extractDomainsFromServerDomains(updatedServerDomains)
 		currentDomains := m.GetCachedDomains()
 		removedDomains = m.removeStaleDomains(currentDomains, allDomains)
-		m.pruneFailedResolves(allDomains)
 	}

 	m.addNewDomains(ctx, newDomains)
@@ -593,85 +577,13 @@ func (m *Resolver) isManagementDomain(domain domain.Domain) bool {
 	return m.mgmtDomain != nil && domain == *m.mgmtDomain
 }

-// addNewDomains resolves and caches domains that are not yet in the cache,
-// running the lookups concurrently. Domains already cached are skipped and left
-// to the stale-while-revalidate refresh path, so a sync never re-resolves them
-// synchronously: once NetBird owns the OS resolver the resolve runs through the
-// handler chain and would otherwise dial the managed upstreams under the engine
-// sync lock on every update.
+// addNewDomains resolves and caches all domains from the update
 func (m *Resolver) addNewDomains(ctx context.Context, newDomains domain.List) {
-	var wg sync.WaitGroup
-	seen := make(map[domain.Domain]struct{}, len(newDomains))
 	for _, newDomain := range newDomains {
-		if _, dup := seen[newDomain]; dup {
-			continue
-		}
-		seen[newDomain] = struct{}{}
-
-		if !m.needsResolve(newDomain) {
-			continue
-		}
-
-		wg.Add(1)
-		go func(d domain.Domain) {
-			defer wg.Done()
-			if err := m.AddDomain(ctx, d); err != nil {
-				m.markResolveFailed(d)
-				log.Warnf("failed to add/update domain=%s: %v", d.SafeString(), err)
-				return
-			}
-			m.clearResolveFailed(d)
-			log.Debugf("added/updated management cache domain=%s", d.SafeString())
-		}(newDomain)
-	}
-	wg.Wait()
-}
-
-// needsResolve reports whether d should be resolved now. A recent failed or
-// incomplete resolve gates retries on the backoff even when one family is
-// already cached, so a transiently-failed family is retried instead of being
-// treated as fully resolved. Otherwise a domain with any cached record is left
-// to the stale-while-revalidate refresh path.
-func (m *Resolver) needsResolve(d domain.Domain) bool {
-	dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))
-
-	m.mutex.RLock()
-	defer m.mutex.RUnlock()
-
-	if failedAt, ok := m.failedResolves[d]; ok {
-		return time.Since(failedAt) >= refreshBackoff
-	}
-
-	for _, qtype := range []uint16{dns.TypeA, dns.TypeAAAA} {
-		q := dns.Question{Name: dnsName, Qtype: qtype, Qclass: dns.ClassINET}
-		if _, ok := m.records[q]; ok {
-			return false
-		}
-	}
-	return true
-}
-
-func (m *Resolver) markResolveFailed(d domain.Domain) {
-	m.mutex.Lock()
-	m.failedResolves[d] = time.Now()
-	m.mutex.Unlock()
-}
-
-func (m *Resolver) clearResolveFailed(d domain.Domain) {
-	m.mutex.Lock()
-	delete(m.failedResolves, d)
-	m.mutex.Unlock()
-}
-
-// pruneFailedResolves drops failure markers for domains no longer present in
-// the server-domains set, keeping the map bounded to the current set (a
-// failed-only domain has no cached record, so RemoveDomain never sees it).
-func (m *Resolver) pruneFailedResolves(domains domain.List) {
-	m.mutex.Lock()
-	defer m.mutex.Unlock()
-	for d := range m.failedResolves {
-		if !slices.Contains(domains, d) {
-			delete(m.failedResolves, d)
+		if err := m.AddDomain(ctx, newDomain); err != nil {
+			log.Warnf("failed to add/update domain=%s: %v", newDomain.SafeString(), err)
+		} else {
+			log.Debugf("added/updated management cache domain=%s", newDomain.SafeString())
 		}
 	}
 }
--- a/client/internal/dns/mgmt/mgmt_refresh_test.go
+++ b/client/internal/dns/mgmt/mgmt_refresh_test.go
@@ -21,7 +21,6 @@ type fakeChain struct {
 	mu       sync.Mutex
 	calls    map[string]int
 	answers  map[string][]dns.RR
-	qErr     map[string]error
 	err      error
 	hasRoot  bool
 	onLookup func()
@@ -31,7 +30,6 @@ func newFakeChain() *fakeChain {
 	return &fakeChain{
 		calls:   map[string]int{},
 		answers: map[string][]dns.RR{},
-		qErr:    map[string]error{},
 		hasRoot: true,
 	}
 }
@@ -49,9 +47,6 @@ func (f *fakeChain) ResolveInternal(ctx context.Context, msg *dns.Msg, maxPriori
 	f.calls[key]++
 	answers := f.answers[key]
 	err := f.err
-	if err == nil {
-		err = f.qErr[key]
-	}
 	onLookup := f.onLookup
 	f.mu.Unlock()

@@ -80,12 +75,6 @@ func (f *fakeChain) setAnswer(name string, qtype uint16, ip string) {
 	}
 }

-func (f *fakeChain) setErr(name string, qtype uint16, err error) {
-	f.mu.Lock()
-	defer f.mu.Unlock()
-	f.qErr[name+"|"+dns.TypeToString[qtype]] = err
-}
-
 func (f *fakeChain) callCount(name string, qtype uint16) int {
 	f.mu.Lock()
 	defer f.mu.Unlock()
--- a/client/internal/dns/mgmt/mgmt_resolve_test.go
+++ b/client/internal/dns/mgmt/mgmt_resolve_test.go
@@ -1,183 +0,0 @@
-package mgmt
-
-import (
-	"context"
-	"errors"
-	"sync/atomic"
-	"testing"
-	"time"
-
-	"github.com/miekg/dns"
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-
-	dnsconfig "github.com/netbirdio/netbird/client/internal/dns/config"
-	"github.com/netbirdio/netbird/shared/management/domain"
-)
-
-// A domain already in the cache must not be re-resolved on a subsequent server
-// domains update; it is left to the stale-while-revalidate refresh path.
-func TestResolver_UpdateFromServerDomains_SkipsCached(t *testing.T) {
-	r := NewResolver()
-	chain := newFakeChain()
-	chain.setAnswer("signal.example.com.", dns.TypeA, "10.0.0.2")
-	r.SetChainResolver(chain, 50)
-
-	sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
-
-	_, err := r.UpdateFromServerDomains(context.Background(), sd)
-	require.NoError(t, err)
-	require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
-		"first update must resolve the domain")
-
-	_, err = r.UpdateFromServerDomains(context.Background(), sd)
-	require.NoError(t, err)
-	assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
-		"cached domain must not be re-resolved on a subsequent update")
-}
-
-// New domains in a single update must resolve concurrently rather than serially.
-func TestResolver_AddNewDomains_ResolvesConcurrently(t *testing.T) {
-	r := NewResolver()
-	chain := newFakeChain()
-
-	var inflight, maxInflight atomic.Int32
-	chain.onLookup = func() {
-		n := inflight.Add(1)
-		for {
-			old := maxInflight.Load()
-			if n <= old || maxInflight.CompareAndSwap(old, n) {
-				break
-			}
-		}
-		time.Sleep(50 * time.Millisecond)
-		inflight.Add(-1)
-	}
-
-	relays := []domain.Domain{"a.example.com", "b.example.com", "c.example.com", "d.example.com"}
-	for _, d := range relays {
-		chain.setAnswer(dns.Fqdn(string(d)), dns.TypeA, "10.0.0.2")
-	}
-	r.SetChainResolver(chain, 50)
-
-	start := time.Now()
-	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: relays})
-	require.NoError(t, err)
-	elapsed := time.Since(start)
-
-	assert.GreaterOrEqual(t, int(maxInflight.Load()), 2, "domains must resolve concurrently")
-	// Serial resolution of 4 domains would take at least 4*50ms; concurrent is far less.
-	assert.Less(t, elapsed, 300*time.Millisecond, "resolution should not be serial")
-}
-
-// A domain that fails to resolve must not be retried on every update; the
-// failure backoff suppresses re-resolution until it expires.
-func TestResolver_UpdateFromServerDomains_BacksOffFailures(t *testing.T) {
-	r := NewResolver()
-	chain := newFakeChain()
-	chain.err = errors.New("resolve boom")
-	r.SetChainResolver(chain, 50)
-
-	sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
-
-	_, err := r.UpdateFromServerDomains(context.Background(), sd)
-	require.NoError(t, err)
-	require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
-		"first update must attempt the resolve")
-
-	_, err = r.UpdateFromServerDomains(context.Background(), sd)
-	require.NoError(t, err)
-	assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
-		"failed resolve must back off and not retry on the next update")
-}
-
-// A domain listed under more than one server-domain type (e.g. STUN and TURN on
-// the same host) must be resolved once per update, not once per occurrence.
-func TestResolver_AddNewDomains_DedupesDuplicateDomains(t *testing.T) {
-	r := NewResolver()
-	chain := newFakeChain()
-	chain.setAnswer("dup.example.com.", dns.TypeA, "10.0.0.9")
-	r.SetChainResolver(chain, 50)
-
-	sd := dnsconfig.ServerDomains{
-		Stuns: []domain.Domain{"dup.example.com"},
-		Turns: []domain.Domain{"dup.example.com"},
-	}
-
-	_, err := r.UpdateFromServerDomains(context.Background(), sd)
-	require.NoError(t, err)
-	assert.Equal(t, 1, chain.callCount("dup.example.com.", dns.TypeA),
-		"a domain appearing under multiple server-domain types must resolve once")
-}
-
-// A failure marker must be dropped once its domain leaves the server-domains set
-// so the map stays bounded to the current set.
-func TestResolver_UpdateFromServerDomains_PrunesFailedResolves(t *testing.T) {
-	r := NewResolver()
-	chain := newFakeChain()
-	chain.err = errors.New("resolve boom")
-	r.SetChainResolver(chain, 50)
-
-	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("gone.example.com")})
-	require.NoError(t, err)
-	r.mutex.RLock()
-	_, marked := r.failedResolves[domain.Domain("gone.example.com")]
-	r.mutex.RUnlock()
-	require.True(t, marked, "failed resolve must be recorded")
-
-	_, err = r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("other.example.com")})
-	require.NoError(t, err)
-	r.mutex.RLock()
-	_, stillMarked := r.failedResolves[domain.Domain("gone.example.com")]
-	r.mutex.RUnlock()
-	assert.False(t, stillMarked, "failure marker for a domain no longer in the set must be pruned")
-}
-
-// When one family hard-errors while the other resolves, the domain is cached
-// for the working family but recorded as incomplete so the failed family is
-// retried under backoff instead of being treated as fully resolved forever.
-func TestResolver_AddNewDomains_RetriesPartialFamilyFailure(t *testing.T) {
-	d := domain.Domain("relay.example.com")
-	r := NewResolver()
-	chain := newFakeChain()
-	chain.setAnswer("relay.example.com.", dns.TypeA, "10.0.0.2")
-	chain.setErr("relay.example.com.", dns.TypeAAAA, errors.New("servfail"))
-	r.SetChainResolver(chain, 50)
-
-	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
-	require.NoError(t, err)
-
-	r.mutex.RLock()
-	_, aCached := r.records[dns.Question{Name: "relay.example.com.", Qtype: dns.TypeA, Qclass: dns.ClassINET}]
-	_, marked := r.failedResolves[d]
-	r.mutex.RUnlock()
-	require.True(t, aCached, "the working family must still be cached")
-	require.True(t, marked, "a partial failure must be recorded so the failed family is retried")
-
-	assert.False(t, r.needsResolve(d), "within the backoff window the domain is not retried")
-
-	r.mutex.Lock()
-	r.failedResolves[d] = time.Now().Add(-2 * refreshBackoff)
-	r.mutex.Unlock()
-	assert.True(t, r.needsResolve(d), "after the backoff elapses the domain is retried to pick up the missing family")
-}
-
-// A family that returns NODATA (legitimately absent, e.g. an IPv4-only host) is
-// not a failure: the domain must not be marked for retry, otherwise it would be
-// re-resolved on every sync.
-func TestResolver_AddNewDomains_NodataIsNotFailure(t *testing.T) {
-	d := domain.Domain("v4only.example.com")
-	r := NewResolver()
-	chain := newFakeChain()
-	chain.setAnswer("v4only.example.com.", dns.TypeA, "10.0.0.2")
-	r.SetChainResolver(chain, 50)
-
-	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
-	require.NoError(t, err)
-
-	r.mutex.RLock()
-	_, marked := r.failedResolves[d]
-	r.mutex.RUnlock()
-	assert.False(t, marked, "a NODATA family must not be recorded as a failure")
-	assert.False(t, r.needsResolve(d), "an IPv4-only host must not be re-resolved on later syncs")
-}
--- a/client/internal/dns/resutil/resolve.go
+++ b/client/internal/dns/resutil/resolve.go
@@ -207,35 +207,3 @@ func FormatAnswers(answers []dns.RR) string {
 	}
 	return "[" + strings.Join(parts, ", ") + "]"
 }
-
-// StripOPT removes any OPT pseudo-RRs from the message's Extra section. Per
-// RFC 6891 a responder must not include an OPT RR toward a client that did not
-// advertise EDNS0.
-func StripOPT(msg *dns.Msg) {
-	if len(msg.Extra) == 0 {
-		return
-	}
-	out := msg.Extra[:0]
-	for _, rr := range msg.Extra {
-		if _, ok := rr.(*dns.OPT); ok {
-			continue
-		}
-		out = append(out, rr)
-	}
-	msg.Extra = out
-}
-
-// ExtractEDE returns the first Extended DNS Error (RFC 8914) option carried in
-// the message, if present.
-func ExtractEDE(msg *dns.Msg) (*dns.EDNS0_EDE, bool) {
-	opt := msg.IsEdns0()
-	if opt == nil {
-		return nil, false
-	}
-	for _, o := range opt.Option {
-		if ede, ok := o.(*dns.EDNS0_EDE); ok {
-			return ede, true
-		}
-	}
-	return nil, false
-}
--- a/client/internal/dns/resutil/resolve_test.go
+++ b/client/internal/dns/resutil/resolve_test.go
@@ -120,42 +120,3 @@ func TestLookupIP_DNSErrorNotIsNotFound(t *testing.T) {

 	assert.Equal(t, dns.RcodeServerFailure, result.Rcode, "upstream failure should map to SERVFAIL")
 }
-
-func TestStripOPT(t *testing.T) {
-	rm := &dns.Msg{
-		Extra: []dns.RR{
-			&dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}},
-			&dns.A{Hdr: dns.RR_Header{Name: "x.", Rrtype: dns.TypeA}, A: net.IPv4(1, 2, 3, 4)},
-		},
-	}
-	StripOPT(rm)
-	assert.Len(t, rm.Extra, 1, "OPT should be removed, A kept")
-	_, isOPT := rm.Extra[0].(*dns.OPT)
-	assert.False(t, isOPT, "remaining record must not be OPT")
-}
-
-func TestExtractEDE(t *testing.T) {
-	t.Run("no edns", func(t *testing.T) {
-		_, ok := ExtractEDE(&dns.Msg{})
-		assert.False(t, ok, "message without OPT has no EDE")
-	})
-
-	t.Run("edns without ede", func(t *testing.T) {
-		rm := &dns.Msg{}
-		rm.SetEdns0(4096, false)
-		_, ok := ExtractEDE(rm)
-		assert.False(t, ok, "OPT without EDE option returns false")
-	})
-
-	t.Run("with ede", func(t *testing.T) {
-		rm := &dns.Msg{}
-		opt := &dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}}
-		opt.Option = append(opt.Option, &dns.EDNS0_EDE{InfoCode: 49152, ExtraText: "upstream timeout"})
-		rm.Extra = append(rm.Extra, opt)
-
-		ede, ok := ExtractEDE(rm)
-		assert.True(t, ok, "EDE option should be found")
-		assert.Equal(t, uint16(49152), ede.InfoCode)
-		assert.Equal(t, "upstream timeout", ede.ExtraText)
-	})
-}
--- a/client/internal/dns/server.go
+++ b/client/internal/dns/server.go
@@ -6,7 +6,6 @@ import (
 	"fmt"
 	"net/netip"
 	"net/url"
-	"os"
 	"slices"
 	"strings"
 	"sync"
@@ -39,15 +38,11 @@ const (
 	// defaultWarningDelayBase is the starting grace window before a
 	// "Nameserver group unreachable" event fires for a group that's
 	// never been healthy and only has overlay upstreams with no
-	// Connected peer. Per-server and overridable via envWarningDelay;
-	// see warningDelay.
-	defaultWarningDelayBase = 60 * time.Second
+	// Connected peer. Per-server and overridable; see warningDelayFor.
+	defaultWarningDelayBase = 30 * time.Second
 	// warningDelayBonusCap caps the route-count bonus added to the
-	// base grace window. See warningDelay.
+	// base grace window. See warningDelayFor.
 	warningDelayBonusCap = 30 * time.Second
-	// envWarningDelay overrides defaultWarningDelayBase with a Go duration
-	// string (e.g. "90s", "2m"). Invalid or non-positive values are ignored.
-	envWarningDelay = "NB_DNS_HEALTH_WARNING_DELAY"
 )

 // errNoUsableNameservers signals that a merged-domain group has no usable
@@ -140,7 +135,7 @@ type DefaultServer struct {
 	disableSys         bool
 	mux                sync.Mutex
 	service            service
-	dnsMuxHandlers     []handlerWrapper
+	dnsMuxMap          registeredHandlerMap
 	localResolver      *local.Resolver
 	wgInterface        WGIface
 	hostManager        hostManager
@@ -204,6 +199,8 @@ type handlerWrapper struct {
 	priority int
 }

+type registeredHandlerMap map[types.HandlerID]handlerWrapper
+
 // DefaultServerConfig holds configuration parameters for NewDefaultServer
 type DefaultServerConfig struct {
 	WgInterface    WGIface
@@ -292,6 +289,7 @@ func newDefaultServer(
 		service:           dnsService,
 		handlerChain:      handlerChain,
 		extraDomains:      make(map[domain.Domain]int),
+		dnsMuxMap:         make(registeredHandlerMap),
 		localResolver:     local.NewResolver(),
 		wgInterface:       wgInterface,
 		statusRecorder:    statusRecorder,
@@ -300,7 +298,7 @@ func newDefaultServer(
 		hostManager:       &noopHostConfigurator{},
 		mgmtCacheResolver: mgmtCacheResolver,
 		currentConfigHash: ^uint64(0), // Initialize to max uint64 to ensure first config is always applied
-		warningDelayBase:  warningDelayBaseFromEnv(),
+		warningDelayBase:  defaultWarningDelayBase,
 		healthRefresh:     make(chan struct{}, 1),
 	}
 	// Wire the local resolver against the peer status recorder so it can
@@ -330,7 +328,7 @@ func (s *DefaultServer) SetRouteSources(selected, active func() route.HAMap) {
 	type routeSettable interface {
 		setSelectedRoutes(func() route.HAMap)
 	}
-	for _, entry := range s.dnsMuxHandlers {
+	for _, entry := range s.dnsMuxMap {
 		if h, ok := entry.handler.(routeSettable); ok {
 			h.setSelectedRoutes(selected)
 		}
@@ -980,23 +978,19 @@ func (s *DefaultServer) usableNameServers(nameServers []nbdns.NameServer) []neti

 func (s *DefaultServer) updateMux(muxUpdates []handlerWrapper) {
 	// this will introduce a short period of time when the server is not able to handle DNS requests
-	for _, existing := range s.dnsMuxHandlers {
+	for _, existing := range s.dnsMuxMap {
 		s.deregisterHandler([]string{existing.domain}, existing.priority)
-		// The local resolver is a persistent singleton shared by every custom
-		// zone and reused across config updates. Its chain registrations are
-		// per-config and must be deregistered, but Stop() cancels its lookup
-		// context (breaking external CNAME-target resolution) and clears its
-		// records, so it must not be torn down here.
-		if existing.handler != s.localResolver {
-			existing.handler.Stop()
-		}
+		existing.handler.Stop()
 	}

+	muxUpdateMap := make(registeredHandlerMap)
+
 	for _, update := range muxUpdates {
 		s.registerHandler([]string{update.domain}, update.handler, update.priority)
+		muxUpdateMap[update.handler.ID()] = update
 	}

-	s.dnsMuxHandlers = muxUpdates
+	s.dnsMuxMap = muxUpdateMap
 }

 // updateNSGroupStates records the new group set and pokes the refresher.
@@ -1160,26 +1154,6 @@ func (s *DefaultServer) projectUnhealthy(p *nsGroupProj, servers []netip.AddrPor
 	return false
 }

-// warningDelayBaseFromEnv returns the base grace window, honoring
-// envWarningDelay when it holds a valid positive Go duration. Invalid or
-// non-positive values fall back to defaultWarningDelayBase.
-func warningDelayBaseFromEnv() time.Duration {
-	val := os.Getenv(envWarningDelay)
-	if val == "" {
-		return defaultWarningDelayBase
-	}
-	d, err := time.ParseDuration(val)
-	if err != nil {
-		log.Warnf("invalid %s value %q, using default %v: %v", envWarningDelay, val, defaultWarningDelayBase, err)
-		return defaultWarningDelayBase
-	}
-	if d <= 0 {
-		log.Warnf("%s must be positive, got %v, using default %v", envWarningDelay, d, defaultWarningDelayBase)
-		return defaultWarningDelayBase
-	}
-	return d
-}
-
 // warningDelay returns the grace window for the given selected-route
 // count. Scales gently: +1s per 100 routes, capped by
 // warningDelayBonusCap. Parallel handshakes mean handshake time grows
@@ -1230,7 +1204,7 @@ func (s *DefaultServer) groupHasImmediateUpstream(servers []netip.AddrPort, snap
 // in more than one handler.
 func (s *DefaultServer) collectUpstreamHealth() map[netip.AddrPort]UpstreamHealth {
 	merged := make(map[netip.AddrPort]UpstreamHealth)
-	for _, entry := range s.dnsMuxHandlers {
+	for _, entry := range s.dnsMuxMap {
 		reporter, ok := entry.handler.(upstreamHealthReporter)
 		if !ok {
 			continue
--- a/client/internal/dns/server_test.go
+++ b/client/internal/dns/server_test.go
@@ -104,6 +104,19 @@ func init() {
 	formatter.SetTextFormatter(log.StandardLogger())
 }

+func generateDummyHandler(d string, servers []nbdns.NameServer) *upstreamResolverBase {
+	var srvs []netip.AddrPort
+	for _, srv := range servers {
+		srvs = append(srvs, srv.AddrPort())
+	}
+	u := &upstreamResolverBase{
+		domain: domain.Domain(d),
+		cancel: func() {},
+	}
+	u.addRace(srvs)
+	return u
+}
+
 func TestUpdateDNSServer(t *testing.T) {

 	nameServers := []nbdns.NameServer{
@@ -119,20 +132,22 @@ func TestUpdateDNSServer(t *testing.T) {
 		},
 	}

+	dummyHandler := local.NewResolver()
+
 	testCases := []struct {
 		name                string
-		initUpstreamMap     []handlerWrapper
+		initUpstreamMap     registeredHandlerMap
 		initLocalZones      []nbdns.CustomZone
 		initSerial          uint64
 		inputSerial         uint64
 		inputUpdate         nbdns.Config
 		shouldFail          bool
-		expectedUpstreamMap []handlerWrapper
+		expectedUpstreamMap registeredHandlerMap
 		expectedLocalQs     []dns.Question
 	}{
 		{
 			name:            "Initial Config Should Succeed",
-			initUpstreamMap: nil,
+			initUpstreamMap: make(registeredHandlerMap),
 			initSerial:      0,
 			inputSerial:     1,
 			inputUpdate: nbdns.Config{
@@ -154,17 +169,20 @@ func TestUpdateDNSServer(t *testing.T) {
 					},
 				},
 			},
-			expectedUpstreamMap: []handlerWrapper{
-				{
+			expectedUpstreamMap: registeredHandlerMap{
+				generateDummyHandler("netbird.io", nameServers).ID(): handlerWrapper{
 					domain:   "netbird.io",
+					handler:  dummyHandler,
 					priority: PriorityUpstream,
 				},
-				{
+				dummyHandler.ID(): handlerWrapper{
 					domain:   "netbird.cloud",
+					handler:  dummyHandler,
 					priority: PriorityLocal,
 				},
-				{
+				generateDummyHandler(".", nameServers).ID(): handlerWrapper{
 					domain:   nbdns.RootZone,
+					handler:  dummyHandler,
 					priority: PriorityDefault,
 				},
 			},
@@ -173,10 +191,10 @@ func TestUpdateDNSServer(t *testing.T) {
 		{
 			name:           "New Config Should Succeed",
 			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: 1, Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
-			initUpstreamMap: []handlerWrapper{
-				{
+			initUpstreamMap: registeredHandlerMap{
+				generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
 					domain:   "netbird.cloud",
-					handler:  &mockHandler{},
+					handler:  dummyHandler,
 					priority: PriorityUpstream,
 				},
 			},
@@ -197,13 +215,15 @@ func TestUpdateDNSServer(t *testing.T) {
 					},
 				},
 			},
-			expectedUpstreamMap: []handlerWrapper{
-				{
+			expectedUpstreamMap: registeredHandlerMap{
+				generateDummyHandler("netbird.io", nameServers).ID(): handlerWrapper{
 					domain:   "netbird.io",
+					handler:  dummyHandler,
 					priority: PriorityUpstream,
 				},
-				{
+				"local-resolver": handlerWrapper{
 					domain:   "netbird.cloud",
+					handler:  dummyHandler,
 					priority: PriorityLocal,
 				},
 			},
@@ -212,7 +232,7 @@ func TestUpdateDNSServer(t *testing.T) {
 		{
 			name:            "Smaller Config Serial Should Be Skipped",
 			initLocalZones:  []nbdns.CustomZone{},
-			initUpstreamMap: nil,
+			initUpstreamMap: make(registeredHandlerMap),
 			initSerial:      2,
 			inputSerial:     1,
 			shouldFail:      true,
@@ -220,7 +240,7 @@ func TestUpdateDNSServer(t *testing.T) {
 		{
 			name:            "Empty NS Group Domain Or Not Primary Element Should Fail",
 			initLocalZones:  []nbdns.CustomZone{},
-			initUpstreamMap: nil,
+			initUpstreamMap: make(registeredHandlerMap),
 			initSerial:      0,
 			inputSerial:     1,
 			inputUpdate: nbdns.Config{
@@ -242,7 +262,7 @@ func TestUpdateDNSServer(t *testing.T) {
 		{
 			name:            "Invalid NS Group Nameservers list Should Fail",
 			initLocalZones:  []nbdns.CustomZone{},
-			initUpstreamMap: nil,
+			initUpstreamMap: make(registeredHandlerMap),
 			initSerial:      0,
 			inputSerial:     1,
 			inputUpdate: nbdns.Config{
@@ -264,7 +284,7 @@ func TestUpdateDNSServer(t *testing.T) {
 		{
 			name:            "Invalid Custom Zone Records list Should Skip",
 			initLocalZones:  []nbdns.CustomZone{},
-			initUpstreamMap: nil,
+			initUpstreamMap: make(registeredHandlerMap),
 			initSerial:      0,
 			inputSerial:     1,
 			inputUpdate: nbdns.Config{
@@ -281,41 +301,42 @@ func TestUpdateDNSServer(t *testing.T) {
 					},
 				},
 			},
-			expectedUpstreamMap: []handlerWrapper{{
+			expectedUpstreamMap: registeredHandlerMap{generateDummyHandler(".", nameServers).ID(): handlerWrapper{
 				domain:   ".",
+				handler:  dummyHandler,
 				priority: PriorityDefault,
 			}},
 		},
 		{
 			name:           "Empty Config Should Succeed and Clean Maps",
 			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
-			initUpstreamMap: []handlerWrapper{
-				{
+			initUpstreamMap: registeredHandlerMap{
+				generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
 					domain:   zoneRecords[0].Name,
-					handler:  &mockHandler{},
+					handler:  dummyHandler,
 					priority: PriorityUpstream,
 				},
 			},
 			initSerial:          0,
 			inputSerial:         1,
 			inputUpdate:         nbdns.Config{ServiceEnable: true},
-			expectedUpstreamMap: nil,
+			expectedUpstreamMap: make(registeredHandlerMap),
 			expectedLocalQs:     []dns.Question{},
 		},
 		{
 			name:           "Disabled Service Should clean map",
 			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
-			initUpstreamMap: []handlerWrapper{
-				{
+			initUpstreamMap: registeredHandlerMap{
+				generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
 					domain:   zoneRecords[0].Name,
-					handler:  &mockHandler{},
+					handler:  dummyHandler,
 					priority: PriorityUpstream,
 				},
 			},
 			initSerial:          0,
 			inputSerial:         1,
 			inputUpdate:         nbdns.Config{ServiceEnable: false},
-			expectedUpstreamMap: nil,
+			expectedUpstreamMap: make(registeredHandlerMap),
 			expectedLocalQs:     []dns.Question{},
 		},
 	}
@@ -372,7 +393,7 @@ func TestUpdateDNSServer(t *testing.T) {
 				}
 			}()

-			dnsServer.dnsMuxHandlers = testCase.initUpstreamMap
+			dnsServer.dnsMuxMap = testCase.initUpstreamMap
 			dnsServer.localResolver.Update(testCase.initLocalZones)
 			dnsServer.updateSerial = testCase.initSerial

@@ -384,20 +405,14 @@ func TestUpdateDNSServer(t *testing.T) {
 				t.Fatalf("update dns server should not fail, got error: %v", err)
 			}

-			if len(dnsServer.dnsMuxHandlers) != len(testCase.expectedUpstreamMap) {
-				t.Fatalf("update upstream failed, map size is different than expected, want %d, got %d", len(testCase.expectedUpstreamMap), len(dnsServer.dnsMuxHandlers))
+			if len(dnsServer.dnsMuxMap) != len(testCase.expectedUpstreamMap) {
+				t.Fatalf("update upstream failed, map size is different than expected, want %d, got %d", len(testCase.expectedUpstreamMap), len(dnsServer.dnsMuxMap))
 			}

-			for _, expected := range testCase.expectedUpstreamMap {
-				found := false
-				for _, got := range dnsServer.dnsMuxHandlers {
-					if got.domain == expected.domain && got.priority == expected.priority {
-						found = true
-						break
-					}
-				}
+			for key := range testCase.expectedUpstreamMap {
+				_, found := dnsServer.dnsMuxMap[key]
 				if !found {
-					t.Fatalf("update upstream failed, handler for domain=%s priority=%d not found in dnsMuxHandlers: %#v", expected.domain, expected.priority, dnsServer.dnsMuxHandlers)
+					t.Fatalf("update upstream failed, key %s was not found in the dnsMuxMap: %#v", key, dnsServer.dnsMuxMap)
 				}
 			}

@@ -497,8 +512,8 @@ func TestDNSFakeResolverHandleUpdates(t *testing.T) {
 		}
 	}()

-	dnsServer.dnsMuxHandlers = []handlerWrapper{
-		{
+	dnsServer.dnsMuxMap = registeredHandlerMap{
+		"id1": handlerWrapper{
 			domain:   zoneRecords[0].Name,
 			handler:  &local.Resolver{},
 			priority: PriorityUpstream,
@@ -1014,15 +1029,15 @@ func (m *mockService) RegisterMux(string, dns.Handler) {}
 func (m *mockService) DeregisterMux(string)            {}

 func TestDefaultServer_UpdateMux(t *testing.T) {
-	baseMatchHandlers := []handlerWrapper{
-		{
+	baseMatchHandlers := registeredHandlerMap{
+		"upstream-group1": {
 			domain: "example.com",
 			handler: &mockHandler{
 				Id: "upstream-group1",
 			},
 			priority: PriorityUpstream,
 		},
-		{
+		"upstream-group2": {
 			domain: "example.com",
 			handler: &mockHandler{
 				Id: "upstream-group2",
@@ -1031,15 +1046,15 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
 		},
 	}

-	baseRootHandlers := []handlerWrapper{
-		{
+	baseRootHandlers := registeredHandlerMap{
+		"upstream-root1": {
 			domain: ".",
 			handler: &mockHandler{
 				Id: "upstream-root1",
 			},
 			priority: PriorityDefault,
 		},
-		{
+		"upstream-root2": {
 			domain: ".",
 			handler: &mockHandler{
 				Id: "upstream-root2",
@@ -1048,22 +1063,22 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
 		},
 	}

-	baseMixedHandlers := []handlerWrapper{
-		{
+	baseMixedHandlers := registeredHandlerMap{
+		"upstream-group1": {
 			domain: "example.com",
 			handler: &mockHandler{
 				Id: "upstream-group1",
 			},
 			priority: PriorityUpstream,
 		},
-		{
+		"upstream-group2": {
 			domain: "example.com",
 			handler: &mockHandler{
 				Id: "upstream-group2",
 			},
 			priority: PriorityUpstream - 1,
 		},
-		{
+		"upstream-other": {
 			domain: "other.com",
 			handler: &mockHandler{
 				Id: "upstream-other",
@@ -1074,7 +1089,7 @@ func TestDefaultServer_UpdateMux(t *testing.T) {

 	tests := []struct {
 		name             string
-		initialHandlers  []handlerWrapper
+		initialHandlers  registeredHandlerMap
 		updates          []handlerWrapper
 		expectedHandlers map[string]string // map[HandlerID]domain
 		description      string
@@ -1358,38 +1373,32 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			server := &DefaultServer{
-				dnsMuxHandlers: tt.initialHandlers,
-				handlerChain:   NewHandlerChain(),
-				service:        &mockService{},
+				dnsMuxMap:    tt.initialHandlers,
+				handlerChain: NewHandlerChain(),
+				service:      &mockService{},
 			}

 			// Perform the update
 			server.updateMux(tt.updates)

 			// Verify the results
-			assert.Equal(t, len(tt.expectedHandlers), len(server.dnsMuxHandlers),
+			assert.Equal(t, len(tt.expectedHandlers), len(server.dnsMuxMap),
 				"Number of handlers after update doesn't match expected")

 			// Check each expected handler
 			for id, expectedDomain := range tt.expectedHandlers {
-				var found *handlerWrapper
-				for i := range server.dnsMuxHandlers {
-					if server.dnsMuxHandlers[i].handler.ID() == types.HandlerID(id) {
-						found = &server.dnsMuxHandlers[i]
-						break
-					}
-				}
-				assert.NotNil(t, found, "Expected handler %s not found", id)
-				if found != nil {
-					assert.Equal(t, expectedDomain, found.domain,
+				handler, exists := server.dnsMuxMap[types.HandlerID(id)]
+				assert.True(t, exists, "Expected handler %s not found", id)
+				if exists {
+					assert.Equal(t, expectedDomain, handler.domain,
 						"Domain mismatch for handler %s", id)
 				}
 			}

 			// Verify no unexpected handlers exist
-			for _, entry := range server.dnsMuxHandlers {
-				_, expected := tt.expectedHandlers[string(entry.handler.ID())]
-				assert.True(t, expected, "Unexpected handler found: %s", entry.handler.ID())
+			for HandlerID := range server.dnsMuxMap {
+				_, expected := tt.expectedHandlers[string(HandlerID)]
+				assert.True(t, expected, "Unexpected handler found: %s", HandlerID)
 			}

 			// Verify the handlerChain state and order
@@ -1404,7 +1413,7 @@ func TestDefaultServer_UpdateMux(t *testing.T) {

 				// Verify handler exists in mux
 				foundInMux := false
-				for _, muxEntry := range server.dnsMuxHandlers {
+				for _, muxEntry := range server.dnsMuxMap {
 					if chainEntry.Handler == muxEntry.handler &&
 						chainEntry.Priority == muxEntry.priority &&
 						chainEntry.Pattern == dns.Fqdn(muxEntry.domain) {
@@ -1413,108 +1422,12 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
 					}
 				}
 				assert.True(t, foundInMux,
-					"Handler in chain not found in dnsMuxHandlers")
+					"Handler in chain not found in dnsMuxMap")
 			}
 		})
 	}
 }

-// chainHasPattern reports whether the handler chain holds an entry registered
-// for the given fqdn pattern at the given priority.
-func chainHasPattern(s *DefaultServer, pattern string, priority int) bool {
-	for _, h := range s.handlerChain.handlers {
-		if h.OrigPattern == pattern && h.Priority == priority {
-			return true
-		}
-	}
-	return false
-}
-
-// TestDefaultServer_UpdateMux_SharedHandlerZoneRemoval verifies that updateMux
-// tracks each (handler, domain) registration independently when one handler
-// serves multiple zones. Every custom zone is served by the same handler
-// instance (the local resolver, whose ID is the constant "local-resolver"), so
-// removing one zone must deregister exactly that zone's chain entry and leave
-// the others in place. Tracking registrations by handler ID alone collapses all
-// zones onto one entry, leaving removed zones in the chain to answer
-// authoritatively with no records.
-func TestDefaultServer_UpdateMux_SharedHandlerZoneRemoval(t *testing.T) {
-	// One handler serves every custom zone, mirroring s.localResolver.
-	shared := &mockHandler{Id: "local-resolver"}
-
-	server := &DefaultServer{
-		handlerChain: NewHandlerChain(),
-		service:      &mockService{},
-	}
-
-	// Two custom zones under the same handler. The surviving zone is registered
-	// last, mirroring the management emission order.
-	server.updateMux([]handlerWrapper{
-		{domain: "userzone.test", handler: shared, priority: PriorityLocal},
-		{domain: "peerzone.test", handler: shared, priority: PriorityLocal},
-	})
-
-	require.True(t, chainHasPattern(server, "userzone.test.", PriorityLocal),
-		"userzone.test should be registered after the first update")
-	require.True(t, chainHasPattern(server, "peerzone.test.", PriorityLocal),
-		"peerzone.test should be registered after the first update")
-
-	// Remove one zone, keep the other.
-	server.updateMux([]handlerWrapper{
-		{domain: "peerzone.test", handler: shared, priority: PriorityLocal},
-	})
-
-	assert.True(t, chainHasPattern(server, "peerzone.test.", PriorityLocal),
-		"peerzone.test should remain after removing userzone.test")
-	assert.False(t, chainHasPattern(server, "userzone.test.", PriorityLocal),
-		"userzone.test handler must be deregistered, not leaked in the chain")
-}
-
-// TestDefaultServer_UpdateMux_PreservesLocalResolver verifies that updateMux
-// does not tear down the shared local resolver during reconfiguration. The
-// resolver is a process-lifetime singleton reused across config updates;
-// Stop() cancels its lookup context (breaking external CNAME-target
-// resolution) and clears its records. updateMux must deregister its chain
-// entries without stopping it. Records surviving a teardown update is the
-// observable proxy: Stop() would have cleared them.
-func TestDefaultServer_UpdateMux_PreservesLocalResolver(t *testing.T) {
-	resolver := local.NewResolver()
-	require.NoError(t, resolver.RegisterRecord(nbdns.SimpleRecord{
-		Name:  "peer.netbird.cloud.",
-		Type:  int(dns.TypeA),
-		Class: nbdns.DefaultClass,
-		TTL:   300,
-		RData: "10.0.0.1",
-	}))
-
-	server := &DefaultServer{
-		handlerChain:  NewHandlerChain(),
-		service:       &mockService{},
-		localResolver: resolver,
-	}
-
-	server.updateMux([]handlerWrapper{
-		{domain: "netbird.cloud", handler: resolver, priority: PriorityLocal},
-	})
-
-	// Remove the zone. The resolver must survive so its records and lookup
-	// context stay intact for the next registration.
-	server.updateMux(nil)
-
-	var response *dns.Msg
-	resolver.ServeDNS(&test.MockResponseWriter{
-		WriteMsgFunc: func(m *dns.Msg) error {
-			response = m
-			return nil
-		},
-	}, &dns.Msg{Question: []dns.Question{{Name: "peer.netbird.cloud.", Qtype: dns.TypeA, Qclass: dns.ClassINET}}})
-
-	require.NotNil(t, response, "local resolver should answer after teardown")
-	assert.Equal(t, dns.RcodeSuccess, response.Rcode,
-		"local resolver records must survive teardown; updateMux must not Stop() the shared resolver")
-	assert.NotEmpty(t, response.Answer, "answer should contain the surviving record")
-}
-
 func TestExtraDomains(t *testing.T) {
 	tests := []struct {
 		name                string
@@ -2136,6 +2049,7 @@ func TestBuildUpstreamHandler_MergesGroupsPerDomain(t *testing.T) {
 		localResolver: local.NewResolver(),
 		handlerChain:  NewHandlerChain(),
 		hostManager:   &noopHostConfigurator{},
+		dnsMuxMap:     make(registeredHandlerMap),
 	}

 	groups := []*nbdns.NameServerGroup{
@@ -2293,7 +2207,7 @@ func TestEvaluateNSGroupHealth(t *testing.T) {
 	}
 }

-// healthStubHandler is a minimal dnsMuxHandlers entry that exposes a fixed
+// healthStubHandler is a minimal dnsMuxMap entry that exposes a fixed
 // UpstreamHealth snapshot, letting tests drive recomputeNSGroupStates
 // without spinning up real handlers.
 type healthStubHandler struct {
@@ -2369,11 +2283,12 @@ func newProjTestFixture(t *testing.T) *projTestFixture {
 		ctx:              context.Background(),
 		wgInterface:      &mocWGIface{},
 		statusRecorder:   recorder,
+		dnsMuxMap:        make(registeredHandlerMap),
 		selectedRoutes:   func() route.HAMap { return fx.selected },
 		activeRoutes:     func() route.HAMap { return fx.active },
 		warningDelayBase: defaultWarningDelayBase,
 	}
-	fx.server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: fx.stub, priority: PriorityUpstream}}
+	fx.server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: fx.stub, priority: PriorityUpstream}

 	fx.server.mux.Lock()
 	fx.server.updateNSGroupStates([]*nbdns.NameServerGroup{fx.group})
@@ -2480,6 +2395,7 @@ func TestProjection_OverlayAddrNoRouteDelaysWarning(t *testing.T) {
 		ctx:              context.Background(),
 		wgInterface:      &mocWGIface{},
 		statusRecorder:   recorder,
+		dnsMuxMap:        make(registeredHandlerMap),
 		selectedRoutes:   func() route.HAMap { return nil },
 		activeRoutes:     func() route.HAMap { return nil },
 		warningDelayBase: 50 * time.Millisecond,
@@ -2491,7 +2407,7 @@ func TestProjection_OverlayAddrNoRouteDelaysWarning(t *testing.T) {
 	stub := &healthStubHandler{health: map[netip.AddrPort]UpstreamHealth{
 		overlayPeer: {LastFail: time.Now(), LastErr: "timeout"},
 	}}
-	server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}
+	server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}

 	server.mux.Lock()
 	server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2528,6 +2444,7 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
 		service:           NewServiceViaMemory(wgIface),
 		hostManager:       &noopHostConfigurator{},
 		extraDomains:      map[domain.Domain]int{},
+		dnsMuxMap:         make(registeredHandlerMap),
 		statusRecorder:    peer.NewRecorder("mgm"),
 		selectedRoutes:    func() route.HAMap { return nil },
 		activeRoutes:      func() route.HAMap { return nil },
@@ -2542,7 +2459,7 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
 		NameServers: []nbdns.NameServer{{IP: srv.Addr(), NSType: nbdns.UDPNameServerType, Port: int(srv.Port())}},
 	}
 	stub := &healthStubHandler{health: map[netip.AddrPort]UpstreamHealth{srv: {LastOk: time.Now()}}}
-	server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}
+	server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}

 	server.mux.Lock()
 	server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2567,32 +2484,6 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
 // rule 3: startup failures while the peer is handshaking, then the peer
 // comes up and a query succeeds before the grace window elapses. No
 // warning should ever have fired, and no recovery either.
-func TestWarningDelayBaseFromEnv(t *testing.T) {
-	tests := []struct {
-		name string
-		set  bool
-		val  string
-		want time.Duration
-	}{
-		{name: "unset uses default", set: false, want: defaultWarningDelayBase},
-		{name: "valid override", set: true, val: "90s", want: 90 * time.Second},
-		{name: "valid minutes", set: true, val: "2m", want: 2 * time.Minute},
-		{name: "invalid falls back", set: true, val: "notaduration", want: defaultWarningDelayBase},
-		{name: "zero falls back", set: true, val: "0s", want: defaultWarningDelayBase},
-		{name: "negative falls back", set: true, val: "-30s", want: defaultWarningDelayBase},
-	}
-
-	for _, tc := range tests {
-		t.Run(tc.name, func(t *testing.T) {
-			t.Setenv(envWarningDelay, tc.val)
-			if !tc.set {
-				os.Unsetenv(envWarningDelay)
-			}
-			assert.Equal(t, tc.want, warningDelayBaseFromEnv(), "grace window base")
-		})
-	}
-}
-
 func TestProjection_OverlayRecoversDuringGrace(t *testing.T) {
 	fx := newProjTestFixture(t)
 	fx.server.warningDelayBase = 200 * time.Millisecond
@@ -2704,6 +2595,7 @@ func TestProjection_MixedGroupEmitsImmediately(t *testing.T) {
 	server := &DefaultServer{
 		ctx:              context.Background(),
 		statusRecorder:   recorder,
+		dnsMuxMap:        make(registeredHandlerMap),
 		selectedRoutes:   func() route.HAMap { return overlayMap },
 		activeRoutes:     func() route.HAMap { return nil },
 		warningDelayBase: time.Hour,
@@ -2721,7 +2613,7 @@ func TestProjection_MixedGroupEmitsImmediately(t *testing.T) {
 			overlay: {LastFail: time.Now(), LastErr: "timeout"},
 		},
 	}
-	server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}
+	server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}

 	server.mux.Lock()
 	server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2748,6 +2640,7 @@ func TestDNSLoopPrevention(t *testing.T) {
 		localResolver: local.NewResolver(),
 		handlerChain:  NewHandlerChain(),
 		hostManager:   &noopHostConfigurator{},
+		dnsMuxMap:     make(registeredHandlerMap),
 	}

 	tests := []struct {
--- a/client/internal/dns/upstream.go
+++ b/client/internal/dns/upstream.go
@@ -443,32 +443,29 @@ func (u *upstreamResolverBase) queryUpstream(parentCtx context.Context, r *dns.M
 		return raceResult{}, &upstreamFailure{upstream: upstream, reason: "no response"}
 	}

-	// A valid response means the upstream is reachable, whatever the Rcode.
-	u.markUpstreamOk(upstream)
-
 	proto := ""
 	if upstreamProto != nil {
 		proto = upstreamProto.protocol
 	}

 	if rm.Rcode == dns.RcodeServerFailure || rm.Rcode == dns.RcodeRefused {
-		// SERVFAIL and REFUSED are per-question outcomes (DNSSEC-bogus names,
-		// refused zones, transient recursion errors), not reachability
-		// problems: fail over for a better answer but keep the upstream healthy.
 		if code, ok := nonRetryableEDE(rm); ok {
 			if !hadEdns {
-				resutil.StripOPT(rm)
+				stripOPT(rm)
 			}
+			u.markUpstreamOk(upstream)
 			return raceResult{msg: rm, upstream: upstream, protocol: proto, ede: edeName(code)}, nil
 		}
 		reason := dns.RcodeToString[rm.Rcode]
+		u.markUpstreamFail(upstream, reason)
 		return raceResult{}, &upstreamFailure{upstream: upstream, reason: reason}
 	}

 	if !hadEdns {
-		resutil.StripOPT(rm)
+		stripOPT(rm)
 	}

+	u.markUpstreamOk(upstream)
 	return raceResult{msg: rm, upstream: upstream, protocol: proto}, nil
 }

@@ -523,6 +520,22 @@ func upstreamUDPSize() uint16 {
 	return dns.MinMsgSize
 }

+// stripOPT removes any OPT pseudo-RRs from the response's Extra section so
+// the response complies with RFC 6891 when the client did not advertise EDNS0.
+func stripOPT(rm *dns.Msg) {
+	if len(rm.Extra) == 0 {
+		return
+	}
+	out := rm.Extra[:0]
+	for _, rr := range rm.Extra {
+		if _, ok := rr.(*dns.OPT); ok {
+			continue
+		}
+		out = append(out, rr)
+	}
+	rm.Extra = out
+}
+
 func (u *upstreamResolverBase) handleUpstreamError(err error, upstream netip.AddrPort, startTime time.Time) *upstreamFailure {
 	if !errors.Is(err, context.DeadlineExceeded) && !isTimeout(err) {
 		return &upstreamFailure{upstream: upstream, reason: err.Error()}
--- a/client/internal/dns/upstream_test.go
+++ b/client/internal/dns/upstream_test.go
@@ -517,78 +517,6 @@ func TestUpstreamResolver_HealthTracking(t *testing.T) {
 	assert.NotContains(t, health, bad, "sibling upstream should not be queried when primary answers")
 }

-// TestUpstreamResolver_HealthTracking_ResponseMeansReachable verifies that an
-// upstream which answers with SERVFAIL or REFUSED is recorded as healthy:
-// those are per-question outcomes from a reachable server and must not mark
-// the upstream unhealthy. Only transport failures (timeouts) do.
-func TestUpstreamResolver_HealthTracking_ResponseMeansReachable(t *testing.T) {
-	a := netip.MustParseAddrPort("192.0.2.10:53")
-	b := netip.MustParseAddrPort("192.0.2.11:53")
-	timeoutErr := &net.OpError{Op: "read", Err: fmt.Errorf("i/o timeout")}
-
-	tests := []struct {
-		name        string
-		respA       mockUpstreamResponse
-		respB       mockUpstreamResponse
-		wantHealthy bool
-	}{
-		{
-			name:        "both SERVFAIL are reachable",
-			respA:       mockUpstreamResponse{msg: buildMockResponse(dns.RcodeServerFailure, "")},
-			respB:       mockUpstreamResponse{msg: buildMockResponse(dns.RcodeServerFailure, "")},
-			wantHealthy: true,
-		},
-		{
-			name:        "both REFUSED are reachable",
-			respA:       mockUpstreamResponse{msg: buildMockResponse(dns.RcodeRefused, "")},
-			respB:       mockUpstreamResponse{msg: buildMockResponse(dns.RcodeRefused, "")},
-			wantHealthy: true,
-		},
-		{
-			name:        "timeout marks unhealthy",
-			respA:       mockUpstreamResponse{err: timeoutErr},
-			respB:       mockUpstreamResponse{err: timeoutErr},
-			wantHealthy: false,
-		},
-	}
-
-	for _, tc := range tests {
-		t.Run(tc.name, func(t *testing.T) {
-			mockClient := &mockUpstreamResolverPerServer{
-				responses: map[string]mockUpstreamResponse{
-					a.String(): tc.respA,
-					b.String(): tc.respB,
-				},
-				rtt: time.Millisecond,
-			}
-
-			ctx, cancel := context.WithCancel(context.Background())
-			defer cancel()
-
-			resolver := &upstreamResolverBase{
-				ctx:             ctx,
-				upstreamClient:  mockClient,
-				upstreamTimeout: UpstreamTimeout,
-			}
-			resolver.addRace([]netip.AddrPort{a, b})
-
-			responseWriter := &test.MockResponseWriter{WriteMsgFunc: func(m *dns.Msg) error { return nil }}
-			resolver.ServeDNS(responseWriter, new(dns.Msg).SetQuestion("example.com.", dns.TypeA))
-
-			health := resolver.UpstreamHealth()
-			require.Contains(t, health, a, "primary upstream should have a health record")
-			if tc.wantHealthy {
-				assert.False(t, health[a].LastOk.IsZero(), "responding upstream should have LastOk set")
-				assert.True(t, health[a].LastFail.IsZero(), "responding upstream should not be marked failed")
-				assert.Empty(t, health[a].LastErr, "responding upstream should have no error")
-			} else {
-				assert.False(t, health[a].LastFail.IsZero(), "timed-out upstream should be marked failed")
-				assert.NotEmpty(t, health[a].LastErr, "timed-out upstream should record an error")
-			}
-		})
-	}
-}
-
 func TestFormatFailures(t *testing.T) {
 	testCases := []struct {
 		name     string
@@ -985,6 +913,19 @@ func TestEDEName(t *testing.T) {
 	assert.Equal(t, "EDE 9999", edeName(9999), "unknown code falls back to numeric")
 }

+func TestStripOPT(t *testing.T) {
+	rm := &dns.Msg{
+		Extra: []dns.RR{
+			&dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}},
+			&dns.A{Hdr: dns.RR_Header{Name: "x.", Rrtype: dns.TypeA}, A: net.IPv4(1, 2, 3, 4)},
+		},
+	}
+	stripOPT(rm)
+	assert.Len(t, rm.Extra, 1, "OPT should be removed, A kept")
+	_, isOPT := rm.Extra[0].(*dns.OPT)
+	assert.False(t, isOPT, "remaining record must not be OPT")
+}
+
 func TestUpstreamResolver_NonRetryableEDEShortCircuits(t *testing.T) {
 	upstream1 := netip.MustParseAddrPort("192.0.2.1:53")
 	upstream2 := netip.MustParseAddrPort("192.0.2.2:53")
--- a/client/internal/dnsfwd/forwarder.go
+++ b/client/internal/dnsfwd/forwarder.go
@@ -26,15 +26,6 @@ import (
 const errResolveFailed = "failed to resolve query for domain=%s: %v"
 const upstreamTimeout = 15 * time.Second

-// EDE info codes the forwarder emits on upstream failures so the querying
-// client can see the reason without inspecting this peer's logs. They live in
-// the RFC 8914 Private Use range (49152-65535); the Go resolver never exposes a
-// real upstream EDE here, so these cannot collide with a genuine code.
-const (
-	edeNetbirdUpstreamTimeout uint16 = 49152
-	edeNetbirdUpstreamFailure uint16 = 49153
-)
-
 type resolver interface {
 	LookupNetIP(ctx context.Context, network, host string) ([]netip.Addr, error)
 }
@@ -229,7 +220,7 @@ func (f *DNSForwarder) handleDNSQuery(logger *log.Entry, w dns.ResponseWriter, q

 	result := resutil.LookupIP(ctx, f.resolver, network, qname, question.Qtype)
 	if result.Err != nil {
-		f.handleDNSError(ctx, logger, w, question, resp, qname, result, query.IsEdns0() != nil, startTime)
+		f.handleDNSError(ctx, logger, w, question, resp, qname, result, startTime)
 		return
 	}

@@ -342,7 +333,6 @@ func (f *DNSForwarder) handleDNSError(
 	resp *dns.Msg,
 	domain string,
 	result resutil.LookupResult,
-	reqHasEdns bool,
 	startTime time.Time,
 ) {
 	qType := question.Qtype
@@ -384,10 +374,6 @@ func (f *DNSForwarder) handleDNSError(
 		logger.Warnf(errResolveFailed, domain, result.Err)
 	}

-	if reqHasEdns {
-		attachEDE(resp, edeCodeFor(dnsErr), edeText(dnsErr))
-	}
-
 	f.writeResponse(logger, w, resp, domain, startTime)
 }

@@ -428,33 +414,3 @@ func (f *DNSForwarder) getMatchingEntries(domain string) (route.ResID, []*Forwar

 	return selectedResId, matches
 }
-
-// edeCodeFor maps an upstream lookup error to the NetBird EDE info code.
-func edeCodeFor(dnsErr *net.DNSError) uint16 {
-	if dnsErr != nil && dnsErr.IsTimeout {
-		return edeNetbirdUpstreamTimeout
-	}
-	return edeNetbirdUpstreamFailure
-}
-
-// edeText builds the EDE extra-text describing the class of upstream failure.
-// It deliberately omits the upstream server address, which may be an internal
-// resolver and is exposed to any client permitted to use the route; the full
-// detail stays in the forwarder's local log.
-func edeText(dnsErr *net.DNSError) string {
-	if dnsErr != nil && dnsErr.IsTimeout {
-		return "netbird forwarder: upstream timeout"
-	}
-	return "netbird forwarder: upstream failure"
-}
-
-// attachEDE adds an Extended DNS Error (RFC 8914) option to the response,
-// creating the OPT pseudo-record if the response does not already carry one.
-func attachEDE(resp *dns.Msg, code uint16, text string) {
-	opt := resp.IsEdns0()
-	if opt == nil {
-		resp.SetEdns0(dns.DefaultMsgSize, false)
-		opt = resp.IsEdns0()
-	}
-	opt.Option = append(opt.Option, &dns.EDNS0_EDE{InfoCode: code, ExtraText: text})
-}
--- a/client/internal/dnsfwd/forwarder_test.go
+++ b/client/internal/dnsfwd/forwarder_test.go
@@ -16,7 +16,6 @@ import (
 	"github.com/stretchr/testify/require"

 	firewall "github.com/netbirdio/netbird/client/firewall/manager"
-	"github.com/netbirdio/netbird/client/internal/dns/resutil"
 	"github.com/netbirdio/netbird/client/internal/dns/test"
 	"github.com/netbirdio/netbird/client/internal/peer"
 	"github.com/netbirdio/netbird/route"
@@ -618,85 +617,6 @@ func TestDNSForwarder_ResponseCodes(t *testing.T) {
 	}
 }

-func TestDNSForwarder_UpstreamFailureEDE(t *testing.T) {
-	tests := []struct {
-		name        string
-		lookupErr   error
-		reqEdns     bool
-		wantEDE     bool
-		wantCode    uint16
-		wantTextHas string
-	}{
-		{
-			name:        "timeout with edns0",
-			lookupErr:   &net.DNSError{Err: "i/o timeout", Server: "10.0.0.53:53", IsTimeout: true},
-			reqEdns:     true,
-			wantEDE:     true,
-			wantCode:    edeNetbirdUpstreamTimeout,
-			wantTextHas: "netbird forwarder: upstream timeout",
-		},
-		{
-			name:        "server failure with edns0",
-			lookupErr:   &net.DNSError{Err: "server misbehaving", Server: "10.0.0.53:53"},
-			reqEdns:     true,
-			wantEDE:     true,
-			wantCode:    edeNetbirdUpstreamFailure,
-			wantTextHas: "netbird forwarder: upstream failure",
-		},
-		{
-			name:      "no edns0 in request omits ede",
-			lookupErr: &net.DNSError{Err: "server misbehaving", Server: "10.0.0.53:53"},
-			reqEdns:   false,
-			wantEDE:   false,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			mockResolver := &MockResolver{}
-			forwarder := NewDNSForwarder(netip.MustParseAddrPort("127.0.0.1:0"), 300, nil, &peer.Status{}, nil)
-			forwarder.resolver = mockResolver
-
-			d, err := domain.FromString("example.com")
-			require.NoError(t, err)
-			forwarder.UpdateDomains([]*ForwarderEntry{{Domain: d, ResID: "test-res"}})
-
-			mockResolver.On("LookupNetIP", mock.Anything, "ip4", "example.com.").
-				Return([]netip.Addr(nil), tt.lookupErr).Once()
-
-			query := &dns.Msg{}
-			query.SetQuestion("example.com.", dns.TypeA)
-			if tt.reqEdns {
-				query.SetEdns0(dns.DefaultMsgSize, false)
-			}
-
-			var writtenResp *dns.Msg
-			mockWriter := &test.MockResponseWriter{
-				WriteMsgFunc: func(m *dns.Msg) error {
-					writtenResp = m
-					return nil
-				},
-			}
-
-			forwarder.handleDNSQuery(log.NewEntry(log.StandardLogger()), mockWriter, query, time.Now())
-			mockResolver.AssertExpectations(t)
-
-			require.NotNil(t, writtenResp, "expected a response")
-			assert.Equal(t, dns.RcodeServerFailure, writtenResp.Rcode, "upstream failure must be SERVFAIL")
-
-			ede, ok := resutil.ExtractEDE(writtenResp)
-			if !tt.wantEDE {
-				assert.False(t, ok, "response must not carry EDE")
-				return
-			}
-			require.True(t, ok, "response must carry EDE")
-			assert.Equal(t, tt.wantCode, ede.InfoCode, "EDE info code")
-			assert.Contains(t, ede.ExtraText, tt.wantTextHas, "EDE extra-text")
-			assert.NotContains(t, ede.ExtraText, "10.0.0.53", "must not leak upstream server address")
-		})
-	}
-}
-
 func TestDNSForwarder_TCPTruncation(t *testing.T) {
 	// Test that large UDP responses are truncated with TC bit set
 	mockResolver := &MockResolver{}
--- a/client/internal/engine.go
+++ b/client/internal/engine.go
@@ -22,6 +22,8 @@ import (
 	log "github.com/sirupsen/logrus"
 	"golang.zx2c4.com/wireguard/tun/netstack"
 	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+	"google.golang.org/grpc/codes"
+	gstatus "google.golang.org/grpc/status"

 	nberrors "github.com/netbirdio/netbird/client/errors"
 	"github.com/netbirdio/netbird/client/firewall"
@@ -86,8 +88,6 @@ const (

 var ErrResetConnection = fmt.Errorf("reset connection")

-var ErrEngineAlreadyStarted = errors.New("engine already started")
-
 type EngineConfig struct {
 	WgPort      int
 	WgIfaceName string
@@ -201,8 +201,6 @@ type Engine struct {
 	ctx    context.Context
 	cancel context.CancelFunc

-	started bool
-
 	wgInterface WGIface

 	udpMux *udpmux.UniversalUDPMuxDefault
@@ -283,15 +281,9 @@ func NewEngine(
 	services EngineServices,
 	mobileDep MobileDependency,
 ) *Engine {
-	// The engine is single-use: a fresh instance is built per connection
-	// cycle (see Client.run), so the run context is created once here rather
-	// than in Start.
-	ctx, cancel := context.WithCancel(clientCtx)
 	engine := &Engine{
 		clientCtx:          clientCtx,
 		clientCancel:       clientCancel,
-		ctx:                ctx,
-		cancel:             cancel,
 		signal:             services.SignalClient,
 		signaler:           peer.NewSignaler(services.SignalClient, config.WgPrivateKey),
 		mgmClient:          services.MgmClient,
@@ -324,34 +316,8 @@ func (e *Engine) Stop() error {
 		log.Debugf("tried stopping engine that is nil")
 		return nil
 	}
-	e.cancel()
 	e.syncMsgMux.Lock()

-	e.stopLocked()
-
-	e.syncMsgMux.Unlock()
-
-	timeout := e.calculateShutdownTimeout()
-	log.Debugf("waiting for goroutines to finish with timeout: %v", timeout)
-	shutdownCtx, cancel := context.WithTimeout(context.Background(), timeout)
-	defer cancel()
-
-	if err := waitWithContext(shutdownCtx, &e.shutdownWg); err != nil {
-		log.Warnf("shutdown timeout exceeded after %v, some goroutines may still be running", timeout)
-	}
-
-	log.Infof("stopped Netbird Engine")
-
-	return nil
-}
-
-// stopLocked tears down everything Start may have brought up, in the order
-// teardown requires (DNS before the interface goes down, flow manager after).
-// The caller must hold syncMsgMux. It is shared by Stop and by Start's failure
-// path, so a partially-initialized engine is cleaned up the same way; every
-// step is nil-guarded. It does not wait on shutdownWg — the caller does that
-// after releasing the lock, since the goroutines also take syncMsgMux.
-func (e *Engine) stopLocked() {
 	if e.connMgr != nil {
 		e.connMgr.Close()
 	}
@@ -402,6 +368,10 @@ func (e *Engine) stopLocked() {
 	// so dbus and friends don't complain because of a missing interface
 	e.stopDNSServer()

+	if e.cancel != nil {
+		e.cancel()
+	}
+
 	e.jobExecutorWG.Wait() // block until job goroutines finish

 	e.close()
@@ -420,6 +390,21 @@ func (e *Engine) stopLocked() {
 	if err := e.stateManager.PersistState(context.Background()); err != nil {
 		log.Errorf("failed to persist state: %v", err)
 	}
+
+	e.syncMsgMux.Unlock()
+
+	timeout := e.calculateShutdownTimeout()
+	log.Debugf("waiting for goroutines to finish with timeout: %v", timeout)
+	shutdownCtx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	if err := waitWithContext(shutdownCtx, &e.shutdownWg); err != nil {
+		log.Warnf("shutdown timeout exceeded after %v, some goroutines may still be running", timeout)
+	}
+
+	log.Infof("stopped Netbird Engine")
+
+	return nil
 }

 // calculateShutdownTimeout returns shutdown timeout: 10s base + 100ms per peer, capped at 30s.
@@ -457,38 +442,18 @@ func waitWithContext(ctx context.Context, wg *sync.WaitGroup) error {
 // Start creates a new WireGuard tunnel interface and listens to events from Signal and Management services
 // Connections to remote peers are not established here.
 // However, they will be established once an event with a list of peers to connect to will be received from Management Service
-func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL) (err error) {
+func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL) error {
 	e.syncMsgMux.Lock()
 	defer e.syncMsgMux.Unlock()

-	// The engine is single-use. Reject a duplicate start and a start on an
-	// already-stopped engine (run context cancelled).
-	if e.started {
-		return ErrEngineAlreadyStarted
-	}
-
-	if ctxErr := e.ctx.Err(); ctxErr != nil {
-		return fmt.Errorf("engine already stopped: %w", ctxErr)
-	}
-
-	e.started = true
-
-	// Tear down any partially-initialized state on a failed start. Cancel the
-	// run context first so goroutines started before the failure (connMgr,
-	// srWatcher, monitors) unwind, then stopLocked mirrors Stop's teardown (we
-	// already hold syncMsgMux), cleaning up route/DNS/flow/state managers too,
-	// not just what close() covers.
-	defer func() {
-		if err != nil {
-			e.cancel()
-			e.stopLocked()
-		}
-	}()
-
-	if err = iface.ValidateMTU(e.config.MTU); err != nil {
+	if err := iface.ValidateMTU(e.config.MTU); err != nil {
 		return fmt.Errorf("invalid MTU configuration: %w", err)
 	}

+	if e.cancel != nil {
+		e.cancel()
+	}
+	e.ctx, e.cancel = context.WithCancel(e.clientCtx)
 	e.exposeManager = expose.NewManager(e.ctx, e.mgmClient)

 	wgIface, err := e.newWgIface()
@@ -522,11 +487,13 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)

 	initialRoutes, dnsConfig, dnsFeatureFlag, err := e.readInitialSettings()
 	if err != nil {
+		e.close()
 		return fmt.Errorf("read initial settings: %w", err)
 	}

 	dnsServer, err := e.newDnsServer(dnsConfig)
 	if err != nil {
+		e.close()
 		return fmt.Errorf("create dns server: %w", err)
 	}
 	e.dnsServer = dnsServer
@@ -561,6 +528,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)

 	if err = e.wgInterfaceCreate(); err != nil {
 		log.Errorf("failed creating tunnel interface %s: [%s]", e.config.WgIfaceName, err.Error())
+		e.close()
 		return fmt.Errorf("create wg interface: %w", err)
 	}

@@ -569,6 +537,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
 	}

 	if err := e.createFirewall(); err != nil {
+		e.close()
 		return err
 	}

@@ -580,6 +549,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
 	e.udpMux, err = e.wgInterface.Up()
 	if err != nil {
 		log.Errorf("failed to pull up wgInterface [%s]: %s", e.wgInterface.Name(), err.Error())
+		e.close()
 		return fmt.Errorf("up wg interface: %w", err)
 	}

@@ -604,7 +574,9 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
 		e.acl = acl.NewDefaultManager(e.firewall)
 	}

-	if err := e.dnsServer.Initialize(); err != nil {
+	err = e.dnsServer.Initialize()
+	if err != nil {
+		e.close()
 		return fmt.Errorf("initialize dns server: %w", err)
 	}

@@ -616,9 +588,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
 	e.srWatcher = guard.NewSRWatcher(e.signal, e.relayManager, e.mobileDep.IFaceDiscover, iceCfg)
 	e.srWatcher.Start(peer.IsForceRelayed())

-	if err = e.receiveSignalEvents(); err != nil {
-		return err
-	}
+	e.receiveSignalEvents()
 	e.receiveManagementEvents()
 	e.receiveJobEvents()

@@ -670,6 +640,7 @@ func (e *Engine) createFirewall() error {

 func (e *Engine) initFirewall() error {
 	if err := e.routeManager.SetFirewall(e.firewall); err != nil {
+		e.close()
 		return fmt.Errorf("set firewall: %w", err)
 	}

@@ -1156,6 +1127,20 @@ func (e *Engine) hasIPv6Changed(conf *mgmProto.PeerConfig) bool {
 	return !current.HasIPv6() || current.IPv6 != prefix.Addr() || current.IPv6Net != prefix.Masked()
 }

+// wrapDisconnectError classifies a receive-loop failure before the run is torn
+// down. An auth rejection (PermissionDenied/Unauthenticated) means the session
+// needs re-login and retrying is futile, so mark it terminal (NeedsLogin) — run()
+// then exits on its own instead of spinning the backoff. Any other failure is a
+// recoverable connection reset that the backoff should retry.
+func (e *Engine) wrapDisconnectError(err error) {
+	state := CtxGetState(e.ctx)
+	if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied || s.Code() == codes.Unauthenticated) {
+		state.Set(StatusNeedsLogin)
+		return
+	}
+	_ = state.Wrap(ErrResetConnection)
+}
+
 func (e *Engine) receiveJobEvents() {
 	e.jobExecutorWG.Add(1)
 	go func() {
@@ -1182,9 +1167,9 @@ func (e *Engine) receiveJobEvents() {
 			}
 		})
 		if err != nil {
-			// happens if management is unavailable for a long time.
-			// We want to cancel the operation of the whole client
-			_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
+			// happens if management is unavailable for a long time, or rejects
+			// us (auth). wrapDisconnectError decides retry vs needs-login.
+			e.wrapDisconnectError(err)
 			e.clientCancel()
 			return
 		}
@@ -1266,9 +1251,9 @@ func (e *Engine) receiveManagementEvents() {

 		err = e.mgmClient.Sync(e.ctx, info, e.handleSync)
 		if err != nil {
-			// happens if management is unavailable for a long time.
-			// We want to cancel the operation of the whole client
-			_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
+			// happens if management is unavailable for a long time, or rejects
+			// us (auth). wrapDisconnectError decides retry vs needs-login.
+			e.wrapDisconnectError(err)
 			e.clientCancel()
 			return
 		}
@@ -1729,7 +1714,7 @@ func (e *Engine) createPeerConn(pubKey string, allowedIPs []netip.Prefix, agentV
 }

 // receiveSignalEvents connects to the Signal Service event stream to negotiate connection with remote peers
-func (e *Engine) receiveSignalEvents() error {
+func (e *Engine) receiveSignalEvents() {
 	e.shutdownWg.Add(1)
 	go func() {
 		defer e.shutdownWg.Done()
@@ -1792,20 +1777,15 @@ func (e *Engine) receiveSignalEvents() error {
 			return nil
 		})
 		if err != nil {
-			// happens if signal is unavailable for a long time.
-			// We want to cancel the operation of the whole client
-			_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
+			// happens if signal is unavailable for a long time, or rejects us
+			// (auth). wrapDisconnectError decides retry vs needs-login.
+			e.wrapDisconnectError(err)
 			e.clientCancel()
 			return
 		}
 	}()

-	// todo: consider to remove this blocker. I do not see benefit to block the Start operations
-	e.signal.WaitStreamConnected(e.ctx)
-	if err := e.ctx.Err(); err != nil {
-		return fmt.Errorf("wait for signal stream: %w", err)
-	}
-	return nil
+	e.signal.WaitStreamConnected()
 }

 func (e *Engine) parseNATExternalIPMappings() []string {
--- a/client/internal/engine_test.go
+++ b/client/internal/engine_test.go
@@ -247,7 +247,7 @@ func TestEngine_SSH(t *testing.T) {
 		return
 	}

-	ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
+	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()

 	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
@@ -426,7 +426,7 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {
 		return
 	}

-	ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
+	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()

 	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
@@ -638,7 +638,7 @@ func TestEngine_Sync(t *testing.T) {
 		return
 	}

-	ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
+	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()

 	// feed updates to Engine via mocked Management client
@@ -817,7 +817,7 @@ func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
 				return
 			}

-			ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
+			ctx, cancel := context.WithCancel(context.Background())
 			defer cancel()

 			wgIfaceName := fmt.Sprintf("utun%d", 104+n)
@@ -1024,7 +1024,7 @@ func TestEngine_UpdateNetworkMapWithDNSUpdate(t *testing.T) {
 				return
 			}

-			ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
+			ctx, cancel := context.WithCancel(context.Background())
 			defer cancel()

 			wgIfaceName := fmt.Sprintf("utun%d", 104+n)
--- a/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-sync-phases.json
+++ b/client/internal/metrics/infra/grafana/provisioning/dashboards/json/netbird-sync-phases.json
@@ -1,259 +0,0 @@
-{
-  "annotations": {
-    "list": []
-  },
-  "editable": true,
-  "fiscalYearStartMonth": 0,
-  "graphTooltip": 1,
-  "links": [],
-  "refresh": "",
-  "schemaVersion": 39,
-  "tags": [
-    "netbird",
-    "sync"
-  ],
-  "templating": {
-    "list": [
-      {
-        "current": {
-          "text": "All",
-          "value": "$__all"
-        },
-        "datasource": {
-          "type": "influxdb",
-          "uid": "influxdb"
-        },
-        "definition": "import \"influxdata/influxdb/schema\"\nschema.tagValues(bucket: \"metrics\", tag: \"version\")",
-        "includeAll": true,
-        "label": "version",
-        "multi": true,
-        "name": "version",
-        "query": "import \"influxdata/influxdb/schema\"\nschema.tagValues(bucket: \"metrics\", tag: \"version\")",
-        "refresh": 2,
-        "type": "query",
-        "allValue": ".*"
-      }
-    ]
-  },
-  "time": {
-    "from": "now-2d",
-    "to": "now"
-  },
-  "timepicker": {},
-  "timezone": "",
-  "title": "NetBird Sync Phases (where time goes)",
-  "uid": "netbird-sync-phases",
-  "version": 1,
-  "panels": [
-    {
-      "id": 1,
-      "title": "Time per phase over time (stacked, ms)",
-      "type": "timeseries",
-      "datasource": {
-        "type": "influxdb",
-        "uid": "influxdb"
-      },
-      "gridPos": {
-        "h": 10,
-        "w": 24,
-        "x": 0,
-        "y": 0
-      },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "ms",
-          "custom": {
-            "drawStyle": "bars",
-            "stacking": {
-              "mode": "normal",
-              "group": "A"
-            },
-            "fillOpacity": 80,
-            "lineWidth": 0
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "table",
-          "placement": "right",
-          "calcs": [
-            "max",
-            "mean"
-          ]
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "desc"
-        }
-      },
-      "targets": [
-        {
-          "refId": "A",
-          "datasource": {
-            "type": "influxdb",
-            "uid": "influxdb"
-          },
-          "query": "from(bucket: \"metrics\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n  |> filter(fn: (r) => r.version =~ /${version:regex}/)\n  |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n  |> keep(columns: [\"_time\", \"_value\", \"phase\"])\n  |> group(columns: [\"phase\"])"
-        }
-      ]
-    },
-    {
-      "id": 2,
-      "title": "p95 per phase (ms)",
-      "type": "bargauge",
-      "datasource": {
-        "type": "influxdb",
-        "uid": "influxdb"
-      },
-      "gridPos": {
-        "h": 11,
-        "w": 12,
-        "x": 0,
-        "y": 10
-      },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "ms",
-          "color": {
-            "mode": "continuous-GrYlRd"
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "displayMode": "gradient",
-        "orientation": "horizontal",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showUnfilled": true
-      },
-      "targets": [
-        {
-          "refId": "A",
-          "datasource": {
-            "type": "influxdb",
-            "uid": "influxdb"
-          },
-          "query": "from(bucket: \"metrics\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n  |> filter(fn: (r) => r.version =~ /${version:regex}/)\n  |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n  |> group(columns: [\"phase\"])\n  |> quantile(q: 0.95)\n  |> group()\n  |> sort(columns: [\"_value\"], desc: true)"
-        }
-      ]
-    },
-    {
-      "id": 3,
-      "title": "Per-phase stats (ms): mean / p95 / max",
-      "type": "table",
-      "datasource": {
-        "type": "influxdb",
-        "uid": "influxdb"
-      },
-      "gridPos": {
-        "h": 11,
-        "w": 12,
-        "x": 12,
-        "y": 10
-      },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "ms"
-        },
-        "overrides": []
-      },
-      "options": {
-        "showHeader": true,
-        "sortBy": [
-          {
-            "displayName": "max",
-            "desc": true
-          }
-        ]
-      },
-      "transformations": [
-        {
-          "id": "merge",
-          "options": {}
-        }
-      ],
-      "targets": [
-        {
-          "refId": "mean",
-          "datasource": {
-            "type": "influxdb",
-            "uid": "influxdb"
-          },
-          "query": "from(bucket: \"metrics\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n  |> filter(fn: (r) => r.version =~ /${version:regex}/)\n  |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n  |> group(columns: [\"phase\"])\n  |> mean()\n  |> group()\n  |> keep(columns: [\"phase\", \"_value\"])\n  |> rename(columns: {_value: \"mean\"})"
-        },
-        {
-          "refId": "p95",
-          "datasource": {
-            "type": "influxdb",
-            "uid": "influxdb"
-          },
-          "query": "from(bucket: \"metrics\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n  |> filter(fn: (r) => r.version =~ /${version:regex}/)\n  |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n  |> group(columns: [\"phase\"])\n  |> quantile(q: 0.95)\n  |> group()\n  |> keep(columns: [\"phase\", \"_value\"])\n  |> rename(columns: {_value: \"p95\"})"
-        },
-        {
-          "refId": "max",
-          "datasource": {
-            "type": "influxdb",
-            "uid": "influxdb"
-          },
-          "query": "from(bucket: \"metrics\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"netbird_sync_phase\" and r._field == \"duration_seconds\")\n  |> filter(fn: (r) => r.version =~ /${version:regex}/)\n  |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n  |> group(columns: [\"phase\"])\n  |> max()\n  |> group()\n  |> keep(columns: [\"phase\", \"_value\"])\n  |> rename(columns: {_value: \"max\"})"
-        }
-      ]
-    },
-    {
-      "id": 4,
-      "title": "Total sync duration (netbird_sync, ms) \u2014 reference",
-      "type": "timeseries",
-      "datasource": {
-        "type": "influxdb",
-        "uid": "influxdb"
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 24,
-        "x": 0,
-        "y": 21
-      },
-      "fieldConfig": {
-        "defaults": {
-          "unit": "ms",
-          "custom": {
-            "drawStyle": "points",
-            "pointSize": 5
-          }
-        },
-        "overrides": []
-      },
-      "options": {
-        "legend": {
-          "displayMode": "table",
-          "placement": "right",
-          "calcs": [
-            "max",
-            "mean"
-          ]
-        },
-        "tooltip": {
-          "mode": "single"
-        }
-      },
-      "targets": [
-        {
-          "refId": "A",
-          "datasource": {
-            "type": "influxdb",
-            "uid": "influxdb"
-          },
-          "query": "from(bucket: \"metrics\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"netbird_sync\" and r._field == \"duration_seconds\")\n  |> filter(fn: (r) => r.version =~ /${version:regex}/)\n  |> map(fn: (r) => ({ r with _value: r._value * 1000.0 }))\n  |> keep(columns: [\"_time\", \"_value\", \"version\"])\n  |> group(columns: [\"version\"])"
-        }
-      ]
-    }
-  ]
-}
--- a/client/internal/metrics/infra/ingest/main.go
+++ b/client/internal/metrics/infra/ingest/main.go
@@ -59,19 +59,6 @@ var allowedMeasurements = map[string]measurementSpec{
 			"peer_id":         true,
 		},
 	},
-	"netbird_sync_phase": {
-		allowedFields: map[string]bool{
-			"duration_seconds": true,
-		},
-		allowedTags: map[string]bool{
-			"deployment_type": true,
-			"version":         true,
-			"os":              true,
-			"arch":            true,
-			"peer_id":         true,
-			"phase":           true,
-		},
-	},
 	"netbird_login": {
 		allowedFields: map[string]bool{
 			"duration_seconds": true,
--- a/client/internal/routemanager/dnsinterceptor/handler.go
+++ b/client/internal/routemanager/dnsinterceptor/handler.go
@@ -251,14 +251,6 @@ func (d *DnsInterceptor) ServeDNS(w dns.ResponseWriter, r *dns.Msg) {
 		r.MsgHdr.AuthenticatedData = true
 	}

-	// Advertise EDNS0 to the forwarder so it may return an Extended DNS Error
-	// describing why a lookup failed. The OPT is stripped from the reply when
-	// the original client did not request EDNS0.
-	hadEdns := r.IsEdns0() != nil
-	if !hadEdns {
-		r.SetEdns0(dns.DefaultMsgSize, false)
-	}
-
 	upstream := net.JoinHostPort(upstreamIP.String(), strconv.FormatUint(uint64(d.forwarderPort.Load()), 10))
 	ctx, cancel := context.WithTimeout(context.Background(), dnsTimeout)
 	defer cancel()
@@ -268,13 +260,6 @@ func (d *DnsInterceptor) ServeDNS(w dns.ResponseWriter, r *dns.Msg) {
 		return
 	}

-	if ede, ok := resutil.ExtractEDE(reply); ok {
-		resutil.SetMeta(w, "ede", fmt.Sprintf("%d %s", ede.InfoCode, ede.ExtraText))
-	}
-	if !hadEdns {
-		resutil.StripOPT(reply)
-	}
-
 	resutil.SetMeta(w, "peer", peerKey)

 	reply.Id = r.Id
--- a/client/ios/NetBirdSDK/client.go
+++ b/client/ios/NetBirdSDK/client.go
@@ -171,13 +171,13 @@ func (c *Client) Run(fd int32, interfaceName string, envList *EnvList) error {
 	c.onHostDnsFn = func([]string) {}
 	cfg.WgIface = interfaceName

-	connectClient := internal.NewConnectClient(ctx, cfg, c.recorder)
+	connectClient := internal.NewConnectClient(ctx, c.recorder)
 	c.setState(cfg, connectClient)
 	// Persist the latest sync response so DebugBundle can include the network
 	// map. On iOS this is backed by disk to keep it out of the constrained
 	// process memory (see the syncstore package).
 	connectClient.SetSyncResponsePersistence(true)
-	return connectClient.RunOniOS(fd, c.networkChangeListener, c.dnsManager, c.stateFile, c.cacheDir, c.logFilePath)
+	return connectClient.RunOniOS(cfg, fd, c.networkChangeListener, c.dnsManager, c.stateFile, c.cacheDir, c.logFilePath)
 }

 // Stop the internal client and free the resources
--- a/client/ios/NetBirdSDK/login.go
+++ b/client/ios/NetBirdSDK/login.go
@@ -36,7 +36,6 @@ type URLOpener interface {
 // Auth can register or login new client
 type Auth struct {
 	ctx     context.Context
-	cancel  context.CancelFunc
 	config  *profilemanager.Config
 	cfgPath string
 }
@@ -52,19 +51,8 @@ func NewAuth(cfgPath string, mgmURL string) (*Auth, error) {
 		return nil, err
 	}

-	// Use a cancellable context so Stop() can abort an in-progress interactive
-	// login. The PKCE flow's WaitToken blocks (and keeps its loopback HTTP server
-	// bound to a port) until the OAuth callback arrives or the flow expires;
-	// cancelling the context unblocks WaitToken, which then shuts that server down
-	// and frees the port for the next login attempt. iOS runs login in the main-app
-	// process (decoupled from the network extension), so without this the server
-	// lingers after the user dismisses the browser and the next connect stalls
-	// trying to bind the same port.
-	ctx, cancel := context.WithCancel(context.Background())
-
 	return &Auth{
-		ctx:     ctx,
-		cancel:  cancel,
+		ctx:     context.Background(),
 		config:  cfg,
 		cfgPath: cfgPath,
 	}, nil
@@ -72,24 +60,12 @@ func NewAuth(cfgPath string, mgmURL string) (*Auth, error) {

 // NewAuthWithConfig instantiate Auth based on existing config
 func NewAuthWithConfig(ctx context.Context, config *profilemanager.Config) *Auth {
-	ctx, cancel := context.WithCancel(ctx)
 	return &Auth{
 		ctx:    ctx,
-		cancel: cancel,
 		config: config,
 	}
 }

-// Stop aborts an in-progress interactive login started via Login/LoginWithDeviceName.
-// It cancels the auth context, which unblocks the PKCE WaitToken and shuts down its
-// loopback HTTP server, freeing the redirect port. Safe to call multiple times and
-// safe to call when no login is running.
-func (a *Auth) Stop() {
-	if a.cancel != nil {
-		a.cancel()
-	}
-}
-
 // SaveConfigIfSSOSupported test the connectivity with the management server by retrieving the server device flow info.
 // If it returns a flow info than save the configuration and return true. If it gets a codes.NotFound, it means that SSO
 // is not supported and returns false without saving the configuration. For other errors return false.
--- a/client/server/capture.go
+++ b/client/server/capture.go
@@ -344,9 +344,6 @@ func (s *Server) clearCaptureIfOwner(sess *capture.Session, engine *internal.Eng
 }

 func (s *Server) getCaptureEngineLocked() (*internal.Engine, error) {
-	if s.connectClient == nil {
-		return nil, status.Error(codes.FailedPrecondition, "client not connected")
-	}
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, status.Error(codes.FailedPrecondition, "engine not initialized")
--- a/client/server/debug.go
+++ b/client/server/debug.go
@@ -5,7 +5,6 @@ package server
 import (
 	"bytes"
 	"context"
-	"errors"
 	"fmt"
 	"runtime/pprof"

@@ -28,11 +27,9 @@ func (s *Server) DebugBundle(_ context.Context, req *proto.DebugBundleRequest) (
 	}

 	var clientMetrics debug.MetricsExporter
-	if s.connectClient != nil {
-		if engine := s.connectClient.Engine(); engine != nil {
-			if cm := engine.GetClientMetrics(); cm != nil {
-				clientMetrics = cm
-			}
+	if engine := s.connectClient.Engine(); engine != nil {
+		if cm := engine.GetClientMetrics(); cm != nil {
+			clientMetrics = cm
 		}
 	}

@@ -48,13 +45,10 @@ func (s *Server) DebugBundle(_ context.Context, req *proto.DebugBundleRequest) (
 	defer s.cleanupBundleCapture()

 	var refreshStatus func()
-	if s.connectClient != nil {
-		engine := s.connectClient.Engine()
-		if engine != nil {
-			refreshStatus = func() {
-				log.Debug("refreshing system health status for debug bundle")
-				engine.RunHealthProbes(true)
-			}
+	if engine := s.connectClient.Engine(); engine != nil {
+		refreshStatus = func() {
+			log.Debug("refreshing system health status for debug bundle")
+			engine.RunHealthProbes(true)
 		}
 	}

@@ -118,9 +112,7 @@ func (s *Server) SetLogLevel(_ context.Context, req *proto.SetLogLevelRequest) (

 	log.SetLevel(level)

-	if s.connectClient != nil {
-		s.connectClient.SetLogLevel(level)
-	}
+	s.connectClient.SetLogLevel(level)

 	log.Infof("Log level set to %s", level.String())

@@ -134,20 +126,13 @@ func (s *Server) SetSyncResponsePersistence(_ context.Context, req *proto.SetSyn

 	enabled := req.GetEnabled()
 	s.persistSyncResponse = enabled
-	if s.connectClient != nil {
-		s.connectClient.SetSyncResponsePersistence(enabled)
-	}
+	s.connectClient.SetSyncResponsePersistence(enabled)

 	return &proto.SetSyncResponsePersistenceResponse{}, nil
 }

 func (s *Server) getLatestSyncResponse() (*mgmProto.SyncResponse, error) {
-	cClient := s.connectClient
-	if cClient == nil {
-		return nil, errors.New("connect client is not initialized")
-	}
-
-	return cClient.GetLatestSyncResponse()
+	return s.connectClient.GetLatestSyncResponse()
 }

 // StartCPUProfile starts CPU profiling in the daemon.
--- a/client/server/mdm.go
+++ b/client/server/mdm.go
@@ -3,7 +3,6 @@ package server
 import (
 	"context"
 	"fmt"
-	"time"

 	log "github.com/sirupsen/logrus"
 	"google.golang.org/grpc/codes"
@@ -39,12 +38,11 @@ type conflictCheck struct {
 // OS-native managed-config store reports a diff vs the last observation.
 //
 // Restart sequence:
-//  1. Cancel the active engine context (terminates connectWithRetryRuns).
-//  2. Wait briefly for that goroutine to exit (giveUpChan is closed on exit).
-//  3. Re-resolve Config from disk + MDM policy (Config.apply re-runs
+//  1. Stop the in-flight run via the supervisor (blocks until fully torn down).
+//  2. Re-resolve Config from disk + MDM policy (Config.apply re-runs
 //     applyMDMPolicy with the freshly loaded Policy).
-//  4. Spawn a fresh connectWithRetryRuns with the new context and config.
-//  5. Broadcast a SystemEvent so any GUI / CLI subscriber (SubscribeEvents
+//  3. Start a fresh run with the new config.
+//  4. Broadcast a SystemEvent so any GUI / CLI subscriber (SubscribeEvents
 //     RPC) can refresh its cached config view without polling.
 //
 // The callback runs in the ticker's own goroutine. Ticker has already
@@ -52,39 +50,24 @@ type conflictCheck struct {
 func (s *Server) onMDMPolicyChange(_, _ *mdm.Policy) error {
 	log.Warn("MDM policy changed; restarting engine to apply new configuration")

-	// Hold s.mutex for the entire restart sequence (cancel + quiescence
-	// wait + re-spawn). Any concurrent Up/Down/Status arriving while
-	// MDM is restarting blocks on the Lock until we are done — they
-	// then observe the post-restart state coherently. This is safe
-	// because the connectWithRetryRuns goroutine no longer acquires
-	// s.mutex in its defer (intent vs. goroutine-alive concerns are
-	// fully separated; see the connectionGoroutineRunning helper).
+	// Hold s.mutex for the entire restart sequence (stop + re-start). Any
+	// concurrent Up/Down/Status arriving while MDM is restarting blocks on the
+	// Lock until we are done — they then observe the post-restart state coherently.
 	s.mutex.Lock()
 	defer s.mutex.Unlock()

-	if !s.clientRunning {
-		// The client is not running, so there's no engine to restart.
+	if !s.connectClient.ConnectionRunning() {
+		// No run in flight, so there's no engine to restart.
 		return nil
 	}
+
+	// Cancel daemon-side login/status activities tied to the old run; the run
+	// itself is torn down atomically by the supervisor inside Restart (see
+	// restartEngineForMDMLocked), which stops and re-starts in one operation.
 	if s.actCancel != nil {
 		s.actCancel()
 	}

-	// Wait for previous connectWithRetryRuns to exit so we don't end up
-	// with two goroutines fighting over the same status recorder + engine.
-	// The teardown engages a fan-out of engine goroutines (peer workers,
-	// signal handler, route manager, ...). close(clientGiveUpChan)
-	// happens in the function-scope defer of connectWithRetryRuns, on
-	// every exit path (ctx cancel, backoff exhausted, panic) — see the
-	// defer in server.go.
-	if s.clientGiveUpChan != nil {
-		select {
-		case <-s.clientGiveUpChan:
-		case <-time.After(10 * time.Second):
-			return fmt.Errorf("failed to restart the engine due to timeout")
-		}
-	}
-
 	if err := s.restartEngineForMDMLocked(); err != nil {
 		log.Errorf("MDM restart failed: %v", err)
 		return err
@@ -131,14 +114,13 @@ func (s *Server) publishConfigChangedEvent(source string) {
 }

 // restartEngineForMDMLocked re-resolves the active profile config
-// (re-running applyMDMPolicy via Config.apply) and re-spawns
-// connectWithRetryRuns. Mirrors the tail of Server.Start so a runtime
-// MDM change behaves identically to a fresh boot under the new policy.
+// (re-running applyMDMPolicy via Config.apply) and starts a fresh run.
+// Mirrors the tail of Server.Start so a runtime MDM change behaves
+// identically to a fresh boot under the new policy.
 //
 // MUST be called with s.mutex held — onMDMPolicyChange holds the lock
-// for the entire restart sequence (cancel + quiescence wait + re-spawn)
-// so concurrent Up/Down/Status RPCs observe a coherent post-restart
-// state.
+// for the entire restart sequence so concurrent Up/Down/Status RPCs
+// observe a coherent post-restart state.
 func (s *Server) restartEngineForMDMLocked() error {
 	activeProf, err := s.profileManager.GetActiveProfileState()
 	if err != nil {
@@ -154,13 +136,13 @@ func (s *Server) restartEngineForMDMLocked() error {
 	s.statusRecorder.UpdateRosenpass(config.RosenpassEnabled, config.RosenpassPermissive)
 	s.statusRecorder.UpdateLazyConnection(config.LazyConnectionEnabled)

-	ctx, cancel := context.WithCancel(s.rootCtx)
+	_, cancel := context.WithCancel(s.rootCtx)
 	s.actCancel = cancel
-	s.clientRunning = true
-	s.clientRunningChan = make(chan struct{})
-	s.clientGiveUpChan = make(chan struct{})
-	log.Info("MDM restart: spawning connectWithRetryRuns with re-resolved config")
-	go s.connectWithRetryRuns(ctx, config, s.statusRecorder, s.clientRunningChan, s.clientGiveUpChan)
+	log.Info("MDM restart: atomically restarting the run with re-resolved config")
+	// MDM restart has no incoming RPC metadata; fire and forget. Restart is a
+	// single supervisor op (atomic stop+start), so there is no observable
+	// "stopped" window between tearing down the old run and starting the new.
+	s.connectClient.Restart(config, nil)
 	s.publishConfigChangedEvent("mdm")
 	return nil
 }
--- a/client/server/network.go
+++ b/client/server/network.go
@@ -34,10 +34,6 @@ func (s *Server) ListNetworks(context.Context, *proto.ListNetworksRequest) (*pro
 		return nil, gstatus.Errorf(codes.Unavailable, errNetworksDisabled)
 	}

-	if s.connectClient == nil {
-		return nil, fmt.Errorf("not connected")
-	}
-
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, fmt.Errorf("not connected")
@@ -147,10 +143,6 @@ func (s *Server) SelectNetworks(_ context.Context, req *proto.SelectNetworksRequ
 		return nil, gstatus.Errorf(codes.Unavailable, errNetworksDisabled)
 	}

-	if s.connectClient == nil {
-		return nil, fmt.Errorf("not connected")
-	}
-
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, fmt.Errorf("not connected")
@@ -199,10 +191,6 @@ func (s *Server) DeselectNetworks(_ context.Context, req *proto.SelectNetworksRe
 		return nil, gstatus.Errorf(codes.Unavailable, errNetworksDisabled)
 	}

-	if s.connectClient == nil {
-		return nil, fmt.Errorf("not connected")
-	}
-
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, fmt.Errorf("not connected")
--- a/client/server/server.go
+++ b/client/server/server.go
@@ -8,12 +8,10 @@ import (
 	"os"
 	"os/exec"
 	"runtime"
-	"strconv"
 	"sync"
 	"sync/atomic"
 	"time"

-	"github.com/cenkalti/backoff/v4"
 	log "github.com/sirupsen/logrus"
 	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
 	"google.golang.org/grpc/codes"
@@ -39,15 +37,7 @@ import (
 )

 const (
-	probeThreshold          = time.Second * 5
-	retryInitialIntervalVar = "NB_CONN_RETRY_INTERVAL_TIME"
-	maxRetryIntervalVar     = "NB_CONN_MAX_RETRY_INTERVAL_TIME"
-	maxRetryTimeVar         = "NB_CONN_MAX_RETRY_TIME_TIME"
-	retryMultiplierVar      = "NB_CONN_RETRY_MULTIPLIER"
-	defaultInitialRetryTime = 30 * time.Minute
-	defaultMaxRetryInterval = 60 * time.Minute
-	defaultMaxRetryTime     = 14 * 24 * time.Hour
-	defaultRetryMultiplier  = 1.7
+	probeThreshold = time.Second * 5

 	// JWT token cache TTL for the client daemon (disabled by default)
 	defaultJWTCacheTTL = 0
@@ -72,15 +62,8 @@ type Server struct {
 	mutex  sync.Mutex
 	config *profilemanager.Config
 	proto.UnimplementedDaemonServiceServer
-	// clientRunning tracks "the daemon wants to be connected" — set true by
-	// Start / Up, cleared by Down / Logout. Persists across retry
-	// loops, signal disconnects, and ErrResetConnection cycles. NOT
-	// changed by connectWithRetryRuns goroutine exit — for that
-	// (goroutine-still-alive) check, see connectionGoroutineRunning() which
-	// derives from clientGiveUpChan close state. Protected by s.mutex.
-	clientRunning     bool
-	clientRunningChan chan struct{}
-	clientGiveUpChan  chan struct{} // closed when connectWithRetryRuns goroutine exits
+	// Run state (in-flight? established/done channels?) is owned entirely by the
+	// supervisor inside connectClient — the daemon keeps no per-run fields.

 	connectClient *internal.ConnectClient

@@ -136,6 +119,13 @@ func New(ctx context.Context, logFile string, configFile string, profilesDisable
 		networksDisabled:       networksDisabled,
 		jwtCache:               newJWTCache(),
 	}
+	// The ConnectClient is daemon-lifetime: build it exactly once, here. Its
+	// supervisor lives as long as the daemon; Up/Down/MDM and reconnects all
+	// drive this same instance. updateManager isn't ready yet (created in
+	// Start) and is injected there via SetUpdateManager.
+	s.connectClient = internal.NewConnectClient(ctx, s.statusRecorder)
+	s.connectClient.SetSyncResponsePersistence(s.persistSyncResponse)
+
 	agent := &serverAgent{s}
 	s.sleepHandler = sleephandler.New(agent)
 	s.startSleepDetector()
@@ -147,7 +137,7 @@ func (s *Server) Start() error {
 	s.mutex.Lock()
 	defer s.mutex.Unlock()

-	if s.clientRunning {
+	if s.connectClient.ConnectionRunning() {
 		return nil
 	}

@@ -165,6 +155,7 @@ func (s *Server) Start() error {
 		stateMgr := statemanager.New(s.profileManager.GetStatePath())
 		s.updateManager = updater.NewManager(s.statusRecorder, stateMgr)
 		s.updateManager.CheckUpdateSuccess(s.rootCtx)
+		s.connectClient.SetUpdateManager(s.updateManager)
 	}

 	// MDM policy reload ticker: every minute the desktop daemon re-reads
@@ -190,7 +181,9 @@ func (s *Server) Start() error {
 		return nil
 	}

-	ctx, cancel := context.WithCancel(s.rootCtx)
+	// actCancel cancels in-flight foreground operations (login/status); the run
+	// itself is owned by the supervisor and stopped via Stop, not this cancel.
+	_, cancel := context.WithCancel(s.rootCtx)
 	s.actCancel = cancel

 	// copy old default config
@@ -232,99 +225,14 @@ func (s *Server) Start() error {
 		return nil
 	}

-	s.clientRunning = true
-	s.clientRunningChan = make(chan struct{})
-	s.clientGiveUpChan = make(chan struct{})
-	go s.connectWithRetryRuns(ctx, config, s.statusRecorder, s.clientRunningChan, s.clientGiveUpChan)
+	// Boot autoconnect: no incoming RPC metadata. The supervisor runs the
+	// client and reconnects internally; we just fire and forget (the run owns
+	// its established/done channels).
+	s.connectClient.RunAsync(config, nil)
 	s.publishConfigChangedEvent("startup")
 	return nil
 }

-// connectWithRetryRuns runs the client connection with a backoff strategy where we retry the operation as additional
-// mechanism to keep the client connected even when the connection is lost.
-// we cancel retry if the client receive a stop or down command, or if disable auto connect is configured.
-//
-// The goroutine's exit is signalled to the daemon via close(giveUpChan)
-// — placed in the function-scope defer so every return path (panic,
-// DisableAutoConnect early-exit, backoff exhausted, ctx cancel) closes
-// it. Callers that need to observe "is the goroutine still alive?" use
-// Server.connectionGoroutineRunning() which non-blockingly checks the close state
-// of clientGiveUpChan. The defer does NOT touch s.mutex; the daemon's
-// "intent" (clientRunning) is maintained by the RPC handlers, not by this
-// goroutine.
-func (s *Server) connectWithRetryRuns(ctx context.Context, profileConfig *profilemanager.Config, statusRecorder *peer.Status, runningChan chan struct{}, giveUpChan chan struct{}) {
-	defer func() {
-		if giveUpChan != nil {
-			close(giveUpChan)
-		}
-	}()
-
-	if s.config.DisableAutoConnect {
-		if err := s.connect(ctx, s.config, s.statusRecorder, runningChan); err != nil {
-			log.Debugf("run client connection exited with error: %v", err)
-		}
-		log.Tracef("client connection exited")
-		return
-	}
-
-	backOff := getConnectWithBackoff(ctx)
-	go func() {
-		t := time.NewTicker(24 * time.Hour)
-		for {
-			select {
-			case <-ctx.Done():
-				t.Stop()
-				return
-			case <-t.C:
-				mgmtState := statusRecorder.GetManagementState()
-				signalState := statusRecorder.GetSignalState()
-				if mgmtState.Connected && signalState.Connected {
-					log.Tracef("resetting status")
-					backOff.Reset()
-				} else {
-					log.Tracef("not resetting status: mgmt: %v, signal: %v", mgmtState.Connected, signalState.Connected)
-				}
-			}
-		}
-	}()
-
-	runOperation := func() error {
-		err := s.connect(ctx, profileConfig, statusRecorder, runningChan)
-		if err != nil {
-			log.Debugf("run client connection exited with error: %v. Will retry in the background", err)
-			return err
-		}
-
-		log.Tracef("client connection exited gracefully, do not need to retry")
-		return nil
-	}
-
-	if err := backoff.Retry(runOperation, backOff); err != nil {
-		log.Errorf("operation failed: %v", err)
-	}
-	// giveUpChan is closed by the function-scope defer.
-}
-
-// connectionGoroutineRunning reports whether the connectWithRetryRuns goroutine is
-// still running. Returns false when no goroutine has ever been started
-// AND when the most recent one has already closed clientGiveUpChan on
-// exit (whether due to ctx cancel, DisableAutoConnect single-shot
-// completion, or backoff retry exhaustion).
-//
-// MUST be called with s.mutex held — accesses s.clientGiveUpChan which
-// is written by Start/Up under the same lock.
-func (s *Server) connectionGoroutineRunning() bool {
-	if s.clientGiveUpChan == nil {
-		return false
-	}
-	select {
-	case <-s.clientGiveUpChan:
-		return false
-	default:
-		return true
-	}
-}
-
 // loginAttempt attempts to login using the provided information. it returns a status in case something fails
 func (s *Server) loginAttempt(ctx context.Context, setupKey, jwtToken string) (internal.StatusType, error) {
 	authClient, err := auth.NewAuth(ctx, s.config.PrivateKey, s.config.ManagementURL, s.config)
@@ -720,13 +628,22 @@ func (s *Server) WaitSSOLogin(callerCtx context.Context, msg *proto.WaitSSOLogin
 // Up starts engine work in the daemon.
 func (s *Server) Up(callerCtx context.Context, msg *proto.UpRequest) (*proto.UpResponse, error) {
 	s.mutex.Lock()
-	// clientRunning is the daemon-intent flag (set by previous Up/Start, cleared
-	// by Down). connectionGoroutineRunning() reports whether the previous retry-loop
-	// goroutine is still trying. When intent is up AND goroutine is alive,
-	// the existing engine is on the job — just wait for it. When intent
-	// is up but the goroutine has given up (backoff exhausted) OR when
-	// intent is down, fall through to spawn a fresh retry loop.
-	if s.clientRunning && s.connectionGoroutineRunning() {
+
+	// The client (and its supervisor) is built once in New(), so a nil here
+	// never happens in production — Up is only reachable after New() has run and
+	// the gRPC server is serving. The real case this guards is the daemon
+	// SHUTTING DOWN: rootCtx is cancelled, the supervisor is no longer accepting
+	// commands, so ServiceRunning() is false even though the client exists. Bail
+	// loud instead of enqueuing a run that will never start. (nil only happens in
+	// tests that build a Server without New(); ServiceRunning is nil-safe.)
+	if !s.connectClient.ServiceRunning() {
+		s.mutex.Unlock()
+		return nil, fmt.Errorf("service is not running, start the netbird service for 'up' to take effect")
+	}
+
+	// If a connection run is already in flight, the existing engine is on the
+	// job — just wait for it. Otherwise fall through to start a fresh run.
+	if s.connectClient.ConnectionRunning() {
 		state := internal.CtxGetState(s.rootCtx)
 		status, err := state.Status()
 		if err != nil {
@@ -764,14 +681,14 @@ func (s *Server) Up(callerCtx context.Context, msg *proto.UpRequest) (*proto.UpR
 	if s.actCancel != nil {
 		s.actCancel()
 	}
-	ctx, cancel := context.WithCancel(s.rootCtx)
-	md, ok := metadata.FromIncomingContext(callerCtx)
-	if ok {
-		ctx = metadata.NewOutgoingContext(ctx, md)
-	}
-
+	// actCancel cancels in-flight foreground ops (login/status); the run is
+	// owned by the supervisor and stopped via Stop, not this cancel.
+	_, cancel := context.WithCancel(s.rootCtx)
 	s.actCancel = cancel

+	// Forward the caller's gRPC metadata (e.g. UI user-agent) into the run.
+	md, _ := metadata.FromIncomingContext(callerCtx)
+
 	if s.config == nil {
 		s.mutex.Unlock()
 		return nil, fmt.Errorf("config is not defined, please call login command first")
@@ -812,35 +729,26 @@ func (s *Server) Up(callerCtx context.Context, msg *proto.UpRequest) (*proto.UpR
 	s.statusRecorder.UpdateManagementAddress(s.config.ManagementURL.String())
 	s.statusRecorder.UpdateRosenpass(s.config.RosenpassEnabled, s.config.RosenpassPermissive)

-	s.clientRunning = true
-	s.clientRunningChan = make(chan struct{})
-	s.clientGiveUpChan = make(chan struct{})
-
-	go s.connectWithRetryRuns(ctx, s.config, s.statusRecorder, s.clientRunningChan, s.clientGiveUpChan)
+	s.connectClient.RunAsync(s.config, md)
 	s.publishConfigChangedEvent("up_rpc")

 	s.mutex.Unlock()
 	return s.waitForUp(callerCtx)
 }

-// todo: handle potential race conditions
+// waitForUp blocks until the in-flight run becomes established (success) or ends
+// before that (failure). The wait is owned by the supervisor (via the client) —
+// the daemon holds no per-run state here.
 func (s *Server) waitForUp(callerCtx context.Context) (*proto.UpResponse, error) {
 	timeoutCtx, cancel := context.WithTimeout(callerCtx, 50*time.Second)
 	defer cancel()

-	select {
-	case <-s.clientGiveUpChan:
-		return nil, fmt.Errorf("client gave up to connect")
-	case <-s.clientRunningChan:
-		s.isSessionActive.Store(true)
-		return &proto.UpResponse{}, nil
-	case <-callerCtx.Done():
-		log.Debug("context done, stopping the wait for engine to become ready")
-		return nil, callerCtx.Err()
-	case <-timeoutCtx.Done():
-		log.Debug("up is timed out, stopping the wait for engine to become ready")
-		return nil, timeoutCtx.Err()
+	if err := s.connectClient.WaitEstablishedOrDone(timeoutCtx); err != nil {
+		log.Debugf("waiting for the connection to be established failed: %v", err)
+		return nil, fmt.Errorf("connection not established: %w", err)
 	}
+	s.isSessionActive.Store(true)
+	return &proto.UpResponse{}, nil
 }

 // resolveProfileHandle resolves a wire-level profile handle (display
@@ -935,11 +843,11 @@ func (s *Server) SwitchProfile(callerCtx context.Context, msg *proto.SwitchProfi
 // Down engine work in the daemon.
 func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownResponse, error) {
 	s.mutex.Lock()
+	defer s.mutex.Unlock()

-	giveUpChan := s.clientGiveUpChan
-
+	// cleanupConnection stops the run through the supervisor, which blocks until
+	// the run has fully unwound — no separate goroutine-quiescence wait needed.
 	if err := s.cleanupConnection(); err != nil {
-		s.mutex.Unlock()
 		// todo review to update the status in case any type of error
 		log.Errorf("failed to shut down properly: %v", err)
 		return nil, err
@@ -948,20 +856,6 @@ func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownRes
 	state := internal.CtxGetState(s.rootCtx)
 	state.Set(internal.StatusIdle)

-	s.mutex.Unlock()
-
-	// Wait for the connectWithRetryRuns goroutine to finish with a short timeout.
-	// This prevents the goroutine from setting ErrResetConnection after Down() returns.
-	// The giveUpChan is closed at the end of connectWithRetryRuns.
-	if giveUpChan != nil {
-		select {
-		case <-giveUpChan:
-			log.Debugf("client goroutine finished successfully")
-		case <-time.After(5 * time.Second):
-			log.Warnf("timeout waiting for client goroutine to finish, proceeding anyway")
-		}
-	}
-
 	return &proto.DownResponse{}, nil
 }

@@ -972,38 +866,19 @@ func (s *Server) cleanupConnection() error {
 		return ErrServiceNotUp
 	}

-	// Daemon intent flips to "down" — all callers (Down RPC,
-	// Logout RPC handlers) tear down the connection because the user
-	// explicitly asked for it. MDM restart does NOT go through this
-	// path, so its clientRunning stays true.
-	s.clientRunning = false
-
-	// Capture the engine reference before cancelling the context.
-	// After actCancel(), the connectWithRetryRuns goroutine wakes up
-	// and sets connectClient.engine = nil, causing connectClient.Stop()
-	// to skip the engine shutdown entirely.
-	var engine *internal.Engine
-	if s.connectClient != nil {
-		engine = s.connectClient.Engine()
+	// Tear the client down through the lifecycle supervisor BEFORE cancelling
+	// the retry context. Stop serializes on the supervisor queue and blocks
+	// until the in-flight run has fully unwound (a clean, synchronous teardown).
+	// It must run before actCancel: cancelling the context first would make
+	// Stop observe a dead context and return early without waiting.
+	if err := s.connectClient.Stop(); err != nil {
+		return err
 	}

+	// Stop the retry goroutine so it does not start a fresh run. The client
+	// itself is daemon-lifetime and intentionally kept (a later Up reuses it).
 	s.actCancel()

-	if s.connectClient == nil {
-		return nil
-	}
-
-	// TODO: consider calling s.connectClient.Stop() instead of engine.Stop().
-	// actCancel() lets the run loop stop the engine too, so both stop it
-	// concurrently; ConnectClient.Stop cancels and waits for the run loop,
-	// making the run loop the sole owner of engine shutdown.
-	if engine != nil {
-		if err := engine.Stop(); err != nil {
-			return err
-		}
-	}
-
-	s.connectClient = nil
 	s.isSessionActive.Store(false)

 	log.Infof("service is down")
@@ -1138,7 +1013,7 @@ func (s *Server) validateProfileOperation(id profilemanager.ID, allowActiveProfi

 func (s *Server) logoutFromProfile(ctx context.Context, profile *profilemanager.Profile) error {
 	activeProf, err := s.profileManager.GetActiveProfileState()
-	if err == nil && activeProf.ID == profile.ID && s.connectClient != nil {
+	if err == nil && activeProf.ID == profile.ID && s.connectClient.ConnectionRunning() {
 		return s.sendLogoutRequest(ctx)
 	}

@@ -1184,48 +1059,13 @@ func (s *Server) Status(
 	ctx context.Context,
 	msg *proto.StatusRequest,
 ) (*proto.StatusResponse, error) {
-	s.mutex.Lock()
-	// Only wait if the retry-loop goroutine is alive and making
-	// progress. clientRunning=true with connectionGoroutineRunning=false means the
-	// backoff has given up — there is nothing to wait for; let the
-	// caller observe the failed status directly.
-	alive := s.connectionGoroutineRunning()
-	s.mutex.Unlock()
-
-	if msg.WaitForReady != nil && *msg.WaitForReady && alive {
-		state := internal.CtxGetState(s.rootCtx)
-		status, err := state.Status()
-		if err != nil {
-			return nil, err
-		}
-
-		if status != internal.StatusIdle && status != internal.StatusConnected && status != internal.StatusConnecting {
-			s.actCancel()
-		}
-
-		ticker := time.NewTicker(1 * time.Second)
-		defer ticker.Stop()
-	loop:
-		for {
-			select {
-			case <-s.clientGiveUpChan:
-				ticker.Stop()
-				break loop
-			case <-s.clientRunningChan:
-				ticker.Stop()
-				break loop
-			case <-ticker.C:
-				status, err := state.Status()
-				if err != nil {
-					continue
-				}
-				if status != internal.StatusIdle && status != internal.StatusConnected && status != internal.StatusConnecting {
-					s.actCancel()
-				}
-				continue
-			case <-ctx.Done():
-				return nil, ctx.Err()
-			}
+	// A run that hits a terminal auth failure now exits on its own (engine marks
+	// NeedsLogin), so we no longer poll-and-cancel: we wait for the in-flight run
+	// to become established or to end. With no run in flight this returns
+	// immediately (errNoRunInFlight); either way we then report the status below.
+	if msg.WaitForReady != nil && *msg.WaitForReady {
+		if err := s.connectClient.WaitEstablishedOrDone(ctx); err != nil && ctx.Err() != nil {
+			return nil, ctx.Err()
 		}
 	}

@@ -1263,10 +1103,6 @@ func (s *Server) getSSHServerState() *proto.SSHServerState {
 	connectClient := s.connectClient
 	s.mutex.Unlock()

-	if connectClient == nil {
-		return nil
-	}
-
 	engine := connectClient.Engine()
 	if engine == nil {
 		return nil
@@ -1304,10 +1140,6 @@ func (s *Server) GetPeerSSHHostKey(
 	statusRecorder := s.statusRecorder
 	s.mutex.Unlock()

-	if connectClient == nil {
-		return nil, errors.New("client not initialized")
-	}
-
 	engine := connectClient.Engine()
 	if engine == nil {
 		return nil, errors.New("engine not started")
@@ -1474,17 +1306,13 @@ func (s *Server) WaitJWTToken(
 // ExposeService exposes a local port via the NetBird reverse proxy.
 func (s *Server) ExposeService(req *proto.ExposeServiceRequest, srv proto.DaemonService_ExposeServiceServer) error {
 	s.mutex.Lock()
-	if !s.clientRunning {
+	if !s.connectClient.ConnectionRunning() {
 		s.mutex.Unlock()
 		return gstatus.Errorf(codes.FailedPrecondition, "client is not running, run 'netbird up' first")
 	}
 	connectClient := s.connectClient
 	s.mutex.Unlock()

-	if connectClient == nil {
-		return gstatus.Errorf(codes.FailedPrecondition, "client not initialized")
-	}
-
 	engine := connectClient.Engine()
 	if engine == nil {
 		return gstatus.Errorf(codes.FailedPrecondition, "engine not initialized")
@@ -1538,10 +1366,6 @@ func isUnixRunningDesktop() bool {
 }

 func (s *Server) runProbes(waitForProbeResult bool) {
-	if s.connectClient == nil {
-		return
-	}
-
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return
@@ -1820,22 +1644,6 @@ func (s *Server) GetFeatures(ctx context.Context, msg *proto.GetFeaturesRequest)
 	return features, nil
 }

-func (s *Server) connect(ctx context.Context, config *profilemanager.Config, statusRecorder *peer.Status, runningChan chan struct{}) error {
-	log.Tracef("running client connection")
-	client := internal.NewConnectClient(ctx, config, statusRecorder)
-	client.SetUpdateManager(s.updateManager)
-	client.SetSyncResponsePersistence(s.persistSyncResponse)
-
-	s.mutex.Lock()
-	s.connectClient = client
-	s.mutex.Unlock()
-
-	if err := client.Run(runningChan, s.logFile); err != nil {
-		return err
-	}
-	return nil
-}
-
 // MDM authority: when the platform-native MDM source sets a kill switch
 // key (regardless of true/false value), that value wins. The CLI flag
 // supplied at service install time is the fallback used only when the
@@ -1897,45 +1705,6 @@ func (s *Server) onSessionExpire() {
 	}
 }

-// getConnectWithBackoff returns a backoff with exponential backoff strategy for connection retries
-func getConnectWithBackoff(ctx context.Context) backoff.BackOff {
-	initialInterval := parseEnvDuration(retryInitialIntervalVar, defaultInitialRetryTime)
-	maxInterval := parseEnvDuration(maxRetryIntervalVar, defaultMaxRetryInterval)
-	maxElapsedTime := parseEnvDuration(maxRetryTimeVar, defaultMaxRetryTime)
-	multiplier := defaultRetryMultiplier
-
-	if envValue := os.Getenv(retryMultiplierVar); envValue != "" {
-		// parse the multiplier from the environment variable string value to float64
-		value, err := strconv.ParseFloat(envValue, 64)
-		if err != nil {
-			log.Warnf("unable to parse environment variable %s: %s. using default: %f", retryMultiplierVar, envValue, multiplier)
-		} else {
-			multiplier = value
-		}
-	}
-
-	return backoff.WithContext(&backoff.ExponentialBackOff{
-		InitialInterval:     initialInterval,
-		RandomizationFactor: 1,
-		Multiplier:          multiplier,
-		MaxInterval:         maxInterval,
-		MaxElapsedTime:      maxElapsedTime, // 14 days
-		Stop:                backoff.Stop,
-		Clock:               backoff.SystemClock,
-	}, ctx)
-}
-
-// parseEnvDuration parses the environment variable and returns the duration
-func parseEnvDuration(envVar string, defaultDuration time.Duration) time.Duration {
-	if envValue := os.Getenv(envVar); envValue != "" {
-		if duration, err := time.ParseDuration(envValue); err == nil {
-			return duration
-		}
-		log.Warnf("unable to parse environment variable %s: %s. using default: %s", envVar, envValue, defaultDuration)
-	}
-	return defaultDuration
-}
-
 // sendTerminalNotification sends a terminal notification message
 // to inform the user that the NetBird connection session has expired.
 func sendTerminalNotification() error {
--- a/client/server/server_connect_test.go
+++ b/client/server/server_connect_test.go
@@ -15,14 +15,19 @@ import (
 )

 func newTestServer() *Server {
-	return &Server{
-		rootCtx:        context.Background(),
+	ctx := context.Background()
+	s := &Server{
+		rootCtx:        ctx,
 		statusRecorder: peer.NewRecorder(""),
 	}
+	// Honor the production invariant: the daemon-lifetime client always exists
+	// (built in New). Server methods rely on s.connectClient being non-nil.
+	s.connectClient = internal.NewConnectClient(ctx, s.statusRecorder)
+	return s
 }

 func newDummyConnectClient(ctx context.Context) *internal.ConnectClient {
-	return internal.NewConnectClient(ctx, nil, nil)
+	return internal.NewConnectClient(ctx, nil)
 }

 // TestConnectSetsClientWithMutex validates that connect() sets s.connectClient
@@ -87,41 +92,36 @@ func TestConcurrentConnectClientAccess(t *testing.T) {
 	assert.Equal(t, 50, nilCount+setCount, "all goroutines should complete without panic")
 }

-// TestCleanupConnection_ClearsConnectClient validates that cleanupConnection
-// properly nils out connectClient.
-func TestCleanupConnection_ClearsConnectClient(t *testing.T) {
+// TestCleanupConnection_KeepsClientStopsRunning validates that cleanupConnection
+// clears the daemon "up" intent but KEEPS the daemon-lifetime ConnectClient
+// (it is reused across Up/Down; only the run is stopped).
+func TestCleanupConnection_KeepsClientStopsRunning(t *testing.T) {
 	s := newTestServer()
 	_, cancel := context.WithCancel(context.Background())
 	s.actCancel = cancel

-	s.connectClient = newDummyConnectClient(context.Background())
-	s.clientRunning = true
-
 	err := s.cleanupConnection()
 	require.NoError(t, err)

-	assert.Nil(t, s.connectClient, "connectClient should be nil after cleanup")
-	assert.False(t, s.clientRunning, "clientRunning should be cleared after cleanup (intent = down)")
+	assert.NotNil(t, s.connectClient, "connectClient is daemon-lifetime and must persist after cleanup")
+	assert.False(t, s.connectClient.ConnectionRunning(), "no run should be in flight after cleanup")
 }

-// TestCleanState_NilConnectClient validates that CleanState doesn't panic
-// when connectClient is nil.
-func TestCleanState_NilConnectClient(t *testing.T) {
+// TestCleanState_NotConnected validates that CleanState doesn't panic when no
+// connection run is in flight.
+func TestCleanState_NotConnected(t *testing.T) {
 	s := newTestServer()
-	s.connectClient = nil
-	s.profileManager = nil // will cause error if it tries to proceed past the nil check
+	s.profileManager = nil // will cause error if it tries to proceed

-	// Should not panic — the nil check should prevent calling Status() on nil
 	assert.NotPanics(t, func() {
 		_, _ = s.CleanState(context.Background(), &proto.CleanStateRequest{All: true})
 	})
 }

-// TestDeleteState_NilConnectClient validates that DeleteState doesn't panic
-// when connectClient is nil.
-func TestDeleteState_NilConnectClient(t *testing.T) {
+// TestDeleteState_NotConnected validates that DeleteState doesn't panic when no
+// connection run is in flight.
+func TestDeleteState_NotConnected(t *testing.T) {
 	s := newTestServer()
-	s.connectClient = nil
 	s.profileManager = nil

 	assert.NotPanics(t, func() {
@@ -129,60 +129,6 @@ func TestDeleteState_NilConnectClient(t *testing.T) {
 	})
 }

-// TestDownThenUp_StaleRunningChan documents the known state issue where
-// clientRunningChan from a previous connection is already closed, causing
-// waitForUp() to return immediately on reconnect.
-func TestDownThenUp_StaleRunningChan(t *testing.T) {
-	s := newTestServer()
-
-	// Simulate state after a successful connection
-	s.clientRunning = true
-	s.clientRunningChan = make(chan struct{})
-	close(s.clientRunningChan) // closed when engine started
-	s.clientGiveUpChan = make(chan struct{})
-	s.connectClient = newDummyConnectClient(context.Background())
-
-	_, cancel := context.WithCancel(context.Background())
-	s.actCancel = cancel
-
-	// Simulate Down(): cleanupConnection sets connectClient = nil and
-	// flips clientRunning to false (intent = down). The connectionGoroutineRunning state
-	// remains independent of intent — derived from clientGiveUpChan.
-	s.mutex.Lock()
-	err := s.cleanupConnection()
-	s.mutex.Unlock()
-	require.NoError(t, err)
-
-	// After cleanup: connectClient is nil, clientRunning is false (intent
-	// cleared by cleanupConnection), connectionGoroutineRunning may still be true
-	// (goroutine teardown is independent of the intent flag).
-	s.mutex.Lock()
-	assert.Nil(t, s.connectClient, "connectClient should be nil after cleanup")
-	assert.False(t, s.clientRunning, "clientRunning should be cleared by cleanupConnection (intent = down)")
-	s.mutex.Unlock()
-
-	// waitForUp() returns immediately due to stale closed clientRunningChan
-	ctx, ctxCancel := context.WithTimeout(context.Background(), 2*time.Second)
-	defer ctxCancel()
-
-	waitDone := make(chan error, 1)
-	go func() {
-		_, err := s.waitForUp(ctx)
-		waitDone <- err
-	}()
-
-	select {
-	case err := <-waitDone:
-		assert.NoError(t, err, "waitForUp returns success on stale channel")
-		// But connectClient is still nil — this is the stale state issue
-		s.mutex.Lock()
-		assert.Nil(t, s.connectClient, "connectClient is nil despite waitForUp success")
-		s.mutex.Unlock()
-	case <-time.After(1 * time.Second):
-		t.Fatal("waitForUp should have returned immediately due to stale closed channel")
-	}
-}
-
 // TestConnectClient_EngineNilOnFreshClient validates that a newly created
 // ConnectClient has nil Engine (before Run is called).
 func TestConnectClient_EngineNilOnFreshClient(t *testing.T) {
--- a/client/server/server_test.go
+++ b/client/server/server_test.go
@@ -31,7 +31,6 @@ import (
 	"google.golang.org/grpc/keepalive"

 	"github.com/netbirdio/netbird/client/internal"
-	"github.com/netbirdio/netbird/client/internal/peer"
 	"github.com/netbirdio/netbird/client/internal/profilemanager"
 	daemonProto "github.com/netbirdio/netbird/client/proto"
 	"github.com/netbirdio/netbird/management/server"
@@ -61,65 +60,6 @@ var (
 	}
 )

-// TestConnectWithRetryRuns checks that the connectWithRetry function runs and runs the retries according to the times specified via environment variables
-// we will use a management server started via to simulate the server and capture the number of retries
-func TestConnectWithRetryRuns(t *testing.T) {
-	// start the signal server
-	_, signalAddr, err := startSignal(t)
-	if err != nil {
-		t.Fatalf("failed to start signal server: %v", err)
-	}
-
-	counter := 0
-	// start the management server
-	_, mgmtAddr, err := startManagement(t, signalAddr, &counter)
-	if err != nil {
-		t.Fatalf("failed to start management server: %v", err)
-	}
-
-	ctx := internal.CtxInitState(context.Background())
-
-	ctx, cancel := context.WithDeadline(ctx, time.Now().Add(30*time.Second))
-	defer cancel()
-	// create new server
-	ic := profilemanager.ConfigInput{
-		ManagementURL: "http://" + mgmtAddr,
-		ConfigPath:    t.TempDir() + "/test-profile.json",
-	}
-
-	config, err := profilemanager.UpdateOrCreateConfig(ic)
-	if err != nil {
-		t.Fatalf("failed to create config: %v", err)
-	}
-
-	currUser, err := user.Current()
-	require.NoError(t, err)
-
-	pm := profilemanager.ServiceManager{}
-	err = pm.SetActiveProfileState(&profilemanager.ActiveProfileState{
-		ID:       "test-profile",
-		Username: currUser.Username,
-	})
-	if err != nil {
-		t.Fatalf("failed to set active profile state: %v", err)
-	}
-
-	s := New(ctx, "debug", "", false, false, false, false)
-
-	s.config = config
-
-	s.statusRecorder = peer.NewRecorder(config.ManagementURL.String())
-	t.Setenv(retryInitialIntervalVar, "1s")
-	t.Setenv(maxRetryIntervalVar, "2s")
-	t.Setenv(maxRetryTimeVar, "5s")
-	t.Setenv(retryMultiplierVar, "1")
-
-	s.connectWithRetryRuns(ctx, config, s.statusRecorder, nil, nil)
-	if counter < 3 {
-		t.Fatalf("expected counter > 2, got %d", counter)
-	}
-}
-
 func TestServer_Up(t *testing.T) {
 	tempDir := t.TempDir()
 	origDefaultProfileDir := profilemanager.DefaultConfigPathDir
--- a/client/server/state.go
+++ b/client/server/state.go
@@ -9,7 +9,6 @@ import (
 	"google.golang.org/grpc/status"

 	nberrors "github.com/netbirdio/netbird/client/errors"
-	"github.com/netbirdio/netbird/client/internal"
 	"github.com/netbirdio/netbird/client/internal/routemanager/systemops"
 	"github.com/netbirdio/netbird/client/internal/statemanager"
 	"github.com/netbirdio/netbird/client/proto"
@@ -38,7 +37,7 @@ func (s *Server) ListStates(_ context.Context, _ *proto.ListStatesRequest) (*pro

 // CleanState handles cleaning of states (performing cleanup operations)
 func (s *Server) CleanState(ctx context.Context, req *proto.CleanStateRequest) (*proto.CleanStateResponse, error) {
-	if s.connectClient != nil && (s.connectClient.Status() == internal.StatusConnected || s.connectClient.Status() == internal.StatusConnecting) {
+	if s.connectClient.ConnectionRunning() {
 		return nil, status.Errorf(codes.FailedPrecondition, "cannot clean state while connecting or connected, run 'netbird down' first.")
 	}

@@ -81,7 +80,7 @@ func (s *Server) CleanState(ctx context.Context, req *proto.CleanStateRequest) (

 // DeleteState handles deletion of states without cleanup
 func (s *Server) DeleteState(ctx context.Context, req *proto.DeleteStateRequest) (*proto.DeleteStateResponse, error) {
-	if s.connectClient != nil && (s.connectClient.Status() == internal.StatusConnected || s.connectClient.Status() == internal.StatusConnecting) {
+	if s.connectClient.ConnectionRunning() {
 		return nil, status.Errorf(codes.FailedPrecondition, "cannot clean state while connecting or connected, run 'netbird down' first.")
 	}

--- a/client/server/trace.go
+++ b/client/server/trace.go
@@ -62,10 +62,6 @@ func (s *Server) TracePacket(_ context.Context, req *proto.TracePacketRequest) (
 }

 func (s *Server) getPacketTracer() (packetTracer, *internal.Engine, error) {
-	if s.connectClient == nil {
-		return nil, nil, fmt.Errorf("connect client not initialized")
-	}
-
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, nil, fmt.Errorf("engine not initialized")
--- a/client/ui/client_ui.go
+++ b/client/ui/client_ui.go
@@ -418,14 +418,7 @@ func newServiceClient(args *newServiceClientArgs) *serviceClient {
 	case args.showProfiles:
 		s.showProfilesUI()
 	case args.showQuickActions:
-		// Suppress the on-boot Quick Actions popup when the daemon
-		// reports DisableAutoConnect=true — that flag carries both the
-		// user's "Connect on Startup = off" preference AND any MDM-
-		// enforced override (applyMDMPolicy writes the policy value
-		// into the same Config field). See netbirdio/netbird#5744.
-		if !s.disableAutoConnectFromDaemon() {
-			s.showQuickActionsUI()
-		}
+		s.showQuickActionsUI()
 	case args.showUpdate:
 		s.showUpdateProgress(ctx, args.showUpdateVersion)
 	}
@@ -1345,40 +1338,6 @@ func (s *serviceClient) getFeatures() (*proto.GetFeaturesResponse, error) {
 	return features, nil
 }

-// disableAutoConnectFromDaemon returns true when the daemon reports
-// the active profile has DisableAutoConnect=true. Used by the
-// --quick-actions startup path to suppress the on-boot popup when the
-// user (or an MDM admin) opted out of auto-connecting; both cases
-// converge on the same Config field because applyMDMPolicy writes the
-// policy value into it. Returns false on any RPC / lookup failure so a
-// daemon hiccup does not silently swallow the popup.
-func (s *serviceClient) disableAutoConnectFromDaemon() bool {
-	activeProf, err := s.profileManager.GetActiveProfile()
-	if err != nil {
-		log.Warnf("disableAutoConnectFromDaemon: get active profile: %v", err)
-		return false
-	}
-	currUser, err := user.Current()
-	if err != nil {
-		log.Warnf("disableAutoConnectFromDaemon: get current user: %v", err)
-		return false
-	}
-	conn, err := s.getSrvClient(failFastTimeout)
-	if err != nil {
-		log.Warnf("disableAutoConnectFromDaemon: get daemon client: %v", err)
-		return false
-	}
-	srvCfg, err := conn.GetConfig(s.ctx, &proto.GetConfigRequest{
-		ProfileName: activeProf.ID.String(),
-		Username:    currUser.Username,
-	})
-	if err != nil {
-		log.Warnf("disableAutoConnectFromDaemon: GetConfig RPC: %v", err)
-		return false
-	}
-	return srvCfg.GetDisableAutoConnect()
-}
-
 // getSrvConfig from the service to show it in the settings window.
 func (s *serviceClient) getSrvConfig() {
 	s.managementURL = profilemanager.DefaultManagementURL
--- a/combined/Dockerfile.multistage
+++ b/combined/Dockerfile.multistage
@@ -5,16 +5,12 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y gcc libc6-dev git && rm -rf /var/lib/apt/lists/*

 COPY go.mod go.sum ./
-RUN --mount=type=cache,target=/go/pkg/mod go mod download
+RUN go mod download

 COPY . .

-# Build with version info from git (matching goreleaser ldflags).
-# BuildKit cache mounts persist the module + build caches across image builds,
-# so a source change recompiles incrementally instead of from scratch.
-RUN --mount=type=cache,target=/go/pkg/mod \
-    --mount=type=cache,target=/root/.cache/go-build \
-    CGO_ENABLED=1 GOOS=linux go build \
+# Build with version info from git (matching goreleaser ldflags)
+RUN CGO_ENABLED=1 GOOS=linux go build \
    -ldflags="-s -w \
    -X github.com/netbirdio/netbird/version.version=$(git describe --tags --always --dirty 2>/dev/null || echo 'dev') \
    -X main.commit=$(git rev-parse --short HEAD 2>/dev/null || echo 'unknown') \
--- a/docker/build-env/README.md
+++ b/docker/build-env/README.md
@@ -0,0 +1,56 @@
+# Build environments
+
+Dockerfiles that pin the same toolchain CI uses, so a developer can
+reproduce a CI build locally without installing platform SDKs on their
+workstation. The version pins in each `Dockerfile` must stay in lockstep
+with `.github/workflows/`.
+
+## `android/`
+
+Mirrors `.github/workflows/mobile-build-validation.yml` (`android_build`
+job). Carries Go 1.25.5, Adopt JDK 11, Android cmdline-tools 8512546,
+NDK 23.1.7779620 and gomobile pinned at the CI commit. Use it to
+produce `netbird.aar` from `./client/android`:
+
+```bash
+docker build -t netbird/build-android docker/build-env/android
+docker run --rm -v "$PWD:/src" -w /src netbird/build-android \
+    gomobile bind \
+    -o netbird.aar \
+    -javapkg=io.netbird.gomobile \
+    -ldflags="-checklinkname=0 \
+              -X golang.zx2c4.com/wireguard/ipc.socketDirectory=/data/data/io.netbird.client/cache/wireguard \
+              -X github.com/netbirdio/netbird/version.version=local" \
+    ./client/android
+```
+
+To build the full Android APK, bind-mount the `android-client` repo as
+well and run its own `./gradlew assembleDebug` from inside the
+container (the gradle wrapper ships with `android-client`).
+
+## `windows-cross/`
+
+Cross-compiles Windows binaries from Linux using `mingw-w64`. Lets you
+verify that `GOOS=windows go build ./...` compiles cleanly without
+needing a Windows VM. Cannot run Windows tests — the `golang-test-windows`
+CI job executes on a native `windows-latest` runner with wintun.dll
+and PsExec, neither of which lives under Linux containers.
+
+```bash
+docker build -t netbird/build-windows docker/build-env/windows-cross
+docker run --rm -v "$PWD:/src" -w /src netbird/build-windows \
+    bash -c 'GOOS=windows GOARCH=amd64 go build ./...'
+```
+
+## What is NOT here
+
+- **iOS / macOS**: cannot legally run macOS in Docker (Apple EULA),
+  and Xcode is not redistributable. The `ios_build` CI job uses a
+  `macos-latest` GitHub runner; locally you need a real Mac.
+
+- **Native Windows tests**: see note above. The Linux+mingw image
+  builds, it does not execute Windows-host code paths
+  (registry, wintun, services, PsExec workflows).
+
+When CI version pins change, update the corresponding `ARG` lines in
+the Dockerfiles and the README's table of versions.
--- a/docker/build-env/android/Dockerfile
+++ b/docker/build-env/android/Dockerfile
@@ -0,0 +1,86 @@
+# Android build environment.
+#
+# Mirrors the toolchain pinned by .github/workflows/mobile-build-validation.yml
+# so a `gomobile bind` against ./client/android in this image produces the
+# same netbird.aar that CI builds.
+#
+# Tooling versions (must stay in sync with the CI workflow):
+#   - Ubuntu 22.04 (matches the ubuntu-latest GitHub runner)
+#   - Go 1.25.5 (matches go.mod)
+#   - Adopt JDK 11 (matches actions/setup-java@v3 java-version: 11, distribution: adopt)
+#   - Android SDK cmdline-tools 8512546
+#   - Android NDK 23.1.7779620
+#   - gomobile commit v0.0.0-20251113184115-a159579294ab
+#
+# Usage (from the netbird repo root):
+#
+#   docker build -t netbird/build-android docker/build-env/android
+#
+#   # bind the netbird checkout in and run the same gomobile command CI runs
+#   docker run --rm -v "$PWD:/src" -w /src netbird/build-android \
+#       gomobile bind \
+#       -o netbird.aar \
+#       -javapkg=io.netbird.gomobile \
+#       -ldflags="-checklinkname=0 \
+#                 -X golang.zx2c4.com/wireguard/ipc.socketDirectory=/data/data/io.netbird.client/cache/wireguard \
+#                 -X github.com/netbirdio/netbird/version.version=local" \
+#       ./client/android
+#
+# To build the full APK, mount the android-client repo too and run
+# `./gradlew assembleDebug` from /android-client (this image carries
+# gradle's prerequisites JDK + Android SDK but not the gradle wrapper —
+# that ships with android-client).
+
+FROM ubuntu:22.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Versions — bump in lockstep with .github/workflows/mobile-build-validation.yml.
+ARG GO_VERSION=1.25.5
+ARG ANDROID_CMDLINE_TOOLS_VERSION=8512546
+ARG ANDROID_NDK_VERSION=23.1.7779620
+ARG GOMOBILE_VERSION=v0.0.0-20251113184115-a159579294ab
+
+ENV ANDROID_HOME=/opt/android-sdk
+ENV ANDROID_NDK_HOME=${ANDROID_HOME}/ndk/${ANDROID_NDK_VERSION}
+ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
+ENV GOPATH=/go
+ENV GOTOOLCHAIN=local
+ENV CGO_ENABLED=0
+ENV PATH=${GOPATH}/bin:/usr/local/go/bin:${ANDROID_HOME}/cmdline-tools/latest/bin:${ANDROID_HOME}/platform-tools:${JAVA_HOME}/bin:${PATH}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        unzip \
+        git \
+        openjdk-11-jdk-headless \
+        build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Go (matches go.mod). actions/setup-go fetches the same tarball.
+RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" \
+        | tar -C /usr/local -xz \
+    && go version
+
+# Install Android SDK command-line tools, accept licenses, install NDK.
+RUN mkdir -p "${ANDROID_HOME}/cmdline-tools" \
+    && curl -fsSL -o /tmp/cmdline.zip \
+        "https://dl.google.com/android/repository/commandlinetools-linux-${ANDROID_CMDLINE_TOOLS_VERSION}_latest.zip" \
+    && unzip -q /tmp/cmdline.zip -d "${ANDROID_HOME}/cmdline-tools" \
+    && mv "${ANDROID_HOME}/cmdline-tools/cmdline-tools" "${ANDROID_HOME}/cmdline-tools/latest" \
+    && rm /tmp/cmdline.zip \
+    && yes | sdkmanager --licenses > /dev/null \
+    && sdkmanager --install "ndk;${ANDROID_NDK_VERSION}" "platform-tools" > /dev/null
+
+# Install gomobile at the same commit CI pins. Don't run `gomobile init` here:
+# `init` resolves the NDK at runtime, do it on the first bind in the mounted
+# workspace so the cache lands on the host volume.
+RUN GOBIN=/usr/local/bin go install "golang.org/x/mobile/cmd/gomobile@${GOMOBILE_VERSION}" \
+    && gomobile version
+
+WORKDIR /src
+
+# Default entrypoint is a plain shell so the image is composable: callers pass
+# the full gomobile / gradle command they want to run.
+CMD ["/bin/bash"]
--- a/docker/build-env/windows-cross/Dockerfile
+++ b/docker/build-env/windows-cross/Dockerfile
@@ -0,0 +1,63 @@
+# Windows-cross build environment.
+#
+# Cross-compiles Windows .exe targets from a Linux container using
+# mingw-w64. Mirrors the toolchain set used by
+# .github/workflows/golang-test-windows.yml insofar as that is possible
+# without a Windows kernel.
+#
+# IMPORTANT — what this image CAN do:
+#   - `GOOS=windows go build ./...` to validate that Windows builds compile
+#   - CGO Windows cross-compile via x86_64-w64-mingw32-gcc when CGO_ENABLED=1
+#     (matches CI's choco-installed mingw-w64)
+#
+# IMPORTANT — what this image CANNOT do:
+#   - Run Windows binaries (no Windows kernel under Docker on Linux).
+#   - Replicate the CI's `go test` runs which execute on a real
+#     windows-latest runner (wintun.dll, PsExec, registry, etc.).
+#     Use the CI for that or a native Windows VM.
+#
+# Usage (from the netbird repo root):
+#
+#   docker build -t netbird/build-windows docker/build-env/windows-cross
+#
+#   # Cross-compile a static client (.exe) from Linux:
+#   docker run --rm -v "$PWD:/src" -w /src netbird/build-windows \
+#       bash -c 'CGO_ENABLED=1 GOOS=windows GOARCH=amd64 \
+#           CC=x86_64-w64-mingw32-gcc CXX=x86_64-w64-mingw32-g++ \
+#           go build -o netbird.exe ./client'
+#
+#   # Just validate that everything *compiles* on Windows (no CGO):
+#   docker run --rm -v "$PWD:/src" -w /src netbird/build-windows \
+#       bash -c 'GOOS=windows GOARCH=amd64 go build ./...'
+#
+# Tooling versions (keep in sync with go.mod and any future explicit pin
+# documented in golang-test-windows.yml):
+#   - Ubuntu 22.04
+#   - Go 1.25.5 (matches go.mod)
+#   - mingw-w64 (Ubuntu package — pin further if drift becomes a problem)
+
+FROM ubuntu:22.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG GO_VERSION=1.25.5
+
+ENV GOPATH=/go
+ENV GOTOOLCHAIN=local
+ENV PATH=${GOPATH}/bin:/usr/local/go/bin:${PATH}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+        git \
+        build-essential \
+        mingw-w64 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Go (matches go.mod).
+RUN curl -fsSL "https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz" \
+        | tar -C /usr/local -xz \
+    && go version
+
+WORKDIR /src
+
+CMD ["/bin/bash"]
--- a/docs/agent-networks/00-overview.md
+++ b/docs/agent-networks/00-overview.md
@@ -1,109 +0,0 @@
-# Agent Networks — overview
-
-Single-entry point. Feature scope, the module map, and the cross-cutting
-topics worth keeping in mind, with links into every per-module guide.
-
-## TL;DR
-
-Agent Networks introduces an **LLM-aware reverse-proxy middleware system**
-plus **account-level controls** (budget rules, log collection toggles,
-PII redaction). The management server synthesises a per-peer middleware
-chain that the proxy executes on every LLM request; the chain enforces
-quotas, injects identity, redacts PII, parses tokens/cost, and emits
-access-log entries. The dashboard exposes the surface as a single **AI
-Observability** page with four tabs.
-
- **Backend** lives in this repo, primarily under
-  `management/server/agentnetwork`, `proxy/internal/middleware`, and
-  `proxy/internal/llm`, with wire contracts in `shared/management`.
- **Dashboard** lives in the dashboard repo under
-  `src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`.
-
-## Reading order
-
-| # | Doc | Why |
-|---|-----|-----|
-| 1 | [01-end-to-end-flows.md](01-end-to-end-flows.md) | Get the three big diagrams in your head first. |
-| 2 | [modules/10-shared-api.md](modules/10-shared-api.md) | Wire contracts — every other module either produces or consumes these. |
-| 3 | [modules/21-management-agentnetwork.md](modules/21-management-agentnetwork.md) | The largest module; everything the proxy executes originates here. |
-| 4 | [modules/30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md) | The generic plugin system on the proxy side. |
-| 5 | [modules/31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md) | The 8 LLM middlewares that ride on the framework. |
-| 6 | Everything else in any order. | |
-
-## Module map
-
-11 modules. Each is described in detail in its own file under
-[`modules/`](modules/).
-
-| # | Module | Risk | BC impact |
-|---|--------|------|-----------|
-| 10 | [shared/api](modules/10-shared-api.md) — proto + OpenAPI | Low | Additive only |
-| 20 | [management/store](modules/20-management-store.md) — SQL persistence | Medium | Auto-migrate (additive) |
-| 21 | [management/agentnetwork](modules/21-management-agentnetwork.md) — domain layer + synthesizer | **High** | Additive |
-| 22 | [management/handlers + wiring](modules/22-management-handlers-wiring.md) — HTTP API + gRPC delivery | Medium | Additive |
-| 30 | [proxy/middleware-framework](modules/30-proxy-middleware-framework.md) — generic plugin system | High | Additive |
-| 31 | [proxy/middleware-builtin](modules/31-proxy-middleware-builtin.md) — 8 LLM middlewares | High | Additive |
-| 32 | [proxy/llm-parsers](modules/32-proxy-llm-parsers.md) — SDK adapters + pricing | Medium | Additive |
-| 33 | [proxy/runtime](modules/33-proxy-runtime.md) — translate + serve + access-log | High | Additive (touches hot path) |
-| 40 | [dashboard](modules/40-dashboard.md) — UI for everything above | Medium | Sidebar reshape |
-| 50 | [path-routed-providers](modules/50-path-routed-providers.md) — Vertex AI + Bedrock | Medium | Additive (new catalog entries) |
-
-The largest and highest-risk module is `management/agentnetwork`: it is
-the single writer of the middleware chain the proxy executes.
-
-## Cross-cutting topics
-
-These are the items most likely to bite production. Each is fully
-documented in the linked module guide.
-
-1. **Capture-pointer semantics** (`*bool` for `capture_prompt` and
-   `capture_completion`): nil = legacy emit, false = suppress, true =
-   emit. nil-vs-false must be handled at every JSON hop. See
-   [21-management-agentnetwork.md](modules/21-management-agentnetwork.md)
-   and [31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md).
-2. **`ProxyMapping.Private` preservation** on per-proxy live updates.
-   Failure mode: `auth` skips `ValidateTunnelPeer` →
-   `CapturedData.UserGroups` empty → `llm_router` denies. See
-   [33-proxy-runtime.md](modules/33-proxy-runtime.md).
-3. **respInput carrying `UserEmail`/`UserGroups`/`UserGroupNames` onto
-   the response leg** in `reverseproxy.go`. Load-bearing wire that lets
-   `llm_limit_record` ship non-empty `group_ids` on `RecordLLMUsage`. See
-   [33-proxy-runtime.md](modules/33-proxy-runtime.md).
-4. **Min-wins all-must-pass budget rule semantics**. Every matching
-   rule's remaining quota must be > 0 for the request to proceed; one
-   exhausted rule blocks the whole call. Documented in
-   [21-management-agentnetwork.md](modules/21-management-agentnetwork.md)
-   and the `llm_limit_check` middleware in
-   [31-proxy-middleware-builtin.md](modules/31-proxy-middleware-builtin.md).
-5. **body-tap memory bounds**: per-direction 1 MiB cap, shared 256 MiB
-   budget, `LimitReader(r.Body, limit+1)` for truncation detection with
-   `replayReadCloser` fallback so upstream still sees the full body.
-   `cloneInputFor` deep-copies the body up to 16 times per chain — a
-   perf hot-spot. See
-   [30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md).
-6. **UpstreamRewrite.AuthHeader bypasses the header denylist**
-   deliberately. The runtime consumer only unpacks it via the
-   trusted upstream-build path. See
-   [30-proxy-middleware-framework.md](modules/30-proxy-middleware-framework.md).
-7. **`disable_access_log` default-false semantics**: the synth target
-   sets it true, all other targets leave it false. See
-   [10-shared-api.md](modules/10-shared-api.md).
-8. **String-typed `decision` / `deny_code`** on
-   `CheckLLMPolicyLimitsResponse` — would benefit from enum pinning
-   before external consumers integrate. See
-   [10-shared-api.md](modules/10-shared-api.md).
-
-## Explicit non-goals
-
- **Reaper / GC pass over stale synth services** — designed but cut from
-  scope.
- **URL-sync for tab state on AI Observability** — read path is wired
-  (`?tab=`) but write path isn't. Future work.
- **CI golden-file regen-and-diff for `types.gen.go` /
-  `proxy_service.pb.go`** — would catch codegen drift; not yet in place.
-
-## Where to read the code
-
-Per-module file scopes are listed in each module guide. Behaviour is
-covered by Go tests co-located with each package (and an end-to-end
-chain integration test under `proxy/internal/proxy`).
--- a/docs/agent-networks/01-end-to-end-flows.md
+++ b/docs/agent-networks/01-end-to-end-flows.md
@@ -1,217 +0,0 @@
-# End-to-end flows
-
-Three cross-module mermaid diagrams. Each per-module guide repeats the
-slice that's relevant to its own scope — these are the canonical
-top-down views.
-
- [Flow A — Config → runtime (synth + deliver)](#flow-a--config--runtime-synth--deliver)
- [Flow B — Request lifecycle through the LLM chain](#flow-b--request-lifecycle-through-the-llm-chain)
- [Flow C — Budget rule feedback loop](#flow-c--budget-rule-feedback-loop)
-
---
-
-## Flow A — Config → runtime (synth + deliver)
-
-How an operator's change to a Provider, Policy, Guardrail, Budget Rule,
-or Settings record ends up as live middleware on a peer's proxy.
-
-```mermaid
-sequenceDiagram
-    autonumber
-    actor Op as Operator
-    participant UI as Dashboard
-    participant HTTP as management/handlers
-    participant Mgr as agentnetwork.Manager
-    participant Store as management/store (SQL)
-    participant Ctl as network_map.Controller
-    participant Synth as agentnetwork.SynthesizeServices
-    participant Grpc as management gRPC
-    participant Proxy as netbird-proxy
-    participant Xlate as middleware_translate
-    participant Chain as middleware.Chain
-
-    Op->>UI: edit provider/policy/budget/settings
-    UI->>HTTP: REST PUT/POST /api/agent-network/*
-    HTTP->>Mgr: SaveProvider / SavePolicy / SaveBudgetRule / SaveSettings
-    Mgr->>Store: persist (gorm)
-    Mgr-->>Ctl: account change event (Network-Map dirty)
-    loop per connected peer
-        Ctl->>Synth: SynthesizeServices(ctx, store, accountID)
-        Synth->>Store: load providers, policies, guardrails, budget rules, settings
-        Synth-->>Synth: build per-peer Service list
-        Note over Synth: each Service has a middleware<br/>chain with capture_prompt /<br/>capture_completion / redact_pii<br/>baked from account settings
-        Synth-->>Ctl: []rpservice.Service
-        Ctl->>Grpc: NetworkMap push (services + middleware configs)
-    end
-    Grpc-->>Proxy: NetworkMap stream
-    Proxy->>Xlate: translate proto MiddlewareConfig → runtime Spec
-    Xlate->>Chain: register / replace per-service chain
-    Note over Chain: chain replacement is live<br/>(no proxy restart, in-flight<br/>requests unaffected)
-```
-
-**Notes on the diagram**
-
- The `network_map.Controller` synthesises on every push, not on a
-  timer. A single config change costs O(connected peers × policies ×
-  providers) per push. See [`modules/22-management-handlers-wiring.md`](modules/22-management-handlers-wiring.md).
- `SynthesizeServices` is the single source of truth for the wire
-  format the proxy executes. Anything the proxy does that the
-  synthesiser didn't request is a bug. See
-  [`modules/21-management-agentnetwork.md`](modules/21-management-agentnetwork.md).
- The translate step (step 13) is the only place that knows the
-  middleware-ID strings on the proxy side. It must reject unknown IDs;
-  silently dropping middlewares would create a security gap (e.g.
-  missing `llm_limit_check` ⇒ unbounded spend). See
-  [`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
-
---
-
-## Flow B — Request lifecycle through the LLM chain
-
-What happens when an agent on the client peer sends a chat-completion /
-messages request through the synthesised reverse-proxy.
-
-```mermaid
-sequenceDiagram
-    autonumber
-    actor Agent as Agent (local)
-    participant Px as netbird-proxy
-    participant Auth as auth middleware
-    participant Map as service-mapping
-    participant Req as llm_request_parser
-    participant Rt as llm_router
-    participant Chk as llm_limit_check
-    participant Inj as llm_identity_inject
-    participant Grd as llm_guardrail
-    participant Up as upstream LLM
-    participant Resp as llm_response_parser
-    participant Cost as cost_meter
-    participant Rec as llm_limit_record
-    participant Log as access-log
-    participant MgmtGrpc as management gRPC
-
-    Agent->>Px: POST /v1/chat/completions  (OpenAI / Anthropic)
-    Px->>Auth: identify peer (user, groups)
-    Auth->>Map: resolve service from Host + path
-    Map-->>Req: dispatch chain in slot order
-
-    Req->>Req: parse body → provider, model, prompt, token estimate
-    Note over Req: capture_prompt gates raw_prompt<br/>capture (nil = legacy emit,<br/>false = drop, true = emit)
-    Req->>Rt: pass metadata
-    Rt->>Chk: route to upstream candidate
-
-    Chk->>MgmtGrpc: CheckLLMPolicyLimits(provider, model, est_tokens, groups, user)
-    MgmtGrpc-->>Chk: decision = allow / deny + deny_code
-    alt decision == deny
-        Chk-->>Log: emit access-log with deny_code<br/>(if EnableLogCollection)
-        Chk-->>Agent: 429 (or 403 per deny_code)
-    else decision == allow
-        Chk->>Inj: continue
-        Inj->>Inj: inject NetBird identity headers per provider config
-        Inj->>Grd: continue
-        Grd->>Grd: enforce model allowlist
-        Grd->>Up: forward (over WireGuard)
-        Up-->>Resp: response (JSON or SSE stream)
-        Resp->>Resp: parse usage tokens, completion
-        Note over Resp: capture_completion gates raw<br/>completion capture
-        Resp->>Cost: tokens
-        Cost->>Cost: lookup pricing.yaml + compute cost
-        Cost->>Rec: tokens + cost
-        Rec->>MgmtGrpc: RecordLLMUsage(provider, model, prompt_t, completion_t, cost, groups, user)
-        Rec-->>Log: emit access-log entry<br/>(if EnableLogCollection)
-        Log-->>Agent: 200 + body (streamed if SSE)
-    end
-```
-
-**Notes on the diagram**
-
- The chain runs in synth-defined order. Re-ordering middlewares
-  changes invariants — `llm_limit_check` must precede `llm_router` so
-  a denied request never hits upstream, and `llm_limit_record` must
-  pair with `llm_limit_check` so a successful check is always recorded
-  (or the rate-limit semantics break). See
-  [`modules/31-proxy-middleware-builtin.md`](modules/31-proxy-middleware-builtin.md).
- `llm_guardrail` is also where PII redaction happens
-  (`redact_pii = settings.RedactPii`). Phones, emails, credit cards,
-  PII names — see `redact.go` for the full set. See
-  [`modules/31-proxy-middleware-builtin.md`](modules/31-proxy-middleware-builtin.md).
- SSE streaming requires special handling on the response side; the
-  parser must handle partial chunks without buffering the whole
-  stream. See [`modules/32-proxy-llm-parsers.md`](modules/32-proxy-llm-parsers.md).
- Access-log emission is gated on `settings.EnableLogCollection`. With
-  it OFF, neither the deny nor the allow leg writes an entry — the
-  chain still runs (budget rules are still enforced) but no audit trail
-  is kept. See
-  [`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
-
---
-
-## Flow C — Budget rule feedback loop
-
-How an account's budget rules tighten ceilings on every request and how
-consumption flows back into the dashboard.
-
-```mermaid
-flowchart LR
-    subgraph Operator
-      DashBud[Dashboard Budget Settings tab]
-    end
-    subgraph Mgmt[Management]
-      Save[POST/PUT /api/agent-network/budget-rules]
-      Store[(SQL store)]
-      Synth[SynthesizeServices]
-      Check[CheckLLMPolicyLimits RPC]
-      Rec[RecordLLMUsage RPC]
-      Cons[/api/agent-network/consumption]
-    end
-    subgraph Proxy[Proxy]
-      Chk[llm_limit_check]
-      RecMw[llm_limit_record]
-    end
-    subgraph DashView[Dashboard Budget Dashboard tab]
-      Panel[AgentConsumptionPanel]
-    end
-
-    DashBud -->|create / update rules| Save
-    Save --> Store
-    Store --> Synth
-    Synth -->|push synth-services to peer| Proxy
-
-    Chk -->|per request| Check
-    Check -->|aggregate matching rules<br/>min-wins all-must-pass| Store
-    Check -->|allow / deny| Chk
-
-    RecMw -->|post-response| Rec
-    Rec -->|tokens + cost + groups + user| Store
-
-    Store -->|read counters| Cons
-    Cons --> Panel
-```
-
-**Notes on the diagram**
-
- **min-wins all-must-pass** is the core semantic. A budget rule binds
-  to (group set, user set) with a (window, ceiling). At check time,
-  every rule that matches the caller is evaluated; if ANY rule has
-  zero remaining quota the request is denied. This is the most
-  surprising semantic for operators — see the invariants section of
-  [`modules/21-management-agentnetwork.md`](modules/21-management-agentnetwork.md).
- The proxy never makes its own budget decisions. It always asks
-  management via `CheckLLMPolicyLimits` and reports back via
-  `RecordLLMUsage`. This keeps account-wide accounting in one place
-  and avoids per-proxy drift.
- `RecordLLMUsage` must carry `group_ids` and `user_id` so the
-  decrement hits the right rule(s). The wire that carries those
-  fields onto the response leg is `respInput` in `reverseproxy.go`. See
-  [`modules/33-proxy-runtime.md`](modules/33-proxy-runtime.md).
- The dashboard's Budget Dashboard tab polls
-  `/api/agent-network/consumption` — not gRPC, not WebSocket. Poll
-  interval lives in `AgentConsumptionPanel.tsx`. See
-  [`modules/40-dashboard.md`](modules/40-dashboard.md).
-
---
-
-## Cross-references
-
- Per-module guides: [`modules/`](modules/)
- Overview + module map: [`00-overview.md`](00-overview.md)
--- a/docs/agent-networks/README.md
+++ b/docs/agent-networks/README.md
@@ -1,66 +0,0 @@
-# Agent Networks — architecture documentation
-
-A self-contained set of documents describing the agent-networks feature:
-an LLM-aware reverse-proxy middleware system plus account-level controls
-(budget rules, log collection toggles, PII redaction). The management
-server synthesises a per-peer middleware chain that the proxy executes on
-every LLM request.
-
-## What to read first
-
-1. **[00-overview.md](00-overview.md)** — the single entry point. Feature
-   scope, the module map, and the cross-cutting topics worth keeping in
-   mind, with links to every per-module guide.
-2. **[01-end-to-end-flows.md](01-end-to-end-flows.md)** — three
-   high-level mermaid diagrams: config-to-runtime synth/delivery,
-   per-request lifecycle through the LLM chain, and the budget-rule
-   feedback loop.
-3. **Per-module guides** under `modules/` — one file per package. Each
-   describes the module boundary, the file-level layout, its own flow
-   diagrams, the public contracts, the invariants it relies on, and the
-   areas worth the closest attention.
-
-## Directory layout
-
-```
-docs/agent-networks/
-├── README.md                              # you are here
-├── 00-overview.md                         # feature summary + module map
-├── 01-end-to-end-flows.md                 # cross-module mermaid diagrams
-└── modules/
-    ├── 10-shared-api.md                   # proto + OpenAPI wire contracts
-    ├── 20-management-store.md             # SQL persistence layer
-    ├── 21-management-agentnetwork.md      # domain layer + synthesizer (largest)
-    ├── 22-management-handlers-wiring.md   # HTTP API + gRPC delivery
-    ├── 30-proxy-middleware-framework.md   # generic plugin system
-    ├── 31-proxy-middleware-builtin.md     # 8 LLM-aware middlewares
-    ├── 32-proxy-llm-parsers.md            # OpenAI/Anthropic/Bedrock SDKs + pricing
-    ├── 33-proxy-runtime.md                # translate + serve + access-log
-    ├── 40-dashboard.md                    # UI for everything above (lives in the dashboard repo)
-    └── 50-path-routed-providers.md        # Vertex AI + Bedrock (path-routed, keyfile:: creds, /bedrock prefix)
-```
-
-The `40-dashboard.md` module documents code that lives in the **dashboard
-repo**, not in this repo. The guide is co-located here so backend readers
-see the full picture in one place.
-
-## How the per-module guides are structured
-
-Every `modules/*.md` follows the same template so the docs are easy to
-scan:
-
- **Module boundary** — what this package owns; where it sits in the stack.
- **Files** — path / role.
- **Architecture & flow** — one or more mermaid diagrams.
- **Public contracts** — function signatures, gRPC messages, JSON shapes.
- **Invariants** — semantic guarantees the module relies on or enforces.
- **Things to scrutinize** — split by correctness / security /
-  concurrency / backward-compat / performance / observability.
- **Test coverage** — the test files that lock down behaviour in this
-  module.
- **Known limitations / non-goals** — what is intentionally out of scope.
- **Cross-references** — upstream/downstream module links + the
-  end-to-end flow + the overview.
-
-See [00-overview.md](00-overview.md) for the module map and the
-cross-cutting topics.
--- a/docs/agent-networks/modules/10-shared-api.md
+++ b/docs/agent-networks/modules/10-shared-api.md
@@ -1,105 +0,0 @@
-# shared/api — wire contracts (proto + OpenAPI)
-
-> **Risk level:** Medium — wire-format surface that every other module pins against; backward-compat hinges on field-number discipline more than on logic correctness.
-> **Backward-compat impact:** Additive only (new proto fields use unallocated numbers, new RPCs default to `Unimplemented`, new OpenAPI schemas/paths are append-only; no existing field/RPC/schema removed or renumbered).
-
-## Module boundary
-This module owns the cross-process contract surface between management, proxy, and dashboard. Two artefacts: `shared/management/proto/proxy_service.proto` (management↔proxy gRPC) and `shared/management/http/api/openapi.yml` (dashboard/CLI↔management REST). Both have generated companions checked in (`proxy_service.pb.go`, `proxy_service_grpc.pb.go`, `types.gen.go`) which must travel in lockstep with their sources. `shared/management/status/error.go` is in scope only for the four new typed `NotFound` constructors that the new HTTP handlers return.
-
-Everything downstream — `management/agentnetwork`, `management/server/http/handlers/*`, `proxy/internal/*`, the dashboard SDK — consumes these types verbatim. The concern here is wire stability and codegen reproducibility, not behaviour: behaviour is covered in the management and proxy module guides.
-
-`management.proto` and `signalexchange.proto` are unchanged. `status/error.go` only receives four additive constructors (lines 208-227); no existing error types are reshaped.
-
-## Files
-| Path | Role |
-| ---- | ---- |
-| `shared/management/proto/proxy_service.proto` | Source of truth: 2 new RPCs, 1 new message group (`MiddlewareConfig` + slot enum), additive fields on `PathTargetOptions`, `AccessLog`, `RecordLLMUsageRequest` |
-| `shared/management/proto/proxy_service.pb.go` | Generated (protoc-gen-go) |
-| `shared/management/proto/proxy_service_grpc.pb.go` | Generated; adds `CheckLLMPolicyLimits` + `RecordLLMUsage` client/server stubs and `UnimplementedProxyServiceServer` defaults |
-| `shared/management/http/api/openapi.yml` | 15 new `AgentNetwork*` schemas, 9 new path groups under `/api/agent-network/*` |
-| `shared/management/http/api/types.gen.go` | Generated (oapi-codegen; see codegen note below) |
-| `shared/management/status/error.go` | Four `NotFound` constructors for the new resource kinds (lines 208-227) |
-
-## Architecture & flow
-```mermaid
-sequenceDiagram
-    participant Dash as Dashboard / CLI
-    participant Mgmt as management (HTTP+gRPC)
-    participant Px as proxy
-
-    Note over Dash,Mgmt: REST (OpenAPI / types.gen.go)
-    Dash->>Mgmt: PUT /api/agent-network/providers (AgentNetworkProviderRequest)
-    Dash->>Mgmt: PUT /api/agent-network/settings (AgentNetworkSettingsRequest)
-    Dash->>Mgmt: GET /api/agent-network/consumption -> [AgentNetworkConsumption]
-
-    Note over Mgmt,Px: gRPC ProxyService (proxy_service.proto)
-    Mgmt-->>Px: SyncMappingsResponse{ ProxyMapping.path[*].options.middlewares,<br/>agent_network, disable_access_log, capture_* }
-    Px->>Mgmt: CheckLLMPolicyLimits(account, user, groups, provider, model)
-    Mgmt-->>Px: decision=allow|deny + selected_policy_id + attribution_group_id + window_seconds
-    Px->>Mgmt: RecordLLMUsage(account, user, group_id, group_ids, window_seconds, tokens, cost)
-    Px->>Mgmt: SendAccessLog(AccessLog{ agent_network=true })
-```
-
-The proto changes split into three independent slices: (1) **mapping enrichment** — `PathTargetOptions` grows fields 8-13 so management can ship middleware configs, capture limits, and the agent-network / log-suppression flags down to the proxy without a second RPC; (2) **two new request/response RPCs** (`CheckLLMPolicyLimits`, `RecordLLMUsage`) for per-LLM-request budget arbitration; (3) **observability tag** — `AccessLog.agent_network` so management can route logs to the right surface.
-
-The OpenAPI side is a thin CRUD surface — every resource (`Provider`, `Policy`, `Guardrail`, `BudgetRule`, `Settings`) follows the same `GET-list / POST / GET / PUT / DELETE` pattern, plus a read-only `/consumption` listing and a catalog endpoint. The `*Request` variants drop server-controlled fields (id, timestamps). `AgentNetworkBudgetRule` deliberately reuses `AgentNetworkPolicyLimits` to keep wire-shape parity with policies.
-
-## Public contracts added
- gRPC RPCs (`proxy_service.proto:52-57`): `CheckLLMPolicyLimits(CheckLLMPolicyLimitsRequest) → CheckLLMPolicyLimitsResponse`, `RecordLLMUsage(RecordLLMUsageRequest) → RecordLLMUsageResponse`. Both unary; default `UnimplementedProxyServiceServer` returns `codes.Unimplemented` (`proxy_service_grpc.pb.go:283-289`).
- New messages (`proxy_service.proto:145-175,448-502`): `MiddlewareConfig`, `MiddlewareSlot` enum, `CheckLLMPolicyLimitsRequest`/`Response`, `RecordLLMUsageRequest`/`Response`.
- New `PathTargetOptions` fields 8-13 (`proxy_service.proto:124-140`): `capture_max_request_bytes`, `capture_max_response_bytes`, `capture_content_types`, `middlewares`, `agent_network`, `disable_access_log`. All default-false / zero; pre-existing fields 1-7 byte-for-byte unchanged.
- `AccessLog.agent_network = 18` (`proxy_service.proto:258-261`).
- `RecordLLMUsageRequest.group_ids = 8` (`proxy_service.proto:496-498`) — so the record path can fan out to every applicable budget rule's window without a re-lookup.
- 15 new OpenAPI component schemas (`openapi.yml:5072-5829`): `AgentNetworkProvider[Request|Model]`, `AgentNetworkCatalog{Model,Provider,IdentityInjection,HeaderPairInjection,JSONMetadataInjection,ExtraHeader}`, `AgentNetworkPolicy[Request|TokenLimit|BudgetLimit|Limits]`, `AgentNetworkGuardrail[Checks|Request]`, `AgentNetworkConsumption`, `AgentNetworkSettings[Request]`, `AgentNetworkBudgetRule[Request]`.
- 9 new path groups (`openapi.yml:12797-13460`): `/api/agent-network/{consumption,settings,budget-rules,budget-rules/{ruleId},catalog/providers,providers,providers/{providerId},policies,policies/{policyId},guardrails,guardrails/{guardrailId}}`.
- Four typed NotFound errors (`shared/management/status/error.go:208-227`).
-
-## Invariants
- **Field-number monotonicity.** Every new proto field uses a previously-unallocated number in its message: `PathTargetOptions` 8-13 (was 1-7), `AccessLog` 18 (was 1-17), `RecordLLMUsageRequest` 8. `SendStatusUpdateRequest.inbound_listener = 50` (pre-existing) reserves 50+ for observability extensions, so 8 on `RecordLLMUsageRequest` doesn't conflict.
- **Old proxies stay compatible.** Old management never sends `disable_access_log`/`middlewares`/`agent_network` (zero value → existing behaviour); old proxies that don't decode these fields just drop them silently (proto3 unknown-field semantics) — log emission stays on. No pre-existing field number changed: the proto change is insertions only.
- **Old management stays compatible.** The two new RPCs are registered on the same `management.ProxyService` descriptor; old proxies hitting them get `codes.Unimplemented` from the unimplemented embed (`proxy_service_grpc.pb.go:283-289`), which is the same fallback pattern `SyncMappings` already documents (`proxy_service.proto:20-21`).
- **OpenAPI shapes are append-only.** New schemas are placed at the end of `components.schemas` (line 5072+); new paths at the end of `paths` (line 12797+). No existing schema's `required` list, enum, or property type was changed.
- **`*Request` vs response asymmetry.** Read shapes (`AgentNetworkProvider`, `AgentNetworkPolicy`, `AgentNetworkGuardrail`, `AgentNetworkSettings`, `AgentNetworkBudgetRule`) require `created_at`/`updated_at`; the matching `*Request` shapes do not — server fills them. `AgentNetworkProviderRequest.api_key` is write-only (`openapi.yml:5158-5161` "never returned in responses"); reviewers should confirm the response schema (5072-5138) actually omits `api_key`.
-
-## Things to scrutinize
-### Correctness
- `RecordLLMUsageRequest` carries both `group_id` (singular, the attribution group — field 3) and `group_ids` (plural, full membership — field 8). `b22d5a181` adds field 8 to drive account-budget fan-out; double-check that consumers can't accidentally key counters on the wrong one. Field comments at `proxy_service.proto:489-491` and `496-498` distinguish them but it's the kind of subtle thing a follow-up commit might collapse.
- `PathTargetOptions.disable_access_log` is the only field whose default-false meaning **changes semantics** on the proxy side: false → log (status quo), true → suppress. Synthesizer sets `DisableAccessLog = !settings.EnableLogCollection`, so a missing/default settings row yields `EnableLogCollection=false → DisableAccessLog=true → suppressed`. Worth confirming downstream (`agentnetwork.synthesizer`) that operator-defined private services never inherit this flag — the proto field default protects them, but only if synth code is explicit.
- `CheckLLMPolicyLimitsResponse.decision` is a free-form `string` (`proxy_service.proto:471`) rather than an enum. Only documented values are "allow" / "deny". An enum would prevent typo drift; consider before this RPC ships to external consumers.
- `deny_code` (`proxy_service.proto:478-481`) is documented as "a stable label" but is also a free string. Pin the allowed set somewhere observable to the proxy.
-
-### Security
- `AgentNetworkProvider.api_key` MUST be write-only. Schema split (request has it at line 5158; response omits it) looks correct, but a regression here leaks the upstream provider credential to every dashboard reader. Check that the handler explicitly zeros it on the response path.
- `extra_values` / `identity_header_*` headers on `AgentNetworkProvider` get stamped onto upstream requests. Description at `openapi.yml:5099` says "values not declared by the catalog are ignored at synth time" — a contract this module documents but the synthesizer must enforce. Confirm the synth module honours it.
- Cluster + subdomain on `AgentNetworkSettings` are documented immutable (`openapi.yml:5686-5694`) and the `AgentNetworkSettingsRequest` (lines 5733-5752) doesn't accept them. Verify the `PUT /api/agent-network/settings` handler can't be tricked by extra JSON keys (oapi-codegen's `additionalProperties: false` is not declared here; spec defaults to permissive).
-
-### Backward compatibility
- The proto change is field-number additive: every previously numbered field keeps the same name + type, and the change is insertions only (no deletions in `proxy_service.proto`), so this holds at the source-text level.
- `proxy_service_grpc.pb.go` adds two RPC handlers and registers them in `ProxyService_ServiceDesc.Methods` (lines 543-552). The existing entries are unchanged and order-preserving — gRPC method dispatch is name-keyed, so order doesn't matter, but reviewing the diff (no method renamed/dropped) is still worth a glance.
- OpenAPI 3.0 doesn't have a built-in deprecation flow for paths; if any client tooling iterates `paths.*`, the additive routes shouldn't break it, but generated SDKs (especially the dashboard's) need a regen to gain access to `AgentNetwork*`.
-
-### Codegen pinning
- `generate.sh` (`shared/management/http/api/generate.sh:14`) installs `oapi-codegen@latest` rather than a pinned version. **This is a reproducibility gap** — re-running the script later may produce a different `types.gen.go`. Either pin the version in `generate.sh` (e.g. `@v2.7.0`) or document the pin in a `tools.go`.
- proto codegen has the protoc / protoc-gen-go version stamped in the generated file header (`proxy_service.pb.go:3-4`).
- Regenerate locally and confirm zero diff against the committed `types.gen.go` / `proxy_service.pb.go`.
-
-## Test coverage
-| Test file | Locks down |
-| --------- | ---------- |
-| None in this scope | The proto and OpenAPI sources are tested transitively by the handler tests (`shared/management/http/handlers/agentnetwork/...`) and by the synthesizer/manager tests (`management/server/agentnetwork/...`). No round-trip serialisation test exists in the `proto/` or `api/` packages themselves. |
-| `shared/management/proto/*_test.go` | (absent) |
-| `shared/management/http/api/*_test.go` | (absent) |
-
-Acceptable for codegen artefacts, but a single golden-file test that re-runs `oapi-codegen` and `protoc` in CI and diffs against the checked-in files would close the reproducibility gap noted above.
-
-## Known limitations / explicit non-goals
- **No deprecation surface.** Old fields/RPCs are kept silently; there is no `[deprecated = true]` annotation on anything. Acceptable here because nothing is being removed.
- **No proto-side validation.** Numeric ranges (e.g. `window_seconds >= 60`, `cost_usd >= 0`, capture-byte clamps) are enforced in the OpenAPI schema via `minimum:` and inside Go code by the proxy/management, but `proto3` itself can't express them; downstream is expected to validate every message.
- **`MiddlewareConfig.config_json` is `bytes`** (`proxy_service.proto:163`) — opaque to the proto layer. Schema validity is the middleware factory's problem. This is a deliberate tradeoff (per the comment at 161-162) but worth flagging: a corrupted/malicious config_json can only fail at proxy apply time, not at the wire-decode step.
- **No catalog endpoint schema for the catalog itself** — the catalog data ships as a `GET /api/agent-network/catalog/providers` returning `[AgentNetworkCatalogProvider]` (`openapi.yml:13024`), but the catalog source-of-truth lives in `management/server/agentnetwork/catalog`, not here.
- The reaper / GC design was cut from scope; no reaper-related types appear here.
-
-## Cross-references
- Downstream: [management/store](20-management-store.md), [management/agentnetwork](21-management-agentnetwork.md), [management/handlers + wiring](22-management-handlers-wiring.md), [proxy/runtime](33-proxy-runtime.md)
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level: [../00-overview.md](../00-overview.md)
--- a/docs/agent-networks/modules/20-management-store.md
+++ b/docs/agent-networks/modules/20-management-store.md
@@ -1,112 +0,0 @@
-# management/store — persistence for agent-network entities
-
-> **Risk level:** Medium — six brand-new tables behind AutoMigrate, one upsert-counter table that runs on the request hot path, and one column carrying an encrypted secret.
-> **Backward-compat impact:** Additive (six new tables created by AutoMigrate; the `Store` interface gains 23 methods, but no existing column/index is touched).
-
-## Module boundary
-
-This module is the persistence layer for the Agent Network feature. Everything the management server stores about LLM proxying — providers, policies, guardrails, the per-account settings row, a usage-counter table written on every proxied LLM request, and the account-budget rules — flows through the methods added to `store.Store`. The module owns six tables, six entity types from `management/server/agentnetwork/types`, and a single hot-path upsert (`IncrementAgentNetworkConsumption`) consumed by the proxy fleet.
-
-Out of scope here: the catalog of provider definitions (compiled-in, no DB), the synthesizer/manager built on top of these CRUDs (covered in [21-management-agentnetwork.md](21-management-agentnetwork.md)), and the HTTP handlers that translate API requests into Save/Delete calls.
-
-## Files
-
-| Path | Role |
-| ---- | ---- |
-| `management/server/store/sql_store_agentnetwork.go` | gorm implementations of all 23 store methods |
-| `management/server/store/sql_store_agentnetwork_budgetrule_test.go` | round-trip + account-scoping coverage against a real sqlite store |
-| `management/server/store/sql_store.go` | one import, six entities appended to the `AutoMigrate` slice (sql_store.go:40, sql_store.go:141-142) |
-| `management/server/store/store.go` | 23 methods added to the `Store` interface (store.go:328-354) |
-| `management/server/store/store_mock_agentnetwork.go` | mockgen output for the new interface surface |
-
-## Tables added / migrations
-
-All six tables are created by `db.AutoMigrate` invoked from `NewSqlStore` at sql_store.go:133-143. There is no hand-rolled SQL migration script — the schema is whatever GORM derives from the struct tags.
-
- `agent_network_providers` — `Provider.TableName()` at provider.go:76. PK `id`, index on `account_id`, named index `idx_agent_network_provider` on `provider_id`. Carries an at-rest-encrypted `api_key` and ed25519 `session_private_key` (provider.go:35,56). `extra_values` and `models` are JSON blobs (`serializer:json`).
- `agent_network_policies` — `Policy.TableName()` at policy.go:70. PK `id`, index on `account_id`. JSON columns: `source_groups`, `destination_provider_ids`, `guardrail_ids`, `limits`.
- `agent_network_guardrails` — `Guardrail.TableName()` at guardrail.go:41. PK `id`, index on `account_id`. JSON `checks`.
- `agent_network_settings` — `Settings.TableName()` at settings.go:33. PK `account_id` (one row per account), named index `idx_agent_network_settings_cluster_subdomain` on `subdomain` only — the index name implies a composite, but only one column is tagged.
- `agent_network_consumption` — `Consumption.TableName()` at consumption.go:46. Composite PK across `(account_id, dim_kind, dim_id, window_seconds, window_start_utc)` — the same tuple the upsert keys on.
- `agent_network_budget_rules` — `AccountBudgetRule.TableName()` at budgetrule.go:35. PK `id`, index on `account_id`. JSON `target_groups`, `target_users`, `limits`.
-
-## CRUD surface added
-
-Provider, Policy, Guardrail, BudgetRule follow the same pattern: `Get<Kind>ByID`, `GetAccount<Kind>` (list), `Save<Kind>` (upsert), `Delete<Kind>`, with account-scoping enforced by the existing `accountAndIDQueryCondition` / `accountIDCondition` constants (sql_store.go:59-62). Provider additionally exposes `GetAllAgentNetworkProviders` (cross-account, used by the synthesizer). Settings exposes `Get`/`GetByCluster`/`Save` (no delete — one row per account, created on first save). Consumption exposes the upsert `Increment`, a point `Get`, and a cross-window `List`.
-
-## Architecture & flow
-
-```mermaid
-flowchart LR
-    handlers["HTTP handlers<br/>(management/server/agentnetwork)"] -->|Save/Delete| iface["Store interface<br/>store.go:328-354"]
-    manager["agentnetwork.Manager"] -->|Get*| iface
-    synth["synthesizer<br/>(global)"] -->|GetAllAgentNetworkProviders| iface
-    proxy["proxy fleet<br/>(hot path)"] -->|IncrementAgentNetworkConsumption| iface
-    iface --> sql["SqlStore methods<br/>sql_store_agentnetwork.go"]
-    iface -.gomock.-> mock["MockStore<br/>store_mock_agentnetwork.go"]
-    sql --> gorm["gorm.DB"]
-    gorm --> tables[("6 tables<br/>agent_network_*")]
-    sql --> enc["crypt.FieldEncrypt<br/>(provider only)"]
-```
-
-Reads decrypt provider secrets in-place; writes do `provider.Copy().EncryptSensitiveData(...)` before `db.Save` so the caller's in-memory object keeps the plaintext `api_key` (sql_store_agentnetwork.go:88-102). Every list/get takes a `LockingStrength` and applies `clause.Locking{Strength: ...}` when non-`None` — matching the rest of the store. The upsert path uses `clause.OnConflict` with `gorm.Expr` server-side increments so concurrent proxy nodes converge without read-modify-write races (sql_store_agentnetwork.go:321-335).
-
-## Invariants enforced at the store layer
-
- **Account scoping.** Every entity-by-ID method keys on `account_id = ? and id = ?`; no cross-tenant leak path through the API is reachable as long as callers always pass the auth'd `accountID` (sql_store_agentnetwork.go:70,141,201,429).
- **NotFound mapping.** `gorm.ErrRecordNotFound` is translated to typed `status.NewAgentNetwork*NotFoundError`; `Delete*` returns NotFound when `RowsAffected == 0` (sql_store_agentnetwork.go:111-113,171-173,231-233,461-463).
- **Provider secret encryption at rest.** `SaveAgentNetworkProvider` always encrypts before persist; `Get*` always decrypts after read. The plaintext `api_key` never reaches the DB through this layer (sql_store_agentnetwork.go:31,54,80,90).
- **Consumption monotonicity.** The upsert only ever issues `col = col + ?` for the three counter columns — no decrement path exists (sql_store_agentnetwork.go:330-332).
- **Window alignment is the caller's responsibility.** The store stamps `WindowStartUTC` as-passed; alignment to epoch happens in `types.WindowStart` at consumption.go:51-58.
- **Settings has no Delete.** Intentional — one row per account, created on first save; the row sticks around for the account lifetime.
-
-## Things to scrutinize
-
-### Correctness
- `SaveAgentNetworkProvider` saves the copy (sql_store_agentnetwork.go:95). The caller's in-memory pointer therefore keeps plaintext `api_key` and any `CreatedAt`/`UpdatedAt` gorm autofills land on the copy, not the original. Callers that need synced timestamps must re-fetch.
- `IncrementAgentNetworkConsumption`'s `Create` provides initial counter values (`TokensInput: tokensIn`, etc.) in the row, and on conflict the assignments add the same deltas to the existing values. The insert-vs-update arithmetic is consistent. Cross-check that no engine in use (sqlite, postgres, mysql) silently rejects the `OnConflict` clause — GORM emits engine-specific SQL but `ON DUPLICATE KEY UPDATE` (mysql) vs `ON CONFLICT (...)` (sqlite/postgres) need their unique constraint to match the composite PK on `agent_network_consumption`; it does, by construction.
- `IncrementAgentNetworkConsumption` writes `updated_at: time.Now().UTC()` literally inside the assignments map (sql_store_agentnetwork.go:333) — fine, but it's a Go-side timestamp captured at call time, not a DB-side `now()`. Acceptable for an audit field.
- `GetAgentNetworkConsumption` returns a zero-valued non-nil row on `ErrRecordNotFound` (sql_store_agentnetwork.go:364-371). Document or rename — a typed sentinel error would be more orthodox; callers must know not to error-check.
-
-### Concurrency / transactions
- Hot-path `IncrementAgentNetworkConsumption` runs outside any explicit transaction; concurrency safety relies entirely on the DB serialising the `ON CONFLICT` upsert against the composite PK. This is correct for postgres and mysql; for sqlite it serialises behind the single writer.
- `SaveAgentNetworkSettings` is a blind upsert with no version/etag — concurrent writes from two operators last-write-wins on the collection-toggle flags (settings.go:23-25). Acceptable for admin-curated state but worth flagging.
- `Save*Provider` uses `db.Save` on a struct with a PK already set — GORM emits UPDATE or INSERT based on row existence. No upsert clause is attached, so a race between two creates with the same generated `xid` (vanishingly unlikely) would surface as a PK violation.
-
-### Migration safety
- All six tables ride `AutoMigrate` (sql_store.go:141-142). AutoMigrate is additive: new columns get added, but it never drops columns nor narrows types. Three `bool` columns on `agent_network_settings` (`EnableLogCollection`, `EnablePromptCollection`, `RedactPii`) default to false at the GORM/DDL layer for existing rows; the test at sql_store_agentnetwork_budgetrule_test.go:83-112 locks that down on a fresh sqlite. Verify postgres/mysql produce the same default.
- The named index `idx_agent_network_settings_cluster_subdomain` on settings.go:15 is declared on only `subdomain`. Either the cluster column also needs `gorm:"index:idx_agent_network_settings_cluster_subdomain"` to make it composite, or the name is misleading.
- The named index `idx_agent_network_provider` on `Provider.ProviderID` (provider.go:30) is *not* unique and not scoped to account — two providers in the same account with the same `provider_id` are permitted at the DB layer; uniqueness, if any, must live above the store.
-
-### Backward compatibility
- Net additive. No removed methods, no renamed columns, no schema change to existing tables. Existing deployments running a prior binary continue to work; the first boot of the new binary creates the six tables.
- The `Store` interface grows by 23 methods (store.go:330-354); any non-mock external implementer of `store.Store` will fail to compile. The repo only has `SqlStore` + `MockStore`, both updated.
-
-### Performance (indexes, N+1)
- All by-account list queries hit the `idx_account_id` per-table index. No N+1: list methods return the full slice in one query.
- `GetAgentNetworkSettingsByCluster` (sql_store_agentnetwork.go:263-277) does a tablescan on `cluster` — no index. Tolerable for the bootstrap label generator (one-shot at provisioning) but worth noting if the call moves onto a hot path.
- `ListAgentNetworkConsumption` returns every row ever recorded for the account (sql_store_agentnetwork.go:382-400) — unbounded growth, no `LIMIT`, no time filter. With one row per (dim, window) per request burst, this table grows fastest of the six; a retention job + a paginated list method are obvious follow-ups.
-
-## Test coverage
-
-| Test file | Locks down |
-| --------- | ---------- |
-| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkBudgetRule_RealStore_RoundTrip` | full save → reload of `AccountBudgetRule` including the JSON-serialised `PolicyLimits`, target slices, double-delete returns NotFound (lines 18-59) |
-| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkBudgetRule_RealStore_ScopedByAccount` | cross-account isolation for budget rules (lines 63-78) |
-| `sql_store_agentnetwork_budgetrule_test.go::TestAgentNetworkSettings_RealStore_CollectionTogglesRoundTrip` | collection toggles default off, survive save/reload at the set values (lines 83-112) |
-
-Gap: there is no store-level test for providers (encryption round-trip), policies, guardrails, or `IncrementAgentNetworkConsumption` (concurrent upsert, window-key uniqueness). The consumption upsert is the most performance-sensitive method in this module and the only one without a real-sqlite test.
-
-## Known limitations / explicit non-goals
-
- No retention / GC for `agent_network_consumption`.
- No `Delete` for `Settings` (one row per account, cleared with the account).
- No DB-engine-specific tuning — the same struct tags drive sqlite, mysql, postgres.
- Provider `extra_values` and `models` are JSON blobs; querying inside them is not supported by design.
- `GetAgentNetworkConsumption` "not-found = zero row" contract is convenient but unconventional.
-
-## Cross-references
-
- Upstream: [shared/api](10-shared-api.md), [management/agentnetwork](21-management-agentnetwork.md)
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level: [../00-overview.md](../00-overview.md)
--- a/docs/agent-networks/modules/21-management-agentnetwork.md
+++ b/docs/agent-networks/modules/21-management-agentnetwork.md
@@ -1,225 +0,0 @@
-# management/agentnetwork — domain layer + synth pipeline
-
-> **Risk level:** High — central business logic + budget enforcement + the source of every middleware-chain change the proxy executes.
-> **Backward-compat impact:** Additive within the agent-network surface; one **behavioural difference for opted-out accounts** in parser capture (the capture flag is stamped explicitly false instead of being absent — see capture-pointer semantics below). Non-agent-network proxy services are untouched (the synth chain only ships on `agent-net-svc-*` targets).
-
-## Module boundary
-
-`management/server/agentnetwork` owns every agent-network entity (providers, policies, guardrails, account budget rules, per-account settings, consumption rows) and **translates them into the in-memory `*rpservice.Service` that the reverse-proxy controller turns into `proto.ProxyMapping`s and pushes to clusters**. It is the *only* writer of the agent-network middleware chain.
-
-Inside the package: `manager.go` is the CRUD + permissions-gated facade; `synthesizer.go` walks settings + providers + policies + guardrails and emits the per-account service plus every middleware's JSON config; `policyselect.go` runs per-request attribution (min-wins account ceiling, then "drain bigger pool first"); `reconcile.go` diffs successive synth outputs and emits precise Create/Update/Delete proxy-mapping updates plus a peer-map refresh. `labelgen/` mints DNS-safe subdomain labels; `catalog/` is the static provider catalogue; `types/` carries gorm entity structs. The `_realstack_test.go` files in the parent `management/server/` directory exercise the manager + network-map controller end-to-end with no mocks.
-
-## Files
-
-| Path | Role |
-| ---- | ---- |
-| `agentnetwork/manager.go` | Manager interface + CRUD + permission gates + bootstrap-settings + reconcile trigger |
-| `agentnetwork/synthesizer.go` | Settings/policy → wire-format synthesis; sole writer of the proxy middleware chain |
-| `agentnetwork/policyselect.go` | Per-request policy attribution + account-budget ceiling (min-wins) |
-| `agentnetwork/reconcile.go` | Per-account synth diff vs in-memory cache → Create/Update/Delete |
-| `agentnetwork/catalog/catalog.go` | Static provider catalogue (auth headers, identity-injection shapes) |
-| `agentnetwork/labelgen/{labelgen,words}.go` | DNS-safe subdomain picker + curated wordlist |
-| `agentnetwork/types/provider.go` | Provider entity + APIKey + Models + ExtraValues + SessionKeys |
-| `agentnetwork/types/policy.go` | Policy entity + `PolicyLimits` (token + budget) |
-| `agentnetwork/types/guardrail.go` | Guardrail entity (`ModelAllowlist`, `PromptCapture`) |
-| `agentnetwork/types/budgetrule.go` | `AccountBudgetRule` (reuses `PolicyLimits`) |
-| `agentnetwork/types/settings.go` | Per-account `Settings` (Cluster, Subdomain, 3 toggles) |
-| `agentnetwork/types/consumption.go` | `Consumption` row + `WindowStart` aligner |
-| `agentnetwork/{synthesizer,policyselect,reconcile,wire_shape}_*test.go` | See test coverage table |
-| `agentnetwork/types/consumption_test.go` | `WindowStart` alignment proofs |
-| `agentnetwork/labelgen/labelgen_test.go` | Deterministic picks + exhaustion + fallback |
-| `management/server/agentnetwork_realstack_test.go` | No-mock provider CRUD → network-map fan-out |
-| `management/server/agentnetwork_budgetrule_realstack_test.go` | No-mock budget-rule CRUD + settings preserve-immutable |
-
-## Architecture & flow
-
-### Synthesis (settings/policy → wire format)
-
-```mermaid
-flowchart TD
-    A[Mutation: provider/policy/guardrail/settings] --> B[managerImpl.reconcile accountID]
-    B --> C{proxyController nil?}
-    C -- yes --> D[accountManager.UpdateAccountPeers only]
-    C -- no --> E[SynthesizeServices]
-    E --> F[loadSettings — NotFound returns ok=false, no synth]
-    F --> G[filterEnabledProviders sorted by CreatedAt]
-    G --> H[filterEnabledPolicies]
-    H --> I[backfillProviderSessionKeys if missing]
-    I --> J[indexProviderGroups: providerID -> sorted source groups]
-    J --> K[buildRouterConfigJSON drops orphan providers]
-    J --> L[buildIdentityInjectConfigJSON per catalog entry]
-    H --> M[mergeGuardrails: union allowlist, OR redact]
-    M --> N[applyAccountCollectionControls account toggle = SOLE capture control]
-    N --> O[marshalGuardrailConfig]
-    K --> P[buildMiddlewareChain 8 middleware entries]
-    L --> P
-    O --> P
-    P --> Q[buildAccountService: AccessGroups=union source groups, noop.invalid target]
-    Q --> R[reconcile.diffMappings vs cache]
-    R --> S[SendServiceUpdateToCluster CREATE/MODIFY/REMOVE]
-    R --> T[accountManager.UpdateAccountPeers — fans synth ACLs into network map]
-```
-
-### Budget rule resolution (min-wins, group+user bound)
-
-```mermaid
-flowchart TD
-    A[SelectPolicyForRequest in] --> B[checkAccountBudget — runs FIRST, independent of policies]
-    B --> C[GetAccountAgentNetworkBudgetRules]
-    C --> D{for each enabled rule}
-    D --> E{budgetRuleApplies?}
-    E -- no --> D
-    E -- yes --> F[attrGroup = lowestIntersect TargetGroups, in.GroupIDs]
-    F --> G{Token cap enabled?}
-    G -- yes --> H[evalTokenCap user dim + group dim]
-    H --> I{exhausted?}
-    I -- yes --> J[DENY: llm_account.token_cap_exceeded - STOP]
-    I -- no --> K{Budget cap enabled?}
-    G -- no --> K
-    K -- yes --> L[evalBudgetCap user dim + group dim]
-    L --> M{exhausted?}
-    M -- yes --> N[DENY: llm_account.budget_cap_exceeded - STOP]
-    M -- no --> D
-    K -- no --> D
-    D --> O[All rules passed -> fall through to per-policy selection]
-```
-
-Key invariant: **rules are checked sequentially and ANY exhausted rule denies (all-must-pass / min-wins).** Untargeted rules (`len(TargetGroups)==0 && len(TargetUsers)==0`) apply to every caller (`policyselect.go:393`).
-
-### Policy selection (per-peer, per-request)
-
-```mermaid
-flowchart TD
-    A[Account-budget gate passed] --> B[GetAccountAgentNetworkPolicies]
-    B --> C[filterApplicablePolicies enabled + provider match + group intersect]
-    C --> D{candidates empty?}
-    D -- yes --> E[Allow, empty SelectedPolicyID]
-    D -- no --> F[scoreCandidates -> scoreOne per policy]
-    F --> G[scoreOne: attrGroup + window]
-    G --> H{any cap exhausted?}
-    H -- yes --> I[Drop policy; record last deny code]
-    H -- no --> K[Keep as live candidate]
-    F --> L{live candidates exist?}
-    L -- no --> M[Deny with last exhaustion code]
-    L -- yes --> N[Sort: uncapped wins -> larger group token -> group budget -> user token -> user budget -> oldest CreatedAt]
-    N --> O[winner = scored 0]
-    O --> P[Allow + SelectedPolicyID + AttributionGroupID + WindowSeconds]
-```
-
-End-to-end: a mutation calls `managerImpl.reconcile(ctx, accountID)` (`manager.go:205,239,...`). Reconcile defers an `accountManager.UpdateAccountPeers` so the network-map controller re-runs and `injectAllProxyPolicies` picks up the new access groups; with a `proxyController` wired, it re-synthesizes the service, diffs against `reconcileCache[accountID]` (guarded by `reconcileMu`), and emits proto mappings to the cluster derived from the mapping's domain (`reconcile.go:120`). Synthesis is stateless and idempotent. Sole persistent side effect: `backfillProviderSessionKeys` (`synthesizer.go:249`) mints ed25519 keys on legacy provider rows and writes them back.
-
-At request time the path is independent: the proxy calls `SelectPolicyForRequest` (`policyselect.go:56`); account-budget ceiling first, then per-policy scoring. Token + budget caps share `evalTokenCap` / `evalBudgetCap` — same primitive for account rules and policy limits, `label` differentiates the deny reason. After a served request, `RecordAccountBudgetUsage` (`policyselect.go:415`) fans deltas to every applicable rule's distinct `(dim_kind, dim_id, window)` tuple, deduplicating to prevent double-count when two rules share target+window.
-
-## Public contracts
-
- **Manager interface** (`manager.go:48-80`): CRUD for `Providers/Policies/Guardrails/BudgetRules`; `GetSettings/UpdateSettings` (cluster + subdomain immutable, only the three toggles mutate); `ListConsumption/RecordConsumption(account, kind, dimID, windowSec, in, out, USD)`; `RecordAccountBudgetUsage(account, user, groups, in, out, USD)`; `SelectPolicyForRequest(ctx, PolicySelectionInput) → *PolicySelectionResult{Allow, SelectedPolicyID, AttributionGroupID, WindowSeconds, DenyCode, DenyReason}`.
- **`PolicySelectionInput`** (`manager.go:85-90`): `{AccountID, UserID, GroupIDs, ProviderID}` — populated by the proxy from CapturedData + `llm_router` resolution.
- **Synthesized middleware chain** (`synthesizer.go:576-657`), order load-bearing — response slot runs reverse-of-slice:
-
-  | Slot | Idx | ID | ConfigJSON shape | CanMutate |
-  | --- | --- | --- | --- | --- |
-  | on_request | 0 | `llm_request_parser` | `{"capture_prompt": <bool>, "redact_pii"?: true}` | – |
-  | on_request | 1 | `llm_router` | `{"providers":[{id, models[], upstream_*, auth_header_*, allowed_group_ids[]}]}` | **true** |
-  | on_request | 2 | `llm_limit_check` | `{}` | – |
-  | on_request | 3 | `llm_identity_inject` | `{"providers":[{provider_id, header_pair?, json_metadata?, extra_headers?}]}` | **true** |
-  | on_request | 4 | `llm_guardrail` | `{"model_allowlist"?, "prompt_capture":{enabled,redact_pii}}` | – |
-  | on_response | 5 | `llm_limit_record` | `{}` (runs LAST at runtime) | – |
-  | on_response | 6 | `cost_meter` | `{}` | – |
-  | on_response | 7 | `llm_response_parser` | `{"capture_completion": <bool>, "redact_pii"?: true}` | – |
- **Synthesized service shape** (`synthesizer.go:739`): `Mode=HTTP`, `Private=true`, `Domain=<subdomain>.<cluster>`, `AccessGroups=unionSourceGroups(enabledPolicies)`, one `TargetTypeCluster` target with `Host=noop.invalid:443` (router rewrites per request), `Options.{DirectUpstream,AgentNetwork}=true`, `DisableAccessLog=!settings.EnableLogCollection`, `CaptureMax{Req,Resp}Bytes=1<<20`, `CaptureContentTypes=["application/json","text/event-stream"]`.
-
-## Invariants
-
- **Min-wins / all-must-pass for account budget rules** (`checkAccountBudget`, `policyselect.go:353`): every applicable enabled rule is checked; first exhausted cap denies. Untargeted rules bind every caller.
- **Account toggle is the SOLE control for capture enablement.** `applyAccountCollectionControls` (`synthesizer.go:701`) sets `merged.PromptCapture.Enabled = settings.EnablePromptCollection` *unconditionally*.
- **Capture-pointer semantics on parser configs** — see "Things to scrutinize" below.
- **`EnableLogCollection` ↔ `DisableAccessLog` is the only access-log toggle** (`synthesizer.go:770`). Default off ⇒ access log suppressed.
- **`RedactPii` flows verbatim to BOTH parsers** (`synthesizer.go:584-585`) and is OR'd into the merged guardrail (`synthesizer.go:706`).
- **Cluster and Subdomain are immutable on Settings.** `UpdateSettings` reloads existing row and overlays only the three toggles (`manager.go:558-561`).
- **Orphan providers (no enabled policy authorises them) NEVER reach the router** (`synthesizer.go:351-357`); skipped from `identity_inject` for symmetry.
- **Provider creation refuses empty `api_key`** (`manager.go:175`); **deletion refuses while any policy still references it** (`manager.go:265-273`).
- **Session keypair stability across provider edits** (`manager.go:226-228`) — server-managed, copied through every `UpdateProvider`, never API-surfaced.
-
-## Things to scrutinize
-
-### Correctness
-
- **Capture-pointer semantics — `*bool` vs `bool`.** Three states, owned by separate sides:
-  - **Wire JSON this module emits:** `buildParserConfigJSON` (`synthesizer.go:678-693`) *always* stamps the capture field. Agent-network targets ship `"capture_prompt": false` or `"capture_prompt": true` — never absent. Same for `"capture_completion"`. The happy-path test pins `{"capture_prompt":false}` (`synthesizer_test.go:174`).
-  - **Proxy-side parser config (consumer):** parsers decode into `*bool`. Matrix:
-    - `nil` (field absent) → **legacy default = emit**. Preserved for non-agent-network callers and pre-existing tests (the backward-compat hook).
-    - `false` (field present, value false) → **suppress emission entirely**. The behaviour for opted-out agent-network accounts. Without this, `enable_log_collection=true` + `enable_prompt_collection=false` would leak raw user input AND raw model output to the access log.
-    - `true` → emit normally.
-  - **Why the synth always stamps a value:** an agent-network mapping omitting the field would hit legacy "always emit" and re-introduce the leak. The `json.Marshal` error fallback at `synthesizer.go:687` degrades to `{}` — comment-claimed unreachable, but if ever fired re-introduces the leak. Consider fail-closed (return literal `{"capture_prompt":false}`) instead.
- **`scoreCandidates` non-cumulative deny code.** Only the *last* exhausted policy's deny code survives (`policyselect.go:188-190`). Iteration order is store's natural order. Auth signal is `len(scored)==0`, so this is informational only — verify no UI depends on "first exhausted policy" semantics.
- **`effectiveWindowSeconds` token-wins tiebreak.** When both halves are enabled with different windows, token's window wins (`policyselect.go:482`). Verify `RecordLLMUsage` increments against the winning window only.
- **`RecordAccountBudgetUsage` dedup.** Two rules with the same `(kind, dim_id, window)` would double-count without the `tuples` map (`policyselect.go:434-449`). Key includes all three dimensions — correct.
- **Fail-closed on bad provider:** unknown catalog id (`synthesizer.go:794-796`) or empty API key (`synthesizer.go:801-803`) drops the **entire** account's synth, not just the bad provider. Confirm matches operator UX.
-
-### Security
-
- **Redact OR-merge:** merged `RedactPii` = account OR guardrail (`synthesizer.go:706`). **Parser-side flag is `settings.RedactPii` only, NOT the OR** — a guardrail-only opt-in does not propagate to parsers. Correct because the account toggle gates capture, but worth noting on the proxy side.
- **Group resolution must not leak across accounts.** Every store call carries `accountID` (`policyselect.go:73, 286, 298, 322, 334, 354`); `lowestIntersect` uses caller's claimed groups only (`policyselect.go:494`). Risk surface is upstream (handler populates `in.GroupIDs`).
- **`UpdateSettings` preserves immutable Cluster + Subdomain** (`manager.go:558`). A client can't rebind the cluster.
- **Provider session keypair backfill writes through `SaveAgentNetworkProvider`** (`synthesizer.go:256`) from a read-shaped call. Idempotent → worst case is a wasted write under concurrent reconcile + snapshot.
-
-### Concurrency
-
- **`reconcileMu`** guards `reconcileCache`. Lock window is narrow — compute diff inside, send outside (`reconcile.go:56-68`).
- **`labelRngMu`** guards `labelRng` because `math/rand.Source` is unsafe for concurrent use (`manager.go:638-640`).
- **Real-store tests** use `store.NewTestStoreFromSQL` with `t.TempDir()` per test — no shared state, no `t.Parallel()`.
- **`RecordAccountBudgetUsage` dedup `tuples` map is per-call;** concurrent calls fan out fully — correct (each request's tokens book once per applicable rule).
- **Deferred `UpdateAccountPeers` runs inline after the proxy push** (`reconcile.go:28-35`); a slow call stretches CRUD response time.
-
-### Backward compatibility
-
- **Capture-pointer semantics (restated):** non-agent-network callers see no field → legacy nil-default emit, identical to pre-PR. Agent-network targets always carry an explicit `capture_*` value.
- **`TestSynthesizeServices_HappyPath` was updated:** request-parser config moved from `{}` to `{"capture_prompt":false}` (`synthesizer_test.go:174`). External snapshot tests against synth output need updating.
- **`MergedGuardrails` retains zeroed `TokenLimits`/`Budget`/`Retention`** even though `Policy.Limits` carries the real values now; `llm_limit_check` is the authoritative enforcement. Comment at `synthesizer.go:940-948` calls this out.
-
-### Performance
-
- **`SynthesizeServices` runs on every controller tick / mutation reconcile.** Cost: 4 store reads + optional per-provider keypair backfill. Sort + index + merge are O(N log N) / O(P × G); dominant cost is JSON marshalling. No nested loops escape these dimensions.
- **`reconcile.diffMappings` is O(N + M)** with N=M=1 per account today — effectively constant.
- **`SynthesizeServicesForCluster`** (`synthesizer.go:71`) walks every account on a cluster; per-account failures are **swallowed** (`synthesizer.go:91-93`) so a single misconfigured account doesn't drop the cluster. Runs per proxy reconnect.
-
-### Observability
-
- **Activity codes:** `AgentNetwork{Provider,Policy,Guardrail,BudgetRule}{Created,Updated,Deleted}`; `AgentNetworkSettingsUpdated` with `log_collection/prompt_collection/redact_pii` payload (`manager.go:567-571`). **No activity code for `SelectPolicyForRequest` denies** — surfaced via proxy access log only (likely intentional given volume).
- **Deny codes** namespaced: `llm_policy.{token,budget}_cap_exceeded`, `llm_account.{token,budget}_cap_exceeded` (`policyselect.go:18-26`).
- **Reconcile failures are logged at warn and swallowed** (`reconcile.go:42-44`). Persistent synth failures (e.g. unknown catalog id) silently keep the proxy out of sync — consider a manager-level synth-health surface if this becomes a support burden.
-
-## Test coverage
-
-| Test file | Locks down |
-| --------- | ---------- |
-| `synthesizer_test.go` | Mock-store: `HappyPath` (8-mw chain ordering, `{"capture_prompt":false}` baseline); `No{Settings,Providers}`; `Disabled{Provider,Policy}_NoService`; `RouterConfigOrdering`; `PolicyCheckConfig_UnionsSourceGroups`; `OrphanProvider_HasEmptyAllowedGroups`; identity-inject for LiteLLM / Bifrost (overrides + partial disable) / Cloudflare / Portkey / Vercel / OpenRouter / generic non-customizable; `GuardrailMerge_AllowlistUnion_LimitsRestrictive`; `BackfillsMissingSessionKeys`; `HTTPUpstream_KeepsExplicitPort`; `UpstreamURLPath_FlowsToRouter`; `UnknownProviderID_FailsClosed`; `EmptyAPIKey_FailsClosed`. |
-| `synthesizer_realstore_test.go` | Real-sqlite: `SurvivesStatusToggle` reproduces the disable/re-enable 403 regression; `Reconcile_RealStore_PushesPrivateAfterStatusToggle` extends through reconcile push. |
-| `synthesizer_guardrail_realstore_test.go` | `PromptCaptureAccountIsSoleControl`; `PromptCaptureFlowsWhenAccountOptsIn`; `AccountRedactWithoutGuardrailRedact`; `NoGuardrail_CaptureOff`. |
-| `synthesizer_log_collection_realstore_test.go` | `LogCollection{Off_SuppressesAccessLog,On_PermitsAccessLog}` — verifies `DisableAccessLog` propagation through `ToProtoMapping`. |
-| `synthesizer_parser_redact_realstore_test.go` | **Capture-pointer regression suite:** `ParserConfigsCarryRedactPii`; `ParserConfigsSuppressCaptureWhenLogCollectionOnly` (log=on/prompt=off ⇒ both capture flags false); `ParserConfigsOmitRedactPiiWhenOff`. |
-| `policyselect_test.go` | Mock-store: `NoApplicablePolicies`; `AllowWithLowestGroupAttribution`; `LargerPoolWinsAcrossUsageLevels`; `StaysOnLargerPoolAfterPartialDrain`; `FallsThroughToSmallerPoolWhenLargerExhausted`; `TiebreakBy{LargerGroupPool,CreatedAt}`; `DeniesWhenAllExhausted`; `UncappedPolicyAlwaysWinsAgainstCapped`; `DisabledPolicyIgnored`; `StoreErrorPropagates`; `RejectsEmptyAccount`; `SharesGroupCounterAcrossPolicies`; `AntiFallThroughOnLowestGroup`; `BudgetOnlyExhaustionDenies`; `BudgetTighterThanTokenWins`. |
-| `policyselect_realstore_test.go` | Real-sqlite regression guard: `NoApplicablePolicies`; `AllowAndLowestGroupAttribution`; `LargerPoolWins_FallsThroughWhenExhausted`; `BudgetCapDenies`; `GroupCounterSharedAcrossPolicies`; `DisabledPolicyIgnored`. |
-| `policyselect_account_realstore_test.go` | Account budget rules: `AccountCeilingBindsEvenWithUncappedPolicy` (min-wins); `AccountGroupCeiling`; `AccountTargetUsersBindsOnlyThatUser`; `AccountRuleRecordsToOwnWindow`. |
-| `reconcile_test.go` | `FirstSynth_EmitsCreate`; `NoChange_EmitsNothingExtra` (re-push as Modified — verify desired); `PolicyRemoved_EmitsDelete`; `NilProxyController_NoOp`; `EmptyAccountID_NoOp`; `ClusterFromMapping`. |
-| `wire_shape_test.go` | `TestSynthesizedService_WireShape` — proto-shape lockdown via `ToProtoMapping`. Catches "service not matching" (mapping reaches proxy but no SNI/HTTP route). Asserts ID, Domain, Mode, AuthToken, `Private`, `Auth.Oidc=false`, one path `/` + `https://noop.invalid/`, 8 middlewares with correct slot enums, router config `auth_header_value="Bearer sk-test-key"`. |
-| `labelgen/labelgen_test.go` | `PickUnique_{DeterministicWithSeededRng,AvoidsTakenWordsWhenMostAreReserved,FallsBackWhenAllReserved}`; `UniqueWords_DropsDuplicates`. |
-| `types/consumption_test.go` | `WindowStart_{AlignedToUnixEpoch,WithinWindowConverges,AcrossWindowsDiverges,DifferentWindowsHaveDifferentBuckets,SubMinuteAndMinuteAlignment,ZeroWindowReturnsInputUTC}`. Bucket alignment so multi-node reads converge. |
-| `agentnetwork_realstack_test.go` | `ProviderCRUD_FansOutToProxyAndClientPeers` — no-mock end-to-end through real account manager + network-map + agentnetwork: provider create propagates the updated map to both proxy peer and client peer with the synth DNS surface. |
-| `agentnetwork_budgetrule_realstack_test.go` | `BudgetRuleCRUD_RealManager`; `UpdateSettings_PreservesImmutableAndTogglesCollection`. |
-
-## Known limitations / explicit non-goals
-
- **`MergedGuardrails.TokenLimits/Budget/Retention` emit at zero** (`synthesizer.go:940-948`); real enforcement is `Policy.Limits` via `llm_limit_check`. Future cleanup implied.
- **Session keys picked from first enabled provider by created_at** (`pickServiceSessionKeys`, `synthesizer.go:270`). Existing session cookies survive provider edits only while the first-by-CreatedAt provider stays in place. Document for operators.
- **Reconcile failures silently swallowed** (`reconcile.go:42-44`). Persistent failures keep the proxy out of sync until the next reconcile.
- **`scoreCandidates` exposes only the LAST exhaustion's deny code** when multiple policies are exhausted.
- **`bootstrapSettingsIfNeeded` failure is non-fatal to provider create** (`manager.go:200`): provider lands, synth is no-op until the next provider create retries the bootstrap.
- **Budget rules do not trigger a reconcile** (`manager.go:476-477`). Request-time evaluation only; new rules take effect on the next request without a proxy push.
-
-## Cross-references
-
- **Upstream:** [shared/api](10-shared-api.md), [management/store](20-management-store.md), reverseproxy `service`/`proxy`/`sessionkey` packages, `management/server/permissions` + `activity`.
- **Downstream:** [management/handlers (HTTP wiring)](22-management-handlers-wiring.md), [proxy/middleware-builtin](31-proxy-middleware-builtin.md), network-map controller (`injectAllProxyPolicies` fan-out).
- **End-to-end flow:** [../01-end-to-end-flows.md](../01-end-to-end-flows.md) — "Provider create → reconcile → proxy push → peer map refresh" and "request → policy select → record" diagrams.
- **Top-level:** [../00-overview.md](../00-overview.md)
--- a/docs/agent-networks/modules/22-management-handlers-wiring.md
+++ b/docs/agent-networks/modules/22-management-handlers-wiring.md
@@ -1,203 +0,0 @@
-# management/handlers + wiring — HTTP API + gRPC delivery
-
-> **Risk level:** Medium — the surface is mostly additive, but two changes are load-bearing: `injectAllProxyPolicies` runs on every per-peer compute, and `shallowCloneMapping` must round-trip `Private` (a missed field silently breaks every MODIFIED).
-> **Backward-compat impact:** Additive on the wire (new routes, new RPCs, new proto fields, new gorm column on `AccessLogEntry`). One management-internal break: `nbhttp.NewAPIHandler` gains a trailing `agentNetworkManager` parameter; `nil` is tolerated and silently skips route registration.
-
-## Module boundary
-
-This module is the seam between the public Agent Network HTTP API and the proxy fleet that serves agent traffic. North side: a `/api/agent-network/*` surface (providers, policies, guardrails, budget rules, settings, consumption) on the existing gorilla router, delegating to `agentnetwork.Manager`. Handlers are thin — they translate `api.*` ↔ `types.*`, validate shape, forward. RBAC and event emission stay inside the manager (`manager.go:680-682`).
-
-South side: `ProxyServiceServer` (`proxy.go`) learns to (a) ship synth services to a proxy on initial snapshot, (b) resolve agent-network domains in `getServiceByDomain` for OIDC/session/tunnel-peer flows, (c) gate LLM requests via `CheckLLMPolicyLimits` + `RecordLLMUsage`, (d) preserve `Private` through `shallowCloneMapping` so per-proxy live updates don't silently flip services public. The network_map controller prepends synth services to `account.Services` on every per-peer compute; `accesslogentry.go` gains an indexed `AgentNetwork` column so the dashboard can filter cheaply.
-
-## Files
-
-| Path | Role |
-| ---- | ---- |
-| `handlers/agentnetwork/providers_handler.go` | Catalog + provider CRUD + central `AddEndpoints` |
-| `handlers/agentnetwork/policies_handler.go` | Policy CRUD + shared `validatePolicy*` |
-| `handlers/agentnetwork/guardrails_handler.go` | Guardrail CRUD |
-| `handlers/agentnetwork/budget_handler.go` | Account-level budget rule CRUD |
-| `handlers/agentnetwork/settings_handler.go` | GET (200+`null` if unbootstrapped) + PUT toggles |
-| `handlers/agentnetwork/consumption_handler.go` | Read-only consumption rows |
-| `handlers/agentnetwork/handlers_test.go` | Real-store fixture; wire round-trip + validation |
-| `handlers/agentnetwork/budget_handler_test.go` | Budget-rule + settings toggles |
-| `server/http/handler.go` | New `agentNetworkManager` arg; conditional `AddEndpoints` |
-| `server/permissions/modules/module.go` | New `AgentNetwork` module key |
-| `internals/server/boot.go` | Wires synthesiser adapter + limits service into proxy server |
-| `internals/server/modules.go` | `AgentNetworkManager()` lazy-create node |
-| `internals/controllers/network_map/controller/controller.go` | `injectAllProxyPolicies` replaces 4 `InjectProxyPolicies` calls |
-| `internals/controllers/network_map/controller/repository.go` | `SynthesizeAgentNetworkServices` repo method |
-| `internals/modules/reverseproxy/service/service.go` | `MiddlewareConfig`, capture limits, `AgentNetwork`, `DisableAccessLog` + proto |
-| `internals/modules/reverseproxy/accesslogs/accesslogentry.go` | Indexed `AgentNetwork bool` from proto |
-| `internals/shared/grpc/proxy.go` | Synth wiring, 2 RPCs, domain fallback, `Private` in clone |
-| `internals/shared/grpc/proxy_clone_test.go` | Locks every `ProxyMapping` field minus `AuthToken` |
-| `server/activity/codes.go` | 13 new activity codes (125-137) |
-
-## HTTP routes added
-
-All routes inherit the platform's auth middleware. Perms enforced inside `agentnetwork.Manager.requirePermission` (`manager.go:680-682`) on `modules.AgentNetwork`. Permission column shows the `op` passed to `requirePermission` — read = `Read`, etc.
-
-| Method | Path | Perm | Handler |
-| ------ | ---- | ---- | ------- |
-| GET    | `/agent-network/catalog/providers` | authn only | `providers_handler.go:43` |
-| GET    | `/agent-network/providers` | read | `providers_handler.go:57` |
-| POST   | `/agent-network/providers` | create | `providers_handler.go:97` |
-| GET    | `/agent-network/providers/{providerId}` | read | `providers_handler.go:77` |
-| PUT    | `/agent-network/providers/{providerId}` | update | `providers_handler.go:132` |
-| DELETE | `/agent-network/providers/{providerId}` | delete | `providers_handler.go:172` |
-| GET    | `/agent-network/policies` | read | `policies_handler.go:32` |
-| POST   | `/agent-network/policies` | create | `policies_handler.go:72` |
-| GET    | `/agent-network/policies/{policyId}` | read | `policies_handler.go:52` |
-| PUT    | `/agent-network/policies/{policyId}` | update | `policies_handler.go:102` |
-| DELETE | `/agent-network/policies/{policyId}` | delete | `policies_handler.go:142` |
-| GET    | `/agent-network/guardrails` | read | `guardrails_handler.go:25` |
-| POST   | `/agent-network/guardrails` | create | `guardrails_handler.go:65` |
-| GET    | `/agent-network/guardrails/{guardrailId}` | read | `guardrails_handler.go:45` |
-| PUT    | `/agent-network/guardrails/{guardrailId}` | update | `guardrails_handler.go:95` |
-| DELETE | `/agent-network/guardrails/{guardrailId}` | delete | `guardrails_handler.go:135` |
-| GET    | `/agent-network/budget-rules` | read | `budget_handler.go:24` |
-| POST   | `/agent-network/budget-rules` | create | `budget_handler.go:64` |
-| GET    | `/agent-network/budget-rules/{ruleId}` | read | `budget_handler.go:44` |
-| PUT    | `/agent-network/budget-rules/{ruleId}` | update | `budget_handler.go:95` |
-| DELETE | `/agent-network/budget-rules/{ruleId}` | delete | `budget_handler.go:135` |
-| GET    | `/agent-network/settings` | read | `settings_handler.go:53` (200+`null` if no row) |
-| PUT    | `/agent-network/settings` | update | `settings_handler.go:27` |
-| GET    | `/agent-network/consumption` | read | `consumption_handler.go:21` |
-
-## gRPC RPCs added (or modified)
-
-| RPC | Direction | Trigger |
-| --- | --------- | ------- |
-| `CheckLLMPolicyLimits` | proxy→mgmt unary | Pre-flight gate; returns allow/deny, selected policy, attribution group, window, deny code+reason (`proxy.go:259-301`). `Unimplemented` when limits service is nil. |
-| `RecordLLMUsage` | proxy→mgmt unary | Post-flight write of tokens+cost against policy-window dimensions + every applicable account budget rule (`proxy.go:303-349`). `window_seconds==0` ⇒ no policy cap, only account fan-out runs. |
-| `GetMappingUpdate`/`SendServiceUpdate` (stream) | mgmt→proxy | Snapshot (`proxy.go:752-780`) now appends `SynthesizeServicesForCluster`. Live updates use `SendServiceUpdateToCluster` + `shallowCloneMapping`. |
-
-## Architecture & flow
-
-### HTTP request lifecycle
-
-```mermaid
-sequenceDiagram
-    participant DB as Dashboard
-    participant R as gorilla.Router (/api)
-    participant H as handler (agentnetwork)
-    participant M as agentnetwork.Manager
-    participant S as store.Store
-    participant AM as accountManager (StoreEvent)
-
-    DB->>R: POST /api/agent-network/providers
-    R->>H: createProvider (auth mw sets UserAuth)
-    H->>H: GetUserAuthFromContext + validate(req)
-    H->>M: CreateProvider(userID, provider, bootstrapCluster)
-    M->>M: requirePermission(AgentNetwork, Create)
-    M->>S: SaveAgentNetworkProvider
-    M->>AM: StoreEvent(AgentNetworkProviderCreated)
-    M-->>H: created provider
-    H-->>DB: 200 + api.AgentNetworkProvider JSON
-```
-
-### Synth-service delivery via gRPC
-
-```mermaid
-sequenceDiagram
-    participant P as Proxy
-    participant G as ProxyServiceServer
-    participant SM as service.Manager (persisted)
-    participant SA as synthesizerAdapter
-    participant AN as SynthesizeServicesForCluster
-    participant ST as store.Store
-
-    Note over P,G: Initial snapshot
-    P->>G: GetMappingUpdate (stream open)
-    G->>SM: GetServicesForCluster(conn.address)
-    SM-->>G: persisted []*Service
-    G->>SA: SynthesizeServicesForCluster(conn.address)
-    SA->>AN: SynthesizeServicesForCluster(store, clusterAddr)
-    AN->>ST: walk every account; read providers/policies/settings
-    AN-->>SA: in-memory []*Service
-    SA-->>G: []*Service
-    G->>P: response (persisted + synth)
-
-    Note over G,P: Per-request live update
-    G->>G: SendServiceUpdateToCluster(update, clusterAddr)
-    G->>G: shallowCloneMapping(update)   %% Private MUST survive
-    G->>P: response with single mapping
-```
-
-End-to-end: HTTP write persists rows and emits an activity event; the manager then triggers `proxyController.SendServiceUpdate` so proxies re-render. **The snapshot path is the only one that calls into the synthesiser** — on stream open it pulls persisted services then appends synth services for the cluster. Synth services are never persisted. For OIDC/session/tunnel-peer flows, `getServiceByDomain` falls back to `SynthesizeServicesForCluster(clusterFromDomain(domain))` when persisted lookup misses (`proxy.go:1763-1793`). The network_map contribution is orthogonal: per-peer compute prepends the same synth services to `account.Services` before `InjectProxyPolicies`.
-
-## Permissions model added
-
- `permissions/modules/module.go:22` adds `AgentNetwork Module = "agent_network"`, registered in `All` (`module.go:42`). Standard `operations.{Read,Create,Update,Delete}` matrix.
- Handlers don't call `permissionsManager` directly — they extract `UserAuth` and delegate to `agentnetwork.Manager`, which gates every mutation through `requirePermission` (`manager.go:168, 308, 549`, etc.). Confirm your role-set provider has `agent_network` rows for owner/admin/user/billing-admin before merging.
- `getCatalogProviders` (`providers_handler.go:43`) intentionally skips RBAC — catalog is global static data.
-
-## Activity codes added
-
-`activity/codes.go:244-274` adds Activities 125-137 + string/code mappings (`codes.go:428-444`), following `<domain>.<resource>.<action>` (e.g., `agent_network.provider.create`). Audit-log exporters / SIEM forwarders need to know the new codes.
-
-## Invariants
-
- **Synth services are never persisted.** Snapshot appends after `serviceManager.GetServicesForCluster` (`proxy.go:761-770`); network_map prepends before `InjectProxyPolicies` (`controller.go:117-126`).
- **`shallowCloneMapping` must round-trip every `ProxyMapping` field except `AuthToken`** — `proxy_clone_test.go:50-58` enforces via `gproto.Equal`. The bug it guards: a missing `Private` made every MODIFIED arrive `private=false`, the proxy skipped `ValidateTunnelPeer`, `UserGroups` stayed empty, `llm_router` denied `no_authorised_provider`; a restart "fixed" it because the snapshot uses the original mapping.
- **Limit-window floor is 60s** (`policies_handler.go:189-220`); enabled cap with both per-group and per-user at zero is rejected. Budget rules reuse the same validator (`budget_handler.go:170`).
- **Manager is optional at boot.** `NewAPIHandler` registers routes only when non-nil (`handler.go:129`); `ProxyServiceServer` returns `Unimplemented` from both RPCs when limits service is unwired (`proxy.go:262-265, 306-309`).
- **Settings GET on an unbootstrapped account returns 200 + `null`** (`settings_handler.go:65-72`) — not 404.
-
-## Things to scrutinize
-
-### Correctness
- **`injectAllProxyPolicies` runs on every per-peer compute**: `controller.go:163, 309, 415, 681`. `sendUpdateAccountPeers` is the target of the buffered fan-out — synth runs once per debounced account-update tick **and** once per direct `UpdateAccountPeer`. Cost is O(providers + policies × users-per-group) per account under `LockingStrengthNone`. No per-account synth cache — verify it fits the buffer interval for your largest tenant.
- **`clusterFromDomain` strips at the first `.`** (`proxy.go:1784-1792`). A zero-dot domain returns `""` and the synth call walks every account. Confirm no path reaches this with a malformed/internal domain.
- **Account-budget `RecordConsumption` fans out even when `window_seconds == 0`** (`proxy.go:341-348`) — intentional. Verify the proxy never sends `RecordLLMUsage` for a request that wasn't actually allowed.
-
-### Security
- Every handler extracts `UserAuth` via `nbcontext.GetUserAuthFromContext` before any work. Routes live behind the standard `/api` mux; bypass list is not extended.
- `CheckLLMPolicyLimits` / `RecordLLMUsage` ride the existing **proxy → mgmt** gRPC connection auth. No additional token check inside the RPCs — they trust the connection. Confirm the proxy-side token-verification interceptor in this package gates both.
- `RecordLLMUsage` only validates `account_id != ""` (`proxy.go:317-319`). A compromised proxy can attribute cost to any account in its cluster — was already true for prior RPCs but is louder now that data drives denials.
-
-### Concurrency
- `SetAgentNetworkSynthesizer` / `SetAgentNetworkLimitsService` write under `s.mu.Lock`; read paths copy the interface under read lock (`proxy.go:236-247, 260-263, 304-307`). Same pattern as existing `serviceManager`/`proxyController` setters.
- Manager writes use `LockingStrengthUpdate`; synth reads use `LockingStrengthNone` — read-after-write via the proxy snapshot can observe a stale view by up to one fan-out tick.
- Network_map controller is single-threaded per account; cross-account is parallel.
-
-### Backward compatibility
- `proxy_clone_test.go` is the regression net; any new `ProxyMapping` field must be cloned or explicitly nulled in the test.
- `AccessLogEntry` adds indexed `AgentNetwork bool` — implicit AutoMigrate; deploy story must handle table-rewrite cost on high-volume access-log tables.
- `TargetOptions` gains seven `omitempty` JSON fields (`service.go:69-94`); on-wire shape stays compatible. `targetOptionsToProto` tests all fields when deciding nil (`service.go:551-556`).
- `NewAPIHandler` signature changes — every caller must pass `agentNetworkManager`; `nil` is supported.
-
-### Observability
- 13 new activity codes via `accountManager.StoreEvent` in the manager — confirm dashboard's audit-log UI maps them.
- `AccessLogEntry.AgentNetwork` is indexed for the dashboard's agent-network log filter.
- New RPCs log at error level on store/selector failures (`proxy.go:284, 327, 332, 348`). Snapshot synth failures degrade to warnings — stream is not aborted (`proxy.go:765`).
-
-## Test coverage
-
-| Test | Locks down |
-| ---- | ---------- |
-| `handlers_test.go::TestPolicyHandler_WindowSecondsRoundTrip` | GET carries `window_seconds`; legacy `window_hours`/`window_days` absent. |
-| `handlers_test.go::TestPolicyHandler_RejectsSubMinuteWindow` | POST `<60s` returns 4xx. |
-| `handlers_test.go::TestConsumptionHandler_EmptyAccountReturnsArray` | `/consumption` returns `[]` — never null. |
-| `handlers_test.go::TestConsumptionHandler_PopulatedAccountListsRows` | RecordConsumption×2 surfaces both with correct tokens/cost/window. |
-| `budget_handler_test.go::TestBudgetRuleHandler_RoundTrip` | Targets + PolicyLimits shape round-trip. |
-| `budget_handler_test.go::TestBudgetRuleHandler_ListReturnsArray` | Empty-list shape. |
-| `budget_handler_test.go::TestBudgetRuleHandler_{RejectsMissingName,RejectsSubMinuteWindow}` | Validation rejections are 4xx. |
-| `budget_handler_test.go::TestSettingsHandler_GetExposesCollectionToggles` | All four toggles + computed `Endpoint`. |
-| `proxy_clone_test.go::TestShallowCloneMapping_PreservesAllFieldsExceptAuthToken` | Future-proofs clone; every field round-trips, `AuthToken` dropped. |
-
-Handler tests use a real sqlite store + real manager + always-allow permissions mock (`handlers_test.go:53-75`). Create/update/delete success paths flow through `accountManager.StoreEvent` which the fixture doesn't wire — covered by manager-level no-mock tests outside this module.
-
-## Known limitations / explicit non-goals
-
- No pagination on any list endpoint; no bulk endpoints.
- Synth result is not cached — every snapshot and every per-peer compute repeats the store walk.
- `getSettings` returning `200 + null` is a deliberate dashboard concession.
- No rate-limiting beyond the global `/api` rate limiter.
-
-## Cross-references
-
- Upstream: [shared/api](10-shared-api.md), [management/agentnetwork](21-management-agentnetwork.md), [management/store](20-management-store.md)
- Downstream: [proxy/runtime](33-proxy-runtime.md)
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level: [../00-overview.md](../00-overview.md)
--- a/docs/agent-networks/modules/30-proxy-middleware-framework.md
+++ b/docs/agent-networks/modules/30-proxy-middleware-framework.md
@@ -1,215 +0,0 @@
-# proxy/middleware-framework — generic plugin system
-
-> **Risk level:** **High** — every proxied request transits this chain. Budget exhaustion, panic recovery, or chain-close bugs hit the hot path for all targets, not just agent-network ones.
-> **Backward-compat impact:** Additive at the proxy. The `middleware` and `bodytap` packages are new (`proxy/internal/middleware/middleware.go:1`, `proxy/internal/middleware/bodytap/request.go:13`); existing proxy targets keep working until a chain is bound to them via `Manager.Rebuild`.
-
-This module is the **framework only** — no LLM/agent-network domain knowledge is required, since every example built into it is generic.
-
-## Module boundary
-
-This module is the **framework only**: slots, chains, registry, dispatcher, accumulator, body-tap, output filters. No middleware *implementation* lives here — those land in `proxy/internal/middleware/builtin/*` (covered in module 31). The package contract is:
-
-1. The proxy hands a `Manager` to its config-apply path. The synth pushes per-path `PathTargetBinding` lists (`proxy/internal/middleware/manager.go:26`) into `Manager.Rebuild`, which resolves each spec via the `Registry`/`Resolver` (`proxy/internal/middleware/registry.go:81-121`) and produces an immutable `Chain` keyed by `serviceID|pathID` (`proxy/internal/middleware/manager.go:410-412`).
-2. The reverse-proxy handler captures the request body via `bodytap.CaptureRequest`, calls `Chain.RunRequest`, applies returned mutations (already filtered by `chain.applyMutations`), forwards to the upstream behind a `bodytap.CapturingResponseWriter`, then calls `Chain.RunResponse` and `Chain.RunTerminal`.
-3. Middlewares are inert plugins that receive a deep-cloned `Input` and return an `Output` whose decision/mutations are clamped by the dispatcher's `filterOutput` (`proxy/internal/middleware/dispatcher.go:149-172`).
-
-Everything that crosses the framework boundary in either direction is value-typed and deep-copied — middlewares cannot mutate the live request directly, and the framework cannot inadvertently leak middleware-owned slices into the request hot path.
-
-## Files
-
-| Path | Role |
-| ---- | ---- |
-| `proxy/internal/middleware/middleware.go` | `Middleware` + `Factory` interfaces. |
-| `proxy/internal/middleware/types.go` | `Slot`, `FailMode`, `Decision`, all limit constants, `Input`/`Output`/`Mutations`/`UpstreamRewrite`/`AuthHeader` value types. |
-| `proxy/internal/middleware/spec.go` | Apply-time `Spec` (validated wire shape + runtime-injected fields) and `Clone`. |
-| `proxy/internal/middleware/registry.go` | `Registry` (factory map, RWMutex) and `Resolver` (Spec → bound `Middleware`). |
-| `proxy/internal/middleware/manager.go` | `Manager`, `chainTable` reverse index, `Rebuild`/`Invalidate*`, async chain close. |
-| `proxy/internal/middleware/chain.go` | `Chain.RunRequest`/`RunResponse`/`RunTerminal`, mutation gating, `cloneInputFor`. |
-| `proxy/internal/middleware/chain_test.go` | Metadata threading, LIFO response order, rewrite gating, UserGroups propagation, terminal accumulation. |
-| `proxy/internal/middleware/dispatcher.go` | Timeout/panic recovery, fail-mode, error classification, `filterOutput`. |
-| `proxy/internal/middleware/decision.go` | `RenderDenyResponse`, deny-code regex, status clamp. |
-| `proxy/internal/middleware/headerpolicy.go` | Compile-in header denylist + `FilterHeaderMutations`. |
-| `proxy/internal/middleware/bodypolicy.go` | `ValidateBodyReplace` / `ApplyBodyReplace` smuggling guards. |
-| `proxy/internal/middleware/keys.go` | Metadata key namespace constants. |
-| `proxy/internal/middleware/metadata.go` | `Accumulator` — allowlist, per-mw/per-request byte caps, redaction. |
-| `proxy/internal/middleware/metrics.go` | OTel instrument bundle (`proxy.middleware.*`). |
-| `proxy/internal/middleware/redaction.go` | `Scan` — PEM/JWT/AWS/bearer/Luhn-validated CC patterns. |
-| `proxy/internal/middleware/bodytap/request.go` | Capture + replay reader, `Budget` semaphore, bypass reason codes. |
-| `proxy/internal/middleware/bodytap/response.go` | `CapturingResponseWriter` (tee with `PassthroughWriter` for Flusher/Hijacker preservation). |
-
-## Slot model
-
-Three slots, declared per-middleware exactly once (`proxy/internal/middleware/types.go:27-41`):
-
- **`SlotOnRequest`** (`Slot=1`) — runs **before** the upstream call, in registration order. May `DecisionDeny`, may emit `Mutations` (header add/remove, body replace, `UpstreamRewrite`) when both `Spec.CanMutate` and `Middleware.MutationsSupported()` are true. May emit metadata. Each middleware in the slot sees metadata that earlier ones in the same slot just emitted (`proxy/internal/middleware/chain.go:144-178`) — this is how the framework gives middlewares an intra-slot side channel without a global bag.
- **`SlotOnResponse`** (`Slot=2`) — runs **after** the upstream returns, in **reverse** registration order. Cannot deny (clamped in `dispatcher.filterOutput`, `proxy/internal/middleware/dispatcher.go:153-157`). May still mutate response headers in principle, but the current chain only forwards `RewriteUpstream` from on_request, so on_response mutations are observe-only in practice. Threads the same per-slot metadata view as on_request.
- **`SlotTerminal`** (`Slot=3`) — runs **after** every on_response middleware has emitted, in registration order. Sees the full accumulated bag plus prior terminal emissions (`chain.go:221-245`). Cannot deny, cannot mutate (`dispatcher.go:168-170`). Designed for sinks (access log, metrics push, audit emitter).
-
-Splitting a feature across slots (e.g. "parse on the way out, ship on terminal") is the explicit architectural choice — `types.go:7-15` and `types.go:22-25` make it clear no middleware participates in more than one slot.
-
-## Architecture & flow
-
-### Chain dispatch
-
-```mermaid
-sequenceDiagram
-    autonumber
-    participant H as proxy HTTP handler
-    participant BT as bodytap.CaptureRequest
-    participant CH as Chain
-    participant DI as Dispatcher
-    participant MW as Middleware (per slot)
-    participant US as Upstream
-    participant CW as CapturingResponseWriter
-
-    H->>BT: CaptureRequest(r, cfg, budget)
-    BT-->>H: body[], truncated, release()
-    H->>CH: RunRequest(ctx, r, Input, Accumulator)
-    loop on_request, registration order
-        CH->>CH: cloneInputFor(in, OnRequest)
-        CH->>DI: Invoke(ctx, spec, mw, call)
-        DI->>MW: mw.Invoke(callCtx, in)
-        MW-->>DI: Output{decision, metadata, mutations?}
-        DI->>DI: filterOutput (clamp deny, gate mutations)
-        DI-->>CH: filtered Output
-        CH->>CH: Accumulator.Emit (allowlist + caps + redact)
-        alt DecisionDeny
-            CH-->>H: denied, merged, rewrite
-        else allow
-            CH->>CH: applyMutations(r, m) and capture rewrite
-        end
-    end
-    CH-->>H: nil, merged, rewrite
-    H->>US: ProxyRequest (with rewrite/mutations applied)
-    US-->>CW: bytes (streamed, tee'd into cap-bounded buf)
-    CW-->>H: passthrough complete
-    H->>CH: RunResponse(ctx, Input{RespBody:CW.Body(),...}, acc)
-    loop on_response, REVERSE order (LIFO)
-        CH->>DI: Invoke (same wrappers)
-    end
-    H->>CH: RunTerminal(ctx, Input{Metadata:full bag}, acc)
-    H->>BT: release() + CW.Release()
-```
-
-### Body-tap mechanics (request + response)
-
-```mermaid
-flowchart LR
-    subgraph req[Request capture — bodytap.CaptureRequest]
-        R0[r.Body] --> R1{cfg.MaxRequestBytes > 0?\nUpgrade absent?\nContent-Type allowed?\nCL <= cap?}
-        R1 -- no --> R2[bypass = reason\nbody = nil\nr.Body untouched]
-        R1 -- yes --> R3[Budget.Acquire(cap)]
-        R3 -- denied --> R4[bypass=BypassBudget]
-        R3 -- ok --> R5[io.LimitReader(r.Body, cap+1)\nio.ReadAll]
-        R5 --> R6{len > cap?}
-        R6 -- truncated --> R7[viewable = buf[:cap]\nr.Body = replayReadCloser{buf, tail}]
-        R6 -- whole --> R8[r.Body = NopCloser(bytes.Reader(buf))\nclose original]
-        R7 --> R9[(release captured\nbudget on req end)]
-        R8 --> R9
-    end
-
-    subgraph resp[Response capture — CapturingResponseWriter]
-        W0[client] -.-> CW[Write(p)]
-        CW --> P1[PassthroughWriter.Write(p)\n— bytes leave to client first]
-        P1 --> P2{!stopped?}
-        P2 -- yes --> P3{remaining = cap - buf.Len()}
-        P3 --> P4[buf.Write(p[:take])\nset truncated if take<n]
-        P2 -- no --> P5[silent drop into the tee\n(client write already done)]
-    end
-```
-
-The body-tap is the highest-leak-risk surface in this module; three details matter:
-
-1. **Request capture is "read-and-replay", not "read-and-forward".** `CaptureRequest` always swaps `r.Body` for either a `bytes.Reader` (whole body fit) or a `replayReadCloser` that replays the captured prefix then drains the remaining stream from the original body (`bodytap/request.go:178-201`). This means the **upstream still sees the full body even when the tap truncates**. The original `r.Body` is **not** closed in the truncated branch — `replayReadCloser.Close()` only closes the tail (`bodytap/request.go:199-201`), which is the same reader, so close once on request end is correct, but reviewers should confirm the upstream proxy always reads to EOF (otherwise the tail is leaked).
-2. **Response capture is a write-through tee.** `CapturingResponseWriter.Write` forwards to the underlying writer **first** (`bodytap/response.go:116-117`), then tees into `buf` under its own mutex. Client never blocks on the tee. `Flusher`/`Hijacker` are preserved via the embedded `responsewriter.PassthroughWriter`. SSE/chunked streams flow through untouched; middlewares only see the bounded prefix.
-3. **Budget is a single shared semaphore.** `Manager` constructs one `bodytap.Budget` at startup (`manager.go:138-144`, default `256 MiB` from `bodytap/request.go:39`). Every capture pre-acquires its full `MaxRequestBytes` / `MaxResponseBytes` from the budget regardless of actual body size; that prevents a flood of small captures from collectively exceeding the cap, but it also means a misconfigured `MaxRequestBytes = 1 MiB` with 256 concurrent requests already exhausts the default budget. Reviewers should sanity-check the operator-facing defaults that ship with synth-service.
-
-The framework explicitly aborts capture (and increments `proxy.middleware.capture_bypass_total`) before reading the first byte when `Upgrade`/`Connection: upgrade` is set (`bodytap/request.go:120-125`), when the content-type isn't in the allowlist (`bodytap/request.go:126-128`), or when the advertised `Content-Length` already exceeds the cap (`bodytap/request.go:131-133`). This is the right place to make sure WebSocket upgrades and large file uploads never reach the buffer.
-
-## Public contracts
-
- **`Middleware` interface** (`middleware.go:14-36`): `ID()`, `Version()`, `Slot()`, `AcceptedContentTypes()`, `MetadataKeys()`, `MutationsSupported()`, `Invoke(ctx, *Input) (*Output, error)`, `Close()`. `MetadataKeys()` is the **closed set** the middleware is allowed to emit — the accumulator drops anything outside it (`metadata.go:71-75`). `Close` must be idempotent (called even when `Invoke` was never reached).
- **`Factory` interface** (`middleware.go:44-47`): `ID()`, `New(rawConfig []byte) (Middleware, error)`. `RawConfig` is opaque JSON bytes on the wire (`spec.go:6-12`); each factory owns its own typed config.
- **`Decision` type** (`types.go:59-69`): `Allow=0`, `Deny=1`, `Passthrough=2`. Default-zero is permissive — important because every middleware that omits `Decision` gets `Allow`. Dispatcher clamps `Deny` to `Passthrough` outside `SlotOnRequest` (`dispatcher.go:153-157`).
- **`Mutations`** (`types.go:196-201`): `HeadersAdd`/`HeadersRemove` (filtered through `headerpolicy.go`), `BodyReplace` (gated through `bodypolicy.go`), and `RewriteUpstream`. `RewriteUpstream` is **last-write-wins** within the on_request slot (`chain.go:170-172`, locked down by `TestChain_RunRequest_LatestRewriteWins`).
- **Metadata propagation keys** (`keys.go`): all keys live in a single file and follow `^[a-z][a-z0-9_-]*(\.[a-z0-9_-]*)+$` (`metadata.go:8`). Framework-injected error tagging uses `mw.<id>.error_kind` (`keys.go:81`) so operators can distinguish framework-emitted entries from middleware-emitted ones.
-
-## Invariants
-
- **Per-request context isolation.** `cloneInputFor` deep-copies every mutable field (`Headers`, `RespHeaders`, `Metadata`, `Body`, `RespBody`, `UserGroups`, `UserGroupNames`) before each invocation (`chain.go:286-308`). A misbehaving middleware that mutates `in.Headers` only corrupts its own copy.
- **Body-tap bounded by capture limit.** Request side uses `io.LimitReader(r.Body, limit+1)` (`bodytap/request.go:152`) — the `+1` is how the code detects truncation (`bodytap/request.go:160`); the surfaced buffer is sliced back down to `limit`. Response side stops teeing once `buf.Len() >= cap` (`bodytap/response.go:121-133`). Neither side can grow the buffer past the configured cap.
- **Headers/body redaction order.** Accumulator runs `Scan(value)` **before** counting cost (`metadata.go:81-82`), so the byte budgets are computed against post-redaction sizes. `Scan` order is PEM → JWT → AWS key → bearer → Luhn-validated CC (`redaction.go:25-51`) — the comment block in `redaction.go:8-13` is explicit that this is best-effort, not DLP.
- **No middleware can starve the chain.** Every invocation runs inside `context.WithTimeout(ctx, clampTimeout(spec.Timeout))` in a separate goroutine (`dispatcher.go:51-94`), with the deadline race-`select`ed against the result channel. A blocked middleware fires the timeout path, gets fail-mode'd, and `IncError(kind=timeout)`. Timeouts are clamped to `[10ms, 5s]` (`types.go:80-86`, `dispatcher.go:174-185`).
- **Panic recovery.** `recover()` captures the panic, logs only the type + a 4 KiB stack prefix (no panic value — avoids leaking secrets the middleware was processing), and produces a `panicError` that flows through fail-mode (`dispatcher.go:64-76`).
- **Chain immutability + atomic swap.** `chainTable` is cloned on every `Rebuild`/`Invalidate*` and swapped via `atomic.Pointer` (`manager.go:44-69`, `manager.go:221-300`). Readers (`ChainFor`) are lock-free; writers serialise on `writeMu`. The retired chain is `Close`-d in a background goroutine bounded by `chainCloseTimeout = 2 * MaxTimeout` (`manager.go:21-22`, `manager.go:326-346`), so in-flight invocations finish on the old chain after the swap.
-
-## Things to scrutinize
-
-### Correctness
-
- **Chain ordering deterministic from synth output?** `Manager.buildChain` iterates `b.Specs` in slice order and appends to `bound` (`manager.go:366-391`); `NewChain` then partitions by slot but **preserves slice order within each slot** (`chain.go:50-60`). So order on the wire = order observed at runtime. Synth must therefore emit specs in the intended execution order — there is no per-spec `Priority` field. Worth flagging.
- **Decision short-circuit semantics.** `RunRequest` returns immediately on `DecisionDeny` (`chain.go:164-167`) **with the metadata accumulated so far** plus the `denied.Metadata`. Callers that ignore `merged` on deny will lose framework-injected `mw.<id>.error_kind` entries. The proxy runtime is the only caller; confirm it always feeds `merged` into the access log on the deny path as well.
- **`UpstreamRewrite` `AuthHeader` bypass** (`types.go:218-235`). The `AuthHeader`/`StripHeaders` fields *intentionally* bypass the header denylist on the basis that the proxy itself rewrites auth. The denylist still blocks middleware-emitted `HeadersAdd: Authorization=...`. This is a delicate carve-out — review the runtime consumer to confirm only the trusted upstream-build path unpacks `AuthHeader`, never the generic `applyMutations` loop.
- **`replayReadCloser.Close` only closes the tail** (`bodytap/request.go:199-201`). The replay buffer doesn't own a resource, so this is correct, but it conflates "replay finished" with "underlying body closed". If a caller `Close()`s without reading to EOF, the original body is closed but the captured prefix is lost; harmless for the proxy path (upstream always reads to EOF) but worth a doc-comment.
-
-### Security
-
- **Body-tap memory bounds.** Discussed above — bounded by `MaxBodyCapBytes = 1 MiB` per direction (`types.go:77`) and the shared `Budget` (default 256 MiB). The concerning case is the **deep-copy in `cloneInputFor`** (`chain.go:300-306`): every middleware invocation gets its **own copy** of `Body` and `RespBody`. A chain of N middlewares with a 1 MiB body allocates N MiB of transient bytes per request. With `MaxMiddlewaresPerChain = 16` (`types.go:103`) that's up to 16 MiB extra per in-flight request. Worth pricing into the budget model.
- **Header redaction completeness.** `denyHeaders` (`headerpolicy.go:5-17`) covers the auth/forwarding family and framing (`Content-Length`, `Transfer-Encoding`, `Trailer`). `denyHeaderPrefixes` covers `X-Authenticated-*`, `X-Forwarded-*`, `X-Remote-*`, `X-NetBird-*`. Notably absent: `Range`, `If-Match`/`If-None-Match` (mutation could cause cache poisoning), `Origin`/`Referer`. Not necessarily wrong, but worth a deliberate decision.
- **Metadata key collisions across middlewares.** The accumulator has no cross-middleware uniqueness check; two middlewares with the same key in their allowlist can both emit it, and both copies land in `merged` (`metadata.go:51-99`). Downstream consumers must tolerate duplicates. Worth documenting.
- **Deny rendering.** `RenderDenyResponse` only allows codes matching `^[a-z][a-z0-9._-]{0,63}$` (`decision.go:9`), redacts/truncates message + detail values, caps `Details` at 8 entries (`decision.go:42-50`), clamps status to `[400,499]\{401}` (`decision.go:65-73`). The deny body type is fixed; middlewares cannot inject arbitrary JSON.
-
-### Concurrency
-
- **Per-request state vs shared state in factories.** Each `Factory.New` is called once per chain build; the returned `Middleware` instance is **shared across all requests** for that chain. `Invoke` must be reentrant. The framework does not enforce this — a buggy middleware that holds per-call state on the struct will silently race. Suggest a `// Invoke must be safe for concurrent use` doc on the interface.
- **`chainTable` clone-on-write** is correct, but `addChain`/`removeChain` mutate the *cloned* table before the swap (`manager.go:71-108`), and they're called under `writeMu`. Readers only ever see the post-swap pointer. Good.
- **`Chain.inflight` WaitGroup**. `Run*` does `Add(1)`/`Done()` (`chain.go:142-143`, `chain.go:194-195`, `chain.go:225-226`); `Close` waits on it bounded by ctx (`chain.go:75-85`). One concern: a *new* `RunRequest` can `Add(1)` *after* `Close` started waiting if the caller still holds a stale chain pointer. `WaitGroup` does not panic on this if the count was already > 0 at `Wait` time, but it does panic if `Add` happens after `Wait` returns and another `Wait` runs. `Close` is documented one-shot, so single-`Wait` is fine, but callers must drop the chain reference before calling `Close`. Worth a code comment near `Close`.
- **Goroutine leaks.** `Dispatcher.Invoke` spawns one goroutine per call and *always* writes to a buffered (cap=1) channel (`dispatcher.go:62-76`), so even if the timeout fires the goroutine completes its send and exits. No leak.
- **`closeChainsAsync`** detaches retired chains into a goroutine (`manager.go:326-346`). If `Manager` is never GC'd this is fine, but there's no shutdown hook to wait on outstanding closes. Reviewers should confirm the proxy shutdown path explicitly drains in-flight requests before tearing down `Manager`, or accept that the last chain-close round may be cut short on exit.
-
-### Performance
-
- **Allocations per request.** `cloneInputFor` allocates new slices for `Headers`, `RespHeaders`, `Metadata`, `Body`, `RespBody`, `UserGroups`, `UserGroupNames` — once per middleware per request. For a typical 5-middleware chain on a 1 KiB body that's ~10 small slice allocs plus one `Body` copy each. Not a hot-path crisis, but `sync.Pool` for the per-call `Input` would be a natural follow-up.
- **Accumulator allocates a fresh `allowSet` per `Emit` call** (`metadata.go:55-58`). One per middleware per slot pass = up to 48 per request. Cheap, but worth noting.
- **Regex cost.** `Scan` runs five regex passes on every accepted metadata value (`redaction.go:25-51`). Bounded by `MaxMetadataValueBytes = 4 KiB` so worst case is small.
-
-### Observability
-
- **Per-middleware metrics.** `proxy.middleware.requests_total{middleware,target_id,outcome}` (`metrics.go:34-41`), `duration_ms`, `invocations_total`, `errors_total{kind}`, `metadata_rejected_total{reason}`, `header_mutation_blocked_total{header}`, `capture_bypass_total{reason}`. Comprehensive surface; operators can alert on `errors_total{kind=panic}` and `errors_total{kind=timeout}` separately. **Latency histogram is in milliseconds with default OTel buckets** — for a 10ms–5s timeout range default buckets cover OK, but a custom bucket set centred on 1–500ms would resolve the agent-network response-parser tail better.
- **Decision logs.** Panic logs (`dispatcher.go:69`) include `request_id`, type, and stack but not the panic value (safe). `Chain.Close` logs middleware-close errors at debug (`chain.go:91`). `applyMutations` logs body-replace rejections at warn (`chain.go:278`). No log on the deny path itself — by design, since the access-log terminal middleware is expected to record outcomes.
-
-## Test coverage
-
-| Test file | Locks down |
-| --------- | ---------- |
-| `proxy/internal/middleware/chain_test.go:77` | `RunRequest` threads metadata across on_request middlewares (regression for the "later mw can't see earlier mw's emissions" bug). |
-| `chain_test.go:110` | `RunResponse` reverse-order threading. |
-| `chain_test.go:142` | `cost_meter`-shaped scenario: response_parser registered after cost_meter still emits *before* cost_meter sees the bag (guards the `cost.skipped=missing_tokens` regression). |
-| `chain_test.go:178` | `UpstreamRewrite` last-write-wins. |
-| `chain_test.go:206` | No middleware emits → nil rewrite. |
-| `chain_test.go:224` | Rewrite filtered when `CanMutate=false`. |
-| `chain_test.go:245` | `Input.UserGroups` propagates verbatim through `cloneInputFor`. |
-| `chain_test.go:304` | Terminal middlewares see the full accumulated bag + prior terminal emissions. |
-
-**Gaps** worth raising with the author:
- No direct test for `Dispatcher.Invoke` timeout / panic / fail-mode behaviour at the framework level (covered indirectly by built-in tests, but a unit test pinning `errors_total{kind=...}` labels would be cheap insurance).
- No test for `bodytap.CaptureRequest` truncated replay (the upstream-sees-full-body invariant is exactly the kind of thing a regression would silently break).
- No test for `Budget` exhaustion behaviour under concurrency.
- No test for `Manager.InvalidateMiddleware` + `LiveServiceCheck` race (the auth-revocation race the comment at `manager.go:33-38` calls out is the load-bearing reason for `LiveServiceCheck`).
-
-## Known limitations / explicit non-goals
-
- **No middleware-to-middleware RPC.** Side-channel is metadata only.
- **No streaming body inspection.** Middlewares see a bounded prefix; SSE / chunked parsing happens against that prefix in the response middleware.
- **No per-spec priority.** Order is registration order in the spec slice.
- **No retry / circuit-breaker** on middleware errors. Fail-mode is binary (open/closed) and per-spec.
- **Mutations cannot rewrite the request URL path or query** — only `RewriteUpstream` can change scheme/host (+ optional path replacement, see `types.go:218-235`).
- **Redaction is best-effort.** Explicitly documented in `redaction.go:8-13`. Not a DLP solution.
-
-## Cross-references
-
- Upstream wire shape: [../modules/10-shared-api.md](10-shared-api.md) (Spec/RawConfig encoding from management).
- Built-in middlewares using this framework: [../modules/31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md).
- Runtime wiring (where `Manager`, `Chain`, and `bodytap` are consumed by the HTTP handler): [../modules/33-proxy-runtime.md](33-proxy-runtime.md).
- End-to-end request flow including capture + chain dispatch: [../01-end-to-end-flows.md](../01-end-to-end-flows.md).
- Top-level architecture: [../00-overview.md](../00-overview.md).
--- a/docs/agent-networks/modules/31-proxy-middleware-builtin.md
+++ b/docs/agent-networks/modules/31-proxy-middleware-builtin.md
@@ -1,365 +0,0 @@
-# proxy/middleware-builtin — the LLM chain
-
-The registry-mounted middleware set the proxy executes on every agent-network
-LLM request. The two highest-blast-radius areas are the **capture-pointer
-semantics** and the **limit_check ⇒ limit_record** record-once invariant.
-
-Sibling module: [32-proxy-llm-parsers.md](./32-proxy-llm-parsers.md) — the SDK
-adapters + pricing catalog this chain delegates to.
-
---
-
-## Module boundary
-
-This module is the registry-mounted middleware set the proxy executes on
-every agent-network LLM request. Each sub-package registers itself via
-`init()`
-([builtin.go:32–34](../../../proxy/internal/middleware/builtin/builtin.go));
-the proxy server anonymous-imports the set
-([all_test.go:11–19](../../../proxy/internal/middleware/builtin/all_test.go))
-so the registry is populated at boot. The chain is wired by the management
-synthesiser and executed by the framework
-(`proxy/internal/middleware/{chain,dispatcher,accumulator}.go` — both out
-of scope). Everything here reads from / writes to one envelope: the
-`middleware.KV` metadata bag plus `middleware.Mutations` for header/body
-rewrites.
-
-## The 8 middlewares
-
-| Name | Slot | Inputs (metadata read) | Outputs (metadata written) | Side effects |
-|---|---|---|---|---|
-| `llm_request_parser` | OnRequest | `Input.{URL,Body,BodyTruncated}` | `llm.{provider,model,stream,request_prompt_raw,capture_truncated}` | none |
-| `llm_router` | OnRequest | `llm.model`, `Input.{URL,UserGroups}` | `llm.{resolved_provider_id,authorising_groups}`, `llm_policy.{decision,reason}` | upstream rewrite + auth strip/inject |
-| `llm_limit_check` | OnRequest | `llm.{resolved_provider_id,model}`, `Input.{AccountID,UserID,UserGroups}` | `llm.{selected_policy_id,attribution_group_id,attribution_window_seconds}`, `llm_policy.{decision,reason}` | gRPC `CheckLLMPolicyLimits` |
-| `llm_identity_inject` | OnRequest | `llm.{resolved_provider_id,authorising_groups}`, `Input.{UserEmail,UserID,UserGroups,UserGroupNames}` | none | header strip/inject + optional body rewrite |
-| `llm_guardrail` | OnRequest | `llm.{model,request_prompt_raw}` | `llm_policy.{decision,reason}`, `llm.request_prompt` | none (model allowlist deny) |
-| `llm_response_parser` | OnResponse | `llm.provider`, `Input.{RespHeaders,RespBody,Status}` | `llm.{input,output,total,cached_input,cache_creation}_tokens`, `llm.response_completion` | none |
-| `cost_meter` | OnResponse | `llm.{provider,model}`, token buckets | `cost.usd_total` or `cost.skipped` | pricing lookup |
-| `llm_limit_record` | OnResponse | `llm.{attribution_group_id,attribution_window_seconds,input_tokens,output_tokens}`, `cost.usd_total` | none | gRPC `RecordLLMUsage` |
-
-[all_test.go:26–40](../../../proxy/internal/middleware/builtin/all_test.go)
-locks the ID set; adding or removing one is a conscious extension.
-
-## Files
-
-| File | LOC | Notes |
-|---|---:|---|
-| `builtin.go` | 86 | Registry + `FactoryContext` (ctx, data dir, meter, logger, mgmt client) |
-| `all_test.go` | 41 | Locks the 8-ID registry surface |
-| `agentnetwork_chain_integration_test.go` | 319 | Live sqlite + real gRPC bufconn; gate→recorder wire path |
-| `llm_request_parser/*` | 162 / 66 / 356 | Provider detection, body parse, prompt extraction with capture-pointer gating |
-| `llm_router/*` | 385 / 84 / 586 | Three-pass route selection (model → groups → path-prefix) |
-| `llm_limit_check/*` | 196 / 38 / 182 | Pre-flight `CheckLLMPolicyLimits` (2s, fail-open) |
-| `llm_identity_inject/*` | 440 / 108 / 666 | HeaderPair (LiteLLM) + JSONMetadata (Portkey) + ExtraHeaders |
-| `llm_guardrail/*` | 176 / 82 / 75 / 219 / 217 | Model allowlist + optional prompt capture with PII redaction |
-| `llm_response_parser/*` | 258 / 222 / 43 / 433 / 169 / 111 | Buffered + SSE accumulation; AWS event-stream accumulator (`streaming_bedrock.go`) for Bedrock; capture-pointer gates completion emit |
-| `cost_meter/*` | 181 / 84 / 439 | Token → USD via `proxy/internal/llm/pricing` |
-| `llm_limit_record/*` | 144 / 35 / 191 | Post-flight `RecordLLMUsage` (5s, debug-on-error) |
-
-## Per-middleware
-
-### llm_request_parser
-
-Detects the LLM provider via `llm.DetectParser` (URL sniff) or by name via
-`llm.ParserByName` when synthesiser stamps `provider_id`
-([middleware.go:96–99](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
-**Path-routed providers short-circuit first:** `parseVertexPath` and
-`parseBedrockPath` ([middleware.go:85–94](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go))
-pull the model + vendor out of the URL before parser selection runs — Vertex
-from `/v1/projects/.../publishers/{pub}/models/{model}:{action}` (publisher →
-vendor via `vertexPublisherVendor`), Bedrock from `/model/{id}/{action}` with
-`normalizeBedrockModel` stripping the region prefix + version suffix. See
-[50-path-routed-providers.md](./50-path-routed-providers.md) for the full path
-grammar. For body-routed providers it decodes the body into `RequestFacts`
-(model + stream) and extracts the prompt. On
-`capture_prompt=true` (or absent — see capture-pointer semantics below) the
-prompt is run through `llm_guardrail.RedactPII` when `redact_pii=true` and
-truncated rune-safely to 3500 bytes
-([middleware.go:109–122](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
-**Key invariant:** redaction is parser-side, not guardrail-side — access-log
-reads `llm.request_prompt_raw` directly.
-
-### llm_router
-
-Three-pass route selection in `matchRoute`
-([middleware.go:241–300](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
-filter by `Models` claim → vendor-pin (a vendor-tagged request never crosses to
-another vendor's route) → filter by `AllowedGroupIDs` intersection → model
-precedence over path → tie-break by longest `UpstreamPath` prefix match.
-Model-miss returns `llm_policy.model_not_routable`; known-but-unauthorised
-returns `llm_policy.no_authorised_provider`. **Key invariant:** auth-header
-strip+inject rides on `UpstreamRewrite.{StripHeaders,AuthHeader}`
-([middleware.go:606–646](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
-— NOT `HeadersAdd/HeadersRemove` — because the framework's mutation gate
-blocks `Authorization` on the generic header path.
-
-**Path-routed providers route before the model table.** `Invoke` checks
-`isVertexPath` / `isBedrockPath`
-([middleware.go:138–216](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
-ahead of the model lookup, so a path-carried model can't be claimed by a
-same-vendor body-routed provider. `matchPathRoute` enforces the route's `Models`
-allowlist (empty = catch-all) even though the model came from the URL.
-Two path-only behaviours:
- **Vertex unmeterable publisher** — when `llm_request_parser` emits no
-  `llm.provider` (e.g. Gemini/`google`), the router denies with
-  `llm_policy.unmeterable_publisher` (403) rather than forward it uncounted.
- **GCP token minting** — when the route carries `GCPServiceAccountKeyB64`
-  (set from a `keyfile::` api_key), `gcpBearer` mints + caches a short-lived
-  OAuth2 token per request instead of injecting a static value; a bad key or
-  unreachable token endpoint denies with `llm_policy.upstream_auth_failed`
-  (502). Bedrock uses its static bearer token directly (no minting).
- **`/bedrock` prefix** — an optional `/bedrock` gateway-namespace prefix is
-  accepted and stripped via `RewriteUpstream.StripPathPrefix` so the native
-  `/model/...` path reaches the upstream.
-
-Full treatment in [50-path-routed-providers.md](./50-path-routed-providers.md).
-
-### llm_limit_check
-
-Pre-flight gate. Reads `llm.resolved_provider_id`, calls
-`CheckLLMPolicyLimits` with a 2s context timeout
-([middleware.go:24, 97–106](../../../proxy/internal/middleware/builtin/llm_limit_check/middleware.go)),
-on allow stamps `llm.selected_policy_id`, `llm.attribution_group_id`,
-`llm.attribution_window_seconds`. **Key invariant:** fail-open. Nil
-`MgmtClient`, empty provider id, or RPC error returns `allowNoAttribution()`
-— management outage doesn't take down every LLM request. Operators audit via
-the access-log; a future flag may switch this to fail-closed.
-
-### llm_identity_inject
-
-Dispatches per-rule between LiteLLM-shaped `HeaderPair`
-([middleware.go:169](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go))
-and Portkey-shaped `JSONMetadata`
-([middleware.go:292](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go)).
-Identity is the peer's email (or `UserID` fallback); tags are the
-**authorising-groups intersection** emitted by `llm_router`, not the full
-`UserGroups` — a peer in 5 groups authorised under 1 only tags as that 1.
-**Anti-spoof:** every `HeadersAdd` is preceded by a `HeadersRemove` of the
-same name; the framework runs `Remove` before `Add` so client-supplied
-identity never reaches the upstream. Body-level inject (`tags_in_body`,
-`end_user_id_in_body`) is skipped on empty / truncated / non-JSON bodies so
-header attribution stays intact.
-
-### llm_guardrail
-
-Model allowlist deny + optional prompt-capture-with-redaction. Allowlist
-match is case-insensitive via `normaliseModel`; empty allowlist disables the
-check. Prompt capture reads `llm.request_prompt_raw` and emits
-`llm.request_prompt` only when `prompt_capture.enabled`
-([middleware.go:149–165](../../../proxy/internal/middleware/builtin/llm_guardrail/middleware.go)).
-**Key invariant:** `RedactPII` is the exported function the parsers call —
-single PII contract across all three keys.
-
-### llm_response_parser
-
-Buffered and SSE paths share one `Invoke`
-([middleware.go:102–127](../../../proxy/internal/middleware/builtin/llm_response_parser/middleware.go)):
-content-type sniffing dispatches to `invokeBuffered` (JSON, status<400) or
-`invokeStreaming` (text/event-stream, partial bodies tolerated). Streaming
-delegates to `accumulateStream`
-([streaming.go:21–30](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go))
-using `llm.NewScanner`. A third path, `accumulateBedrockStream`
-([streaming_bedrock.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming_bedrock.go)),
-decodes the AWS binary event-stream (`application/vnd.amazon.eventstream`)
-returned by Bedrock's `-stream` actions — InvokeModel `chunk` frames wrap a
-base64 Anthropic event, Converse frames carry text + a trailing usage block.
-Cached / cache-creation buckets emit only when non-zero, preserving the existing
-token schema.
-
-### cost_meter
-
-Reads `llm.provider` + `llm.model` + token buckets, looks up per-1k rate via
-`pricing.Loader`, emits `cost.usd_total` or a closed-set `cost.skipped`
-reason (`missing_provider/model/tokens`, `unparseable_tokens`, `zero_tokens`,
-`unknown_model`). Loader's hot-reload goroutine is bound to proxy-lifetime
-context via `startReloader`. **Key invariant:** provider-shape switch lives
-in `pricing.Table.Cost` (sibling doc) — `cost_meter` stays provider-agnostic.
-
-### llm_limit_record
-
-Post-flight write. Always returns `DecisionAllow`; response has already been
-served so RPC errors mustn't surface (logged at `Debugf`). Skip-on-no-signal
-at line 81 (zero tokens + zero cost). **Key invariant:** the
-skip-on-missing-attribution guard at line 98 is a safety net independent of
-the framework's deny short-circuit — if the gate denied and the framework
-still runs the recorder, the recorder skips on absent
-`UserID`+`groupID`+`UserGroups` and no phantom counter materialises.
-
-## Full-chain diagram (canonical order)
-
-```mermaid
-flowchart TD
-    A[HTTP request] --> B[llm_request_parser<br/>OnRequest]
-    B -->|llm.provider, llm.model,<br/>llm.stream, llm.request_prompt_raw| C[llm_router<br/>OnRequest]
-    C -->|llm.resolved_provider_id,<br/>llm.authorising_groups,<br/>upstream rewrite + auth| D[llm_limit_check<br/>OnRequest]
-    D -->|deny path| Z1[403 llm_policy.*]
-    D -->|allow + llm.selected_policy_id,<br/>llm.attribution_group_id,<br/>llm.attribution_window_seconds| E[llm_identity_inject<br/>OnRequest]
-    E -->|header strip+inject<br/>+ optional body rewrite| F[llm_guardrail<br/>OnRequest]
-    F -->|deny: model_blocked| Z2[403 llm_policy.model_blocked]
-    F -->|allow + llm.request_prompt| G[upstream LLM call]
-    G --> H[llm_response_parser<br/>OnResponse]
-    H -->|llm.{input,output,total,cached_input,cache_creation}_tokens,<br/>llm.response_completion| I[cost_meter<br/>OnResponse]
-    I -->|cost.usd_total or cost.skipped| J[llm_limit_record<br/>OnResponse]
-    J --> K[response to client]
-```
-
-## limit_check ⇒ limit_record record-once invariant
-
-```mermaid
-sequenceDiagram
-    participant LC as llm_limit_check
-    participant M as management gRPC
-    participant U as upstream LLM
-    participant LR as llm_limit_record
-    participant DB as sqlite consumption table
-
-    LC->>M: CheckLLMPolicyLimits (2s)
-    alt allow
-        M-->>LC: selected_policy_id, attribution_group_id, window_s
-        LC->>U: stamps attribution metadata
-        U-->>LR: response + tokens (via llm_response_parser + cost_meter)
-        LR->>M: RecordLLMUsage (5s, debug-on-error)
-        M->>DB: increment (user, group, window) row
-    else deny
-        M-->>LC: llm_policy.token_cap_exceeded
-        Note over LR: framework short-circuits; even if invoked,<br/>recorder skips on absent UserID+groupID+UserGroups
-    else mgmt nil / rpc error
-        LC-->>LC: allowNoAttribution() — fail open
-        Note over LR: no window_s ⇒ recorder books only account-level<br/>budget rules (which run independently)
-    end
-```
-
-The integration test
-[agentnetwork_chain_integration_test.go](../../../proxy/internal/middleware/builtin/agentnetwork_chain_integration_test.go)
-exercises all three branches against a real sqlite store + bufconn gRPC —
-no mocks. Tests: `TestChain_AllowPath_StampsAttributionAndRecordsCounter`
-(line 130), `TestChain_DenyPath_GateRejectsAndNoConsumptionWritten` (line
-207), `TestChain_CapExhaustTransition` (line 265).
-
-## Public contracts (per-middleware JSON config)
-
-| Middleware | Config shape |
-|---|---|
-| `llm_request_parser` | `{provider_id?, redact_pii?, capture_prompt?: *bool}` ([factory.go:19–37](../../../proxy/internal/middleware/builtin/llm_request_parser/factory.go)) |
-| `llm_router` | `{providers: [{id, models, upstream_scheme, upstream_host, upstream_path?, auth_header_name, auth_header_value, allowed_group_ids}]}` |
-| `llm_limit_check` | `{}` — pulls `MgmtClient` from `FactoryContext` |
-| `llm_identity_inject` | `{providers: [{provider_id, header_pair?|json_metadata?, extra_headers?}]}` |
-| `llm_guardrail` | `{model_allowlist: []string, prompt_capture: {enabled, redact_pii}}` |
-| `llm_response_parser` | `{redact_pii?, capture_completion?: *bool}` |
-| `cost_meter` | `{pricing_path?}` (basename inside data-dir; defaults `pricing.yaml`) |
-| `llm_limit_record` | `{}` — same pattern as `llm_limit_check` |
-
-All factories accept empty / null / `{}` / whitespace as zero-value config;
-only structurally invalid JSON is rejected so misconfig surfaces at chain
-build time.
-
-## Invariants
-
-1. **limit_check ↔ limit_record paired.** They MUST appear together. Gate
-   stamps attribution metadata on the request leg; recorder reads it on the
-   response leg. If a chain contains only the recorder, the
-   skip-on-missing-attribution guard at
-   [llm_limit_record/middleware.go:81–87, 98–103](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go)
-   keeps counters consistent but no enforcement runs. Only-gate means
-   counters never tick and headroom appears infinite.
-
-2. **`capture_prompt` / `capture_completion` pointer semantics.** Both are
-   `*bool`. `nil` = "preserve legacy emit" (back-compat default for
-   non-agent-network callers and pre-toggle tests). `false` = suppress the
-   key entirely (access-log row carries zero prompt / completion content).
-   `true` = emit. The synthesiser sets the pointer explicitly to the
-   account's `EnablePromptCollection` toggle. The handling lives
-   in [llm_request_parser/factory.go:55–61](../../../proxy/internal/middleware/builtin/llm_request_parser/factory.go)
-   and the symmetric [llm_response_parser/middleware.go:62–68](../../../proxy/internal/middleware/builtin/llm_response_parser/middleware.go);
-   a missing pointer must not be treated as `false` (that would suppress
-   capture for legacy non-agent-network callers).
-   `redact_pii` is an orthogonal `bool` controlling **form** of emitted
-   content, not whether it's emitted.
-
-3. **`redact_pii` is parser-side.** Both parsers import
-   `llm_guardrail.RedactPII` and run it BEFORE stamping the metadata bag.
-   Load-bearing because the access-log sink reads `llm.request_prompt_raw`
-   and `llm.response_completion` directly — by the time `llm_guardrail`
-   runs its own pass on `llm.request_prompt`, the raw key has already been
-   stamped. Tests: `TestInvoke_RedactPii_RedactsBeforeEmittingRawPrompt`,
-   `TestInvoke_RedactPii_RedactsCompletionBeforeEmit`.
-
-4. **Metadata allowlist enforcement.** Every middleware declares
-   `MetadataKeys()`. The framework accumulator drops any KV outside that
-   allowlist. When adding a new key, also extend the docstring in
-   `middleware/keys.go`.
-
-5. **Closed deny-code set.** All deny paths emit one of:
-   `llm_policy.model_not_routable`, `llm_policy.no_authorised_provider`,
-   `llm_policy.model_blocked`, `llm_policy.token_cap_exceeded`,
-   `llm_policy.unmeterable_publisher` (path-routed Vertex publisher with no
-   parser → 403), `llm_policy.upstream_auth_failed` (GCP token mint failure →
-   502), or the management-supplied code on `llm_limit_check`. These surface
-   verbatim; arbitrary middleware text never reaches the wire.
-
-## Things to scrutinise
-
-**Correctness.** `llm_router` model match treats an empty `Models` slice as
-"claim every model"
-([middleware.go:238–248](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
-for gateway-style providers — confirm no real provider record ships with an
-empty `Models` by accident. Path-prefix tie-break falls back to declaration
-order when no candidate prefix-matches, so the synthesiser must emit a
-deterministic order. `llm_limit_record` discards `strconv.ParseInt` errors
-([middleware.go:78–80](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go))
-— relies on `llm_response_parser` always emitting parseable values; spot-check
-the streaming partial path on truncated bodies.
-
-**Security.** Auth headers must NEVER appear on `Mutations.HeadersAdd/Remove`
-for the router — a direct headers path would bypass the framework gate. The
-capture-pointer handling is the kind of place a bug ships PII to logs
-silently; every synthesiser config path must set the pointer explicitly.
-`llm_identity_inject` body inject silently skips on a
-non-object `metadata` field
-([middleware.go:262–270](../../../proxy/internal/middleware/builtin/llm_identity_inject/middleware.go))
-— header path still attributes, but body-level tag-budget enforcement
-doesn't run for that request.
-
-**Concurrency.** `cost_meter` shares a `pricing.Loader` via
-`atomic.Pointer[Table]`; readers always see a consistent table. Every
-middleware is a stateless value receiver. Integration test uses real bufconn
-gRPC — race detector is the meaningful bar.
-
-**Perf.** Hot path is `lookupKV` linear scan over <10 KVs; `cost_meter.Cost`
-is O(1); SSE accumulation is single-pass. No map allocation per call.
-
-**Observability.** Every deny stamps `llm_policy.decision=deny` and a
-matching `llm_policy.reason` — access-log can pivot on either.
-`llm_limit_record` only logs at `Debugf` on RPC failure
-([middleware.go:125–130](../../../proxy/internal/middleware/builtin/llm_limit_record/middleware.go));
-operators need an alternate signal (metric on `RecordLLMUsage` failures) for
-counter accuracy.
-
-## Test coverage
-
-| File | Tests | Notes |
-|---|---:|---|
-| `all_test.go` | 1 | Registry surface lock |
-| `agentnetwork_chain_integration_test.go` | 3 | Allow/deny/cap-exhaust vs live sqlite + bufconn gRPC |
-| `llm_request_parser/middleware_test.go` | 18 | `provider_id` bypass, redaction, capture-pointer, rune-safe truncation |
-| `llm_router/middleware_test.go` | 19 | Three-pass match, deny codes, path-prefix tie-break, header strip+inject |
-| `llm_limit_check/middleware_test.go` | 6 | Allow/deny, fail-open on nil mgmt / RPC error, attribution stamping |
-| `llm_identity_inject/middleware_test.go` | 28 | HeaderPair, JSONMetadata, ExtraHeaders, body inject, anti-spoof |
-| `llm_guardrail/middleware_test.go` | 15 | Allowlist case-insensitivity, prompt capture toggle, deny shape |
-| `llm_guardrail/redact_test.go` | 15 | Email, SSN, phone (E.164 + NA), bearer, IPv4; fixture-driven |
-| `llm_response_parser/middleware_test.go` | 18 | Buffered OAI+Anthro, capture-pointer, redact, truncation |
-| `llm_response_parser/streaming_test.go` | 7 | OAI usage frame, Anthro message_delta, truncated body best-effort |
-| `cost_meter/middleware_test.go` | 17 | Each skip reason, provider-shape, pricing loader integration |
-| `llm_limit_record/middleware_test.go` | 7 | Skip-on-no-signal, skip-on-missing-attribution, RPC failure swallowed |
-
-## Cross-references
-
- Sibling: [32-proxy-llm-parsers.md](./32-proxy-llm-parsers.md) — SDK adapters
-  + SSE framer + pricing loader.
- Path-routed providers (Vertex AI + Bedrock), `keyfile::` credential, GCP
-  token minting, `/bedrock` prefix:
-  [50-path-routed-providers.md](./50-path-routed-providers.md).
- Upstream config: `management/server/agentnetwork/synthesizer` (out of scope).
- Framework: `proxy/internal/middleware/{chain,dispatcher,accumulator,registry}.go`.
- Metadata key registry: `proxy/internal/middleware/keys.go`.
- gRPC surface: `proto.ProxyServiceClient.{CheckLLMPolicyLimits,RecordLLMUsage}`.
--- a/docs/agent-networks/modules/32-proxy-llm-parsers.md
+++ b/docs/agent-networks/modules/32-proxy-llm-parsers.md
@@ -1,392 +0,0 @@
-# proxy/llm-parsers — SDK adapters + pricing + SSE
-
-The runtime-agnostic LLM library: the OpenAI Responses API (`/v1/responses`)
-and the older Chat Completions API (`/v1/chat/completions`), the Anthropic
-Messages API (`/v1/messages`), the SSE wire format (`event:` / `data:` lines,
-`\n\n` framing, CRLF tolerance), and per-provider token accounting (OpenAI's
-cached-prompt **subset** vs Anthropic's cache_read **additive** model). The
-pricing table's per-provider cost formula is the highest-leverage place a
-small bug would silently mis-bill operators.
-
-Sibling module: [31-proxy-middleware-builtin.md](./31-proxy-middleware-builtin.md)
-— the 8 middlewares that consume this package's parsers + pricing loader.
-
---
-
-## Module boundary
-
-`proxy/internal/llm` is the runtime-agnostic LLM library shared by every
-middleware that needs to understand provider-specific shapes. Zero
-proxy-framework dependencies:
-
- `parser.go` — `Parser` interface, `Provider` enum, public factories
-  (`Parsers`, `DetectParser`, `ParserByName`).
- `openai.go` / `anthropic.go` / `bedrock.go` — per-provider `Parser` impls.
- `sse.go` — SSE scanner (`Scanner`, `Event`, `NewScanner`).
- `errors.go` — sentinels callers branch on with `errors.Is`.
- `pricing/` — embedded-default + hot-reload override table with
-  symlink-safe Unix loader (build-tagged stub elsewhere).
- `fixtures/` — captured request/response/stream bodies the tests replay.
-
-The package carries zero proxy-framework dependencies so the same parsers can
-be reused later by a WASM adapter
-([parser.go:1–6](../../../proxy/internal/llm/parser.go)).
-
-## Files
-
-| File | LOC | Notes |
-|---|---:|---|
-| `parser.go` | 104 | Interface + factories + `Provider{Unknown,OpenAI,Anthropic}` enum |
-| `openai.go` | 347 | Chat Completions + Completions + Responses API; cached_tokens subset |
-| `openai_test.go` | 222 | 11 tests; fixture replay + cached/Responses-API matrix |
-| `anthropic.go` | 172 | Messages + legacy `/v1/complete`; cache_read + cache_creation additive |
-| `anthropic_test.go` | 154 | 7 tests including streaming-extraction-skipped contract |
-| `bedrock.go` | 190 | AWS Bedrock InvokeModel (snake_case) + Converse (camelCase) response shapes; model lives in URL path |
-| `bedrock_test.go` | — | InvokeModel + Converse usage shapes; AWS event-stream content-type → `ErrStreamingUnsupported` on buffered `ParseResponse` |
-| `sse.go` | 117 | `bufio`-backed scanner; CRLF normalised; trailing-event handling |
-| `sse_test.go` | 175 | 12 tests; fixture replay + multiline + size limits |
-| `parser_test.go` | 53 | `Parsers()`, `DetectParser`, provider enum values |
-| `errors.go` | 31 | 6 sentinels: `Err{Unknown,Unsupported}Provider/Model`, `Err{NotLLM,Malformed}Response`, `ErrStreamingUnsupported`, `ErrMalformedRequest` |
-| `pricing/pricing.go` | 421 | `Loader`, `Table`, `Entry`; embedded defaults + atomic swap + mtime reload |
-| `pricing/pricing_unix.go` | 69 | `O_NOFOLLOW` + fstat-from-FD + 1 MiB cap |
-| `pricing/pricing_other.go` | 21 | Stub returning "not supported on this platform" |
-| `pricing/pricing_test.go` | 432 | 21 tests — symlink rejection, reload race, path traversal, oversize |
-| `pricing/defaults_pricing.yaml` | 85 | go:embed source of truth |
-| `fixtures/*` | 21–59 | OAI chat/responses/stream + Anthro messages/stream + pricing starter |
-
-## Request body → parser dispatch
-
-```mermaid
-flowchart TD
-    A[HTTP request<br/>URL + JSON body] --> B{ParserByName?<br/>provider_id config set}
-    B -- yes --> P[matched Parser]
-    B -- no --> C[DetectParser]
-    C --> D{loop Parsers<br/>OpenAIParser, AnthropicParser}
-    D -- DetectFromURL match --> P
-    D -- no match --> X[ok=false<br/>middleware skips]
-    P --> E[ParseRequest body]
-    E -->|err: ErrMalformedRequest| Y[middleware emits provider only]
-    E --> F[RequestFacts<br/>model + stream]
-    P --> G[ExtractPrompt body]
-    G --> H[joinMessages<br/>extractContentParts<br/>decodeStringOrJoin]
-    H --> I[prompt text<br/>or empty]
-    F --> J[stamps llm.model + llm.stream]
-    I --> K[stamps llm.request_prompt_raw<br/>subject to capture_prompt gate]
-```
-
-OpenAI's URL hints
-([openai.go:27–33](../../../proxy/internal/llm/openai.go)) include
-both `/v1/chat/completions` and the bare `/chat/completions` — the latter
-covers Cloudflare AI Gateway, which rewrites the canonical version segment.
-Anthropic's hints are `/v1/messages` and `/v1/complete`
-([anthropic.go:14–17](../../../proxy/internal/llm/anthropic.go)).
-Both implementations use case-insensitive substring matching so a proxy prefix
-strip / rewrite doesn't defeat detection.
-
-`ParserByName` ([parser.go:93–103](../../../proxy/internal/llm/parser.go))
-is the **agent-network bypass**: the synthesiser knows which parser to use
-because it built the synth service from the catalog, so it stamps
-`provider_id` on the parser config and the middleware skips URL sniffing
-entirely. This is what makes the same parser set work whether the request
-flows to OpenAI direct, to LiteLLM, to Portkey, or to any gateway with a
-non-canonical URL shape.
-
-**Path-routed providers (Vertex AI, Bedrock) bypass both `ParserByName` and
-`DetectParser`.** The model and the parser surface live in the URL path, so the
-request middleware extracts them directly (`parseVertexPath` /
-`parseBedrockPath`) before the parser-selection step. For Vertex the publisher
-segment picks the parser (`anthropic` → Anthropic parser; `google`/Gemini →
-none, request denied as unmeterable). For Bedrock the dedicated `BedrockParser`
-handles the response. Full treatment in
-[50-path-routed-providers.md](./50-path-routed-providers.md).
-
-## Streaming response → SSE chunker → response parser → completion + token count
-
-```mermaid
-sequenceDiagram
-    participant U as upstream LLM
-    participant LR as llm_response_parser<br/>(OnResponse)
-    participant S as llm.NewScanner<br/>(SSE framer)
-    participant P as Parser-specific accumulator<br/>(accumulateOpenAIStream<br/>or accumulateAnthropicStream)
-
-    U-->>LR: text/event-stream<br/>(buffered prefix in RespBody)
-    LR->>S: NewScanner(bytes.NewReader(body))
-    loop until EOF or [DONE]
-        S-->>LR: Event{Type, Data}
-        LR->>P: dispatch per event.Type<br/>(OpenAI: data-only<br/>Anthropic: named events)
-        P-->>P: accumulate completion text<br/>track usage from final frame
-    end
-    P-->>LR: llm.Usage + completion string
-    LR->>LR: appendUsage stamps<br/>llm.{input,output,total,cached_input,cache_creation}_tokens
-    LR->>LR: truncateCompletion(3500 bytes, rune-safe)
-    LR->>LR: redactPII if redact_pii && captureCompletion
-```
-
-`Scanner.Next`
-([sse.go:44–87](../../../proxy/internal/llm/sse.go)) returns one
-event per `\n\n` boundary; multiple `data:` lines join with `\n`; comment lines
-(starting with `:`) are skipped per the SSE spec; a trailing event without a
-closing blank line is still returned before `io.EOF` so a server that closes
-the connection cleanly doesn't lose the last frame
-([sse.go:55–58](../../../proxy/internal/llm/sse.go)). CRLF is
-normalised in `trimEOL` so fixtures captured from live servers replay
-unchanged.
-
-## Per-provider
-
-### OpenAI
-
-[openai.go:54–67](../../../proxy/internal/llm/openai.go) defines
-`openAIRequest` with three prompt fields: `messages` (Chat Completions),
-`prompt` (legacy), `input` (Responses API). The decoder uses
-`json.RawMessage` so each shape is parsed lazily.
-
-`ParseResponse`
-([openai.go:117–146](../../../proxy/internal/llm/openai.go))
-accepts both naming conventions: Chat Completions returns
-`prompt_tokens`/`completion_tokens`, Responses API returns
-`input_tokens`/`output_tokens`. `pickInt64` prefers Responses-API names and
-falls back — same parser handles both endpoints without per-route config.
-`openAICachedTokens` mirrors the fallback for
-`input_tokens_details.cached_tokens` vs `prompt_tokens_details.cached_tokens`.
-
-**Key invariant:** `CachedInputTokens` for OpenAI is a SUBSET of
-`InputTokens`. The cost meter clamps to guard against malformed upstream
-responses where `cached > total`.
-
-### Anthropic
-
-[anthropic.go:37–49](../../../proxy/internal/llm/anthropic.go)
-defines `anthropicRequest` covering Messages API (`system` + `messages[]`)
-and legacy `/v1/complete` (`prompt` string). `ExtractPrompt` emits
-`system: <text>` first when present, then per-message `role: content`.
-
-`ParseResponse`
-([anthropic.go:82–104](../../../proxy/internal/llm/anthropic.go))
-fills three independent token buckets: `InputTokens`, `CacheReadInputTokens`,
-`CacheCreationInputTokens`. Latter two are **additive** (not subset).
-`TotalTokens` sums all four so downstream dashboards render one "tokens"
-number without double-counting.
-
-`ExtractCompletion` walks `content[]` `{type, text}` parts and concatenates
-non-empty text with newlines, falling back to legacy `completion`.
-
-### Bedrock
-
-[bedrock.go](../../../proxy/internal/llm/bedrock.go) implements the
-`Parser` interface for the AWS Bedrock runtime. Bedrock is **path-routed**: the
-model lives in the URL (`/model/{id}/{action}`), so the request middleware
-extracts it (see [50-path-routed-providers.md](./50-path-routed-providers.md))
-and `ParseRequest` is a deliberate no-op. The parser's real work is on the
-response leg, covering both Bedrock body shapes:
-
- **InvokeModel** — vendor-native. Anthropic-on-Bedrock returns snake_case usage
-  (`input_tokens`, `output_tokens`, `cache_read_input_tokens`,
-  `cache_creation_input_tokens`) with the same additive cache buckets as
-  first-party Anthropic.
- **Converse** — unified camelCase (`inputTokens`, `outputTokens`,
-  `totalTokens`). `firstNonZero` folds the two naming conventions into one
-  `Usage`; when Converse omits `totalTokens` the parser sums the buckets.
-
-`ProviderName()` returns `"bedrock"` — its own `defaults_pricing.yaml` block,
-keyed by the **normalised** model id (region prefix + version suffix stripped by
-the request parser). `ParseResponse` returns `ErrStreamingUnsupported` for an
-AWS binary event-stream content-type (`application/vnd.amazon.eventstream`,
-`isAWSEventStream`) so the caller routes to the streaming accumulator instead.
-
-### SSE framing
-
-`Scanner` is `bufio`-backed, 64 KiB read buffer, 1 MiB max line so a
-malicious upstream can't blow process memory
-([sse.go:33–38, 97–100](../../../proxy/internal/llm/sse.go)).
-`splitField` strips one space after the `:` per the SSE spec. Documented
-`not safe for concurrent use`; every consumer creates a fresh scanner per
-response body. Streaming accumulators live in the middleware package
-([llm_response_parser/streaming.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go))
-but use `llm.NewScanner` so the framing contract stays here.
-
-### Pricing catalog
-
-`Table.Cost`
-([pricing.go:129–174](../../../proxy/internal/llm/pricing/pricing.go))
-is the cost formula — most security-relevant math in this module:
-
-| Provider | Formula |
-|---|---|
-| `openai` | `(inTokens − clamped) × InputPer1K + clamped × CachedInputPer1K + outTokens × OutputPer1K` where `clamped = min(cachedInput, inTokens)` |
-| `anthropic`, `bedrock` | `inTokens × InputPer1K + cachedInput × CacheReadPer1K + cacheCreation × CacheCreationPer1K + outTokens × OutputPer1K` |
-| default | `inTokens × InputPer1K + outTokens × OutputPer1K` |
-
-`bedrock` shares the Anthropic additive-cache formula
-([pricing.go:172-174](../../../proxy/internal/llm/pricing/pricing.go)):
-Anthropic-on-Bedrock reports the same additive cache buckets, while non-Anthropic
-Bedrock models (Nova, Llama) simply report zero in those buckets so cost reduces
-to `input + output`.
-
-Each per-bucket rate falls back to `InputPer1K` when zero — operators opt in
-to discounts by setting the field.
-
-`Loader`
-([pricing.go:212–268](../../../proxy/internal/llm/pricing/pricing.go))
-overlays an optional `pricing.yaml` from data-dir on top of the go:embed
-defaults. Atomic pointer swap means readers never observe a partial update.
-The mtime-poll reloader (30s default cadence) keeps the previous table on
-parse failure so cost annotation never goes blank during a botched edit.
-
-`defaults_pricing.yaml` is the source of truth for built-in pricing.
-Operator overrides only carry the entries they want to change.
-
-## Public contracts
-
-**`Parser` interface**
-([parser.go:50–66](../../../proxy/internal/llm/parser.go)):
-
-```go
-type Parser interface {
-    Provider() Provider
-    ProviderName() string
-    DetectFromURL(path string) bool
-    ParseRequest(body []byte) (RequestFacts, error)
-    ParseResponse(status int, contentType string, body []byte) (Usage, error)
-    ExtractPrompt(body []byte) string
-    ExtractCompletion(status int, contentType string, body []byte) string
-}
-```
-
-Adding a provider means implementing this interface and appending to the
-slice returned by `Parsers()` ([parser.go:78–84](../../../proxy/internal/llm/parser.go)).
-Order matters: `DetectFromURL` ties resolve by registration order.
-`Parsers()` today returns `{OpenAIParser, AnthropicParser, BedrockParser}`.
-
-**`Provider` enum**
-([parser.go:8–18](../../../proxy/internal/llm/parser.go)):
-`ProviderUnknown = 0`, `ProviderOpenAI = 1`, `ProviderAnthropic = 2`,
-`ProviderBedrock = 3`. Numeric values are persisted in nothing today but treat
-them as wire-stable — new providers must take fresh numbers.
-
-**`Pricing` lookup**
-([pricing.go:129](../../../proxy/internal/llm/pricing/pricing.go)):
-
-```go
-func (t *Table) Cost(provider, model string, inTokens, outTokens, cachedInput, cacheCreation int64) (float64, bool)
-```
-
-Nil-safe: `t.Cost` on a nil receiver returns `(0, false)`
-([pricing.go:130–132](../../../proxy/internal/llm/pricing/pricing.go)).
-`ok=false` means provider or model is absent from the loaded table; the caller
-emits `cost.skipped=unknown_model`.
-
-## Invariants
-
-1. **Cross-platform pricing build.** `pricing_unix.go` carries the only
-   functional `loadPricing` (uses `syscall.O_NOFOLLOW` and `f.Stat()` on an
-   open descriptor — both Unix-only). `pricing_other.go` is a build-tag
-   fallback that returns `"not supported on this platform"`
-   ([pricing_other.go:14–16](../../../proxy/internal/llm/pricing/pricing_other.go)).
-   The proxy is Linux-only in production today; a Windows port needs an
-   equivalent path-as-handle implementation. Reviewers building on Windows
-   should expect this surface to return an error at startup if an override
-   file is configured.
-
-2. **SSE scanner handles partial chunks.** A buffered prefix that doesn't end
-   in `\n\n` still yields its accumulated event before `io.EOF`
-   ([sse.go:55–58](../../../proxy/internal/llm/sse.go)). Tests:
-   `TestSSEScanner_OpenAIFixture`, `TestSSEScanner_AnthropicFixture`,
-   `TestSSEScanner_MultilineData`, `TestSSEScanner_CRLF`. The streaming
-   accumulators ride on this: `accumulateAnthropicStream` and
-   `accumulateOpenAIStream` `break` on any scanner error to return partial
-   usage rather than aborting
-   ([streaming.go:68–73, 144–150](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming.go)).
-
-3. **`defaults_pricing.yaml` is the source of truth.** Compiled into the
-   binary via `//go:embed`
-   ([pricing.go:29–30](../../../proxy/internal/llm/pricing/pricing.go)).
-   `DefaultTable()` parses once and panics on parse failure
-   ([pricing.go:42–49](../../../proxy/internal/llm/pricing/pricing.go))
-   — by design: a broken embedded YAML must not ship to production.
-
-4. **Loader path validation.** `resolveMiddlewareDataPath`
-   ([pricing.go:370–394](../../../proxy/internal/llm/pricing/pricing.go))
-   rejects absolute paths, traversal segments, and basenames that fail
-   `basenameRegex = ^[a-zA-Z0-9._-]+$`. The resolved path must remain
-   inside `baseDir` even after `filepath.Clean`. Tests:
-   `TestNewLoader_PathValidation`, `TestNewLoader_PathValidation_Extended`,
-   `TestNewLoader_SymlinkOutsideBaseDirRejected`, `TestNewLoader_SymlinkRejected`.
-
-5. **Unix loader symlink safety.** `O_NOFOLLOW` on open, `f.Stat()` on the
-   open descriptor (never re-stat by path), `info.Mode().IsRegular()` check,
-   `io.LimitReader(f, maxPricingBytes+1)` with a final size assertion
-   ([pricing_unix.go:25–57](../../../proxy/internal/llm/pricing/pricing_unix.go)).
-   A mid-read symlink swap is detected because the fstat is on the original
-   fd. Test: `TestNewLoader_RejectsOversizedFile_FixesM4`.
-
-6. **`yaml.NewDecoder(...).KnownFields(true)`**
-   ([pricing.go:397–398](../../../proxy/internal/llm/pricing/pricing.go))
-   rejects YAML files that carry fields not in the schema. A typo in an
-   operator override file fails loud instead of silently zeroing rates.
-
-## Things to scrutinise
-
-**Correctness.** Verify OpenAI cached-prompt clamp at
-[pricing.go:147–149](../../../proxy/internal/llm/pricing/pricing.go)
-short-circuits before subtraction. `Anthropic.TotalTokens` sums all four
-buckets (in + out + cache_read + cache_creation) — downstream dashboards
-need to know this differs from `input + output`.
-`OpenAIParser.ExtractPrompt` falls through `messages → input → prompt`; a
-request sending all three reports only `messages` (uncommon but worth
-noting).
-
-**Security.** `Scanner.maxLine = 1 MiB`; a 2 MiB single-line `data:` event
-errors from `Scanner.Next` and both accumulators stop with partial usage.
-Pricing file 1 MiB cap is orders of magnitude larger than realistic. Confirm
-new schema additions are mirrored in both `pricingFile` and `Entry`;
-`KnownFields(true)` will reject silently-typo'd operator overrides
-otherwise.
-
-**Concurrency.** `Loader.table` is `atomic.Pointer[Table]`; readers never
-block or see a torn table. `Loader.Reload` is one goroutine, cancelled via
-context (`TestLoader_ReloadBackgroundLoopCancellation`). `DefaultTable()`
-uses `sync.Once`. Per-call `Scanner` instances mean no shared state across
-concurrent response-parser calls.
-
-**Perf.** `Table.Cost` is two map lookups + multiplications, O(1).
-`Scanner.Next` is one `ReadString('\n')` per line. Pricing reload poll 30s.
-
-**Observability.** Reload failures count via `metric.Int64Counter` keyed
-`plugin`; warning log rate-limited at 5 min so a broken file doesn't flood.
-Parser errors return sentinels — middleware uses `errors.Is` to map to the
-right `cost.skipped` reason.
-
-## Test coverage
-
-| File | Tests | Coverage highlights |
-|---|---:|---|
-| `parser_test.go` | 3 | `Parsers()` shape lock, `DetectParser` URL matrix, provider enum stability |
-| `openai_test.go` | 11 | Chat Completions + Responses API + legacy `prompt`; cached-tokens subset for both naming conventions; fixture replays |
-| `anthropic_test.go` | 7 | Messages + legacy `/v1/complete`; streaming REJECTED on `ParseResponse` (must use scanner); fixture replays |
-| `sse_test.go` | 12 | Fixture replay both providers; multiline `data:`; CRLF; comment skip; trailing-event-without-blank-line; oversize rejection |
-| `pricing/pricing_test.go` | 21 | Provider-shape switch; cached-rate fallback; cached-clamp; symlink rejection (target outside basedir + symlink to file); path validation matrix; oversize rejection; reload-keeps-previous-on-parse-error; mtime change detection; goroutine cancellation |
-
-**Fixtures** ([proxy/internal/llm/fixtures/](../../../proxy/internal/llm/fixtures/)):
-`openai_chat_completion.json` (chat.completions with usage),
-`openai_responses.json` (Responses API shape),
-`openai_stream.txt` (3 deltas + usage + `[DONE]`),
-`anthropic_messages.json` (Messages API non-streaming),
-`anthropic_stream.txt` (full 7-event sequence: message_start →
-content_block_{start,delta×2,stop} → message_delta (usage) → message_stop),
-`pricing.yaml` (realistic-pricing starter for operator overrides).
-
-## Cross-references
-
- Sibling: [31-proxy-middleware-builtin.md](./31-proxy-middleware-builtin.md)
-  — the chain that calls `llm.Parsers()`, `llm.ParserByName`,
-  `llm.NewScanner`, `pricing.NewLoader`.
- Path-routed providers (Vertex AI + Bedrock), credential syntax, and the
-  Bedrock AWS event-stream accumulator:
-  [50-path-routed-providers.md](./50-path-routed-providers.md).
- Direct callers: `llm_request_parser/middleware.go:82–94`,
-  `llm_response_parser/middleware.go:113–123`,
-  `llm_response_parser/streaming.go:65, 142`, `cost_meter/factory.go:49–57`.
- Related elsewhere: the agent-network synthesiser stamping `provider_id`
-  is covered in the management-side module guide; proxy server boot +
-  `FactoryContext` construction is covered in the proxy-framework guide.
--- a/docs/agent-networks/modules/33-proxy-runtime.md
+++ b/docs/agent-networks/modules/33-proxy-runtime.md
@@ -1,194 +0,0 @@
-# proxy/runtime — translate + serve + log
-
-> **Risk level:** High — every config push from management is translated here, and the chain runs on every HTTP request to a synth target.
-> **Backward-compat impact:** Additive at the wire (`PathTargetOptions.middlewares`, `agent_network`, `disable_access_log`, capture caps) and on the proxy `Server` struct (`MiddlewareDataDir`, `MiddlewareCaptureBudgetBytes`). Non-agent-network targets stay on the no-middleware fast path.
-
-## Module boundary
-
-Turns the synth-service wire format from `ProxyService.SyncMappings`/`GetMappingUpdate` into in-process middleware chains and runs them on top of the existing `httputil.ReverseProxy`. Four concerns: (a) **translate** — `proto.MiddlewareConfig` → validated `middleware.Spec` (proxy/middleware_translate.go) + self-register the eight built-ins (proxy/middleware_register.go); (b) **boot + rebuild** — construct the `middleware.Manager`, share the OTel meter, install the live-service check, rebuild per-path chains on every `addMapping`/`modifyMapping` (proxy/server.go); (c) **serve** — resolve chain at request time, capture bodies under a global budget, invoke `RunRequest`/`RunResponse`/`RunTerminal`, render deny responses, apply `UpstreamRewrite` (proxy/internal/proxy/reverseproxy.go); (d) **log + tag** — emit access-log entries with the new `agent_network` flag, gate emission on `EnableLogCollection` via `DisableAccessLog` (proxy/internal/accesslog).
-
-**Inert for non-agent-network targets**: nil or empty chain → existing fast path (reverseproxy.go:127-139); `SuppressAccessLog` defaults false so the access-log middleware emits unchanged.
-
-## Files
-
-| Path | Role |
-| ---- | ---- |
-| proxy/middleware_translate.go | proto→Spec translation; slot/failmode/timeout mapping; caps |
-| proxy/middleware_translate_test.go | translator unit tests |
-| proxy/middleware_register.go | blank-imports the eight builtins for `init()` registration |
-| proxy/server.go | `initMiddlewareManager`, `rebuildMiddlewareChains`, `isLiveService`, `buildMiddlewareBindings`, new Server fields, `protoToMapping` stamps AgentNetwork/DisableAccessLog/CaptureConfig/Middlewares |
-| proxy/internal/proxy/reverseproxy.go | `WithMiddlewareManager`, chain dispatch, body capture, `applyUpstreamRewrite`/`Headers`, `buildRequestInput`, response-leg respInput identity fields |
-| proxy/internal/proxy/reverseproxy_test.go | `TestBuildRequestInput_PropagatesIdentityAndGroups` |
-| proxy/internal/proxy/context.go | `agentNetwork`, `suppressAccessLog`, `userGroupNames` on `CapturedData` |
-| proxy/internal/proxy/servicemapping.go | new `PathTarget` fields |
-| proxy/internal/proxy/agent_network_chain_realstack_test.go | end-to-end self-contained chain test |
-| proxy/internal/accesslog/logger.go | `logEntry.AgentNetwork` → `proto.AccessLog` |
-| proxy/internal/accesslog/middleware.go | reads `GetAgentNetwork()`; gates `l.log` on `!GetSuppressAccessLog()` |
-| proxy/internal/accesslog/middleware_test.go | suppress/default/preserves-usage assertions |
-| proxy/internal/auth/middleware_test.go | tunnel-peer group propagation contract |
-| proxy/internal/metrics/metrics.go | `Meter()` getter for the middleware manager |
-
-## Architecture & flow
-
-### Synth-service ingestion → translate → register → serve
-
-```mermaid
-flowchart TD
-    A[Management SyncMappings/GetMappingUpdate] --> B["processMappings\nserver.go:1492"]
-    B --> C{Mapping type}
-    C -->|CREATED| D["addMapping → setupHTTPMapping → updateMapping"]
-    C -->|MODIFIED| E["modifyMapping → cleanupMappingRoutes → setupHTTPMapping → updateMapping"]
-    C -->|REMOVED| F["removeMapping → cleanupMappingRoutes → invalidateMiddlewareChains"]
-    D --> G["protoToMapping\nserver.go:2181"]
-    E --> G
-    G --> H["translateMiddlewareConfigs\nmiddleware_translate.go:55"]
-    G --> I["translateMiddlewareCaptureConfig\nmiddleware_translate.go:18"]
-    H --> J["[]middleware.Spec on PathTarget"]
-    I --> K["*bodytap.Config on PathTarget"]
-    J --> L["proxy.AddMapping\nservicemapping.go:118"]
-    K --> L
-    L --> M["rebuildMiddlewareChains\nserver.go:2017 → Manager.Rebuild"]
-    F --> N["Manager.Invalidate(serviceID)"]
-```
-
-### Per-request lifecycle through the chain + accesslog
-
-```mermaid
-sequenceDiagram
-    autonumber
-    participant C as Client
-    participant M as accesslog.Middleware
-    participant A as auth.Middleware (Protect)
-    participant RP as ReverseProxy.ServeHTTP
-    participant CH as middleware.Chain
-    participant U as Upstream
-    C->>M: HTTP request
-    M->>M: NewCapturedData(requestID), WithCapturedData(ctx)
-    M->>A: next.ServeHTTP
-    A->>A: Private → ValidateTunnelPeer → stamp UserID/Email/Groups/GroupNames/AuthMethod
-    A->>RP: next.ServeHTTP
-    RP->>RP: findTargetForRequest → targetResult
-    RP->>RP: stamp ServiceID/AccountID/AgentNetwork/SuppressAccessLog on CapturedData
-    RP->>RP: resolveChain via Manager.ChainFor
-    alt chain == nil or Empty
-        RP->>U: httputil.ReverseProxy.ServeHTTP (fast path)
-    else chain non-empty
-        RP->>RP: bodytap.CaptureRequest (global budget)
-        RP->>CH: RunRequest
-        CH-->>RP: denyOutput? requestMeta + upstreamRewrite
-        alt deny
-            RP->>C: RenderDenyResponse
-        else allow
-            RP->>RP: capturingWriter + applyUpstreamRewrite/Headers
-            RP->>U: httputil.ReverseProxy.ServeHTTP(respWriter)
-            U-->>RP: response
-            RP->>CH: RunResponse (respInput carries UserGroups)
-            RP->>CH: RunTerminal (merged request+response metadata)
-        end
-    end
-    RP-->>M: handler returns
-    M->>M: build logEntry incl. AgentNetwork
-    alt SuppressAccessLog == true
-        M->>M: skip l.log; still trackUsage
-    else default
-        M->>M: l.log → goroutine SendAccessLog
-    end
-```
-
-### EnableLogCollection suppression path
-
-```mermaid
-flowchart LR
-    S["agentnetwork.Settings.EnableLogCollection"] --> B["synthesizer: target.DisableAccessLog = !EnableLogCollection"]
-    B --> P["proto PathTargetOptions.disable_access_log (field 13)"]
-    P --> T["protoToMapping reads GetDisableAccessLog()\nserver.go:2211"]
-    T --> M["PathTarget.DisableAccessLog\nservicemapping.go:47"]
-    M --> R["ServeHTTP: cd.SetSuppressAccessLog\nreverseproxy.go:106"]
-    R --> G["accesslog middleware: if !GetSuppressAccessLog l.log\nmiddleware.go:95"]
-    R --> U["trackUsage unconditional — bandwidth telemetry preserved"]
-```
-
-**Ingestion** lands as a `ProxyMapping` batch on `handleSyncMappingsStream`/`handleMappingStream`. `processMappings` dispatches to `addMapping`/`modifyMapping`/`removeMapping`; HTTP goes `setupHTTPMapping → updateMapping → protoToMapping`. `protoToMapping` (server.go:2181) is the single translation surface that materialises `[]middleware.Spec`, `*bodytap.Config`, `AgentNetwork`, `DisableAccessLog` onto each `PathTarget`; `updateMapping` finishes with `s.proxy.AddMapping(m)` (atomic swap under `mappingsMux`) and `s.rebuildMiddlewareChains(svcID, m)`.
-
-At **request time** the access-log middleware stamps `CapturedData`; the auth chain runs (Private services lift `peer_group_ids` from `ValidateTunnelPeer` — auth/middleware_test.go:322). `ReverseProxy.ServeHTTP` resolves the chain; nil or empty → original `httputil.ReverseProxy`, no body capture. When a chain matches, body is captured under the global budget, `RunRequest` produces an `UpstreamRewrite` (`llm_router` selects a provider, rewrites scheme/host/path, injects `Authorization`), and `RunResponse`+`RunTerminal` run after the upstream returns. The terminal slot sees the merged metadata bag — that's how `llm_limit_record` ships the consumption sample. The **access-log** addition: `logEntry.AgentNetwork` from `GetAgentNetwork()` onto `proto.AccessLog.AgentNetwork`; the gate at middleware.go:95 honors `EnableLogCollection`, skipping `l.log` but keeping `trackUsage` so bandwidth telemetry survives.
-
-## Public contracts touched
-
- `proxy.Server.MiddlewareDataDir` (string) — base dir for file-backed middleware config (server.go:238-241).
- `proxy.Server.MiddlewareCaptureBudgetBytes` (int64) — process-wide capture cap; defaults to 256 MiB (server.go:248-250).
- `proxy/internal/proxy.WithMiddlewareManager(*middleware.Manager) Option` — new option on `NewReverseProxy`; nil keeps the fast path (reverseproxy.go:48-56).
- `proxy/internal/proxy.PathTarget` adds `Middlewares`, `CaptureConfig`, `AgentNetwork`, `DisableAccessLog` (servicemapping.go:27-51), all zero-default.
- `proxy/internal/proxy.CapturedData` adds `agentNetwork`, `suppressAccessLog`, `userGroupNames` behind `sync.RWMutex`; slices deep-copied (context.go:47-66, 183-258).
- `accesslog.logEntry.AgentNetwork` + `proto.AccessLog.AgentNetwork` (logger.go:131, 268).
- `metrics.Metrics.Meter()` exposes the OTel meter for the middleware manager (metrics.go:53-58).
-
-## Invariants
-
- **Synth-service updates are live (no proxy restart).** Every `MODIFIED` flows through `modifyMapping → cleanupMappingRoutes` (invalidates chains) `→ setupHTTPMapping → updateMapping → rebuildMiddlewareChains`. **ProxyMapping.Private preservation:** the relevant logic lives in `management/internals/shared/grpc/proxy.go:shallowCloneMapping`, not this module, but it surfaces here — if a `MODIFIED` synth service arrives `private=false`, auth skips `ValidateTunnelPeer`, `CapturedData.UserGroups` stays empty, and `llm_router` denies with `llm_policy.no_authorised_provider` until a management restart re-pushes the snapshot. This module assumes `mapping.GetPrivate()` is correct on every batch.
- **`EnableLogCollection=false` suppresses access-log writes but middleware still runs.** Gate is one `if !cd.GetSuppressAccessLog()` immediately around `l.log(entry)` (middleware.go:95); `trackUsage` runs below the gate. Locked by `TestMiddleware_SuppressAccessLog_PreservesUsageTracking` (middleware_test.go:139).
- **`agent_network` flag on access-log entries is set when the chain processed the request.** Source `target.AgentNetwork`, stamped at reverseproxy.go:105, read at accesslog/middleware.go:86.
- **auth → builtin group propagation.** `Protect` writes `UserGroups`/`UserGroupNames`; `buildRequestInput` (reverseproxy.go:333) copies them into `middleware.Input`. The response-leg `respInput` (reverseproxy.go:196-223) also carries `UserEmail`/`UserGroups`/`UserGroupNames` — `llm_limit_record` needs `UserGroups` to ship `group_ids` so management's group-targeted budget rules match (comment at reverseproxy.go:211-215).
- **Empty chains stay on the fast path.** `ServeHTTP` skips body capture and the run sequence when `chain == nil || chain.Empty()` (reverseproxy.go:127).
- **Self-registration is the only way a builtin reaches the registry.** `middleware_register.go` blank-imports each builtin; `init()` adds the factory to `mwbuiltin.DefaultRegistry()`. Missing it → translator drops the entry with a warn (translate.go:97).
-
-## Things to scrutinize
-
-### Correctness
- **Translate edge cases** — drops on nil cfg, empty ID, unknown ID, UNSPECIFIED slot; each logs one warn; volume bounded by `MaxMiddlewaresPerChain`.
- **Re-translate without dropping in-flight requests** — `Manager.Rebuild` is the only call from `rebuildMiddlewareChains`. Reverse proxy reads `ChainFor` once per request (reverseproxy.go:327) and runs the captured `*Chain` for the whole request. Verify in module 30 that `Rebuild` swaps atomically.
- **ProxyMapping.Private preservation** — enforced management-side in `shallowCloneMapping`. Proxy-side regression catches: `TestProtect_PrivateService_TunnelPeerGroupsPropagate` + the integration test.
- **Body-capture cleanup** — `defer releaseBudget()` (reverseproxy.go:145) and `defer capturingWriter.Release()` (reverseproxy.go:180) must run on every return; confirm no future `return` lands between acquisition and defer.
- **`applyUpstreamRewrite` clones the URL** — `cloned := *orig` value-copies `*url.URL`; safe because overwritten fields are strings, not slices/maps (reverseproxy.go:285-292).
-
-### Security
- **Translate validates every config** — registry membership rejects unknown IDs; UNSPECIFIED slot drops; ID-less drops; raw config copied (not aliased) at translate.go:109.
- **`AuthHeader`/`StripHeaders` only reachable via `UpstreamRewrite`** — regular mutation surface goes through the framework denylist (`Authorization`/`Cookie` blocked); only the router middleware can replace `Authorization` (reverseproxy.go:296-304). Confirm in module 30 nothing outside the proxy-trusted path populates `UpstreamRewrite.AuthHeader`.
- **`stampNetBirdIdentity` strips client-sent values first** (reverseproxy.go:742-743) — anti-spoof for `X-NetBird-User`/`X-NetBird-Groups`; control chars filtered; comma-bearing labels dropped (reverseproxy_test.go:1217/:1243/:1193).
- **Auth → group propagation** — `auth/middleware_test.go:322` and `:366` cover the contract. If auth ever stops calling `ValidateTunnelPeer` for Private services, every agent-network request silently denies.
-
-### Concurrency
- **Chain replacement under in-flight requests** — `findTargetForRequest` takes `mappingsMux.RLock`; `AddMapping` writes. `resolveChain` calls `ChainFor` once; even if `Rebuild` swaps mid-request, in-flight requests keep running on the captured pointer.
- **`CapturedData` mutation across slots** — accessors take `sync.RWMutex`; slices deep-copied on both Set and Get. Verify no caller mutates the returned slice expecting it to land back.
- **`Manager.Invalidate` race** — `removeMapping` invalidates after `cleanupMappingRoutes`; mapping read happens before chain resolution, so requests before invalidate run captured chains; later ones fail `findTargetForRequest`.
- **`Logger.log` goroutine** — `logSem` caps at `maxLogWorkers = 4096`; overflow → `dropped.Add(1)` + debug log. Middleware test uses a buffered channel and 150ms negative-assertion window — review whether 150ms holds on slow CI.
-
-### Backward compatibility
- **Non-agent-network services unaffected** — `protoToMapping` reads new fields only when `opts != nil`; defaults leave `Middlewares`/`CaptureConfig` nil → chain resolves nil → fast path. Existing `reverseproxy_test.go` (non-chain) still passes.
- **`disable_access_log` is proto field 13, default false** — every existing target unset; gate is no-op. Locked by `TestMiddleware_SuppressAccessLog_DefaultEmitsLog` (middleware_test.go:104).
- **`Server` additions optional** — 256 MiB default when `MiddlewareCaptureBudgetBytes ≤ 0` (server.go:1997-2000).
-
-### Performance
- **Translate cost per push** — O(n) with per-entry registry lookup and `config_json` copy; negligible vs. the upstream gRPC unmarshal.
- **Empty-chain hot path** — one `ChainFor` map lookup + one `chain.Empty()` check; no allocation delta vs. pre-PR.
- **Body capture buffer churn** — `bodytap.CaptureRequest` allocates `MaxRequestBytes` per chain-hitting request; `releaseBudget` ties allocation to the 256 MiB proxy-wide budget. Confirm in module 30 the budget is a hard cap.
-
-### Observability
- **Metrics** — `Metrics.Meter()` shared with `middleware.NewMetrics` (server.go:1990-1993) so middleware instruments land in the same prometheus exporter. No new metrics defined here.
- **Access-log accuracy** — every entry carries `AgentNetwork`; terminal-slot metadata merged into `CapturedData.Metadata` (reverseproxy.go:238-241).
- **Deny logs at `Infof`** (reverseproxy.go:170) — review whether `Info` is too noisy at high deny rates; consider Debug or rate-limit.
-
-## Test coverage
-
-| Test file | Locks down |
-| --------- | ---------- |
-| proxy/middleware_translate_test.go | Empty/nil → nil; field preservation; unknown ID skip; nil registry permissive; timeout clamping; fail-mode + slot incl. UNSPECIFIED-drop; empty-ID drop; truncation above + at `MaxMiddlewaresPerChain` |
-| proxy/internal/proxy/reverseproxy_test.go | Rewrite host/headers/cookies/query; trusted proxy; path forwarding; classifyProxyError; X-NetBird-User/Groups anti-spoof + CSV-join + control-char/comma rejection + fallback-to-ID; `TestBuildRequestInput_PropagatesIdentityAndGroups` (UserGroups/Email/GroupNames/AgentNetwork reach `middleware.Input`) |
-| proxy/internal/proxy/agent_network_chain_realstack_test.go | **The end-to-end integration test.** Drives a real agent-network request through `ReverseProxy.ServeHTTP` with the chain the synthesizer produces, against an in-process management gRPC (bufconn) backed by a real sqlite store + real `agentnetwork.Manager`, plus an `httptest` upstream — no external infrastructure or real LLM. Guarantees: (1) response-leg `respInput` carries `UserGroups` so `llm_limit_record` ships non-empty `group_ids` and the admin-group consumption row increments; (2) `RedactPii=true` redacts both prompt and completion on captured metadata; (3) the full chain runs against a real management stack. **Line 189-211 inlines the proto→Spec mapping** instead of calling the proxy's private `translateMiddlewareConfig` — keep that inline mirror in sync with `proxy/middleware_translate.go` or the test silently diverges from production. |
-| proxy/internal/accesslog/middleware_test.go | `SuppressAccessLog=true` skips `SendAccessLog` (150ms negative wait); default emits one send (2s positive); usage tracking runs under suppression |
-| proxy/internal/auth/middleware_test.go | `TestProtect_PrivateService_TunnelPeerGroupsPropagate` proves `peer_group_ids` reach `CapturedData.UserGroups`; `TestProtect_PrivateService_TunnelPeerDenied` proves rejected peers 403 without reaching the handler |
-
-The integration test runs in a few seconds with no external infrastructure — exercising the real synthesizer, `Manager.Rebuild`, `ServeHTTP` dispatch, and `llm_limit_record` writing a real consumption row through the real `agentnetwork.Manager` over real gRPC.
-
-## Known limitations / explicit non-goals
-
- **Translator does not validate `RawConfig` JSON** — factory's job at `New([]byte)`. Confirm in module 30 that a per-binding factory failure doesn't poison the rest of the chain.
- **No throttle on management push rate** — every `MODIFIED` triggers `Manager.Rebuild`. Mitigation upstream.
- **Streaming responses (SSE)** — body capture is streaming-aware, but response-leg middleware runs only after the response completes; long SSE streams delay `llm_limit_record` until close.
- **OIDC-only path doesn't carry tunnel-peer groups** — agent-network synth services rely on the Private tunnel-peer path; JWT groups claim is the only carrier for non-Private OIDC.
- **`agent_network` flag on L4 entries** not added; HTTP-only.
- **`mw.capture.bypass_reason` metadata key** documented at reverseproxy.go:151,184; namespace this in module 30/31 to avoid collisions.
-
-## Cross-references
- Upstream: [shared/api](10-shared-api.md), [proxy/middleware-framework](30-proxy-middleware-framework.md), [proxy/middleware-builtin](31-proxy-middleware-builtin.md), [proxy/llm-parsers](32-proxy-llm-parsers.md)
- End-to-end flow: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level: [../00-overview.md](../00-overview.md)
--- a/docs/agent-networks/modules/40-dashboard.md
+++ b/docs/agent-networks/modules/40-dashboard.md
@@ -1,228 +0,0 @@
-# dashboard — UI for agent-networks
-
-This module documents code that lives in the **dashboard repo** (under
-`src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`), not
-in this repo. It is co-located here so backend readers see the full picture.
-
-> **Risk level:** Medium. The new surface is isolated under `src/modules/agent-network/` and `src/app/(dashboard)/agent-network/`, but it also reshapes the sidebar, splits `/peers`, renames `reverse-proxy/clusters` → `self-hosted-proxies`, and overlays the Control Center graph. Regressions here would be cross-cutting.
-> **Backward-compat impact:** Additive on the API side. Breaking on URL/navigation: `/peers` redirects to `/peers/devices` (src/app/(dashboard)/peers/page.tsx:7-15), `/reverse-proxy/clusters` was renamed to `/reverse-proxy/self-hosted-proxies`, the sidebar lost Access Control / Networks / Reverse Proxy / DNS / standalone Guardrails / Consumption / Activity (Navigation.tsx:165-171 — routes still resolve via URL), and the standalone `/agent-network/{access-log,consumption,global-controls}` routes are gone in favor of `/agent-network/observability`.
-
-## Module boundary
-
-The dashboard is the only place an operator interacts with agent-networks: provider catalog, configured providers, policies, guardrails, account-level budget rules, account settings (collection / redaction toggles), per-request access log, and consumption rollups all render, paginate, and edit here. Data flows in via SWR (`useFetchApi`) keyed by REST URL. One big context provider (`src/modules/agent-network/AIProvidersProvider.tsx`) aggregates five resources (providers, policies, guardrails, budget rules, settings) plus the proxy access-log stream filtered to `agent_network=true`, and exposes `add* / update* / toggle* / delete*` mutators that call through `useApiCall` and re-`mutate()` SWR. Pages mount the provider once at the top and compose presentational tables and modals beneath. The control-center page additionally fetches `/agent-network/{providers,policies}` directly (control-center/page.tsx:123-130) to overlay graph nodes.
-
-## What the UI delivers
-
- **AI Observability** page with four tabs: Access Logs, Budget Dashboard,
-  Budget Settings, Log Settings (replaces the standalone access-log,
-  consumption, and global-controls routes).
- **Providers** page: provider catalog + connect/edit wizard with per-vendor
-  copy (LiteLLM, Portkey, Bifrost, Cloudflare, Vercel, OpenRouter, custom).
- **Policies** page: group → provider authorization with per-policy Limits
-  (minute-granular windows) + guardrail attach.
- **Guardrails** page: reusable model-allowlist + prompt-capture sets.
- **Account controls**: Log Collection / Prompt Collection / Redact PII toggles.
- **Budget rules**: account-level rules reusing the policy Limits UI.
- **Control Center overlay**: provider + agent-policy nodes on the graph.
- **Navigation + peers reshaping**: peers split into Devices / Agents,
-  `reverse-proxy/clusters` renamed to `self-hosted-proxies`, sidebar
-  repackaged for agent-network focus.
-
-## Surface added
-
-### New pages
-
-| Route | Purpose | Backing module(s) |
-| ----- | ------- | ----------------- |
-| `/agent-network` | Redirect to `/agent-network/providers` | page.tsx:7-15 |
-| `/agent-network/providers` | List + connect providers; header surfaces per-account base URL | providers/page.tsx + AgentProvidersTable + AIProviderModal |
-| `/agent-network/policies` | Group → Provider authorization with per-policy Limits + Guardrail attach | policies/page.tsx + AgentPoliciesTable + AgentPolicyModal |
-| `/agent-network/guardrails` | Reusable guardrail sets (model allowlist + prompt capture) | guardrails/page.tsx + AgentGuardrailsTable + AgentGuardrailModal |
-| `/agent-network/observability` | Tabs: Access Logs / Budget Dashboard / Budget Settings / Log Settings | observability/page.tsx |
-| `/peers/devices`, `/peers/agents` | Split of `/peers`, shared via `PeersListView` keyed by `kind` | peers/{devices,agents}/page.tsx |
-| `/reverse-proxy/self-hosted-proxies` | Renamed from `clusters` | self-hosted-proxies/page.tsx |
-
-Removed in favor of `/agent-network/observability`: `/agent-network/access-log`, `/agent-network/consumption`, `/agent-network/global-controls`.
-
-### New modules under src/modules/agent-network
-
-| File | Role |
-| ---- | ---- |
-| AIProvidersProvider.tsx (~1158 LOC) | Aggregates every agent-network resource via SWR; normalises snake↔camel; exposes mutators; holds wizard-open state |
-| AIProviderModal.tsx (~1268 LOC) | Connect / edit provider wizard with per-vendor copy (Bifrost, Portkey, LiteLLM, Cloudflare, Vercel, OpenRouter, custom) |
-| AIProviderLogo + useProviderCatalog | Catalog-driven brand swatch + SWR hook over `/agent-network/catalog/providers` |
-| AgentPoliciesTable + AgentPolicyModal + AgentPolicyGuardrailsTab + AgentPolicyLimitsTab | Policies; modal has 3 tabs (Rule, Limits, Guardrails) |
-| AgentGuardrailsTable + AgentGuardrailModal + AgentGuardrailBrowseModal + AgentGuardrailChecksCell | Guardrails CRUD + attach-from-policy |
-| AgentBudgetRulesTable + AgentBudgetRuleModal | Account-level budget rules; modal reuses AgentPolicyLimitsTab verbatim |
-| AgentAccountControlsCard | Three account-wide toggles (Log Collection / Prompt Collection / Redact PII) |
-| AgentAccessLogTable + AgentAccessLogExpandedRow | Access log on `/events/proxy?agent_network=true` |
-| AgentConsumptionPanel + AgentConsumptionTable | Token + cost panel: charts + counter table |
-| table/AgentProvidersTable + AgentProviderActionCell | Providers table + per-row actions |
-| data/mockData.ts | Domain types and a few residual `MOCK_*` constants (see scrutinize) |
-
-### Touched non-agent-network areas
-
- **control-center**: agent-network overlay (provider + agent-policy nodes); removed the All Networks dropdown; hid the Networks tab in FlowSelector (FlowSelector.tsx:9-14 — enum value kept so `?tab=networks` still type-checks); wrapped `ControlCenterView` in `AIProvidersProvider` (page.tsx:73-83); `agentPolicyNode` clicks routed to a separate state slot (page.tsx:1871-1874). New node renderers: nodes/ProviderNode.tsx, nodes/AgentPolicyNode.tsx (registered at utils/nodes.ts:21-22).
- **peers**: Split into Devices and Agents sub-routes; shared via `PeersListView` keyed by `kind` (PeersListView.tsx:24-95). New compact-toolbar `UserFilterSelector` (users/UserFilterSelector.tsx).
- **reverse-proxy**: Folder rename `clusters/` → `self-hosted-proxies/`; deleted `ClustersFeaturesCell.tsx`, `ClusterTypeIndicator.tsx`; new ReverseProxyClusterTargetSelector for cluster target type; Private toggle on target modal; body-capture knobs removed; new ReverseProxyEventExpandedRow.
- **events**: `ReverseProxyEventsUserCell` rewritten with user + peer fallback (ReverseProxyEventsUserCell.tsx:14-21), shared with the access-log table.
- **navigation**: Full repackaging in Navigation.tsx — Agent Network items flattened (no collapsible parent), distinct icons per item; Access Control, Networks, Reverse Proxy, DNS, standalone Guardrails, Consumption, Activity removed (still URL-reachable, per lines 165-171).
-
-## Architecture & flow
-
-### Page → Provider → Table/Modal hierarchy
-
-```mermaid
-graph TD
-  Nav[Navigation.tsx]
-  Nav --> ProvidersPage[/agent-network/providers/]
-  Nav --> PoliciesPage[/agent-network/policies/]
-  Nav --> GuardrailsPage[/agent-network/guardrails/]
-  Nav --> ObsPage[/agent-network/observability/]
-
-  ProvidersPage --> AIPP1[AIProvidersProvider]
-  PoliciesPage --> AIPP2[AIProvidersProvider]
-  GuardrailsPage --> AIPP3[AIProvidersProvider]
-  ObsPage --> AIPP4[AIProvidersProvider]
-  ObsPage -.wraps.-> GroupsProvider
-  ObsPage -.wraps.-> PeersProvider
-
-  AIPP1 --> ProvTable[AgentProvidersTable]
-  ProvTable --> ProvModal[AIProviderModal]
-  AIPP2 --> PolTable[AgentPoliciesTable]
-  PolTable --> PolModal[AgentPolicyModal]
-  PolModal --> PolGuardTab[AgentPolicyGuardrailsTab]
-  PolModal --> PolLimitsTab[AgentPolicyLimitsTab]
-  PolGuardTab --> GuardBrowse[AgentGuardrailBrowseModal]
-  PolGuardTab --> GuardModal[AgentGuardrailModal]
-  AIPP3 --> GuardTable[AgentGuardrailsTable]
-  GuardTable --> GuardModal
-  AIPP4 --> Tabs[Tabs]
-  Tabs --> AccessLog[AgentAccessLogTable]
-  Tabs --> Consumption[AgentConsumptionPanel]
-  Tabs --> BudgetRules[AgentBudgetRulesTable]
-  Tabs --> AccountCtl[AgentAccountControlsCard]
-  BudgetRules --> BudgetModal[AgentBudgetRuleModal]
-  BudgetModal -.reuses.-> PolLimitsTab
-```
-
-### AI Observability tab page
-
-```mermaid
-graph LR
-  Page[AIObservabilityPage] --> RA[RestrictedAccess<br/>permission.services.read]
-  RA --> GP[GroupsProvider]
-  GP --> PP[PeersProvider]
-  PP --> AIP[AIProvidersProvider]
-  AIP --> Tabs[Tabs / TabsList]
-  Tabs --> T1[Access Logs<br/>AgentAccessLogTable]
-  Tabs --> T2[Budget Dashboard<br/>AgentConsumptionPanel]
-  Tabs --> T3[Budget Settings<br/>AgentBudgetRulesTable]
-  Tabs --> T4[Log Settings<br/>AgentAccountControlsCard]
-  T1 -.GET.-> EP[/events/proxy?agent_network=true/]
-  T2 -.GET poll 5s.-> CONS[/agent-network/consumption/]
-  T3 -.GET/PUT.-> BR[/agent-network/budget-rules/]
-  T4 -.GET/PUT.-> ST[/agent-network/settings/]
-```
-
-### Data fetch path
-
-```mermaid
-graph TD
-  Page[Page component] --> Prov[AIProvidersProvider]
-  Prov -->|useFetchApi| SWR[(SWR cache<br/>key = URL)]
-  SWR -.GET.-> P[/agent-network/providers/]
-  SWR -.GET.-> POL[/agent-network/policies/]
-  SWR -.GET.-> G[/agent-network/guardrails/]
-  SWR -.GET.-> BR[/agent-network/budget-rules/]
-  SWR -.GET ignoreError.-> ST[/agent-network/settings/]
-  SWR -.GET.-> CAT[/agent-network/catalog/providers/]
-  SWR -.GET pageSize=100.-> EVT[/events/proxy agent_network=true/]
-  Prov --> Mut[useApiCall.post/put/del]
-  Mut -.on success.-> MutateSWR[SWR mutate keys]
-  Prov --> Children[Tables / Modals via useAIProviders]
-```
-
-Every list view reaches management through SWR over `/api/agent-network/*`. The provider context maps snake-case payloads to camelCase domain types (`fromAPI`, `policyFromAPI`, `guardrailFromAPI`, `budgetRuleFromAPI`, `settingsFromAPI`, `accessLogFromAPI` — AIProvidersProvider.tsx:138-562) and back via matching `*ToRequest` adaptors. The access log piggy-backs on `/events/proxy` with `agent_network=true&page_size=100` (line 707-709) and decodes LLM-specific fields from per-event `metadata`. Group IDs on events are resolved to current names through the surrounding GroupsProvider catalog (lines 515-521, 717-731) — no extra round trip. Mutators run `*ToRequest`, await `useApiCall.post/put/del`, call SWR `mutate()`, then `notify`. Errors caught and surfaced via `notify` — no exceptions escape into render. The Connect Provider modal's open state lives in the provider itself (`isWizardOpen` at lines 732-735) so the providers-page empty-state CTA and the table's + button share one modal. Control-center re-fetches `/agent-network/{providers,policies}` directly on top of `AIProvidersProvider` — SWR de-dupes but the code path is harder to reason about.
-
-## Public contracts consumed
-
- `GET/POST /api/agent-network/providers`, `PUT/DELETE /:id`
- `GET/POST /api/agent-network/policies`, `PUT/DELETE /:id`
- `GET/POST /api/agent-network/guardrails`, `PUT/DELETE /:id`
- `GET/POST /api/agent-network/budget-rules`, `PUT/DELETE /:id`
- `GET/PUT /api/agent-network/settings` (ignoreError-tolerant; 404 = not yet bootstrapped — auto-bootstrap on first provider create via `bootstrap_cluster` field — AIProvidersProvider.tsx:737-760)
- `GET /api/agent-network/catalog/providers` (read-only declarative; backend owns vendor list, IDs, brand colors, models, extra_headers, identity_injection — useProviderCatalog.ts:6-95)
- `GET /api/agent-network/consumption` (polled every 5s on Budget Dashboard — ConsumptionPanel.tsx:53,65-71)
- `GET /api/events/proxy?agent_network=true&page_size=100` (shared with Proxy Events)
- `permission?.services?.read` gates every agent-network route via RestrictedAccess.
-
-`AIProviderId` is a closed union in dashboard types (data/mockData.ts:8-21) but the converter tolerates anything the backend ships — unknown ids fall through to `"custom"` (AIProvidersProvider.tsx:497-506). Catalog values are pure read-through: anything declared in `extra_headers` renders in the modal automatically, copy keyed by header name (`EXTRA_HEADER_UI` in AIProviderModal.tsx:61-89), labeled-fallback for unknown ones.
-
-## Invariants
-
- Provider context wrap order on user-attribution pages: `GroupsProvider > PeersProvider > AIProvidersProvider` (observability/page.tsx:87-89). Reverse it and access-log group resolution silently drops names.
- Every agent-network route checks `permission?.services?.read` via `RestrictedAccess` (observability/page.tsx:85, providers/page.tsx:184, policies/page.tsx:53, guardrails/page.tsx:55).
- Modal `key={open ? 1 : 0}` pattern is used to force unmount/remount on close so internal `useState` resets between edits (AgentBudgetRuleModal.tsx:60, AgentPolicyModal.tsx:66). Removing this would leak prior-row state into a new-row session.
- `mockData.ts` is the canonical home for ALL agent-network domain types; `MOCK_*` constants must never reach a production code path. One leak remains (below).
-
-## Things to scrutinize
-
-### Correctness
-
- **Tab-state URL hand-off is one-way.** observability/page.tsx:53-58 reads `?tab=` on mount (despite the file comment at line 28 saying URL hand-off is future) but `setTab` does NOT push back, so reload preserves the chosen tab only if it came in via the link. Inconsistent with control-center (page.tsx:1817-1831).
- **Provider overlay runs only in `applySingleGroupView` / `applyPeerView`** (control-center/page.tsx:557, 1159-1166). User view does NOT show providers — if agent-network is a primary lens, that's a gap.
- **Two useEffects race to invalidate the control-center layout.** page.tsx:1655-1657 drops `layoutInitialized` when `agentPolicies` / `agentProviders` arrive; the main effect (1786-1799) also lists them as deps. Functional but fragile — watch for flash-of-empty-graph.
- **`updateProvider` / `updatePolicy` / `updateBudgetRule` use `??` on `enabled`** (AIProvidersProvider.tsx:784, 859, 1018). Toggle paths are safe; any caller sending `enabled: false` thinking "leave it off" gets `existing.enabled` instead. Audit modal callers.
- **Form validation in modals is minimal.** Window-seconds picker — mockData.ts:209-215 documents "minimum 60 — one minute" but there is no matching UI guard in PolicyLimitsTab; the backend validator is the enforcement point.
-
-### Security
-
- **No client-side enforcement claims** — every cap, allowlist, and toggle is display + edit; proxy is the source of truth for deny decisions (AccessLogTable.tsx:177-191 renders backend-emitted `denyReason` as-is).
- **Prompt display is gated by what the backend stamps.** When `enable_prompt_collection` is OFF the proxy must not put prompt/completion into event metadata; the dashboard renders whatever it gets verbatim (AccessLogTable lines 532-534, AccessLogExpandedRow.tsx:42-57). No UI filter on top of backend collection switches.
- Account Controls disables `Redact PII` when `Prompt Collection` is off (AgentAccountControlsCard.tsx:122) and clears it on off-transition (line 100), but relies on backend to enforce the same gate at write — confirm PUT handler rejects `redact_pii=true && enable_prompt_collection=false`.
- **Bifrost identity-header overrides**: empty-string vs nil semantics documented in AIProvidersProvider.tsx:772-781 ("omitted = preserve, empty = explicit clear"). Mishandling could leak group attribution to a header the operator thought disabled. Focused read of Bifrost code path in AIProviderModal.tsx recommended.
-
-### Accessibility
-
- Observability TabsList (observability/page.tsx:96-113) uses the shared Tabs component — should inherit Radix roving-tabindex. All four TabsTriggers carry only icon + text, no `aria-label`; fine because text is visible.
- Modal focus traps are inherited from the shared Modal; agent-network modals don't override them. Quick keyboard pass recommended.
- `EndpointBadge` Copy button (providers/page.tsx:66-76) has an `aria-label`, good.
-
-### Performance
-
- `AgentConsumptionPanel` polls `/agent-network/consumption` every 5s (ConsumptionPanel.tsx:53,70). Tab switches unmount the panel, so the poll stops — verify in network panel.
- `AgentAccessLogTable` is hard-capped at 100 rows via `page_size=100` (AIProvidersProvider.tsx:707-709). Server-side pagination is future work; high-traffic tenants miss everything past row 100 — known limitation.
- Observability page mounts providers ONCE at page level (observability/page.tsx:87-89); tab switches keep SWR cache hot. Moving the provider mount inside `TabsContent` would re-fetch the access log on every switch.
-
-### Visual consistency
-
- The observability tab style mirrors peers/page.tsx. Outer Tabs `pt-4 pb-0 mb-0`, TabsList `px-8` (observability/page.tsx:94-96) — confirm chrome height matches so the page doesn't visually jump.
- Sidebar: `Boxes` for Providers, `AccessControlIcon` for Policies, `TelescopeIcon` for AI Observability (Navigation.tsx:113,120,133). Reusing `AccessControlIcon` makes Policies look identical to the (now hidden) Access Control item — if Access Control ever comes back, they collide.
- `AgentNetworkIcon` is used in breadcrumbs on every agent-network page but NOT in the sidebar (per-page icons instead). Deliberate departure — record so it doesn't get reverted.
-
-## Test coverage
-
- **Cypress**: One file (`cypress/e2e/test.cy.ts`) covering only the install-page copy-to-clipboard flow. NOTHING covers agent-network UI.
- **Component / unit tests**: `src/utils/version.test.ts` is the only `.test.*` file in the repo. The agent-network modules ship without component tests.
- Data-cy hooks exist on key controls: `save-account-controls` (AgentAccountControlsCard.tsx:71), `enable-log-collection`, `enable-prompt-collection`, `redact-pii`, plus existing `data-cy={policy.name}` / `data-cy={provider.name}` on ActiveInactiveRow. Sufficient hooks for Cypress flows; none written yet.
- **Tooling gap (pre-existing):** `npm run lint` (`next lint`) is broken in Next 16 — the `lint` subcommand was removed from the Next CLI in 16.x, so the dashboard effectively has no working lint gate. The fix is to add either a flat-config `eslint .` script or wire ESLint via an explicit `eslint-config-next` invocation.
-
-## Known limitations / explicit non-goals
-
- **`data/mockData.ts` still contains `MOCK_GROUPS`, `MOCK_PROVIDERS`, `MOCK_PEERS`.** Only `MOCK_GROUPS` is referenced from production — AgentPoliciesTable.tsx:45,76 uses it as a name-lookup fallback when a policy references a group ID the real GroupsProvider doesn't know about. `MOCK_PROVIDERS` / `MOCK_PEERS` are unreferenced; safe to delete. The file is `/* eslint-disable */` so dead-code warnings don't flag them.
- **Tab-state URL hand-off on observability page is one-way** (read-only).
- **Access log hard-capped at 100 rows**; no server-side pagination.
- **No optimistic updates.** All mutations are round-trip; failures rollback via SWR revalidation.
- **`FlowView.NETWORKS` retained but hidden** from FlowSelector (FlowSelector.tsx:9-14). Old `?tab=networks` links still route to the hidden view because `applyNetworksView` still runs.
- **Redirects are not query-preserving** — `router.replace("/peers/devices")` (peers/page.tsx:13) strips any incoming filter params.
- **Control-center cross-fetches** `/agent-network/{providers,policies}` directly on top of `AIProvidersProvider`. Could be collapsed.
- **Sidebar permanently hides Access Control, Networks, Reverse Proxy, standalone Guardrails, DNS, Activity, Consumption.** Routes still resolve via URL (Navigation.tsx:165-171); intentional.
-
-## Cross-references
-
- Upstream API contracts: [shared/api](10-shared-api.md)
- Backend persistence: [management/store](20-management-store.md)
- Backend handler wiring: [management/handlers + wiring](22-management-handlers-wiring.md)
- End-to-end flow narrative: [../01-end-to-end-flows.md](../01-end-to-end-flows.md)
- Top-level overview: [../00-overview.md](../00-overview.md)
--- a/docs/agent-networks/modules/50-path-routed-providers.md
+++ b/docs/agent-networks/modules/50-path-routed-providers.md
@@ -1,251 +0,0 @@
-# path-routed providers — Vertex AI + Bedrock
-
-This guide pulls the **path-routed** provider story together in one place
-because it crosses the catalog, the synthesiser, the request parser, and the
-router. The relevant building blocks are the `llm_router` /
-`llm_request_parser` middlewares
-([31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)), the
-per-provider parser surface ([32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)),
-and the synthesiser's catalog → `ProviderRoute` mapping
-([21-management-agentnetwork.md](21-management-agentnetwork.md)).
-
-Sibling modules: [31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)
-(router + request parser) and [32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)
-(Bedrock parser + pricing).
-
---
-
-## What "path-routed" means
-
-Most catalog providers carry the model in the request **body** (`{"model": …}`),
-so `llm_router` selects an upstream by matching the model name against each
-provider's `Models` claim. Two providers instead carry the model in the **URL
-path**, so they are routed by path before the model/vendor table is consulted:
-
-| Catalog id | Style flag | Request path shape |
-|---|---|---|
-| `vertex_ai_api` | `IsVertexPathStyle` → `ProviderRoute.Vertex` | `/v1/projects/{project}/locations/{region}/publishers/{publisher}/models/{model}:{action}` |
-| `bedrock_api` | `IsBedrockPathStyle` → `ProviderRoute.Bedrock` | `/model/{modelId}/{action}` (optionally behind `/bedrock`) |
-
-The catalog declares the style with
-[`catalog.IsVertexPathStyle` / `catalog.IsBedrockPathStyle`](../../../management/server/agentnetwork/catalog/catalog.go)
-and the synthesiser copies the result onto the router route as the `Vertex` /
-`Bedrock` booleans
-([synthesizer.go:450-451](../../../management/server/agentnetwork/synthesizer.go)).
-On the request leg `llm_router.Invoke` dispatches `isVertexPath` / `isBedrockPath`
-**before** the model lookup
-([llm_router/middleware.go:138-216](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
-so a model the parser extracted from the path can't be claimed by a same-vendor
-*body-routed* provider (e.g. `claude-*` on `api.anthropic.com`).
-
-## Google Vertex AI (`vertex_ai_api`)
-
-### Catalog entry
-
-`KindProvider`, parser surface left unset on the catalog entry — the request
-parser picks the parser from the URL **publisher** segment, not from
-`ParserID`. Upstream host is `<region>-aiplatform.googleapis.com`
-(`https://aiplatform.googleapis.com` for the `global` location). The catalog
-lists the Claude-on-Vertex lineup (`claude-opus-4-*`, `claude-sonnet-4-*`,
-`claude-haiku-4-5`, `claude-fable-5`) at the same per-token rates as the
-first-party Anthropic entry
-([catalog.go:333-363](../../../management/server/agentnetwork/catalog/catalog.go)).
-
-### Credential — service-account OAuth (`keyfile::`)
-
-Vertex does **not** accept a static API key. The operator sets the provider
-`api_key` to:
-
-```
-keyfile::<base64 of the GCP service-account JSON key>
-```
-
-The synthesiser recognises the `keyfile::` prefix in `providerAuthHeader`
-([synthesizer.go:897-903](../../../management/server/agentnetwork/synthesizer.go)),
-emits **no** static auth value, and carries the base64 key material on the
-route as `GCPServiceAccountKeyB64`
-([factory.go:56-61](../../../proxy/internal/middleware/builtin/llm_router/factory.go)).
-At request time the router mints a short-lived OAuth2 access token from the key
-(cloud-platform scope) and injects `Authorization: Bearer <access-token>` —
-never the key itself
-([llm_router/middleware.go:621-692](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
-
- One auto-refreshing `oauth2.TokenSource` is cached per key (keyed by a
-  SHA-256 of the base64 material), so token minting happens once and refreshes
-  amortise across requests.
- Mint / refresh is bounded by a 10s timeout HTTP client (`gcpTokenTimeout`) so
-  a slow Google token endpoint can't hang the request.
- A malformed key or an unreachable token endpoint fails the request with
-  `llm_policy.upstream_auth_failed` at HTTP **502** (an upstream problem, not a
-  policy denial) — see `denyUpstreamAuth`.
-
-### Metering — Anthropic-on-Vertex only
-
-The request parser extracts `{publisher, model, action}` from the path
-(`parseVertexPath`, [llm_request_parser/middleware.go:237-263](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)),
-strips the `@version` suffix from the model, and maps the publisher to a parser
-surface via `vertexPublisherVendor`:
-
- `anthropic` → `llm.provider="anthropic"` → metered through the Anthropic
-  parser, priced under the **`anthropic`** block in `defaults_pricing.yaml`
-  (the parser emits the standard Anthropic provider label, so Vertex Claude
-  reuses first-party Anthropic prices).
- `openai` → `llm.provider="openai"` (reserved; not in the catalog lineup
-  today).
- anything else (notably `google` / Gemini) → empty vendor → **no parser**.
-
-**Gemini is intentionally denied as unmeterable.** When the parser emits no
-`llm.provider` for a Vertex publisher, `llm_router` returns
-`llm_policy.unmeterable_publisher` (403) rather than forwarding the request
-uncounted — serving it would bypass token / budget metering
-([llm_router/middleware.go:144-162, 712-728](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)).
-A Gemini parser would lift this restriction; until then the `google` publisher
-is omitted from the catalog.
-
-> Caveat: cross-region inference profiles in `eu` / `apac` carry a ~10% price
-> premium that the base per-token rates do **not** model — cost annotations for
-> those regions read low. Operators who need exact regional billing override
-> the affected entries in `pricing.yaml`.
-
-## AWS Bedrock (`bedrock_api`)
-
-### Catalog entry
-
-`KindProvider`, upstream host `bedrock-runtime.<region>.amazonaws.com`. Metered
-models are the Anthropic-on-Bedrock lineup (`anthropic.claude-*`) plus Amazon
-Nova and Llama 3.3 entries
-([catalog.go:300-332](../../../management/server/agentnetwork/catalog/catalog.go)).
-Anthropic-on-Bedrock reuses the first-party Claude prices (with additive cache
-buckets); Nova / Llama report no cache, so cost is `input + output`.
-
-### Credential — static bearer token
-
-Bedrock uses the **AWS Bedrock API key** as a static bearer. The operator sets
-the provider `api_key` directly (no `keyfile::` prefix); the catalog template
-is `Authorization: Bearer ${API_KEY}`
-([catalog.go:306-307](../../../management/server/agentnetwork/catalog/catalog.go)).
-No token minting — the synthesiser substitutes the key into the template and
-the router injects the resulting `Authorization` header after stripping inbound
-vendor auth (including client-supplied AWS SigV4 material: `X-Amz-Date`,
-`X-Amz-Security-Token`, `X-Amz-Content-Sha256`, see `strippedAuthHeaders`).
-
-### Model id form — cross-region inference profiles
-
-Bedrock model ids in the request path must be the cross-region
-**inference-profile** form, e.g.
-`eu.anthropic.claude-sonnet-4-5-20250929-v1:0`. The bare
-`anthropic.claude-…` id is rejected by AWS. `normalizeBedrockModel`
-([llm_request_parser/middleware.go:398-414](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go))
-strips the region prefix (`us.` / `eu.` / `apac.` / `global.`), an optional ARN
-wrapper, and the `-YYYYMMDD-vN[:N]` version/throughput suffix so the normalised
-id (`anthropic.claude-sonnet-4-5`) matches the catalog/pricing key.
-
-### Supported endpoints + actions
-
-`/model/{modelId}/{action}` where action ∈ `invoke`,
-`invoke-with-response-stream`, `converse`, `converse-stream`
-([llm_request_parser/middleware.go:363-390](../../../proxy/internal/middleware/builtin/llm_request_parser/middleware.go)).
-`invoke` / `converse` are non-streaming; the `-stream` actions set the streaming
-flag.
-
- **InvokeModel** body uses the vendor-native shape — for Anthropic that means
-  `"anthropic_version":"bedrock-2023-05-31"` and snake_case usage with additive
-  cache buckets.
- **Converse** uses the unified camelCase shape with a precomputed `totalTokens`.
- The `BedrockParser` reads both shapes on the response leg
-  ([bedrock.go](../../../proxy/internal/llm/bedrock.go)); the request parser
-  doesn't need to distinguish them (`ParseRequest` is a no-op — model + stream
-  come from the path).
-
-### Streaming — AWS binary event-stream
-
-The `-stream` actions return `application/vnd.amazon.eventstream` (the AWS
-binary event-stream framing), and streaming **is metered**.
-`accumulateBedrockStream`
-([llm_response_parser/streaming_bedrock.go](../../../proxy/internal/middleware/builtin/llm_response_parser/streaming_bedrock.go))
-decodes the frames with `aws-sdk-go-v2/aws/protocol/eventstream`:
-
- InvokeModel `chunk` frames wrap a base64 `{"bytes":…}` payload carrying a
-  vendor-native (Anthropic) stream event — folded through the shared Anthropic
-  stream accumulator.
- Converse `contentBlockDelta` frames carry text; the trailing `metadata` frame
-  carries the final usage block.
- A truncated stream (cut at the body-tap capture cap) decodes best-effort:
-  frames up to the cut are applied and partial usage is returned.
-
-### Optional `/bedrock` gateway-namespace prefix
-
-Clients may place an optional `/bedrock` prefix before the native path
-(`/bedrock/model/{modelId}/{action}`) to disambiguate Bedrock from other
-providers that also use `/model/...`. Both the request parser
-(`trimBedrockNamespace`) and the router (`splitBedrockNamespace`) accept it.
-When the prefix is present, the router sets
-`RewriteUpstream.StripPathPrefix = "/bedrock"` so the **native** path
-(`/model/...`) is what reaches `bedrock-runtime.<region>.amazonaws.com`
-([llm_router/middleware.go:168-184, 320-348](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)).
-
-## Model allowlist on path-routed providers
-
-Because the model lives in the URL rather than the body, a path-routed provider
-credential could otherwise be used for any model the upstream supports. The
-router still enforces the route's `Models` allowlist via `matchPathRoute`
-([llm_router/middleware.go:370-416](../../../proxy/internal/middleware/builtin/llm_router/middleware.go)):
-
-1. Filter to routes of the matching style (`Vertex` / `Bedrock`).
-2. Filter to routes whose `AllowedGroupIDs` authorise the caller's groups
-   (else `no_authorised_provider`).
-3. Filter to routes that **claim the requested model**. As with body-routed
-   providers, an **empty `Models` list = catch-all** (serve any model);
-   a non-empty list serves only the listed models (else `model_not_routable`).
-4. Multiple survivors disambiguate by longest `UpstreamPath` prefix match.
-
-So an operator who lists explicit models on a Vertex/Bedrock provider gets a
-hard allowlist; an operator who leaves `Models` empty accepts every model the
-upstream serves (still subject to the unmeterable-publisher gate on Vertex).
-
-Model-less OpenAI endpoints (`GET /v1/models`) are **never** routed to a
-Vertex/Bedrock provider — `matchModelless` skips path-routed routes
-([llm_router/middleware.go:427-462](../../../proxy/internal/middleware/builtin/llm_router/middleware.go))
-so a model-listing call can't be rewritten onto an upstream that would 404 it.
-
-## Catalog ↔ pricing cross-check
-
-Catalog prices and context windows are cross-checked against LiteLLM's
-`model_prices_and_context_window.json`. The proxy's embedded
-`defaults_pricing.yaml` covers **every metered first-party model** the catalog
-enumerates — guarded by
-`TestDefaultTable_FirstPartyModelCoverage`
-([pricing/defaults_coverage_test.go](../../../proxy/internal/llm/pricing/defaults_coverage_test.go)),
-which fails if a catalog model has no embedded price. Bedrock entries are keyed
-by the **normalised** id the request parser emits (region prefix + version
-suffix stripped). Vertex Claude carries no Bedrock-style prefix, so it prices
-straight off the `anthropic` block.
-
-## Things to scrutinise
-
-**Security.** The Vertex service-account key is never forwarded — only a minted
-short-lived bearer. Confirm the key material stays out of access logs (it lives
-on `ProviderRoute.GCPServiceAccountKeyB64`, not in any emitted metadata key).
-The unmeterable-publisher deny is the only thing standing between an
-operator-misconfigured Vertex provider and unmetered Gemini traffic; verify
-`vertexPublisherVendor` stays conservative (deny by default for unknown
-publishers).
-
-**Correctness.** `normalizeBedrockModel` is the join between the wire id and the
-pricing key — a model that normalises to something not in `defaults_pricing.yaml`
-meters at `cost.skipped=unknown_model` rather than failing the request. The
-`/bedrock` prefix strip must run on both the parser side (so the model is
-extracted) and the router side (so the upstream path is native); a regression in
-either silently breaks the other.
-
-**Metering caveats.** eu/apac cross-region Bedrock + Vertex profiles carry a
-~10% premium not modelled by base pricing — flagged in both the catalog comment
-and `defaults_pricing.yaml`. Operators needing exact regional billing override
-the relevant entries.
-
-## Cross-references
-
- Router + request-parser detail: [31-proxy-middleware-builtin.md](31-proxy-middleware-builtin.md)
- Bedrock parser + pricing + SSE / event-stream: [32-proxy-llm-parsers.md](32-proxy-llm-parsers.md)
- Catalog → route synthesis + `keyfile::` handling: [21-management-agentnetwork.md](21-management-agentnetwork.md)
- Overview: [../00-overview.md](../00-overview.md)
--- a/e2e/agentnetwork/bootstrap_test.go
+++ b/e2e/agentnetwork/bootstrap_test.go
@@ -1,30 +0,0 @@
-//go:build e2e
-
-package agentnetwork
-
-import (
-	"context"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-// TestCombinedBootstrap proves Pillar 1: the shared combined server came up and
-// the /api/setup-minted PAT authenticates a real management API call through
-// the typed REST client (the bootstrap itself ran in TestMain).
-func TestCombinedBootstrap(t *testing.T) {
-	ctx := context.Background()
-
-	require.NotEmpty(t, srv.PAT, "TestMain must have minted an admin PAT")
-
-	users, err := srv.API().Users.List(ctx)
-	require.NoError(t, err, "authenticated Users.List must round-trip")
-	require.NotEmpty(t, users, "the bootstrapped account must have at least one user")
-
-	var emails []string
-	for _, u := range users {
-		emails = append(emails, u.Email)
-	}
-	assert.Contains(t, emails, "admin@netbird.test", "the bootstrapped owner should appear in the users list")
-}
--- a/e2e/agentnetwork/main_test.go
+++ b/e2e/agentnetwork/main_test.go
@@ -1,46 +0,0 @@
-//go:build e2e
-
-// Package agentnetwork holds the container-based agent-network e2e suite. A
-// single combined server is built and bootstrapped once per package run
-// (TestMain) and shared across tests via srv; each test creates and cleans up
-// its own resources so order doesn't matter.
-package agentnetwork
-
-import (
-	"context"
-	"fmt"
-	"os"
-	"testing"
-	"time"
-
-	"github.com/netbirdio/netbird/e2e/harness"
-)
-
-// srv is the shared combined server for the package, ready (PAT-authenticated)
-// by the time any Test runs.
-var srv *harness.Combined
-
-func TestMain(m *testing.M) {
-	os.Exit(run(m))
-}
-
-func run(m *testing.M) int {
-	// Generous timeout to cover a cold image build on first run.
-	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
-	defer cancel()
-
-	var err error
-	srv, err = harness.StartCombined(ctx)
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "e2e: start combined server: %v\n", err)
-		return 1
-	}
-	defer func() { _ = srv.Terminate(context.Background()) }()
-
-	if _, err := srv.Bootstrap(ctx); err != nil {
-		fmt.Fprintf(os.Stderr, "e2e: bootstrap admin PAT: %v\n", err)
-		return 1
-	}
-
-	return m.Run()
-}
--- a/e2e/agentnetwork/management_test.go
+++ b/e2e/agentnetwork/management_test.go
@@ -1,169 +0,0 @@
-//go:build e2e
-
-package agentnetwork
-
-import (
-	"context"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-
-	"github.com/netbirdio/netbird/shared/management/client/rest"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-)
-
-func ptr[T any](v T) *T { return &v }
-
-// newProvider creates an OpenAI-catalog provider with a dummy key (these tests
-// never call the upstream) and registers cleanup.
-func newProvider(t *testing.T, ctx context.Context, name string) api.AgentNetworkProvider {
-	t.Helper()
-	prov, err := srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
-		Name:             name,
-		ProviderId:       "openai_api",
-		UpstreamUrl:      "https://api.openai.com",
-		ApiKey:           ptr("sk-dummy-e2e-key"),
-		BootstrapCluster: ptr("eu.proxy.netbird.test"),
-	})
-	require.NoError(t, err, "create provider %q", name)
-	t.Cleanup(func() { _ = srv.DeleteProvider(context.Background(), prov.Id) })
-	return prov
-}
-
-// requireClientError asserts err is a REST APIError with a 4xx status.
-func requireClientError(t *testing.T, err error) {
-	t.Helper()
-	var apiErr *rest.APIError
-	require.ErrorAs(t, err, &apiErr, "expected a REST APIError")
-	assert.GreaterOrEqual(t, apiErr.StatusCode, 400, "expected a 4xx status")
-	assert.Less(t, apiErr.StatusCode, 500, "expected a 4xx status")
-}
-
-// TestProviderLifecycle covers create → get → list → delete → 404.
-func TestProviderLifecycle(t *testing.T) {
-	ctx := context.Background()
-
-	prov := newProvider(t, ctx, "Provider Lifecycle")
-	assert.NotEmpty(t, prov.Id, "created provider must have an id")
-	assert.Equal(t, "openai_api", prov.ProviderId)
-
-	got, err := srv.GetProvider(ctx, prov.Id)
-	require.NoError(t, err, "get provider")
-	assert.Equal(t, prov.Id, got.Id)
-
-	list, err := srv.ListProviders(ctx)
-	require.NoError(t, err, "list providers")
-	var ids []string
-	for _, p := range list {
-		ids = append(ids, p.Id)
-	}
-	assert.Contains(t, ids, prov.Id, "created provider must appear in the list")
-
-	require.NoError(t, srv.DeleteProvider(ctx, prov.Id), "delete provider")
-	_, err = srv.GetProvider(ctx, prov.Id)
-	requireClientError(t, err)
-}
-
-// TestProviderValidation rejects a missing API key and an unknown catalog id.
-func TestProviderValidation(t *testing.T) {
-	ctx := context.Background()
-
-	_, err := srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
-		Name:        "No Key",
-		ProviderId:  "openai_api",
-		UpstreamUrl: "https://api.openai.com",
-	})
-	requireClientError(t, err)
-
-	_, err = srv.CreateProvider(ctx, api.AgentNetworkProviderRequest{
-		Name:        "Unknown Catalog",
-		ProviderId:  "totally_unknown_provider",
-		UpstreamUrl: "https://example.com",
-		ApiKey:      ptr("sk-dummy"),
-	})
-	requireClientError(t, err)
-}
-
-// TestSettingsRoundTrip flips the collection toggles and confirms cluster /
-// subdomain stay immutable, then restores the original state.
-func TestSettingsRoundTrip(t *testing.T) {
-	ctx := context.Background()
-
-	// Settings are bootstrapped on first provider create.
-	newProvider(t, ctx, "Settings Bootstrap")
-
-	before, err := srv.GetSettings(ctx)
-	require.NoError(t, err, "get settings")
-	require.NotEmpty(t, before.Cluster, "settings must carry an assigned cluster")
-
-	flipped, err := srv.UpdateSettings(ctx, api.AgentNetworkSettingsRequest{
-		EnableLogCollection:    !before.EnableLogCollection,
-		EnablePromptCollection: !before.EnablePromptCollection,
-		RedactPii:              !before.RedactPii,
-	})
-	require.NoError(t, err, "update settings")
-	assert.Equal(t, !before.EnableLogCollection, flipped.EnableLogCollection, "log collection toggle must flip")
-	assert.Equal(t, !before.EnablePromptCollection, flipped.EnablePromptCollection, "prompt collection toggle must flip")
-	assert.Equal(t, before.Cluster, flipped.Cluster, "cluster must be immutable across updates")
-	assert.Equal(t, before.Subdomain, flipped.Subdomain, "subdomain must be immutable across updates")
-
-	// Restore the original toggles.
-	_, err = srv.UpdateSettings(ctx, api.AgentNetworkSettingsRequest{
-		EnableLogCollection:    before.EnableLogCollection,
-		EnablePromptCollection: before.EnablePromptCollection,
-		RedactPii:              before.RedactPii,
-	})
-	require.NoError(t, err, "restore settings")
-}
-
-// TestPolicyWindowFloor rejects an enabled limit below the 60s window floor and
-// accepts one at the floor.
-func TestPolicyWindowFloor(t *testing.T) {
-	ctx := context.Background()
-
-	grp, err := srv.API().Groups.Create(ctx, api.PostApiGroupsJSONRequestBody{Name: "e2e-policy-grp"})
-	require.NoError(t, err, "create source group")
-	t.Cleanup(func() { _ = srv.API().Groups.Delete(context.Background(), grp.Id) })
-
-	prov := newProvider(t, ctx, "Policy Provider")
-
-	limits := func(window int64) *api.AgentNetworkPolicyLimits {
-		return &api.AgentNetworkPolicyLimits{
-			TokenLimit: api.AgentNetworkPolicyTokenLimit{
-				Enabled:       true,
-				GroupCap:      1000,
-				UserCap:       1000,
-				WindowSeconds: window,
-			},
-		}
-	}
-
-	_, err = srv.CreatePolicy(ctx, api.AgentNetworkPolicyRequest{
-		Name:                   "e2e-below-floor",
-		SourceGroups:           []string{grp.Id},
-		DestinationProviderIds: []string{prov.Id},
-		Limits:                 limits(30),
-	})
-	requireClientError(t, err)
-
-	pol, err := srv.CreatePolicy(ctx, api.AgentNetworkPolicyRequest{
-		Name:                   "e2e-at-floor",
-		SourceGroups:           []string{grp.Id},
-		DestinationProviderIds: []string{prov.Id},
-		Limits:                 limits(60),
-	})
-	require.NoError(t, err, "policy at the 60s floor must be accepted")
-	assert.NotEmpty(t, pol.Id, "created policy must have an id")
-	t.Cleanup(func() { _ = srv.DeletePolicy(context.Background(), pol.Id) })
-}
-
-// TestConsumptionList confirms the read endpoint always returns an array, never
-// a 404/500.
-func TestConsumptionList(t *testing.T) {
-	ctx := context.Background()
-
-	rows, err := srv.ListConsumption(ctx)
-	require.NoError(t, err, "consumption list must not error")
-	assert.NotNil(t, rows, "consumption must be a JSON array (possibly empty)")
-}
--- a/e2e/harness/agentnetwork.go
+++ b/e2e/harness/agentnetwork.go
@@ -1,101 +0,0 @@
-//go:build e2e
-
-package harness
-
-import (
-	"bytes"
-	"context"
-	"encoding/json"
-	"fmt"
-	"io"
-	"net/http"
-
-	"github.com/netbirdio/netbird/shared/management/http/api"
-)
-
-// The shared REST client doesn't (yet) expose typed agent-network methods, so
-// these helpers drive the /api/agent-network/* endpoints through the client's
-// NewRequest primitive — reusing its auth, error handling (rest.APIError on
-// non-2xx), and transport — while still speaking the generated api types.
-
-// anRequest issues an agent-network API call and decodes the JSON response into
-// T. A non-2xx response surfaces as a *rest.APIError from the client, which
-// tests inspect for negative-path status assertions.
-func anRequest[T any](ctx context.Context, c *Combined, method, path string, body any) (T, error) {
-	var out T
-	var reader io.Reader
-	if body != nil {
-		bs, err := json.Marshal(body)
-		if err != nil {
-			return out, fmt.Errorf("marshal %s %s: %w", method, path, err)
-		}
-		reader = bytes.NewReader(bs)
-	}
-
-	resp, err := c.api.NewRequest(ctx, method, path, reader, nil)
-	if err != nil {
-		return out, err
-	}
-	defer resp.Body.Close()
-
-	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
-		return out, fmt.Errorf("decode %s %s response: %w", method, path, err)
-	}
-	return out, nil
-}
-
-// anDelete issues a DELETE and discards the (empty-object) body.
-func anDelete(ctx context.Context, c *Combined, path string) error {
-	resp, err := c.api.NewRequest(ctx, http.MethodDelete, path, nil, nil)
-	if err != nil {
-		return err
-	}
-	resp.Body.Close()
-	return nil
-}
-
-// CreateProvider creates an agent-network provider.
-func (c *Combined) CreateProvider(ctx context.Context, req api.AgentNetworkProviderRequest) (api.AgentNetworkProvider, error) {
-	return anRequest[api.AgentNetworkProvider](ctx, c, http.MethodPost, "/api/agent-network/providers", req)
-}
-
-// GetProvider fetches a provider by id.
-func (c *Combined) GetProvider(ctx context.Context, id string) (api.AgentNetworkProvider, error) {
-	return anRequest[api.AgentNetworkProvider](ctx, c, http.MethodGet, "/api/agent-network/providers/"+id, nil)
-}
-
-// ListProviders returns all providers for the account.
-func (c *Combined) ListProviders(ctx context.Context) ([]api.AgentNetworkProvider, error) {
-	return anRequest[[]api.AgentNetworkProvider](ctx, c, http.MethodGet, "/api/agent-network/providers", nil)
-}
-
-// DeleteProvider removes a provider by id.
-func (c *Combined) DeleteProvider(ctx context.Context, id string) error {
-	return anDelete(ctx, c, "/api/agent-network/providers/"+id)
-}
-
-// CreatePolicy creates an agent-network policy.
-func (c *Combined) CreatePolicy(ctx context.Context, req api.AgentNetworkPolicyRequest) (api.AgentNetworkPolicy, error) {
-	return anRequest[api.AgentNetworkPolicy](ctx, c, http.MethodPost, "/api/agent-network/policies", req)
-}
-
-// DeletePolicy removes a policy by id.
-func (c *Combined) DeletePolicy(ctx context.Context, id string) error {
-	return anDelete(ctx, c, "/api/agent-network/policies/"+id)
-}
-
-// GetSettings returns the account's agent-network settings row. It exists only
-// after the first provider create bootstraps it.
-func (c *Combined) GetSettings(ctx context.Context) (api.AgentNetworkSettings, error) {
-	return anRequest[api.AgentNetworkSettings](ctx, c, http.MethodGet, "/api/agent-network/settings", nil)
-}
-
-// UpdateSettings applies the mutable collection toggles.
-func (c *Combined) UpdateSettings(ctx context.Context, req api.AgentNetworkSettingsRequest) (api.AgentNetworkSettings, error) {
-	return anRequest[api.AgentNetworkSettings](ctx, c, http.MethodPut, "/api/agent-network/settings", req)
-}
-
-// ListConsumption returns the account's consumption rows (possibly empty).
-func (c *Combined) ListConsumption(ctx context.Context) ([]api.AgentNetworkConsumption, error) {
-	return anRequest[[]api.AgentNetworkConsumption](ctx, c, http.MethodGet, "/api/agent-network/consumption", nil)
-}
--- a/e2e/harness/bootstrap.go
+++ b/e2e/harness/bootstrap.go
@@ -1,47 +0,0 @@
-//go:build e2e
-
-package harness
-
-import (
-	"context"
-	"fmt"
-
-	"github.com/netbirdio/netbird/shared/management/client/rest"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-)
-
-// Bootstrap creates the initial admin owner through the unauthenticated
-// /api/setup endpoint and returns the plaintext admin PAT. It also wires an
-// authenticated REST client on the Combined (see API). create_pat requires the
-// server to run with NB_SETUP_PAT_ENABLED=true, which the harness sets. A
-// second call returns an error (the server reports setup already completed).
-func (c *Combined) Bootstrap(ctx context.Context) (string, error) {
-	// The setup endpoint is unauthenticated; use a tokenless client.
-	setupClient := rest.NewWithOptions(rest.WithManagementURL(c.BaseURL))
-
-	createPAT := true
-	expireDays := 1
-	resp, err := setupClient.Instance.Setup(ctx, api.PostApiSetupJSONRequestBody{
-		Email:       "admin@netbird.test",
-		Password:    "Netbird-e2e-Passw0rd!",
-		Name:        "E2E Admin",
-		CreatePat:   &createPAT,
-		PatExpireIn: &expireDays,
-	})
-	if err != nil {
-		return "", fmt.Errorf("instance setup: %w", err)
-	}
-	if resp.PersonalAccessToken == nil || *resp.PersonalAccessToken == "" {
-		return "", fmt.Errorf("setup succeeded but no PAT returned (is NB_SETUP_PAT_ENABLED set?)")
-	}
-
-	c.PAT = *resp.PersonalAccessToken
-	c.api = rest.New(c.BaseURL, c.PAT)
-	return c.PAT, nil
-}
-
-// API returns the PAT-authenticated management REST client. It is nil until
-// Bootstrap runs.
-func (c *Combined) API() *rest.Client {
-	return c.api
-}
--- a/e2e/harness/combined.go
+++ b/e2e/harness/combined.go
@@ -1,146 +0,0 @@
-//go:build e2e
-
-package harness
-
-import (
-	"context"
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"time"
-
-	"github.com/docker/docker/api/types/container"
-	"github.com/docker/go-connections/nat"
-	"github.com/testcontainers/testcontainers-go"
-	"github.com/testcontainers/testcontainers-go/wait"
-
-	"github.com/netbirdio/netbird/shared/management/client/rest"
-)
-
-const (
-	combinedDockerfile = "combined/Dockerfile.multistage"
-	combinedImageTag   = "netbird-e2e-combined:latest"
-	combinedHTTPPort   = "8080/tcp"
-
-	// containerInternalURL is how the server addresses itself: it listens on
-	// :8080 inside the container, so the embedded IdP issuer and exposed
-	// address both resolve there. Tests reach it via the mapped host port.
-	containerInternalURL = "http://localhost:8080"
-	containerIssuer      = containerInternalURL + "/oauth2"
-)
-
-// Combined is a running combined NetBird server (management + signal + relay +
-// STUN + embedded IdP) plus the connection details tests need.
-type Combined struct {
-	container testcontainers.Container
-	// BaseURL is the host-reachable management API root, e.g. http://127.0.0.1:51234.
-	BaseURL string
-	// PAT is the admin Personal Access Token minted via Bootstrap.
-	PAT string
-
-	api     *rest.Client
-	workDir string
-}
-
-// StartCombined builds the combined server from its multistage Dockerfile and
-// boots it with setup-PAT enabled, returning once the API is serving. The
-// caller still owns minting the admin PAT via Bootstrap.
-func StartCombined(ctx context.Context) (*Combined, error) {
-	root, err := repoRoot()
-	if err != nil {
-		return nil, err
-	}
-
-	// Build with the docker CLI (BuildKit) rather than testcontainers' classic
-	// build path so the Dockerfile's cache mounts are honored and recompiles
-	// stay incremental. testcontainers then just runs the tagged image.
-	if err := buildCombinedImage(ctx, root); err != nil {
-		return nil, err
-	}
-
-	// Work dir under /tmp so Docker Desktop file sharing (which excludes
-	// macOS's /var/folders TMPDIR) can bind-mount it.
-	workDir, err := os.MkdirTemp("/tmp", "nb-e2e-combined-*")
-	if err != nil {
-		return nil, fmt.Errorf("create work dir: %w", err)
-	}
-
-	cfg := fmt.Sprintf(combinedConfigYAML, containerInternalURL, containerIssuer)
-	if err := os.WriteFile(filepath.Join(workDir, "config.yaml"), []byte(cfg), 0o644); err != nil {
-		return nil, fmt.Errorf("write combined config: %w", err)
-	}
-	if err := os.MkdirAll(filepath.Join(workDir, "data"), 0o755); err != nil {
-		return nil, fmt.Errorf("create datadir: %w", err)
-	}
-
-	req := testcontainers.ContainerRequest{
-		Image:        combinedImageTag,
-		ExposedPorts: []string{combinedHTTPPort},
-		Env: map[string]string{
-			"NB_SETUP_PAT_ENABLED": "true",
-		},
-		Cmd: []string{"--config", "/nb/config.yaml"},
-		HostConfigModifier: func(hc *container.HostConfig) {
-			hc.Binds = append(hc.Binds, workDir+":/nb")
-		},
-		WaitingFor: wait.ForHTTP("/api/instance").
-			WithPort(combinedHTTPPort).
-			WithStatusCodeMatcher(func(status int) bool { return status == 200 }).
-			WithStartupTimeout(90 * time.Second),
-	}
-
-	c, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
-		ContainerRequest: req,
-		Started:          true,
-	})
-	if err != nil {
-		return nil, fmt.Errorf("start combined container: %w", err)
-	}
-
-	host, err := c.Host(ctx)
-	if err != nil {
-		_ = c.Terminate(ctx)
-		return nil, fmt.Errorf("container host: %w", err)
-	}
-	mapped, err := c.MappedPort(ctx, nat.Port(combinedHTTPPort))
-	if err != nil {
-		_ = c.Terminate(ctx)
-		return nil, fmt.Errorf("mapped port: %w", err)
-	}
-
-	return &Combined{
-		container: c,
-		BaseURL:   fmt.Sprintf("http://%s:%s", host, mapped.Port()),
-		workDir:   workDir,
-	}, nil
-}
-
-// buildCombinedImage builds the combined server image via the docker CLI with
-// BuildKit enabled, so the Dockerfile's cache mounts work and unchanged sources
-// reuse the layer + go caches. The image is tagged stably so reruns are cheap.
-func buildCombinedImage(ctx context.Context, root string) error {
-	cmd := exec.CommandContext(ctx, "docker", "build",
-		"-f", combinedDockerfile,
-		"-t", combinedImageTag,
-		".",
-	)
-	cmd.Dir = root
-	cmd.Env = append(os.Environ(), "DOCKER_BUILDKIT=1")
-	if out, err := cmd.CombinedOutput(); err != nil {
-		return fmt.Errorf("build combined image: %w\n%s", err, string(out))
-	}
-	return nil
-}
-
-// Terminate stops the container and removes the work dir.
-func (c *Combined) Terminate(ctx context.Context) error {
-	var err error
-	if c.container != nil {
-		err = c.container.Terminate(ctx)
-	}
-	if c.workDir != "" {
-		_ = os.RemoveAll(c.workDir)
-	}
-	return err
-}
--- a/e2e/harness/config.go
+++ b/e2e/harness/config.go
@@ -1,24 +0,0 @@
-//go:build e2e
-
-package harness
-
-// combinedConfigYAML is a minimal combined-server config for tests: plain HTTP
-// on :8080 (no TLS cert configured → the server serves HTTP and expects to sit
-// behind a reverse proxy, which is exactly what we want for in-cluster tests),
-// embedded IdP, local signal/relay/STUN, and a sqlite store under the mounted
-// data dir. exposedAddress is the address peers use to reach this container; it
-// is overridden per-run so the value matches the container's network alias.
-const combinedConfigYAML = `server:
-  listenAddress: ":8080"
-  exposedAddress: "%s"
-  healthcheckAddress: ":9000"
-  metricsPort: 9090
-  logLevel: "info"
-  logFile: "console"
-  authSecret: "e2e-relay-secret"
-  dataDir: "/nb/data"
-  auth:
-    issuer: "%s"
-  store:
-    engine: "sqlite"
-`
--- a/e2e/harness/doc.go
+++ b/e2e/harness/doc.go
@@ -1,13 +0,0 @@
-//go:build e2e
-
-// Package harness provides a self-contained, OIDC-free way to stand up NetBird
-// components in containers for end-to-end tests. It is feature-agnostic: any
-// suite can ask for a live management server (with an admin PAT minted through
-// the unauthenticated /api/setup bootstrap) and, later, a proxy and client.
-//
-// The harness compiles each component once in a cached builder container and
-// mounts the resulting binary into a slim runtime container, so iterating on a
-// branch doesn't pay a full image rebuild per run. Everything is gated behind
-// the `e2e` build tag so normal builds and unit tests never pull in
-// testcontainers.
-package harness
--- a/e2e/harness/paths.go
+++ b/e2e/harness/paths.go
@@ -1,29 +0,0 @@
-//go:build e2e
-
-package harness
-
-import (
-	"fmt"
-	"os"
-	"path/filepath"
-)
-
-// repoRoot walks up from the working directory to the module root (the
-// directory holding go.mod), so the Docker build context is correct no matter
-// which package the test runs from.
-func repoRoot() (string, error) {
-	dir, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-	for {
-		if _, statErr := os.Stat(filepath.Join(dir, "go.mod")); statErr == nil {
-			return dir, nil
-		}
-		parent := filepath.Dir(dir)
-		if parent == dir {
-			return "", fmt.Errorf("go.mod not found above %s", dir)
-		}
-		dir = parent
-	}
-}
--- a/go.mod
+++ b/go.mod
@@ -35,7 +35,6 @@ require (
 	github.com/DeRuina/timberjack v1.4.2
 	github.com/awnumar/memguard v0.23.0
 	github.com/aws/aws-sdk-go-v2 v1.38.3
-	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1
 	github.com/aws/aws-sdk-go-v2/config v1.31.6
 	github.com/aws/aws-sdk-go-v2/credentials v1.18.10
 	github.com/aws/aws-sdk-go-v2/service/s3 v1.87.3
@@ -157,6 +156,7 @@ require (
 	github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
 	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
 	github.com/awnumar/memcall v0.4.0 // indirect
+	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.1 // indirect
 	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.6 // indirect
 	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 // indirect
 	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 // indirect
--- a/infrastructure_files/getting-started-enterprise.sh
+++ b/infrastructure_files/getting-started-enterprise.sh
@@ -1,616 +0,0 @@
-#!/bin/bash
-
-set -e
-set -o pipefail
-
-# NetBird Enterprise — Getting Started
-# Single-node bootstrap for a self-hosted NetBird Enterprise stack with the
-# embedded identity provider. Owner is created via first-login flow.
-
-SED_STRIP_PADDING='s/=//g'
-
-check_docker_compose() {
-  if command -v docker-compose &> /dev/null; then
-    echo "docker-compose"
-    return
-  fi
-  if docker compose --help &> /dev/null; then
-    echo "docker compose"
-    return
-  fi
-  echo "docker-compose is not installed or not in PATH. See https://docs.docker.com/engine/install/" > /dev/stderr
-  exit 1
-}
-
-check_openssl() {
-  if ! command -v openssl &> /dev/null; then
-    echo "openssl is not installed or not in PATH." > /dev/stderr
-    exit 1
-  fi
-}
-
-rand_secret() {
-  openssl rand -base64 32 | sed "$SED_STRIP_PADDING"
-}
-
-rand_b64_key() {
-  openssl rand -base64 32
-}
-
-check_nb_domain() {
-  local domain="$1"
-  if [[ -z "$domain" ]]; then
-    echo "The domain cannot be empty." > /dev/stderr
-    return 1
-  fi
-  if [[ "$domain" == "netbird.example.com" ]]; then
-    echo "The domain cannot be netbird.example.com" > /dev/stderr
-    return 1
-  fi
-  if [[ "$domain" =~ ^[0-9.]+$ ]]; then
-    echo "An IP address is not allowed. A real DNS-resolvable domain is required for TLS and the embedded IdP issuer." > /dev/stderr
-    return 1
-  fi
-  if [[ ! "$domain" =~ ^[A-Za-z0-9]([A-Za-z0-9-]*[A-Za-z0-9])?(\.[A-Za-z0-9]([A-Za-z0-9-]*[A-Za-z0-9])?)+$ ]]; then
-    echo "The value '$domain' is not a valid FQDN. A real DNS-resolvable domain is required for TLS and the embedded IdP issuer." > /dev/stderr
-    return 1
-  fi
-  return 0
-}
-
-check_domain_resolves() {
-  local domain="$1"
-  if command -v getent &> /dev/null && getent hosts "$domain" &> /dev/null; then return 0; fi
-  if command -v host &> /dev/null && host "$domain" &> /dev/null; then return 0; fi
-  if command -v dig &> /dev/null && [[ -n "$(dig +short "$domain" 2>/dev/null)" ]]; then return 0; fi
-  if command -v nslookup &> /dev/null && nslookup "$domain" &> /dev/null; then return 0; fi
-  return 1
-}
-
-read_nb_domain() {
-  local value=""
-  echo -n "Enter the FQDN for NetBird (must resolve via DNS, e.g. netbird.my-domain.com): " > /dev/stderr
-  read -r value < /dev/tty
-  if ! check_nb_domain "$value"; then
-    read_nb_domain
-    return
-  fi
-  if ! check_domain_resolves "$value"; then
-    echo "" > /dev/stderr
-    echo "Warning: '$value' does not resolve via DNS from this host." > /dev/stderr
-    echo "Caddy will not be able to issue TLS certificates until it does." > /dev/stderr
-    local confirm=""
-    echo -n "Continue anyway? [y/N]: " > /dev/stderr
-    read -r confirm < /dev/tty
-    if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
-      read_nb_domain
-      return
-    fi
-  fi
-  echo "$value"
-}
-
-read_required() {
-  local prompt="$1"
-  local value=""
-  while [[ -z "$value" ]]; do
-    echo -n "$prompt: " > /dev/stderr
-    read -r value < /dev/tty
-    if [[ -z "$value" ]]; then
-      echo "Value cannot be empty." > /dev/stderr
-    fi
-  done
-  echo "$value"
-}
-
-read_secret() {
-  local prompt="$1"
-  local value=""
-  while [[ -z "$value" ]]; do
-    echo -n "$prompt: " > /dev/stderr
-    read -rs value < /dev/tty
-    echo "" > /dev/stderr
-    if [[ -z "$value" ]]; then
-      echo "Value cannot be empty." > /dev/stderr
-    fi
-  done
-  echo "$value"
-}
-
-# read_yes_no "<prompt>" [<default y|n>]
-read_yes_no() {
-  local prompt="$1"
-  local default="${2:-n}"
-  local hint
-  if [[ "$default" == "y" ]]; then
-    hint="[Y/n]"
-  else
-    hint="[y/N]"
-  fi
-  echo -n "${prompt} ${hint}: " > /dev/stderr
-  local ans=""
-  read -r ans < /dev/tty
-  if [[ -z "$ans" ]]; then
-    ans="$default"
-  fi
-  case "$ans" in
-    [Yy] | [Yy][Ee][Ss]) echo "yes" ;;
-    *) echo "no" ;;
-  esac
-}
-
-wait_postgres() {
-  set +e
-  echo -n "Waiting for postgres to become ready"
-  local counter=1
-  while true; do
-    if $DOCKER_COMPOSE_COMMAND exec -T postgres pg_isready -U "$POSTGRES_USER" -d "$POSTGRES_DB" &> /dev/null; then
-      break
-    fi
-    if [[ $counter -eq 60 ]]; then
-      echo ""
-      echo "Postgres is taking too long. Recent logs:"
-      $DOCKER_COMPOSE_COMMAND logs --tail=20 postgres
-      exit 1
-    fi
-    echo -n " ."
-    sleep 2
-    counter=$((counter + 1))
-  done
-  echo " done"
-  set -e
-}
-
-init_environment() {
-  check_openssl
-  DOCKER_COMPOSE_COMMAND=$(check_docker_compose)
-
-  if [[ -f .env ]] || [[ -f docker-compose.yml ]] || [[ -f config.yaml ]] || [[ -f Caddyfile ]]; then
-    echo "Generated files already exist in $(pwd)."
-    echo "If you want to reinitialize the environment, please remove them first:"
-    echo "  $DOCKER_COMPOSE_COMMAND down --volumes # removes all containers and volumes"
-    echo "  rm -f .env docker-compose.yml Caddyfile config.yaml"
-    echo "Be aware this will remove all data from the database."
-    exit 1
-  fi
-
-  echo "NetBird Enterprise bootstrap"
-  echo ""
-  echo "Traffic flow:"
-  echo "  Enables traffic events logging on the management server."
-  echo "  When enabled, the NetBird stack also runs NATS along with two"
-  echo "  additional containers: netbird-receiver (the traffic log receiver"
-  echo "  service) and netbird-enricher (the traffic log enricher service)."
-  echo "  It still has to be turned on from the dashboard settings afterwards."
-  echo "  See https://docs.netbird.io/manage/activity/traffic-events-logging"
-  NETBIRD_TRAFFIC_FLOW=$(read_yes_no "Enable traffic flow" "n")
-
-  echo ""
-  NETBIRD_DOMAIN=$(read_nb_domain)
-
-  echo ""
-
-  NETBIRD_LICENSE_KEY=$(read_secret "Enter license key (input hidden)")
-
-  GHCR_USERNAME="netbirdExtAccess1"
-  GHCR_TOKEN=$(read_secret "Enter GHCR token (input hidden)")
-
-  POSTGRES_USER="netbird"
-  POSTGRES_DB="netbird"
-  POSTGRES_PASSWORD=$(rand_secret)
-  NETBIRD_ENCRYPTION_KEY=$(rand_b64_key)
-  NETBIRD_RELAY_AUTH_SECRET=$(rand_secret)
-
-  POSTGRES_DSN="host=postgres user=${POSTGRES_USER} password=${POSTGRES_PASSWORD} dbname=${POSTGRES_DB} port=5432 sslmode=disable TimeZone=UTC"
-  NETBIRD_RELAY_ENDPOINT="rels://${NETBIRD_DOMAIN}:443"
-
-  echo ""
-  echo "Selected:"
-  echo "  Traffic flow: ${NETBIRD_TRAFFIC_FLOW}"
-  echo "  Domain:       ${NETBIRD_DOMAIN}"
-  echo ""
-  echo "Rendering files into $(pwd) ..."
-  install -m 600 /dev/null .env
-  render_env >> .env
-  render_docker_compose > docker-compose.yml
-
-  if [[ -z "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
-    sed -i.bak '/NETBIRD_LICENSE_SERVER_BASE_URL/d' docker-compose.yml && rm -f docker-compose.yml.bak
-  fi
-  render_caddyfile > Caddyfile
-  install -m 600 /dev/null config.yaml
-  render_config_yaml >> config.yaml
-
-  echo "Logging in to ghcr.io ..."
-  printf '%s' "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USERNAME" --password-stdin
-  unset GHCR_TOKEN
-
-  echo ""
-  echo "Pulling images ..."
-  $DOCKER_COMPOSE_COMMAND pull
-
-  echo ""
-  echo "Starting postgres ..."
-  $DOCKER_COMPOSE_COMMAND up -d postgres
-  sleep 2
-  wait_postgres
-
-  echo ""
-  echo "Starting remaining services ..."
-  $DOCKER_COMPOSE_COMMAND up -d
-
-  echo ""
-  echo "Done."
-  echo ""
-  echo "Dashboard: https://${NETBIRD_DOMAIN}"
-  echo ""
-  echo "Open the dashboard in a browser to complete the first-login owner setup."
-  echo "All configuration and secrets are stored (mode 600) in $(pwd)/.env"
-  echo ""
-  echo "Tail logs:"
-  echo "  cd $(pwd) && $DOCKER_COMPOSE_COMMAND logs -f netbird-server caddy"
-}
-
-# ------------------------------------------------------------------
-# Renderers
-# ------------------------------------------------------------------
-
-render_env() {
-  cat <<EOF
-# Generated by getting-started-enterprise.sh
-# Holds all configuration and secrets for the stack. Mode 600.
-
-# Features (set by the script; don't edit without re-running)
-NETBIRD_TRAFFIC_FLOW_ENABLED=${NETBIRD_TRAFFIC_FLOW}
-
-# Domain
-NETBIRD_DOMAIN=${NETBIRD_DOMAIN}
-
-# Image tags. Default to "latest"
-NETBIRD_DASHBOARD_TAG=${NETBIRD_DASHBOARD_TAG:-latest}
-NETBIRD_SERVER_TAG=${NETBIRD_SERVER_TAG:-latest}
-EOF
-
-  if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
-    cat <<EOF
-NETBIRD_ENRICHER_TAG=${NETBIRD_ENRICHER_TAG:-latest}
-NETBIRD_RECEIVER_TAG=${NETBIRD_RECEIVER_TAG:-latest}
-EOF
-  fi
-
-  cat <<EOF
-
-# License keys
-EOF
-  if [[ -n "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
-    cat <<EOF
-NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}
-EOF
-  fi
-  cat <<EOF
-NETBIRD_LICENSE_KEY=${NETBIRD_LICENSE_KEY}
-EOF
-
-  cat <<EOF
-
-# Postgres
-POSTGRES_USER=${POSTGRES_USER}
-POSTGRES_DB=${POSTGRES_DB}
-POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
-NETBIRD_STORE_ENGINE_POSTGRES_DSN=${POSTGRES_DSN}
-
-# Relay
-NETBIRD_RELAY_ENDPOINT=${NETBIRD_RELAY_ENDPOINT}
-NETBIRD_RELAY_AUTH_SECRET=${NETBIRD_RELAY_AUTH_SECRET}
-
-# Datastore encryption
-NETBIRD_ENCRYPTION_KEY=${NETBIRD_ENCRYPTION_KEY}
-
-# Dashboard OIDC scopes
-NETBIRD_AUTH_SUPPORTED_SCOPES=${NETBIRD_AUTH_SUPPORTED_SCOPES:-openid profile email groups}
-EOF
-}
-
-render_docker_compose() {
-  render_compose_header
-  render_compose_common
-  render_compose_server
-  if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
-    render_compose_flow
-  fi
-  render_compose_postgres
-  render_compose_footer
-}
-
-render_compose_header() {
-  cat <<'EOF'
-x-default: &default
-  restart: unless-stopped
-  logging:
-    driver: json-file
-    options:
-      max-size: '500m'
-      max-file: '2'
-
-services:
-EOF
-}
-
-render_compose_common() {
-  cat <<'EOF'
-  caddy:
-    <<: *default
-    image: caddy:2
-    container_name: netbird-caddy
-    networks: [netbird]
-    environment:
-      - CADDY_SECURE_DOMAIN=${NETBIRD_DOMAIN}
-    ports:
-      - '443:443'
-      - '443:443/udp'
-      - '80:80'
-    volumes:
-      - netbird_caddy_data:/data
-      - ./Caddyfile:/etc/caddy/Caddyfile
-
-  dashboard:
-    <<: *default
-    image: ghcr.io/netbirdio/dashboard-cloud:${NETBIRD_DASHBOARD_TAG}
-    container_name: netbird-dashboard
-    networks: [netbird]
-    environment:
-      - NETBIRD_MGMT_API_ENDPOINT=https://${NETBIRD_DOMAIN}
-      - NETBIRD_MGMT_GRPC_API_ENDPOINT=https://${NETBIRD_DOMAIN}
-      - AUTH_AUDIENCE=netbird-dashboard
-      - AUTH_CLIENT_ID=netbird-dashboard
-      - AUTH_CLIENT_SECRET=
-      - AUTH_AUTHORITY=https://${NETBIRD_DOMAIN}/oauth2
-      - USE_AUTH0=false
-      - AUTH_SUPPORTED_SCOPES=${NETBIRD_AUTH_SUPPORTED_SCOPES}
-      - AUTH_REDIRECT_URI=/nb-auth
-      - AUTH_SILENT_REDIRECT_URI=/nb-silent-auth
-      - NETBIRD_TOKEN_SOURCE=accessToken
-      - NGINX_SSL_PORT=443
-      - LETSENCRYPT_DOMAIN=
-      - LETSENCRYPT_EMAIL=
-
-EOF
-}
-
-render_compose_server() {
-  cat <<'EOF'
-  netbird-server:
-    <<: *default
-    image: ghcr.io/netbirdio/netbird-server-cloud:${NETBIRD_SERVER_TAG}
-    container_name: netbird-server
-    networks: [netbird]
-    depends_on:
-      dashboard:
-        condition: service_started
-      postgres:
-        condition: service_healthy
-    ports:
-      - '3478:3478/udp'
-    volumes:
-      - netbird_data:/var/lib/netbird
-      - ./config.yaml:/etc/netbird/config.yaml
-    command: ["--config", "/etc/netbird/config.yaml"]
-    environment:
-      - NB_LICENSE_KEY=${NETBIRD_LICENSE_KEY}
-      - NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}
-
-EOF
-}
-
-render_compose_flow() {
-  cat <<'EOF'
-  nats:
-    <<: *default
-    image: nats:2
-    container_name: netbird-nats
-    networks: [netbird]
-    volumes:
-      - netbird_nats_data:/data
-    command: ["-m", "8222", "--jetstream", "--store_dir", "/data"]
-
-  enricher:
-    <<: *default
-    image: ghcr.io/netbirdio/flow-enricher-cloud:${NETBIRD_ENRICHER_TAG}
-    container_name: netbird-enricher
-    networks: [netbird]
-    depends_on:
-      postgres:
-        condition: service_healthy
-      nats:
-        condition: service_started
-    volumes:
-      - netbird_enricher:/var/lib/netbird
-    environment:
-      - NB_LICENSE_KEY=${NETBIRD_LICENSE_KEY}
-      - NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}
-      - NB_DATADIR=/var/lib/netbird
-      - NB_MANAGEMENT_STORE_ENGINE=postgres
-      - NB_MANAGEMENT_POSTGRES_DSN=${NETBIRD_STORE_ENGINE_POSTGRES_DSN}
-      - NETBIRD_STORE_ENGINE_POSTGRES_DSN=${NETBIRD_STORE_ENGINE_POSTGRES_DSN}
-      - NB_TRAFFIC_EVENT_POSTGRES_DSN=${NETBIRD_STORE_ENGINE_POSTGRES_DSN}
-      - NB_TRAFFIC_EVENT_STORE_ENGINE=postgres
-      - NB_MANAGEMENT_STORE_KEY=${NETBIRD_ENCRYPTION_KEY}
-      - NB_FLOW_ADAPTER_TYPE=nats
-      - NB_FLOW_NATS_ENDPOINTS=nats://nats:4222
-      - NB_FLOW_NATS_STREAM=traffic-events
-      - NB_METRICS_PORT=9091
-      - NB_PERSISTENCE_RETENTION_PERIOD=168h
-
-  receiver:
-    <<: *default
-    image: ghcr.io/netbirdio/flow-receiver-cloud:${NETBIRD_RECEIVER_TAG}
-    container_name: netbird-receiver
-    networks: [netbird]
-    depends_on:
-      nats:
-        condition: service_started
-    environment:
-      - NB_LICENSE_KEY=${NETBIRD_LICENSE_KEY}
-      - NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}
-      - NB_FLOW_LISTEN_PORT=80
-      - NB_FLOW_ADAPTER_TYPE=nats
-      - NB_FLOW_NATS_ENDPOINTS=nats://nats:4222
-      - NB_FLOW_NATS_STREAM=traffic-events
-      - NB_FLOW_AUTH_SECRET=${NETBIRD_RELAY_AUTH_SECRET}
-
-EOF
-}
-
-render_compose_postgres() {
-  cat <<'EOF'
-  postgres:
-    <<: *default
-    image: postgres:17
-    container_name: netbird-postgres
-    networks: [netbird]
-    environment:
-      - POSTGRES_USER=${POSTGRES_USER}
-      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
-      - POSTGRES_DB=${POSTGRES_DB}
-    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
-      interval: 10s
-      timeout: 5s
-      retries: 10
-    volumes:
-      - netbird_postgres:/var/lib/postgresql/data
-
-EOF
-}
-
-render_compose_footer() {
-  cat <<'EOF'
-volumes:
-  netbird_data:
-EOF
-  if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
-    cat <<'EOF'
-  netbird_nats_data:
-  netbird_enricher:
-EOF
-  fi
-  cat <<'EOF'
-  netbird_postgres:
-  netbird_caddy_data:
-
-networks:
-  netbird:
-EOF
-}
-
-render_caddyfile() {
-  cat <<'EOF'
-{
-  servers :80,:443 {
-    protocols h1 h2c h2 h3
-  }
-}
-
-(security_headers) {
-    header * {
-        Strict-Transport-Security "max-age=3600; includeSubDomains; preload"
-        X-Content-Type-Options "nosniff"
-        X-Frame-Options "SAMEORIGIN"
-        X-XSS-Protection "1; mode=block"
-        -Server
-        Referrer-Policy strict-origin-when-cross-origin
-    }
-}
-
-:80 {
-    redir https://{$CADDY_SECURE_DOMAIN}{uri} permanent
-}
-
-{$CADDY_SECURE_DOMAIN}:443 {
-    import security_headers
-    # Signal (gRPC over h2c)
-    reverse_proxy /signalexchange.SignalExchange/* h2c://netbird-server:80
-    # Management (gRPC over h2c + HTTP)
-    reverse_proxy /management.ManagementService/* h2c://netbird-server:80
-    reverse_proxy /api/* netbird-server:80
-    reverse_proxy /ws-proxy/* netbird-server:80
-    # Embedded IdP (OAuth2 endpoints served by netbird server)
-    reverse_proxy /oauth2/* netbird-server:80
-    # Relay (WebSocket multiplexed on the same port)
-    reverse_proxy /relay* netbird-server:80
-EOF
-
-  if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
-    cat <<'EOF'
-    # Flow receiver (gRPC over h2c)
-    reverse_proxy /flow.FlowService/* h2c://receiver:80
-EOF
-  fi
-
-  cat <<'EOF'
-    # Dashboard
-    reverse_proxy /* dashboard:80
-}
-EOF
-}
-
-render_config_yaml() {
-  cat <<EOF
-# NetBird Enterprise server configuration.
-# Generated by getting-started-enterprise.sh. Mode 600.
-
-server:
-  listenAddress: ":80"
-  exposedAddress: "https://${NETBIRD_DOMAIN}:443"
-
-  metricsPort: 9090
-  healthcheckAddress: ":9000"
-
-  logLevel: "info"
-  logFile: "console"
-
-  # TLS is terminated by Caddy in front; leave this block empty.
-  tls:
-    certFile: ""
-    keyFile: ""
-    letsencrypt:
-      enabled: false
-
-  authSecret: "${NETBIRD_RELAY_AUTH_SECRET}"
-  dataDir: "/var/lib/netbird/"
-
-  disableAnonymousMetrics: false
-  disableGeoliteUpdate: false
-
-  auth:
-    issuer: "https://${NETBIRD_DOMAIN}/oauth2"
-    localAuthDisabled: false
-    signKeyRefreshEnabled: false
-    dashboardRedirectURIs:
-      - "https://${NETBIRD_DOMAIN}/nb-auth"
-      - "https://${NETBIRD_DOMAIN}/nb-silent-auth"
-    cliRedirectURIs:
-      - "http://localhost:53000/"
-
-  store:
-    engine: "postgres"
-    dsn: "${POSTGRES_DSN}"
-    encryptionKey: "${NETBIRD_ENCRYPTION_KEY}"
-
-  activityStore:
-    engine: "postgres"
-    dsn: "${POSTGRES_DSN}"
-EOF
-
-  if [[ "$NETBIRD_TRAFFIC_FLOW" == "yes" ]]; then
-    cat <<EOF
-
-  trafficFlow:
-    enabled: true
-    address: "https://${NETBIRD_DOMAIN}:443"
-    interval: "60s"
-EOF
-  fi
-}
-
-init_environment
--- a/infrastructure_files/getting-started.sh
+++ b/infrastructure_files/getting-started.sh
@@ -351,11 +351,6 @@ initialize_default_values() {
  NETBIRD_STUN_PORT=3478

  # Docker images
-  # Record whether the operator explicitly pinned the server/proxy images via
-  # env vars, so the agent-network preset can pick its own defaults without
-  # clobbering an explicit override.
-  NETBIRD_SERVER_IMAGE_EXPLICIT=${NETBIRD_SERVER_IMAGE:+true}
-  NETBIRD_PROXY_IMAGE_EXPLICIT=${NETBIRD_PROXY_IMAGE:+true}
  DASHBOARD_IMAGE=${DASHBOARD_IMAGE:-"netbirdio/dashboard:latest"}
  # Combined server replaces separate signal, relay, and management containers
  NETBIRD_SERVER_IMAGE=${NETBIRD_SERVER_IMAGE:-"netbirdio/netbird-server:latest"}
@@ -403,53 +398,7 @@ configure_domain() {
  return 0
 }

-apply_agent_network_preset() {
-  # Agent-network turnkey install: built-in Traefik + NetBird Proxy with
-  # NB_PROXY_PRIVATE=true, dashboard locked to agent-network-only mode.
-  # Bypasses every reverse-proxy / proxy / CrowdSec prompt. The only
-  # inputs we still need from the operator are the domain (handled by
-  # configure_domain via NETBIRD_DOMAIN env var or interactive prompt)
-  # and the ACME email — both honor env vars first and fall back to a
-  # prompt only when unset. CrowdSec is intentionally off.
-  REVERSE_PROXY_TYPE="0"
-  ENABLE_PROXY="true"
-  ENABLE_CROWDSEC="false"
-
-  # Agent-network ships dedicated server/proxy images. Honor an explicit
-  # env override; otherwise pin the agent-network builds.
-  if [[ "${NETBIRD_SERVER_IMAGE_EXPLICIT}" != "true" ]]; then
-    NETBIRD_SERVER_IMAGE="netbirdio/netbird-server:0.74.0-rc.2"
-  fi
-  if [[ "${NETBIRD_PROXY_IMAGE_EXPLICIT}" != "true" ]]; then
-    NETBIRD_PROXY_IMAGE="netbirdio/reverse-proxy:0.74.0-rc.2"
-  fi
-
-  if [[ -n "${NETBIRD_LETSENCRYPT_EMAIL}" ]]; then
-    TRAEFIK_ACME_EMAIL="${NETBIRD_LETSENCRYPT_EMAIL}"
-  else
-    TRAEFIK_ACME_EMAIL=$(read_traefik_acme_email)
-  fi
-
-  echo "" > /dev/stderr
-  echo "Agent-network preset enabled (NETBIRD_AGENT_NETWORK=true):" > /dev/stderr
-  echo "  - reverse proxy: built-in Traefik" > /dev/stderr
-  echo "  - NetBird Proxy: enabled with NB_PROXY_PRIVATE=true" > /dev/stderr
-  echo "  - server image: ${NETBIRD_SERVER_IMAGE}" > /dev/stderr
-  echo "  - proxy image: ${NETBIRD_PROXY_IMAGE}" > /dev/stderr
-  echo "  - dashboard: NETBIRD_AGENT_NETWORK_ONLY=true" > /dev/stderr
-  echo "  - CrowdSec: disabled" > /dev/stderr
-  echo "  - Let's Encrypt email: ${TRAEFIK_ACME_EMAIL}" > /dev/stderr
-  echo "" > /dev/stderr
-}
-
 configure_reverse_proxy() {
-  # Short-circuit: agent-network preset locks every reverse-proxy /
-  # proxy / CrowdSec choice and bypasses the interactive prompts.
-  if [[ "${NETBIRD_AGENT_NETWORK}" == "true" ]]; then
-    apply_agent_network_preset
-    return 0
-  fi
-
  # Prompt for reverse proxy type
  REVERSE_PROXY_TYPE=$(read_reverse_proxy_type)

@@ -961,15 +910,6 @@ NGINX_SSL_PORT=443
 # Letsencrypt
 LETSENCRYPT_DOMAIN=none
 EOF
-
-  if [[ "${NETBIRD_AGENT_NETWORK}" == "true" ]]; then
-    cat <<EOF
-# Agent-network preset: dashboard hides the standard NetBird surfaces
-# and exposes only the AI Observability + agent-network configuration
-# pages. Paired with NB_PROXY_PRIVATE=true on the proxy side.
-NETBIRD_AGENT_NETWORK_ONLY=true
-EOF
-  fi
  return 0
 }

@@ -1006,17 +946,6 @@ NB_PROXY_PROXY_PROTOCOL=true
 NB_PROXY_TRUSTED_PROXIES=$TRAEFIK_IP
 EOF

-  if [[ "${NETBIRD_AGENT_NETWORK}" == "true" ]]; then
-    cat <<EOF
-# Agent-network preset: turn the proxy into the private reverse-proxy
-# ingress for agent-network synth services. Disables the public-facing
-# surface so the proxy serves only synth-generated routes (the
-# llm_router-driven LLM endpoints) and the per-account inbound
-# listeners on the embedded netstack.
-NB_PROXY_PRIVATE=true
-EOF
-  fi
-
  if [[ "$ENABLE_CROWDSEC" == "true" && -n "$CROWDSEC_BOUNCER_KEY" ]]; then
    cat <<EOF
 NB_PROXY_CROWDSEC_API_URL=http://crowdsec:8080
@@ -1397,20 +1326,12 @@ print_builtin_traefik_instructions() {
    echo "  - 51820/udp (WIREGUARD - (optional) for P2P proxy connections)"
  fi
  echo ""
-  if [[ "${NETBIRD_AGENT_NETWORK}" == "true" ]]; then
-    echo "For enterprise environments requiring high availability and advanced integrations,"
-    echo "consider a commercial on-prem license:"
-    echo ""
-    echo "  Commercial license: https://netbird.ai/pricing"
-    echo "  Documentation: https://docs.netbird.io/agent-network"
-  else
-    echo "This setup is ideal for homelabs and smaller organization deployments."
-    echo "For enterprise environments requiring high availability and advanced integrations,"
-    echo "consider a commercial on-prem license or scaling your open source deployment:"
-    echo ""
-    echo "  Commercial license: https://netbird.io/pricing#on-prem"
-    echo "  Scaling guide:      https://docs.netbird.io/scaling-your-self-hosted-deployment"
-  fi
+  echo "This setup is ideal for homelabs and smaller organization deployments."
+  echo "For enterprise environments requiring high availability and advanced integrations,"
+  echo "consider a commercial on-prem license or scaling your open source deployment:"
+  echo ""
+  echo "  Commercial license: https://netbird.io/pricing#on-prem"
+  echo "  Scaling guide:      https://docs.netbird.io/scaling-your-self-hosted-deployment"
  echo ""
  if [[ "$ENABLE_PROXY" == "true" ]]; then
    echo "NetBird Proxy:"
@@ -1433,11 +1354,6 @@ print_builtin_traefik_instructions() {
      echo ""
    fi
  fi
-  if [[ "${NETBIRD_AGENT_NETWORK}" == "true" ]]; then
-    echo "Note: The public domain is only for setting up secure connections."
-    echo "Your APIs and agent services remain private and are never exposed publicly."
-    echo ""
-  fi
  return 0
 }

--- a/infrastructure_files/migrate-to-enterprise.sh
+++ b/infrastructure_files/migrate-to-enterprise.sh
@@ -1,638 +0,0 @@
-#!/bin/bash
-
-set -e
-set -o pipefail
-
-# NetBird — community combined → Enterprise combined migration
-#
-# Non-destructive migration: produces docker-compose.override.yml (auto-loaded
-# by docker compose) and config.yaml.enterprise alongside the operator's
-# existing files. Original docker-compose.yml and config.yaml are never
-# modified.
-#
-# Steps (all optional, asked interactively):
-#   1. Image swap         — replace community images with enterprise cloud images.
-#   2. Postgres migration — add Postgres, migrate SQLite data via migrate-store.
-#   3. Traffic flow       — add NATS + flow-enricher + flow-receiver.
-#
-# To revert:
-#   docker compose down
-#   rm -f docker-compose.override.yml config.yaml.enterprise
-#   # If Postgres migration was done, also restore the SQLite backup printed
-#   # at the end of this script's run.
-#   docker compose up -d
-
-OVERRIDE_FILE="docker-compose.override.yml"
-ENTERPRISE_CONFIG_FILE="config.yaml.enterprise"
-
-check_docker_compose() {
-  if command -v docker-compose &> /dev/null; then
-    echo "docker-compose"
-    return
-  fi
-  if docker compose --help &> /dev/null; then
-    echo "docker compose"
-    return
-  fi
-  echo "docker-compose is not installed or not in PATH." > /dev/stderr
-  exit 1
-}
-
-check_yq() {
-  if ! command -v yq &> /dev/null; then
-    cat > /dev/stderr <<'EOF'
-yq is required to parse and update YAML safely.
-
-  macOS:   brew install yq
-  Linux:   https://github.com/mikefarah/yq/releases (download binary into PATH)
-  Debian:  apt-get install yq   (Note: must be the mikefarah Go yq, not the Python wrapper.)
-
-EOF
-    exit 1
-  fi
-  if ! yq --version 2>&1 | grep -q "mikefarah"; then
-    echo "yq is present but appears to be the wrong implementation. The mikefarah Go-based yq is required (https://github.com/mikefarah/yq)." > /dev/stderr
-    exit 1
-  fi
-}
-
-check_openssl() {
-  if ! command -v openssl &> /dev/null; then
-    echo "openssl is not installed or not in PATH." > /dev/stderr
-    exit 1
-  fi
-}
-
-rand_password() {
-  openssl rand -hex 32
-}
-
-read_required() {
-  local prompt="$1"
-  local value=""
-  while [[ -z "$value" ]]; do
-    echo -n "$prompt: " > /dev/stderr
-    read -r value < /dev/tty
-    if [[ -z "$value" ]]; then
-      echo "Value cannot be empty." > /dev/stderr
-    fi
-  done
-  echo "$value"
-}
-
-read_secret() {
-  local prompt="$1"
-  local value=""
-  while [[ -z "$value" ]]; do
-    echo -n "$prompt: " > /dev/stderr
-    read -rs value < /dev/tty
-    echo "" > /dev/stderr
-    if [[ -z "$value" ]]; then
-      echo "Value cannot be empty." > /dev/stderr
-    fi
-  done
-  echo "$value"
-}
-
-read_yes_no() {
-  local prompt="$1"
-  local default="${2:-n}"
-  local hint
-  if [[ "$default" == "y" ]]; then
-    hint="[Y/n]"
-  else
-    hint="[y/N]"
-  fi
-  echo -n "${prompt} ${hint}: " > /dev/stderr
-  local ans=""
-  read -r ans < /dev/tty
-  if [[ -z "$ans" ]]; then
-    ans="$default"
-  fi
-  case "$ans" in
-    [Yy] | [Yy][Ee][Ss]) echo "yes" ;;
-    *) echo "no" ;;
-  esac
-}
-
-# ---------------------------------------------------------------------------
-# Detection — read the operator's existing compose to find service names and
-# paths we need to override. Bail loudly if shape isn't recognised.
-# ---------------------------------------------------------------------------
-
-detect_combined_service() {
-  yq eval '.services | to_entries | map(select(.value.image | test("^netbirdio/netbird-server"))) | .[0].key // ""' "$COMPOSE_FILE"
-}
-
-detect_dashboard_service() {
-  yq eval '.services | to_entries | map(select(.value.image | test("^netbirdio/dashboard"))) | .[0].key // ""' "$COMPOSE_FILE"
-}
-
-detect_config_yaml_host_path() {
-  yq eval ".services[\"$COMBINED_SERVICE\"].volumes[] | select(. | test(\":/etc/netbird/config.yaml\")) | sub(\":/etc/netbird/config.yaml.*\"; \"\") // \"\"" "$COMPOSE_FILE" | head -1
-}
-
-detect_data_volume() {
-  yq eval ".services[\"$COMBINED_SERVICE\"].volumes[] | select(. | test(\":/var/lib/netbird\")) | sub(\":/var/lib/netbird.*\"; \"\") // \"\"" "$COMPOSE_FILE" | head -1
-}
-
-detect_exposed_address() {
-  yq eval '.server.exposedAddress // ""' "$CONFIG_YAML_HOST"
-}
-
-detect_compose_network() {
-  local tag
-  tag=$(yq eval ".services[\"$COMBINED_SERVICE\"].networks | tag" "$COMPOSE_FILE" 2>/dev/null)
-  case "$tag" in
-    "!!seq")
-      yq eval ".services[\"$COMBINED_SERVICE\"].networks[0]" "$COMPOSE_FILE"
-      ;;
-    "!!map")
-      yq eval ".services[\"$COMBINED_SERVICE\"].networks | keys | .[0]" "$COMPOSE_FILE"
-      ;;
-    *)
-      echo "default"
-      ;;
-  esac
-}
-
-# ---------------------------------------------------------------------------
-# Renderers
-# ---------------------------------------------------------------------------
-
-# Build docker-compose.override.yml from the steps the operator selected.
-# Service names match what we detected on the operator's side.
-render_override() {
-  cat <<EOF
-# Generated by migrate-to-enterprise.sh. Mode 644.
-# Merged with docker-compose.yml automatically by Docker Compose.
-# Remove this file (and config.yaml.enterprise if present) to revert.
-
-services:
-  ${DASHBOARD_SERVICE}:
-    image: \${NETBIRD_DASHBOARD_IMAGE:-ghcr.io/netbirdio/dashboard-cloud:latest}
-
-  ${COMBINED_SERVICE}:
-    image: \${NETBIRD_SERVER_IMAGE:-ghcr.io/netbirdio/netbird-server-cloud:latest}
-    environment:
-      NB_LICENSE_KEY: \${NB_LICENSE_KEY}
-      NETBIRD_LICENSE_SERVER_BASE_URL: \${NETBIRD_LICENSE_SERVER_BASE_URL}
-EOF
-
-  if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
-    cat <<EOF
-    depends_on:
-      postgres:
-        condition: service_healthy
-    volumes:
-      - ./${ENTERPRISE_CONFIG_FILE}:/etc/netbird/config.yaml.enterprise:ro
-    command: ["--config", "/etc/netbird/config.yaml.enterprise"]
-
-  postgres:
-    image: postgres:17
-    container_name: netbird-postgres
-    restart: unless-stopped
-    networks: [${COMPOSE_NETWORK}]
-    environment:
-      POSTGRES_USER: netbird
-      POSTGRES_PASSWORD: \${POSTGRES_PASSWORD}
-      POSTGRES_DB: netbird
-    volumes:
-      - netbird_postgres:/var/lib/postgresql/data
-    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U netbird -d netbird"]
-      interval: 5s
-      timeout: 5s
-      retries: 20
-EOF
-  fi
-
-  if [[ "$ENABLE_FLOW" == "yes" ]]; then
-    cat <<EOF
-
-  nats:
-    image: nats:2
-    container_name: netbird-nats
-    restart: unless-stopped
-    networks: [${COMPOSE_NETWORK}]
-    command: ["-m", "8222", "--jetstream", "--store_dir", "/data"]
-    volumes:
-      - netbird_nats_data:/data
-
-  flow-enricher:
-    image: ghcr.io/netbirdio/flow-enricher-cloud:latest
-    container_name: netbird-flow-enricher
-    restart: unless-stopped
-    networks: [${COMPOSE_NETWORK}]
-    depends_on:
-      postgres:
-        condition: service_healthy
-      nats:
-        condition: service_started
-    environment:
-      NB_LICENSE_KEY: \${NB_LICENSE_KEY}
-      NETBIRD_LICENSE_SERVER_BASE_URL: \${NETBIRD_LICENSE_SERVER_BASE_URL}
-      NB_DATADIR: /var/lib/netbird
-      NB_MANAGEMENT_STORE_ENGINE: postgres
-      NB_MANAGEMENT_POSTGRES_DSN: "host=postgres user=netbird password=\${POSTGRES_PASSWORD} dbname=netbird port=5432 sslmode=disable"
-      NB_STORE_ENGINE_POSTGRES_DSN: "host=postgres user=netbird password=\${POSTGRES_PASSWORD} dbname=netbird port=5432 sslmode=disable"
-      NB_TRAFFIC_EVENT_STORE_ENGINE: postgres
-      NB_TRAFFIC_EVENT_POSTGRES_DSN: "host=postgres user=netbird password=\${POSTGRES_PASSWORD} dbname=netbird port=5432 sslmode=disable"
-      NB_MANAGEMENT_STORE_KEY: \${NETBIRD_ENCRYPTION_KEY}
-      NB_FLOW_ADAPTER_TYPE: nats
-      NB_FLOW_NATS_ENDPOINTS: nats://nats:4222
-      NB_FLOW_NATS_STREAM: traffic-events
-      NB_METRICS_PORT: 9091
-      NB_PERSISTENCE_RETENTION_PERIOD: 168h
-
-  flow-receiver:
-    image: ghcr.io/netbirdio/flow-receiver-cloud:latest
-    container_name: netbird-flow-receiver
-    restart: unless-stopped
-    networks: [${COMPOSE_NETWORK}]
-    depends_on:
-      nats:
-        condition: service_started
-    environment:
-      NB_LICENSE_KEY: \${NB_LICENSE_KEY}
-      NETBIRD_LICENSE_SERVER_BASE_URL: \${NETBIRD_LICENSE_SERVER_BASE_URL}
-      NB_FLOW_LISTEN_PORT: 80
-      NB_FLOW_ADAPTER_TYPE: nats
-      NB_FLOW_NATS_ENDPOINTS: nats://nats:4222
-      NB_FLOW_NATS_STREAM: traffic-events
-      NB_FLOW_AUTH_SECRET: \${NB_FLOW_AUTH_SECRET}
-    labels:
-      - traefik.enable=true
-      - traefik.http.routers.netbird-flow.rule=Host(\`${NETBIRD_HOSTNAME}\`) && PathPrefix(\`/flow.FlowService/\`)
-      - traefik.http.routers.netbird-flow.entrypoints=websecure
-      - traefik.http.routers.netbird-flow.tls=true
-      - traefik.http.routers.netbird-flow.tls.certresolver=letsencrypt
-      - traefik.http.routers.netbird-flow.service=netbird-flow-h2c
-      - traefik.http.routers.netbird-flow.priority=100
-      - traefik.http.services.netbird-flow-h2c.loadbalancer.server.port=80
-      - traefik.http.services.netbird-flow-h2c.loadbalancer.server.scheme=h2c
-EOF
-  fi
-
-  # Volume declarations for anything new the override introduced
-  local has_volumes="no"
-  if [[ "$MIGRATE_POSTGRES" == "yes" ]] || [[ "$ENABLE_FLOW" == "yes" ]]; then
-    has_volumes="yes"
-  fi
-
-  if [[ "$has_volumes" == "yes" ]]; then
-    cat <<EOF
-
-volumes:
-EOF
-    if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
-      echo "  netbird_postgres:"
-    fi
-    if [[ "$ENABLE_FLOW" == "yes" ]]; then
-      echo "  netbird_nats_data:"
-    fi
-  fi
-}
-
-# Build config.yaml.enterprise by yq-editing the operator's existing
-# config.yaml. We don't touch the original file.
-render_enterprise_config() {
-  local pg_dsn="host=postgres user=netbird password=${POSTGRES_PASSWORD} dbname=netbird port=5432 sslmode=disable"
-
-  yq eval "
-    .server.store.engine = \"postgres\" |
-    .server.store.dsn = \"$pg_dsn\" |
-    .server.activityStore.engine = \"postgres\" |
-    .server.activityStore.dsn = \"$pg_dsn\" |
-    .server.authStore.engine = \"postgres\" |
-    .server.authStore.dsn = \"$pg_dsn\"
-  " "$CONFIG_YAML_HOST" > "$ENTERPRISE_CONFIG_FILE"
-
-  if [[ "$ENABLE_FLOW" == "yes" ]]; then
-    local flow_addr="${NETBIRD_DOMAIN}"
-    yq eval -i "
-      .server.trafficFlow.enabled = true |
-      .server.trafficFlow.address = \"$flow_addr\" |
-      .server.trafficFlow.interval = \"60s\"
-    " "$ENTERPRISE_CONFIG_FILE"
-  fi
-}
-
-# ---------------------------------------------------------------------------
-# Execution steps
-# ---------------------------------------------------------------------------
-
-resolve_data_volume() {
-  local short="$1"
-  local actual
-  # Resolve project-prefixed volume name from Docker Compose config first.
-  actual=$($DOCKER_COMPOSE_COMMAND config 2>/dev/null | yq eval ".volumes.\"$short\".name" - 2>/dev/null)
-  if [[ -n "$actual" && "$actual" != "null" ]]; then
-    echo "$actual"
-    return
-  fi
-  # Relative bind mount: docker-compose resolves it against the compose
-  # file's directory, but `docker run -v` resolves it against the current
-  # working directory. Normalize to an absolute path so both interpretations
-  # agree (and the printed revert command works from any CWD).
-  if [[ "$short" == ./* || "$short" == ../* ]]; then
-    local compose_dir
-    compose_dir="$(cd "$(dirname "$COMPOSE_FILE")" && pwd)"
-    (
-      cd "$compose_dir"
-      cd "$(dirname "$short")"
-      printf '%s/%s\n' "$(pwd)" "$(basename "$short")"
-    )
-    return
-  fi
-  # Not a named volume (e.g. an absolute bind-mount path) — use it as-is.
-  echo "$short"
-}
-
-backup_sqlite() {
-  BACKUP_DIR="$(pwd)/backups/sqlite-pre-enterprise-$(date +%Y%m%d-%H%M%S)"
-  mkdir -p "$BACKUP_DIR"
-  local data_volume_actual
-  data_volume_actual=$(resolve_data_volume "$DATA_VOLUME")
-  echo "Backing up SQLite store from volume '$data_volume_actual' to $BACKUP_DIR ..."
-  docker run --rm \
-    -v "${data_volume_actual}:/var/lib/netbird:ro" \
-    -v "${BACKUP_DIR}:/backup" \
-    busybox \
-    sh -c 'cp -a /var/lib/netbird/. /backup/ 2>/dev/null || true'
-  local copied
-  copied=$(find "$BACKUP_DIR" -mindepth 1 | head -1)
-  if [[ -z "$copied" ]]; then
-    echo "  ⚠ Backup directory is empty — the volume '$data_volume_actual' didn't contain data. Aborting." > /dev/stderr
-    exit 1
-  fi
-  echo "  done"
-}
-
-run_migrate_store() {
-  echo "Running migrate-store (SQLite → Postgres) ..."
-  $DOCKER_COMPOSE_COMMAND run --rm "$COMBINED_SERVICE" migrate-store --config /etc/netbird/config.yaml.enterprise --verify
-  echo "  done"
-}
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-init_migration() {
-  DOCKER_COMPOSE_COMMAND=$(check_docker_compose)
-  check_yq
-  check_openssl
-
-  COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.yml}"
-
-  if [[ ! -f "$COMPOSE_FILE" ]]; then
-    echo "$COMPOSE_FILE not found in $(pwd)." > /dev/stderr
-    exit 1
-  fi
-  if [[ -f "$OVERRIDE_FILE" ]] || [[ -f "$ENTERPRISE_CONFIG_FILE" ]]; then
-    echo "Migration artifacts already exist in $(pwd):"
-    [[ -f "$OVERRIDE_FILE" ]] && echo "  $OVERRIDE_FILE"
-    [[ -f "$ENTERPRISE_CONFIG_FILE" ]] && echo "  $ENTERPRISE_CONFIG_FILE"
-    echo ""
-    echo "Either you've already migrated, or a previous run was interrupted."
-    echo "To re-run cleanly: rm -f $OVERRIDE_FILE $ENTERPRISE_CONFIG_FILE"
-    exit 1
-  fi
-
-  COMBINED_SERVICE=$(detect_combined_service)
-  DASHBOARD_SERVICE=$(detect_dashboard_service)
-  CONFIG_YAML_HOST=$(detect_config_yaml_host_path)
-  DATA_VOLUME=$(detect_data_volume)
-  COMPOSE_NETWORK=$(detect_compose_network)
-
-  if [[ -z "$COMBINED_SERVICE" ]]; then
-    echo "Could not find a service running netbirdio/netbird-server* in $COMPOSE_FILE." > /dev/stderr
-    echo "This script targets the community combined-server deployment." > /dev/stderr
-    exit 1
-  fi
-  if [[ -z "$DASHBOARD_SERVICE" ]]; then
-    echo "Could not find a service running netbirdio/dashboard* in $COMPOSE_FILE." > /dev/stderr
-    exit 1
-  fi
-  if [[ -z "$CONFIG_YAML_HOST" ]]; then
-    echo "Could not find a config.yaml mount on $COMBINED_SERVICE (expected to bind-mount to /etc/netbird/config.yaml)." > /dev/stderr
-    exit 1
-  fi
-  if [[ ! -f "$CONFIG_YAML_HOST" ]]; then
-    echo "config.yaml host file not found at $CONFIG_YAML_HOST." > /dev/stderr
-    exit 1
-  fi
-  if [[ -z "$DATA_VOLUME" ]]; then
-    echo "Could not find a volume mounted at /var/lib/netbird on $COMBINED_SERVICE." > /dev/stderr
-    exit 1
-  fi
-
-  echo "Detected existing deployment:"
-  echo "  Combined service: $COMBINED_SERVICE"
-  echo "  Dashboard:        $DASHBOARD_SERVICE"
-  echo "  config.yaml:      $CONFIG_YAML_HOST"
-  echo "  Data volume:      $DATA_VOLUME"
-  echo "  Network:          $COMPOSE_NETWORK"
-  echo ""
-
-  local proceed
-  proceed=$(read_yes_no "Proceed with migration?" "y")
-  if [[ "$proceed" != "yes" ]]; then
-    echo "Aborted."
-    exit 0
-  fi
-
-  # Step 1 — always (this is the point of the script)
-  MIGRATE_IMAGES="yes"
-  echo ""
-  echo "Step 1: Image swap (community → Enterprise). License key required."
-  NB_LICENSE_KEY=$(read_secret "  License key")
-  GHCR_USERNAME="netbirdExtAccess1"
-  GHCR_TOKEN=$(read_secret "  GHCR token (input hidden)")
-
-  # Step 2 — optional
-  echo ""
-  MIGRATE_POSTGRES=$(read_yes_no "Step 2: Migrate storage from SQLite to Postgres? (recommended)" "n")
-  if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
-    echo ""
-    echo "  ⚠  Data will be migrated from SQLite to Postgres. The SQLite store"
-    echo "     will be backed up automatically. To fully revert later, restore"
-    echo "     that backup and delete docker-compose.override.yml +"
-    echo "     config.yaml.enterprise."
-    local confirm
-    confirm=$(read_yes_no "  Continue?" "y")
-    if [[ "$confirm" != "yes" ]]; then
-      MIGRATE_POSTGRES="no"
-      echo "  Skipping Postgres migration."
-    else
-      POSTGRES_PASSWORD=$(rand_password)
-    fi
-  fi
-
-  # Step 3 — optional, only if Postgres is on (flow requires Postgres)
-  echo ""
-  if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
-    ENABLE_FLOW=$(read_yes_no "Step 3: Enable traffic flow? (requires Postgres)" "n")
-    if [[ "$ENABLE_FLOW" == "yes" ]]; then
-      # Auth secret MUST match server.authSecret from config.yaml
-      NB_FLOW_AUTH_SECRET=$(yq eval '.server.authSecret // ""' "$CONFIG_YAML_HOST")
-      if [[ -z "$NB_FLOW_AUTH_SECRET" ]] || [[ "$NB_FLOW_AUTH_SECRET" == "null" ]]; then
-        echo "Could not read server.authSecret from $CONFIG_YAML_HOST." > /dev/stderr
-        echo "Flow receiver auth must match the combined server's authSecret." > /dev/stderr
-        exit 1
-      fi
-
-      NETBIRD_DOMAIN=$(detect_exposed_address)
-      if [[ -z "$NETBIRD_DOMAIN" ]] || [[ "$NETBIRD_DOMAIN" == "null" ]]; then
-        NETBIRD_DOMAIN=$(read_required "  Public NetBird URL (e.g. https://netbird.example.com)")
-      fi
-      # Strip protocol + port to leave just the hostname for the Traefik Host() rule.
-      NETBIRD_HOSTNAME=$(echo "$NETBIRD_DOMAIN" | sed -E 's,^https?://,,' | sed 's,:.*,,' | sed 's,/.*,,')
-
-      # We need the encryption key from the existing config.yaml for the enricher
-      NETBIRD_ENCRYPTION_KEY=$(yq eval '.server.store.encryptionKey // ""' "$CONFIG_YAML_HOST")
-      if [[ -z "$NETBIRD_ENCRYPTION_KEY" ]] || [[ "$NETBIRD_ENCRYPTION_KEY" == "null" ]]; then
-        echo "Could not read server.store.encryptionKey from $CONFIG_YAML_HOST." > /dev/stderr
-        exit 1
-      fi
-    fi
-  else
-    ENABLE_FLOW="no"
-    echo "Step 3 (traffic flow) skipped — requires Postgres."
-  fi
-}
-
-apply_changes() {
-  echo ""
-  echo "Writing $OVERRIDE_FILE ..."
-  install -m 644 /dev/null "$OVERRIDE_FILE"
-  render_override > "$OVERRIDE_FILE"
-
-  if [[ -z "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
-    sed -i.bak '/NETBIRD_LICENSE_SERVER_BASE_URL/d' "$OVERRIDE_FILE" && rm -f "$OVERRIDE_FILE.bak"
-  fi
-
-  if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
-    echo "Writing $ENTERPRISE_CONFIG_FILE ..."
-    install -m 600 /dev/null "$ENTERPRISE_CONFIG_FILE"
-    render_enterprise_config
-  fi
-
-  # Persist secrets that the override file references via env interpolation.
-  # We write them to a .env file in the current directory; docker compose
-  # picks it up automatically.
-  echo "Writing .env additions (mode 600) ..."
-  local ENV_FILE=".env"
-  touch "$ENV_FILE"
-  chmod 600 "$ENV_FILE"
-  {
-    echo ""
-    echo "# Added by migrate-to-enterprise.sh on $(date -u +%Y-%m-%dT%H:%M:%SZ)"
-    echo "NB_LICENSE_KEY=${NB_LICENSE_KEY}"
-    if [[ -n "${NETBIRD_LICENSE_SERVER_BASE_URL:-}" ]]; then
-      echo "NETBIRD_LICENSE_SERVER_BASE_URL=${NETBIRD_LICENSE_SERVER_BASE_URL}"
-    fi
-    if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
-      echo "POSTGRES_PASSWORD=${POSTGRES_PASSWORD}"
-    fi
-    if [[ "$ENABLE_FLOW" == "yes" ]]; then
-      echo "NB_FLOW_AUTH_SECRET=${NB_FLOW_AUTH_SECRET}"
-      echo "NETBIRD_ENCRYPTION_KEY=${NETBIRD_ENCRYPTION_KEY}"
-    fi
-  } >> "$ENV_FILE"
-
-  echo ""
-  echo "Logging in to ghcr.io ..."
-  printf '%s' "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USERNAME" --password-stdin
-  unset GHCR_TOKEN
-
-  echo ""
-  echo "Pulling enterprise images ..."
-  $DOCKER_COMPOSE_COMMAND pull
-
-  if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
-    echo ""
-    echo "Stopping existing services (volumes preserved) ..."
-    $DOCKER_COMPOSE_COMMAND down
-
-    backup_sqlite
-
-    echo ""
-    echo "Starting Postgres ..."
-    $DOCKER_COMPOSE_COMMAND up -d postgres
-
-    # Wait for healthy
-    local counter=0
-    echo -n "Waiting for Postgres to become ready"
-    while ! $DOCKER_COMPOSE_COMMAND exec -T postgres pg_isready -U netbird -d netbird &> /dev/null; do
-      echo -n " ."
-      sleep 2
-      counter=$((counter + 1))
-      if [[ $counter -ge 60 ]]; then
-        echo ""
-        echo "Postgres did not become ready in 120s. Recent logs:"
-        $DOCKER_COMPOSE_COMMAND logs --tail=20 postgres
-        exit 1
-      fi
-    done
-    echo " done"
-
-    run_migrate_store
-  fi
-
-  echo ""
-  echo "Bringing up all services ..."
-  $DOCKER_COMPOSE_COMMAND up -d
-
-  echo ""
-  echo "Migration complete."
-}
-
-print_summary() {
-  echo ""
-  echo "──────────────────────────────────────────────────────────────────────"
-  echo " Summary"
-  echo "──────────────────────────────────────────────────────────────────────"
-  echo "  Images:           swapped to enterprise"
-  [[ "$MIGRATE_POSTGRES" == "yes" ]] && echo "  Storage:          Postgres (data migrated from SQLite)"
-  [[ "$MIGRATE_POSTGRES" != "yes" ]] && echo "  Storage:          SQLite (unchanged)"
-  [[ "$ENABLE_FLOW" == "yes" ]] && echo "  Traffic flow:     enabled"
-  [[ "$ENABLE_FLOW" != "yes" ]] && echo "  Traffic flow:     disabled"
-  echo ""
-  echo "  Generated files (next to your docker-compose.yml):"
-  echo "    $OVERRIDE_FILE"
-  [[ "$MIGRATE_POSTGRES" == "yes" ]] && echo "    $ENTERPRISE_CONFIG_FILE"
-  echo "    .env  (license key + secrets, mode 600)"
-  [[ "$MIGRATE_POSTGRES" == "yes" ]] && echo "    backups/sqlite-pre-enterprise-*/  (SQLite backup)"
-  echo ""
-  echo " Tail logs:"
-  echo "   $DOCKER_COMPOSE_COMMAND logs -f $COMBINED_SERVICE"
-  echo ""
-  echo "──────────────────────────────────────────────────────────────────────"
-  echo " To revert"
-  echo "──────────────────────────────────────────────────────────────────────"
-  echo "  $DOCKER_COMPOSE_COMMAND down"
-  if [[ "$MIGRATE_POSTGRES" == "yes" ]]; then
-    # Resolve project-prefixed volume names now (before override is removed).
-    local pg_volume data_volume_actual
-    pg_volume=$(resolve_data_volume "netbird_postgres")
-    data_volume_actual=$(resolve_data_volume "$DATA_VOLUME")
-    echo "  # Remove the Postgres volume FIRST, before deleting the override file:"
-    echo "  docker volume rm $pg_volume"
-    echo "  # Restore SQLite from the backup created during this run:"
-    echo "  docker run --rm -v ${data_volume_actual}:/var/lib/netbird -v ${BACKUP_DIR}:/backup busybox sh -c 'cp -a /backup/. /var/lib/netbird/'"
-  fi
-  echo "  rm -f $OVERRIDE_FILE $ENTERPRISE_CONFIG_FILE"
-  echo "  # Remove migrate-to-enterprise.sh additions from .env (search for the timestamp marker)"
-  echo "  $DOCKER_COMPOSE_COMMAND up -d"
-  echo "──────────────────────────────────────────────────────────────────────"
-}
-
-# ---------------------------------------------------------------------------
-# Run
-# ---------------------------------------------------------------------------
-
-init_migration
-apply_changes
-print_summary
--- a/management/internals/controllers/network_map/controller/controller.go
+++ b/management/internals/controllers/network_map/controller/controller.go
@@ -116,24 +116,6 @@ func (c *Controller) OnPeerDisconnected(ctx context.Context, accountID string, p
 	c.EphemeralPeersManager.OnPeerDisconnected(ctx, peer)
 }

-// injectAllProxyPolicies prepares an account for the per-peer network-map
-// computation. It prepends the in-memory agent-network services synthesised
-// from the account's current provider/policy state to account.Services so
-// the existing InjectProxyPolicies + injectPrivateServicePolicies walks pick
-// them up alongside persisted reverse-proxy services. Synthesised services
-// are never persisted; the account is loaded fresh per cycle so re-prepending
-// is safe and idempotent. Accounts without agent-network providers get an
-// empty synth slice — no behaviour change.
-func (c *Controller) injectAllProxyPolicies(ctx context.Context, account *types.Account) {
-	synth, err := c.repo.SynthesizeAgentNetworkServices(ctx, account.Id)
-	if err != nil {
-		log.WithContext(ctx).Warnf("synthesise agent-network services for account %s: %v", account.Id, err)
-	} else if len(synth) > 0 {
-		account.Services = append(synth, account.Services...)
-	}
-	account.InjectProxyPolicies(ctx)
-}
-
 func (c *Controller) CountStreams() int {
 	return c.peersUpdateManager.CountStreams()
 }
@@ -168,7 +150,7 @@ func (c *Controller) sendUpdateAccountPeers(ctx context.Context, accountID strin
 	var wg sync.WaitGroup
 	semaphore := make(chan struct{}, 10)

-	c.injectAllProxyPolicies(ctx, account)
+	account.InjectProxyPolicies(ctx)
 	dnsCache := &cache.DNSConfigCache{}
 	dnsDomain := c.GetDNSDomain(account.Settings)
 	peersCustomZone := account.GetPeersCustomZone(ctx, dnsDomain)
@@ -299,15 +281,7 @@ func (c *Controller) sendUpdateForAffectedPeers(ctx context.Context, accountID s
 	var wg sync.WaitGroup
 	semaphore := make(chan struct{}, 10)

-	// The affected-peer path MUST mirror sendUpdateAccountPeers (line 171)
-	// here: injectAllProxyPolicies prepends the synthesised agent-network
-	// services BEFORE InjectProxyPolicies + private-service policies run.
-	// Previously this path called only account.InjectProxyPolicies, which
-	// skipped the synth-services prepend — so peer-level changes
-	// (proxy restart, embedded peer connect/disconnect) propagated a
-	// network map that omitted the synth DNS zone, and the agent kept
-	// resolving against the stale or absent record.
-	c.injectAllProxyPolicies(ctx, account)
+	account.InjectProxyPolicies(ctx)
 	dnsCache := &cache.DNSConfigCache{}
 	dnsDomain := c.GetDNSDomain(account.Settings)
 	peersCustomZone := account.GetPeersCustomZone(ctx, dnsDomain)
@@ -425,7 +399,7 @@ func (c *Controller) UpdateAccountPeer(ctx context.Context, accountId string, pe
 		return fmt.Errorf("failed to get validated peers: %v", err)
 	}

-	c.injectAllProxyPolicies(ctx, account)
+	account.InjectProxyPolicies(ctx)
 	dnsCache := &cache.DNSConfigCache{}
 	dnsDomain := c.GetDNSDomain(account.Settings)
 	peersCustomZone := account.GetPeersCustomZone(ctx, dnsDomain)
@@ -523,7 +497,7 @@ func (c *Controller) BufferUpdateAffectedPeers(ctx context.Context, accountID st
 		c.accountManagerMetrics.CountUpdateAccountPeersTriggered(string(reason.Resource), string(reason.Operation))
 	}

-	log.WithContext(ctx).Tracef("buffer updating %d affected peers for account %s from %s with reason %s/%s", len(peerIDs), accountID, util.GetCallerName(), reason.Operation, reason.Resource)
+	log.WithContext(ctx).Tracef("buffer updating %d affected peers for account %s from %s", len(peerIDs), accountID, util.GetCallerName())

 	bufUpd, _ := c.affectedPeerUpdateLocks.LoadOrStore(accountID, &bufferAffectedUpdate{
 		peerIDs: make(map[string]struct{}),
@@ -629,17 +603,19 @@ func (c *Controller) GetValidatedPeerWithMap(ctx context.Context, isRequiresAppr
 		return nil, nil, 0, err
 	}

-	c.injectAllProxyPolicies(ctx, account)
+	account.InjectProxyPolicies(ctx)

 	approvedPeersMap, err := c.integratedPeerValidator.GetValidatedPeers(ctx, account.Id, maps.Values(account.Groups), maps.Values(account.Peers), account.Settings.Extra)
 	if err != nil {
 		return nil, nil, 0, err
 	}

+	startPosture := time.Now()
 	postureChecks, err := c.getPeerPostureChecks(account, peerID)
 	if err != nil {
 		return nil, nil, 0, err
 	}
+	log.WithContext(ctx).Debugf("getPeerPostureChecks took %s", time.Since(startPosture))

 	accountZones, err := c.repo.GetAccountZones(ctx, account.Id)
 	if err != nil {
@@ -900,7 +876,7 @@ func (c *Controller) GetNetworkMap(ctx context.Context, peerID string) (*types.N
 		return nil, err
 	}

-	c.injectAllProxyPolicies(ctx, account)
+	account.InjectProxyPolicies(ctx)
 	resourcePolicies := account.GetResourcePoliciesMap()
 	routers := account.GetResourceRoutersMap()
 	groupIDToUserIDs := account.GetActiveGroupUsers()
--- a/management/internals/controllers/network_map/controller/repository.go
+++ b/management/internals/controllers/network_map/controller/repository.go
@@ -3,9 +3,7 @@ package controller
 import (
 	"context"

-	"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/service"
 	"github.com/netbirdio/netbird/management/internals/modules/zones"
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork"
 	"github.com/netbirdio/netbird/management/server/peer"
 	"github.com/netbirdio/netbird/management/server/store"
 	"github.com/netbirdio/netbird/management/server/types"
@@ -18,10 +16,6 @@ type Repository interface {
 	GetPeersByIDs(ctx context.Context, accountID string, peerIDs []string) (map[string]*peer.Peer, error)
 	GetPeerByID(ctx context.Context, accountID string, peerID string) (*peer.Peer, error)
 	GetAccountZones(ctx context.Context, accountID string) ([]*zones.Zone, error)
-	// SynthesizeAgentNetworkServices returns the in-memory reverse-proxy
-	// services synthesised from the account's agent-network provider/policy
-	// state. Empty for accounts without agent-network providers.
-	SynthesizeAgentNetworkServices(ctx context.Context, accountID string) ([]*service.Service, error)
 }

 type repository struct {
@@ -56,10 +50,6 @@ func (r *repository) GetPeerByID(ctx context.Context, accountID string, peerID s
 	return r.store.GetPeerByID(ctx, store.LockingStrengthNone, accountID, peerID)
 }

-func (r *repository) SynthesizeAgentNetworkServices(ctx context.Context, accountID string) ([]*service.Service, error) {
-	return agentnetwork.SynthesizeServices(ctx, r.store, accountID)
-}
-
 func (r *repository) GetAccountZones(ctx context.Context, accountID string) ([]*zones.Zone, error) {
 	return r.store.GetAccountZones(ctx, store.LockingStrengthNone, accountID)
 }
--- a/management/internals/modules/agentnetwork/accesslog_ingest.go
+++ b/management/internals/modules/agentnetwork/accesslog_ingest.go
@@ -1,215 +0,0 @@
-package agentnetwork
-
-import (
-	"context"
-	"math"
-	"strconv"
-	"strings"
-
-	log "github.com/sirupsen/logrus"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/accesslogs"
-	"github.com/netbirdio/netbird/management/server/store"
-)
-
-// Metadata keys the proxy stamps on agent-network access-log entries. These
-// mirror the constants in proxy/internal/middleware/keys.go and form the wire
-// contract between the proxy and management; management flattens them into
-// queryable columns. Keep in sync with the proxy side.
-const (
-	metaKeyProvider           = "llm.provider"
-	metaKeyModel              = "llm.model"
-	metaKeyResolvedProviderID = "llm.resolved_provider_id"
-	metaKeySelectedPolicyID   = "llm.selected_policy_id"
-	metaKeyPolicyDecision     = "llm_policy.decision"
-	metaKeyPolicyReason       = "llm_policy.reason"
-	metaKeyInputTokens        = "llm.input_tokens"  //nolint:gosec // metadata key name, not a credential
-	metaKeyOutputTokens       = "llm.output_tokens" //nolint:gosec // metadata key name, not a credential
-	metaKeyTotalTokens        = "llm.total_tokens"  //nolint:gosec // metadata key name, not a credential
-	metaKeyCostUSDTotal       = "cost.usd_total"
-	metaKeyStream             = "llm.stream"
-	metaKeySessionID          = "llm.session_id"
-	metaKeyAuthorisingGroups  = "llm.authorising_groups"
-	metaKeyRequestPrompt      = "llm.request_prompt"
-	metaKeyResponseCompletion = "llm.response_completion"
-)
-
-// IngestAccessLog flattens the metadata-bearing reverse-proxy access-log entry
-// and persists it in the dedicated agent-network tables (instead of the shared
-// reverse-proxy table), in two parts:
-//
-//   - The stripped usage record is written unconditionally — usage/cost is
-//     collected on every request regardless of the account's log-collection
-//     toggle (the proxy ships a usage-only entry when logging is disabled).
-//   - The full access-log row (with request detail + prompt) is written only
-//     when the account's EnableLogCollection setting is on. This setting read
-//     is the authoritative gate; the proxy-side strip is defense in depth.
-func IngestAccessLog(ctx context.Context, s store.Store, logEntry *accesslogs.AccessLogEntry) error {
-	entry, groups := flattenAccessLog(logEntry)
-
-	usage, usageGroups := usageFromFlattenedLog(entry, groups)
-	if err := s.CreateAgentNetworkUsage(ctx, usage, usageGroups); err != nil {
-		log.WithContext(ctx).WithFields(log.Fields{
-			"account_id": entry.AccountID,
-			"model":      entry.Model,
-		}).Errorf("failed to save agent-network usage: %v", err)
-		return err
-	}
-
-	settings, err := s.GetAgentNetworkSettings(ctx, store.LockingStrengthNone, entry.AccountID)
-	if err != nil {
-		// No settings row (or a transient read error) means we can't confirm
-		// log collection is enabled — usage is already saved, so skip the full
-		// row rather than fail the whole ingest.
-		log.WithContext(ctx).Debugf("skipping full agent-network access-log row for account %s: %v", entry.AccountID, err)
-		return nil
-	}
-	if !settings.EnableLogCollection {
-		return nil
-	}
-
-	if err := s.CreateAgentNetworkAccessLog(ctx, entry, groups); err != nil {
-		log.WithContext(ctx).WithFields(log.Fields{
-			"account_id": entry.AccountID,
-			"service_id": entry.ServiceID,
-			"model":      entry.Model,
-			"status":     entry.StatusCode,
-		}).Errorf("failed to save agent-network access log: %v", err)
-		return err
-	}
-	return nil
-}
-
-// flattenAccessLog converts a reverse-proxy AccessLogEntry (whose LLM
-// dimensions live in the opaque Metadata map) into the flattened
-// agent-network row + authorising-group child rows.
-func flattenAccessLog(e *accesslogs.AccessLogEntry) (*types.AgentNetworkAccessLog, []types.AgentNetworkAccessLogGroup) {
-	meta := e.Metadata
-
-	var sourceIP string
-	if e.GeoLocation.ConnectionIP != nil {
-		sourceIP = e.GeoLocation.ConnectionIP.String()
-	}
-
-	entry := &types.AgentNetworkAccessLog{
-		ID:            e.ID,
-		AccountID:     e.AccountID,
-		ServiceID:     e.ServiceID,
-		Timestamp:     e.Timestamp,
-		UserID:        e.UserId,
-		SourceIP:      sourceIP,
-		Method:        e.Method,
-		Host:          e.Host,
-		Path:          e.Path,
-		Duration:      e.Duration,
-		StatusCode:    e.StatusCode,
-		AuthMethod:    e.AuthMethodUsed,
-		BytesUpload:   e.BytesUpload,
-		BytesDownload: e.BytesDownload,
-
-		Provider:           meta[metaKeyProvider],
-		Model:              meta[metaKeyModel],
-		SessionID:          meta[metaKeySessionID],
-		ResolvedProviderID: meta[metaKeyResolvedProviderID],
-		SelectedPolicyID:   meta[metaKeySelectedPolicyID],
-		Decision:           meta[metaKeyPolicyDecision],
-		DenyReason:         meta[metaKeyPolicyReason],
-		InputTokens:        parseMetaInt(meta, metaKeyInputTokens),
-		OutputTokens:       parseMetaInt(meta, metaKeyOutputTokens),
-		TotalTokens:        parseMetaInt(meta, metaKeyTotalTokens),
-		CostUSD:            parseMetaFloat(meta, metaKeyCostUSDTotal),
-		Stream:             parseMetaBool(meta, metaKeyStream),
-		RequestPrompt:      meta[metaKeyRequestPrompt],
-		ResponseCompletion: meta[metaKeyResponseCompletion],
-	}
-
-	var groups []types.AgentNetworkAccessLogGroup
-	for _, gid := range parseGroupCSV(meta[metaKeyAuthorisingGroups]) {
-		groups = append(groups, types.AgentNetworkAccessLogGroup{
-			LogID:     entry.ID,
-			GroupID:   gid,
-			AccountID: entry.AccountID,
-		})
-	}
-	return entry, groups
-}
-
-// usageFromFlattenedLog derives the stripped usage record (and its group child
-// rows) from an already-flattened access-log entry. The usage row shares the
-// log's ID so the two correlate.
-func usageFromFlattenedLog(e *types.AgentNetworkAccessLog, groups []types.AgentNetworkAccessLogGroup) (*types.AgentNetworkUsage, []types.AgentNetworkUsageGroup) {
-	usage := &types.AgentNetworkUsage{
-		ID:                 e.ID,
-		AccountID:          e.AccountID,
-		Timestamp:          e.Timestamp,
-		UserID:             e.UserID,
-		ResolvedProviderID: e.ResolvedProviderID,
-		Provider:           e.Provider,
-		Model:              e.Model,
-		SessionID:          e.SessionID,
-		InputTokens:        e.InputTokens,
-		OutputTokens:       e.OutputTokens,
-		TotalTokens:        e.TotalTokens,
-		CostUSD:            e.CostUSD,
-	}
-
-	usageGroups := make([]types.AgentNetworkUsageGroup, 0, len(groups))
-	for _, g := range groups {
-		usageGroups = append(usageGroups, types.AgentNetworkUsageGroup{
-			UsageID:   usage.ID,
-			GroupID:   g.GroupID,
-			AccountID: g.AccountID,
-		})
-	}
-	return usage, usageGroups
-}
-
-// parseMetaInt parses a non-negative token count. Negative or unparseable
-// values are clamped to 0 so a malformed metric can't persist a negative
-// counter.
-func parseMetaInt(meta map[string]string, key string) int64 {
-	if v, err := strconv.ParseInt(strings.TrimSpace(meta[key]), 10, 64); err == nil && v >= 0 {
-		return v
-	}
-	return 0
-}
-
-// parseMetaFloat parses a non-negative, finite cost. Negative, NaN, Inf, or
-// unparseable values are clamped to 0 so a malformed metric can't poison the
-// stored cost.
-func parseMetaFloat(meta map[string]string, key string) float64 {
-	if v, err := strconv.ParseFloat(strings.TrimSpace(meta[key]), 64); err == nil && v >= 0 && !math.IsInf(v, 0) {
-		return v
-	}
-	return 0
-}
-
-func parseMetaBool(meta map[string]string, key string) bool {
-	v, _ := strconv.ParseBool(strings.TrimSpace(meta[key]))
-	return v
-}
-
-// parseGroupCSV splits the comma-separated authorising-group id list the proxy
-// emits, trimming blanks and de-duplicating. Dedup matters because the group
-// rows are keyed by (log_id, group_id) / (usage_id, group_id): a repeated id
-// in the CSV would otherwise produce a duplicate primary key and fail the
-// insert transaction.
-func parseGroupCSV(raw string) []string {
-	if raw == "" {
-		return nil
-	}
-	parts := strings.Split(raw, ",")
-	out := make([]string, 0, len(parts))
-	seen := make(map[string]struct{}, len(parts))
-	for _, p := range parts {
-		if p = strings.TrimSpace(p); p != "" {
-			if _, dup := seen[p]; dup {
-				continue
-			}
-			seen[p] = struct{}{}
-			out = append(out, p)
-		}
-	}
-	return out
-}
--- a/management/internals/modules/agentnetwork/accesslog_ingest_realstore_test.go
+++ b/management/internals/modules/agentnetwork/accesslog_ingest_realstore_test.go
@@ -1,124 +0,0 @@
-package agentnetwork
-
-import (
-	"context"
-	"testing"
-	"time"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	"github.com/netbirdio/netbird/management/internals/modules/reverseproxy/accesslogs"
-	"github.com/netbirdio/netbird/management/server/store"
-)
-
-// newIngestTestEntry builds an agent-network reverse-proxy access-log entry whose
-// LLM dimensions live in the opaque Metadata map, as the proxy ships it.
-func newIngestTestEntry() *accesslogs.AccessLogEntry {
-	return &accesslogs.AccessLogEntry{
-		ID:           "log-1",
-		AccountID:    testAccountID,
-		ServiceID:    "svc-1",
-		Timestamp:    time.Now().UTC(),
-		Method:       "POST",
-		Host:         testEndpoint,
-		Path:         "/v1/chat/completions",
-		StatusCode:   200,
-		UserId:       "user-1",
-		AgentNetwork: true,
-		Metadata: map[string]string{
-			metaKeyProvider:           "openai",
-			metaKeyModel:              "gpt-5.4",
-			metaKeyResolvedProviderID: "prov-1",
-			metaKeySessionID:          "sess-1",
-			metaKeyInputTokens:        "100",
-			metaKeyOutputTokens:       "50",
-			metaKeyTotalTokens:        "150",
-			metaKeyCostUSDTotal:       "0.0123",
-			metaKeyStream:             "true",
-			metaKeyRequestPrompt:      "hello",
-			metaKeyResponseCompletion: "world",
-			// repeated id must be de-duplicated before the group rows insert.
-			metaKeyAuthorisingGroups: "grp-eng,grp-eng,grp-ops",
-		},
-	}
-}
-
-// TestIngestAccessLog_RealStore_LogCollectionOff persists the usage ledger
-// unconditionally but skips the full access-log row when the account hasn't
-// opted into log collection.
-func TestIngestAccessLog_RealStore_LogCollectionOff(t *testing.T) {
-	ctx := context.Background()
-	s, cleanup, err := store.NewTestStoreFromSQL(ctx, "", t.TempDir())
-	require.NoError(t, err, "real sqlite test store must come up")
-	defer cleanup()
-
-	settings := newSynthTestSettings()
-	settings.EnableLogCollection = false
-	require.NoError(t, s.SaveAgentNetworkSettings(ctx, settings))
-
-	require.NoError(t, IngestAccessLog(ctx, s, newIngestTestEntry()))
-
-	usage, err := s.GetAgentNetworkUsageRows(ctx, store.LockingStrengthNone, testAccountID, types.AgentNetworkAccessLogFilter{})
-	require.NoError(t, err)
-	require.Len(t, usage, 1, "usage row must be written even with log collection off")
-	assert.Equal(t, int64(100), usage[0].InputTokens, "input tokens must round-trip from metadata")
-	assert.Equal(t, int64(50), usage[0].OutputTokens, "output tokens must round-trip from metadata")
-	assert.InDelta(t, 0.0123, usage[0].CostUSD, 1e-9, "cost must round-trip from metadata")
-
-	logs, _, err := s.GetAgentNetworkAccessLogs(ctx, store.LockingStrengthNone, testAccountID, types.AgentNetworkAccessLogFilter{})
-	require.NoError(t, err)
-	assert.Empty(t, logs, "full access-log row must be skipped while log collection is off")
-}
-
-// TestIngestAccessLog_RealStore_LogCollectionOn writes both the usage ledger and
-// the full access-log row once the account opts in, carrying the request detail
-// and prompt through.
-func TestIngestAccessLog_RealStore_LogCollectionOn(t *testing.T) {
-	ctx := context.Background()
-	s, cleanup, err := store.NewTestStoreFromSQL(ctx, "", t.TempDir())
-	require.NoError(t, err, "real sqlite test store must come up")
-	defer cleanup()
-
-	settings := newSynthTestSettings()
-	settings.EnableLogCollection = true
-	require.NoError(t, s.SaveAgentNetworkSettings(ctx, settings))
-
-	require.NoError(t, IngestAccessLog(ctx, s, newIngestTestEntry()))
-
-	usage, err := s.GetAgentNetworkUsageRows(ctx, store.LockingStrengthNone, testAccountID, types.AgentNetworkAccessLogFilter{})
-	require.NoError(t, err)
-	require.Len(t, usage, 1, "usage row must be written when log collection is on")
-
-	logs, total, err := s.GetAgentNetworkAccessLogs(ctx, store.LockingStrengthNone, testAccountID, types.AgentNetworkAccessLogFilter{})
-	require.NoError(t, err)
-	require.Equal(t, int64(1), total, "exactly one access-log row expected")
-	require.Len(t, logs, 1, "full access-log row must be written when log collection is on")
-	assert.Equal(t, "gpt-5.4", logs[0].Model, "model must flatten from metadata")
-	assert.Equal(t, "hello", logs[0].RequestPrompt, "prompt must be retained when log collection is on")
-	assert.Equal(t, "world", logs[0].ResponseCompletion, "completion must be retained when log collection is on")
-	assert.True(t, logs[0].Stream, "stream flag must flatten from metadata")
-}
-
-func TestParseGroupCSV_DedupAndTrim(t *testing.T) {
-	assert.Nil(t, parseGroupCSV(""), "empty CSV yields no groups")
-	assert.Equal(t, []string{"a", "b"}, parseGroupCSV(" a , b , a ,"),
-		"group CSV must trim, drop blanks, and de-duplicate preserving first-seen order")
-}
-
-func TestParseMetaInt_ClampsNegativeAndJunk(t *testing.T) {
-	meta := map[string]string{"ok": " 42 ", "neg": "-5", "junk": "abc"}
-	assert.Equal(t, int64(42), parseMetaInt(meta, "ok"), "valid count parses with surrounding space trimmed")
-	assert.Equal(t, int64(0), parseMetaInt(meta, "neg"), "negative count clamps to 0")
-	assert.Equal(t, int64(0), parseMetaInt(meta, "junk"), "unparseable count clamps to 0")
-	assert.Equal(t, int64(0), parseMetaInt(meta, "missing"), "missing key clamps to 0")
-}
-
-func TestParseMetaFloat_ClampsNegativeInfAndJunk(t *testing.T) {
-	meta := map[string]string{"ok": "1.5", "neg": "-1", "inf": "Inf", "junk": "x"}
-	assert.InDelta(t, 1.5, parseMetaFloat(meta, "ok"), 1e-9, "valid cost parses")
-	assert.Equal(t, float64(0), parseMetaFloat(meta, "neg"), "negative cost clamps to 0")
-	assert.Equal(t, float64(0), parseMetaFloat(meta, "inf"), "non-finite cost clamps to 0")
-	assert.Equal(t, float64(0), parseMetaFloat(meta, "junk"), "unparseable cost clamps to 0")
-}
--- a/management/internals/modules/agentnetwork/affectedpeers_hook.go
+++ b/management/internals/modules/agentnetwork/affectedpeers_hook.go
@@ -1,15 +0,0 @@
-package agentnetwork
-
-import "github.com/netbirdio/netbird/management/server/affectedpeers"
-
-// init registers the agent-network service synthesiser with the affectedpeers
-// resolver. Agent-network reverse-proxy services are synthesised on demand and
-// never persisted, so the resolver can't load them from the store; without them
-// it can't fold the embedded proxy peer into the affected set on a client
-// group/peer change, and the proxy never learns a newly authorised client until
-// it reconnects. Registered here (rather than via a direct
-// affectedpeers→agentnetwork import) to avoid an import cycle
-// (agentnetwork → account → affectedpeers).
-func init() {
-	affectedpeers.SetAgentNetworkSynthesizer(SynthesizeServices)
-}
--- a/management/internals/modules/agentnetwork/catalog/catalog.go
+++ b/management/internals/modules/agentnetwork/catalog/catalog.go
@@ -1,749 +0,0 @@
-// Package catalog defines the static set of Agent Network providers
-// recognized by the management server. The catalog is consulted both to
-// validate provider_id on create/update and to surface the available
-// providers (and their models) to the dashboard.
-package catalog
-
-import "github.com/netbirdio/netbird/shared/management/http/api"
-
-// Model is the in-memory representation of a catalog model.
-type Model struct {
-	ID            string
-	Label         string
-	InputPer1k    float64
-	OutputPer1k   float64
-	ContextWindow int
-}
-
-// ProviderKind groups catalog entries for UI presentation. The split
-// is semantic, not technical:
-//   - KindProvider: the upstream is a vendor's first-party API (OpenAI,
-//     Anthropic, Mistral, Bedrock, etc.) — NetBird talks straight to
-//     the model provider.
-//   - KindGateway: the upstream is itself a routing / aggregation layer
-//     in front of multiple providers (LiteLLM, Portkey, Helicone, …).
-//     These typically need NetBird identity stamped onto upstream
-//     requests so the gateway's analytics and budgets attribute to the
-//     real caller; that's what IdentityInjection is for.
-//   - KindCustom: the catch-all "OpenAI-compatible self-hosted endpoint"
-//     entry (vLLM, Ollama, custom inference servers).
-//
-// Frontend uses Kind to group the provider Select in the modal so an
-// operator can spot at a glance which catalog entries proxy other
-// providers vs. talk straight to one. Backend doesn't dispatch on Kind
-// today; it's purely a presentation hint.
-type ProviderKind string
-
-const (
-	KindProvider ProviderKind = "provider"
-	KindGateway  ProviderKind = "gateway"
-	KindCustom   ProviderKind = "custom"
-)
-
-// Provider is the in-memory representation of a catalog provider.
-type Provider struct {
-	ID          string
-	Name        string
-	Description string
-	DefaultHost string
-	// Kind groups this entry for UI presentation; see ProviderKind.
-	Kind ProviderKind
-	// AuthHeaderName is the HTTP header the provider's API expects
-	// the credential under (e.g. "Authorization" for OpenAI,
-	// "x-api-key" for Anthropic). Combined with AuthHeaderTemplate
-	// at synthesis time to inject the auth header on every upstream
-	// request.
-	AuthHeaderName     string
-	AuthHeaderTemplate string
-	DefaultContentType string
-	BrandColor         string
-	// ParserID names the proxy LLM parser surface this provider
-	// speaks (matches llm.Parser.ProviderName: "openai",
-	// "anthropic"). Multiple catalog ids may share a parser surface
-	// (e.g. azure_openai_api and mistral_api both speak the OpenAI
-	// shape). Empty when no parser is yet implemented for the
-	// surface — the proxy middleware then falls back to URL sniffing
-	// or skips request-side enrichment.
-	ParserID string
-	// IdentityInjection, when non-nil, instructs the proxy to stamp
-	// the caller's NetBird identity onto upstream requests under the
-	// configured header names. Used for gateways like LiteLLM that
-	// key budgets and attribution off request headers (the gateway
-	// otherwise has no way to learn which user / group made the call).
-	// The proxy strips the same header names from the inbound request
-	// before stamping ours, so an app can't spoof identity by setting
-	// these headers itself.
-	IdentityInjection *IdentityInjection
-	// ExtraHeaders is a catalog-declared list of additional per-
-	// provider routing/config headers the proxy stamps on every
-	// upstream request. Distinct from AuthHeaderName/Template (which
-	// always carries the API_KEY) and from IdentityInjection (caller
-	// identity). Each entry surfaces an optional input on the
-	// dashboard's provider modal whose value lives on the provider
-	// record's ExtraValues map (keyed by ExtraHeader.Name). Empty
-	// list = no extra inputs rendered. Used today by Portkey for
-	// "x-portkey-config: pc-..." (a saved-config id that resolves
-	// upstream provider + credentials on Portkey's hosted side).
-	ExtraHeaders []ExtraHeader
-	Models       []Model
-}
-
-// ExtraHeader names a single optional per-provider routing/config
-// header. Catalog declares N of these per provider type; the operator
-// fills any subset on the provider record (see Provider.ExtraValues).
-// At synth time, only entries with a non-empty operator value are
-// stamped; the proxy's identity-inject middleware applies anti-spoof
-// (Remove + Add) so a client can't supply these headers themselves.
-//
-// UI copy (label / help text / tooltip) for each known Name lives on
-// the dashboard, not here — the backend's job is just to declare
-// which wire headers are accepted. New provider needs an extra
-// header? Add the Name here AND the matching UI copy on the dashboard.
-type ExtraHeader struct {
-	// Name is the wire header name, e.g. "x-portkey-config".
-	Name string
-}
-
-// IdentityInjection describes how the proxy stamps NetBird identity onto
-// upstream gateway requests. Exactly one shape must be set — they're
-// mutually exclusive and dispatched by the inject middleware.
-//
-// Shape choice tracks the wire convention the upstream gateway uses,
-// not the vendor name. New gateways with a known shape become a catalog
-// entry, not a new code path.
-type IdentityInjection struct {
-	// HeaderPair emits separate headers per identity dimension
-	// (end-user id, tags as CSV). LiteLLM and OpenAI-compatible
-	// self-hosted gateways that read identity from dedicated headers.
-	HeaderPair *HeaderPairInjection
-	// JSONMetadata emits a single header carrying a JSON object with
-	// reserved keys for user / groups / etc. Portkey, Helicone-style
-	// metadata headers, anything that wants a structured envelope.
-	JSONMetadata *JSONMetadataInjection
-}
-
-// HeaderPairInjection is the LiteLLM-style wire convention.
-type HeaderPairInjection struct {
-	// Customizable, when true, marks the wire header names as
-	// operator-overridable: the dashboard surfaces EndUserIDHeader
-	// and TagsHeader as editable inputs (defaults shown as
-	// placeholders) and the synthesizer pulls the actual values from
-	// the provider record's IdentityHeader* fields rather than from
-	// these defaults. An empty operator value disables stamping for
-	// that dimension. Used today for Bifrost, whose log-metadata /
-	// telemetry header prefix (x-bf-lh-* vs x-bf-dim-*) is a
-	// per-operator choice; LiteLLM and similar gateways with a fixed
-	// wire protocol leave this false so the catalog defaults are
-	// authoritative.
-	Customizable bool
-	// EndUserIDHeader receives the caller's display identity (user
-	// email when the peer is attached to a user, else peer.Name),
-	// e.g. "x-litellm-end-user-id".
-	EndUserIDHeader string
-	// TagsHeader receives the caller's NetBird group display names
-	// as a CSV, e.g. "x-litellm-tags".
-	TagsHeader string
-	// TagsInBody, when true, additionally writes the tag list into
-	// the request body's metadata.tags array (a JSON path the
-	// gateway parses for budget enforcement). LiteLLM only honours
-	// metadata.tags for tag-budget gating — its x-litellm-tags
-	// header path feeds spend tracking but bypasses
-	// _tag_max_budget_check entirely. Body inject is skipped when
-	// the request body is empty, truncated, non-JSON, or when an
-	// existing metadata field is a non-object value (defensive: we
-	// never clobber a client-supplied non-object). The header path
-	// remains a robust fallback for spend tracking in those cases.
-	TagsInBody bool
-	// EndUserIDInBody, when true, additionally writes the display
-	// identity into the request body's top-level "user" field (the
-	// OpenAI-standard end-user identifier). LiteLLM resolves the end
-	// user id from headers first then body, so for LiteLLM this is
-	// belt-and-suspenders. It matters when an OpenAI-compatible
-	// gateway downstream of LiteLLM (or OpenAI direct, bypassing
-	// LiteLLM) only reads the body, and as anti-spoof: client-
-	// supplied "user" values are overwritten with our trusted
-	// identity. Same skip rules as TagsInBody.
-	EndUserIDInBody bool
-}
-
-// JSONMetadataInjection is the Portkey-style wire convention: a single
-// header carrying a JSON object. NetBird identity fields land under the
-// configured reserved keys; missing keys (empty string) are skipped at
-// emit time.
-type JSONMetadataInjection struct {
-	// Customizable, when true, marks the JSON keys as operator-
-	// overridable. The dashboard surfaces UserKey and GroupsKey as
-	// editable inputs (the catalog values shown as placeholders) and
-	// the synthesizer pulls the actual JSON-key names from the
-	// provider record's IdentityHeader* fields. Same field reuse as
-	// HeaderPair's customizable path — the dimensions (user identity,
-	// groups) are the same, only the wire encoding differs (JSON key
-	// vs HTTP header name). An empty operator value disables emission
-	// for that dimension. Used today for Cloudflare AI Gateway, whose
-	// cf-aig-metadata header accepts arbitrary JSON keys; Portkey
-	// leaves this false because its keys are reserved by the Portkey
-	// schema.
-	Customizable bool
-	// Header is the wire header name carrying the JSON payload, e.g.
-	// "x-portkey-metadata".
-	Header string
-	// UserKey is the JSON key for the caller's display identity.
-	// Portkey reserves "_user" for this dimension.
-	UserKey string
-	// GroupsKey is the JSON key for the caller's NetBird groups,
-	// emitted as a CSV string value (Portkey requires string values).
-	GroupsKey string
-	// MaxValueLength caps each emitted JSON value, in bytes. Portkey
-	// enforces a 128-char limit per value; oversized values are
-	// truncated rather than failing the request. 0 disables the cap.
-	MaxValueLength int
-}
-
-// providers is the canonical list of supported Agent Network providers.
-// Update this list together with the dashboard's PROVIDER_CATALOG.
-var providers = []Provider{
-	{
-		ID:                 "openai_api",
-		Kind:               KindProvider,
-		Name:               "OpenAI API",
-		Description:        "GPT, Responses API, and Embeddings",
-		DefaultHost:        "api.openai.com",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#10A37F",
-		ParserID:           "openai",
-		// Pricing + context windows cross-checked against LiteLLM's
-		// model_prices_and_context_window.json. Notable corrections from
-		// earlier values: o4-mini repriced from $4/$16 to $1.10/$4.40
-		// per MTok, gpt-4o from $5/$15 to $2.50/$10, and the GPT-5
-		// family context windows split between 1.05M for full-size
-		// models and 272K for mini/nano/codex variants.
-		Models: []Model{
-			{ID: "gpt-5.5", Label: "GPT-5.5", InputPer1k: 0.005, OutputPer1k: 0.030, ContextWindow: 1050000},
-			{ID: "gpt-5.5-pro", Label: "GPT-5.5 Pro", InputPer1k: 0.030, OutputPer1k: 0.180, ContextWindow: 1050000},
-			{ID: "gpt-5.4", Label: "GPT-5.4", InputPer1k: 0.0025, OutputPer1k: 0.015, ContextWindow: 1050000},
-			{ID: "gpt-5.4-pro", Label: "GPT-5.4 Pro", InputPer1k: 0.030, OutputPer1k: 0.180, ContextWindow: 1050000},
-			{ID: "gpt-5.4-mini", Label: "GPT-5.4 Mini", InputPer1k: 0.00075, OutputPer1k: 0.0045, ContextWindow: 272000},
-			{ID: "gpt-5.4-nano", Label: "GPT-5.4 Nano", InputPer1k: 0.0002, OutputPer1k: 0.00125, ContextWindow: 272000},
-			{ID: "gpt-5.3-codex", Label: "GPT-5.3 Codex", InputPer1k: 0.00175, OutputPer1k: 0.014, ContextWindow: 272000},
-			{ID: "gpt-5.3-chat-latest", Label: "GPT-5.3 Chat", InputPer1k: 0.00175, OutputPer1k: 0.014, ContextWindow: 128000},
-			{ID: "o4-mini", Label: "o4-mini", InputPer1k: 0.0011, OutputPer1k: 0.0044, ContextWindow: 200000},
-			{ID: "gpt-4.1", Label: "GPT-4.1", InputPer1k: 0.002, OutputPer1k: 0.008, ContextWindow: 1047576},
-			{ID: "gpt-4.1-mini", Label: "GPT-4.1 mini", InputPer1k: 0.0004, OutputPer1k: 0.0016, ContextWindow: 1047576},
-			{ID: "gpt-4.1-nano", Label: "GPT-4.1 nano", InputPer1k: 0.0001, OutputPer1k: 0.0004, ContextWindow: 1047576},
-			{ID: "gpt-4o", Label: "GPT-4o", InputPer1k: 0.0025, OutputPer1k: 0.010, ContextWindow: 128000},
-			{ID: "gpt-4o-mini", Label: "GPT-4o mini", InputPer1k: 0.00015, OutputPer1k: 0.0006, ContextWindow: 128000},
-			{ID: "gpt-4-turbo", Label: "GPT-4 Turbo", InputPer1k: 0.01, OutputPer1k: 0.03, ContextWindow: 128000},
-			{ID: "gpt-3.5-turbo", Label: "GPT-3.5 Turbo", InputPer1k: 0.0005, OutputPer1k: 0.0015, ContextWindow: 16385},
-			{ID: "text-embedding-3-large", Label: "text-embedding-3-large", InputPer1k: 0.00013, OutputPer1k: 0, ContextWindow: 8191},
-			{ID: "text-embedding-3-small", Label: "text-embedding-3-small", InputPer1k: 0.00002, OutputPer1k: 0, ContextWindow: 8191},
-		},
-	},
-	{
-		ID:                 "anthropic_api",
-		Kind:               KindProvider,
-		Name:               "Anthropic API",
-		Description:        "Claude Messages API",
-		DefaultHost:        "api.anthropic.com",
-		AuthHeaderName:     "x-api-key",
-		AuthHeaderTemplate: "${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#D97757",
-		ParserID:           "anthropic",
-		// Per Anthropic's current model lineup. Pricing in USD per 1k
-		// tokens. Context windows: 4.6+ family is 1M; Haiku 4.5 stays at
-		// 200K. claude-3-7-sonnet and claude-3-5-haiku retired
-		// 2026-02-19 — dropped from the catalog. claude-opus-4-1
-		// deprecated, retires 2026-08-05 — kept until the cutover.
-		// claude-mythos-5 omitted: Project Glasswing access only, not a
-		// general-availability target. claude-fable-5 requires the
-		// account to be on >= 30-day data retention or all requests
-		// 400.
-		Models: []Model{
-			{ID: "claude-fable-5", Label: "Claude Fable 5", InputPer1k: 0.010, OutputPer1k: 0.050, ContextWindow: 1000000},
-			{ID: "claude-opus-4-8", Label: "Claude Opus 4.8", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "claude-opus-4-7", Label: "Claude Opus 4.7", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "claude-opus-4-6", Label: "Claude Opus 4.6", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "claude-opus-4-1", Label: "Claude Opus 4.1 (deprecated, retires 2026-08-05)", InputPer1k: 0.015, OutputPer1k: 0.075, ContextWindow: 200000},
-			{ID: "claude-sonnet-4-6", Label: "Claude Sonnet 4.6", InputPer1k: 0.003, OutputPer1k: 0.015, ContextWindow: 1000000},
-			{ID: "claude-sonnet-4-5", Label: "Claude Sonnet 4.5", InputPer1k: 0.003, OutputPer1k: 0.015, ContextWindow: 200000},
-			{ID: "claude-haiku-4-5", Label: "Claude Haiku 4.5", InputPer1k: 0.001, OutputPer1k: 0.005, ContextWindow: 200000},
-		},
-	},
-	{
-		ID:                 "azure_openai_api",
-		Kind:               KindProvider,
-		Name:               "Azure OpenAI API",
-		Description:        "Azure-hosted OpenAI deployments",
-		DefaultHost:        "<resource>.openai.azure.com",
-		AuthHeaderName:     "api-key",
-		AuthHeaderTemplate: "${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#0078D4",
-		ParserID:           "openai",
-		// Mirrors openai_api pricing — Azure resells OpenAI models at the
-		// same per-token rates, just under different deployment names.
-		Models: []Model{
-			{ID: "gpt-5.5", Label: "GPT-5.5 (Azure)", InputPer1k: 0.005, OutputPer1k: 0.030, ContextWindow: 1050000},
-			{ID: "gpt-5.4", Label: "GPT-5.4 (Azure)", InputPer1k: 0.0025, OutputPer1k: 0.015, ContextWindow: 1050000},
-			{ID: "gpt-5.4-mini", Label: "GPT-5.4 Mini (Azure)", InputPer1k: 0.00075, OutputPer1k: 0.0045, ContextWindow: 272000},
-			{ID: "gpt-5.4-nano", Label: "GPT-5.4 Nano (Azure)", InputPer1k: 0.0002, OutputPer1k: 0.00125, ContextWindow: 272000},
-			{ID: "o4-mini", Label: "o4-mini (Azure)", InputPer1k: 0.0011, OutputPer1k: 0.0044, ContextWindow: 200000},
-			{ID: "gpt-4.1", Label: "GPT-4.1 (Azure)", InputPer1k: 0.002, OutputPer1k: 0.008, ContextWindow: 1047576},
-			{ID: "gpt-4.1-mini", Label: "GPT-4.1 mini (Azure)", InputPer1k: 0.0004, OutputPer1k: 0.0016, ContextWindow: 1047576},
-			{ID: "gpt-4o", Label: "GPT-4o (Azure)", InputPer1k: 0.0025, OutputPer1k: 0.010, ContextWindow: 128000},
-			{ID: "gpt-4o-mini", Label: "GPT-4o mini (Azure)", InputPer1k: 0.00015, OutputPer1k: 0.0006, ContextWindow: 128000},
-			{ID: "gpt-35-turbo", Label: "GPT-3.5 Turbo (Azure)", InputPer1k: 0.0005, OutputPer1k: 0.0015, ContextWindow: 16385},
-		},
-	},
-	{
-		ID:                 "bedrock_api",
-		Kind:               KindProvider,
-		Name:               "AWS Bedrock API",
-		Description:        "Anthropic, Meta, Cohere via Bedrock",
-		DefaultHost:        "bedrock-runtime.<region>.amazonaws.com",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#FF9900",
-		// Anthropic models on Bedrock take the anthropic.* prefix and
-		// follow the same lineup / pricing as the first-party Anthropic
-		// catalog entry above. claude-3-7-sonnet and claude-3-5-haiku
-		// were retired upstream on 2026-02-19 — dropped from the
-		// Bedrock list too. Amazon Nova entries cross-checked against
-		// LiteLLM (added Nova Micro + the new Nova 2 Lite preview).
-		// Llama 3.3 70B entry kept unchanged — LiteLLM tracks only
-		// per-region Llama 3 entries; standalone 3.3 not yet listed.
-		Models: []Model{
-			{ID: "anthropic.claude-opus-4-8", Label: "Claude Opus 4.8 (Bedrock)", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "anthropic.claude-opus-4-7", Label: "Claude Opus 4.7 (Bedrock)", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "anthropic.claude-opus-4-6", Label: "Claude Opus 4.6 (Bedrock)", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "anthropic.claude-opus-4-1", Label: "Claude Opus 4.1 (Bedrock, deprecated 2026-08-05)", InputPer1k: 0.015, OutputPer1k: 0.075, ContextWindow: 200000},
-			{ID: "anthropic.claude-sonnet-4-6", Label: "Claude Sonnet 4.6 (Bedrock)", InputPer1k: 0.003, OutputPer1k: 0.015, ContextWindow: 1000000},
-			{ID: "anthropic.claude-sonnet-4-5", Label: "Claude Sonnet 4.5 (Bedrock)", InputPer1k: 0.003, OutputPer1k: 0.015, ContextWindow: 200000},
-			{ID: "anthropic.claude-haiku-4-5", Label: "Claude Haiku 4.5 (Bedrock)", InputPer1k: 0.001, OutputPer1k: 0.005, ContextWindow: 200000},
-			{ID: "meta.llama3-3-70b-instruct", Label: "Llama 3.3 70B (Bedrock)", InputPer1k: 0.00072, OutputPer1k: 0.00072, ContextWindow: 128000},
-			{ID: "amazon.nova-2-lite", Label: "Amazon Nova 2 Lite (Bedrock, preview)", InputPer1k: 0.0003, OutputPer1k: 0.0025, ContextWindow: 1000000},
-			{ID: "amazon.nova-pro", Label: "Amazon Nova Pro (Bedrock)", InputPer1k: 0.0008, OutputPer1k: 0.0032, ContextWindow: 300000},
-			{ID: "amazon.nova-lite", Label: "Amazon Nova Lite (Bedrock)", InputPer1k: 0.00006, OutputPer1k: 0.00024, ContextWindow: 300000},
-			{ID: "amazon.nova-micro", Label: "Amazon Nova Micro (Bedrock)", InputPer1k: 0.000035, OutputPer1k: 0.00014, ContextWindow: 128000},
-		},
-	},
-	{
-		ID:                 "vertex_ai_api",
-		Kind:               KindProvider,
-		Name:               "Google Vertex AI API",
-		Description:        "Anthropic Claude models hosted on Vertex AI",
-		DefaultHost:        "<region>-aiplatform.googleapis.com",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#4285F4",
-		// Vertex carries the model in the URL path and authenticates with a
-		// service-account-minted OAuth token (api_key = "keyfile::<base64 SA>").
-		// Only Anthropic-on-Vertex is metered today: the request parser maps the
-		// anthropic publisher to the Anthropic parser, so the lineup + prices
-		// mirror the first-party Anthropic catalog (LiteLLM vertex_ai/claude-*
-		// confirms the same per-token rates; cross-region profiles in eu/apac
-		// carry a ~10% premium that base pricing does not model). Gemini (the
-		// google publisher) is intentionally omitted until a Gemini parser
-		// exists — the router denies unmeterable publishers rather than forward
-		// them uncounted.
-		Models: []Model{
-			{ID: "claude-fable-5", Label: "Claude Fable 5 (Vertex)", InputPer1k: 0.010, OutputPer1k: 0.050, ContextWindow: 1000000},
-			{ID: "claude-opus-4-8", Label: "Claude Opus 4.8 (Vertex)", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "claude-opus-4-7", Label: "Claude Opus 4.7 (Vertex)", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "claude-opus-4-6", Label: "Claude Opus 4.6 (Vertex)", InputPer1k: 0.005, OutputPer1k: 0.025, ContextWindow: 1000000},
-			{ID: "claude-opus-4-1", Label: "Claude Opus 4.1 (Vertex, deprecated 2026-08-05)", InputPer1k: 0.015, OutputPer1k: 0.075, ContextWindow: 200000},
-			{ID: "claude-sonnet-4-6", Label: "Claude Sonnet 4.6 (Vertex)", InputPer1k: 0.003, OutputPer1k: 0.015, ContextWindow: 1000000},
-			{ID: "claude-sonnet-4-5", Label: "Claude Sonnet 4.5 (Vertex)", InputPer1k: 0.003, OutputPer1k: 0.015, ContextWindow: 200000},
-			{ID: "claude-haiku-4-5", Label: "Claude Haiku 4.5 (Vertex)", InputPer1k: 0.001, OutputPer1k: 0.005, ContextWindow: 200000},
-		},
-	},
-	{
-		ID:                 "mistral_api",
-		Kind:               KindProvider,
-		Name:               "Mistral API",
-		Description:        "Mistral cloud API",
-		DefaultHost:        "api.mistral.ai",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#FF7000",
-		ParserID:           "openai",
-		// Pricing + context windows cross-checked against LiteLLM. Key
-		// gotchas the marketing page hides:
-		//   - `mistral-medium-latest` aliases to Medium 3.1 ($0.40/$2),
-		//     NOT Medium 3.5 ($1.50/$7.50). Catalog exposes both.
-		//   - `mistral-large-latest` aliases to Large 3 — 262K context,
-		//     cheaper than Medium 3.5.
-		//   - Magistral models are tuned for reasoning but cap context
-		//     at only 40K (vs 128K-262K elsewhere).
-		//   - `codestral-latest` still routes to the old 2405 build
-		//     ($1/$3) per LiteLLM; the newer codestral-2508 is both
-		//     cheaper and longer-context. Both exposed.
-		//   - Pixtral was folded into the main Large/Medium series; no
-		//     standalone vision entry.
-		Models: []Model{
-			{ID: "mistral-large-latest", Label: "Mistral Large 3", InputPer1k: 0.0005, OutputPer1k: 0.0015, ContextWindow: 262144},
-			{ID: "mistral-medium-latest", Label: "Mistral Medium 3.1", InputPer1k: 0.0004, OutputPer1k: 0.002, ContextWindow: 131072},
-			{ID: "mistral-medium-3-5", Label: "Mistral Medium 3.5", InputPer1k: 0.0015, OutputPer1k: 0.0075, ContextWindow: 262144},
-			{ID: "mistral-small-latest", Label: "Mistral Small 3.2", InputPer1k: 0.00006, OutputPer1k: 0.00018, ContextWindow: 131072},
-			{ID: "magistral-medium-latest", Label: "Magistral Medium (reasoning)", InputPer1k: 0.002, OutputPer1k: 0.005, ContextWindow: 40000},
-			{ID: "magistral-small-latest", Label: "Magistral Small (reasoning)", InputPer1k: 0.0005, OutputPer1k: 0.0015, ContextWindow: 40000},
-			{ID: "devstral-medium-latest", Label: "Devstral Medium 2 (coding)", InputPer1k: 0.0004, OutputPer1k: 0.002, ContextWindow: 256000},
-			{ID: "devstral-small-latest", Label: "Devstral Small 2 (coding)", InputPer1k: 0.0001, OutputPer1k: 0.0003, ContextWindow: 256000},
-			{ID: "codestral-2508", Label: "Codestral 2508", InputPer1k: 0.0003, OutputPer1k: 0.0009, ContextWindow: 256000},
-			{ID: "codestral-latest", Label: "Codestral (legacy 2405)", InputPer1k: 0.001, OutputPer1k: 0.003, ContextWindow: 32000},
-			{ID: "ministral-3-14b-2512", Label: "Ministral 3 14B", InputPer1k: 0.0002, OutputPer1k: 0.0002, ContextWindow: 262144},
-			{ID: "ministral-8b-latest", Label: "Ministral 8B", InputPer1k: 0.00015, OutputPer1k: 0.00015, ContextWindow: 262144},
-			{ID: "ministral-3-3b-2512", Label: "Ministral 3 3B", InputPer1k: 0.0001, OutputPer1k: 0.0001, ContextWindow: 131072},
-			{ID: "mistral-embed", Label: "Mistral Embed", InputPer1k: 0.0001, OutputPer1k: 0, ContextWindow: 8192},
-		},
-	},
-	{
-		ID:                 "litellm_proxy",
-		Kind:               KindGateway,
-		Name:               "LiteLLM Proxy",
-		Description:        "Bring your own LiteLLM proxy with NetBird identity stamped on every request",
-		DefaultHost:        "",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#0EA5E9",
-		ParserID:           "openai",
-		// IdentityInjection requires a LiteLLM virtual key minted with
-		// metadata.allow_client_tags=true; the master key silently drops
-		// caller tags. Tags go out via both the x-litellm-tags header and
-		// body metadata.tags: LiteLLM enforces budgets from the body only,
-		// so the header is the spend-tracking fallback when body injection
-		// can't run. See the Agent Network provider docs for key setup.
-		IdentityInjection: &IdentityInjection{
-			HeaderPair: &HeaderPairInjection{
-				EndUserIDHeader: "x-litellm-end-user-id",
-				TagsHeader:      "x-litellm-tags",
-				TagsInBody:      true,
-				EndUserIDInBody: true,
-			},
-		},
-		Models: []Model{},
-	},
-	{
-		ID:          "portkey",
-		Kind:        KindGateway,
-		Name:        "Portkey AI Gateway",
-		Description: "Portkey AI Gateway with NetBird identity stamped via x-portkey-metadata",
-		DefaultHost: "api.portkey.ai",
-		// Portkey hosted requires x-portkey-api-key (account key)
-		// plus a routing decision per request. The simplest routing
-		// path is a saved Portkey config id stamped via
-		// x-portkey-config — operators paste the pc-... id once and
-		// Portkey resolves the upstream provider + virtual key from
-		// it. ExtraHeaders below surfaces the input. Alternative:
-		// callers author "@org/model" in the body; both flows
-		// coexist (per-request authoring still works without a
-		// configured value).
-		AuthHeaderName:     "x-portkey-api-key",
-		AuthHeaderTemplate: "${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#FF5C00",
-		ParserID:           "openai",
-		IdentityInjection: &IdentityInjection{
-			JSONMetadata: &JSONMetadataInjection{
-				Header:         "x-portkey-metadata",
-				UserKey:        "_user",
-				GroupsKey:      "groups",
-				MaxValueLength: 128,
-			},
-		},
-		ExtraHeaders: []ExtraHeader{
-			{Name: "x-portkey-config"},
-		},
-		Models: []Model{},
-	},
-	{
-		ID:                 "bifrost",
-		Kind:               KindGateway,
-		Name:               "Bifrost",
-		Description:        "Maxim AI's Bifrost gateway. Point upstream URL at /openai/v1 or /anthropic/v1 on your Bifrost host depending on which body shape your apps use.",
-		DefaultHost:        "",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#7C3AED",
-		// ParserID empty: the proxy's request parser sniffs the URL
-		// path. Bifrost's /openai/v1/... contains "/v1/chat/completions"
-		// (matches OpenAIParser.DetectFromURL); /anthropic/v1/messages
-		// contains "/v1/messages" (matches AnthropicParser). Operators
-		// who paste a different prefix get no usage parsing and the
-		// cost meter skips with skipMissingProvider — degraded but
-		// non-fatal.
-		ParserID: "",
-		// Identity-injection headers are operator-customisable. The
-		// HeaderPair values below are PLACEHOLDERS surfaced by the
-		// dashboard; the actual values stamped on the wire come from
-		// the provider record's IdentityHeaderUserID /
-		// IdentityHeaderGroups fields. An empty operator value
-		// disables stamping for that dimension (the inject middleware
-		// already no-ops on empty header names). Defaulting to the
-		// x-bf-dim- family so the values land in Bifrost's
-		// Prometheus/OTEL pipelines when the operator declares the
-		// label names in their client.prometheus_labels config — see
-		// docs.getbifrost.ai/features/telemetry. Operators who use
-		// the always-on x-bf-lh- log-metadata family (no Bifrost-side
-		// declaration required) just edit the inputs.
-		//
-		// Bifrost virtual keys (sk-bf-*) ride Authorization: Bearer.
-		// Operators provision the VK on their Bifrost (UI /
-		// config.json / POST /api/governance/virtual-keys) and paste
-		// the returned sk-bf-... as ${API_KEY}. Pin v1.4+ to avoid
-		// the v1.3.0 x-bf-vk regression (maximhq/bifrost#632).
-		IdentityInjection: &IdentityInjection{
-			HeaderPair: &HeaderPairInjection{
-				EndUserIDHeader: "x-bf-dim-netbird_user_id",
-				TagsHeader:      "x-bf-dim-netbird_groups",
-				Customizable:    true,
-			},
-		},
-		Models: []Model{},
-	},
-	{
-		ID:                 "cloudflare_ai_gateway",
-		Kind:               KindGateway,
-		Name:               "Cloudflare AI Gateway",
-		Description:        "Cloudflare AI Gateway. Operator pastes the gateway URL (with the upstream provider slug like /openai or /anthropic so the URL sniffer dispatches to the right parser) and a per-gateway authentication token. Recommended setup is BYOK / Stored Keys: Cloudflare manages the upstream provider credential and the gateway token is the only secret NetBird needs.",
-		DefaultHost:        "",
-		AuthHeaderName:     "cf-aig-authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#F38020",
-		// ParserID empty: like Bifrost, the proxy's parser-detect
-		// sniffs the URL path. /openai/... contains the OpenAI hint
-		// substrings; /anthropic/v1/messages contains /v1/messages
-		// (matches AnthropicParser). The /compat universal endpoint
-		// also speaks OpenAI shape so OpenAIParser handles it.
-		// Operators who paste a different prefix degrade to no-cost
-		// (skipMissingProvider) but the request still flows.
-		ParserID: "",
-		// cf-aig-metadata is a single header carrying a JSON object;
-		// up to five string/number/boolean values per request. NetBird
-		// occupies two slots (user id + groups CSV) and leaves three
-		// for operator-added context. JSON keys are operator-
-		// customisable so Cloudflare-side log filters can use the
-		// operator's existing label conventions instead of NetBird's
-		// defaults — hence Customizable=true. The dashboard surfaces
-		// the catalog values as placeholders; only the values stored
-		// on the provider record's IdentityHeader* fields land on the
-		// wire (empty operator value = key is omitted from the JSON,
-		// since applyJSONMetadata already skips empty keys).
-		IdentityInjection: &IdentityInjection{
-			JSONMetadata: &JSONMetadataInjection{
-				Header:       "cf-aig-metadata",
-				UserKey:      "netbird_user_id",
-				GroupsKey:    "netbird_groups",
-				Customizable: true,
-				// Cloudflare's docs don't specify a per-value cap;
-				// leaving 0 disables the truncate path. Header-level
-				// constraint is "5 entries max" rather than length.
-				MaxValueLength: 0,
-			},
-		},
-		Models: []Model{},
-	},
-	{
-		ID:                 "vercel_ai_gateway",
-		Kind:               KindGateway,
-		Name:               "Vercel AI Gateway",
-		Description:        "Vercel's unified API for hundreds of models. Single endpoint, OpenAI-compatible body, model dispatch via prefix (openai/..., anthropic/..., google/..., xai/...). Per-user / per-tag attribution lands in Vercel's Custom Reporting API and observability dashboard.",
-		DefaultHost:        "",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#000000",
-		// Vercel always speaks OpenAI shape on /v1/chat/completions —
-		// the model prefix in the body picks the upstream provider.
-		// No URL sniffing needed; pin the parser directly.
-		ParserID: "openai",
-		// HeaderPair shape with fixed wire names dictated by Vercel's
-		// Custom Reporting API contract. Customizable=false because
-		// renaming the headers makes Vercel silently stop attributing
-		// — the gateway's reporting endpoint only matches its own
-		// header names. Same fixed-protocol position as LiteLLM.
-		//
-		// Caveats operators should know:
-		//   - up to 10 tags total per request (deduped); 11+ → HTTP 400
-		//   - each tag must be 1-64 chars
-		//   - user up to 256 chars (NetBird user emails fit)
-		//   - $0.075 per 1k unique user/tag values written
-		// We don't enforce the caps in the inject middleware today;
-		// operators in groups beyond the 10-tag limit will see Vercel
-		// 400s and need to re-scope their group memberships.
-		IdentityInjection: &IdentityInjection{
-			HeaderPair: &HeaderPairInjection{
-				EndUserIDHeader: "ai-reporting-user",
-				TagsHeader:      "ai-reporting-tags",
-			},
-		},
-		Models: []Model{},
-	},
-	{
-		ID:                 "openrouter",
-		Kind:               KindGateway,
-		Name:               "OpenRouter",
-		Description:        "OpenRouter's unified API for hundreds of models. Single endpoint at openrouter.ai/api/v1, OpenAI-compatible body, model dispatch via prefix (anthropic/claude-..., openai/gpt-..., google/gemini-..., etc.). Per-user attribution lands in OpenRouter's analytics via the OpenAI-standard `user` body field; OpenRouter has no groups / tags dimension at request time.",
-		DefaultHost:        "openrouter.ai/api/v1",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#6F4FF2",
-		// OpenRouter is single-endpoint OpenAI-shape on /api/v1/chat/completions —
-		// model prefix in the body picks the upstream provider.
-		// Pinning the parser saves URL sniffing.
-		ParserID: "openai",
-		// HeaderPair shape with EndUserIDInBody as the only active
-		// dimension. OpenRouter's per-user attribution is the
-		// OpenAI-standard `user` body field, not a header — and
-		// OpenRouter offers no per-request groups / tags dimension at
-		// all. Customizable=false because the field name is locked by
-		// OpenAI's spec; renaming would just defeat the inject.
-		IdentityInjection: &IdentityInjection{
-			HeaderPair: &HeaderPairInjection{
-				EndUserIDInBody: true,
-			},
-		},
-		// HTTP-Referer + X-OpenRouter-Title surface in OpenRouter's
-		// app rankings and per-app analytics. Operators paste their
-		// own app URL + display name on the provider record so their
-		// requests show under their brand instead of "no app". Both
-		// are static per-deployment, not per-request, hence the
-		// ExtraHeaders mechanism (operator-typed value, stamped on
-		// every request to this provider). Skip X-OpenRouter-Categories
-		// for now — the marketplace-categories dimension is
-		// niche-enough that we'd add it on demand.
-		ExtraHeaders: []ExtraHeader{
-			{Name: "HTTP-Referer"},
-			{Name: "X-OpenRouter-Title"},
-		},
-		Models: []Model{},
-	},
-	{
-		ID:                 "custom",
-		Kind:               KindCustom,
-		Name:               "Custom / Self-hosted",
-		Description:        "OpenAI-compatible endpoint (vLLM, Ollama, …)",
-		DefaultHost:        "",
-		AuthHeaderName:     "Authorization",
-		AuthHeaderTemplate: "Bearer ${API_KEY}",
-		DefaultContentType: "application/json",
-		BrandColor:         "#9CA3AF",
-		Models:             []Model{},
-	},
-}
-
-// All returns a copy of the full catalog.
-func All() []Provider {
-	out := make([]Provider, len(providers))
-	copy(out, providers)
-	return out
-}
-
-// Lookup returns the catalog entry with the given id, if any.
-func Lookup(id string) (Provider, bool) {
-	for _, p := range providers {
-		if p.ID == id {
-			return p, true
-		}
-	}
-	return Provider{}, false
-}
-
-// IsKnown reports whether the given id refers to a catalog entry.
-func IsKnown(id string) bool {
-	_, ok := Lookup(id)
-	return ok
-}
-
-// IsVertexPathStyle reports whether a provider uses the Google Vertex AI
-// request shape — the model is carried in the URL path
-// (/v1/projects/{p}/locations/{r}/publishers/{pub}/models/{model}:{action})
-// rather than the body, so the proxy routes it by path instead of by model.
-func IsVertexPathStyle(providerID string) bool {
-	return providerID == "vertex_ai_api"
-}
-
-// IsBedrockPathStyle reports whether a provider uses the AWS Bedrock request
-// shape — the model is carried in the URL path (/model/{modelId}/{action},
-// action being invoke, invoke-with-response-stream, converse, or
-// converse-stream) rather than the body, so the proxy routes it by path.
-func IsBedrockPathStyle(providerID string) bool {
-	return providerID == "bedrock_api"
-}
-
-// ToAPIResponse renders a catalog provider as the API representation.
-func (p Provider) ToAPIResponse() api.AgentNetworkCatalogProvider {
-	models := make([]api.AgentNetworkCatalogModel, 0, len(p.Models))
-	for _, m := range p.Models {
-		models = append(models, api.AgentNetworkCatalogModel{
-			Id:            m.ID,
-			Label:         m.Label,
-			InputPer1k:    m.InputPer1k,
-			OutputPer1k:   m.OutputPer1k,
-			ContextWindow: m.ContextWindow,
-		})
-	}
-	kind := api.AgentNetworkCatalogProviderKindProvider
-	switch p.Kind {
-	case KindGateway:
-		kind = api.AgentNetworkCatalogProviderKindGateway
-	case KindCustom:
-		kind = api.AgentNetworkCatalogProviderKindCustom
-	}
-	resp := api.AgentNetworkCatalogProvider{
-		Id:                 p.ID,
-		Name:               p.Name,
-		Description:        p.Description,
-		DefaultHost:        p.DefaultHost,
-		Kind:               kind,
-		AuthHeaderTemplate: p.AuthHeaderTemplate,
-		DefaultContentType: p.DefaultContentType,
-		BrandColor:         p.BrandColor,
-		Models:             models,
-	}
-	if len(p.ExtraHeaders) > 0 {
-		extras := make([]api.AgentNetworkCatalogExtraHeader, 0, len(p.ExtraHeaders))
-		for _, h := range p.ExtraHeaders {
-			extras = append(extras, api.AgentNetworkCatalogExtraHeader{
-				Name: h.Name,
-			})
-		}
-		resp.ExtraHeaders = &extras
-	}
-	// Surface IdentityInjection so the dashboard can decide whether
-	// to render editable inputs vs. a read-only mappings strip per
-	// shape's customizable flag. HeaderPair (Bifrost) and
-	// JSONMetadata (Cloudflare, Portkey) are mutually exclusive on a
-	// given catalog entry; emit whichever shape is set.
-	if p.IdentityInjection != nil {
-		injection := &api.AgentNetworkCatalogIdentityInjection{}
-		if hp := p.IdentityInjection.HeaderPair; hp != nil {
-			injection.HeaderPair = &api.AgentNetworkCatalogHeaderPairInjection{
-				Customizable:    hp.Customizable,
-				EndUserIdHeader: hp.EndUserIDHeader,
-				TagsHeader:      hp.TagsHeader,
-			}
-		}
-		if jm := p.IdentityInjection.JSONMetadata; jm != nil {
-			injection.JsonMetadata = &api.AgentNetworkCatalogJSONMetadataInjection{
-				Customizable: jm.Customizable,
-				Header:       jm.Header,
-				UserKey:      jm.UserKey,
-				GroupsKey:    jm.GroupsKey,
-			}
-		}
-		if injection.HeaderPair != nil || injection.JsonMetadata != nil {
-			resp.IdentityInjection = injection
-		}
-	}
-	return resp
-}
--- a/management/internals/modules/agentnetwork/handlers/access_log_handler.go
+++ b/management/internals/modules/agentnetwork/handlers/access_log_handler.go
@@ -1,91 +0,0 @@
-package handlers
-
-import (
-	"net/http"
-	"time"
-
-	"github.com/gorilla/mux"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	nbcontext "github.com/netbirdio/netbird/management/server/context"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-	"github.com/netbirdio/netbird/shared/management/http/util"
-)
-
-// addAccessLogEndpoints registers the read-only, server-side-filtered
-// agent-network access-log listing and the aggregated usage overview.
-func (h *handler) addAccessLogEndpoints(router *mux.Router) {
-	router.HandleFunc("/agent-network/access-logs", h.listAccessLogs).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/usage/overview", h.getUsageOverview).Methods("GET", "OPTIONS")
-}
-
-func (h *handler) getUsageOverview(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	// Reuse the access-log filter for the shared date/user/group/provider/model
-	// params; pagination/sort/search are irrelevant for an aggregate.
-	var filter types.AgentNetworkAccessLogFilter
-	if err := filter.ParseFromRequest(r); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	// Bound the aggregation window so an unbounded or over-wide query can't load
-	// an account's entire usage history into memory.
-	filter.ApplyUsageOverviewBounds(time.Now())
-	granularity := types.ParseUsageGranularity(r.URL.Query().Get("granularity"))
-
-	buckets, err := h.manager.GetUsageOverview(r.Context(), userAuth.AccountId, userAuth.UserId, filter, granularity)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	out := make([]api.AgentNetworkUsageBucket, 0, len(buckets))
-	for _, b := range buckets {
-		out = append(out, b.ToAPIResponse())
-	}
-	util.WriteJSONObject(r.Context(), w, out)
-}
-
-func (h *handler) listAccessLogs(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	var filter types.AgentNetworkAccessLogFilter
-	if err := filter.ParseFromRequest(r); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	rows, total, err := h.manager.ListAccessLogs(r.Context(), userAuth.AccountId, userAuth.UserId, filter)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	data := make([]api.AgentNetworkAccessLog, 0, len(rows))
-	for _, row := range rows {
-		data = append(data, row.ToAPIResponse())
-	}
-
-	pageSize := filter.GetLimit()
-	totalPages := 0
-	if pageSize > 0 {
-		totalPages = int((total + int64(pageSize) - 1) / int64(pageSize))
-	}
-
-	util.WriteJSONObject(r.Context(), w, api.AgentNetworkAccessLogsResponse{
-		Data:         data,
-		Page:         filter.Page,
-		PageSize:     pageSize,
-		TotalRecords: int(total),
-		TotalPages:   totalPages,
-	})
-}
--- a/management/internals/modules/agentnetwork/handlers/budget_handler.go
+++ b/management/internals/modules/agentnetwork/handlers/budget_handler.go
@@ -1,172 +0,0 @@
-package handlers
-
-import (
-	"encoding/json"
-	"net/http"
-	"strings"
-
-	"github.com/gorilla/mux"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	nbcontext "github.com/netbirdio/netbird/management/server/context"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-	"github.com/netbirdio/netbird/shared/management/http/util"
-	"github.com/netbirdio/netbird/shared/management/status"
-)
-
-// addBudgetRuleEndpoints registers the account-level budget rule routes.
-func (h *handler) addBudgetRuleEndpoints(router *mux.Router) {
-	router.HandleFunc("/agent-network/budget-rules", h.getAllBudgetRules).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/budget-rules", h.createBudgetRule).Methods("POST", "OPTIONS")
-	router.HandleFunc("/agent-network/budget-rules/{ruleId}", h.getBudgetRule).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/budget-rules/{ruleId}", h.updateBudgetRule).Methods("PUT", "OPTIONS")
-	router.HandleFunc("/agent-network/budget-rules/{ruleId}", h.deleteBudgetRule).Methods("DELETE", "OPTIONS")
-}
-
-func (h *handler) getAllBudgetRules(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	rules, err := h.manager.GetAllBudgetRules(r.Context(), userAuth.AccountId, userAuth.UserId)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	out := make([]*api.AgentNetworkBudgetRule, 0, len(rules))
-	for _, rule := range rules {
-		out = append(out, rule.ToAPIResponse())
-	}
-	util.WriteJSONObject(r.Context(), w, out)
-}
-
-func (h *handler) getBudgetRule(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	ruleID := mux.Vars(r)["ruleId"]
-	if ruleID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "budget rule ID is required"), w)
-		return
-	}
-
-	rule, err := h.manager.GetBudgetRule(r.Context(), userAuth.AccountId, userAuth.UserId, ruleID)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, rule.ToAPIResponse())
-}
-
-func (h *handler) createBudgetRule(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	var req api.AgentNetworkBudgetRuleRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	if err := validateBudgetRule(&req); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	rule := types.NewAccountBudgetRule(userAuth.AccountId)
-	rule.FromAPIRequest(&req)
-
-	created, err := h.manager.CreateBudgetRule(r.Context(), userAuth.UserId, rule)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, created.ToAPIResponse())
-}
-
-func (h *handler) updateBudgetRule(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	ruleID := mux.Vars(r)["ruleId"]
-	if ruleID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "budget rule ID is required"), w)
-		return
-	}
-
-	var req api.AgentNetworkBudgetRuleRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	if err := validateBudgetRule(&req); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	rule := &types.AccountBudgetRule{ID: ruleID, AccountID: userAuth.AccountId}
-	rule.FromAPIRequest(&req)
-
-	updated, err := h.manager.UpdateBudgetRule(r.Context(), userAuth.UserId, rule)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, updated.ToAPIResponse())
-}
-
-func (h *handler) deleteBudgetRule(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	ruleID := mux.Vars(r)["ruleId"]
-	if ruleID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "budget rule ID is required"), w)
-		return
-	}
-
-	if err := h.manager.DeleteBudgetRule(r.Context(), userAuth.AccountId, userAuth.UserId, ruleID); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, util.EmptyObject{})
-}
-
-// validateBudgetRule rejects malformed budget rules. It reuses the policy limit
-// validation since the cap shape is identical, and rejects empty target entries.
-func validateBudgetRule(req *api.AgentNetworkBudgetRuleRequest) error {
-	if strings.TrimSpace(req.Name) == "" {
-		return status.Errorf(status.InvalidArgument, "name is required")
-	}
-	if req.TargetGroups != nil {
-		for _, id := range *req.TargetGroups {
-			if strings.TrimSpace(id) == "" {
-				return status.Errorf(status.InvalidArgument, "target_groups must not contain empty entries")
-			}
-		}
-	}
-	if req.TargetUsers != nil {
-		for _, id := range *req.TargetUsers {
-			if strings.TrimSpace(id) == "" {
-				return status.Errorf(status.InvalidArgument, "target_users must not contain empty entries")
-			}
-		}
-	}
-	return validatePolicyLimits(req.Limits)
-}
--- a/management/internals/modules/agentnetwork/handlers/budget_handler_test.go
+++ b/management/internals/modules/agentnetwork/handlers/budget_handler_test.go
@@ -1,131 +0,0 @@
-package handlers
-
-import (
-	"context"
-	"encoding/json"
-	"net/http"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-
-	agentNetworkTypes "github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-)
-
-// TestBudgetRuleHandler_RoundTrip seeds a budget rule via the store and asserts
-// the GET wire shape carries targets and the reused PolicyLimits cap shape. The
-// create/update/delete success paths go through accountManager.StoreEvent which
-// this fixture doesn't wire — they are covered by the manager-level no-mock
-// test (TestAgentNetwork_BudgetRuleCRUD_RealManager).
-func TestBudgetRuleHandler_RoundTrip(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-
-	rule := &agentNetworkTypes.AccountBudgetRule{
-		ID:           "ainbud_test",
-		AccountID:    testAccountID,
-		Name:         "org-monthly",
-		Enabled:      true,
-		TargetGroups: []string{"grp-eng"},
-		TargetUsers:  []string{"user-alice"},
-		Limits: agentNetworkTypes.PolicyLimits{
-			TokenLimit:  agentNetworkTypes.PolicyTokenLimit{Enabled: true, GroupCap: 100000, UserCap: 10000, WindowSeconds: 2_592_000},
-			BudgetLimit: agentNetworkTypes.PolicyBudgetLimit{Enabled: true, GroupCapUsd: 500, WindowSeconds: 2_592_000},
-		},
-	}
-	require.NoError(t, f.store.SaveAgentNetworkBudgetRule(context.Background(), rule))
-
-	rec := f.do(t, http.MethodGet, "/agent-network/budget-rules/"+rule.ID, "")
-	require.Equal(t, http.StatusOK, rec.Code, "GET must succeed: %s", rec.Body.String())
-
-	var got api.AgentNetworkBudgetRule
-	require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &got))
-	assert.Equal(t, "org-monthly", got.Name, "name must round-trip")
-	assert.Equal(t, []string{"grp-eng"}, got.TargetGroups, "target groups must round-trip")
-	assert.Equal(t, []string{"user-alice"}, got.TargetUsers, "target users must round-trip")
-	assert.Equal(t, int64(100000), got.Limits.TokenLimit.GroupCap, "token group cap must round-trip")
-	assert.Equal(t, int64(2_592_000), got.Limits.BudgetLimit.WindowSeconds, "budget window must round-trip")
-}
-
-// TestBudgetRuleHandler_ListReturnsArray asserts the list endpoint returns a
-// JSON array (never null) for an account with no rules.
-func TestBudgetRuleHandler_ListReturnsArray(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-
-	rec := f.do(t, http.MethodGet, "/agent-network/budget-rules", "")
-	require.Equal(t, http.StatusOK, rec.Code, "GET must succeed: %s", rec.Body.String())
-	assert.Equal(t, "[]", trimSpace(rec.Body.String()), "empty account must return an empty array, not null")
-}
-
-// TestBudgetRuleHandler_RejectsMissingName covers the validation path (which
-// runs before the manager call, so it works without a wired accountManager).
-func TestBudgetRuleHandler_RejectsMissingName(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-
-	body := `{
-        "name": "",
-        "limits": {
-            "token_limit": {"enabled": false, "group_cap": 0, "user_cap": 0, "window_seconds": 0},
-            "budget_limit": {"enabled": false, "group_cap_usd": 0, "user_cap_usd": 0, "window_seconds": 0}
-        }
-    }`
-	rec := f.do(t, http.MethodPost, "/agent-network/budget-rules", body)
-	assert.Equal(t, http.StatusUnprocessableEntity, rec.Code,
-		"missing name must be rejected as a validation error (not a route/auth 4xx): got %d body=%s", rec.Code, rec.Body.String())
-	assert.Contains(t, rec.Body.String(), "name",
-		"rejection body must name the offending field, proving the validation path: %s", rec.Body.String())
-}
-
-// TestBudgetRuleHandler_RejectsSubMinuteWindow proves budget rules reuse the
-// policy-limit validation (enabled limit needs window >= 60s).
-func TestBudgetRuleHandler_RejectsSubMinuteWindow(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-
-	body := `{
-        "name": "bad-window",
-        "limits": {
-            "token_limit": {"enabled": true, "group_cap": 1000, "user_cap": 0, "window_seconds": 30},
-            "budget_limit": {"enabled": false, "group_cap_usd": 0, "user_cap_usd": 0, "window_seconds": 0}
-        }
-    }`
-	rec := f.do(t, http.MethodPost, "/agent-network/budget-rules", body)
-	assert.Equal(t, http.StatusUnprocessableEntity, rec.Code,
-		"sub-minute window must be rejected as a validation error (not a route/auth 4xx): got %d body=%s", rec.Code, rec.Body.String())
-	assert.Contains(t, rec.Body.String(), "window_seconds",
-		"rejection body must name the offending window_seconds field, proving the validation path: %s", rec.Body.String())
-}
-
-// TestSettingsHandler_GetExposesCollectionToggles asserts the GET settings wire
-// shape carries the account-level collection toggles after a store seed.
-func TestSettingsHandler_GetExposesCollectionToggles(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-
-	require.NoError(t, f.store.SaveAgentNetworkSettings(context.Background(), &agentNetworkTypes.Settings{
-		AccountID:              testAccountID,
-		Cluster:                "eu.proxy.netbird.io",
-		Subdomain:              "violet",
-		EnableLogCollection:    true,
-		EnablePromptCollection: true,
-		RedactPii:              false,
-	}))
-
-	rec := f.do(t, http.MethodGet, "/agent-network/settings", "")
-	require.Equal(t, http.StatusOK, rec.Code, "GET must succeed: %s", rec.Body.String())
-
-	var got api.AgentNetworkSettings
-	require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &got))
-	assert.True(t, got.EnableLogCollection, "log collection toggle must surface on the wire")
-	assert.True(t, got.EnablePromptCollection, "prompt collection toggle must surface on the wire")
-	assert.False(t, got.RedactPii, "redact toggle must surface its false value")
-	assert.Equal(t, "violet.eu.proxy.netbird.io", got.Endpoint, "endpoint stays computed from immutable cluster+subdomain")
-}
-
-func trimSpace(s string) string {
-	for len(s) > 0 && (s[len(s)-1] == '\n' || s[len(s)-1] == ' ' || s[len(s)-1] == '\t' || s[len(s)-1] == '\r') {
-		s = s[:len(s)-1]
-	}
-	for len(s) > 0 && (s[0] == '\n' || s[0] == ' ' || s[0] == '\t' || s[0] == '\r') {
-		s = s[1:]
-	}
-	return s
-}
--- a/management/internals/modules/agentnetwork/handlers/consumption_handler.go
+++ b/management/internals/modules/agentnetwork/handlers/consumption_handler.go
@@ -1,53 +0,0 @@
-package handlers
-
-import (
-	"net/http"
-
-	"github.com/gorilla/mux"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	nbcontext "github.com/netbirdio/netbird/management/server/context"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-	"github.com/netbirdio/netbird/shared/management/http/util"
-)
-
-// addConsumptionEndpoints registers the read-only Agent Network
-// consumption listing — backs the dashboard's basic counter view.
-func (h *handler) addConsumptionEndpoints(router *mux.Router) {
-	router.HandleFunc("/agent-network/consumption", h.listConsumption).Methods("GET", "OPTIONS")
-}
-
-func (h *handler) listConsumption(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	rows, err := h.manager.ListConsumption(r.Context(), userAuth.AccountId, userAuth.UserId)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	out := make([]api.AgentNetworkConsumption, 0, len(rows))
-	for _, row := range rows {
-		out = append(out, consumptionToAPI(row))
-	}
-	util.WriteJSONObject(r.Context(), w, out)
-}
-
-func consumptionToAPI(c *types.Consumption) api.AgentNetworkConsumption {
-	windowStart := c.WindowStartUTC
-	updatedAt := c.UpdatedAt
-	return api.AgentNetworkConsumption{
-		DimensionKind:  api.AgentNetworkConsumptionDimensionKind(c.DimensionKind),
-		DimensionId:    c.DimensionID,
-		WindowSeconds:  c.WindowSeconds,
-		WindowStartUtc: windowStart,
-		TokensInput:    c.TokensInput,
-		TokensOutput:   c.TokensOutput,
-		CostUsd:        c.CostUSD,
-		UpdatedAt:      &updatedAt,
-	}
-}
--- a/management/internals/modules/agentnetwork/handlers/guardrails_handler.go
+++ b/management/internals/modules/agentnetwork/handlers/guardrails_handler.go
@@ -1,171 +0,0 @@
-package handlers
-
-import (
-	"encoding/json"
-	"net/http"
-	"strings"
-
-	"github.com/gorilla/mux"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	nbcontext "github.com/netbirdio/netbird/management/server/context"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-	"github.com/netbirdio/netbird/shared/management/http/util"
-	"github.com/netbirdio/netbird/shared/management/status"
-)
-
-// addGuardrailEndpoints registers all Agent Network guardrail routes.
-func (h *handler) addGuardrailEndpoints(router *mux.Router) {
-	router.HandleFunc("/agent-network/guardrails", h.getAllGuardrails).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/guardrails", h.createGuardrail).Methods("POST", "OPTIONS")
-	router.HandleFunc("/agent-network/guardrails/{guardrailId}", h.getGuardrail).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/guardrails/{guardrailId}", h.updateGuardrail).Methods("PUT", "OPTIONS")
-	router.HandleFunc("/agent-network/guardrails/{guardrailId}", h.deleteGuardrail).Methods("DELETE", "OPTIONS")
-}
-
-func (h *handler) getAllGuardrails(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	guardrails, err := h.manager.GetAllGuardrails(r.Context(), userAuth.AccountId, userAuth.UserId)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	out := make([]*api.AgentNetworkGuardrail, 0, len(guardrails))
-	for _, g := range guardrails {
-		out = append(out, g.ToAPIResponse())
-	}
-	util.WriteJSONObject(r.Context(), w, out)
-}
-
-func (h *handler) getGuardrail(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	guardrailID := mux.Vars(r)["guardrailId"]
-	if guardrailID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "guardrail ID is required"), w)
-		return
-	}
-
-	guardrail, err := h.manager.GetGuardrail(r.Context(), userAuth.AccountId, userAuth.UserId, guardrailID)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, guardrail.ToAPIResponse())
-}
-
-func (h *handler) createGuardrail(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	var req api.AgentNetworkGuardrailRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	if err := validateGuardrail(&req); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	guardrail := types.NewGuardrail(userAuth.AccountId)
-	guardrail.FromAPIRequest(&req)
-
-	created, err := h.manager.CreateGuardrail(r.Context(), userAuth.UserId, guardrail)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, created.ToAPIResponse())
-}
-
-func (h *handler) updateGuardrail(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	guardrailID := mux.Vars(r)["guardrailId"]
-	if guardrailID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "guardrail ID is required"), w)
-		return
-	}
-
-	var req api.AgentNetworkGuardrailRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	if err := validateGuardrail(&req); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	guardrail := &types.Guardrail{
-		ID:        guardrailID,
-		AccountID: userAuth.AccountId,
-	}
-	guardrail.FromAPIRequest(&req)
-
-	updated, err := h.manager.UpdateGuardrail(r.Context(), userAuth.UserId, guardrail)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, updated.ToAPIResponse())
-}
-
-func (h *handler) deleteGuardrail(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	guardrailID := mux.Vars(r)["guardrailId"]
-	if guardrailID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "guardrail ID is required"), w)
-		return
-	}
-
-	if err := h.manager.DeleteGuardrail(r.Context(), userAuth.AccountId, userAuth.UserId, guardrailID); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, util.EmptyObject{})
-}
-
-func validateGuardrail(req *api.AgentNetworkGuardrailRequest) error {
-	if strings.TrimSpace(req.Name) == "" {
-		return status.Errorf(status.InvalidArgument, "name is required")
-	}
-
-	c := req.Checks
-	if c.ModelAllowlist.Enabled {
-		for _, id := range c.ModelAllowlist.Models {
-			if strings.TrimSpace(id) == "" {
-				return status.Errorf(status.InvalidArgument, "model_allowlist.models must not contain empty entries")
-			}
-		}
-	}
-	return nil
-}
--- a/management/internals/modules/agentnetwork/handlers/handlers_test.go
+++ b/management/internals/modules/agentnetwork/handlers/handlers_test.go
@@ -1,256 +0,0 @@
-package handlers
-
-import (
-	"context"
-	"encoding/json"
-	"io"
-	"net/http"
-	"net/http/httptest"
-	"runtime"
-	"strings"
-	"testing"
-
-	"github.com/golang/mock/gomock"
-	"github.com/gorilla/mux"
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork"
-	agentNetworkTypes "github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	nbcontext "github.com/netbirdio/netbird/management/server/context"
-	"github.com/netbirdio/netbird/management/server/permissions"
-	"github.com/netbirdio/netbird/management/server/store"
-	nbtypes "github.com/netbirdio/netbird/management/server/types"
-	"github.com/netbirdio/netbird/shared/auth"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-)
-
-const (
-	testAccountID = "acc-1"
-	testUserID    = "user-bob"
-)
-
-// agentNetworkHandlerFixture builds a real agentnetwork.Manager with
-// a sqlite store and an always-allow permissions mock, then exposes
-// the HTTP handlers via a gorilla router. Tests issue requests
-// through httptest and assert on the wire shape — the same path the
-// dashboard exercises.
-type agentNetworkHandlerFixture struct {
-	store   store.Store
-	manager agentnetwork.Manager
-	router  *mux.Router
-}
-
-func newAgentNetworkHandlerFixture(t *testing.T) *agentNetworkHandlerFixture {
-	t.Helper()
-	if runtime.GOOS == "windows" {
-		t.Skip("sqlite store not properly supported on Windows yet")
-	}
-	t.Setenv("NETBIRD_STORE_ENGINE", string(nbtypes.SqliteStoreEngine))
-
-	st, cleanUp, err := store.NewTestStoreFromSQL(context.Background(), "", t.TempDir())
-	require.NoError(t, err)
-	t.Cleanup(cleanUp)
-
-	ctrl := gomock.NewController(t)
-	perms := permissions.NewMockManager(ctrl)
-	// Always-allow: the handler tests are about wire shape, not
-	// authz. Authz is covered by the manager's own tests.
-	perms.EXPECT().
-		ValidateUserPermissions(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
-		Return(true, context.Background(), nil).
-		AnyTimes()
-
-	manager := agentnetwork.NewManager(st, perms, nil, nil)
-	h := &handler{manager: manager}
-
-	router := mux.NewRouter()
-	h.addPolicyEndpoints(router)
-	h.addConsumptionEndpoints(router)
-	h.addBudgetRuleEndpoints(router)
-	h.addSettingsEndpoints(router)
-
-	return &agentNetworkHandlerFixture{
-		store:   st,
-		manager: manager,
-		router:  router,
-	}
-}
-
-func (f *agentNetworkHandlerFixture) do(t *testing.T, method, path, body string) *httptest.ResponseRecorder {
-	t.Helper()
-	var reader io.Reader
-	if body != "" {
-		reader = strings.NewReader(body)
-	}
-	req := httptest.NewRequest(method, path, reader)
-	if body != "" {
-		req.Header.Set("Content-Type", "application/json")
-	}
-	req = nbcontext.SetUserAuthInRequest(req, auth.UserAuth{
-		UserId:    testUserID,
-		AccountId: testAccountID,
-	})
-	rec := httptest.NewRecorder()
-	f.router.ServeHTTP(rec, req)
-	return rec
-}
-
-// seedProvider persists a minimal provider record so policy create
-// passes the manager's destination_provider_ids existence check.
-func (f *agentNetworkHandlerFixture) seedProvider(t *testing.T, id string) {
-	t.Helper()
-	require.NoError(t, f.store.SaveAgentNetworkProvider(context.Background(), &agentNetworkTypes.Provider{
-		ID:                id,
-		AccountID:         testAccountID,
-		ProviderID:        "openai_api",
-		Name:              "test-" + id,
-		UpstreamURL:       "https://api.openai.com",
-		APIKey:            "sk-test",
-		Enabled:           true,
-		SessionPrivateKey: "test-priv-key",
-		SessionPublicKey:  "test-pub-key",
-	}))
-}
-
-// TestPolicyHandler_WindowSecondsRoundTrip ports bash 10 to Go:
-// assert that a policy with window_seconds on both Token + Budget
-// halves round-trips through GET unchanged AND that legacy
-// window_hours / window_days are absent from the JSON response. We
-// seed the policy directly via the store rather than POST-ing
-// because the create path goes through the manager's
-// accountManager.StoreEvent which we don't wire in this fixture; the
-// on-wire shape is what matters here, and the POST validation path
-// is covered separately by the RejectsSubMinuteWindow test.
-func TestPolicyHandler_WindowSecondsRoundTrip(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-
-	policy := &agentNetworkTypes.Policy{
-		ID:                     "ainpol_test",
-		AccountID:              testAccountID,
-		Name:                   "round-trip",
-		Enabled:                true,
-		SourceGroups:           []string{"grp-engineers"},
-		DestinationProviderIDs: []string{"prov-1"},
-		Limits: agentNetworkTypes.PolicyLimits{
-			TokenLimit:  agentNetworkTypes.PolicyTokenLimit{Enabled: true, GroupCap: 10000, UserCap: 5000, WindowSeconds: 86_400},
-			BudgetLimit: agentNetworkTypes.PolicyBudgetLimit{Enabled: true, GroupCapUsd: 10.0, UserCapUsd: 2.5, WindowSeconds: 2_592_000},
-		},
-	}
-	require.NoError(t, f.store.SaveAgentNetworkPolicy(context.Background(), policy))
-
-	rec := f.do(t, http.MethodGet, "/agent-network/policies/"+policy.ID, "")
-	require.Equal(t, http.StatusOK, rec.Code, "GET must succeed: %s", rec.Body.String())
-
-	var got api.AgentNetworkPolicy
-	require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &got))
-	assert.Equal(t, int64(86_400), got.Limits.TokenLimit.WindowSeconds, "token_limit.window_seconds must round-trip")
-	assert.Equal(t, int64(2_592_000), got.Limits.BudgetLimit.WindowSeconds, "budget_limit.window_seconds must round-trip")
-
-	// Legacy field names must NOT appear in the response — would
-	// signal that the management server is still emitting the old
-	// shape and would fool a v1 dashboard into rendering days/hours.
-	assert.NotContains(t, rec.Body.String(), "window_hours",
-		"legacy window_hours field must be absent from the on-wire response")
-	assert.NotContains(t, rec.Body.String(), "window_days",
-		"legacy window_days field must be absent from the on-wire response")
-}
-
-// TestPolicyHandler_RejectsSubMinuteWindow ports bash 20 to Go: an
-// enabled limit with window_seconds < 60 must surface as a 4xx
-// because anything finer than per-minute produces an untenable
-// volume of consumption rows for a feature whose value comes from
-// per-window cap enforcement.
-func TestPolicyHandler_RejectsSubMinuteWindow(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-	f.seedProvider(t, "prov-1")
-
-	body := `{
-        "name": "sub-minute-window",
-        "enabled": true,
-        "source_groups": ["grp-engineers"],
-        "destination_provider_ids": ["prov-1"],
-        "guardrail_ids": [],
-        "limits": {
-            "token_limit": {"enabled": true, "group_cap": 10000, "user_cap": 5000, "window_seconds": 30},
-            "budget_limit": {"enabled": false, "group_cap_usd": 0, "user_cap_usd": 0, "window_seconds": 0}
-        }
-    }`
-	rec := f.do(t, http.MethodPost, "/agent-network/policies", body)
-	// 422 specifically (InvalidArgument) proves the window-validation path —
-	// a route miss would be 404 and an auth failure 403, so a generic 4xx
-	// would let those false-pass.
-	assert.Equal(t, http.StatusUnprocessableEntity, rec.Code,
-		"enabled token_limit with window_seconds<60 must be rejected as a validation error: got %d body=%s", rec.Code, rec.Body.String())
-	assert.Contains(t, rec.Body.String(), "window_seconds",
-		"rejection body must name the offending window_seconds field, proving it's the validation path: %s", rec.Body.String())
-}
-
-// TestConsumptionHandler_EmptyAccountReturnsArray ports bash 30 to
-// Go: GET /agent-network/consumption on a clean account always
-// returns a JSON array (possibly empty), never a 404 / 500. The
-// dashboard depends on this shape to render its empty state.
-func TestConsumptionHandler_EmptyAccountReturnsArray(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-
-	rec := f.do(t, http.MethodGet, "/agent-network/consumption", "")
-	require.Equal(t, http.StatusOK, rec.Code)
-
-	var rows []api.AgentNetworkConsumption
-	require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &rows),
-		"response must always be a JSON array — even when empty: %s", rec.Body.String())
-	assert.Empty(t, rows)
-}
-
-// TestConsumptionHandler_PopulatedAccountListsRows mirrors the
-// /consumption read after a few RecordConsumption calls. Validates
-// the wire shape carries every field the dashboard reads (dim_kind,
-// dim_id, window_seconds, window_start_utc, tokens, cost_usd) and
-// rows are ordered window-newest-first.
-func TestConsumptionHandler_PopulatedAccountListsRows(t *testing.T) {
-	f := newAgentNetworkHandlerFixture(t)
-
-	require.NoError(t, f.manager.RecordConsumption(
-		context.Background(), testAccountID,
-		agentNetworkTypes.DimensionGroup, "grp-engineers",
-		86_400, 100, 50, 0.0125,
-	))
-	require.NoError(t, f.manager.RecordConsumption(
-		context.Background(), testAccountID,
-		agentNetworkTypes.DimensionUser, testUserID,
-		86_400, 100, 50, 0.0125,
-	))
-
-	rec := f.do(t, http.MethodGet, "/agent-network/consumption", "")
-	require.Equal(t, http.StatusOK, rec.Code)
-
-	var rows []api.AgentNetworkConsumption
-	require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &rows))
-	require.Len(t, rows, 2, "two RecordConsumption calls must yield two rows")
-
-	// Index by dim_kind so we can assert the full wire shape of each row,
-	// including the dimension id and the aligned window start the dashboard
-	// keys on. Both rows share totals and window.
-	byKind := make(map[string]api.AgentNetworkConsumption, len(rows))
-	for _, row := range rows {
-		assert.Equal(t, int64(100), row.TokensInput)
-		assert.Equal(t, int64(50), row.TokensOutput)
-		assert.InDelta(t, 0.0125, row.CostUsd, 1e-9)
-		assert.Equal(t, int64(86_400), row.WindowSeconds)
-		assert.False(t, row.WindowStartUtc.IsZero(), "window_start_utc must be set on every row")
-		byKind[string(row.DimensionKind)] = row
-	}
-
-	groupRow, ok := byKind["group"]
-	require.True(t, ok, "group dimension must surface")
-	assert.Equal(t, "grp-engineers", groupRow.DimensionId, "group row must carry the source group id as dimension_id")
-
-	userRow, ok := byKind["user"]
-	require.True(t, ok, "user dimension must surface")
-	assert.Equal(t, testUserID, userRow.DimensionId, "user row must carry the user id as dimension_id")
-
-	// Both rows fall in the same aligned window (same length, recorded
-	// together), so window_start_utc must match across them.
-	assert.Equal(t, groupRow.WindowStartUtc, userRow.WindowStartUtc,
-		"rows recorded in the same window must share the aligned window_start_utc")
-}
--- a/management/internals/modules/agentnetwork/handlers/policies_handler.go
+++ b/management/internals/modules/agentnetwork/handlers/policies_handler.go
@@ -1,228 +0,0 @@
-package handlers
-
-import (
-	"encoding/json"
-	"net/http"
-	"strings"
-
-	"github.com/gorilla/mux"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	nbcontext "github.com/netbirdio/netbird/management/server/context"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-	"github.com/netbirdio/netbird/shared/management/http/util"
-	"github.com/netbirdio/netbird/shared/management/status"
-)
-
-// minWindowSeconds is the floor enforced on enabled token / budget
-// limit windows. One minute is short enough for fine-grained burst
-// control without producing untenable consumption-row volume at scale.
-const minWindowSeconds int64 = 60
-
-// addPolicyEndpoints registers all Agent Network policy routes on the
-// shared handler.
-func (h *handler) addPolicyEndpoints(router *mux.Router) {
-	router.HandleFunc("/agent-network/policies", h.getAllPolicies).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/policies", h.createPolicy).Methods("POST", "OPTIONS")
-	router.HandleFunc("/agent-network/policies/{policyId}", h.getPolicy).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/policies/{policyId}", h.updatePolicy).Methods("PUT", "OPTIONS")
-	router.HandleFunc("/agent-network/policies/{policyId}", h.deletePolicy).Methods("DELETE", "OPTIONS")
-}
-
-func (h *handler) getAllPolicies(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	policies, err := h.manager.GetAllPolicies(r.Context(), userAuth.AccountId, userAuth.UserId)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	out := make([]*api.AgentNetworkPolicy, 0, len(policies))
-	for _, p := range policies {
-		out = append(out, p.ToAPIResponse())
-	}
-	util.WriteJSONObject(r.Context(), w, out)
-}
-
-func (h *handler) getPolicy(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	policyID := mux.Vars(r)["policyId"]
-	if policyID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "policy ID is required"), w)
-		return
-	}
-
-	policy, err := h.manager.GetPolicy(r.Context(), userAuth.AccountId, userAuth.UserId, policyID)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, policy.ToAPIResponse())
-}
-
-func (h *handler) createPolicy(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	var req api.AgentNetworkPolicyRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	if err := validatePolicy(&req); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	policy := types.NewPolicy(userAuth.AccountId)
-	policy.FromAPIRequest(&req)
-
-	created, err := h.manager.CreatePolicy(r.Context(), userAuth.UserId, policy)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, created.ToAPIResponse())
-}
-
-func (h *handler) updatePolicy(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	policyID := mux.Vars(r)["policyId"]
-	if policyID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "policy ID is required"), w)
-		return
-	}
-
-	var req api.AgentNetworkPolicyRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	if err := validatePolicy(&req); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	policy := &types.Policy{
-		ID:        policyID,
-		AccountID: userAuth.AccountId,
-	}
-	policy.FromAPIRequest(&req)
-
-	updated, err := h.manager.UpdatePolicy(r.Context(), userAuth.UserId, policy)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, updated.ToAPIResponse())
-}
-
-func (h *handler) deletePolicy(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	policyID := mux.Vars(r)["policyId"]
-	if policyID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "policy ID is required"), w)
-		return
-	}
-
-	if err := h.manager.DeletePolicy(r.Context(), userAuth.AccountId, userAuth.UserId, policyID); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, util.EmptyObject{})
-}
-
-func validatePolicy(req *api.AgentNetworkPolicyRequest) error {
-	if strings.TrimSpace(req.Name) == "" {
-		return status.Errorf(status.InvalidArgument, "name is required")
-	}
-	if len(req.SourceGroups) == 0 {
-		return status.Errorf(status.InvalidArgument, "source_groups must contain at least one group id")
-	}
-	for _, id := range req.SourceGroups {
-		if strings.TrimSpace(id) == "" {
-			return status.Errorf(status.InvalidArgument, "source_groups must not contain empty entries")
-		}
-	}
-	if len(req.DestinationProviderIds) == 0 {
-		return status.Errorf(status.InvalidArgument, "destination_provider_ids must contain at least one provider id")
-	}
-	for _, id := range req.DestinationProviderIds {
-		if strings.TrimSpace(id) == "" {
-			return status.Errorf(status.InvalidArgument, "destination_provider_ids must not contain empty entries")
-		}
-	}
-	if req.GuardrailIds != nil {
-		for _, id := range *req.GuardrailIds {
-			if strings.TrimSpace(id) == "" {
-				return status.Errorf(status.InvalidArgument, "guardrail_ids must not contain empty entries")
-			}
-		}
-	}
-	if req.Limits != nil {
-		if err := validatePolicyLimits(*req.Limits); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func validatePolicyLimits(l api.AgentNetworkPolicyLimits) error {
-	if l.TokenLimit.Enabled {
-		if l.TokenLimit.WindowSeconds < minWindowSeconds {
-			return status.Errorf(status.InvalidArgument, "limits.token_limit.window_seconds must be at least %d (one minute) when enabled", minWindowSeconds)
-		}
-		if l.TokenLimit.GroupCap < 0 {
-			return status.Errorf(status.InvalidArgument, "limits.token_limit.group_cap must not be negative")
-		}
-		if l.TokenLimit.UserCap < 0 {
-			return status.Errorf(status.InvalidArgument, "limits.token_limit.user_cap must not be negative")
-		}
-		if l.TokenLimit.GroupCap == 0 && l.TokenLimit.UserCap == 0 {
-			return status.Errorf(status.InvalidArgument, "limits.token_limit requires group_cap or user_cap to be greater than zero when enabled")
-		}
-	}
-	if l.BudgetLimit.Enabled {
-		if l.BudgetLimit.WindowSeconds < minWindowSeconds {
-			return status.Errorf(status.InvalidArgument, "limits.budget_limit.window_seconds must be at least %d (one minute) when enabled", minWindowSeconds)
-		}
-		if l.BudgetLimit.GroupCapUsd < 0 {
-			return status.Errorf(status.InvalidArgument, "limits.budget_limit.group_cap_usd must not be negative")
-		}
-		if l.BudgetLimit.UserCapUsd < 0 {
-			return status.Errorf(status.InvalidArgument, "limits.budget_limit.user_cap_usd must not be negative")
-		}
-		if l.BudgetLimit.GroupCapUsd == 0 && l.BudgetLimit.UserCapUsd == 0 {
-			return status.Errorf(status.InvalidArgument, "limits.budget_limit requires group_cap_usd or user_cap_usd to be greater than zero when enabled")
-		}
-	}
-	return nil
-}
--- a/management/internals/modules/agentnetwork/handlers/providers_handler.go
+++ b/management/internals/modules/agentnetwork/handlers/providers_handler.go
@@ -1,217 +0,0 @@
-// Package handlers serves the Agent Network HTTP API.
-//
-// All persistence is delegated to agentnetwork.Manager so this layer only
-// translates between the wire format (api.AgentNetworkProvider*) and the
-// domain types.
-package handlers
-
-import (
-	"encoding/json"
-	"net/http"
-	"net/url"
-	"strings"
-
-	"github.com/gorilla/mux"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork"
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/catalog"
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	nbcontext "github.com/netbirdio/netbird/management/server/context"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-	"github.com/netbirdio/netbird/shared/management/http/util"
-	"github.com/netbirdio/netbird/shared/management/status"
-)
-
-type handler struct {
-	manager agentnetwork.Manager
-}
-
-// RegisterEndpoints registers all Agent Network routes.
-func RegisterEndpoints(manager agentnetwork.Manager, router *mux.Router) {
-	h := &handler{manager: manager}
-	router.HandleFunc("/agent-network/catalog/providers", h.getCatalogProviders).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/providers", h.getAllProviders).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/providers", h.createProvider).Methods("POST", "OPTIONS")
-	router.HandleFunc("/agent-network/providers/{providerId}", h.getProvider).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/providers/{providerId}", h.updateProvider).Methods("PUT", "OPTIONS")
-	router.HandleFunc("/agent-network/providers/{providerId}", h.deleteProvider).Methods("DELETE", "OPTIONS")
-	h.addPolicyEndpoints(router)
-	h.addGuardrailEndpoints(router)
-	h.addSettingsEndpoints(router)
-	h.addConsumptionEndpoints(router)
-	h.addAccessLogEndpoints(router)
-	h.addBudgetRuleEndpoints(router)
-}
-
-func (h *handler) getCatalogProviders(w http.ResponseWriter, r *http.Request) {
-	if _, err := nbcontext.GetUserAuthFromContext(r.Context()); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	entries := catalog.All()
-	out := make([]api.AgentNetworkCatalogProvider, 0, len(entries))
-	for _, e := range entries {
-		out = append(out, e.ToAPIResponse())
-	}
-	util.WriteJSONObject(r.Context(), w, out)
-}
-
-func (h *handler) getAllProviders(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	providers, err := h.manager.GetAllProviders(r.Context(), userAuth.AccountId, userAuth.UserId)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	out := make([]*api.AgentNetworkProvider, 0, len(providers))
-	for _, p := range providers {
-		out = append(out, p.ToAPIResponse())
-	}
-	util.WriteJSONObject(r.Context(), w, out)
-}
-
-func (h *handler) getProvider(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	providerID := mux.Vars(r)["providerId"]
-	if providerID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "provider ID is required"), w)
-		return
-	}
-
-	provider, err := h.manager.GetProvider(r.Context(), userAuth.AccountId, userAuth.UserId, providerID)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, provider.ToAPIResponse())
-}
-
-func (h *handler) createProvider(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	var req api.AgentNetworkProviderRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	if err := validate(&req, true); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	provider := types.NewProvider(userAuth.AccountId)
-	provider.FromAPIRequest(&req)
-
-	bootstrapCluster := ""
-	if req.BootstrapCluster != nil {
-		bootstrapCluster = *req.BootstrapCluster
-	}
-
-	created, err := h.manager.CreateProvider(r.Context(), userAuth.UserId, provider, bootstrapCluster)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, created.ToAPIResponse())
-}
-
-func (h *handler) updateProvider(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	providerID := mux.Vars(r)["providerId"]
-	if providerID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "provider ID is required"), w)
-		return
-	}
-
-	var req api.AgentNetworkProviderRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	if err := validate(&req, false); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	provider := &types.Provider{
-		ID:        providerID,
-		AccountID: userAuth.AccountId,
-	}
-	provider.FromAPIRequest(&req)
-
-	updated, err := h.manager.UpdateProvider(r.Context(), userAuth.UserId, provider)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, updated.ToAPIResponse())
-}
-
-func (h *handler) deleteProvider(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	providerID := mux.Vars(r)["providerId"]
-	if providerID == "" {
-		util.WriteError(r.Context(), status.Errorf(status.InvalidArgument, "provider ID is required"), w)
-		return
-	}
-
-	if err := h.manager.DeleteProvider(r.Context(), userAuth.AccountId, userAuth.UserId, providerID); err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	util.WriteJSONObject(r.Context(), w, util.EmptyObject{})
-}
-
-func validate(req *api.AgentNetworkProviderRequest, requireAPIKey bool) error {
-	if strings.TrimSpace(req.ProviderId) == "" {
-		return status.Errorf(status.InvalidArgument, "provider_id is required")
-	}
-	if !catalog.IsKnown(req.ProviderId) {
-		return status.Errorf(status.InvalidArgument, "provider_id %q is not a known catalog provider", req.ProviderId)
-	}
-	if strings.TrimSpace(req.Name) == "" {
-		return status.Errorf(status.InvalidArgument, "name is required")
-	}
-	if strings.TrimSpace(req.UpstreamUrl) == "" {
-		return status.Errorf(status.InvalidArgument, "upstream_url is required")
-	}
-	u, err := url.Parse(strings.TrimSpace(req.UpstreamUrl))
-	if err != nil || u.Host == "" || (u.Scheme != "http" && u.Scheme != "https") {
-		return status.Errorf(status.InvalidArgument, "upstream_url must be a full http(s) URL")
-	}
-	if requireAPIKey && (req.ApiKey == nil || strings.TrimSpace(*req.ApiKey) == "") {
-		return status.Errorf(status.InvalidArgument, "api_key is required")
-	}
-	return nil
-}
--- a/management/internals/modules/agentnetwork/handlers/settings_handler.go
+++ b/management/internals/modules/agentnetwork/handlers/settings_handler.go
@@ -1,74 +0,0 @@
-package handlers
-
-import (
-	"encoding/json"
-	"errors"
-	"net/http"
-
-	"github.com/gorilla/mux"
-
-	"github.com/netbirdio/netbird/management/internals/modules/agentnetwork/types"
-	nbcontext "github.com/netbirdio/netbird/management/server/context"
-	"github.com/netbirdio/netbird/shared/management/http/api"
-	"github.com/netbirdio/netbird/shared/management/http/util"
-	"github.com/netbirdio/netbird/shared/management/status"
-)
-
-// addSettingsEndpoints registers the Agent Network settings routes. The
-// settings row is bootstrapped server-side on first provider create; GET reads
-// it and PUT updates the mutable collection toggles (cluster/subdomain stay
-// immutable).
-func (h *handler) addSettingsEndpoints(router *mux.Router) {
-	router.HandleFunc("/agent-network/settings", h.getSettings).Methods("GET", "OPTIONS")
-	router.HandleFunc("/agent-network/settings", h.updateSettings).Methods("PUT", "OPTIONS")
-}
-
-// updateSettings applies the collection toggles to the account's settings row.
-func (h *handler) updateSettings(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	var req api.AgentNetworkSettingsRequest
-	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
-		util.WriteErrorResponse("couldn't parse JSON request", http.StatusBadRequest, w)
-		return
-	}
-
-	settings := &types.Settings{AccountID: userAuth.AccountId}
-	settings.FromAPIRequest(&req)
-
-	updated, err := h.manager.UpdateSettings(r.Context(), userAuth.UserId, settings)
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, updated.ToAPIResponse())
-}
-
-// getSettings returns the account's agent-network settings. The settings
-// row is bootstrapped on first provider create, so freshly-onboarded
-// accounts have nothing to read. Rather than 404-ing in that case (which
-// the dashboard would have to special-case), return a JSON null with 200
-// so consumers can branch on the body alone.
-func (h *handler) getSettings(w http.ResponseWriter, r *http.Request) {
-	userAuth, err := nbcontext.GetUserAuthFromContext(r.Context())
-	if err != nil {
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-
-	settings, err := h.manager.GetSettings(r.Context(), userAuth.AccountId, userAuth.UserId)
-	if err != nil {
-		var sErr *status.Error
-		if errors.As(err, &sErr) && sErr.Type() == status.NotFound {
-			util.WriteJSONObject(r.Context(), w, nil)
-			return
-		}
-		util.WriteError(r.Context(), err, w)
-		return
-	}
-	util.WriteJSONObject(r.Context(), w, settings.ToAPIResponse())
-}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
riccardom	7706f578fe	use API	2026-06-18 19:25:55 +02:00
riccardom	daf5026192	Adds restart for MDM	2026-06-18 19:23:41 +02:00
riccardom	ec18b07959	NOP	2026-06-18 17:27:18 +02:00
riccardom	9628f016da	Bridges embed to use the supervisor + remove providing of establishedChan from caller (We now do AsyncStart + waitEstablishedOrDone, so everything is managed inside the supervisor)	2026-06-18 17:10:17 +02:00
riccardom	b39e9df194	Renaming	2026-06-18 16:51:06 +02:00
riccardom	0388e0f262	Merge branch 'main' into lock_removal # Conflicts: # client/internal/connect.go # client/ios/NetBirdSDK/client.go # client/server/server.go # client/server/server_test.go	2026-06-18 16:10:40 +02:00
riccardom	86f896723d	Wait on broadcasted ended signal for establishment / done	2026-06-18 15:20:11 +02:00
riccardom	29ee84999c	conn established (success) or done (end/failure..) are signals of the supervisor	2026-06-18 14:40:03 +02:00
riccardom	0e8fd22f36	Highlight what signals that the sup is running (which means the Connection is running because of UP/auto start)	2026-06-18 14:40:03 +02:00
riccardom	ff98105212	Clarifies: service -> ServiceRunning -> up -> ConnectionRunning -> connestablished ->connEstablished -> end of run -> connDone	2026-06-18 14:40:03 +02:00
riccardom	6465997a69	Aligns tests	2026-06-18 14:40:03 +02:00
riccardom	3204270c4b	Removes all unrequired checks, since the lifetime now guarantees the non nil presence of connectClient	2026-06-18 14:40:03 +02:00
riccardom	6d3bcef2c4	Rename to clarify	2026-06-18 14:40:03 +02:00
riccardom	5d7cb30e5b	Removes other occurrencies of connectClient check	2026-06-18 14:40:03 +02:00
riccardom	aff5da2c8e	Log something better when UP doesn't find service grpc socket	2026-06-18 14:40:03 +02:00
riccardom	9b179be324	Defines an API for knowing if the SERVICE is running (regardless of up and down state) New() builds s.connecClient and is called when the gRPC service is started. Up() is invoked only IF a gRPC service IS running which is possible only if the New() was called.	2026-06-17 23:30:30 +02:00
riccardom	33e7b6a8f1	Align tests	2026-06-17 23:00:53 +02:00
riccardom	e0cff5e240	If New creates, Starts MUST find a connectClient	2026-06-17 23:00:53 +02:00
riccardom	0085aebf77	Removes external connectWithRetryRuns	2026-06-17 08:52:02 +02:00
riccardom	91d2d341b7	Guard is done inside not from external. Stop called unconditionally	2026-06-16 23:18:50 +02:00
riccardom	8d46580c13	Not the run duty	2026-06-16 23:09:59 +02:00
riccardom	b42fe6a10f	And now let's just avoid it at all	2026-06-16 21:58:05 +02:00
riccardom	0f5d7fdc07	Removes deadlock	2026-06-16 21:57:07 +02:00
riccardom	13c78d98f5	Client (and the supervisor within) now lives forever. So checkin that it's nil isn't anymore an indirect way to know the cleanup has succeeded	2026-06-16 18:55:08 +02:00
riccardom	d1229ed84c	Restores context MD passed to GetInfo to mgmt (is it valuable data?)	2026-06-16 18:55:08 +02:00
riccardom	9758145517	Discriminate auth fails from mgmt unreachable. Needed to avoid this workaround if status != internal.StatusIdle && status != internal.StatusConnected && status != internal.StatusConnecting { s.actCancel() } in server.go Status(...)	2026-06-16 18:55:08 +02:00
riccardom	200a5a6a70	Rename. We keep only start command (long lasting command) For stop we do synchronously.	2026-06-16 18:55:08 +02:00
riccardom	1f7b1ea863	IsRunning V1	2026-06-16 18:55:08 +02:00
riccardom	4abb10c1aa	Fixes DisableAutoConnect semantics DisableAutoConnect semantics ============================ Scope: governs ONLY the service-Start auto-connect decision. Once the connection goroutine has been spawned (by any path), the flag is never consulted again — the retry loop keeps trying to connect until ctx is cancelled by Down / Stop / Logout. Cases ----- 1. Service Start + DisableAutoConnect = true - No connection goroutine spawned. - clientRunning stays false. - State set to StatusIdle. - Daemon stays passive until an explicit Up RPC. 2. Service Start + DisableAutoConnect = false - Spawn connectWithRetryRuns. - clientRunning = true. - Retry loop runs until ctx cancelled. 3. Up RPC (any value of DisableAutoConnect) - Flag ignored. The user / admin explicitly asked to connect — by definition not "auto". - Spawn connectWithRetryRuns. - clientRunning = true. 4. MDM-triggered restart (any value of DisableAutoConnect) - Flag ignored. An MDM policy change applies new config to an already-running engine; treated as an implicit Up. - Spawn connectWithRetryRuns. - clientRunning = true. 5. Down / Stop / Logout - Cancels ctx → connectWithRetryRuns exits → close(giveUpChan). - cleanupConnection clears clientRunning = false. - DisableAutoConnect not involved. Prepare for next step Collapse error log into s.connect and rename it to more explicit connectOnce # Conflicts: # client/server/server.go	2026-06-16 18:55:08 +02:00
riccardom	a45cefe57a	IsRunning V0	2026-06-16 18:55:08 +02:00
riccardom	a6d504633f	Use Stop not other direct calls like actCancel() things.. for now adding then removing	2026-06-16 18:55:08 +02:00
riccardom	70f2097fff	config becomes start/run arg so we are sure it gets updated on any (re)start This is because we now have the supervisor not destroyed over time Still intermediate step where server.go regenerates the client entirely.	2026-06-16 18:31:46 +02:00
riccardom	befa9a879c	DO NOT recreate ConnectClient!	2026-06-16 18:31:46 +02:00
riccardom	4152c41796	Wire the stop-for-any-reason to the sup stop (and remove race on engine!)	2026-06-16 18:31:46 +02:00
riccardom	8b76b3d824	Keep the command DON'T DO COPIES!!!	2026-06-16 18:31:46 +02:00
riccardom	0503a18644	Wraps client lifetime into a supervisor - define needed supervisor context/variables - will use runCancel as knob to know if the client is running. No extra boolean flags - runWaiter is used to signal to the async run caller	2026-06-16 18:31:46 +02:00
riccardom	ec6512d660	Docker build env for windows/android	2026-06-16 15:31:52 +02:00