Aligns new tests to signature

Prevents skipping intermediate map updates that may not have been applied yet,
by moving persistence from applySync to the map state manager.
2026-06-29 19:29:56 +00:00 · 2026-06-28 17:25:17 +02:00 · 2026-06-28 17:23:34 +02:00 · 2026-06-28 17:23:34 +02:00 · 2026-06-28 17:20:00 +02:00 · 2026-06-28 17:20:00 +02:00
158 changed files with 8862 additions and 4412 deletions
--- a/.github/workflows/check-license-dependencies.yml
+++ b/.github/workflows/check-license-dependencies.yml
@@ -20,7 +20,7 @@ jobs:

    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

@@ -59,12 +59,12 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Set up Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: true
--- a/.github/workflows/git-town.yml
+++ b/.github/workflows/git-town.yml
@@ -15,7 +15,7 @@ jobs:
      pull-requests: write

    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false
      - uses: git-town/action@3d8b878379abb1ee393fb49865a28b4a6c2cd3b0 # v1.2.1
--- a/.github/workflows/golang-test-darwin.yml
+++ b/.github/workflows/golang-test-darwin.yml
@@ -16,18 +16,18 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false

      - name: Cache Go modules
-        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: ~/go/pkg/mod
          key: macos-gotest-${{ hashFiles('**/go.sum') }}
@@ -45,10 +45,10 @@ jobs:
        run: git --no-pager diff --exit-code

      - name: Test
-        run: NETBIRD_STORE_ENGINE=${{ matrix.store }} CI=true go test -coverprofile=coverage.txt -tags=devcert -exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE' -timeout 5m -p 1 $(go list ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined)
+        run: NETBIRD_STORE_ENGINE=${{ matrix.store }} CI=true go test -coverprofile=coverage.txt -tags 'devcert privileged' -exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE' -timeout 5m -p 1 $(go list ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined -e /client/testutil/privileged)

      - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
+        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
--- a/.github/workflows/golang-test-freebsd.yml
+++ b/.github/workflows/golang-test-freebsd.yml
@@ -16,7 +16,7 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

@@ -28,7 +28,7 @@ jobs:
        id: test
        env:
          GO_VERSION: ${{ steps.goversion.outputs.version }}
-        uses: vmactions/freebsd-vm@d1e65811565151536c0c894fff74f06351ed26e6 # v1.4.5
+        uses: vmactions/freebsd-vm@b84ab5559b5a1bb4b8ee2737d2506a16e1737636 # v1.4.8
        with:
          usesh: true
          copyback: false
@@ -48,14 +48,14 @@ jobs:
            export PATH=$PATH:/usr/local/go/bin:$HOME/go/bin
            time go build -o netbird client/main.go
            # check all component except management, since we do not support management server on freebsd
-            time go test -timeout 1m -failfast ./base62/...
+            time go test -tags privileged -timeout 1m -failfast ./base62/...
            # NOTE: without -p1 `client/internal/dns` will fail because of `listen udp4 :33100: bind: address already in use`
-            time go test -timeout 8m -failfast -v -p 1 ./client/...
-            time go test -timeout 1m -failfast ./dns/...
-            time go test -timeout 1m -failfast ./encryption/...
-            time go test -timeout 1m -failfast ./formatter/...
-            time go test -timeout 1m -failfast ./client/iface/...
-            time go test -timeout 1m -failfast ./route/...
-            time go test -timeout 1m -failfast ./sharedsock/...
-            time go test -timeout 1m -failfast ./util/...
-            time go test -timeout 1m -failfast ./version/...
+            time go test -tags privileged -timeout 8m -failfast -v -p 1 ./client/...
+            time go test -tags privileged -timeout 1m -failfast ./dns/...
+            time go test -tags privileged -timeout 1m -failfast ./encryption/...
+            time go test -tags privileged -timeout 1m -failfast ./formatter/...
+            time go test -tags privileged -timeout 1m -failfast ./client/iface/...
+            time go test -tags privileged -timeout 1m -failfast ./route/...
+            time go test -tags privileged -timeout 1m -failfast ./sharedsock/...
+            time go test -tags privileged -timeout 1m -failfast ./util/...
+            time go test -tags privileged -timeout 1m -failfast ./version/...
--- a/.github/workflows/golang-test-linux.yml
+++ b/.github/workflows/golang-test-linux.yml
@@ -18,7 +18,7 @@ jobs:
      management: ${{ steps.filter.outputs.management }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

@@ -30,7 +30,7 @@ jobs:
              - 'management/**'

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -41,7 +41,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        id: cache
        with:
          path: |
@@ -119,12 +119,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -135,7 +135,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -158,11 +158,11 @@ jobs:
        run: git --no-pager diff --exit-code

      - name: Test
-        run: CGO_ENABLED=1 GOARCH=${{ matrix.arch }} CI=true go test -coverprofile=coverage.txt -tags devcert -exec 'sudo' -timeout 10m -p 1 $(go list ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined)
+        run: CGO_ENABLED=1 GOARCH=${{ matrix.arch }} CI=true go test -coverprofile=coverage.txt -tags devcert -timeout 10m -p 1 $(go list ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined)

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
+        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -175,12 +175,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -192,7 +192,7 @@ jobs:
          echo "modcache_dir=$(go env GOMODCACHE)" >> $GITHUB_OUTPUT

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        id: cache-restore
        with:
          path: |
@@ -229,7 +229,7 @@ jobs:
            sh -c ' \
              apk update; apk add --no-cache \
                ca-certificates iptables ip6tables dbus dbus-dev libpcap-dev build-base; \
-              go test -buildvcs=false -tags devcert -v -timeout 10m -p 1 $(go list -buildvcs=false ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined -e /client/ui -e /upload-server)
+              go test -buildvcs=false -tags "devcert privileged" -v -timeout 10m -p 1 $(go list -buildvcs=false ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined -e /client/ui -e /upload-server -e /client/testutil/privileged)
            '

  test_relay:
@@ -246,12 +246,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -266,7 +266,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -290,7 +290,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
+        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -306,12 +306,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -325,7 +325,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -347,7 +347,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
+        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -363,12 +363,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -383,7 +383,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -407,7 +407,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
+        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -424,12 +424,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -440,7 +440,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -484,7 +484,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
+        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
@@ -529,12 +529,12 @@ jobs:
            prom/prometheus

      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -545,7 +545,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -579,10 +579,11 @@ jobs:
          CGO_ENABLED=1 GOARCH=${{ matrix.arch }} \
          NETBIRD_STORE_ENGINE=${{ matrix.store }} \
          CI=true \
-          GIT_BRANCH=${{ github.ref_name }} \
          go test -tags devcert -run=^$ -bench=. \
          -exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE,GIT_BRANCH,GITHUB_RUN_ID' \
          -timeout 20m ./management/... ./shared/management/... $(go list ./management/... ./shared/management/... | grep -v -e /management/server/http)
+        env:
+          GIT_BRANCH: ${{ github.ref_name }}

  api_benchmark:
    name: "Management / Benchmark (API)"
@@ -623,12 +624,12 @@ jobs:
            prom/prometheus

      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -639,7 +640,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -673,12 +674,13 @@ jobs:
          CGO_ENABLED=1 GOARCH=${{ matrix.arch }} \
          NETBIRD_STORE_ENGINE=${{ matrix.store }} \
          CI=true \
-          GIT_BRANCH=${{ github.ref_name }} \
          go test -tags=benchmark \
            -run=^$ \
            -bench=. \
            -exec 'sudo --preserve-env=CI,NETBIRD_STORE_ENGINE,GIT_BRANCH,GITHUB_RUN_ID' \
            -timeout 20m ./management/server/http/...
+        env:
+          GIT_BRANCH: ${{ github.ref_name }}

  api_integration_test:
    name: "Management / Integration"
@@ -692,12 +694,12 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
@@ -708,7 +710,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -734,7 +736,7 @@ jobs:

      - name: Upload coverage reports to Codecov
        if: matrix.arch == 'amd64'
-        uses: codecov/codecov-action@e79a6962e0d4c0c17b229090214935d2e33f8354 #v6.0.1
+        uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f #v7.0.0
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: netbirdio/netbird
--- a/.github/workflows/golang-test-windows.yml
+++ b/.github/workflows/golang-test-windows.yml
@@ -18,12 +18,12 @@ jobs:
    runs-on: windows-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        id: go
        with:
          go-version-file: "go.mod"
@@ -35,7 +35,7 @@ jobs:
          echo "modcache=$(go env GOMODCACHE)" >> $env:GITHUB_ENV

      - name: Cache Go modules
-        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ${{ env.cache }}
@@ -68,7 +68,7 @@ jobs:
        run: |
          $packages = go list ./... | Where-Object { $_ -notmatch '/management' } | Where-Object { $_ -notmatch '/relay' } | Where-Object { $_ -notmatch '/signal' } | Where-Object { $_ -notmatch '/proxy' } | Where-Object { $_ -notmatch '/combined' }
          $goExe = "C:\hostedtoolcache\windows\go\${{ steps.go.outputs.go-version }}\x64\bin\go.exe"
-          $cmd = "$goExe test -tags=devcert -timeout 10m -p 1 $($packages -join ' ') > test-out.txt 2>&1"
+          $cmd = "$goExe test -tags `"devcert privileged`" -timeout 10m -p 1 $($packages -join ' ') > test-out.txt 2>&1"
          Set-Content -Path "${{ github.workspace }}\run-tests.cmd" -Value $cmd

      - name: test
--- a/.github/workflows/golangci-lint.yml
+++ b/.github/workflows/golangci-lint.yml
@@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false
      - name: codespell
@@ -40,7 +40,7 @@ jobs:
    timeout-minutes: 15
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false
      - name: Check for duplicate constants
@@ -48,7 +48,7 @@ jobs:
        run: |
          ! awk '/const \(/,/)/{print $0}' management/server/activity/codes.go | grep -o '= [0-9]*' | sort | uniq -d | grep .
      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
--- a/.github/workflows/install-script-test.yml
+++ b/.github/workflows/install-script-test.yml
@@ -22,7 +22,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

--- a/.github/workflows/mobile-build-validation.yml
+++ b/.github/workflows/mobile-build-validation.yml
@@ -16,11 +16,11 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false
      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
      - name: Setup Android SDK
@@ -28,13 +28,13 @@ jobs:
        with:
          cmdline-tools-version: 8512546
      - name: Setup Java
-        uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654
+        uses: actions/setup-java@1bcf9fb12cf4aa7d266a90ae39939e61372fe520
        with:
          java-version: "11"
          distribution: "adopt"
      - name: NDK Cache
        id: ndk-cache
-        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: /usr/local/lib/android/sdk/ndk
          key: ndk-cache-23.1.7779620
@@ -54,11 +54,11 @@ jobs:
    runs-on: macos-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false
      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
      - name: install gomobile
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -27,7 +27,7 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

@@ -64,7 +64,7 @@ jobs:
        if: steps.check_diff.outputs.diff_exists == 'true'
        env:
          GO_VERSION: ${{ steps.goversion.outputs.version }}
-        uses: vmactions/freebsd-vm@d1e65811565151536c0c894fff74f06351ed26e6 # v1.4.5
+        uses: vmactions/freebsd-vm@b84ab5559b5a1bb4b8ee2737d2506a16e1737636 # v1.4.8
        with:
          usesh: true
          copyback: false
@@ -135,7 +135,7 @@ jobs:
      ghcr_images: ${{ steps.tag_and_push_images.outputs.images_markdown }}
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          fetch-depth: 0 # It is required for GoReleaser to work properly
          persist-credentials: false
@@ -166,12 +166,12 @@ jobs:
          fi

      - name: Set up Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
      - name: Cache Go modules
-        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache/restore@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ~/go/pkg/mod
@@ -186,9 +186,9 @@ jobs:
      - name: check git status
        run: git --no-pager diff --exit-code
      - name: Set up QEMU
-        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a #v4.0.0
+        uses: docker/setup-qemu-action@06116385d9baf250c9f4dcb4858b16962ea869c3 #v4.1.0
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd #v4.0.0
+        uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 #v4.1.0
      - name: Login to Docker hub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0
@@ -221,7 +221,7 @@ jobs:
        run: goversioninfo -arm -64 -icon client/ui/assets/netbird.ico -manifest client/manifest.xml -product-name ${{ env.PRODUCT_NAME }} -copyright "${{ env.COPYRIGHT }}" -ver-major ${{ steps.semver_parser.outputs.major }} -ver-minor ${{ steps.semver_parser.outputs.minor }} -ver-patch ${{ steps.semver_parser.outputs.patch }} -ver-build 0 -file-version ${{ steps.semver_parser.outputs.fullversion }}.0 -product-version ${{ steps.semver_parser.outputs.fullversion }}.0 -o client/resources_windows_arm64.syso
      - name: Run GoReleaser
        id: goreleaser
-        uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
+        uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
        with:
          version: ${{ env.GORELEASER_VER }}
          args: release --clean ${{ env.flags }}
@@ -347,7 +347,7 @@ jobs:
      release_ui_artifact_url: ${{ steps.upload_release_ui.outputs.artifact-url }}
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          fetch-depth: 0 # It is required for GoReleaser to work properly
          persist-credentials: false
@@ -374,12 +374,12 @@ jobs:
          fi

      - name: Set up Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
      - name: Cache Go modules
-        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ~/go/pkg/mod
@@ -420,7 +420,7 @@ jobs:
        run: goversioninfo -arm -64 -icon client/ui/assets/netbird.ico -manifest client/ui/manifest.xml -product-name ${{ env.PRODUCT_NAME }}-"UI" -copyright "${{ env.COPYRIGHT }}" -ver-major ${{ steps.semver_parser.outputs.major }} -ver-minor ${{ steps.semver_parser.outputs.minor }} -ver-patch ${{ steps.semver_parser.outputs.patch }} -ver-build 0 -file-version ${{ steps.semver_parser.outputs.fullversion }}.0 -product-version ${{ steps.semver_parser.outputs.fullversion }}.0 -o client/ui/resources_windows_arm64.syso

      - name: Run GoReleaser
-        uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
+        uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
        with:
          version: ${{ env.GORELEASER_VER }}
          args: release --config .goreleaser_ui.yaml --clean ${{ env.flags }}
@@ -464,17 +464,17 @@ jobs:
      - if: ${{ !startsWith(github.ref, 'refs/tags/v') }}
        run: echo "flags=--snapshot" >> $GITHUB_ENV
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          fetch-depth: 0 # It is required for GoReleaser to work properly
          persist-credentials: false
      - name: Set up Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
          cache: false
      - name: Cache Go modules
-        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: |
            ~/go/pkg/mod
@@ -488,7 +488,7 @@ jobs:
        run: git --no-pager diff --exit-code
      - name: Run GoReleaser
        id: goreleaser
-        uses: goreleaser/goreleaser-action@4c6ab561adb47e50c45ef534e2155934e91c40c1 # v7.2.0
+        uses: goreleaser/goreleaser-action@5daf1e915a5f0af01ddbcd89a43b8061ff4f1a89 # v7.2.2
        with:
          version: ${{ env.GORELEASER_VER }}
          args: release --config .goreleaser_ui_darwin.yaml --clean ${{ env.flags }}
@@ -522,7 +522,7 @@ jobs:
      downloadPath: '${{ github.workspace }}\temp'
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

@@ -534,13 +534,13 @@ jobs:
        run: echo "C:\Program Files\7-Zip" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append

      - name: Download release artifacts
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.1
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
          name: release
          path: release

      - name: Download UI release artifacts
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.1
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
        with:
          name: release-ui
          path: release-ui
--- a/.github/workflows/test-infrastructure-files.yml
+++ b/.github/workflows/test-infrastructure-files.yml
@@ -68,17 +68,17 @@ jobs:
        run: sudo apt-get install -y curl

      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"

      - name: Cache Go modules
-        uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+        uses: actions/cache@2c8a9bd7457de244a408f35966fab2fb45fda9c8 # v6.0.0
        with:
          path: ~/go/pkg/mod
          key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }}
@@ -207,7 +207,7 @@ jobs:
      - name: Build management docker image
        working-directory: management
        run: |
-          docker build -t netbirdio/management:latest .
+          docker build -t netbirdio/management:latest --build-arg TARGETPLATFORM=. .

      - name: Build signal binary
        working-directory: signal
@@ -216,7 +216,7 @@ jobs:
      - name: Build signal docker image
        working-directory: signal
        run: |
-          docker build -t netbirdio/signal:latest .
+          docker build -t netbirdio/signal:latest --build-arg TARGETPLATFORM=. .

      - name: Build relay binary
        working-directory: relay
@@ -225,7 +225,7 @@ jobs:
      - name: Build relay docker image
        working-directory: relay
        run: |
-          docker build -t netbirdio/relay:latest .
+          docker build -t netbirdio/relay:latest --build-arg TARGETPLATFORM=. .

      - name: run docker compose up
        working-directory: infrastructure_files/artifacts
@@ -256,7 +256,7 @@ jobs:
        run: sudo apt-get install -y jq

      - name: Checkout code
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false

--- a/.github/workflows/wasm-build-validation.yml
+++ b/.github/workflows/wasm-build-validation.yml
@@ -19,11 +19,11 @@ jobs:
      GOARCH: wasm
    steps:
      - name: Checkout repository
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false
      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
      - name: Install dependencies
@@ -44,11 +44,11 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          persist-credentials: false
      - name: Install Go
-        uses: actions/setup-go@4b73464bb391d4059bd26b0524d20df3927bd417 # v6.3.0
+        uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        with:
          go-version-file: "go.mod"
      - name: Build Wasm client
--- a/.goreleaser.yaml
+++ b/.goreleaser.yaml
@@ -247,7 +247,7 @@ dockers_v2:
       - netbirdio/netbird
       - ghcr.io/netbirdio/netbird
     tags:
-       - "v{{ .Version }}"
+       - "{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: client/Dockerfile
     extra_files:
@@ -295,7 +295,7 @@ dockers_v2:
       - netbirdio/relay
       - ghcr.io/netbirdio/relay
     tags:
-       - "v{{ .Version }}"
+       - "{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: relay/Dockerfile
     platforms:
@@ -317,7 +317,7 @@ dockers_v2:
       - netbirdio/signal
       - ghcr.io/netbirdio/signal
     tags:
-       - "v{{ .Version }}"
+       - "{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: signal/Dockerfile
     platforms:
@@ -339,7 +339,7 @@ dockers_v2:
       - netbirdio/management
       - ghcr.io/netbirdio/management
     tags:
-       - "v{{ .Version }}"
+       - "{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: management/Dockerfile
     platforms:
@@ -361,7 +361,7 @@ dockers_v2:
       - netbirdio/upload
       - ghcr.io/netbirdio/upload
     tags:
-       - "v{{ .Version }}"
+       - "{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: upload-server/Dockerfile
     platforms:
@@ -383,7 +383,7 @@ dockers_v2:
       - netbirdio/netbird-server
       - ghcr.io/netbirdio/netbird-server
     tags:
-       - "v{{ .Version }}"
+       - "{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: combined/Dockerfile
     platforms:
@@ -405,7 +405,7 @@ dockers_v2:
       - netbirdio/reverse-proxy
       - ghcr.io/netbirdio/reverse-proxy
     tags:
-       - "v{{ .Version }}"
+       - "{{ .Version }}"
       - "{{ if eq .Env.SKIP_PUBLISH \"false\" }}latest{{ end }}"
     dockerfile: proxy/Dockerfile
     platforms:
@@ -462,9 +462,13 @@ checksum:
    - glob: ./infrastructure_files/getting-started-with-zitadel.sh
    - glob: ./release_files/install.sh
    - glob: ./infrastructure_files/getting-started.sh
+    - glob: ./infrastructure_files/getting-started-enterprise.sh
+    - glob: ./infrastructure_files/migrate-to-enterprise.sh

 release:
  extra_files:
    - glob: ./infrastructure_files/getting-started-with-zitadel.sh
    - glob: ./release_files/install.sh
    - glob: ./infrastructure_files/getting-started.sh
+    - glob: ./infrastructure_files/getting-started-enterprise.sh
+    - glob: ./infrastructure_files/migrate-to-enterprise.sh
--- a/14
+++ b/14
@@ -1,4 +1,4 @@
-.PHONY: lint lint-all lint-install setup-hooks
+.PHONY: lint lint-all lint-install setup-hooks test-unit test-privileged
 GOLANGCI_LINT := $(shell pwd)/bin/golangci-lint

 # Install golangci-lint locally if needed
@@ -25,3 +25,15 @@ setup-hooks:
 	@git config core.hooksPath .githooks
 	@chmod +x .githooks/pre-push
 	@echo "✅ Git hooks configured! Pre-push will now run 'make lint'"
+
+# Host-safe unit tests: excludes the privileged-tagged tests (root / system-mutating).
+# Runs as a normal user with no sudo and leaves host networking untouched.
+test-unit:
+	@go test -tags devcert -timeout 10m ./...
+
+# Privileged suite: runs the `privileged`-tagged tests inside a --privileged
+# --cap-add=NET_ADMIN container via the ory/dockertest harness. Requires Docker.
+# Narrow the run with env vars, e.g.:
+#   PRIV_RUN=TestNftablesManager PRIV_PKGS=./client/firewall/nftables/... make test-privileged
+test-privileged:
+	@go test -tags 'devcert privileged' -timeout 30m -run TestRunPrivilegedSuiteInDocker -v ./client/testutil/privileged/...
--- a/README.md
+++ b/README.md
@@ -37,6 +37,11 @@
  </strong>
 </p>

+> ### 🤖 NetBird Agent Network (Beta)
+> Identity-aware access control for AI agents — keyless access to LLM APIs and private
+> resources over the encrypted NetBird tunnel. See [`agent-network/`](agent-network/) or
+> read the docs at **[netbird.ai](https://netbird.ai)**.
+
 **NetBird combines a configuration-free peer-to-peer private network and a centralized access control system in a single platform, making it easy to create secure private networks for your organization or home.**

 **Connect.** NetBird creates a WireGuard-based overlay network that automatically connects your machines over an encrypted tunnel, leaving behind the hassle of opening ports, complex firewall rules, VPN gateways, and so forth.
--- a/agent-network/README.md
+++ b/agent-network/README.md
@@ -0,0 +1,39 @@
+# NetBird Agent Network
+
+Agent Network is NetBird's access control layer for AI agents and the people who run
+them. It gives every agent a real identity, tied to your identity provider (IdP), and
+governs what it can reach — the LLM APIs and AI gateways it can call, and the internal
+resources it can access. Traffic flows only over the encrypted NetBird tunnel, scoped by
+policy, with no API keys to leak.
+
+> **Beta.** Agent Network is open source and can be self-hosted on your own
+> infrastructure.
+
+## How it works
+
+Agent Network is built on two existing NetBird capabilities:
+
+- **Overlay network** — the encrypted WireGuard mesh between peers.
+- **Reverse proxy** — a NetBird peer that terminates LLM requests, establishes the
+  caller's identity, evaluates policies/limits/guardrails, injects the upstream provider
+  key server-side, forwards to the API or gateway, and records usage.
+
+LLM traffic is routed through the proxy's identity-aware pipeline, while internal
+resources (databases, internal APIs, self-hosted models) are reached directly over
+peer-to-peer WireGuard tunnels, governed by the same identities and access policies.
+
+## Where the code lives
+
+There is no separate "agent-network" service — it reuses the reverse-proxy and management
+components:
+
+- [`proxy/`](../proxy) — the NetBird reverse proxy that serves the agent network endpoint
+  and runs the per-request middleware pipeline.
+- [`management/internals/modules/reverseproxy/`](../management/internals/modules/reverseproxy)
+  — the management-side control plane: providers, policies, guardrails, limits, routing,
+  and usage/access logs.
+
+## Documentation
+
+Full documentation, architecture, and quickstart:
+**https://docs.netbird.io/agent-network**
--- a/client/android/client.go
+++ b/client/android/client.go
@@ -151,9 +151,9 @@ func (c *Client) Run(platformFiles PlatformFiles, urlOpener URLOpener, isAndroid

 	// todo do not throw error in case of cancelled context
 	ctx = internal.CtxInitState(ctx)
-	connectClient := internal.NewConnectClient(ctx, c.recorder)
+	connectClient := internal.NewConnectClient(ctx, cfg, c.recorder)
 	c.setState(cfg, cacheDir, connectClient)
-	return connectClient.RunOnAndroid(cfg, c.tunAdapter, c.iFaceDiscover, c.networkChangeListener, slices.Clone(dns.items), dnsReadyListener, stateFile, cacheDir)
+	return connectClient.RunOnAndroid(c.tunAdapter, c.iFaceDiscover, c.networkChangeListener, slices.Clone(dns.items), dnsReadyListener, stateFile, cacheDir)
 }

 // RunWithoutLogin we apply this type of run function when the backed has been started without UI (i.e. after reboot).
@@ -186,9 +186,9 @@ func (c *Client) RunWithoutLogin(platformFiles PlatformFiles, dns *DNSList, dnsR

 	// todo do not throw error in case of cancelled context
 	ctx = internal.CtxInitState(ctx)
-	connectClient := internal.NewConnectClient(ctx, c.recorder)
+	connectClient := internal.NewConnectClient(ctx, cfg, c.recorder)
 	c.setState(cfg, cacheDir, connectClient)
-	return connectClient.RunOnAndroid(cfg, c.tunAdapter, c.iFaceDiscover, c.networkChangeListener, slices.Clone(dns.items), dnsReadyListener, stateFile, cacheDir)
+	return connectClient.RunOnAndroid(c.tunAdapter, c.iFaceDiscover, c.networkChangeListener, slices.Clone(dns.items), dnsReadyListener, stateFile, cacheDir)
 }

 // Stop the internal client and free the resources
--- a/client/cmd/debug.go
+++ b/client/cmd/debug.go
@@ -130,7 +130,7 @@ func debugConfigDump(cmd *cobra.Command, _ []string) error {

 	client := proto.NewDaemonServiceClient(conn)
 	resp, err := client.GetConfig(cmd.Context(), &proto.GetConfigRequest{
-		ProfileName: activeProf.Name,
+		ProfileName: string(activeProf.ID),
 		Username:    currUser.Username,
 	})
 	if err != nil {
--- a/client/cmd/login.go
+++ b/client/cmd/login.go
@@ -227,7 +227,7 @@ func switchProfile(ctx context.Context, handle string, username string) (profile
 		Username:    &username,
 	})
 	if err != nil {
-		return "", fmt.Errorf("switch profile failed: %v", err)
+		return "", fmt.Errorf("switch profile failed: %w", err)
 	}

 	return profilemanager.ID(resp.Id), nil
--- a/client/cmd/profile.go
+++ b/client/cmd/profile.go
@@ -138,26 +138,23 @@ func addProfileFunc(cmd *cobra.Command, args []string) error {
 		return err
 	}

+	currUser, err := user.Current()
+	if err != nil {
+		return fmt.Errorf("get current user: %w", err)
+	}
+
 	conn, err := DialClientGRPCServer(cmd.Context(), daemonAddr)
 	if err != nil {
 		return fmt.Errorf("connect to service CLI interface: %w", err)
 	}
 	defer conn.Close()

-	currUser, err := user.Current()
-	if err != nil {
-		return fmt.Errorf("get current user: %w", err)
-	}
-
 	daemonClient := proto.NewDaemonServiceClient(conn)
 	profileName := args[0]

-	resp, err := daemonClient.AddProfile(cmd.Context(), &proto.AddProfileRequest{
-		ProfileName: profileName,
-		Username:    currUser.Username,
-	})
+	id, err := addProfileOnDaemon(cmd.Context(), daemonClient, profileName, currUser.Username)
 	if err != nil {
-		return fmt.Errorf("add profile request: %w", err)
+		return err
 	}

 	dupCount, _ := countProfilesWithName(cmd.Context(), daemonClient, currUser.Username, profileName)
@@ -166,7 +163,6 @@ func addProfileFunc(cmd *cobra.Command, args []string) error {
 		cmd.Println("Use `netbird profile list --show-id` to disambiguate later.")
 	}

-	id := profilemanager.ID(resp.Id)
 	cmd.Printf("Profile added: %s  %s\n", id.ShortID(), profilemanager.StripCtrlChars(profileName))
 	return nil

@@ -330,3 +326,19 @@ func wrapAmbiguityError(err error, handle string) error {
 	}
 	return err
 }
+
+// addProfileOnDaemon issues the AddProfile RPC on an existing daemon client
+// and returns the new profile's ID. It is the single entry point for profile
+// creation, shared by `netbird profile add` and the `netbird up --profile
+// <name>` auto-create path.
+func addProfileOnDaemon(ctx context.Context, client proto.DaemonServiceClient, profileName, username string) (profilemanager.ID, error) {
+	resp, err := client.AddProfile(ctx, &proto.AddProfileRequest{
+		ProfileName: profileName,
+		Username:    username,
+	})
+	if err != nil {
+		return "", fmt.Errorf("add profile failed: %w", err)
+	}
+
+	return profilemanager.ID(resp.Id), nil
+}
--- a/client/cmd/root.go
+++ b/client/cmd/root.go
@@ -20,7 +20,6 @@ import (
 	"github.com/spf13/cobra"
 	"github.com/spf13/pflag"
 	"google.golang.org/grpc"
-	"google.golang.org/grpc/connectivity"
 	"google.golang.org/grpc/credentials/insecure"

 	daddr "github.com/netbirdio/netbird/client/internal/daemonaddr"
@@ -262,46 +261,17 @@ func FlagNameToEnvVar(cmdFlag string, prefix string) string {
 	return prefix + upper
 }

-// DialClientGRPCServer returns client connection to the daemon server. It waits
-// (up to the timeout) for the daemon to become reachable so an `up` issued right
-// after `service start` tolerates the startup race. Instead of grpc's blocking
-// dial — whose raw "transport failed" retry warnings are silenced by the logger
-// config — we drive the wait ourselves and emit one clean line per failed attempt.
+// DialClientGRPCServer dials the daemon at addr and blocks until the connection is established, ctx is cancelled, or the 10-second dial timeout expires.
 func DialClientGRPCServer(ctx context.Context, addr string) (*grpc.ClientConn, error) {
 	ctx, cancel := context.WithTimeout(ctx, time.Second*10)
 	defer cancel()

-	conn, err := grpc.DialContext(
+	return grpc.DialContext(
 		ctx,
 		strings.TrimPrefix(addr, "tcp://"),
 		grpc.WithTransportCredentials(insecure.NewCredentials()),
+		grpc.WithBlock(),
 	)
-	if err != nil {
-		return nil, err
-	}
-
-	conn.Connect()
-	for {
-		state := conn.GetState()
-		if state == connectivity.Ready {
-			return conn, nil
-		}
-		// Log only once the connection has actually failed — not during the
-		// brief Idle/Connecting phase on a healthy daemon (avoids a spurious
-		// line + wait when the daemon is already up).
-		if state == connectivity.TransientFailure {
-			log.Infof("waiting for the netbird daemon to become available at %s...", addr)
-		}
-		// Wake on the next state change, but at least every second so a stuck
-		// TransientFailure re-logs at a steady cadence until the timeout.
-		waitCtx, waitCancel := context.WithTimeout(ctx, time.Second)
-		conn.WaitForStateChange(waitCtx, state)
-		waitCancel()
-		if ctx.Err() != nil {
-			_ = conn.Close()
-			return nil, fmt.Errorf("daemon not reachable at %s: %w", addr, ctx.Err())
-		}
-	}
 }

 // WithBackOff execute function in backoff cycle.
--- a/client/cmd/service_privileged_test.go
+++ b/client/cmd/service_privileged_test.go
@@ -0,0 +1,196 @@
+//go:build privileged
+
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"runtime"
+	"testing"
+	"time"
+
+	"github.com/kardianos/service"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+const (
+	serviceStartTimeout = 10 * time.Second
+	serviceStopTimeout  = 5 * time.Second
+	statusPollInterval  = 500 * time.Millisecond
+)
+
+// waitForServiceStatus polls the service every statusPollInterval until it reports expectedStatus (true, nil) or timeout elapses (false, error).
+func waitForServiceStatus(expectedStatus service.Status, timeout time.Duration) (bool, error) {
+	cfg, err := newSVCConfig()
+	if err != nil {
+		return false, err
+	}
+
+	ctxSvc, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	s, err := newSVC(newProgram(ctxSvc, cancel), cfg)
+	if err != nil {
+		return false, err
+	}
+
+	ctx, timeoutCancel := context.WithTimeout(context.Background(), timeout)
+	defer timeoutCancel()
+
+	ticker := time.NewTicker(statusPollInterval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return false, fmt.Errorf("timeout waiting for service status %v", expectedStatus)
+		case <-ticker.C:
+			status, err := s.Status()
+			if err != nil {
+				// Status errors are treated as transient: keep polling until the timeout fires.
+				continue
+			}
+			if status == expectedStatus {
+				return true, nil
+			}
+		}
+	}
+}
+
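+// TestServiceLifecycle drives install, start, restart, reconfigure, stop and uninstall of a uniquely named test service; it is skipped outside Linux/FreeBSD and when CONTAINER=true.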
+func TestServiceLifecycle(t *testing.T) {
+	// TODO: Add support for Windows and macOS
+	if runtime.GOOS != "linux" && runtime.GOOS != "freebsd" {
+		t.Skipf("Skipping service lifecycle test on unsupported OS: %s", runtime.GOOS)
+	}
+
+	if os.Getenv("CONTAINER") == "true" {
+		t.Skip("Skipping service lifecycle test in container environment")
+	}
+
+	originalServiceName := serviceName
+	serviceName = "netbirdtest" + fmt.Sprintf("%d", time.Now().Unix())
+	defer func() {
+		serviceName = originalServiceName
+	}()
+
+	tempDir := t.TempDir()
+	configPath = fmt.Sprintf("%s/netbird-test-config.json", tempDir)
+	logLevel = "info"
+	daemonAddr = fmt.Sprintf("unix://%s/netbird-test.sock", tempDir)
+
+	// Ensure cleanup even if a subtest fails and Stop/Uninstall subtests don't run.
+	t.Cleanup(func() {
+		cfg, err := newSVCConfig()
+		if err != nil {
+			t.Errorf("cleanup: create service config: %v", err)
+			return
+		}
+		ctxSvc, cancel := context.WithCancel(context.Background())
+		defer cancel()
+		s, err := newSVC(newProgram(ctxSvc, cancel), cfg)
+		if err != nil {
+			t.Errorf("cleanup: create service: %v", err)
+			return
+		}
+
+		// If the subtests already cleaned up, there's nothing to do.
+		if _, err := s.Status(); err != nil {
+			return
+		}
+
+		if err := s.Stop(); err != nil {
+			t.Errorf("cleanup: stop service: %v", err)
+		}
+		if err := s.Uninstall(); err != nil {
+			t.Errorf("cleanup: uninstall service: %v", err)
+		}
+	})
+
+	ctx := context.Background()
+
+	t.Run("Install", func(t *testing.T) {
+		installCmd.SetContext(ctx)
+		err := installCmd.RunE(installCmd, []string{})
+		require.NoError(t, err)
+
+		cfg, err := newSVCConfig()
+		require.NoError(t, err)
+
+		ctxSvc, cancel := context.WithCancel(context.Background())
+		defer cancel()
+
+		s, err := newSVC(newProgram(ctxSvc, cancel), cfg)
+		require.NoError(t, err)
+
+		status, err := s.Status()
+		assert.NoError(t, err)
+		assert.NotEqual(t, service.StatusUnknown, status)
+	})
+
+	t.Run("Start", func(t *testing.T) {
+		startCmd.SetContext(ctx)
+		err := startCmd.RunE(startCmd, []string{})
+		require.NoError(t, err)
+
+		running, err := waitForServiceStatus(service.StatusRunning, serviceStartTimeout)
+		require.NoError(t, err)
+		assert.True(t, running)
+	})
+
+	t.Run("Restart", func(t *testing.T) {
+		restartCmd.SetContext(ctx)
+		err := restartCmd.RunE(restartCmd, []string{})
+		require.NoError(t, err)
+
+		running, err := waitForServiceStatus(service.StatusRunning, serviceStartTimeout)
+		require.NoError(t, err)
+		assert.True(t, running)
+	})
+
+	t.Run("Reconfigure", func(t *testing.T) {
+		originalLogLevel := logLevel
+		logLevel = "debug"
+		defer func() {
+			logLevel = originalLogLevel
+		}()
+
+		reconfigureCmd.SetContext(ctx)
+		err := reconfigureCmd.RunE(reconfigureCmd, []string{})
+		require.NoError(t, err)
+
+		running, err := waitForServiceStatus(service.StatusRunning, serviceStartTimeout)
+		require.NoError(t, err)
+		assert.True(t, running)
+	})
+
+	t.Run("Stop", func(t *testing.T) {
+		stopCmd.SetContext(ctx)
+		err := stopCmd.RunE(stopCmd, []string{})
+		require.NoError(t, err)
+
+		stopped, err := waitForServiceStatus(service.StatusStopped, serviceStopTimeout)
+		require.NoError(t, err)
+		assert.True(t, stopped)
+	})
+
+	t.Run("Uninstall", func(t *testing.T) {
+		uninstallCmd.SetContext(ctx)
+		err := uninstallCmd.RunE(uninstallCmd, []string{})
+		require.NoError(t, err)
+
+		cfg, err := newSVCConfig()
+		require.NoError(t, err)
+
+		ctxSvc, cancel := context.WithCancel(context.Background())
+		defer cancel()
+
+		s, err := newSVC(newProgram(ctxSvc, cancel), cfg)
+		require.NoError(t, err)
+
+		_, err = s.Status()
+		assert.Error(t, err)
+	})
+}
--- a/client/cmd/service_test.go
+++ b/client/cmd/service_test.go
@@ -1,16 +1,12 @@
 package cmd

 import (
-	"context"
-	"fmt"
 	"os"
 	"os/signal"
 	"runtime"
 	"syscall"
 	"testing"
-	"time"

-	"github.com/kardianos/service"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -31,186 +27,6 @@ func TestMain(m *testing.M) {
 	os.Exit(m.Run())
 }

-const (
-	serviceStartTimeout = 10 * time.Second
-	serviceStopTimeout  = 5 * time.Second
-	statusPollInterval  = 500 * time.Millisecond
-)
-
-// waitForServiceStatus waits for service to reach expected status with timeout
-func waitForServiceStatus(expectedStatus service.Status, timeout time.Duration) (bool, error) {
-	cfg, err := newSVCConfig()
-	if err != nil {
-		return false, err
-	}
-
-	ctxSvc, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
-	s, err := newSVC(newProgram(ctxSvc, cancel), cfg)
-	if err != nil {
-		return false, err
-	}
-
-	ctx, timeoutCancel := context.WithTimeout(context.Background(), timeout)
-	defer timeoutCancel()
-
-	ticker := time.NewTicker(statusPollInterval)
-	defer ticker.Stop()
-
-	for {
-		select {
-		case <-ctx.Done():
-			return false, fmt.Errorf("timeout waiting for service status %v", expectedStatus)
-		case <-ticker.C:
-			status, err := s.Status()
-			if err != nil {
-				// Continue polling on transient errors
-				continue
-			}
-			if status == expectedStatus {
-				return true, nil
-			}
-		}
-	}
-}
-
-// TestServiceLifecycle tests the complete service lifecycle
-func TestServiceLifecycle(t *testing.T) {
-	// TODO: Add support for Windows and macOS
-	if runtime.GOOS != "linux" && runtime.GOOS != "freebsd" {
-		t.Skipf("Skipping service lifecycle test on unsupported OS: %s", runtime.GOOS)
-	}
-
-	if os.Getenv("CONTAINER") == "true" {
-		t.Skip("Skipping service lifecycle test in container environment")
-	}
-
-	originalServiceName := serviceName
-	serviceName = "netbirdtest" + fmt.Sprintf("%d", time.Now().Unix())
-	defer func() {
-		serviceName = originalServiceName
-	}()
-
-	tempDir := t.TempDir()
-	configPath = fmt.Sprintf("%s/netbird-test-config.json", tempDir)
-	logLevel = "info"
-	daemonAddr = fmt.Sprintf("unix://%s/netbird-test.sock", tempDir)
-
-	// Ensure cleanup even if a subtest fails and Stop/Uninstall subtests don't run.
-	t.Cleanup(func() {
-		cfg, err := newSVCConfig()
-		if err != nil {
-			t.Errorf("cleanup: create service config: %v", err)
-			return
-		}
-		ctxSvc, cancel := context.WithCancel(context.Background())
-		defer cancel()
-		s, err := newSVC(newProgram(ctxSvc, cancel), cfg)
-		if err != nil {
-			t.Errorf("cleanup: create service: %v", err)
-			return
-		}
-
-		// If the subtests already cleaned up, there's nothing to do.
-		if _, err := s.Status(); err != nil {
-			return
-		}
-
-		if err := s.Stop(); err != nil {
-			t.Errorf("cleanup: stop service: %v", err)
-		}
-		if err := s.Uninstall(); err != nil {
-			t.Errorf("cleanup: uninstall service: %v", err)
-		}
-	})
-
-	ctx := context.Background()
-
-	t.Run("Install", func(t *testing.T) {
-		installCmd.SetContext(ctx)
-		err := installCmd.RunE(installCmd, []string{})
-		require.NoError(t, err)
-
-		cfg, err := newSVCConfig()
-		require.NoError(t, err)
-
-		ctxSvc, cancel := context.WithCancel(context.Background())
-		defer cancel()
-
-		s, err := newSVC(newProgram(ctxSvc, cancel), cfg)
-		require.NoError(t, err)
-
-		status, err := s.Status()
-		assert.NoError(t, err)
-		assert.NotEqual(t, service.StatusUnknown, status)
-	})
-
-	t.Run("Start", func(t *testing.T) {
-		startCmd.SetContext(ctx)
-		err := startCmd.RunE(startCmd, []string{})
-		require.NoError(t, err)
-
-		running, err := waitForServiceStatus(service.StatusRunning, serviceStartTimeout)
-		require.NoError(t, err)
-		assert.True(t, running)
-	})
-
-	t.Run("Restart", func(t *testing.T) {
-		restartCmd.SetContext(ctx)
-		err := restartCmd.RunE(restartCmd, []string{})
-		require.NoError(t, err)
-
-		running, err := waitForServiceStatus(service.StatusRunning, serviceStartTimeout)
-		require.NoError(t, err)
-		assert.True(t, running)
-	})
-
-	t.Run("Reconfigure", func(t *testing.T) {
-		originalLogLevel := logLevel
-		logLevel = "debug"
-		defer func() {
-			logLevel = originalLogLevel
-		}()
-
-		reconfigureCmd.SetContext(ctx)
-		err := reconfigureCmd.RunE(reconfigureCmd, []string{})
-		require.NoError(t, err)
-
-		running, err := waitForServiceStatus(service.StatusRunning, serviceStartTimeout)
-		require.NoError(t, err)
-		assert.True(t, running)
-	})
-
-	t.Run("Stop", func(t *testing.T) {
-		stopCmd.SetContext(ctx)
-		err := stopCmd.RunE(stopCmd, []string{})
-		require.NoError(t, err)
-
-		stopped, err := waitForServiceStatus(service.StatusStopped, serviceStopTimeout)
-		require.NoError(t, err)
-		assert.True(t, stopped)
-	})
-
-	t.Run("Uninstall", func(t *testing.T) {
-		uninstallCmd.SetContext(ctx)
-		err := uninstallCmd.RunE(uninstallCmd, []string{})
-		require.NoError(t, err)
-
-		cfg, err := newSVCConfig()
-		require.NoError(t, err)
-
-		ctxSvc, cancel := context.WithCancel(context.Background())
-		defer cancel()
-
-		s, err := newSVC(newProgram(ctxSvc, cancel), cfg)
-		require.NoError(t, err)
-
-		_, err = s.Status()
-		assert.Error(t, err)
-	})
-}
-
 // TestServiceEnvVars tests environment variable parsing
 func TestServiceEnvVars(t *testing.T) {
 	tests := []struct {
--- a/client/cmd/status.go
+++ b/client/cmd/status.go
@@ -11,7 +11,6 @@ import (
 	"google.golang.org/grpc/status"

 	"github.com/netbirdio/netbird/client/internal"
-	"github.com/netbirdio/netbird/client/internal/profilemanager"
 	"github.com/netbirdio/netbird/client/proto"
 	nbstatus "github.com/netbirdio/netbird/client/status"
 	"github.com/netbirdio/netbird/util"
@@ -111,11 +110,10 @@ func statusFunc(cmd *cobra.Command, args []string) error {
 		return nil
 	}

-	pm := profilemanager.NewProfileManager()
-	var profName string
-	if activeProf, err := pm.GetActiveProfile(); err == nil {
-		profName = activeProf.Name
-	}
+	// Resolve the active profile's display name via the daemon, which runs
+	// as root and can read the per-user profile files. The local profile
+	// manager only knows the active profile ID, not its display name.
+	profName := getActiveProfileName(ctx)

 	var outputInformationHolder = nbstatus.ConvertToStatusOutputOverview(resp.GetFullStatus(), nbstatus.ConvertOptions{
 		Anonymize:            anonymizeFlag,
@@ -167,6 +165,25 @@ func getStatus(ctx context.Context, fullPeerStatus bool, shouldRunProbes bool) (
 	return resp, nil
 }

+// getActiveProfileName asks the daemon for the active profile's display
+// name. The daemon runs as root and can read the per-user profile files to
+// resolve the ID to its human-readable name. Returns an empty string on any
+// error so status output degrades gracefully.
+func getActiveProfileName(ctx context.Context) string {
+	conn, err := DialClientGRPCServer(ctx, daemonAddr)
+	if err != nil {
+		return ""
+	}
+	defer conn.Close()
+
+	resp, err := proto.NewDaemonServiceClient(conn).GetActiveProfile(ctx, &proto.GetActiveProfileRequest{})
+	if err != nil {
+		return ""
+	}
+
+	return resp.GetProfileName()
+}
+
 func parseFilters() error {
 	switch strings.ToLower(statusFilter) {
 	case "", "idle", "connecting", "connected":
--- a/client/cmd/up.go
+++ b/client/cmd/up.go
@@ -128,15 +128,9 @@ func upFunc(cmd *cobra.Command, args []string) error {
 	var profileSwitched bool
 	// switch profile if provided
 	if profileName != "" {
-		resolvedID, err := switchProfile(cmd.Context(), profileName, username.Username)
-		if err != nil {
+		if err := switchOrCreateProfile(cmd.Context(), pm, profileName, username.Username); err != nil {
 			return fmt.Errorf("switch profile: %v", err)
 		}
-
-		if err := pm.SwitchProfile(resolvedID); err != nil {
-			return fmt.Errorf("switch profile: %v", err)
-		}
-
 		profileSwitched = true
 	}

@@ -151,6 +145,52 @@ func upFunc(cmd *cobra.Command, args []string) error {
 	return runInDaemonMode(ctx, cmd, pm, activeProf, profileSwitched)
 }

+// switchOrCreateProfile switches the active profile to the one identified by
+// handle, creating it first when it does not exist yet. This restores the
+// pre-0.73 behaviour where `netbird up --profile <name>` auto-creates a
+// missing profile instead of failing.
+func switchOrCreateProfile(ctx context.Context, pm *profilemanager.ProfileManager, handle, username string) error {
+	resolvedID, err := switchProfile(ctx, handle, username)
+	if err != nil {
+		st, ok := gstatus.FromError(err)
+		if !ok || st.Code() != codes.NotFound {
+			return err
+		}
+		// Don't fail immediately on a create error: a concurrent run may
+		// have created the profile between the NotFound above and this
+		// call, in which case the retried switch still succeeds. Only
+		// surface the create error if the switch also fails.
+		_, createErr := createProfile(ctx, handle, username)
+		if resolvedID, err = switchProfile(ctx, handle, username); err != nil {
+			if createErr != nil {
+				return fmt.Errorf("create profile: %w", createErr)
+			}
+			return err
+		}
+	}
+
+	if err := pm.SwitchProfile(resolvedID); err != nil {
+		return err
+	}
+	return nil
+}
+
+// createProfile dials the daemon and creates a profile named profileName for
+// username, returning its generated ID. A dial failure is wrapped (%w) with a
+// hint on starting the service. Callers holding a client use addProfileOnDaemon.
+func createProfile(ctx context.Context, profileName, username string) (profilemanager.ID, error) {
+	conn, err := DialClientGRPCServer(ctx, daemonAddr)
+	if err != nil {
+		//nolint
+		return "", fmt.Errorf("failed to connect to daemon error: %w\n"+
+			"If the daemon is not running please run: "+
+			"\nnetbird service install \nnetbird service start\n", err)
+	}
+	defer conn.Close()
+
+	return addProfileOnDaemon(ctx, proto.NewDaemonServiceClient(conn), profileName, username)
+}
+
 func runInForegroundMode(ctx context.Context, cmd *cobra.Command, activeProf *profilemanager.Profile) error {
 	// override the default profile filepath if provided
 	if configPath != "" {
@@ -201,10 +241,10 @@ func runInForegroundMode(ctx context.Context, cmd *cobra.Command, activeProf *pr
 	r := peer.NewRecorder(config.ManagementURL.String())
 	r.GetFullStatus()

-	connectClient := internal.NewConnectClient(ctx, r)
+	connectClient := internal.NewConnectClient(ctx, config, r)
 	SetupDebugHandler(ctx, config, r, connectClient, "")

-	return connectClient.Run(config, nil, util.FindFirstLogPath(logFiles))
+	return connectClient.Run(nil, util.FindFirstLogPath(logFiles))
 }

 func runInDaemonMode(ctx context.Context, cmd *cobra.Command, pm *profilemanager.ProfileManager, activeProf *profilemanager.Profile, profileSwitched bool) error {
--- a/client/embed/embed.go
+++ b/client/embed/embed.go
@@ -264,24 +264,34 @@ func (c *Client) Start(startCtx context.Context) error {
 	if err, _ := authClient.Login(ctx, c.setupKey, c.jwtToken); err != nil {
 		return fmt.Errorf("login: %w", err)
 	}
-	client := internal.NewConnectClient(ctx, c.recorder)
+	client := internal.NewConnectClient(ctx, c.config, c.recorder)
 	client.SetSyncResponsePersistence(true)

-	// The supervisor owns the run; we wait until it is established, ends with a
-	// startup error (permanent backoff err), or startCtx expires.
+	// wait for a startup error (permanent backoff err) on clientErr, or for run to be closed (engine up)
 	// TODO: make after-startup backoff err available
-	client.RunAsync(c.config, nil)
+	run := make(chan struct{})
+	clientErr := make(chan error, 1)
+	go func() {
+		if err := client.Run(run, ""); err != nil {
+			clientErr <- err
+		}
+	}()

-	if err := client.WaitEstablishedOrDone(startCtx); err != nil {
-		// Either startCtx expired while connecting, or the run ended before it
-		// established. Cancel the client context before stopping: Engine.Start
-		// blocks on the signal stream while holding the engine mutex and only
-		// unblocks on cancellation. Stopping first would deadlock on that mutex.
+	select {
+	case <-startCtx.Done():
+		// ConnectClient.Stop now cancels its own run context and waits for the
+		// run loop to tear the engine down, so this cancel() is no longer
+		// required to break the deadlock and could be removed. It is kept as a
+		// defensive belt-and-suspenders: cancelling the parent context first
+		// guarantees the run loop is unblocked even if Stop's contract regresses.
 		cancel()
 		if stopErr := client.Stop(); stopErr != nil {
-			return fmt.Errorf("stop error after startup failure. Stop error: %w. Startup: %w", stopErr, err)
+			return fmt.Errorf("stop error after context done. Stop error: %w. Context done: %w", stopErr, startCtx.Err())
 		}
+		return startCtx.Err()
+	case err := <-clientErr:
 		return fmt.Errorf("startup: %w", err)
+	case <-run:
 	}

 	c.connect = client
--- a/client/firewall/iptables/manager_linux_test.go
+++ b/client/firewall/iptables/manager_linux_test.go
@@ -1,3 +1,5 @@
+//go:build privileged
+
 package iptables

 import (
--- a/client/firewall/iptables/router_linux_test.go
+++ b/client/firewall/iptables/router_linux_test.go
@@ -1,4 +1,4 @@
-//go:build !android
+//go:build !android && privileged

 package iptables

--- a/client/firewall/nftables/manager_linux_test.go
+++ b/client/firewall/nftables/manager_linux_test.go
@@ -1,3 +1,5 @@
+//go:build privileged
+
 package nftables

 import (
--- a/client/firewall/nftables/router_linux_test.go
+++ b/client/firewall/nftables/router_linux_test.go
@@ -1,4 +1,4 @@
-//go:build !android
+//go:build !android && privileged

 package nftables

--- a/client/iface/iface_test.go
+++ b/client/iface/iface_test.go
@@ -1,3 +1,5 @@
+//go:build privileged
+
 package iface

 import (
--- a/client/iface/wgproxy/proxy_linux_test.go
+++ b/client/iface/wgproxy/proxy_linux_test.go
@@ -1,4 +1,4 @@
-//go:build linux && !android
+//go:build linux && !android && privileged

 package wgproxy

--- a/client/iface/wgproxy/proxy_seed_test.go
+++ b/client/iface/wgproxy/proxy_seed_test.go
@@ -1,4 +1,4 @@
-//go:build !linux
+//go:build !linux || !privileged

 package wgproxy

--- a/client/iface/wgproxy/redirect_test.go
+++ b/client/iface/wgproxy/redirect_test.go
@@ -1,4 +1,4 @@
-//go:build linux && !android
+//go:build linux && !android && privileged

 package wgproxy

@@ -26,64 +26,6 @@ func compareUDPAddr(addr1, addr2 net.Addr) bool {
 	return udpAddr1.IP.Equal(udpAddr2.IP) && udpAddr1.Port == udpAddr2.Port
 }

-// TestRedirectAs_eBPF_IPv4 tests RedirectAs with eBPF proxy using IPv4 addresses
-func TestRedirectAs_eBPF_IPv4(t *testing.T) {
-	wgPort := 51850
-	ebpfProxy := ebpf.NewWGEBPFProxy(wgPort, 1280)
-	if err := ebpfProxy.Listen(); err != nil {
-		t.Fatalf("failed to initialize ebpf proxy: %v", err)
-	}
-	defer func() {
-		if err := ebpfProxy.Free(); err != nil {
-			t.Errorf("failed to free ebpf proxy: %v", err)
-		}
-	}()
-
-	proxy := ebpf.NewProxyWrapper(ebpfProxy)
-
-	// NetBird UDP address of the remote peer
-	nbAddr := &net.UDPAddr{
-		IP:   net.ParseIP("100.108.111.177"),
-		Port: 38746,
-	}
-
-	p2pEndpoint := &net.UDPAddr{
-		IP:   net.ParseIP("192.168.0.56"),
-		Port: 51820,
-	}
-
-	testRedirectAs(t, proxy, wgPort, nbAddr, p2pEndpoint)
-}
-
-// TestRedirectAs_eBPF_IPv6 tests RedirectAs with eBPF proxy using IPv6 addresses
-func TestRedirectAs_eBPF_IPv6(t *testing.T) {
-	wgPort := 51851
-	ebpfProxy := ebpf.NewWGEBPFProxy(wgPort, 1280)
-	if err := ebpfProxy.Listen(); err != nil {
-		t.Fatalf("failed to initialize ebpf proxy: %v", err)
-	}
-	defer func() {
-		if err := ebpfProxy.Free(); err != nil {
-			t.Errorf("failed to free ebpf proxy: %v", err)
-		}
-	}()
-
-	proxy := ebpf.NewProxyWrapper(ebpfProxy)
-
-	// NetBird UDP address of the remote peer
-	nbAddr := &net.UDPAddr{
-		IP:   net.ParseIP("100.108.111.177"),
-		Port: 38746,
-	}
-
-	p2pEndpoint := &net.UDPAddr{
-		IP:   net.ParseIP("fe80::56"),
-		Port: 51820,
-	}
-
-	testRedirectAs(t, proxy, wgPort, nbAddr, p2pEndpoint)
-}
-
 // TestRedirectAs_UDP_IPv4 tests RedirectAs with UDP proxy using IPv4 addresses
 func TestRedirectAs_UDP_IPv4(t *testing.T) {
 	wgPort := 51852
@@ -256,6 +198,64 @@ func testRedirectAs(t *testing.T, proxy Proxy, wgPort int, nbAddr, p2pEndpoint *
 	}
 }

+// TestRedirectAs_eBPF_IPv4 runs the shared testRedirectAs scenario against the
+// eBPF proxy wrapper with an IPv4 P2P endpoint.
+func TestRedirectAs_eBPF_IPv4(t *testing.T) {
+	listenPort := 51850
+
+	rawProxy := ebpf.NewWGEBPFProxy(listenPort, 1280)
+	listenErr := rawProxy.Listen()
+	if listenErr != nil {
+		t.Fatalf("failed to initialize ebpf proxy: %v", listenErr)
+	}
+	// Deferred after the Listen check, so Free is skipped when Listen fails.
+	defer func() {
+		freeErr := rawProxy.Free()
+		if freeErr != nil {
+			t.Errorf("failed to free ebpf proxy: %v", freeErr)
+		}
+	}()
+
+	wrapped := ebpf.NewProxyWrapper(rawProxy)
+
+	// NetBird UDP address of the remote peer.
+	remotePeer := &net.UDPAddr{IP: net.ParseIP("100.108.111.177"), Port: 38746}
+
+	// Endpoint passed to testRedirectAs as p2pEndpoint.
+	directPeer := &net.UDPAddr{IP: net.ParseIP("192.168.0.56"), Port: 51820}
+
+	testRedirectAs(t, wrapped, listenPort, remotePeer, directPeer)
+}
+
+// TestRedirectAs_eBPF_IPv6 runs the shared testRedirectAs scenario against the
+// eBPF proxy wrapper with an IPv6 P2P endpoint.
+func TestRedirectAs_eBPF_IPv6(t *testing.T) {
+	listenPort := 51851
+
+	rawProxy := ebpf.NewWGEBPFProxy(listenPort, 1280)
+	listenErr := rawProxy.Listen()
+	if listenErr != nil {
+		t.Fatalf("failed to initialize ebpf proxy: %v", listenErr)
+	}
+	// Deferred after the Listen check, so Free is skipped when Listen fails.
+	defer func() {
+		freeErr := rawProxy.Free()
+		if freeErr != nil {
+			t.Errorf("failed to free ebpf proxy: %v", freeErr)
+		}
+	}()
+
+	wrapped := ebpf.NewProxyWrapper(rawProxy)
+
+	// NetBird UDP address of the remote peer.
+	remotePeer := &net.UDPAddr{IP: net.ParseIP("100.108.111.177"), Port: 38746}
+
+	// Endpoint passed to testRedirectAs as p2pEndpoint.
+	directPeer := &net.UDPAddr{IP: net.ParseIP("fe80::56"), Port: 51820}
+
+	testRedirectAs(t, wrapped, listenPort, remotePeer, directPeer)
+}
+
 // TestRedirectAs_Multiple_Switches tests switching between multiple endpoints
 func TestRedirectAs_Multiple_Switches(t *testing.T) {
 	wgPort := 51856
--- a/client/internal/connect.go
+++ b/client/internal/connect.go
@@ -11,6 +11,7 @@ import (
 	"runtime/debug"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"

 	"github.com/cenkalti/backoff/v4"
@@ -18,7 +19,6 @@ import (

 	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
 	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/metadata"
 	gstatus "google.golang.org/grpc/status"

 	"github.com/netbirdio/netbird/client/iface/wgaddr"
@@ -49,23 +49,17 @@ import (
 	"github.com/netbirdio/netbird/version"
 )

-// androidMobileDep is set on Android to inject the MobileDependency for runs
-// started through the generic entry points (Run/RunAsync, e.g. embed.Client).
-// nil on other platforms, where the dependency is empty.
-var androidMobileDep func(config *profilemanager.Config) MobileDependency
-
-// mobileDependency returns the MobileDependency for a run started via the
-// generic entry points. On Android the androidMobileDep provider supplies
-// platform stubs (or real implementations); elsewhere it is empty.
-func (c *ConnectClient) mobileDependency(config *profilemanager.Config) MobileDependency {
-	if androidMobileDep != nil {
-		return androidMobileDep(config)
-	}
-	return MobileDependency{}
-}
+// androidRunOverride is set on Android to inject mobile dependencies
+// when using embed.Client (which calls Run() with empty MobileDependency).
+var androidRunOverride func(c *ConnectClient, runningChan chan struct{}, logPath string) error

 type ConnectClient struct {
 	ctx            context.Context
+	runCancel      context.CancelFunc
+	runExited      chan struct{}
+	runOnce        sync.Once
+	runStarted     atomic.Bool
+	config         *profilemanager.Config
 	statusRecorder *peer.Status

 	engine        *Engine
@@ -74,62 +68,41 @@ type ConnectClient struct {
 	updateManager *updater.Manager

 	persistSyncResponse bool
-
-	// sup serializes all start/stop requests so two lifecycle operations can
-	// never overlap. See connect_lifecycle.go.
-	sup *supervisor
 }

 func NewConnectClient(
 	ctx context.Context,
+	config *profilemanager.Config,
 	statusRecorder *peer.Status,
 ) *ConnectClient {
-	c := &ConnectClient{
-		ctx:            ctx,
+	// Derive the run context here so Stop owns the cancel that unblocks the run
+	// loop. runCancel is set once at construction, so Stop can call it without
+	// racing the run loop's startup. Callers therefore need not cancel before Stop.
+	runCtx, runCancel := context.WithCancel(ctx)
+	return &ConnectClient{
+		ctx:            runCtx,
+		runCancel:      runCancel,
+		runExited:      make(chan struct{}), // closed by run when it returns
+		config:         config,
 		statusRecorder: statusRecorder,
 		engineMutex:    sync.Mutex{},
 	}
-	c.sup = newSupervisor(ctx, c.run)
-	return c
 }

 func (c *ConnectClient) SetUpdateManager(um *updater.Manager) {
 	c.updateManager = um
 }

-// Run with main logic. md carries optional gRPC metadata (e.g. the UI
-// user-agent) to forward to the management/signal services; nil when none.
-func (c *ConnectClient) Run(config *profilemanager.Config, md metadata.MD, logPath string) error {
-	return c.sup.start(config, md, c.mobileDependency(config), logPath)
-}
-
-// RunAsync starts a client run without blocking. Used by the daemon and embed,
-// which drive the lifecycle through the supervisor rather than blocking on Run;
-// they then wait for the outcome via WaitEstablishedOrDone. The run's lifecycle
-// channels are created and owned by the supervisor — callers never hold them.
-func (c *ConnectClient) RunAsync(config *profilemanager.Config, md metadata.MD) {
-	c.sup.startAsync(config, md, c.mobileDependency(config), "", nil)
-}
-
-// Restart atomically stops any in-flight run and starts a fresh one with the
-// given config. The stop+start happens as a single supervisor operation, so no
-// other lifecycle request can interleave between them — used for explicit
-// restarts (e.g. an MDM policy change) that must not expose a "stopped" window.
-func (c *ConnectClient) Restart(config *profilemanager.Config, md metadata.MD) {
-	c.sup.restartAsync(config, md, c.mobileDependency(config), "")
-}
-
-// WaitEstablishedOrDone blocks until the in-flight run becomes established (nil),
-// ends before that (the run error, or a sentinel on a clean stop), or ctx is
-// cancelled. Returns errNoRunInFlight if no run is in flight. Wraps the wait on
-// the supervisor-owned channels so callers never touch them directly.
-func (c *ConnectClient) WaitEstablishedOrDone(ctx context.Context) error {
-	return c.sup.waitEstablishedOrDone(ctx)
+// Run calls androidRunOverride if set, else the run loop with an empty MobileDependency.
+func (c *ConnectClient) Run(runningChan chan struct{}, logPath string) error {
+	if androidRunOverride == nil {
+		return c.run(MobileDependency{}, runningChan, logPath)
+	}
+	return androidRunOverride(c, runningChan, logPath)
 }

 // RunOnAndroid with main logic on mobile system
 func (c *ConnectClient) RunOnAndroid(
-	config *profilemanager.Config,
 	tunAdapter device.TunAdapter,
 	iFaceDiscover stdnet.ExternalIFaceDiscover,
 	networkChangeListener listener.NetworkChangeListener,
@@ -148,11 +121,10 @@ func (c *ConnectClient) RunOnAndroid(
 		StateFilePath:         stateFilePath,
 		TempDir:               cacheDir,
 	}
-	return c.sup.start(config, nil, mobileDependency, "")
+	return c.run(mobileDependency, nil, "")
 }

 func (c *ConnectClient) RunOniOS(
-	config *profilemanager.Config,
 	fileDescriptor int32,
 	networkChangeListener listener.NetworkChangeListener,
 	dnsManager dns.IosDnsManager,
@@ -170,12 +142,15 @@ func (c *ConnectClient) RunOniOS(
 		StateFilePath:         stateFilePath,
 		TempDir:               cacheDir,
 	}
-	return c.sup.start(config, nil, mobileDependency, logFilePath)
+	return c.run(mobileDependency, nil, logFilePath)
 }

-// run executes a single client run. runCtx is owned by the supervisor: cancelling
-// it tears the run down (it is the parent of the per-attempt engine context).
-func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Config, mobileDependency MobileDependency, connEstablishedChan chan struct{}, logPath string) error {
+func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan struct{}, logPath string) error {
+	// Mark the loop as started and signal exit on return so Stop can wait for
+	// the loop to finish (and skip the wait if the loop never ran).
+	c.runStarted.Store(true)
+	defer c.runOnce.Do(func() { close(c.runExited) })
+
 	defer func() {
 		if r := recover(); r != nil {
 			rec := c.statusRecorder
@@ -239,18 +214,18 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 	}()

 	wrapErr := state.Wrap
-	myPrivateKey, err := wgtypes.ParseKey(config.PrivateKey)
+	myPrivateKey, err := wgtypes.ParseKey(c.config.PrivateKey)
 	if err != nil {
-		log.Errorf("failed parsing Wireguard key %s: [%s]", config.PrivateKey, err.Error())
+		log.Errorf("failed parsing Wireguard key %s: [%s]", c.config.PrivateKey, err.Error())
 		return wrapErr(err)
 	}

 	var mgmTlsEnabled bool
-	if config.ManagementURL.Scheme == "https" {
+	if c.config.ManagementURL.Scheme == "https" {
 		mgmTlsEnabled = true
 	}

-	publicSSHKey, err := ssh.GeneratePublicKey([]byte(config.SSHKey))
+	publicSSHKey, err := ssh.GeneratePublicKey([]byte(c.config.SSHKey))
 	if err != nil {
 		return err
 	}
@@ -284,13 +259,13 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 	defer c.statusRecorder.ClientStop()
 	operation := func() error {
 		// if context cancelled we not start new backoff cycle
-		if runCtx.Err() != nil {
+		if c.ctx.Err() != nil {
 			return nil
 		}

 		state.Set(StatusConnecting)

-		engineCtx, cancel := context.WithCancel(runCtx)
+		engineCtx, cancel := context.WithCancel(c.ctx)
 		defer func() {
 			_, err := state.Status()
 			c.statusRecorder.MarkManagementDisconnected(err)
@@ -298,8 +273,8 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 			cancel()
 		}()

-		log.Debugf("connecting to the Management service %s", config.ManagementURL.Host)
-		mgmClient, err := mgm.NewClient(engineCtx, config.ManagementURL.Host, myPrivateKey, mgmTlsEnabled)
+		log.Debugf("connecting to the Management service %s", c.config.ManagementURL.Host)
+		mgmClient, err := mgm.NewClient(engineCtx, c.config.ManagementURL.Host, myPrivateKey, mgmTlsEnabled)
 		if err != nil {
 			return wrapErr(gstatus.Errorf(codes.FailedPrecondition, "failed connecting to Management Service : %s", err))
 		}
@@ -316,7 +291,7 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 		}
 		c.clientMetrics.UpdateAgentInfo(agentInfo, myPrivateKey.PublicKey().String())

-		log.Debugf("connected to the Management service %s", config.ManagementURL.Host)
+		log.Debugf("connected to the Management service %s", c.config.ManagementURL.Host)
 		defer func() {
 			if err = mgmClient.Close(); err != nil {
 				log.Warnf("failed to close the Management service client %v", err)
@@ -325,14 +300,13 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi

 		// connect (just a connection, no stream yet) and login to Management Service to get an initial global Netbird config
 		loginStarted := time.Now()
-		loginResp, err := loginToManagement(engineCtx, mgmClient, publicSSHKey, config)
+		loginResp, err := loginToManagement(engineCtx, mgmClient, publicSSHKey, c.config)
 		if err != nil {
 			c.clientMetrics.RecordLoginDuration(engineCtx, time.Since(loginStarted), false)
 			log.Debug(err)
 			if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
 				state.Set(StatusNeedsLogin)
-				// No teardown needed: login fails before the engine is started
-				// (engine.Start is below), so there is nothing running to stop.
+				c.runCancel()
 				return backoff.Permanent(wrapErr(err)) // unrecoverable error
 			}
 			return wrapErr(err)
@@ -386,7 +360,7 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 		}
 		peerConfig := loginResp.GetPeerConfig()

-		engineConfig, err := createEngineConfig(myPrivateKey, config, peerConfig, logPath)
+		engineConfig, err := createEngineConfig(myPrivateKey, c.config, peerConfig, logPath)
 		if err != nil {
 			log.Error(err)
 			return wrapErr(err)
@@ -430,7 +404,7 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 		c.engine = engine
 		c.engineMutex.Unlock()

-		if err := engine.Start(loginResp.GetNetbirdConfig(), config.ManagementURL); err != nil {
+		if err := engine.Start(loginResp.GetNetbirdConfig(), c.config.ManagementURL); err != nil {
 			log.Errorf("error while starting Netbird Connection Engine: %s", err)
 			return wrapErr(err)
 		}
@@ -438,13 +412,12 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 		log.Infof("Netbird engine started, the IP is: %s", peerConfig.GetAddress())
 		state.Set(StatusConnected)

-		// The supervisor owns connEstablishedChan and it is always present. Guard
-		// against a double close: operation re-runs on ErrResetConnection retries
-		// within the same run, and the channel is closed only on the first connect.
-		select {
-		case <-connEstablishedChan:
-		default:
-			close(connEstablishedChan)
+		if runningChan != nil {
+			select {
+			case <-runningChan:
+			default:
+				close(runningChan)
+			}
 		}

 		<-engineCtx.Done()
@@ -453,10 +426,8 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 		c.engine = nil
 		c.engineMutex.Unlock()

-		// Always tear the engine down once its context is cancelled. engine.Stop
-		// is nil-guarded per component, so calling it unconditionally is safe and
-		// avoids both the data race on engine.wgInterface and skipping teardown
-		// when the interface was never brought up (e.g. a mid-start failure).
+		log.Infof("ensuring wg interface is removed, Netbird engine context cancelled")
+
 		if err := engine.Stop(); err != nil {
 			log.Errorf("Failed to stop engine: %v", err)
 		}
@@ -474,13 +445,12 @@ func (c *ConnectClient) run(runCtx context.Context, config *profilemanager.Confi
 	}

 	c.statusRecorder.ClientStart()
-	err = backoff.Retry(operation, backOff)
+	err = backoff.Retry(operation, backoff.WithContext(backOff, c.ctx))
 	if err != nil {
 		log.Debugf("exiting client retry loop due to unrecoverable error: %s", err)
 		if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
-			// Login failed permanently: the engine was never started, so there
-			// is nothing to tear down — just record that a login is needed.
 			state.Set(StatusNeedsLogin)
+			c.runCancel()
 		}
 		return err
 	}
@@ -501,22 +471,6 @@ func parseRelayInfo(loginResp *mgmProto.LoginResponse) ([]string, *hmac.Token) {
 	return relayCfg.GetUrls(), token
 }

-// ConnectionRunning reports whether a connection run is currently in flight
-// (connecting, connected, or reconnecting). Answered by the supervisor via a
-// serialized query, so it settles behind an in-flight stop. Distinct from
-// ServiceRunning, which reports whether the service itself is alive.
-func (c *ConnectClient) ConnectionRunning() bool {
-	return c.sup.isRunning()
-}
-
-// ServiceRunning reports whether the client's lifecycle supervisor is alive and
-// able to accept start/stop commands — i.e. its context has not been cancelled
-// (the daemon is not shutting down). Independent of whether a connection run is
-// up (that is ConnectionRunning).
-func (c *ConnectClient) ServiceRunning() bool {
-	return c.sup.ctx.Err() == nil
-}
-
 func (c *ConnectClient) Engine() *Engine {
 	if c == nil {
 		return nil
@@ -573,10 +527,12 @@ func (c *ConnectClient) Status() StatusType {
 	return status
 }

-// Stop serializes a stop request through the lifecycle supervisor and blocks
-// until the in-flight run is fully torn down.
 func (c *ConnectClient) Stop() error {
-	return c.sup.stop()
+	c.runCancel() // cancels the run context created in NewConnectClient
+	if c.runStarted.Load() {
+		<-c.runExited // closed by run when it returns; skipped if run never started
+	}
+	return nil
 }

 // SetSyncResponsePersistence enables or disables sync response persistence.
--- a/client/internal/connect_android_default.go
+++ b/client/internal/connect_android_default.go
@@ -7,7 +7,6 @@ import (

 	"github.com/netbirdio/netbird/client/internal/dns"
 	"github.com/netbirdio/netbird/client/internal/listener"
-	"github.com/netbirdio/netbird/client/internal/profilemanager"
 	"github.com/netbirdio/netbird/client/internal/stdnet"
 )

@@ -60,17 +59,19 @@ var _ listener.NetworkChangeListener = noopNetworkChangeListener{}
 var _ dns.ReadyListener = noopDnsReadyListener{}

 func init() {
-	// Wire up the default MobileDependency provider so embed.Client.Start() works
-	// on Android with netstack mode. Provides complete no-op stubs for all mobile
+	// Wire up the default override so embed.Client.Start() works on Android
+	// with netstack mode. Provides complete no-op stubs for all mobile
 	// dependencies so the engine's existing Android code paths work unchanged.
-	// Applications that need P2P ICE or real DNS should replace this by setting
-	// androidMobileDep before calling Start().
-	androidMobileDep = func(config *profilemanager.Config) MobileDependency {
-		return mobileDependencyForEmbed(
+	// To get P2P ICE or real DNS, replace androidRunOverride before calling
+	// Start(). NOTE(review): it is unexported, so only this package can.
+	androidRunOverride = func(c *ConnectClient, runningChan chan struct{}, logPath string) error {
+		return c.runOnAndroidEmbed(
 			noopIFaceDiscover{},
 			noopNetworkChangeListener{},
 			[]netip.AddrPort{},
 			noopDnsReadyListener{},
+			runningChan,
+			logPath,
 		)
 	}
 }
--- a/client/internal/connect_android_embed.go
+++ b/client/internal/connect_android_embed.go
@@ -10,18 +10,23 @@ import (
 	"github.com/netbirdio/netbird/client/internal/stdnet"
 )

-// mobileDependencyForEmbed builds the MobileDependency used by embed.Client on
-// Android so the engine's existing Android code paths work unchanged.
-func mobileDependencyForEmbed(
+// runOnAndroidEmbed is like RunOnAndroid but also accepts a runningChan and
+// a logPath, so embed.Client.Start() can detect when the engine is ready.
+// It sets the IFaceDiscover, NetworkChangeListener, HostDNSAddresses and
+// DnsReadyListener fields of MobileDependency and passes it to c.run.
+func (c *ConnectClient) runOnAndroidEmbed(
 	iFaceDiscover stdnet.ExternalIFaceDiscover,
 	networkChangeListener listener.NetworkChangeListener,
 	dnsAddresses []netip.AddrPort,
 	dnsReadyListener dns.ReadyListener,
-) MobileDependency {
-	return MobileDependency{
+	runningChan chan struct{},
+	logPath string,
+) error {
+	mobileDependency := MobileDependency{
 		IFaceDiscover:         iFaceDiscover,
 		NetworkChangeListener: networkChangeListener,
 		HostDNSAddresses:      dnsAddresses,
 		DnsReadyListener:      dnsReadyListener,
 	}
+	return c.run(mobileDependency, runningChan, logPath)
 }
--- a/client/internal/connect_lifecycle.go
+++ b/client/internal/connect_lifecycle.go
@@ -1,362 +0,0 @@
-package internal
-
-import (
-	"context"
-	"errors"
-
-	"google.golang.org/grpc/metadata"
-
-	"github.com/netbirdio/netbird/client/internal/profilemanager"
-)
-
-// errAlreadyRunning is returned when a start is requested while a run is already
-// in flight.
-var errAlreadyRunning = errors.New("client is already running")
-
-// errNoRunInFlight is returned by waitEstablishedOrDone when no run is active.
-var errNoRunInFlight = errors.New("no connection run in flight")
-
-// errStoppedBeforeEstablished is returned when a run ended (cleanly) before the
-// connection was established.
-var errStoppedBeforeEstablished = errors.New("run stopped before the connection was established")
-
-// lifecycleOp is a serialized lifecycle operation processed by the supervisor.
-type lifecycleOp int
-
-const (
-	opStart lifecycleOp = iota
-	opStop
-	opRestart
-	opStatus
-	opWaitEstablished
-)
-
-// lifecycleCmd is a single lifecycle request handed to the supervisor goroutine.
-// They all flow through the same cmdCh so they are strictly ordered (FIFO) with
-// respect to each other.
-type lifecycleCmd struct {
-	op        lifecycleOp
-	config    *profilemanager.Config
-	md        metadata.MD
-	mobileDep MobileDependency
-	logPath   string
-
-	// done is the caller's notification channel (nil for fire-and-forget). Its
-	// meaning depends on op:
-	//   - opStart: receives the run's end result when the run terminates, or
-	//     errAlreadyRunning immediately if a run is already in flight.
-	//   - opStop: receives nil once the in-flight run has fully unwound.
-	//   - opWaitEstablished: receives the wait outcome (see waitEstablishedOrDone).
-	done chan error
-
-	reply   chan bool       // opStatus only: receives whether a run is in flight
-	waitCtx context.Context // opWaitEstablished only: the waiter's cancellation context
-}
-
-// runState holds the lifecycle channels of a single in-flight run, owned by the
-// loop goroutine. It never escapes the supervisor as an API; the only readers
-// are the per-wait goroutines the loop spawns for opWaitEstablished.
-//
-// connEstablishedChan is closed by the run once the connection is established.
-// The supervisor creates and owns it — callers no longer supply it; they observe
-// it through waitEstablishedOrDone. ended is closed (broadcast) when the run
-// terminates, so any number of waiters can observe it; err is the run's end
-// result, valid only after ended is closed.
-type runState struct {
-	connEstablishedChan chan struct{} // closed by the run on established
-	ended               chan struct{} // closed by finishRun when the run terminates
-	err                 error         // run end result, valid after ended is closed
-}
-
-// runEndResult is sent by the run goroutine to the supervisor when a run ends,
-// whether on its own (error / external context cancellation) or because of a Stop.
-type runEndResult struct {
-	err error
-}
-
-// runFunc executes a single client run bound to the supervisor-owned context,
-// with the config supplied by the start request.
-type runFunc func(ctx context.Context, config *profilemanager.Config, mobileDep MobileDependency, connEstablishedChan chan struct{}, logPath string) error
-
-// supervisor serializes start/stop of a single client run. Every request goes
-// through cmdCh and is handled one at a time by the loop goroutine, so two
-// lifecycle operations can never overlap and their order is preserved (FIFO).
-// The loop goroutine is the sole owner of curStart/runCancel, so that state
-// needs no locking. The loop exits when the parent context is cancelled.
-type supervisor struct {
-	ctx      context.Context
-	run      runFunc
-	cmdCh    chan lifecycleCmd
-	runEnded chan runEndResult
-
-	// owned exclusively by the loop goroutine. curStart is the in-flight start
-	// command (nil = idle); its done channel is notified when the run ends.
-	// curRun holds that run's lifecycle channels; runCancel cancels it.
-	curStart  *lifecycleCmd
-	curRun    *runState
-	runCancel context.CancelFunc
-}
-
-func newSupervisor(ctx context.Context, run runFunc) *supervisor {
-	s := &supervisor{
-		ctx:      ctx,
-		run:      run,
-		cmdCh:    make(chan lifecycleCmd, 16),
-		runEnded: make(chan runEndResult, 1),
-	}
-	go s.loop()
-	return s
-}
-
-func (s *supervisor) loop() {
-	for {
-		select {
-		case <-s.ctx.Done():
-			s.shutdown()
-			return
-		case cmd := <-s.cmdCh:
-			switch cmd.op {
-			case opStart:
-				s.handleStart(cmd)
-			case opStop:
-				s.handleStop(cmd)
-			case opRestart:
-				s.handleRestart(cmd)
-			case opStatus:
-				cmd.reply <- (s.isRunningInternal())
-			case opWaitEstablished:
-				s.handleWaitEstablished(cmd)
-			}
-		case res := <-s.runEnded:
-			// Run ended on its own, without an explicit Stop.
-			s.finishRun(res.err)
-		}
-	}
-}
-
-func (s *supervisor) handleStart(cmd lifecycleCmd) {
-	if s.isRunningInternal() {
-		notify(cmd.done, errAlreadyRunning)
-		return
-	}
-
-	runCtx, cancel := context.WithCancel(s.ctx)
-	if cmd.md != nil {
-		// Carry caller-supplied gRPC metadata (e.g. UI user-agent) into the run
-		// context so the engine's management/signal calls forward it. The cancel
-		// still drives runCtx (metadata wrapping preserves cancellation).
-		runCtx = metadata.NewOutgoingContext(runCtx, cmd.md)
-	}
-	s.runCancel = cancel
-	s.curStart = &cmd
-	s.curRun = &runState{connEstablishedChan: make(chan struct{}), ended: make(chan struct{})}
-
-	go func(ctx context.Context, cfg *profilemanager.Config, m MobileDependency, established chan struct{}, lp string) {
-		err := s.run(ctx, cfg, m, established, lp)
-		s.runEnded <- runEndResult{err: err}
-	}(runCtx, cmd.config, cmd.mobileDep, s.curRun.connEstablishedChan, cmd.logPath)
-}
-
-func (s *supervisor) handleStop(cmd lifecycleCmd) {
-	if !s.isRunningInternal() {
-		notify(cmd.done, nil)
-		return
-	}
-	s.stopCurrentRun()
-	notify(cmd.done, nil)
-}
-
-// handleRestart tears down any in-flight run and starts a fresh one in a single
-// loop turn. No other command can interleave between the stop and the start
-// (the loop is single-threaded), so the swap is atomic without relying on any
-// daemon-side lock — that is what an explicit restart (e.g. MDM config change)
-// needs to avoid a window where the client is observably stopped.
-func (s *supervisor) handleRestart(cmd lifecycleCmd) {
-	if s.isRunningInternal() {
-		s.stopCurrentRun()
-	}
-	s.handleStart(cmd)
-}
-
-// stopCurrentRun cancels the in-flight run and blocks the supervisor until it
-// has fully unwound, so the next action starts from a clean slate. The run
-// goroutine reports completion via runEnded. Caller must hold an in-flight run
-// (curStart != nil).
-func (s *supervisor) stopCurrentRun() {
-	s.runCancel()
-	res := <-s.runEnded
-	s.finishRun(res.err)
-}
-
-// finishRun resets lifecycle state after a run terminates and hands the run
-// error back to whoever asked to be notified of the start.
-func (s *supervisor) finishRun(err error) {
-	s.runCancel = nil
-	if s.isRunningInternal() {
-		// Publish the result to the broadcast channel before nil-ing curRun, so
-		// any opWaitEstablished goroutines blocked on ended observe err.
-		s.curRun.err = err
-		close(s.curRun.ended)
-		s.curRun = nil
-
-		notify(s.curStart.done, err)
-		s.curStart = nil
-	}
-}
-
-// handleWaitEstablished answers an opWaitEstablished request. The select itself
-// runs in a spawned goroutine on the run's channels so it never blocks the loop;
-// the loop only snapshots the in-flight run's channels (which it owns) here.
-func (s *supervisor) handleWaitEstablished(cmd lifecycleCmd) {
-	caller := cmd.done
-	if !s.isRunningInternal() {
-		notify(caller, errNoRunInFlight)
-		return
-	}
-	rs := s.curRun
-	established := rs.connEstablishedChan
-	ctx := cmd.waitCtx
-	go func() {
-		select {
-		case <-established:
-			notify(caller, nil)
-		case <-rs.ended:
-			if rs.err != nil {
-				notify(caller, rs.err)
-				return
-			}
-			notify(caller, errStoppedBeforeEstablished)
-		case <-ctx.Done():
-			notify(caller, ctx.Err())
-		}
-	}()
-}
-
-// shutdown tears down the in-flight run when the parent context is cancelled,
-// then fails any still-queued commands so their callers never hang.
-func (s *supervisor) shutdown() {
-	if s.runCancel != nil {
-		s.runCancel()
-		res := <-s.runEnded
-		s.finishRun(res.err)
-	}
-	for {
-		select {
-		case cmd := <-s.cmdCh:
-			notify(cmd.done, s.ctx.Err())
-		default:
-			return
-		}
-	}
-}
-
-// startAsync enqueues a start without blocking. If done is non-nil it receives
-// the run's end result (or errAlreadyRunning on rejection, or the context error
-// on shutdown).
-func (s *supervisor) startAsync(config *profilemanager.Config, md metadata.MD, mobileDep MobileDependency, logPath string, done chan error) {
-	cmd := lifecycleCmd{op: opStart, config: config, md: md, mobileDep: mobileDep, logPath: logPath, done: done}
-	select {
-	case s.cmdCh <- cmd:
-	case <-s.ctx.Done():
-		notify(done, s.ctx.Err())
-	}
-}
-
-// restartAsync enqueues an atomic stop+start without blocking. The supervisor
-// tears down any in-flight run and starts a fresh one with the supplied config
-// in a single loop turn (see handleRestart). Fire-and-forget: the new run owns
-// its lifecycle channels, observed via waitEstablishedOrDone.
-func (s *supervisor) restartAsync(config *profilemanager.Config, md metadata.MD, mobileDep MobileDependency, logPath string) {
-	cmd := lifecycleCmd{op: opRestart, config: config, md: md, mobileDep: mobileDep, logPath: logPath}
-	select {
-	case s.cmdCh <- cmd:
-	case <-s.ctx.Done():
-	}
-}
-
-// start enqueues a start and blocks until the run terminates, preserving the
-// blocking contract of the legacy Run entry points.
-func (s *supervisor) start(config *profilemanager.Config, md metadata.MD, mobileDep MobileDependency, logPath string) error {
-	done := make(chan error, 1)
-	s.startAsync(config, md, mobileDep, logPath, done)
-	select {
-	case err := <-done:
-		return err
-	case <-s.ctx.Done():
-		return s.ctx.Err()
-	}
-}
-
-// isRunning asks the loop whether a run is in flight. The query is serialized
-// with start/stop, so during a stop it waits for the teardown to settle and
-// then reports the final state — never a transient "half-stopped".
-func (s *supervisor) isRunning() bool {
-	reply := make(chan bool, 1)
-	select {
-	case s.cmdCh <- lifecycleCmd{op: opStatus, reply: reply}:
-	case <-s.ctx.Done():
-		return false
-	}
-	select {
-	case r := <-reply:
-		return r
-	case <-s.ctx.Done():
-		return false
-	}
-}
-
-func (s *supervisor) isRunningInternal() bool {
-	return s.curStart != nil
-}
-
-// waitEstablishedOrDone blocks until the in-flight run becomes established
-// (returns nil) or ends before that (returns the run error, or
-// errStoppedBeforeEstablished on a clean stop), or ctx is cancelled. Returns
-// errNoRunInFlight if no run is in flight. The wait is performed by a goroutine
-// spawned inside the loop (see handleWaitEstablished); the run's channels never
-// leave the supervisor.
-func (s *supervisor) waitEstablishedOrDone(ctx context.Context) error {
-	reply := make(chan error, 1)
-	select {
-	case s.cmdCh <- lifecycleCmd{op: opWaitEstablished, waitCtx: ctx, done: reply}:
-	case <-ctx.Done():
-		return ctx.Err()
-	case <-s.ctx.Done():
-		return s.ctx.Err()
-	}
-	select {
-	case err := <-reply:
-		return err
-	case <-s.ctx.Done():
-		return s.ctx.Err()
-	}
-}
-
-// stop enqueues a stop and blocks until the in-flight run is fully torn down.
-func (s *supervisor) stop() error {
-	done := make(chan error, 1)
-	select {
-	case s.cmdCh <- lifecycleCmd{op: opStop, done: done}:
-	case <-s.ctx.Done():
-		return s.ctx.Err()
-	}
-	select {
-	case err := <-done:
-		return err
-	case <-s.ctx.Done():
-		return s.ctx.Err()
-	}
-}
-
-// notify sends on a caller-supplied channel without blocking. The channel is
-// expected to be buffered (cap 1); a nil channel means the caller did not ask
-// to be notified.
-func notify(ch chan error, err error) {
-	if ch == nil {
-		return
-	}
-	select {
-	case ch <- err:
-	default:
-	}
-}
--- a/client/internal/dns/mgmt/mgmt.go
+++ b/client/internal/dns/mgmt/mgmt.go
@@ -51,13 +51,20 @@ type cachedRecord struct {
 }

 // Resolver caches critical NetBird infrastructure domains.
-// records, refreshing, mgmtDomain and serverDomains are all guarded by mutex.
+// records, refreshing, failedResolves, mgmtDomain and serverDomains are all
+// guarded by mutex.
 type Resolver struct {
 	records       map[dns.Question]*cachedRecord
 	mgmtDomain    *domain.Domain
 	serverDomains *dnsconfig.ServerDomains
 	mutex         sync.RWMutex

+	// failedResolves records the last failed initial resolve per domain so a
+	// domain that never resolves isn't retried on every server-domains update
+	// until refreshBackoff elapses. Entries are cleared on success and pruned
+	// to the current server-domains set.
+	failedResolves map[domain.Domain]time.Time
+
 	chain            ChainResolver
 	chainMaxPriority int
 	refreshGroup     singleflight.Group
@@ -76,9 +83,10 @@ type Resolver struct {
 // NewResolver creates a new management domains cache resolver.
 func NewResolver() *Resolver {
 	return &Resolver{
-		records:    make(map[dns.Question]*cachedRecord),
-		refreshing: make(map[dns.Question]*atomic.Bool),
-		cacheTTL:   resolveCacheTTL(),
+		records:        make(map[dns.Question]*cachedRecord),
+		refreshing:     make(map[dns.Question]*atomic.Bool),
+		failedResolves: make(map[domain.Domain]time.Time),
+		cacheTTL:       resolveCacheTTL(),
 	}
 }

@@ -173,7 +181,9 @@ func (m *Resolver) continueToNext(w dns.ResponseWriter, r *dns.Msg) {

 // AddDomain resolves a domain and stores its A/AAAA records in the cache.
 // A family that resolves NODATA (nil err, zero records) evicts any stale
-// entry for that qtype.
+// entry for that qtype. When one family hard-errors while the other succeeds,
+// the resolved family is still cached but AddDomain returns an error so the
+// caller retries the incomplete resolve rather than treating it as complete.
 func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
 	dnsName := strings.ToLower(dns.Fqdn(d.PunycodeString()))

@@ -203,6 +213,10 @@ func (m *Resolver) AddDomain(ctx context.Context, d domain.Domain) error {
 	log.Debugf("added/updated domain=%s with %d A records and %d AAAA records",
 		d.SafeString(), len(aRecords), len(aaaaRecords))

+	if errA != nil || errAAAA != nil {
+		return fmt.Errorf("resolve %s: incomplete, a family failed: %w", d.SafeString(), errors.Join(errA, errAAAA))
+	}
+
 	return nil
 }

@@ -462,6 +476,7 @@ func (m *Resolver) RemoveDomain(d domain.Domain) error {
 	delete(m.records, qAAAA)
 	delete(m.refreshing, qA)
 	delete(m.refreshing, qAAAA)
+	delete(m.failedResolves, d)

 	log.Debugf("removed domain=%s from cache", d.SafeString())
 	return nil
@@ -505,6 +520,7 @@ func (m *Resolver) UpdateFromServerDomains(ctx context.Context, serverDomains dn
 		allDomains := m.extractDomainsFromServerDomains(updatedServerDomains)
 		currentDomains := m.GetCachedDomains()
 		removedDomains = m.removeStaleDomains(currentDomains, allDomains)
+		m.pruneFailedResolves(allDomains)
 	}

 	m.addNewDomains(ctx, newDomains)
@@ -577,13 +593,85 @@ func (m *Resolver) isManagementDomain(domain domain.Domain) bool {
 	return m.mgmtDomain != nil && domain == *m.mgmtDomain
 }

-// addNewDomains resolves and caches all domains from the update
+// addNewDomains resolves and caches domains that are not yet in the cache,
+// running the lookups concurrently. Domains already cached are skipped and left
+// to the stale-while-revalidate refresh path, so a sync never re-resolves them
+// synchronously: once NetBird owns the OS resolver the resolve runs through the
+// handler chain and would otherwise dial the managed upstreams under the engine
+// sync lock on every update.
 func (m *Resolver) addNewDomains(ctx context.Context, newDomains domain.List) {
+	var wg sync.WaitGroup
+	seen := make(map[domain.Domain]struct{}, len(newDomains))
 	for _, newDomain := range newDomains {
-		if err := m.AddDomain(ctx, newDomain); err != nil {
-			log.Warnf("failed to add/update domain=%s: %v", newDomain.SafeString(), err)
-		} else {
-			log.Debugf("added/updated management cache domain=%s", newDomain.SafeString())
+		if _, dup := seen[newDomain]; dup {
+			continue
+		}
+		seen[newDomain] = struct{}{}
+
+		if !m.needsResolve(newDomain) {
+			continue
+		}
+
+		wg.Add(1)
+		go func(d domain.Domain) {
+			defer wg.Done()
+			if err := m.AddDomain(ctx, d); err != nil {
+				m.markResolveFailed(d)
+				log.Warnf("failed to add/update domain=%s: %v", d.SafeString(), err)
+				return
+			}
+			m.clearResolveFailed(d)
+			log.Debugf("added/updated management cache domain=%s", d.SafeString())
+		}(newDomain)
+	}
+	wg.Wait()
+}
+
+// needsResolve decides whether d must be looked up during this update. When a
+// failure marker exists for d, the answer depends only on whether
+// refreshBackoff has passed since the marker was set, regardless of any cached
+// record. Without a marker, d needs a resolve only if neither its A nor its
+// AAAA question has an entry in the records cache.
+func (m *Resolver) needsResolve(d domain.Domain) bool {
+	fqdn := strings.ToLower(dns.Fqdn(d.PunycodeString()))
+	qA := dns.Question{Name: fqdn, Qtype: dns.TypeA, Qclass: dns.ClassINET}
+	qAAAA := dns.Question{Name: fqdn, Qtype: dns.TypeAAAA, Qclass: dns.ClassINET}
+
+	m.mutex.RLock()
+	defer m.mutex.RUnlock()
+
+	// A recorded failure takes precedence over cache contents.
+	if lastFailure, failed := m.failedResolves[d]; failed {
+		backoffElapsed := time.Since(lastFailure) >= refreshBackoff
+		return backoffElapsed
+	}
+
+	_, hasA := m.records[qA]
+	_, hasAAAA := m.records[qAAAA]
+	return !hasA && !hasAAAA
+}
+
+func (m *Resolver) markResolveFailed(d domain.Domain) {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+	m.failedResolves[d] = time.Now() // starts the refreshBackoff window checked by needsResolve
+}
+
+func (m *Resolver) clearResolveFailed(d domain.Domain) {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+	delete(m.failedResolves, d) // no-op when d has no failure marker
+}
+
+// pruneFailedResolves drops failure markers for domains no longer present in
+// the server-domains set, keeping the map bounded to the current set (a
+// failed-only domain has no cached record, so RemoveDomain never sees it).
+func (m *Resolver) pruneFailedResolves(domains domain.List) {
+	m.mutex.Lock()
+	defer m.mutex.Unlock()
+	for d := range m.failedResolves {
+		if !slices.Contains(domains, d) {
+			delete(m.failedResolves, d)
 		}
 	}
 }
--- a/client/internal/dns/mgmt/mgmt_refresh_test.go
+++ b/client/internal/dns/mgmt/mgmt_refresh_test.go
@@ -21,6 +21,7 @@ type fakeChain struct {
 	mu       sync.Mutex
 	calls    map[string]int
 	answers  map[string][]dns.RR
+	qErr     map[string]error
 	err      error
 	hasRoot  bool
 	onLookup func()
@@ -30,6 +31,7 @@ func newFakeChain() *fakeChain {
 	return &fakeChain{
 		calls:   map[string]int{},
 		answers: map[string][]dns.RR{},
+		qErr:    map[string]error{},
 		hasRoot: true,
 	}
 }
@@ -47,6 +49,9 @@ func (f *fakeChain) ResolveInternal(ctx context.Context, msg *dns.Msg, maxPriori
 	f.calls[key]++
 	answers := f.answers[key]
 	err := f.err
+	if err == nil {
+		err = f.qErr[key]
+	}
 	onLookup := f.onLookup
 	f.mu.Unlock()

@@ -75,6 +80,12 @@ func (f *fakeChain) setAnswer(name string, qtype uint16, ip string) {
 	}
 }

+func (f *fakeChain) setErr(name string, qtype uint16, err error) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.qErr[name+"|"+dns.TypeToString[qtype]] = err
+}
+
 func (f *fakeChain) callCount(name string, qtype uint16) int {
 	f.mu.Lock()
 	defer f.mu.Unlock()
--- a/client/internal/dns/mgmt/mgmt_resolve_test.go
+++ b/client/internal/dns/mgmt/mgmt_resolve_test.go
@@ -0,0 +1,183 @@
+package mgmt
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/miekg/dns"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	dnsconfig "github.com/netbirdio/netbird/client/internal/dns/config"
+	"github.com/netbirdio/netbird/shared/management/domain"
+)
+
+// A domain already in the cache must not be re-resolved on a subsequent server
+// domains update; it is left to the stale-while-revalidate refresh path.
+func TestResolver_UpdateFromServerDomains_SkipsCached(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("signal.example.com.", dns.TypeA, "10.0.0.2")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"first update must resolve the domain")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"cached domain must not be re-resolved on a subsequent update")
+}
+
+// New domains in a single update must resolve concurrently rather than serially.
+func TestResolver_AddNewDomains_ResolvesConcurrently(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+
+	var inflight, maxInflight atomic.Int32
+	chain.onLookup = func() {
+		n := inflight.Add(1)
+		for {
+			old := maxInflight.Load()
+			if n <= old || maxInflight.CompareAndSwap(old, n) {
+				break
+			}
+		}
+		time.Sleep(50 * time.Millisecond)
+		inflight.Add(-1)
+	}
+
+	relays := []domain.Domain{"a.example.com", "b.example.com", "c.example.com", "d.example.com"}
+	for _, d := range relays {
+		chain.setAnswer(dns.Fqdn(string(d)), dns.TypeA, "10.0.0.2")
+	}
+	r.SetChainResolver(chain, 50)
+
+	start := time.Now()
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: relays})
+	require.NoError(t, err)
+	elapsed := time.Since(start)
+
+	assert.GreaterOrEqual(t, int(maxInflight.Load()), 2, "domains must resolve concurrently")
+	// NOTE(review): serial resolution of 4 domains needs only >= 4*50ms = 200ms, which is below this 300ms bound; treat it as a coarse guard.
+	assert.Less(t, elapsed, 300*time.Millisecond, "resolution should not be serial")
+}
+
+// A domain that fails to resolve must not be retried on every update; the
+// failure backoff suppresses re-resolution until it expires.
+func TestResolver_UpdateFromServerDomains_BacksOffFailures(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.err = errors.New("resolve boom")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{Signal: domain.Domain("signal.example.com")}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	require.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"first update must attempt the resolve")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("signal.example.com.", dns.TypeA),
+		"failed resolve must back off and not retry on the next update")
+}
+
+// A domain listed under more than one server-domain type (e.g. STUN and TURN on
+// the same host) must be resolved once per update, not once per occurrence.
+func TestResolver_AddNewDomains_DedupesDuplicateDomains(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("dup.example.com.", dns.TypeA, "10.0.0.9")
+	r.SetChainResolver(chain, 50)
+
+	sd := dnsconfig.ServerDomains{
+		Stuns: []domain.Domain{"dup.example.com"},
+		Turns: []domain.Domain{"dup.example.com"},
+	}
+
+	_, err := r.UpdateFromServerDomains(context.Background(), sd)
+	require.NoError(t, err)
+	assert.Equal(t, 1, chain.callCount("dup.example.com.", dns.TypeA),
+		"a domain appearing under multiple server-domain types must resolve once")
+}
+
+// A failure marker must be dropped once its domain leaves the server-domains set
+// so the map stays bounded to the current set.
+func TestResolver_UpdateFromServerDomains_PrunesFailedResolves(t *testing.T) {
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.err = errors.New("resolve boom")
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("gone.example.com")})
+	require.NoError(t, err)
+	r.mutex.RLock()
+	_, marked := r.failedResolves[domain.Domain("gone.example.com")]
+	r.mutex.RUnlock()
+	require.True(t, marked, "failed resolve must be recorded")
+
+	_, err = r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Signal: domain.Domain("other.example.com")})
+	require.NoError(t, err)
+	r.mutex.RLock()
+	_, stillMarked := r.failedResolves[domain.Domain("gone.example.com")]
+	r.mutex.RUnlock()
+	assert.False(t, stillMarked, "failure marker for a domain no longer in the set must be pruned")
+}
+
+// When one family hard-errors while the other resolves, the domain is cached
+// for the working family but recorded as incomplete so the failed family is
+// retried under backoff instead of being treated as fully resolved forever.
+func TestResolver_AddNewDomains_RetriesPartialFamilyFailure(t *testing.T) {
+	d := domain.Domain("relay.example.com")
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("relay.example.com.", dns.TypeA, "10.0.0.2")
+	chain.setErr("relay.example.com.", dns.TypeAAAA, errors.New("servfail"))
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
+	require.NoError(t, err)
+
+	r.mutex.RLock()
+	_, aCached := r.records[dns.Question{Name: "relay.example.com.", Qtype: dns.TypeA, Qclass: dns.ClassINET}]
+	_, marked := r.failedResolves[d]
+	r.mutex.RUnlock()
+	require.True(t, aCached, "the working family must still be cached")
+	require.True(t, marked, "a partial failure must be recorded so the failed family is retried")
+
+	assert.False(t, r.needsResolve(d), "within the backoff window the domain is not retried")
+
+	r.mutex.Lock()
+	r.failedResolves[d] = time.Now().Add(-2 * refreshBackoff)
+	r.mutex.Unlock()
+	assert.True(t, r.needsResolve(d), "after the backoff elapses the domain is retried to pick up the missing family")
+}
+
+// A family that returns NODATA (legitimately absent, e.g. an IPv4-only host) is
+// not a failure: the domain must not be marked for retry, otherwise it would be
+// re-resolved on every sync.
+func TestResolver_AddNewDomains_NodataIsNotFailure(t *testing.T) {
+	d := domain.Domain("v4only.example.com")
+	r := NewResolver()
+	chain := newFakeChain()
+	chain.setAnswer("v4only.example.com.", dns.TypeA, "10.0.0.2")
+	r.SetChainResolver(chain, 50)
+
+	_, err := r.UpdateFromServerDomains(context.Background(), dnsconfig.ServerDomains{Relay: []domain.Domain{d}})
+	require.NoError(t, err)
+
+	r.mutex.RLock()
+	_, marked := r.failedResolves[d]
+	r.mutex.RUnlock()
+	assert.False(t, marked, "a NODATA family must not be recorded as a failure")
+	assert.False(t, r.needsResolve(d), "an IPv4-only host must not be re-resolved on later syncs")
+}
--- a/client/internal/dns/resutil/resolve.go
+++ b/client/internal/dns/resutil/resolve.go
@@ -207,3 +207,35 @@ func FormatAnswers(answers []dns.RR) string {
 	}
 	return "[" + strings.Join(parts, ", ") + "]"
 }
+
+// StripOPT drops every OPT pseudo-RR from msg.Extra, compacting the slice in
+// place and preserving the order of the remaining records. Per RFC 6891 a
+// responder must not send an OPT RR to a client that did not advertise EDNS0.
+func StripOPT(msg *dns.Msg) {
+	if len(msg.Extra) == 0 {
+		return
+	}
+	kept := 0
+	for _, rr := range msg.Extra {
+		if _, isOPT := rr.(*dns.OPT); !isOPT {
+			msg.Extra[kept] = rr
+			kept++
+		}
+	}
+	msg.Extra = msg.Extra[:kept]
+}
+
+// ExtractEDE looks up the message's OPT record and returns the first Extended
+// DNS Error (RFC 8914) option it carries; the bool is false when none is found.
+func ExtractEDE(msg *dns.Msg) (*dns.EDNS0_EDE, bool) {
+	optRR := msg.IsEdns0()
+	if optRR == nil {
+		return nil, false
+	}
+	for i := range optRR.Option {
+		if ede, isEDE := optRR.Option[i].(*dns.EDNS0_EDE); isEDE {
+			return ede, true
+		}
+	}
+	return nil, false
+}
--- a/client/internal/dns/resutil/resolve_test.go
+++ b/client/internal/dns/resutil/resolve_test.go
@@ -120,3 +120,42 @@ func TestLookupIP_DNSErrorNotIsNotFound(t *testing.T) {

 	assert.Equal(t, dns.RcodeServerFailure, result.Rcode, "upstream failure should map to SERVFAIL")
 }
+
+func TestStripOPT(t *testing.T) {
+	rm := &dns.Msg{
+		Extra: []dns.RR{
+			&dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}},
+			&dns.A{Hdr: dns.RR_Header{Name: "x.", Rrtype: dns.TypeA}, A: net.IPv4(1, 2, 3, 4)},
+		},
+	}
+	StripOPT(rm)
+	assert.Len(t, rm.Extra, 1, "OPT should be removed, A kept")
+	_, isOPT := rm.Extra[0].(*dns.OPT)
+	assert.False(t, isOPT, "remaining record must not be OPT")
+}
+
+func TestExtractEDE(t *testing.T) {
+	t.Run("no edns", func(t *testing.T) {
+		_, ok := ExtractEDE(&dns.Msg{})
+		assert.False(t, ok, "message without OPT has no EDE")
+	})
+
+	t.Run("edns without ede", func(t *testing.T) {
+		rm := &dns.Msg{}
+		rm.SetEdns0(4096, false)
+		_, ok := ExtractEDE(rm)
+		assert.False(t, ok, "OPT without EDE option returns false")
+	})
+
+	t.Run("with ede", func(t *testing.T) {
+		rm := &dns.Msg{}
+		opt := &dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}}
+		opt.Option = append(opt.Option, &dns.EDNS0_EDE{InfoCode: 49152, ExtraText: "upstream timeout"})
+		rm.Extra = append(rm.Extra, opt)
+
+		ede, ok := ExtractEDE(rm)
+		assert.True(t, ok, "EDE option should be found")
+		assert.Equal(t, uint16(49152), ede.InfoCode)
+		assert.Equal(t, "upstream timeout", ede.ExtraText)
+	})
+}
--- a/client/internal/dns/server.go
+++ b/client/internal/dns/server.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"net/netip"
 	"net/url"
+	"os"
 	"slices"
 	"strings"
 	"sync"
@@ -38,11 +39,15 @@ const (
 	// defaultWarningDelayBase is the starting grace window before a
 	// "Nameserver group unreachable" event fires for a group that's
 	// never been healthy and only has overlay upstreams with no
-	// Connected peer. Per-server and overridable; see warningDelayFor.
-	defaultWarningDelayBase = 30 * time.Second
+	// Connected peer. Per-server and overridable via envWarningDelay;
+	// see warningDelay.
+	defaultWarningDelayBase = 60 * time.Second
 	// warningDelayBonusCap caps the route-count bonus added to the
-	// base grace window. See warningDelayFor.
+	// base grace window. See warningDelay.
 	warningDelayBonusCap = 30 * time.Second
+	// envWarningDelay overrides defaultWarningDelayBase with a Go duration
+	// string (e.g. "90s", "2m"). Invalid or non-positive values are ignored.
+	envWarningDelay = "NB_DNS_HEALTH_WARNING_DELAY"
 )

 // errNoUsableNameservers signals that a merged-domain group has no usable
@@ -135,7 +140,7 @@ type DefaultServer struct {
 	disableSys         bool
 	mux                sync.Mutex
 	service            service
-	dnsMuxMap          registeredHandlerMap
+	dnsMuxHandlers     []handlerWrapper
 	localResolver      *local.Resolver
 	wgInterface        WGIface
 	hostManager        hostManager
@@ -199,8 +204,6 @@ type handlerWrapper struct {
 	priority int
 }

-type registeredHandlerMap map[types.HandlerID]handlerWrapper
-
 // DefaultServerConfig holds configuration parameters for NewDefaultServer
 type DefaultServerConfig struct {
 	WgInterface    WGIface
@@ -289,7 +292,6 @@ func newDefaultServer(
 		service:           dnsService,
 		handlerChain:      handlerChain,
 		extraDomains:      make(map[domain.Domain]int),
-		dnsMuxMap:         make(registeredHandlerMap),
 		localResolver:     local.NewResolver(),
 		wgInterface:       wgInterface,
 		statusRecorder:    statusRecorder,
@@ -298,7 +300,7 @@ func newDefaultServer(
 		hostManager:       &noopHostConfigurator{},
 		mgmtCacheResolver: mgmtCacheResolver,
 		currentConfigHash: ^uint64(0), // Initialize to max uint64 to ensure first config is always applied
-		warningDelayBase:  defaultWarningDelayBase,
+		warningDelayBase:  warningDelayBaseFromEnv(),
 		healthRefresh:     make(chan struct{}, 1),
 	}
 	// Wire the local resolver against the peer status recorder so it can
@@ -328,7 +330,7 @@ func (s *DefaultServer) SetRouteSources(selected, active func() route.HAMap) {
 	type routeSettable interface {
 		setSelectedRoutes(func() route.HAMap)
 	}
-	for _, entry := range s.dnsMuxMap {
+	for _, entry := range s.dnsMuxHandlers {
 		if h, ok := entry.handler.(routeSettable); ok {
 			h.setSelectedRoutes(selected)
 		}
@@ -978,19 +980,23 @@ func (s *DefaultServer) usableNameServers(nameServers []nbdns.NameServer) []neti

 func (s *DefaultServer) updateMux(muxUpdates []handlerWrapper) {
 	// this will introduce a short period of time when the server is not able to handle DNS requests
-	for _, existing := range s.dnsMuxMap {
+	for _, existing := range s.dnsMuxHandlers {
 		s.deregisterHandler([]string{existing.domain}, existing.priority)
-		existing.handler.Stop()
+		// The local resolver is a persistent singleton shared by every custom
+		// zone and reused across config updates. Its chain registrations are
+		// per-config and must be deregistered, but Stop() cancels its lookup
+		// context (breaking external CNAME-target resolution) and clears its
+		// records, so it must not be torn down here.
+		if existing.handler != s.localResolver {
+			existing.handler.Stop()
+		}
 	}

-	muxUpdateMap := make(registeredHandlerMap)
-
 	for _, update := range muxUpdates {
 		s.registerHandler([]string{update.domain}, update.handler, update.priority)
-		muxUpdateMap[update.handler.ID()] = update
 	}

-	s.dnsMuxMap = muxUpdateMap
+	s.dnsMuxHandlers = muxUpdates
 }

 // updateNSGroupStates records the new group set and pokes the refresher.
@@ -1154,6 +1160,26 @@ func (s *DefaultServer) projectUnhealthy(p *nsGroupProj, servers []netip.AddrPor
 	return false
 }

+// warningDelayBaseFromEnv reads envWarningDelay and returns it as the base
+// grace window when it parses as a strictly positive Go duration. An unset or
+// empty variable, a parse error, or a value <= 0 yields defaultWarningDelayBase.
+func warningDelayBaseFromEnv() time.Duration {
+	raw := os.Getenv(envWarningDelay)
+	if raw == "" {
+		return defaultWarningDelayBase
+	}
+	parsed, parseErr := time.ParseDuration(raw)
+	switch {
+	case parseErr != nil:
+		log.Warnf("invalid %s value %q, using default %v: %v", envWarningDelay, raw, defaultWarningDelayBase, parseErr)
+	case parsed <= 0:
+		log.Warnf("%s must be positive, got %v, using default %v", envWarningDelay, parsed, defaultWarningDelayBase)
+	default:
+		return parsed
+	}
+	return defaultWarningDelayBase
+}
+
 // warningDelay returns the grace window for the given selected-route
 // count. Scales gently: +1s per 100 routes, capped by
 // warningDelayBonusCap. Parallel handshakes mean handshake time grows
@@ -1204,7 +1230,7 @@ func (s *DefaultServer) groupHasImmediateUpstream(servers []netip.AddrPort, snap
 // in more than one handler.
 func (s *DefaultServer) collectUpstreamHealth() map[netip.AddrPort]UpstreamHealth {
 	merged := make(map[netip.AddrPort]UpstreamHealth)
-	for _, entry := range s.dnsMuxMap {
+	for _, entry := range s.dnsMuxHandlers {
 		reporter, ok := entry.handler.(upstreamHealthReporter)
 		if !ok {
 			continue
--- a/client/internal/dns/server_privileged_test.go
+++ b/client/internal/dns/server_privileged_test.go
@@ -0,0 +1,485 @@
+//go:build privileged
+
+package dns
+
+import (
+	"context"
+	"fmt"
+	"net/netip"
+	"os"
+	"testing"
+
+	"github.com/golang/mock/gomock"
+	"github.com/miekg/dns"
+	"github.com/stretchr/testify/assert"
+	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+
+	"github.com/netbirdio/netbird/client/iface"
+	pfmock "github.com/netbirdio/netbird/client/iface/mocks"
+	"github.com/netbirdio/netbird/client/iface/wgaddr"
+	"github.com/netbirdio/netbird/client/internal/dns/local"
+	"github.com/netbirdio/netbird/client/internal/dns/test"
+	"github.com/netbirdio/netbird/client/internal/peer"
+	"github.com/netbirdio/netbird/client/internal/stdnet"
+	nbdns "github.com/netbirdio/netbird/dns"
+)
+
+// TestUpdateDNSServer applies one config update per case to a freshly
+// initialized server and checks the resulting mux handlers and local answers.
+func TestUpdateDNSServer(t *testing.T) {
+	nameServers := []nbdns.NameServer{
+		{
+			IP:     netip.MustParseAddr("8.8.8.8"),
+			NSType: nbdns.UDPNameServerType,
+			Port:   53,
+		},
+		{
+			IP:     netip.MustParseAddr("8.8.4.4"),
+			NSType: nbdns.UDPNameServerType,
+			Port:   53,
+		},
+	}
+
+	testCases := []struct {
+		name                string
+		initUpstreamMap     []handlerWrapper
+		initLocalZones      []nbdns.CustomZone
+		initSerial          uint64
+		inputSerial         uint64
+		inputUpdate         nbdns.Config
+		shouldFail          bool
+		expectedUpstreamMap []handlerWrapper
+		expectedLocalQs     []dns.Question
+	}{
+		{
+			name:            "Initial Config Should Succeed",
+			initUpstreamMap: nil,
+			initSerial:      0,
+			inputSerial:     1,
+			inputUpdate: nbdns.Config{
+				ServiceEnable: true,
+				CustomZones: []nbdns.CustomZone{
+					{
+						Domain:  "netbird.cloud",
+						Records: zoneRecords,
+					},
+				},
+				NameServerGroups: []*nbdns.NameServerGroup{
+					{
+						Domains:     []string{"netbird.io"},
+						NameServers: nameServers,
+					},
+					{
+						NameServers: nameServers,
+						Primary:     true,
+					},
+				},
+			},
+			expectedUpstreamMap: []handlerWrapper{
+				{
+					domain:   "netbird.io",
+					priority: PriorityUpstream,
+				},
+				{
+					domain:   "netbird.cloud",
+					priority: PriorityLocal,
+				},
+				{
+					domain:   nbdns.RootZone,
+					priority: PriorityDefault,
+				},
+			},
+			expectedLocalQs: []dns.Question{{Name: "peera.netbird.cloud.", Qtype: dns.TypeA, Qclass: dns.ClassINET}},
+		},
+		{
+			name:           "New Config Should Succeed",
+			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: 1, Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
+			initUpstreamMap: []handlerWrapper{
+				{
+					domain:   "netbird.cloud",
+					handler:  &mockHandler{},
+					priority: PriorityUpstream,
+				},
+			},
+			initSerial:  0,
+			inputSerial: 1,
+			inputUpdate: nbdns.Config{
+				ServiceEnable: true,
+				CustomZones: []nbdns.CustomZone{
+					{
+						Domain:  "netbird.cloud",
+						Records: zoneRecords,
+					},
+				},
+				NameServerGroups: []*nbdns.NameServerGroup{
+					{
+						Domains:     []string{"netbird.io"},
+						NameServers: nameServers,
+					},
+				},
+			},
+			expectedUpstreamMap: []handlerWrapper{
+				{
+					domain:   "netbird.io",
+					priority: PriorityUpstream,
+				},
+				{
+					domain:   "netbird.cloud",
+					priority: PriorityLocal,
+				},
+			},
+			expectedLocalQs: []dns.Question{{Name: zoneRecords[0].Name, Qtype: 1, Qclass: 1}},
+		},
+		{
+			name:            "Smaller Config Serial Should Be Skipped",
+			initLocalZones:  []nbdns.CustomZone{},
+			initUpstreamMap: nil,
+			initSerial:      2,
+			inputSerial:     1,
+			shouldFail:      true,
+		},
+		{
+			name:            "Empty NS Group Domain Or Not Primary Element Should Fail",
+			initLocalZones:  []nbdns.CustomZone{},
+			initUpstreamMap: nil,
+			initSerial:      0,
+			inputSerial:     1,
+			inputUpdate: nbdns.Config{
+				ServiceEnable: true,
+				CustomZones: []nbdns.CustomZone{
+					{
+						Domain:  "netbird.cloud",
+						Records: zoneRecords,
+					},
+				},
+				NameServerGroups: []*nbdns.NameServerGroup{
+					{
+						NameServers: nameServers,
+					},
+				},
+			},
+			shouldFail: true,
+		},
+		{
+			name:            "Invalid NS Group Nameservers list Should Fail",
+			initLocalZones:  []nbdns.CustomZone{},
+			initUpstreamMap: nil,
+			initSerial:      0,
+			inputSerial:     1,
+			inputUpdate: nbdns.Config{
+				ServiceEnable: true,
+				CustomZones: []nbdns.CustomZone{
+					{
+						Domain:  "netbird.cloud",
+						Records: zoneRecords,
+					},
+				},
+				NameServerGroups: []*nbdns.NameServerGroup{
+					{
+						NameServers: nameServers,
+					},
+				},
+			},
+			shouldFail: true,
+		},
+		{
+			name:            "Invalid Custom Zone Records list Should Skip",
+			initLocalZones:  []nbdns.CustomZone{},
+			initUpstreamMap: nil,
+			initSerial:      0,
+			inputSerial:     1,
+			inputUpdate: nbdns.Config{
+				ServiceEnable: true,
+				CustomZones: []nbdns.CustomZone{
+					{
+						Domain: "netbird.cloud",
+					},
+				},
+				NameServerGroups: []*nbdns.NameServerGroup{
+					{
+						NameServers: nameServers,
+						Primary:     true,
+					},
+				},
+			},
+			expectedUpstreamMap: []handlerWrapper{{
+				domain:   ".",
+				priority: PriorityDefault,
+			}},
+		},
+		{
+			name:           "Empty Config Should Succeed and Clean Maps",
+			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
+			initUpstreamMap: []handlerWrapper{
+				{
+					domain:   zoneRecords[0].Name,
+					handler:  &mockHandler{},
+					priority: PriorityUpstream,
+				},
+			},
+			initSerial:          0,
+			inputSerial:         1,
+			inputUpdate:         nbdns.Config{ServiceEnable: true},
+			expectedUpstreamMap: nil,
+			expectedLocalQs:     []dns.Question{},
+		},
+		{
+			name:           "Disabled Service Should clean map",
+			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
+			initUpstreamMap: []handlerWrapper{
+				{
+					domain:   zoneRecords[0].Name,
+					handler:  &mockHandler{},
+					priority: PriorityUpstream,
+				},
+			},
+			initSerial:          0,
+			inputSerial:         1,
+			inputUpdate:         nbdns.Config{ServiceEnable: false},
+			expectedUpstreamMap: nil,
+			expectedLocalQs:     []dns.Question{},
+		},
+	}
+
+	for n, testCase := range testCases {
+		t.Run(testCase.name, func(t *testing.T) {
+			privKey, _ := wgtypes.GenerateKey()
+			newNet, err := stdnet.NewNet(context.Background(), nil)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			opts := iface.WGIFaceOpts{
+				IFaceName:    fmt.Sprintf("utun230%d", n),
+				Address:      wgaddr.MustParseWGAddress(fmt.Sprintf("100.66.100.%d/32", n+1)),
+				WGPort:       33100,
+				WGPrivKey:    privKey.String(),
+				MTU:          iface.DefaultMTU,
+				TransportNet: newNet,
+			}
+
+			wgIface, err := iface.NewWGIFace(opts)
+			if err != nil {
+				t.Fatal(err)
+			}
+			err = wgIface.Create()
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer func() {
+				if err := wgIface.Close(); err != nil {
+					t.Log(err)
+				}
+			}()
+			dnsServer, err := NewDefaultServer(context.Background(), DefaultServerConfig{
+				WgInterface:    wgIface,
+				CustomAddress:  "",
+				StatusRecorder: peer.NewRecorder("mgm"),
+				StateManager:   nil,
+				DisableSys:     false,
+			})
+			if err != nil {
+				t.Fatal(err)
+			}
+			err = dnsServer.Initialize()
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer func() {
+				if err := dnsServer.hostManager.restoreHostDNS(); err != nil {
+					t.Log(err)
+				}
+			}()
+
+			dnsServer.dnsMuxHandlers = testCase.initUpstreamMap
+			dnsServer.localResolver.Update(testCase.initLocalZones)
+			dnsServer.updateSerial = testCase.initSerial
+
+			err = dnsServer.UpdateDNSServer(testCase.inputSerial, testCase.inputUpdate)
+			if err != nil {
+				if testCase.shouldFail {
+					return
+				}
+				t.Fatalf("update dns server should not fail, got error: %v", err)
+			}
+
+			if len(dnsServer.dnsMuxHandlers) != len(testCase.expectedUpstreamMap) {
+				t.Fatalf("update upstream failed, map size is different than expected, want %d, got %d", len(testCase.expectedUpstreamMap), len(dnsServer.dnsMuxHandlers))
+			}
+
+			for _, expected := range testCase.expectedUpstreamMap {
+				found := false
+				for _, got := range dnsServer.dnsMuxHandlers {
+					if got.domain == expected.domain && got.priority == expected.priority {
+						found = true
+						break
+					}
+				}
+				if !found {
+					t.Fatalf("update upstream failed, handler for domain=%s priority=%d not found in dnsMuxHandlers: %#v", expected.domain, expected.priority, dnsServer.dnsMuxHandlers)
+				}
+			}
+
+			var responseMSG *dns.Msg
+			responseWriter := &test.MockResponseWriter{
+				WriteMsgFunc: func(m *dns.Msg) error {
+					responseMSG = m
+					return nil
+				},
+			}
+			for _, q := range testCase.expectedLocalQs {
+				dnsServer.localResolver.ServeDNS(responseWriter, &dns.Msg{
+					Question: []dns.Question{q},
+				})
+			}
+
+			// assert.NotNil records a failure without stopping the test, so the
+			// response is only dereferenced when one was actually captured.
+			if len(testCase.expectedLocalQs) > 0 && assert.NotNil(t, responseMSG, "response message should not be nil") {
+				assert.Equal(t, dns.RcodeSuccess, responseMSG.Rcode, "response code should be success")
+				assert.NotEmpty(t, responseMSG.Answer, "response message should have answers")
+			}
+		})
+	}
+}
+
+// TestDNSFakeResolverHandleUpdates checks that the DNS server keeps accepting
+// config updates once the service is disabled: it enables the service,
+// disables it, then applies a further update with fewer nameserver groups.
+func TestDNSFakeResolverHandleUpdates(t *testing.T) {
+	ov := os.Getenv("NB_WG_KERNEL_DISABLED")
+	defer t.Setenv("NB_WG_KERNEL_DISABLED", ov)
+
+	t.Setenv("NB_WG_KERNEL_DISABLED", "true")
+	newNet, err := stdnet.NewNet(context.Background(), []string{"utun2301"})
+	if err != nil {
+		t.Errorf("create stdnet: %v", err)
+		return
+	}
+
+	privKey, _ := wgtypes.GeneratePrivateKey()
+	opts := iface.WGIFaceOpts{
+		IFaceName:    "utun2301",
+		Address:      wgaddr.MustParseWGAddress("100.66.100.1/32"),
+		WGPort:       33100,
+		WGPrivKey:    privKey.String(),
+		MTU:          iface.DefaultMTU,
+		TransportNet: newNet,
+	}
+	wgIface, err := iface.NewWGIFace(opts)
+	if err != nil {
+		t.Errorf("build interface wireguard: %v", err)
+		return
+	}
+
+	err = wgIface.Create()
+	if err != nil {
+		t.Errorf("create and init wireguard interface: %v", err)
+		return
+	}
+	defer func() {
+		if err = wgIface.Close(); err != nil {
+			t.Logf("close wireguard interface: %v", err)
+		}
+	}()
+
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+
+	packetfilter := pfmock.NewMockPacketFilter(ctrl)
+	packetfilter.EXPECT().FilterOutbound(gomock.Any(), gomock.Any()).AnyTimes()
+	packetfilter.EXPECT().SetUDPPacketHook(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
+	packetfilter.EXPECT().SetTCPPacketHook(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
+
+	if err := wgIface.SetFilter(packetfilter); err != nil {
+		t.Errorf("set packet filter: %v", err)
+		return
+	}
+
+	dnsServer, err := NewDefaultServer(context.Background(), DefaultServerConfig{
+		WgInterface:    wgIface,
+		CustomAddress:  "",
+		StatusRecorder: peer.NewRecorder("mgm"),
+		StateManager:   nil,
+		DisableSys:     false,
+	})
+	if err != nil {
+		t.Errorf("create DNS server: %v", err)
+		return
+	}
+
+	err = dnsServer.Initialize()
+	if err != nil {
+		t.Errorf("run DNS server: %v", err)
+		return
+	}
+	defer func() {
+		if err = dnsServer.hostManager.restoreHostDNS(); err != nil {
+			t.Logf("restore DNS settings on the host: %v", err)
+			return
+		}
+	}()
+
+	dnsServer.dnsMuxHandlers = []handlerWrapper{
+		{
+			domain:   zoneRecords[0].Name,
+			handler:  &local.Resolver{},
+			priority: PriorityUpstream,
+		},
+	}
+	dnsServer.localResolver.Update([]nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}})
+	dnsServer.updateSerial = 0
+
+	nameServers := []nbdns.NameServer{
+		{
+			IP:     netip.MustParseAddr("8.8.8.8"),
+			NSType: nbdns.UDPNameServerType,
+			Port:   53,
+		},
+		{
+			IP:     netip.MustParseAddr("8.8.4.4"),
+			NSType: nbdns.UDPNameServerType,
+			Port:   53,
+		},
+	}
+
+	update := nbdns.Config{
+		ServiceEnable: true,
+		CustomZones: []nbdns.CustomZone{
+			{
+				Domain:  "netbird.cloud",
+				Records: zoneRecords,
+			},
+		},
+		NameServerGroups: []*nbdns.NameServerGroup{
+			{
+				Domains:     []string{"netbird.io"},
+				NameServers: nameServers,
+			},
+			{
+				NameServers: nameServers,
+				Primary:     true,
+			},
+		},
+	}
+
+	// Start the server with regular configuration
+	if err := dnsServer.UpdateDNSServer(1, update); err != nil {
+		t.Fatalf("update dns server should not fail, got error: %v", err)
+	}
+
+	update2 := update
+	update2.ServiceEnable = false
+	// Disable the server, stop the listener
+	if err := dnsServer.UpdateDNSServer(2, update2); err != nil {
+		t.Fatalf("update dns server should not fail, got error: %v", err)
+	}
+
+	update3 := update2
+	update3.NameServerGroups = update3.NameServerGroups[:1]
+	// The disabled service still receives updates; check that applying one
+	// succeeds, i.e. that the internal state is handled correctly.
+	if err := dnsServer.UpdateDNSServer(3, update3); err != nil {
+		t.Fatalf("update dns server should not fail, got error: %v", err)
+	}
+}
--- a/client/internal/dns/server_test.go
+++ b/client/internal/dns/server_test.go
@@ -10,7 +10,6 @@ import (
 	"testing"
 	"time"

-	"github.com/golang/mock/gomock"
 	"github.com/miekg/dns"
 	log "github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"
@@ -23,7 +22,6 @@ import (
 	"github.com/netbirdio/netbird/client/iface"
 	"github.com/netbirdio/netbird/client/iface/configurer"
 	"github.com/netbirdio/netbird/client/iface/device"
-	pfmock "github.com/netbirdio/netbird/client/iface/mocks"
 	"github.com/netbirdio/netbird/client/iface/wgaddr"
 	"github.com/netbirdio/netbird/client/internal/dns/local"
 	"github.com/netbirdio/netbird/client/internal/dns/test"
@@ -104,481 +102,6 @@ func init() {
 	formatter.SetTextFormatter(log.StandardLogger())
 }

-func generateDummyHandler(d string, servers []nbdns.NameServer) *upstreamResolverBase {
-	var srvs []netip.AddrPort
-	for _, srv := range servers {
-		srvs = append(srvs, srv.AddrPort())
-	}
-	u := &upstreamResolverBase{
-		domain: domain.Domain(d),
-		cancel: func() {},
-	}
-	u.addRace(srvs)
-	return u
-}
-
-func TestUpdateDNSServer(t *testing.T) {
-
-	nameServers := []nbdns.NameServer{
-		{
-			IP:     netip.MustParseAddr("8.8.8.8"),
-			NSType: nbdns.UDPNameServerType,
-			Port:   53,
-		},
-		{
-			IP:     netip.MustParseAddr("8.8.4.4"),
-			NSType: nbdns.UDPNameServerType,
-			Port:   53,
-		},
-	}
-
-	dummyHandler := local.NewResolver()
-
-	testCases := []struct {
-		name                string
-		initUpstreamMap     registeredHandlerMap
-		initLocalZones      []nbdns.CustomZone
-		initSerial          uint64
-		inputSerial         uint64
-		inputUpdate         nbdns.Config
-		shouldFail          bool
-		expectedUpstreamMap registeredHandlerMap
-		expectedLocalQs     []dns.Question
-	}{
-		{
-			name:            "Initial Config Should Succeed",
-			initUpstreamMap: make(registeredHandlerMap),
-			initSerial:      0,
-			inputSerial:     1,
-			inputUpdate: nbdns.Config{
-				ServiceEnable: true,
-				CustomZones: []nbdns.CustomZone{
-					{
-						Domain:  "netbird.cloud",
-						Records: zoneRecords,
-					},
-				},
-				NameServerGroups: []*nbdns.NameServerGroup{
-					{
-						Domains:     []string{"netbird.io"},
-						NameServers: nameServers,
-					},
-					{
-						NameServers: nameServers,
-						Primary:     true,
-					},
-				},
-			},
-			expectedUpstreamMap: registeredHandlerMap{
-				generateDummyHandler("netbird.io", nameServers).ID(): handlerWrapper{
-					domain:   "netbird.io",
-					handler:  dummyHandler,
-					priority: PriorityUpstream,
-				},
-				dummyHandler.ID(): handlerWrapper{
-					domain:   "netbird.cloud",
-					handler:  dummyHandler,
-					priority: PriorityLocal,
-				},
-				generateDummyHandler(".", nameServers).ID(): handlerWrapper{
-					domain:   nbdns.RootZone,
-					handler:  dummyHandler,
-					priority: PriorityDefault,
-				},
-			},
-			expectedLocalQs: []dns.Question{{Name: "peera.netbird.cloud.", Qtype: dns.TypeA, Qclass: dns.ClassINET}},
-		},
-		{
-			name:           "New Config Should Succeed",
-			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: 1, Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
-			initUpstreamMap: registeredHandlerMap{
-				generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
-					domain:   "netbird.cloud",
-					handler:  dummyHandler,
-					priority: PriorityUpstream,
-				},
-			},
-			initSerial:  0,
-			inputSerial: 1,
-			inputUpdate: nbdns.Config{
-				ServiceEnable: true,
-				CustomZones: []nbdns.CustomZone{
-					{
-						Domain:  "netbird.cloud",
-						Records: zoneRecords,
-					},
-				},
-				NameServerGroups: []*nbdns.NameServerGroup{
-					{
-						Domains:     []string{"netbird.io"},
-						NameServers: nameServers,
-					},
-				},
-			},
-			expectedUpstreamMap: registeredHandlerMap{
-				generateDummyHandler("netbird.io", nameServers).ID(): handlerWrapper{
-					domain:   "netbird.io",
-					handler:  dummyHandler,
-					priority: PriorityUpstream,
-				},
-				"local-resolver": handlerWrapper{
-					domain:   "netbird.cloud",
-					handler:  dummyHandler,
-					priority: PriorityLocal,
-				},
-			},
-			expectedLocalQs: []dns.Question{{Name: zoneRecords[0].Name, Qtype: 1, Qclass: 1}},
-		},
-		{
-			name:            "Smaller Config Serial Should Be Skipped",
-			initLocalZones:  []nbdns.CustomZone{},
-			initUpstreamMap: make(registeredHandlerMap),
-			initSerial:      2,
-			inputSerial:     1,
-			shouldFail:      true,
-		},
-		{
-			name:            "Empty NS Group Domain Or Not Primary Element Should Fail",
-			initLocalZones:  []nbdns.CustomZone{},
-			initUpstreamMap: make(registeredHandlerMap),
-			initSerial:      0,
-			inputSerial:     1,
-			inputUpdate: nbdns.Config{
-				ServiceEnable: true,
-				CustomZones: []nbdns.CustomZone{
-					{
-						Domain:  "netbird.cloud",
-						Records: zoneRecords,
-					},
-				},
-				NameServerGroups: []*nbdns.NameServerGroup{
-					{
-						NameServers: nameServers,
-					},
-				},
-			},
-			shouldFail: true,
-		},
-		{
-			name:            "Invalid NS Group Nameservers list Should Fail",
-			initLocalZones:  []nbdns.CustomZone{},
-			initUpstreamMap: make(registeredHandlerMap),
-			initSerial:      0,
-			inputSerial:     1,
-			inputUpdate: nbdns.Config{
-				ServiceEnable: true,
-				CustomZones: []nbdns.CustomZone{
-					{
-						Domain:  "netbird.cloud",
-						Records: zoneRecords,
-					},
-				},
-				NameServerGroups: []*nbdns.NameServerGroup{
-					{
-						NameServers: nameServers,
-					},
-				},
-			},
-			shouldFail: true,
-		},
-		{
-			name:            "Invalid Custom Zone Records list Should Skip",
-			initLocalZones:  []nbdns.CustomZone{},
-			initUpstreamMap: make(registeredHandlerMap),
-			initSerial:      0,
-			inputSerial:     1,
-			inputUpdate: nbdns.Config{
-				ServiceEnable: true,
-				CustomZones: []nbdns.CustomZone{
-					{
-						Domain: "netbird.cloud",
-					},
-				},
-				NameServerGroups: []*nbdns.NameServerGroup{
-					{
-						NameServers: nameServers,
-						Primary:     true,
-					},
-				},
-			},
-			expectedUpstreamMap: registeredHandlerMap{generateDummyHandler(".", nameServers).ID(): handlerWrapper{
-				domain:   ".",
-				handler:  dummyHandler,
-				priority: PriorityDefault,
-			}},
-		},
-		{
-			name:           "Empty Config Should Succeed and Clean Maps",
-			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
-			initUpstreamMap: registeredHandlerMap{
-				generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
-					domain:   zoneRecords[0].Name,
-					handler:  dummyHandler,
-					priority: PriorityUpstream,
-				},
-			},
-			initSerial:          0,
-			inputSerial:         1,
-			inputUpdate:         nbdns.Config{ServiceEnable: true},
-			expectedUpstreamMap: make(registeredHandlerMap),
-			expectedLocalQs:     []dns.Question{},
-		},
-		{
-			name:           "Disabled Service Should clean map",
-			initLocalZones: []nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}},
-			initUpstreamMap: registeredHandlerMap{
-				generateDummyHandler(zoneRecords[0].Name, nameServers).ID(): handlerWrapper{
-					domain:   zoneRecords[0].Name,
-					handler:  dummyHandler,
-					priority: PriorityUpstream,
-				},
-			},
-			initSerial:          0,
-			inputSerial:         1,
-			inputUpdate:         nbdns.Config{ServiceEnable: false},
-			expectedUpstreamMap: make(registeredHandlerMap),
-			expectedLocalQs:     []dns.Question{},
-		},
-	}
-
-	for n, testCase := range testCases {
-		t.Run(testCase.name, func(t *testing.T) {
-			privKey, _ := wgtypes.GenerateKey()
-			newNet, err := stdnet.NewNet(context.Background(), nil)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			opts := iface.WGIFaceOpts{
-				IFaceName:    fmt.Sprintf("utun230%d", n),
-				Address:      wgaddr.MustParseWGAddress(fmt.Sprintf("100.66.100.%d/32", n+1)),
-				WGPort:       33100,
-				WGPrivKey:    privKey.String(),
-				MTU:          iface.DefaultMTU,
-				TransportNet: newNet,
-			}
-
-			wgIface, err := iface.NewWGIFace(opts)
-			if err != nil {
-				t.Fatal(err)
-			}
-			err = wgIface.Create()
-			if err != nil {
-				t.Fatal(err)
-			}
-			defer func() {
-				err = wgIface.Close()
-				if err != nil {
-					t.Log(err)
-				}
-			}()
-			dnsServer, err := NewDefaultServer(context.Background(), DefaultServerConfig{
-				WgInterface:    wgIface,
-				CustomAddress:  "",
-				StatusRecorder: peer.NewRecorder("mgm"),
-				StateManager:   nil,
-				DisableSys:     false,
-			})
-			if err != nil {
-				t.Fatal(err)
-			}
-			err = dnsServer.Initialize()
-			if err != nil {
-				t.Fatal(err)
-			}
-			defer func() {
-				err = dnsServer.hostManager.restoreHostDNS()
-				if err != nil {
-					t.Log(err)
-				}
-			}()
-
-			dnsServer.dnsMuxMap = testCase.initUpstreamMap
-			dnsServer.localResolver.Update(testCase.initLocalZones)
-			dnsServer.updateSerial = testCase.initSerial
-
-			err = dnsServer.UpdateDNSServer(testCase.inputSerial, testCase.inputUpdate)
-			if err != nil {
-				if testCase.shouldFail {
-					return
-				}
-				t.Fatalf("update dns server should not fail, got error: %v", err)
-			}
-
-			if len(dnsServer.dnsMuxMap) != len(testCase.expectedUpstreamMap) {
-				t.Fatalf("update upstream failed, map size is different than expected, want %d, got %d", len(testCase.expectedUpstreamMap), len(dnsServer.dnsMuxMap))
-			}
-
-			for key := range testCase.expectedUpstreamMap {
-				_, found := dnsServer.dnsMuxMap[key]
-				if !found {
-					t.Fatalf("update upstream failed, key %s was not found in the dnsMuxMap: %#v", key, dnsServer.dnsMuxMap)
-				}
-			}
-
-			var responseMSG *dns.Msg
-			responseWriter := &test.MockResponseWriter{
-				WriteMsgFunc: func(m *dns.Msg) error {
-					responseMSG = m
-					return nil
-				},
-			}
-			for _, q := range testCase.expectedLocalQs {
-				dnsServer.localResolver.ServeDNS(responseWriter, &dns.Msg{
-					Question: []dns.Question{q},
-				})
-			}
-
-			if len(testCase.expectedLocalQs) > 0 {
-				assert.NotNil(t, responseMSG, "response message should not be nil")
-				assert.Equal(t, dns.RcodeSuccess, responseMSG.Rcode, "response code should be success")
-				assert.NotEmpty(t, responseMSG.Answer, "response message should have answers")
-			}
-		})
-	}
-}
-
-func TestDNSFakeResolverHandleUpdates(t *testing.T) {
-	ov := os.Getenv("NB_WG_KERNEL_DISABLED")
-	defer t.Setenv("NB_WG_KERNEL_DISABLED", ov)
-
-	t.Setenv("NB_WG_KERNEL_DISABLED", "true")
-	newNet, err := stdnet.NewNet(context.Background(), []string{"utun2301"})
-	if err != nil {
-		t.Errorf("create stdnet: %v", err)
-		return
-	}
-
-	privKey, _ := wgtypes.GeneratePrivateKey()
-	opts := iface.WGIFaceOpts{
-		IFaceName:    "utun2301",
-		Address:      wgaddr.MustParseWGAddress("100.66.100.1/32"),
-		WGPort:       33100,
-		WGPrivKey:    privKey.String(),
-		MTU:          iface.DefaultMTU,
-		TransportNet: newNet,
-	}
-	wgIface, err := iface.NewWGIFace(opts)
-	if err != nil {
-		t.Errorf("build interface wireguard: %v", err)
-		return
-	}
-
-	err = wgIface.Create()
-	if err != nil {
-		t.Errorf("create and init wireguard interface: %v", err)
-		return
-	}
-	defer func() {
-		if err = wgIface.Close(); err != nil {
-			t.Logf("close wireguard interface: %v", err)
-		}
-	}()
-
-	ctrl := gomock.NewController(t)
-	defer ctrl.Finish()
-
-	packetfilter := pfmock.NewMockPacketFilter(ctrl)
-	packetfilter.EXPECT().FilterOutbound(gomock.Any(), gomock.Any()).AnyTimes()
-	packetfilter.EXPECT().SetUDPPacketHook(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
-	packetfilter.EXPECT().SetTCPPacketHook(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes()
-
-	if err := wgIface.SetFilter(packetfilter); err != nil {
-		t.Errorf("set packet filter: %v", err)
-		return
-	}
-
-	dnsServer, err := NewDefaultServer(context.Background(), DefaultServerConfig{
-		WgInterface:    wgIface,
-		CustomAddress:  "",
-		StatusRecorder: peer.NewRecorder("mgm"),
-		StateManager:   nil,
-		DisableSys:     false,
-	})
-	if err != nil {
-		t.Errorf("create DNS server: %v", err)
-		return
-	}
-
-	err = dnsServer.Initialize()
-	if err != nil {
-		t.Errorf("run DNS server: %v", err)
-		return
-	}
-	defer func() {
-		if err = dnsServer.hostManager.restoreHostDNS(); err != nil {
-			t.Logf("restore DNS settings on the host: %v", err)
-			return
-		}
-	}()
-
-	dnsServer.dnsMuxMap = registeredHandlerMap{
-		"id1": handlerWrapper{
-			domain:   zoneRecords[0].Name,
-			handler:  &local.Resolver{},
-			priority: PriorityUpstream,
-		},
-	}
-	dnsServer.localResolver.Update([]nbdns.CustomZone{{Domain: "netbird.cloud", Records: []nbdns.SimpleRecord{{Name: "netbird.cloud", Type: int(dns.TypeA), Class: nbdns.DefaultClass, TTL: 300, RData: "10.0.0.1"}}}})
-	dnsServer.updateSerial = 0
-
-	nameServers := []nbdns.NameServer{
-		{
-			IP:     netip.MustParseAddr("8.8.8.8"),
-			NSType: nbdns.UDPNameServerType,
-			Port:   53,
-		},
-		{
-			IP:     netip.MustParseAddr("8.8.4.4"),
-			NSType: nbdns.UDPNameServerType,
-			Port:   53,
-		},
-	}
-
-	update := nbdns.Config{
-		ServiceEnable: true,
-		CustomZones: []nbdns.CustomZone{
-			{
-				Domain:  "netbird.cloud",
-				Records: zoneRecords,
-			},
-		},
-		NameServerGroups: []*nbdns.NameServerGroup{
-			{
-				Domains:     []string{"netbird.io"},
-				NameServers: nameServers,
-			},
-			{
-				NameServers: nameServers,
-				Primary:     true,
-			},
-		},
-	}
-
-	// Start the server with regular configuration
-	if err := dnsServer.UpdateDNSServer(1, update); err != nil {
-		t.Fatalf("update dns server should not fail, got error: %v", err)
-		return
-	}
-
-	update2 := update
-	update2.ServiceEnable = false
-	// Disable the server, stop the listener
-	if err := dnsServer.UpdateDNSServer(2, update2); err != nil {
-		t.Fatalf("update dns server should not fail, got error: %v", err)
-		return
-	}
-
-	update3 := update2
-	update3.NameServerGroups = update3.NameServerGroups[:1]
-	// But service still get updates and we checking that we handle
-	// internal state in the right way
-	if err := dnsServer.UpdateDNSServer(3, update3); err != nil {
-		t.Fatalf("update dns server should not fail, got error: %v", err)
-		return
-	}
-}
-
 func TestDNSServerStartStop(t *testing.T) {
 	testCases := []struct {
 		name     string
@@ -1029,15 +552,15 @@ func (m *mockService) RegisterMux(string, dns.Handler) {}
 func (m *mockService) DeregisterMux(string)            {}

 func TestDefaultServer_UpdateMux(t *testing.T) {
-	baseMatchHandlers := registeredHandlerMap{
-		"upstream-group1": {
+	baseMatchHandlers := []handlerWrapper{
+		{
 			domain: "example.com",
 			handler: &mockHandler{
 				Id: "upstream-group1",
 			},
 			priority: PriorityUpstream,
 		},
-		"upstream-group2": {
+		{
 			domain: "example.com",
 			handler: &mockHandler{
 				Id: "upstream-group2",
@@ -1046,15 +569,15 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
 		},
 	}

-	baseRootHandlers := registeredHandlerMap{
-		"upstream-root1": {
+	baseRootHandlers := []handlerWrapper{
+		{
 			domain: ".",
 			handler: &mockHandler{
 				Id: "upstream-root1",
 			},
 			priority: PriorityDefault,
 		},
-		"upstream-root2": {
+		{
 			domain: ".",
 			handler: &mockHandler{
 				Id: "upstream-root2",
@@ -1063,22 +586,22 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
 		},
 	}

-	baseMixedHandlers := registeredHandlerMap{
-		"upstream-group1": {
+	baseMixedHandlers := []handlerWrapper{
+		{
 			domain: "example.com",
 			handler: &mockHandler{
 				Id: "upstream-group1",
 			},
 			priority: PriorityUpstream,
 		},
-		"upstream-group2": {
+		{
 			domain: "example.com",
 			handler: &mockHandler{
 				Id: "upstream-group2",
 			},
 			priority: PriorityUpstream - 1,
 		},
-		"upstream-other": {
+		{
 			domain: "other.com",
 			handler: &mockHandler{
 				Id: "upstream-other",
@@ -1089,7 +612,7 @@ func TestDefaultServer_UpdateMux(t *testing.T) {

 	tests := []struct {
 		name             string
-		initialHandlers  registeredHandlerMap
+		initialHandlers  []handlerWrapper
 		updates          []handlerWrapper
 		expectedHandlers map[string]string // map[HandlerID]domain
 		description      string
@@ -1373,32 +896,38 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			server := &DefaultServer{
-				dnsMuxMap:    tt.initialHandlers,
-				handlerChain: NewHandlerChain(),
-				service:      &mockService{},
+				dnsMuxHandlers: tt.initialHandlers,
+				handlerChain:   NewHandlerChain(),
+				service:        &mockService{},
 			}

 			// Perform the update
 			server.updateMux(tt.updates)

 			// Verify the results
-			assert.Equal(t, len(tt.expectedHandlers), len(server.dnsMuxMap),
+			assert.Equal(t, len(tt.expectedHandlers), len(server.dnsMuxHandlers),
 				"Number of handlers after update doesn't match expected")

 			// Check each expected handler
 			for id, expectedDomain := range tt.expectedHandlers {
-				handler, exists := server.dnsMuxMap[types.HandlerID(id)]
-				assert.True(t, exists, "Expected handler %s not found", id)
-				if exists {
-					assert.Equal(t, expectedDomain, handler.domain,
+				var found *handlerWrapper
+				for i := range server.dnsMuxHandlers {
+					if server.dnsMuxHandlers[i].handler.ID() == types.HandlerID(id) {
+						found = &server.dnsMuxHandlers[i]
+						break
+					}
+				}
+				assert.NotNil(t, found, "Expected handler %s not found", id)
+				if found != nil {
+					assert.Equal(t, expectedDomain, found.domain,
 						"Domain mismatch for handler %s", id)
 				}
 			}

 			// Verify no unexpected handlers exist
-			for HandlerID := range server.dnsMuxMap {
-				_, expected := tt.expectedHandlers[string(HandlerID)]
-				assert.True(t, expected, "Unexpected handler found: %s", HandlerID)
+			for _, entry := range server.dnsMuxHandlers {
+				_, expected := tt.expectedHandlers[string(entry.handler.ID())]
+				assert.True(t, expected, "Unexpected handler found: %s", entry.handler.ID())
 			}

 			// Verify the handlerChain state and order
@@ -1413,7 +942,7 @@ func TestDefaultServer_UpdateMux(t *testing.T) {

 				// Verify handler exists in mux
 				foundInMux := false
-				for _, muxEntry := range server.dnsMuxMap {
+				for _, muxEntry := range server.dnsMuxHandlers {
 					if chainEntry.Handler == muxEntry.handler &&
 						chainEntry.Priority == muxEntry.priority &&
 						chainEntry.Pattern == dns.Fqdn(muxEntry.domain) {
@@ -1422,12 +951,108 @@ func TestDefaultServer_UpdateMux(t *testing.T) {
 					}
 				}
 				assert.True(t, foundInMux,
-					"Handler in chain not found in dnsMuxMap")
+					"Handler in chain not found in dnsMuxHandlers")
 			}
 		})
 	}
 }

+// chainHasPattern reports whether the handler chain holds an entry registered
+// for the given fqdn pattern at the given priority.
+func chainHasPattern(s *DefaultServer, pattern string, priority int) bool {
+	for _, h := range s.handlerChain.handlers {
+		if h.OrigPattern == pattern && h.Priority == priority {
+			return true
+		}
+	}
+	return false
+}
+
+// TestDefaultServer_UpdateMux_SharedHandlerZoneRemoval verifies that updateMux
+// tracks each (handler, domain) registration independently when one handler
+// serves multiple zones. Every custom zone is served by the same handler
+// instance (the local resolver, whose ID is the constant "local-resolver"), so
+// removing one zone must deregister exactly that zone's chain entry and leave
+// the others in place. Tracking registrations by handler ID alone collapses all
+// zones onto one entry, leaving removed zones in the chain to answer
+// authoritatively with no records.
+func TestDefaultServer_UpdateMux_SharedHandlerZoneRemoval(t *testing.T) {
+	// One handler serves every custom zone, mirroring s.localResolver.
+	shared := &mockHandler{Id: "local-resolver"}
+
+	server := &DefaultServer{
+		handlerChain: NewHandlerChain(),
+		service:      &mockService{},
+	}
+
+	// Two custom zones under the same handler. The surviving zone is registered
+	// last, mirroring the management emission order.
+	server.updateMux([]handlerWrapper{
+		{domain: "userzone.test", handler: shared, priority: PriorityLocal},
+		{domain: "peerzone.test", handler: shared, priority: PriorityLocal},
+	})
+
+	require.True(t, chainHasPattern(server, "userzone.test.", PriorityLocal),
+		"userzone.test should be registered after the first update")
+	require.True(t, chainHasPattern(server, "peerzone.test.", PriorityLocal),
+		"peerzone.test should be registered after the first update")
+
+	// Remove one zone, keep the other.
+	server.updateMux([]handlerWrapper{
+		{domain: "peerzone.test", handler: shared, priority: PriorityLocal},
+	})
+
+	assert.True(t, chainHasPattern(server, "peerzone.test.", PriorityLocal),
+		"peerzone.test should remain after removing userzone.test")
+	assert.False(t, chainHasPattern(server, "userzone.test.", PriorityLocal),
+		"userzone.test handler must be deregistered, not leaked in the chain")
+}
+
+// TestDefaultServer_UpdateMux_PreservesLocalResolver verifies that updateMux
+// does not tear down the shared local resolver during reconfiguration. The
+// resolver is a process-lifetime singleton reused across config updates;
+// Stop() cancels its lookup context (breaking external CNAME-target
+// resolution) and clears its records. updateMux must deregister its chain
+// entries without stopping it. Records that survive a teardown update are the
+// observable proxy: Stop() would have cleared them.
+func TestDefaultServer_UpdateMux_PreservesLocalResolver(t *testing.T) {
+	resolver := local.NewResolver()
+	require.NoError(t, resolver.RegisterRecord(nbdns.SimpleRecord{
+		Name:  "peer.netbird.cloud.",
+		Type:  int(dns.TypeA),
+		Class: nbdns.DefaultClass,
+		TTL:   300,
+		RData: "10.0.0.1",
+	}))
+
+	server := &DefaultServer{
+		handlerChain:  NewHandlerChain(),
+		service:       &mockService{},
+		localResolver: resolver,
+	}
+
+	server.updateMux([]handlerWrapper{
+		{domain: "netbird.cloud", handler: resolver, priority: PriorityLocal},
+	})
+
+	// Remove the zone. The resolver must survive so its records and lookup
+	// context stay intact for the next registration.
+	server.updateMux(nil)
+
+	var response *dns.Msg
+	resolver.ServeDNS(&test.MockResponseWriter{
+		WriteMsgFunc: func(m *dns.Msg) error {
+			response = m
+			return nil
+		},
+	}, &dns.Msg{Question: []dns.Question{{Name: "peer.netbird.cloud.", Qtype: dns.TypeA, Qclass: dns.ClassINET}}})
+
+	require.NotNil(t, response, "local resolver should answer after teardown")
+	assert.Equal(t, dns.RcodeSuccess, response.Rcode,
+		"local resolver records must survive teardown; updateMux must not Stop() the shared resolver")
+	assert.NotEmpty(t, response.Answer, "answer should contain the surviving record")
+}
+
 func TestExtraDomains(t *testing.T) {
 	tests := []struct {
 		name                string
@@ -2049,7 +1674,6 @@ func TestBuildUpstreamHandler_MergesGroupsPerDomain(t *testing.T) {
 		localResolver: local.NewResolver(),
 		handlerChain:  NewHandlerChain(),
 		hostManager:   &noopHostConfigurator{},
-		dnsMuxMap:     make(registeredHandlerMap),
 	}

 	groups := []*nbdns.NameServerGroup{
@@ -2207,7 +1831,7 @@ func TestEvaluateNSGroupHealth(t *testing.T) {
 	}
 }

-// healthStubHandler is a minimal dnsMuxMap entry that exposes a fixed
+// healthStubHandler is a minimal dnsMuxHandlers entry that exposes a fixed
 // UpstreamHealth snapshot, letting tests drive recomputeNSGroupStates
 // without spinning up real handlers.
 type healthStubHandler struct {
@@ -2283,12 +1907,11 @@ func newProjTestFixture(t *testing.T) *projTestFixture {
 		ctx:              context.Background(),
 		wgInterface:      &mocWGIface{},
 		statusRecorder:   recorder,
-		dnsMuxMap:        make(registeredHandlerMap),
 		selectedRoutes:   func() route.HAMap { return fx.selected },
 		activeRoutes:     func() route.HAMap { return fx.active },
 		warningDelayBase: defaultWarningDelayBase,
 	}
-	fx.server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: fx.stub, priority: PriorityUpstream}
+	fx.server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: fx.stub, priority: PriorityUpstream}}

 	fx.server.mux.Lock()
 	fx.server.updateNSGroupStates([]*nbdns.NameServerGroup{fx.group})
@@ -2395,7 +2018,6 @@ func TestProjection_OverlayAddrNoRouteDelaysWarning(t *testing.T) {
 		ctx:              context.Background(),
 		wgInterface:      &mocWGIface{},
 		statusRecorder:   recorder,
-		dnsMuxMap:        make(registeredHandlerMap),
 		selectedRoutes:   func() route.HAMap { return nil },
 		activeRoutes:     func() route.HAMap { return nil },
 		warningDelayBase: 50 * time.Millisecond,
@@ -2407,7 +2029,7 @@ func TestProjection_OverlayAddrNoRouteDelaysWarning(t *testing.T) {
 	stub := &healthStubHandler{health: map[netip.AddrPort]UpstreamHealth{
 		overlayPeer: {LastFail: time.Now(), LastErr: "timeout"},
 	}}
-	server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}
+	server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}

 	server.mux.Lock()
 	server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2444,7 +2066,6 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
 		service:           NewServiceViaMemory(wgIface),
 		hostManager:       &noopHostConfigurator{},
 		extraDomains:      map[domain.Domain]int{},
-		dnsMuxMap:         make(registeredHandlerMap),
 		statusRecorder:    peer.NewRecorder("mgm"),
 		selectedRoutes:    func() route.HAMap { return nil },
 		activeRoutes:      func() route.HAMap { return nil },
@@ -2459,7 +2080,7 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
 		NameServers: []nbdns.NameServer{{IP: srv.Addr(), NSType: nbdns.UDPNameServerType, Port: int(srv.Port())}},
 	}
 	stub := &healthStubHandler{health: map[netip.AddrPort]UpstreamHealth{srv: {LastOk: time.Now()}}}
-	server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}
+	server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}

 	server.mux.Lock()
 	server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2484,6 +2105,32 @@ func TestProjection_StopClearsHealthState(t *testing.T) {
 // rule 3: startup failures while the peer is handshaking, then the peer
 // comes up and a query succeeds before the grace window elapses. No
 // warning should ever have fired, and no recovery either.
+func TestWarningDelayBaseFromEnv(t *testing.T) {
+	tests := []struct {
+		name string
+		set  bool
+		val  string
+		want time.Duration
+	}{
+		{name: "unset uses default", set: false, want: defaultWarningDelayBase},
+		{name: "valid override", set: true, val: "90s", want: 90 * time.Second},
+		{name: "valid minutes", set: true, val: "2m", want: 2 * time.Minute},
+		{name: "invalid falls back", set: true, val: "notaduration", want: defaultWarningDelayBase},
+		{name: "zero falls back", set: true, val: "0s", want: defaultWarningDelayBase},
+		{name: "negative falls back", set: true, val: "-30s", want: defaultWarningDelayBase},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Setenv(envWarningDelay, tc.val)
+			if !tc.set {
+				os.Unsetenv(envWarningDelay)
+			}
+			assert.Equal(t, tc.want, warningDelayBaseFromEnv(), "grace window base")
+		})
+	}
+}
+
 func TestProjection_OverlayRecoversDuringGrace(t *testing.T) {
 	fx := newProjTestFixture(t)
 	fx.server.warningDelayBase = 200 * time.Millisecond
@@ -2595,7 +2242,6 @@ func TestProjection_MixedGroupEmitsImmediately(t *testing.T) {
 	server := &DefaultServer{
 		ctx:              context.Background(),
 		statusRecorder:   recorder,
-		dnsMuxMap:        make(registeredHandlerMap),
 		selectedRoutes:   func() route.HAMap { return overlayMap },
 		activeRoutes:     func() route.HAMap { return nil },
 		warningDelayBase: time.Hour,
@@ -2613,7 +2259,7 @@ func TestProjection_MixedGroupEmitsImmediately(t *testing.T) {
 			overlay: {LastFail: time.Now(), LastErr: "timeout"},
 		},
 	}
-	server.dnsMuxMap["example.com"] = handlerWrapper{domain: "example.com", handler: stub, priority: PriorityUpstream}
+	server.dnsMuxHandlers = []handlerWrapper{{domain: "example.com", handler: stub, priority: PriorityUpstream}}

 	server.mux.Lock()
 	server.updateNSGroupStates([]*nbdns.NameServerGroup{group})
@@ -2640,7 +2286,6 @@ func TestDNSLoopPrevention(t *testing.T) {
 		localResolver: local.NewResolver(),
 		handlerChain:  NewHandlerChain(),
 		hostManager:   &noopHostConfigurator{},
-		dnsMuxMap:     make(registeredHandlerMap),
 	}

 	tests := []struct {
--- a/client/internal/dns/upstream.go
+++ b/client/internal/dns/upstream.go
@@ -443,29 +443,32 @@ func (u *upstreamResolverBase) queryUpstream(parentCtx context.Context, r *dns.M
 		return raceResult{}, &upstreamFailure{upstream: upstream, reason: "no response"}
 	}

+	// A valid response means the upstream is reachable, whatever the Rcode.
+	u.markUpstreamOk(upstream)
+
 	proto := ""
 	if upstreamProto != nil {
 		proto = upstreamProto.protocol
 	}

 	if rm.Rcode == dns.RcodeServerFailure || rm.Rcode == dns.RcodeRefused {
+		// SERVFAIL and REFUSED are per-question outcomes (DNSSEC-bogus names,
+		// refused zones, transient recursion errors), not reachability
+		// problems: fail over for a better answer but keep the upstream healthy.
 		if code, ok := nonRetryableEDE(rm); ok {
 			if !hadEdns {
-				stripOPT(rm)
+				resutil.StripOPT(rm)
 			}
-			u.markUpstreamOk(upstream)
 			return raceResult{msg: rm, upstream: upstream, protocol: proto, ede: edeName(code)}, nil
 		}
 		reason := dns.RcodeToString[rm.Rcode]
-		u.markUpstreamFail(upstream, reason)
 		return raceResult{}, &upstreamFailure{upstream: upstream, reason: reason}
 	}

 	if !hadEdns {
-		stripOPT(rm)
+		resutil.StripOPT(rm)
 	}

-	u.markUpstreamOk(upstream)
 	return raceResult{msg: rm, upstream: upstream, protocol: proto}, nil
 }

@@ -520,22 +523,6 @@ func upstreamUDPSize() uint16 {
 	return dns.MinMsgSize
 }

-// stripOPT removes any OPT pseudo-RRs from the response's Extra section so
-// the response complies with RFC 6891 when the client did not advertise EDNS0.
-func stripOPT(rm *dns.Msg) {
-	if len(rm.Extra) == 0 {
-		return
-	}
-	out := rm.Extra[:0]
-	for _, rr := range rm.Extra {
-		if _, ok := rr.(*dns.OPT); ok {
-			continue
-		}
-		out = append(out, rr)
-	}
-	rm.Extra = out
-}
-
 func (u *upstreamResolverBase) handleUpstreamError(err error, upstream netip.AddrPort, startTime time.Time) *upstreamFailure {
 	if !errors.Is(err, context.DeadlineExceeded) && !isTimeout(err) {
 		return &upstreamFailure{upstream: upstream, reason: err.Error()}
--- a/client/internal/dns/upstream_test.go
+++ b/client/internal/dns/upstream_test.go
@@ -517,6 +517,78 @@ func TestUpstreamResolver_HealthTracking(t *testing.T) {
 	assert.NotContains(t, health, bad, "sibling upstream should not be queried when primary answers")
 }

+// TestUpstreamResolver_HealthTracking_ResponseMeansReachable verifies that an
+// upstream which answers with SERVFAIL or REFUSED is recorded as healthy:
+// those are per-question outcomes from a reachable server and must not mark
+// the upstream unhealthy. Only transport failures (timeouts) do.
+func TestUpstreamResolver_HealthTracking_ResponseMeansReachable(t *testing.T) {
+	a := netip.MustParseAddrPort("192.0.2.10:53")
+	b := netip.MustParseAddrPort("192.0.2.11:53")
+	timeoutErr := &net.OpError{Op: "read", Err: fmt.Errorf("i/o timeout")}
+
+	tests := []struct {
+		name        string
+		respA       mockUpstreamResponse
+		respB       mockUpstreamResponse
+		wantHealthy bool
+	}{
+		{
+			name:        "both SERVFAIL are reachable",
+			respA:       mockUpstreamResponse{msg: buildMockResponse(dns.RcodeServerFailure, "")},
+			respB:       mockUpstreamResponse{msg: buildMockResponse(dns.RcodeServerFailure, "")},
+			wantHealthy: true,
+		},
+		{
+			name:        "both REFUSED are reachable",
+			respA:       mockUpstreamResponse{msg: buildMockResponse(dns.RcodeRefused, "")},
+			respB:       mockUpstreamResponse{msg: buildMockResponse(dns.RcodeRefused, "")},
+			wantHealthy: true,
+		},
+		{
+			name:        "timeout marks unhealthy",
+			respA:       mockUpstreamResponse{err: timeoutErr},
+			respB:       mockUpstreamResponse{err: timeoutErr},
+			wantHealthy: false,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			mockClient := &mockUpstreamResolverPerServer{
+				responses: map[string]mockUpstreamResponse{
+					a.String(): tc.respA,
+					b.String(): tc.respB,
+				},
+				rtt: time.Millisecond,
+			}
+
+			ctx, cancel := context.WithCancel(context.Background())
+			defer cancel()
+
+			resolver := &upstreamResolverBase{
+				ctx:             ctx,
+				upstreamClient:  mockClient,
+				upstreamTimeout: UpstreamTimeout,
+			}
+			resolver.addRace([]netip.AddrPort{a, b})
+
+			responseWriter := &test.MockResponseWriter{WriteMsgFunc: func(m *dns.Msg) error { return nil }}
+			resolver.ServeDNS(responseWriter, new(dns.Msg).SetQuestion("example.com.", dns.TypeA))
+
+			health := resolver.UpstreamHealth()
+			require.Contains(t, health, a, "primary upstream should have a health record")
+			if tc.wantHealthy {
+				assert.False(t, health[a].LastOk.IsZero(), "responding upstream should have LastOk set")
+				assert.True(t, health[a].LastFail.IsZero(), "responding upstream should not be marked failed")
+				assert.Empty(t, health[a].LastErr, "responding upstream should have no error")
+			} else {
+				assert.False(t, health[a].LastFail.IsZero(), "timed-out upstream should be marked failed")
+				assert.NotEmpty(t, health[a].LastErr, "timed-out upstream should record an error")
+			}
+		})
+	}
+}
+
 func TestFormatFailures(t *testing.T) {
 	testCases := []struct {
 		name     string
@@ -913,19 +985,6 @@ func TestEDEName(t *testing.T) {
 	assert.Equal(t, "EDE 9999", edeName(9999), "unknown code falls back to numeric")
 }

-func TestStripOPT(t *testing.T) {
-	rm := &dns.Msg{
-		Extra: []dns.RR{
-			&dns.OPT{Hdr: dns.RR_Header{Name: ".", Rrtype: dns.TypeOPT}},
-			&dns.A{Hdr: dns.RR_Header{Name: "x.", Rrtype: dns.TypeA}, A: net.IPv4(1, 2, 3, 4)},
-		},
-	}
-	stripOPT(rm)
-	assert.Len(t, rm.Extra, 1, "OPT should be removed, A kept")
-	_, isOPT := rm.Extra[0].(*dns.OPT)
-	assert.False(t, isOPT, "remaining record must not be OPT")
-}
-
 func TestUpstreamResolver_NonRetryableEDEShortCircuits(t *testing.T) {
 	upstream1 := netip.MustParseAddrPort("192.0.2.1:53")
 	upstream2 := netip.MustParseAddrPort("192.0.2.2:53")
--- a/client/internal/dnsfwd/forwarder.go
+++ b/client/internal/dnsfwd/forwarder.go
@@ -26,6 +26,15 @@ import (
 const errResolveFailed = "failed to resolve query for domain=%s: %v"
 const upstreamTimeout = 15 * time.Second

+// EDE info codes the forwarder emits on upstream failures so the querying
+// client can see the reason without inspecting this peer's logs. They live in
+// the RFC 8914 Private Use range (49152-65535); the Go resolver never exposes a
+// real upstream EDE here, so these cannot collide with a genuine code.
+const (
+	edeNetbirdUpstreamTimeout uint16 = 49152
+	edeNetbirdUpstreamFailure uint16 = 49153
+)
+
 type resolver interface {
 	LookupNetIP(ctx context.Context, network, host string) ([]netip.Addr, error)
 }
@@ -220,7 +229,7 @@ func (f *DNSForwarder) handleDNSQuery(logger *log.Entry, w dns.ResponseWriter, q

 	result := resutil.LookupIP(ctx, f.resolver, network, qname, question.Qtype)
 	if result.Err != nil {
-		f.handleDNSError(ctx, logger, w, question, resp, qname, result, startTime)
+		f.handleDNSError(ctx, logger, w, question, resp, qname, result, query.IsEdns0() != nil, startTime)
 		return
 	}

@@ -333,6 +342,7 @@ func (f *DNSForwarder) handleDNSError(
 	resp *dns.Msg,
 	domain string,
 	result resutil.LookupResult,
+	reqHasEdns bool,
 	startTime time.Time,
 ) {
 	qType := question.Qtype
@@ -374,6 +384,10 @@ func (f *DNSForwarder) handleDNSError(
 		logger.Warnf(errResolveFailed, domain, result.Err)
 	}

+	if reqHasEdns {
+		attachEDE(resp, edeCodeFor(dnsErr), edeText(dnsErr))
+	}
+
 	f.writeResponse(logger, w, resp, domain, startTime)
 }

@@ -414,3 +428,33 @@ func (f *DNSForwarder) getMatchingEntries(domain string) (route.ResID, []*Forwar

 	return selectedResId, matches
 }
+
+// edeCodeFor maps an upstream lookup error to the NetBird EDE info code.
+func edeCodeFor(dnsErr *net.DNSError) uint16 {
+	if dnsErr != nil && dnsErr.IsTimeout {
+		return edeNetbirdUpstreamTimeout
+	}
+	return edeNetbirdUpstreamFailure
+}
+
+// edeText builds the EDE extra-text describing the class of upstream failure.
+// It deliberately omits the upstream server address, which may be an internal
+// resolver and is exposed to any client permitted to use the route; the full
+// detail stays in the forwarder's local log.
+func edeText(dnsErr *net.DNSError) string {
+	if dnsErr != nil && dnsErr.IsTimeout {
+		return "netbird forwarder: upstream timeout"
+	}
+	return "netbird forwarder: upstream failure"
+}
+
+// attachEDE adds an Extended DNS Error (RFC 8914) option to the response,
+// creating the OPT pseudo-record if the response does not already carry one.
+func attachEDE(resp *dns.Msg, code uint16, text string) {
+	opt := resp.IsEdns0()
+	if opt == nil {
+		resp.SetEdns0(dns.DefaultMsgSize, false)
+		opt = resp.IsEdns0()
+	}
+	opt.Option = append(opt.Option, &dns.EDNS0_EDE{InfoCode: code, ExtraText: text})
+}
--- a/client/internal/dnsfwd/forwarder_test.go
+++ b/client/internal/dnsfwd/forwarder_test.go
@@ -16,6 +16,7 @@ import (
 	"github.com/stretchr/testify/require"

 	firewall "github.com/netbirdio/netbird/client/firewall/manager"
+	"github.com/netbirdio/netbird/client/internal/dns/resutil"
 	"github.com/netbirdio/netbird/client/internal/dns/test"
 	"github.com/netbirdio/netbird/client/internal/peer"
 	"github.com/netbirdio/netbird/route"
@@ -617,6 +618,85 @@ func TestDNSForwarder_ResponseCodes(t *testing.T) {
 	}
 }

+func TestDNSForwarder_UpstreamFailureEDE(t *testing.T) {
+	tests := []struct {
+		name        string
+		lookupErr   error
+		reqEdns     bool
+		wantEDE     bool
+		wantCode    uint16
+		wantTextHas string
+	}{
+		{
+			name:        "timeout with edns0",
+			lookupErr:   &net.DNSError{Err: "i/o timeout", Server: "10.0.0.53:53", IsTimeout: true},
+			reqEdns:     true,
+			wantEDE:     true,
+			wantCode:    edeNetbirdUpstreamTimeout,
+			wantTextHas: "netbird forwarder: upstream timeout",
+		},
+		{
+			name:        "server failure with edns0",
+			lookupErr:   &net.DNSError{Err: "server misbehaving", Server: "10.0.0.53:53"},
+			reqEdns:     true,
+			wantEDE:     true,
+			wantCode:    edeNetbirdUpstreamFailure,
+			wantTextHas: "netbird forwarder: upstream failure",
+		},
+		{
+			name:      "no edns0 in request omits ede",
+			lookupErr: &net.DNSError{Err: "server misbehaving", Server: "10.0.0.53:53"},
+			reqEdns:   false,
+			wantEDE:   false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mockResolver := &MockResolver{}
+			forwarder := NewDNSForwarder(netip.MustParseAddrPort("127.0.0.1:0"), 300, nil, &peer.Status{}, nil)
+			forwarder.resolver = mockResolver
+
+			d, err := domain.FromString("example.com")
+			require.NoError(t, err)
+			forwarder.UpdateDomains([]*ForwarderEntry{{Domain: d, ResID: "test-res"}})
+
+			mockResolver.On("LookupNetIP", mock.Anything, "ip4", "example.com.").
+				Return([]netip.Addr(nil), tt.lookupErr).Once()
+
+			query := &dns.Msg{}
+			query.SetQuestion("example.com.", dns.TypeA)
+			if tt.reqEdns {
+				query.SetEdns0(dns.DefaultMsgSize, false)
+			}
+
+			var writtenResp *dns.Msg
+			mockWriter := &test.MockResponseWriter{
+				WriteMsgFunc: func(m *dns.Msg) error {
+					writtenResp = m
+					return nil
+				},
+			}
+
+			forwarder.handleDNSQuery(log.NewEntry(log.StandardLogger()), mockWriter, query, time.Now())
+			mockResolver.AssertExpectations(t)
+
+			require.NotNil(t, writtenResp, "expected a response")
+			assert.Equal(t, dns.RcodeServerFailure, writtenResp.Rcode, "upstream failure must be SERVFAIL")
+
+			ede, ok := resutil.ExtractEDE(writtenResp)
+			if !tt.wantEDE {
+				assert.False(t, ok, "response must not carry EDE")
+				return
+			}
+			require.True(t, ok, "response must carry EDE")
+			assert.Equal(t, tt.wantCode, ede.InfoCode, "EDE info code")
+			assert.Contains(t, ede.ExtraText, tt.wantTextHas, "EDE extra-text")
+			assert.NotContains(t, ede.ExtraText, "10.0.0.53", "must not leak upstream server address")
+		})
+	}
+}
+
 func TestDNSForwarder_TCPTruncation(t *testing.T) {
 	// Test that large UDP responses are truncated with TC bit set
 	mockResolver := &MockResolver{}
--- a/client/internal/engine.go
+++ b/client/internal/engine.go
@@ -22,8 +22,6 @@ import (
 	log "github.com/sirupsen/logrus"
 	"golang.zx2c4.com/wireguard/tun/netstack"
 	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
-	"google.golang.org/grpc/codes"
-	gstatus "google.golang.org/grpc/status"

 	nberrors "github.com/netbirdio/netbird/client/errors"
 	"github.com/netbirdio/netbird/client/firewall"
@@ -88,6 +86,8 @@ const (

 var ErrResetConnection = fmt.Errorf("reset connection")

+var ErrEngineAlreadyStarted = errors.New("engine already started")
+
 type EngineConfig struct {
 	WgPort      int
 	WgIfaceName string
@@ -201,6 +201,8 @@ type Engine struct {
 	ctx    context.Context
 	cancel context.CancelFunc

+	started bool
+
 	wgInterface WGIface

 	udpMux *udpmux.UniversalUDPMuxDefault
@@ -208,6 +210,12 @@ type Engine struct {
 	// networkSerial is the latest CurrentSerial (state ID) of the network sent by the Management service
 	networkSerial uint64

+	// forwardingRules holds the ingress forward rules applied for the current target.
+	// Wholesale sections (incl. forward rules) run only on the first pass of a target;
+	// they are stashed here so the final, peer-converged pass can build the lazy-connection
+	// exclude list without recomputing them on every bounded peer pass.
+	forwardingRules []firewallManager.ForwardRule
+
 	networkMonitor *networkmonitor.NetworkMonitor

 	sshServer sshServer
@@ -281,9 +289,15 @@ func NewEngine(
 	services EngineServices,
 	mobileDep MobileDependency,
 ) *Engine {
+	// The engine is single-use: a fresh instance is built per connection
+	// cycle (see Client.run), so the run context is created once here rather
+	// than in Start.
+	ctx, cancel := context.WithCancel(clientCtx)
 	engine := &Engine{
 		clientCtx:          clientCtx,
 		clientCancel:       clientCancel,
+		ctx:                ctx,
+		cancel:             cancel,
 		signal:             services.SignalClient,
 		signaler:           peer.NewSignaler(services.SignalClient, config.WgPrivateKey),
 		mgmClient:          services.MgmClient,
@@ -316,8 +330,34 @@ func (e *Engine) Stop() error {
 		log.Debugf("tried stopping engine that is nil")
 		return nil
 	}
+	e.cancel()
 	e.syncMsgMux.Lock()

+	e.stopLocked()
+
+	e.syncMsgMux.Unlock()
+
+	timeout := e.calculateShutdownTimeout()
+	log.Debugf("waiting for goroutines to finish with timeout: %v", timeout)
+	shutdownCtx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	if err := waitWithContext(shutdownCtx, &e.shutdownWg); err != nil {
+		log.Warnf("shutdown timeout exceeded after %v, some goroutines may still be running", timeout)
+	}
+
+	log.Infof("stopped Netbird Engine")
+
+	return nil
+}
+
+// stopLocked tears down everything Start may have brought up, in the order
+// teardown requires (DNS before the interface goes down, flow manager after).
+// The caller must hold syncMsgMux. It is shared by Stop and by Start's failure
+// path, so a partially-initialized engine is cleaned up the same way; every
+// step is nil-guarded. It does not wait on shutdownWg — the caller does that
+// after releasing the lock, since the goroutines also take syncMsgMux.
+func (e *Engine) stopLocked() {
 	if e.connMgr != nil {
 		e.connMgr.Close()
 	}
@@ -368,10 +408,6 @@ func (e *Engine) Stop() error {
 	// so dbus and friends don't complain because of a missing interface
 	e.stopDNSServer()

-	if e.cancel != nil {
-		e.cancel()
-	}
-
 	e.jobExecutorWG.Wait() // block until job goroutines finish

 	e.close()
@@ -390,21 +426,6 @@ func (e *Engine) Stop() error {
 	if err := e.stateManager.PersistState(context.Background()); err != nil {
 		log.Errorf("failed to persist state: %v", err)
 	}
-
-	e.syncMsgMux.Unlock()
-
-	timeout := e.calculateShutdownTimeout()
-	log.Debugf("waiting for goroutines to finish with timeout: %v", timeout)
-	shutdownCtx, cancel := context.WithTimeout(context.Background(), timeout)
-	defer cancel()
-
-	if err := waitWithContext(shutdownCtx, &e.shutdownWg); err != nil {
-		log.Warnf("shutdown timeout exceeded after %v, some goroutines may still be running", timeout)
-	}
-
-	log.Infof("stopped Netbird Engine")
-
-	return nil
 }

 // calculateShutdownTimeout returns shutdown timeout: 10s base + 100ms per peer, capped at 30s.
@@ -442,18 +463,38 @@ func waitWithContext(ctx context.Context, wg *sync.WaitGroup) error {
 // Start creates a new WireGuard tunnel interface and listens to events from Signal and Management services
 // Connections to remote peers are not established here.
 // However, they will be established once an event with a list of peers to connect to will be received from Management Service
-func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL) error {
+func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL) (err error) {
 	e.syncMsgMux.Lock()
 	defer e.syncMsgMux.Unlock()

-	if err := iface.ValidateMTU(e.config.MTU); err != nil {
+	// The engine is single-use. Reject a duplicate start and a start on an
+	// already-stopped engine (run context cancelled).
+	if e.started {
+		return ErrEngineAlreadyStarted
+	}
+
+	if ctxErr := e.ctx.Err(); ctxErr != nil {
+		return fmt.Errorf("engine already stopped: %w", ctxErr)
+	}
+
+	e.started = true
+
+	// Tear down any partially-initialized state on a failed start. Cancel the
+	// run context first so goroutines started before the failure (connMgr,
+	// srWatcher, monitors) unwind, then stopLocked mirrors Stop's teardown (we
+	// already hold syncMsgMux), cleaning up route/DNS/flow/state managers too,
+	// not just what close() covers.
+	defer func() {
+		if err != nil {
+			e.cancel()
+			e.stopLocked()
+		}
+	}()
+
+	if err = iface.ValidateMTU(e.config.MTU); err != nil {
 		return fmt.Errorf("invalid MTU configuration: %w", err)
 	}

-	if e.cancel != nil {
-		e.cancel()
-	}
-	e.ctx, e.cancel = context.WithCancel(e.clientCtx)
 	e.exposeManager = expose.NewManager(e.ctx, e.mgmClient)

 	wgIface, err := e.newWgIface()
@@ -487,13 +528,11 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)

 	initialRoutes, dnsConfig, dnsFeatureFlag, err := e.readInitialSettings()
 	if err != nil {
-		e.close()
 		return fmt.Errorf("read initial settings: %w", err)
 	}

 	dnsServer, err := e.newDnsServer(dnsConfig)
 	if err != nil {
-		e.close()
 		return fmt.Errorf("create dns server: %w", err)
 	}
 	e.dnsServer = dnsServer
@@ -528,7 +567,6 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)

 	if err = e.wgInterfaceCreate(); err != nil {
 		log.Errorf("failed creating tunnel interface %s: [%s]", e.config.WgIfaceName, err.Error())
-		e.close()
 		return fmt.Errorf("create wg interface: %w", err)
 	}

@@ -537,7 +575,6 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
 	}

 	if err := e.createFirewall(); err != nil {
-		e.close()
 		return err
 	}

@@ -549,7 +586,6 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
 	e.udpMux, err = e.wgInterface.Up()
 	if err != nil {
 		log.Errorf("failed to pull up wgInterface [%s]: %s", e.wgInterface.Name(), err.Error())
-		e.close()
 		return fmt.Errorf("up wg interface: %w", err)
 	}

@@ -574,9 +610,7 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
 		e.acl = acl.NewDefaultManager(e.firewall)
 	}

-	err = e.dnsServer.Initialize()
-	if err != nil {
-		e.close()
+	if err := e.dnsServer.Initialize(); err != nil {
 		return fmt.Errorf("initialize dns server: %w", err)
 	}

@@ -588,7 +622,9 @@ func (e *Engine) Start(netbirdConfig *mgmProto.NetbirdConfig, mgmtURL *url.URL)
 	e.srWatcher = guard.NewSRWatcher(e.signal, e.relayManager, e.mobileDep.IFaceDiscover, iceCfg)
 	e.srWatcher.Start(peer.IsForceRelayed())

-	e.receiveSignalEvents()
+	if err = e.receiveSignalEvents(); err != nil {
+		return err
+	}
 	e.receiveManagementEvents()
 	e.receiveJobEvents()

@@ -640,7 +676,6 @@ func (e *Engine) createFirewall() error {

 func (e *Engine) initFirewall() error {
 	if err := e.routeManager.SetFirewall(e.firewall); err != nil {
-		e.close()
 		return fmt.Errorf("set firewall: %w", err)
 	}

@@ -733,7 +768,15 @@ func (e *Engine) blockLanAccess() {

-// modifyPeers updates peers that have been modified (e.g. IP address has been changed).
-// It closes the existing connection, removes it from the peerConns map, and creates a new one.
-func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
+// maxPeersPerSyncPass is the default per-pass cap on how many peers each of
+// removePeers/modifyPeers/addNewPeers applies, so syncMsgMux is held only for a
+// batch at a time and other subsystems can interleave between passes. It is
+// passed in (not read globally) so tests can exercise the multi-pass path.
+const maxPeersPerSyncPass = 300
+
+// modifyPeers updates peers that have been modified (e.g. IP address has been changed).
+// It closes the existing connection, removes it from the peerConns map, and creates a new one.
+// At most maxBatch changed peers are re-applied per call. It returns true
+// when more changed peers remained than the cap, so the caller re-runs.
+func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {

 	// first, check if peers have been modified
 	var modified []*mgmProto.RemotePeerConfig
@@ -763,26 +806,32 @@ func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
 		}
 	}

+	more := false
+	if len(modified) > maxBatch {
+		modified = modified[:maxBatch]
+		more = true
+	}
+
 	// second, close all modified connections and remove them from the state map
 	for _, p := range modified {
-		err := e.removePeer(p.GetWgPubKey())
-		if err != nil {
-			return err
+		if err := e.removePeer(p.GetWgPubKey()); err != nil {
+			return false, err
 		}
 	}
 	// third, add the peer connections again
 	for _, p := range modified {
-		err := e.addNewPeer(p)
-		if err != nil {
-			return err
+		if err := e.addNewPeer(p); err != nil {
+			return false, err
 		}
 	}
-	return nil
+	return more, nil
 }

 // removePeers finds and removes peers that do not exist anymore in the network map received from the Management Service.
 // It also removes peers that have been modified (e.g. change of IP address). They will be added again in addPeers method.
-func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
+// At most maxBatch peers are removed per call. It returns true when more peers
+// remained to remove than the cap, so the caller re-runs.
+func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {
 	newPeers := make([]string, 0, len(peersUpdate))
 	for _, p := range peersUpdate {
 		newPeers = append(newPeers, p.GetWgPubKey())
@@ -790,14 +839,19 @@ func (e *Engine) removePeers(peersUpdate []*mgmProto.RemotePeerConfig) error {

 	toRemove := util.SliceDiff(e.peerStore.PeersPubKey(), newPeers)

+	more := false
+	if len(toRemove) > maxBatch {
+		toRemove = toRemove[:maxBatch]
+		more = true
+	}
+
 	for _, p := range toRemove {
-		err := e.removePeer(p)
-		if err != nil {
-			return err
+		if err := e.removePeer(p); err != nil {
+			return false, err
 		}
 		log.Infof("removed peer %s", p)
 	}
-	return nil
+	return more, nil
 }

 func (e *Engine) removeAllPeers() error {
@@ -866,19 +920,17 @@ func (e *Engine) handleAutoUpdateVersion(autoUpdateSettings *mgmProto.AutoUpdate
 	e.updateManager.SetVersion(autoUpdateSettings.Version, autoUpdateSettings.AlwaysUpdate)
 }

-func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
-	started := time.Now()
-	defer func() {
-		duration := time.Since(started)
-		log.Infof("sync finished in %s", duration)
-		e.clientMetrics.RecordSyncDuration(e.ctx, duration)
-	}()
+// applySyncPass applies one bounded pass of the sync update under syncMsgMux and
+// returns true if more peers remained than the per-pass cap. It is driven by the
+// mapStateManager, which re-invokes it (releasing the lock between passes) until
+// the update is fully applied.
+func (e *Engine) applySyncPass(update *mgmProto.SyncResponse, firstPass bool) (bool, error) {
 	e.syncMsgMux.Lock()
 	defer e.syncMsgMux.Unlock()

 	// Check context INSIDE lock to ensure atomicity with shutdown
 	if e.ctx.Err() != nil {
-		return e.ctx.Err()
+		return false, e.ctx.Err()
 	}

 	if update.NetworkMap != nil && update.NetworkMap.PeerConfig != nil {
@@ -886,7 +938,7 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
 	}

 	if err := e.updateNetbirdConfig(update.GetNetbirdConfig()); err != nil {
-		return err
+		return false, err
 	}

 	// Posture checks are bound to the network map presence:
@@ -896,23 +948,22 @@ func (e *Engine) handleSync(update *mgmProto.SyncResponse) error {
 	//                                        leave the previously applied checks untouched
 	nm := update.GetNetworkMap()
 	if nm == nil {
-		return nil
+		return false, nil
 	}

 	if err := e.updateChecksIfNew(update.Checks); err != nil {
-		return err
+		return false, err
 	}

-	e.persistSyncResponse(update)
-
 	// only apply new changes and ignore old ones
-	if err := e.updateNetworkMap(nm); err != nil {
-		return err
+	more, err := e.updateNetworkMap(nm, maxPeersPerSyncPass, firstPass)
+	if err != nil {
+		return false, err
 	}

 	e.statusRecorder.PublishEvent(cProto.SystemEvent_INFO, cProto.SystemEvent_SYSTEM, "Network map updated", "", nil)

-	return nil
+	return more, nil
 }

 // updateNetbirdConfig applies the management-provided NetBird configuration:
@@ -958,6 +1009,13 @@ func (e *Engine) updateNetbirdConfig(wCfg *mgmProto.NetbirdConfig) error {
 // (not syncMsgMux) is held for the whole Set so the store cannot be cleared (disabled /
 // engine close) mid-call and have this write resurrect a file that was just removed.
 func (e *Engine) persistSyncResponse(update *mgmProto.SyncResponse) {
+	// Only persist updates that carry a network map. Config-only updates (e.g. relay
+	// token rotation, STUN/TURN) have a nil NetworkMap; persisting them would overwrite
+	// the last full map on disk and break restore-on-restart.
+	if update.GetNetworkMap() == nil {
+		return
+	}
+
 	e.syncRespMux.RLock()
 	defer e.syncRespMux.RUnlock()

@@ -1037,7 +1095,7 @@ func (e *Engine) updateChecksIfNew(checks []*mgmProto.Checks) error {
 	}
 	e.checks = checks

-	info, err := system.GetInfoWithChecks(e.ctx, checks)
+	info, err := system.GetInfoWithChecks(e.ctx, checks, e.overlayAddresses()...)
 	if err != nil {
 		log.Warnf("failed to get system info with checks: %v", err)
 		info = system.GetInfo(e.ctx)
@@ -1068,6 +1126,20 @@ func (e *Engine) updateChecksIfNew(checks []*mgmProto.Checks) error {
 	return nil
 }

+// overlayAddresses returns our own WireGuard overlay address (v4 and v6) so it
+// can be excluded from the reported network addresses; the interface coming and
+// going otherwise churns the peer meta on the management server.
+func (e *Engine) overlayAddresses() []netip.Addr {
+	var ips []netip.Addr
+	if e.config.WgAddr.IP.IsValid() {
+		ips = append(ips, e.config.WgAddr.IP)
+	}
+	if e.config.WgAddr.HasIPv6() {
+		ips = append(ips, e.config.WgAddr.IPv6)
+	}
+	return ips
+}
+
 func (e *Engine) updateConfig(conf *mgmProto.PeerConfig) error {
 	if e.wgInterface == nil {
 		return errors.New("wireguard interface is not initialized")
@@ -1127,20 +1199,6 @@ func (e *Engine) hasIPv6Changed(conf *mgmProto.PeerConfig) bool {
 	return !current.HasIPv6() || current.IPv6 != prefix.Addr() || current.IPv6Net != prefix.Masked()
 }

-// wrapDisconnectError classifies a receive-loop failure before the run is torn
-// down. An auth rejection (PermissionDenied/Unauthenticated) means the session
-// needs re-login and retrying is futile, so mark it terminal (NeedsLogin) — run()
-// then exits on its own instead of spinning the backoff. Any other failure is a
-// recoverable connection reset that the backoff should retry.
-func (e *Engine) wrapDisconnectError(err error) {
-	state := CtxGetState(e.ctx)
-	if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied || s.Code() == codes.Unauthenticated) {
-		state.Set(StatusNeedsLogin)
-		return
-	}
-	_ = state.Wrap(ErrResetConnection)
-}
-
 func (e *Engine) receiveJobEvents() {
 	e.jobExecutorWG.Add(1)
 	go func() {
@@ -1167,9 +1225,9 @@ func (e *Engine) receiveJobEvents() {
 			}
 		})
 		if err != nil {
-			// happens if management is unavailable for a long time, or rejects
-			// us (auth). wrapDisconnectError decides retry vs needs-login.
-			e.wrapDisconnectError(err)
+			// happens if management is unavailable for a long time.
+			// We want to cancel the operation of the whole client
+			_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
 			e.clientCancel()
 			return
 		}
@@ -1225,7 +1283,7 @@ func (e *Engine) receiveManagementEvents() {
 	e.shutdownWg.Add(1)
 	go func() {
 		defer e.shutdownWg.Done()
-		info, err := system.GetInfoWithChecks(e.ctx, e.checks)
+		info, err := system.GetInfoWithChecks(e.ctx, e.checks, e.overlayAddresses()...)
 		if err != nil {
 			log.Warnf("failed to get system info with checks: %v", err)
 			info = system.GetInfo(e.ctx)
@@ -1249,11 +1307,23 @@ func (e *Engine) receiveManagementEvents() {
 			e.config.DisableSSHAuth,
 		)

-		err = e.mgmClient.Sync(e.ctx, info, e.handleSync)
+		// The map-state manager converges the latest update in the background in
+		// bounded passes; the stream callback only hands it the newest target.
+		manager := newMapStateManager(e.applySyncPass, e.persistSyncResponse, func(d time.Duration) {
+			log.Infof("sync finished in %s", d)
+			e.clientMetrics.RecordSyncDuration(e.ctx, d)
+		})
+		e.shutdownWg.Add(1)
+		go func() {
+			defer e.shutdownWg.Done()
+			manager.run(e.ctx)
+		}()
+
+		err = e.mgmClient.Sync(e.ctx, info, manager.SetTarget)
 		if err != nil {
-			// happens if management is unavailable for a long time, or rejects
-			// us (auth). wrapDisconnectError decides retry vs needs-login.
-			e.wrapDisconnectError(err)
+			// happens if management is unavailable for a long time.
+			// We want to cancel the operation of the whole client
+			_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
 			e.clientCancel()
 			return
 		}
@@ -1300,21 +1370,104 @@ func (e *Engine) updateTURNs(turns []*mgmProto.ProtectedHostConfig) error {
 	return nil
 }

-func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
+// updateNetworkMap applies the peer config on every pass, the wholesale sections
+// (firewall/ACL, DNS, routes, forward rules) only when firstPass is set, and up to
+// maxBatch peers per phase. It returns true while peers remain, so the caller re-runs.
+func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap, maxBatch int, firstPass bool) (bool, error) {
 	// intentionally leave it before checking serial because for now it can happen that peer IP changed but serial didn't
 	if networkMap.GetPeerConfig() != nil {
 		err := e.updateConfig(networkMap.GetPeerConfig())
 		if err != nil {
-			return err
+			return false, err
 		}
 	}

 	serial := networkMap.GetSerial()
 	if e.networkSerial > serial {
 		log.Debugf("received outdated NetworkMap with serial %d, ignoring", serial)
-		return nil
+		return false, nil
 	}

+	// Wholesale sections (firewall/ACL, DNS, routes, forward rules) are applied
+	// up-front and only once per target: they are cheap, local, idempotent and must
+	// be in place before peers come up (fail-closed). On the bounded re-runs that only
+	// drain the remaining peer batches they are skipped — the applied forward rules are
+	// reused from e.forwardingRules for the lazy-exclude finalize.
+	if firstPass {
+		e.applyWholesale(networkMap, serial)
+	}
+
+	log.Debugf("got peers update from Management Service, total peers to connect to = %d", len(networkMap.GetRemotePeers()))
+
+	e.updateOfflinePeers(networkMap.GetOfflinePeers())
+
+	// Filter out own peer from the remote peers list
+	localPubKey := e.config.WgPrivateKey.PublicKey().String()
+	remotePeers := make([]*mgmProto.RemotePeerConfig, 0, len(networkMap.GetRemotePeers()))
+	for _, p := range networkMap.GetRemotePeers() {
+		if p.GetWgPubKey() != localPubKey {
+			remotePeers = append(remotePeers, p)
+		}
+	}
+
+	// needMore signals the caller to re-run when a peer phase hit its per-pass cap.
+	needMore := false
+
+	// cleanup request, most likely our peer has been deleted
+	if networkMap.GetRemotePeersIsEmpty() {
+		err := e.removeAllPeers()
+		e.statusRecorder.FinishPeerListModifications()
+		if err != nil {
+			return false, err
+		}
+	} else {
+		removeMore, err := e.removePeers(remotePeers, maxBatch)
+		if err != nil {
+			return false, err
+		}
+
+		modifyMore, err := e.modifyPeers(remotePeers, maxBatch)
+		if err != nil {
+			return false, err
+		}
+
+		addMore, err := e.addNewPeers(remotePeers, maxBatch)
+		if err != nil {
+			return false, err
+		}
+
+		needMore = removeMore || modifyMore || addMore
+
+		e.statusRecorder.FinishPeerListModifications()
+
+		e.updatePeerSSHHostKeys(remotePeers)
+
+		if err := e.updateSSHClientConfig(remotePeers); err != nil {
+			log.Warnf("failed to update SSH client config: %v", err)
+		}
+
+		e.updateSSHServerAuth(networkMap.GetSshAuth())
+	}
+
+	// Set the exclude list only once peers have fully converged (this pass added
+	// the last batch). It needs all target peers present in the store, and
+	// ExcludePeer has replace-semantics — a partial set mid-convergence would be wrong.
+	if !needMore {
+		excludedLazyPeers := e.toExcludedLazyPeers(e.forwardingRules, remotePeers)
+		e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
+	}
+
+	e.networkSerial = serial
+
+	return needMore, nil
+}
+
+// applyWholesale applies the cheap, local, idempotent map sections — lazy feature
+// flag, firewall/legacy management, DNS, routes, ACL filtering, DNS forwarder and
+// ingress forward rules — that must be in place before peers come up. It runs once
+// per target (first pass only); the resulting forward rules are stashed in
+// e.forwardingRules for the lazy-exclude finalize on the peer-converged pass.
+func (e *Engine) applyWholesale(networkMap *mgmProto.NetworkMap, serial uint64) {
 	if err := e.connMgr.UpdatedRemoteFeatureFlag(e.ctx, networkMap.GetPeerConfig().GetLazyConnectionEnabled()); err != nil {
 		log.Errorf("failed to update lazy connection feature flag: %v", err)
 	}
@@ -1375,61 +1528,7 @@ func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
 	if err != nil {
 		log.Errorf("failed to update forward rules, err: %v", err)
 	}
-
-	log.Debugf("got peers update from Management Service, total peers to connect to = %d", len(networkMap.GetRemotePeers()))
-
-	e.updateOfflinePeers(networkMap.GetOfflinePeers())
-
-	// Filter out own peer from the remote peers list
-	localPubKey := e.config.WgPrivateKey.PublicKey().String()
-	remotePeers := make([]*mgmProto.RemotePeerConfig, 0, len(networkMap.GetRemotePeers()))
-	for _, p := range networkMap.GetRemotePeers() {
-		if p.GetWgPubKey() != localPubKey {
-			remotePeers = append(remotePeers, p)
-		}
-	}
-
-	// cleanup request, most likely our peer has been deleted
-	if networkMap.GetRemotePeersIsEmpty() {
-		err := e.removeAllPeers()
-		e.statusRecorder.FinishPeerListModifications()
-		if err != nil {
-			return err
-		}
-	} else {
-		err := e.removePeers(remotePeers)
-		if err != nil {
-			return err
-		}
-
-		err = e.modifyPeers(remotePeers)
-		if err != nil {
-			return err
-		}
-
-		err = e.addNewPeers(remotePeers)
-		if err != nil {
-			return err
-		}
-
-		e.statusRecorder.FinishPeerListModifications()
-
-		e.updatePeerSSHHostKeys(remotePeers)
-
-		if err := e.updateSSHClientConfig(remotePeers); err != nil {
-			log.Warnf("failed to update SSH client config: %v", err)
-		}
-
-		e.updateSSHServerAuth(networkMap.GetSshAuth())
-	}
-
-	// must set the exclude list after the peers are added. Without it the manager can not figure out the peers parameters from the store
-	excludedLazyPeers := e.toExcludedLazyPeers(forwardingRules, remotePeers)
-	e.connMgr.SetExcludeList(e.ctx, excludedLazyPeers)
-
-	e.networkSerial = serial
-
-	return nil
+	e.forwardingRules = forwardingRules
 }

 func toDNSFeatureFlag(networkMap *mgmProto.NetworkMap) bool {
@@ -1609,14 +1708,23 @@ func addrToString(addr netip.Addr) string {
 }

 // addNewPeers adds peers that were not know before but arrived from the Management service with the update
-func (e *Engine) addNewPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
+// At most maxBatch not-yet-present peers are added per call. It returns true when
+// more new peers remained than the cap, so the caller re-runs.
+func (e *Engine) addNewPeers(peersUpdate []*mgmProto.RemotePeerConfig, maxBatch int) (bool, error) {
+	added := 0
 	for _, p := range peersUpdate {
-		err := e.addNewPeer(p)
-		if err != nil {
-			return err
+		if _, ok := e.peerStore.PeerConn(p.GetWgPubKey()); ok {
+			continue // already present (cheap skip), does not count toward the cap
 		}
+		if added >= maxBatch {
+			return true, nil // at least one more new peer remains
+		}
+		if err := e.addNewPeer(p); err != nil {
+			return false, err
+		}
+		added++
 	}
-	return nil
+	return false, nil
 }

 // addNewPeer add peer if connection doesn't exist
@@ -1714,7 +1822,7 @@ func (e *Engine) createPeerConn(pubKey string, allowedIPs []netip.Prefix, agentV
 }

 // receiveSignalEvents connects to the Signal Service event stream to negotiate connection with remote peers
-func (e *Engine) receiveSignalEvents() {
+func (e *Engine) receiveSignalEvents() error {
 	e.shutdownWg.Add(1)
 	go func() {
 		defer e.shutdownWg.Done()
@@ -1777,15 +1885,20 @@ func (e *Engine) receiveSignalEvents() {
 			return nil
 		})
 		if err != nil {
-			// happens if signal is unavailable for a long time, or rejects us
-			// (auth). wrapDisconnectError decides retry vs needs-login.
-			e.wrapDisconnectError(err)
+			// happens if signal is unavailable for a long time.
+			// We want to cancel the operation of the whole client
+			_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
 			e.clientCancel()
 			return
 		}
 	}()

-	e.signal.WaitStreamConnected()
+	// TODO: consider removing this blocking wait; there is no apparent benefit in blocking Start on it.
+	e.signal.WaitStreamConnected(e.ctx)
+	if err := e.ctx.Err(); err != nil {
+		return fmt.Errorf("wait for signal stream: %w", err)
+	}
+	return nil
 }

 func (e *Engine) parseNATExternalIPMappings() []string {
--- a/client/internal/engine_privileged_test.go
+++ b/client/internal/engine_privileged_test.go
@@ -0,0 +1,565 @@
+//go:build privileged
+
+package internal
+
+import (
+	"context"
+	"fmt"
+	"net"
+	"runtime"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/golang/mock/gomock"
+	"github.com/google/uuid"
+	log "github.com/sirupsen/logrus"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/otel"
+	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/keepalive"
+
+	"github.com/netbirdio/netbird/client/iface"
+	"github.com/netbirdio/netbird/client/iface/device"
+	"github.com/netbirdio/netbird/client/iface/wgaddr"
+	"github.com/netbirdio/netbird/client/internal/dns"
+	"github.com/netbirdio/netbird/client/internal/peer"
+	nbssh "github.com/netbirdio/netbird/client/ssh"
+	"github.com/netbirdio/netbird/client/system"
+	nbdns "github.com/netbirdio/netbird/dns"
+	"github.com/netbirdio/netbird/management/internals/controllers/network_map/controller"
+	"github.com/netbirdio/netbird/management/internals/controllers/network_map/update_channel"
+	"github.com/netbirdio/netbird/management/internals/modules/peers"
+	"github.com/netbirdio/netbird/management/internals/modules/peers/ephemeral/manager"
+	"github.com/netbirdio/netbird/management/internals/server/config"
+	nbgrpc "github.com/netbirdio/netbird/management/internals/shared/grpc"
+	"github.com/netbirdio/netbird/management/server"
+	"github.com/netbirdio/netbird/management/server/activity"
+	nbcache "github.com/netbirdio/netbird/management/server/cache"
+	"github.com/netbirdio/netbird/management/server/groups"
+	"github.com/netbirdio/netbird/management/server/integrations/integrated_validator/validator"
+	"github.com/netbirdio/netbird/management/server/integrations/port_forwarding"
+	"github.com/netbirdio/netbird/management/server/job"
+	"github.com/netbirdio/netbird/management/server/permissions"
+	"github.com/netbirdio/netbird/management/server/settings"
+	"github.com/netbirdio/netbird/management/server/store"
+	"github.com/netbirdio/netbird/management/server/telemetry"
+	"github.com/netbirdio/netbird/management/server/types"
+	mgmt "github.com/netbirdio/netbird/shared/management/client"
+	mgmtProto "github.com/netbirdio/netbird/shared/management/proto"
+	relayClient "github.com/netbirdio/netbird/shared/relay/client"
+	signal "github.com/netbirdio/netbird/shared/signal/client"
+	"github.com/netbirdio/netbird/shared/signal/proto"
+	signalServer "github.com/netbirdio/netbird/signal/server"
+	"github.com/netbirdio/netbird/util"
+)
+
+func TestEngine_SSH(t *testing.T) {
+	key, err := wgtypes.GeneratePrivateKey()
+	if err != nil {
+		t.Fatal(err)
+		return
+	}
+
+	sshKey, err := nbssh.GeneratePrivateKey(nbssh.ED25519)
+	if err != nil {
+		t.Fatal(err)
+		return
+	}
+
+	ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
+	defer cancel()
+
+	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
+	engine := NewEngine(
+		ctx, cancel,
+		&EngineConfig{
+			WgIfaceName:      "utun101",
+			WgAddr:           wgaddr.MustParseWGAddress("100.64.0.1/24"),
+			WgPrivateKey:     key,
+			WgPort:           33100,
+			ServerSSHAllowed: true,
+			MTU:              iface.DefaultMTU,
+			SSHKey:           sshKey,
+		},
+		EngineServices{
+			SignalClient:   &signal.MockClient{},
+			MgmClient:      &mgmt.MockClient{},
+			RelayManager:   relayMgr,
+			StatusRecorder: peer.NewRecorder("https://mgm"),
+		},
+		MobileDependency{},
+	)
+
+	engine.dnsServer = &dns.MockServer{
+		UpdateDNSServerFunc: func(serial uint64, update nbdns.Config) error { return nil },
+	}
+
+	err = engine.Start(nil, nil)
+	require.NoError(t, err)
+
+	defer func() {
+		err := engine.Stop()
+		if err != nil {
+			return
+		}
+	}()
+
+	peerWithSSH := &mgmtProto.RemotePeerConfig{
+		WgPubKey:   "MNHf3Ma6z6mdLbriAJbqhX7+nM/B71lgw2+91q3LfhU=",
+		AllowedIps: []string{"100.64.0.21/24"},
+		SshConfig: &mgmtProto.SSHConfig{
+			SshPubKey: []byte("ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFATYCqaQw/9id1Qkq3n16JYhDhXraI6Pc1fgB8ynEfQ"),
+		},
+	}
+
+	// SSH server is not enabled so SSH config of a remote peer should be ignored
+	networkMap := &mgmtProto.NetworkMap{
+		Serial:             6,
+		PeerConfig:         nil,
+		RemotePeers:        []*mgmtProto.RemotePeerConfig{peerWithSSH},
+		RemotePeersIsEmpty: false,
+	}
+
+	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
+	require.NoError(t, err)
+
+	assert.Nil(t, engine.sshServer)
+
+	// SSH server is enabled, therefore SSH config should be applied
+	networkMap = &mgmtProto.NetworkMap{
+		Serial: 7,
+		PeerConfig: &mgmtProto.PeerConfig{Address: "100.64.0.1/24",
+			SshConfig: &mgmtProto.SSHConfig{
+				SshEnabled: true,
+				JwtConfig: &mgmtProto.JWTConfig{
+					Issuer:       "test-issuer",
+					Audience:     "test-audience",
+					KeysLocation: "test-keys",
+					MaxTokenAge:  3600,
+				},
+			}},
+		RemotePeers:        []*mgmtProto.RemotePeerConfig{peerWithSSH},
+		RemotePeersIsEmpty: false,
+	}
+
+	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
+	require.NoError(t, err)
+
+	time.Sleep(250 * time.Millisecond)
+	assert.NotNil(t, engine.sshServer)
+
+	// now remove peer
+	networkMap = &mgmtProto.NetworkMap{
+		Serial:             8,
+		RemotePeers:        []*mgmtProto.RemotePeerConfig{},
+		RemotePeersIsEmpty: false,
+	}
+
+	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
+	require.NoError(t, err)
+
+	// time.Sleep(250 * time.Millisecond)
+	assert.NotNil(t, engine.sshServer)
+
+	// now disable SSH server
+	networkMap = &mgmtProto.NetworkMap{
+		Serial: 9,
+		PeerConfig: &mgmtProto.PeerConfig{Address: "100.64.0.1/24",
+			SshConfig: &mgmtProto.SSHConfig{SshEnabled: false}},
+		RemotePeers:        []*mgmtProto.RemotePeerConfig{peerWithSSH},
+		RemotePeersIsEmpty: false,
+	}
+
+	_, err = engine.updateNetworkMap(networkMap, maxPeersPerSyncPass, true)
+	require.NoError(t, err)
+
+	assert.Nil(t, engine.sshServer)
+}
+
+// TestEngine_Sync feeds one network map through a mocked management Sync stream
+// and waits until the engine has stored all three peers and the map serial.
+func TestEngine_Sync(t *testing.T) {
+	key, err := wgtypes.GeneratePrivateKey()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
+	defer cancel()
+
+	// feed updates to Engine via mocked Management client
+	updates := make(chan *mgmtProto.SyncResponse)
+	defer close(updates)
+	syncFunc := func(ctx context.Context, info *system.Info, msgHandler func(msg *mgmtProto.SyncResponse) error) error {
+		for msg := range updates {
+			if err := msgHandler(msg); err != nil {
+				t.Errorf("sync handler failed: %v", err) // not t.Fatal: this is not the test goroutine
+				return err
+			}
+		}
+		return nil
+	}
+	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
+	engine := NewEngine(ctx, cancel, &EngineConfig{
+		WgIfaceName:  "utun103",
+		WgAddr:       wgaddr.MustParseWGAddress("100.64.0.1/24"),
+		WgPrivateKey: key,
+		WgPort:       33100,
+		MTU:          iface.DefaultMTU,
+	}, EngineServices{
+		SignalClient:   &signal.MockClient{},
+		MgmClient:      &mgmt.MockClient{SyncFunc: syncFunc},
+		RelayManager:   relayMgr,
+		StatusRecorder: peer.NewRecorder("https://mgm"),
+	}, MobileDependency{})
+	engine.ctx = ctx
+
+	engine.dnsServer = &dns.MockServer{
+		UpdateDNSServerFunc: func(serial uint64, update nbdns.Config) error { return nil },
+	}
+
+	defer func() {
+		// Best-effort teardown: a failed Stop is logged, not fatal.
+		if err := engine.Stop(); err != nil {
+			t.Logf("stop engine: %v", err)
+		}
+	}()
+
+	err = engine.Start(nil, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	peer1 := &mgmtProto.RemotePeerConfig{
+		WgPubKey:   "RRHf3Ma6z6mdLbriAJbqhX7+nM/B71lgw2+91q3LfhU=",
+		AllowedIps: []string{"100.64.0.10/24"},
+	}
+	peer2 := &mgmtProto.RemotePeerConfig{
+		WgPubKey:   "LLHf3Ma6z6mdLbriAJbqhX9+nM/B71lgw2+91q3LlhU=",
+		AllowedIps: []string{"100.64.0.11/24"},
+	}
+	peer3 := &mgmtProto.RemotePeerConfig{
+		WgPubKey:   "GGHf3Ma6z6mdLbriAJbqhX9+nM/B71lgw2+91q3LlhU=",
+		AllowedIps: []string{"100.64.0.12/24"},
+	}
+	// 1st update with 3 peers and a serial larger than the engine's current serial => apply update
+	updates <- &mgmtProto.SyncResponse{
+		NetworkMap: &mgmtProto.NetworkMap{
+			Serial:             10,
+			PeerConfig:         nil,
+			RemotePeers:        []*mgmtProto.RemotePeerConfig{peer1, peer2, peer3},
+			RemotePeersIsEmpty: false,
+		},
+	}
+
+	timeout := time.After(time.Second * 2)
+	for {
+		select {
+		case <-timeout:
+			t.Fatalf("timeout while waiting for test to finish")
+		case <-time.After(10 * time.Millisecond): // poll instead of busy-spinning
+		}
+		engine.syncMsgMux.Lock() // networkSerial is written while syncMsgMux is held
+		serial := engine.networkSerial
+		engine.syncMsgMux.Unlock()
+		if getPeers(engine) == 3 && serial == 10 {
+			break
+		}
+	}
+}
+
+func TestEngine_MultiplePeers(t *testing.T) {
+	// log.SetLevel(log.DebugLevel)
+
+	ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
+	defer cancel()
+
+	sigServer, signalAddr, err := startSignal(t)
+	if err != nil {
+		t.Fatal(err)
+		return
+	}
+	defer sigServer.Stop()
+	mgmtServer, mgmtAddr, err := startManagement(t, t.TempDir(), "../testdata/store.sql")
+	if err != nil {
+		t.Fatal(err)
+		return
+	}
+	defer mgmtServer.GracefulStop()
+
+	setupKey := "A2C8E62B-38F5-4553-B31E-DD66C696CEBB"
+
+	mu := sync.Mutex{}
+	engines := []*Engine{}
+	numPeers := 10
+	wg := sync.WaitGroup{}
+	wg.Add(numPeers)
+	// create and start peers
+	for i := 0; i < numPeers; i++ {
+		j := i
+		go func() {
+			engine, err := createEngine(ctx, cancel, setupKey, j, mgmtAddr, signalAddr)
+			if err != nil {
+				wg.Done()
+				t.Errorf("unable to create the engine for peer %d with error %v", j, err)
+				return
+			}
+			engine.dnsServer = &dns.MockServer{}
+			mu.Lock()
+			defer mu.Unlock()
+			guid := fmt.Sprintf("{%s}", uuid.New().String())
+			device.CustomWindowsGUIDString = strings.ToLower(guid)
+			err = engine.Start(nil, nil)
+			if err != nil {
+				t.Errorf("unable to start engine for peer %d with error %v", j, err)
+				wg.Done()
+				return
+			}
+			engines = append(engines, engine)
+			wg.Done()
+		}()
+	}
+
+	// wait until all have been created and started
+	wg.Wait()
+	if len(engines) != numPeers {
+		t.Fatal("not all peers were started")
+	}
+	// check whether all the peer have expected peers connected
+
+	expectedConnected := numPeers * (numPeers - 1)
+
+	// adjust according to timeouts
+	timeout := 50 * time.Second
+	timeoutChan := time.After(timeout)
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
+loop:
+	for {
+		select {
+		case <-timeoutChan:
+			t.Fatalf("waiting for expected connections timeout after %s", timeout.String())
+			break loop
+		case <-ticker.C:
+			totalConnected := 0
+			for _, engine := range engines {
+				totalConnected += getConnectedPeers(engine)
+			}
+			if totalConnected == expectedConnected {
+				log.Infof("total connected=%d", totalConnected)
+				break loop
+			}
+			log.Infof("total connected=%d", totalConnected)
+		}
+	}
+	// cleanup test
+	for n, peerEngine := range engines {
+		t.Logf("stopping peer with interface %s from multipeer test, loopIndex %d", peerEngine.wgInterface.Name(), n)
+		errStop := peerEngine.mgmClient.Close()
+		if errStop != nil {
+			log.Infoln("got error trying to close management clients from engine: ", errStop)
+		}
+		errStop = peerEngine.Stop()
+		if errStop != nil {
+			log.Infoln("got error trying to close testing peers engine: ", errStop)
+		}
+	}
+}
+
+var (
+	kaep = keepalive.EnforcementPolicy{
+		MinTime:             15 * time.Second,
+		PermitWithoutStream: true,
+	}
+
+	kasp = keepalive.ServerParameters{
+		MaxConnectionIdle:     15 * time.Second,
+		MaxConnectionAgeGrace: 5 * time.Second,
+		Time:                  5 * time.Second,
+		Timeout:               2 * time.Second,
+	}
+)
+
+func createEngine(ctx context.Context, cancel context.CancelFunc, setupKey string, i int, mgmtAddr string, signalAddr string) (*Engine, error) {
+	key, err := wgtypes.GeneratePrivateKey()
+	if err != nil {
+		return nil, err
+	}
+	mgmtClient, err := mgmt.NewClient(ctx, mgmtAddr, key, false)
+	if err != nil {
+		return nil, err
+	}
+	signalClient, err := signal.NewClient(ctx, signalAddr, key, false)
+	if err != nil {
+		return nil, err
+	}
+
+	info := system.GetInfo(ctx)
+	resp, err := mgmtClient.Register(setupKey, "", info, nil, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	var ifaceName string
+	if runtime.GOOS == "darwin" {
+		ifaceName = fmt.Sprintf("utun1%d", i)
+	} else {
+		ifaceName = fmt.Sprintf("wt%d", i)
+	}
+
+	wgPort := 33100 + i
+	conf := &EngineConfig{
+		WgIfaceName:  ifaceName,
+		WgAddr:       wgaddr.MustParseWGAddress(resp.PeerConfig.Address),
+		WgPrivateKey: key,
+		WgPort:       wgPort,
+		MTU:          iface.DefaultMTU,
+	}
+
+	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
+	e, err := NewEngine(ctx, cancel, conf, EngineServices{
+		SignalClient:   signalClient,
+		MgmClient:      mgmtClient,
+		RelayManager:   relayMgr,
+		StatusRecorder: peer.NewRecorder("https://mgm"),
+	}, MobileDependency{}), nil
+	e.ctx = ctx
+	return e, err
+}
+
+// startSignal serves the Signal gRPC service on a free localhost port and returns
+// the server (the caller stops it) together with the address it listens on.
+func startSignal(t *testing.T) (*grpc.Server, string, error) {
+	t.Helper()
+
+	s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))
+	lis, err := net.Listen("tcp", "localhost:0")
+	if err != nil {
+		return nil, "", fmt.Errorf("listen: %w", err)
+	}
+
+	srv, err := signalServer.NewServer(context.Background(), otel.Meter(""))
+	require.NoError(t, err)
+	proto.RegisterSignalExchangeServer(s, srv)
+	go func() {
+		if err := s.Serve(lis); err != nil { // local err: do not write the enclosing function's variable
+			log.Fatalf("failed to serve: %v", err)
+		}
+	}()
+
+	return s, lis.Addr().String(), nil
+}
+
+// startManagement serves the Management gRPC service, backed by a test store loaded
+// from testFile, on a free localhost port and returns the server and its address.
+func startManagement(t *testing.T, dataDir, testFile string) (*grpc.Server, string, error) {
+	t.Helper()
+
+	config := &config.Config{
+		Stuns:      []*config.Host{},
+		TURNConfig: &config.TURNConfig{},
+		Relay: &config.Relay{
+			Addresses:      []string{"127.0.0.1:1234"},
+			CredentialsTTL: util.Duration{Duration: time.Hour},
+			Secret:         "222222222222222222",
+		},
+		Signal: &config.Host{
+			Proto: "http",
+			URI:   "localhost:10000",
+		},
+		Datadir:    dataDir,
+		HttpConfig: nil,
+	}
+
+	lis, err := net.Listen("tcp", "localhost:0")
+	if err != nil {
+		return nil, "", err
+	}
+	s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))
+
+	store, cleanUp, err := store.NewTestStoreFromSQL(context.Background(), testFile, config.Datadir)
+	if err != nil {
+		return nil, "", err
+	}
+	t.Cleanup(cleanUp)
+
+	eventStore := &activity.InMemoryEventStore{}
+
+	permissionsManager := permissions.NewManager(store)
+	peersManager := peers.NewManager(store, permissionsManager)
+	jobManager := job.NewJobManager(nil, store, peersManager)
+
+	cacheStore, err := nbcache.NewStore(context.Background(), 100*time.Millisecond, 300*time.Millisecond, 100)
+	if err != nil {
+		return nil, "", err
+	}
+
+	ia, _ := validator.NewIntegratedValidator(context.Background(), peersManager, nil, eventStore, cacheStore)
+
+	metrics, err := telemetry.NewDefaultAppMetrics(context.Background())
+	require.NoError(t, err)
+
+	ctrl := gomock.NewController(t)
+	t.Cleanup(ctrl.Finish)
+	settingsMockManager := settings.NewMockManager(ctrl)
+	settingsMockManager.EXPECT().
+		GetSettings(gomock.Any(), gomock.Any(), gomock.Any()).
+		Return(&types.Settings{}, nil).
+		AnyTimes()
+	settingsMockManager.EXPECT().
+		GetExtraSettings(gomock.Any(), gomock.Any()).
+		Return(&types.ExtraSettings{}, nil).
+		AnyTimes()
+
+	groupsManager := groups.NewManagerMock()
+
+	updateManager := update_channel.NewPeersUpdateManager(metrics)
+	requestBuffer := server.NewAccountRequestBuffer(context.Background(), store)
+	networkMapController := controller.NewController(context.Background(), store, metrics, updateManager, requestBuffer, server.MockIntegratedValidator{}, settingsMockManager, "netbird.selfhosted", port_forwarding.NewControllerMock(), manager.NewEphemeralManager(store, peersManager), config)
+	accountManager, err := server.BuildManager(context.Background(), config, store, networkMapController, jobManager, nil, "", eventStore, nil, false, ia, metrics, port_forwarding.NewControllerMock(), settingsMockManager, permissionsManager, false, cacheStore)
+	if err != nil {
+		return nil, "", err
+	}
+
+	secretsManager, err := nbgrpc.NewTimeBasedAuthSecretsManager(updateManager, config.TURNConfig, config.Relay, settingsMockManager, groupsManager)
+	if err != nil {
+		return nil, "", err
+	}
+	mgmtServer, err := nbgrpc.NewServer(config, accountManager, settingsMockManager, jobManager, secretsManager, nil, nil, &server.MockIntegratedValidator{}, networkMapController, nil, nil)
+	if err != nil {
+		return nil, "", err
+	}
+	mgmtProto.RegisterManagementServiceServer(s, mgmtServer)
+	// Serve in the background with a goroutine-local err; the caller stops the server.
+	go func() {
+		if err := s.Serve(lis); err != nil {
+			log.Fatalf("failed to serve: %v", err)
+		}
+	}()
+
+	return s, lis.Addr().String(), nil
+}
+
+// getConnectedPeers returns how many peers in the engine's peer store report IsConnected; it holds syncMsgMux while counting.
+func getConnectedPeers(e *Engine) int {
+	e.syncMsgMux.Lock()
+	defer e.syncMsgMux.Unlock()
+	connected := 0
+	for _, id := range e.peerStore.PeersPubKey() {
+		conn, ok := e.peerStore.PeerConn(id)
+		if ok && conn.IsConnected() { // ok guards against a key whose connection lookup fails
+			connected++
+		}
+	}
+	return connected
+}
+
+func getPeers(e *Engine) int {
+	e.syncMsgMux.Lock()
+	defer e.syncMsgMux.Unlock()
+
+	return len(e.peerStore.PeersPubKey())
+}
--- a/client/internal/engine_test.go
+++ b/client/internal/engine_test.go
@@ -6,37 +6,18 @@ import (
 	"net"
 	"net/netip"
 	"os"
-	"runtime"
 	"strings"
 	"sync"
 	"testing"
 	"time"

-	"github.com/golang/mock/gomock"
-	"github.com/google/uuid"
-	log "github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
-	"go.opentelemetry.io/otel"
 	wgdevice "golang.zx2c4.com/wireguard/device"
 	"golang.zx2c4.com/wireguard/tun/netstack"
 	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
-	"google.golang.org/grpc"
-	"google.golang.org/grpc/keepalive"

 	"github.com/netbirdio/netbird/client/internal/stdnet"
-	"github.com/netbirdio/netbird/management/server/job"
-
-	"github.com/netbirdio/netbird/management/server/integrations/integrated_validator/validator"
-
-	"github.com/netbirdio/netbird/management/internals/controllers/network_map/controller"
-	"github.com/netbirdio/netbird/management/internals/controllers/network_map/update_channel"
-	"github.com/netbirdio/netbird/management/internals/modules/peers"
-	"github.com/netbirdio/netbird/management/internals/modules/peers/ephemeral/manager"
-	nbgrpc "github.com/netbirdio/netbird/management/internals/shared/grpc"
-
-	"github.com/netbirdio/netbird/management/internals/server/config"
-	"github.com/netbirdio/netbird/management/server/groups"

 	"github.com/netbirdio/netbird/client/iface"
 	"github.com/netbirdio/netbird/client/iface/configurer"
@@ -50,18 +31,7 @@ import (
 	icemaker "github.com/netbirdio/netbird/client/internal/peer/ice"
 	"github.com/netbirdio/netbird/client/internal/profilemanager"
 	"github.com/netbirdio/netbird/client/internal/routemanager"
-	nbssh "github.com/netbirdio/netbird/client/ssh"
-	"github.com/netbirdio/netbird/client/system"
 	nbdns "github.com/netbirdio/netbird/dns"
-	"github.com/netbirdio/netbird/management/server"
-	"github.com/netbirdio/netbird/management/server/activity"
-	nbcache "github.com/netbirdio/netbird/management/server/cache"
-	"github.com/netbirdio/netbird/management/server/integrations/port_forwarding"
-	"github.com/netbirdio/netbird/management/server/permissions"
-	"github.com/netbirdio/netbird/management/server/settings"
-	"github.com/netbirdio/netbird/management/server/store"
-	"github.com/netbirdio/netbird/management/server/telemetry"
-	"github.com/netbirdio/netbird/management/server/types"
 	"github.com/netbirdio/netbird/monotime"
 	"github.com/netbirdio/netbird/route"
 	mgmt "github.com/netbirdio/netbird/shared/management/client"
@@ -69,25 +39,9 @@ import (
 	"github.com/netbirdio/netbird/shared/netiputil"
 	relayClient "github.com/netbirdio/netbird/shared/relay/client"
 	signal "github.com/netbirdio/netbird/shared/signal/client"
-	"github.com/netbirdio/netbird/shared/signal/proto"
-	signalServer "github.com/netbirdio/netbird/signal/server"
 	"github.com/netbirdio/netbird/util"
 )

-var (
-	kaep = keepalive.EnforcementPolicy{
-		MinTime:             15 * time.Second,
-		PermitWithoutStream: true,
-	}
-
-	kasp = keepalive.ServerParameters{
-		MaxConnectionIdle:     15 * time.Second,
-		MaxConnectionAgeGrace: 5 * time.Second,
-		Time:                  5 * time.Second,
-		Timeout:               2 * time.Second,
-	}
-)
-
 type MockWGIface struct {
 	CreateFunc                 func() error
 	CreateOnAndroidFunc        func(routeRange []string, ip string, domains []string) error
@@ -234,129 +188,6 @@ func TestMain(m *testing.M) {
 	os.Exit(code)
 }

-func TestEngine_SSH(t *testing.T) {
-	key, err := wgtypes.GeneratePrivateKey()
-	if err != nil {
-		t.Fatal(err)
-		return
-	}
-
-	sshKey, err := nbssh.GeneratePrivateKey(nbssh.ED25519)
-	if err != nil {
-		t.Fatal(err)
-		return
-	}
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
-	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
-	engine := NewEngine(
-		ctx, cancel,
-		&EngineConfig{
-			WgIfaceName:      "utun101",
-			WgAddr:           wgaddr.MustParseWGAddress("100.64.0.1/24"),
-			WgPrivateKey:     key,
-			WgPort:           33100,
-			ServerSSHAllowed: true,
-			MTU:              iface.DefaultMTU,
-			SSHKey:           sshKey,
-		},
-		EngineServices{
-			SignalClient:   &signal.MockClient{},
-			MgmClient:      &mgmt.MockClient{},
-			RelayManager:   relayMgr,
-			StatusRecorder: peer.NewRecorder("https://mgm"),
-		},
-		MobileDependency{},
-	)
-
-	engine.dnsServer = &dns.MockServer{
-		UpdateDNSServerFunc: func(serial uint64, update nbdns.Config) error { return nil },
-	}
-
-	err = engine.Start(nil, nil)
-	require.NoError(t, err)
-
-	defer func() {
-		err := engine.Stop()
-		if err != nil {
-			return
-		}
-	}()
-
-	peerWithSSH := &mgmtProto.RemotePeerConfig{
-		WgPubKey:   "MNHf3Ma6z6mdLbriAJbqhX7+nM/B71lgw2+91q3LfhU=",
-		AllowedIps: []string{"100.64.0.21/24"},
-		SshConfig: &mgmtProto.SSHConfig{
-			SshPubKey: []byte("ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFATYCqaQw/9id1Qkq3n16JYhDhXraI6Pc1fgB8ynEfQ"),
-		},
-	}
-
-	// SSH server is not enabled so SSH config of a remote peer should be ignored
-	networkMap := &mgmtProto.NetworkMap{
-		Serial:             6,
-		PeerConfig:         nil,
-		RemotePeers:        []*mgmtProto.RemotePeerConfig{peerWithSSH},
-		RemotePeersIsEmpty: false,
-	}
-
-	err = engine.updateNetworkMap(networkMap)
-	require.NoError(t, err)
-
-	assert.Nil(t, engine.sshServer)
-
-	// SSH server is enabled, therefore SSH config should be applied
-	networkMap = &mgmtProto.NetworkMap{
-		Serial: 7,
-		PeerConfig: &mgmtProto.PeerConfig{Address: "100.64.0.1/24",
-			SshConfig: &mgmtProto.SSHConfig{
-				SshEnabled: true,
-				JwtConfig: &mgmtProto.JWTConfig{
-					Issuer:       "test-issuer",
-					Audience:     "test-audience",
-					KeysLocation: "test-keys",
-					MaxTokenAge:  3600,
-				},
-			}},
-		RemotePeers:        []*mgmtProto.RemotePeerConfig{peerWithSSH},
-		RemotePeersIsEmpty: false,
-	}
-
-	err = engine.updateNetworkMap(networkMap)
-	require.NoError(t, err)
-
-	time.Sleep(250 * time.Millisecond)
-	assert.NotNil(t, engine.sshServer)
-
-	// now remove peer
-	networkMap = &mgmtProto.NetworkMap{
-		Serial:             8,
-		RemotePeers:        []*mgmtProto.RemotePeerConfig{},
-		RemotePeersIsEmpty: false,
-	}
-
-	err = engine.updateNetworkMap(networkMap)
-	require.NoError(t, err)
-
-	// time.Sleep(250 * time.Millisecond)
-	assert.NotNil(t, engine.sshServer)
-
-	// now disable SSH server
-	networkMap = &mgmtProto.NetworkMap{
-		Serial: 9,
-		PeerConfig: &mgmtProto.PeerConfig{Address: "100.64.0.1/24",
-			SshConfig: &mgmtProto.SSHConfig{SshEnabled: false}},
-		RemotePeers:        []*mgmtProto.RemotePeerConfig{peerWithSSH},
-		RemotePeersIsEmpty: false,
-	}
-
-	err = engine.updateNetworkMap(networkMap)
-	require.NoError(t, err)
-
-	assert.Nil(t, engine.sshServer)
-}
-
 func TestEngine_SSHUpdateLogic(t *testing.T) {
 	// Test that SSH server start/stop logic works based on config
 	engine := &Engine{
@@ -426,7 +257,7 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {
 		return
 	}

-	ctx, cancel := context.WithCancel(context.Background())
+	ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
 	defer cancel()

 	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
@@ -602,7 +433,7 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {

 	for _, c := range []testCase{case1, case2, case3, case4, case5, case6} {
 		t.Run(c.name, func(t *testing.T) {
-			err = engine.updateNetworkMap(c.networkMap)
+			_, err = engine.updateNetworkMap(c.networkMap, maxPeersPerSyncPass, true)
 			if err != nil {
 				t.Fatal(err)
 				return
@@ -629,97 +460,47 @@ func TestEngine_UpdateNetworkMap(t *testing.T) {
 			}
 		})
 	}
-}

-func TestEngine_Sync(t *testing.T) {
-	key, err := wgtypes.GeneratePrivateKey()
-	if err != nil {
-		t.Fatal(err)
-		return
-	}
-
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
-	// feed updates to Engine via mocked Management client
-	updates := make(chan *mgmtProto.SyncResponse)
-	defer close(updates)
-	syncFunc := func(ctx context.Context, info *system.Info, msgHandler func(msg *mgmtProto.SyncResponse) error) error {
-		for msg := range updates {
-			err := msgHandler(msg)
-			if err != nil {
-				t.Fatal(err)
-			}
-		}
-		return nil
-	}
-	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
-	engine := NewEngine(ctx, cancel, &EngineConfig{
-		WgIfaceName:  "utun103",
-		WgAddr:       wgaddr.MustParseWGAddress("100.64.0.1/24"),
-		WgPrivateKey: key,
-		WgPort:       33100,
-		MTU:          iface.DefaultMTU,
-	}, EngineServices{
-		SignalClient:   &signal.MockClient{},
-		MgmClient:      &mgmt.MockClient{SyncFunc: syncFunc},
-		RelayManager:   relayMgr,
-		StatusRecorder: peer.NewRecorder("https://mgm"),
-	}, MobileDependency{})
-	engine.ctx = ctx
-
-	engine.dnsServer = &dns.MockServer{
-		UpdateDNSServerFunc: func(serial uint64, update nbdns.Config) error { return nil },
-	}
-
-	defer func() {
-		err := engine.Stop()
-		if err != nil {
-			return
-		}
-	}()
-
-	err = engine.Start(nil, nil)
-	if err != nil {
-		t.Fatal(err)
-		return
-	}
-
-	peer1 := &mgmtProto.RemotePeerConfig{
-		WgPubKey:   "RRHf3Ma6z6mdLbriAJbqhX7+nM/B71lgw2+91q3LfhU=",
-		AllowedIps: []string{"100.64.0.10/24"},
-	}
-	peer2 := &mgmtProto.RemotePeerConfig{
-		WgPubKey:   "LLHf3Ma6z6mdLbriAJbqhX9+nM/B71lgw2+91q3LlhU=",
-		AllowedIps: []string{"100.64.0.11/24"},
-	}
-	peer3 := &mgmtProto.RemotePeerConfig{
-		WgPubKey:   "GGHf3Ma6z6mdLbriAJbqhX9+nM/B71lgw2+91q3LlhU=",
-		AllowedIps: []string{"100.64.0.12/24"},
-	}
-	// 1st update with just 1 peer and serial larger than the current serial of the engine => apply update
-	updates <- &mgmtProto.SyncResponse{
-		NetworkMap: &mgmtProto.NetworkMap{
-			Serial:             10,
-			PeerConfig:         nil,
-			RemotePeers:        []*mgmtProto.RemotePeerConfig{peer1, peer2, peer3},
-			RemotePeersIsEmpty: false,
-		},
-	}
-
-	timeout := time.After(time.Second * 2)
-	for {
-		select {
-		case <-timeout:
-			t.Fatalf("timeout while waiting for test to finish")
-			return
-		default:
+	// chunked apply: with a per-pass cap smaller than the number of peers, a
+	// single updateNetworkMap applies one batch and reports more==true; the
+	// caller re-runs until convergence. (engine currently holds 0 peers.)
+	t.Run("chunked add converges over multiple passes", func(t *testing.T) {
+		nm := &mgmtProto.NetworkMap{
+			Serial:      6,
+			RemotePeers: []*mgmtProto.RemotePeerConfig{peer1, peer2, peer3},
 		}

-		if getPeers(engine) == 3 && engine.networkSerial == 10 {
-			break
+		more, err := engine.updateNetworkMap(nm, 1, true)
+		require.NoError(t, err)
+		require.True(t, more, "pass 1 should signal more")
+		require.Len(t, engine.peerStore.PeersPubKey(), 1)
+
+		more, err = engine.updateNetworkMap(nm, 1, false)
+		require.NoError(t, err)
+		require.True(t, more, "pass 2 should signal more")
+		require.Len(t, engine.peerStore.PeersPubKey(), 2)
+
+		more, err = engine.updateNetworkMap(nm, 1, false)
+		require.NoError(t, err)
+		require.False(t, more, "pass 3 should converge")
+		require.Len(t, engine.peerStore.PeersPubKey(), 3)
+	})
+
+	t.Run("chunked remove converges over multiple passes", func(t *testing.T) {
+		nm := &mgmtProto.NetworkMap{
+			Serial:      7,
+			RemotePeers: []*mgmtProto.RemotePeerConfig{peer1}, // remove peer2, peer3
 		}
-	}
+
+		more, err := engine.updateNetworkMap(nm, 1, true)
+		require.NoError(t, err)
+		require.True(t, more, "pass 1 should signal more (2 to remove, cap 1)")
+
+		more, err = engine.updateNetworkMap(nm, 1, false)
+		require.NoError(t, err)
+		require.False(t, more, "pass 2 should converge")
+		require.Len(t, engine.peerStore.PeersPubKey(), 1)
+	})
 }

 func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
@@ -817,7 +598,7 @@ func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
 				return
 			}

-			ctx, cancel := context.WithCancel(context.Background())
+			ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
 			defer cancel()

 			wgIfaceName := fmt.Sprintf("utun%d", 104+n)
@@ -890,7 +671,7 @@ func TestEngine_UpdateNetworkMapWithRoutes(t *testing.T) {
 				}
 			}()

-			err = engine.updateNetworkMap(testCase.networkMap)
+			_, err = engine.updateNetworkMap(testCase.networkMap, maxPeersPerSyncPass, true)
 			assert.NoError(t, err, "shouldn't return error")
 			assert.Equal(t, testCase.expectedSerial, input.inputSerial, "serial should match")
 			assert.Len(t, input.clientRoutes, testCase.expectedLen, "clientRoutes len should match")
@@ -1024,7 +805,7 @@ func TestEngine_UpdateNetworkMapWithDNSUpdate(t *testing.T) {
 				return
 			}

-			ctx, cancel := context.WithCancel(context.Background())
+			ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
 			defer cancel()

 			wgIfaceName := fmt.Sprintf("utun%d", 104+n)
@@ -1094,7 +875,7 @@ func TestEngine_UpdateNetworkMapWithDNSUpdate(t *testing.T) {
 				}
 			}()

-			err = engine.updateNetworkMap(testCase.networkMap)
+			_, err = engine.updateNetworkMap(testCase.networkMap, maxPeersPerSyncPass, true)
 			assert.NoError(t, err, "shouldn't return error")
 			assert.Equal(t, testCase.expectedSerial, input.inputSerial, "serial should match")
 			assert.Len(t, input.inputNSGroups, testCase.expectedZonesLen, "zones len should match")
@@ -1105,104 +886,6 @@ func TestEngine_UpdateNetworkMapWithDNSUpdate(t *testing.T) {
 	}
 }

-func TestEngine_MultiplePeers(t *testing.T) {
-	// log.SetLevel(log.DebugLevel)
-
-	ctx, cancel := context.WithCancel(CtxInitState(context.Background()))
-	defer cancel()
-
-	sigServer, signalAddr, err := startSignal(t)
-	if err != nil {
-		t.Fatal(err)
-		return
-	}
-	defer sigServer.Stop()
-	mgmtServer, mgmtAddr, err := startManagement(t, t.TempDir(), "../testdata/store.sql")
-	if err != nil {
-		t.Fatal(err)
-		return
-	}
-	defer mgmtServer.GracefulStop()
-
-	setupKey := "A2C8E62B-38F5-4553-B31E-DD66C696CEBB"
-
-	mu := sync.Mutex{}
-	engines := []*Engine{}
-	numPeers := 10
-	wg := sync.WaitGroup{}
-	wg.Add(numPeers)
-	// create and start peers
-	for i := 0; i < numPeers; i++ {
-		j := i
-		go func() {
-			engine, err := createEngine(ctx, cancel, setupKey, j, mgmtAddr, signalAddr)
-			if err != nil {
-				wg.Done()
-				t.Errorf("unable to create the engine for peer %d with error %v", j, err)
-				return
-			}
-			engine.dnsServer = &dns.MockServer{}
-			mu.Lock()
-			defer mu.Unlock()
-			guid := fmt.Sprintf("{%s}", uuid.New().String())
-			device.CustomWindowsGUIDString = strings.ToLower(guid)
-			err = engine.Start(nil, nil)
-			if err != nil {
-				t.Errorf("unable to start engine for peer %d with error %v", j, err)
-				wg.Done()
-				return
-			}
-			engines = append(engines, engine)
-			wg.Done()
-		}()
-	}
-
-	// wait until all have been created and started
-	wg.Wait()
-	if len(engines) != numPeers {
-		t.Fatal("not all peers was started")
-	}
-	// check whether all the peer have expected peers connected
-
-	expectedConnected := numPeers * (numPeers - 1)
-
-	// adjust according to timeouts
-	timeout := 50 * time.Second
-	timeoutChan := time.After(timeout)
-	ticker := time.NewTicker(time.Second)
-	defer ticker.Stop()
-loop:
-	for {
-		select {
-		case <-timeoutChan:
-			t.Fatalf("waiting for expected connections timeout after %s", timeout.String())
-			break loop
-		case <-ticker.C:
-			totalConnected := 0
-			for _, engine := range engines {
-				totalConnected += getConnectedPeers(engine)
-			}
-			if totalConnected == expectedConnected {
-				log.Infof("total connected=%d", totalConnected)
-				break loop
-			}
-			log.Infof("total connected=%d", totalConnected)
-		}
-	}
-	// cleanup test
-	for n, peerEngine := range engines {
-		t.Logf("stopping peer with interface %s from multipeer test, loopIndex %d", peerEngine.wgInterface.Name(), n)
-		errStop := peerEngine.mgmClient.Close()
-		if errStop != nil {
-			log.Infoln("got error trying to close management clients from engine: ", errStop)
-		}
-		errStop = peerEngine.Stop()
-		if errStop != nil {
-			log.Infoln("got error trying to close testing peers engine: ", errStop)
-		}
-	}
-}
-
 func Test_ParseNATExternalIPMappings(t *testing.T) {
 	ifaceList, err := net.Interfaces()
 	if err != nil {
@@ -1526,187 +1209,6 @@ func TestCompareNetIPLists(t *testing.T) {
 	}
 }

-func createEngine(ctx context.Context, cancel context.CancelFunc, setupKey string, i int, mgmtAddr string, signalAddr string) (*Engine, error) {
-	key, err := wgtypes.GeneratePrivateKey()
-	if err != nil {
-		return nil, err
-	}
-	mgmtClient, err := mgmt.NewClient(ctx, mgmtAddr, key, false)
-	if err != nil {
-		return nil, err
-	}
-	signalClient, err := signal.NewClient(ctx, signalAddr, key, false)
-	if err != nil {
-		return nil, err
-	}
-
-	info := system.GetInfo(ctx)
-	resp, err := mgmtClient.Register(setupKey, "", info, nil, nil)
-	if err != nil {
-		return nil, err
-	}
-
-	var ifaceName string
-	if runtime.GOOS == "darwin" {
-		ifaceName = fmt.Sprintf("utun1%d", i)
-	} else {
-		ifaceName = fmt.Sprintf("wt%d", i)
-	}
-
-	wgPort := 33100 + i
-	conf := &EngineConfig{
-		WgIfaceName:  ifaceName,
-		WgAddr:       wgaddr.MustParseWGAddress(resp.PeerConfig.Address),
-		WgPrivateKey: key,
-		WgPort:       wgPort,
-		MTU:          iface.DefaultMTU,
-	}
-
-	relayMgr := relayClient.NewManager(ctx, nil, key.PublicKey().String(), iface.DefaultMTU)
-	e, err := NewEngine(ctx, cancel, conf, EngineServices{
-		SignalClient:   signalClient,
-		MgmClient:      mgmtClient,
-		RelayManager:   relayMgr,
-		StatusRecorder: peer.NewRecorder("https://mgm"),
-	}, MobileDependency{}), nil
-	e.ctx = ctx
-	return e, err
-}
-
-func startSignal(t *testing.T) (*grpc.Server, string, error) {
-	t.Helper()
-
-	s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))
-
-	lis, err := net.Listen("tcp", "localhost:0")
-	if err != nil {
-		log.Fatalf("failed to listen: %v", err)
-	}
-
-	srv, err := signalServer.NewServer(context.Background(), otel.Meter(""))
-	require.NoError(t, err)
-	proto.RegisterSignalExchangeServer(s, srv)
-
-	go func() {
-		if err = s.Serve(lis); err != nil {
-			log.Fatalf("failed to serve: %v", err)
-		}
-	}()
-
-	return s, lis.Addr().String(), nil
-}
-
-func startManagement(t *testing.T, dataDir, testFile string) (*grpc.Server, string, error) {
-	t.Helper()
-
-	config := &config.Config{
-		Stuns:      []*config.Host{},
-		TURNConfig: &config.TURNConfig{},
-		Relay: &config.Relay{
-			Addresses:      []string{"127.0.0.1:1234"},
-			CredentialsTTL: util.Duration{Duration: time.Hour},
-			Secret:         "222222222222222222",
-		},
-		Signal: &config.Host{
-			Proto: "http",
-			URI:   "localhost:10000",
-		},
-		Datadir:    dataDir,
-		HttpConfig: nil,
-	}
-
-	lis, err := net.Listen("tcp", "localhost:0")
-	if err != nil {
-		return nil, "", err
-	}
-	s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))
-
-	store, cleanUp, err := store.NewTestStoreFromSQL(context.Background(), testFile, config.Datadir)
-	if err != nil {
-		return nil, "", err
-	}
-	t.Cleanup(cleanUp)
-
-	eventStore := &activity.InMemoryEventStore{}
-	if err != nil {
-		return nil, "", err
-	}
-
-	permissionsManager := permissions.NewManager(store)
-	peersManager := peers.NewManager(store, permissionsManager)
-	jobManager := job.NewJobManager(nil, store, peersManager)
-
-	cacheStore, err := nbcache.NewStore(context.Background(), 100*time.Millisecond, 300*time.Millisecond, 100)
-	if err != nil {
-		return nil, "", err
-	}
-
-	ia, _ := validator.NewIntegratedValidator(context.Background(), peersManager, nil, eventStore, cacheStore)
-
-	metrics, err := telemetry.NewDefaultAppMetrics(context.Background())
-	require.NoError(t, err)
-
-	ctrl := gomock.NewController(t)
-	t.Cleanup(ctrl.Finish)
-	settingsMockManager := settings.NewMockManager(ctrl)
-	settingsMockManager.EXPECT().
-		GetSettings(gomock.Any(), gomock.Any(), gomock.Any()).
-		Return(&types.Settings{}, nil).
-		AnyTimes()
-	settingsMockManager.EXPECT().
-		GetExtraSettings(gomock.Any(), gomock.Any()).
-		Return(&types.ExtraSettings{}, nil).
-		AnyTimes()
-
-	groupsManager := groups.NewManagerMock()
-
-	updateManager := update_channel.NewPeersUpdateManager(metrics)
-	requestBuffer := server.NewAccountRequestBuffer(context.Background(), store)
-	networkMapController := controller.NewController(context.Background(), store, metrics, updateManager, requestBuffer, server.MockIntegratedValidator{}, settingsMockManager, "netbird.selfhosted", port_forwarding.NewControllerMock(), manager.NewEphemeralManager(store, peersManager), config)
-	accountManager, err := server.BuildManager(context.Background(), config, store, networkMapController, jobManager, nil, "", eventStore, nil, false, ia, metrics, port_forwarding.NewControllerMock(), settingsMockManager, permissionsManager, false, cacheStore)
-	if err != nil {
-		return nil, "", err
-	}
-
-	secretsManager, err := nbgrpc.NewTimeBasedAuthSecretsManager(updateManager, config.TURNConfig, config.Relay, settingsMockManager, groupsManager)
-	if err != nil {
-		return nil, "", err
-	}
-	mgmtServer, err := nbgrpc.NewServer(config, accountManager, settingsMockManager, jobManager, secretsManager, nil, nil, &server.MockIntegratedValidator{}, networkMapController, nil, nil)
-	if err != nil {
-		return nil, "", err
-	}
-	mgmtProto.RegisterManagementServiceServer(s, mgmtServer)
-	go func() {
-		if err = s.Serve(lis); err != nil {
-			log.Fatalf("failed to serve: %v", err)
-		}
-	}()
-
-	return s, lis.Addr().String(), nil
-}
-
-// getConnectedPeers returns a connection Status or nil if peer connection wasn't found
-func getConnectedPeers(e *Engine) int {
-	e.syncMsgMux.Lock()
-	defer e.syncMsgMux.Unlock()
-	i := 0
-	for _, id := range e.peerStore.PeersPubKey() {
-		conn, _ := e.peerStore.PeerConn(id)
-		if conn.IsConnected() {
-			i++
-		}
-	}
-	return i
-}
-
-func getPeers(e *Engine) int {
-	e.syncMsgMux.Lock()
-	defer e.syncMsgMux.Unlock()
-
-	return len(e.peerStore.PeersPubKey())
-}
-
 func mustEncodePrefix(t *testing.T, p netip.Prefix) []byte {
 	t.Helper()
 	b, err := netiputil.EncodePrefix(p)
--- a/client/internal/lazyconn/activity/listener_bind.go
+++ b/client/internal/lazyconn/activity/listener_bind.go
@@ -119,10 +119,6 @@ func (d *BindListener) ReadPackets() {
 	}

 	d.peerCfg.Log.Debugf("removing lazy endpoint for peer %s", d.peerCfg.PublicKey)
-	if err := d.wgIface.RemovePeer(d.peerCfg.PublicKey); err != nil {
-		d.peerCfg.Log.Errorf("failed to remove endpoint: %s", err)
-	}
-
 	_ = d.lazyConn.Close()
 	d.bind.RemoveEndpoint(d.fakeIP)
 	d.done.Done()
--- a/client/internal/mapsync.go
+++ b/client/internal/mapsync.go
@@ -0,0 +1,190 @@
+package internal
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	log "github.com/sirupsen/logrus"
+
+	mgmProto "github.com/netbirdio/netbird/shared/management/proto"
+)
+
+// mapStateManager is the single read/write point between the management stream
+// (writes) and the convergence loop (reads/applies).
+//
+// The stream calls SetTarget with the latest full SyncResponse — the complete
+// desired state. A single background goroutine (run) applies it to the engine in
+// bounded passes via apply() until converged, releasing syncMsgMux between passes
+// so other subsystems interleave. If a newer update arrives mid-flight, the loop
+// coalesces: it keeps converging toward the latest target and the intermediate one
+// is SKIPPED — never applied on its own (logged, no onConverged).
+//
+// Convergence is a single comparison: appliedGen == targetGen. targetGen
+// increments on every SetTarget (an internal generation counter, so it also covers
+// config-only updates that carry no network-map serial).
+//
+// onConverged fires exactly once for every map that is actually processed (i.e.
+// converged as the target), and for no others. Skipped/superseded maps and
+// dropped-on-error maps do NOT fire it. So "sync finished in X" / RecordSyncDuration
+// always corresponds to a real, completed alignment.
+type mapStateManager struct {
+	// apply performs one bounded apply pass and reports whether more passes are needed.
+	// firstPass is true on the first pass of a given target, so the caller can run
+	// wholesale (firewall/routes/DNS/forward-rules) once per target and skip it on the
+	// re-runs that only drain the bounded peer batches. The manager owns this signal
+	// because it owns the convergence boundary; the engine need not track serials for it.
+	apply func(update *mgmProto.SyncResponse, firstPass bool) (bool, error)
+	// onConverged is called once per processed map, with the elapsed time since that
+	// map was received (for the sync-duration metric / "sync finished" log).
+	onConverged func(time.Duration)
+	// persist snapshots an update to disk for restore-on-restart. Called once per
+	// update received from management (in SetTarget), including ones later coalesced
+	// or skipped from apply, so the on-disk state mirrors what management last sent.
+	// The impl skips config-only updates (nil NetworkMap). May be nil.
+	persist func(*mgmProto.SyncResponse)
+
+	mu          sync.Mutex
+	target      *mgmProto.SyncResponse
+	targetGen   uint64
+	appliedGen  uint64
+	targetSetAt time.Time
+
+	wake chan struct{}
+}
+
+func newMapStateManager(apply func(update *mgmProto.SyncResponse, firstPass bool) (bool, error), persist func(*mgmProto.SyncResponse), onConverged func(time.Duration)) *mapStateManager {
+	return &mapStateManager{
+		apply:       apply,
+		persist:     persist,
+		onConverged: onConverged,
+		wake:        make(chan struct{}, 1),
+	}
+}
+
+// SetTarget records the latest update as the desired state, wakes the loop, and
+// runs persist synchronously; it does not wait for convergence (background).
+// Serial-based staleness of the network map is still enforced inside apply (updateNetworkMap).
+func (m *mapStateManager) SetTarget(update *mgmProto.SyncResponse) error {
+	m.mu.Lock()
+	// A target that has not settled yet (targetGen > appliedGen) is being superseded
+	// before it converged: we coalesce to the latest map and never apply this one on
+	// its own. It is SKIPPED — logged here, and it will not fire onConverged.
+	if m.target != nil && m.targetGen > m.appliedGen {
+		log.Debugf("sync map (gen %d) superseded before convergence, skipping", m.targetGen)
+	}
+	m.target = m.mergeTarget(m.target, update)
+	// Bump an internal generation counter, NOT the map serial: config-only updates
+	// (relay token rotation, STUN/TURN) arrive with NetworkMap == nil and carry no
+	// serial, yet must still be applied. Every SetTarget is therefore a distinct
+	// target regardless of payload. Map-serial staleness is enforced separately
+	// inside apply (updateNetworkMap).
+	m.targetGen++
+	m.targetSetAt = time.Now()
+	m.mu.Unlock()
+
+	select {
+	case m.wake <- struct{}{}:
+	default:
+	}
+
+	// Persist every update received from management — once per update (not per apply
+	// pass), and including ones that get coalesced/skipped from apply, so the on-disk
+	// state always reflects the latest map management sent. Done after waking the loop
+	// so convergence can start in parallel with the disk write. The persist impl skips
+	// config-only updates (nil NetworkMap).
+	if m.persist != nil {
+		m.persist(update)
+	}
+	return nil
+}
+
+// mergeTarget combines the currently pending target with a freshly received update
+// and returns the new desired state. It is called under m.mu from SetTarget and is
+// the single seam where the replace-vs-squash decision lives.
+//
+// Today management always sends a FULL map (the complete desired state), so the
+// update simply replaces whatever was pending — prev is ignored. When management
+// starts sending incremental/delta updates, squash `update` onto `prev` here; the
+// rest of the manager (generation tracking, convergence, signaling) is unaffected
+// because it already treats target as "the complete desired state, whatever it is".
+func (m *mapStateManager) mergeTarget(prev, update *mgmProto.SyncResponse) *mgmProto.SyncResponse {
+	return update
+}
+
+// run drives convergence until ctx is done. It is meant to run in its own goroutine.
+func (m *mapStateManager) run(ctx context.Context) {
+	// passGen is the generation of the most recent apply() call (0 = none). A pass is
+	// the first for its target when its generation differs from the previous one —
+	// true on a fresh target and on a coalesced switch to a newer target mid-flight.
+	var passGen uint64
+	for {
+		m.mu.Lock()
+		target, tg, ag := m.target, m.targetGen, m.appliedGen
+		m.mu.Unlock()
+
+		// Fully converged (or nothing yet): block until a new target arrives.
+		if target == nil || ag == tg {
+			select {
+			case <-ctx.Done():
+				return
+			case <-m.wake:
+				continue
+			}
+		}
+
+		firstPass := tg != passGen
+		passGen = tg
+		more, err := m.apply(target, firstPass)
+		if err != nil {
+			if ctx.Err() != nil {
+				return
+			}
+			// Log and DROP this target — do not retry it. A deterministic failure
+			// (e.g. a malformed peer in the map) would otherwise spin every pass
+			// making no progress. Management is the source of truth and re-delivers
+			// the full map on the next sync, so dropping is safe; peers already
+			// applied this convergence stay (idempotent diffs) and the remainder is
+			// reconciled by the next target. Mirrors the legacy handleSync path,
+			// where the apply error was logged by the gRPC client and the update
+			// dropped. No onConverged: this target did not converge.
+			log.Errorf("apply sync pass, dropping update: %v", err)
+			m.settle(tg, false)
+			continue
+		}
+
+		if more {
+			// keep converging the current target; syncMsgMux was released by apply
+			// between passes so other subsystems interleave.
+			continue
+		}
+
+		// This pass converged. Mark applied and signal this one map.
+		m.settle(tg, true)
+		// if a newer target arrived mid-pass, settle is a no-op (targetGen != tg) and
+		// ag<tg next iteration -> apply it; this generation was skipped (logged in
+		// SetTarget) and is not signaled.
+	}
+}
+
+// settle marks generation tg as processed so the loop goes idle instead of
+// re-applying the same target. It is a no-op when a newer target arrived during the
+// pass (targetGen != tg), leaving appliedGen behind so that target re-applies — the
+// just-finished generation was already counted as skipped.
+//
+// When signal is true (the pass converged) it fires onConverged once for this map;
+// when false (the target was dropped on error) it does not — the map did not converge.
+func (m *mapStateManager) settle(tg uint64, signal bool) {
+	m.mu.Lock()
+	if m.targetGen != tg {
+		m.mu.Unlock()
+		return
+	}
+	m.appliedGen = tg
+	setAt := m.targetSetAt
+	m.mu.Unlock()
+
+	if signal && m.onConverged != nil {
+		m.onConverged(time.Since(setAt))
+	}
+}
--- a/client/internal/mapsync_test.go
+++ b/client/internal/mapsync_test.go
@@ -0,0 +1,242 @@
+package internal
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	mgmProto "github.com/netbirdio/netbird/shared/management/proto"
+)
+
+// TestMapStateManager_ConvergesThenStops checks that one target converges on the 3rd apply
+// pass (firstPass true only on pass 1), fires onConverged, and is not applied again while idle.
+func TestMapStateManager_ConvergesThenStops(t *testing.T) {
+	var passes int32
+	var firstPasses int32
+	converged := make(chan struct{}, 1)
+
+	apply := func(_ *mgmProto.SyncResponse, firstPass bool) (bool, error) {
+		n := atomic.AddInt32(&passes, 1)
+		if firstPass {
+			atomic.AddInt32(&firstPasses, 1)
+		}
+		return n < 3, nil // more on pass 1 and 2, converge on pass 3
+	}
+	m := newMapStateManager(apply, nil, func(time.Duration) { converged <- struct{}{} })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+
+	select {
+	case <-converged:
+	case <-time.After(2 * time.Second):
+		t.Fatal("manager did not converge")
+	}
+	require.EqualValues(t, 3, atomic.LoadInt32(&passes))
+	require.EqualValues(t, 1, atomic.LoadInt32(&firstPasses), "firstPass true only on pass 1, false on re-runs of the same target")
+
+	// once converged the loop blocks: no further apply calls
+	time.Sleep(100 * time.Millisecond)
+	require.EqualValues(t, 3, atomic.LoadInt32(&passes), "apply must not run after convergence")
+}
+
+// TestMapStateManager_PersistsOncePerUpdate checks that persist is called once for an
+// update even though that update takes three apply passes to converge.
+func TestMapStateManager_PersistsOncePerUpdate(t *testing.T) {
+	var passes, persists int32
+	converged := make(chan struct{}, 1)
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		n := atomic.AddInt32(&passes, 1)
+		return n < 3, nil // 3 passes for one target
+	}
+	persist := func(*mgmProto.SyncResponse) { atomic.AddInt32(&persists, 1) }
+	m := newMapStateManager(apply, persist, func(time.Duration) { converged <- struct{}{} })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-converged:
+	case <-time.After(2 * time.Second):
+		t.Fatal("did not converge")
+	}
+	require.EqualValues(t, 3, atomic.LoadInt32(&passes))
+	require.EqualValues(t, 1, atomic.LoadInt32(&persists), "persist once per update, not per pass")
+}
+
+// TestMapStateManager_PersistsEveryUpdateIncludingSkipped checks that persist runs for both of
+// two back-to-back updates while apply is blocked, i.e. also for one superseded before converging.
+func TestMapStateManager_PersistsEveryUpdateIncludingSkipped(t *testing.T) {
+	release := make(chan struct{})
+	var persists int32
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		<-release // hold the first apply so the second update coalesces/skips
+		return false, nil
+	}
+	persist := func(*mgmProto.SyncResponse) { atomic.AddInt32(&persists, 1) }
+	m := newMapStateManager(apply, persist, nil)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{})) // map1 -> apply blocks
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{})) // map2 supersedes map1 (skipped from apply)
+	close(release)
+
+	// both updates persisted even though map1 is skipped from apply
+	require.Eventually(t, func() bool { return atomic.LoadInt32(&persists) == 2 }, 2*time.Second, 10*time.Millisecond)
+}
+
+// TestMapStateManager_SignalsEachProcessedMap checks that each of three targets, sent only after
+// the previous one converged, fires onConverged once and that no extra signal follows.
+func TestMapStateManager_SignalsEachProcessedMap(t *testing.T) {
+	converged := make(chan struct{}, 8)
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		return false, nil // converge in one pass
+	}
+	m := newMapStateManager(apply, nil, func(time.Duration) { converged <- struct{}{} })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	const maps = 3
+	for i := 0; i < maps; i++ {
+		require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+		select { // wait for this map to converge before sending the next (no coalescing)
+		case <-converged:
+		case <-time.After(2 * time.Second):
+			t.Fatalf("map %d not signaled", i)
+		}
+	}
+
+	// no extra signals once the stream goes quiet
+	select {
+	case <-converged:
+		t.Fatal("unexpected extra onConverged")
+	case <-time.After(100 * time.Millisecond):
+	}
+}
+
+// TestMapStateManager_SkippedMapNotSignaled checks that a target superseded while its apply is
+// in flight does not fire onConverged: two applies run but only one onConverged is recorded.
+func TestMapStateManager_SkippedMapNotSignaled(t *testing.T) {
+	release := make(chan struct{})
+	var applies, converged atomic.Int32
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		applies.Add(1)
+		<-release // hold the first apply in-flight so we can queue a newer target
+		return false, nil
+	}
+	m := newMapStateManager(apply, nil, func(time.Duration) { converged.Add(1) })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	// map1 is picked up; its apply blocks on release
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	require.Eventually(t, func() bool { return applies.Load() >= 1 }, 2*time.Second, 5*time.Millisecond)
+
+	// map2 supersedes map1 before it settled -> map1 is skipped
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	close(release) // let both applies proceed
+
+	// only the processed (latest) map signals; the skipped one does not
+	require.Eventually(t, func() bool { return converged.Load() == 1 }, 2*time.Second, 10*time.Millisecond)
+	time.Sleep(150 * time.Millisecond)
+	require.EqualValues(t, 1, converged.Load(), "skipped map must not fire onConverged")
+	require.EqualValues(t, 2, applies.Load(), "both targets entered apply (map1 once, map2 once)")
+}
+
+// TestMapStateManager_DropsTargetOnError checks that a target whose apply returns an error is
+// not retried and fires no onConverged, and that a later target is still applied and converges.
+func TestMapStateManager_DropsTargetOnError(t *testing.T) {
+	applied := make(chan struct{}, 8)
+	var failNext atomic.Bool
+	failNext.Store(true)
+
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		applied <- struct{}{}
+		if failNext.Load() {
+			return false, errors.New("boom")
+		}
+		return false, nil // converge in one pass
+	}
+	var converged atomic.Int32
+	m := newMapStateManager(apply, nil, func(time.Duration) { converged.Add(1) })
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	// first target errors -> applied once, then dropped (no retry, no onConverged)
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-applied:
+	case <-time.After(2 * time.Second):
+		t.Fatal("errored target not applied")
+	}
+	select {
+	case <-applied:
+		t.Fatal("errored target must not be retried")
+	case <-time.After(150 * time.Millisecond):
+	}
+	require.EqualValues(t, 0, converged.Load(), "onConverged must not fire on error")
+
+	// a new target is still processed normally and converges
+	failNext.Store(false)
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-applied:
+	case <-time.After(2 * time.Second):
+		t.Fatal("new target after error not applied")
+	}
+	require.Eventually(t, func() bool { return converged.Load() == 1 }, 2*time.Second, 10*time.Millisecond)
+}
+
+// TestMapStateManager_ReappliesOnNewTarget checks that an idle (converged) manager does not
+// call apply on its own and that a new SetTarget triggers a fresh apply.
+func TestMapStateManager_ReappliesOnNewTarget(t *testing.T) {
+	applied := make(chan struct{}, 8)
+	apply := func(_ *mgmProto.SyncResponse, _ bool) (bool, error) {
+		applied <- struct{}{}
+		return false, nil // converge in one pass
+	}
+	m := newMapStateManager(apply, nil, nil)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	go m.run(ctx)
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-applied:
+	case <-time.After(2 * time.Second):
+		t.Fatal("first target not applied")
+	}
+
+	// converged → must stay idle (no spurious apply)
+	select {
+	case <-applied:
+		t.Fatal("unexpected apply while idle/converged")
+	case <-time.After(150 * time.Millisecond):
+	}
+
+	require.NoError(t, m.SetTarget(&mgmProto.SyncResponse{}))
+	select {
+	case <-applied:
+	case <-time.After(2 * time.Second):
+		t.Fatal("new target not applied")
+	}
+}
--- a/client/internal/peer/handshaker.go
+++ b/client/internal/peer/handshaker.go
@@ -195,14 +195,14 @@ func (h *Handshaker) sendOffer() error {
 	}

 	offer := h.buildOfferAnswer()
-	h.log.Infof("sending offer with serial: %s", offer.SessionIDString())
+	h.log.Debugf("sending offer with serial: %s", offer.SessionIDString())

 	return h.signaler.SignalOffer(offer, h.config.Key)
 }

 func (h *Handshaker) sendAnswer() error {
 	answer := h.buildOfferAnswer()
-	h.log.Infof("sending answer with serial: %s", answer.SessionIDString())
+	h.log.Debugf("sending answer with serial: %s", answer.SessionIDString())

 	return h.signaler.SignalAnswer(answer, h.config.Key)
 }
--- a/client/internal/peer/status.go
+++ b/client/internal/peer/status.go
@@ -192,6 +192,7 @@ func (s *StatusChangeSubscription) Events() chan map[string]RouterState {
 // Pure read methods take RLock; anything that mutates state takes Lock.
 type Status struct {
 	mux                   sync.RWMutex
+	muxRelays             sync.RWMutex
 	peers                 map[string]State
 	ipToKey               map[string]string
 	changeNotify          map[string]map[string]*StatusChangeSubscription // map[peerID]map[subscriptionID]*StatusChangeSubscription
@@ -244,8 +245,8 @@ func NewRecorder(mgmAddress string) *Status {
 }

 func (d *Status) SetRelayMgr(manager *relayClient.Manager) {
-	d.mux.Lock()
-	defer d.mux.Unlock()
+	d.muxRelays.Lock()
+	defer d.muxRelays.Unlock()
 	d.relayMgr = manager
 }

@@ -906,8 +907,8 @@ func (d *Status) MarkSignalConnected() {
 }

 func (d *Status) UpdateRelayStates(relayResults []relay.ProbeResult) {
-	d.mux.Lock()
-	defer d.mux.Unlock()
+	d.muxRelays.Lock()
+	defer d.muxRelays.Unlock()
 	d.relayStates = relayResults
 }

@@ -1018,24 +1019,26 @@ func (d *Status) GetSignalState() SignalState {

 // GetRelayStates returns the stun/turn/permanent relay states
 func (d *Status) GetRelayStates() []relay.ProbeResult {
-	d.mux.RLock()
-	defer d.mux.RUnlock()
+	d.muxRelays.RLock()
 	if d.relayMgr == nil {
-		return d.relayStates
+		defer d.muxRelays.RUnlock()
+		return slices.Clone(d.relayStates)
 	}

+	relayMgr := d.relayMgr
 	// extend the list of stun, turn servers with the relay server connections
 	relayStates := slices.Clone(d.relayStates)
+	d.muxRelays.RUnlock()

-	states := d.relayMgr.RelayStates()
+	states := relayMgr.RelayStates()
 	if len(states) == 0 {
 		// no relay connection tracked yet; surface configured servers as
 		// unavailable with the real reconnect error when known
 		err := relayClient.ErrRelayClientNotConnected
-		if connErr := d.relayMgr.RelayConnectError(); connErr != nil {
+		if connErr := relayMgr.RelayConnectError(); connErr != nil {
 			err = connErr
 		}
-		for _, r := range d.relayMgr.ServerURLs() {
+		for _, r := range relayMgr.ServerURLs() {
 			relayStates = append(relayStates, relay.ProbeResult{
 				URI: r,
 				Err: err,
--- a/client/internal/profilemanager/config.go
+++ b/client/internal/profilemanager/config.go
@@ -433,7 +433,7 @@ func (config *Config) apply(input ConfigInput) (updated bool, err error) {
 		updated = true
 	}

-	if input.ServerSSHAllowed != nil && *input.ServerSSHAllowed != *config.ServerSSHAllowed {
+	if input.ServerSSHAllowed != nil && (config.ServerSSHAllowed == nil || *input.ServerSSHAllowed != *config.ServerSSHAllowed) {
 		if *input.ServerSSHAllowed {
 			log.Infof("enabling SSH server")
 		} else {
--- a/client/internal/profilemanager/config_test.go
+++ b/client/internal/profilemanager/config_test.go
@@ -242,6 +242,35 @@ func TestWireguardPortDefaultVsExplicit(t *testing.T) {
 	}
 }

+func TestUpdateConfigServerSSHAllowedNotSet(t *testing.T) {
+	// Each case starts from an empty JSON config ("{}"), which has no ServerSSHAllowed
+	// field (presumably leaving the pointer nil after load). UpdateConfig with the flag
+	// supplied must return no error and store the input value instead of panicking.
+	tests := []struct {
+		name  string
+		input *bool
+		want  bool
+	}{
+		{"enable", util.True(), true},
+		{"disable", util.False(), false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			configPath := filepath.Join(t.TempDir(), "config.json")
+			require.NoError(t, os.WriteFile(configPath, []byte("{}"), 0600))
+
+			config, err := UpdateConfig(ConfigInput{
+				ConfigPath:       configPath,
+				ServerSSHAllowed: tt.input,
+			})
+			require.NoError(t, err)
+			require.NotNil(t, config.ServerSSHAllowed, "ServerSSHAllowed should be set from input")
+			assert.Equal(t, tt.want, *config.ServerSSHAllowed)
+		})
+	}
+}
+
 func TestUpdateOldManagementURL(t *testing.T) {
 	origProber := newMgmProber
 	newMgmProber = func(_ context.Context, _ string, _ wgtypes.Key, _ bool) (mgmProber, error) {
--- a/client/internal/routemanager/dnsinterceptor/handler.go
+++ b/client/internal/routemanager/dnsinterceptor/handler.go
@@ -251,6 +251,14 @@ func (d *DnsInterceptor) ServeDNS(w dns.ResponseWriter, r *dns.Msg) {
 		r.MsgHdr.AuthenticatedData = true
 	}

+	// Advertise EDNS0 to the forwarder so it may return an Extended DNS Error
+	// describing why a lookup failed. The OPT is stripped from the reply when
+	// the original client did not request EDNS0.
+	hadEdns := r.IsEdns0() != nil
+	if !hadEdns {
+		r.SetEdns0(dns.DefaultMsgSize, false)
+	}
+
 	upstream := net.JoinHostPort(upstreamIP.String(), strconv.FormatUint(uint64(d.forwarderPort.Load()), 10))
 	ctx, cancel := context.WithTimeout(context.Background(), dnsTimeout)
 	defer cancel()
@@ -260,6 +268,13 @@ func (d *DnsInterceptor) ServeDNS(w dns.ResponseWriter, r *dns.Msg) {
 		return
 	}

+	if ede, ok := resutil.ExtractEDE(reply); ok {
+		resutil.SetMeta(w, "ede", fmt.Sprintf("%d %s", ede.InfoCode, ede.ExtraText))
+	}
+	if !hadEdns {
+		resutil.StripOPT(reply)
+	}
+
 	resutil.SetMeta(w, "peer", peerKey)

 	reply.Id = r.Id
--- a/client/internal/routemanager/manager_test.go
+++ b/client/internal/routemanager/manager_test.go
@@ -1,3 +1,5 @@
+//go:build privileged
+
 package routemanager

 import (
--- a/client/internal/routemanager/systemops/rt_tables_linux_test.go
+++ b/client/internal/routemanager/systemops/rt_tables_linux_test.go
@@ -0,0 +1,69 @@
+//go:build linux && !android
+
+package systemops
+
+import (
+	"fmt"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestEntryExists(t *testing.T) {
+	tempDir := t.TempDir()
+	tempFilePath := fmt.Sprintf("%s/rt_tables", tempDir)
+	// Fixture: three "<id> <name>" lines; the middle one uses NetbirdVPNTableID and NetbirdVPNTableName.
+	content := []string{
+		"1000 reserved",
+		fmt.Sprintf("%d %s", NetbirdVPNTableID, NetbirdVPNTableName),
+		"9999 other_table",
+	}
+	require.NoError(t, os.WriteFile(tempFilePath, []byte(strings.Join(content, "\n")), 0644))
+
+	file, err := os.Open(tempFilePath)
+	require.NoError(t, err)
+	defer func() {
+		assert.NoError(t, file.Close())
+	}()
+
+	tests := []struct {
+		name        string
+		id          int
+		shouldExist bool
+		err         error
+	}{
+		{
+			name:        "ExistsWithNetbirdPrefix",
+			id:          7120, // NOTE(review): presumably equals NetbirdVPNTableID — confirm
+			shouldExist: true,
+			err:         nil,
+		},
+		{
+			name:        "ExistsWithDifferentName",
+			id:          1000,
+			shouldExist: true,
+			err:         ErrTableIDExists,
+		},
+		{
+			name:        "DoesNotExist",
+			id:          1234,
+			shouldExist: false,
+			err:         nil,
+		},
+	}
+	// Every subtest passes the same open file handle to entryExists.
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			exists, err := entryExists(file, tc.id)
+			if tc.err != nil {
+				assert.ErrorIs(t, err, tc.err)
+			} else {
+				assert.NoError(t, err)
+			}
+			assert.Equal(t, tc.shouldExist, exists)
+		})
+	}
+}
--- a/client/internal/routemanager/systemops/systemops_bsd_privileged_test.go
+++ b/client/internal/routemanager/systemops/systemops_bsd_privileged_test.go
@@ -0,0 +1,191 @@
+//go:build (darwin || dragonfly || freebsd || netbsd || openbsd) && privileged
+
+package systemops
+
+import (
+	"fmt"
+	"net"
+	"net/netip"
+	"os/exec"
+	"regexp"
+	"runtime"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func init() {
+	testCases = append(testCases, []testCase{ // adds one case; testCases is declared outside this file
+		{
+			name:              "To more specific route without custom dialer via vpn",
+			expectedInterface: expectedVPNint,
+			dialer:            &net.Dialer{},
+			expectedPacket:    createPacketExpectation("100.64.0.1", 12345, "10.10.0.2", 53),
+		},
+	}...)
+}
+
+func TestConcurrentRoutes(t *testing.T) {
+	baseIP := netip.MustParseAddr("192.0.2.0")
+
+	var intf *net.Interface
+	var nexthop Nexthop
+
+	_, intf = setupDummyInterface(t)
+	nexthop = Nexthop{netip.Addr{}, intf}
+
+	r := New(nil, nil)
+	// Add 1024 consecutive /32 routes starting at 192.0.2.0, one goroutine per route.
+	var wg sync.WaitGroup
+	for i := 0; i < 1024; i++ {
+		wg.Add(1)
+		go func(ip netip.Addr) {
+			defer wg.Done()
+			prefix := netip.PrefixFrom(ip, 32)
+			if err := r.addToRouteTable(prefix, nexthop); err != nil {
+				t.Errorf("Failed to add route for %s: %v", prefix, err)
+			}
+		}(baseIP)
+		baseIP = baseIP.Next()
+	}
+
+	wg.Wait()
+	// Remove the same 1024 routes, again one goroutine per route.
+	baseIP = netip.MustParseAddr("192.0.2.0")
+
+	for i := 0; i < 1024; i++ {
+		wg.Add(1)
+		go func(ip netip.Addr) {
+			defer wg.Done()
+			prefix := netip.PrefixFrom(ip, 32)
+			if err := r.removeFromRouteTable(prefix, nexthop); err != nil {
+				t.Errorf("Failed to remove route for %s: %v", prefix, err)
+			}
+		}(baseIP)
+		baseIP = baseIP.Next()
+	}
+
+	wg.Wait()
+}
+
+func createAndSetupDummyInterface(t *testing.T, intf string, ipAddressCIDR string) string {
+	t.Helper()
+	// On darwin: add ipAddressCIDR as an alias on intf and remove the alias at cleanup.
+	if runtime.GOOS == "darwin" {
+		err := exec.Command("ifconfig", intf, "alias", ipAddressCIDR).Run()
+		require.NoError(t, err, "Failed to create loopback alias")
+
+		t.Cleanup(func() {
+			err := exec.Command("ifconfig", intf, ipAddressCIDR, "-alias").Run()
+			assert.NoError(t, err, "Failed to remove loopback alias")
+		})
+
+		return intf
+	}
+	// Elsewhere: add a route for the prefix with intf as nexthop and remove it at cleanup.
+	prefix, err := netip.ParsePrefix(ipAddressCIDR)
+	require.NoError(t, err, "Failed to parse prefix")
+
+	netIntf, err := net.InterfaceByName(intf)
+	require.NoError(t, err, "Failed to get interface by name")
+
+	nexthop := Nexthop{netip.Addr{}, netIntf}
+
+	r := New(nil, nil)
+	err = r.addToRouteTable(prefix, nexthop)
+	require.NoError(t, err, "Failed to add route to table")
+
+	t.Cleanup(func() {
+		err := r.removeFromRouteTable(prefix, nexthop)
+		assert.NoError(t, err, "Failed to remove route from table")
+	})
+
+	return intf
+}
+
+func addDummyRoute(t *testing.T, dstCIDR string, gw netip.Addr, _ string) {
+	t.Helper()
+	// For the default route: remember the current gateway, then delete the existing route (best effort).
+	var originalNexthop net.IP
+	if dstCIDR == "0.0.0.0/0" {
+		var err error
+		originalNexthop, err = fetchOriginalGateway()
+		if err != nil {
+			t.Logf("Failed to fetch original gateway: %v", err)
+		}
+
+		if output, err := exec.Command("route", "delete", "-net", dstCIDR).CombinedOutput(); err != nil {
+			t.Logf("Failed to delete route: %v, output: %s", err, output)
+		}
+	}
+	// Registered first, so it runs last (t.Cleanup is last-added, first-called): restore the saved gateway.
+	t.Cleanup(func() {
+		if originalNexthop != nil {
+			err := exec.Command("route", "add", "-net", dstCIDR, originalNexthop.String()).Run()
+			assert.NoError(t, err, "Failed to restore original route")
+		}
+	})
+
+	err := exec.Command("route", "add", "-net", dstCIDR, gw.String()).Run()
+	require.NoError(t, err, "Failed to add route")
+
+	t.Cleanup(func() {
+		err := exec.Command("route", "delete", "-net", dstCIDR).Run()
+		assert.NoError(t, err, "Failed to remove route")
+	})
+}
+
+func fetchOriginalGateway() (net.IP, error) {
+	output, err := exec.Command("route", "-n", "get", "default").CombinedOutput()
+	if err != nil {
+		return nil, err
+	}
+	// Capture the non-space text that follows "gateway: " in the command output.
+	matches := regexp.MustCompile(`gateway: (\S+)`).FindStringSubmatch(string(output))
+	if len(matches) == 0 {
+		return nil, fmt.Errorf("gateway not found")
+	}
+	// net.ParseIP yields nil for text that is not a valid IP; the error returned here is still nil.
+	return net.ParseIP(matches[1]), nil
+}
+
+// setupDummyInterface returns lo0 on darwin; otherwise it creates and configures a tun interface, destroyed at cleanup.
+func setupDummyInterface(t *testing.T) (netip.Addr, *net.Interface) {
+	t.Helper()
+
+	if runtime.GOOS == "darwin" {
+		return netip.AddrFrom4([4]byte{192, 168, 1, 2}), &net.Interface{Name: "lo0"}
+	}
+
+	output, err := exec.Command("ifconfig", "tun", "create").CombinedOutput()
+	require.NoError(t, err, "Failed to create tun interface: %s", string(output))
+
+	tunName := strings.TrimSpace(string(output))
+
+	output, err = exec.Command("ifconfig", tunName, "192.168.1.1", "netmask", "255.255.0.0", "192.168.1.2", "up").CombinedOutput()
+	require.NoError(t, err, "Failed to configure tun interface: %s", string(output))
+
+	intf, err := net.InterfaceByName(tunName)
+	require.NoError(t, err, "Failed to get interface by name")
+
+	t.Cleanup(func() {
+		if err := exec.Command("ifconfig", tunName, "destroy").Run(); err != nil {
+			t.Logf("Failed to destroy tun interface %s: %v", tunName, err)
+		}
+	})
+
+	return netip.AddrFrom4([4]byte{192, 168, 1, 2}), intf
+}
+
+func setupDummyInterfacesAndRoutes(t *testing.T) {
+	t.Helper()
+	// Default route via 192.168.0.1 (external interface), then 10.0.0.0/8 via 192.168.1.1 (internal interface).
+	defaultDummy := createAndSetupDummyInterface(t, expectedExternalInt, "192.168.0.1/24")
+	addDummyRoute(t, "0.0.0.0/0", netip.AddrFrom4([4]byte{192, 168, 0, 1}), defaultDummy)
+
+	otherDummy := createAndSetupDummyInterface(t, expectedInternalInt, "192.168.1.1/24")
+	addDummyRoute(t, "10.0.0.0/8", netip.AddrFrom4([4]byte{192, 168, 1, 1}), otherDummy)
+}
--- a/client/internal/routemanager/systemops/systemops_bsd_test.go
+++ b/client/internal/routemanager/systemops/systemops_bsd_test.go
@@ -3,79 +3,24 @@
 package systemops

 import (
-	"fmt"
-	"net"
-	"net/netip"
-	"os/exec"
-	"regexp"
-	"runtime"
-	"strings"
-	"sync"
 	"testing"

 	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
 	"golang.org/x/net/route"
 )

+// Interface names used by the shared routing test fixtures. Kept untagged (no
+// privileged build tag) so the non-privileged test files in this package compile.
+//
+//nolint:unused // consumed by the privileged-tagged routing tests
 var expectedVPNint = "utun100"
+
+//nolint:unused // consumed by the privileged-tagged routing tests
 var expectedExternalInt = "lo0"
+
+//nolint:unused // consumed by the privileged-tagged routing tests
 var expectedInternalInt = "lo0"

-func init() {
-	testCases = append(testCases, []testCase{
-		{
-			name:              "To more specific route without custom dialer via vpn",
-			expectedInterface: expectedVPNint,
-			dialer:            &net.Dialer{},
-			expectedPacket:    createPacketExpectation("100.64.0.1", 12345, "10.10.0.2", 53),
-		},
-	}...)
-}
-
-func TestConcurrentRoutes(t *testing.T) {
-	baseIP := netip.MustParseAddr("192.0.2.0")
-
-	var intf *net.Interface
-	var nexthop Nexthop
-
-	_, intf = setupDummyInterface(t)
-	nexthop = Nexthop{netip.Addr{}, intf}
-
-	r := New(nil, nil)
-
-	var wg sync.WaitGroup
-	for i := 0; i < 1024; i++ {
-		wg.Add(1)
-		go func(ip netip.Addr) {
-			defer wg.Done()
-			prefix := netip.PrefixFrom(ip, 32)
-			if err := r.addToRouteTable(prefix, nexthop); err != nil {
-				t.Errorf("Failed to add route for %s: %v", prefix, err)
-			}
-		}(baseIP)
-		baseIP = baseIP.Next()
-	}
-
-	wg.Wait()
-
-	baseIP = netip.MustParseAddr("192.0.2.0")
-
-	for i := 0; i < 1024; i++ {
-		wg.Add(1)
-		go func(ip netip.Addr) {
-			defer wg.Done()
-			prefix := netip.PrefixFrom(ip, 32)
-			if err := r.removeFromRouteTable(prefix, nexthop); err != nil {
-				t.Errorf("Failed to remove route for %s: %v", prefix, err)
-			}
-		}(baseIP)
-		baseIP = baseIP.Next()
-	}
-
-	wg.Wait()
-}
-
 func TestBits(t *testing.T) {
 	tests := []struct {
 		name    string
@@ -122,122 +67,3 @@ func TestBits(t *testing.T) {
 		})
 	}
 }
-
-func createAndSetupDummyInterface(t *testing.T, intf string, ipAddressCIDR string) string {
-	t.Helper()
-
-	if runtime.GOOS == "darwin" {
-		err := exec.Command("ifconfig", intf, "alias", ipAddressCIDR).Run()
-		require.NoError(t, err, "Failed to create loopback alias")
-
-		t.Cleanup(func() {
-			err := exec.Command("ifconfig", intf, ipAddressCIDR, "-alias").Run()
-			assert.NoError(t, err, "Failed to remove loopback alias")
-		})
-
-		return intf
-	}
-
-	prefix, err := netip.ParsePrefix(ipAddressCIDR)
-	require.NoError(t, err, "Failed to parse prefix")
-
-	netIntf, err := net.InterfaceByName(intf)
-	require.NoError(t, err, "Failed to get interface by name")
-
-	nexthop := Nexthop{netip.Addr{}, netIntf}
-
-	r := New(nil, nil)
-	err = r.addToRouteTable(prefix, nexthop)
-	require.NoError(t, err, "Failed to add route to table")
-
-	t.Cleanup(func() {
-		err := r.removeFromRouteTable(prefix, nexthop)
-		assert.NoError(t, err, "Failed to remove route from table")
-	})
-
-	return intf
-}
-
-func addDummyRoute(t *testing.T, dstCIDR string, gw netip.Addr, _ string) {
-	t.Helper()
-
-	var originalNexthop net.IP
-	if dstCIDR == "0.0.0.0/0" {
-		var err error
-		originalNexthop, err = fetchOriginalGateway()
-		if err != nil {
-			t.Logf("Failed to fetch original gateway: %v", err)
-		}
-
-		if output, err := exec.Command("route", "delete", "-net", dstCIDR).CombinedOutput(); err != nil {
-			t.Logf("Failed to delete route: %v, output: %s", err, output)
-		}
-	}
-
-	t.Cleanup(func() {
-		if originalNexthop != nil {
-			err := exec.Command("route", "add", "-net", dstCIDR, originalNexthop.String()).Run()
-			assert.NoError(t, err, "Failed to restore original route")
-		}
-	})
-
-	err := exec.Command("route", "add", "-net", dstCIDR, gw.String()).Run()
-	require.NoError(t, err, "Failed to add route")
-
-	t.Cleanup(func() {
-		err := exec.Command("route", "delete", "-net", dstCIDR).Run()
-		assert.NoError(t, err, "Failed to remove route")
-	})
-}
-
-func fetchOriginalGateway() (net.IP, error) {
-	output, err := exec.Command("route", "-n", "get", "default").CombinedOutput()
-	if err != nil {
-		return nil, err
-	}
-
-	matches := regexp.MustCompile(`gateway: (\S+)`).FindStringSubmatch(string(output))
-	if len(matches) == 0 {
-		return nil, fmt.Errorf("gateway not found")
-	}
-
-	return net.ParseIP(matches[1]), nil
-}
-
-// setupDummyInterface creates a dummy tun interface for FreeBSD route testing
-func setupDummyInterface(t *testing.T) (netip.Addr, *net.Interface) {
-	t.Helper()
-
-	if runtime.GOOS == "darwin" {
-		return netip.AddrFrom4([4]byte{192, 168, 1, 2}), &net.Interface{Name: "lo0"}
-	}
-
-	output, err := exec.Command("ifconfig", "tun", "create").CombinedOutput()
-	require.NoError(t, err, "Failed to create tun interface: %s", string(output))
-
-	tunName := strings.TrimSpace(string(output))
-
-	output, err = exec.Command("ifconfig", tunName, "192.168.1.1", "netmask", "255.255.0.0", "192.168.1.2", "up").CombinedOutput()
-	require.NoError(t, err, "Failed to configure tun interface: %s", string(output))
-
-	intf, err := net.InterfaceByName(tunName)
-	require.NoError(t, err, "Failed to get interface by name")
-
-	t.Cleanup(func() {
-		if err := exec.Command("ifconfig", tunName, "destroy").Run(); err != nil {
-			t.Logf("Failed to destroy tun interface %s: %v", tunName, err)
-		}
-	})
-
-	return netip.AddrFrom4([4]byte{192, 168, 1, 2}), intf
-}
-
-func setupDummyInterfacesAndRoutes(t *testing.T) {
-	t.Helper()
-
-	defaultDummy := createAndSetupDummyInterface(t, expectedExternalInt, "192.168.0.1/24")
-	addDummyRoute(t, "0.0.0.0/0", netip.AddrFrom4([4]byte{192, 168, 0, 1}), defaultDummy)
-
-	otherDummy := createAndSetupDummyInterface(t, expectedInternalInt, "192.168.1.1/24")
-	addDummyRoute(t, "10.0.0.0/8", netip.AddrFrom4([4]byte{192, 168, 1, 1}), otherDummy)
-}
--- a/client/internal/routemanager/systemops/systemops_dialer_test.go
+++ b/client/internal/routemanager/systemops/systemops_dialer_test.go
@@ -0,0 +1,17 @@
+//go:build !android && !ios
+
+package systemops
+
+import (
+	"context"
+	"net"
+)
+
+// dialer is the Dial/DialContext method pair used by the per-platform routing test cases.
+// This file carries no privileged build tag, so the non-privileged test files still compile.
+//
+//nolint:unused // consumed by the privileged-tagged routing tests
+type dialer interface {
+	Dial(network, address string) (net.Conn, error)
+	DialContext(ctx context.Context, network, address string) (net.Conn, error)
+}
--- a/client/internal/routemanager/systemops/systemops_generic_test.go
+++ b/client/internal/routemanager/systemops/systemops_generic_test.go
@@ -1,4 +1,4 @@
-//go:build !android && !ios
+//go:build !android && !ios && privileged

 package systemops

@@ -26,11 +26,6 @@ import (
 	nbnet "github.com/netbirdio/netbird/client/net"
 )

-type dialer interface {
-	Dial(network, address string) (net.Conn, error)
-	DialContext(ctx context.Context, network, address string) (net.Conn, error)
-}
-
 func TestAddVPNRoute(t *testing.T) {
 	testCases := []struct {
 		name        string
@@ -515,125 +510,3 @@ func setupTestEnv(t *testing.T) {
 	// unique route in vpn table
 	setupRouteAndCleanup(t, r, netip.MustParsePrefix("172.16.0.0/12"), intf)
 }
-
-func TestIsVpnRoute(t *testing.T) {
-	tests := []struct {
-		name           string
-		addr           string
-		vpnRoutes      []string
-		localRoutes    []string
-		expectedVpn    bool
-		expectedPrefix netip.Prefix
-	}{
-		{
-			name:           "Match in VPN routes",
-			addr:           "192.168.1.1",
-			vpnRoutes:      []string{"192.168.1.0/24"},
-			localRoutes:    []string{"10.0.0.0/8"},
-			expectedVpn:    true,
-			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
-		},
-		{
-			name:           "Match in local routes",
-			addr:           "10.1.1.1",
-			vpnRoutes:      []string{"192.168.1.0/24"},
-			localRoutes:    []string{"10.0.0.0/8"},
-			expectedVpn:    false,
-			expectedPrefix: netip.MustParsePrefix("10.0.0.0/8"),
-		},
-		{
-			name:           "No match",
-			addr:           "172.16.0.1",
-			vpnRoutes:      []string{"192.168.1.0/24"},
-			localRoutes:    []string{"10.0.0.0/8"},
-			expectedVpn:    false,
-			expectedPrefix: netip.Prefix{},
-		},
-		{
-			name:           "Default route ignored",
-			addr:           "192.168.1.1",
-			vpnRoutes:      []string{"0.0.0.0/0", "192.168.1.0/24"},
-			localRoutes:    []string{"10.0.0.0/8"},
-			expectedVpn:    true,
-			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
-		},
-		{
-			name:           "Default route matches but ignored",
-			addr:           "172.16.1.1",
-			vpnRoutes:      []string{"0.0.0.0/0", "192.168.1.0/24"},
-			localRoutes:    []string{"10.0.0.0/8"},
-			expectedVpn:    false,
-			expectedPrefix: netip.Prefix{},
-		},
-		{
-			name:           "Longest prefix match local",
-			addr:           "192.168.1.1",
-			vpnRoutes:      []string{"192.168.0.0/16"},
-			localRoutes:    []string{"192.168.1.0/24"},
-			expectedVpn:    false,
-			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
-		},
-		{
-			name:           "Longest prefix match local multiple",
-			addr:           "192.168.0.1",
-			vpnRoutes:      []string{"192.168.0.0/16", "192.168.0.0/25", "192.168.0.0/27"},
-			localRoutes:    []string{"192.168.0.0/24", "192.168.0.0/26", "192.168.0.0/28"},
-			expectedVpn:    false,
-			expectedPrefix: netip.MustParsePrefix("192.168.0.0/28"),
-		},
-		{
-			name:           "Longest prefix match vpn",
-			addr:           "192.168.1.1",
-			vpnRoutes:      []string{"192.168.1.0/24"},
-			localRoutes:    []string{"192.168.0.0/16"},
-			expectedVpn:    true,
-			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
-		},
-		{
-			name:           "Longest prefix match vpn multiple",
-			addr:           "192.168.0.1",
-			vpnRoutes:      []string{"192.168.0.0/16", "192.168.0.0/25", "192.168.0.0/27"},
-			localRoutes:    []string{"192.168.0.0/24", "192.168.0.0/26"},
-			expectedVpn:    true,
-			expectedPrefix: netip.MustParsePrefix("192.168.0.0/27"),
-		},
-		{
-			name:           "Duplicate prefix in both",
-			addr:           "192.168.1.1",
-			vpnRoutes:      []string{"192.168.1.0/24"},
-			localRoutes:    []string{"192.168.1.0/24"},
-			expectedVpn:    false,
-			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			addr, err := netip.ParseAddr(tt.addr)
-			if err != nil {
-				t.Fatalf("Failed to parse address %s: %v", tt.addr, err)
-			}
-
-			var vpnRoutes, localRoutes []netip.Prefix
-			for _, route := range tt.vpnRoutes {
-				prefix, err := netip.ParsePrefix(route)
-				if err != nil {
-					t.Fatalf("Failed to parse VPN route %s: %v", route, err)
-				}
-				vpnRoutes = append(vpnRoutes, prefix)
-			}
-
-			for _, route := range tt.localRoutes {
-				prefix, err := netip.ParsePrefix(route)
-				if err != nil {
-					t.Fatalf("Failed to parse local route %s: %v", route, err)
-				}
-				localRoutes = append(localRoutes, prefix)
-			}
-
-			isVpn, matchedPrefix := isVpnRoute(addr, vpnRoutes, localRoutes)
-			assert.Equal(t, tt.expectedVpn, isVpn, "isVpnRoute should return expectedVpn value")
-			assert.Equal(t, tt.expectedPrefix, matchedPrefix, "isVpnRoute should return expectedVpn prefix")
-		})
-	}
-}
--- a/client/internal/routemanager/systemops/systemops_isvpnroute_test.go
+++ b/client/internal/routemanager/systemops/systemops_isvpnroute_test.go
@@ -0,0 +1,132 @@
+//go:build !android && !ios
+
+package systemops
+
+import (
+	"net/netip"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestIsVpnRoute(t *testing.T) {
+	tests := []struct {
+		name           string
+		addr           string
+		vpnRoutes      []string
+		localRoutes    []string
+		expectedVpn    bool
+		expectedPrefix netip.Prefix
+	}{
+		{
+			name:           "Match in VPN routes",
+			addr:           "192.168.1.1",
+			vpnRoutes:      []string{"192.168.1.0/24"},
+			localRoutes:    []string{"10.0.0.0/8"},
+			expectedVpn:    true,
+			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
+		},
+		{
+			name:           "Match in local routes",
+			addr:           "10.1.1.1",
+			vpnRoutes:      []string{"192.168.1.0/24"},
+			localRoutes:    []string{"10.0.0.0/8"},
+			expectedVpn:    false,
+			expectedPrefix: netip.MustParsePrefix("10.0.0.0/8"),
+		},
+		{
+			name:           "No match",
+			addr:           "172.16.0.1",
+			vpnRoutes:      []string{"192.168.1.0/24"},
+			localRoutes:    []string{"10.0.0.0/8"},
+			expectedVpn:    false,
+			expectedPrefix: netip.Prefix{},
+		},
+		{
+			name:           "Default route ignored",
+			addr:           "192.168.1.1",
+			vpnRoutes:      []string{"0.0.0.0/0", "192.168.1.0/24"},
+			localRoutes:    []string{"10.0.0.0/8"},
+			expectedVpn:    true,
+			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
+		},
+		{
+			name:           "Default route matches but ignored",
+			addr:           "172.16.1.1",
+			vpnRoutes:      []string{"0.0.0.0/0", "192.168.1.0/24"},
+			localRoutes:    []string{"10.0.0.0/8"},
+			expectedVpn:    false,
+			expectedPrefix: netip.Prefix{},
+		},
+		{
+			name:           "Longest prefix match local",
+			addr:           "192.168.1.1",
+			vpnRoutes:      []string{"192.168.0.0/16"},
+			localRoutes:    []string{"192.168.1.0/24"},
+			expectedVpn:    false,
+			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
+		},
+		{
+			name:           "Longest prefix match local multiple",
+			addr:           "192.168.0.1",
+			vpnRoutes:      []string{"192.168.0.0/16", "192.168.0.0/25", "192.168.0.0/27"},
+			localRoutes:    []string{"192.168.0.0/24", "192.168.0.0/26", "192.168.0.0/28"},
+			expectedVpn:    false,
+			expectedPrefix: netip.MustParsePrefix("192.168.0.0/28"),
+		},
+		{
+			name:           "Longest prefix match vpn",
+			addr:           "192.168.1.1",
+			vpnRoutes:      []string{"192.168.1.0/24"},
+			localRoutes:    []string{"192.168.0.0/16"},
+			expectedVpn:    true,
+			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
+		},
+		{
+			name:           "Longest prefix match vpn multiple",
+			addr:           "192.168.0.1",
+			vpnRoutes:      []string{"192.168.0.0/16", "192.168.0.0/25", "192.168.0.0/27"},
+			localRoutes:    []string{"192.168.0.0/24", "192.168.0.0/26"},
+			expectedVpn:    true,
+			expectedPrefix: netip.MustParsePrefix("192.168.0.0/27"),
+		},
+		{
+			name:           "Duplicate prefix in both",
+			addr:           "192.168.1.1",
+			vpnRoutes:      []string{"192.168.1.0/24"},
+			localRoutes:    []string{"192.168.1.0/24"},
+			expectedVpn:    false,
+			expectedPrefix: netip.MustParsePrefix("192.168.1.0/24"),
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			addr, err := netip.ParseAddr(tt.addr)
+			if err != nil {
+				t.Fatalf("Failed to parse address %s: %v", tt.addr, err)
+			}
+
+			var vpnRoutes, localRoutes []netip.Prefix
+			for _, route := range tt.vpnRoutes {
+				prefix, err := netip.ParsePrefix(route)
+				if err != nil {
+					t.Fatalf("Failed to parse VPN route %s: %v", route, err)
+				}
+				vpnRoutes = append(vpnRoutes, prefix)
+			}
+
+			for _, route := range tt.localRoutes {
+				prefix, err := netip.ParsePrefix(route)
+				if err != nil {
+					t.Fatalf("Failed to parse local route %s: %v", route, err)
+				}
+				localRoutes = append(localRoutes, prefix)
+			}
+
+			isVpn, matchedPrefix := isVpnRoute(addr, vpnRoutes, localRoutes)
+			assert.Equal(t, tt.expectedVpn, isVpn, "isVpnRoute should return expectedVpn value")
+			assert.Equal(t, tt.expectedPrefix, matchedPrefix, "isVpnRoute should return expectedVpn prefix")
+		})
+	}
+}
--- a/client/internal/routemanager/systemops/systemops_linux_test.go
+++ b/client/internal/routemanager/systemops/systemops_linux_test.go
@@ -1,13 +1,10 @@
-//go:build !android
+//go:build linux && !android && privileged

 package systemops

 import (
 	"errors"
-	"fmt"
 	"net"
-	"os"
-	"strings"
 	"syscall"
 	"testing"

@@ -18,10 +15,6 @@ import (
 	"github.com/netbirdio/netbird/client/internal/routemanager/vars"
 )

-var expectedVPNint = "wgtest0"
-var expectedExternalInt = "dummyext0"
-var expectedInternalInt = "dummyint0"
-
 func init() {
 	testCases = append(testCases, []testCase{
 		{
@@ -33,62 +26,6 @@ func init() {
 	}...)
 }

-func TestEntryExists(t *testing.T) {
-	tempDir := t.TempDir()
-	tempFilePath := fmt.Sprintf("%s/rt_tables", tempDir)
-
-	content := []string{
-		"1000 reserved",
-		fmt.Sprintf("%d %s", NetbirdVPNTableID, NetbirdVPNTableName),
-		"9999 other_table",
-	}
-	require.NoError(t, os.WriteFile(tempFilePath, []byte(strings.Join(content, "\n")), 0644))
-
-	file, err := os.Open(tempFilePath)
-	require.NoError(t, err)
-	defer func() {
-		assert.NoError(t, file.Close())
-	}()
-
-	tests := []struct {
-		name        string
-		id          int
-		shouldExist bool
-		err         error
-	}{
-		{
-			name:        "ExistsWithNetbirdPrefix",
-			id:          7120,
-			shouldExist: true,
-			err:         nil,
-		},
-		{
-			name:        "ExistsWithDifferentName",
-			id:          1000,
-			shouldExist: true,
-			err:         ErrTableIDExists,
-		},
-		{
-			name:        "DoesNotExist",
-			id:          1234,
-			shouldExist: false,
-			err:         nil,
-		},
-	}
-
-	for _, tc := range tests {
-		t.Run(tc.name, func(t *testing.T) {
-			exists, err := entryExists(file, tc.id)
-			if tc.err != nil {
-				assert.ErrorIs(t, err, tc.err)
-			} else {
-				assert.NoError(t, err)
-			}
-			assert.Equal(t, tc.shouldExist, exists)
-		})
-	}
-}
-
 func createAndSetupDummyInterface(t *testing.T, interfaceName, ipAddressCIDR string) string {
 	t.Helper()

--- a/client/internal/routemanager/systemops/systemops_routing_data_linux_test.go
+++ b/client/internal/routemanager/systemops/systemops_routing_data_linux_test.go
@@ -0,0 +1,15 @@
+//go:build linux && !android
+
+package systemops
+
+// Interface names used by the shared routing test fixtures. Kept untagged (no
+// privileged build tag) so the non-privileged test files in this package compile.
+//
+//nolint:unused // consumed by the privileged-tagged routing tests
+var expectedVPNint = "wgtest0"
+
+//nolint:unused // consumed by the privileged-tagged routing tests
+var expectedExternalInt = "dummyext0"
+
+//nolint:unused // consumed by the privileged-tagged routing tests
+var expectedInternalInt = "dummyint0"
--- a/client/internal/routemanager/systemops/systemops_routing_data_test.go
+++ b/client/internal/routemanager/systemops/systemops_routing_data_test.go
@@ -0,0 +1,83 @@
+//go:build (linux && !android) || (darwin && !ios) || freebsd || openbsd || netbsd || dragonfly
+
+package systemops
+
+import (
+	"net"
+
+	nbnet "github.com/netbirdio/netbird/client/net"
+)
+
+// Shared, non-privileged routing test fixtures. The privileged TestRouting (and its
+// per-platform init() appenders) consume these; they live here so the unprivileged
+// BSD/darwin test files compile without the privileged build tag.
+
+type PacketExpectation struct {
+	SrcIP   net.IP
+	DstIP   net.IP
+	SrcPort int
+	DstPort int
+	UDP     bool
+	TCP     bool
+}
+
+//nolint:unused // consumed by the privileged-tagged routing tests
+type testCase struct {
+	name              string
+	expectedInterface string
+	dialer            dialer
+	expectedPacket    PacketExpectation
+}
+
+//nolint:unused // consumed by the privileged-tagged routing tests
+var testCases = []testCase{
+	{
+		name:              "To external host without custom dialer via vpn",
+		expectedInterface: expectedVPNint,
+		dialer:            &net.Dialer{},
+		expectedPacket:    createPacketExpectation("100.64.0.1", 12345, "192.0.2.1", 53),
+	},
+	{
+		name:              "To external host with custom dialer via physical interface",
+		expectedInterface: expectedExternalInt,
+		dialer:            nbnet.NewDialer(),
+		expectedPacket:    createPacketExpectation("192.168.0.1", 12345, "192.0.2.1", 53),
+	},
+
+	{
+		name:              "To duplicate internal route with custom dialer via physical interface",
+		expectedInterface: expectedInternalInt,
+		dialer:            nbnet.NewDialer(),
+		expectedPacket:    createPacketExpectation("192.168.1.1", 12345, "10.0.0.2", 53),
+	},
+	{
+		name:              "To duplicate internal route without custom dialer via physical interface", // local route takes precedence
+		expectedInterface: expectedInternalInt,
+		dialer:            &net.Dialer{},
+		expectedPacket:    createPacketExpectation("192.168.1.1", 12345, "10.0.0.2", 53),
+	},
+
+	{
+		name:              "To unique vpn route with custom dialer via physical interface",
+		expectedInterface: expectedExternalInt,
+		dialer:            nbnet.NewDialer(),
+		expectedPacket:    createPacketExpectation("192.168.0.1", 12345, "172.16.0.2", 53),
+	},
+	{
+		name:              "To unique vpn route without custom dialer via vpn",
+		expectedInterface: expectedVPNint,
+		dialer:            &net.Dialer{},
+		expectedPacket:    createPacketExpectation("100.64.0.1", 12345, "172.16.0.2", 53),
+	},
+}
+
+//nolint:unused // consumed by the privileged-tagged routing tests
+func createPacketExpectation(srcIP string, srcPort int, dstIP string, dstPort int) PacketExpectation {
+	return PacketExpectation{
+		SrcIP:   net.ParseIP(srcIP),
+		DstIP:   net.ParseIP(dstIP),
+		SrcPort: srcPort,
+		DstPort: dstPort,
+		UDP:     true,
+	}
+}
--- a/client/internal/routemanager/systemops/systemops_unix_test.go
+++ b/client/internal/routemanager/systemops/systemops_unix_test.go
@@ -1,4 +1,4 @@
-//go:build (linux && !android) || (darwin && !ios) || freebsd || openbsd || netbsd || dragonfly
+//go:build ((linux && !android) || (darwin && !ios) || freebsd || openbsd || netbsd || dragonfly) && privileged

 package systemops

@@ -20,63 +20,6 @@ import (
 	nbnet "github.com/netbirdio/netbird/client/net"
 )

-type PacketExpectation struct {
-	SrcIP   net.IP
-	DstIP   net.IP
-	SrcPort int
-	DstPort int
-	UDP     bool
-	TCP     bool
-}
-
-type testCase struct {
-	name              string
-	expectedInterface string
-	dialer            dialer
-	expectedPacket    PacketExpectation
-}
-
-var testCases = []testCase{
-	{
-		name:              "To external host without custom dialer via vpn",
-		expectedInterface: expectedVPNint,
-		dialer:            &net.Dialer{},
-		expectedPacket:    createPacketExpectation("100.64.0.1", 12345, "192.0.2.1", 53),
-	},
-	{
-		name:              "To external host with custom dialer via physical interface",
-		expectedInterface: expectedExternalInt,
-		dialer:            nbnet.NewDialer(),
-		expectedPacket:    createPacketExpectation("192.168.0.1", 12345, "192.0.2.1", 53),
-	},
-
-	{
-		name:              "To duplicate internal route with custom dialer via physical interface",
-		expectedInterface: expectedInternalInt,
-		dialer:            nbnet.NewDialer(),
-		expectedPacket:    createPacketExpectation("192.168.1.1", 12345, "10.0.0.2", 53),
-	},
-	{
-		name:              "To duplicate internal route without custom dialer via physical interface", // local route takes precedence
-		expectedInterface: expectedInternalInt,
-		dialer:            &net.Dialer{},
-		expectedPacket:    createPacketExpectation("192.168.1.1", 12345, "10.0.0.2", 53),
-	},
-
-	{
-		name:              "To unique vpn route with custom dialer via physical interface",
-		expectedInterface: expectedExternalInt,
-		dialer:            nbnet.NewDialer(),
-		expectedPacket:    createPacketExpectation("192.168.0.1", 12345, "172.16.0.2", 53),
-	},
-	{
-		name:              "To unique vpn route without custom dialer via vpn",
-		expectedInterface: expectedVPNint,
-		dialer:            &net.Dialer{},
-		expectedPacket:    createPacketExpectation("100.64.0.1", 12345, "172.16.0.2", 53),
-	},
-}
-
 func TestRouting(t *testing.T) {
 	nbnet.Init()
 	for _, tc := range testCases {
@@ -102,16 +45,6 @@ func TestRouting(t *testing.T) {
 	}
 }

-func createPacketExpectation(srcIP string, srcPort int, dstIP string, dstPort int) PacketExpectation {
-	return PacketExpectation{
-		SrcIP:   net.ParseIP(srcIP),
-		DstIP:   net.ParseIP(dstIP),
-		SrcPort: srcPort,
-		DstPort: dstPort,
-		UDP:     true,
-	}
-}
-
 func startPacketCapture(t *testing.T, intf, filter string) *pcap.Handle {
 	t.Helper()

--- a/client/internal/routemanager/systemops/systemops_windows_test.go
+++ b/client/internal/routemanager/systemops/systemops_windows_test.go
@@ -1,3 +1,5 @@
+//go:build windows && privileged
+
 package systemops

 import (
--- a/client/internal/routemanager/systemops/v6route_bsd_test.go
+++ b/client/internal/routemanager/systemops/v6route_bsd_test.go
@@ -11,6 +11,8 @@ import (
 // ensureIPv6DefaultRoute installs an IPv6 default route via the loopback
 // interface so route lookups for global IPv6 prefixes resolve in environments
 // without v6 connectivity. If a default already exists it is left alone.
+//
+//nolint:unused // consumed by the privileged-tagged routing tests
 func ensureIPv6DefaultRoute(t *testing.T) {
 	t.Helper()

--- a/client/internal/routemanager/systemops/v6route_linux_test.go
+++ b/client/internal/routemanager/systemops/v6route_linux_test.go
@@ -1,4 +1,4 @@
-//go:build linux && !android
+//go:build linux && !android && privileged

 package systemops

--- a/client/internal/routemanager/systemops/v6route_windows_test.go
+++ b/client/internal/routemanager/systemops/v6route_windows_test.go
@@ -8,11 +8,14 @@ import (
 	"testing"
 )

+//nolint:unused // consumed by the privileged-tagged routing tests
 const loopbackIfaceWindows = "Loopback Pseudo-Interface 1"

 // ensureIPv6DefaultRoute installs an IPv6 default route via the loopback
 // interface so route lookups for global IPv6 prefixes resolve in environments
 // without v6 connectivity. If a default already exists it is left alone.
+//
+//nolint:unused // consumed by the privileged-tagged routing tests
 func ensureIPv6DefaultRoute(t *testing.T) {
 	t.Helper()

--- a/client/ios/NetBirdSDK/client.go
+++ b/client/ios/NetBirdSDK/client.go
@@ -171,13 +171,13 @@ func (c *Client) Run(fd int32, interfaceName string, envList *EnvList) error {
 	c.onHostDnsFn = func([]string) {}
 	cfg.WgIface = interfaceName

-	connectClient := internal.NewConnectClient(ctx, c.recorder)
+	connectClient := internal.NewConnectClient(ctx, cfg, c.recorder)
 	c.setState(cfg, connectClient)
 	// Persist the latest sync response so DebugBundle can include the network
 	// map. On iOS this is backed by disk to keep it out of the constrained
 	// process memory (see the syncstore package).
 	connectClient.SetSyncResponsePersistence(true)
-	return connectClient.RunOniOS(cfg, fd, c.networkChangeListener, c.dnsManager, c.stateFile, c.cacheDir, c.logFilePath)
+	return connectClient.RunOniOS(fd, c.networkChangeListener, c.dnsManager, c.stateFile, c.cacheDir, c.logFilePath)
 }

 // Stop the internal client and free the resources
--- a/client/ios/NetBirdSDK/login.go
+++ b/client/ios/NetBirdSDK/login.go
@@ -36,6 +36,7 @@ type URLOpener interface {
 // Auth can register or login new client
 type Auth struct {
 	ctx     context.Context
+	cancel  context.CancelFunc
 	config  *profilemanager.Config
 	cfgPath string
 }
@@ -51,8 +52,19 @@ func NewAuth(cfgPath string, mgmURL string) (*Auth, error) {
 		return nil, err
 	}

+	// Use a cancellable context so Stop() can abort an in-progress interactive
+	// login. The PKCE flow's WaitToken blocks (and keeps its loopback HTTP server
+	// bound to a port) until the OAuth callback arrives or the flow expires;
+	// cancelling the context unblocks WaitToken, which then shuts that server down
+	// and frees the port for the next login attempt. iOS runs login in the main-app
+	// process (decoupled from the network extension), so without this the server
+	// lingers after the user dismisses the browser and the next connect stalls
+	// trying to bind the same port.
+	ctx, cancel := context.WithCancel(context.Background())
+
 	return &Auth{
-		ctx:     context.Background(),
+		ctx:     ctx,
+		cancel:  cancel,
 		config:  cfg,
 		cfgPath: cfgPath,
 	}, nil
@@ -60,12 +72,24 @@ func NewAuth(cfgPath string, mgmURL string) (*Auth, error) {

 // NewAuthWithConfig instantiate Auth based on existing config
 func NewAuthWithConfig(ctx context.Context, config *profilemanager.Config) *Auth {
+	ctx, cancel := context.WithCancel(ctx)
 	return &Auth{
 		ctx:    ctx,
+		cancel: cancel,
 		config: config,
 	}
 }

+// Stop aborts an in-progress interactive login started via Login/LoginWithDeviceName.
+// It cancels the auth context, which unblocks the PKCE WaitToken and shuts down its
+// loopback HTTP server, freeing the redirect port. Safe to call multiple times and
+// safe to call when no login is running.
+func (a *Auth) Stop() {
+	if a.cancel != nil {
+		a.cancel()
+	}
+}
+
 // SaveConfigIfSSOSupported test the connectivity with the management server by retrieving the server device flow info.
 // If it returns a flow info than save the configuration and return true. If it gets a codes.NotFound, it means that SSO
 // is not supported and returns false without saving the configuration. For other errors return false.
--- a/client/server/capture.go
+++ b/client/server/capture.go
@@ -344,6 +344,9 @@ func (s *Server) clearCaptureIfOwner(sess *capture.Session, engine *internal.Eng
 }

 func (s *Server) getCaptureEngineLocked() (*internal.Engine, error) {
+	if s.connectClient == nil {
+		return nil, status.Error(codes.FailedPrecondition, "client not connected")
+	}
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, status.Error(codes.FailedPrecondition, "engine not initialized")
--- a/client/server/debug.go
+++ b/client/server/debug.go
@@ -5,6 +5,7 @@ package server
 import (
 	"bytes"
 	"context"
+	"errors"
 	"fmt"
 	"runtime/pprof"

@@ -27,9 +28,11 @@ func (s *Server) DebugBundle(_ context.Context, req *proto.DebugBundleRequest) (
 	}

 	var clientMetrics debug.MetricsExporter
-	if engine := s.connectClient.Engine(); engine != nil {
-		if cm := engine.GetClientMetrics(); cm != nil {
-			clientMetrics = cm
+	if s.connectClient != nil {
+		if engine := s.connectClient.Engine(); engine != nil {
+			if cm := engine.GetClientMetrics(); cm != nil {
+				clientMetrics = cm
+			}
 		}
 	}

@@ -45,10 +48,13 @@ func (s *Server) DebugBundle(_ context.Context, req *proto.DebugBundleRequest) (
 	defer s.cleanupBundleCapture()

 	var refreshStatus func()
-	if engine := s.connectClient.Engine(); engine != nil {
-		refreshStatus = func() {
-			log.Debug("refreshing system health status for debug bundle")
-			engine.RunHealthProbes(true)
+	if s.connectClient != nil {
+		engine := s.connectClient.Engine()
+		if engine != nil {
+			refreshStatus = func() {
+				log.Debug("refreshing system health status for debug bundle")
+				engine.RunHealthProbes(true)
+			}
 		}
 	}

@@ -112,7 +118,9 @@ func (s *Server) SetLogLevel(_ context.Context, req *proto.SetLogLevelRequest) (

 	log.SetLevel(level)

-	s.connectClient.SetLogLevel(level)
+	if s.connectClient != nil {
+		s.connectClient.SetLogLevel(level)
+	}

 	log.Infof("Log level set to %s", level.String())

@@ -126,13 +134,20 @@ func (s *Server) SetSyncResponsePersistence(_ context.Context, req *proto.SetSyn

 	enabled := req.GetEnabled()
 	s.persistSyncResponse = enabled
-	s.connectClient.SetSyncResponsePersistence(enabled)
+	if s.connectClient != nil {
+		s.connectClient.SetSyncResponsePersistence(enabled)
+	}

 	return &proto.SetSyncResponsePersistenceResponse{}, nil
 }

 func (s *Server) getLatestSyncResponse() (*mgmProto.SyncResponse, error) {
-	return s.connectClient.GetLatestSyncResponse()
+	cClient := s.connectClient
+	if cClient == nil {
+		return nil, errors.New("connect client is not initialized")
+	}
+
+	return cClient.GetLatestSyncResponse()
 }

 // StartCPUProfile starts CPU profiling in the daemon.
--- a/client/server/mdm.go
+++ b/client/server/mdm.go
@@ -3,6 +3,7 @@ package server
 import (
 	"context"
 	"fmt"
+	"time"

 	log "github.com/sirupsen/logrus"
 	"google.golang.org/grpc/codes"
@@ -38,11 +39,12 @@ type conflictCheck struct {
 // OS-native managed-config store reports a diff vs the last observation.
 //
 // Restart sequence:
-//  1. Stop the in-flight run via the supervisor (blocks until fully torn down).
-//  2. Re-resolve Config from disk + MDM policy (Config.apply re-runs
+//  1. Cancel the active engine context (terminates connectWithRetryRuns).
+//  2. Wait briefly for that goroutine to exit (giveUpChan is closed on exit).
+//  3. Re-resolve Config from disk + MDM policy (Config.apply re-runs
 //     applyMDMPolicy with the freshly loaded Policy).
-//  3. Start a fresh run with the new config.
-//  4. Broadcast a SystemEvent so any GUI / CLI subscriber (SubscribeEvents
+//  4. Spawn a fresh connectWithRetryRuns with the new context and config.
+//  5. Broadcast a SystemEvent so any GUI / CLI subscriber (SubscribeEvents
 //     RPC) can refresh its cached config view without polling.
 //
 // The callback runs in the ticker's own goroutine. Ticker has already
@@ -50,24 +52,39 @@ type conflictCheck struct {
 func (s *Server) onMDMPolicyChange(_, _ *mdm.Policy) error {
 	log.Warn("MDM policy changed; restarting engine to apply new configuration")

-	// Hold s.mutex for the entire restart sequence (stop + re-start). Any
-	// concurrent Up/Down/Status arriving while MDM is restarting blocks on the
-	// Lock until we are done — they then observe the post-restart state coherently.
+	// Hold s.mutex for the entire restart sequence (cancel + quiescence
+	// wait + re-spawn). Any concurrent Up/Down/Status arriving while
+	// MDM is restarting blocks on the Lock until we are done — they
+	// then observe the post-restart state coherently. This is safe
+	// because the connectWithRetryRuns goroutine no longer acquires
+	// s.mutex in its defer (intent vs. goroutine-alive concerns are
+	// fully separated; see the connectionGoroutineRunning helper).
 	s.mutex.Lock()
 	defer s.mutex.Unlock()

-	if !s.connectClient.ConnectionRunning() {
-		// No run in flight, so there's no engine to restart.
+	if !s.clientRunning {
+		// The client is not running, so there's no engine to restart.
 		return nil
 	}
-
-	// Cancel daemon-side login/status activities tied to the old run; the run
-	// itself is torn down atomically by the supervisor inside Restart (see
-	// restartEngineForMDMLocked), which stops and re-starts in one operation.
 	if s.actCancel != nil {
 		s.actCancel()
 	}

+	// Wait for previous connectWithRetryRuns to exit so we don't end up
+	// with two goroutines fighting over the same status recorder + engine.
+	// The teardown engages a fan-out of engine goroutines (peer workers,
+	// signal handler, route manager, ...). close(clientGiveUpChan)
+	// happens in the function-scope defer of connectWithRetryRuns, on
+	// every exit path (ctx cancel, backoff exhausted, panic) — see the
+	// defer in server.go.
+	if s.clientGiveUpChan != nil {
+		select {
+		case <-s.clientGiveUpChan:
+		case <-time.After(10 * time.Second):
+			return fmt.Errorf("failed to restart the engine due to timeout")
+		}
+	}
+
 	if err := s.restartEngineForMDMLocked(); err != nil {
 		log.Errorf("MDM restart failed: %v", err)
 		return err
@@ -114,13 +131,14 @@ func (s *Server) publishConfigChangedEvent(source string) {
 }

 // restartEngineForMDMLocked re-resolves the active profile config
-// (re-running applyMDMPolicy via Config.apply) and starts a fresh run.
-// Mirrors the tail of Server.Start so a runtime MDM change behaves
-// identically to a fresh boot under the new policy.
+// (re-running applyMDMPolicy via Config.apply) and re-spawns
+// connectWithRetryRuns. Mirrors the tail of Server.Start so a runtime
+// MDM change behaves identically to a fresh boot under the new policy.
 //
 // MUST be called with s.mutex held — onMDMPolicyChange holds the lock
-// for the entire restart sequence so concurrent Up/Down/Status RPCs
-// observe a coherent post-restart state.
+// for the entire restart sequence (cancel + quiescence wait + re-spawn)
+// so concurrent Up/Down/Status RPCs observe a coherent post-restart
+// state.
 func (s *Server) restartEngineForMDMLocked() error {
 	activeProf, err := s.profileManager.GetActiveProfileState()
 	if err != nil {
@@ -136,13 +154,13 @@ func (s *Server) restartEngineForMDMLocked() error {
 	s.statusRecorder.UpdateRosenpass(config.RosenpassEnabled, config.RosenpassPermissive)
 	s.statusRecorder.UpdateLazyConnection(config.LazyConnectionEnabled)

-	_, cancel := context.WithCancel(s.rootCtx)
+	ctx, cancel := context.WithCancel(s.rootCtx)
 	s.actCancel = cancel
-	log.Info("MDM restart: atomically restarting the run with re-resolved config")
-	// MDM restart has no incoming RPC metadata; fire and forget. Restart is a
-	// single supervisor op (atomic stop+start), so there is no observable
-	// "stopped" window between tearing down the old run and starting the new.
-	s.connectClient.Restart(config, nil)
+	s.clientRunning = true
+	s.clientRunningChan = make(chan struct{})
+	s.clientGiveUpChan = make(chan struct{})
+	log.Info("MDM restart: spawning connectWithRetryRuns with re-resolved config")
+	go s.connectWithRetryRuns(ctx, config, s.statusRecorder, s.clientRunningChan, s.clientGiveUpChan)
 	s.publishConfigChangedEvent("mdm")
 	return nil
 }
--- a/client/server/network.go
+++ b/client/server/network.go
@@ -34,6 +34,10 @@ func (s *Server) ListNetworks(context.Context, *proto.ListNetworksRequest) (*pro
 		return nil, gstatus.Errorf(codes.Unavailable, errNetworksDisabled)
 	}

+	if s.connectClient == nil {
+		return nil, fmt.Errorf("not connected")
+	}
+
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, fmt.Errorf("not connected")
@@ -143,6 +147,10 @@ func (s *Server) SelectNetworks(_ context.Context, req *proto.SelectNetworksRequ
 		return nil, gstatus.Errorf(codes.Unavailable, errNetworksDisabled)
 	}

+	if s.connectClient == nil {
+		return nil, fmt.Errorf("not connected")
+	}
+
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, fmt.Errorf("not connected")
@@ -191,6 +199,10 @@ func (s *Server) DeselectNetworks(_ context.Context, req *proto.SelectNetworksRe
 		return nil, gstatus.Errorf(codes.Unavailable, errNetworksDisabled)
 	}

+	if s.connectClient == nil {
+		return nil, fmt.Errorf("not connected")
+	}
+
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, fmt.Errorf("not connected")
--- a/client/server/server.go
+++ b/client/server/server.go
@@ -8,10 +8,12 @@ import (
 	"os"
 	"os/exec"
 	"runtime"
+	"strconv"
 	"sync"
 	"sync/atomic"
 	"time"

+	"github.com/cenkalti/backoff/v4"
 	log "github.com/sirupsen/logrus"
 	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"
 	"google.golang.org/grpc/codes"
@@ -37,7 +39,15 @@ import (
 )

 const (
-	probeThreshold = time.Second * 5
+	probeThreshold          = time.Second * 5
+	retryInitialIntervalVar = "NB_CONN_RETRY_INTERVAL_TIME"
+	maxRetryIntervalVar     = "NB_CONN_MAX_RETRY_INTERVAL_TIME"
+	maxRetryTimeVar         = "NB_CONN_MAX_RETRY_TIME_TIME"
+	retryMultiplierVar      = "NB_CONN_RETRY_MULTIPLIER"
+	defaultInitialRetryTime = 30 * time.Minute
+	defaultMaxRetryInterval = 60 * time.Minute
+	defaultMaxRetryTime     = 14 * 24 * time.Hour
+	defaultRetryMultiplier  = 1.7

 	// JWT token cache TTL for the client daemon (disabled by default)
 	defaultJWTCacheTTL = 0
@@ -62,8 +72,15 @@ type Server struct {
 	mutex  sync.Mutex
 	config *profilemanager.Config
 	proto.UnimplementedDaemonServiceServer
-	// Run state (in-flight? established/done channels?) is owned entirely by the
-	// supervisor inside connectClient — the daemon keeps no per-run fields.
+	// clientRunning tracks "the daemon wants to be connected" — set true by
+	// Start / Up, cleared by Down / Logout. It persists across retry loops,
+	// signal disconnects, and ErrResetConnection cycles, and is NOT changed
+	// when the connectWithRetryRuns goroutine exits. To check whether that
+	// goroutine is still alive, see connectionGoroutineRunning(), which derives
+	// its answer from clientGiveUpChan's close state. Protected by s.mutex.
+	clientRunning     bool
+	clientRunningChan chan struct{}
+	clientGiveUpChan  chan struct{} // closed when connectWithRetryRuns goroutine exits

 	connectClient *internal.ConnectClient

@@ -119,13 +136,6 @@ func New(ctx context.Context, logFile string, configFile string, profilesDisable
 		networksDisabled:       networksDisabled,
 		jwtCache:               newJWTCache(),
 	}
-	// The ConnectClient is daemon-lifetime: build it exactly once, here. Its
-	// supervisor lives as long as the daemon; Up/Down/MDM and reconnects all
-	// drive this same instance. updateManager isn't ready yet (created in
-	// Start) and is injected there via SetUpdateManager.
-	s.connectClient = internal.NewConnectClient(ctx, s.statusRecorder)
-	s.connectClient.SetSyncResponsePersistence(s.persistSyncResponse)
-
 	agent := &serverAgent{s}
 	s.sleepHandler = sleephandler.New(agent)
 	s.startSleepDetector()
@@ -137,7 +147,7 @@ func (s *Server) Start() error {
 	s.mutex.Lock()
 	defer s.mutex.Unlock()

-	if s.connectClient.ConnectionRunning() {
+	if s.clientRunning {
 		return nil
 	}

@@ -155,7 +165,6 @@ func (s *Server) Start() error {
 		stateMgr := statemanager.New(s.profileManager.GetStatePath())
 		s.updateManager = updater.NewManager(s.statusRecorder, stateMgr)
 		s.updateManager.CheckUpdateSuccess(s.rootCtx)
-		s.connectClient.SetUpdateManager(s.updateManager)
 	}

 	// MDM policy reload ticker: every minute the desktop daemon re-reads
@@ -181,9 +190,7 @@ func (s *Server) Start() error {
 		return nil
 	}

-	// actCancel cancels in-flight foreground operations (login/status); the run
-	// itself is owned by the supervisor and stopped via Stop, not this cancel.
-	_, cancel := context.WithCancel(s.rootCtx)
+	ctx, cancel := context.WithCancel(s.rootCtx)
 	s.actCancel = cancel

 	// copy old default config
@@ -225,14 +232,99 @@ func (s *Server) Start() error {
 		return nil
 	}

-	// Boot autoconnect: no incoming RPC metadata. The supervisor runs the
-	// client and reconnects internally; we just fire and forget (the run owns
-	// its established/done channels).
-	s.connectClient.RunAsync(config, nil)
+	s.clientRunning = true
+	s.clientRunningChan = make(chan struct{})
+	s.clientGiveUpChan = make(chan struct{})
+	go s.connectWithRetryRuns(ctx, config, s.statusRecorder, s.clientRunningChan, s.clientGiveUpChan)
 	s.publishConfigChangedEvent("startup")
 	return nil
 }

+// connectWithRetryRuns runs the client connection with a backoff strategy, retrying the operation as an additional
+// mechanism to keep the client connected even when the connection is lost.
+// Retrying is cancelled if the client receives a stop or down command; if disable auto connect is configured, a single attempt is made without retrying.
+//
+// The goroutine's exit is signalled to the daemon via close(giveUpChan)
+// — placed in the function-scope defer so every return path (panic,
+// DisableAutoConnect early-exit, backoff exhausted, ctx cancel) closes
+// it. Callers that need to observe "is the goroutine still alive?" use
+// Server.connectionGoroutineRunning() which non-blockingly checks the close state
+// of clientGiveUpChan. The defer does NOT touch s.mutex; the daemon's
+// "intent" (clientRunning) is maintained by the RPC handlers, not by this
+// goroutine.
+func (s *Server) connectWithRetryRuns(ctx context.Context, profileConfig *profilemanager.Config, statusRecorder *peer.Status, runningChan chan struct{}, giveUpChan chan struct{}) {
+	defer func() {
+		if giveUpChan != nil {
+			close(giveUpChan)
+		}
+	}()
+
+	if s.config.DisableAutoConnect {
+		if err := s.connect(ctx, s.config, s.statusRecorder, runningChan); err != nil {
+			log.Debugf("run client connection exited with error: %v", err)
+		}
+		log.Tracef("client connection exited")
+		return
+	}
+
+	backOff := getConnectWithBackoff(ctx)
+	go func() {
+		t := time.NewTicker(24 * time.Hour)
+		for {
+			select {
+			case <-ctx.Done():
+				t.Stop()
+				return
+			case <-t.C:
+				mgmtState := statusRecorder.GetManagementState()
+				signalState := statusRecorder.GetSignalState()
+				if mgmtState.Connected && signalState.Connected {
+					log.Tracef("resetting status")
+					backOff.Reset()
+				} else {
+					log.Tracef("not resetting status: mgmt: %v, signal: %v", mgmtState.Connected, signalState.Connected)
+				}
+			}
+		}
+	}()
+
+	runOperation := func() error {
+		err := s.connect(ctx, profileConfig, statusRecorder, runningChan)
+		if err != nil {
+			log.Debugf("run client connection exited with error: %v. Will retry in the background", err)
+			return err
+		}
+
+		log.Tracef("client connection exited gracefully, do not need to retry")
+		return nil
+	}
+
+	if err := backoff.Retry(runOperation, backOff); err != nil {
+		log.Errorf("operation failed: %v", err)
+	}
+	// giveUpChan is closed by the function-scope defer.
+}
+
+// connectionGoroutineRunning reports whether the connectWithRetryRuns goroutine is
+// still running. Returns false when no goroutine has ever been started,
+// or when the most recent one has already closed clientGiveUpChan on
+// exit (whether due to ctx cancel, DisableAutoConnect single-shot
+// completion, or backoff retry exhaustion).
+//
+// MUST be called with s.mutex held — accesses s.clientGiveUpChan which
+// is written by Start/Up under the same lock.
+func (s *Server) connectionGoroutineRunning() bool {
+	if s.clientGiveUpChan == nil {
+		return false
+	}
+	select {
+	case <-s.clientGiveUpChan:
+		return false
+	default:
+		return true
+	}
+}
+
 // loginAttempt attempts to login using the provided information. it returns a status in case something fails
 func (s *Server) loginAttempt(ctx context.Context, setupKey, jwtToken string) (internal.StatusType, error) {
 	authClient, err := auth.NewAuth(ctx, s.config.PrivateKey, s.config.ManagementURL, s.config)
@@ -628,22 +720,13 @@ func (s *Server) WaitSSOLogin(callerCtx context.Context, msg *proto.WaitSSOLogin
 // Up starts engine work in the daemon.
 func (s *Server) Up(callerCtx context.Context, msg *proto.UpRequest) (*proto.UpResponse, error) {
 	s.mutex.Lock()
-
-	// The client (and its supervisor) is built once in New(), so a nil here
-	// never happens in production — Up is only reachable after New() has run and
-	// the gRPC server is serving. The real case this guards is the daemon
-	// SHUTTING DOWN: rootCtx is cancelled, the supervisor is no longer accepting
-	// commands, so ServiceRunning() is false even though the client exists. Bail
-	// loud instead of enqueuing a run that will never start. (nil only happens in
-	// tests that build a Server without New(); ServiceRunning is nil-safe.)
-	if !s.connectClient.ServiceRunning() {
-		s.mutex.Unlock()
-		return nil, fmt.Errorf("service is not running, start the netbird service for 'up' to take effect")
-	}
-
-	// If a connection run is already in flight, the existing engine is on the
-	// job — just wait for it. Otherwise fall through to start a fresh run.
-	if s.connectClient.ConnectionRunning() {
+	// clientRunning is the daemon-intent flag (set by previous Up/Start, cleared
+	// by Down). connectionGoroutineRunning() reports whether the previous retry-loop
+	// goroutine is still trying. When intent is up AND goroutine is alive,
+	// the existing engine is on the job — just wait for it. When intent
+	// is up but the goroutine has given up (backoff exhausted) OR when
+	// intent is down, fall through to spawn a fresh retry loop.
+	if s.clientRunning && s.connectionGoroutineRunning() {
 		state := internal.CtxGetState(s.rootCtx)
 		status, err := state.Status()
 		if err != nil {
@@ -681,13 +764,13 @@ func (s *Server) Up(callerCtx context.Context, msg *proto.UpRequest) (*proto.UpR
 	if s.actCancel != nil {
 		s.actCancel()
 	}
-	// actCancel cancels in-flight foreground ops (login/status); the run is
-	// owned by the supervisor and stopped via Stop, not this cancel.
-	_, cancel := context.WithCancel(s.rootCtx)
-	s.actCancel = cancel
+	ctx, cancel := context.WithCancel(s.rootCtx)
+	md, ok := metadata.FromIncomingContext(callerCtx)
+	if ok {
+		ctx = metadata.NewOutgoingContext(ctx, md)
+	}

-	// Forward the caller's gRPC metadata (e.g. UI user-agent) into the run.
-	md, _ := metadata.FromIncomingContext(callerCtx)
+	s.actCancel = cancel

 	if s.config == nil {
 		s.mutex.Unlock()
@@ -729,26 +812,35 @@ func (s *Server) Up(callerCtx context.Context, msg *proto.UpRequest) (*proto.UpR
 	s.statusRecorder.UpdateManagementAddress(s.config.ManagementURL.String())
 	s.statusRecorder.UpdateRosenpass(s.config.RosenpassEnabled, s.config.RosenpassPermissive)

-	s.connectClient.RunAsync(s.config, md)
+	s.clientRunning = true
+	s.clientRunningChan = make(chan struct{})
+	s.clientGiveUpChan = make(chan struct{})
+
+	go s.connectWithRetryRuns(ctx, s.config, s.statusRecorder, s.clientRunningChan, s.clientGiveUpChan)
 	s.publishConfigChangedEvent("up_rpc")

 	s.mutex.Unlock()
 	return s.waitForUp(callerCtx)
 }

-// waitForUp blocks until the in-flight run becomes established (success) or ends
-// before that (failure). The wait is owned by the supervisor (via the client) —
-// the daemon holds no per-run state here.
+// todo: handle potential race conditions
 func (s *Server) waitForUp(callerCtx context.Context) (*proto.UpResponse, error) {
 	timeoutCtx, cancel := context.WithTimeout(callerCtx, 50*time.Second)
 	defer cancel()

-	if err := s.connectClient.WaitEstablishedOrDone(timeoutCtx); err != nil {
-		log.Debugf("waiting for the connection to be established failed: %v", err)
-		return nil, fmt.Errorf("connection not established: %w", err)
+	select {
+	case <-s.clientGiveUpChan:
+		return nil, fmt.Errorf("client gave up to connect")
+	case <-s.clientRunningChan:
+		s.isSessionActive.Store(true)
+		return &proto.UpResponse{}, nil
+	case <-callerCtx.Done():
+		log.Debug("context done, stopping the wait for engine to become ready")
+		return nil, callerCtx.Err()
+	case <-timeoutCtx.Done():
+		log.Debug("up is timed out, stopping the wait for engine to become ready")
+		return nil, timeoutCtx.Err()
 	}
-	s.isSessionActive.Store(true)
-	return &proto.UpResponse{}, nil
 }

 // resolveProfileHandle resolves a wire-level profile handle (display
@@ -843,11 +935,11 @@ func (s *Server) SwitchProfile(callerCtx context.Context, msg *proto.SwitchProfi
 // Down engine work in the daemon.
 func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownResponse, error) {
 	s.mutex.Lock()
-	defer s.mutex.Unlock()

-	// cleanupConnection stops the run through the supervisor, which blocks until
-	// the run has fully unwound — no separate goroutine-quiescence wait needed.
+	giveUpChan := s.clientGiveUpChan
+
 	if err := s.cleanupConnection(); err != nil {
+		s.mutex.Unlock()
 		// todo review to update the status in case any type of error
 		log.Errorf("failed to shut down properly: %v", err)
 		return nil, err
@@ -856,6 +948,20 @@ func (s *Server) Down(ctx context.Context, _ *proto.DownRequest) (*proto.DownRes
 	state := internal.CtxGetState(s.rootCtx)
 	state.Set(internal.StatusIdle)

+	s.mutex.Unlock()
+
+	// Wait for the connectWithRetryRuns goroutine to finish with a short timeout.
+	// This prevents the goroutine from setting ErrResetConnection after Down() returns.
+	// The giveUpChan is closed at the end of connectWithRetryRuns.
+	if giveUpChan != nil {
+		select {
+		case <-giveUpChan:
+			log.Debugf("client goroutine finished successfully")
+		case <-time.After(5 * time.Second):
+			log.Warnf("timeout waiting for client goroutine to finish, proceeding anyway")
+		}
+	}
+
 	return &proto.DownResponse{}, nil
 }

@@ -866,19 +972,38 @@ func (s *Server) cleanupConnection() error {
 		return ErrServiceNotUp
 	}

-	// Tear the client down through the lifecycle supervisor BEFORE cancelling
-	// the retry context. Stop serializes on the supervisor queue and blocks
-	// until the in-flight run has fully unwound (a clean, synchronous teardown).
-	// It must run before actCancel: cancelling the context first would make
-	// Stop observe a dead context and return early without waiting.
-	if err := s.connectClient.Stop(); err != nil {
-		return err
+	// Daemon intent flips to "down" — all callers (Down RPC,
+	// Logout RPC handlers) tear down the connection because the user
+	// explicitly asked for it. MDM restart does NOT go through this
+	// path, so its clientRunning stays true.
+	s.clientRunning = false
+
+	// Capture the engine reference before cancelling the context.
+	// After actCancel(), the connectWithRetryRuns goroutine wakes up
+	// and sets connectClient.engine = nil, causing connectClient.Stop()
+	// to skip the engine shutdown entirely.
+	var engine *internal.Engine
+	if s.connectClient != nil {
+		engine = s.connectClient.Engine()
 	}

-	// Stop the retry goroutine so it does not start a fresh run. The client
-	// itself is daemon-lifetime and intentionally kept (a later Up reuses it).
 	s.actCancel()

+	if s.connectClient == nil {
+		return nil
+	}
+
+	// TODO: consider calling s.connectClient.Stop() instead of engine.Stop().
+	// actCancel() lets the run loop stop the engine too, so both stop it
+	// concurrently; ConnectClient.Stop cancels and waits for the run loop,
+	// making the run loop the sole owner of engine shutdown.
+	if engine != nil {
+		if err := engine.Stop(); err != nil {
+			return err
+		}
+	}
+
+	s.connectClient = nil
 	s.isSessionActive.Store(false)

 	log.Infof("service is down")
@@ -1013,7 +1138,7 @@ func (s *Server) validateProfileOperation(id profilemanager.ID, allowActiveProfi

 func (s *Server) logoutFromProfile(ctx context.Context, profile *profilemanager.Profile) error {
 	activeProf, err := s.profileManager.GetActiveProfileState()
-	if err == nil && activeProf.ID == profile.ID && s.connectClient.ConnectionRunning() {
+	if err == nil && activeProf.ID == profile.ID && s.connectClient != nil {
 		return s.sendLogoutRequest(ctx)
 	}

@@ -1059,13 +1184,48 @@ func (s *Server) Status(
 	ctx context.Context,
 	msg *proto.StatusRequest,
 ) (*proto.StatusResponse, error) {
-	// A run that hits a terminal auth failure now exits on its own (engine marks
-	// NeedsLogin), so we no longer poll-and-cancel: we wait for the in-flight run
-	// to become established or to end. With no run in flight this returns
-	// immediately (errNoRunInFlight); either way we then report the status below.
-	if msg.WaitForReady != nil && *msg.WaitForReady {
-		if err := s.connectClient.WaitEstablishedOrDone(ctx); err != nil && ctx.Err() != nil {
-			return nil, ctx.Err()
+	s.mutex.Lock()
+	// Only wait if the retry-loop goroutine is alive and making
+	// progress. clientRunning=true with connectionGoroutineRunning=false means the
+	// backoff has given up — there is nothing to wait for; let the
+	// caller observe the failed status directly.
+	alive := s.connectionGoroutineRunning()
+	s.mutex.Unlock()
+
+	if msg.WaitForReady != nil && *msg.WaitForReady && alive {
+		state := internal.CtxGetState(s.rootCtx)
+		status, err := state.Status()
+		if err != nil {
+			return nil, err
+		}
+
+		if status != internal.StatusIdle && status != internal.StatusConnected && status != internal.StatusConnecting {
+			s.actCancel()
+		}
+
+		ticker := time.NewTicker(1 * time.Second)
+		defer ticker.Stop()
+	loop:
+		for {
+			select {
+			case <-s.clientGiveUpChan:
+				ticker.Stop()
+				break loop
+			case <-s.clientRunningChan:
+				ticker.Stop()
+				break loop
+			case <-ticker.C:
+				status, err := state.Status()
+				if err != nil {
+					continue
+				}
+				if status != internal.StatusIdle && status != internal.StatusConnected && status != internal.StatusConnecting {
+					s.actCancel()
+				}
+				continue
+			case <-ctx.Done():
+				return nil, ctx.Err()
+			}
 		}
 	}

@@ -1103,6 +1263,10 @@ func (s *Server) getSSHServerState() *proto.SSHServerState {
 	connectClient := s.connectClient
 	s.mutex.Unlock()

+	if connectClient == nil {
+		return nil
+	}
+
 	engine := connectClient.Engine()
 	if engine == nil {
 		return nil
@@ -1140,6 +1304,10 @@ func (s *Server) GetPeerSSHHostKey(
 	statusRecorder := s.statusRecorder
 	s.mutex.Unlock()

+	if connectClient == nil {
+		return nil, errors.New("client not initialized")
+	}
+
 	engine := connectClient.Engine()
 	if engine == nil {
 		return nil, errors.New("engine not started")
@@ -1306,13 +1474,17 @@ func (s *Server) WaitJWTToken(
 // ExposeService exposes a local port via the NetBird reverse proxy.
 func (s *Server) ExposeService(req *proto.ExposeServiceRequest, srv proto.DaemonService_ExposeServiceServer) error {
 	s.mutex.Lock()
-	if !s.connectClient.ConnectionRunning() {
+	if !s.clientRunning {
 		s.mutex.Unlock()
 		return gstatus.Errorf(codes.FailedPrecondition, "client is not running, run 'netbird up' first")
 	}
 	connectClient := s.connectClient
 	s.mutex.Unlock()

+	if connectClient == nil {
+		return gstatus.Errorf(codes.FailedPrecondition, "client not initialized")
+	}
+
 	engine := connectClient.Engine()
 	if engine == nil {
 		return gstatus.Errorf(codes.FailedPrecondition, "engine not initialized")
@@ -1366,6 +1538,10 @@ func isUnixRunningDesktop() bool {
 }

 func (s *Server) runProbes(waitForProbeResult bool) {
+	if s.connectClient == nil {
+		return
+	}
+
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return
@@ -1644,6 +1820,22 @@ func (s *Server) GetFeatures(ctx context.Context, msg *proto.GetFeaturesRequest)
 	return features, nil
 }

+func (s *Server) connect(ctx context.Context, config *profilemanager.Config, statusRecorder *peer.Status, runningChan chan struct{}) error {
+	log.Tracef("running client connection")
+	client := internal.NewConnectClient(ctx, config, statusRecorder)
+	client.SetUpdateManager(s.updateManager)
+	client.SetSyncResponsePersistence(s.persistSyncResponse)
+
+	s.mutex.Lock()
+	s.connectClient = client
+	s.mutex.Unlock()
+
+	if err := client.Run(runningChan, s.logFile); err != nil {
+		return err
+	}
+	return nil
+}
+
 // MDM authority: when the platform-native MDM source sets a kill switch
 // key (regardless of true/false value), that value wins. The CLI flag
 // supplied at service install time is the fallback used only when the
@@ -1705,6 +1897,45 @@ func (s *Server) onSessionExpire() {
 	}
 }

+// getConnectWithBackoff returns a backoff with exponential backoff strategy for connection retries
+func getConnectWithBackoff(ctx context.Context) backoff.BackOff {
+	initialInterval := parseEnvDuration(retryInitialIntervalVar, defaultInitialRetryTime)
+	maxInterval := parseEnvDuration(maxRetryIntervalVar, defaultMaxRetryInterval)
+	maxElapsedTime := parseEnvDuration(maxRetryTimeVar, defaultMaxRetryTime)
+	multiplier := defaultRetryMultiplier
+
+	if envValue := os.Getenv(retryMultiplierVar); envValue != "" {
+		// parse the multiplier from the environment variable string value to float64
+		value, err := strconv.ParseFloat(envValue, 64)
+		if err != nil {
+			log.Warnf("unable to parse environment variable %s: %s. using default: %f", retryMultiplierVar, envValue, multiplier)
+		} else {
+			multiplier = value
+		}
+	}
+
+	return backoff.WithContext(&backoff.ExponentialBackOff{
+		InitialInterval:     initialInterval,
+		RandomizationFactor: 1,
+		Multiplier:          multiplier,
+		MaxInterval:         maxInterval,
+		MaxElapsedTime:      maxElapsedTime, // 14 days
+		Stop:                backoff.Stop,
+		Clock:               backoff.SystemClock,
+	}, ctx)
+}
+
+// parseEnvDuration parses the environment variable and returns the duration
+func parseEnvDuration(envVar string, defaultDuration time.Duration) time.Duration {
+	if envValue := os.Getenv(envVar); envValue != "" {
+		if duration, err := time.ParseDuration(envValue); err == nil {
+			return duration
+		}
+		log.Warnf("unable to parse environment variable %s: %s. using default: %s", envVar, envValue, defaultDuration)
+	}
+	return defaultDuration
+}
+
 // sendTerminalNotification sends a terminal notification message
 // to inform the user that the NetBird connection session has expired.
 func sendTerminalNotification() error {
--- a/client/server/server_connect_test.go
+++ b/client/server/server_connect_test.go
@@ -15,19 +15,14 @@ import (
 )

 func newTestServer() *Server {
-	ctx := context.Background()
-	s := &Server{
-		rootCtx:        ctx,
+	return &Server{
+		rootCtx:        context.Background(),
 		statusRecorder: peer.NewRecorder(""),
 	}
-	// Honor the production invariant: the daemon-lifetime client always exists
-	// (built in New). Server methods rely on s.connectClient being non-nil.
-	s.connectClient = internal.NewConnectClient(ctx, s.statusRecorder)
-	return s
 }

 func newDummyConnectClient(ctx context.Context) *internal.ConnectClient {
-	return internal.NewConnectClient(ctx, nil)
+	return internal.NewConnectClient(ctx, nil, nil)
 }

 // TestConnectSetsClientWithMutex validates that connect() sets s.connectClient
@@ -92,36 +87,41 @@ func TestConcurrentConnectClientAccess(t *testing.T) {
 	assert.Equal(t, 50, nilCount+setCount, "all goroutines should complete without panic")
 }

-// TestCleanupConnection_KeepsClientStopsRunning validates that cleanupConnection
-// clears the daemon "up" intent but KEEPS the daemon-lifetime ConnectClient
-// (it is reused across Up/Down; only the run is stopped).
-func TestCleanupConnection_KeepsClientStopsRunning(t *testing.T) {
+// TestCleanupConnection_ClearsConnectClient validates that cleanupConnection
+// properly nils out connectClient.
+func TestCleanupConnection_ClearsConnectClient(t *testing.T) {
 	s := newTestServer()
 	_, cancel := context.WithCancel(context.Background())
 	s.actCancel = cancel

+	s.connectClient = newDummyConnectClient(context.Background())
+	s.clientRunning = true
+
 	err := s.cleanupConnection()
 	require.NoError(t, err)

-	assert.NotNil(t, s.connectClient, "connectClient is daemon-lifetime and must persist after cleanup")
-	assert.False(t, s.connectClient.ConnectionRunning(), "no run should be in flight after cleanup")
+	assert.Nil(t, s.connectClient, "connectClient should be nil after cleanup")
+	assert.False(t, s.clientRunning, "clientRunning should be cleared after cleanup (intent = down)")
 }

-// TestCleanState_NotConnected validates that CleanState doesn't panic when no
-// connection run is in flight.
-func TestCleanState_NotConnected(t *testing.T) {
+// TestCleanState_NilConnectClient validates that CleanState doesn't panic
+// when connectClient is nil.
+func TestCleanState_NilConnectClient(t *testing.T) {
 	s := newTestServer()
-	s.profileManager = nil // will cause error if it tries to proceed
+	s.connectClient = nil
+	s.profileManager = nil // will cause error if it tries to proceed past the nil check

+	// Should not panic — the nil check should prevent calling Status() on nil
 	assert.NotPanics(t, func() {
 		_, _ = s.CleanState(context.Background(), &proto.CleanStateRequest{All: true})
 	})
 }

-// TestDeleteState_NotConnected validates that DeleteState doesn't panic when no
-// connection run is in flight.
-func TestDeleteState_NotConnected(t *testing.T) {
+// TestDeleteState_NilConnectClient validates that DeleteState doesn't panic
+// when connectClient is nil.
+func TestDeleteState_NilConnectClient(t *testing.T) {
 	s := newTestServer()
+	s.connectClient = nil
 	s.profileManager = nil

 	assert.NotPanics(t, func() {
@@ -129,6 +129,60 @@ func TestDeleteState_NotConnected(t *testing.T) {
 	})
 }

+// TestDownThenUp_StaleRunningChan documents the known state issue where
+// clientRunningChan from a previous connection is already closed, causing
+// waitForUp() to return immediately on reconnect.
+func TestDownThenUp_StaleRunningChan(t *testing.T) {
+	s := newTestServer()
+
+	// Simulate state after a successful connection
+	s.clientRunning = true
+	s.clientRunningChan = make(chan struct{})
+	close(s.clientRunningChan) // closed when engine started
+	s.clientGiveUpChan = make(chan struct{})
+	s.connectClient = newDummyConnectClient(context.Background())
+
+	_, cancel := context.WithCancel(context.Background())
+	s.actCancel = cancel
+
+	// Simulate Down(): cleanupConnection sets connectClient = nil and
+	// flips clientRunning to false (intent = down). The connectionGoroutineRunning state
+	// remains independent of intent — derived from clientGiveUpChan.
+	s.mutex.Lock()
+	err := s.cleanupConnection()
+	s.mutex.Unlock()
+	require.NoError(t, err)
+
+	// After cleanup: connectClient is nil, clientRunning is false (intent
+	// cleared by cleanupConnection), connectionGoroutineRunning may still be true
+	// (goroutine teardown is independent of the intent flag).
+	s.mutex.Lock()
+	assert.Nil(t, s.connectClient, "connectClient should be nil after cleanup")
+	assert.False(t, s.clientRunning, "clientRunning should be cleared by cleanupConnection (intent = down)")
+	s.mutex.Unlock()
+
+	// waitForUp() returns immediately due to stale closed clientRunningChan
+	ctx, ctxCancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer ctxCancel()
+
+	waitDone := make(chan error, 1)
+	go func() {
+		_, err := s.waitForUp(ctx)
+		waitDone <- err
+	}()
+
+	select {
+	case err := <-waitDone:
+		assert.NoError(t, err, "waitForUp returns success on stale channel")
+		// But connectClient is still nil — this is the stale state issue
+		s.mutex.Lock()
+		assert.Nil(t, s.connectClient, "connectClient is nil despite waitForUp success")
+		s.mutex.Unlock()
+	case <-time.After(1 * time.Second):
+		t.Fatal("waitForUp should have returned immediately due to stale closed channel")
+	}
+}
+
 // TestConnectClient_EngineNilOnFreshClient validates that a newly created
 // ConnectClient has nil Engine (before Run is called).
 func TestConnectClient_EngineNilOnFreshClient(t *testing.T) {
--- a/client/server/server_privileged_test.go
+++ b/client/server/server_privileged_test.go
@@ -0,0 +1,235 @@
+//go:build privileged
+
+package server
+
+import (
+	"context"
+	"net"
+	"os/user"
+	"testing"
+	"time"
+
+	"github.com/golang/mock/gomock"
+	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/otel"
+
+	"github.com/netbirdio/netbird/management/server/integrations/integrated_validator/validator"
+
+	"github.com/netbirdio/netbird/management/internals/controllers/network_map/controller"
+	"github.com/netbirdio/netbird/management/internals/controllers/network_map/update_channel"
+	"github.com/netbirdio/netbird/management/internals/modules/peers"
+	"github.com/netbirdio/netbird/management/internals/modules/peers/ephemeral/manager"
+	nbgrpc "github.com/netbirdio/netbird/management/internals/shared/grpc"
+	"github.com/netbirdio/netbird/management/server/job"
+
+	"github.com/netbirdio/netbird/management/internals/server/config"
+	"github.com/netbirdio/netbird/management/server/groups"
+
+	log "github.com/sirupsen/logrus"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/keepalive"
+
+	"github.com/netbirdio/netbird/client/internal"
+	"github.com/netbirdio/netbird/client/internal/peer"
+	"github.com/netbirdio/netbird/client/internal/profilemanager"
+	"github.com/netbirdio/netbird/management/server"
+	"github.com/netbirdio/netbird/management/server/activity"
+	nbcache "github.com/netbirdio/netbird/management/server/cache"
+	"github.com/netbirdio/netbird/management/server/integrations/port_forwarding"
+	"github.com/netbirdio/netbird/management/server/permissions"
+	"github.com/netbirdio/netbird/management/server/settings"
+	"github.com/netbirdio/netbird/management/server/store"
+	"github.com/netbirdio/netbird/management/server/telemetry"
+	mgmtProto "github.com/netbirdio/netbird/shared/management/proto"
+	"github.com/netbirdio/netbird/shared/signal/proto"
+	signalServer "github.com/netbirdio/netbird/signal/server"
+)
+
+var (
+	kaep = keepalive.EnforcementPolicy{
+		MinTime:             15 * time.Second,
+		PermitWithoutStream: true,
+	}
+
+	kasp = keepalive.ServerParameters{
+		MaxConnectionIdle:     15 * time.Second,
+		MaxConnectionAgeGrace: 5 * time.Second,
+		Time:                  5 * time.Second,
+		Timeout:               2 * time.Second,
+	}
+)
+
+// TestConnectWithRetryRuns checks that connectWithRetryRuns runs and retries according to the times specified via environment variables.
+// It starts a test management server whose Login handler increments a counter, which is used to capture the number of retries.
+func TestConnectWithRetryRuns(t *testing.T) {
+	// start the signal server
+	_, signalAddr, err := startSignal(t)
+	if err != nil {
+		t.Fatalf("failed to start signal server: %v", err)
+	}
+
+	counter := 0
+	// start the management server
+	_, mgmtAddr, err := startManagement(t, signalAddr, &counter)
+	if err != nil {
+		t.Fatalf("failed to start management server: %v", err)
+	}
+
+	ctx := internal.CtxInitState(context.Background())
+
+	ctx, cancel := context.WithDeadline(ctx, time.Now().Add(30*time.Second))
+	defer cancel()
+	// create new server
+	ic := profilemanager.ConfigInput{
+		ManagementURL: "http://" + mgmtAddr,
+		ConfigPath:    t.TempDir() + "/test-profile.json",
+	}
+
+	config, err := profilemanager.UpdateOrCreateConfig(ic)
+	if err != nil {
+		t.Fatalf("failed to create config: %v", err)
+	}
+
+	currUser, err := user.Current()
+	require.NoError(t, err)
+
+	pm := profilemanager.ServiceManager{}
+	err = pm.SetActiveProfileState(&profilemanager.ActiveProfileState{
+		ID:       "test-profile",
+		Username: currUser.Username,
+	})
+	if err != nil {
+		t.Fatalf("failed to set active profile state: %v", err)
+	}
+
+	s := New(ctx, "debug", "", false, false, false, false)
+
+	s.config = config
+
+	s.statusRecorder = peer.NewRecorder(config.ManagementURL.String())
+	t.Setenv(retryInitialIntervalVar, "1s")
+	t.Setenv(maxRetryIntervalVar, "2s")
+	t.Setenv(maxRetryTimeVar, "5s")
+	t.Setenv(retryMultiplierVar, "1")
+
+	s.connectWithRetryRuns(ctx, config, s.statusRecorder, nil, nil)
+	if counter < 3 {
+		t.Fatalf("expected counter > 2, got %d", counter)
+	}
+}
+
+type mockServer struct {
+	mgmtProto.ManagementServiceServer
+	counter *int
+}
+
+func (m *mockServer) Login(ctx context.Context, req *mgmtProto.EncryptedMessage) (*mgmtProto.EncryptedMessage, error) {
+	*m.counter++
+	return m.ManagementServiceServer.Login(ctx, req)
+}
+
+func startManagement(t *testing.T, signalAddr string, counter *int) (*grpc.Server, string, error) {
+	t.Helper()
+	dataDir := t.TempDir()
+
+	config := &config.Config{
+		Stuns:      []*config.Host{},
+		TURNConfig: &config.TURNConfig{},
+		Signal: &config.Host{
+			Proto: "http",
+			URI:   signalAddr,
+		},
+		Datadir:    dataDir,
+		HttpConfig: nil,
+	}
+
+	lis, err := net.Listen("tcp", "localhost:0")
+	if err != nil {
+		return nil, "", err
+	}
+	s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))
+	store, cleanUp, err := store.NewTestStoreFromSQL(context.Background(), "", config.Datadir)
+	if err != nil {
+		return nil, "", err
+	}
+	t.Cleanup(cleanUp)
+
+	eventStore := &activity.InMemoryEventStore{}
+	if err != nil {
+		return nil, "", err
+	}
+
+	ctrl := gomock.NewController(t)
+	t.Cleanup(ctrl.Finish)
+
+	permissionsManagerMock := permissions.NewMockManager(ctrl)
+	peersManager := peers.NewManager(store, permissionsManagerMock)
+	settingsManagerMock := settings.NewMockManager(ctrl)
+
+	jobManager := job.NewJobManager(nil, store, peersManager)
+
+	cacheStore, err := nbcache.NewStore(context.Background(), 100*time.Millisecond, 300*time.Millisecond, 100)
+	if err != nil {
+		return nil, "", err
+	}
+
+	ia, _ := validator.NewIntegratedValidator(context.Background(), peersManager, settingsManagerMock, eventStore, cacheStore)
+
+	metrics, err := telemetry.NewDefaultAppMetrics(context.Background())
+	require.NoError(t, err)
+
+	settingsMockManager := settings.NewMockManager(ctrl)
+	groupsManager := groups.NewManagerMock()
+
+	requestBuffer := server.NewAccountRequestBuffer(context.Background(), store)
+	peersUpdateManager := update_channel.NewPeersUpdateManager(metrics)
+	networkMapController := controller.NewController(context.Background(), store, metrics, peersUpdateManager, requestBuffer, server.MockIntegratedValidator{}, settingsMockManager, "netbird.selfhosted", port_forwarding.NewControllerMock(), manager.NewEphemeralManager(store, peersManager), config)
+	accountManager, err := server.BuildManager(context.Background(), config, store, networkMapController, jobManager, nil, "", eventStore, nil, false, ia, metrics, port_forwarding.NewControllerMock(), settingsMockManager, permissionsManagerMock, false, cacheStore)
+	if err != nil {
+		return nil, "", err
+	}
+
+	secretsManager, err := nbgrpc.NewTimeBasedAuthSecretsManager(peersUpdateManager, config.TURNConfig, config.Relay, settingsMockManager, groupsManager)
+	if err != nil {
+		return nil, "", err
+	}
+	mgmtServer, err := nbgrpc.NewServer(config, accountManager, settingsMockManager, jobManager, secretsManager, nil, nil, &server.MockIntegratedValidator{}, networkMapController, nil, nil)
+	if err != nil {
+		return nil, "", err
+	}
+	mock := &mockServer{
+		ManagementServiceServer: mgmtServer,
+		counter:                 counter,
+	}
+	mgmtProto.RegisterManagementServiceServer(s, mock)
+	go func() {
+		if err = s.Serve(lis); err != nil {
+			log.Fatalf("failed to serve: %v", err)
+		}
+	}()
+
+	return s, lis.Addr().String(), nil
+}
+
+func startSignal(t *testing.T) (*grpc.Server, string, error) {
+	t.Helper()
+
+	s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))
+
+	lis, err := net.Listen("tcp", "localhost:0")
+	if err != nil {
+		return nil, "", err
+	}
+
+	srv, err := signalServer.NewServer(context.Background(), otel.Meter(""))
+	require.NoError(t, err)
+	proto.RegisterSignalExchangeServer(s, srv)
+
+	go func() {
+		if err = s.Serve(lis); err != nil {
+			log.Fatalf("failed to serve: %v", err)
+		}
+	}()
+
+	return s, lis.Addr().String(), nil
+}
--- a/client/server/server_test.go
+++ b/client/server/server_test.go
@@ -2,62 +2,20 @@ package server

 import (
 	"context"
-	"net"
 	"net/url"
 	"os/user"
 	"path/filepath"
 	"testing"
 	"time"

-	"github.com/golang/mock/gomock"
-	"github.com/stretchr/testify/require"
-	"go.opentelemetry.io/otel"
-
-	"github.com/netbirdio/netbird/management/server/integrations/integrated_validator/validator"
-
-	"github.com/netbirdio/netbird/management/internals/controllers/network_map/controller"
-	"github.com/netbirdio/netbird/management/internals/controllers/network_map/update_channel"
-	"github.com/netbirdio/netbird/management/internals/modules/peers"
-	"github.com/netbirdio/netbird/management/internals/modules/peers/ephemeral/manager"
-	nbgrpc "github.com/netbirdio/netbird/management/internals/shared/grpc"
-	"github.com/netbirdio/netbird/management/server/job"
-
-	"github.com/netbirdio/netbird/management/internals/server/config"
-	"github.com/netbirdio/netbird/management/server/groups"
-
 	log "github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 	"google.golang.org/grpc"
-	"google.golang.org/grpc/keepalive"

 	"github.com/netbirdio/netbird/client/internal"
 	"github.com/netbirdio/netbird/client/internal/profilemanager"
 	daemonProto "github.com/netbirdio/netbird/client/proto"
-	"github.com/netbirdio/netbird/management/server"
-	"github.com/netbirdio/netbird/management/server/activity"
-	nbcache "github.com/netbirdio/netbird/management/server/cache"
-	"github.com/netbirdio/netbird/management/server/integrations/port_forwarding"
-	"github.com/netbirdio/netbird/management/server/permissions"
-	"github.com/netbirdio/netbird/management/server/settings"
-	"github.com/netbirdio/netbird/management/server/store"
-	"github.com/netbirdio/netbird/management/server/telemetry"
-	mgmtProto "github.com/netbirdio/netbird/shared/management/proto"
-	"github.com/netbirdio/netbird/shared/signal/proto"
-	signalServer "github.com/netbirdio/netbird/signal/server"
-)
-
-var (
-	kaep = keepalive.EnforcementPolicy{
-		MinTime:             15 * time.Second,
-		PermitWithoutStream: true,
-	}
-
-	kasp = keepalive.ServerParameters{
-		MaxConnectionIdle:     15 * time.Second,
-		MaxConnectionAgeGrace: 5 * time.Second,
-		Time:                  5 * time.Second,
-		Timeout:               2 * time.Second,
-	}
 )

 func TestServer_Up(t *testing.T) {
@@ -199,119 +157,3 @@ func TestServer_SubcribeEvents(t *testing.T) {

 	assert.NoError(t, err)
 }
-
-type mockServer struct {
-	mgmtProto.ManagementServiceServer
-	counter *int
-}
-
-func (m *mockServer) Login(ctx context.Context, req *mgmtProto.EncryptedMessage) (*mgmtProto.EncryptedMessage, error) {
-	*m.counter++
-	return m.ManagementServiceServer.Login(ctx, req)
-}
-
-func startManagement(t *testing.T, signalAddr string, counter *int) (*grpc.Server, string, error) {
-	t.Helper()
-	dataDir := t.TempDir()
-
-	config := &config.Config{
-		Stuns:      []*config.Host{},
-		TURNConfig: &config.TURNConfig{},
-		Signal: &config.Host{
-			Proto: "http",
-			URI:   signalAddr,
-		},
-		Datadir:    dataDir,
-		HttpConfig: nil,
-	}
-
-	lis, err := net.Listen("tcp", "localhost:0")
-	if err != nil {
-		return nil, "", err
-	}
-	s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))
-	store, cleanUp, err := store.NewTestStoreFromSQL(context.Background(), "", config.Datadir)
-	if err != nil {
-		return nil, "", err
-	}
-	t.Cleanup(cleanUp)
-
-	eventStore := &activity.InMemoryEventStore{}
-	if err != nil {
-		return nil, "", err
-	}
-
-	ctrl := gomock.NewController(t)
-	t.Cleanup(ctrl.Finish)
-
-	permissionsManagerMock := permissions.NewMockManager(ctrl)
-	peersManager := peers.NewManager(store, permissionsManagerMock)
-	settingsManagerMock := settings.NewMockManager(ctrl)
-
-	jobManager := job.NewJobManager(nil, store, peersManager)
-
-	cacheStore, err := nbcache.NewStore(context.Background(), 100*time.Millisecond, 300*time.Millisecond, 100)
-	if err != nil {
-		return nil, "", err
-	}
-
-	ia, _ := validator.NewIntegratedValidator(context.Background(), peersManager, settingsManagerMock, eventStore, cacheStore)
-
-	metrics, err := telemetry.NewDefaultAppMetrics(context.Background())
-	require.NoError(t, err)
-
-	settingsMockManager := settings.NewMockManager(ctrl)
-	groupsManager := groups.NewManagerMock()
-
-	requestBuffer := server.NewAccountRequestBuffer(context.Background(), store)
-	peersUpdateManager := update_channel.NewPeersUpdateManager(metrics)
-	networkMapController := controller.NewController(context.Background(), store, metrics, peersUpdateManager, requestBuffer, server.MockIntegratedValidator{}, settingsMockManager, "netbird.selfhosted", port_forwarding.NewControllerMock(), manager.NewEphemeralManager(store, peersManager), config)
-	accountManager, err := server.BuildManager(context.Background(), config, store, networkMapController, jobManager, nil, "", eventStore, nil, false, ia, metrics, port_forwarding.NewControllerMock(), settingsMockManager, permissionsManagerMock, false, cacheStore)
-	if err != nil {
-		return nil, "", err
-	}
-
-	secretsManager, err := nbgrpc.NewTimeBasedAuthSecretsManager(peersUpdateManager, config.TURNConfig, config.Relay, settingsMockManager, groupsManager)
-	if err != nil {
-		return nil, "", err
-	}
-	mgmtServer, err := nbgrpc.NewServer(config, accountManager, settingsMockManager, jobManager, secretsManager, nil, nil, &server.MockIntegratedValidator{}, networkMapController, nil, nil)
-	if err != nil {
-		return nil, "", err
-	}
-	mock := &mockServer{
-		ManagementServiceServer: mgmtServer,
-		counter:                 counter,
-	}
-	mgmtProto.RegisterManagementServiceServer(s, mock)
-	go func() {
-		if err = s.Serve(lis); err != nil {
-			log.Fatalf("failed to serve: %v", err)
-		}
-	}()
-
-	return s, lis.Addr().String(), nil
-}
-
-func startSignal(t *testing.T) (*grpc.Server, string, error) {
-	t.Helper()
-
-	s := grpc.NewServer(grpc.KeepaliveEnforcementPolicy(kaep), grpc.KeepaliveParams(kasp))
-
-	lis, err := net.Listen("tcp", "localhost:0")
-	if err != nil {
-		log.Fatalf("failed to listen: %v", err)
-	}
-
-	srv, err := signalServer.NewServer(context.Background(), otel.Meter(""))
-	require.NoError(t, err)
-	proto.RegisterSignalExchangeServer(s, srv)
-
-	go func() {
-		if err = s.Serve(lis); err != nil {
-			log.Fatalf("failed to serve: %v", err)
-		}
-	}()
-
-	return s, lis.Addr().String(), nil
-}
--- a/client/server/state.go
+++ b/client/server/state.go
@@ -9,6 +9,7 @@ import (
 	"google.golang.org/grpc/status"

 	nberrors "github.com/netbirdio/netbird/client/errors"
+	"github.com/netbirdio/netbird/client/internal"
 	"github.com/netbirdio/netbird/client/internal/routemanager/systemops"
 	"github.com/netbirdio/netbird/client/internal/statemanager"
 	"github.com/netbirdio/netbird/client/proto"
@@ -37,7 +38,7 @@ func (s *Server) ListStates(_ context.Context, _ *proto.ListStatesRequest) (*pro

 // CleanState handles cleaning of states (performing cleanup operations)
 func (s *Server) CleanState(ctx context.Context, req *proto.CleanStateRequest) (*proto.CleanStateResponse, error) {
-	if s.connectClient.ConnectionRunning() {
+	if s.connectClient != nil && (s.connectClient.Status() == internal.StatusConnected || s.connectClient.Status() == internal.StatusConnecting) {
 		return nil, status.Errorf(codes.FailedPrecondition, "cannot clean state while connecting or connected, run 'netbird down' first.")
 	}

@@ -80,7 +81,7 @@ func (s *Server) CleanState(ctx context.Context, req *proto.CleanStateRequest) (

 // DeleteState handles deletion of states without cleanup
 func (s *Server) DeleteState(ctx context.Context, req *proto.DeleteStateRequest) (*proto.DeleteStateResponse, error) {
-	if s.connectClient.ConnectionRunning() {
+	if s.connectClient != nil && (s.connectClient.Status() == internal.StatusConnected || s.connectClient.Status() == internal.StatusConnecting) {
 		return nil, status.Errorf(codes.FailedPrecondition, "cannot clean state while connecting or connected, run 'netbird down' first.")
 	}

--- a/client/server/trace.go
+++ b/client/server/trace.go
@@ -62,6 +62,10 @@ func (s *Server) TracePacket(_ context.Context, req *proto.TracePacketRequest) (
 }

 func (s *Server) getPacketTracer() (packetTracer, *internal.Engine, error) {
+	if s.connectClient == nil {
+		return nil, nil, fmt.Errorf("connect client not initialized")
+	}
+
 	engine := s.connectClient.Engine()
 	if engine == nil {
 		return nil, nil, fmt.Errorf("engine not initialized")
--- a/client/ssh/client/client_privileged_test.go
+++ b/client/ssh/client/client_privileged_test.go
@@ -0,0 +1,118 @@
+//go:build privileged
+
+package client
+
+import (
+	"context"
+	"errors"
+	"runtime"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	cryptossh "golang.org/x/crypto/ssh"
+
+	"github.com/netbirdio/netbird/client/ssh/testutil"
+)
+
+// TestSSHClient_CommandExecution runs shell commands through a client connected
+// to the test SSH server and checks output capture, streaming, flag handling and
+// the handling of non-zero exit codes.
+func TestSSHClient_CommandExecution(t *testing.T) {
+	if runtime.GOOS == "windows" && testutil.IsCI() {
+		t.Skip("Skipping Windows command execution tests in CI due to S4U authentication issues")
+	}
+
+	sshServer, _, sshClient := setupTestSSHServerAndClient(t)
+	// Deferred calls run last-in-first-out, so the client is closed before the
+	// server is stopped.
+	defer func() { require.NoError(t, sshServer.Stop()) }()
+	defer func() { assert.NoError(t, sshClient.Close()) }()
+
+	// Every subtest shares this single 3-second deadline.
+	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
+	defer cancel()
+
+	t.Run("ExecuteCommand captures output", func(t *testing.T) {
+		out, err := sshClient.ExecuteCommand(ctx, "echo hello")
+		assert.NoError(t, err)
+		assert.Contains(t, string(out), "hello")
+	})
+
+	t.Run("ExecuteCommandWithIO streams output", func(t *testing.T) {
+		streamErr := sshClient.ExecuteCommandWithIO(ctx, "echo world")
+		assert.NoError(t, streamErr)
+	})
+
+	t.Run("commands with flags work", func(t *testing.T) {
+		out, err := sshClient.ExecuteCommand(ctx, "echo -n test_flag")
+		assert.NoError(t, err)
+		assert.Equal(t, "test_flag", strings.TrimSpace(string(out)))
+	})
+
+	t.Run("non-zero exit codes don't return errors", func(t *testing.T) {
+		// The search stage matches nothing; the subtest expects ExecuteCommand to
+		// report that without returning an error.
+		cmd := "echo 'hello' | grep 'notfound'"
+		if runtime.GOOS == "windows" {
+			cmd = "echo hello | Select-String notfound"
+		}
+		_, err := sshClient.ExecuteCommand(ctx, cmd)
+		assert.NoError(t, err)
+	})
+}
+
+// TestSSHClient_ContextCancellation checks that any error produced by dialing or
+// running a command with an expiring context is a timeout- or cancellation-style error.
+func TestSSHClient_ContextCancellation(t *testing.T) {
+	server, serverAddr, _ := setupTestSSHServerAndClient(t)
+	defer func() { require.NoError(t, server.Stop()) }()
+
+	t.Run("connection with short timeout", func(t *testing.T) {
+		ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
+		defer cancel()
+		currentUser := testutil.GetTestUsername(t)
+		client, err := Dial(ctx, serverAddr, currentUser, DialOptions{
+			InsecureSkipVerify: true,
+		})
+		if err == nil {
+			// The dial beat the 1ms deadline: close the client instead of leaking it.
+			_ = client.Close()
+			return
+		}
+		// Prefer the context sentinel errors; fall back to a "timeout" substring match.
+		assert.True(t,
+			errors.Is(err, context.DeadlineExceeded) ||
+				errors.Is(err, context.Canceled) ||
+				strings.Contains(err.Error(), "timeout"),
+			"Expected timeout-related error, got: %v", err)
+	})
+
+	t.Run("command execution cancellation", func(t *testing.T) {
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		currentUser := testutil.GetTestUsername(t)
+		client, err := Dial(ctx, serverAddr, currentUser, DialOptions{
+			InsecureSkipVerify: true,
+		})
+		require.NoError(t, err)
+		defer func() {
+			if err := client.Close(); err != nil {
+				t.Logf("client close error: %v", err)
+			}
+		}()
+
+		cmdCtx, cmdCancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+		defer cmdCancel()
+		err = client.ExecuteCommandWithPTY(cmdCtx, "sleep 10")
+		if err != nil {
+			var exitMissingErr *cryptossh.ExitMissingError
+			isValidCancellation := errors.Is(err, context.DeadlineExceeded) ||
+				errors.Is(err, context.Canceled) ||
+				errors.As(err, &exitMissingErr)
+			assert.True(t, isValidCancellation, "Should handle command cancellation properly")
+		}
+	})
+}
--- a/client/ssh/client/client_test.go
+++ b/client/ssh/client/client_test.go
@@ -15,7 +15,6 @@ import (

 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
-	cryptossh "golang.org/x/crypto/ssh"

 	"github.com/netbirdio/netbird/client/ssh"
 	sshserver "github.com/netbirdio/netbird/client/ssh/server"
@@ -78,53 +77,6 @@ func TestSSHClient_DialWithKey(t *testing.T) {
 	assert.NotNil(t, client.client)
 }

-func TestSSHClient_CommandExecution(t *testing.T) {
-	if runtime.GOOS == "windows" && testutil.IsCI() {
-		t.Skip("Skipping Windows command execution tests in CI due to S4U authentication issues")
-	}
-
-	server, _, client := setupTestSSHServerAndClient(t)
-	defer func() {
-		err := server.Stop()
-		require.NoError(t, err)
-	}()
-	defer func() {
-		err := client.Close()
-		assert.NoError(t, err)
-	}()
-
-	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
-	defer cancel()
-
-	t.Run("ExecuteCommand captures output", func(t *testing.T) {
-		output, err := client.ExecuteCommand(ctx, "echo hello")
-		assert.NoError(t, err)
-		assert.Contains(t, string(output), "hello")
-	})
-
-	t.Run("ExecuteCommandWithIO streams output", func(t *testing.T) {
-		err := client.ExecuteCommandWithIO(ctx, "echo world")
-		assert.NoError(t, err)
-	})
-
-	t.Run("commands with flags work", func(t *testing.T) {
-		output, err := client.ExecuteCommand(ctx, "echo -n test_flag")
-		assert.NoError(t, err)
-		assert.Equal(t, "test_flag", strings.TrimSpace(string(output)))
-	})
-
-	t.Run("non-zero exit codes don't return errors", func(t *testing.T) {
-		var testCmd string
-		if runtime.GOOS == "windows" {
-			testCmd = "echo hello | Select-String notfound"
-		} else {
-			testCmd = "echo 'hello' | grep 'notfound'"
-		}
-		_, err := client.ExecuteCommand(ctx, testCmd)
-		assert.NoError(t, err)
-	})
-}
-
 func TestSSHClient_ConnectionHandling(t *testing.T) {
 	server, serverAddr, _ := setupTestSSHServerAndClient(t)
 	defer func() {
@@ -154,59 +106,6 @@ func TestSSHClient_ConnectionHandling(t *testing.T) {
 	}
 }

-func TestSSHClient_ContextCancellation(t *testing.T) {
-	server, serverAddr, _ := setupTestSSHServerAndClient(t)
-	defer func() {
-		err := server.Stop()
-		require.NoError(t, err)
-	}()
-
-	t.Run("connection with short timeout", func(t *testing.T) {
-		ctx, cancel := context.WithTimeout(context.Background(), 1*time.Millisecond)
-		defer cancel()
-
-		currentUser := testutil.GetTestUsername(t)
-		_, err := Dial(ctx, serverAddr, currentUser, DialOptions{
-			InsecureSkipVerify: true,
-		})
-		if err != nil {
-			// Check for actual timeout-related errors rather than string matching
-			assert.True(t,
-				errors.Is(err, context.DeadlineExceeded) ||
-					errors.Is(err, context.Canceled) ||
-					strings.Contains(err.Error(), "timeout"),
-				"Expected timeout-related error, got: %v", err)
-		}
-	})
-
-	t.Run("command execution cancellation", func(t *testing.T) {
-		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-		defer cancel()
-		currentUser := testutil.GetTestUsername(t)
-		client, err := Dial(ctx, serverAddr, currentUser, DialOptions{
-			InsecureSkipVerify: true,
-		})
-		require.NoError(t, err)
-		defer func() {
-			if err := client.Close(); err != nil {
-				t.Logf("client close error: %v", err)
-			}
-		}()
-
-		cmdCtx, cmdCancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
-		defer cmdCancel()
-
-		err = client.ExecuteCommandWithPTY(cmdCtx, "sleep 10")
-		if err != nil {
-			var exitMissingErr *cryptossh.ExitMissingError
-			isValidCancellation := errors.Is(err, context.DeadlineExceeded) ||
-				errors.Is(err, context.Canceled) ||
-				errors.As(err, &exitMissingErr)
-			assert.True(t, isValidCancellation, "Should handle command cancellation properly")
-		}
-	})
-}
-
 func TestSSHClient_NoAuthMode(t *testing.T) {
 	hostKey, err := ssh.GeneratePrivateKey(ssh.ED25519)
 	require.NoError(t, err)
--- a/client/ssh/proxy/proxy_privileged_test.go
+++ b/client/ssh/proxy/proxy_privileged_test.go
@@ -0,0 +1,423 @@
+//go:build privileged
+
+package proxy
+
+import (
+	"bytes"
+	"context"
+	"crypto/rand"
+	"crypto/rsa"
+	"encoding/base64"
+	"encoding/json"
+	"io"
+	"math/big"
+	"net"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"runtime"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/golang-jwt/jwt/v5"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	cryptossh "golang.org/x/crypto/ssh"
+
+	nbssh "github.com/netbirdio/netbird/client/ssh"
+	sshauth "github.com/netbirdio/netbird/client/ssh/auth"
+	"github.com/netbirdio/netbird/client/ssh/server"
+	"github.com/netbirdio/netbird/client/ssh/testutil"
+	nbjwt "github.com/netbirdio/netbird/shared/auth/jwt"
+	sshuserhash "github.com/netbirdio/netbird/shared/sshauth"
+)
+
+func (m *mockDaemon) setJWTToken(token string) {
+	m.impl.jwtToken = token // overwrite the JWT held by the mock daemon implementation
+}
+
+// TestSSHProxy_Connect runs "echo hello-from-proxy" through the proxy against a JWT-protected test SSH server.
+func TestSSHProxy_Connect(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping integration test in short mode")
+	}
+
+	// TODO: Windows test times out - user switching and command execution tested on Linux
+	if runtime.GOOS == "windows" {
+		t.Skip("Skipping on Windows - covered by Linux tests")
+	}
+
+	const (
+		issuer   = "https://test-issuer.example.com"
+		audience = "test-audience"
+	)
+	// Test JWKS server; its URL becomes the SSH server's JWT KeysLocation below.
+	jwksServer, privateKey, jwksURL := setupJWKSServer(t)
+	defer jwksServer.Close()
+
+	hostKey, err := nbssh.GeneratePrivateKey(nbssh.ED25519)
+	require.NoError(t, err)
+	hostPubKey, err := nbssh.GeneratePublicKey(hostKey)
+	require.NoError(t, err)
+
+	serverConfig := &server.Config{
+		HostKeyPEM: hostKey,
+		JWT: &server.JWTConfig{
+			Issuer:       issuer,
+			Audiences:    []string{audience},
+			KeysLocation: jwksURL,
+		},
+	}
+	sshServer := server.New(serverConfig)
+	sshServer.SetAllowRootLogin(true)
+
+	// Configure SSH authorization for the test user
+	testUsername := testutil.GetTestUsername(t)
+	testJWTUser := "test-username"
+	testUserHash, err := sshuserhash.HashUserID(testJWTUser)
+	require.NoError(t, err)
+
+	authConfig := &sshauth.Config{
+		UserIDClaim:     sshauth.DefaultUserIDClaim,
+		AuthorizedUsers: []sshuserhash.UserIDHash{testUserHash},
+		MachineUsers: map[string][]uint32{
+			testUsername: {0}, // Index 0 in AuthorizedUsers
+		},
+	}
+	sshServer.UpdateSSHAuth(authConfig)
+
+	sshServerAddr := server.StartTestServer(t, sshServer)
+	defer func() { _ = sshServer.Stop() }()
+
+	mockDaemon := startMockDaemon(t)
+	defer mockDaemon.stop()
+
+	host, portStr, err := net.SplitHostPort(sshServerAddr)
+	require.NoError(t, err)
+	port, err := strconv.Atoi(portStr)
+	require.NoError(t, err)
+
+	mockDaemon.setHostKey(host, hostPubKey)
+
+	validToken := generateValidJWT(t, privateKey, issuer, audience, testJWTUser)
+	mockDaemon.setJWTToken(validToken)
+
+	proxyInstance, err := New(mockDaemon.addr, host, port, io.Discard, nil)
+	require.NoError(t, err)
+	// clientConn feeds the test's SSH client below; proxyConn is bridged to the swapped stdio pipes.
+	clientConn, proxyConn := net.Pipe()
+	defer func() { _ = clientConn.Close() }()
+
+	origStdin := os.Stdin
+	origStdout := os.Stdout
+	defer func() {
+		os.Stdin = origStdin
+		os.Stdout = origStdout
+	}()
+	// NOTE(review): none of the four os.Pipe ends created here is closed by this test.
+	stdinReader, stdinWriter, err := os.Pipe()
+	require.NoError(t, err)
+	stdoutReader, stdoutWriter, err := os.Pipe()
+	require.NoError(t, err)
+	// Replace the process-wide stdio; presumably Connect reads os.Stdin and writes os.Stdout - verify.
+	os.Stdin = stdinReader
+	os.Stdout = stdoutWriter
+	// Bridge the pipes: proxyConn -> stdinWriter and stdoutReader -> proxyConn.
+	go func() {
+		_, _ = io.Copy(stdinWriter, proxyConn)
+	}()
+	go func() {
+		_, _ = io.Copy(proxyConn, stdoutReader)
+	}()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	// Connect runs in the background; NOTE(review): connectErrCh is never read in this test.
+	connectErrCh := make(chan error, 1)
+	go func() {
+		connectErrCh <- proxyInstance.Connect(ctx)
+	}()
+
+	sshConfig := &cryptossh.ClientConfig{
+		User:            testutil.GetTestUsername(t),
+		Auth:            []cryptossh.AuthMethod{},
+		HostKeyCallback: cryptossh.InsecureIgnoreHostKey(),
+		Timeout:         3 * time.Second,
+	}
+
+	sshClientConn, chans, reqs, err := cryptossh.NewClientConn(clientConn, "test", sshConfig)
+	require.NoError(t, err, "Should connect to proxy server")
+	defer func() { _ = sshClientConn.Close() }()
+	sshClient := cryptossh.NewClient(sshClientConn, chans, reqs)
+
+	session, err := sshClient.NewSession()
+	require.NoError(t, err, "Should create session through full proxy to backend")
+	// Run the command in a goroutine so the select below can enforce the 3-second timeout.
+	outputCh := make(chan []byte, 1)
+	errCh := make(chan error, 1)
+	go func() {
+		output, err := session.Output("echo hello-from-proxy")
+		outputCh <- output
+		errCh <- err
+	}()
+
+	select {
+	case output := <-outputCh:
+		err := <-errCh
+		require.NoError(t, err, "Command should execute successfully through proxy")
+		assert.Contains(t, string(output), "hello-from-proxy", "Should receive command output through proxy")
+	case <-time.After(3 * time.Second):
+		t.Fatal("Command execution timed out")
+	}
+	// Explicit teardown; the deferred Close and cancel calls above still run after this.
+	_ = session.Close()
+	_ = sshClient.Close()
+	_ = clientConn.Close()
+	cancel()
+}
+
+// TestSSHProxy_CommandQuoting checks that the proxy forwards commands to the
+// backend with their shell quoting intact. Tools such as Ansible rely on this
+// when they send commands like:
+//
+//	/bin/sh -c '( umask 77 && mkdir -p ... ) && sleep 0'
+//
+// where the quoted subshell expression has to reach the backend shell as the
+// single argument of -c.
+func TestSSHProxy_CommandQuoting(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping integration test in short mode")
+	}
+
+	sshClient, cleanup := setupProxySSHClient(t)
+	defer cleanup()
+
+	// Each command is the exec payload exactly as the SSH protocol delivers it.
+	// For `ssh host '/bin/sh -c "( echo hello )"'` the local shell removes the
+	// outer single quotes, so the exec request carries the raw string
+	// /bin/sh -c "( echo hello )".
+	//
+	// The proxy has to pass that string on unchanged. Going through
+	// session.Command() (shlex.Split + strings.Join) drops the inner double
+	// quotes and breaks the command on the backend.
+	cases := []struct {
+		name    string
+		command string
+		expect  string
+	}{
+		{
+			name:    "subshell_in_double_quotes",
+			command: `/bin/sh -c "( echo from-subshell ) && echo outer"`,
+			expect:  "from-subshell\nouter\n",
+		},
+		{
+			name:    "printf_with_special_chars",
+			command: `/bin/sh -c "printf '%s\n' 'hello world'"`,
+			expect:  "hello world\n",
+		},
+		{
+			name:    "nested_command_substitution",
+			command: `/bin/sh -c "echo $(echo nested)"`,
+			expect:  "nested\n",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			sess, err := sshClient.NewSession()
+			require.NoError(t, err)
+			defer func() { _ = sess.Close() }()
+
+			var stderr bytes.Buffer
+			sess.Stderr = &stderr
+
+			type result struct {
+				out []byte
+				err error
+			}
+			done := make(chan result, 1)
+			go func() {
+				out, runErr := sess.Output(tc.command)
+				done <- result{out: out, err: runErr}
+			}()
+			select {
+			case res := <-done:
+				if stderr.Len() > 0 {
+					t.Logf("stderr: %s", stderr.String())
+				}
+				require.NoError(t, res.err, "command should succeed: %s", tc.command)
+				assert.Equal(t, tc.expect, string(res.out), "output mismatch for: %s", tc.command)
+			case <-time.After(5 * time.Second):
+				t.Fatalf("command timed out: %s", tc.command)
+			}
+		})
+	}
+}
+
+// setupProxySSHClient creates a full proxy test environment and returns an SSH client
+// connected through the proxy to a backend NetBird SSH server, plus a teardown function.
+func setupProxySSHClient(t *testing.T) (*cryptossh.Client, func()) {
+	t.Helper()
+
+	const (
+		issuer   = "https://test-issuer.example.com"
+		audience = "test-audience"
+	)
+
+	jwksServer, privateKey, jwksURL := setupJWKSServer(t)
+
+	hostKey, err := nbssh.GeneratePrivateKey(nbssh.ED25519)
+	require.NoError(t, err)
+	hostPubKey, err := nbssh.GeneratePublicKey(hostKey)
+	require.NoError(t, err)
+
+	serverConfig := &server.Config{
+		HostKeyPEM: hostKey,
+		JWT: &server.JWTConfig{
+			Issuer:       issuer,
+			Audiences:    []string{audience},
+			KeysLocation: jwksURL,
+		},
+	}
+	sshServer := server.New(serverConfig)
+	sshServer.SetAllowRootLogin(true)
+	// Authorize the hashed JWT user for the local test username (index 0 in AuthorizedUsers).
+	testUsername := testutil.GetTestUsername(t)
+	testJWTUser := "test-username"
+	testUserHash, err := sshuserhash.HashUserID(testJWTUser)
+	require.NoError(t, err)
+
+	authConfig := &sshauth.Config{
+		UserIDClaim:     sshauth.DefaultUserIDClaim,
+		AuthorizedUsers: []sshuserhash.UserIDHash{testUserHash},
+		MachineUsers: map[string][]uint32{
+			testUsername: {0},
+		},
+	}
+	sshServer.UpdateSSHAuth(authConfig)
+
+	sshServerAddr := server.StartTestServer(t, sshServer)
+
+	mockDaemon := startMockDaemon(t)
+
+	host, portStr, err := net.SplitHostPort(sshServerAddr)
+	require.NoError(t, err)
+	port, err := strconv.Atoi(portStr)
+	require.NoError(t, err)
+
+	mockDaemon.setHostKey(host, hostPubKey)
+
+	validToken := generateValidJWT(t, privateKey, issuer, audience, testJWTUser)
+	mockDaemon.setJWTToken(validToken)
+
+	proxyInstance, err := New(mockDaemon.addr, host, port, io.Discard, nil)
+	require.NoError(t, err)
+	// Keep the real stdio so cleanupFn can restore it.
+	origStdin := os.Stdin
+	origStdout := os.Stdout
+
+	stdinReader, stdinWriter, err := os.Pipe()
+	require.NoError(t, err)
+	stdoutReader, stdoutWriter, err := os.Pipe()
+	require.NoError(t, err)
+	// NOTE(review): if a require below fails, stdio stays swapped and cleanupFn is never returned.
+	os.Stdin = stdinReader
+	os.Stdout = stdoutWriter
+
+	clientConn, proxyConn := net.Pipe()
+
+	go func() { _, _ = io.Copy(stdinWriter, proxyConn) }()
+	go func() { _, _ = io.Copy(proxyConn, stdoutReader) }()
+	// The context handed to Connect expires 10 seconds after setup - presumably bounding the whole proxy session; verify.
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+
+	go func() {
+		_ = proxyInstance.Connect(ctx)
+	}()
+
+	sshConfig := &cryptossh.ClientConfig{
+		User:            testutil.GetTestUsername(t),
+		Auth:            []cryptossh.AuthMethod{},
+		HostKeyCallback: cryptossh.InsecureIgnoreHostKey(),
+		Timeout:         5 * time.Second,
+	}
+
+	sshClientConn, chans, reqs, err := cryptossh.NewClientConn(clientConn, "test", sshConfig)
+	require.NoError(t, err)
+
+	client := cryptossh.NewClient(sshClientConn, chans, reqs)
+	// Teardown for everything started above. NOTE(review): the os.Pipe ends are not closed.
+	cleanupFn := func() {
+		_ = client.Close()
+		_ = clientConn.Close()
+		cancel()
+		os.Stdin = origStdin
+		os.Stdout = origStdout
+		_ = sshServer.Stop()
+		mockDaemon.stop()
+		jwksServer.Close()
+	}
+
+	return client, cleanupFn
+}
+
+// setupJWKSServer serves a freshly generated JWKS over HTTP and returns the server, signing key and URL.
+func setupJWKSServer(t *testing.T) (*httptest.Server, *rsa.PrivateKey, string) {
+	t.Helper()
+	key, body := generateTestJWKS(t)
+	serveJWKS := func(w http.ResponseWriter, _ *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		if _, err := w.Write(body); err != nil {
+			http.Error(w, err.Error(), http.StatusInternalServerError)
+		}
+	}
+	jwks := httptest.NewServer(http.HandlerFunc(serveJWKS))
+	return jwks, key, jwks.URL
+}
+
+// generateTestJWKS creates a 2048-bit RSA key and returns it together with a
+// JSON-encoded JWKS that holds the matching public key under kid "test-key-id".
+func generateTestJWKS(t *testing.T) (*rsa.PrivateKey, []byte) {
+	t.Helper()
+
+	key, err := rsa.GenerateKey(rand.Reader, 2048)
+	require.NoError(t, err)
+
+	// Modulus and exponent are published as unpadded base64url of their big-endian bytes.
+	modulus := base64.RawURLEncoding.EncodeToString(key.PublicKey.N.Bytes())
+	exponent := base64.RawURLEncoding.EncodeToString(big.NewInt(int64(key.PublicKey.E)).Bytes())
+
+	keySet := nbjwt.Jwks{
+		Keys: []nbjwt.JSONWebKey{{
+			Kty: "RSA",
+			Kid: "test-key-id",
+			Use: "sig",
+			N:   modulus,
+			E:   exponent,
+		}},
+	}
+
+	encoded, err := json.Marshal(keySet)
+	require.NoError(t, err)
+	return key, encoded
+}
+
+// generateValidJWT signs an RS256 token for user that expires one hour from now
+// and carries the "test-key-id" key ID in its header.
+func generateValidJWT(t *testing.T, privateKey *rsa.PrivateKey, issuer, audience string, user string) string {
+	t.Helper()
+	expiresAt := time.Now().Add(time.Hour).Unix()
+	issuedAt := time.Now().Unix()
+	unsigned := jwt.NewWithClaims(jwt.SigningMethodRS256, jwt.MapClaims{
+		"iss": issuer,
+		"aud": audience,
+		"sub": user,
+		"exp": expiresAt,
+		"iat": issuedAt,
+	})
+	unsigned.Header["kid"] = "test-key-id"
+	signed, err := unsigned.SignedString(privateKey)
+	require.NoError(t, err)
+	return signed
+}
--- a/client/ssh/proxy/proxy_test.go
+++ b/client/ssh/proxy/proxy_test.go
@@ -1,25 +1,12 @@
 package proxy

 import (
-	"bytes"
 	"context"
-	"crypto/rand"
-	"crypto/rsa"
-	"encoding/base64"
-	"encoding/json"
 	"fmt"
-	"io"
-	"math/big"
 	"net"
-	"net/http"
-	"net/http/httptest"
 	"os"
-	"runtime"
-	"strconv"
 	"testing"
-	"time"

-	"github.com/golang-jwt/jwt/v5"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	cryptossh "golang.org/x/crypto/ssh"
@@ -28,11 +15,7 @@ import (

 	"github.com/netbirdio/netbird/client/proto"
 	nbssh "github.com/netbirdio/netbird/client/ssh"
-	sshauth "github.com/netbirdio/netbird/client/ssh/auth"
-	"github.com/netbirdio/netbird/client/ssh/server"
 	"github.com/netbirdio/netbird/client/ssh/testutil"
-	nbjwt "github.com/netbirdio/netbird/shared/auth/jwt"
-	sshuserhash "github.com/netbirdio/netbird/shared/sshauth"
 )

 func TestMain(m *testing.M) {
@@ -106,331 +89,6 @@ func TestSSHProxy_verifyHostKey(t *testing.T) {
 	})
 }

-func TestSSHProxy_Connect(t *testing.T) {
-	if testing.Short() {
-		t.Skip("Skipping integration test in short mode")
-	}
-
-	// TODO: Windows test times out - user switching and command execution tested on Linux
-	if runtime.GOOS == "windows" {
-		t.Skip("Skipping on Windows - covered by Linux tests")
-	}
-
-	const (
-		issuer   = "https://test-issuer.example.com"
-		audience = "test-audience"
-	)
-
-	jwksServer, privateKey, jwksURL := setupJWKSServer(t)
-	defer jwksServer.Close()
-
-	hostKey, err := nbssh.GeneratePrivateKey(nbssh.ED25519)
-	require.NoError(t, err)
-	hostPubKey, err := nbssh.GeneratePublicKey(hostKey)
-	require.NoError(t, err)
-
-	serverConfig := &server.Config{
-		HostKeyPEM: hostKey,
-		JWT: &server.JWTConfig{
-			Issuer:       issuer,
-			Audiences:    []string{audience},
-			KeysLocation: jwksURL,
-		},
-	}
-	sshServer := server.New(serverConfig)
-	sshServer.SetAllowRootLogin(true)
-
-	// Configure SSH authorization for the test user
-	testUsername := testutil.GetTestUsername(t)
-	testJWTUser := "test-username"
-	testUserHash, err := sshuserhash.HashUserID(testJWTUser)
-	require.NoError(t, err)
-
-	authConfig := &sshauth.Config{
-		UserIDClaim:     sshauth.DefaultUserIDClaim,
-		AuthorizedUsers: []sshuserhash.UserIDHash{testUserHash},
-		MachineUsers: map[string][]uint32{
-			testUsername: {0}, // Index 0 in AuthorizedUsers
-		},
-	}
-	sshServer.UpdateSSHAuth(authConfig)
-
-	sshServerAddr := server.StartTestServer(t, sshServer)
-	defer func() { _ = sshServer.Stop() }()
-
-	mockDaemon := startMockDaemon(t)
-	defer mockDaemon.stop()
-
-	host, portStr, err := net.SplitHostPort(sshServerAddr)
-	require.NoError(t, err)
-	port, err := strconv.Atoi(portStr)
-	require.NoError(t, err)
-
-	mockDaemon.setHostKey(host, hostPubKey)
-
-	validToken := generateValidJWT(t, privateKey, issuer, audience, testJWTUser)
-	mockDaemon.setJWTToken(validToken)
-
-	proxyInstance, err := New(mockDaemon.addr, host, port, io.Discard, nil)
-	require.NoError(t, err)
-
-	clientConn, proxyConn := net.Pipe()
-	defer func() { _ = clientConn.Close() }()
-
-	origStdin := os.Stdin
-	origStdout := os.Stdout
-	defer func() {
-		os.Stdin = origStdin
-		os.Stdout = origStdout
-	}()
-
-	stdinReader, stdinWriter, err := os.Pipe()
-	require.NoError(t, err)
-	stdoutReader, stdoutWriter, err := os.Pipe()
-	require.NoError(t, err)
-
-	os.Stdin = stdinReader
-	os.Stdout = stdoutWriter
-
-	go func() {
-		_, _ = io.Copy(stdinWriter, proxyConn)
-	}()
-	go func() {
-		_, _ = io.Copy(proxyConn, stdoutReader)
-	}()
-
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-	defer cancel()
-
-	connectErrCh := make(chan error, 1)
-	go func() {
-		connectErrCh <- proxyInstance.Connect(ctx)
-	}()
-
-	sshConfig := &cryptossh.ClientConfig{
-		User:            testutil.GetTestUsername(t),
-		Auth:            []cryptossh.AuthMethod{},
-		HostKeyCallback: cryptossh.InsecureIgnoreHostKey(),
-		Timeout:         3 * time.Second,
-	}
-
-	sshClientConn, chans, reqs, err := cryptossh.NewClientConn(clientConn, "test", sshConfig)
-	require.NoError(t, err, "Should connect to proxy server")
-	defer func() { _ = sshClientConn.Close() }()
-
-	sshClient := cryptossh.NewClient(sshClientConn, chans, reqs)
-
-	session, err := sshClient.NewSession()
-	require.NoError(t, err, "Should create session through full proxy to backend")
-
-	outputCh := make(chan []byte, 1)
-	errCh := make(chan error, 1)
-	go func() {
-		output, err := session.Output("echo hello-from-proxy")
-		outputCh <- output
-		errCh <- err
-	}()
-
-	select {
-	case output := <-outputCh:
-		err := <-errCh
-		require.NoError(t, err, "Command should execute successfully through proxy")
-		assert.Contains(t, string(output), "hello-from-proxy", "Should receive command output through proxy")
-	case <-time.After(3 * time.Second):
-		t.Fatal("Command execution timed out")
-	}
-
-	_ = session.Close()
-	_ = sshClient.Close()
-	_ = clientConn.Close()
-	cancel()
-}
-
-// TestSSHProxy_CommandQuoting verifies that the proxy preserves shell quoting
-// when forwarding commands to the backend. This is critical for tools like
-// Ansible that send commands such as:
-//
-//	/bin/sh -c '( umask 77 && mkdir -p ... ) && sleep 0'
-//
-// The single quotes must be preserved so the backend shell receives the
-// subshell expression as a single argument to -c.
-func TestSSHProxy_CommandQuoting(t *testing.T) {
-	if testing.Short() {
-		t.Skip("Skipping integration test in short mode")
-	}
-
-	sshClient, cleanup := setupProxySSHClient(t)
-	defer cleanup()
-
-	// These commands simulate what the SSH protocol delivers as exec payloads.
-	// When a user types: ssh host '/bin/sh -c "( echo hello )"'
-	// the local shell strips the outer single quotes, and the SSH exec request
-	// contains the raw string: /bin/sh -c "( echo hello )"
-	//
-	// The proxy must forward this string verbatim. Using session.Command()
-	// (shlex.Split + strings.Join) strips the inner double quotes, breaking
-	// the command on the backend.
-	tests := []struct {
-		name    string
-		command string
-		expect  string
-	}{
-		{
-			name:    "subshell_in_double_quotes",
-			command: `/bin/sh -c "( echo from-subshell ) && echo outer"`,
-			expect:  "from-subshell\nouter\n",
-		},
-		{
-			name:    "printf_with_special_chars",
-			command: `/bin/sh -c "printf '%s\n' 'hello world'"`,
-			expect:  "hello world\n",
-		},
-		{
-			name:    "nested_command_substitution",
-			command: `/bin/sh -c "echo $(echo nested)"`,
-			expect:  "nested\n",
-		},
-	}
-
-	for _, tc := range tests {
-		t.Run(tc.name, func(t *testing.T) {
-			session, err := sshClient.NewSession()
-			require.NoError(t, err)
-			defer func() { _ = session.Close() }()
-
-			var stderrBuf bytes.Buffer
-			session.Stderr = &stderrBuf
-
-			outputCh := make(chan []byte, 1)
-			errCh := make(chan error, 1)
-			go func() {
-				output, err := session.Output(tc.command)
-				outputCh <- output
-				errCh <- err
-			}()
-
-			select {
-			case output := <-outputCh:
-				err := <-errCh
-				if stderrBuf.Len() > 0 {
-					t.Logf("stderr: %s", stderrBuf.String())
-				}
-				require.NoError(t, err, "command should succeed: %s", tc.command)
-				assert.Equal(t, tc.expect, string(output), "output mismatch for: %s", tc.command)
-			case <-time.After(5 * time.Second):
-				t.Fatalf("command timed out: %s", tc.command)
-			}
-		})
-	}
-}
-
-// setupProxySSHClient creates a full proxy test environment and returns
-// an SSH client connected through the proxy to a backend NetBird SSH server.
-func setupProxySSHClient(t *testing.T) (*cryptossh.Client, func()) {
-	t.Helper()
-
-	const (
-		issuer   = "https://test-issuer.example.com"
-		audience = "test-audience"
-	)
-
-	jwksServer, privateKey, jwksURL := setupJWKSServer(t)
-
-	hostKey, err := nbssh.GeneratePrivateKey(nbssh.ED25519)
-	require.NoError(t, err)
-	hostPubKey, err := nbssh.GeneratePublicKey(hostKey)
-	require.NoError(t, err)
-
-	serverConfig := &server.Config{
-		HostKeyPEM: hostKey,
-		JWT: &server.JWTConfig{
-			Issuer:       issuer,
-			Audiences:    []string{audience},
-			KeysLocation: jwksURL,
-		},
-	}
-	sshServer := server.New(serverConfig)
-	sshServer.SetAllowRootLogin(true)
-
-	testUsername := testutil.GetTestUsername(t)
-	testJWTUser := "test-username"
-	testUserHash, err := sshuserhash.HashUserID(testJWTUser)
-	require.NoError(t, err)
-
-	authConfig := &sshauth.Config{
-		UserIDClaim:     sshauth.DefaultUserIDClaim,
-		AuthorizedUsers: []sshuserhash.UserIDHash{testUserHash},
-		MachineUsers: map[string][]uint32{
-			testUsername: {0},
-		},
-	}
-	sshServer.UpdateSSHAuth(authConfig)
-
-	sshServerAddr := server.StartTestServer(t, sshServer)
-
-	mockDaemon := startMockDaemon(t)
-
-	host, portStr, err := net.SplitHostPort(sshServerAddr)
-	require.NoError(t, err)
-	port, err := strconv.Atoi(portStr)
-	require.NoError(t, err)
-
-	mockDaemon.setHostKey(host, hostPubKey)
-
-	validToken := generateValidJWT(t, privateKey, issuer, audience, testJWTUser)
-	mockDaemon.setJWTToken(validToken)
-
-	proxyInstance, err := New(mockDaemon.addr, host, port, io.Discard, nil)
-	require.NoError(t, err)
-
-	origStdin := os.Stdin
-	origStdout := os.Stdout
-
-	stdinReader, stdinWriter, err := os.Pipe()
-	require.NoError(t, err)
-	stdoutReader, stdoutWriter, err := os.Pipe()
-	require.NoError(t, err)
-
-	os.Stdin = stdinReader
-	os.Stdout = stdoutWriter
-
-	clientConn, proxyConn := net.Pipe()
-
-	go func() { _, _ = io.Copy(stdinWriter, proxyConn) }()
-	go func() { _, _ = io.Copy(proxyConn, stdoutReader) }()
-
-	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
-
-	go func() {
-		_ = proxyInstance.Connect(ctx)
-	}()
-
-	sshConfig := &cryptossh.ClientConfig{
-		User:            testutil.GetTestUsername(t),
-		Auth:            []cryptossh.AuthMethod{},
-		HostKeyCallback: cryptossh.InsecureIgnoreHostKey(),
-		Timeout:         5 * time.Second,
-	}
-
-	sshClientConn, chans, reqs, err := cryptossh.NewClientConn(clientConn, "test", sshConfig)
-	require.NoError(t, err)
-
-	client := cryptossh.NewClient(sshClientConn, chans, reqs)
-
-	cleanupFn := func() {
-		_ = client.Close()
-		_ = clientConn.Close()
-		cancel()
-		os.Stdin = origStdin
-		os.Stdout = origStdout
-		_ = sshServer.Stop()
-		mockDaemon.stop()
-		jwksServer.Close()
-	}
-
-	return client, cleanupFn
-}
-
 type mockDaemonServer struct {
 	proto.UnimplementedDaemonServiceServer
 	hostKeys map[string][]byte
@@ -492,10 +150,6 @@ func (m *mockDaemon) setHostKey(addr string, pubKey []byte) {
 	m.impl.hostKeys[addr] = pubKey
 }

-func (m *mockDaemon) setJWTToken(token string) {
-	m.impl.jwtToken = token
-}
-
 func (m *mockDaemon) stop() {
 	if m.server != nil {
 		m.server.Stop()
@@ -508,63 +162,3 @@ func mustParsePublicKey(t *testing.T, pubKeyBytes []byte) cryptossh.PublicKey {
 	require.NoError(t, err)
 	return pubKey
 }
-
-func setupJWKSServer(t *testing.T) (*httptest.Server, *rsa.PrivateKey, string) {
-	t.Helper()
-	privateKey, jwksJSON := generateTestJWKS(t)
-
-	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		w.Header().Set("Content-Type", "application/json")
-		if _, err := w.Write(jwksJSON); err != nil {
-			http.Error(w, err.Error(), http.StatusInternalServerError)
-		}
-	}))
-
-	return server, privateKey, server.URL
-}
-
-func generateTestJWKS(t *testing.T) (*rsa.PrivateKey, []byte) {
-	t.Helper()
-	privateKey, err := rsa.GenerateKey(rand.Reader, 2048)
-	require.NoError(t, err)
-
-	publicKey := &privateKey.PublicKey
-	n := publicKey.N.Bytes()
-	e := publicKey.E
-
-	jwk := nbjwt.JSONWebKey{
-		Kty: "RSA",
-		Kid: "test-key-id",
-		Use: "sig",
-		N:   base64.RawURLEncoding.EncodeToString(n),
-		E:   base64.RawURLEncoding.EncodeToString(big.NewInt(int64(e)).Bytes()),
-	}
-
-	jwks := nbjwt.Jwks{
-		Keys: []nbjwt.JSONWebKey{jwk},
-	}
-
-	jwksJSON, err := json.Marshal(jwks)
-	require.NoError(t, err)
-
-	return privateKey, jwksJSON
-}
-
-func generateValidJWT(t *testing.T, privateKey *rsa.PrivateKey, issuer, audience string, user string) string {
-	t.Helper()
-	claims := jwt.MapClaims{
-		"iss": issuer,
-		"aud": audience,
-		"sub": user,
-		"exp": time.Now().Add(time.Hour).Unix(),
-		"iat": time.Now().Unix(),
-	}
-
-	token := jwt.NewWithClaims(jwt.SigningMethodRS256, claims)
-	token.Header["kid"] = "test-key-id"
-
-	tokenString, err := token.SignedString(privateKey)
-	require.NoError(t, err)
-
-	return tokenString
-}
--- a/client/ssh/server/executor_unix_privileged_test.go
+++ b/client/ssh/server/executor_unix_privileged_test.go
@@ -0,0 +1,66 @@
+//go:build unix && privileged
+
+package server
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestPrivilegeDropper_CreateExecutorCommand checks that a config with a
+// non-empty Command yields an executor invocation whose argument list carries
+// every expected flag and value.
+func TestPrivilegeDropper_CreateExecutorCommand(t *testing.T) {
+	dropper := NewPrivilegeDropper()
+
+	cmd, err := dropper.CreateExecutorCommand(context.Background(), ExecutorConfig{
+		UID:        1000,
+		GID:        1000,
+		Groups:     []uint32{1000, 1001},
+		WorkingDir: "/home/testuser",
+		Shell:      "/bin/bash",
+		Command:    "ls -la",
+	})
+	require.NoError(t, err)
+	require.NotNil(t, cmd)
+
+	// Verify the command is calling netbird ssh exec: each of these flags and
+	// values must appear somewhere in the argument list.
+	wantArgs := []string{
+		"ssh", "exec",
+		"--uid", "1000",
+		"--gid", "1000",
+		"--groups", "1000", "1001",
+		"--working-dir", "/home/testuser",
+		"--shell", "/bin/bash",
+		"--cmd", "ls -la",
+	}
+
+	for _, want := range wantArgs {
+		assert.Contains(t, cmd.Args, want)
+	}
+}
+
+// TestPrivilegeDropper_CreateExecutorCommandInteractive checks that an empty
+// Command produces an executor invocation without --cmd or --interactive.
+func TestPrivilegeDropper_CreateExecutorCommandInteractive(t *testing.T) {
+	dropper := NewPrivilegeDropper()
+
+	cmd, err := dropper.CreateExecutorCommand(context.Background(), ExecutorConfig{
+		UID:        1000,
+		GID:        1000,
+		Groups:     []uint32{1000},
+		WorkingDir: "/home/testuser",
+		Shell:      "/bin/bash",
+		Command:    "",
+	})
+	require.NoError(t, err)
+	require.NotNil(t, cmd)
+
+	for _, flag := range []string{"--cmd", "--interactive"} {
+		assert.NotContains(t, cmd.Args, flag)
+	}
+}
--- a/client/ssh/server/executor_unix_test.go
+++ b/client/ssh/server/executor_unix_test.go
@@ -73,61 +73,6 @@ func TestPrivilegeDropper_ValidatePrivileges(t *testing.T) {
 	}
 }

-func TestPrivilegeDropper_CreateExecutorCommand(t *testing.T) {
-	pd := NewPrivilegeDropper()
-
-	config := ExecutorConfig{
-		UID:        1000,
-		GID:        1000,
-		Groups:     []uint32{1000, 1001},
-		WorkingDir: "/home/testuser",
-		Shell:      "/bin/bash",
-		Command:    "ls -la",
-	}
-
-	cmd, err := pd.CreateExecutorCommand(context.Background(), config)
-	require.NoError(t, err)
-	require.NotNil(t, cmd)
-
-	// Verify the command is calling netbird ssh exec
-	assert.Contains(t, cmd.Args, "ssh")
-	assert.Contains(t, cmd.Args, "exec")
-	assert.Contains(t, cmd.Args, "--uid")
-	assert.Contains(t, cmd.Args, "1000")
-	assert.Contains(t, cmd.Args, "--gid")
-	assert.Contains(t, cmd.Args, "1000")
-	assert.Contains(t, cmd.Args, "--groups")
-	assert.Contains(t, cmd.Args, "1000")
-	assert.Contains(t, cmd.Args, "1001")
-	assert.Contains(t, cmd.Args, "--working-dir")
-	assert.Contains(t, cmd.Args, "/home/testuser")
-	assert.Contains(t, cmd.Args, "--shell")
-	assert.Contains(t, cmd.Args, "/bin/bash")
-	assert.Contains(t, cmd.Args, "--cmd")
-	assert.Contains(t, cmd.Args, "ls -la")
-}
-
-func TestPrivilegeDropper_CreateExecutorCommandInteractive(t *testing.T) {
-	pd := NewPrivilegeDropper()
-
-	config := ExecutorConfig{
-		UID:        1000,
-		GID:        1000,
-		Groups:     []uint32{1000},
-		WorkingDir: "/home/testuser",
-		Shell:      "/bin/bash",
-		Command:    "",
-	}
-
-	cmd, err := pd.CreateExecutorCommand(context.Background(), config)
-	require.NoError(t, err)
-	require.NotNil(t, cmd)
-
-	// Verify no command mode (command is empty so no --cmd flag)
-	assert.NotContains(t, cmd.Args, "--cmd")
-	assert.NotContains(t, cmd.Args, "--interactive")
-}
-
 // TestPrivilegeDropper_ActualPrivilegeDrop tests actual privilege dropping
 // This test requires root privileges and will be skipped if not running as root
 func TestPrivilegeDropper_ActualPrivilegeDrop(t *testing.T) {
--- a/client/system/info.go
+++ b/client/system/info.go
@@ -3,6 +3,7 @@ package system
 import (
 	"context"
 	"net/netip"
+	"slices"
 	"strings"

 	log "github.com/sirupsen/logrus"
@@ -121,6 +122,23 @@ func (i *Info) SetFlags(
 	}
 }

+// removeAddresses drops network addresses whose IP matches any of the given
+// addresses, regardless of prefix length. Used to exclude the NetBird overlay
+// address, which otherwise churns the meta as the interface comes and goes.
+// Filtering happens in place on the existing slice; the relative order of the
+// remaining addresses is preserved. Calling it without any IPs is a no-op.
+func (i *Info) removeAddresses(ips ...netip.Addr) {
+	if len(ips) == 0 {
+		return
+	}
+	// Only the address part of each prefix is compared, so an entry is removed
+	// even when it was reported with a different mask than the caller expects.
+	// slices.DeleteFunc replaces the hand-rolled filter-and-append loop.
+	i.NetworkAddresses = slices.DeleteFunc(i.NetworkAddresses, func(addr NetworkAddress) bool {
+		return slices.Contains(ips, addr.NetIP.Addr())
+	})
+}
+
 // extractUserAgent extracts Netbird's agent (client) name and version from the outgoing context
 func extractUserAgent(ctx context.Context) string {
 	md, hasMeta := metadata.FromOutgoingContext(ctx)
@@ -147,7 +165,9 @@ func extractDeviceName(ctx context.Context, defaultName string) string {
 }

 // GetInfoWithChecks retrieves and parses the system information with applied checks.
-func GetInfoWithChecks(ctx context.Context, checks []*proto.Checks) (*Info, error) {
+// excludeIPs are dropped from the reported network addresses (e.g. our own
+// WireGuard overlay address, which otherwise churns the peer meta).
+func GetInfoWithChecks(ctx context.Context, checks []*proto.Checks, excludeIPs ...netip.Addr) (*Info, error) {
 	log.Debugf("gathering system information with checks: %d", len(checks))
 	processCheckPaths := make([]string, 0)
 	for _, check := range checks {
@@ -162,6 +182,7 @@ func GetInfoWithChecks(ctx context.Context, checks []*proto.Checks) (*Info, erro

 	info := GetInfo(ctx)
 	info.Files = files
+	info.removeAddresses(excludeIPs...)

 	log.Debugf("all system information gathered successfully")
 	return info, nil
--- a/client/system/info_test.go
+++ b/client/system/info_test.go
@@ -2,6 +2,7 @@ package system

 import (
 	"context"
+	"net/netip"
 	"testing"

 	"github.com/stretchr/testify/assert"
@@ -43,3 +44,42 @@ func Test_NetAddresses(t *testing.T) {
 		t.Errorf("no network addresses found")
 	}
 }
+
+// TestInfo_RemoveAddresses verifies that removal matches on the IP alone, so the
+// overlay entries disappear whatever prefix length they were reported with.
+func TestInfo_RemoveAddresses(t *testing.T) {
+	cidrs := []string{
+		"192.168.1.7/24",
+		"100.76.70.97/32",                          // overlay v4 (host mask /32)
+		"2001:818:c51b:4800:845:a65d:ae6f:623f/64", // real global v6
+		"fd00:1234::1/64",                          // overlay v6
+	}
+	info := &Info{}
+	for _, cidr := range cidrs {
+		info.NetworkAddresses = append(info.NetworkAddresses, NetworkAddress{NetIP: netip.MustParsePrefix(cidr)})
+	}
+
+	// Overlay addresses are passed as bare IPs, without any prefix length.
+	info.removeAddresses(
+		netip.MustParseAddr("100.76.70.97"),
+		netip.MustParseAddr("fd00:1234::1"),
+	)
+
+	want := []string{"192.168.1.7/24", "2001:818:c51b:4800:845:a65d:ae6f:623f/64"}
+	if got := len(info.NetworkAddresses); got != len(want) {
+		t.Fatalf("got %d addresses, want %d: %v", got, len(want), info.NetworkAddresses)
+	}
+	for idx, expected := range want {
+		if got := info.NetworkAddresses[idx].NetIP.String(); got != expected {
+			t.Errorf("address[%d] = %s, want %s", idx, got, expected)
+		}
+	}
+}
+
+func TestInfo_RemoveAddresses_NoOp(t *testing.T) {
+	// Calling removeAddresses without any IPs must leave the list untouched.
+	info := &Info{NetworkAddresses: []NetworkAddress{{NetIP: netip.MustParsePrefix("10.0.0.1/24")}}}
+	if info.removeAddresses(); len(info.NetworkAddresses) != 1 {
+		t.Errorf("expected no change with empty input, got %v", info.NetworkAddresses)
+	}
+}
--- a/client/system/network_addr.go
+++ b/client/system/network_addr.go
@@ -46,7 +46,9 @@ func toNetworkAddress(address net.Addr, mac string) (NetworkAddress, bool) {
 	if !ok {
 		return NetworkAddress{}, false
 	}
-	if ipNet.IP.IsLoopback() {
+	// Skip link-local and multicast: they carry no routable peer info and the
+	// IPv6 link-local of a flapping NIC churns the meta on every up/down.
+	if ipNet.IP.IsLoopback() || ipNet.IP.IsLinkLocalUnicast() || ipNet.IP.IsMulticast() {
 		return NetworkAddress{}, false
 	}
 	prefix, err := netip.ParsePrefix(ipNet.String())
--- a/client/system/network_addr_test.go
+++ b/client/system/network_addr_test.go
@@ -0,0 +1,45 @@
+//go:build !ios
+
+package system
+
+import (
+	"net"
+	"testing"
+)
+
+// mustIPNet parses cidr into an IPNet that keeps the host IP, not the network base.
+func mustIPNet(t *testing.T, cidr string) *net.IPNet {
+	t.Helper()
+	host, network, err := net.ParseCIDR(cidr)
+	if err != nil {
+		t.Fatalf("parse %q: %v", cidr, err)
+	}
+	return &net.IPNet{IP: host, Mask: network.Mask}
+}
+
+// TestToNetworkAddress_Filtering checks which address classes are accepted:
+// global unicast passes; loopback, link-local and multicast are rejected.
+func TestToNetworkAddress_Filtering(t *testing.T) {
+	const mac = "c8:4b:d6:b6:04:ac"
+
+	cases := []struct {
+		name, cidr string
+		accepted   bool
+	}{
+		{"ipv4 global", "10.65.16.181/23", true},
+		{"ipv6 global", "2620:52:0:4110:102d:6a98:ee75:8b92/64", true},
+		{"ipv4 loopback", "127.0.0.1/8", false},
+		{"ipv6 loopback", "::1/128", false},
+		{"ipv6 link-local", "fe80::871:4c25:23d7:2529/64", false},
+		{"ipv4 link-local", "169.254.1.2/16", false},
+		{"ipv6 multicast", "ff02::1/128", false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if _, ok := toNetworkAddress(mustIPNet(t, tc.cidr), mac); ok != tc.accepted {
+				t.Errorf("toNetworkAddress(%s) ok = %v, want %v", tc.cidr, ok, tc.accepted)
+			}
+		})
+	}
+}
--- a/client/testutil/privileged/runner_test.go
+++ b/client/testutil/privileged/runner_test.go
@@ -0,0 +1,196 @@
+//go:build privileged && (linux || darwin)
+
+// Package privileged provides a self-hosting harness that runs the repo's
+// privileged-tagged test suite inside a --privileged --cap-add=NET_ADMIN
+// container, so developers can exercise the root/system-mutating tests on a
+// non-root host with a single `go test` invocation.
+package privileged
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/moby/moby/api/types/container"
+	"github.com/ory/dockertest/v4"
+)
+
+// containerImage / containerTag match the image used by the CI privileged job
+// (.github/workflows/golang-test-linux.yml, test_client_on_docker).
+const (
+	containerImage = "golang"
+	containerTag   = "1.25-alpine"
+)
+
+const (
+	containerWorkdir    = "/app"
+	containerGoCache    = "/root/.cache/go-build"
+	containerGoModCache = "/go/pkg/mod"
+)
+
+// alpinePackages are the build/runtime deps the privileged tests need, mirroring
+// the CI container setup.
+const alpinePackages = "ca-certificates iptables ip6tables dbus dbus-dev libpcap-dev build-base"
+
+// privilegedTestPackages is the package list the suite runs, excluding the
+// server-side trees and UI/upload helpers, matching the CI Docker job's filter.
+const privilegedTestPackages = `go list -buildvcs=false ./... | grep -v -e /management -e /signal -e /relay -e /proxy -e /combined -e /client/ui -e /upload-server`
+
+// testWriter is an io.Writer that forwards container output to the test log, one t.Log call per line.
+type testWriter struct{ t *testing.T }
+
+func (w testWriter) Write(p []byte) (int, error) {
+	for _, line := range strings.Split(strings.TrimRight(string(p), "\n"), "\n") {
+		w.t.Log(line)
+	}
+	return len(p), nil
+}
+
+// TestRunPrivilegedSuiteInDocker starts a privileged container with the repo and
+// the host Go caches mounted, then runs `go test -tags 'devcert privileged'` in it
+// via Exec and fails if the exit code is non-zero. Inside that container
+// (DOCKER_CI=true) it skips itself so the suite does not spawn containers recursively.
+func TestRunPrivilegedSuiteInDocker(t *testing.T) {
+	if os.Getenv("DOCKER_CI") == "true" {
+		t.Skip("inside privileged container, skipping container spawn; privileged tests run in place")
+	}
+
+	repoRoot, err := findRepoRoot()
+	if err != nil {
+		t.Fatalf("locate repo root: %v", err)
+	}
+	goCache, goModCache := hostGoCaches(t)
+
+	// dockertest reads DOCKER_HOST; point it at the active context's socket when
+	// the default one is absent (macOS Docker Desktop, Colima, OrbStack).
+	if host := dockerHost(); host != "" {
+		t.Setenv("DOCKER_HOST", host)
+	}
+
+	// NewPoolT registers container cleanup via t.Cleanup automatically.
+	pool := dockertest.NewPoolT(t, "", dockertest.WithMaxWait(30*time.Minute))
+
+	// Keep the container alive so the suite runs via Exec, which yields a clean
+	// exit code (the v4 Resource API exposes no container wait/exit-code).
+	resource := pool.RunT(t, containerImage,
+		dockertest.WithTag(containerTag),
+		dockertest.WithWorkingDir(containerWorkdir),
+		dockertest.WithMounts([]string{
+			repoRoot + ":" + containerWorkdir,
+			goCache + ":" + containerGoCache,
+			goModCache + ":" + containerGoModCache,
+		}),
+		dockertest.WithEnv([]string{
+			"CGO_ENABLED=1",
+			"CI=true",
+			"DOCKER_CI=true",
+			"CONTAINER=true",
+			"GOCACHE=" + containerGoCache,
+			"GOMODCACHE=" + containerGoModCache,
+		}),
+		dockertest.WithCmd([]string{"sleep", "infinity"}),
+		dockertest.WithHostConfig(func(hc *container.HostConfig) {
+			hc.Privileged = true
+			hc.CapAdd = []string{"NET_ADMIN"}
+		}),
+		dockertest.WithoutReuse(),
+	)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
+	defer cancel()
+
+	result, err := resource.Exec(ctx, []string{"sh", "-c", buildTestScript()})
+	if err != nil {
+		t.Fatalf("run privileged suite in container: %v", err)
+	}
+
+	w := testWriter{t}
+	_, _ = w.Write([]byte(result.StdOut))
+	_, _ = w.Write([]byte(result.StdErr))
+
+	if result.ExitCode != 0 {
+		t.Fatalf("privileged test suite failed in container (exit code %d)", result.ExitCode)
+	}
+}
+
+// findRepoRoot walks up from the working directory until it finds a go.mod.
+func findRepoRoot() (string, error) {
+	start, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+	// filepath.Dir returns its argument unchanged at the filesystem root, which
+	// ends the walk; report the starting directory rather than that root.
+	for dir := start; ; dir = filepath.Dir(dir) {
+		if _, statErr := os.Stat(filepath.Join(dir, "go.mod")); statErr == nil {
+			return dir, nil
+		}
+		if filepath.Dir(dir) == dir {
+			return "", fmt.Errorf("go.mod not found in %s or any parent directory", start)
+		}
+	}
+}
+
+// dockerHost returns the value DOCKER_HOST should be set to, or "" when it must
+// be left alone: it is already set, the default socket exists, or the active
+// docker context (per-user socket on Docker Desktop/Colima/OrbStack) is unknown.
+func dockerHost() string {
+	const defaultSocket = "/var/run/docker.sock"
+	if os.Getenv("DOCKER_HOST") != "" {
+		return "" // an explicit setting always wins
+	}
+	if _, err := os.Stat(defaultSocket); err == nil {
+		return "" // the default socket exists
+	}
+
+	inspect := exec.Command("docker", "context", "inspect", "-f", "{{.Endpoints.docker.Host}}")
+	endpoint, err := inspect.Output()
+	if err != nil {
+		return ""
+	}
+	return strings.TrimSpace(string(endpoint))
+}
+
+// hostGoCaches returns the host GOCACHE and GOMODCACHE, in that order, for reuse in the container.
+func hostGoCaches(t *testing.T) (string, string) {
+	t.Helper()
+	return goEnv(t, "GOCACHE"), goEnv(t, "GOMODCACHE")
+}
+
+// goEnv returns the trimmed output of `go env <key>`, failing the test on error.
+func goEnv(t *testing.T, key string) string {
+	t.Helper()
+	stdout := new(bytes.Buffer)
+	query := exec.Command("go", "env", key)
+	query.Stdout = stdout
+	if err := query.Run(); err != nil {
+		t.Fatalf("go env %s: %v", key, err)
+	}
+	return strings.TrimSpace(stdout.String())
+}
+
+// buildTestScript assembles the shell command executed inside the container.
+// The package list is piped through xargs into `go test`: PRIV_PKGS replaces the
+// default filtered list, and PRIV_RUN, when set, is passed as a -run pattern.
+func buildTestScript() string {
+	listCmd := privilegedTestPackages
+	if override := os.Getenv("PRIV_PKGS"); override != "" {
+		listCmd = "echo " + override
+	}
+
+	var runArg string
+	if pattern := os.Getenv("PRIV_RUN"); pattern != "" {
+		runArg = "-run '" + pattern + "' "
+	}
+
+	return fmt.Sprintf(
+		"apk update >/dev/null && apk add --no-cache %s >/dev/null && %s go test -buildvcs=false -tags 'devcert privileged' %s-v -timeout 20m -p 1",
+		alpinePackages, listCmd+" | xargs", runArg,
+	)
+}
--- a/client/ui/client_ui.go
+++ b/client/ui/client_ui.go
@@ -418,7 +418,14 @@ func newServiceClient(args *newServiceClientArgs) *serviceClient {
 	case args.showProfiles:
 		s.showProfilesUI()
 	case args.showQuickActions:
-		s.showQuickActionsUI()
+		// Suppress the on-boot Quick Actions popup when the daemon
+		// reports DisableAutoConnect=true — that flag carries both the
+		// user's "Connect on Startup = off" preference AND any MDM-
+		// enforced override (applyMDMPolicy writes the policy value
+		// into the same Config field). See netbirdio/netbird#5744.
+		if !s.disableAutoConnectFromDaemon() {
+			s.showQuickActionsUI()
+		}
 	case args.showUpdate:
 		s.showUpdateProgress(ctx, args.showUpdateVersion)
 	}
@@ -1338,6 +1345,40 @@ func (s *serviceClient) getFeatures() (*proto.GetFeaturesResponse, error) {
 	return features, nil
 }

+// disableAutoConnectFromDaemon asks the daemon for the active profile's config
+// and reports whether DisableAutoConnect is set. The --quick-actions startup
+// path uses it to skip the on-boot popup when auto-connect was turned off by the
+// user or by an MDM admin (applyMDMPolicy writes the policy value into the same
+// Config field). Any profile, user or RPC failure is logged and yields false,
+// so a daemon hiccup does not silently swallow the popup.
+func (s *serviceClient) disableAutoConnectFromDaemon() bool {
+	profile, err := s.profileManager.GetActiveProfile()
+	if err != nil {
+		log.Warnf("disableAutoConnectFromDaemon: get active profile: %v", err)
+		return false
+	}
+	osUser, err := user.Current()
+	if err != nil {
+		log.Warnf("disableAutoConnectFromDaemon: get current user: %v", err)
+		return false
+	}
+	daemon, err := s.getSrvClient(failFastTimeout)
+	if err != nil {
+		log.Warnf("disableAutoConnectFromDaemon: get daemon client: %v", err)
+		return false
+	}
+	request := &proto.GetConfigRequest{
+		ProfileName: profile.ID.String(),
+		Username:    osUser.Username,
+	}
+	cfg, err := daemon.GetConfig(s.ctx, request)
+	if err != nil {
+		log.Warnf("disableAutoConnectFromDaemon: GetConfig RPC: %v", err)
+		return false
+	}
+	return cfg.GetDisableAutoConnect()
+}
+
 // getSrvConfig from the service to show it in the settings window.
 func (s *serviceClient) getSrvConfig() {
 	s.managementURL = profilemanager.DefaultManagementURL
--- a/Show More
+++ b/Show More